yp202031121
SONY’s Work Note on 2023.11.21
Reading the code: dataset.py
This code builds a dataset from a jsonl file for fine-tuning the OpenBuddy model.
(details omitted)
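Since the actual dataset.py is omitted above, the following is only a rough sketch of what a jsonl-based chat dataset class for fine-tuning might look like. The class name, prompt format, tokenizer usage, and max_length here are assumptions for illustration, not the original code.

import json
from torch.utils.data import Dataset

class JsonlChatDataset(Dataset):
    """Hypothetical sketch: load {"id": ..., "messages": [{"role": ..., "content": ...}]} records from a jsonl file."""
    def __init__(self, jsonl_path, tokenizer, max_length=1024):
        self.samples = []
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for line in f:
                self.samples.append(json.loads(line))
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        messages = self.samples[idx]['messages']
        # Join the conversation into a single "role: content" string;
        # the real OpenBuddy prompt template may differ (assumption).
        text = '\n'.join(f"{m['role']}: {m['content']}" for m in messages)
        enc = self.tokenizer(text, truncation=True, max_length=self.max_length, return_tensors='pt')
        return {k: v.squeeze(0) for k, v in enc.items()}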
Data processing
Merge the jsonl files inside each folder into one jsonl file per folder (named after the folder) – task2.py
import os
import json

def merge_jsonl_folders(input_folder, output_folder):
    # For every sub-folder, write one jsonl file named after that folder
    for foldername in os.listdir(input_folder):
        folderpath = os.path.join(input_folder, foldername)
        if os.path.isdir(folderpath):
            output_file = os.path.join(output_folder, f"{foldername}.jsonl")
            with open(output_file, 'w') as outfile:
                # Append every jsonl file in the folder line by line
                for filename in os.listdir(folderpath):
                    if filename.endswith('.jsonl'):
                        filepath = os.path.join(folderpath, filename)
                        with open(filepath, 'r') as infile:
                            for line in infile:
                                outfile.write(line)

# Input folder path
input_folder = '/home/im/Downloads/dpo_test_2-1107/'
# Output folder path
output_folder = '/home/im/yp1121/task_jsonl_1107/'
# Walk the folders and merge the files
merge_jsonl_folders(input_folder, output_folder)
Merge all the per-folder jsonl files into one big jsonl file – task3.py & task4.py
import os
import json

input_folder = '/home/im/yp1121/task_jsonl_1107/'
output_file = '/home/im/yp1121/task_all_1107.jsonl'

def merge_jsonl_files(input_folder, output_file):
    # Concatenate every jsonl file in the folder into a single jsonl file
    with open(output_file, 'w') as outfile:
        for filename in os.listdir(input_folder):
            if filename.endswith('.jsonl'):
                filepath = os.path.join(input_folder, filename)
                with open(filepath, 'r') as infile:
                    for line in infile:
                        outfile.write(line)

merge_jsonl_files(input_folder, output_file)
import jsonlines

def merge_jsonl(file1, file2, output_file):
    # Read the contents of the first JSONL file
    with jsonlines.open(file1, 'r') as reader:
        data1 = list(reader)
    # Read the contents of the second JSONL file
    with jsonlines.open(file2, 'r') as reader:
        data2 = list(reader)
    # Concatenate the two lists of records
    merged_data = data1 + data2
    # Write the merged records to the output file
    with jsonlines.open(output_file, 'w') as writer:
        writer.write_all(merged_data)

# Example file paths
file1_path = 'task_all.jsonl'
file2_path = 'task_all_1107.jsonl'
output_file_path = 'all.jsonl'
# Merge the two JSONL files
merge_jsonl(file1_path, file2_path, output_file_path)
Data cleaning pass 1 on the jsonl file (remove extra whitespace / remove odd characters: non-ASCII characters and strange strings) – task5.py
import json
import re

def clean_data(file_path):
    cleaned_data = []
    unique_messages = set()
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line.strip())
            #data = json.dumps(data, ensure_ascii=False)
            # Clean messages
            messages = data['messages']
            cleaned_messages = messages
            # Remove duplicates: identical message lists are kept only once
            messages_str = json.dumps(cleaned_messages, sort_keys=True)
            if messages_str not in unique_messages:
                unique_messages.add(messages_str)
                cleaned_data.append(data)
    # Remove extra whitespace
    cleaned_data = [re.sub(r'\s+', ' ', json.dumps(data)) for data in cleaned_data]
    # Remove non-ASCII characters and strange strings
    cleaned_data = [re.sub(r'[^\x00-\x7F]+', '', data) for data in cleaned_data]
    return cleaned_data

# Run the cleaning
file_path = 'all.jsonl'
cleaned_data = clean_data(file_path)

# Write the cleaned data to a new file
output_file_path = 'all_cleaned_file.jsonl'
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    for data in cleaned_data:
        outfile.write(data + '\n')
Remove records whose duplication rate exceeds 50% (across ids) – task6.py
import jsonlines
import json
from collections import Counter

def count_message_occurrences(input_file):
    # Count how often each conversation's last message content appears
    message_counts = Counter()
    total_messages = 0
    with jsonlines.open(input_file) as reader:
        for obj in reader:
            messages = obj['messages']
            last_message = json.dumps(messages[-1]['content'])  # serialize the content to a string
            message_counts[last_message] += 1
            total_messages += 1
    return message_counts, total_messages

def remove_high_duplicates(input_file, output_file, threshold):
    message_counts, total_messages = count_message_occurrences(input_file)
    # Keep only last messages whose share of the corpus is at or below the threshold
    unique_messages = set()
    for message, count in message_counts.items():
        if count / total_messages <= threshold:
            unique_messages.add(message)
    with jsonlines.open(input_file) as reader, jsonlines.open(output_file, 'w') as writer:
        for obj in reader:
            messages = obj['messages']
            last_message = json.dumps(messages[-1]['content'])  # serialize the content to a string
            if last_message in unique_messages:
                writer.write(obj)

input_file = 'all_cleaned_file.jsonl'
output_file = 'all_cleaned_cleaned_file.jsonl'
threshold = 0.5
remove_high_duplicates(input_file, output_file, threshold)
Merge messages to obtain long conversations. The budget is 1024 tokens, estimated at roughly 1024×8 characters; since some JSON keys are counted as well, we set max_chars to 1024×10 – task7.py
import json

def merge_messages(input_file, output_file, max_chars=10240):
    merged_data = {"id": None, "messages": []}
    current_chars = 0
    with open(input_file, 'r') as f_in, open(output_file, 'w') as f_out:
        for line in f_in:
            data = json.loads(line)
            data_id = data['id']
            messages = data['messages']
            if merged_data['id'] is None:
                merged_data['id'] = data_id
            for message in messages:
                content = message['content']
                content_length = len(content)
                # Flush the current merged conversation once the character budget would be exceeded
                if current_chars + content_length > max_chars:
                    f_out.write(json.dumps(merged_data) + '\n')
                    merged_data['id'] = data_id
                    merged_data['messages'] = []
                    current_chars = 0
                merged_data['messages'].append(message)
                current_chars += content_length
        # Write the final, partially filled conversation
        f_out.write(json.dumps(merged_data) + '\n')

input_file = 'all_cleaned_cleaned_file.jsonl'
output_file = 'all_concat_1024.jsonl'
# Run the merge
merge_messages(input_file, output_file)
After merging, many messages with role "system" repeat exactly the same content, so remove the duplicate dicts – task8.py
import json

def remove_duplicates(input_file, output_file):
    with open(input_file, 'r') as f_in, open(output_file, 'w') as f_out:
        for line in f_in:
            data = json.loads(line)
            data_id = data['id']
            messages = data['messages']
            # Use a set to drop duplicate message dicts
            # (note: the set does not preserve the original message order)
            unique_messages = list({json.dumps(msg) for msg in messages})
            # Put the de-duplicated messages back into the record
            data['messages'] = [json.loads(msg) for msg in unique_messages]
            # Write the processed record to the output file
            f_out.write(json.dumps(data) + '\n')

# Input and output file paths
input_file = 'all_concat_1024.jsonl'
output_file = 'all_concat_1024_test.jsonl'
# Remove the duplicate dicts
remove_duplicates(input_file, output_file)
The code above has some problems and the output looks odd. Later I also decided this step is probably unnecessary: the system messages all declare the role, which feels important and should not be deleted.
Instead, keep only one system message per conversation and put it in the first position:
import jsonlines

# Read the original JSONL file
with jsonlines.open('all_concat_1024.jsonl', 'r') as reader:
    # Create a new JSONL file for the processed data
    with jsonlines.open('all_concat_1024_final2.jsonl', 'w') as writer:
        # Walk through every record in the original data
        for data in reader:
            # Get the messages list
            messages = data['messages']
            # Find the first dict whose "role" is "system"
            system_message = next((msg for msg in messages if msg['role'] == 'system'), None)
            if system_message:
                # Move that system message to the front of the list
                messages.remove(system_message)
                messages.insert(0, system_message)
            # Write the processed record to the new JSONL file
            writer.write(data)
Output of the processed jsonl dataset:
2,222 conversation records; each is a long conversation of 10,240+ characters, beginning with a system message that establishes the role, followed by the user and assistant turns.
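A quick sanity check along these lines can confirm those numbers (a sketch, not part of the original pipeline; it only reads the final file produced in the previous step and reports counts):

import json

def summarize(path):
    # Count conversations, find the shortest one by character length, and collect the role of each first message
    n = 0
    min_chars = None
    first_roles = set()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            messages = json.loads(line)['messages']
            chars = sum(len(m['content']) for m in messages)
            min_chars = chars if min_chars is None else min(min_chars, chars)
            first_roles.add(messages[0]['role'])
            n += 1
    print(f"{n} conversations, shortest ~{min_chars} chars, first-message roles: {first_roles}")

summarize('all_concat_1024_final2.jsonl')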
today is over