
SONY’s Work Note on 2023.11.21

Reading the code: dataset.py

This code builds a dataset from a JSONL file for fine-tuning the OpenBuddy model.
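dataset.py itself is not reproduced here. For reference, every record in the JSONL files handled below has the shape {"id": ..., "messages": [{"role": ..., "content": ...}, ...]}. A minimal sketch of loading such a file (the function name and loader details are my own assumption; the real dataset.py for OpenBuddy fine-tuning presumably also tokenizes and masks the messages) might look like:

import json

def load_chat_dataset(path):
    # Load a JSONL chat dataset: one {"id", "messages"} record per line.
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Each record: {"id": ..., "messages": [{"role": ..., "content": ...}, ...]}
            records.append(json.loads(line))
    return records

# Hypothetical usage with the final file produced at the end of this note
# dataset = load_chat_dataset('all_concat_1024_final2.jsonl')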

Data processing

Merge all the JSONL files inside each folder into one JSONL file per folder (the merged file is named after the folder) – task2.py

import os

def merge_jsonl_folders(input_folder, output_folder):
    # Make sure the output folder exists before writing into it
    os.makedirs(output_folder, exist_ok=True)
    for foldername in os.listdir(input_folder):
        folderpath = os.path.join(input_folder, foldername)
        if os.path.isdir(folderpath):
            # One merged file per folder, named after the folder
            output_file = os.path.join(output_folder, f"{foldername}.jsonl")
            with open(output_file, 'w') as outfile:
                for filename in os.listdir(folderpath):
                    if filename.endswith('.jsonl'):
                        filepath = os.path.join(folderpath, filename)
                        with open(filepath, 'r') as infile:
                            for line in infile:
                                outfile.write(line)

# Input folder path
input_folder = '/home/im/Downloads/dpo_test_2-1107/'

# Output folder path
output_folder = '/home/im/yp1121/task_jsonl_1107/'

# Walk the folders and merge the files
merge_jsonl_folders(input_folder, output_folder)

Merge all the per-folder JSONL files into one big JSONL file – task3.py & task4.py

import os

input_folder = '/home/im/yp1121/task_jsonl_1107/'
output_file = '/home/im/yp1121/task_all_1107.jsonl'

def merge_jsonl_files(input_folder, output_file):
    # Concatenate every .jsonl file in the folder into a single output file
    with open(output_file, 'w') as outfile:
        for filename in os.listdir(input_folder):
            if filename.endswith('.jsonl'):
                filepath = os.path.join(input_folder, filename)
                with open(filepath, 'r') as infile:
                    for line in infile:
                        outfile.write(line)

merge_jsonl_files(input_folder, output_file)

import jsonlines

def merge_jsonl(file1, file2, output_file):
    # Read the contents of the first JSONL file
    with jsonlines.open(file1, 'r') as reader:
        data1 = list(reader)

    # Read the contents of the second JSONL file
    with jsonlines.open(file2, 'r') as reader:
        data2 = list(reader)

    # Concatenate the two lists of records
    merged_data = data1 + data2

    # Write the merged records to the output file
    with jsonlines.open(output_file, 'w') as writer:
        writer.write_all(merged_data)

# Example file paths
file1_path = 'task_all.jsonl'
file2_path = 'task_all_1107.jsonl'
output_file_path = 'all.jsonl'

# Merge the two JSONL files
merge_jsonl(file1_path, file2_path, output_file_path)

JSONL data cleaning, part 1 (strip extra whitespace / remove non-ASCII characters and strange strings) – task5.py

import json
import re

def clean_data(file_path):
    cleaned_data = []
    unique_messages = set()

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line.strip())
            messages = data['messages']

            # Drop exact-duplicate conversations (compare the serialized messages)
            messages_str = json.dumps(messages, sort_keys=True)
            if messages_str not in unique_messages:
                unique_messages.add(messages_str)
                cleaned_data.append(data)

    # Collapse runs of whitespace; note this runs on the serialized JSON string,
    # so whitespace inside message content is collapsed as well
    cleaned_data = [re.sub(r'\s+', ' ', json.dumps(data)) for data in cleaned_data]

    # Remove non-ASCII characters and strange strings
    cleaned_data = [re.sub(r'[^\x00-\x7F]+', '', data) for data in cleaned_data]

    return cleaned_data

# Run the cleaning
file_path = 'all.jsonl'
cleaned_data = clean_data(file_path)

# Write the cleaned data to a new file
output_file_path = 'all_cleaned_file.jsonl'
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    for data in cleaned_data:
        outfile.write(data + '\n')
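Note that task5.py never cleans the message content itself; the whitespace and non-ASCII filters run on the whole serialized record. If per-message cleaning were wanted instead, a minimal sketch (my own suggestion, not part of the original pipeline; file names are placeholders) that applies the same two regexes to each content field could look like:

import json
import re

def clean_message_contents(input_file, output_file):
    # Apply the whitespace and non-ASCII filters to each message's content
    # field, leaving the surrounding JSON structure intact.
    with open(input_file, 'r', encoding='utf-8') as f_in, \
         open(output_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            data = json.loads(line)
            for message in data['messages']:
                content = message['content']
                content = re.sub(r'\s+', ' ', content)            # collapse whitespace
                content = re.sub(r'[^\x00-\x7F]+', '', content)   # drop non-ASCII
                message['content'] = content.strip()
            f_out.write(json.dumps(data) + '\n')

# Hypothetical usage
# clean_message_contents('all.jsonl', 'all_content_cleaned.jsonl')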

Remove data whose duplication rate exceeds 50% (across ids) – task6.py

import jsonlines
import json
from collections import Counter

def count_message_occurrences(input_file):
    message_counts = Counter()
    total_messages = 0

    with jsonlines.open(input_file) as reader:
        for obj in reader:
            messages = obj['messages']
            last_message = json.dumps(messages[-1]['content'])  # serialize the last message's content to a string
            message_counts[last_message] += 1
            total_messages += 1

    return message_counts, total_messages

def remove_high_duplicates(input_file, output_file, threshold):
    message_counts, total_messages = count_message_occurrences(input_file)
    unique_messages = set()

    for message, count in message_counts.items():
        if count / total_messages <= threshold:
            unique_messages.add(message)

    with jsonlines.open(input_file) as reader, jsonlines.open(output_file, 'w') as writer:
        for obj in reader:
            messages = obj['messages']
            last_message = json.dumps(messages[-1]['content'])  # serialize the last message's content to a string
            if last_message in unique_messages:
                writer.write(obj)

input_file = 'all_cleaned_file.jsonl'
output_file = 'all_cleaned_cleaned_file.jsonl'
threshold = 0.5
remove_high_duplicates(input_file, output_file, threshold)

Concatenate messages into long conversations. 1024 tokens is roughly 1024×8 characters, and since the JSON keys also count toward the length, max_chars is set to 1024×10 – task7.py

import json

def merge_messages(input_file, output_file, max_chars=10240):
    merged_data = {"id": None, "messages": []}
    current_chars = 0

    with open(input_file, 'r') as f_in, open(output_file, 'w') as f_out:
        for line in f_in:
            data = json.loads(line)
            data_id = data['id']
            messages = data['messages']

            if merged_data['id'] is None:
                merged_data['id'] = data_id

            for message in messages:
                content = message['content']
                content_length = len(content)

                # Flush the current conversation once adding this message
                # would push it past the character budget
                if current_chars + content_length > max_chars:
                    f_out.write(json.dumps(merged_data) + '\n')
                    merged_data['id'] = data_id
                    merged_data['messages'] = []
                    current_chars = 0

                merged_data['messages'].append(message)
                current_chars += content_length

        # Write whatever is left after the last input line
        f_out.write(json.dumps(merged_data) + '\n')

input_file = 'all_cleaned_cleaned_file.jsonl'
output_file = 'all_concat_1024.jsonl'

# Run the merge
merge_messages(input_file, output_file)

In the merged data, many messages with role "system" repeat the same text, so remove the duplicate message dicts – task8.py

import json

def remove_duplicates(input_file, output_file):
    with open(input_file, 'r') as f_in, open(output_file, 'w') as f_out:
        for line in f_in:
            data = json.loads(line)
            data_id = data['id']
            messages = data['messages']

            # Use a set to drop duplicate message dicts
            # (note: a set does not preserve the original message order)
            unique_messages = list({json.dumps(msg) for msg in messages})

            # Put the deduplicated messages back into the record
            data['messages'] = [json.loads(msg) for msg in unique_messages]

            # Write the processed record to the output file
            f_out.write(json.dumps(data) + '\n')

# Input and output file paths
input_file = 'all_concat_1024.jsonl'
output_file = 'all_concat_1024_test.jsonl'

# Remove the duplicate message dicts
remove_duplicates(input_file, output_file)

The code above has a problem and the output looks odd, most likely because the set-based deduplication scrambles the original message order. Later I decided this step is probably redundant anyway: the system messages all declare the role, which matters and should not be deleted.
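For reference, if this dedup step were kept, an order-preserving variant (my own sketch, not part of the original pipeline) could use dict.fromkeys, which keeps the first occurrence of each message in its original position:

import json

def remove_duplicates_keep_order(input_file, output_file):
    with open(input_file, 'r') as f_in, open(output_file, 'w') as f_out:
        for line in f_in:
            data = json.loads(line)
            # dict.fromkeys preserves insertion order, so the first
            # occurrence of each message stays where it originally was
            unique = dict.fromkeys(json.dumps(msg) for msg in data['messages'])
            data['messages'] = [json.loads(msg) for msg in unique]
            f_out.write(json.dumps(data) + '\n')

# Hypothetical usage with the same file names as task8.py
# remove_duplicates_keep_order('all_concat_1024.jsonl', 'all_concat_1024_test.jsonl')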

Keep only one system message per conversation and move it to the first position

import jsonlines

# Read the original JSONL file
with jsonlines.open('all_concat_1024.jsonl', 'r') as reader:
    # Create a new JSONL file for the processed data
    with jsonlines.open('all_concat_1024_final2.jsonl', 'w') as writer:
        # Iterate over every record in the original data
        for data in reader:
            # Get the list of messages
            messages = data['messages']

            # Find the first message whose "role" is "system"
            system_message = next((msg for msg in messages if msg['role'] == 'system'), None)

            if system_message:
                # Move that system message to the front of the list
                messages.remove(system_message)
                messages.insert(0, system_message)

            # Write the processed record to the new JSONL file
            writer.write(data)
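Note that the snippet above only moves the first system message to the front; any further system messages later in the conversation are left in place. If the heading's goal of keeping exactly one system message is strictly required, a variant like the following (my own sketch, reusing the same file names, not part of the original pipeline) would drop the extras:

import jsonlines

with jsonlines.open('all_concat_1024.jsonl', 'r') as reader, \
     jsonlines.open('all_concat_1024_final2.jsonl', 'w') as writer:
    for data in reader:
        messages = data['messages']
        # Keep only the first system message, then append all non-system messages
        system_messages = [msg for msg in messages if msg['role'] == 'system']
        others = [msg for msg in messages if msg['role'] != 'system']
        data['messages'] = system_messages[:1] + others
        writer.write(data)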

The resulting processed JSONL dataset:

2222 conversations in total; each record is a long dialogue of 10240+ characters, where the first message is a system message that fixes the role, followed by user and assistant turns.
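To sanity-check the final file, a small inspection script (a sketch, assuming the final output is all_concat_1024_final2.jsonl) could verify the record count, the per-record length, and that the first message is a system message:

import json

path = 'all_concat_1024_final2.jsonl'  # assumed final output file
count = 0
with open(path, 'r', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        messages = record['messages']
        total_chars = sum(len(m['content']) for m in messages)
        first_role = messages[0]['role'] if messages else None
        # Expect roughly 10240+ characters and a leading system message
        if first_role != 'system':
            print(f"record {record['id']}: first role is {first_role}")
        count += 1
print(f"{count} conversations")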

That's all for today.