File size: 1,144 Bytes
e3e3f87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import json
from copy import deepcopy

# Load the raw dialogue corpus into memory in one go. Each record is later
# indexed by its "input" key, which holds one "</s>"-delimited dialogue.
with open("./QiaoBan/data/child_chat_data.json", mode="r", encoding="utf-8") as fr:
    raw_train_data = json.loads(fr.read())

# Half-width and full-width colons that may separate a speaker label from the
# spoken text. The full-width form is written as an escape so it cannot be
# silently folded into the ASCII colon by editors or normalisation tools.
_SPEAKER_COLONS = (":", "\uff1a")


def _strip_speaker_prefix(utterance):
    """Return *utterance* without its leading ``speaker:`` label.

    Only the first colon (half- or full-width) is treated as the label
    separator, so colons that appear inside the message are preserved.
    An utterance that contains no colon is returned unchanged.
    """
    hits = [pos for pos in map(utterance.find, _SPEAKER_COLONS) if pos != -1]
    return utterance[min(hits) + 1:] if hits else utterance


def construct_dialog_sample(dialog):
    """Split one multi-turn dialogue into per-turn training samples.

    Args:
        dialog: A string of utterances separated by ``</s>``. Utterances are
            consumed in pairs: the first of each pair is taken as the user
            turn and the second as the assistant turn. Each utterance may
            start with a ``speaker:`` label, which is removed.

    Returns:
        A list with one dict per (user, assistant) pair, in order. Each dict
        has the keys ``"prompt"`` (user text), ``"response"`` (assistant
        text) and ``"history"`` (a list of ``[user, assistant]`` pairs for
        all earlier turns of the same dialogue). An empty or blank dialogue
        yields an empty list.
    """
    turns = dialog.split("</s>")
    # A dialogue that ends with "</s>" (or is empty) leaves blank trailing
    # pieces after the split; drop them so they are not mistaken for turns.
    while turns and not turns[-1].strip():
        turns.pop()
    # If the dialogue ends on a user turn, pad it with a filler assistant
    # reply so that every prompt has a response.
    if len(turns) % 2 == 1:
        turns.append("智能助手:嗯嗯。")
    chat_data = []
    history = []
    for user_raw, assistant_raw in zip(turns[::2], turns[1::2]):
        user = _strip_speaker_prefix(user_raw)
        assistant = _strip_speaker_prefix(assistant_raw)
        chat_data.append({
            "prompt": user,
            "response": assistant,
            # Snapshot the history so later appends do not leak into
            # samples that were already emitted.
            "history": deepcopy(history),
        })
        history.append([user, assistant])
    return chat_data

# Flatten every dialogue into per-turn samples and write them out as one JSON
# object per line. ensure_ascii=False keeps non-ASCII text readable in the
# output instead of emitting \uXXXX escapes.
with open("chat_train_data.json", "w", encoding="utf-8") as fw:
    for record in raw_train_data:
        for turn_sample in construct_dialog_sample(record["input"]):
            fw.write(json.dumps(turn_sample, ensure_ascii=False) + "\n")