Commit
·
40d90bf
1
Parent(s):
a547042
add data
Browse files- .gitattributes +3 -0
- data/alpaca_data_zh_51k-clean.json +3 -0
- data/alpaca_gpt4_data_zh-clean.json +3 -0
- data/read_data.py +45 -0
- data/sharegpt-70k.json +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/alpaca_data_zh_51k-clean.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
data/alpaca_gpt4_data_zh-clean.json filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
data/sharegpt-70k.json filter=lfs diff=lfs merge=lfs -text
|
data/alpaca_data_zh_51k-clean.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f1c1962ed88f95f87ecbe70addd816fa3ade0ee5494a220a3c4972429e7cf111
|
| 3 |
+
size 18810090
|
data/alpaca_gpt4_data_zh-clean.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:951f1331cacabc7b5de2a5d72592a103be0676daba8d92ae7c67b061639e0f46
|
| 3 |
+
size 35100511
|
data/read_data.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from tqdm import tqdm
|
| 3 |
+
|
| 4 |
+
# Input: a JSON Lines file. Each line is parsed as an object whose
# 'conversation' entry is a list of turns with 'human' and 'assistant' keys.
jsonl_file_path = 'common_zh_70k.jsonl'


def _to_record(conversation):
    """Convert one multi-turn conversation into an instruction record.

    The last turn becomes the instruction/output pair; every earlier turn is
    kept, in order, as a ``[human, assistant]`` pair under ``"history"``.
    A single-turn conversation therefore yields an empty history.

    Args:
        conversation: Non-empty list of mappings, each with 'human' and
            'assistant' keys.

    Returns:
        A dict with "instruction", "input", "output" and "history" keys.

    Raises:
        KeyError: If a turn lacks the 'human' or 'assistant' key.
    """
    final_turn = conversation[-1]
    return {
        "instruction": str(final_turn['human']),
        "input": "",
        "output": str(final_turn['assistant']),
        "history": [
            [str(turn['human']), str(turn['assistant'])]
            for turn in conversation[:-1]
        ],
    }


results = []
# Open the JSON Lines file.
with open(jsonl_file_path, 'r', encoding='utf-8') as file:
    # Read the file line by line; tqdm only reports progress.
    for line in tqdm(file):
        line = line.strip()
        if not line:
            # A blank line is not valid JSON: json.loads('') would raise and
            # abort the whole conversion, so skip it instead.
            continue

        # Parse one JSON line.
        json_object = json.loads(line)
        conversation = json_object['conversation']

        # Conversations without any turn produce no record.
        if conversation:
            results.append(_to_record(conversation))

# Write all records as one pretty-printed JSON array, keeping non-ASCII
# characters unescaped.
with open('./sharegpt-70k.json', 'w', encoding="utf-8") as f1:
    json.dump(results, f1, ensure_ascii=False, indent=4)
|
data/sharegpt-70k.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:32bc6e7016fbdab5ee97a97bfb275246a5514b1326d8abfd71f1307b64e9ea8f
|
| 3 |
+
size 287978587
|