yuccaaa committed on
Commit
0ea0810
·
verified ·
1 Parent(s): 41e2de4

Upload pretrain_data/instruct/1.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. pretrain_data/instruct/1.py +45 -0
pretrain_data/instruct/1.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+
4
# Seed the RNG so the shuffle (and therefore the split) is reproducible.
random.seed(42)

# Load the source JSONL file: one JSON object per line.
input_file = "alpaca-gpt4.jsonl"  # replace with your own path
with open(input_file, "r", encoding="utf-8") as f:
    # Skip blank / whitespace-only lines (e.g. a trailing empty line at EOF);
    # json.loads would raise on them.
    data = [json.loads(line) for line in f if line.strip()]

# Shuffle the records in place.
random.shuffle(data)

# Reformat every record as {"text": instruction + input, "result": output}.
processed = []
for item in data:
    # `or ""` covers both a missing key and a key whose value is JSON null
    # (None), which would otherwise break the .strip() calls below.
    instruction = item.get("instruction") or ""
    input_text = item.get("input") or ""
    output_text = item.get("output") or ""
    # Only join with a newline when there is an input to append.
    text = instruction + "\n" + input_text if input_text else instruction
    processed.append({
        "text": text.strip(),
        "result": output_text.strip(),
    })

# Split into 80% train / 10% validation / remainder test.
n = len(processed)
n_train = int(n * 0.8)
n_valid = int(n * 0.1)

train_data = processed[:n_train]
valid_data = processed[n_train:n_train + n_valid]
# The test split takes everything that is left, so the three parts always
# add up to n regardless of how the int() truncation rounds.
test_data = processed[n_train + n_valid:]
36
+
37
# Write the three files.
def write_jsonl(filename, dataset):
    """Write *dataset* to *filename* in JSON Lines format.

    Each item is serialized with ``json.dumps`` and written on its own line.
    The file is created, or overwritten if it already exists.

    Args:
        filename: Path of the output file.
        dataset: Iterable of JSON-serializable items.
    """
    # newline="\n" turns off platform newline translation, so the output is
    # byte-identical on every OS (no "\r\n" line endings on Windows).
    with open(filename, "w", encoding="utf-8", newline="\n") as f:
        for item in dataset:
            # ensure_ascii=False keeps non-ASCII text as-is instead of
            # escaping it to \uXXXX sequences.
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
42
+
43
# Emit each split to its own file, in train / valid / test order.
split_outputs = (
    ("alpaca-gpt4-train.jsonl", train_data),
    ("alpaca-gpt4-valid.jsonl", valid_data),
    ("alpaca-gpt4-test.jsonl", test_data),
)
for out_path, split_rows in split_outputs:
    write_jsonl(out_path, split_rows)