"""Shuffle an Alpaca-style JSONL dataset and split it into train/valid/test.

Each input record is expected to be a JSON object with the keys
``instruction``, ``input`` and ``output``.  Every record is rewritten as
``{"text": instruction [+ "\n" + input], "result": output}``, the records are
shuffled with a fixed seed, and the result is written to three JSONL files
(80% train, 10% validation, remainder test).
"""

import json
import random

# Fixed seed so the shuffle (and therefore the split) is reproducible.
SEED = 42

# Path of the original JSONL file; replace with your own path.
input_file = "alpaca-gpt4.jsonl"

# Fractions of the data assigned to train / validation; the rest is test.
TRAIN_RATIO = 0.8
VALID_RATIO = 0.1


def read_jsonl(path):
    """Return the list of JSON objects stored one per line in *path*.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of raising ``json.JSONDecodeError``.
    Malformed non-blank lines still raise ``json.JSONDecodeError``.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def format_record(item):
    """Convert one raw record into ``{"text": ..., "result": ...}``.

    ``text`` is the instruction, followed by a newline and the input when the
    input is non-empty; ``result`` is the output.  Both are stripped of
    surrounding whitespace.  Missing keys and explicit ``null`` values are
    treated as empty strings.
    """
    # `or ""` (rather than a .get default) also covers keys present as null.
    instruction = item.get("instruction") or ""
    input_text = item.get("input") or ""
    output_text = item.get("output") or ""

    text = instruction + "\n" + input_text if input_text else instruction
    return {"text": text.strip(), "result": output_text.strip()}


def split_dataset(records, train_ratio=TRAIN_RATIO, valid_ratio=VALID_RATIO):
    """Split *records* into ``(train, valid, test)`` lists.

    The train and validation sizes are ``int(len(records) * ratio)``; the
    test split receives every remaining record, so the three parts always
    add up to the full dataset.

    Raises:
        ValueError: if a ratio is negative or the two ratios sum to more
            than 1.
    """
    if train_ratio < 0 or valid_ratio < 0 or train_ratio + valid_ratio > 1:
        raise ValueError(
            "train_ratio and valid_ratio must be non-negative and sum to at most 1"
        )

    n = len(records)
    n_train = int(n * train_ratio)
    n_valid = int(n * valid_ratio)

    train_data = records[:n_train]
    valid_data = records[n_train:n_train + n_valid]
    test_data = records[n_train + n_valid:]
    return train_data, valid_data, test_data


def write_jsonl(filename, dataset):
    """Write each item of *dataset* to *filename* as one JSON object per line.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) and the
    file is encoded as UTF-8.
    """
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in dataset
        )


def main():
    """Read, shuffle, reformat and split the dataset, then write the files."""
    data = read_jsonl(input_file)

    # Seed the global generator right before shuffling for reproducibility.
    random.seed(SEED)
    random.shuffle(data)

    processed = [format_record(item) for item in data]
    train_data, valid_data, test_data = split_dataset(processed)

    write_jsonl("alpaca-gpt4-train.jsonl", train_data)
    write_jsonl("alpaca-gpt4-valid.jsonl", valid_data)
    write_jsonl("alpaca-gpt4-test.jsonl", test_data)


if __name__ == "__main__":
    main()