import json
import random

# Seed the global RNG with a fixed value so that the shuffle performed later
# in this script (and therefore the train/valid/test split) is reproducible.
random.seed(42)

# Load the source dataset: a JSON Lines file, one JSON value per line.
input_file = "alpaca-gpt4.jsonl"
with open(input_file, "r", encoding="utf-8") as f:
    # Skip blank / whitespace-only lines so that a stray empty line in the
    # file does not raise json.JSONDecodeError.
    data = [json.loads(line) for line in f if line.strip()]

# Shuffle the records in place before splitting; the order is determined by
# the global RNG state.
random.shuffle(data)

# Convert each raw record into a {"text", "result"} pair.
processed = []
for item in data:
    # Missing keys fall back to the empty string.
    instruction = item.get("instruction", "")
    input_text = item.get("input", "")
    output_text = item.get("output", "")
    # When an input is present it is appended to the instruction on its own
    # line; otherwise the instruction alone is used as the text.
    text = (instruction + "\n" + input_text) if input_text else instruction
    processed.append({
        "text": text.strip(),
        "result": output_text.strip(),
    })

# Split sizes: 80% train, 10% valid. int() truncates, so the test split takes
# whatever remains and the three sizes always sum to n.
n = len(processed)
n_train = int(n * 0.8)
n_valid = int(n * 0.1)
n_test = n - n_train - n_valid

# Consecutive, non-overlapping slices of the processed list.
train_data = processed[:n_train]
valid_data = processed[n_train:n_train + n_valid]
test_data = processed[n_train + n_valid:]

def write_jsonl(filename, dataset):
    """Write *dataset* to *filename* in JSON Lines format.

    Each item is serialized with ``json.dumps`` and written on its own line,
    terminated by a newline. The file is opened in ``"w"`` mode, so any
    existing content is replaced, and is encoded as UTF-8.

    Args:
        filename: Path of the file to write.
        dataset: Iterable of JSON-serializable items.
    """
    with open(filename, "w", encoding="utf-8") as f:
        for item in dataset:
            # ensure_ascii=False keeps non-ASCII characters as-is instead of
            # emitting \uXXXX escapes.
            f.write(json.dumps(item, ensure_ascii=False) + "\n")


# Write each split to its own JSON Lines file in the working directory.
write_jsonl("alpaca-gpt4-train.jsonl", train_data)
write_jsonl("alpaca-gpt4-valid.jsonl", valid_data)
write_jsonl("alpaca-gpt4-test.jsonl", test_data)