File size: 2,733 Bytes
799d677
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import random
import json
from pathlib import Path
from argparse import ArgumentParser


# System prompt (Japanese) attached to every converted sample.
NO_INPUT_PROMPT: str = "以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。"

# File stems (without ".json") read from --ichikara-dir, in this order.
_ICHIKARA_FILENAMES: tuple = (
    "ichikara-instruction-003-001-1",
    "ichikara-instruction-003-003-1",
)
# File name read from --answer-carefully-dir.
_ANSWER_CAREFULLY_FILENAME: str = "AnswerCarefullyVersion002_Dev.json"
# Seed applied to the global RNG before each shuffle so output order is reproducible.
_SHUFFLE_SEED: int = 42


def _to_chat_sample(record: dict) -> dict:
    """Convert one raw record into the chat-style sample written to JSONL.

    Args:
        record: A mapping providing the keys ``"ID"``, ``"text"`` and ``"output"``.

    Returns:
        A dict with the record's ``"ID"`` and a three-turn ``"messages"`` list
        (system prompt, user text, assistant output).

    Raises:
        KeyError: If any of the three required keys is missing.
    """
    return {
        "ID": record["ID"],
        "messages": [
            {"role": "system", "content": NO_INPUT_PROMPT},
            {"role": "user", "content": record["text"]},
            {"role": "assistant", "content": record["output"]},
        ],
    }


def _load_chat_samples(json_path: Path) -> list[dict]:
    """Read a JSON file of records and convert every record to a chat sample."""
    with json_path.open(mode="r", encoding="utf-8") as f:
        records = json.load(f)
    return [_to_chat_sample(record) for record in records]


def _shuffle_and_write_jsonl(samples: list[dict], jsonl_path: Path) -> None:
    """Shuffle ``samples`` in place with a fixed seed and write them as JSONL.

    The global ``random`` generator is re-seeded before every shuffle, so each
    output file's order depends only on its own contents.
    """
    random.seed(_SHUFFLE_SEED)
    random.shuffle(samples)
    with jsonl_path.open("w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        f.writelines(
            json.dumps(sample, ensure_ascii=False) + "\n" for sample in samples
        )


def main(argv: "list[str] | None" = None) -> None:
    """Convert the ichikara and AnswerCarefully JSON files to shuffled JSONL.

    Reads the two ichikara instruction files and the AnswerCarefully dev file,
    wraps every record in a system/user/assistant message list, shuffles each
    dataset with a fixed seed, and writes ``ichikara.jsonl`` and
    ``answer_carefully.jsonl`` into the output directory.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Raises:
        FileNotFoundError: If an expected input JSON file does not exist.
        KeyError: If a record lacks ``"ID"``, ``"text"`` or ``"output"``.
    """
    parser = ArgumentParser()
    parser.add_argument("--ichikara-dir", type=str, required=True)
    parser.add_argument("--answer-carefully-dir", type=str, required=True)
    parser.add_argument("--output-dir", type=str, required=True)
    args = parser.parse_args(argv)

    ichikara_dir = Path(args.ichikara_dir)
    answer_carefully_dir = Path(args.answer_carefully_dir)
    output_dir = Path(args.output_dir)
    # Create the destination up front so a fresh or nested output path does
    # not make the first write fail.
    output_dir.mkdir(parents=True, exist_ok=True)

    ichikara_samples: list[dict] = []
    for ichikara_filename in _ICHIKARA_FILENAMES:
        ichikara_filepath = ichikara_dir / f"{ichikara_filename}.json"
        print(ichikara_filepath)
        ichikara_samples.extend(_load_chat_samples(ichikara_filepath))
    _shuffle_and_write_jsonl(ichikara_samples, output_dir / "ichikara.jsonl")

    answer_carefully_samples = _load_chat_samples(
        answer_carefully_dir / _ANSWER_CAREFULLY_FILENAME
    )
    _shuffle_and_write_jsonl(
        answer_carefully_samples, output_dir / "answer_carefully.jsonl"
    )




# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()