File size: 2,982 Bytes
f3efd06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""

一键生成「字级对齐」JSONL,供上传 Hugging Face Dataset。



每行字段与 prepare_char_jsonl.py 一致:

  source, target, char_labels, n_chars

  (分层验证另含 task、corpus,需 --keep_metadata)



输出目录:data/hf_aligned/



用法(须使用项目约定 Python,见 .cursor/rules/python-env.mdc):

  python build_hf_aligned_jsonl.py

  python build_hf_aligned_jsonl.py --full-train   # 全量 cec_train,耗时长、体积大

"""
from __future__ import annotations

import argparse
import subprocess
import sys
from pathlib import Path

# Directory containing this script; subprocesses are launched with it as cwd.
ROOT = Path(__file__).resolve().parent
# Converter script invoked once per dataset split.
PREPARE = ROOT.joinpath("prepare_char_jsonl.py")
# Directory that receives every generated JSONL file.
OUT_DIR = ROOT.joinpath("data", "hf_aligned")


def run(cmd: list[str]) -> None:
    """Echo *cmd*, execute it from the project root, and abort on failure.

    Args:
        cmd: Program and arguments, passed to ``subprocess.run`` as a list
            (no shell is involved).

    Raises:
        SystemExit: With the child's return code when that code is non-zero.
    """
    # Flush so the echoed command is written before the child's own output
    # even when stdout is block-buffered (redirected to a file or a pipe).
    print("+", " ".join(cmd), flush=True)
    result = subprocess.run(cmd, cwd=str(ROOT))
    if result.returncode != 0:
        raise SystemExit(result.returncode)


def main() -> None:
    """Parse CLI options and run prepare_char_jsonl.py once per dataset split.

    Always converts the stratified validation set (with ``--keep_metadata``)
    and the plain validation set. The training set is converted either in
    full (``--full-train``) or limited to ``--train-sample`` rows via the
    converter's ``--limit`` option. All paths handed to the converter are
    relative strings; ``run`` executes the converter with ROOT as its
    working directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--full-train",
        action="store_true",
        help="同时导出全量 cec_train 对齐到 cec_train_char.jsonl(体积大,慎用)",
    )
    parser.add_argument(
        "--train-sample",
        type=int,
        default=50_000,
        help="非 --full-train 时,训练集抽样行数(默认 50000)",
    )
    opts = parser.parse_args()

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # Every invocation shares the same interpreter + script prefix.
    prefix = [sys.executable, str(PREPARE)]

    def convert(src: str, dst: str, *extra: str) -> None:
        # One converter call: input file, output file, then optional flags.
        run([*prefix, "--in_jsonl", src, "--out_jsonl", dst, *extra])

    convert(
        "data/val_task_stratified.jsonl",
        "data/hf_aligned/val_task_stratified_char.jsonl",
        "--keep_metadata",
    )
    convert(
        "data/cec_validation.jsonl",
        "data/hf_aligned/cec_validation_char.jsonl",
    )

    if opts.full_train:
        convert(
            "data/cec_train.jsonl",
            "data/hf_aligned/cec_train_char.jsonl",
        )
    else:
        rows = opts.train_sample
        # The default sample size gets the short "50k" tag; any other size
        # is spelled out in full in the file name.
        tag = "50k" if rows == 50_000 else str(rows)
        convert(
            "data/cec_train.jsonl",
            f"data/hf_aligned/cec_train_char_sample_{tag}.jsonl",
            "--limit",
            str(rows),
        )

    print("\n完成。上传 Hugging Face Dataset 时建议使用目录:", OUT_DIR.resolve())
    print("说明见:data/hf_aligned/README.md")


if __name__ == "__main__":
    main()