"""
One-shot generation of "character-level aligned" JSONL files for upload as a
Hugging Face Dataset.

Each line carries the same fields as prepare_char_jsonl.py:
    source, target, char_labels, n_chars
(the task-stratified validation set additionally contains task and corpus,
which requires --keep_metadata)

Output directory: data/hf_aligned/

Usage (must use the project's designated Python, see
.cursor/rules/python-env.mdc):
    python build_hf_aligned_jsonl.py
    python build_hf_aligned_jsonl.py --full-train  # full cec_train; slow, large output
"""
from __future__ import annotations
import argparse
import subprocess
import sys
from pathlib import Path
# Directory containing this script; the other paths below are derived from it.
ROOT = Path(__file__).resolve().parents[0]
# Sibling script that main() invokes once per exported split.
PREPARE = ROOT.joinpath("prepare_char_jsonl.py")
# Directory created by main() and reported as the upload location.
OUT_DIR = ROOT.joinpath("data", "hf_aligned")
def run(cmd: list[str]) -> None:
    """Echo *cmd*, execute it from the project root, and abort on failure.

    Args:
        cmd: Argument vector handed to ``subprocess.run`` as a list (no shell).

    Raises:
        SystemExit: Carrying the child's return code when it is non-zero.
    """
    # Flush explicitly: the child writes straight to the inherited stdout,
    # so when our stdout is block-buffered (redirected to a file or pipe)
    # an unflushed echo would show up *after* the child's output.
    print("+", " ".join(cmd), flush=True)
    r = subprocess.run(cmd, cwd=str(ROOT))
    if r.returncode != 0:
        raise SystemExit(r.returncode)
def main() -> None:
    """Parse the CLI options and run every export through ``run``.

    ``prepare_char_jsonl.py`` is invoked once per split, in this order: the
    task-stratified validation set (with ``--keep_metadata``), the CEC
    validation set, and finally the training set. With ``--full-train`` the
    training set is exported without extra flags; otherwise the value of
    ``--train-sample`` is forwarded as ``--limit`` and encoded in the output
    file name.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--full-train",
        action="store_true",
        help="同时导出全量 cec_train 对齐到 cec_train_char.jsonl(体积大,慎用)",
    )
    parser.add_argument(
        "--train-sample",
        type=int,
        default=50_000,
        help="非 --full-train 时,训练集抽样行数(默认 50000)",
    )
    opts = parser.parse_args()

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # Each job is (input JSONL, output JSONL, extra flags); paths are relative
    # because run() executes the child with the project root as its cwd.
    jobs = [
        (
            "data/val_task_stratified.jsonl",
            "data/hf_aligned/val_task_stratified_char.jsonl",
            ["--keep_metadata"],
        ),
        (
            "data/cec_validation.jsonl",
            "data/hf_aligned/cec_validation_char.jsonl",
            [],
        ),
    ]

    if opts.full_train:
        jobs.append(
            (
                "data/cec_train.jsonl",
                "data/hf_aligned/cec_train_char_sample_placeholder.jsonl"
                if False
                else "data/hf_aligned/cec_train_char.jsonl",
                [],
            )
        )
    else:
        sample_size = opts.train_sample
        # The default sample keeps its short "50k" tag in the file name.
        tag = "50k" if sample_size == 50_000 else str(sample_size)
        jobs.append(
            (
                "data/cec_train.jsonl",
                f"data/hf_aligned/cec_train_char_sample_{tag}.jsonl",
                ["--limit", str(sample_size)],
            )
        )

    base_cmd = [sys.executable, str(PREPARE)]
    for src, dst, extra in jobs:
        run([*base_cmd, "--in_jsonl", src, "--out_jsonl", dst, *extra])

    print("\n完成。上传 Hugging Face Dataset 时建议使用目录:", OUT_DIR.resolve())
    print("说明见:data/hf_aligned/README.md")
# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|