""" 一键生成「字级对齐」JSONL,供上传 Hugging Face Dataset。 每行字段与 prepare_char_jsonl.py 一致: source, target, char_labels, n_chars (分层验证另含 task、corpus,需 --keep_metadata) 输出目录:data/hf_aligned/ 用法(须使用项目约定 Python,见 .cursor/rules/python-env.mdc): python build_hf_aligned_jsonl.py python build_hf_aligned_jsonl.py --full-train # 全量 cec_train,耗时长、体积大 """ from __future__ import annotations import argparse import subprocess import sys from pathlib import Path ROOT = Path(__file__).resolve().parent PREPARE = ROOT / "prepare_char_jsonl.py" OUT_DIR = ROOT / "data" / "hf_aligned" def run(cmd: list[str]) -> None: print("+", " ".join(cmd)) r = subprocess.run(cmd, cwd=str(ROOT)) if r.returncode != 0: raise SystemExit(r.returncode) def main() -> None: ap = argparse.ArgumentParser() ap.add_argument( "--full-train", action="store_true", help="同时导出全量 cec_train 对齐到 cec_train_char.jsonl(体积大,慎用)", ) ap.add_argument( "--train-sample", type=int, default=50_000, help="非 --full-train 时,训练集抽样行数(默认 50000)", ) args = ap.parse_args() py = sys.executable OUT_DIR.mkdir(parents=True, exist_ok=True) run( [ py, str(PREPARE), "--in_jsonl", "data/val_task_stratified.jsonl", "--out_jsonl", "data/hf_aligned/val_task_stratified_char.jsonl", "--keep_metadata", ] ) run( [ py, str(PREPARE), "--in_jsonl", "data/cec_validation.jsonl", "--out_jsonl", "data/hf_aligned/cec_validation_char.jsonl", ] ) if args.full_train: run( [ py, str(PREPARE), "--in_jsonl", "data/cec_train.jsonl", "--out_jsonl", "data/hf_aligned/cec_train_char.jsonl", ] ) else: if args.train_sample == 50_000: train_out = "data/hf_aligned/cec_train_char_sample_50k.jsonl" else: train_out = f"data/hf_aligned/cec_train_char_sample_{args.train_sample}.jsonl" run( [ py, str(PREPARE), "--in_jsonl", "data/cec_train.jsonl", "--out_jsonl", train_out, "--limit", str(args.train_sample), ] ) print("\n完成。上传 Hugging Face Dataset 时建议使用目录:", OUT_DIR.resolve()) print("说明见:data/hf_aligned/README.md") if __name__ == "__main__": main()