| """ | |
Generate the "character-level aligned" JSONL files in one step, ready to upload
as a Hugging Face Dataset.

Each row carries the same fields as prepare_char_jsonl.py produces:
    source, target, char_labels, n_chars
(the stratified validation set additionally carries task and corpus, which
requires --keep_metadata)

Output directory: data/hf_aligned/

Usage (must use the project's designated Python, see .cursor/rules/python-env.mdc):
    python build_hf_aligned_jsonl.py
    python build_hf_aligned_jsonl.py --full-train  # full cec_train; slow and large
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import subprocess | |
| import sys | |
| from pathlib import Path | |
# Directory containing this script; subprocesses are launched from here.
ROOT = Path(__file__).resolve().parents[0]
# The per-file conversion script that this driver invokes.
PREPARE = ROOT.joinpath("prepare_char_jsonl.py")
# Destination directory for every generated JSONL file.
OUT_DIR = ROOT.joinpath("data", "hf_aligned")
def run(cmd: list[str]) -> None:
    """Echo ``cmd`` and execute it from the project root, aborting on failure.

    Args:
        cmd: Argument vector passed to ``subprocess.run`` unchanged (no shell).

    Raises:
        SystemExit: Carrying the child's return code when it is non-zero.
    """
    # Flush the echo before the child starts: when stdout is a pipe or a file
    # Python block-buffers it, while the child writes directly to the
    # descriptor, so the "+ ..." line would otherwise show up after the output
    # of the very command it announces.
    print("+", " ".join(cmd), flush=True)
    result = subprocess.run(cmd, cwd=str(ROOT))
    if result.returncode != 0:
        raise SystemExit(result.returncode)
def main() -> None:
    """Parse the CLI options and run prepare_char_jsonl.py once per dataset.

    Three conversions are launched in order, each through ``run``:

    1. ``data/val_task_stratified.jsonl`` with ``--keep_metadata``.
    2. ``data/cec_validation.jsonl``.
    3. ``data/cec_train.jsonl`` -- in full when ``--full-train`` is given,
       otherwise with ``--limit`` set to the ``--train-sample`` value.

    The output directory is created beforehand if it does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--full-train",
        action="store_true",
        help="同时导出全量 cec_train 对齐到 cec_train_char.jsonl(体积大,慎用)",
    )
    parser.add_argument(
        "--train-sample",
        type=int,
        default=50_000,
        help="非 --full-train 时,训练集抽样行数(默认 50000)",
    )
    opts = parser.parse_args()

    def prepare(src: str, dst: str, *extra: str) -> None:
        # One invocation of the conversion script with the current interpreter.
        run(
            [
                sys.executable,
                str(PREPARE),
                "--in_jsonl",
                src,
                "--out_jsonl",
                dst,
                *extra,
            ]
        )

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    prepare(
        "data/val_task_stratified.jsonl",
        "data/hf_aligned/val_task_stratified_char.jsonl",
        "--keep_metadata",
    )
    prepare(
        "data/cec_validation.jsonl",
        "data/hf_aligned/cec_validation_char.jsonl",
    )

    if not opts.full_train:
        # The default sample size keeps its short "50k" file name; any other
        # size is spelled out in full.
        tag = "50k" if opts.train_sample == 50_000 else str(opts.train_sample)
        prepare(
            "data/cec_train.jsonl",
            f"data/hf_aligned/cec_train_char_sample_{tag}.jsonl",
            "--limit",
            str(opts.train_sample),
        )
    else:
        prepare(
            "data/cec_train.jsonl",
            "data/hf_aligned/cec_train_char.jsonl",
        )

    print("\n完成。上传 Hugging Face Dataset 时建议使用目录:", OUT_DIR.resolve())
    print("说明见:data/hf_aligned/README.md")


if __name__ == "__main__":
    main()