ChineseErrorDetectorElectra / build_hf_aligned_jsonl.py

Upload build_hf_aligned_jsonl.py

f3efd06 verified 15 days ago

2.98 kB

	"""
	一键生成「字级对齐」JSONL，供上传 Hugging Face Dataset。

	每行字段与 prepare_char_jsonl.py 一致：
	source, target, char_labels, n_chars
	（分层验证另含 task、corpus，需 --keep_metadata）

	输出目录：data/hf_aligned/

	用法（须使用项目约定 Python，见 .cursor/rules/python-env.mdc）：
	python build_hf_aligned_jsonl.py
	python build_hf_aligned_jsonl.py --full-train # 全量 cec_train，耗时长、体积大
	"""
	from __future__ import annotations

	import argparse
	import subprocess
	import sys
	from pathlib import Path

	# Directory containing this script; run() uses it as the working directory
	# for every subprocess, so the relative "data/..." paths resolve against it.
	ROOT = Path(__file__).resolve().parent
	# Script that main() launches (via run) once per exported JSONL file.
	PREPARE = ROOT / "prepare_char_jsonl.py"
	# Output directory created by main() and reported as the upload directory.
	OUT_DIR = ROOT / "data" / "hf_aligned"


	def run(cmd: list[str]) -> None:
	    """Echo *cmd*, execute it with ROOT as working directory, abort on failure.

	    Args:
	        cmd: Argument vector handed to ``subprocess.run`` (no shell involved).

	    Raises:
	        SystemExit: Carrying the child's return code when it is non-zero.
	    """
	    # Flush explicitly: when stdout is redirected to a file or pipe it is
	    # block-buffered, and the child (which writes to the inherited descriptor
	    # directly) would otherwise emit its output before this echo line.
	    print("+", " ".join(cmd), flush=True)
	    r = subprocess.run(cmd, cwd=str(ROOT))
	    if r.returncode != 0:
	        raise SystemExit(r.returncode)


	def main() -> None:
	    """Parse CLI flags and export the character-aligned JSONL files.

	    Creates OUT_DIR, then launches PREPARE through ``run`` three times, in
	    this order:

	    1. ``data/val_task_stratified.jsonl`` with ``--keep_metadata``.
	    2. ``data/cec_validation.jsonl``.
	    3. ``data/cec_train.jsonl`` -- in full when ``--full-train`` is given,
	       otherwise with ``--limit`` set to the ``--train-sample`` value.

	    ``run`` raises ``SystemExit`` on the first failing step, so later steps
	    are skipped in that case. A closing hint is printed on success.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "--full-train",
	        action="store_true",
	        help="同时导出全量 cec_train 对齐到 cec_train_char.jsonl（体积大，慎用）",
	    )
	    parser.add_argument(
	        "--train-sample",
	        type=int,
	        default=50_000,
	        help="非 --full-train 时，训练集抽样行数（默认 50000）",
	    )
	    opts = parser.parse_args()

	    # Every step shares the same interpreter and converter script.
	    prefix = [sys.executable, str(PREPARE)]
	    OUT_DIR.mkdir(parents=True, exist_ok=True)

	    def convert(src: str, dst: str, *extra: str) -> None:
	        """Run the converter on *src*, writing *dst*, with optional extra flags."""
	        run([*prefix, "--in_jsonl", src, "--out_jsonl", dst, *extra])

	    convert(
	        "data/val_task_stratified.jsonl",
	        "data/hf_aligned/val_task_stratified_char.jsonl",
	        "--keep_metadata",
	    )
	    convert(
	        "data/cec_validation.jsonl",
	        "data/hf_aligned/cec_validation_char.jsonl",
	    )

	    train_src = "data/cec_train.jsonl"
	    if not opts.full_train:
	        # The default sample size keeps its short "50k" file-name tag; any
	        # other size is spelled out as the plain integer.
	        tag = "50k" if opts.train_sample == 50_000 else str(opts.train_sample)
	        convert(
	            train_src,
	            f"data/hf_aligned/cec_train_char_sample_{tag}.jsonl",
	            "--limit",
	            str(opts.train_sample),
	        )
	    else:
	        convert(train_src, "data/hf_aligned/cec_train_char.jsonl")

	    print("\n完成。上传 Hugging Face Dataset 时建议使用目录：", OUT_DIR.resolve())
	    print("说明见：data/hf_aligned/README.md")


	# Script entry point: run the export only when executed directly, not on import.
	if __name__ == "__main__":
	main()