"""Generate the SAT-classifier fine-tune dataset and write it to data/finetune/. Run from the project root: python -m scripts.build_classifier_dataset Outputs: data/finetune/train.jsonl — chat-format SFT examples data/finetune/val.jsonl data/finetune/stats.json — coverage report These files are ready to push to the Hub as a dataset (📡 Sharing is Caring). """ import sys from collections import Counter from pathlib import Path try: sys.stdout.reconfigure(encoding="utf-8") except (AttributeError, ValueError): pass from src.finetune import build_examples, split, write_jsonl from src.finetune.dataset import json def main(per_phrasing: int = 8): out = Path("data/finetune") out.mkdir(parents=True, exist_ok=True) examples = build_examples(per_phrasing=per_phrasing) train, val = split(examples, val_fraction=0.1) write_jsonl(out / "train.jsonl", train) write_jsonl(out / "val.jsonl", val) by_account = Counter(e["label"]["cuenta"] for e in examples) by_kind = Counter(e["label"]["kind"] for e in examples) stats = { "total": len(examples), "train": len(train), "val": len(val), "accounts": len(by_account), "by_kind": dict(by_kind), "by_account": dict(by_account.most_common()), } (out / "stats.json").write_text(json.dumps(stats, ensure_ascii=False, indent=2), encoding="utf-8") print(f"Wrote {len(train)} train / {len(val)} val examples across " f"{len(by_account)} accounts → {out}/") print("By kind:", dict(by_kind)) print("\nExample:") print(json.dumps(examples[0], ensure_ascii=False, indent=2)) if __name__ == "__main__": main()