# PocketAccountant: custom ledger UI + deterministic agent (engine, ledger, retrieval, classifier)
# Revision c55ab5e (verified).
"""Generate the SAT-classifier fine-tune dataset and write it to data/finetune/.

Run from the project root:  python -m scripts.build_classifier_dataset

Outputs:
  data/finetune/train.jsonl   — chat-format SFT examples
  data/finetune/val.jsonl     — validation split
  data/finetune/stats.json    — coverage report

These files are ready to push to the Hub as a dataset (📡 Sharing is Caring).
"""
| import sys | |
| from collections import Counter | |
| from pathlib import Path | |
# Switch stdout to UTF-8 up front: main() prints "→" and JSON dumped with
# ensure_ascii=False, which a console on a legacy code page presumably
# cannot encode.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except (ValueError, AttributeError):
    # Best effort only: stdout may be an object without reconfigure(), or it
    # may refuse the change; either way keep the encoding already in place.
    pass
| from src.finetune import build_examples, split, write_jsonl | |
| from src.finetune.dataset import json | |
def main(per_phrasing: int = 8, out_dir: Path = Path("data/finetune")) -> None:
    """Build the classifier fine-tune dataset and write it to ``out_dir``.

    Writes three files into ``out_dir`` (created if missing):
    ``train.jsonl`` and ``val.jsonl`` via ``write_jsonl``, and ``stats.json``
    with example counts overall, per split, per ``kind`` and per account.
    A short summary and one sample example are printed to stdout.

    Args:
        per_phrasing: Forwarded unchanged to ``build_examples``.
        out_dir: Destination directory; a ``str`` is accepted as well.
            Defaults to ``data/finetune`` relative to the working directory.
    """
    # Import the stdlib module directly instead of relying on `json` happening
    # to be re-exported by a project module.
    import json

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    examples = build_examples(per_phrasing=per_phrasing)
    train, val = split(examples, val_fraction=0.1)
    write_jsonl(out / "train.jsonl", train)
    write_jsonl(out / "val.jsonl", val)

    # Coverage counters are computed over the full set, not per split.
    by_account = Counter(e["label"]["cuenta"] for e in examples)
    by_kind = Counter(e["label"]["kind"] for e in examples)
    stats = {
        "total": len(examples),
        "train": len(train),
        "val": len(val),
        "accounts": len(by_account),
        "by_kind": dict(by_kind),
        # most_common() orders accounts from most to least frequent.
        "by_account": dict(by_account.most_common()),
    }
    (out / "stats.json").write_text(
        json.dumps(stats, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    print(f"Wrote {len(train)} train / {len(val)} val examples across "
          f"{len(by_account)} accounts → {out}/")
    print("By kind:", dict(by_kind))
    print("\nExample:")
    if examples:
        print(json.dumps(examples[0], ensure_ascii=False, indent=2))
    else:
        # An empty dataset used to crash here with IndexError after the
        # output files had already been written.
        print("(no examples generated)")
# Script entry point: build the dataset with the default settings when the
# module is executed directly; a plain import does not trigger the build.
if __name__ == "__main__":
    main()