PocketAccountant / scripts /build_classifier_dataset.py
eldinosaur's picture
PocketAccountant: custom ledger UI + deterministic agent (engine, ledger, retrieval, classifier)
c55ab5e verified
Raw
History Blame Contribute Delete
1.74 kB
"""Generate the SAT-classifier fine-tune dataset and write it to data/finetune/.
Run from the project root: python -m scripts.build_classifier_dataset
Outputs:
data/finetune/train.jsonl — chat-format SFT examples
data/finetune/val.jsonl — held-out validation examples, same format
data/finetune/stats.json — coverage report
These files are ready to push to the Hub as a dataset (📡 Sharing is Caring).
"""
import sys
from collections import Counter
from pathlib import Path
# Switch stdout to UTF-8 so the non-ASCII output printed by this script (the
# "→" in the summary line and the ensure_ascii=False JSON sample) is emitted
# as UTF-8 regardless of the console's default encoding.
# Best-effort: if stdout has no reconfigure() (AttributeError) or the call
# raises ValueError, keep the stream as it is rather than abort.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except (AttributeError, ValueError):
    pass
from src.finetune import build_examples, split, write_jsonl
from src.finetune.dataset import json
def main(per_phrasing: int = 8) -> None:
    """Build the classifier fine-tune dataset and write it to data/finetune/.

    Generates examples via ``build_examples``, splits them into train and
    validation sets (``val_fraction=0.1``), writes both splits as JSONL,
    writes a ``stats.json`` coverage report, and prints a short summary.

    Args:
        per_phrasing: Forwarded unchanged to ``build_examples``.
    """
    out = Path("data/finetune")
    out.mkdir(parents=True, exist_ok=True)

    examples = build_examples(per_phrasing=per_phrasing)
    train, val = split(examples, val_fraction=0.1)
    write_jsonl(out / "train.jsonl", train)
    write_jsonl(out / "val.jsonl", val)

    # Coverage counts are taken over the full example set, not per split.
    by_account = Counter(e["label"]["cuenta"] for e in examples)
    by_kind = Counter(e["label"]["kind"] for e in examples)
    total = len(examples)
    stats = {
        "total": total,
        "train": len(train),
        "val": len(val),
        "accounts": len(by_account),
        "by_kind": dict(by_kind),
        # most_common() orders accounts from most to least frequent.
        "by_account": dict(by_account.most_common()),
    }
    (out / "stats.json").write_text(
        json.dumps(stats, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    print(f"Wrote {len(train)} train / {len(val)} val examples across "
          f"{len(by_account)} accounts → {out}/")
    print("By kind:", dict(by_kind))

    # Only show a sample when there is one: indexing an empty result would
    # raise IndexError after all output files have already been written.
    if total:
        print("\nExample:")
        print(json.dumps(examples[0], ensure_ascii=False, indent=2))
    else:
        print("\nNo examples were generated.")
# Script entry point: build the dataset with the default per_phrasing value.
if __name__ == "__main__":
    main()