Spaces:
Sleeping
Sleeping
| """Surface frequent phrasebook misses from field telemetry. | |
| Reads `data/field_turns.jsonl` (or $FIELD_TURNS_PATH) and groups inputs | |
| that produced `phrasebook=null` so you can decide which ones deserve a | |
| curated entry. This is the data-driven side of vocabulary growth: instead | |
| of guessing what to translate next, look at what real users typed. | |
| Usage: | |
| python scripts/phrasebook_misses.py # top-20 misses, all langs | |
| python scripts/phrasebook_misses.py --lang ful # filter to one target lang | |
| python scripts/phrasebook_misses.py --top 50 # show more | |
| python scripts/phrasebook_misses.py --since 2026-04 # filter by ISO-prefix on ts | |
| python scripts/phrasebook_misses.py --draft # also draft an LLM target | |
| # (HF_TOKEN required) | |
| Output is plain text — eyeball the list, decide what's worth adding, and | |
| paste curated translations into: | |
| configs/dialect_anchors/{bambara,pular}_phrasebook.json | |
| Stub status: counting + grouping is fully wired. The --draft flag is a | |
| TODO — it'll call MinimalClient on each miss and propose a target string | |
| for you to review. Land that when we want a tighter loop. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| import sys | |
| from collections import Counter | |
| from pathlib import Path | |
| from typing import Optional | |
# Repository root: this script sits one directory below it, so go up two
# levels from the resolved file path.
_REPO_ROOT = Path(__file__).resolve().parent.parent
# Fallback telemetry log path, chosen to match TurnLogger's default.
_DEFAULT_LOG = _REPO_ROOT.joinpath("data", "field_turns.jsonl")
def _norm(text: str) -> str:
    """Reduce *text* to a cheap grouping key.

    The input is lowercased and every run of whitespace (including leading
    and trailing whitespace) is collapsed to a single space. Falsy input such
    as ``None`` or ``""`` yields ``""``.

    Same intent as phrasebook._normalize, but kept local so this script has
    no project import dependency.
    """
    if not text:
        return ""
    # split() with no separator already discards leading/trailing whitespace
    # and treats any whitespace run as one delimiter.
    words = text.lower().split()
    return " ".join(words)
def load_misses(
    path: Path,
    lang: Optional[str] = None,
    since: Optional[str] = None,
) -> Counter:
    """Return a Counter of normalised user inputs that produced phrasebook=null.

    The file at *path* is read as JSON Lines (one JSON object per line).
    Reading is best-effort: blank lines, lines that are not valid JSON, lines
    whose JSON value is not an object, and rows whose text field is not a
    string are skipped rather than aborting the whole scan.

    Args:
        path: Location of the telemetry file. If it is not an existing
            regular file, a hint is printed to stderr and an empty Counter
            is returned.
        lang: If given, only count rows whose `output_lang` equals this
            (e.g. "ful").
        since: If given, only count rows whose `ts` is a string starting with
            this value (ISO prefix, e.g. "2026-04"). Rows with a missing or
            non-string `ts` never match.

    Returns:
        Counter mapping each normalised input text to the number of miss
        rows that carried it.
    """
    # is_file() rather than exists(): a directory at this path would pass an
    # existence check and then fail on open().
    if not path.is_file():
        print(f"No telemetry file at {path} — run the app to produce some turns first.",
              file=sys.stderr)
        return Counter()

    counts: Counter = Counter()
    # errors="replace" keeps one torn / mis-encoded line from killing the run;
    # such a line simply goes through the normal per-line checks below.
    with path.open("r", encoding="utf-8", errors="replace") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                continue
            # json.loads also returns lists, strings, numbers and None for
            # valid JSON; only objects can be telemetry rows.
            if not isinstance(row, dict):
                continue
            # Only translate-phase rows are useful — reply-phase rows always
            # have phrasebook=null by construction. Rows without a `phase`
            # key are accepted as well.
            if row.get("phase") not in (None, "translate"):
                continue
            if row.get("phrasebook") is not None:
                continue
            if lang and row.get("output_lang") != lang:
                continue
            if since:
                ts = row.get("ts")
                # A numeric (or otherwise non-string) timestamp cannot match
                # a textual prefix, so treat it like a missing one.
                if not isinstance(ts, str) or not ts.startswith(since):
                    continue
            raw = row.get("user_text") or row.get("transcript") or ""
            if not isinstance(raw, str):
                continue
            text = _norm(raw)
            if text and not text.startswith("("):  # skip our own sentinels
                counts[text] += 1
    return counts
def main() -> None:
    """Command-line entry point: print the most frequent phrasebook misses.

    Parses the CLI flags, loads miss counts via load_misses(), then prints a
    one-line summary followed by a count/input table limited to --top rows.
    When nothing matches the filters, a short notice is printed instead.
    The --draft flag is accepted but only prints a "not yet implemented"
    notice.
    """
    # The first line of the module docstring doubles as the --help summary.
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    parser.add_argument(
        "--path",
        default=os.environ.get("FIELD_TURNS_PATH", str(_DEFAULT_LOG)),
        help="Path to field_turns.jsonl (default: data/field_turns.jsonl)",
    )
    parser.add_argument(
        "--lang",
        choices=["bam", "ful", "fr", "en"],
        default=None,
        help="Filter to one output language",
    )
    parser.add_argument(
        "--since",
        default=None,
        help='ISO-prefix filter on ts, e.g. "2026-04" or "2026-04-25"',
    )
    parser.add_argument(
        "--top",
        type=int,
        default=20,
        help="How many misses to print",
    )
    parser.add_argument(
        "--draft",
        action="store_true",
        help="(TODO) Also draft an LLM-proposed target for each miss",
    )
    opts = parser.parse_args()

    misses = load_misses(Path(opts.path), lang=opts.lang, since=opts.since)
    if not misses:
        print("No misses found with the current filters.")
        return

    # Summary line, with the active filters echoed back when present.
    lang_note = f" (lang={opts.lang})" if opts.lang else ""
    since_note = f" since {opts.since}" if opts.since else ""
    event_total = sum(misses.values())
    input_total = len(misses)
    print(f"{event_total} miss-events across {input_total} distinct inputs"
          f"{lang_note}{since_note}.\n")

    # Table: right-aligned count column, then the normalised input text.
    print(f"{'count':>5} input")
    print(f"{'-----':>5} -----")
    for phrase, hits in misses.most_common(opts.top):
        print(f"{hits:>5} {phrase}")

    if not opts.draft:
        return
    # TODO: import MinimalClient and call .chat(text, target_lang=args.lang)
    # for each top input, printing the proposed target alongside. Skipping
    # now to keep the stub dependency-free.
    print("\n[--draft is a stub — not yet implemented]")
# Run the CLI only when this file is executed as a script, so importing the
# module (e.g. to reuse load_misses) has no side effects.
if __name__ == "__main__":
    main()