Spaces:

MataStrategy
/

ground-zero

Sleeping

ground-zero / scripts /phrasebook_misses.py

jefffffff9

Phrasebook: multi-alias + French source keys, plus misses-script stub

5815492 2 months ago

5.04 kB

	"""Surface frequent phrasebook misses from field telemetry.

	Reads `data/field_turns.jsonl` (or $FIELD_TURNS_PATH) and groups inputs
	that produced `phrasebook=null` so you can decide which ones deserve a
	curated entry. This is the data-driven side of vocabulary growth: instead
	of guessing what to translate next, look at what real users typed.

	Usage:
	python scripts/phrasebook_misses.py # top-20 misses, all langs
	python scripts/phrasebook_misses.py --lang ful # filter to one target lang
	python scripts/phrasebook_misses.py --top 50 # show more
	python scripts/phrasebook_misses.py --since 2026-04 # filter by ISO-prefix on ts
	python scripts/phrasebook_misses.py --draft # stub, not yet implemented: will draft an LLM target
	# (HF_TOKEN will be required)

	Output is plain text — eyeball the list, decide what's worth adding, and
	paste curated translations into:
	configs/dialect_anchors/{bambara,pular}_phrasebook.json

	Stub status: counting + grouping is fully wired. The --draft flag is a
	TODO — it'll call MinimalClient on each miss and propose a target string
	for you to review. Land that when we want a tighter loop.
	"""
	from __future__ import annotations

	import argparse
	import json
	import os
	import sys
	from collections import Counter
	from pathlib import Path
	from typing import Optional

	# Default location matches TurnLogger's default.
	_REPO_ROOT = Path(__file__).resolve().parent.parent
	_DEFAULT_LOG = _REPO_ROOT / "data" / "field_turns.jsonl"


	def _norm(text: str) -> str:
	"""Cheap normalisation for grouping. Same intent as phrasebook._normalize
	but kept local so this script has no project import dependency."""
	return " ".join((text or "").lower().strip().split())


	def load_misses(
	    path: Path,
	    lang: Optional[str] = None,
	    since: Optional[str] = None,
	) -> Counter:
	    """Return a Counter of normalised user inputs that produced phrasebook=null.

	    The file at *path* is read as JSON Lines. Blank lines, lines that are not
	    valid JSON, and lines whose JSON value is not an object are skipped, so one
	    malformed telemetry row never aborts the whole report.

	    Args:
	        path: Location of the JSONL telemetry file.
	        lang: If given, only count rows whose `output_lang` equals this
	            (e.g. "ful").
	        since: If given, only count rows whose `ts` starts with this string
	            (ISO prefix). A non-string `ts` is compared via its str() form.

	    Returns:
	        Counter mapping each normalised input text to its number of miss
	        events. Empty when the file does not exist (a notice is printed to
	        stderr in that case) or when nothing matches the filters.
	    """
	    if not path.exists():
	        print(f"No telemetry file at {path} — run the app to produce some turns first.",
	              file=sys.stderr)
	        return Counter()

	    counts: Counter = Counter()
	    with path.open("r", encoding="utf-8") as fh:
	        for line in fh:
	            line = line.strip()
	            if not line:
	                continue
	            try:
	                row = json.loads(line)
	            except json.JSONDecodeError:
	                continue
	            # A line can be valid JSON without being an object (list, string,
	            # number, null); such a value has no .get(), so skip it like any
	            # other unusable row instead of crashing.
	            if not isinstance(row, dict):
	                continue
	            # Only translate-phase rows are useful — reply-phase rows always
	            # have phrasebook=null by construction.
	            if row.get("phase") not in (None, "translate"):
	                continue
	            if row.get("phrasebook") is not None:
	                continue
	            if lang and row.get("output_lang") != lang:
	                continue
	            # str() guards against a numeric timestamp, which has no
	            # .startswith(); string timestamps are unaffected.
	            if since and not str(row.get("ts") or "").startswith(since):
	                continue
	            text = row.get("user_text") or row.get("transcript") or ""
	            # Non-string payloads cannot be normalised or grouped as text.
	            if not isinstance(text, str):
	                continue
	            text = _norm(text)
	            if text and not text.startswith("("):  # skip our own sentinels
	                counts[text] += 1
	    return counts


	def main() -> None:
	    """Command-line entry point: parse flags, tally misses, print a ranked table.

	    Reads the telemetry path from --path (default: $FIELD_TURNS_PATH, else
	    data/field_turns.jsonl under the repo root), applies the optional --lang
	    and --since filters, and prints the --top most frequent misses to stdout.
	    """
	    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
	    parser.add_argument(
	        "--path",
	        default=os.environ.get("FIELD_TURNS_PATH", str(_DEFAULT_LOG)),
	        help="Path to field_turns.jsonl (default: data/field_turns.jsonl)",
	    )
	    parser.add_argument(
	        "--lang",
	        choices=["bam", "ful", "fr", "en"],
	        default=None,
	        help="Filter to one output language",
	    )
	    parser.add_argument(
	        "--since",
	        default=None,
	        help='ISO-prefix filter on ts, e.g. "2026-04" or "2026-04-25"',
	    )
	    parser.add_argument("--top", type=int, default=20, help="How many misses to print")
	    parser.add_argument(
	        "--draft",
	        action="store_true",
	        help="(TODO) Also draft an LLM-proposed target for each miss",
	    )
	    opts = parser.parse_args()

	    misses = load_misses(Path(opts.path), lang=opts.lang, since=opts.since)
	    if not misses:
	        print("No misses found with the current filters.")
	        return

	    # Summary line: overall event count, distinct inputs, then active filters.
	    event_total = sum(misses.values())
	    distinct_inputs = len(misses)
	    lang_note = f" (lang={opts.lang})" if opts.lang else ""
	    since_note = f" since {opts.since}" if opts.since else ""
	    print(
	        f"{event_total} miss-events across {distinct_inputs} distinct inputs"
	        + lang_note
	        + since_note
	        + ".\n"
	    )

	    print(f"{'count':>5} input")
	    print(f"{'-----':>5} -----")
	    for phrase, hits in misses.most_common(opts.top):
	        print(f"{hits:>5} {phrase}")

	    if opts.draft:
	        # TODO: import MinimalClient and call .chat(text, target_lang=opts.lang)
	        # for each top input, printing the proposed target alongside. Left
	        # out for now so the stub stays dependency-free.
	        print("\n[--draft is a stub — not yet implemented]")


	# Run the CLI only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()