"""Surface frequent phrasebook misses from field telemetry.

Reads `data/field_turns.jsonl` (or $FIELD_TURNS_PATH) and groups inputs
that produced `phrasebook=null` so you can decide which ones deserve a
curated entry. This is the data-driven side of vocabulary growth: instead
of guessing what to translate next, look at what real users typed.

Usage:
    python scripts/phrasebook_misses.py                 # top-20 misses, all langs
    python scripts/phrasebook_misses.py --lang ful      # filter to one target lang
    python scripts/phrasebook_misses.py --top 50        # show more
    python scripts/phrasebook_misses.py --since 2026-04 # filter by ISO-prefix on ts
    python scripts/phrasebook_misses.py --draft         # also draft an LLM target
                                                        #   (HF_TOKEN required)

Output is plain text — eyeball the list, decide what's worth adding, and
paste curated translations into:
    configs/dialect_anchors/{bambara,pular}_phrasebook.json

Stub status: counting + grouping is fully wired. The --draft flag is a
TODO — it'll call MinimalClient on each miss and propose a target string
for you to review. Land that when we want a tighter loop.
"""
from __future__ import annotations

import argparse
import json
import os
import sys
from collections import Counter
from pathlib import Path
from typing import Optional

# The repository root is the parent of the directory holding this script; the
# default log path below is meant to match TurnLogger's default location.
_REPO_ROOT = Path(__file__).resolve().parent.parent
_DEFAULT_LOG = _REPO_ROOT.joinpath("data", "field_turns.jsonl")


def _norm(text: str) -> str:
    """Cheap normalisation for grouping. Same intent as phrasebook._normalize
    but kept local so this script has no project import dependency."""
    return " ".join((text or "").lower().strip().split())


def load_misses(
    path: Path,
    lang: Optional[str] = None,
    since: Optional[str] = None,
) -> Counter:
    """Return Counter of normalised user inputs that produced phrasebook=null.

    The file is read as JSON Lines. Reading is best-effort: blank lines,
    lines that are not valid JSON, lines whose JSON value is not an object,
    and rows without a usable text field are skipped rather than aborting
    the whole scan.

    Filters:
        lang  — only count rows whose `output_lang` equals this (e.g. "ful").
        since — only count rows whose `ts` starts with this string (ISO prefix).
                Rows whose `ts` is missing or not a string never match.

    Returns:
        Counter mapping normalised input text to its number of miss events.
        Empty when the file does not exist (a notice is printed to stderr).
    """
    if not path.exists():
        print(f"No telemetry file at {path} — run the app to produce some turns first.",
              file=sys.stderr)
        return Counter()

    counts: Counter = Counter()
    with path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                # Deliberate best-effort: a corrupt/truncated line must not
                # hide the rest of the telemetry.
                continue
            # Valid JSON that is not an object (list, number, string, ...)
            # has no fields to inspect; treat it like a malformed line.
            if not isinstance(row, dict):
                continue
            # Only translate-phase rows are useful — reply-phase rows always
            # have phrasebook=null by construction.
            if row.get("phase") not in (None, "translate"):
                continue
            if row.get("phrasebook") is not None:
                continue
            if lang and row.get("output_lang") != lang:
                continue
            if since:
                ts = row.get("ts")
                # A non-string ts (e.g. a numeric epoch) cannot be matched
                # against an ISO prefix, so it is filtered out.
                if not isinstance(ts, str) or not ts.startswith(since):
                    continue
            # Prefer user_text, fall back to transcript; ignore values that
            # are empty or not strings so normalisation cannot crash.
            text = ""
            for candidate in (row.get("user_text"), row.get("transcript")):
                if isinstance(candidate, str) and candidate:
                    text = candidate
                    break
            text = _norm(text)
            if text and not text.startswith("("):  # skip our own sentinels
                counts[text] += 1
    return counts


def main(argv: Optional[list[str]] = None) -> None:
    """Command-line entry point: print the most frequent phrasebook misses.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so calling ``main()`` behaves as before;
            passing a list allows programmatic invocation.

    Prints a summary line and a count/input table to stdout, or a short
    notice when no misses match the filters.
    """
    # __doc__ is None when docstrings are stripped (python -OO); fall back to
    # no description instead of crashing on None.splitlines().
    doc_lines = (__doc__ or "").splitlines()
    ap = argparse.ArgumentParser(description=doc_lines[0] if doc_lines else None)
    ap.add_argument("--path", default=os.environ.get("FIELD_TURNS_PATH", str(_DEFAULT_LOG)),
                    help="Path to field_turns.jsonl (default: data/field_turns.jsonl)")
    ap.add_argument("--lang", choices=["bam", "ful", "fr", "en"], default=None,
                    help="Filter to one output language")
    ap.add_argument("--since", default=None,
                    help='ISO-prefix filter on ts, e.g. "2026-04" or "2026-04-25"')
    ap.add_argument("--top", type=int, default=20, help="How many misses to print")
    ap.add_argument("--draft", action="store_true",
                    help="(TODO) Also draft an LLM-proposed target for each miss")
    args = ap.parse_args(argv)

    counts = load_misses(Path(args.path), lang=args.lang, since=args.since)
    if not counts:
        print("No misses found with the current filters.")
        return

    total = sum(counts.values())
    distinct = len(counts)
    # Echo the active filters in the summary so pasted output is self-describing.
    print(f"{total} miss-events across {distinct} distinct inputs"
          + (f" (lang={args.lang})" if args.lang else "")
          + (f" since {args.since}" if args.since else "")
          + ".\n")
    print(f"{'count':>5}  input")
    print(f"{'-----':>5}  -----")
    for text, n in counts.most_common(args.top):
        print(f"{n:>5}  {text}")

    if args.draft:
        # TODO: import MinimalClient and call .chat(text, target_lang=args.lang)
        # for each top input, printing the proposed target alongside. Skipping
        # now to keep the stub dependency-free.
        print("\n[--draft is a stub — not yet implemented]")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()