#!/usr/bin/env python3
"""Objective search-quality evaluation harness.

Measures search quality without human relevance judgments, using the catalog
itself as ground truth: if a user searches a video's TITLE (exact, or the first
few words), that video should rank at/near the top. We report recall@k and MRR
for title search, keyword search, and hybrid search over a deterministic sample.

This turns "does search feel good?" into numbers you can move and compare.

Usage:
    python scripts/search_eval.py [--base-url http://localhost:8001] \
        [--language E] [--sample 150] [--data-root <backend dir>]

Run it before and after a ranking change to see the effect.
"""

from __future__ import annotations

import argparse
import json
import os
import re
import sys
import urllib.parse
import urllib.request
from pathlib import Path


def _load_titles(data_root: str, language: str) -> dict[str, str]:
    """Load the ``natural key -> title`` map for one language of the catalog.

    Reads ``<data_root>/json/<language>/all_media_items.json``. Two file shapes
    are understood:

    * a JSON object mapping natural key -> item object, or
    * a JSON array of item objects that carry their key in ``"naturalKey"``.

    Entries that are not objects, have no (truthy) natural key, or have no
    non-empty string ``"title"`` are skipped rather than aborting the load.

    Raises:
        OSError: the catalog file cannot be opened.
        json.JSONDecodeError: the catalog file is not valid JSON.
    """
    path = os.path.join(data_root, "json", language, "all_media_items.json")
    with open(path, encoding="utf-8") as handle:
        media = json.load(handle)

    if isinstance(media, dict):
        items = media.items()
    else:
        # Filter non-dict entries *before* calling .get() on them; a stray
        # null/string in the array would otherwise raise AttributeError.
        items = ((m.get("naturalKey"), m) for m in media if isinstance(m, dict))

    titles: dict[str, str] = {}
    for natural_key, item in items:
        if not natural_key or not isinstance(item, dict):
            continue
        title = item.get("title")
        # Downstream code treats titles as text (regex tokenizing, substring
        # checks), so only non-empty strings are kept.
        if isinstance(title, str) and title:
            titles[natural_key] = title
    return titles


def _fetch(base_url: str, path: str, **params) -> list[dict]:
    """GET ``base_url + path`` and return the ``results`` list of the JSON reply.

    Args:
        base_url: Server root; a trailing ``/`` is stripped before joining.
        path: Endpoint path appended verbatim to ``base_url``.
        **params: Query parameters. Parameters whose value equals ``""`` are
            omitted from the query string.

    Returns:
        The value stored under ``"results"`` when the response body is a JSON
        object and that value is a list; otherwise an empty list. This covers
        a non-object body, a missing key, and ``"results": null``, so callers
        can always iterate the return value.

    Raises:
        Whatever ``urllib.request.urlopen`` or ``json.load`` raise (network
        errors, HTTP error statuses, invalid JSON).
    """
    query_string = urllib.parse.urlencode({k: v for k, v in params.items() if v != ""})
    url = f"{base_url.rstrip('/')}{path}?{query_string}"
    with urllib.request.urlopen(url, timeout=120) as resp:
        body = json.load(resp)
    results = body.get("results") if isinstance(body, dict) else None
    # Honor the declared return type: anything that is not a list (notably
    # JSON null) becomes "no results" instead of leaking through to callers.
    return results if isinstance(results, list) else []


# Literal marker that distinguishes an audio-described duplicate by its title.
_AD_SUFFIX = "(With Audio Descriptions)"


def _acceptable_keys(natural_key: str, title: str, titles: dict[str, str]) -> set[str]:
    """Keys that count as a correct hit for this video's title query.

    Title search intentionally collapses "(With Audio Descriptions)" duplicates
    to the primary (non-AD) version, so an AD title query should be considered
    found when ANY video with the de-suffixed primary title is returned.

    The probed video's own key is always part of the result: the same set is
    used for endpoints that may return the AD video itself, and getting back
    the very video whose title was searched must never be scored as a miss.

    Note: this is slightly more generous than production, which canonicalizes to
    a single primary; on the rare titles shared by two unrelated videos this
    over-accepts (~1% of AD probes), nudging recall optimistically. Acceptable
    for a relative before/after measure. AD detection here is a literal
    substring check for the marker in the title; production's is_ad_media also
    keys off category — they agree on the current catalog.

    Args:
        natural_key: Key of the video being probed.
        title: Title of that video (the query text).
        titles: Full ``natural key -> title`` catalog map.

    Returns:
        ``{natural_key}`` for a non-AD title; for an AD title, ``natural_key``
        plus every key whose title equals the title with the marker removed
        and surrounding whitespace stripped.
    """
    if _AD_SUFFIX not in title:
        return {natural_key}
    primary_title = title.replace(_AD_SUFFIX, "").strip()
    primaries = {key for key, candidate in titles.items() if candidate == primary_title}
    return primaries | {natural_key}


def _rank_of(results: list[dict], acceptable: set[str]) -> int | None:
    """Best 1-based rank among any acceptable key in results, or None."""
    seen: list[str] = []
    for r in results:
        nk = r.get("natural_key")
        if nk and nk not in seen:
            seen.append(nk)
    ranks = [seen.index(k) + 1 for k in acceptable if k in seen]
    return min(ranks) if ranks else None


def _first_words(title: str, n: int) -> str:
    """Return the leading ``n`` word tokens of ``title`` joined by single spaces.

    A token is a maximal run of ``\\w`` characters, so punctuation and
    whitespace between words are dropped. ``n`` is applied as a slice bound
    (``tokens[:n]``), so ``0`` yields an empty string and a title with fewer
    than ``n`` tokens is returned in full.
    """
    tokens = [match.group(0) for match in re.finditer(r"\w+", title)]
    leading = tokens[:n]
    return " ".join(leading)


def _sample(titles: dict[str, str], n: int) -> list[tuple[str, str]]:
    """Pick up to ``n`` ``(natural_key, title)`` pairs, evenly spread and repeatable.

    Items are sorted first, so the same catalog and ``n`` always produce the
    same sample. When the catalog has at most ``n`` items, all of them are
    returned; otherwise items are taken at a fixed fractional stride across the
    sorted list.

    Args:
        titles: ``natural key -> title`` catalog map.
        n: Requested sample size. A value ``<= 0`` yields an empty sample
            (previously ``0`` raised ZeroDivisionError on a non-empty catalog).

    Returns:
        The selected pairs in sorted order.
    """
    if n <= 0:
        return []
    items = sorted(titles.items())  # deterministic order
    if len(items) <= n:
        return items
    # Fractional stride; int() floors each position. (n - 1) * step is always
    # below len(items), so the index stays in range.
    step = len(items) / n
    return [items[int(i * step)] for i in range(n)]


def _metrics(ranks: list[int | None]) -> dict[str, float]:
    total = len(ranks)
    if not total:
        return {}
    found = [r for r in ranks if r is not None]
    return {
        "recall@1": sum(1 for r in found if r <= 1) / total,
        "recall@5": sum(1 for r in found if r <= 5) / total,
        "recall@10": sum(1 for r in found if r <= 10) / total,
        "mrr": sum(1.0 / r for r in found) / total,
        "not_found": (total - len(found)) / total,
    }


def _run_probe(base_url, path, query, language, acceptable, **extra) -> int | None:
    """Run one search request and return the best rank of an acceptable key.

    The request is sent with ``q=query``, ``language=language``, ``limit=20``
    and any ``extra`` query parameters. If the request raises, the failure is
    reported on stderr and the probe counts as a miss (``None``), so every
    probe yields exactly one entry and the per-metric denominators stay in
    sync.
    """
    rank = None
    try:
        results = _fetch(base_url, path, q=query, language=language, limit=20, **extra)
    except Exception as exc:  # noqa: BLE001 — eval tool; a failed probe is a miss
        print(f"  (probe failed: {path} q={query!r}: {exc})", file=sys.stderr)
    else:
        # Only the fetch is guarded; ranking the returned hits is not.
        rank = _rank_of(results, acceptable)
    return rank


def main() -> int:
    """Command-line entry point: probe the search API and print a metrics table.

    Loads the catalog titles, draws a deterministic sample, and for every
    sampled video issues four probes in a fixed order (title search with the
    exact title, title search with the first N words, keyword search and hybrid
    search with the exact title). Prints one row of recall@1/5/10, MRR and miss
    rate per probe type and returns 0.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--base-url", default="http://localhost:8001")
    parser.add_argument("--language", default="E")
    parser.add_argument("--sample", type=int, default=150)
    parser.add_argument(
        "--data-root",
        default=os.environ.get("SEARCH_UI_DATA_ROOT", "/Users/avsadmin/Documents/Apps/Search-UI/backend"),
    )
    parser.add_argument("--partial-words", type=int, default=4)
    args = parser.parse_args()

    titles = _load_titles(args.data_root, args.language)
    sample = _sample(titles, args.sample)
    print(f"Catalog: {len(titles)} titled videos; evaluating a deterministic sample of {len(sample)}.\n")

    # One row per probe type, in request order:
    # (report label, endpoint path, query with the truncated title?, extra params)
    plan = (
        ("title-search (exact title)", "/api/search-title", False, {}),
        ("title-search (first N words)", "/api/search-title", True, {}),
        ("keyword (exact title)", "/api/search", False, {"method": "keyword"}),
        ("hybrid (exact title)", "/api/search", False, {"method": "hybrid"}),
    )
    probes: dict[str, list[int | None]] = {label: [] for label, *_ in plan}

    for natural_key, title in sample:
        acceptable = _acceptable_keys(natural_key, title, titles)
        partial = _first_words(title, args.partial_words)
        # Each _run_probe self-guards, so all four lists stay the same length.
        for label, endpoint, truncated, extra in plan:
            query = partial if truncated else title
            rank = _run_probe(args.base_url, endpoint, query, args.language, acceptable, **extra)
            probes[label].append(rank)

    print(f"{'probe':32} {'recall@1':>9} {'recall@5':>9} {'recall@10':>10} {'MRR':>7} {'miss':>7}")
    print("-" * 80)
    for name, ranks in probes.items():
        m = _metrics(ranks)
        if not m:
            # No probes were run (empty sample): nothing to report for this row.
            continue
        print(
            f"{name:32} {m['recall@1']:>9.2%} {m['recall@5']:>9.2%} "
            f"{m['recall@10']:>10.2%} {m['mrr']:>7.3f} {m['not_found']:>7.2%}"
        )
    return 0


if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value is the exit status.
    sys.exit(main())