#!/usr/bin/env python3
"""Objective search-quality evaluation harness.

Measures search quality without human relevance judgments, using the catalog
itself as ground truth: if a user searches a video's TITLE (exact, or the first
few words), that video should rank at/near the top. We report recall@k and MRR
for title search, keyword search, and hybrid search over a deterministic sample.

This turns "does search feel good?" into numbers you can move and compare.

Usage:
    python scripts/search_eval.py [--base-url http://localhost:8001] \
        [--language E] [--sample 150] [--data-root PATH]

Run it before and after a ranking change to see the effect.
"""

from __future__ import annotations

import argparse
import json
import os
import re
import sys
import urllib.parse
import urllib.request
from pathlib import Path

# Literal marker that identifies an audio-described duplicate by its title.
_AD_SUFFIX = "(With Audio Descriptions)"

# Compiled once; used to split a title into "words" for the partial-title probe.
_WORD_RE = re.compile(r"\w+")


def _load_titles(data_root: str, language: str) -> dict[str, str]:
    """Load ``{natural_key: title}`` from the catalog JSON for *language*.

    Reads ``<data_root>/json/<language>/all_media_items.json``. The file may be
    either a mapping of natural key -> item, or a list of items that each carry
    a ``naturalKey`` field. Entries without a key, without a title, or that are
    not JSON objects are skipped.

    Raises:
        OSError: if the catalog file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    path = Path(data_root) / "json" / language / "all_media_items.json"
    with open(path, encoding="utf-8") as handle:
        media = json.load(handle)

    if isinstance(media, dict):
        items = media.items()
    else:
        # Skip non-object entries up front: calling .get() on them would raise
        # before the per-item validation below had a chance to filter them.
        items = ((m.get("naturalKey"), m) for m in media if isinstance(m, dict))

    return {
        natural_key: item["title"]
        for natural_key, item in items
        if natural_key and isinstance(item, dict) and item.get("title")
    }


def _fetch(base_url: str, path: str, **params) -> list[dict]:
    """GET ``base_url + path`` with *params* and return the ``results`` list.

    Parameters whose value is the empty string are omitted from the query
    string. Any payload that is not an object with a list-valued ``results``
    field yields ``[]``, so callers can always iterate the return value.

    Raises:
        Whatever ``urllib.request.urlopen`` / ``json.load`` raise on network,
        HTTP, or decoding errors; ``_run_probe`` is the place that guards them.
    """
    query = urllib.parse.urlencode({k: v for k, v in params.items() if v != ""})
    url = f"{base_url.rstrip('/')}{path}?{query}"
    with urllib.request.urlopen(url, timeout=120) as resp:
        body = json.load(resp)
    results = body.get("results") if isinstance(body, dict) else None
    # A null / non-list "results" is treated as "no results", not as a crash.
    return results if isinstance(results, list) else []


def _acceptable_keys(natural_key: str, title: str, titles: dict[str, str]) -> set[str]:
    """Keys that count as a correct hit for this video's title query.

    Title search intentionally collapses "(With Audio Descriptions)" duplicates
    to the primary (non-AD) version, so an AD title query should be considered
    found when ANY video with the de-suffixed primary title is returned.

    The probed video's own key is always acceptable as well: the same set is
    used for every probe, and an endpoint that returns the AD video itself for
    its own exact title has found the right video.

    Note: this is slightly more generous than production, which canonicalizes
    to a single primary; on the rare titles shared by two unrelated videos this
    over-accepts (~1% of AD probes), nudging recall optimistically. Acceptable
    for a relative before/after measure. AD detection here is the literal title
    marker; production's is_ad_media also keys off category — they agree on the
    current catalog.
    """
    acceptable = {natural_key}
    if _AD_SUFFIX in title:
        primary_title = title.replace(_AD_SUFFIX, "").strip()
        acceptable.update(k for k, t in titles.items() if t == primary_title)
    return acceptable


def _rank_of(results: list[dict], acceptable: set[str]) -> int | None:
    """Best 1-based rank among any acceptable key in results, or None.

    Ranks are positions in the de-duplicated sequence of non-empty
    ``natural_key`` values, in the order they first appear. Rows that are not
    objects are ignored rather than raising, so a malformed payload scores as
    a miss instead of aborting the run.
    """
    seen: list[str] = []
    for row in results or ():
        key = row.get("natural_key") if isinstance(row, dict) else None
        if key and key not in seen:
            seen.append(key)
    ranks = [seen.index(k) + 1 for k in acceptable if k in seen]
    return min(ranks) if ranks else None


def _first_words(title: str, n: int) -> str:
    """Return the first *n* ``\\w+`` words of *title* joined by single spaces.

    Returns ``""`` when ``n <= 0`` or the title contains no word characters.
    """
    if n <= 0:
        # Without this guard a negative n would slice from the END of the title.
        return ""
    return " ".join(_WORD_RE.findall(title)[:n])


def _sample(titles: dict[str, str], n: int) -> list[tuple[str, str]]:
    """Pick up to *n* evenly spaced ``(natural_key, title)`` pairs.

    Items are sorted by key first, so the same catalog and *n* always produce
    the same sample. Returns every item when the catalog has at most *n*
    entries, and ``[]`` when ``n <= 0``.
    """
    if n <= 0:
        # n == 0 used to divide by zero below on a non-empty catalog.
        return []
    items = sorted(titles.items())  # deterministic order
    if len(items) <= n:
        return items
    step = len(items) / n
    return [items[int(i * step)] for i in range(n)]


def _metrics(ranks: list[int | None]) -> dict[str, float]:
    """Summarize probe ranks as recall@1/5/10, MRR and miss rate.

    ``None`` ranks (not found, or failed probe) count in the denominator of
    every metric. Returns ``{}`` when *ranks* is empty.
    """
    total = len(ranks)
    if not total:
        return {}
    found = [r for r in ranks if r is not None]
    metrics = {
        f"recall@{k}": sum(1 for r in found if r <= k) / total for k in (1, 5, 10)
    }
    metrics["mrr"] = sum(1.0 / r for r in found) / total
    metrics["not_found"] = (total - len(found)) / total
    return metrics


def _run_probe(
    base_url: str,
    path: str,
    query: str,
    language: str,
    acceptable: set[str],
    **extra,
) -> int | None:
    """Run one search request and return the best rank of an acceptable key.

    Extra keyword arguments are forwarded as query parameters (for example
    ``method="keyword"``). Returns ``None`` both when no acceptable key is in
    the results and when the request itself fails.
    """
    # Self-guard each probe so one failure counts as a miss (None) rather than
    # desyncing the per-metric sample sizes / denominators.
    try:
        results = _fetch(base_url, path, q=query, language=language, limit=20, **extra)
    except Exception as exc:  # noqa: BLE001 — eval tool; a failed probe is a miss
        print(f"  (probe failed: {path} q={query!r}: {exc})", file=sys.stderr)
        return None
    return _rank_of(results, acceptable)


def main() -> int:
    """Parse CLI arguments, run all probes over the sample, print a table.

    Returns the process exit status (always 0 once the run completes; invalid
    arguments exit through ``argparse``).
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the usage example in the module docstring laid out as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--base-url", default="http://localhost:8001")
    parser.add_argument("--language", default="E")
    parser.add_argument("--sample", type=int, default=150)
    parser.add_argument(
        "--data-root",
        default=os.environ.get("SEARCH_UI_DATA_ROOT", "/Users/avsadmin/Documents/Apps/Search-UI/backend"),
    )
    parser.add_argument("--partial-words", type=int, default=4)
    args = parser.parse_args()

    if args.sample < 1:
        parser.error("--sample must be at least 1")
    if args.partial_words < 1:
        parser.error("--partial-words must be at least 1")

    titles = _load_titles(args.data_root, args.language)
    sample = _sample(titles, args.sample)
    print(f"Catalog: {len(titles)} titled videos; evaluating a deterministic sample of {len(sample)}.\n")

    probes: dict[str, list[int | None]] = {
        "title-search (exact title)": [],
        "title-search (first N words)": [],
        "keyword (exact title)": [],
        "hybrid (exact title)": [],
    }

    for natural_key, title in sample:
        # A title with no word characters yields an empty partial query, which
        # _fetch would drop from the URL entirely; probe the full title instead.
        partial = _first_words(title, args.partial_words) or title
        acceptable = _acceptable_keys(natural_key, title, titles)
        # Each _run_probe self-guards, so all four lists stay the same length.
        probes["title-search (exact title)"].append(
            _run_probe(args.base_url, "/api/search-title", title, args.language, acceptable)
        )
        probes["title-search (first N words)"].append(
            _run_probe(args.base_url, "/api/search-title", partial, args.language, acceptable)
        )
        probes["keyword (exact title)"].append(
            _run_probe(args.base_url, "/api/search", title, args.language, acceptable, method="keyword")
        )
        probes["hybrid (exact title)"].append(
            _run_probe(args.base_url, "/api/search", title, args.language, acceptable, method="hybrid")
        )

    print(f"{'probe':32} {'recall@1':>9} {'recall@5':>9} {'recall@10':>10} {'MRR':>7} {'miss':>7}")
    print("-" * 80)
    for name, ranks in probes.items():
        m = _metrics(ranks)
        if m:
            print(
                f"{name:32} {m['recall@1']:>9.2%} {m['recall@5']:>9.2%} "
                f"{m['recall@10']:>10.2%} {m['mrr']:>7.3f} {m['not_found']:>7.2%}"
            )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())