| |
| """Objective search-quality evaluation harness. |
| |
| Measures search quality without human relevance judgments, using the catalog |
| itself as ground truth: if a user searches a video's TITLE (exact, or the first |
| few words), that video should rank at/near the top. We report recall@k and MRR |
| for title search, keyword search, and hybrid search over a deterministic sample. |
| |
| This turns "does search feel good?" into numbers you can move and compare. |
| |
| Usage: |
| python scripts/search_eval.py [--base-url http://localhost:8001] \ |
| [--language E] [--sample 150] [--partial-words 4] [--data-root <backend dir>] |
| |
| Run it before and after a ranking change to see the effect. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import os |
| import re |
| import sys |
| import urllib.parse |
| import urllib.request |
| from pathlib import Path |
|
|
|
|
def _load_titles(data_root: str, language: str) -> dict[str, str]:
    """Load the ``natural key -> title`` map for one language's catalog.

    Reads ``<data_root>/json/<language>/all_media_items.json``. Two layouts
    are accepted: a JSON object keyed by natural key, or a JSON array of item
    objects that each carry their own ``naturalKey`` field.

    Entries that are not JSON objects, have no truthy key, or have no
    non-empty string ``title`` are skipped instead of aborting the load. Any
    other top-level JSON value yields an empty map.

    Args:
        data_root: Directory that contains the ``json/`` catalog tree.
        language: Language code used as the sub-directory name.

    Returns:
        Mapping of natural key to title for every usable catalog entry.

    Raises:
        OSError: If the catalog file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    path = os.path.join(data_root, "json", language, "all_media_items.json")
    with open(path, encoding="utf-8") as handle:
        media = json.load(handle)

    if isinstance(media, dict):
        pairs = media.items()
    elif isinstance(media, list):
        # Filter on type *before* calling .get(): a stray string/null entry
        # in the array must not raise AttributeError for the whole catalog.
        pairs = ((entry.get("naturalKey"), entry) for entry in media if isinstance(entry, dict))
    else:
        pairs = ()

    titles: dict[str, str] = {}
    for natural_key, item in pairs:
        if not natural_key or not isinstance(item, dict):
            continue
        title = item.get("title")
        # Titles are later sorted, substring-tested and regex-tokenized, so
        # only real, non-empty strings are usable.
        if isinstance(title, str) and title:
            titles[natural_key] = title
    return titles
|
|
|
|
def _fetch(base_url: str, path: str, **params) -> list[dict]:
    """GET ``<base_url><path>?<params>`` and return the response's result list.

    Args:
        base_url: Service root; a trailing ``/`` is stripped before joining.
        path: Endpoint path, appended verbatim to the base URL.
        **params: Query-string parameters. Parameters whose value is ``""``
            or ``None`` are left out of the query string.

    Returns:
        The object entries of the ``"results"`` array in the JSON response
        body. An empty list is returned when the body is not a JSON object
        or ``"results"`` is missing or not an array, so callers can always
        iterate the return value and call ``.get`` on each entry.

    Network, HTTP and JSON-decoding errors are not caught here; they
    propagate to the caller.
    """
    query = urllib.parse.urlencode(
        {name: value for name, value in params.items() if value is not None and value != ""}
    )
    url = f"{base_url.rstrip('/')}{path}?{query}"
    with urllib.request.urlopen(url, timeout=120) as resp:
        body = json.load(resp)

    results = body.get("results") if isinstance(body, dict) else None
    if not isinstance(results, list):
        # Covers a missing key as well as `"results": null` or a scalar.
        return []
    return [entry for entry in results if isinstance(entry, dict)]
|
|
|
|
# Literal marker that identifies an audio-description duplicate of a video.
_AD_SUFFIX = "(With Audio Descriptions)"


def _acceptable_keys(natural_key: str, title: str, titles: dict[str, str]) -> set[str]:
    """Return the keys that count as a correct hit for this video's title query.

    The probed video's own key is always acceptable. In addition, title search
    intentionally collapses "(With Audio Descriptions)" duplicates to the
    primary (non-AD) version, so for an AD title every video whose title
    equals the de-suffixed primary title is acceptable too.

    Keeping the probed key in the set matters because the same set is used to
    score every probe: an endpoint that returns the AD video itself must not
    be scored as a miss (or a worse rank) just because a primary exists.

    Note (from the original author): this is slightly more generous than
    production, which canonicalizes to a single primary; on the rare titles
    shared by two unrelated videos this over-accepts (~1% of AD probes),
    nudging recall optimistically. Acceptable for a relative before/after
    measure. AD detection here is the literal marker text in the title;
    production's is_ad_media also keys off category, and they agree on the
    current catalog.

    Args:
        natural_key: Key of the video being probed.
        title: That video's catalog title.
        titles: Full ``natural key -> title`` catalog map.

    Returns:
        Set containing ``natural_key`` plus, for AD titles, the keys of all
        videos titled exactly like the de-suffixed title.
    """
    acceptable = {natural_key}
    if _AD_SUFFIX in title:
        primary_title = title.replace(_AD_SUFFIX, "").strip()
        acceptable.update(key for key, other in titles.items() if other == primary_title)
    return acceptable
|
|
|
|
| def _rank_of(results: list[dict], acceptable: set[str]) -> int | None: |
| """Best 1-based rank among any acceptable key in results, or None.""" |
| seen: list[str] = [] |
| for r in results: |
| nk = r.get("natural_key") |
| if nk and nk not in seen: |
| seen.append(nk) |
| ranks = [seen.index(k) + 1 for k in acceptable if k in seen] |
| return min(ranks) if ranks else None |
|
|
|
|
def _first_words(title: str, n: int) -> str:
    """Return the first *n* words of *title*, joined by single spaces.

    A word is a maximal run of regex word characters, so punctuation is
    dropped and acts as a separator. Titles with fewer than *n* words are
    returned whole.

    A non-positive *n* yields ``""``. Without this guard a negative *n* would
    slice from the end and return all but the last ``|n|`` words.
    """
    if n <= 0:
        return ""
    return " ".join(re.findall(r"\w+", title)[:n])
|
|
|
|
def _sample(titles: dict[str, str], n: int) -> list[tuple[str, str]]:
    """Pick a deterministic, evenly spaced sample of at most *n* catalog items.

    Items are sorted by ``(key, title)`` and every ``len/n``-th one is taken,
    so the same catalog and *n* always give the same sample, with no
    randomness involved.

    Args:
        titles: ``natural key -> title`` catalog map.
        n: Maximum number of items to return.

    Returns:
        The whole sorted catalog when it has at most *n* items, *n* evenly
        spaced items otherwise, and an empty list when *n* is not positive.
    """
    if n <= 0:
        # Guard the division below: n == 0 used to raise ZeroDivisionError.
        return []
    items = sorted(titles.items())
    if len(items) <= n:
        return items
    step = len(items) / n
    return [items[int(i * step)] for i in range(n)]
|
|
|
|
| def _metrics(ranks: list[int | None]) -> dict[str, float]: |
| total = len(ranks) |
| if not total: |
| return {} |
| found = [r for r in ranks if r is not None] |
| return { |
| "recall@1": sum(1 for r in found if r <= 1) / total, |
| "recall@5": sum(1 for r in found if r <= 5) / total, |
| "recall@10": sum(1 for r in found if r <= 10) / total, |
| "mrr": sum(1.0 / r for r in found) / total, |
| "not_found": (total - len(found)) / total, |
| } |
|
|
|
|
def _run_probe(base_url, path, query, language, acceptable, **extra) -> int | None:
    """Issue one search request and score it against the acceptable keys.

    Sends ``q=query``, ``language=language`` and ``limit=20`` (plus any
    ``extra`` query parameters) to ``path`` on ``base_url`` via ``_fetch``.

    Returns the rank computed by ``_rank_of`` for the returned hits. If the
    request raises, the failure is reported on stderr and ``None`` is
    returned, so one bad request does not abort the evaluation run.
    """
    try:
        hits = _fetch(base_url, path, q=query, language=language, limit=20, **extra)
    except Exception as exc:
        # Deliberately broad: this is the best-effort boundary for a probe.
        print(f" (probe failed: {path} q={query!r}: {exc})", file=sys.stderr)
        return None
    else:
        return _rank_of(hits, acceptable)
|
|
|
|
def _positive_int(text: str) -> int:
    """argparse ``type=`` callback that accepts only integers >= 1."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
    return value


def main() -> int:
    """Run the evaluation from the command line and print a metrics table.

    Steps: parse arguments, load the catalog titles, take a deterministic
    sample, run four probes per sampled video (title search with the exact
    title, title search with the first N words, keyword search and hybrid
    search with the exact title), then print one row of metrics per probe.

    Returns:
        Process exit status: 0 after printing the table, 1 when the catalog
        cannot be read or parsed. Invalid arguments exit through argparse.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's usage block laid out as written
        # instead of letting argparse reflow it into one paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--base-url", default="http://localhost:8001")
    parser.add_argument("--language", default="E")
    # A sample size or word count below 1 cannot produce a meaningful probe.
    parser.add_argument("--sample", type=_positive_int, default=150)
    parser.add_argument(
        "--data-root",
        # The literal fallback is a machine-specific path; set
        # SEARCH_UI_DATA_ROOT or pass --data-root on any other machine.
        default=os.environ.get("SEARCH_UI_DATA_ROOT", "/Users/avsadmin/Documents/Apps/Search-UI/backend"),
    )
    parser.add_argument("--partial-words", type=_positive_int, default=4)
    args = parser.parse_args()

    try:
        titles = _load_titles(args.data_root, args.language)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError. Report a readable message
        # instead of a traceback for a missing or corrupt catalog.
        print(
            f"error: cannot load catalog for language {args.language!r} "
            f"under {args.data_root!r}: {exc}",
            file=sys.stderr,
        )
        return 1

    sample = _sample(titles, args.sample)
    print(f"Catalog: {len(titles)} titled videos; evaluating a deterministic sample of {len(sample)}.\n")

    # (label, endpoint, query with the partial title?, extra query parameters)
    probe_specs = [
        ("title-search (exact title)", "/api/search-title", False, {}),
        ("title-search (first N words)", "/api/search-title", True, {}),
        ("keyword (exact title)", "/api/search", False, {"method": "keyword"}),
        ("hybrid (exact title)", "/api/search", False, {"method": "hybrid"}),
    ]
    probes: dict[str, list[int | None]] = {label: [] for label, _, _, _ in probe_specs}

    for natural_key, title in sample:
        partial = _first_words(title, args.partial_words)
        acceptable = _acceptable_keys(natural_key, title, titles)
        for label, endpoint, use_partial, extra in probe_specs:
            query = partial if use_partial else title
            probes[label].append(
                _run_probe(args.base_url, endpoint, query, args.language, acceptable, **extra)
            )

    print(f"{'probe':32} {'recall@1':>9} {'recall@5':>9} {'recall@10':>10} {'MRR':>7} {'miss':>7}")
    print("-" * 80)
    for name, ranks in probes.items():
        m = _metrics(ranks)
        if m:
            print(
                f"{name:32} {m['recall@1']:>9.2%} {m['recall@5']:>9.2%} "
                f"{m['recall@10']:>10.2%} {m['mrr']:>7.3f} {m['not_found']:>7.2%}"
            )
    return 0
|
|
|
|
# Script entry point: run the evaluation and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|