jw-search / scripts /search_eval.py
jw-tools's picture
deploy: latest main (lazy-ML cold start, durable launcher, web-image search, scene search) + full-app data refresh
7ea1851 verified
#!/usr/bin/env python3
"""Objective search-quality evaluation harness.
Measures search quality without human relevance judgments, using the catalog
itself as ground truth: if a user searches a video's TITLE (exact, or the first
few words), that video should rank at/near the top. We report recall@k and MRR
for title search, keyword search, and hybrid search over a deterministic sample.
This turns "does search feel good?" into numbers you can move and compare.
Usage:
python scripts/search_eval.py [--base-url http://localhost:8001] \
[--language E] [--sample 150] [--data-root <backend dir>]
Run it before and after a ranking change to see the effect.
"""
from __future__ import annotations
import argparse
import json
import os
import re
import sys
import urllib.parse
import urllib.request
from pathlib import Path
def _load_titles(data_root: str, language: str) -> dict[str, str]:
    """Load the ``natural key -> title`` map for one language's catalog.

    Reads ``<data_root>/json/<language>/all_media_items.json``. The file may
    be either a JSON object keyed by natural key, or a JSON array of items
    that each carry their own ``naturalKey`` field.

    Malformed rows are skipped instead of raising, so a single bad catalog
    entry cannot abort the whole evaluation. A row is skipped when it is not
    an object, when its key is missing/empty/not a string, or when its
    ``title`` is missing/empty/not a string.

    Args:
        data_root: Directory that contains the ``json/`` catalog tree.
        language: Language code used as the sub-directory name.

    Returns:
        Mapping of natural key to title for every usable catalog entry.

    Raises:
        OSError: The catalog file cannot be opened.
        json.JSONDecodeError: The file is not valid JSON.
        ValueError: The top-level JSON value is neither an object nor an array.
    """
    path = os.path.join(data_root, "json", language, "all_media_items.json")
    with open(path, encoding="utf-8") as handle:
        media = json.load(handle)

    if isinstance(media, dict):
        entries = media.items()
    elif isinstance(media, list):
        # Non-dict array members have no ``naturalKey`` to read; drop them here.
        entries = ((m.get("naturalKey"), m) for m in media if isinstance(m, dict))
    else:
        raise ValueError(
            f"{path}: expected a JSON object or array, got {type(media).__name__}"
        )

    titles: dict[str, str] = {}
    for natural_key, item in entries:
        if not isinstance(natural_key, str) or not natural_key:
            continue
        if not isinstance(item, dict):
            continue
        title = item.get("title")
        # Later code does substring tests on the title, so only keep real strings.
        if isinstance(title, str) and title:
            titles[natural_key] = title
    return titles
def _fetch(base_url: str, path: str, **params) -> list[dict]:
    """GET ``base_url + path`` with ``params`` as the query string.

    Parameters whose value equals the empty string are left out of the query.
    The response body is parsed as JSON; when it is an object its ``results``
    entry is returned (``[]`` if the key is absent), and for any other JSON
    value an empty list is returned.

    Errors from opening the URL or decoding the body propagate to the caller.
    """
    query = {name: value for name, value in params.items() if value != ""}
    encoded = urllib.parse.urlencode(query)
    url = f"{base_url.rstrip('/')}{path}?{encoded}"
    # Generous timeout: the request is allowed up to two minutes.
    with urllib.request.urlopen(url, timeout=120) as resp:
        payload = json.load(resp)
    if not isinstance(payload, dict):
        return []
    return payload.get("results", [])
_AD_SUFFIX = "(With Audio Descriptions)"


def _acceptable_keys(natural_key: str, title: str, titles: dict[str, str]) -> set[str]:
    """Return the set of keys that count as a hit when querying ``title``.

    For an ordinary title the only acceptable key is ``natural_key`` itself.

    For a title containing the "(With Audio Descriptions)" marker, the marker
    is removed and every catalog entry whose title equals the remaining text
    is accepted instead, because title search intentionally collapses AD
    duplicates onto the primary (non-AD) version. If no entry carries the
    de-suffixed title, the result falls back to ``{natural_key}``.

    Note: this is a little more generous than production, which canonicalizes
    to a single primary. Where two unrelated videos share a title it
    over-accepts (~1% of AD probes), which nudges recall upward; that is fine
    for a relative before/after comparison. AD detection here looks only at
    the title text, whereas production's is_ad_media also keys off category;
    the two agree on the current catalog.
    """
    if _AD_SUFFIX not in title:
        return {natural_key}

    base_title = title.replace(_AD_SUFFIX, "").strip()
    matches: set[str] = set()
    for key, candidate in titles.items():
        if candidate == base_title:
            matches.add(key)

    if matches:
        return matches
    return {natural_key}
def _rank_of(results: list[dict], acceptable: set[str]) -> int | None:
"""Best 1-based rank among any acceptable key in results, or None."""
seen: list[str] = []
for r in results:
nk = r.get("natural_key")
if nk and nk not in seen:
seen.append(nk)
ranks = [seen.index(k) + 1 for k in acceptable if k in seen]
return min(ranks) if ranks else None
def _first_words(title: str, n: int) -> str:
    """Return the leading ``n`` word tokens of ``title`` joined by single spaces.

    A token is a maximal run of regex word characters, so punctuation and
    whitespace between words are discarded. If the title has fewer than ``n``
    tokens, all of them are returned.
    """
    tokens = [match.group() for match in re.finditer(r"\w+", title)]
    leading = tokens[:n]
    return " ".join(leading)
def _sample(titles: dict[str, str], n: int) -> list[tuple[str, str]]:
    """Pick up to ``n`` evenly spaced ``(natural_key, title)`` pairs.

    The pairs are sorted first so that the same catalog always produces the
    same sample, which keeps before/after runs comparable. When the catalog
    has ``n`` entries or fewer, every entry is returned.

    A non-positive ``n`` yields an empty sample. Previously ``n == 0`` with a
    non-empty catalog divided by zero, while negative ``n`` already returned
    an empty list; both now behave the same way.

    Args:
        titles: Mapping of natural key to title.
        n: Requested sample size.

    Returns:
        The selected pairs, in sorted order.
    """
    if n <= 0:
        return []
    items = sorted(titles.items())  # deterministic order
    if len(items) <= n:
        return items
    # Float stride is kept as-is so the selected indices (and therefore the
    # reported baseline numbers) do not shift.
    step = len(items) / n
    return [items[int(i * step)] for i in range(n)]
def _metrics(ranks: list[int | None]) -> dict[str, float]:
total = len(ranks)
if not total:
return {}
found = [r for r in ranks if r is not None]
return {
"recall@1": sum(1 for r in found if r <= 1) / total,
"recall@5": sum(1 for r in found if r <= 5) / total,
"recall@10": sum(1 for r in found if r <= 10) / total,
"mrr": sum(1.0 / r for r in found) / total,
"not_found": (total - len(found)) / total,
}
def _run_probe(base_url, path, query, language, acceptable, **extra) -> int | None:
    """Issue one search probe and return the best rank of an acceptable key.

    The request asks for up to 20 results. Any exception raised while fetching
    is reported on stderr and scored as a miss (``None``), so a single failed
    request cannot leave the per-probe rank lists with different lengths.
    Only the fetch is guarded; ranking the returned rows happens outside the
    ``try``.
    """
    try:
        hits = _fetch(base_url, path, q=query, language=language, limit=20, **extra)
    except Exception as exc:  # noqa: BLE001 — eval tool; a failed probe is a miss
        print(f" (probe failed: {path} q={query!r}: {exc})", file=sys.stderr)
        return None
    else:
        return _rank_of(hits, acceptable)
def main() -> int:
    """Run every probe over a deterministic catalog sample and print a table.

    Command-line options: ``--base-url``, ``--language``, ``--sample``,
    ``--data-root`` (defaults to the ``SEARCH_UI_DATA_ROOT`` environment
    variable when set) and ``--partial-words``.

    For each sampled video, four probes are issued in a fixed order: exact
    title and first-N-words against ``/api/search-title``, then exact title
    against ``/api/search`` with ``method=keyword`` and ``method=hybrid``.

    Returns:
        Always ``0``.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--base-url", default="http://localhost:8001")
    cli.add_argument("--language", default="E")
    cli.add_argument("--sample", type=int, default=150)
    cli.add_argument(
        "--data-root",
        default=os.environ.get("SEARCH_UI_DATA_ROOT", "/Users/avsadmin/Documents/Apps/Search-UI/backend"),
    )
    cli.add_argument("--partial-words", type=int, default=4)
    args = cli.parse_args()

    titles = _load_titles(args.data_root, args.language)
    sample = _sample(titles, args.sample)
    print(f"Catalog: {len(titles)} titled videos; evaluating a deterministic sample of {len(sample)}.\n")

    # (label, endpoint, query with the first-N-words form?, extra query params)
    plan = (
        ("title-search (exact title)", "/api/search-title", False, {}),
        ("title-search (first N words)", "/api/search-title", True, {}),
        ("keyword (exact title)", "/api/search", False, {"method": "keyword"}),
        ("hybrid (exact title)", "/api/search", False, {"method": "hybrid"}),
    )
    probes: dict[str, list[int | None]] = {label: [] for label, _, _, _ in plan}

    for natural_key, title in sample:
        partial = _first_words(title, args.partial_words)
        acceptable = _acceptable_keys(natural_key, title, titles)
        # _run_probe never raises on a failed fetch, so every list grows in step.
        for label, endpoint, use_partial, extra in plan:
            query = partial if use_partial else title
            rank = _run_probe(args.base_url, endpoint, query, args.language, acceptable, **extra)
            probes[label].append(rank)

    print(f"{'probe':32} {'recall@1':>9} {'recall@5':>9} {'recall@10':>10} {'MRR':>7} {'miss':>7}")
    print("-" * 80)
    for name, ranks in probes.items():
        m = _metrics(ranks)
        if not m:
            # No probes were run (empty sample): nothing to report for this row.
            continue
        row = (
            f"{name:32} {m['recall@1']:>9.2%} {m['recall@5']:>9.2%} "
            f"{m['recall@10']:>10.2%} {m['mrr']:>7.3f} {m['not_found']:>7.2%}"
        )
        print(row)
    return 0
# Script entry point: exit the process with main()'s return value as status.
if __name__ == "__main__":
    sys.exit(main())