Spaces:

jw-tools
/

jw-search

Running

App Files Files Community

jw-search / scripts /search_eval.py

jw-tools

deploy: latest main (lazy-ML cold start, durable launcher, web-image search, scene search) + full-app data refresh

7ea1851 verified about 9 hours ago

raw

history blame contribute delete

6.88 kB

	#!/usr/bin/env python3
	"""Objective search-quality evaluation harness.

	Measures search quality without human relevance judgments, using the catalog
	itself as ground truth: if a user searches a video's TITLE (exact, or the first
	few words), that video should rank at/near the top. We report recall@k and MRR
	for title search, keyword search, and hybrid search over a deterministic sample.

	This turns "does search feel good?" into numbers you can move and compare.

	Usage:
	python scripts/search_eval.py [--base-url http://localhost:8001] \
	[--language E] [--sample 150] [--data-root <backend dir>]

	Run it before and after a ranking change to see the effect.
	"""

	from __future__ import annotations

	import argparse
	import json
	import os
	import re
	import sys
	import urllib.parse
	import urllib.request
	from pathlib import Path


	def _load_titles(data_root: str, language: str) -> dict[str, str]:
	    """Load the catalog and return a ``natural_key -> title`` mapping.

	    Reads ``<data_root>/json/<language>/all_media_items.json``. The file may
	    hold either an object keyed by natural key, or an array of item objects
	    that each carry their key in a ``"naturalKey"`` field.

	    Entries that are not objects, have no key, or have no non-empty
	    ``"title"`` are skipped, so a single malformed array element does not
	    abort the whole evaluation with an ``AttributeError``.

	    Raises:
	        OSError: if the catalog file cannot be opened.
	        json.JSONDecodeError: if the file is not valid JSON.
	    """
	    path = os.path.join(data_root, "json", language, "all_media_items.json")
	    with open(path, encoding="utf-8") as handle:
	        media = json.load(handle)
	    if isinstance(media, dict):
	        entries = media.items()
	    else:
	        # Array form: drop non-object elements *before* calling .get() on
	        # them; a stray null/string in the array would otherwise raise.
	        entries = ((m.get("naturalKey"), m) for m in media if isinstance(m, dict))
	    return {
	        natural_key: item["title"]
	        for natural_key, item in entries
	        if natural_key and isinstance(item, dict) and item.get("title")
	    }


	def _fetch(base_url: str, path: str, **params) -> list[dict]:
	    """GET ``base_url`` + ``path`` and return the ``"results"`` field of the JSON reply.

	    ``params`` become the query string; parameters whose value is the empty
	    string are omitted. A trailing ``/`` on ``base_url`` is stripped before
	    ``path`` is appended. If the decoded body is not a JSON object, an empty
	    list is returned; if it is an object without ``"results"``, likewise.

	    Errors raised while opening the URL or decoding the body propagate to the
	    caller. The request uses a 120 second timeout.
	    """
	    query = {name: value for name, value in params.items() if value != ""}
	    url = base_url.rstrip("/") + path + "?" + urllib.parse.urlencode(query)
	    with urllib.request.urlopen(url, timeout=120) as resp:
	        payload = json.load(resp)
	    if not isinstance(payload, dict):
	        return []
	    return payload.get("results", [])


	_AD_SUFFIX = "(With Audio Descriptions)"


	def _acceptable_keys(natural_key: str, title: str, titles: dict[str, str]) -> set[str]:
	    """Return the keys that count as a correct hit for this video's title query.

	    The probed video itself is always acceptable: if a search for its exact
	    title returns that very video, that is a hit for every probe type.

	    Title search intentionally collapses "(With Audio Descriptions)" duplicates
	    to the primary (non-AD) version, so an AD title query is additionally
	    considered found when ANY video with the de-suffixed primary title is
	    returned.

	    Note: this is slightly more generous than production, which canonicalizes to
	    a single primary; on the rare titles shared by two unrelated videos this
	    over-accepts (~1% of AD probes), nudging recall optimistically. Acceptable
	    for a relative before/after measure. AD detection here is the literal title
	    marker; production's is_ad_media also keys off category — they agree on the
	    current catalog.

	    Args:
	        natural_key: key of the video whose title is being probed.
	        title: that video's title (the query text).
	        titles: full ``natural_key -> title`` catalog mapping.
	    """
	    acceptable = {natural_key}
	    if _AD_SUFFIX in title:
	        primary_title = title.replace(_AD_SUFFIX, "").strip()
	        # Linear scan of the catalog; only runs for AD titles in the sample.
	        acceptable.update(k for k, t in titles.items() if t == primary_title)
	    return acceptable


	def _rank_of(results: list[dict], acceptable: set[str]) -> int | None:
	    """Return the best 1-based rank of any acceptable key in ``results``, or None.

	    Ranks are counted over distinct natural keys in result order: repeated
	    keys and entries without a usable ``"natural_key"`` do not occupy a rank.

	    Malformed entries (non-dict items, or a missing/empty/non-string key) are
	    skipped rather than raising, so one bad payload cannot abort a whole
	    evaluation run. A ``None`` or empty ``results`` yields ``None``.
	    """
	    seen: set[str] = set()
	    for entry in results or ():
	        if not isinstance(entry, dict):
	            continue
	        key = entry.get("natural_key")
	        if not isinstance(key, str) or not key or key in seen:
	            continue
	        seen.add(key)
	        # Keys are visited in rank order, so the first acceptable key seen is
	        # necessarily the best (lowest) rank; len(seen) is its 1-based rank.
	        if key in acceptable:
	            return len(seen)
	    return None


	def _first_words(title: str, n: int) -> str:
	    """Return the leading ``n`` word tokens of ``title`` joined by single spaces.

	    Tokens are maximal runs of regex word characters, so punctuation and
	    whitespace between them are dropped. ``n`` is applied as a slice bound.
	    """
	    tokens = [match.group() for match in re.finditer(r"\w+", title)]
	    leading = tokens[:n]
	    return " ".join(leading)


	def _sample(titles: dict[str, str], n: int) -> list[tuple[str, str]]:
	    """Pick up to ``n`` evenly spaced ``(natural_key, title)`` pairs, deterministically.

	    Items are sorted by key first, so the same catalog and ``n`` always give
	    the same sample. If the catalog has at most ``n`` items, all of them are
	    returned. A non-positive ``n`` yields an empty sample.
	    """
	    if n <= 0:
	        # n == 0 with a non-empty catalog would divide by zero below;
	        # negative n already produced an empty sample.
	        return []
	    items = sorted(titles.items())  # deterministic order
	    if len(items) <= n:
	        return items
	    # Fractional stride so the picks span the whole sorted catalog.
	    step = len(items) / n
	    return [items[int(i * step)] for i in range(n)]


	def _metrics(ranks: list[int | None]) -> dict[str, float]:
	    """Summarize probe ranks as recall@1/5/10, MRR and miss rate.

	    ``ranks`` holds one entry per probe: the 1-based rank of the hit, or
	    ``None`` for a miss. Every figure is divided by the total number of
	    probes, misses included. An empty ``ranks`` yields an empty dict.
	    """
	    total = len(ranks)
	    if total == 0:
	        return {}
	    hits = [rank for rank in ranks if rank is not None]

	    def recall_at(cutoff: int) -> float:
	        # Share of all probes whose hit landed within the top `cutoff`.
	        return len([rank for rank in hits if rank <= cutoff]) / total

	    reciprocal_sum = sum(1.0 / rank for rank in hits)
	    misses = total - len(hits)
	    return {
	        "recall@1": recall_at(1),
	        "recall@5": recall_at(5),
	        "recall@10": recall_at(10),
	        "mrr": reciprocal_sum / total,
	        "not_found": misses / total,
	    }


	def _run_probe(base_url, path, query, language, acceptable, **extra) -> int | None:
	    """Run one search probe and return the best rank of an acceptable key.

	    Issues a request for ``query`` (with ``limit=20`` plus any ``extra``
	    parameters) and ranks the response against ``acceptable``. If the fetch
	    raises, the failure is reported on stderr and the probe counts as a miss
	    (``None``), so every metric keeps the same sample size / denominator.
	    """
	    try:
	        hits = _fetch(base_url, path, q=query, language=language, limit=20, **extra)
	    except Exception as exc:  # noqa: BLE001 — eval tool; a failed probe is a miss
	        print(f" (probe failed: {path} q={query!r}: {exc})", file=sys.stderr)
	        rank = None
	    else:
	        rank = _rank_of(hits, acceptable)
	    return rank


	def main() -> int:
	    """Run the evaluation from the command line and print a metrics table.

	    Loads the title catalog, draws a deterministic sample, and for every
	    sampled video issues four probes in a fixed order (title search with the
	    exact title, title search with the first N words, keyword search, hybrid
	    search). Prints one row of recall@1/5/10, MRR and miss rate per probe
	    type and returns 0 as the process exit code.
	    """
	    parser = argparse.ArgumentParser(description=__doc__)
	    parser.add_argument("--base-url", default="http://localhost:8001")
	    parser.add_argument("--language", default="E")
	    parser.add_argument("--sample", type=int, default=150)
	    parser.add_argument(
	        "--data-root",
	        default=os.environ.get("SEARCH_UI_DATA_ROOT", "/Users/avsadmin/Documents/Apps/Search-UI/backend"),
	    )
	    parser.add_argument("--partial-words", type=int, default=4)
	    args = parser.parse_args()

	    titles = _load_titles(args.data_root, args.language)
	    sample = _sample(titles, args.sample)
	    print(f"Catalog: {len(titles)} titled videos; evaluating a deterministic sample of {len(sample)}.\n")

	    # One row per probe type: (label, endpoint, query is the partial title?, extra params).
	    # Row order is both the request order per video and the report order.
	    probe_specs = (
	        ("title-search (exact title)", "/api/search-title", False, {}),
	        ("title-search (first N words)", "/api/search-title", True, {}),
	        ("keyword (exact title)", "/api/search", False, {"method": "keyword"}),
	        ("hybrid (exact title)", "/api/search", False, {"method": "hybrid"}),
	    )
	    probes: dict[str, list[int | None]] = {label: [] for label, *_ in probe_specs}

	    for natural_key, title in sample:
	        partial = _first_words(title, args.partial_words)
	        acceptable = _acceptable_keys(natural_key, title, titles)
	        # Each _run_probe self-guards, so all four lists stay the same length.
	        for label, endpoint, use_partial, extra in probe_specs:
	            query = partial if use_partial else title
	            rank = _run_probe(args.base_url, endpoint, query, args.language, acceptable, **extra)
	            probes[label].append(rank)

	    print(f"{'probe':32} {'recall@1':>9} {'recall@5':>9} {'recall@10':>10} {'MRR':>7} {'miss':>7}")
	    print("-" * 80)
	    for name, ranks in probes.items():
	        m = _metrics(ranks)
	        if not m:
	            # Empty sample: nothing to report for this probe type.
	            continue
	        print(
	            f"{name:32} {m['recall@1']:>9.2%} {m['recall@5']:>9.2%} "
	            f"{m['recall@10']:>10.2%} {m['mrr']:>7.3f} {m['not_found']:>7.2%}"
	        )
	    return 0


	if __name__ == "__main__":
	    # Script entry point: propagate main()'s return value as the exit status.
	    sys.exit(main())