from __future__ import annotations

import argparse
import re
import sys
from collections.abc import Callable, Iterable
from pathlib import Path
|
|
|
|
# ROOT_DIR is the parent of the directory that contains this script.
ROOT_DIR = Path(__file__).resolve().parents[1]

# Prepend it to sys.path exactly once; presumably this is what lets the
# `backend` package import further down resolve when the file is run directly.
_root_dir_entry = str(ROOT_DIR)
if _root_dir_entry not in sys.path:
    sys.path.insert(0, _root_dir_entry)
|
|
| from backend.spaces_index import recommend_spaces |
|
|
|
|
def _scenario(
    *,
    query: str,
    expected_ids: set[str],
    description: str,
    reason_terms: set[str],
) -> dict:
    """Bundle one evaluation case into the dict shape the evaluator reads."""
    return {
        "query": query,
        "expected_ids": expected_ids,
        "description": description,
        "reason_terms": reason_terms,
    }


# Evaluation cases. For each one:
#   query        - text sent to the recommender
#   expected_ids - the case is satisfied when ANY of these ids is returned
#   description  - human-readable intent, printed in verbose mode
#   reason_terms - at least one must occur (as a substring) in the top reason
SCENARIOS = [
    _scenario(
        query="kid games apps",
        expected_ids={"build-small-hackathon/reachy-to-the-rescue"},
        description="kid-friendly game intent should surface a clearly game-like result",
        reason_terms={"kid", "play", "game", "younger"},
    ),
    _scenario(
        query="suggest me some good writing apps",
        expected_ids={
            "build-small-hackathon/copy-campfire",
            "build-small-hackathon/anti-ill-comix",
            "build-small-hackathon/cuneiform-translator",
        },
        description="writing intent should favor writing-first spaces over generic assistants",
        reason_terms={"writing", "write", "journal", "creative"},
    ),
    _scenario(
        query="provide card games",
        expected_ids={"build-small-hackathon/MatchWise"},
        description="card-game intent should find a space whose central loop is card-based",
        reason_terms={"card", "cards", "match", "flip"},
    ),
    _scenario(
        query="do we have tutor apps?",
        expected_ids={
            "build-small-hackathon/tutori",
            "build-small-hackathon/educrate",
            "build-small-hackathon/pocket-tutor",
        },
        description="tutor intent should surface tutoring or teaching spaces",
        reason_terms={"tutor", "learning", "lesson", "teach"},
    ),
    _scenario(
        query="playing cards",
        expected_ids={"build-small-hackathon/MatchWise"},
        description="plain-language card intent should still map to the same relevant result",
        reason_terms={"card", "cards", "match", "flip"},
    ),
]
|
|
|
|
| def _sentence_count(text: str) -> int: |
| return len([part for part in re.split(r"(?<=[.!?])\s+", text.strip()) if part.strip()]) |
|
|
|
|
def evaluate(
    top_k: int,
    verbose: bool,
    *,
    scenarios: Iterable[dict] | None = None,
    recommend: Callable[..., list[dict]] | None = None,
) -> int:
    """Run each scenario through the recommender and report quality failures.

    For every scenario the recommender is called with the scenario query and
    the results are checked, in order:

    1. the result list must not be empty;
    2. at least one of the scenario's ``expected_ids`` must be among the
       returned ids (ids are compared after ``str()`` and ``strip()``);
    3. the top result's ``reason`` must have at most two sentences, must not
       contain "readme" (case-insensitive), and must contain at least one of
       the scenario's ``reason_terms`` as a substring.

    A scenario that fails check 1 or 2 records that single failure and skips
    the remaining checks.

    Args:
        top_k: Number of results requested from the recommender.
        verbose: When true, print the query, expectations and every returned
            item before the checks run.
        scenarios: Scenario dicts to evaluate; each needs ``query`` and
            ``expected_ids`` (plus ``description`` when ``verbose`` is true)
            and may carry ``reason_terms``. Defaults to the module-level
            ``SCENARIOS``.
        recommend: Callable invoked as ``recommend(query, "", "All",
            top_k=top_k)`` that returns a sequence of dict-like items.
            Defaults to ``recommend_spaces``. The meaning of the ``""`` and
            ``"All"`` arguments is defined by that function and is not
            visible from here.

    Returns:
        ``0`` when every scenario passes, ``1`` otherwise. A summary is
        printed in both cases.
    """
    # Resolve defaults at call time so the module-level names are only looked
    # up when the caller did not inject replacements.
    if scenarios is None:
        scenarios = SCENARIOS
    if recommend is None:
        recommend = recommend_spaces

    failures: list[str] = []
    for scenario in scenarios:
        query = scenario["query"]
        expected_ids = set(scenario["expected_ids"])
        results = recommend(query, "", "All", top_k=top_k)
        result_ids = [str(item.get("id", "")).strip() for item in results]

        if verbose:
            print(f"\nQUERY: {query}")
            print(f"Expected any of: {sorted(expected_ids)}")
            print(f"Description: {scenario['description']}")
            for idx, item in enumerate(results, 1):
                print(
                    f"{idx}. {item.get('id')} | {item.get('name')} | "
                    f"{item.get('category')} | {item.get('matched_signals')} | "
                    f"{(item.get('reason') or '')[:180]}"
                )

        # Emptiness must be tested before the id match: an empty result list
        # can never contain an expected id, so with the checks the other way
        # round this more specific message would never be reported.
        if not results:
            failures.append(f'Query "{query}" returned no results.')
            continue

        if expected_ids.isdisjoint(result_ids):
            failures.append(
                f'Query "{query}" did not return any expected ids in top {top_k}. '
                f"Expected one of {sorted(expected_ids)}, got {result_ids}."
            )
            continue

        # The remaining checks look only at the explanation of the top result.
        top_reason = str(results[0].get("reason", "") or "").strip()
        if top_reason:
            sentence_count = _sentence_count(top_reason)
            if sentence_count > 2:
                failures.append(
                    f'Query "{query}" returned a top reason with {sentence_count} sentences; expected at most 2.'
                )

        lowered_reason = top_reason.lower()
        if "readme" in lowered_reason:
            failures.append(
                f'Query "{query}" returned a top reason that mentioned README explicitly. '
                f"Reason was: {top_reason}"
            )

        # Substring match, so e.g. the term "card" is satisfied by "cards".
        reason_terms = set(scenario.get("reason_terms", set()) or set())
        if reason_terms and not any(term in lowered_reason for term in reason_terms):
            failures.append(
                f'Query "{query}" returned a top reason that did not mention expected concepts {sorted(reason_terms)}. '
                f"Reason was: {top_reason}"
            )

    if failures:
        print("\nRecommendation evaluation failed:")
        for failure in failures:
            print(f"- {failure}")
        return 1

    print("Recommendation evaluation passed.")
    return 0
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Parse command-line options and run the recommendation checks.

    Args:
        argv: Argument strings to parse instead of the process command line.
            ``None`` (the default) makes argparse read ``sys.argv[1:]``, so
            calling ``main()`` with no arguments behaves as before.

    Returns:
        The status returned by ``evaluate``.
    """
    parser = argparse.ArgumentParser(description="Run lightweight recommendation quality checks.")
    parser.add_argument("--top-k", type=int, default=5, help="Number of results to inspect for each scenario.")
    parser.add_argument("--quiet", action="store_true", help="Only print pass/fail summary.")
    args = parser.parse_args(argv)
    # Clamp instead of rejecting: a zero or negative --top-k still inspects one result.
    return evaluate(top_k=max(1, args.top_k), verbose=not args.quiet)
|
|
|
|
# Script entry point: the status returned by main() becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
|