# HackathonSpaceRecommender/scripts/evaluate_recommendations.py
# Last commit: "Corrected logic and UI" (6f97a9b), by tejasashinde.
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
# Directory two levels above this file (the parent of the folder holding this
# script). It is put at the front of the import search path so the
# ``backend`` package imported below resolves when the script is run directly.
ROOT_DIR = Path(__file__).resolve().parents[1]
_root_entry = str(ROOT_DIR)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)
from backend.spaces_index import recommend_spaces
# Evaluation scenarios consumed by ``evaluate``. Each entry provides:
#   query         - the search text handed to the recommender
#   expected_ids  - space ids; at least one must appear in the returned results
#   description   - human-readable intent, printed in verbose mode
#   reason_terms  - substrings; at least one must occur in the lower-cased
#                   reason of the first result (when that reason is non-empty)
SCENARIOS = [
    # Kid-oriented games.
    {
        "query": "kid games apps",
        "expected_ids": {
            "build-small-hackathon/reachy-to-the-rescue",
        },
        "description": "kid-friendly game intent should surface a clearly game-like result",
        "reason_terms": {
            "kid",
            "play",
            "game",
            "younger",
        },
    },
    # Writing tools.
    {
        "query": "suggest me some good writing apps",
        "expected_ids": {
            "build-small-hackathon/copy-campfire",
            "build-small-hackathon/anti-ill-comix",
            "build-small-hackathon/cuneiform-translator",
        },
        "description": "writing intent should favor writing-first spaces over generic assistants",
        "reason_terms": {
            "writing",
            "write",
            "journal",
            "creative",
        },
    },
    # Card games, phrased as a request.
    {
        "query": "provide card games",
        "expected_ids": {
            "build-small-hackathon/MatchWise",
        },
        "description": "card-game intent should find a space whose central loop is card-based",
        "reason_terms": {
            "card",
            "cards",
            "match",
            "flip",
        },
    },
    # Tutoring / teaching.
    {
        "query": "do we have tutor apps?",
        "expected_ids": {
            "build-small-hackathon/tutori",
            "build-small-hackathon/educrate",
            "build-small-hackathon/pocket-tutor",
        },
        "description": "tutor intent should surface tutoring or teaching spaces",
        "reason_terms": {
            "tutor",
            "learning",
            "lesson",
            "teach",
        },
    },
    # Card games again, phrased in plain language; same expectation as above.
    {
        "query": "playing cards",
        "expected_ids": {
            "build-small-hackathon/MatchWise",
        },
        "description": "plain-language card intent should still map to the same relevant result",
        "reason_terms": {
            "card",
            "cards",
            "match",
            "flip",
        },
    },
]
def _sentence_count(text: str) -> int:
return len([part for part in re.split(r"(?<=[.!?])\s+", text.strip()) if part.strip()])
def evaluate(top_k: int, verbose: bool) -> int:
    """Run every entry of ``SCENARIOS`` through ``recommend_spaces`` and report.

    For each scenario the following checks are applied, in order:

    * the recommender must return at least one result;
    * at least one of the scenario's ``expected_ids`` must be among the
      returned ids (ids are compared after ``str()`` and ``strip()``);
    * if the first result has a non-empty ``reason``, that text must contain
      at most two sentences (per ``_sentence_count``), must not contain
      "readme" in any letter case, and its lower-cased form must contain at
      least one of the scenario's ``reason_terms`` as a substring.

    A scenario that fails one of the first two checks is not examined
    further. All collected failures are printed at the end.

    Args:
        top_k: Number of results requested from ``recommend_spaces``; also
            quoted in failure messages.
        verbose: When true, print the query, the expected ids, the scenario
            description and one summary line per returned result.

    Returns:
        ``0`` if no check failed, ``1`` otherwise.
    """
    failures: list[str] = []

    for scenario in SCENARIOS:
        query = scenario["query"]
        expected_ids = set(scenario["expected_ids"])
        # NOTE(review): "" and "All" are passed positionally for every
        # scenario; presumably neutral filter values - confirm against the
        # signature of recommend_spaces.
        results = recommend_spaces(query, "", "All", top_k=top_k)
        result_ids = [str(item.get("id", "")).strip() for item in results]
        matched = expected_ids & set(result_ids)

        if verbose:
            print(f"\nQUERY: {query}")
            print(f"Expected any of: {sorted(expected_ids)}")
            print(f"Description: {scenario['description']}")
            for idx, item in enumerate(results, 1):
                print(
                    f"{idx}. {item.get('id')} | {item.get('name')} | "
                    f"{item.get('category')} | {item.get('matched_signals')} | "
                    # Reasons are truncated so each result stays on one line.
                    f"{(item.get('reason') or '')[:180]}"
                )

        # The empty-result check must come before the id check: an empty
        # result list can never match, so testing ``matched`` first would
        # make this more specific message unreachable.
        if not results:
            failures.append(f'Query "{query}" returned no results.')
            continue

        if not matched:
            failures.append(
                f'Query "{query}" did not return any expected ids in top {top_k}. '
                f"Expected one of {sorted(expected_ids)}, got {result_ids}."
            )
            continue

        # Only the first (top-ranked) result's reason is quality-checked.
        top_reason = str(results[0].get("reason", "") or "").strip()
        if not top_reason:
            # A missing or blank reason is not treated as a failure.
            continue

        sentence_count = _sentence_count(top_reason)
        if sentence_count > 2:
            failures.append(
                f'Query "{query}" returned a top reason with {sentence_count} sentences; expected at most 2.'
            )

        lowered_reason = top_reason.lower()
        if "readme" in lowered_reason:
            failures.append(
                f'Query "{query}" returned a top reason that mentioned README explicitly. '
                f"Reason was: {top_reason}"
            )

        reason_terms = set(scenario.get("reason_terms", set()) or set())
        # Plain substring test: a term also matches inside a longer word.
        if reason_terms and not any(term in lowered_reason for term in reason_terms):
            failures.append(
                f'Query "{query}" returned a top reason that did not mention expected concepts {sorted(reason_terms)}. '
                f"Reason was: {top_reason}"
            )

    if failures:
        print("\nRecommendation evaluation failed:")
        for failure in failures:
            print(f"- {failure}")
        return 1

    print("Recommendation evaluation passed.")
    return 0
def main() -> int:
    """Parse the command-line options and run the evaluation.

    Options:
        --top-k N   number of results to inspect per scenario (default 5;
                    values below 1 are raised to 1)
        --quiet     suppress the per-scenario listing

    Returns:
        The status code returned by ``evaluate``.
    """
    cli = argparse.ArgumentParser(description="Run lightweight recommendation quality checks.")
    cli.add_argument(
        "--top-k",
        type=int,
        default=5,
        help="Number of results to inspect for each scenario.",
    )
    cli.add_argument(
        "--quiet",
        action="store_true",
        help="Only print pass/fail summary.",
    )
    options = cli.parse_args()

    # Always inspect at least one result, whatever was given on the command line.
    requested = options.top_k if options.top_k > 1 else 1
    return evaluate(top_k=requested, verbose=not options.quiet)
# When run as a script, exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())