Spaces:

tejasashinde
/

HackathonSpaceRecommender

Running

App Files Files Community

HackathonSpaceRecommender / scripts /evaluate_recommendations.py

tejasashinde

Corrected logic and UI

6f97a9b 3 days ago

Raw

History Blame Contribute Delete

5.01 kB

	from __future__ import annotations

	import argparse
	import re
	import sys
	from pathlib import Path


	# Repository root: this file's resolved location, two directory levels up.
	ROOT_DIR = Path(__file__).resolve().parents[1]

	# Put the repository root at the front of the import search path (once) so
	# the project-local import that follows resolves when the script is run directly.
	if str(ROOT_DIR) not in sys.path:
	    sys.path[:0] = [str(ROOT_DIR)]

	from backend.spaces_index import recommend_spaces


	# Evaluation scenarios. Each entry carries:
	#   query        - text sent to the recommender
	#   expected_ids - space ids, at least one of which must appear in the results
	#   description  - human-readable statement of the intent being checked
	#   reason_terms - substrings, at least one of which the top reason must contain
	SCENARIOS = [
	    # Kid-friendly games.
	    {
	        "query": "kid games apps",
	        "expected_ids": {"build-small-hackathon/reachy-to-the-rescue"},
	        "description": "kid-friendly game intent should surface a clearly game-like result",
	        "reason_terms": {"kid", "play", "game", "younger"},
	    },
	    # Writing tools.
	    {
	        "query": "suggest me some good writing apps",
	        "expected_ids": {
	            "build-small-hackathon/copy-campfire",
	            "build-small-hackathon/anti-ill-comix",
	            "build-small-hackathon/cuneiform-translator",
	        },
	        "description": "writing intent should favor writing-first spaces over generic assistants",
	        "reason_terms": {"writing", "write", "journal", "creative"},
	    },
	    # Card games, phrased as a request.
	    {
	        "query": "provide card games",
	        "expected_ids": {"build-small-hackathon/MatchWise"},
	        "description": "card-game intent should find a space whose central loop is card-based",
	        "reason_terms": {"card", "cards", "match", "flip"},
	    },
	    # Tutoring and teaching.
	    {
	        "query": "do we have tutor apps?",
	        "expected_ids": {
	            "build-small-hackathon/tutori",
	            "build-small-hackathon/educrate",
	            "build-small-hackathon/pocket-tutor",
	        },
	        "description": "tutor intent should surface tutoring or teaching spaces",
	        "reason_terms": {"tutor", "learning", "lesson", "teach"},
	    },
	    # Card games, phrased in plain language.
	    {
	        "query": "playing cards",
	        "expected_ids": {"build-small-hackathon/MatchWise"},
	        "description": "plain-language card intent should still map to the same relevant result",
	        "reason_terms": {"card", "cards", "match", "flip"},
	    },
	]


	def _sentence_count(text: str) -> int:
	return len([part for part in re.split(r"(?<=[.!?])\s+", text.strip()) if part.strip()])


	def evaluate(
	    top_k: int,
	    verbose: bool,
	    *,
	    scenarios: list[dict] | None = None,
	    recommend=None,
	) -> int:
	    """Run each scenario through the recommender and report quality failures.

	    For every scenario the recommender is called as
	    ``recommend(query, "", "All", top_k=top_k)`` and the results are checked:

	    * the result list must not be empty;
	    * at least one of the scenario's ``expected_ids`` must be among the
	      returned ids;
	    * the top result's ``reason`` (when non-empty) must have at most two
	      sentences, must not contain "readme" (case-insensitive), and must
	      contain at least one of the scenario's ``reason_terms`` when that set
	      is given and non-empty.

	    Args:
	        top_k: Number of results requested from the recommender per query.
	        verbose: When true, print the query, expectations and every result.
	        scenarios: Scenario dicts to evaluate; defaults to the module-level
	            ``SCENARIOS``.
	        recommend: Callable used to fetch results; defaults to the
	            module-level ``recommend_spaces``.

	    Returns:
	        ``0`` when every scenario passes, ``1`` otherwise. A summary (and the
	        list of failures, if any) is printed to stdout.
	    """
	    active_scenarios = SCENARIOS if scenarios is None else scenarios
	    recommend_fn = recommend_spaces if recommend is None else recommend
	    failures: list[str] = []

	    for scenario in active_scenarios:
	        query = scenario["query"]
	        expected_ids = set(scenario["expected_ids"])
	        results = recommend_fn(query, "", "All", top_k=top_k)
	        result_ids = [str(item.get("id", "")).strip() for item in results]

	        if verbose:
	            print(f"\nQUERY: {query}")
	            print(f"Expected any of: {sorted(expected_ids)}")
	            print(f"Description: {scenario['description']}")
	            for idx, item in enumerate(results, 1):
	                # Plain "|" separators: "\|" is an invalid escape sequence and
	                # would print a stray backslash.
	                print(
	                    f"{idx}. {item.get('id')} | {item.get('name')} | "
	                    f"{item.get('category')} | {item.get('matched_signals')} | "
	                    f"{(item.get('reason') or '')[:180]}"
	                )

	        # Checked before the expected-id test: an empty result list can never
	        # contain an expected id, so the other order would hide this message.
	        if not results:
	            failures.append(f'Query "{query}" returned no results.')
	            continue

	        if not expected_ids.intersection(result_ids):
	            failures.append(
	                f'Query "{query}" did not return any expected ids in top {top_k}. '
	                f"Expected one of {sorted(expected_ids)}, got {result_ids}."
	            )
	            continue

	        # Only the first (highest-ranked) result's explanation is inspected.
	        top_reason = str(results[0].get("reason", "") or "").strip()
	        if top_reason:
	            sentence_count = _sentence_count(top_reason)
	            if sentence_count > 2:
	                failures.append(
	                    f'Query "{query}" returned a top reason with {sentence_count} sentences; expected at most 2.'
	                )

	        lowered_reason = top_reason.lower()
	        if "readme" in lowered_reason:
	            failures.append(
	                f'Query "{query}" returned a top reason that mentioned README explicitly. '
	                f"Reason was: {top_reason}"
	            )

	        # Substring match: any single expected term appearing in the reason is enough.
	        reason_terms = set(scenario.get("reason_terms") or ())
	        if reason_terms and not any(term in lowered_reason for term in reason_terms):
	            failures.append(
	                f'Query "{query}" returned a top reason that did not mention expected concepts {sorted(reason_terms)}. '
	                f"Reason was: {top_reason}"
	            )

	    if failures:
	        print("\nRecommendation evaluation failed:")
	        for failure in failures:
	            print(f"- {failure}")
	        return 1

	    print("Recommendation evaluation passed.")
	    return 0


	def main() -> int:
	    """Parse command-line options, run the evaluation and return its exit code.

	    ``--top-k`` (default 5) sets how many results are inspected per scenario
	    and is clamped to at least 1; ``--quiet`` turns off per-query output.
	    """
	    cli = argparse.ArgumentParser(description="Run lightweight recommendation quality checks.")
	    cli.add_argument(
	        "--top-k",
	        type=int,
	        default=5,
	        help="Number of results to inspect for each scenario.",
	    )
	    cli.add_argument(
	        "--quiet",
	        action="store_true",
	        help="Only print pass/fail summary.",
	    )
	    options = cli.parse_args()

	    # Never ask the recommender for fewer than one result.
	    inspected = options.top_k if options.top_k > 1 else 1
	    return evaluate(top_k=inspected, verbose=not options.quiet)


	# Script entry point: use main()'s return value as the process exit status.
	if __name__ == "__main__":
	    sys.exit(main())