from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
# Resolve the directory one level above the folder containing this script and
# put it at the front of sys.path so the `backend` package import below works
# when the script is run directly.
ROOT_DIR = Path(__file__).resolve().parents[1]
_root_entry = str(ROOT_DIR)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)
from backend.spaces_index import recommend_spaces
# Evaluation scenarios. Each entry holds:
#   query        - the free-text request sent to the recommender
#   expected_ids - space ids, any one of which counts as a hit
#   description  - human-readable statement of what the scenario checks
#   reason_terms - terms, at least one of which should appear in the top reason
SCENARIOS = [
    # Kid-friendly games.
    {
        "query": "kid games apps",
        "expected_ids": {
            "build-small-hackathon/reachy-to-the-rescue",
        },
        "description": "kid-friendly game intent should surface a clearly game-like result",
        "reason_terms": {
            "kid",
            "play",
            "game",
            "younger",
        },
    },
    # Writing tools.
    {
        "query": "suggest me some good writing apps",
        "expected_ids": {
            "build-small-hackathon/copy-campfire",
            "build-small-hackathon/anti-ill-comix",
            "build-small-hackathon/cuneiform-translator",
        },
        "description": "writing intent should favor writing-first spaces over generic assistants",
        "reason_terms": {
            "writing",
            "write",
            "journal",
            "creative",
        },
    },
    # Card games, phrased as a request.
    {
        "query": "provide card games",
        "expected_ids": {
            "build-small-hackathon/MatchWise",
        },
        "description": "card-game intent should find a space whose central loop is card-based",
        "reason_terms": {
            "card",
            "cards",
            "match",
            "flip",
        },
    },
    # Tutoring and teaching.
    {
        "query": "do we have tutor apps?",
        "expected_ids": {
            "build-small-hackathon/tutori",
            "build-small-hackathon/educrate",
            "build-small-hackathon/pocket-tutor",
        },
        "description": "tutor intent should surface tutoring or teaching spaces",
        "reason_terms": {
            "tutor",
            "learning",
            "lesson",
            "teach",
        },
    },
    # Card games, phrased in plain language.
    {
        "query": "playing cards",
        "expected_ids": {
            "build-small-hackathon/MatchWise",
        },
        "description": "plain-language card intent should still map to the same relevant result",
        "reason_terms": {
            "card",
            "cards",
            "match",
            "flip",
        },
    },
]
def _sentence_count(text: str) -> int:
return len([part for part in re.split(r"(?<=[.!?])\s+", text.strip()) if part.strip()])
def evaluate(top_k: int, verbose: bool) -> int:
    """Run every entry in ``SCENARIOS`` through ``recommend_spaces`` and report.

    For each scenario the following checks are applied, in order:

    * the recommender returned at least one result;
    * at least one of the scenario's expected ids is among the returned ids;
    * when the first result has a non-empty ``reason``: it has at most two
      sentences, its lower-cased text does not contain ``"readme"``, and it
      contains at least one of the scenario's ``reason_terms`` as a substring.

    The first two checks short-circuit the scenario on failure; the reason
    checks are all evaluated and may each add a failure.

    Args:
        top_k: Number of results requested from ``recommend_spaces``.
        verbose: When true, print the query, the expected ids, the scenario
            description, and one line per returned item.

    Returns:
        ``0`` if no scenario recorded a failure, otherwise ``1`` (after
        printing every failure message).
    """
    failures: list[str] = []

    for scenario in SCENARIOS:
        query = scenario["query"]
        expected_ids = set(scenario["expected_ids"])
        results = recommend_spaces(query, "", "All", top_k=top_k)
        result_ids = [str(item.get("id", "")).strip() for item in results]

        if verbose:
            print(f"\nQUERY: {query}")
            print(f"Expected any of: {sorted(expected_ids)}")
            print(f"Description: {scenario['description']}")
            for idx, item in enumerate(results, 1):
                print(
                    f"{idx}. {item.get('id')} | {item.get('name')} | "
                    f"{item.get('category')} | {item.get('matched_signals')} | "
                    f"{(item.get('reason') or '')[:180]}"
                )

        # Test for an empty result list before the id match: an empty list can
        # never contain an expected id, so checking the match first would make
        # this more specific message unreachable.
        if not results:
            failures.append(f'Query "{query}" returned no results.')
            continue

        matched = expected_ids & set(result_ids)
        if not matched:
            failures.append(
                f'Query "{query}" did not return any expected ids in top {top_k}. '
                f"Expected one of {sorted(expected_ids)}, got {result_ids}."
            )
            continue

        # Only the first (highest-ranked) result's reason is quality-checked,
        # and only when it is non-empty.
        top_reason = str(results[0].get("reason", "") or "").strip()
        if top_reason:
            sentence_count = _sentence_count(top_reason)
            if sentence_count > 2:
                failures.append(
                    f'Query "{query}" returned a top reason with {sentence_count} sentences; expected at most 2.'
                )

            lowered_reason = top_reason.lower()
            if "readme" in lowered_reason:
                failures.append(
                    f'Query "{query}" returned a top reason that mentioned README explicitly. '
                    f"Reason was: {top_reason}"
                )

            # Substring match against the lower-cased reason; terms are used
            # exactly as written in the scenario.
            reason_terms = set(scenario.get("reason_terms", set()) or set())
            if reason_terms and not any(term in lowered_reason for term in reason_terms):
                failures.append(
                    f'Query "{query}" returned a top reason that did not mention expected concepts {sorted(reason_terms)}. '
                    f"Reason was: {top_reason}"
                )

    if failures:
        print("\nRecommendation evaluation failed:")
        for failure in failures:
            print(f"- {failure}")
        return 1

    print("Recommendation evaluation passed.")
    return 0
def main() -> int:
    """Parse command-line options and run the evaluation.

    Options:
        --top-k: integer number of results to inspect per scenario (default 5).
        --quiet: suppress per-scenario output.

    Returns:
        The status code returned by ``evaluate``.
    """
    cli = argparse.ArgumentParser(description="Run lightweight recommendation quality checks.")
    cli.add_argument(
        "--top-k",
        type=int,
        default=5,
        help="Number of results to inspect for each scenario.",
    )
    cli.add_argument(
        "--quiet",
        action="store_true",
        help="Only print pass/fail summary.",
    )
    options = cli.parse_args()

    # Clamp to at least 1 so a zero or negative --top-k is never passed on.
    effective_top_k = max(1, options.top_k)
    show_details = not options.quiet
    return evaluate(top_k=effective_top_k, verbose=show_details)
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|