File size: 5,009 Bytes
6f97a9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from __future__ import annotations

import argparse
import re
import sys
from pathlib import Path


# Put the directory one level above this file's own directory on the import
# search path before the `backend` import below runs. Presumably that is the
# repository root, so the script can be launched directly — confirm layout.
ROOT_DIR = Path(__file__).resolve().parents[1]
if str(ROOT_DIR) not in sys.path:
    # Insert at the front so this checkout wins over other sys.path entries.
    sys.path.insert(0, str(ROOT_DIR))

from backend.spaces_index import recommend_spaces


# Evaluation scenarios consumed by `evaluate`. Each entry has:
#   query         - text passed to `recommend_spaces`.
#   expected_ids  - ids of which at least one must appear in the returned
#                   results for the scenario to pass.
#   description   - human-readable intent; only printed in verbose mode.
#   reason_terms  - lower-case substrings; the top result's reason (when it is
#                   non-empty) must contain at least one of them.
SCENARIOS = [
    {
        "query": "kid games apps",
        "expected_ids": {"build-small-hackathon/reachy-to-the-rescue"},
        "description": "kid-friendly game intent should surface a clearly game-like result",
        "reason_terms": {"kid", "play", "game", "younger"},
    },
    {
        "query": "suggest me some good writing apps",
        "expected_ids": {
            "build-small-hackathon/copy-campfire",
            "build-small-hackathon/anti-ill-comix",
            "build-small-hackathon/cuneiform-translator",
        },
        "description": "writing intent should favor writing-first spaces over generic assistants",
        "reason_terms": {"writing", "write", "journal", "creative"},
    },
    {
        "query": "provide card games",
        "expected_ids": {"build-small-hackathon/MatchWise"},
        "description": "card-game intent should find a space whose central loop is card-based",
        "reason_terms": {"card", "cards", "match", "flip"},
    },
    {
        "query": "do we have tutor apps?",
        "expected_ids": {
            "build-small-hackathon/tutori",
            "build-small-hackathon/educrate",
            "build-small-hackathon/pocket-tutor",
        },
        "description": "tutor intent should surface tutoring or teaching spaces",
        "reason_terms": {"tutor", "learning", "lesson", "teach"},
    },
    {
        "query": "playing cards",
        "expected_ids": {"build-small-hackathon/MatchWise"},
        "description": "plain-language card intent should still map to the same relevant result",
        "reason_terms": {"card", "cards", "match", "flip"},
    },
]


def _sentence_count(text: str) -> int:
    return len([part for part in re.split(r"(?<=[.!?])\s+", text.strip()) if part.strip()])


def evaluate(top_k: int, verbose: bool) -> int:
    """Run every scenario in ``SCENARIOS`` and report recommendation quality.

    For each scenario the query is passed to ``recommend_spaces`` and the
    returned items are checked in this order:

    * at least one result must be returned;
    * at least one of the scenario's ``expected_ids`` must be among the
      returned ids;
    * if the first result has a non-empty ``reason``, that reason must be at
      most two sentences long, must not contain "readme" (case-insensitive),
      and must contain at least one of the scenario's ``reason_terms``
      (substring match against the lower-cased reason).

    Args:
        top_k: Number of results requested from ``recommend_spaces`` per query.
        verbose: When true, print the query, the expected ids, the scenario
            description and a one-line summary of every returned item.

    Returns:
        ``0`` when no failure was recorded, ``1`` otherwise. A summary (and
        each failure, if any) is printed to stdout.
    """
    failures: list[str] = []
    for scenario in SCENARIOS:
        query = scenario["query"]
        expected_ids = set(scenario["expected_ids"])
        results = recommend_spaces(query, "", "All", top_k=top_k)
        result_ids = [str(item.get("id", "")).strip() for item in results]
        if verbose:
            print(f"\nQUERY: {query}")
            print(f"Expected any of: {sorted(expected_ids)}")
            print(f"Description: {scenario['description']}")
            for idx, item in enumerate(results, 1):
                print(
                    f"{idx}. {item.get('id')} | {item.get('name')} | "
                    f"{item.get('category')} | {item.get('matched_signals')} | "
                    f"{(item.get('reason') or '')[:180]}"
                )

        # Emptiness must be tested before the id match: an empty result list
        # can never contain an expected id, so testing the match first would
        # make this more specific diagnostic unreachable.
        if not results:
            failures.append(f'Query "{query}" returned no results.')
            continue

        if expected_ids.isdisjoint(result_ids):
            failures.append(
                f'Query "{query}" did not return any expected ids in top {top_k}. '
                f"Expected one of {sorted(expected_ids)}, got {result_ids}."
            )
            continue

        # Only the top-ranked item's explanation is quality-checked.
        top_reason = str(results[0].get("reason", "") or "").strip()
        if not top_reason:
            # A missing/blank reason is not treated as a failure.
            continue

        sentence_count = _sentence_count(top_reason)
        if sentence_count > 2:
            failures.append(
                f'Query "{query}" returned a top reason with {sentence_count} sentences; expected at most 2.'
            )

        lowered_reason = top_reason.lower()
        if "readme" in lowered_reason:
            failures.append(
                f'Query "{query}" returned a top reason that mentioned README explicitly. '
                f"Reason was: {top_reason}"
            )

        reason_terms = set(scenario.get("reason_terms", set()) or set())
        if reason_terms and not any(term in lowered_reason for term in reason_terms):
            failures.append(
                f'Query "{query}" returned a top reason that did not mention expected concepts {sorted(reason_terms)}. '
                f"Reason was: {top_reason}"
            )

    if failures:
        print("\nRecommendation evaluation failed:")
        for failure in failures:
            print(f"- {failure}")
        return 1

    print("Recommendation evaluation passed.")
    return 0


def main() -> int:
    """Parse command-line options and run the evaluation.

    ``--top-k`` (default 5) sets how many results are inspected per scenario;
    values below 1 are raised to 1. ``--quiet`` turns off the per-query
    listing. The return value is whatever ``evaluate`` returns.
    """
    cli = argparse.ArgumentParser(description="Run lightweight recommendation quality checks.")
    cli.add_argument("--top-k", type=int, default=5, help="Number of results to inspect for each scenario.")
    cli.add_argument("--quiet", action="store_true", help="Only print pass/fail summary.")
    options = cli.parse_args()

    # Never ask for fewer than one result.
    effective_top_k = options.top_k if options.top_k >= 1 else 1
    show_details = not options.quiet
    return evaluate(top_k=effective_top_k, verbose=show_details)


# Script entry point: the value returned by main() becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())