"""
Partition 2: User just wants to browse schools.

Expected chatbot behavior:
- MUST return at least 1 real school name
- MUST NOT gatekeep ("I can't help without your address")
- Should match what the database would return for the same query

Requires: HF_TOKEN (skipped automatically if not set)
"""
import json
from pathlib import Path

import pytest

from helpers import (
    contains_any,
    not_gatekeeping,
    schools_overlap,
    has_list,
)

# JSON is read explicitly as UTF-8 so the result does not depend on the
# platform's locale encoding.
TEST_DATA = json.loads(
    (Path(__file__).parent / "test_data" / "test_cases.json").read_text(
        encoding="utf-8"
    )
)["partition2_browsing"]

pytestmark = pytest.mark.chatbot

# One reusable decorator instead of repeating the parametrize call per test.
_CASE_IDS = [c["id"] for c in TEST_DATA]
_all_cases = pytest.mark.parametrize("case", TEST_DATA, ids=_CASE_IDS)

# Only the first two cases are sent to the LLM judge.
_JUDGE_CASES = TEST_DATA[:2]
_judge_cases = pytest.mark.parametrize(
    "case", _JUDGE_CASES, ids=[c["id"] for c in _JUDGE_CASES]
)

# A stripped response must be strictly longer than this many characters.
_MIN_RESPONSE_CHARS = 20

# Used when a test case does not define its own "similarity_threshold".
_DEFAULT_SIMILARITY_THRESHOLD = 0.45


# ── Helper: run the matching DB query for a test case ────────────────────────

def _run_db_query(db, case: dict) -> list:
    """Run the test case's ``db_query`` against the database.

    Args:
        db: Object exposing ``search(query, grade, has_language_program,
            surround_care, top_k)``.
        case: One entry of ``TEST_DATA``; its optional ``"db_query"`` mapping
            supplies the search arguments.

    Returns:
        Whatever ``db.search`` returns for the query.
    """
    # ``or {}`` also covers a case whose "db_query" key is present but null.
    q = case.get("db_query") or {}
    return db.search(
        query=q.get("query", ""),
        grade=q.get("grade"),
        has_language_program=q.get("has_language_program"),
        surround_care=q.get("surround_care"),
        top_k=q.get("top_k", 5),
    )


def _school_names(db_results) -> list:
    """Return the ``"school"`` value of every dict result that has one.

    Entries that are not dicts, or lack a ``"school"`` key, are skipped so
    that building a failure message can never raise and hide the real
    assertion error.
    """
    return [r["school"] for r in db_results if isinstance(r, dict) and "school" in r]


def _parse_judge_scores(raw) -> dict:
    """Parse the judge's reply into a dict of scores, failing the test if invalid.

    Args:
        raw: The judge message content; may be ``None``.

    Returns:
        The decoded JSON object.
    """
    text = (raw or "").strip()
    if text.startswith("```"):
        # Judges sometimes wrap JSON in a Markdown fence: drop the opening
        # fence line (e.g. "```json") and everything from the closing fence.
        _, _, text = text.partition("\n")
        text = text.rsplit("```", 1)[0].strip()
    try:
        scores = json.loads(text)
    except json.JSONDecodeError:
        pytest.fail(f"Judge returned invalid JSON: {raw}")
    if not isinstance(scores, dict):
        pytest.fail(f"Judge returned invalid JSON: {raw}")
    return scores


# ── Rule-based tests ──────────────────────────────────────────────────────────

@_all_cases
def test_response_contains_at_least_one_school_name(chatbot, db, case):
    """Response must mention at least one real school name from DB results.

    Fails (rather than skips) when the database returns no school names,
    because the response cannot be validated without them.
    """
    response = chatbot.get_response(case["input"])
    db_results = _run_db_query(db, case)
    school_names = _school_names(db_results)

    assert school_names, (
        f"[{case['id']}] DB returned no results for query — cannot validate school names.\n"
        f"DB query: {case.get('db_query')}"
    )
    assert schools_overlap(response, db_results, min_overlap=1), (
        f"[{case['id']}] Response did not mention any school from DB results.\n"
        f"Input: {case['input']!r}\n"
        f"Response: {response!r}\n"
        f"DB schools: {school_names}"
    )


@_all_cases
def test_response_does_not_gatekeep(chatbot, case):
    """Response MUST NOT refuse to help without an address."""
    response = chatbot.get_response(case["input"])
    assert not_gatekeeping(response), (
        f"[{case['id']}] Response appeared to gatekeep (demanded address before helping).\n"
        f"Input: {case['input']!r}\n"
        f"Response: {response!r}"
    )


@_all_cases
def test_response_is_not_empty(chatbot, case):
    """Response must be a string with more than 20 non-padding characters."""
    response = chatbot.get_response(case["input"])
    # The isinstance check turns a non-string response into a clean test
    # failure instead of an AttributeError on ``.strip()``.
    assert isinstance(response, str) and len(response.strip()) > _MIN_RESPONSE_CHARS, (
        f"[{case['id']}] Response is empty or too short.\n"
        f"Input: {case['input']!r}\n"
        f"Response: {response!r}"
    )


# ── Retrieval alignment tests ─────────────────────────────────────────────────

@_all_cases
def test_chatbot_schools_overlap_with_database(chatbot, db, case):
    """At least 1 school recommended by chatbot should appear in DB results.

    Skipped when the database returns nothing for the case's query.
    """
    response = chatbot.get_response(case["input"])
    db_results = _run_db_query(db, case)

    if not db_results:
        pytest.skip(f"[{case['id']}] DB returned no results for this query")

    assert schools_overlap(response, db_results, min_overlap=1), (
        f"[{case['id']}] No overlap between chatbot response and database results.\n"
        f"Input: {case['input']!r}\n"
        f"Response: {response!r}\n"
        f"DB top schools: {_school_names(db_results[:5])}"
    )


# ── Embedding similarity tests ────────────────────────────────────────────────

@_all_cases
def test_response_similarity_to_reference(chatbot, similarity_checker, case):
    """Response semantic similarity to reference answer must meet threshold.

    The threshold is the case's ``"similarity_threshold"`` if present,
    otherwise 0.45.
    """
    response = chatbot.get_response(case["input"])
    score = similarity_checker(response, case["reference_answer"])
    threshold = case.get("similarity_threshold", _DEFAULT_SIMILARITY_THRESHOLD)

    assert score >= threshold, (
        f"[{case['id']}] Embedding similarity {score:.3f} < threshold {threshold}.\n"
        f"Input: {case['input']!r}\n"
        f"Response: {response!r}\n"
        f"Reference: {case['reference_answer']!r}"
    )


# ── LLM-as-judge tests ────────────────────────────────────────────────────────

@pytest.mark.llm_judge
@_judge_cases
def test_llm_judge_browsing_response(chatbot, openai_judge, case):
    """LLM judge rates the response; relevance, helpfulness, completeness are gated.

    The judge model is taken from ``openai_judge._judge_model``. The judge is
    asked for four scores, but only three are asserted: relevance >= 3,
    helpfulness >= 3 and completeness >= 2. Accuracy is requested and not
    checked.
    """
    response = chatbot.get_response(case["input"])

    # Built from adjacent literals; the resulting prompt text is a single
    # space-separated string.
    judge_prompt = (
        "Rate this chatbot response on a 1-5 scale. "
        "Return ONLY valid JSON with keys: relevance, completeness, accuracy, helpfulness. "
        f"User question: {case['input']} "
        f"Chatbot response: {response} "
        "Context: The chatbot helps Boston parents browse schools. "
        "It should name real schools and NOT refuse to help because the user hasn't provided their address. "
        'Return JSON only. Example: {"relevance": 4, "completeness": 3, "accuracy": 5, "helpfulness": 4}'
    )

    completion = openai_judge.chat.completions.create(
        model=openai_judge._judge_model,
        messages=[{"role": "user", "content": judge_prompt}],
        temperature=0,
    )

    scores = _parse_judge_scores(completion.choices[0].message.content)

    assert scores.get("relevance", 0) >= 3, f"relevance too low: {scores}"
    assert scores.get("helpfulness", 0) >= 3, f"helpfulness too low: {scores}"
    assert scores.get("completeness", 0) >= 2, f"completeness too low: {scores}"