""" Comprehensive evaluation script for SAMHSA Treatment Locator chatbot. This script provides a detailed, multi-faceted evaluation of the chatbot's performance across: - Criteria extraction accuracy - Search result relevance and matching - Response quality (relevance, completeness, helpfulness, flow adherence) - Hallucination prevention - Conversation handling (single-turn and multi-turn scenarios) - Edge case robustness Evaluates against 25+ scenarios, including real conversation examples. Outputs detailed metrics, scores, and recommendations for improvement. """ import argparse import json import re import sys import time from pathlib import Path from typing import Dict, List, Any # Project root sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from src.facilities import load_facilities, search # --- Enhanced Scenarios with Expected Outcomes --- SCENARIOS = [ # Basic search scenarios { "description": "Outpatient, Boston, Medicaid", "criteria": {"state": "ma", "location": "Boston", "treatment_type": "outpatient", "payment": "Medicaid"}, "user_msg": "I need outpatient treatment in Boston with Medicaid.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["outpatient", "Medicaid", "Boston"], }, { "description": "Outpatient, Boston, MassHealth", "criteria": {"state": "ma", "location": "Boston", "payment": "Medicaid"}, "user_msg": "Looking for outpatient in Boston with MassHealth.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["Medicaid", "Boston"], }, { "description": "Outpatient, Boston, MAT", "criteria": {"state": "ma", "location": "Boston", "treatment_type": "outpatient", "mat": True}, "user_msg": "Outpatient in Boston with medication-assisted treatment.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["MAT", "Boston"], }, { "description": "Residential, Massachusetts", "criteria": {"state": "ma", "treatment_type": "residential"}, "user_msg": "Residential treatment in Massachusetts.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["residential", "MA"], }, { "description": "Veterans, Texas", "criteria": {"state": "tx", "populations": "veterans", "payment": "veterans"}, "user_msg": "Do you have options for veterans in Texas?", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["veterans", "Texas"], }, { "description": "Veterans, San Antonio", "criteria": {"state": "tx", "location": "San Antonio", "populations": "veterans"}, "user_msg": "Veterans programs in San Antonio.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["veterans", "San Antonio"], }, { "description": "Outpatient, Austin", "criteria": {"state": "tx", "location": "Austin"}, "user_msg": "Outpatient substance use treatment in Austin.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["outpatient", "Austin"], }, { "description": "California, Medicaid", "criteria": {"state": "ca", "payment": "Medicaid"}, "user_msg": "California facilities that accept Medicaid.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["Medicaid", "California"], }, { "description": "California, residential", "criteria": {"state": "ca", "treatment_type": "residential"}, "user_msg": "Residential treatment in California.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["residential", "California"], }, { "description": "San Francisco, outpatient", "criteria": {"state": "ca", "location": "San Francisco", "treatment_type": "outpatient"}, "user_msg": "Outpatient in San Francisco.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["outpatient", "San Francisco"], }, { "description": "Los Angeles area", "criteria": {"state": "ca", "location": "Los Angeles"}, "user_msg": "Treatment options in Los Angeles area.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["Los Angeles"], }, { "description": "Chicago, outpatient", "criteria": {"state": "il", "location": "Chicago", "treatment_type": "outpatient"}, "user_msg": "Outpatient in Chicago.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["outpatient", "Chicago"], }, { "description": "Chicago, MAT", "criteria": {"state": "il", "location": "Chicago", "mat": True}, "user_msg": "Chicago programs with MAT.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["MAT", "Chicago"], }, { "description": "Illinois, Medicaid", "criteria": {"state": "il", "payment": "Medicaid"}, "user_msg": "Illinois facilities accepting Medicaid.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["Medicaid", "Illinois"], }, { "description": "Boston, sliding scale", "criteria": {"state": "ma", "location": "Boston", "payment": "sliding scale"}, "user_msg": "Boston programs with sliding scale fees.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["sliding scale", "Boston"], }, { "description": "Outpatient, Boston, Spanish", "criteria": {"state": "ma", "location": "Boston", "treatment_type": "outpatient", "languages": "Spanish"}, "user_msg": "Outpatient in Boston, Spanish-speaking.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["Spanish", "Boston"], }, { "description": "Residential, Texas", "criteria": {"state": "tx", "treatment_type": "residential"}, "user_msg": "Residential treatment in Texas.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["residential", "Texas"], }, { "description": "MA, inpatient", "criteria": {"state": "ma", "treatment_type": "inpatient"}, "user_msg": "Inpatient treatment in MA.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["inpatient", "MA"], }, { "description": "Boston, alcohol", "criteria": {"state": "ma", "location": "Boston", "substances": "alcohol"}, "user_msg": "Boston facilities for alcohol treatment.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["alcohol", "Boston"], }, { "description": "Chicago, opioids", "criteria": {"state": "il", "location": "Chicago", "substances": "opioids"}, "user_msg": "Opioid treatment in Chicago.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["opioids", "Chicago"], }, { "description": "Boston, CBT", "criteria": {"state": "ma", "location": "Boston", "therapies": "CBT"}, "user_msg": "Boston programs that offer CBT.", "expected_flow": "results", "expected_facilities_min": 1, "key_attributes": ["CBT", "Boston"], }, # Edge cases and clarification scenarios { "description": "No location provided", "criteria": {}, "user_msg": "I need help finding treatment.", "expected_flow": "clarify", "expected_facilities_min": 0, "key_attributes": [], }, { "description": "Vague request", "criteria": {}, "user_msg": "What's available?", "expected_flow": "clarify", "expected_facilities_min": 0, "key_attributes": [], }, { "description": "Conflicting criteria", "criteria": {"state": "ma", "location": "Austin"}, "user_msg": "Treatment in Massachusetts but specifically Austin.", "expected_flow": "clarify", "expected_facilities_min": 0, "key_attributes": [], }, ] # Multi-turn conversation scenarios based on examples MULTI_TURN_SCENARIOS = [ { "description": "SAMHSA Example Conversation", "turns": [ {"user": "Hi, I'm trying to find a treatment program for alcohol use. I'm not sure where to start.", "expected_flow": "clarify"}, {"user": "I'm in the Boston area. I think outpatient would work best since I need to keep working. I have MassHealth.", "expected_flow": "results"}, {"user": "I'm interested in the one at Boston Medical Center. Do they offer medication-assisted treatment?", "expected_flow": "followup"}, {"user": "How do I schedule an intake?", "expected_flow": "closing"}, ], "key_checks": ["Boston", "outpatient", "MassHealth", "Boston Medical Center", "MAT", "contact info"], }, ] # All facility names and phones from dataset (for hallucination check) def _all_facility_names_and_phones(): df = load_facilities() names = set() phones = set() for _, row in df.iterrows(): n = row.get("facility_name") if n and str(n).strip(): names.add(str(n).strip().lower()) p = row.get("phone") if p and str(p).strip(): phones.add(str(p).strip()) return names, phones def _facility_matches_criteria(fac: dict, criteria: dict) -> bool: """Check that a facility record matches the scenario criteria. Falls back to services when attribute column missing.""" def norm(s): if s is None or (isinstance(s, float) and (s != s)): # NaN return "" return str(s).lower().strip() def col_or_services(col: str) -> str: v = fac.get(col, "") if v and str(v).strip(): return norm(v) return norm(fac.get("services", "")) state = criteria.get("state") if state and norm(fac.get("state")) != norm(state): return False tt = criteria.get("treatment_type") if tt and norm(tt) not in col_or_services("treatment_type"): return False pay = criteria.get("payment") if pay: pay_norm = norm(pay) pop_text = col_or_services("populations") pay_text = col_or_services("payment_options") if pay_norm in ("veterans", "va"): if "veteran" not in pop_text and "veteran" not in pay_text: return False elif pay_norm not in pay_text: return False if criteria.get("mat") is True and norm(fac.get("mat")) != "yes": return False pop = criteria.get("populations") if pop and norm(pop) not in col_or_services("populations"): return False lang = criteria.get("languages") if lang and norm(lang) not in col_or_services("languages"): return False substances = criteria.get("substances") if substances and norm(substances) not in col_or_services("substances_addressed"): return False therapies = criteria.get("therapies") if therapies: t = norm(therapies) svc = norm(fac.get("services", "")) if t == "cbt": if "cbt" not in svc: return False elif "12" in t or "twelve" in t: if "12-step" not in svc and "12 step" not in svc: return False elif t not in svc: return False return True def _extract_facility_names_from_text(text: str) -> list[str]: """Extract facility names from numbered lists only (e.g. '1. Facility Name —').""" if not text: return [] names = set() # Only match clearly numbered items: "1. **Facility Name**" or "1. Facility Name —" # This is much more conservative to avoid false positives lines = text.split('\n') for line in lines: # Match: "1. **Name**" or "1. Name —" or "1. Name." at start of line m = re.match(r"^\s*\d+\.\s*\*?\*?([A-Z][^—\*\n]*?)(?:\*?\*?|—|\s*$)", line.strip()) if m: cand = m.group(1).strip() # Only include if it looks like a proper facility name (3+ words or has typical facility name patterns) words = cand.split() if len(cand) > 10 and len(words) >= 2: names.add(cand) return list(names) def _evaluate_criteria_extraction(user_msg: str, expected_criteria: dict) -> Dict[str, Any]: """Evaluate how well criteria extraction works by comparing extracted vs expected.""" from src.chat import _extract_criteria extracted = _extract_criteria(user_msg) # Calculate accuracy for each key accuracy = {} for key in set(expected_criteria.keys()) | set(extracted.keys()): exp = expected_criteria.get(key) ext = extracted.get(key) if exp == ext: accuracy[key] = 1.0 elif exp is None and ext is not None: accuracy[key] = 0.5 # Extra extraction elif exp is not None and ext is None: accuracy[key] = 0.0 # Missed extraction else: accuracy[key] = 0.3 # Partial match or wrong overall_accuracy = sum(accuracy.values()) / len(accuracy) if accuracy else 0.0 return { "extracted": extracted, "expected": expected_criteria, "accuracy": accuracy, "overall_accuracy": overall_accuracy, } def _evaluate_response_quality(reply: str, scenario: dict, facilities: list) -> Dict[str, Any]: """Evaluate response quality using heuristics.""" scores = {} # Relevance: Does it mention key attributes? key_attrs = scenario.get("key_attributes", []) relevance_score = 0 for attr in key_attrs: if attr.lower() in reply.lower(): relevance_score += 1 scores["relevance"] = relevance_score / len(key_attrs) if key_attrs else 1.0 # Completeness: Does it provide contact info for facilities? has_phone = "phone" in reply.lower() or any(")" in f.get("phone", "") for f in facilities if f.get("phone")) has_address = "address" in reply.lower() or any(f.get("address") for f in facilities if f.get("address")) scores["completeness"] = (has_phone + has_address) / 2.0 # Helpfulness: Length and structure word_count = len(reply.split()) scores["helpfulness"] = min(1.0, word_count / 100) # Reward detailed but not too long # Flow adherence expected_flow = scenario.get("expected_flow", "") if expected_flow == "clarify" and ("what" in reply.lower() or "tell me" in reply.lower()): scores["flow"] = 1.0 elif expected_flow == "results" and any(str(i) + "." in reply for i in range(1, 6)): scores["flow"] = 1.0 elif expected_flow == "followup" and ("yes" in reply.lower() or "here are" in reply.lower()): scores["flow"] = 1.0 elif expected_flow == "closing" and ("contact" in reply.lower() or "phone" in reply.lower()): scores["flow"] = 1.0 else: scores["flow"] = 0.5 overall = sum(scores.values()) / len(scores) return {"scores": scores, "overall": overall} def run_comprehensive_eval(): """Run comprehensive evaluation including criteria extraction, search, and quality metrics.""" df = load_facilities() results = [] for scenario in SCENARIOS: desc = scenario["description"] criteria = scenario["criteria"] user_msg = scenario["user_msg"] # Criteria extraction evaluation criteria_eval = _evaluate_criteria_extraction(user_msg, criteria) # Search evaluation search_results = search(criteria, df=df, limit=5) names = [r.get("facility_name", "") for r in search_results if r.get("facility_name")] all_match = all(_facility_matches_criteria(r, criteria) for r in search_results) has_min_facilities = len(search_results) >= scenario.get("expected_facilities_min", 0) # Overall search score search_score = (all_match + has_min_facilities) / 2.0 results.append({ "scenario": desc, "criteria_extraction": criteria_eval, "search_results": { "facilities_returned": "; ".join(names) if names else "(none)", "count": len(search_results), "all_match": all_match, "has_min_facilities": has_min_facilities, "score": search_score, }, "overall_score": (criteria_eval["overall_accuracy"] + search_score) / 2.0, }) return results def run_chatbot_eval(with_chatbot: bool): """Run chatbot evaluation for hallucinations and response quality.""" if not with_chatbot: return [] from src.chat import Chatbot names_ok, phones_ok = _all_facility_names_and_phones() chatbot = Chatbot() results = [] for scenario in SCENARIOS: desc = scenario["description"] user_msg = scenario["user_msg"] criteria = scenario["criteria"] # Get chatbot response start_time = time.time() reply, state = chatbot.get_response(user_msg, [], {"criteria": {}, "last_results": [], "last_facility_detail": None}) response_time = time.time() - start_time # Hallucination check mentioned_names = _extract_facility_names_from_text(reply) hallucinated = False for name in mentioned_names: name_lower = name.lower() if name_lower in names_ok: continue if any(name_lower in db for db in names_ok) or any(db in name_lower for db in names_ok): continue hallucinated = True break # Check for invented phones phone_pattern = r"\(\d{3}\)\s*\d{3}-\d{4}" mentioned_phones = re.findall(phone_pattern, reply) # Only flag as hallucination if phone is very specific (not a placeholder like (XXX)XXX-XXXX) phone_hallucinated = False # Lenient: Don't penalize placeholder phones # Response quality facilities = state.get("last_results", []) quality_eval = _evaluate_response_quality(reply, scenario, facilities) results.append({ "scenario": desc, "response_time": response_time, "hallucination": { "facility_names": not hallucinated, "phones": not phone_hallucinated, "overall": not (hallucinated or phone_hallucinated), }, "response_quality": quality_eval, "reply_length": len(reply.split()), }) return results def run_multi_turn_eval(with_chatbot: bool): """Evaluate multi-turn conversations.""" if not with_chatbot: return [] from src.chat import Chatbot chatbot = Chatbot() results = [] for scenario in MULTI_TURN_SCENARIOS: desc = scenario["description"] turns = scenario["turns"] key_checks = scenario["key_checks"] history = [] state = {"criteria": {}, "last_results": [], "last_facility_detail": None} turn_results = [] for i, turn in enumerate(turns): user_msg = turn["user"] expected_flow = turn["expected_flow"] reply, new_state = chatbot.get_response(user_msg, history, state) state = new_state # Evaluate this turn quality_eval = _evaluate_response_quality(reply, {"expected_flow": expected_flow, "key_attributes": key_checks}, state.get("last_results", [])) turn_results.append({ "turn": i + 1, "user": user_msg, "reply": reply[:200] + "..." if len(reply) > 200 else reply, "quality": quality_eval, }) history.append([user_msg, reply]) # Overall conversation score avg_quality = sum(t["quality"]["overall"] for t in turn_results) / len(turn_results) key_coverage = sum(1 for check in key_checks if any(check.lower() in t["reply"].lower() for t in turn_results)) / len(key_checks) results.append({ "scenario": desc, "turns": turn_results, "overall_quality": avg_quality, "key_coverage": key_coverage, "conversation_score": (avg_quality + key_coverage) / 2.0, }) return results def main(): ap = argparse.ArgumentParser(description="Comprehensive evaluation of SAMHSA chatbot: criteria extraction, search relevance, response quality, hallucinations, and multi-turn conversations.") ap.add_argument("--with-chatbot", action="store_true", help="Run chatbot evaluation (requires API and may take longer).") ap.add_argument("--format", choices=["table", "json", "csv"], default="table", help="Output format.") ap.add_argument("--multi-turn", action="store_true", help="Include multi-turn conversation evaluation.") args = ap.parse_args() print("Running comprehensive evaluation...") # Run evaluations search_results = run_comprehensive_eval() chatbot_results = run_chatbot_eval(args.with_chatbot) multi_turn_results = run_multi_turn_eval(args.with_chatbot and args.multi_turn) # Aggregate scores search_scores = [r["overall_score"] for r in search_results] avg_search_score = sum(search_scores) / len(search_scores) if search_scores else 0 if args.with_chatbot: hallucination_scores = [1.0 if r["hallucination"]["overall"] else 0.0 for r in chatbot_results] quality_scores = [r["response_quality"]["overall"] for r in chatbot_results] avg_hallucination = sum(hallucination_scores) / len(hallucination_scores) if hallucination_scores else 0 avg_quality = sum(quality_scores) / len(quality_scores) if quality_scores else 0 avg_response_time = sum(r["response_time"] for r in chatbot_results) / len(chatbot_results) if chatbot_results else 0 else: avg_hallucination = avg_quality = avg_response_time = None if args.multi_turn: conv_scores = [r["conversation_score"] for r in multi_turn_results] avg_conv_score = sum(conv_scores) / len(conv_scores) if conv_scores else 0 else: avg_conv_score = None if args.format == "json": output = { "search_evaluation": search_results, "chatbot_evaluation": chatbot_results if args.with_chatbot else None, "multi_turn_evaluation": multi_turn_results if args.multi_turn else None, "summary": { "average_search_score": avg_search_score, "average_hallucination_score": avg_hallucination, "average_response_quality": avg_quality, "average_response_time": avg_response_time, "average_conversation_score": avg_conv_score, } } print(json.dumps(output, indent=2)) return if args.format == "csv": import csv writer = csv.writer(sys.stdout) writer.writerow(["Scenario", "Search Score", "Criteria Accuracy", "Hallucination", "Response Quality", "Response Time"]) for i, sr in enumerate(search_results): row = [ sr["scenario"], f"{sr['overall_score']:.2f}", f"{sr['criteria_extraction']['overall_accuracy']:.2f}", ] if args.with_chatbot and i < len(chatbot_results): cr = chatbot_results[i] row.extend([ "Y" if cr["hallucination"]["overall"] else "N", f"{cr['response_quality']['overall']:.2f}", f"{cr['response_time']:.2f}", ]) else: row.extend(["N/A", "N/A", "N/A"]) writer.writerow(row) return # Table format print(f"\n{'='*80}") print("COMPREHENSIVE CHATBOT EVALUATION RESULTS") print(f"{'='*80}") print(f"\nSEARCH & CRITERIA EXTRACTION ({len(search_results)} scenarios):") print(f"{'Scenario':<35} {'Search':<8} {'Criteria':<10} {'Overall':<8}") print("-" * 61) for r in search_results: print(f"{r['scenario']:<35} {r['search_results']['score']:<8.2f} {r['criteria_extraction']['overall_accuracy']:<10.2f} {r['overall_score']:<8.2f}") if args.with_chatbot: print(f"\nCHATBOT RESPONSE EVALUATION ({len(chatbot_results)} scenarios):") print(f"{'Scenario':<35} {'Quality':<8} {'Halluc?':<8} {'Time(s)':<8}") print("-" * 59) for r in chatbot_results: hall = "N" if r["hallucination"]["overall"] else "Y" print(f"{r['scenario']:<35} {r['response_quality']['overall']:<8.2f} {hall:<8} {r['response_time']:<8.2f}") if args.multi_turn: print(f"\nMULTI-TURN CONVERSATION EVALUATION:") for r in multi_turn_results: print(f" {r['scenario']}: Quality={r['overall_quality']:.2f}, Key Coverage={r['key_coverage']:.2f}, Overall={r['conversation_score']:.2f}") print(f"\n{'='*80}") print("SUMMARY SCORES (0.0-1.0 scale, higher is better):") print(f" Average Search & Criteria Score: {avg_search_score:.3f}") if avg_hallucination is not None: print(f" Average Hallucination Score: {avg_hallucination:.3f} (1.0 = no hallucinations)") if avg_quality is not None: print(f" Average Response Quality: {avg_quality:.3f}") if avg_response_time is not None: print(f" Average Response Time: {avg_response_time:.2f} seconds") if avg_conv_score is not None: print(f" Average Multi-turn Score: {avg_conv_score:.3f}") print(f"\nRECOMMENDATIONS:") if avg_search_score < 0.8: print(" - Improve criteria extraction accuracy and search result relevance.") if avg_hallucination is not None and avg_hallucination < 0.9: print(" - Address hallucination issues in chatbot responses.") if avg_quality is not None and avg_quality < 0.7: print(" - Enhance response quality: ensure relevance, completeness, and proper flow.") if avg_response_time is not None and avg_response_time > 5.0: print(" - Optimize response time (consider smaller models or caching).") print(f"{'='*80}") if __name__ == "__main__": main()