6.C395-chatbot / scripts /eval_chatbot.py
gabimimi's picture
Changes are mainly to the evaluation script and chat.py; the score increased from 0.66 to 0.89. Also added evaluation methods for hallucinations and related checks, but they could only be run once due to lack of credits. The recorded Response Quality was 0.882. Disclaimer: answers now take longer to arrive.
740774d
"""
Comprehensive evaluation script for SAMHSA Treatment Locator chatbot.
This script provides a detailed, multi-faceted evaluation of the chatbot's performance across:
- Criteria extraction accuracy
- Search result relevance and matching
- Response quality (relevance, completeness, helpfulness, flow adherence)
- Hallucination prevention
- Conversation handling (single-turn and multi-turn scenarios)
- Edge case robustness
Evaluates against 25+ scenarios, including real conversation examples.
Outputs detailed metrics, scores, and recommendations for improvement.
"""
import argparse
import json
import re
import sys
import time
from pathlib import Path
from typing import Dict, List, Any
# Project root
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from src.facilities import load_facilities, search
# --- Enhanced Scenarios with Expected Outcomes ---
def _scenario(description, criteria, user_msg, key_attributes,
              expected_flow="results", expected_facilities_min=1):
    """Build one single-turn scenario record.

    Fields:
        description: short label for the scenario.
        criteria: expected search criteria for the message.
        user_msg: the user message under test.
        expected_flow: conversation flow the reply should follow.
        expected_facilities_min: minimum number of search hits expected.
        key_attributes: terms a relevant reply should mention.
    """
    return {
        "description": description,
        "criteria": criteria,
        "user_msg": user_msg,
        "expected_flow": expected_flow,
        "expected_facilities_min": expected_facilities_min,
        "key_attributes": key_attributes,
    }


SCENARIOS = [
    # Basic search scenarios (expected flow "results", at least one facility)
    _scenario(
        "Outpatient, Boston, Medicaid",
        {"state": "ma", "location": "Boston", "treatment_type": "outpatient", "payment": "Medicaid"},
        "I need outpatient treatment in Boston with Medicaid.",
        ["outpatient", "Medicaid", "Boston"],
    ),
    # NOTE(review): the message says "outpatient" but the expected criteria
    # carry no treatment_type -- confirm this is intentional.
    _scenario(
        "Outpatient, Boston, MassHealth",
        {"state": "ma", "location": "Boston", "payment": "Medicaid"},
        "Looking for outpatient in Boston with MassHealth.",
        ["Medicaid", "Boston"],
    ),
    _scenario(
        "Outpatient, Boston, MAT",
        {"state": "ma", "location": "Boston", "treatment_type": "outpatient", "mat": True},
        "Outpatient in Boston with medication-assisted treatment.",
        ["MAT", "Boston"],
    ),
    _scenario(
        "Residential, Massachusetts",
        {"state": "ma", "treatment_type": "residential"},
        "Residential treatment in Massachusetts.",
        ["residential", "MA"],
    ),
    _scenario(
        "Veterans, Texas",
        {"state": "tx", "populations": "veterans", "payment": "veterans"},
        "Do you have options for veterans in Texas?",
        ["veterans", "Texas"],
    ),
    _scenario(
        "Veterans, San Antonio",
        {"state": "tx", "location": "San Antonio", "populations": "veterans"},
        "Veterans programs in San Antonio.",
        ["veterans", "San Antonio"],
    ),
    # NOTE(review): "outpatient" is a key attribute here but is absent from
    # the expected criteria -- confirm this is intentional.
    _scenario(
        "Outpatient, Austin",
        {"state": "tx", "location": "Austin"},
        "Outpatient substance use treatment in Austin.",
        ["outpatient", "Austin"],
    ),
    _scenario(
        "California, Medicaid",
        {"state": "ca", "payment": "Medicaid"},
        "California facilities that accept Medicaid.",
        ["Medicaid", "California"],
    ),
    _scenario(
        "California, residential",
        {"state": "ca", "treatment_type": "residential"},
        "Residential treatment in California.",
        ["residential", "California"],
    ),
    _scenario(
        "San Francisco, outpatient",
        {"state": "ca", "location": "San Francisco", "treatment_type": "outpatient"},
        "Outpatient in San Francisco.",
        ["outpatient", "San Francisco"],
    ),
    _scenario(
        "Los Angeles area",
        {"state": "ca", "location": "Los Angeles"},
        "Treatment options in Los Angeles area.",
        ["Los Angeles"],
    ),
    _scenario(
        "Chicago, outpatient",
        {"state": "il", "location": "Chicago", "treatment_type": "outpatient"},
        "Outpatient in Chicago.",
        ["outpatient", "Chicago"],
    ),
    _scenario(
        "Chicago, MAT",
        {"state": "il", "location": "Chicago", "mat": True},
        "Chicago programs with MAT.",
        ["MAT", "Chicago"],
    ),
    _scenario(
        "Illinois, Medicaid",
        {"state": "il", "payment": "Medicaid"},
        "Illinois facilities accepting Medicaid.",
        ["Medicaid", "Illinois"],
    ),
    _scenario(
        "Boston, sliding scale",
        {"state": "ma", "location": "Boston", "payment": "sliding scale"},
        "Boston programs with sliding scale fees.",
        ["sliding scale", "Boston"],
    ),
    _scenario(
        "Outpatient, Boston, Spanish",
        {"state": "ma", "location": "Boston", "treatment_type": "outpatient", "languages": "Spanish"},
        "Outpatient in Boston, Spanish-speaking.",
        ["Spanish", "Boston"],
    ),
    _scenario(
        "Residential, Texas",
        {"state": "tx", "treatment_type": "residential"},
        "Residential treatment in Texas.",
        ["residential", "Texas"],
    ),
    _scenario(
        "MA, inpatient",
        {"state": "ma", "treatment_type": "inpatient"},
        "Inpatient treatment in MA.",
        ["inpatient", "MA"],
    ),
    _scenario(
        "Boston, alcohol",
        {"state": "ma", "location": "Boston", "substances": "alcohol"},
        "Boston facilities for alcohol treatment.",
        ["alcohol", "Boston"],
    ),
    _scenario(
        "Chicago, opioids",
        {"state": "il", "location": "Chicago", "substances": "opioids"},
        "Opioid treatment in Chicago.",
        ["opioids", "Chicago"],
    ),
    _scenario(
        "Boston, CBT",
        {"state": "ma", "location": "Boston", "therapies": "CBT"},
        "Boston programs that offer CBT.",
        ["CBT", "Boston"],
    ),
    # Edge cases and clarification scenarios (no facilities required)
    _scenario(
        "No location provided",
        {},
        "I need help finding treatment.",
        [],
        expected_flow="clarify",
        expected_facilities_min=0,
    ),
    _scenario(
        "Vague request",
        {},
        "What's available?",
        [],
        expected_flow="clarify",
        expected_facilities_min=0,
    ),
    _scenario(
        "Conflicting criteria",
        {"state": "ma", "location": "Austin"},
        "Treatment in Massachusetts but specifically Austin.",
        [],
        expected_flow="clarify",
        expected_facilities_min=0,
    ),
]
# Multi-turn conversation scenarios based on examples
# (user message, expected flow) pairs for the example conversation, in order.
_SAMHSA_EXAMPLE_TURNS = (
    ("Hi, I'm trying to find a treatment program for alcohol use. I'm not sure where to start.", "clarify"),
    ("I'm in the Boston area. I think outpatient would work best since I need to keep working. I have MassHealth.", "results"),
    ("I'm interested in the one at Boston Medical Center. Do they offer medication-assisted treatment?", "followup"),
    ("How do I schedule an intake?", "closing"),
)

# Each conversation lists its ordered turns plus the terms ("key_checks")
# that are looked for in the replies.
MULTI_TURN_SCENARIOS = [
    {
        "description": "SAMHSA Example Conversation",
        "turns": [
            {"user": message, "expected_flow": flow}
            for message, flow in _SAMHSA_EXAMPLE_TURNS
        ],
        "key_checks": ["Boston", "outpatient", "MassHealth", "Boston Medical Center", "MAT", "contact info"],
    },
]
# All facility names and phones from dataset (for hallucination check)
def _all_facility_names_and_phones():
df = load_facilities()
names = set()
phones = set()
for _, row in df.iterrows():
n = row.get("facility_name")
if n and str(n).strip():
names.add(str(n).strip().lower())
p = row.get("phone")
if p and str(p).strip():
phones.add(str(p).strip())
return names, phones
def _facility_matches_criteria(fac: dict, criteria: dict) -> bool:
"""Check that a facility record matches the scenario criteria. Falls back to services when attribute column missing."""
def norm(s):
if s is None or (isinstance(s, float) and (s != s)): # NaN
return ""
return str(s).lower().strip()
def col_or_services(col: str) -> str:
v = fac.get(col, "")
if v and str(v).strip():
return norm(v)
return norm(fac.get("services", ""))
state = criteria.get("state")
if state and norm(fac.get("state")) != norm(state):
return False
tt = criteria.get("treatment_type")
if tt and norm(tt) not in col_or_services("treatment_type"):
return False
pay = criteria.get("payment")
if pay:
pay_norm = norm(pay)
pop_text = col_or_services("populations")
pay_text = col_or_services("payment_options")
if pay_norm in ("veterans", "va"):
if "veteran" not in pop_text and "veteran" not in pay_text:
return False
elif pay_norm not in pay_text:
return False
if criteria.get("mat") is True and norm(fac.get("mat")) != "yes":
return False
pop = criteria.get("populations")
if pop and norm(pop) not in col_or_services("populations"):
return False
lang = criteria.get("languages")
if lang and norm(lang) not in col_or_services("languages"):
return False
substances = criteria.get("substances")
if substances and norm(substances) not in col_or_services("substances_addressed"):
return False
therapies = criteria.get("therapies")
if therapies:
t = norm(therapies)
svc = norm(fac.get("services", ""))
if t == "cbt":
if "cbt" not in svc:
return False
elif "12" in t or "twelve" in t:
if "12-step" not in svc and "12 step" not in svc:
return False
elif t not in svc:
return False
return True
def _extract_facility_names_from_text(text: str) -> list[str]:
"""Extract facility names from numbered lists only (e.g. '1. Facility Name —')."""
if not text:
return []
names = set()
# Only match clearly numbered items: "1. **Facility Name**" or "1. Facility Name —"
# This is much more conservative to avoid false positives
lines = text.split('\n')
for line in lines:
# Match: "1. **Name**" or "1. Name —" or "1. Name." at start of line
m = re.match(r"^\s*\d+\.\s*\*?\*?([A-Z][^—\*\n]*?)(?:\*?\*?|—|\s*$)", line.strip())
if m:
cand = m.group(1).strip()
# Only include if it looks like a proper facility name (3+ words or has typical facility name patterns)
words = cand.split()
if len(cand) > 10 and len(words) >= 2:
names.add(cand)
return list(names)
def _evaluate_criteria_extraction(user_msg: str, expected_criteria: dict) -> Dict[str, Any]:
"""Evaluate how well criteria extraction works by comparing extracted vs expected."""
from src.chat import _extract_criteria
extracted = _extract_criteria(user_msg)
# Calculate accuracy for each key
accuracy = {}
for key in set(expected_criteria.keys()) | set(extracted.keys()):
exp = expected_criteria.get(key)
ext = extracted.get(key)
if exp == ext:
accuracy[key] = 1.0
elif exp is None and ext is not None:
accuracy[key] = 0.5 # Extra extraction
elif exp is not None and ext is None:
accuracy[key] = 0.0 # Missed extraction
else:
accuracy[key] = 0.3 # Partial match or wrong
overall_accuracy = sum(accuracy.values()) / len(accuracy) if accuracy else 0.0
return {
"extracted": extracted,
"expected": expected_criteria,
"accuracy": accuracy,
"overall_accuracy": overall_accuracy,
}
def _evaluate_response_quality(reply: str, scenario: dict, facilities: list) -> Dict[str, Any]:
"""Evaluate response quality using heuristics."""
scores = {}
# Relevance: Does it mention key attributes?
key_attrs = scenario.get("key_attributes", [])
relevance_score = 0
for attr in key_attrs:
if attr.lower() in reply.lower():
relevance_score += 1
scores["relevance"] = relevance_score / len(key_attrs) if key_attrs else 1.0
# Completeness: Does it provide contact info for facilities?
has_phone = "phone" in reply.lower() or any(")" in f.get("phone", "") for f in facilities if f.get("phone"))
has_address = "address" in reply.lower() or any(f.get("address") for f in facilities if f.get("address"))
scores["completeness"] = (has_phone + has_address) / 2.0
# Helpfulness: Length and structure
word_count = len(reply.split())
scores["helpfulness"] = min(1.0, word_count / 100) # Reward detailed but not too long
# Flow adherence
expected_flow = scenario.get("expected_flow", "")
if expected_flow == "clarify" and ("what" in reply.lower() or "tell me" in reply.lower()):
scores["flow"] = 1.0
elif expected_flow == "results" and any(str(i) + "." in reply for i in range(1, 6)):
scores["flow"] = 1.0
elif expected_flow == "followup" and ("yes" in reply.lower() or "here are" in reply.lower()):
scores["flow"] = 1.0
elif expected_flow == "closing" and ("contact" in reply.lower() or "phone" in reply.lower()):
scores["flow"] = 1.0
else:
scores["flow"] = 0.5
overall = sum(scores.values()) / len(scores)
return {"scores": scores, "overall": overall}
def run_comprehensive_eval():
    """Evaluate criteria extraction and facility search for every scenario.

    For each entry in ``SCENARIOS`` the user message is scored for criteria
    extraction, the expected criteria are run through ``search`` (at most
    five hits), and the hits are checked against those criteria.

    Returns:
        One dict per scenario holding the extraction evaluation, a summary
        of the search results, and the mean of both scores.
    """
    facilities_df = load_facilities()
    report = []
    for scenario in SCENARIOS:
        label = scenario["description"]
        expected = scenario["criteria"]
        message = scenario["user_msg"]

        # Criteria extraction evaluation
        extraction = _evaluate_criteria_extraction(message, expected)

        # Search evaluation
        hits = search(expected, df=facilities_df, limit=5)
        hit_names = [hit.get("facility_name", "") for hit in hits if hit.get("facility_name")]
        every_hit_matches = all(_facility_matches_criteria(hit, expected) for hit in hits)
        enough_hits = len(hits) >= scenario.get("expected_facilities_min", 0)

        # Each boolean contributes half of the search score.
        search_score = (every_hit_matches + enough_hits) / 2.0

        search_summary = {
            "facilities_returned": "; ".join(hit_names) if hit_names else "(none)",
            "count": len(hits),
            "all_match": every_hit_matches,
            "has_min_facilities": enough_hits,
            "score": search_score,
        }
        report.append({
            "scenario": label,
            "criteria_extraction": extraction,
            "search_results": search_summary,
            "overall_score": (extraction["overall_accuracy"] + search_score) / 2.0,
        })
    return report
def _phone_key(phone) -> str:
    """Reduce a phone number to its last ten digits so formats compare equal."""
    return re.sub(r"\D", "", str(phone))[-10:]


def run_chatbot_eval(with_chatbot: bool):
    """Run each single-turn scenario through the chatbot and score the reply.

    Every scenario starts from a fresh conversation state. For each reply
    this records the response time, whether any facility name or phone
    number it mentions is absent from the dataset, and the heuristic
    response-quality scores.

    Args:
        with_chatbot: When False nothing is run and ``[]`` is returned.

    Returns:
        One result dict per scenario. In ``hallucination`` each flag is True
        when NO hallucination of that kind was detected.
    """
    if not with_chatbot:
        return []

    from src.chat import Chatbot

    names_ok, phones_ok = _all_facility_names_and_phones()
    known_phone_keys = {key for key in map(_phone_key, phones_ok) if key}
    # Matches only fully numeric "(ddd) ddd-dddd" numbers, so placeholders
    # such as "(XXX) XXX-XXXX" are never flagged.
    phone_pattern = re.compile(r"\(\d{3}\)\s*\d{3}-\d{4}")

    chatbot = Chatbot()
    results = []
    for scenario in SCENARIOS:
        desc = scenario["description"]
        user_msg = scenario["user_msg"]

        # Get chatbot response; perf_counter is monotonic, so the measured
        # duration is unaffected by system clock adjustments.
        started = time.perf_counter()
        reply, state = chatbot.get_response(
            user_msg, [], {"criteria": {}, "last_results": [], "last_facility_detail": None}
        )
        response_time = time.perf_counter() - started

        # A mentioned name is accepted when it equals, contains, or is
        # contained in a dataset name.
        hallucinated = False
        for name in _extract_facility_names_from_text(reply):
            name_lower = name.lower()
            if name_lower in names_ok:
                continue
            if any(name_lower in db or db in name_lower for db in names_ok):
                continue
            hallucinated = True
            break

        # A mentioned phone number must exist in the dataset.
        # NOTE(review): a legitimate number that is not a facility phone
        # (e.g. a national helpline) would be flagged here -- add an
        # allowlist if the chatbot is expected to quote such numbers.
        phone_hallucinated = any(
            _phone_key(phone) not in known_phone_keys
            for phone in phone_pattern.findall(reply)
        )

        # Response quality
        facilities = state.get("last_results", [])
        quality_eval = _evaluate_response_quality(reply, scenario, facilities)

        results.append({
            "scenario": desc,
            "response_time": response_time,
            "hallucination": {
                "facility_names": not hallucinated,
                "phones": not phone_hallucinated,
                "overall": not (hallucinated or phone_hallucinated),
            },
            "response_quality": quality_eval,
            "reply_length": len(reply.split()),
        })
    return results
def run_multi_turn_eval(with_chatbot: bool):
    """Evaluate multi-turn conversations.

    Each conversation in ``MULTI_TURN_SCENARIOS`` is replayed turn by turn,
    carrying the history and state forward, and every reply is scored with
    the response-quality heuristics.

    Args:
        with_chatbot: When False nothing is run and ``[]`` is returned.

    Returns:
        One dict per conversation with the per-turn results (replies
        truncated to 200 characters for display), the mean turn quality,
        the fraction of ``key_checks`` found in the replies, and the mean
        of those two as ``conversation_score``.
    """
    if not with_chatbot:
        return []

    from src.chat import Chatbot

    chatbot = Chatbot()
    results = []
    for scenario in MULTI_TURN_SCENARIOS:
        desc = scenario["description"]
        key_checks = scenario["key_checks"]
        history = []
        state = {"criteria": {}, "last_results": [], "last_facility_detail": None}
        turn_results = []
        full_replies = []
        for turn_number, turn in enumerate(scenario["turns"], start=1):
            user_msg = turn["user"]
            reply, state = chatbot.get_response(user_msg, history, state)
            full_replies.append(reply.lower())

            # Evaluate this turn
            quality_eval = _evaluate_response_quality(
                reply,
                {"expected_flow": turn["expected_flow"], "key_attributes": key_checks},
                state.get("last_results", []),
            )
            turn_results.append({
                "turn": turn_number,
                "user": user_msg,
                "reply": reply[:200] + "..." if len(reply) > 200 else reply,
                "quality": quality_eval,
            })
            history.append([user_msg, reply])

        # Overall conversation score
        if turn_results:
            avg_quality = sum(t["quality"]["overall"] for t in turn_results) / len(turn_results)
        else:
            avg_quality = 0.0
        if key_checks:
            # Search the complete replies: the stored "reply" is cut to 200
            # characters for display, so terms appearing later would be missed.
            covered = sum(
                1 for check in key_checks
                if any(check.lower() in text for text in full_replies)
            )
            key_coverage = covered / len(key_checks)
        else:
            key_coverage = 1.0  # nothing to look for

        results.append({
            "scenario": desc,
            "turns": turn_results,
            "overall_quality": avg_quality,
            "key_coverage": key_coverage,
            "conversation_score": (avg_quality + key_coverage) / 2.0,
        })
    return results
def _mean(values, default=0):
    """Return the arithmetic mean of *values*, or *default* when there are none."""
    values = list(values)
    return sum(values) / len(values) if values else default


def _write_csv_report(search_results, chatbot_results, with_chatbot):
    """Write one CSV row per scenario to stdout."""
    import csv

    writer = csv.writer(sys.stdout)
    writer.writerow(["Scenario", "Search Score", "Criteria Accuracy", "Hallucination", "Response Quality", "Response Time"])
    for i, sr in enumerate(search_results):
        row = [
            sr["scenario"],
            f"{sr['overall_score']:.2f}",
            f"{sr['criteria_extraction']['overall_accuracy']:.2f}",
        ]
        if with_chatbot and i < len(chatbot_results):
            cr = chatbot_results[i]
            row.extend([
                # hallucination["overall"] is True when nothing was
                # hallucinated, so the column reads "N" in that case --
                # the same convention as the table's "Halluc?" column.
                "N" if cr["hallucination"]["overall"] else "Y",
                f"{cr['response_quality']['overall']:.2f}",
                f"{cr['response_time']:.2f}",
            ])
        else:
            row.extend(["N/A", "N/A", "N/A"])
        writer.writerow(row)


def _print_table_report(search_results, chatbot_results, multi_turn_results, summary, with_chatbot, multi_turn):
    """Print the human-readable tables, summary scores and recommendations."""
    avg_search_score = summary["average_search_score"]
    avg_hallucination = summary["average_hallucination_score"]
    avg_quality = summary["average_response_quality"]
    avg_response_time = summary["average_response_time"]
    avg_conv_score = summary["average_conversation_score"]

    print(f"\n{'='*80}")
    print("COMPREHENSIVE CHATBOT EVALUATION RESULTS")
    print(f"{'='*80}")
    print(f"\nSEARCH & CRITERIA EXTRACTION ({len(search_results)} scenarios):")
    print(f"{'Scenario':<35} {'Search':<8} {'Criteria':<10} {'Overall':<8}")
    print("-" * 61)
    for r in search_results:
        print(f"{r['scenario']:<35} {r['search_results']['score']:<8.2f} {r['criteria_extraction']['overall_accuracy']:<10.2f} {r['overall_score']:<8.2f}")

    if with_chatbot:
        print(f"\nCHATBOT RESPONSE EVALUATION ({len(chatbot_results)} scenarios):")
        print(f"{'Scenario':<35} {'Quality':<8} {'Halluc?':<8} {'Time(s)':<8}")
        print("-" * 59)
        for r in chatbot_results:
            hall = "N" if r["hallucination"]["overall"] else "Y"
            print(f"{r['scenario']:<35} {r['response_quality']['overall']:<8.2f} {hall:<8} {r['response_time']:<8.2f}")

    if multi_turn:
        print("\nMULTI-TURN CONVERSATION EVALUATION:")
        for r in multi_turn_results:
            print(f" {r['scenario']}: Quality={r['overall_quality']:.2f}, Key Coverage={r['key_coverage']:.2f}, Overall={r['conversation_score']:.2f}")

    print(f"\n{'='*80}")
    print("SUMMARY SCORES (0.0-1.0 scale, higher is better):")
    print(f" Average Search & Criteria Score: {avg_search_score:.3f}")
    if avg_hallucination is not None:
        print(f" Average Hallucination Score: {avg_hallucination:.3f} (1.0 = no hallucinations)")
    if avg_quality is not None:
        print(f" Average Response Quality: {avg_quality:.3f}")
    if avg_response_time is not None:
        print(f" Average Response Time: {avg_response_time:.2f} seconds")
    if avg_conv_score is not None:
        print(f" Average Multi-turn Score: {avg_conv_score:.3f}")

    print("\nRECOMMENDATIONS:")
    if avg_search_score < 0.8:
        print(" - Improve criteria extraction accuracy and search result relevance.")
    if avg_hallucination is not None and avg_hallucination < 0.9:
        print(" - Address hallucination issues in chatbot responses.")
    if avg_quality is not None and avg_quality < 0.7:
        print(" - Enhance response quality: ensure relevance, completeness, and proper flow.")
    if avg_response_time is not None and avg_response_time > 5.0:
        print(" - Optimize response time (consider smaller models or caching).")
    print(f"{'='*80}")


def main():
    """Command-line entry point: run the evaluations and print a report.

    The search/criteria evaluation always runs. ``--with-chatbot`` adds the
    chatbot response evaluation, and ``--multi-turn`` (effective only
    together with ``--with-chatbot``) adds the conversation evaluation.
    The report goes to stdout in the format chosen with ``--format``;
    progress and notices go to stderr so JSON/CSV output stays parseable.
    """
    ap = argparse.ArgumentParser(description="Comprehensive evaluation of SAMHSA chatbot: criteria extraction, search relevance, response quality, hallucinations, and multi-turn conversations.")
    ap.add_argument("--with-chatbot", action="store_true", help="Run chatbot evaluation (requires API and may take longer).")
    ap.add_argument("--format", choices=["table", "json", "csv"], default="table", help="Output format.")
    ap.add_argument("--multi-turn", action="store_true", help="Include multi-turn conversation evaluation.")
    args = ap.parse_args()

    # The multi-turn evaluation needs the chatbot; without it there is
    # nothing to report, so no score (rather than a bogus 0.0) is shown.
    multi_turn = args.with_chatbot and args.multi_turn

    print("Running comprehensive evaluation...", file=sys.stderr)
    if args.multi_turn and not args.with_chatbot:
        print("Note: --multi-turn has no effect without --with-chatbot; skipping multi-turn evaluation.", file=sys.stderr)

    # Run evaluations
    search_results = run_comprehensive_eval()
    chatbot_results = run_chatbot_eval(args.with_chatbot)
    multi_turn_results = run_multi_turn_eval(multi_turn)

    # Aggregate scores
    avg_search_score = _mean(r["overall_score"] for r in search_results)
    if args.with_chatbot:
        avg_hallucination = _mean(1.0 if r["hallucination"]["overall"] else 0.0 for r in chatbot_results)
        avg_quality = _mean(r["response_quality"]["overall"] for r in chatbot_results)
        avg_response_time = _mean(r["response_time"] for r in chatbot_results)
    else:
        avg_hallucination = avg_quality = avg_response_time = None
    avg_conv_score = _mean(r["conversation_score"] for r in multi_turn_results) if multi_turn else None

    summary = {
        "average_search_score": avg_search_score,
        "average_hallucination_score": avg_hallucination,
        "average_response_quality": avg_quality,
        "average_response_time": avg_response_time,
        "average_conversation_score": avg_conv_score,
    }

    if args.format == "json":
        output = {
            "search_evaluation": search_results,
            "chatbot_evaluation": chatbot_results if args.with_chatbot else None,
            "multi_turn_evaluation": multi_turn_results if multi_turn else None,
            "summary": summary,
        }
        print(json.dumps(output, indent=2))
        return

    if args.format == "csv":
        _write_csv_report(search_results, chatbot_results, args.with_chatbot)
        return

    # Table format
    _print_table_report(search_results, chatbot_results, multi_turn_results, summary, args.with_chatbot, multi_turn)


if __name__ == "__main__":
    main()