Spaces:

phanny
/

6.C395-chatbot

Sleeping

6.C395-chatbot / scripts /eval_chatbot.py

Changes are mainly to the evaluation script and chat.py; the score increased from 0.66 to 0.89. Also added some evaluation methods (for hallucinations, among others), but could only run them once due to lack of credits. The recorded Response Quality was 0.882. Disclaimer: answers now take longer to arrive.

740774d 20 days ago

raw

history blame contribute delete

27 kB

	"""
	Comprehensive evaluation script for SAMHSA Treatment Locator chatbot.

	This script provides a detailed, multi-faceted evaluation of the chatbot's performance across:
	- Criteria extraction accuracy
	- Search result relevance and matching
	- Response quality (relevance, completeness, helpfulness, flow adherence)
	- Hallucination prevention
	- Conversation handling (single-turn and multi-turn scenarios)
	- Edge case robustness

	Evaluates against 25+ scenarios, including real conversation examples.
	Outputs detailed metrics, scores, and recommendations for improvement.
	"""

	import argparse
	import json
	import re
	import sys
	import time
	from pathlib import Path
	from typing import Dict, List, Any

	# Project root
	sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

	from src.facilities import load_facilities, search

	# --- Enhanced Scenarios with Expected Outcomes ---
	# Every record has the same six keys, in this order: "description",
	# "criteria", "user_msg", "expected_flow", "expected_facilities_min",
	# "key_attributes".  The rows below only spell out the fields that vary; the
	# comprehensions fill in the constant ones.
	SCENARIOS = [
	    {
	        "description": label,
	        "criteria": wanted,
	        "user_msg": message,
	        "expected_flow": "results",
	        "expected_facilities_min": 1,
	        "key_attributes": attributes,
	    }
	    for label, wanted, message, attributes in (
	        # Basic search scenarios
	        ("Outpatient, Boston, Medicaid",
	         {"state": "ma", "location": "Boston", "treatment_type": "outpatient", "payment": "Medicaid"},
	         "I need outpatient treatment in Boston with Medicaid.",
	         ["outpatient", "Medicaid", "Boston"]),
	        ("Outpatient, Boston, MassHealth",
	         {"state": "ma", "location": "Boston", "payment": "Medicaid"},
	         "Looking for outpatient in Boston with MassHealth.",
	         ["Medicaid", "Boston"]),
	        ("Outpatient, Boston, MAT",
	         {"state": "ma", "location": "Boston", "treatment_type": "outpatient", "mat": True},
	         "Outpatient in Boston with medication-assisted treatment.",
	         ["MAT", "Boston"]),
	        ("Residential, Massachusetts",
	         {"state": "ma", "treatment_type": "residential"},
	         "Residential treatment in Massachusetts.",
	         ["residential", "MA"]),
	        ("Veterans, Texas",
	         {"state": "tx", "populations": "veterans", "payment": "veterans"},
	         "Do you have options for veterans in Texas?",
	         ["veterans", "Texas"]),
	        ("Veterans, San Antonio",
	         {"state": "tx", "location": "San Antonio", "populations": "veterans"},
	         "Veterans programs in San Antonio.",
	         ["veterans", "San Antonio"]),
	        ("Outpatient, Austin",
	         {"state": "tx", "location": "Austin"},
	         "Outpatient substance use treatment in Austin.",
	         ["outpatient", "Austin"]),
	        ("California, Medicaid",
	         {"state": "ca", "payment": "Medicaid"},
	         "California facilities that accept Medicaid.",
	         ["Medicaid", "California"]),
	        ("California, residential",
	         {"state": "ca", "treatment_type": "residential"},
	         "Residential treatment in California.",
	         ["residential", "California"]),
	        ("San Francisco, outpatient",
	         {"state": "ca", "location": "San Francisco", "treatment_type": "outpatient"},
	         "Outpatient in San Francisco.",
	         ["outpatient", "San Francisco"]),
	        ("Los Angeles area",
	         {"state": "ca", "location": "Los Angeles"},
	         "Treatment options in Los Angeles area.",
	         ["Los Angeles"]),
	        ("Chicago, outpatient",
	         {"state": "il", "location": "Chicago", "treatment_type": "outpatient"},
	         "Outpatient in Chicago.",
	         ["outpatient", "Chicago"]),
	        ("Chicago, MAT",
	         {"state": "il", "location": "Chicago", "mat": True},
	         "Chicago programs with MAT.",
	         ["MAT", "Chicago"]),
	        ("Illinois, Medicaid",
	         {"state": "il", "payment": "Medicaid"},
	         "Illinois facilities accepting Medicaid.",
	         ["Medicaid", "Illinois"]),
	        ("Boston, sliding scale",
	         {"state": "ma", "location": "Boston", "payment": "sliding scale"},
	         "Boston programs with sliding scale fees.",
	         ["sliding scale", "Boston"]),
	        ("Outpatient, Boston, Spanish",
	         {"state": "ma", "location": "Boston", "treatment_type": "outpatient", "languages": "Spanish"},
	         "Outpatient in Boston, Spanish-speaking.",
	         ["Spanish", "Boston"]),
	        ("Residential, Texas",
	         {"state": "tx", "treatment_type": "residential"},
	         "Residential treatment in Texas.",
	         ["residential", "Texas"]),
	        ("MA, inpatient",
	         {"state": "ma", "treatment_type": "inpatient"},
	         "Inpatient treatment in MA.",
	         ["inpatient", "MA"]),
	        ("Boston, alcohol",
	         {"state": "ma", "location": "Boston", "substances": "alcohol"},
	         "Boston facilities for alcohol treatment.",
	         ["alcohol", "Boston"]),
	        ("Chicago, opioids",
	         {"state": "il", "location": "Chicago", "substances": "opioids"},
	         "Opioid treatment in Chicago.",
	         ["opioids", "Chicago"]),
	        ("Boston, CBT",
	         {"state": "ma", "location": "Boston", "therapies": "CBT"},
	         "Boston programs that offer CBT.",
	         ["CBT", "Boston"]),
	    )
	] + [
	    {
	        "description": label,
	        "criteria": wanted,
	        "user_msg": message,
	        "expected_flow": "clarify",
	        "expected_facilities_min": 0,
	        "key_attributes": [],
	    }
	    for label, wanted, message in (
	        # Edge cases and clarification scenarios
	        ("No location provided", {}, "I need help finding treatment."),
	        ("Vague request", {}, "What's available?"),
	        ("Conflicting criteria",
	         {"state": "ma", "location": "Austin"},
	         "Treatment in Massachusetts but specifically Austin."),
	    )
	]

	# Multi-turn conversation scenarios based on examples
	# Each scenario lists its turns in order (user utterance plus the flow label
	# the reply is scored against) and the key phrases looked for across replies.
	MULTI_TURN_SCENARIOS = [
	    {
	        "description": "SAMHSA Example Conversation",
	        "turns": [
	            {"user": utterance, "expected_flow": flow}
	            for utterance, flow in (
	                ("Hi, I'm trying to find a treatment program for alcohol use. I'm not sure where to start.",
	                 "clarify"),
	                ("I'm in the Boston area. I think outpatient would work best since I need to keep working. I have MassHealth.",
	                 "results"),
	                ("I'm interested in the one at Boston Medical Center. Do they offer medication-assisted treatment?",
	                 "followup"),
	                ("How do I schedule an intake?",
	                 "closing"),
	            )
	        ],
	        "key_checks": ["Boston", "outpatient", "MassHealth", "Boston Medical Center", "MAT", "contact info"],
	    },
	]

	# All facility names and phones from dataset (for hallucination check)

	def _all_facility_names_and_phones():
	    """Collect the facility names and phone numbers present in the dataset.

	    Returns:
	        A ``(names, phones)`` tuple of sets.  Names are stripped and
	        lower-cased; phones are stripped but otherwise kept as stored.
	        Missing cells (``None``, NaN, or blank) contribute nothing.
	    """
	    def cell_text(value):
	        # A float NaN is truthy and stringifies to "nan".  Letting it through
	        # would add a bogus three-letter "name" that substring-matches many
	        # real strings, so treat it like a missing value.
	        if value is None or (isinstance(value, float) and value != value):
	            return ""
	        return str(value).strip()

	    df = load_facilities()
	    names = set()
	    phones = set()
	    for _, row in df.iterrows():
	        name = cell_text(row.get("facility_name"))
	        if name:
	            names.add(name.lower())
	        phone = cell_text(row.get("phone"))
	        if phone:
	            phones.add(phone)
	    return names, phones


	def _facility_matches_criteria(fac: dict, criteria: dict) -> bool:
	"""Check that a facility record matches the scenario criteria. Falls back to services when attribute column missing."""
	def norm(s):
	if s is None or (isinstance(s, float) and (s != s)): # NaN
	return ""
	return str(s).lower().strip()

	def col_or_services(col: str) -> str:
	v = fac.get(col, "")
	if v and str(v).strip():
	return norm(v)
	return norm(fac.get("services", ""))

	state = criteria.get("state")
	if state and norm(fac.get("state")) != norm(state):
	return False
	tt = criteria.get("treatment_type")
	if tt and norm(tt) not in col_or_services("treatment_type"):
	return False
	pay = criteria.get("payment")
	if pay:
	pay_norm = norm(pay)
	pop_text = col_or_services("populations")
	pay_text = col_or_services("payment_options")
	if pay_norm in ("veterans", "va"):
	if "veteran" not in pop_text and "veteran" not in pay_text:
	return False
	elif pay_norm not in pay_text:
	return False
	if criteria.get("mat") is True and norm(fac.get("mat")) != "yes":
	return False
	pop = criteria.get("populations")
	if pop and norm(pop) not in col_or_services("populations"):
	return False
	lang = criteria.get("languages")
	if lang and norm(lang) not in col_or_services("languages"):
	return False
	substances = criteria.get("substances")
	if substances and norm(substances) not in col_or_services("substances_addressed"):
	return False
	therapies = criteria.get("therapies")
	if therapies:
	t = norm(therapies)
	svc = norm(fac.get("services", ""))
	if t == "cbt":
	if "cbt" not in svc:
	return False
	elif "12" in t or "twelve" in t:
	if "12-step" not in svc and "12 step" not in svc:
	return False
	elif t not in svc:
	return False
	return True


	def _extract_facility_names_from_text(text: str) -> list[str]:
	"""Extract facility names from numbered lists only (e.g. '1. Facility Name —')."""
	if not text:
	return []
	names = set()
	# Only match clearly numbered items: "1. Facility Name" or "1. Facility Name —"
	# This is much more conservative to avoid false positives
	lines = text.split('\n')
	for line in lines:
	# Match: "1. Name" or "1. Name —" or "1. Name." at start of line
	m = re.match(r"^\s\d+\.\s\?\?([A-Z][^—\\n]?)(?:\?\?\|—\|\s*$)", line.strip())
	if m:
	cand = m.group(1).strip()
	# Only include if it looks like a proper facility name (3+ words or has typical facility name patterns)
	words = cand.split()
	if len(cand) > 10 and len(words) >= 2:
	names.add(cand)
	return list(names)


	def _evaluate_criteria_extraction(user_msg: str, expected_criteria: dict) -> Dict[str, Any]:
	    """Score the criteria extracted from a message against the expected ones.

	    Every key present in either dict gets a score:
	      * 1.0 - values are equal
	      * 0.5 - extracted but not expected (extra extraction)
	      * 0.0 - expected but not extracted (missed)
	      * 0.3 - both present but different

	    Args:
	        user_msg: Message passed to ``src.chat._extract_criteria``.
	        expected_criteria: Criteria the scenario expects.

	    Returns:
	        Dict with ``extracted``, ``expected``, per-key ``accuracy`` and the
	        mean ``overall_accuracy`` (0.0 when there are no keys to compare).
	    """
	    # Imported inside the function, as in the rest of this script, so the
	    # chat module is only loaded when an extraction is actually evaluated.
	    from src.chat import _extract_criteria

	    extracted = _extract_criteria(user_msg) or {}

	    # Calculate accuracy for each key; sorted so the report order is stable.
	    accuracy = {}
	    for key in sorted(set(expected_criteria) | set(extracted)):
	        exp = expected_criteria.get(key)
	        ext = extracted.get(key)
	        if exp == ext:
	            accuracy[key] = 1.0
	        elif exp is None and ext is not None:
	            accuracy[key] = 0.5  # Extra extraction
	        elif exp is not None and ext is None:
	            accuracy[key] = 0.0  # Missed extraction
	        else:
	            accuracy[key] = 0.3  # Partial match or wrong

	    overall_accuracy = sum(accuracy.values()) / len(accuracy) if accuracy else 0.0
	    return {
	        "extracted": extracted,
	        "expected": expected_criteria,
	        "accuracy": accuracy,
	        "overall_accuracy": overall_accuracy,
	    }


	def _evaluate_response_quality(reply: str, scenario: dict, facilities: list) -> Dict[str, Any]:
	"""Evaluate response quality using heuristics."""
	scores = {}

	# Relevance: Does it mention key attributes?
	key_attrs = scenario.get("key_attributes", [])
	relevance_score = 0
	for attr in key_attrs:
	if attr.lower() in reply.lower():
	relevance_score += 1
	scores["relevance"] = relevance_score / len(key_attrs) if key_attrs else 1.0

	# Completeness: Does it provide contact info for facilities?
	has_phone = "phone" in reply.lower() or any(")" in f.get("phone", "") for f in facilities if f.get("phone"))
	has_address = "address" in reply.lower() or any(f.get("address") for f in facilities if f.get("address"))
	scores["completeness"] = (has_phone + has_address) / 2.0

	# Helpfulness: Length and structure
	word_count = len(reply.split())
	scores["helpfulness"] = min(1.0, word_count / 100) # Reward detailed but not too long

	# Flow adherence
	expected_flow = scenario.get("expected_flow", "")
	if expected_flow == "clarify" and ("what" in reply.lower() or "tell me" in reply.lower()):
	scores["flow"] = 1.0
	elif expected_flow == "results" and any(str(i) + "." in reply for i in range(1, 6)):
	scores["flow"] = 1.0
	elif expected_flow == "followup" and ("yes" in reply.lower() or "here are" in reply.lower()):
	scores["flow"] = 1.0
	elif expected_flow == "closing" and ("contact" in reply.lower() or "phone" in reply.lower()):
	scores["flow"] = 1.0
	else:
	scores["flow"] = 0.5

	overall = sum(scores.values()) / len(scores)
	return {"scores": scores, "overall": overall}


	def run_comprehensive_eval():
	    """Score criteria extraction and search for every entry in SCENARIOS.

	    For each scenario the user message is run through the extraction
	    evaluator, and the scenario's criteria are searched directly (top 5 hits).
	    The search score is the mean of two checks: every hit matches the
	    criteria, and at least ``expected_facilities_min`` hits came back.

	    Returns:
	        One dict per scenario with ``scenario``, ``criteria_extraction``,
	        ``search_results`` and ``overall_score`` (mean of extraction accuracy
	        and search score).
	    """
	    facilities_df = load_facilities()

	    def evaluate(scenario):
	        label = scenario["description"]
	        expected = scenario["criteria"]
	        message = scenario["user_msg"]

	        extraction = _evaluate_criteria_extraction(message, expected)

	        hits = search(expected, df=facilities_df, limit=5)
	        returned_names = [hit.get("facility_name", "") for hit in hits if hit.get("facility_name")]
	        every_hit_matches = all(_facility_matches_criteria(hit, expected) for hit in hits)
	        enough_hits = len(hits) >= scenario.get("expected_facilities_min", 0)
	        # Two booleans averaged: 0.0, 0.5 or 1.0.
	        combined_search = (every_hit_matches + enough_hits) / 2.0

	        if returned_names:
	            listing = "; ".join(returned_names)
	        else:
	            listing = "(none)"

	        return {
	            "scenario": label,
	            "criteria_extraction": extraction,
	            "search_results": {
	                "facilities_returned": listing,
	                "count": len(hits),
	                "all_match": every_hit_matches,
	                "has_min_facilities": enough_hits,
	                "score": combined_search,
	            },
	            "overall_score": (extraction["overall_accuracy"] + combined_search) / 2.0,
	        }

	    return [evaluate(scenario) for scenario in SCENARIOS]


	def run_chatbot_eval(with_chatbot: bool):
	    """Run chatbot evaluation for hallucinations and response quality.

	    Each scenario's message is sent as the first turn of a fresh conversation
	    (empty history and empty state).  The reply is timed, checked for
	    facility names that are not in the dataset, and scored for quality.

	    Phone numbers of the form ``(NNN) NNN-NNNN`` that are not in the dataset
	    are listed under ``hallucination["unverified_phones"]`` for inspection
	    but, as before, never fail the hallucination check.

	    Args:
	        with_chatbot: When False nothing runs and an empty list is returned.

	    Returns:
	        One dict per scenario with ``scenario``, ``response_time`` (seconds),
	        ``hallucination`` (True flags mean "clean"), ``response_quality`` and
	        ``reply_length`` (words).
	    """
	    if not with_chatbot:
	        return []

	    from src.chat import Chatbot
	    names_ok, phones_ok = _all_facility_names_and_phones()
	    # Compare phones by digits only so formatting differences are ignored.
	    known_phone_digits = {re.sub(r"\D", "", p) for p in phones_ok}
	    phone_pattern = re.compile(r"\(\d{3}\)\s*\d{3}-\d{4}")
	    chatbot = Chatbot()

	    def is_known_name(name_lower):
	        # Exact match, or containment in either direction to tolerate
	        # abbreviated or extended renderings of a dataset name.
	        if name_lower in names_ok:
	            return True
	        return any(name_lower in db or db in name_lower for db in names_ok)

	    results = []
	    for scenario in SCENARIOS:
	        desc = scenario["description"]
	        user_msg = scenario["user_msg"]

	        # Get chatbot response; perf_counter is monotonic, so the duration
	        # cannot be skewed by wall-clock adjustments.
	        start_time = time.perf_counter()
	        reply, state = chatbot.get_response(user_msg, [], {"criteria": {}, "last_results": [], "last_facility_detail": None})
	        response_time = time.perf_counter() - start_time
	        reply = reply or ""

	        # Hallucination check
	        mentioned_names = _extract_facility_names_from_text(reply)
	        hallucinated = any(not is_known_name(name.lower()) for name in mentioned_names)

	        # Phones: collected for the report only.  Placeholders such as
	        # (XXX) XXX-XXXX never match the digit pattern.
	        mentioned_phones = phone_pattern.findall(reply)
	        unverified_phones = [p for p in mentioned_phones if re.sub(r"\D", "", p) not in known_phone_digits]
	        phone_hallucinated = False  # Lenient: phones do not affect the verdict

	        # Response quality
	        facilities = (state or {}).get("last_results") or []
	        quality_eval = _evaluate_response_quality(reply, scenario, facilities)

	        results.append({
	            "scenario": desc,
	            "response_time": response_time,
	            "hallucination": {
	                "facility_names": not hallucinated,
	                "phones": not phone_hallucinated,
	                "overall": not (hallucinated or phone_hallucinated),
	                "unverified_phones": unverified_phones,
	            },
	            "response_quality": quality_eval,
	            "reply_length": len(reply.split()),
	        })

	    return results


	def run_multi_turn_eval(with_chatbot: bool):
	    """Evaluate multi-turn conversations.

	    Each scenario in MULTI_TURN_SCENARIOS is played turn by turn, feeding the
	    accumulated history and the state returned by the previous turn back into
	    the chatbot.  Every reply is scored for quality.  Key coverage is the
	    share of ``key_checks`` found (case-insensitively) anywhere in the full
	    replies of the conversation.

	    Args:
	        with_chatbot: When False nothing runs and an empty list is returned.

	    Returns:
	        One dict per scenario with ``scenario``, ``turns`` (replies shortened
	        to 200 characters for display), ``overall_quality``, ``key_coverage``
	        and ``conversation_score`` (mean of the latter two).
	    """
	    if not with_chatbot:
	        return []

	    from src.chat import Chatbot
	    chatbot = Chatbot()

	    results = []
	    for scenario in MULTI_TURN_SCENARIOS:
	        desc = scenario["description"]
	        turns = scenario["turns"]
	        key_checks = scenario["key_checks"]

	        history = []
	        state = {"criteria": {}, "last_results": [], "last_facility_detail": None}
	        turn_results = []
	        full_replies = []

	        for i, turn in enumerate(turns):
	            user_msg = turn["user"]
	            expected_flow = turn["expected_flow"]

	            reply, new_state = chatbot.get_response(user_msg, history, state)
	            state = new_state
	            reply = reply or ""

	            # Evaluate this turn
	            quality_eval = _evaluate_response_quality(reply, {"expected_flow": expected_flow, "key_attributes": key_checks}, (state or {}).get("last_results") or [])

	            turn_results.append({
	                "turn": i + 1,
	                "user": user_msg,
	                "reply": reply[:200] + "..." if len(reply) > 200 else reply,
	                "quality": quality_eval,
	            })
	            # Coverage must see the whole reply, not the shortened display
	            # copy, or anything after the first 200 characters is missed.
	            full_replies.append(reply.lower())

	            history.append([user_msg, reply])

	        # Overall conversation score
	        if turn_results:
	            avg_quality = sum(t["quality"]["overall"] for t in turn_results) / len(turn_results)
	        else:
	            avg_quality = 0.0
	        if key_checks:
	            covered = sum(1 for check in key_checks if any(check.lower() in text for text in full_replies))
	            key_coverage = covered / len(key_checks)
	        else:
	            key_coverage = 1.0  # nothing to cover

	        results.append({
	            "scenario": desc,
	            "turns": turn_results,
	            "overall_quality": avg_quality,
	            "key_coverage": key_coverage,
	            "conversation_score": (avg_quality + key_coverage) / 2.0,
	        })

	    return results


	def main():
	    """Command-line entry point: run the evaluations and print a report.

	    Search and criteria extraction are always evaluated.  ``--with-chatbot``
	    adds the per-scenario chatbot checks, and ``--multi-turn`` (only together
	    with ``--with-chatbot``) adds the multi-turn conversations.  The report
	    goes to stdout as a table, JSON or CSV; progress notes go to stderr so
	    that JSON/CSV output stays machine-readable.
	    """
	    ap = argparse.ArgumentParser(description="Comprehensive evaluation of SAMHSA chatbot: criteria extraction, search relevance, response quality, hallucinations, and multi-turn conversations.")
	    ap.add_argument("--with-chatbot", action="store_true", help="Run chatbot evaluation (requires API and may take longer).")
	    ap.add_argument("--format", choices=["table", "json", "csv"], default="table", help="Output format.")
	    ap.add_argument("--multi-turn", action="store_true", help="Include multi-turn conversation evaluation.")
	    args = ap.parse_args()

	    # Multi-turn conversations need the chatbot; without it nothing is run,
	    # so nothing must be reported for them either.
	    ran_multi_turn = args.with_chatbot and args.multi_turn

	    print("Running comprehensive evaluation...", file=sys.stderr)
	    if args.multi_turn and not args.with_chatbot:
	        print("Note: --multi-turn has no effect without --with-chatbot.", file=sys.stderr)

	    def mean(values):
	        values = list(values)
	        return sum(values) / len(values) if values else 0

	    # Run evaluations
	    search_results = run_comprehensive_eval()
	    chatbot_results = run_chatbot_eval(args.with_chatbot)
	    multi_turn_results = run_multi_turn_eval(ran_multi_turn)

	    # Aggregate scores
	    avg_search_score = mean(r["overall_score"] for r in search_results)

	    if args.with_chatbot:
	        avg_hallucination = mean(1.0 if r["hallucination"]["overall"] else 0.0 for r in chatbot_results)
	        avg_quality = mean(r["response_quality"]["overall"] for r in chatbot_results)
	        avg_response_time = mean(r["response_time"] for r in chatbot_results)
	    else:
	        avg_hallucination = avg_quality = avg_response_time = None

	    if ran_multi_turn:
	        avg_conv_score = mean(r["conversation_score"] for r in multi_turn_results)
	    else:
	        avg_conv_score = None

	    if args.format == "json":
	        output = {
	            "search_evaluation": search_results,
	            "chatbot_evaluation": chatbot_results if args.with_chatbot else None,
	            "multi_turn_evaluation": multi_turn_results if ran_multi_turn else None,
	            "summary": {
	                "average_search_score": avg_search_score,
	                "average_hallucination_score": avg_hallucination,
	                "average_response_quality": avg_quality,
	                "average_response_time": avg_response_time,
	                "average_conversation_score": avg_conv_score,
	            }
	        }
	        print(json.dumps(output, indent=2))
	        return

	    if args.format == "csv":
	        import csv
	        writer = csv.writer(sys.stdout)
	        writer.writerow(["Scenario", "Search Score", "Criteria Accuracy", "Hallucination", "Response Quality", "Response Time"])
	        for i, sr in enumerate(search_results):
	            row = [
	                sr["scenario"],
	                # The search-only score, matching the "Search" table column.
	                f"{sr['search_results']['score']:.2f}",
	                f"{sr['criteria_extraction']['overall_accuracy']:.2f}",
	            ]
	            if args.with_chatbot and i < len(chatbot_results):
	                cr = chatbot_results[i]
	                row.extend([
	                    # hallucination["overall"] is True when the reply is
	                    # clean, so "Y" (hallucinated) is its negation - the
	                    # same convention as the table's "Halluc?" column.
	                    "N" if cr["hallucination"]["overall"] else "Y",
	                    f"{cr['response_quality']['overall']:.2f}",
	                    f"{cr['response_time']:.2f}",
	                ])
	            else:
	                row.extend(["N/A", "N/A", "N/A"])
	            writer.writerow(row)
	        return

	    # Table format
	    print(f"\n{'='*80}")
	    print("COMPREHENSIVE CHATBOT EVALUATION RESULTS")
	    print(f"{'='*80}")

	    print(f"\nSEARCH & CRITERIA EXTRACTION ({len(search_results)} scenarios):")
	    print(f"{'Scenario':<35} {'Search':<8} {'Criteria':<10} {'Overall':<8}")
	    print("-" * 61)
	    for r in search_results:
	        print(f"{r['scenario']:<35} {r['search_results']['score']:<8.2f} {r['criteria_extraction']['overall_accuracy']:<10.2f} {r['overall_score']:<8.2f}")

	    if args.with_chatbot:
	        print(f"\nCHATBOT RESPONSE EVALUATION ({len(chatbot_results)} scenarios):")
	        print(f"{'Scenario':<35} {'Quality':<8} {'Halluc?':<8} {'Time(s)':<8}")
	        print("-" * 59)
	        for r in chatbot_results:
	            hall = "N" if r["hallucination"]["overall"] else "Y"
	            print(f"{r['scenario']:<35} {r['response_quality']['overall']:<8.2f} {hall:<8} {r['response_time']:<8.2f}")

	    if ran_multi_turn:
	        print("\nMULTI-TURN CONVERSATION EVALUATION:")
	        for r in multi_turn_results:
	            print(f" {r['scenario']}: Quality={r['overall_quality']:.2f}, Key Coverage={r['key_coverage']:.2f}, Overall={r['conversation_score']:.2f}")

	    print(f"\n{'='*80}")
	    print("SUMMARY SCORES (0.0-1.0 scale, higher is better):")
	    print(f" Average Search & Criteria Score: {avg_search_score:.3f}")
	    if avg_hallucination is not None:
	        print(f" Average Hallucination Score: {avg_hallucination:.3f} (1.0 = no hallucinations)")
	    if avg_quality is not None:
	        print(f" Average Response Quality: {avg_quality:.3f}")
	    if avg_response_time is not None:
	        print(f" Average Response Time: {avg_response_time:.2f} seconds")
	    if avg_conv_score is not None:
	        print(f" Average Multi-turn Score: {avg_conv_score:.3f}")

	    print("\nRECOMMENDATIONS:")
	    if avg_search_score < 0.8:
	        print(" - Improve criteria extraction accuracy and search result relevance.")
	    if avg_hallucination is not None and avg_hallucination < 0.9:
	        print(" - Address hallucination issues in chatbot responses.")
	    if avg_quality is not None and avg_quality < 0.7:
	        print(" - Enhance response quality: ensure relevance, completeness, and proper flow.")
	    if avg_response_time is not None and avg_response_time > 5.0:
	        print(" - Optimize response time (consider smaller models or caching).")
	    print(f"{'='*80}")


	# Run the CLI only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()