# Spaces: phanny / 6.C395-chatbot (Sleeping)
#
# File size: 26,965 Bytes

"""
Comprehensive evaluation script for SAMHSA Treatment Locator chatbot.

This script provides a detailed, multi-faceted evaluation of the chatbot's performance across:
- Criteria extraction accuracy
- Search result relevance and matching
- Response quality (relevance, completeness, helpfulness, flow adherence)
- Hallucination prevention
- Conversation handling (single-turn and multi-turn scenarios)
- Edge case robustness

Evaluates against 25+ scenarios, including real conversation examples.
Outputs detailed metrics, scores, and recommendations for improvement.
"""

import argparse
import json
import re
import sys
import time
from pathlib import Path
from typing import Dict, List, Any

# Project root
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from src.facilities import load_facilities, search

# --- Enhanced Scenarios with Expected Outcomes ---
# Single-turn evaluation scenarios. Each entry is a dict with these keys:
#   description             - label used as the scenario name in every report.
#   criteria                - expected structured criteria; compared against the output of
#                             _extract_criteria(user_msg), passed to search(), and used by
#                             _facility_matches_criteria() to validate each returned facility.
#   user_msg                - the user message fed to criteria extraction and to the chatbot.
#   expected_flow           - "results" or "clarify"; read by _evaluate_response_quality().
#   expected_facilities_min - minimum number of search results required for the scenario.
#   key_attributes          - substrings (case-insensitive) the reply should mention; drives
#                             the relevance score.
SCENARIOS = [
    # Basic search scenarios
    {
        "description": "Outpatient, Boston, Medicaid",
        "criteria": {"state": "ma", "location": "Boston", "treatment_type": "outpatient", "payment": "Medicaid"},
        "user_msg": "I need outpatient treatment in Boston with Medicaid.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["outpatient", "Medicaid", "Boston"],
    },
    # NOTE(review): the message says "outpatient" but criteria has no treatment_type, so a
    # correct extraction of treatment_type is scored as an "extra" key — confirm this is intended.
    {
        "description": "Outpatient, Boston, MassHealth",
        "criteria": {"state": "ma", "location": "Boston", "payment": "Medicaid"},
        "user_msg": "Looking for outpatient in Boston with MassHealth.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["Medicaid", "Boston"],
    },
    {
        "description": "Outpatient, Boston, MAT",
        "criteria": {"state": "ma", "location": "Boston", "treatment_type": "outpatient", "mat": True},
        "user_msg": "Outpatient in Boston with medication-assisted treatment.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["MAT", "Boston"],
    },
    {
        "description": "Residential, Massachusetts",
        "criteria": {"state": "ma", "treatment_type": "residential"},
        "user_msg": "Residential treatment in Massachusetts.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["residential", "MA"],
    },
    {
        "description": "Veterans, Texas",
        "criteria": {"state": "tx", "populations": "veterans", "payment": "veterans"},
        "user_msg": "Do you have options for veterans in Texas?",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["veterans", "Texas"],
    },
    {
        "description": "Veterans, San Antonio",
        "criteria": {"state": "tx", "location": "San Antonio", "populations": "veterans"},
        "user_msg": "Veterans programs in San Antonio.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["veterans", "San Antonio"],
    },
    # NOTE(review): as with the MassHealth scenario, "outpatient" appears in the message and in
    # key_attributes but not in criteria — confirm whether treatment_type was omitted on purpose.
    {
        "description": "Outpatient, Austin",
        "criteria": {"state": "tx", "location": "Austin"},
        "user_msg": "Outpatient substance use treatment in Austin.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["outpatient", "Austin"],
    },
    {
        "description": "California, Medicaid",
        "criteria": {"state": "ca", "payment": "Medicaid"},
        "user_msg": "California facilities that accept Medicaid.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["Medicaid", "California"],
    },
    {
        "description": "California, residential",
        "criteria": {"state": "ca", "treatment_type": "residential"},
        "user_msg": "Residential treatment in California.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["residential", "California"],
    },
    {
        "description": "San Francisco, outpatient",
        "criteria": {"state": "ca", "location": "San Francisco", "treatment_type": "outpatient"},
        "user_msg": "Outpatient in San Francisco.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["outpatient", "San Francisco"],
    },
    {
        "description": "Los Angeles area",
        "criteria": {"state": "ca", "location": "Los Angeles"},
        "user_msg": "Treatment options in Los Angeles area.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["Los Angeles"],
    },
    {
        "description": "Chicago, outpatient",
        "criteria": {"state": "il", "location": "Chicago", "treatment_type": "outpatient"},
        "user_msg": "Outpatient in Chicago.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["outpatient", "Chicago"],
    },
    {
        "description": "Chicago, MAT",
        "criteria": {"state": "il", "location": "Chicago", "mat": True},
        "user_msg": "Chicago programs with MAT.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["MAT", "Chicago"],
    },
    {
        "description": "Illinois, Medicaid",
        "criteria": {"state": "il", "payment": "Medicaid"},
        "user_msg": "Illinois facilities accepting Medicaid.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["Medicaid", "Illinois"],
    },
    {
        "description": "Boston, sliding scale",
        "criteria": {"state": "ma", "location": "Boston", "payment": "sliding scale"},
        "user_msg": "Boston programs with sliding scale fees.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["sliding scale", "Boston"],
    },
    {
        "description": "Outpatient, Boston, Spanish",
        "criteria": {"state": "ma", "location": "Boston", "treatment_type": "outpatient", "languages": "Spanish"},
        "user_msg": "Outpatient in Boston, Spanish-speaking.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["Spanish", "Boston"],
    },
    {
        "description": "Residential, Texas",
        "criteria": {"state": "tx", "treatment_type": "residential"},
        "user_msg": "Residential treatment in Texas.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["residential", "Texas"],
    },
    {
        "description": "MA, inpatient",
        "criteria": {"state": "ma", "treatment_type": "inpatient"},
        "user_msg": "Inpatient treatment in MA.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["inpatient", "MA"],
    },
    {
        "description": "Boston, alcohol",
        "criteria": {"state": "ma", "location": "Boston", "substances": "alcohol"},
        "user_msg": "Boston facilities for alcohol treatment.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["alcohol", "Boston"],
    },
    {
        "description": "Chicago, opioids",
        "criteria": {"state": "il", "location": "Chicago", "substances": "opioids"},
        "user_msg": "Opioid treatment in Chicago.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["opioids", "Chicago"],
    },
    {
        "description": "Boston, CBT",
        "criteria": {"state": "ma", "location": "Boston", "therapies": "CBT"},
        "user_msg": "Boston programs that offer CBT.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["CBT", "Boston"],
    },
    # Edge cases and clarification scenarios
    # These expect a clarifying question rather than results, so the minimum facility count is
    # 0 and no key attributes are required in the reply.
    {
        "description": "No location provided",
        "criteria": {},
        "user_msg": "I need help finding treatment.",
        "expected_flow": "clarify",
        "expected_facilities_min": 0,
        "key_attributes": [],
    },
    {
        "description": "Vague request",
        "criteria": {},
        "user_msg": "What's available?",
        "expected_flow": "clarify",
        "expected_facilities_min": 0,
        "key_attributes": [],
    },
    # State and city deliberately disagree (Austin is not in Massachusetts).
    {
        "description": "Conflicting criteria",
        "criteria": {"state": "ma", "location": "Austin"},
        "user_msg": "Treatment in Massachusetts but specifically Austin.",
        "expected_flow": "clarify",
        "expected_facilities_min": 0,
        "key_attributes": [],
    },
]

# Multi-turn conversation scenarios based on examples
# Each entry is a dict with:
#   description - label used as the scenario name in the report.
#   turns       - ordered list of {"user": <message>, "expected_flow": <flow>} dicts; the flow is
#                 one of "clarify", "results", "followup" or "closing" and is scored per turn by
#                 _evaluate_response_quality().
#   key_checks  - substrings (case-insensitive) expected somewhere in the conversation's replies;
#                 used both as the per-turn key attributes and for the key-coverage score.
MULTI_TURN_SCENARIOS = [
    {
        "description": "SAMHSA Example Conversation",
        "turns": [
            {"user": "Hi, I'm trying to find a treatment program for alcohol use. I'm not sure where to start.", "expected_flow": "clarify"},
            {"user": "I'm in the Boston area. I think outpatient would work best since I need to keep working. I have MassHealth.", "expected_flow": "results"},
            {"user": "I'm interested in the one at Boston Medical Center. Do they offer medication-assisted treatment?", "expected_flow": "followup"},
            {"user": "How do I schedule an intake?", "expected_flow": "closing"},
        ],
        # NOTE(review): these are matched literally, so "MAT" and "contact info" only count if
        # the reply uses those exact words — confirm that is the intended strictness.
        "key_checks": ["Boston", "outpatient", "MassHealth", "Boston Medical Center", "MAT", "contact info"],
    },
]

# All facility names and phones from dataset (for hallucination check)

def _all_facility_names_and_phones():
    """Collect every known facility name and phone number from the dataset.

    Used as the ground truth for the hallucination checks.

    Returns:
        A ``(names, phones)`` tuple of sets. Names are stripped and lower-cased;
        phones are stripped only. Missing cells (``None``, NaN, other falsy
        values, blank strings) are skipped.
    """
    def cell_text(value) -> str:
        # NaN is truthy and stringifies to "nan"; without this guard "nan" would be
        # recorded as a facility name and substring-match unrelated names later.
        if not value or (isinstance(value, float) and value != value):
            return ""
        return str(value).strip()

    # Assumes load_facilities() returns a pandas-style frame exposing iterrows().
    df = load_facilities()
    names = set()
    phones = set()
    for _, row in df.iterrows():
        name = cell_text(row.get("facility_name"))
        if name:
            names.add(name.lower())
        phone = cell_text(row.get("phone"))
        if phone:
            phones.add(phone)
    return names, phones


def _facility_matches_criteria(fac: dict, criteria: dict) -> bool:
    """Check that a facility record matches the scenario criteria. Falls back to services when attribute column missing."""
    def norm(s):
        if s is None or (isinstance(s, float) and (s != s)):  # NaN
            return ""
        return str(s).lower().strip()

    def col_or_services(col: str) -> str:
        v = fac.get(col, "")
        if v and str(v).strip():
            return norm(v)
        return norm(fac.get("services", ""))

    state = criteria.get("state")
    if state and norm(fac.get("state")) != norm(state):
        return False
    tt = criteria.get("treatment_type")
    if tt and norm(tt) not in col_or_services("treatment_type"):
        return False
    pay = criteria.get("payment")
    if pay:
        pay_norm = norm(pay)
        pop_text = col_or_services("populations")
        pay_text = col_or_services("payment_options")
        if pay_norm in ("veterans", "va"):
            if "veteran" not in pop_text and "veteran" not in pay_text:
                return False
        elif pay_norm not in pay_text:
            return False
    if criteria.get("mat") is True and norm(fac.get("mat")) != "yes":
        return False
    pop = criteria.get("populations")
    if pop and norm(pop) not in col_or_services("populations"):
        return False
    lang = criteria.get("languages")
    if lang and norm(lang) not in col_or_services("languages"):
        return False
    substances = criteria.get("substances")
    if substances and norm(substances) not in col_or_services("substances_addressed"):
        return False
    therapies = criteria.get("therapies")
    if therapies:
        t = norm(therapies)
        svc = norm(fac.get("services", ""))
        if t == "cbt":
            if "cbt" not in svc:
                return False
        elif "12" in t or "twelve" in t:
            if "12-step" not in svc and "12 step" not in svc:
                return False
        elif t not in svc:
            return False
    return True


def _extract_facility_names_from_text(text: str) -> list[str]:
    """Extract facility names from numbered lists only (e.g. '1. Facility Name —')."""
    if not text:
        return []
    names = set()
    # Only match clearly numbered items: "1. **Facility Name**" or "1. Facility Name —"
    # This is much more conservative to avoid false positives
    lines = text.split('\n')
    for line in lines:
        # Match: "1. **Name**" or "1. Name —" or "1. Name." at start of line
        m = re.match(r"^\s*\d+\.\s*\*?\*?([A-Z][^—\*\n]*?)(?:\*?\*?|—|\s*$)", line.strip())
        if m:
            cand = m.group(1).strip()
            # Only include if it looks like a proper facility name (3+ words or has typical facility name patterns)
            words = cand.split()
            if len(cand) > 10 and len(words) >= 2:
                names.add(cand)
    return list(names)


def _evaluate_criteria_extraction(user_msg: str, expected_criteria: dict) -> Dict[str, Any]:
    """Score the chatbot's criteria extraction for one message against the expected criteria.

    Every key present in either dict gets a score: 1.0 for an exact match, 0.5 when
    the extractor produced a key that was not expected, 0.0 when an expected key was
    missed, and 0.3 when both are present but differ. The overall accuracy is the
    mean of those scores (0.0 when there are no keys at all).
    """
    from src.chat import _extract_criteria

    extracted = _extract_criteria(user_msg)

    def key_score(expected_value, extracted_value) -> float:
        if expected_value == extracted_value:
            return 1.0
        if expected_value is None:
            return 0.5  # Extra extraction
        if extracted_value is None:
            return 0.0  # Missed extraction
        return 0.3  # Partial match or wrong

    all_keys = set(expected_criteria.keys()) | set(extracted.keys())
    accuracy = {
        key: key_score(expected_criteria.get(key), extracted.get(key))
        for key in all_keys
    }

    if accuracy:
        overall_accuracy = sum(accuracy.values()) / len(accuracy)
    else:
        overall_accuracy = 0.0

    return {
        "extracted": extracted,
        "expected": expected_criteria,
        "accuracy": accuracy,
        "overall_accuracy": overall_accuracy,
    }


def _evaluate_response_quality(reply: str, scenario: dict, facilities: list) -> Dict[str, Any]:
    """Evaluate response quality using heuristics."""
    scores = {}
    
    # Relevance: Does it mention key attributes?
    key_attrs = scenario.get("key_attributes", [])
    relevance_score = 0
    for attr in key_attrs:
        if attr.lower() in reply.lower():
            relevance_score += 1
    scores["relevance"] = relevance_score / len(key_attrs) if key_attrs else 1.0
    
    # Completeness: Does it provide contact info for facilities?
    has_phone = "phone" in reply.lower() or any(")" in f.get("phone", "") for f in facilities if f.get("phone"))
    has_address = "address" in reply.lower() or any(f.get("address") for f in facilities if f.get("address"))
    scores["completeness"] = (has_phone + has_address) / 2.0
    
    # Helpfulness: Length and structure
    word_count = len(reply.split())
    scores["helpfulness"] = min(1.0, word_count / 100)  # Reward detailed but not too long
    
    # Flow adherence
    expected_flow = scenario.get("expected_flow", "")
    if expected_flow == "clarify" and ("what" in reply.lower() or "tell me" in reply.lower()):
        scores["flow"] = 1.0
    elif expected_flow == "results" and any(str(i) + "." in reply for i in range(1, 6)):
        scores["flow"] = 1.0
    elif expected_flow == "followup" and ("yes" in reply.lower() or "here are" in reply.lower()):
        scores["flow"] = 1.0
    elif expected_flow == "closing" and ("contact" in reply.lower() or "phone" in reply.lower()):
        scores["flow"] = 1.0
    else:
        scores["flow"] = 0.5
    
    overall = sum(scores.values()) / len(scores)
    return {"scores": scores, "overall": overall}


def run_comprehensive_eval():
    """Evaluate criteria extraction and facility search for every entry in SCENARIOS.

    For each scenario the user message is run through criteria extraction, the
    expected criteria are run through search (top 5), and the two sub-scores are
    averaged into an overall score. Returns one result dict per scenario.
    """
    facilities_df = load_facilities()

    def evaluate(scenario: dict) -> dict:
        label = scenario["description"]
        expected = scenario["criteria"]
        message = scenario["user_msg"]

        # Criteria extraction evaluation
        extraction = _evaluate_criteria_extraction(message, expected)

        # Search evaluation
        hits = search(expected, df=facilities_df, limit=5)
        hit_names = []
        for hit in hits:
            if hit.get("facility_name"):
                hit_names.append(hit.get("facility_name", ""))
        every_hit_matches = all(_facility_matches_criteria(hit, expected) for hit in hits)
        enough_hits = len(hits) >= scenario.get("expected_facilities_min", 0)

        # Half credit for relevance of the hits, half for returning enough of them.
        search_score = (every_hit_matches + enough_hits) / 2.0

        return {
            "scenario": label,
            "criteria_extraction": extraction,
            "search_results": {
                "facilities_returned": "; ".join(hit_names) if hit_names else "(none)",
                "count": len(hits),
                "all_match": every_hit_matches,
                "has_min_facilities": enough_hits,
                "score": search_score,
            },
            "overall_score": (extraction["overall_accuracy"] + search_score) / 2.0,
        }

    return [evaluate(scenario) for scenario in SCENARIOS]


def run_chatbot_eval(with_chatbot: bool):
    """Run the single-turn chatbot evaluation: hallucinations, quality and latency.

    Each scenario's user message is sent to a fresh conversation state. The reply
    is checked for facility names and phone numbers that do not exist in the
    dataset, and scored with ``_evaluate_response_quality``.

    Args:
        with_chatbot: When False nothing is run and an empty list is returned.

    Returns:
        One result dict per scenario with ``scenario``, ``response_time``
        (seconds), ``hallucination`` (``facility_names``/``phones``/``overall``,
        True meaning "no hallucination found"), ``response_quality`` and
        ``reply_length`` (words).
    """
    if not with_chatbot:
        return []

    from src.chat import Chatbot

    def last_ten_digits(value) -> str:
        # Compare phones on digits only so formatting and a leading "1" are ignored.
        return re.sub(r"\D", "", str(value))[-10:]

    names_ok, phones_ok = _all_facility_names_and_phones()
    known_phones = {last_ten_digits(phone) for phone in phones_ok}
    # SAMHSA National Helpline (1-800-662-4357): a legitimate number to quote even
    # though it is not a facility phone in the dataset.
    known_phones.add("8006624357")
    # Only fully numeric phones match, so placeholders such as (XXX) XXX-XXXX are never flagged.
    phone_pattern = re.compile(r"\(\d{3}\)\s*\d{3}-\d{4}")

    def is_known_name(name: str) -> bool:
        lowered = name.lower()
        if lowered in names_ok:
            return True
        # Tolerate abbreviated or expanded forms of a dataset name.
        return any(lowered in known or known in lowered for known in names_ok)

    chatbot = Chatbot()

    results = []
    for scenario in SCENARIOS:
        fresh_state = {"criteria": {}, "last_results": [], "last_facility_detail": None}

        # perf_counter is monotonic, unlike the wall clock, so durations cannot go negative.
        started = time.perf_counter()
        reply, state = chatbot.get_response(scenario["user_msg"], [], fresh_state)
        response_time = time.perf_counter() - started

        # Hallucination checks: every quoted name and phone must trace back to the dataset.
        names_clean = all(is_known_name(name) for name in _extract_facility_names_from_text(reply))
        phones_clean = all(
            last_ten_digits(phone) in known_phones for phone in phone_pattern.findall(reply)
        )

        # Response quality
        facilities = (state or {}).get("last_results") or []
        quality_eval = _evaluate_response_quality(reply, scenario, facilities)

        results.append({
            "scenario": scenario["description"],
            "response_time": response_time,
            "hallucination": {
                "facility_names": names_clean,
                "phones": phones_clean,
                "overall": names_clean and phones_clean,
            },
            "response_quality": quality_eval,
            "reply_length": len(reply.split()),
        })

    return results


def run_multi_turn_eval(with_chatbot: bool):
    """Evaluate the multi-turn conversations in MULTI_TURN_SCENARIOS.

    Each conversation is replayed turn by turn, carrying history and state
    forward. Every reply is scored with ``_evaluate_response_quality``; the
    conversation score is the mean of the average turn quality and the fraction
    of ``key_checks`` mentioned in any reply.

    Args:
        with_chatbot: When False nothing is run and an empty list is returned.

    Returns:
        One result dict per conversation with ``scenario``, ``turns``,
        ``overall_quality``, ``key_coverage`` and ``conversation_score``.
    """
    if not with_chatbot:
        return []

    from src.chat import Chatbot
    chatbot = Chatbot()

    results = []
    for scenario in MULTI_TURN_SCENARIOS:
        key_checks = scenario["key_checks"]

        history = []
        state = {"criteria": {}, "last_results": [], "last_facility_detail": None}
        turn_results = []
        full_replies = []  # untruncated, lower-cased replies for the coverage check

        for turn_number, turn in enumerate(scenario["turns"], start=1):
            user_msg = turn["user"]

            reply, state = chatbot.get_response(user_msg, history, state)

            # Evaluate this turn
            quality_eval = _evaluate_response_quality(
                reply,
                {"expected_flow": turn["expected_flow"], "key_attributes": key_checks},
                state.get("last_results", []),
            )

            turn_results.append({
                "turn": turn_number,
                "user": user_msg,
                # Only the report copy is shortened to keep the output readable.
                "reply": reply[:200] + "..." if len(reply) > 200 else reply,
                "quality": quality_eval,
            })
            full_replies.append(reply.lower())

            history.append([user_msg, reply])

        # Overall conversation score. Coverage is measured on the full replies: the
        # truncated report copies would miss anything said after 200 characters.
        if turn_results:
            avg_quality = sum(t["quality"]["overall"] for t in turn_results) / len(turn_results)
        else:
            avg_quality = 0.0
        if key_checks:
            covered = sum(
                1 for check in key_checks
                if any(check.lower() in text for text in full_replies)
            )
            key_coverage = covered / len(key_checks)
        else:
            key_coverage = 1.0  # nothing to cover

        results.append({
            "scenario": scenario["description"],
            "turns": turn_results,
            "overall_quality": avg_quality,
            "key_coverage": key_coverage,
            "conversation_score": (avg_quality + key_coverage) / 2.0,
        })

    return results


def main():
    """Command-line entry point: run the evaluations and print a report.

    Flags:
        --with-chatbot  also run the chatbot response evaluation.
        --format        "table" (default), "json" or "csv".
        --multi-turn    also run multi-turn conversations (needs --with-chatbot).

    The report is written to stdout; progress and warnings go to stderr so that
    JSON and CSV output stay machine-parseable.
    """
    ap = argparse.ArgumentParser(description="Comprehensive evaluation of SAMHSA chatbot: criteria extraction, search relevance, response quality, hallucinations, and multi-turn conversations.")
    ap.add_argument("--with-chatbot", action="store_true", help="Run chatbot evaluation (requires API and may take longer).")
    ap.add_argument("--format", choices=["table", "json", "csv"], default="table", help="Output format.")
    ap.add_argument("--multi-turn", action="store_true", help="Include multi-turn conversation evaluation.")
    args = ap.parse_args()

    print("Running comprehensive evaluation...", file=sys.stderr)

    # Multi-turn conversations need the chatbot; without it there is nothing to
    # report, so no section or score is emitted rather than a misleading 0.
    run_multi_turn = args.with_chatbot and args.multi_turn
    if args.multi_turn and not args.with_chatbot:
        print("--multi-turn requires --with-chatbot; skipping multi-turn evaluation.", file=sys.stderr)

    # Run evaluations
    search_results = run_comprehensive_eval()
    chatbot_results = run_chatbot_eval(args.with_chatbot)
    multi_turn_results = run_multi_turn_eval(run_multi_turn)

    def mean(values):
        values = list(values)
        return sum(values) / len(values) if values else 0

    # Aggregate scores
    avg_search_score = mean(r["overall_score"] for r in search_results)

    if args.with_chatbot:
        avg_hallucination = mean(1.0 if r["hallucination"]["overall"] else 0.0 for r in chatbot_results)
        avg_quality = mean(r["response_quality"]["overall"] for r in chatbot_results)
        avg_response_time = mean(r["response_time"] for r in chatbot_results)
    else:
        avg_hallucination = avg_quality = avg_response_time = None

    avg_conv_score = mean(r["conversation_score"] for r in multi_turn_results) if run_multi_turn else None

    if args.format == "json":
        output = {
            "search_evaluation": search_results,
            "chatbot_evaluation": chatbot_results if args.with_chatbot else None,
            "multi_turn_evaluation": multi_turn_results if run_multi_turn else None,
            "summary": {
                "average_search_score": avg_search_score,
                "average_hallucination_score": avg_hallucination,
                "average_response_quality": avg_quality,
                "average_response_time": avg_response_time,
                "average_conversation_score": avg_conv_score,
            }
        }
        print(json.dumps(output, indent=2))
        return

    if args.format == "csv":
        import csv
        writer = csv.writer(sys.stdout)
        writer.writerow(["Scenario", "Search Score", "Criteria Accuracy", "Hallucination", "Response Quality", "Response Time"])
        for i, sr in enumerate(search_results):
            row = [
                sr["scenario"],
                f"{sr['overall_score']:.2f}",
                f"{sr['criteria_extraction']['overall_accuracy']:.2f}",
            ]
            if args.with_chatbot and i < len(chatbot_results):
                cr = chatbot_results[i]
                row.extend([
                    # "overall" is True when NO hallucination was found, so the
                    # "Hallucination" column reads "N" in that case (same as the table).
                    "N" if cr["hallucination"]["overall"] else "Y",
                    f"{cr['response_quality']['overall']:.2f}",
                    f"{cr['response_time']:.2f}",
                ])
            else:
                row.extend(["N/A", "N/A", "N/A"])
            writer.writerow(row)
        return

    # Table format
    print(f"\n{'='*80}")
    print("COMPREHENSIVE CHATBOT EVALUATION RESULTS")
    print(f"{'='*80}")

    print(f"\nSEARCH & CRITERIA EXTRACTION ({len(search_results)} scenarios):")
    print(f"{'Scenario':<35} {'Search':<8} {'Criteria':<10} {'Overall':<8}")
    print("-" * 61)
    for r in search_results:
        print(f"{r['scenario']:<35} {r['search_results']['score']:<8.2f} {r['criteria_extraction']['overall_accuracy']:<10.2f} {r['overall_score']:<8.2f}")

    if args.with_chatbot:
        print(f"\nCHATBOT RESPONSE EVALUATION ({len(chatbot_results)} scenarios):")
        print(f"{'Scenario':<35} {'Quality':<8} {'Halluc?':<8} {'Time(s)':<8}")
        print("-" * 59)
        for r in chatbot_results:
            hall = "N" if r["hallucination"]["overall"] else "Y"
            print(f"{r['scenario']:<35} {r['response_quality']['overall']:<8.2f} {hall:<8} {r['response_time']:<8.2f}")

    if run_multi_turn:
        print("\nMULTI-TURN CONVERSATION EVALUATION:")
        for r in multi_turn_results:
            print(f"  {r['scenario']}: Quality={r['overall_quality']:.2f}, Key Coverage={r['key_coverage']:.2f}, Overall={r['conversation_score']:.2f}")

    print(f"\n{'='*80}")
    print("SUMMARY SCORES (0.0-1.0 scale, higher is better):")
    print(f"  Average Search & Criteria Score: {avg_search_score:.3f}")
    if avg_hallucination is not None:
        print(f"  Average Hallucination Score: {avg_hallucination:.3f} (1.0 = no hallucinations)")
    if avg_quality is not None:
        print(f"  Average Response Quality: {avg_quality:.3f}")
    if avg_response_time is not None:
        print(f"  Average Response Time: {avg_response_time:.2f} seconds")
    if avg_conv_score is not None:
        print(f"  Average Multi-turn Score: {avg_conv_score:.3f}")

    print("\nRECOMMENDATIONS:")
    if avg_search_score < 0.8:
        print("  - Improve criteria extraction accuracy and search result relevance.")
    if avg_hallucination is not None and avg_hallucination < 0.9:
        print("  - Address hallucination issues in chatbot responses.")
    if avg_quality is not None and avg_quality < 0.7:
        print("  - Enhance response quality: ensure relevance, completeness, and proper flow.")
    if avg_response_time is not None and avg_response_time > 5.0:
        print("  - Optimize response time (consider smaller models or caching).")
    print(f"{'='*80}")


# Run the evaluation only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()