File size: 26,965 Bytes
78b92dc
740774d
78b92dc
740774d
 
 
 
 
 
 
78b92dc
740774d
 
78b92dc
 
 
740774d
78b92dc
 
740774d
78b92dc
740774d
78b92dc
 
 
 
 
 
740774d
78b92dc
740774d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78b92dc
 
 
740774d
78b92dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
740774d
78b92dc
 
 
740774d
 
 
 
 
 
78b92dc
 
740774d
 
 
78b92dc
 
 
 
740774d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78b92dc
740774d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78b92dc
740774d
 
 
 
 
 
 
 
 
78b92dc
740774d
 
78b92dc
 
740774d
 
78b92dc
740774d
 
78b92dc
740774d
78b92dc
740774d
 
 
 
 
 
 
 
 
 
 
 
 
 
78b92dc
740774d
 
 
78b92dc
740774d
78b92dc
 
 
740774d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78b92dc
 
 
740774d
 
 
 
78b92dc
 
740774d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78b92dc
 
 
740774d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78b92dc
 
740774d
 
 
 
 
 
 
 
 
 
 
78b92dc
740774d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78b92dc
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
"""
Comprehensive evaluation script for SAMHSA Treatment Locator chatbot.

This script provides a detailed, multi-faceted evaluation of the chatbot's performance across:
- Criteria extraction accuracy
- Search result relevance and matching
- Response quality (relevance, completeness, helpfulness, flow adherence)
- Hallucination prevention
- Conversation handling (single-turn and multi-turn scenarios)
- Edge case robustness

Evaluates against 25+ scenarios, including real conversation examples.
Outputs detailed metrics, scores, and recommendations for improvement.
"""

import argparse
import json
import re
import sys
import time
from pathlib import Path
from typing import Dict, List, Any

# Project root
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from src.facilities import load_facilities, search

# --- Enhanced Scenarios with Expected Outcomes ---
# Single-turn evaluation cases. Each entry is a dict with:
#   description             - label used for the scenario in result rows and reports
#   criteria                - expected structured criteria; compared against the
#                             extractor's output and also passed to search()
#   user_msg                - the user utterance given to the extractor / chatbot
#   expected_flow           - "results" or "clarify"; checked by the flow heuristic
#                             in _evaluate_response_quality
#   expected_facilities_min - minimum number of search hits the scenario requires
#   key_attributes          - terms looked up (case-insensitively, as substrings)
#                             in the chatbot reply to score relevance
SCENARIOS = [
    # Basic search scenarios
    {
        "description": "Outpatient, Boston, Medicaid",
        "criteria": {"state": "ma", "location": "Boston", "treatment_type": "outpatient", "payment": "Medicaid"},
        "user_msg": "I need outpatient treatment in Boston with Medicaid.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["outpatient", "Medicaid", "Boston"],
    },
    # NOTE(review): description and user_msg say "outpatient", but criteria has no
    # treatment_type, so an extractor that picks it up is scored as an extra key.
    # Confirm the omission is intended.
    {
        "description": "Outpatient, Boston, MassHealth",
        "criteria": {"state": "ma", "location": "Boston", "payment": "Medicaid"},
        "user_msg": "Looking for outpatient in Boston with MassHealth.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["Medicaid", "Boston"],
    },
    {
        "description": "Outpatient, Boston, MAT",
        "criteria": {"state": "ma", "location": "Boston", "treatment_type": "outpatient", "mat": True},
        "user_msg": "Outpatient in Boston with medication-assisted treatment.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["MAT", "Boston"],
    },
    {
        "description": "Residential, Massachusetts",
        "criteria": {"state": "ma", "treatment_type": "residential"},
        "user_msg": "Residential treatment in Massachusetts.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["residential", "MA"],
    },
    {
        "description": "Veterans, Texas",
        "criteria": {"state": "tx", "populations": "veterans", "payment": "veterans"},
        "user_msg": "Do you have options for veterans in Texas?",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["veterans", "Texas"],
    },
    {
        "description": "Veterans, San Antonio",
        "criteria": {"state": "tx", "location": "San Antonio", "populations": "veterans"},
        "user_msg": "Veterans programs in San Antonio.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["veterans", "San Antonio"],
    },
    # NOTE(review): user_msg and key_attributes mention "outpatient", but criteria
    # has no treatment_type. Confirm the omission is intended.
    {
        "description": "Outpatient, Austin",
        "criteria": {"state": "tx", "location": "Austin"},
        "user_msg": "Outpatient substance use treatment in Austin.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["outpatient", "Austin"],
    },
    {
        "description": "California, Medicaid",
        "criteria": {"state": "ca", "payment": "Medicaid"},
        "user_msg": "California facilities that accept Medicaid.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["Medicaid", "California"],
    },
    {
        "description": "California, residential",
        "criteria": {"state": "ca", "treatment_type": "residential"},
        "user_msg": "Residential treatment in California.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["residential", "California"],
    },
    {
        "description": "San Francisco, outpatient",
        "criteria": {"state": "ca", "location": "San Francisco", "treatment_type": "outpatient"},
        "user_msg": "Outpatient in San Francisco.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["outpatient", "San Francisco"],
    },
    {
        "description": "Los Angeles area",
        "criteria": {"state": "ca", "location": "Los Angeles"},
        "user_msg": "Treatment options in Los Angeles area.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["Los Angeles"],
    },
    {
        "description": "Chicago, outpatient",
        "criteria": {"state": "il", "location": "Chicago", "treatment_type": "outpatient"},
        "user_msg": "Outpatient in Chicago.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["outpatient", "Chicago"],
    },
    {
        "description": "Chicago, MAT",
        "criteria": {"state": "il", "location": "Chicago", "mat": True},
        "user_msg": "Chicago programs with MAT.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["MAT", "Chicago"],
    },
    {
        "description": "Illinois, Medicaid",
        "criteria": {"state": "il", "payment": "Medicaid"},
        "user_msg": "Illinois facilities accepting Medicaid.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["Medicaid", "Illinois"],
    },
    {
        "description": "Boston, sliding scale",
        "criteria": {"state": "ma", "location": "Boston", "payment": "sliding scale"},
        "user_msg": "Boston programs with sliding scale fees.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["sliding scale", "Boston"],
    },
    {
        "description": "Outpatient, Boston, Spanish",
        "criteria": {"state": "ma", "location": "Boston", "treatment_type": "outpatient", "languages": "Spanish"},
        "user_msg": "Outpatient in Boston, Spanish-speaking.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["Spanish", "Boston"],
    },
    {
        "description": "Residential, Texas",
        "criteria": {"state": "tx", "treatment_type": "residential"},
        "user_msg": "Residential treatment in Texas.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["residential", "Texas"],
    },
    {
        "description": "MA, inpatient",
        "criteria": {"state": "ma", "treatment_type": "inpatient"},
        "user_msg": "Inpatient treatment in MA.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["inpatient", "MA"],
    },
    {
        "description": "Boston, alcohol",
        "criteria": {"state": "ma", "location": "Boston", "substances": "alcohol"},
        "user_msg": "Boston facilities for alcohol treatment.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["alcohol", "Boston"],
    },
    {
        "description": "Chicago, opioids",
        "criteria": {"state": "il", "location": "Chicago", "substances": "opioids"},
        "user_msg": "Opioid treatment in Chicago.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["opioids", "Chicago"],
    },
    {
        "description": "Boston, CBT",
        "criteria": {"state": "ma", "location": "Boston", "therapies": "CBT"},
        "user_msg": "Boston programs that offer CBT.",
        "expected_flow": "results",
        "expected_facilities_min": 1,
        "key_attributes": ["CBT", "Boston"],
    },
    # Edge cases and clarification scenarios
    # These expect the bot to ask for more information ("clarify") and do not
    # require any search hits (expected_facilities_min is 0).
    {
        "description": "No location provided",
        "criteria": {},
        "user_msg": "I need help finding treatment.",
        "expected_flow": "clarify",
        "expected_facilities_min": 0,
        "key_attributes": [],
    },
    {
        "description": "Vague request",
        "criteria": {},
        "user_msg": "What's available?",
        "expected_flow": "clarify",
        "expected_facilities_min": 0,
        "key_attributes": [],
    },
    {
        "description": "Conflicting criteria",
        "criteria": {"state": "ma", "location": "Austin"},
        "user_msg": "Treatment in Massachusetts but specifically Austin.",
        "expected_flow": "clarify",
        "expected_facilities_min": 0,
        "key_attributes": [],
    },
]

# Multi-turn conversation scenarios based on examples
# Each entry is a dict with:
#   description - label used for the conversation in the report
#   turns       - ordered user messages, each with the flow the reply is expected
#                 to follow ("clarify", "results", "followup" or "closing")
#   key_checks  - terms searched for (case-insensitively, as substrings) in the
#                 replies to compute key coverage; the same list is also used as
#                 the key_attributes for every turn's relevance score
# NOTE(review): "contact info" is matched as a literal substring, so a reply that
# gives a phone number without those words does not count - confirm intent.
MULTI_TURN_SCENARIOS = [
    {
        "description": "SAMHSA Example Conversation",
        "turns": [
            {"user": "Hi, I'm trying to find a treatment program for alcohol use. I'm not sure where to start.", "expected_flow": "clarify"},
            {"user": "I'm in the Boston area. I think outpatient would work best since I need to keep working. I have MassHealth.", "expected_flow": "results"},
            {"user": "I'm interested in the one at Boston Medical Center. Do they offer medication-assisted treatment?", "expected_flow": "followup"},
            {"user": "How do I schedule an intake?", "expected_flow": "closing"},
        ],
        "key_checks": ["Boston", "outpatient", "MassHealth", "Boston Medical Center", "MAT", "contact info"],
    },
]

# All facility names and phones from dataset (for hallucination check)

def _all_facility_names_and_phones():
    """Collect every facility name and phone number present in the dataset.

    The two sets are the ground truth for the hallucination check: a name or
    phone the chatbot mentions should be traceable to one of them.

    Returns:
        A ``(names, phones)`` tuple of sets. Names are stripped and lower-cased;
        phones are stripped but otherwise kept as they appear in the data.
    """
    def cell_text(value) -> str:
        # Missing cells can surface as NaN, which is truthy and stringifies to
        # "nan". Treat it like an empty cell so "nan" never becomes a "known"
        # name or phone (a "nan" entry would make the substring-based name check
        # accept any name containing those letters).
        if not value or (isinstance(value, float) and value != value):
            return ""
        return str(value).strip()

    df = load_facilities()
    names = set()
    phones = set()
    for _, row in df.iterrows():
        name = cell_text(row.get("facility_name"))
        if name:
            names.add(name.lower())
        phone = cell_text(row.get("phone"))
        if phone:
            phones.add(phone)
    return names, phones


def _facility_matches_criteria(fac: dict, criteria: dict) -> bool:
    """Check that a facility record matches the scenario criteria. Falls back to services when attribute column missing."""
    def norm(s):
        if s is None or (isinstance(s, float) and (s != s)):  # NaN
            return ""
        return str(s).lower().strip()

    def col_or_services(col: str) -> str:
        v = fac.get(col, "")
        if v and str(v).strip():
            return norm(v)
        return norm(fac.get("services", ""))

    state = criteria.get("state")
    if state and norm(fac.get("state")) != norm(state):
        return False
    tt = criteria.get("treatment_type")
    if tt and norm(tt) not in col_or_services("treatment_type"):
        return False
    pay = criteria.get("payment")
    if pay:
        pay_norm = norm(pay)
        pop_text = col_or_services("populations")
        pay_text = col_or_services("payment_options")
        if pay_norm in ("veterans", "va"):
            if "veteran" not in pop_text and "veteran" not in pay_text:
                return False
        elif pay_norm not in pay_text:
            return False
    if criteria.get("mat") is True and norm(fac.get("mat")) != "yes":
        return False
    pop = criteria.get("populations")
    if pop and norm(pop) not in col_or_services("populations"):
        return False
    lang = criteria.get("languages")
    if lang and norm(lang) not in col_or_services("languages"):
        return False
    substances = criteria.get("substances")
    if substances and norm(substances) not in col_or_services("substances_addressed"):
        return False
    therapies = criteria.get("therapies")
    if therapies:
        t = norm(therapies)
        svc = norm(fac.get("services", ""))
        if t == "cbt":
            if "cbt" not in svc:
                return False
        elif "12" in t or "twelve" in t:
            if "12-step" not in svc and "12 step" not in svc:
                return False
        elif t not in svc:
            return False
    return True


def _extract_facility_names_from_text(text: str) -> list[str]:
    """Extract facility names from numbered lists only (e.g. '1. Facility Name β€”')."""
    if not text:
        return []
    names = set()
    # Only match clearly numbered items: "1. **Facility Name**" or "1. Facility Name β€”"
    # This is much more conservative to avoid false positives
    lines = text.split('\n')
    for line in lines:
        # Match: "1. **Name**" or "1. Name β€”" or "1. Name." at start of line
        m = re.match(r"^\s*\d+\.\s*\*?\*?([A-Z][^β€”\*\n]*?)(?:\*?\*?|β€”|\s*$)", line.strip())
        if m:
            cand = m.group(1).strip()
            # Only include if it looks like a proper facility name (3+ words or has typical facility name patterns)
            words = cand.split()
            if len(cand) > 10 and len(words) >= 2:
                names.add(cand)
    return list(names)


def _evaluate_criteria_extraction(user_msg: str, expected_criteria: dict) -> Dict[str, Any]:
    """Score the criteria extractor's output for one message.

    Every key that appears in either the expected or the extracted criteria is
    graded:

    * 1.0 - values are equal
    * 0.5 - extracted although nothing was expected
    * 0.0 - expected but not extracted
    * 0.3 - both present but different

    Args:
        user_msg: The user utterance handed to the extractor.
        expected_criteria: The criteria the scenario expects.

    Returns:
        A dict with the ``extracted`` and ``expected`` criteria, the per-key
        ``accuracy`` grades and their mean as ``overall_accuracy`` (0.0 when
        there are no keys to grade).
    """
    from src.chat import _extract_criteria

    found = _extract_criteria(user_msg)

    def grade(key) -> float:
        want = expected_criteria.get(key)
        got = found.get(key)
        if want == got:
            return 1.0
        if want is None and got is not None:
            return 0.5  # extra extraction
        if want is not None and got is None:
            return 0.0  # missed extraction
        return 0.3  # partial match or wrong value

    graded_keys = set(expected_criteria.keys()) | set(found.keys())
    per_key = {key: grade(key) for key in graded_keys}
    mean_grade = sum(per_key.values()) / len(per_key) if per_key else 0.0

    return {
        "extracted": found,
        "expected": expected_criteria,
        "accuracy": per_key,
        "overall_accuracy": mean_grade,
    }


def _evaluate_response_quality(reply: str, scenario: dict, facilities: list) -> Dict[str, Any]:
    """Evaluate response quality using heuristics."""
    scores = {}
    
    # Relevance: Does it mention key attributes?
    key_attrs = scenario.get("key_attributes", [])
    relevance_score = 0
    for attr in key_attrs:
        if attr.lower() in reply.lower():
            relevance_score += 1
    scores["relevance"] = relevance_score / len(key_attrs) if key_attrs else 1.0
    
    # Completeness: Does it provide contact info for facilities?
    has_phone = "phone" in reply.lower() or any(")" in f.get("phone", "") for f in facilities if f.get("phone"))
    has_address = "address" in reply.lower() or any(f.get("address") for f in facilities if f.get("address"))
    scores["completeness"] = (has_phone + has_address) / 2.0
    
    # Helpfulness: Length and structure
    word_count = len(reply.split())
    scores["helpfulness"] = min(1.0, word_count / 100)  # Reward detailed but not too long
    
    # Flow adherence
    expected_flow = scenario.get("expected_flow", "")
    if expected_flow == "clarify" and ("what" in reply.lower() or "tell me" in reply.lower()):
        scores["flow"] = 1.0
    elif expected_flow == "results" and any(str(i) + "." in reply for i in range(1, 6)):
        scores["flow"] = 1.0
    elif expected_flow == "followup" and ("yes" in reply.lower() or "here are" in reply.lower()):
        scores["flow"] = 1.0
    elif expected_flow == "closing" and ("contact" in reply.lower() or "phone" in reply.lower()):
        scores["flow"] = 1.0
    else:
        scores["flow"] = 0.5
    
    overall = sum(scores.values()) / len(scores)
    return {"scores": scores, "overall": overall}


def run_comprehensive_eval():
    """Score every single-turn scenario on criteria extraction and search.

    For each entry in SCENARIOS the user message is run through the criteria
    extractor and the scenario's expected criteria are run through the facility
    search (at most 5 hits). The search score is the mean of two checks: every
    hit matches the criteria, and at least ``expected_facilities_min`` hits
    were returned.

    Returns:
        One dict per scenario with the extraction evaluation, a summary of the
        search results and ``overall_score`` (mean of extraction accuracy and
        search score).
    """
    facility_table = load_facilities()

    def evaluate(case: dict) -> dict:
        label = case["description"]
        wanted = case["criteria"]

        # Criteria extraction evaluation
        extraction = _evaluate_criteria_extraction(case["user_msg"], wanted)

        # Search evaluation
        hits = search(wanted, df=facility_table, limit=5)
        hit_names = [hit.get("facility_name", "") for hit in hits if hit.get("facility_name")]
        every_hit_matches = all(_facility_matches_criteria(hit, wanted) for hit in hits)
        enough_hits = len(hits) >= case.get("expected_facilities_min", 0)
        search_score = (every_hit_matches + enough_hits) / 2.0

        return {
            "scenario": label,
            "criteria_extraction": extraction,
            "search_results": {
                "facilities_returned": "; ".join(hit_names) if hit_names else "(none)",
                "count": len(hits),
                "all_match": every_hit_matches,
                "has_min_facilities": enough_hits,
                "score": search_score,
            },
            "overall_score": (extraction["overall_accuracy"] + search_score) / 2.0,
        }

    return [evaluate(case) for case in SCENARIOS]


def run_chatbot_eval(with_chatbot: bool):
    """Run each single-turn scenario through the chatbot and score the reply.

    Every scenario is sent with an empty history and a fresh state. The reply
    is checked for facility names that cannot be traced to the dataset and is
    scored with the response-quality heuristics; latency is recorded as well.

    Args:
        with_chatbot: When False the chatbot is not loaded and an empty list is
            returned.

    Returns:
        One dict per scenario with ``response_time`` (seconds), the
        ``hallucination`` checks (True means the check passed),
        ``response_quality`` and ``reply_length`` (words).
    """
    if not with_chatbot:
        return []

    from src.chat import Chatbot
    names_ok, _ = _all_facility_names_and_phones()
    chatbot = Chatbot()

    def is_known_name(name_lower: str) -> bool:
        # Exact match, or a partial match in either direction so abbreviated or
        # decorated names are not reported as invented.
        if name_lower in names_ok:
            return True
        return any(name_lower in db or db in name_lower for db in names_ok)

    results = []
    for scenario in SCENARIOS:
        user_msg = scenario["user_msg"]
        fresh_state = {"criteria": {}, "last_results": [], "last_facility_detail": None}

        # perf_counter is monotonic, so the measured latency cannot be skewed
        # by system clock adjustments.
        started = time.perf_counter()
        reply, state = chatbot.get_response(user_msg, [], fresh_state)
        response_time = time.perf_counter() - started

        # Hallucination check on facility names
        mentioned_names = _extract_facility_names_from_text(reply)
        hallucinated = any(not is_known_name(name.lower()) for name in mentioned_names)

        # Phone numbers are deliberately not verified (lenient mode), so this
        # check always passes.
        phone_hallucinated = False

        # Response quality
        facilities = state.get("last_results", [])
        quality_eval = _evaluate_response_quality(reply, scenario, facilities)

        results.append({
            "scenario": scenario["description"],
            "response_time": response_time,
            "hallucination": {
                "facility_names": not hallucinated,
                "phones": not phone_hallucinated,
                "overall": not (hallucinated or phone_hallucinated),
            },
            "response_quality": quality_eval,
            "reply_length": len(reply.split()),
        })

    return results


def run_multi_turn_eval(with_chatbot: bool):
    """Evaluate multi-turn conversations.

    Each conversation in MULTI_TURN_SCENARIOS is replayed turn by turn, feeding
    the accumulated history and the state returned by the previous turn back
    into the chatbot. Every reply is scored with the response-quality
    heuristics; the conversation score is the mean of the average turn quality
    and the key coverage.

    Args:
        with_chatbot: When False the chatbot is not loaded and an empty list is
            returned.

    Returns:
        One dict per conversation with the per-turn results (replies shortened
        to a 200-character preview), ``overall_quality``, ``key_coverage`` and
        ``conversation_score``.
    """
    if not with_chatbot:
        return []

    from src.chat import Chatbot
    chatbot = Chatbot()

    results = []
    for scenario in MULTI_TURN_SCENARIOS:
        desc = scenario["description"]
        key_checks = scenario["key_checks"]

        history = []
        state = {"criteria": {}, "last_results": [], "last_facility_detail": None}
        turn_results = []
        full_replies = []  # untruncated, lower-cased replies for key coverage

        for turn_number, turn in enumerate(scenario["turns"], start=1):
            user_msg = turn["user"]
            expected_flow = turn["expected_flow"]

            reply, state = chatbot.get_response(user_msg, history, state)

            # Evaluate this turn
            quality_eval = _evaluate_response_quality(
                reply,
                {"expected_flow": expected_flow, "key_attributes": key_checks},
                state.get("last_results", []),
            )

            turn_results.append({
                "turn": turn_number,
                "user": user_msg,
                "reply": reply[:200] + "..." if len(reply) > 200 else reply,
                "quality": quality_eval,
            })
            full_replies.append(reply.lower())

            history.append([user_msg, reply])

        # Overall conversation score. Guard the divisions so a scenario without
        # turns or key checks does not abort the whole run.
        if turn_results:
            avg_quality = sum(t["quality"]["overall"] for t in turn_results) / len(turn_results)
        else:
            avg_quality = 0.0

        if key_checks:
            # Search the full replies: the stored previews are cut at 200
            # characters and would hide terms that appear later in a reply.
            covered = sum(
                1 for check in key_checks
                if any(check.lower() in text for text in full_replies)
            )
            key_coverage = covered / len(key_checks)
        else:
            key_coverage = 1.0  # nothing to cover

        results.append({
            "scenario": desc,
            "turns": turn_results,
            "overall_quality": avg_quality,
            "key_coverage": key_coverage,
            "conversation_score": (avg_quality + key_coverage) / 2.0,
        })

    return results


def main():
    """Command-line entry point: run the evaluations and print a report.

    The search / criteria-extraction evaluation always runs. ``--with-chatbot``
    adds the chatbot reply evaluation, and ``--multi-turn`` (effective only
    together with ``--with-chatbot``) adds the multi-turn conversations.
    ``--format`` selects a human-readable table (default), JSON or CSV, all
    written to stdout.
    """
    ap = argparse.ArgumentParser(description="Comprehensive evaluation of SAMHSA chatbot: criteria extraction, search relevance, response quality, hallucinations, and multi-turn conversations.")
    ap.add_argument("--with-chatbot", action="store_true", help="Run chatbot evaluation (requires API and may take longer).")
    ap.add_argument("--format", choices=["table", "json", "csv"], default="table", help="Output format.")
    ap.add_argument("--multi-turn", action="store_true", help="Include multi-turn conversation evaluation.")
    args = ap.parse_args()

    # Multi-turn evaluation drives the chatbot, so it needs --with-chatbot.
    # Without it nothing would run, and reporting a 0.000 score would mislead.
    run_multi_turn = args.with_chatbot and args.multi_turn
    if args.multi_turn and not args.with_chatbot:
        print("Note: --multi-turn requires --with-chatbot; skipping multi-turn evaluation.", file=sys.stderr)

    # Keep stdout machine-readable for json/csv: the status line goes to stderr
    # there and stays on stdout for the table report.
    status_stream = sys.stdout if args.format == "table" else sys.stderr
    print("Running comprehensive evaluation...", file=status_stream)

    # Run evaluations
    search_results = run_comprehensive_eval()
    chatbot_results = run_chatbot_eval(args.with_chatbot)
    multi_turn_results = run_multi_turn_eval(run_multi_turn)

    # Aggregate scores
    search_scores = [r["overall_score"] for r in search_results]
    avg_search_score = sum(search_scores) / len(search_scores) if search_scores else 0

    if args.with_chatbot:
        hallucination_scores = [1.0 if r["hallucination"]["overall"] else 0.0 for r in chatbot_results]
        quality_scores = [r["response_quality"]["overall"] for r in chatbot_results]
        avg_hallucination = sum(hallucination_scores) / len(hallucination_scores) if hallucination_scores else 0
        avg_quality = sum(quality_scores) / len(quality_scores) if quality_scores else 0
        avg_response_time = sum(r["response_time"] for r in chatbot_results) / len(chatbot_results) if chatbot_results else 0
    else:
        avg_hallucination = avg_quality = avg_response_time = None

    if run_multi_turn:
        conv_scores = [r["conversation_score"] for r in multi_turn_results]
        avg_conv_score = sum(conv_scores) / len(conv_scores) if conv_scores else 0
    else:
        avg_conv_score = None

    if args.format == "json":
        output = {
            "search_evaluation": search_results,
            "chatbot_evaluation": chatbot_results if args.with_chatbot else None,
            "multi_turn_evaluation": multi_turn_results if run_multi_turn else None,
            "summary": {
                "average_search_score": avg_search_score,
                "average_hallucination_score": avg_hallucination,
                "average_response_quality": avg_quality,
                "average_response_time": avg_response_time,
                "average_conversation_score": avg_conv_score,
            }
        }
        print(json.dumps(output, indent=2))
        return

    if args.format == "csv":
        import csv
        writer = csv.writer(sys.stdout)
        writer.writerow(["Scenario", "Search Score", "Criteria Accuracy", "Hallucination", "Response Quality", "Response Time"])
        for i, sr in enumerate(search_results):
            row = [
                sr["scenario"],
                f"{sr['overall_score']:.2f}",
                f"{sr['criteria_extraction']['overall_accuracy']:.2f}",
            ]
            if args.with_chatbot and i < len(chatbot_results):
                cr = chatbot_results[i]
                row.extend([
                    # hallucination["overall"] is True when the reply is clean,
                    # so "Y" (hallucination present) is its negation - the same
                    # convention as the "Halluc?" column of the table report.
                    "N" if cr["hallucination"]["overall"] else "Y",
                    f"{cr['response_quality']['overall']:.2f}",
                    f"{cr['response_time']:.2f}",
                ])
            else:
                row.extend(["N/A", "N/A", "N/A"])
            writer.writerow(row)
        return

    # Table format
    rule = "=" * 80
    print(f"\n{rule}")
    print("COMPREHENSIVE CHATBOT EVALUATION RESULTS")
    print(rule)

    print(f"\nSEARCH & CRITERIA EXTRACTION ({len(search_results)} scenarios):")
    print(f"{'Scenario':<35} {'Search':<8} {'Criteria':<10} {'Overall':<8}")
    print("-" * 61)
    for r in search_results:
        print(f"{r['scenario']:<35} {r['search_results']['score']:<8.2f} {r['criteria_extraction']['overall_accuracy']:<10.2f} {r['overall_score']:<8.2f}")

    if args.with_chatbot:
        print(f"\nCHATBOT RESPONSE EVALUATION ({len(chatbot_results)} scenarios):")
        print(f"{'Scenario':<35} {'Quality':<8} {'Halluc?':<8} {'Time(s)':<8}")
        print("-" * 59)
        for r in chatbot_results:
            hall = "N" if r["hallucination"]["overall"] else "Y"
            print(f"{r['scenario']:<35} {r['response_quality']['overall']:<8.2f} {hall:<8} {r['response_time']:<8.2f}")

    if run_multi_turn:
        print("\nMULTI-TURN CONVERSATION EVALUATION:")
        for r in multi_turn_results:
            print(f"  {r['scenario']}: Quality={r['overall_quality']:.2f}, Key Coverage={r['key_coverage']:.2f}, Overall={r['conversation_score']:.2f}")

    print(f"\n{rule}")
    print("SUMMARY SCORES (0.0-1.0 scale, higher is better):")
    print(f"  Average Search & Criteria Score: {avg_search_score:.3f}")
    if avg_hallucination is not None:
        print(f"  Average Hallucination Score: {avg_hallucination:.3f} (1.0 = no hallucinations)")
    if avg_quality is not None:
        print(f"  Average Response Quality: {avg_quality:.3f}")
    if avg_response_time is not None:
        print(f"  Average Response Time: {avg_response_time:.2f} seconds")
    if avg_conv_score is not None:
        print(f"  Average Multi-turn Score: {avg_conv_score:.3f}")

    print("\nRECOMMENDATIONS:")
    if avg_search_score < 0.8:
        print("  - Improve criteria extraction accuracy and search result relevance.")
    if avg_hallucination is not None and avg_hallucination < 0.9:
        print("  - Address hallucination issues in chatbot responses.")
    if avg_quality is not None and avg_quality < 0.7:
        print("  - Enhance response quality: ensure relevance, completeness, and proper flow.")
    if avg_response_time is not None and avg_response_time > 5.0:
        print("  - Optimize response time (consider smaller models or caching).")
    print(rule)


if __name__ == "__main__":
    main()