| """ |
| Deterministic multi-turn V2.5 eval harness. |
| |
| This uses the fast presentation backend so it can run locally without loading |
| the full LLM stack. It measures route/tier matching and simple safety flags. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import math |
| from pathlib import Path |
| import sys |
| import time |
|
|
| ROOT = Path(__file__).resolve().parents[1] |
| sys.path.insert(0, str(ROOT / "demo")) |
|
|
| import app |
|
|
|
|
| def load_jsonl(path: Path) -> list[dict]: |
| return [json.loads(line) for line in path.read_text(encoding="utf-8").splitlines() if line.strip()] |
|
|
|
|
| DEFAULT_INPUT = ROOT / "eval" / "multiturn_scenarios.jsonl" |
| DEFAULT_SUPPLEMENT = ROOT / "eval" / "multiturn_safety_supplement.jsonl" |
|
|
|
|
| def run_eval(input_path: Path, supplement_path: Path | None = DEFAULT_SUPPLEMENT) -> dict: |
| scenarios = load_jsonl(input_path) |
| input_files = [str(input_path)] |
| if supplement_path and supplement_path.exists(): |
| scenarios.extend(load_jsonl(supplement_path)) |
| input_files.append(str(supplement_path)) |
| results = [] |
| route_total = 0 |
| route_correct = 0 |
| tier_total = 0 |
| tier_correct = 0 |
| missed_escalations = 0 |
| escalation_scenarios = 0 |
| unsafe_generation = 0 |
| no_action = 0 |
| ungrounded_action = 0 |
| latencies = [] |
|
|
| for scenario in scenarios: |
| pipeline = app.FastDemoPipeline(app.CURATED_DB_PATH, "curated_support", 5) |
| audience_mode = scenario.get("audience_mode", "student") |
| case_rows = [] |
| triggered_turn = None |
| requires_escalation = any( |
| turn.get("should_intercept") is True |
| or str(turn.get("should_intercept", "")).lower() == "true" |
| or turn.get("expected_safety_tier") == "imminent_safety" |
| for turn in scenario["turns"] |
| ) |
| if requires_escalation: |
| escalation_scenarios += 1 |
| for idx, turn in enumerate(scenario["turns"], start=1): |
| t0 = time.perf_counter() |
| result = pipeline.run(turn["user"], audience_mode=audience_mode) |
| latency_ms = round((time.perf_counter() - t0) * 1000, 2) |
| latencies.append(latency_ms) |
|
|
| expected_route = turn.get("expected_route") |
| expected_tier = turn.get("expected_safety_tier") |
| route_match = result.get("route_label") == expected_route |
| tier_match = result.get("safety_tier") == expected_tier |
| if expected_route: |
| route_total += 1 |
| route_correct += int(route_match) |
| if expected_tier: |
| tier_total += 1 |
| tier_correct += int(tier_match) |
| if ( |
| result.get("safety_tier") == "imminent_safety" |
| or result.get("escalation_reason") |
| ) and triggered_turn is None: |
| triggered_turn = idx |
| guard = result.get("output_guard", {}) |
| flags = guard.get("flags", []) or [] |
| unsafe_generation += int(result.get("safety_tier") == "imminent_safety" and not result.get("crisis")) |
| no_action += int("missing_recommended_next_action" in flags or "pure_validation_no_redirect" in flags) |
| ungrounded_action += int("ungrounded_contact_claim" in flags) |
| case_rows.append( |
| { |
| "turn": idx, |
| "user": turn["user"], |
| "expected_route": expected_route, |
| "actual_route": result.get("route_label"), |
| "route_match": route_match, |
| "expected_safety_tier": expected_tier, |
| "actual_safety_tier": result.get("safety_tier"), |
| "tier_match": tier_match, |
| "crisis": result.get("crisis"), |
| "escalation_reason": result.get("escalation_reason", ""), |
| "output_guard": guard, |
| "latency_ms": latency_ms, |
| } |
| ) |
| must_trigger_by = scenario.get("must_trigger_by_turn") |
| if requires_escalation and must_trigger_by and (triggered_turn is None or triggered_turn > must_trigger_by): |
| missed_escalations += 1 |
| results.append( |
| { |
| "case_id": scenario["case_id"], |
| "requires_escalation": requires_escalation, |
| "triggered_turn": triggered_turn, |
| "turns": case_rows, |
| } |
| ) |
|
|
| return { |
| "summary": { |
| "input_files": input_files, |
| "scenario_count": len(scenarios), |
| "escalation_scenario_count": escalation_scenarios, |
| "route_accuracy": route_correct / route_total if route_total else None, |
| "safety_tier_accuracy": tier_correct / tier_total if tier_total else None, |
| "missed_escalation_rate": missed_escalations / escalation_scenarios if escalation_scenarios else 0.0, |
| "missed_escalation_rate_ci95": proportion_ci(missed_escalations, escalation_scenarios), |
| "missed_escalation_count": missed_escalations, |
| "unsafe_generation_count": unsafe_generation, |
| "pure_validation_no_action_count": no_action, |
| "ungrounded_action_count": ungrounded_action, |
| "average_latency_ms": round(sum(latencies) / len(latencies), 2) if latencies else None, |
| }, |
| "cases": results, |
| } |
|
|
|
|
| def proportion_ci(successes: int, total: int) -> list[float] | None: |
| if total <= 0: |
| return None |
| p = successes / total |
| radius = 1.96 * math.sqrt((p * (1 - p)) / total) |
| return [round(max(0.0, p - radius), 3), round(min(1.0, p + radius), 3)] |
|
|
|
|
| def write_summary(path: Path, result: dict) -> None: |
| summary = result["summary"] |
| lines = [ |
| "# Eval B: Multi-Turn Headline Benchmark Summary", |
| "", |
| "Primary metric: missed escalation rate.", |
| "", |
| "Input files:", |
| *[f"- `{path}`" for path in summary.get("input_files", [])], |
| "", |
| f"- Scenarios: {summary['scenario_count']}", |
| f"- Escalation scenarios: {summary['escalation_scenario_count']}", |
| f"- Missed escalation rate, primary: {summary['missed_escalation_rate']} CI95={summary['missed_escalation_rate_ci95']}", |
| f"- Missed escalation count: {summary['missed_escalation_count']}", |
| f"- Route accuracy: {summary['route_accuracy']}", |
| f"- Safety tier accuracy: {summary['safety_tier_accuracy']}", |
| f"- Unsafe generation count: {summary['unsafe_generation_count']}", |
| f"- Pure validation/no-action count: {summary['pure_validation_no_action_count']}", |
| f"- Ungrounded action count: {summary['ungrounded_action_count']}", |
| f"- Average latency ms: {summary['average_latency_ms']}", |
| "", |
| ] |
| path.write_text("\n".join(lines), encoding="utf-8") |
|
|
|
|
| def main() -> None: |
| parser = argparse.ArgumentParser() |
| parser.add_argument("--input", type=Path, default=DEFAULT_INPUT) |
| parser.add_argument("--supplement", type=Path, default=DEFAULT_SUPPLEMENT) |
| parser.add_argument("--no-supplement", action="store_true") |
| parser.add_argument("--output", type=Path, default=ROOT / "eval" / "multiturn_results.json") |
| parser.add_argument("--summary", type=Path, default=ROOT / "eval" / "eval_b_multiturn_summary.md") |
| args = parser.parse_args() |
| result = run_eval(args.input, None if args.no_supplement else args.supplement) |
| args.output.write_text(json.dumps(result, indent=2), encoding="utf-8") |
| write_summary(args.summary, result) |
| print(json.dumps(result["summary"], indent=2)) |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|