6.C395-chatbot / test_hallucination.py
gabimimi's picture
Changes mainly to the evaluation script and chat.py, which increased the score from 0.66 to 0.89. Also added evaluation methods for hallucinations and other checks, but they could only be run once due to a lack of credits. The recorded Response Quality was 0.882. Disclaimer: responses now take longer to generate.
740774d
#!/usr/bin/env python
"""Quick test of hallucination detection with new extraction logic."""
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent))
from src.chat import Chatbot
from src.facilities import load_facilities
def _extract_facility_names_from_text(text: str) -> list[str]:
"""Extract facility names from numbered lists only (e.g. '1. Facility Name β€”')."""
if not text:
return []
names = set()
# Only match clearly numbered items: "1. **Facility Name**" or "1. Facility Name β€”"
lines = text.split('\n')
for line in lines:
# Match: "1. **Name**" or "1. Name β€”" or "1. Name." at start of line
m = re.match(r"^\s*\d+\.\s*\*?\*?([A-Z][^β€”\*\n]*?)(?:\*?\*?|β€”|\s*$)", line.strip())
if m:
cand = m.group(1).strip()
# Only include if it looks like a proper facility name (3+ words or has typical facility name patterns)
words = cand.split()
if len(cand) > 10 and len(words) >= 2:
names.add(cand)
return list(names)
# Build the set of known facility names (trimmed, lower-cased) from the
# "facility_name" field of every row returned by load_facilities().
df = load_facilities()
names_ok = {
    str(raw_name).strip().lower()
    for raw_name in (row.get("facility_name") for _, row in df.iterrows())
    if raw_name and str(raw_name).strip()
}
print(f"Loaded {len(names_ok)} facility names from database\n")

# Send a single test message to the chatbot with an empty history and a
# fresh conversation state.
chatbot = Chatbot()
test_msg = "I need outpatient treatment in Boston with Medicaid."
initial_state = {"criteria": {}, "last_results": [], "last_facility_detail": None}
print(f"Testing: {test_msg}")
print("-" * 60)
reply, state = chatbot.get_response(test_msg, [], initial_state)
print("CHATBOT RESPONSE:")
print(reply)
print("\n" + "=" * 60)

# Pull the candidate facility names out of the reply.
extracted = _extract_facility_names_from_text(reply)
print(f"\nEXTRACTED FACILITY NAMES: {extracted}")

# A name counts as real when it equals a known name, is contained in one,
# or contains one; anything else is flagged as hallucinated.
hallucinated = False
for name in extracted:
    needle = name.lower()
    is_real = needle in names_ok or any(
        needle in known or known in needle for known in names_ok
    )
    print(f" '{name}' -> Real: {is_real}")
    hallucinated = hallucinated or not is_real

print(f"\nHALLUCINATED: {'YES' if hallucinated else 'NO'}")
print(f"Result count: {len(state.get('last_results', []))}")