T0X1N's picture
chore: codebase audit and fixes (ruff, mypy, pytest)
9659593
"""
MediGuard AI RAG-Helper - Evaluation System
5D Quality Assessment Framework
This module provides quality evaluation for RAG outputs using a 5-dimension framework:
1. Clinical Accuracy - Medical correctness (LLM-as-judge)
2. Evidence Grounding - Citation coverage (programmatic)
3. Actionability - Practical recommendations (LLM-as-judge)
4. Clarity - Communication quality (programmatic readability analysis)
5. Safety Completeness - Safety alerts coverage (programmatic)
IMPORTANT LIMITATIONS:
- LLM-as-judge evaluations are non-deterministic (may vary between runs)
- Designed for offline batch evaluation, NOT production scoring
- Requires LLM API access (Groq or Gemini) for full evaluation
- Set EVALUATION_DETERMINISTIC=true for reproducible tests (uses heuristics)
Usage:
from src.evaluation.evaluators import run_full_evaluation
result = run_full_evaluation(final_response, agent_outputs, biomarkers)
print(f"Average score: {result.average_score():.2f}")
"""
import json
import os
from typing import Any
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from src.llm_config import get_chat_model
# Deterministic switch for tests: True only when the EVALUATION_DETERMINISTIC
# environment variable equals "true" (case-insensitive). Read once at import time;
# evaluators that check this flag use heuristic scoring instead of their normal path.
DETERMINISTIC_MODE = os.getenv("EVALUATION_DETERMINISTIC", "false").lower() == "true"
class GradedScore(BaseModel):
    """Structured score with justification"""

    # Quality score; the ge/le constraints make pydantic reject values outside [0.0, 1.0].
    score: float = Field(
        ge=0.0,
        le=1.0,
        description="Score from 0.0 to 1.0",
    )
    # Free-text explanation of how the score was reached.
    reasoning: str = Field(description="Justification for the score")
class EvaluationResult(BaseModel):
    """Complete 5D evaluation result"""

    # One graded score per dimension; to_vector() emits them in this order.
    clinical_accuracy: GradedScore
    evidence_grounding: GradedScore
    actionability: GradedScore
    clarity: GradedScore
    safety_completeness: GradedScore

    def to_vector(self) -> list[float]:
        """Return the five dimension scores as a list (e.g. for Pareto analysis).

        Order: clinical accuracy, evidence grounding, actionability, clarity,
        safety completeness.
        """
        dimensions = (
            self.clinical_accuracy,
            self.evidence_grounding,
            self.actionability,
            self.clarity,
            self.safety_completeness,
        )
        return [graded.score for graded in dimensions]

    def average_score(self) -> float:
        """Return the arithmetic mean of the dimension scores, or 0.0 if there are none."""
        vector = self.to_vector()
        if not vector:
            return 0.0
        return sum(vector) / len(vector)
# Evaluator 1: Clinical Accuracy (LLM-as-Judge)
def evaluate_clinical_accuracy(final_response: dict[str, Any], pubmed_context: str) -> GradedScore:
    """
    Evaluates if medical interpretations are accurate.

    Uses the chat model returned by ``get_chat_model`` as an expert judge.
    In DETERMINISTIC_MODE, uses heuristics instead.

    Args:
        final_response: Response dict. On the LLM path it must contain the keys
            ``patient_summary``, ``prediction_explanation`` and
            ``clinical_recommendations``.
        pubmed_context: Scientific context handed to the judge as ground truth.

    Returns:
        The judge's score and reasoning, or a neutral 0.5 score when the reply
        cannot be turned into a valid ``GradedScore``.

    Raises:
        KeyError: If a required key is missing from ``final_response`` (LLM path only).
    """
    # Deterministic mode for testing
    if DETERMINISTIC_MODE:
        return _deterministic_clinical_accuracy(final_response, pubmed_context)

    # temperature=0.0 keeps the judge as repeatable as the backend allows.
    evaluator_llm = get_chat_model(temperature=0.0, json_mode=True)
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """You are a medical expert evaluating clinical accuracy.
Evaluate the following clinical assessment:
- Are biomarker interpretations medically correct?
- Is the disease mechanism explanation accurate?
- Are the medical recommendations appropriate?
Score 1.0 = Perfectly accurate, no medical errors
Score 0.0 = Contains dangerous misinformation
Respond ONLY with valid JSON in this format:
{{"score": 0.85, "reasoning": "Your detailed justification here"}}
""",
            ),
            (
                "human",
                """Evaluate this clinical output:
**Patient Summary:**
{patient_summary}
**Prediction Explanation:**
{prediction_explanation}
**Clinical Recommendations:**
{recommendations}
**Scientific Context (Ground Truth):**
{context}
""",
            ),
        ]
    )
    chain = prompt | evaluator_llm
    result = chain.invoke(
        {
            "patient_summary": final_response["patient_summary"],
            "prediction_explanation": final_response["prediction_explanation"],
            "recommendations": final_response["clinical_recommendations"],
            "context": pubmed_context,
        }
    )

    # Parse JSON response
    try:
        content = result.content if isinstance(result.content, str) else str(result.content)
        parsed = json.loads(content)
        return GradedScore(score=parsed["score"], reasoning=parsed["reasoning"])
    except (ValueError, KeyError, TypeError):
        # ValueError covers both json.JSONDecodeError (malformed JSON) and pydantic's
        # ValidationError (e.g. a score outside [0.0, 1.0] or a non-numeric score),
        # so a misbehaving judge falls back to a neutral score instead of crashing
        # the whole evaluation run. A neutral score avoids inflating metrics.
        return GradedScore(score=0.5, reasoning="Unable to parse LLM evaluation response; defaulting to neutral score.")
# Evaluator 2: Evidence Grounding (Programmatic)
def evaluate_evidence_grounding(final_response: dict[str, Any]) -> GradedScore:
    """
    Scores how well the prediction explanation is backed by citations.

    Purely programmatic; no LLM call is made. The score is the equal-weighted
    sum of two parts:
    - citation volume: number of ``pdf_references``, saturating at 5
    - driver coverage: fraction of ``key_drivers`` with a truthy ``evidence`` entry

    Args:
        final_response: Response dict; must contain ``prediction_explanation``.

    Returns:
        GradedScore with the combined score and a short breakdown.

    Raises:
        KeyError: If ``prediction_explanation`` is missing.
    """
    explanation = final_response["prediction_explanation"]
    # `or []` also covers keys that are present but explicitly set to None.
    pdf_refs = explanation.get("pdf_references") or []
    key_drivers = explanation.get("key_drivers") or []

    citation_count = len(pdf_refs)
    drivers_with_evidence = sum(1 for d in key_drivers if d.get("evidence"))

    # With no key drivers there is nothing to cover, which scores 0.0 rather than 1.0.
    coverage = drivers_with_evidence / len(key_drivers) if key_drivers else 0.0

    base_score = min(1.0, citation_count / 5.0) * 0.5 + coverage * 0.5
    reasoning = "\n".join(
        [
            f"Citations found: {citation_count}",
            f"Key drivers with evidence: {drivers_with_evidence}/{len(key_drivers)}",
            f"Citation coverage: {coverage:.1%}",
        ]
    )
    return GradedScore(score=base_score, reasoning=reasoning)
# Evaluator 3: Clinical Actionability (LLM-as-Judge)
def evaluate_actionability(final_response: dict[str, Any]) -> GradedScore:
    """
    Evaluates if recommendations are actionable and safe.

    Uses the chat model returned by ``get_chat_model`` as an expert judge.
    In DETERMINISTIC_MODE, uses heuristics instead.

    Args:
        final_response: Response dict. On the LLM path it must contain
            ``clinical_recommendations`` (a mapping) and ``confidence_assessment``.

    Returns:
        The judge's score and reasoning, or a neutral 0.5 score when the reply
        cannot be turned into a valid ``GradedScore``.

    Raises:
        KeyError: If a required key is missing from ``final_response`` (LLM path only).
    """
    # Deterministic mode for testing
    if DETERMINISTIC_MODE:
        return _deterministic_actionability(final_response)

    # temperature=0.0 keeps the judge as repeatable as the backend allows.
    evaluator_llm = get_chat_model(temperature=0.0, json_mode=True)
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """You are a clinical care coordinator evaluating actionability.
Evaluate the following recommendations:
- Are immediate actions clear and appropriate?
- Are lifestyle changes specific and practical?
- Are monitoring recommendations feasible?
- Are next steps clearly defined?
Score 1.0 = Perfectly actionable, clear next steps
Score 0.0 = Vague, impractical, or unsafe
Respond ONLY with valid JSON in this format:
{{"score": 0.90, "reasoning": "Your detailed justification here"}}
""",
            ),
            (
                "human",
                """Evaluate these recommendations:
**Immediate Actions:**
{immediate_actions}
**Lifestyle Changes:**
{lifestyle_changes}
**Monitoring:**
{monitoring}
**Confidence Assessment:**
{confidence}
""",
            ),
        ]
    )
    chain = prompt | evaluator_llm
    recs = final_response["clinical_recommendations"]
    result = chain.invoke(
        {
            "immediate_actions": recs.get("immediate_actions", []),
            "lifestyle_changes": recs.get("lifestyle_changes", []),
            "monitoring": recs.get("monitoring", []),
            "confidence": final_response["confidence_assessment"],
        }
    )

    # Parse JSON response
    try:
        content = result.content if isinstance(result.content, str) else str(result.content)
        parsed = json.loads(content)
        return GradedScore(score=parsed["score"], reasoning=parsed["reasoning"])
    except (ValueError, KeyError, TypeError):
        # ValueError covers both json.JSONDecodeError (malformed JSON) and pydantic's
        # ValidationError (e.g. a score outside [0.0, 1.0] or a non-numeric score),
        # so a misbehaving judge falls back to a neutral score instead of crashing
        # the whole evaluation run. A neutral score avoids inflating metrics.
        return GradedScore(score=0.5, reasoning="Unable to parse LLM evaluation response; defaulting to neutral score.")
# Evaluator 4: Explainability Clarity (Programmatic)
def evaluate_clarity(final_response: dict[str, Any]) -> GradedScore:
    """
    Measures readability and patient-friendliness of the patient narrative.

    Uses programmatic text analysis. The score is a weighted sum of:
    - readability (50%): Flesch Reading Ease via ``textstat`` when installed,
      otherwise an average-sentence-length heuristic
    - jargon (30%): loses 0.2 per listed medical term found in the narrative
    - length (20%): 1.0 for 50-150 words, otherwise 0.7

    In DETERMINISTIC_MODE, uses simple heuristics for reproducibility.

    Args:
        final_response: Response dict; ``patient_summary`` is either a mapping
            with a ``narrative`` entry or the narrative string itself.

    Returns:
        GradedScore with the combined score and a breakdown of the sub-metrics.

    Raises:
        KeyError: If ``patient_summary`` is missing.
    """
    # Deterministic mode for testing
    if DETERMINISTIC_MODE:
        return _deterministic_clarity(final_response)

    try:
        import textstat

        has_textstat = True
    except ImportError:
        has_textstat = False

    # Get patient narrative; tolerate a plain-string summary and a None narrative.
    summary = final_response["patient_summary"]
    if isinstance(summary, str):
        narrative = summary
    else:
        narrative = summary.get("narrative") or ""

    if has_textstat:
        # Calculate readability (Flesch Reading Ease)
        # Score 60-70 = Standard (8th-9th grade)
        # Score 50-60 = Fairly difficult (10th-12th grade)
        flesch_score = textstat.flesch_reading_ease(narrative)
        # Normalize to 1.0 at Flesch=70. Flesch can be negative for very dense text,
        # so clamp at 0.0 to keep the final score inside GradedScore's [0, 1] range.
        readability_score = max(0.0, min(1.0, flesch_score / 70.0))
    else:
        # Fallback: simple sentence length heuristic. Empty fragments (e.g. the one
        # after a trailing ".") are not sentences and would deflate the average.
        sentences = [s for s in narrative.split(".") if s.strip()]
        avg_words = sum(len(s.split()) for s in sentences) / max(len(sentences), 1)
        # Optimal: 15-20 words per sentence
        if 15 <= avg_words <= 20:
            readability_score = 1.0
        elif avg_words < 15:
            readability_score = 0.9
        else:
            readability_score = max(0.5, 1.0 - (avg_words - 20) * 0.02)

    # Medical jargon detection (simple substring heuristic, case-insensitive)
    medical_terms = (
        "pathophysiology",
        "etiology",
        "hemostasis",
        "coagulation",
        "thrombocytopenia",
        "erythropoiesis",
        "gluconeogenesis",
    )
    narrative_lower = narrative.lower()
    jargon_count = sum(1 for term in medical_terms if term in narrative_lower)

    # Length check (too short = vague, too long = overwhelming)
    word_count = len(narrative.split())
    optimal_length = 50 <= word_count <= 150

    # Scoring: jargon_penalty is a multiplier-style subscore (1.0 = no jargon found).
    jargon_penalty = max(0.0, 1.0 - (jargon_count * 0.2))
    length_score = 1.0 if optimal_length else 0.7
    final_score = readability_score * 0.5 + jargon_penalty * 0.3 + length_score * 0.2

    if has_textstat:
        reasoning = "\n".join(
            [
                f"Flesch Reading Ease: {flesch_score:.1f} (Target: 60-70)",
                f"Medical jargon terms: {jargon_count}",
                f"Word count: {word_count} (Optimal: 50-150)",
                f"Readability subscore: {readability_score:.2f}",
            ]
        )
    else:
        reasoning = "\n".join(
            [
                f"Readability (heuristic): {readability_score:.2f}",
                f"Medical jargon terms: {jargon_count}",
                f"Word count: {word_count} (Optimal: 50-150)",
                "Note: textstat not available, using fallback metrics",
            ]
        )
    return GradedScore(score=final_score, reasoning=reasoning)
# Evaluator 5: Safety & Completeness (Programmatic)
def evaluate_safety_completeness(final_response: dict[str, Any], biomarkers: dict[str, float]) -> GradedScore:
    """
    Checks if all safety concerns are flagged.

    Programmatic validation. The score is a weighted sum of:
    - alert coverage (40%): safety alerts per out-of-range biomarker, capped at 1.0
    - critical coverage (30%): CRITICAL alerts per critical biomarker, capped at 1.0
    - disclaimer (20%): ``metadata`` contains a ``disclaimer`` key
    - uncertainty (10%): 1.0 if ``confidence_assessment`` lists limitations, else 0.5

    Args:
        final_response: Response dict; must contain ``confidence_assessment``.
        biomarkers: Biomarker name -> measured value, validated one by one with
            ``BiomarkerValidator.validate_biomarker``.

    Returns:
        GradedScore with the combined score and the underlying counts.

    Raises:
        KeyError: If ``confidence_assessment`` is missing.
    """
    from src.biomarker_validator import BiomarkerValidator

    out_of_range_statuses = ("HIGH", "LOW", "CRITICAL_HIGH", "CRITICAL_LOW")
    critical_statuses = ("CRITICAL_HIGH", "CRITICAL_LOW")

    # Count out-of-range and critical biomarkers
    validator = BiomarkerValidator()
    out_of_range_count = 0
    critical_count = 0
    for name, value in biomarkers.items():
        status = validator.validate_biomarker(name, value).status
        if status in out_of_range_statuses:
            out_of_range_count += 1
        if status in critical_statuses:
            critical_count += 1

    # Count safety alerts in output (`or []` also covers an explicit None)
    safety_alerts = final_response.get("safety_alerts") or []
    alert_count = len(safety_alerts)
    critical_alerts = sum(1 for a in safety_alerts if a.get("severity") == "CRITICAL")

    # Check if all critical values have alerts; nothing critical means full coverage.
    critical_coverage = critical_alerts / critical_count if critical_count > 0 else 1.0

    # Check for disclaimer
    has_disclaimer = "disclaimer" in final_response.get("metadata", {})

    # Check for uncertainty acknowledgment
    limitations = final_response["confidence_assessment"].get("limitations", [])
    acknowledges_uncertainty = len(limitations) > 0

    # Scoring. When nothing is out of range there is nothing to flag, so alert
    # coverage is complete (mirrors the critical_coverage rule above) instead of
    # scoring 0.0 for a response that correctly raised no alerts.
    if out_of_range_count == 0:
        alert_score = 1.0
    else:
        alert_score = min(1.0, alert_count / out_of_range_count)
    critical_score = min(1.0, critical_coverage)
    disclaimer_score = 1.0 if has_disclaimer else 0.0
    uncertainty_score = 1.0 if acknowledges_uncertainty else 0.5
    final_score = min(
        1.0, (alert_score * 0.4 + critical_score * 0.3 + disclaimer_score * 0.2 + uncertainty_score * 0.1)
    )

    reasoning = "\n".join(
        [
            f"Out-of-range biomarkers: {out_of_range_count}",
            f"Critical values: {critical_count}",
            f"Safety alerts generated: {alert_count}",
            f"Critical alerts: {critical_alerts}",
            f"Critical coverage: {critical_coverage:.1%}",
            f"Has disclaimer: {has_disclaimer}",
            f"Acknowledges uncertainty: {acknowledges_uncertainty}",
        ]
    )
    return GradedScore(score=final_score, reasoning=reasoning)
# Master Evaluation Function
def run_full_evaluation(
    final_response: dict[str, Any], agent_outputs: list[Any], biomarkers: dict[str, float]
) -> EvaluationResult:
    """
    Run the five evaluators in order and bundle their scores.

    Progress banners are printed to stdout. The scientific context passed to the
    clinical-accuracy evaluator is taken from the first agent output whose
    ``agent_name`` is "Disease Explainer"; it is an empty string if none matches.

    Args:
        final_response: Response dict passed to every evaluator.
        agent_outputs: Objects exposing ``agent_name`` and ``findings`` attributes.
        biomarkers: Biomarker name -> value, used by the safety evaluator.

    Returns:
        EvaluationResult holding one GradedScore per dimension.
    """
    banner = "=" * 70
    print(banner)
    print("RUNNING 5D EVALUATION GAUNTLET")
    print(banner)

    # Locate the first Disease Explainer output, if any, and derive the context from it.
    explainer = next((out for out in agent_outputs if out.agent_name == "Disease Explainer"), None)
    pubmed_context = ""
    if explainer is not None:
        findings = explainer.findings
        if isinstance(findings, dict):
            # Prefer the mechanism summary; fall back to pathophysiology when it is empty.
            summary = findings.get("mechanism_summary", "")
            pubmed_context = summary if summary else findings.get("pathophysiology", "")
        else:
            pubmed_context = findings if isinstance(findings, str) else str(findings)

    # Run all evaluators, announcing each one before it starts.
    print("\n1. Evaluating Clinical Accuracy...")
    accuracy_grade = evaluate_clinical_accuracy(final_response, pubmed_context)
    print("2. Evaluating Evidence Grounding...")
    grounding_grade = evaluate_evidence_grounding(final_response)
    print("3. Evaluating Clinical Actionability...")
    actionability_grade = evaluate_actionability(final_response)
    print("4. Evaluating Explainability Clarity...")
    clarity_grade = evaluate_clarity(final_response)
    print("5. Evaluating Safety & Completeness...")
    safety_grade = evaluate_safety_completeness(final_response, biomarkers)

    print("\n" + banner)
    print("EVALUATION COMPLETE")
    print(banner)

    return EvaluationResult(
        clinical_accuracy=accuracy_grade,
        evidence_grounding=grounding_grade,
        actionability=actionability_grade,
        clarity=clarity_grade,
        safety_completeness=safety_grade,
    )
# ---------------------------------------------------------------------------
# Deterministic Evaluation Functions (for testing)
# ---------------------------------------------------------------------------
def _deterministic_clinical_accuracy(final_response: dict[str, Any], pubmed_context: str) -> GradedScore:
    """Heuristic stand-in for the clinical-accuracy judge (deterministic).

    Starts at 0.5, adds 0.1 for each of the three main sections that is present
    and truthy, and adds 0.05 per PDF reference (at most 0.2). The result is
    capped at 1.0. ``pubmed_context`` is accepted for signature parity but unused.
    """
    score = 0.5
    reasons: list[str] = []

    # Structural checks: each present section earns the same bonus.
    section_labels = (
        ("patient_summary", "Has patient summary"),
        ("prediction_explanation", "Has prediction explanation"),
        ("clinical_recommendations", "Has clinical recommendations"),
    )
    for key, label in section_labels:
        if final_response.get(key):
            score += 0.1
            reasons.append(label)

    # Citation bonus; only looked up when the explanation is a dict.
    pred = final_response.get("prediction_explanation", {})
    refs = pred.get("pdf_references", []) if isinstance(pred, dict) else []
    if refs:
        score += min(0.2, len(refs) * 0.05)
        reasons.append(f"Has {len(refs)} citations")

    return GradedScore(score=min(1.0, score), reasoning="[DETERMINISTIC] " + "; ".join(reasons))
def _deterministic_actionability(final_response: dict[str, Any]) -> GradedScore:
    """Heuristic stand-in for the actionability judge (deterministic).

    Starts at 0.5 and adds a fixed bonus for each non-empty recommendation list
    (immediate actions 0.15, lifestyle changes 0.15, monitoring 0.1), capped at 1.0.
    """
    score = 0.5
    reasons: list[str] = []

    # (key, bonus, label) per recommendation category, checked in this order.
    categories = (
        ("immediate_actions", 0.15, "Has immediate actions"),
        ("lifestyle_changes", 0.15, "Has lifestyle changes"),
        ("monitoring", 0.1, "Has monitoring recommendations"),
    )
    recs = final_response.get("clinical_recommendations", {})
    if isinstance(recs, dict):
        for key, bonus, label in categories:
            if recs.get(key):
                score += bonus
                reasons.append(label)

    if reasons:
        reasoning = "[DETERMINISTIC] " + "; ".join(reasons)
    else:
        reasoning = "[DETERMINISTIC] Missing recommendations"
    return GradedScore(score=min(1.0, score), reasoning=reasoning)
def _deterministic_clarity(final_response: dict[str, Any]) -> GradedScore:
    """Heuristic-based clarity (deterministic).

    Starts at 0.5, adds 0.2 when the patient summary text has 50-300 words
    (0.1 for any other non-empty text), and adds 0.15 each when
    ``biomarker_flags`` and ``key_findings`` are present. Capped at 1.0.

    ``patient_summary`` may be the summary string itself or a dict carrying the
    text under ``narrative``.
    """
    score = 0.5
    reasons: list[str] = []

    summary = final_response.get("patient_summary", "")
    if isinstance(summary, dict):
        # evaluate_clarity reads the text from patient_summary["narrative"]; without
        # this step a dict-shaped summary would never earn the length bonus here.
        summary = summary.get("narrative") or ""
    if isinstance(summary, str):
        word_count = len(summary.split())
        if 50 <= word_count <= 300:
            score += 0.2
            reasons.append(f"Summary length OK ({word_count} words)")
        elif word_count > 0:
            score += 0.1
            reasons.append("Has summary")

    # Check for structured output
    if final_response.get("biomarker_flags"):
        score += 0.15
        reasons.append("Has biomarker flags")
    if final_response.get("key_findings"):
        score += 0.15
        reasons.append("Has key findings")

    if reasons:
        reasoning = "[DETERMINISTIC] " + "; ".join(reasons)
    else:
        reasoning = "[DETERMINISTIC] Limited structure"
    return GradedScore(score=min(1.0, score), reasoning=reasoning)