Agentic-RagBot / src /evaluation /evaluators.py
Nikhil Pravin Pise
Production Upgrade v2.0: SSE streaming, HIPAA compliance, Gradio Q&A UI
3ca1d38
raw
history blame
17.9 kB
"""
MediGuard AI RAG-Helper - Evaluation System
5D Quality Assessment Framework
This module provides quality evaluation for RAG outputs using a 5-dimension framework:
1. Clinical Accuracy - Medical correctness (LLM-as-judge)
2. Evidence Grounding - Citation coverage (programmatic)
3. Actionability - Practical recommendations (LLM-as-judge)
4. Clarity - Communication quality (programmatic readability analysis)
5. Safety Completeness - Safety alerts coverage (programmatic)
IMPORTANT LIMITATIONS:
- LLM-as-judge evaluations are non-deterministic (may vary between runs)
- Designed for offline batch evaluation, NOT production scoring
- Requires LLM API access (Groq or Gemini) for full evaluation
- Set EVALUATION_DETERMINISTIC=true for reproducible tests (uses heuristics)
Usage:
from src.evaluation.evaluators import run_full_evaluation
result = run_full_evaluation(final_response, agent_outputs, biomarkers)
print(f"Average score: {result.average_score():.2f}")
"""
import os
from pydantic import BaseModel, Field
from typing import Dict, Any, List
import json
from langchain_core.prompts import ChatPromptTemplate
from src.llm_config import get_chat_model
# Deterministic-evaluation switch (intended for tests). It is True only when the
# EVALUATION_DETERMINISTIC environment variable equals "true", case-insensitively;
# an unset variable behaves like "false".
DETERMINISTIC_MODE = os.getenv("EVALUATION_DETERMINISTIC", "false").lower() == "true"
class GradedScore(BaseModel):
    """One dimension's grade: a bounded numeric score plus its rationale."""

    # Numeric grade; the ge/le constraints restrict it to the inclusive 0.0-1.0 range.
    score: float = Field(ge=0.0, le=1.0, description="Score from 0.0 to 1.0")
    # Free-text explanation of how the score was reached.
    reasoning: str = Field(description="Justification for the score")
class EvaluationResult(BaseModel):
    """Bundle of the five per-dimension grades produced by one evaluation run."""

    clinical_accuracy: GradedScore
    evidence_grounding: GradedScore
    actionability: GradedScore
    clarity: GradedScore
    safety_completeness: GradedScore

    def to_vector(self) -> List[float]:
        """Return the five scores, in field-declaration order, for Pareto analysis."""
        graded_dimensions = (
            self.clinical_accuracy,
            self.evidence_grounding,
            self.actionability,
            self.clarity,
            self.safety_completeness,
        )
        return [graded.score for graded in graded_dimensions]

    def average_score(self) -> float:
        """Return the arithmetic mean of the five dimension scores."""
        vector = self.to_vector()
        if not vector:
            return 0.0
        return sum(vector) / len(vector)
# Evaluator 1: Clinical Accuracy (LLM-as-Judge)
def evaluate_clinical_accuracy(
    final_response: Dict[str, Any],
    pubmed_context: str
) -> GradedScore:
    """
    Grade the medical correctness of a response with an LLM acting as judge.

    In DETERMINISTIC_MODE the LLM is skipped and the heuristic
    ``_deterministic_clinical_accuracy`` is used instead.

    Args:
        final_response: Response dict; must contain the keys
            ``patient_summary``, ``prediction_explanation`` and
            ``clinical_recommendations`` (they are read by direct subscript).
        pubmed_context: Reference text handed to the judge as ground truth.

    Returns:
        The judge's score and reasoning, or a neutral 0.5 score when the
        judge's reply cannot be turned into a valid ``GradedScore``.

    Raises:
        KeyError: If one of the required ``final_response`` keys is missing
            (only on the LLM path).
    """
    # Deterministic mode for testing
    if DETERMINISTIC_MODE:
        return _deterministic_clinical_accuracy(final_response, pubmed_context)

    # Temperature 0 and JSON mode keep the judge's reply as stable and
    # machine-readable as the backing model allows.
    evaluator_llm = get_chat_model(
        temperature=0.0,
        json_mode=True
    )

    # NOTE: the template text is kept flush-left so no indentation leaks into
    # the prompt that is sent to the model.
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a medical expert evaluating clinical accuracy.
Evaluate the following clinical assessment:
- Are biomarker interpretations medically correct?
- Is the disease mechanism explanation accurate?
- Are the medical recommendations appropriate?
Score 1.0 = Perfectly accurate, no medical errors
Score 0.0 = Contains dangerous misinformation
Respond ONLY with valid JSON in this format:
{{"score": 0.85, "reasoning": "Your detailed justification here"}}
"""),
        ("human", """Evaluate this clinical output:
**Patient Summary:**
{patient_summary}
**Prediction Explanation:**
{prediction_explanation}
**Clinical Recommendations:**
{recommendations}
**Scientific Context (Ground Truth):**
{context}
""")
    ])

    chain = prompt | evaluator_llm
    result = chain.invoke({
        "patient_summary": final_response['patient_summary'],
        "prediction_explanation": final_response['prediction_explanation'],
        "recommendations": final_response['clinical_recommendations'],
        "context": pubmed_context
    })

    # Parse JSON response
    try:
        content = result.content if isinstance(result.content, str) else str(result.content)
        parsed = json.loads(content)
        return GradedScore(score=parsed['score'], reasoning=parsed['reasoning'])
    except (ValueError, KeyError, TypeError):
        # ValueError covers both json.JSONDecodeError and pydantic's
        # ValidationError, so a reply with an out-of-range or non-numeric score
        # falls back here instead of aborting the whole evaluation.
        # Use a conservative score to avoid inflating metrics.
        return GradedScore(score=0.5, reasoning="Unable to parse LLM evaluation response; defaulting to neutral score.")
# Evaluator 2: Evidence Grounding (Programmatic)
def evaluate_evidence_grounding(
    final_response: Dict[str, Any]
) -> GradedScore:
    """
    Score how well the prediction explanation is backed by citations.

    The score is the equally weighted sum of two programmatic checks on
    ``final_response['prediction_explanation']``:

    * citation volume: number of ``pdf_references``, saturating at 5;
    * driver coverage: fraction of ``key_drivers`` with a truthy ``evidence``
      entry (0.0 when there are no key drivers).

    Raises:
        KeyError: If ``prediction_explanation`` is missing from the response.
    """
    references = final_response['prediction_explanation'].get('pdf_references', [])
    drivers = final_response['prediction_explanation'].get('key_drivers', [])

    citation_count = len(references)
    driver_total = len(drivers)
    supported_drivers = len([driver for driver in drivers if driver.get('evidence')])

    # Fraction of key drivers that cite evidence; no drivers means no coverage.
    coverage = supported_drivers / driver_total if driver_total > 0 else 0.0

    # Half of the score comes from citation volume, half from driver coverage.
    base_score = min(1.0, citation_count / 5.0) * 0.5 + coverage * 0.5

    report = "\n".join([
        f"Citations found: {citation_count}",
        f"Key drivers with evidence: {supported_drivers}/{driver_total}",
        f"Citation coverage: {coverage:.1%}",
    ])
    return GradedScore(score=base_score, reasoning=report)
# Evaluator 3: Clinical Actionability (LLM-as-Judge)
def evaluate_actionability(
    final_response: Dict[str, Any]
) -> GradedScore:
    """
    Grade how actionable the clinical recommendations are, using an LLM judge.

    In DETERMINISTIC_MODE the LLM is skipped and the heuristic
    ``_deterministic_actionability`` is used instead.

    Args:
        final_response: Response dict; must contain ``clinical_recommendations``
            (a mapping read with ``.get``) and ``confidence_assessment``.

    Returns:
        The judge's score and reasoning, or a neutral 0.5 score when the
        judge's reply cannot be turned into a valid ``GradedScore``.

    Raises:
        KeyError: If ``clinical_recommendations`` or ``confidence_assessment``
            is missing (only on the LLM path).
    """
    # Deterministic mode for testing
    if DETERMINISTIC_MODE:
        return _deterministic_actionability(final_response)

    # Temperature 0 and JSON mode keep the judge's reply as stable and
    # machine-readable as the backing model allows.
    evaluator_llm = get_chat_model(
        temperature=0.0,
        json_mode=True
    )

    # NOTE: the template text is kept flush-left so no indentation leaks into
    # the prompt that is sent to the model.
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a clinical care coordinator evaluating actionability.
Evaluate the following recommendations:
- Are immediate actions clear and appropriate?
- Are lifestyle changes specific and practical?
- Are monitoring recommendations feasible?
- Are next steps clearly defined?
Score 1.0 = Perfectly actionable, clear next steps
Score 0.0 = Vague, impractical, or unsafe
Respond ONLY with valid JSON in this format:
{{"score": 0.90, "reasoning": "Your detailed justification here"}}
"""),
        ("human", """Evaluate these recommendations:
**Immediate Actions:**
{immediate_actions}
**Lifestyle Changes:**
{lifestyle_changes}
**Monitoring:**
{monitoring}
**Confidence Assessment:**
{confidence}
""")
    ])

    chain = prompt | evaluator_llm
    recs = final_response['clinical_recommendations']
    result = chain.invoke({
        "immediate_actions": recs.get('immediate_actions', []),
        "lifestyle_changes": recs.get('lifestyle_changes', []),
        "monitoring": recs.get('monitoring', []),
        "confidence": final_response['confidence_assessment']
    })

    # Parse JSON response
    try:
        content = result.content if isinstance(result.content, str) else str(result.content)
        parsed = json.loads(content)
        return GradedScore(score=parsed['score'], reasoning=parsed['reasoning'])
    except (ValueError, KeyError, TypeError):
        # ValueError covers both json.JSONDecodeError and pydantic's
        # ValidationError, so a reply with an out-of-range or non-numeric score
        # falls back here instead of aborting the whole evaluation.
        # Use a conservative score to avoid inflating metrics.
        return GradedScore(score=0.5, reasoning="Unable to parse LLM evaluation response; defaulting to neutral score.")
# Evaluator 4: Explainability Clarity (Programmatic)
def evaluate_clarity(
    final_response: Dict[str, Any]
) -> GradedScore:
    """
    Measure readability and patient-friendliness of the patient narrative.

    Purely programmatic. The score combines three sub-scores computed from
    ``final_response['patient_summary']['narrative']``:

    * readability (weight 0.5): Flesch Reading Ease via ``textstat`` when it
      is installed, otherwise an average-sentence-length heuristic;
    * jargon (weight 0.3): 0.2 is deducted per listed medical term found;
    * length (weight 0.2): 1.0 for 50-150 words, 0.7 otherwise.

    In DETERMINISTIC_MODE the heuristic ``_deterministic_clarity`` is used
    instead.

    Raises:
        KeyError: If ``patient_summary`` is missing (only outside
            DETERMINISTIC_MODE).
    """
    # Deterministic mode for testing
    if DETERMINISTIC_MODE:
        return _deterministic_clarity(final_response)

    # textstat is optional; fall back to a simple heuristic when it is absent.
    try:
        import textstat
        has_textstat = True
    except ImportError:
        has_textstat = False

    # Get patient narrative
    narrative = final_response['patient_summary'].get('narrative', '')

    if has_textstat:
        # Calculate readability (Flesch Reading Ease)
        # Score 60-70 = Standard (8th-9th grade)
        # Score 50-60 = Fairly difficult (10th-12th grade)
        flesch_score = textstat.flesch_reading_ease(narrative)
        # Normalize to 1.0 at Flesch=70. Flesch Reading Ease can be negative
        # for very dense text, so clamp the lower bound as well; otherwise the
        # final score could fall below 0 and fail GradedScore validation.
        readability_score = max(0.0, min(1.0, flesch_score / 70.0))
    else:
        # Fallback: simple sentence length heuristic. Empty fragments (such as
        # the one after the final period) are not sentences and are skipped so
        # they do not drag the average down.
        sentences = [s for s in narrative.split('.') if s.strip()]
        avg_words = sum(len(s.split()) for s in sentences) / max(len(sentences), 1)
        # Optimal: 15-20 words per sentence
        if 15 <= avg_words <= 20:
            readability_score = 1.0
        elif avg_words < 15:
            readability_score = 0.9
        else:
            readability_score = max(0.5, 1.0 - (avg_words - 20) * 0.02)

    # Medical jargon detection (simple heuristic)
    medical_terms = [
        'pathophysiology', 'etiology', 'hemostasis', 'coagulation',
        'thrombocytopenia', 'erythropoiesis', 'gluconeogenesis'
    ]
    narrative_lower = narrative.lower()
    jargon_count = sum(1 for term in medical_terms if term.lower() in narrative_lower)

    # Length check (too short = vague, too long = overwhelming)
    word_count = len(narrative.split())
    optimal_length = 50 <= word_count <= 150

    # Scoring
    jargon_penalty = max(0.0, 1.0 - (jargon_count * 0.2))
    length_score = 1.0 if optimal_length else 0.7
    weighted = readability_score * 0.5 + jargon_penalty * 0.3 + length_score * 0.2
    # Guard against floating-point drift outside the range GradedScore accepts.
    final_score = min(1.0, max(0.0, weighted))

    if has_textstat:
        report_lines = [
            f"Flesch Reading Ease: {flesch_score:.1f} (Target: 60-70)",
            f"Medical jargon terms: {jargon_count}",
            f"Word count: {word_count} (Optimal: 50-150)",
            f"Readability subscore: {readability_score:.2f}",
        ]
    else:
        report_lines = [
            f"Readability (heuristic): {readability_score:.2f}",
            f"Medical jargon terms: {jargon_count}",
            f"Word count: {word_count} (Optimal: 50-150)",
            "Note: textstat not available, using fallback metrics",
        ]

    return GradedScore(score=final_score, reasoning="\n".join(report_lines))
# Evaluator 5: Safety & Completeness (Programmatic)
def evaluate_safety_completeness(
    final_response: Dict[str, Any],
    biomarkers: Dict[str, float]
) -> GradedScore:
    """
    Check programmatically that safety concerns are flagged in the response.

    Each biomarker is validated with ``BiomarkerValidator.validate_biomarker``
    and the resulting statuses are compared with the response's
    ``safety_alerts``. The score is a weighted sum of:

    * alert coverage (0.4): alerts per out-of-range biomarker, capped at 1.0,
      and 1.0 when no biomarker is out of range;
    * critical coverage (0.3): CRITICAL alerts per critical biomarker, capped
      at 1.0, and 1.0 when no biomarker is critical;
    * disclaimer (0.2): 1.0 when ``metadata`` contains a ``disclaimer`` key;
    * uncertainty (0.1): 1.0 when ``confidence_assessment['limitations']`` is
      non-empty, otherwise 0.5.

    Args:
        final_response: Response dict; must contain ``confidence_assessment``.
        biomarkers: Mapping of biomarker name to measured value.

    Raises:
        KeyError: If ``confidence_assessment`` is missing from the response.
    """
    # Imported here rather than at module level, as in the original code.
    from src.biomarker_validator import BiomarkerValidator

    # Initialize validator
    validator = BiomarkerValidator()

    critical_statuses = ('CRITICAL_HIGH', 'CRITICAL_LOW')
    abnormal_statuses = ('HIGH', 'LOW') + critical_statuses

    # Count out-of-range biomarkers
    out_of_range_count = 0
    critical_count = 0
    for name, value in biomarkers.items():
        result = validator.validate_biomarker(name, value)
        if result.status in abnormal_statuses:
            out_of_range_count += 1
        if result.status in critical_statuses:
            critical_count += 1

    # Count safety alerts in output
    safety_alerts = final_response.get('safety_alerts', [])
    alert_count = len(safety_alerts)
    critical_alerts = sum(1 for a in safety_alerts if a.get('severity') == 'CRITICAL')

    # Check if all critical values have alerts
    critical_coverage = critical_alerts / critical_count if critical_count > 0 else 1.0

    # Check for disclaimer
    has_disclaimer = 'disclaimer' in final_response.get('metadata', {})

    # Check for uncertainty acknowledgment
    limitations = final_response['confidence_assessment'].get('limitations', [])
    acknowledges_uncertainty = len(limitations) > 0

    # Scoring
    if out_of_range_count == 0:
        # Nothing is abnormal, so no alert is required: give full credit,
        # mirroring how critical coverage is treated when nothing is critical.
        alert_score = 1.0
    else:
        alert_score = min(1.0, alert_count / out_of_range_count)
    critical_score = min(1.0, critical_coverage)
    disclaimer_score = 1.0 if has_disclaimer else 0.0
    uncertainty_score = 1.0 if acknowledges_uncertainty else 0.5

    final_score = min(1.0, (
        alert_score * 0.4 +
        critical_score * 0.3 +
        disclaimer_score * 0.2 +
        uncertainty_score * 0.1
    ))

    reasoning = "\n".join([
        f"Out-of-range biomarkers: {out_of_range_count}",
        f"Critical values: {critical_count}",
        f"Safety alerts generated: {alert_count}",
        f"Critical alerts: {critical_alerts}",
        f"Critical coverage: {critical_coverage:.1%}",
        f"Has disclaimer: {has_disclaimer}",
        f"Acknowledges uncertainty: {acknowledges_uncertainty}",
    ])

    return GradedScore(score=final_score, reasoning=reasoning)
# Master Evaluation Function
def run_full_evaluation(
    final_response: Dict[str, Any],
    agent_outputs: List[Any],
    biomarkers: Dict[str, float]
) -> EvaluationResult:
    """
    Run the five evaluators in sequence and bundle their grades.

    The scientific context passed to the clinical-accuracy judge is taken from
    the first agent output whose ``agent_name`` is "Disease Explainer"; when
    no such output exists the context is an empty string. Progress is printed
    to stdout.
    """
    banner = "=" * 70
    print(banner)
    print("RUNNING 5D EVALUATION GAUNTLET")
    print(banner)

    # Locate the first "Disease Explainer" output, if any.
    explainer = next(
        (candidate for candidate in agent_outputs
         if candidate.agent_name == "Disease Explainer"),
        None,
    )

    pubmed_context = ""
    if explainer is not None:
        findings = explainer.findings
        if isinstance(findings, str):
            pubmed_context = findings
        elif isinstance(findings, dict):
            # Prefer the mechanism summary; fall back to pathophysiology when
            # the summary is missing or empty.
            pubmed_context = findings.get('mechanism_summary', '') or findings.get('pathophysiology', '')
        else:
            pubmed_context = str(findings)

    grades = {}

    print("\n1. Evaluating Clinical Accuracy...")
    grades['clinical_accuracy'] = evaluate_clinical_accuracy(final_response, pubmed_context)

    print("2. Evaluating Evidence Grounding...")
    grades['evidence_grounding'] = evaluate_evidence_grounding(final_response)

    print("3. Evaluating Clinical Actionability...")
    grades['actionability'] = evaluate_actionability(final_response)

    print("4. Evaluating Explainability Clarity...")
    grades['clarity'] = evaluate_clarity(final_response)

    print("5. Evaluating Safety & Completeness...")
    grades['safety_completeness'] = evaluate_safety_completeness(final_response, biomarkers)

    print("\n" + banner)
    print("EVALUATION COMPLETE")
    print(banner)

    return EvaluationResult(**grades)
# ---------------------------------------------------------------------------
# Deterministic Evaluation Functions (for testing)
# ---------------------------------------------------------------------------
def _deterministic_clinical_accuracy(
    final_response: Dict[str, Any],
    pubmed_context: str
) -> GradedScore:
    """
    Structural stand-in for the clinical-accuracy judge (deterministic).

    Starts at 0.5, adds 0.1 for each top-level section that is present and
    truthy, and adds 0.05 per PDF reference up to 0.2. ``pubmed_context`` is
    accepted for signature parity with the LLM evaluator but is not used.
    """
    total = 0.5
    notes = []

    # Check if response has expected structure
    expected_sections = (
        ('patient_summary', "Has patient summary"),
        ('prediction_explanation', "Has prediction explanation"),
        ('clinical_recommendations', "Has clinical recommendations"),
    )
    for section_key, note in expected_sections:
        if final_response.get(section_key):
            total += 0.1
            notes.append(note)

    # Check for citations
    explanation = final_response.get('prediction_explanation', {})
    if isinstance(explanation, dict):
        citations = explanation.get('pdf_references', [])
        if citations:
            total += min(0.2, len(citations) * 0.05)
            notes.append(f"Has {len(citations)} citations")

    capped = min(1.0, total)
    return GradedScore(score=capped, reasoning="[DETERMINISTIC] " + "; ".join(notes))
def _deterministic_actionability(
    final_response: Dict[str, Any]
) -> GradedScore:
    """
    Structural stand-in for the actionability judge (deterministic).

    Starts at 0.5 and adds a fixed bonus for each non-empty recommendation
    category found under ``clinical_recommendations``.
    """
    total = 0.5
    notes = []

    # (key, bonus, note) for every recommendation category that earns credit.
    categories = (
        ('immediate_actions', 0.15, "Has immediate actions"),
        ('lifestyle_changes', 0.15, "Has lifestyle changes"),
        ('monitoring', 0.1, "Has monitoring recommendations"),
    )

    recommendations = final_response.get('clinical_recommendations', {})
    if isinstance(recommendations, dict):
        for category_key, bonus, note in categories:
            if recommendations.get(category_key):
                total += bonus
                notes.append(note)

    if notes:
        explanation = "[DETERMINISTIC] " + "; ".join(notes)
    else:
        explanation = "[DETERMINISTIC] Missing recommendations"

    return GradedScore(score=min(1.0, total), reasoning=explanation)
def _deterministic_clarity(
    final_response: Dict[str, Any]
) -> GradedScore:
    """
    Heuristic-based clarity (deterministic).

    Starts at 0.5, adds 0.2 for a 50-300 word summary (0.1 for any other
    non-empty summary), and 0.15 each for ``biomarker_flags`` and
    ``key_findings``.

    ``patient_summary`` may be a plain string or a dict carrying the text
    under ``narrative``, which is the shape ``evaluate_clarity`` reads. Both
    forms are scored on their text.
    """
    score = 0.5
    reasons = []

    summary = final_response.get('patient_summary', '')
    if isinstance(summary, dict):
        # Use the same field the non-deterministic evaluator reads, so that
        # dict-shaped summaries are not silently ignored.
        summary = summary.get('narrative', '')

    if isinstance(summary, str):
        word_count = len(summary.split())
        if 50 <= word_count <= 300:
            score += 0.2
            reasons.append(f"Summary length OK ({word_count} words)")
        elif word_count > 0:
            score += 0.1
            reasons.append("Has summary")

    # Check for structured output
    if final_response.get('biomarker_flags'):
        score += 0.15
        reasons.append("Has biomarker flags")
    if final_response.get('key_findings'):
        score += 0.15
        reasons.append("Has key findings")

    if reasons:
        reasoning = "[DETERMINISTIC] " + "; ".join(reasons)
    else:
        reasoning = "[DETERMINISTIC] Limited structure"

    return GradedScore(score=min(1.0, score), reasoning=reasoning)