| """ | |
| MediGuard AI RAG-Helper - Evaluation System | |
| 5D Quality Assessment Framework | |
| This module provides quality evaluation for RAG outputs using a 5-dimension framework: | |
| 1. Clinical Accuracy - Medical correctness (LLM-as-judge) | |
| 2. Evidence Grounding - Citation coverage (programmatic + LLM) | |
| 3. Actionability - Practical recommendations (LLM-as-judge) | |
| 4. Clarity - Communication quality (LLM-as-judge) | |
| 5. Safety Completeness - Safety alerts coverage (programmatic) | |
| IMPORTANT LIMITATIONS: | |
| - LLM-as-judge evaluations are non-deterministic (may vary between runs) | |
| - Designed for offline batch evaluation, NOT production scoring | |
| - Requires LLM API access (Groq or Gemini) for full evaluation | |
| - Set EVALUATION_DETERMINISTIC=true for reproducible tests (uses heuristics) | |
| Usage: | |
| from src.evaluation.evaluators import run_5d_evaluation | |
| result = run_5d_evaluation(final_response, pubmed_context) | |
| print(f"Average score: {result.average_score():.2f}") | |
| """ | |

import json
import os
from typing import Any

from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

from src.llm_config import get_chat_model

# Set to True for deterministic evaluation (testing)
DETERMINISTIC_MODE = os.environ.get("EVALUATION_DETERMINISTIC", "false").lower() == "true"
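
# Example (illustrative): enabling deterministic mode from the shell for a test run,
# assuming a pytest-based suite under tests/:
#   EVALUATION_DETERMINISTIC=true pytest tests/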


class GradedScore(BaseModel):
    """Structured score with justification."""

    score: float = Field(description="Score from 0.0 to 1.0", ge=0.0, le=1.0)
    reasoning: str = Field(description="Justification for the score")


class EvaluationResult(BaseModel):
    """Complete 5D evaluation result."""

    clinical_accuracy: GradedScore
    evidence_grounding: GradedScore
    actionability: GradedScore
    clarity: GradedScore
    safety_completeness: GradedScore

    def to_vector(self) -> list[float]:
        """Extract scores as a vector for Pareto analysis."""
        return [
            self.clinical_accuracy.score,
            self.evidence_grounding.score,
            self.actionability.score,
            self.clarity.score,
            self.safety_completeness.score,
        ]

    def average_score(self) -> float:
        """Calculate the average of all 5 dimensions."""
        scores = self.to_vector()
        return sum(scores) / len(scores) if scores else 0.0
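
# Example (illustrative values): how the two helpers relate on a populated result.
#   result.to_vector()      -> [0.9, 0.7, 0.8, 0.95, 1.0]
#   result.average_score()  -> 0.87  (unweighted mean of the five dimensions)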


# Evaluator 1: Clinical Accuracy (LLM-as-Judge)
def evaluate_clinical_accuracy(final_response: dict[str, Any], pubmed_context: str) -> GradedScore:
    """
    Evaluates if medical interpretations are accurate.

    Uses cloud LLM (Groq/Gemini) as expert judge.
    In DETERMINISTIC_MODE, uses heuristics instead.
    """
    # Deterministic mode for testing
    if DETERMINISTIC_MODE:
        return _deterministic_clinical_accuracy(final_response, pubmed_context)

    # Use cloud LLM for evaluation (FREE via Groq/Gemini)
    evaluator_llm = get_chat_model(temperature=0.0, json_mode=True)

    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """You are a medical expert evaluating clinical accuracy.

Evaluate the following clinical assessment:
- Are biomarker interpretations medically correct?
- Is the disease mechanism explanation accurate?
- Are the medical recommendations appropriate?

Score 1.0 = Perfectly accurate, no medical errors
Score 0.0 = Contains dangerous misinformation

Respond ONLY with valid JSON in this format:
{{"score": 0.85, "reasoning": "Your detailed justification here"}}
""",
            ),
            (
                "human",
                """Evaluate this clinical output:

**Patient Summary:**
{patient_summary}

**Prediction Explanation:**
{prediction_explanation}

**Clinical Recommendations:**
{recommendations}

**Scientific Context (Ground Truth):**
{context}
""",
            ),
        ]
    )

    chain = prompt | evaluator_llm
    result = chain.invoke(
        {
            "patient_summary": final_response["patient_summary"],
            "prediction_explanation": final_response["prediction_explanation"],
            "recommendations": final_response["clinical_recommendations"],
            "context": pubmed_context,
        }
    )

    # Parse JSON response
    try:
        content = result.content if isinstance(result.content, str) else str(result.content)
        parsed = json.loads(content)
        return GradedScore(score=parsed["score"], reasoning=parsed["reasoning"])
    except (json.JSONDecodeError, KeyError, TypeError):
        # Fallback if JSON parsing fails — use a conservative score to avoid inflating metrics
        return GradedScore(
            score=0.5,
            reasoning="Unable to parse LLM evaluation response; defaulting to neutral score.",
        )


# Evaluator 2: Evidence Grounding (Programmatic)
def evaluate_evidence_grounding(final_response: dict[str, Any]) -> GradedScore:
    """
    Checks if all claims are backed by citations.

    Purely programmatic: counts PDF references and measures how many
    key drivers carry supporting evidence.
    """
    # Count citations
    pdf_refs = final_response["prediction_explanation"].get("pdf_references", [])
    citation_count = len(pdf_refs)

    # Check key drivers have evidence
    key_drivers = final_response["prediction_explanation"].get("key_drivers", [])
    drivers_with_evidence = sum(1 for d in key_drivers if d.get("evidence"))

    # Citation coverage score
    if len(key_drivers) > 0:
        coverage = drivers_with_evidence / len(key_drivers)
    else:
        coverage = 0.0

    # Base score: 50% from citation count (saturates at 5 citations), 50% from driver coverage
    base_score = min(1.0, citation_count / 5.0) * 0.5 + coverage * 0.5

    reasoning = f"""
Citations found: {citation_count}
Key drivers with evidence: {drivers_with_evidence}/{len(key_drivers)}
Citation coverage: {coverage:.1%}
"""
    return GradedScore(score=base_score, reasoning=reasoning.strip())
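
# Worked example (illustrative numbers): with 3 PDF references and 2 of 4 key drivers
# carrying evidence, the score is min(1.0, 3/5) * 0.5 + (2/4) * 0.5 = 0.30 + 0.25 = 0.55.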


# Evaluator 3: Clinical Actionability (LLM-as-Judge)
def evaluate_actionability(final_response: dict[str, Any]) -> GradedScore:
    """
    Evaluates if recommendations are actionable and safe.

    Uses cloud LLM (Groq/Gemini) as expert judge.
    In DETERMINISTIC_MODE, uses heuristics instead.
    """
    # Deterministic mode for testing
    if DETERMINISTIC_MODE:
        return _deterministic_actionability(final_response)

    # Use cloud LLM for evaluation (FREE via Groq/Gemini)
    evaluator_llm = get_chat_model(temperature=0.0, json_mode=True)

    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """You are a clinical care coordinator evaluating actionability.

Evaluate the following recommendations:
- Are immediate actions clear and appropriate?
- Are lifestyle changes specific and practical?
- Are monitoring recommendations feasible?
- Are next steps clearly defined?

Score 1.0 = Perfectly actionable, clear next steps
Score 0.0 = Vague, impractical, or unsafe

Respond ONLY with valid JSON in this format:
{{"score": 0.90, "reasoning": "Your detailed justification here"}}
""",
            ),
            (
                "human",
                """Evaluate these recommendations:

**Immediate Actions:**
{immediate_actions}

**Lifestyle Changes:**
{lifestyle_changes}

**Monitoring:**
{monitoring}

**Confidence Assessment:**
{confidence}
""",
            ),
        ]
    )

    chain = prompt | evaluator_llm
    recs = final_response["clinical_recommendations"]
    result = chain.invoke(
        {
            "immediate_actions": recs.get("immediate_actions", []),
            "lifestyle_changes": recs.get("lifestyle_changes", []),
            "monitoring": recs.get("monitoring", []),
            "confidence": final_response["confidence_assessment"],
        }
    )

    # Parse JSON response
    try:
        parsed = json.loads(result.content if isinstance(result.content, str) else str(result.content))
        return GradedScore(score=parsed["score"], reasoning=parsed["reasoning"])
    except (json.JSONDecodeError, KeyError, TypeError):
        # Fallback if JSON parsing fails — use a conservative score to avoid inflating metrics
        return GradedScore(
            score=0.5,
            reasoning="Unable to parse LLM evaluation response; defaulting to neutral score.",
        )


# Evaluator 4: Explainability Clarity (Programmatic)
def evaluate_clarity(final_response: dict[str, Any]) -> GradedScore:
    """
    Measures readability and patient-friendliness.

    Uses programmatic text analysis.
    In DETERMINISTIC_MODE, uses simple heuristics for reproducibility.
    """
    # Deterministic mode for testing
    if DETERMINISTIC_MODE:
        return _deterministic_clarity(final_response)

    try:
        import textstat

        has_textstat = True
    except ImportError:
        has_textstat = False

    # Get patient narrative
    narrative = final_response["patient_summary"].get("narrative", "")

    if has_textstat:
        # Calculate readability (Flesch Reading Ease)
        # Score 60-70 = Standard (8th-9th grade)
        # Score 50-60 = Fairly difficult (10th-12th grade)
        flesch_score = textstat.flesch_reading_ease(narrative)
        # Normalize to 1.0 at Flesch=70; clamp at 0.0 since Flesch can go negative for very dense text
        readability_score = max(0.0, min(1.0, flesch_score / 70.0))
    else:
        # Fallback: simple sentence-length heuristic
        sentences = narrative.split(".")
        avg_words = sum(len(s.split()) for s in sentences) / max(len(sentences), 1)
        # Optimal: 15-20 words per sentence
        if 15 <= avg_words <= 20:
            readability_score = 1.0
        elif avg_words < 15:
            readability_score = 0.9
        else:
            readability_score = max(0.5, 1.0 - (avg_words - 20) * 0.02)

    # Medical jargon detection (simple keyword heuristic)
    medical_terms = [
        "pathophysiology",
        "etiology",
        "hemostasis",
        "coagulation",
        "thrombocytopenia",
        "erythropoiesis",
        "gluconeogenesis",
    ]
    jargon_count = sum(1 for term in medical_terms if term.lower() in narrative.lower())

    # Length check (too short = vague, too long = overwhelming)
    word_count = len(narrative.split())
    optimal_length = 50 <= word_count <= 150

    # Scoring: 50% readability, 30% jargon penalty, 20% length
    jargon_penalty = max(0.0, 1.0 - (jargon_count * 0.2))
    length_score = 1.0 if optimal_length else 0.7
    final_score = readability_score * 0.5 + jargon_penalty * 0.3 + length_score * 0.2

    if has_textstat:
        reasoning = f"""
Flesch Reading Ease: {flesch_score:.1f} (Target: 60-70)
Medical jargon terms: {jargon_count}
Word count: {word_count} (Optimal: 50-150)
Readability subscore: {readability_score:.2f}
"""
    else:
        reasoning = f"""
Readability (heuristic): {readability_score:.2f}
Medical jargon terms: {jargon_count}
Word count: {word_count} (Optimal: 50-150)
Note: textstat not available, using fallback metrics
"""
    return GradedScore(score=final_score, reasoning=reasoning.strip())
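
# Worked example (illustrative numbers): Flesch 56 -> readability 0.80, one jargon term
# -> penalty 0.80, 120 words -> length 1.0, giving 0.80*0.5 + 0.80*0.3 + 1.0*0.2 = 0.84.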


# Evaluator 5: Safety & Completeness (Programmatic)
def evaluate_safety_completeness(final_response: dict[str, Any], biomarkers: dict[str, float]) -> GradedScore:
    """
    Checks if all safety concerns are flagged.

    Programmatic validation.
    """
    from src.biomarker_validator import BiomarkerValidator

    # Initialize validator
    validator = BiomarkerValidator()

    # Count out-of-range biomarkers
    out_of_range_count = 0
    critical_count = 0
    for name, value in biomarkers.items():
        result = validator.validate_biomarker(name, value)
        if result.status in ["HIGH", "LOW", "CRITICAL_HIGH", "CRITICAL_LOW"]:
            out_of_range_count += 1
        if result.status in ["CRITICAL_HIGH", "CRITICAL_LOW"]:
            critical_count += 1

    # Count safety alerts in output
    safety_alerts = final_response.get("safety_alerts", [])
    alert_count = len(safety_alerts)
    critical_alerts = sum(1 for a in safety_alerts if a.get("severity") == "CRITICAL")

    # Check if all critical values have alerts
    critical_coverage = critical_alerts / critical_count if critical_count > 0 else 1.0

    # Check for disclaimer
    has_disclaimer = "disclaimer" in final_response.get("metadata", {})

    # Check for uncertainty acknowledgment
    limitations = final_response["confidence_assessment"].get("limitations", [])
    acknowledges_uncertainty = len(limitations) > 0

    # Scoring: 40% alert coverage, 30% critical coverage, 20% disclaimer, 10% uncertainty
    alert_score = min(1.0, alert_count / max(1, out_of_range_count))
    critical_score = min(1.0, critical_coverage)
    disclaimer_score = 1.0 if has_disclaimer else 0.0
    uncertainty_score = 1.0 if acknowledges_uncertainty else 0.5
    final_score = min(
        1.0, (alert_score * 0.4 + critical_score * 0.3 + disclaimer_score * 0.2 + uncertainty_score * 0.1)
    )

    reasoning = f"""
Out-of-range biomarkers: {out_of_range_count}
Critical values: {critical_count}
Safety alerts generated: {alert_count}
Critical alerts: {critical_alerts}
Critical coverage: {critical_coverage:.1%}
Has disclaimer: {has_disclaimer}
Acknowledges uncertainty: {acknowledges_uncertainty}
"""
    return GradedScore(score=final_score, reasoning=reasoning.strip())
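
# Worked example (illustrative numbers): 3 out-of-range biomarkers (1 critical), 3 alerts
# including 1 CRITICAL, a disclaimer present, and limitations listed:
#   1.0*0.4 + 1.0*0.3 + 1.0*0.2 + 1.0*0.1 = 1.0
# Without the disclaimer and with no limitations, the same output drops to 0.75.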


# Master Evaluation Function
def run_full_evaluation(
    final_response: dict[str, Any], agent_outputs: list[Any], biomarkers: dict[str, float]
) -> EvaluationResult:
    """
    Orchestrates all 5 evaluators and returns the complete assessment.
    """
    print("=" * 70)
    print("RUNNING 5D EVALUATION GAUNTLET")
    print("=" * 70)

    # Extract context from agent outputs
    pubmed_context = ""
    for output in agent_outputs:
        if output.agent_name == "Disease Explainer":
            findings = output.findings
            if isinstance(findings, dict):
                pubmed_context = findings.get("mechanism_summary", "") or findings.get("pathophysiology", "")
            elif isinstance(findings, str):
                pubmed_context = findings
            else:
                pubmed_context = str(findings)
            break

    # Run all evaluators
    print("\n1. Evaluating Clinical Accuracy...")
    clinical_accuracy = evaluate_clinical_accuracy(final_response, pubmed_context)

    print("2. Evaluating Evidence Grounding...")
    evidence_grounding = evaluate_evidence_grounding(final_response)

    print("3. Evaluating Clinical Actionability...")
    actionability = evaluate_actionability(final_response)

    print("4. Evaluating Explainability Clarity...")
    clarity = evaluate_clarity(final_response)

    print("5. Evaluating Safety & Completeness...")
    safety_completeness = evaluate_safety_completeness(final_response, biomarkers)

    print("\n" + "=" * 70)
    print("EVALUATION COMPLETE")
    print("=" * 70)

    return EvaluationResult(
        clinical_accuracy=clinical_accuracy,
        evidence_grounding=evidence_grounding,
        actionability=actionability,
        clarity=clarity,
        safety_completeness=safety_completeness,
    )
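
# Usage sketch (illustrative; field names are inferred from the accesses above, values are
# hypothetical). In deterministic mode the LLM-judged dimensions fall back to heuristics:
#
#   final_response = {
#       "patient_summary": {"narrative": "Your fasting glucose is above the normal range..."},
#       "prediction_explanation": {"key_drivers": [...], "pdf_references": [...]},
#       "clinical_recommendations": {"immediate_actions": [...], "lifestyle_changes": [...], "monitoring": [...]},
#       "confidence_assessment": {"limitations": ["Single time-point measurement"]},
#       "safety_alerts": [{"severity": "CRITICAL", "message": "..."}],
#       "metadata": {"disclaimer": "Not a substitute for professional medical advice."},
#   }
#   result = run_full_evaluation(final_response, agent_outputs, biomarkers={"glucose": 210.0})
#   print(result.average_score(), result.to_vector())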


# ---------------------------------------------------------------------------
# Deterministic Evaluation Functions (for testing)
# ---------------------------------------------------------------------------
def _deterministic_clinical_accuracy(final_response: dict[str, Any], pubmed_context: str) -> GradedScore:
    """Heuristic-based clinical accuracy (deterministic)."""
    score = 0.5
    reasons = []

    # Check if response has expected structure
    if final_response.get("patient_summary"):
        score += 0.1
        reasons.append("Has patient summary")
    if final_response.get("prediction_explanation"):
        score += 0.1
        reasons.append("Has prediction explanation")
    if final_response.get("clinical_recommendations"):
        score += 0.1
        reasons.append("Has clinical recommendations")

    # Check for citations
    pred = final_response.get("prediction_explanation", {})
    if isinstance(pred, dict):
        refs = pred.get("pdf_references", [])
        if refs:
            score += min(0.2, len(refs) * 0.05)
            reasons.append(f"Has {len(refs)} citations")

    return GradedScore(score=min(1.0, score), reasoning="[DETERMINISTIC] " + "; ".join(reasons))


def _deterministic_actionability(final_response: dict[str, Any]) -> GradedScore:
    """Heuristic-based actionability (deterministic)."""
    score = 0.5
    reasons = []

    recs = final_response.get("clinical_recommendations", {})
    if isinstance(recs, dict):
        if recs.get("immediate_actions"):
            score += 0.15
            reasons.append("Has immediate actions")
        if recs.get("lifestyle_changes"):
            score += 0.15
            reasons.append("Has lifestyle changes")
        if recs.get("monitoring"):
            score += 0.1
            reasons.append("Has monitoring recommendations")

    return GradedScore(
        score=min(1.0, score),
        reasoning=("[DETERMINISTIC] " + "; ".join(reasons)) if reasons else "[DETERMINISTIC] Missing recommendations",
    )


def _deterministic_clarity(final_response: dict[str, Any]) -> GradedScore:
    """Heuristic-based clarity (deterministic)."""
    score = 0.5
    reasons = []

    # patient_summary is normally a dict with a "narrative" field, but tolerate plain strings
    summary = final_response.get("patient_summary", "")
    if isinstance(summary, dict):
        summary = summary.get("narrative", "")
    if isinstance(summary, str):
        word_count = len(summary.split())
        if 50 <= word_count <= 300:
            score += 0.2
            reasons.append(f"Summary length OK ({word_count} words)")
        elif word_count > 0:
            score += 0.1
            reasons.append("Has summary")

    # Check for structured output
    if final_response.get("biomarker_flags"):
        score += 0.15
        reasons.append("Has biomarker flags")
    if final_response.get("key_findings"):
        score += 0.15
        reasons.append("Has key findings")

    return GradedScore(
        score=min(1.0, score),
        reasoning=("[DETERMINISTIC] " + "; ".join(reasons)) if reasons else "[DETERMINISTIC] Limited structure",
    )