"""
Test the 5D Evaluation System
Tests all evaluators with real diabetes patient output
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import json
import pytest
import os
from src.evaluation.evaluators import run_full_evaluation
from src.state import AgentOutput
def _build_biomarkers() -> dict[str, float]:
    """Reconstruct the diabetes patient's biomarker panel matching the saved test output."""
    return {
        "Glucose": 185.0,
        "HbA1c": 8.2,
        "Cholesterol": 235.0,
        "Triglycerides": 210.0,
        "HDL": 38.0,
        "LDL": 155.0,
        "VLDL": 42.0,
        "Total_Protein": 6.8,
        "Albumin": 4.2,
        "Globulin": 2.6,
        "AG_Ratio": 1.6,
        "Bilirubin_Total": 0.9,
        "Bilirubin_Direct": 0.2,
        "ALT": 35.0,
        "AST": 28.0,
        "ALP": 95.0,
        "Creatinine": 1.1,
        "BUN": 18.0,
        "BUN_Creatinine_Ratio": 16.4,
        "Uric_Acid": 6.2,
        "WBC": 7200.0,
        "RBC": 4.7,
        "Hemoglobin": 14.2,
        "Hematocrit": 42.0,
        "Platelets": 245.0,
    }


# PubMed-style context for the Clinical Accuracy evaluator (runtime content —
# kept verbatim so evaluator scoring inputs are unchanged).
_DISEASE_EXPLAINER_CONTEXT = """
Type 2 diabetes (T2D) accounts for the majority of cases and results
primarily from insulin resistance with a progressive beta-cell secretory defect.
Pathophysiology:
- Insulin resistance in peripheral tissues (muscle, liver, adipose)
- Progressive decline in beta-cell function
- Impaired glucose homeostasis leading to hyperglycemia
- Long-term complications affecting cardiovascular, renal, and neurological systems
Key Biomarkers:
- Fasting glucose β₯126 mg/dL indicates diabetes
- HbA1c β₯6.5% indicates diabetes
- Elevated cholesterol and triglycerides common due to dyslipidemia
- HDL typically reduced in metabolic syndrome
Clinical Management:
- Lifestyle modifications (diet, exercise)
- Pharmacological intervention (metformin, insulin sensitizers)
- Regular monitoring of glycemic control
- Cardiovascular risk management
"""


def _build_agent_outputs() -> list:
    """Build the five mock agent outputs that give the evaluators pipeline context."""
    return [
        AgentOutput(
            agent_name="Disease Explainer",
            findings=_DISEASE_EXPLAINER_CONTEXT,
            metadata={"citations": ["diabetes.pdf", "MediGuard_Diabetes_Guidelines_Extensive.pdf"]},
        ),
        AgentOutput(
            agent_name="Biomarker Analyzer",
            findings="Analyzed 25 biomarkers. Found 19 out of range, 3 critical values.",
            metadata={"citations": []},
        ),
        AgentOutput(
            agent_name="Biomarker-Disease Linker",
            findings="Glucose and HbA1c are primary drivers for Type 2 Diabetes prediction.",
            metadata={"citations": ["diabetes.pdf"]},
        ),
        AgentOutput(
            agent_name="Clinical Guidelines",
            findings="Recommend immediate medical consultation, lifestyle modifications.",
            metadata={"citations": ["diabetes.pdf"]},
        ),
        AgentOutput(
            agent_name="Confidence Assessor",
            findings="High confidence prediction (87%) based on strong biomarker evidence.",
            metadata={"citations": []},
        ),
    ]


@pytest.mark.skipif(
    not os.environ.get("GROQ_API_KEY") and not os.environ.get("GOOGLE_API_KEY"), reason="No LLM API key available"
)
def test_evaluation_system():
    """Test evaluation system with diabetes patient data.

    End-to-end check of the 5D evaluation pipeline: loads a saved agent
    response from ``test_output_diabetes.json``, rebuilds the patient's
    biomarkers and mock agent outputs, runs ``run_full_evaluation``, and
    asserts that every evaluator score lies in [0.0, 1.0] and that the
    average score is positive. Requires a live LLM API key (skipped
    otherwise).
    """
    print("=" * 80)
    print("TESTING 5D EVALUATION SYSTEM")
    print("=" * 80)

    # Load the previously captured pipeline output for the diabetes patient.
    test_output_path = Path(__file__).parent / "test_output_diabetes.json"
    with open(test_output_path, encoding="utf-8") as f:
        final_response = json.load(f)
    print(f"\nβ Loaded test data from: {test_output_path}")
    print(f" - Disease: {final_response['prediction_explanation']['primary_disease']}")
    print(f" - Confidence: {final_response['prediction_explanation']['confidence']:.1%}")
    print(f" - Out of range biomarkers: {final_response['patient_summary']['biomarkers_out_of_range']}")
    print(f" - Critical alerts: {len(final_response['safety_alerts'])}")

    biomarkers = _build_biomarkers()
    print(f"\nβ Reconstructed {len(biomarkers)} biomarker values")

    agent_outputs = _build_agent_outputs()
    print(f"β Created {len(agent_outputs)} mock agent outputs for evaluation context")

    # Run full evaluation
    print("\n" + "=" * 80)
    print("RUNNING EVALUATION PIPELINE")
    print("=" * 80)
    try:
        evaluation_result = run_full_evaluation(
            final_response=final_response, agent_outputs=agent_outputs, biomarkers=biomarkers
        )

        # Display per-dimension results (long reasonings truncated to 200 chars).
        print("\n" + "=" * 80)
        print("5D EVALUATION RESULTS")
        print("=" * 80)
        print(f"\n1. π Clinical Accuracy: {evaluation_result.clinical_accuracy.score:.3f}")
        print(f" Reasoning: {evaluation_result.clinical_accuracy.reasoning[:200]}...")
        print(f"\n2. π Evidence Grounding: {evaluation_result.evidence_grounding.score:.3f}")
        print(f" Reasoning: {evaluation_result.evidence_grounding.reasoning}")
        print(f"\n3. β‘ Actionability: {evaluation_result.actionability.score:.3f}")
        print(f" Reasoning: {evaluation_result.actionability.reasoning[:200]}...")
        print(f"\n4. π‘ Clarity: {evaluation_result.clarity.score:.3f}")
        print(f" Reasoning: {evaluation_result.clarity.reasoning}")
        print(f"\n5. π‘οΈ Safety & Completeness: {evaluation_result.safety_completeness.score:.3f}")
        print(f" Reasoning: {evaluation_result.safety_completeness.reasoning}")

        # Summary statistics over the 5-element score vector.
        print("\n" + "=" * 80)
        print("SUMMARY")
        print("=" * 80)
        scores = evaluation_result.to_vector()
        avg_score = sum(scores) / len(scores)
        print(f"\nβ Evaluation Vector: {[f'{s:.3f}' for s in scores]}")
        print(f"β Average Score: {avg_score:.3f}")
        print(f"β Min Score: {min(scores):.3f}")
        print(f"β Max Score: {max(scores):.3f}")

        # Validation checks: every score must be a valid probability-like value.
        print("\n" + "=" * 80)
        print("VALIDATION CHECKS")
        print("=" * 80)
        checks = [
            ("Clinical Accuracy", evaluation_result.clinical_accuracy.score),
            ("Evidence Grounding", evaluation_result.evidence_grounding.score),
            ("Actionability", evaluation_result.actionability.score),
            ("Clarity", evaluation_result.clarity.score),
            ("Safety & Completeness", evaluation_result.safety_completeness.score),
        ]
        all_valid = True
        # Note: the loop index was unused, so we iterate the pairs directly.
        for name, score in checks:
            if 0.0 <= score <= 1.0:
                print(f"β {name}: Score in valid range [0.0, 1.0]")
            else:
                print(f"β {name}: Score OUT OF RANGE: {score}")
                all_valid = False

        if all_valid:
            print("\n" + "=" * 80)
            print("All evaluators passed validation")
            print("=" * 80)
        else:
            print("\n" + "=" * 80)
            print("Some evaluators failed validation")
            print("=" * 80)

        assert all_valid, "Some evaluators had scores out of valid range"
        assert avg_score > 0.0, "Average evaluation score should be positive"
    except Exception as e:
        # Print full diagnostics before re-raising so pytest output shows context.
        print("\n" + "=" * 80)
        print("Evaluation failed")
        print("=" * 80)
        print(f"\nError: {type(e).__name__}: {e!s}")
        import traceback

        traceback.print_exc()
        raise
# Allow running this test directly as a script (outside of pytest).
if __name__ == "__main__":
    print("\nStarting 5D Evaluation System Test\n")
    test_evaluation_system()
    print("\nTest completed successfully!")