File size: 7,696 Bytes
6dc9d46
 
 
 
 
 
 
696f787
6dc9d46
 
 
696f787
9659593
 
 
6dc9d46
696f787
6dc9d46
 
9659593
 
 
6dc9d46
 
696f787
6dc9d46
 
 
696f787
6dc9d46
9659593
 
6dc9d46
696f787
6dc9d46
 
 
 
 
696f787
6dc9d46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9659593
6dc9d46
696f787
6dc9d46
696f787
6dc9d46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
696f787
6dc9d46
 
 
 
9659593
6dc9d46
 
 
 
9659593
6dc9d46
 
 
 
9659593
6dc9d46
 
 
 
9659593
6dc9d46
 
 
 
9659593
 
6dc9d46
696f787
6dc9d46
696f787
6dc9d46
 
 
 
696f787
6dc9d46
 
9659593
6dc9d46
696f787
6dc9d46
 
 
 
696f787
6dc9d46
 
696f787
6dc9d46
 
696f787
6dc9d46
 
696f787
6dc9d46
 
696f787
6dc9d46
 
696f787
6dc9d46
 
 
 
696f787
6dc9d46
 
696f787
6dc9d46
 
 
 
696f787
6dc9d46
 
 
 
696f787
6dc9d46
696f787
9659593
 
 
 
 
 
 
 
 
 
6dc9d46
 
 
 
 
696f787
6dc9d46
 
aefac4f
6dc9d46
 
 
aefac4f
6dc9d46
696f787
aefac4f
 
696f787
6dc9d46
 
aefac4f
6dc9d46
696f787
6dc9d46
9659593
6dc9d46
 
 
 
 
aefac4f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
"""
Test the 5D Evaluation System
Tests all evaluators with real diabetes patient output
"""

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

import json

import pytest
import os

from src.evaluation.evaluators import run_full_evaluation
from src.state import AgentOutput


@pytest.mark.skipif(
    not os.environ.get("GROQ_API_KEY") and not os.environ.get("GOOGLE_API_KEY"), reason="No LLM API key available"
)
def test_evaluation_system():
    """Exercise the full 5D evaluation pipeline against a canned diabetes case.

    Loads a pre-recorded final response from ``test_output_diabetes.json``
    (fixture next to this file), rebuilds the biomarker panel and the mock
    agent outputs that produced it, runs ``run_full_evaluation``, and asserts
    that every dimension score lies in [0.0, 1.0] and the average is positive.

    Skipped when no LLM API key (GROQ_API_KEY / GOOGLE_API_KEY) is configured,
    since the evaluators call out to an LLM.
    """

    print("=" * 80)
    print("TESTING 5D EVALUATION SYSTEM")
    print("=" * 80)

    # Load test output from diabetes patient
    test_output_path = Path(__file__).parent / "test_output_diabetes.json"
    with open(test_output_path, encoding="utf-8") as f:
        final_response = json.load(f)

    print(f"\nβœ“ Loaded test data from: {test_output_path}")
    print(f"  - Disease: {final_response['prediction_explanation']['primary_disease']}")
    print(f"  - Confidence: {final_response['prediction_explanation']['confidence']:.1%}")
    print(f"  - Out of range biomarkers: {final_response['patient_summary']['biomarkers_out_of_range']}")
    print(f"  - Critical alerts: {len(final_response['safety_alerts'])}")

    # Reconstruct patient biomarkers from test output; values mirror the
    # fixture's diabetic profile (elevated glucose/HbA1c, dyslipidemia).
    biomarkers = {
        "Glucose": 185.0,
        "HbA1c": 8.2,
        "Cholesterol": 235.0,
        "Triglycerides": 210.0,
        "HDL": 38.0,
        "LDL": 155.0,
        "VLDL": 42.0,
        "Total_Protein": 6.8,
        "Albumin": 4.2,
        "Globulin": 2.6,
        "AG_Ratio": 1.6,
        "Bilirubin_Total": 0.9,
        "Bilirubin_Direct": 0.2,
        "ALT": 35.0,
        "AST": 28.0,
        "ALP": 95.0,
        "Creatinine": 1.1,
        "BUN": 18.0,
        "BUN_Creatinine_Ratio": 16.4,
        "Uric_Acid": 6.2,
        "WBC": 7200.0,
        "RBC": 4.7,
        "Hemoglobin": 14.2,
        "Hematocrit": 42.0,
        "Platelets": 245.0,
    }

    print(f"\nβœ“ Reconstructed {len(biomarkers)} biomarker values")

    # Mock agent outputs to provide PubMed context for Clinical Accuracy evaluator
    disease_explainer_context = """
    Type 2 diabetes (T2D) accounts for the majority of cases and results 
    primarily from insulin resistance with a progressive beta-cell secretory defect.
    
    Pathophysiology:
    - Insulin resistance in peripheral tissues (muscle, liver, adipose)
    - Progressive decline in beta-cell function
    - Impaired glucose homeostasis leading to hyperglycemia
    - Long-term complications affecting cardiovascular, renal, and neurological systems
    
    Key Biomarkers:
    - Fasting glucose β‰₯126 mg/dL indicates diabetes
    - HbA1c β‰₯6.5% indicates diabetes
    - Elevated cholesterol and triglycerides common due to dyslipidemia
    - HDL typically reduced in metabolic syndrome
    
    Clinical Management:
    - Lifestyle modifications (diet, exercise)
    - Pharmacological intervention (metformin, insulin sensitizers)
    - Regular monitoring of glycemic control
    - Cardiovascular risk management
    """

    agent_outputs = [
        AgentOutput(
            agent_name="Disease Explainer",
            findings=disease_explainer_context,
            metadata={"citations": ["diabetes.pdf", "MediGuard_Diabetes_Guidelines_Extensive.pdf"]},
        ),
        AgentOutput(
            agent_name="Biomarker Analyzer",
            findings="Analyzed 25 biomarkers. Found 19 out of range, 3 critical values.",
            metadata={"citations": []},
        ),
        AgentOutput(
            agent_name="Biomarker-Disease Linker",
            findings="Glucose and HbA1c are primary drivers for Type 2 Diabetes prediction.",
            metadata={"citations": ["diabetes.pdf"]},
        ),
        AgentOutput(
            agent_name="Clinical Guidelines",
            findings="Recommend immediate medical consultation, lifestyle modifications.",
            metadata={"citations": ["diabetes.pdf"]},
        ),
        AgentOutput(
            agent_name="Confidence Assessor",
            findings="High confidence prediction (87%) based on strong biomarker evidence.",
            metadata={"citations": []},
        ),
    ]

    print(f"βœ“ Created {len(agent_outputs)} mock agent outputs for evaluation context")

    # Run full evaluation. Keep the try body minimal: only the pipeline call
    # can legitimately raise here; reporting/assertion failures below should
    # surface as ordinary test failures, not be mislabeled "Evaluation failed".
    print("\n" + "=" * 80)
    print("RUNNING EVALUATION PIPELINE")
    print("=" * 80)

    try:
        evaluation_result = run_full_evaluation(
            final_response=final_response, agent_outputs=agent_outputs, biomarkers=biomarkers
        )
    except Exception as e:
        print("\n" + "=" * 80)
        print("Evaluation failed")
        print("=" * 80)
        print(f"\nError: {type(e).__name__}: {e!s}")
        import traceback

        traceback.print_exc()
        raise

    # Display results
    print("\n" + "=" * 80)
    print("5D EVALUATION RESULTS")
    print("=" * 80)

    print(f"\n1. πŸ“Š Clinical Accuracy: {evaluation_result.clinical_accuracy.score:.3f}")
    print(f"   Reasoning: {evaluation_result.clinical_accuracy.reasoning[:200]}...")

    print(f"\n2. πŸ“š Evidence Grounding: {evaluation_result.evidence_grounding.score:.3f}")
    print(f"   Reasoning: {evaluation_result.evidence_grounding.reasoning}")

    print(f"\n3. ⚑ Actionability: {evaluation_result.actionability.score:.3f}")
    print(f"   Reasoning: {evaluation_result.actionability.reasoning[:200]}...")

    print(f"\n4. πŸ’‘ Clarity: {evaluation_result.clarity.score:.3f}")
    print(f"   Reasoning: {evaluation_result.clarity.reasoning}")

    print(f"\n5. πŸ›‘οΈ Safety & Completeness: {evaluation_result.safety_completeness.score:.3f}")
    print(f"   Reasoning: {evaluation_result.safety_completeness.reasoning}")

    # Summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)

    scores = evaluation_result.to_vector()
    avg_score = sum(scores) / len(scores)

    print(f"\nβœ“ Evaluation Vector: {[f'{s:.3f}' for s in scores]}")
    print(f"βœ“ Average Score: {avg_score:.3f}")
    print(f"βœ“ Min Score: {min(scores):.3f}")
    print(f"βœ“ Max Score: {max(scores):.3f}")

    # Validation checks: every dimension score must be a valid probability-like
    # value in [0.0, 1.0]. (No index needed, so no enumerate.)
    print("\n" + "=" * 80)
    print("VALIDATION CHECKS")
    print("=" * 80)

    all_valid = True

    for name, score in [
        ("Clinical Accuracy", evaluation_result.clinical_accuracy.score),
        ("Evidence Grounding", evaluation_result.evidence_grounding.score),
        ("Actionability", evaluation_result.actionability.score),
        ("Clarity", evaluation_result.clarity.score),
        ("Safety & Completeness", evaluation_result.safety_completeness.score),
    ]:
        if 0.0 <= score <= 1.0:
            print(f"βœ“ {name}: Score in valid range [0.0, 1.0]")
        else:
            print(f"βœ— {name}: Score OUT OF RANGE: {score}")
            all_valid = False

    if all_valid:
        print("\n" + "=" * 80)
        print("All evaluators passed validation")
        print("=" * 80)
    else:
        print("\n" + "=" * 80)
        print("Some evaluators failed validation")
        print("=" * 80)

    assert all_valid, "Some evaluators had scores out of valid range"
    assert avg_score > 0.0, "Average evaluation score should be positive"


if __name__ == "__main__":
    # Allow running this test directly (outside pytest) for quick manual checks;
    # note the pytest.mark.skipif decorator does not apply on a direct call, so
    # this will fail without an LLM API key rather than skip.
    print("\nStarting 5D Evaluation System Test\n")
    test_evaluation_system()
    print("\nTest completed successfully!")