File size: 7,696 Bytes
6dc9d46
 
 
 
 
 
 
696f787
6dc9d46
 
 
696f787
9659593
 
 
6dc9d46
696f787
6dc9d46
 
9659593
 
 
6dc9d46
 
696f787
6dc9d46
 
 
696f787
6dc9d46
9659593
 
6dc9d46
696f787
6dc9d46
 
 
 
 
696f787
6dc9d46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9659593
6dc9d46
696f787
6dc9d46
696f787
6dc9d46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
696f787
6dc9d46
 
 
 
9659593
6dc9d46
 
 
 
9659593
6dc9d46
 
 
 
9659593
6dc9d46
 
 
 
9659593
6dc9d46
 
 
 
9659593
 
6dc9d46
696f787
6dc9d46
696f787
6dc9d46
 
 
 
696f787
6dc9d46
 
9659593
6dc9d46
696f787
6dc9d46
 
 
 
696f787
6dc9d46
 
696f787
6dc9d46
 
696f787
6dc9d46
 
696f787
6dc9d46
 
696f787
6dc9d46
 
696f787
6dc9d46
 
 
 
696f787
6dc9d46
 
696f787
6dc9d46
 
 
 
696f787
6dc9d46
 
 
 
696f787
6dc9d46
696f787
9659593
 
 
 
 
 
 
 
 
 
6dc9d46
 
 
 
 
696f787
6dc9d46
 
aefac4f
6dc9d46
 
 
aefac4f
6dc9d46
696f787
aefac4f
 
696f787
6dc9d46
 
aefac4f
6dc9d46
696f787
6dc9d46
9659593
6dc9d46
 
 
 
 
aefac4f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
"""
Test the 5D Evaluation System
Tests all evaluators with real diabetes patient output
"""

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

import json

import pytest
import os

from src.evaluation.evaluators import run_full_evaluation
from src.state import AgentOutput


@pytest.mark.skipif(
    not os.environ.get("GROQ_API_KEY") and not os.environ.get("GOOGLE_API_KEY"), reason="No LLM API key available"
)
def test_evaluation_system():
    """Exercise the full 5D evaluation pipeline against a canned diabetes case.

    Loads a pre-recorded final response from ``test_output_diabetes.json``
    (fixture next to this file), rebuilds the biomarker panel and the mock
    agent outputs that produced it, runs ``run_full_evaluation``, and asserts
    that every dimension score lies in [0.0, 1.0] and the average is positive.

    Skipped when no LLM API key (GROQ_API_KEY / GOOGLE_API_KEY) is configured,
    since the evaluators call out to an LLM.
    """

    print("=" * 80)
    print("TESTING 5D EVALUATION SYSTEM")
    print("=" * 80)

    # Load test output from diabetes patient
    test_output_path = Path(__file__).parent / "test_output_diabetes.json"
    with open(test_output_path, encoding="utf-8") as f:
        final_response = json.load(f)

    print(f"\nβœ“ Loaded test data from: {test_output_path}")
    print(f"  - Disease: {final_response['prediction_explanation']['primary_disease']}")
    print(f"  - Confidence: {final_response['prediction_explanation']['confidence']:.1%}")
    print(f"  - Out of range biomarkers: {final_response['patient_summary']['biomarkers_out_of_range']}")
    print(f"  - Critical alerts: {len(final_response['safety_alerts'])}")

    # Reconstruct patient biomarkers from test output; values mirror the
    # fixture's diabetic profile (elevated glucose/HbA1c, dyslipidemia).
    biomarkers = {
        "Glucose": 185.0,
        "HbA1c": 8.2,
        "Cholesterol": 235.0,
        "Triglycerides": 210.0,
        "HDL": 38.0,
        "LDL": 155.0,
        "VLDL": 42.0,
        "Total_Protein": 6.8,
        "Albumin": 4.2,
        "Globulin": 2.6,
        "AG_Ratio": 1.6,
        "Bilirubin_Total": 0.9,
        "Bilirubin_Direct": 0.2,
        "ALT": 35.0,
        "AST": 28.0,
        "ALP": 95.0,
        "Creatinine": 1.1,
        "BUN": 18.0,
        "BUN_Creatinine_Ratio": 16.4,
        "Uric_Acid": 6.2,
        "WBC": 7200.0,
        "RBC": 4.7,
        "Hemoglobin": 14.2,
        "Hematocrit": 42.0,
        "Platelets": 245.0,
    }

    print(f"\nβœ“ Reconstructed {len(biomarkers)} biomarker values")

    # Mock agent outputs to provide PubMed context for Clinical Accuracy evaluator
    disease_explainer_context = """
    Type 2 diabetes (T2D) accounts for the majority of cases and results 
    primarily from insulin resistance with a progressive beta-cell secretory defect.
    
    Pathophysiology:
    - Insulin resistance in peripheral tissues (muscle, liver, adipose)
    - Progressive decline in beta-cell function
    - Impaired glucose homeostasis leading to hyperglycemia
    - Long-term complications affecting cardiovascular, renal, and neurological systems
    
    Key Biomarkers:
    - Fasting glucose β‰₯126 mg/dL indicates diabetes
    - HbA1c β‰₯6.5% indicates diabetes
    - Elevated cholesterol and triglycerides common due to dyslipidemia
    - HDL typically reduced in metabolic syndrome
    
    Clinical Management:
    - Lifestyle modifications (diet, exercise)
    - Pharmacological intervention (metformin, insulin sensitizers)
    - Regular monitoring of glycemic control
    - Cardiovascular risk management
    """

    agent_outputs = [
        AgentOutput(
            agent_name="Disease Explainer",
            findings=disease_explainer_context,
            metadata={"citations": ["diabetes.pdf", "MediGuard_Diabetes_Guidelines_Extensive.pdf"]},
        ),
        AgentOutput(
            agent_name="Biomarker Analyzer",
            findings="Analyzed 25 biomarkers. Found 19 out of range, 3 critical values.",
            metadata={"citations": []},
        ),
        AgentOutput(
            agent_name="Biomarker-Disease Linker",
            findings="Glucose and HbA1c are primary drivers for Type 2 Diabetes prediction.",
            metadata={"citations": ["diabetes.pdf"]},
        ),
        AgentOutput(
            agent_name="Clinical Guidelines",
            findings="Recommend immediate medical consultation, lifestyle modifications.",
            metadata={"citations": ["diabetes.pdf"]},
        ),
        AgentOutput(
            agent_name="Confidence Assessor",
            findings="High confidence prediction (87%) based on strong biomarker evidence.",
            metadata={"citations": []},
        ),
    ]

    print(f"βœ“ Created {len(agent_outputs)} mock agent outputs for evaluation context")

    # Run full evaluation. Keep the try body minimal: only the pipeline call
    # can legitimately raise here; reporting/assertion failures below should
    # surface as ordinary test failures, not be mislabeled "Evaluation failed".
    print("\n" + "=" * 80)
    print("RUNNING EVALUATION PIPELINE")
    print("=" * 80)

    try:
        evaluation_result = run_full_evaluation(
            final_response=final_response, agent_outputs=agent_outputs, biomarkers=biomarkers
        )
    except Exception as e:
        print("\n" + "=" * 80)
        print("Evaluation failed")
        print("=" * 80)
        print(f"\nError: {type(e).__name__}: {e!s}")
        import traceback

        traceback.print_exc()
        raise

    # Display results
    print("\n" + "=" * 80)
    print("5D EVALUATION RESULTS")
    print("=" * 80)

    print(f"\n1. πŸ“Š Clinical Accuracy: {evaluation_result.clinical_accuracy.score:.3f}")
    print(f"   Reasoning: {evaluation_result.clinical_accuracy.reasoning[:200]}...")

    print(f"\n2. πŸ“š Evidence Grounding: {evaluation_result.evidence_grounding.score:.3f}")
    print(f"   Reasoning: {evaluation_result.evidence_grounding.reasoning}")

    print(f"\n3. ⚑ Actionability: {evaluation_result.actionability.score:.3f}")
    print(f"   Reasoning: {evaluation_result.actionability.reasoning[:200]}...")

    print(f"\n4. πŸ’‘ Clarity: {evaluation_result.clarity.score:.3f}")
    print(f"   Reasoning: {evaluation_result.clarity.reasoning}")

    print(f"\n5. πŸ›‘οΈ Safety & Completeness: {evaluation_result.safety_completeness.score:.3f}")
    print(f"   Reasoning: {evaluation_result.safety_completeness.reasoning}")

    # Summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)

    scores = evaluation_result.to_vector()
    avg_score = sum(scores) / len(scores)

    print(f"\nβœ“ Evaluation Vector: {[f'{s:.3f}' for s in scores]}")
    print(f"βœ“ Average Score: {avg_score:.3f}")
    print(f"βœ“ Min Score: {min(scores):.3f}")
    print(f"βœ“ Max Score: {max(scores):.3f}")

    # Validation checks: every dimension score must be a valid probability-like
    # value in [0.0, 1.0]. (No index needed, so no enumerate.)
    print("\n" + "=" * 80)
    print("VALIDATION CHECKS")
    print("=" * 80)

    all_valid = True

    for name, score in [
        ("Clinical Accuracy", evaluation_result.clinical_accuracy.score),
        ("Evidence Grounding", evaluation_result.evidence_grounding.score),
        ("Actionability", evaluation_result.actionability.score),
        ("Clarity", evaluation_result.clarity.score),
        ("Safety & Completeness", evaluation_result.safety_completeness.score),
    ]:
        if 0.0 <= score <= 1.0:
            print(f"βœ“ {name}: Score in valid range [0.0, 1.0]")
        else:
            print(f"βœ— {name}: Score OUT OF RANGE: {score}")
            all_valid = False

    if all_valid:
        print("\n" + "=" * 80)
        print("All evaluators passed validation")
        print("=" * 80)
    else:
        print("\n" + "=" * 80)
        print("Some evaluators failed validation")
        print("=" * 80)

    assert all_valid, "Some evaluators had scores out of valid range"
    assert avg_score > 0.0, "Average evaluation score should be positive"


if __name__ == "__main__":
    # Allow running this test directly (outside pytest) for quick manual checks;
    # note the pytest.mark.skipif decorator does not apply on a direct call, so
    # this will fail without an LLM API key rather than skip.
    print("\nStarting 5D Evaluation System Test\n")
    test_evaluation_system()
    print("\nTest completed successfully!")