File size: 12,599 Bytes
3ca1d38
 
 
 
 
 
 
 
 
 
696f787
 
 
3ca1d38
 
 
 
 
 
 
 
 
9659593
3ca1d38
696f787
3ca1d38
 
 
 
 
 
 
 
 
 
 
 
696f787
3ca1d38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9659593
3ca1d38
 
696f787
3ca1d38
 
 
696f787
3ca1d38
696f787
3ca1d38
 
696f787
3ca1d38
 
 
696f787
3ca1d38
696f787
3ca1d38
 
696f787
3ca1d38
 
 
696f787
3ca1d38
696f787
3ca1d38
 
696f787
3ca1d38
 
 
696f787
3ca1d38
 
 
696f787
3ca1d38
 
 
696f787
3ca1d38
 
 
 
 
 
696f787
3ca1d38
 
 
696f787
3ca1d38
696f787
3ca1d38
 
696f787
3ca1d38
 
 
696f787
3ca1d38
696f787
3ca1d38
696f787
3ca1d38
 
 
696f787
3ca1d38
696f787
3ca1d38
696f787
3ca1d38
 
 
696f787
3ca1d38
696f787
3ca1d38
 
 
 
 
 
 
 
696f787
3ca1d38
 
 
696f787
3ca1d38
 
 
696f787
3ca1d38
 
 
696f787
3ca1d38
696f787
3ca1d38
696f787
3ca1d38
 
 
 
 
 
 
 
 
 
 
9659593
3ca1d38
 
696f787
3ca1d38
 
 
696f787
3ca1d38
9659593
3ca1d38
696f787
3ca1d38
 
 
696f787
3ca1d38
9659593
3ca1d38
 
 
 
696f787
3ca1d38
696f787
3ca1d38
 
 
 
 
 
 
 
9659593
3ca1d38
 
696f787
3ca1d38
696f787
3ca1d38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
696f787
3ca1d38
 
 
696f787
3ca1d38
 
696f787
3ca1d38
 
696f787
3ca1d38
 
 
696f787
3ca1d38
696f787
3ca1d38
 
696f787
3ca1d38
 
 
696f787
3ca1d38
 
 
 
 
696f787
3ca1d38
696f787
3ca1d38
696f787
9659593
 
 
3ca1d38
 
 
696f787
3ca1d38
 
696f787
3ca1d38
 
696f787
3ca1d38
 
 
696f787
3ca1d38
 
 
 
 
 
 
696f787
3ca1d38
696f787
3ca1d38
 
 
 
 
 
 
9659593
3ca1d38
 
696f787
3ca1d38
 
 
696f787
3ca1d38
696f787
3ca1d38
 
 
696f787
3ca1d38
 
 
 
 
 
 
9659593
3ca1d38
 
696f787
3ca1d38
 
 
 
696f787
3ca1d38
 
 
 
696f787
 
 
3ca1d38
 
 
 
 
 
 
 
 
9659593
3ca1d38
9659593
3ca1d38
 
 
696f787
3ca1d38
 
 
696f787
3ca1d38
696f787
3ca1d38
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
"""
MediGuard AI — Integration Tests

End-to-end tests verifying the complete analysis workflow.
These tests ensure all components work together correctly.

Run with: pytest tests/test_integration.py -v
"""

import os
from typing import Any

import pytest

# Force deterministic evaluation mode before any src.evaluation import reads it,
# so scored tests are reproducible across runs (see test_deterministic_clinical_accuracy).
os.environ["EVALUATION_DETERMINISTIC"] = "true"


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------


@pytest.fixture
def sample_biomarkers() -> dict[str, float]:
    """Biomarker panel typical of a diabetic patient (elevated glucose/HbA1c/lipids)."""
    return dict(
        Glucose=145,
        HbA1c=7.2,
        Cholesterol=220,
        LDL=140,
        HDL=45,
        Triglycerides=180,
    )


@pytest.fixture
def normal_biomarkers() -> dict[str, float]:
    """Biomarker panel with every value inside the healthy reference range."""
    return dict(
        Glucose=90,
        HbA1c=5.2,
        Cholesterol=180,
        LDL=90,
        HDL=55,
        Triglycerides=120,
    )


# ---------------------------------------------------------------------------
# Shared Utilities Tests
# ---------------------------------------------------------------------------


class TestBiomarkerParsing:
    """Parsing of biomarker panels from JSON, key/value pairs, and free text."""

    def test_parse_json_input(self):
        """A valid JSON payload yields the exact numeric values."""
        from src.shared_utils import parse_biomarkers

        parsed = parse_biomarkers('{"Glucose": 140, "HbA1c": 7.5}')

        assert parsed["Glucose"] == 140
        assert parsed["HbA1c"] == 7.5

    def test_parse_key_value_format(self):
        """Comma-separated key:value pairs parse to the same values as JSON."""
        from src.shared_utils import parse_biomarkers

        parsed = parse_biomarkers("Glucose: 140, HbA1c: 7.5")

        assert parsed["Glucose"] == 140
        assert parsed["HbA1c"] == 7.5

    def test_parse_natural_language(self):
        """Free text with units still surfaces the glucose reading."""
        from src.shared_utils import parse_biomarkers

        parsed = parse_biomarkers("glucose 140 mg/dL and hemoglobin 13.5 g/dL")

        assert "Glucose" in parsed or "glucose" in parsed
        assert 140 in parsed.values()

    def test_normalize_biomarker_aliases(self):
        """Common aliases map onto their canonical biomarker names."""
        from src.shared_utils import normalize_biomarker_name

        aliases = {"a1c": "HbA1c", "fasting glucose": "Glucose", "ldl-c": "LDL"}
        for alias, canonical in aliases.items():
            assert normalize_biomarker_name(alias) == canonical

    def test_empty_input(self):
        """Blank or whitespace-only input parses to an empty dict."""
        from src.shared_utils import parse_biomarkers

        for text in ("", "  "):
            assert parse_biomarkers(text) == {}


class TestDiseaseScoring:
    """Rule-based disease scoring heuristics over a biomarker panel."""

    def test_diabetes_scoring_diabetic(self, sample_biomarkers):
        """Elevated glucose and HbA1c should produce a strong diabetes signal."""
        from src.shared_utils import score_disease_diabetes

        confidence, severity = score_disease_diabetes(sample_biomarkers)

        assert confidence > 0.5
        assert severity in ["moderate", "high"]

    def test_diabetes_scoring_normal(self, normal_biomarkers):
        """Normal biomarkers should score well below the diabetes threshold."""
        from src.shared_utils import score_disease_diabetes

        confidence, _severity = score_disease_diabetes(normal_biomarkers)

        assert confidence < 0.3

    def test_dyslipidemia_scoring(self, sample_biomarkers):
        """An elevated lipid panel should yield a non-trivial dyslipidemia score."""
        from src.shared_utils import score_disease_dyslipidemia

        confidence, _severity = score_disease_dyslipidemia(sample_biomarkers)

        assert confidence > 0.3

    def test_primary_prediction(self, sample_biomarkers):
        """The primary prediction exposes disease, confidence, and severity."""
        from src.shared_utils import get_primary_prediction

        prediction = get_primary_prediction(sample_biomarkers)

        for field in ("disease", "confidence", "severity"):
            assert field in prediction
        assert prediction["confidence"] > 0


class TestBiomarkerFlagging:
    """Classification and flagging of individual biomarker values."""

    def test_classify_abnormal_biomarker(self):
        """Values above, below, and inside the reference range get labelled."""
        from src.shared_utils import classify_biomarker

        cases = [(200, "high"), (50, "low"), (90, "normal")]
        for value, expected in cases:
            assert classify_biomarker("Glucose", value) == expected

    def test_flag_biomarkers(self, sample_biomarkers):
        """Every input biomarker is flagged and each flag carries all fields."""
        from src.shared_utils import flag_biomarkers

        flags = flag_biomarkers(sample_biomarkers)

        # One flag entry per input biomarker.
        assert len(flags) == len(sample_biomarkers)

        for flag in flags:
            for field in ("name", "value", "status"):
                assert field in flag


# ---------------------------------------------------------------------------
# Retrieval Tests
# ---------------------------------------------------------------------------


class TestRetrieverInterface:
    """The unified retriever interface and its FAISS backend."""

    def test_retrieval_result_dataclass(self):
        """RetrievalResult preserves the fields it was constructed with."""
        from src.services.retrieval.interface import RetrievalResult

        hit = RetrievalResult(
            doc_id="test-123", content="Test content about diabetes.", score=0.85, metadata={"source": "test.pdf"}
        )

        assert hit.doc_id == "test-123"
        assert hit.score == 0.85
        assert "diabetes" in hit.content

    @pytest.mark.skipif(
        not os.path.exists("data/vector_stores/medical_knowledge.faiss"), reason="FAISS index not available"
    )
    def test_faiss_retriever_loads(self):
        """A healthy, non-empty retriever loads from the local FAISS index."""
        from src.services.retrieval import make_retriever

        retriever = make_retriever(backend="faiss")

        assert retriever.health()
        assert retriever.doc_count() > 0


# ---------------------------------------------------------------------------
# Evaluation Tests
# ---------------------------------------------------------------------------


class TestEvaluationSystem:
    """Tests for the 5D evaluation system.

    Covers score validation, the programmatic (non-LLM) evaluators, the
    deterministic LLM-backed path, and result aggregation.
    """

    @pytest.fixture
    def sample_response(self) -> dict[str, Any]:
        """Sample analysis response containing every section the evaluators read."""
        return {
            "patient_summary": {
                "narrative": "Patient shows elevated blood glucose and HbA1c indicating diabetes.",
                "primary_finding": "Type 2 Diabetes",
            },
            "prediction_explanation": {
                "key_drivers": [
                    {"biomarker": "Glucose", "evidence": "Elevated at 145 mg/dL"},
                    {"biomarker": "HbA1c", "evidence": "7.2% indicates poor glycemic control"},
                ],
                "pdf_references": [
                    {"source": "guidelines.pdf", "page": 12},
                    {"source": "diabetes.pdf", "page": 45},
                ],
            },
            "clinical_recommendations": {
                "immediate_actions": ["Confirm HbA1c", "Schedule follow-up"],
                "lifestyle_changes": ["Dietary modifications", "Regular exercise"],
                "monitoring": ["Weekly glucose checks"],
            },
            "biomarker_flags": [
                {"name": "Glucose", "value": 145, "status": "high"},
                {"name": "HbA1c", "value": 7.2, "status": "high"},
            ],
            "key_findings": ["Diabetes indicators present"],
        }

    def test_graded_score_validation(self):
        """GradedScore accepts scores in [0, 1] and rejects out-of-range values."""
        from src.evaluation.evaluators import GradedScore

        valid = GradedScore(score=0.75, reasoning="Test")
        assert valid.score == 0.75

        # Scores above 1.0 must fail validation.
        with pytest.raises(ValueError):
            GradedScore(score=1.5, reasoning="Invalid")

    def test_evidence_grounding_programmatic(self, sample_response):
        """Evidence grounding is scored without an LLM and mentions citations."""
        from src.evaluation.evaluators import evaluate_evidence_grounding

        result = evaluate_evidence_grounding(sample_response)

        assert 0 <= result.score <= 1
        assert "Citations" in result.reasoning or "citations" in result.reasoning.lower()

    def test_safety_completeness_programmatic(self, sample_response, sample_biomarkers):
        """Safety completeness is scored programmatically from the response."""
        from src.evaluation.evaluators import evaluate_safety_completeness

        # Add required field for safety evaluation
        sample_response["confidence_assessment"] = {
            "limitations": ["Requires clinical confirmation"],
            "confidence_score": 0.75,
        }

        result = evaluate_safety_completeness(sample_response, sample_biomarkers)

        assert 0 <= result.score <= 1

    @pytest.mark.skipif(
        not os.environ.get("GROQ_API_KEY") and not os.environ.get("GOOGLE_API_KEY"), reason="No LLM API key available"
    )
    def test_deterministic_clinical_accuracy(self, sample_response):
        """Clinical accuracy evaluation is deterministic when the env flag is set."""
        from src.evaluation.evaluators import evaluate_clinical_accuracy

        # EVALUATION_DETERMINISTIC=true set at top of file
        result = evaluate_clinical_accuracy(sample_response, "Test context")

        assert 0 <= result.score <= 1
        assert "[DETERMINISTIC]" in result.reasoning

    def test_evaluation_result_average(self):
        """average_score() is the arithmetic mean of the five dimension scores.

        Note: this test needs no fixtures — the unused sample_response /
        sample_biomarkers parameters were removed so pytest no longer builds
        them for nothing.
        """
        from src.evaluation.evaluators import EvaluationResult, GradedScore

        result = EvaluationResult(
            clinical_accuracy=GradedScore(score=0.8, reasoning="Good"),
            evidence_grounding=GradedScore(score=0.7, reasoning="Good"),
            actionability=GradedScore(score=0.9, reasoning="Good"),
            clarity=GradedScore(score=0.6, reasoning="OK"),
            safety_completeness=GradedScore(score=0.8, reasoning="Good"),
        )

        avg = result.average_score()

        assert 0.7 < avg < 0.8  # (0.8+0.7+0.9+0.6+0.8)/5 = 0.76


# ---------------------------------------------------------------------------
# API Route Tests
# ---------------------------------------------------------------------------


class TestAPIRoutes:
    """Import-time smoke tests for the FastAPI routers (no server required)."""

    def test_analyze_router_import(self):
        """The analyze module imports cleanly and exposes a router object."""
        from src.routers import analyze

        assert hasattr(analyze, "router")

    def test_health_check_import(self):
        """The health module imports cleanly and exposes a router object."""
        from src.routers import health

        assert hasattr(health, "router")


# ---------------------------------------------------------------------------
# HuggingFace App Tests
# ---------------------------------------------------------------------------


class TestHuggingFaceApp:
    """Tests for HuggingFace Gradio app components."""

    def test_shared_utils_import_in_hf(self):
        """Shared utilities must be importable the same way the HF app does it."""
        import sys
        from pathlib import Path

        # Mirror the HF app's bootstrap: put the project root on sys.path.
        root = str(Path(__file__).parent.parent)
        if root not in sys.path:
            sys.path.insert(0, root)

        from src.shared_utils import parse_biomarkers

        # A trivial parse should succeed once the import works.
        parsed = parse_biomarkers("Glucose: 140")
        assert "Glucose" in parsed or len(parsed) > 0


# ---------------------------------------------------------------------------
# Workflow Tests
# ---------------------------------------------------------------------------


@pytest.mark.skipif(
    not os.environ.get("GROQ_API_KEY") and not os.environ.get("GOOGLE_API_KEY"), reason="No LLM API key available"
)
class TestWorkflow:
    """Tests requiring LLM API access."""

    def test_create_guild(self):
        """create_guild() builds the ClinicalInsightGuild without raising."""
        from src.workflow import create_guild

        assert create_guild() is not None


if __name__ == "__main__":
    # Allow running this file directly (python tests/test_integration.py)
    # instead of via the pytest CLI.
    pytest.main([__file__, "-v"])