"""
tests/test_generation.py
=========================
ALZDETECT-AI — Unit tests for generation/rag_pipeline.py

WHAT: Tests AlzheimerAnswer Pydantic model validation
      and answer parsing logic.
WHY:  Claude output is non-deterministic — the parser
      must handle good JSON, bad JSON, and missing fields.
WHO:  Run after any change to rag_pipeline.py
WHEN: Before every commit that touches generation/

Run:
    pytest tests/test_generation.py -v
"""
import pytest
from pydantic import ValidationError

from generation.rag_pipeline import AlzheimerAnswer, RAGPipeline
# ── AlzheimerAnswer validation tests ──────────────────────────────
@pytest.fixture
def valid_answer() -> dict:
    """Return a baseline valid AlzheimerAnswer payload.

    BUG FIX: every test below requests ``valid_answer`` by parameter
    name, which is pytest fixture injection — without the
    ``@pytest.fixture`` decorator, collection fails with
    "fixture 'valid_answer' not found". The decorator also gives each
    test a fresh dict, so in-place mutation in one test cannot leak
    into another.
    """
    return {
        "summary": "Plasma pTau217 is a strong early biomarker "
        "for Alzheimer's disease with 96% sensitivity.",
        "key_findings": [
            "pTau217 detects AD 10 years before symptoms (PMID: 37123456)",
            "GFAP correlates with amyloid burden (PMID: 37234567)",
        ],
        "pmids_cited": ["37123456", "37234567"],
        "confidence": "high",
        "limitations": "Studies limited to older adults over 65.",
    }
def test_answer_valid(valid_answer):
    """A fully-populated payload constructs without error."""
    parsed = AlzheimerAnswer(**valid_answer)
    assert parsed.confidence == "high"
    assert len(parsed.key_findings) == 2
def test_answer_empty_summary_rejected(valid_answer):
    """An empty summary string must fail model validation."""
    payload = {**valid_answer, "summary": ""}
    with pytest.raises(ValidationError):
        AlzheimerAnswer(**payload)
def test_answer_short_summary_rejected(valid_answer):
    """A summary below the 10-character minimum must fail validation."""
    payload = {**valid_answer, "summary": "Too short"}
    with pytest.raises(ValidationError):
        AlzheimerAnswer(**payload)
@pytest.mark.parametrize("level", ["high", "medium", "low"])
def test_answer_confidence_levels(valid_answer, level):
    """Each allowed confidence level validates and round-trips unchanged.

    Replaces three copy-pasted tests (high/medium/low) with one
    parametrized test: the same three checks still run, but a new
    level only needs a new parameter, not a new function.
    """
    valid_answer["confidence"] = level
    answer = AlzheimerAnswer(**valid_answer)
    assert answer.confidence == level
def test_answer_confidence_invalid_defaults_medium(valid_answer):
    """An unrecognized confidence value coerces to 'medium' — no crash.

    Claude may emit off-vocabulary labels like 'moderate' or
    'uncertain'; the model must absorb them rather than raise.
    """
    valid_answer["confidence"] = "maybe"
    assert AlzheimerAnswer(**valid_answer).confidence == "medium"
def test_answer_confidence_uppercase_normalized(valid_answer):
    """Confidence is case-normalized: 'HIGH' becomes 'high'."""
    valid_answer["confidence"] = "HIGH"
    assert AlzheimerAnswer(**valid_answer).confidence == "high"
def test_answer_empty_key_findings(valid_answer):
    """A model with zero key findings is still valid."""
    valid_answer["key_findings"] = []
    assert AlzheimerAnswer(**valid_answer).key_findings == []
def test_answer_empty_pmids(valid_answer):
    """A model citing no PMIDs is still valid."""
    valid_answer["pmids_cited"] = []
    assert AlzheimerAnswer(**valid_answer).pmids_cited == []
def test_answer_no_limitations(valid_answer):
    """limitations is optional and may be explicitly None."""
    valid_answer["limitations"] = None
    assert AlzheimerAnswer(**valid_answer).limitations is None
def test_answer_has_disclaimer(valid_answer):
    """Every answer carries the research-only disclaimer by default."""
    built = AlzheimerAnswer(**valid_answer)
    assert "research purposes only" in built.disclaimer
def test_answer_to_display(valid_answer):
    """to_display() renders summary, findings, PMIDs and confidence."""
    rendered = AlzheimerAnswer(**valid_answer).to_display()
    for fragment in ("Summary", "Key Findings", "37123456", "high"):
        assert fragment in rendered
# ── _parse_answer tests ───────────────────────────────────────────
@pytest.fixture
def pipeline():
    """RAGPipeline instance for testing _parse_answer.

    We only exercise the parsing method — no API calls are made.

    BUG FIX: the parse tests below request ``pipeline`` by parameter
    name (pytest fixture injection); without ``@pytest.fixture`` the
    suite fails collection with "fixture 'pipeline' not found".
    """
    return RAGPipeline()
def test_parse_valid_json(pipeline):
    """Well-formed Claude JSON parses into an AlzheimerAnswer."""
    raw = '''{
        "summary": "pTau217 is a strong biomarker for early Alzheimer detection.",
        "key_findings": ["pTau217 shows 96% sensitivity (PMID: 37123456)"],
        "pmids_cited": ["37123456"],
        "confidence": "high",
        "limitations": null
    }'''
    parsed = pipeline._parse_answer(raw, retrieved_pmids=["37123456"])
    assert parsed.confidence == "high"
    assert "37123456" in parsed.pmids_cited
def test_parse_bad_json_fallback(pipeline):
    """Non-JSON text yields the low-confidence fallback — no crash.

    Worst case: Claude replies with prose instead of JSON.
    """
    raw = "I cannot answer this question based on the provided papers."
    result = pipeline._parse_answer(raw, retrieved_pmids=[])
    assert result.confidence == "low"
    assert result.summary != ""
def test_parse_empty_string_fallback(pipeline):
    """An empty response still produces an AlzheimerAnswer (fallback)."""
    result = pipeline._parse_answer("", retrieved_pmids=[])
    assert isinstance(result, AlzheimerAnswer)
def test_parse_strips_markdown(pipeline):
    """JSON wrapped in a ```json fenced block is still parsed."""
    raw = '''```json
{
    "summary": "pTau217 predicts Alzheimer disease progression reliably.",
    "key_findings": ["Finding 1 with citation"],
    "pmids_cited": ["37123456"],
    "confidence": "medium",
    "limitations": null
}
```'''
    parsed = pipeline._parse_answer(raw, retrieved_pmids=["37123456"])
    assert parsed.confidence == "medium"
def test_parse_pmid_validation(pipeline):
    """Hallucinated PMIDs are filtered out of pmids_cited.

    Worst case: Claude invents a PMID that was never retrieved.
    """
    raw = '''{
        "summary": "This is a valid summary about Alzheimer disease research.",
        "key_findings": ["Real finding (PMID: 37123456)"],
        "pmids_cited": ["37123456", "99999999"],
        "confidence": "high",
        "limitations": null
    }'''
    # 99999999 was never retrieved, so the parser must drop it.
    result = pipeline._parse_answer(raw, retrieved_pmids=["37123456"])
    assert "37123456" in result.pmids_cited
    assert "99999999" not in result.pmids_cited
def test_parse_json_with_extra_fields(pipeline):
    """Unknown keys in the Claude JSON are ignored gracefully."""
    raw = '''{
        "summary": "Valid summary about Alzheimer biomarker research findings.",
        "key_findings": ["Finding with evidence"],
        "pmids_cited": ["37123456"],
        "confidence": "high",
        "limitations": null,
        "extra_field": "this should be ignored"
    }'''
    result = pipeline._parse_answer(raw, retrieved_pmids=["37123456"])
    assert result.confidence == "high"