"""
tests/test_preprocessing.py
=============================
ALZDETECT-AI — Unit tests for preprocessing/cleaner.py + chunker.py

WHAT: Tests cleaning functions and chunking logic.
WHY:  Cleaner fixes 3 known problems — each fix must be tested.
      Chunker splits abstracts — overlap and edge cases must work.
WHO:  Run after any change to cleaner.py or chunker.py
WHEN: Before every commit that touches preprocessing/

Run:
    pytest tests/test_preprocessing.py -v
"""

import pytest
from pydantic import ValidationError

from preprocessing.cleaner import (
    strip_html,
    infer_year,
    extract_keywords,
    CleanedPaper,
)
from preprocessing.chunker import (
    split_into_chunks,
    PaperChunk,
)


# ── strip_html tests ──────────────────────────────────────────────

def test_strip_html_italic() -> None:
    """<i>APOE</i> → 'APOE', was_html=True"""
    result, was_html = strip_html("<i>APOE</i> gene")
    assert "APOE" in result
    # Check for the tag itself: an empty-string membership test would be
    # true for every string and could never pass.
    assert "<i>" not in result
    assert was_html is True


def test_strip_html_bold() -> None:
    """<b>results</b> → 'results', was_html=True"""
    result, was_html = strip_html("<b>results</b> show significance")
    assert "results" in result
    assert "<b>" not in result
    assert was_html is True


def test_strip_html_clean_text() -> None:
    """Plain text — no HTML — unchanged, was_html=False"""
    result, was_html = strip_html("normal text without tags")
    assert result == "normal text without tags"
    assert was_html is False


def test_strip_html_multiple_tags() -> None:
    """Multiple tags all stripped."""
    result, was_html = strip_html("<i>tau</i> and <b>amyloid</b> pathology")
    assert "tau" in result
    assert "amyloid" in result
    assert "<" not in result
    assert was_html is True


def test_strip_html_whitespace_collapsed() -> None:
    """Multiple spaces after stripping are collapsed to one."""
    # The input deliberately has double spaces around the tag so that
    # removing the tag leaves a run of spaces to collapse.
    result, _ = strip_html("word1  <b>tag</b>  word2")
    assert "  " not in result


# ── infer_year tests ──────────────────────────────────────────────

def test_infer_year_valid() -> None:
    """Valid ISO timestamp → correct year extracted."""
    year, inferred = infer_year("2026-04-30T10:00:00")
    assert year == 2026
    assert inferred is True


def test_infer_year_empty() -> None:
    """Empty string → None, not inferred."""
    year, inferred = infer_year("")
    assert year is None
    assert inferred is False


def test_infer_year_invalid() -> None:
    """Garbage string → None, not inferred."""
    year, inferred = infer_year("not-a-date")
    assert year is None
    assert inferred is False


def test_infer_year_2025() -> None:
    """2025 timestamp → year 2025."""
    year, inferred = infer_year("2025-01-15T08:30:00")
    assert year == 2025
    assert inferred is True


# ── extract_keywords tests ────────────────────────────────────────

def test_extract_keywords_returns_list() -> None:
    """Returns a list of strings."""
    abstract = (
        "Alzheimer disease biomarkers pTau217 blood plasma "
        "early detection clinical study results show sensitivity"
    )
    keywords, extracted = extract_keywords(abstract, "Alzheimer biomarkers")
    assert isinstance(keywords, list)
    # The docstring promises strings, so check the elements as well as
    # the container.
    assert all(isinstance(kw, str) for kw in keywords)
    assert extracted is True


def test_extract_keywords_max_limit() -> None:
    """Returns at most max_kw keywords."""
    abstract = " ".join([f"word{i}" for i in range(100)])
    keywords, _ = extract_keywords(abstract, "title", max_kw=5)
    assert len(keywords) <= 5


def test_extract_keywords_not_empty() -> None:
    """Non-empty abstract returns at least one keyword."""
    abstract = "Alzheimer disease biomarker detection study plasma tau"
    keywords, _ = extract_keywords(abstract, "title")
    assert len(keywords) > 0


def test_extract_keywords_excludes_stopwords() -> None:
    """Common stop words are excluded."""
    abstract = "the study was conducted with the patients in the hospital"
    keywords, _ = extract_keywords(abstract, "title")
    assert "the" not in keywords
    assert "was" not in keywords


# ── CleanedPaper validation tests ────────────────────────────────

@pytest.fixture
def valid_cleaned() -> dict:
    """Valid CleanedPaper dict — baseline."""
    return {
        "pmid": "37123456",
        "title": "Blood biomarkers for Alzheimer detection",
        "abstract": "This study examines plasma pTau217 levels "
                    "in 500 subjects showing 96% sensitivity for "
                    "detecting preclinical Alzheimer pathology in "
                    "cognitively normal older adults.",
        "authors": ["Smith, John"],
        "year": 2024,
        "keywords": ["Alzheimer", "pTau217"],
        "source_query": "plasma_ptau217_diagnosis",
        "year_inferred": False,
        "keywords_extracted": False,
        "html_stripped": False,
    }


def test_cleaned_paper_valid(valid_cleaned) -> None:
    """Valid CleanedPaper passes."""
    paper = CleanedPaper(**valid_cleaned)
    assert paper.pmid == "37123456"
    assert paper.year == 2024


def test_cleaned_paper_short_abstract_rejected(valid_cleaned) -> None:
    """Abstract under 20 words is rejected."""
    valid_cleaned["abstract"] = "Too short abstract here."
    with pytest.raises(ValidationError):
        CleanedPaper(**valid_cleaned)


def test_cleaned_paper_audit_flags(valid_cleaned) -> None:
    """Audit flags are stored correctly."""
    valid_cleaned["year_inferred"] = True
    valid_cleaned["keywords_extracted"] = True
    valid_cleaned["html_stripped"] = True
    paper = CleanedPaper(**valid_cleaned)
    assert paper.year_inferred is True
    assert paper.keywords_extracted is True
    assert paper.html_stripped is True


# ── split_into_chunks tests ───────────────────────────────────────

def test_split_short_abstract() -> None:
    """Abstract shorter than chunk_size → single chunk."""
    text = "Short abstract with just a few words about Alzheimer."
    chunks = split_into_chunks(text, chunk_size=100, overlap=10)
    assert len(chunks) == 1
    assert chunks[0] == text


def test_split_long_abstract() -> None:
    """Abstract longer than chunk_size → multiple chunks."""
    words = ["word"] * 200
    text = " ".join(words)
    chunks = split_into_chunks(text, chunk_size=50, overlap=10)
    assert len(chunks) > 1


def test_split_overlap() -> None:
    """Chunks share overlapping words."""
    words = [f"word{i}" for i in range(100)]
    text = " ".join(words)
    chunks = split_into_chunks(text, chunk_size=20, overlap=5)
    # Last words of chunk 0 should appear in start of chunk 1
    last_words_chunk0 = set(chunks[0].split()[-5:])
    first_words_chunk1 = set(chunks[1].split()[:5])
    assert len(last_words_chunk0 & first_words_chunk1) > 0


def test_split_no_empty_chunks() -> None:
    """No empty chunks produced."""
    text = " ".join([f"word{i}" for i in range(50)])
    chunks = split_into_chunks(text, chunk_size=10, overlap=2)
    assert all(len(c.strip()) > 0 for c in chunks)


# ── PaperChunk validation tests ───────────────────────────────────

@pytest.fixture
def valid_chunk() -> dict:
    """Valid PaperChunk dict — baseline."""
    return {
        "chunk_id": "37123456_chunk_0",
        "pmid": "37123456",
        "chunk_idx": 0,
        "text": "This study examines plasma pTau217 as an early "
                "biomarker for Alzheimer disease detection.",
        "title": "Blood biomarkers for Alzheimer detection",
        "year": 2024,
        "keywords": ["Alzheimer", "pTau217"],
        "source": "pubmed",
        "source_query": "plasma_ptau217_diagnosis",
    }


def test_paper_chunk_valid(valid_chunk) -> None:
    """Valid PaperChunk passes."""
    chunk = PaperChunk(**valid_chunk)
    assert chunk.chunk_id == "37123456_chunk_0"
    assert chunk.pmid == "37123456"


def test_paper_chunk_bad_id_rejected(valid_chunk) -> None:
    """chunk_id without '_chunk_' is rejected."""
    valid_chunk["chunk_id"] = "bad-format-id"
    with pytest.raises(ValidationError) as exc:
        PaperChunk(**valid_chunk)
    # Inspect the message only after the context manager exits; a
    # statement placed after the raising call inside the block never runs.
    assert "_chunk_" in str(exc.value)


def test_paper_chunk_short_text_rejected(valid_chunk) -> None:
    """Text under 5 words is rejected."""
    valid_chunk["text"] = "Too short"
    with pytest.raises(ValidationError):
        PaperChunk(**valid_chunk)


def test_paper_chunk_word_count(valid_chunk) -> None:
    """word_count is auto-computed correctly."""
    chunk = PaperChunk(**valid_chunk)
    assert chunk.word_count == len(valid_chunk["text"].split())


def test_paper_chunk_to_pinecone_record(valid_chunk) -> None:
    """to_pinecone_record returns correct structure."""
    chunk = PaperChunk(**valid_chunk)
    record = chunk.to_pinecone_record()
    assert record["id"] == "37123456_chunk_0"
    assert "metadata" in record
    assert record["metadata"]["pmid"] == "37123456"
    assert record["metadata"]["source"] == "pubmed"