"""
tests/test_preprocessing.py
=============================
ALZDETECT-AI — Unit tests for preprocessing/cleaner.py + chunker.py
WHAT: Tests cleaning functions and chunking logic.
WHY: Cleaner fixes 3 known problems — each fix must be tested.
     Chunker splits abstracts — overlap and edge cases must work.
WHO: Run after any change to cleaner.py or chunker.py
WHEN: Before every commit that touches preprocessing/
Run:
pytest tests/test_preprocessing.py -v
"""
import pytest
from pydantic import ValidationError
from preprocessing.cleaner import (
strip_html,
infer_year,
extract_keywords,
CleanedPaper,
)
from preprocessing.chunker import (
split_into_chunks,
PaperChunk,
)
# ── strip_html tests ──────────────────────────────────────────────
def test_strip_html_italic():
    """Italic markup is removed: '<i>APOE</i>' -> 'APOE', with was_html True."""
    outcome = strip_html("<i>APOE</i> gene")
    cleaned, had_markup = outcome
    # The wrapped word must survive while the tag itself disappears.
    assert "APOE" in cleaned
    assert "<i>" not in cleaned
    assert had_markup is True
def test_strip_html_bold():
    """Bold markup is removed: '<b>results</b>' -> 'results', with was_html True."""
    outcome = strip_html("<b>results</b> show significance")
    cleaned, had_markup = outcome
    # The wrapped word must survive while the tag itself disappears.
    assert "results" in cleaned
    assert "<b>" not in cleaned
    assert had_markup is True
def test_strip_html_clean_text():
    """Tag-free input comes back unchanged and is reported with was_html False."""
    plain = "normal text without tags"
    cleaned, had_markup = strip_html(plain)
    assert cleaned == "normal text without tags"
    assert had_markup is False
def test_strip_html_multiple_tags():
    """Every tag is removed when the input mixes several kinds of markup."""
    markup = "<i>tau</i> and <b>amyloid</b> pathology"
    cleaned, had_markup = strip_html(markup)
    # Both wrapped words remain ...
    assert "tau" in cleaned
    assert "amyloid" in cleaned
    # ... and no opening angle bracket is left anywhere in the output.
    assert "<" not in cleaned
    assert had_markup is True
def test_strip_html_whitespace_collapsed():
    """Runs of spaces left behind by tag removal are collapsed to one space.

    The input deliberately carries doubled spaces around the tag so that a
    cleaner which only deletes tags (without normalising whitespace) would
    leave a double space in the output and fail this test.
    """
    result, _ = strip_html("word1  <i>tag</i>  word2")
    # Check for a *double* space: a single space legitimately separates the
    # words, so asserting that " " is absent could never pass.
    assert "  " not in result
# ── infer_year tests ──────────────────────────────────────────────
def test_infer_year_valid():
    """A well-formed ISO timestamp yields its year and an inferred flag of True."""
    timestamp = "2026-04-30T10:00:00"
    parsed_year, was_inferred = infer_year(timestamp)
    assert parsed_year == 2026
    assert was_inferred is True
def test_infer_year_empty():
    """An empty string yields (None, False): no year and nothing inferred."""
    outcome = infer_year("")
    parsed_year, was_inferred = outcome
    assert parsed_year is None
    assert was_inferred is False
def test_infer_year_invalid():
    """An unparseable string yields (None, False): no year and nothing inferred."""
    garbage = "not-a-date"
    parsed_year, was_inferred = infer_year(garbage)
    assert parsed_year is None
    assert was_inferred is False
def test_infer_year_2025():
    """A 2025 timestamp yields year 2025 with the inferred flag set."""
    timestamp = "2025-01-15T08:30:00"
    parsed_year, was_inferred = infer_year(timestamp)
    assert parsed_year == 2025
    assert was_inferred is True
# ── extract_keywords tests ────────────────────────────────────────
def test_extract_keywords_returns_list():
    """extract_keywords returns a list of strings plus an extracted flag of True.

    The element-type check backs up the "list of strings" part of the
    contract; checking only the container type would let a list of
    non-string items slip through.
    """
    abstract = (
        "Alzheimer disease biomarkers pTau217 blood plasma "
        "early detection clinical study results show sensitivity"
    )
    keywords, extracted = extract_keywords(abstract, "Alzheimer biomarkers")
    assert isinstance(keywords, list)
    # Every keyword must itself be a string, not just the container a list.
    assert all(isinstance(keyword, str) for keyword in keywords)
    assert extracted is True
def test_extract_keywords_max_limit():
    """The number of keywords never exceeds the max_kw argument."""
    # 100 distinct tokens give the extractor far more candidates than the cap.
    tokens = [f"word{i}" for i in range(100)]
    long_abstract = " ".join(tokens)
    found, _ = extract_keywords(long_abstract, "title", max_kw=5)
    assert len(found) <= 5
def test_extract_keywords_not_empty():
    """A non-empty abstract produces at least one keyword."""
    found, _ = extract_keywords(
        "Alzheimer disease biomarker detection study plasma tau",
        "title",
    )
    assert len(found) > 0
def test_extract_keywords_excludes_stopwords():
    """The stop words 'the' and 'was' do not appear among the keywords."""
    stopword_heavy = "the study was conducted with the patients in the hospital"
    found, _ = extract_keywords(stopword_heavy, "title")
    assert "the" not in found
    assert "was" not in found
# ── CleanedPaper validation tests ────────────────────────────────
@pytest.fixture
def valid_cleaned() -> dict:
    """Baseline keyword arguments for constructing a valid CleanedPaper.

    A fresh dict is built on every call, so tests may overwrite individual
    keys to create variants.
    """
    abstract_text = (
        "This study examines plasma pTau217 levels "
        "in 500 subjects showing 96% sensitivity for "
        "detecting preclinical Alzheimer pathology in "
        "cognitively normal older adults."
    )
    payload = {
        "pmid": "37123456",
        "title": "Blood biomarkers for Alzheimer detection",
        "abstract": abstract_text,
        "authors": ["Smith, John"],
        "year": 2024,
        "keywords": ["Alzheimer", "pTau217"],
        "source_query": "plasma_ptau217_diagnosis",
        # Audit flags all start out False in the baseline.
        "year_inferred": False,
        "keywords_extracted": False,
        "html_stripped": False,
    }
    return payload
def test_cleaned_paper_valid(valid_cleaned):
    """The baseline dict builds a CleanedPaper exposing its pmid and year."""
    model = CleanedPaper(**valid_cleaned)
    assert model.pmid == "37123456"
    assert model.year == 2024
def test_cleaned_paper_short_abstract_rejected(valid_cleaned):
    """A four-word abstract makes CleanedPaper raise ValidationError."""
    too_short = {**valid_cleaned, "abstract": "Too short abstract here."}
    with pytest.raises(ValidationError):
        CleanedPaper(**too_short)
def test_cleaned_paper_audit_flags(valid_cleaned):
    """Audit flags set to True in the input are True on the built model."""
    valid_cleaned.update(
        year_inferred=True,
        keywords_extracted=True,
        html_stripped=True,
    )
    model = CleanedPaper(**valid_cleaned)
    assert model.year_inferred is True
    assert model.keywords_extracted is True
    assert model.html_stripped is True
# ── split_into_chunks tests ───────────────────────────────────────
def test_split_short_abstract():
    """Text shorter than chunk_size comes back as one chunk equal to the input."""
    text = "Short abstract with just a few words about Alzheimer."
    pieces = split_into_chunks(text, chunk_size=100, overlap=10)
    assert len(pieces) == 1
    only_piece = pieces[0]
    assert only_piece == text
def test_split_long_abstract():
    """Text longer than chunk_size is split into more than one chunk."""
    # 200 words against a chunk_size of 50 cannot fit in a single chunk.
    text = " ".join(["word"] * 200)
    pieces = split_into_chunks(text, chunk_size=50, overlap=10)
    assert len(pieces) > 1
def test_split_overlap():
    """Consecutive chunks share at least one word across their boundary."""
    # Distinct tokens make any shared word attributable to real overlap.
    text = " ".join(f"word{i}" for i in range(100))
    pieces = split_into_chunks(text, chunk_size=20, overlap=5)
    # Compare the tail of the first chunk with the head of the second.
    tail_of_first = set(pieces[0].split()[-5:])
    head_of_second = set(pieces[1].split()[:5])
    shared = tail_of_first.intersection(head_of_second)
    assert len(shared) > 0
def test_split_no_empty_chunks():
    """Every produced chunk contains non-whitespace content."""
    tokens = [f"word{i}" for i in range(50)]
    pieces = split_into_chunks(" ".join(tokens), chunk_size=10, overlap=2)
    for piece in pieces:
        assert len(piece.strip()) > 0
# ── PaperChunk validation tests ───────────────────────────────────
@pytest.fixture
def valid_chunk() -> dict:
    """Baseline keyword arguments for constructing a valid PaperChunk.

    A fresh dict is built on every call, so tests may overwrite individual
    keys to create variants.
    """
    chunk_text = (
        "This study examines plasma pTau217 as an early "
        "biomarker for Alzheimer disease detection."
    )
    payload = {
        "chunk_id": "37123456_chunk_0",
        "pmid": "37123456",
        "chunk_idx": 0,
        "text": chunk_text,
        "title": "Blood biomarkers for Alzheimer detection",
        "year": 2024,
        "keywords": ["Alzheimer", "pTau217"],
        "source": "pubmed",
        "source_query": "plasma_ptau217_diagnosis",
    }
    return payload
def test_paper_chunk_valid(valid_chunk):
    """The baseline dict builds a PaperChunk exposing its chunk_id and pmid."""
    model = PaperChunk(**valid_chunk)
    assert model.chunk_id == "37123456_chunk_0"
    assert model.pmid == "37123456"
def test_paper_chunk_bad_id_rejected(valid_chunk):
    """A chunk_id lacking '_chunk_' raises a ValidationError that mentions it."""
    valid_chunk["chunk_id"] = "bad-format-id"
    with pytest.raises(ValidationError) as excinfo:
        PaperChunk(**valid_chunk)
    # Checked after the `with` block: statements following the raising call
    # inside the block would never execute.
    message = str(excinfo.value)
    assert "_chunk_" in message
def test_paper_chunk_short_text_rejected(valid_chunk):
    """A two-word text makes PaperChunk raise ValidationError."""
    too_short = {**valid_chunk, "text": "Too short"}
    with pytest.raises(ValidationError):
        PaperChunk(**too_short)
def test_paper_chunk_word_count(valid_chunk):
    """word_count equals the number of whitespace-separated tokens in text."""
    model = PaperChunk(**valid_chunk)
    expected_words = len(valid_chunk["text"].split())
    assert model.word_count == expected_words
def test_paper_chunk_to_pinecone_record(valid_chunk):
    """to_pinecone_record yields an 'id' plus 'metadata' carrying pmid and source."""
    record = PaperChunk(**valid_chunk).to_pinecone_record()
    assert record["id"] == "37123456_chunk_0"
    assert "metadata" in record
    metadata = record["metadata"]
    assert metadata["pmid"] == "37123456"
    assert metadata["source"] == "pubmed"