Spaces:

oviya08
/

AlzDetectAI

Running

AlzDetectAI / tests /test_preprocessing.py

tpriyadata

test: add 94 unit tests across all pipeline stages — 100% passing

4279357 15 days ago

9.4 kB

	"""
	tests/test_preprocessing.py
	=============================
	ALZDETECT-AI — Unit tests for preprocessing/cleaner.py + chunker.py

	WHAT: Tests cleaning functions and chunking logic.
	WHY: Cleaner fixes 3 known problems — each fix must be tested.
	Chunker splits abstracts — overlap and edge cases must work.
	WHO: Run after any change to cleaner.py or chunker.py
	WHEN: Before every commit that touches preprocessing/

	Run:
	pytest tests/test_preprocessing.py -v
	"""

	import pytest
	from pydantic import ValidationError
	from preprocessing.cleaner import (
	strip_html,
	infer_year,
	extract_keywords,
	CleanedPaper,
	)
	from preprocessing.chunker import (
	split_into_chunks,
	PaperChunk,
	)


	# ── strip_html tests ──────────────────────────────────────────────

	def test_strip_html_italic():
	    """Italic markup is removed, its inner text is kept, and HTML is reported."""
	    cleaned, html_found = strip_html("<i>APOE</i> gene")
	    # The flag must be the literal True, not merely truthy.
	    assert html_found is True
	    assert "<i>" not in cleaned
	    assert "APOE" in cleaned


	def test_strip_html_bold():
	    """Bold markup is removed, its inner text is kept, and HTML is reported."""
	    cleaned, html_found = strip_html("<b>results</b> show significance")
	    # The flag must be the literal True, not merely truthy.
	    assert html_found is True
	    assert "<b>" not in cleaned
	    assert "results" in cleaned


	def test_strip_html_clean_text():
	    """Tag-free input is returned unchanged and flagged as containing no HTML."""
	    plain = "normal text without tags"
	    cleaned, html_found = strip_html(plain)
	    assert html_found is False
	    assert cleaned == plain


	def test_strip_html_multiple_tags():
	    """Several different tags in one string are all removed."""
	    cleaned, html_found = strip_html("<i>tau</i> and <b>amyloid</b> pathology")
	    # The text wrapped by each tag must survive the stripping.
	    for kept_word in ("tau", "amyloid"):
	        assert kept_word in cleaned
	    # No opening angle bracket may remain anywhere in the output.
	    assert "<" not in cleaned
	    assert html_found is True


	def test_strip_html_whitespace_collapsed():
	    """Runs of spaces left around stripped tags are collapsed to a single space."""
	    # The input deliberately has doubled spaces around the tags, so the
	    # assertion can only pass if strip_html actually collapses them.
	    result, _ = strip_html("word1  <i>tag</i>  word2")
	    # Look for a doubled space: rejecting every single space would
	    # contradict the "collapsed to one" contract this test states.
	    assert "  " not in result


	# ── infer_year tests ──────────────────────────────────────────────

	def test_infer_year_valid():
	    """A well-formed ISO timestamp yields its year with the inferred flag set."""
	    timestamp = "2026-04-30T10:00:00"
	    yr, was_inferred = infer_year(timestamp)
	    assert was_inferred is True
	    assert yr == 2026


	def test_infer_year_empty():
	    """An empty timestamp yields no year and leaves the inferred flag False."""
	    yr, was_inferred = infer_year("")
	    assert was_inferred is False
	    assert yr is None


	def test_infer_year_invalid():
	    """An unparseable timestamp yields no year and leaves the inferred flag False."""
	    garbage = "not-a-date"
	    yr, was_inferred = infer_year(garbage)
	    assert was_inferred is False
	    assert yr is None


	def test_infer_year_2025():
	    """A second timestamp from a different year (2025) is also read correctly."""
	    timestamp = "2025-01-15T08:30:00"
	    yr, was_inferred = infer_year(timestamp)
	    assert was_inferred is True
	    assert yr == 2025


	# ── extract_keywords tests ────────────────────────────────────────

	def test_extract_keywords_returns_list():
	    """The keywords come back as a list of strings with the extracted flag set."""
	    abstract = (
	        "Alzheimer disease biomarkers pTau217 blood plasma "
	        "early detection clinical study results show sensitivity"
	    )
	    keywords, extracted = extract_keywords(abstract, "Alzheimer biomarkers")
	    assert isinstance(keywords, list)
	    # The container check alone does not cover the "of strings" half of
	    # the contract, so verify every element as well.
	    assert all(isinstance(kw, str) for kw in keywords)
	    assert extracted is True


	def test_extract_keywords_max_limit():
	    """The number of returned keywords never exceeds the max_kw argument."""
	    # 100 distinct tokens give far more candidates than the limit of 5.
	    many_words = " ".join(f"word{i}" for i in range(100))
	    found, _ = extract_keywords(many_words, "title", max_kw=5)
	    assert len(found) <= 5


	def test_extract_keywords_not_empty():
	    """An abstract with real content produces at least one keyword."""
	    found, _ = extract_keywords(
	        "Alzheimer disease biomarker detection study plasma tau",
	        "title",
	    )
	    assert len(found) >= 1


	def test_extract_keywords_excludes_stopwords():
	    """Common stop words in the abstract never appear among the keywords."""
	    sentence = "the study was conducted with the patients in the hospital"
	    found, _ = extract_keywords(sentence, "title")
	    for stop_word in ("the", "was"):
	        assert stop_word not in found


	# ── CleanedPaper validation tests ────────────────────────────────

	@pytest.fixture
	def valid_cleaned() -> dict:
	    """Baseline keyword arguments for CleanedPaper; tests override single keys."""
	    # Kept as a separate local so the long abstract text is easy to scan.
	    abstract_text = (
	        "This study examines plasma pTau217 levels "
	        "in 500 subjects showing 96% sensitivity for "
	        "detecting preclinical Alzheimer pathology in "
	        "cognitively normal older adults."
	    )
	    return {
	        "pmid": "37123456",
	        "title": "Blood biomarkers for Alzheimer detection",
	        "abstract": abstract_text,
	        "authors": ["Smith, John"],
	        "year": 2024,
	        "keywords": ["Alzheimer", "pTau217"],
	        "source_query": "plasma_ptau217_diagnosis",
	        # Audit flags all start out False in the baseline.
	        "year_inferred": False,
	        "keywords_extracted": False,
	        "html_stripped": False,
	    }


	def test_cleaned_paper_valid(valid_cleaned):
	    """The baseline fixture builds a CleanedPaper that keeps pmid and year."""
	    built = CleanedPaper(**valid_cleaned)
	    assert built.year == 2024
	    assert built.pmid == "37123456"


	def test_cleaned_paper_short_abstract_rejected(valid_cleaned):
	    """A four-word abstract makes CleanedPaper raise ValidationError."""
	    # Override only the abstract; every other field stays valid.
	    valid_cleaned.update(abstract="Too short abstract here.")
	    with pytest.raises(ValidationError):
	        CleanedPaper(**valid_cleaned)


	def test_cleaned_paper_audit_flags(valid_cleaned):
	    """Audit flags passed in as True are readable as True on the model."""
	    valid_cleaned.update(
	        year_inferred=True,
	        keywords_extracted=True,
	        html_stripped=True,
	    )
	    built = CleanedPaper(**valid_cleaned)
	    for flag_name in ("year_inferred", "keywords_extracted", "html_stripped"):
	        assert getattr(built, flag_name) is True


	# ── split_into_chunks tests ───────────────────────────────────────

	def test_split_short_abstract():
	    """Text with fewer words than chunk_size returns as one unchanged chunk."""
	    short_text = "Short abstract with just a few words about Alzheimer."
	    pieces = split_into_chunks(short_text, chunk_size=100, overlap=10)
	    assert len(pieces) == 1
	    only_piece = pieces[0]
	    assert only_piece == short_text


	def test_split_long_abstract():
	    """Text with more words than chunk_size is split into several chunks."""
	    # 200 words against a chunk_size of 50.
	    long_text = " ".join(["word"] * 200)
	    pieces = split_into_chunks(long_text, chunk_size=50, overlap=10)
	    assert len(pieces) >= 2


	def test_split_overlap():
	    """Consecutive chunks share at least one word across their boundary."""
	    # Distinct tokens (word0..word99) make any shared word a real overlap.
	    source_text = " ".join(f"word{i}" for i in range(100))
	    pieces = split_into_chunks(source_text, chunk_size=20, overlap=5)
	    # Compare the last five words of chunk 0 with the first five of chunk 1.
	    tail_of_first = set(pieces[0].split()[-5:])
	    head_of_second = set(pieces[1].split()[:5])
	    assert not tail_of_first.isdisjoint(head_of_second)


	def test_split_no_empty_chunks():
	    """Splitting yields at least one chunk and none is empty or whitespace-only."""
	    text = " ".join(f"word{i}" for i in range(50))
	    chunks = split_into_chunks(text, chunk_size=10, overlap=2)
	    # all() over an empty list is vacuously True, so first make sure the
	    # splitter returned something for this 50-word input.
	    assert chunks
	    assert all(len(c.strip()) > 0 for c in chunks)


	# ── PaperChunk validation tests ───────────────────────────────────

	@pytest.fixture
	def valid_chunk() -> dict:
	    """Baseline keyword arguments for PaperChunk; tests override single keys."""
	    # Kept as a separate local so the chunk text is easy to scan.
	    chunk_text = (
	        "This study examines plasma pTau217 as an early "
	        "biomarker for Alzheimer disease detection."
	    )
	    return {
	        "chunk_id": "37123456_chunk_0",
	        "pmid": "37123456",
	        "chunk_idx": 0,
	        "text": chunk_text,
	        "title": "Blood biomarkers for Alzheimer detection",
	        "year": 2024,
	        "keywords": ["Alzheimer", "pTau217"],
	        "source": "pubmed",
	        "source_query": "plasma_ptau217_diagnosis",
	    }


	def test_paper_chunk_valid(valid_chunk):
	    """The baseline fixture builds a PaperChunk that keeps chunk_id and pmid."""
	    built = PaperChunk(**valid_chunk)
	    assert built.pmid == "37123456"
	    assert built.chunk_id == "37123456_chunk_0"


	def test_paper_chunk_bad_id_rejected(valid_chunk):
	    """A chunk_id without '_chunk_' is rejected and the error mentions it."""
	    valid_chunk.update(chunk_id="bad-format-id")
	    with pytest.raises(ValidationError) as caught:
	        PaperChunk(**valid_chunk)
	    # Checked after the with-block: statements placed after the raising
	    # call inside the block would never execute.
	    error_text = str(caught.value)
	    assert "_chunk_" in error_text


	def test_paper_chunk_short_text_rejected(valid_chunk):
	    """A two-word text makes PaperChunk raise ValidationError."""
	    # Override only the text; every other field stays valid.
	    valid_chunk.update(text="Too short")
	    with pytest.raises(ValidationError):
	        PaperChunk(**valid_chunk)


	def test_paper_chunk_word_count(valid_chunk):
	    """word_count equals the number of whitespace-separated tokens in text."""
	    expected_count = len(valid_chunk["text"].split())
	    built = PaperChunk(**valid_chunk)
	    assert built.word_count == expected_count


	def test_paper_chunk_to_pinecone_record(valid_chunk):
	    """to_pinecone_record exposes the chunk id plus pmid and source metadata."""
	    pinecone_row = PaperChunk(**valid_chunk).to_pinecone_record()
	    assert pinecone_row["id"] == "37123456_chunk_0"
	    # Confirm the key exists before indexing into it.
	    assert "metadata" in pinecone_row
	    meta = pinecone_row["metadata"]
	    assert meta["pmid"] == "37123456"
	    assert meta["source"] == "pubmed"