Spaces:
Sleeping
Sleeping
File size: 2,249 Bytes
"""
Tests for src/services/indexing/text_chunker.py — medical text chunking.
"""
import pytest
from src.services.indexing.text_chunker import MedicalChunk, MedicalTextChunker
@pytest.fixture
def chunker():
    """Provide the ``MedicalTextChunker`` instance shared by the tests below.

    It is configured with ``target_words=30``, ``overlap_words=5`` and
    ``min_words=5``.
    """
    limits = {"target_words": 30, "overlap_words": 5, "min_words": 5}
    return MedicalTextChunker(**limits)
def test_basic_chunking(chunker: MedicalTextChunker):
    """A long text is split into several non-empty ``MedicalChunk`` objects."""
    # 200 words against the fixture's target_words=30, so a single chunk
    # is not expected to be enough.
    text = " ".join(f"word{i}" for i in range(200))
    produced = chunker.chunk_text(text)

    assert len(produced) > 1
    for chunk in produced:
        assert isinstance(chunk, MedicalChunk)
        # Whitespace-only chunk text counts as empty.
        assert chunk.text.strip()
def test_section_aware(chunker: MedicalTextChunker):
    """Text laid out under section headers still produces at least one chunk."""
    # Three header/body pairs, separated by blank lines.
    sections = [
        "Introduction\nThis study examines diabetes.\n\n",
        "Methods\nWe collected blood samples.\n\n",
        "Results\nGlucose levels were elevated.",
    ]
    produced = chunker.chunk_text("".join(sections))

    # NOTE(review): this only checks that something was produced; it does not
    # verify that the headers were actually recognised as sections.
    assert len(produced) > 0
def test_biomarker_detection(chunker: MedicalTextChunker):
    """Text that names lab values yields at least one detected biomarker."""
    sentences = [
        "The patient's HbA1c was 8.2% indicating poor glycemic control. ",
        "Fasting glucose was 185 mg/dL and total cholesterol was elevated at 240.",
    ]
    produced = chunker.chunk_text("".join(sentences))
    assert len(produced) >= 1

    # Pool the biomarkers of every chunk: a hit in any one chunk is enough.
    detected = {
        marker
        for chunk in produced
        for marker in chunk.biomarkers_mentioned
    }
    assert detected
def test_condition_tagging(chunker: MedicalTextChunker):
    """Chunks carry a ``diabetes`` or ``heart_disease`` condition tag."""
    sentences = [
        "Diabetes mellitus is characterised by insulin resistance and elevated blood glucose. ",
        "Cardiovascular disease risk increases with uncontrolled hypertension.",
    ]
    produced = chunker.chunk_text("".join(sentences))

    # Merge the tags of every chunk into one set before checking.
    found_tags = set()
    for chunk in produced:
        found_tags |= set(chunk.condition_tags)

    # Either of the two tags is accepted.
    assert any(tag in found_tags for tag in ("diabetes", "heart_disease"))
def test_empty_text(chunker: MedicalTextChunker):
    """Empty and blank input both produce an empty list of chunks."""
    for blank in ("", " "):
        assert chunker.chunk_text(blank) == []
|