File size: 2,935 Bytes
"""Tests for the preprocessing pipeline."""
import pytest
from src.preprocessing.dyslexia_simulator import DyslexiaSimulator
from src.preprocessing.spell_corrector import DyslexiaAwareSpellCorrector
@pytest.fixture
def simulator():
    """Provide a DyslexiaSimulator built with error_rate=0.5 and seed=42."""
    # NOTE(review): the fixed seed presumably makes runs repeatable — confirm
    # against DyslexiaSimulator.
    sim = DyslexiaSimulator(error_rate=0.5, seed=42)
    return sim
@pytest.fixture
def corrector():
    """Yield a default-constructed DyslexiaAwareSpellCorrector.

    The corrector's ``close()`` method is called once the generator is
    resumed after the ``yield`` (i.e. during fixture teardown).
    """
    spell_corrector = DyslexiaAwareSpellCorrector()
    yield spell_corrector
    # Teardown: runs after the consuming test is done with the fixture.
    spell_corrector.close()
def test_spell_correction_phonetic(corrector):
    """Check that the phonetic pass repairs common dyslexic misspellings."""
    misspelled = "I wuz going to the store becaus I cud"
    corrected = corrector._phonetic_pass(misspelled)
    # "wuz" -> "was" and "cud" -> "could" are the corrections being checked.
    for expected_word in ("was", "could"):
        assert expected_word in corrected
def test_spell_correction_empty(corrector):
    """Check that empty and whitespace-only input is returned unchanged."""
    for blank in ("", " "):
        assert corrector.correct(blank) == blank
def test_entity_protection():
    """Test that named entities are identified and protected.

    Tags a sentence containing person and place names and checks that at
    least one entity is returned, that every entity exposes a ``label``
    attribute, and that at least one entity's text is one of the expected
    names.
    """
    from src.preprocessing.ner_tagger import NERTagger

    tagger = NERTagger(model_name="en_core_web_sm")
    entities = tagger.tag("John Smith went to London to meet Dr. Brown.")

    assert len(entities) > 0
    # The labels used to be collected into an unused list; the only thing
    # that did was verify the attribute exists, so assert that directly.
    assert all(hasattr(entity, "label") for entity in entities)
    assert any(
        entity.text in ("John Smith", "London", "Dr. Brown")
        for entity in entities
    )
def test_sentence_segmentation():
    """Check that a three-sentence string is split into three segments."""
    from src.preprocessing.sentence_segmenter import SentenceSegmenter

    segmenter = SentenceSegmenter(model_name="en_core_web_sm")
    paragraph = "Hello world. How are you? I am fine."
    pieces = segmenter.segment(paragraph)
    assert len(pieces) == 3
def test_readability_scores():
    """Check that the processed document reports the readability metrics."""
    from src.preprocessing.pipeline import PreprocessingPipeline

    sample = "The quick brown fox jumps over the lazy dog. This is a simple sentence for testing."
    processed = PreprocessingPipeline(model_name="en_core_web_sm").process(sample)

    # Both metric keys must be present in the document's readability mapping.
    for metric in ("flesch_kincaid_grade", "gunning_fog"):
        assert metric in processed.readability
def test_dependency_trees():
    """Check that SVO extraction yields at least one entry with subjects."""
    from src.preprocessing.dependency_parser import DependencyParser

    dep_parser = DependencyParser(model_name="en_core_web_sm")
    extracted = dep_parser.extract_svo("The cat sat on the mat.")

    assert len(extracted) > 0
    first_entry = extracted[0]
    assert "subjects" in first_entry
def test_dyslexia_simulator(simulator):
    """Test that the simulator returns corrupted text plus the original.

    ``simulate`` is unpacked as ``(corrupted, original)``; the second item
    must equal the input exactly.
    """
    clean = "The important thing about education is that it helps everyone."
    corrupted, original = simulator.simulate(clean)
    assert original == clean
    # The previous check (`corrupted != clean or True`) was a tautology and
    # could never fail. Corruption may legitimately leave the text unchanged,
    # so only the shape of the result is asserted here.
    # NOTE(review): assumes simulate() returns the corrupted text as a str —
    # confirm; if the seeded output is known to differ, tighten this to
    # `assert corrupted != clean`.
    assert isinstance(corrupted, str)
def test_dyslexia_simulator_preserves_clean(simulator):
    """Check that the second item returned by simulate() equals the input."""
    sentence = "Hello world this is a test."
    # Only the second element of the returned pair is checked here.
    _, returned_clean = simulator.simulate(sentence)
    assert returned_clean == sentence
|