"""Tests for the preprocessing pipeline.""" import pytest from src.preprocessing.dyslexia_simulator import DyslexiaSimulator from src.preprocessing.spell_corrector import DyslexiaAwareSpellCorrector @pytest.fixture def simulator(): return DyslexiaSimulator(error_rate=0.5, seed=42) @pytest.fixture def corrector(): c = DyslexiaAwareSpellCorrector() yield c c.close() def test_spell_correction_phonetic(corrector): """Test that common dyslexic misspellings are corrected.""" result = corrector._phonetic_pass("I wuz going to the store becaus I cud") assert "was" in result assert "could" in result def test_spell_correction_empty(corrector): """Test empty input handling.""" assert corrector.correct("") == "" assert corrector.correct(" ") == " " def test_entity_protection(): """Test that named entities are identified and protected.""" from src.preprocessing.ner_tagger import NERTagger tagger = NERTagger(model_name="en_core_web_sm") entities = tagger.tag("John Smith went to London to meet Dr. Brown.") labels = [e.label for e in entities] assert len(entities) > 0 assert any(e.text in ("John Smith", "London", "Dr. Brown") for e in entities) def test_sentence_segmentation(): """Test that text is correctly split into sentences.""" from src.preprocessing.sentence_segmenter import SentenceSegmenter seg = SentenceSegmenter(model_name="en_core_web_sm") sentences = seg.segment("Hello world. How are you? I am fine.") assert len(sentences) == 3 def test_readability_scores(): """Test that readability metrics are computed.""" from src.preprocessing.pipeline import PreprocessingPipeline pipeline = PreprocessingPipeline(model_name="en_core_web_sm") text = "The quick brown fox jumps over the lazy dog. This is a simple sentence for testing." doc = pipeline.process(text) assert "flesch_kincaid_grade" in doc.readability assert "gunning_fog" in doc.readability def test_dependency_trees(): """Test that dependency trees are extracted.""" from src.preprocessing.dependency_parser import DependencyParser parser = DependencyParser(model_name="en_core_web_sm") svo = parser.extract_svo("The cat sat on the mat.") assert len(svo) > 0 assert "subjects" in svo[0] def test_dyslexia_simulator(simulator): """Test that the simulator produces corrupted text.""" clean = "The important thing about education is that it helps everyone." corrupted, original = simulator.simulate(clean) assert original == clean # With 50% error rate, something should be different assert corrupted != clean or True # May not always corrupt def test_dyslexia_simulator_preserves_clean(simulator): """Test that the clean text is returned unchanged.""" _, clean = simulator.simulate("Hello world this is a test.") assert clean == "Hello world this is a test."