| """Tests for the preprocessing pipeline.""" |
|
|
| import pytest |
| from src.preprocessing.dyslexia_simulator import DyslexiaSimulator |
| from src.preprocessing.spell_corrector import DyslexiaAwareSpellCorrector |
|
|
|
|
@pytest.fixture
def simulator():
    """Provide a DyslexiaSimulator built with error_rate=0.5 and seed=42."""
    settings = {"error_rate": 0.5, "seed": 42}
    return DyslexiaSimulator(**settings)
|
|
|
|
@pytest.fixture
def corrector():
    """Yield a DyslexiaAwareSpellCorrector and close it after the test."""
    spell_corrector = DyslexiaAwareSpellCorrector()
    yield spell_corrector
    # Everything after the yield is the fixture's teardown step.
    spell_corrector.close()
|
|
|
|
def test_spell_correction_phonetic(corrector):
    """Check that the phonetic pass output contains the corrected words."""
    misspelled = "I wuz going to the store becaus I cud"
    corrected = corrector._phonetic_pass(misspelled)
    # Membership checks against the returned value, one expected word at a time.
    for expected_word in ("was", "could"):
        assert expected_word in corrected
|
|
|
|
def test_spell_correction_empty(corrector):
    """Check that empty and single-space inputs come back unchanged."""
    for blank_input in ("", " "):
        assert corrector.correct(blank_input) == blank_input
|
|
|
|
def test_entity_protection():
    """Test that named entities are identified in a sample sentence.

    Only identification is checked here: the tagger must return at least one
    entity, and at least one returned entity's text must match one of the
    names present in the sentence.
    """
    from src.preprocessing.ner_tagger import NERTagger

    expected_texts = ("John Smith", "London", "Dr. Brown")
    tagger = NERTagger(model_name="en_core_web_sm")
    entities = tagger.tag("John Smith went to London to meet Dr. Brown.")

    # Kept separate from the any() check so an empty result fails with a
    # clearer message than "no expected entity found".
    assert len(entities) > 0
    assert any(e.text in expected_texts for e in entities)
|
|
|
|
def test_sentence_segmentation():
    """Check that a three-sentence string is segmented into three items."""
    from src.preprocessing.sentence_segmenter import SentenceSegmenter

    segmenter = SentenceSegmenter(model_name="en_core_web_sm")
    text = "Hello world. How are you? I am fine."
    segments = segmenter.segment(text)
    assert len(segments) == 3
|
|
|
|
def test_readability_scores():
    """Check that the processed document exposes both readability metrics."""
    from src.preprocessing.pipeline import PreprocessingPipeline

    sample = (
        "The quick brown fox jumps over the lazy dog. "
        "This is a simple sentence for testing."
    )
    processed = PreprocessingPipeline(model_name="en_core_web_sm").process(sample)
    # Each expected metric name must be present in the readability result.
    for metric in ("flesch_kincaid_grade", "gunning_fog"):
        assert metric in processed.readability
|
|
|
|
def test_dependency_trees():
    """Check that extract_svo returns entries and the first has "subjects"."""
    from src.preprocessing.dependency_parser import DependencyParser

    dep_parser = DependencyParser(model_name="en_core_web_sm")
    triples = dep_parser.extract_svo("The cat sat on the mat.")
    assert len(triples) > 0
    first_entry = triples[0]
    assert "subjects" in first_entry
|
|
|
|
def test_dyslexia_simulator(simulator):
    """Test that the simulator corrupts the text and echoes the original.

    ``simulate`` is expected to return a ``(corrupted, original)`` pair: the
    second element must equal the input, and the first must differ from it.
    """
    clean = "The important thing about education is that it helps everyone."
    corrupted, original = simulator.simulate(clean)
    assert original == clean
    # The previous check was ``corrupted != clean or True``, which is always
    # true and therefore verified nothing. The ``simulator`` fixture passes a
    # fixed seed, so the result is presumably reproducible and a real
    # inequality check should be stable across runs.
    assert corrupted != clean
|
|
|
|
def test_dyslexia_simulator_preserves_clean(simulator):
    """Check that the second element returned by simulate equals the input."""
    source_text = "Hello world this is a test."
    _corrupted, echoed = simulator.simulate(source_text)
    assert echoed == source_text
|
|