File size: 2,935 Bytes
3df5819
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""Tests for the preprocessing pipeline."""

import pytest
from src.preprocessing.dyslexia_simulator import DyslexiaSimulator
from src.preprocessing.spell_corrector import DyslexiaAwareSpellCorrector


@pytest.fixture
def simulator():
    """Provide a DyslexiaSimulator built with error_rate=0.5 and seed=42."""
    sim = DyslexiaSimulator(error_rate=0.5, seed=42)
    return sim


@pytest.fixture
def corrector():
    """Yield a DyslexiaAwareSpellCorrector and close it during teardown."""
    spell_corrector = DyslexiaAwareSpellCorrector()
    yield spell_corrector
    # Teardown: executed after the requesting test is done with the fixture.
    spell_corrector.close()


def test_spell_correction_phonetic(corrector):
    """Check that the phonetic pass rewrites common dyslexic misspellings."""
    misspelled = "I wuz going to the store becaus I cud"
    corrected = corrector._phonetic_pass(misspelled)
    # Each expected correction must appear somewhere in the output.
    for expected in ("was", "could"):
        assert expected in corrected


def test_spell_correction_empty(corrector):
    """Check that empty and whitespace-only inputs come back unchanged."""
    for blank in ("", "   "):
        assert corrector.correct(blank) == blank


def test_entity_protection():
    """Test that the NER tagger identifies named entities in a sentence.

    Tags a sentence containing person and place names, then checks that the
    tagger returned at least one entity, that every entity exposes a
    ``label`` attribute, and that at least one entity's text matches one of
    the expected spans.
    """
    # Local import keeps the NER dependency scoped to this test.
    from src.preprocessing.ner_tagger import NERTagger
    tagger = NERTagger(model_name="en_core_web_sm")
    entities = tagger.tag("John Smith went to London to meet Dr. Brown.")
    assert len(entities) > 0
    # Previously the labels were collected into an unused list; make the
    # "entities carry a label" expectation an explicit assertion instead.
    assert all(hasattr(e, "label") for e in entities)
    assert any(e.text in ("John Smith", "London", "Dr. Brown") for e in entities)


def test_sentence_segmentation():
    """Check that a three-sentence string is split into three segments."""
    from src.preprocessing.sentence_segmenter import SentenceSegmenter

    text = "Hello world. How are you? I am fine."
    segmenter = SentenceSegmenter(model_name="en_core_web_sm")
    pieces = segmenter.segment(text)
    assert len(pieces) == 3


def test_readability_scores():
    """Check that the processed document reports both readability metrics."""
    from src.preprocessing.pipeline import PreprocessingPipeline

    sample = "The quick brown fox jumps over the lazy dog. This is a simple sentence for testing."
    processed = PreprocessingPipeline(model_name="en_core_web_sm").process(sample)
    # Both metric keys must be present in the readability mapping.
    for metric in ("flesch_kincaid_grade", "gunning_fog"):
        assert metric in processed.readability


def test_dependency_trees():
    """Check that SVO extraction yields a first entry containing "subjects"."""
    from src.preprocessing.dependency_parser import DependencyParser

    dep_parser = DependencyParser(model_name="en_core_web_sm")
    triples = dep_parser.extract_svo("The cat sat on the mat.")
    assert len(triples) > 0
    first = triples[0]
    assert "subjects" in first


def test_dyslexia_simulator(simulator):
    """Test that the simulator returns a corrupted text and the original.

    ``simulate`` is expected to return a ``(corrupted, original)`` pair. The
    original must equal the input exactly. The corrupted text is only
    checked for being a string: corruption is probabilistic, so it is not
    required to differ from the input.
    """
    clean = "The important thing about education is that it helps everyone."
    corrupted, original = simulator.simulate(clean)
    assert original == clean
    # The former check (`corrupted != clean or True`) was always true and
    # could never fail. Assert something that can fail without requiring
    # that a corruption actually happened for this input.
    assert isinstance(corrupted, str)


def test_dyslexia_simulator_preserves_clean(simulator):
    """Check that the second element returned by simulate equals the input."""
    source = "Hello world this is a test."
    result = simulator.simulate(source)
    _, returned_clean = result
    assert returned_clean == source