File size: 2,174 Bytes
5548ff6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""
Sentence-level surface feature extraction for Sinhala dyslexic writing analysis.

This module computes interpretable surface-level error signals
by comparing clean and dyslexic sentence pairs.
"""

import difflib

# Inventory of Sinhala marks treated as "diacritics" by this module:
# dependent vowel signs plus the anusvara and visarga signs.
# NOTE(review): U+0DCA (hal kirima) and the vowel signs U+0DDB, U+0DDE,
# U+0DDF, U+0DF2 and U+0DF3 are not part of this set — confirm that the
# omission is intentional before relying on it for those marks.
SINHALA_DIACRITICS = {
    "ා", "ැ", "ෑ", "ි", "ී", "ු", "ූ",
    "ෘ", "ෙ", "ේ", "ො", "ෝ",
    "ං", "ඃ",
}


def char_level_diff(clean: str, dyslexic: str) -> dict:
    """
    Compute character-level edit operations between clean and dyslexic sentences.

    The two strings are aligned with ``difflib.SequenceMatcher`` (comparing
    Python ``str`` elements, i.e. individual code points) and every non-equal
    opcode is tallied:

    * ``insert``  -> characters found only in ``dyslexic`` count as additions
    * ``delete``  -> characters found only in ``clean`` count as omissions
    * ``replace`` -> the longer side of the replaced span counts as substitutions

    Args:
        clean: The reference (correctly written) sentence.
        dyslexic: The sentence to compare against the reference.

    Returns:
        A dict with the integer counts ``char_addition``, ``char_omission`` and
        ``char_substitution`` and the matching boolean flags ``has_addition``,
        ``has_omission`` and ``has_substitution`` (True when the count is > 0).
    """
    # autojunk must be disabled: with the default heuristic, once the second
    # sequence reaches 200 items every character that makes up more than ~1%
    # of it is ignored as "junk". In a long sentence that is nearly every
    # letter, so the alignment degenerates into huge "replace" spans and the
    # substitution count is wildly inflated.
    matcher = difflib.SequenceMatcher(None, clean, dyslexic, autojunk=False)

    additions = omissions = substitutions = 0

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "insert":
            additions += j2 - j1
        elif tag == "delete":
            omissions += i2 - i1
        elif tag == "replace":
            # Spans may differ in length; the longer side is what gets counted.
            substitutions += max(i2 - i1, j2 - j1)

    return {
        "char_addition": additions,
        "char_omission": omissions,
        "char_substitution": substitutions,
        "has_addition": additions > 0,
        "has_omission": omissions > 0,
        "has_substitution": substitutions > 0,
    }


def spacing_diff(clean: str, dyslexic: str) -> dict:
    """
    Compare the number of whitespace-separated tokens in the two sentences.

    Returns a dict with ``word_count_diff`` (absolute difference between the
    token counts) and ``has_spacing_issue`` (True when that difference is
    non-zero).
    """
    clean_tokens = clean.split()
    dyslexic_tokens = dyslexic.split()

    # abs() is never negative, so "non-zero" is the same test as "> 0".
    token_gap = abs(len(clean_tokens) - len(dyslexic_tokens))

    return dict(word_count_diff=token_gap, has_spacing_issue=token_gap != 0)


def diacritic_loss(clean: str, dyslexic: str) -> dict:
    """
    Flag whether the dyslexic sentence contains fewer diacritics than the clean one.

    Only the total number of characters belonging to ``SINHALA_DIACRITICS`` is
    compared; which marks they are and where they occur is not considered.
    """
    def _count_marks(text):
        # Number of characters in ``text`` that are listed as diacritics.
        return len([ch for ch in text if ch in SINHALA_DIACRITICS])

    marks_in_clean = _count_marks(clean)
    marks_in_dyslexic = _count_marks(dyslexic)

    return {"has_diacritic_loss": marks_in_dyslexic < marks_in_clean}


def extract_surface_features(clean_sentence: str, dyslexic_sentence: str) -> dict:
    """
    Build the combined surface-feature dict for one sentence pair.

    Merges, in this order, the results of ``char_level_diff``,
    ``spacing_diff`` and ``diacritic_loss`` into a single flat dict.
    """
    pair = (clean_sentence, dyslexic_sentence)

    # Unpacking left to right keeps the same key order and precedence as
    # successive dict.update() calls.
    return {
        **char_level_diff(*pair),
        **spacing_diff(*pair),
        **diacritic_loss(*pair),
    }