| """ |
| Sentence-level surface feature extraction for Sinhala dyslexic writing analysis. |
| |
| This module computes interpretable surface-level error signals |
| by comparing clean and dyslexic sentence pairs. |
| """ |
|
|
| import difflib |
|
|
| |
# Sinhala signs that diacritic_loss() counts when comparing a sentence pair.
# NOTE(review): some Sinhala signs (e.g. U+0DCA, U+0DDB, U+0DDE) do not appear
# in this set — confirm whether leaving them out is intentional.
SINHALA_DIACRITICS = {
    "ා", "ැ", "ෑ", "ි", "ී", "ු", "ූ",
    "ෘ", "ෙ", "ේ", "ො", "ෝ", "ං", "ඃ",
}
|
|
|
|
def char_level_diff(clean: str, dyslexic: str) -> dict:
    """
    Compute character-level edit operations between clean and dyslexic sentences.

    The two strings are aligned with ``difflib.SequenceMatcher`` and its
    opcodes are tallied as follows:

    * ``insert``  -- characters that appear only in ``dyslexic`` (additions)
    * ``delete``  -- characters that appear only in ``clean`` (omissions)
    * ``replace`` -- the longer side of the replaced span (substitutions)

    Counting is done per ``str`` element, so a combining sign is counted
    separately from the base letter it attaches to.

    Args:
        clean: The reference sentence.
        dyslexic: The sentence to compare against the reference.

    Returns:
        A dict with the integer counts ``char_addition``, ``char_omission``
        and ``char_substitution``, plus the boolean flags ``has_addition``,
        ``has_omission`` and ``has_substitution`` (count > 0).
    """
    # autojunk=False: the default heuristic drops "popular" characters from
    # the match index once the second string reaches 200 characters, which
    # makes small edits in long sentences show up as huge replace spans.
    matcher = difflib.SequenceMatcher(None, clean, dyslexic, autojunk=False)

    additions = omissions = substitutions = 0

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "insert":
            additions += j2 - j1
        elif tag == "delete":
            omissions += i2 - i1
        elif tag == "replace":
            # Spans may differ in length; count the longer side so no
            # changed character on either side goes uncounted.
            substitutions += max(i2 - i1, j2 - j1)
        # "equal" spans contribute nothing.

    return {
        "char_addition": additions,
        "char_omission": omissions,
        "char_substitution": substitutions,
        "has_addition": additions > 0,
        "has_omission": omissions > 0,
        "has_substitution": substitutions > 0,
    }
|
|
|
|
def spacing_diff(clean: str, dyslexic: str) -> dict:
    """
    Compare the whitespace-delimited word counts of the two sentences.

    Returns a dict with ``word_count_diff`` (absolute difference between the
    number of words in ``clean`` and in ``dyslexic``) and
    ``has_spacing_issue`` (True when that difference is non-zero).
    """
    clean_words = clean.split()
    dyslexic_words = dyslexic.split()
    gap = abs(len(clean_words) - len(dyslexic_words))
    return dict(word_count_diff=gap, has_spacing_issue=gap != 0)
|
|
|
|
def diacritic_loss(clean: str, dyslexic: str) -> dict:
    """
    Flag whether the dyslexic sentence has fewer diacritics than the clean one.

    Only the total number of characters found in ``SINHALA_DIACRITICS`` is
    compared; which particular signs are present is not taken into account.

    Returns a dict with the single key ``has_diacritic_loss``.
    """
    def _count_marks(text):
        # Number of characters in ``text`` that belong to SINHALA_DIACRITICS.
        return len([ch for ch in text if ch in SINHALA_DIACRITICS])

    expected = _count_marks(clean)
    observed = _count_marks(dyslexic)
    return {"has_diacritic_loss": observed < expected}
|
|
|
|
def extract_surface_features(clean_sentence: str, dyslexic_sentence: str) -> dict:
    """
    Build the combined surface-feature dict for one sentence pair.

    Merges, in this order, the results of ``char_level_diff``,
    ``spacing_diff`` and ``diacritic_loss`` for the given pair into a single
    dict and returns it.
    """
    pair = (clean_sentence, dyslexic_sentence)
    return {
        **char_level_diff(*pair),
        **spacing_diff(*pair),
        **diacritic_loss(*pair),
    }
|
|