""" Sentence-level surface feature extraction for Sinhala dyslexic writing analysis. This module computes interpretable surface-level error signals by comparing clean and dyslexic sentence pairs. """ import difflib # Sinhala diacritic characters SINHALA_DIACRITICS = set([ "ා", "ැ", "ෑ", "ි", "ී", "ු", "ූ", "ෘ", "ෙ", "ේ", "ො", "ෝ", "ං", "ඃ" ]) def char_level_diff(clean: str, dyslexic: str) -> dict: """ Compute character-level edit operations between clean and dyslexic sentences. """ matcher = difflib.SequenceMatcher(None, clean, dyslexic) additions = omissions = substitutions = 0 for tag, i1, i2, j1, j2 in matcher.get_opcodes(): if tag == "insert": additions += (j2 - j1) elif tag == "delete": omissions += (i2 - i1) elif tag == "replace": substitutions += max(i2 - i1, j2 - j1) return { "char_addition": additions, "char_omission": omissions, "char_substitution": substitutions, "has_addition": additions > 0, "has_omission": omissions > 0, "has_substitution": substitutions > 0, } def spacing_diff(clean: str, dyslexic: str) -> dict: """ Detect word boundary (spacing) inconsistencies. """ diff = abs(len(clean.split()) - len(dyslexic.split())) return { "word_count_diff": diff, "has_spacing_issue": diff > 0, } def diacritic_loss(clean: str, dyslexic: str) -> dict: """ Detect diacritic loss in dyslexic writing. """ clean_count = sum(1 for c in clean if c in SINHALA_DIACRITICS) dys_count = sum(1 for c in dyslexic if c in SINHALA_DIACRITICS) return { "has_diacritic_loss": clean_count > dys_count } def extract_surface_features(clean_sentence: str, dyslexic_sentence: str) -> dict: """ Extract all sentence-level surface features. """ features = {} features.update(char_level_diff(clean_sentence, dyslexic_sentence)) features.update(spacing_diff(clean_sentence, dyslexic_sentence)) features.update(diacritic_loss(clean_sentence, dyslexic_sentence)) return features