# Source: tinykavi — "Add writing_pattern_classifier package for live demo" (commit 5548ff6)
"""
Sentence-level surface feature extraction for Sinhala dyslexic writing analysis.
This module computes interpretable surface-level error signals
by comparing clean and dyslexic sentence pairs.
"""
import difflib
# Sinhala diacritic signs used to detect diacritic loss: vowel signs (pili)
# plus anusvara (ං) and visarga (ඃ). Set literal for O(1) membership tests.
SINHALA_DIACRITICS = {
    "ා", "ැ", "ෑ", "ි", "ී", "ු", "ූ", "ෘ", "ෙ", "ේ", "ො", "ෝ", "ං", "ඃ",
}
def char_level_diff(clean: str, dyslexic: str) -> dict:
    """
    Compute character-level edit operations between clean and dyslexic sentences.

    Aligns the two strings with difflib.SequenceMatcher and tallies inserted,
    deleted, and replaced characters. For a "replace" span of unequal lengths,
    the larger side is counted so no changed character is missed.

    Returns a dict with integer counts (char_addition, char_omission,
    char_substitution) and corresponding boolean presence flags.
    """
    counts = {"insert": 0, "delete": 0, "replace": 0}
    opcodes = difflib.SequenceMatcher(None, clean, dyslexic).get_opcodes()
    for op, c_start, c_end, d_start, d_end in opcodes:
        if op == "insert":
            counts["insert"] += d_end - d_start
        elif op == "delete":
            counts["delete"] += c_end - c_start
        elif op == "replace":
            counts["replace"] += max(c_end - c_start, d_end - d_start)
    return {
        "char_addition": counts["insert"],
        "char_omission": counts["delete"],
        "char_substitution": counts["replace"],
        "has_addition": counts["insert"] > 0,
        "has_omission": counts["delete"] > 0,
        "has_substitution": counts["replace"] > 0,
    }
def spacing_diff(clean: str, dyslexic: str) -> dict:
    """
    Detect word boundary (spacing) inconsistencies.

    Compares whitespace-delimited word counts of the two sentences; any
    difference signals merged or split words in the dyslexic version.
    """
    clean_words = clean.split()
    dyslexic_words = dyslexic.split()
    gap = abs(len(clean_words) - len(dyslexic_words))
    return {"word_count_diff": gap, "has_spacing_issue": gap > 0}
def diacritic_loss(clean: str, dyslexic: str) -> dict:
    """
    Detect diacritic loss in dyslexic writing.

    Flags loss when the dyslexic sentence carries strictly fewer Sinhala
    diacritic characters than the clean sentence.
    """
    def count_diacritics(text: str) -> int:
        # bool sums as 0/1, so this counts diacritic characters in text
        return sum(ch in SINHALA_DIACRITICS for ch in text)

    return {
        "has_diacritic_loss": count_diacritics(clean) > count_diacritics(dyslexic)
    }
def extract_surface_features(clean_sentence: str, dyslexic_sentence: str) -> dict:
    """
    Extract all sentence-level surface features for one sentence pair.

    Runs every surface-feature extractor on the (clean, dyslexic) pair and
    merges their dicts into a single flat feature mapping.
    """
    merged = {}
    for extractor in (char_level_diff, spacing_diff, diacritic_loss):
        merged.update(extractor(clean_sentence, dyslexic_sentence))
    return merged