rewrite / src /preprocessing /dyslexia_simulator.py
morpheuslord's picture
Add files using upload-large-folder tool
12fd5f2 verified
"""
Programmatically generates dyslectic training data from clean text.
Used to augment training pairs when real dyslectic examples are scarce.
Error types simulated (from Rello et al. 2013, 2017 dyslexia research):
- Phonetic substitution (most common, ~35% of errors)
- Letter transposition (e.g., "teh" for "the") (~18%)
- Letter omission (~16%)
- Letter doubling (~12%)
- Letter reversal b/d, p/q (~10%)
- Word boundary errors (~9%)
"""
import random
import re
from typing import Tuple
class DyslexiaSimulator:
    """Generate synthetic dyslectic text from clean input for data augmentation.

    Errors are injected at two levels: whole-phrase merges (``WORD_MERGES``)
    and single-word corruptions (phonetic substitution, transposition,
    omission, doubling, letter reversal). ``error_rate`` is the probability
    that any one eligible phrase or word is corrupted. All randomness comes
    from a private ``random.Random`` seeded in ``__init__``, so a given seed
    and call sequence reproduces the same output.
    """

    # Letter pairs swapped by ``_reverse_letter`` (lower-case keys; case is
    # restored on the replacement).
    LETTER_REVERSALS = {'b': 'd', 'd': 'b', 'p': 'q', 'q': 'p', 'n': 'u', 'u': 'n'}

    # Whole-word misspellings, keyed by the lower-cased correct word.
    # The multi-word keys can only match when ``corrupt_word`` is called
    # directly with that phrase: ``simulate`` passes single whitespace-free
    # tokens, and handles these phrases through ``WORD_MERGES`` instead.
    PHONETIC_SUBS = {
        'was': 'wuz', 'could': 'cud', 'would': 'wud', 'they': 'thay',
        'because': 'becaus', 'important': 'importnt', 'receive': 'recieve',
        'believe': 'beleive', 'definitely': 'definately', 'separate': 'seperate',
        'a lot': 'alot', 'in fact': 'infact', 'as well': 'aswell',
    }

    # (correct phrase, merged misspelling) pairs applied at phrase level.
    WORD_MERGES = [
        ('a lot', 'alot'), ('in fact', 'infact'), ('as well', 'aswell'),
        ('all right', 'alright'), ('every one', 'everyone'),
    ]

    # Probability of using a phonetic substitution when one is available.
    PHONETIC_SUB_PROB = 0.35

    # Characters peeled off the end of a token before corruption and
    # re-attached afterwards.
    _TRAILING_PUNCT = ".,!?;:\"'()[]{}—–-"

    def __init__(self, error_rate: float = 0.15, seed: int = 42):
        """Store the per-item corruption probability and seed the private RNG.

        Args:
            error_rate: Probability that an eligible word or phrase is
                corrupted; compared against ``rng.random()``.
            seed: Seed for the instance-local ``random.Random``.
        """
        self.error_rate = error_rate
        self.rng = random.Random(seed)

    def _transpose_letters(self, word: str) -> str:
        """Swap two adjacent letters; words shorter than 3 are returned as-is.

        The first letter never moves. The last letter may (e.g. "the" ->
        "teh"), because the swap partner of the chosen position can be the
        final character.
        """
        if len(word) < 3:
            return word
        idx = self.rng.randint(1, len(word) - 2)
        chars = list(word)
        chars[idx], chars[idx + 1] = chars[idx + 1], chars[idx]
        return ''.join(chars)

    def _omit_letter(self, word: str) -> str:
        """Remove one random interior letter; words shorter than 4 are unchanged."""
        if len(word) < 4:
            return word
        idx = self.rng.randint(1, len(word) - 2)
        return word[:idx] + word[idx + 1:]

    def _double_letter(self, word: str) -> str:
        """Duplicate one random interior letter; words shorter than 3 are unchanged."""
        if len(word) < 3:
            return word
        idx = self.rng.randint(1, len(word) - 2)
        return word[:idx] + word[idx] + word[idx:]

    def _reverse_letter(self, word: str) -> str:
        """Replace the first letter found in ``LETTER_REVERSALS`` with its pair.

        Only one letter per word is changed and its case is preserved.
        Returns ``word`` unchanged when it contains no reversible letter.
        """
        for i, c in enumerate(word):
            replacement = self.LETTER_REVERSALS.get(c.lower())
            if replacement is not None:
                if c.isupper():
                    replacement = replacement.upper()
                return word[:i] + replacement + word[i + 1:]
        return word

    def corrupt_word(self, word: str) -> str:
        """Apply a single random error to ``word``.

        Words shorter than 3 characters are returned unchanged. If the
        lower-cased word has an entry in ``PHONETIC_SUBS`` it is used with
        probability ``PHONETIC_SUB_PROB``; otherwise one letter-level error
        is drawn by weight. The chosen error may itself be a no-op (e.g.
        omission on a 3-letter word, reversal with no reversible letter).
        """
        if len(word) < 3:
            return word

        lower = word.lower()
        if lower in self.PHONETIC_SUBS and self.rng.random() < self.PHONETIC_SUB_PROB:
            sub = self.PHONETIC_SUBS[lower]
            return sub.capitalize() if word[0].isupper() else sub

        # Relative weights among the four letter-level error types.
        error_type = self.rng.choices(
            ['transpose', 'omit', 'double', 'reverse'],
            weights=[0.35, 0.30, 0.20, 0.15],
            k=1,
        )[0]

        if error_type == 'transpose':
            return self._transpose_letters(word)
        elif error_type == 'omit':
            return self._omit_letter(word)
        elif error_type == 'double':
            return self._double_letter(word)
        else:
            return self._reverse_letter(word)

    def _merge_phrases(self, text: str) -> str:
        """Merge the first whole-word occurrence of each ``WORD_MERGES`` phrase.

        Each phrase that is present is merged with probability
        ``error_rate``. Matching is case-insensitive and anchored on word
        boundaries so a phrase is never found inside longer words
        ("pizza lots" does not contain the phrase "a lot").
        """
        for phrase, merged in self.WORD_MERGES:
            pattern = re.compile(r'\b' + re.escape(phrase) + r'\b', re.IGNORECASE)
            match = pattern.search(text)
            # The random draw is made only when the phrase is present.
            if match is None or self.rng.random() >= self.error_rate:
                continue
            # Keep a leading capital, consistent with corrupt_word.
            replacement = merged.capitalize() if match.group(0)[0].isupper() else merged
            text = text[:match.start()] + replacement + text[match.end():]
        return text

    def _corrupt_token(self, token: str) -> str:
        """Possibly corrupt one whitespace-free token, keeping trailing punctuation."""
        stripped = token.rstrip(self._TRAILING_PUNCT)
        suffix = token[len(stripped):]
        # The random draw deliberately precedes the isalpha() check so that
        # the sequence of RNG draws per token is stable for a given seed.
        if (len(stripped) >= 3
                and self.rng.random() < self.error_rate
                and stripped.isalpha()):
            return self.corrupt_word(stripped) + suffix
        return token

    def simulate(self, clean_text: str) -> Tuple[str, str]:
        """Return a ``(corrupted_text, clean_text)`` training pair.

        Empty or whitespace-only input is returned unchanged on both sides.
        Otherwise phrase merges are applied first, then each token is
        considered for a word-level error. Only tokens that are purely
        alphabetic (after removing trailing punctuation) and at least three
        letters long are eligible. Whitespace between tokens is preserved
        exactly, so the two halves of the pair differ only by the injected
        errors.
        """
        if not clean_text or not clean_text.strip():
            return (clean_text, clean_text)

        merged = self._merge_phrases(clean_text)
        # Rewrite each run of non-whitespace in place (left to right), leaving
        # the original spacing, tabs and newlines untouched.
        corrupted = re.sub(
            r'\S+',
            lambda m: self._corrupt_token(m.group(0)),
            merged,
        )
        return (corrupted, clean_text)