"""
Programmatically generates dyslectic training data from clean text.

Used to augment training pairs when real dyslectic examples are scarce.

Error types simulated (from Rello et al. 2013, 2017 dyslexia research):
- Phonetic substitution (most common, ~35% of errors)
- Letter transposition (e.g., "teh" for "the") (~18%)
- Letter omission (~16%)
- Letter doubling (~12%)
- Letter reversal b/d, p/q (~10%)
- Word boundary errors (~9%)
"""

import random
import re
from typing import Tuple


class DyslexiaSimulator:
    """Generates synthetic dyslectic text from clean input for data augmentation.

    All randomness comes from a private ``random.Random`` seeded in
    ``__init__``, so two simulators built with the same seed and fed the same
    inputs in the same order produce the same output.
    """

    # Mirror-image letter confusions; looked up on the lower-cased character.
    LETTER_REVERSALS = {'b': 'd', 'd': 'b', 'p': 'q', 'q': 'p', 'n': 'u', 'u': 'n'}

    # Whole-word misspellings, keyed by the lower-cased clean word.
    # The multi-word keys can only match when ``corrupt_word`` is called
    # directly with the phrase: ``simulate`` feeds it single whitespace-free
    # tokens and handles phrases through ``WORD_MERGES`` instead.
    PHONETIC_SUBS = {
        'was': 'wuz',
        'could': 'cud',
        'would': 'wud',
        'they': 'thay',
        'because': 'becaus',
        'important': 'importnt',
        'receive': 'recieve',
        'believe': 'beleive',
        'definitely': 'definately',
        'separate': 'seperate',
        'a lot': 'alot',
        'in fact': 'infact',
        'as well': 'aswell',
    }

    # (clean phrase, merged form) pairs for word-boundary errors.
    WORD_MERGES = [
        ('a lot', 'alot'),
        ('in fact', 'infact'),
        ('as well', 'aswell'),
        ('all right', 'alright'),
        ('every one', 'everyone'),
    ]

    # Characters peeled off both ends of a token before it is corrupted.
    _PUNCTUATION = ".,!?;:\"'()[]{}—–-"

    def __init__(self, error_rate: float = 0.15, seed: int = 42):
        """Create a simulator.

        Args:
            error_rate: Probability that an eligible word (or a present merge
                phrase) is corrupted.
            seed: Seed for the private random number generator.
        """
        self.error_rate = error_rate
        self.rng = random.Random(seed)

    @staticmethod
    def _match_case(replacement: str, original: str) -> str:
        """Return ``replacement`` cased like ``original`` (upper, capitalised, or as is)."""
        if len(original) > 1 and original.isupper():
            return replacement.upper()
        if original[:1].isupper():
            return replacement.capitalize()
        return replacement

    def _transpose_letters(self, word: str) -> str:
        """Swap two adjacent letters, never moving the first letter.

        Words shorter than three characters are returned unchanged. If the two
        chosen letters are identical the result equals the input.
        """
        if len(word) < 3:
            return word
        # The left letter of the pair is interior, so the first letter always
        # stays put; the right letter may be the last one ("the" -> "teh").
        idx = self.rng.randint(1, len(word) - 2)
        chars = list(word)
        chars[idx], chars[idx + 1] = chars[idx + 1], chars[idx]
        return ''.join(chars)

    def _omit_letter(self, word: str) -> str:
        """Remove a random interior letter; words under four characters are unchanged."""
        if len(word) < 4:
            return word
        idx = self.rng.randint(1, len(word) - 2)
        return word[:idx] + word[idx + 1:]

    def _double_letter(self, word: str) -> str:
        """Double a random interior letter; words under three characters are unchanged."""
        if len(word) < 3:
            return word
        idx = self.rng.randint(1, len(word) - 2)
        return word[:idx] + word[idx] + word[idx:]

    def _reverse_letter(self, word: str) -> str:
        """Replace the first reversible letter (see ``LETTER_REVERSALS``), keeping its case.

        Only one letter per word is reversed. A word with no reversible letter
        is returned unchanged.
        """
        for i, c in enumerate(word):
            replacement = self.LETTER_REVERSALS.get(c.lower())
            if replacement is not None:
                if c.isupper():
                    replacement = replacement.upper()
                return word[:i] + replacement + word[i + 1:]
        return word

    def corrupt_word(self, word: str) -> str:
        """Apply a single random error to a word.

        Words shorter than three characters are returned unchanged. A word
        listed in ``PHONETIC_SUBS`` is replaced by its misspelling with
        probability 0.35; otherwise one letter-level error is drawn. The
        result can equal the input when the drawn error does not apply (for
        example a reversal on a word without reversible letters).
        """
        if len(word) < 3:
            return word

        # Check for phonetic substitution first
        sub = self.PHONETIC_SUBS.get(word.lower())
        if sub is not None and self.rng.random() < 0.35:
            return self._match_case(sub, word)

        # Relative weights of the letter-level errors. They follow the same
        # ordering as the frequencies quoted in the module docstring but are
        # not those exact percentages.
        error_type = self.rng.choices(
            ['transpose', 'omit', 'double', 'reverse'],
            weights=[0.35, 0.30, 0.20, 0.15],
            k=1,
        )[0]
        handlers = {
            'transpose': self._transpose_letters,
            'omit': self._omit_letter,
            'double': self._double_letter,
            'reverse': self._reverse_letter,
        }
        return handlers[error_type](word)

    def _merge_phrases(self, text: str) -> str:
        """Merge at most one occurrence of each ``WORD_MERGES`` phrase, by chance."""
        for phrase, merged in self.WORD_MERGES:
            # Lookarounds keep the phrase from matching inside longer words
            # ("was well" must not become "waswell").
            pattern = re.compile(
                r'(?<!\w)' + re.escape(phrase) + r'(?!\w)', re.IGNORECASE
            )
            match = pattern.search(text)
            # The random draw happens only when the phrase is present.
            if match and self.rng.random() < self.error_rate:
                text = (
                    text[:match.start()]
                    + self._match_case(merged, match.group(0))
                    + text[match.end():]
                )
        return text

    def _corrupt_token(self, token: str) -> str:
        """Maybe corrupt one whitespace-free token, keeping surrounding punctuation."""
        punct = self._PUNCTUATION
        prefix_len = len(token) - len(token.lstrip(punct))
        core_end = len(token.rstrip(punct))
        core = token[prefix_len:core_end]
        # The random draw comes before the isalpha() test so that every token
        # with a long-enough core consumes exactly one draw.
        if (len(core) >= 3
                and self.rng.random() < self.error_rate
                and core.isalpha()):
            return token[:prefix_len] + self.corrupt_word(core) + token[core_end:]
        return token

    def simulate(self, clean_text: str) -> Tuple[str, str]:
        """Returns (corrupted_text, clean_text) training pair.

        Empty or whitespace-only input is returned as-is for both elements.
        Otherwise phrase-level merges are applied first, then each word is
        corrupted with probability ``error_rate``. Whitespace between words
        (including newlines and repeated spaces) is preserved, so the pair
        differs only by the simulated errors.
        """
        if not clean_text or not clean_text.strip():
            return (clean_text, clean_text)

        # First, apply word merge errors at phrase level
        merged_text = self._merge_phrases(clean_text)

        # Then corrupt individual words; the capturing group keeps the
        # whitespace runs in the split result so they can be re-joined verbatim.
        pieces = re.split(r'(\s+)', merged_text)
        corrupted_pieces = [
            piece if (not piece or piece.isspace()) else self._corrupt_token(piece)
            for piece in pieces
        ]
        return (''.join(corrupted_pieces), clean_text)