| """ |
| Programmatically generates dyslectic training data from clean text. |
| Used to augment training pairs when real dyslectic examples are scarce. |
| |
| Error types simulated (from Rello et al. 2013, 2017 dyslexia research): |
| - Phonetic substitution (most common, ~35% of errors) |
| - Letter transposition (e.g., "teh" for "the") (~18%) |
| - Letter omission (~16%) |
| - Letter doubling (~12%) |
| - Letter reversal b/d, p/q (~10%) |
| - Word boundary errors (~9%) |
| """ |
|
|
| import random |
| import re |
| from typing import Tuple |
|
|
|
|
class DyslexiaSimulator:
    """Generate synthetic misspelled text from clean input for data augmentation.

    Two passes are applied by :meth:`simulate`:

    1. Phrase level: each phrase in ``WORD_MERGES`` may be fused into a
       single word (e.g. ``"a lot"`` -> ``"alot"``).
    2. Word level: each alphabetic word of three or more letters may receive
       one error via :meth:`corrupt_word`.

    Both passes fire with probability ``error_rate`` and draw from a private
    ``random.Random`` seeded in ``__init__``, so a given seed and sequence of
    calls reproduces the same output.
    """

    # Letter pairs swapped by _reverse_letter (lower-case keys only; case is
    # restored by the caller).
    LETTER_REVERSALS = {'b': 'd', 'd': 'b', 'p': 'q', 'q': 'p', 'n': 'u', 'u': 'n'}

    # Whole-word misspellings looked up by corrupt_word.  simulate() only ever
    # passes single alphabetic tokens, so the multi-word keys can match only
    # when corrupt_word is called directly with such a phrase.
    PHONETIC_SUBS = {
        'was': 'wuz', 'could': 'cud', 'would': 'wud', 'they': 'thay',
        'because': 'becaus', 'important': 'importnt', 'receive': 'recieve',
        'believe': 'beleive', 'definitely': 'definately', 'separate': 'seperate',
        'a lot': 'alot', 'in fact': 'infact', 'as well': 'aswell',
    }

    # (phrase, fused form) pairs used by the phrase-level pass of simulate().
    WORD_MERGES = [
        ('a lot', 'alot'), ('in fact', 'infact'), ('as well', 'aswell'),
        ('all right', 'alright'), ('every one', 'everyone'),
    ]

    # Punctuation peeled off both ends of a token before it is corrupted.
    _EDGE_PUNCT = ".,!?;:\"'()[]{}—–-"

    def __init__(self, error_rate: float = 0.15, seed: int = 42):
        """Create a simulator.

        Args:
            error_rate: Probability that an eligible word (or an eligible
                phrase merge) is corrupted.
            seed: Seed for the private random number generator.
        """
        self.error_rate = error_rate
        self.rng = random.Random(seed)

    @staticmethod
    def _match_case(source: str, replacement: str) -> str:
        """Return ``replacement`` re-cased to follow ``source``.

        All-caps source -> all-caps replacement; capitalised source ->
        replacement with its first letter upper-cased; otherwise unchanged.
        """
        if source.isupper():
            return replacement.upper()
        if source[:1].isupper():
            return replacement[:1].upper() + replacement[1:]
        return replacement

    def _transpose_letters(self, word: str) -> str:
        """Swap one randomly chosen pair of adjacent letters.

        The first letter is never moved.  Words shorter than three letters
        are returned unchanged.
        """
        if len(word) < 3:
            return word

        # idx + 1 is at most len(word) - 1, so the swap stays in bounds.
        idx = self.rng.randint(1, len(word) - 2)
        chars = list(word)
        chars[idx], chars[idx + 1] = chars[idx + 1], chars[idx]
        return ''.join(chars)

    def _omit_letter(self, word: str) -> str:
        """Remove one random interior letter (never the first or last).

        Words shorter than four letters are returned unchanged.
        """
        if len(word) < 4:
            return word
        idx = self.rng.randint(1, len(word) - 2)
        return word[:idx] + word[idx + 1:]

    def _double_letter(self, word: str) -> str:
        """Duplicate one random interior letter (never the first or last).

        Words shorter than three letters are returned unchanged.
        """
        if len(word) < 3:
            return word
        idx = self.rng.randint(1, len(word) - 2)
        return word[:idx] + word[idx] + word[idx:]

    def _reverse_letter(self, word: str) -> str:
        """Replace the first reversible letter with its mirror partner.

        The pairs come from ``LETTER_REVERSALS`` (b/d, p/q, n/u).  Only the
        first matching letter is changed and its case is preserved.  The word
        is returned unchanged when it contains no reversible letter.
        """
        for i, c in enumerate(word):
            replacement = self.LETTER_REVERSALS.get(c.lower())
            if replacement is not None:
                if c.isupper():
                    replacement = replacement.upper()
                return word[:i] + replacement + word[i + 1:]
        return word

    def corrupt_word(self, word: str) -> str:
        """Apply a single random error to ``word``.

        If the lower-cased word is a key of ``PHONETIC_SUBS`` it is replaced
        by the listed misspelling with probability 0.35 (case of the original
        is preserved: ``WAS`` -> ``WUZ``, ``Was`` -> ``Wuz``).  Otherwise one
        of transpose / omit / double / reverse is chosen with weights
        0.35 / 0.30 / 0.20 / 0.15.  The chosen operation may leave the word
        unchanged (e.g. omission on a three-letter word).

        Args:
            word: The word to corrupt; words shorter than three characters
                are returned unchanged.

        Returns:
            The (possibly) corrupted word.
        """
        if len(word) < 3:
            return word

        lower = word.lower()
        if lower in self.PHONETIC_SUBS and self.rng.random() < 0.35:
            return self._match_case(word, self.PHONETIC_SUBS[lower])

        error_type = self.rng.choices(
            ['transpose', 'omit', 'double', 'reverse'],
            weights=[0.35, 0.30, 0.20, 0.15],
            k=1,
        )[0]

        if error_type == 'transpose':
            return self._transpose_letters(word)
        if error_type == 'omit':
            return self._omit_letter(word)
        if error_type == 'double':
            return self._double_letter(word)
        return self._reverse_letter(word)

    def _merge_phrases(self, text: str) -> str:
        """Fuse at most one occurrence of each ``WORD_MERGES`` phrase."""
        for original_phrase, merged in self.WORD_MERGES:
            # Word boundaries stop the phrase from matching inside other
            # words ("certain factors" must not become "certainfactors").
            pattern = re.compile(
                r'\b' + re.escape(original_phrase) + r'\b', re.IGNORECASE
            )
            match = pattern.search(text)
            # The RNG is only consulted when the phrase is actually present.
            if match and self.rng.random() < self.error_rate:
                fused = self._match_case(match.group(0), merged)
                text = text[:match.start()] + fused + text[match.end():]
        return text

    def _corrupt_token(self, token: str) -> str:
        """Maybe corrupt one whitespace-free token, keeping edge punctuation."""
        head_len = len(token) - len(token.lstrip(self._EDGE_PUNCT))
        prefix, remainder = token[:head_len], token[head_len:]
        core = remainder.rstrip(self._EDGE_PUNCT)
        suffix = remainder[len(core):]

        # Condition order is deliberate: the RNG is drawn for every core of
        # three or more characters, before the alphabetic check.
        if (len(core) >= 3 and
                self.rng.random() < self.error_rate and
                core.isalpha()):
            return prefix + self.corrupt_word(core) + suffix
        return token

    def simulate(self, clean_text: str) -> Tuple[str, str]:
        """Return a ``(corrupted_text, clean_text)`` training pair.

        Whitespace in ``clean_text`` (newlines, tabs, repeated spaces,
        leading/trailing blanks) is carried over to the corrupted text
        unchanged, so the two sides differ only by the injected errors.
        Empty or whitespace-only input is returned as-is on both sides.

        Args:
            clean_text: The correctly spelled source text.

        Returns:
            A tuple whose first element is the corrupted text and whose
            second element is ``clean_text`` itself.
        """
        if not clean_text or not clean_text.strip():
            return (clean_text, clean_text)

        merged_text = self._merge_phrases(clean_text)

        # The capturing group keeps the whitespace runs in the result so the
        # text can be reassembled exactly.
        pieces = [
            piece if (not piece or piece.isspace()) else self._corrupt_token(piece)
            for piece in re.split(r'(\s+)', merged_text)
        ]
        return (''.join(pieces), clean_text)
|
|