# File size: 5,106 Bytes

# 12fd5f2

"""
Programmatically generates dyslectic training data from clean text.
Used to augment training pairs when real dyslectic examples are scarce.

Error types simulated (from Rello et al. 2013, 2017 dyslexia research):
- Phonetic substitution (most common, ~35% of errors)
- Letter transposition (e.g., "teh" for "the") (~18%)
- Letter omission (~16%)
- Letter doubling (~12%)
- Letter reversal b/d, p/q, n/u (~10%)
- Word boundary errors (~9%)
"""

import random
import re
from typing import Tuple


class DyslexiaSimulator:
    """Generates synthetic dyslectic text from clean input for data augmentation.

    Errors are injected at two levels: whole-phrase merges (``WORD_MERGES``)
    followed by single-word corruptions (phonetic substitution,
    transposition, omission, doubling, letter reversal). All randomness is
    drawn from a private ``random.Random`` seeded in ``__init__``.
    """

    # Visually confusable letter pairs swapped by _reverse_letter.
    LETTER_REVERSALS = {'b': 'd', 'd': 'b', 'p': 'q', 'q': 'p', 'n': 'u', 'u': 'n'}
    # Whole-word misspellings keyed by the lowercase correct spelling.
    # The multi-word keys only match when corrupt_word() is called directly
    # with the phrase; simulate() handles those phrases through WORD_MERGES.
    PHONETIC_SUBS = {
        'was': 'wuz', 'could': 'cud', 'would': 'wud', 'they': 'thay',
        'because': 'becaus', 'important': 'importnt', 'receive': 'recieve',
        'believe': 'beleive', 'definitely': 'definately', 'separate': 'seperate',
        'a lot': 'alot', 'in fact': 'infact', 'as well': 'aswell',
    }
    # (phrase, merged form) pairs applied by simulate() before word-level errors.
    WORD_MERGES = [
        ('a lot', 'alot'), ('in fact', 'infact'), ('as well', 'aswell'),
        ('all right', 'alright'), ('every one', 'everyone'),
    ]

    # Punctuation peeled off a token before corruption and re-attached after.
    _TRAILING_PUNCT = ".,!?;:\"'()[]{}—–-"
    _LEADING_PUNCT = "\"'([{"
    # Relative weights used when choosing a letter-level error type.
    _ERROR_WEIGHTS = (
        ('transpose', 0.35),
        ('omit', 0.30),
        ('double', 0.20),
        ('reverse', 0.15),
    )

    def __init__(self, error_rate: float = 0.15, seed: int = 42):
        """Create a simulator.

        Args:
            error_rate: Probability that an eligible word (or mergeable
                phrase) is corrupted.
            seed: Seed for the private random number generator.
        """
        self.error_rate = error_rate
        self.rng = random.Random(seed)

    @staticmethod
    def _transposable_positions(word: str) -> list:
        """Return indices i >= 1 where swapping word[i] and word[i + 1] changes the word."""
        return [i for i in range(1, len(word) - 1) if word[i] != word[i + 1]]

    def _transpose_letters(self, word: str) -> str:
        """Swap two adjacent letters, never moving the first letter.

        Only positions whose two letters differ are considered, so the
        result differs from ``word`` whenever such a position exists.
        Returns ``word`` unchanged when there is none (including words
        shorter than three characters).
        """
        positions = self._transposable_positions(word)
        if not positions:
            return word
        idx = self.rng.choice(positions)
        chars = list(word)
        chars[idx], chars[idx + 1] = chars[idx + 1], chars[idx]
        return ''.join(chars)

    def _omit_letter(self, word: str) -> str:
        """Remove a random interior letter; words under four characters are returned as is."""
        if len(word) < 4:
            return word
        idx = self.rng.randint(1, len(word) - 2)
        return word[:idx] + word[idx + 1:]

    def _double_letter(self, word: str) -> str:
        """Double a random interior letter; words under three characters are returned as is."""
        if len(word) < 3:
            return word
        idx = self.rng.randint(1, len(word) - 2)
        return word[:idx] + word[idx] + word[idx:]

    def _reverse_letter(self, word: str) -> str:
        """Replace the first letter found in ``LETTER_REVERSALS`` with its counterpart.

        Case is preserved. Only one letter per word is changed; the word
        is returned unchanged when it contains no reversible letter.
        """
        for i, c in enumerate(word):
            replacement = self.LETTER_REVERSALS.get(c.lower())
            if replacement is not None:
                if c.isupper():
                    replacement = replacement.upper()
                return word[:i] + replacement + word[i + 1:]
        return word

    def corrupt_word(self, word: str) -> str:
        """Apply a single random error to a word.

        Words shorter than three characters are returned unchanged. A word
        listed in ``PHONETIC_SUBS`` is replaced by its misspelling with
        probability 0.35; otherwise one letter-level error is drawn by
        weight from the error types that can actually alter this word.

        Args:
            word: The token to corrupt, without surrounding punctuation.

        Returns:
            The corrupted word.
        """
        if len(word) < 3:
            return word
        # Check for phonetic substitution first
        lower = word.lower()
        if lower in self.PHONETIC_SUBS and self.rng.random() < 0.35:
            sub = self.PHONETIC_SUBS[lower]
            return sub.capitalize() if word[0].isupper() else sub

        # Skip error types that would be a no-op for this word; otherwise the
        # effective error rate silently drops below the requested one.
        # Doubling always applies here because len(word) >= 3.
        applicable = {
            'transpose': bool(self._transposable_positions(word)),
            'omit': len(word) >= 4,
            'double': True,
            'reverse': any(c.lower() in self.LETTER_REVERSALS for c in word),
        }
        names = [name for name, _ in self._ERROR_WEIGHTS if applicable[name]]
        weights = [weight for name, weight in self._ERROR_WEIGHTS if applicable[name]]
        error_type = self.rng.choices(names, weights=weights, k=1)[0]

        handlers = {
            'transpose': self._transpose_letters,
            'omit': self._omit_letter,
            'double': self._double_letter,
            'reverse': self._reverse_letter,
        }
        return handlers[error_type](word)

    def _corrupt_token(self, token: str) -> str:
        """Corrupt one whitespace-delimited token, keeping its surrounding punctuation."""
        body = token.rstrip(self._TRAILING_PUNCT)
        suffix = token[len(body):]
        core = body.lstrip(self._LEADING_PUNCT)
        prefix = body[:len(body) - len(core)]

        if (len(core) >= 3 and
                self.rng.random() < self.error_rate and
                core.isalpha()):
            return prefix + self.corrupt_word(core) + suffix
        return token

    def simulate(self, clean_text: str) -> Tuple[str, str]:
        """Returns (corrupted_text, clean_text) training pair.

        Phrase merges from ``WORD_MERGES`` are applied first (whole-word
        matches only, at most one occurrence per phrase), then each
        alphabetic word of three or more letters is corrupted with
        probability ``error_rate``. Whitespace in the input is preserved.

        Args:
            clean_text: The correctly spelled source text.

        Returns:
            A ``(corrupted_text, clean_text)`` tuple. Empty or
            whitespace-only input is returned unchanged in both positions.
        """
        if not clean_text or not clean_text.strip():
            return (clean_text, clean_text)

        # First, apply word merge errors at phrase level
        corrupted = clean_text
        for original_phrase, merged in self.WORD_MERGES:
            # Word boundaries stop the phrase from matching inside other
            # words (e.g. "as well" inside "was well").
            pattern = re.compile(
                r'\b' + re.escape(original_phrase) + r'\b', re.IGNORECASE
            )
            match = pattern.search(corrupted)
            if match and self.rng.random() < self.error_rate:
                # Keep a leading capital so sentence starts stay capitalised.
                replacement = (
                    merged.capitalize() if match.group(0)[0].isupper() else merged
                )
                corrupted = (
                    corrupted[:match.start()] + replacement + corrupted[match.end():]
                )

        # Then corrupt individual words in place, leaving the whitespace
        # between them exactly as it was.
        corrupted = re.sub(
            r'\S+', lambda m: self._corrupt_token(m.group(0)), corrupted
        )
        return (corrupted, clean_text)