"""
Programmatically generates dyslectic training data from clean text.
Used to augment training pairs when real dyslectic examples are scarce.

Error types simulated (from Rello et al. 2013, 2017 dyslexia research):
- Phonetic substitution (most common, ~35% of errors)
- Letter transposition (e.g., "teh" for "the") (~18%)
- Letter omission (~16%)
- Letter doubling (~12%)
- Letter reversal b/d, p/q (~10%)
- Word boundary errors (~9%)
"""

import random
import re
from typing import Tuple


class DyslexiaSimulator:
    """Generate synthetic dyslexic-style misspellings for data augmentation.

    ``simulate`` takes clean text and returns a ``(corrupted, clean)`` pair.
    Corruption happens in two passes: whole-phrase merges from
    ``WORD_MERGES`` first, then per-word errors via ``corrupt_word``. All
    randomness comes from a private ``random.Random`` seeded in ``__init__``,
    so a given seed and input sequence always yields the same output.
    """

    # Visually confusable letters; each maps to the letter it is swapped with.
    LETTER_REVERSALS = {'b': 'd', 'd': 'b', 'p': 'q', 'q': 'p', 'n': 'u', 'u': 'n'}
    # Whole-word misspellings, keyed by the lowercase correct spelling.
    # The multi-word keys can only match when ``corrupt_word`` is called
    # directly with a phrase: ``simulate`` splits on whitespace and handles
    # those phrases through ``WORD_MERGES`` instead.
    PHONETIC_SUBS = {
        'was': 'wuz', 'could': 'cud', 'would': 'wud', 'they': 'thay',
        'because': 'becaus', 'important': 'importnt', 'receive': 'recieve',
        'believe': 'beleive', 'definitely': 'definately', 'separate': 'seperate',
        'a lot': 'alot', 'in fact': 'infact', 'as well': 'aswell',
    }
    # (phrase, merged form) pairs used for word-boundary errors.
    WORD_MERGES = [
        ('a lot', 'alot'), ('in fact', 'infact'), ('as well', 'aswell'),
        ('all right', 'alright'), ('every one', 'everyone'),
    ]
    # Punctuation peeled off both ends of a token before it is corrupted.
    _EDGE_PUNCTUATION = ".,!?;:\"'()[]{}—–-"

    def __init__(self, error_rate: float = 0.15, seed: int = 42):
        """Create a simulator.

        Args:
            error_rate: Probability that an eligible word (or an eligible
                phrase merge) is corrupted.
            seed: Seed for the private random generator.
        """
        self.error_rate = error_rate
        self.rng = random.Random(seed)

    @staticmethod
    def _match_case(template: str, replacement: str) -> str:
        """Return ``replacement`` cased like ``template`` (upper/capitalized/as-is)."""
        if template.isupper():
            return replacement.upper()
        if template[:1].isupper():
            return replacement[:1].upper() + replacement[1:]
        return replacement

    def _transpose_letters(self, word: str) -> str:
        """Swap two adjacent letters, never moving the first letter.

        Words shorter than 3 characters are returned unchanged.
        """
        if len(word) < 3:
            return word
        # The swap starts at an interior index, so the first letter stays in
        # place but the last letter may move (e.g. "the" -> "teh").
        idx = self.rng.randint(1, len(word) - 2)
        chars = list(word)
        chars[idx], chars[idx + 1] = chars[idx + 1], chars[idx]
        return ''.join(chars)

    def _omit_letter(self, word: str) -> str:
        """Remove one random interior letter; words under 4 chars are unchanged."""
        if len(word) < 4:
            return word
        idx = self.rng.randint(1, len(word) - 2)
        return word[:idx] + word[idx + 1:]

    def _double_letter(self, word: str) -> str:
        """Duplicate one random interior letter; words under 3 chars are unchanged."""
        if len(word) < 3:
            return word
        idx = self.rng.randint(1, len(word) - 2)
        return word[:idx] + word[idx] + word[idx:]

    def _reverse_letter(self, word: str) -> str:
        """Replace the first letter found in ``LETTER_REVERSALS`` with its partner.

        Case is preserved. Only one letter is changed per word; a word with
        no reversible letter is returned unchanged.
        """
        for i, c in enumerate(word):
            partner = self.LETTER_REVERSALS.get(c.lower())
            if partner is not None:
                if c.isupper():
                    partner = partner.upper()
                return word[:i] + partner + word[i + 1:]
        return word

    def corrupt_word(self, word: str) -> str:
        """Apply a single random error to ``word``.

        Words shorter than 3 characters are returned unchanged. A word listed
        in ``PHONETIC_SUBS`` is replaced by its misspelling 35% of the time,
        keeping the input's casing (lower, Capitalized or ALL CAPS).
        Otherwise one character-level error is chosen by weight. The result
        may equal the input when the chosen error does not apply (for
        example, a reversal on a word with no reversible letter).
        """
        if len(word) < 3:
            return word
        lower = word.lower()
        if lower in self.PHONETIC_SUBS and self.rng.random() < 0.35:
            return self._match_case(word, self.PHONETIC_SUBS[lower])

        # Relative weights for the character-level errors, in the same
        # frequency order as the module docstring.
        error_type = self.rng.choices(
            ['transpose', 'omit', 'double', 'reverse'],
            weights=[0.35, 0.30, 0.20, 0.15],
            k=1
        )[0]

        if error_type == 'transpose':
            return self._transpose_letters(word)
        elif error_type == 'omit':
            return self._omit_letter(word)
        elif error_type == 'double':
            return self._double_letter(word)
        else:
            return self._reverse_letter(word)

    def _apply_word_merges(self, text: str) -> str:
        """Merge at most one occurrence of each ``WORD_MERGES`` phrase.

        Each phrase present in ``text`` is merged with probability
        ``error_rate``.
        """
        for phrase, merged in self.WORD_MERGES:
            # \b keeps the phrase from matching inside longer words
            # ("banana lottery" must not become "bananalottery").
            pattern = re.compile(r'\b' + re.escape(phrase) + r'\b', re.IGNORECASE)
            if pattern.search(text) and self.rng.random() < self.error_rate:
                text = pattern.sub(
                    lambda m, merged=merged: self._match_case(m.group(0), merged),
                    text,
                    count=1,
                )
        return text

    def _corrupt_token(self, token: str) -> str:
        """Corrupt one whitespace-delimited token, keeping edge punctuation."""
        punct = self._EDGE_PUNCTUATION
        core_end = len(token.rstrip(punct))
        core_start = len(token) - len(token.lstrip(punct))
        core = token[core_start:core_end]  # empty when token is all punctuation

        # The random draw comes before the isalpha() check so the RNG is
        # consumed once for every token whose core has 3+ characters.
        if (len(core) >= 3 and
                self.rng.random() < self.error_rate and
                core.isalpha()):
            return token[:core_start] + self.corrupt_word(core) + token[core_end:]
        return token

    def simulate(self, clean_text: str) -> Tuple[str, str]:
        """Return a ``(corrupted_text, clean_text)`` training pair.

        Empty or whitespace-only input is returned as ``(clean_text,
        clean_text)``. Otherwise phrase merges are applied first, then each
        word is corrupted with probability ``error_rate``. Whitespace in the
        corrupted text is kept exactly as in the input, so the two halves of
        the pair differ only where an error was injected. ``clean_text`` is
        returned unmodified.
        """
        if not clean_text or not clean_text.strip():
            return (clean_text, clean_text)

        corrupted = self._apply_word_merges(clean_text)

        # The capturing group makes re.split keep the whitespace runs, so
        # joining the pieces reproduces the original spacing and newlines.
        pieces = [
            tok if (not tok or tok.isspace()) else self._corrupt_token(tok)
            for tok in re.split(r'(\s+)', corrupted)
        ]
        return (''.join(pieces), clean_text)