File size: 5,282 Bytes

12fd5f2

"""
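Two-pass spell correction, preceded by a dyslexia-specific phonetic lookup:
Pass 0: phonetic lookup table (known dyslexic misspellings, applied first)
Pass 1: pyspellchecker (fast, context-free, catches simple typos)
Pass 2: LanguageTool (context-aware, catches grammar + dyslexic patterns)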

Dyslexic error patterns handled:
- Letter reversals: b/d, p/q, n/u, m/w
- Phonetic spelling: "wuz", "cud", "thay"
- Word boundary errors: "alot", "infact", "aswell"
- Letter omissions: "becaus", "importnt"
- Letter transpositions: "teh", "recieve"
- Homophone confusion: there/their/they're
"""

import language_tool_python
from spellchecker import SpellChecker
from loguru import logger
from typing import Optional
import re


class DyslexiaAwareSpellCorrector:
    """Spell corrector with dyslexia-specific phonetic pattern handling.

    ``correct`` runs three passes in order:

    0. a lookup-table substitution of known misspellings
       (``DYSLEXIC_PHONETIC_MAP``),
    1. a per-token dictionary check using ``SpellChecker``,
    2. a LanguageTool check, skipped when LanguageTool could not be started.

    Instances may be used as context managers; leaving the ``with`` block
    calls ``close``.
    """

    # NOTE: some keys ("id", "cant", "wont", "alright") are also valid words
    # or identifiers in their own right; they are rewritten whenever they
    # appear as a whole word in lower or mixed case.
    DYSLEXIC_PHONETIC_MAP = {
        "wuz": "was", "cud": "could", "wud": "would", "shud": "should",
        "thay": "they", "thier": "their", "recieve": "receive",
        "beleive": "believe", "occured": "occurred", "definately": "definitely",
        "seperate": "separate", "untill": "until", "tommorrow": "tomorrow",
        "alot": "a lot", "infact": "in fact", "aswell": "as well",
        "alright": "all right", "cant": "cannot", "wont": "will not",
        "ive": "I have", "im": "I am", "id": "I would",
    }

    # Characters peeled off both ends of a token before the dictionary lookup;
    # they are re-attached around the corrected word afterwards.
    _EDGE_PUNCTUATION = ".,!?;:\"'()[]{}—–-"

    def __init__(self, language: str = "en-US"):
        """Create the spell checker and try to start LanguageTool.

        Args:
            language: Language code handed to ``LanguageTool``.

        If LanguageTool cannot be constructed, ``self.tool`` stays ``None``
        and the context-aware pass becomes a no-op; a warning is logged.
        """
        self.spell = SpellChecker()
        self.language = language
        # One alternation over all map keys, matched as whole words only.
        self._phonetic_pattern = re.compile(
            r'\b(' + '|'.join(re.escape(k) for k in self.DYSLEXIC_PHONETIC_MAP) + r')\b',
            re.IGNORECASE
        )
        # Try to initialise LanguageTool; graceful fallback if JVM not available
        self.tool = None
        try:
            self.tool = language_tool_python.LanguageTool(language)
            logger.info("LanguageTool initialised successfully")
        except Exception as e:
            logger.warning(f"LanguageTool unavailable (JVM issue?), skipping context-aware pass: {e}")

    def __enter__(self):
        """Return the corrector itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release LanguageTool resources; exceptions are not suppressed."""
        self.close()

    def _phonetic_pass(self, text: str) -> str:
        """Replace whole words found in ``DYSLEXIC_PHONETIC_MAP``.

        Matching is case-insensitive. A capitalised match yields a
        capitalised replacement. All-uppercase matches (e.g. "ID", "IM") are
        left untouched, mirroring how ``_spellcheck_pass`` leaves
        all-uppercase tokens alone.
        """
        def _replace(match):
            word = match.group(0)
            if word.isupper():
                # Likely an acronym rather than a misspelling: keep as typed.
                return word
            replacement = self.DYSLEXIC_PHONETIC_MAP.get(word.lower(), word)
            # Preserve capitalisation of first letter
            if word[0].isupper() and replacement[0].islower():
                replacement = replacement[0].upper() + replacement[1:]
            return replacement

        return self._phonetic_pattern.sub(_replace, text)

    def _correct_token(self, token: str) -> str:
        """Return *token* with its word part spell-corrected, if needed.

        Leading and trailing punctuation is kept as-is. The token is returned
        unchanged when it is pure punctuation, all-uppercase, already known
        to the spell checker, or when no different correction is offered.
        """
        punctuation = self._EDGE_PUNCTUATION
        core = token.strip(punctuation)
        if not core or core.isupper():
            return token

        lowered = core.lower()
        if lowered in self.spell:
            return token

        correction = self.spell.correction(lowered)
        if not correction or correction == lowered:
            return token

        # Preserve original capitalisation
        if core[0].isupper():
            correction = correction.capitalize()

        lead = len(token) - len(token.lstrip(punctuation))
        return token[:lead] + correction + token[lead + len(core):]

    def _spellcheck_pass(self, text: str) -> str:
        """pyspellchecker pass for simple token-level errors.

        Only runs of non-whitespace are rewritten, so the original spacing,
        tabs and line breaks of *text* are preserved exactly.
        """
        return re.sub(r"\S+", lambda m: self._correct_token(m.group(0)), text)

    @staticmethod
    def _match_length(match) -> int:
        """Return the length of the text span flagged by a LanguageTool match.

        Reads ``errorLength`` and falls back to ``error_length``.
        NOTE(review): newer language_tool_python releases appear to use the
        snake_case name -- confirm against the installed version.
        """
        length = getattr(match, "errorLength", None)
        if length is None:
            length = match.error_length
        return length

    def _languagetool_pass(self, text: str) -> str:
        """LanguageTool pass for context-aware grammar + spelling corrections.

        The first suggested replacement of each match is applied. Matches
        without suggestions are ignored, and a match that overlaps one
        already applied is skipped so that two edits never rewrite the same
        characters. On any failure the input text is returned unchanged and
        a warning is logged.
        """
        if self.tool is None:
            return text

        try:
            matches = self.tool.check(text)
            result = text
            # Work from the end of the text backwards so earlier offsets stay
            # valid; `applied_from` is the start of the last span rewritten.
            applied_from = len(text)
            for match in sorted(matches, key=lambda m: m.offset, reverse=True):
                if not match.replacements:
                    continue
                start = match.offset
                end = start + self._match_length(match)
                if end > applied_from:
                    # Overlaps a correction that was already applied.
                    continue
                result = result[:start] + match.replacements[0] + result[end:]
                applied_from = start
            return result
        except Exception as e:
            logger.warning(f"LanguageTool check failed: {e}")
            return text

    def correct(self, text: str) -> str:
        """Run all three correction passes in sequence.

        Args:
            text: Text to correct.

        Returns:
            The corrected text. Empty or whitespace-only input is returned
            unchanged without running any pass.
        """
        if not text or not text.strip():
            return text

        logger.debug(f"Spell correction input: {text[:100]}...")

        # Pass 0: Phonetic substitutions (dyslexia-specific)
        text = self._phonetic_pass(text)

        # Pass 1: Token-level spellcheck
        text = self._spellcheck_pass(text)

        # Pass 2: Context-aware grammar correction
        text = self._languagetool_pass(text)

        return text

    def close(self):
        """Clean up LanguageTool resources.

        Best-effort: a failure while closing is logged at debug level and
        otherwise ignored. ``self.tool`` is reset to ``None`` either way, so
        calling ``close`` again is harmless.
        """
        if self.tool is None:
            return
        try:
            self.tool.close()
        except Exception as e:
            logger.debug(f"Ignoring error while closing LanguageTool: {e}")
        finally:
            self.tool = None