File size: 5,282 Bytes
12fd5f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
Multi-pass spell correction:
Pass 0: phonetic lookup table (known dyslexic misspellings, applied first)
Pass 1: pyspellchecker (fast, context-free, catches simple typos)
Pass 2: LanguageTool (context-aware, catches grammar + dyslexic patterns)

Dyslexic error patterns handled:
- Letter reversals: b/d, p/q, n/u, m/w
- Phonetic spelling: "wuz", "cud", "thay"
- Word boundary errors: "alot", "infact", "aswell"
- Letter omissions: "becaus", "importnt"
- Letter transpositions: "teh", "recieve"
- Homophone confusion: there/their/they're
"""

import language_tool_python
from spellchecker import SpellChecker
from loguru import logger
from typing import Optional
import re


class DyslexiaAwareSpellCorrector:
    """Multi-pass spell corrector with dyslexia-specific phonetic handling.

    ``correct`` runs three stages in order:

    0. a lookup table of known dyslexic misspellings
       (``DYSLEXIC_PHONETIC_MAP``),
    1. a token-by-token pyspellchecker pass,
    2. a LanguageTool pass, which is skipped when LanguageTool could not be
       started in ``__init__``.

    The instance can be used as a context manager; leaving the ``with`` block
    calls ``close``.
    """

    # Known misspellings (lower-case keys) mapped to their corrections.
    DYSLEXIC_PHONETIC_MAP = {
        "wuz": "was", "cud": "could", "wud": "would", "shud": "should",
        "thay": "they", "thier": "their", "recieve": "receive",
        "beleive": "believe", "occured": "occurred", "definately": "definitely",
        "seperate": "separate", "untill": "until", "tommorrow": "tomorrow",
        "alot": "a lot", "infact": "in fact", "aswell": "as well",
        "alright": "all right", "cant": "cannot", "wont": "will not",
        "ive": "I have", "im": "I am", "id": "I would",
    }

    # Characters peeled off both ends of a token before the dictionary lookup
    # and re-attached around the corrected word afterwards.
    _EDGE_PUNCTUATION = ".,!?;:\"'()[]{}—–-"

    def __init__(self, language: str = "en-US") -> None:
        """Create the spell checker, the phonetic regex and, if possible, LanguageTool.

        Args:
            language: Language code passed to LanguageTool. The pyspellchecker
                instance is created with its own defaults and does not use it.
        """
        self.spell = SpellChecker()
        self.language = language
        # One alternation of every map key, anchored on word boundaries so
        # that a key such as "id" is not matched inside a longer word.
        self._phonetic_pattern = re.compile(
            r'\b(' + '|'.join(re.escape(key) for key in self.DYSLEXIC_PHONETIC_MAP) + r')\b',
            re.IGNORECASE,
        )
        # LanguageTool is optional: if it cannot start, self.tool stays None
        # and the context-aware pass becomes a no-op.
        self.tool = None
        try:
            self.tool = language_tool_python.LanguageTool(language)
            logger.info("LanguageTool initialised successfully")
        except Exception as e:
            logger.warning(f"LanguageTool unavailable (JVM issue?), skipping context-aware pass: {e}")

    def __enter__(self):
        """Return the corrector itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Release LanguageTool on leaving the ``with`` block; never suppresses errors."""
        self.close()

    def _phonetic_pass(self, text: str) -> str:
        """Replace whole words found in ``DYSLEXIC_PHONETIC_MAP``.

        Matching is case-insensitive. A capitalised match yields a capitalised
        replacement. All-caps tokens are left untouched, matching the acronym
        rule of ``_spellcheck_pass``; otherwise "ID" would become "I would".

        Args:
            text: Text to scan.

        Returns:
            The text with every mapped word replaced.
        """
        def _replace(match):
            word = match.group(0)
            if word.isupper():
                # Treat all-caps tokens as acronyms, not misspellings.
                return word
            replacement = self.DYSLEXIC_PHONETIC_MAP.get(word.lower(), word)
            # Carry a leading capital over to the replacement.
            if word[0].isupper() and replacement[0].islower():
                replacement = replacement[0].upper() + replacement[1:]
            return replacement

        return self._phonetic_pattern.sub(_replace, text)

    def _correct_token(self, token: str) -> str:
        """Spell-check one whitespace-free token, keeping its edge punctuation."""
        core = token.strip(self._EDGE_PUNCTUATION)
        # Nothing to check, or an all-caps token that is treated as an acronym.
        if not core or core.isupper():
            return token

        lowered = core.lower()
        if lowered in self.spell:
            return token

        suggestion = self.spell.correction(lowered)
        # The checker may return nothing or hand the same word back.
        if not suggestion or suggestion == lowered:
            return token

        if core[0].isupper():
            suggestion = suggestion.capitalize()
        lead_len = len(token) - len(token.lstrip(self._EDGE_PUNCTUATION))
        return token[:lead_len] + suggestion + token[lead_len + len(core):]

    def _spellcheck_pass(self, text: str) -> str:
        """Run pyspellchecker over every token while preserving whitespace.

        The text is split on whitespace runs with a capturing group, so the
        runs themselves (newlines, tabs, repeated spaces) are kept and put
        back verbatim instead of being collapsed to single spaces.

        Args:
            text: Text to check.

        Returns:
            The text with unknown words replaced by the checker's suggestion.
        """
        pieces = re.split(r"(\s+)", text)
        return "".join(
            piece if not piece or piece.isspace() else self._correct_token(piece)
            for piece in pieces
        )

    @staticmethod
    def _match_length(match) -> int:
        """Return the length of the span a LanguageTool match covers.

        Reads ``errorLength`` and falls back to ``error_length``.
        NOTE(review): newer language_tool_python releases appear to use the
        snake_case name; confirm against the installed version.
        """
        length = getattr(match, "errorLength", None)
        if length is None:
            length = match.error_length
        return length

    def _languagetool_pass(self, text: str) -> str:
        """Apply the first suggestion of every LanguageTool match.

        Matches are applied left to right. A match that starts inside a span
        that has already been replaced is skipped, because its offsets no
        longer describe the text it was reported for.

        Args:
            text: Text to check.

        Returns:
            The corrected text, or ``text`` unchanged when LanguageTool is
            unavailable or the check fails.
        """
        if self.tool is None:
            return text

        try:
            matches = self.tool.check(text)
            pieces = []
            cursor = 0  # index in `text` up to which output has been emitted
            for match in sorted(matches, key=lambda m: m.offset):
                if not match.replacements:
                    continue
                start = match.offset
                if start < cursor:
                    # Overlaps a replacement that was already applied.
                    continue
                pieces.append(text[cursor:start])
                pieces.append(match.replacements[0])
                cursor = start + self._match_length(match)
            pieces.append(text[cursor:])
            return "".join(pieces)
        except Exception as e:
            logger.warning(f"LanguageTool check failed: {e}")
            return text

    def correct(self, text: str) -> str:
        """Run the phonetic, spellcheck and LanguageTool passes in sequence.

        Args:
            text: Text to correct.

        Returns:
            The corrected text. Empty or whitespace-only input is returned
            unchanged.
        """
        if not text or not text.strip():
            return text

        logger.debug(f"Spell correction input: {text[:100]}...")

        # Pass 0: Phonetic substitutions (dyslexia-specific)
        text = self._phonetic_pass(text)

        # Pass 1: Token-level spellcheck
        text = self._spellcheck_pass(text)

        # Pass 2: Context-aware grammar correction
        text = self._languagetool_pass(text)

        return text

    def close(self) -> None:
        """Release LanguageTool resources; safe to call more than once.

        Errors raised while closing are logged at debug level rather than
        propagated, and ``self.tool`` is always reset to ``None``.
        """
        if self.tool is None:
            return
        try:
            self.tool.close()
        except Exception as e:
            logger.debug(f"Ignoring error while closing LanguageTool: {e}")
        finally:
            self.tool = None