File size: 5,282 Bytes
12fd5f2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | """
Spell correction in three passes:
Pass 0: phonetic map (fixed substitutions for known dyslexic misspellings)
Pass 1: pyspellchecker (fast, context-free, catches simple typos)
Pass 2: LanguageTool (context-aware, catches grammar + dyslexic patterns)
Dyslexic error patterns handled:
- Letter reversals: b/d, p/q, n/u, m/w
- Phonetic spelling: "wuz", "cud", "thay"
- Word boundary errors: "alot", "infact", "aswell"
- Letter omissions: "becaus", "importnt"
- Letter transpositions: "teh", "recieve"
- Homophone confusion: there/their/they're
"""
import language_tool_python
from spellchecker import SpellChecker
from loguru import logger
from typing import Optional
import re
class DyslexiaAwareSpellCorrector:
    """Spell corrector with dyslexia-specific phonetic pattern handling.

    ``correct`` runs three passes in order:

    0. a fixed phonetic map of known misspellings (``DYSLEXIC_PHONETIC_MAP``),
    1. a token-level pyspellchecker pass,
    2. a LanguageTool pass, skipped when LanguageTool could not be started.

    Instances can be used as context managers; leaving the ``with`` block
    calls ``close``.
    """

    DYSLEXIC_PHONETIC_MAP = {
        "wuz": "was", "cud": "could", "wud": "would", "shud": "should",
        "thay": "they", "thier": "their", "recieve": "receive",
        "beleive": "believe", "occured": "occurred", "definately": "definitely",
        "seperate": "separate", "untill": "until", "tommorrow": "tomorrow",
        "alot": "a lot", "infact": "in fact", "aswell": "as well",
        "alright": "all right", "cant": "cannot", "wont": "will not",
        "ive": "I have", "im": "I am", "id": "I would",
    }

    # Characters peeled off both ends of a token before it is spell-checked.
    _EDGE_PUNCTUATION = ".,!?;:\"'()[]{}—–-“”‘’"

    # Letters only, optionally joined by internal apostrophes or hyphens.
    # Tokens that fail this (URLs, e-mails, "x2y", snake_case) are not words
    # and are left untouched by the spell-check pass.
    _CHECKABLE_WORD = re.compile(r"[^\W\d_]+(?:['’-][^\W\d_]+)*")

    def __init__(self, language: str = "en-US"):
        """Create the spell checker and try to start LanguageTool.

        Args:
            language: Language code handed to LanguageTool. The
                pyspellchecker instance is created with its default
                arguments and does not receive this value.
        """
        self.spell = SpellChecker()
        self.language = language
        self._phonetic_pattern = self._build_phonetic_pattern()

        # Try to initialise LanguageTool; graceful fallback if JVM not available
        self.tool = None
        try:
            self.tool = language_tool_python.LanguageTool(language)
            logger.info("LanguageTool initialised successfully")
        except Exception as e:
            logger.warning(f"LanguageTool unavailable (JVM issue?), skipping context-aware pass: {e}")

    def __enter__(self):
        """Return the corrector itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release LanguageTool resources when the ``with`` block ends."""
        self.close()

    @classmethod
    def _build_phonetic_pattern(cls):
        """Compile a case-insensitive whole-word regex over the phonetic map keys."""
        keys = list(cls.DYSLEXIC_PHONETIC_MAP)
        if not keys:
            # An empty alternation would match the empty string at every word
            # boundary; use a pattern that can never match instead.
            return re.compile(r"(?!)")
        alternation = "|".join(re.escape(key) for key in keys)
        return re.compile(r"\b(" + alternation + r")\b", re.IGNORECASE)

    def _phonetic_pass(self, text: str) -> str:
        """Apply the known phonetic substitutions.

        A leading capital is carried over to the replacement. Tokens written
        entirely in upper case are left alone, matching the acronym rule of
        the spell-check pass, so "ID" is not rewritten to "I would".
        """
        def _replace(match):
            word = match.group(0)
            if word.isupper():
                return word
            replacement = self.DYSLEXIC_PHONETIC_MAP.get(word.lower(), word)
            # Preserve capitalisation of first letter
            if word[0].isupper() and replacement[0].islower():
                replacement = replacement[0].upper() + replacement[1:]
            return replacement

        return self._phonetic_pattern.sub(_replace, text)

    def _spellcheck_pass(self, text: str) -> str:
        """Correct simple token-level typos with pyspellchecker.

        Only runs of non-whitespace are rewritten, so newlines, tabs and
        repeated spaces in ``text`` are preserved exactly. Punctuation at the
        edges of a token is kept around the corrected word. All-upper-case
        tokens and tokens that are not plain words are skipped.
        """
        edge = self._EDGE_PUNCTUATION

        def _fix_token(match):
            token = match.group(0)
            core = token.strip(edge)
            if not core or core.isupper():
                return token
            if self._CHECKABLE_WORD.fullmatch(core) is None:
                return token

            lowered = core.lower()
            if lowered in self.spell:
                return token
            suggestion = self.spell.correction(lowered)
            if not suggestion or suggestion == lowered:
                return token

            # Preserve original capitalisation
            if core[0].isupper():
                suggestion = suggestion.capitalize()
            lead = len(token) - len(token.lstrip(edge))
            return token[:lead] + suggestion + token[lead + len(core):]

        return re.sub(r"\S+", _fix_token, text)

    @staticmethod
    def _match_length(match) -> int:
        """Return the length of the span a LanguageTool match covers."""
        length = getattr(match, "errorLength", None)
        if length is None:
            # NOTE(review): some language_tool_python releases appear to name
            # this attribute `error_length`; confirm against the pinned version.
            length = match.error_length
        return length

    def _languagetool_pass(self, text: str) -> str:
        """Apply the first suggested replacement of each LanguageTool match.

        Matches are taken in offset order and a match that overlaps one
        already applied is skipped, so every offset is used against the
        original ``text`` it was computed for. Returns ``text`` unchanged when
        LanguageTool is unavailable or the check raises.
        """
        if self.tool is None:
            return text
        try:
            pieces = []
            cursor = 0  # end of the last span that was replaced
            for match in sorted(self.tool.check(text), key=lambda m: m.offset):
                if not match.replacements:
                    continue
                start = match.offset
                if start < cursor:
                    continue  # overlaps a correction that was already applied
                pieces.append(text[cursor:start])
                pieces.append(match.replacements[0])
                cursor = start + self._match_length(match)
            pieces.append(text[cursor:])
            return "".join(pieces)
        except Exception as e:
            logger.warning(f"LanguageTool check failed: {e}")
            return text

    def correct(self, text: str) -> str:
        """Run all three correction passes in sequence.

        Empty or whitespace-only input is returned unchanged.
        """
        if not text or not text.strip():
            return text
        logger.debug(f"Spell correction input: {text[:100]}...")
        # Pass 0: Phonetic substitutions (dyslexia-specific)
        text = self._phonetic_pass(text)
        # Pass 1: Token-level spellcheck
        text = self._spellcheck_pass(text)
        # Pass 2: Context-aware grammar correction
        text = self._languagetool_pass(text)
        return text

    def close(self):
        """Clean up LanguageTool resources; safe to call more than once."""
        tool = getattr(self, "tool", None)
        # Drop the handle first so it is cleared even if close() raises.
        self.tool = None
        if tool is None:
            return
        try:
            tool.close()
        except Exception as e:
            # Best-effort cleanup: report the failure but never raise.
            logger.debug(f"LanguageTool close failed: {e}")
|