# rewrite/src/preprocessing/spell_corrector.py
# Uploaded by morpheuslord: "Add files using upload-large-folder tool" (commit 12fd5f2, verified)
"""
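Spell correction in three stages (a phonetic pre-pass followed by two checker passes):
Pass 0: Phonetic substitution map (known dyslexic misspellings, e.g. "wuz" -> "was")
Pass 1: pyspellchecker (fast, context-free, catches simple typos)
Pass 2: LanguageTool (context-aware, catches grammar + dyslexic patterns)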
Dyslexic error patterns handled:
- Letter reversals: b/d, p/q, n/u, m/w
- Phonetic spelling: "wuz", "cud", "thay"
- Word boundary errors: "alot", "infact", "aswell"
- Letter omissions: "becaus", "importnt"
- Letter transpositions: "teh", "recieve"
- Homophone confusion: there/their/they're
"""
import language_tool_python
from spellchecker import SpellChecker
from loguru import logger
from typing import Optional
import re
class DyslexiaAwareSpellCorrector:
    """Spell corrector with dyslexia-specific phonetic pattern handling.

    ``correct()`` runs the text through up to three stages, in order:

    0. A regex substitution of the misspellings listed in
       ``DYSLEXIC_PHONETIC_MAP``.
    1. A per-token pyspellchecker correction.
    2. A LanguageTool check; the first suggested replacement of every
       non-overlapping match is applied. This stage is skipped when
       LanguageTool could not be started.

    Instances may be used as context managers; ``close()`` runs on exit.
    """

    # Lower-case misspelling -> replacement. Keys are matched as whole words,
    # case-insensitively.
    # NOTE(review): "id", "im", "cant", "wont" and "alright" can also be
    # legitimate words; because this map is applied without context they are
    # always rewritten. Confirm this is intended for the target corpus.
    DYSLEXIC_PHONETIC_MAP = {
        "wuz": "was", "cud": "could", "wud": "would", "shud": "should",
        "thay": "they", "thier": "their", "recieve": "receive",
        "beleive": "believe", "occured": "occurred", "definately": "definitely",
        "seperate": "separate", "untill": "until", "tommorrow": "tomorrow",
        "alot": "a lot", "infact": "in fact", "aswell": "as well",
        "alright": "all right", "cant": "cannot", "wont": "will not",
        "ive": "I have", "im": "I am", "id": "I would",
    }

    # Characters peeled off both ends of a token before it is spell-checked.
    _EDGE_PUNCTUATION = ".,!?;:\"'()[]{}—–-"

    # Splits text into alternating word / whitespace chunks. The capture group
    # keeps the whitespace runs so they can be re-emitted unchanged.
    _WHITESPACE_SPLIT = re.compile(r"(\s+)")

    def __init__(self, language: str = "en-US"):
        """Create the checkers.

        Args:
            language: Language code handed to LanguageTool.

        A failure to start LanguageTool is logged and leaves ``self.tool`` as
        ``None``; the other stages keep working.
        """
        self.spell = SpellChecker()
        self.language = language
        # Build regex pattern for phonetic map (word-boundary matching)
        self._phonetic_pattern = self._build_phonetic_pattern()
        # Try to initialise LanguageTool; graceful fallback if JVM not available
        self.tool = None
        try:
            self.tool = language_tool_python.LanguageTool(language)
            logger.info("LanguageTool initialised successfully")
        except Exception as e:
            logger.warning(f"LanguageTool unavailable (JVM issue?), skipping context-aware pass: {e}")

    def __enter__(self):
        """Return the corrector itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release LanguageTool resources; exceptions are not suppressed."""
        self.close()
        return False

    @classmethod
    def _build_phonetic_pattern(cls):
        """Compile a case-insensitive whole-word regex over the map's keys."""
        alternatives = '|'.join(re.escape(k) for k in cls.DYSLEXIC_PHONETIC_MAP)
        return re.compile(r'\b(' + alternatives + r')\b', re.IGNORECASE)

    def _phonetic_pass(self, text: str) -> str:
        """Apply known dyslexic phonetic substitutions first.

        A leading capital on the matched word is carried over to the
        replacement. All-uppercase words of more than one letter are left
        untouched, mirroring the spellcheck pass, so acronyms such as "ID"
        are not rewritten to "I would".
        """
        def _replace(match):
            word = match.group(0)
            if len(word) > 1 and word.isupper():
                return word
            replacement = self.DYSLEXIC_PHONETIC_MAP.get(word.lower(), word)
            # Preserve capitalisation of first letter
            if word[0].isupper() and replacement[0].islower():
                replacement = replacement[0].upper() + replacement[1:]
            return replacement

        return self._phonetic_pattern.sub(_replace, text)

    def _correct_token(self, word: str) -> str:
        """Spell-correct one whitespace-free token, keeping edge punctuation."""
        punctuation = self._EDGE_PUNCTUATION
        without_prefix = word.lstrip(punctuation)
        prefix = word[:len(word) - len(without_prefix)]
        stripped = without_prefix.rstrip(punctuation)
        suffix = without_prefix[len(stripped):]

        # Nothing to check, or an all-caps token that is treated as an acronym.
        if not stripped or stripped.isupper():
            return word
        lowered = stripped.lower()
        if lowered in self.spell:
            return word

        correction = self.spell.correction(lowered)
        # A falsy result means the checker has no suggestion.
        if not correction or correction == lowered:
            return word
        # Preserve original capitalisation
        if stripped[0].isupper():
            correction = correction.capitalize()
        return prefix + correction + suffix

    def _spellcheck_pass(self, text: str) -> str:
        """pyspellchecker pass for simple token-level errors.

        Tokens are separated on whitespace and corrected one by one. The
        whitespace itself (newlines, tabs, repeated spaces) is preserved
        exactly, so paragraph structure survives this pass.
        """
        chunks = self._WHITESPACE_SPLIT.split(text)
        return "".join(
            chunk if not chunk or chunk.isspace() else self._correct_token(chunk)
            for chunk in chunks
        )

    @staticmethod
    def _apply_matches(text: str, matches) -> str:
        """Apply the first replacement of each match, left to right.

        A match that starts inside a span already replaced by an earlier
        match is skipped; applying both would splice one correction into the
        middle of another. Matches without replacements are ignored.
        """
        pieces = []
        cursor = 0
        for match in sorted(matches, key=lambda m: m.offset):
            if not match.replacements:
                continue
            start = match.offset
            if start < cursor:
                continue
            length = getattr(match, "errorLength", None)
            if length is None:
                # NOTE(review): newer language_tool_python releases appear to
                # expose this as ``error_length`` -- confirm against the
                # installed version.
                length = match.error_length
            pieces.append(text[cursor:start])
            pieces.append(match.replacements[0])
            cursor = start + length
        pieces.append(text[cursor:])
        return "".join(pieces)

    def _languagetool_pass(self, text: str) -> str:
        """LanguageTool pass for context-aware grammar + spelling corrections.

        Returns ``text`` unchanged when LanguageTool is unavailable or when
        the check raises; the failure is logged as a warning.
        """
        if self.tool is None:
            return text
        try:
            return self._apply_matches(text, self.tool.check(text))
        except Exception as e:
            logger.warning(f"LanguageTool check failed: {e}")
            return text

    def correct(self, text: str) -> str:
        """Run all three correction passes in sequence.

        Empty or whitespace-only input is returned as-is.
        """
        if not text or not text.strip():
            return text
        logger.debug(f"Spell correction input: {text[:100]}...")
        # Pass 0: Phonetic substitutions (dyslexia-specific)
        text = self._phonetic_pass(text)
        # Pass 1: Token-level spellcheck
        text = self._spellcheck_pass(text)
        # Pass 2: Context-aware grammar correction
        text = self._languagetool_pass(text)
        return text

    def close(self):
        """Clean up LanguageTool resources.

        Safe to call more than once. Errors raised while closing are logged
        at debug level and otherwise ignored (best-effort cleanup).
        """
        if self.tool is None:
            return
        try:
            self.tool.close()
        except Exception as e:
            logger.debug(f"Ignoring error while closing LanguageTool: {e}")
        finally:
            self.tool = None