Improve spell checking to reduce false positives

- Add _is_likely_word() function to filter out non-words
- Filter tokens that are mostly non-letter characters (<60% letters)
- Detect and filter keyboard-row and sequential patterns (qwerty, asdfgh, abcdef, 123456, etc.)
- Filter excessive consonant clusters that look like random typing
- Improve word boundary recognition with \b regex anchors
- Better text normalization with whitespace handling
- Filter tokens during extraction to only process likely words
- Reduce false positives by not flagging non-words as misspellings
- Enhanced space and word boundary recognition

Files changed (1) hide show

pdf_comparator.py +83 -4

pdf_comparator.py CHANGED Viewed

@@ -61,9 +61,11 @@ class Box:
 # ---- spell/tokenization helpers & caches ----
 if HAS_REGEX:
-    _WORD_RE = re.compile(r"\p{Letter}+(?:['\-]\p{Letter}+)*", re.UNICODE)
 else:
-    _WORD_RE = re.compile(r"[A-Za-z]+(?:['\-][A-Za-z]+)*")
 if HAS_SPELLCHECK:
     _SPELL_EN = SpellChecker(language="en")
@@ -87,21 +89,96 @@ if _SPELL_FR:
     _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
 def _normalize_text(s: str) -> str:
     s = unicodedata.normalize("NFC", s)
-    return s.replace("'", "'").strip()
 def _extract_tokens(raw: str):
     s = _normalize_text(raw or "")
-    return _WORD_RE.findall(s)
 def _looks_like_acronym(tok: str) -> bool:
     return tok.isupper() and 2 <= len(tok) <= 6
 def _has_digits(tok: str) -> bool:
     return any(ch.isdigit() for ch in tok)
 def _is_known_word(tok: str) -> bool:
     t = tok.lower()
     if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
         return True
@@ -111,10 +188,12 @@ def _is_known_word(tok: str) -> bool:
         if all(_is_known_word(part) for part in parts):
             return True
     if _SPELL_EN and not _SPELL_EN.unknown([t]):  # known in EN
         return True
     if _SPELL_FR and not _SPELL_FR.unknown([t]):  # known in FR
         return True
     return False
 # (optional) keep a compatibility shim so any other code calling normalize_token() won't break

 # ---- spell/tokenization helpers & caches ----
 if HAS_REGEX:
+    # Improved regex: better word boundaries, handle apostrophes, hyphens, and spaces
+    _WORD_RE = re.compile(r"\b\p{Letter}+(?:['\-]\p{Letter}+)*\b", re.UNICODE)
 else:
+    # Fallback regex for basic ASCII
+    _WORD_RE = re.compile(r"\b[A-Za-z]+(?:['\-][A-Za-z]+)*\b")
 if HAS_SPELLCHECK:
     _SPELL_EN = SpellChecker(language="en")
     _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
 def _normalize_text(s: str) -> str:
+    """Normalize text for better word extraction"""
+    if not s:
+        return ""
+    # Unicode normalization
     s = unicodedata.normalize("NFC", s)
+    # Fix common apostrophe issues
+    s = s.replace("'", "'").replace("'", "'")
+    # Normalize whitespace - replace multiple spaces with single space
+    s = re.sub(r'\s+', ' ', s)
+    # Remove leading/trailing whitespace
+    s = s.strip()
+    return s
 def _extract_tokens(raw: str):
     """Return the word tokens of *raw* that pass the likely-word filter.

     The input (``None`` is treated as "") is normalized, matched against
     ``_WORD_RE``, and only matches of at least two characters that
     ``_is_likely_word`` accepts are kept, in their original order.
     """
     normalized = _normalize_text(raw or "")
     return [
         candidate
         for candidate in _WORD_RE.findall(normalized)
         if len(candidate) >= 2 and _is_likely_word(candidate)
     ]
 def _looks_like_acronym(tok: str) -> bool:
+    """Check if token looks like a valid acronym"""
     return tok.isupper() and 2 <= len(tok) <= 6
 def _has_digits(tok: str) -> bool:
+    """Check if token contains digits"""
     return any(ch.isdigit() for ch in tok)
+def _is_likely_word(tok: str) -> bool:
+    """Check if token looks like a real word (not random characters)"""
+    if len(tok) < 2:
+        return False
+    # Filter out tokens that are mostly non-letter characters
+    letter_count = sum(1 for c in tok if c.isalpha())
+    if letter_count < len(tok) * 0.6:  # At least 60% letters
+        return False
+    # Filter out tokens with too many consecutive consonants/vowels
+    vowels = set('aeiouAEIOU')
+    consonants = set('bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ')
+    # Check for excessive consonant clusters (like "qwerty" or "zxcvb")
+    if len(tok) >= 4:
+        consonant_clusters = 0
+        vowel_clusters = 0
+        for i in range(len(tok) - 2):
+            if tok[i:i+3].lower() in consonants:
+                consonant_clusters += 1
+            if tok[i:i+3].lower() in vowels:
+                vowel_clusters += 1
+        # If more than half the possible clusters are consonant clusters, likely not a word
+        if consonant_clusters > len(tok) * 0.3:
+            return False
+    # Filter out tokens that look like random keyboard patterns
+    keyboard_patterns = [
+        'qwerty', 'asdfgh', 'zxcvbn', 'qwertyuiop', 'asdfghjkl', 'zxcvbnm',
+        'abcdef', 'bcdefg', 'cdefgh', 'defghi', 'efghij', 'fghijk',
+        '123456', '234567', '345678', '456789', '567890'
+    ]
+    tok_lower = tok.lower()
+    for pattern in keyboard_patterns:
+        if pattern in tok_lower or tok_lower in pattern:
+            return False
+    return True
 def _is_known_word(tok: str) -> bool:
+    """Check if token is a known word with improved filtering"""
     t = tok.lower()
+    # First check if it looks like a real word
+    if not _is_likely_word(tok):
+        return True  # Don't flag non-words as misspellings
+    # Check domain allowlist, acronyms, and words with digits
     if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
         return True
         if all(_is_known_word(part) for part in parts):
             return True
+    # Check against spell checkers
     if _SPELL_EN and not _SPELL_EN.unknown([t]):  # known in EN
         return True
     if _SPELL_FR and not _SPELL_FR.unknown([t]):  # known in FR
         return True
     return False
 # (optional) keep a compatibility shim so any other code calling normalize_token() won't break