Yaz Hobooti committed on
Commit
507d05e
·
1 Parent(s): 9a98b3f

Improve spell checking to reduce false positives

Browse files

- Add _is_likely_word() function to filter out non-words
- Filter tokens that are mostly non-letter characters (<60% letters)
- Detect and filter keyboard patterns (qwerty, asdfgh, etc.)
- Filter excessive consonant clusters that look like random typing
- Improve word boundary recognition with \b regex anchors
- Better text normalization with whitespace handling
- Filter tokens during extraction to only process likely words
- Reduce false positives by not flagging non-words as misspellings
- Enhanced space and word boundary recognition

Files changed (1) hide show
  1. pdf_comparator.py +83 -4
pdf_comparator.py CHANGED
@@ -61,9 +61,11 @@ class Box:
61
 
62
  # ---- spell/tokenization helpers & caches ----
63
  if HAS_REGEX:
64
- _WORD_RE = re.compile(r"\p{Letter}+(?:['\-]\p{Letter}+)*", re.UNICODE)
 
65
  else:
66
- _WORD_RE = re.compile(r"[A-Za-z]+(?:['\-][A-Za-z]+)*")
 
67
 
68
  if HAS_SPELLCHECK:
69
  _SPELL_EN = SpellChecker(language="en")
@@ -87,21 +89,96 @@ if _SPELL_FR:
87
  _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
88
 
89
  def _normalize_text(s: str) -> str:
 
 
 
 
 
90
  s = unicodedata.normalize("NFC", s)
91
- return s.replace("'", "'").strip()
 
 
 
 
 
 
 
 
 
 
92
 
93
  def _extract_tokens(raw: str):
 
94
  s = _normalize_text(raw or "")
95
- return _WORD_RE.findall(s)
 
 
 
 
 
 
 
 
96
 
97
  def _looks_like_acronym(tok: str) -> bool:
 
98
  return tok.isupper() and 2 <= len(tok) <= 6
99
 
100
  def _has_digits(tok: str) -> bool:
 
101
  return any(ch.isdigit() for ch in tok)
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  def _is_known_word(tok: str) -> bool:
 
104
  t = tok.lower()
 
 
 
 
 
 
105
  if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
106
  return True
107
 
@@ -111,10 +188,12 @@ def _is_known_word(tok: str) -> bool:
111
  if all(_is_known_word(part) for part in parts):
112
  return True
113
 
 
114
  if _SPELL_EN and not _SPELL_EN.unknown([t]): # known in EN
115
  return True
116
  if _SPELL_FR and not _SPELL_FR.unknown([t]): # known in FR
117
  return True
 
118
  return False
119
 
120
  # (optional) keep a compatibility shim so any other code calling normalize_token() won't break
 
61
 
62
  # ---- spell/tokenization helpers & caches ----
63
  if HAS_REGEX:
64
+ # Improved regex: better word boundaries, handle apostrophes, hyphens, and spaces
65
+ _WORD_RE = re.compile(r"\b\p{Letter}+(?:['\-]\p{Letter}+)*\b", re.UNICODE)
66
  else:
67
+ # Fallback regex for basic ASCII
68
+ _WORD_RE = re.compile(r"\b[A-Za-z]+(?:['\-][A-Za-z]+)*\b")
69
 
70
  if HAS_SPELLCHECK:
71
  _SPELL_EN = SpellChecker(language="en")
 
89
  _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
90
 
91
  def _normalize_text(s: str) -> str:
92
+ """Normalize text for better word extraction"""
93
+ if not s:
94
+ return ""
95
+
96
+ # Unicode normalization
97
  s = unicodedata.normalize("NFC", s)
98
+
99
+ # Fix common apostrophe issues
100
+ s = s.replace("'", "'").replace("'", "'")
101
+
102
+ # Normalize whitespace - replace multiple spaces with single space
103
+ s = re.sub(r'\s+', ' ', s)
104
+
105
+ # Remove leading/trailing whitespace
106
+ s = s.strip()
107
+
108
+ return s
109
 
110
  def _extract_tokens(raw: str):
111
+ """Extract word tokens with improved filtering"""
112
  s = _normalize_text(raw or "")
113
+ tokens = _WORD_RE.findall(s)
114
+
115
+ # Filter out tokens that are too short or don't look like words
116
+ filtered_tokens = []
117
+ for token in tokens:
118
+ if len(token) >= 2 and _is_likely_word(token):
119
+ filtered_tokens.append(token)
120
+
121
+ return filtered_tokens
122
 
123
  def _looks_like_acronym(tok: str) -> bool:
124
+ """Check if token looks like a valid acronym"""
125
  return tok.isupper() and 2 <= len(tok) <= 6
126
 
127
  def _has_digits(tok: str) -> bool:
128
+ """Check if token contains digits"""
129
  return any(ch.isdigit() for ch in tok)
130
 
131
+ def _is_likely_word(tok: str) -> bool:
132
+ """Check if token looks like a real word (not random characters)"""
133
+ if len(tok) < 2:
134
+ return False
135
+
136
+ # Filter out tokens that are mostly non-letter characters
137
+ letter_count = sum(1 for c in tok if c.isalpha())
138
+ if letter_count < len(tok) * 0.6: # At least 60% letters
139
+ return False
140
+
141
+ # Filter out tokens with too many consecutive consonants/vowels
142
+ vowels = set('aeiouAEIOU')
143
+ consonants = set('bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ')
144
+
145
+ # Check for excessive consonant clusters (like "qwerty" or "zxcvb")
146
+ if len(tok) >= 4:
147
+ consonant_clusters = 0
148
+ vowel_clusters = 0
149
+ for i in range(len(tok) - 2):
150
+ if tok[i:i+3].lower() in consonants:
151
+ consonant_clusters += 1
152
+ if tok[i:i+3].lower() in vowels:
153
+ vowel_clusters += 1
154
+
155
+ # If more than half the possible clusters are consonant clusters, likely not a word
156
+ if consonant_clusters > len(tok) * 0.3:
157
+ return False
158
+
159
+ # Filter out tokens that look like random keyboard patterns
160
+ keyboard_patterns = [
161
+ 'qwerty', 'asdfgh', 'zxcvbn', 'qwertyuiop', 'asdfghjkl', 'zxcvbnm',
162
+ 'abcdef', 'bcdefg', 'cdefgh', 'defghi', 'efghij', 'fghijk',
163
+ '123456', '234567', '345678', '456789', '567890'
164
+ ]
165
+
166
+ tok_lower = tok.lower()
167
+ for pattern in keyboard_patterns:
168
+ if pattern in tok_lower or tok_lower in pattern:
169
+ return False
170
+
171
+ return True
172
+
173
  def _is_known_word(tok: str) -> bool:
174
+ """Check if token is a known word with improved filtering"""
175
  t = tok.lower()
176
+
177
+ # First check if it looks like a real word
178
+ if not _is_likely_word(tok):
179
+ return True # Don't flag non-words as misspellings
180
+
181
+ # Check domain allowlist, acronyms, and words with digits
182
  if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
183
  return True
184
 
 
188
  if all(_is_known_word(part) for part in parts):
189
  return True
190
 
191
+ # Check against spell checkers
192
  if _SPELL_EN and not _SPELL_EN.unknown([t]): # known in EN
193
  return True
194
  if _SPELL_FR and not _SPELL_FR.unknown([t]): # known in FR
195
  return True
196
+
197
  return False
198
 
199
  # (optional) keep a compatibility shim so any other code calling normalize_token() won't break