Yaz Hobooti
committed on
Commit
·
c0f0c6d
1
Parent(s):
0fa89b4
Improve spell checking with comprehensive word lists and better French support
Browse files
- Expand domain allowlist with 100+ common technical, business, and geographic terms
- Add comprehensive technical terms (CMYK, RGB, DPI, Pantone, etc.)
- Include Canadian provinces, French words, and business terminology
- Add common abbreviations (Inc, Ltd, LLC, Corp, etc.)
- Include British spellings (colour, favour, honour, etc.)
- Improve French spell checker initialization with fallback handling
- Add comprehensive word pattern recognition for suffixes/prefixes
- Check for plural forms and common word variations
- Add error handling for spell checker operations
- Reduce false positives by recognizing more valid words
- pdf_comparator.py +88 -8
pdf_comparator.py
CHANGED
|
@@ -68,18 +68,62 @@ else:
|
|
| 68 |
_WORD_RE = re.compile(r"\b[A-Za-z]+(?:['\-][A-Za-z]+)*\b")
|
| 69 |
|
| 70 |
if HAS_SPELLCHECK:
|
|
|
|
| 71 |
_SPELL_EN = SpellChecker(language="en")
|
|
|
|
|
|
|
|
|
|
| 72 |
try:
|
| 73 |
_SPELL_FR = SpellChecker(language="fr")
|
| 74 |
except Exception:
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
else:
|
| 77 |
_SPELL_EN = None
|
| 78 |
_SPELL_FR = None
|
| 79 |
|
| 80 |
_DOMAIN_ALLOWLIST = {
|
|
|
|
| 81 |
"Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
|
| 82 |
-
"SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
}
|
| 84 |
_DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}
|
| 85 |
|
|
@@ -171,7 +215,7 @@ def _is_likely_word(tok: str) -> bool:
|
|
| 171 |
return True
|
| 172 |
|
| 173 |
def _is_known_word(tok: str) -> bool:
|
| 174 |
-
"""Check if token is a known word with
|
| 175 |
t = tok.lower()
|
| 176 |
|
| 177 |
# First check if it looks like a real word
|
|
@@ -188,11 +232,47 @@ def _is_known_word(tok: str) -> bool:
|
|
| 188 |
if all(_is_known_word(part) for part in parts):
|
| 189 |
return True
|
| 190 |
|
| 191 |
-
# Check against spell
|
| 192 |
-
if _SPELL_EN
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
return False
|
| 198 |
|
|
|
|
| 68 |
_WORD_RE = re.compile(r"\b[A-Za-z]+(?:['\-][A-Za-z]+)*\b")
|
| 69 |
|
| 70 |
if HAS_SPELLCHECK:
|
| 71 |
+
# Initialize English spell checker with comprehensive dictionary
|
| 72 |
_SPELL_EN = SpellChecker(language="en")
|
| 73 |
+
|
| 74 |
+
# Try to initialize French spell checker with fallback
|
| 75 |
+
_SPELL_FR = None
|
| 76 |
try:
|
| 77 |
_SPELL_FR = SpellChecker(language="fr")
|
| 78 |
except Exception:
|
| 79 |
+
# If French dictionary fails, try alternative approach
|
| 80 |
+
try:
|
| 81 |
+
_SPELL_FR = SpellChecker()
|
| 82 |
+
# Fallback uses the default (English) dictionary; basic French words must still be loaded manually if needed
|
| 83 |
+
except Exception:
|
| 84 |
+
_SPELL_FR = None
|
| 85 |
+
print("Warning: French spell checker not available")
|
| 86 |
else:
|
| 87 |
_SPELL_EN = None
|
| 88 |
_SPELL_FR = None
|
| 89 |
|
| 90 |
_DOMAIN_ALLOWLIST = {
|
| 91 |
+
# Company/Brand names
|
| 92 |
"Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
|
| 93 |
+
"SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid",
|
| 94 |
+
|
| 95 |
+
# Technical terms
|
| 96 |
+
"CMYK", "RGB", "DPI", "PPI", "TIFF", "JPEG", "PNG", "GIF", "BMP",
|
| 97 |
+
"Pantone", "Spot", "Process", "Offset", "Lithography", "Gravure",
|
| 98 |
+
"Flexography", "Digital", "Print", "Press", "Ink", "Paper", "Stock",
|
| 99 |
+
|
| 100 |
+
# Common abbreviations
|
| 101 |
+
"Inc", "Ltd", "LLC", "Corp", "Co", "Ave", "St", "Rd", "Blvd",
|
| 102 |
+
"USA", "US", "CA", "ON", "QC", "BC", "AB", "MB", "SK", "NS", "NB", "NL", "PE", "YT", "NT", "NU",
|
| 103 |
+
|
| 104 |
+
# Canadian place names and French words (common in Canadian context)
|
| 105 |
+
"Québec", "Montréal", "Toronto", "Vancouver", "Ottawa", "Calgary",
|
| 106 |
+
"français", "française", "anglais", "anglaise", "bilingue",
|
| 107 |
+
|
| 108 |
+
# Common business terms
|
| 109 |
+
"Marketing", "Sales", "Customer", "Service", "Quality", "Control",
|
| 110 |
+
"Management", "Administration", "Production", "Manufacturing",
|
| 111 |
+
"Distribution", "Logistics", "Supply", "Chain", "Inventory",
|
| 112 |
+
|
| 113 |
+
# Common words that might be flagged
|
| 114 |
+
"Email", "Website", "Online", "Internet", "Software", "Hardware",
|
| 115 |
+
"Database", "System", "Network", "Server", "Client", "User",
|
| 116 |
+
"Password", "Login", "Logout", "Account", "Profile", "Settings",
|
| 117 |
+
"Configuration", "Installation", "Maintenance", "Support",
|
| 118 |
+
|
| 119 |
+
# Numbers and measurements
|
| 120 |
+
"mm", "cm", "m", "kg", "g", "ml", "l", "oz", "lb", "ft", "in",
|
| 121 |
+
"x", "by", "times", "multiply", "divide", "plus", "minus",
|
| 122 |
+
|
| 123 |
+
# British/Canadian spellings that may be flagged as misspellings but are correct in context
|
| 124 |
+
"colour", "colour", "favour", "favour", "honour", "honour",
|
| 125 |
+
"behaviour", "behaviour", "neighbour", "neighbour", "centre", "centre",
|
| 126 |
+
"theatre", "theatre", "metre", "metre", "litre", "litre"
|
| 127 |
}
|
| 128 |
_DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}
|
| 129 |
|
|
|
|
| 215 |
return True
|
| 216 |
|
| 217 |
def _is_known_word(tok: str) -> bool:
|
| 218 |
+
"""Check if token is a known word with comprehensive filtering"""
|
| 219 |
t = tok.lower()
|
| 220 |
|
| 221 |
# First check if it looks like a real word
|
|
|
|
| 232 |
if all(_is_known_word(part) for part in parts):
|
| 233 |
return True
|
| 234 |
|
| 235 |
+
# Check against English spell checker
|
| 236 |
+
if _SPELL_EN:
|
| 237 |
+
try:
|
| 238 |
+
# Check if word is known in English dictionary
|
| 239 |
+
if not _SPELL_EN.unknown([t]):
|
| 240 |
+
return True
|
| 241 |
+
except Exception:
|
| 242 |
+
pass
|
| 243 |
+
|
| 244 |
+
# Check against French spell checker
|
| 245 |
+
if _SPELL_FR:
|
| 246 |
+
try:
|
| 247 |
+
# Check if word is known in French dictionary
|
| 248 |
+
if not _SPELL_FR.unknown([t]):
|
| 249 |
+
return True
|
| 250 |
+
except Exception:
|
| 251 |
+
pass
|
| 252 |
+
|
| 253 |
+
# Additional checks for common patterns
|
| 254 |
+
# Check for common suffixes/prefixes that might not be in dictionaries
|
| 255 |
+
common_suffixes = ['ing', 'ed', 'er', 'est', 'ly', 'tion', 'sion', 'ness', 'ment', 'able', 'ible']
|
| 256 |
+
common_prefixes = ['un', 're', 'pre', 'dis', 'mis', 'over', 'under', 'out', 'up', 'down']
|
| 257 |
+
|
| 258 |
+
# Check if word with common suffix/prefix is known
|
| 259 |
+
for suffix in common_suffixes:
|
| 260 |
+
if t.endswith(suffix) and len(t) > len(suffix) + 2:
|
| 261 |
+
base_word = t[:-len(suffix)]
|
| 262 |
+
if _SPELL_EN and not _SPELL_EN.unknown([base_word]):
|
| 263 |
+
return True
|
| 264 |
+
|
| 265 |
+
for prefix in common_prefixes:
|
| 266 |
+
if t.startswith(prefix) and len(t) > len(prefix) + 2:
|
| 267 |
+
base_word = t[len(prefix):]
|
| 268 |
+
if _SPELL_EN and not _SPELL_EN.unknown([base_word]):
|
| 269 |
+
return True
|
| 270 |
+
|
| 271 |
+
# Check for plural forms (simple 's' ending)
|
| 272 |
+
if t.endswith('s') and len(t) > 3:
|
| 273 |
+
singular = t[:-1]
|
| 274 |
+
if _SPELL_EN and not _SPELL_EN.unknown([singular]):
|
| 275 |
+
return True
|
| 276 |
|
| 277 |
return False
|
| 278 |
|