Yaz Hobooti committed on
Commit
cdad8f0
·
1 Parent(s): def48ce

Fix spell checking issues: remove duplicates, auto-detect languages, handle hyphenated words, optimize allowlist

Browse files
Files changed (1) hide show
  1. pdf_comparator.py +41 -23
pdf_comparator.py CHANGED
@@ -67,7 +67,10 @@ else:
67
 
68
  if HAS_SPELLCHECK:
69
  _SPELL_EN = SpellChecker(language="en")
70
- _SPELL_FR = SpellChecker(language="fr")
 
 
 
71
  else:
72
  _SPELL_EN = None
73
  _SPELL_FR = None
@@ -76,10 +79,12 @@ _DOMAIN_ALLOWLIST = {
76
  "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
77
  "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid"
78
  }
 
79
 
80
- if _SPELL_EN and _SPELL_FR:
81
- _SPELL_EN.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)
82
- _SPELL_FR.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)
 
83
 
84
  def _normalize_text(s: str) -> str:
85
  s = unicodedata.normalize("NFC", s)
@@ -97,8 +102,15 @@ def _has_digits(tok: str) -> bool:
97
 
98
  def _is_known_word(tok: str) -> bool:
99
  t = tok.lower()
100
- if t in (w.lower() for w in _DOMAIN_ALLOWLIST) or _looks_like_acronym(tok) or _has_digits(tok):
101
  return True
 
 
 
 
 
 
 
102
  if _SPELL_EN and not _SPELL_EN.unknown([t]): # known in EN
103
  return True
104
  if _SPELL_FR and not _SPELL_FR.unknown([t]): # known in FR
@@ -281,38 +293,44 @@ def _looks_like_acronym(tok: str) -> bool:
281
  def _has_digits(tok: str) -> bool:
282
  return any(ch.isdigit() for ch in tok)
283
 
284
- def _is_known_word(tok: str) -> bool:
285
- t = tok.lower()
286
- if t in (w.lower() for w in _DOMAIN_ALLOWLIST) or _looks_like_acronym(tok) or _has_digits(tok):
287
- return True
288
- if not _SPELL_EN.unknown([t]): # known in EN
289
- return True
290
- if not _SPELL_FR.unknown([t]): # known in FR
291
- return True
292
- return False
293
-
294
  # (optional) keep a compatibility shim so any other code calling normalize_token() won't break
295
  def normalize_token(token: str) -> str:
296
  toks = _extract_tokens(token)
297
  return (toks[0].lower() if toks else "")
298
 
299
-
300
- def normalize_token(token: str) -> str:
301
- cleaned = re.sub(r"[^A-Za-z']", "", token)
302
- return cleaned.lower()
 
 
 
 
 
 
 
 
 
 
303
 
304
  def find_misspell_boxes(
305
  img: Image.Image,
306
  *,
307
  min_conf: int = 60,
308
- lang: str = "eng+fra",
309
  extra_allow: Optional[Iterable[str]] = None
310
- ) -> List["Box"]:
311
  if not (HAS_OCR and HAS_SPELLCHECK):
312
  return []
 
 
 
 
 
313
  try:
314
- if extra_allow and _SPELL_EN and _SPELL_FR:
315
  _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
 
316
  _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
317
 
318
  data = pytesseract.image_to_data(
@@ -325,7 +343,7 @@ def find_misspell_boxes(
325
  return []
326
 
327
  n = len(data.get("text", [])) or 0
328
- boxes: List["Box"] = []
329
 
330
  for i in range(n):
331
  raw = data["text"][i]
 
67
 
68
  if HAS_SPELLCHECK:
69
  _SPELL_EN = SpellChecker(language="en")
70
+ try:
71
+ _SPELL_FR = SpellChecker(language="fr")
72
+ except Exception:
73
+ _SPELL_FR = None
74
  else:
75
  _SPELL_EN = None
76
  _SPELL_FR = None
 
79
  "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
80
  "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid"
81
  }
82
+ _DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}
83
 
84
+ if _SPELL_EN:
85
+ _SPELL_EN.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
86
+ if _SPELL_FR:
87
+ _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
88
 
89
  def _normalize_text(s: str) -> str:
90
  s = unicodedata.normalize("NFC", s)
 
102
 
103
  def _is_known_word(tok: str) -> bool:
104
  t = tok.lower()
105
+ if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
106
  return True
107
+
108
+ # Check hyphenated words - only if every part is known, consider the whole word known
109
+ if '-' in tok:
110
+ parts = tok.split('-')
111
+ if all(_is_known_word(part) for part in parts):
112
+ return True
113
+
114
  if _SPELL_EN and not _SPELL_EN.unknown([t]): # known in EN
115
  return True
116
  if _SPELL_FR and not _SPELL_FR.unknown([t]): # known in FR
 
293
  def _has_digits(tok: str) -> bool:
294
  return any(ch.isdigit() for ch in tok)
295
 
 
 
 
 
 
 
 
 
 
 
296
  # (optional) keep a compatibility shim so any other code calling normalize_token() won't break
297
  def normalize_token(token: str) -> str:
298
  toks = _extract_tokens(token)
299
  return (toks[0].lower() if toks else "")
300
 
301
+ def _get_available_tesseract_langs():
302
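+ """Pick the Tesseract lang string: 'eng+fra' if both are installed, else 'eng' if installed, else the first installed language (falling back to 'eng')."""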
303
+ try:
304
+ langs = pytesseract.get_languages()
305
+ if 'eng' in langs and 'fra' in langs:
306
+ return "eng+fra"
307
+ elif 'eng' in langs:
308
+ return "eng"
309
+ elif langs:
310
+ return langs[0]
311
+ else:
312
+ return "eng"
313
+ except Exception:
314
+ return "eng"
315
 
316
  def find_misspell_boxes(
317
  img: Image.Image,
318
  *,
319
  min_conf: int = 60,
320
+ lang: Optional[str] = None,
321
  extra_allow: Optional[Iterable[str]] = None
322
+ ) -> List[Box]:
323
  if not (HAS_OCR and HAS_SPELLCHECK):
324
  return []
325
+
326
+ # Auto-detect language if not provided
327
+ if lang is None:
328
+ lang = _get_available_tesseract_langs()
329
+
330
  try:
331
+ if extra_allow and _SPELL_EN:
332
  _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
333
+ if extra_allow and _SPELL_FR:
334
  _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
335
 
336
  data = pytesseract.image_to_data(
 
343
  return []
344
 
345
  n = len(data.get("text", [])) or 0
346
+ boxes: List[Box] = []
347
 
348
  for i in range(n):
349
  raw = data["text"][i]