Yaz Hobooti committed on
Commit
07087d8
·
1 Parent(s): 02b1336

Implement improved spell checking system with regex, domain allowlist, and confidence filtering

Browse files
Files changed (2) hide show
  1. pdf_comparator.py +168 -16
  2. requirements.txt +2 -2
pdf_comparator.py CHANGED
@@ -6,8 +6,9 @@ Upload two PDF files and get comprehensive analysis including differences, OCR,
6
 
7
  import os, sys, re, csv, json, io
8
  from dataclasses import dataclass
9
- from typing import List, Tuple, Optional
10
  import tempfile
 
11
 
12
  import numpy as np
13
  from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
@@ -39,6 +40,13 @@ except Exception:
39
  SpellChecker = None
40
  HAS_SPELLCHECK = False
41
 
 
 
 
 
 
 
 
42
  try:
43
  from pyzbar.pyzbar import decode as zbar_decode
44
  HAS_BARCODE = True
@@ -51,6 +59,57 @@ except Exception:
51
  class Box:
52
  y1: int; x1: int; y2: int; x2: int; area: int
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  # -------------------- Helpers ----------------------
55
  def _is_pdf(path: str) -> bool:
56
  return os.path.splitext(path.lower())[1] == ".pdf"
@@ -150,38 +209,131 @@ def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
150
  return Image.fromarray(A)
151
 
152
  # -------------------- OCR + Spellcheck -------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  def normalize_token(token: str) -> str:
154
  cleaned = re.sub(r"[^A-Za-z']", "", token)
155
  return cleaned.lower()
156
 
157
- def find_misspell_boxes(img: Image.Image) -> List[Box]:
 
 
 
 
 
 
158
  if not (HAS_OCR and HAS_SPELLCHECK):
159
  return []
160
  try:
161
- spell = SpellChecker()
162
- data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
 
 
 
 
 
 
 
 
163
  except Exception:
164
  return []
165
- n = len(data.get("text", []))
166
- boxes: List[Box] = []
 
 
167
  for i in range(n):
168
- text = data["text"][i]
169
- if not text:
170
  continue
171
- token = normalize_token(text)
172
- if len(token) < 2:
 
 
 
 
 
 
 
 
 
 
173
  continue
174
- if token in spell:
 
 
175
  continue
176
- left = data.get("left", [0])[i]
177
- top = data.get("top", [0])[i]
178
- width = data.get("width", [0])[i]
179
- height= data.get("height",[0])[i]
 
180
  if width <= 0 or height <= 0:
181
  continue
182
- boxes.append(Box(top, left, top+height, left+width, width*height))
 
 
 
183
  return boxes
184
 
 
185
  # -------------------- Barcode / QR -----------------
186
  def ean_like_checksum_ok(digits: str) -> bool:
187
  if not digits.isdigit():
 
6
 
7
  import os, sys, re, csv, json, io
8
  from dataclasses import dataclass
9
+ from typing import List, Tuple, Optional, Iterable
10
  import tempfile
11
+ import unicodedata
12
 
13
  import numpy as np
14
  from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
 
40
  SpellChecker = None
41
  HAS_SPELLCHECK = False
42
 
43
+ try:
44
+ import regex as re
45
+ HAS_REGEX = True
46
+ except Exception:
47
+ import re
48
+ HAS_REGEX = False
49
+
50
  try:
51
  from pyzbar.pyzbar import decode as zbar_decode
52
  HAS_BARCODE = True
 
59
  class Box:
60
  y1: int; x1: int; y2: int; x2: int; area: int
61
 
62
# ---- spell/tokenization helpers & caches ----
# A "word" is a run of letters, optionally continued by apostrophe- or
# hyphen-joined runs of letters (e.g. "don't", "well-known").
if HAS_REGEX:
    # The third-party `regex` module understands Unicode property classes.
    _WORD_RE = re.compile(r"\p{Letter}+(?:['\-]\p{Letter}+)*", re.UNICODE)
else:
    # Stdlib `re` has no \p{...}. [^\W\d_] is "word character that is neither
    # a digit nor an underscore", which is Unicode-aware for str patterns, so
    # accented words ("café") are no longer cut at the first non-ASCII letter.
    _WORD_RE = re.compile(r"[^\W\d_]+(?:['\-][^\W\d_]+)*")

# Build one English and one French checker at import time so every call
# shares them; both are None when pyspellchecker is unavailable.
if HAS_SPELLCHECK:
    _SPELL_EN = SpellChecker(language="en")
    _SPELL_FR = SpellChecker(language="fr")
else:
    _SPELL_EN = None
    _SPELL_FR = None

# Project/brand vocabulary that must never be reported as misspelled.
# Compared case-insensitively by _is_known_word().
_DOMAIN_ALLOWLIST = {
    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid"
}

# Teach both dictionaries the allowlisted words (lowercased, because tokens
# are lowercased before lookup).
if _SPELL_EN is not None and _SPELL_FR is not None:
    _SPELL_EN.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)
    _SPELL_FR.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)
83
+
84
+ def _normalize_text(s: str) -> str:
85
+ s = unicodedata.normalize("NFC", s)
86
+ return s.replace("'", "'").strip()
87
+
88
def _extract_tokens(raw: str):
    """Return all ``_WORD_RE`` matches found in *raw*, in order of appearance.

    A falsy *raw* (``None`` or ``""``) is treated as the empty string, so the
    result is then whatever the pattern finds in ``""``.
    """
    text = raw if raw else ""
    return _WORD_RE.findall(_normalize_text(text))
91
+
92
+ def _looks_like_acronym(tok: str) -> bool:
93
+ return tok.isupper() and 2 <= len(tok) <= 6
94
+
95
+ def _has_digits(tok: str) -> bool:
96
+ return any(ch.isdigit() for ch in tok)
97
+
98
def _is_known_word(tok: str) -> bool:
    """Return True when *tok* should NOT be reported as a misspelling.

    A token is accepted when any of the following holds:

    * it looks like an acronym or contains a digit;
    * its lowercase form matches an entry of ``_DOMAIN_ALLOWLIST``
      (compared case-insensitively);
    * the English or the French spell checker does not report its lowercase
      form as unknown;
    * it is a compound joined by hyphens/apostrophes and every part of two
      or more characters passes the allowlist/dictionary checks by itself.

    When no spell checker is available, only the allowlist, acronym and digit
    rules can accept a token.
    """
    if _looks_like_acronym(tok) or _has_digits(tok):
        return True

    allow = {w.lower() for w in _DOMAIN_ALLOWLIST}
    checkers = [sc for sc in (_SPELL_EN, _SPELL_FR) if sc is not None]

    def known(word: str) -> bool:
        # unknown() returns the subset of the given words that the checker
        # does not recognise; an empty result means the word is accepted.
        if word in allow:
            return True
        return any(not sc.unknown([word]) for sc in checkers)

    lowered = tok.lower()
    if known(lowered):
        return True

    # The tokenizer keeps compounds such as "well-known" or "l'eau" as ONE
    # token, and a word-list lookup of the whole compound may fail even though
    # each part is a valid word. Fall back to checking the parts. One-letter
    # parts (French elisions like "l'") are ignored, mirroring the caller's
    # "len(tok) < 2" rule.
    parts = [
        p
        for p in lowered.replace("\u2019", "'").replace("-", "'").split("'")
        if p
    ]
    return len(parts) > 1 and all(len(p) < 2 or known(p) for p in parts)
107
+
108
# Compatibility shim: callers of the older normalize_token() API keep working
# on top of the new tokenizer.
def normalize_token(token: str) -> str:
    """Return the first word found in *token*, lowercased, or "" if none."""
    for first in _extract_tokens(token):
        return first.lower()
    return ""
112
+
113
  # -------------------- Helpers ----------------------
114
  def _is_pdf(path: str) -> bool:
115
  return os.path.splitext(path.lower())[1] == ".pdf"
 
209
  return Image.fromarray(A)
210
 
211
  # -------------------- OCR + Spellcheck -------------
212
from typing import List, Iterable, Optional
from PIL import Image
import unicodedata

# The three packages below are optional. Importing them unconditionally here
# would crash the whole module when one is missing and would defeat the
# guarded imports used elsewhere in this file, so each import is guarded and
# the matching HAS_* flag is forced off on failure.
try:
    import regex as re
    # `regex` understands Unicode property classes such as \p{Letter}.
    _WORD_PATTERN = r"\p{Letter}+(?:[\u2019'\-]\p{Letter}+)*"
except Exception:
    import re
    # Stdlib fallback: [^\W\d_] is "word character that is neither a digit nor
    # an underscore", which is Unicode-aware for str patterns.
    _WORD_PATTERN = r"[^\W\d_]+(?:[\u2019'\-][^\W\d_]+)*"

try:
    import pytesseract
except Exception:
    pytesseract = None
    HAS_OCR = False
else:
    # Keep a value defined earlier in the file; default to True otherwise.
    HAS_OCR = globals().get("HAS_OCR", True)

try:
    from spellchecker import SpellChecker
except Exception:
    SpellChecker = None
    HAS_SPELLCHECK = False
else:
    # Keep a value defined earlier in the file; default to True otherwise.
    HAS_SPELLCHECK = globals().get("HAS_SPELLCHECK", True)

# ---- spell/tokenization helpers & caches ----
# A "word" is a run of letters, optionally continued by apostrophe- or
# hyphen-joined runs of letters.
_WORD_RE = re.compile(_WORD_PATTERN, re.UNICODE)

# Project/brand vocabulary that must never be reported as misspelled.
_DOMAIN_ALLOWLIST = {
    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid"
}

# One shared English and one shared French checker, each taught the
# allowlisted words in lowercase. Both are None without pyspellchecker.
if SpellChecker is not None:
    _SPELL_EN = SpellChecker(language="en")
    _SPELL_FR = SpellChecker(language="fr")
    _SPELL_EN.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)
    _SPELL_FR.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)
else:
    _SPELL_EN = None
    _SPELL_FR = None
241
+
242
+ def _normalize_text(s: str) -> str:
243
+ s = unicodedata.normalize("NFC", s)
244
+ return s.replace("’", "'").strip()
245
+
246
def _extract_tokens(raw: str):
    """Return every ``_WORD_RE`` match in *raw* after text normalization.

    ``None`` and other falsy inputs are handled as the empty string.
    """
    if not raw:
        raw = ""
    cleaned = _normalize_text(raw)
    return _WORD_RE.findall(cleaned)
249
+
250
+ def _looks_like_acronym(tok: str) -> bool:
251
+ return tok.isupper() and 2 <= len(tok) <= 6
252
+
253
+ def _has_digits(tok: str) -> bool:
254
+ return any(ch.isdigit() for ch in tok)
255
+
256
def _is_known_word(tok: str) -> bool:
    """Return True when *tok* should NOT be reported as a misspelling.

    A token is accepted when any of the following holds:

    * it looks like an acronym or contains a digit;
    * its lowercase form matches an entry of ``_DOMAIN_ALLOWLIST``
      (compared case-insensitively);
    * the English or the French spell checker does not report its lowercase
      form as unknown;
    * it is a compound joined by hyphens/apostrophes and every part of two
      or more characters passes the allowlist/dictionary checks by itself.

    A checker that is ``None`` is skipped instead of being dereferenced.
    """
    if _looks_like_acronym(tok) or _has_digits(tok):
        return True

    allow = {w.lower() for w in _DOMAIN_ALLOWLIST}
    checkers = [sc for sc in (_SPELL_EN, _SPELL_FR) if sc is not None]

    def known(word: str) -> bool:
        # unknown() returns the subset of the given words that the checker
        # does not recognise; an empty result means the word is accepted.
        if word in allow:
            return True
        return any(not sc.unknown([word]) for sc in checkers)

    lowered = tok.lower()
    if known(lowered):
        return True

    # The tokenizer keeps compounds such as "well-known" or "l'eau" as ONE
    # token, and a word-list lookup of the whole compound may fail even though
    # each part is a valid word. Fall back to checking the parts. One-letter
    # parts (French elisions like "l'") are ignored, mirroring the caller's
    # "len(tok) < 2" rule.
    parts = [
        p
        for p in lowered.replace("\u2019", "'").replace("-", "'").split("'")
        if p
    ]
    return len(parts) > 1 and all(len(p) < 2 or known(p) for p in parts)
265
+
266
# Kept for callers that still use the pre-tokenizer API. There is exactly one
# definition: a second `def normalize_token` further down would silently
# replace this one at import time.
def normalize_token(token: str) -> str:
    """Return *token* reduced to its letters and apostrophes, lowercased.

    Every character that is neither a letter nor the ASCII apostrophe is
    dropped (punctuation, digits, hyphens, whitespace). Letters are recognised
    with ``str.isalpha`` after NFC normalization, so accented letters are kept
    rather than stripped as an ASCII-only filter would do. For pure-ASCII
    input the result matches the ``[^A-Za-z']`` filter it replaces.
    """
    import unicodedata  # local import keeps this block self-contained

    composed = unicodedata.normalize("NFC", token)
    kept = "".join(ch for ch in composed if ch.isalpha() or ch == "'")
    return kept.lower()
275
 
276
def _ocr_confidence(value) -> int:
    """Parse one OCR confidence cell into an int; -1 when it cannot be parsed."""
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return -1


def find_misspell_boxes(
    img: Image.Image,
    *,
    min_conf: int = 60,
    lang: str = "eng+fra",
    extra_allow: Optional[Iterable[str]] = None
) -> List["Box"]:
    """Return a box for every OCR text cell of *img* that looks misspelled.

    Args:
        img: Page image handed to ``pytesseract.image_to_data``.
        min_conf: Cells whose OCR confidence is below this value are ignored.
        lang: Language string passed through to ``image_to_data``.
        extra_allow: Additional words to accept for THIS call only, compared
            case-insensitively. They are not written into the shared spell
            checkers, and a one-shot iterator is consumed exactly once.

    Returns:
        ``Box(top, left, top + height, left + width, width * height)`` for
        each cell with at least one token of two or more characters that is
        neither in *extra_allow* nor accepted by ``_is_known_word``. Returns
        ``[]`` when OCR or spell checking is unavailable, or when the OCR call
        fails.
    """
    if not (HAS_OCR and HAS_SPELLCHECK):
        return []
    try:
        allowed = {w.lower() for w in extra_allow} if extra_allow else set()
        data = pytesseract.image_to_data(
            img,
            lang=lang,
            output_type=pytesseract.Output.DICT,
            # config="--psm 6"  # for pages that are simple blocks of text
        )
    except Exception:
        # Deliberate best-effort: any OCR failure yields "no findings" rather
        # than aborting the caller.
        return []

    texts = data.get("text") or []
    n = len(texts)

    def column(key: str, default):
        # A missing or short column falls back to per-row defaults instead of
        # raising IndexError from a one-element default list.
        values = data.get(key)
        if values is None or len(values) < n:
            return [default] * n
        return values

    rows = zip(
        texts,
        column("conf", "-1"),
        column("left", 0),
        column("top", 0),
        column("width", 0),
        column("height", 0),
    )

    boxes: List["Box"] = []
    for raw, conf_raw, left, top, width, height in rows:
        if not raw:
            continue

        # confidence filter
        if _ocr_confidence(conf_raw) < min_conf:
            continue

        tokens = _extract_tokens(raw)
        if not tokens:
            continue

        # flag the box if ANY token in it looks misspelled
        if all(
            len(tok) < 2 or tok.lower() in allowed or _is_known_word(tok)
            for tok in tokens
        ):
            continue

        if width <= 0 or height <= 0:
            continue

        # Box fields are (y1, x1, y2, x2, area).
        boxes.append(Box(top, left, top + height, left + width, width * height))

    return boxes
335
 
336
+
337
  # -------------------- Barcode / QR -----------------
338
  def ean_like_checksum_ok(digits: str) -> bool:
339
  if not digits.isdigit():
requirements.txt CHANGED
@@ -6,13 +6,13 @@ Pillow==10.0.1
6
  opencv-python==4.8.1.78
7
  pytesseract==0.3.10
8
  pyzbar==0.1.9
9
- pyspellchecker==0.7.2
10
  nltk==3.8.1
11
  numpy==1.24.3
12
  scikit-image==0.21.0
13
  matplotlib==3.7.2
14
  pandas==2.0.3
15
  reportlab==4.0.4
16
- regex==2023.10.3
17
  gradio==4.44.1
18
  PyMuPDF==1.23.8
 
6
  opencv-python==4.8.1.78
7
  pytesseract==0.3.10
8
  pyzbar==0.1.9
9
+ pyspellchecker==0.8.3
10
  nltk==3.8.1
11
  numpy==1.24.3
12
  scikit-image==0.21.0
13
  matplotlib==3.7.2
14
  pandas==2.0.3
15
  reportlab==4.0.4
16
+ regex==2025.9.1
17
  gradio==4.44.1
18
  PyMuPDF==1.23.8