Implement improved spell checking system with regex, domain allowlist, and confidence filtering

Files changed (2) hide show

pdf_comparator.py +168 -16
requirements.txt +2 -2

pdf_comparator.py CHANGED Viewed

@@ -6,8 +6,9 @@ Upload two PDF files and get comprehensive analysis including differences, OCR,
 import os, sys, re, csv, json, io
 from dataclasses import dataclass
-from typing import List, Tuple, Optional
 import tempfile
 import numpy as np
 from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
@@ -39,6 +40,13 @@ except Exception:
     SpellChecker = None
     HAS_SPELLCHECK = False
 try:
     from pyzbar.pyzbar import decode as zbar_decode
     HAS_BARCODE = True
@@ -51,6 +59,57 @@ except Exception:
 class Box:
     y1: int; x1: int; y2: int; x2: int; area: int
 # -------------------- Helpers ----------------------
 def _is_pdf(path: str) -> bool:
     return os.path.splitext(path.lower())[1] == ".pdf"
@@ -150,38 +209,131 @@ def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
     return Image.fromarray(A)
 # -------------------- OCR + Spellcheck -------------
 def normalize_token(token: str) -> str:
     cleaned = re.sub(r"[^A-Za-z']", "", token)
     return cleaned.lower()
-def find_misspell_boxes(img: Image.Image) -> List[Box]:
     if not (HAS_OCR and HAS_SPELLCHECK):
         return []
     try:
-        spell = SpellChecker()
-        data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
     except Exception:
         return []
-    n = len(data.get("text", []))
-    boxes: List[Box] = []
     for i in range(n):
-        text = data["text"][i]
-        if not text:
             continue
-        token = normalize_token(text)
-        if len(token) < 2:
             continue
-        if token in spell:
             continue
-        left  = data.get("left",  [0])[i]
-        top   = data.get("top",   [0])[i]
-        width = data.get("width", [0])[i]
-        height= data.get("height",[0])[i]
         if width <= 0 or height <= 0:
             continue
-        boxes.append(Box(top, left, top+height, left+width, width*height))
     return boxes
 # -------------------- Barcode / QR -----------------
 def ean_like_checksum_ok(digits: str) -> bool:
     if not digits.isdigit():

 import os, sys, re, csv, json, io
 from dataclasses import dataclass
+from typing import List, Tuple, Optional, Iterable
 import tempfile
+import unicodedata
 import numpy as np
 from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
     SpellChecker = None
     HAS_SPELLCHECK = False
+try:
+    import regex as re
+    HAS_REGEX = True
+except Exception:
+    import re
+    HAS_REGEX = False
 try:
     from pyzbar.pyzbar import decode as zbar_decode
     HAS_BARCODE = True
 class Box:
     y1: int; x1: int; y2: int; x2: int; area: int
+# ---- spell/tokenization helpers & caches ----
+# Word pattern: one or more letters, optionally continued by apostrophe- or
+# hyphen-joined letter runs (e.g. "don't", "well-known").  The \p{Letter}
+# property class is only understood by the third-party `regex` module, so an
+# ASCII-only pattern is used when that module is unavailable.
+if HAS_REGEX:
+    _WORD_RE = re.compile(r"\p{Letter}+(?:['\-]\p{Letter}+)*", re.UNICODE)
+else:
+    _WORD_RE = re.compile(r"[A-Za-z]+(?:['\-][A-Za-z]+)*")
+# English and French dictionaries are created once at import time and shared
+# by every caller; both stay None when the spell-check package is missing.
+if HAS_SPELLCHECK:
+    _SPELL_EN = SpellChecker(language="en")
+    _SPELL_FR = SpellChecker(language="fr")
+else:
+    _SPELL_EN = None
+    _SPELL_FR = None
+# Project vocabulary that _is_known_word() treats as correctly spelled.
+# NOTE(review): _WORD_RE matches letters only, so "G7" can never come out of
+# _extract_tokens() as a single token -- confirm whether that entry is needed.
+_DOMAIN_ALLOWLIST = {
+    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
+    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid"
+}
+# Register the lower-cased allowlist with both dictionaries when available.
+if _SPELL_EN and _SPELL_FR:
+    _SPELL_EN.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)
+    _SPELL_FR.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)
+def _normalize_text(s: str) -> str:
+    """Return *s* in NFC form with typographic apostrophes made ASCII.
+
+    Leading and trailing whitespace is stripped.  The right single quotation
+    mark (U+2019) is mapped to "'" so contractions such as "l’eau" match the
+    apostrophe accepted by the word pattern.
+    """
+    s = unicodedata.normalize("NFC", s)
+    # Written as an escape on purpose: the previous literal replaced "'" with
+    # itself, which left curly apostrophes untouched.
+    return s.replace("\u2019", "'").strip()
+def _extract_tokens(raw: str):
+    """Return every _WORD_RE match found in *raw* after normalization.
+
+    A falsy *raw* (``None`` or ``""``) is treated as the empty string, so the
+    result is then an empty list.
+    """
+    s = _normalize_text(raw or "")
+    return _WORD_RE.findall(s)
+def _looks_like_acronym(tok: str) -> bool:
+    """Return True when ``tok.isupper()`` holds and *tok* has 2 to 6 characters."""
+    return tok.isupper() and 2 <= len(tok) <= 6
+def _has_digits(tok: str) -> bool:
+    """Return True when at least one character of *tok* is a digit."""
+    return any(ch.isdigit() for ch in tok)
+def _is_known_word(tok: str) -> bool:
+    """Return True when *tok* should not be reported as a misspelling.
+
+    A token is accepted when it is in the domain allowlist (compared
+    case-insensitively), looks like an acronym, contains a digit, or is known
+    to the English or the French dictionary.  When a dictionary is ``None``
+    its check is skipped.
+    """
+    t = tok.lower()
+    # The lower-cased allowlist is regenerated on every call.
+    if t in (w.lower() for w in _DOMAIN_ALLOWLIST) or _looks_like_acronym(tok) or _has_digits(tok):
+        return True
+    if _SPELL_EN and not _SPELL_EN.unknown([t]):  # known in EN
+        return True
+    if _SPELL_FR and not _SPELL_FR.unknown([t]):  # known in FR
+        return True
+    return False
+# (optional) keep a compatibility shim so any other code calling normalize_token() won't break
+# NOTE(review): this diff keeps the legacy normalize_token() as unchanged
+# context further down the module; being defined later, that version replaces
+# this shim at import time -- confirm which one is meant to win.
+def normalize_token(token: str) -> str:
+    """Return the first word token of *token*, lower-cased, or "" if there is none."""
+    toks = _extract_tokens(token)
+    return (toks[0].lower() if toks else "")
 # -------------------- Helpers ----------------------
 def _is_pdf(path: str) -> bool:
     return os.path.splitext(path.lower())[1] == ".pdf"
     return Image.fromarray(A)
 # -------------------- OCR + Spellcheck -------------
+from typing import List, Iterable, Optional
+from PIL import Image
+import unicodedata
+# Optional third-party dependencies are imported defensively: a missing
+# package must disable the feature, not make the whole module fail to import.
+try:
+    import regex as re
+    HAS_REGEX = True
+except Exception:
+    import re
+    HAS_REGEX = False
+try:
+    import pytesseract
+except Exception:
+    HAS_OCR = False
+else:
+    # Keep a flag already computed earlier in the module, if there is one.
+    HAS_OCR = globals().get("HAS_OCR", True)
+try:
+    from spellchecker import SpellChecker
+except Exception:
+    SpellChecker = None
+    HAS_SPELLCHECK = False
+else:
+    # Keep a flag already computed earlier in the module, if there is one.
+    HAS_SPELLCHECK = globals().get("HAS_SPELLCHECK", True)
+# ---- spell/tokenization helpers & caches ----
+# \p{Letter} is only understood by the `regex` module; fall back to an
+# ASCII-only word pattern when only the standard `re` module is available.
+if HAS_REGEX:
+    _WORD_RE = re.compile(r"\p{Letter}+(?:[’'\-]\p{Letter}+)*", re.UNICODE)
+else:
+    _WORD_RE = re.compile(r"[A-Za-z]+(?:[’'\-][A-Za-z]+)*")
+# Dictionaries are built once at import time; both stay None without the
+# spell-check package.
+if HAS_SPELLCHECK:
+    _SPELL_EN = SpellChecker(language="en")
+    _SPELL_FR = SpellChecker(language="fr")
+else:
+    _SPELL_EN = None
+    _SPELL_FR = None
+# Project vocabulary that is treated as correctly spelled.
+_DOMAIN_ALLOWLIST = {
+    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
+    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid"
+}
+if _SPELL_EN is not None and _SPELL_FR is not None:
+    _SPELL_EN.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)
+    _SPELL_FR.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)
+def _normalize_text(s: str) -> str:
+    """Return *s* in NFC form, with "’" replaced by "'" and outer whitespace stripped."""
+    s = unicodedata.normalize("NFC", s)
+    return s.replace("’", "'").strip()
+def _extract_tokens(raw: str):
+    """Return every _WORD_RE match found in *raw* after normalization.
+
+    A falsy *raw* (``None`` or ``""``) is treated as the empty string, so the
+    result is then an empty list.
+    """
+    s = _normalize_text(raw or "")
+    return _WORD_RE.findall(s)
+def _looks_like_acronym(tok: str) -> bool:
+    """Return True when ``tok.isupper()`` holds and *tok* has 2 to 6 characters."""
+    return tok.isupper() and 2 <= len(tok) <= 6
+def _has_digits(tok: str) -> bool:
+    """Return True when at least one character of *tok* is a digit."""
+    return any(ch.isdigit() for ch in tok)
+def _is_known_word(tok: str) -> bool:
+    """Return True when *tok* should not be reported as a misspelling.
+
+    A token is accepted when it is in the domain allowlist (compared
+    case-insensitively), looks like an acronym, contains a digit, or is known
+    to the English or the French dictionary.  A dictionary that is ``None``
+    (spell-check package unavailable) is skipped instead of being
+    dereferenced, matching the guarded variant defined earlier in the module.
+    """
+    t = tok.lower()
+    if t in (w.lower() for w in _DOMAIN_ALLOWLIST) or _looks_like_acronym(tok) or _has_digits(tok):
+        return True
+    if _SPELL_EN is not None and not _SPELL_EN.unknown([t]):  # known in EN
+        return True
+    if _SPELL_FR is not None and not _SPELL_FR.unknown([t]):  # known in FR
+        return True
+    return False
+# (optional) keep a compatibility shim so any other code calling normalize_token() won't break
+# NOTE(review): the legacy normalize_token() kept as unchanged context right
+# after this shim is defined later in the module and therefore replaces it at
+# import time -- confirm which one is meant to win.
+def normalize_token(token: str) -> str:
+    """Return the first word token of *token*, lower-cased, or "" if there is none."""
+    toks = _extract_tokens(token)
+    return (toks[0].lower() if toks else "")
 def normalize_token(token: str) -> str:
     cleaned = re.sub(r"[^A-Za-z']", "", token)
     return cleaned.lower()
+def find_misspell_boxes(
+    img: Image.Image,
+    *,
+    min_conf: int = 60,
+    lang: str = "eng+fra",
+    extra_allow: Optional[Iterable[str]] = None
+) -> List["Box"]:
+    """Return a Box for every OCR text entry that contains an unknown word.
+
+    Args:
+        img: Page image handed to ``pytesseract.image_to_data``.
+        min_conf: Entries whose OCR confidence is below this value (or whose
+            confidence cannot be parsed) are skipped.
+        lang: Language string passed through to pytesseract.
+        extra_allow: Additional words, compared case-insensitively, that are
+            treated as known for this call only.
+
+    Returns:
+        ``Box(top, left, bottom, right, area)`` for each flagged entry.  The
+        list is empty when OCR or spell checking is unavailable, or when the
+        OCR step raises.
+    """
      if not (HAS_OCR and HAS_SPELLCHECK):
          return []
      try:
+        # Per-call words live in a local set.  Loading them into the shared
+        # dictionaries made them leak into every later call, and iterating
+        # extra_allow twice left the second dictionary empty for one-shot
+        # iterables such as generators.
+        extra = {w.lower() for w in (extra_allow or ())}
+        data = pytesseract.image_to_data(
+            img,
+            lang=lang,
+            output_type=pytesseract.Output.DICT,
+            # config="--psm 6"  # uncomment if your pages are simple blocks of text
+        )
      except Exception:
          return []
+    n = len(data.get("text", []))
+
+    def _column(key, default):
+        # A missing or short column becomes n defaults, so indexing by row can
+        # never raise IndexError (a one-element default only covered row 0).
+        values = data.get(key)
+        if values is None or len(values) < n:
+            return [default] * n
+        return values
+
+    confs = _column("conf", "-1")
+    lefts = _column("left", 0)
+    tops = _column("top", 0)
+    widths = _column("width", 0)
+    heights = _column("height", 0)
+    boxes: List["Box"] = []
      for i in range(n):
+        raw = data["text"][i]
+        if not raw:
              continue
+        # confidence filter
+        try:
+            conf = int(float(confs[i]))
+        except Exception:
+            conf = -1
+        if conf < min_conf:
+            continue
+        tokens = _extract_tokens(raw)
+        if not tokens:
              continue
+        # flag the box if ANY token in it looks misspelled
+        if all(len(tok) < 2 or tok.lower() in extra or _is_known_word(tok) for tok in tokens):
              continue
+        left   = lefts[i]
+        top    = tops[i]
+        width  = widths[i]
+        height = heights[i]
          if width <= 0 or height <= 0:
              continue
+        # NOTE: adjust to match your Box constructor if needed
+        boxes.append(Box(top, left, top + height, left + width, width * height))
      return boxes
 # -------------------- Barcode / QR -----------------
 def ean_like_checksum_ok(digits: str) -> bool:
     if not digits.isdigit():

requirements.txt CHANGED Viewed

@@ -6,13 +6,13 @@ Pillow==10.0.1
 opencv-python==4.8.1.78
 pytesseract==0.3.10
 pyzbar==0.1.9
-pyspellchecker==0.7.2
 nltk==3.8.1
 numpy==1.24.3
 scikit-image==0.21.0
 matplotlib==3.7.2
 pandas==2.0.3
 reportlab==4.0.4
-regex==2023.10.3
 gradio==4.44.1
 PyMuPDF==1.23.8

 opencv-python==4.8.1.78
 pytesseract==0.3.10
 pyzbar==0.1.9
+pyspellchecker==0.8.3
 nltk==3.8.1
 numpy==1.24.3
 scikit-image==0.21.0
 matplotlib==3.7.2
 pandas==2.0.3
 reportlab==4.0.4
+regex==2025.9.1
 gradio==4.44.1
 PyMuPDF==1.23.8