Digitaljoint
/

ProofCheck

document-processing

pdf

ocr

comparator

Model card Files Files and versions

xet

Community

Yaz Hobooti committed on Sep 17, 2025

Commit

e88aad6

1 Parent(s): cdad8f0

Improve OCR performance: add image preprocessing, higher DPI, better Tesseract config

Browse files

Files changed (1) hide show

pdf_comparator.py +33 -6

pdf_comparator.py CHANGED Viewed

@@ -126,7 +126,7 @@ def normalize_token(token: str) -> str:
 def _is_pdf(path: str) -> bool:
     return os.path.splitext(path.lower())[1] == ".pdf"
-def load_pdf_pages(path: str, dpi: int = 300, max_pages: int = 5) -> List[Image.Image]:
     if _is_pdf(path):
         # Try pdf2image with multiple poppler paths first
         poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
@@ -313,19 +313,43 @@ def _get_available_tesseract_langs():
     except Exception:
         return "eng"
 def find_misspell_boxes(
     img: Image.Image,
     *,
     min_conf: int = 60,
     lang: Optional[str] = None,
-    extra_allow: Optional[Iterable[str]] = None
 ) -> List[Box]:
     if not (HAS_OCR and HAS_SPELLCHECK):
         return []
     # Auto-detect language if not provided
     if lang is None:
-        lang = _get_available_tesseract_langs()
     try:
         if extra_allow and _SPELL_EN:
@@ -333,11 +357,14 @@ def find_misspell_boxes(
         if extra_allow and _SPELL_FR:
             _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
         data = pytesseract.image_to_data(
             img,
             lang=lang,
             output_type=pytesseract.Output.DICT,
-            # config="--psm 6"  # uncomment if your pages are simple blocks of text
         )
     except Exception:
         return []
@@ -502,8 +529,8 @@ def compare_pdfs(file_a, file_b):
             return None, None, None, "❌ Please upload both PDF files to compare", [], []
         # Load images with multiple pages support
-        pages_a = load_pdf_pages(file_a.name, dpi=300, max_pages=5)
-        pages_b = load_pdf_pages(file_b.name, dpi=300, max_pages=5)
         # Combine pages into single images for comparison
         a = combine_pages_vertically(pages_a)

 def _is_pdf(path: str) -> bool:
     return os.path.splitext(path.lower())[1] == ".pdf"
+def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
     if _is_pdf(path):
         # Try pdf2image with multiple poppler paths first
         poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
     except Exception:
         return "eng"
+def prepare_for_ocr(img: Image.Image) -> Image.Image:
+    """Prepare image for better OCR results"""
+    from PIL import ImageOps, ImageFilter
+    g = img.convert("L")
+    g = ImageOps.autocontrast(g)
+    g = g.filter(ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2))
+    return g
 def find_misspell_boxes(
     img: Image.Image,
     *,
     min_conf: int = 60,
     lang: Optional[str] = None,
+    extra_allow: Optional[Iterable[str]] = None,
+    dpi: int = 300,
+    psm: int = 6,
+    oem: int = 3
 ) -> List[Box]:
     if not (HAS_OCR and HAS_SPELLCHECK):
         return []
     # Auto-detect language if not provided
     if lang is None:
+        try:
+            avail = set(pytesseract.get_languages(config="") or [])
+        except Exception:
+            avail = {"eng"}
+        lang = "eng+fra" if {"eng","fra"}.issubset(avail) else "eng"
+    # OPTIONAL: light upscale if the image is small (heuristic)
+    # target width ~ 2500–3000 px for letter-sized pages
+    if img.width < 1600:
+        scale = 2
+        img = img.resize((img.width*scale, img.height*scale), Image.LANCZOS)
+    # Prepare image for better OCR
+    img = prepare_for_ocr(img)
     try:
         if extra_allow and _SPELL_EN:
         if extra_allow and _SPELL_FR:
             _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
+        # Build a config that sets an explicit DPI and keeps spaces
+        config = f"--psm {psm} --oem {oem} -c preserve_interword_spaces=1 -c user_defined_dpi={dpi}"
         data = pytesseract.image_to_data(
             img,
             lang=lang,
+            config=config,
             output_type=pytesseract.Output.DICT,
         )
     except Exception:
         return []
             return None, None, None, "❌ Please upload both PDF files to compare", [], []
         # Load images with multiple pages support
+        pages_a = load_pdf_pages(file_a.name, dpi=400, max_pages=5)
+        pages_b = load_pdf_pages(file_b.name, dpi=400, max_pages=5)
         # Combine pages into single images for comparison
         a = combine_pages_vertically(pages_a)