Yaz Hobooti committed on
Commit
e88aad6
·
1 Parent(s): cdad8f0

Improve OCR performance: add image preprocessing, higher DPI, better Tesseract config

Browse files
Files changed (1) hide show
  1. pdf_comparator.py +33 -6
pdf_comparator.py CHANGED
@@ -126,7 +126,7 @@ def normalize_token(token: str) -> str:
126
  def _is_pdf(path: str) -> bool:
127
  return os.path.splitext(path.lower())[1] == ".pdf"
128
 
129
- def load_pdf_pages(path: str, dpi: int = 300, max_pages: int = 5) -> List[Image.Image]:
130
  if _is_pdf(path):
131
  # Try pdf2image with multiple poppler paths first
132
  poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
@@ -313,19 +313,43 @@ def _get_available_tesseract_langs():
313
  except Exception:
314
  return "eng"
315
 
 
 
 
 
 
 
 
 
316
  def find_misspell_boxes(
317
  img: Image.Image,
318
  *,
319
  min_conf: int = 60,
320
  lang: Optional[str] = None,
321
- extra_allow: Optional[Iterable[str]] = None
 
 
 
322
  ) -> List[Box]:
323
  if not (HAS_OCR and HAS_SPELLCHECK):
324
  return []
325
 
326
  # Auto-detect language if not provided
327
  if lang is None:
328
- lang = _get_available_tesseract_langs()
 
 
 
 
 
 
 
 
 
 
 
 
 
329
 
330
  try:
331
  if extra_allow and _SPELL_EN:
@@ -333,11 +357,14 @@ def find_misspell_boxes(
333
  if extra_allow and _SPELL_FR:
334
  _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
335
 
 
 
 
336
  data = pytesseract.image_to_data(
337
  img,
338
  lang=lang,
 
339
  output_type=pytesseract.Output.DICT,
340
- # config="--psm 6" # uncomment if your pages are simple blocks of text
341
  )
342
  except Exception:
343
  return []
@@ -502,8 +529,8 @@ def compare_pdfs(file_a, file_b):
502
  return None, None, None, "❌ Please upload both PDF files to compare", [], []
503
 
504
  # Load images with multiple pages support
505
- pages_a = load_pdf_pages(file_a.name, dpi=300, max_pages=5)
506
- pages_b = load_pdf_pages(file_b.name, dpi=300, max_pages=5)
507
 
508
  # Combine pages into single images for comparison
509
  a = combine_pages_vertically(pages_a)
 
126
  def _is_pdf(path: str) -> bool:
127
  return os.path.splitext(path.lower())[1] == ".pdf"
128
 
129
+ def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
130
  if _is_pdf(path):
131
  # Try pdf2image with multiple poppler paths first
132
  poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
 
313
  except Exception:
314
  return "eng"
315
 
316
+ def prepare_for_ocr(img: Image.Image) -> Image.Image:
317
+ """Prepare image for better OCR results"""
318
+ from PIL import ImageOps, ImageFilter
319
+ g = img.convert("L")
320
+ g = ImageOps.autocontrast(g)
321
+ g = g.filter(ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2))
322
+ return g
323
+
324
  def find_misspell_boxes(
325
  img: Image.Image,
326
  *,
327
  min_conf: int = 60,
328
  lang: Optional[str] = None,
329
+ extra_allow: Optional[Iterable[str]] = None,
330
+ dpi: int = 300,
331
+ psm: int = 6,
332
+ oem: int = 3
333
  ) -> List[Box]:
334
  if not (HAS_OCR and HAS_SPELLCHECK):
335
  return []
336
 
337
  # Auto-detect language if not provided
338
  if lang is None:
339
+ try:
340
+ avail = set(pytesseract.get_languages(config="") or [])
341
+ except Exception:
342
+ avail = {"eng"}
343
+ lang = "eng+fra" if {"eng","fra"}.issubset(avail) else "eng"
344
+
345
+ # OPTIONAL: light upscale if the image is small (heuristic)
346
+ # target width ~ 2500–3000 px for letter-sized pages
347
+ if img.width < 1600:
348
+ scale = 2
349
+ img = img.resize((img.width*scale, img.height*scale), Image.LANCZOS)
350
+
351
+ # Prepare image for better OCR
352
+ img = prepare_for_ocr(img)
353
 
354
  try:
355
  if extra_allow and _SPELL_EN:
 
357
  if extra_allow and _SPELL_FR:
358
  _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
359
 
360
+ # Build a config that sets an explicit DPI and keeps spaces
361
+ config = f"--psm {psm} --oem {oem} -c preserve_interword_spaces=1 -c user_defined_dpi={dpi}"
362
+
363
  data = pytesseract.image_to_data(
364
  img,
365
  lang=lang,
366
+ config=config,
367
  output_type=pytesseract.Output.DICT,
 
368
  )
369
  except Exception:
370
  return []
 
529
  return None, None, None, "❌ Please upload both PDF files to compare", [], []
530
 
531
  # Load images with multiple pages support
532
+ pages_a = load_pdf_pages(file_a.name, dpi=400, max_pages=5)
533
+ pages_b = load_pdf_pages(file_b.name, dpi=400, max_pages=5)
534
 
535
  # Combine pages into single images for comparison
536
  a = combine_pages_vertically(pages_a)