Yaz Hobooti
committed on
Commit
·
e88aad6
1
Parent(s):
cdad8f0
Improve OCR performance: add image preprocessing, higher DPI, better Tesseract config
Browse files - pdf_comparator.py +33 -6
pdf_comparator.py
CHANGED
|
@@ -126,7 +126,7 @@ def normalize_token(token: str) -> str:
|
|
| 126 |
def _is_pdf(path: str) -> bool:
|
| 127 |
return os.path.splitext(path.lower())[1] == ".pdf"
|
| 128 |
|
| 129 |
-
def load_pdf_pages(path: str, dpi: int =
|
| 130 |
if _is_pdf(path):
|
| 131 |
# Try pdf2image with multiple poppler paths first
|
| 132 |
poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
|
|
@@ -313,19 +313,43 @@ def _get_available_tesseract_langs():
|
|
| 313 |
except Exception:
|
| 314 |
return "eng"
|
| 315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
def find_misspell_boxes(
|
| 317 |
img: Image.Image,
|
| 318 |
*,
|
| 319 |
min_conf: int = 60,
|
| 320 |
lang: Optional[str] = None,
|
| 321 |
-
extra_allow: Optional[Iterable[str]] = None
|
|
|
|
|
|
|
|
|
|
| 322 |
) -> List[Box]:
|
| 323 |
if not (HAS_OCR and HAS_SPELLCHECK):
|
| 324 |
return []
|
| 325 |
|
| 326 |
# Auto-detect language if not provided
|
| 327 |
if lang is None:
|
| 328 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
|
| 330 |
try:
|
| 331 |
if extra_allow and _SPELL_EN:
|
|
@@ -333,11 +357,14 @@ def find_misspell_boxes(
|
|
| 333 |
if extra_allow and _SPELL_FR:
|
| 334 |
_SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
|
| 335 |
|
|
|
|
|
|
|
|
|
|
| 336 |
data = pytesseract.image_to_data(
|
| 337 |
img,
|
| 338 |
lang=lang,
|
|
|
|
| 339 |
output_type=pytesseract.Output.DICT,
|
| 340 |
-
# config="--psm 6" # uncomment if your pages are simple blocks of text
|
| 341 |
)
|
| 342 |
except Exception:
|
| 343 |
return []
|
|
@@ -502,8 +529,8 @@ def compare_pdfs(file_a, file_b):
|
|
| 502 |
return None, None, None, "❌ Please upload both PDF files to compare", [], []
|
| 503 |
|
| 504 |
# Load images with multiple pages support
|
| 505 |
-
pages_a = load_pdf_pages(file_a.name, dpi=
|
| 506 |
-
pages_b = load_pdf_pages(file_b.name, dpi=
|
| 507 |
|
| 508 |
# Combine pages into single images for comparison
|
| 509 |
a = combine_pages_vertically(pages_a)
|
|
|
|
| 126 |
def _is_pdf(path: str) -> bool:
|
| 127 |
return os.path.splitext(path.lower())[1] == ".pdf"
|
| 128 |
|
| 129 |
+
def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
|
| 130 |
if _is_pdf(path):
|
| 131 |
# Try pdf2image with multiple poppler paths first
|
| 132 |
poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
|
|
|
|
| 313 |
except Exception:
|
| 314 |
return "eng"
|
| 315 |
|
| 316 |
+
def prepare_for_ocr(img: Image.Image) -> Image.Image:
|
| 317 |
+
"""Prepare image for better OCR results"""
|
| 318 |
+
from PIL import ImageOps, ImageFilter
|
| 319 |
+
g = img.convert("L")
|
| 320 |
+
g = ImageOps.autocontrast(g)
|
| 321 |
+
g = g.filter(ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2))
|
| 322 |
+
return g
|
| 323 |
+
|
| 324 |
def find_misspell_boxes(
|
| 325 |
img: Image.Image,
|
| 326 |
*,
|
| 327 |
min_conf: int = 60,
|
| 328 |
lang: Optional[str] = None,
|
| 329 |
+
extra_allow: Optional[Iterable[str]] = None,
|
| 330 |
+
dpi: int = 300,
|
| 331 |
+
psm: int = 6,
|
| 332 |
+
oem: int = 3
|
| 333 |
) -> List[Box]:
|
| 334 |
if not (HAS_OCR and HAS_SPELLCHECK):
|
| 335 |
return []
|
| 336 |
|
| 337 |
# Auto-detect language if not provided
|
| 338 |
if lang is None:
|
| 339 |
+
try:
|
| 340 |
+
avail = set(pytesseract.get_languages(config="") or [])
|
| 341 |
+
except Exception:
|
| 342 |
+
avail = {"eng"}
|
| 343 |
+
lang = "eng+fra" if {"eng","fra"}.issubset(avail) else "eng"
|
| 344 |
+
|
| 345 |
+
# OPTIONAL: light upscale if the image is small (heuristic)
|
| 346 |
+
# target width ~ 2500–3000 px for letter-sized pages
|
| 347 |
+
if img.width < 1600:
|
| 348 |
+
scale = 2
|
| 349 |
+
img = img.resize((img.width*scale, img.height*scale), Image.LANCZOS)
|
| 350 |
+
|
| 351 |
+
# Prepare image for better OCR
|
| 352 |
+
img = prepare_for_ocr(img)
|
| 353 |
|
| 354 |
try:
|
| 355 |
if extra_allow and _SPELL_EN:
|
|
|
|
| 357 |
if extra_allow and _SPELL_FR:
|
| 358 |
_SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
|
| 359 |
|
| 360 |
+
# Build a config that sets an explicit DPI and keeps spaces
|
| 361 |
+
config = f"--psm {psm} --oem {oem} -c preserve_interword_spaces=1 -c user_defined_dpi={dpi}"
|
| 362 |
+
|
| 363 |
data = pytesseract.image_to_data(
|
| 364 |
img,
|
| 365 |
lang=lang,
|
| 366 |
+
config=config,
|
| 367 |
output_type=pytesseract.Output.DICT,
|
|
|
|
| 368 |
)
|
| 369 |
except Exception:
|
| 370 |
return []
|
|
|
|
| 529 |
return None, None, None, "❌ Please upload both PDF files to compare", [], []
|
| 530 |
|
| 531 |
# Load images with multiple pages support
|
| 532 |
+
pages_a = load_pdf_pages(file_a.name, dpi=400, max_pages=5)
|
| 533 |
+
pages_b = load_pdf_pages(file_b.name, dpi=400, max_pages=5)
|
| 534 |
|
| 535 |
# Combine pages into single images for comparison
|
| 536 |
a = combine_pages_vertically(pages_a)
|