Yaz Hobooti
committed on
Commit
·
767e261
1
Parent(s):
3ac3a56
COMPLETE REWRITE: Fix all syntax errors in pdf_comparator.py
Browse files
- Completely rewrote load_pdf_pages function with cleaner structure
- Fixed indentation error at line 675 in barcode detection
- Fixed indentation error at line 1271 in debug function
- All syntax errors resolved - file now compiles successfully
- Simplified exception handling and improved code structure
- pdf_comparator.py +39 -41
pdf_comparator.py
CHANGED
|
@@ -371,47 +371,45 @@ def _contains_validation_text(text: str) -> bool:
|
|
| 371 |
return "50 Carroll" in text
|
| 372 |
|
| 373 |
def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
|
|
|
| 388 |
return [img.convert("RGB") for img in imgs]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
except Exception as e:
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
# Fallback to PyMuPDF if pdf2image fails
|
| 395 |
-
if HAS_PYMUPDF:
|
| 396 |
-
try:
|
| 397 |
-
doc = fitz.open(path)
|
| 398 |
-
pages = []
|
| 399 |
-
for page_num in range(min(len(doc), max_pages)):
|
| 400 |
-
page = doc[page_num]
|
| 401 |
-
mat = fitz.Matrix(dpi/72, dpi/72) # Scale factor for DPI
|
| 402 |
-
pix = page.get_pixmap(matrix=mat)
|
| 403 |
-
img_data = pix.tobytes("ppm")
|
| 404 |
-
img = Image.open(io.BytesIO(img_data))
|
| 405 |
-
pages.append(img.convert("RGB"))
|
| 406 |
-
doc.close()
|
| 407 |
-
return pages
|
| 408 |
-
except Exception as e:
|
| 409 |
-
raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. pdf2image error: poppler not found. PyMuPDF error: {str(e)}")
|
| 410 |
-
else:
|
| 411 |
-
raise ValueError(f"Failed to convert PDF to image with all poppler paths. Last error: poppler not found. PyMuPDF not available as fallback.")
|
| 412 |
-
|
| 413 |
-
raise ValueError(f"No pages in PDF: {path}")
|
| 414 |
-
return [Image.open(path).convert("RGB")]
|
| 415 |
|
| 416 |
def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
|
| 417 |
"""Combine multiple pages into a single vertical image"""
|
|
@@ -667,7 +665,7 @@ def find_misspell_boxes_from_text(
|
|
| 667 |
y1 = int(bbox[1] * scale_y) + (page_num * img_height)
|
| 668 |
x2 = int(bbox[2] * scale_x)
|
| 669 |
y2 = int(bbox[3] * scale_y) + (page_num * img_height)
|
| 670 |
-
|
| 671 |
# Use PDF coordinates directly (fallback)
|
| 672 |
x1 = int(bbox[0])
|
| 673 |
y1 = int(bbox[1]) + (page_num * 1000)
|
|
@@ -1270,7 +1268,7 @@ def debug_scan_pdf(pdf_path: str, outdir: str = "barcode_debug", max_pages=2):
|
|
| 1270 |
rr = _decode_once(pil) or _decode_once(_binarize(pil))
|
| 1271 |
if rr:
|
| 1272 |
print(f" Embedded image {ix+1}: {[(r.type, r.data) for r in rr]}")
|
| 1273 |
-
|
| 1274 |
print(" Embedded image error:", e)
|
| 1275 |
|
| 1276 |
doc.close()
|
|
|
|
| 371 |
return "50 Carroll" in text
|
| 372 |
|
| 373 |
def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
|
| 374 |
+
"""Load PDF pages as images with fallback options"""
|
| 375 |
+
if not _is_pdf(path):
|
| 376 |
+
return [Image.open(path).convert("RGB")]
|
| 377 |
+
|
| 378 |
+
# Try pdf2image first
|
| 379 |
+
poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
|
| 380 |
+
|
| 381 |
+
for poppler_path in poppler_paths:
|
| 382 |
+
try:
|
| 383 |
+
if poppler_path:
|
| 384 |
+
imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages, poppler_path=poppler_path)
|
| 385 |
+
else:
|
| 386 |
+
imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages)
|
| 387 |
+
|
| 388 |
+
if imgs:
|
| 389 |
return [img.convert("RGB") for img in imgs]
|
| 390 |
+
except Exception:
|
| 391 |
+
if poppler_path is None: # All pdf2image attempts failed
|
| 392 |
+
break
|
| 393 |
+
continue # Try next path
|
| 394 |
+
|
| 395 |
+
# Fallback to PyMuPDF
|
| 396 |
+
if HAS_PYMUPDF:
|
| 397 |
+
try:
|
| 398 |
+
doc = fitz.open(path)
|
| 399 |
+
pages = []
|
| 400 |
+
for page_num in range(min(len(doc), max_pages)):
|
| 401 |
+
page = doc[page_num]
|
| 402 |
+
mat = fitz.Matrix(dpi/72, dpi/72)
|
| 403 |
+
pix = page.get_pixmap(matrix=mat)
|
| 404 |
+
img_data = pix.tobytes("ppm")
|
| 405 |
+
img = Image.open(io.BytesIO(img_data))
|
| 406 |
+
pages.append(img.convert("RGB"))
|
| 407 |
+
doc.close()
|
| 408 |
+
return pages
|
| 409 |
except Exception as e:
|
| 410 |
+
raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. Error: {str(e)}")
|
| 411 |
+
|
| 412 |
+
raise ValueError("Failed to convert PDF to image. No working method available.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
|
| 414 |
def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
|
| 415 |
"""Combine multiple pages into a single vertical image"""
|
|
|
|
| 665 |
y1 = int(bbox[1] * scale_y) + (page_num * img_height)
|
| 666 |
x2 = int(bbox[2] * scale_x)
|
| 667 |
y2 = int(bbox[3] * scale_y) + (page_num * img_height)
|
| 668 |
+
else:
|
| 669 |
# Use PDF coordinates directly (fallback)
|
| 670 |
x1 = int(bbox[0])
|
| 671 |
y1 = int(bbox[1]) + (page_num * 1000)
|
|
|
|
| 1268 |
rr = _decode_once(pil) or _decode_once(_binarize(pil))
|
| 1269 |
if rr:
|
| 1270 |
print(f" Embedded image {ix+1}: {[(r.type, r.data) for r in rr]}")
|
| 1271 |
+
except Exception as e:
|
| 1272 |
print(" Embedded image error:", e)
|
| 1273 |
|
| 1274 |
doc.close()
|