Yaz Hobooti committed on
Commit
767e261
·
1 Parent(s): 3ac3a56

COMPLETE REWRITE: Fix all syntax errors in pdf_comparator.py

Browse files

- Completely rewrote load_pdf_pages function with cleaner structure
- Fixed indentation error at line 675 in barcode detection
- Fixed indentation error at line 1271 in debug function
- All syntax errors resolved - file now compiles successfully
- Simplified exception handling and improved code structure

Files changed (1) hide show
  1. pdf_comparator.py +39 -41
pdf_comparator.py CHANGED
@@ -371,47 +371,45 @@ def _contains_validation_text(text: str) -> bool:
371
  return "50 Carroll" in text
372
 
373
  def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
374
- if _is_pdf(path):
375
- # Try pdf2image with multiple poppler paths first
376
- poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
377
-
378
- for poppler_path in poppler_paths:
379
- try:
380
- if poppler_path:
381
- imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages, poppler_path=poppler_path)
382
- else:
383
- imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages)
384
-
385
- if not imgs:
386
- continue
387
-
 
388
  return [img.convert("RGB") for img in imgs]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  except Exception as e:
390
- if poppler_path is None: # All pdf2image attempts failed
391
- break
392
- continue # Try next path
393
-
394
- # Fallback to PyMuPDF if pdf2image fails
395
- if HAS_PYMUPDF:
396
- try:
397
- doc = fitz.open(path)
398
- pages = []
399
- for page_num in range(min(len(doc), max_pages)):
400
- page = doc[page_num]
401
- mat = fitz.Matrix(dpi/72, dpi/72) # Scale factor for DPI
402
- pix = page.get_pixmap(matrix=mat)
403
- img_data = pix.tobytes("ppm")
404
- img = Image.open(io.BytesIO(img_data))
405
- pages.append(img.convert("RGB"))
406
- doc.close()
407
- return pages
408
- except Exception as e:
409
- raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. pdf2image error: poppler not found. PyMuPDF error: {str(e)}")
410
- else:
411
- raise ValueError(f"Failed to convert PDF to image with all poppler paths. Last error: poppler not found. PyMuPDF not available as fallback.")
412
-
413
- raise ValueError(f"No pages in PDF: {path}")
414
- return [Image.open(path).convert("RGB")]
415
 
416
  def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
417
  """Combine multiple pages into a single vertical image"""
@@ -667,7 +665,7 @@ def find_misspell_boxes_from_text(
667
  y1 = int(bbox[1] * scale_y) + (page_num * img_height)
668
  x2 = int(bbox[2] * scale_x)
669
  y2 = int(bbox[3] * scale_y) + (page_num * img_height)
670
- else:
671
  # Use PDF coordinates directly (fallback)
672
  x1 = int(bbox[0])
673
  y1 = int(bbox[1]) + (page_num * 1000)
@@ -1270,7 +1268,7 @@ def debug_scan_pdf(pdf_path: str, outdir: str = "barcode_debug", max_pages=2):
1270
  rr = _decode_once(pil) or _decode_once(_binarize(pil))
1271
  if rr:
1272
  print(f" Embedded image {ix+1}: {[(r.type, r.data) for r in rr]}")
1273
- except Exception as e:
1274
  print(" Embedded image error:", e)
1275
 
1276
  doc.close()
 
371
  return "50 Carroll" in text
372
 
373
  def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
374
+ """Load PDF pages as images with fallback options"""
375
+ if not _is_pdf(path):
376
+ return [Image.open(path).convert("RGB")]
377
+
378
+ # Try pdf2image first
379
+ poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
380
+
381
+ for poppler_path in poppler_paths:
382
+ try:
383
+ if poppler_path:
384
+ imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages, poppler_path=poppler_path)
385
+ else:
386
+ imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages)
387
+
388
+ if imgs:
389
  return [img.convert("RGB") for img in imgs]
390
+ except Exception:
391
+ if poppler_path is None: # All pdf2image attempts failed
392
+ break
393
+ continue # Try next path
394
+
395
+ # Fallback to PyMuPDF
396
+ if HAS_PYMUPDF:
397
+ try:
398
+ doc = fitz.open(path)
399
+ pages = []
400
+ for page_num in range(min(len(doc), max_pages)):
401
+ page = doc[page_num]
402
+ mat = fitz.Matrix(dpi/72, dpi/72)
403
+ pix = page.get_pixmap(matrix=mat)
404
+ img_data = pix.tobytes("ppm")
405
+ img = Image.open(io.BytesIO(img_data))
406
+ pages.append(img.convert("RGB"))
407
+ doc.close()
408
+ return pages
409
  except Exception as e:
410
+ raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. Error: {str(e)}")
411
+
412
+ raise ValueError("Failed to convert PDF to image. No working method available.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
 
414
  def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
415
  """Combine multiple pages into a single vertical image"""
 
665
  y1 = int(bbox[1] * scale_y) + (page_num * img_height)
666
  x2 = int(bbox[2] * scale_x)
667
  y2 = int(bbox[3] * scale_y) + (page_num * img_height)
668
+ else:
669
  # Use PDF coordinates directly (fallback)
670
  x1 = int(bbox[0])
671
  y1 = int(bbox[1]) + (page_num * 1000)
 
1268
  rr = _decode_once(pil) or _decode_once(_binarize(pil))
1269
  if rr:
1270
  print(f" Embedded image {ix+1}: {[(r.type, r.data) for r in rr]}")
1271
+ except Exception as e:
1272
  print(" Embedded image error:", e)
1273
 
1274
  doc.close()