Yaz Hobooti committed on
Commit
8cec543
·
1 Parent(s): 47f3477

Implement PDF-based barcode/QR code detection

Browse files

- Add extract_pdf_images() function to extract images directly from PDF using PyMuPDF
- Add find_barcode_boxes_and_info_from_pdf() function for PDF-based barcode detection
- Update compare_pdfs() to use PDF-based barcode detection instead of rasterized images
- Add proper coordinate mapping from PDF space to image space
- Maintain fallback to legacy rasterized image detection
- Add debug output for barcode detection results
- Barcode detection now operates on original PDF for better accuracy and performance

Files changed (1) hide show
  1. pdf_comparator.py +123 -2
pdf_comparator.py CHANGED
@@ -602,7 +602,124 @@ def decode_with_variants(img: Image.Image):
602
  do_decode(img.convert('RGB'))
603
  return results
604
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
605
  def find_barcode_boxes_and_info(img: Image.Image):
 
606
  decodes = decode_with_variants(img)
607
  boxes: List[Box] = []
608
  infos = []
@@ -695,8 +812,12 @@ def compare_pdfs(file_a, file_b):
695
  print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")
696
 
697
  if HAS_BARCODE:
698
- bar_a, info_a = find_barcode_boxes_and_info(a)
699
- bar_b, info_b = find_barcode_boxes_and_info(b)
 
 
 
 
700
  else:
701
  bar_a, info_a = [], []
702
  bar_b, info_b = [], []
 
602
  do_decode(img.convert('RGB'))
603
  return results
604
 
605
def extract_pdf_images(pdf_path: str, max_pages: int = 5, dpi: int = 300) -> List[Image.Image]:
    """Extract the images embedded in the leading pages of a PDF via PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.
        max_pages: Maximum number of leading pages to scan.
        dpi: Currently unused by this function; kept so existing callers
            that pass it remain valid.

    Returns:
        One PIL image per image reference found on the scanned pages, in
        page order. An empty list is returned when PyMuPDF is unavailable
        or when any step of the extraction raises.
    """
    if not HAS_PYMUPDF:
        return []

    images: List[Image.Image] = []
    try:
        doc = fitz.open(pdf_path)
        try:
            for page_num in range(min(len(doc), max_pages)):
                for img in doc[page_num].get_images():
                    xref = img[0]
                    pix = fitz.Pixmap(doc, xref)

                    # More than three colour components (e.g. CMYK) cannot be
                    # written as PPM, so convert to RGB first.
                    if pix.n - pix.alpha >= 4:
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    images.append(Image.open(io.BytesIO(pix.tobytes("ppm"))))
                    pix = None  # release the pixmap buffer promptly
        finally:
            # Close the document even when a page or pixmap operation fails.
            doc.close()
    except Exception:
        # Best-effort helper: any failure yields "no images".
        return []

    return images
642
+
643
def find_barcode_boxes_and_info_from_pdf(
    pdf_path: str,
    *,
    max_pages: int = 5,
    image_size: Optional[Tuple[int, int]] = None,
    dpi: int = 300
) -> Tuple[List[Box], List[dict]]:
    """Find barcodes/QR codes by rendering PDF pages and decoding them.

    Each of the first ``max_pages`` pages is rendered with PyMuPDF at
    ``dpi`` and passed to ``decode_with_variants``.

    Args:
        pdf_path: Path of the PDF file to open.
        max_pages: Maximum number of leading pages to scan.
        image_size: Optional ``(width, height)`` target size. When given,
            detected rectangles are rescaled from rendered-page pixels to
            this size and shifted down by ``page_num * height``.
            NOTE(review): this presumes ``image_size`` is the per-page size
            of a vertically stacked composite -- confirm against the caller.
        dpi: Resolution used to render each page.

    Returns:
        ``(boxes, infos)``: a ``Box`` per detected code and a parallel list
        of dicts with keys ``type``, ``data``, ``left``, ``top``, ``width``,
        ``height``, ``valid`` and ``page``. Both lists are empty when
        barcode support or PyMuPDF is unavailable, or when processing fails.
    """
    if not HAS_BARCODE or not HAS_PYMUPDF:
        return [], []

    boxes: List[Box] = []
    infos: List[dict] = []

    try:
        doc = fitz.open(pdf_path)
        try:
            zoom = dpi / 72  # PDF user space is 72 units per inch
            mat = fitz.Matrix(zoom, zoom)
            # Running vertical offset (in rendered pixels) of the current page
            # when no target image size is supplied.
            y_offset = 0

            for page_num in range(min(len(doc), max_pages)):
                page = doc[page_num]

                # Render the page to a raster image for barcode detection.
                pix = page.get_pixmap(matrix=mat)
                pil_img = Image.open(io.BytesIO(pix.tobytes("ppm")))

                for d in decode_with_variants(pil_img):
                    rect = d.rect

                    if image_size:
                        # Map rendered-page pixels into the caller's image space.
                        img_width, img_height = image_size
                        scale_x = img_width / pil_img.width
                        scale_y = img_height / pil_img.height

                        left = int(rect.left * scale_x)
                        top = int(rect.top * scale_y) + (page_num * img_height)
                        width = int(rect.width * scale_x)
                        height = int(rect.height * scale_y)
                    else:
                        # Stay in rendered-pixel space; offset by the real
                        # height of the preceding pages so boxes from
                        # different pages cannot overlap.
                        left = rect.left
                        top = rect.top + y_offset
                        width = rect.width
                        height = rect.height

                    boxes.append(Box(
                        y1=top,
                        x1=left,
                        y2=top + height,
                        x2=left + width,
                        area=width * height
                    ))

                    valid = validate_symbology(d.type, d.data)
                    infos.append({
                        'type': d.type,
                        'data': (d.data.decode('utf-8', errors='ignore') if isinstance(d.data, (bytes, bytearray)) else str(d.data)),
                        'left': left, 'top': top, 'width': width, 'height': height,
                        'valid': bool(valid),
                        'page': page_num
                    })

                y_offset += pil_img.height
        finally:
            # Close the document even when rendering or decoding fails.
            doc.close()

    except Exception as exc:
        # Best-effort: report the failure and return no detections. No
        # fallback detection is attempted here; that is left to the caller.
        print(f"PDF barcode detection failed: {exc}")
        return [], []

    return boxes, infos
720
+
721
  def find_barcode_boxes_and_info(img: Image.Image):
722
+ """Legacy barcode detection (kept for fallback)"""
723
  decodes = decode_with_variants(img)
724
  boxes: List[Box] = []
725
  infos = []
 
812
  print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")
813
 
814
  if HAS_BARCODE:
815
+ # Use PDF-based barcode detection instead of rasterized image
816
+ bar_a, info_a = find_barcode_boxes_and_info_from_pdf(file_a.name, image_size=image_size) if HAS_PYMUPDF else find_barcode_boxes_and_info(a)
817
+ bar_b, info_b = find_barcode_boxes_and_info_from_pdf(file_b.name, image_size=image_size) if HAS_PYMUPDF else find_barcode_boxes_and_info(b)
818
+
819
+ # Debug: Print barcode detection results
820
+ print(f"Barcode detection results - A: {len(bar_a)} codes, B: {len(bar_b)} codes")
821
  else:
822
  bar_a, info_a = [], []
823
  bar_b, info_b = [], []