Implement PDF-based barcode/QR code detection

- Add extract_pdf_images() function to extract images directly from PDF using PyMuPDF
- Add find_barcode_boxes_and_info_from_pdf() function for PDF-based barcode detection
- Update compare_pdfs() to use PDF-based barcode detection instead of rasterized images
- Add coordinate mapping from rendered-page pixel space to the comparison image space
- Keep the legacy rasterized-image detection as a fallback when PyMuPDF is unavailable
- Add debug output for barcode detection results
- Barcode detection now renders each page directly from the original PDF (300 DPI by default) for better accuracy and performance

Files changed (1) hide show

pdf_comparator.py +123 -2

pdf_comparator.py CHANGED Viewed

@@ -602,7 +602,124 @@ def decode_with_variants(img: Image.Image):
         do_decode(img.convert('RGB'))
     return results
 def find_barcode_boxes_and_info(img: Image.Image):
     decodes = decode_with_variants(img)
     boxes: List[Box] = []
     infos = []
@@ -695,8 +812,12 @@ def compare_pdfs(file_a, file_b):
         print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")
         if HAS_BARCODE:
-            bar_a, info_a = find_barcode_boxes_and_info(a)
-            bar_b, info_b = find_barcode_boxes_and_info(b)
         else:
             bar_a, info_a = [], []
             bar_b, info_b = [], []

         do_decode(img.convert('RGB'))
     return results
+def extract_pdf_images(pdf_path: str, max_pages: int = 5, dpi: int = 300) -> List[Image.Image]:
+    """Extract the images referenced by a PDF's pages as PIL images.
+
+    Args:
+        pdf_path: Path of the PDF file to open with PyMuPDF.
+        max_pages: Upper bound on the number of leading pages scanned.
+        dpi: Not used by this function; kept so callers that pass it
+            keep working.
+
+    Returns:
+        One PIL image per image entry found on the scanned pages, in page
+        order. An empty list is returned when PyMuPDF is unavailable or
+        when opening/decoding fails (best effort: errors are not raised).
+    """
+    if not HAS_PYMUPDF:
+        return []
+    try:
+        doc = fitz.open(pdf_path)
+    except Exception:
+        # Deliberate best effort: an unreadable PDF yields no images.
+        return []
+    images: List[Image.Image] = []
+    try:
+        for page_num in range(min(len(doc), max_pages)):
+            for img in doc[page_num].get_images():
+                # The first item of each image entry is its xref.
+                pix = fitz.Pixmap(doc, img[0])
+                # Four or more colour channels (CMYK): convert to RGB so
+                # the data can be serialised as PPM.
+                if pix.n - pix.alpha >= 4:
+                    pix = fitz.Pixmap(fitz.csRGB, pix)
+                images.append(Image.open(io.BytesIO(pix.tobytes("ppm"))))
+        return images
+    except Exception:
+        # Same all-or-nothing contract as before: any failure discards
+        # the partial result.
+        return []
+    finally:
+        # Close the document on every path, including failures.
+        doc.close()
+def find_barcode_boxes_and_info_from_pdf(
+    pdf_path: str,
+    *,
+    max_pages: int = 5,
+    image_size: Optional[Tuple[int, int]] = None,
+    dpi: int = 300
+) -> Tuple[List[Box], List[dict]]:
+    """Find barcodes/QR codes on the rendered pages of a PDF.
+
+    Each of the first ``max_pages`` pages is rendered with PyMuPDF at
+    ``dpi`` and passed to ``decode_with_variants``.
+
+    Args:
+        pdf_path: Path of the PDF file to open.
+        max_pages: Upper bound on the number of leading pages scanned.
+        image_size: Optional ``(width, height)``. When given, detected
+            rectangles are rescaled from the rendered page to this size
+            and page ``n`` is shifted down by ``n * height``.
+            NOTE(review): presumably the per-page size of the comparison
+            image -- confirm against the caller.
+        dpi: Resolution used to render each page.
+
+    Returns:
+        ``(boxes, infos)``: one ``Box`` and one info dict per decoded
+        symbol. Both lists are empty when barcode support or PyMuPDF is
+        unavailable, or when any step of the PDF processing fails.
+    """
+    if not (HAS_BARCODE and HAS_PYMUPDF):
+        return [], []
+    boxes: List[Box] = []
+    infos: List[dict] = []
+    try:
+        doc = fitz.open(pdf_path)
+    except Exception:
+        # Deliberate best effort: an unreadable PDF yields no detections.
+        return [], []
+    try:
+        # PDF user space is 72 units per inch, so dpi / 72 is the zoom.
+        zoom = dpi / 72
+        mat = fitz.Matrix(zoom, zoom)
+        # Running vertical offset for the unscaled case: pages are stacked
+        # by their real rendered height so boxes of different pages can
+        # never overlap.
+        native_offset = 0
+        for page_num in range(min(len(doc), max_pages)):
+            pix = doc[page_num].get_pixmap(matrix=mat)
+            pil_img = Image.open(io.BytesIO(pix.tobytes("ppm")))
+            if image_size:
+                img_width, img_height = image_size
+                scale_x = img_width / pil_img.width
+                scale_y = img_height / pil_img.height
+                page_offset = page_num * img_height
+            else:
+                page_offset = native_offset
+            for d in decode_with_variants(pil_img):
+                rect = d.rect
+                if image_size:
+                    left = int(rect.left * scale_x)
+                    top = int(rect.top * scale_y) + page_offset
+                    width = int(rect.width * scale_x)
+                    height = int(rect.height * scale_y)
+                else:
+                    left = rect.left
+                    top = rect.top + page_offset
+                    width = rect.width
+                    height = rect.height
+                boxes.append(Box(
+                    y1=top,
+                    x1=left,
+                    y2=top + height,
+                    x2=left + width,
+                    area=width * height
+                ))
+                valid = validate_symbology(d.type, d.data)
+                infos.append({
+                    'type': d.type,
+                    'data': (d.data.decode('utf-8', errors='ignore') if isinstance(d.data, (bytes, bytearray)) else str(d.data)),
+                    'left': left, 'top': top, 'width': width, 'height': height,
+                    'valid': bool(valid),
+                    'page': page_num
+                })
+            native_offset += pil_img.height
+    except Exception:
+        # No fallback happens here: any failure discards the partial
+        # result and reports "nothing found".
+        return [], []
+    finally:
+        # Close the document on every path, including failures.
+        doc.close()
+    return boxes, infos
 def find_barcode_boxes_and_info(img: Image.Image):
+    """Legacy barcode detection (kept for fallback)"""
     decodes = decode_with_variants(img)
     boxes: List[Box] = []
     infos = []
         print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")
         if HAS_BARCODE:
+            # Use PDF-based barcode detection instead of rasterized image
+            bar_a, info_a = find_barcode_boxes_and_info_from_pdf(file_a.name, image_size=image_size) if HAS_PYMUPDF else find_barcode_boxes_and_info(a)
+            bar_b, info_b = find_barcode_boxes_and_info_from_pdf(file_b.name, image_size=image_size) if HAS_PYMUPDF else find_barcode_boxes_and_info(b)
+            # Debug: Print barcode detection results
+            print(f"Barcode detection results - A: {len(bar_a)} codes, B: {len(bar_b)} codes")
         else:
             bar_a, info_a = [], []
             bar_b, info_b = [], []