Yaz Hobooti
committed on
Commit
·
8cec543
1
Parent(s):
47f3477
Implement PDF-based barcode/QR code detection
Browse files
- Add extract_pdf_images() function to extract images directly from PDF using PyMuPDF
- Add find_barcode_boxes_and_info_from_pdf() function for PDF-based barcode detection
- Update compare_pdfs() to use PDF-based barcode detection instead of rasterized images
- Add proper coordinate mapping from PDF space to image space
- Maintain fallback to legacy rasterized image detection
- Add debug output for barcode detection results
- Barcode detection now operates on original PDF for better accuracy and performance
- pdf_comparator.py +123 -2
pdf_comparator.py
CHANGED
|
@@ -602,7 +602,124 @@ def decode_with_variants(img: Image.Image):
|
|
| 602 |
do_decode(img.convert('RGB'))
|
| 603 |
return results
|
| 604 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 605 |
def find_barcode_boxes_and_info(img: Image.Image):
|
|
|
|
| 606 |
decodes = decode_with_variants(img)
|
| 607 |
boxes: List[Box] = []
|
| 608 |
infos = []
|
|
@@ -695,8 +812,12 @@ def compare_pdfs(file_a, file_b):
|
|
| 695 |
print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")
|
| 696 |
|
| 697 |
if HAS_BARCODE:
|
| 698 |
-
|
| 699 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 700 |
else:
|
| 701 |
bar_a, info_a = [], []
|
| 702 |
bar_b, info_b = [], []
|
|
|
|
| 602 |
do_decode(img.convert('RGB'))
|
| 603 |
return results
|
| 604 |
|
| 605 |
+
def extract_pdf_images(pdf_path: str, max_pages: int = 5, dpi: int = 300) -> List[Image.Image]:
    """Extract the images embedded in the first pages of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.
        max_pages: Maximum number of leading pages to scan for images.
        dpi: Currently unused; embedded images are returned as stored in the
            PDF. Kept so existing callers that pass it keep working.

    Returns:
        One PIL image per embedded image found on the scanned pages, in page
        order. An empty list is returned when PyMuPDF is unavailable or when
        any error occurs while opening or reading the PDF (best effort).
    """
    if not HAS_PYMUPDF:
        return []

    try:
        # The context manager closes the document even when an image fails to
        # decode part-way through, so the file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            images = []
            for page_num in range(min(len(doc), max_pages)):
                for img in doc[page_num].get_images():
                    # First item of each entry is the image's xref number.
                    pix = fitz.Pixmap(doc, img[0])
                    # Four or more colour components (e.g. CMYK) cannot be
                    # written as PPM directly: convert to RGB first.
                    if pix.n - pix.alpha >= 4:
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    images.append(Image.open(io.BytesIO(pix.tobytes("ppm"))))
            return images
    except Exception:
        # Best effort: callers treat an empty list as "nothing extracted".
        return []
|
| 642 |
+
|
| 643 |
+
def find_barcode_boxes_and_info_from_pdf(
    pdf_path: str,
    *,
    max_pages: int = 5,
    image_size: Optional[Tuple[int, int]] = None,
    dpi: int = 300
) -> Tuple[List[Box], List[dict]]:
    """Find barcodes/QR codes by rendering PDF pages and decoding them.

    Each of the first ``max_pages`` pages is rendered with PyMuPDF at ``dpi``
    and passed to ``decode_with_variants``. Detected rectangles are reported
    in a single coordinate space in which pages are stacked vertically.

    Args:
        pdf_path: Path of the PDF file to open.
        max_pages: Maximum number of leading pages to scan.
        image_size: Optional ``(width, height)`` of one page in the target
            image. When given, coordinates are rescaled from the rendered
            page to this size and page ``n`` is offset by ``n * height``.
            NOTE(review): this assumes the caller's image stacks equally
            sized pages vertically - confirm against compare_pdfs.
            When omitted, coordinates stay in rendered pixels and each page
            is offset by the summed heights of the pages rendered before it.
        dpi: Resolution used to render each page before decoding.

    Returns:
        ``(boxes, infos)``: a ``Box`` per detected code and a parallel list of
        dicts with keys ``type``, ``data``, ``left``, ``top``, ``width``,
        ``height``, ``valid`` and ``page``. Both lists are empty when barcode
        support or PyMuPDF is unavailable, or when PDF processing fails.
    """
    if not HAS_BARCODE or not HAS_PYMUPDF:
        return [], []

    boxes: List[Box] = []
    infos: List[dict] = []

    try:
        # The context manager guarantees the document is closed even when
        # rendering or decoding raises.
        with fitz.open(pdf_path) as doc:
            zoom = dpi / 72  # PDF user space is 72 units per inch
            mat = fitz.Matrix(zoom, zoom)
            # Running vertical offset (rendered pixels) used when no
            # image_size is supplied.
            rendered_offset = 0

            for page_num in range(min(len(doc), max_pages)):
                pix = doc[page_num].get_pixmap(matrix=mat)
                pil_img = Image.open(io.BytesIO(pix.tobytes("ppm")))

                for d in decode_with_variants(pil_img):
                    rect = d.rect

                    if image_size:
                        # Map rendered-page pixels into the caller's image.
                        img_width, img_height = image_size
                        scale_x = img_width / pil_img.width
                        scale_y = img_height / pil_img.height

                        left = int(rect.left * scale_x)
                        top = int(rect.top * scale_y) + (page_num * img_height)
                        width = int(rect.width * scale_x)
                        height = int(rect.height * scale_y)
                    else:
                        # Offset by the real rendered height of the preceding
                        # pages so boxes from different pages never overlap.
                        left = rect.left
                        top = rect.top + rendered_offset
                        width = rect.width
                        height = rect.height

                    boxes.append(Box(
                        y1=top,
                        x1=left,
                        y2=top + height,
                        x2=left + width,
                        area=width * height
                    ))

                    valid = validate_symbology(d.type, d.data)
                    infos.append({
                        'type': d.type,
                        'data': (d.data.decode('utf-8', errors='ignore') if isinstance(d.data, (bytes, bytearray)) else str(d.data)),
                        'left': left, 'top': top, 'width': width, 'height': height,
                        'valid': bool(valid),
                        'page': page_num
                    })

                rendered_offset += pil_img.height

    except Exception:
        # Best effort: report "nothing found" when PDF processing fails. No
        # fallback happens here; the caller decides whether to use the legacy
        # image-based detection instead.
        return [], []

    return boxes, infos
|
| 720 |
+
|
| 721 |
def find_barcode_boxes_and_info(img: Image.Image):
|
| 722 |
+
"""Legacy barcode detection (kept for fallback)"""
|
| 723 |
decodes = decode_with_variants(img)
|
| 724 |
boxes: List[Box] = []
|
| 725 |
infos = []
|
|
|
|
| 812 |
print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")
|
| 813 |
|
| 814 |
if HAS_BARCODE:
|
| 815 |
+
# Use PDF-based barcode detection instead of rasterized image
|
| 816 |
+
bar_a, info_a = find_barcode_boxes_and_info_from_pdf(file_a.name, image_size=image_size) if HAS_PYMUPDF else find_barcode_boxes_and_info(a)
|
| 817 |
+
bar_b, info_b = find_barcode_boxes_and_info_from_pdf(file_b.name, image_size=image_size) if HAS_PYMUPDF else find_barcode_boxes_and_info(b)
|
| 818 |
+
|
| 819 |
+
# Debug: Print barcode detection results
|
| 820 |
+
print(f"Barcode detection results - A: {len(bar_a)} codes, B: {len(bar_b)} codes")
|
| 821 |
else:
|
| 822 |
bar_a, info_a = [], []
|
| 823 |
bar_b, info_b = [], []
|