Yaz Hobooti
committed on
Commit
·
b050288
1
Parent(s):
8cec543
Fix barcode detection bugs and improve robustness
Browse files
- Fix decode_with_variants() unreachable code bug (`results` was defined after the return)
- Add robust decode variants: grayscale, scale-up, rotations (90°, 180°, 270°)
- Use LANCZOS resampling for better quality in newer Pillow versions
- Improve UPC-A/EAN-13 validation with leading zero normalization
- Remove arbitrary coordinate offset (+ page_num * 1000) for cleaner positioning
- Add comprehensive error handling and fallbacks
- Enhanced barcode detection with multiple image processing variants
- pdf_comparator.py +56 -8
pdf_comparator.py
CHANGED
|
@@ -571,35 +571,82 @@ def ean_like_checksum_ok(digits: str) -> bool:
|
|
| 571 |
return True
|
| 572 |
|
| 573 |
def validate_symbology(symbology: str, data: bytes) -> bool:
|
|
|
|
| 574 |
try:
|
| 575 |
text = data.decode('utf-8', errors='ignore')
|
| 576 |
except Exception:
|
| 577 |
return False
|
|
|
|
| 578 |
sym = (symbology or '').upper()
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 582 |
return len(text) > 0
|
|
|
|
|
|
|
| 583 |
return len(text) > 0
|
| 584 |
|
| 585 |
def boxes_from_rect(x: int, y: int, w: int, h: int) -> Box:
    """Build a ``Box`` from a rectangle given as origin ``(x, y)`` plus size.

    The ``Box`` is constructed positionally as
    ``(y, x, y + h, x + w, w * h)``. The field names of ``Box`` are not
    visible here; presumably (top, left, bottom, right, area) -- confirm
    against the ``Box`` definition.
    """
    far_y = y + h
    far_x = x + w
    area = w * h
    return Box(y, x, far_y, far_x, area)
|
| 587 |
|
| 588 |
def decode_with_variants(img: Image.Image):
|
|
|
|
| 589 |
if not HAS_BARCODE:
|
| 590 |
return []
|
| 591 |
-
|
|
|
|
|
|
|
| 592 |
def do_decode(pil_img):
|
| 593 |
try:
|
| 594 |
dec = zbar_decode(pil_img)
|
| 595 |
-
if dec:
|
|
|
|
| 596 |
except Exception:
|
| 597 |
pass
|
|
|
|
|
|
|
| 598 |
do_decode(img)
|
| 599 |
-
|
| 600 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
if not results and img.mode != 'RGB':
|
| 602 |
do_decode(img.convert('RGB'))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 603 |
return results
|
| 604 |
|
| 605 |
def extract_pdf_images(pdf_path: str, max_pages: int = 5, dpi: int = 300) -> List[Image.Image]:
|
|
@@ -688,8 +735,9 @@ def find_barcode_boxes_and_info_from_pdf(
|
|
| 688 |
width = int(rect.width * scale_x)
|
| 689 |
height = int(rect.height * scale_y)
|
| 690 |
else:
|
|
|
|
| 691 |
left = rect.left
|
| 692 |
-
top = rect.top
|
| 693 |
width = rect.width
|
| 694 |
height = rect.height
|
| 695 |
|
|
|
|
| 571 |
return True
|
| 572 |
|
| 573 |
def validate_symbology(symbology: str, data: bytes) -> bool:
    """Check that a decoded barcode payload is plausible for its symbology.

    Args:
        symbology: Symbology name as reported by the decoder; compared
            case-insensitively. ``None`` or empty is treated as "other".
        data: Raw payload bytes. Decoded as UTF-8 with undecodable bytes
            dropped; anything that cannot be decoded at all yields ``False``.

    Returns:
        For EAN-13 / EAN-8 / UPC-A: the result of ``ean_like_checksum_ok``
        on the payload's digits. For every other symbology (QR codes
        included): ``True`` when the decoded text is non-empty.
    """
    try:
        payload = data.decode('utf-8', errors='ignore')
    except Exception:
        return False

    kind = (symbology or '').upper()

    # Everything that is not a retail (EAN/UPC) code only needs some content.
    if kind not in ("EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A"):
        return len(payload) > 0

    # Keep only the digits of the payload for the checksum test.
    numeric = re.sub(r"\D", "", payload)

    # A UPC-A code may be reported as EAN-13 with a leading 0; drop that 0
    # so the 12-digit UPC-A form is what gets checked.
    reported_as_ean13 = kind in ("EAN13", "EAN-13")
    if reported_as_ean13 and len(numeric) == 13 and numeric.startswith('0'):
        numeric = numeric[1:]

    return ean_like_checksum_ok(numeric)
|
| 598 |
|
| 599 |
def boxes_from_rect(x: int, y: int, w: int, h: int) -> Box:
    """Build a ``Box`` from a rectangle given as origin ``(x, y)`` plus size.

    The ``Box`` is constructed positionally as
    ``(y, x, y + h, x + w, w * h)``. The field names of ``Box`` are not
    visible here; presumably (top, left, bottom, right, area) -- confirm
    against the ``Box`` definition.
    """
    far_y = y + h
    far_x = x + w
    area = w * h
    return Box(y, x, far_y, far_x, area)
|
| 601 |
|
| 602 |
def decode_with_variants(img: Image.Image):
    """Decode barcodes/QR codes from ``img``, retrying on transformed copies.

    Variants are tried in order and the first one that yields any result
    wins: the image as given, a grayscale copy, a 2x upscaled copy, an RGB
    copy (only when ``img`` is not already RGB), then copies rotated by
    90, 180 and 270 degrees.

    Results found on the upscaled or rotated copies have their ``rect`` and
    ``polygon`` translated back into the coordinate frame of ``img``, so
    callers that read ``rect.left`` / ``rect.top`` / ``rect.width`` /
    ``rect.height`` get positions relative to the image they passed in.

    Args:
        img: Image to scan.

    Returns:
        A list of decoder results (empty when ``HAS_BARCODE`` is false or
        nothing was found). Every variant is best-effort: an exception while
        building or decoding one variant just moves on to the next.
    """
    if not HAS_BARCODE:
        return []

    src_w, src_h = img.width, img.height

    def to_source(symbol, point_to_source):
        """Return ``symbol`` with its geometry mapped back into ``img`` space."""
        if point_to_source is None:
            return symbol
        try:
            rect = symbol.rect
            # Map two opposite corners, then rebuild an axis-aligned rect;
            # this works for both the scaling and the right-angle rotations.
            x0, y0 = point_to_source(rect.left, rect.top)
            x1, y1 = point_to_source(
                rect.left + rect.width, rect.top + rect.height
            )
            updates = {
                'rect': type(rect)(
                    int(round(min(x0, x1))),
                    int(round(min(y0, y1))),
                    int(round(abs(x1 - x0))),
                    int(round(abs(y1 - y0))),
                )
            }
            polygon = getattr(symbol, 'polygon', None)
            if polygon:
                updates['polygon'] = [
                    type(pt)(
                        *(int(round(v)) for v in point_to_source(pt.x, pt.y))
                    )
                    for pt in polygon
                ]
            return symbol._replace(**updates)
        except (AttributeError, TypeError):
            # NOTE(review): assumes zbar_decode returns pyzbar-style
            # namedtuples (rect: left/top/width/height, polygon: x/y).
            # If the result has another shape, hand it back untouched
            # rather than dropping the detection.
            return symbol

    # Older Pillow releases may lack LANCZOS; BICUBIC is the fallback.
    resample = getattr(Image, 'LANCZOS', Image.BICUBIC)

    # Each entry: (factory for the image to scan, mapping from that image's
    # coordinates back to ``img`` coordinates, or None when they coincide).
    # Factories are lazy so later variants cost nothing once one succeeds.
    variants = [
        (lambda: img, None),
        (lambda: img.convert('L'), None),
        # Upscaling helps with small barcodes; halve coordinates afterwards.
        (
            lambda: img.resize((src_w * 2, src_h * 2), resample),
            lambda x, y: (x / 2, y / 2),
        ),
    ]
    if img.mode != 'RGB':
        variants.append((lambda: img.convert('RGB'), None))
    # Image.rotate turns counter-clockwise; with expand=True the canvas of
    # a 90/270 degree rotation has the source's width and height swapped.
    variants.extend([
        (
            lambda: img.rotate(90, expand=True),
            lambda x, y: (src_w - y, x),
        ),
        (
            lambda: img.rotate(180, expand=True),
            lambda x, y: (src_w - x, src_h - y),
        ),
        (
            lambda: img.rotate(270, expand=True),
            lambda x, y: (y, src_h - x),
        ),
    ])

    for make_image, point_to_source in variants:
        try:
            decoded = zbar_decode(make_image())
        except Exception:
            # Deliberately best-effort: a failing variant must not prevent
            # the remaining ones from being tried.
            continue
        if decoded:
            return [to_source(symbol, point_to_source) for symbol in decoded]

    return []
|
| 651 |
|
| 652 |
def extract_pdf_images(pdf_path: str, max_pages: int = 5, dpi: int = 300) -> List[Image.Image]:
|
|
|
|
| 735 |
width = int(rect.width * scale_x)
|
| 736 |
height = int(rect.height * scale_y)
|
| 737 |
else:
|
| 738 |
+
# Use PDF coordinates directly without arbitrary offset
|
| 739 |
left = rect.left
|
| 740 |
+
top = rect.top
|
| 741 |
width = rect.width
|
| 742 |
height = rect.height
|
| 743 |
|