Yaz Hobooti committed on
Commit
b050288
·
1 Parent(s): 8cec543

Fix barcode detection bugs and improve robustness

Browse files

- Fix decode_with_variants() unreachable code bug (results was defined after return)
- Add robust decode variants: grayscale, scale-up, rotations (90°, 180°, 270°)
- Use LANCZOS resampling for better quality in newer Pillow versions
- Improve UPC-A/EAN-13 validation with leading zero normalization
- Remove arbitrary coordinate offset (+ page_num * 1000) for cleaner positioning
- Add comprehensive error handling and fallbacks
- Enhanced barcode detection with multiple image processing variants

Files changed (1) hide show
  1. pdf_comparator.py +56 -8
pdf_comparator.py CHANGED
@@ -571,35 +571,82 @@ def ean_like_checksum_ok(digits: str) -> bool:
571
  return True
572
 
573
  def validate_symbology(symbology: str, data: bytes) -> bool:
 
574
  try:
575
  text = data.decode('utf-8', errors='ignore')
576
  except Exception:
577
  return False
 
578
  sym = (symbology or '').upper()
579
- if sym in ("EAN13","EAN-13","EAN8","EAN-8","UPCA","UPC-A"):
580
- return ean_like_checksum_ok(re.sub(r"\D", "", text))
581
- if sym in ("QRCODE","QRCODEMODEL2","QR-CODE"):
 
 
 
 
 
 
 
 
 
 
582
  return len(text) > 0
 
 
583
  return len(text) > 0
584
 
585
  def boxes_from_rect(x: int, y: int, w: int, h: int) -> Box:
586
  return Box(y, x, y + h, x + w, w * h)
587
 
588
  def decode_with_variants(img: Image.Image):
 
589
  if not HAS_BARCODE:
590
  return []
591
- results = []
 
 
592
  def do_decode(pil_img):
593
  try:
594
  dec = zbar_decode(pil_img)
595
- if dec: results.extend(dec)
 
596
  except Exception:
597
  pass
 
 
598
  do_decode(img)
599
- if not results: do_decode(img.convert('L'))
600
- if not results: do_decode(img.resize((img.width*2, img.height*2), Image.BICUBIC))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
601
  if not results and img.mode != 'RGB':
602
  do_decode(img.convert('RGB'))
 
 
 
 
 
 
 
 
 
 
 
 
603
  return results
604
 
605
  def extract_pdf_images(pdf_path: str, max_pages: int = 5, dpi: int = 300) -> List[Image.Image]:
@@ -688,8 +735,9 @@ def find_barcode_boxes_and_info_from_pdf(
688
  width = int(rect.width * scale_x)
689
  height = int(rect.height * scale_y)
690
  else:
 
691
  left = rect.left
692
- top = rect.top + (page_num * 1000)
693
  width = rect.width
694
  height = rect.height
695
 
 
571
  return True
572
 
573
  def validate_symbology(symbology: str, data: bytes) -> bool:
574
+ """Validate barcode symbology with improved UPC-A/EAN-13 handling"""
575
  try:
576
  text = data.decode('utf-8', errors='ignore')
577
  except Exception:
578
  return False
579
+
580
  sym = (symbology or '').upper()
581
+
582
+ if sym in ("EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A"):
583
+ # Extract digits only
584
+ digits = re.sub(r"\D", "", text)
585
+
586
+ # Handle UPC-A reported as EAN-13 with leading 0
587
+ if sym in ("EAN13", "EAN-13") and len(digits) == 13 and digits.startswith('0'):
588
+ # Remove leading 0 to get UPC-A format
589
+ digits = digits[1:]
590
+
591
+ return ean_like_checksum_ok(digits)
592
+
593
+ if sym in ("QRCODE", "QRCODEMODEL2", "QR-CODE"):
594
  return len(text) > 0
595
+
596
+ # Default validation for other symbologies
597
  return len(text) > 0
598
 
599
  def boxes_from_rect(x: int, y: int, w: int, h: int) -> Box:
600
  return Box(y, x, y + h, x + w, w * h)
601
 
602
  def decode_with_variants(img: Image.Image):
603
+ """Robust barcode/QR code detection with multiple variants"""
604
  if not HAS_BARCODE:
605
  return []
606
+
607
+ results = []
608
+
609
  def do_decode(pil_img):
610
  try:
611
  dec = zbar_decode(pil_img)
612
+ if dec:
613
+ results.extend(dec)
614
  except Exception:
615
  pass
616
+
617
+ # Try original image
618
  do_decode(img)
619
+
620
+ # Try grayscale conversion
621
+ if not results:
622
+ do_decode(img.convert('L'))
623
+
624
+ # Try scaling up (better for small barcodes)
625
+ if not results:
626
+ try:
627
+ # Use LANCZOS for better quality in newer Pillow versions
628
+ scaled_img = img.resize((img.width*2, img.height*2), Image.LANCZOS)
629
+ do_decode(scaled_img)
630
+ except AttributeError:
631
+ # Fallback for older Pillow versions
632
+ scaled_img = img.resize((img.width*2, img.height*2), Image.BICUBIC)
633
+ do_decode(scaled_img)
634
+
635
+ # Try different color modes
636
  if not results and img.mode != 'RGB':
637
  do_decode(img.convert('RGB'))
638
+
639
+ # Try rotated versions (common for QR codes)
640
+ if not results:
641
+ for angle in [90, 180, 270]:
642
+ try:
643
+ rotated = img.rotate(angle, expand=True)
644
+ do_decode(rotated)
645
+ if results:
646
+ break
647
+ except Exception:
648
+ continue
649
+
650
  return results
651
 
652
  def extract_pdf_images(pdf_path: str, max_pages: int = 5, dpi: int = 300) -> List[Image.Image]:
 
735
  width = int(rect.width * scale_x)
736
  height = int(rect.height * scale_y)
737
  else:
738
+ # Use PDF coordinates directly without arbitrary offset
739
  left = rect.left
740
+ top = rect.top
741
  width = rect.width
742
  height = rect.height
743