Yaz Hobooti committed on
Commit
0fa89b4
·
1 Parent(s): 223bf48

Add bottom 115mm exclusion zone for all analysis

Browse files

- Add _is_in_excluded_bottom_area() helper to check if boxes are in excluded zone
- Add _contains_validation_text() to detect '50 Carroll' validation text
- Modify find_diff_boxes() to exclude bottom 115mm area from difference detection
- Modify find_misspell_boxes_from_text() to exclude bottom area except for validation text
- Modify find_barcode_boxes_and_info_from_pdf() to exclude bottom area from barcode detection
- Convert 115mm to pixels using DPI for accurate coordinate mapping
- All analysis functions now respect the exclusion zone; only the misspelling check keeps boxes in the zone when their text contains the '50 Carroll' validation text

Files changed (1) hide show
  1. pdf_comparator.py +46 -5
pdf_comparator.py CHANGED
@@ -205,6 +205,24 @@ def normalize_token(token: str) -> str:
205
  def _is_pdf(path: str) -> bool:
206
  return os.path.splitext(path.lower())[1] == ".pdf"
207
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
209
  if _is_pdf(path):
210
  # Try pdf2image with multiple poppler paths first
@@ -289,11 +307,19 @@ def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int =
289
  mask = dilation(mask, rectangle(3, 3))
290
  labeled = label(mask, connectivity=2)
291
  out: List[Box] = []
 
 
292
  for p in regionprops(labeled):
293
  if p.area < min_area:
294
- continue
295
  minr, minc, maxr, maxc = p.bbox
296
- out.append(Box(minr, minc, maxr, maxc, int(p.area)))
 
 
 
 
 
 
297
  return out
298
 
299
  def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box], green_boxes: List[Box] = None,
@@ -501,13 +527,21 @@ def find_misspell_boxes_from_text(
501
  x2 = int(bbox[2])
502
  y2 = int(bbox[3]) + (page_num * 1000)
503
 
504
- boxes.append(Box(
505
  y1=y1,
506
  x1=x1,
507
  y2=y2,
508
  x2=x2,
509
  area=(x2 - x1) * (y2 - y1)
510
- ))
 
 
 
 
 
 
 
 
511
 
512
  doc.close()
513
 
@@ -743,10 +777,17 @@ def find_barcode_boxes_and_info_from_pdf(pdf_path: str, *, max_pages: int = 5, d
743
  pass
744
 
745
  # Collect results
 
746
  for d in decs:
747
  rect = d.rect
748
  left, top, width, height = int(rect.left), int(rect.top), int(rect.width), int(rect.height)
749
- boxes.append(Box(top, left, top + height, left + width, width * height))
 
 
 
 
 
 
750
  # basic validation (you already have ean_like_checksum_ok / validate_symbology)
751
  try:
752
  payload = d.data.decode("utf-8", errors="ignore") if isinstance(d.data, (bytes, bytearray)) else str(d.data)
 
205
  def _is_pdf(path: str) -> bool:
206
  return os.path.splitext(path.lower())[1] == ".pdf"
207
 
208
def _is_in_excluded_bottom_area(box: "Box", image_height: int, excluded_height_mm: float = 115.0, dpi: int = 400) -> bool:
    """Return True when *box* starts inside the excluded strip at the bottom of the image.

    The strip is ``excluded_height_mm`` millimetres tall (115 mm by default),
    measured upwards from the bottom edge of an image ``image_height`` pixels
    tall, with millimetres converted to pixels via ``dpi``.

    Only ``box.y1`` is consulted: the box counts as excluded when ``y1`` is at
    or below the strip's upper boundary. A box that starts above the boundary
    and merely extends down into the strip is NOT excluded, so this is a
    "starts inside" test rather than an intersection test.

    Args:
        box: Object exposing a ``y1`` attribute in pixels (presumably the
            top edge of the box; confirm against the ``Box`` definition).
        image_height: Height of the analysed image in pixels.
        excluded_height_mm: Height of the excluded strip in millimetres.
        dpi: Dots per inch used for the mm-to-pixel conversion. It should
            match the resolution the image was rendered at, otherwise the
            strip height in pixels will be wrong.

    Returns:
        True if ``box.y1`` lies at or below the top of the excluded strip.
    """
    # 1 inch = 25.4 mm, so 1 mm = dpi / 25.4 pixels; int() drops the
    # fractional pixel.
    excluded_height_pixels = int(excluded_height_mm * dpi / 25.4)

    # Row at which the excluded strip begins. This goes negative when the
    # strip is taller than the image, in which case every box is excluded.
    excluded_top = image_height - excluded_height_pixels

    return box.y1 >= excluded_top
221
+
222
def _contains_validation_text(text: str, phrase: str = "50 Carroll") -> bool:
    """Return True when *text* contains the validation phrase.

    The check is a plain, case-sensitive substring test; whitespace inside
    the phrase must match exactly.

    Args:
        text: Text to inspect. ``None`` or an empty string is treated as
            containing nothing.
        phrase: Substring to look for. Defaults to ``"50 Carroll"``.

    Returns:
        True if ``phrase`` occurs in ``text``, otherwise False.
    """
    # Guard first: ``phrase in None`` would raise TypeError.
    if not text:
        return False
    return phrase in text
225
+
226
  def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
227
  if _is_pdf(path):
228
  # Try pdf2image with multiple poppler paths first
 
307
  mask = dilation(mask, rectangle(3, 3))
308
  labeled = label(mask, connectivity=2)
309
  out: List[Box] = []
310
+ img_height = diff_img.height
311
+
312
  for p in regionprops(labeled):
313
  if p.area < min_area:
314
+ continue
315
  minr, minc, maxr, maxc = p.bbox
316
+ box = Box(minr, minc, maxr, maxc, int(p.area))
317
+
318
+ # Skip boxes in the excluded bottom area
319
+ if _is_in_excluded_bottom_area(box, img_height):
320
+ continue
321
+
322
+ out.append(box)
323
  return out
324
 
325
  def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box], green_boxes: List[Box] = None,
 
527
  x2 = int(bbox[2])
528
  y2 = int(bbox[3]) + (page_num * 1000)
529
 
530
+ box = Box(
531
  y1=y1,
532
  x1=x1,
533
  y2=y2,
534
  x2=x2,
535
  area=(x2 - x1) * (y2 - y1)
536
+ )
537
+
538
+ # Skip boxes in excluded bottom area unless they contain validation text
539
+ if image_size:
540
+ img_height = image_size[1]
541
+ if _is_in_excluded_bottom_area(box, img_height) and not _contains_validation_text(text):
542
+ continue
543
+
544
+ boxes.append(box)
545
 
546
  doc.close()
547
 
 
777
  pass
778
 
779
  # Collect results
780
+ img_height = img.height
781
  for d in decs:
782
  rect = d.rect
783
  left, top, width, height = int(rect.left), int(rect.top), int(rect.width), int(rect.height)
784
+ box = Box(top, left, top + height, left + width, width * height)
785
+
786
+ # Skip barcodes in the excluded bottom area
787
+ if _is_in_excluded_bottom_area(box, img_height, dpi=dpi):
788
+ continue
789
+
790
+ boxes.append(box)
791
  # basic validation (you already have ean_like_checksum_ok / validate_symbology)
792
  try:
793
  payload = d.data.decode("utf-8", errors="ignore") if isinstance(d.data, (bytes, bytearray)) else str(d.data)