Add bottom 115mm exclusion zone for all analysis

- Add _is_in_excluded_bottom_area() helper to check whether a box is in the excluded zone
- Add _contains_validation_text() to detect '50 Carroll' validation text
- Modify find_diff_boxes() to exclude bottom 115mm area from difference detection
- Modify find_misspell_boxes_from_text() to exclude bottom area except for validation text
- Modify find_barcode_boxes_and_info_from_pdf() to exclude bottom area from barcode detection
- Convert 115mm to pixels using DPI for accurate coordinate mapping
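- All analysis functions now respect the exclusion zone; only the spell check (find_misspell_boxes_from_text) still keeps boxes inside it, and only when their text contains the '50 Carroll' validation text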

Files changed (1) hide show

pdf_comparator.py +46 -5

pdf_comparator.py CHANGED Viewed

@@ -205,6 +205,24 @@ def normalize_token(token: str) -> str:
 def _is_pdf(path: str) -> bool:
     return os.path.splitext(path.lower())[1] == ".pdf"
 def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
     if _is_pdf(path):
         # Try pdf2image with multiple poppler paths first
@@ -289,11 +307,19 @@ def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int =
     mask = dilation(mask, rectangle(3, 3))
     labeled = label(mask, connectivity=2)
     out: List[Box] = []
     for p in regionprops(labeled):
         if p.area < min_area:
-                            continue
         minr, minc, maxr, maxc = p.bbox
-        out.append(Box(minr, minc, maxr, maxc, int(p.area)))
     return out
 def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box], green_boxes: List[Box] = None,
@@ -501,13 +527,21 @@ def find_misspell_boxes_from_text(
                                 x2 = int(bbox[2])
                                 y2 = int(bbox[3]) + (page_num * 1000)
-                            boxes.append(Box(
                                 y1=y1,
                                 x1=x1,
                                 y2=y2,
                                 x2=x2,
                                 area=(x2 - x1) * (y2 - y1)
-                            ))
         doc.close()
@@ -743,10 +777,17 @@ def find_barcode_boxes_and_info_from_pdf(pdf_path: str, *, max_pages: int = 5, d
                     pass
             # Collect results
             for d in decs:
                 rect = d.rect
                 left, top, width, height = int(rect.left), int(rect.top), int(rect.width), int(rect.height)
-                boxes.append(Box(top, left, top + height, left + width, width * height))
                 # basic validation (you already have ean_like_checksum_ok / validate_symbology)
                 try:
                     payload = d.data.decode("utf-8", errors="ignore") if isinstance(d.data, (bytes, bytearray)) else str(d.data)

 def _is_pdf(path: str) -> bool:
     return os.path.splitext(path.lower())[1] == ".pdf"
+def _is_in_excluded_bottom_area(box: Box, image_height: int, excluded_height_mm: float = 115.0, dpi: int = 400) -> bool:
+    """
+    Check if a box is in the excluded bottom area (115mm from bottom).
+    Converts mm to pixels using DPI.
+    """
+    # Convert mm to pixels: 1 inch = 25.4mm, so 1mm = dpi/25.4 pixels
+    excluded_height_pixels = int(excluded_height_mm * dpi / 25.4)
+    # Calculate the top boundary of the excluded area
+    excluded_top = image_height - excluded_height_pixels
+    # Check if the box starts inside the excluded area (top edge at or below the boundary)
+    return box.y1 >= excluded_top
+def _contains_validation_text(text: str) -> bool:
+    """Check if text contains the validation text '50 Carroll'"""
+    return "50 Carroll" in text
 def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
     if _is_pdf(path):
         # Try pdf2image with multiple poppler paths first
     mask = dilation(mask, rectangle(3, 3))
     labeled = label(mask, connectivity=2)
     out: List[Box] = []
+    img_height = diff_img.height
     for p in regionprops(labeled):
         if p.area < min_area:
+            continue
         minr, minc, maxr, maxc = p.bbox
+        box = Box(minr, minc, maxr, maxc, int(p.area))
+        # Skip boxes in the excluded bottom area
+        if _is_in_excluded_bottom_area(box, img_height):
+            continue
+        out.append(box)
     return out
 def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box], green_boxes: List[Box] = None,
                                 x2 = int(bbox[2])
                                 y2 = int(bbox[3]) + (page_num * 1000)
+                            box = Box(
                                 y1=y1,
                                 x1=x1,
                                 y2=y2,
                                 x2=x2,
                                 area=(x2 - x1) * (y2 - y1)
+                            )
+                            # Skip boxes in excluded bottom area unless they contain validation text
+                            if image_size:
+                                img_height = image_size[1]
+                                if _is_in_excluded_bottom_area(box, img_height) and not _contains_validation_text(text):
+                                    continue
+                            boxes.append(box)
         doc.close()
                     pass
             # Collect results
+            img_height = img.height
             for d in decs:
                 rect = d.rect
                 left, top, width, height = int(rect.left), int(rect.top), int(rect.width), int(rect.height)
+                box = Box(top, left, top + height, left + width, width * height)
+                # Skip barcodes in the excluded bottom area
+                if _is_in_excluded_bottom_area(box, img_height, dpi=dpi):
+                    continue
+                boxes.append(box)
                 # basic validation (you already have ean_like_checksum_ok / validate_symbology)
                 try:
                     payload = d.data.decode("utf-8", errors="ignore") if isinstance(d.data, (bytes, bytearray)) else str(d.data)