Yaz Hobooti
committed on
Commit
·
0fa89b4
1
Parent(s):
223bf48
Add bottom 115mm exclusion zone for all analysis
Browse files
- Add _is_in_excluded_bottom_area() helper to check if boxes are in excluded zone
- Add _contains_validation_text() to detect '50 Carroll' validation text
- Modify find_diff_boxes() to exclude bottom 115mm area from difference detection
- Modify find_misspell_boxes_from_text() to exclude bottom area except for validation text
- Modify find_barcode_boxes_and_info_from_pdf() to exclude bottom area from barcode detection
- Convert 115mm to pixels using DPI for accurate coordinate mapping
- All analysis functions now respect the exclusion zone except for '50 Carroll' validation text
- pdf_comparator.py +46 -5
pdf_comparator.py
CHANGED
|
@@ -205,6 +205,24 @@ def normalize_token(token: str) -> str:
|
|
| 205 |
def _is_pdf(path: str) -> bool:
|
| 206 |
return os.path.splitext(path.lower())[1] == ".pdf"
|
| 207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
|
| 209 |
if _is_pdf(path):
|
| 210 |
# Try pdf2image with multiple poppler paths first
|
|
@@ -289,11 +307,19 @@ def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int =
|
|
| 289 |
mask = dilation(mask, rectangle(3, 3))
|
| 290 |
labeled = label(mask, connectivity=2)
|
| 291 |
out: List[Box] = []
|
|
|
|
|
|
|
| 292 |
for p in regionprops(labeled):
|
| 293 |
if p.area < min_area:
|
| 294 |
-
|
| 295 |
minr, minc, maxr, maxc = p.bbox
|
| 296 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
return out
|
| 298 |
|
| 299 |
def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box], green_boxes: List[Box] = None,
|
|
@@ -501,13 +527,21 @@ def find_misspell_boxes_from_text(
|
|
| 501 |
x2 = int(bbox[2])
|
| 502 |
y2 = int(bbox[3]) + (page_num * 1000)
|
| 503 |
|
| 504 |
-
|
| 505 |
y1=y1,
|
| 506 |
x1=x1,
|
| 507 |
y2=y2,
|
| 508 |
x2=x2,
|
| 509 |
area=(x2 - x1) * (y2 - y1)
|
| 510 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
|
| 512 |
doc.close()
|
| 513 |
|
|
@@ -743,10 +777,17 @@ def find_barcode_boxes_and_info_from_pdf(pdf_path: str, *, max_pages: int = 5, d
|
|
| 743 |
pass
|
| 744 |
|
| 745 |
# Collect results
|
|
|
|
| 746 |
for d in decs:
|
| 747 |
rect = d.rect
|
| 748 |
left, top, width, height = int(rect.left), int(rect.top), int(rect.width), int(rect.height)
|
| 749 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 750 |
# basic validation (you already have ean_like_checksum_ok / validate_symbology)
|
| 751 |
try:
|
| 752 |
payload = d.data.decode("utf-8", errors="ignore") if isinstance(d.data, (bytes, bytearray)) else str(d.data)
|
|
|
|
| 205 |
def _is_pdf(path: str) -> bool:
|
| 206 |
return os.path.splitext(path.lower())[1] == ".pdf"
|
| 207 |
|
| 208 |
+
def _is_in_excluded_bottom_area(box: Box, image_height: int, excluded_height_mm: float = 115.0, dpi: int = 400) -> bool:
    """
    Report whether a box starts inside the excluded strip at the bottom of an image.

    The strip is ``excluded_height_mm`` tall, measured upward from the bottom
    edge of an image that is ``image_height`` pixels high. Millimetres are
    converted to pixels with ``dpi`` (25.4 mm per inch) and truncated to a
    whole number of pixels.

    Only the box's top edge (``box.y1``) is examined: the result is True when
    that edge lies at or below the strip's upper boundary. A box that starts
    above the boundary and merely extends down into the strip is reported as
    not excluded.
    """
    # Strip height in whole pixels; int() truncates toward zero.
    strip_height_px = int(excluded_height_mm * dpi / 25.4)

    # Row at which the excluded strip begins (negative when the strip is
    # taller than the image, in which case every non-negative y1 qualifies).
    strip_start_row = image_height - strip_height_px

    return box.y1 >= strip_start_row
|
| 221 |
+
|
| 222 |
+
def _contains_validation_text(text: str) -> bool:
    """Return True when ``text`` includes the exact, case-sensitive marker '50 Carroll'."""
    validation_marker = "50 Carroll"
    return validation_marker in text
|
| 225 |
+
|
| 226 |
def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
|
| 227 |
if _is_pdf(path):
|
| 228 |
# Try pdf2image with multiple poppler paths first
|
|
|
|
| 307 |
mask = dilation(mask, rectangle(3, 3))
|
| 308 |
labeled = label(mask, connectivity=2)
|
| 309 |
out: List[Box] = []
|
| 310 |
+
img_height = diff_img.height
|
| 311 |
+
|
| 312 |
for p in regionprops(labeled):
|
| 313 |
if p.area < min_area:
|
| 314 |
+
continue
|
| 315 |
minr, minc, maxr, maxc = p.bbox
|
| 316 |
+
box = Box(minr, minc, maxr, maxc, int(p.area))
|
| 317 |
+
|
| 318 |
+
# Skip boxes in the excluded bottom area
|
| 319 |
+
if _is_in_excluded_bottom_area(box, img_height):
|
| 320 |
+
continue
|
| 321 |
+
|
| 322 |
+
out.append(box)
|
| 323 |
return out
|
| 324 |
|
| 325 |
def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box], green_boxes: List[Box] = None,
|
|
|
|
| 527 |
x2 = int(bbox[2])
|
| 528 |
y2 = int(bbox[3]) + (page_num * 1000)
|
| 529 |
|
| 530 |
+
box = Box(
|
| 531 |
y1=y1,
|
| 532 |
x1=x1,
|
| 533 |
y2=y2,
|
| 534 |
x2=x2,
|
| 535 |
area=(x2 - x1) * (y2 - y1)
|
| 536 |
+
)
|
| 537 |
+
|
| 538 |
+
# Skip boxes in excluded bottom area unless they contain validation text
|
| 539 |
+
if image_size:
|
| 540 |
+
img_height = image_size[1]
|
| 541 |
+
if _is_in_excluded_bottom_area(box, img_height) and not _contains_validation_text(text):
|
| 542 |
+
continue
|
| 543 |
+
|
| 544 |
+
boxes.append(box)
|
| 545 |
|
| 546 |
doc.close()
|
| 547 |
|
|
|
|
| 777 |
pass
|
| 778 |
|
| 779 |
# Collect results
|
| 780 |
+
img_height = img.height
|
| 781 |
for d in decs:
|
| 782 |
rect = d.rect
|
| 783 |
left, top, width, height = int(rect.left), int(rect.top), int(rect.width), int(rect.height)
|
| 784 |
+
box = Box(top, left, top + height, left + width, width * height)
|
| 785 |
+
|
| 786 |
+
# Skip barcodes in the excluded bottom area
|
| 787 |
+
if _is_in_excluded_bottom_area(box, img_height, dpi=dpi):
|
| 788 |
+
continue
|
| 789 |
+
|
| 790 |
+
boxes.append(box)
|
| 791 |
# basic validation (you already have ean_like_checksum_ok / validate_symbology)
|
| 792 |
try:
|
| 793 |
payload = d.data.decode("utf-8", errors="ignore") if isinstance(d.data, (bytes, bytearray)) else str(d.data)
|