Yaz Hobooti
committed on
Commit
·
7981251
1
Parent(s):
a75a85f
Fix: revert to working version with proper indentation
Browse files - pdf_comparator.py +8 -74
pdf_comparator.py
CHANGED
|
@@ -679,9 +679,6 @@ def find_misspell_boxes_from_text(
|
|
| 679 |
img_height = image_size[1]
|
| 680 |
if _is_in_excluded_bottom_area(box, img_height) and not _contains_validation_text(text):
|
| 681 |
continue
|
| 682 |
-
else:
|
| 683 |
-
if _is_in_excluded_bottom_area(box, ph):
|
| 684 |
-
continue
|
| 685 |
|
| 686 |
boxes.append(box)
|
| 687 |
|
|
@@ -791,11 +788,7 @@ def find_misspell_boxes(
|
|
| 791 |
continue
|
| 792 |
|
| 793 |
# NOTE: adjust to match your Box constructor if needed
|
| 794 |
-
|
| 795 |
-
# Exclude bottom 115mm unless the text contains the validation phrase
|
| 796 |
-
if _is_in_excluded_bottom_area(b, img.height) and not _contains_validation_text(raw):
|
| 797 |
-
continue
|
| 798 |
-
boxes.append(b)
|
| 799 |
|
| 800 |
return boxes
|
| 801 |
|
|
@@ -985,11 +978,7 @@ def scan_pdf_barcodes(pdf_path: str, *, dpi_list=(900,1200), max_pages=10):
|
|
| 985 |
pil=_pix_to_pil(pix)
|
| 986 |
hits=_decode_variants(pil)
|
| 987 |
for r in hits:
|
| 988 |
-
|
| 989 |
-
# Exclude barcodes in the bottom 115mm of the page image
|
| 990 |
-
if _is_in_excluded_bottom_area(b, pil.height):
|
| 991 |
-
continue
|
| 992 |
-
boxes.append(b)
|
| 993 |
sym, payload = r["type"], r["data"]
|
| 994 |
infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"embed:{ix+1}"})
|
| 995 |
except Exception:
|
|
@@ -1005,10 +994,7 @@ def scan_pdf_barcodes(pdf_path: str, *, dpi_list=(900,1200), max_pages=10):
|
|
| 1005 |
pil=_pix_to_pil(pix)
|
| 1006 |
hits=_decode_variants(pil)
|
| 1007 |
for r in hits:
|
| 1008 |
-
|
| 1009 |
-
if _is_in_excluded_bottom_area(b, pil.height):
|
| 1010 |
-
continue
|
| 1011 |
-
boxes.append(b)
|
| 1012 |
sym, payload = r["type"], r["data"]
|
| 1013 |
infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"page@{dpi}dpi"})
|
| 1014 |
if any(i["page"]==page_idx+1 for i in infos):
|
|
@@ -1314,64 +1300,12 @@ def debug_scan_pdf(pdf_path: str, outdir: str = "barcode_debug", max_pages=2):
|
|
| 1314 |
print("Open the PNGs and zoom in to check bar width. If narrow bars are <2px at 600 DPI, you need 900-1200 DPI.")
|
| 1315 |
|
| 1316 |
def find_barcode_boxes_and_info_from_pdf(pdf_path: str, image_size: Optional[Tuple[int, int]] = None, max_pages: int = 10):
|
| 1317 |
-
"""
|
| 1318 |
-
|
| 1319 |
-
|
| 1320 |
-
|
| 1321 |
-
each page is rendered so its width matches w, then decoded. Box y-coordinates
|
| 1322 |
-
are offset by the cumulative height of previous pages so that all boxes map
|
| 1323 |
-
into the combined image space correctly.
|
| 1324 |
"""
|
| 1325 |
-
|
| 1326 |
-
infos: List[Dict[str, Any]] = []
|
| 1327 |
-
try:
|
| 1328 |
-
doc = fitz.open(pdf_path)
|
| 1329 |
-
num_pages = min(len(doc), max_pages)
|
| 1330 |
-
if num_pages == 0:
|
| 1331 |
-
return [], []
|
| 1332 |
-
|
| 1333 |
-
target_width = None
|
| 1334 |
-
if image_size:
|
| 1335 |
-
target_width = int(image_size[0])
|
| 1336 |
-
|
| 1337 |
-
y_offset = 0
|
| 1338 |
-
for page_idx in range(num_pages):
|
| 1339 |
-
page = doc[page_idx]
|
| 1340 |
-
# Compute scale so that rendered width matches target_width when provided
|
| 1341 |
-
if target_width:
|
| 1342 |
-
page_width_pts = float(page.rect.width) # points (72 dpi)
|
| 1343 |
-
scale = max(1.0, target_width / page_width_pts)
|
| 1344 |
-
else:
|
| 1345 |
-
# fallback dpi ~600
|
| 1346 |
-
scale = 600.0 / 72.0
|
| 1347 |
-
try:
|
| 1348 |
-
pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace=fitz.csGRAY, alpha=False)
|
| 1349 |
-
except TypeError:
|
| 1350 |
-
pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
|
| 1351 |
-
pil = _pix_to_pil(pix)
|
| 1352 |
-
pw, ph = pil.size
|
| 1353 |
-
hits = _decode_variants(pil)
|
| 1354 |
-
for r in hits:
|
| 1355 |
-
x1 = int(r.get("left", 0))
|
| 1356 |
-
y1 = int(r.get("top", 0)) + y_offset
|
| 1357 |
-
w = int(r.get("width", 0))
|
| 1358 |
-
h = int(r.get("height", 0))
|
| 1359 |
-
x2 = x1 + w
|
| 1360 |
-
y2 = y1 + h
|
| 1361 |
-
b = Box(y1, x1, y2, x2, w * h)
|
| 1362 |
-
# Exclude bottom 115mm for combined image if we know full height; else per-page
|
| 1363 |
-
if image_size and _is_in_excluded_bottom_area(b, image_size[1]):
|
| 1364 |
-
continue
|
| 1365 |
-
if not image_size and _is_in_excluded_bottom_area(b, ph):
|
| 1366 |
-
continue
|
| 1367 |
-
boxes.append(b)
|
| 1368 |
-
sym, payload = r.get("type", ""), r.get("data", "")
|
| 1369 |
-
infos.append({**r, "valid": _validate(sym, payload), "page": page_idx + 1, "source": f"page@scale{scale:.2f}"})
|
| 1370 |
-
y_offset += ph
|
| 1371 |
-
doc.close()
|
| 1372 |
-
except Exception:
|
| 1373 |
-
return [], []
|
| 1374 |
-
return boxes, infos
|
| 1375 |
|
| 1376 |
if __name__ == "__main__":
|
| 1377 |
demo = create_demo()
|
|
|
|
| 679 |
img_height = image_size[1]
|
| 680 |
if _is_in_excluded_bottom_area(box, img_height) and not _contains_validation_text(text):
|
| 681 |
continue
|
|
|
|
|
|
|
|
|
|
| 682 |
|
| 683 |
boxes.append(box)
|
| 684 |
|
|
|
|
| 788 |
continue
|
| 789 |
|
| 790 |
# NOTE: adjust to match your Box constructor if needed
|
| 791 |
+
boxes.append(Box(top, left, top + height, left + width, width * height))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 792 |
|
| 793 |
return boxes
|
| 794 |
|
|
|
|
| 978 |
pil=_pix_to_pil(pix)
|
| 979 |
hits=_decode_variants(pil)
|
| 980 |
for r in hits:
|
| 981 |
+
boxes.append(Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"]))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 982 |
sym, payload = r["type"], r["data"]
|
| 983 |
infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"embed:{ix+1}"})
|
| 984 |
except Exception:
|
|
|
|
| 994 |
pil=_pix_to_pil(pix)
|
| 995 |
hits=_decode_variants(pil)
|
| 996 |
for r in hits:
|
| 997 |
+
boxes.append(Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"]))
|
|
|
|
|
|
|
|
|
|
| 998 |
sym, payload = r["type"], r["data"]
|
| 999 |
infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"page@{dpi}dpi"})
|
| 1000 |
if any(i["page"]==page_idx+1 for i in infos):
|
|
|
|
| 1300 |
print("Open the PNGs and zoom in to check bar width. If narrow bars are <2px at 600 DPI, you need 900-1200 DPI.")
|
| 1301 |
|
| 1302 |
def find_barcode_boxes_and_info_from_pdf(pdf_path: str, image_size: Optional[Tuple[int, int]] = None, max_pages: int = 10):
|
| 1303 |
+
"""Compatibility wrapper expected by callers.
|
| 1304 |
+
Delegates to scan_pdf_barcodes; image_size is unused here but
|
| 1305 |
+
kept to match previous signature.
|
| 1306 |
+
Returns (boxes, infos).
|
|
|
|
|
|
|
|
|
|
| 1307 |
"""
|
| 1308 |
+
return scan_pdf_barcodes(pdf_path, max_pages=max_pages)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1309 |
|
| 1310 |
if __name__ == "__main__":
|
| 1311 |
demo = create_demo()
|