Yaz Hobooti committed on
Commit
a75a85f
·
1 Parent(s): ef1e0b0

Enforce bottom-115mm exclusion for spellcheck and barcodes; fix indentation; compile clean

Browse files
Files changed (1) hide show
  1. pdf_comparator.py +74 -8
pdf_comparator.py CHANGED
@@ -679,6 +679,9 @@ def find_misspell_boxes_from_text(
679
  img_height = image_size[1]
680
  if _is_in_excluded_bottom_area(box, img_height) and not _contains_validation_text(text):
681
  continue
 
 
 
682
 
683
  boxes.append(box)
684
 
@@ -788,7 +791,11 @@ def find_misspell_boxes(
788
  continue
789
 
790
  # NOTE: adjust to match your Box constructor if needed
791
- boxes.append(Box(top, left, top + height, left + width, width * height))
 
 
 
 
792
 
793
  return boxes
794
 
@@ -978,7 +985,11 @@ def scan_pdf_barcodes(pdf_path: str, *, dpi_list=(900,1200), max_pages=10):
978
  pil=_pix_to_pil(pix)
979
  hits=_decode_variants(pil)
980
  for r in hits:
981
- boxes.append(Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"]))
 
 
 
 
982
  sym, payload = r["type"], r["data"]
983
  infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"embed:{ix+1}"})
984
  except Exception:
@@ -994,7 +1005,10 @@ def scan_pdf_barcodes(pdf_path: str, *, dpi_list=(900,1200), max_pages=10):
994
  pil=_pix_to_pil(pix)
995
  hits=_decode_variants(pil)
996
  for r in hits:
997
- boxes.append(Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"]))
 
 
 
998
  sym, payload = r["type"], r["data"]
999
  infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"page@{dpi}dpi"})
1000
  if any(i["page"]==page_idx+1 for i in infos):
@@ -1300,12 +1314,64 @@ def debug_scan_pdf(pdf_path: str, outdir: str = "barcode_debug", max_pages=2):
1300
  print("Open the PNGs and zoom in to check bar width. If narrow bars are <2px at 600 DPI, you need 900-1200 DPI.")
1301
 
1302
  def find_barcode_boxes_and_info_from_pdf(pdf_path: str, image_size: Optional[Tuple[int, int]] = None, max_pages: int = 10):
1303
- """Compatibility wrapper expected by callers.
1304
- Delegates to scan_pdf_barcodes; image_size is unused here but
1305
- kept to match previous signature.
1306
- Returns (boxes, infos).
 
 
 
1307
  """
1308
- return scan_pdf_barcodes(pdf_path, max_pages=max_pages)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1309
 
1310
  if __name__ == "__main__":
1311
  demo = create_demo()
 
679
  img_height = image_size[1]
680
  if _is_in_excluded_bottom_area(box, img_height) and not _contains_validation_text(text):
681
  continue
682
+ else:
683
+ if _is_in_excluded_bottom_area(box, ph):
684
+ continue
685
 
686
  boxes.append(box)
687
 
 
791
  continue
792
 
793
  # NOTE: adjust to match your Box constructor if needed
794
+ b = Box(top, left, top + height, left + width, width * height)
795
+ # Exclude bottom 115mm unless the text contains the validation phrase
796
+ if _is_in_excluded_bottom_area(b, img.height) and not _contains_validation_text(raw):
797
+ continue
798
+ boxes.append(b)
799
 
800
  return boxes
801
 
 
985
  pil=_pix_to_pil(pix)
986
  hits=_decode_variants(pil)
987
  for r in hits:
988
+ b = Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"])
989
+ # Exclude barcodes in the bottom 115mm of the page image
990
+ if _is_in_excluded_bottom_area(b, pil.height):
991
+ continue
992
+ boxes.append(b)
993
  sym, payload = r["type"], r["data"]
994
  infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"embed:{ix+1}"})
995
  except Exception:
 
1005
  pil=_pix_to_pil(pix)
1006
  hits=_decode_variants(pil)
1007
  for r in hits:
1008
+ b = Box(r["top"], r["left"], r["top"]+r["height"], r["left"]+r["width"], r["width"]*r["height"])
1009
+ if _is_in_excluded_bottom_area(b, pil.height):
1010
+ continue
1011
+ boxes.append(b)
1012
  sym, payload = r["type"], r["data"]
1013
  infos.append({**r, "valid": _validate(sym, payload), "page": page_idx+1, "source": f"page@{dpi}dpi"})
1014
  if any(i["page"]==page_idx+1 for i in infos):
 
1314
  print("Open the PNGs and zoom in to check bar width. If narrow bars are <2px at 600 DPI, you need 900-1200 DPI.")
1315
 
1316
  def find_barcode_boxes_and_info_from_pdf(pdf_path: str, image_size: Optional[Tuple[int, int]] = None, max_pages: int = 10):
1317
+ """Detect barcodes from the original PDF and return boxes in the same
1318
+ coordinate space as the combined display image.
1319
+
1320
+ If image_size is provided (w,h of the vertically combined display image),
1321
+ each page is rendered so its width matches w, then decoded. Box y-coordinates
1322
+ are offset by the cumulative height of previous pages so that all boxes map
1323
+ into the combined image space correctly.
1324
  """
1325
+ boxes: List[Box] = []
1326
+ infos: List[Dict[str, Any]] = []
1327
+ try:
1328
+ doc = fitz.open(pdf_path)
1329
+ num_pages = min(len(doc), max_pages)
1330
+ if num_pages == 0:
1331
+ return [], []
1332
+
1333
+ target_width = None
1334
+ if image_size:
1335
+ target_width = int(image_size[0])
1336
+
1337
+ y_offset = 0
1338
+ for page_idx in range(num_pages):
1339
+ page = doc[page_idx]
1340
+ # Compute scale so that rendered width matches target_width when provided
1341
+ if target_width:
1342
+ page_width_pts = float(page.rect.width) # points (72 dpi)
1343
+ scale = max(1.0, target_width / page_width_pts)
1344
+ else:
1345
+ # fallback dpi ~600
1346
+ scale = 600.0 / 72.0
1347
+ try:
1348
+ pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace=fitz.csGRAY, alpha=False)
1349
+ except TypeError:
1350
+ pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
1351
+ pil = _pix_to_pil(pix)
1352
+ pw, ph = pil.size
1353
+ hits = _decode_variants(pil)
1354
+ for r in hits:
1355
+ x1 = int(r.get("left", 0))
1356
+ y1 = int(r.get("top", 0)) + y_offset
1357
+ w = int(r.get("width", 0))
1358
+ h = int(r.get("height", 0))
1359
+ x2 = x1 + w
1360
+ y2 = y1 + h
1361
+ b = Box(y1, x1, y2, x2, w * h)
1362
+ # Exclude bottom 115mm for combined image if we know full height; else per-page
1363
+ if image_size and _is_in_excluded_bottom_area(b, image_size[1]):
1364
+ continue
1365
+ if not image_size and _is_in_excluded_bottom_area(b, ph):
1366
+ continue
1367
+ boxes.append(b)
1368
+ sym, payload = r.get("type", ""), r.get("data", "")
1369
+ infos.append({**r, "valid": _validate(sym, payload), "page": page_idx + 1, "source": f"page@scale{scale:.2f}"})
1370
+ y_offset += ph
1371
+ doc.close()
1372
+ except Exception:
1373
+ return [], []
1374
+ return boxes, infos
1375
 
1376
  if __name__ == "__main__":
1377
  demo = create_demo()