Yaz Hobooti
committed on
Commit
·
bae9f7f
1
Parent(s):
d066e77
Fix PDF processing: add poppler dev package and better error handling
Browse files
- apt.txt +1 -0
- pdf_comparator.py +29 -18
apt.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
| 1 |
poppler-utils
|
| 2 |
tesseract-ocr
|
| 3 |
libzbar0
|
|
|
|
|
|
| 1 |
poppler-utils
|
| 2 |
tesseract-ocr
|
| 3 |
libzbar0
|
| 4 |
+
libpoppler-cpp-dev
|
pdf_comparator.py
CHANGED
|
@@ -49,10 +49,21 @@ def _is_pdf(path: str) -> bool:
|
|
| 49 |
|
| 50 |
def load_first_page(path: str, dpi: int = 300) -> Image.Image:
|
| 51 |
if _is_pdf(path):
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
return Image.open(path).convert("RGB")
|
| 57 |
|
| 58 |
def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
|
|
@@ -73,7 +84,7 @@ def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int =
|
|
| 73 |
out: List[Box] = []
|
| 74 |
for p in regionprops(labeled):
|
| 75 |
if p.area < min_area:
|
| 76 |
-
|
| 77 |
minr, minc, maxr, maxc = p.bbox
|
| 78 |
out.append(Box(minr, minc, maxr, maxc, int(p.area)))
|
| 79 |
return out
|
|
@@ -120,25 +131,25 @@ def find_misspell_boxes(img: Image.Image) -> List[Box]:
|
|
| 120 |
try:
|
| 121 |
spell = SpellChecker()
|
| 122 |
data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
boxes: List[Box] = []
|
| 127 |
-
|
| 128 |
text = data["text"][i]
|
| 129 |
if not text:
|
| 130 |
-
|
| 131 |
token = normalize_token(text)
|
| 132 |
if len(token) < 2:
|
| 133 |
-
|
| 134 |
if token in spell:
|
| 135 |
-
|
| 136 |
left = data.get("left", [0])[i]
|
| 137 |
top = data.get("top", [0])[i]
|
| 138 |
width = data.get("width", [0])[i]
|
| 139 |
height= data.get("height",[0])[i]
|
| 140 |
if width <= 0 or height <= 0:
|
| 141 |
-
|
| 142 |
boxes.append(Box(top, left, top+height, left+width, width*height))
|
| 143 |
return boxes
|
| 144 |
|
|
@@ -182,7 +193,7 @@ def boxes_from_rect(x: int, y: int, w: int, h: int) -> Box:
|
|
| 182 |
def decode_with_variants(img: Image.Image):
|
| 183 |
if not HAS_BARCODE:
|
| 184 |
return []
|
| 185 |
-
|
| 186 |
def do_decode(pil_img):
|
| 187 |
try:
|
| 188 |
dec = zbar_decode(pil_img)
|
|
@@ -194,8 +205,8 @@ def decode_with_variants(img: Image.Image):
|
|
| 194 |
if not results: do_decode(img.resize((img.width*2, img.height*2), Image.BICUBIC))
|
| 195 |
if not results and img.mode != 'RGB':
|
| 196 |
do_decode(img.convert('RGB'))
|
| 197 |
-
|
| 198 |
-
|
| 199 |
def find_barcode_boxes_and_info(img: Image.Image):
|
| 200 |
decodes = decode_with_variants(img)
|
| 201 |
boxes: List[Box] = []
|
|
@@ -281,7 +292,7 @@ def compare_pdfs(file_a, file_b):
|
|
| 281 |
if HAS_BARCODE:
|
| 282 |
bar_a, info_a = find_barcode_boxes_and_info(a)
|
| 283 |
bar_b, info_b = find_barcode_boxes_and_info(b)
|
| 284 |
-
|
| 285 |
bar_a, info_a = [], []
|
| 286 |
bar_b, info_b = [], []
|
| 287 |
|
|
@@ -322,7 +333,7 @@ def compare_pdfs(file_a, file_b):
|
|
| 322 |
|
| 323 |
return overlay, a_disp, b_disp, status, codes_a, codes_b
|
| 324 |
|
| 325 |
-
|
| 326 |
error_msg = f"❌ **Error:** {str(e)}"
|
| 327 |
return None, None, None, error_msg, [], []
|
| 328 |
|
|
|
|
| 49 |
|
| 50 |
def load_first_page(path: str, dpi: int = 300) -> Image.Image:
|
| 51 |
if _is_pdf(path):
|
| 52 |
+
try:
|
| 53 |
+
# Try with poppler_path explicitly set
|
| 54 |
+
imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=1, poppler_path="/usr/bin")
|
| 55 |
+
if not imgs:
|
| 56 |
+
raise ValueError(f"No pages in PDF: {path}")
|
| 57 |
+
return imgs[0].convert("RGB")
|
| 58 |
+
except Exception as e1:
|
| 59 |
+
try:
|
| 60 |
+
# Fallback: try without explicit poppler_path
|
| 61 |
+
imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=1)
|
| 62 |
+
if not imgs:
|
| 63 |
+
raise ValueError(f"No pages in PDF: {path}")
|
| 64 |
+
return imgs[0].convert("RGB")
|
| 65 |
+
except Exception as e2:
|
| 66 |
+
raise ValueError(f"Failed to convert PDF to image. Error 1: {str(e1)}. Error 2: {str(e2)}. Make sure poppler-utils is installed.")
|
| 67 |
return Image.open(path).convert("RGB")
|
| 68 |
|
| 69 |
def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
|
|
|
|
| 84 |
out: List[Box] = []
|
| 85 |
for p in regionprops(labeled):
|
| 86 |
if p.area < min_area:
|
| 87 |
+
continue
|
| 88 |
minr, minc, maxr, maxc = p.bbox
|
| 89 |
out.append(Box(minr, minc, maxr, maxc, int(p.area)))
|
| 90 |
return out
|
|
|
|
| 131 |
try:
|
| 132 |
spell = SpellChecker()
|
| 133 |
data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
|
| 134 |
+
except Exception:
|
| 135 |
+
return []
|
| 136 |
+
n = len(data.get("text", []))
|
| 137 |
boxes: List[Box] = []
|
| 138 |
+
for i in range(n):
|
| 139 |
text = data["text"][i]
|
| 140 |
if not text:
|
| 141 |
+
continue
|
| 142 |
token = normalize_token(text)
|
| 143 |
if len(token) < 2:
|
| 144 |
+
continue
|
| 145 |
if token in spell:
|
| 146 |
+
continue
|
| 147 |
left = data.get("left", [0])[i]
|
| 148 |
top = data.get("top", [0])[i]
|
| 149 |
width = data.get("width", [0])[i]
|
| 150 |
height= data.get("height",[0])[i]
|
| 151 |
if width <= 0 or height <= 0:
|
| 152 |
+
continue
|
| 153 |
boxes.append(Box(top, left, top+height, left+width, width*height))
|
| 154 |
return boxes
|
| 155 |
|
|
|
|
| 193 |
def decode_with_variants(img: Image.Image):
|
| 194 |
if not HAS_BARCODE:
|
| 195 |
return []
|
| 196 |
+
results = []
|
| 197 |
def do_decode(pil_img):
|
| 198 |
try:
|
| 199 |
dec = zbar_decode(pil_img)
|
|
|
|
| 205 |
if not results: do_decode(img.resize((img.width*2, img.height*2), Image.BICUBIC))
|
| 206 |
if not results and img.mode != 'RGB':
|
| 207 |
do_decode(img.convert('RGB'))
|
| 208 |
+
return results
|
| 209 |
+
|
| 210 |
def find_barcode_boxes_and_info(img: Image.Image):
|
| 211 |
decodes = decode_with_variants(img)
|
| 212 |
boxes: List[Box] = []
|
|
|
|
| 292 |
if HAS_BARCODE:
|
| 293 |
bar_a, info_a = find_barcode_boxes_and_info(a)
|
| 294 |
bar_b, info_b = find_barcode_boxes_and_info(b)
|
| 295 |
+
else:
|
| 296 |
bar_a, info_a = [], []
|
| 297 |
bar_b, info_b = [], []
|
| 298 |
|
|
|
|
| 333 |
|
| 334 |
return overlay, a_disp, b_disp, status, codes_a, codes_b
|
| 335 |
|
| 336 |
+
except Exception as e:
|
| 337 |
error_msg = f"❌ **Error:** {str(e)}"
|
| 338 |
return None, None, None, error_msg, [], []
|
| 339 |
|