Yaz Hobooti committed on
Commit
bae9f7f
·
1 Parent(s): d066e77

Fix PDF processing: add poppler dev package and better error handling

Browse files
Files changed (2) hide show
  1. apt.txt +1 -0
  2. pdf_comparator.py +29 -18
apt.txt CHANGED
@@ -1,3 +1,4 @@
1
  poppler-utils
2
  tesseract-ocr
3
  libzbar0
 
 
1
  poppler-utils
2
  tesseract-ocr
3
  libzbar0
4
+ libpoppler-cpp-dev
pdf_comparator.py CHANGED
@@ -49,10 +49,21 @@ def _is_pdf(path: str) -> bool:
49
 
50
  def load_first_page(path: str, dpi: int = 300) -> Image.Image:
51
  if _is_pdf(path):
52
- imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=1)
53
- if not imgs:
54
- raise ValueError(f"No pages in PDF: {path}")
55
- return imgs[0].convert("RGB")
 
 
 
 
 
 
 
 
 
 
 
56
  return Image.open(path).convert("RGB")
57
 
58
  def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
@@ -73,7 +84,7 @@ def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int =
73
  out: List[Box] = []
74
  for p in regionprops(labeled):
75
  if p.area < min_area:
76
- continue
77
  minr, minc, maxr, maxc = p.bbox
78
  out.append(Box(minr, minc, maxr, maxc, int(p.area)))
79
  return out
@@ -120,25 +131,25 @@ def find_misspell_boxes(img: Image.Image) -> List[Box]:
120
  try:
121
  spell = SpellChecker()
122
  data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
123
- except Exception:
124
- return []
125
- n = len(data.get("text", []))
126
  boxes: List[Box] = []
127
- for i in range(n):
128
  text = data["text"][i]
129
  if not text:
130
- continue
131
  token = normalize_token(text)
132
  if len(token) < 2:
133
- continue
134
  if token in spell:
135
- continue
136
  left = data.get("left", [0])[i]
137
  top = data.get("top", [0])[i]
138
  width = data.get("width", [0])[i]
139
  height= data.get("height",[0])[i]
140
  if width <= 0 or height <= 0:
141
- continue
142
  boxes.append(Box(top, left, top+height, left+width, width*height))
143
  return boxes
144
 
@@ -182,7 +193,7 @@ def boxes_from_rect(x: int, y: int, w: int, h: int) -> Box:
182
  def decode_with_variants(img: Image.Image):
183
  if not HAS_BARCODE:
184
  return []
185
- results = []
186
  def do_decode(pil_img):
187
  try:
188
  dec = zbar_decode(pil_img)
@@ -194,8 +205,8 @@ def decode_with_variants(img: Image.Image):
194
  if not results: do_decode(img.resize((img.width*2, img.height*2), Image.BICUBIC))
195
  if not results and img.mode != 'RGB':
196
  do_decode(img.convert('RGB'))
197
- return results
198
-
199
  def find_barcode_boxes_and_info(img: Image.Image):
200
  decodes = decode_with_variants(img)
201
  boxes: List[Box] = []
@@ -281,7 +292,7 @@ def compare_pdfs(file_a, file_b):
281
  if HAS_BARCODE:
282
  bar_a, info_a = find_barcode_boxes_and_info(a)
283
  bar_b, info_b = find_barcode_boxes_and_info(b)
284
- else:
285
  bar_a, info_a = [], []
286
  bar_b, info_b = [], []
287
 
@@ -322,7 +333,7 @@ def compare_pdfs(file_a, file_b):
322
 
323
  return overlay, a_disp, b_disp, status, codes_a, codes_b
324
 
325
- except Exception as e:
326
  error_msg = f"❌ **Error:** {str(e)}"
327
  return None, None, None, error_msg, [], []
328
 
 
49
 
50
  def load_first_page(path: str, dpi: int = 300) -> Image.Image:
51
  if _is_pdf(path):
52
+ try:
53
+ # Try with poppler_path explicitly set
54
+ imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=1, poppler_path="/usr/bin")
55
+ if not imgs:
56
+ raise ValueError(f"No pages in PDF: {path}")
57
+ return imgs[0].convert("RGB")
58
+ except Exception as e1:
59
+ try:
60
+ # Fallback: try without explicit poppler_path
61
+ imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=1)
62
+ if not imgs:
63
+ raise ValueError(f"No pages in PDF: {path}")
64
+ return imgs[0].convert("RGB")
65
+ except Exception as e2:
66
+ raise ValueError(f"Failed to convert PDF to image. Error 1: {str(e1)}. Error 2: {str(e2)}. Make sure poppler-utils is installed.")
67
  return Image.open(path).convert("RGB")
68
 
69
  def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
 
84
  out: List[Box] = []
85
  for p in regionprops(labeled):
86
  if p.area < min_area:
87
+ continue
88
  minr, minc, maxr, maxc = p.bbox
89
  out.append(Box(minr, minc, maxr, maxc, int(p.area)))
90
  return out
 
131
  try:
132
  spell = SpellChecker()
133
  data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
134
+ except Exception:
135
+ return []
136
+ n = len(data.get("text", []))
137
  boxes: List[Box] = []
138
+ for i in range(n):
139
  text = data["text"][i]
140
  if not text:
141
+ continue
142
  token = normalize_token(text)
143
  if len(token) < 2:
144
+ continue
145
  if token in spell:
146
+ continue
147
  left = data.get("left", [0])[i]
148
  top = data.get("top", [0])[i]
149
  width = data.get("width", [0])[i]
150
  height= data.get("height",[0])[i]
151
  if width <= 0 or height <= 0:
152
+ continue
153
  boxes.append(Box(top, left, top+height, left+width, width*height))
154
  return boxes
155
 
 
193
  def decode_with_variants(img: Image.Image):
194
  if not HAS_BARCODE:
195
  return []
196
+ results = []
197
  def do_decode(pil_img):
198
  try:
199
  dec = zbar_decode(pil_img)
 
205
  if not results: do_decode(img.resize((img.width*2, img.height*2), Image.BICUBIC))
206
  if not results and img.mode != 'RGB':
207
  do_decode(img.convert('RGB'))
208
+ return results
209
+
210
  def find_barcode_boxes_and_info(img: Image.Image):
211
  decodes = decode_with_variants(img)
212
  boxes: List[Box] = []
 
292
  if HAS_BARCODE:
293
  bar_a, info_a = find_barcode_boxes_and_info(a)
294
  bar_b, info_b = find_barcode_boxes_and_info(b)
295
+ else:
296
  bar_a, info_a = [], []
297
  bar_b, info_b = [], []
298
 
 
333
 
334
  return overlay, a_disp, b_disp, status, codes_a, codes_b
335
 
336
+ except Exception as e:
337
  error_msg = f"❌ **Error:** {str(e)}"
338
  return None, None, None, error_msg, [], []
339