heerjtdev committed on
Commit
2665493
·
verified ·
1 Parent(s): e12f847

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -171
app.py CHANGED
@@ -1,6 +1,9 @@
1
 
2
  import base64
3
  from PIL import Image
 
 
 
4
 
5
 
6
 
@@ -41,7 +44,7 @@ logging.basicConfig(level=logging.WARNING)
41
  WEIGHTS_PATH = 'best.pt'
42
  SCALE_FACTOR = 2.0
43
  # OUTPUT_DIR = "yolo_extracted_regions"
44
- OUTPUT_DIR = os.path.join(tempfile.gettempdir(), "yolo_extracted_regions")
45
 
46
 
47
 
@@ -155,53 +158,92 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
155
  return img
156
 
157
 
 
 
158
  def run_yolo_detection_and_count(
159
  image: np.ndarray, model: YOLO, page_num: int
160
- ) -> Tuple[int, int]:
161
- """
162
- Runs YOLO inference, applies NMS/filtering, and updates global counters.
163
- Returns page counts only.
164
- """
165
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
166
-
167
  yolo_detections = []
168
  page_equations = 0
169
  page_figures = 0
170
-
 
171
  try:
172
  results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
173
-
174
  if results and results[0].boxes:
175
  for box in results[0].boxes.data.tolist():
176
  x1, y1, x2, y2, conf, cls_id = box
177
  cls_name = model.names[int(cls_id)]
178
-
179
  if cls_name in TARGET_CLASSES:
180
  yolo_detections.append({
181
- 'coords': (x1, y1, x2, y2),
182
- 'class': cls_name,
183
  'conf': conf
184
  })
185
  except Exception as e:
186
  logging.error(f"YOLO inference failed on page {page_num}: {e}")
187
- return 0, 0
188
 
189
- # Apply NMS/Merging/Filtering
190
  merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
191
  final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
192
 
193
- # Update Global Counters
194
  for det in final_detections:
195
- if det['class'] == 'figure':
196
- GLOBAL_FIGURE_COUNT += 1
197
- page_figures += 1
198
- elif det['class'] == 'equation':
199
  GLOBAL_EQUATION_COUNT += 1
200
  page_equations += 1
201
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  logging.warning(f" -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
203
- return page_equations, page_figures
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
 
 
205
 
206
 
207
 
@@ -242,132 +284,11 @@ def extract_images_from_page_in_memory(page) -> Dict[str, str]:
242
 
243
 
244
 
245
-
246
- def crop_and_convert_to_base64(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> str:
247
- """
248
- Crop bounding box from image and return as base64 string.
249
- """
250
- x1, y1, x2, y2 = map(int, bbox)
251
- h, w, _ = image.shape
252
-
253
- # Clamp to image bounds
254
- x1 = max(0, x1)
255
- y1 = max(0, y1)
256
- x2 = min(w, x2)
257
- y2 = min(h, y2)
258
-
259
- crop = image[y1:y2, x1:x2]
260
-
261
- # Convert to PNG
262
- _, buffer = cv2.imencode(".png", crop)
263
- b64 = base64.b64encode(buffer).decode("utf-8")
264
-
265
- return f"data:image/png;base64,{b64}"
266
-
267
-
268
-
269
-
270
- def crop_and_save(image: np.ndarray, bbox, label: str, index: int) -> str:
271
- """Crop bounding box and save to disk. Return file path."""
272
- x1, y1, x2, y2 = map(int, bbox)
273
-
274
- h, w, _ = image.shape
275
- x1 = max(0, x1)
276
- y1 = max(0, y1)
277
- x2 = min(w, x2)
278
- y2 = min(h, y2)
279
-
280
- crop = image[y1:y2, x1:x2]
281
- filename = f"{label}{index}.png"
282
- filepath = os.path.join(OUTPUT_DIR, filename)
283
-
284
- cv2.imwrite(filepath, crop)
285
-
286
- return filepath
287
-
288
-
289
-
290
-
291
-
292
-
293
-
294
- def run_yolo_detection_and_count(
295
- image: np.ndarray, model: YOLO, page_num: int
296
- ) -> Tuple[int, int, List[str]]:
297
- """
298
- Runs YOLO inference, saves crops, and returns file paths.
299
- """
300
- global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
301
-
302
- yolo_detections = []
303
- page_equations = 0
304
- page_figures = 0
305
- saved_images = []
306
-
307
- try:
308
- results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
309
-
310
- if results and results[0].boxes:
311
- for box in results[0].boxes.data.tolist():
312
- x1, y1, x2, y2, conf, cls_id = box
313
- cls_name = model.names[int(cls_id)]
314
-
315
- if cls_name in TARGET_CLASSES:
316
- yolo_detections.append({
317
- 'coords': (x1, y1, x2, y2),
318
- 'class': cls_name,
319
- 'conf': conf
320
- })
321
- except Exception as e:
322
- logging.error(f"YOLO inference failed on page {page_num}: {e}")
323
- return 0, 0, []
324
-
325
- merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
326
- final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
327
-
328
- for det in final_detections:
329
- bbox = det["coords"]
330
-
331
- if det["class"] == "equation":
332
- GLOBAL_EQUATION_COUNT += 1
333
- page_equations += 1
334
-
335
- path = crop_and_save(image, bbox, "EQUATION", GLOBAL_EQUATION_COUNT)
336
- saved_images.append(path)
337
-
338
- elif det["class"] == "figure":
339
- GLOBAL_FIGURE_COUNT += 1
340
- page_figures += 1
341
-
342
- path = crop_and_save(image, bbox, "FIGURE", GLOBAL_FIGURE_COUNT)
343
- saved_images.append(path)
344
-
345
- logging.warning(f" -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
346
- return page_equations, page_figures, saved_images
347
-
348
-
349
- def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], pdf_doc) -> List[Dict[str, Any]]:
350
- print("\n" + "="*80)
351
- print("--- IN-MEMORY IMAGE + EQUATION TO LATEX PIPELINE ---")
352
- print("="*80)
353
-
354
- if not structured_data:
355
- return []
356
-
357
- # Build global image map from all pages (in memory only)
358
- full_image_lookup = {}
359
-
360
- for page_index in range(len(pdf_doc)):
361
- page = pdf_doc[page_index]
362
- page_images = extract_images_from_page_in_memory(page)
363
-
364
- for tag, base64_img in page_images.items():
365
- full_image_lookup[tag] = base64_img
366
-
367
- print(f" -> Found {len(full_image_lookup)} total in-memory images.")
368
-
369
  tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
370
- final_structured_data = []
 
 
371
 
372
  for item in structured_data:
373
  text_fields = [
@@ -377,41 +298,52 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], pdf_
377
  ]
378
 
379
  if 'options' in item:
380
- for opt in item['options'].values():
381
- text_fields.append(opt)
382
 
383
- unique_tags = set()
384
 
385
  for text in text_fields:
386
- if not text:
387
- continue
388
- for match in tag_regex.finditer(text):
389
- unique_tags.add(match.group(0).upper())
390
 
391
- for tag in sorted(unique_tags):
392
- base_key = tag.lower().replace(' ', '')
393
 
394
- if tag not in full_image_lookup:
395
  item[base_key] = "[MISSING_IMAGE]"
396
  continue
397
 
398
- base64_img = full_image_lookup[tag]
 
 
 
 
 
 
399
 
400
- if "EQUATION" in tag:
401
- latex = get_latex_from_base64(base64_img)
402
- item[base_key] = latex
403
- print(f" ✅ {tag} → LaTeX")
404
 
405
- elif "FIGURE" in tag:
406
- item[base_key] = base64_img
407
- print(f" ✅ {tag} → Base64")
408
 
409
- final_structured_data.append(item)
410
 
411
- print("✅ In-memory embedding completed")
412
- return final_structured_data
413
 
414
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
 
416
 
417
 
@@ -455,9 +387,9 @@ def run_single_pdf_preprocessing(pdf_path: str) -> Tuple[int, int, int, str, flo
455
 
456
 
457
 
458
- if os.path.exists(OUTPUT_DIR):
459
- shutil.rmtree(OUTPUT_DIR)
460
- os.makedirs(OUTPUT_DIR, exist_ok=True)
461
 
462
 
463
  # 1. Validation and Model Loading
@@ -640,7 +572,7 @@ if __name__ == "__main__":
640
  # interface.launch(inbrowser=True)
641
  interface.launch(
642
  inbrowser=True,
643
- allowed_paths=[OUTPUT_DIR]
644
  )
645
 
646
 
 
1
 
2
  import base64
3
  from PIL import Image
4
+ import re
5
+ from transformers import TrOCRProcessor
6
+ from optimum.onnxruntime import ORTModelForVision2Seq
7
 
8
 
9
 
 
44
  WEIGHTS_PATH = 'best.pt'
45
  SCALE_FACTOR = 2.0
46
  # OUTPUT_DIR = "yolo_extracted_regions"
47
+ # OUTPUT_DIR = os.path.join(tempfile.gettempdir(), "yolo_extracted_regions")
48
 
49
 
50
 
 
158
  return img
159
 
160
 
161
+
162
+
163
  def run_yolo_detection_and_count(
164
  image: np.ndarray, model: YOLO, page_num: int
165
+ ) -> Tuple[int, int, List[Dict[str, str]]]:
166
+
 
 
 
167
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
168
+
169
  yolo_detections = []
170
  page_equations = 0
171
  page_figures = 0
172
+ detected_items = []
173
+
174
  try:
175
  results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
176
+
177
  if results and results[0].boxes:
178
  for box in results[0].boxes.data.tolist():
179
  x1, y1, x2, y2, conf, cls_id = box
180
  cls_name = model.names[int(cls_id)]
181
+
182
  if cls_name in TARGET_CLASSES:
183
  yolo_detections.append({
184
+ 'coords': (x1, y1, x2, y2),
185
+ 'class': cls_name,
186
  'conf': conf
187
  })
188
  except Exception as e:
189
  logging.error(f"YOLO inference failed on page {page_num}: {e}")
190
+ return 0, 0, []
191
 
 
192
  merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
193
  final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
194
 
 
195
  for det in final_detections:
196
+ bbox = det["coords"]
197
+
198
+ if det["class"] == "equation":
 
199
  GLOBAL_EQUATION_COUNT += 1
200
  page_equations += 1
201
+
202
+ b64 = crop_and_convert_to_base64(image, bbox)
203
+ detected_items.append({
204
+ "type": "equation",
205
+ "id": f"EQUATION{GLOBAL_EQUATION_COUNT}",
206
+ "base64": b64
207
+ })
208
+
209
+ elif det["class"] == "figure":
210
+ GLOBAL_FIGURE_COUNT += 1
211
+ page_figures += 1
212
+
213
+ b64 = crop_and_convert_to_base64(image, bbox)
214
+ detected_items.append({
215
+ "type": "figure",
216
+ "id": f"FIGURE{GLOBAL_FIGURE_COUNT}",
217
+ "base64": b64
218
+ })
219
+
220
  logging.warning(f" -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
221
+ return page_equations, page_figures, detected_items
222
+
223
+
224
+
225
+ def get_latex_from_base64(base64_string: str) -> str:
226
+ if ort_model is None or processor is None:
227
+ return "[MODEL_ERROR: Model not initialized]"
228
+
229
+ try:
230
+ image_data = base64.b64decode(base64_string)
231
+ image = Image.open(io.BytesIO(image_data)).convert('RGB')
232
+
233
+ pixel_values = processor(images=image, return_tensors="pt").pixel_values
234
+ generated_ids = ort_model.generate(pixel_values)
235
+ raw_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
236
+
237
+ if not raw_text:
238
+ return "[OCR_WARNING: No formula found]"
239
+
240
+ latex = raw_text[0]
241
+ latex = re.sub(r'[\r\n]+', '', latex)
242
+
243
+ return latex
244
 
245
+ except Exception as e:
246
+ return f"[TR_OCR_ERROR: {e}]"
247
 
248
 
249
 
 
284
 
285
 
286
 
287
+ def embed_images_as_base64_in_memory(structured_data, detected_items):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
289
+
290
+ item_lookup = {d["id"]: d for d in detected_items}
291
+ final_data = []
292
 
293
  for item in structured_data:
294
  text_fields = [
 
298
  ]
299
 
300
  if 'options' in item:
301
+ text_fields.extend(item['options'].values())
 
302
 
303
+ used_tags = set()
304
 
305
  for text in text_fields:
306
+ for m in tag_regex.finditer(text or ""):
307
+ used_tags.add(m.group(0).upper())
 
 
308
 
309
+ for tag in used_tags:
310
+ base_key = tag.lower().replace(" ", "")
311
 
312
+ if tag not in item_lookup:
313
  item[base_key] = "[MISSING_IMAGE]"
314
  continue
315
 
316
+ entry = item_lookup[tag]
317
+
318
+ if entry["type"] == "equation":
319
+ item[base_key] = get_latex_from_base64(entry["base64"])
320
+
321
+ else:
322
+ item[base_key] = entry["base64"]
323
 
324
+ final_data.append(item)
325
+
326
+ return final_data
327
+
328
 
 
 
 
329
 
 
330
 
 
 
331
 
332
 
333
+ def crop_and_convert_to_base64(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> str:
334
+ x1, y1, x2, y2 = map(int, bbox)
335
+ h, w, _ = image.shape
336
+
337
+ x1 = max(0, x1)
338
+ y1 = max(0, y1)
339
+ x2 = min(w, x2)
340
+ y2 = min(h, y2)
341
+
342
+ crop = image[y1:y2, x1:x2]
343
+ _, buffer = cv2.imencode(".png", crop)
344
+
345
+ return base64.b64encode(buffer).decode("utf-8")
346
+
347
 
348
 
349
 
 
387
 
388
 
389
 
390
+ # if os.path.exists(OUTPUT_DIR):
391
+ # shutil.rmtree(OUTPUT_DIR)
392
+ # os.makedirs(OUTPUT_DIR, exist_ok=True)
393
 
394
 
395
  # 1. Validation and Model Loading
 
572
  # interface.launch(inbrowser=True)
573
  interface.launch(
574
  inbrowser=True,
575
+ # allowed_paths=[OUTPUT_DIR]
576
  )
577
 
578