heerjtdev committed on
Commit
e12f847
·
verified ·
1 Parent(s): a5f01d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -0
app.py CHANGED
@@ -44,6 +44,21 @@ SCALE_FACTOR = 2.0
44
  OUTPUT_DIR = os.path.join(tempfile.gettempdir(), "yolo_extracted_regions")
45
 
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  # Detection parameters
48
  CONF_THRESHOLD = 0.2
49
  TARGET_CLASSES = ['figure', 'equation']
@@ -203,6 +218,26 @@ def run_yolo_detection_and_count(
203
 
204
 
205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
 
208
 
@@ -311,6 +346,72 @@ def run_yolo_detection_and_count(
311
  return page_equations, page_figures, saved_images
312
 
313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
 
316
 
 
44
  OUTPUT_DIR = os.path.join(tempfile.gettempdir(), "yolo_extracted_regions")
45
 
46
 
47
+
48
+
49
+
50
# Model identifier shared by the processor and the model loaded below.
# NOTE(review): looks like a Hugging Face Hub repo id for a formula-recognition
# model — confirm against the model card.
+ MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
51
# Pre/post-processor built from the same identifier as the model.
# NOTE(review): this appears to run at module level, i.e. at import time — confirm.
+ processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
52
# Vision-to-sequence model loaded with use_cache=False.
# NOTE(review): presumably the ONNX Runtime export of the model; verify that
# use_cache=False matches how the checkpoint was exported.
+ ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
  # Detection parameters
63
  CONF_THRESHOLD = 0.2
64
  TARGET_CLASSES = ['figure', 'equation']
 
218
 
219
 
220
 
221
def extract_images_from_page_in_memory(page) -> Dict[str, str]:
    """Return the images referenced by *page* as base64 text, keyed by tag.

    Each entry reported by ``page.get_images(full=True)`` is pulled out of the
    owning document (``page.parent``) with ``extract_image`` and its raw bytes
    are base64-encoded. Nothing is written to disk.

    Args:
        page: Page object exposing ``get_images(full=True)`` and a ``parent``
            document with ``extract_image(xref)``.
            NOTE(review): presumably a PyMuPDF page — confirm with the caller.

    Returns:
        A dict such as ``{"FIGURE1": "<base64>", "FIGURE2": "<base64>"}``.
        Every image is tagged ``FIGURE<n>``, where ``n`` is its 1-based
        position in the page's image list. No ``EQUATION<n>`` keys are
        produced, and numbering restarts at 1 for every page.
    """
    image_entries = page.get_images(full=True)

    # The first element of each entry is the xref used to fetch the image data.
    return {
        f"FIGURE{position}": base64.b64encode(
            page.parent.extract_image(entry[0])["image"]
        ).decode("utf-8")
        for position, entry in enumerate(image_entries, start=1)
    }
241
 
242
 
243
 
 
346
  return page_equations, page_figures, saved_images
347
 
348
 
349
def _collect_text_fields(item):
    """Return the string fields of *item* that may mention image tags."""
    candidates = [
        item.get('question', ''),
        item.get('passage', ''),
        item.get('new_passage', ''),
    ]

    options = item.get('options')
    if isinstance(options, dict):
        candidates.extend(options.values())
    elif isinstance(options, (list, tuple)):
        candidates.extend(options)

    # Only non-empty strings can be scanned by the tag regex; numbers, None
    # and nested containers are skipped instead of raising TypeError.
    return [text for text in candidates if isinstance(text, str) and text]


def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], pdf_doc) -> List[Dict[str, Any]]:
    """Attach image data to every item whose text mentions FIGUREn / EQUATIONn.

    All pages of *pdf_doc* are scanned with
    ``extract_images_from_page_in_memory`` to build a tag -> base64 lookup.
    Each item's ``question``, ``passage``, ``new_passage`` and ``options``
    values are then searched, case-insensitively, for ``figure<digits>`` or
    ``equation<digits>``. For every distinct tag found, the item gains a key
    equal to the lower-cased tag (for example ``"figure1"``) holding:

    * the base64 string, for FIGURE tags;
    * the result of ``get_latex_from_base64`` on that string, for EQUATION
      tags;
    * ``"[MISSING_IMAGE]"`` when the tag is not in the lookup.

    Args:
        structured_data: Items to enrich. They are modified in place.
        pdf_doc: Indexable, sized collection of pages (``len()`` and
            ``[index]`` are the only operations used on it).

    Returns:
        A new list containing the same (mutated) item objects, or ``[]`` when
        *structured_data* is empty.
    """
    print("\n" + "="*80)
    print("--- IN-MEMORY IMAGE + EQUATION TO LATEX PIPELINE ---")
    print("="*80)

    if not structured_data:
        return []

    # Build a global image map from all pages (in memory only).
    # Index-based access is kept on purpose: only len() and [] are assumed.
    full_image_lookup = {}
    for page_index in range(len(pdf_doc)):
        page_images = extract_images_from_page_in_memory(pdf_doc[page_index])
        for tag, base64_img in page_images.items():
            if tag in full_image_lookup:
                # Tags are not guaranteed unique across pages; the later page
                # still wins (unchanged behavior), but no longer silently.
                print(f"  ⚠️ {tag} from page {page_index + 1} replaces an earlier image with the same tag.")
            full_image_lookup[tag] = base64_img

    print(f"  -> Found {len(full_image_lookup)} total in-memory images.")

    tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
    final_structured_data = []

    for item in structured_data:
        unique_tags = {
            match.group(0).upper()
            for text in _collect_text_fields(item)
            for match in tag_regex.finditer(text)
        }

        # Sorted so keys are added (and progress printed) in a stable order.
        for tag in sorted(unique_tags):
            # The regex cannot match whitespace, so lower-casing is all that
            # is needed to derive the item key.
            base_key = tag.lower()

            if tag not in full_image_lookup:
                item[base_key] = "[MISSING_IMAGE]"
                continue

            base64_img = full_image_lookup[tag]

            if "EQUATION" in tag:
                item[base_key] = get_latex_from_base64(base64_img)
                print(f"  ✅ {tag} → LaTeX")
            elif "FIGURE" in tag:
                item[base_key] = base64_img
                print(f"  ✅ {tag} → Base64")

        final_structured_data.append(item)

    print("✅ In-memory embedding completed")
    return final_structured_data
413
+
414
+
415
 
416
 
417