heerjtdev committed on
Commit
3dd4c9e
·
verified ·
1 Parent(s): 0add556

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +302 -25
app.py CHANGED
@@ -1296,6 +1296,73 @@ def get_latex_from_base64(base64_string: str) -> str:
1296
 
1297
 
1298
  # --- UPDATED: page width argument removed from signature and call ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1299
  def run_yolo_detection_and_count(
1300
  image: np.ndarray, model: YOLO, page_num: int,
1301
  current_eq_count: int, current_fig_count: int
@@ -1327,12 +1394,9 @@ def run_yolo_detection_and_count(
1327
  logging.error(f"ERROR: YOLO inference failed on page {page_num}: {e}")
1328
  return [], eq_counter, fig_counter
1329
 
1330
- # Call merge_overlapping_boxes without page_width
1331
  merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
1332
  final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
1333
 
1334
- # Note: final_detections is now sorted purely by y1
1335
-
1336
  for det in final_detections:
1337
  bbox = det["coords"]
1338
  crop_pil = crop_and_convert_to_pil(image, bbox)
@@ -1341,6 +1405,7 @@ def run_yolo_detection_and_count(
1341
  "type": det["class"],
1342
  "coords": bbox,
1343
  "pil_image": crop_pil,
 
1344
  }
1345
 
1346
  if det["class"] == "equation":
@@ -1357,10 +1422,182 @@ def run_yolo_detection_and_count(
1357
  return detected_items, eq_counter, fig_counter
1358
 
1359
 
 
 
 
 
 
 
 
1360
  # ============================================================================
1361
  # --- MAIN DOCUMENT PROCESSING FUNCTION (Retained Logic) ---
1362
  # ============================================================================
1363
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1364
  def run_single_pdf_preprocessing(
1365
  pdf_path: str
1366
  ) -> Tuple[int, int, int, str, float, Dict[str, Union[int, str]], List[Tuple[Image.Image, str]]]:
@@ -1408,7 +1645,7 @@ def run_single_pdf_preprocessing(
1408
 
1409
  mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)
1410
 
1411
- # 3. Page Processing, Detection, and OCR Loop
1412
  t4 = time.time()
1413
  for page_num_0_based in range(doc.page_count):
1414
  page_start_time = time.time()
@@ -1440,36 +1677,66 @@ def run_single_pdf_preprocessing(
1440
  )
1441
  detect_time = time.time() - detect_start
1442
 
1443
- # --- OCR/LaTeX Conversion and Logging ---
1444
- ocr_total_time = 0
1445
- page_equations = 0
1446
-
1447
- for item in page_extracted_items:
1448
- if item["type"] == "equation":
1449
- page_equations += 1
1450
- ocr_start = time.time()
1451
-
1452
- b64_string = pil_to_base64(item["pil_image"])
1453
- item["latex"] = get_latex_from_base64(b64_string)
1454
-
1455
- ocr_time = time.time() - ocr_start
1456
- ocr_total_time += ocr_time
1457
-
1458
- logging.warning(f"LATEX: Page {page_num}, ID {item['id']} -> Time: {ocr_time:.4f}s, Formula: {item['latex'][:50]}...")
1459
-
1460
  all_extracted_items.extend(page_extracted_items)
1461
 
1462
  page_figures = sum(1 for item in page_extracted_items if item["type"] == "figure")
 
1463
 
1464
  page_total_time = time.time() - page_start_time
1465
- logging.warning(f"SUMMARY: Page {page_num}: EQs={page_equations}, Figs={page_figures} | Page Time: {page_total_time:.4f}s (Detect={detect_time:.4f}s, OCR Total={ocr_total_time:.4f}s)")
1466
 
1467
  doc.close()
1468
  t5 = time.time()
1469
  detection_loop_time = t5 - t4
1470
- logging.warning(f"INFO: Total Detection and OCR Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
 
 
 
 
1471
 
1472
- # 4. Final Report Generation and Gallery Formatting
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1473
 
1474
  # Create the structured JSON output as requested by the user
1475
  structured_latex_output = {
@@ -1493,7 +1760,7 @@ def run_single_pdf_preprocessing(
1493
  gallery_items.append((item["pil_image"], image_label))
1494
 
1495
 
1496
- total_execution_time = t5 - start_time
1497
 
1498
  full_log = log_stream.getvalue()
1499
 
@@ -1514,6 +1781,16 @@ def run_single_pdf_preprocessing(
1514
  return total_pages, total_equation_count, total_figure_count, report, total_execution_time, structured_latex_output, gallery_items
1515
 
1516
 
 
 
 
 
 
 
 
 
 
 
1517
  # ============================================================================
1518
  # --- GRADIO INTERFACE FUNCTION & DEFINITION (Retained) ---
1519
  # ============================================================================
 
1296
 
1297
 
1298
  # --- UPDATED: page width argument removed from signature and call ---
1299
+ # def run_yolo_detection_and_count(
1300
+ # image: np.ndarray, model: YOLO, page_num: int,
1301
+ # current_eq_count: int, current_fig_count: int
1302
+ # ) -> Tuple[List[Dict[str, Union[Image.Image, str, Tuple[float,...]]]], int, int]:
1303
+ # """
1304
+ # Performs YOLO detection and returns a list of detected item dictionaries
1305
+ # and the updated total counters.
1306
+ # """
1307
+
1308
+ # eq_counter = current_eq_count
1309
+ # fig_counter = current_fig_count
1310
+
1311
+ # detected_items: List[Dict[str, Union[Image.Image, str, Tuple[float,...]]]] = []
1312
+ # yolo_detections = []
1313
+
1314
+ # try:
1315
+ # results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
1316
+ # if results and results[0].boxes:
1317
+ # for box in results[0].boxes.data.tolist():
1318
+ # x1, y1, x2, y2, conf, cls_id = box
1319
+ # cls_name = model.names[int(cls_id)]
1320
+ # if cls_name in TARGET_CLASSES:
1321
+ # yolo_detections.append({
1322
+ # 'coords': (x1, y1, x2, y2),
1323
+ # 'class': cls_name,
1324
+ # 'conf': conf
1325
+ # })
1326
+ # except Exception as e:
1327
+ # logging.error(f"ERROR: YOLO inference failed on page {page_num}: {e}")
1328
+ # return [], eq_counter, fig_counter
1329
+
1330
+ # # Call merge_overlapping_boxes without page_width
1331
+ # merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
1332
+ # final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
1333
+
1334
+ # # Note: final_detections is now sorted purely by y1
1335
+
1336
+ # for det in final_detections:
1337
+ # bbox = det["coords"]
1338
+ # crop_pil = crop_and_convert_to_pil(image, bbox)
1339
+
1340
+ # item = {
1341
+ # "type": det["class"],
1342
+ # "coords": bbox,
1343
+ # "pil_image": crop_pil,
1344
+ # }
1345
+
1346
+ # if det["class"] == "equation":
1347
+ # eq_counter += 1
1348
+ # item["id"] = f"EQUATION{eq_counter}"
1349
+ # item["latex"] = ""
1350
+ # elif det["class"] == "figure":
1351
+ # fig_counter += 1
1352
+ # item["id"] = f"FIGURE{fig_counter}"
1353
+ # item["latex"] = "[FIGURE - No LaTeX]"
1354
+
1355
+ # detected_items.append(item)
1356
+
1357
+ # return detected_items, eq_counter, fig_counter
1358
+
1359
+
1360
+
1361
+
1362
+
1363
+
1364
+
1365
+
1366
  def run_yolo_detection_and_count(
1367
  image: np.ndarray, model: YOLO, page_num: int,
1368
  current_eq_count: int, current_fig_count: int
 
1394
  logging.error(f"ERROR: YOLO inference failed on page {page_num}: {e}")
1395
  return [], eq_counter, fig_counter
1396
 
 
1397
  merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
1398
  final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
1399
 
 
 
1400
  for det in final_detections:
1401
  bbox = det["coords"]
1402
  crop_pil = crop_and_convert_to_pil(image, bbox)
 
1405
  "type": det["class"],
1406
  "coords": bbox,
1407
  "pil_image": crop_pil,
1408
+ "page_num": page_num, # ← ADD THIS LINE
1409
  }
1410
 
1411
  if det["class"] == "equation":
 
1422
  return detected_items, eq_counter, fig_counter
1423
 
1424
 
1425
+
1426
+
1427
+
1428
+
1429
+
1430
+
1431
+
1432
  # ============================================================================
1433
  # --- MAIN DOCUMENT PROCESSING FUNCTION (Retained Logic) ---
1434
  # ============================================================================
1435
 
1436
+ # def run_single_pdf_preprocessing(
1437
+ # pdf_path: str
1438
+ # ) -> Tuple[int, int, int, str, float, Dict[str, Union[int, str]], List[Tuple[Image.Image, str]]]:
1439
+ # """
1440
+ # Runs the pipeline, performs OCR, and returns final results.
1441
+ # """
1442
+
1443
+ # log_stream.truncate(0)
1444
+ # log_stream.seek(0)
1445
+
1446
+ # start_time = time.time()
1447
+
1448
+ # all_extracted_items: List[Dict[str, Union[Image.Image, str]]] = []
1449
+
1450
+ # total_figure_count = 0
1451
+ # total_equation_count = 0
1452
+
1453
+
1454
+ # # 1. Validation and Model Loading (YOLO)
1455
+ # t0 = time.time()
1456
+ # if not os.path.exists(pdf_path):
1457
+ # report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
1458
+ # return 0, 0, 0, report, time.time() - start_time, {}, []
1459
+
1460
+ # try:
1461
+ # model = YOLO(WEIGHTS_PATH)
1462
+ # logging.warning(f"INFO: Loaded YOLO model from: {WEIGHTS_PATH}")
1463
+ # except Exception as e:
1464
+ # report = f"❌ ERROR loading YOLO model: {e}\n(Ensure 'best.pt' is available and valid.)"
1465
+ # return 0, 0, 0, report, time.time() - start_time, {}, []
1466
+ # t1 = time.time()
1467
+ # logging.warning(f"INFO: Model Loading Time: {t1-t0:.4f}s")
1468
+
1469
+ # # 2. PDF Loading (fitz)
1470
+ # t2 = time.time()
1471
+ # try:
1472
+ # doc = fitz.open(pdf_path)
1473
+ # total_pages = doc.page_count
1474
+ # logging.warning(f"INFO: Opened PDF with {doc.page_count} pages")
1475
+ # except Exception as e:
1476
+ # report = f"❌ ERROR loading PDF file: {e}"
1477
+ # return 0, 0, 0, report, time.time() - start_time, {}, []
1478
+ # t3 = time.time()
1479
+ # logging.warning(f"INFO: PDF Initialization Time: {t3-t2:.4f}s")
1480
+
1481
+ # mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)
1482
+
1483
+ # # 3. Page Processing, Detection, and OCR Loop
1484
+ # t4 = time.time()
1485
+ # for page_num_0_based in range(doc.page_count):
1486
+ # page_start_time = time.time()
1487
+ # fitz_page = doc.load_page(page_num_0_based)
1488
+ # page_num = page_num_0_based + 1
1489
+
1490
+ # # Render page to image for YOLO
1491
+ # try:
1492
+ # pix_start = time.time()
1493
+ # pix = fitz_page.get_pixmap(matrix=mat)
1494
+ # original_img = pixmap_to_numpy(pix)
1495
+ # pix_time = time.time() - pix_start
1496
+ # except Exception as e:
1497
+ # logging.error(f"ERROR: Error converting page {page_num} to image: {e}. Skipping.")
1498
+ # continue
1499
+
1500
+ # # YOLO Detection
1501
+ # detect_start = time.time()
1502
+ # (
1503
+ # page_extracted_items,
1504
+ # total_equation_count,
1505
+ # total_figure_count
1506
+ # ) = run_yolo_detection_and_count(
1507
+ # original_img,
1508
+ # model,
1509
+ # page_num,
1510
+ # total_equation_count,
1511
+ # total_figure_count
1512
+ # )
1513
+ # detect_time = time.time() - detect_start
1514
+
1515
+ # # --- OCR/LaTeX Conversion and Logging ---
1516
+ # ocr_total_time = 0
1517
+ # page_equations = 0
1518
+
1519
+ # for item in page_extracted_items:
1520
+ # if item["type"] == "equation":
1521
+ # page_equations += 1
1522
+ # ocr_start = time.time()
1523
+
1524
+ # b64_string = pil_to_base64(item["pil_image"])
1525
+ # item["latex"] = get_latex_from_base64(b64_string)
1526
+
1527
+ # ocr_time = time.time() - ocr_start
1528
+ # ocr_total_time += ocr_time
1529
+
1530
+ # logging.warning(f"LATEX: Page {page_num}, ID {item['id']} -> Time: {ocr_time:.4f}s, Formula: {item['latex'][:50]}...")
1531
+
1532
+ # all_extracted_items.extend(page_extracted_items)
1533
+
1534
+ # page_figures = sum(1 for item in page_extracted_items if item["type"] == "figure")
1535
+
1536
+ # page_total_time = time.time() - page_start_time
1537
+ # logging.warning(f"SUMMARY: Page {page_num}: EQs={page_equations}, Figs={page_figures} | Page Time: {page_total_time:.4f}s (Detect={detect_time:.4f}s, OCR Total={ocr_total_time:.4f}s)")
1538
+
1539
+ # doc.close()
1540
+ # t5 = time.time()
1541
+ # detection_loop_time = t5 - t4
1542
+ # logging.warning(f"INFO: Total Detection and OCR Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
1543
+
1544
+ # # 4. Final Report Generation and Gallery Formatting
1545
+
1546
+ # # Create the structured JSON output as requested by the user
1547
+ # structured_latex_output = {
1548
+ # "Total Pages": total_pages,
1549
+ # "Total Equations": total_equation_count,
1550
+ # }
1551
+ # for item in all_extracted_items:
1552
+ # if item["type"] == "equation":
1553
+ # # Map EQUATION ID to LaTeX code
1554
+ # structured_latex_output[item["id"]] = item["latex"]
1555
+
1556
+
1557
+ # # Format the extracted items for the Gradio Gallery
1558
+ # gallery_items: List[Tuple[Image.Image, str]] = []
1559
+
1560
+ # for item in all_extracted_items:
1561
+ # image_label = item["id"]
1562
+ # if item["type"] == "equation":
1563
+ # image_label = f'{item["id"]}: {item["latex"]}'
1564
+
1565
+ # gallery_items.append((item["pil_image"], image_label))
1566
+
1567
+
1568
+ # total_execution_time = t5 - start_time
1569
+
1570
+ # full_log = log_stream.getvalue()
1571
+
1572
+ # report = (
1573
+ # f"✅ **YOLO Counting & OCR Complete!**\n\n"
1574
+ # f"**1) Total Pages Detected in PDF:** **{total_pages}**\n"
1575
+ # f"**2) Total Equations Detected:** **{total_equation_count}**\n"
1576
+ # f"**3) Total Figures Detected:** **{total_figure_count}**\n"
1577
+ # f"---\n"
1578
+ # f"**4) Total Execution Time:** **{total_execution_time:.4f}s**\n"
1579
+ # f"### Full Processing Log\n"
1580
+ # f"```text\n"
1581
+ # f"{full_log}"
1582
+ # f"\n```"
1583
+ # )
1584
+
1585
+ # # Return the new structured_latex_output instead of the page counts
1586
+ # return total_pages, total_equation_count, total_figure_count, report, total_execution_time, structured_latex_output, gallery_items
1587
+
1588
+
1589
+
1590
+
1591
+
1592
+
1593
+
1594
+
1595
+
1596
+
1597
+
1598
+
1599
+
1600
+
1601
  def run_single_pdf_preprocessing(
1602
  pdf_path: str
1603
  ) -> Tuple[int, int, int, str, float, Dict[str, Union[int, str]], List[Tuple[Image.Image, str]]]:
 
1645
 
1646
  mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)
1647
 
1648
+ # 3. Page Processing and Detection Loop
1649
  t4 = time.time()
1650
  for page_num_0_based in range(doc.page_count):
1651
  page_start_time = time.time()
 
1677
  )
1678
  detect_time = time.time() - detect_start
1679
 
1680
+ # Store items (OCR will be done later in correct order)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1681
  all_extracted_items.extend(page_extracted_items)
1682
 
1683
  page_figures = sum(1 for item in page_extracted_items if item["type"] == "figure")
1684
+ page_equations = sum(1 for item in page_extracted_items if item["type"] == "equation")
1685
 
1686
  page_total_time = time.time() - page_start_time
1687
+ logging.warning(f"SUMMARY: Page {page_num}: EQs={page_equations}, Figs={page_figures} | Page Time: {page_total_time:.4f}s (Detect={detect_time:.4f}s)")
1688
 
1689
  doc.close()
1690
  t5 = time.time()
1691
  detection_loop_time = t5 - t4
1692
+ logging.warning(f"INFO: Total Detection Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
1693
+
1694
+ # 4. Sort all items by page number, then by y-coordinate
1695
+ logging.warning(f"INFO: Sorting {len(all_extracted_items)} items by page and position...")
1696
+ all_extracted_items.sort(key=lambda item: (item['page_num'], item['coords'][1]))
1697
 
1698
+ # 5. Re-assign IDs in the correct order
1699
+ equation_counter = 0
1700
+ figure_counter = 0
1701
+
1702
+ for item in all_extracted_items:
1703
+ if item["type"] == "equation":
1704
+ equation_counter += 1
1705
+ item["id"] = f"EQUATION{equation_counter}"
1706
+ elif item["type"] == "figure":
1707
+ figure_counter += 1
1708
+ item["id"] = f"FIGURE{figure_counter}"
1709
+
1710
+ # Update the total counts with the correct values
1711
+ total_equation_count = equation_counter
1712
+ total_figure_count = figure_counter
1713
+
1714
+ logging.warning(f"INFO: Re-numbered items - Total Equations: {total_equation_count}, Total Figures: {total_figure_count}")
1715
+
1716
+ # 6. Perform OCR in the correct order
1717
+ t6 = time.time()
1718
+ ocr_total_time = 0
1719
+
1720
+ logging.warning(f"INFO: Starting OCR for {total_equation_count} equations in correct order...")
1721
+
1722
+ for item in all_extracted_items:
1723
+ if item["type"] == "equation":
1724
+ ocr_start = time.time()
1725
+
1726
+ b64_string = pil_to_base64(item["pil_image"])
1727
+ item["latex"] = get_latex_from_base64(b64_string)
1728
+
1729
+ ocr_time = time.time() - ocr_start
1730
+ ocr_total_time += ocr_time
1731
+
1732
+ logging.warning(f"LATEX: Page {item['page_num']}, ID {item['id']} -> Time: {ocr_time:.4f}s, Formula: {item['latex'][:50]}...")
1733
+ elif item["type"] == "figure":
1734
+ item["latex"] = "[FIGURE - No LaTeX]"
1735
+
1736
+ t7 = time.time()
1737
+ logging.warning(f"INFO: Total OCR Time: {ocr_total_time:.4f}s")
1738
+
1739
+ # 7. Final Report Generation and Gallery Formatting
1740
 
1741
  # Create the structured JSON output as requested by the user
1742
  structured_latex_output = {
 
1760
  gallery_items.append((item["pil_image"], image_label))
1761
 
1762
 
1763
+ total_execution_time = t7 - start_time
1764
 
1765
  full_log = log_stream.getvalue()
1766
 
 
1781
  return total_pages, total_equation_count, total_figure_count, report, total_execution_time, structured_latex_output, gallery_items
1782
 
1783
 
1784
+
1785
+
1786
+
1787
+
1788
+
1789
+
1790
+
1791
+
1792
+
1793
+
1794
  # ============================================================================
1795
  # --- GRADIO INTERFACE FUNCTION & DEFINITION (Retained) ---
1796
  # ============================================================================