Update app.py
Browse files
app.py
CHANGED
|
@@ -1296,6 +1296,73 @@ def get_latex_from_base64(base64_string: str) -> str:
|
|
| 1296 |
|
| 1297 |
|
| 1298 |
# --- UPDATED: page width argument removed from signature and call ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1299 |
def run_yolo_detection_and_count(
|
| 1300 |
image: np.ndarray, model: YOLO, page_num: int,
|
| 1301 |
current_eq_count: int, current_fig_count: int
|
|
@@ -1327,12 +1394,9 @@ def run_yolo_detection_and_count(
|
|
| 1327 |
logging.error(f"ERROR: YOLO inference failed on page {page_num}: {e}")
|
| 1328 |
return [], eq_counter, fig_counter
|
| 1329 |
|
| 1330 |
-
# Call merge_overlapping_boxes without page_width
|
| 1331 |
merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
|
| 1332 |
final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
|
| 1333 |
|
| 1334 |
-
# Note: final_detections is now sorted purely by y1
|
| 1335 |
-
|
| 1336 |
for det in final_detections:
|
| 1337 |
bbox = det["coords"]
|
| 1338 |
crop_pil = crop_and_convert_to_pil(image, bbox)
|
|
@@ -1341,6 +1405,7 @@ def run_yolo_detection_and_count(
|
|
| 1341 |
"type": det["class"],
|
| 1342 |
"coords": bbox,
|
| 1343 |
"pil_image": crop_pil,
|
|
|
|
| 1344 |
}
|
| 1345 |
|
| 1346 |
if det["class"] == "equation":
|
|
@@ -1357,10 +1422,182 @@ def run_yolo_detection_and_count(
|
|
| 1357 |
return detected_items, eq_counter, fig_counter
|
| 1358 |
|
| 1359 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1360 |
# ============================================================================
|
| 1361 |
# --- MAIN DOCUMENT PROCESSING FUNCTION (Retained Logic) ---
|
| 1362 |
# ============================================================================
|
| 1363 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1364 |
def run_single_pdf_preprocessing(
|
| 1365 |
pdf_path: str
|
| 1366 |
) -> Tuple[int, int, int, str, float, Dict[str, Union[int, str]], List[Tuple[Image.Image, str]]]:
|
|
@@ -1408,7 +1645,7 @@ def run_single_pdf_preprocessing(
|
|
| 1408 |
|
| 1409 |
mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)
|
| 1410 |
|
| 1411 |
-
# 3. Page Processing, Detection, and OCR Loop
|
| 1412 |
t4 = time.time()
|
| 1413 |
for page_num_0_based in range(doc.page_count):
|
| 1414 |
page_start_time = time.time()
|
|
@@ -1440,36 +1677,66 @@ def run_single_pdf_preprocessing(
|
|
| 1440 |
)
|
| 1441 |
detect_time = time.time() - detect_start
|
| 1442 |
|
| 1443 |
-
# --- OCR/LaTeX Conversion and Logging ---
|
| 1444 |
-
ocr_total_time = 0
|
| 1445 |
-
page_equations = 0
|
| 1446 |
-
|
| 1447 |
-
for item in page_extracted_items:
|
| 1448 |
-
if item["type"] == "equation":
|
| 1449 |
-
page_equations += 1
|
| 1450 |
-
ocr_start = time.time()
|
| 1451 |
-
|
| 1452 |
-
b64_string = pil_to_base64(item["pil_image"])
|
| 1453 |
-
item["latex"] = get_latex_from_base64(b64_string)
|
| 1454 |
-
|
| 1455 |
-
ocr_time = time.time() - ocr_start
|
| 1456 |
-
ocr_total_time += ocr_time
|
| 1457 |
-
|
| 1458 |
-
logging.warning(f"LATEX: Page {page_num}, ID {item['id']} -> Time: {ocr_time:.4f}s, Formula: {item['latex'][:50]}...")
|
| 1459 |
-
|
| 1460 |
all_extracted_items.extend(page_extracted_items)
|
| 1461 |
|
| 1462 |
page_figures = sum(1 for item in page_extracted_items if item["type"] == "figure")
|
|
|
|
| 1463 |
|
| 1464 |
page_total_time = time.time() - page_start_time
|
| 1465 |
-
logging.warning(f"SUMMARY: Page {page_num}: EQs={page_equations}, Figs={page_figures} | Page Time: {page_total_time:.4f}s (Detect={detect_time:.4f}s, OCR Total={ocr_total_time:.4f}s)")
|
| 1466 |
|
| 1467 |
doc.close()
|
| 1468 |
t5 = time.time()
|
| 1469 |
detection_loop_time = t5 - t4
|
| 1470 |
-
logging.warning(f"INFO: Total Detection and OCR Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1471 |
|
| 1472 |
-
# 4. Final Report Generation and Gallery Formatting
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1473 |
|
| 1474 |
# Create the structured JSON output as requested by the user
|
| 1475 |
structured_latex_output = {
|
|
@@ -1493,7 +1760,7 @@ def run_single_pdf_preprocessing(
|
|
| 1493 |
gallery_items.append((item["pil_image"], image_label))
|
| 1494 |
|
| 1495 |
|
| 1496 |
-
total_execution_time = t5 - start_time
|
| 1497 |
|
| 1498 |
full_log = log_stream.getvalue()
|
| 1499 |
|
|
@@ -1514,6 +1781,16 @@ def run_single_pdf_preprocessing(
|
|
| 1514 |
return total_pages, total_equation_count, total_figure_count, report, total_execution_time, structured_latex_output, gallery_items
|
| 1515 |
|
| 1516 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1517 |
# ============================================================================
|
| 1518 |
# --- GRADIO INTERFACE FUNCTION & DEFINITION (Retained) ---
|
| 1519 |
# ============================================================================
|
|
|
|
| 1296 |
|
| 1297 |
|
| 1298 |
# --- UPDATED: page width argument removed from signature and call ---
|
| 1299 |
+
# def run_yolo_detection_and_count(
|
| 1300 |
+
# image: np.ndarray, model: YOLO, page_num: int,
|
| 1301 |
+
# current_eq_count: int, current_fig_count: int
|
| 1302 |
+
# ) -> Tuple[List[Dict[str, Union[Image.Image, str, Tuple[float,...]]]], int, int]:
|
| 1303 |
+
# """
|
| 1304 |
+
# Performs YOLO detection and returns a list of detected item dictionaries
|
| 1305 |
+
# and the updated total counters.
|
| 1306 |
+
# """
|
| 1307 |
+
|
| 1308 |
+
# eq_counter = current_eq_count
|
| 1309 |
+
# fig_counter = current_fig_count
|
| 1310 |
+
|
| 1311 |
+
# detected_items: List[Dict[str, Union[Image.Image, str, Tuple[float,...]]]] = []
|
| 1312 |
+
# yolo_detections = []
|
| 1313 |
+
|
| 1314 |
+
# try:
|
| 1315 |
+
# results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
|
| 1316 |
+
# if results and results[0].boxes:
|
| 1317 |
+
# for box in results[0].boxes.data.tolist():
|
| 1318 |
+
# x1, y1, x2, y2, conf, cls_id = box
|
| 1319 |
+
# cls_name = model.names[int(cls_id)]
|
| 1320 |
+
# if cls_name in TARGET_CLASSES:
|
| 1321 |
+
# yolo_detections.append({
|
| 1322 |
+
# 'coords': (x1, y1, x2, y2),
|
| 1323 |
+
# 'class': cls_name,
|
| 1324 |
+
# 'conf': conf
|
| 1325 |
+
# })
|
| 1326 |
+
# except Exception as e:
|
| 1327 |
+
# logging.error(f"ERROR: YOLO inference failed on page {page_num}: {e}")
|
| 1328 |
+
# return [], eq_counter, fig_counter
|
| 1329 |
+
|
| 1330 |
+
# # Call merge_overlapping_boxes without page_width
|
| 1331 |
+
# merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
|
| 1332 |
+
# final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
|
| 1333 |
+
|
| 1334 |
+
# # Note: final_detections is now sorted purely by y1
|
| 1335 |
+
|
| 1336 |
+
# for det in final_detections:
|
| 1337 |
+
# bbox = det["coords"]
|
| 1338 |
+
# crop_pil = crop_and_convert_to_pil(image, bbox)
|
| 1339 |
+
|
| 1340 |
+
# item = {
|
| 1341 |
+
# "type": det["class"],
|
| 1342 |
+
# "coords": bbox,
|
| 1343 |
+
# "pil_image": crop_pil,
|
| 1344 |
+
# }
|
| 1345 |
+
|
| 1346 |
+
# if det["class"] == "equation":
|
| 1347 |
+
# eq_counter += 1
|
| 1348 |
+
# item["id"] = f"EQUATION{eq_counter}"
|
| 1349 |
+
# item["latex"] = ""
|
| 1350 |
+
# elif det["class"] == "figure":
|
| 1351 |
+
# fig_counter += 1
|
| 1352 |
+
# item["id"] = f"FIGURE{fig_counter}"
|
| 1353 |
+
# item["latex"] = "[FIGURE - No LaTeX]"
|
| 1354 |
+
|
| 1355 |
+
# detected_items.append(item)
|
| 1356 |
+
|
| 1357 |
+
# return detected_items, eq_counter, fig_counter
|
| 1358 |
+
|
| 1359 |
+
|
| 1360 |
+
|
| 1361 |
+
|
| 1362 |
+
|
| 1363 |
+
|
| 1364 |
+
|
| 1365 |
+
|
| 1366 |
def run_yolo_detection_and_count(
|
| 1367 |
image: np.ndarray, model: YOLO, page_num: int,
|
| 1368 |
current_eq_count: int, current_fig_count: int
|
|
|
|
| 1394 |
logging.error(f"ERROR: YOLO inference failed on page {page_num}: {e}")
|
| 1395 |
return [], eq_counter, fig_counter
|
| 1396 |
|
|
|
|
| 1397 |
merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
|
| 1398 |
final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
|
| 1399 |
|
|
|
|
|
|
|
| 1400 |
for det in final_detections:
|
| 1401 |
bbox = det["coords"]
|
| 1402 |
crop_pil = crop_and_convert_to_pil(image, bbox)
|
|
|
|
| 1405 |
"type": det["class"],
|
| 1406 |
"coords": bbox,
|
| 1407 |
"pil_image": crop_pil,
|
| 1408 |
+
"page_num": page_num, # ← ADD THIS LINE
|
| 1409 |
}
|
| 1410 |
|
| 1411 |
if det["class"] == "equation":
|
|
|
|
| 1422 |
return detected_items, eq_counter, fig_counter
|
| 1423 |
|
| 1424 |
|
| 1425 |
+
|
| 1426 |
+
|
| 1427 |
+
|
| 1428 |
+
|
| 1429 |
+
|
| 1430 |
+
|
| 1431 |
+
|
| 1432 |
# ============================================================================
|
| 1433 |
# --- MAIN DOCUMENT PROCESSING FUNCTION (Retained Logic) ---
|
| 1434 |
# ============================================================================
|
| 1435 |
|
| 1436 |
+
# def run_single_pdf_preprocessing(
|
| 1437 |
+
# pdf_path: str
|
| 1438 |
+
# ) -> Tuple[int, int, int, str, float, Dict[str, Union[int, str]], List[Tuple[Image.Image, str]]]:
|
| 1439 |
+
# """
|
| 1440 |
+
# Runs the pipeline, performs OCR, and returns final results.
|
| 1441 |
+
# """
|
| 1442 |
+
|
| 1443 |
+
# log_stream.truncate(0)
|
| 1444 |
+
# log_stream.seek(0)
|
| 1445 |
+
|
| 1446 |
+
# start_time = time.time()
|
| 1447 |
+
|
| 1448 |
+
# all_extracted_items: List[Dict[str, Union[Image.Image, str]]] = []
|
| 1449 |
+
|
| 1450 |
+
# total_figure_count = 0
|
| 1451 |
+
# total_equation_count = 0
|
| 1452 |
+
|
| 1453 |
+
|
| 1454 |
+
# # 1. Validation and Model Loading (YOLO)
|
| 1455 |
+
# t0 = time.time()
|
| 1456 |
+
# if not os.path.exists(pdf_path):
|
| 1457 |
+
# report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
|
| 1458 |
+
# return 0, 0, 0, report, time.time() - start_time, {}, []
|
| 1459 |
+
|
| 1460 |
+
# try:
|
| 1461 |
+
# model = YOLO(WEIGHTS_PATH)
|
| 1462 |
+
# logging.warning(f"INFO: Loaded YOLO model from: {WEIGHTS_PATH}")
|
| 1463 |
+
# except Exception as e:
|
| 1464 |
+
# report = f"❌ ERROR loading YOLO model: {e}\n(Ensure 'best.pt' is available and valid.)"
|
| 1465 |
+
# return 0, 0, 0, report, time.time() - start_time, {}, []
|
| 1466 |
+
# t1 = time.time()
|
| 1467 |
+
# logging.warning(f"INFO: Model Loading Time: {t1-t0:.4f}s")
|
| 1468 |
+
|
| 1469 |
+
# # 2. PDF Loading (fitz)
|
| 1470 |
+
# t2 = time.time()
|
| 1471 |
+
# try:
|
| 1472 |
+
# doc = fitz.open(pdf_path)
|
| 1473 |
+
# total_pages = doc.page_count
|
| 1474 |
+
# logging.warning(f"INFO: Opened PDF with {doc.page_count} pages")
|
| 1475 |
+
# except Exception as e:
|
| 1476 |
+
# report = f"❌ ERROR loading PDF file: {e}"
|
| 1477 |
+
# return 0, 0, 0, report, time.time() - start_time, {}, []
|
| 1478 |
+
# t3 = time.time()
|
| 1479 |
+
# logging.warning(f"INFO: PDF Initialization Time: {t3-t2:.4f}s")
|
| 1480 |
+
|
| 1481 |
+
# mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)
|
| 1482 |
+
|
| 1483 |
+
# # 3. Page Processing, Detection, and OCR Loop
|
| 1484 |
+
# t4 = time.time()
|
| 1485 |
+
# for page_num_0_based in range(doc.page_count):
|
| 1486 |
+
# page_start_time = time.time()
|
| 1487 |
+
# fitz_page = doc.load_page(page_num_0_based)
|
| 1488 |
+
# page_num = page_num_0_based + 1
|
| 1489 |
+
|
| 1490 |
+
# # Render page to image for YOLO
|
| 1491 |
+
# try:
|
| 1492 |
+
# pix_start = time.time()
|
| 1493 |
+
# pix = fitz_page.get_pixmap(matrix=mat)
|
| 1494 |
+
# original_img = pixmap_to_numpy(pix)
|
| 1495 |
+
# pix_time = time.time() - pix_start
|
| 1496 |
+
# except Exception as e:
|
| 1497 |
+
# logging.error(f"ERROR: Error converting page {page_num} to image: {e}. Skipping.")
|
| 1498 |
+
# continue
|
| 1499 |
+
|
| 1500 |
+
# # YOLO Detection
|
| 1501 |
+
# detect_start = time.time()
|
| 1502 |
+
# (
|
| 1503 |
+
# page_extracted_items,
|
| 1504 |
+
# total_equation_count,
|
| 1505 |
+
# total_figure_count
|
| 1506 |
+
# ) = run_yolo_detection_and_count(
|
| 1507 |
+
# original_img,
|
| 1508 |
+
# model,
|
| 1509 |
+
# page_num,
|
| 1510 |
+
# total_equation_count,
|
| 1511 |
+
# total_figure_count
|
| 1512 |
+
# )
|
| 1513 |
+
# detect_time = time.time() - detect_start
|
| 1514 |
+
|
| 1515 |
+
# # --- OCR/LaTeX Conversion and Logging ---
|
| 1516 |
+
# ocr_total_time = 0
|
| 1517 |
+
# page_equations = 0
|
| 1518 |
+
|
| 1519 |
+
# for item in page_extracted_items:
|
| 1520 |
+
# if item["type"] == "equation":
|
| 1521 |
+
# page_equations += 1
|
| 1522 |
+
# ocr_start = time.time()
|
| 1523 |
+
|
| 1524 |
+
# b64_string = pil_to_base64(item["pil_image"])
|
| 1525 |
+
# item["latex"] = get_latex_from_base64(b64_string)
|
| 1526 |
+
|
| 1527 |
+
# ocr_time = time.time() - ocr_start
|
| 1528 |
+
# ocr_total_time += ocr_time
|
| 1529 |
+
|
| 1530 |
+
# logging.warning(f"LATEX: Page {page_num}, ID {item['id']} -> Time: {ocr_time:.4f}s, Formula: {item['latex'][:50]}...")
|
| 1531 |
+
|
| 1532 |
+
# all_extracted_items.extend(page_extracted_items)
|
| 1533 |
+
|
| 1534 |
+
# page_figures = sum(1 for item in page_extracted_items if item["type"] == "figure")
|
| 1535 |
+
|
| 1536 |
+
# page_total_time = time.time() - page_start_time
|
| 1537 |
+
# logging.warning(f"SUMMARY: Page {page_num}: EQs={page_equations}, Figs={page_figures} | Page Time: {page_total_time:.4f}s (Detect={detect_time:.4f}s, OCR Total={ocr_total_time:.4f}s)")
|
| 1538 |
+
|
| 1539 |
+
# doc.close()
|
| 1540 |
+
# t5 = time.time()
|
| 1541 |
+
# detection_loop_time = t5 - t4
|
| 1542 |
+
# logging.warning(f"INFO: Total Detection and OCR Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
|
| 1543 |
+
|
| 1544 |
+
# # 4. Final Report Generation and Gallery Formatting
|
| 1545 |
+
|
| 1546 |
+
# # Create the structured JSON output as requested by the user
|
| 1547 |
+
# structured_latex_output = {
|
| 1548 |
+
# "Total Pages": total_pages,
|
| 1549 |
+
# "Total Equations": total_equation_count,
|
| 1550 |
+
# }
|
| 1551 |
+
# for item in all_extracted_items:
|
| 1552 |
+
# if item["type"] == "equation":
|
| 1553 |
+
# # Map EQUATION ID to LaTeX code
|
| 1554 |
+
# structured_latex_output[item["id"]] = item["latex"]
|
| 1555 |
+
|
| 1556 |
+
|
| 1557 |
+
# # Format the extracted items for the Gradio Gallery
|
| 1558 |
+
# gallery_items: List[Tuple[Image.Image, str]] = []
|
| 1559 |
+
|
| 1560 |
+
# for item in all_extracted_items:
|
| 1561 |
+
# image_label = item["id"]
|
| 1562 |
+
# if item["type"] == "equation":
|
| 1563 |
+
# image_label = f'{item["id"]}: {item["latex"]}'
|
| 1564 |
+
|
| 1565 |
+
# gallery_items.append((item["pil_image"], image_label))
|
| 1566 |
+
|
| 1567 |
+
|
| 1568 |
+
# total_execution_time = t5 - start_time
|
| 1569 |
+
|
| 1570 |
+
# full_log = log_stream.getvalue()
|
| 1571 |
+
|
| 1572 |
+
# report = (
|
| 1573 |
+
# f"✅ **YOLO Counting & OCR Complete!**\n\n"
|
| 1574 |
+
# f"**1) Total Pages Detected in PDF:** **{total_pages}**\n"
|
| 1575 |
+
# f"**2) Total Equations Detected:** **{total_equation_count}**\n"
|
| 1576 |
+
# f"**3) Total Figures Detected:** **{total_figure_count}**\n"
|
| 1577 |
+
# f"---\n"
|
| 1578 |
+
# f"**4) Total Execution Time:** **{total_execution_time:.4f}s**\n"
|
| 1579 |
+
# f"### Full Processing Log\n"
|
| 1580 |
+
# f"```text\n"
|
| 1581 |
+
# f"{full_log}"
|
| 1582 |
+
# f"\n```"
|
| 1583 |
+
# )
|
| 1584 |
+
|
| 1585 |
+
# # Return the new structured_latex_output instead of the page counts
|
| 1586 |
+
# return total_pages, total_equation_count, total_figure_count, report, total_execution_time, structured_latex_output, gallery_items
|
| 1587 |
+
|
| 1588 |
+
|
| 1589 |
+
|
| 1590 |
+
|
| 1591 |
+
|
| 1592 |
+
|
| 1593 |
+
|
| 1594 |
+
|
| 1595 |
+
|
| 1596 |
+
|
| 1597 |
+
|
| 1598 |
+
|
| 1599 |
+
|
| 1600 |
+
|
| 1601 |
def run_single_pdf_preprocessing(
|
| 1602 |
pdf_path: str
|
| 1603 |
) -> Tuple[int, int, int, str, float, Dict[str, Union[int, str]], List[Tuple[Image.Image, str]]]:
|
|
|
|
| 1645 |
|
| 1646 |
mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)
|
| 1647 |
|
| 1648 |
+
# 3. Page Processing and Detection Loop
|
| 1649 |
t4 = time.time()
|
| 1650 |
for page_num_0_based in range(doc.page_count):
|
| 1651 |
page_start_time = time.time()
|
|
|
|
| 1677 |
)
|
| 1678 |
detect_time = time.time() - detect_start
|
| 1679 |
|
| 1680 |
+
# Store items (OCR will be done later in correct order)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1681 |
all_extracted_items.extend(page_extracted_items)
|
| 1682 |
|
| 1683 |
page_figures = sum(1 for item in page_extracted_items if item["type"] == "figure")
|
| 1684 |
+
page_equations = sum(1 for item in page_extracted_items if item["type"] == "equation")
|
| 1685 |
|
| 1686 |
page_total_time = time.time() - page_start_time
|
| 1687 |
+
logging.warning(f"SUMMARY: Page {page_num}: EQs={page_equations}, Figs={page_figures} | Page Time: {page_total_time:.4f}s (Detect={detect_time:.4f}s)")
|
| 1688 |
|
| 1689 |
doc.close()
|
| 1690 |
t5 = time.time()
|
| 1691 |
detection_loop_time = t5 - t4
|
| 1692 |
+
logging.warning(f"INFO: Total Detection Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
|
| 1693 |
+
|
| 1694 |
+
# 4. Sort all items by page number, then by y-coordinate
|
| 1695 |
+
logging.warning(f"INFO: Sorting {len(all_extracted_items)} items by page and position...")
|
| 1696 |
+
all_extracted_items.sort(key=lambda item: (item['page_num'], item['coords'][1]))
|
| 1697 |
|
| 1698 |
+
# 5. Re-assign IDs in the correct order
|
| 1699 |
+
equation_counter = 0
|
| 1700 |
+
figure_counter = 0
|
| 1701 |
+
|
| 1702 |
+
for item in all_extracted_items:
|
| 1703 |
+
if item["type"] == "equation":
|
| 1704 |
+
equation_counter += 1
|
| 1705 |
+
item["id"] = f"EQUATION{equation_counter}"
|
| 1706 |
+
elif item["type"] == "figure":
|
| 1707 |
+
figure_counter += 1
|
| 1708 |
+
item["id"] = f"FIGURE{figure_counter}"
|
| 1709 |
+
|
| 1710 |
+
# Update the total counts with the correct values
|
| 1711 |
+
total_equation_count = equation_counter
|
| 1712 |
+
total_figure_count = figure_counter
|
| 1713 |
+
|
| 1714 |
+
logging.warning(f"INFO: Re-numbered items - Total Equations: {total_equation_count}, Total Figures: {total_figure_count}")
|
| 1715 |
+
|
| 1716 |
+
# 6. Perform OCR in the correct order
|
| 1717 |
+
t6 = time.time()
|
| 1718 |
+
ocr_total_time = 0
|
| 1719 |
+
|
| 1720 |
+
logging.warning(f"INFO: Starting OCR for {total_equation_count} equations in correct order...")
|
| 1721 |
+
|
| 1722 |
+
for item in all_extracted_items:
|
| 1723 |
+
if item["type"] == "equation":
|
| 1724 |
+
ocr_start = time.time()
|
| 1725 |
+
|
| 1726 |
+
b64_string = pil_to_base64(item["pil_image"])
|
| 1727 |
+
item["latex"] = get_latex_from_base64(b64_string)
|
| 1728 |
+
|
| 1729 |
+
ocr_time = time.time() - ocr_start
|
| 1730 |
+
ocr_total_time += ocr_time
|
| 1731 |
+
|
| 1732 |
+
logging.warning(f"LATEX: Page {item['page_num']}, ID {item['id']} -> Time: {ocr_time:.4f}s, Formula: {item['latex'][:50]}...")
|
| 1733 |
+
elif item["type"] == "figure":
|
| 1734 |
+
item["latex"] = "[FIGURE - No LaTeX]"
|
| 1735 |
+
|
| 1736 |
+
t7 = time.time()
|
| 1737 |
+
logging.warning(f"INFO: Total OCR Time: {ocr_total_time:.4f}s")
|
| 1738 |
+
|
| 1739 |
+
# 7. Final Report Generation and Gallery Formatting
|
| 1740 |
|
| 1741 |
# Create the structured JSON output as requested by the user
|
| 1742 |
structured_latex_output = {
|
|
|
|
| 1760 |
gallery_items.append((item["pil_image"], image_label))
|
| 1761 |
|
| 1762 |
|
| 1763 |
+
total_execution_time = t7 - start_time
|
| 1764 |
|
| 1765 |
full_log = log_stream.getvalue()
|
| 1766 |
|
|
|
|
| 1781 |
return total_pages, total_equation_count, total_figure_count, report, total_execution_time, structured_latex_output, gallery_items
|
| 1782 |
|
| 1783 |
|
| 1784 |
+
|
| 1785 |
+
|
| 1786 |
+
|
| 1787 |
+
|
| 1788 |
+
|
| 1789 |
+
|
| 1790 |
+
|
| 1791 |
+
|
| 1792 |
+
|
| 1793 |
+
|
| 1794 |
# ============================================================================
|
| 1795 |
# --- GRADIO INTERFACE FUNCTION & DEFINITION (Retained) ---
|
| 1796 |
# ============================================================================
|