heerjtdev committed on
Commit
994b14b
·
verified ·
1 Parent(s): 9a2f423

Update working_yolo_pipeline.py

Files changed (1)
  1. working_yolo_pipeline.py +300 -1195
working_yolo_pipeline.py CHANGED
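Summary of the change: the commit deletes roughly 1,195 lines of commented-out earlier revisions of preprocess_and_ocr_page and run_single_pdf_preprocessing, restores a single active ~300-line implementation, and raises the column-separator bridging threshold from 0.045 to 0.08 so the code agrees with its own "> 8% of page height" comment.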
@@ -550,7 +550,7 @@ def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> Li
550
 
551
  # THRESHOLD: If bridging blocks > 8% of page height, REJECT.
552
  # This allows for page numbers or headers (usually < 5%) to cross, but NOT paragraphs.
553
- if bridging_ratio > 0.045:
554
  print(
555
  f" ❌ Separator X={x_coord} REJECTED: Bridging Ratio {bridging_ratio:.1%} (>15%) cuts through text.")
556
  continue
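
The check above rejects a candidate column separator when the text blocks that span ("bridge") the gutter add up to more than 8% of the page height. A minimal standalone sketch of the idea, assuming blocks are (x0, y0, x1, y1) tuples (the helper below is illustrative, not the pipeline's actual data model):

    def rejects_separator(x_coord, blocks, page_height, max_bridging_ratio=0.08):
        """Reject the gutter at x_coord if too much text 'bridges' it vertically."""
        # A block bridges the gutter when it starts left of x_coord and ends right of it.
        bridging_height = sum(y1 - y0 for (x0, y0, x1, y1) in blocks if x0 < x_coord < x1)
        bridging_ratio = bridging_height / page_height
        # Page numbers or headers (usually < 5% of page height) may cross the gutter;
        # a paragraph crossing it pushes the ratio past 8% and kills the split.
        return bridging_ratio > max_bridging_ratio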
@@ -974,6 +974,275 @@ def post_process_json_with_inference(json_data, classifier):
974
 
975
 
976
 
977
  # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
978
  # page_num: int, fitz_page: fitz.Page,
979
  # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
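
For orientation, the function's contract returns a per-page word stream plus an optional column-gutter x. A sketch of the output shape implied by the diff (values are illustrative):

    final_output = [
        {"word": "Abstract", "bbox": [112, 96, 188, 114], "type": "text"},
        {"word": "EQUATION1", "bbox": [120, 300, 480, 360], "type": "equation"},
    ]
    page_separator_x = 306  # x of the column gutter, or None for a single-column page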
@@ -1146,6 +1415,21 @@ def post_process_json_with_inference(json_data, classifier):
1146
  # config=custom_config
1147
  # )
1148
 
1149
  # for i in range(len(hocr_data['level'])):
1150
  # text = hocr_data['text'][i] # Retrieve raw Tesseract text
1151
 
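The fallback path in this hunk drives pytesseract.image_to_data with --oem 3 --psm 6. A minimal, self-contained sketch of consuming that dict output (the input image path is hypothetical):

    import cv2
    import pytesseract

    img = cv2.imread("page.png")  # hypothetical pre-rendered page image
    data = pytesseract.image_to_data(img, config=r'--oem 3 --psm 6',
                                     output_type=pytesseract.Output.DICT)
    for i in range(len(data['level'])):
        word = data['text'][i].strip()
        if word and int(data['conf'][i]) > -1:  # conf == -1 marks structural (non-word) rows
            x, y = data['left'][i], data['top'][i]
            w, h = data['width'][i], data['height'][i]
            print(word, (x, y, x + w, y + h))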
@@ -1230,8 +1514,6 @@ def post_process_json_with_inference(json_data, classifier):
1230
 
1231
  # return final_output, page_separator_x
1232
 
1233
- #=============================================================================================================================================================================
1234
-
1235
 
1236
 
1237
 
@@ -1239,1039 +1521,26 @@ def post_process_json_with_inference(json_data, classifier):
1239
 
1240
 
1241
 
 
 
1242
 
 
 
 
1243
 
 
 
 
1244
 
 
 
 
1245
 
1246
- # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1247
- # page_num: int, fitz_page: fitz.Page,
1248
- # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
1249
- # """
1250
- # OPTIMIZED FLOW:
1251
- # 1. Run YOLO to find Equations/Tables.
1252
- # 2. Mask raw text with YOLO boxes.
1253
- # 3. Run Column Detection on the MASKED data.
1254
- # 4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
1255
- # """
1256
- # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1257
 
1258
- # start_time_total = time.time()
1259
-
1260
- # if original_img is None:
1261
- # print(f" ❌ Invalid image for page {page_num}.")
1262
- # return None, None
1263
-
1264
- # # ====================================================================
1265
- # # --- STEP 1: YOLO DETECTION ---
1266
- # # ====================================================================
1267
- # start_time_yolo = time.time()
1268
- # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
1269
-
1270
- # relevant_detections = []
1271
- # if results and results[0].boxes:
1272
- # for box in results[0].boxes:
1273
- # class_id = int(box.cls[0])
1274
- # class_name = model.names[class_id]
1275
- # if class_name in TARGET_CLASSES:
1276
- # x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
1277
- # relevant_detections.append(
1278
- # {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
1279
- # )
1280
-
1281
- # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1282
- # print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1283
-
1284
- # # ====================================================================
1285
- # # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
1286
- # # ====================================================================
1287
- # # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
1288
- # raw_words_for_layout = get_word_data_for_detection(
1289
- # fitz_page, pdf_path, page_num,
1290
- # top_margin_percent=0.10, bottom_margin_percent=0.10
1291
- # )
1292
-
1293
- # masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
1294
-
1295
- # # ====================================================================
1296
- # # --- STEP 3: COLUMN DETECTION ---
1297
- # # ====================================================================
1298
- # page_width_pdf = fitz_page.rect.width
1299
- # page_height_pdf = fitz_page.rect.height
1300
-
1301
- # column_detection_params = {
1302
- # 'cluster_bin_size': 2, 'cluster_smoothing': 2,
1303
- # 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
1304
- # }
1305
-
1306
- # separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
1307
-
1308
- # page_separator_x = None
1309
- # if separators:
1310
- # central_min = page_width_pdf * 0.35
1311
- # central_max = page_width_pdf * 0.65
1312
- # central_separators = [s for s in separators if central_min <= s <= central_max]
1313
-
1314
- # if central_separators:
1315
- # center_x = page_width_pdf / 2
1316
- # page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
1317
- # print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
1318
- # else:
1319
- # print(" ⚠️ Gutter found off-center. Ignoring.")
1320
- # else:
1321
- # print(" -> Single Column Layout Confirmed.")
1322
-
1323
- # # ====================================================================
1324
- # # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
1325
- # # ====================================================================
1326
- # start_time_components = time.time()
1327
- # component_metadata = []
1328
- # fig_count_page = 0
1329
- # eq_count_page = 0
1330
-
1331
- # for detection in merged_detections:
1332
- # x1, y1, x2, y2 = detection['coords']
1333
- # class_name = detection['class']
1334
-
1335
- # if class_name == 'figure':
1336
- # GLOBAL_FIGURE_COUNT += 1
1337
- # counter = GLOBAL_FIGURE_COUNT
1338
- # component_word = f"FIGURE{counter}"
1339
- # fig_count_page += 1
1340
- # elif class_name == 'equation':
1341
- # GLOBAL_EQUATION_COUNT += 1
1342
- # counter = GLOBAL_EQUATION_COUNT
1343
- # component_word = f"EQUATION{counter}"
1344
- # eq_count_page += 1
1345
- # else:
1346
- # continue
1347
-
1348
- # component_crop = original_img[y1:y2, x1:x2]
1349
- # component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
1350
- # cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
1351
-
1352
- # y_midpoint = (y1 + y2) // 2
1353
- # component_metadata.append({
1354
- # 'type': class_name, 'word': component_word,
1355
- # 'bbox': [int(x1), int(y1), int(x2), int(y2)],
1356
- # 'y0': int(y_midpoint), 'x0': int(x1)
1357
- # })
1358
-
1359
- # # ====================================================================
1360
- # # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
1361
- # # ====================================================================
1362
- # raw_ocr_output = []
1363
- # scale_factor = 2.0 # Pipeline standard scale
1364
-
1365
- # try:
1366
- # # Try getting native text first
1367
- # # NOTE: extract_native_words_and_convert MUST ALSO BE UPDATED TO USE sanitize_text
1368
- # raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
1369
- # except Exception as e:
1370
- # print(f" ❌ Native text extraction failed: {e}")
1371
-
1372
- # # If native text is missing, fall back to OCR
1373
- # if not raw_ocr_output:
1374
- # if _ocr_cache.has_ocr(pdf_path, page_num):
1375
- # print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
1376
- # cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
1377
- # for word_tuple in cached_word_data:
1378
- # word_text, x1, y1, x2, y2 = word_tuple
1379
-
1380
- # # Scale from PDF points to Pipeline Pixels (2.0)
1381
- # x1_pix = int(x1 * scale_factor)
1382
- # y1_pix = int(y1 * scale_factor)
1383
- # x2_pix = int(x2 * scale_factor)
1384
- # y2_pix = int(y2 * scale_factor)
1385
-
1386
- # raw_ocr_output.append({
1387
- # 'type': 'text', 'word': word_text, 'confidence': 95.0,
1388
- # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
1389
- # 'y0': y1_pix, 'x0': x1_pix
1390
- # })
1391
- # else:
1392
- # # === START OF OPTIMIZED OCR BLOCK ===
1393
- # try:
1394
- # # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
1395
- # ocr_zoom = 4.0
1396
- # pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1397
-
1398
- # # Convert PyMuPDF Pixmap to OpenCV format
1399
- # img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1400
- # pix_ocr.n)
1401
- # if pix_ocr.n == 3:
1402
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1403
- # elif pix_ocr.n == 4:
1404
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1405
-
1406
- # # 2. Preprocess (Binarization)
1407
- # processed_img = preprocess_image_for_ocr(img_ocr_np)
1408
-
1409
- # # 3. Run Tesseract with Optimized Configuration
1410
- # custom_config = r'--oem 3 --psm 6'
1411
-
1412
- # hocr_data = pytesseract.image_to_data(
1413
- # processed_img,
1414
- # output_type=pytesseract.Output.DICT,
1415
- # config=custom_config
1416
- # )
1417
-
1418
- # # ==============================================================================
1419
- # # --- DEBUGGING BLOCK: CHECK FIRST 50 OCR WORDS ---
1420
- # # ==============================================================================
1421
- # print(f"\n[DEBUG] Tesseract OCR Fallback (Page {page_num}): Checking first 50 words...")
1422
- # debug_count = 0
1423
- # for i in range(len(hocr_data['level'])):
1424
- # text = hocr_data['text'][i].strip()
1425
- # if text:
1426
- # unicode_points = [f"\\u{ord(c):04x}" for c in text]
1427
- # print(f" OCR Word {debug_count}: '{text}' -> Codes: {unicode_points}")
1428
- # debug_count += 1
1429
- # if debug_count >= 50: break
1430
- # print("----------------------------------------------------------------------\n")
1431
- # # ==============================================================================
1432
-
1433
- # for i in range(len(hocr_data['level'])):
1434
- # text = hocr_data['text'][i] # Retrieve raw Tesseract text
1435
-
1436
- # # --- FIX: SANITIZE TEXT AND THEN STRIP ---
1437
- # cleaned_text = sanitize_text(text).strip()
1438
-
1439
- # if cleaned_text and hocr_data['conf'][i] > -1:
1440
- # # 4. Coordinate Mapping
1441
- # scale_adjustment = scale_factor / ocr_zoom
1442
-
1443
- # x1 = int(hocr_data['left'][i] * scale_adjustment)
1444
- # y1 = int(hocr_data['top'][i] * scale_adjustment)
1445
- # w = int(hocr_data['width'][i] * scale_adjustment)
1446
- # h = int(hocr_data['height'][i] * scale_adjustment)
1447
- # x2 = x1 + w
1448
- # y2 = y1 + h
1449
-
1450
- # raw_ocr_output.append({
1451
- # 'type': 'text',
1452
- # 'word': cleaned_text, # Use the sanitized word
1453
- # 'confidence': float(hocr_data['conf'][i]),
1454
- # 'bbox': [x1, y1, x2, y2],
1455
- # 'y0': y1,
1456
- # 'x0': x1
1457
- # })
1458
- # except Exception as e:
1459
- # print(f" ❌ Tesseract OCR Error: {e}")
1460
- # # === END OF OPTIMIZED OCR BLOCK ===
1461
-
1462
- # # ====================================================================
1463
- # # --- STEP 6: OCR CLEANING AND MERGING ---
1464
- # # ====================================================================
1465
- # items_to_sort = []
1466
-
1467
- # for ocr_word in raw_ocr_output:
1468
- # is_suppressed = False
1469
- # for component in component_metadata:
1470
- # # Do not include words that are inside figure/equation boxes
1471
- # ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
1472
- # if ioa > IOA_SUPPRESSION_THRESHOLD:
1473
- # is_suppressed = True
1474
- # break
1475
- # if not is_suppressed:
1476
- # items_to_sort.append(ocr_word)
1477
-
1478
- # # Add figures/equations back into the flow as "words"
1479
- # items_to_sort.extend(component_metadata)
1480
-
1481
- # # ====================================================================
1482
- # # --- STEP 7: LINE-BASED SORTING ---
1483
- # # ====================================================================
1484
- # items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
1485
- # lines = []
1486
-
1487
- # for item in items_to_sort:
1488
- # placed = False
1489
- # for line in lines:
1490
- # y_ref = min(it['y0'] for it in line)
1491
- # if abs(y_ref - item['y0']) < LINE_TOLERANCE:
1492
- # line.append(item)
1493
- # placed = True
1494
- # break
1495
- # if not placed and item['type'] in ['equation', 'figure']:
1496
- # for line in lines:
1497
- # y_ref = min(it['y0'] for it in line)
1498
- # if abs(y_ref - item['y0']) < 20:
1499
- # line.append(item)
1500
- # placed = True
1501
- # break
1502
- # if not placed:
1503
- # lines.append([item])
1504
-
1505
- # for line in lines:
1506
- # line.sort(key=lambda x: x['x0'])
1507
-
1508
- # final_output = []
1509
- # for line in lines:
1510
- # for item in line:
1511
- # data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
1512
- # if 'tag' in item: data_item['tag'] = item['tag']
1513
- # final_output.append(data_item)
1514
-
1515
- # return final_output, page_separator_x
1516
-
1517
- #==========================================================================================================================================================================================
1518
-
1519
- # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1520
- # page_num: int, fitz_page: fitz.Page,
1521
- # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
1522
- # """
1523
- # OPTIMIZED FLOW - MODIFIED FOR CORRECT ORDERING:
1524
- # 1. Run YOLO to find Equations/Tables.
1525
- # 2. Store detections with page_num but DON'T assign global IDs yet
1526
- # 3. Mask raw text with YOLO boxes.
1527
- # 4. Run Column Detection on the MASKED data.
1528
- # 5. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
1529
- # """
1530
- # # NOTE: Removed global counter increments from here
1531
-
1532
- # start_time_total = time.time()
1533
-
1534
- # if original_img is None:
1535
- # print(f" ❌ Invalid image for page {page_num}.")
1536
- # return None, None
1537
-
1538
- # # ====================================================================
1539
- # # --- STEP 1: YOLO DETECTION ---
1540
- # # ====================================================================
1541
- # start_time_yolo = time.time()
1542
- # # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
1543
- # results = model.predict(source=original_img, conf=CONF_THRESHOLD, verbose=False)
1544
-
1545
- # relevant_detections = []
1546
- # if results and results[0].boxes:
1547
- # for box in results[0].boxes:
1548
- # class_id = int(box.cls[0])
1549
- # class_name = model.names[class_id]
1550
- # if class_name in TARGET_CLASSES:
1551
- # x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
1552
- # relevant_detections.append(
1553
- # {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
1554
- # )
1555
-
1556
- # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1557
- # print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1558
-
1559
- # # ====================================================================
1560
- # # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
1561
- # # ====================================================================
1562
- # raw_words_for_layout = get_word_data_for_detection(
1563
- # fitz_page, pdf_path, page_num,
1564
- # top_margin_percent=0.10, bottom_margin_percent=0.10
1565
- # )
1566
-
1567
- # masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
1568
-
1569
- # # ====================================================================
1570
- # # --- STEP 3: COLUMN DETECTION ---
1571
- # # ====================================================================
1572
- # page_width_pdf = fitz_page.rect.width
1573
- # page_height_pdf = fitz_page.rect.height
1574
-
1575
- # column_detection_params = {
1576
- # 'cluster_bin_size': 2, 'cluster_smoothing': 2,
1577
- # 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
1578
- # }
1579
-
1580
- # separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
1581
-
1582
- # page_separator_x = None
1583
- # if separators:
1584
- # central_min = page_width_pdf * 0.35
1585
- # central_max = page_width_pdf * 0.65
1586
- # central_separators = [s for s in separators if central_min <= s <= central_max]
1587
-
1588
- # if central_separators:
1589
- # center_x = page_width_pdf / 2
1590
- # page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
1591
- # print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
1592
- # else:
1593
- # print(" ⚠️ Gutter found off-center. Ignoring.")
1594
- # else:
1595
- # print(" -> Single Column Layout Confirmed.")
1596
-
1597
- # # ====================================================================
1598
- # # --- STEP 4: COMPONENT EXTRACTION (MODIFIED - Store without ID) ---
1599
- # # ====================================================================
1600
- # start_time_components = time.time()
1601
- # component_metadata = []
1602
-
1603
- # for detection in merged_detections:
1604
- # x1, y1, x2, y2 = detection['coords']
1605
- # class_name = detection['class']
1606
-
1607
- # # DON'T assign global IDs here - just store the type and coordinates
1608
- # component_crop = original_img[y1:y2, x1:x2]
1609
-
1610
- # # Store image temporarily with page and position info in filename
1611
- # temp_filename = f"{pdf_name}_page{page_num}_{class_name}_y{y1}.png"
1612
- # temp_filepath = os.path.join(FIGURE_EXTRACTION_DIR, temp_filename)
1613
- # cv2.imwrite(temp_filepath, component_crop)
1614
-
1615
- # y_midpoint = (y1 + y2) // 2
1616
- # component_metadata.append({
1617
- # 'type': class_name,
1618
- # 'word': f"TEMP_{class_name.upper()}_PAGE{page_num}_Y{y1}", # Temporary placeholder
1619
- # 'bbox': [int(x1), int(y1), int(x2), int(y2)],
1620
- # 'y0': int(y_midpoint),
1621
- # 'x0': int(x1),
1622
- # 'page_num': page_num, # CRITICAL: Store page number
1623
- # 'temp_filepath': temp_filepath # Store temp filepath for later renaming
1624
- # })
1625
-
1626
- # # ====================================================================
1627
- # # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
1628
- # # ====================================================================
1629
- # raw_ocr_output = []
1630
- # scale_factor = 2.0
1631
-
1632
- # try:
1633
- # raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
1634
- # except Exception as e:
1635
- # print(f" ❌ Native text extraction failed: {e}")
1636
-
1637
- # if not raw_ocr_output:
1638
- # if _ocr_cache.has_ocr(pdf_path, page_num):
1639
- # print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
1640
- # cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
1641
- # for word_tuple in cached_word_data:
1642
- # word_text, x1, y1, x2, y2 = word_tuple
1643
- # x1_pix = int(x1 * scale_factor)
1644
- # y1_pix = int(y1 * scale_factor)
1645
- # x2_pix = int(x2 * scale_factor)
1646
- # y2_pix = int(y2 * scale_factor)
1647
-
1648
- # raw_ocr_output.append({
1649
- # 'type': 'text', 'word': word_text, 'confidence': 95.0,
1650
- # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
1651
- # 'y0': y1_pix, 'x0': x1_pix
1652
- # })
1653
- # else:
1654
- # try:
1655
- # ocr_zoom = 4.0
1656
- # pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1657
- # img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1658
- # pix_ocr.n)
1659
- # if pix_ocr.n == 3:
1660
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1661
- # elif pix_ocr.n == 4:
1662
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1663
-
1664
- # processed_img = preprocess_image_for_ocr(img_ocr_np)
1665
- # custom_config = r'--oem 3 --psm 6'
1666
- # hocr_data = pytesseract.image_to_data(
1667
- # processed_img,
1668
- # output_type=pytesseract.Output.DICT,
1669
- # config=custom_config
1670
- # )
1671
-
1672
- # for i in range(len(hocr_data['level'])):
1673
- # text = hocr_data['text'][i]
1674
- # cleaned_text = sanitize_text(text).strip()
1675
-
1676
- # if cleaned_text and hocr_data['conf'][i] > -1:
1677
- # scale_adjustment = scale_factor / ocr_zoom
1678
- # x1 = int(hocr_data['left'][i] * scale_adjustment)
1679
- # y1 = int(hocr_data['top'][i] * scale_adjustment)
1680
- # w = int(hocr_data['width'][i] * scale_adjustment)
1681
- # h = int(hocr_data['height'][i] * scale_adjustment)
1682
- # x2 = x1 + w
1683
- # y2 = y1 + h
1684
-
1685
- # raw_ocr_output.append({
1686
- # 'type': 'text',
1687
- # 'word': cleaned_text,
1688
- # 'confidence': float(hocr_data['conf'][i]),
1689
- # 'bbox': [x1, y1, x2, y2],
1690
- # 'y0': y1,
1691
- # 'x0': x1
1692
- # })
1693
- # except Exception as e:
1694
- # print(f" ❌ Tesseract OCR Error: {e}")
1695
-
1696
- # # ====================================================================
1697
- # # --- STEP 6: OCR CLEANING AND MERGING ---
1698
- # # ====================================================================
1699
- # items_to_sort = []
1700
-
1701
- # for ocr_word in raw_ocr_output:
1702
- # is_suppressed = False
1703
- # for component in component_metadata:
1704
- # ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
1705
- # if ioa > IOA_SUPPRESSION_THRESHOLD:
1706
- # is_suppressed = True
1707
- # break
1708
- # if not is_suppressed:
1709
- # items_to_sort.append(ocr_word)
1710
-
1711
- # items_to_sort.extend(component_metadata)
1712
-
1713
- # # ====================================================================
1714
- # # --- STEP 7: LINE-BASED SORTING ---
1715
- # # ====================================================================
1716
- # items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
1717
- # lines = []
1718
-
1719
- # for item in items_to_sort:
1720
- # placed = False
1721
- # for line in lines:
1722
- # y_ref = min(it['y0'] for it in line)
1723
- # if abs(y_ref - item['y0']) < LINE_TOLERANCE:
1724
- # line.append(item)
1725
- # placed = True
1726
- # break
1727
- # if not placed and item['type'] in ['equation', 'figure']:
1728
- # for line in lines:
1729
- # y_ref = min(it['y0'] for it in line)
1730
- # if abs(y_ref - item['y0']) < 20:
1731
- # line.append(item)
1732
- # placed = True
1733
- # break
1734
- # if not placed:
1735
- # lines.append([item])
1736
-
1737
- # for line in lines:
1738
- # line.sort(key=lambda x: x['x0'])
1739
-
1740
- # final_output = []
1741
- # for line in lines:
1742
- # for item in line:
1743
- # data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
1744
- # if 'tag' in item: data_item['tag'] = item['tag']
1745
- # if 'page_num' in item: data_item['page_num'] = item['page_num']
1746
- # if 'temp_filepath' in item: data_item['temp_filepath'] = item['temp_filepath']
1747
- # final_output.append(data_item)
1748
-
1749
- # return final_output, page_separator_x
1750
- # #=================================================================================================================================================================================================
1751
-
1752
-
1753
-
1754
- # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1755
- # page_num: int, fitz_page: fitz.Page,
1756
- # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
1757
- # """
1758
- # OPTIMIZED FLOW - MODIFIED FOR CORRECT ORDERING:
1759
- # 1. Run YOLO to find Equations/Tables.
1760
- # 2. Store detections with page_num but DON'T assign global IDs yet
1761
- # 3. Mask raw text with YOLO boxes.
1762
- # 4. Run Column Detection on the MASKED data.
1763
- # 5. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
1764
- # """
1765
- # # NOTE: Removed global counter increments from here
1766
-
1767
- # start_time_total = time.time()
1768
-
1769
- # if original_img is None:
1770
- # print(f" ❌ Invalid image for page {page_num}.")
1771
- # return None, None
1772
-
1773
- # # ====================================================================
1774
- # # --- STEP 1: YOLO DETECTION ---
1775
- # # ====================================================================
1776
- # start_time_yolo = time.time()
1777
- # # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
1778
- # results = model.predict(source=original_img, conf=CONF_THRESHOLD, verbose=False)
1779
-
1780
- # relevant_detections = []
1781
-
1782
- # # FIX 1: Use .data.tolist() to preserve float coordinates (matches feedback.py)
1783
- # if results and results[0].boxes:
1784
- # for box in results[0].boxes.data.tolist():
1785
- # x1, y1, x2, y2, conf, cls_id = box
1786
- # class_name = model.names[int(cls_id)]
1787
- # if class_name in TARGET_CLASSES:
1788
- # relevant_detections.append(
1789
- # {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': conf}
1790
- # )
1791
-
1792
- # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1793
-
1794
- # # FIX 2: Add the missing filter_nested_boxes step (matches feedback.py)
1795
- # merged_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
1796
-
1797
- # print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1798
-
1799
- # # ====================================================================
1800
- # # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
1801
- # # ====================================================================
1802
- # raw_words_for_layout = get_word_data_for_detection(
1803
- # fitz_page, pdf_path, page_num,
1804
- # top_margin_percent=0.10, bottom_margin_percent=0.10
1805
- # )
1806
-
1807
- # masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
1808
-
1809
- # # ====================================================================
1810
- # # --- STEP 3: COLUMN DETECTION ---
1811
- # # ====================================================================
1812
- # page_width_pdf = fitz_page.rect.width
1813
- # page_height_pdf = fitz_page.rect.height
1814
-
1815
- # column_detection_params = {
1816
- # 'cluster_bin_size': 2, 'cluster_smoothing': 2,
1817
- # 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
1818
- # }
1819
-
1820
- # separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
1821
-
1822
- # page_separator_x = None
1823
- # if separators:
1824
- # central_min = page_width_pdf * 0.35
1825
- # central_max = page_width_pdf * 0.65
1826
- # central_separators = [s for s in separators if central_min <= s <= central_max]
1827
-
1828
- # if central_separators:
1829
- # center_x = page_width_pdf / 2
1830
- # page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
1831
- # print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
1832
- # else:
1833
- # print(" ⚠️ Gutter found off-center. Ignoring.")
1834
- # else:
1835
- # print(" -> Single Column Layout Confirmed.")
1836
-
1837
- # # ====================================================================
1838
- # # --- STEP 4: COMPONENT EXTRACTION (MODIFIED - Store without ID) ---
1839
- # # ====================================================================
1840
- # start_time_components = time.time()
1841
- # component_metadata = []
1842
-
1843
- # for detection in merged_detections:
1844
- # # FIX 3: Cast float coordinates to int HERE for numpy array slicing
1845
- # x1, y1, x2, y2 = map(int, detection['coords'])
1846
- # class_name = detection['class']
1847
-
1848
- # # Ensure coordinates are within image bounds
1849
- # h, w = original_img.shape[:2]
1850
- # x1, y1 = max(0, x1), max(0, y1)
1851
- # x2, y2 = min(w, x2), min(h, y2)
1852
-
1853
- # # DON'T assign global IDs here - just store the type and coordinates
1854
- # component_crop = original_img[y1:y2, x1:x2]
1855
-
1856
- # # Store image temporarily with page and position info in filename
1857
- # temp_filename = f"{pdf_name}_page{page_num}_{class_name}_y{y1}.png"
1858
- # temp_filepath = os.path.join(FIGURE_EXTRACTION_DIR, temp_filename)
1859
- # cv2.imwrite(temp_filepath, component_crop)
1860
-
1861
- # y_midpoint = (y1 + y2) // 2
1862
- # component_metadata.append({
1863
- # 'type': class_name,
1864
- # 'word': f"TEMP_{class_name.upper()}_PAGE{page_num}_Y{y1}", # Temporary placeholder
1865
- # 'bbox': [x1, y1, x2, y2],
1866
- # 'y0': int(y_midpoint),
1867
- # 'x0': int(x1),
1868
- # 'page_num': page_num, # CRITICAL: Store page number
1869
- # 'temp_filepath': temp_filepath # Store temp filepath for later renaming
1870
- # })
1871
-
1872
- # # ====================================================================
1873
- # # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
1874
- # # ====================================================================
1875
- # raw_ocr_output = []
1876
- # scale_factor = 2.0
1877
-
1878
- # try:
1879
- # raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
1880
- # except Exception as e:
1881
- # print(f" ❌ Native text extraction failed: {e}")
1882
-
1883
- # if not raw_ocr_output:
1884
- # if _ocr_cache.has_ocr(pdf_path, page_num):
1885
- # print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
1886
- # cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
1887
- # for word_tuple in cached_word_data:
1888
- # word_text, x1, y1, x2, y2 = word_tuple
1889
- # x1_pix = int(x1 * scale_factor)
1890
- # y1_pix = int(y1 * scale_factor)
1891
- # x2_pix = int(x2 * scale_factor)
1892
- # y2_pix = int(y2 * scale_factor)
1893
-
1894
- # raw_ocr_output.append({
1895
- # 'type': 'text', 'word': word_text, 'confidence': 95.0,
1896
- # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
1897
- # 'y0': y1_pix, 'x0': x1_pix
1898
- # })
1899
- # else:
1900
- # try:
1901
- # ocr_zoom = 4.0
1902
- # pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1903
- # img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1904
- # pix_ocr.n)
1905
- # if pix_ocr.n == 3:
1906
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1907
- # elif pix_ocr.n == 4:
1908
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1909
-
1910
- # processed_img = preprocess_image_for_ocr(img_ocr_np)
1911
- # custom_config = r'--oem 3 --psm 6'
1912
- # hocr_data = pytesseract.image_to_data(
1913
- # processed_img,
1914
- # output_type=pytesseract.Output.DICT,
1915
- # config=custom_config
1916
- # )
1917
-
1918
- # for i in range(len(hocr_data['level'])):
1919
- # text = hocr_data['text'][i]
1920
- # cleaned_text = sanitize_text(text).strip()
1921
-
1922
- # if cleaned_text and hocr_data['conf'][i] > -1:
1923
- # scale_adjustment = scale_factor / ocr_zoom
1924
- # x1 = int(hocr_data['left'][i] * scale_adjustment)
1925
- # y1 = int(hocr_data['top'][i] * scale_adjustment)
1926
- # w = int(hocr_data['width'][i] * scale_adjustment)
1927
- # h = int(hocr_data['height'][i] * scale_adjustment)
1928
- # x2 = x1 + w
1929
- # y2 = y1 + h
1930
-
1931
- # raw_ocr_output.append({
1932
- # 'type': 'text',
1933
- # 'word': cleaned_text,
1934
- # 'confidence': float(hocr_data['conf'][i]),
1935
- # 'bbox': [x1, y1, x2, y2],
1936
- # 'y0': y1,
1937
- # 'x0': x1
1938
- # })
1939
- # except Exception as e:
1940
- # print(f" ❌ Tesseract OCR Error: {e}")
1941
-
1942
- # # ====================================================================
1943
- # # --- STEP 6: OCR CLEANING AND MERGING ---
1944
- # # ====================================================================
1945
- # items_to_sort = []
1946
-
1947
- # for ocr_word in raw_ocr_output:
1948
- # is_suppressed = False
1949
- # for component in component_metadata:
1950
- # ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
1951
- # if ioa > IOA_SUPPRESSION_THRESHOLD:
1952
- # is_suppressed = True
1953
- # break
1954
- # if not is_suppressed:
1955
- # items_to_sort.append(ocr_word)
1956
-
1957
- # items_to_sort.extend(component_metadata)
1958
-
1959
- # # ====================================================================
1960
- # # --- STEP 7: LINE-BASED SORTING ---
1961
- # # ====================================================================
1962
- # items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
1963
- # lines = []
1964
-
1965
- # for item in items_to_sort:
1966
- # placed = False
1967
- # for line in lines:
1968
- # y_ref = min(it['y0'] for it in line)
1969
- # if abs(y_ref - item['y0']) < LINE_TOLERANCE:
1970
- # line.append(item)
1971
- # placed = True
1972
- # break
1973
- # if not placed and item['type'] in ['equation', 'figure']:
1974
- # for line in lines:
1975
- # y_ref = min(it['y0'] for it in line)
1976
- # if abs(y_ref - item['y0']) < 20:
1977
- # line.append(item)
1978
- # placed = True
1979
- # break
1980
- # if not placed:
1981
- # lines.append([item])
1982
-
1983
- # for line in lines:
1984
- # line.sort(key=lambda x: x['x0'])
1985
-
1986
- # final_output = []
1987
- # for line in lines:
1988
- # for item in line:
1989
- # data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
1990
- # if 'tag' in item: data_item['tag'] = item['tag']
1991
- # if 'page_num' in item: data_item['page_num'] = item['page_num']
1992
- # if 'temp_filepath' in item: data_item['temp_filepath'] = item['temp_filepath']
1993
- # final_output.append(data_item)
1994
-
1995
- # return final_output, page_separator_x
1996
-
1997
-
1998
-
1999
-
2000
-
2001
-
2002
-
2003
-
2004
- def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
2005
- page_num: int, fitz_page: fitz.Page,
2006
- pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
2007
- """
2008
- OPTIMIZED FLOW - MODIFIED FOR CORRECT ORDERING:
2009
- 1. Run YOLO to find Equations/Tables.
2010
- 2. Store detections with page_num but DON'T assign global IDs yet
2011
- 3. Mask raw text with YOLO boxes.
2012
- 4. Run Column Detection on the MASKED data.
2013
- 5. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
2014
- """
2015
- # NOTE: Removed global counter increments from here
2016
-
2017
- start_time_total = time.time()
2018
-
2019
- if original_img is None:
2020
- print(f" ❌ Invalid image for page {page_num}.")
2021
- return None, None
2022
-
2023
- # ====================================================================
2024
- # --- STEP 1: YOLO DETECTION (FIXED) ---
2025
- # ====================================================================
2026
- start_time_yolo = time.time()
2027
- # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
2028
- results = model.predict(source=original_img, conf=CONF_THRESHOLD, verbose=False)
2029
-
2030
- relevant_detections = []
2031
-
2032
- # FIX 1: Use .data.tolist() to preserve float coordinates for merging/filtering (matches feedback.py)
2033
- if results and results[0].boxes:
2034
- for box in results[0].boxes.data.tolist():
2035
- x1, y1, x2, y2, conf, cls_id = box
2036
- class_name = model.names[int(cls_id)]
2037
- if class_name in TARGET_CLASSES:
2038
- relevant_detections.append(
2039
- {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': conf}
2040
- )
2041
-
2042
- merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
2043
-
2044
- # FIX 2: Add the missing filter_nested_boxes step (matches feedback.py)
2045
- merged_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
2046
-
2047
- print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
2048
-
2049
- # ====================================================================
2050
- # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
2051
- # ====================================================================
2052
- raw_words_for_layout = get_word_data_for_detection(
2053
- fitz_page, pdf_path, page_num,
2054
- top_margin_percent=0.10, bottom_margin_percent=0.10
2055
- )
2056
-
2057
- masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
2058
-
2059
- # ====================================================================
2060
- # --- STEP 3: COLUMN DETECTION ---
2061
- # ====================================================================
2062
- page_width_pdf = fitz_page.rect.width
2063
- page_height_pdf = fitz_page.rect.height
2064
-
2065
- column_detection_params = {
2066
- 'cluster_bin_size': 2, 'cluster_smoothing': 2,
2067
- 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
2068
- }
2069
-
2070
- separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
2071
-
2072
- page_separator_x = None
2073
- if separators:
2074
- central_min = page_width_pdf * 0.35
2075
- central_max = page_width_pdf * 0.65
2076
- central_separators = [s for s in separators if central_min <= s <= central_max]
2077
-
2078
- if central_separators:
2079
- center_x = page_width_pdf / 2
2080
- page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
2081
- print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
2082
- else:
2083
- print(" ⚠️ Gutter found off-center. Ignoring.")
2084
- else:
2085
- print(" -> Single Column Layout Confirmed.")
2086
-
2087
- # ====================================================================
2088
- # --- STEP 4: COMPONENT EXTRACTION ---
2089
- # ====================================================================
2090
- start_time_components = time.time()
2091
- component_metadata = []
2092
-
2093
- for detection in merged_detections:
2094
- # Cast float coordinates to int HERE for numpy array slicing (cropping)
2095
- x1, y1, x2, y2 = map(int, detection['coords'])
2096
- class_name = detection['class']
2097
-
2098
- # Ensure coordinates are within image bounds
2099
- h, w = original_img.shape[:2]
2100
- x1, y1 = max(0, x1), max(0, y1)
2101
- x2, y2 = min(w, x2), min(h, y2)
2102
-
2103
- # DON'T assign global IDs here - just store the type and coordinates
2104
- component_crop = original_img[y1:y2, x1:x2]
2105
-
2106
- # Store image temporarily with page and position info in filename
2107
- temp_filename = f"{pdf_name}_page{page_num}_{class_name}_y{y1}.png"
2108
- temp_filepath = os.path.join(FIGURE_EXTRACTION_DIR, temp_filename)
2109
- cv2.imwrite(temp_filepath, component_crop)
2110
-
2111
- y_midpoint = (y1 + y2) // 2
2112
- component_metadata.append({
2113
- 'type': class_name,
2114
- 'word': f"TEMP_{class_name.upper()}_PAGE{page_num}_Y{y1}", # Temporary placeholder
2115
- 'bbox': [x1, y1, x2, y2],
2116
- 'y0': int(y_midpoint),
2117
- 'x0': int(x1),
2118
- 'page_num': page_num, # CRITICAL: Store page number
2119
- 'temp_filepath': temp_filepath # Store temp filepath for later renaming
2120
- })
2121
-
2122
- # ====================================================================
2123
- # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
2124
- # ====================================================================
2125
- raw_ocr_output = []
2126
- scale_factor = 2.0
2127
-
2128
- try:
2129
- raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
2130
- except Exception as e:
2131
- print(f" ❌ Native text extraction failed: {e}")
2132
-
2133
- if not raw_ocr_output:
2134
- if _ocr_cache.has_ocr(pdf_path, page_num):
2135
- print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
2136
- cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
2137
- for word_tuple in cached_word_data:
2138
- word_text, x1, y1, x2, y2 = word_tuple
2139
- x1_pix = int(x1 * scale_factor)
2140
- y1_pix = int(y1 * scale_factor)
2141
- x2_pix = int(x2 * scale_factor)
2142
- y2_pix = int(y2 * scale_factor)
2143
-
2144
- raw_ocr_output.append({
2145
- 'type': 'text', 'word': word_text, 'confidence': 95.0,
2146
- 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
2147
- 'y0': y1_pix, 'x0': x1_pix
2148
- })
2149
- else:
2150
- try:
2151
- ocr_zoom = 4.0
2152
- pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
2153
- img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
2154
- pix_ocr.n)
2155
- if pix_ocr.n == 3:
2156
- img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
2157
- elif pix_ocr.n == 4:
2158
- img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
2159
-
2160
- processed_img = preprocess_image_for_ocr(img_ocr_np)
2161
- custom_config = r'--oem 3 --psm 6'
2162
- hocr_data = pytesseract.image_to_data(
2163
- processed_img,
2164
- output_type=pytesseract.Output.DICT,
2165
- config=custom_config
2166
- )
2167
-
2168
- for i in range(len(hocr_data['level'])):
2169
- text = hocr_data['text'][i]
2170
- cleaned_text = sanitize_text(text).strip()
2171
-
2172
- if cleaned_text and hocr_data['conf'][i] > -1:
2173
- scale_adjustment = scale_factor / ocr_zoom
2174
- x1 = int(hocr_data['left'][i] * scale_adjustment)
2175
- y1 = int(hocr_data['top'][i] * scale_adjustment)
2176
- w = int(hocr_data['width'][i] * scale_adjustment)
2177
- h = int(hocr_data['height'][i] * scale_adjustment)
2178
- x2 = x1 + w
2179
- y2 = y1 + h
2180
-
2181
- raw_ocr_output.append({
2182
- 'type': 'text',
2183
- 'word': cleaned_text,
2184
- 'confidence': float(hocr_data['conf'][i]),
2185
- 'bbox': [x1, y1, x2, y2],
2186
- 'y0': y1,
2187
- 'x0': x1
2188
- })
2189
- except Exception as e:
2190
- print(f" ❌ Tesseract OCR Error: {e}")
2191
-
2192
- # ====================================================================
2193
- # --- STEP 6: OCR CLEANING AND MERGING ---
2194
- # ====================================================================
2195
- items_to_sort = []
2196
-
2197
- for ocr_word in raw_ocr_output:
2198
- is_suppressed = False
2199
- for component in component_metadata:
2200
- ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
2201
- if ioa > IOA_SUPPRESSION_THRESHOLD:
2202
- is_suppressed = True
2203
- break
2204
- if not is_suppressed:
2205
- items_to_sort.append(ocr_word)
2206
-
2207
- items_to_sort.extend(component_metadata)
2208
-
2209
- # ====================================================================
2210
- # --- STEP 7: LINE-BASED SORTING (FIXED) ---
2211
- # ====================================================================
2212
- items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
2213
- lines = []
2214
-
2215
- for item in items_to_sort:
2216
- placed = False
2217
- for line in lines:
2218
- y_ref = min(it['y0'] for it in line)
2219
- if abs(y_ref - item['y0']) < LINE_TOLERANCE:
2220
- line.append(item)
2221
- placed = True
2222
- break
2223
-
2224
- # FIX: The overly permissive/non-standard line merging block for equations/figures
2225
- # that uses a large tolerance (20) has been removed to enforce strict vertical sorting.
2226
-
2227
- if not placed:
2228
- lines.append([item])
2229
-
2230
- for line in lines:
2231
- line.sort(key=lambda x: x['x0'])
2232
-
2233
- final_output = []
2234
- for line in lines:
2235
- for item in line:
2236
- data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
2237
- if 'tag' in item: data_item['tag'] = item['tag']
2238
- if 'page_num' in item: data_item['page_num'] = item['page_num']
2239
- if 'temp_filepath' in item: data_item['temp_filepath'] = item['temp_filepath']
2240
- final_output.append(data_item)
2241
-
2242
- return final_output, page_separator_x
2243
-
2244
-
2245
-
2246
-
2247
-
2248
-
2249
-
2250
-
2251
-
2252
-
2253
-
2254
-
2255
- def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
2256
- global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
2257
-
2258
- GLOBAL_FIGURE_COUNT = 0
2259
- GLOBAL_EQUATION_COUNT = 0
2260
- _ocr_cache.clear()
2261
-
2262
- print("\n" + "=" * 80)
2263
- print("--- 1. STARTING OPTIMIZED YOLO/OCR PREPROCESSING PIPELINE ---")
2264
- print("=" * 80)
2265
-
2266
- if not os.path.exists(pdf_path):
2267
- print(f"❌ FATAL ERROR: Input PDF not found at {pdf_path}.")
2268
- return None
2269
-
2270
- os.makedirs(os.path.dirname(preprocessed_json_path), exist_ok=True)
2271
- os.makedirs(FIGURE_EXTRACTION_DIR, exist_ok=True)
2272
-
2273
- model = YOLO(WEIGHTS_PATH)
2274
- pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
2275
 
2276
  try:
2277
  doc = fitz.open(pdf_path)
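
The STEP 7 logic deleted above groups items into visual lines by a y-tolerance before emitting reading order. A condensed sketch of that grouping, assuming item dicts with 'y0'/'x0' keys as in the diff (LINE_TOLERANCE stands in for the module constant):

    LINE_TOLERANCE = 10  # illustrative; the real constant is defined elsewhere in the file

    def group_into_lines(items):
        """Cluster word dicts into lines by y0, then order each line left-to-right."""
        items = sorted(items, key=lambda it: (it['y0'], it['x0']))
        lines = []
        for item in items:
            for line in lines:
                y_ref = min(it['y0'] for it in line)  # anchor each line at its topmost member
                if abs(y_ref - item['y0']) < LINE_TOLERANCE:
                    line.append(item)
                    break
            else:
                lines.append([item])
        for line in lines:
            line.sort(key=lambda it: it['x0'])
        return lines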
@@ -2286,7 +1555,6 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
2286
 
2287
  print("\n[STEP 1.2: ITERATING PAGES - IN-MEMORY PROCESSING]")
2288
 
2289
- # STEP 1: Collect all page data WITHOUT global numbering
2290
  for page_num_0_based in range(doc.page_count):
2291
  page_num = page_num_0_based + 1
2292
  print(f" -> Processing Page {page_num}/{doc.page_count}...")
@@ -2322,78 +1590,6 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
2322
 
2323
  doc.close()
2324
 
2325
- # ====================================================================
2326
- # STEP 2: GLOBAL SORTING AND RENUMBERING
2327
- # ====================================================================
2328
- print("\n[STEP 1.3: SORTING AND RENUMBERING COMPONENTS GLOBALLY]")
2329
-
2330
- # Collect all figure and equation items from all pages
2331
- all_components = []
2332
- for page_data in all_pages_data:
2333
- for item in page_data['data']:
2334
- if item['type'] in ['figure', 'equation']:
2335
- all_components.append({
2336
- 'item': item,
2337
- 'page_num': page_data['page_number']
2338
- })
2339
-
2340
- # Sort by page number first, then by y-coordinate
2341
- all_components.sort(key=lambda x: (x['page_num'], x['item']['bbox'][1]))
2342
-
2343
- # Assign global IDs in correct order
2344
- equation_counter = 0
2345
- figure_counter = 0
2346
- component_id_map = {} # Maps temp placeholder to final ID
2347
-
2348
- for comp_data in all_components:
2349
- item = comp_data['item']
2350
- temp_word = item['word']
2351
-
2352
- if item['type'] == 'equation':
2353
- equation_counter += 1
2354
- final_word = f"EQUATION{equation_counter}"
2355
- component_id_map[temp_word] = final_word
2356
-
2357
- # Rename the saved image file
2358
- if 'temp_filepath' in item:
2359
- old_path = item['temp_filepath']
2360
- new_filename = f"{pdf_name}_page{comp_data['page_num']}_equation{equation_counter}.png"
2361
- new_path = os.path.join(FIGURE_EXTRACTION_DIR, new_filename)
2362
- if os.path.exists(old_path):
2363
- os.rename(old_path, new_path)
2364
-
2365
- elif item['type'] == 'figure':
2366
- figure_counter += 1
2367
- final_word = f"FIGURE{figure_counter}"
2368
- component_id_map[temp_word] = final_word
2369
-
2370
- # Rename the saved image file
2371
- if 'temp_filepath' in item:
2372
- old_path = item['temp_filepath']
2373
- new_filename = f"{pdf_name}_page{comp_data['page_num']}_figure{figure_counter}.png"
2374
- new_path = os.path.join(FIGURE_EXTRACTION_DIR, new_filename)
2375
- if os.path.exists(old_path):
2376
- os.rename(old_path, new_path)
2377
-
2378
- # Update all references with final IDs
2379
- for page_data in all_pages_data:
2380
- for item in page_data['data']:
2381
- if item['word'] in component_id_map:
2382
- item['word'] = component_id_map[item['word']]
2383
- # Clean up temporary fields
2384
- if 'temp_filepath' in item:
2385
- del item['temp_filepath']
2386
- if 'page_num' in item:
2387
- del item['page_num']
2388
-
2389
- GLOBAL_FIGURE_COUNT = figure_counter
2390
- GLOBAL_EQUATION_COUNT = equation_counter
2391
-
2392
- print(f" ✅ Global numbering complete: {GLOBAL_EQUATION_COUNT} equations, {GLOBAL_FIGURE_COUNT} figures")
2393
-
2394
- # ====================================================================
2395
- # STEP 3: SAVE OUTPUT
2396
- # ====================================================================
2397
  if all_pages_data:
2398
  try:
2399
  with open(preprocessed_json_path, 'w') as f:
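
The block removed in this hunk deferred global FIGURE/EQUATION numbering to a post-pass: components are sorted by (page, top-y) and renamed in reading order. A condensed sketch of that renumbering (dict shapes follow the diff; the file-renaming step is omitted):

    def renumber_components(all_pages_data):
        """Assign EQUATION1..N / FIGURE1..N in (page, vertical) order; return temp->final map."""
        components = [
            (page['page_number'], item)
            for page in all_pages_data
            for item in page['data']
            if item['type'] in ('figure', 'equation')
        ]
        components.sort(key=lambda pair: (pair[0], pair[1]['bbox'][1]))  # page, then top y
        counters = {'figure': 0, 'equation': 0}
        id_map = {}
        for _, item in components:
            counters[item['type']] += 1
            id_map[item['word']] = f"{item['type'].upper()}{counters[item['type']]}"
        return id_map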
@@ -2413,97 +1609,6 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
2413
  return preprocessed_json_path
2414
 
2415
 
2416
-
2417
- #==============================================================================================================================================================
2418
-
2419
- # def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
2420
- # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
2421
-
2422
- # GLOBAL_FIGURE_COUNT = 0
2423
- # GLOBAL_EQUATION_COUNT = 0
2424
- # _ocr_cache.clear()
2425
-
2426
- # print("\n" + "=" * 80)
2427
- # print("--- 1. STARTING OPTIMIZED YOLO/OCR PREPROCESSING PIPELINE ---")
2428
- # print("=" * 80)
2429
-
2430
- # if not os.path.exists(pdf_path):
2431
- # print(f"❌ FATAL ERROR: Input PDF not found at {pdf_path}.")
2432
- # return None
2433
-
2434
- # os.makedirs(os.path.dirname(preprocessed_json_path), exist_ok=True)
2435
- # os.makedirs(FIGURE_EXTRACTION_DIR, exist_ok=True)
2436
-
2437
- # model = YOLO(WEIGHTS_PATH)
2438
- # pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
2439
-
2440
- # try:
2441
- # doc = fitz.open(pdf_path)
2442
- # print(f"✅ Opened PDF: {pdf_name} ({doc.page_count} pages)")
2443
- # except Exception as e:
2444
- # print(f"❌ ERROR loading PDF file: {e}")
2445
- # return None
2446
-
2447
- # all_pages_data = []
2448
- # total_pages_processed = 0
2449
- # mat = fitz.Matrix(2.0, 2.0)
2450
-
2451
- # print("\n[STEP 1.2: ITERATING PAGES - IN-MEMORY PROCESSING]")
2452
-
2453
- # for page_num_0_based in range(doc.page_count):
2454
- # page_num = page_num_0_based + 1
2455
- # print(f" -> Processing Page {page_num}/{doc.page_count}...")
2456
-
2457
- # fitz_page = doc.load_page(page_num_0_based)
2458
-
2459
- # try:
2460
- # pix = fitz_page.get_pixmap(matrix=mat)
2461
- # original_img = pixmap_to_numpy(pix)
2462
- # except Exception as e:
2463
- # print(f" ❌ Error converting page {page_num} to image: {e}")
2464
- # continue
2465
-
2466
- # final_output, page_separator_x = preprocess_and_ocr_page(
2467
- # original_img,
2468
- # model,
2469
- # pdf_path,
2470
- # page_num,
2471
- # fitz_page,
2472
- # pdf_name
2473
- # )
2474
-
2475
- # if final_output is not None:
2476
- # page_data = {
2477
- # "page_number": page_num,
2478
- # "data": final_output,
2479
- # "column_separator_x": page_separator_x
2480
- # }
2481
- # all_pages_data.append(page_data)
2482
- # total_pages_processed += 1
2483
- # else:
2484
- # print(f" ❌ Skipped page {page_num} due to processing error.")
2485
-
2486
- # doc.close()
2487
-
2488
- # if all_pages_data:
2489
- # try:
2490
- # with open(preprocessed_json_path, 'w') as f:
2491
- # json.dump(all_pages_data, f, indent=4)
2492
- # print(f"\n ✅ Combined structured OCR JSON saved to: {os.path.basename(preprocessed_json_path)}")
2493
- # except Exception as e:
2494
- # print(f"❌ ERROR saving combined JSON output: {e}")
2495
- # return None
2496
- # else:
2497
- # print("❌ WARNING: No page data generated. Halting pipeline.")
2498
- # return None
2499
-
2500
- # print("\n" + "=" * 80)
2501
- # print(f"--- YOLO/OCR PREPROCESSING COMPLETE ({total_pages_processed} pages processed) ---")
2502
- # print("=" * 80)
2503
-
2504
- # return preprocessed_json_path
2505
-
2506
-
2507
  # ============================================================================
2508
  # --- PHASE 2: LAYOUTLMV3 INFERENCE FUNCTIONS ---
2509
  # ============================================================================
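
One detail worth noting before the added lines re-listed below: the pipeline mixes three coordinate spaces. Native PDF points are scaled by scale_factor = 2.0 into pipeline pixels, while the Tesseract fallback renders at ocr_zoom = 4.0, so its boxes are mapped back with scale_adjustment = scale_factor / ocr_zoom = 0.5. A short check of the arithmetic:

    scale_factor, ocr_zoom = 2.0, 4.0
    left_at_ocr_zoom = 800                                   # Tesseract x on the 4.0x render
    x1 = int(left_at_ocr_zoom * (scale_factor / ocr_zoom))
    assert x1 == 400                                         # same point in 2.0x pipeline pixels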
 
550
 
551
  # THRESHOLD: If bridging blocks > 8% of page height, REJECT.
552
  # This allows for page numbers or headers (usually < 5%) to cross, but NOT paragraphs.
553
+ if bridging_ratio > 0.08:
554
  print(
555
  f" ❌ Separator X={x_coord} REJECTED: Bridging Ratio {bridging_ratio:.1%} (>15%) cuts through text.")
556
  continue
 
974
 
975
 
976
 
977
+ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
978
+ page_num: int, fitz_page: fitz.Page,
979
+ pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
980
+ """
981
+ OPTIMIZED FLOW:
982
+ 1. Run YOLO to find Equations/Tables.
983
+ 2. Mask raw text with YOLO boxes.
984
+ 3. Run Column Detection on the MASKED data.
985
+ 4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
986
+ """
987
+ global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
988
+
989
+ start_time_total = time.time()
990
+
991
+ if original_img is None:
992
+ print(f" ❌ Invalid image for page {page_num}.")
993
+ return None, None
994
+
995
+ # ====================================================================
996
+ # --- STEP 1: YOLO DETECTION ---
997
+ # ====================================================================
998
+ start_time_yolo = time.time()
999
+ results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
1000
+
1001
+ relevant_detections = []
1002
+ if results and results[0].boxes:
1003
+ for box in results[0].boxes:
1004
+ class_id = int(box.cls[0])
1005
+ class_name = model.names[class_id]
1006
+ if class_name in TARGET_CLASSES:
1007
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
1008
+ relevant_detections.append(
1009
+ {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
1010
+ )
1011
+
1012
+ merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1013
+ print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1014
+
1015
+ # ====================================================================
1016
+ # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
1017
+ # ====================================================================
1018
+ # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
1019
+ raw_words_for_layout = get_word_data_for_detection(
1020
+ fitz_page, pdf_path, page_num,
1021
+ top_margin_percent=0.10, bottom_margin_percent=0.10
1022
+ )
1023
+
1024
+ masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
1025
+
1026
+ # ====================================================================
1027
+ # --- STEP 3: COLUMN DETECTION ---
1028
+ # ====================================================================
1029
+ page_width_pdf = fitz_page.rect.width
1030
+ page_height_pdf = fitz_page.rect.height
1031
+
1032
+ column_detection_params = {
1033
+ 'cluster_bin_size': 2, 'cluster_smoothing': 2,
1034
+ 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
1035
+ }
1036
+
1037
+ separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
1038
+
1039
+ page_separator_x = None
1040
+ if separators:
1041
+ central_min = page_width_pdf * 0.35
1042
+ central_max = page_width_pdf * 0.65
1043
+ central_separators = [s for s in separators if central_min <= s <= central_max]
1044
+
1045
+ if central_separators:
1046
+ center_x = page_width_pdf / 2
1047
+ page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
1048
+ print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
1049
+ else:
1050
+ print(" ⚠️ Gutter found off-center. Ignoring.")
1051
+ else:
1052
+ print(" -> Single Column Layout Confirmed.")
1053
+
1054
+    # ====================================================================
+    # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
+    # ====================================================================
+    start_time_components = time.time()
+    component_metadata = []
+    fig_count_page = 0
+    eq_count_page = 0
+
+    for detection in merged_detections:
+        x1, y1, x2, y2 = detection['coords']
+        class_name = detection['class']
+
+        if class_name == 'figure':
+            GLOBAL_FIGURE_COUNT += 1
+            counter = GLOBAL_FIGURE_COUNT
+            component_word = f"FIGURE{counter}"
+            fig_count_page += 1
+        elif class_name == 'equation':
+            GLOBAL_EQUATION_COUNT += 1
+            counter = GLOBAL_EQUATION_COUNT
+            component_word = f"EQUATION{counter}"
+            eq_count_page += 1
+        else:
+            continue
+
+        component_crop = original_img[y1:y2, x1:x2]
+        component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
+        cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
+
+        y_midpoint = (y1 + y2) // 2
+        component_metadata.append({
+            'type': class_name, 'word': component_word,
+            'bbox': [int(x1), int(y1), int(x2), int(y2)],
+            'y0': int(y_midpoint), 'x0': int(x1)
+        })
+
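+    # 'y0' is the crop's vertical midpoint so the component sorts into the text
+    # line it visually straddles in Step 7; the GLOBAL_* counters keep
+    # FIGURE/EQUATION placeholder names unique across the whole document.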
+    # ====================================================================
+    # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
+    # ====================================================================
+    raw_ocr_output = []
+    scale_factor = 2.0  # Pipeline standard scale
+
+    try:
+        # Try getting native text first
+        # NOTE: extract_native_words_and_convert must also be updated to use sanitize_text.
+        raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
+    except Exception as e:
+        print(f"  ❌ Native text extraction failed: {e}")
+
+    # If native text is missing, fall back to OCR
+    if not raw_ocr_output:
+        if _ocr_cache.has_ocr(pdf_path, page_num):
+            print(f"  ⚡ Using cached Tesseract OCR for page {page_num}")
+            cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
+            for word_tuple in cached_word_data:
+                word_text, x1, y1, x2, y2 = word_tuple
+
+                # Scale from PDF points to Pipeline Pixels (2.0)
+                x1_pix = int(x1 * scale_factor)
+                y1_pix = int(y1 * scale_factor)
+                x2_pix = int(x2 * scale_factor)
+                y2_pix = int(y2 * scale_factor)
+
+                raw_ocr_output.append({
+                    'type': 'text', 'word': word_text, 'confidence': 95.0,
+                    'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
+                    'y0': y1_pix, 'x0': x1_pix
+                })
+        else:
+            # === START OF OPTIMIZED OCR BLOCK ===
+            try:
+                # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
+                ocr_zoom = 4.0
+                pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
+
+                # Convert PyMuPDF Pixmap to OpenCV format
+                img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(
+                    pix_ocr.height, pix_ocr.width, pix_ocr.n)
+                if pix_ocr.n == 3:
+                    img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
+                elif pix_ocr.n == 4:
+                    img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
+
+                # 2. Preprocess (Binarization)
+                processed_img = preprocess_image_for_ocr(img_ocr_np)
+
+                # 3. Run Tesseract with Optimized Configuration
+                custom_config = r'--oem 3 --psm 6'
+
+                hocr_data = pytesseract.image_to_data(
+                    processed_img,
+                    output_type=pytesseract.Output.DICT,
+                    config=custom_config
+                )
+
+                for i in range(len(hocr_data['level'])):
+                    text = hocr_data['text'][i]  # Retrieve raw Tesseract text
+
+                    # --- FIX: SANITIZE TEXT AND THEN STRIP ---
+                    cleaned_text = sanitize_text(text).strip()
+
+                    if cleaned_text and hocr_data['conf'][i] > -1:
+                        # 4. Coordinate Mapping
+                        scale_adjustment = scale_factor / ocr_zoom
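+                        # With scale_factor=2.0 and ocr_zoom=4.0 this is 0.5: a
+                        # Tesseract box at x=1000 in the ~300 DPI render lands
+                        # at x=500 in pipeline-pixel space.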
+
+                        x1 = int(hocr_data['left'][i] * scale_adjustment)
+                        y1 = int(hocr_data['top'][i] * scale_adjustment)
+                        w = int(hocr_data['width'][i] * scale_adjustment)
+                        h = int(hocr_data['height'][i] * scale_adjustment)
+                        x2 = x1 + w
+                        y2 = y1 + h
+
+                        raw_ocr_output.append({
+                            'type': 'text',
+                            'word': cleaned_text,  # Use the sanitized word
+                            'confidence': float(hocr_data['conf'][i]),
+                            'bbox': [x1, y1, x2, y2],
+                            'y0': y1,
+                            'x0': x1
+                        })
+            except Exception as e:
+                print(f"  ❌ Tesseract OCR Error: {e}")
+            # === END OF OPTIMIZED OCR BLOCK ===
+
+    # ====================================================================
+    # --- STEP 6: OCR CLEANING AND MERGING ---
+    # ====================================================================
+    items_to_sort = []
+
+    for ocr_word in raw_ocr_output:
+        is_suppressed = False
+        for component in component_metadata:
+            # Do not include words that are inside figure/equation boxes
+            ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
+            if ioa > IOA_SUPPRESSION_THRESHOLD:
+                is_suppressed = True
+                break
+        if not is_suppressed:
+            items_to_sort.append(ocr_word)
+
+    # Add figures/equations back into the flow as "words"
+    items_to_sort.extend(component_metadata)
+
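+    # calculate_ioa measures intersection over the word's own area (not IoU),
+    # so a small word fully inside a large figure box is suppressed even though
+    # the two boxes' IoU would be tiny.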
+    # ====================================================================
+    # --- STEP 7: LINE-BASED SORTING ---
+    # ====================================================================
+    items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
+    lines = []
+
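+    # First pass: attach each item to a line whose top y0 is within
+    # LINE_TOLERANCE. A second, looser pass (20 px) retries only
+    # equations/figures, whose YOLO boxes often sit off the text baseline.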
+    for item in items_to_sort:
+        placed = False
+        for line in lines:
+            y_ref = min(it['y0'] for it in line)
+            if abs(y_ref - item['y0']) < LINE_TOLERANCE:
+                line.append(item)
+                placed = True
+                break
+        if not placed and item['type'] in ['equation', 'figure']:
+            for line in lines:
+                y_ref = min(it['y0'] for it in line)
+                if abs(y_ref - item['y0']) < 20:
+                    line.append(item)
+                    placed = True
+                    break
+        if not placed:
+            lines.append([item])
+
+    for line in lines:
+        line.sort(key=lambda x: x['x0'])
+
+    final_output = []
+    for line in lines:
+        for item in line:
+            data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
+            if 'tag' in item:
+                data_item['tag'] = item['tag']
+            final_output.append(data_item)
+
+    return final_output, page_separator_x
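+
+# Illustrative sketch only (not the pipeline's actual driver, which is
+# run_single_pdf_preprocessing below): how this function is called per page.
+# Rendering at 2.0x to match the pipeline's scale_factor is an assumption here.
+#
+#   doc = fitz.open(pdf_path)
+#   model = YOLO(WEIGHTS_PATH)
+#   pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
+#   for page_idx in range(doc.page_count):
+#       page = doc[page_idx]
+#       pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
+#       img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
+#       words, separator_x = preprocess_and_ocr_page(
+#           img, model, pdf_path, page_idx + 1, page, pdf_name)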
+
+
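+# sanitize_text is used by the OCR path above but defined elsewhere in this
+# module; a minimal sketch of the cleanup it is assumed to perform
+# (NFKC-normalize, drop non-printable characters) -- illustrative only:
+#
+#   import unicodedata
+#   def sanitize_text_sketch(s: str) -> str:
+#       s = unicodedata.normalize('NFKC', s)
+#       return ''.join(c for c in s if c.isprintable() or c.isspace())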
 # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
 #                             page_num: int, fitz_page: fitz.Page,
 #                             pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:

 #                 config=custom_config
 #             )

+#                 # ==============================================================================
+#                 # --- DEBUGGING BLOCK: CHECK FIRST 50 OCR WORDS ---
+#                 # ==============================================================================
+#                 print(f"\n[DEBUG] Tesseract OCR Fallback (Page {page_num}): Checking first 50 words...")
+#                 debug_count = 0
+#                 for i in range(len(hocr_data['level'])):
+#                     text = hocr_data['text'][i].strip()
+#                     if text:
+#                         unicode_points = [f"\\u{ord(c):04x}" for c in text]
+#                         print(f"   OCR Word {debug_count}: '{text}' -> Codes: {unicode_points}")
+#                         debug_count += 1
+#                         if debug_count >= 50: break
+#                 print("----------------------------------------------------------------------\n")
+#                 # ==============================================================================

 #                 for i in range(len(hocr_data['level'])):
 #                     text = hocr_data['text'][i]  # Retrieve raw Tesseract text


 #     return final_output, page_separator_x


+def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
+    global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
+
+    GLOBAL_FIGURE_COUNT = 0
+    GLOBAL_EQUATION_COUNT = 0
+    _ocr_cache.clear()
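+    # Resetting the global counters and clearing the OCR cache keeps
+    # figure/equation numbering and cached OCR strictly per-document.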
 
+    print("\n" + "=" * 80)
+    print("--- 1. STARTING OPTIMIZED YOLO/OCR PREPROCESSING PIPELINE ---")
+    print("=" * 80)
+
+    if not os.path.exists(pdf_path):
+        print(f"❌ FATAL ERROR: Input PDF not found at {pdf_path}.")
+        return None
+
+    # 'or "."' guards against a bare filename: os.makedirs('') would raise.
+    os.makedirs(os.path.dirname(preprocessed_json_path) or ".", exist_ok=True)
+    os.makedirs(FIGURE_EXTRACTION_DIR, exist_ok=True)
+
+    model = YOLO(WEIGHTS_PATH)
+    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
+
     try:
         doc = fitz.open(pdf_path)

         print("\n[STEP 1.2: ITERATING PAGES - IN-MEMORY PROCESSING]")

         for page_num_0_based in range(doc.page_count):
             page_num = page_num_0_based + 1
             print(f"  -> Processing Page {page_num}/{doc.page_count}...")

         doc.close()

         if all_pages_data:
             try:
                 with open(preprocessed_json_path, 'w') as f:

         return preprocessed_json_path

 # ============================================================================
 # --- PHASE 2: LAYOUTLMV3 INFERENCE FUNCTIONS ---
 # ============================================================================