heerjtdev committed on
Commit
4732985
·
verified ·
1 Parent(s): 8e7b4fa

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +258 -17
working_yolo_pipeline.py CHANGED
@@ -1514,6 +1514,241 @@ def post_process_json_with_inference(json_data, classifier):
1514
 
1515
  # return final_output, page_separator_x
1516
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1517
 
1518
 
1519
  def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
@@ -1543,17 +1778,22 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1543
  results = model.predict(source=original_img, conf=CONF_THRESHOLD, verbose=False)
1544
 
1545
  relevant_detections = []
 
 
1546
  if results and results[0].boxes:
1547
- for box in results[0].boxes:
1548
- class_id = int(box.cls[0])
1549
- class_name = model.names[class_id]
1550
  if class_name in TARGET_CLASSES:
1551
- x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
1552
  relevant_detections.append(
1553
- {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
1554
  )
1555
 
1556
  merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
 
 
 
 
1557
  print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1558
 
1559
  # ====================================================================
@@ -1601,12 +1841,18 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1601
  component_metadata = []
1602
 
1603
  for detection in merged_detections:
1604
- x1, y1, x2, y2 = detection['coords']
 
1605
  class_name = detection['class']
1606
 
 
 
 
 
 
1607
  # DON'T assign global IDs here - just store the type and coordinates
1608
  component_crop = original_img[y1:y2, x1:x2]
1609
-
1610
  # Store image temporarily with page and position info in filename
1611
  temp_filename = f"{pdf_name}_page{page_num}_{class_name}_y{y1}.png"
1612
  temp_filepath = os.path.join(FIGURE_EXTRACTION_DIR, temp_filename)
@@ -1614,10 +1860,10 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1614
 
1615
  y_midpoint = (y1 + y2) // 2
1616
  component_metadata.append({
1617
- 'type': class_name,
1618
  'word': f"TEMP_{class_name.upper()}_PAGE{page_num}_Y{y1}", # Temporary placeholder
1619
- 'bbox': [int(x1), int(y1), int(x2), int(y2)],
1620
- 'y0': int(y_midpoint),
1621
  'x0': int(x1),
1622
  'page_num': page_num, # CRITICAL: Store page number
1623
  'temp_filepath': temp_filepath # Store temp filepath for later renaming
@@ -1672,7 +1918,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1672
  for i in range(len(hocr_data['level'])):
1673
  text = hocr_data['text'][i]
1674
  cleaned_text = sanitize_text(text).strip()
1675
-
1676
  if cleaned_text and hocr_data['conf'][i] > -1:
1677
  scale_adjustment = scale_factor / ocr_zoom
1678
  x1 = int(hocr_data['left'][i] * scale_adjustment)
@@ -1692,7 +1938,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1692
  })
1693
  except Exception as e:
1694
  print(f" ❌ Tesseract OCR Error: {e}")
1695
-
1696
  # ====================================================================
1697
  # --- STEP 6: OCR CLEANING AND MERGING ---
1698
  # ====================================================================
@@ -1750,11 +1996,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1750
 
1751
 
1752
 
1753
-
1754
-
1755
-
1756
-
1757
-
1758
  def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
1759
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1760
 
 
1514
 
1515
  # return final_output, page_separator_x
1516
 
1517
+ #==========================================================================================================================================================================================
1518
+
1519
+ # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1520
+ # page_num: int, fitz_page: fitz.Page,
1521
+ # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
1522
+ # """
1523
+ # OPTIMIZED FLOW - MODIFIED FOR CORRECT ORDERING:
1524
+ # 1. Run YOLO to find Equations/Tables.
1525
+ # 2. Store detections with page_num but DON'T assign global IDs yet
1526
+ # 3. Mask raw text with YOLO boxes.
1527
+ # 4. Run Column Detection on the MASKED data.
1528
+ # 5. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
1529
+ # """
1530
+ # # NOTE: Removed global counter increments from here
1531
+
1532
+ # start_time_total = time.time()
1533
+
1534
+ # if original_img is None:
1535
+ # print(f" ❌ Invalid image for page {page_num}.")
1536
+ # return None, None
1537
+
1538
+ # # ====================================================================
1539
+ # # --- STEP 1: YOLO DETECTION ---
1540
+ # # ====================================================================
1541
+ # start_time_yolo = time.time()
1542
+ # # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
1543
+ # results = model.predict(source=original_img, conf=CONF_THRESHOLD, verbose=False)
1544
+
1545
+ # relevant_detections = []
1546
+ # if results and results[0].boxes:
1547
+ # for box in results[0].boxes:
1548
+ # class_id = int(box.cls[0])
1549
+ # class_name = model.names[class_id]
1550
+ # if class_name in TARGET_CLASSES:
1551
+ # x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
1552
+ # relevant_detections.append(
1553
+ # {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
1554
+ # )
1555
+
1556
+ # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1557
+ # print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1558
+
1559
+ # # ====================================================================
1560
+ # # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
1561
+ # # ====================================================================
1562
+ # raw_words_for_layout = get_word_data_for_detection(
1563
+ # fitz_page, pdf_path, page_num,
1564
+ # top_margin_percent=0.10, bottom_margin_percent=0.10
1565
+ # )
1566
+
1567
+ # masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
1568
+
1569
+ # # ====================================================================
1570
+ # # --- STEP 3: COLUMN DETECTION ---
1571
+ # # ====================================================================
1572
+ # page_width_pdf = fitz_page.rect.width
1573
+ # page_height_pdf = fitz_page.rect.height
1574
+
1575
+ # column_detection_params = {
1576
+ # 'cluster_bin_size': 2, 'cluster_smoothing': 2,
1577
+ # 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
1578
+ # }
1579
+
1580
+ # separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
1581
+
1582
+ # page_separator_x = None
1583
+ # if separators:
1584
+ # central_min = page_width_pdf * 0.35
1585
+ # central_max = page_width_pdf * 0.65
1586
+ # central_separators = [s for s in separators if central_min <= s <= central_max]
1587
+
1588
+ # if central_separators:
1589
+ # center_x = page_width_pdf / 2
1590
+ # page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
1591
+ # print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
1592
+ # else:
1593
+ # print(" ⚠️ Gutter found off-center. Ignoring.")
1594
+ # else:
1595
+ # print(" -> Single Column Layout Confirmed.")
1596
+
1597
+ # # ====================================================================
1598
+ # # --- STEP 4: COMPONENT EXTRACTION (MODIFIED - Store without ID) ---
1599
+ # # ====================================================================
1600
+ # start_time_components = time.time()
1601
+ # component_metadata = []
1602
+
1603
+ # for detection in merged_detections:
1604
+ # x1, y1, x2, y2 = detection['coords']
1605
+ # class_name = detection['class']
1606
+
1607
+ # # DON'T assign global IDs here - just store the type and coordinates
1608
+ # component_crop = original_img[y1:y2, x1:x2]
1609
+
1610
+ # # Store image temporarily with page and position info in filename
1611
+ # temp_filename = f"{pdf_name}_page{page_num}_{class_name}_y{y1}.png"
1612
+ # temp_filepath = os.path.join(FIGURE_EXTRACTION_DIR, temp_filename)
1613
+ # cv2.imwrite(temp_filepath, component_crop)
1614
+
1615
+ # y_midpoint = (y1 + y2) // 2
1616
+ # component_metadata.append({
1617
+ # 'type': class_name,
1618
+ # 'word': f"TEMP_{class_name.upper()}_PAGE{page_num}_Y{y1}", # Temporary placeholder
1619
+ # 'bbox': [int(x1), int(y1), int(x2), int(y2)],
1620
+ # 'y0': int(y_midpoint),
1621
+ # 'x0': int(x1),
1622
+ # 'page_num': page_num, # CRITICAL: Store page number
1623
+ # 'temp_filepath': temp_filepath # Store temp filepath for later renaming
1624
+ # })
1625
+
1626
+ # # ====================================================================
1627
+ # # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
1628
+ # # ====================================================================
1629
+ # raw_ocr_output = []
1630
+ # scale_factor = 2.0
1631
+
1632
+ # try:
1633
+ # raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
1634
+ # except Exception as e:
1635
+ # print(f" ❌ Native text extraction failed: {e}")
1636
+
1637
+ # if not raw_ocr_output:
1638
+ # if _ocr_cache.has_ocr(pdf_path, page_num):
1639
+ # print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
1640
+ # cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
1641
+ # for word_tuple in cached_word_data:
1642
+ # word_text, x1, y1, x2, y2 = word_tuple
1643
+ # x1_pix = int(x1 * scale_factor)
1644
+ # y1_pix = int(y1 * scale_factor)
1645
+ # x2_pix = int(x2 * scale_factor)
1646
+ # y2_pix = int(y2 * scale_factor)
1647
+
1648
+ # raw_ocr_output.append({
1649
+ # 'type': 'text', 'word': word_text, 'confidence': 95.0,
1650
+ # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
1651
+ # 'y0': y1_pix, 'x0': x1_pix
1652
+ # })
1653
+ # else:
1654
+ # try:
1655
+ # ocr_zoom = 4.0
1656
+ # pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1657
+ # img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1658
+ # pix_ocr.n)
1659
+ # if pix_ocr.n == 3:
1660
+ # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1661
+ # elif pix_ocr.n == 4:
1662
+ # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1663
+
1664
+ # processed_img = preprocess_image_for_ocr(img_ocr_np)
1665
+ # custom_config = r'--oem 3 --psm 6'
1666
+ # hocr_data = pytesseract.image_to_data(
1667
+ # processed_img,
1668
+ # output_type=pytesseract.Output.DICT,
1669
+ # config=custom_config
1670
+ # )
1671
+
1672
+ # for i in range(len(hocr_data['level'])):
1673
+ # text = hocr_data['text'][i]
1674
+ # cleaned_text = sanitize_text(text).strip()
1675
+
1676
+ # if cleaned_text and hocr_data['conf'][i] > -1:
1677
+ # scale_adjustment = scale_factor / ocr_zoom
1678
+ # x1 = int(hocr_data['left'][i] * scale_adjustment)
1679
+ # y1 = int(hocr_data['top'][i] * scale_adjustment)
1680
+ # w = int(hocr_data['width'][i] * scale_adjustment)
1681
+ # h = int(hocr_data['height'][i] * scale_adjustment)
1682
+ # x2 = x1 + w
1683
+ # y2 = y1 + h
1684
+
1685
+ # raw_ocr_output.append({
1686
+ # 'type': 'text',
1687
+ # 'word': cleaned_text,
1688
+ # 'confidence': float(hocr_data['conf'][i]),
1689
+ # 'bbox': [x1, y1, x2, y2],
1690
+ # 'y0': y1,
1691
+ # 'x0': x1
1692
+ # })
1693
+ # except Exception as e:
1694
+ # print(f" ❌ Tesseract OCR Error: {e}")
1695
+
1696
+ # # ====================================================================
1697
+ # # --- STEP 6: OCR CLEANING AND MERGING ---
1698
+ # # ====================================================================
1699
+ # items_to_sort = []
1700
+
1701
+ # for ocr_word in raw_ocr_output:
1702
+ # is_suppressed = False
1703
+ # for component in component_metadata:
1704
+ # ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
1705
+ # if ioa > IOA_SUPPRESSION_THRESHOLD:
1706
+ # is_suppressed = True
1707
+ # break
1708
+ # if not is_suppressed:
1709
+ # items_to_sort.append(ocr_word)
1710
+
1711
+ # items_to_sort.extend(component_metadata)
1712
+
1713
+ # # ====================================================================
1714
+ # # --- STEP 7: LINE-BASED SORTING ---
1715
+ # # ====================================================================
1716
+ # items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
1717
+ # lines = []
1718
+
1719
+ # for item in items_to_sort:
1720
+ # placed = False
1721
+ # for line in lines:
1722
+ # y_ref = min(it['y0'] for it in line)
1723
+ # if abs(y_ref - item['y0']) < LINE_TOLERANCE:
1724
+ # line.append(item)
1725
+ # placed = True
1726
+ # break
1727
+ # if not placed and item['type'] in ['equation', 'figure']:
1728
+ # for line in lines:
1729
+ # y_ref = min(it['y0'] for it in line)
1730
+ # if abs(y_ref - item['y0']) < 20:
1731
+ # line.append(item)
1732
+ # placed = True
1733
+ # break
1734
+ # if not placed:
1735
+ # lines.append([item])
1736
+
1737
+ # for line in lines:
1738
+ # line.sort(key=lambda x: x['x0'])
1739
+
1740
+ # final_output = []
1741
+ # for line in lines:
1742
+ # for item in line:
1743
+ # data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
1744
+ # if 'tag' in item: data_item['tag'] = item['tag']
1745
+ # if 'page_num' in item: data_item['page_num'] = item['page_num']
1746
+ # if 'temp_filepath' in item: data_item['temp_filepath'] = item['temp_filepath']
1747
+ # final_output.append(data_item)
1748
+
1749
+ # return final_output, page_separator_x
1750
+ #=================================================================================================================================================================================================
1751
+
1752
 
1753
 
1754
  def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
 
1778
  results = model.predict(source=original_img, conf=CONF_THRESHOLD, verbose=False)
1779
 
1780
  relevant_detections = []
1781
+
1782
+ # FIX 1: Use .data.tolist() to preserve float coordinates (matches feedback.py)
1783
  if results and results[0].boxes:
1784
+ for box in results[0].boxes.data.tolist():
1785
+ x1, y1, x2, y2, conf, cls_id = box
1786
+ class_name = model.names[int(cls_id)]
1787
  if class_name in TARGET_CLASSES:
 
1788
  relevant_detections.append(
1789
+ {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': conf}
1790
  )
1791
 
1792
  merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1793
+
1794
+ # FIX 2: Add the missing filter_nested_boxes step (matches feedback.py)
1795
+ merged_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
1796
+
1797
  print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1798
 
1799
  # ====================================================================
 
1841
  component_metadata = []
1842
 
1843
  for detection in merged_detections:
1844
+ # FIX 3: Cast float coordinates to int HERE for numpy array slicing
1845
+ x1, y1, x2, y2 = map(int, detection['coords'])
1846
  class_name = detection['class']
1847
 
1848
+ # Ensure coordinates are within image bounds
1849
+ h, w = original_img.shape[:2]
1850
+ x1, y1 = max(0, x1), max(0, y1)
1851
+ x2, y2 = min(w, x2), min(h, y2)
1852
+
1853
  # DON'T assign global IDs here - just store the type and coordinates
1854
  component_crop = original_img[y1:y2, x1:x2]
1855
+
1856
  # Store image temporarily with page and position info in filename
1857
  temp_filename = f"{pdf_name}_page{page_num}_{class_name}_y{y1}.png"
1858
  temp_filepath = os.path.join(FIGURE_EXTRACTION_DIR, temp_filename)
 
1860
 
1861
  y_midpoint = (y1 + y2) // 2
1862
  component_metadata.append({
1863
+ 'type': class_name,
1864
  'word': f"TEMP_{class_name.upper()}_PAGE{page_num}_Y{y1}", # Temporary placeholder
1865
+ 'bbox': [x1, y1, x2, y2],
1866
+ 'y0': int(y_midpoint),
1867
  'x0': int(x1),
1868
  'page_num': page_num, # CRITICAL: Store page number
1869
  'temp_filepath': temp_filepath # Store temp filepath for later renaming
 
1918
  for i in range(len(hocr_data['level'])):
1919
  text = hocr_data['text'][i]
1920
  cleaned_text = sanitize_text(text).strip()
1921
+
1922
  if cleaned_text and hocr_data['conf'][i] > -1:
1923
  scale_adjustment = scale_factor / ocr_zoom
1924
  x1 = int(hocr_data['left'][i] * scale_adjustment)
 
1938
  })
1939
  except Exception as e:
1940
  print(f" ❌ Tesseract OCR Error: {e}")
1941
+
1942
  # ====================================================================
1943
  # --- STEP 6: OCR CLEANING AND MERGING ---
1944
  # ====================================================================
 
1996
 
1997
 
1998
 
 
 
 
 
 
1999
  def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
2000
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
2001