heerjtdev committed on
Commit
9a2f423
·
verified ·
1 Parent(s): 4732985

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +271 -15
working_yolo_pipeline.py CHANGED
@@ -1747,7 +1747,257 @@ def post_process_json_with_inference(json_data, classifier):
1747
  # final_output.append(data_item)
1748
 
1749
  # return final_output, page_separator_x
1750
- #=================================================================================================================================================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1751
 
1752
 
1753
 
@@ -1771,7 +2021,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1771
  return None, None
1772
 
1773
  # ====================================================================
1774
- # --- STEP 1: YOLO DETECTION ---
1775
  # ====================================================================
1776
  start_time_yolo = time.time()
1777
  # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
@@ -1779,7 +2029,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1779
 
1780
  relevant_detections = []
1781
 
1782
- # FIX 1: Use .data.tolist() to preserve float coordinates (matches feedback.py)
1783
  if results and results[0].boxes:
1784
  for box in results[0].boxes.data.tolist():
1785
  x1, y1, x2, y2, conf, cls_id = box
@@ -1835,21 +2085,21 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1835
  print(" -> Single Column Layout Confirmed.")
1836
 
1837
  # ====================================================================
1838
- # --- STEP 4: COMPONENT EXTRACTION (MODIFIED - Store without ID) ---
1839
  # ====================================================================
1840
  start_time_components = time.time()
1841
  component_metadata = []
1842
 
1843
  for detection in merged_detections:
1844
- # FIX 3: Cast float coordinates to int HERE for numpy array slicing
1845
  x1, y1, x2, y2 = map(int, detection['coords'])
1846
  class_name = detection['class']
1847
-
1848
  # Ensure coordinates are within image bounds
1849
  h, w = original_img.shape[:2]
1850
  x1, y1 = max(0, x1), max(0, y1)
1851
  x2, y2 = min(w, x2), min(h, y2)
1852
-
1853
  # DON'T assign global IDs here - just store the type and coordinates
1854
  component_crop = original_img[y1:y2, x1:x2]
1855
 
@@ -1957,7 +2207,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1957
  items_to_sort.extend(component_metadata)
1958
 
1959
  # ====================================================================
1960
- # --- STEP 7: LINE-BASED SORTING ---
1961
  # ====================================================================
1962
  items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
1963
  lines = []
@@ -1970,13 +2220,10 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1970
  line.append(item)
1971
  placed = True
1972
  break
1973
- if not placed and item['type'] in ['equation', 'figure']:
1974
- for line in lines:
1975
- y_ref = min(it['y0'] for it in line)
1976
- if abs(y_ref - item['y0']) < 20:
1977
- line.append(item)
1978
- placed = True
1979
- break
1980
  if not placed:
1981
  lines.append([item])
1982
 
@@ -1996,6 +2243,15 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1996
 
1997
 
1998
 
 
 
 
 
 
 
 
 
 
1999
  def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
2000
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
2001
 
 
1747
  # final_output.append(data_item)
1748
 
1749
  # return final_output, page_separator_x
1750
+ # #=================================================================================================================================================================================================
1751
+
1752
+
1753
+
1754
+ # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1755
+ # page_num: int, fitz_page: fitz.Page,
1756
+ # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
1757
+ # """
1758
+ # OPTIMIZED FLOW - MODIFIED FOR CORRECT ORDERING:
1759
+ # 1. Run YOLO to find Equations/Tables.
1760
+ # 2. Store detections with page_num but DON'T assign global IDs yet
1761
+ # 3. Mask raw text with YOLO boxes.
1762
+ # 4. Run Column Detection on the MASKED data.
1763
+ # 5. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
1764
+ # """
1765
+ # # NOTE: Removed global counter increments from here
1766
+
1767
+ # start_time_total = time.time()
1768
+
1769
+ # if original_img is None:
1770
+ # print(f" ❌ Invalid image for page {page_num}.")
1771
+ # return None, None
1772
+
1773
+ # # ====================================================================
1774
+ # # --- STEP 1: YOLO DETECTION ---
1775
+ # # ====================================================================
1776
+ # start_time_yolo = time.time()
1777
+ # # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
1778
+ # results = model.predict(source=original_img, conf=CONF_THRESHOLD, verbose=False)
1779
+
1780
+ # relevant_detections = []
1781
+
1782
+ # # FIX 1: Use .data.tolist() to preserve float coordinates (matches feedback.py)
1783
+ # if results and results[0].boxes:
1784
+ # for box in results[0].boxes.data.tolist():
1785
+ # x1, y1, x2, y2, conf, cls_id = box
1786
+ # class_name = model.names[int(cls_id)]
1787
+ # if class_name in TARGET_CLASSES:
1788
+ # relevant_detections.append(
1789
+ # {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': conf}
1790
+ # )
1791
+
1792
+ # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1793
+
1794
+ # # FIX 2: Add the missing filter_nested_boxes step (matches feedback.py)
1795
+ # merged_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
1796
+
1797
+ # print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1798
+
1799
+ # # ====================================================================
1800
+ # # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
1801
+ # # ====================================================================
1802
+ # raw_words_for_layout = get_word_data_for_detection(
1803
+ # fitz_page, pdf_path, page_num,
1804
+ # top_margin_percent=0.10, bottom_margin_percent=0.10
1805
+ # )
1806
+
1807
+ # masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
1808
+
1809
+ # # ====================================================================
1810
+ # # --- STEP 3: COLUMN DETECTION ---
1811
+ # # ====================================================================
1812
+ # page_width_pdf = fitz_page.rect.width
1813
+ # page_height_pdf = fitz_page.rect.height
1814
+
1815
+ # column_detection_params = {
1816
+ # 'cluster_bin_size': 2, 'cluster_smoothing': 2,
1817
+ # 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
1818
+ # }
1819
+
1820
+ # separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
1821
+
1822
+ # page_separator_x = None
1823
+ # if separators:
1824
+ # central_min = page_width_pdf * 0.35
1825
+ # central_max = page_width_pdf * 0.65
1826
+ # central_separators = [s for s in separators if central_min <= s <= central_max]
1827
+
1828
+ # if central_separators:
1829
+ # center_x = page_width_pdf / 2
1830
+ # page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
1831
+ # print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
1832
+ # else:
1833
+ # print(" ⚠️ Gutter found off-center. Ignoring.")
1834
+ # else:
1835
+ # print(" -> Single Column Layout Confirmed.")
1836
+
1837
+ # # ====================================================================
1838
+ # # --- STEP 4: COMPONENT EXTRACTION (MODIFIED - Store without ID) ---
1839
+ # # ====================================================================
1840
+ # start_time_components = time.time()
1841
+ # component_metadata = []
1842
+
1843
+ # for detection in merged_detections:
1844
+ # # FIX 3: Cast float coordinates to int HERE for numpy array slicing
1845
+ # x1, y1, x2, y2 = map(int, detection['coords'])
1846
+ # class_name = detection['class']
1847
+
1848
+ # # Ensure coordinates are within image bounds
1849
+ # h, w = original_img.shape[:2]
1850
+ # x1, y1 = max(0, x1), max(0, y1)
1851
+ # x2, y2 = min(w, x2), min(h, y2)
1852
+
1853
+ # # DON'T assign global IDs here - just store the type and coordinates
1854
+ # component_crop = original_img[y1:y2, x1:x2]
1855
+
1856
+ # # Store image temporarily with page and position info in filename
1857
+ # temp_filename = f"{pdf_name}_page{page_num}_{class_name}_y{y1}.png"
1858
+ # temp_filepath = os.path.join(FIGURE_EXTRACTION_DIR, temp_filename)
1859
+ # cv2.imwrite(temp_filepath, component_crop)
1860
+
1861
+ # y_midpoint = (y1 + y2) // 2
1862
+ # component_metadata.append({
1863
+ # 'type': class_name,
1864
+ # 'word': f"TEMP_{class_name.upper()}_PAGE{page_num}_Y{y1}", # Temporary placeholder
1865
+ # 'bbox': [x1, y1, x2, y2],
1866
+ # 'y0': int(y_midpoint),
1867
+ # 'x0': int(x1),
1868
+ # 'page_num': page_num, # CRITICAL: Store page number
1869
+ # 'temp_filepath': temp_filepath # Store temp filepath for later renaming
1870
+ # })
1871
+
1872
+ # # ====================================================================
1873
+ # # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
1874
+ # # ====================================================================
1875
+ # raw_ocr_output = []
1876
+ # scale_factor = 2.0
1877
+
1878
+ # try:
1879
+ # raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
1880
+ # except Exception as e:
1881
+ # print(f" ❌ Native text extraction failed: {e}")
1882
+
1883
+ # if not raw_ocr_output:
1884
+ # if _ocr_cache.has_ocr(pdf_path, page_num):
1885
+ # print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
1886
+ # cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
1887
+ # for word_tuple in cached_word_data:
1888
+ # word_text, x1, y1, x2, y2 = word_tuple
1889
+ # x1_pix = int(x1 * scale_factor)
1890
+ # y1_pix = int(y1 * scale_factor)
1891
+ # x2_pix = int(x2 * scale_factor)
1892
+ # y2_pix = int(y2 * scale_factor)
1893
+
1894
+ # raw_ocr_output.append({
1895
+ # 'type': 'text', 'word': word_text, 'confidence': 95.0,
1896
+ # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
1897
+ # 'y0': y1_pix, 'x0': x1_pix
1898
+ # })
1899
+ # else:
1900
+ # try:
1901
+ # ocr_zoom = 4.0
1902
+ # pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1903
+ # img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1904
+ # pix_ocr.n)
1905
+ # if pix_ocr.n == 3:
1906
+ # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1907
+ # elif pix_ocr.n == 4:
1908
+ # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1909
+
1910
+ # processed_img = preprocess_image_for_ocr(img_ocr_np)
1911
+ # custom_config = r'--oem 3 --psm 6'
1912
+ # hocr_data = pytesseract.image_to_data(
1913
+ # processed_img,
1914
+ # output_type=pytesseract.Output.DICT,
1915
+ # config=custom_config
1916
+ # )
1917
+
1918
+ # for i in range(len(hocr_data['level'])):
1919
+ # text = hocr_data['text'][i]
1920
+ # cleaned_text = sanitize_text(text).strip()
1921
+
1922
+ # if cleaned_text and hocr_data['conf'][i] > -1:
1923
+ # scale_adjustment = scale_factor / ocr_zoom
1924
+ # x1 = int(hocr_data['left'][i] * scale_adjustment)
1925
+ # y1 = int(hocr_data['top'][i] * scale_adjustment)
1926
+ # w = int(hocr_data['width'][i] * scale_adjustment)
1927
+ # h = int(hocr_data['height'][i] * scale_adjustment)
1928
+ # x2 = x1 + w
1929
+ # y2 = y1 + h
1930
+
1931
+ # raw_ocr_output.append({
1932
+ # 'type': 'text',
1933
+ # 'word': cleaned_text,
1934
+ # 'confidence': float(hocr_data['conf'][i]),
1935
+ # 'bbox': [x1, y1, x2, y2],
1936
+ # 'y0': y1,
1937
+ # 'x0': x1
1938
+ # })
1939
+ # except Exception as e:
1940
+ # print(f" ❌ Tesseract OCR Error: {e}")
1941
+
1942
+ # # ====================================================================
1943
+ # # --- STEP 6: OCR CLEANING AND MERGING ---
1944
+ # # ====================================================================
1945
+ # items_to_sort = []
1946
+
1947
+ # for ocr_word in raw_ocr_output:
1948
+ # is_suppressed = False
1949
+ # for component in component_metadata:
1950
+ # ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
1951
+ # if ioa > IOA_SUPPRESSION_THRESHOLD:
1952
+ # is_suppressed = True
1953
+ # break
1954
+ # if not is_suppressed:
1955
+ # items_to_sort.append(ocr_word)
1956
+
1957
+ # items_to_sort.extend(component_metadata)
1958
+
1959
+ # # ====================================================================
1960
+ # # --- STEP 7: LINE-BASED SORTING ---
1961
+ # # ====================================================================
1962
+ # items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
1963
+ # lines = []
1964
+
1965
+ # for item in items_to_sort:
1966
+ # placed = False
1967
+ # for line in lines:
1968
+ # y_ref = min(it['y0'] for it in line)
1969
+ # if abs(y_ref - item['y0']) < LINE_TOLERANCE:
1970
+ # line.append(item)
1971
+ # placed = True
1972
+ # break
1973
+ # if not placed and item['type'] in ['equation', 'figure']:
1974
+ # for line in lines:
1975
+ # y_ref = min(it['y0'] for it in line)
1976
+ # if abs(y_ref - item['y0']) < 20:
1977
+ # line.append(item)
1978
+ # placed = True
1979
+ # break
1980
+ # if not placed:
1981
+ # lines.append([item])
1982
+
1983
+ # for line in lines:
1984
+ # line.sort(key=lambda x: x['x0'])
1985
+
1986
+ # final_output = []
1987
+ # for line in lines:
1988
+ # for item in line:
1989
+ # data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
1990
+ # if 'tag' in item: data_item['tag'] = item['tag']
1991
+ # if 'page_num' in item: data_item['page_num'] = item['page_num']
1992
+ # if 'temp_filepath' in item: data_item['temp_filepath'] = item['temp_filepath']
1993
+ # final_output.append(data_item)
1994
+
1995
+ # return final_output, page_separator_x
1996
+
1997
+
1998
+
1999
+
2000
+
2001
 
2002
 
2003
 
 
2021
  return None, None
2022
 
2023
  # ====================================================================
2024
+ # --- STEP 1: YOLO DETECTION (FIXED) ---
2025
  # ====================================================================
2026
  start_time_yolo = time.time()
2027
  # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
 
2029
 
2030
  relevant_detections = []
2031
 
2032
+ # FIX 1: Use .data.tolist() to preserve float coordinates for merging/filtering (matches feedback.py)
2033
  if results and results[0].boxes:
2034
  for box in results[0].boxes.data.tolist():
2035
  x1, y1, x2, y2, conf, cls_id = box
 
2085
  print(" -> Single Column Layout Confirmed.")
2086
 
2087
  # ====================================================================
2088
+ # --- STEP 4: COMPONENT EXTRACTION ---
2089
  # ====================================================================
2090
  start_time_components = time.time()
2091
  component_metadata = []
2092
 
2093
  for detection in merged_detections:
2094
+ # Cast float coordinates to int HERE for numpy array slicing (cropping)
2095
  x1, y1, x2, y2 = map(int, detection['coords'])
2096
  class_name = detection['class']
2097
+
2098
  # Ensure coordinates are within image bounds
2099
  h, w = original_img.shape[:2]
2100
  x1, y1 = max(0, x1), max(0, y1)
2101
  x2, y2 = min(w, x2), min(h, y2)
2102
+
2103
  # DON'T assign global IDs here - just store the type and coordinates
2104
  component_crop = original_img[y1:y2, x1:x2]
2105
 
 
2207
  items_to_sort.extend(component_metadata)
2208
 
2209
  # ====================================================================
2210
+ # --- STEP 7: LINE-BASED SORTING (FIXED) ---
2211
  # ====================================================================
2212
  items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
2213
  lines = []
 
2220
  line.append(item)
2221
  placed = True
2222
  break
2223
+
2224
+ # FIX: Removed the second line-merging pass for equations/figures, which used a
2225
+ # hard-coded tolerance of 20 instead of LINE_TOLERANCE, to enforce strict vertical sorting.
2226
+
 
 
 
2227
  if not placed:
2228
  lines.append([item])
2229
 
 
2243
 
2244
 
2245
 
2246
+
2247
+
2248
+
2249
+
2250
+
2251
+
2252
+
2253
+
2254
+
2255
  def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
2256
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
2257