heerjtdev committed on
Commit
ec35101
·
verified ·
1 Parent(s): dfb3e55

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +692 -295
working_yolo_pipeline.py CHANGED
@@ -974,275 +974,6 @@ def post_process_json_with_inference(json_data, classifier):
974
 
975
 
976
 
977
- def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
978
- page_num: int, fitz_page: fitz.Page,
979
- pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
980
- """
981
- OPTIMIZED FLOW:
982
- 1. Run YOLO to find Equations/Tables.
983
- 2. Mask raw text with YOLO boxes.
984
- 3. Run Column Detection on the MASKED data.
985
- 4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
986
- """
987
- global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
988
-
989
- start_time_total = time.time()
990
-
991
- if original_img is None:
992
- print(f" ❌ Invalid image for page {page_num}.")
993
- return None, None
994
-
995
- # ====================================================================
996
- # --- STEP 1: YOLO DETECTION ---
997
- # ====================================================================
998
- start_time_yolo = time.time()
999
- results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
1000
-
1001
- relevant_detections = []
1002
- if results and results[0].boxes:
1003
- for box in results[0].boxes:
1004
- class_id = int(box.cls[0])
1005
- class_name = model.names[class_id]
1006
- if class_name in TARGET_CLASSES:
1007
- x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
1008
- relevant_detections.append(
1009
- {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
1010
- )
1011
-
1012
- merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1013
- print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1014
-
1015
- # ====================================================================
1016
- # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
1017
- # ====================================================================
1018
- # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
1019
- raw_words_for_layout = get_word_data_for_detection(
1020
- fitz_page, pdf_path, page_num,
1021
- top_margin_percent=0.10, bottom_margin_percent=0.10
1022
- )
1023
-
1024
- masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
1025
-
1026
- # ====================================================================
1027
- # --- STEP 3: COLUMN DETECTION ---
1028
- # ====================================================================
1029
- page_width_pdf = fitz_page.rect.width
1030
- page_height_pdf = fitz_page.rect.height
1031
-
1032
- column_detection_params = {
1033
- 'cluster_bin_size': 2, 'cluster_smoothing': 2,
1034
- 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
1035
- }
1036
-
1037
- separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
1038
-
1039
- page_separator_x = None
1040
- if separators:
1041
- central_min = page_width_pdf * 0.35
1042
- central_max = page_width_pdf * 0.65
1043
- central_separators = [s for s in separators if central_min <= s <= central_max]
1044
-
1045
- if central_separators:
1046
- center_x = page_width_pdf / 2
1047
- page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
1048
- print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
1049
- else:
1050
- print(" ⚠️ Gutter found off-center. Ignoring.")
1051
- else:
1052
- print(" -> Single Column Layout Confirmed.")
1053
-
1054
- # ====================================================================
1055
- # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
1056
- # ====================================================================
1057
- start_time_components = time.time()
1058
- component_metadata = []
1059
- fig_count_page = 0
1060
- eq_count_page = 0
1061
-
1062
- for detection in merged_detections:
1063
- x1, y1, x2, y2 = detection['coords']
1064
- class_name = detection['class']
1065
-
1066
- if class_name == 'figure':
1067
- GLOBAL_FIGURE_COUNT += 1
1068
- counter = GLOBAL_FIGURE_COUNT
1069
- component_word = f"FIGURE{counter}"
1070
- fig_count_page += 1
1071
- elif class_name == 'equation':
1072
- GLOBAL_EQUATION_COUNT += 1
1073
- counter = GLOBAL_EQUATION_COUNT
1074
- component_word = f"EQUATION{counter}"
1075
- eq_count_page += 1
1076
- else:
1077
- continue
1078
-
1079
- component_crop = original_img[y1:y2, x1:x2]
1080
- component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
1081
- cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
1082
-
1083
- y_midpoint = (y1 + y2) // 2
1084
- component_metadata.append({
1085
- 'type': class_name, 'word': component_word,
1086
- 'bbox': [int(x1), int(y1), int(x2), int(y2)],
1087
- 'y0': int(y_midpoint), 'x0': int(x1)
1088
- })
1089
-
1090
- # ====================================================================
1091
- # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
1092
- # ====================================================================
1093
- raw_ocr_output = []
1094
- scale_factor = 2.0 # Pipeline standard scale
1095
-
1096
- try:
1097
- # Try getting native text first
1098
- # NOTE: extract_native_words_and_convert MUST ALSO BE UPDATED TO USE sanitize_text
1099
- raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
1100
- except Exception as e:
1101
- print(f" ❌ Native text extraction failed: {e}")
1102
-
1103
- # If native text is missing, fall back to OCR
1104
- if not raw_ocr_output:
1105
- if _ocr_cache.has_ocr(pdf_path, page_num):
1106
- print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
1107
- cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
1108
- for word_tuple in cached_word_data:
1109
- word_text, x1, y1, x2, y2 = word_tuple
1110
-
1111
- # Scale from PDF points to Pipeline Pixels (2.0)
1112
- x1_pix = int(x1 * scale_factor)
1113
- y1_pix = int(y1 * scale_factor)
1114
- x2_pix = int(x2 * scale_factor)
1115
- y2_pix = int(y2 * scale_factor)
1116
-
1117
- raw_ocr_output.append({
1118
- 'type': 'text', 'word': word_text, 'confidence': 95.0,
1119
- 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
1120
- 'y0': y1_pix, 'x0': x1_pix
1121
- })
1122
- else:
1123
- # === START OF OPTIMIZED OCR BLOCK ===
1124
- try:
1125
- # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
1126
- ocr_zoom = 4.0
1127
- pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1128
-
1129
- # Convert PyMuPDF Pixmap to OpenCV format
1130
- img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1131
- pix_ocr.n)
1132
- if pix_ocr.n == 3:
1133
- img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1134
- elif pix_ocr.n == 4:
1135
- img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1136
-
1137
- # 2. Preprocess (Binarization)
1138
- processed_img = preprocess_image_for_ocr(img_ocr_np)
1139
-
1140
- # 3. Run Tesseract with Optimized Configuration
1141
- custom_config = r'--oem 3 --psm 6'
1142
-
1143
- hocr_data = pytesseract.image_to_data(
1144
- processed_img,
1145
- output_type=pytesseract.Output.DICT,
1146
- config=custom_config
1147
- )
1148
-
1149
- for i in range(len(hocr_data['level'])):
1150
- text = hocr_data['text'][i] # Retrieve raw Tesseract text
1151
-
1152
- # --- FIX: SANITIZE TEXT AND THEN STRIP ---
1153
- cleaned_text = sanitize_text(text).strip()
1154
-
1155
- if cleaned_text and hocr_data['conf'][i] > -1:
1156
- # 4. Coordinate Mapping
1157
- scale_adjustment = scale_factor / ocr_zoom
1158
-
1159
- x1 = int(hocr_data['left'][i] * scale_adjustment)
1160
- y1 = int(hocr_data['top'][i] * scale_adjustment)
1161
- w = int(hocr_data['width'][i] * scale_adjustment)
1162
- h = int(hocr_data['height'][i] * scale_adjustment)
1163
- x2 = x1 + w
1164
- y2 = y1 + h
1165
-
1166
- raw_ocr_output.append({
1167
- 'type': 'text',
1168
- 'word': cleaned_text, # Use the sanitized word
1169
- 'confidence': float(hocr_data['conf'][i]),
1170
- 'bbox': [x1, y1, x2, y2],
1171
- 'y0': y1,
1172
- 'x0': x1
1173
- })
1174
- except Exception as e:
1175
- print(f" ❌ Tesseract OCR Error: {e}")
1176
- # === END OF OPTIMIZED OCR BLOCK ===
1177
-
1178
- # ====================================================================
1179
- # --- STEP 6: OCR CLEANING AND MERGING ---
1180
- # ====================================================================
1181
- items_to_sort = []
1182
-
1183
- for ocr_word in raw_ocr_output:
1184
- is_suppressed = False
1185
- for component in component_metadata:
1186
- # Do not include words that are inside figure/equation boxes
1187
- ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
1188
- if ioa > IOA_SUPPRESSION_THRESHOLD:
1189
- is_suppressed = True
1190
- break
1191
- if not is_suppressed:
1192
- items_to_sort.append(ocr_word)
1193
-
1194
- # Add figures/equations back into the flow as "words"
1195
- items_to_sort.extend(component_metadata)
1196
-
1197
- # ====================================================================
1198
- # --- STEP 7: LINE-BASED SORTING ---
1199
- # ====================================================================
1200
- items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
1201
- lines = []
1202
-
1203
- for item in items_to_sort:
1204
- placed = False
1205
- for line in lines:
1206
- y_ref = min(it['y0'] for it in line)
1207
- if abs(y_ref - item['y0']) < LINE_TOLERANCE:
1208
- line.append(item)
1209
- placed = True
1210
- break
1211
- if not placed and item['type'] in ['equation', 'figure']:
1212
- for line in lines:
1213
- y_ref = min(it['y0'] for it in line)
1214
- if abs(y_ref - item['y0']) < 20:
1215
- line.append(item)
1216
- placed = True
1217
- break
1218
- if not placed:
1219
- lines.append([item])
1220
-
1221
- for line in lines:
1222
- line.sort(key=lambda x: x['x0'])
1223
-
1224
- final_output = []
1225
- for line in lines:
1226
- for item in line:
1227
- data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
1228
- if 'tag' in item: data_item['tag'] = item['tag']
1229
- final_output.append(data_item)
1230
-
1231
- return final_output, page_separator_x
1232
-
1233
-
1234
-
1235
-
1236
-
1237
-
1238
-
1239
-
1240
-
1241
-
1242
-
1243
-
1244
-
1245
-
1246
  # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1247
  # page_num: int, fitz_page: fitz.Page,
1248
  # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
@@ -1415,21 +1146,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1415
  # config=custom_config
1416
  # )
1417
 
1418
- # # ==============================================================================
1419
- # # --- DEBUGGING BLOCK: CHECK FIRST 50 OCR WORDS ---
1420
- # # ==============================================================================
1421
- # print(f"\n[DEBUG] Tesseract OCR Fallback (Page {page_num}): Checking first 50 words...")
1422
- # debug_count = 0
1423
- # for i in range(len(hocr_data['level'])):
1424
- # text = hocr_data['text'][i].strip()
1425
- # if text:
1426
- # unicode_points = [f"\\u{ord(c):04x}" for c in text]
1427
- # print(f" OCR Word {debug_count}: '{text}' -> Codes: {unicode_points}")
1428
- # debug_count += 1
1429
- # if debug_count >= 50: break
1430
- # print("----------------------------------------------------------------------\n")
1431
- # # ==============================================================================
1432
-
1433
  # for i in range(len(hocr_data['level'])):
1434
  # text = hocr_data['text'][i] # Retrieve raw Tesseract text
1435
 
@@ -1514,6 +1230,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1514
 
1515
  # return final_output, page_separator_x
1516
 
 
1517
 
1518
 
1519
 
@@ -1521,20 +1238,536 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1521
 
1522
 
1523
 
1524
- def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
1525
- global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1526
 
1527
- GLOBAL_FIGURE_COUNT = 0
1528
- GLOBAL_EQUATION_COUNT = 0
1529
- _ocr_cache.clear()
1530
 
1531
- print("\n" + "=" * 80)
1532
- print("--- 1. STARTING OPTIMIZED YOLO/OCR PREPROCESSING PIPELINE ---")
1533
- print("=" * 80)
1534
 
1535
- if not os.path.exists(pdf_path):
1536
- print(f"❌ FATAL ERROR: Input PDF not found at {pdf_path}.")
1537
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1538
 
1539
  os.makedirs(os.path.dirname(preprocessed_json_path), exist_ok=True)
1540
  os.makedirs(FIGURE_EXTRACTION_DIR, exist_ok=True)
@@ -1555,6 +1788,7 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
1555
 
1556
  print("\n[STEP 1.2: ITERATING PAGES - IN-MEMORY PROCESSING]")
1557
 
 
1558
  for page_num_0_based in range(doc.page_count):
1559
  page_num = page_num_0_based + 1
1560
  print(f" -> Processing Page {page_num}/{doc.page_count}...")
@@ -1590,6 +1824,78 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
1590
 
1591
  doc.close()
1592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1593
  if all_pages_data:
1594
  try:
1595
  with open(preprocessed_json_path, 'w') as f:
@@ -1609,6 +1915,97 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
1609
  return preprocessed_json_path
1610
 
1611
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1612
  # ============================================================================
1613
  # --- PHASE 2: LAYOUTLMV3 INFERENCE FUNCTIONS ---
1614
  # ============================================================================
 
974
 
975
 
976
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
977
  # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
978
  # page_num: int, fitz_page: fitz.Page,
979
  # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
 
1146
  # config=custom_config
1147
  # )
1148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1149
  # for i in range(len(hocr_data['level'])):
1150
  # text = hocr_data['text'][i] # Retrieve raw Tesseract text
1151
 
 
1230
 
1231
  # return final_output, page_separator_x
1232
 
1233
+ #=============================================================================================================================================================================
1234
 
1235
 
1236
 
 
1238
 
1239
 
1240
 
 
 
1241
 
 
 
 
1242
 
 
 
 
1243
 
1244
+
1245
+
1246
+ # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1247
+ # page_num: int, fitz_page: fitz.Page,
1248
+ # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
1249
+ # """
1250
+ # OPTIMIZED FLOW:
1251
+ # 1. Run YOLO to find Equations/Tables.
1252
+ # 2. Mask raw text with YOLO boxes.
1253
+ # 3. Run Column Detection on the MASKED data.
1254
+ # 4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
1255
+ # """
1256
+ # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1257
+
1258
+ # start_time_total = time.time()
1259
+
1260
+ # if original_img is None:
1261
+ # print(f" ❌ Invalid image for page {page_num}.")
1262
+ # return None, None
1263
+
1264
+ # # ====================================================================
1265
+ # # --- STEP 1: YOLO DETECTION ---
1266
+ # # ====================================================================
1267
+ # start_time_yolo = time.time()
1268
+ # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
1269
+
1270
+ # relevant_detections = []
1271
+ # if results and results[0].boxes:
1272
+ # for box in results[0].boxes:
1273
+ # class_id = int(box.cls[0])
1274
+ # class_name = model.names[class_id]
1275
+ # if class_name in TARGET_CLASSES:
1276
+ # x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
1277
+ # relevant_detections.append(
1278
+ # {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
1279
+ # )
1280
+
1281
+ # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1282
+ # print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1283
+
1284
+ # # ====================================================================
1285
+ # # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
1286
+ # # ====================================================================
1287
+ # # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
1288
+ # raw_words_for_layout = get_word_data_for_detection(
1289
+ # fitz_page, pdf_path, page_num,
1290
+ # top_margin_percent=0.10, bottom_margin_percent=0.10
1291
+ # )
1292
+
1293
+ # masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
1294
+
1295
+ # # ====================================================================
1296
+ # # --- STEP 3: COLUMN DETECTION ---
1297
+ # # ====================================================================
1298
+ # page_width_pdf = fitz_page.rect.width
1299
+ # page_height_pdf = fitz_page.rect.height
1300
+
1301
+ # column_detection_params = {
1302
+ # 'cluster_bin_size': 2, 'cluster_smoothing': 2,
1303
+ # 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
1304
+ # }
1305
+
1306
+ # separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
1307
+
1308
+ # page_separator_x = None
1309
+ # if separators:
1310
+ # central_min = page_width_pdf * 0.35
1311
+ # central_max = page_width_pdf * 0.65
1312
+ # central_separators = [s for s in separators if central_min <= s <= central_max]
1313
+
1314
+ # if central_separators:
1315
+ # center_x = page_width_pdf / 2
1316
+ # page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
1317
+ # print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
1318
+ # else:
1319
+ # print(" ⚠️ Gutter found off-center. Ignoring.")
1320
+ # else:
1321
+ # print(" -> Single Column Layout Confirmed.")
1322
+
1323
+ # # ====================================================================
1324
+ # # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
1325
+ # # ====================================================================
1326
+ # start_time_components = time.time()
1327
+ # component_metadata = []
1328
+ # fig_count_page = 0
1329
+ # eq_count_page = 0
1330
+
1331
+ # for detection in merged_detections:
1332
+ # x1, y1, x2, y2 = detection['coords']
1333
+ # class_name = detection['class']
1334
+
1335
+ # if class_name == 'figure':
1336
+ # GLOBAL_FIGURE_COUNT += 1
1337
+ # counter = GLOBAL_FIGURE_COUNT
1338
+ # component_word = f"FIGURE{counter}"
1339
+ # fig_count_page += 1
1340
+ # elif class_name == 'equation':
1341
+ # GLOBAL_EQUATION_COUNT += 1
1342
+ # counter = GLOBAL_EQUATION_COUNT
1343
+ # component_word = f"EQUATION{counter}"
1344
+ # eq_count_page += 1
1345
+ # else:
1346
+ # continue
1347
+
1348
+ # component_crop = original_img[y1:y2, x1:x2]
1349
+ # component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
1350
+ # cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
1351
+
1352
+ # y_midpoint = (y1 + y2) // 2
1353
+ # component_metadata.append({
1354
+ # 'type': class_name, 'word': component_word,
1355
+ # 'bbox': [int(x1), int(y1), int(x2), int(y2)],
1356
+ # 'y0': int(y_midpoint), 'x0': int(x1)
1357
+ # })
1358
+
1359
+ # # ====================================================================
1360
+ # # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
1361
+ # # ====================================================================
1362
+ # raw_ocr_output = []
1363
+ # scale_factor = 2.0 # Pipeline standard scale
1364
+
1365
+ # try:
1366
+ # # Try getting native text first
1367
+ # # NOTE: extract_native_words_and_convert MUST ALSO BE UPDATED TO USE sanitize_text
1368
+ # raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
1369
+ # except Exception as e:
1370
+ # print(f" ❌ Native text extraction failed: {e}")
1371
+
1372
+ # # If native text is missing, fall back to OCR
1373
+ # if not raw_ocr_output:
1374
+ # if _ocr_cache.has_ocr(pdf_path, page_num):
1375
+ # print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
1376
+ # cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
1377
+ # for word_tuple in cached_word_data:
1378
+ # word_text, x1, y1, x2, y2 = word_tuple
1379
+
1380
+ # # Scale from PDF points to Pipeline Pixels (2.0)
1381
+ # x1_pix = int(x1 * scale_factor)
1382
+ # y1_pix = int(y1 * scale_factor)
1383
+ # x2_pix = int(x2 * scale_factor)
1384
+ # y2_pix = int(y2 * scale_factor)
1385
+
1386
+ # raw_ocr_output.append({
1387
+ # 'type': 'text', 'word': word_text, 'confidence': 95.0,
1388
+ # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
1389
+ # 'y0': y1_pix, 'x0': x1_pix
1390
+ # })
1391
+ # else:
1392
+ # # === START OF OPTIMIZED OCR BLOCK ===
1393
+ # try:
1394
+ # # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
1395
+ # ocr_zoom = 4.0
1396
+ # pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1397
+
1398
+ # # Convert PyMuPDF Pixmap to OpenCV format
1399
+ # img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1400
+ # pix_ocr.n)
1401
+ # if pix_ocr.n == 3:
1402
+ # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1403
+ # elif pix_ocr.n == 4:
1404
+ # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1405
+
1406
+ # # 2. Preprocess (Binarization)
1407
+ # processed_img = preprocess_image_for_ocr(img_ocr_np)
1408
+
1409
+ # # 3. Run Tesseract with Optimized Configuration
1410
+ # custom_config = r'--oem 3 --psm 6'
1411
+
1412
+ # hocr_data = pytesseract.image_to_data(
1413
+ # processed_img,
1414
+ # output_type=pytesseract.Output.DICT,
1415
+ # config=custom_config
1416
+ # )
1417
+
1418
+ # # ==============================================================================
1419
+ # # --- DEBUGGING BLOCK: CHECK FIRST 50 OCR WORDS ---
1420
+ # # ==============================================================================
1421
+ # print(f"\n[DEBUG] Tesseract OCR Fallback (Page {page_num}): Checking first 50 words...")
1422
+ # debug_count = 0
1423
+ # for i in range(len(hocr_data['level'])):
1424
+ # text = hocr_data['text'][i].strip()
1425
+ # if text:
1426
+ # unicode_points = [f"\\u{ord(c):04x}" for c in text]
1427
+ # print(f" OCR Word {debug_count}: '{text}' -> Codes: {unicode_points}")
1428
+ # debug_count += 1
1429
+ # if debug_count >= 50: break
1430
+ # print("----------------------------------------------------------------------\n")
1431
+ # # ==============================================================================
1432
+
1433
+ # for i in range(len(hocr_data['level'])):
1434
+ # text = hocr_data['text'][i] # Retrieve raw Tesseract text
1435
+
1436
+ # # --- FIX: SANITIZE TEXT AND THEN STRIP ---
1437
+ # cleaned_text = sanitize_text(text).strip()
1438
+
1439
+ # if cleaned_text and hocr_data['conf'][i] > -1:
1440
+ # # 4. Coordinate Mapping
1441
+ # scale_adjustment = scale_factor / ocr_zoom
1442
+
1443
+ # x1 = int(hocr_data['left'][i] * scale_adjustment)
1444
+ # y1 = int(hocr_data['top'][i] * scale_adjustment)
1445
+ # w = int(hocr_data['width'][i] * scale_adjustment)
1446
+ # h = int(hocr_data['height'][i] * scale_adjustment)
1447
+ # x2 = x1 + w
1448
+ # y2 = y1 + h
1449
+
1450
+ # raw_ocr_output.append({
1451
+ # 'type': 'text',
1452
+ # 'word': cleaned_text, # Use the sanitized word
1453
+ # 'confidence': float(hocr_data['conf'][i]),
1454
+ # 'bbox': [x1, y1, x2, y2],
1455
+ # 'y0': y1,
1456
+ # 'x0': x1
1457
+ # })
1458
+ # except Exception as e:
1459
+ # print(f" ❌ Tesseract OCR Error: {e}")
1460
+ # # === END OF OPTIMIZED OCR BLOCK ===
1461
+
1462
+ # # ====================================================================
1463
+ # # --- STEP 6: OCR CLEANING AND MERGING ---
1464
+ # # ====================================================================
1465
+ # items_to_sort = []
1466
+
1467
+ # for ocr_word in raw_ocr_output:
1468
+ # is_suppressed = False
1469
+ # for component in component_metadata:
1470
+ # # Do not include words that are inside figure/equation boxes
1471
+ # ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
1472
+ # if ioa > IOA_SUPPRESSION_THRESHOLD:
1473
+ # is_suppressed = True
1474
+ # break
1475
+ # if not is_suppressed:
1476
+ # items_to_sort.append(ocr_word)
1477
+
1478
+ # # Add figures/equations back into the flow as "words"
1479
+ # items_to_sort.extend(component_metadata)
1480
+
1481
+ # # ====================================================================
1482
+ # # --- STEP 7: LINE-BASED SORTING ---
1483
+ # # ====================================================================
1484
+ # items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
1485
+ # lines = []
1486
+
1487
+ # for item in items_to_sort:
1488
+ # placed = False
1489
+ # for line in lines:
1490
+ # y_ref = min(it['y0'] for it in line)
1491
+ # if abs(y_ref - item['y0']) < LINE_TOLERANCE:
1492
+ # line.append(item)
1493
+ # placed = True
1494
+ # break
1495
+ # if not placed and item['type'] in ['equation', 'figure']:
1496
+ # for line in lines:
1497
+ # y_ref = min(it['y0'] for it in line)
1498
+ # if abs(y_ref - item['y0']) < 20:
1499
+ # line.append(item)
1500
+ # placed = True
1501
+ # break
1502
+ # if not placed:
1503
+ # lines.append([item])
1504
+
1505
+ # for line in lines:
1506
+ # line.sort(key=lambda x: x['x0'])
1507
+
1508
+ # final_output = []
1509
+ # for line in lines:
1510
+ # for item in line:
1511
+ # data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
1512
+ # if 'tag' in item: data_item['tag'] = item['tag']
1513
+ # final_output.append(data_item)
1514
+
1515
+ # return final_output, page_separator_x
1516
+
1517
+
1518
+
1519
+ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1520
+ page_num: int, fitz_page: fitz.Page,
1521
+ pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
1522
+ """
1523
+ OPTIMIZED FLOW - MODIFIED FOR CORRECT ORDERING:
1524
+ 1. Run YOLO to find Equations/Tables.
1525
+ 2. Store detections with page_num but DON'T assign global IDs yet
1526
+ 3. Mask raw text with YOLO boxes.
1527
+ 4. Run Column Detection on the MASKED data.
1528
+ 5. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
1529
+ """
1530
+ # NOTE: Removed global counter increments from here
1531
+
1532
+ start_time_total = time.time()
1533
+
1534
+ if original_img is None:
1535
+ print(f" ❌ Invalid image for page {page_num}.")
1536
+ return None, None
1537
+
1538
+ # ====================================================================
1539
+ # --- STEP 1: YOLO DETECTION ---
1540
+ # ====================================================================
1541
+ start_time_yolo = time.time()
1542
+ results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
1543
+
1544
+ relevant_detections = []
1545
+ if results and results[0].boxes:
1546
+ for box in results[0].boxes:
1547
+ class_id = int(box.cls[0])
1548
+ class_name = model.names[class_id]
1549
+ if class_name in TARGET_CLASSES:
1550
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
1551
+ relevant_detections.append(
1552
+ {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
1553
+ )
1554
+
1555
+ merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1556
+ print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1557
+
1558
+ # ====================================================================
1559
+ # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
1560
+ # ====================================================================
1561
+ raw_words_for_layout = get_word_data_for_detection(
1562
+ fitz_page, pdf_path, page_num,
1563
+ top_margin_percent=0.10, bottom_margin_percent=0.10
1564
+ )
1565
+
1566
+ masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
1567
+
1568
+ # ====================================================================
1569
+ # --- STEP 3: COLUMN DETECTION ---
1570
+ # ====================================================================
1571
+ page_width_pdf = fitz_page.rect.width
1572
+ page_height_pdf = fitz_page.rect.height
1573
+
1574
+ column_detection_params = {
1575
+ 'cluster_bin_size': 2, 'cluster_smoothing': 2,
1576
+ 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
1577
+ }
1578
+
1579
+ separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
1580
+
1581
+ page_separator_x = None
1582
+ if separators:
1583
+ central_min = page_width_pdf * 0.35
1584
+ central_max = page_width_pdf * 0.65
1585
+ central_separators = [s for s in separators if central_min <= s <= central_max]
1586
+
1587
+ if central_separators:
1588
+ center_x = page_width_pdf / 2
1589
+ page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
1590
+ print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
1591
+ else:
1592
+ print(" ⚠️ Gutter found off-center. Ignoring.")
1593
+ else:
1594
+ print(" -> Single Column Layout Confirmed.")
1595
+
1596
+ # ====================================================================
1597
+ # --- STEP 4: COMPONENT EXTRACTION (MODIFIED - Store without ID) ---
1598
+ # ====================================================================
1599
+ start_time_components = time.time()
1600
+ component_metadata = []
1601
+
1602
+ for detection in merged_detections:
1603
+ x1, y1, x2, y2 = detection['coords']
1604
+ class_name = detection['class']
1605
+
1606
+ # DON'T assign global IDs here - just store the type and coordinates
1607
+ component_crop = original_img[y1:y2, x1:x2]
1608
+
1609
+ # Store image temporarily with page and position info in filename
1610
+ temp_filename = f"{pdf_name}_page{page_num}_{class_name}_y{y1}.png"
1611
+ temp_filepath = os.path.join(FIGURE_EXTRACTION_DIR, temp_filename)
1612
+ cv2.imwrite(temp_filepath, component_crop)
1613
+
1614
+ y_midpoint = (y1 + y2) // 2
1615
+ component_metadata.append({
1616
+ 'type': class_name,
1617
+ 'word': f"TEMP_{class_name.upper()}_PAGE{page_num}_Y{y1}", # Temporary placeholder
1618
+ 'bbox': [int(x1), int(y1), int(x2), int(y2)],
1619
+ 'y0': int(y_midpoint),
1620
+ 'x0': int(x1),
1621
+ 'page_num': page_num, # CRITICAL: Store page number
1622
+ 'temp_filepath': temp_filepath # Store temp filepath for later renaming
1623
+ })
1624
+
1625
+ # ====================================================================
1626
+ # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
1627
+ # ====================================================================
1628
+ raw_ocr_output = []
1629
+ scale_factor = 2.0
1630
+
1631
+ try:
1632
+ raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
1633
+ except Exception as e:
1634
+ print(f" ❌ Native text extraction failed: {e}")
1635
+
1636
+ if not raw_ocr_output:
1637
+ if _ocr_cache.has_ocr(pdf_path, page_num):
1638
+ print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
1639
+ cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
1640
+ for word_tuple in cached_word_data:
1641
+ word_text, x1, y1, x2, y2 = word_tuple
1642
+ x1_pix = int(x1 * scale_factor)
1643
+ y1_pix = int(y1 * scale_factor)
1644
+ x2_pix = int(x2 * scale_factor)
1645
+ y2_pix = int(y2 * scale_factor)
1646
+
1647
+ raw_ocr_output.append({
1648
+ 'type': 'text', 'word': word_text, 'confidence': 95.0,
1649
+ 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
1650
+ 'y0': y1_pix, 'x0': x1_pix
1651
+ })
1652
+ else:
1653
+ try:
1654
+ ocr_zoom = 4.0
1655
+ pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1656
+ img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1657
+ pix_ocr.n)
1658
+ if pix_ocr.n == 3:
1659
+ img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1660
+ elif pix_ocr.n == 4:
1661
+ img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1662
+
1663
+ processed_img = preprocess_image_for_ocr(img_ocr_np)
1664
+ custom_config = r'--oem 3 --psm 6'
1665
+ hocr_data = pytesseract.image_to_data(
1666
+ processed_img,
1667
+ output_type=pytesseract.Output.DICT,
1668
+ config=custom_config
1669
+ )
1670
+
1671
+ for i in range(len(hocr_data['level'])):
1672
+ text = hocr_data['text'][i]
1673
+ cleaned_text = sanitize_text(text).strip()
1674
+
1675
+ if cleaned_text and hocr_data['conf'][i] > -1:
1676
+ scale_adjustment = scale_factor / ocr_zoom
1677
+ x1 = int(hocr_data['left'][i] * scale_adjustment)
1678
+ y1 = int(hocr_data['top'][i] * scale_adjustment)
1679
+ w = int(hocr_data['width'][i] * scale_adjustment)
1680
+ h = int(hocr_data['height'][i] * scale_adjustment)
1681
+ x2 = x1 + w
1682
+ y2 = y1 + h
1683
+
1684
+ raw_ocr_output.append({
1685
+ 'type': 'text',
1686
+ 'word': cleaned_text,
1687
+ 'confidence': float(hocr_data['conf'][i]),
1688
+ 'bbox': [x1, y1, x2, y2],
1689
+ 'y0': y1,
1690
+ 'x0': x1
1691
+ })
1692
+ except Exception as e:
1693
+ print(f" ❌ Tesseract OCR Error: {e}")
1694
+
1695
+ # ====================================================================
1696
+ # --- STEP 6: OCR CLEANING AND MERGING ---
1697
+ # ====================================================================
1698
+ items_to_sort = []
1699
+
1700
+ for ocr_word in raw_ocr_output:
1701
+ is_suppressed = False
1702
+ for component in component_metadata:
1703
+ ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
1704
+ if ioa > IOA_SUPPRESSION_THRESHOLD:
1705
+ is_suppressed = True
1706
+ break
1707
+ if not is_suppressed:
1708
+ items_to_sort.append(ocr_word)
1709
+
1710
+ items_to_sort.extend(component_metadata)
1711
+
1712
+ # ====================================================================
1713
+ # --- STEP 7: LINE-BASED SORTING ---
1714
+ # ====================================================================
1715
+ items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
1716
+ lines = []
1717
+
1718
+ for item in items_to_sort:
1719
+ placed = False
1720
+ for line in lines:
1721
+ y_ref = min(it['y0'] for it in line)
1722
+ if abs(y_ref - item['y0']) < LINE_TOLERANCE:
1723
+ line.append(item)
1724
+ placed = True
1725
+ break
1726
+ if not placed and item['type'] in ['equation', 'figure']:
1727
+ for line in lines:
1728
+ y_ref = min(it['y0'] for it in line)
1729
+ if abs(y_ref - item['y0']) < 20:
1730
+ line.append(item)
1731
+ placed = True
1732
+ break
1733
+ if not placed:
1734
+ lines.append([item])
1735
+
1736
+ for line in lines:
1737
+ line.sort(key=lambda x: x['x0'])
1738
+
1739
+ final_output = []
1740
+ for line in lines:
1741
+ for item in line:
1742
+ data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
1743
+ if 'tag' in item: data_item['tag'] = item['tag']
1744
+ if 'page_num' in item: data_item['page_num'] = item['page_num']
1745
+ if 'temp_filepath' in item: data_item['temp_filepath'] = item['temp_filepath']
1746
+ final_output.append(data_item)
1747
+
1748
+ return final_output, page_separator_x
1749
+
1750
+
1751
+
1752
+
1753
+
1754
+
1755
+
1756
+
1757
+ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
1758
+ global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1759
+
1760
+ GLOBAL_FIGURE_COUNT = 0
1761
+ GLOBAL_EQUATION_COUNT = 0
1762
+ _ocr_cache.clear()
1763
+
1764
+ print("\n" + "=" * 80)
1765
+ print("--- 1. STARTING OPTIMIZED YOLO/OCR PREPROCESSING PIPELINE ---")
1766
+ print("=" * 80)
1767
+
1768
+ if not os.path.exists(pdf_path):
1769
+ print(f"❌ FATAL ERROR: Input PDF not found at {pdf_path}.")
1770
+ return None
1771
 
1772
  os.makedirs(os.path.dirname(preprocessed_json_path), exist_ok=True)
1773
  os.makedirs(FIGURE_EXTRACTION_DIR, exist_ok=True)
 
1788
 
1789
  print("\n[STEP 1.2: ITERATING PAGES - IN-MEMORY PROCESSING]")
1790
 
1791
+ # STEP 1: Collect all page data WITHOUT global numbering
1792
  for page_num_0_based in range(doc.page_count):
1793
  page_num = page_num_0_based + 1
1794
  print(f" -> Processing Page {page_num}/{doc.page_count}...")
 
1824
 
1825
  doc.close()
1826
 
1827
+ # ====================================================================
1828
+ # STEP 2: GLOBAL SORTING AND RENUMBERING
1829
+ # ====================================================================
1830
+ print("\n[STEP 1.3: SORTING AND RENUMBERING COMPONENTS GLOBALLY]")
1831
+
1832
+ # Collect all figure and equation items from all pages
1833
+ all_components = []
1834
+ for page_data in all_pages_data:
1835
+ for item in page_data['data']:
1836
+ if item['type'] in ['figure', 'equation']:
1837
+ all_components.append({
1838
+ 'item': item,
1839
+ 'page_num': page_data['page_number']
1840
+ })
1841
+
1842
+ # Sort by page number first, then by y-coordinate
1843
+ all_components.sort(key=lambda x: (x['page_num'], x['item']['bbox'][1]))
1844
+
1845
+ # Assign global IDs in correct order
1846
+ equation_counter = 0
1847
+ figure_counter = 0
1848
+ component_id_map = {} # Maps temp placeholder to final ID
1849
+
1850
+ for comp_data in all_components:
1851
+ item = comp_data['item']
1852
+ temp_word = item['word']
1853
+
1854
+ if item['type'] == 'equation':
1855
+ equation_counter += 1
1856
+ final_word = f"EQUATION{equation_counter}"
1857
+ component_id_map[temp_word] = final_word
1858
+
1859
+ # Rename the saved image file
1860
+ if 'temp_filepath' in item:
1861
+ old_path = item['temp_filepath']
1862
+ new_filename = f"{pdf_name}_page{comp_data['page_num']}_equation{equation_counter}.png"
1863
+ new_path = os.path.join(FIGURE_EXTRACTION_DIR, new_filename)
1864
+ if os.path.exists(old_path):
1865
+ os.rename(old_path, new_path)
1866
+
1867
+ elif item['type'] == 'figure':
1868
+ figure_counter += 1
1869
+ final_word = f"FIGURE{figure_counter}"
1870
+ component_id_map[temp_word] = final_word
1871
+
1872
+ # Rename the saved image file
1873
+ if 'temp_filepath' in item:
1874
+ old_path = item['temp_filepath']
1875
+ new_filename = f"{pdf_name}_page{comp_data['page_num']}_figure{figure_counter}.png"
1876
+ new_path = os.path.join(FIGURE_EXTRACTION_DIR, new_filename)
1877
+ if os.path.exists(old_path):
1878
+ os.rename(old_path, new_path)
1879
+
1880
+ # Update all references with final IDs
1881
+ for page_data in all_pages_data:
1882
+ for item in page_data['data']:
1883
+ if item['word'] in component_id_map:
1884
+ item['word'] = component_id_map[item['word']]
1885
+ # Clean up temporary fields
1886
+ if 'temp_filepath' in item:
1887
+ del item['temp_filepath']
1888
+ if 'page_num' in item:
1889
+ del item['page_num']
1890
+
1891
+ GLOBAL_FIGURE_COUNT = figure_counter
1892
+ GLOBAL_EQUATION_COUNT = equation_counter
1893
+
1894
+ print(f" ✅ Global numbering complete: {GLOBAL_EQUATION_COUNT} equations, {GLOBAL_FIGURE_COUNT} figures")
1895
+
1896
+ # ====================================================================
1897
+ # STEP 3: SAVE OUTPUT
1898
+ # ====================================================================
1899
  if all_pages_data:
1900
  try:
1901
  with open(preprocessed_json_path, 'w') as f:
 
1915
  return preprocessed_json_path
1916
 
1917
 
1918
+
1919
+ #==============================================================================================================================================================
1920
+
1921
+ # def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
1922
+ # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1923
+
1924
+ # GLOBAL_FIGURE_COUNT = 0
1925
+ # GLOBAL_EQUATION_COUNT = 0
1926
+ # _ocr_cache.clear()
1927
+
1928
+ # print("\n" + "=" * 80)
1929
+ # print("--- 1. STARTING OPTIMIZED YOLO/OCR PREPROCESSING PIPELINE ---")
1930
+ # print("=" * 80)
1931
+
1932
+ # if not os.path.exists(pdf_path):
1933
+ # print(f"❌ FATAL ERROR: Input PDF not found at {pdf_path}.")
1934
+ # return None
1935
+
1936
+ # os.makedirs(os.path.dirname(preprocessed_json_path), exist_ok=True)
1937
+ # os.makedirs(FIGURE_EXTRACTION_DIR, exist_ok=True)
1938
+
1939
+ # model = YOLO(WEIGHTS_PATH)
1940
+ # pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
1941
+
1942
+ # try:
1943
+ # doc = fitz.open(pdf_path)
1944
+ # print(f"✅ Opened PDF: {pdf_name} ({doc.page_count} pages)")
1945
+ # except Exception as e:
1946
+ # print(f"❌ ERROR loading PDF file: {e}")
1947
+ # return None
1948
+
1949
+ # all_pages_data = []
1950
+ # total_pages_processed = 0
1951
+ # mat = fitz.Matrix(2.0, 2.0)
1952
+
1953
+ # print("\n[STEP 1.2: ITERATING PAGES - IN-MEMORY PROCESSING]")
1954
+
1955
+ # for page_num_0_based in range(doc.page_count):
1956
+ # page_num = page_num_0_based + 1
1957
+ # print(f" -> Processing Page {page_num}/{doc.page_count}...")
1958
+
1959
+ # fitz_page = doc.load_page(page_num_0_based)
1960
+
1961
+ # try:
1962
+ # pix = fitz_page.get_pixmap(matrix=mat)
1963
+ # original_img = pixmap_to_numpy(pix)
1964
+ # except Exception as e:
1965
+ # print(f" ❌ Error converting page {page_num} to image: {e}")
1966
+ # continue
1967
+
1968
+ # final_output, page_separator_x = preprocess_and_ocr_page(
1969
+ # original_img,
1970
+ # model,
1971
+ # pdf_path,
1972
+ # page_num,
1973
+ # fitz_page,
1974
+ # pdf_name
1975
+ # )
1976
+
1977
+ # if final_output is not None:
1978
+ # page_data = {
1979
+ # "page_number": page_num,
1980
+ # "data": final_output,
1981
+ # "column_separator_x": page_separator_x
1982
+ # }
1983
+ # all_pages_data.append(page_data)
1984
+ # total_pages_processed += 1
1985
+ # else:
1986
+ # print(f" ❌ Skipped page {page_num} due to processing error.")
1987
+
1988
+ # doc.close()
1989
+
1990
+ # if all_pages_data:
1991
+ # try:
1992
+ # with open(preprocessed_json_path, 'w') as f:
1993
+ # json.dump(all_pages_data, f, indent=4)
1994
+ # print(f"\n ✅ Combined structured OCR JSON saved to: {os.path.basename(preprocessed_json_path)}")
1995
+ # except Exception as e:
1996
+ # print(f"❌ ERROR saving combined JSON output: {e}")
1997
+ # return None
1998
+ # else:
1999
+ # print("❌ WARNING: No page data generated. Halting pipeline.")
2000
+ # return None
2001
+
2002
+ # print("\n" + "=" * 80)
2003
+ # print(f"--- YOLO/OCR PREPROCESSING COMPLETE ({total_pages_processed} pages processed) ---")
2004
+ # print("=" * 80)
2005
+
2006
+ # return preprocessed_json_path
2007
+
2008
+
2009
  # ============================================================================
2010
  # --- PHASE 2: LAYOUTLMV3 INFERENCE FUNCTIONS ---
2011
  # ============================================================================