heerjtdev committed on
Commit
17d97ef
·
verified ·
1 Parent(s): bbc046a

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +778 -146
working_yolo_pipeline.py CHANGED
@@ -146,6 +146,60 @@ def get_latex_from_base64(base64_string: str) -> str:
146
 
147
 
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  # ============================================================================
151
  # --- CONFIGURATION AND CONSTANTS ---
@@ -586,6 +640,79 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
586
 
587
 
588
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589
 
590
 
591
 
@@ -1115,6 +1242,285 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1115
 
1116
 
1117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1118
  def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
1119
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1120
 
@@ -1572,6 +1978,299 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
1572
 
1573
 
1574
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1575
 
1576
  # ============================================================================
1577
  # --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
@@ -2058,171 +2757,104 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
2058
  # ============================================================================
2059
 
2060
 
2061
- def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
2062
- """
2063
- Wraps an image into a temporary PyMuPDF document/page safely.
2064
- Uses an in-memory buffer to bypass 'encoder pdf not available' errors.
2065
- """
2066
- # 1. Use PIL to open the image and ensure it's in RGB mode
2067
- img = Image.open(image_path).convert("RGB")
2068
-
2069
- # 2. Use a bytes buffer to save the image as a PDF via PIL's engine
2070
- pdf_stream = io.BytesIO()
2071
- img.save(pdf_stream, format="PDF")
2072
- pdf_stream.seek(0)
2073
-
2074
- # 3. Open that PDF stream with PyMuPDF
2075
- doc = fitz.open("pdf", pdf_stream.read())
2076
- return doc, doc[0]
2077
 
2078
- def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2079
- """
2080
- Modified pipeline that handles both PDFs and Images, running YOLO,
2081
- Tesseract OCR, and LayoutLMv3 inference.
2082
- """
2083
- # 1. INITIALIZE YOLO
2084
- yolo_model = YOLO(WEIGHTS_PATH)
2085
-
2086
- # 2. DETECT FILE TYPE
2087
- ext = os.path.splitext(input_path)[1].lower()
2088
- is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
2089
-
2090
- all_pages_data = []
2091
- pdf_name = os.path.basename(input_path)
2092
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2093
  try:
2094
- if is_image:
2095
- print(f"📸 Image detected: {input_path}. Processing with YOLO + Tesseract.")
2096
- doc, page = load_image_as_fitz_page(input_path)
2097
-
2098
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
2099
- img_np = pixmap_to_numpy(pix)
2100
-
2101
- page_data, _ = preprocess_and_ocr_page(
2102
- img_np, yolo_model, input_path, 0, page, pdf_name
2103
- )
2104
- if page_data:
2105
- all_pages_data.append(page_data)
2106
- doc.close()
2107
- else:
2108
- doc = fitz.open(input_path)
2109
- print(f"📄 Processing PDF: {pdf_name} ({len(doc)} pages)")
2110
- for page_index in range(len(doc)):
2111
- page = doc[page_index]
2112
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
2113
- img_np = pixmap_to_numpy(pix)
2114
-
2115
- page_data, _ = preprocess_and_ocr_page(
2116
- img_np, yolo_model, input_path, page_index, page, pdf_name
2117
- )
2118
- if page_data:
2119
- all_pages_data.append(page_data)
2120
- doc.close()
2121
 
2122
- if not all_pages_data:
2123
- print("❌ No data extracted.")
2124
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2125
 
2126
- # 3. CONSOLIDATE BLOCKS FOR INFERENCE (Safe against List vs Dict)
2127
- sequential_blocks = []
2128
- for p_data in all_pages_data:
2129
- if isinstance(p_data, dict):
2130
- blocks = p_data.get('blocks', [])
2131
- sequential_blocks.extend(blocks)
2132
- elif isinstance(p_data, list):
2133
- sequential_blocks.extend(p_data)
2134
 
2135
- # --- 4. STARTING LAYOUTLMV3 INFERENCE ---
2136
- print("\n" + "=" * 80)
2137
- print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
2138
- print("=" * 80)
2139
 
2140
- tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
2141
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
2142
-
2143
- model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
2144
-
2145
- # --- FIX: ROBUST KEY REMAPPING FOR LAYOUTLMV3 ---
2146
-
2147
- checkpoint = torch.load(layoutlmv3_model_path, map_location=device)
2148
- state_dict = checkpoint.get('model_state_dict', checkpoint)
2149
-
2150
- # Rename keys from 'layoutlm.xxx' to 'layoutlmv3.xxx' if necessary
2151
- new_state_dict = {}
2152
- for key, value in state_dict.items():
2153
- if key.startswith("layoutlm."):
2154
- new_key = key.replace("layoutlm.", "layoutlmv3.", 1)
2155
- new_state_dict[new_key] = value
2156
- else:
2157
- new_state_dict[key] = value
2158
-
2159
- # Load with strict=False to handle minor metadata differences
2160
- model.load_state_dict(new_state_dict, strict=False)
2161
- # -----------------------------------------------
2162
 
2163
- model.to(device)
2164
- model.eval()
2165
 
2166
- final_result = run_layoutlmv3_inference_on_blocks(sequential_blocks, model, tokenizer, device)
 
 
 
 
 
 
 
2167
 
2168
- # 5. POST-PROCESS CLASSIFICATION
2169
  classifier = HierarchicalClassifier()
2170
  if classifier.load_models():
2171
- final_result = post_process_json_with_inference(final_result, classifier)
2172
- print("✅ Classification complete.")
 
 
 
 
 
 
2173
 
2174
- return final_result
 
2175
 
2176
  except Exception as e:
 
2177
  import traceback
2178
  traceback.print_exc()
2179
- print(f"❌ FATAL ERROR in pipeline: {e}")
2180
  return None
2181
 
2182
-
2183
-
2184
-
2185
- # #================================================================================
2186
- # # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
2187
- # #================================================================================
2188
-
2189
- # print("\n" + "=" * 80)
2190
- # print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
2191
- # print("=" * 80)
2192
-
2193
- # # 1. Initialize and Load the Classifier
2194
- # classifier = HierarchicalClassifier()
2195
- # if classifier.load_models():
2196
- # # 2. Run Classification on the *Final* Result
2197
- # # The function modifies the list in place and returns it
2198
- # final_result = post_process_json_with_inference(
2199
- # final_result, classifier
2200
- # )
2201
- # print("✅ Classification complete. Tags added to final output.")
2202
- # else:
2203
- # print("❌ Classification model loading failed. Outputting un-tagged data.")
2204
-
2205
- # # ====================================================================
2206
-
2207
-
2208
- # except Exception as e:
2209
- # print(f"❌ FATAL ERROR: {e}")
2210
- # import traceback
2211
- # traceback.print_exc()
2212
- # return None
2213
-
2214
- # finally:
2215
- # try:
2216
- # for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
2217
- # os.remove(f)
2218
- # os.rmdir(temp_pipeline_dir)
2219
- # except Exception:
2220
- # pass
2221
-
2222
- # print("\n" + "#" * 80)
2223
- # print("### OPTIMIZED PIPELINE EXECUTION COMPLETE ###")
2224
- # print("#" * 80)
2225
- # return final_result
2226
 
2227
 
2228
 
 
146
 
147
 
148
 
149
+ # def get_latex_from_base64(base64_string: str) -> str:
150
+ # """
151
+ # Decodes a Base64 image string and uses the pre-initialized TrOCR/ORT model
152
+ # to recognize the formula. It cleans the output by removing spaces and
153
+ # crucially, replacing double backslashes with single backslashes for correct LaTeX.
154
+ # """
155
+ # if ort_model is None or processor is None:
156
+ # return "[MODEL_ERROR: Model not initialized]"
157
+
158
+ # try:
159
+ # # 1. Decode Base64 to Image
160
+ # image_data = base64.b64decode(base64_string)
161
+ # # We must ensure the image is RGB format for the model input
162
+ # image = Image.open(io.BytesIO(image_data)).convert('RGB')
163
+
164
+ # # 2. Preprocess the image
165
+ # pixel_values = processor(images=image, return_tensors="pt").pixel_values
166
+
167
+ # # 3. Text Generation (OCR)
168
+ # generated_ids = ort_model.generate(pixel_values)
169
+ # raw_generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
170
+
171
+ # if not raw_generated_text:
172
+ # return "[OCR_WARNING: No formula found]"
173
+
174
+ # latex_string = raw_generated_text[0]
175
+
176
+ # # ==============================================================================
177
+ # # --- DEBUGGING BLOCK: CHECK TrOCR RAW OUTPUT ---
178
+ # # ==============================================================================
179
+ # print(f"[DEBUG] TrOCR Raw Output: '{latex_string}'")
180
+ # # ==============================================================================
181
+
182
+ # # --- 4. Post-processing and Cleanup ---
183
+
184
+ # # # A. Remove all spaces/line breaks
185
+ # # cleaned_latex = re.sub(r'\s+', '', latex_string)
186
+ # cleaned_latex = re.sub(r'[\r\n]+', '', latex_string)
187
+
188
+ # # B. CRITICAL FIX: Replace double backslashes (\\) with single backslashes (\).
189
+ # # This corrects model output that already over-escaped the LaTeX commands.
190
+ # # Python literal: '\\\\' is replaced with '\\'.
191
+ # #cleaned_latex = cleaned_latex.replace('\\\\', '\\')
192
+
193
+ # return cleaned_latex
194
+
195
+
196
+ # except Exception as e:
197
+ # # Catch any unexpected errors
198
+ # print(f" ❌ TR-OCR Recognition failed: {e}")
199
+ # return f"[TR_OCR_ERROR: Recognition failed: {e}]"
200
+
201
+
202
+
203
 
204
  # ============================================================================
205
  # --- CONFIGURATION AND CONSTANTS ---
 
640
 
641
 
642
 
643
+ # def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
644
+ # raw_word_data = fitz_page.get_text("words")
645
+ # converted_ocr_output = []
646
+ # DEFAULT_CONFIDENCE = 99.0
647
+
648
+ # for x1, y1, x2, y2, word, *rest in raw_word_data:
649
+ # # --- FIX: SANITIZE TEXT HERE ---
650
+ # # cleaned_word = sanitize_text(word)
651
+ # # if not cleaned_word.strip(): continue
652
+
653
+ # x1_pix = int(x1 * scale_factor)
654
+ # y1_pix = int(y1 * scale_factor)
655
+ # x2_pix = int(x2 * scale_factor)
656
+ # y2_pix = int(y2 * scale_factor)
657
+ # converted_ocr_output.append({
658
+ # 'type': 'text',
659
+ # 'word': cleaned_word, # Use the sanitized word
660
+ # 'confidence': DEFAULT_CONFIDENCE,
661
+ # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
662
+ # 'y0': y1_pix, 'x0': x1_pix
663
+ # })
664
+ # return converted_ocr_output
665
+
666
+
667
+
668
+
669
+
670
+ # def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
671
+ # raw_word_data = fitz_page.get_text("words")
672
+
673
+ # # ==============================================================================
674
+ # # --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS ---
675
+ # # ==============================================================================
676
+ # print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")
677
+ # debug_count = 0
678
+ # for item in raw_word_data:
679
+ # if debug_count >= 50: break
680
+ # # item format: (x0, y0, x1, y1, word, block_no, line_no, word_no)
681
+ # word_text = item[4]
682
+
683
+ # # Generate unicode hex codes for every character in the word
684
+ # unicode_points = [f"\\u{ord(c):04x}" for c in word_text]
685
+ # print(f" Word {debug_count}: '{word_text}' -> Codes: {unicode_points}")
686
+ # debug_count += 1
687
+ # print("----------------------------------------------------------------------\n")
688
+ # # ==============================================================================
689
+
690
+ # converted_ocr_output = []
691
+ # DEFAULT_CONFIDENCE = 99.0
692
+
693
+ # for x1, y1, x2, y2, word, *rest in raw_word_data:
694
+ # # --- FIX: SANITIZE TEXT HERE ---
695
+ # cleaned_word = sanitize_text(word)
696
+ # if not cleaned_word.strip(): continue
697
+
698
+ # x1_pix = int(x1 * scale_factor)
699
+ # y1_pix = int(y1 * scale_factor)
700
+ # x2_pix = int(x2 * scale_factor)
701
+ # y2_pix = int(y2 * scale_factor)
702
+ # converted_ocr_output.append({
703
+ # 'type': 'text',
704
+ # 'word': cleaned_word, # Use the sanitized word
705
+ # 'confidence': DEFAULT_CONFIDENCE,
706
+ # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
707
+ # 'y0': y1_pix, 'x0': x1_pix
708
+ # })
709
+ # return converted_ocr_output
710
+
711
+
712
+
713
+
714
+
715
+
716
 
717
 
718
 
 
1242
 
1243
 
1244
 
1245
+
1246
+ # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1247
+ # page_num: int, fitz_page: fitz.Page,
1248
+ # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
1249
+ # """
1250
+ # OPTIMIZED FLOW:
1251
+ # 1. Run YOLO to find Equations/Tables.
1252
+ # 2. Mask raw text with YOLO boxes.
1253
+ # 3. Run Column Detection on the MASKED data.
1254
+ # 4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
1255
+ # """
1256
+ # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1257
+
1258
+ # start_time_total = time.time()
1259
+
1260
+ # if original_img is None:
1261
+ # print(f" ❌ Invalid image for page {page_num}.")
1262
+ # return None, None
1263
+
1264
+ # # ====================================================================
1265
+ # # --- STEP 1: YOLO DETECTION ---
1266
+ # # ====================================================================
1267
+ # start_time_yolo = time.time()
1268
+ # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
1269
+
1270
+ # relevant_detections = []
1271
+ # if results and results[0].boxes:
1272
+ # for box in results[0].boxes:
1273
+ # class_id = int(box.cls[0])
1274
+ # class_name = model.names[class_id]
1275
+ # if class_name in TARGET_CLASSES:
1276
+ # x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
1277
+ # relevant_detections.append(
1278
+ # {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
1279
+ # )
1280
+
1281
+ # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1282
+ # print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1283
+
1284
+ # # ====================================================================
1285
+ # # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
1286
+ # # ====================================================================
1287
+ # # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
1288
+ # raw_words_for_layout = get_word_data_for_detection(
1289
+ # fitz_page, pdf_path, page_num,
1290
+ # top_margin_percent=0.10, bottom_margin_percent=0.10
1291
+ # )
1292
+
1293
+ # masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
1294
+
1295
+ # # ====================================================================
1296
+ # # --- STEP 3: COLUMN DETECTION ---
1297
+ # # ====================================================================
1298
+ # page_width_pdf = fitz_page.rect.width
1299
+ # page_height_pdf = fitz_page.rect.height
1300
+
1301
+ # column_detection_params = {
1302
+ # 'cluster_bin_size': 2, 'cluster_smoothing': 2,
1303
+ # 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
1304
+ # }
1305
+
1306
+ # separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
1307
+
1308
+ # page_separator_x = None
1309
+ # if separators:
1310
+ # central_min = page_width_pdf * 0.35
1311
+ # central_max = page_width_pdf * 0.65
1312
+ # central_separators = [s for s in separators if central_min <= s <= central_max]
1313
+
1314
+ # if central_separators:
1315
+ # center_x = page_width_pdf / 2
1316
+ # page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
1317
+ # print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
1318
+ # else:
1319
+ # print(" ⚠️ Gutter found off-center. Ignoring.")
1320
+ # else:
1321
+ # print(" -> Single Column Layout Confirmed.")
1322
+
1323
+ # # ====================================================================
1324
+ # # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
1325
+ # # ====================================================================
1326
+ # start_time_components = time.time()
1327
+ # component_metadata = []
1328
+ # fig_count_page = 0
1329
+ # eq_count_page = 0
1330
+
1331
+ # for detection in merged_detections:
1332
+ # x1, y1, x2, y2 = detection['coords']
1333
+ # class_name = detection['class']
1334
+
1335
+ # if class_name == 'figure':
1336
+ # GLOBAL_FIGURE_COUNT += 1
1337
+ # counter = GLOBAL_FIGURE_COUNT
1338
+ # component_word = f"FIGURE{counter}"
1339
+ # fig_count_page += 1
1340
+ # elif class_name == 'equation':
1341
+ # GLOBAL_EQUATION_COUNT += 1
1342
+ # counter = GLOBAL_EQUATION_COUNT
1343
+ # component_word = f"EQUATION{counter}"
1344
+ # eq_count_page += 1
1345
+ # else:
1346
+ # continue
1347
+
1348
+ # component_crop = original_img[y1:y2, x1:x2]
1349
+ # component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
1350
+ # cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
1351
+
1352
+ # y_midpoint = (y1 + y2) // 2
1353
+ # component_metadata.append({
1354
+ # 'type': class_name, 'word': component_word,
1355
+ # 'bbox': [int(x1), int(y1), int(x2), int(y2)],
1356
+ # 'y0': int(y_midpoint), 'x0': int(x1)
1357
+ # })
1358
+
1359
+ # # ====================================================================
1360
+ # # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
1361
+ # # ====================================================================
1362
+ # raw_ocr_output = []
1363
+ # scale_factor = 2.0 # Pipeline standard scale
1364
+
1365
+ # try:
1366
+ # # Try getting native text first
1367
+ # # NOTE: extract_native_words_and_convert MUST ALSO BE UPDATED TO USE sanitize_text
1368
+ # raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
1369
+ # except Exception as e:
1370
+ # print(f" ❌ Native text extraction failed: {e}")
1371
+
1372
+ # # If native text is missing, fall back to OCR
1373
+ # if not raw_ocr_output:
1374
+ # if _ocr_cache.has_ocr(pdf_path, page_num):
1375
+ # print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
1376
+ # cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
1377
+ # for word_tuple in cached_word_data:
1378
+ # word_text, x1, y1, x2, y2 = word_tuple
1379
+
1380
+ # # Scale from PDF points to Pipeline Pixels (2.0)
1381
+ # x1_pix = int(x1 * scale_factor)
1382
+ # y1_pix = int(y1 * scale_factor)
1383
+ # x2_pix = int(x2 * scale_factor)
1384
+ # y2_pix = int(y2 * scale_factor)
1385
+
1386
+ # raw_ocr_output.append({
1387
+ # 'type': 'text', 'word': word_text, 'confidence': 95.0,
1388
+ # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
1389
+ # 'y0': y1_pix, 'x0': x1_pix
1390
+ # })
1391
+ # else:
1392
+ # # === START OF OPTIMIZED OCR BLOCK ===
1393
+ # try:
1394
+ # # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
1395
+ # ocr_zoom = 4.0
1396
+ # pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1397
+
1398
+ # # Convert PyMuPDF Pixmap to OpenCV format
1399
+ # img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1400
+ # pix_ocr.n)
1401
+ # if pix_ocr.n == 3:
1402
+ # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1403
+ # elif pix_ocr.n == 4:
1404
+ # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1405
+
1406
+ # # 2. Preprocess (Binarization)
1407
+ # processed_img = preprocess_image_for_ocr(img_ocr_np)
1408
+
1409
+ # # 3. Run Tesseract with Optimized Configuration
1410
+ # custom_config = r'--oem 3 --psm 6'
1411
+
1412
+ # hocr_data = pytesseract.image_to_data(
1413
+ # processed_img,
1414
+ # output_type=pytesseract.Output.DICT,
1415
+ # config=custom_config
1416
+ # )
1417
+
1418
+ # # ==============================================================================
1419
+ # # --- DEBUGGING BLOCK: CHECK FIRST 50 OCR WORDS ---
1420
+ # # ==============================================================================
1421
+ # print(f"\n[DEBUG] Tesseract OCR Fallback (Page {page_num}): Checking first 50 words...")
1422
+ # debug_count = 0
1423
+ # for i in range(len(hocr_data['level'])):
1424
+ # text = hocr_data['text'][i].strip()
1425
+ # if text:
1426
+ # unicode_points = [f"\\u{ord(c):04x}" for c in text]
1427
+ # print(f" OCR Word {debug_count}: '{text}' -> Codes: {unicode_points}")
1428
+ # debug_count += 1
1429
+ # if debug_count >= 50: break
1430
+ # print("----------------------------------------------------------------------\n")
1431
+ # # ==============================================================================
1432
+
1433
+ # for i in range(len(hocr_data['level'])):
1434
+ # text = hocr_data['text'][i] # Retrieve raw Tesseract text
1435
+
1436
+ # # --- FIX: SANITIZE TEXT AND THEN STRIP ---
1437
+ # cleaned_text = sanitize_text(text).strip()
1438
+
1439
+ # if cleaned_text and hocr_data['conf'][i] > -1:
1440
+ # # 4. Coordinate Mapping
1441
+ # scale_adjustment = scale_factor / ocr_zoom
1442
+
1443
+ # x1 = int(hocr_data['left'][i] * scale_adjustment)
1444
+ # y1 = int(hocr_data['top'][i] * scale_adjustment)
1445
+ # w = int(hocr_data['width'][i] * scale_adjustment)
1446
+ # h = int(hocr_data['height'][i] * scale_adjustment)
1447
+ # x2 = x1 + w
1448
+ # y2 = y1 + h
1449
+
1450
+ # raw_ocr_output.append({
1451
+ # 'type': 'text',
1452
+ # 'word': cleaned_text, # Use the sanitized word
1453
+ # 'confidence': float(hocr_data['conf'][i]),
1454
+ # 'bbox': [x1, y1, x2, y2],
1455
+ # 'y0': y1,
1456
+ # 'x0': x1
1457
+ # })
1458
+ # except Exception as e:
1459
+ # print(f" ❌ Tesseract OCR Error: {e}")
1460
+ # # === END OF OPTIMIZED OCR BLOCK ===
1461
+
1462
+ # # ====================================================================
1463
+ # # --- STEP 6: OCR CLEANING AND MERGING ---
1464
+ # # ====================================================================
1465
+ # items_to_sort = []
1466
+
1467
+ # for ocr_word in raw_ocr_output:
1468
+ # is_suppressed = False
1469
+ # for component in component_metadata:
1470
+ # # Do not include words that are inside figure/equation boxes
1471
+ # ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
1472
+ # if ioa > IOA_SUPPRESSION_THRESHOLD:
1473
+ # is_suppressed = True
1474
+ # break
1475
+ # if not is_suppressed:
1476
+ # items_to_sort.append(ocr_word)
1477
+
1478
+ # # Add figures/equations back into the flow as "words"
1479
+ # items_to_sort.extend(component_metadata)
1480
+
1481
+ # # ====================================================================
1482
+ # # --- STEP 7: LINE-BASED SORTING ---
1483
+ # # ====================================================================
1484
+ # items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
1485
+ # lines = []
1486
+
1487
+ # for item in items_to_sort:
1488
+ # placed = False
1489
+ # for line in lines:
1490
+ # y_ref = min(it['y0'] for it in line)
1491
+ # if abs(y_ref - item['y0']) < LINE_TOLERANCE:
1492
+ # line.append(item)
1493
+ # placed = True
1494
+ # break
1495
+ # if not placed and item['type'] in ['equation', 'figure']:
1496
+ # for line in lines:
1497
+ # y_ref = min(it['y0'] for it in line)
1498
+ # if abs(y_ref - item['y0']) < 20:
1499
+ # line.append(item)
1500
+ # placed = True
1501
+ # break
1502
+ # if not placed:
1503
+ # lines.append([item])
1504
+
1505
+ # for line in lines:
1506
+ # line.sort(key=lambda x: x['x0'])
1507
+
1508
+ # final_output = []
1509
+ # for line in lines:
1510
+ # for item in line:
1511
+ # data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
1512
+ # if 'tag' in item: data_item['tag'] = item['tag']
1513
+ # final_output.append(data_item)
1514
+
1515
+ # return final_output, page_separator_x
1516
+
1517
+
1518
+
1519
+
1520
+
1521
+
1522
+
1523
+
1524
  def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
1525
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1526
 
 
1978
 
1979
 
1980
 
1981
+ # def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
1982
+ # preprocessed_json_path: str,
1983
+ # column_detection_params: Optional[Dict] = None) -> List[Dict[str, Any]]:
1984
+ # print("\n" + "=" * 80)
1985
+ # print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE (Raw Word Output) ---")
1986
+ # print("=" * 80)
1987
+
1988
+ # tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
1989
+ # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
1990
+ # print(f" -> Using device: {device}")
1991
+
1992
+ # try:
1993
+ # model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
1994
+ # checkpoint = torch.load(model_path, map_location=device)
1995
+ # model_state = checkpoint.get('model_state_dict', checkpoint)
1996
+ # # Apply patch for layoutlmv3 compatibility with saved state_dict
1997
+ # fixed_state_dict = {key.replace('layoutlm.', 'layoutlmv3.'): value for key, value in model_state.items()}
1998
+ # model.load_state_dict(fixed_state_dict)
1999
+ # model.to(device)
2000
+ # model.eval()
2001
+ # print(f"✅ LayoutLMv3 Model loaded successfully from {os.path.basename(model_path)}.")
2002
+ # except Exception as e:
2003
+ # print(f"❌ FATAL ERROR during LayoutLMv3 model loading: {e}")
2004
+ # return []
2005
+
2006
+ # try:
2007
+ # with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
2008
+ # preprocessed_data = json.load(f)
2009
+ # print(f"✅ Loaded preprocessed data with {len(preprocessed_data)} pages.")
2010
+ # except Exception:
2011
+ # print("❌ Error loading preprocessed JSON.")
2012
+ # return []
2013
+
2014
+ # try:
2015
+ # doc = fitz.open(pdf_path)
2016
+ # except Exception:
2017
+ # print("❌ Error loading PDF.")
2018
+ # return []
2019
+
2020
+ # final_page_predictions = []
2021
+ # CHUNK_SIZE = 500
2022
+
2023
+ # for page_data in preprocessed_data:
2024
+ # page_num_1_based = page_data['page_number']
2025
+ # page_num_0_based = page_num_1_based - 1
2026
+ # page_raw_predictions = []
2027
+ # print(f"\n *** Processing Page {page_num_1_based} ({len(page_data['data'])} raw tokens) ***")
2028
+
2029
+ # fitz_page = doc.load_page(page_num_0_based)
2030
+ # page_width, page_height = fitz_page.rect.width, fitz_page.rect.height
2031
+ # print(f" -> Page dimensions: {page_width:.0f}x{page_height:.0f} (PDF points).")
2032
+
2033
+ # all_token_data = []
2034
+ # scale_factor = 2.0
2035
+
2036
+ # for item in page_data['data']:
2037
+ # raw_yolo_bbox = item['bbox']
2038
+ # bbox_pdf = [
2039
+ # int(raw_yolo_bbox[0] / scale_factor), int(raw_yolo_bbox[1] / scale_factor),
2040
+ # int(raw_yolo_bbox[2] / scale_factor), int(raw_yolo_bbox[3] / scale_factor)
2041
+ # ]
2042
+ # normalized_bbox = [
2043
+ # max(0, min(1000, int(1000 * bbox_pdf[0] / page_width))),
2044
+ # max(0, min(1000, int(1000 * bbox_pdf[1] / page_height))),
2045
+ # max(0, min(1000, int(1000 * bbox_pdf[2] / page_width))),
2046
+ # max(0, min(1000, int(1000 * bbox_pdf[3] / page_height)))
2047
+ # ]
2048
+ # all_token_data.append({
2049
+ # "word": item['word'],
2050
+ # "bbox_raw_pdf_space": bbox_pdf,
2051
+ # "bbox_normalized": normalized_bbox,
2052
+ # "item_original_data": item
2053
+ # })
2054
+
2055
+ # # ==============================================================================
2056
+ # # --- DEBUGGING BLOCK: CHECK FIRST 50 TOKENS BEFORE INFERENCE ---
2057
+ # # ==============================================================================
2058
+ # print(f"\n[DEBUG] LayoutLMv3 Input (Page {page_num_1_based}): Checking first 50 tokens...")
2059
+ # debug_count = 0
2060
+ # for t in all_token_data:
2061
+ # if debug_count >= 50: break
2062
+ # w = t['word']
2063
+ # unicode_points = [f"\\u{ord(c):04x}" for c in w]
2064
+ # print(f" Token {debug_count}: '{w}' -> Codes: {unicode_points}")
2065
+ # debug_count += 1
2066
+ # print("----------------------------------------------------------------------\n")
2067
+ # # ==============================================================================
2068
+
2069
+ # if not all_token_data:
2070
+ # continue
2071
+
2072
+ # column_separator_x = page_data.get('column_separator_x', None)
2073
+ # if column_separator_x is not None:
2074
+ # print(f" -> Using SAVED column separator: X={column_separator_x}")
2075
+ # else:
2076
+ # print(" -> No column separator found. Assuming single chunk.")
2077
+
2078
+ # token_chunks = _merge_integrity(all_token_data, column_separator_x)
2079
+ # total_chunks = len(token_chunks)
2080
+
2081
+ # for chunk_idx, chunk_tokens in enumerate(token_chunks):
2082
+ # if not chunk_tokens: continue
2083
+
2084
+ # # 1. Sanitize: Convert everything to strings and aggressively clean Unicode errors.
2085
+ # chunk_words = [
2086
+ # str(t['word']).encode('utf-8', errors='ignore').decode('utf-8')
2087
+ # for t in chunk_tokens
2088
+ # ]
2089
+ # chunk_normalized_bboxes = [t['bbox_normalized'] for t in chunk_tokens]
2090
+
2091
+ # total_sub_chunks = (len(chunk_words) + CHUNK_SIZE - 1) // CHUNK_SIZE
2092
+ # for i in range(0, len(chunk_words), CHUNK_SIZE):
2093
+ # sub_chunk_idx = i // CHUNK_SIZE + 1
2094
+ # sub_words = chunk_words[i:i + CHUNK_SIZE]
2095
+ # sub_bboxes = chunk_normalized_bboxes[i:i + CHUNK_SIZE]
2096
+ # sub_tokens_data = chunk_tokens[i:i + CHUNK_SIZE]
2097
+
2098
+ # print(f" -> Chunk {chunk_idx + 1}/{total_chunks}, Sub-chunk {sub_chunk_idx}/{total_sub_chunks}: {len(sub_words)} words. Running Inference...")
2099
+
2100
+ # # 2. Manual generation of word_ids
2101
+ # manual_word_ids = []
2102
+ # for current_word_idx, word in enumerate(sub_words):
2103
+ # sub_tokens = tokenizer.tokenize(word)
2104
+ # for _ in sub_tokens:
2105
+ # manual_word_ids.append(current_word_idx)
2106
+
2107
+ # encoded_input = tokenizer(
2108
+ # sub_words,
2109
+ # boxes=sub_bboxes,
2110
+ # truncation=True,
2111
+ # padding="max_length",
2112
+ # max_length=512,
2113
+ # is_split_into_words=True,
2114
+ # return_tensors="pt"
2115
+ # )
2116
+
2117
+ # # Check for empty sequence
2118
+ # if encoded_input['input_ids'].shape[0] == 0:
2119
+ # print(f" -> Warning: Sub-chunk {sub_chunk_idx} encoded to an empty sequence. Skipping.")
2120
+ # continue
2121
+
2122
+ # # 3. Finalize word_ids based on encoded output length
2123
+ # sequence_length = int(torch.sum(encoded_input['attention_mask']).item())
2124
+ # content_token_length = max(0, sequence_length - 2)
2125
+
2126
+ # manual_word_ids = manual_word_ids[:content_token_length]
2127
+
2128
+ # final_word_ids = [None] # CLS token (index 0)
2129
+ # final_word_ids.extend(manual_word_ids)
2130
+
2131
+ # if sequence_length > 1:
2132
+ # final_word_ids.append(None) # SEP token
2133
+
2134
+ # final_word_ids.extend([None] * (512 - len(final_word_ids)))
2135
+ # word_ids = final_word_ids[:512] # Final array for mapping
2136
+
2137
+ # # Inputs are already batched by the tokenizer as [1, 512]
2138
+ # input_ids = encoded_input['input_ids'].to(device)
2139
+ # bbox = encoded_input['bbox'].to(device)
2140
+ # attention_mask = encoded_input['attention_mask'].to(device)
2141
+
2142
+ # with torch.no_grad():
2143
+ # model_outputs = model(input_ids, bbox, attention_mask)
2144
+
2145
+ # # --- Robust extraction: support several forward return types ---
2146
+ # logits_tensor = None
2147
+ # decoded_labels_list = None
2148
+
2149
+ # # case 1: tuple/list with (emissions, viterbi)
2150
+ # if isinstance(model_outputs, (tuple, list)) and len(model_outputs) == 2:
2151
+ # a, b = model_outputs
2152
+ # if isinstance(a, torch.Tensor):
2153
+ # logits_tensor = a
2154
+ # if isinstance(b, list):
2155
+ # decoded_labels_list = b
2156
+
2157
+ # # case 2: HF ModelOutput with .logits
2158
+ # if logits_tensor is None and hasattr(model_outputs, 'logits') and isinstance(model_outputs.logits, torch.Tensor):
2159
+ # logits_tensor = model_outputs.logits
2160
+
2161
+ # # case 3: tuple/list - search for a 3D tensor (B, L, C)
2162
+ # if logits_tensor is None and isinstance(model_outputs, (tuple, list)):
2163
+ # found_tensor = None
2164
+ # for item in model_outputs:
2165
+ # if isinstance(item, torch.Tensor):
2166
+ # if item.dim() == 3:
2167
+ # logits_tensor = item
2168
+ # break
2169
+ # if found_tensor is None:
2170
+ # found_tensor = item
2171
+ # if logits_tensor is None and found_tensor is not None:
2172
+ # if found_tensor.dim() == 3 and found_tensor.shape[-1] == NUM_LABELS:
2173
+ # logits_tensor = found_tensor
2174
+ # elif found_tensor.dim() == 2 and found_tensor.shape[-1] == NUM_LABELS:
2175
+ # logits_tensor = found_tensor.unsqueeze(0)
2176
+
2177
+ # # case 4: model_outputs directly a tensor
2178
+ # if logits_tensor is None and isinstance(model_outputs, torch.Tensor):
2179
+ # logits_tensor = model_outputs
2180
+
2181
+ # # case 5: model_outputs is a decoded viterbi list (common for CRF-only forward)
2182
+ # if decoded_labels_list is None and isinstance(model_outputs, list) and model_outputs and isinstance(model_outputs[0], list):
2183
+ # decoded_labels_list = model_outputs
2184
+
2185
+ # # If neither logits nor decoded exist, that's fatal
2186
+ # if logits_tensor is None and decoded_labels_list is None:
2187
+ # try:
2188
+ # elem_shapes = [ (type(x), getattr(x, 'shape', None)) for x in model_outputs ] if isinstance(model_outputs, (list, tuple)) else [(type(model_outputs), getattr(model_outputs, 'shape', None))]
2189
+ # except Exception:
2190
+ # elem_shapes = str(type(model_outputs))
2191
+ # raise RuntimeError(f"Model output of type {type(model_outputs)} did not contain a valid logits tensor or decoded viterbi. Contents: {elem_shapes}")
2192
+
2193
+ # # If we have logits_tensor, normalize shape to [seq_len, num_labels]
2194
+ # if logits_tensor is not None:
2195
+ # if logits_tensor.dim() == 3 and logits_tensor.shape[0] == 1:
2196
+ # preds_tensor = logits_tensor.squeeze(0) # [L, C]
2197
+ # else:
2198
+ # preds_tensor = logits_tensor # possibly [L, C] already
2199
+
2200
+ # if preds_tensor.dim() != 2:
2201
+ # raise RuntimeError(f"Unexpected logits tensor shape: {tuple(preds_tensor.shape)}")
2202
+ # else:
2203
+ # preds_tensor = None # no logits available
2204
+
2205
+ # # If decoded labels provided, make a token-level list-of-ints aligned to tokenizer tokens
2206
+ # decoded_token_labels = None
2207
+ # if decoded_labels_list is not None:
2208
+ # decoded_token_labels = decoded_labels_list[0] if isinstance(decoded_labels_list[0], list) else decoded_labels_list
2209
+
2210
+ # # Now map token-level predictions -> word-level predictions using word_ids
2211
+ # word_idx_to_pred_id = {}
2212
+
2213
+ # if preds_tensor is not None:
2214
+ # for token_idx, word_idx in enumerate(word_ids):
2215
+ # if token_idx >= sequence_length:
2216
+ # break
2217
+ # if word_idx is not None and word_idx < len(sub_words):
2218
+ # if word_idx not in word_idx_to_pred_id:
2219
+ # pred_id = torch.argmax(preds_tensor[token_idx]).item()
2220
+ # word_idx_to_pred_id[word_idx] = pred_id
2221
+ # else:
2222
+ # if decoded_token_labels is None:
2223
+ # raise RuntimeError("No logits and no decoded labels available for mapping.")
2224
+ # decoded_len = len(decoded_token_labels)
2225
+ # if decoded_len == content_token_length:
2226
+ # decoded_start = 1
2227
+ # elif decoded_len == sequence_length:
2228
+ # decoded_start = 0
2229
+ # else:
2230
+ # decoded_start = 1
2231
+
2232
+ # for tok_idx_in_decoded, label_id in enumerate(decoded_token_labels):
2233
+ # tok_idx = decoded_start + tok_idx_in_decoded
2234
+ # if tok_idx >= 512:
2235
+ # break
2236
+ # if tok_idx >= sequence_length:
2237
+ # break
2238
+ # word_idx = word_ids[tok_idx] if tok_idx < len(word_ids) else None
2239
+ # if word_idx is not None and word_idx < len(sub_words):
2240
+ # if word_idx not in word_idx_to_pred_id:
2241
+ # word_idx_to_pred_id[word_idx] = int(label_id)
2242
+
2243
+ # # Finally convert mapped word preds -> page_raw_predictions entries
2244
+ # for current_word_idx in range(len(sub_words)):
2245
+ # pred_id = word_idx_to_pred_id.get(current_word_idx, 0) # default to 0
2246
+ # predicted_label = ID_TO_LABEL[pred_id]
2247
+ # original_token = sub_tokens_data[current_word_idx]
2248
+ # page_raw_predictions.append({
2249
+ # "word": original_token['word'],
2250
+ # "bbox": original_token['bbox_raw_pdf_space'],
2251
+ # "predicted_label": predicted_label,
2252
+ # "page_number": page_num_1_based
2253
+ # })
2254
+
2255
+ # if page_raw_predictions:
2256
+ # final_page_predictions.append({
2257
+ # "page_number": page_num_1_based,
2258
+ # "data": page_raw_predictions
2259
+ # })
2260
+ # print(f" *** Page {page_num_1_based} Finalized: {len(page_raw_predictions)} labeled words. ***")
2261
+
2262
+ # doc.close()
2263
+ # print("\n" + "=" * 80)
2264
+ # print("--- LAYOUTLMV3 INFERENCE COMPLETE ---")
2265
+ # print("=" * 80)
2266
+ # return final_page_predictions
2267
+
2268
+
2269
+
2270
+
2271
+
2272
+
2273
+
2274
 
2275
  # ============================================================================
2276
  # --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
 
2757
  # ============================================================================
2758
 
2759
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2760
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2761
 
2762
+ # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
2763
+ # List[Dict[str, Any]]]:
2764
def run_document_pipeline(
    input_pdf_path: str,
    layoutlmv3_model_path: str,
    structured_intermediate_output_path: Optional[str] = None,
) -> Optional[List[Dict[str, Any]]]:
    """Run the full document-analysis pipeline on a single PDF.

    The phases are executed in order, each feeding the next:

    1. ``run_single_pdf_preprocessing`` writes a preprocessed JSON file.
    2. ``run_inference_and_get_raw_words`` produces per-page raw predictions,
       which are dumped to a temporary JSON file.
    3. ``convert_bio_to_structured_json_relaxed`` decodes that file into
       structured data, followed by ``correct_misaligned_options`` and
       ``process_context_linking``.
    4. ``embed_images_as_base64_in_memory`` post-processes the structured
       data using ``FIGURE_EXTRACTION_DIR``.
    5. ``HierarchicalClassifier`` tags the result when its models load;
       otherwise the un-tagged result is returned.

    Args:
        input_pdf_path: Path of the PDF to analyse.
        layoutlmv3_model_path: Model path forwarded to the inference phase.
        structured_intermediate_output_path: Where phase 3 writes its
            intermediate JSON. When ``None`` (the default) a file inside the
            run's temporary directory is used and removed on exit.

    Returns:
        The final structured result, or ``None`` when the input file does not
        exist, a phase returns an empty/falsy result, or any exception is
        raised while the pipeline runs.
    """
    # Local import, mirroring the function-scope ``import traceback`` below;
    # only needed for the temp-directory cleanup.
    import shutil

    if not os.path.exists(input_pdf_path):
        print(f"❌ Input PDF not found: {input_pdf_path}")
        return None

    print("\n" + "#" * 80)
    print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
    print("#" * 80)

    pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
    # mkdtemp guarantees a directory unique to this run, so two concurrent
    # runs on the same PDF (same process id) can never share or delete each
    # other's scratch files.
    temp_pipeline_dir = tempfile.mkdtemp(prefix=f"pipeline_run_{pdf_name}_")

    preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
    raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
    # Honour a caller-supplied path; only fall back to the scratch directory
    # when none was given.
    if structured_intermediate_output_path is None:
        structured_intermediate_output_path = os.path.join(
            temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json"
        )

    final_result = None
    try:
        # Phase 1: Preprocessing with YOLO First + Masking
        preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
        if not preprocessed_json_path_out:
            return None

        # Phase 2: Inference
        page_raw_predictions_list = run_inference_and_get_raw_words(
            input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
        )
        if not page_raw_predictions_list:
            return None

        # Phase 3 reads its input from disk, so persist the raw predictions
        # to the scratch directory first.
        with open(raw_output_path, 'w', encoding='utf-8') as f:
            json.dump(page_raw_predictions_list, f, indent=4)

        # Phase 3: Decoding
        structured_data_list = convert_bio_to_structured_json_relaxed(
            raw_output_path, structured_intermediate_output_path
        )
        if not structured_data_list:
            return None
        structured_data_list = correct_misaligned_options(structured_data_list)
        structured_data_list = process_context_linking(structured_data_list)

        # Phase 4: Embedding / Equation to LaTeX Conversion
        final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)

        # ====================================================================
        # --- FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
        # ====================================================================
        print("\n" + "=" * 80)
        print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
        print("=" * 80)

        classifier = HierarchicalClassifier()
        if classifier.load_models():
            # Classification runs on the fully assembled result; the return
            # value is re-bound in case the helper returns a new list.
            final_result = post_process_json_with_inference(
                final_result, classifier
            )
            print("✅ Classification complete. Tags added to final output.")
        else:
            print("❌ Classification model loading failed. Outputting un-tagged data.")

    except Exception as e:
        # Top-level boundary: report the failure and signal it with None
        # rather than propagating to the caller.
        print(f"❌ FATAL ERROR: {e}")
        import traceback
        traceback.print_exc()
        return None

    finally:
        # Best-effort cleanup of the whole scratch tree (including any
        # sub-directories or dotfiles a phase may have created).
        shutil.rmtree(temp_pipeline_dir, ignore_errors=True)

    print("\n" + "#" * 80)
    print("### OPTIMIZED PIPELINE EXECUTION COMPLETE ###")
    print("#" * 80)
    return final_result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2858
 
2859
 
2860