heerjtdev committed on
Commit
f54d98f
·
verified ·
1 Parent(s): 0a214f3

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +164 -765
working_yolo_pipeline.py CHANGED
@@ -144,63 +144,6 @@ def get_latex_from_base64(base64_string: str) -> str:
144
 
145
 
146
 
147
-
148
-
149
- # def get_latex_from_base64(base64_string: str) -> str:
150
- # """
151
- # Decodes a Base64 image string and uses the pre-initialized TrOCR/ORT model
152
- # to recognize the formula. It cleans the output by removing spaces and
153
- # crucially, replacing double backslashes with single backslashes for correct LaTeX.
154
- # """
155
- # if ort_model is None or processor is None:
156
- # return "[MODEL_ERROR: Model not initialized]"
157
-
158
- # try:
159
- # # 1. Decode Base64 to Image
160
- # image_data = base64.b64decode(base64_string)
161
- # # We must ensure the image is RGB format for the model input
162
- # image = Image.open(io.BytesIO(image_data)).convert('RGB')
163
-
164
- # # 2. Preprocess the image
165
- # pixel_values = processor(images=image, return_tensors="pt").pixel_values
166
-
167
- # # 3. Text Generation (OCR)
168
- # generated_ids = ort_model.generate(pixel_values)
169
- # raw_generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
170
-
171
- # if not raw_generated_text:
172
- # return "[OCR_WARNING: No formula found]"
173
-
174
- # latex_string = raw_generated_text[0]
175
-
176
- # # ==============================================================================
177
- # # --- DEBUGGING BLOCK: CHECK TrOCR RAW OUTPUT ---
178
- # # ==============================================================================
179
- # print(f"[DEBUG] TrOCR Raw Output: '{latex_string}'")
180
- # # ==============================================================================
181
-
182
- # # --- 4. Post-processing and Cleanup ---
183
-
184
- # # # A. Remove all spaces/line breaks
185
- # # cleaned_latex = re.sub(r'\s+', '', latex_string)
186
- # cleaned_latex = re.sub(r'[\r\n]+', '', latex_string)
187
-
188
- # # B. CRITICAL FIX: Replace double backslashes (\\) with single backslashes (\).
189
- # # This corrects model output that already over-escaped the LaTeX commands.
190
- # # Python literal: '\\\\' is replaced with '\\'.
191
- # #cleaned_latex = cleaned_latex.replace('\\\\', '\\')
192
-
193
- # return cleaned_latex
194
-
195
-
196
- # except Exception as e:
197
- # # Catch any unexpected errors
198
- # print(f" ❌ TR-OCR Recognition failed: {e}")
199
- # return f"[TR_OCR_ERROR: Recognition failed: {e}]"
200
-
201
-
202
-
203
-
204
  # ============================================================================
205
  # --- CONFIGURATION AND CONSTANTS ---
206
  # ============================================================================
@@ -640,79 +583,6 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
640
 
641
 
642
 
643
- # def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
644
- # raw_word_data = fitz_page.get_text("words")
645
- # converted_ocr_output = []
646
- # DEFAULT_CONFIDENCE = 99.0
647
-
648
- # for x1, y1, x2, y2, word, *rest in raw_word_data:
649
- # # --- FIX: SANITIZE TEXT HERE ---
650
- # # cleaned_word = sanitize_text(word)
651
- # # if not cleaned_word.strip(): continue
652
-
653
- # x1_pix = int(x1 * scale_factor)
654
- # y1_pix = int(y1 * scale_factor)
655
- # x2_pix = int(x2 * scale_factor)
656
- # y2_pix = int(y2 * scale_factor)
657
- # converted_ocr_output.append({
658
- # 'type': 'text',
659
- # 'word': cleaned_word, # Use the sanitized word
660
- # 'confidence': DEFAULT_CONFIDENCE,
661
- # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
662
- # 'y0': y1_pix, 'x0': x1_pix
663
- # })
664
- # return converted_ocr_output
665
-
666
-
667
-
668
-
669
-
670
- # def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
671
- # raw_word_data = fitz_page.get_text("words")
672
-
673
- # # ==============================================================================
674
- # # --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS ---
675
- # # ==============================================================================
676
- # print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")
677
- # debug_count = 0
678
- # for item in raw_word_data:
679
- # if debug_count >= 50: break
680
- # # item format: (x0, y0, x1, y1, word, block_no, line_no, word_no)
681
- # word_text = item[4]
682
-
683
- # # Generate unicode hex codes for every character in the word
684
- # unicode_points = [f"\\u{ord(c):04x}" for c in word_text]
685
- # print(f" Word {debug_count}: '{word_text}' -> Codes: {unicode_points}")
686
- # debug_count += 1
687
- # print("----------------------------------------------------------------------\n")
688
- # # ==============================================================================
689
-
690
- # converted_ocr_output = []
691
- # DEFAULT_CONFIDENCE = 99.0
692
-
693
- # for x1, y1, x2, y2, word, *rest in raw_word_data:
694
- # # --- FIX: SANITIZE TEXT HERE ---
695
- # cleaned_word = sanitize_text(word)
696
- # if not cleaned_word.strip(): continue
697
-
698
- # x1_pix = int(x1 * scale_factor)
699
- # y1_pix = int(y1 * scale_factor)
700
- # x2_pix = int(x2 * scale_factor)
701
- # y2_pix = int(y2 * scale_factor)
702
- # converted_ocr_output.append({
703
- # 'type': 'text',
704
- # 'word': cleaned_word, # Use the sanitized word
705
- # 'confidence': DEFAULT_CONFIDENCE,
706
- # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
707
- # 'y0': y1_pix, 'x0': x1_pix
708
- # })
709
- # return converted_ocr_output
710
-
711
-
712
-
713
-
714
-
715
-
716
 
717
 
718
 
@@ -1237,290 +1107,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1237
 
1238
 
1239
 
1240
-
1241
-
1242
-
1243
-
1244
-
1245
-
1246
- # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1247
- # page_num: int, fitz_page: fitz.Page,
1248
- # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
1249
- # """
1250
- # OPTIMIZED FLOW:
1251
- # 1. Run YOLO to find Equations/Tables.
1252
- # 2. Mask raw text with YOLO boxes.
1253
- # 3. Run Column Detection on the MASKED data.
1254
- # 4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
1255
- # """
1256
- # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1257
-
1258
- # start_time_total = time.time()
1259
-
1260
- # if original_img is None:
1261
- # print(f" ❌ Invalid image for page {page_num}.")
1262
- # return None, None
1263
-
1264
- # # ====================================================================
1265
- # # --- STEP 1: YOLO DETECTION ---
1266
- # # ====================================================================
1267
- # start_time_yolo = time.time()
1268
- # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
1269
-
1270
- # relevant_detections = []
1271
- # if results and results[0].boxes:
1272
- # for box in results[0].boxes:
1273
- # class_id = int(box.cls[0])
1274
- # class_name = model.names[class_id]
1275
- # if class_name in TARGET_CLASSES:
1276
- # x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
1277
- # relevant_detections.append(
1278
- # {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
1279
- # )
1280
-
1281
- # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1282
- # print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1283
-
1284
- # # ====================================================================
1285
- # # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
1286
- # # ====================================================================
1287
- # # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
1288
- # raw_words_for_layout = get_word_data_for_detection(
1289
- # fitz_page, pdf_path, page_num,
1290
- # top_margin_percent=0.10, bottom_margin_percent=0.10
1291
- # )
1292
-
1293
- # masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
1294
-
1295
- # # ====================================================================
1296
- # # --- STEP 3: COLUMN DETECTION ---
1297
- # # ====================================================================
1298
- # page_width_pdf = fitz_page.rect.width
1299
- # page_height_pdf = fitz_page.rect.height
1300
-
1301
- # column_detection_params = {
1302
- # 'cluster_bin_size': 2, 'cluster_smoothing': 2,
1303
- # 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
1304
- # }
1305
-
1306
- # separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
1307
-
1308
- # page_separator_x = None
1309
- # if separators:
1310
- # central_min = page_width_pdf * 0.35
1311
- # central_max = page_width_pdf * 0.65
1312
- # central_separators = [s for s in separators if central_min <= s <= central_max]
1313
-
1314
- # if central_separators:
1315
- # center_x = page_width_pdf / 2
1316
- # page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
1317
- # print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
1318
- # else:
1319
- # print(" ⚠️ Gutter found off-center. Ignoring.")
1320
- # else:
1321
- # print(" -> Single Column Layout Confirmed.")
1322
-
1323
- # # ====================================================================
1324
- # # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
1325
- # # ====================================================================
1326
- # start_time_components = time.time()
1327
- # component_metadata = []
1328
- # fig_count_page = 0
1329
- # eq_count_page = 0
1330
-
1331
- # for detection in merged_detections:
1332
- # x1, y1, x2, y2 = detection['coords']
1333
- # class_name = detection['class']
1334
-
1335
- # if class_name == 'figure':
1336
- # GLOBAL_FIGURE_COUNT += 1
1337
- # counter = GLOBAL_FIGURE_COUNT
1338
- # component_word = f"FIGURE{counter}"
1339
- # fig_count_page += 1
1340
- # elif class_name == 'equation':
1341
- # GLOBAL_EQUATION_COUNT += 1
1342
- # counter = GLOBAL_EQUATION_COUNT
1343
- # component_word = f"EQUATION{counter}"
1344
- # eq_count_page += 1
1345
- # else:
1346
- # continue
1347
-
1348
- # component_crop = original_img[y1:y2, x1:x2]
1349
- # component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
1350
- # cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
1351
-
1352
- # y_midpoint = (y1 + y2) // 2
1353
- # component_metadata.append({
1354
- # 'type': class_name, 'word': component_word,
1355
- # 'bbox': [int(x1), int(y1), int(x2), int(y2)],
1356
- # 'y0': int(y_midpoint), 'x0': int(x1)
1357
- # })
1358
-
1359
- # # ====================================================================
1360
- # # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
1361
- # # ====================================================================
1362
- # raw_ocr_output = []
1363
- # scale_factor = 2.0 # Pipeline standard scale
1364
-
1365
- # try:
1366
- # # Try getting native text first
1367
- # # NOTE: extract_native_words_and_convert MUST ALSO BE UPDATED TO USE sanitize_text
1368
- # raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
1369
- # except Exception as e:
1370
- # print(f" ❌ Native text extraction failed: {e}")
1371
-
1372
- # # If native text is missing, fall back to OCR
1373
- # if not raw_ocr_output:
1374
- # if _ocr_cache.has_ocr(pdf_path, page_num):
1375
- # print(f" ⚑ Using cached Tesseract OCR for page {page_num}")
1376
- # cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
1377
- # for word_tuple in cached_word_data:
1378
- # word_text, x1, y1, x2, y2 = word_tuple
1379
-
1380
- # # Scale from PDF points to Pipeline Pixels (2.0)
1381
- # x1_pix = int(x1 * scale_factor)
1382
- # y1_pix = int(y1 * scale_factor)
1383
- # x2_pix = int(x2 * scale_factor)
1384
- # y2_pix = int(y2 * scale_factor)
1385
-
1386
- # raw_ocr_output.append({
1387
- # 'type': 'text', 'word': word_text, 'confidence': 95.0,
1388
- # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
1389
- # 'y0': y1_pix, 'x0': x1_pix
1390
- # })
1391
- # else:
1392
- # # === START OF OPTIMIZED OCR BLOCK ===
1393
- # try:
1394
- # # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
1395
- # ocr_zoom = 4.0
1396
- # pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1397
-
1398
- # # Convert PyMuPDF Pixmap to OpenCV format
1399
- # img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1400
- # pix_ocr.n)
1401
- # if pix_ocr.n == 3:
1402
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1403
- # elif pix_ocr.n == 4:
1404
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1405
-
1406
- # # 2. Preprocess (Binarization)
1407
- # processed_img = preprocess_image_for_ocr(img_ocr_np)
1408
-
1409
- # # 3. Run Tesseract with Optimized Configuration
1410
- # custom_config = r'--oem 3 --psm 6'
1411
-
1412
- # hocr_data = pytesseract.image_to_data(
1413
- # processed_img,
1414
- # output_type=pytesseract.Output.DICT,
1415
- # config=custom_config
1416
- # )
1417
-
1418
- # # ==============================================================================
1419
- # # --- DEBUGGING BLOCK: CHECK FIRST 50 OCR WORDS ---
1420
- # # ==============================================================================
1421
- # print(f"\n[DEBUG] Tesseract OCR Fallback (Page {page_num}): Checking first 50 words...")
1422
- # debug_count = 0
1423
- # for i in range(len(hocr_data['level'])):
1424
- # text = hocr_data['text'][i].strip()
1425
- # if text:
1426
- # unicode_points = [f"\\u{ord(c):04x}" for c in text]
1427
- # print(f" OCR Word {debug_count}: '{text}' -> Codes: {unicode_points}")
1428
- # debug_count += 1
1429
- # if debug_count >= 50: break
1430
- # print("----------------------------------------------------------------------\n")
1431
- # # ==============================================================================
1432
-
1433
- # for i in range(len(hocr_data['level'])):
1434
- # text = hocr_data['text'][i] # Retrieve raw Tesseract text
1435
-
1436
- # # --- FIX: SANITIZE TEXT AND THEN STRIP ---
1437
- # cleaned_text = sanitize_text(text).strip()
1438
-
1439
- # if cleaned_text and hocr_data['conf'][i] > -1:
1440
- # # 4. Coordinate Mapping
1441
- # scale_adjustment = scale_factor / ocr_zoom
1442
-
1443
- # x1 = int(hocr_data['left'][i] * scale_adjustment)
1444
- # y1 = int(hocr_data['top'][i] * scale_adjustment)
1445
- # w = int(hocr_data['width'][i] * scale_adjustment)
1446
- # h = int(hocr_data['height'][i] * scale_adjustment)
1447
- # x2 = x1 + w
1448
- # y2 = y1 + h
1449
-
1450
- # raw_ocr_output.append({
1451
- # 'type': 'text',
1452
- # 'word': cleaned_text, # Use the sanitized word
1453
- # 'confidence': float(hocr_data['conf'][i]),
1454
- # 'bbox': [x1, y1, x2, y2],
1455
- # 'y0': y1,
1456
- # 'x0': x1
1457
- # })
1458
- # except Exception as e:
1459
- # print(f" ❌ Tesseract OCR Error: {e}")
1460
- # # === END OF OPTIMIZED OCR BLOCK ===
1461
-
1462
- # # ====================================================================
1463
- # # --- STEP 6: OCR CLEANING AND MERGING ---
1464
- # # ====================================================================
1465
- # items_to_sort = []
1466
-
1467
- # for ocr_word in raw_ocr_output:
1468
- # is_suppressed = False
1469
- # for component in component_metadata:
1470
- # # Do not include words that are inside figure/equation boxes
1471
- # ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
1472
- # if ioa > IOA_SUPPRESSION_THRESHOLD:
1473
- # is_suppressed = True
1474
- # break
1475
- # if not is_suppressed:
1476
- # items_to_sort.append(ocr_word)
1477
-
1478
- # # Add figures/equations back into the flow as "words"
1479
- # items_to_sort.extend(component_metadata)
1480
-
1481
- # # ====================================================================
1482
- # # --- STEP 7: LINE-BASED SORTING ---
1483
- # # ====================================================================
1484
- # items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
1485
- # lines = []
1486
-
1487
- # for item in items_to_sort:
1488
- # placed = False
1489
- # for line in lines:
1490
- # y_ref = min(it['y0'] for it in line)
1491
- # if abs(y_ref - item['y0']) < LINE_TOLERANCE:
1492
- # line.append(item)
1493
- # placed = True
1494
- # break
1495
- # if not placed and item['type'] in ['equation', 'figure']:
1496
- # for line in lines:
1497
- # y_ref = min(it['y0'] for it in line)
1498
- # if abs(y_ref - item['y0']) < 20:
1499
- # line.append(item)
1500
- # placed = True
1501
- # break
1502
- # if not placed:
1503
- # lines.append([item])
1504
-
1505
- # for line in lines:
1506
- # line.sort(key=lambda x: x['x0'])
1507
-
1508
- # final_output = []
1509
- # for line in lines:
1510
- # for item in line:
1511
- # data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
1512
- # if 'tag' in item: data_item['tag'] = item['tag']
1513
- # final_output.append(data_item)
1514
-
1515
- # return final_output, page_separator_x
1516
-
1517
-
1518
-
1519
-
1520
-
1521
-
1522
-
1523
-
1524
  def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
1525
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1526
 
@@ -1973,305 +1559,6 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
1973
 
1974
 
1975
 
1976
-
1977
-
1978
-
1979
-
1980
-
1981
- # def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
1982
- # preprocessed_json_path: str,
1983
- # column_detection_params: Optional[Dict] = None) -> List[Dict[str, Any]]:
1984
- # print("\n" + "=" * 80)
1985
- # print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE (Raw Word Output) ---")
1986
- # print("=" * 80)
1987
-
1988
- # tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
1989
- # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
1990
- # print(f" -> Using device: {device}")
1991
-
1992
- # try:
1993
- # model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
1994
- # checkpoint = torch.load(model_path, map_location=device)
1995
- # model_state = checkpoint.get('model_state_dict', checkpoint)
1996
- # # Apply patch for layoutlmv3 compatibility with saved state_dict
1997
- # fixed_state_dict = {key.replace('layoutlm.', 'layoutlmv3.'): value for key, value in model_state.items()}
1998
- # model.load_state_dict(fixed_state_dict)
1999
- # model.to(device)
2000
- # model.eval()
2001
- # print(f"✅ LayoutLMv3 Model loaded successfully from {os.path.basename(model_path)}.")
2002
- # except Exception as e:
2003
- # print(f"❌ FATAL ERROR during LayoutLMv3 model loading: {e}")
2004
- # return []
2005
-
2006
- # try:
2007
- # with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
2008
- # preprocessed_data = json.load(f)
2009
- # print(f"✅ Loaded preprocessed data with {len(preprocessed_data)} pages.")
2010
- # except Exception:
2011
- # print("❌ Error loading preprocessed JSON.")
2012
- # return []
2013
-
2014
- # try:
2015
- # doc = fitz.open(pdf_path)
2016
- # except Exception:
2017
- # print("❌ Error loading PDF.")
2018
- # return []
2019
-
2020
- # final_page_predictions = []
2021
- # CHUNK_SIZE = 500
2022
-
2023
- # for page_data in preprocessed_data:
2024
- # page_num_1_based = page_data['page_number']
2025
- # page_num_0_based = page_num_1_based - 1
2026
- # page_raw_predictions = []
2027
- # print(f"\n *** Processing Page {page_num_1_based} ({len(page_data['data'])} raw tokens) ***")
2028
-
2029
- # fitz_page = doc.load_page(page_num_0_based)
2030
- # page_width, page_height = fitz_page.rect.width, fitz_page.rect.height
2031
- # print(f" -> Page dimensions: {page_width:.0f}x{page_height:.0f} (PDF points).")
2032
-
2033
- # all_token_data = []
2034
- # scale_factor = 2.0
2035
-
2036
- # for item in page_data['data']:
2037
- # raw_yolo_bbox = item['bbox']
2038
- # bbox_pdf = [
2039
- # int(raw_yolo_bbox[0] / scale_factor), int(raw_yolo_bbox[1] / scale_factor),
2040
- # int(raw_yolo_bbox[2] / scale_factor), int(raw_yolo_bbox[3] / scale_factor)
2041
- # ]
2042
- # normalized_bbox = [
2043
- # max(0, min(1000, int(1000 * bbox_pdf[0] / page_width))),
2044
- # max(0, min(1000, int(1000 * bbox_pdf[1] / page_height))),
2045
- # max(0, min(1000, int(1000 * bbox_pdf[2] / page_width))),
2046
- # max(0, min(1000, int(1000 * bbox_pdf[3] / page_height)))
2047
- # ]
2048
- # all_token_data.append({
2049
- # "word": item['word'],
2050
- # "bbox_raw_pdf_space": bbox_pdf,
2051
- # "bbox_normalized": normalized_bbox,
2052
- # "item_original_data": item
2053
- # })
2054
-
2055
- # # ==============================================================================
2056
- # # --- DEBUGGING BLOCK: CHECK FIRST 50 TOKENS BEFORE INFERENCE ---
2057
- # # ==============================================================================
2058
- # print(f"\n[DEBUG] LayoutLMv3 Input (Page {page_num_1_based}): Checking first 50 tokens...")
2059
- # debug_count = 0
2060
- # for t in all_token_data:
2061
- # if debug_count >= 50: break
2062
- # w = t['word']
2063
- # unicode_points = [f"\\u{ord(c):04x}" for c in w]
2064
- # print(f" Token {debug_count}: '{w}' -> Codes: {unicode_points}")
2065
- # debug_count += 1
2066
- # print("----------------------------------------------------------------------\n")
2067
- # # ==============================================================================
2068
-
2069
- # if not all_token_data:
2070
- # continue
2071
-
2072
- # column_separator_x = page_data.get('column_separator_x', None)
2073
- # if column_separator_x is not None:
2074
- # print(f" -> Using SAVED column separator: X={column_separator_x}")
2075
- # else:
2076
- # print(" -> No column separator found. Assuming single chunk.")
2077
-
2078
- # token_chunks = _merge_integrity(all_token_data, column_separator_x)
2079
- # total_chunks = len(token_chunks)
2080
-
2081
- # for chunk_idx, chunk_tokens in enumerate(token_chunks):
2082
- # if not chunk_tokens: continue
2083
-
2084
- # # 1. Sanitize: Convert everything to strings and aggressively clean Unicode errors.
2085
- # chunk_words = [
2086
- # str(t['word']).encode('utf-8', errors='ignore').decode('utf-8')
2087
- # for t in chunk_tokens
2088
- # ]
2089
- # chunk_normalized_bboxes = [t['bbox_normalized'] for t in chunk_tokens]
2090
-
2091
- # total_sub_chunks = (len(chunk_words) + CHUNK_SIZE - 1) // CHUNK_SIZE
2092
- # for i in range(0, len(chunk_words), CHUNK_SIZE):
2093
- # sub_chunk_idx = i // CHUNK_SIZE + 1
2094
- # sub_words = chunk_words[i:i + CHUNK_SIZE]
2095
- # sub_bboxes = chunk_normalized_bboxes[i:i + CHUNK_SIZE]
2096
- # sub_tokens_data = chunk_tokens[i:i + CHUNK_SIZE]
2097
-
2098
- # print(f" -> Chunk {chunk_idx + 1}/{total_chunks}, Sub-chunk {sub_chunk_idx}/{total_sub_chunks}: {len(sub_words)} words. Running Inference...")
2099
-
2100
- # # 2. Manual generation of word_ids
2101
- # manual_word_ids = []
2102
- # for current_word_idx, word in enumerate(sub_words):
2103
- # sub_tokens = tokenizer.tokenize(word)
2104
- # for _ in sub_tokens:
2105
- # manual_word_ids.append(current_word_idx)
2106
-
2107
- # encoded_input = tokenizer(
2108
- # sub_words,
2109
- # boxes=sub_bboxes,
2110
- # truncation=True,
2111
- # padding="max_length",
2112
- # max_length=512,
2113
- # is_split_into_words=True,
2114
- # return_tensors="pt"
2115
- # )
2116
-
2117
- # # Check for empty sequence
2118
- # if encoded_input['input_ids'].shape[0] == 0:
2119
- # print(f" -> Warning: Sub-chunk {sub_chunk_idx} encoded to an empty sequence. Skipping.")
2120
- # continue
2121
-
2122
- # # 3. Finalize word_ids based on encoded output length
2123
- # sequence_length = int(torch.sum(encoded_input['attention_mask']).item())
2124
- # content_token_length = max(0, sequence_length - 2)
2125
-
2126
- # manual_word_ids = manual_word_ids[:content_token_length]
2127
-
2128
- # final_word_ids = [None] # CLS token (index 0)
2129
- # final_word_ids.extend(manual_word_ids)
2130
-
2131
- # if sequence_length > 1:
2132
- # final_word_ids.append(None) # SEP token
2133
-
2134
- # final_word_ids.extend([None] * (512 - len(final_word_ids)))
2135
- # word_ids = final_word_ids[:512] # Final array for mapping
2136
-
2137
- # # Inputs are already batched by the tokenizer as [1, 512]
2138
- # input_ids = encoded_input['input_ids'].to(device)
2139
- # bbox = encoded_input['bbox'].to(device)
2140
- # attention_mask = encoded_input['attention_mask'].to(device)
2141
-
2142
- # with torch.no_grad():
2143
- # model_outputs = model(input_ids, bbox, attention_mask)
2144
-
2145
- # # --- Robust extraction: support several forward return types ---
2146
- # logits_tensor = None
2147
- # decoded_labels_list = None
2148
-
2149
- # # case 1: tuple/list with (emissions, viterbi)
2150
- # if isinstance(model_outputs, (tuple, list)) and len(model_outputs) == 2:
2151
- # a, b = model_outputs
2152
- # if isinstance(a, torch.Tensor):
2153
- # logits_tensor = a
2154
- # if isinstance(b, list):
2155
- # decoded_labels_list = b
2156
-
2157
- # # case 2: HF ModelOutput with .logits
2158
- # if logits_tensor is None and hasattr(model_outputs, 'logits') and isinstance(model_outputs.logits, torch.Tensor):
2159
- # logits_tensor = model_outputs.logits
2160
-
2161
- # # case 3: tuple/list - search for a 3D tensor (B, L, C)
2162
- # if logits_tensor is None and isinstance(model_outputs, (tuple, list)):
2163
- # found_tensor = None
2164
- # for item in model_outputs:
2165
- # if isinstance(item, torch.Tensor):
2166
- # if item.dim() == 3:
2167
- # logits_tensor = item
2168
- # break
2169
- # if found_tensor is None:
2170
- # found_tensor = item
2171
- # if logits_tensor is None and found_tensor is not None:
2172
- # if found_tensor.dim() == 3 and found_tensor.shape[-1] == NUM_LABELS:
2173
- # logits_tensor = found_tensor
2174
- # elif found_tensor.dim() == 2 and found_tensor.shape[-1] == NUM_LABELS:
2175
- # logits_tensor = found_tensor.unsqueeze(0)
2176
-
2177
- # # case 4: model_outputs directly a tensor
2178
- # if logits_tensor is None and isinstance(model_outputs, torch.Tensor):
2179
- # logits_tensor = model_outputs
2180
-
2181
- # # case 5: model_outputs is a decoded viterbi list (common for CRF-only forward)
2182
- # if decoded_labels_list is None and isinstance(model_outputs, list) and model_outputs and isinstance(model_outputs[0], list):
2183
- # decoded_labels_list = model_outputs
2184
-
2185
- # # If neither logits nor decoded exist, that's fatal
2186
- # if logits_tensor is None and decoded_labels_list is None:
2187
- # try:
2188
- # elem_shapes = [ (type(x), getattr(x, 'shape', None)) for x in model_outputs ] if isinstance(model_outputs, (list, tuple)) else [(type(model_outputs), getattr(model_outputs, 'shape', None))]
2189
- # except Exception:
2190
- # elem_shapes = str(type(model_outputs))
2191
- # raise RuntimeError(f"Model output of type {type(model_outputs)} did not contain a valid logits tensor or decoded viterbi. Contents: {elem_shapes}")
2192
-
2193
- # # If we have logits_tensor, normalize shape to [seq_len, num_labels]
2194
- # if logits_tensor is not None:
2195
- # if logits_tensor.dim() == 3 and logits_tensor.shape[0] == 1:
2196
- # preds_tensor = logits_tensor.squeeze(0) # [L, C]
2197
- # else:
2198
- # preds_tensor = logits_tensor # possibly [L, C] already
2199
-
2200
- # if preds_tensor.dim() != 2:
2201
- # raise RuntimeError(f"Unexpected logits tensor shape: {tuple(preds_tensor.shape)}")
2202
- # else:
2203
- # preds_tensor = None # no logits available
2204
-
2205
- # # If decoded labels provided, make a token-level list-of-ints aligned to tokenizer tokens
2206
- # decoded_token_labels = None
2207
- # if decoded_labels_list is not None:
2208
- # decoded_token_labels = decoded_labels_list[0] if isinstance(decoded_labels_list[0], list) else decoded_labels_list
2209
-
2210
- # # Now map token-level predictions -> word-level predictions using word_ids
2211
- # word_idx_to_pred_id = {}
2212
-
2213
- # if preds_tensor is not None:
2214
- # for token_idx, word_idx in enumerate(word_ids):
2215
- # if token_idx >= sequence_length:
2216
- # break
2217
- # if word_idx is not None and word_idx < len(sub_words):
2218
- # if word_idx not in word_idx_to_pred_id:
2219
- # pred_id = torch.argmax(preds_tensor[token_idx]).item()
2220
- # word_idx_to_pred_id[word_idx] = pred_id
2221
- # else:
2222
- # if decoded_token_labels is None:
2223
- # raise RuntimeError("No logits and no decoded labels available for mapping.")
2224
- # decoded_len = len(decoded_token_labels)
2225
- # if decoded_len == content_token_length:
2226
- # decoded_start = 1
2227
- # elif decoded_len == sequence_length:
2228
- # decoded_start = 0
2229
- # else:
2230
- # decoded_start = 1
2231
-
2232
- # for tok_idx_in_decoded, label_id in enumerate(decoded_token_labels):
2233
- # tok_idx = decoded_start + tok_idx_in_decoded
2234
- # if tok_idx >= 512:
2235
- # break
2236
- # if tok_idx >= sequence_length:
2237
- # break
2238
- # word_idx = word_ids[tok_idx] if tok_idx < len(word_ids) else None
2239
- # if word_idx is not None and word_idx < len(sub_words):
2240
- # if word_idx not in word_idx_to_pred_id:
2241
- # word_idx_to_pred_id[word_idx] = int(label_id)
2242
-
2243
- # # Finally convert mapped word preds -> page_raw_predictions entries
2244
- # for current_word_idx in range(len(sub_words)):
2245
- # pred_id = word_idx_to_pred_id.get(current_word_idx, 0) # default to 0
2246
- # predicted_label = ID_TO_LABEL[pred_id]
2247
- # original_token = sub_tokens_data[current_word_idx]
2248
- # page_raw_predictions.append({
2249
- # "word": original_token['word'],
2250
- # "bbox": original_token['bbox_raw_pdf_space'],
2251
- # "predicted_label": predicted_label,
2252
- # "page_number": page_num_1_based
2253
- # })
2254
-
2255
- # if page_raw_predictions:
2256
- # final_page_predictions.append({
2257
- # "page_number": page_num_1_based,
2258
- # "data": page_raw_predictions
2259
- # })
2260
- # print(f" *** Page {page_num_1_based} Finalized: {len(page_raw_predictions)} labeled words. ***")
2261
-
2262
- # doc.close()
2263
- # print("\n" + "=" * 80)
2264
- # print("--- LAYOUTLMV3 INFERENCE COMPLETE ---")
2265
- # print("=" * 80)
2266
- # return final_page_predictions
2267
-
2268
-
2269
-
2270
-
2271
-
2272
-
2273
-
2274
-
2275
  # ============================================================================
2276
  # --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
2277
  # ============================================================================
@@ -2748,10 +2035,6 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
2748
 
2749
 
2750
 
2751
-
2752
-
2753
-
2754
-
2755
  # ============================================================================
2756
  # --- MAIN FUNCTION ---
2757
  # ============================================================================
@@ -2761,99 +2044,215 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
2761
 
2762
  # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
2763
  # List[Dict[str, Any]]]:
2764
- def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2765
- if not os.path.exists(input_pdf_path): return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2766
 
2767
  print("\n" + "#" * 80)
2768
  print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
 
2769
  print("#" * 80)
2770
 
 
2771
  pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
2772
  temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
2773
  os.makedirs(temp_pipeline_dir, exist_ok=True)
2774
 
2775
  preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
2776
  raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
2777
- structured_intermediate_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json")
2778
-
2779
  final_result = None
2780
  try:
2781
- # Phase 1: Preprocessing with YOLO First + Masking
 
 
2782
  preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
2783
- if not preprocessed_json_path_out: return None
 
 
 
2784
 
2785
- # Phase 2: Inference
 
 
2786
  page_raw_predictions_list = run_inference_and_get_raw_words(
2787
  input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
2788
  )
2789
- if not page_raw_predictions_list: return None
2790
-
2791
- # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
2792
- # Save raw predictions to the temporary file
 
2793
  with open(raw_output_path, 'w', encoding='utf-8') as f:
2794
  json.dump(page_raw_predictions_list, f, indent=4)
 
2795
 
2796
- # Explicitly copy/save the raw predictions to the user-specified debug path
2797
- # if raw_predictions_output_path:
2798
- # shutil.copy(raw_output_path, raw_predictions_output_path)
2799
 - # print(f"\n✅ DEBUG: Raw predictions saved to: {raw_predictions_output_path}")
2800
- # ----------------------------------------
2801
-
2802
- # Phase 3: Decoding
2803
  structured_data_list = convert_bio_to_structured_json_relaxed(
2804
  raw_output_path, structured_intermediate_output_path
2805
  )
2806
- if not structured_data_list: return None
 
 
 
 
 
2807
  structured_data_list = correct_misaligned_options(structured_data_list)
2808
  structured_data_list = process_context_linking(structured_data_list)
 
2809
 
2810
-
2811
- # Phase 4: Embedding / Equation to LaTeX Conversion
 
2812
  final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
 
 
 
 
2813
 
2814
-
2815
-
2816
-
2817
- #================================================================================
2818
- # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
2819
- #================================================================================
2820
-
2821
- print("\n" + "=" * 80)
2822
- print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
2823
- print("=" * 80)
2824
-
2825
- # 1. Initialize and Load the Classifier
2826
  classifier = HierarchicalClassifier()
2827
  if classifier.load_models():
2828
- # 2. Run Classification on the *Final* Result
2829
- # The function modifies the list in place and returns it
2830
- final_result = post_process_json_with_inference(
2831
- final_result, classifier
2832
- )
2833
 - print("✅ Classification complete. Tags added to final output.")
2834
  else:
2835
- print("❌ Classification model loading failed. Outputting un-tagged data.")
2836
-
2837
- # ====================================================================
2838
-
2839
 
2840
  except Exception as e:
2841
- print(f"❌ FATAL ERROR: {e}")
2842
- import traceback
2843
  traceback.print_exc()
2844
  return None
2845
 
2846
  finally:
 
2847
  try:
2848
  for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
2849
  os.remove(f)
2850
  os.rmdir(temp_pipeline_dir)
2851
- except Exception:
2852
- pass
 
2853
 
 
2854
  print("\n" + "#" * 80)
2855
- print("### OPTIMIZED PIPELINE EXECUTION COMPLETE ###")
2856
  print("#" * 80)
 
2857
  return final_result
2858
 
2859
 
 
144
 
145
 
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  # ============================================================================
148
  # --- CONFIGURATION AND CONSTANTS ---
149
  # ============================================================================
 
583
 
584
 
585
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
 
587
 
588
 
 
1107
 
1108
 
1109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1110
  def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
1111
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1112
 
 
1559
 
1560
 
1561
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1562
  # ============================================================================
1563
  # --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
1564
  # ============================================================================
 
2035
 
2036
 
2037
 
 
 
 
 
2038
  # ============================================================================
2039
  # --- MAIN FUNCTION ---
2040
  # ============================================================================
 
2044
 
2045
  # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
2046
  # List[Dict[str, Any]]]:
2047
+ # def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2048
+ # if not os.path.exists(input_pdf_path): return None
2049
+
2050
+ # print("\n" + "#" * 80)
2051
+ # print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
2052
+ # print("#" * 80)
2053
+
2054
+ # pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
2055
+ # temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
2056
+ # os.makedirs(temp_pipeline_dir, exist_ok=True)
2057
+
2058
+ # preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
2059
+ # raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
2060
+ # structured_intermediate_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json")
2061
+
2062
+ # final_result = None
2063
+ # try:
2064
+ # # Phase 1: Preprocessing with YOLO First + Masking
2065
+ # preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
2066
+ # if not preprocessed_json_path_out: return None
2067
+
2068
+ # # Phase 2: Inference
2069
+ # page_raw_predictions_list = run_inference_and_get_raw_words(
2070
+ # input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
2071
+ # )
2072
+ # if not page_raw_predictions_list: return None
2073
+
2074
+ # # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
2075
+ # # Save raw predictions to the temporary file
2076
+ # with open(raw_output_path, 'w', encoding='utf-8') as f:
2077
+ # json.dump(page_raw_predictions_list, f, indent=4)
2078
+
2079
+ # # Explicitly copy/save the raw predictions to the user-specified debug path
2080
+ # # if raw_predictions_output_path:
2081
+ # # shutil.copy(raw_output_path, raw_predictions_output_path)
2082
 + # # print(f"\n✅ DEBUG: Raw predictions saved to: {raw_predictions_output_path}")
2083
+ # # ----------------------------------------
2084
+
2085
+ # # Phase 3: Decoding
2086
+ # structured_data_list = convert_bio_to_structured_json_relaxed(
2087
+ # raw_output_path, structured_intermediate_output_path
2088
+ # )
2089
+ # if not structured_data_list: return None
2090
+
2091
+
2092
+ # structured_data_list = correct_misaligned_options(structured_data_list)
2093
+ # structured_data_list = process_context_linking(structured_data_list)
2094
+
2095
+
2096
+ # # Phase 4: Embedding / Equation to LaTeX Conversion
2097
+ # final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
2098
+
2099
+
2100
+
2101
+
2102
+ # #================================================================================
2103
+ # # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
2104
+ # #================================================================================
2105
+
2106
+ # print("\n" + "=" * 80)
2107
+ # print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
2108
+ # print("=" * 80)
2109
+
2110
+ # # 1. Initialize and Load the Classifier
2111
+ # classifier = HierarchicalClassifier()
2112
+ # if classifier.load_models():
2113
+ # # 2. Run Classification on the *Final* Result
2114
+ # # The function modifies the list in place and returns it
2115
+ # final_result = post_process_json_with_inference(
2116
+ # final_result, classifier
2117
+ # )
2118
 + # print("✅ Classification complete. Tags added to final output.")
2119
+ # else:
2120
+ # print("❌ Classification model loading failed. Outputting un-tagged data.")
2121
+
2122
+ # # ====================================================================
2123
+
2124
+
2125
+ # except Exception as e:
2126
+ # print(f"❌ FATAL ERROR: {e}")
2127
+ # import traceback
2128
+ # traceback.print_exc()
2129
+ # return None
2130
+
2131
+ # finally:
2132
+ # try:
2133
+ # for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
2134
+ # os.remove(f)
2135
+ # os.rmdir(temp_pipeline_dir)
2136
+ # except Exception:
2137
+ # pass
2138
+
2139
+ # print("\n" + "#" * 80)
2140
+ # print("### OPTIMIZED PIPELINE EXECUTION COMPLETE ###")
2141
+ # print("#" * 80)
2142
+ # return final_result
2143
+
2144
+
2145
+
2146
+
2147
+
2148
+
2149
+
2150
+
2151
+
2152
+ import time
2153
+ import traceback
2154
+ import glob
2155
+
2156
def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
    """Run the full five-step document analysis pipeline on a single PDF.

    The steps, in order, are:

    1. ``run_single_pdf_preprocessing`` (logged as "YOLO + Masking").
    2. ``run_inference_and_get_raw_words`` (logged as "LayoutLMv3"); its
       result is dumped to a scratch JSON file that step 3 reads back.
    3. ``convert_bio_to_structured_json_relaxed`` followed by
       ``correct_misaligned_options`` and ``process_context_linking``.
    4. ``embed_images_as_base64_in_memory`` using ``FIGURE_EXTRACTION_DIR``.
    5. ``post_process_json_with_inference`` with a ``HierarchicalClassifier``;
       skipped with a warning when the classifier models fail to load.

    Args:
        input_pdf_path: Path of the PDF to process.
        layoutlmv3_model_path: Model path forwarded to the inference step.
        structured_intermediate_output_path: Where the decoder should write
            its intermediate structured JSON. When ``None`` a scratch file
            inside the run's temporary directory is used instead.

    Returns:
        The final list produced by steps 4/5, or ``None`` when the input file
        does not exist, any of steps 1-4 yields an empty result, or any step
        raises an exception (the traceback is printed, not re-raised).
    """
    # Local import: only this function needs it, and the file's top-level
    # import block is not guaranteed to provide it.
    import shutil

    if not os.path.exists(input_pdf_path):
        print(f"❌ ERROR: File not found: {input_pdf_path}")
        return None

    banner = "#" * 80
    print("\n" + banner)
    print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
    print(f"Input: {input_pdf_path}")
    print(banner)

    overall_start = time.perf_counter()
    pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]

    # mkdtemp guarantees a fresh, private directory per call. A fixed
    # "<name>_<pid>" directory would be shared by two runs on the same PDF
    # inside one process, and one run's cleanup would delete the other's files.
    temp_pipeline_dir = tempfile.mkdtemp(prefix=f"pipeline_run_{pdf_name}_{os.getpid()}_")

    preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
    raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
    if structured_intermediate_output_path is None:
        # Give the decoder a real path even when the caller did not ask for
        # the intermediate file; it is removed with the scratch directory.
        structured_intermediate_output_path = os.path.join(
            temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json"
        )

    final_result = None
    try:
        # --- Phase 1: Preprocessing ---
        print("\n[Step 1/5] Preprocessing (YOLO + Masking)...")
        step_start = time.perf_counter()
        preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
        if not preprocessed_json_path_out:
            print("❌ FAILED at Step 1: Preprocessing returned None.")
            return None
        print(f"✅ Step 1 Complete ({time.perf_counter() - step_start:.2f}s)")

        # --- Phase 2: Inference ---
        print("\n[Step 2/5] Inference (LayoutLMv3)...")
        step_start = time.perf_counter()
        page_raw_predictions_list = run_inference_and_get_raw_words(
            input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
        )
        if not page_raw_predictions_list:
            print("❌ FAILED at Step 2: Inference returned no data.")
            return None

        # Step 3 consumes the raw predictions from disk, so persist them first.
        with open(raw_output_path, 'w', encoding='utf-8') as f:
            json.dump(page_raw_predictions_list, f, indent=4)
        print(f"✅ Step 2 Complete ({time.perf_counter() - step_start:.2f}s)")

        # --- Phase 3: Decoding ---
        print("\n[Step 3/5] Decoding (BIO to Structured JSON)...")
        step_start = time.perf_counter()
        structured_data_list = convert_bio_to_structured_json_relaxed(
            raw_output_path, structured_intermediate_output_path
        )
        if not structured_data_list:
            print("❌ FAILED at Step 3: BIO conversion failed.")
            return None

        print("... Correcting misalignments and linking context ...")
        structured_data_list = correct_misaligned_options(structured_data_list)
        structured_data_list = process_context_linking(structured_data_list)
        print(f"✅ Step 3 Complete ({time.perf_counter() - step_start:.2f}s)")

        # --- Phase 4: Base64 & LaTeX ---
        print("\n[Step 4/5] Finalizing Layout (Base64 Images & LaTeX)...")
        step_start = time.perf_counter()
        final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
        if not final_result:
            print("❌ FAILED at Step 4: Final formatting failed.")
            return None
        print(f"✅ Step 4 Complete ({time.perf_counter() - step_start:.2f}s)")

        # --- Phase 5: Hierarchical Tagging ---
        print("\n[Step 5/5] AI Classification (Subject/Concept Tagging)...")
        step_start = time.perf_counter()
        classifier = HierarchicalClassifier()
        if classifier.load_models():
            final_result = post_process_json_with_inference(final_result, classifier)
            print(f"✅ Step 5 Complete: Tags added ({time.perf_counter() - step_start:.2f}s)")
        else:
            # Tagging is optional: the un-tagged step 4 result is still returned.
            print("⚠️ WARNING: Classifier models failed to load. Skipping tags.")

    except Exception as e:
        # Top-level boundary: report the failure and signal it with None
        # instead of propagating to the caller.
        print("\n‼️ FATAL PIPELINE EXCEPTION:")
        print(f"Error Message: {e}")
        traceback.print_exc()
        return None

    finally:
        # Runs on success, on early `return None`, and on exceptions alike.
        print(f"\nCleaning up temporary files in {temp_pipeline_dir}...")
        try:
            # rmtree also removes dotfiles and nested directories, which the
            # glob('*') + os.remove + os.rmdir sequence could not handle.
            shutil.rmtree(temp_pipeline_dir)
            print("🧹 Cleanup successful.")
        except OSError as e:
            print(f"⚠️ Cleanup failed: {e}")

    # Only reached on success: every failure path returns inside the try.
    total_time = time.perf_counter() - overall_start
    print("\n" + banner)
    print(f"### PIPELINE COMPLETE | Total Time: {total_time:.2f}s ###")
    print(banner)

    return final_result
2257
 
2258