heerjtdev committed on
Commit
98a2928
·
verified ·
1 Parent(s): 414d12d

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +103 -857
working_yolo_pipeline.py CHANGED
@@ -146,60 +146,6 @@ def get_latex_from_base64(base64_string: str) -> str:
146
 
147
 
148
 
149
- # def get_latex_from_base64(base64_string: str) -> str:
150
- # """
151
- # Decodes a Base64 image string and uses the pre-initialized TrOCR/ORT model
152
- # to recognize the formula. It cleans the output by removing spaces and
153
- # crucially, replacing double backslashes with single backslashes for correct LaTeX.
154
- # """
155
- # if ort_model is None or processor is None:
156
- # return "[MODEL_ERROR: Model not initialized]"
157
-
158
- # try:
159
- # # 1. Decode Base64 to Image
160
- # image_data = base64.b64decode(base64_string)
161
- # # We must ensure the image is RGB format for the model input
162
- # image = Image.open(io.BytesIO(image_data)).convert('RGB')
163
-
164
- # # 2. Preprocess the image
165
- # pixel_values = processor(images=image, return_tensors="pt").pixel_values
166
-
167
- # # 3. Text Generation (OCR)
168
- # generated_ids = ort_model.generate(pixel_values)
169
- # raw_generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
170
-
171
- # if not raw_generated_text:
172
- # return "[OCR_WARNING: No formula found]"
173
-
174
- # latex_string = raw_generated_text[0]
175
-
176
- # # ==============================================================================
177
- # # --- DEBUGGING BLOCK: CHECK TrOCR RAW OUTPUT ---
178
- # # ==============================================================================
179
- # print(f"[DEBUG] TrOCR Raw Output: '{latex_string}'")
180
- # # ==============================================================================
181
-
182
- # # --- 4. Post-processing and Cleanup ---
183
-
184
- # # # A. Remove all spaces/line breaks
185
- # # cleaned_latex = re.sub(r'\s+', '', latex_string)
186
- # cleaned_latex = re.sub(r'[\r\n]+', '', latex_string)
187
-
188
- # # B. CRITICAL FIX: Replace double backslashes (\\) with single backslashes (\).
189
- # # This corrects model output that already over-escaped the LaTeX commands.
190
- # # Python literal: '\\\\' is replaced with '\\'.
191
- # #cleaned_latex = cleaned_latex.replace('\\\\', '\\')
192
-
193
- # return cleaned_latex
194
-
195
-
196
- # except Exception as e:
197
- # # Catch any unexpected errors
198
- # print(f" ❌ TR-OCR Recognition failed: {e}")
199
- # return f"[TR_OCR_ERROR: Recognition failed: {e}]"
200
-
201
-
202
-
203
 
204
  # ============================================================================
205
  # --- CONFIGURATION AND CONSTANTS ---
@@ -640,79 +586,6 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
640
 
641
 
642
 
643
- # def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
644
- # raw_word_data = fitz_page.get_text("words")
645
- # converted_ocr_output = []
646
- # DEFAULT_CONFIDENCE = 99.0
647
-
648
- # for x1, y1, x2, y2, word, *rest in raw_word_data:
649
- # # --- FIX: SANITIZE TEXT HERE ---
650
- # # cleaned_word = sanitize_text(word)
651
- # # if not cleaned_word.strip(): continue
652
-
653
- # x1_pix = int(x1 * scale_factor)
654
- # y1_pix = int(y1 * scale_factor)
655
- # x2_pix = int(x2 * scale_factor)
656
- # y2_pix = int(y2 * scale_factor)
657
- # converted_ocr_output.append({
658
- # 'type': 'text',
659
- # 'word': cleaned_word, # Use the sanitized word
660
- # 'confidence': DEFAULT_CONFIDENCE,
661
- # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
662
- # 'y0': y1_pix, 'x0': x1_pix
663
- # })
664
- # return converted_ocr_output
665
-
666
-
667
-
668
-
669
-
670
- # def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
671
- # raw_word_data = fitz_page.get_text("words")
672
-
673
- # # ==============================================================================
674
- # # --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS ---
675
- # # ==============================================================================
676
- # print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")
677
- # debug_count = 0
678
- # for item in raw_word_data:
679
- # if debug_count >= 50: break
680
- # # item format: (x0, y0, x1, y1, word, block_no, line_no, word_no)
681
- # word_text = item[4]
682
-
683
- # # Generate unicode hex codes for every character in the word
684
- # unicode_points = [f"\\u{ord(c):04x}" for c in word_text]
685
- # print(f" Word {debug_count}: '{word_text}' -> Codes: {unicode_points}")
686
- # debug_count += 1
687
- # print("----------------------------------------------------------------------\n")
688
- # # ==============================================================================
689
-
690
- # converted_ocr_output = []
691
- # DEFAULT_CONFIDENCE = 99.0
692
-
693
- # for x1, y1, x2, y2, word, *rest in raw_word_data:
694
- # # --- FIX: SANITIZE TEXT HERE ---
695
- # cleaned_word = sanitize_text(word)
696
- # if not cleaned_word.strip(): continue
697
-
698
- # x1_pix = int(x1 * scale_factor)
699
- # y1_pix = int(y1 * scale_factor)
700
- # x2_pix = int(x2 * scale_factor)
701
- # y2_pix = int(y2 * scale_factor)
702
- # converted_ocr_output.append({
703
- # 'type': 'text',
704
- # 'word': cleaned_word, # Use the sanitized word
705
- # 'confidence': DEFAULT_CONFIDENCE,
706
- # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
707
- # 'y0': y1_pix, 'x0': x1_pix
708
- # })
709
- # return converted_ocr_output
710
-
711
-
712
-
713
-
714
-
715
-
716
 
717
 
718
 
@@ -1242,285 +1115,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1242
 
1243
 
1244
 
1245
-
1246
- # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1247
- # page_num: int, fitz_page: fitz.Page,
1248
- # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
1249
- # """
1250
- # OPTIMIZED FLOW:
1251
- # 1. Run YOLO to find Equations/Tables.
1252
- # 2. Mask raw text with YOLO boxes.
1253
- # 3. Run Column Detection on the MASKED data.
1254
- # 4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
1255
- # """
1256
- # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1257
-
1258
- # start_time_total = time.time()
1259
-
1260
- # if original_img is None:
1261
- # print(f" ❌ Invalid image for page {page_num}.")
1262
- # return None, None
1263
-
1264
- # # ====================================================================
1265
- # # --- STEP 1: YOLO DETECTION ---
1266
- # # ====================================================================
1267
- # start_time_yolo = time.time()
1268
- # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
1269
-
1270
- # relevant_detections = []
1271
- # if results and results[0].boxes:
1272
- # for box in results[0].boxes:
1273
- # class_id = int(box.cls[0])
1274
- # class_name = model.names[class_id]
1275
- # if class_name in TARGET_CLASSES:
1276
- # x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
1277
- # relevant_detections.append(
1278
- # {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
1279
- # )
1280
-
1281
- # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1282
- # print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1283
-
1284
- # # ====================================================================
1285
- # # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
1286
- # # ====================================================================
1287
- # # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
1288
- # raw_words_for_layout = get_word_data_for_detection(
1289
- # fitz_page, pdf_path, page_num,
1290
- # top_margin_percent=0.10, bottom_margin_percent=0.10
1291
- # )
1292
-
1293
- # masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
1294
-
1295
- # # ====================================================================
1296
- # # --- STEP 3: COLUMN DETECTION ---
1297
- # # ====================================================================
1298
- # page_width_pdf = fitz_page.rect.width
1299
- # page_height_pdf = fitz_page.rect.height
1300
-
1301
- # column_detection_params = {
1302
- # 'cluster_bin_size': 2, 'cluster_smoothing': 2,
1303
- # 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
1304
- # }
1305
-
1306
- # separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
1307
-
1308
- # page_separator_x = None
1309
- # if separators:
1310
- # central_min = page_width_pdf * 0.35
1311
- # central_max = page_width_pdf * 0.65
1312
- # central_separators = [s for s in separators if central_min <= s <= central_max]
1313
-
1314
- # if central_separators:
1315
- # center_x = page_width_pdf / 2
1316
- # page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
1317
- # print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
1318
- # else:
1319
- # print(" ⚠️ Gutter found off-center. Ignoring.")
1320
- # else:
1321
- # print(" -> Single Column Layout Confirmed.")
1322
-
1323
- # # ====================================================================
1324
- # # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
1325
- # # ====================================================================
1326
- # start_time_components = time.time()
1327
- # component_metadata = []
1328
- # fig_count_page = 0
1329
- # eq_count_page = 0
1330
-
1331
- # for detection in merged_detections:
1332
- # x1, y1, x2, y2 = detection['coords']
1333
- # class_name = detection['class']
1334
-
1335
- # if class_name == 'figure':
1336
- # GLOBAL_FIGURE_COUNT += 1
1337
- # counter = GLOBAL_FIGURE_COUNT
1338
- # component_word = f"FIGURE{counter}"
1339
- # fig_count_page += 1
1340
- # elif class_name == 'equation':
1341
- # GLOBAL_EQUATION_COUNT += 1
1342
- # counter = GLOBAL_EQUATION_COUNT
1343
- # component_word = f"EQUATION{counter}"
1344
- # eq_count_page += 1
1345
- # else:
1346
- # continue
1347
-
1348
- # component_crop = original_img[y1:y2, x1:x2]
1349
- # component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
1350
- # cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
1351
-
1352
- # y_midpoint = (y1 + y2) // 2
1353
- # component_metadata.append({
1354
- # 'type': class_name, 'word': component_word,
1355
- # 'bbox': [int(x1), int(y1), int(x2), int(y2)],
1356
- # 'y0': int(y_midpoint), 'x0': int(x1)
1357
- # })
1358
-
1359
- # # ====================================================================
1360
- # # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
1361
- # # ====================================================================
1362
- # raw_ocr_output = []
1363
- # scale_factor = 2.0 # Pipeline standard scale
1364
-
1365
- # try:
1366
- # # Try getting native text first
1367
- # # NOTE: extract_native_words_and_convert MUST ALSO BE UPDATED TO USE sanitize_text
1368
- # raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
1369
- # except Exception as e:
1370
- # print(f" ❌ Native text extraction failed: {e}")
1371
-
1372
- # # If native text is missing, fall back to OCR
1373
- # if not raw_ocr_output:
1374
- # if _ocr_cache.has_ocr(pdf_path, page_num):
1375
- # print(f" ⚑ Using cached Tesseract OCR for page {page_num}")
1376
- # cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
1377
- # for word_tuple in cached_word_data:
1378
- # word_text, x1, y1, x2, y2 = word_tuple
1379
-
1380
- # # Scale from PDF points to Pipeline Pixels (2.0)
1381
- # x1_pix = int(x1 * scale_factor)
1382
- # y1_pix = int(y1 * scale_factor)
1383
- # x2_pix = int(x2 * scale_factor)
1384
- # y2_pix = int(y2 * scale_factor)
1385
-
1386
- # raw_ocr_output.append({
1387
- # 'type': 'text', 'word': word_text, 'confidence': 95.0,
1388
- # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
1389
- # 'y0': y1_pix, 'x0': x1_pix
1390
- # })
1391
- # else:
1392
- # # === START OF OPTIMIZED OCR BLOCK ===
1393
- # try:
1394
- # # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
1395
- # ocr_zoom = 4.0
1396
- # pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1397
-
1398
- # # Convert PyMuPDF Pixmap to OpenCV format
1399
- # img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1400
- # pix_ocr.n)
1401
- # if pix_ocr.n == 3:
1402
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1403
- # elif pix_ocr.n == 4:
1404
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1405
-
1406
- # # 2. Preprocess (Binarization)
1407
- # processed_img = preprocess_image_for_ocr(img_ocr_np)
1408
-
1409
- # # 3. Run Tesseract with Optimized Configuration
1410
- # custom_config = r'--oem 3 --psm 6'
1411
-
1412
- # hocr_data = pytesseract.image_to_data(
1413
- # processed_img,
1414
- # output_type=pytesseract.Output.DICT,
1415
- # config=custom_config
1416
- # )
1417
-
1418
- # # ==============================================================================
1419
- # # --- DEBUGGING BLOCK: CHECK FIRST 50 OCR WORDS ---
1420
- # # ==============================================================================
1421
- # print(f"\n[DEBUG] Tesseract OCR Fallback (Page {page_num}): Checking first 50 words...")
1422
- # debug_count = 0
1423
- # for i in range(len(hocr_data['level'])):
1424
- # text = hocr_data['text'][i].strip()
1425
- # if text:
1426
- # unicode_points = [f"\\u{ord(c):04x}" for c in text]
1427
- # print(f" OCR Word {debug_count}: '{text}' -> Codes: {unicode_points}")
1428
- # debug_count += 1
1429
- # if debug_count >= 50: break
1430
- # print("----------------------------------------------------------------------\n")
1431
- # # ==============================================================================
1432
-
1433
- # for i in range(len(hocr_data['level'])):
1434
- # text = hocr_data['text'][i] # Retrieve raw Tesseract text
1435
-
1436
- # # --- FIX: SANITIZE TEXT AND THEN STRIP ---
1437
- # cleaned_text = sanitize_text(text).strip()
1438
-
1439
- # if cleaned_text and hocr_data['conf'][i] > -1:
1440
- # # 4. Coordinate Mapping
1441
- # scale_adjustment = scale_factor / ocr_zoom
1442
-
1443
- # x1 = int(hocr_data['left'][i] * scale_adjustment)
1444
- # y1 = int(hocr_data['top'][i] * scale_adjustment)
1445
- # w = int(hocr_data['width'][i] * scale_adjustment)
1446
- # h = int(hocr_data['height'][i] * scale_adjustment)
1447
- # x2 = x1 + w
1448
- # y2 = y1 + h
1449
-
1450
- # raw_ocr_output.append({
1451
- # 'type': 'text',
1452
- # 'word': cleaned_text, # Use the sanitized word
1453
- # 'confidence': float(hocr_data['conf'][i]),
1454
- # 'bbox': [x1, y1, x2, y2],
1455
- # 'y0': y1,
1456
- # 'x0': x1
1457
- # })
1458
- # except Exception as e:
1459
- # print(f" ❌ Tesseract OCR Error: {e}")
1460
- # # === END OF OPTIMIZED OCR BLOCK ===
1461
-
1462
- # # ====================================================================
1463
- # # --- STEP 6: OCR CLEANING AND MERGING ---
1464
- # # ====================================================================
1465
- # items_to_sort = []
1466
-
1467
- # for ocr_word in raw_ocr_output:
1468
- # is_suppressed = False
1469
- # for component in component_metadata:
1470
- # # Do not include words that are inside figure/equation boxes
1471
- # ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
1472
- # if ioa > IOA_SUPPRESSION_THRESHOLD:
1473
- # is_suppressed = True
1474
- # break
1475
- # if not is_suppressed:
1476
- # items_to_sort.append(ocr_word)
1477
-
1478
- # # Add figures/equations back into the flow as "words"
1479
- # items_to_sort.extend(component_metadata)
1480
-
1481
- # # ====================================================================
1482
- # # --- STEP 7: LINE-BASED SORTING ---
1483
- # # ====================================================================
1484
- # items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
1485
- # lines = []
1486
-
1487
- # for item in items_to_sort:
1488
- # placed = False
1489
- # for line in lines:
1490
- # y_ref = min(it['y0'] for it in line)
1491
- # if abs(y_ref - item['y0']) < LINE_TOLERANCE:
1492
- # line.append(item)
1493
- # placed = True
1494
- # break
1495
- # if not placed and item['type'] in ['equation', 'figure']:
1496
- # for line in lines:
1497
- # y_ref = min(it['y0'] for it in line)
1498
- # if abs(y_ref - item['y0']) < 20:
1499
- # line.append(item)
1500
- # placed = True
1501
- # break
1502
- # if not placed:
1503
- # lines.append([item])
1504
-
1505
- # for line in lines:
1506
- # line.sort(key=lambda x: x['x0'])
1507
-
1508
- # final_output = []
1509
- # for line in lines:
1510
- # for item in line:
1511
- # data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
1512
- # if 'tag' in item: data_item['tag'] = item['tag']
1513
- # final_output.append(data_item)
1514
-
1515
- # return final_output, page_separator_x
1516
-
1517
-
1518
-
1519
-
1520
-
1521
-
1522
-
1523
-
1524
  def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
1525
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1526
 
@@ -1978,299 +1572,6 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
1978
 
1979
 
1980
 
1981
- # def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
1982
- # preprocessed_json_path: str,
1983
- # column_detection_params: Optional[Dict] = None) -> List[Dict[str, Any]]:
1984
- # print("\n" + "=" * 80)
1985
- # print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE (Raw Word Output) ---")
1986
- # print("=" * 80)
1987
-
1988
- # tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
1989
- # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
1990
- # print(f" -> Using device: {device}")
1991
-
1992
- # try:
1993
- # model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
1994
- # checkpoint = torch.load(model_path, map_location=device)
1995
- # model_state = checkpoint.get('model_state_dict', checkpoint)
1996
- # # Apply patch for layoutlmv3 compatibility with saved state_dict
1997
- # fixed_state_dict = {key.replace('layoutlm.', 'layoutlmv3.'): value for key, value in model_state.items()}
1998
- # model.load_state_dict(fixed_state_dict)
1999
- # model.to(device)
2000
- # model.eval()
2001
- # print(f"βœ… LayoutLMv3 Model loaded successfully from {os.path.basename(model_path)}.")
2002
- # except Exception as e:
2003
- # print(f"❌ FATAL ERROR during LayoutLMv3 model loading: {e}")
2004
- # return []
2005
-
2006
- # try:
2007
- # with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
2008
- # preprocessed_data = json.load(f)
2009
- # print(f"βœ… Loaded preprocessed data with {len(preprocessed_data)} pages.")
2010
- # except Exception:
2011
- # print("❌ Error loading preprocessed JSON.")
2012
- # return []
2013
-
2014
- # try:
2015
- # doc = fitz.open(pdf_path)
2016
- # except Exception:
2017
- # print("❌ Error loading PDF.")
2018
- # return []
2019
-
2020
- # final_page_predictions = []
2021
- # CHUNK_SIZE = 500
2022
-
2023
- # for page_data in preprocessed_data:
2024
- # page_num_1_based = page_data['page_number']
2025
- # page_num_0_based = page_num_1_based - 1
2026
- # page_raw_predictions = []
2027
- # print(f"\n *** Processing Page {page_num_1_based} ({len(page_data['data'])} raw tokens) ***")
2028
-
2029
- # fitz_page = doc.load_page(page_num_0_based)
2030
- # page_width, page_height = fitz_page.rect.width, fitz_page.rect.height
2031
- # print(f" -> Page dimensions: {page_width:.0f}x{page_height:.0f} (PDF points).")
2032
-
2033
- # all_token_data = []
2034
- # scale_factor = 2.0
2035
-
2036
- # for item in page_data['data']:
2037
- # raw_yolo_bbox = item['bbox']
2038
- # bbox_pdf = [
2039
- # int(raw_yolo_bbox[0] / scale_factor), int(raw_yolo_bbox[1] / scale_factor),
2040
- # int(raw_yolo_bbox[2] / scale_factor), int(raw_yolo_bbox[3] / scale_factor)
2041
- # ]
2042
- # normalized_bbox = [
2043
- # max(0, min(1000, int(1000 * bbox_pdf[0] / page_width))),
2044
- # max(0, min(1000, int(1000 * bbox_pdf[1] / page_height))),
2045
- # max(0, min(1000, int(1000 * bbox_pdf[2] / page_width))),
2046
- # max(0, min(1000, int(1000 * bbox_pdf[3] / page_height)))
2047
- # ]
2048
- # all_token_data.append({
2049
- # "word": item['word'],
2050
- # "bbox_raw_pdf_space": bbox_pdf,
2051
- # "bbox_normalized": normalized_bbox,
2052
- # "item_original_data": item
2053
- # })
2054
-
2055
- # # ==============================================================================
2056
- # # --- DEBUGGING BLOCK: CHECK FIRST 50 TOKENS BEFORE INFERENCE ---
2057
- # # ==============================================================================
2058
- # print(f"\n[DEBUG] LayoutLMv3 Input (Page {page_num_1_based}): Checking first 50 tokens...")
2059
- # debug_count = 0
2060
- # for t in all_token_data:
2061
- # if debug_count >= 50: break
2062
- # w = t['word']
2063
- # unicode_points = [f"\\u{ord(c):04x}" for c in w]
2064
- # print(f" Token {debug_count}: '{w}' -> Codes: {unicode_points}")
2065
- # debug_count += 1
2066
- # print("----------------------------------------------------------------------\n")
2067
- # # ==============================================================================
2068
-
2069
- # if not all_token_data:
2070
- # continue
2071
-
2072
- # column_separator_x = page_data.get('column_separator_x', None)
2073
- # if column_separator_x is not None:
2074
- # print(f" -> Using SAVED column separator: X={column_separator_x}")
2075
- # else:
2076
- # print(" -> No column separator found. Assuming single chunk.")
2077
-
2078
- # token_chunks = _merge_integrity(all_token_data, column_separator_x)
2079
- # total_chunks = len(token_chunks)
2080
-
2081
- # for chunk_idx, chunk_tokens in enumerate(token_chunks):
2082
- # if not chunk_tokens: continue
2083
-
2084
- # # 1. Sanitize: Convert everything to strings and aggressively clean Unicode errors.
2085
- # chunk_words = [
2086
- # str(t['word']).encode('utf-8', errors='ignore').decode('utf-8')
2087
- # for t in chunk_tokens
2088
- # ]
2089
- # chunk_normalized_bboxes = [t['bbox_normalized'] for t in chunk_tokens]
2090
-
2091
- # total_sub_chunks = (len(chunk_words) + CHUNK_SIZE - 1) // CHUNK_SIZE
2092
- # for i in range(0, len(chunk_words), CHUNK_SIZE):
2093
- # sub_chunk_idx = i // CHUNK_SIZE + 1
2094
- # sub_words = chunk_words[i:i + CHUNK_SIZE]
2095
- # sub_bboxes = chunk_normalized_bboxes[i:i + CHUNK_SIZE]
2096
- # sub_tokens_data = chunk_tokens[i:i + CHUNK_SIZE]
2097
-
2098
- # print(f" -> Chunk {chunk_idx + 1}/{total_chunks}, Sub-chunk {sub_chunk_idx}/{total_sub_chunks}: {len(sub_words)} words. Running Inference...")
2099
-
2100
- # # 2. Manual generation of word_ids
2101
- # manual_word_ids = []
2102
- # for current_word_idx, word in enumerate(sub_words):
2103
- # sub_tokens = tokenizer.tokenize(word)
2104
- # for _ in sub_tokens:
2105
- # manual_word_ids.append(current_word_idx)
2106
-
2107
- # encoded_input = tokenizer(
2108
- # sub_words,
2109
- # boxes=sub_bboxes,
2110
- # truncation=True,
2111
- # padding="max_length",
2112
- # max_length=512,
2113
- # is_split_into_words=True,
2114
- # return_tensors="pt"
2115
- # )
2116
-
2117
- # # Check for empty sequence
2118
- # if encoded_input['input_ids'].shape[0] == 0:
2119
- # print(f" -> Warning: Sub-chunk {sub_chunk_idx} encoded to an empty sequence. Skipping.")
2120
- # continue
2121
-
2122
- # # 3. Finalize word_ids based on encoded output length
2123
- # sequence_length = int(torch.sum(encoded_input['attention_mask']).item())
2124
- # content_token_length = max(0, sequence_length - 2)
2125
-
2126
- # manual_word_ids = manual_word_ids[:content_token_length]
2127
-
2128
- # final_word_ids = [None] # CLS token (index 0)
2129
- # final_word_ids.extend(manual_word_ids)
2130
-
2131
- # if sequence_length > 1:
2132
- # final_word_ids.append(None) # SEP token
2133
-
2134
- # final_word_ids.extend([None] * (512 - len(final_word_ids)))
2135
- # word_ids = final_word_ids[:512] # Final array for mapping
2136
-
2137
- # # Inputs are already batched by the tokenizer as [1, 512]
2138
- # input_ids = encoded_input['input_ids'].to(device)
2139
- # bbox = encoded_input['bbox'].to(device)
2140
- # attention_mask = encoded_input['attention_mask'].to(device)
2141
-
2142
- # with torch.no_grad():
2143
- # model_outputs = model(input_ids, bbox, attention_mask)
2144
-
2145
- # # --- Robust extraction: support several forward return types ---
2146
- # logits_tensor = None
2147
- # decoded_labels_list = None
2148
-
2149
- # # case 1: tuple/list with (emissions, viterbi)
2150
- # if isinstance(model_outputs, (tuple, list)) and len(model_outputs) == 2:
2151
- # a, b = model_outputs
2152
- # if isinstance(a, torch.Tensor):
2153
- # logits_tensor = a
2154
- # if isinstance(b, list):
2155
- # decoded_labels_list = b
2156
-
2157
- # # case 2: HF ModelOutput with .logits
2158
- # if logits_tensor is None and hasattr(model_outputs, 'logits') and isinstance(model_outputs.logits, torch.Tensor):
2159
- # logits_tensor = model_outputs.logits
2160
-
2161
- # # case 3: tuple/list - search for a 3D tensor (B, L, C)
2162
- # if logits_tensor is None and isinstance(model_outputs, (tuple, list)):
2163
- # found_tensor = None
2164
- # for item in model_outputs:
2165
- # if isinstance(item, torch.Tensor):
2166
- # if item.dim() == 3:
2167
- # logits_tensor = item
2168
- # break
2169
- # if found_tensor is None:
2170
- # found_tensor = item
2171
- # if logits_tensor is None and found_tensor is not None:
2172
- # if found_tensor.dim() == 3 and found_tensor.shape[-1] == NUM_LABELS:
2173
- # logits_tensor = found_tensor
2174
- # elif found_tensor.dim() == 2 and found_tensor.shape[-1] == NUM_LABELS:
2175
- # logits_tensor = found_tensor.unsqueeze(0)
2176
-
2177
- # # case 4: model_outputs directly a tensor
2178
- # if logits_tensor is None and isinstance(model_outputs, torch.Tensor):
2179
- # logits_tensor = model_outputs
2180
-
2181
- # # case 5: model_outputs is a decoded viterbi list (common for CRF-only forward)
2182
- # if decoded_labels_list is None and isinstance(model_outputs, list) and model_outputs and isinstance(model_outputs[0], list):
2183
- # decoded_labels_list = model_outputs
2184
-
2185
- # # If neither logits nor decoded exist, that's fatal
2186
- # if logits_tensor is None and decoded_labels_list is None:
2187
- # try:
2188
- # elem_shapes = [ (type(x), getattr(x, 'shape', None)) for x in model_outputs ] if isinstance(model_outputs, (list, tuple)) else [(type(model_outputs), getattr(model_outputs, 'shape', None))]
2189
- # except Exception:
2190
- # elem_shapes = str(type(model_outputs))
2191
- # raise RuntimeError(f"Model output of type {type(model_outputs)} did not contain a valid logits tensor or decoded viterbi. Contents: {elem_shapes}")
2192
-
2193
- # # If we have logits_tensor, normalize shape to [seq_len, num_labels]
2194
- # if logits_tensor is not None:
2195
- # if logits_tensor.dim() == 3 and logits_tensor.shape[0] == 1:
2196
- # preds_tensor = logits_tensor.squeeze(0) # [L, C]
2197
- # else:
2198
- # preds_tensor = logits_tensor # possibly [L, C] already
2199
-
2200
- # if preds_tensor.dim() != 2:
2201
- # raise RuntimeError(f"Unexpected logits tensor shape: {tuple(preds_tensor.shape)}")
2202
- # else:
2203
- # preds_tensor = None # no logits available
2204
-
2205
- # # If decoded labels provided, make a token-level list-of-ints aligned to tokenizer tokens
2206
- # decoded_token_labels = None
2207
- # if decoded_labels_list is not None:
2208
- # decoded_token_labels = decoded_labels_list[0] if isinstance(decoded_labels_list[0], list) else decoded_labels_list
2209
-
2210
- # # Now map token-level predictions -> word-level predictions using word_ids
2211
- # word_idx_to_pred_id = {}
2212
-
2213
- # if preds_tensor is not None:
2214
- # for token_idx, word_idx in enumerate(word_ids):
2215
- # if token_idx >= sequence_length:
2216
- # break
2217
- # if word_idx is not None and word_idx < len(sub_words):
2218
- # if word_idx not in word_idx_to_pred_id:
2219
- # pred_id = torch.argmax(preds_tensor[token_idx]).item()
2220
- # word_idx_to_pred_id[word_idx] = pred_id
2221
- # else:
2222
- # if decoded_token_labels is None:
2223
- # raise RuntimeError("No logits and no decoded labels available for mapping.")
2224
- # decoded_len = len(decoded_token_labels)
2225
- # if decoded_len == content_token_length:
2226
- # decoded_start = 1
2227
- # elif decoded_len == sequence_length:
2228
- # decoded_start = 0
2229
- # else:
2230
- # decoded_start = 1
2231
-
2232
- # for tok_idx_in_decoded, label_id in enumerate(decoded_token_labels):
2233
- # tok_idx = decoded_start + tok_idx_in_decoded
2234
- # if tok_idx >= 512:
2235
- # break
2236
- # if tok_idx >= sequence_length:
2237
- # break
2238
- # word_idx = word_ids[tok_idx] if tok_idx < len(word_ids) else None
2239
- # if word_idx is not None and word_idx < len(sub_words):
2240
- # if word_idx not in word_idx_to_pred_id:
2241
- # word_idx_to_pred_id[word_idx] = int(label_id)
2242
-
2243
- # # Finally convert mapped word preds -> page_raw_predictions entries
2244
- # for current_word_idx in range(len(sub_words)):
2245
- # pred_id = word_idx_to_pred_id.get(current_word_idx, 0) # default to 0
2246
- # predicted_label = ID_TO_LABEL[pred_id]
2247
- # original_token = sub_tokens_data[current_word_idx]
2248
- # page_raw_predictions.append({
2249
- # "word": original_token['word'],
2250
- # "bbox": original_token['bbox_raw_pdf_space'],
2251
- # "predicted_label": predicted_label,
2252
- # "page_number": page_num_1_based
2253
- # })
2254
-
2255
- # if page_raw_predictions:
2256
- # final_page_predictions.append({
2257
- # "page_number": page_num_1_based,
2258
- # "data": page_raw_predictions
2259
- # })
2260
- # print(f" *** Page {page_num_1_based} Finalized: {len(page_raw_predictions)} labeled words. ***")
2261
-
2262
- # doc.close()
2263
- # print("\n" + "=" * 80)
2264
- # print("--- LAYOUTLMV3 INFERENCE COMPLETE ---")
2265
- # print("=" * 80)
2266
- # return final_page_predictions
2267
-
2268
-
2269
-
2270
-
2271
-
2272
-
2273
-
2274
 
2275
  # ============================================================================
2276
  # --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
@@ -2758,207 +2059,152 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
2758
 
2759
 
2760
 
2761
-
2762
- # # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
2763
- # # List[Dict[str, Any]]]:
2764
- # def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2765
- # if not os.path.exists(input_pdf_path): return None
2766
-
2767
- # print("\n" + "#" * 80)
2768
- # print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
2769
- # print("#" * 80)
2770
-
2771
- # pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
2772
- # temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
2773
- # os.makedirs(temp_pipeline_dir, exist_ok=True)
2774
-
2775
- # preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
2776
- # raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
2777
- # structured_intermediate_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json")
2778
-
2779
- # final_result = None
2780
- # try:
2781
- # # Phase 1: Preprocessing with YOLO First + Masking
2782
- # preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
2783
- # if not preprocessed_json_path_out: return None
2784
-
2785
- # # Phase 2: Inference
2786
- # page_raw_predictions_list = run_inference_and_get_raw_words(
2787
- # input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
2788
- # )
2789
- # if not page_raw_predictions_list: return None
2790
-
2791
- # # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
2792
- # # Save raw predictions to the temporary file
2793
- # with open(raw_output_path, 'w', encoding='utf-8') as f:
2794
- # json.dump(page_raw_predictions_list, f, indent=4)
2795
-
2796
- # # Explicitly copy/save the raw predictions to the user-specified debug path
2797
- # # if raw_predictions_output_path:
2798
- # # shutil.copy(raw_output_path, raw_predictions_output_path)
2799
- # # print(f"\nβœ… DEBUG: Raw predictions saved to: {raw_predictions_output_path}")
2800
- # # ----------------------------------------
2801
-
2802
- # # Phase 3: Decoding
2803
- # structured_data_list = convert_bio_to_structured_json_relaxed(
2804
- # raw_output_path, structured_intermediate_output_path
2805
- # )
2806
- # if not structured_data_list: return None
2807
- # structured_data_list = correct_misaligned_options(structured_data_list)
2808
- # structured_data_list = process_context_linking(structured_data_list)
2809
-
2810
-
2811
- # # Phase 4: Embedding / Equation to LaTeX Conversion
2812
- # final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
2813
-
2814
-
2815
-
2816
-
2817
  def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
2818
  """
2819
- Wraps a standard image file into a single-page PyMuPDF Document.
2820
- This ensures it can be processed by your existing fitz-based functions
2821
- (coordinate scaling, column detection, etc.) exactly as before.
2822
  """
2823
  img = Image.open(image_path)
2824
- # Convert image to a PDF stream in memory
2825
  pdf_bytes = fitz.open("pdf", img.tobytes("pdf")).tobytes()
2826
  doc = fitz.open("pdf", pdf_bytes)
2827
  return doc, doc[0]
2828
 
2829
  def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2830
  """
2831
- Main pipeline modified to handle both PDF and Image files.
 
2832
  """
2833
- # 1. INITIALIZE MODELS (Preserving original logic)
2834
  yolo_model = YOLO(WEIGHTS_PATH)
2835
 
2836
  # 2. DETECT FILE TYPE
2837
- ext = os.path.splitext(input_path)[1].lower()
2838
  is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
2839
 
2840
  all_pages_data = []
2841
-
2842
- # 3. BRANCH LOGIC: IMAGE VS PDF
2843
- if is_image:
2844
- print(f"πŸ“Έ Image detected: {input_path}. Initializing Single-Page Pipeline.")
2845
- doc, page = load_image_as_fitz_page(input_path)
2846
-
2847
- # Process as Page 0. Because there is no native text, your existing
2848
- # Tesseract fallback will naturally trigger to read the content.
2849
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
2850
- img_np = pixmap_to_numpy(pix)
2851
-
2852
- page_data, _ = preprocess_and_ocr_page(
2853
- img_np,
2854
- yolo_model,
2855
- input_path,
2856
- 0, # Page 0
2857
- page,
2858
- os.path.basename(input_path)
2859
- )
2860
- if page_data:
2861
- all_pages_data.append(page_data)
2862
- doc.close()
2863
 
2864
- else:
2865
- # Standard PDF Processing Loop
2866
- try:
2867
- doc = fitz.open(input_path)
2868
- print(f"πŸ“„ Processing PDF with {len(doc)} pages: {input_path}")
2869
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2870
  for page_index in range(len(doc)):
2871
  page = doc[page_index]
2872
  pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
2873
  img_np = pixmap_to_numpy(pix)
2874
 
2875
  page_data, _ = preprocess_and_ocr_page(
2876
- img_np,
2877
- yolo_model,
2878
- input_path,
2879
- page_index,
2880
- page,
2881
- os.path.basename(input_path)
2882
  )
2883
  if page_data:
2884
  all_pages_data.append(page_data)
2885
  doc.close()
2886
- except Exception as e:
2887
- print(f"❌ Error opening PDF {input_path}: {e}")
 
2888
  return None
2889
 
2890
- # 4. CONTINUE EXACTLY AS BEFORE: Gathering and Inference
2891
- if not all_pages_data:
2892
- print("❌ No data extracted from document.")
2893
- return None
2894
 
2895
- # Sequence all blocks from all pages (or the single image page)
2896
- sequential_blocks = []
2897
- for p_data in all_pages_data:
2898
- sequential_blocks.extend(p_data.get('blocks', []))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2899
 
2900
- print("\n" + "=" * 80)
2901
- print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
2902
- print("=" * 80)
2903
 
2904
- # Run LayoutLMv3 Inference on the gathered blocks
2905
- final_structured_data = run_layoutlmv3_inference_on_blocks(
2906
- sequential_blocks,
2907
- layoutlmv3_model_path
2908
- )
2909
 
2910
- # Run Hierarchical classification (Subject/Concept tags)
2911
- classifier = HierarchicalClassifier()
2912
- if classifier.load_models():
2913
- final_structured_data = post_process_json_with_inference(final_structured_data, classifier)
2914
- print("βœ… Classification complete. Tags added.")
2915
- else:
2916
- print("❌ Classifier not found. Returning untagged data.")
2917
 
2918
- return final_structured_data
2919
 
2920
 
2921
- #================================================================================
2922
- # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
2923
- #================================================================================
2924
 
2925
- print("\n" + "=" * 80)
2926
- print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
2927
- print("=" * 80)
2928
-
2929
- # 1. Initialize and Load the Classifier
2930
- classifier = HierarchicalClassifier()
2931
- if classifier.load_models():
2932
- # 2. Run Classification on the *Final* Result
2933
- # The function modifies the list in place and returns it
2934
- final_result = post_process_json_with_inference(
2935
- final_result, classifier
2936
- )
2937
- print("βœ… Classification complete. Tags added to final output.")
2938
- else:
2939
- print("❌ Classification model loading failed. Outputting un-tagged data.")
2940
 
2941
- # ====================================================================
2942
 
2943
 
2944
- except Exception as e:
2945
- print(f"❌ FATAL ERROR: {e}")
2946
- import traceback
2947
- traceback.print_exc()
2948
- return None
 
 
 
 
 
 
 
 
2949
 
2950
- finally:
2951
- try:
2952
- for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
2953
- os.remove(f)
2954
- os.rmdir(temp_pipeline_dir)
2955
- except Exception:
2956
- pass
2957
-
2958
- print("\n" + "#" * 80)
2959
- print("### OPTIMIZED PIPELINE EXECUTION COMPLETE ###")
2960
- print("#" * 80)
2961
- return final_result
2962
 
2963
 
2964
 
 
146
 
147
 
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  # ============================================================================
151
  # --- CONFIGURATION AND CONSTANTS ---
 
586
 
587
 
588
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589
 
590
 
591
 
 
1115
 
1116
 
1117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1118
  def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
1119
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1120
 
 
1572
 
1573
 
1574
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1575
 
1576
  # ============================================================================
1577
  # --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
 
2059
 
2060
 
2061
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2062
def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
    """
    Wrap a raster image in a single-page, in-memory PyMuPDF document.

    The image is re-encoded as a one-page PDF so the fitz-based page helpers
    used for real PDFs can be applied to it unchanged.

    Args:
        image_path: Path to an image file that Pillow can open.

    Returns:
        A ``(doc, page)`` pair where ``page`` is ``doc[0]``. The caller owns
        ``doc`` and must close it when finished.
    """
    import io  # local import keeps this block self-contained

    # Pillow has no "pdf" *encoder*, so Image.tobytes("pdf") cannot work;
    # PDF output has to go through Image.save(..., format="PDF").
    buffer = io.BytesIO()
    with Image.open(image_path) as img:
        # Normalise to RGB so palette/alpha/16-bit inputs are always writable
        # by the PDF plugin.
        img.convert("RGB").save(buffer, format="PDF")

    # With a stream argument, the first parameter is the file-type hint.
    doc = fitz.open("pdf", buffer.getvalue())
    return doc, doc[0]
2073
 
2074
def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
    """
    Run the document-analysis pipeline on a PDF or on a single image.

    Stages, in order:
      1. Render every page at 2x and pass it to ``preprocess_and_ocr_page``
         together with the YOLO model.
      2. Concatenate the ``'blocks'`` of all pages and run LayoutLMv3 token
         classification on them.
      3. If the hierarchical classifier loads, tag the result with it.

    Args:
        input_path: Path to a PDF, or to an image with one of the extensions
            .jpg/.jpeg/.png/.bmp/.tiff/.webp (matched case-insensitively).
        layoutlmv3_model_path: Path to a torch checkpoint; either a bare
            state dict or a dict holding it under ``'model_state_dict'``.

    Returns:
        The structured result produced by the inference stage (optionally
        tagged), or ``None`` when no page yielded data or any stage inside
        the guarded section raised.
    """
    # 1. INITIALIZE YOLO
    yolo_model = YOLO(WEIGHTS_PATH)

    # 2. DETECT FILE TYPE
    # splitext returns (root, ext); only the extension is compared.
    ext = os.path.splitext(input_path)[1].lower()
    is_image = ext in ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp')

    all_pages_data = []
    pdf_name = os.path.basename(input_path)

    try:
        if is_image:
            print(f"📸 Image detected: {input_path}. Processing with YOLO + Tesseract.")
            # An image becomes a one-page document, so the page loop below
            # handles it as page index 0 exactly like a PDF page.
            doc = load_image_as_fitz_page(input_path)[0]
        else:
            doc = fitz.open(input_path)
            print(f"📄 Processing PDF: {pdf_name} ({len(doc)} pages)")

        try:
            for page_index, page in enumerate(doc):
                # Render at 2x scale for detection.
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                img_np = pixmap_to_numpy(pix)

                # NOTE(review): for images (no native text layer) this is
                # expected to fall back to Tesseract OCR - confirm in
                # preprocess_and_ocr_page.
                page_data, _ = preprocess_and_ocr_page(
                    img_np, yolo_model, input_path, page_index, page, pdf_name
                )
                if page_data:
                    all_pages_data.append(page_data)
        finally:
            # Release the document even when a page fails mid-way.
            doc.close()

        if not all_pages_data:
            print("❌ No data extracted.")
            return None

        # 3. CONSOLIDATE BLOCKS FOR INFERENCE
        sequential_blocks = [
            block
            for p_data in all_pages_data
            for block in p_data.get('blocks', [])
        ]

        # 4. LAYOUTLMV3 INFERENCE
        print("\n" + "=" * 80)
        print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
        print("=" * 80)

        tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # LayoutLMv3ForTokenClassification is expected to be defined
        # elsewhere in this script.
        model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
        checkpoint = torch.load(layoutlmv3_model_path, map_location=device)
        # Accept both a wrapped training checkpoint and a bare state dict.
        model.load_state_dict(checkpoint.get('model_state_dict', checkpoint))
        model.to(device)
        model.eval()

        final_result = run_layoutlmv3_inference_on_blocks(
            sequential_blocks, model, tokenizer, device
        )

        # 5. POST-PROCESS CLASSIFICATION
        classifier = HierarchicalClassifier()
        if classifier.load_models():
            final_result = post_process_json_with_inference(final_result, classifier)
            print("✅ Classification complete.")

        return final_result

    except Exception as e:
        # Top-level boundary: report the failure (with traceback, so the
        # failing stage is identifiable) and signal it to the caller as None.
        import traceback

        print(f"❌ FATAL ERROR in pipeline: {e}")
        traceback.print_exc()
        return None
2161
 
 
 
 
2162
 
 
 
 
 
 
2163
 
 
 
 
 
 
 
 
2164
 
 
2165
 
2166
 
2167
+ # #================================================================================
2168
+ # # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
2169
+ # #================================================================================
2170
 
2171
+ # print("\n" + "=" * 80)
2172
+ # print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
2173
+ # print("=" * 80)
2174
+
2175
+ # # 1. Initialize and Load the Classifier
2176
+ # classifier = HierarchicalClassifier()
2177
+ # if classifier.load_models():
2178
+ # # 2. Run Classification on the *Final* Result
2179
+ # # The function modifies the list in place and returns it
2180
+ # final_result = post_process_json_with_inference(
2181
+ # final_result, classifier
2182
+ # )
2183
+ # print("✅ Classification complete. Tags added to final output.")
2184
+ # else:
2185
+ # print("❌ Classification model loading failed. Outputting un-tagged data.")
2186
 
2187
+ # # ====================================================================
2188
 
2189
 
2190
+ # except Exception as e:
2191
+ # print(f"❌ FATAL ERROR: {e}")
2192
+ # import traceback
2193
+ # traceback.print_exc()
2194
+ # return None
2195
+
2196
+ # finally:
2197
+ # try:
2198
+ # for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
2199
+ # os.remove(f)
2200
+ # os.rmdir(temp_pipeline_dir)
2201
+ # except Exception:
2202
+ # pass
2203
 
2204
+ # print("\n" + "#" * 80)
2205
+ # print("### OPTIMIZED PIPELINE EXECUTION COMPLETE ###")
2206
+ # print("#" * 80)
2207
+ # return final_result
 
 
 
 
 
 
 
 
2208
 
2209
 
2210