heerjtdev committed on
Commit
b13058c
·
verified ·
1 Parent(s): 71693a6

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +3 -667
working_yolo_pipeline.py CHANGED
@@ -536,183 +536,6 @@ def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> Li
536
  return sorted(final_separators)
537
 
538
  #======================================================================================================================================
539
- # def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
540
- # top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
541
- # """Extract word data with OCR caching to avoid redundant Tesseract runs."""
542
- # word_data = page.get_text("words")
543
-
544
- # if len(word_data) > 0:
545
- # word_data = [(w[4], w[0], w[1], w[2], w[3]) for w in word_data]
546
- # else:
547
- # if _ocr_cache.has_ocr(pdf_path, page_num):
548
- # word_data = _ocr_cache.get_ocr(pdf_path, page_num)
549
- # else:
550
- # try:
551
- # # --- OPTIMIZATION START ---
552
- # # 1. Render at Higher Resolution (Zoom 4.0 = ~300 DPI)
553
- # zoom_level = 4.0
554
- # pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
555
-
556
- # # 2. Convert directly to OpenCV format (Faster than PIL)
557
- # img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
558
- # if pix.n == 3:
559
- # img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
560
- # elif pix.n == 4:
561
- # img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
562
-
563
- # # 3. Apply Preprocessing (Thresholding)
564
- # processed_img = preprocess_image_for_ocr(img_np)
565
-
566
- # # 4. Optimized Tesseract Config
567
- # # --psm 6: Assume a single uniform block of text (Great for columns/questions)
568
- # # --oem 3: Default engine (LSTM)
569
- # custom_config = r'--oem 3 --psm 6'
570
-
571
- # data = pytesseract.image_to_data(processed_img, output_type=pytesseract.Output.DICT,
572
- # config=custom_config)
573
-
574
- # full_word_data = []
575
- # for i in range(len(data['level'])):
576
- # text = data['text'][i].strip()
577
- # if text:
578
- # # Scale coordinates back to PDF points
579
- # x1 = data['left'][i] / zoom_level
580
- # y1 = data['top'][i] / zoom_level
581
- # x2 = (data['left'][i] + data['width'][i]) / zoom_level
582
- # y2 = (data['top'][i] + data['height'][i]) / zoom_level
583
- # full_word_data.append((text, x1, y1, x2, y2))
584
-
585
- # word_data = full_word_data
586
- # _ocr_cache.set_ocr(pdf_path, page_num, word_data)
587
- # # --- OPTIMIZATION END ---
588
- # except Exception as e:
589
- # print(f" ❌ OCR Error in detection phase: {e}")
590
- # return []
591
-
592
- # # Apply margin filtering
593
- # page_height = page.rect.height
594
- # y_min = page_height * top_margin_percent
595
- # y_max = page_height * (1 - bottom_margin_percent)
596
- # return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
597
-
598
- #============================================================================================================
599
-
600
-
601
- # def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
602
- # top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
603
- # word_data = page.get_text("words")
604
-
605
- # if len(word_data) > 0:
606
- # word_data = [(w[4], w[0], w[1], w[2], w[3]) for w in word_data]
607
- # else:
608
- # if _ocr_cache.has_ocr(pdf_path, page_num):
609
- # word_data = _ocr_cache.get_ocr(pdf_path, page_num)
610
- # else:
611
- # try:
612
- # # 1. Render at Higher Resolution
613
- # zoom_level = 4.0
614
- # pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
615
- # img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
616
-
617
- # # Convert to BGR for RapidOCR
618
- # if pix.n == 3: img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
619
- # elif pix.n == 4: img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
620
-
621
- # # 2. Run RapidOCR
622
- # # RapidOCR returns: [[box, text, score], ...]
623
- # # where box is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
624
- # results, _ = ocr_engine(img_np)
625
-
626
- # full_word_data = []
627
- # if results:
628
- # for box, text, score in results:
629
- # text = text.strip()
630
- # if text:
631
- # # 3. Convert Polygon to BBox and Scale back to PDF points
632
- # xs = [p[0] for p in box]
633
- # ys = [p[1] for p in box]
634
-
635
- # x1 = min(xs) / zoom_level
636
- # y1 = min(ys) / zoom_level
637
- # x2 = max(xs) / zoom_level
638
- # y2 = max(ys) / zoom_level
639
-
640
- # full_word_data.append((text, x1, y1, x2, y2))
641
-
642
- # word_data = full_word_data
643
- # _ocr_cache.set_ocr(pdf_path, page_num, word_data)
644
- # except Exception as e:
645
- # print(f" ❌ RapidOCR Error in detection phase: {e}")
646
- # return []
647
-
648
- # # Apply margin filtering
649
- # page_height = page.rect.height
650
- # y_min = page_height * top_margin_percent
651
- # y_max = page_height * (1 - bottom_margin_percent)
652
- # return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
653
-
654
-
655
- # def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
656
- # top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
657
- # word_data = page.get_text("words")
658
-
659
- # if len(word_data) > 0:
660
- # # Reformat standard PyMuPDF output to (text, x1, y1, x2, y2)
661
- # word_data = [(w[4], w[0], w[1], w[2], w[3]) for w in word_data]
662
- # else:
663
- # if _ocr_cache.has_ocr(pdf_path, page_num):
664
- # word_data = _ocr_cache.get_ocr(pdf_path, page_num)
665
- # else:
666
- # try:
667
- # # 1. Render at Higher Resolution
668
- # zoom_level = 4.0
669
- # pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
670
- # img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
671
-
672
- # # Convert to BGR for RapidOCR
673
- # if pix.n == 3:
674
- # img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
675
- # elif pix.n == 4:
676
- # img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
677
-
678
- # # 2. Run RapidOCR
679
- # ocr_out = ocr_engine(img_np)
680
-
681
- # full_word_data = []
682
-
683
- # # CRITICAL FIX: Use 'is not None' to avoid NumPy truthiness ambiguity
684
- # if ocr_out is not None and ocr_out.boxes is not None:
685
- # # Use zip to iterate through boxes, text, and scores simultaneously
686
- # for box, text, score in zip(ocr_out.boxes, ocr_out.txts, ocr_out.scores):
687
- # text = str(text).strip()
688
- # if text:
689
- # # 3. Convert Polygon to BBox and Scale back to PDF points
690
- # xs = [p[0] for p in box]
691
- # ys = [p[1] for p in box]
692
-
693
- # x1 = min(xs) / zoom_level
694
- # y1 = min(ys) / zoom_level
695
- # x2 = max(xs) / zoom_level
696
- # y2 = max(ys) / zoom_level
697
-
698
- # full_word_data.append((text, x1, y1, x2, y2))
699
-
700
- # word_data = full_word_data
701
- # _ocr_cache.set_ocr(pdf_path, page_num, word_data)
702
-
703
- # except Exception as e:
704
- # print(f" ❌ RapidOCR Error in detection phase: {e}")
705
- # return []
706
-
707
- # # Apply margin filtering
708
- # page_height = page.rect.height
709
- # y_min = page_height * top_margin_percent
710
- # y_max = page_height * (1 - bottom_margin_percent)
711
-
712
- # # Return filtered data where y-coordinates fall within the margins
713
- # return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
714
-
715
-
716
 
717
 
718
  def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
@@ -1129,19 +952,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1129
  merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1130
  print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1131
 
1132
-
1133
- # if results and results[0].boxes:
1134
- # for box in results[0].boxes:
1135
- # class_id = int(box.cls[0])
1136
- # class_name = model.names[class_id]
1137
- # if class_name in TARGET_CLASSES:
1138
- # x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
1139
- # relevant_detections.append(
1140
- # {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
1141
- # )
1142
-
1143
- # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1144
- # print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1145
 
1146
  # ====================================================================
1147
  # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
@@ -1252,108 +1063,10 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1252
  })
1253
  else:
1254
  # === START OF OPTIMIZED OCR BLOCK ===
1255
- # try:
1256
- # # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
1257
- # ocr_zoom = 4.0
1258
- # pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1259
-
1260
- # # Convert PyMuPDF Pixmap to OpenCV format
1261
- # img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1262
- # pix_ocr.n)
1263
- # if pix_ocr.n == 3:
1264
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1265
- # elif pix_ocr.n == 4:
1266
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1267
-
1268
- # # 2. Preprocess (Binarization)
1269
- # processed_img = preprocess_image_for_ocr(img_ocr_np)
1270
-
1271
- # # 3. Run Tesseract with Optimized Configuration
1272
- # custom_config = r'--oem 3 --psm 6'
1273
-
1274
- # hocr_data = pytesseract.image_to_data(
1275
- # processed_img,
1276
- # output_type=pytesseract.Output.DICT,
1277
- # config=custom_config
1278
- # )
1279
-
1280
- # for i in range(len(hocr_data['level'])):
1281
- # text = hocr_data['text'][i] # Retrieve raw Tesseract text
1282
-
1283
- # # --- FIX: SANITIZE TEXT AND THEN STRIP ---
1284
- # cleaned_text = sanitize_text(text).strip()
1285
-
1286
- # if cleaned_text and hocr_data['conf'][i] > -1:
1287
- # # 4. Coordinate Mapping
1288
- # scale_adjustment = scale_factor / ocr_zoom
1289
-
1290
- # x1 = int(hocr_data['left'][i] * scale_adjustment)
1291
- # y1 = int(hocr_data['top'][i] * scale_adjustment)
1292
- # w = int(hocr_data['width'][i] * scale_adjustment)
1293
- # h = int(hocr_data['height'][i] * scale_adjustment)
1294
- # x2 = x1 + w
1295
- # y2 = y1 + h
1296
-
1297
- # raw_ocr_output.append({
1298
- # 'type': 'text',
1299
- # 'word': cleaned_text, # Use the sanitized word
1300
- # 'confidence': float(hocr_data['conf'][i]),
1301
- # 'bbox': [x1, y1, x2, y2],
1302
- # 'y0': y1,
1303
- # 'x0': x1
1304
- # })
1305
- # except Exception as e:
1306
-
1307
- # print(f" ❌ Tesseract OCR Error: {e}")
1308
  #=============================================================================================================================================================
1309
  #=============================================================================================================================================================
1310
- # else:
1311
- # # === START OF RAPIDOCR BLOCK ===
1312
- # try:
1313
- # # 1. Re-render Page at High Resolution (Standardizing to Zoom 4.0)
1314
- # ocr_zoom = 4.0
1315
- # pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1316
-
1317
- # # Convert PyMuPDF Pixmap to OpenCV format (BGR)
1318
- # img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(
1319
- # pix_ocr.height, pix_ocr.width, pix_ocr.n
1320
- # )
1321
- # if pix_ocr.n == 3:
1322
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1323
- # elif pix_ocr.n == 4:
1324
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1325
-
1326
- # # 2. Run RapidOCR (Models handle preprocessing internally)
1327
- # results, _ = ocr_engine(img_ocr_np)
1328
-
1329
- # if results:
1330
- # # Calculate scaling from OCR image (4.0) to your pipeline standard (scale_factor=2.0)
1331
- # scale_adjustment = scale_factor / ocr_zoom
1332
-
1333
- # for box, text, score in results:
1334
- # # Sanitize and clean text
1335
- # cleaned_text = sanitize_text(text).strip()
1336
-
1337
- # if cleaned_text:
1338
- # # 3. Coordinate Mapping (Convert 4-point polygon to x1, y1, x2, y2)
1339
- # xs = [p[0] for p in box]
1340
- # ys = [p[1] for p in box]
1341
-
1342
- # x1 = int(min(xs) * scale_adjustment)
1343
- # y1 = int(min(ys) * scale_adjustment)
1344
- # x2 = int(max(xs) * scale_adjustment)
1345
- # y2 = int(max(ys) * scale_adjustment)
1346
-
1347
- # raw_ocr_output.append({
1348
- # 'type': 'text',
1349
- # 'word': cleaned_text,
1350
- # 'confidence': float(score) * 100, # Converting 0-1.0 to 0-100 scale
1351
- # 'bbox': [x1, y1, x2, y2],
1352
- # 'y0': y1,
1353
- # 'x0': x1
1354
- # })
1355
- # except Exception as e:
1356
- # print(f" ❌ RapidOCR Fallback Error: {e}")
1357
  try:
1358
  # 1. Re-render Page at High Resolution (Standardizing to Zoom 4.0)
1359
  ocr_zoom = 4.0
@@ -1926,163 +1639,6 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
1926
  # ============================================================================
1927
 
1928
 
1929
- # def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
1930
- # print("\n" + "=" * 80)
1931
- # print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
1932
- # print("=" * 80)
1933
- # try:
1934
- # with open(input_path, 'r', encoding='utf-8') as f:
1935
- # predictions_by_page = json.load(f)
1936
- # except Exception as e:
1937
- # print(f"❌ Error loading raw prediction file: {e}")
1938
- # return None
1939
-
1940
- # predictions = []
1941
- # for page_item in predictions_by_page:
1942
- # if isinstance(page_item, dict) and 'data' in page_item:
1943
- # predictions.extend(page_item['data'])
1944
-
1945
- # structured_data = []
1946
- # current_item = None
1947
- # current_option_key = None
1948
- # current_passage_buffer = []
1949
- # current_text_buffer = []
1950
- # first_question_started = False
1951
- # last_entity_type = None
1952
- # just_finished_i_option = False
1953
- # is_in_new_passage = False
1954
-
1955
- # def finalize_passage_to_item(item, passage_buffer):
1956
- # if passage_buffer:
1957
- # passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
1958
- # if item.get('passage'):
1959
- # item['passage'] += ' ' + passage_text
1960
- # else:
1961
- # item['passage'] = passage_text
1962
- # passage_buffer.clear()
1963
-
1964
- # for item in predictions:
1965
- # word = item['word']
1966
- # label = item['predicted_label']
1967
- # entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
1968
- # current_text_buffer.append(word)
1969
- # previous_entity_type = last_entity_type
1970
- # is_passage_label = (entity_type == 'PASSAGE')
1971
-
1972
- # if not first_question_started:
1973
- # if label != 'B-QUESTION' and not is_passage_label:
1974
- # just_finished_i_option = False
1975
- # is_in_new_passage = False
1976
- # continue
1977
- # if is_passage_label:
1978
- # current_passage_buffer.append(word)
1979
- # last_entity_type = 'PASSAGE'
1980
- # just_finished_i_option = False
1981
- # is_in_new_passage = False
1982
- # continue
1983
-
1984
- # if label == 'B-QUESTION':
1985
- # if not first_question_started:
1986
- # header_text = ' '.join(current_text_buffer[:-1]).strip()
1987
- # if header_text or current_passage_buffer:
1988
- # metadata_item = {'type': 'METADATA', 'passage': ''}
1989
- # finalize_passage_to_item(metadata_item, current_passage_buffer)
1990
- # if header_text: metadata_item['text'] = header_text
1991
- # structured_data.append(metadata_item)
1992
- # first_question_started = True
1993
- # current_text_buffer = [word]
1994
-
1995
- # if current_item is not None:
1996
- # finalize_passage_to_item(current_item, current_passage_buffer)
1997
- # current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
1998
- # structured_data.append(current_item)
1999
- # current_text_buffer = [word]
2000
-
2001
- # current_item = {
2002
- # 'question': word, 'options': {}, 'answer': '', 'passage': '', 'text': ''
2003
- # }
2004
- # current_option_key = None
2005
- # last_entity_type = 'QUESTION'
2006
- # just_finished_i_option = False
2007
- # is_in_new_passage = False
2008
- # continue
2009
-
2010
- # if current_item is not None:
2011
- # if is_in_new_passage:
2012
- # # 🔑 Robust Initialization and Appending for 'new_passage'
2013
- # if 'new_passage' not in current_item:
2014
- # current_item['new_passage'] = word
2015
- # else:
2016
- # current_item['new_passage'] += f' {word}'
2017
-
2018
- # if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
2019
- # is_in_new_passage = False
2020
- # if label.startswith(('B-', 'I-')): last_entity_type = entity_type
2021
- # continue
2022
- # is_in_new_passage = False
2023
-
2024
- # if label.startswith('B-'):
2025
- # if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
2026
- # finalize_passage_to_item(current_item, current_passage_buffer)
2027
- # current_passage_buffer = []
2028
- # last_entity_type = entity_type
2029
- # if entity_type == 'PASSAGE':
2030
- # if previous_entity_type == 'OPTION' and just_finished_i_option:
2031
- # current_item['new_passage'] = word # Initialize the new passage start
2032
- # is_in_new_passage = True
2033
- # else:
2034
- # current_passage_buffer.append(word)
2035
- # elif entity_type == 'OPTION':
2036
- # current_option_key = word
2037
- # current_item['options'][current_option_key] = word
2038
- # just_finished_i_option = False
2039
- # elif entity_type == 'ANSWER':
2040
- # current_item['answer'] = word
2041
- # current_option_key = None
2042
- # just_finished_i_option = False
2043
- # elif entity_type == 'QUESTION':
2044
- # current_item['question'] += f' {word}'
2045
- # just_finished_i_option = False
2046
-
2047
- # elif label.startswith('I-'):
2048
- # if entity_type == 'QUESTION':
2049
- # current_item['question'] += f' {word}'
2050
- # elif entity_type == 'PASSAGE':
2051
- # if previous_entity_type == 'OPTION' and just_finished_i_option:
2052
- # current_item['new_passage'] = word # Initialize the new passage start
2053
- # is_in_new_passage = True
2054
- # else:
2055
- # if not current_passage_buffer: last_entity_type = 'PASSAGE'
2056
- # current_passage_buffer.append(word)
2057
- # elif entity_type == 'OPTION' and current_option_key is not None:
2058
- # current_item['options'][current_option_key] += f' {word}'
2059
- # just_finished_i_option = True
2060
- # elif entity_type == 'ANSWER':
2061
- # current_item['answer'] += f' {word}'
2062
- # just_finished_i_option = (entity_type == 'OPTION')
2063
-
2064
- # elif label == 'O':
2065
- # if last_entity_type == 'QUESTION':
2066
- # current_item['question'] += f' {word}'
2067
- # just_finished_i_option = False
2068
-
2069
- # if current_item is not None:
2070
- # finalize_passage_to_item(current_item, current_passage_buffer)
2071
- # current_item['text'] = ' '.join(current_text_buffer).strip()
2072
- # structured_data.append(current_item)
2073
-
2074
- # for item in structured_data:
2075
- # item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
2076
- # if 'new_passage' in item:
2077
- # item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
2078
-
2079
- # try:
2080
- # with open(output_path, 'w', encoding='utf-8') as f:
2081
- # json.dump(structured_data, f, indent=2, ensure_ascii=False)
2082
- # except Exception:
2083
- # pass
2084
-
2085
- # return structured_data
2086
 
2087
 
2088
 
@@ -2600,104 +2156,9 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
2600
 
2601
 
2602
 
2603
- # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
2604
- # List[Dict[str, Any]]]:
2605
- # def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2606
- # if not os.path.exists(input_pdf_path): return None
2607
-
2608
- # print("\n" + "#" * 80)
2609
- # print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
2610
- # print("#" * 80)
2611
-
2612
- # pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
2613
- # temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
2614
- # os.makedirs(temp_pipeline_dir, exist_ok=True)
2615
-
2616
- # preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
2617
- # raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
2618
- # structured_intermediate_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json")
2619
-
2620
- # final_result = None
2621
- # try:
2622
- # # Phase 1: Preprocessing with YOLO First + Masking
2623
- # preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
2624
- # if not preprocessed_json_path_out: return None
2625
-
2626
- # # Phase 2: Inference
2627
- # page_raw_predictions_list = run_inference_and_get_raw_words(
2628
- # input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
2629
- # )
2630
- # if not page_raw_predictions_list: return None
2631
-
2632
- # # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
2633
- # # Save raw predictions to the temporary file
2634
- # with open(raw_output_path, 'w', encoding='utf-8') as f:
2635
- # json.dump(page_raw_predictions_list, f, indent=4)
2636
-
2637
- # # Explicitly copy/save the raw predictions to the user-specified debug path
2638
- # # if raw_predictions_output_path:
2639
- # # shutil.copy(raw_output_path, raw_predictions_output_path)
2640
- # # print(f"\n✅ DEBUG: Raw predictions saved to: {raw_predictions_output_path}")
2641
- # # ----------------------------------------
2642
-
2643
- # # Phase 3: Decoding
2644
- # structured_data_list = convert_bio_to_structured_json_relaxed(
2645
- # raw_output_path, structured_intermediate_output_path
2646
- # )
2647
- # if not structured_data_list: return None
2648
-
2649
-
2650
- # structured_data_list = correct_misaligned_options(structured_data_list)
2651
- # structured_data_list = process_context_linking(structured_data_list)
2652
-
2653
-
2654
- # # Phase 4: Embedding / Equation to LaTeX Conversion
2655
- # final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
2656
-
2657
 
2658
 
2659
 
2660
- # #================================================================================
2661
- # # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
2662
- # #================================================================================
2663
-
2664
- # print("\n" + "=" * 80)
2665
- # print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
2666
- # print("=" * 80)
2667
-
2668
- # # 1. Initialize and Load the Classifier
2669
- # classifier = HierarchicalClassifier()
2670
- # if classifier.load_models():
2671
- # # 2. Run Classification on the *Final* Result
2672
- # # The function modifies the list in place and returns it
2673
- # final_result = post_process_json_with_inference(
2674
- # final_result, classifier
2675
- # )
2676
- # print("✅ Classification complete. Tags added to final output.")
2677
- # else:
2678
- # print("❌ Classification model loading failed. Outputting un-tagged data.")
2679
-
2680
- # # ====================================================================
2681
-
2682
-
2683
- # except Exception as e:
2684
- # print(f"❌ FATAL ERROR: {e}")
2685
- # import traceback
2686
- # traceback.print_exc()
2687
- # return None
2688
-
2689
- # finally:
2690
- # try:
2691
- # for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
2692
- # os.remove(f)
2693
- # os.rmdir(temp_pipeline_dir)
2694
- # except Exception:
2695
- # pass
2696
-
2697
- # print("\n" + "#" * 80)
2698
- # print("### OPTIMIZED PIPELINE EXECUTION COMPLETE ###")
2699
- # print("#" * 80)
2700
- # return final_result
2701
 
2702
 
2703
 
@@ -2783,131 +2244,6 @@ import time
2783
  import traceback
2784
  import glob
2785
 
2786
- # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2787
- # if not os.path.exists(input_pdf_path):
2788
- # print(f"❌ ERROR: File not found: {input_pdf_path}")
2789
- # return None
2790
-
2791
- # print("\n" + "#" * 80)
2792
- # print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
2793
- # print(f"Input: {input_pdf_path}")
2794
- # print("#" * 80)
2795
-
2796
- # overall_start = time.time()
2797
- # pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
2798
- # temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
2799
- # os.makedirs(temp_pipeline_dir, exist_ok=True)
2800
-
2801
- # preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
2802
- # raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
2803
-
2804
-
2805
- # # If the user didn't provide a path, create one in the temp directory
2806
- # if structured_intermediate_output_path is None:
2807
- # structured_intermediate_output_path = os.path.join(
2808
- # temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json"
2809
- # )
2810
-
2811
-
2812
-
2813
- # final_result = None
2814
- # try:
2815
- # # --- Phase 1: Preprocessing ---
2816
- # print(f"\n[Step 1/5] Preprocessing (YOLO + Masking)...")
2817
- # p1_start = time.time()
2818
- # preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
2819
- # if not preprocessed_json_path_out:
2820
- # print("❌ FAILED at Step 1: Preprocessing returned None.")
2821
- # return None
2822
- # print(f"✅ Step 1 Complete ({time.time() - p1_start:.2f}s)")
2823
-
2824
- # # --- Phase 2: Inference ---
2825
- # print(f"\n[Step 2/5] Inference (LayoutLMv3)...")
2826
- # p2_start = time.time()
2827
- # page_raw_predictions_list = run_inference_and_get_raw_words(
2828
- # input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
2829
- # )
2830
- # if not page_raw_predictions_list:
2831
- # print("❌ FAILED at Step 2: Inference returned no data.")
2832
- # return None
2833
-
2834
- # # Save raw predictions for Step 3
2835
- # with open(raw_output_path, 'w', encoding='utf-8') as f:
2836
- # json.dump(page_raw_predictions_list, f, indent=4)
2837
- # print(f"✅ Step 2 Complete ({time.time() - p2_start:.2f}s)")
2838
-
2839
- # # --- Phase 3: Decoding ---
2840
- # print(f"\n[Step 3/5] Decoding (BIO to Structured JSON)...")
2841
- # p3_start = time.time()
2842
- # structured_data_list = convert_bio_to_structured_json_relaxed(
2843
- # raw_output_path, structured_intermediate_output_path
2844
- # )
2845
- # if not structured_data_list:
2846
- # print("❌ FAILED at Step 3: BIO conversion failed.")
2847
- # return None
2848
-
2849
- # # Logic adjustments
2850
- # print("... Correcting misalignments and linking context ...")
2851
- # structured_data_list = correct_misaligned_options(structured_data_list)
2852
- # structured_data_list = process_context_linking(structured_data_list)
2853
- # print(f"✅ Step 3 Complete ({time.time() - p3_start:.2f}s)")
2854
-
2855
- # # --- Phase 4: Base64 & LaTeX ---
2856
- # print(f"\n[Step 4/5] Finalizing Layout (Base64 Images & LaTeX)...")
2857
- # p4_start = time.time()
2858
- # final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
2859
- # if not final_result:
2860
- # print("❌ FAILED at Step 4: Final formatting failed.")
2861
- # return None
2862
- # print(f"✅ Step 4 Complete ({time.time() - p4_start:.2f}s)")
2863
-
2864
-
2865
-
2866
- # # --- ADD THIS NEW STEP HERE ---
2867
- # print(f"\n[Step 4.5/5] Adding Question Type Classification...")
2868
- # p4_5_start = time.time()
2869
- # final_result = add_question_type_validation(final_result)
2870
- # print(f"✅ Step 4.5 Complete ({time.time() - p4_5_start:.2f}s)")
2871
-
2872
-
2873
-
2874
- # # --- END OF NEW STEP ---
2875
-
2876
-
2877
-
2878
-
2879
- # # --- Phase 5: Hierarchical Tagging ---
2880
- # print(f"\n[Step 5/5] AI Classification (Subject/Concept Tagging)...")
2881
- # p5_start = time.time()
2882
- # classifier = HierarchicalClassifier()
2883
- # if classifier.load_models():
2884
- # final_result = post_process_json_with_inference(final_result, classifier)
2885
- # print(f"✅ Step 5 Complete: Tags added ({time.time() - p5_start:.2f}s)")
2886
- # else:
2887
- # print("⚠️ WARNING: Classifier models failed to load. Skipping tags.")
2888
-
2889
- # except Exception as e:
2890
- # print(f"\n‼️ FATAL PIPELINE EXCEPTION:")
2891
- # print(f"Error Message: {str(e)}")
2892
- # traceback.print_exc()
2893
- # return None
2894
-
2895
- # finally:
2896
- # print(f"\nCleaning up temporary files in {temp_pipeline_dir}...")
2897
- # try:
2898
- # for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
2899
- # os.remove(f)
2900
- # os.rmdir(temp_pipeline_dir)
2901
- # print("🧹 Cleanup successful.")
2902
- # except Exception as e:
2903
- # print(f"⚠️ Cleanup failed: {e}")
2904
-
2905
- # total_time = time.time() - overall_start
2906
- # print("\n" + "#" * 80)
2907
- # print(f"### PIPELINE COMPLETE | Total Time: {total_time:.2f}s ###")
2908
- # print("#" * 80)
2909
-
2910
- # return final_result
2911
 
2912
 
2913
 
 
536
  return sorted(final_separators)
537
 
538
  #======================================================================================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
 
540
 
541
  def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
 
952
  merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
953
  print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
954
 
955
+
 
 
 
 
 
 
 
 
 
 
 
 
956
 
957
  # ====================================================================
958
  # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
 
1063
  })
1064
  else:
1065
  # === START OF OPTIMIZED OCR BLOCK ===
1066
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1067
  #=============================================================================================================================================================
1068
  #=============================================================================================================================================================
1069
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1070
  try:
1071
  # 1. Re-render Page at High Resolution (Standardizing to Zoom 4.0)
1072
  ocr_zoom = 4.0
 
1639
  # ============================================================================
1640
 
1641
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1642
 
1643
 
1644
 
 
2156
 
2157
 
2158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2159
 
2160
 
2161
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2162
 
2163
 
2164
 
 
2244
  import traceback
2245
  import glob
2246
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2247
 
2248
 
2249