heerjtdev committed on
Commit
b8ab755
·
verified ·
1 Parent(s): 50d1df6

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +0 -674
working_yolo_pipeline.py CHANGED
@@ -91,51 +91,6 @@ def sanitize_text(text: Optional[str]) -> str:
91
 
92
 
93
 
94
- # def get_latex_from_base64(base64_string: str) -> str:
95
- # """
96
- # Decodes a Base64 image string and uses the pre-initialized TrOCR/ORT model
97
- # to recognize the formula. It cleans the output by removing spaces and
98
- # crucially, replacing double backslashes with single backslashes for correct LaTeX.
99
- # """
100
- # if ort_model is None or processor is None:
101
- # return "[MODEL_ERROR: Model not initialized]"
102
-
103
- # try:
104
- # # 1. Decode Base64 to Image
105
- # image_data = base64.b64decode(base64_string)
106
- # # We must ensure the image is RGB format for the model input
107
- # image = Image.open(io.BytesIO(image_data)).convert('RGB')
108
-
109
- # # 2. Preprocess the image
110
- # pixel_values = processor(images=image, return_tensors="pt").pixel_values
111
-
112
- # # 3. Text Generation (OCR)
113
- # generated_ids = ort_model.generate(pixel_values)
114
- # raw_generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
115
-
116
- # if not raw_generated_text:
117
- # return "[OCR_WARNING: No formula found]"
118
-
119
- # latex_string = raw_generated_text[0]
120
-
121
- # # --- 4. Post-processing and Cleanup ---
122
-
123
- # # A. Remove all spaces/line breaks
124
- # cleaned_latex = re.sub(r'\s+', '', latex_string)
125
-
126
- # # B. CRITICAL FIX: Replace double backslashes with single backslashes.
127
-
128
- # return cleaned_latex
129
-
130
-
131
- # except Exception as e:
132
- # # Catch any unexpected errors
133
- # print(f" ❌ TR-OCR Recognition failed: {e}")
134
- # return f"[TR_OCR_ERROR: Recognition failed: {e}]"
135
-
136
-
137
-
138
-
139
 
140
  def get_latex_from_base64(base64_string: str) -> str:
141
  """
@@ -622,26 +577,6 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
622
  return img
623
 
624
 
625
- # def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
626
- # raw_word_data = fitz_page.get_text("words")
627
- # converted_ocr_output = []
628
- # DEFAULT_CONFIDENCE = 99.0
629
-
630
- # for x1, y1, x2, y2, word, *rest in raw_word_data:
631
- # if not word.strip(): continue
632
- # x1_pix = int(x1 * scale_factor)
633
- # y1_pix = int(y1 * scale_factor)
634
- # x2_pix = int(x2 * scale_factor)
635
- # y2_pix = int(y2 * scale_factor)
636
- # converted_ocr_output.append({
637
- # 'type': 'text',
638
- # 'word': word,
639
- # 'confidence': DEFAULT_CONFIDENCE,
640
- # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
641
- # 'y0': y1_pix, 'x0': x1_pix
642
- # })
643
- # return converted_ocr_output
644
-
645
 
646
 
647
  def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
@@ -672,265 +607,6 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
672
 
673
 
674
 
675
- # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
676
- # page_num: int, fitz_page: fitz.Page,
677
- # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
678
- # """
679
- # OPTIMIZED FLOW:
680
- # 1. Run YOLO to find Equations/Tables.
681
- # 2. Mask raw text with YOLO boxes.
682
- # 3. Run Column Detection on the MASKED data.
683
- # 4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
684
- # """
685
- # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
686
-
687
- # start_time_total = time.time()
688
-
689
- # if original_img is None:
690
- # print(f" ❌ Invalid image for page {page_num}.")
691
- # return None, None
692
-
693
- # # ====================================================================
694
- # # --- STEP 1: YOLO DETECTION ---
695
- # # ====================================================================
696
- # start_time_yolo = time.time()
697
- # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
698
-
699
- # relevant_detections = []
700
- # if results and results[0].boxes:
701
- # for box in results[0].boxes:
702
- # class_id = int(box.cls[0])
703
- # class_name = model.names[class_id]
704
- # if class_name in TARGET_CLASSES:
705
- # x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
706
- # relevant_detections.append(
707
- # {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
708
- # )
709
-
710
- # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
711
- # print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
712
-
713
- # # ====================================================================
714
- # # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
715
- # # ====================================================================
716
- # # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
717
- # raw_words_for_layout = get_word_data_for_detection(
718
- # fitz_page, pdf_path, page_num,
719
- # top_margin_percent=0.10, bottom_margin_percent=0.10
720
- # )
721
-
722
- # masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
723
-
724
- # # ====================================================================
725
- # # --- STEP 3: COLUMN DETECTION ---
726
- # # ====================================================================
727
- # page_width_pdf = fitz_page.rect.width
728
- # page_height_pdf = fitz_page.rect.height
729
-
730
- # column_detection_params = {
731
- # 'cluster_bin_size': 2, 'cluster_smoothing': 2,
732
- # 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
733
- # }
734
-
735
- # separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
736
-
737
- # page_separator_x = None
738
- # if separators:
739
- # central_min = page_width_pdf * 0.35
740
- # central_max = page_width_pdf * 0.65
741
- # central_separators = [s for s in separators if central_min <= s <= central_max]
742
-
743
- # if central_separators:
744
- # center_x = page_width_pdf / 2
745
- # page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
746
- # print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
747
- # else:
748
- # print(" ⚠️ Gutter found off-center. Ignoring.")
749
- # else:
750
- # print(" -> Single Column Layout Confirmed.")
751
-
752
- # # ====================================================================
753
- # # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
754
- # # ====================================================================
755
- # start_time_components = time.time()
756
- # component_metadata = []
757
- # fig_count_page = 0
758
- # eq_count_page = 0
759
-
760
- # for detection in merged_detections:
761
- # x1, y1, x2, y2 = detection['coords']
762
- # class_name = detection['class']
763
-
764
- # if class_name == 'figure':
765
- # GLOBAL_FIGURE_COUNT += 1
766
- # counter = GLOBAL_FIGURE_COUNT
767
- # component_word = f"FIGURE{counter}"
768
- # fig_count_page += 1
769
- # elif class_name == 'equation':
770
- # GLOBAL_EQUATION_COUNT += 1
771
- # counter = GLOBAL_EQUATION_COUNT
772
- # component_word = f"EQUATION{counter}"
773
- # eq_count_page += 1
774
- # else:
775
- # continue
776
-
777
- # component_crop = original_img[y1:y2, x1:x2]
778
- # component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
779
- # cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
780
-
781
- # y_midpoint = (y1 + y2) // 2
782
- # component_metadata.append({
783
- # 'type': class_name, 'word': component_word,
784
- # 'bbox': [int(x1), int(y1), int(x2), int(y2)],
785
- # 'y0': int(y_midpoint), 'x0': int(x1)
786
- # })
787
-
788
- # # ====================================================================
789
- # # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
790
- # # ====================================================================
791
- # raw_ocr_output = []
792
- # scale_factor = 2.0 # Pipeline standard scale
793
-
794
- # try:
795
- # # Try getting native text first
796
- # raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
797
- # except Exception as e:
798
- # print(f" ❌ Native text extraction failed: {e}")
799
-
800
- # # If native text is missing, fall back to OCR
801
- # if not raw_ocr_output:
802
- # if _ocr_cache.has_ocr(pdf_path, page_num):
803
- # print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
804
- # cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
805
- # for word_tuple in cached_word_data:
806
- # word_text, x1, y1, x2, y2 = word_tuple
807
-
808
- # # Scale from PDF points to Pipeline Pixels (2.0)
809
- # x1_pix = int(x1 * scale_factor)
810
- # y1_pix = int(y1 * scale_factor)
811
- # x2_pix = int(x2 * scale_factor)
812
- # y2_pix = int(y2 * scale_factor)
813
-
814
- # raw_ocr_output.append({
815
- # 'type': 'text', 'word': word_text, 'confidence': 95.0,
816
- # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
817
- # 'y0': y1_pix, 'x0': x1_pix
818
- # })
819
- # else:
820
- # # === START OF OPTIMIZED OCR BLOCK ===
821
- # try:
822
- # # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
823
- # # We do this specifically for OCR accuracy, separate from the pipeline image
824
- # ocr_zoom = 4.0
825
- # pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
826
-
827
- # # Convert PyMuPDF Pixmap to OpenCV format
828
- # img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
829
- # pix_ocr.n)
830
- # if pix_ocr.n == 3:
831
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
832
- # elif pix_ocr.n == 4:
833
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
834
-
835
- # # 2. Preprocess (Binarization)
836
- # # Ensure 'preprocess_image_for_ocr' is defined at top of file!
837
- # processed_img = preprocess_image_for_ocr(img_ocr_np)
838
-
839
- # # 3. Run Tesseract with Optimized Configuration
840
- # # --oem 3: Default LSTM engine
841
- # # --psm 6: Assume a single uniform block of text (Critical for lists/questions)
842
- # custom_config = r'--oem 3 --psm 6'
843
-
844
- # hocr_data = pytesseract.image_to_data(
845
- # processed_img,
846
- # output_type=pytesseract.Output.DICT,
847
- # config=custom_config
848
- # )
849
-
850
- # for i in range(len(hocr_data['level'])):
851
- # text = hocr_data['text'][i].strip()
852
- # if text and hocr_data['conf'][i] > -1:
853
- # # 4. Coordinate Mapping
854
- # # We scanned at Zoom 4.0, but our pipeline expects Zoom 2.0.
855
- # # Scale Factor = (Target 2.0) / (Source 4.0) = 0.5
856
- # scale_adjustment = scale_factor / ocr_zoom
857
-
858
- # x1 = int(hocr_data['left'][i] * scale_adjustment)
859
- # y1 = int(hocr_data['top'][i] * scale_adjustment)
860
- # w = int(hocr_data['width'][i] * scale_adjustment)
861
- # h = int(hocr_data['height'][i] * scale_adjustment)
862
- # x2 = x1 + w
863
- # y2 = y1 + h
864
-
865
- # raw_ocr_output.append({
866
- # 'type': 'text',
867
- # 'word': text,
868
- # 'confidence': float(hocr_data['conf'][i]),
869
- # 'bbox': [x1, y1, x2, y2],
870
- # 'y0': y1,
871
- # 'x0': x1
872
- # })
873
- # except Exception as e:
874
- # print(f" ❌ Tesseract OCR Error: {e}")
875
- # # === END OF OPTIMIZED OCR BLOCK ===
876
-
877
- # # ====================================================================
878
- # # --- STEP 6: OCR CLEANING AND MERGING ---
879
- # # ====================================================================
880
- # items_to_sort = []
881
-
882
- # for ocr_word in raw_ocr_output:
883
- # is_suppressed = False
884
- # for component in component_metadata:
885
- # # Do not include words that are inside figure/equation boxes
886
- # ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
887
- # if ioa > IOA_SUPPRESSION_THRESHOLD:
888
- # is_suppressed = True
889
- # break
890
- # if not is_suppressed:
891
- # items_to_sort.append(ocr_word)
892
-
893
- # # Add figures/equations back into the flow as "words"
894
- # items_to_sort.extend(component_metadata)
895
-
896
- # # ====================================================================
897
- # # --- STEP 7: LINE-BASED SORTING ---
898
- # # ====================================================================
899
- # items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
900
- # lines = []
901
-
902
- # for item in items_to_sort:
903
- # placed = False
904
- # for line in lines:
905
- # y_ref = min(it['y0'] for it in line)
906
- # if abs(y_ref - item['y0']) < LINE_TOLERANCE:
907
- # line.append(item)
908
- # placed = True
909
- # break
910
- # if not placed and item['type'] in ['equation', 'figure']:
911
- # for line in lines:
912
- # y_ref = min(it['y0'] for it in line)
913
- # if abs(y_ref - item['y0']) < 20:
914
- # line.append(item)
915
- # placed = True
916
- # break
917
- # if not placed:
918
- # lines.append([item])
919
-
920
- # for line in lines:
921
- # line.sort(key=lambda x: x['x0'])
922
-
923
- # final_output = []
924
- # for line in lines:
925
- # for item in line:
926
- # data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
927
- # if 'tag' in item: data_item['tag'] = item['tag']
928
- # final_output.append(data_item)
929
-
930
- # return final_output, page_separator_x
931
-
932
-
933
-
934
 
935
 
936
 
@@ -1648,15 +1324,6 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
1648
 
1649
 
1650
 
1651
-
1652
-
1653
-
1654
-
1655
-
1656
-
1657
-
1658
-
1659
-
1660
  # ============================================================================
1661
  # --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
1662
  # ============================================================================
@@ -1863,133 +1530,6 @@ def calculate_similarity(doc1: str, doc2: str) -> float:
1863
  return 0.0
1864
 
1865
 
1866
- # def process_context_linking(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1867
- # """
1868
- # Links questions to passages based on 'passage' flow vs 'new_passage' priority.
1869
- # Includes 'Decay Logic': If 2 consecutive questions fail to match the active passage,
1870
- # the passage context is dropped to prevent false positives downstream.
1871
- # """
1872
- # print("\n" + "=" * 80)
1873
- # print("--- STARTING CONTEXT LINKING (WITH DECAY LOGIC) ---")
1874
- # print("=" * 80)
1875
-
1876
- # if not data: return []
1877
-
1878
- # # --- PHASE 1: IDENTIFY PASSAGE DEFINERS ---
1879
- # passage_definer_indices = []
1880
- # for i, entry in enumerate(data):
1881
- # if entry.get("passage") and entry["passage"].strip():
1882
- # passage_definer_indices.append(i)
1883
- # if entry.get("new_passage") and entry["new_passage"].strip():
1884
- # if i not in passage_definer_indices:
1885
- # passage_definer_indices.append(i)
1886
-
1887
- # # --- PHASE 2: CONTEXT TRANSFER & LINKING ---
1888
- # current_passage_text = None
1889
- # current_new_passage_text = None
1890
-
1891
- # # NEW: Counter to track consecutive linking failures
1892
- # consecutive_failures = 0
1893
- # MAX_CONSECUTIVE_FAILURES = 2
1894
-
1895
- # for i, entry in enumerate(data):
1896
- # item_type = entry.get("type", "Question")
1897
-
1898
- # # A. UNCONDITIONALLY UPDATE CONTEXTS (And Reset Decay Counter)
1899
- # if entry.get("passage") and entry["passage"].strip():
1900
- # current_passage_text = entry["passage"]
1901
- # consecutive_failures = 0 # Reset because we have fresh explicit context
1902
- # # print(f" [Flow] Updated Standard Context from Item {i}")
1903
-
1904
- # if entry.get("new_passage") and entry["new_passage"].strip():
1905
- # current_new_passage_text = entry["new_passage"]
1906
- # # We don't necessarily reset standard failures here as this is a local override
1907
-
1908
- # # B. QUESTION LINKING
1909
- # if entry.get("question") and item_type != "METADATA":
1910
- # combined_query = create_query_text(entry)
1911
-
1912
- # # Skip if query is too short (noise)
1913
- # if len(combined_query.strip()) < 5:
1914
- # continue
1915
-
1916
- # # Calculate scores
1917
- # score_old = calculate_similarity(current_passage_text, combined_query) if current_passage_text else 0.0
1918
- # score_new = calculate_similarity(current_new_passage_text,
1919
- # combined_query) if current_new_passage_text else 0.0
1920
-
1921
- # q_preview = entry['question'][:30] + '...'
1922
-
1923
- # # RESOLUTION LOGIC
1924
- # linked = False
1925
-
1926
- # # 1. Prefer New Passage if significantly better
1927
- # if current_new_passage_text and (score_new > score_old + RESOLUTION_MARGIN) and (
1928
- # score_new >= SIMILARITY_THRESHOLD):
1929
- # entry["passage"] = current_new_passage_text
1930
- # print(f" [Linker] 🚀 Q{i} ('{q_preview}') -> NEW PASSAGE (Score: {score_new:.3f})")
1931
- # linked = True
1932
- # # Note: We do not reset 'consecutive_failures' for the standard passage here,
1933
- # # because we matched the *new* passage, not the standard one.
1934
-
1935
- # # 2. Otherwise use Standard Passage if it meets threshold
1936
- # elif current_passage_text and (score_old >= SIMILARITY_THRESHOLD):
1937
- # entry["passage"] = current_passage_text
1938
- # print(f" [Linker] ✅ Q{i} ('{q_preview}') -> STANDARD PASSAGE (Score: {score_old:.3f})")
1939
- # linked = True
1940
- # consecutive_failures = 0 # Success! Reset the kill switch.
1941
-
1942
- # if not linked:
1943
- # # 3. DECAY LOGIC
1944
- # if current_passage_text:
1945
- # consecutive_failures += 1
1946
- # print(
1947
- # f" [Linker] ⚠️ Q{i} NOT LINKED. (Failures: {consecutive_failures}/{MAX_CONSECUTIVE_FAILURES})")
1948
-
1949
- # if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
1950
- # print(f" [Linker] 🗑️ Context dropped due to {consecutive_failures} consecutive misses.")
1951
- # current_passage_text = None
1952
- # consecutive_failures = 0
1953
- # else:
1954
- # print(f" [Linker] ⚠️ Q{i} NOT LINKED (No active context).")
1955
-
1956
- # # --- PHASE 3: CLEANUP AND INTERPOLATION ---
1957
- # print(" [Linker] Running Cleanup & Interpolation...")
1958
-
1959
- # # 3A. Self-Correction (Remove weak links)
1960
- # for i in passage_definer_indices:
1961
- # entry = data[i]
1962
- # if entry.get("question") and entry.get("type") != "METADATA":
1963
- # passage_to_check = entry.get("passage") or entry.get("new_passage")
1964
- # if passage_to_check:
1965
- # self_sim = calculate_similarity(passage_to_check, create_query_text(entry))
1966
- # if self_sim < SIMILARITY_THRESHOLD:
1967
- # entry["passage"] = ""
1968
- # if "new_passage" in entry: entry["new_passage"] = ""
1969
- # print(f" [Cleanup] Removed weak link for Q{i}")
1970
-
1971
- # # 3B. Interpolation (Fill gaps)
1972
- # # We only interpolate if the gap is strictly 1 question wide to avoid undoing the decay logic
1973
- # for i in range(1, len(data) - 1):
1974
- # current_entry = data[i]
1975
- # is_gap = current_entry.get("question") and not current_entry.get("passage")
1976
- # if is_gap:
1977
- # prev_p = data[i - 1].get("passage")
1978
- # next_p = data[i + 1].get("passage")
1979
- # if prev_p and next_p and (prev_p == next_p) and prev_p.strip():
1980
- # current_entry["passage"] = prev_p
1981
- # print(f" [Linker] 🥪 Q{i} Interpolated from neighbors.")
1982
-
1983
- # return data
1984
-
1985
-
1986
-
1987
-
1988
-
1989
-
1990
-
1991
-
1992
-
1993
 
1994
 
1995
 
@@ -2162,20 +1702,6 @@ def correct_misaligned_options(structured_data: List[Dict[str, Any]]) -> List[Di
2162
  return structured_data
2163
 
2164
 
2165
- # ============================================================================
2166
- # --- PHASE 4: IMAGE EMBEDDING (Base64) ---
2167
- # ============================================================================
2168
-
2169
- # def get_base64_for_file(filepath: str) -> str:
2170
- # try:
2171
- # with open(filepath, 'rb') as f:
2172
- # return base64.b64encode(f.read()).decode('utf-8')
2173
- # except Exception as e:
2174
- # print(f" ❌ Error encoding file {filepath}: {e}")
2175
- # return ""
2176
-
2177
-
2178
-
2179
 
2180
  def get_base64_for_file(filepath: str) -> Optional[str]:
2181
  """Reads a file and returns its Base64 encoded string without the data URI prefix."""
@@ -2191,121 +1717,6 @@ def get_base64_for_file(filepath: str) -> Optional[str]:
2191
 
2192
 
2193
 
2194
- # def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[
2195
- # Dict[str, Any]]:
2196
- # print("\n" + "=" * 80)
2197
- # print("--- 4. STARTING IMAGE EMBEDDING (Base64) ---")
2198
- # print("=" * 80)
2199
- # if not structured_data: return []
2200
- # image_files = glob.glob(os.path.join(figure_extraction_dir, "*.png"))
2201
- # image_lookup = {}
2202
- # tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
2203
- # for filepath in image_files:
2204
- # filename = os.path.basename(filepath)
2205
- # match = re.search(r'_(figure|equation)(\d+)\.png$', filename, re.IGNORECASE)
2206
- # if match:
2207
- # key = f"{match.group(1).upper()}{match.group(2)}"
2208
- # image_lookup[key] = filepath
2209
- # print(f" -> Found {len(image_lookup)} image components.")
2210
- # final_structured_data = []
2211
- # for item in structured_data:
2212
- # text_fields = [item.get('question', ''), item.get('passage', '')]
2213
- # if 'options' in item:
2214
- # for opt_val in item['options'].values(): text_fields.append(opt_val)
2215
- # if 'new_passage' in item: text_fields.append(item['new_passage'])
2216
- # unique_tags_to_embed = set()
2217
- # for text in text_fields:
2218
- # if not text: continue
2219
- # for match in tag_regex.finditer(text):
2220
- # tag = match.group(0).upper()
2221
- # if tag in image_lookup: unique_tags_to_embed.add(tag)
2222
- # for tag in sorted(list(unique_tags_to_embed)):
2223
- # filepath = image_lookup[tag]
2224
- # base64_code = get_base64_for_file(filepath)
2225
- # base_key = tag.replace(' ', '').lower()
2226
- # item[base_key] = base64_code
2227
- # final_structured_data.append(item)
2228
- # print(f"✅ Image embedding complete.")
2229
- # return final_structured_data
2230
-
2231
-
2232
-
2233
-
2234
- # def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[
2235
- # Dict[str, Any]]:
2236
- # print("\n" + "=" * 80)
2237
- # print("--- 4. STARTING IMAGE EMBEDDING (Base64) / EQUATION TO LATEX CONVERSION ---")
2238
- # print("=" * 80)
2239
- # if not structured_data: return []
2240
- # image_files = glob.glob(os.path.join(figure_extraction_dir, "*.png"))
2241
- # image_lookup = {}
2242
- # tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
2243
- # for filepath in image_files:
2244
- # filename = os.path.basename(filepath)
2245
- # match = re.search(r'_(figure|equation)(\d+)\.png$', filename, re.IGNORECASE)
2246
- # if match:
2247
- # key = f"{match.group(1).upper()}{match.group(2)}"
2248
- # image_lookup[key] = filepath
2249
- # print(f" -> Found {len(image_lookup)} image components.")
2250
-
2251
- # final_structured_data = []
2252
-
2253
- # for item in structured_data:
2254
- # text_fields = [item.get('question', ''), item.get('passage', '')]
2255
- # if 'options' in item:
2256
- # for opt_val in item['options'].values(): text_fields.append(opt_val)
2257
- # if 'new_passage' in item: text_fields.append(item['new_passage'])
2258
-
2259
- # unique_tags_to_embed = set()
2260
- # for text in text_fields:
2261
- # if not text: continue
2262
- # for match in tag_regex.finditer(text):
2263
- # tag = match.group(0).upper()
2264
- # if tag in image_lookup: unique_tags_to_embed.add(tag)
2265
-
2266
- # # List of tags that were successfully converted to LaTeX
2267
- # tags_converted_to_latex = set()
2268
-
2269
- # for tag in sorted(list(unique_tags_to_embed)):
2270
- # filepath = image_lookup[tag]
2271
- # # Get the base64 code for processing, whether we embed it or convert it to LaTeX
2272
- # base64_code = get_base64_for_file(filepath)
2273
-
2274
- # # --- PIX2TEXT/EQUATION CONVERSION LOGIC START ---
2275
- # if tag.startswith('EQUATION') and p2t is not None:
2276
- # print(f" -> Converting EQUATION {tag} to LaTeX...")
2277
- # latex_code = get_latex_from_base64(base64_code)
2278
-
2279
- # # Replace the original tag (e.g., EQUATION1) in the item's text fields with LaTeX
2280
- # for key in ['question', 'passage', 'new_passage']:
2281
- # if item.get(key) and tag in item[key]:
2282
- # item[key] = item[key].replace(tag, latex_code)
2283
-
2284
- # if 'options' in item:
2285
- # for opt_key, opt_val in item['options'].items():
2286
- # if tag in opt_val:
2287
- # item['options'][opt_key] = opt_val.replace(tag, latex_code)
2288
-
2289
- # tags_converted_to_latex.add(tag)
2290
- # # Skip the embedding of the Base64 code for equations
2291
- # continue
2292
- # # --- PIX2TEXT/EQUATION CONVERSION LOGIC END ---
2293
-
2294
- # # Original logic (for figures): Embed the base64 code
2295
- # base_key = tag.replace(' ', '').lower()
2296
- # item[base_key] = base64_code
2297
-
2298
- # final_structured_data.append(item)
2299
-
2300
- # print(f"✅ Image embedding complete. {len(tags_converted_to_latex)} equations converted to LaTeX.")
2301
- # return final_structured_data
2302
-
2303
-
2304
-
2305
-
2306
-
2307
-
2308
-
2309
 
2310
  def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[ Dict[str, Any]]:
2311
  print("\n" + "=" * 80)
@@ -2476,91 +1887,6 @@ def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, stru
2476
 
2477
 
2478
 
2479
- # if __name__ == "__main__":
2480
- # parser = argparse.ArgumentParser(description="Complete Pipeline")
2481
- # parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
2482
- # parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
2483
-
2484
- # # --- ADDED ARGUMENT FOR DEBUGGING ---
2485
- # parser.add_argument("--raw_preds_path", type=str, default='BIO_debug.json',
2486
- # help="Debug path for raw BIO tag predictions (JSON).")
2487
- # # ------------------------------------
2488
- # args = parser.parse_args()
2489
-
2490
- # pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
2491
- # final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
2492
-
2493
- # # --- CALCULATE RAW PREDICTIONS OUTPUT PATH ---
2494
- # # raw_predictions_output_path = os.path.abspath(
2495
- # # args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
2496
- # # ---------------------------------------------
2497
-
2498
- # # --- UPDATED FUNCTION CALL ---
2499
- # final_json_data = run_document_pipeline(
2500
- # args.input_pdf,
2501
- # args.layoutlmv3_model_path )
2502
- # # -----------------------------
2503
-
2504
-
2505
-
2506
- # if final_json_data:
2507
- # with open(final_output_path, 'w', encoding='utf-8') as f:
2508
- # json.dump(final_json_data, f, indent=2, ensure_ascii=False)
2509
- # print(f"\n✅ Final Data Saved: {final_output_path}")
2510
- # else:
2511
- # print("\n❌ Pipeline Failed.")
2512
- # sys.exit(1)
2513
-
2514
-
2515
-
2516
-
2517
- # if __name__ == "__main__":
2518
- # parser = argparse.ArgumentParser(description="Complete Pipeline")
2519
- # parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
2520
- # parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
2521
-
2522
- # # --- ADDED ARGUMENT FOR DEBUGGING ---
2523
- # parser.add_argument("--raw_preds_path", type=str, default='BIO_debug.json',
2524
- # help="Debug path for raw BIO tag predictions (JSON).")
2525
- # # ------------------------------------
2526
- # args = parser.parse_args()
2527
-
2528
- # pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
2529
- # final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
2530
-
2531
- # # --- CALCULATE RAW PREDICTIONS OUTPUT PATH (Kept commented as per original script) ---
2532
- # # raw_predictions_output_path = os.path.abspath(
2533
- # # args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
2534
- # # ---------------------------------------------
2535
-
2536
- # # --- UPDATED FUNCTION CALL ---
2537
- # final_json_data = run_document_pipeline(
2538
- # args.input_pdf,
2539
- # args.layoutlmv3_model_path )
2540
- # # -----------------------------
2541
-
2542
- # # 🛑 CRITICAL FINAL FIX: CUSTOM JSON SAVING TO REMOVE ESCAPING 🛑
2543
- # if final_json_data:
2544
- # # 1. Dump the Python object to a standard JSON string.
2545
- # # This converts the in-memory single backslash ('\') into the JSON escaped double backslash ('\\').
2546
- # json_str = json.dumps(final_json_data, indent=2, ensure_ascii=False)
2547
-
2548
- # # 2. **UNDO ESCAPING:** The string currently contains '\\' where you need '\' for LaTeX.
2549
- # # We replace the JSON-escaped double backslash ('\\') with a single literal backslash ('\').
2550
- # final_output_content = json_str.replace('\\\\', '\\')
2551
-
2552
- # # 3. Write the corrected string content to the file.
2553
- # with open(final_output_path, 'w', encoding='utf-8') as f:
2554
- # f.write(final_output_content)
2555
-
2556
- # print(f"\n✅ Final Data Saved: {final_output_path}")
2557
- # else:
2558
- # print("\n❌ Pipeline Failed.")
2559
- # sys.exit(1)
2560
-
2561
-
2562
-
2563
-
2564
  if __name__ == "__main__":
2565
  parser = argparse.ArgumentParser(description="Complete Pipeline")
2566
  parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
 
91
 
92
 
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  def get_latex_from_base64(base64_string: str) -> str:
96
  """
 
577
  return img
578
 
579
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
580
 
581
 
582
  def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
 
607
 
608
 
609
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
610
 
611
 
612
 
 
1324
 
1325
 
1326
 
 
 
 
 
 
 
 
 
 
1327
  # ============================================================================
1328
  # --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
1329
  # ============================================================================
 
1530
  return 0.0
1531
 
1532
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1533
 
1534
 
1535
 
 
1702
  return structured_data
1703
 
1704
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1705
 
1706
  def get_base64_for_file(filepath: str) -> Optional[str]:
1707
  """Reads a file and returns its Base64 encoded string without the data URI prefix."""
 
1717
 
1718
 
1719
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1720
 
1721
  def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[ Dict[str, Any]]:
1722
  print("\n" + "=" * 80)
 
1887
 
1888
 
1889
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1890
  if __name__ == "__main__":
1891
  parser = argparse.ArgumentParser(description="Complete Pipeline")
1892
  parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")