Spaces:

heerjtdev
/

layout_latex

Running

App Files Community

heerjtdev committed on Jan 21

Commit

b13058c

verified ·

1 Parent(s): 71693a6

Update working_yolo_pipeline.py

Browse files

Files changed (1) hide show

working_yolo_pipeline.py +3 -667

working_yolo_pipeline.py CHANGED Viewed

@@ -536,183 +536,6 @@ def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> Li
     return sorted(final_separators)
 #======================================================================================================================================
-# def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
-#                                 top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
-#     """Extract word data with OCR caching to avoid redundant Tesseract runs."""
-#     word_data = page.get_text("words")
-#     if len(word_data) > 0:
-#         word_data = [(w[4], w[0], w[1], w[2], w[3]) for w in word_data]
-#     else:
-#         if _ocr_cache.has_ocr(pdf_path, page_num):
-#             word_data = _ocr_cache.get_ocr(pdf_path, page_num)
-#         else:
-#             try:
-#                 # --- OPTIMIZATION START ---
-#                 # 1. Render at Higher Resolution (Zoom 4.0 = ~300 DPI)
-#                 zoom_level = 4.0
-#                 pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
-#                 # 2. Convert directly to OpenCV format (Faster than PIL)
-#                 img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
-#                 if pix.n == 3:
-#                     img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
-#                 elif pix.n == 4:
-#                     img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
-#                 # 3. Apply Preprocessing (Thresholding)
-#                 processed_img = preprocess_image_for_ocr(img_np)
-#                 # 4. Optimized Tesseract Config
-#                 # --psm 6: Assume a single uniform block of text (Great for columns/questions)
-#                 # --oem 3: Default engine (LSTM)
-#                 custom_config = r'--oem 3 --psm 6'
-#                 data = pytesseract.image_to_data(processed_img, output_type=pytesseract.Output.DICT,
-#                                                  config=custom_config)
-#                 full_word_data = []
-#                 for i in range(len(data['level'])):
-#                     text = data['text'][i].strip()
-#                     if text:
-#                         # Scale coordinates back to PDF points
-#                         x1 = data['left'][i] / zoom_level
-#                         y1 = data['top'][i] / zoom_level
-#                         x2 = (data['left'][i] + data['width'][i]) / zoom_level
-#                         y2 = (data['top'][i] + data['height'][i]) / zoom_level
-#                         full_word_data.append((text, x1, y1, x2, y2))
-#                 word_data = full_word_data
-#                 _ocr_cache.set_ocr(pdf_path, page_num, word_data)
-#                 # --- OPTIMIZATION END ---
-#             except Exception as e:
-#                 print(f"  ❌ OCR Error in detection phase: {e}")
-#                 return []
-#     # Apply margin filtering
-#     page_height = page.rect.height
-#     y_min = page_height * top_margin_percent
-#     y_max = page_height * (1 - bottom_margin_percent)
-#     return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
-#============================================================================================================
-# def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
-#                                 top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
-#     word_data = page.get_text("words")
-#     if len(word_data) > 0:
-#         word_data = [(w[4], w[0], w[1], w[2], w[3]) for w in word_data]
-#     else:
-#         if _ocr_cache.has_ocr(pdf_path, page_num):
-#             word_data = _ocr_cache.get_ocr(pdf_path, page_num)
-#         else:
-#             try:
-#                 # 1. Render at Higher Resolution
-#                 zoom_level = 4.0
-#                 pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
-#                 img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
-#                 # Convert to BGR for RapidOCR
-#                 if pix.n == 3: img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
-#                 elif pix.n == 4: img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
-#                 # 2. Run RapidOCR
-#                 # RapidOCR returns: [[box, text, score], ...]
-#                 # where box is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
-#                 results, _ = ocr_engine(img_np)
-#                 full_word_data = []
-#                 if results:
-#                     for box, text, score in results:
-#                         text = text.strip()
-#                         if text:
-#                             # 3. Convert Polygon to BBox and Scale back to PDF points
-#                             xs = [p[0] for p in box]
-#                             ys = [p[1] for p in box]
-#                             x1 = min(xs) / zoom_level
-#                             y1 = min(ys) / zoom_level
-#                             x2 = max(xs) / zoom_level
-#                             y2 = max(ys) / zoom_level
-#                             full_word_data.append((text, x1, y1, x2, y2))
-#                 word_data = full_word_data
-#                 _ocr_cache.set_ocr(pdf_path, page_num, word_data)
-#             except Exception as e:
-#                 print(f"  ❌ RapidOCR Error in detection phase: {e}")
-#                 return []
-#     # Apply margin filtering
-#     page_height = page.rect.height
-#     y_min = page_height * top_margin_percent
-#     y_max = page_height * (1 - bottom_margin_percent)
-#     return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
-# def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
-#                                 top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
-#     word_data = page.get_text("words")
-#     if len(word_data) > 0:
-#         # Reformat standard PyMuPDF output to (text, x1, y1, x2, y2)
-#         word_data = [(w[4], w[0], w[1], w[2], w[3]) for w in word_data]
-#     else:
-#         if _ocr_cache.has_ocr(pdf_path, page_num):
-#             word_data = _ocr_cache.get_ocr(pdf_path, page_num)
-#         else:
-#             try:
-#                 # 1. Render at Higher Resolution
-#                 zoom_level = 4.0
-#                 pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
-#                 img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
-#                 # Convert to BGR for RapidOCR
-#                 if pix.n == 3:
-#                     img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
-#                 elif pix.n == 4:
-#                     img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
-#                 # 2. Run RapidOCR
-#                 ocr_out = ocr_engine(img_np)
-#                 full_word_data = []
-#                 # CRITICAL FIX: Use 'is not None' to avoid NumPy truthiness ambiguity
-#                 if ocr_out is not None and ocr_out.boxes is not None:
-#                     # Use zip to iterate through boxes, text, and scores simultaneously
-#                     for box, text, score in zip(ocr_out.boxes, ocr_out.txts, ocr_out.scores):
-#                         text = str(text).strip()
-#                         if text:
-#                             # 3. Convert Polygon to BBox and Scale back to PDF points
-#                             xs = [p[0] for p in box]
-#                             ys = [p[1] for p in box]
-#                             x1 = min(xs) / zoom_level
-#                             y1 = min(ys) / zoom_level
-#                             x2 = max(xs) / zoom_level
-#                             y2 = max(ys) / zoom_level
-#                             full_word_data.append((text, x1, y1, x2, y2))
-#                 word_data = full_word_data
-#                 _ocr_cache.set_ocr(pdf_path, page_num, word_data)
-#             except Exception as e:
-#                 print(f"   ❌ RapidOCR Error in detection phase: {e}")
-#                 return []
-#     # Apply margin filtering
-#     page_height = page.rect.height
-#     y_min = page_height * top_margin_percent
-#     y_max = page_height * (1 - bottom_margin_percent)
-#     # Return filtered data where y-coordinates fall within the margins
-#     return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
 def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
@@ -1129,19 +952,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
     print(f"    [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
-    # if results and results[0].boxes:
-    #     for box in results[0].boxes:
-    #         class_id = int(box.cls[0])
-    #         class_name = model.names[class_id]
-    #         if class_name in TARGET_CLASSES:
-    #             x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
-    #             relevant_detections.append(
-    #                 {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
-    #             )
-    # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
-    # print(f"    [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
     # ====================================================================
     # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
@@ -1252,108 +1063,10 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                 })
         else:
             # === START OF OPTIMIZED OCR BLOCK ===
-            # try:
-            #     # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
-            #     ocr_zoom = 4.0
-            #     pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
-            #     # Convert PyMuPDF Pixmap to OpenCV format
-            #     img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
-            #                                                                         pix_ocr.n)
-            #     if pix_ocr.n == 3:
-            #         img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
-            #     elif pix_ocr.n == 4:
-            #         img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
-            #     # 2. Preprocess (Binarization)
-            #     processed_img = preprocess_image_for_ocr(img_ocr_np)
-            #     # 3. Run Tesseract with Optimized Configuration
-            #     custom_config = r'--oem 3 --psm 6'
-            #     hocr_data = pytesseract.image_to_data(
-            #         processed_img,
-            #         output_type=pytesseract.Output.DICT,
-            #         config=custom_config
-            #     )
-            #     for i in range(len(hocr_data['level'])):
-            #         text = hocr_data['text'][i] # Retrieve raw Tesseract text
-            #         # --- FIX: SANITIZE TEXT AND THEN STRIP ---
-            #         cleaned_text = sanitize_text(text).strip()
-            #         if cleaned_text and hocr_data['conf'][i] > -1:
-            #             # 4. Coordinate Mapping
-            #             scale_adjustment = scale_factor / ocr_zoom
-            #             x1 = int(hocr_data['left'][i] * scale_adjustment)
-            #             y1 = int(hocr_data['top'][i] * scale_adjustment)
-            #             w = int(hocr_data['width'][i] * scale_adjustment)
-            #             h = int(hocr_data['height'][i] * scale_adjustment)
-            #             x2 = x1 + w
-            #             y2 = y1 + h
-            #             raw_ocr_output.append({
-            #                 'type': 'text',
-            #                 'word': cleaned_text, # Use the sanitized word
-            #                 'confidence': float(hocr_data['conf'][i]),
-            #                 'bbox': [x1, y1, x2, y2],
-            #                 'y0': y1,
-            #                 'x0': x1
-            #             })
-            # except Exception as e:
-        #     print(f"  ❌ Tesseract OCR Error: {e}")
 #=============================================================================================================================================================
 #=============================================================================================================================================================
-            # else:
-            # # === START OF RAPIDOCR BLOCK ===
-            # try:
-            #     # 1. Re-render Page at High Resolution (Standardizing to Zoom 4.0)
-            #     ocr_zoom = 4.0
-            #     pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
-            #     # Convert PyMuPDF Pixmap to OpenCV format (BGR)
-            #     img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(
-            #         pix_ocr.height, pix_ocr.width, pix_ocr.n
-            #     )
-            #     if pix_ocr.n == 3:
-            #         img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
-            #     elif pix_ocr.n == 4:
-            #         img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
-            #     # 2. Run RapidOCR (Models handle preprocessing internally)
-            #     results, _ = ocr_engine(img_ocr_np)
-            #     if results:
-            #         # Calculate scaling from OCR image (4.0) to your pipeline standard (scale_factor=2.0)
-            #         scale_adjustment = scale_factor / ocr_zoom
-            #         for box, text, score in results:
-            #             # Sanitize and clean text
-            #             cleaned_text = sanitize_text(text).strip()
-            #             if cleaned_text:
-            #                 # 3. Coordinate Mapping (Convert 4-point polygon to x1, y1, x2, y2)
-            #                 xs = [p[0] for p in box]
-            #                 ys = [p[1] for p in box]
-            #                 x1 = int(min(xs) * scale_adjustment)
-            #                 y1 = int(min(ys) * scale_adjustment)
-            #                 x2 = int(max(xs) * scale_adjustment)
-            #                 y2 = int(max(ys) * scale_adjustment)
-            #                 raw_ocr_output.append({
-            #                     'type': 'text',
-            #                     'word': cleaned_text,
-            #                     'confidence': float(score) * 100, # Converting 0-1.0 to 0-100 scale
-            #                     'bbox': [x1, y1, x2, y2],
-            #                     'y0': y1,
-            #                     'x0': x1
-            #                 })
-            # except Exception as e:
-            #     print(f"  ❌ RapidOCR Fallback Error: {e}")
             try:
                 # 1. Re-render Page at High Resolution (Standardizing to Zoom 4.0)
                 ocr_zoom = 4.0
@@ -1926,163 +1639,6 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
 # ============================================================================
-# def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
-#     print("\n" + "=" * 80)
-#     print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
-#     print("=" * 80)
-#     try:
-#         with open(input_path, 'r', encoding='utf-8') as f:
-#             predictions_by_page = json.load(f)
-#     except Exception as e:
-#         print(f"❌ Error loading raw prediction file: {e}")
-#         return None
-#     predictions = []
-#     for page_item in predictions_by_page:
-#         if isinstance(page_item, dict) and 'data' in page_item:
-#             predictions.extend(page_item['data'])
-#     structured_data = []
-#     current_item = None
-#     current_option_key = None
-#     current_passage_buffer = []
-#     current_text_buffer = []
-#     first_question_started = False
-#     last_entity_type = None
-#     just_finished_i_option = False
-#     is_in_new_passage = False
-#     def finalize_passage_to_item(item, passage_buffer):
-#         if passage_buffer:
-#             passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
-#             if item.get('passage'):
-#                 item['passage'] += ' ' + passage_text
-#             else:
-#                 item['passage'] = passage_text
-#         passage_buffer.clear()
-#     for item in predictions:
-#         word = item['word']
-#         label = item['predicted_label']
-#         entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
-#         current_text_buffer.append(word)
-#         previous_entity_type = last_entity_type
-#         is_passage_label = (entity_type == 'PASSAGE')
-#         if not first_question_started:
-#             if label != 'B-QUESTION' and not is_passage_label:
-#                 just_finished_i_option = False
-#                 is_in_new_passage = False
-#                 continue
-#             if is_passage_label:
-#                 current_passage_buffer.append(word)
-#                 last_entity_type = 'PASSAGE'
-#                 just_finished_i_option = False
-#                 is_in_new_passage = False
-#                 continue
-#         if label == 'B-QUESTION':
-#             if not first_question_started:
-#                 header_text = ' '.join(current_text_buffer[:-1]).strip()
-#                 if header_text or current_passage_buffer:
-#                     metadata_item = {'type': 'METADATA', 'passage': ''}
-#                     finalize_passage_to_item(metadata_item, current_passage_buffer)
-#                     if header_text: metadata_item['text'] = header_text
-#                     structured_data.append(metadata_item)
-#                 first_question_started = True
-#                 current_text_buffer = [word]
-#             if current_item is not None:
-#                 finalize_passage_to_item(current_item, current_passage_buffer)
-#                 current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
-#                 structured_data.append(current_item)
-#                 current_text_buffer = [word]
-#             current_item = {
-#                 'question': word, 'options': {}, 'answer': '', 'passage': '', 'text': ''
-#             }
-#             current_option_key = None
-#             last_entity_type = 'QUESTION'
-#             just_finished_i_option = False
-#             is_in_new_passage = False
-#             continue
-#         if current_item is not None:
-#             if is_in_new_passage:
-#                 # 🔑 Robust Initialization and Appending for 'new_passage'
-#                 if 'new_passage' not in current_item:
-#                     current_item['new_passage'] = word
-#                 else:
-#                     current_item['new_passage'] += f' {word}'
-#                 if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
-#                     is_in_new_passage = False
-#                 if label.startswith(('B-', 'I-')): last_entity_type = entity_type
-#                 continue
-#             is_in_new_passage = False
-#             if label.startswith('B-'):
-#                 if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
-#                     finalize_passage_to_item(current_item, current_passage_buffer)
-#                     current_passage_buffer = []
-#                 last_entity_type = entity_type
-#                 if entity_type == 'PASSAGE':
-#                     if previous_entity_type == 'OPTION' and just_finished_i_option:
-#                         current_item['new_passage'] = word  # Initialize the new passage start
-#                         is_in_new_passage = True
-#                     else:
-#                         current_passage_buffer.append(word)
-#                 elif entity_type == 'OPTION':
-#                     current_option_key = word
-#                     current_item['options'][current_option_key] = word
-#                     just_finished_i_option = False
-#                 elif entity_type == 'ANSWER':
-#                     current_item['answer'] = word
-#                     current_option_key = None
-#                     just_finished_i_option = False
-#                 elif entity_type == 'QUESTION':
-#                     current_item['question'] += f' {word}'
-#                     just_finished_i_option = False
-#             elif label.startswith('I-'):
-#                 if entity_type == 'QUESTION':
-#                     current_item['question'] += f' {word}'
-#                 elif entity_type == 'PASSAGE':
-#                     if previous_entity_type == 'OPTION' and just_finished_i_option:
-#                         current_item['new_passage'] = word  # Initialize the new passage start
-#                         is_in_new_passage = True
-#                     else:
-#                         if not current_passage_buffer: last_entity_type = 'PASSAGE'
-#                         current_passage_buffer.append(word)
-#                 elif entity_type == 'OPTION' and current_option_key is not None:
-#                     current_item['options'][current_option_key] += f' {word}'
-#                     just_finished_i_option = True
-#                 elif entity_type == 'ANSWER':
-#                     current_item['answer'] += f' {word}'
-#                 just_finished_i_option = (entity_type == 'OPTION')
-#             elif label == 'O':
-#                 if last_entity_type == 'QUESTION':
-#                     current_item['question'] += f' {word}'
-#                 just_finished_i_option = False
-#     if current_item is not None:
-#         finalize_passage_to_item(current_item, current_passage_buffer)
-#         current_item['text'] = ' '.join(current_text_buffer).strip()
-#         structured_data.append(current_item)
-#     for item in structured_data:
-#         item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
-#         if 'new_passage' in item:
-#             item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
-#     try:
-#         with open(output_path, 'w', encoding='utf-8') as f:
-#             json.dump(structured_data, f, indent=2, ensure_ascii=False)
-#     except Exception:
-#         pass
-#     return structured_data
@@ -2600,104 +2156,9 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
-# def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
-#     List[Dict[str, Any]]]:
-# def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
-#     if not os.path.exists(input_pdf_path): return None
-#     print("\n" + "#" * 80)
-#     print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
-#     print("#" * 80)
-#     pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
-#     temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
-#     os.makedirs(temp_pipeline_dir, exist_ok=True)
-#     preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
-#     raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
-#     structured_intermediate_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json")
-#     final_result = None
-#     try:
-#         # Phase 1: Preprocessing with YOLO First + Masking
-#         preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
-#         if not preprocessed_json_path_out: return None
-#         # Phase 2: Inference
-#         page_raw_predictions_list = run_inference_and_get_raw_words(
-#             input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
-#         )
-#         if not page_raw_predictions_list: return None
-#         # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
-#         # Save raw predictions to the temporary file
-#         with open(raw_output_path, 'w', encoding='utf-8') as f:
-#             json.dump(page_raw_predictions_list, f, indent=4)
-#         # Explicitly copy/save the raw predictions to the user-specified debug path
-#         # if raw_predictions_output_path:
-#         #     shutil.copy(raw_output_path, raw_predictions_output_path)
-#         #     print(f"\n✅ DEBUG: Raw predictions saved to: {raw_predictions_output_path}")
-#         # ----------------------------------------
-#         # Phase 3: Decoding
-#         structured_data_list = convert_bio_to_structured_json_relaxed(
-#             raw_output_path, structured_intermediate_output_path
-#         )
-#         if not structured_data_list: return None
-#         structured_data_list = correct_misaligned_options(structured_data_list)
-#         structured_data_list = process_context_linking(structured_data_list)
-#         # Phase 4: Embedding / Equation to LaTeX Conversion
-#         final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
-# #================================================================================
-#         # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
-# #================================================================================
-#         print("\n" + "=" * 80)
-#         print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
-#         print("=" * 80)
-#         # 1. Initialize and Load the Classifier
-#         classifier = HierarchicalClassifier()
-#         if classifier.load_models():
-#             # 2. Run Classification on the *Final* Result
-#             # The function modifies the list in place and returns it
-#             final_result = post_process_json_with_inference(
-#                 final_result, classifier
-#             )
-#             print("✅ Classification complete. Tags added to final output.")
-#         else:
-#             print("❌ Classification model loading failed. Outputting un-tagged data.")
-#         # ====================================================================
-#     except Exception as e:
-#         print(f"❌ FATAL ERROR: {e}")
-#         import traceback
-#         traceback.print_exc()
-#         return None
-#     finally:
-#         try:
-#             for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
-#                 os.remove(f)
-#             os.rmdir(temp_pipeline_dir)
-#         except Exception:
-#             pass
-#     print("\n" + "#" * 80)
-#     print("### OPTIMIZED PIPELINE EXECUTION COMPLETE ###")
-#     print("#" * 80)
-#     return final_result
@@ -2783,131 +2244,6 @@ import time
 import traceback
 import glob
-# def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
-#     if not os.path.exists(input_pdf_path):
-#         print(f"❌ ERROR: File not found: {input_pdf_path}")
-#         return None
-#     print("\n" + "#" * 80)
-#     print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
-#     print(f"Input: {input_pdf_path}")
-#     print("#" * 80)
-#     overall_start = time.time()
-#     pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
-#     temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
-#     os.makedirs(temp_pipeline_dir, exist_ok=True)
-#     preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
-#     raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
-#     # If the user didn't provide a path, create one in the temp directory
-#     if structured_intermediate_output_path is None:
-#         structured_intermediate_output_path = os.path.join(
-#             temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json"
-#         )
-#     final_result = None
-#     try:
-#         # --- Phase 1: Preprocessing ---
-#         print(f"\n[Step 1/5] Preprocessing (YOLO + Masking)...")
-#         p1_start = time.time()
-#         preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
-#         if not preprocessed_json_path_out:
-#             print("❌ FAILED at Step 1: Preprocessing returned None.")
-#             return None
-#         print(f"✅ Step 1 Complete ({time.time() - p1_start:.2f}s)")
-#         # --- Phase 2: Inference ---
-#         print(f"\n[Step 2/5] Inference (LayoutLMv3)...")
-#         p2_start = time.time()
-#         page_raw_predictions_list = run_inference_and_get_raw_words(
-#             input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
-#         )
-#         if not page_raw_predictions_list:
-#             print("❌ FAILED at Step 2: Inference returned no data.")
-#             return None
-#         # Save raw predictions for Step 3
-#         with open(raw_output_path, 'w', encoding='utf-8') as f:
-#             json.dump(page_raw_predictions_list, f, indent=4)
-#         print(f"✅ Step 2 Complete ({time.time() - p2_start:.2f}s)")
-#         # --- Phase 3: Decoding ---
-#         print(f"\n[Step 3/5] Decoding (BIO to Structured JSON)...")
-#         p3_start = time.time()
-#         structured_data_list = convert_bio_to_structured_json_relaxed(
-#             raw_output_path, structured_intermediate_output_path
-#         )
-#         if not structured_data_list:
-#             print("❌ FAILED at Step 3: BIO conversion failed.")
-#             return None
-#         # Logic adjustments
-#         print("... Correcting misalignments and linking context ...")
-#         structured_data_list = correct_misaligned_options(structured_data_list)
-#         structured_data_list = process_context_linking(structured_data_list)
-#         print(f"✅ Step 3 Complete ({time.time() - p3_start:.2f}s)")
-#         # --- Phase 4: Base64 & LaTeX ---
-#         print(f"\n[Step 4/5] Finalizing Layout (Base64 Images & LaTeX)...")
-#         p4_start = time.time()
-#         final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
-#         if not final_result:
-#             print("❌ FAILED at Step 4: Final formatting failed.")
-#             return None
-#         print(f"✅ Step 4 Complete ({time.time() - p4_start:.2f}s)")
-#         # --- ADD THIS NEW STEP HERE ---
-#         print(f"\n[Step 4.5/5] Adding Question Type Classification...")
-#         p4_5_start = time.time()
-#         final_result = add_question_type_validation(final_result)
-#         print(f"✅ Step 4.5 Complete ({time.time() - p4_5_start:.2f}s)")
-#         # --- END OF NEW STEP ---
-#         # --- Phase 5: Hierarchical Tagging ---
-#         print(f"\n[Step 5/5] AI Classification (Subject/Concept Tagging)...")
-#         p5_start = time.time()
-#         classifier = HierarchicalClassifier()
-#         if classifier.load_models():
-#             final_result = post_process_json_with_inference(final_result, classifier)
-#             print(f"✅ Step 5 Complete: Tags added ({time.time() - p5_start:.2f}s)")
-#         else:
-#             print("⚠️ WARNING: Classifier models failed to load. Skipping tags.")
-#     except Exception as e:
-#         print(f"\n‼️ FATAL PIPELINE EXCEPTION:")
-#         print(f"Error Message: {str(e)}")
-#         traceback.print_exc()
-#         return None
-#     finally:
-#         print(f"\nCleaning up temporary files in {temp_pipeline_dir}...")
-#         try:
-#             for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
-#                 os.remove(f)
-#             os.rmdir(temp_pipeline_dir)
-#             print("🧹 Cleanup successful.")
-#         except Exception as e:
-#             print(f"⚠️ Cleanup failed: {e}")
-#     total_time = time.time() - overall_start
-#     print("\n" + "#" * 80)
-#     print(f"### PIPELINE COMPLETE | Total Time: {total_time:.2f}s ###")
-#     print("#" * 80)
-#     return final_result

     return sorted(final_separators)
 #======================================================================================================================================
 def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
     merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
     print(f"    [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
     # ====================================================================
     # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
                 })
         else:
             # === START OF OPTIMIZED OCR BLOCK ===
 #=============================================================================================================================================================
 #=============================================================================================================================================================
             try:
                 # 1. Re-render Page at High Resolution (Standardizing to Zoom 4.0)
                 ocr_zoom = 4.0
 # ============================================================================
 import traceback
 import glob