Spaces:

heerjtdev
/

layout_latex

Running

App Files Files Community

heerjtdev committed 28 days ago

Commit

17d97ef

verified ·

1 Parent(s): bbc046a

Update working_yolo_pipeline.py

Browse files

Files changed (1) hide show

working_yolo_pipeline.py +778 -146

working_yolo_pipeline.py CHANGED Viewed

@@ -146,6 +146,60 @@ def get_latex_from_base64(base64_string: str) -> str:
 # ============================================================================
 # --- CONFIGURATION AND CONSTANTS ---
@@ -586,6 +640,79 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
@@ -1115,6 +1242,285 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
 def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
@@ -1572,6 +1978,299 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
 # ============================================================================
 # --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
@@ -2058,171 +2757,104 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
 # ============================================================================
-def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
-    """
-    Wraps an image into a temporary PyMuPDF document/page safely.
-    Uses an in-memory buffer to bypass 'encoder pdf not available' errors.
-    """
-    # 1. Use PIL to open the image and ensure it's in RGB mode
-    img = Image.open(image_path).convert("RGB")
-    # 2. Use a bytes buffer to save the image as a PDF via PIL's engine
-    pdf_stream = io.BytesIO()
-    img.save(pdf_stream, format="PDF")
-    pdf_stream.seek(0)
-    # 3. Open that PDF stream with PyMuPDF
-    doc = fitz.open("pdf", pdf_stream.read())
-    return doc, doc[0]
-def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
-    """
-    Modified pipeline that handles both PDFs and Images, running YOLO,
-    Tesseract OCR, and LayoutLMv3 inference.
-    """
-    # 1. INITIALIZE YOLO
-    yolo_model = YOLO(WEIGHTS_PATH)
-    # 2. DETECT FILE TYPE
-    ext = os.path.splitext(input_path)[1].lower()
-    is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
-    all_pages_data = []
-    pdf_name = os.path.basename(input_path)
     try:
-        if is_image:
-            print(f"📸 Image detected: {input_path}. Processing with YOLO + Tesseract.")
-            doc, page = load_image_as_fitz_page(input_path)
-            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
-            img_np = pixmap_to_numpy(pix)
-            page_data, _ = preprocess_and_ocr_page(
-                img_np, yolo_model, input_path, 0, page, pdf_name
-            )
-            if page_data:
-                all_pages_data.append(page_data)
-            doc.close()
-        else:
-            doc = fitz.open(input_path)
-            print(f"📄 Processing PDF: {pdf_name} ({len(doc)} pages)")
-            for page_index in range(len(doc)):
-                page = doc[page_index]
-                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
-                img_np = pixmap_to_numpy(pix)
-                page_data, _ = preprocess_and_ocr_page(
-                    img_np, yolo_model, input_path, page_index, page, pdf_name
-                )
-                if page_data:
-                    all_pages_data.append(page_data)
-            doc.close()
-        if not all_pages_data:
-            print("❌ No data extracted.")
-            return None
-        # 3. CONSOLIDATE BLOCKS FOR INFERENCE (Safe against List vs Dict)
-        sequential_blocks = []
-        for p_data in all_pages_data:
-            if isinstance(p_data, dict):
-                blocks = p_data.get('blocks', [])
-                sequential_blocks.extend(blocks)
-            elif isinstance(p_data, list):
-                sequential_blocks.extend(p_data)
-        # --- 4. STARTING LAYOUTLMV3 INFERENCE ---
-        print("\n" + "=" * 80)
-        print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
-        print("=" * 80)
-        tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
-        # --- FIX: ROBUST KEY REMAPPING FOR LAYOUTLMV3 ---
-        checkpoint = torch.load(layoutlmv3_model_path, map_location=device)
-        state_dict = checkpoint.get('model_state_dict', checkpoint)
-        # Rename keys from 'layoutlm.xxx' to 'layoutlmv3.xxx' if necessary
-        new_state_dict = {}
-        for key, value in state_dict.items():
-            if key.startswith("layoutlm."):
-                new_key = key.replace("layoutlm.", "layoutlmv3.", 1)
-                new_state_dict[new_key] = value
-            else:
-                new_state_dict[key] = value
-        # Load with strict=False to handle minor metadata differences
-        model.load_state_dict(new_state_dict, strict=False)
-        # -----------------------------------------------
-        model.to(device)
-        model.eval()
-        final_result = run_layoutlmv3_inference_on_blocks(sequential_blocks, model, tokenizer, device)
-        # 5. POST-PROCESS CLASSIFICATION
         classifier = HierarchicalClassifier()
         if classifier.load_models():
-            final_result = post_process_json_with_inference(final_result, classifier)
-            print("✅ Classification complete.")
-        return final_result
     except Exception as e:
         import traceback
         traceback.print_exc()
-        print(f"❌ FATAL ERROR in pipeline: {e}")
         return None
-# #================================================================================
-#         # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
-# #================================================================================
-#         print("\n" + "=" * 80)
-#         print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
-#         print("=" * 80)
-#         # 1. Initialize and Load the Classifier
-#         classifier = HierarchicalClassifier()
-#         if classifier.load_models():
-#             # 2. Run Classification on the *Final* Result
-#             # The function modifies the list in place and returns it
-#             final_result = post_process_json_with_inference(
-#                 final_result, classifier
-#             )
-#             print("✅ Classification complete. Tags added to final output.")
-#         else:
-#             print("❌ Classification model loading failed. Outputting un-tagged data.")
-#         # ====================================================================
-#     except Exception as e:
-#         print(f"❌ FATAL ERROR: {e}")
-#         import traceback
-#         traceback.print_exc()
-#         return None
-#     finally:
-#         try:
-#             for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
-#                 os.remove(f)
-#             os.rmdir(temp_pipeline_dir)
-#         except Exception:
-#             pass
-#     print("\n" + "#" * 80)
-#     print("### OPTIMIZED PIPELINE EXECUTION COMPLETE ###")
-#     print("#" * 80)
-#     return final_result

+# def get_latex_from_base64(base64_string: str) -> str:
+#     """
+#     Decodes a Base64 image string and uses the pre-initialized TrOCR/ORT model
+#     to recognize the formula. It cleans the output by stripping carriage returns and
+#     newlines; the double-backslash-to-single-backslash replacement step is currently disabled.
+#     """
+#     if ort_model is None or processor is None:
+#         return "[MODEL_ERROR: Model not initialized]"
+#     try:
+#         # 1. Decode Base64 to Image
+#         image_data = base64.b64decode(base64_string)
+#         # We must ensure the image is RGB format for the model input
+#         image = Image.open(io.BytesIO(image_data)).convert('RGB')
+#         # 2. Preprocess the image
+#         pixel_values = processor(images=image, return_tensors="pt").pixel_values
+#         # 3. Text Generation (OCR)
+#         generated_ids = ort_model.generate(pixel_values)
+#         raw_generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+#         if not raw_generated_text:
+#             return "[OCR_WARNING: No formula found]"
+#         latex_string = raw_generated_text[0]
+#         # ==============================================================================
+#         # --- DEBUGGING BLOCK: CHECK TrOCR RAW OUTPUT ---
+#         # ==============================================================================
+#         print(f"[DEBUG] TrOCR Raw Output: '{latex_string}'")
+#         # ==============================================================================
+#         # --- 4. Post-processing and Cleanup ---
+#         # # A. Remove all spaces/line breaks
+#         # cleaned_latex = re.sub(r'\s+', '', latex_string)
+#         cleaned_latex = re.sub(r'[\r\n]+', '', latex_string)
+#         # B. DISABLED (kept for reference): replace double backslashes (\\) with single backslashes (\).
+#         # This would correct model output that already over-escaped the LaTeX commands.
+#         # Python literal: '\\\\' would be replaced with '\\'.
+#         #cleaned_latex = cleaned_latex.replace('\\\\', '\\')
+#         return cleaned_latex
+#     except Exception as e:
+#         # Catch any unexpected errors
+#         print(f"  ❌ TR-OCR Recognition failed: {e}")
+#         return f"[TR_OCR_ERROR: Recognition failed: {e}]"
 # ============================================================================
 # --- CONFIGURATION AND CONSTANTS ---
+# def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
+#     raw_word_data = fitz_page.get_text("words")
+#     converted_ocr_output = []
+#     DEFAULT_CONFIDENCE = 99.0
+#     for x1, y1, x2, y2, word, *rest in raw_word_data:
+#         # --- FIX: SANITIZE TEXT HERE ---
+#         # cleaned_word = sanitize_text(word)
+#         # if not cleaned_word.strip(): continue
+#         x1_pix = int(x1 * scale_factor)
+#         y1_pix = int(y1 * scale_factor)
+#         x2_pix = int(x2 * scale_factor)
+#         y2_pix = int(y2 * scale_factor)
+#         converted_ocr_output.append({
+#             'type': 'text',
+#             'word': cleaned_word, # Use the sanitized word
+#             'confidence': DEFAULT_CONFIDENCE,
+#             'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
+#             'y0': y1_pix, 'x0': x1_pix
+#         })
+#     return converted_ocr_output
+# def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
+#     raw_word_data = fitz_page.get_text("words")
+#     # ==============================================================================
+#     # --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS ---
+#     # ==============================================================================
+#     print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")
+#     debug_count = 0
+#     for item in raw_word_data:
+#         if debug_count >= 50: break
+#         # item format: (x0, y0, x1, y1, word, block_no, line_no, word_no)
+#         word_text = item[4]
+#         # Generate unicode hex codes for every character in the word
+#         unicode_points = [f"\\u{ord(c):04x}" for c in word_text]
+#         print(f"  Word {debug_count}: '{word_text}' -> Codes: {unicode_points}")
+#         debug_count += 1
+#     print("----------------------------------------------------------------------\n")
+#     # ==============================================================================
+#     converted_ocr_output = []
+#     DEFAULT_CONFIDENCE = 99.0
+#     for x1, y1, x2, y2, word, *rest in raw_word_data:
+#         # --- FIX: SANITIZE TEXT HERE ---
+#         cleaned_word = sanitize_text(word)
+#         if not cleaned_word.strip(): continue
+#         x1_pix = int(x1 * scale_factor)
+#         y1_pix = int(y1 * scale_factor)
+#         x2_pix = int(x2 * scale_factor)
+#         y2_pix = int(y2 * scale_factor)
+#         converted_ocr_output.append({
+#             'type': 'text',
+#             'word': cleaned_word, # Use the sanitized word
+#             'confidence': DEFAULT_CONFIDENCE,
+#             'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
+#             'y0': y1_pix, 'x0': x1_pix
+#         })
+#     return converted_ocr_output
+# def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
+#                             page_num: int, fitz_page: fitz.Page,
+#                             pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
+#     """
+#     OPTIMIZED FLOW:
+#     1. Run YOLO to find Equations/Tables.
+#     2. Mask raw text with YOLO boxes.
+#     3. Run Column Detection on the MASKED data.
+#     4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
+#     """
+#     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
+#     start_time_total = time.time()
+#     if original_img is None:
+#         print(f"  ❌ Invalid image for page {page_num}.")
+#         return None, None
+#     # ====================================================================
+#     # --- STEP 1: YOLO DETECTION ---
+#     # ====================================================================
+#     start_time_yolo = time.time()
+#     results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
+#     relevant_detections = []
+#     if results and results[0].boxes:
+#         for box in results[0].boxes:
+#             class_id = int(box.cls[0])
+#             class_name = model.names[class_id]
+#             if class_name in TARGET_CLASSES:
+#                 x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
+#                 relevant_detections.append(
+#                     {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
+#                 )
+#     merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
+#     print(f"    [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
+#     # ====================================================================
+#     # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
+#     # ====================================================================
+#     # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
+#     raw_words_for_layout = get_word_data_for_detection(
+#         fitz_page, pdf_path, page_num,
+#         top_margin_percent=0.10, bottom_margin_percent=0.10
+#     )
+#     masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
+#     # ====================================================================
+#     # --- STEP 3: COLUMN DETECTION ---
+#     # ====================================================================
+#     page_width_pdf = fitz_page.rect.width
+#     page_height_pdf = fitz_page.rect.height
+#     column_detection_params = {
+#         'cluster_bin_size': 2, 'cluster_smoothing': 2,
+#         'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
+#     }
+#     separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
+#     page_separator_x = None
+#     if separators:
+#         central_min = page_width_pdf * 0.35
+#         central_max = page_width_pdf * 0.65
+#         central_separators = [s for s in separators if central_min <= s <= central_max]
+#         if central_separators:
+#             center_x = page_width_pdf / 2
+#             page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
+#             print(f"      ✅ Column Split Confirmed at X={page_separator_x:.1f}")
+#         else:
+#             print("      ⚠️ Gutter found off-center. Ignoring.")
+#     else:
+#         print("      -> Single Column Layout Confirmed.")
+#     # ====================================================================
+#     # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
+#     # ====================================================================
+#     start_time_components = time.time()
+#     component_metadata = []
+#     fig_count_page = 0
+#     eq_count_page = 0
+#     for detection in merged_detections:
+#         x1, y1, x2, y2 = detection['coords']
+#         class_name = detection['class']
+#         if class_name == 'figure':
+#             GLOBAL_FIGURE_COUNT += 1
+#             counter = GLOBAL_FIGURE_COUNT
+#             component_word = f"FIGURE{counter}"
+#             fig_count_page += 1
+#         elif class_name == 'equation':
+#             GLOBAL_EQUATION_COUNT += 1
+#             counter = GLOBAL_EQUATION_COUNT
+#             component_word = f"EQUATION{counter}"
+#             eq_count_page += 1
+#         else:
+#             continue
+#         component_crop = original_img[y1:y2, x1:x2]
+#         component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
+#         cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
+#         y_midpoint = (y1 + y2) // 2
+#         component_metadata.append({
+#             'type': class_name, 'word': component_word,
+#             'bbox': [int(x1), int(y1), int(x2), int(y2)],
+#             'y0': int(y_midpoint), 'x0': int(x1)
+#         })
+#     # ====================================================================
+#     # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
+#     # ====================================================================
+#     raw_ocr_output = []
+#     scale_factor = 2.0  # Pipeline standard scale
+#     try:
+#         # Try getting native text first
+#         # NOTE: extract_native_words_and_convert MUST ALSO BE UPDATED TO USE sanitize_text
+#         raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
+#     except Exception as e:
+#         print(f"  ❌ Native text extraction failed: {e}")
+#     # If native text is missing, fall back to OCR
+#     if not raw_ocr_output:
+#         if _ocr_cache.has_ocr(pdf_path, page_num):
+#             print(f"  ⚡ Using cached Tesseract OCR for page {page_num}")
+#             cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
+#             for word_tuple in cached_word_data:
+#                 word_text, x1, y1, x2, y2 = word_tuple
+#                 # Scale from PDF points to Pipeline Pixels (2.0)
+#                 x1_pix = int(x1 * scale_factor)
+#                 y1_pix = int(y1 * scale_factor)
+#                 x2_pix = int(x2 * scale_factor)
+#                 y2_pix = int(y2 * scale_factor)
+#                 raw_ocr_output.append({
+#                     'type': 'text', 'word': word_text, 'confidence': 95.0,
+#                     'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
+#                     'y0': y1_pix, 'x0': x1_pix
+#                 })
+#         else:
+#             # === START OF OPTIMIZED OCR BLOCK ===
+#             try:
+#                 # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
+#                 ocr_zoom = 4.0
+#                 pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
+#                 # Convert PyMuPDF Pixmap to OpenCV format
+#                 img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
+#                                                                                     pix_ocr.n)
+#                 if pix_ocr.n == 3:
+#                     img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
+#                 elif pix_ocr.n == 4:
+#                     img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
+#                 # 2. Preprocess (Binarization)
+#                 processed_img = preprocess_image_for_ocr(img_ocr_np)
+#                 # 3. Run Tesseract with Optimized Configuration
+#                 custom_config = r'--oem 3 --psm 6'
+#                 hocr_data = pytesseract.image_to_data(
+#                     processed_img,
+#                     output_type=pytesseract.Output.DICT,
+#                     config=custom_config
+#                 )
+#                 # ==============================================================================
+#                 # --- DEBUGGING BLOCK: CHECK FIRST 50 OCR WORDS ---
+#                 # ==============================================================================
+#                 print(f"\n[DEBUG] Tesseract OCR Fallback (Page {page_num}): Checking first 50 words...")
+#                 debug_count = 0
+#                 for i in range(len(hocr_data['level'])):
+#                     text = hocr_data['text'][i].strip()
+#                     if text:
+#                         unicode_points = [f"\\u{ord(c):04x}" for c in text]
+#                         print(f"  OCR Word {debug_count}: '{text}' -> Codes: {unicode_points}")
+#                         debug_count += 1
+#                         if debug_count >= 50: break
+#                 print("----------------------------------------------------------------------\n")
+#                 # ==============================================================================
+#                 for i in range(len(hocr_data['level'])):
+#                     text = hocr_data['text'][i] # Retrieve raw Tesseract text
+#                     # --- FIX: SANITIZE TEXT AND THEN STRIP ---
+#                     cleaned_text = sanitize_text(text).strip()
+#                     if cleaned_text and hocr_data['conf'][i] > -1:
+#                         # 4. Coordinate Mapping
+#                         scale_adjustment = scale_factor / ocr_zoom
+#                         x1 = int(hocr_data['left'][i] * scale_adjustment)
+#                         y1 = int(hocr_data['top'][i] * scale_adjustment)
+#                         w = int(hocr_data['width'][i] * scale_adjustment)
+#                         h = int(hocr_data['height'][i] * scale_adjustment)
+#                         x2 = x1 + w
+#                         y2 = y1 + h
+#                         raw_ocr_output.append({
+#                             'type': 'text',
+#                             'word': cleaned_text, # Use the sanitized word
+#                             'confidence': float(hocr_data['conf'][i]),
+#                             'bbox': [x1, y1, x2, y2],
+#                             'y0': y1,
+#                             'x0': x1
+#                         })
+#             except Exception as e:
+#                 print(f"  ❌ Tesseract OCR Error: {e}")
+#             # === END OF OPTIMIZED OCR BLOCK ===
+#     # ====================================================================
+#     # --- STEP 6: OCR CLEANING AND MERGING ---
+#     # ====================================================================
+#     items_to_sort = []
+#     for ocr_word in raw_ocr_output:
+#         is_suppressed = False
+#         for component in component_metadata:
+#             # Do not include words that are inside figure/equation boxes
+#             ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
+#             if ioa > IOA_SUPPRESSION_THRESHOLD:
+#                 is_suppressed = True
+#                 break
+#         if not is_suppressed:
+#             items_to_sort.append(ocr_word)
+#     # Add figures/equations back into the flow as "words"
+#     items_to_sort.extend(component_metadata)
+#     # ====================================================================
+#     # --- STEP 7: LINE-BASED SORTING ---
+#     # ====================================================================
+#     items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
+#     lines = []
+#     for item in items_to_sort:
+#         placed = False
+#         for line in lines:
+#             y_ref = min(it['y0'] for it in line)
+#             if abs(y_ref - item['y0']) < LINE_TOLERANCE:
+#                 line.append(item)
+#                 placed = True
+#                 break
+#         if not placed and item['type'] in ['equation', 'figure']:
+#             for line in lines:
+#                 y_ref = min(it['y0'] for it in line)
+#                 if abs(y_ref - item['y0']) < 20:
+#                     line.append(item)
+#                     placed = True
+#                     break
+#         if not placed:
+#             lines.append([item])
+#     for line in lines:
+#         line.sort(key=lambda x: x['x0'])
+#     final_output = []
+#     for line in lines:
+#         for item in line:
+#             data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
+#             if 'tag' in item: data_item['tag'] = item['tag']
+#             final_output.append(data_item)
+#     return final_output, page_separator_x
 def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
+# def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
+#                                     preprocessed_json_path: str,
+#                                     column_detection_params: Optional[Dict] = None) -> List[Dict[str, Any]]:
+#     print("\n" + "=" * 80)
+#     print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE (Raw Word Output) ---")
+#     print("=" * 80)
+#     tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
+#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+#     print(f"  -> Using device: {device}")
+#     try:
+#         model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
+#         checkpoint = torch.load(model_path, map_location=device)
+#         model_state = checkpoint.get('model_state_dict', checkpoint)
+#         # Apply patch for layoutlmv3 compatibility with saved state_dict
+#         fixed_state_dict = {key.replace('layoutlm.', 'layoutlmv3.'): value for key, value in model_state.items()}
+#         model.load_state_dict(fixed_state_dict)
+#         model.to(device)
+#         model.eval()
+#         print(f"✅ LayoutLMv3 Model loaded successfully from {os.path.basename(model_path)}.")
+#     except Exception as e:
+#         print(f"❌ FATAL ERROR during LayoutLMv3 model loading: {e}")
+#         return []
+#     try:
+#         with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
+#             preprocessed_data = json.load(f)
+#         print(f"✅ Loaded preprocessed data with {len(preprocessed_data)} pages.")
+#     except Exception:
+#         print("❌ Error loading preprocessed JSON.")
+#         return []
+#     try:
+#         doc = fitz.open(pdf_path)
+#     except Exception:
+#         print("❌ Error loading PDF.")
+#         return []
+#     final_page_predictions = []
+#     CHUNK_SIZE = 500
+#     for page_data in preprocessed_data:
+#         page_num_1_based = page_data['page_number']
+#         page_num_0_based = page_num_1_based - 1
+#         page_raw_predictions = []
+#         print(f"\n  *** Processing Page {page_num_1_based} ({len(page_data['data'])} raw tokens) ***")
+#         fitz_page = doc.load_page(page_num_0_based)
+#         page_width, page_height = fitz_page.rect.width, fitz_page.rect.height
+#         print(f"    -> Page dimensions: {page_width:.0f}x{page_height:.0f} (PDF points).")
+#         all_token_data = []
+#         scale_factor = 2.0
+#         for item in page_data['data']:
+#             raw_yolo_bbox = item['bbox']
+#             bbox_pdf = [
+#                 int(raw_yolo_bbox[0] / scale_factor), int(raw_yolo_bbox[1] / scale_factor),
+#                 int(raw_yolo_bbox[2] / scale_factor), int(raw_yolo_bbox[3] / scale_factor)
+#             ]
+#             normalized_bbox = [
+#                 max(0, min(1000, int(1000 * bbox_pdf[0] / page_width))),
+#                 max(0, min(1000, int(1000 * bbox_pdf[1] / page_height))),
+#                 max(0, min(1000, int(1000 * bbox_pdf[2] / page_width))),
+#                 max(0, min(1000, int(1000 * bbox_pdf[3] / page_height)))
+#             ]
+#             all_token_data.append({
+#                 "word": item['word'],
+#                 "bbox_raw_pdf_space": bbox_pdf,
+#                 "bbox_normalized": normalized_bbox,
+#                 "item_original_data": item
+#             })
+#         # ==============================================================================
+#         # --- DEBUGGING BLOCK: CHECK FIRST 50 TOKENS BEFORE INFERENCE ---
+#         # ==============================================================================
+#         print(f"\n[DEBUG] LayoutLMv3 Input (Page {page_num_1_based}): Checking first 50 tokens...")
+#         debug_count = 0
+#         for t in all_token_data:
+#              if debug_count >= 50: break
+#              w = t['word']
+#              unicode_points = [f"\\u{ord(c):04x}" for c in w]
+#              print(f"  Token {debug_count}: '{w}' -> Codes: {unicode_points}")
+#              debug_count += 1
+#         print("----------------------------------------------------------------------\n")
+#         # ==============================================================================
+#         if not all_token_data:
+#             continue
+#         column_separator_x = page_data.get('column_separator_x', None)
+#         if column_separator_x is not None:
+#             print(f"    -> Using SAVED column separator: X={column_separator_x}")
+#         else:
+#             print("    -> No column separator found. Assuming single chunk.")
+#         token_chunks = _merge_integrity(all_token_data, column_separator_x)
+#         total_chunks = len(token_chunks)
+#         for chunk_idx, chunk_tokens in enumerate(token_chunks):
+#             if not chunk_tokens: continue
+#             # 1. Sanitize: Convert everything to strings and aggressively clean Unicode errors.
+#             chunk_words = [
+#                 str(t['word']).encode('utf-8', errors='ignore').decode('utf-8')
+#                 for t in chunk_tokens
+#             ]
+#             chunk_normalized_bboxes = [t['bbox_normalized'] for t in chunk_tokens]
+#             total_sub_chunks = (len(chunk_words) + CHUNK_SIZE - 1) // CHUNK_SIZE
+#             for i in range(0, len(chunk_words), CHUNK_SIZE):
+#                 sub_chunk_idx = i // CHUNK_SIZE + 1
+#                 sub_words = chunk_words[i:i + CHUNK_SIZE]
+#                 sub_bboxes = chunk_normalized_bboxes[i:i + CHUNK_SIZE]
+#                 sub_tokens_data = chunk_tokens[i:i + CHUNK_SIZE]
+#                 print(f"      -> Chunk {chunk_idx + 1}/{total_chunks}, Sub-chunk {sub_chunk_idx}/{total_sub_chunks}: {len(sub_words)} words. Running Inference...")
+#                 # 2. Manual generation of word_ids
+#                 manual_word_ids = []
+#                 for current_word_idx, word in enumerate(sub_words):
+#                     sub_tokens = tokenizer.tokenize(word)
+#                     for _ in sub_tokens:
+#                         manual_word_ids.append(current_word_idx)
+#                 encoded_input = tokenizer(
+#                     sub_words,
+#                     boxes=sub_bboxes,
+#                     truncation=True,
+#                     padding="max_length",
+#                     max_length=512,
+#                     is_split_into_words=True,
+#                     return_tensors="pt"
+#                 )
+#                 # Check for empty sequence
+#                 if encoded_input['input_ids'].shape[0] == 0:
+#                     print(f"        -> Warning: Sub-chunk {sub_chunk_idx} encoded to an empty sequence. Skipping.")
+#                     continue
+#                 # 3. Finalize word_ids based on encoded output length
+#                 sequence_length = int(torch.sum(encoded_input['attention_mask']).item())
+#                 content_token_length = max(0, sequence_length - 2)
+#                 manual_word_ids = manual_word_ids[:content_token_length]
+#                 final_word_ids = [None]  # CLS token (index 0)
+#                 final_word_ids.extend(manual_word_ids)
+#                 if sequence_length > 1:
+#                     final_word_ids.append(None)  # SEP token
+#                 final_word_ids.extend([None] * (512 - len(final_word_ids)))
+#                 word_ids = final_word_ids[:512]  # Final array for mapping
+#                 # Inputs are already batched by the tokenizer as [1, 512]
+#                 input_ids = encoded_input['input_ids'].to(device)
+#                 bbox = encoded_input['bbox'].to(device)
+#                 attention_mask = encoded_input['attention_mask'].to(device)
+#                 with torch.no_grad():
+#                     model_outputs = model(input_ids, bbox, attention_mask)
+#                 # --- Robust extraction: support several forward return types ---
+#                 logits_tensor = None
+#                 decoded_labels_list = None
+#                 # case 1: tuple/list with (emissions, viterbi)
+#                 if isinstance(model_outputs, (tuple, list)) and len(model_outputs) == 2:
+#                     a, b = model_outputs
+#                     if isinstance(a, torch.Tensor):
+#                         logits_tensor = a
+#                     if isinstance(b, list):
+#                         decoded_labels_list = b
+#                 # case 2: HF ModelOutput with .logits
+#                 if logits_tensor is None and hasattr(model_outputs, 'logits') and isinstance(model_outputs.logits, torch.Tensor):
+#                     logits_tensor = model_outputs.logits
+#                 # case 3: tuple/list - search for a 3D tensor (B, L, C)
+#                 if logits_tensor is None and isinstance(model_outputs, (tuple, list)):
+#                     found_tensor = None
+#                     for item in model_outputs:
+#                         if isinstance(item, torch.Tensor):
+#                             if item.dim() == 3:
+#                                 logits_tensor = item
+#                                 break
+#                             if found_tensor is None:
+#                                 found_tensor = item
+#                     if logits_tensor is None and found_tensor is not None:
+#                         if found_tensor.dim() == 3 and found_tensor.shape[-1] == NUM_LABELS:
+#                             logits_tensor = found_tensor
+#                         elif found_tensor.dim() == 2 and found_tensor.shape[-1] == NUM_LABELS:
+#                             logits_tensor = found_tensor.unsqueeze(0)
+#                 # case 4: model_outputs directly a tensor
+#                 if logits_tensor is None and isinstance(model_outputs, torch.Tensor):
+#                     logits_tensor = model_outputs
+#                 # case 5: model_outputs is a decoded viterbi list (common for CRF-only forward)
+#                 if decoded_labels_list is None and isinstance(model_outputs, list) and model_outputs and isinstance(model_outputs[0], list):
+#                     decoded_labels_list = model_outputs
+#                 # If neither logits nor decoded exist, that's fatal
+#                 if logits_tensor is None and decoded_labels_list is None:
+#                     try:
+#                         elem_shapes = [ (type(x), getattr(x, 'shape', None)) for x in model_outputs ] if isinstance(model_outputs, (list, tuple)) else [(type(model_outputs), getattr(model_outputs, 'shape', None))]
+#                     except Exception:
+#                         elem_shapes = str(type(model_outputs))
+#                     raise RuntimeError(f"Model output of type {type(model_outputs)} did not contain a valid logits tensor or decoded viterbi. Contents: {elem_shapes}")
+#                 # If we have logits_tensor, normalize shape to [seq_len, num_labels]
+#                 if logits_tensor is not None:
+#                     if logits_tensor.dim() == 3 and logits_tensor.shape[0] == 1:
+#                         preds_tensor = logits_tensor.squeeze(0)  # [L, C]
+#                     else:
+#                         preds_tensor = logits_tensor  # possibly [L, C] already
+#                     if preds_tensor.dim() != 2:
+#                         raise RuntimeError(f"Unexpected logits tensor shape: {tuple(preds_tensor.shape)}")
+#                 else:
+#                     preds_tensor = None  # no logits available
+#                 # If decoded labels provided, make a token-level list-of-ints aligned to tokenizer tokens
+#                 decoded_token_labels = None
+#                 if decoded_labels_list is not None:
+#                     decoded_token_labels = decoded_labels_list[0] if isinstance(decoded_labels_list[0], list) else decoded_labels_list
+#                 # Now map token-level predictions -> word-level predictions using word_ids
+#                 word_idx_to_pred_id = {}
+#                 if preds_tensor is not None:
+#                     for token_idx, word_idx in enumerate(word_ids):
+#                         if token_idx >= sequence_length:
+#                             break
+#                         if word_idx is not None and word_idx < len(sub_words):
+#                             if word_idx not in word_idx_to_pred_id:
+#                                 pred_id = torch.argmax(preds_tensor[token_idx]).item()
+#                                 word_idx_to_pred_id[word_idx] = pred_id
+#                 else:
+#                     if decoded_token_labels is None:
+#                         raise RuntimeError("No logits and no decoded labels available for mapping.")
+#                     decoded_len = len(decoded_token_labels)
+#                     if decoded_len == content_token_length:
+#                         decoded_start = 1
+#                     elif decoded_len == sequence_length:
+#                         decoded_start = 0
+#                     else:
+#                         decoded_start = 1
+#                     for tok_idx_in_decoded, label_id in enumerate(decoded_token_labels):
+#                         tok_idx = decoded_start + tok_idx_in_decoded
+#                         if tok_idx >= 512:
+#                             break
+#                         if tok_idx >= sequence_length:
+#                             break
+#                         word_idx = word_ids[tok_idx] if tok_idx < len(word_ids) else None
+#                         if word_idx is not None and word_idx < len(sub_words):
+#                             if word_idx not in word_idx_to_pred_id:
+#                                 word_idx_to_pred_id[word_idx] = int(label_id)
+#                 # Finally convert mapped word preds -> page_raw_predictions entries
+#                 for current_word_idx in range(len(sub_words)):
+#                     pred_id = word_idx_to_pred_id.get(current_word_idx, 0)  # default to 0
+#                     predicted_label = ID_TO_LABEL[pred_id]
+#                     original_token = sub_tokens_data[current_word_idx]
+#                     page_raw_predictions.append({
+#                         "word": original_token['word'],
+#                         "bbox": original_token['bbox_raw_pdf_space'],
+#                         "predicted_label": predicted_label,
+#                         "page_number": page_num_1_based
+#                     })
+#         if page_raw_predictions:
+#             final_page_predictions.append({
+#                 "page_number": page_num_1_based,
+#                 "data": page_raw_predictions
+#             })
+#             print(f"  *** Page {page_num_1_based} Finalized: {len(page_raw_predictions)} labeled words. ***")
+#     doc.close()
+#     print("\n" + "=" * 80)
+#     print("--- LAYOUTLMV3 INFERENCE COMPLETE ---")
+#     print("=" * 80)
+#     return final_page_predictions
 # ============================================================================
 # --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
 # ============================================================================
+# def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
+#     List[Dict[str, Any]]]:
+def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
+    if not os.path.exists(input_pdf_path): return None
+    print("\n" + "#" * 80)
+    print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
+    print("#" * 80)
+    pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
+    temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
+    os.makedirs(temp_pipeline_dir, exist_ok=True)
+    preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
+    raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
+    structured_intermediate_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json")
+    final_result = None
     try:
+        # Phase 1: Preprocessing with YOLO First + Masking
+        preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
+        if not preprocessed_json_path_out: return None
+        # Phase 2: Inference
+        page_raw_predictions_list = run_inference_and_get_raw_words(
+            input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
+        )
+        if not page_raw_predictions_list: return None
+        # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
+        # Save raw predictions to the temporary file
+        with open(raw_output_path, 'w', encoding='utf-8') as f:
+            json.dump(page_raw_predictions_list, f, indent=4)
+        # Explicitly copy/save the raw predictions to the user-specified debug path
+        # if raw_predictions_output_path:
+        #     shutil.copy(raw_output_path, raw_predictions_output_path)
+        #     print(f"\n✅ DEBUG: Raw predictions saved to: {raw_predictions_output_path}")
+        # ----------------------------------------
+        # Phase 3: Decoding
+        structured_data_list = convert_bio_to_structured_json_relaxed(
+            raw_output_path, structured_intermediate_output_path
+        )
+        if not structured_data_list: return None
+        structured_data_list = correct_misaligned_options(structured_data_list)
+        structured_data_list = process_context_linking(structured_data_list)
+        # Phase 4: Embedding / Equation to LaTeX Conversion
+        final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
+#================================================================================
+        # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
+#================================================================================
+        print("\n" + "=" * 80)
+        print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
+        print("=" * 80)
+        # 1. Initialize and Load the Classifier
         classifier = HierarchicalClassifier()
         if classifier.load_models():
+            # 2. Run Classification on the *Final* Result
+            # The function modifies the list in place and returns it
+            final_result = post_process_json_with_inference(
+                final_result, classifier
+            )
+            print("✅ Classification complete. Tags added to final output.")
+        else:
+            print("❌ Classification model loading failed. Outputting un-tagged data.")
+        # ====================================================================
     except Exception as e:
+        print(f"❌ FATAL ERROR: {e}")
         import traceback
         traceback.print_exc()
         return None
+    finally:
+        try:
+            for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
+                os.remove(f)
+            os.rmdir(temp_pipeline_dir)
+        except Exception:
+            pass
+    print("\n" + "#" * 80)
+    print("### OPTIMIZED PIPELINE EXECUTION COMPLETE ###")
+    print("#" * 80)
+    return final_result