Spaces:

heerjtdev
/

layout_latex

Running

App Files Community

heerjtdev committed on Dec 5, 2025

Commit

2e5b054

verified ·

1 Parent(s): 9c47ab0

Update working_yolo_pipeline.py

Browse files

Files changed (1) hide show

working_yolo_pipeline.py +722 -33

working_yolo_pipeline.py CHANGED Viewed

@@ -92,6 +92,60 @@ def sanitize_text(text: Optional[str]) -> str:
 def get_latex_from_base64(base64_string: str) -> str:
     """
     Decodes a Base64 image string and uses the pre-initialized TrOCR/ORT model
@@ -118,6 +172,12 @@ def get_latex_from_base64(base64_string: str) -> str:
             return "[OCR_WARNING: No formula found]"
         latex_string = raw_generated_text[0]
         # --- 4. Post-processing and Cleanup ---
@@ -580,8 +640,53 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
 def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
     raw_word_data = fitz_page.get_text("words")
     converted_ocr_output = []
     DEFAULT_CONFIDENCE = 99.0
@@ -796,6 +901,275 @@ def post_process_json_with_inference(json_data, classifier):
 def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                             page_num: int, fitz_page: fitz.Page,
                             pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
@@ -968,6 +1342,21 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                     config=custom_config
                 )
                 for i in range(len(hocr_data['level'])):
                     text = hocr_data['text'][i] # Retrieve raw Tesseract text
@@ -1053,6 +1442,12 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     return final_output, page_separator_x
 def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
@@ -1197,6 +1592,319 @@ def _merge_integrity(all_token_data: List[Dict[str, Any]],
 def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
                                     preprocessed_json_path: str,
                                     column_detection_params: Optional[Dict] = None) -> List[Dict[str, Any]]:
@@ -1271,6 +1979,20 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
                 "item_original_data": item
             })
         if not all_token_data:
             continue
@@ -1348,19 +2070,12 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
                     model_outputs = model(input_ids, bbox, attention_mask)
                 # --- Robust extraction: support several forward return types ---
-                # We'll try (in order):
-                # 1) model_outputs is (emissions_tensor, viterbi_list)  -> use emissions for logits, keep decoded
-                # 2) model_outputs has .logits attribute (HF ModelOutput)
-                # 3) model_outputs is tuple/list containing a logits tensor
-                # 4) model_outputs is a tensor (assume logits)
-                # 5) model_outputs is a list-of-lists of ints (viterbi decoded) -> use that directly (no logits)
                 logits_tensor = None
                 decoded_labels_list = None
                 # case 1: tuple/list with (emissions, viterbi)
                 if isinstance(model_outputs, (tuple, list)) and len(model_outputs) == 2:
                     a, b = model_outputs
-                    # a might be tensor (emissions), b might be viterbi list
                     if isinstance(a, torch.Tensor):
                         logits_tensor = a
                     if isinstance(b, list):
@@ -1375,15 +2090,12 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
                     found_tensor = None
                     for item in model_outputs:
                         if isinstance(item, torch.Tensor):
-                            # prefer 3D (batch, seq, labels)
                             if item.dim() == 3:
                                 logits_tensor = item
                                 break
                             if found_tensor is None:
                                 found_tensor = item
                     if logits_tensor is None and found_tensor is not None:
-                        # found_tensor may be (batch, seq, hidden) or (seq, hidden); we avoid guessing.
-                        # Keep found_tensor only if it matches num_labels dimension
                         if found_tensor.dim() == 3 and found_tensor.shape[-1] == NUM_LABELS:
                             logits_tensor = found_tensor
                         elif found_tensor.dim() == 2 and found_tensor.shape[-1] == NUM_LABELS:
@@ -1395,12 +2107,10 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
                 # case 5: model_outputs is a decoded viterbi list (common for CRF-only forward)
                 if decoded_labels_list is None and isinstance(model_outputs, list) and model_outputs and isinstance(model_outputs[0], list):
-                    # assume model_outputs is already viterbi decoded: List[List[int]] with batch dim first
                     decoded_labels_list = model_outputs
                 # If neither logits nor decoded exist, that's fatal
                 if logits_tensor is None and decoded_labels_list is None:
-                    # helpful debug info
                     try:
                         elem_shapes = [ (type(x), getattr(x, 'shape', None)) for x in model_outputs ] if isinstance(model_outputs, (list, tuple)) else [(type(model_outputs), getattr(model_outputs, 'shape', None))]
                     except Exception:
@@ -1409,32 +2119,25 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
                 # If we have logits_tensor, normalize shape to [seq_len, num_labels]
                 if logits_tensor is not None:
-                    # If shape is [B, L, C] with B==1, squeeze batch
                     if logits_tensor.dim() == 3 and logits_tensor.shape[0] == 1:
                         preds_tensor = logits_tensor.squeeze(0)  # [L, C]
                     else:
                         preds_tensor = logits_tensor  # possibly [L, C] already
-                    # Safety: ensure we have at least seq_len x channels
                     if preds_tensor.dim() != 2:
-                        # try to reshape or error
                         raise RuntimeError(f"Unexpected logits tensor shape: {tuple(preds_tensor.shape)}")
-                    # We'll use preds_tensor[token_idx] to argmax
                 else:
                     preds_tensor = None  # no logits available
                 # If decoded labels provided, make a token-level list-of-ints aligned to tokenizer tokens
                 decoded_token_labels = None
                 if decoded_labels_list is not None:
-                    # decoded_labels_list is batch-first; we used batch size 1
-                    # if multiple sequences returned, take first
                     decoded_token_labels = decoded_labels_list[0] if isinstance(decoded_labels_list[0], list) else decoded_labels_list
                 # Now map token-level predictions -> word-level predictions using word_ids
                 word_idx_to_pred_id = {}
                 if preds_tensor is not None:
-                    # We have logits. Use argmax of logits for each token id up to sequence_length
                     for token_idx, word_idx in enumerate(word_ids):
                         if token_idx >= sequence_length:
                             break
@@ -1443,26 +2146,14 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
                                 pred_id = torch.argmax(preds_tensor[token_idx]).item()
                                 word_idx_to_pred_id[word_idx] = pred_id
                 else:
-                    # No logits, but we have decoded_token_labels from CRF (one label per token)
-                    # We'll align decoded_token_labels to token positions.
                     if decoded_token_labels is None:
-                        # should not happen due to earlier checks
                         raise RuntimeError("No logits and no decoded labels available for mapping.")
-                    # decoded_token_labels length may be equal to content_token_length (no special tokens)
-                    # or equal to sequence_length; try to align intelligently:
-                    # Prefer using decoded_token_labels aligned to the tokenizer tokens (starting at token 1 for CLS)
-                    # If decoded length == content_token_length, then manual_word_ids maps sub-token -> word idx for content tokens only.
-                    # We'll iterate tokens and pick label accordingly.
-                    # Build token_idx -> decoded_label mapping:
-                    # We'll assume decoded_token_labels correspond to content tokens (no CLS/SEP). If decoded length == sequence_length, then shift by 0.
                     decoded_len = len(decoded_token_labels)
-                    # Heuristic: if decoded_len == content_token_length -> alignment starts at token_idx 1 (skip CLS)
                     if decoded_len == content_token_length:
                         decoded_start = 1
                     elif decoded_len == sequence_length:
                         decoded_start = 0
                     else:
-                        # fallback: prefer decoded_start=1 (most common)
                         decoded_start = 1
                     for tok_idx_in_decoded, label_id in enumerate(decoded_token_labels):
@@ -1471,11 +2162,9 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
                             break
                         if tok_idx >= sequence_length:
                             break
-                        # map this token to a word index if present
                         word_idx = word_ids[tok_idx] if tok_idx < len(word_ids) else None
                         if word_idx is not None and word_idx < len(sub_words):
                             if word_idx not in word_idx_to_pred_id:
-                                # label_id may already be an int
                                 word_idx_to_pred_id[word_idx] = int(label_id)
                 # Finally convert mapped word preds -> page_raw_predictions entries

+# def get_latex_from_base64(base64_string: str) -> str:
+#     """
+#     Decodes a Base64 image string and uses the pre-initialized TrOCR/ORT model
+#     to recognize the formula. It cleans the output by removing spaces and
+#     crucially, replacing double backslashes with single backslashes for correct LaTeX.
+#     """
+#     if ort_model is None or processor is None:
+#         return "[MODEL_ERROR: Model not initialized]"
+#     try:
+#         # 1. Decode Base64 to Image
+#         image_data = base64.b64decode(base64_string)
+#         # We must ensure the image is RGB format for the model input
+#         image = Image.open(io.BytesIO(image_data)).convert('RGB')
+#         # 2. Preprocess the image
+#         pixel_values = processor(images=image, return_tensors="pt").pixel_values
+#         # 3. Text Generation (OCR)
+#         generated_ids = ort_model.generate(pixel_values)
+#         raw_generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+#         if not raw_generated_text:
+#             return "[OCR_WARNING: No formula found]"
+#         latex_string = raw_generated_text[0]
+#         # --- 4. Post-processing and Cleanup ---
+#         # # A. Remove all spaces/line breaks
+#         # cleaned_latex = re.sub(r'\s+', '', latex_string)
+#         cleaned_latex = re.sub(r'[\r\n]+', '', latex_string)
+#         # B. CRITICAL FIX: Replace double backslashes (\\) with single backslashes (\).
+#         # This corrects model output that already over-escaped the LaTeX commands.
+#         # Python literal: '\\\\' is replaced with '\\'.
+#         #cleaned_latex = cleaned_latex.replace('\\\\', '\\')
+#         return cleaned_latex
+#     except Exception as e:
+#         # Catch any unexpected errors
+#         print(f"  ❌ TR-OCR Recognition failed: {e}")
+#         return f"[TR_OCR_ERROR: Recognition failed: {e}]"
 def get_latex_from_base64(base64_string: str) -> str:
     """
     Decodes a Base64 image string and uses the pre-initialized TrOCR/ORT model
             return "[OCR_WARNING: No formula found]"
         latex_string = raw_generated_text[0]
+        # ==============================================================================
+        # --- DEBUGGING BLOCK: CHECK TrOCR RAW OUTPUT ---
+        # ==============================================================================
+        print(f"[DEBUG] TrOCR Raw Output: '{latex_string}'")
+        # ==============================================================================
         # --- 4. Post-processing and Cleanup ---
+# def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
+#     raw_word_data = fitz_page.get_text("words")
+#     converted_ocr_output = []
+#     DEFAULT_CONFIDENCE = 99.0
+#     for x1, y1, x2, y2, word, *rest in raw_word_data:
+#         # --- FIX: SANITIZE TEXT HERE ---
+#         cleaned_word = sanitize_text(word)
+#         if not cleaned_word.strip(): continue
+#         x1_pix = int(x1 * scale_factor)
+#         y1_pix = int(y1 * scale_factor)
+#         x2_pix = int(x2 * scale_factor)
+#         y2_pix = int(y2 * scale_factor)
+#         converted_ocr_output.append({
+#             'type': 'text',
+#             'word': cleaned_word, # Use the sanitized word
+#             'confidence': DEFAULT_CONFIDENCE,
+#             'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
+#             'y0': y1_pix, 'x0': x1_pix
+#         })
+#     return converted_ocr_output
 def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
     raw_word_data = fitz_page.get_text("words")
+    # ==============================================================================
+    # --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS ---
+    # ==============================================================================
+    print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")
+    debug_count = 0
+    for item in raw_word_data:
+        if debug_count >= 50: break
+        # item format: (x0, y0, x1, y1, word, block_no, line_no, word_no)
+        word_text = item[4]
+        # Generate unicode hex codes for every character in the word
+        unicode_points = [f"\\u{ord(c):04x}" for c in word_text]
+        print(f"  Word {debug_count}: '{word_text}' -> Codes: {unicode_points}")
+        debug_count += 1
+    print("----------------------------------------------------------------------\n")
+    # ==============================================================================
     converted_ocr_output = []
     DEFAULT_CONFIDENCE = 99.0
+# def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
+#                             page_num: int, fitz_page: fitz.Page,
+#                             pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
+#     """
+#     OPTIMIZED FLOW:
+#     1. Run YOLO to find Equations/Tables.
+#     2. Mask raw text with YOLO boxes.
+#     3. Run Column Detection on the MASKED data.
+#     4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
+#     """
+#     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
+#     start_time_total = time.time()
+#     if original_img is None:
+#         print(f"  ❌ Invalid image for page {page_num}.")
+#         return None, None
+#     # ====================================================================
+#     # --- STEP 1: YOLO DETECTION ---
+#     # ====================================================================
+#     start_time_yolo = time.time()
+#     results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
+#     relevant_detections = []
+#     if results and results[0].boxes:
+#         for box in results[0].boxes:
+#             class_id = int(box.cls[0])
+#             class_name = model.names[class_id]
+#             if class_name in TARGET_CLASSES:
+#                 x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
+#                 relevant_detections.append(
+#                     {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
+#                 )
+#     merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
+#     print(f"    [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
+#     # ====================================================================
+#     # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
+#     # ====================================================================
+#     # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
+#     raw_words_for_layout = get_word_data_for_detection(
+#         fitz_page, pdf_path, page_num,
+#         top_margin_percent=0.10, bottom_margin_percent=0.10
+#     )
+#     masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
+#     # ====================================================================
+#     # --- STEP 3: COLUMN DETECTION ---
+#     # ====================================================================
+#     page_width_pdf = fitz_page.rect.width
+#     page_height_pdf = fitz_page.rect.height
+#     column_detection_params = {
+#         'cluster_bin_size': 2, 'cluster_smoothing': 2,
+#         'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
+#     }
+#     separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
+#     page_separator_x = None
+#     if separators:
+#         central_min = page_width_pdf * 0.35
+#         central_max = page_width_pdf * 0.65
+#         central_separators = [s for s in separators if central_min <= s <= central_max]
+#         if central_separators:
+#             center_x = page_width_pdf / 2
+#             page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
+#             print(f"      ✅ Column Split Confirmed at X={page_separator_x:.1f}")
+#         else:
+#             print("      ⚠️ Gutter found off-center. Ignoring.")
+#     else:
+#         print("      -> Single Column Layout Confirmed.")
+#     # ====================================================================
+#     # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
+#     # ====================================================================
+#     start_time_components = time.time()
+#     component_metadata = []
+#     fig_count_page = 0
+#     eq_count_page = 0
+#     for detection in merged_detections:
+#         x1, y1, x2, y2 = detection['coords']
+#         class_name = detection['class']
+#         if class_name == 'figure':
+#             GLOBAL_FIGURE_COUNT += 1
+#             counter = GLOBAL_FIGURE_COUNT
+#             component_word = f"FIGURE{counter}"
+#             fig_count_page += 1
+#         elif class_name == 'equation':
+#             GLOBAL_EQUATION_COUNT += 1
+#             counter = GLOBAL_EQUATION_COUNT
+#             component_word = f"EQUATION{counter}"
+#             eq_count_page += 1
+#         else:
+#             continue
+#         component_crop = original_img[y1:y2, x1:x2]
+#         component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
+#         cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
+#         y_midpoint = (y1 + y2) // 2
+#         component_metadata.append({
+#             'type': class_name, 'word': component_word,
+#             'bbox': [int(x1), int(y1), int(x2), int(y2)],
+#             'y0': int(y_midpoint), 'x0': int(x1)
+#         })
+#     # ====================================================================
+#     # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
+#     # ====================================================================
+#     raw_ocr_output = []
+#     scale_factor = 2.0  # Pipeline standard scale
+#     try:
+#         # Try getting native text first
+#         # NOTE: extract_native_words_and_convert MUST ALSO BE UPDATED TO USE sanitize_text
+#         raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
+#     except Exception as e:
+#         print(f"  ❌ Native text extraction failed: {e}")
+#     # If native text is missing, fall back to OCR
+#     if not raw_ocr_output:
+#         if _ocr_cache.has_ocr(pdf_path, page_num):
+#             print(f"  ⚡ Using cached Tesseract OCR for page {page_num}")
+#             cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
+#             for word_tuple in cached_word_data:
+#                 word_text, x1, y1, x2, y2 = word_tuple
+#                 # Scale from PDF points to Pipeline Pixels (2.0)
+#                 x1_pix = int(x1 * scale_factor)
+#                 y1_pix = int(y1 * scale_factor)
+#                 x2_pix = int(x2 * scale_factor)
+#                 y2_pix = int(y2 * scale_factor)
+#                 raw_ocr_output.append({
+#                     'type': 'text', 'word': word_text, 'confidence': 95.0,
+#                     'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
+#                     'y0': y1_pix, 'x0': x1_pix
+#                 })
+#         else:
+#             # === START OF OPTIMIZED OCR BLOCK ===
+#             try:
+#                 # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
+#                 ocr_zoom = 4.0
+#                 pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
+#                 # Convert PyMuPDF Pixmap to OpenCV format
+#                 img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
+#                                                                                     pix_ocr.n)
+#                 if pix_ocr.n == 3:
+#                     img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
+#                 elif pix_ocr.n == 4:
+#                     img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
+#                 # 2. Preprocess (Binarization)
+#                 processed_img = preprocess_image_for_ocr(img_ocr_np)
+#                 # 3. Run Tesseract with Optimized Configuration
+#                 custom_config = r'--oem 3 --psm 6'
+#                 hocr_data = pytesseract.image_to_data(
+#                     processed_img,
+#                     output_type=pytesseract.Output.DICT,
+#                     config=custom_config
+#                 )
+#                 for i in range(len(hocr_data['level'])):
+#                     text = hocr_data['text'][i] # Retrieve raw Tesseract text
+#                     # --- FIX: SANITIZE TEXT AND THEN STRIP ---
+#                     cleaned_text = sanitize_text(text).strip()
+#                     if cleaned_text and hocr_data['conf'][i] > -1:
+#                         # 4. Coordinate Mapping
+#                         scale_adjustment = scale_factor / ocr_zoom
+#                         x1 = int(hocr_data['left'][i] * scale_adjustment)
+#                         y1 = int(hocr_data['top'][i] * scale_adjustment)
+#                         w = int(hocr_data['width'][i] * scale_adjustment)
+#                         h = int(hocr_data['height'][i] * scale_adjustment)
+#                         x2 = x1 + w
+#                         y2 = y1 + h
+#                         raw_ocr_output.append({
+#                             'type': 'text',
+#                             'word': cleaned_text, # Use the sanitized word
+#                             'confidence': float(hocr_data['conf'][i]),
+#                             'bbox': [x1, y1, x2, y2],
+#                             'y0': y1,
+#                             'x0': x1
+#                         })
+#             except Exception as e:
+#                 print(f"  ❌ Tesseract OCR Error: {e}")
+#             # === END OF OPTIMIZED OCR BLOCK ===
+#     # ====================================================================
+#     # --- STEP 6: OCR CLEANING AND MERGING ---
+#     # ====================================================================
+#     items_to_sort = []
+#     for ocr_word in raw_ocr_output:
+#         is_suppressed = False
+#         for component in component_metadata:
+#             # Do not include words that are inside figure/equation boxes
+#             ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
+#             if ioa > IOA_SUPPRESSION_THRESHOLD:
+#                 is_suppressed = True
+#                 break
+#         if not is_suppressed:
+#             items_to_sort.append(ocr_word)
+#     # Add figures/equations back into the flow as "words"
+#     items_to_sort.extend(component_metadata)
+#     # ====================================================================
+#     # --- STEP 7: LINE-BASED SORTING ---
+#     # ====================================================================
+#     items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
+#     lines = []
+#     for item in items_to_sort:
+#         placed = False
+#         for line in lines:
+#             y_ref = min(it['y0'] for it in line)
+#             if abs(y_ref - item['y0']) < LINE_TOLERANCE:
+#                 line.append(item)
+#                 placed = True
+#                 break
+#         if not placed and item['type'] in ['equation', 'figure']:
+#             for line in lines:
+#                 y_ref = min(it['y0'] for it in line)
+#                 if abs(y_ref - item['y0']) < 20:
+#                     line.append(item)
+#                     placed = True
+#                     break
+#         if not placed:
+#             lines.append([item])
+#     for line in lines:
+#         line.sort(key=lambda x: x['x0'])
+#     final_output = []
+#     for line in lines:
+#         for item in line:
+#             data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
+#             if 'tag' in item: data_item['tag'] = item['tag']
+#             final_output.append(data_item)
+#     return final_output, page_separator_x
 def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                             page_num: int, fitz_page: fitz.Page,
                             pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
                     config=custom_config
                 )
+                # ==============================================================================
+                # --- DEBUGGING BLOCK: CHECK FIRST 50 OCR WORDS ---
+                # ==============================================================================
+                print(f"\n[DEBUG] Tesseract OCR Fallback (Page {page_num}): Checking first 50 words...")
+                debug_count = 0
+                for i in range(len(hocr_data['level'])):
+                    text = hocr_data['text'][i].strip()
+                    if text:
+                        unicode_points = [f"\\u{ord(c):04x}" for c in text]
+                        print(f"  OCR Word {debug_count}: '{text}' -> Codes: {unicode_points}")
+                        debug_count += 1
+                        if debug_count >= 50: break
+                print("----------------------------------------------------------------------\n")
+                # ==============================================================================
                 for i in range(len(hocr_data['level'])):
                     text = hocr_data['text'][i] # Retrieve raw Tesseract text
     return final_output, page_separator_x
 def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
+# def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
+#                                     preprocessed_json_path: str,
+#                                     column_detection_params: Optional[Dict] = None) -> List[Dict[str, Any]]:
+#     print("\n" + "=" * 80)
+#     print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE (Raw Word Output) ---")
+#     print("=" * 80)
+#     tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
+#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+#     print(f"  -> Using device: {device}")
+#     try:
+#         model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
+#         checkpoint = torch.load(model_path, map_location=device)
+#         model_state = checkpoint.get('model_state_dict', checkpoint)
+#         # Apply patch for layoutlmv3 compatibility with saved state_dict
+#         fixed_state_dict = {key.replace('layoutlm.', 'layoutlmv3.'): value for key, value in model_state.items()}
+#         model.load_state_dict(fixed_state_dict)
+#         model.to(device)
+#         model.eval()
+#         print(f"✅ LayoutLMv3 Model loaded successfully from {os.path.basename(model_path)}.")
+#     except Exception as e:
+#         print(f"❌ FATAL ERROR during LayoutLMv3 model loading: {e}")
+#         return []
+#     try:
+#         with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
+#             preprocessed_data = json.load(f)
+#         print(f"✅ Loaded preprocessed data with {len(preprocessed_data)} pages.")
+#     except Exception:
+#         print("❌ Error loading preprocessed JSON.")
+#         return []
+#     try:
+#         doc = fitz.open(pdf_path)
+#     except Exception:
+#         print("❌ Error loading PDF.")
+#         return []
+#     final_page_predictions = []
+#     CHUNK_SIZE = 500
+#     for page_data in preprocessed_data:
+#         page_num_1_based = page_data['page_number']
+#         page_num_0_based = page_num_1_based - 1
+#         page_raw_predictions = []
+#         print(f"\n  *** Processing Page {page_num_1_based} ({len(page_data['data'])} raw tokens) ***")
+#         fitz_page = doc.load_page(page_num_0_based)
+#         page_width, page_height = fitz_page.rect.width, fitz_page.rect.height
+#         print(f"    -> Page dimensions: {page_width:.0f}x{page_height:.0f} (PDF points).")
+#         all_token_data = []
+#         scale_factor = 2.0
+#         for item in page_data['data']:
+#             raw_yolo_bbox = item['bbox']
+#             bbox_pdf = [
+#                 int(raw_yolo_bbox[0] / scale_factor), int(raw_yolo_bbox[1] / scale_factor),
+#                 int(raw_yolo_bbox[2] / scale_factor), int(raw_yolo_bbox[3] / scale_factor)
+#             ]
+#             normalized_bbox = [
+#                 max(0, min(1000, int(1000 * bbox_pdf[0] / page_width))),
+#                 max(0, min(1000, int(1000 * bbox_pdf[1] / page_height))),
+#                 max(0, min(1000, int(1000 * bbox_pdf[2] / page_width))),
+#                 max(0, min(1000, int(1000 * bbox_pdf[3] / page_height)))
+#             ]
+#             all_token_data.append({
+#                 "word": item['word'],
+#                 "bbox_raw_pdf_space": bbox_pdf,
+#                 "bbox_normalized": normalized_bbox,
+#                 "item_original_data": item
+#             })
+#         if not all_token_data:
+#             continue
+#         column_separator_x = page_data.get('column_separator_x', None)
+#         if column_separator_x is not None:
+#             print(f"    -> Using SAVED column separator: X={column_separator_x}")
+#         else:
+#             print("    -> No column separator found. Assuming single chunk.")
+#         token_chunks = _merge_integrity(all_token_data, column_separator_x)
+#         total_chunks = len(token_chunks)
+#         for chunk_idx, chunk_tokens in enumerate(token_chunks):
+#             if not chunk_tokens: continue
+#             # 1. Sanitize: Convert everything to strings and aggressively clean Unicode errors.
+#             chunk_words = [
+#                 str(t['word']).encode('utf-8', errors='ignore').decode('utf-8')
+#                 for t in chunk_tokens
+#             ]
+#             chunk_normalized_bboxes = [t['bbox_normalized'] for t in chunk_tokens]
+#             total_sub_chunks = (len(chunk_words) + CHUNK_SIZE - 1) // CHUNK_SIZE
+#             for i in range(0, len(chunk_words), CHUNK_SIZE):
+#                 sub_chunk_idx = i // CHUNK_SIZE + 1
+#                 sub_words = chunk_words[i:i + CHUNK_SIZE]
+#                 sub_bboxes = chunk_normalized_bboxes[i:i + CHUNK_SIZE]
+#                 sub_tokens_data = chunk_tokens[i:i + CHUNK_SIZE]
+#                 print(f"      -> Chunk {chunk_idx + 1}/{total_chunks}, Sub-chunk {sub_chunk_idx}/{total_sub_chunks}: {len(sub_words)} words. Running Inference...")
+#                 # 2. Manual generation of word_ids
+#                 manual_word_ids = []
+#                 for current_word_idx, word in enumerate(sub_words):
+#                     sub_tokens = tokenizer.tokenize(word)
+#                     for _ in sub_tokens:
+#                         manual_word_ids.append(current_word_idx)
+#                 encoded_input = tokenizer(
+#                     sub_words,
+#                     boxes=sub_bboxes,
+#                     truncation=True,
+#                     padding="max_length",
+#                     max_length=512,
+#                     is_split_into_words=True,
+#                     return_tensors="pt"
+#                 )
+#                 # Check for empty sequence
+#                 if encoded_input['input_ids'].shape[0] == 0:
+#                     print(f"        -> Warning: Sub-chunk {sub_chunk_idx} encoded to an empty sequence. Skipping.")
+#                     continue
+#                 # 3. Finalize word_ids based on encoded output length
+#                 sequence_length = int(torch.sum(encoded_input['attention_mask']).item())
+#                 content_token_length = max(0, sequence_length - 2)
+#                 manual_word_ids = manual_word_ids[:content_token_length]
+#                 final_word_ids = [None]  # CLS token (index 0)
+#                 final_word_ids.extend(manual_word_ids)
+#                 if sequence_length > 1:
+#                     final_word_ids.append(None)  # SEP token
+#                 final_word_ids.extend([None] * (512 - len(final_word_ids)))
+#                 word_ids = final_word_ids[:512]  # Final array for mapping
+#                 # Inputs are already batched by the tokenizer as [1, 512]
+#                 input_ids = encoded_input['input_ids'].to(device)
+#                 bbox = encoded_input['bbox'].to(device)
+#                 attention_mask = encoded_input['attention_mask'].to(device)
+#                 with torch.no_grad():
+#                     model_outputs = model(input_ids, bbox, attention_mask)
+#                 # --- Robust extraction: support several forward return types ---
+#                 # We'll try (in order):
+#                 # 1) model_outputs is (emissions_tensor, viterbi_list)  -> use emissions for logits, keep decoded
+#                 # 2) model_outputs has .logits attribute (HF ModelOutput)
+#                 # 3) model_outputs is tuple/list containing a logits tensor
+#                 # 4) model_outputs is a tensor (assume logits)
+#                 # 5) model_outputs is a list-of-lists of ints (viterbi decoded) -> use that directly (no logits)
+#                 logits_tensor = None
+#                 decoded_labels_list = None
+#                 # case 1: tuple/list with (emissions, viterbi)
+#                 if isinstance(model_outputs, (tuple, list)) and len(model_outputs) == 2:
+#                     a, b = model_outputs
+#                     # a might be tensor (emissions), b might be viterbi list
+#                     if isinstance(a, torch.Tensor):
+#                         logits_tensor = a
+#                     if isinstance(b, list):
+#                         decoded_labels_list = b
+#                 # case 2: HF ModelOutput with .logits
+#                 if logits_tensor is None and hasattr(model_outputs, 'logits') and isinstance(model_outputs.logits, torch.Tensor):
+#                     logits_tensor = model_outputs.logits
+#                 # case 3: tuple/list - search for a 3D tensor (B, L, C)
+#                 if logits_tensor is None and isinstance(model_outputs, (tuple, list)):
+#                     found_tensor = None
+#                     for item in model_outputs:
+#                         if isinstance(item, torch.Tensor):
+#                             # prefer 3D (batch, seq, labels)
+#                             if item.dim() == 3:
+#                                 logits_tensor = item
+#                                 break
+#                             if found_tensor is None:
+#                                 found_tensor = item
+#                     if logits_tensor is None and found_tensor is not None:
+#                         # found_tensor may be (batch, seq, hidden) or (seq, hidden); we avoid guessing.
+#                         # Keep found_tensor only if it matches num_labels dimension
+#                         if found_tensor.dim() == 3 and found_tensor.shape[-1] == NUM_LABELS:
+#                             logits_tensor = found_tensor
+#                         elif found_tensor.dim() == 2 and found_tensor.shape[-1] == NUM_LABELS:
+#                             logits_tensor = found_tensor.unsqueeze(0)
+#                 # case 4: model_outputs directly a tensor
+#                 if logits_tensor is None and isinstance(model_outputs, torch.Tensor):
+#                     logits_tensor = model_outputs
+#                 # case 5: model_outputs is a decoded viterbi list (common for CRF-only forward)
+#                 if decoded_labels_list is None and isinstance(model_outputs, list) and model_outputs and isinstance(model_outputs[0], list):
+#                     # assume model_outputs is already viterbi decoded: List[List[int]] with batch dim first
+#                     decoded_labels_list = model_outputs
+#                 # If neither logits nor decoded exist, that's fatal
+#                 if logits_tensor is None and decoded_labels_list is None:
+#                     # helpful debug info
+#                     try:
+#                         elem_shapes = [ (type(x), getattr(x, 'shape', None)) for x in model_outputs ] if isinstance(model_outputs, (list, tuple)) else [(type(model_outputs), getattr(model_outputs, 'shape', None))]
+#                     except Exception:
+#                         elem_shapes = str(type(model_outputs))
+#                     raise RuntimeError(f"Model output of type {type(model_outputs)} did not contain a valid logits tensor or decoded viterbi. Contents: {elem_shapes}")
+#                 # If we have logits_tensor, normalize shape to [seq_len, num_labels]
+#                 if logits_tensor is not None:
+#                     # If shape is [B, L, C] with B==1, squeeze batch
+#                     if logits_tensor.dim() == 3 and logits_tensor.shape[0] == 1:
+#                         preds_tensor = logits_tensor.squeeze(0)  # [L, C]
+#                     else:
+#                         preds_tensor = logits_tensor  # possibly [L, C] already
+#                     # Safety: ensure we have at least seq_len x channels
+#                     if preds_tensor.dim() != 2:
+#                         # not a 2D [L, C] tensor: raise instead of guessing a reshape
+#                         raise RuntimeError(f"Unexpected logits tensor shape: {tuple(preds_tensor.shape)}")
+#                     # We'll use preds_tensor[token_idx] to argmax
+#                 else:
+#                     preds_tensor = None  # no logits available
+#                 # If decoded labels provided, make a token-level list-of-ints aligned to tokenizer tokens
+#                 decoded_token_labels = None
+#                 if decoded_labels_list is not None:
+#                     # decoded_labels_list is batch-first; we used batch size 1
+#                     # if multiple sequences returned, take first
+#                     decoded_token_labels = decoded_labels_list[0] if isinstance(decoded_labels_list[0], list) else decoded_labels_list
+#                 # Now map token-level predictions -> word-level predictions using word_ids
+#                 word_idx_to_pred_id = {}
+#                 if preds_tensor is not None:
+#                     # We have logits. Use argmax of logits for each token id up to sequence_length
+#                     for token_idx, word_idx in enumerate(word_ids):
+#                         if token_idx >= sequence_length:
+#                             break
+#                         if word_idx is not None and word_idx < len(sub_words):
+#                             if word_idx not in word_idx_to_pred_id:
+#                                 pred_id = torch.argmax(preds_tensor[token_idx]).item()
+#                                 word_idx_to_pred_id[word_idx] = pred_id
+#                 else:
+#                     # No logits, but we have decoded_token_labels from CRF (one label per token)
+#                     # We'll align decoded_token_labels to token positions.
+#                     if decoded_token_labels is None:
+#                         # should not happen due to earlier checks
+#                         raise RuntimeError("No logits and no decoded labels available for mapping.")
+#                     # decoded_token_labels length may be equal to content_token_length (no special tokens)
+#                     # or equal to sequence_length; try to align intelligently:
+#                     # Prefer using decoded_token_labels aligned to the tokenizer tokens (starting at token 1 for CLS)
+#                     # If decoded length == content_token_length, then manual_word_ids maps sub-token -> word idx for content tokens only.
+#                     # We'll iterate tokens and pick label accordingly.
+#                     # Build token_idx -> decoded_label mapping:
+#                     # We'll assume decoded_token_labels correspond to content tokens (no CLS/SEP). If decoded length == sequence_length, then shift by 0.
+#                     decoded_len = len(decoded_token_labels)
+#                     # Heuristic: if decoded_len == content_token_length -> alignment starts at token_idx 1 (skip CLS)
+#                     if decoded_len == content_token_length:
+#                         decoded_start = 1
+#                     elif decoded_len == sequence_length:
+#                         decoded_start = 0
+#                     else:
+#                         # fallback: prefer decoded_start=1 (most common)
+#                         decoded_start = 1
+#                     for tok_idx_in_decoded, label_id in enumerate(decoded_token_labels):
+#                         tok_idx = decoded_start + tok_idx_in_decoded
+#                         if tok_idx >= 512:
+#                             break
+#                         if tok_idx >= sequence_length:
+#                             break
+#                         # map this token to a word index if present
+#                         word_idx = word_ids[tok_idx] if tok_idx < len(word_ids) else None
+#                         if word_idx is not None and word_idx < len(sub_words):
+#                             if word_idx not in word_idx_to_pred_id:
+#                                 # label_id may already be an int
+#                                 word_idx_to_pred_id[word_idx] = int(label_id)
+#                 # Finally convert mapped word preds -> page_raw_predictions entries
+#                 for current_word_idx in range(len(sub_words)):
+#                     pred_id = word_idx_to_pred_id.get(current_word_idx, 0)  # default to 0
+#                     predicted_label = ID_TO_LABEL[pred_id]
+#                     original_token = sub_tokens_data[current_word_idx]
+#                     page_raw_predictions.append({
+#                         "word": original_token['word'],
+#                         "bbox": original_token['bbox_raw_pdf_space'],
+#                         "predicted_label": predicted_label,
+#                         "page_number": page_num_1_based
+#                     })
+#         if page_raw_predictions:
+#             final_page_predictions.append({
+#                 "page_number": page_num_1_based,
+#                 "data": page_raw_predictions
+#             })
+#             print(f"  *** Page {page_num_1_based} Finalized: {len(page_raw_predictions)} labeled words. ***")
+#     doc.close()
+#     print("\n" + "=" * 80)
+#     print("--- LAYOUTLMV3 INFERENCE COMPLETE ---")
+#     print("=" * 80)
+#     return final_page_predictions
 def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
                                     preprocessed_json_path: str,
                                     column_detection_params: Optional[Dict] = None) -> List[Dict[str, Any]]:
                 "item_original_data": item
             })
+        # ==============================================================================
+        # --- DEBUGGING BLOCK: CHECK FIRST 50 TOKENS BEFORE INFERENCE ---
+        # ==============================================================================
+        print(f"\n[DEBUG] LayoutLMv3 Input (Page {page_num_1_based}): Checking first 50 tokens...")
+        debug_count = 0
+        for t in all_token_data:
+             if debug_count >= 50: break
+             w = t['word']
+             unicode_points = [f"\\u{ord(c):04x}" for c in w]
+             print(f"  Token {debug_count}: '{w}' -> Codes: {unicode_points}")
+             debug_count += 1
+        print("----------------------------------------------------------------------\n")
+        # ==============================================================================
         if not all_token_data:
             continue
                     model_outputs = model(input_ids, bbox, attention_mask)
                 # --- Robust extraction: support several forward return types ---
                 logits_tensor = None
                 decoded_labels_list = None
                 # case 1: tuple/list with (emissions, viterbi)
                 if isinstance(model_outputs, (tuple, list)) and len(model_outputs) == 2:
                     a, b = model_outputs
                     if isinstance(a, torch.Tensor):
                         logits_tensor = a
                     if isinstance(b, list):
                     found_tensor = None
                     for item in model_outputs:
                         if isinstance(item, torch.Tensor):
                             if item.dim() == 3:
                                 logits_tensor = item
                                 break
                             if found_tensor is None:
                                 found_tensor = item
                     if logits_tensor is None and found_tensor is not None:
                         if found_tensor.dim() == 3 and found_tensor.shape[-1] == NUM_LABELS:
                             logits_tensor = found_tensor
                         elif found_tensor.dim() == 2 and found_tensor.shape[-1] == NUM_LABELS:
                 # case 5: model_outputs is a decoded viterbi list (common for CRF-only forward)
                 if decoded_labels_list is None and isinstance(model_outputs, list) and model_outputs and isinstance(model_outputs[0], list):
                     decoded_labels_list = model_outputs
                 # If neither logits nor decoded exist, that's fatal
                 if logits_tensor is None and decoded_labels_list is None:
                     try:
                         elem_shapes = [ (type(x), getattr(x, 'shape', None)) for x in model_outputs ] if isinstance(model_outputs, (list, tuple)) else [(type(model_outputs), getattr(model_outputs, 'shape', None))]
                     except Exception:
                 # If we have logits_tensor, normalize shape to [seq_len, num_labels]
                 if logits_tensor is not None:
                     if logits_tensor.dim() == 3 and logits_tensor.shape[0] == 1:
                         preds_tensor = logits_tensor.squeeze(0)  # [L, C]
                     else:
                         preds_tensor = logits_tensor  # possibly [L, C] already
                     if preds_tensor.dim() != 2:
                         raise RuntimeError(f"Unexpected logits tensor shape: {tuple(preds_tensor.shape)}")
                 else:
                     preds_tensor = None  # no logits available
                 # If decoded labels provided, make a token-level list-of-ints aligned to tokenizer tokens
                 decoded_token_labels = None
                 if decoded_labels_list is not None:
                     decoded_token_labels = decoded_labels_list[0] if isinstance(decoded_labels_list[0], list) else decoded_labels_list
                 # Now map token-level predictions -> word-level predictions using word_ids
                 word_idx_to_pred_id = {}
                 if preds_tensor is not None:
                     for token_idx, word_idx in enumerate(word_ids):
                         if token_idx >= sequence_length:
                             break
                                 pred_id = torch.argmax(preds_tensor[token_idx]).item()
                                 word_idx_to_pred_id[word_idx] = pred_id
                 else:
                     if decoded_token_labels is None:
                         raise RuntimeError("No logits and no decoded labels available for mapping.")
                     decoded_len = len(decoded_token_labels)
                     if decoded_len == content_token_length:
                         decoded_start = 1
                     elif decoded_len == sequence_length:
                         decoded_start = 0
                     else:
                         decoded_start = 1
                     for tok_idx_in_decoded, label_id in enumerate(decoded_token_labels):
                             break
                         if tok_idx >= sequence_length:
                             break
                         word_idx = word_ids[tok_idx] if tok_idx < len(word_ids) else None
                         if word_idx is not None and word_idx < len(sub_words):
                             if word_idx not in word_idx_to_pred_id:
                                 word_idx_to_pred_id[word_idx] = int(label_id)
                 # Finally convert mapped word preds -> page_raw_predictions entries