Spaces:

heerjtdev
/

layout_latex

Running

App Files Files Community

heerjtdev committed 26 days ago

Commit

98a2928

verified ·

1 Parent(s): 414d12d

Update working_yolo_pipeline.py

Browse files

Files changed (1) hide show

working_yolo_pipeline.py +103 -857

working_yolo_pipeline.py CHANGED Viewed

@@ -146,60 +146,6 @@ def get_latex_from_base64(base64_string: str) -> str:
-# def get_latex_from_base64(base64_string: str) -> str:
-#     """
-#     Decodes a Base64 image string and uses the pre-initialized TrOCR/ORT model
-#     to recognize the formula. It cleans the output by removing spaces and
-#     crucially, replacing double backslashes with single backslashes for correct LaTeX.
-#     """
-#     if ort_model is None or processor is None:
-#         return "[MODEL_ERROR: Model not initialized]"
-#     try:
-#         # 1. Decode Base64 to Image
-#         image_data = base64.b64decode(base64_string)
-#         # We must ensure the image is RGB format for the model input
-#         image = Image.open(io.BytesIO(image_data)).convert('RGB')
-#         # 2. Preprocess the image
-#         pixel_values = processor(images=image, return_tensors="pt").pixel_values
-#         # 3. Text Generation (OCR)
-#         generated_ids = ort_model.generate(pixel_values)
-#         raw_generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
-#         if not raw_generated_text:
-#             return "[OCR_WARNING: No formula found]"
-#         latex_string = raw_generated_text[0]
-#         # ==============================================================================
-#         # --- DEBUGGING BLOCK: CHECK TrOCR RAW OUTPUT ---
-#         # ==============================================================================
-#         print(f"[DEBUG] TrOCR Raw Output: '{latex_string}'")
-#         # ==============================================================================
-#         # --- 4. Post-processing and Cleanup ---
-#         # # A. Remove all spaces/line breaks
-#         # cleaned_latex = re.sub(r'\s+', '', latex_string)
-#         cleaned_latex = re.sub(r'[\r\n]+', '', latex_string)
-#         # B. CRITICAL FIX: Replace double backslashes (\\) with single backslashes (\).
-#         # This corrects model output that already over-escaped the LaTeX commands.
-#         # Python literal: '\\\\' is replaced with '\\'.
-#         #cleaned_latex = cleaned_latex.replace('\\\\', '\\')
-#         return cleaned_latex
-#     except Exception as e:
-#         # Catch any unexpected errors
-#         print(f"  ❌ TR-OCR Recognition failed: {e}")
-#         return f"[TR_OCR_ERROR: Recognition failed: {e}]"
 # ============================================================================
 # --- CONFIGURATION AND CONSTANTS ---
@@ -640,79 +586,6 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
-# def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
-#     raw_word_data = fitz_page.get_text("words")
-#     converted_ocr_output = []
-#     DEFAULT_CONFIDENCE = 99.0
-#     for x1, y1, x2, y2, word, *rest in raw_word_data:
-#         # --- FIX: SANITIZE TEXT HERE ---
-#         # cleaned_word = sanitize_text(word)
-#         # if not cleaned_word.strip(): continue
-#         x1_pix = int(x1 * scale_factor)
-#         y1_pix = int(y1 * scale_factor)
-#         x2_pix = int(x2 * scale_factor)
-#         y2_pix = int(y2 * scale_factor)
-#         converted_ocr_output.append({
-#             'type': 'text',
-#             'word': cleaned_word, # Use the sanitized word
-#             'confidence': DEFAULT_CONFIDENCE,
-#             'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
-#             'y0': y1_pix, 'x0': x1_pix
-#         })
-#     return converted_ocr_output
-# def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
-#     raw_word_data = fitz_page.get_text("words")
-#     # ==============================================================================
-#     # --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS ---
-#     # ==============================================================================
-#     print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")
-#     debug_count = 0
-#     for item in raw_word_data:
-#         if debug_count >= 50: break
-#         # item format: (x0, y0, x1, y1, word, block_no, line_no, word_no)
-#         word_text = item[4]
-#         # Generate unicode hex codes for every character in the word
-#         unicode_points = [f"\\u{ord(c):04x}" for c in word_text]
-#         print(f"  Word {debug_count}: '{word_text}' -> Codes: {unicode_points}")
-#         debug_count += 1
-#     print("----------------------------------------------------------------------\n")
-#     # ==============================================================================
-#     converted_ocr_output = []
-#     DEFAULT_CONFIDENCE = 99.0
-#     for x1, y1, x2, y2, word, *rest in raw_word_data:
-#         # --- FIX: SANITIZE TEXT HERE ---
-#         cleaned_word = sanitize_text(word)
-#         if not cleaned_word.strip(): continue
-#         x1_pix = int(x1 * scale_factor)
-#         y1_pix = int(y1 * scale_factor)
-#         x2_pix = int(x2 * scale_factor)
-#         y2_pix = int(y2 * scale_factor)
-#         converted_ocr_output.append({
-#             'type': 'text',
-#             'word': cleaned_word, # Use the sanitized word
-#             'confidence': DEFAULT_CONFIDENCE,
-#             'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
-#             'y0': y1_pix, 'x0': x1_pix
-#         })
-#     return converted_ocr_output
@@ -1242,285 +1115,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
-# def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
-#                             page_num: int, fitz_page: fitz.Page,
-#                             pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
-#     """
-#     OPTIMIZED FLOW:
-#     1. Run YOLO to find Equations/Tables.
-#     2. Mask raw text with YOLO boxes.
-#     3. Run Column Detection on the MASKED data.
-#     4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
-#     """
-#     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
-#     start_time_total = time.time()
-#     if original_img is None:
-#         print(f"  ❌ Invalid image for page {page_num}.")
-#         return None, None
-#     # ====================================================================
-#     # --- STEP 1: YOLO DETECTION ---
-#     # ====================================================================
-#     start_time_yolo = time.time()
-#     results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
-#     relevant_detections = []
-#     if results and results[0].boxes:
-#         for box in results[0].boxes:
-#             class_id = int(box.cls[0])
-#             class_name = model.names[class_id]
-#             if class_name in TARGET_CLASSES:
-#                 x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
-#                 relevant_detections.append(
-#                     {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
-#                 )
-#     merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
-#     print(f"    [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
-#     # ====================================================================
-#     # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
-#     # ====================================================================
-#     # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
-#     raw_words_for_layout = get_word_data_for_detection(
-#         fitz_page, pdf_path, page_num,
-#         top_margin_percent=0.10, bottom_margin_percent=0.10
-#     )
-#     masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
-#     # ====================================================================
-#     # --- STEP 3: COLUMN DETECTION ---
-#     # ====================================================================
-#     page_width_pdf = fitz_page.rect.width
-#     page_height_pdf = fitz_page.rect.height
-#     column_detection_params = {
-#         'cluster_bin_size': 2, 'cluster_smoothing': 2,
-#         'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
-#     }
-#     separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
-#     page_separator_x = None
-#     if separators:
-#         central_min = page_width_pdf * 0.35
-#         central_max = page_width_pdf * 0.65
-#         central_separators = [s for s in separators if central_min <= s <= central_max]
-#         if central_separators:
-#             center_x = page_width_pdf / 2
-#             page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
-#             print(f"      ✅ Column Split Confirmed at X={page_separator_x:.1f}")
-#         else:
-#             print("      ⚠️ Gutter found off-center. Ignoring.")
-#     else:
-#         print("      -> Single Column Layout Confirmed.")
-#     # ====================================================================
-#     # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
-#     # ====================================================================
-#     start_time_components = time.time()
-#     component_metadata = []
-#     fig_count_page = 0
-#     eq_count_page = 0
-#     for detection in merged_detections:
-#         x1, y1, x2, y2 = detection['coords']
-#         class_name = detection['class']
-#         if class_name == 'figure':
-#             GLOBAL_FIGURE_COUNT += 1
-#             counter = GLOBAL_FIGURE_COUNT
-#             component_word = f"FIGURE{counter}"
-#             fig_count_page += 1
-#         elif class_name == 'equation':
-#             GLOBAL_EQUATION_COUNT += 1
-#             counter = GLOBAL_EQUATION_COUNT
-#             component_word = f"EQUATION{counter}"
-#             eq_count_page += 1
-#         else:
-#             continue
-#         component_crop = original_img[y1:y2, x1:x2]
-#         component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
-#         cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
-#         y_midpoint = (y1 + y2) // 2
-#         component_metadata.append({
-#             'type': class_name, 'word': component_word,
-#             'bbox': [int(x1), int(y1), int(x2), int(y2)],
-#             'y0': int(y_midpoint), 'x0': int(x1)
-#         })
-#     # ====================================================================
-#     # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
-#     # ====================================================================
-#     raw_ocr_output = []
-#     scale_factor = 2.0  # Pipeline standard scale
-#     try:
-#         # Try getting native text first
-#         # NOTE: extract_native_words_and_convert MUST ALSO BE UPDATED TO USE sanitize_text
-#         raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
-#     except Exception as e:
-#         print(f"  ❌ Native text extraction failed: {e}")
-#     # If native text is missing, fall back to OCR
-#     if not raw_ocr_output:
-#         if _ocr_cache.has_ocr(pdf_path, page_num):
-#             print(f"  ⚡ Using cached Tesseract OCR for page {page_num}")
-#             cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
-#             for word_tuple in cached_word_data:
-#                 word_text, x1, y1, x2, y2 = word_tuple
-#                 # Scale from PDF points to Pipeline Pixels (2.0)
-#                 x1_pix = int(x1 * scale_factor)
-#                 y1_pix = int(y1 * scale_factor)
-#                 x2_pix = int(x2 * scale_factor)
-#                 y2_pix = int(y2 * scale_factor)
-#                 raw_ocr_output.append({
-#                     'type': 'text', 'word': word_text, 'confidence': 95.0,
-#                     'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
-#                     'y0': y1_pix, 'x0': x1_pix
-#                 })
-#         else:
-#             # === START OF OPTIMIZED OCR BLOCK ===
-#             try:
-#                 # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
-#                 ocr_zoom = 4.0
-#                 pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
-#                 # Convert PyMuPDF Pixmap to OpenCV format
-#                 img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
-#                                                                                     pix_ocr.n)
-#                 if pix_ocr.n == 3:
-#                     img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
-#                 elif pix_ocr.n == 4:
-#                     img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
-#                 # 2. Preprocess (Binarization)
-#                 processed_img = preprocess_image_for_ocr(img_ocr_np)
-#                 # 3. Run Tesseract with Optimized Configuration
-#                 custom_config = r'--oem 3 --psm 6'
-#                 hocr_data = pytesseract.image_to_data(
-#                     processed_img,
-#                     output_type=pytesseract.Output.DICT,
-#                     config=custom_config
-#                 )
-#                 # ==============================================================================
-#                 # --- DEBUGGING BLOCK: CHECK FIRST 50 OCR WORDS ---
-#                 # ==============================================================================
-#                 print(f"\n[DEBUG] Tesseract OCR Fallback (Page {page_num}): Checking first 50 words...")
-#                 debug_count = 0
-#                 for i in range(len(hocr_data['level'])):
-#                     text = hocr_data['text'][i].strip()
-#                     if text:
-#                         unicode_points = [f"\\u{ord(c):04x}" for c in text]
-#                         print(f"  OCR Word {debug_count}: '{text}' -> Codes: {unicode_points}")
-#                         debug_count += 1
-#                         if debug_count >= 50: break
-#                 print("----------------------------------------------------------------------\n")
-#                 # ==============================================================================
-#                 for i in range(len(hocr_data['level'])):
-#                     text = hocr_data['text'][i] # Retrieve raw Tesseract text
-#                     # --- FIX: SANITIZE TEXT AND THEN STRIP ---
-#                     cleaned_text = sanitize_text(text).strip()
-#                     if cleaned_text and hocr_data['conf'][i] > -1:
-#                         # 4. Coordinate Mapping
-#                         scale_adjustment = scale_factor / ocr_zoom
-#                         x1 = int(hocr_data['left'][i] * scale_adjustment)
-#                         y1 = int(hocr_data['top'][i] * scale_adjustment)
-#                         w = int(hocr_data['width'][i] * scale_adjustment)
-#                         h = int(hocr_data['height'][i] * scale_adjustment)
-#                         x2 = x1 + w
-#                         y2 = y1 + h
-#                         raw_ocr_output.append({
-#                             'type': 'text',
-#                             'word': cleaned_text, # Use the sanitized word
-#                             'confidence': float(hocr_data['conf'][i]),
-#                             'bbox': [x1, y1, x2, y2],
-#                             'y0': y1,
-#                             'x0': x1
-#                         })
-#             except Exception as e:
-#                 print(f"  ❌ Tesseract OCR Error: {e}")
-#             # === END OF OPTIMIZED OCR BLOCK ===
-#     # ====================================================================
-#     # --- STEP 6: OCR CLEANING AND MERGING ---
-#     # ====================================================================
-#     items_to_sort = []
-#     for ocr_word in raw_ocr_output:
-#         is_suppressed = False
-#         for component in component_metadata:
-#             # Do not include words that are inside figure/equation boxes
-#             ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
-#             if ioa > IOA_SUPPRESSION_THRESHOLD:
-#                 is_suppressed = True
-#                 break
-#         if not is_suppressed:
-#             items_to_sort.append(ocr_word)
-#     # Add figures/equations back into the flow as "words"
-#     items_to_sort.extend(component_metadata)
-#     # ====================================================================
-#     # --- STEP 7: LINE-BASED SORTING ---
-#     # ====================================================================
-#     items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
-#     lines = []
-#     for item in items_to_sort:
-#         placed = False
-#         for line in lines:
-#             y_ref = min(it['y0'] for it in line)
-#             if abs(y_ref - item['y0']) < LINE_TOLERANCE:
-#                 line.append(item)
-#                 placed = True
-#                 break
-#         if not placed and item['type'] in ['equation', 'figure']:
-#             for line in lines:
-#                 y_ref = min(it['y0'] for it in line)
-#                 if abs(y_ref - item['y0']) < 20:
-#                     line.append(item)
-#                     placed = True
-#                     break
-#         if not placed:
-#             lines.append([item])
-#     for line in lines:
-#         line.sort(key=lambda x: x['x0'])
-#     final_output = []
-#     for line in lines:
-#         for item in line:
-#             data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
-#             if 'tag' in item: data_item['tag'] = item['tag']
-#             final_output.append(data_item)
-#     return final_output, page_separator_x
 def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
@@ -1978,299 +1572,6 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
-# def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
-#                                     preprocessed_json_path: str,
-#                                     column_detection_params: Optional[Dict] = None) -> List[Dict[str, Any]]:
-#     print("\n" + "=" * 80)
-#     print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE (Raw Word Output) ---")
-#     print("=" * 80)
-#     tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
-#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-#     print(f"  -> Using device: {device}")
-#     try:
-#         model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
-#         checkpoint = torch.load(model_path, map_location=device)
-#         model_state = checkpoint.get('model_state_dict', checkpoint)
-#         # Apply patch for layoutlmv3 compatibility with saved state_dict
-#         fixed_state_dict = {key.replace('layoutlm.', 'layoutlmv3.'): value for key, value in model_state.items()}
-#         model.load_state_dict(fixed_state_dict)
-#         model.to(device)
-#         model.eval()
-#         print(f"✅ LayoutLMv3 Model loaded successfully from {os.path.basename(model_path)}.")
-#     except Exception as e:
-#         print(f"❌ FATAL ERROR during LayoutLMv3 model loading: {e}")
-#         return []
-#     try:
-#         with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
-#             preprocessed_data = json.load(f)
-#         print(f"✅ Loaded preprocessed data with {len(preprocessed_data)} pages.")
-#     except Exception:
-#         print("❌ Error loading preprocessed JSON.")
-#         return []
-#     try:
-#         doc = fitz.open(pdf_path)
-#     except Exception:
-#         print("❌ Error loading PDF.")
-#         return []
-#     final_page_predictions = []
-#     CHUNK_SIZE = 500
-#     for page_data in preprocessed_data:
-#         page_num_1_based = page_data['page_number']
-#         page_num_0_based = page_num_1_based - 1
-#         page_raw_predictions = []
-#         print(f"\n  *** Processing Page {page_num_1_based} ({len(page_data['data'])} raw tokens) ***")
-#         fitz_page = doc.load_page(page_num_0_based)
-#         page_width, page_height = fitz_page.rect.width, fitz_page.rect.height
-#         print(f"    -> Page dimensions: {page_width:.0f}x{page_height:.0f} (PDF points).")
-#         all_token_data = []
-#         scale_factor = 2.0
-#         for item in page_data['data']:
-#             raw_yolo_bbox = item['bbox']
-#             bbox_pdf = [
-#                 int(raw_yolo_bbox[0] / scale_factor), int(raw_yolo_bbox[1] / scale_factor),
-#                 int(raw_yolo_bbox[2] / scale_factor), int(raw_yolo_bbox[3] / scale_factor)
-#             ]
-#             normalized_bbox = [
-#                 max(0, min(1000, int(1000 * bbox_pdf[0] / page_width))),
-#                 max(0, min(1000, int(1000 * bbox_pdf[1] / page_height))),
-#                 max(0, min(1000, int(1000 * bbox_pdf[2] / page_width))),
-#                 max(0, min(1000, int(1000 * bbox_pdf[3] / page_height)))
-#             ]
-#             all_token_data.append({
-#                 "word": item['word'],
-#                 "bbox_raw_pdf_space": bbox_pdf,
-#                 "bbox_normalized": normalized_bbox,
-#                 "item_original_data": item
-#             })
-#         # ==============================================================================
-#         # --- DEBUGGING BLOCK: CHECK FIRST 50 TOKENS BEFORE INFERENCE ---
-#         # ==============================================================================
-#         print(f"\n[DEBUG] LayoutLMv3 Input (Page {page_num_1_based}): Checking first 50 tokens...")
-#         debug_count = 0
-#         for t in all_token_data:
-#              if debug_count >= 50: break
-#              w = t['word']
-#              unicode_points = [f"\\u{ord(c):04x}" for c in w]
-#              print(f"  Token {debug_count}: '{w}' -> Codes: {unicode_points}")
-#              debug_count += 1
-#         print("----------------------------------------------------------------------\n")
-#         # ==============================================================================
-#         if not all_token_data:
-#             continue
-#         column_separator_x = page_data.get('column_separator_x', None)
-#         if column_separator_x is not None:
-#             print(f"    -> Using SAVED column separator: X={column_separator_x}")
-#         else:
-#             print("    -> No column separator found. Assuming single chunk.")
-#         token_chunks = _merge_integrity(all_token_data, column_separator_x)
-#         total_chunks = len(token_chunks)
-#         for chunk_idx, chunk_tokens in enumerate(token_chunks):
-#             if not chunk_tokens: continue
-#             # 1. Sanitize: Convert everything to strings and aggressively clean Unicode errors.
-#             chunk_words = [
-#                 str(t['word']).encode('utf-8', errors='ignore').decode('utf-8')
-#                 for t in chunk_tokens
-#             ]
-#             chunk_normalized_bboxes = [t['bbox_normalized'] for t in chunk_tokens]
-#             total_sub_chunks = (len(chunk_words) + CHUNK_SIZE - 1) // CHUNK_SIZE
-#             for i in range(0, len(chunk_words), CHUNK_SIZE):
-#                 sub_chunk_idx = i // CHUNK_SIZE + 1
-#                 sub_words = chunk_words[i:i + CHUNK_SIZE]
-#                 sub_bboxes = chunk_normalized_bboxes[i:i + CHUNK_SIZE]
-#                 sub_tokens_data = chunk_tokens[i:i + CHUNK_SIZE]
-#                 print(f"      -> Chunk {chunk_idx + 1}/{total_chunks}, Sub-chunk {sub_chunk_idx}/{total_sub_chunks}: {len(sub_words)} words. Running Inference...")
-#                 # 2. Manual generation of word_ids
-#                 manual_word_ids = []
-#                 for current_word_idx, word in enumerate(sub_words):
-#                     sub_tokens = tokenizer.tokenize(word)
-#                     for _ in sub_tokens:
-#                         manual_word_ids.append(current_word_idx)
-#                 encoded_input = tokenizer(
-#                     sub_words,
-#                     boxes=sub_bboxes,
-#                     truncation=True,
-#                     padding="max_length",
-#                     max_length=512,
-#                     is_split_into_words=True,
-#                     return_tensors="pt"
-#                 )
-#                 # Check for empty sequence
-#                 if encoded_input['input_ids'].shape[0] == 0:
-#                     print(f"        -> Warning: Sub-chunk {sub_chunk_idx} encoded to an empty sequence. Skipping.")
-#                     continue
-#                 # 3. Finalize word_ids based on encoded output length
-#                 sequence_length = int(torch.sum(encoded_input['attention_mask']).item())
-#                 content_token_length = max(0, sequence_length - 2)
-#                 manual_word_ids = manual_word_ids[:content_token_length]
-#                 final_word_ids = [None]  # CLS token (index 0)
-#                 final_word_ids.extend(manual_word_ids)
-#                 if sequence_length > 1:
-#                     final_word_ids.append(None)  # SEP token
-#                 final_word_ids.extend([None] * (512 - len(final_word_ids)))
-#                 word_ids = final_word_ids[:512]  # Final array for mapping
-#                 # Inputs are already batched by the tokenizer as [1, 512]
-#                 input_ids = encoded_input['input_ids'].to(device)
-#                 bbox = encoded_input['bbox'].to(device)
-#                 attention_mask = encoded_input['attention_mask'].to(device)
-#                 with torch.no_grad():
-#                     model_outputs = model(input_ids, bbox, attention_mask)
-#                 # --- Robust extraction: support several forward return types ---
-#                 logits_tensor = None
-#                 decoded_labels_list = None
-#                 # case 1: tuple/list with (emissions, viterbi)
-#                 if isinstance(model_outputs, (tuple, list)) and len(model_outputs) == 2:
-#                     a, b = model_outputs
-#                     if isinstance(a, torch.Tensor):
-#                         logits_tensor = a
-#                     if isinstance(b, list):
-#                         decoded_labels_list = b
-#                 # case 2: HF ModelOutput with .logits
-#                 if logits_tensor is None and hasattr(model_outputs, 'logits') and isinstance(model_outputs.logits, torch.Tensor):
-#                     logits_tensor = model_outputs.logits
-#                 # case 3: tuple/list - search for a 3D tensor (B, L, C)
-#                 if logits_tensor is None and isinstance(model_outputs, (tuple, list)):
-#                     found_tensor = None
-#                     for item in model_outputs:
-#                         if isinstance(item, torch.Tensor):
-#                             if item.dim() == 3:
-#                                 logits_tensor = item
-#                                 break
-#                             if found_tensor is None:
-#                                 found_tensor = item
-#                     if logits_tensor is None and found_tensor is not None:
-#                         if found_tensor.dim() == 3 and found_tensor.shape[-1] == NUM_LABELS:
-#                             logits_tensor = found_tensor
-#                         elif found_tensor.dim() == 2 and found_tensor.shape[-1] == NUM_LABELS:
-#                             logits_tensor = found_tensor.unsqueeze(0)
-#                 # case 4: model_outputs directly a tensor
-#                 if logits_tensor is None and isinstance(model_outputs, torch.Tensor):
-#                     logits_tensor = model_outputs
-#                 # case 5: model_outputs is a decoded viterbi list (common for CRF-only forward)
-#                 if decoded_labels_list is None and isinstance(model_outputs, list) and model_outputs and isinstance(model_outputs[0], list):
-#                     decoded_labels_list = model_outputs
-#                 # If neither logits nor decoded exist, that's fatal
-#                 if logits_tensor is None and decoded_labels_list is None:
-#                     try:
-#                         elem_shapes = [ (type(x), getattr(x, 'shape', None)) for x in model_outputs ] if isinstance(model_outputs, (list, tuple)) else [(type(model_outputs), getattr(model_outputs, 'shape', None))]
-#                     except Exception:
-#                         elem_shapes = str(type(model_outputs))
-#                     raise RuntimeError(f"Model output of type {type(model_outputs)} did not contain a valid logits tensor or decoded viterbi. Contents: {elem_shapes}")
-#                 # If we have logits_tensor, normalize shape to [seq_len, num_labels]
-#                 if logits_tensor is not None:
-#                     if logits_tensor.dim() == 3 and logits_tensor.shape[0] == 1:
-#                         preds_tensor = logits_tensor.squeeze(0)  # [L, C]
-#                     else:
-#                         preds_tensor = logits_tensor  # possibly [L, C] already
-#                     if preds_tensor.dim() != 2:
-#                         raise RuntimeError(f"Unexpected logits tensor shape: {tuple(preds_tensor.shape)}")
-#                 else:
-#                     preds_tensor = None  # no logits available
-#                 # If decoded labels provided, make a token-level list-of-ints aligned to tokenizer tokens
-#                 decoded_token_labels = None
-#                 if decoded_labels_list is not None:
-#                     decoded_token_labels = decoded_labels_list[0] if isinstance(decoded_labels_list[0], list) else decoded_labels_list
-#                 # Now map token-level predictions -> word-level predictions using word_ids
-#                 word_idx_to_pred_id = {}
-#                 if preds_tensor is not None:
-#                     for token_idx, word_idx in enumerate(word_ids):
-#                         if token_idx >= sequence_length:
-#                             break
-#                         if word_idx is not None and word_idx < len(sub_words):
-#                             if word_idx not in word_idx_to_pred_id:
-#                                 pred_id = torch.argmax(preds_tensor[token_idx]).item()
-#                                 word_idx_to_pred_id[word_idx] = pred_id
-#                 else:
-#                     if decoded_token_labels is None:
-#                         raise RuntimeError("No logits and no decoded labels available for mapping.")
-#                     decoded_len = len(decoded_token_labels)
-#                     if decoded_len == content_token_length:
-#                         decoded_start = 1
-#                     elif decoded_len == sequence_length:
-#                         decoded_start = 0
-#                     else:
-#                         decoded_start = 1
-#                     for tok_idx_in_decoded, label_id in enumerate(decoded_token_labels):
-#                         tok_idx = decoded_start + tok_idx_in_decoded
-#                         if tok_idx >= 512:
-#                             break
-#                         if tok_idx >= sequence_length:
-#                             break
-#                         word_idx = word_ids[tok_idx] if tok_idx < len(word_ids) else None
-#                         if word_idx is not None and word_idx < len(sub_words):
-#                             if word_idx not in word_idx_to_pred_id:
-#                                 word_idx_to_pred_id[word_idx] = int(label_id)
-#                 # Finally convert mapped word preds -> page_raw_predictions entries
-#                 for current_word_idx in range(len(sub_words)):
-#                     pred_id = word_idx_to_pred_id.get(current_word_idx, 0)  # default to 0
-#                     predicted_label = ID_TO_LABEL[pred_id]
-#                     original_token = sub_tokens_data[current_word_idx]
-#                     page_raw_predictions.append({
-#                         "word": original_token['word'],
-#                         "bbox": original_token['bbox_raw_pdf_space'],
-#                         "predicted_label": predicted_label,
-#                         "page_number": page_num_1_based
-#                     })
-#         if page_raw_predictions:
-#             final_page_predictions.append({
-#                 "page_number": page_num_1_based,
-#                 "data": page_raw_predictions
-#             })
-#             print(f"  *** Page {page_num_1_based} Finalized: {len(page_raw_predictions)} labeled words. ***")
-#     doc.close()
-#     print("\n" + "=" * 80)
-#     print("--- LAYOUTLMV3 INFERENCE COMPLETE ---")
-#     print("=" * 80)
-#     return final_page_predictions
 # ============================================================================
 # --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
@@ -2758,207 +2059,152 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
-# # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
-# #     List[Dict[str, Any]]]:
-# def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
-#     if not os.path.exists(input_pdf_path): return None
-#     print("\n" + "#" * 80)
-#     print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
-#     print("#" * 80)
-#     pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
-#     temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
-#     os.makedirs(temp_pipeline_dir, exist_ok=True)
-#     preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
-#     raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
-#     structured_intermediate_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json")
-#     final_result = None
-#     try:
-#         # Phase 1: Preprocessing with YOLO First + Masking
-#         preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
-#         if not preprocessed_json_path_out: return None
-#         # Phase 2: Inference
-#         page_raw_predictions_list = run_inference_and_get_raw_words(
-#             input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
-#         )
-#         if not page_raw_predictions_list: return None
-#         # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
-#         # Save raw predictions to the temporary file
-#         with open(raw_output_path, 'w', encoding='utf-8') as f:
-#             json.dump(page_raw_predictions_list, f, indent=4)
-#         # Explicitly copy/save the raw predictions to the user-specified debug path
-#         # if raw_predictions_output_path:
-#         #     shutil.copy(raw_output_path, raw_predictions_output_path)
-#         #     print(f"\n✅ DEBUG: Raw predictions saved to: {raw_predictions_output_path}")
-#         # ----------------------------------------
-#         # Phase 3: Decoding
-#         structured_data_list = convert_bio_to_structured_json_relaxed(
-#             raw_output_path, structured_intermediate_output_path
-#         )
-#         if not structured_data_list: return None
-#         structured_data_list = correct_misaligned_options(structured_data_list)
-#         structured_data_list = process_context_linking(structured_data_list)
-#         # Phase 4: Embedding / Equation to LaTeX Conversion
-#         final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
 def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
     """
-    Wraps a standard image file into a single-page PyMuPDF Document.
-    This ensures it can be processed by your existing fitz-based functions
-    (coordinate scaling, column detection, etc.) exactly as before.
     """
     img = Image.open(image_path)
-    # Convert image to a PDF stream in memory
     pdf_bytes = fitz.open("pdf", img.tobytes("pdf")).tobytes()
     doc = fitz.open("pdf", pdf_bytes)
     return doc, doc[0]
 def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
     """
-    Main pipeline modified to handle both PDF and Image files.
     """
-    # 1. INITIALIZE MODELS (Preserving original logic)
     yolo_model = YOLO(WEIGHTS_PATH)
     # 2. DETECT FILE TYPE
-    ext = os.path.splitext(input_path)[1].lower()
     is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
     all_pages_data = []
-    # 3. BRANCH LOGIC: IMAGE VS PDF
-    if is_image:
-        print(f"📸 Image detected: {input_path}. Initializing Single-Page Pipeline.")
-        doc, page = load_image_as_fitz_page(input_path)
-        # Process as Page 0. Because there is no native text, your existing
-        # Tesseract fallback will naturally trigger to read the content.
-        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
-        img_np = pixmap_to_numpy(pix)
-        page_data, _ = preprocess_and_ocr_page(
-            img_np,
-            yolo_model,
-            input_path,
-            0, # Page 0
-            page,
-            os.path.basename(input_path)
-        )
-        if page_data:
-            all_pages_data.append(page_data)
-        doc.close()
-    else:
-        # Standard PDF Processing Loop
-        try:
-            doc = fitz.open(input_path)
-            print(f"📄 Processing PDF with {len(doc)} pages: {input_path}")
             for page_index in range(len(doc)):
                 page = doc[page_index]
                 pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                 img_np = pixmap_to_numpy(pix)
                 page_data, _ = preprocess_and_ocr_page(
-                    img_np,
-                    yolo_model,
-                    input_path,
-                    page_index,
-                    page,
-                    os.path.basename(input_path)
                 )
                 if page_data:
                     all_pages_data.append(page_data)
             doc.close()
-        except Exception as e:
-            print(f"❌ Error opening PDF {input_path}: {e}")
             return None
-    # 4. CONTINUE EXACTLY AS BEFORE: Gathering and Inference
-    if not all_pages_data:
-        print("❌ No data extracted from document.")
-        return None
-    # Sequence all blocks from all pages (or the single image page)
-    sequential_blocks = []
-    for p_data in all_pages_data:
-        sequential_blocks.extend(p_data.get('blocks', []))
-    print("\n" + "=" * 80)
-    print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
-    print("=" * 80)
-    # Run LayoutLMv3 Inference on the gathered blocks
-    final_structured_data = run_layoutlmv3_inference_on_blocks(
-        sequential_blocks,
-        layoutlmv3_model_path
-    )
-    # Run Hierarchical classification (Subject/Concept tags)
-    classifier = HierarchicalClassifier()
-    if classifier.load_models():
-        final_structured_data = post_process_json_with_inference(final_structured_data, classifier)
-        print("✅ Classification complete. Tags added.")
-    else:
-        print("❌ Classifier not found. Returning untagged data.")
-    return final_structured_data
-#================================================================================
-        # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
-#================================================================================
-        print("\n" + "=" * 80)
-        print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
-        print("=" * 80)
-        # 1. Initialize and Load the Classifier
-        classifier = HierarchicalClassifier()
-        if classifier.load_models():
-            # 2. Run Classification on the *Final* Result
-            # The function modifies the list in place and returns it
-            final_result = post_process_json_with_inference(
-                final_result, classifier
-            )
-            print("✅ Classification complete. Tags added to final output.")
-        else:
-            print("❌ Classification model loading failed. Outputting un-tagged data.")
-        # ====================================================================
-    except Exception as e:
-        print(f"❌ FATAL ERROR: {e}")
-        import traceback
-        traceback.print_exc()
-        return None
-    finally:
-        try:
-            for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
-                os.remove(f)
-            os.rmdir(temp_pipeline_dir)
-        except Exception:
-            pass
-    print("\n" + "#" * 80)
-    print("### OPTIMIZED PIPELINE EXECUTION COMPLETE ###")
-    print("#" * 80)
-    return final_result

 # ============================================================================
 # --- CONFIGURATION AND CONSTANTS ---
 def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
 # ============================================================================
 # --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
 def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
     """
+    Wrap an image file in an in-memory PyMuPDF document and return its first page.
+
+    The image is written to a PDF byte stream with Pillow and that stream is
+    opened with PyMuPDF, so callers can treat the image like a PDF page.
+
+    Args:
+        image_path: Path to an image file that Pillow can open.
+
+    Returns:
+        ``(doc, page)`` where ``page`` is ``doc[0]``. The caller is responsible
+        for closing ``doc``.
     """
+    import io  # local import: only this function needs an in-memory buffer
+
+    pdf_buffer = io.BytesIO()
+    with Image.open(image_path) as img:
+        # Pillow has no "pdf" encoder for Image.tobytes(); PDF output is only
+        # available through Image.save(). Converting to RGB first sidesteps
+        # image modes the PDF writer may reject (alpha is dropped).
+        img.convert("RGB").save(pdf_buffer, format="PDF")
+    pdf_bytes = pdf_buffer.getvalue()
     doc = fitz.open("pdf", pdf_bytes)
     return doc, doc[0]
 def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
     """
+    Run the full analysis pipeline on a PDF or on a single image file.
+
+    Each page is rendered at 2x scale and passed to ``preprocess_and_ocr_page``
+    together with the YOLO model. The blocks collected from all pages are then
+    fed to LayoutLMv3 inference, followed by hierarchical classification.
+
+    Args:
+        input_path: Path to a PDF, or to an image whose extension is one of
+            .jpg/.jpeg/.png/.bmp/.tiff/.webp (matched case-insensitively).
+        layoutlmv3_model_path: Path to the LayoutLMv3 checkpoint; either a raw
+            state dict or a dict holding it under ``'model_state_dict'``.
+
+    Returns:
+        The structured inference result (tagged when the classifier models
+        load), or ``None`` when no page data was extracted or any step raised.
     """
+    # 1. INITIALIZE YOLO
     yolo_model = YOLO(WEIGHTS_PATH)
     # 2. DETECT FILE TYPE
+    # splitext() returns a (root, ext) tuple, so the extension has to be
+    # indexed out before lower-casing; .lower() on the tuple raises AttributeError.
+    ext = os.path.splitext(input_path)[1].lower()
     is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
     all_pages_data = []
+    pdf_name = os.path.basename(input_path)
+    try:
+        if is_image:
+            print(f"📸 Image detected: {input_path}. Processing with YOLO + Tesseract.")
+            # Only the page returned by the wrapper is processed, as page index 0.
+            # The original comment says preprocess_and_ocr_page falls back to
+            # Tesseract when there is no native text layer — not visible here.
+            doc, image_page = load_image_as_fitz_page(input_path)
+            pages = [image_page]
+        else:
+            # --- ORIGINAL PDF LOGIC ---
+            doc = fitz.open(input_path)
+            print(f"📄 Processing PDF: {pdf_name} ({len(doc)} pages)")
+            pages = doc
+        try:
+            # Images and PDFs share one page loop so both paths stay in sync.
+            for page_index, page in enumerate(pages):
+                # Render for YOLO at the same 2x scale for both input kinds.
+                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
+                img_np = pixmap_to_numpy(pix)
+                page_data, _ = preprocess_and_ocr_page(
+                    img_np, yolo_model, input_path, page_index, page, pdf_name
+                )
+                if page_data:
+                    all_pages_data.append(page_data)
+        finally:
+            # Close the document even when a page fails, so it is not leaked.
+            doc.close()
+        if not all_pages_data:
+            print("❌ No data extracted.")
+            return None
+        # 3. CONSOLIDATE BLOCKS FOR INFERENCE
+        sequential_blocks = []
+        for p_data in all_pages_data:
+            sequential_blocks.extend(p_data.get('blocks', []))
+        # --- 4. STARTING LAYOUTLMV3 INFERENCE (Exactly as before) ---
+        print("\n" + "=" * 80)
+        print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
+        print("=" * 80)
+        # (Inlining your existing LayoutLMv3 inference logic)
+        tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        # This assumes LayoutLMv3ForTokenClassification is defined elsewhere in your script
+        model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
+        checkpoint = torch.load(layoutlmv3_model_path, map_location=device)
+        # Accept both a wrapped checkpoint and a bare state dict.
+        model.load_state_dict(checkpoint.get('model_state_dict', checkpoint))
+        model.to(device)
+        model.eval()
+        # NOTE(review): confirm run_layoutlmv3_inference_on_blocks accepts
+        # (blocks, model, tokenizer, device); its definition is not visible here.
+        final_result = run_layoutlmv3_inference_on_blocks(sequential_blocks, model, tokenizer, device)
+        # 5. POST-PROCESS CLASSIFICATION
+        classifier = HierarchicalClassifier()
+        if classifier.load_models():
+            final_result = post_process_json_with_inference(final_result, classifier)
+            print("✅ Classification complete.")
+        else:
+            # Make the fallback visible instead of silently returning untagged data.
+            print("❌ Classification model loading failed. Outputting un-tagged data.")
+        return final_result
+    except Exception as e:
+        # Top-level boundary: report the failure with its traceback and return None.
+        import traceback
+        print(f"❌ FATAL ERROR in pipeline: {e}")
+        traceback.print_exc()
+        return None
+# #================================================================================
+#         # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
+# #================================================================================
+#         print("\n" + "=" * 80)
+#         print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
+#         print("=" * 80)
+#         # 1. Initialize and Load the Classifier
+#         classifier = HierarchicalClassifier()
+#         if classifier.load_models():
+#             # 2. Run Classification on the *Final* Result
+#             # The function modifies the list in place and returns it
+#             final_result = post_process_json_with_inference(
+#                 final_result, classifier
+#             )
+#             print("✅ Classification complete. Tags added to final output.")
+#         else:
+#             print("❌ Classification model loading failed. Outputting un-tagged data.")
+#         # ====================================================================
+#     except Exception as e:
+#         print(f"❌ FATAL ERROR: {e}")
+#         import traceback
+#         traceback.print_exc()
+#         return None
+#     finally:
+#         try:
+#             for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
+#                 os.remove(f)
+#             os.rmdir(temp_pipeline_dir)
+#         except Exception:
+#             pass
+#     print("\n" + "#" * 80)
+#     print("### OPTIMIZED PIPELINE EXECUTION COMPLETE ###")
+#     print("#" * 80)
+#     return final_result