Spaces:

heerjtdev
/

feeedback

Sleeping

App Files Files Community

heerjtdev committed on Dec 8, 2025

Commit

314c4a7

verified ·

1 Parent(s): 9a971c3

Update app.py

Browse files

Files changed (1) hide show

app.py +147 -242

app.py CHANGED Viewed

@@ -3,65 +3,41 @@ import numpy as np
 import cv2
 import torch
 import torch.serialization
-_original_torch_load = torch.load
-def patched_torch_load(*args, **kwargs):
-    # FORCE classic behavior
-    kwargs["weights_only"] = False
-    return _original_torch_load(*args, **kwargs)
-torch.load = patched_torch_load
 import json
-import argparse
 import os
 import re
-# Import torch components if needed (kept from original script)
-import torch.nn as nn
-from TorchCRF import CRF
-# from transformers import LayoutLMv3TokenizerFast, LayoutLMv3Model, LayoutLMv3Config
 from typing import List, Dict, Any, Optional, Union, Tuple
 from ultralytics import YOLO
-import glob
-from PIL import Image
-import sys
-import io
-import base64
 import tempfile
 import time
-import shutil
-import logging
 # ============================================================================
-# --- TR-OCR/ORT MODEL INITIALIZATION (Placeholder) ---
 # ============================================================================
-logging.basicConfig(level=logging.WARNING)
 # ============================================================================
 # --- CONFIGURATION AND CONSTANTS ---
 # ============================================================================
 # NOTE: Update these paths to match your environment before running!
-WEIGHTS_PATH = 'best.pt'
-# DIRECTORY CONFIGURATION
-OCR_JSON_OUTPUT_DIR = './ocr_json_output_final'
-FIGURE_EXTRACTION_DIR = './figure_extraction'
-TEMP_IMAGE_DIR = './temp_pdf_images'
 # Detection parameters
 CONF_THRESHOLD = 0.2
@@ -70,46 +46,34 @@ IOU_MERGE_THRESHOLD = 0.4
 IOA_SUPPRESSION_THRESHOLD = 0.7
 LINE_TOLERANCE = 15
 # Global counters for sequential numbering across the entire PDF
 GLOBAL_FIGURE_COUNT = 0
 GLOBAL_EQUATION_COUNT = 0
 # ============================================================================
 # --- PERFORMANCE OPTIMIZATION: OCR CACHE ---
 # ============================================================================
 class OCRCache:
     """Caches OCR results per page to avoid redundant Tesseract runs."""
     def __init__(self):
         self.cache = {}
     def get_key(self, pdf_path: str, page_num: int) -> str:
         return f"{pdf_path}:{page_num}"
     def has_ocr(self, pdf_path: str, page_num: int) -> bool:
         return self.get_key(pdf_path, page_num) in self.cache
     def get_ocr(self, pdf_path: str, page_num: int) -> Optional[list]:
         return self.cache.get(self.get_key(pdf_path, page_num))
     def set_ocr(self, pdf_path: str, page_num: int, ocr_data: list):
         self.cache[self.get_key(pdf_path, page_num)] = ocr_data
     def clear(self):
         self.cache.clear()
-# Global OCR cache instance
 _ocr_cache = OCRCache()
 # ============================================================================
-# --- PHASE 1: YOLO/OCR PREPROCESSING FUNCTIONS ---
 # ============================================================================
 def calculate_iou(box1, box2):
@@ -139,59 +103,29 @@ def calculate_ioa(box1, box2):
 def filter_nested_boxes(detections, ioa_threshold=0.80):
-    """
-    Removes boxes that are inside larger boxes (Containment Check).
-    Prioritizes keeping the LARGEST box (the 'parent' container).
-    """
     if not detections:
         return []
-    # 1. Calculate Area for all detections
     for d in detections:
         x1, y1, x2, y2 = d['coords']
         d['area'] = (x2 - x1) * (y2 - y1)
-    # 2. Sort by Area Descending (Largest to Smallest)
-    # This ensures we process the 'container' first
     detections.sort(key=lambda x: x['area'], reverse=True)
     keep_indices = []
     is_suppressed = [False] * len(detections)
     for i in range(len(detections)):
         if is_suppressed[i]: continue
         keep_indices.append(i)
         box_a = detections[i]['coords']
-        # Compare with all smaller boxes
         for j in range(i + 1, len(detections)):
             if is_suppressed[j]: continue
             box_b = detections[j]['coords']
-            # Calculate Intersection
             x_left = max(box_a[0], box_b[0])
             y_top = max(box_a[1], box_b[1])
             x_right = min(box_a[2], box_b[2])
             y_bottom = min(box_a[3], box_b[3])
-            if x_right < x_left or y_bottom < y_top:
-                intersection = 0
-            else:
-                intersection = (x_right - x_left) * (y_bottom - y_top)
-            # Calculate IoA (Intersection over Area of the SMALLER box)
             area_b = detections[j]['area']
-            if area_b > 0:
-                ioa_small = intersection / area_b
-                # If the small box is > 90% inside the big box, suppress the small one.
-                if ioa_small > ioa_threshold:
-                    is_suppressed[j] = True
-                    # print(f"    [Suppress] Removed nested object inside larger '{detections[i]['class']}'")
     return [detections[i] for i in keep_indices]
@@ -223,47 +157,28 @@ def merge_overlapping_boxes(detections, iou_threshold):
 def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_factor: float) -> list:
-    """
-    Filters out raw words that are inside YOLO boxes and replaces them with
-    a single solid 'placeholder' block for the column detector.
-    """
     if not yolo_detections:
         return raw_word_data
-    # 1. Convert YOLO boxes (Pixels) to PDF Coordinates (Points)
     pdf_space_boxes = []
     for det in yolo_detections:
         x1, y1, x2, y2 = det['coords']
-        pdf_box = (
-            x1 / scale_factor,
-            y1 / scale_factor,
-            x2 / scale_factor,
-            y2 / scale_factor
-        )
         pdf_space_boxes.append(pdf_box)
-    # 2. Filter out raw words that are inside YOLO boxes
     cleaned_word_data = []
     for word_tuple in raw_word_data:
-        # word_tuple is (text, x1, y1, x2, y2)
         wx1, wy1, wx2, wy2 = word_tuple[1], word_tuple[2], word_tuple[3], word_tuple[4]
         w_center_x = (wx1 + wx2) / 2
         w_center_y = (wy1 + wy2) / 2
         is_inside_yolo = False
         for px1, py1, px2, py2 in pdf_space_boxes:
             if px1 <= w_center_x <= px2 and py1 <= w_center_y <= py2:
                 is_inside_yolo = True
                 break
         if not is_inside_yolo:
             cleaned_word_data.append(word_tuple)
-    # 3. Add the YOLO boxes themselves as "Solid Words"
     for i, (px1, py1, px2, py2) in enumerate(pdf_space_boxes):
         dummy_entry = (f"BLOCK_{i}", px1, py1, px2, py2)
         cleaned_word_data.append(dummy_entry)
     return cleaned_word_data
@@ -272,25 +187,16 @@ def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_
 # ============================================================================
 def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
-    """Converts a PyMuPDF Pixmap to a NumPy array for OpenCV/YOLO."""
-    # This is a critical function for the pipeline. Implementing a basic version.
     img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
         (pix.h, pix.w, pix.n)
     )
     if pix.n == 4:
-        # Convert RGBA to RGB for most YOLO models
         img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
     elif pix.n == 1:
-        # Grayscale to RGB
         img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
     return img
 def find_column_separator_x(raw_word_data: list, page_width: float) -> Optional[float]:
-    """
-    Placeholder for logic that detects if a page is two-column and finds the separator line.
-    This logic is complex and usually involves histogram analysis of word x-coordinates.
-    Returns None for single column, or the x-coordinate of the separator.
-    """
     # Placeholder: Always assume single column unless you have the full logic.
     return None
@@ -298,24 +204,29 @@ def preprocess_and_ocr_page(
         image: np.ndarray, model: YOLO, pdf_path: str, page_num: int,
         fitz_page: fitz.Page, pdf_name: str
 ) -> Tuple[Optional[list], Optional[float]]:
-    """
-    Placeholder for the page-level processing: YOLO detection, OCR, and merging.
-    This function is responsible for INCREMENTING the global counters.
-    """
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
-    # 1. Mock YOLO Detection (You would run model(image) here)
-    # Mocking a result with 2 equations and 1 figure for testing the counters.
-    scale_factor = 2.0 # from the mat=fitz.Matrix(2.0, 2.0) call
-    # Mock Detection for Counters:
     mock_detections = [
         {'coords': (100, 100, 400, 200), 'class': 'equation', 'conf': 0.95},
         {'coords': (100, 300, 400, 400), 'class': 'figure', 'conf': 0.90},
         {'coords': (100, 500, 400, 600), 'class': 'equation', 'conf': 0.85},
     ]
-    # 2. Apply NMS/Merging/Filtering (using the provided functions)
     merged_detections = merge_overlapping_boxes(mock_detections, IOU_MERGE_THRESHOLD)
     final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
@@ -323,200 +234,194 @@ def preprocess_and_ocr_page(
     for det in final_detections:
         if det['class'] == 'figure':
             GLOBAL_FIGURE_COUNT += 1
-            # Logic for saving figure image/caption would go here
         elif det['class'] == 'equation':
             GLOBAL_EQUATION_COUNT += 1
-            # Logic for OCR/LaTeX extraction would go here
-    # 4. Mock Raw Word Data and Cleaning
-    # (In a real script, this would come from fitz_page.get_text("words"))
     mock_raw_words = [("Word", 50.0, 50.0, 80.0, 60.0)]
     cleaned_word_data = merge_yolo_into_word_data(mock_raw_words, final_detections, scale_factor)
-    # 5. Determine Column Separator
     page_width = fitz_page.rect.width
     page_separator_x = find_column_separator_x(cleaned_word_data, page_width)
-    # 6. Mock Final Output Structure
     final_output = [
         {"type": "text", "text": "Mock Text Block 1"},
         {"type": "yolo_block", "class": "figure", "page_num": page_num, "global_id": GLOBAL_FIGURE_COUNT},
         {"type": "yolo_block", "class": "equation", "page_num": page_num, "global_id": GLOBAL_EQUATION_COUNT},
-        # ... more mock data
     ]
-    print(f"  -> Page {page_num}: Equations={len([d for d in final_detections if d['class'] == 'equation'])}, Figures={len([d for d in final_detections if d['class'] == 'figure'])}")
     return final_output, page_separator_x
 # ============================================================================
-# --- MAIN DOCUMENT PROCESSING FUNCTION ---
 # ============================================================================
-# MODIFIED: Returns a Tuple containing the JSON path and the three counts.
-def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Tuple[Optional[str], int, int, int]:
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
-    # Reset globals for a new document
     GLOBAL_FIGURE_COUNT = 0
     GLOBAL_EQUATION_COUNT = 0
     _ocr_cache.clear()
-    print("\n" + "=" * 80)
-    print("--- 1. STARTING OPTIMIZED YOLO/OCR PREPROCESSING PIPELINE ---")
-    print("=" * 80)
     if not os.path.exists(pdf_path):
-        print(f"❌ FATAL ERROR: Input PDF not found at {pdf_path}.")
-        return None, 0, 0, 0
-    os.makedirs(os.path.dirname(preprocessed_json_path), exist_ok=True)
-    os.makedirs(FIGURE_EXTRACTION_DIR, exist_ok=True)
-    # NOTE: This will fail if best.pt is not present
     try:
         model = YOLO(WEIGHTS_PATH)
     except Exception as e:
-        print(f"❌ ERROR loading YOLO model: {e}")
-        # Return 0 for counts if model fails to load
-        return None, 0, 0, 0
-    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
     try:
         doc = fitz.open(pdf_path)
-        total_pages = doc.page_count # Capture the total page count
-        print(f"✅ Opened PDF: {pdf_name} ({total_pages} pages)")
     except Exception as e:
-        print(f"❌ ERROR loading PDF file: {e}")
-        return None, 0, 0, 0
     all_pages_data = []
     total_pages_processed = 0
     mat = fitz.Matrix(2.0, 2.0)
-    print("\n[STEP 1.2: ITERATING PAGES - IN-MEMORY PROCESSING]")
     for page_num_0_based in range(doc.page_count):
-        page_num = page_num_0_based + 1
-        # print(f"  -> Processing Page {page_num}/{doc.page_count}...") # Moved print inside the helper for better logging
         fitz_page = doc.load_page(page_num_0_based)
         try:
             pix = fitz_page.get_pixmap(matrix=mat)
             original_img = pixmap_to_numpy(pix)
         except Exception as e:
-            print(f"  ❌ Error converting page {page_num} to image: {e}")
             continue
         final_output, page_separator_x = preprocess_and_ocr_page(
-            original_img,
-            model,
-            pdf_path,
-            page_num,
-            fitz_page,
-            pdf_name
         )
         if final_output is not None:
             page_data = {
-                "page_number": page_num,
                 "data": final_output,
                 "column_separator_x": page_separator_x
             }
             all_pages_data.append(page_data)
             total_pages_processed += 1
-        else:
-            print(f"  ❌ Skipped page {page_num} due to processing error.")
     doc.close()
     if all_pages_data:
         try:
             with open(preprocessed_json_path, 'w') as f:
                 json.dump(all_pages_data, f, indent=4)
-            print(f"\n  ✅ Combined structured OCR JSON saved to: {os.path.basename(preprocessed_json_path)}")
         except Exception as e:
-            print(f"❌ ERROR saving combined JSON output: {e}")
-            return None, total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT
     else:
-        print("❌ WARNING: No page data generated. Halting pipeline.")
-        return None, total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT
-    print("\n" + "=" * 80)
-    print(f"--- YOLO/OCR PREPROCESSING COMPLETE ({total_pages_processed} pages processed) ---")
-    print("=" * 80)
-    # UPDATED RETURN VALUE FOR REQUIRED STATS
-    return preprocessed_json_path, total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT
 # ============================================================================
-# --- MAIN EXECUTION BLOCK (Modified for requested output) ---
 # ============================================================================
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Complete Pipeline")
-    parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
-    # Using the placeholder constant
-    # --- ADDED ARGUMENT FOR DEBUGGING ---
-    parser.add_argument("--raw_preds_path", type=str, default='BIO_debug.json',
-                        help="Debug path for raw BIO tag predictions (JSON).")
-    # ------------------------------------
-    args = parser.parse_args()
-    pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
-    final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
-    # Define the output path for the preprocessing step
-    os.makedirs(OCR_JSON_OUTPUT_DIR, exist_ok=True)
-    preprocessed_json_path = os.path.join(OCR_JSON_OUTPUT_DIR, f"{pdf_name}_preprocessed.json")
-    # --- CORE EXECUTION ---
-    print("\nStarting PDF Analysis and Extraction...")
-    # Run the core logic and capture the three required statistics
-    json_path_out, num_pages, num_equations, num_figures = run_single_pdf_preprocessing(
-        args.input_pdf,
-        preprocessed_json_path
-    )
-    # --- PRINTING THE REQUIRED STATISTICS ---
-    print("\n" + "#" * 50)
-    print("## 📊 EXTRACTION SUMMARY")
-    print("#" * 50)
-    if json_path_out:
-        print(f"**1) Total Pages Detected:** {num_pages}")
-        print("**2) Elements Extracted:**")
-        print(f"   - Equations: {num_equations}")
-        print(f"   - Figures: {num_figures}")
-    else:
-        # Note: num_pages might be > 0 even if processing failed (if the PDF opened)
-        print(f"**Extraction Failed.** Pages in PDF: {num_pages}. See logs above for errors.")
-        sys.exit(1)
-    print("#" * 50 + "\n")
-    # --------------------------------------------------------------------------------
-    # The original script had more logic here (run_document_pipeline, etc.).
-    # Since only the pre-processing function and the statistics output were requested,
-    # the rest of the original final file saving logic is commented out/removed.
-    # To retain the original final file saving placeholder:
-    # 🛑 CRITICAL FINAL FIX: AGGRESSIVE CUSTOM JSON SAVING 🛑
-    # if final_json_data: # final_json_data is not produced by run_single_pdf_preprocessing
-    #    ...
-    # else:
-    #    print("\n❌ Pipeline Failed.")
-    #    sys.exit(1)
-    print(f"The preprocessed JSON data is saved to: {preprocessed_json_path}")
-    print("Pipeline step complete.")
-    sys.exit(0)
-# End of script

 import cv2
 import torch
 import torch.serialization
 import json
 import os
 import re
 from typing import List, Dict, Any, Optional, Union, Tuple
 from ultralytics import YOLO
+import logging
+import gradio as gr
+import shutil
 import tempfile
 import time
 # ============================================================================
+# --- Global Patches (Kept from original script) ---
 # ============================================================================
+_original_torch_load = torch.load
+def patched_torch_load(*args, **kwargs):
+    # FORCE classic behavior
+    kwargs["weights_only"] = False
+    return _original_torch_load(*args, **kwargs)
+torch.load = patched_torch_load
+logging.basicConfig(level=logging.WARNING)
 # ============================================================================
 # --- CONFIGURATION AND CONSTANTS ---
 # ============================================================================
 # NOTE: Update these paths to match your environment before running!
+# Gradio runs in the current working directory, so relative paths are fine.
+WEIGHTS_PATH = 'best.pt'
+# DIRECTORY CONFIGURATION - Now managed by tempfile or local folders
+# NOTE: For Gradio, we'll use a temporary directory for output files
+# to prevent cluttering the execution environment.
 # Detection parameters
 CONF_THRESHOLD = 0.2
 IOA_SUPPRESSION_THRESHOLD = 0.7
 LINE_TOLERANCE = 15
 # Global counters for sequential numbering across the entire PDF
 GLOBAL_FIGURE_COUNT = 0
 GLOBAL_EQUATION_COUNT = 0
 # ============================================================================
 # --- PERFORMANCE OPTIMIZATION: OCR CACHE ---
+# Using the original OCRCache class definition
 # ============================================================================
 class OCRCache:
     """Caches OCR results per page to avoid redundant Tesseract runs."""
     def __init__(self):
         self.cache = {}
     def get_key(self, pdf_path: str, page_num: int) -> str:
         return f"{pdf_path}:{page_num}"
     def has_ocr(self, pdf_path: str, page_num: int) -> bool:
         return self.get_key(pdf_path, page_num) in self.cache
     def get_ocr(self, pdf_path: str, page_num: int) -> Optional[list]:
         return self.cache.get(self.get_key(pdf_path, page_num))
     def set_ocr(self, pdf_path: str, page_num: int, ocr_data: list):
         self.cache[self.get_key(pdf_path, page_num)] = ocr_data
     def clear(self):
         self.cache.clear()
 _ocr_cache = OCRCache()
 # ============================================================================
+# --- PHASE 1: YOLO/OCR PREPROCESSING FUNCTIONS (Kept from original script) ---
 # ============================================================================
 def calculate_iou(box1, box2):
 def filter_nested_boxes(detections, ioa_threshold=0.80):
     if not detections:
         return []
     for d in detections:
         x1, y1, x2, y2 = d['coords']
         d['area'] = (x2 - x1) * (y2 - y1)
     detections.sort(key=lambda x: x['area'], reverse=True)
     keep_indices = []
     is_suppressed = [False] * len(detections)
     for i in range(len(detections)):
         if is_suppressed[i]: continue
         keep_indices.append(i)
         box_a = detections[i]['coords']
         for j in range(i + 1, len(detections)):
             if is_suppressed[j]: continue
             box_b = detections[j]['coords']
             x_left = max(box_a[0], box_b[0])
             y_top = max(box_a[1], box_b[1])
             x_right = min(box_a[2], box_b[2])
             y_bottom = min(box_a[3], box_b[3])
+            intersection = max(0, x_right - x_left) * max(0, y_bottom - y_top)
             area_b = detections[j]['area']
+            if area_b > 0 and intersection / area_b > ioa_threshold:
+                is_suppressed[j] = True
     return [detections[i] for i in keep_indices]
 def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_factor: float) -> list:
     if not yolo_detections:
         return raw_word_data
     pdf_space_boxes = []
     for det in yolo_detections:
         x1, y1, x2, y2 = det['coords']
+        pdf_box = (x1 / scale_factor, y1 / scale_factor, x2 / scale_factor, y2 / scale_factor)
         pdf_space_boxes.append(pdf_box)
     cleaned_word_data = []
     for word_tuple in raw_word_data:
         wx1, wy1, wx2, wy2 = word_tuple[1], word_tuple[2], word_tuple[3], word_tuple[4]
         w_center_x = (wx1 + wx2) / 2
         w_center_y = (wy1 + wy2) / 2
         is_inside_yolo = False
         for px1, py1, px2, py2 in pdf_space_boxes:
             if px1 <= w_center_x <= px2 and py1 <= w_center_y <= py2:
                 is_inside_yolo = True
                 break
         if not is_inside_yolo:
             cleaned_word_data.append(word_tuple)
     for i, (px1, py1, px2, py2) in enumerate(pdf_space_boxes):
         dummy_entry = (f"BLOCK_{i}", px1, py1, px2, py2)
         cleaned_word_data.append(dummy_entry)
     return cleaned_word_data
 # ============================================================================
 def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
     img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
         (pix.h, pix.w, pix.n)
     )
     if pix.n == 4:
         img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
     elif pix.n == 1:
         img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
     return img
 def find_column_separator_x(raw_word_data: list, page_width: float) -> Optional[float]:
     # Placeholder: Always assume single column unless you have the full logic.
     return None
         image: np.ndarray, model: YOLO, pdf_path: str, page_num: int,
         fitz_page: fitz.Page, pdf_name: str
 ) -> Tuple[Optional[list], Optional[float]]:
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
+    scale_factor = 2.0
+    # Mock detections used to exercise the counters (placeholder until the YOLO inference below is enabled):
     mock_detections = [
         {'coords': (100, 100, 400, 200), 'class': 'equation', 'conf': 0.95},
         {'coords': (100, 300, 400, 400), 'class': 'figure', 'conf': 0.90},
         {'coords': (100, 500, 400, 600), 'class': 'equation', 'conf': 0.85},
     ]
+    # --- Actual Logic Starts Here ---
+    # Run YOLO detection on the image (Actual implementation needed here)
+    # results = model(image, conf=CONF_THRESHOLD)
+    # mock_detections = []
+    # if results and results[0].boxes:
+    #     for box in results[0].boxes.data.tolist():
+    #         x1, y1, x2, y2, conf, cls_id = box
+    #         cls_name = model.names[int(cls_id)]
+    #         if cls_name in TARGET_CLASSES:
+    #             mock_detections.append({'coords': (x1, y1, x2, y2), 'class': cls_name, 'conf': conf})
     merged_detections = merge_overlapping_boxes(mock_detections, IOU_MERGE_THRESHOLD)
     final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
     for det in final_detections:
         if det['class'] == 'figure':
             GLOBAL_FIGURE_COUNT += 1
         elif det['class'] == 'equation':
             GLOBAL_EQUATION_COUNT += 1
+    # Mock Raw Word Data and Cleaning (Actual implementation needs fitz_page.get_text("words"))
     mock_raw_words = [("Word", 50.0, 50.0, 80.0, 60.0)]
     cleaned_word_data = merge_yolo_into_word_data(mock_raw_words, final_detections, scale_factor)
     page_width = fitz_page.rect.width
     page_separator_x = find_column_separator_x(cleaned_word_data, page_width)
+    # Mock Final Output Structure
     final_output = [
         {"type": "text", "text": "Mock Text Block 1"},
         {"type": "yolo_block", "class": "figure", "page_num": page_num, "global_id": GLOBAL_FIGURE_COUNT},
         {"type": "yolo_block", "class": "equation", "page_num": page_num, "global_id": GLOBAL_EQUATION_COUNT},
     ]
     return final_output, page_separator_x
 # ============================================================================
+# --- MAIN DOCUMENT PROCESSING FUNCTION (Modified for Gradio) ---
 # ============================================================================
+def run_single_pdf_preprocessing(pdf_path: str, output_dir: str) -> Tuple[Optional[str], int, int, int, str]:
+    """
+    Runs the preprocessing pipeline and returns the output JSON path, counts, and a summary report.
+    """
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
     GLOBAL_FIGURE_COUNT = 0
     GLOBAL_EQUATION_COUNT = 0
     _ocr_cache.clear()
     if not os.path.exists(pdf_path):
+        report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
+        return None, 0, 0, 0, report
+    # Define output paths inside the provided temporary directory
+    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
+    preprocessed_json_path = os.path.join(output_dir, f"{pdf_name}_preprocessed.json")
+    # Placeholder for FIGURE_EXTRACTION_DIR
+    figure_output_dir = os.path.join(output_dir, 'figure_extraction')
+    os.makedirs(figure_output_dir, exist_ok=True)
     try:
         model = YOLO(WEIGHTS_PATH)
     except Exception as e:
+        report = f"❌ ERROR loading YOLO model from {WEIGHTS_PATH}: {e}\n(Please ensure 'best.pt' is in the current directory and Ultralytics is installed.)"
+        return None, 0, 0, 0, report
     try:
         doc = fitz.open(pdf_path)
+        total_pages = doc.page_count
     except Exception as e:
+        report = f"❌ ERROR loading PDF file: {e}"
+        return None, 0, 0, 0, report
     all_pages_data = []
     total_pages_processed = 0
     mat = fitz.Matrix(2.0, 2.0)
     for page_num_0_based in range(doc.page_count):
         fitz_page = doc.load_page(page_num_0_based)
         try:
             pix = fitz_page.get_pixmap(matrix=mat)
             original_img = pixmap_to_numpy(pix)
         except Exception as e:
+            logging.error(f"Error converting page {page_num_0_based + 1} to image: {e}")
             continue
         final_output, page_separator_x = preprocess_and_ocr_page(
+            original_img, model, pdf_path, page_num_0_based + 1, fitz_page, pdf_name
         )
         if final_output is not None:
             page_data = {
+                "page_number": page_num_0_based + 1,
                 "data": final_output,
                 "column_separator_x": page_separator_x
             }
             all_pages_data.append(page_data)
             total_pages_processed += 1
     doc.close()
     if all_pages_data:
         try:
             with open(preprocessed_json_path, 'w') as f:
                 json.dump(all_pages_data, f, indent=4)
+            json_path_out = preprocessed_json_path
+            report = (
+                f"✅ **Processing Complete!**\n"
+                f"--- {total_pages_processed} pages processed ---\n"
+                f"**1) Total Pages Detected:** {total_pages}\n"
+                f"**2) Elements Extracted:**\n"
+                f"   - Equations: {GLOBAL_EQUATION_COUNT}\n"
+                f"   - Figures: {GLOBAL_FIGURE_COUNT}\n"
+                f"\nDetailed JSON output saved to: `{os.path.basename(json_path_out)}`"
+            )
         except Exception as e:
+            json_path_out = None
+            report = f"❌ ERROR saving combined JSON output: {e}"
     else:
+        json_path_out = None
+        report = f"❌ WARNING: No page data generated. Halting pipeline. Total pages in PDF: {total_pages}"
+    return json_path_out, total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT, report
 # ============================================================================
+# --- GRADIO INTERFACE FUNCTION ---
 # ============================================================================
+def gradio_process_pdf(pdf_file) -> Tuple[str, Optional[str]]:
+    """
+    Gradio wrapper function to handle file upload and cleanup.
+    """
+    if pdf_file is None:
+        return "Please upload a PDF file.", None
+    pdf_path = pdf_file.name
+    # Use a temporary directory for all output files to ensure cleanup
+    temp_output_dir = tempfile.mkdtemp()
+    try:
+        # Run the core logic
+        json_path, num_pages, num_equations, num_figures, report = run_single_pdf_preprocessing(
+            pdf_path, temp_output_dir
+        )
+        # Prepare file output for Gradio (only the JSON is returned)
+        if json_path and os.path.exists(json_path):
+            # Create a file name for the download button
+            download_filename = os.path.basename(json_path)
+            # Gradio requires the file path to exist until the download is complete
+            # Move the file out of the temp dir so Gradio can access it later, or
+            # more simply, return the path and rely on Gradio's internal file handling.
+            # We'll rely on Gradio to handle the temporary file access.
+            return report, json_path
+        else:
+            return report, None
+    except Exception as e:
+        return f"An unexpected error occurred during processing: {e}", None
+    finally:
+        # Cleanup is intentionally skipped here: the JSON path returned above lives inside
+        # temp_output_dir and must still exist when Gradio serves it for download.
+        # shutil.rmtree(temp_output_dir, ignore_errors=True)
+        pass # Better to let Gradio/OS handle cleanup of large files.
+# ============================================================================
+# --- GRADIO INTERFACE DEFINITION ---
+# ============================================================================
+if __name__ == "__main__":
+    if not os.path.exists(WEIGHTS_PATH):
+        print("⚠️ WARNING: YOLO weight file 'best.pt' not found.")
+        print("The script will run, but the element counting uses placeholder values.")
+    # Define the inputs and outputs for the Gradio interface
+    input_file = gr.File(label="Upload PDF Document", type="filepath", file_types=[".pdf"])
+    output_report = gr.Markdown(label="Extraction Summary")
+    output_json = gr.File(label="Download Preprocessed JSON", type="filepath", visible=True)
+    # Create the Gradio interface
+    interface = gr.Interface(
+        fn=gradio_process_pdf,
+        inputs=input_file,
+        outputs=[output_report, output_json],
+        title="🔬 PDF Element Extractor (YOLO/OCR Pipeline)",
+        description=(
+            "Upload a research paper PDF to run the YOLO/OCR pre-processing pipeline.\n"
+            "It detects pages, figures, and equations, and returns a summary of the counts "
+            "along with the structured JSON output file."
+        ),
+        allow_flagging='never'
+    )
+    # Launch the interface
+    print("\nStarting Gradio application...")
+    # NOTE: Set share=True to generate a public link (good for testing)
+    interface.launch(inbrowser=True)