Spaces:

heerjtdev
/

feeedback

Running

App Files Files Community

heerjtdev committed on Dec 8, 2025

Commit

12f4426

verified ·

1 Parent(s): 0f353ac

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -220

app.py CHANGED Viewed

@@ -3,24 +3,22 @@ import numpy as np
 import cv2
 import torch
 import torch.serialization
-import json
 import os
-import re
-from typing import List, Dict, Any, Optional, Union, Tuple
 from ultralytics import YOLO
 import logging
 import gradio as gr
 import shutil
 import tempfile
-import time
 # ============================================================================
-# --- Global Patches (Kept from original script) ---
 # ============================================================================
 _original_torch_load = torch.load
 def patched_torch_load(*args, **kwargs):
-    # FORCE classic behavior
     kwargs["weights_only"] = False
     return _original_torch_load(*args, **kwargs)
 torch.load = patched_torch_load
@@ -31,49 +29,20 @@ logging.basicConfig(level=logging.WARNING)
 # --- CONFIGURATION AND CONSTANTS ---
 # ============================================================================
-# NOTE: Update these paths to match your environment before running!
-# Gradio runs in the current working directory, so relative paths are fine.
 WEIGHTS_PATH = 'best.pt'
-# DIRECTORY CONFIGURATION - Now managed by tempfile or local folders
-# NOTE: For Gradio, we'll use a temporary directory for output files
-# to prevent cluttering the execution environment.
-# Detection parameters
 CONF_THRESHOLD = 0.2
 TARGET_CLASSES = ['figure', 'equation']
 IOU_MERGE_THRESHOLD = 0.4
 IOA_SUPPRESSION_THRESHOLD = 0.7
-LINE_TOLERANCE = 15
-# Global counters for sequential numbering across the entire PDF
 GLOBAL_FIGURE_COUNT = 0
 GLOBAL_EQUATION_COUNT = 0
 # ============================================================================
-# --- PERFORMANCE OPTIMIZATION: OCR CACHE ---
-# Using the original OCRCache class definition
-# ============================================================================
-class OCRCache:
-    """Caches OCR results per page to avoid redundant Tesseract runs."""
-    def __init__(self):
-        self.cache = {}
-    def get_key(self, pdf_path: str, page_num: int) -> str:
-        return f"{pdf_path}:{page_num}"
-    def has_ocr(self, pdf_path: str, page_num: int) -> bool:
-        return self.get_key(pdf_path, page_num) in self.cache
-    def get_ocr(self, pdf_path: str, page_num: int) -> Optional[list]:
-        return self.cache.get(self.get_key(pdf_path, page_num))
-    def set_ocr(self, pdf_path: str, page_num: int, ocr_data: list):
-        self.cache[self.get_key(pdf_path, page_num)] = ocr_data
-    def clear(self):
-        self.cache.clear()
-_ocr_cache = OCRCache()
-# ============================================================================
-# --- PHASE 1: YOLO/OCR PREPROCESSING FUNCTIONS (Kept from original script) ---
 # ============================================================================
 def calculate_iou(box1, box2):
@@ -90,21 +59,8 @@ def calculate_iou(box1, box2):
     return intersection_area / union_area if union_area > 0 else 0
-def calculate_ioa(box1, box2):
-    x1_a, y1_a, x2_a, y2_a = box1
-    x1_b, y1_b, x2_b, y2_b = box2
-    x_left = max(x1_a, x1_b)
-    y_top = max(y1_a, y1_b)
-    x_right = min(x2_a, x2_b)
-    y_bottom = min(y2_a, y2_b)
-    intersection_area = max(0, x_right - x_left) * max(0, y_bottom - y_top)
-    box_a_area = (x2_a - x1_a) * (y2_a - y1_a)
-    return intersection_area / box_a_area if box_a_area > 0 else 0
 def filter_nested_boxes(detections, ioa_threshold=0.80):
-    if not detections:
-        return []
     for d in detections:
         x1, y1, x2, y2 = d['coords']
         d['area'] = (x2 - x1) * (y2 - y1)
@@ -155,38 +111,12 @@ def merge_overlapping_boxes(detections, iou_threshold):
         })
     return merged_detections
-def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_factor: float) -> list:
-    if not yolo_detections:
-        return raw_word_data
-    pdf_space_boxes = []
-    for det in yolo_detections:
-        x1, y1, x2, y2 = det['coords']
-        pdf_box = (x1 / scale_factor, y1 / scale_factor, x2 / scale_factor, y2 / scale_factor)
-        pdf_space_boxes.append(pdf_box)
-    cleaned_word_data = []
-    for word_tuple in raw_word_data:
-        wx1, wy1, wx2, wy2 = word_tuple[1], word_tuple[2], word_tuple[3], word_tuple[4]
-        w_center_x = (wx1 + wx2) / 2
-        w_center_y = (wy1 + wy2) / 2
-        is_inside_yolo = False
-        for px1, py1, px2, py2 in pdf_space_boxes:
-            if px1 <= w_center_x <= px2 and py1 <= w_center_y <= py2:
-                is_inside_yolo = True
-                break
-        if not is_inside_yolo:
-            cleaned_word_data.append(word_tuple)
-    for i, (px1, py1, px2, py2) in enumerate(pdf_space_boxes):
-        dummy_entry = (f"BLOCK_{i}", px1, py1, px2, py2)
-        cleaned_word_data.append(dummy_entry)
-    return cleaned_word_data
 # ============================================================================
-# --- MISSING HELPER FUNCTIONS (Placeholders) ---
 # ============================================================================
 def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
     img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
         (pix.h, pix.w, pix.n)
     )
@@ -196,198 +126,143 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
         img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
     return img
-def find_column_separator_x(raw_word_data: list, page_width: float) -> Optional[float]:
-    # Placeholder: Always assume single column unless you have the full logic.
-    return None
-def preprocess_and_ocr_page(
-        image: np.ndarray, model: YOLO, pdf_path: str, page_num: int,
-        fitz_page: fitz.Page, pdf_name: str
-) -> Tuple[Optional[list], Optional[float]]:
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
-    scale_factor = 2.0
-    # Mock Detection for Counters (Same as previous response):
-    mock_detections = [
-        {'coords': (100, 100, 400, 200), 'class': 'equation', 'conf': 0.95},
-        {'coords': (100, 300, 400, 400), 'class': 'figure', 'conf': 0.90},
-        {'coords': (100, 500, 400, 600), 'class': 'equation', 'conf': 0.85},
-    ]
-    # --- Actual Logic Starts Here ---
-    # Run YOLO detection on the image (Actual implementation needed here)
-    # results = model(image, conf=CONF_THRESHOLD)
-    # mock_detections = []
-    # if results and results[0].boxes:
-    #     for box in results[0].boxes.data.tolist():
-    #         x1, y1, x2, y2, conf, cls_id = box
-    #         cls_name = model.names[int(cls_id)]
-    #         if cls_name in TARGET_CLASSES:
-    #             mock_detections.append({'coords': (x1, y1, x2, y2), 'class': cls_name, 'conf': conf})
-    merged_detections = merge_overlapping_boxes(mock_detections, IOU_MERGE_THRESHOLD)
     final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
-    # 3. Update Global Counters based on Final Detections
     for det in final_detections:
         if det['class'] == 'figure':
             GLOBAL_FIGURE_COUNT += 1
         elif det['class'] == 'equation':
             GLOBAL_EQUATION_COUNT += 1
-    # Mock Raw Word Data and Cleaning (Actual implementation needs fitz_page.get_text("words"))
-    mock_raw_words = [("Word", 50.0, 50.0, 80.0, 60.0)]
-    cleaned_word_data = merge_yolo_into_word_data(mock_raw_words, final_detections, scale_factor)
-    page_width = fitz_page.rect.width
-    page_separator_x = find_column_separator_x(cleaned_word_data, page_width)
-    # Mock Final Output Structure
-    final_output = [
-        {"type": "text", "text": "Mock Text Block 1"},
-        {"type": "yolo_block", "class": "figure", "page_num": page_num, "global_id": GLOBAL_FIGURE_COUNT},
-        {"type": "yolo_block", "class": "equation", "page_num": page_num, "global_id": GLOBAL_EQUATION_COUNT},
-    ]
-    return final_output, page_separator_x
 # ============================================================================
-# --- MAIN DOCUMENT PROCESSING FUNCTION (Modified for Gradio) ---
 # ============================================================================
-def run_single_pdf_preprocessing(pdf_path: str, output_dir: str) -> Tuple[Optional[str], int, int, int, str]:
     """
-    Runs the preprocessing pipeline and returns the output JSON path, counts, and a summary report.
     """
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
     GLOBAL_FIGURE_COUNT = 0
     GLOBAL_EQUATION_COUNT = 0
-    _ocr_cache.clear()
     if not os.path.exists(pdf_path):
         report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
-        return None, 0, 0, 0, report
-    # Define output paths inside the provided temporary directory
-    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
-    preprocessed_json_path = os.path.join(output_dir, f"{pdf_name}_preprocessed.json")
-    # Placeholder for FIGURE_EXTRACTION_DIR
-    figure_output_dir = os.path.join(output_dir, 'figure_extraction')
-    os.makedirs(figure_output_dir, exist_ok=True)
     try:
         model = YOLO(WEIGHTS_PATH)
     except Exception as e:
-        report = f"❌ ERROR loading YOLO model from {WEIGHTS_PATH}: {e}\n(Please ensure 'best.pt' is in the current directory and Ultralytics is installed.)"
-        return None, 0, 0, 0, report
     try:
         doc = fitz.open(pdf_path)
         total_pages = doc.page_count
     except Exception as e:
         report = f"❌ ERROR loading PDF file: {e}"
-        return None, 0, 0, 0, report
-    all_pages_data = []
-    total_pages_processed = 0
     mat = fitz.Matrix(2.0, 2.0)
     for page_num_0_based in range(doc.page_count):
         fitz_page = doc.load_page(page_num_0_based)
         try:
             pix = fitz_page.get_pixmap(matrix=mat)
             original_img = pixmap_to_numpy(pix)
         except Exception as e:
-            logging.error(f"Error converting page {page_num_0_based + 1} to image: {e}")
             continue
-        final_output, page_separator_x = preprocess_and_ocr_page(
-            original_img, model, pdf_path, page_num_0_based + 1, fitz_page, pdf_name
-        )
-        if final_output is not None:
-            page_data = {
-                "page_number": page_num_0_based + 1,
-                "data": final_output,
-                "column_separator_x": page_separator_x
-            }
-            all_pages_data.append(page_data)
-            total_pages_processed += 1
     doc.close()
-    if all_pages_data:
-        try:
-            with open(preprocessed_json_path, 'w') as f:
-                json.dump(all_pages_data, f, indent=4)
-            json_path_out = preprocessed_json_path
-            report = (
-                f"✅ **Processing Complete!**\n"
-                f"--- {total_pages_processed} pages processed ---\n"
-                f"**1) Total Pages Detected:** {total_pages}\n"
-                f"**2) Elements Extracted:**\n"
-                f"   - Equations: {GLOBAL_EQUATION_COUNT}\n"
-                f"   - Figures: {GLOBAL_FIGURE_COUNT}\n"
-                f"\nDetailed JSON output saved to: `{os.path.basename(json_path_out)}`"
-            )
-        except Exception as e:
-            json_path_out = None
-            report = f"❌ ERROR saving combined JSON output: {e}"
-    else:
-        json_path_out = None
-        report = f"❌ WARNING: No page data generated. Halting pipeline. Total pages in PDF: {total_pages}"
-    return json_path_out, total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT, report
 # ============================================================================
-# --- GRADIO INTERFACE FUNCTION ---
 # ============================================================================
-def gradio_process_pdf(pdf_file) -> Tuple[str, Optional[str]]:
     """
-    Gradio wrapper function to handle file upload and cleanup.
     """
     if pdf_file is None:
-        return "Please upload a PDF file.", None
     pdf_path = pdf_file.name
-    # Use a temporary directory for all output files to ensure cleanup
-    temp_output_dir = tempfile.mkdtemp()
     try:
         # Run the core logic
-        json_path, num_pages, num_equations, num_figures, report = run_single_pdf_preprocessing(
-            pdf_path, temp_output_dir
-        )
-        # Prepare file output for Gradio (only the JSON is returned)
-        if json_path and os.path.exists(json_path):
-            # Create a file name for the download button
-            download_filename = os.path.basename(json_path)
-            # Gradio requires the file path to exist until the download is complete
-            # Move the file out of the temp dir so Gradio can access it later, or
-            # more simply, return the path and rely on Gradio's internal file handling.
-            # We'll rely on Gradio to handle the temporary file access.
-            return report, json_path
-        else:
-            return report, None
     except Exception as e:
-        return f"An unexpected error occurred during processing: {e}", None
-    finally:
-        # Clean up the temporary directory after the processing function returns
-        # NOTE: Gradio manages its own temp files; this cleans the processing outputs.
-        # shutil.rmtree(temp_output_dir, ignore_errors=True)
-        pass # Better to let Gradio/OS handle cleanup of large files.
 # ============================================================================
@@ -397,31 +272,26 @@ def gradio_process_pdf(pdf_file) -> Tuple[str, Optional[str]]:
 if __name__ == "__main__":
     if not os.path.exists(WEIGHTS_PATH):
-        print("⚠️ WARNING: YOLO weight file 'best.pt' not found.")
-        print("The script will run, but the element counting uses placeholder values.")
-    # Define the inputs and outputs for the Gradio interface
     input_file = gr.File(label="Upload PDF Document", type="filepath", file_types=[".pdf"])
-    output_report = gr.Markdown(label="Extraction Summary")
-    output_json = gr.File(label="Download Preprocessed JSON", type="filepath", visible=True)
-    # Create the Gradio interface
     interface = gr.Interface(
         fn=gradio_process_pdf,
         inputs=input_file,
-        outputs=[output_report, output_json],
-        title="🔬 PDF Element Extractor (YOLO/OCR Pipeline)",
         description=(
-            "Upload a research paper PDF to run the YOLO/OCR pre-processing pipeline.\n"
-            "It detects pages, figures, and equations, and returns a summary of the counts "
-            "along with the structured JSON output file."
         ),
     )
-    # Launch the interface
     print("\nStarting Gradio application...")
-    # NOTE: Set share=True to generate a public link (good for testing)
     interface.launch(inbrowser=True)

 import cv2
 import torch
 import torch.serialization
 import os
+from typing import Optional, Tuple
 from ultralytics import YOLO
 import logging
 import gradio as gr
 import shutil
 import tempfile
+import json # Still needed for simple JSON logging
 # ============================================================================
+# --- Global Patches and Setup ---
 # ============================================================================
+# Patch torch.load to prevent weights_only error with older models
 _original_torch_load = torch.load
 def patched_torch_load(*args, **kwargs):
+    """Delegate to the original ``torch.load`` with ``weights_only`` forced to False.
+
+    All positional and keyword arguments are passed through unchanged, except
+    that any ``weights_only`` value supplied by the caller is overwritten.
+    """
+    # SECURITY NOTE(review): with weights_only=False the checkpoint is loaded
+    # through the unrestricted unpickler, so only trusted weight files should
+    # ever reach this call.
     kwargs["weights_only"] = False
     return _original_torch_load(*args, **kwargs)
 torch.load = patched_torch_load
 # --- CONFIGURATION AND CONSTANTS ---
 # ============================================================================
 WEIGHTS_PATH = 'best.pt'
+# Detection parameters (Required for your box combination logic)
 CONF_THRESHOLD = 0.2
 TARGET_CLASSES = ['figure', 'equation']
 IOU_MERGE_THRESHOLD = 0.4
 IOA_SUPPRESSION_THRESHOLD = 0.7
+# Global counters (Reset per run)
 GLOBAL_FIGURE_COUNT = 0
 GLOBAL_EQUATION_COUNT = 0
 # ============================================================================
+# --- BOX COMBINATION LOGIC (Retained from your original script) ---
 # ============================================================================
 def calculate_iou(box1, box2):
     return intersection_area / union_area if union_area > 0 else 0
 def filter_nested_boxes(detections, ioa_threshold=0.80):
+    if not detections: return []
     for d in detections:
         x1, y1, x2, y2 = d['coords']
         d['area'] = (x2 - x1) * (y2 - y1)
         })
     return merged_detections
 # ============================================================================
+# --- UTILITY FUNCTIONS (Minimally Required) ---
 # ============================================================================
 def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
+    """Converts a PyMuPDF Pixmap to a NumPy array for OpenCV/YOLO."""
     img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
         (pix.h, pix.w, pix.n)
     )
         img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
     return img
+def run_yolo_detection_and_count(
+        image: np.ndarray, model: YOLO, page_num: int
+) -> Tuple[int, int]:
+    """
+    Detect target-class boxes on one page image and count them.
+
+    Runs ``model.predict`` on ``image``, keeps detections whose class name is
+    in TARGET_CLASSES, passes them through ``merge_overlapping_boxes`` and
+    ``filter_nested_boxes``, then counts the surviving 'figure' and 'equation'
+    boxes.
+
+    Args:
+        image: Page image handed directly to ``model.predict``.
+        model: Loaded YOLO model; ``model.names`` maps class ids to names.
+        page_num: Page number, used only in log messages.
+
+    Returns:
+        ``(page_equations, page_figures)`` for this page -- equations first.
+        ``(0, 0)`` is returned if inference raises.
+
+    Side effects:
+        Adds this page's counts to GLOBAL_FIGURE_COUNT and
+        GLOBAL_EQUATION_COUNT. On an inference failure the globals are left
+        untouched because the early return happens before they are updated.
+    """
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
+    yolo_detections = []
+    page_equations = 0
+    page_figures = 0
+    try:
+        # Run prediction with the module-level confidence threshold.
+        results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
+        # Only the first result is inspected (a single image is passed in).
+        if results and results[0].boxes:
+            for box in results[0].boxes.data.tolist():
+                # Each row is unpacked as six values: box corners, confidence, class id.
+                x1, y1, x2, y2, conf, cls_id = box
+                cls_name = model.names[int(cls_id)]
+                # Detections of any other class are dropped here.
+                if cls_name in TARGET_CLASSES:
+                    yolo_detections.append({
+                        'coords': (x1, y1, x2, y2),
+                        'class': cls_name,
+                        'conf': conf
+                    })
+    except Exception as e:
+        # Best effort: a failing page is logged and contributes nothing.
+        logging.error(f"YOLO inference failed on page {page_num}: {e}")
+        return 0, 0
+    # Merge overlapping boxes, then filter nested ones, before counting.
+    # NOTE(review): these two calls sit outside the try block, so an exception
+    # raised by either helper propagates to the caller.
+    merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
     final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
+    # Update the document-wide counters and the per-page counters together.
     for det in final_detections:
         if det['class'] == 'figure':
             GLOBAL_FIGURE_COUNT += 1
+            page_figures += 1
         elif det['class'] == 'equation':
             GLOBAL_EQUATION_COUNT += 1
+            page_equations += 1
+    # Per-page progress is emitted at WARNING level.
+    logging.warning(f"  -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
+    return page_equations, page_figures
 # ============================================================================
+# --- MAIN DOCUMENT PROCESSING FUNCTION (Modified for Minimal Output) ---
 # ============================================================================
+def run_single_pdf_preprocessing(pdf_path: str) -> Tuple[int, int, int, str]:
     """
+    Count equations and figures across every page of one PDF.
+
+    Resets the global counters, loads the YOLO model from WEIGHTS_PATH, opens
+    the PDF, renders each page to an image and runs
+    ``run_yolo_detection_and_count`` on it. Nothing is written to disk.
+
+    Args:
+        pdf_path: Filesystem path of the PDF to process.
+
+    Returns:
+        ``(total_pages, equation_count, figure_count, report)`` where
+        ``report`` is a Markdown summary. If the file is missing, the model
+        cannot be loaded, or the PDF cannot be opened, the three counts are
+        ``0`` and ``report`` carries the error message.
     """
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
+    # Reset globals so counts from a previous run do not carry over.
     GLOBAL_FIGURE_COUNT = 0
     GLOBAL_EQUATION_COUNT = 0
     if not os.path.exists(pdf_path):
         report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
+        return 0, 0, 0, report
+    # Model loading: the model is re-created from WEIGHTS_PATH on every call.
     try:
         model = YOLO(WEIGHTS_PATH)
+        logging.warning(f"✅ Loaded YOLO model from: {WEIGHTS_PATH}")
     except Exception as e:
+        report = f"❌ ERROR loading YOLO model: {e}\n(Ensure 'best.pt' is available and valid.)"
+        return 0, 0, 0, report
     try:
         doc = fitz.open(pdf_path)
         total_pages = doc.page_count
+        logging.warning(f"✅ Opened PDF: {doc.page_count} pages")
     except Exception as e:
         report = f"❌ ERROR loading PDF file: {e}"
+        return 0, 0, 0, report
+    # Pages are rendered with a 2.0 scale factor on both axes.
     mat = fitz.Matrix(2.0, 2.0)
     for page_num_0_based in range(doc.page_count):
         fitz_page = doc.load_page(page_num_0_based)
+        # 1-based page number, used for logging only.
+        page_num = page_num_0_based + 1
         try:
             pix = fitz_page.get_pixmap(matrix=mat)
             original_img = pixmap_to_numpy(pix)
         except Exception as e:
+            # A page that cannot be rendered is skipped; it still counts
+            # towards total_pages in the final report.
+            logging.error(f"Error converting page {page_num} to image: {e}. Skipping.")
             continue
+        # Core detection and counting. The per-page return value is ignored;
+        # totals are read from the global counters below.
+        run_yolo_detection_and_count(original_img, model, page_num)
+    # NOTE(review): close() is not in a finally block, so an exception raised
+    # inside the loop (outside the inner try) leaves the document open.
     doc.close()
+    # Final report generation (Markdown).
+    report = (
+        f"✅ **YOLO Counting Complete!**\n\n"
+        f"**1) Total Pages Detected in PDF:** **{total_pages}**\n"
+        f"**2) Total Equations Detected:** **{GLOBAL_EQUATION_COUNT}**\n"
+        f"**3) Total Figures Detected:** **{GLOBAL_FIGURE_COUNT}**"
+    )
+    return total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT, report
 # ============================================================================
+# --- GRADIO INTERFACE FUNCTION (Modified for minimal output) ---
 # ============================================================================
+def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str]:
     """
+    Gradio callback: process an uploaded PDF and return display strings.
+
+    Args:
+        pdf_file: The uploaded file object from the Gradio input, or None
+            when nothing was uploaded.
+
+    Returns:
+        ``(pages, equations, figures, report)``, all strings. The first three
+        are "N/A" when no file was given and "Error" when processing raised;
+        the fourth is the Markdown report or the corresponding message.
     """
     if pdf_file is None:
+        return "N/A", "N/A", "N/A", "Please upload a PDF file."
+    # NOTE(review): the input component is declared with type="filepath";
+    # confirm the object Gradio passes in that mode exposes a `.name` attribute.
+    # This line is outside the try block, so a failure here is not caught.
     pdf_path = pdf_file.name
     try:
         # Run the core logic
+        num_pages, num_equations, num_figures, report = run_single_pdf_preprocessing(pdf_path)
+        # Counts are converted to strings for the output components.
+        return str(num_pages), str(num_equations), str(num_figures), report
     except Exception as e:
+        # Top-level boundary: surface the failure in the UI instead of raising.
+        error_msg = f"An unexpected error occurred: {e}"
+        return "Error", "Error", "Error", error_msg
 # ============================================================================
 if __name__ == "__main__":
+    # Missing weights are only logged; the interface is still built and
+    # launched, and the model-loading error surfaces per request instead.
     if not os.path.exists(WEIGHTS_PATH):
+        logging.error(f"❌ FATAL ERROR: YOLO weight file '{WEIGHTS_PATH}' not found. Cannot run live inference.")
     input_file = gr.File(label="Upload PDF Document", type="filepath", file_types=[".pdf"])
+    # Outputs: three read-only text boxes for the counts plus a Markdown report.
+    # Their order must match the tuple returned by gradio_process_pdf.
+    output_pages = gr.Textbox(label="Total Pages in PDF", interactive=False)
+    output_equations = gr.Textbox(label="Total Equations Detected", interactive=False)
+    output_figures = gr.Textbox(label="Total Figures Detected", interactive=False)
+    output_report = gr.Markdown(label="Processing Summary")
     interface = gr.Interface(
         fn=gradio_process_pdf,
         inputs=input_file,
+        outputs=[output_pages, output_equations, output_figures, output_report],
+        title="🎯 Minimalist YOLO Counting for PDF Elements",
         description=(
+            "Upload a PDF to instantly run YOLO detection using your **`best.pt`** model "
+            "and get the total counts for pages, equations, and figures."
         ),
     )
     print("\nStarting Gradio application...")
     interface.launch(inbrowser=True)