import os
import sys

# Standard environment setup (keep this): when APP_PATH is set, run from that
# directory and make it importable so project-local modules resolve.
if "APP_PATH" in os.environ:
    app_path = os.path.abspath(os.environ["APP_PATH"])
    if os.getcwd() != app_path:
        # fix sys.path for import
        os.chdir(app_path)
    if app_path not in sys.path:
        sys.path.append(app_path)

import io
import tempfile
from typing import List
import pypdfium2
import gradio as gr
import requests
from contextlib import suppress

from surya.common.surya.schema import TaskNames
from surya.models import load_predictors
from surya.debug.draw import draw_polys_on_image
from PIL import Image
from surya.layout import LayoutResult
from surya.settings import settings
from surya.common.util import rescale_bbox, expand_bbox

# --- Core Functions (Minimal changes required) ---

# Get page image from PDF (keep this)
def open_pdf(pdf_file):
    """Open *pdf_file* (path or file-like) as a pypdfium2 document."""
    return pypdfium2.PdfDocument(pdf_file)


def page_counter(pdf_file):
    """Return the number of pages in *pdf_file*, closing the document after."""
    doc = open_pdf(pdf_file)
    doc_len = len(doc)
    doc.close()
    return doc_len


def get_page_image(pdf_file, page_num, dpi=settings.IMAGE_DPI):
    """Render a single PDF page (1-based *page_num*) to an RGB PIL image.

    The scale factor converts DPI to pypdfium2's 72-DPI-relative units.
    """
    doc = open_pdf(pdf_file)
    renderer = doc.render(
        pypdfium2.PdfBitmap.to_pil,
        page_indices=[page_num - 1],
        scale=dpi / 72,
    )
    # Only one page index was requested, so take the first rendered result.
    png = next(iter(renderer))
    png_image = png.convert("RGB")
    doc.close()
    return png_image


def get_uploaded_image(in_file):
    """Load an uploaded image file as RGB."""
    return Image.open(in_file).convert("RGB")


# Modified layout_detection to filter for Equation and Figure
def focused_layout_detection(img) -> tuple[Image.Image, LayoutResult]:
    """Run layout detection on *img*, keeping only Equation/Figure boxes.

    Returns:
        A (annotated image, filtered LayoutResult) pair. The LayoutResult's
        ``bboxes`` are mutated in place to contain only the filtered boxes.
    """
    # Use the existing layout predictor
    pred = predictors["layout"]([img])[0]

    # Filter for Equation and Figure bounding boxes
    filtered_bboxes = [
        p for p in pred.bboxes
        if p.label in ["Equation", "Figure"]  # <-- Filter applied here
    ]

    # Update the prediction result to only include the filtered boxes
    pred.bboxes = filtered_bboxes

    # Prepare data for drawing on the image
    polygons = [p.polygon for p in filtered_bboxes]
    labels = [
        f"{p.label}-{p.position}-{round(p.top_k[p.label], 2)}"
        for p in filtered_bboxes
    ]

    # Draw the filtered polygons
    layout_img = draw_polys_on_image(
        polygons, img.copy(), labels=labels, label_font_size=18
    )
    return layout_img, pred


# Load models (keep this)
predictors = load_predictors()

# --- Gradio Interface (Significantly simplified) ---
with gr.Blocks(title="Surya Equation/Figure Detector") as demo:
    gr.Markdown("""
    # Surya Equation and Figure Detection

    This application uses Surya OCR's layout analysis model to **specifically detect and locate Equations and Figures** within a document page.

    The output provides an image with bounding boxes drawn, and the raw JSON bounding box information for the detected elements.

    Find the original project [here](https://github.com/VikParuchuri/surya).
    """)
    with gr.Row():
        with gr.Column():
            in_file = gr.File(label="PDF file or image:", file_types=[".pdf", ".png", ".jpg", ".jpeg", ".gif", ".webp"])
            in_num = gr.Slider(label="Page number", minimum=1, maximum=100, value=1, step=1)
            in_img = gr.Image(label="Select page of Image", type="pil", sources=None)

            # Keep only the essential button
            detection_btn = gr.Button("Run Equation and Figure Detection")
        with gr.Column():
            result_img = gr.Gallery(label="Result image: Detected Equations and Figures", show_label=True,
                                    elem_id="gallery", columns=[1], rows=[1], object_fit="contain", height="auto")
            gr.HTML("""
            """)
            result_json = gr.JSON(label="Result JSON (Bounding Box Data)")

    # Page Loading Logic (keep this)
    def show_image(file, num=1):
        """Populate the page slider and preview image for an uploaded file.

        For PDFs, the slider is shown and bounded by the page count; for
        plain images the slider is hidden.
        """
        # Guard: in_num.change can fire before any file has been uploaded.
        if file is None:
            return [gr.update(), gr.update()]
        if file.endswith('.pdf'):
            count = page_counter(file)
            img = get_page_image(file, num, settings.IMAGE_DPI)
            return [
                gr.update(visible=True, maximum=count),
                gr.update(value=img)]
        else:
            img = get_uploaded_image(file)
            return [
                gr.update(visible=False),
                gr.update(value=img)]

    in_file.upload(
        fn=show_image,
        inputs=[in_file],
        outputs=[in_num, in_img],
    )
    in_num.change(
        fn=show_image,
        inputs=[in_file, in_num],
        outputs=[in_num, in_img],
    )

    # Run Focused Detection
    def run_focused_detection(pil_image):
        """Detect Equations/Figures on *pil_image*; return gallery + JSON updates."""
        # update counter (best-effort analytics ping; failures are ignored)
        with suppress(Exception):
            requests.get("https://counterapi.com/api/xiaoyao9184.github.com/view/docker-surya")

        layout_img, pred = focused_layout_detection(pil_image)

        # Exclude the large segmentation map from the JSON output
        # (pydantic v2 `exclude` expects a set of field names).
        layout_json = pred.model_dump(exclude={"segmentation_map"})

        # Count the filtered results
        num_boxes = len(layout_json.get('bboxes', []))

        return (
            gr.update(label=f"Result image: {num_boxes} Equations/Figures detected",
                      value=[layout_img], rows=[1], height=layout_img.height),
            gr.update(label=f"Result JSON: {num_boxes} Equations/Figures detected",
                      value=layout_json)
        )

    detection_btn.click(
        fn=run_focused_detection,
        inputs=[in_img],
        outputs=[result_img, result_json]
    )

if __name__ == "__main__":
    demo.launch()