heerjtdev committed
Commit 1d6b971 · verified · 1 Parent(s): 5e11387

Update app.py

Files changed (1)
  1. app.py +96 -600
app.py CHANGED
@@ -1,615 +1,111 @@
- import base64
- import io
- import json
- import os
- import re
- import time
- import tempfile
- from typing import Dict, List, Tuple, Any, Optional
- from urllib.parse import urlparse
-
  import gradio as gr
- import numpy as np
- import requests
- from PIL import Image, ImageDraw, ImageFont
  from pdf2image import convert_from_path

- # --- PADDLEOCR INTEGRATION ---
- try:
-     from paddleocr import PPStructureV3, draw_structure_result
-     # Initialize the model globally once to avoid re-loading on every call
-     # This uses the default layout and table recognition models (PP-StructureV3).
-     # Setting show_log=False keeps the console clean.
-     PADDLE_STRUCTURE_PIPELINE = PPStructureV3(
-         layout=True,
-         table=True,
-         ocr=True,
-         show_log=False
-     )
-     print("✅ Paddle Structure Model Initialized for Integrated Inference.")
- except ImportError:
-     PADDLE_STRUCTURE_PIPELINE = None
-     print("❌ PaddleOCR/PPStructureV3 not found. Inference will be disabled.")
- except Exception as e:
-     PADDLE_STRUCTURE_PIPELINE = None
-     print(f"❌ Error initializing PaddleOCR pipeline: {e}")
-
-
- # =========================
- # Config (API URLs are now obsolete but kept for reference)
- # =========================
- # DEFAULT_API_URL = os.environ.get("API_URL") # OBSOLETE
- # TOKEN = os.environ.get("TOKEN") # OBSOLETE
- LOGO_IMAGE_PATH = "./assets/logo.jpg"
- GOOGLE_FONTS_URL = "<link href='https://fonts.googleapis.com/css2?family=Noto+Sans+SC:wght@400;700&display=swap' rel='stylesheet'>"
- LATEX_DELIMS = [
-     {"left": "$$", "right": "$$", "display": True},
-     {"left": "$", "right": "$", "display": False},
-     {"left": "\\(", "right": "\\)", "display": False},
-     {"left": "\\[", "right": "\\]", "display": True},
- ]
- # AUTH_HEADER and JSON_HEADERS are OBSOLETE but kept for file structure consistency
- AUTH_HEADER = {}
- JSON_HEADERS = {}
-
-
- # =========================
- # Utility Functions
- # =========================
-
- def _ensure_local_path(path_or_url: str) -> str:
-     """Ensures the input is a local file path, downloading from URL if necessary."""
-     if not path_or_url:
-         raise ValueError("Input path is empty.")
-
-     is_url = path_or_url.startswith(("http://", "https://"))
-     if not is_url:
-         return path_or_url # Already local file
-
-     # Download remote URL to a temporary file
-     try:
-         r = requests.get(path_or_url, timeout=600)
-         r.raise_for_status()
-
-         # Use filename extension if available, otherwise default to .jpg
-         ext = os.path.splitext(urlparse(path_or_url).path)[1].lower() or '.jpg'
-         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
-         temp_file.write(r.content)
-         temp_file.close()
-         return temp_file.name
-     except Exception as e:
-         raise gr.Error(f"Error downloading image from URL: {e}")
-
-
- def image_to_base64_data_url(filepath: str) -> str:
-     """Encodes a local image file to a Base64 data URL for HTML rendering."""
-     try:
-         # Prevent conversion attempt on PDFs which can be huge
-         if filepath.lower().endswith('.pdf'):
-             return ""
-
-         ext = os.path.splitext(filepath)[1].lower()
-         mime_types = {".jpg": "image/jpeg", ".jpeg": "image/jpeg", ".png": "image/png", ".gif": "image/gif", ".webp": "image/webp", ".bmp": "image/bmp"}
-         mime_type = mime_types.get(ext, "image/jpeg")
-         with open(filepath, "rb") as image_file:
-             encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
-         return f"data:{mime_type};base64,{encoded_string}"
-     except Exception as e:
-         # print(f"Error encoding image to Base64: {e}")
-         return ""
-
- def _to_html_img(pil_img: Image.Image) -> str:
-     """Converts a PIL Image to a Base64 data URL string for HTML display."""
-     buffered = io.BytesIO()
-     pil_img.save(buffered, format="PNG")
-     img_str = base64.b64encode(buffered.getvalue()).decode()
-     return f'data:image/png;base64,{img_str}'
-
-
- def _escape_inequalities_in_math(md: str) -> str:
-     """Escapes < and > inside math blocks to prevent markdown misinterpretation."""
-     _MATH_PATTERNS = [
-         re.compile(r"\$\$([\s\S]+?)\$\$"),
-         re.compile(r"\$([^\$]+?)\$"),
-         re.compile(r"\\\[([\s\S]+?)\\\]"),
-         re.compile(r"\\\(([\s\S]+?)\\\)"),
-     ]
-
-     def fix(s: str) -> str:
-         s = s.replace("<=", r" \le ").replace(">=", r" \ge ")
-         s = s.replace("≤", r" \le ").replace("≥", r" \ge ")
-         s = s.replace("<", r" \lt ").replace(">", r" \gt ")
-         return s
-
-     for pat in _MATH_PATTERNS:
-         md = pat.sub(lambda m: m.group(0).replace(m.group(1), fix(m.group(1))), md)
-     return md
-
- def _get_examples_from_dir(dir_path: str) -> List[List[str]]:
-     """Loads example URLs (unchanged)."""
-     BASE_URL = "https://paddle-model-ecology.bj.bcebos.com/PPOCRVL/dataset/examples"
-     supported_exts = {".png", ".jpg", ".jpeg", ".bmp", ".webp"}
-     examples = []
-     if not os.path.exists(dir_path):
-         print(f"Warning: example dir {dir_path} not found.")
-         return []
-     for filename in sorted(os.listdir(dir_path)):
-         ext = os.path.splitext(filename)[1].lower()
-         if ext in supported_exts:
-             subdir = os.path.basename(dir_path.rstrip("/"))
-             img_url = f"{BASE_URL}/{subdir}/{filename}"
-             examples.append([img_url])
-     return examples
-
- TARGETED_EXAMPLES_DIR = "examples/targeted"
- COMPLEX_EXAMPLES_DIR = "examples/complex"
- targeted_recognition_examples = _get_examples_from_dir(TARGETED_EXAMPLES_DIR)
- complex_document_examples = _get_examples_from_dir(COMPLEX_EXAMPLES_DIR)
-
- # =========================
- # UI Helpers
- # =========================
- def render_uploaded_image_div(path_or_url: str) -> str:
-     """Renders the image or a PDF placeholder."""
-     if not path_or_url:
-         return ""
-
-     is_url = path_or_url.startswith(("http://", "https://"))
-     is_pdf = path_or_url.lower().endswith('.pdf')
-
-     if is_pdf:
-         return f"""<div style="text-align:center; padding: 20px; color:#888;">PDF file loaded. Use the page selector and click 'Extract...' to process.</div>"""
-
-     src = path_or_url if is_url else image_to_base64_data_url(path_or_url)
-     if not src:
-         return "" # Handle case where local image B64 conversion failed
-
-     return f"""
-     <div class="uploaded-image">
-         <img src="{src}" alt="Preview image" style="width:100%;height:100%;object-fit:contain;" loading="lazy"/>
-     </div>
      """
-
- def update_preview_visibility(path_or_url: Optional[str]) -> Dict:
-     if path_or_url:
-         html_content = render_uploaded_image_div(path_or_url)
-         return gr.update(value=html_content, visible=True)
-     else:
-         return gr.update(value="", visible=False)
-
-
- # =========================
- # Core Inference Logic (Replaces API Calls)
- # =========================
-
- def _run_paddle_structure(local_path: str, is_doc_parsing: bool = True) -> Tuple[str, str, str]:
-     """Runs PPStructureV3 prediction and formats the results."""
-
-     if PADDLE_STRUCTURE_PIPELINE is None:
-         raise gr.Error("PaddleOCR model is not loaded. Please check model initialization logs.")
-
-     start_time = time.time()
-
-     # 1. Run prediction
-     # Note: PPStructureV3 processes images, not PDFs. local_path should be an image path.
-     result_list = PADDLE_STRUCTURE_PIPELINE.predict(local_path)
-
-     end_time = time.time()
-     print(f"PaddleOCR Structure inference completed in {end_time - start_time:.2f} seconds.")
-
-     if not result_list:
-         return "No content recognized.", "<p>No visualization available.</p>", "{}"
-
-     # We only process the first page/image in the list
-     result = result_list[0]
-
-     # 2. Markdown Output
-     # PPStructureV3 can generate LaTeX/Markdown based on its components.
-     # This is a simplification; full VL-model output formatting is complex.
-     md_text = result.to_markdown()
-
-     # 3. Visualization Image
-     image = Image.open(local_path).convert('RGB')
-     # draw_structure_result requires a system font (e.g., simfang.ttf or arial.ttf) to be accessible.
-     try:
-         vis_image = draw_structure_result(image, result, font_path="arial.ttf")
-     except Exception:
-         # Fallback if font isn't found
-         vis_image = draw_structure_result(image, result)
-
-     output_html = f'<img src="{_to_html_img(vis_image)}" alt="Detection Visualization" loading="lazy">'
-
-     # 4. Raw JSON Output
-     raw_json = json.dumps(result.to_dict(), indent=2, ensure_ascii=False)
-
-     md_text = _escape_inequalities_in_math(md_text)
-     return md_text or "(Empty result)", output_html, raw_json
-
- # --- Inference Handlers for Tabs 1 & 2 ---
-
- def handle_complex_doc(path_or_url: str, use_chart_recognition: bool, use_doc_unwarping: bool, use_doc_orientation_classify: bool) -> Tuple[str, str, str]:
-     if not path_or_url:
-         raise gr.Error("Please upload an image first.")
-
-     local_path = _ensure_local_path(path_or_url)
-     if local_path.lower().endswith('.pdf'):
-         raise gr.Error("Document Parsing tab requires an image, not a PDF.")
-
-     # Note: The switches (chart, unwarping, orientation) are ignored here because
-     # the integrated PPStructureV3 pipeline does not expose simple toggles for them.
-     # The complexity is handled internally by the model version loaded.
-
-     return _run_paddle_structure(local_path, is_doc_parsing=True)
-
-
- def handle_targeted_recognition(path_or_url: str, prompt_choice: str) -> Tuple[str, str]:
-     if not path_or_url:
-         raise gr.Error("Please upload an image first.")
-
-     local_path = _ensure_local_path(path_or_url)
-     if local_path.lower().endswith('.pdf'):
-         raise gr.Error("Element-level Recognition tab requires an image, not a PDF.")
-
-     # Map the choice to the desired structure/recognition type (simplified mapping)
-     mapping = {
-         "Text Recognition": "text",
-         "Formula Recognition": "formula",
-         "Table Recognition": "table",
-         "Chart Recognition": "chart",
-     }
-     target_type = mapping.get(prompt_choice, "text")
-
-     # For integrated PPStructureV3, we run a full structure pass and let the model's
-     # internal logic prioritize the recognition based on the input image content.
-     md_preview, _, md_raw = _run_paddle_structure(local_path, is_doc_parsing=False)
-
-     # In a real VL system, we'd use the 'prompt_choice' to focus the model output.
-     # Here, we just return the full markdown and raw output.
-
-     return md_preview, md_raw
-
- # --- Inference Handler for Tab 3: PDF & Structured Extraction ---
-
- def _pdf_to_page_image(pdf_path: str, page_num: int) -> Image.Image:
-     """Converts a specific PDF page to a PIL Image."""
      try:
-         pages = convert_from_path(pdf_path, dpi=300, first_page=page_num + 1, last_page=page_num + 1)
-         if not pages:
-             raise ValueError(f"Could not convert page {page_num} of PDF.")
-         return pages[0]
-     except Exception as e:
-         raise gr.Error(f"Error processing PDF with pdf2image (Is Poppler installed?): {e}")
-
- def _draw_boxes_on_image(img: Image.Image, elements: List[Dict]) -> str:
-     """Draws bounding boxes onto the PIL Image based on PPStructureV3 results."""
-     draw = ImageDraw.Draw(img)
-
-     try:
-         # Use a common font or fall back
-         font = ImageFont.truetype("arial.ttf", 16)
-     except IOError:
-         font = ImageFont.load_default()
-
-     for item in elements:
-         # The coordinates are expected in the format [x1, y1, x2, y2]
-         bbox = item.get("box", []) # PPStructureV3 often uses 'box' key
-         item_type = item.get("type", "text")
-
-         if len(bbox) == 4:
-             x1, y1, x2, y2 = bbox
-
-             # Draw different colors for different types
-             if item_type in ["figure", "title"]:
-                 color = "purple"
-                 width = 3
-             elif item_type in ["table", "formula"]:
-                 color = "red"
-                 width = 2
-             else: # text
-                 color = "green"
-                 width = 1
-
-             draw.rectangle([(x1, y1), (x2, y2)], outline=color, width=width)
-
-             # Optional: Add type label
-             # draw.text((x1 + 5, y1 - 15), item_type, fill=color, font=font)
-
-     return _to_html_img(img)
-
- def handle_structured_extraction(pdf_path: Optional[str], page_num: int) -> Tuple[str, str, str]:
-     if PADDLE_STRUCTURE_PIPELINE is None:
-         raise gr.Error("PaddleOCR model is not loaded.")
-
-     if not pdf_path or not pdf_path.lower().endswith('.pdf'):
-         raise gr.Error("Please upload a PDF file for this feature.")
-
-     print(f"Processing PDF: {pdf_path}, Page: {page_num}")
-
-     # --- 1. Convert PDF Page to Image ---
-     try:
-         page_img = _pdf_to_page_image(pdf_path, page_num)
      except Exception as e:
-         return f"Error: {e}", "Error during PDF conversion.", json.dumps({"error": str(e)}, indent=2)
-
-     # --- 2. Save image to temp file for PPStructureV3 ---
-     temp_img_path = tempfile.mktemp(suffix=".png")
-     page_img.save(temp_img_path)
-
-     try:
-         # --- 3. Run PPStructureV3 inference ---
-         result_list = PADDLE_STRUCTURE_PIPELINE.predict(temp_img_path)
-
-         if not result_list:
-             return "No content recognized on this PDF page.", "", "{}"
-
-         # --- 4. Process Results ---
-         result = result_list[0]
-         elements = result.to_dict().get("res", [])
-
-         # Extract LaTeX/Formulas
-         all_latex = []
-         for item in elements:
-             if item.get("type") == "formula" and item.get("text"):
-                 # Wrap text with $$. PPStructureV3 often outputs raw LaTeX in the 'text' field.
-                 all_latex.append(f"$${item['text']}$$")
-
-         latex_output = "\n\n".join(all_latex) if all_latex else "No formulas (LaTeX) found on this page."
-
-         # --- 5. Draw Boxes for Visualization ---
-         box_html = f'<img src="{_draw_boxes_on_image(page_img, elements)}" alt="Image with Bounding Boxes" loading="lazy">'
-
-         # --- 6. Return Results ---
-         return box_html, latex_output, json.dumps(result.to_dict(), indent=2, ensure_ascii=False)
-
-     except Exception as e:
-         raise gr.Error(f"PaddleOCR inference failed during PDF processing: {e}")
-     finally:
-         if os.path.exists(temp_img_path):
-             os.remove(temp_img_path)
-
- def get_pdf_page_count(pdf_path):
-     """Gets the total number of pages in the PDF."""
-     if not pdf_path or not pdf_path.lower().endswith('.pdf'):
-         return gr.update(maximum=0, value=0, interactive=False)
-     try:
-         # Load the whole PDF to get the exact count (inefficient but reliable with pdf2image)
-         pages = convert_from_path(pdf_path, use_pdftocairo=True)
-         count = len(pages)
-         return gr.update(maximum=max(0, count - 1), value=0, interactive=True)
-     except Exception as e:
-         print(f"Warning: Could not determine PDF page count: {e}")
-         return gr.update(maximum=0, value=0, interactive=False)
-
-
- # =========================
- # CSS & UI (Unchanged)
- # =========================
389
- body, .gradio-container { font-family: "Noto Sans SC", "Microsoft YaHei", "PingFang SC", sans-serif; }
390
- .app-header { text-align: center; max-width: 900px; margin: 0 auto 8px !important; }
391
- .gradio-container { padding: 4px 0 !important; }
392
- .gradio-container [data-testid="tabs"], .gradio-container .tabs { margin-top: 0 !important; }
393
- .gradio-container [data-testid="tabitem"], .gradio-container .tabitem { padding-top: 4px !important; }
394
- .quick-links { text-align: center; padding: 8px 0; border: 1px solid #e5e7eb; border-radius: 8px; margin: 8px auto; max-width: 900px; }
395
- .quick-links a { margin: 0 12px; font-size: 14px; font-weight: 600; color: #3b82f6; text-decoration: none; }
396
- .quick-links a:hover { text-decoration: underline; }
397
- .prompt-grid { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 6px; }
398
- .prompt-grid button { height: 40px !important; padding: 0 12px !important; border-radius: 8px !important; font-weight: 600 !important; font-size: 13px !important; letter-spacing: 0.2px; }
399
- #image_preview_vl, #image_preview_doc, #image_preview_pdf { height: 400px !important; overflow: auto; }
400
- #image_preview_vl img, #image_preview_doc img, #vis_image_doc img, #box_vis_html img { width: 100% !important; height: auto !important; object-fit: contain !important; display: block; }
401
- #md_preview_vl, #md_preview_doc { max-height: 540px; min-height: 180px; overflow: auto; scrollbar-gutter: stable both-edges; }
402
- #md_preview_vl .prose, #md_preview_doc .prose { line-height: 1.7 !important; }
403
- #md_preview_vl .prose img, #md_preview_doc .prose img { display: block; margin: 0 auto; max-width: 100%; height: auto; }
404
- .notice { margin: 8px auto 0; max-width: 900px; padding: 10px 12px; border: 1px solid #e5e7eb; border-radius: 8px; background: #f8fafc; font-size: 14px; line-height: 1.6; }
405
- .notice strong { font-weight: 700; }
406
- .notice a { color: #3b82f6; text-decoration: none; }
407
- .notice a:hover { text-decoration: underline; }
408
- .checkbox-row .gradio-checkbox { flex-grow: 1; text-align: center; }
409
- """
410
-
411
- with gr.Blocks(head=GOOGLE_FONTS_URL, css=custom_css, theme=gr.themes.Soft()) as demo:
412
- logo_data_url = image_to_base64_data_url(LOGO_IMAGE_PATH) if os.path.exists(LOGO_IMAGE_PATH) else ""
413
- gr.HTML(f"""<div class="app-header"><img src="{logo_data_url}" alt="App Logo" style="max-height:10%; width: auto; margin: 10px auto; display: block;"></div>""")
414
- gr.HTML("""<div class="notice"><strong>Heads up:</strong> The Hugging Face demo can be slow at times. For a faster experience, please try <a href="https://aistudio.baidu.com/application/detail/98365" target="_blank" rel="noopener noreferrer">Baidu AI Studio</a> or <a href="https://modelscope.cn/studios/PaddlePaddle/PaddleOCR-VL_Online_Demo/summary" target="_blank" rel="noopener noreferrer">ModelScope</a>.</div>""")
415
-
416
- gr.HTML("""<div class="quick-links"><a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">GitHub</a> | <a href="https://ernie.baidu.com/blog/publication/PaddleOCR-VL_Technical_Report.pdf" target="_blank">Technical Report</a> | <a href="https://huggingface.co/PaddlePaddle/PaddleOCR-VL" target="_blank">Model</a> | <a href="https://aistudio.baidu.com/paddleocr" target="_blank">Official Website</a></div>""")
417
-
418
- with gr.Tabs():
419
- # ===================== Tab 1: Document Parsing =====================
420
- with gr.Tab("Document Parsing"):
421
- with gr.Row():
422
- with gr.Column(scale=5):
423
- file_doc = gr.File(label="Upload Image", file_count="single", type="filepath", file_types=["image"])
424
- preview_doc_html = gr.HTML(value="", elem_id="image_preview_doc", visible=False)
425
- gr.Markdown("_( Use this mode for recognizing full-page documents with structured layouts, such as reports, papers, or magazines.)_")
426
- gr.Markdown("💡 *To recognize a single, pre-cropped element (e.g., a table or formula), switch to the 'Element-level Recognition' tab for better results.*")
427
-
428
- example_url_doc = gr.State(value=None)
429
-
430
- with gr.Row(variant="panel"):
431
- with gr.Column(scale=2):
432
- btn_parse = gr.Button("Parse Document", variant="primary")
433
- with gr.Column(scale=3):
434
- with gr.Row(elem_classes=["checkbox-row"]):
435
- chart_parsing_switch = gr.Checkbox(label="Enable chart parsing", value=False, min_width=10)
436
- doc_unwarping_switch = gr.Checkbox(label="Enable document unwarping", value=False, min_width=10)
437
- doc_orientation_switch = gr.Checkbox(label="Enable orientation classification", value=False, min_width=10)
438
-
439
- if complex_document_examples:
440
- complex_paths = [e[0] for e in complex_document_examples]
441
- complex_state = gr.State(complex_paths)
442
-
443
- gallery_complex = gr.Gallery(
444
- value=complex_paths, columns=4, height=400,
445
- preview=False, label="Example Documents (Select to Load)", allow_preview=False
446
- )
447
-
448
- def on_gallery_select_for_doc(paths, evt: gr.SelectData):
449
- idx = evt.index
450
- if isinstance(idx, (list, tuple)):
451
- idx = idx[0]
452
- try:
453
- url = paths[int(idx)]
454
- except Exception:
455
- raise gr.Error(f"Invalid index from gallery: {evt.index}")
456
-
457
- return url, update_preview_visibility(url)
458
-
459
- gallery_complex.select(
460
- fn=on_gallery_select_for_doc,
461
- inputs=[complex_state],
462
- outputs=[example_url_doc, preview_doc_html],
463
- )
464
-
465
- gr.Markdown("""
466
- <div class="notice">
467
- <h3>History Updates</h3>
468
- <ul>
469
- <li><strong>Nov 4, 2025:</strong> Application converted to run PaddleOCR inference locally (integrated mode), removing API dependency.</li>
470
- <li><strong>Oct 30, 2025:</strong> Added two advanced control options under the "Document Parsing" tab.</li>
471
- <li><strong>Oct 16, 2025:</strong> Initial release of the demo.</li>
472
- </ul>
473
- </div>
474
- """)
475
-
476
- with gr.Column(scale=7):
477
- with gr.Tabs():
478
- with gr.Tab("Markdown Preview"):
479
- md_preview_doc = gr.Markdown("Please upload an image and click 'Parse Document'.", latex_delimiters=LATEX_DELIMS, elem_id="md_preview_doc")
480
- with gr.Tab("Visualization"):
481
- vis_image_doc = gr.HTML(label="Detection Visualization", elem_id="vis_image_doc")
482
- with gr.Tab("Markdown Source"):
483
- md_raw_doc = gr.Code(label="Markdown Source Code", language="markdown")
484
-
485
- def on_file_doc_change(fp):
486
- return None, update_preview_visibility(fp)
487
-
488
- file_doc.change(fn=on_file_doc_change, inputs=[file_doc], outputs=[example_url_doc, preview_doc_html])
489
-
490
- def parse_doc_router(fp, example_url, use_chart, use_unwarping, use_orientation):
491
- src = fp if fp else example_url
492
- if not src:
493
- raise gr.Error("Please upload an image or pick an example first.")
494
- return handle_complex_doc(src, use_chart, use_unwarping, use_orientation)
495
-
496
- btn_parse.click(fn=parse_doc_router, inputs=[file_doc, example_url_doc, chart_parsing_switch, doc_unwarping_switch, doc_orientation_switch],
497
- outputs=[md_preview_doc, vis_image_doc, md_raw_doc])
498
-
499
- # ===================== Tab 2: Element-level Recognition =====================
500
- with gr.Tab("Element-level Recognition"):
501
- with gr.Row():
502
- with gr.Column(scale=5):
503
- file_vl = gr.File(label="Upload Image", file_count="single", type="filepath", file_types=["image"])
504
- preview_vl_html = gr.HTML(value="", elem_id="image_preview_vl", visible=False)
505
- gr.Markdown("_(Best for images with a **simple, single-column layout** (e.g., pure text), or for a **pre-cropped single element** like a table, formula, or chart.)_")
506
- gr.Markdown("Choose a recognition type:")
507
-
508
- with gr.Row(elem_classes=["prompt-grid"]):
509
- btn_ocr = gr.Button("Text Recognition", variant="secondary")
510
- btn_formula = gr.Button("Formula Recognition", variant="secondary")
511
- with gr.Row(elem_classes=["prompt-grid"]):
512
- btn_table = gr.Button("Table Recognition", variant="secondary")
513
- btn_chart = gr.Button("Chart Recognition", variant="secondary")
514
-
515
- example_url_vl = gr.State(value=None)
516
-
517
- if targeted_recognition_examples:
518
- targeted_paths = [e[0] for e in targeted_recognition_examples]
519
- targeted_state = gr.State(targeted_paths)
520
-
521
- gallery_targeted = gr.Gallery(
522
- value=targeted_paths, columns=4, height=400,
523
- preview=False, label="Example Elements (Select to Load)", allow_preview=False
524
- )
525
-
526
- def on_gallery_select_for_vl(paths, evt: gr.SelectData):
527
- idx = evt.index
528
- if isinstance(idx, (list, tuple)):
529
- idx = idx[0]
530
- try:
531
- url = paths[int(idx)]
532
- except Exception:
533
- raise gr.Error(f"Invalid index from gallery: {evt.index}")
534
- return url, update_preview_visibility(url)
535
-
536
- gallery_targeted.select(
537
- fn=on_gallery_select_for_vl,
538
- inputs=[targeted_state],
539
- outputs=[example_url_vl, preview_vl_html],
540
- )
541
-
542
- with gr.Column(scale=7):
543
- with gr.Tabs():
544
- with gr.Tab("Recognition Result"):
545
- md_preview_vl = gr.Markdown("Please upload an image and click a recognition type.", latex_delimiters=LATEX_DELIMS, elem_id="md_preview_vl")
546
- with gr.Tab("Raw Output"):
547
- md_raw_vl = gr.Code(label="Raw Output", language="markdown")
548
-
549
- def on_file_vl_change(fp):
550
- return None, update_preview_visibility(fp)
551
-
552
- file_vl.change(fn=on_file_vl_change, inputs=[file_vl], outputs=[example_url_vl, preview_vl_html])
553
-
554
- def parse_vl_router(fp, example_url, prompt_choice):
555
- src = fp if fp else example_url
556
- if not src:
557
- raise gr.Error("Please upload an image or pick an example first.")
558
- return handle_targeted_recognition(src, prompt_choice)
559
-
560
- btn_ocr.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Text Recognition")], outputs=[md_preview_vl, md_raw_vl])
561
- btn_formula.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Formula Recognition")], outputs=[md_preview_vl, md_raw_vl])
562
- btn_table.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Table Recognition")], outputs=[md_preview_vl, md_raw_vl])
563
- btn_chart.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Chart Recognition")], outputs=[md_preview_vl, md_raw_vl])
564
-
565
-
566
- # ===================== Tab 3: PDF & Structured Extraction (NEW) =====================
567
- with gr.Tab("PDF & Structured Extraction"):
568
- gr.Markdown("## 📑 PDF Bounding Box & LaTeX Extractor")
569
- gr.Markdown("Upload a PDF to extract structured elements, visualize bounding boxes, and retrieve LaTeX code (Formulas) on a per-page basis.")
570
-
571
- with gr.Row():
572
- with gr.Column(scale=5):
573
- file_pdf = gr.File(label="Upload PDF", file_count="single", type="filepath", file_types=[".pdf"], elem_id="file_pdf_input")
574
- preview_pdf_html = gr.HTML(value="", elem_id="image_preview_pdf", visible=False)
575
-
576
- page_selector = gr.Slider(
577
- minimum=0, maximum=0, step=1, value=0, label="Select Page (0-indexed)", interactive=False
578
- )
579
-
580
- btn_extract_boxes = gr.Button("Extract Bounding Boxes & LaTeX", variant="primary")
581
-
582
- with gr.Column(scale=7):
583
- with gr.Tabs():
584
- with gr.Tab("Image with Bounding Boxes"):
585
- box_vis_html = gr.HTML(label="Bounding Box Visualization", elem_id="box_vis_html", value="Upload a PDF and click the button to see the result.")
586
- with gr.Tab("Extracted LaTeX"):
587
- latex_output = gr.Markdown(label="Extracted LaTeX/Formulas", elem_id="latex_output", value="No LaTeX extracted yet.")
588
- with gr.Tab("Raw Structured Data"):
589
- raw_json_output = gr.Code(label="Raw Structured Output (JSON)", language="json", elem_id="raw_json_output")
590
-
591
- # Logic for PDF input
592
- def on_file_pdf_change(fp):
593
- # Update page selector when a new PDF is uploaded
594
- page_update = get_pdf_page_count(fp)
595
- # Update preview
596
- preview_update = update_preview_visibility(fp)
597
- return page_update, preview_update
598
-
599
- file_pdf.change(
600
- fn=on_file_pdf_change,
601
- inputs=[file_pdf],
602
- outputs=[page_selector, preview_pdf_html]
603
- )
604
-
605
- # Logic for processing
606
- btn_extract_boxes.click(
607
- fn=handle_structured_extraction,
608
- inputs=[file_pdf, page_selector],
609
- outputs=[box_vis_html, latex_output, raw_json_output]
610
- )
611
-
612
- if __name__ == "__main__":
613
- port = int(os.getenv("PORT", "7860"))
614
- # Use queue() for better handling of long-running model inference
615
- demo.queue(max_size=64).launch(server_name="0.0.0.0", server_port=port, share=False)
 
 
 
 
 
 
 
 
 
 
 
  import gradio as gr
+ import pytesseract
+ from PIL import Image
  from pdf2image import convert_from_path
+ import os
+ import tempfile
+
+ # ----------------------------------------------------------------------
+ # 1. OCR Core Function
+ # ----------------------------------------------------------------------
+
+ def perform_ocr_on_pdf(pdf_file_path, language="eng"):
      """
+     Converts a PDF file to images and performs OCR on each page.
+
+     Args:
+         pdf_file_path (str): The file path to the uploaded PDF.
+         language (str): The Tesseract language code (e.g., 'eng', 'fra+deu').
+
+     Returns:
+         str: The combined extracted text from all PDF pages.
+     """
+     if pdf_file_path is None:
+         return "Please upload a PDF file."
+
+     extracted_text = []
+
      try:
+         # 1. Convert PDF pages to PIL images (requires poppler-utils, installed via Dockerfile).
+         # Setting a high DPI (300) improves OCR accuracy for scanned documents.
+         images = convert_from_path(pdf_file_path, dpi=300)
+
+         # 2. Iterate through each page image and perform OCR
+         for i, image in enumerate(images):
+             # convert_from_path returns PIL Image objects, which pytesseract accepts
+             # directly, so there is no need to save each page to a temporary file first.
+
+             # Perform OCR on the page image
+             page_text = pytesseract.image_to_string(image, lang=language)
+
+             extracted_text.append(f"--- PAGE {i+1} ---\n{page_text}\n")
+
+         return "\n".join(extracted_text)
+
+     except pytesseract.TesseractNotFoundError:
+         return "Error: Tesseract is not installed or not in PATH. This should be handled by the Dockerfile."
      except Exception as e:
+         return f"An error occurred during OCR processing: {str(e)}"
+
+ # ----------------------------------------------------------------------
+ # 2. Gradio Interface
+ # ----------------------------------------------------------------------
+
+ # Define the supported languages for the dropdown
+ LANGUAGES = {
+     "English": "eng",
+     "Spanish": "spa",
+     "French": "fra",
+     "German": "deu",
+     "Japanese": "jpn",
+     "Chinese (Simplified)": "chi_sim"
+ }
+
+ # Create the Gradio interface components
+ pdf_input = gr.File(
+     label="Upload PDF Document",
+     file_types=[".pdf"],
+     type="filepath",
+     interactive=True
+ )
+
+ lang_dropdown = gr.Dropdown(
+     label="Select OCR Language",
+     choices=list(LANGUAGES.keys()),
+     value="English",
+     type="value",
+     interactive=True
+ )
+
+ ocr_output = gr.Textbox(
+     label="Extracted Text (Output)",
+     lines=25,
+     max_lines=30,
+     show_copy_button=True,
+     placeholder="Extracted text will appear here...",
+ )
+
+ # Custom wrapper to map the dropdown name back to the Tesseract code
+ def lang_wrapper(file_path, lang_name):
+     lang_code = LANGUAGES.get(lang_name, "eng")
+     return perform_ocr_on_pdf(file_path, lang_code)
+
+ # Create the Gradio Interface
+ gr.Interface(
+     fn=lang_wrapper,
+     inputs=[pdf_input, lang_dropdown],
+     outputs=ocr_output,
+     title="PDF Optical Character Recognition (OCR) App",
+     description=(
+         "Upload a PDF file to extract text from it using Tesseract OCR. "
+         "Select the primary language to improve accuracy. "
+         "Note: Requires Tesseract and Poppler system dependencies."
+     ),
+     allow_flagging="never",
+     theme=gr.themes.Soft(primary_hue="blue").set(
+         body_background_fill="#f5f7fa",
+         background_fill_primary="#ffffff",
+         shadow_drop_lg="0 10px 15px -3px rgba(0,0,0,0.1), 0 4px 6px -2px rgba(0,0,0,0.05)",
+     )
+ ).launch(server_name="0.0.0.0", server_port=7860)
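
For quick local verification of the new pipeline outside Gradio, a smoke test along the following lines mirrors what perform_ocr_on_pdf does for a single page. This is a minimal sketch, not part of the commit: "sample.pdf" is a placeholder path, pytesseract.get_languages is only available in pytesseract 0.3.8 or newer, and the first_page/last_page arguments of convert_from_path (the same ones the removed _pdf_to_page_image helper used) limit conversion to one page.

# smoke_test.py - illustrative sketch only; "sample.pdf" is a placeholder path.
import pytesseract
from pdf2image import convert_from_path

if __name__ == "__main__":
    # Confirm the system dependencies the app expects (Tesseract binary and language packs).
    print("Tesseract version:", pytesseract.get_tesseract_version())
    print("Installed languages:", pytesseract.get_languages(config=""))

    # Render only the first page (first_page/last_page are 1-indexed in pdf2image),
    # then OCR it the same way perform_ocr_on_pdf does for every page.
    pages = convert_from_path("sample.pdf", dpi=300, first_page=1, last_page=1)
    text = pytesseract.image_to_string(pages[0], lang="eng")
    print(f"--- PAGE 1 ---\n{text}")

As the in-code comments note, Poppler must still be installed and on PATH for convert_from_path to work, and Tesseract language packs beyond English must be present for the non-English dropdown choices to succeed.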