GLM-OCR-Demo

Sleeping

App Files Files Community

nicolet8 committed on Feb 25

Commit

a192b42

verified ·

1 Parent(s): 357ce5b

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -118

app.py CHANGED Viewed

@@ -1,15 +1,21 @@
-import gradio as gr
-import torch
-import spaces
 import os
 import tempfile
 from PIL import Image, ImageOps
-from typing import Iterable
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 colors.hot_pink = colors.Color(
     name="hot_pink",
     c50="#FFF0F5",
@@ -166,22 +172,6 @@ body, .gradio-container {
     background: rgba(255, 105, 180, 0.02) !important;
 }
-/* Radio buttons */
-.gradio-radio label {
-    border-radius: 6px !important;
-    transition: all 0.2s ease !important;
-    border: 1px solid transparent !important;
-}
-.gradio-radio label:hover {
-    background: rgba(255, 105, 180, 0.05) !important;
-}
-.gradio-radio label.selected {
-    background: rgba(255, 105, 180, 0.1) !important;
-    border-color: #FF69B4 !important;
-}
 /* Primary button */
 .primary {
     border-radius: 8px !important;
@@ -215,72 +205,13 @@ body, .gradio-container {
     line-height: 1.7 !important;
 }
-.gradio-markdown code {
-    font-family: 'IBM Plex Mono', monospace !important;
-    background: rgba(255, 105, 180, 0.08) !important;
-    padding: 2px 6px !important;
-    border-radius: 4px !important;
-    color: #CC4C8C !important;
-}
-.gradio-markdown pre {
-    background: rgba(255, 105, 180, 0.05) !important;
-    border: 1px solid #FFC0D9 !important;
-    border-radius: 8px !important;
-    padding: 1rem !important;
-}
-/* Examples */
-.gradio-examples .gallery-item {
-    border: 2px solid #FFC0D9 !important;
-    border-radius: 8px !important;
-    transition: all 0.2s ease !important;
-}
-.gradio-examples .gallery-item:hover {
-    border-color: #FF69B4 !important;
-    transform: translateY(-2px) !important;
-    box-shadow: 0 4px 12px rgba(255, 105, 180, 0.15) !important;
-}
-/* Scrollbar */
-::-webkit-scrollbar { width: 8px; height: 8px; }
-::-webkit-scrollbar-track { background: rgba(255,105,180,0.05); border-radius: 4px; }
-::-webkit-scrollbar-thumb { background: linear-gradient(135deg, #FF69B4, #FF99C4); border-radius: 4px; }
-::-webkit-scrollbar-thumb:hover { background: linear-gradient(135deg, #E55AA0, #FF69B4); }
-/* Accordion */
-.gradio-accordion {
-    border-radius: 10px !important;
-    border: 1px solid #FFC0D9 !important;
-}
-.gradio-accordion > .label-wrap {
-    background: rgba(255, 105, 180, 0.03) !important;
-    border-radius: 10px !important;
-}
-/* Animations */
-@keyframes fadeIn {
-    from { opacity: 0; transform: translateY(10px); }
-    to { opacity: 1; transform: translateY(0); }
-}
-.gradio-row { animation: fadeIn 0.4s ease-out; }
-label { font-weight: 600 !important; color: #333 !important; }
-.dark label { color: #eee !important; }
 footer { display: none !important; }
-/* Wider sidebar */
-.sidebar {
-    min-width: 420px !important;
-    max-width: 480px !important;
-}
 """
 MODEL_PATH = "zai-org/GLM-OCR"
 processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
@@ -297,22 +228,56 @@ TASK_PROMPTS = {
     "Table": "Table Recognition:",
 }
-@spaces.GPU
-def process_image(image, task):
-    """Run OCR on the uploaded image with the selected recognition type."""
-    if image is None:
-        return "Please upload an image first.", "Please upload an image first."
-    if image.mode in ("RGBA", "LA", "P"):
-        image = image.convert("RGB")
-    image = ImageOps.exif_transpose(image)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
     image.save(tmp.name, "PNG")
     tmp.close()
-    prompt = TASK_PROMPTS.get(task, "Text Recognition:")
     messages = [
         {
             "role": "user",
@@ -340,21 +305,45 @@ def process_image(image, task):
     )
     os.unlink(tmp.name)
-    result = output_text.strip()
-    return result, result
-with gr.Blocks(fill_height=True) as demo:
     with gr.Sidebar(width=450):
         gr.Markdown("# **GLM-OCR**", elem_id="main-title")
-        image_input = gr.Image(
-            type="pil",
-            label="Upload Image",
-            sources=["upload", "clipboard"],
-            height=300,
         )
         task = gr.Radio(
@@ -363,18 +352,19 @@ with gr.Blocks(fill_height=True) as demo:
             label="Recognition Type",
         )
         btn = gr.Button("Perform OCR", variant="primary")
-        gr.Examples(
-            examples=[
-                "examples/1.jpg",
-                "examples/4.jpg",
-                "examples/5.webp",
-                "examples/2.jpg",
-                "examples/3.jpg",
-            ],
-            inputs=image_input,
-            label="Examples",
         )
     gr.Markdown("## Output", elem_id="output-title")
@@ -389,12 +379,12 @@ with gr.Blocks(fill_height=True) as demo:
         output_md = gr.Markdown(label="Rendered Markdown")
     btn.click(
-        fn=process_image,
-        inputs=[image_input, task],
         outputs=[output_text, output_md],
     )
-    image_input.change(
         fn=lambda: ("", ""),
         inputs=None,
         outputs=[output_text, output_md],
@@ -402,8 +392,6 @@ with gr.Blocks(fill_height=True) as demo:
 if __name__ == "__main__":
     demo.queue(max_size=50).launch(
-        css=css,
-        theme=hot_pink_theme,
         mcp_server=True,
         ssr_mode=False,
         show_error=True,

 import os
 import tempfile
+from typing import Iterable, List, Tuple
+import fitz  # pymupdf
+import gradio as gr
+import spaces
+import torch
 from PIL import Image, ImageOps
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
+# -------------------------
+# Theme + CSS (unchanged)
+# -------------------------
 colors.hot_pink = colors.Color(
     name="hot_pink",
     c50="#FFF0F5",
     background: rgba(255, 105, 180, 0.02) !important;
 }
 /* Primary button */
 .primary {
     border-radius: 8px !important;
     line-height: 1.7 !important;
 }
 footer { display: none !important; }
 """
+# -------------------------
+# Model
+# -------------------------
 MODEL_PATH = "zai-org/GLM-OCR"
 processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
     "Table": "Table Recognition:",
 }
+# -------------------------
+# Helpers
+# -------------------------
+def _normalize_pil(img: Image.Image) -> Image.Image:
+    if img.mode in ("RGBA", "LA", "P"):
+        img = img.convert("RGB")
+    img = ImageOps.exif_transpose(img)
+    return img
+def file_to_images(file_path: str, max_pages: int = 20, dpi: int = 200) -> List[Image.Image]:
+    """
+    Convert an input file (pdf or image) into a list of PIL images.
+    Safety: limit pages for huge PDFs.
+    """
+    ext = os.path.splitext(file_path)[1].lower()
+    if ext in [".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff"]:
+        return [_normalize_pil(Image.open(file_path))]
+    if ext == ".pdf":
+        doc = fitz.open(file_path)
+        images: List[Image.Image] = []
+        zoom = dpi / 72
+        mat = fitz.Matrix(zoom, zoom)
+        n = min(len(doc), max_pages)
+        for i in range(n):
+            page = doc.load_page(i)
+            pix = page.get_pixmap(matrix=mat, alpha=False)
+            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            images.append(_normalize_pil(img))
+        doc.close()
+        return images
+    raise ValueError(f"Unsupported file type: {ext}")
+def ocr_one_image(image: Image.Image, task: str) -> str:
+    """
+    OCR one PIL image. Returns markdown-like text (model output).
+    """
+    prompt = TASK_PROMPTS.get(task, "Text Recognition:")
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
     image.save(tmp.name, "PNG")
     tmp.close()
     messages = [
         {
             "role": "user",
     )
     os.unlink(tmp.name)
+    return output_text.strip()
+@spaces.GPU
+def process_file(file_obj, task: str, max_pages: int) -> Tuple[str, str]:
+    """
+    Process an uploaded file (PDF or image).
+    Returns (raw_text, rendered_markdown).
+    """
+    if file_obj is None:
+        return "Please upload a PDF or an image first.", "Please upload a PDF or an image first."
+    # Gradio File gives an object with .name (path). Sometimes it's already a string path.
+    file_path = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
+    try:
+        pages = file_to_images(file_path, max_pages=int(max_pages), dpi=200)
+    except Exception as e:
+        return f"Failed to read file: {e}", f"Failed to read file: {e}"
+    md_pages = []
+    for i, img in enumerate(pages, start=1):
+        page_md = ocr_one_image(img, task)
+        md_pages.append(f"<!-- Page {i} -->\n\n{page_md}")
+    final_md = "\n\n---\n\n".join(md_pages)
+    return final_md, final_md
+# -------------------------
+# UI
+# -------------------------
+with gr.Blocks(fill_height=True, css=css, theme=hot_pink_theme) as demo:
     with gr.Sidebar(width=450):
         gr.Markdown("# **GLM-OCR**", elem_id="main-title")
+        file_input = gr.File(
+            label="Upload PDF or Image",
+            file_types=[".pdf", ".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff"],
         )
         task = gr.Radio(
             label="Recognition Type",
         )
+        max_pages = gr.Slider(
+            minimum=1,
+            maximum=50,
+            value=20,
+            step=1,
+            label="Max PDF Pages (safety limit)",
+        )
         btn = gr.Button("Perform OCR", variant="primary")
+        gr.Markdown(
+            "Tip: If you upload a PDF, it will OCR pages in order and join results with separators.\n"
+            "For very large PDFs, increase the page limit carefully."
         )
     gr.Markdown("## Output", elem_id="output-title")
         output_md = gr.Markdown(label="Rendered Markdown")
     btn.click(
+        fn=process_file,
+        inputs=[file_input, task, max_pages],
         outputs=[output_text, output_md],
     )
+    file_input.change(
         fn=lambda: ("", ""),
         inputs=None,
         outputs=[output_text, output_md],
 if __name__ == "__main__":
     demo.queue(max_size=50).launch(
         mcp_server=True,
         ssr_mode=False,
         show_error=True,