Spaces:

prithivMLmods
/

GLM-OCR-Demo

Running on Zero

App Files Files Community

prithivMLmods commited on Feb 18

Commit

ef119b4

verified ·

1 Parent(s): b25885d

update app

Browse files

Files changed (1) hide show

app.py +157 -222

app.py CHANGED Viewed

@@ -4,11 +4,9 @@ import spaces
 import os
 import tempfile
 from PIL import Image, ImageOps
-from threading import Thread
 from typing import Iterable
-from transformers import AutoProcessor, AutoModelForImageTextToText
-from transformers.image_utils import load_image
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
@@ -27,6 +25,7 @@ colors.hot_pink = colors.Color(
     c950="#802050",
 )
 class HotPinkTheme(Soft):
     def __init__(
         self,
@@ -36,10 +35,14 @@ class HotPinkTheme(Soft):
         neutral_hue: colors.Color | str = colors.slate,
         text_size: sizes.Size | str = sizes.text_lg,
         font: fonts.Font | str | Iterable[fonts.Font | str] = (
-            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
         ),
         font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
-            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
         ),
     ):
         super().__init__(
@@ -78,105 +81,39 @@ class HotPinkTheme(Soft):
             block_label_background_fill="*primary_200",
         )
-hot_pink_theme = HotPinkTheme()
-MODEL_PATH = "zai-org/GLM-OCR"
-processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
-model = AutoModelForImageTextToText.from_pretrained(
-    pretrained_model_name_or_path=MODEL_PATH,
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    trust_remote_code=True
-)
-TASK_PROMPTS = {
-    "Text": "Text Recognition:",
-    "Formula": "Formula Recognition:",
-    "Table": "Table Recognition:",
-}
-@spaces.GPU
-def process_image(image, task):
-    if image is None:
-        return "Please upload an image first"
-    if image.mode in ('RGBA', 'LA', 'P'):
-        image = image.convert('RGB')
-    image = ImageOps.exif_transpose(image)
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.png')
-    image.save(tmp.name, 'PNG')
-    tmp.close()
-    prompt = TASK_PROMPTS.get(task, "Text Recognition:")
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "url": tmp.name},
-                {"type": "text", "text": prompt}
-            ],
-        }
-    ]
-    inputs = processor.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,
-        return_dict=True,
-        return_tensors="pt"
-    ).to(model.device)
-    inputs.pop("token_type_ids", None)
-    generated_ids = model.generate(**inputs, max_new_tokens=8192)
-    output_text = processor.decode(
-        generated_ids[0][inputs["input_ids"].shape[1]:],
-        skip_special_tokens=True
-    )
-    os.unlink(tmp.name)
-    return output_text.strip()
 css = """
 @import url('https://fonts.googleapis.com/css2?family=Outfit:wght@400;500;600;700&family=IBM+Plex+Mono:wght@400;500;600&display=swap');
-/* Background grid pattern - Hot Pink theme */
 body, .gradio-container {
     background-color: #FFF0F5 !important;
-    background-image:
-        linear-gradient(#FFC0D9 1px, transparent 1px),
         linear-gradient(90deg, #FFC0D9 1px, transparent 1px) !important;
     background-size: 40px 40px !important;
     font-family: 'Outfit', sans-serif !important;
 }
-/* Dark mode grid */
 .dark body, .dark .gradio-container {
     background-color: #1a1a1a !important;
-    background-image:
-        linear-gradient(rgba(255, 105, 180, 0.1) 1px, transparent 1px),
         linear-gradient(90deg, rgba(255, 105, 180, 0.1) 1px, transparent 1px) !important;
     background-size: 40px 40px !important;
 }
-#col-container {
-    margin: 0 auto;
-    max-width: 1000px;
-}
-/* Main title styling */
-#main-title {
-    text-align: center !important;
-    padding: 1rem 0 0.5rem 0;
 }
 #main-title h1 {
-    font-size: 2.7em !important;
     font-weight: 700 !important;
     background: linear-gradient(135deg, #FF69B4 0%, #FF99C4 50%, #E55AA0 100%);
     background-size: 200% 200%;
@@ -187,33 +124,15 @@ body, .gradio-container {
     letter-spacing: -0.02em;
 }
 @keyframes gradient-shift {
     0%, 100% { background-position: 0% 50%; }
     50% { background-position: 100% 50%; }
 }
-/* Subtitle styling */
-#subtitle {
-    text-align: center !important;
-    margin-bottom: 1.5rem;
-}
-#subtitle p {
-    margin: 0 auto;
-    color: #666666;
-    font-size: 1rem;
-}
-#subtitle a {
-    color: #FF69B4 !important;
-    text-decoration: none;
-    font-weight: 500;
-}
-#subtitle a:hover {
-    text-decoration: underline;
-}
 /* Card styling */
 .gradio-group {
     background: rgba(255, 255, 255, 0.9) !important;
@@ -234,7 +153,7 @@ body, .gradio-container {
     border-color: rgba(255, 105, 180, 0.3) !important;
 }
-/* Image upload area */
 .gradio-image {
     border-radius: 10px !important;
     overflow: hidden;
@@ -248,10 +167,6 @@ body, .gradio-container {
 }
 /* Radio buttons */
-.gradio-radio {
-    border-radius: 8px !important;
-}
 .gradio-radio label {
     border-radius: 6px !important;
     transition: all 0.2s ease !important;
@@ -279,23 +194,6 @@ body, .gradio-container {
     transform: translateY(-2px) !important;
 }
-/* Tabs styling */
-.tab-nav {
-    border-bottom: 2px solid #FFC0D9 !important;
-}
-.tab-nav button {
-    font-weight: 500 !important;
-    padding: 10px 18px !important;
-    border-radius: 8px 8px 0 0 !important;
-    transition: all 0.2s ease !important;
-}
-.tab-nav button.selected {
-    background: rgba(255, 105, 180, 0.1) !important;
-    border-bottom: 2px solid #FF69B4 !important;
-}
 /* Output textbox */
 .gradio-textbox textarea {
     font-family: 'IBM Plex Mono', monospace !important;
@@ -332,11 +230,7 @@ body, .gradio-container {
     padding: 1rem !important;
 }
-/* Examples section */
-.gradio-examples {
-    border-radius: 10px !important;
-}
 .gradio-examples .gallery-item {
     border: 2px solid #FFC0D9 !important;
     border-radius: 8px !important;
@@ -349,27 +243,13 @@ body, .gradio-container {
     box-shadow: 0 4px 12px rgba(255, 105, 180, 0.15) !important;
 }
-/* Scrollbar styling */
-::-webkit-scrollbar {
-    width: 8px;
-    height: 8px;
-}
-::-webkit-scrollbar-track {
-    background: rgba(255, 105, 180, 0.05);
-    border-radius: 4px;
-}
-::-webkit-scrollbar-thumb {
-    background: linear-gradient(135deg, #FF69B4, #FF99C4);
-    border-radius: 4px;
-}
-::-webkit-scrollbar-thumb:hover {
-    background: linear-gradient(135deg, #E55AA0, #FF69B4);
-}
-/* Accordion styling */
 .gradio-accordion {
     border-radius: 10px !important;
     border: 1px solid #FFC0D9 !important;
@@ -380,89 +260,144 @@ body, .gradio-container {
     border-radius: 10px !important;
 }
-/* Hide footer */
-footer {
-    display: none !important;
-}
 /* Animations */
 @keyframes fadeIn {
     from { opacity: 0; transform: translateY(10px); }
     to { opacity: 1; transform: translateY(0); }
 }
-.gradio-row {
-    animation: fadeIn 0.4s ease-out;
-}
-/* Label styling */
-label {
-    font-weight: 600 !important;
-    color: #333 !important;
-}
-.dark label {
-    color: #eee !important;
-}
 """
-with gr.Blocks() as demo:
-    gr.Markdown("# **GLM-OCR**", elem_id="main-title")
-    gr.Markdown("*A multimodal [OCR model](https://huggingface.co/zai-org/GLM-OCR) for complex document understanding.*", elem_id="subtitle")
-    with gr.Row():
-        with gr.Column(scale=1):
-            image_input = gr.Image(
-                type="pil",
-                label="Upload Image",
-                sources=["upload", "clipboard"],
-                height=300
-            )
-            with gr.Row():
-                task = gr.Radio(
-                    choices=list(TASK_PROMPTS.keys()),
-                    value="Text",
-                    label="Recognition Type"
-                )
-            with gr.Row():
-                btn = gr.Button("Perform OCR", variant="primary")
-            gr.Examples(
-                examples=["examples/1.jpg", "examples/4.jpg", "examples/5.webp", "examples/2.jpg", "examples/3.jpg"],
-                inputs=image_input,
-                label="Examples"
-            )
-        with gr.Column(scale=1):
-            with gr.Tabs():
-                with gr.Tab("Text"):
-                    output_text = gr.Textbox(
-                        label="Output",
-                        lines=18,
-                        interactive=True,
-                    )
-                with gr.Tab("Markdown"):
-                    output_md = gr.Markdown(value="")
-    def run_ocr(image, task):
-        result = process_image(image, task)
-        return result, result
     btn.click(
-        run_ocr,
-        [image_input, task],
-        [output_text, output_md]
     )
     image_input.change(
-        lambda: ("", ""),
-        None,
-        [output_text, output_md]
     )
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(css=css, theme=hot_pink_theme, mcp_server=True, ssr_mode=False, show_error=True)

 import os
 import tempfile
 from PIL import Image, ImageOps
 from typing import Iterable
+from transformers import AutoProcessor, AutoModelForImageTextToText
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
     c950="#802050",
 )
 class HotPinkTheme(Soft):
     def __init__(
         self,
         neutral_hue: colors.Color | str = colors.slate,
         text_size: sizes.Size | str = sizes.text_lg,
         font: fonts.Font | str | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("Outfit"),
+            "Arial",
+            "sans-serif",
         ),
         font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("IBM Plex Mono"),
+            "ui-monospace",
+            "monospace",
         ),
     ):
         super().__init__(
             block_label_background_fill="*primary_200",
         )
+hot_pink_theme = HotPinkTheme()
 css = """
 @import url('https://fonts.googleapis.com/css2?family=Outfit:wght@400;500;600;700&family=IBM+Plex+Mono:wght@400;500;600&display=swap');
+/* Grid background */
 body, .gradio-container {
     background-color: #FFF0F5 !important;
+    background-image:
+        linear-gradient(#FFC0D9 1px, transparent 1px),
         linear-gradient(90deg, #FFC0D9 1px, transparent 1px) !important;
     background-size: 40px 40px !important;
     font-family: 'Outfit', sans-serif !important;
 }
 .dark body, .dark .gradio-container {
     background-color: #1a1a1a !important;
+    background-image:
+        linear-gradient(rgba(255, 105, 180, 0.1) 1px, transparent 1px),
         linear-gradient(90deg, rgba(255, 105, 180, 0.1) 1px, transparent 1px) !important;
     background-size: 40px 40px !important;
 }
+/* Sidebar width */
+.gradio-sidebar {
+    min-width: 420px !important;
+    max-width: 480px !important;
 }
+/* Titles */
 #main-title h1 {
+    font-size: 2.5em !important;
     font-weight: 700 !important;
     background: linear-gradient(135deg, #FF69B4 0%, #FF99C4 50%, #E55AA0 100%);
     background-size: 200% 200%;
     letter-spacing: -0.02em;
 }
+#output-title h2 {
+    font-size: 2.2em !important;
+}
 @keyframes gradient-shift {
     0%, 100% { background-position: 0% 50%; }
     50% { background-position: 100% 50%; }
 }
 /* Card styling */
 .gradio-group {
     background: rgba(255, 255, 255, 0.9) !important;
     border-color: rgba(255, 105, 180, 0.3) !important;
 }
+/* Image upload */
 .gradio-image {
     border-radius: 10px !important;
     overflow: hidden;
 }
 /* Radio buttons */
 .gradio-radio label {
     border-radius: 6px !important;
     transition: all 0.2s ease !important;
     transform: translateY(-2px) !important;
 }
 /* Output textbox */
 .gradio-textbox textarea {
     font-family: 'IBM Plex Mono', monospace !important;
     padding: 1rem !important;
 }
+/* Examples */
 .gradio-examples .gallery-item {
     border: 2px solid #FFC0D9 !important;
     border-radius: 8px !important;
     box-shadow: 0 4px 12px rgba(255, 105, 180, 0.15) !important;
 }
+/* Scrollbar */
+::-webkit-scrollbar { width: 8px; height: 8px; }
+::-webkit-scrollbar-track { background: rgba(255,105,180,0.05); border-radius: 4px; }
+::-webkit-scrollbar-thumb { background: linear-gradient(135deg, #FF69B4, #FF99C4); border-radius: 4px; }
+::-webkit-scrollbar-thumb:hover { background: linear-gradient(135deg, #E55AA0, #FF69B4); }
+/* Accordion */
 .gradio-accordion {
     border-radius: 10px !important;
     border: 1px solid #FFC0D9 !important;
     border-radius: 10px !important;
 }
 /* Animations */
 @keyframes fadeIn {
     from { opacity: 0; transform: translateY(10px); }
     to { opacity: 1; transform: translateY(0); }
 }
+.gradio-row { animation: fadeIn 0.4s ease-out; }
+label { font-weight: 600 !important; color: #333 !important; }
+.dark label { color: #eee !important; }
+footer { display: none !important; }
 """
+MODEL_PATH = "zai-org/GLM-OCR"
+processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
+model = AutoModelForImageTextToText.from_pretrained(
+    pretrained_model_name_or_path=MODEL_PATH,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    trust_remote_code=True,
+)
+TASK_PROMPTS = {
+    "Text": "Text Recognition:",
+    "Formula": "Formula Recognition:",
+    "Table": "Table Recognition:",
+}
+@spaces.GPU
+def process_image(image, task):
+    """Run OCR on the uploaded image with the selected recognition type."""
+    if image is None:
+        return "Please upload an image first.", "Please upload an image first."
+    if image.mode in ("RGBA", "LA", "P"):
+        image = image.convert("RGB")
+    image = ImageOps.exif_transpose(image)
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
+    image.save(tmp.name, "PNG")
+    tmp.close()
+    prompt = TASK_PROMPTS.get(task, "Text Recognition:")
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "url": tmp.name},
+                {"type": "text", "text": prompt},
+            ],
+        }
+    ]
+    inputs = processor.apply_chat_template(
+        messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_dict=True,
+        return_tensors="pt",
+    ).to(model.device)
+    inputs.pop("token_type_ids", None)
+    generated_ids = model.generate(**inputs, max_new_tokens=8192)
+    output_text = processor.decode(
+        generated_ids[0][inputs["input_ids"].shape[1] :],
+        skip_special_tokens=True,
+    )
+    os.unlink(tmp.name)
+    result = output_text.strip()
+    return result, result
+with gr.Blocks(fill_height=True) as demo:
+    with gr.Sidebar():
+        gr.Markdown("# **GLM-OCR**", elem_id="main-title")
+        image_input = gr.Image(
+            type="pil",
+            label="Upload Image",
+            sources=["upload", "clipboard"],
+            height=300,
+        )
+        task = gr.Radio(
+            choices=list(TASK_PROMPTS.keys()),
+            value="Text",
+            label="Recognition Type",
+        )
+        btn = gr.Button("Perform OCR", variant="primary")
+        gr.Examples(
+            examples=[
+                "examples/1.jpg",
+                "examples/4.jpg",
+                "examples/5.webp",
+                "examples/2.jpg",
+                "examples/3.jpg",
+            ],
+            inputs=image_input,
+            label="Examples",
+        )
+    gr.Markdown("## Output", elem_id="output-title")
+    output_text = gr.Textbox(
+        label="Raw Output Stream",
+        interactive=True,
+        lines=22,
+    )
+    with gr.Accordion("(Result.md)", open=False):
+        output_md = gr.Markdown(label="Rendered Markdown")
     btn.click(
+        fn=process_image,
+        inputs=[image_input, task],
+        outputs=[output_text, output_md],
     )
     image_input.change(
+        fn=lambda: ("", ""),
+        inputs=None,
+        outputs=[output_text, output_md],
     )
 if __name__ == "__main__":
+    demo.queue(max_size=50).launch(
+        css=css,
+        theme=hot_pink_theme,
+        mcp_server=True,
+        ssr_mode=False,
+        show_error=True,
+    )