prithivMLmods commited on
Commit
fb8008a
·
verified ·
1 Parent(s): 0282214

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +364 -399
app.py CHANGED
@@ -1,443 +1,408 @@
1
  import gradio as gr
2
  import torch
3
  import spaces
4
- import base64
5
- import io
6
- from PIL import Image
7
  from transformers import AutoProcessor, AutoModelForImageTextToText
8
 
9
- # -----------------------------------------------------------------------------
10
- # Model Initialization
11
- # -----------------------------------------------------------------------------
12
-
13
  MODEL_PATH = "zai-org/GLM-OCR"
14
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
15
-
16
- print(f"Loading model on {DEVICE}...")
17
-
18
- # Load Processor
19
- try:
20
- processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
21
- # Load Model
22
- model = AutoModelForImageTextToText.from_pretrained(
23
- pretrained_model_name_or_path=MODEL_PATH,
24
- torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
25
- trust_remote_code=True,
26
- device_map="auto" if torch.cuda.is_available() else None,
27
- )
28
- if DEVICE == "cpu":
29
- model = model.to("cpu") # explicit fallback if no gpu
30
-
31
- print("Model loaded successfully.")
32
- except Exception as e:
33
- print(f"Error loading model: {e}")
34
- # Fallback for building UI without model (for debugging/building phase)
35
- processor = None
36
- model = None
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- # -----------------------------------------------------------------------------
40
- # Inference Logic
41
- # -----------------------------------------------------------------------------
42
 
43
- @spaces.GPU
44
- def run_inference(image_b64, task_prompt):
45
- if not image_b64:
46
- return "Please upload an image first."
47
-
48
- if model is None:
49
- return "Model not loaded correctly. Check logs."
50
 
51
- try:
52
- # 1. Decode Base64 to PIL Image
53
- if "base64," in image_b64:
54
- image_b64 = image_b64.split("base64,")[1]
55
-
56
- image_data = base64.b64decode(image_b64)
57
- image = Image.open(io.BytesIO(image_data)).convert("RGB")
58
 
59
- # 2. Prepare Messages
60
- # The prompt is selected via the radio buttons
61
- messages = [
62
- {
63
- "role": "user",
64
- "content": [
65
- {
66
- "type": "image",
67
- "image": image,
68
- },
69
- {
70
- "type": "text",
71
- "text": task_prompt
72
- }
73
- ],
74
- }
75
- ]
76
 
77
- # 3. Process Inputs
78
- inputs = processor.apply_chat_template(
79
- messages,
80
- tokenize=True,
81
- add_generation_prompt=True,
82
- return_dict=True,
83
- return_tensors="pt"
84
- ).to(model.device)
85
 
86
- # Remove token_type_ids if present (transformers fix)
87
- inputs.pop("token_type_ids", None)
 
 
 
88
 
89
- # 4. Generate
90
- with torch.no_grad():
91
- generated_ids = model.generate(
92
- **inputs,
93
- max_new_tokens=2048,
94
- do_sample=False, # Deterministic for OCR usually better
95
- temperature=0.01
96
- )
97
 
98
- # 5. Decode
99
- output_text = processor.decode(
100
- generated_ids[0][inputs["input_ids"].shape[1]:],
101
- skip_special_tokens=False
102
- )
103
-
104
- # Clean up tags usually returned by VLM
105
- output_text = output_text.replace("<|endoftext|>", "").strip()
106
-
107
- return output_text
108
 
109
- except Exception as e:
110
- return f"Error during inference: {str(e)}"
111
-
112
- # -----------------------------------------------------------------------------
113
- # Custom Component & UI Assets
114
- # -----------------------------------------------------------------------------
115
-
116
- # CSS from your snippet + additions for image preview and layout
117
- CUSTOM_CSS = """
118
- /* Reset & Layout */
119
- .container {
120
- position: relative;
121
- max-width: 600px;
122
- width: 100%;
123
- background: #FCEDDA;
124
- padding: 25px;
125
- border-radius: 8px;
126
- box-shadow: 0 0 15px rgba(0, 0, 0, 0.1);
127
- margin: 0 auto;
128
- font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
129
- }
130
-
131
- .container header {
132
- font-size: 1.5rem;
133
- color: #000;
134
- font-weight: 600;
135
- text-align: center;
136
- margin-bottom: 20px;
137
- }
138
-
139
- .form {
140
- margin-top: 15px;
141
- }
142
-
143
- .input-box {
144
- width: 100%;
145
- margin-top: 15px;
146
- }
147
-
148
- .input-box label {
149
- color: #000;
150
- font-weight: 500;
151
- margin-bottom: 5px;
152
- display: block;
153
- }
154
-
155
- /* Custom Upload Area */
156
- .upload-area {
157
- width: 100%;
158
- min-height: 150px;
159
- background: #fff8f0;
160
- border: 2px dashed #EE4E34;
161
- border-radius: 6px;
162
  display: flex;
163
- flex-direction: column;
164
  align-items: center;
165
- justify-content: center;
166
- cursor: pointer;
167
- transition: background 0.2s;
168
- padding: 10px;
169
- }
170
- .upload-area:hover {
171
- background: #fff0e0;
172
- }
173
- .upload-text {
174
- color: #808080;
175
- margin-top: 10px;
176
- }
177
- #preview-img {
178
- max-width: 100%;
179
- max-height: 300px;
180
- border-radius: 4px;
181
- display: none;
182
- box-shadow: 0 2px 5px rgba(0,0,0,0.1);
183
- }
184
-
185
- /* Radio Buttons */
186
- .gender-box {
187
- margin-top: 20px;
188
- }
189
- .gender-option {
190
- display: flex;
191
- align-items: center;
192
- column-gap: 20px;
193
- flex-wrap: wrap;
194
- margin-top: 10px;
195
- background: #fff8f0;
196
- padding: 10px;
197
- border-radius: 6px;
198
- border: 1px solid #EE4E34;
199
- }
200
- .gender {
201
- display: flex;
202
- align-items: center;
203
- column-gap: 5px;
204
- }
205
- .gender input {
206
- accent-color: #EE4E34;
207
- width: 18px;
208
- height: 18px;
209
- cursor: pointer;
210
- }
211
- .gender label {
212
- cursor: pointer;
213
- margin: 0; /* Reset margin from input-box label */
214
- }
215
-
216
- /* Textarea Output */
217
- textarea.result-field {
218
- width: 100%;
219
- height: 200px;
220
- padding: 15px;
221
- outline: none;
222
- font-size: 0.95rem;
223
- color: #333;
224
- margin-top: 5px;
225
- border: 1px solid #EE4E34;
226
- border-radius: 6px;
227
- background: #fff;
228
- resize: vertical;
229
- font-family: monospace;
230
  }
231
 
232
- /* Submit Button */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  .submit-btn {
234
- height: 45px;
235
- width: 100%;
236
- color: #fff;
237
- font-size: 1.1rem;
238
- font-weight: 500;
239
- margin-top: 25px;
240
- border: none;
241
- border-radius: 6px;
242
- cursor: pointer;
243
- transition: all 0.2s ease;
244
- background: #EE4E34;
 
245
  }
 
246
  .submit-btn:hover {
247
- background: #d63d24;
 
 
 
 
 
 
 
 
 
 
 
248
  }
249
- .submit-btn:disabled {
250
- background: #fabab5;
251
- cursor: not-allowed;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  }
253
- .status-msg {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  text-align: center;
255
- margin-top: 10px;
 
256
  font-size: 0.9rem;
257
- min-height: 20px;
258
  }
259
- """
260
 
261
- # JavaScript to handle interactions and bridge with Gradio
262
- CUSTOM_JS = """
263
- <script>
264
- function initOcrUI() {
265
- const fileInput = document.getElementById('hidden-file-input');
266
- const uploadArea = document.getElementById('upload-area');
267
- const previewImg = document.getElementById('preview-img');
268
- const uploadText = document.getElementById('upload-text');
269
- const submitBtn = document.getElementById('custom-submit');
270
- const resultArea = document.getElementById('result-area');
271
- const statusMsg = document.getElementById('status-msg');
272
-
273
- // Trigger file input
274
- uploadArea.onclick = () => fileInput.click();
275
-
276
- // Handle File Selection
277
- fileInput.onchange = (e) => {
278
- const file = e.target.files[0];
279
- if (file) {
280
- const reader = new FileReader();
281
- reader.onload = (evt) => {
282
- const b64 = evt.target.result;
283
- // Show Preview
284
- previewImg.src = b64;
285
- previewImg.style.display = 'block';
286
- uploadText.style.display = 'none';
287
-
288
- // Update Hidden Gradio Component
289
- updateGradioImage(b64);
290
- }
291
- reader.readAsDataURL(file);
292
- }
293
- };
294
 
295
- // Handle Submit
296
- submitBtn.onclick = (e) => {
297
- e.preventDefault();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
 
299
- // Get selected Task
300
- const task = document.querySelector('input[name="task"]:checked').value;
 
 
 
 
 
 
 
 
301
 
302
- // Update Hidden Gradio Task Input
303
- updateGradioTask(task);
 
 
 
 
 
 
 
304
 
305
- // Visual Feedback
306
- submitBtn.innerText = "Processing...";
307
- submitBtn.disabled = true;
308
- statusMsg.innerText = "Model is running. Please wait...";
309
- resultArea.value = ""; // Clear previous
 
310
 
311
- // Trigger Hidden Gradio Button
312
- const gradioBtn = document.getElementById('bridge-btn');
313
- if (gradioBtn) gradioBtn.click();
314
- };
315
-
316
- // --- Bridge Functions ---
317
 
318
- function updateGradioImage(b64Data) {
319
- const ta = document.querySelector('#bridge-img-input textarea');
320
- if (ta) {
321
- ta.value = b64Data;
322
- ta.dispatchEvent(new Event('input', { bubbles: true }));
323
- }
324
- }
325
-
326
- function updateGradioTask(taskVal) {
327
- const ta = document.querySelector('#bridge-task-input textarea');
328
- if (ta) {
329
- ta.value = taskVal;
330
- ta.dispatchEvent(new Event('input', { bubbles: true }));
331
- }
332
- }
333
- }
334
-
335
- // Function called by Gradio when output changes
336
- function updateResultUI(text) {
337
- const resultArea = document.getElementById('result-area');
338
- const submitBtn = document.getElementById('custom-submit');
339
- const statusMsg = document.getElementById('status-msg');
340
 
341
- if(resultArea) resultArea.value = text;
342
- if(submitBtn) {
343
- submitBtn.innerText = "Submit";
344
- submitBtn.disabled = false;
345
- }
346
- if(statusMsg) statusMsg.innerText = "Extraction complete.";
347
- }
348
-
349
- // Initialize after a slight delay to ensure DOM is ready
350
- setTimeout(initOcrUI, 1000);
351
- </script>
352
- """
353
 
354
- HTML_TEMPLATE = """
355
- <div class="container">
356
- <header>GLM-OCR Interface</header>
357
-
358
- <div class="form">
359
-
360
- <!-- Image Input Section -->
361
- <div class="input-box">
362
- <label>Document Image</label>
363
- <div class="upload-area" id="upload-area">
364
- <span class="upload-text" id="upload-text">Click to Upload Image</span>
365
- <img id="preview-img" alt="Preview"/>
366
- </div>
367
- <input type="file" id="hidden-file-input" style="display:none" accept="image/*">
368
- </div>
369
-
370
- <!-- Task Selection -->
371
- <div class="gender-box">
372
- <label>Extraction Mode</label>
373
- <div class="gender-option">
374
- <div class="gender">
375
- <input type="radio" id="check-text" name="task" value="Text Recognition:" checked>
376
- <label for="check-text">Text</label>
377
- </div>
378
- <div class="gender">
379
- <input type="radio" id="check-formula" name="task" value="Formula Recognition:">
380
- <label for="check-formula">Formula</label>
381
- </div>
382
- <div class="gender">
383
- <input type="radio" id="check-table" name="task" value="Table Recognition:">
384
- <label for="check-table">Table</label>
385
- </div>
386
- </div>
387
- </div>
388
-
389
- <!-- Submit Action -->
390
- <button class="submit-btn" id="custom-submit">Submit</button>
391
- <div class="status-msg" id="status-msg"></div>
392
-
393
- <!-- Result Output -->
394
- <div class="input-box">
395
- <label>Extraction Result</label>
396
- <textarea id="result-area" class="result-field" readonly placeholder="Output will appear here..."></textarea>
397
- </div>
398
-
399
- </div>
400
- </div>
401
- """
402
-
403
- class GlmOcr(gr.HTML):
404
- """Custom component wrapper to render the specific UI"""
405
- def __init__(self):
406
- super().__init__(value=HTML_TEMPLATE + CUSTOM_JS)
407
-
408
- # -----------------------------------------------------------------------------
409
- # Gradio App Structure
410
- # -----------------------------------------------------------------------------
411
 
 
412
  with gr.Blocks(title="GLM-OCR") as demo:
413
 
414
- # 1. The Custom UI
415
- GlmOcr()
416
-
417
- # 2. Hidden Bridge Components (To transfer data between Custom HTML and Python)
418
- with gr.Row(visible=False):
419
- # Stores Base64 string of the image
420
- bridge_img_input = gr.Textbox(elem_id="bridge-img-input", label="Hidden Img")
421
- # Stores the selected task string
422
- bridge_task_input = gr.Textbox(elem_id="bridge-task-input", value="Text Recognition:", label="Hidden Task")
423
- # The trigger button clicked by JS
424
- bridge_btn = gr.Button("Run", elem_id="bridge-btn")
425
- # The output storage, watched by JS
426
- bridge_output = gr.Textbox(elem_id="bridge-output", label="Hidden Output")
427
-
428
- # 3. Python Logic Connections
429
- bridge_btn.click(
430
- fn=run_inference,
431
- inputs=[bridge_img_input, bridge_task_input],
432
- outputs=[bridge_output]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
  )
434
-
435
- # 4. Feedback Loop: When python output changes, update HTML via JS
436
- bridge_output.change(
437
- fn=None,
438
- inputs=[bridge_output],
439
- js="(v) => updateResultUI(v)"
440
  )
441
 
 
442
  if __name__ == "__main__":
443
- demo.launch(css=CUSTOM_CSS, ssr_mode=False)
 
 
 
 
 
1
import gradio as gr
import torch
import spaces
import os
import tempfile
from PIL import Image, ImageOps
from transformers import AutoProcessor, AutoModelForImageTextToText

# Model configuration: Hugging Face Hub id of the OCR checkpoint.
MODEL_PATH = "zai-org/GLM-OCR"

# Load model and processor once at import time (module-level side effect:
# downloads/loads weights when the app starts). trust_remote_code lets the
# repo's own modeling code run — presumably required by this checkpoint;
# device_map="auto" places the model on GPU when available, CPU otherwise.
processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForImageTextToText.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True
)

# Task prompts for document parsing: UI radio label -> prompt prefix
# sent to the model alongside the image.
TASK_PROMPTS = {
    "Text": "Text Recognition:",
    "Formula": "Formula Recognition:",
    "Table": "Table Recognition:"
}
 
28
# Custom CSS based on the provided theme (orange #EE4E34 on cream #FCEDDA).
# Passed to demo.launch(css=...) at the bottom of the file; the class names
# used here match the elem_classes set on the Gradio components below.
css = """
@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@400;500;600;700&display=swap');

* {
    font-family: 'Outfit', sans-serif !important;
}

body, .gradio-container {
    background: linear-gradient(135deg, #FCEDDA, #FFF5EB) !important;
    min-height: 100vh;
}

.main-header {
    text-align: center;
    padding: 20px 0 30px 0;
}

.main-header h1 {
    font-size: 2.8rem;
    color: #EE4E34;
    font-weight: 700;
    margin: 0;
    text-shadow: 2px 2px 4px rgba(0,0,0,0.1);
}

.main-header p {
    color: #555;
    font-size: 1.1rem;
    margin-top: 8px;
}

.form-section {
    background: #FCEDDA;
    padding: 25px;
    border-radius: 12px;
    box-shadow: 0 4px 20px rgba(0, 0, 0, 0.08);
    border: 2px solid #EE4E34;
}

.form-section label {
    color: #000 !important;
    font-weight: 600 !important;
    font-size: 1rem !important;
}

.output-section {
    background: #FCEDDA;
    padding: 25px;
    border-radius: 12px;
    box-shadow: 0 4px 20px rgba(0, 0, 0, 0.08);
    border: 2px solid #EE4E34;
}

.output-header {
    color: #EE4E34;
    font-size: 1.2rem;
    font-weight: 600;
    margin-bottom: 15px;
    display: flex;
    align-items: center;
    gap: 8px;
}

/* Image upload styling */
.image-upload-area {
    border: 2px dashed #EE4E34 !important;
    border-radius: 10px !important;
    background: rgba(255, 255, 255, 0.6) !important;
    transition: all 0.3s ease !important;
}

.image-upload-area:hover {
    background: rgba(255, 255, 255, 0.9) !important;
    border-color: #D43E2A !important;
}

/* Radio buttons styling */
.task-radio-group {
    margin: 15px 0;
}

.task-radio-group .wrap {
    gap: 15px !important;
}

.task-radio-group label {
    background: #fff !important;
    border: 2px solid #EE4E34 !important;
    border-radius: 8px !important;
    padding: 12px 24px !important;
    cursor: pointer !important;
    transition: all 0.3s ease !important;
    font-weight: 500 !important;
    color: #000 !important;
}

.task-radio-group label:hover {
    background: #FFF0E5 !important;
}

.task-radio-group input:checked + label {
    background: #EE4E34 !important;
    color: #fff !important;
}

/* Submit button */
.submit-btn {
    width: 100% !important;
    height: 48px !important;
    background: linear-gradient(90deg, #EE4E34, #FF6B4E) !important;
    color: #fff !important;
    font-size: 1.1rem !important;
    font-weight: 600 !important;
    border: none !important;
    border-radius: 8px !important;
    cursor: pointer !important;
    transition: all 0.3s ease !important;
    margin-top: 15px !important;
    box-shadow: 0 4px 15px rgba(238, 78, 52, 0.3) !important;
}

.submit-btn:hover {
    background: linear-gradient(90deg, #D43E2A, #EE4E34) !important;
    transform: translateY(-2px) !important;
    box-shadow: 0 6px 20px rgba(238, 78, 52, 0.4) !important;
}

/* Output textarea */
.output-textbox textarea {
    background: #fff !important;
    border: 1px solid #EE4E34 !important;
    border-radius: 8px !important;
    font-size: 0.95rem !important;
    line-height: 1.6 !important;
}

.output-textbox textarea:focus {
    border-color: #D43E2A !important;
    box-shadow: 0 0 0 3px rgba(238, 78, 52, 0.1) !important;
}

/* Tabs styling */
.output-tabs .tab-nav {
    background: transparent !important;
    border-bottom: 2px solid #EE4E34 !important;
    gap: 5px !important;
}

.output-tabs .tab-nav button {
    background: transparent !important;
    color: #555 !important;
    font-weight: 500 !important;
    border: none !important;
    padding: 10px 20px !important;
    border-radius: 8px 8px 0 0 !important;
    transition: all 0.2s ease !important;
}

.output-tabs .tab-nav button:hover {
    color: #EE4E34 !important;
    background: rgba(238, 78, 52, 0.1) !important;
}

.output-tabs .tab-nav button.selected {
    color: #EE4E34 !important;
    background: #fff !important;
    border: 2px solid #EE4E34 !important;
    border-bottom: 2px solid #fff !important;
    margin-bottom: -2px !important;
}

/* Markdown preview */
.markdown-preview {
    background: #fff;
    padding: 20px;
    border-radius: 8px;
    border: 1px solid #EE4E34;
    min-height: 300px;
}

/* Accordion */
.examples-accordion {
    border: 1px solid #EE4E34 !important;
    border-radius: 8px !important;
    background: rgba(255, 255, 255, 0.5) !important;
    margin-top: 15px !important;
}

.examples-accordion .label-wrap {
    color: #EE4E34 !important;
    font-weight: 600 !important;
}

/* Footer */
.footer-section {
    text-align: center;
    padding: 25px 0;
    color: #666;
    font-size: 0.9rem;
}

.footer-section a {
    color: #EE4E34;
    text-decoration: none;
    font-weight: 500;
}

.footer-section a:hover {
    text-decoration: underline;
}

/* Copy button */
.copy-btn {
    background: #EE4E34 !important;
    color: #fff !important;
}

/* Loading animation */
.generating {
    border-color: #EE4E34 !important;
}
"""
251
+
252
@spaces.GPU
def process_image(image, task):
    """Run GLM-OCR on an uploaded image.

    Args:
        image: PIL.Image.Image from the Gradio image component, or None.
        task: Key into TASK_PROMPTS ("Text", "Formula" or "Table").

    Returns:
        A (text, markdown) tuple: the recognized text for the plain-text
        box and the same text for the Markdown preview tab. On error the
        first element carries the message and the second is empty.
    """
    if image is None:
        return "⚠️ Please upload an image first.", ""

    # Track the staged file path so `finally` can clean up even when the
    # failure happens before/while writing it (the previous version created
    # the temp file outside the try block and leaked it on save errors).
    tmp_path = None
    try:
        # Respect EXIF orientation first, then normalize the color mode
        # so the staged PNG is plain RGB.
        image = ImageOps.exif_transpose(image)
        if image.mode in ('RGBA', 'LA', 'P'):
            image = image.convert('RGB')

        # The chat template consumes an image path/URL, so stage the image
        # in a temp file; removed in `finally`.
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.png')
        tmp_path = tmp.name
        tmp.close()
        image.save(tmp_path, 'PNG')

        # Unknown task labels fall back to plain text recognition instead
        # of raising KeyError.
        prompt_text = TASK_PROMPTS.get(task, TASK_PROMPTS["Text"])

        # Prepare the multimodal chat message.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": tmp_path},
                    {"type": "text", "text": prompt_text}
                ],
            }
        ]

        # Tokenize prompt + image features and move them to the model device.
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to(model.device)
        # Some processor versions emit token_type_ids the model rejects.
        inputs.pop("token_type_ids", None)

        # Inference only — no gradients needed.
        with torch.inference_mode():
            generated_ids = model.generate(**inputs, max_new_tokens=8192)

        # Decode only the newly generated tokens (skip the prompt prefix).
        output_text = processor.decode(
            generated_ids[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True
        ).strip()

        return output_text, output_text

    except Exception as e:
        # Surface the error in the UI rather than crashing the worker.
        return f"❌ Error: {str(e)}", ""

    finally:
        # Remove the staged temp file whether inference succeeded or not.
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
 
 
 
 
 
 
 
 
309
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
 
311
# Build the Gradio interface: input column (image + task + submit) on the
# left, tabbed output (plain text / Markdown preview) on the right.
with gr.Blocks(title="GLM-OCR") as demo:

    # Header banner (styled by .main-header in the custom CSS).
    gr.HTML("""
    <div class="main-header">
        <h1>📄 GLM-OCR</h1>
        <p>Extract text, formulas, and tables from documents with AI</p>
    </div>
    """)

    with gr.Row(equal_height=True):
        # Left Column - Input
        with gr.Column(scale=1):
            with gr.Group(elem_classes=["form-section"]):
                image_input = gr.Image(
                    type="pil",
                    label="Upload Image",
                    sources=["upload", "clipboard"],
                    elem_classes=["image-upload-area"],
                    height=280
                )

                # Radio choices mirror the TASK_PROMPTS keys exactly.
                task = gr.Radio(
                    choices=list(TASK_PROMPTS.keys()),
                    value="Text",
                    label="Recognition Type",
                    elem_classes=["task-radio-group"]
                )

                submit_btn = gr.Button(
                    "🔍 Recognize",
                    variant="primary",
                    elem_classes=["submit-btn"]
                )

                # NOTE(review): these example paths must exist in the Space
                # repository ("examples/1.jpg" etc.) — verify on deploy.
                with gr.Accordion("📁 Examples", open=False, elem_classes=["examples-accordion"]):
                    examples = gr.Examples(
                        examples=[
                            ["examples/1.jpg"],
                            ["examples/2.jpg"],
                            ["examples/3.jpg"]
                        ],
                        inputs=[image_input],
                        label=""
                    )

        # Right Column - Output
        with gr.Column(scale=1):
            with gr.Group(elem_classes=["output-section"]):
                gr.HTML('<div class="output-header">📋 Recognition Result</div>')

                with gr.Tabs(elem_classes=["output-tabs"]):
                    with gr.Tab("Text"):
                        text_output = gr.Textbox(
                            lines=14,
                            show_label=False,
                            elem_classes=["output-textbox"],
                            show_copy_button=True,
                            placeholder="Recognition result will appear here..."
                        )

                    with gr.Tab("Markdown"):
                        md_output = gr.Markdown(
                            value="",
                            elem_classes=["markdown-preview"]
                        )

    # Footer
    gr.HTML("""
    <div class="footer-section">
        <p>
            Powered by <a href="https://huggingface.co/zai-org/GLM-OCR" target="_blank">GLM-OCR</a> ·
            Built with <a href="https://gradio.app" target="_blank">Gradio</a>
        </p>
    </div>
    """)

    # Event handlers: submit runs the model and fills both output tabs.
    submit_btn.click(
        fn=process_image,
        inputs=[image_input, task],
        outputs=[text_output, md_output]
    )

    # Clear stale results whenever the image changes (upload, clear, or
    # example selection) — it does NOT re-run recognition.
    image_input.change(
        fn=lambda: ("", ""),
        outputs=[text_output, md_output]
    )


if __name__ == "__main__":
    # Queue bounds concurrent requests; the theme CSS is injected here.
    demo.queue(max_size=50).launch(
        css=css,
        show_error=True,
        share=False
    )