GLM-OCR-Demo

Build error

App Files Files Community

prithivMLmods committed on Feb 4

Commit

0282214

verified ·

1 Parent(s): 8ecd775

Update app.py

Browse files

Files changed (1) hide show

app.py +406 -227

app.py CHANGED Viewed

@@ -1,264 +1,443 @@
 import gradio as gr
 import torch
 import spaces
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForImageTextToText
-from gradio.themes import Soft
-from gradio.themes.utils import colors, fonts, sizes
-from typing import Iterable
-colors.orange_red = colors.Color(
-    name="orange_red",
-    c50="#FFF0E5",
-    c100="#FFE0CC",
-    c200="#FFC299",
-    c300="#FFA366",
-    c400="#FF8533",
-    c500="#FF4500",
-    c600="#E63E00",
-    c700="#CC3700",
-    c800="#B33000",
-    c900="#992900",
-    c950="#802200",
-)
-class OrangeRedTheme(Soft):
-    def __init__(
-        self,
-        *,
-        primary_hue: colors.Color | str = colors.gray,
-        secondary_hue: colors.Color | str = colors.orange_red,
-        neutral_hue: colors.Color | str = colors.slate,
-        text_size: sizes.Size | str = sizes.text_lg,
-        font: fonts.Font | str | Iterable[fonts.Font | str] = (
-            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
-        ),
-        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
-            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
-        ),
-    ):
-        super().__init__(
-            primary_hue=primary_hue,
-            secondary_hue=secondary_hue,
-            neutral_hue=neutral_hue,
-            text_size=text_size,
-            font=font,
-            font_mono=font_mono,
-        )
-        super().set(
-            background_fill_primary="*primary_50",
-            background_fill_primary_dark="*primary_900",
-            body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
-            body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
-            button_primary_text_color="white",
-            button_primary_text_color_hover="white",
-            button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
-            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
-            button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_700)",
-            button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_600)",
-            slider_color="*secondary_500",
-            block_title_text_weight="600",
-            block_border_width="0px",
-            block_shadow="*shadow_drop_lg",
-            button_large_padding="12px 24px",
-            color_accent_soft="*primary_100",
-        )
-orange_red_theme = OrangeRedTheme()
 MODEL_PATH = "zai-org/GLM-OCR"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Loading {MODEL_PATH} on {device}...")
 try:
     processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
     model = AutoModelForImageTextToText.from_pretrained(
         pretrained_model_name_or_path=MODEL_PATH,
         torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-        device_map="auto",
         trust_remote_code=True,
-        attn_implementation="flash_attention_2" if torch.cuda.is_available() else "eager"
     )
 except Exception as e:
     print(f"Error loading model: {e}")
-    # Fallback for CPU/No-Flash-Attn environments if necessary
-    model = AutoModelForImageTextToText.from_pretrained(
-        pretrained_model_name_or_path=MODEL_PATH,
-        torch_dtype="auto",
-        device_map="auto",
-        trust_remote_code=True
-    )
-class GlmOcr(gr.HTML):
-    """
-    Custom Header Component for the minimalistic UI.
-    """
-    def __init__(self):
-        content = """
-        <div style="text-align: center; margin-bottom: 2rem; padding: 2rem 1rem;">
-            <h1 style="font-size: 3rem; font-weight: 800; margin: 0;
-                       background: linear-gradient(90deg, #FF4500, #E63E00);
-                       -webkit-background-clip: text; -webkit-text-fill-color: transparent;">
-                GLM-OCR
-            </h1>
-            <p style="font-size: 1.2rem; margin-top: 0.5rem; opacity: 0.8; font-weight: 300;">
-                High-precision Document, Formula, and Table Recognition
-            </p>
-            <div style="display: flex; justify-content: center; gap: 10px; margin-top: 15px;">
-                <span style="background: rgba(255, 69, 0, 0.1); color: #E63E00; padding: 4px 12px; border-radius: 20px; font-size: 0.9rem; font-weight: 600;">Text</span>
-                <span style="background: rgba(255, 69, 0, 0.1); color: #E63E00; padding: 4px 12px; border-radius: 20px; font-size: 0.9rem; font-weight: 600;">LaTeX Formulas</span>
-                <span style="background: rgba(255, 69, 0, 0.1); color: #E63E00; padding: 4px 12px; border-radius: 20px; font-size: 0.9rem; font-weight: 600;">Tables</span>
-            </div>
-        </div>
-        """
-        super().__init__(value=content)
-TASK_MAPPING = {
-    "Text Parsing": "Text Recognition:",
-    "Formula/LaTeX": "Formula Recognition:",
-    "Table Extraction": "Table Recognition:"
-}
 @spaces.GPU
-def run_ocr(image, task_key):
-    if image is None:
-        return None, "Please upload an image."
-    prompt_text = TASK_MAPPING.get(task_key, "Text Recognition:")
-    # Prepare messages
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image",
-                    "image": image, # Passing PIL image directly
-                },
-                {
-                    "type": "text",
-                    "text": prompt_text
-                }
-            ],
-        }
-    ]
-    # Process inputs
-    # Note: apply_chat_template with return_tensors="pt" handles image processing if the processor is multimodal aware
-    inputs = processor.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,
-        return_dict=True,
-        return_tensors="pt"
-    ).to(model.device)
-    # Remove token_type_ids if present (common issue with some models)
-    inputs.pop("token_type_ids", None)
-    # Generate
-    with torch.no_grad():
-        generated_ids = model.generate(
-            **inputs,
-            max_new_tokens=8192,
-            do_sample=False, # Deterministic for OCR
-            temperature=0.01
         )
-    # Decode
-    # We skip the input prompt tokens to get only the new text
-    output_text = processor.decode(
-        generated_ids[0][inputs["input_ids"].shape[1]:],
-        skip_special_tokens=True
-    )
-    return output_text, output_text
-css = """
-.gradio-container {
-    max-width: 1200px !important;
-    margin: 0 auto;
 }
-.image-container {
-    border-radius: 12px;
-    overflow: hidden;
-    box-shadow: 0 4px 6px rgba(0,0,0,0.1);
 }
 """
 with gr.Blocks(title="GLM-OCR") as demo:
-    # Custom Header
     GlmOcr()
-    with gr.Row():
-        # Left Column: Inputs
-        with gr.Column(scale=1):
-            with gr.Group():
-                image_input = gr.Image(
-                    type="pil",
-                    label="Document Image",
-                    elem_classes="image-container",
-                    height=400
-                )
-                with gr.Row():
-                    task_select = gr.Dropdown(
-                        choices=list(TASK_MAPPING.keys()),
-                        value="Text Parsing",
-                        label="Extraction Mode",
-                        interactive=True,
-                        scale=2
-                    )
-                    submit_btn = gr.Button(
-                        "Process",
-                        variant="primary",
-                        scale=1,
-                        size="lg"
-                    )
-            with gr.Accordion("Tips", open=True):
-                gr.Markdown("""
-                - **Text Parsing**: Extracts all text and layout structure.
-                - **Formula/LaTeX**: Optimized for scientific papers and math.
-                - **Table Extraction**: Converts tables directly to Markdown/Structure.
-                """)
-        # Right Column: Outputs
-        with gr.Column(scale=1):
-            with gr.Tabs():
-                with gr.Tab("Rendered Output"):
-                    md_output = gr.Markdown(
-                        label="Result",
-                        value="_Output will appear here..._",
-                        latex_delimiters=[
-                            {"left": "$$", "right": "$$", "display": True},
-                            {"left": "$", "right": "$", "display": False},
-                            {"left": "\\(", "right": "\\)", "display": False},
-                            {"left": "\\[", "right": "\\]", "display": True}
-                        ]
-                    )
-                with gr.Tab("Raw Source"):
-                    raw_output = gr.Textbox(
-                        label="Raw Text/LaTeX",
-                        lines=20,
-                        #show_copy_button=True,
-                        interactive=True
-                    )
-    # Event Wiring
-    submit_btn.click(
-        fn=run_ocr,
-        inputs=[image_input, task_select],
-        outputs=[md_output, raw_output]
     )
 if __name__ == "__main__":
-    demo.queue().launch(
-        theme=orange_red_theme,
-        css=css,
-        ssr_mode=False,
-        show_error=True
-    )

 import gradio as gr
 import torch
 import spaces
+import base64
+import io
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForImageTextToText
+# -----------------------------------------------------------------------------
+# Model Initialization
+# -----------------------------------------------------------------------------
 MODEL_PATH = "zai-org/GLM-OCR"
+# Device label used for logging and for the explicit CPU move below.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Loading model on {DEVICE}...")
+# Load the processor and the model together. Any failure is caught below so
+# that the module still imports and the UI can be built without a model.
 try:
     processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
+    # Load the model: bfloat16 with device_map="auto" when CUDA is available,
+    # float32 with no device map otherwise.
     model = AutoModelForImageTextToText.from_pretrained(
         pretrained_model_name_or_path=MODEL_PATH,
         torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
         trust_remote_code=True,
+        device_map="auto" if torch.cuda.is_available() else None,
     )
+    if DEVICE == "cpu":
+        model = model.to("cpu") # explicit fallback if no gpu
+    print("Model loaded successfully.")
 except Exception as e:
     print(f"Error loading model: {e}")
+    # Fallback for building the UI without a model (debugging/build phase).
+    # Both globals are reset to None; run_inference checks `model is None`
+    # and returns an error message instead of attempting inference.
+    processor = None
+    model = None
+# -----------------------------------------------------------------------------
+# Inference Logic
+# -----------------------------------------------------------------------------
 @spaces.GPU
+def run_inference(image_b64, task_prompt):
+    """Run the OCR model on a base64-encoded image and return the decoded text.
+
+    Args:
+        image_b64: Base64 image payload. If it contains a data-URL style
+            prefix ending in "base64,", everything up to and including that
+            marker is dropped before decoding.
+        task_prompt: Instruction text sent to the model alongside the image
+            (the UI sends values such as "Text Recognition:").
+
+    Returns:
+        The generated text with "<|endoftext|>" removed and surrounding
+        whitespace stripped. A human-readable message is returned instead
+        when no image was supplied, when the model or processor failed to
+        load, or when any step of inference raises.
+    """
+    if not image_b64:
+        return "Please upload an image first."
+    # The processor is used below as well, so guard on both globals.
+    if model is None or processor is None:
+        return "Model not loaded correctly. Check logs."
+    try:
+        # 1. Decode Base64 to PIL Image
+        if "base64," in image_b64:
+            image_b64 = image_b64.split("base64,")[1]
+        image_data = base64.b64decode(image_b64)
+        image = Image.open(io.BytesIO(image_data)).convert("RGB")
+        # 2. Prepare Messages
+        # The prompt is selected via the radio buttons
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": image,
+                    },
+                    {
+                        "type": "text",
+                        "text": task_prompt
+                    }
+                ],
+            }
+        ]
+        # 3. Process Inputs
+        inputs = processor.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt"
+        ).to(model.device)
+        # Remove token_type_ids if present (transformers fix)
+        inputs.pop("token_type_ids", None)
+        # 4. Generate
+        with torch.no_grad():
+            generated_ids = model.generate(
+                **inputs,
+                max_new_tokens=2048,
+                # Greedy decoding keeps OCR output deterministic. No
+                # `temperature` is passed: it is a sampling-only setting, so
+                # combining it with do_sample=False has no effect on the
+                # output and only triggers a generation-config warning.
+                do_sample=False,
+            )
+        # 5. Decode only the newly generated tokens (skip the prompt tokens)
+        output_text = processor.decode(
+            generated_ids[0][inputs["input_ids"].shape[1]:],
+            skip_special_tokens=False
         )
+        # Special tokens are kept by the decode call above, so strip the
+        # end-of-text marker by hand.
+        output_text = output_text.replace("<|endoftext|>", "").strip()
+        return output_text
+    except Exception as e:
+        # UI boundary: report the failure as text in the result box instead
+        # of raising into the Gradio event handler.
+        return f"Error during inference: {str(e)}"
+# -----------------------------------------------------------------------------
+# Custom Component & UI Assets
+# -----------------------------------------------------------------------------
+# CSS from your snippet + additions for image preview and layout
+CUSTOM_CSS = """
+/* Reset & Layout */
+.container {
+  position: relative;
+  max-width: 600px;
+  width: 100%;
+  background: #FCEDDA;
+  padding: 25px;
+  border-radius: 8px;
+  box-shadow: 0 0 15px rgba(0, 0, 0, 0.1);
+  margin: 0 auto;
+  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+}
+.container header {
+  font-size: 1.5rem;
+  color: #000;
+  font-weight: 600;
+  text-align: center;
+  margin-bottom: 20px;
+}
+.form {
+  margin-top: 15px;
+}
+.input-box {
+  width: 100%;
+  margin-top: 15px;
+}
+.input-box label {
+  color: #000;
+  font-weight: 500;
+  margin-bottom: 5px;
+  display: block;
+}
+/* Custom Upload Area */
+.upload-area {
+    width: 100%;
+    min-height: 150px;
+    background: #fff8f0;
+    border: 2px dashed #EE4E34;
+    border-radius: 6px;
+    display: flex;
+    flex-direction: column;
+    align-items: center;
+    justify-content: center;
+    cursor: pointer;
+    transition: background 0.2s;
+    padding: 10px;
+}
+.upload-area:hover {
+    background: #fff0e0;
+}
+.upload-text {
+    color: #808080;
+    margin-top: 10px;
+}
+#preview-img {
+    max-width: 100%;
+    max-height: 300px;
+    border-radius: 4px;
+    display: none;
+    box-shadow: 0 2px 5px rgba(0,0,0,0.1);
+}
+/* Radio Buttons */
+.gender-box {
+  margin-top: 20px;
+}
+.gender-option {
+  display: flex;
+  align-items: center;
+  column-gap: 20px;
+  flex-wrap: wrap;
+  margin-top: 10px;
+  background: #fff8f0;
+  padding: 10px;
+  border-radius: 6px;
+  border: 1px solid #EE4E34;
+}
+.gender {
+  display: flex;
+  align-items: center;
+  column-gap: 5px;
+}
+.gender input {
+  accent-color: #EE4E34;
+  width: 18px;
+  height: 18px;
+  cursor: pointer;
+}
+.gender label {
+  cursor: pointer;
+  margin: 0; /* Reset margin from input-box label */
+}
+/* Textarea Output */
+textarea.result-field {
+    width: 100%;
+    height: 200px;
+    padding: 15px;
+    outline: none;
+    font-size: 0.95rem;
+    color: #333;
+    margin-top: 5px;
+    border: 1px solid #EE4E34;
+    border-radius: 6px;
+    background: #fff;
+    resize: vertical;
+    font-family: monospace;
+}
+/* Submit Button */
+.submit-btn {
+  height: 45px;
+  width: 100%;
+  color: #fff;
+  font-size: 1.1rem;
+  font-weight: 500;
+  margin-top: 25px;
+  border: none;
+  border-radius: 6px;
+  cursor: pointer;
+  transition: all 0.2s ease;
+  background: #EE4E34;
+}
+.submit-btn:hover {
+  background: #d63d24;
+}
+.submit-btn:disabled {
+  background: #fabab5;
+  cursor: not-allowed;
+}
+.status-msg {
+    text-align: center;
+    margin-top: 10px;
+    font-size: 0.9rem;
+    min-height: 20px;
+}
+"""
+# JavaScript to handle interactions and bridge with Gradio
+CUSTOM_JS = """
+<script>
+function initOcrUI() {
+    const fileInput = document.getElementById('hidden-file-input');
+    const uploadArea = document.getElementById('upload-area');
+    const previewImg = document.getElementById('preview-img');
+    const uploadText = document.getElementById('upload-text');
+    const submitBtn = document.getElementById('custom-submit');
+    const resultArea = document.getElementById('result-area');
+    const statusMsg = document.getElementById('status-msg');
+    // Trigger file input
+    uploadArea.onclick = () => fileInput.click();
+    // Handle File Selection
+    fileInput.onchange = (e) => {
+        const file = e.target.files[0];
+        if (file) {
+            const reader = new FileReader();
+            reader.onload = (evt) => {
+                const b64 = evt.target.result;
+                // Show Preview
+                previewImg.src = b64;
+                previewImg.style.display = 'block';
+                uploadText.style.display = 'none';
+                // Update Hidden Gradio Component
+                updateGradioImage(b64);
+            }
+            reader.readAsDataURL(file);
+        }
+    };
+    // Handle Submit
+    submitBtn.onclick = (e) => {
+        e.preventDefault();
+        // Get selected Task
+        const task = document.querySelector('input[name="task"]:checked').value;
+        // Update Hidden Gradio Task Input
+        updateGradioTask(task);
+        // Visual Feedback
+        submitBtn.innerText = "Processing...";
+        submitBtn.disabled = true;
+        statusMsg.innerText = "Model is running. Please wait...";
+        resultArea.value = ""; // Clear previous
+        // Trigger Hidden Gradio Button
+        const gradioBtn = document.getElementById('bridge-btn');
+        if (gradioBtn) gradioBtn.click();
+    };
+    // --- Bridge Functions ---
+    function updateGradioImage(b64Data) {
+        const ta = document.querySelector('#bridge-img-input textarea');
+        if (ta) {
+            ta.value = b64Data;
+            ta.dispatchEvent(new Event('input', { bubbles: true }));
+        }
+    }
+    function updateGradioTask(taskVal) {
+        const ta = document.querySelector('#bridge-task-input textarea');
+        if (ta) {
+            ta.value = taskVal;
+            ta.dispatchEvent(new Event('input', { bubbles: true }));
+        }
+    }
 }
+// Function called by Gradio when output changes
+function updateResultUI(text) {
+    const resultArea = document.getElementById('result-area');
+    const submitBtn = document.getElementById('custom-submit');
+    const statusMsg = document.getElementById('status-msg');
+    if(resultArea) resultArea.value = text;
+    if(submitBtn) {
+        submitBtn.innerText = "Submit";
+        submitBtn.disabled = false;
+    }
+    if(statusMsg) statusMsg.innerText = "Extraction complete.";
 }
+// Initialize after a slight delay to ensure DOM is ready
+setTimeout(initOcrUI, 1000);
+</script>
+"""
+HTML_TEMPLATE = """
+<div class="container">
+  <header>GLM-OCR Interface</header>
+  <div class="form">
+      <!-- Image Input Section -->
+      <div class="input-box">
+          <label>Document Image</label>
+          <div class="upload-area" id="upload-area">
+              <span class="upload-text" id="upload-text">Click to Upload Image</span>
+              <img id="preview-img" alt="Preview"/>
+          </div>
+          <input type="file" id="hidden-file-input" style="display:none" accept="image/*">
+      </div>
+      <!-- Task Selection -->
+      <div class="gender-box">
+        <label>Extraction Mode</label>
+        <div class="gender-option">
+          <div class="gender">
+            <input type="radio" id="check-text" name="task" value="Text Recognition:" checked>
+            <label for="check-text">Text</label>
+          </div>
+          <div class="gender">
+            <input type="radio" id="check-formula" name="task" value="Formula Recognition:">
+            <label for="check-formula">Formula</label>
+          </div>
+          <div class="gender">
+            <input type="radio" id="check-table" name="task" value="Table Recognition:">
+            <label for="check-table">Table</label>
+          </div>
+        </div>
+      </div>
+      <!-- Submit Action -->
+      <button class="submit-btn" id="custom-submit">Submit</button>
+      <div class="status-msg" id="status-msg"></div>
+      <!-- Result Output -->
+      <div class="input-box">
+        <label>Extraction Result</label>
+        <textarea id="result-area" class="result-field" readonly placeholder="Output will appear here..."></textarea>
+      </div>
+  </div>
+</div>
 """
+class GlmOcr(gr.HTML):
+    """HTML component whose value is HTML_TEMPLATE followed by CUSTOM_JS."""
+    def __init__(self):
+        # NOTE(review): CUSTOM_JS is a <script> block embedded in the HTML
+        # value. Browsers do not execute <script> elements inserted through
+        # innerHTML; if gr.HTML renders its value that way, initOcrUI and
+        # updateResultUI are never defined and the custom form stays inert.
+        # Confirm against the installed Gradio version.
+        super().__init__(value=HTML_TEMPLATE + CUSTOM_JS)
+# -----------------------------------------------------------------------------
+# Gradio App Structure
+# -----------------------------------------------------------------------------
 with gr.Blocks(title="GLM-OCR") as demo:
+    # 1. The custom HTML front end (upload area, task radios, submit button, result box)
     GlmOcr()
+    # 2. Hidden Bridge Components (To transfer data between Custom HTML and Python)
+    # NOTE(review): the page script locates these components by elem_id; confirm
+    # that children of a visible=False Row are still present in the DOM for the
+    # installed Gradio version, otherwise the bridge cannot be reached from JS.
+    with gr.Row(visible=False):
+        # Stores Base64 string of the image
+        bridge_img_input = gr.Textbox(elem_id="bridge-img-input", label="Hidden Img")
+        # Stores the selected task string
+        bridge_task_input = gr.Textbox(elem_id="bridge-task-input", value="Text Recognition:", label="Hidden Task")
+        # The trigger button clicked by JS
+        bridge_btn = gr.Button("Run", elem_id="bridge-btn")
+        # The output storage, handed to the JS callback after each run
+        bridge_output = gr.Textbox(elem_id="bridge-output", label="Hidden Output")
+    # 3. Run inference, then always push the result back into the HTML UI.
+    # The JS callback is chained with .then() rather than attached to
+    # bridge_output.change: when two consecutive runs return the same text
+    # (for example the same validation message twice) the value does not change,
+    # a change listener would not fire, and the custom submit button would stay
+    # disabled on "Processing...".
+    bridge_btn.click(
+        fn=run_inference,
+        inputs=[bridge_img_input, bridge_task_input],
+        outputs=[bridge_output]
+    ).then(
+        fn=None,
+        inputs=[bridge_output],
+        js="(v) => updateResultUI(v)"
     )
 if __name__ == "__main__":
+    demo.launch(css=CUSTOM_CSS, ssr_mode=False)