prithivMLmods committed on
Commit
95d9832
·
verified ·
1 Parent(s): dd959ff

update app

Browse files
Files changed (1) hide show
  1. app.py +264 -0
app.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import spaces
4
+ from PIL import Image
5
+ from transformers import AutoProcessor, AutoModelForImageTextToText
6
+ from gradio.themes import Soft
7
+ from gradio.themes.utils import colors, fonts, sizes
8
+ from typing import Iterable
9
+
10
# Register a custom "orange_red" palette on gradio's color registry so the
# theme below can reference it as a hue (c50 = lightest ... c950 = darkest).
colors.orange_red = colors.Color(
    name="orange_red",
    c50="#FFF0E5",
    c100="#FFE0CC",
    c200="#FFC299",
    c300="#FFA366",
    c400="#FF8533",
    c500="#FF4500",
    c600="#E63E00",
    c700="#CC3700",
    c800="#B33000",
    c900="#992900",
    c950="#802200",
)
24
+
25
class OrangeRedTheme(Soft):
    """Soft-derived gradio theme using the custom ``orange_red`` palette.

    Keeps Soft's layout but swaps in a gray primary / orange-red secondary
    hue pair, the Outfit font, and gradient-styled primary buttons.
    """

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.gray,
        secondary_hue: colors.Color | str = colors.orange_red,
        neutral_hue: colors.Color | str = colors.slate,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
        ),
    ):
        super().__init__(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            text_size=text_size,
            font=font,
            font_mono=font_mono,
        )
        # Gathered into one dict so the full override table reads at a glance.
        overrides = dict(
            background_fill_primary="*primary_50",
            background_fill_primary_dark="*primary_900",
            body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
            body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
            button_primary_text_color="white",
            button_primary_text_color_hover="white",
            button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
            button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_700)",
            button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_600)",
            slider_color="*secondary_500",
            block_title_text_weight="600",
            block_border_width="0px",
            block_shadow="*shadow_drop_lg",
            button_large_padding="12px 24px",
            color_accent_soft="*primary_100",
        )
        super().set(**overrides)


# Single shared theme instance used when building the Blocks app.
orange_red_theme = OrangeRedTheme()
68
+
69
# Hugging Face Hub repo id of the OCR checkpoint to load.
MODEL_PATH = "zai-org/GLM-OCR"

# Used only for the startup log line below; actual weight placement is
# handled by device_map="auto" at load time.
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading {MODEL_PATH} on {device}...")
74
+
75
# Load the processor outside the try: the fallback path below only retries
# the model load, so a processor failure inside the try would leave
# `processor` undefined and crash later with a NameError at inference time.
processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)

try:
    # First attempt: bf16 + flash-attention when a GPU is present.
    model = AutoModelForImageTextToText.from_pretrained(
        pretrained_model_name_or_path=MODEL_PATH,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
        trust_remote_code=True,
        attn_implementation="flash_attention_2" if torch.cuda.is_available() else "eager"
    )
except Exception as e:
    print(f"Error loading model: {e}")
    # Fallback for CPU/No-Flash-Attn environments if necessary:
    # let transformers pick the dtype and default attention implementation.
    model = AutoModelForImageTextToText.from_pretrained(
        pretrained_model_name_or_path=MODEL_PATH,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True
    )
93
+
94
class GlmOcr(gr.HTML):
    """
    Custom Header Component for the minimalistic UI.

    A display-only gr.HTML banner: gradient title, subtitle, and three
    feature badges. It takes no inputs and emits no events.
    """
    def __init__(self):
        # Static HTML; styling is inline so the banner renders the same
        # regardless of the page-level CSS.
        content = """
        <div style="text-align: center; margin-bottom: 2rem; padding: 2rem 1rem;">
            <h1 style="font-size: 3rem; font-weight: 800; margin: 0;
                background: linear-gradient(90deg, #FF4500, #E63E00);
                -webkit-background-clip: text; -webkit-text-fill-color: transparent;">
                GLM-OCR
            </h1>
            <p style="font-size: 1.2rem; margin-top: 0.5rem; opacity: 0.8; font-weight: 300;">
                High-precision Document, Formula, and Table Recognition
            </p>
            <div style="display: flex; justify-content: center; gap: 10px; margin-top: 15px;">
                <span style="background: rgba(255, 69, 0, 0.1); color: #E63E00; padding: 4px 12px; border-radius: 20px; font-size: 0.9rem; font-weight: 600;">Text</span>
                <span style="background: rgba(255, 69, 0, 0.1); color: #E63E00; padding: 4px 12px; border-radius: 20px; font-size: 0.9rem; font-weight: 600;">LaTeX Formulas</span>
                <span style="background: rgba(255, 69, 0, 0.1); color: #E63E00; padding: 4px 12px; border-radius: 20px; font-size: 0.9rem; font-weight: 600;">Tables</span>
            </div>
        </div>
        """
        super().__init__(value=content)
117
+
118
# UI dropdown label -> prompt prefix sent to the model. The value becomes the
# text part of the chat message built in run_ocr.
TASK_MAPPING = {
    "Text Parsing": "Text Recognition:",
    "Formula/LaTeX": "Formula Recognition:",
    "Table Extraction": "Table Recognition:"
}
123
+
124
@spaces.GPU
def run_ocr(image, task_key):
    """Run the OCR model on an uploaded image for the selected task.

    Args:
        image: PIL image from the UI, or None if nothing was uploaded.
        task_key: A TASK_MAPPING key; unknown keys fall back to plain text
            recognition.

    Returns:
        A (markdown, raw) pair — the same decoded string twice, feeding both
        the rendered-Markdown and raw-text output widgets.
    """
    if image is None:
        return None, "Please upload an image."

    prompt_text = TASK_MAPPING.get(task_key, "Text Recognition:")

    # Chat-style message: image first, then the task prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,  # Passing PIL image directly
                },
                {
                    "type": "text",
                    "text": prompt_text
                }
            ],
        }
    ]

    # apply_chat_template with return_tensors="pt" handles image
    # preprocessing when the processor is multimodal-aware.
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device)

    # Some tokenizers emit token_type_ids that generate() does not accept.
    inputs.pop("token_type_ids", None)

    with torch.no_grad():
        # Greedy decoding (do_sample=False) keeps OCR output deterministic.
        # Note: no temperature here — it is ignored when sampling is off and
        # only triggers a transformers warning.
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=8192,
            do_sample=False
        )

    # Slice off the prompt tokens so only the newly generated text is decoded.
    output_text = processor.decode(
        generated_ids[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True
    )

    return output_text, output_text
178
+
179
# Page-level CSS: center the app at a max width and round/shadow the image
# preview container.
css = """
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto;
}
.image-container {
    border-radius: 12px;
    overflow: hidden;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1);
}
"""
190
+
191
# Build the UI. NOTE: `theme` and `css` are gr.Blocks constructor parameters,
# not Blocks.launch() parameters — passing them to launch() (as the original
# did) raises a TypeError, so they are applied here instead.
with gr.Blocks(title="GLM-OCR", theme=orange_red_theme, css=css) as demo:

    # Custom Header
    GlmOcr()

    with gr.Row():
        # Left Column: Inputs
        with gr.Column(scale=1):
            with gr.Group():
                image_input = gr.Image(
                    type="pil",
                    label="Document Image",
                    elem_classes="image-container",
                    height=400
                )

            with gr.Row():
                task_select = gr.Dropdown(
                    choices=list(TASK_MAPPING.keys()),
                    value="Text Parsing",
                    label="Extraction Mode",
                    interactive=True,
                    scale=2
                )
                submit_btn = gr.Button(
                    "Process",
                    variant="primary",
                    scale=1,
                    size="lg"
                )

            with gr.Accordion("Tips", open=True):
                gr.Markdown("""
                - **Text Parsing**: Extracts all text and layout structure.
                - **Formula/LaTeX**: Optimized for scientific papers and math.
                - **Table Extraction**: Converts tables directly to Markdown/Structure.
                """)

        # Right Column: Outputs
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.Tab("Rendered Output"):
                    md_output = gr.Markdown(
                        label="Result",
                        value="_Output will appear here..._",
                        latex_delimiters=[
                            {"left": "$$", "right": "$$", "display": True},
                            {"left": "$", "right": "$", "display": False},
                            {"left": "\\(", "right": "\\)", "display": False},
                            {"left": "\\[", "right": "\\]", "display": True}
                        ]
                    )
                with gr.Tab("Raw Source"):
                    raw_output = gr.Textbox(
                        label="Raw Text/LaTeX",
                        lines=20,
                        show_copy_button=True,
                        interactive=False
                    )

    # Event Wiring: one click handler drives both output widgets.
    submit_btn.click(
        fn=run_ocr,
        inputs=[image_input, task_select],
        outputs=[md_output, raw_output]
    )

if __name__ == "__main__":
    # queue() enables request queuing (required for @spaces.GPU workloads);
    # theme/css were moved to the gr.Blocks() constructor above.
    demo.queue().launch(
        ssr_mode=False,
        show_error=True
    )