Commit 5bcfcb2 · verified · committed by prithivMLmods · 1 Parent(s): bbef9e6

update app

Files changed (1):
  1. app.py +103 -80
app.py CHANGED
@@ -1,5 +1,9 @@
 import os
 import sys
+import random
+import uuid
+import json
+import time
 from threading import Thread
 from typing import Iterable
 from huggingface_hub import snapshot_download
@@ -7,7 +11,10 @@ from huggingface_hub import snapshot_download
 import gradio as gr
 import spaces
 import torch
+import numpy as np
 from PIL import Image
+import cv2
+
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     Qwen3VLForConditionalGeneration,
@@ -17,6 +24,7 @@ from transformers import (
     TextIteratorStreamer,
 )
 
+from transformers.image_utils import load_image
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 
@@ -67,8 +75,14 @@ class SteelBlueTheme(Soft):
         button_primary_text_color_hover="white",
         button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
         button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
-        button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_700)",
-        button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_600)",
+        button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
+        button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
+        button_secondary_text_color="black",
+        button_secondary_text_color_hover="white",
+        button_secondary_background_fill="linear-gradient(90deg, *primary_300, *primary_300)",
+        button_secondary_background_fill_hover="linear-gradient(90deg, *primary_400, *primary_400)",
+        button_secondary_background_fill_dark="linear-gradient(90deg, *primary_500, *primary_600)",
+        button_secondary_background_fill_hover_dark="linear-gradient(90deg, *primary_500, *primary_500)",
         slider_color="*secondary_500",
         slider_color_dark="*secondary_600",
         block_title_text_weight="600",
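For context, the kwargs touched in this hunk are Gradio theme tokens, which a Soft subclass overrides by calling super().set() in its constructor. A minimal sketch of that same pattern, with a reduced, illustrative set of tokens rather than the Space's full SteelBlueTheme:

from gradio.themes import Soft

class MiniTheme(Soft):
    # Sketch only: same override mechanism as SteelBlueTheme above.
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        super().set(
            button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
            button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
            slider_color="*secondary_500",
        )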
@@ -91,6 +105,22 @@ css = """
 }
 """
 
+MAX_MAX_NEW_TOKENS = 4096
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
+print("torch.__version__ =", torch.__version__)
+print("torch.version.cuda =", torch.version.cuda)
+print("cuda available:", torch.cuda.is_available())
+print("cuda device count:", torch.cuda.device_count())
+if torch.cuda.is_available():
+    print("current device:", torch.cuda.current_device())
+    print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
+
+print("Using device:", device)
 
 CACHE_PATH = "./model_cache"
 if not os.path.exists(CACHE_PATH):
@@ -131,35 +161,24 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-# Load chandra
-MODEL_ID_C = "datalab-to/chandra"
-processor_c = AutoProcessor.from_pretrained(MODEL_ID_C, trust_remote_code=True)
-model_c = Qwen3VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_C,
+# Load Chandra-OCR
+MODEL_ID_V = "datalab-to/chandra"
+processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
+model_v = Qwen3VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_V,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 
 # Load Nanonets-OCR2-3B
-MODEL_ID_M = "nanonets/Nanonets-OCR2-3B"
-processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
-model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_M,
+MODEL_ID_X = "nanonets/Nanonets-OCR2-3B"
+processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
+model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_X,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load Nanonets-OCR2-1.5B-exp
-MODEL_ID_N = "strangervisionhf/excess_layer_pruned-nanonets-1.5b"  # -> https://huggingface.co/nanonets/Nanonets-OCR2-1.5B-exp
-processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
-model_n = AutoModelForImageTextToText.from_pretrained(
-    MODEL_ID_N,
-    trust_remote_code=True,
-    torch_dtype=torch.float16,
-    attn_implementation="flash_attention_2"
-).to(device).eval()
-
-
 # Load Dots.OCR from the local, patched directory
 MODEL_PATH_D = model_path_d_local
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
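Aside: every checkpoint in this hunk follows the same load-and-infer pattern. A minimal non-streaming sketch against the Nanonets-OCR2-3B checkpoint loaded above; the prompt text and image path are illustrative, not taken from the Space:

import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "nanonets/Nanonets-OCR2-3B"
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()

# One image + one instruction, formatted with the model's chat template.
image = Image.open("page.png").convert("RGB")
messages = [{"role": "user", "content": [
    {"type": "image"},
    {"type": "text", "text": "OCR this page into plain text."},
]}]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(device)

with torch.inference_mode():
    out = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens, not the echoed prompt.
print(processor.batch_decode(out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0])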
@@ -171,33 +190,35 @@ model_d = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).eval()
 
-# Load PaddleOCR
-MODEL_ID_P = "strangervisionhf/paddle"  # -> https://huggingface.co/PaddlePaddle/PaddleOCR-VL
-processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
-model_p = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID_P,
+# Load olmOCR-2-7B-1025
+MODEL_ID_M = "allenai/olmOCR-2-7B-1025"
+processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_M,
     trust_remote_code=True,
-    torch_dtype=torch.bfloat16
+    torch_dtype=torch.float16
 ).to(device).eval()
 
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2):
-    """Generate responses for image input using the selected model."""
-    if model_name == "Nanonets-OCR2-3B":
-        processor, model = processor_m, model_m
-    elif model_name == "Nanonets-OCR2-1.5B(exp)":
-        processor, model = processor_n, model_n
-    elif model_name == "Dots.OCR":
-        processor, model = processor_d, model_d
-    elif model_name == "PaddleOCR":
-        processor, model = processor_p, model_p
+                   max_new_tokens: int, temperature: float, top_p: float,
+                   top_k: int, repetition_penalty: float):
+    """
+    Generates responses using the selected model for image input.
+    Yields raw text and Markdown-formatted text.
+    """
+    if model_name == "olmOCR-2-7B-1025":
+        processor = processor_m
+        model = model_m
+    elif model_name == "Nanonets-OCR2-3B":
+        processor = processor_x
+        model = model_x
     elif model_name == "Chandra-OCR":
-        processor, model = processor_c, model_c
+        processor = processor_v
+        model = model_v
+    elif model_name == "Dots.OCR":
+        processor = processor_d
+        model = model_d
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -206,40 +227,39 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield "Please upload an image.", "Please upload an image."
         return
 
-    images = [image.convert("RGB")]
-
-    if model_name == "PaddleOCR":
-        messages = [
-            {"role": "user", "content": text}
+    messages = [{
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "text", "text": text},
         ]
-    else:
-        messages = [
-            {
-                "role": "user",
-                "content": [{"type": "image"}] + [{"type": "text", "text": text}]
-            }
-        ]
-
-    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+    }]
+    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    inputs = processor(
+        text=[prompt_full],
+        images=[image],
+        return_tensors="pt",
+        padding=True).to(device)
 
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
         "streamer": streamer,
         "max_new_tokens": max_new_tokens,
+        "do_sample": True,
         "temperature": temperature,
         "top_p": top_p,
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
-        "do_sample": True
     }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
-
    buffer = ""
     for new_text in streamer:
-        buffer += new_text.replace("<|im_end|>", "").replace("<end_of_utterance>", "")
+        buffer += new_text
+        buffer = buffer.replace("<|im_end|>", "")
+        time.sleep(0.01)
         yield buffer, buffer
 
 image_examples = [
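The streaming half of this hunk is the standard transformers pattern: generate() runs on a background thread while TextIteratorStreamer hands decoded text back incrementally. A minimal sketch of just that mechanism, assuming `model`, `processor`, and `inputs` prepared as in the diff:

from threading import Thread
from transformers import TextIteratorStreamer

# skip_prompt drops the echoed input; skip_special_tokens drops markers like <|im_end|>.
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
thread = Thread(target=model.generate,
                kwargs={**inputs, "streamer": streamer,
                        "max_new_tokens": 512, "do_sample": True})
thread.start()

buffer = ""
for piece in streamer:   # blocks until the next decoded chunk arrives
    buffer += piece
    # a generator-based handler would `yield buffer` here, as generate_image does
thread.join()
print(buffer)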
@@ -253,34 +273,37 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     with gr.Row():
         with gr.Column(scale=2):
             image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-            image_upload = gr.Image(type="pil", label="Upload Image", height=320)
-            image_submit = gr.Button("Submit", variant="primary")
-            gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
+            image_upload = gr.Image(type="pil", label="Upload Image", height=290)
 
+            image_submit = gr.Button("Submit", variant="primary")
+            gr.Examples(
+                examples=image_examples,
+                inputs=[image_query, image_upload]
+            )
+
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
-                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
+                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
-                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-
+                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.1)
+
         with gr.Column(scale=3):
-            gr.Markdown("## Output", elem_id="output-title")
-            raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=11, show_copy_button=True)
-            with gr.Accordion("[Result.md]", open=False):
-                formatted_output = gr.Markdown(label="Formatted Result")
-
-            model_choice = gr.Radio(
-                choices=["Nanonets-OCR2-3B", "Chandra-OCR", "Dots.OCR", "Nanonets-OCR2-1.5B(exp)", "PaddleOCR"],
-                label="Select Model",
-                value="Nanonets-OCR2-3B"
-            )
-            gr.Markdown("Note: Currently, PaddleOCR VL only supports OCR inference. Structured OCR document parsing transformer inference is coming soon. [Report – Bug/Issue](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR3/discussions/1)")
-
+            gr.Markdown("## Output", elem_id="output-title")
+            output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=11, show_copy_button=True)
+            with gr.Accordion("(Result.md)", open=False):
+                markdown_output = gr.Markdown(label="(Result.Md)")
+
+            model_choice = gr.Radio(
+                choices=["Nanonets-OCR2-3B", "Chandra-OCR", "olmOCR-2-7B-1025", "Dots.OCR"],
+                label="Select Model",
+                value="Nanonets-OCR2-3B"
+            )
+
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=[raw_output, formatted_output]
+        outputs=[output, markdown_output]
     )
 
 if __name__ == "__main__":
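On the UI side, streaming works because Gradio treats a generator click handler specially: each `yield` pushes fresh values to the bound output components, which is why generate_image yields `buffer, buffer` into the Textbox/Markdown pair. A self-contained sketch of that wiring; the echo handler is illustrative, not the Space's:

import time
import gradio as gr

def stream_echo(text):
    # Yielding repeatedly updates both outputs in place,
    # like generate_image's `yield buffer, buffer`.
    buffer = ""
    for ch in text:
        buffer += ch
        time.sleep(0.05)
        yield buffer, buffer   # one value per bound output component

with gr.Blocks() as demo:
    query = gr.Textbox(label="Query Input")
    raw = gr.Textbox(label="Raw Output Stream")
    md = gr.Markdown()
    gr.Button("Submit").click(fn=stream_echo, inputs=[query], outputs=[raw, md])

if __name__ == "__main__":
    demo.launch()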
 