Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Community

prithivMLmods committed on Oct 18

Commit

7abceaa

verified ·

1 Parent(s): 33cd763

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -60

app.py CHANGED Viewed

@@ -1,29 +1,20 @@
 import os
 import random
-import uuid
-import json
-import time
-import asyncio
 from threading import Thread
 from typing import Iterable
 import gradio as gr
 import spaces
 import torch
-import numpy as np
-from PIL import Image, ImageOps
-import requests
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
 )
-from transformers.image_utils import load_image
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
-from huggingface_hub import snapshot_download
 # --- Theme and CSS Definition ---
@@ -106,7 +97,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Nanonets-OCR-s
-MODEL_ID_M = "nanonets/Nanonets-OCR2-3B"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_M,
@@ -115,31 +106,26 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 ).to(device).eval()
 # Load Dots.OCR
-MODEL_ID_D = "rednote-hilab/dots.ocr"
-model_path_d = "./models/dots-ocr-local"
-snapshot_download(
-    repo_id=MODEL_ID_D,
-    local_dir=model_path_d,
-    local_dir_use_symlinks=False,
-)
 model_d = AutoModelForCausalLM.from_pretrained(
-    model_path_d,
-    attn_implementation="flash_attention_2" if "cuda" in device.type else "eager",
     torch_dtype=torch.bfloat16,
     device_map="auto",
     trust_remote_code=True
-)
-processor_d = AutoProcessor.from_pretrained(
-    model_path_d,
-    trust_remote_code=True
-)
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
-                   max_new_tokens: int, temperature: float, top_p: float, top_k: int, repetition_penalty: float):
     """Generate responses for image input using the selected model."""
-    if model_name == "Nanonets-OCR2-3B":
         processor, model = processor_m, model_m
     elif model_name == "Dots.OCR":
         processor, model = processor_d, model_d
@@ -151,18 +137,16 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield "Please upload an image.", "Please upload an image."
         return
-    images = [image]
     messages = [
         {
             "role": "user",
-            "content": [{"type": "image"}] * len(images) + [
-                {"type": "text", "text": text}
-            ]
         }
     ]
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=images, return_tensors="pt").to(model.device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
@@ -174,49 +158,31 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
     }
-    # Dots.OCR uses a different generation parameter name for end-of-sequence
-    if "dots.ocr" in model.config.name_or_path.lower():
-        generation_kwargs["eos_token_id"] = processor.tokenizer.eos_token_id
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
-        buffer += new_text.replace("<|im_end|>", "").replace("</s>", "")
         yield buffer, buffer
-    # The formatted output is the same as the raw output in this version
-    yield buffer, buffer
 # Define examples for image inference
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
     ["Describe the image!", "images/8.png"],
     ["OCR the image", "images/2.jpg"],
-    ["Convert this page to markdown", "images/1.png"],
 ]
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
-    gr.Markdown("# **Multimodal Image OCR**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
-            model_choice = gr.Radio(
-                choices=["Nanonets-OCR2-3B", "Dots.OCR"],
-                label="Select Model",
-                value="Nanonets-OCR-s"
-            )
-            query_input = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
             image_upload = gr.Image(type="pil", label="Upload Image", height=320)
-            submit_button = gr.Button("Submit", variant="primary")
-            gr.Examples(examples=image_examples, inputs=[query_input, image_upload])
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
@@ -226,14 +192,21 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
         with gr.Column(scale=3):
             gr.Markdown("## Output", elem_id="output-title")
-            raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=18, show_copy_button=True)
-            formatted_output = gr.Markdown(label="Formatted Output (Result.md)")
-    submit_button.click(
         fn=generate_image,
-        inputs=[model_choice, query_input, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[raw_output, formatted_output]
     )
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(ssr_mode=False, show_error=True)

 import os
 import random
 from threading import Thread
 from typing import Iterable
 import gradio as gr
 import spaces
 import torch
+from PIL import Image
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
 )
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 # --- Theme and CSS Definition ---
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Nanonets-OCR-s
+MODEL_ID_M = "nanonets/Nanonets-OCR-s"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_M,
 ).to(device).eval()
 # Load Dots.OCR
+MODEL_PATH_D = "rednote-hilab/dots.ocr"
+processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 model_d = AutoModelForCausalLM.from_pretrained(
+    MODEL_PATH_D,
+    attn_implementation="flash_attention_2",
     torch_dtype=torch.bfloat16,
     device_map="auto",
     trust_remote_code=True
+).eval()
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
+                   max_new_tokens: int = 1024,
+                   temperature: float = 0.6,
+                   top_p: float = 0.9,
+                   top_k: int = 50,
+                   repetition_penalty: float = 1.2):
     """Generate responses for image input using the selected model."""
+    if model_name == "Nanonets-OCR-s":
         processor, model = processor_m, model_m
     elif model_name == "Dots.OCR":
         processor, model = processor_d, model_d
         yield "Please upload an image.", "Please upload an image."
         return
+    images = [image.convert("RGB")]
     messages = [
         {
             "role": "user",
+            "content": [{"type": "image"}] + [{"type": "text", "text": text}]
         }
     ]
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
     }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
+        buffer += new_text.replace("<|im_end|>", "").replace("<end_of_utterance>", "")
         yield buffer, buffer
 # Define examples for image inference
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
     ["Describe the image!", "images/8.png"],
     ["OCR the image", "images/2.jpg"],
 ]
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
+    gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
+            image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
             image_upload = gr.Image(type="pil", label="Upload Image", height=320)
+            image_submit = gr.Button("Submit", variant="primary")
+            gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
         with gr.Column(scale=3):
             gr.Markdown("## Output", elem_id="output-title")
+            raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=13, show_copy_button=True)
+            with gr.Accordion("Formatted Result", open=True):
+                formatted_output = gr.Markdown(label="Formatted Result")
+            model_choice = gr.Radio(
+                choices=["Nanonets-OCR-s", "Dots.OCR"],
+                label="Select Model",
+                value="Nanonets-OCR-s"
+            )
+    image_submit.click(
         fn=generate_image,
+        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[raw_output, formatted_output]
     )
 if __name__ == "__main__":
+    demo.queue(max_size=50).launch(show_error=True)