Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Files Community

prithivMLmods committed on Oct 20

Commit

8690171

verified ·

1 Parent(s): 924cc45

update app

Browse files

Files changed (1) hide show

app.py +38 -12

app.py CHANGED Viewed

@@ -13,6 +13,7 @@ from transformers import (
     AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
 )
 from gradio.themes import Soft
@@ -160,6 +161,16 @@ model_d = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).eval()
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
@@ -173,6 +184,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         processor, model = processor_m, model_m
     elif model_name == "Dots.OCR":
         processor, model = processor_d, model_d
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -183,16 +196,29 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     images = [image.convert("RGB")]
-    messages = [
-        {
-            "role": "user",
-            "content": [{"type": "image"}] + [{"type": "text", "text": text}]
-        }
-    ]
-    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
         "streamer": streamer,
@@ -237,14 +263,14 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
         with gr.Column(scale=3):
             gr.Markdown("## Output", elem_id="output-title")
-            raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=10, show_copy_button=True)
             with gr.Accordion("Formatted Result", open=False):
                 formatted_output = gr.Markdown(label="Formatted Result")
             model_choice = gr.Radio(
-                choices=["Nanonets-OCR2-3B", "Dots.OCR"],
                 label="Select Model",
-                value="Nanonets-OCR2-3B"
             )
     image_submit.click(

     AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
+    AutoTokenizer, # Added for DeepSeek, though AutoProcessor is used
 )
 from gradio.themes import Soft
     trust_remote_code=True
 ).eval()
+# Load DeepSeek-OCR
+MODEL_ID_S = 'deepseek-ai/DeepSeek-OCR'
+processor_s = AutoProcessor.from_pretrained(MODEL_ID_S, trust_remote_code=True)
+model_s = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID_S,
+    _attn_implementation='flash_attention_2',
+    trust_remote_code=True,
+    use_safetensors=True
+).eval().to(device).to(torch.bfloat16)
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
         processor, model = processor_m, model_m
     elif model_name == "Dots.OCR":
         processor, model = processor_d, model_d
+    elif model_name == "DeepSeek-OCR":
+        processor, model = processor_s, model_s
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
     images = [image.convert("RGB")]
+    # For DeepSeek-OCR, the recommended prompt format is slightly different
+    if model_name == "DeepSeek-OCR":
+        # Using a format found in documentation for better performance
+        prompt_text = f"<image>\n<|grounding|>{text}"
+        messages = [
+            {"role": "user", "content": prompt_text}
+        ]
+        # The prompt text is built manually above; the tokenizer's apply_chat_template (not the processor's) wraps it
+        prompt = processor.tokenizer.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+    else:
+        messages = [
+            {
+                "role": "user",
+                "content": [{"type": "image"}] + [{"type": "text", "text": text}]
+            }
+        ]
+        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
         "streamer": streamer,
         with gr.Column(scale=3):
             gr.Markdown("## Output", elem_id="output-title")
+            raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=9, show_copy_button=True)
             with gr.Accordion("Formatted Result", open=False):
                 formatted_output = gr.Markdown(label="Formatted Result")
             model_choice = gr.Radio(
+                choices=["DeepSeek-OCR", "Nanonets-OCR2-3B", "Dots.OCR"],
                 label="Select Model",
+                value="DeepSeek-OCR"
             )
     image_submit.click(