Update app.py

app.py CHANGED
@@ -13,6 +13,7 @@ from transformers import (
     AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
+    VisionEncoderDecoderModel,
 )
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes

@@ -100,7 +101,7 @@ if not os.path.exists(CACHE_PATH):
     # Download the model files locally
     model_path_d_local = snapshot_download(
         repo_id='rednote-hilab/dots.ocr',
-        local_dir=CACHE_PATH,
+        local_dir=os.path.join(CACHE_PATH, 'dots.ocr'),
         max_workers=20,
         local_dir_use_symlinks=False
     )

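The `local_dir` fix is the subtle part of this commit: the old call unpacked the whole dots.ocr snapshot directly into CACHE_PATH, so anything else stored under the cache root shared a directory with it. Routing each repo into its own subfolder keeps snapshots isolated. A minimal sketch of the pattern (the cache-root value is assumed, not from the commit):

import os
from huggingface_hub import snapshot_download

CACHE_PATH = "./model_cache"  # assumed cache root

# Each repo gets a dedicated subdirectory, so downloading a second repo
# later cannot mix its files into the dots.ocr snapshot.
dots_path = snapshot_download(
    repo_id="rednote-hilab/dots.ocr",
    local_dir=os.path.join(CACHE_PATH, "dots.ocr"),
)
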
@@ -159,6 +160,12 @@ model_d = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).eval()
 
+# Load ByteDance/Dolphin
+MODEL_ID_B = "ByteDance/Dolphin"
+processor_b = AutoProcessor.from_pretrained(MODEL_ID_B)
+model_b = VisionEncoderDecoderModel.from_pretrained(MODEL_ID_B)
+model_b.to(device).eval().half()
+
 
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,

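Dolphin takes a different load path from the two decoder-only models: VisionEncoderDecoderModel pairs an image encoder with a text decoder, and no chat template is involved. A rough sketch of the same load step in isolation (the device line is assumed to match the app's earlier setup):

import torch
from transformers import AutoProcessor, VisionEncoderDecoderModel

device = "cuda" if torch.cuda.is_available() else "cpu"  # assumed
processor_b = AutoProcessor.from_pretrained("ByteDance/Dolphin")
model_b = VisionEncoderDecoderModel.from_pretrained("ByteDance/Dolphin")

# .half() casts the weights to fp16 to cut GPU memory use roughly in half;
# any pixel_values fed to generate() must then be fp16 too, which is why
# the generation branch below casts them with .half() as well.
model_b.to(device).eval().half()
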
@@ -168,10 +175,14 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for image input using the selected model."""
+    is_streaming = True
     if model_name == "Nanonets-OCR2-3B":
         processor, model = processor_m, model_m
     elif model_name == "Dots.OCR":
         processor, model = processor_d, model_d
+    elif model_name == "Dolphin":
+        processor, model = processor_b, model_b
+        is_streaming = False
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return

@@ -180,35 +191,48 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield "Please upload an image.", "Please upload an image."
         return
 
-    …
+    image_rgb = image.convert("RGB")
+
+    if is_streaming:
+        messages = [
+            {
+                "role": "user",
+                "content": [{"type": "image"}] + [{"type": "text", "text": text}]
+            }
+        ]
+        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=[image_rgb], return_tensors="pt").to(device)
+
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+            "do_sample": True
+        }
         }
-    …
-    for new_text in streamer:
-        buffer += new_text.replace("<|im_end|>", "").replace("<end_of_utterance>", "")
-        yield buffer, buffer
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text.replace("<|im_end|>", "").replace("<end_of_utterance>", "")
+            yield buffer, buffer
+    else:
+        # Handle non-streaming generation for ByteDance/Dolphin
+        pixel_values = processor(images=[image_rgb], return_tensors="pt").pixel_values.to(device).half()
+
+        # Note: The user's text query is not explicitly used here as the VisionEncoderDecoderModel
+        # pipeline primarily generates captions from images directly.
+        generated_ids = model.generate(pixel_values, max_new_tokens=max_new_tokens)
+        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        # For this model, the output appears at once.
+        yield generated_text, generated_text
+
 
 # Define examples for image inference
 image_examples = [

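The streaming branch is the standard TextIteratorStreamer recipe: model.generate() blocks until decoding finishes, so it runs on a worker thread while the main thread drains the streamer and re-yields the growing buffer, and Gradio renders each yield as a partial result. A self-contained sketch with a small stand-in model (the app passes its processor instead of a tokenizer; its decode method delegates to the underlying tokenizer):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")  # stand-in for the OCR checkpoints
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("Extract the text from this page:", return_tensors="pt")

streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
# generate() runs in the background; the streamer yields decoded text
# incrementally as tokens are produced.
Thread(target=model.generate,
       kwargs={**inputs, "streamer": streamer, "max_new_tokens": 40}).start()

buffer = ""
for piece in streamer:
    buffer += piece
    print(buffer)  # the app instead does `yield buffer, buffer`
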
@@ -241,7 +265,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
         formatted_output = gr.Markdown(label="Formatted Result")
 
         model_choice = gr.Radio(
-            choices=["Nanonets-OCR2-3B", "Dots.OCR"],
+            choices=["Nanonets-OCR2-3B", "Dots.OCR", "Dolphin"],
             label="Select Model",
             value="Nanonets-OCR2-3B"
         )
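
The UI side of the change is just the extra Radio choice; the dispatch inside generate_image keys off the same strings. The event wiring sits outside this diff, but it would follow the usual Gradio pattern, where a generator function streams every yield to its outputs (all component names below are hypothetical, not from the commit):

# Hypothetical hookup, not shown in this diff: because generate_image is a
# generator, Gradio streams each `yield buffer, buffer` to both outputs,
# and the Dolphin branch simply yields its full result once.
submit_btn.click(
    fn=generate_image,
    inputs=[model_choice, query_input, image_input],
    outputs=[raw_output, formatted_output],
)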