Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Community

prithivMLmods committed on Oct 17

Commit

1e73c4d

verified ·

1 Parent(s): 846f854

update app

Browse files

Files changed (1) hide show

app.py +60 -36

app.py CHANGED Viewed

@@ -14,9 +14,8 @@ from PIL import Image
 import cv2
 from transformers import (
-    Qwen2VLForConditionalGeneration,
     Qwen2_5_VLForConditionalGeneration,
-    AutoModelForImageTextToText,
     AutoProcessor,
     TextIteratorStreamer,
 )
@@ -133,13 +132,14 @@ model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-# Load Qwen2-VL-OCR-2B-Instruct
-MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
-processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
-model_x = Qwen2VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_X,
     trust_remote_code=True,
-    torch_dtype=torch.float16
 ).to(device).eval()
@@ -151,20 +151,20 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     Generates responses using the selected model for image input.
     Yields raw text and Markdown-formatted text.
     """
-    if model_name == "Qwen2-VL-OCR-2B":
-        processor = processor_x
-        model = model_x
-    elif model_name == "Nanonets-OCR2-3B":
         processor = processor_v
         model = model_v
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
-    if image is None:
-        yield "Please upload an image.", "Please upload an image."
-        return
     messages = [{
         "role": "user",
         "content": [
@@ -180,25 +180,49 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         return_tensors="pt",
         padding=True).to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-    }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        buffer = buffer.replace("<|im_end|>", "")
-        time.sleep(0.01)
-        yield buffer, buffer
 # Define examples for image inference
@@ -237,7 +261,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
                     markdown_output = gr.Markdown(label="(Result.Md)")
                 model_choice = gr.Radio(
-                    choices=["Nanonets-OCR2-3B", "Qwen2-VL-OCR-2B"],
                     label="Select Model",
                     value="Nanonets-OCR2-3B"
                 )

 import cv2
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
+    AutoModelForCausalLM, # Added for Dots.OCR
     AutoProcessor,
     TextIteratorStreamer,
 )
     torch_dtype=torch.float16
 ).to(device).eval()
+# Load Dots.OCR
+MODEL_ID_D = "rednote-hilab/dots.ocr"
+processor_d = AutoProcessor.from_pretrained(MODEL_ID_D, trust_remote_code=True)
+model_d = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID_D,
     trust_remote_code=True,
+    torch_dtype=torch.float16,
+    attn_implementation="flash_attention_2"
 ).to(device).eval()
     Generates responses using the selected model for image input.
     Yields raw text and Markdown-formatted text.
     """
+    if image is None:
+        yield "Please upload an image.", "Please upload an image."
+        return
+    if model_name == "Nanonets-OCR2-3B":
         processor = processor_v
         model = model_v
+    elif model_name == "Dots.OCR":
+        processor = processor_d
+        model = model_d
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
     messages = [{
         "role": "user",
         "content": [
         return_tensors="pt",
         padding=True).to(device)
+    # Nanonets model supports streaming
+    if model_name == "Nanonets-OCR2-3B":
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer, buffer
+    # Dots.OCR does not use the streamer in the same way, generate full response
+    elif model_name == "Dots.OCR":
+        generation_kwargs = {
+            **inputs,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        generated_ids = model.generate(**generation_kwargs)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        output_text = output_text.replace("<|im_end|>", "").strip()
+        yield output_text, output_text
 # Define examples for image inference
                     markdown_output = gr.Markdown(label="(Result.Md)")
                 model_choice = gr.Radio(
+                    choices=["Nanonets-OCR2-3B", "Dots.OCR"],
                     label="Select Model",
                     value="Nanonets-OCR2-3B"
                 )