Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -15,9 +15,11 @@ import cv2
 
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
-
+    AutoModelForImageTextToText,
+    AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
+    AutoTokenizer
 )
 from transformers.image_utils import load_image
 from gradio.themes import Soft
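A note on the new imports: both checkpoints ship custom modeling code, which is why every from_pretrained call below passes trust_remote_code=True. As a quick sanity check (a sketch, not part of the app), you can ask the Hub which concrete architecture each Auto class will resolve to:

from transformers import AutoConfig

# Print the architecture class registered for each repo's custom config
for repo in ("nanonets/Nanonets-OCR2-3B", "rednote-hilab/dots.ocr"):
    cfg = AutoConfig.from_pretrained(repo, trust_remote_code=True)
    print(repo, "->", getattr(cfg, "architectures", None))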
@@ -123,25 +125,27 @@ if torch.cuda.is_available():
 print("Using device:", device)
 
 # --- Model Loading ---
-# Load Nanonets-OCR2-3B
+# Load Nanonets-OCR2-3B using AutoModelForImageTextToText
 MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
 processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
-model_v =
+model_v = AutoModelForImageTextToText.from_pretrained(
     MODEL_ID_V,
     trust_remote_code=True,
     torch_dtype=torch.float16,
-
-
+    device_map="auto",
+    attn_implementation="flash_attention_2"
+).eval()
 
 # Load Dots.OCR (rednote-hilab/dots.ocr)
-MODEL_ID_D = "
+MODEL_ID_D = "rednote-hilab/dots.ocr"
 processor_d = AutoProcessor.from_pretrained(MODEL_ID_D, trust_remote_code=True)
 model_d = AutoModelForCausalLM.from_pretrained(
     MODEL_ID_D,
     trust_remote_code=True,
     torch_dtype=torch.float16,
-
-
+    device_map="auto",
+    attn_implementation="flash_attention_2"
+).eval()
 
 
 @spaces.GPU
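One caveat on the new attn_implementation="flash_attention_2" argument: it requires the flash-attn package and a supported GPU, and from_pretrained raises when either is missing. A guarded load that falls back to PyTorch's built-in SDPA kernels is a common defensive pattern on Spaces; this load_vlm helper is a sketch, not something in app.py:

import torch
from transformers import AutoModelForImageTextToText

def load_vlm(model_id: str):
    """Load with FlashAttention 2 when available, otherwise fall back to SDPA."""
    common = dict(trust_remote_code=True, torch_dtype=torch.float16, device_map="auto")
    try:
        model = AutoModelForImageTextToText.from_pretrained(
            model_id, attn_implementation="flash_attention_2", **common
        )
    except (ImportError, ValueError):
        # flash-attn wheel missing or unsupported hardware; SDPA ships with PyTorch
        model = AutoModelForImageTextToText.from_pretrained(
            model_id, attn_implementation="sdpa", **common
        )
    return model.eval()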
@@ -175,55 +179,35 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     }]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
+    # The model was loaded with device_map="auto", so take the target device from the model itself
     inputs = processor(
         text=[prompt_full],
         images=[image],
         return_tensors="pt",
-        padding=True
+        padding=True
+    ).to(model.device)
 
-    #
-
-
-
-
-
-
-
-
-
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-    }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        buffer = buffer.replace("<|im_end|>", "")
-        time.sleep(0.01)
-        yield buffer, buffer
+    # Both models now use a non-streaming generation approach
+    generation_kwargs = {
+        **inputs,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": True,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+    }
 
-
-
-
-
-
-
-
-
-
-
-    }
-    generated_ids = model.generate(**generation_kwargs)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )[0]
-
-    output_text = output_text.replace("<|im_end|>", "").strip()
-    yield output_text, output_text
+    generated_ids = model.generate(**generation_kwargs)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+
+    output_text = output_text.replace("<|im_end|>", "").strip()
+    yield output_text, output_text
 
 
 # Define examples for image inference
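For reference, the streaming path removed here followed the standard TextIteratorStreamer pattern: run model.generate on a background thread and consume decoded text as it arrives. A minimal sketch of that approach, in case partial output is ever wanted again — the helper name stream_generate and the streamer arguments are assumptions, since the removed lines are not fully recoverable from this diff:

from threading import Thread
from transformers import TextIteratorStreamer

def stream_generate(model, processor, inputs, max_new_tokens: int = 1024):
    """Yield progressively longer text while generation runs in the background."""
    # TextIteratorStreamer needs a tokenizer; processors expose theirs via .tokenizer
    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
    # model.generate blocks, so it runs on a worker thread while we consume the stream
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer
    thread.join()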
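Note that generate_image stays a generator after this change: each step yields a pair (the same text for both output components), and on the non-streaming path there is exactly one final yield. A hypothetical call outside Gradio (the model label and every argument value here are illustrative, inferred from the generation kwargs above):

from PIL import Image

page = Image.open("sample_page.png")
# One (markdown_view, raw_view) pair arrives with the final OCR result
for md_view, raw_view in generate_image(
    "Nanonets-OCR2-3B",                  # model_name: radio label, illustrative
    "Convert this page to markdown.",    # text prompt
    page,
    max_new_tokens=1024,
    temperature=0.6,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.2,
):
    print(raw_view)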