Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Community

prithivMLmods committed on Oct 17

Commit

867049a

verified ·

1 Parent(s): 1dc1493

update app

Browse files

Files changed (1) hide show

app.py +21 -25

app.py CHANGED Viewed

@@ -15,11 +15,9 @@ import cv2
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
-    AutoModelForImageTextToText,
-    AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
-    AutoTokenizer
 )
 from transformers.image_utils import load_image
 from gradio.themes import Soft
@@ -125,27 +123,25 @@ if torch.cuda.is_available():
 print("Using device:", device)
 # --- Model Loading ---
-# Load Nanonets-OCR2-3B using its specific, correct class
 MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
 processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
 model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_V,
     trust_remote_code=True,
-    torch_dtype=torch.float16,
-    device_map="auto",
-    attn_implementation="flash_attention_2"
-).eval()
-# Load Dots.OCR (rednote-hilab/dots.ocr)
-MODEL_ID_D = "rednote-hilab/dots.ocr"
-processor_d = AutoProcessor.from_pretrained(MODEL_ID_D, trust_remote_code=True)
-model_d = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID_D,
     trust_remote_code=True,
-    torch_dtype=torch.float16,
-    device_map="auto",
-    attn_implementation="flash_attention_2"
-).eval()
 @spaces.GPU
@@ -163,9 +159,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     if model_name == "Nanonets-OCR2-3B":
         processor = processor_v
         model = model_v
-    elif model_name == "Dots.OCR":
-        processor = processor_d
-        model = model_d
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -183,9 +179,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         text=[prompt_full],
         images=[image],
         return_tensors="pt",
-        padding=True).to(model.device)
-    # Nanonets model supports streaming, so we use it for a better UX
     if model_name == "Nanonets-OCR2-3B":
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
@@ -207,8 +203,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             time.sleep(0.01)
             yield buffer, buffer
-    # Dots.OCR does not use the streamer in the same way, generate full response
-    elif model_name == "Dots.OCR":
         generation_kwargs = {
             **inputs,
             "max_new_tokens": max_new_tokens,
@@ -266,7 +262,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
                     markdown_output = gr.Markdown(label="(Result.Md)")
                 model_choice = gr.Radio(
-                    choices=["Nanonets-OCR2-3B", "Dots.OCR"],
                     label="Select Model",
                     value="Nanonets-OCR2-3B"
                 )

 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
+    PaddleOCRVLForConditionalGeneration, # Added for PaddleOCR-VL
     AutoProcessor,
     TextIteratorStreamer,
 )
 from transformers.image_utils import load_image
 from gradio.themes import Soft
 print("Using device:", device)
 # --- Model Loading ---
+# Load Nanonets-OCR2-3B
 MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
 processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
 model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_V,
     trust_remote_code=True,
+    torch_dtype=torch.float16
+).to(device).eval()
+# Load PaddleOCR-VL
+MODEL_ID_P = "PaddlePaddle/PaddleOCR-VL"
+SUBFOLDER_P = "PaddleOCR-VL-0.9B"
+processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True, subfolder=SUBFOLDER_P)
+model_p = PaddleOCRVLForConditionalGeneration.from_pretrained(
+    MODEL_ID_P,
     trust_remote_code=True,
+    subfolder=SUBFOLDER_P,
+    torch_dtype=torch.float16
+).to(device).eval()
 @spaces.GPU
     if model_name == "Nanonets-OCR2-3B":
         processor = processor_v
         model = model_v
+    elif model_name == "PaddleOCR-VL":
+        processor = processor_p
+        model = model_p
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
         text=[prompt_full],
         images=[image],
         return_tensors="pt",
+        padding=True).to(device)
+    # Nanonets model supports streaming
     if model_name == "Nanonets-OCR2-3B":
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
             time.sleep(0.01)
             yield buffer, buffer
+    # PaddleOCR-VL does not use a streamer, generate full response
+    elif model_name == "PaddleOCR-VL":
         generation_kwargs = {
             **inputs,
             "max_new_tokens": max_new_tokens,
                     markdown_output = gr.Markdown(label="(Result.Md)")
                 model_choice = gr.Radio(
+                    choices=["Nanonets-OCR2-3B", "PaddleOCR-VL"],
                     label="Select Model",
                     value="Nanonets-OCR2-3B"
                 )