Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Files Community

prithivMLmods committed on Oct 20, 2025

Commit

21a5be4

verified ·

1 Parent(s): 2e447a3

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -19

app.py CHANGED Viewed

@@ -134,7 +134,6 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# Load Nanonets-OCR2-3B
 MODEL_ID_M = "nanonets/Nanonets-OCR2-3B"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -143,18 +142,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-# Load Nanonets-OCR2-1.5B-exp
-MODEL_ID_N = "nanonets/Nanonets-OCR2-1.5B-exp"
-processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
-model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_N,
-    trust_remote_code=True,
-    torch_dtype=torch.float16,
-    attn_implementation="flash_attention_2"
-).to(device).eval()
-# Load Dots.OCR from the local, patched directory
 MODEL_PATH_D = model_path_d_local
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 model_d = AutoModelForCausalLM.from_pretrained(
@@ -165,7 +153,6 @@ model_d = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).eval()
-# Load PaddleOCR
 MODEL_ID_P = "strangervisionhf/paddle"
 processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
 model_p = AutoModelForCausalLM.from_pretrained(
@@ -185,8 +172,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     """Generate responses for image input using the selected model."""
     if model_name == "Nanonets-OCR2-3B":
         processor, model = processor_m, model_m
-    elif model_name == "Nanonets-OCR2-1.5B-exp":
-        processor, model = processor_n, model_n
     elif model_name == "Dots.OCR":
         processor, model = processor_d, model_d
     elif model_name == "PaddleOCR":
@@ -201,6 +186,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     images = [image.convert("RGB")]
     if model_name == "PaddleOCR":
         messages = [
             {"role": "user", "content": text}
@@ -237,9 +225,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
 image_examples = [
-    ["Reconstruct the doc [table] as it is.", "images/0.png"],
-    ["Describe the image!", "images/8.png"],
-    ["OCR the image", "images/2.jpg"],
 ]
@@ -266,7 +254,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
                 formatted_output = gr.Markdown(label="Formatted Result")
             model_choice = gr.Radio(
-                choices=["Nanonets-OCR2-3B", "Nanonets-OCR2-1.5B-exp", "Dots.OCR", "PaddleOCR"],
                 label="Select Model",
                 value="Nanonets-OCR2-3B"
             )

 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 MODEL_ID_M = "nanonets/Nanonets-OCR2-3B"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 MODEL_PATH_D = model_path_d_local
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 model_d = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).eval()
 MODEL_ID_P = "strangervisionhf/paddle"
 processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
 model_p = AutoModelForCausalLM.from_pretrained(
     """Generate responses for image input using the selected model."""
     if model_name == "Nanonets-OCR2-3B":
         processor, model = processor_m, model_m
     elif model_name == "Dots.OCR":
         processor, model = processor_d, model_d
     elif model_name == "PaddleOCR":
     images = [image.convert("RGB")]
+    # --- ERROR FIX ---
+    # PaddleOCR's processor expects a different message format than the others.
+    # Its chat template expects the 'content' to be a simple string, not a list.
     if model_name == "PaddleOCR":
         messages = [
             {"role": "user", "content": text}
 image_examples = [
+    ["Perform OCR on the image.", "images/0.png"],
+    ["Phrase the document [page].", "images/8.png"],
+    ["OCR and reconstruct the table perfectly.", "images/2.jpg"],
 ]
                 formatted_output = gr.Markdown(label="Formatted Result")
             model_choice = gr.Radio(
+                choices=["Nanonets-OCR2-3B", "Dots.OCR", "PaddleOCR"],
                 label="Select Model",
                 value="Nanonets-OCR2-3B"
             )