Update app.py

app.py CHANGED
@@ -132,15 +132,23 @@ model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load PaddleOCR-VL
-
+# Load PaddleOCR-VL
+# Using the corrected model path from your previous attempt
+MODEL_ID_P = "strangervisionhf/paddle"
 processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
 model_p = AutoModelForCausalLM.from_pretrained(
     MODEL_ID_P,
     trust_remote_code=True,
-    torch_dtype=torch.float16
+    torch_dtype=torch.float16,
 ).to(device).eval()
 
+# --- Task Prompts for PaddleOCR-VL ---
+PROMPTS = {
+    "ocr": "OCR:",
+    "table": "Table Recognition:",
+    "chart": "Chart Recognition:",
+    "formula": "Formula Recognition:",
+}
 
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
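Note: the new PROMPTS dict is declared here but nothing else in this diff reads from it yet. A minimal sketch of how it could drive the query string, assuming a hypothetical get_prompt helper that is not part of this commit:

    # Hypothetical helper: resolve a task key to its PaddleOCR-VL prompt,
    # falling back to plain OCR for unknown keys.
    def get_prompt(task: str) -> str:
        return PROMPTS.get(task.lower(), PROMPTS["ocr"])

    get_prompt("table")  # -> "Table Recognition:"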
@@ -153,12 +161,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     if image is None:
         yield "Please upload an image.", "Please upload an image."
         return
-
+
     if model_name == "Nanonets-OCR2-3B":
         processor = processor_v
         model = model_v
 
-        # Nanonets/Qwen-VL format: content is a list of dicts
         messages = [{
             "role": "user",
             "content": [
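The Nanonets branch follows the Qwen-VL chat convention the removed comment described: content is a list of typed dicts rather than a plain string. The hunk cuts off mid-structure; the unchanged lines presumably continue along these lines (a sketch, since they are not shown in the diff):

    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},               # the PIL image itself goes in via images=[image]
            {"type": "text", "text": text},  # the user's query
        ],
    }]
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)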
@@ -173,7 +180,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             images=[image],
             return_tensors="pt",
             padding=True).to(device)
-
+
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
             **inputs,
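The lines between this hunk and the next (where buffer is accumulated) are unchanged and therefore elided by the diff. With TextIteratorStreamer they presumably follow the standard transformers pattern: generate() blocks, so it runs on a worker thread while the caller drains the streamer. A sketch, assuming streamer is included in generation_kwargs:

    from threading import Thread

    # Run generation in the background; iterate the streamer on the
    # main thread, yielding partial output as tokens arrive.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer, buffer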
@@ -193,30 +200,31 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             buffer = buffer.replace("<|im_end|>", "")
             time.sleep(0.01)
             yield buffer, buffer
-
+
     elif model_name == "PaddleOCR-VL":
         processor = processor_p
         model = model_p
 
-        #
+        # --- CORRECTED LOGIC FOR PADDLEOCR-VL ---
+        # It expects a simple string content, not a list of dicts.
+        # The user's input `text` should be one of the specific prompts.
         messages = [{"role": "user", "content": text}]
         prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
+
         inputs = processor(text=[prompt_full], images=[image], return_tensors="pt").to(device)
 
-        # Use generation parameters from the reference script for best results
         generation_kwargs = {
             **inputs,
             "max_new_tokens": max_new_tokens,
-            "do_sample": False,
+            "do_sample": False,  # As per the reference script for best results
            "use_cache": True,
         }
-
+
         with torch.inference_mode():
             generated_ids = model.generate(**generation_kwargs)
-
+
         resp = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        # Extract only the
+        # Extract only the model's answer, excluding the prompt
         answer = resp.split(prompt_full)[-1].strip()
         yield answer, answer
 
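One caveat on the extraction step above: resp.split(prompt_full) relies on the decoded output reproducing the prompt verbatim, but batch_decode with skip_special_tokens=True can drop or normalize tokens inside prompt_full, in which case the split silently returns the whole string. A more robust alternative (a sketch, not what this commit does) trims the prompt at the token level before decoding:

    # model.generate() returns the prompt tokens as a prefix of
    # generated_ids for causal LMs, so slice them off first.
    input_len = inputs["input_ids"].shape[1]
    answer = processor.batch_decode(
        generated_ids[:, input_len:], skip_special_tokens=True
    )[0].strip()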
@@ -224,12 +232,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield "Invalid model selected.", "Invalid model selected."
         return
 
-
-# Define examples for image inference, tailored for both models
+# Define examples for image inference, updated for both models
 image_examples = [
     ["OCR:", "images/ocr.png"],
     ["Table Recognition:", "images/4.png"],
-    ["Extract the content
+    ["Extract the content of this invoice.", "images/0.png"]
 ]
 
 
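Each entry in image_examples pairs a query string with an image path, matching the (image_query, image_upload) inputs. The consumer is outside this diff, but it would typically be a gr.Examples component along these lines (a sketch):

    gr.Examples(
        examples=image_examples,
        inputs=[image_query, image_upload],
    )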
@@ -238,7 +245,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
-            image_query = gr.Textbox(label="Query Input", placeholder="Enter query
+            image_query = gr.Textbox(label="Query Input", placeholder="Enter query. For PaddleOCR, use 'OCR:', 'Table Recognition:', etc.")
             image_upload = gr.Image(type="pil", label="Upload Image", height=290)
 
             image_submit = gr.Button("Submit", variant="primary")
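The click wiring for image_submit also sits outside this diff. Since generate_image yields pairs, it would plausibly stream into two output components, something like the sketch below; model_choice, output_raw, and output_md are placeholder names, not components shown in this commit:

    image_submit.click(
        fn=generate_image,
        inputs=[model_choice, image_query, image_upload],
        outputs=[output_raw, output_md],
    )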