Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Community

prithivMLmods committed on Oct 17

Commit

5f43587

verified ·

1 Parent(s): 5780205

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -45

app.py CHANGED Viewed

@@ -15,8 +15,7 @@ import cv2
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
-    AutoModelForImageTextToText,
-    AutoModelForCausalLM,# Added for PaddleOCR-VL
     AutoProcessor,
     TextIteratorStreamer,
 )
@@ -133,13 +132,13 @@ model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-# Load PaddleOCR-VL [PaddlePaddle/PaddleOCR-VL]
 MODEL_ID_P = "strangervisionhf/paddle"
 processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
 model_p = AutoModelForCausalLM.from_pretrained(
     MODEL_ID_P,
     trust_remote_code=True,
-    torch_dtype=torch.float16,
 ).to(device).eval()
@@ -158,30 +157,22 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     if model_name == "Nanonets-OCR2-3B":
         processor = processor_v
         model = model_v
-    elif model_name == "PaddleOCR-VL":
-        processor = processor_p
-        model = model_p
-    else:
-        yield "Invalid model selected.", "Invalid model selected."
-        return
-    messages = [{
-        "role": "user",
-        "content": [
-            {"type": "image"},
-            {"type": "text", "text": text},
-        ]
-    }]
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(
-        text=[prompt_full],
-        images=[image],
-        return_tensors="pt",
-        padding=True).to(device)
-    # Nanonets model supports streaming
-    if model_name == "Nanonets-OCR2-3B":
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
             **inputs,
@@ -202,34 +193,45 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             time.sleep(0.01)
             yield buffer, buffer
-    # PaddleOCR-VL does not use a streamer, generate full response
     elif model_name == "PaddleOCR-VL":
         generation_kwargs = {
             **inputs,
             "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
         }
-        generated_ids = model.generate(**generation_kwargs)
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-        output_text = processor.batch_decode(
-            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )[0]
-        output_text = output_text.replace("<|im_end|>", "").strip()
-        yield output_text, output_text
 # Define examples for image inference
 image_examples = [
     ["Extract the full page.", "images/ocr.png"],
-    ["Extract the content.", "images/4.png"],
-    ["Convert this page to doc [table] precisely for markdown.", "images/0.png"]
 ]
@@ -238,7 +240,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
-            image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
             image_upload = gr.Image(type="pil", label="Upload Image", height=290)
             image_submit = gr.Button("Submit", variant="primary")
@@ -256,7 +258,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
         with gr.Column(scale=3):
                 gr.Markdown("## Output", elem_id="output-title")
-                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=11, show_copy_button=True)
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.Md)")

 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
+    AutoModelForCausalLM, # Added for PaddleOCR-VL
     AutoProcessor,
     TextIteratorStreamer,
 )
     torch_dtype=torch.float16
 ).to(device).eval()
+# Load PaddleOCR-VL
 MODEL_ID_P = "strangervisionhf/paddle"
 processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
 model_p = AutoModelForCausalLM.from_pretrained(
     MODEL_ID_P,
     trust_remote_code=True,
+    torch_dtype=torch.float16
 ).to(device).eval()
     if model_name == "Nanonets-OCR2-3B":
         processor = processor_v
         model = model_v
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": text},
+            ]
+        }]
+        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(
+            text=[prompt_full],
+            images=[image],
+            return_tensors="pt",
+            padding=True).to(device)
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
             **inputs,
             time.sleep(0.01)
             yield buffer, buffer
     elif model_name == "PaddleOCR-VL":
+        processor = processor_p
+        model = model_p
+        # FIX: PaddleOCR-VL expects a simple string content, not a list of dicts.
+        messages = [{"role": "user", "content": text}]
+        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(
+            text=[prompt_full],
+            images=[image],
+            return_tensors="pt"
+        ).to(device)
         generation_kwargs = {
             **inputs,
             "max_new_tokens": max_new_tokens,
+            "do_sample": False,
+            "use_cache": True,
         }
+        with torch.inference_mode():
+            generated_ids = model.generate(**generation_kwargs)
+        resp = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        # Clean the output by removing the prompt
+        answer = resp.split(prompt_full)[-1].strip()
+        yield answer, answer
+    else:
+        yield "Invalid model selected.", "Invalid model selected."
+        return
 # Define examples for image inference
 image_examples = [
     ["Extract the full page.", "images/ocr.png"],
+    ["OCR:", "images/4.png"], # Example prompt for PaddleOCR-VL
+    ["Table Recognition:", "images/0.png"] # Example prompt for PaddleOCR-VL
 ]
     gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
+            image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here... (e.g., 'OCR:')")
             image_upload = gr.Image(type="pil", label="Upload Image", height=290)
             image_submit = gr.Button("Submit", variant="primary")
         with gr.Column(scale=3):
                 gr.Markdown("## Output", elem_id="output-title")
+                output = gr.Textbox(label="Raw Output", interactive=False, lines=11, show_copy_button=True)
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.Md)")