prithivMLmods committed on
Commit
edbf0ea
·
verified ·
1 Parent(s): f75e630

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -6
app.py CHANGED
@@ -190,12 +190,21 @@ def generate_image(model_name: str, text: str, image: Image.Image,
190
 
191
  images = [image.convert("RGB")]
192
 
193
- messages = [
194
- {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}
195
- ]
196
- prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
197
-
198
- inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
 
 
 
 
 
 
 
 
 
199
 
200
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
201
  generation_kwargs = {
 
190
 
191
  images = [image.convert("RGB")]
192
 
193
+ # --- FIX: Handle different prompt formats required by models ---
194
+ if model_name == "PaddleOCR":
195
+ # PaddleOCR's template expects a simple string content for the text part.
196
+ # The image is passed to the processor separately.
197
+ messages = [{"role": "user", "content": text}]
198
+ prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
199
+ inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
200
+ else:
201
+ # Nanonets and Dots.OCR support the modern list format for multimodal content.
202
+ messages = [
203
+ {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}
204
+ ]
205
+ prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
206
+ inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
207
+ # --- END FIX ---
208
 
209
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
210
  generation_kwargs = {