Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Files Community

prithivMLmods committed on Oct 19

Commit

858d0e5

verified ·

1 Parent(s): 87b573a

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -45

app.py CHANGED Viewed

@@ -100,7 +100,7 @@ if not os.path.exists(CACHE_PATH):
 # Download the model files locally
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
-    local_dir=os.path.join(CACHE_PATH, 'dots.ocr'),
     max_workers=20,
     local_dir_use_symlinks=False
 )
@@ -118,7 +118,10 @@ if os.path.exists(config_file_path):
         for line in lines:
             output_lines.append(line)
             if line.strip().startswith("class DotsVLProcessor"):
                 output_lines.append("    attributes = [\"image_processor\", \"tokenizer\"]")
         with open(config_file_path, 'w') as f:
             f.write('\n'.join(output_lines))
         print("Patched configuration_dots.py successfully.")
@@ -156,18 +159,9 @@ model_d = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).eval()
-# Load PaddleOCR
-MODEL_ID_P = "strangervisionhf/paddle"
-processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
-model_p = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID_P,
-    trust_remote_code=True,
-    torch_dtype=torch.bfloat16
-).to(device).eval()
 @spaces.GPU
-def generate_image(model_name: str, text: str, image: Image.Image, task_type: str,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
@@ -178,8 +172,6 @@ def generate_image(model_name: str, text: str, image: Image.Image, task_type: st
         processor, model = processor_m, model_m
     elif model_name == "Dots.OCR":
         processor, model = processor_d, model_d
-    elif model_name == "PaddleOCR":
-        processor, model = processor_p, model_p
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -189,28 +181,15 @@ def generate_image(model_name: str, text: str, image: Image.Image, task_type: st
         return
     images = [image.convert("RGB")]
-    # --- FIX: Use task-specific prompts for PaddleOCR for structured output ---
-    if model_name == "PaddleOCR":
-        task_prompts = {
-            "General OCR": "Recognize the text in this image.",
-            "Table Recognition": "Recognize the table in this image.",
-            "Formula Recognition": "Recognize the formula in this image.",
-            "Layout Analysis": "Analyze the layout of this document. Return the result in markdown format."
         }
-        # Use the task-specific prompt and ignore the user's free-form text query
-        prompt_text = task_prompts.get(task_type, "Recognize the text in this image.")
-        messages = [{"role": "user", "content": prompt_text}]
-        prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-    else:
-        # For other models, use the standard user-provided text query
-        messages = [
-            {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}
-        ]
-        prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-    # --- END FIX ---
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
@@ -262,23 +241,14 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
                 formatted_output = gr.Markdown(label="Formatted Result")
             model_choice = gr.Radio(
-                choices=["Nanonets-OCR2-3B", "Dots.OCR", "PaddleOCR"],
                 label="Select Model",
                 value="Nanonets-OCR2-3B"
             )
-            # --- NEW UI ELEMENT FOR PADDLEOCR ---
-            task_type_dropdown = gr.Radio(
-                choices=["General OCR", "Table Recognition", "Formula Recognition", "Layout Analysis"],
-                label="Select Task for PaddleOCR",
-                value="General OCR",
-                info="This selection is used ONLY for the PaddleOCR model to ensure structured output. The 'Query Input' box will be ignored."
-            )
-            # --- END NEW UI ELEMENT ---
     image_submit.click(
         fn=generate_image,
-        inputs=[model_choice, image_query, image_upload, task_type_dropdown, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[raw_output, formatted_output]
     )

 # Download the model files locally
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
+    local_dir=CACHE_PATH,
     max_workers=20,
     local_dir_use_symlinks=False
 )
         for line in lines:
             output_lines.append(line)
             if line.strip().startswith("class DotsVLProcessor"):
+                # Insert the attributes line to specify which processors to load
                 output_lines.append("    attributes = [\"image_processor\", \"tokenizer\"]")
+        # Write the modified content back to the file
         with open(config_file_path, 'w') as f:
             f.write('\n'.join(output_lines))
         print("Patched configuration_dots.py successfully.")
     trust_remote_code=True
 ).eval()
 @spaces.GPU
+def generate_image(model_name: str, text: str, image: Image.Image,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
         processor, model = processor_m, model_m
     elif model_name == "Dots.OCR":
         processor, model = processor_d, model_d
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
         return
     images = [image.convert("RGB")]
+    messages = [
+        {
+            "role": "user",
+            "content": [{"type": "image"}] + [{"type": "text", "text": text}]
         }
+    ]
+    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
                 formatted_output = gr.Markdown(label="Formatted Result")
             model_choice = gr.Radio(
+                choices=["Nanonets-OCR2-3B", "Dots.OCR"],
                 label="Select Model",
                 value="Nanonets-OCR2-3B"
             )
     image_submit.click(
         fn=generate_image,
+        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[raw_output, formatted_output]
     )