Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Files Community

prithivMLmods committed on Oct 19

Commit

87b573a

verified ·

1 Parent(s): eebb9c6

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -49

app.py CHANGED Viewed

@@ -91,19 +91,27 @@ css = """
 """
 # --- Fix for Dots.OCR Processor Loading ---
 CACHE_PATH = "./model_cache"
 if not os.path.exists(CACHE_PATH):
     os.makedirs(CACHE_PATH)
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
     local_dir=os.path.join(CACHE_PATH, 'dots.ocr'),
     max_workers=20,
     local_dir_use_symlinks=False
 )
 config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
 if os.path.exists(config_file_path):
     with open(config_file_path, 'r') as f:
         input_code = f.read()
     lines = input_code.splitlines()
     if "class DotsVLProcessor" in input_code and not any("attributes = " in line for line in lines):
         output_lines = []
@@ -114,52 +122,58 @@ if os.path.exists(config_file_path):
         with open(config_file_path, 'w') as f:
             f.write('\n'.join(output_lines))
         print("Patched configuration_dots.py successfully.")
 sys.path.append(model_path_d_local)
 # --- Model Loading ---
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 2048
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Nanonets-OCR2-3B
 MODEL_ID_M = "nanonets/Nanonets-OCR2-3B"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_M, trust_remote_code=True, torch_dtype=torch.float16
 ).to(device).eval()
 # Load Dots.OCR from the local, patched directory
 MODEL_PATH_D = model_path_d_local
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 model_d = AutoModelForCausalLM.from_pretrained(
-    MODEL_PATH_D, attn_implementation="eager", torch_dtype=torch.bfloat16,
-    device_map="auto", trust_remote_code=True
 ).eval()
 # Load PaddleOCR
 MODEL_ID_P = "strangervisionhf/paddle"
 processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
 model_p = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID_P, trust_remote_code=True, torch_dtype=torch.bfloat16
 ).to(device).eval()
 @spaces.GPU
-def generate_image(model_name: str, text: str, paddle_task: str, image: Image.Image,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for image input using the selected model."""
-    PROMPTS = {
-        "OCR": "OCR:",
-        "Table Recognition": "Table Recognition:",
-        "Chart Recognition": "Chart Recognition:",
-        "Formula Recognition": "Formula Recognition:",
-    }
     if model_name == "Nanonets-OCR2-3B":
         processor, model = processor_m, model_m
     elif model_name == "Dots.OCR":
@@ -175,16 +189,22 @@ def generate_image(model_name: str, text: str, paddle_task: str, image: Image.Im
         return
     images = [image.convert("RGB")]
-    # --- FIX: Handle different prompt formats required by models ---
     if model_name == "PaddleOCR":
-        # PaddleOCR expects specific, predefined prompts for its tasks.
-        prompt_text = PROMPTS.get(paddle_task, "OCR:")
         messages = [{"role": "user", "content": prompt_text}]
         prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
         inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
     else:
-        # Nanonets and Dots.OCR support the modern list format for multimodal content.
         messages = [
             {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}
         ]
@@ -223,17 +243,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
-            # General query input, visible by default
-            image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...", visible=True)
-            # Specific task selector for PaddleOCR, hidden by default
-            paddle_task = gr.Radio(
-                label="Select PaddleOCR Task",
-                choices=["OCR", "Table Recognition", "Chart Recognition", "Formula Recognition"],
-                value="OCR",
-                visible=False
-            )
             image_upload = gr.Image(type="pil", label="Upload Image", height=320)
             image_submit = gr.Button("Submit", variant="primary")
             gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
@@ -256,30 +266,19 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
                 label="Select Model",
                 value="Nanonets-OCR2-3B"
             )
-    # Function to dynamically update the UI based on model selection
-    def update_ui_for_model(model_name):
-        if model_name == "PaddleOCR":
-            return {
-                image_query: gr.Textbox(visible=False),
-                paddle_task: gr.Radio(visible=True)
-            }
-        else:
-            return {
-                image_query: gr.Textbox(visible=True),
-                paddle_task: gr.Radio(visible=False)
-            }
-    # Attach the function to the model_choice radio button's change event
-    model_choice.change(
-        fn=update_ui_for_model,
-        inputs=model_choice,
-        outputs=[image_query, paddle_task]
-    )
     image_submit.click(
         fn=generate_image,
-        inputs=[model_choice, image_query, paddle_task, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[raw_output, formatted_output]
     )

 """
 # --- Fix for Dots.OCR Processor Loading ---
+# Define a local directory to cache the model
 CACHE_PATH = "./model_cache"
 if not os.path.exists(CACHE_PATH):
     os.makedirs(CACHE_PATH)
+# Download the model files locally
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
     local_dir=os.path.join(CACHE_PATH, 'dots.ocr'),
     max_workers=20,
     local_dir_use_symlinks=False
 )
+# Modify the configuration file to fix the processor loading issue
 config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
 if os.path.exists(config_file_path):
     with open(config_file_path, 'r') as f:
         input_code = f.read()
     lines = input_code.splitlines()
     if "class DotsVLProcessor" in input_code and not any("attributes = " in line for line in lines):
         output_lines = []
         with open(config_file_path, 'w') as f:
             f.write('\n'.join(output_lines))
         print("Patched configuration_dots.py successfully.")
+# Add the local model path to sys.path so transformers can use the modified code
 sys.path.append(model_path_d_local)
 # --- Model Loading ---
+# Constants for text generation
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 2048
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Nanonets-OCR2-3B
 MODEL_ID_M = "nanonets/Nanonets-OCR2-3B"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_M,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
 ).to(device).eval()
 # Load Dots.OCR from the local, patched directory
 MODEL_PATH_D = model_path_d_local
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 model_d = AutoModelForCausalLM.from_pretrained(
+    MODEL_PATH_D,
+    attn_implementation="eager",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    trust_remote_code=True
 ).eval()
 # Load PaddleOCR
 MODEL_ID_P = "strangervisionhf/paddle"
 processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
 model_p = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID_P,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16
 ).to(device).eval()
 @spaces.GPU
+def generate_image(model_name: str, text: str, image: Image.Image, task_type: str,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for image input using the selected model."""
     if model_name == "Nanonets-OCR2-3B":
         processor, model = processor_m, model_m
     elif model_name == "Dots.OCR":
         return
     images = [image.convert("RGB")]
+    # --- FIX: Use task-specific prompts for PaddleOCR for structured output ---
     if model_name == "PaddleOCR":
+        task_prompts = {
+            "General OCR": "Recognize the text in this image.",
+            "Table Recognition": "Recognize the table in this image.",
+            "Formula Recognition": "Recognize the formula in this image.",
+            "Layout Analysis": "Analyze the layout of this document. Return the result in markdown format."
+        }
+        # Use the task-specific prompt and ignore the user's free-form text query
+        prompt_text = task_prompts.get(task_type, "Recognize the text in this image.")
         messages = [{"role": "user", "content": prompt_text}]
         prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
         inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
     else:
+        # For other models, use the standard user-provided text query
         messages = [
             {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}
         ]
     gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
+            image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
             image_upload = gr.Image(type="pil", label="Upload Image", height=320)
             image_submit = gr.Button("Submit", variant="primary")
             gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
                 label="Select Model",
                 value="Nanonets-OCR2-3B"
             )
+            # --- NEW UI ELEMENT FOR PADDLEOCR ---
+            task_type_dropdown = gr.Radio(
+                choices=["General OCR", "Table Recognition", "Formula Recognition", "Layout Analysis"],
+                label="Select Task for PaddleOCR",
+                value="General OCR",
+                info="This selection is used ONLY for the PaddleOCR model to ensure structured output. The 'Query Input' box will be ignored."
+            )
+            # --- END NEW UI ELEMENT ---
     image_submit.click(
         fn=generate_image,
+        inputs=[model_choice, image_query, image_upload, task_type_dropdown, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[raw_output, formatted_output]
     )