Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Files Community

prithivMLmods committed on Oct 18

Commit

71aa9eb

verified ·

1 Parent(s): 30151c4

update app

Browse files

Files changed (1) hide show

app.py +51 -7

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import os
-import random
 from threading import Thread
 from typing import Iterable
 import gradio as gr
 import spaces
@@ -89,6 +90,48 @@ css = """
 }
 """
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 5120
 DEFAULT_MAX_NEW_TOKENS = 3072
@@ -105,8 +148,8 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-# Load Dots.OCR
-MODEL_PATH_D = "rednote-hilab/dots.ocr"
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 model_d = AutoModelForCausalLM.from_pretrained(
     MODEL_PATH_D,
@@ -157,6 +200,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         "top_p": top_p,
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
     }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
@@ -182,26 +226,26 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
             image_upload = gr.Image(type="pil", label="Upload Image", height=320)
             image_submit = gr.Button("Submit", variant="primary")
             gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
         with gr.Column(scale=3):
             gr.Markdown("## Output", elem_id="output-title")
             raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=13, show_copy_button=True)
             with gr.Accordion("Formatted Result", open=True):
                 formatted_output = gr.Markdown(label="Formatted Result")
             model_choice = gr.Radio(
                 choices=["Nanonets-OCR2-3B", "Dots.OCR"],
                 label="Select Model",
                 value="Nanonets-OCR2-3B"
             )
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],

 import os
+import sys
 from threading import Thread
 from typing import Iterable
+from huggingface_hub import snapshot_download
 import gradio as gr
 import spaces
 }
 """
+# --- Fix for Dots.OCR Processor Loading ---
+# Define a local directory to cache the model
+CACHE_PATH = "./model_cache"
+if not os.path.exists(CACHE_PATH):
+    os.makedirs(CACHE_PATH)
+# Download the model files locally
+model_path_d_local = snapshot_download(
+    repo_id='rednote-hilab/dots.ocr',
+    local_dir=CACHE_PATH,
+    max_workers=20,
+    local_dir_use_symlinks=False
+)
+# Modify the configuration file to fix the processor loading issue
+config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
+if os.path.exists(config_file_path):
+    with open(config_file_path, 'r') as f:
+        input_code = f.read()
+    lines = input_code.splitlines()
+    if "class DotsVLProcessor" in input_code and not any("attributes = " in line for line in lines):
+        output_lines = []
+        for line in lines:
+            output_lines.append(line)
+            if line.strip().startswith("class DotsVLProcessor"):
+                # Insert the attributes line to specify which processors to load
+                output_lines.append("    attributes = [\"image_processor\", \"tokenizer\"]")
+        # Write the modified content back to the file
+        with open(config_file_path, 'w') as f:
+            f.write('\n'.join(output_lines))
+        print("Patched configuration_dots.py successfully.")
+# Add the local model path to sys.path so transformers can use the modified code
+sys.path.append(model_path_d_local)
+# --- Model Loading ---
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 5120
 DEFAULT_MAX_NEW_TOKENS = 3072
     torch_dtype=torch.float16
 ).to(device).eval()
+# Load Dots.OCR from the local, patched directory
+MODEL_PATH_D = model_path_d_local
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 model_d = AutoModelForCausalLM.from_pretrained(
     MODEL_PATH_D,
         "top_p": top_p,
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
+        "do_sample": True
     }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
             image_upload = gr.Image(type="pil", label="Upload Image", height=320)
             image_submit = gr.Button("Submit", variant="primary")
             gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
         with gr.Column(scale=3):
             gr.Markdown("## Output", elem_id="output-title")
             raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=13, show_copy_button=True)
             with gr.Accordion("Formatted Result", open=True):
                 formatted_output = gr.Markdown(label="Formatted Result")
             model_choice = gr.Radio(
                 choices=["Nanonets-OCR2-3B", "Dots.OCR"],
                 label="Select Model",
                 value="Nanonets-OCR2-3B"
             )
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],