Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Community

prithivMLmods committed on Oct 20

Commit

bf6df93

verified ·

1 Parent(s): 388e24a

update app

Browse files

Files changed (1) hide show

app.py +38 -71

app.py CHANGED Viewed

@@ -13,7 +13,6 @@ from transformers import (
     AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
-    AutoTokenizer,
 )
 from gradio.themes import Soft
@@ -92,66 +91,44 @@ css = """
 }
 """
-# --- Model Patching ---
-# Define a local directory to cache models
 CACHE_PATH = "./model_cache"
 if not os.path.exists(CACHE_PATH):
     os.makedirs(CACHE_PATH)
-# --- Fix for Dots.OCR Processor Loading ---
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
-    local_dir=os.path.join(CACHE_PATH, 'dots.ocr'),
     max_workers=20,
     local_dir_use_symlinks=False
 )
 config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
 if os.path.exists(config_file_path):
     with open(config_file_path, 'r') as f:
         input_code = f.read()
     lines = input_code.splitlines()
     if "class DotsVLProcessor" in input_code and not any("attributes = " in line for line in lines):
         output_lines = []
         for line in lines:
             output_lines.append(line)
             if line.strip().startswith("class DotsVLProcessor"):
                 output_lines.append("    attributes = [\"image_processor\", \"tokenizer\"]")
         with open(config_file_path, 'w') as f:
             f.write('\n'.join(output_lines))
         print("Patched configuration_dots.py successfully.")
-sys.path.append(model_path_d_local)
-# --- Fix for DeepSeek-OCR ImportError ---
-model_path_s_local = snapshot_download(
-    repo_id='deepseek-ai/DeepSeek-OCR',
-    local_dir=os.path.join(CACHE_PATH, 'DeepSeek-OCR'),
-    max_workers=20,
-    local_dir_use_symlinks=False
-)
-modeling_file_path = os.path.join(model_path_s_local, "modeling_deepseekv2.py")
-if os.path.exists(modeling_file_path):
-    with open(modeling_file_path, 'r', encoding='utf-8') as f:
-        input_code = f.read()
-    original_import = "from transformers.models.llama.modeling_llama import (\n    LlamaAttention,\n    LlamaFlashAttention2\n)"
-    if original_import in input_code:
-        safe_import = """from transformers.models.llama.modeling_llama import (
-    LlamaAttention
-)
-try:
-    from transformers.models.llama.modeling_llama import LlamaFlashAttention2
-except ImportError:
-    LlamaFlashAttention2 = LlamaAttention"""
-        patched_code = input_code.replace(original_import, safe_import)
-        with open(modeling_file_path, 'w', encoding='utf-8') as f:
-            f.write(patched_code)
-        print("Patched modeling_deepseekv2.py successfully.")
-sys.path.append(model_path_s_local)
-# --- NEW: Import the specific model class for DeepSeek-OCR ---
-from modeling_deepseekocr import DeepseekOCRForCausalLM
 # --- Model Loading ---
@@ -177,21 +154,19 @@ MODEL_PATH_D = model_path_d_local
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 model_d = AutoModelForCausalLM.from_pretrained(
     MODEL_PATH_D,
-    _attn_implementation="flash_attention_2",
     torch_dtype=torch.bfloat16,
     device_map="auto",
     trust_remote_code=True
 ).eval()
-# Load DeepSeek-OCR from the local, patched directory using its specific class
-MODEL_PATH_S = model_path_s_local
-processor_s = AutoProcessor.from_pretrained(MODEL_PATH_S, trust_remote_code=True)
-# --- MODIFIED: Use the specific class instead of AutoModelForCausalLM ---
-model_s = DeepseekOCRForCausalLM.from_pretrained(
-    MODEL_PATH_S,
-    _attn_implementation='eager',
-    torch_dtype=torch.bfloat16,
     trust_remote_code=True,
 ).to(device).eval()
@@ -207,8 +182,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         processor, model = processor_m, model_m
     elif model_name == "Dots.OCR":
         processor, model = processor_d, model_d
-    elif model_name == "DeepSeek-OCR":
-        processor, model = processor_s, model_s
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -219,24 +194,16 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     images = [image.convert("RGB")]
-    if model_name == "DeepSeek-OCR":
-        messages = [
-            {"role": "user", "content": f"<image>\n<|grounding|>{text}"}
-        ]
-        prompt = processor.tokenizer.apply_chat_template(messages, add_generation_prompt=True)
-        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-    else:
-        messages = [
-            {
-                "role": "user",
-                "content": [{"type": "image"}] + [{"type": "text", "text": text}]
-            }
-        ]
-        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
         "streamer": streamer,
@@ -257,14 +224,14 @@ def generate_image(model_name: str, text: str, image: Image.Image,
 # Define examples for image inference
 image_examples = [
-    ["Reconstruct the doc [table] as it is.", "images/a.jpg"],
-    ["Extract all content.", "images/b.jpg"],
-    ["OCR the image", "images/c.jpg"],
 ]
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
-    gr.Markdown("# **Multimodal OCR3**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
             image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
@@ -281,14 +248,14 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
         with gr.Column(scale=3):
             gr.Markdown("## Output", elem_id="output-title")
-            raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=9, show_copy_button=True)
             with gr.Accordion("Formatted Result", open=False):
                 formatted_output = gr.Markdown(label="Formatted Result")
             model_choice = gr.Radio(
-                choices=["DeepSeek-OCR", "Nanonets-OCR2-3B", "Dots.OCR"],
                 label="Select Model",
-                value="DeepSeek-OCR"
             )
     image_submit.click(

     AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
 )
 from gradio.themes import Soft
 }
 """
+# --- Fix for Dots.OCR Processor Loading ---
+# Define a local directory to cache the model
 CACHE_PATH = "./model_cache"
 if not os.path.exists(CACHE_PATH):
     os.makedirs(CACHE_PATH)
+# Download the model files locally
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
+    local_dir=os.path.join(CACHE_PATH, 'dots.ocr'), # Create a dedicated subfolder
     max_workers=20,
     local_dir_use_symlinks=False
 )
+# Modify the configuration file to fix the processor loading issue
 config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
 if os.path.exists(config_file_path):
     with open(config_file_path, 'r') as f:
         input_code = f.read()
     lines = input_code.splitlines()
     if "class DotsVLProcessor" in input_code and not any("attributes = " in line for line in lines):
         output_lines = []
         for line in lines:
             output_lines.append(line)
             if line.strip().startswith("class DotsVLProcessor"):
+                # Insert the attributes line to specify which processors to load
                 output_lines.append("    attributes = [\"image_processor\", \"tokenizer\"]")
+        # Write the modified content back to the file
         with open(config_file_path, 'w') as f:
             f.write('\n'.join(output_lines))
         print("Patched configuration_dots.py successfully.")
+# Add the local model path to sys.path so transformers can use the modified code
+sys.path.append(model_path_d_local)
 # --- Model Loading ---
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 model_d = AutoModelForCausalLM.from_pretrained(
     MODEL_PATH_D,
+    attn_implementation="eager",
     torch_dtype=torch.bfloat16,
     device_map="auto",
     trust_remote_code=True
 ).eval()
+# Load PaddleOCR
+MODEL_ID_P = "strangervisionhf/paddle"
+processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
+model_p = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID_P,
     trust_remote_code=True,
+    torch_dtype=torch.bfloat16
 ).to(device).eval()
         processor, model = processor_m, model_m
     elif model_name == "Dots.OCR":
         processor, model = processor_d, model_d
+    elif model_name == "PaddleOCR":
+        processor, model = processor_p, model_p
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
     images = [image.convert("RGB")]
+    messages = [
+        {
+            "role": "user",
+            "content": [{"type": "image"}] + [{"type": "text", "text": text}]
+        }
+    ]
+    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
         "streamer": streamer,
 # Define examples for image inference
 image_examples = [
+    ["Reconstruct the doc [table] as it is.", "images/0.png"],
+    ["Describe the image!", "images/8.png"],
+    ["OCR the image", "images/2.jpg"],
 ]
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
+    gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
             image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
         with gr.Column(scale=3):
             gr.Markdown("## Output", elem_id="output-title")
+            raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=11, show_copy_button=True)
             with gr.Accordion("Formatted Result", open=False):
                 formatted_output = gr.Markdown(label="Formatted Result")
             model_choice = gr.Radio(
+                choices=["Nanonets-OCR2-3B", "Dots.OCR", "PaddleOCR"],
                 label="Select Model",
+                value="Nanonets-OCR2-3B"
             )
     image_submit.click(