Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Files Community

prithivMLmods committed on Oct 20

Commit

ebd6535

verified ·

1 Parent(s): 5838753

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -29

app.py CHANGED Viewed

@@ -9,12 +9,11 @@ import spaces
 import torch
 from PIL import Image
 from transformers import (
-    Qwen2VLForConditionalGeneration,
     Qwen2_5_VLForConditionalGeneration,
     AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
-    AutoTokenizer,
 )
 from gradio.themes import Soft
@@ -93,24 +92,22 @@ css = """
 }
 """
-# --- Fix for Dots.OCR Processor Loading ---
-# Define a local directory to cache the model
 CACHE_PATH = "./model_cache"
 if not os.path.exists(CACHE_PATH):
     os.makedirs(CACHE_PATH)
-# Download the model files locally
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
-    local_dir=CACHE_PATH,
     max_workers=20,
     local_dir_use_symlinks=False
 )
-# Modify the configuration file to fix the processor loading issue
 config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
 if os.path.exists(config_file_path):
     with open(config_file_path, 'r') as f:
         input_code = f.read()
@@ -121,17 +118,37 @@ if os.path.exists(config_file_path):
         for line in lines:
             output_lines.append(line)
             if line.strip().startswith("class DotsVLProcessor"):
-                # Insert the attributes line to specify which processors to load
                 output_lines.append("    attributes = [\"image_processor\", \"tokenizer\"]")
-        # Write the modified content back to the file
         with open(config_file_path, 'w') as f:
             f.write('\n'.join(output_lines))
         print("Patched configuration_dots.py successfully.")
-# Add the local model path to sys.path so transformers can use the modified code
 sys.path.append(model_path_d_local)
 # --- Model Loading ---
@@ -162,13 +179,14 @@ model_d = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).eval()
-# Load DeepSeek-OCR
-MODEL_ID_S = 'deepseek-ai/DeepSeek-OCR'
-processor_s = AutoProcessor.from_pretrained(MODEL_ID_S, trust_remote_code=True)
 model_s = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID_S,
-    _attn_implementation='flash_attention_2',
     torch_dtype=torch.bfloat16,
     trust_remote_code=True,
 ).eval()
@@ -196,18 +214,12 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         return
     images = [image.convert("RGB")]
-    # For DeepSeek-OCR, the recommended prompt format is slightly different
     if model_name == "DeepSeek-OCR":
-        # Using a format found in documentation for better performance
-        prompt_text = f"<image>\n<|grounding|>{text}"
-        messages = [
-            {"role": "user", "content": prompt_text}
-        ]
-        # apply_chat_template is not used directly, instead we build the prompt manually
-        prompt = processor.tokenizer.apply_chat_template(messages, add_generation_prompt=True)
-        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
     else:
         messages = [
             {
@@ -216,7 +228,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             }
         ]
         prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
     streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)

 import torch
 from PIL import Image
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
+    AutoTokenizer, # Added for DeepSeek, though AutoProcessor is used
 )
 from gradio.themes import Soft
 }
 """
+# --- Local Model Caching and Patching ---
+# Define a local directory to cache all models
 CACHE_PATH = "./model_cache"
 if not os.path.exists(CACHE_PATH):
     os.makedirs(CACHE_PATH)
+# --- Fix for Dots.OCR Processor Loading ---
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
+    local_dir=os.path.join(CACHE_PATH, "dots.ocr"),
     max_workers=20,
     local_dir_use_symlinks=False
 )
 config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
 if os.path.exists(config_file_path):
     with open(config_file_path, 'r') as f:
         input_code = f.read()
         for line in lines:
             output_lines.append(line)
             if line.strip().startswith("class DotsVLProcessor"):
                 output_lines.append("    attributes = [\"image_processor\", \"tokenizer\"]")
         with open(config_file_path, 'w') as f:
             f.write('\n'.join(output_lines))
         print("Patched configuration_dots.py successfully.")
 sys.path.append(model_path_d_local)
+# --- Fix for DeepSeek-OCR ImportError ---
+model_path_s_local = snapshot_download(
+    repo_id='deepseek-ai/DeepSeek-OCR',
+    local_dir=os.path.join(CACHE_PATH, "DeepSeek-OCR"),
+    max_workers=20,
+    local_dir_use_symlinks=False
+)
+deepseek_modeling_file = os.path.join(model_path_s_local, "modeling_deepseekv2.py")
+if os.path.exists(deepseek_modeling_file):
+    with open(deepseek_modeling_file, 'r', encoding='utf-8') as f:
+        content = f.read()
+    # Check if the problematic import exists and hasn't been patched yet
+    problematic_import_str = "from transformers.models.llama.modeling_llama import (\n    LlamaFlashAttention2,"
+    if problematic_import_str in content:
+        # Patch the file by commenting out the LlamaFlashAttention2 import
+        patched_content = content.replace("LlamaFlashAttention2,", "# LlamaFlashAttention2,")
+        with open(deepseek_modeling_file, 'w', encoding='utf-8') as f:
+            f.write(patched_content)
+        print("Patched modeling_deepseekv2.py successfully.")
+sys.path.append(model_path_s_local)
 # --- Model Loading ---
     trust_remote_code=True
 ).eval()
+# Load DeepSeek-OCR from the local, patched directory
+MODEL_PATH_S = model_path_s_local
+processor_s = AutoProcessor.from_pretrained(MODEL_PATH_S, trust_remote_code=True)
 model_s = AutoModelForCausalLM.from_pretrained(
+    MODEL_PATH_S,
+    _attn_implementation='eager',
     torch_dtype=torch.bfloat16,
+    device_map="auto",
     trust_remote_code=True,
 ).eval()
         return
     images = [image.convert("RGB")]
+    # Use the model's appropriate processor and chat template
     if model_name == "DeepSeek-OCR":
+        messages = [{"role": "user", "content": f"<image>\n{text}"}]
+        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=images, return_tensors="pt").to(model.device)
     else:
         messages = [
             {
             }
         ]
         prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=images, return_tensors="pt").to(model.device)
     streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)