Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Community

prithivMLmods committed on Oct 20

Commit

d6bbd32

verified ·

1 Parent(s): ebd6535

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -29

app.py CHANGED Viewed

@@ -92,9 +92,9 @@ css = """
 }
 """
-# --- Local Model Caching and Patching ---
-# Define a local directory to cache all models
 CACHE_PATH = "./model_cache"
 if not os.path.exists(CACHE_PATH):
     os.makedirs(CACHE_PATH)
@@ -102,16 +102,14 @@ if not os.path.exists(CACHE_PATH):
 # --- Fix for Dots.OCR Processor Loading ---
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
-    local_dir=os.path.join(CACHE_PATH, "dots.ocr"),
     max_workers=20,
     local_dir_use_symlinks=False
 )
 config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
 if os.path.exists(config_file_path):
     with open(config_file_path, 'r') as f:
         input_code = f.read()
     lines = input_code.splitlines()
     if "class DotsVLProcessor" in input_code and not any("attributes = " in line for line in lines):
         output_lines = []
@@ -122,31 +120,40 @@ if os.path.exists(config_file_path):
         with open(config_file_path, 'w') as f:
             f.write('\n'.join(output_lines))
         print("Patched configuration_dots.py successfully.")
 sys.path.append(model_path_d_local)
 # --- Fix for DeepSeek-OCR ImportError ---
 model_path_s_local = snapshot_download(
     repo_id='deepseek-ai/DeepSeek-OCR',
-    local_dir=os.path.join(CACHE_PATH, "DeepSeek-OCR"),
     max_workers=20,
     local_dir_use_symlinks=False
 )
-deepseek_modeling_file = os.path.join(model_path_s_local, "modeling_deepseekv2.py")
-if os.path.exists(deepseek_modeling_file):
-    with open(deepseek_modeling_file, 'r', encoding='utf-8') as f:
-        content = f.read()
-    # Check if the problematic import exists and hasn't been patched yet
-    problematic_import_str = "from transformers.models.llama.modeling_llama import (\n    LlamaFlashAttention2,"
-    if problematic_import_str in content:
-        # Patch the file by commenting out the LlamaFlashAttention2 import
-        patched_content = content.replace("LlamaFlashAttention2,", "# LlamaFlashAttention2,")
-        with open(deepseek_modeling_file, 'w', encoding='utf-8') as f:
-            f.write(patched_content)
         print("Patched modeling_deepseekv2.py successfully.")
 sys.path.append(model_path_s_local)
@@ -184,11 +191,10 @@ MODEL_PATH_S = model_path_s_local
 processor_s = AutoProcessor.from_pretrained(MODEL_PATH_S, trust_remote_code=True)
 model_s = AutoModelForCausalLM.from_pretrained(
     MODEL_PATH_S,
-    _attn_implementation='eager',
     torch_dtype=torch.bfloat16,
-    device_map="auto",
     trust_remote_code=True,
-).eval()
 @spaces.GPU
@@ -214,12 +220,17 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         return
     images = [image.convert("RGB")]
-    # Use the model's appropriate processor and chat template
     if model_name == "DeepSeek-OCR":
-        messages = [{"role": "user", "content": f"<image>\n{text}"}]
-        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-        inputs = processor(text=prompt, images=images, return_tensors="pt").to(model.device)
     else:
         messages = [
             {
@@ -228,7 +239,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             }
         ]
         prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-        inputs = processor(text=prompt, images=images, return_tensors="pt").to(model.device)
     streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)

 }
 """
+# --- Model Patching ---
+# Define a local directory to cache models
 CACHE_PATH = "./model_cache"
 if not os.path.exists(CACHE_PATH):
     os.makedirs(CACHE_PATH)
 # --- Fix for Dots.OCR Processor Loading ---
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
+    local_dir=os.path.join(CACHE_PATH, 'dots.ocr'),
     max_workers=20,
     local_dir_use_symlinks=False
 )
 config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
 if os.path.exists(config_file_path):
     with open(config_file_path, 'r') as f:
         input_code = f.read()
     lines = input_code.splitlines()
     if "class DotsVLProcessor" in input_code and not any("attributes = " in line for line in lines):
         output_lines = []
         with open(config_file_path, 'w') as f:
             f.write('\n'.join(output_lines))
         print("Patched configuration_dots.py successfully.")
 sys.path.append(model_path_d_local)
 # --- Fix for DeepSeek-OCR ImportError ---
 model_path_s_local = snapshot_download(
     repo_id='deepseek-ai/DeepSeek-OCR',
+    local_dir=os.path.join(CACHE_PATH, 'DeepSeek-OCR'),
     max_workers=20,
     local_dir_use_symlinks=False
 )
+modeling_file_path = os.path.join(model_path_s_local, "modeling_deepseekv2.py")
+if os.path.exists(modeling_file_path):
+    with open(modeling_file_path, 'r', encoding='utf-8') as f:
+        input_code = f.read()
+    # The problematic import line
+    original_import = "from transformers.models.llama.modeling_llama import (\n    LlamaAttention,\n    LlamaFlashAttention2\n)"
+    if original_import in input_code:
+        # Replace with a safe version that handles the ImportError
+        safe_import = """from transformers.models.llama.modeling_llama import (
+    LlamaAttention
+)
+try:
+    from transformers.models.llama.modeling_llama import LlamaFlashAttention2
+except ImportError:
+    print("Warning: `LlamaFlashAttention2` not found. Falling back to `LlamaAttention`.")
+    LlamaFlashAttention2 = LlamaAttention"""
+        patched_code = input_code.replace(original_import, safe_import)
+        with open(modeling_file_path, 'w', encoding='utf-8') as f:
+            f.write(patched_code)
         print("Patched modeling_deepseekv2.py successfully.")
 sys.path.append(model_path_s_local)
 processor_s = AutoProcessor.from_pretrained(MODEL_PATH_S, trust_remote_code=True)
 model_s = AutoModelForCausalLM.from_pretrained(
     MODEL_PATH_S,
+    _attn_implementation='flash_attention_2',
     torch_dtype=torch.bfloat16,
     trust_remote_code=True,
+).to(device).eval()
 @spaces.GPU
         return
     images = [image.convert("RGB")]
+    # For DeepSeek-OCR, the recommended prompt format is slightly different
     if model_name == "DeepSeek-OCR":
+        # Using a format found in documentation for better performance
+        # Note: The processor is expected to handle the full templating.
+        # This approach follows the user's implementation.
+        messages = [
+            {"role": "user", "content": f"<image>\n<|grounding|>{text}"}
+        ]
+        prompt = processor.tokenizer.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
     else:
         messages = [
             {
             }
         ]
         prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
     streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)