Spaces: Running on Zero
feat: Add LlamaFlashAttention2 compatibility alias and eager attention implementation for model loading.
Browse files
app.py
CHANGED
|
@@ -10,6 +10,13 @@ import io
|
|
| 10 |
import gc
|
| 11 |
import warnings
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
# Suppress annoying warnings
|
| 14 |
warnings.filterwarnings("ignore", message="The parameters have been moved from the Blocks constructor to the launch()")
|
| 15 |
warnings.filterwarnings("ignore", message="CUDA is not available or torch_xla is imported")
|
|
|
|
| 10 |
import gc
|
| 11 |
import warnings
|
| 12 |
|
| 13 |
+
try:
|
| 14 |
+
from transformers.models.llama import modeling_llama as _modeling_llama
|
| 15 |
+
if not hasattr(_modeling_llama, "LlamaFlashAttention2") and hasattr(_modeling_llama, "LlamaAttention"):
|
| 16 |
+
_modeling_llama.LlamaFlashAttention2 = _modeling_llama.LlamaAttention
|
| 17 |
+
except Exception:
|
| 18 |
+
pass
|
| 19 |
+
|
| 20 |
# Suppress annoying warnings
|
| 21 |
warnings.filterwarnings("ignore", message="The parameters have been moved from the Blocks constructor to the launch()")
|
| 22 |
warnings.filterwarnings("ignore", message="CUDA is not available or torch_xla is imported")
|
app_hf.py
CHANGED
|
@@ -20,6 +20,13 @@ import fitz # PyMuPDF
|
|
| 20 |
import io
|
| 21 |
import gc
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
# Suppress annoying warnings
|
| 24 |
warnings.filterwarnings("ignore", message="The parameters have been moved from the Blocks constructor to the launch()")
|
| 25 |
warnings.filterwarnings("ignore", message="CUDA is not available or torch_xla is imported")
|
|
@@ -50,6 +57,7 @@ class ModelManager:
|
|
| 50 |
model_name,
|
| 51 |
trust_remote_code=True,
|
| 52 |
use_safetensors=True,
|
|
|
|
| 53 |
torch_dtype=dtype
|
| 54 |
)
|
| 55 |
model.eval()
|
|
|
|
| 20 |
import io
|
| 21 |
import gc
|
| 22 |
|
| 23 |
+
try:
|
| 24 |
+
from transformers.models.llama import modeling_llama as _modeling_llama
|
| 25 |
+
if not hasattr(_modeling_llama, "LlamaFlashAttention2") and hasattr(_modeling_llama, "LlamaAttention"):
|
| 26 |
+
_modeling_llama.LlamaFlashAttention2 = _modeling_llama.LlamaAttention
|
| 27 |
+
except Exception:
|
| 28 |
+
pass
|
| 29 |
+
|
| 30 |
# Suppress annoying warnings
|
| 31 |
warnings.filterwarnings("ignore", message="The parameters have been moved from the Blocks constructor to the launch()")
|
| 32 |
warnings.filterwarnings("ignore", message="CUDA is not available or torch_xla is imported")
|
|
|
|
| 57 |
model_name,
|
| 58 |
trust_remote_code=True,
|
| 59 |
use_safetensors=True,
|
| 60 |
+
attn_implementation="eager",
|
| 61 |
torch_dtype=dtype
|
| 62 |
)
|
| 63 |
model.eval()
|