Nirupam Biswas committed on
Commit
f583b83
·
1 Parent(s): 0cfe2a2

Handle LlamaFlashAttention2 import for compatibility with newer transformers versions

Browse files
Files changed (1) hide show
  1. modeling_deepseekv2.py +12 -5
modeling_deepseekv2.py CHANGED
@@ -34,10 +34,17 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
34
  from transformers.activations import ACT2FN
35
  from transformers.cache_utils import Cache, DynamicCache
36
  from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
37
- from transformers.models.llama.modeling_llama import (
38
- LlamaAttention,
39
- LlamaFlashAttention2
40
- )
 
 
 
 
 
 
 
41
  from transformers.modeling_outputs import (
42
  BaseModelOutputWithPast,
43
  CausalLMOutputWithPast,
@@ -1235,7 +1242,7 @@ ATTENTION_CLASSES = {
1235
  "mla_flash_attention_2": DeepseekV2FlashAttention2,
1236
 
1237
  "mha_eager": LlamaAttention,
1238
- "mha_flash_attention_2": LlamaFlashAttention2
1239
  }
1240
 
1241
 
 
34
  from transformers.activations import ACT2FN
35
  from transformers.cache_utils import Cache, DynamicCache
36
  from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
37
+ # Handle different transformers versions
38
+ try:
39
+ from transformers.models.llama.modeling_llama import (
40
+ LlamaAttention,
41
+ LlamaFlashAttention2
42
+ )
43
+ except ImportError:
44
+ # Newer transformers versions (4.47+) don't have LlamaFlashAttention2
45
+ from transformers.models.llama.modeling_llama import LlamaAttention
46
+ LlamaFlashAttention2 = None # Will use fallback
47
+
48
  from transformers.modeling_outputs import (
49
  BaseModelOutputWithPast,
50
  CausalLMOutputWithPast,
 
1242
  "mla_flash_attention_2": DeepseekV2FlashAttention2,
1243
 
1244
  "mha_eager": LlamaAttention,
1245
+ "mha_flash_attention_2": LlamaFlashAttention2 if LlamaFlashAttention2 is not None else LlamaAttention
1246
  }
1247
 
1248