Spaces:

cweigendev
/

videoanalyzer

Paused

cweigendev committed on Aug 6

Commit

91d9b86

verified ·

1 Parent(s): 3becf32

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -16,13 +16,26 @@ def load_model():
         print("This may take several minutes on first load...")
         # Load model with correct parameters based on official documentation
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            trust_remote_code=True,
-            device_map="auto",
-            torch_dtype=torch.bfloat16,  # Changed from float16 to bfloat16
-            attn_implementation="flash_attention_2",  # Added for better performance
-        )
         # Load processor (not tokenizer)
         processor = AutoProcessor.from_pretrained(

         print("This may take several minutes on first load...")
         # Load model with correct parameters based on official documentation
+        # Try with flash attention first, fall back to standard attention
+        try:
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                device_map="auto",
+                torch_dtype=torch.bfloat16,  # Changed from float16 to bfloat16
+                attn_implementation="flash_attention_2",  # Added for better performance
+            )
+            print("Loaded with flash attention")
+        except Exception as flash_error:
+            print(f"Flash attention failed: {flash_error}")
+            print("Falling back to standard attention...")
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                device_map="auto",
+                torch_dtype=torch.bfloat16,
+            )
+            print("Loaded with standard attention")
         # Load processor (not tokenizer)
         processor = AutoProcessor.from_pretrained(