michaeltangz committed on
Commit
078e579
·
1 Parent(s): 8f2a46b

refactor app.py to move flash attention installation into a try-except block; ensure proper fallback to sdpa if installation fails

Browse files
Files changed (1) hide show
  1. app.py +18 -7
app.py CHANGED
@@ -8,19 +8,30 @@ import scipy.io.wavfile
8
  import time
9
  import numpy as np
10
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
11
- import subprocess
12
- subprocess.run(
13
- "pip install flash-attn --no-build-isolation",
14
- env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
15
- shell=True,
16
- )
17
 
18
  device = "cuda" if torch.cuda.is_available() else "cpu"
19
  torch_dtype = torch.float16
20
  MODEL_NAME = "openai/whisper-large-v3-turbo"
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
23
- MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2"
 
 
 
 
24
  )
25
  model.to(device)
26
 
 
8
  import time
9
  import numpy as np
10
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
 
 
 
 
 
 
11
 
12
  device = "cuda" if torch.cuda.is_available() else "cpu"
13
  torch_dtype = torch.float16
14
  MODEL_NAME = "openai/whisper-large-v3-turbo"
15
 
16
+ # Try to use flash attention, fall back to sdpa if not available
17
+ try:
18
+ import subprocess
19
+ subprocess.run(
20
+ "pip install flash-attn --no-build-isolation",
21
+ env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
22
+ shell=True,
23
+ )
24
+ from flash_attn import flash_attn_func
25
+ attn_implementation = "flash_attention_2"
26
+ except Exception:
27
+ attn_implementation = "sdpa" # Use PyTorch's scaled dot product attention
28
+
29
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
30
+ MODEL_NAME,
31
+ torch_dtype=torch_dtype,
32
+ low_cpu_mem_usage=True,
33
+ use_safetensors=True,
34
+ attn_implementation=attn_implementation
35
  )
36
  model.to(device)
37