sbompolas committed on
Commit
b55a3fa
·
verified ·
1 Parent(s): e02050f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -17
app.py CHANGED
@@ -28,12 +28,12 @@ class OptimizedWhisperApp:
28
  "openai/whisper-tiny",
29
  "openai/whisper-base",
30
  "openai/whisper-small",
31
- "openai/whisper-medium", # Often the sweet spot
32
  "openai/whisper-large-v2",
33
  "openai/whisper-large-v3",
34
  "distil-whisper/distil-medium.en",
35
  "distil-whisper/distil-large-v2",
36
- "ilsp/whisper_greek_dialect_of_lesbos" # Your specialized model
37
  ]
38
 
39
  def create_pipe(self, model_name, use_flash_attention=True):
@@ -54,13 +54,13 @@ class OptimizedWhisperApp:
54
  attn_implementation = "flash_attention_2"
55
  logger.info("Using Flash Attention 2")
56
  else:
57
- attn_implementation = "sdpa" # Scaled Dot Product Attention
58
  if use_flash_attention and not FLASH_ATTN_AVAILABLE:
59
  logger.info("Flash Attention requested but not available, using SDPA")
60
  else:
61
  logger.info(f"Using {attn_implementation}")
62
 
63
- # Load model directly (like the successful space)
64
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
65
  model_name,
66
  torch_dtype=torch_dtype,
@@ -74,7 +74,7 @@ class OptimizedWhisperApp:
74
  # Load processor
75
  processor = AutoProcessor.from_pretrained(model_name)
76
 
77
- # Create pipeline manually (like the successful space)
78
  pipe = pipeline(
79
  "automatic-speech-recognition",
80
  model=model,
@@ -133,7 +133,7 @@ class OptimizedWhisperApp:
133
  logger.info(f"Settings: {model_name}, {language}, {task}")
134
  logger.info(f"Chunk length: {chunk_length_s}s, Batch size: {batch_size}")
135
 
136
- # Prepare generation kwargs (like the successful space)
137
  generate_kwargs = {}
138
 
139
  # Only set language if not auto-detection and model supports multilingual
@@ -156,7 +156,7 @@ class OptimizedWhisperApp:
156
  generate_kwargs["task"] = task
157
  logger.info(f"Set task: {task}")
158
 
159
- # Transcribe (like the successful space approach)
160
  logger.info("Starting transcription...")
161
  outputs = self.pipe(
162
  audio_file,
@@ -227,7 +227,7 @@ class OptimizedWhisperApp:
227
  output += f"Device: {device}\n"
228
  output += f"Data type: {dtype}\n"
229
 
230
- output += f"Flash Attention 2 available: {FLASH_ATTN_AVAILABLE and is_flash_attn_2_available()}\n"
231
 
232
  output += "\n=== OPTIMIZATIONS ===\n"
233
  output += "• Direct model loading (not pipeline abstraction)\n"
@@ -277,7 +277,7 @@ def create_interface():
277
 
278
  Uses the same optimizations as high-performing Whisper spaces:
279
  - Direct model loading for better control
280
- - Flash Attention 2 support
281
  - Optimized chunking and batching
282
  - Conservative parameter handling
283
  """
@@ -296,13 +296,7 @@ def create_interface():
296
  # Audio input
297
  audio_input = gr.Audio(
298
  label="🎵 Upload Audio File",
299
- type="filepath",
300
- waveform_options=gr.WaveformOptions(
301
- waveform_color="#01C6FF",
302
- waveform_progress_color="#0066B4",
303
- skip_length=2,
304
- show_controls=True,
305
- )
306
  )
307
 
308
  # Model selection
@@ -415,7 +409,7 @@ def create_interface():
415
  **General recommendations:**
416
  - **Medium model** often provides the best balance
417
  - **30-second chunks** work well for most audio
418
- - **Flash Attention** speeds up processing significantly
419
  - **Automatic language detection** usually works well
420
 
421
  ### ⚡ Performance Tips
 
28
  "openai/whisper-tiny",
29
  "openai/whisper-base",
30
  "openai/whisper-small",
31
+ "openai/whisper-medium",
32
  "openai/whisper-large-v2",
33
  "openai/whisper-large-v3",
34
  "distil-whisper/distil-medium.en",
35
  "distil-whisper/distil-large-v2",
36
+ "ilsp/whisper_greek_dialect_of_lesbos"
37
  ]
38
 
39
  def create_pipe(self, model_name, use_flash_attention=True):
 
54
  attn_implementation = "flash_attention_2"
55
  logger.info("Using Flash Attention 2")
56
  else:
57
+ attn_implementation = "sdpa"
58
  if use_flash_attention and not FLASH_ATTN_AVAILABLE:
59
  logger.info("Flash Attention requested but not available, using SDPA")
60
  else:
61
  logger.info(f"Using {attn_implementation}")
62
 
63
+ # Load model directly
64
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
65
  model_name,
66
  torch_dtype=torch_dtype,
 
74
  # Load processor
75
  processor = AutoProcessor.from_pretrained(model_name)
76
 
77
+ # Create pipeline manually
78
  pipe = pipeline(
79
  "automatic-speech-recognition",
80
  model=model,
 
133
  logger.info(f"Settings: {model_name}, {language}, {task}")
134
  logger.info(f"Chunk length: {chunk_length_s}s, Batch size: {batch_size}")
135
 
136
+ # Prepare generation kwargs
137
  generate_kwargs = {}
138
 
139
  # Only set language if not auto-detection and model supports multilingual
 
156
  generate_kwargs["task"] = task
157
  logger.info(f"Set task: {task}")
158
 
159
+ # Transcribe
160
  logger.info("Starting transcription...")
161
  outputs = self.pipe(
162
  audio_file,
 
227
  output += f"Device: {device}\n"
228
  output += f"Data type: {dtype}\n"
229
 
230
+ output += f"Flash Attention 2 available: {FLASH_ATTN_AVAILABLE and is_flash_attn_2_available()}\n"
231
 
232
  output += "\n=== OPTIMIZATIONS ===\n"
233
  output += "• Direct model loading (not pipeline abstraction)\n"
 
277
 
278
  Uses the same optimizations as high-performing Whisper spaces:
279
  - Direct model loading for better control
280
+ - Flash Attention 2 support (when available)
281
  - Optimized chunking and batching
282
  - Conservative parameter handling
283
  """
 
296
  # Audio input
297
  audio_input = gr.Audio(
298
  label="🎵 Upload Audio File",
299
+ type="filepath"
 
 
 
 
 
 
300
  )
301
 
302
  # Model selection
 
409
  **General recommendations:**
410
  - **Medium model** often provides the best balance
411
  - **30-second chunks** work well for most audio
412
+ - **Flash Attention** speeds up processing significantly (when available)
413
  - **Automatic language detection** usually works well
414
 
415
  ### ⚡ Performance Tips