Spaces:

sbompolas
/

Lesbian-dialect-ASR

Sleeping

App Files Community

sbompolas committed on Jun 28, 2025

Commit

b55a3fa

verified ·

1 Parent(s): e02050f

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -17

app.py CHANGED Viewed

@@ -28,12 +28,12 @@ class OptimizedWhisperApp:
             "openai/whisper-tiny",
             "openai/whisper-base",
             "openai/whisper-small",
-            "openai/whisper-medium",  # Often the sweet spot
             "openai/whisper-large-v2",
             "openai/whisper-large-v3",
             "distil-whisper/distil-medium.en",
             "distil-whisper/distil-large-v2",
-            "ilsp/whisper_greek_dialect_of_lesbos"  # Your specialized model
         ]
     def create_pipe(self, model_name, use_flash_attention=True):
@@ -54,13 +54,13 @@ class OptimizedWhisperApp:
                 attn_implementation = "flash_attention_2"
                 logger.info("Using Flash Attention 2")
             else:
-                attn_implementation = "sdpa"  # Scaled Dot Product Attention
                 if use_flash_attention and not FLASH_ATTN_AVAILABLE:
                     logger.info("Flash Attention requested but not available, using SDPA")
                 else:
                     logger.info(f"Using {attn_implementation}")
-            # Load model directly (like the successful space)
             model = AutoModelForSpeechSeq2Seq.from_pretrained(
                 model_name,
                 torch_dtype=torch_dtype,
@@ -74,7 +74,7 @@ class OptimizedWhisperApp:
             # Load processor
             processor = AutoProcessor.from_pretrained(model_name)
-            # Create pipeline manually (like the successful space)
             pipe = pipeline(
                 "automatic-speech-recognition",
                 model=model,
@@ -133,7 +133,7 @@ class OptimizedWhisperApp:
             logger.info(f"Settings: {model_name}, {language}, {task}")
             logger.info(f"Chunk length: {chunk_length_s}s, Batch size: {batch_size}")
-            # Prepare generation kwargs (like the successful space)
             generate_kwargs = {}
             # Only set language if not auto-detection and model supports multilingual
@@ -156,7 +156,7 @@ class OptimizedWhisperApp:
                 generate_kwargs["task"] = task
                 logger.info(f"Set task: {task}")
-            # Transcribe (like the successful space approach)
             logger.info("Starting transcription...")
             outputs = self.pipe(
                 audio_file,
@@ -227,7 +227,7 @@ class OptimizedWhisperApp:
             output += f"Device: {device}\n"
             output += f"Data type: {dtype}\n"
-                        output += f"Flash Attention 2 available: {FLASH_ATTN_AVAILABLE and is_flash_attn_2_available()}\n"
         output += "\n=== OPTIMIZATIONS ===\n"
         output += "• Direct model loading (not pipeline abstraction)\n"
@@ -277,7 +277,7 @@ def create_interface():
             Uses the same optimizations as high-performing Whisper spaces:
             - Direct model loading for better control
-            - Flash Attention 2 support
             - Optimized chunking and batching
             - Conservative parameter handling
             """
@@ -296,13 +296,7 @@ def create_interface():
                 # Audio input
                 audio_input = gr.Audio(
                     label="🎵 Upload Audio File",
-                    type="filepath",
-                    waveform_options=gr.WaveformOptions(
-                        waveform_color="#01C6FF",
-                        waveform_progress_color="#0066B4",
-                        skip_length=2,
-                        show_controls=True,
-                    )
                 )
                 # Model selection
@@ -415,7 +409,7 @@ def create_interface():
             **General recommendations:**
             - **Medium model** often provides the best balance
             - **30-second chunks** work well for most audio
-            - **Flash Attention** speeds up processing significantly
             - **Automatic language detection** usually works well
             ### ⚡ Performance Tips

             "openai/whisper-tiny",
             "openai/whisper-base",
             "openai/whisper-small",
+            "openai/whisper-medium",
             "openai/whisper-large-v2",
             "openai/whisper-large-v3",
             "distil-whisper/distil-medium.en",
             "distil-whisper/distil-large-v2",
+            "ilsp/whisper_greek_dialect_of_lesbos"
         ]
     def create_pipe(self, model_name, use_flash_attention=True):
                 attn_implementation = "flash_attention_2"
                 logger.info("Using Flash Attention 2")
             else:
+                attn_implementation = "sdpa"
                 if use_flash_attention and not FLASH_ATTN_AVAILABLE:
                     logger.info("Flash Attention requested but not available, using SDPA")
                 else:
                     logger.info(f"Using {attn_implementation}")
+            # Load model directly
             model = AutoModelForSpeechSeq2Seq.from_pretrained(
                 model_name,
                 torch_dtype=torch_dtype,
             # Load processor
             processor = AutoProcessor.from_pretrained(model_name)
+            # Create pipeline manually
             pipe = pipeline(
                 "automatic-speech-recognition",
                 model=model,
             logger.info(f"Settings: {model_name}, {language}, {task}")
             logger.info(f"Chunk length: {chunk_length_s}s, Batch size: {batch_size}")
+            # Prepare generation kwargs
             generate_kwargs = {}
             # Only set language if not auto-detection and model supports multilingual
                 generate_kwargs["task"] = task
                 logger.info(f"Set task: {task}")
+            # Transcribe
             logger.info("Starting transcription...")
             outputs = self.pipe(
                 audio_file,
             output += f"Device: {device}\n"
             output += f"Data type: {dtype}\n"
+        output += f"Flash Attention 2 available: {FLASH_ATTN_AVAILABLE and is_flash_attn_2_available()}\n"
         output += "\n=== OPTIMIZATIONS ===\n"
         output += "• Direct model loading (not pipeline abstraction)\n"
             Uses the same optimizations as high-performing Whisper spaces:
             - Direct model loading for better control
+            - Flash Attention 2 support (when available)
             - Optimized chunking and batching
             - Conservative parameter handling
             """
                 # Audio input
                 audio_input = gr.Audio(
                     label="🎵 Upload Audio File",
+                    type="filepath"
                 )
                 # Model selection
             **General recommendations:**
             - **Medium model** often provides the best balance
             - **30-second chunks** work well for most audio
+            - **Flash Attention** speeds up processing significantly (when available)
             - **Automatic language detection** usually works well
             ### ⚡ Performance Tips