Spaces:

sbompolas
/

Lesbian-dialect-ASR

Sleeping

App Files Files Community

sbompolas committed on Jun 28, 2025

Commit

75153af

verified ·

1 Parent(s): 7b908d7

Create app.py

Browse files

Files changed (1) hide show

app.py +422 -0

app.py ADDED Viewed

	@@ -0,0 +1,422 @@

+import gradio as gr
+import torch
+import logging
+import gc
+import time
+from pathlib import Path
+from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
+from transformers.utils import is_flash_attn_2_available
+import librosa
# Set up logging
# Configure the root logger at INFO level so the progress messages logged
# while loading models and transcribing are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the class and functions defined below.
logger = logging.getLogger(__name__)
class OptimizedWhisperApp:
    """Lazily loads a Whisper speech-recognition pipeline and transcribes audio.

    At most one pipeline is kept at a time. It is (re)built by ``load_model``
    whenever the requested model name or the effective attention
    implementation differs from what is currently loaded.

    Attributes:
        pipe: The active ASR pipeline, or ``None`` when nothing is loaded.
        current_model: Name of the model backing ``pipe`` (``None`` if unloaded).
        current_attn_implementation: Attention implementation the active
            pipeline was built with (``None`` if unloaded).
        available_models: Model identifiers offered in the UI.
    """

    # Directory used for downloaded model and processor files.
    _CACHE_DIR = "./cache"

    def __init__(self):
        self.pipe = None
        self.current_model = None
        self.current_attn_implementation = None
        self.available_models = [
            "openai/whisper-tiny",
            "openai/whisper-base",
            "openai/whisper-small",
            "openai/whisper-medium",  # Often the sweet spot
            "openai/whisper-large-v2",
            "openai/whisper-large-v3",
            "distil-whisper/distil-medium.en",
            "distil-whisper/distil-large-v2",
            "ilsp/whisper_greek_dialect_of_lesbos",  # Your specialized model
        ]

    @staticmethod
    def _resolve_attn_implementation(use_flash_attention):
        """Return the attention implementation that will actually be used.

        Flash Attention 2 is chosen only when it was requested, the package is
        available and CUDA is present; otherwise scaled dot product attention
        (``"sdpa"``) is used.
        """
        if use_flash_attention and is_flash_attn_2_available() and torch.cuda.is_available():
            return "flash_attention_2"
        return "sdpa"

    @staticmethod
    def _format_seconds(value):
        """Render a timestamp in seconds as ``"12.3s"``, or ``"?"`` when it is ``None``."""
        return "?" if value is None else f"{value:.1f}s"

    def create_pipe(self, model_name, use_flash_attention=True):
        """Build an automatic-speech-recognition pipeline for ``model_name``.

        Args:
            model_name: Hugging Face model identifier to load.
            use_flash_attention: Request Flash Attention 2; silently falls back
                to ``"sdpa"`` when it cannot be used.

        Returns:
            The constructed pipeline, or ``None`` if anything failed (the
            failure is logged with its traceback).
        """
        try:
            # Device selection: half precision on GPU, full precision on CPU
            if torch.cuda.is_available():
                device, torch_dtype = "cuda:0", torch.float16
            else:
                device, torch_dtype = "cpu", torch.float32
            logger.info("Loading %s on %s with %s", model_name, device, torch_dtype)

            # Attention implementation
            attn_implementation = self._resolve_attn_implementation(use_flash_attention)
            if attn_implementation == "flash_attention_2":
                logger.info("Using Flash Attention 2")
            else:
                logger.info("Using %s", attn_implementation)

            # Load model directly, then wrap it in a pipeline manually
            model = AutoModelForSpeechSeq2Seq.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                low_cpu_mem_usage=True,
                use_safetensors=True,
                attn_implementation=attn_implementation,
                cache_dir=self._CACHE_DIR,
            )
            model.to(device)

            # Keep processor files in the same cache directory as the model
            processor = AutoProcessor.from_pretrained(model_name, cache_dir=self._CACHE_DIR)

            pipe = pipeline(
                "automatic-speech-recognition",
                model=model,
                tokenizer=processor.tokenizer,
                feature_extractor=processor.feature_extractor,
                torch_dtype=torch_dtype,
                device=device,
            )
            logger.info("Pipeline created successfully!")
            return pipe
        except Exception as e:
            # Boundary handler: callers rely on a None return, not an exception
            logger.exception("Failed to create pipeline: %s", e)
            return None

    def load_model(self, model_name, use_flash_attention=True):
        """Ensure the pipeline for ``model_name`` is loaded.

        The pipeline is rebuilt when no pipeline exists, when the model name
        changed, or when the effective attention implementation changed (so
        toggling Flash Attention takes effect).

        Returns:
            ``True`` if a usable pipeline is available afterwards, else ``False``.
        """
        attn_implementation = self._resolve_attn_implementation(use_flash_attention)
        if (
            self.pipe is not None
            and self.current_model == model_name
            and self.current_attn_implementation == attn_implementation
        ):
            logger.info("Model already loaded")
            return True

        logger.info("Loading new model: %s", model_name)
        # Clear previous model. Collect garbage first so the old tensors are
        # really released before asking CUDA to drop its cached blocks.
        if self.pipe is not None:
            self.pipe = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        # Create new pipeline
        self.pipe = self.create_pipe(model_name, use_flash_attention)
        loaded = self.pipe is not None
        self.current_model = model_name if loaded else None
        self.current_attn_implementation = attn_implementation if loaded else None
        return loaded

    def transcribe_audio(self, audio_file, model_name="openai/whisper-medium",
                        language="Automatic Detection", task="transcribe",
                        chunk_length_s=30, batch_size=16, use_flash_attention=True,
                        return_timestamps=True):
        """Transcribe ``audio_file`` with the requested model and settings.

        Args:
            audio_file: Audio input handed to the pipeline; ``None`` means
                nothing was uploaded.
            model_name: Model identifier; loaded on demand.
            language: ``"Automatic Detection"`` or a language name. Ignored for
                models whose name ends in ``.en``.
            task: Generation task, forwarded unless the model name ends in ``.en``.
            chunk_length_s: Chunk length in seconds passed to the pipeline.
            batch_size: Batch size passed to the pipeline.
            use_flash_attention: Request Flash Attention 2 when loading.
            return_timestamps: Ask the pipeline for per-chunk timestamps.

        Returns:
            A ``(transcription, timestamp_text, detailed_output)`` tuple of
            strings. On failure the first element carries the error message.
        """
        if audio_file is None:
            return "Please upload an audio file", "", ""
        try:
            # The reported time includes any model loading triggered here
            start_time = time.perf_counter()

            # Load model if needed
            if not self.load_model(model_name, use_flash_attention):
                return "Failed to load model", "", ""

            logger.info("Processing: %s", audio_file)
            logger.info("Settings: %s, %s, %s", model_name, language, task)
            logger.info("Chunk length: %ss, Batch size: %s", chunk_length_s, batch_size)

            # Language and task are only forwarded to multilingual models
            generate_kwargs = {}
            if not model_name.endswith(".en"):
                if language != "Automatic Detection":
                    lang_code = language.lower()
                    generate_kwargs["language"] = lang_code
                    logger.info("Set language: %s", lang_code)
                generate_kwargs["task"] = task
                logger.info("Set task: %s", task)

            logger.info("Starting transcription...")
            outputs = self.pipe(
                audio_file,
                chunk_length_s=chunk_length_s,
                batch_size=batch_size,
                generate_kwargs=generate_kwargs,
                return_timestamps=return_timestamps,
            )
            transcription_time = time.perf_counter() - start_time
            logger.info("Transcription completed in %.2f seconds", transcription_time)

            # Extract results
            transcription = outputs.get("text", "")
            chunks = outputs.get("chunks", [])
            timestamp_text = self._format_timestamps(chunks) if chunks else ""

            detailed_output = self._format_detailed_output(
                transcription, model_name, language, task,
                transcription_time, chunk_length_s, batch_size,
                use_flash_attention, len(chunks)
            )
            return transcription.strip(), timestamp_text, detailed_output
        except Exception as e:
            # Boundary handler: surface the error in the UI instead of crashing
            error_msg = f"Transcription error: {str(e)}"
            logger.exception(error_msg)
            return error_msg, "", error_msg

    def _format_timestamps(self, chunks):
        """Render chunk timestamps as one ``[start - end]: text`` line per chunk.

        A chunk without a ``timestamp`` key is shown as ``0.0s - 0.0s``. A
        start or end that is ``None`` (an undetermined boundary) is shown as
        ``?`` instead of raising a formatting error.
        """
        lines = ["=== TIMESTAMPS ==="]
        for chunk in chunks:
            start, end = chunk.get('timestamp', (0, 0)) or (None, None)
            text = chunk.get('text', '')
            lines.append(
                f"[{self._format_seconds(start)} - {self._format_seconds(end)}]: {text}"
            )
        return "\n".join(lines) + "\n"

    def _format_detailed_output(self, transcription, model_name, language, task,
                               transcription_time, chunk_length_s, batch_size,
                               use_flash_attention, num_chunks):
        """Build the multi-section report shown under "Detailed Information".

        ``use_flash_attention`` reflects what was requested; when a pipeline is
        loaded, the device, dtype and effective attention implementation are
        reported as well.
        """
        lines = [
            "=== TRANSCRIPTION ===",
            f"{transcription}",
            "",
            "=== MODEL INFORMATION ===",
            f"Model: {model_name}",
            f"Language: {language}",
            f"Task: {task}",
            f"Processing time: {transcription_time:.2f} seconds",
            f"Chunks processed: {num_chunks}",
            "",
            "=== PROCESSING SETTINGS ===",
            f"Chunk length: {chunk_length_s} seconds",
            f"Batch size: {batch_size}",
            f"Flash Attention: {'Enabled' if use_flash_attention else 'Disabled'}",
        ]
        if self.pipe is not None:
            # Device and dtype are read from the first model parameter
            first_param = next(self.pipe.model.parameters())
            lines.append(f"Device: {first_param.device}")
            lines.append(f"Data type: {first_param.dtype}")
            if self.current_attn_implementation is not None:
                lines.append(f"Attention implementation: {self.current_attn_implementation}")
        lines.extend([
            f"Flash Attention 2 available: {is_flash_attn_2_available()}",
            "",
            "=== OPTIMIZATIONS ===",
            "• Direct model loading (not pipeline abstraction)",
            "• Manual pipeline construction",
            "• Optimized attention mechanism",
            "• Batch processing",
            "• Conservative language handling",
            "• Proper memory management",
        ])
        return "\n".join(lines) + "\n"

    def get_model_info(self):
        """Return a one-line status string describing the loaded model, if any."""
        if self.pipe is None:
            return "No model loaded"
        first_param = next(self.pipe.model.parameters())
        return f"✅ {self.current_model} loaded on {first_param.device} ({first_param.dtype})"
# Initialize the app
# One shared instance is created at import time. Constructing it does not load
# any model (its pipeline starts as None); a model is loaded when a
# transcription is requested.
logger.info("Initializing Optimized Whisper App...")
whisper_app = OptimizedWhisperApp()
def transcribe_wrapper(audio, model_name, language, task, chunk_length_s,
                      batch_size, use_flash_attention, return_timestamps):
    """Gradio callback: forward the form values to the shared app instance.

    Returns the ``(transcription, timestamps, details)`` tuple produced by
    ``whisper_app.transcribe_audio`` unchanged.
    """
    result = whisper_app.transcribe_audio(
        audio_file=audio,
        model_name=model_name,
        language=language,
        task=task,
        chunk_length_s=chunk_length_s,
        batch_size=batch_size,
        use_flash_attention=use_flash_attention,
        return_timestamps=return_timestamps,
    )
    return result
def get_model_status():
    """Return the status line of the shared app instance's loaded model."""
    status = whisper_app.get_model_info()
    return status
# Create the interface
def create_interface():
    """Build and return the Gradio Blocks UI for the transcription app.

    The layout has an input column (audio upload, model/language/task
    selection, advanced settings, transcribe button) and an output column
    (transcription, timestamps, detailed information). The status textbox is
    refreshed after every transcription so it reflects the model that was
    actually loaded.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(title="Optimized Whisper Transcription", theme=gr.themes.Soft()) as interface:
        gr.Markdown(
            """
            # 🚀 Optimized Whisper Transcription
            **High-Performance Speech-to-Text Based on Successful Implementation**
            Uses the same optimizations as high-performing Whisper spaces:
            - Direct model loading for better control
            - Flash Attention 2 support
            - Optimized chunking and batching
            - Conservative parameter handling
            """
        )
        # Model status
        model_status = gr.Textbox(
            value=get_model_status(),
            label="🔧 Current Model Status",
            interactive=False
        )
        # Main interface
        with gr.Row():
            with gr.Column():
                # Audio input
                audio_input = gr.Audio(
                    label="🎵 Upload Audio File",
                    type="filepath",
                    waveform_options=gr.WaveformOptions(
                        waveform_color="#01C6FF",
                        waveform_progress_color="#0066B4",
                        skip_length=2,
                        show_controls=True,
                    )
                )
                # Model selection
                model_dropdown = gr.Dropdown(
                    choices=whisper_app.available_models,
                    value="openai/whisper-medium",
                    label="Model",
                    info="Medium often works best for real-world usage"
                )
                # Basic settings
                with gr.Row():
                    language_dropdown = gr.Dropdown(
                        choices=["Automatic Detection", "Greek", "English", "Spanish", "French", "German", "Italian"],
                        value="Automatic Detection",
                        label="Language"
                    )
                    task_dropdown = gr.Dropdown(
                        choices=["transcribe", "translate"],
                        value="transcribe",
                        label="Task"
                    )
                # Advanced settings
                with gr.Accordion("Advanced Settings", open=False):
                    chunk_length_s = gr.Slider(
                        minimum=10,
                        maximum=60,
                        value=30,
                        step=5,
                        label="Chunk Length (seconds)",
                        info="30s is optimal for most cases"
                    )
                    batch_size = gr.Slider(
                        minimum=1,
                        maximum=32,
                        value=16,
                        step=1,
                        label="Batch Size",
                        info="Higher = faster, more memory"
                    )
                    use_flash_attention = gr.Checkbox(
                        label="Flash Attention 2",
                        value=True,
                        info="Faster processing (requires compatible GPU)"
                    )
                    return_timestamps = gr.Checkbox(
                        label="Return Timestamps",
                        value=True
                    )
                transcribe_btn = gr.Button(
                    "🚀 Transcribe",
                    variant="primary",
                    size="lg"
                )
            with gr.Column():
                # Results
                transcription_output = gr.Textbox(
                    label="Transcription",
                    lines=8,
                    show_copy_button=True
                )
                with gr.Accordion("Timestamps", open=False):
                    timestamps_output = gr.Textbox(
                        label="Timestamp Information",
                        lines=10,
                        show_copy_button=True
                    )
                with gr.Accordion("Detailed Information", open=False):
                    detailed_output = gr.Textbox(
                        label="Processing Details & Model Info",
                        lines=15,
                        show_copy_button=True
                    )
        # Event handlers
        transcribe_event = transcribe_btn.click(
            fn=transcribe_wrapper,
            inputs=[audio_input, model_dropdown, language_dropdown, task_dropdown,
                   chunk_length_s, batch_size, use_flash_attention, return_timestamps],
            outputs=[transcription_output, timestamps_output, detailed_output],
            show_progress=True
        )
        # Refresh the status box once the transcription (and any model load it
        # triggered) has finished; otherwise it would keep its stale text.
        transcribe_event.then(
            fn=get_model_status,
            outputs=[model_status]
        )
        # Update model status when model changes
        model_dropdown.change(
            fn=lambda: "Model will be loaded on next transcription",
            outputs=[model_status]
        )
        # Footer
        gr.Markdown(
            """
            ### 🎯 Model Recommendations
            **For Greek dialect of Lesbos:**
            - `ilsp/whisper_greek_dialect_of_lesbos` - Specialized but may have issues
            - `openai/whisper-medium` - Often better for real-world usage
            - `openai/whisper-large-v2` - More accurate but slower
            **General recommendations:**
            - **Medium model** often provides the best balance
            - **30-second chunks** work well for most audio
            - **Flash Attention** speeds up processing significantly
            - **Automatic language detection** usually works well
            ### ⚡ Performance Tips
            - GPU with Flash Attention 2 = Fastest
            - Batch size 16-24 optimal for most GPUs
            - Lower chunk length for very noisy audio
            - Use English-only models (.en) for English-only content
            """
        )
    return interface
# Entry point when run as a script: build the UI and serve it with a
# public share link requested.
if __name__ == "__main__":
    create_interface().launch(share=True)