pruthvi423
/

speech-translation-models

+"""
+Enhanced Speech-to-Speech Translation Pipeline with Advanced Gradio Interface
+This script implements a complete pipeline for speech-to-speech translation with
+dynamic model selection and advanced configuration options.
+Features:
+- Dynamic Whisper model switching (tiny, base, small, medium)
+- NLLB model selection (600M, 1.3B)
+- Advanced translation parameters (beam size, temperature, etc.)
+- Real-time processing with detailed model information
+- Comprehensive model descriptions and performance metrics
+Requirements:
+- faster-whisper
+- ctranslate2
+- transformers (version 4.33.0+)
+- torch
+- numpy
+- scipy
+- requests (for fallback tokenizer)
+- gradio
+"""
+import os
+import time
+import torch
+import numpy as np
+import ctranslate2
+import scipy.io.wavfile
+from faster_whisper import WhisperModel
+import gradio as gr
+import re
+from pathlib import Path
+from typing import Dict, Optional, Tuple, Generator
+# Intended to silence RuntimeWarnings related to numpy binary incompatibility.
+# NOTE(review): this runs after the imports above, and PYTHONWARNINGS is
+# presumably only consulted at interpreter startup -- confirm that setting it
+# here has the intended effect in the current process.
+os.environ["PYTHONWARNINGS"] = "ignore::RuntimeWarning"
+class EnhancedS2SPipeline:
+    """
+    Enhanced Speech-to-Speech Translation Pipeline with dynamic model loading
+    """
+    def __init__(self, device="cuda"):
+        """
+        Initialize the pipeline with dynamic model loading capability
+        Args:
+            device: Device to run inference on ('cuda' or 'cpu')
+        """
+        self.device = device if torch.cuda.is_available() else "cpu"
+        self.compute_type = "float16" if self.device == "cuda" else "int8"
+        # Model caches
+        self.whisper_models: Dict[str, WhisperModel] = {}
+        self.nllb_models: Dict[str, ctranslate2.Translator] = {}
+        self.nllb_tokenizer = None
+        self.tts_models = {}
+        self.tts_tokenizers = {}
+        # Model configurations - Updated for HuggingFace Spaces
+        self.model_configs = {
+            "whisper": {
+                "tiny": {"size": "39 MB", "speed": "Very Fast", "accuracy": "Good", "multilingual": True},
+                "base": {"size": "74 MB", "speed": "Fast", "accuracy": "Better", "multilingual": True},
+                "small": {"size": "244 MB", "speed": "Medium", "accuracy": "Good", "multilingual": True},
+                "medium": {"size": "769 MB", "speed": "Slow", "accuracy": "Very Good", "multilingual": True}
+            },
+            "nllb": {
+                "600M": {
+                    "path": "./models/nllb-200-distilled-600M-ct2-int8",
+                    "size": "600M parameters",
+                    "speed": "Fast",
+                    "accuracy": "Good",
+                    "languages": "200+ languages"
+                },
+                "1.3B": {
+                    "path": "./models/nllb-200-distilled-1.3B-ct2-int8",
+                    "size": "1.3B parameters",
+                    "speed": "Medium",
+                    "accuracy": "Better",
+                    "languages": "200+ languages"
+                }
+            }
+        }
+        # Language code mappings for NLLB
+        self.lang_codes = {
+            "English": "eng_Latn",  # English
+            "French": "fra_Latn",   # French
+        }
+        # TTS language mapping
+        self.tts_lang_codes = {
+            "English": "eng",
+            "French": "fra"
+        }
+        print(f"Enhanced Speech-to-Speech pipeline initialized on {self.device}")
+        # Initialize TTS models (these are relatively small, so we can load them upfront)
+        self._initialize_tts_models()
+        # Initialize tokenizer
+        self._initialize_nllb_tokenizer()
+    def _initialize_tts_models(self):
+        """Initialize TTS models for all supported languages"""
+        print("Loading MMS-TTS models for English and French...")
+        try:
+            from transformers.models.vits.modeling_vits import VitsModel
+            from transformers.models.vits.tokenization_vits import VitsTokenizer
+            # Load English TTS model
+            print("Loading English TTS model...")
+            self.tts_models["English"] = VitsModel.from_pretrained(
+                "facebook/mms-tts-eng",
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
+            ).to(self.device)
+            self.tts_tokenizers["English"] = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
+            # Load French TTS model
+            print("Loading French TTS model...")
+            self.tts_models["French"] = VitsModel.from_pretrained(
+                "facebook/mms-tts-fra",
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
+            ).to(self.device)
+            self.tts_tokenizers["French"] = VitsTokenizer.from_pretrained("facebook/mms-tts-fra")
+            print("TTS models loaded successfully.")
+        except Exception as e:
+            print(f"Error loading TTS models: {e}")
+            print("TTS functionality may be limited.")
+    def _initialize_nllb_tokenizer(self):
+        """Initialize NLLB tokenizer with fallback"""
+        try:
+            print("Loading NLLB tokenizer...")
+            from transformers.models.nllb.tokenization_nllb import NllbTokenizer
+            self.nllb_tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
+            print("NLLB tokenizer loaded successfully.")
+        except Exception as e:
+            print(f"Error loading NLLB tokenizer: {e}")
+            print("Implementing simplified fallback tokenizer...")
+            self.nllb_tokenizer = self._create_fallback_tokenizer()
+    def _create_fallback_tokenizer(self):
+        """Create a simplified fallback tokenizer for NLLB"""
+        import json
+        import requests
+        class SimplifiedNllbTokenizer:
+            def __init__(self):
+                self.src_lang = "eng_Latn"
+                cache_dir = Path.home() / ".cache" / "simplified_nllb_tokenizer"
+                cache_dir.mkdir(parents=True, exist_ok=True)
+                vocab_file = cache_dir / "vocab.json"
+                if not vocab_file.exists():
+                    print("Downloading NLLB vocabulary for fallback tokenizer...")
+                    url = "https://huggingface.co/facebook/nllb-200-distilled-600M/resolve/main/vocab.json"
+                    try:
+                        response = requests.get(url)
+                        response.raise_for_status()
+                        with open(vocab_file, 'wb') as f:
+                            f.write(response.content)
+                        print("Vocabulary downloaded successfully.")
+                    except requests.exceptions.RequestException as req_e:
+                        print(f"Failed to download vocabulary: {req_e}")
+                        with open(vocab_file, 'w') as f:
+                            json.dump({"[PAD]": 0, "[UNK]": 1}, f)
+                with open(vocab_file, 'r', encoding='utf-8') as f:
+                    self.vocab = json.load(f)
+                self.id_to_token = {v: k for k, v in self.vocab.items()}
+            def tokenize(self, text):
+                text = text.lower()
+                tokens = re.findall(r'\w+|[^\w\s]', text)
+                return tokens
+            def convert_tokens_to_ids(self, tokens):
+                return [self.vocab.get(token, self.vocab.get("[UNK]", 1)) for token in tokens]
+            def convert_ids_to_tokens(self, ids):
+                return [self.id_to_token.get(id, "[UNK]") for id in ids]
+            def decode(self, token_ids, skip_special_tokens=True):
+                tokens = [self.id_to_token.get(id, "[UNK]") for id in token_ids]
+                if skip_special_tokens:
+                    tokens = [t for t in tokens if not t.startswith("[") and not t.endswith("]")]
+                return " ".join(tokens)
+            def __call__(self, text, return_tensors=None, padding=False):
+                tokens = self.tokenize(text)
+                input_ids = self.convert_tokens_to_ids(tokens)
+                if return_tensors == "pt":
+                    import torch
+                    return {"input_ids": torch.tensor([input_ids])}
+                else:
+                    return {"input_ids": [input_ids]}
+        return SimplifiedNllbTokenizer()
+    def get_whisper_model(self, model_size: str) -> WhisperModel:
+        """Get or load Whisper model"""
+        if model_size not in self.whisper_models:
+            print(f"Loading Whisper model '{model_size}'...")
+            # Try to load from local models directory first
+            model_path = f"./models/whisper/{model_size}.pt"
+            if os.path.exists(model_path):
+                print(f"Loading Whisper model from local path: {model_path}")
+                self.whisper_models[model_size] = WhisperModel(
+                    model_path,
+                    device=self.device,
+                    compute_type=self.compute_type
+                )
+            else:
+                # Fallback to HuggingFace Hub
+                print(f"Loading Whisper model from HuggingFace Hub: {model_size}")
+                self.whisper_models[model_size] = WhisperModel(
+                    model_size,
+                    device=self.device,
+                    compute_type=self.compute_type
+                )
+            print(f"Whisper '{model_size}' loaded successfully.")
+        return self.whisper_models[model_size]
+    def get_nllb_model(self, model_size: str) -> ctranslate2.Translator:
+        """Get or load NLLB model"""
+        if model_size not in self.nllb_models:
+            model_path = self.model_configs["nllb"][model_size]["path"]
+            print(f"Loading NLLB model '{model_size}' from {model_path}...")
+            try:
+                self.nllb_models[model_size] = ctranslate2.Translator(
+                    model_path,
+                    device=self.device,
+                    compute_type=self.compute_type
+                )
+                print(f"NLLB '{model_size}' loaded successfully.")
+            except RuntimeError as e:
+                print(f"ERROR: Failed to load NLLB model from '{model_path}'.")
+                print(f"Please ensure the path is correct and contains model files.")
+                raise
+        return self.nllb_models[model_size]
+    def transcribe_realtime(self, audio_file, source_lang=None, whisper_model="tiny",
+                          vad_filter=False, beam_size=5, temperature=0.0):
+        """Enhanced transcription with configurable parameters"""
+        print(f"\n1. Transcribing with Whisper-{whisper_model}...")
+        start_time = time.time()
+        # Get Whisper model
+        whisper = self.get_whisper_model(whisper_model)
+        # Determine language code for Whisper
+        whisper_lang = None
+        if source_lang:
+            whisper_lang = "en" if source_lang == "English" else "fr" if source_lang == "French" else None
+        full_transcript = ""
+        # Configure transcription parameters
+        transcribe_params = {
+            "language": whisper_lang,
+            "beam_size": beam_size,
+            "vad_filter": vad_filter,
+            "word_timestamps": False
+        }
+        if temperature > 0:
+            transcribe_params["temperature"] = temperature
+        segments_generator, info = whisper.transcribe(audio_file, **transcribe_params)
+        yield "", info.language if info else None
+        for segment in segments_generator:
+            full_transcript += segment.text + " "
+            yield full_transcript.strip(), info.language if info else None
+        elapsed_time = time.time() - start_time
+        print(f"Transcription completed in {elapsed_time:.2f}s with {whisper_model}")
+        print(f"Detected language: {info.language} (confidence: {info.language_probability:.4f})")
+        yield full_transcript.strip(), info.language if info else None
+    def translate_realtime(self, text_to_translate, source_lang, target_lang,
+                         nllb_model="600M", beam_size=4, length_penalty=1.0,
+                         repetition_penalty=1.0):
+        """Enhanced translation with configurable parameters"""
+        print(f"\n2. Translating with NLLB-{nllb_model}...")
+        start_time = time.time()
+        # Get NLLB model
+        translator = self.get_nllb_model(nllb_model)
+        src_lang_nllb = self.lang_codes.get(source_lang)
+        tgt_lang_nllb = self.lang_codes.get(target_lang)
+        if not src_lang_nllb or not tgt_lang_nllb:
+            raise ValueError(f"Unsupported language pair: {source_lang} -> {target_lang}")
+        self.nllb_tokenizer.src_lang = src_lang_nllb
+        # Split into sentences
+        sentences = re.findall(r'[^.!?]+[.!?]', text_to_translate + ('.' if not text_to_translate.endswith(('.', '!', '?')) else ''))
+        if not sentences:
+            sentences = [text_to_translate]
+        full_translation = ""
+        for i, sentence in enumerate(sentences):
+            if not sentence.strip():
+                continue
+            try:
+                tokenizer_output = self.nllb_tokenizer(sentence, return_tensors="pt", padding=True)
+                source_tokens = tokenizer_output["input_ids"].tolist()[0]
+                source_tokens_as_str = self.nllb_tokenizer.convert_ids_to_tokens(source_tokens)
+                target_prefix = [tgt_lang_nllb]
+                # Use configured parameters
+                result = translator.translate_batch(
+                    [source_tokens_as_str],
+                    target_prefix=[target_prefix],
+                    beam_size=beam_size,
+                    length_penalty=length_penalty,
+                    repetition_penalty=repetition_penalty,
+                    max_batch_size=32
+                )[0]
+                tgt_tokens = result.hypotheses[0][1:] if len(result.hypotheses[0]) > 1 else result.hypotheses[0]
+                chunk_translation = self.nllb_tokenizer.decode(
+                    self.nllb_tokenizer.convert_tokens_to_ids(tgt_tokens),
+                    skip_special_tokens=True
+                )
+                full_translation += chunk_translation + " "
+                yield full_translation.strip()
+            except Exception as e:
+                print(f"Error translating sentence {i+1}: {e}")
+                error_msg = f"[Translation error for segment {i+1}] "
+                full_translation += error_msg
+                yield full_translation.strip()
+        elapsed_time = time.time() - start_time
+        print(f"Translation completed in {elapsed_time:.2f}s with NLLB-{nllb_model}")
+        yield full_translation.strip()
+    def synthesize(self, text, target_lang, output_file="output.wav", speaking_rate=1.0):
+        """Enhanced synthesis with speaking rate control"""
+        print(f"\n3. Synthesizing speech in {target_lang}...")
+        start_time = time.time()
+        if target_lang not in self.tts_models:
+            raise ValueError(f"TTS for language {target_lang} not supported")
+        model = self.tts_models[target_lang]
+        tokenizer = self.tts_tokenizers[target_lang]
+        # Process text in chunks
+        MAX_LENGTH = 200
+        sentences = re.findall(r'[^.!?]+[.!?]', text + ('.' if not text.endswith(('.', '!', '?')) else ''))
+        sentences = [s.strip() for s in sentences if s.strip()]
+        current_chunk = ""
+        text_chunks = []
+        for sentence in sentences:
+            if len(current_chunk) + len(sentence) + 1 <= MAX_LENGTH:
+                current_chunk += (" " if current_chunk else "") + sentence
+            else:
+                if current_chunk:
+                    text_chunks.append(current_chunk)
+                current_chunk = sentence
+        if current_chunk:
+            text_chunks.append(current_chunk)
+        if not text_chunks:
+            text_chunks = [text]
+        print(f"Text split into {len(text_chunks)} chunks for TTS")
+        all_audio = []
+        for i, chunk in enumerate(text_chunks):
+            try:
+                inputs = tokenizer(text=chunk, return_tensors="pt")
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                torch.manual_seed(555 + i)
+                with torch.no_grad():
+                    output = model(**inputs).waveform
+                chunk_audio = output.squeeze().cpu().float().numpy()
+                # Apply speaking rate adjustment
+                if speaking_rate != 1.0:
+                    from scipy.signal import resample
+                    new_length = int(len(chunk_audio) / speaking_rate)
+                    chunk_audio = resample(chunk_audio, new_length)
+                all_audio.append(chunk_audio)
+            except Exception as e:
+                print(f"Error generating speech for chunk {i+1}: {e}")
+        # Combine audio chunks
+        if all_audio:
+            try:
+                audio_data = np.concatenate(all_audio)
+            except Exception as e:
+                print(f"Error concatenating audio: {e}")
+                audio_data = all_audio[0] if all_audio else np.zeros(16000, dtype=np.float32)
+        else:
+            audio_data = np.zeros(16000, dtype=np.float32)
+        # Ensure float32 format
+        if audio_data.dtype != np.float32:
+            audio_data = audio_data.astype(np.float32)
+        # Normalize and convert
+        if np.max(np.abs(audio_data)) > 0:
+            audio_data = audio_data / np.max(np.abs(audio_data))
+        audio_data_int16 = (audio_data * 32767).astype(np.int16)
+        # Save to file
+        sampling_rate = model.config.sampling_rate
+        scipy.io.wavfile.write(output_file, rate=sampling_rate, data=audio_data_int16)
+        elapsed_time = time.time() - start_time
+        audio_duration = len(audio_data) / sampling_rate
+        print(f"Speech synthesis completed in {elapsed_time:.2f}s")
+        print(f"Generated {audio_duration:.2f}s of audio (RTF: {elapsed_time/audio_duration:.2f}x)")
+        return output_file, audio_duration
+    def process_speech_to_speech_realtime(self, audio_file, source_lang, target_lang,
+                                        whisper_model="tiny", nllb_model="600M",
+                                        whisper_beam_size=5, whisper_temperature=0.0,
+                                        vad_filter=False, nllb_beam_size=4,
+                                        length_penalty=1.0, repetition_penalty=1.0,
+                                        speaking_rate=1.0, output_file=None):
+        """Complete pipeline with all configurable parameters"""
+        if output_file is None:
+            output_file = f"output_{source_lang}_to_{target_lang}_{int(time.time())}.wav"
+        print(f"\n===== ENHANCED SPEECH-TO-SPEECH TRANSLATION =====")
+        print(f"Models: Whisper-{whisper_model}, NLLB-{nllb_model}")
+        print(f"Languages: {source_lang} -> {target_lang}")
+        total_start_time = time.time()
+        current_transcript = ""
+        current_translation = ""
+        detected_lang = None
+        output_path = None
+        audio_duration = 0
+        success = False
+        try:
+            # Step 1: Transcribe
+            yield "🎤 Transcribing audio...", "", "", None
+            for partial_transcript, lang in self.transcribe_realtime(
+                audio_file, source_lang, whisper_model, vad_filter,
+                whisper_beam_size, whisper_temperature
+            ):
+                current_transcript = partial_transcript
+                detected_lang = lang
+                yield "🎤 Transcribing audio...", current_transcript, current_translation, None
+            # Step 2: Translate
+            yield "🔄 Translating text...", current_transcript, current_translation, None
+            for partial_translation in self.translate_realtime(
+                current_transcript, source_lang, target_lang, nllb_model,
+                nllb_beam_size, length_penalty, repetition_penalty
+            ):
+                current_translation = partial_translation
+                yield "🔄 Translating text...", current_transcript, current_translation, None
+            # Step 3: Synthesize
+            yield "🔊 Synthesizing speech...", current_transcript, current_translation, None
+            output_path, audio_duration = self.synthesize(
+                current_translation, target_lang, output_file, speaking_rate
+            )
+            success = True
+        except Exception as e:
+            print(f"ERROR in pipeline: {e}")
+            import traceback
+            traceback.print_exc()
+            success = False
+            current_transcript = "❌ Transcription failed"
+            current_translation = "❌ Translation failed"
+            output_path = None
+        total_elapsed_time = time.time() - total_start_time
+        if success:
+            status = (f"✅ Success! Total time: {total_elapsed_time:.2f}s, "
+                     f"Audio: {audio_duration:.2f}s")
+        else:
+            status = "❌ Processing failed"
+        print(f"\n===== TRANSLATION {'COMPLETED' if success else 'FAILED'} =====")
+        yield status, current_transcript, current_translation, output_path
+def create_enhanced_gradio_interface():
+    """Build the Gradio Blocks UI for the speech-to-speech pipeline.
+    A single ``EnhancedS2SPipeline`` is created up front and shared by all
+    callbacks. The layout has an input/configuration column, a results column,
+    an optional examples row and three collapsible help sections.
+    Returns:
+        The ``gr.Blocks`` application; the caller is responsible for launching it.
+    """
+    # Initialize pipeline (shared by every callback defined below)
+    pipeline = EnhancedS2SPipeline()
+    def get_model_info(model_type, model_name):
+        """Return a one-line Markdown summary for a configured model.
+        ``model_type`` is a top-level key of ``pipeline.model_configs``
+        ("whisper" or "nllb") and ``model_name`` a key beneath it.
+        """
+        config = pipeline.model_configs[model_type][model_name]
+        if model_type == "whisper":
+            return f"**{model_name.upper()}** - Size: {config['size']}, Speed: {config['speed']}, Accuracy: {config['accuracy']}"
+        else:
+            return f"**{model_name}** - {config['size']}, Speed: {config['speed']}, Accuracy: {config['accuracy']}"
+    def process_audio_enhanced(audio_file, source_lang_str, target_lang_str,
+                             whisper_model, nllb_model, whisper_beam_size,
+                             whisper_temperature, vad_filter, nllb_beam_size,
+                             length_penalty, repetition_penalty, speaking_rate):
+        """Generator callback for the Translate button.
+        Yields ``(status, transcript, translation, output_audio)`` tuples, in
+        the same order as the ``outputs`` list wired to the button below.
+        """
+        # Nothing uploaded or recorded: report it and stop without running the pipeline
+        if audio_file is None:
+            yield "❌ No audio provided", "No transcript available", "No translation available", None
+            return
+        # Relay every progress update from the pipeline unchanged
+        for status, transcript, translation, output_audio in pipeline.process_speech_to_speech_realtime(
+            audio_file=audio_file,
+            source_lang=source_lang_str,
+            target_lang=target_lang_str,
+            whisper_model=whisper_model,
+            nllb_model=nllb_model,
+            whisper_beam_size=whisper_beam_size,
+            whisper_temperature=whisper_temperature,
+            vad_filter=vad_filter,
+            nllb_beam_size=nllb_beam_size,
+            length_penalty=length_penalty,
+            repetition_penalty=repetition_penalty,
+            speaking_rate=speaking_rate
+        ):
+            yield status, transcript, translation, output_audio
+    # Create the interface
+    with gr.Blocks(title="Enhanced Speech-to-Speech Translation", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# 🎙️ Enhanced Speech-to-Speech Translation")
+        gr.Markdown("Advanced AI-powered speech translation with configurable models and parameters.")
+        with gr.Row():
+            # Left column: audio input, language choice, model choice, advanced settings
+            with gr.Column(scale=1):
+                gr.Markdown("### 📥 Input Configuration")
+                # type="filepath": the callback receives a path to the audio file
+                audio_input = gr.Audio(
+                    sources=["microphone", "upload"],
+                    type="filepath",
+                    label="🎵 Upload or Record Audio"
+                )
+                with gr.Row():
+                    source_lang = gr.Radio(
+                        choices=["English", "French"],
+                        value="English",
+                        label="📢 Source Language"
+                    )
+                    target_lang = gr.Radio(
+                        choices=["English", "French"],
+                        value="French",
+                        label="🎯 Target Language"
+                    )
+                gr.Markdown("### 🧠 Model Selection")
+                with gr.Accordion("🎤 Whisper ASR Model", open=True):
+                    whisper_model = gr.Radio(
+                        choices=["tiny", "base", "small", "medium"],
+                        value="tiny",
+                        label="Model Size"
+                    )
+                    # Summary line; refreshed by the change handler wired further down
+                    whisper_info = gr.Markdown(get_model_info("whisper", "tiny"))
+                with gr.Accordion("🔄 NLLB Translation Model", open=True):
+                    nllb_model = gr.Radio(
+                        choices=["600M", "1.3B"],
+                        value="600M",
+                        label="Model Size"
+                    )
+                    # Summary line; refreshed by the change handler wired further down
+                    nllb_info = gr.Markdown(get_model_info("nllb", "600M"))
+                # Slider defaults mirror the pipeline's keyword defaults
+                with gr.Accordion("⚙️ Advanced Settings", open=False):
+                    gr.Markdown("**Whisper Parameters**")
+                    whisper_beam_size = gr.Slider(1, 10, value=5, step=1, label="Beam Size")
+                    whisper_temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature")
+                    vad_filter = gr.Checkbox(label="Voice Activity Detection", value=False)
+                    gr.Markdown("**Translation Parameters**")
+                    nllb_beam_size = gr.Slider(1, 8, value=4, step=1, label="Beam Size")
+                    length_penalty = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Length Penalty")
+                    repetition_penalty = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Repetition Penalty")
+                    gr.Markdown("**Speech Synthesis**")
+                    speaking_rate = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speaking Rate")
+                process_btn = gr.Button("🚀 Translate", variant="primary", size="lg")
+            # Right column: status line plus text and audio results
+            with gr.Column(scale=1):
+                gr.Markdown("### 📤 Results")
+                status_output = gr.Textbox(label="📊 Status", interactive=False)
+                with gr.Tabs():
+                    with gr.TabItem("📝 Text Results"):
+                        transcript_output = gr.Textbox(
+                            label="🎤 Original Transcript",
+                            lines=6,
+                            interactive=False
+                        )
+                        translation_output = gr.Textbox(
+                            label="🔄 Translation",
+                            lines=6,
+                            interactive=False
+                        )
+                    with gr.TabItem("🔊 Audio Output"):
+                        audio_output = gr.Audio(
+                            type="filepath",
+                            label="🔊 Translated Speech"
+                        )
+        # Example section
+        with gr.Row():
+            gr.Markdown("### 🎵 Try Our Examples")
+            with gr.Row():
+                # The examples list is empty when the ./examples directory is missing.
+                # NOTE(review): only the directory is checked, not the individual
+                # audio files -- confirm they are shipped together.
+                gr.Examples(
+                    examples=[
+                        ["./examples/input_audio/eng1.wav", "English", "French", "tiny", "600M"],
+                        ["./examples/input_audio/fr1.wav", "French", "English", "tiny", "600M"],
+                        ["./examples/input_audio/eng2.wav", "English", "French", "base", "600M"]
+                    ] if os.path.exists("./examples") else [],
+                    inputs=[audio_input, source_lang, target_lang, whisper_model, nllb_model],
+                    label="Sample Audio Files"
+                )
+        # Model info update functions
+        def update_whisper_info(model):
+            """Return the Markdown summary for the selected Whisper size."""
+            return get_model_info("whisper", model)
+        def update_nllb_info(model):
+            """Return the Markdown summary for the selected NLLB size."""
+            return get_model_info("nllb", model)
+        # Connect update functions
+        whisper_model.change(update_whisper_info, whisper_model, whisper_info)
+        nllb_model.change(update_nllb_info, nllb_model, nllb_info)
+        # Main processing function; the inputs order must match the parameter
+        # order of process_audio_enhanced
+        process_btn.click(
+            fn=process_audio_enhanced,
+            inputs=[
+                audio_input, source_lang, target_lang, whisper_model, nllb_model,
+                whisper_beam_size, whisper_temperature, vad_filter,
+                nllb_beam_size, length_penalty, repetition_penalty, speaking_rate
+            ],
+            outputs=[status_output, transcript_output, translation_output, audio_output]
+        )
+        # Information sections
+        with gr.Accordion("📚 Model Information", open=False):
+            gr.Markdown("""
+            ### 🎤 Whisper Models (OpenAI)
+            - **Tiny**: Fastest, smallest model. Good for quick transcription.
+            - **Base**: Balanced speed and accuracy. Recommended for most use cases.
+            - **Small**: Better accuracy, moderate speed. Good for important content.
+            - **Medium**: High accuracy, slower processing. Professional applications.
+            ### 🔄 NLLB Models (Meta)
+            - **600M**: Faster translation with good quality. Supports 200+ languages.
+            - **1.3B**: Better translation quality with more parameters. Higher accuracy.
+            ### 🔊 MMS-TTS (Meta)
+            - High-quality multilingual text-to-speech synthesis
+            - Supports natural-sounding voice generation
+            - Optimized for English and French
+            """)
+        with gr.Accordion("⚙️ Parameter Guide", open=False):
+            gr.Markdown("""
+            ### Whisper Parameters
+            - **Beam Size**: Higher values = better accuracy, slower processing (1-10)
+            - **Temperature**: Higher values = more diverse outputs (0.0-1.0)
+            - **VAD Filter**: Removes silence automatically (may require additional dependencies)
+            ### Translation Parameters
+            - **Beam Size**: Search breadth for translation (1-8)
+            - **Length Penalty**: Controls output length preference (0.5-2.0)
+            - **Repetition Penalty**: Reduces repetitive translations (0.5-2.0)
+            ### Speech Synthesis
+            - **Speaking Rate**: Playback speed multiplier (0.5-2.0)
+            """)
+        with gr.Accordion("🔧 Usage Instructions", open=False):
+            gr.Markdown("""
+            1. **Upload/Record**: Add your audio file or record directly
+            2. **Select Languages**: Choose source and target languages
+            3. **Choose Models**: Select model sizes based on your speed/quality needs
+            4. **Adjust Settings**: Fine-tune advanced parameters if needed
+            5. **Translate**: Click the translate button and watch real-time progress
+            6. **Download**: Save the translated audio file
+            **Tips:**
+            - Use smaller models for faster processing
+            - Use larger models for better quality
+            - Adjust beam sizes for quality vs speed trade-off
+            - Speaking rate can make output faster or slower
+            """)
+    return demo
+# Launch the application: build the Gradio UI and start serving it, but only
+# when this file is executed as a script (not when it is imported).
+if __name__ == "__main__":
+    demo = create_enhanced_gradio_interface()
+    demo.launch()