| | """ |
| | Enhanced Speech-to-Speech Translation Pipeline with Advanced Gradio Interface |
| | |
| | This script implements a complete pipeline for speech-to-speech translation with |
| | dynamic model selection and advanced configuration options. |
| | |
| | Features: |
| | - Dynamic Whisper model switching (tiny, base, small, medium) |
| | - NLLB model selection (600M, 1.3B) |
| | - Advanced translation parameters (beam size, temperature, etc.) |
| | - Real-time processing with detailed model information |
| | - Comprehensive model descriptions and performance metrics |
| | |
| | Requirements: |
| | - faster-whisper |
| | - ctranslate2 |
| | - transformers (version 4.33.0+) |
| | - torch |
| | - numpy |
| | - scipy |
| | - requests (for fallback tokenizer) |
| | - gradio |
| | """ |
| |
|
| | import os |
| | import time |
| | import torch |
| | import numpy as np |
| | import ctranslate2 |
| | import scipy.io.wavfile |
| | from faster_whisper import WhisperModel |
| | import gradio as gr |
| | import re |
| | from pathlib import Path |
| | from typing import Dict, Optional, Tuple, Generator |
| |
|
| | |
# PYTHONWARNINGS is only consulted when an interpreter starts, so exporting it
# here can affect child processes but not the process that is already running.
# The explicit filter below is what silences RuntimeWarning in this process.
import warnings

os.environ["PYTHONWARNINGS"] = "ignore::RuntimeWarning"
warnings.filterwarnings("ignore", category=RuntimeWarning)
| |
|
class EnhancedS2SPipeline:
    """Speech-to-speech translation pipeline with dynamic model loading.

    The pipeline chains three stages: transcription with a faster-whisper
    ``WhisperModel``, translation with an NLLB model run through CTranslate2,
    and speech synthesis with MMS-TTS (VITS) models. Whisper and NLLB models
    are loaded on first use and cached per size; the TTS models and the NLLB
    tokenizer are loaded by the constructor.

    NOTE(review): several status strings below start with glyphs that look
    like mis-encoded emoji. They are reproduced as found; restore them from
    the original file if possible.
    """

    # Sentence splitter shared by the translation and synthesis stages.
    _SENTENCE_PATTERN = re.compile(r'[^.!?]+[.!?]')
    _SENTENCE_ENDINGS = ('.', '!', '?')

    # UI language name -> language code passed to Whisper.
    _WHISPER_LANG_CODES = {"English": "en", "French": "fr"}

    # Maximum number of characters handed to the TTS model in one chunk.
    _TTS_MAX_CHARS = 200

    def __init__(self, device="cuda"):
        """Initialize the pipeline with dynamic model loading capability.

        Args:
            device: Device to run inference on ('cuda' or 'cpu'). Falls back
                to 'cpu' whenever CUDA is not available.
        """
        self.device = device if torch.cuda.is_available() else "cpu"
        self.compute_type = "float16" if self.device == "cuda" else "int8"

        # Lazily populated model caches, keyed by model size / language name.
        self.whisper_models: Dict[str, WhisperModel] = {}
        self.nllb_models: Dict[str, ctranslate2.Translator] = {}
        self.nllb_tokenizer = None
        self.tts_models = {}
        self.tts_tokenizers = {}

        # Descriptive metadata shown in the UI, plus the NLLB model paths.
        self.model_configs = {
            "whisper": {
                "tiny": {"size": "39 MB", "speed": "Very Fast", "accuracy": "Good", "multilingual": True},
                "base": {"size": "74 MB", "speed": "Fast", "accuracy": "Better", "multilingual": True},
                "small": {"size": "244 MB", "speed": "Medium", "accuracy": "Good", "multilingual": True},
                "medium": {"size": "769 MB", "speed": "Slow", "accuracy": "Very Good", "multilingual": True},
            },
            "nllb": {
                "600M": {
                    "path": "./models/nllb-200-distilled-600M-ct2-int8",
                    "size": "600M parameters",
                    "speed": "Fast",
                    "accuracy": "Good",
                    "languages": "200+ languages",
                },
                "1.3B": {
                    "path": "./models/nllb-200-distilled-1.3B-ct2-int8",
                    "size": "1.3B parameters",
                    "speed": "Medium",
                    "accuracy": "Better",
                    "languages": "200+ languages",
                },
            },
        }

        # UI language name -> NLLB language code.
        self.lang_codes = {
            "English": "eng_Latn",
            "French": "fra_Latn",
        }

        # UI language name -> suffix of the "facebook/mms-tts-*" checkpoint.
        self.tts_lang_codes = {
            "English": "eng",
            "French": "fra",
        }

        print(f"Enhanced Speech-to-Speech pipeline initialized on {self.device}")

        self._initialize_tts_models()
        self._initialize_nllb_tokenizer()

    @classmethod
    def _split_sentences(cls, text):
        """Split *text* into pieces that each end in '.', '!' or '?'.

        A '.' is appended first when the text does not already end with
        sentence punctuation, so the final fragment is not lost. Pieces keep
        their leading whitespace; an empty text yields an empty list.
        """
        if not text.endswith(cls._SENTENCE_ENDINGS):
            text += '.'
        return cls._SENTENCE_PATTERN.findall(text)

    @staticmethod
    def _chunk_sentences(sentences, max_length):
        """Greedily join sentences with spaces into chunks of <= max_length chars.

        A single sentence longer than *max_length* is kept whole as its own
        chunk rather than being cut.
        """
        chunks = []
        current = ""
        for sentence in sentences:
            if len(current) + len(sentence) + 1 <= max_length:
                current += (" " if current else "") + sentence
            else:
                if current:
                    chunks.append(current)
                current = sentence
        if current:
            chunks.append(current)
        return chunks

    def _initialize_tts_models(self):
        """Load the MMS-TTS model and tokenizer for every supported language.

        Loading is best effort: any failure is printed and the languages not
        yet loaded stay unavailable (``synthesize`` then raises ValueError
        for them).
        """
        print("Loading MMS-TTS models for English and French...")
        dtype = torch.float16 if self.device == "cuda" else torch.float32

        try:
            from transformers.models.vits.modeling_vits import VitsModel
            from transformers.models.vits.tokenization_vits import VitsTokenizer

            for language, code in self.tts_lang_codes.items():
                checkpoint = f"facebook/mms-tts-{code}"
                print(f"Loading {language} TTS model...")
                model = VitsModel.from_pretrained(
                    checkpoint, torch_dtype=dtype
                ).to(self.device)
                tokenizer = VitsTokenizer.from_pretrained(checkpoint)
                # Register both together so a language never ends up with a
                # model but no tokenizer.
                self.tts_models[language] = model
                self.tts_tokenizers[language] = tokenizer

            print("TTS models loaded successfully.")

        except Exception as e:
            print(f"Error loading TTS models: {e}")
            print("TTS functionality may be limited.")

    def _initialize_nllb_tokenizer(self):
        """Load the NLLB tokenizer, falling back to a simplified one on error."""
        try:
            print("Loading NLLB tokenizer...")
            from transformers.models.nllb.tokenization_nllb import NllbTokenizer
            self.nllb_tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
            print("NLLB tokenizer loaded successfully.")
        except Exception as e:
            print(f"Error loading NLLB tokenizer: {e}")
            print("Implementing simplified fallback tokenizer...")
            self.nllb_tokenizer = self._create_fallback_tokenizer()

    def _create_fallback_tokenizer(self):
        """Create a simplified word-level stand-in for the NLLB tokenizer.

        The vocabulary is read from a JSON file cached under
        ``~/.cache/simplified_nllb_tokenizer``; when the file is missing it is
        downloaded first. If the download fails, a two-entry placeholder
        vocabulary is used in memory only, so a later run retries the
        download instead of reusing a stub.
        """
        import json
        import requests

        class SimplifiedNllbTokenizer:
            def __init__(self):
                self.src_lang = "eng_Latn"
                cache_dir = Path.home() / ".cache" / "simplified_nllb_tokenizer"
                cache_dir.mkdir(parents=True, exist_ok=True)
                vocab_file = cache_dir / "vocab.json"

                self.vocab = None
                if not vocab_file.exists():
                    print("Downloading NLLB vocabulary for fallback tokenizer...")
                    url = "https://huggingface.co/facebook/nllb-200-distilled-600M/resolve/main/vocab.json"
                    try:
                        # A timeout keeps start-up from hanging on a stalled
                        # connection.
                        response = requests.get(url, timeout=30)
                        response.raise_for_status()
                        vocab_file.write_bytes(response.content)
                        print("Vocabulary downloaded successfully.")
                    except requests.exceptions.RequestException as req_e:
                        print(f"Failed to download vocabulary: {req_e}")
                        self.vocab = {"[PAD]": 0, "[UNK]": 1}

                if self.vocab is None:
                    with open(vocab_file, 'r', encoding='utf-8') as f:
                        self.vocab = json.load(f)
                self.id_to_token = {v: k for k, v in self.vocab.items()}

            def tokenize(self, text):
                # Lower-cased words and single punctuation characters.
                return re.findall(r'\w+|[^\w\s]', text.lower())

            def convert_tokens_to_ids(self, tokens):
                unk_id = self.vocab.get("[UNK]", 1)
                return [self.vocab.get(token, unk_id) for token in tokens]

            def convert_ids_to_tokens(self, ids):
                return [self.id_to_token.get(token_id, "[UNK]") for token_id in ids]

            def decode(self, token_ids, skip_special_tokens=True):
                tokens = self.convert_ids_to_tokens(token_ids)
                if skip_special_tokens:
                    # Special tokens look like "[NAME]"; a lone "[" or "]"
                    # is ordinary punctuation and must be kept.
                    tokens = [
                        t for t in tokens
                        if not (t.startswith("[") and t.endswith("]"))
                    ]
                return " ".join(tokens)

            def __call__(self, text, return_tensors=None, padding=False):
                input_ids = self.convert_tokens_to_ids(self.tokenize(text))
                if return_tensors == "pt":
                    return {"input_ids": torch.tensor([input_ids])}
                return {"input_ids": [input_ids]}

        return SimplifiedNllbTokenizer()

    def get_whisper_model(self, model_size: str) -> "WhisperModel":
        """Return the cached Whisper model for *model_size*, loading it if needed.

        A file at ``./models/whisper/<model_size>.pt`` is preferred when it
        exists; otherwise the size name itself is passed to ``WhisperModel``.

        NOTE(review): confirm that ``WhisperModel`` can load a local ``.pt``
        file. It may expect a converted model directory instead.
        """
        if model_size not in self.whisper_models:
            print(f"Loading Whisper model '{model_size}'...")

            model_path = f"./models/whisper/{model_size}.pt"
            if os.path.exists(model_path):
                print(f"Loading Whisper model from local path: {model_path}")
                model_source = model_path
            else:
                print(f"Loading Whisper model from HuggingFace Hub: {model_size}")
                model_source = model_size

            self.whisper_models[model_size] = WhisperModel(
                model_source,
                device=self.device,
                compute_type=self.compute_type,
            )
            print(f"Whisper '{model_size}' loaded successfully.")
        return self.whisper_models[model_size]

    def get_nllb_model(self, model_size: str) -> "ctranslate2.Translator":
        """Return the cached NLLB translator for *model_size*, loading it if needed.

        Raises:
            KeyError: If *model_size* is not listed in ``model_configs``.
            RuntimeError: Re-raised after printing a hint when the model
                cannot be loaded from its configured path.
        """
        if model_size not in self.nllb_models:
            model_path = self.model_configs["nllb"][model_size]["path"]
            print(f"Loading NLLB model '{model_size}' from {model_path}...")
            try:
                self.nllb_models[model_size] = ctranslate2.Translator(
                    model_path,
                    device=self.device,
                    compute_type=self.compute_type,
                )
                print(f"NLLB '{model_size}' loaded successfully.")
            except RuntimeError:
                print(f"ERROR: Failed to load NLLB model from '{model_path}'.")
                print("Please ensure the path is correct and contains model files.")
                raise
        return self.nllb_models[model_size]

    def transcribe_realtime(self, audio_file, source_lang=None, whisper_model="tiny",
                            vad_filter=False, beam_size=5, temperature=0.0):
        """Transcribe *audio_file*, yielding the growing transcript.

        Args:
            audio_file: Audio input passed straight to ``WhisperModel.transcribe``.
            source_lang: "English" or "French"; any other value (including
                None) lets Whisper detect the language.
            whisper_model: Whisper model size to use.
            vad_filter: Forwarded to Whisper as ``vad_filter``.
            beam_size: Beam size for decoding (converted to int).
            temperature: Forwarded to Whisper only when greater than zero.

        Yields:
            ``(transcript_so_far, detected_language)`` tuples: one with an
            empty transcript before decoding starts, one per segment, and a
            final one with the complete transcript.
        """
        print(f"\n1. Transcribing with Whisper-{whisper_model}...")
        start_time = time.time()

        whisper = self.get_whisper_model(whisper_model)
        whisper_lang = self._WHISPER_LANG_CODES.get(source_lang)

        transcribe_params = {
            "language": whisper_lang,
            # UI sliders may deliver floats; Whisper needs an integer here.
            "beam_size": int(beam_size),
            "vad_filter": vad_filter,
            "word_timestamps": False,
        }
        if temperature > 0:
            transcribe_params["temperature"] = temperature

        segments_generator, info = whisper.transcribe(audio_file, **transcribe_params)
        detected_language = info.language if info else None

        yield "", detected_language

        full_transcript = ""
        for segment in segments_generator:
            full_transcript += segment.text + " "
            yield full_transcript.strip(), detected_language

        elapsed_time = time.time() - start_time
        print(f"Transcription completed in {elapsed_time:.2f}s with {whisper_model}")
        if info:
            print(f"Detected language: {info.language} (confidence: {info.language_probability:.4f})")

        yield full_transcript.strip(), detected_language

    def translate_realtime(self, text_to_translate, source_lang, target_lang,
                           nllb_model="600M", beam_size=4, length_penalty=1.0,
                           repetition_penalty=1.0):
        """Translate text sentence by sentence, yielding the growing translation.

        Args:
            text_to_translate: Source text.
            source_lang: Source language name (a key of ``lang_codes``).
            target_lang: Target language name (a key of ``lang_codes``).
            nllb_model: NLLB model size to use.
            beam_size: Beam size for decoding (converted to int).
            length_penalty: Forwarded to ``translate_batch``.
            repetition_penalty: Forwarded to ``translate_batch``.

        Yields:
            The accumulated translation after each sentence, then once more
            at the end. A sentence that fails to translate contributes a
            "[Translation error for segment N]" marker instead.

        Raises:
            ValueError: If either language is not in ``lang_codes``.
        """
        print(f"\n2. Translating with NLLB-{nllb_model}...")
        start_time = time.time()

        # Validate the cheap inputs before paying for a model load.
        src_lang_nllb = self.lang_codes.get(source_lang)
        tgt_lang_nllb = self.lang_codes.get(target_lang)
        if not src_lang_nllb or not tgt_lang_nllb:
            raise ValueError(f"Unsupported language pair: {source_lang} -> {target_lang}")

        translator = self.get_nllb_model(nllb_model)
        self.nllb_tokenizer.src_lang = src_lang_nllb

        sentences = self._split_sentences(text_to_translate) or [text_to_translate]

        full_translation = ""

        for i, sentence in enumerate(sentences):
            if not sentence.strip():
                continue

            try:
                tokenizer_output = self.nllb_tokenizer(sentence, return_tensors="pt", padding=True)
                source_ids = tokenizer_output["input_ids"].tolist()[0]
                source_tokens = self.nllb_tokenizer.convert_ids_to_tokens(source_ids)

                result = translator.translate_batch(
                    [source_tokens],
                    # The target language code is forced as the first token.
                    target_prefix=[[tgt_lang_nllb]],
                    beam_size=int(beam_size),
                    length_penalty=length_penalty,
                    repetition_penalty=repetition_penalty,
                    max_batch_size=32,
                )[0]

                # Drop the leading language token from the best hypothesis.
                hypothesis = result.hypotheses[0]
                tgt_tokens = hypothesis[1:] if len(hypothesis) > 1 else hypothesis

                chunk_translation = self.nllb_tokenizer.decode(
                    self.nllb_tokenizer.convert_tokens_to_ids(tgt_tokens),
                    skip_special_tokens=True,
                )

                full_translation += chunk_translation + " "
                yield full_translation.strip()

            except Exception as e:
                print(f"Error translating sentence {i+1}: {e}")
                full_translation += f"[Translation error for segment {i+1}] "
                yield full_translation.strip()

        elapsed_time = time.time() - start_time
        print(f"Translation completed in {elapsed_time:.2f}s with NLLB-{nllb_model}")

        yield full_translation.strip()

    def synthesize(self, text, target_lang, output_file="output.wav", speaking_rate=1.0):
        """Synthesize *text* to a 16-bit WAV file.

        The text is split into sentence-aligned chunks, each chunk is
        synthesized separately, and the audio is concatenated, peak
        normalized and written to *output_file*. Chunks that fail are
        skipped; if none succeeds, 16000 zero samples are written instead.

        Args:
            text: Text to speak.
            target_lang: Language name (a key of ``tts_models``).
            output_file: Destination WAV path.
            speaking_rate: Values other than 1.0 resample each chunk to
                ``len / speaking_rate`` samples.
                NOTE(review): resampling at a fixed sample rate changes pitch
                along with speed. Confirm this is acceptable.

        Returns:
            ``(output_file, audio_duration_seconds)``.

        Raises:
            ValueError: If the language has no TTS model or *speaking_rate*
                is not positive.
        """
        print(f"\n3. Synthesizing speech in {target_lang}...")
        start_time = time.time()

        if target_lang not in self.tts_models:
            raise ValueError(f"TTS for language {target_lang} not supported")
        if speaking_rate <= 0:
            raise ValueError(f"speaking_rate must be positive, got {speaking_rate}")

        model = self.tts_models[target_lang]
        tokenizer = self.tts_tokenizers[target_lang]

        sentences = [s.strip() for s in self._split_sentences(text) if s.strip()]
        text_chunks = self._chunk_sentences(sentences, self._TTS_MAX_CHARS) or [text]

        print(f"Text split into {len(text_chunks)} chunks for TTS")

        change_rate = speaking_rate != 1.0
        if change_rate:
            from scipy.signal import resample

        all_audio = []

        for i, chunk in enumerate(text_chunks):
            try:
                inputs = tokenizer(text=chunk, return_tensors="pt")
                inputs = {k: v.to(self.device) for k, v in inputs.items()}

                # Fixed per-chunk seed so repeated runs give the same audio.
                torch.manual_seed(555 + i)

                with torch.no_grad():
                    output = model(**inputs).waveform

                # atleast_1d keeps a degenerate single-sample output
                # concatenable.
                chunk_audio = np.atleast_1d(output.squeeze().cpu().float().numpy())

                if change_rate:
                    new_length = int(len(chunk_audio) / speaking_rate)
                    chunk_audio = resample(chunk_audio, new_length)

                all_audio.append(chunk_audio)

            except Exception as e:
                print(f"Error generating speech for chunk {i+1}: {e}")

        if all_audio:
            audio_data = np.concatenate(all_audio).astype(np.float32, copy=False)
        else:
            audio_data = np.zeros(16000, dtype=np.float32)

        # Peak-normalize (skipped for empty or all-zero audio) and convert
        # to 16-bit PCM.
        peak = np.max(np.abs(audio_data)) if audio_data.size else 0.0
        if peak > 0:
            audio_data = audio_data / peak
        audio_data_int16 = (audio_data * 32767).astype(np.int16)

        sampling_rate = model.config.sampling_rate
        scipy.io.wavfile.write(output_file, rate=sampling_rate, data=audio_data_int16)

        elapsed_time = time.time() - start_time
        audio_duration = len(audio_data) / sampling_rate
        print(f"Speech synthesis completed in {elapsed_time:.2f}s")
        if audio_duration > 0:
            print(f"Generated {audio_duration:.2f}s of audio (RTF: {elapsed_time/audio_duration:.2f}x)")
        else:
            print(f"Generated {audio_duration:.2f}s of audio")

        return output_file, audio_duration

    def process_speech_to_speech_realtime(self, audio_file, source_lang, target_lang,
                                          whisper_model="tiny", nllb_model="600M",
                                          whisper_beam_size=5, whisper_temperature=0.0,
                                          vad_filter=False, nllb_beam_size=4,
                                          length_penalty=1.0, repetition_penalty=1.0,
                                          speaking_rate=1.0, output_file=None):
        """Run transcription, translation and synthesis, streaming progress.

        Yields:
            ``(status, transcript, translation, output_path)`` tuples.
            ``output_path`` is None until the final tuple of a successful
            run. On failure the final tuple keeps whatever transcript and
            translation were already produced; only stages that produced
            nothing get a "failed" placeholder, and the status names the
            stage that raised.
        """
        if output_file is None:
            output_file = f"output_{source_lang}_to_{target_lang}_{int(time.time())}.wav"

        print(f"\n===== ENHANCED SPEECH-TO-SPEECH TRANSLATION =====")
        print(f"Models: Whisper-{whisper_model}, NLLB-{nllb_model}")
        print(f"Languages: {source_lang} -> {target_lang}")

        total_start_time = time.time()

        current_transcript = ""
        current_translation = ""
        output_path = None
        audio_duration = 0
        success = False
        stage = "transcription"

        try:
            yield "π€ Transcribing audio...", "", "", None
            for partial_transcript, _detected_lang in self.transcribe_realtime(
                audio_file, source_lang, whisper_model, vad_filter,
                whisper_beam_size, whisper_temperature
            ):
                current_transcript = partial_transcript
                yield "π€ Transcribing audio...", current_transcript, current_translation, None

            stage = "translation"
            yield "π Translating text...", current_transcript, current_translation, None
            for partial_translation in self.translate_realtime(
                current_transcript, source_lang, target_lang, nllb_model,
                nllb_beam_size, length_penalty, repetition_penalty
            ):
                current_translation = partial_translation
                yield "π Translating text...", current_transcript, current_translation, None

            stage = "synthesis"
            yield "π Synthesizing speech...", current_transcript, current_translation, None
            output_path, audio_duration = self.synthesize(
                current_translation, target_lang, output_file, speaking_rate
            )

            success = True

        except Exception as e:
            print(f"ERROR in pipeline: {e}")
            import traceback
            traceback.print_exc()
            success = False
            output_path = None
            # Keep results from stages that did finish; only fill in
            # placeholders for the ones that produced nothing.
            if not current_transcript:
                current_transcript = "β Transcription failed"
            if not current_translation:
                current_translation = "β Translation failed"

        total_elapsed_time = time.time() - total_start_time

        if success:
            status = (f"β Success! Total time: {total_elapsed_time:.2f}s, "
                      f"Audio: {audio_duration:.2f}s")
        else:
            status = f"β Processing failed during {stage}"

        print(f"\n===== TRANSLATION {'COMPLETED' if success else 'FAILED'} =====")

        yield status, current_transcript, current_translation, output_path
| |
|
def create_enhanced_gradio_interface():
    """Build the Gradio Blocks app for the speech-to-speech pipeline.

    Instantiates one ``EnhancedS2SPipeline`` (which loads its TTS models and
    tokenizer), lays out the input controls, result widgets, examples and
    help sections, wires the events, and returns the ``gr.Blocks`` object
    without launching it.

    NOTE(review): several labels start with glyphs that look like
    mis-encoded emoji. They are reproduced as found; restore them from the
    original file if possible.
    """
    pipeline = EnhancedS2SPipeline()

    def get_model_info(model_type, model_name):
        """Return a one-line Markdown summary of the selected model."""
        entry = pipeline.model_configs[model_type][model_name]
        if model_type == "whisper":
            heading = model_name.upper()
            size_text = f"Size: {entry['size']}"
        else:
            heading = model_name
            size_text = entry['size']
        return f"**{heading}** - {size_text}, Speed: {entry['speed']}, Accuracy: {entry['accuracy']}"

    def process_audio_enhanced(audio_file, source_lang_str, target_lang_str,
                               whisper_model, nllb_model, whisper_beam_size,
                               whisper_temperature, vad_filter, nllb_beam_size,
                               length_penalty, repetition_penalty, speaking_rate):
        """Stream pipeline progress tuples to the four output widgets."""
        if audio_file is None:
            yield "β No audio provided", "No transcript available", "No translation available", None
            return

        yield from pipeline.process_speech_to_speech_realtime(
            audio_file=audio_file,
            source_lang=source_lang_str,
            target_lang=target_lang_str,
            whisper_model=whisper_model,
            nllb_model=nllb_model,
            whisper_beam_size=whisper_beam_size,
            whisper_temperature=whisper_temperature,
            vad_filter=vad_filter,
            nllb_beam_size=nllb_beam_size,
            length_penalty=length_penalty,
            repetition_penalty=repetition_penalty,
            speaking_rate=speaking_rate,
        )

    # Sample rows are offered only when the examples directory is present.
    if os.path.exists("./examples"):
        example_rows = [
            ["./examples/input_audio/eng1.wav", "English", "French", "tiny", "600M"],
            ["./examples/input_audio/fr1.wav", "French", "English", "tiny", "600M"],
            ["./examples/input_audio/eng2.wav", "English", "French", "base", "600M"],
        ]
    else:
        example_rows = []

    with gr.Blocks(title="Enhanced Speech-to-Speech Translation", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# ποΈ Enhanced Speech-to-Speech Translation")
        gr.Markdown("Advanced AI-powered speech translation with configurable models and parameters.")

        with gr.Row():
            # Left column: inputs, model choice, advanced settings.
            with gr.Column(scale=1):
                gr.Markdown("### π₯ Input Configuration")

                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="π΅ Upload or Record Audio",
                )

                with gr.Row():
                    source_lang = gr.Radio(
                        choices=["English", "French"],
                        value="English",
                        label="π’ Source Language",
                    )
                    target_lang = gr.Radio(
                        choices=["English", "French"],
                        value="French",
                        label="π― Target Language",
                    )

                gr.Markdown("### π§ Model Selection")

                with gr.Accordion("π€ Whisper ASR Model", open=True):
                    whisper_model = gr.Radio(
                        choices=["tiny", "base", "small", "medium"],
                        value="tiny",
                        label="Model Size",
                    )
                    whisper_info = gr.Markdown(get_model_info("whisper", "tiny"))

                with gr.Accordion("π NLLB Translation Model", open=True):
                    nllb_model = gr.Radio(
                        choices=["600M", "1.3B"],
                        value="600M",
                        label="Model Size",
                    )
                    nllb_info = gr.Markdown(get_model_info("nllb", "600M"))

                with gr.Accordion("βοΈ Advanced Settings", open=False):
                    gr.Markdown("**Whisper Parameters**")
                    whisper_beam_size = gr.Slider(1, 10, value=5, step=1, label="Beam Size")
                    whisper_temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature")
                    vad_filter = gr.Checkbox(label="Voice Activity Detection", value=False)

                    gr.Markdown("**Translation Parameters**")
                    nllb_beam_size = gr.Slider(1, 8, value=4, step=1, label="Beam Size")
                    length_penalty = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Length Penalty")
                    repetition_penalty = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Repetition Penalty")

                    gr.Markdown("**Speech Synthesis**")
                    speaking_rate = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speaking Rate")

                process_btn = gr.Button("π Translate", variant="primary", size="lg")

            # Right column: status, text results and synthesized audio.
            with gr.Column(scale=1):
                gr.Markdown("### π€ Results")

                status_output = gr.Textbox(label="π Status", interactive=False)

                with gr.Tabs():
                    with gr.TabItem("π Text Results"):
                        transcript_output = gr.Textbox(
                            label="π€ Original Transcript",
                            lines=6,
                            interactive=False,
                        )
                        translation_output = gr.Textbox(
                            label="π Translation",
                            lines=6,
                            interactive=False,
                        )

                    with gr.TabItem("π Audio Output"):
                        audio_output = gr.Audio(
                            type="filepath",
                            label="π Translated Speech",
                        )

        with gr.Row():
            gr.Markdown("### π΅ Try Our Examples")
        with gr.Row():
            gr.Examples(
                examples=example_rows,
                inputs=[audio_input, source_lang, target_lang, whisper_model, nllb_model],
                label="Sample Audio Files",
            )

        # Refresh the model summary whenever a different size is picked.
        whisper_model.change(
            lambda model: get_model_info("whisper", model), whisper_model, whisper_info
        )
        nllb_model.change(
            lambda model: get_model_info("nllb", model), nllb_model, nllb_info
        )

        # Input order must match the parameter order of process_audio_enhanced.
        pipeline_inputs = [
            audio_input, source_lang, target_lang, whisper_model, nllb_model,
            whisper_beam_size, whisper_temperature, vad_filter,
            nllb_beam_size, length_penalty, repetition_penalty, speaking_rate,
        ]
        pipeline_outputs = [status_output, transcript_output, translation_output, audio_output]
        process_btn.click(
            fn=process_audio_enhanced,
            inputs=pipeline_inputs,
            outputs=pipeline_outputs,
        )

        with gr.Accordion("π Model Information", open=False):
            gr.Markdown("""
            ### π€ Whisper Models (OpenAI)
            - **Tiny**: Fastest, smallest model. Good for quick transcription.
            - **Base**: Balanced speed and accuracy. Recommended for most use cases.
            - **Small**: Better accuracy, moderate speed. Good for important content.
            - **Medium**: High accuracy, slower processing. Professional applications.

            ### π NLLB Models (Meta)
            - **600M**: Faster translation with good quality. Supports 200+ languages.
            - **1.3B**: Better translation quality with more parameters. Higher accuracy.

            ### π MMS-TTS (Meta)
            - High-quality multilingual text-to-speech synthesis
            - Supports natural-sounding voice generation
            - Optimized for English and French
            """)

        with gr.Accordion("βοΈ Parameter Guide", open=False):
            gr.Markdown("""
            ### Whisper Parameters
            - **Beam Size**: Higher values = better accuracy, slower processing (1-10)
            - **Temperature**: Higher values = more diverse outputs (0.0-1.0)
            - **VAD Filter**: Removes silence automatically (may require additional dependencies)

            ### Translation Parameters
            - **Beam Size**: Search breadth for translation (1-8)
            - **Length Penalty**: Controls output length preference (0.5-2.0)
            - **Repetition Penalty**: Reduces repetitive translations (0.5-2.0)

            ### Speech Synthesis
            - **Speaking Rate**: Playback speed multiplier (0.5-2.0)
            """)

        with gr.Accordion("π§ Usage Instructions", open=False):
            gr.Markdown("""
            1. **Upload/Record**: Add your audio file or record directly
            2. **Select Languages**: Choose source and target languages
            3. **Choose Models**: Select model sizes based on your speed/quality needs
            4. **Adjust Settings**: Fine-tune advanced parameters if needed
            5. **Translate**: Click the translate button and watch real-time progress
            6. **Download**: Save the translated audio file

            **Tips:**
            - Use smaller models for faster processing
            - Use larger models for better quality
            - Adjust beam sizes for quality vs speed trade-off
            - Speaking rate can make output faster or slower
            """)

    return demo
| |
|
| | |
# Script entry point: build the interface (which also constructs the pipeline)
# and start the Gradio server. `demo` is kept as a module-level name.
if __name__ == "__main__":
    demo = create_enhanced_gradio_interface()
    demo.launch()