Upload app.py

f492cc6 verified 9 months ago

32.2 kB

	"""
	Enhanced Speech-to-Speech Translation Pipeline with Advanced Gradio Interface

	This script implements a complete pipeline for speech-to-speech translation with
	dynamic model selection and advanced configuration options.

	Features:
	- Dynamic Whisper model switching (tiny, base, small, medium)
	- NLLB model selection (600M, 1.3B)
	- Advanced translation parameters (beam size, temperature, etc.)
	- Real-time processing with detailed model information
	- Comprehensive model descriptions and performance metrics

	Requirements:
	- faster-whisper
	- ctranslate2
	- transformers (version 4.33.0+)
	- torch
	- numpy
	- scipy
	- requests (for fallback tokenizer)
	- gradio
	"""

	import os
	import time
	import torch
	import numpy as np
	import ctranslate2
	import scipy.io.wavfile
	from faster_whisper import WhisperModel
	import gradio as gr
	import re
	from pathlib import Path
	from typing import Dict, Optional, Tuple, Generator

	# Silence RuntimeWarnings (e.g. numpy binary-incompatibility notices).
	# PYTHONWARNINGS is only read at interpreter start-up, so setting it here
	# affects child processes only; the in-process filter below covers warnings
	# raised in this process from this point on. Warnings emitted while the
	# imports above were executing have already been reported.
	import warnings

	os.environ["PYTHONWARNINGS"] = "ignore::RuntimeWarning"
	warnings.filterwarnings("ignore", category=RuntimeWarning)

	class EnhancedS2SPipeline:
		"""
		Speech-to-speech translation pipeline: Whisper ASR -> NLLB MT -> MMS-TTS.

		Whisper and NLLB models are loaded on first use and cached per size.
		The MMS-TTS models and the NLLB tokenizer are loaded in ``__init__``.
		"""

		# One sentence: a run of non-terminator characters closed by '.', '!' or '?'.
		_SENTENCE_RE = re.compile(r'[^.!?]+[.!?]')

		# UI language name -> Whisper language code.
		_WHISPER_LANG_CODES = {"English": "en", "French": "fr"}

		# Upper bound on the number of characters sent to the TTS model at once.
		_TTS_MAX_CHARS = 200

		def __init__(self, device="cuda"):
			"""
			Initialize the pipeline with dynamic model loading capability

			Args:
			device: Device to run inference on ('cuda' or 'cpu'). Falls back to
			'cpu' whenever CUDA is not available.
			"""
			self.device = device if torch.cuda.is_available() else "cpu"
			self.compute_type = "float16" if self.device == "cuda" else "int8"

			# Model caches
			self.whisper_models: Dict[str, WhisperModel] = {}
			self.nllb_models: Dict[str, ctranslate2.Translator] = {}
			self.nllb_tokenizer = None
			self.tts_models = {}
			self.tts_tokenizers = {}

			# Model configurations - Updated for HuggingFace Spaces.
			# The Whisper "size" values are parameter counts (in millions), not
			# on-disk megabytes, so they are labelled like the NLLB entries.
			self.model_configs = {
				"whisper": {
					"tiny": {"size": "39M parameters", "speed": "Very Fast", "accuracy": "Good", "multilingual": True},
					"base": {"size": "74M parameters", "speed": "Fast", "accuracy": "Better", "multilingual": True},
					# NOTE(review): "Good" here breaks the tiny < base < small < medium
					# ordering of the accuracy labels; confirm the intended wording.
					"small": {"size": "244M parameters", "speed": "Medium", "accuracy": "Good", "multilingual": True},
					"medium": {"size": "769M parameters", "speed": "Slow", "accuracy": "Very Good", "multilingual": True},
				},
				"nllb": {
					"600M": {
						"path": "./models/nllb-200-distilled-600M-ct2-int8",
						"size": "600M parameters",
						"speed": "Fast",
						"accuracy": "Good",
						"languages": "200+ languages",
					},
					"1.3B": {
						"path": "./models/nllb-200-distilled-1.3B-ct2-int8",
						"size": "1.3B parameters",
						"speed": "Medium",
						"accuracy": "Better",
						"languages": "200+ languages",
					},
				},
			}

			# Language code mappings for NLLB
			self.lang_codes = {
				"English": "eng_Latn",
				"French": "fra_Latn",
			}

			# TTS language mapping
			self.tts_lang_codes = {
				"English": "eng",
				"French": "fra",
			}

			print(f"Enhanced Speech-to-Speech pipeline initialized on {self.device}")

			# Initialize TTS models (these are relatively small, so we can load them upfront)
			self._initialize_tts_models()

			# Initialize tokenizer
			self._initialize_nllb_tokenizer()

		@classmethod
		def _split_sentences(cls, text):
			"""
			Split text into sentences ending in '.', '!' or '?'.

			A '.' is appended when the text has no closing terminator. Blank
			input yields an empty list; text made only of terminators is
			returned as a single item.
			"""
			text = (text or "").strip()
			if not text:
				return []
			if not text.endswith(('.', '!', '?')):
				text += '.'
			sentences = [s.strip() for s in cls._SENTENCE_RE.findall(text)]
			sentences = [s for s in sentences if s]
			return sentences or [text]

		@staticmethod
		def _chunk_sentences(sentences, max_length):
			"""
			Greedily pack sentences into space-joined chunks of at most
			``max_length`` characters. A single sentence longer than
			``max_length`` becomes its own (oversized) chunk.
			"""
			chunks = []
			current = ""
			for sentence in sentences:
				if len(current) + len(sentence) + 1 <= max_length:
					current += (" " if current else "") + sentence
				else:
					if current:
						chunks.append(current)
					current = sentence
			if current:
				chunks.append(current)
			return chunks

		def _initialize_tts_models(self):
			"""
			Load the MMS-TTS model and tokenizer for every supported language.

			Failures are reported and skipped. A language is registered only
			when both its model and its tokenizer loaded, so ``tts_models`` and
			``tts_tokenizers`` always have the same keys.
			"""
			print("Loading MMS-TTS models for English and French...")

			try:
				from transformers.models.vits.modeling_vits import VitsModel
				from transformers.models.vits.tokenization_vits import VitsTokenizer
			except Exception as e:
				print(f"Error loading TTS models: {e}")
				print("TTS functionality may be limited.")
				return

			checkpoints = {
				"English": "facebook/mms-tts-eng",
				"French": "facebook/mms-tts-fra",
			}
			dtype = torch.float16 if self.device == "cuda" else torch.float32

			for language, checkpoint in checkpoints.items():
				print(f"Loading {language} TTS model...")
				try:
					model = VitsModel.from_pretrained(checkpoint, torch_dtype=dtype).to(self.device)
					tokenizer = VitsTokenizer.from_pretrained(checkpoint)
				except Exception as e:
					print(f"Error loading TTS models: {e}")
					print("TTS functionality may be limited.")
					continue
				self.tts_models[language] = model
				self.tts_tokenizers[language] = tokenizer

			if len(self.tts_models) == len(checkpoints):
				print("TTS models loaded successfully.")

		def _initialize_nllb_tokenizer(self):
			"""Load the NLLB tokenizer, falling back to a simplified one on failure."""
			try:
				print("Loading NLLB tokenizer...")
				from transformers.models.nllb.tokenization_nllb import NllbTokenizer
				self.nllb_tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
				print("NLLB tokenizer loaded successfully.")
			except Exception as e:
				print(f"Error loading NLLB tokenizer: {e}")
				print("Implementing simplified fallback tokenizer...")
				self.nllb_tokenizer = self._create_fallback_tokenizer()

		def _create_fallback_tokenizer(self):
			"""
			Build a simplified word/punctuation tokenizer used when the real
			NLLB tokenizer cannot be loaded.

			The vocabulary is cached under ``~/.cache/simplified_nllb_tokenizer``.
			If it cannot be downloaded, a two-entry placeholder vocabulary is
			written instead, so every token then maps to ``[UNK]``.
			"""
			import json
			import requests

			class SimplifiedNllbTokenizer:
				def __init__(self):
					self.src_lang = "eng_Latn"
					cache_dir = Path.home() / ".cache" / "simplified_nllb_tokenizer"
					cache_dir.mkdir(parents=True, exist_ok=True)
					vocab_file = cache_dir / "vocab.json"

					if not vocab_file.exists():
						print("Downloading NLLB vocabulary for fallback tokenizer...")
						# NOTE(review): confirm that this repository actually serves a
						# vocab.json; if not, the placeholder vocabulary below is used.
						url = "https://huggingface.co/facebook/nllb-200-distilled-600M/resolve/main/vocab.json"
						try:
							# A timeout keeps start-up from hanging on a stalled connection.
							response = requests.get(url, timeout=30)
							response.raise_for_status()
							with open(vocab_file, 'wb') as f:
								f.write(response.content)
							print("Vocabulary downloaded successfully.")
						except requests.exceptions.RequestException as req_e:
							print(f"Failed to download vocabulary: {req_e}")
							with open(vocab_file, 'w', encoding='utf-8') as f:
								json.dump({"[PAD]": 0, "[UNK]": 1}, f)

					with open(vocab_file, 'r', encoding='utf-8') as f:
						self.vocab = json.load(f)
					self.id_to_token = {v: k for k, v in self.vocab.items()}

				def tokenize(self, text):
					"""Lower-case the text and split it into words and punctuation marks."""
					# Alternation, not a literal pipe: a word OR one punctuation character.
					return re.findall(r'\w+|[^\w\s]', text.lower())

				def convert_tokens_to_ids(self, tokens):
					unk_id = self.vocab.get("[UNK]", 1)
					return [self.vocab.get(token, unk_id) for token in tokens]

				def convert_ids_to_tokens(self, ids):
					return [self.id_to_token.get(token_id, "[UNK]") for token_id in ids]

				def decode(self, token_ids, skip_special_tokens=True):
					tokens = self.convert_ids_to_tokens(token_ids)
					if skip_special_tokens:
						# Special tokens are fully bracketed ("[PAD]"); a lone "[" or "]"
						# is ordinary punctuation and must be kept.
						tokens = [t for t in tokens if not (t.startswith("[") and t.endswith("]"))]
					return " ".join(tokens)

				def __call__(self, text, return_tensors=None, padding=False):
					input_ids = self.convert_tokens_to_ids(self.tokenize(text))
					if return_tensors == "pt":
						return {"input_ids": torch.tensor([input_ids])}
					return {"input_ids": [input_ids]}

			return SimplifiedNllbTokenizer()

		def get_whisper_model(self, model_size: str) -> "WhisperModel":
			"""Return the cached Whisper model for ``model_size``, loading it on first use."""
			if model_size not in self.whisper_models:
				print(f"Loading Whisper model '{model_size}'...")

				# Try to load from local models directory first
				# NOTE(review): confirm WhisperModel accepts a single ".pt" file here;
				# it may expect a converted model directory instead.
				model_path = f"./models/whisper/{model_size}.pt"
				if os.path.exists(model_path):
					print(f"Loading Whisper model from local path: {model_path}")
					model_source = model_path
				else:
					# Fallback to HuggingFace Hub
					print(f"Loading Whisper model from HuggingFace Hub: {model_size}")
					model_source = model_size

				self.whisper_models[model_size] = WhisperModel(
					model_source,
					device=self.device,
					compute_type=self.compute_type,
				)
				print(f"Whisper '{model_size}' loaded successfully.")
			return self.whisper_models[model_size]

		def get_nllb_model(self, model_size: str) -> "ctranslate2.Translator":
			"""
			Return the cached NLLB translator for ``model_size``, loading it on first use.

			Raises:
			KeyError: ``model_size`` is not a key of ``model_configs["nllb"]``.
			RuntimeError: the model could not be loaded from its configured path.
			"""
			if model_size not in self.nllb_models:
				model_path = self.model_configs["nllb"][model_size]["path"]
				print(f"Loading NLLB model '{model_size}' from {model_path}...")
				try:
					self.nllb_models[model_size] = ctranslate2.Translator(
						model_path,
						device=self.device,
						compute_type=self.compute_type,
					)
				except RuntimeError as e:
					print(f"ERROR: Failed to load NLLB model from '{model_path}': {e}")
					print("Please ensure the path is correct and contains model files.")
					raise
				print(f"NLLB '{model_size}' loaded successfully.")
			return self.nllb_models[model_size]

		def transcribe_realtime(self, audio_file, source_lang=None, whisper_model="tiny",
								vad_filter=False, beam_size=5, temperature=0.0):
			"""
			Transcribe ``audio_file``, yielding the growing transcript.

			Yields ``(transcript_so_far, detected_language)`` tuples: first an
			empty transcript, then one update per segment, then the final
			transcript once more. ``source_lang`` of None (or any name other
			than "English"/"French") leaves language detection to Whisper.
			"""
			print(f"\n1. Transcribing with Whisper-{whisper_model}...")
			start_time = time.time()

			whisper = self.get_whisper_model(whisper_model)

			transcribe_params = {
				"language": self._WHISPER_LANG_CODES.get(source_lang),
				"beam_size": beam_size,
				"vad_filter": vad_filter,
				"word_timestamps": False,
			}
			# Only override the backend's default temperature when sampling is requested.
			if temperature > 0:
				transcribe_params["temperature"] = temperature

			segments_generator, info = whisper.transcribe(audio_file, **transcribe_params)
			detected_language = info.language if info else None

			yield "", detected_language

			full_transcript = ""
			for segment in segments_generator:
				full_transcript += segment.text + " "
				yield full_transcript.strip(), detected_language

			elapsed_time = time.time() - start_time
			print(f"Transcription completed in {elapsed_time:.2f}s with {whisper_model}")
			if info:
				print(f"Detected language: {info.language} (confidence: {info.language_probability:.4f})")

			yield full_transcript.strip(), detected_language

		def translate_realtime(self, text_to_translate, source_lang, target_lang,
								nllb_model="600M", beam_size=4, length_penalty=1.0,
								repetition_penalty=1.0):
			"""
			Translate text sentence by sentence, yielding the growing translation.

			A sentence that fails to translate contributes a
			"[Translation error for segment N]" marker instead of aborting.
			The final translation is yielded once more at the end.

			Raises:
			ValueError: either language is missing from ``lang_codes``.
			"""
			print(f"\n2. Translating with NLLB-{nllb_model}...")
			start_time = time.time()

			translator = self.get_nllb_model(nllb_model)

			src_lang_nllb = self.lang_codes.get(source_lang)
			tgt_lang_nllb = self.lang_codes.get(target_lang)
			if not src_lang_nllb or not tgt_lang_nllb:
				raise ValueError(f"Unsupported language pair: {source_lang} -> {target_lang}")

			self.nllb_tokenizer.src_lang = src_lang_nllb

			full_translation = ""

			for i, sentence in enumerate(self._split_sentences(text_to_translate)):
				try:
					tokenizer_output = self.nllb_tokenizer(sentence, return_tensors="pt", padding=True)
					source_ids = tokenizer_output["input_ids"].tolist()[0]
					source_tokens = self.nllb_tokenizer.convert_ids_to_tokens(source_ids)

					result = translator.translate_batch(
						[source_tokens],
						target_prefix=[[tgt_lang_nllb]],
						beam_size=beam_size,
						length_penalty=length_penalty,
						repetition_penalty=repetition_penalty,
						max_batch_size=32,
					)[0]

					# The hypothesis starts with the target-language prefix token; drop it.
					hypothesis = result.hypotheses[0]
					tgt_tokens = hypothesis[1:] if len(hypothesis) > 1 else hypothesis

					chunk_translation = self.nllb_tokenizer.decode(
						self.nllb_tokenizer.convert_tokens_to_ids(tgt_tokens),
						skip_special_tokens=True,
					)
					full_translation += chunk_translation + " "
				except Exception as e:
					print(f"Error translating sentence {i+1}: {e}")
					full_translation += f"[Translation error for segment {i+1}] "

				yield full_translation.strip()

			elapsed_time = time.time() - start_time
			print(f"Translation completed in {elapsed_time:.2f}s with NLLB-{nllb_model}")

			yield full_translation.strip()

		def synthesize(self, text, target_lang, output_file="output.wav", speaking_rate=1.0):
			"""
			Synthesize ``text`` in ``target_lang`` and write it as 16-bit WAV.

			Chunks that fail to synthesize are reported and skipped; if nothing
			was synthesized, one second of silence is written instead.

			Returns:
			``(output_file, audio_duration_in_seconds)``

			Raises:
			ValueError: no TTS model is loaded for ``target_lang``, or
			``speaking_rate`` is not positive.
			"""
			print(f"\n3. Synthesizing speech in {target_lang}...")
			start_time = time.time()

			if target_lang not in self.tts_models:
				raise ValueError(f"TTS for language {target_lang} not supported")
			if speaking_rate <= 0:
				raise ValueError(f"speaking_rate must be positive, got {speaking_rate}")

			model = self.tts_models[target_lang]
			tokenizer = self.tts_tokenizers[target_lang]
			sampling_rate = model.config.sampling_rate

			text_chunks = self._chunk_sentences(self._split_sentences(text), self._TTS_MAX_CHARS)
			print(f"Text split into {len(text_chunks)} chunks for TTS")

			adjust_rate = speaking_rate != 1.0
			if adjust_rate:
				from scipy.signal import resample

			all_audio = []
			for i, chunk in enumerate(text_chunks):
				try:
					inputs = tokenizer(text=chunk, return_tensors="pt")
					inputs = {k: v.to(self.device) for k, v in inputs.items()}

					# Fixed per-chunk seed (presumably so repeated runs give the same audio).
					torch.manual_seed(555 + i)

					with torch.no_grad():
						output = model(**inputs).waveform

					chunk_audio = np.atleast_1d(output.squeeze().cpu().float().numpy())

					# Changing the sample count while keeping the sampling rate
					# changes playback speed (and pitch along with it).
					if adjust_rate:
						new_length = max(1, int(len(chunk_audio) / speaking_rate))
						chunk_audio = resample(chunk_audio, new_length)

					all_audio.append(chunk_audio)
				except Exception as e:
					print(f"Error generating speech for chunk {i+1}: {e}")

			# Combine audio chunks; fall back to one second of silence.
			if all_audio:
				try:
					audio_data = np.concatenate(all_audio)
				except Exception as e:
					print(f"Error concatenating audio: {e}")
					audio_data = all_audio[0]
			else:
				audio_data = np.zeros(sampling_rate, dtype=np.float32)

			audio_data = audio_data.astype(np.float32, copy=False)

			# Peak-normalize, then convert to 16-bit PCM.
			peak = np.max(np.abs(audio_data)) if audio_data.size else 0.0
			if peak > 0:
				audio_data = audio_data / peak
			audio_data_int16 = (audio_data * 32767).astype(np.int16)

			scipy.io.wavfile.write(output_file, rate=sampling_rate, data=audio_data_int16)

			elapsed_time = time.time() - start_time
			audio_duration = len(audio_data) / sampling_rate
			print(f"Speech synthesis completed in {elapsed_time:.2f}s")
			if audio_duration > 0:
				print(f"Generated {audio_duration:.2f}s of audio (RTF: {elapsed_time/audio_duration:.2f}x)")

			return output_file, audio_duration

		def process_speech_to_speech_realtime(self, audio_file, source_lang, target_lang,
												whisper_model="tiny", nllb_model="600M",
												whisper_beam_size=5, whisper_temperature=0.0,
												vad_filter=False, nllb_beam_size=4,
												length_penalty=1.0, repetition_penalty=1.0,
												speaking_rate=1.0, output_file=None):
			"""
			Run transcription, translation and synthesis, yielding progress.

			Yields ``(status, transcript, translation, audio_path)`` tuples;
			``audio_path`` is None until the final tuple of a successful run.
			When a stage fails, results of the stages that already succeeded
			are kept and only the failed stage is marked as failed.

			If ``source_lang`` is empty, the language detected by Whisper is
			used for translation when it is one of the supported languages.
			"""
			if output_file is None:
				output_file = f"output_{source_lang}_to_{target_lang}_{int(time.time())}.wav"

			print(f"\n===== ENHANCED SPEECH-TO-SPEECH TRANSLATION =====")
			print(f"Models: Whisper-{whisper_model}, NLLB-{nllb_model}")
			print(f"Languages: {source_lang} -> {target_lang}")

			total_start_time = time.time()

			current_transcript = ""
			current_translation = ""
			detected_lang = None
			output_path = None
			audio_duration = 0
			success = False
			stage = "transcription"
			error_detail = ""

			try:
				# Step 1: Transcribe
				yield "🎤 Transcribing audio...", "", "", None
				for partial_transcript, lang in self.transcribe_realtime(
					audio_file, source_lang, whisper_model, vad_filter,
					whisper_beam_size, whisper_temperature
				):
					current_transcript = partial_transcript
					detected_lang = lang
					yield "🎤 Transcribing audio...", current_transcript, current_translation, None

				if not source_lang:
					names_by_code = {code: name for name, code in self._WHISPER_LANG_CODES.items()}
					source_lang = names_by_code.get(detected_lang)

				# Step 2: Translate
				stage = "translation"
				yield "🔄 Translating text...", current_transcript, current_translation, None
				for partial_translation in self.translate_realtime(
					current_transcript, source_lang, target_lang, nllb_model,
					nllb_beam_size, length_penalty, repetition_penalty
				):
					current_translation = partial_translation
					yield "🔄 Translating text...", current_transcript, current_translation, None

				# Step 3: Synthesize
				stage = "speech synthesis"
				yield "🔊 Synthesizing speech...", current_transcript, current_translation, None
				output_path, audio_duration = self.synthesize(
					current_translation, target_lang, output_file, speaking_rate
				)

				success = True

			except Exception as e:
				print(f"ERROR in pipeline: {e}")
				import traceback
				traceback.print_exc()
				error_detail = str(e)
				output_path = None
				# Only blank out the outputs of stages that did not complete.
				if stage == "transcription":
					current_transcript = "❌ Transcription failed"
				if stage in ("transcription", "translation"):
					current_translation = "❌ Translation failed"

			total_elapsed_time = time.time() - total_start_time

			if success:
				status = (f"✅ Success! Total time: {total_elapsed_time:.2f}s, "
						f"Audio: {audio_duration:.2f}s")
			else:
				status = f"❌ Processing failed during {stage}: {error_detail}"

			print(f"\n===== TRANSLATION {'COMPLETED' if success else 'FAILED'} =====")

			yield status, current_transcript, current_translation, output_path

	def create_enhanced_gradio_interface():
		"""
		Create enhanced Gradio interface with model selection and advanced options

		Builds one EnhancedS2SPipeline and wires it to a Blocks UI.

		Returns:
		The ``gr.Blocks`` app; the caller is responsible for launching it.
		"""

		# Initialize pipeline
		pipeline = EnhancedS2SPipeline()

		def get_model_info(model_type, model_name):
			"""Get model information for display"""
			config = pipeline.model_configs[model_type][model_name]
			if model_type == "whisper":
				return f"{model_name.upper()} - Size: {config['size']}, Speed: {config['speed']}, Accuracy: {config['accuracy']}"
			return f"{model_name} - {config['size']}, Speed: {config['speed']}, Accuracy: {config['accuracy']}"

		def process_audio_enhanced(audio_file, source_lang_str, target_lang_str,
									whisper_model, nllb_model, whisper_beam_size,
									whisper_temperature, vad_filter, nllb_beam_size,
									length_penalty, repetition_penalty, speaking_rate):
			"""Stream (status, transcript, translation, audio) updates for one request."""
			if audio_file is None:
				yield "❌ No audio provided", "No transcript available", "No translation available", None
				return

			yield from pipeline.process_speech_to_speech_realtime(
				audio_file=audio_file,
				source_lang=source_lang_str,
				target_lang=target_lang_str,
				whisper_model=whisper_model,
				nllb_model=nllb_model,
				# Sliders may hand back floats; beam sizes are whole numbers.
				whisper_beam_size=int(whisper_beam_size),
				whisper_temperature=whisper_temperature,
				vad_filter=vad_filter,
				nllb_beam_size=int(nllb_beam_size),
				length_penalty=length_penalty,
				repetition_penalty=repetition_penalty,
				speaking_rate=speaking_rate,
			)

		# Only offer examples whose audio file is actually present.
		example_rows = [
			["./examples/input_audio/eng1.wav", "English", "French", "tiny", "600M"],
			["./examples/input_audio/fr1.wav", "French", "English", "tiny", "600M"],
			["./examples/input_audio/eng2.wav", "English", "French", "base", "600M"],
		]
		available_examples = [row for row in example_rows if os.path.exists(row[0])]

		# Create the interface
		with gr.Blocks(title="Enhanced Speech-to-Speech Translation", theme=gr.themes.Soft()) as demo:
			gr.Markdown("# 🎙️ Enhanced Speech-to-Speech Translation")
			gr.Markdown("Advanced AI-powered speech translation with configurable models and parameters.")

			with gr.Row():
				with gr.Column(scale=1):
					gr.Markdown("### 📥 Input Configuration")

					audio_input = gr.Audio(
						sources=["microphone", "upload"],
						type="filepath",
						label="🎵 Upload or Record Audio"
					)

					with gr.Row():
						source_lang = gr.Radio(
							choices=["English", "French"],
							value="English",
							label="📢 Source Language"
						)
						target_lang = gr.Radio(
							choices=["English", "French"],
							value="French",
							label="🎯 Target Language"
						)

					gr.Markdown("### 🧠 Model Selection")

					with gr.Accordion("🎤 Whisper ASR Model", open=True):
						whisper_model = gr.Radio(
							choices=["tiny", "base", "small", "medium"],
							value="tiny",
							label="Model Size"
						)
						whisper_info = gr.Markdown(get_model_info("whisper", "tiny"))

					with gr.Accordion("🔄 NLLB Translation Model", open=True):
						nllb_model = gr.Radio(
							choices=["600M", "1.3B"],
							value="600M",
							label="Model Size"
						)
						nllb_info = gr.Markdown(get_model_info("nllb", "600M"))

					with gr.Accordion("⚙️ Advanced Settings", open=False):
						gr.Markdown("Whisper Parameters")
						whisper_beam_size = gr.Slider(1, 10, value=5, step=1, label="Beam Size")
						whisper_temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature")
						vad_filter = gr.Checkbox(label="Voice Activity Detection", value=False)

						gr.Markdown("Translation Parameters")
						nllb_beam_size = gr.Slider(1, 8, value=4, step=1, label="Beam Size")
						length_penalty = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Length Penalty")
						repetition_penalty = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Repetition Penalty")

						gr.Markdown("Speech Synthesis")
						speaking_rate = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speaking Rate")

					process_btn = gr.Button("🚀 Translate", variant="primary", size="lg")

				with gr.Column(scale=1):
					gr.Markdown("### 📤 Results")

					status_output = gr.Textbox(label="📊 Status", interactive=False)

					with gr.Tabs():
						with gr.TabItem("📝 Text Results"):
							transcript_output = gr.Textbox(
								label="🎤 Original Transcript",
								lines=6,
								interactive=False
							)
							translation_output = gr.Textbox(
								label="🔄 Translation",
								lines=6,
								interactive=False
							)

						with gr.TabItem("🔊 Audio Output"):
							audio_output = gr.Audio(
								type="filepath",
								label="🔊 Translated Speech"
							)

			# Example section (omitted entirely when no example audio is available)
			if available_examples:
				with gr.Row():
					gr.Markdown("### 🎵 Try Our Examples")
				with gr.Row():
					gr.Examples(
						examples=available_examples,
						inputs=[audio_input, source_lang, target_lang, whisper_model, nllb_model],
						label="Sample Audio Files"
					)

			# Model info update functions
			def update_whisper_info(model):
				return get_model_info("whisper", model)

			def update_nllb_info(model):
				return get_model_info("nllb", model)

			# Connect update functions
			whisper_model.change(update_whisper_info, whisper_model, whisper_info)
			nllb_model.change(update_nllb_info, nllb_model, nllb_info)

			# Main processing function
			process_btn.click(
				fn=process_audio_enhanced,
				inputs=[
					audio_input, source_lang, target_lang, whisper_model, nllb_model,
					whisper_beam_size, whisper_temperature, vad_filter,
					nllb_beam_size, length_penalty, repetition_penalty, speaking_rate
				],
				outputs=[status_output, transcript_output, translation_output, audio_output]
			)

			# Information sections
			with gr.Accordion("📚 Model Information", open=False):
				gr.Markdown("""
				### 🎤 Whisper Models (OpenAI)
				- Tiny: Fastest, smallest model. Good for quick transcription.
				- Base: Balanced speed and accuracy. Recommended for most use cases.
				- Small: Better accuracy, moderate speed. Good for important content.
				- Medium: High accuracy, slower processing. Professional applications.

				### 🔄 NLLB Models (Meta)
				- 600M: Faster translation with good quality. Supports 200+ languages.
				- 1.3B: Better translation quality with more parameters. Higher accuracy.

				### 🔊 MMS-TTS (Meta)
				- High-quality multilingual text-to-speech synthesis
				- Supports natural-sounding voice generation
				- Optimized for English and French
				""")

			with gr.Accordion("⚙️ Parameter Guide", open=False):
				gr.Markdown("""
				### Whisper Parameters
				- Beam Size: Higher values = better accuracy, slower processing (1-10)
				- Temperature: Higher values = more diverse outputs (0.0-1.0)
				- VAD Filter: Removes silence automatically (may require additional dependencies)

				### Translation Parameters
				- Beam Size: Search breadth for translation (1-8)
				- Length Penalty: Controls output length preference (0.5-2.0)
				- Repetition Penalty: Reduces repetitive translations (0.5-2.0)

				### Speech Synthesis
				- Speaking Rate: Playback speed multiplier (0.5-2.0)
				""")

			with gr.Accordion("🔧 Usage Instructions", open=False):
				gr.Markdown("""
				1. Upload/Record: Add your audio file or record directly
				2. Select Languages: Choose source and target languages
				3. Choose Models: Select model sizes based on your speed/quality needs
				4. Adjust Settings: Fine-tune advanced parameters if needed
				5. Translate: Click the translate button and watch real-time progress
				6. Download: Save the translated audio file

				Tips:
				- Use smaller models for faster processing
				- Use larger models for better quality
				- Adjust beam sizes for quality vs speed trade-off
				- Speaking rate can make output faster or slower
				""")

		return demo

	# Launch the application when run as a script (not when imported).
	if __name__ == "__main__":
		demo = create_enhanced_gradio_interface()
		demo.launch()