pruthvi423 committed on
Commit
f492cc6
·
verified ·
1 Parent(s): 44dac74

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +739 -0
app.py ADDED
@@ -0,0 +1,739 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced Speech-to-Speech Translation Pipeline with Advanced Gradio Interface
3
+
4
+ This script implements a complete pipeline for speech-to-speech translation with
5
+ dynamic model selection and advanced configuration options.
6
+
7
+ Features:
8
+ - Dynamic Whisper model switching (tiny, base, small, medium)
9
+ - NLLB model selection (600M, 1.3B)
10
+ - Advanced translation parameters (beam size, temperature, etc.)
11
+ - Real-time processing with detailed model information
12
+ - Comprehensive model descriptions and performance metrics
13
+
14
+ Requirements:
15
+ - faster-whisper
16
+ - ctranslate2
17
+ - transformers (version 4.33.0+)
18
+ - torch
19
+ - numpy
20
+ - scipy
21
+ - requests (for fallback tokenizer)
22
+ - gradio
23
+ """
24
+
25
+ import os
26
+ import time
27
+ import torch
28
+ import numpy as np
29
+ import ctranslate2
30
+ import scipy.io.wavfile
31
+ from faster_whisper import WhisperModel
32
+ import gradio as gr
33
+ import re
34
+ from pathlib import Path
35
+ from typing import Dict, Optional, Tuple, Generator
36
+
37
+ # Fix for numpy binary incompatibility
38
+ os.environ["PYTHONWARNINGS"] = "ignore::RuntimeWarning"
39
+
40
class EnhancedS2SPipeline:
    """
    Enhanced Speech-to-Speech Translation Pipeline with dynamic model loading.

    Chains three stages: Whisper transcription, NLLB translation and MMS-TTS
    synthesis. Whisper and NLLB models are loaded on first use and cached per
    model size; the TTS models and the NLLB tokenizer are loaded in __init__.
    """

    # Display language name -> language code passed to Whisper
    _WHISPER_LANG_CODES = {"English": "en", "French": "fr"}
    # A "sentence" is a run of non-terminators followed by one terminator
    _SENTENCE_RE = re.compile(r'[^.!?]+[.!?]')
    # Maximum number of characters sent to the TTS model in one call
    _TTS_MAX_CHUNK_CHARS = 200

    def __init__(self, device="cuda"):
        """
        Initialize the pipeline with dynamic model loading capability.

        Args:
            device: Device to run inference on ('cuda' or 'cpu'). Falls back
                to 'cpu' when torch reports that CUDA is unavailable.
        """
        self.device = device if torch.cuda.is_available() else "cpu"
        self.compute_type = "float16" if self.device == "cuda" else "int8"

        # Model caches, keyed by model size (Whisper/NLLB) or language (TTS)
        self.whisper_models: Dict[str, "WhisperModel"] = {}
        self.nllb_models: Dict[str, "ctranslate2.Translator"] = {}
        self.nllb_tokenizer = None
        self.tts_models = {}
        self.tts_tokenizers = {}

        # Model configurations - Updated for HuggingFace Spaces
        self.model_configs = {
            "whisper": {
                "tiny": {"size": "39 MB", "speed": "Very Fast", "accuracy": "Good", "multilingual": True},
                "base": {"size": "74 MB", "speed": "Fast", "accuracy": "Better", "multilingual": True},
                "small": {"size": "244 MB", "speed": "Medium", "accuracy": "Good", "multilingual": True},
                "medium": {"size": "769 MB", "speed": "Slow", "accuracy": "Very Good", "multilingual": True},
            },
            "nllb": {
                "600M": {
                    "path": "./models/nllb-200-distilled-600M-ct2-int8",
                    "size": "600M parameters",
                    "speed": "Fast",
                    "accuracy": "Good",
                    "languages": "200+ languages",
                },
                "1.3B": {
                    "path": "./models/nllb-200-distilled-1.3B-ct2-int8",
                    "size": "1.3B parameters",
                    "speed": "Medium",
                    "accuracy": "Better",
                    "languages": "200+ languages",
                },
            },
        }

        # Language code mappings for NLLB
        self.lang_codes = {
            "English": "eng_Latn",
            "French": "fra_Latn",
        }

        # TTS language mapping (suffix of the facebook/mms-tts-* repository)
        self.tts_lang_codes = {
            "English": "eng",
            "French": "fra",
        }

        print(f"Enhanced Speech-to-Speech pipeline initialized on {self.device}")

        # Initialize TTS models (these are relatively small, so we can load them upfront)
        self._initialize_tts_models()

        # Initialize tokenizer
        self._initialize_nllb_tokenizer()

    @classmethod
    def _split_sentences(cls, text):
        """Split text into stripped sentences ending in '.', '!' or '?'.

        A final '.' is added when the stripped text has no terminator. Returns
        an empty list for blank text, and ``[text]`` when no sentence pattern
        matches (e.g. text made only of punctuation).
        """
        text = text.strip()
        if not text:
            return []
        if not text.endswith(('.', '!', '?')):
            text += '.'
        sentences = [s.strip() for s in cls._SENTENCE_RE.findall(text)]
        sentences = [s for s in sentences if s]
        return sentences or [text]

    def _chunk_text_for_tts(self, text):
        """Group sentences into chunks of at most _TTS_MAX_CHUNK_CHARS characters.

        A single sentence longer than the limit becomes its own chunk. Falls
        back to ``[text]`` when the text yields no sentences.
        """
        chunks = []
        current = ""
        for sentence in self._split_sentences(text):
            if len(current) + len(sentence) + 1 <= self._TTS_MAX_CHUNK_CHARS:
                current += (" " if current else "") + sentence
            else:
                if current:
                    chunks.append(current)
                current = sentence
        if current:
            chunks.append(current)
        return chunks or [text]

    def _initialize_tts_models(self):
        """Load an MMS-TTS model and tokenizer for every entry in tts_lang_codes.

        Best effort: a language that fails to load is reported and skipped so
        the remaining languages stay usable. Model and tokenizer are registered
        together, so a language is either fully available or absent.
        """
        languages = " and ".join(self.tts_lang_codes)
        print(f"Loading MMS-TTS models for {languages}...")

        try:
            from transformers.models.vits.modeling_vits import VitsModel
            from transformers.models.vits.tokenization_vits import VitsTokenizer
        except Exception as e:
            print(f"Error loading TTS models: {e}")
            print("TTS functionality may be limited.")
            return

        dtype = torch.float16 if self.device == "cuda" else torch.float32
        for language, code in self.tts_lang_codes.items():
            repo_id = f"facebook/mms-tts-{code}"
            print(f"Loading {language} TTS model...")
            try:
                model = VitsModel.from_pretrained(repo_id, torch_dtype=dtype).to(self.device)
                tokenizer = VitsTokenizer.from_pretrained(repo_id)
            except Exception as e:
                print(f"Error loading TTS models: {e}")
                continue
            self.tts_models[language] = model
            self.tts_tokenizers[language] = tokenizer

        if len(self.tts_models) == len(self.tts_lang_codes):
            print("TTS models loaded successfully.")
        else:
            print("TTS functionality may be limited.")

    def _initialize_nllb_tokenizer(self):
        """Load the NLLB tokenizer, falling back to the simplified tokenizer on any error."""
        try:
            print("Loading NLLB tokenizer...")
            from transformers.models.nllb.tokenization_nllb import NllbTokenizer
            self.nllb_tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
            print("NLLB tokenizer loaded successfully.")
        except Exception as e:
            print(f"Error loading NLLB tokenizer: {e}")
            print("Implementing simplified fallback tokenizer...")
            self.nllb_tokenizer = self._create_fallback_tokenizer()

    def _create_fallback_tokenizer(self):
        """Create a simplified fallback tokenizer for NLLB.

        The returned object offers the subset of the tokenizer interface this
        class uses: ``src_lang``, ``__call__``, ``convert_ids_to_tokens``,
        ``convert_tokens_to_ids`` and ``decode``.
        """
        import json

        class SimplifiedNllbTokenizer:
            VOCAB_URL = "https://huggingface.co/facebook/nllb-200-distilled-600M/resolve/main/vocab.json"
            STUB_VOCAB = {"[PAD]": 0, "[UNK]": 1}

            def __init__(self):
                self.src_lang = "eng_Latn"
                cache_dir = Path.home() / ".cache" / "simplified_nllb_tokenizer"
                cache_dir.mkdir(parents=True, exist_ok=True)
                self.vocab = self._load_vocab(cache_dir / "vocab.json")
                self.id_to_token = {v: k for k, v in self.vocab.items()}

            def _load_vocab(self, vocab_file):
                """Return the cached vocabulary, downloading it when needed.

                When the download or parsing fails, a two-entry stub is
                returned in memory only, so a later run can retry the download.
                """
                if vocab_file.exists():
                    try:
                        with open(vocab_file, 'r', encoding='utf-8') as f:
                            return json.load(f)
                    except (OSError, ValueError) as read_e:
                        print(f"Cached vocabulary unreadable, re-downloading: {read_e}")

                print("Downloading NLLB vocabulary for fallback tokenizer...")
                try:
                    import requests
                    # A timeout keeps start-up from hanging on a stalled connection.
                    response = requests.get(self.VOCAB_URL, timeout=30)
                    response.raise_for_status()
                    vocab = json.loads(response.content.decode('utf-8'))
                    with open(vocab_file, 'wb') as f:
                        f.write(response.content)
                    print("Vocabulary downloaded successfully.")
                    return vocab
                except Exception as req_e:
                    print(f"Failed to download vocabulary: {req_e}")
                    return dict(self.STUB_VOCAB)

            def tokenize(self, text):
                # Lower-cased words and single punctuation marks
                return re.findall(r'\w+|[^\w\s]', text.lower())

            def convert_tokens_to_ids(self, tokens):
                unk_id = self.vocab.get("[UNK]", 1)
                return [self.vocab.get(token, unk_id) for token in tokens]

            def convert_ids_to_tokens(self, ids):
                return [self.id_to_token.get(i, "[UNK]") for i in ids]

            def decode(self, token_ids, skip_special_tokens=True):
                tokens = self.convert_ids_to_tokens(token_ids)
                if skip_special_tokens:
                    # Special tokens are fully bracketed ("[PAD]"); a lone "["
                    # or "]" is ordinary punctuation and must be kept.
                    tokens = [t for t in tokens
                              if not (t.startswith("[") and t.endswith("]"))]
                return " ".join(tokens)

            def __call__(self, text, return_tensors=None, padding=False):
                input_ids = self.convert_tokens_to_ids(self.tokenize(text))
                if return_tensors == "pt":
                    import torch
                    return {"input_ids": torch.tensor([input_ids])}
                return {"input_ids": [input_ids]}

        return SimplifiedNllbTokenizer()

    def get_whisper_model(self, model_size: str) -> "WhisperModel":
        """Return the cached Whisper model for model_size, loading it on first use.

        A file at ./models/whisper/<model_size>.pt takes precedence; otherwise
        the size name itself is handed to WhisperModel.
        """
        if model_size not in self.whisper_models:
            print(f"Loading Whisper model '{model_size}'...")

            # NOTE(review): faster-whisper normally loads a converted model
            # directory; confirm that a ".pt" file is loadable here.
            local_path = f"./models/whisper/{model_size}.pt"
            if os.path.exists(local_path):
                print(f"Loading Whisper model from local path: {local_path}")
                model_source = local_path
            else:
                print(f"Loading Whisper model from HuggingFace Hub: {model_size}")
                model_source = model_size

            self.whisper_models[model_size] = WhisperModel(
                model_source,
                device=self.device,
                compute_type=self.compute_type,
            )
            print(f"Whisper '{model_size}' loaded successfully.")
        return self.whisper_models[model_size]

    def get_nllb_model(self, model_size: str) -> "ctranslate2.Translator":
        """Return the cached NLLB translator for model_size, loading it on first use.

        Raises:
            KeyError: model_size is not configured in model_configs["nllb"].
            RuntimeError: re-raised from ctranslate2 when loading fails.
        """
        if model_size not in self.nllb_models:
            nllb_configs = self.model_configs["nllb"]
            if model_size not in nllb_configs:
                available = ", ".join(nllb_configs)
                raise KeyError(
                    f"Unknown NLLB model '{model_size}'. Available: {available}"
                )
            model_path = nllb_configs[model_size]["path"]
            print(f"Loading NLLB model '{model_size}' from {model_path}...")
            try:
                self.nllb_models[model_size] = ctranslate2.Translator(
                    model_path,
                    device=self.device,
                    compute_type=self.compute_type,
                )
            except RuntimeError as e:
                print(f"ERROR: Failed to load NLLB model from '{model_path}': {e}")
                print("Please ensure the path is correct and contains model files.")
                raise
            print(f"NLLB '{model_size}' loaded successfully.")
        return self.nllb_models[model_size]

    def transcribe_realtime(self, audio_file, source_lang=None, whisper_model="tiny",
                            vad_filter=False, beam_size=5, temperature=0.0):
        """Transcribe audio, yielding the growing transcript segment by segment.

        Args:
            audio_file: Audio input handed to the Whisper model's transcribe().
            source_lang: "English" or "French" to force the language; any
                other value lets Whisper pick the language.
            whisper_model: Whisper model size to use.
            vad_filter: Forwarded to transcribe().
            beam_size: Beam size; coerced to int because UI sliders may
                deliver floats.
            temperature: Forwarded only when greater than 0.

        Yields:
            (transcript_so_far, language) tuples: one with an empty transcript
            first, one per segment, and the final transcript once more at the
            end.
        """
        print(f"\n1. Transcribing with Whisper-{whisper_model}...")
        start_time = time.time()

        whisper = self.get_whisper_model(whisper_model)
        whisper_lang = self._WHISPER_LANG_CODES.get(source_lang) if source_lang else None

        transcribe_params = {
            "language": whisper_lang,
            "beam_size": int(beam_size),
            "vad_filter": vad_filter,
            "word_timestamps": False,
        }
        if temperature > 0:
            transcribe_params["temperature"] = temperature

        segments_generator, info = whisper.transcribe(audio_file, **transcribe_params)
        language = info.language if info else None

        yield "", language

        # Segment texts are stripped before joining so the transcript never
        # contains doubled spaces.
        pieces = []
        for segment in segments_generator:
            piece = segment.text.strip()
            if piece:
                pieces.append(piece)
            yield " ".join(pieces), language

        elapsed_time = time.time() - start_time
        print(f"Transcription completed in {elapsed_time:.2f}s with {whisper_model}")
        if info:
            print(f"Detected language: {info.language} (confidence: {info.language_probability:.4f})")

        yield " ".join(pieces), language

    def translate_realtime(self, text_to_translate, source_lang, target_lang,
                           nllb_model="600M", beam_size=4, length_penalty=1.0,
                           repetition_penalty=1.0):
        """Translate text sentence by sentence, yielding the growing translation.

        Args:
            text_to_translate: Source text; split into sentences first.
            source_lang, target_lang: Keys of self.lang_codes.
            nllb_model: NLLB model size to use.
            beam_size: Beam size; coerced to int.
            length_penalty, repetition_penalty: Forwarded to translate_batch().

        Yields:
            The accumulated translation after every sentence and once more at
            the end. A sentence that fails is replaced by an inline
            "[Translation error for segment N]" marker.

        Raises:
            ValueError: either language is missing from self.lang_codes.
        """
        print(f"\n2. Translating with NLLB-{nllb_model}...")
        start_time = time.time()

        src_lang_nllb = self.lang_codes.get(source_lang)
        tgt_lang_nllb = self.lang_codes.get(target_lang)
        # Validate before loading so a bad language never triggers a model load.
        if not src_lang_nllb or not tgt_lang_nllb:
            raise ValueError(f"Unsupported language pair: {source_lang} -> {target_lang}")

        translator = self.get_nllb_model(nllb_model)
        self.nllb_tokenizer.src_lang = src_lang_nllb

        translated = []
        for i, sentence in enumerate(self._split_sentences(text_to_translate)):
            try:
                tokenizer_output = self.nllb_tokenizer(sentence, return_tensors="pt", padding=True)
                source_ids = tokenizer_output["input_ids"].tolist()[0]
                source_tokens = self.nllb_tokenizer.convert_ids_to_tokens(source_ids)

                result = translator.translate_batch(
                    [source_tokens],
                    target_prefix=[[tgt_lang_nllb]],
                    beam_size=int(beam_size),
                    length_penalty=length_penalty,
                    repetition_penalty=repetition_penalty,
                    max_batch_size=32,
                )[0]

                # Drop the leading target-language token from the hypothesis.
                hypothesis = result.hypotheses[0]
                tgt_tokens = hypothesis[1:] if len(hypothesis) > 1 else hypothesis

                translated.append(self.nllb_tokenizer.decode(
                    self.nllb_tokenizer.convert_tokens_to_ids(tgt_tokens),
                    skip_special_tokens=True,
                ))
            except Exception as e:
                print(f"Error translating sentence {i+1}: {e}")
                translated.append(f"[Translation error for segment {i+1}]")
            yield " ".join(translated).strip()

        elapsed_time = time.time() - start_time
        print(f"Translation completed in {elapsed_time:.2f}s with NLLB-{nllb_model}")

        yield " ".join(translated).strip()

    def synthesize(self, text, target_lang, output_file="output.wav", speaking_rate=1.0):
        """Synthesize text to a 16-bit WAV file.

        Args:
            text: Text to speak; processed in sentence-based chunks.
            target_lang: Key of self.tts_models.
            output_file: Path of the WAV file to write.
            speaking_rate: Values above 1 shorten the audio. Implemented by
                resampling each chunk, which shifts pitch along with speed.

        Returns:
            (output_file, audio_duration_in_seconds). When no chunk can be
            synthesized, 16000 zero samples are written instead.

        Raises:
            ValueError: target_lang has no loaded TTS model, or speaking_rate
                is not positive.
        """
        print(f"\n3. Synthesizing speech in {target_lang}...")
        start_time = time.time()

        if target_lang not in self.tts_models:
            raise ValueError(f"TTS for language {target_lang} not supported")
        if speaking_rate <= 0:
            raise ValueError(f"speaking_rate must be positive, got {speaking_rate}")

        model = self.tts_models[target_lang]
        tokenizer = self.tts_tokenizers[target_lang]

        text_chunks = self._chunk_text_for_tts(text)
        print(f"Text split into {len(text_chunks)} chunks for TTS")

        all_audio = []
        for i, chunk in enumerate(text_chunks):
            try:
                inputs = tokenizer(text=chunk, return_tensors="pt")
                inputs = {k: v.to(self.device) for k, v in inputs.items()}

                # Fixed per-chunk seed keeps repeated runs reproducible.
                torch.manual_seed(555 + i)

                with torch.no_grad():
                    output = model(**inputs).waveform

                # reshape(-1) always gives a 1-D array, even for one sample.
                chunk_audio = output.cpu().float().numpy().reshape(-1)
                if chunk_audio.size == 0:
                    continue

                if speaking_rate != 1.0:
                    from scipy.signal import resample
                    new_length = max(1, int(len(chunk_audio) / speaking_rate))
                    chunk_audio = resample(chunk_audio, new_length)

                all_audio.append(chunk_audio)
            except Exception as e:
                # Best effort: a failed chunk is skipped, the rest is kept.
                print(f"Error generating speech for chunk {i+1}: {e}")

        if all_audio:
            audio_data = np.concatenate(all_audio).astype(np.float32, copy=False)
        else:
            audio_data = np.zeros(16000, dtype=np.float32)

        # Peak-normalize, then convert to 16-bit PCM.
        peak = np.max(np.abs(audio_data))
        if peak > 0:
            audio_data = audio_data / peak
        audio_data_int16 = (audio_data * 32767).astype(np.int16)

        sampling_rate = model.config.sampling_rate
        scipy.io.wavfile.write(output_file, rate=sampling_rate, data=audio_data_int16)

        elapsed_time = time.time() - start_time
        audio_duration = len(audio_data) / sampling_rate
        print(f"Speech synthesis completed in {elapsed_time:.2f}s")
        if audio_duration > 0:
            print(f"Generated {audio_duration:.2f}s of audio (RTF: {elapsed_time/audio_duration:.2f}x)")

        return output_file, audio_duration

    def process_speech_to_speech_realtime(self, audio_file, source_lang, target_lang,
                                          whisper_model="tiny", nllb_model="600M",
                                          whisper_beam_size=5, whisper_temperature=0.0,
                                          vad_filter=False, nllb_beam_size=4,
                                          length_penalty=1.0, repetition_penalty=1.0,
                                          speaking_rate=1.0, output_file=None):
        """Run transcription, translation and synthesis, yielding progress.

        Yields:
            (status, transcript, translation, output_path) tuples. output_path
            is None until the final tuple of a successful run. On failure the
            results of stages that already completed are kept; only the failed
            and later stages are replaced by a failure message.

        When output_file is None, a uniquely named temporary WAV file is
        created just before synthesis so concurrent requests cannot overwrite
        each other's audio.
        """
        print("\n===== ENHANCED SPEECH-TO-SPEECH TRANSLATION =====")
        print(f"Models: Whisper-{whisper_model}, NLLB-{nllb_model}")
        print(f"Languages: {source_lang} -> {target_lang}")

        total_start_time = time.time()

        current_transcript = ""
        current_translation = ""
        output_path = None
        audio_duration = 0
        success = False
        stage = "transcription"

        try:
            # Step 1: Transcribe
            yield "🎤 Transcribing audio...", "", "", None
            for partial_transcript, _language in self.transcribe_realtime(
                audio_file, source_lang, whisper_model, vad_filter,
                whisper_beam_size, whisper_temperature
            ):
                current_transcript = partial_transcript
                yield "🎤 Transcribing audio...", current_transcript, current_translation, None

            # Step 2: Translate
            stage = "translation"
            yield "🔄 Translating text...", current_transcript, current_translation, None
            for partial_translation in self.translate_realtime(
                current_transcript, source_lang, target_lang, nllb_model,
                nllb_beam_size, length_penalty, repetition_penalty
            ):
                current_translation = partial_translation
                yield "🔄 Translating text...", current_transcript, current_translation, None

            # Step 3: Synthesize
            stage = "synthesis"
            yield "🔊 Synthesizing speech...", current_transcript, current_translation, None
            if output_file is None:
                import tempfile
                fd, output_file = tempfile.mkstemp(
                    prefix=f"output_{source_lang}_to_{target_lang}_", suffix=".wav"
                )
                os.close(fd)
            output_path, audio_duration = self.synthesize(
                current_translation, target_lang, output_file, speaking_rate
            )

            success = True

        except Exception as e:
            print(f"ERROR in pipeline during {stage}: {e}")
            import traceback
            traceback.print_exc()
            # Keep whatever earlier stages produced; flag only what is missing.
            if stage == "transcription":
                current_transcript = "❌ Transcription failed"
            if stage in ("transcription", "translation"):
                current_translation = "❌ Translation failed"
            output_path = None

        total_elapsed_time = time.time() - total_start_time

        if success:
            status = (f"✅ Success! Total time: {total_elapsed_time:.2f}s, "
                      f"Audio: {audio_duration:.2f}s")
        else:
            status = f"❌ Processing failed during {stage}"

        print(f"\n===== TRANSLATION {'COMPLETED' if success else 'FAILED'} =====")

        yield status, current_transcript, current_translation, output_path
522
+
523
def create_enhanced_gradio_interface():
    """Create enhanced Gradio interface with model selection and advanced options.

    Builds one EnhancedS2SPipeline and wires it to a gr.Blocks layout: audio
    and language inputs, model pickers, advanced sliders, and status, text and
    audio outputs that update while the pipeline generator yields.

    Returns:
        The gr.Blocks application (not yet launched).
    """
    # Initialize pipeline
    pipeline = EnhancedS2SPipeline()

    def get_model_info(model_type, model_name):
        """Return the Markdown summary line for one configured model."""
        config = pipeline.model_configs[model_type][model_name]
        if model_type == "whisper":
            return f"**{model_name.upper()}** - Size: {config['size']}, Speed: {config['speed']}, Accuracy: {config['accuracy']}"
        return f"**{model_name}** - {config['size']}, Speed: {config['speed']}, Accuracy: {config['accuracy']}"

    def process_audio_enhanced(audio_file, source_lang_str, target_lang_str,
                               whisper_model, nllb_model, whisper_beam_size,
                               whisper_temperature, vad_filter, nllb_beam_size,
                               length_penalty, repetition_penalty, speaking_rate):
        """Stream (status, transcript, translation, audio path) updates to the UI."""
        if audio_file is None:
            yield "❌ No audio provided", "No transcript available", "No translation available", None
            return

        yield from pipeline.process_speech_to_speech_realtime(
            audio_file=audio_file,
            source_lang=source_lang_str,
            target_lang=target_lang_str,
            whisper_model=whisper_model,
            nllb_model=nllb_model,
            # Sliders may deliver floats; beam sizes must be integers.
            whisper_beam_size=int(whisper_beam_size),
            whisper_temperature=whisper_temperature,
            vad_filter=vad_filter,
            nllb_beam_size=int(nllb_beam_size),
            length_penalty=length_penalty,
            repetition_penalty=repetition_penalty,
            speaking_rate=speaking_rate,
        )

    # Sample rows are offered only when the examples directory exists
    if os.path.exists("./examples"):
        example_rows = [
            ["./examples/input_audio/eng1.wav", "English", "French", "tiny", "600M"],
            ["./examples/input_audio/fr1.wav", "French", "English", "tiny", "600M"],
            ["./examples/input_audio/eng2.wav", "English", "French", "base", "600M"],
        ]
    else:
        example_rows = []

    # Create the interface
    with gr.Blocks(title="Enhanced Speech-to-Speech Translation", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🎙️ Enhanced Speech-to-Speech Translation")
        gr.Markdown("Advanced AI-powered speech translation with configurable models and parameters.")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📥 Input Configuration")

                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="🎵 Upload or Record Audio"
                )

                with gr.Row():
                    source_lang = gr.Radio(
                        choices=["English", "French"],
                        value="English",
                        label="📢 Source Language"
                    )
                    target_lang = gr.Radio(
                        choices=["English", "French"],
                        value="French",
                        label="🎯 Target Language"
                    )

                gr.Markdown("### 🧠 Model Selection")

                with gr.Accordion("🎤 Whisper ASR Model", open=True):
                    whisper_model = gr.Radio(
                        choices=["tiny", "base", "small", "medium"],
                        value="tiny",
                        label="Model Size"
                    )
                    whisper_info = gr.Markdown(get_model_info("whisper", "tiny"))

                with gr.Accordion("🔄 NLLB Translation Model", open=True):
                    nllb_model = gr.Radio(
                        choices=["600M", "1.3B"],
                        value="600M",
                        label="Model Size"
                    )
                    nllb_info = gr.Markdown(get_model_info("nllb", "600M"))

                with gr.Accordion("⚙️ Advanced Settings", open=False):
                    gr.Markdown("**Whisper Parameters**")
                    whisper_beam_size = gr.Slider(1, 10, value=5, step=1, label="Beam Size")
                    whisper_temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature")
                    vad_filter = gr.Checkbox(label="Voice Activity Detection", value=False)

                    gr.Markdown("**Translation Parameters**")
                    nllb_beam_size = gr.Slider(1, 8, value=4, step=1, label="Beam Size")
                    length_penalty = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Length Penalty")
                    repetition_penalty = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Repetition Penalty")

                    gr.Markdown("**Speech Synthesis**")
                    speaking_rate = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speaking Rate")

                process_btn = gr.Button("🚀 Translate", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown("### 📤 Results")

                status_output = gr.Textbox(label="📊 Status", interactive=False)

                with gr.Tabs():
                    with gr.TabItem("📝 Text Results"):
                        transcript_output = gr.Textbox(
                            label="🎤 Original Transcript",
                            lines=6,
                            interactive=False
                        )
                        translation_output = gr.Textbox(
                            label="🔄 Translation",
                            lines=6,
                            interactive=False
                        )

                    with gr.TabItem("🔊 Audio Output"):
                        audio_output = gr.Audio(
                            type="filepath",
                            label="🔊 Translated Speech"
                        )

        # Example section
        with gr.Row():
            gr.Markdown("### 🎵 Try Our Examples")
        with gr.Row():
            gr.Examples(
                examples=example_rows,
                inputs=[audio_input, source_lang, target_lang, whisper_model, nllb_model],
                label="Sample Audio Files"
            )

        # Refresh the model summary whenever a different size is selected
        whisper_model.change(
            lambda name: get_model_info("whisper", name), whisper_model, whisper_info
        )
        nllb_model.change(
            lambda name: get_model_info("nllb", name), nllb_model, nllb_info
        )

        # Main processing function
        process_btn.click(
            fn=process_audio_enhanced,
            inputs=[
                audio_input, source_lang, target_lang, whisper_model, nllb_model,
                whisper_beam_size, whisper_temperature, vad_filter,
                nllb_beam_size, length_penalty, repetition_penalty, speaking_rate
            ],
            outputs=[status_output, transcript_output, translation_output, audio_output]
        )

        # Information sections
        with gr.Accordion("📚 Model Information", open=False):
            gr.Markdown("""
            ### 🎤 Whisper Models (OpenAI)
            - **Tiny**: Fastest, smallest model. Good for quick transcription.
            - **Base**: Balanced speed and accuracy. Recommended for most use cases.
            - **Small**: Better accuracy, moderate speed. Good for important content.
            - **Medium**: High accuracy, slower processing. Professional applications.

            ### 🔄 NLLB Models (Meta)
            - **600M**: Faster translation with good quality. Supports 200+ languages.
            - **1.3B**: Better translation quality with more parameters. Higher accuracy.

            ### 🔊 MMS-TTS (Meta)
            - High-quality multilingual text-to-speech synthesis
            - Supports natural-sounding voice generation
            - Optimized for English and French
            """)

        with gr.Accordion("⚙️ Parameter Guide", open=False):
            gr.Markdown("""
            ### Whisper Parameters
            - **Beam Size**: Higher values = better accuracy, slower processing (1-10)
            - **Temperature**: Higher values = more diverse outputs (0.0-1.0)
            - **VAD Filter**: Removes silence automatically (may require additional dependencies)

            ### Translation Parameters
            - **Beam Size**: Search breadth for translation (1-8)
            - **Length Penalty**: Controls output length preference (0.5-2.0)
            - **Repetition Penalty**: Reduces repetitive translations (0.5-2.0)

            ### Speech Synthesis
            - **Speaking Rate**: Playback speed multiplier (0.5-2.0)
            """)

        with gr.Accordion("🔧 Usage Instructions", open=False):
            gr.Markdown("""
            1. **Upload/Record**: Add your audio file or record directly
            2. **Select Languages**: Choose source and target languages
            3. **Choose Models**: Select model sizes based on your speed/quality needs
            4. **Adjust Settings**: Fine-tune advanced parameters if needed
            5. **Translate**: Click the translate button and watch real-time progress
            6. **Download**: Save the translated audio file

            **Tips:**
            - Use smaller models for faster processing
            - Use larger models for better quality
            - Adjust beam sizes for quality vs speed trade-off
            - Speaking rate can make output faster or slower
            """)

    return demo
735
+
736
# Script entry point: build the Gradio app, keep it bound to `demo`, and serve it
if __name__ == "__main__":
    (demo := create_enhanced_gradio_interface()).launch()