Rafii committed on
Commit 29c2ac6 · 1 Parent(s): d98ca01
Files changed (2)
  1. app.py +473 -0
  2. requirements.txt +21 -0
app.py ADDED
@@ -0,0 +1,473 @@
+ import gradio as gr
+ import tempfile
+ import os
+ import shutil
+ from moviepy.editor import VideoFileClip, AudioFileClip
+ from faster_whisper import WhisperModel
+ import torch
+ import torchaudio as ta
+ import torchaudio.transforms as transforms
+ from chatterbox import ChatterboxMultilingualTTS
+ import logging
+ from typing import List, Dict
+ from deep_translator import GoogleTranslator
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ # Configuration
+ DEVICE = "cpu"
+ COMPUTE_TYPE = "int8"
+
+ # Set temp directory to a writable location
+ os.environ['TMPDIR'] = '/tmp'
+ tempfile.tempdir = '/tmp'
+
+ # Patch torch.load to force CPU mapping
+ torch_load_orig = torch.load
+ def torch_load_cpu(*args, **kwargs):
+     kwargs["map_location"] = torch.device("cpu")
+     return torch_load_orig(*args, **kwargs)
+ torch.load = torch_load_cpu
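+ # Presumably needed because some downloaded checkpoints were saved from a GPU
+ # session; without map_location="cpu", torch.load would fail to deserialize
+ # CUDA tensors on this CPU-only host.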
+
+ # Global models (loaded once)
+ whisper_model = None
+ tts_model = None
+
+ # ==================== Model Loading ====================
+
+ def load_models():
+     """Load models on startup"""
+     global whisper_model, tts_model
+
+     if whisper_model is None:
+         logger.info("Loading Whisper model...")
+         whisper_model = WhisperModel(
+             "large-v3",
+             device=DEVICE,
+             compute_type=COMPUTE_TYPE,
+             cpu_threads=4
+         )
+         logger.info("✅ Whisper model loaded!")
+
+     if tts_model is None:
+         logger.info("Loading TTS model...")
+         tts_model = ChatterboxMultilingualTTS.from_pretrained(device="cpu")
+         logger.info("✅ TTS model loaded!")
+
+     return whisper_model, tts_model
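+ # The models live at module scope, so repeated Gradio calls reuse the same
+ # instances instead of reloading weights on every request.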
+
+ # ==================== TTS Processing ====================
+
+ def generate_translated_audio(
+     reference_audio_path: str,
+     segments: List[Dict],
+     output_path: str,
+     tts_model,
+     progress=gr.Progress(),
+     silence_duration: float = 0.5,
+     target_language: str = "en"
+ ) -> str:
+     """Generate translated audio using Chatterbox TTS with progress updates"""
+
+     try:
+         progress(0, desc=f"Generating TTS for {len(segments)} segments...")
+
+         all_wavs = []
+         silence_samples = int(silence_duration * tts_model.sr)
+         silence = torch.zeros(1, silence_samples)
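+         # Note: this pre-built silence tensor appears unused below; the
+         # inter-segment gaps are instead sized from the transcript timestamps.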
+
+         total_segments = len(segments)
+
+         for counter, segment in enumerate(segments):
+             # Update progress
+             prog = (counter + 1) / total_segments
+             text_preview = segment['translated_text'][:50]
+             progress(prog, desc=f"Processing segment {counter + 1}/{total_segments}: {text_preview}...")
+
+             original_duration = segment['end'] - segment['start']
+
+             logger.info(f"Generating audio for text: {segment['translated_text']}")
+
+             # Generate audio for this segment
+             wav = tts_model.generate(
+                 segment['translated_text'],
+                 target_language,
+                 audio_prompt_path=reference_audio_path,
+                 exaggeration=0.2,
+                 cfg_weight=0.8,
+                 temperature=0.4,
+                 repetition_penalty=1.2,
+                 min_p=0.05,
+                 top_p=0.9
+             )
+
+             generated_duration = wav.shape[-1] / tts_model.sr
+
+             # Add leading silence for the first segment (from 0.0 to segment start)
+             if counter == 0 and segment['start'] > 0:
+                 leading_silence_duration = segment['start']
+                 leading_silence_samples = int(leading_silence_duration * tts_model.sr)
+                 leading_silence = torch.zeros((wav.shape[0], leading_silence_samples), dtype=wav.dtype, device=wav.device)
+                 all_wavs.append(leading_silence)
+
+             # Handle duration matching
+             if generated_duration < original_duration:
+                 # Generated audio is shorter - add it as is
+                 all_wavs.append(wav)
+
+                 # Add trailing silence to match original segment duration
+                 trailing_silence_duration = original_duration - generated_duration
+                 trailing_silence_samples = int(trailing_silence_duration * tts_model.sr)
+                 if trailing_silence_samples > 0:
+                     trailing_silence = torch.zeros((wav.shape[0], trailing_silence_samples), dtype=wav.dtype, device=wav.device)
+                     all_wavs.append(trailing_silence)
+
+             elif generated_duration > original_duration:
+                 # Generated audio is longer - speed it up to fit
+                 speed_factor = generated_duration / original_duration
+                 speed_transform = transforms.Speed(tts_model.sr, speed_factor)
+                 wav_adjusted, _ = speed_transform(wav)
+                 all_wavs.append(wav_adjusted)
+
+             else:
+                 # Duration matches perfectly
+                 all_wavs.append(wav)
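+             # Worked example: a 4.0 s generated clip for a 2.5 s original slot
+             # gives speed_factor = 4.0 / 2.5 = 1.6, and playback at 1.6x
+             # compresses it back to roughly 2.5 s.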
+
+             # Add silence between segments (not after the last segment)
+             if counter < len(segments) - 1:
+                 next_segment = segments[counter + 1]
+                 gap_duration = next_segment['start'] - segment['end']
+
+                 if gap_duration > 0:
+                     gap_samples = int(gap_duration * tts_model.sr)
+                     gap_silence = torch.zeros((wav.shape[0], gap_samples), dtype=wav.dtype, device=wav.device)
+                     all_wavs.append(gap_silence)
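+             # Since each segment is padded or sped up to its original length
+             # and the gaps are rebuilt from the timestamps, the concatenated
+             # track stays roughly aligned with the source video's timeline.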
+
+         # Save output
+         progress(0.95, desc="Combining audio segments...")
+         combined_wav = torch.cat(all_wavs, dim=-1)
+         ta.save(output_path, combined_wav, tts_model.sr)
+
+         total_duration = combined_wav.shape[-1] / tts_model.sr
+         logger.info(f"TTS completed! Total duration: {total_duration:.2f}s")
+
+         progress(1.0, desc="TTS generation completed!")
+
+         return output_path
+
+     except Exception as e:
+         logger.exception("Error generating TTS audio")
+         raise
+
+ # ==================== Helper Functions ====================
+
+ def audio_extractor(video_path):
+     """Extract audio from video"""
+     video_clip = VideoFileClip(video_path)
+     audio_clip = video_clip.audio
+
+     temp_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False, dir='/tmp')
+     full_audio_path = temp_file.name
+     temp_file.close()
+
+     audio_clip.write_audiofile(full_audio_path, codec='pcm_s16le', logger=None)
+     audio_clip.close()
+     video_clip.close()
+     return full_audio_path
+
+ def transcribe(full_audio_path, whisper_model, progress=None):
+     """Transcribe audio using faster-whisper"""
+     if progress:
+         progress(0, desc="Transcribing audio...")
+
+     # faster-whisper transcription
+     segments_generator, info = whisper_model.transcribe(
+         full_audio_path,
+         beam_size=5,
+         word_timestamps=True,
+         vad_filter=False,
+         # vad_parameters=dict(min_silence_duration_ms=500)
+     )
+
+     detected_language = info.language
+
+     if progress:
+         progress(0, desc=f"Detected language: {detected_language}")
+
+     # Convert generator to list and format segments
+     segments = []
+     for segment in segments_generator:
+         seg_dict = {
+             "start": segment.start,
+             "end": segment.end,
+             "text": segment.text.strip(),
+             "words": []
+         }
+
+         # Add word-level timestamps if available
+         if segment.words:
+             for word in segment.words:
+                 seg_dict["words"].append({
+                     "word": word.word,
+                     "start": word.start,
+                     "end": word.end
+                 })
+
+         segments.append(seg_dict)
+
+     result = {
+         "segments": segments,
+         "language": detected_language,
+         "language_code": detected_language
+     }
+
+     if progress:
+         progress(0, desc=f"Transcribed {len(segments)} segments")
+
+     return result
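+ # The returned dict is shaped roughly like (illustrative values only):
+ #   {"segments": [{"start": 0.0, "end": 2.4, "text": "...", "words": [...]}],
+ #    "language": "en", "language_code": "en"}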
+
+ def translate_segments(segments: List[Dict], target_lang: str) -> List[Dict]:
+     """Translate segments to target language using deep-translator"""
+     results = []
+     translator = GoogleTranslator(source='auto', target=target_lang)
+     for seg in segments:
+         clean_seg = {k: v for k, v in seg.items() if k != "words"}
+
+         if not clean_seg["text"] or clean_seg["text"].isspace():
+             translated_text = ""
+         else:
+             translated_text = translator.translate(clean_seg["text"])
+
+         clean_seg["translated_text"] = translated_text
+         results.append(clean_seg)
+     return results
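+ # Hypothetical example: translate_segments([{"start": 0.0, "end": 2.4,
+ # "text": "Hello there"}], "es") keeps the timestamps and adds a
+ # "translated_text" key (exact wording depends on Google Translate).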
+
+ def replace_video_audio(video_path, new_audio_path, output_video_path):
+     """Replace video audio with proper temp file handling"""
+     # Point MoviePy at the system ffmpeg binary
+     os.environ['FFMPEG_BINARY'] = 'ffmpeg'
+
+     video_clip = VideoFileClip(video_path)
+     new_audio_clip = AudioFileClip(new_audio_path)
+
+     video_duration = video_clip.duration
+     audio_duration = new_audio_clip.duration
+
+     if audio_duration < video_duration:
+         final_video = video_clip.subclip(0, audio_duration)
+         final_audio = new_audio_clip
+     elif audio_duration > video_duration:
+         final_video = video_clip
+         final_audio = new_audio_clip.subclip(0, video_duration)
+     else:
+         final_video = video_clip
+         final_audio = new_audio_clip
+
+     final_clip = final_video.set_audio(final_audio)
+
+     # Write with explicit temp audiofile location
+     final_clip.write_videofile(
+         output_video_path,
+         codec='libx264',
+         audio_codec='aac',
+         temp_audiofile=f'/tmp/temp-audio-{os.getpid()}.m4a',
+         remove_temp=True,
+         logger=None
+     )
+
+     video_clip.close()
+     new_audio_clip.close()
+     final_audio.close()
+     final_video.close()
+     final_clip.close()
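+ # Design note: the longer of the two streams is trimmed to the shorter one,
+ # which keeps audio and video durations equal at the cost of dropping a
+ # little material from the end.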
+
+ def format_transcription(transcription, translated_segments):
+     """Format transcription for display"""
+     output = ""
+     for i, seg in enumerate(translated_segments):
+         output += f"**Segment {i+1}** ({seg['start']:.2f}s - {seg['end']:.2f}s)\n"
+         output += f"*Original:* {transcription['segments'][i]['text']}\n"
+         output += f"*Translated:* {seg['translated_text']}\n"
+         output += "---\n"
+     return output
+
+ # ==================== Main Processing Function ====================
+
+ def process_video(video_file, target_language, progress=gr.Progress()):
+     """Main processing function for Gradio"""
+     if video_file is None:
+         return None, "Please upload a video file.", ""
+
+     temp_dir = tempfile.mkdtemp(dir='/tmp')
+
+     try:
+         # Load models
+         progress(0.05, desc="Loading models...")
+         whisper_mdl, tts_mdl = load_models()
+
+         # Copy uploaded video to temp directory
+         input_video_path = os.path.join(temp_dir, "input_video.mp4")
+         shutil.copy(video_file, input_video_path)
+
+         # Extract audio
+         progress(0.1, desc="Extracting audio from video...")
+         audio_path = audio_extractor(input_video_path)
+
+         # Transcribe
+         progress(0.2, desc="Transcribing audio...")
+         transcription = transcribe(audio_path, whisper_mdl, progress)
+         status_msg = f"✅ Transcribed {len(transcription['segments'])} segments\n"
+
+         # Translate
+         progress(0.4, desc="Translating segments...")
+         translated_segments = translate_segments(transcription['segments'], target_language)
+         status_msg += f"✅ Translated {len(translated_segments)} segments\n"
+
+         # Generate TTS
+         progress(0.5, desc="Generating voice-cloned audio...")
+         output_audio_path = os.path.join(temp_dir, "translated_audio.wav")
+
+         generate_translated_audio(
+             reference_audio_path=audio_path,
+             segments=translated_segments,
+             output_path=output_audio_path,
+             tts_model=tts_mdl,
+             progress=progress,
+             silence_duration=0.5,
+             target_language=target_language
+         )
+         status_msg += "✅ TTS audio generated successfully!\n"
+
+         # Merge audio with video
+         progress(0.9, desc="Merging audio with video...")
+         output_video_path = os.path.join(temp_dir, "translated_video.mp4")
+         replace_video_audio(input_video_path, output_audio_path, output_video_path)
+
+         status_msg += "✅ Video translation completed successfully!"
+
+         # Format transcription
+         transcription_text = format_transcription(transcription, translated_segments)
+
+         progress(1.0, desc="Complete!")
+
+         return output_video_path, status_msg, transcription_text
+
+     except Exception as e:
+         logger.exception("Error in translation pipeline")
+         return None, f"❌ Error: {str(e)}", ""
+
+     finally:
+         # Clean up audio file if it exists
+         try:
+             if 'audio_path' in locals() and os.path.exists(audio_path):
+                 os.remove(audio_path)
+         except OSError:
+             pass
+
+ # ==================== Gradio Interface ====================
+
+ def create_interface():
+     """Create Gradio interface"""
+
+     with gr.Blocks(title="Video Voice Translator", theme=gr.themes.Soft()) as demo:
+         gr.Markdown(
+             """
+             # 🎬 Video Voice Translator
+             Upload a video, and we'll translate it to your target language while preserving the voice!
+             """
+         )
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("### 📤 Upload Video")
+                 video_input = gr.Video(label="Choose a video file")
+
+                 gr.Markdown("### ⚙️ Configuration")
+                 target_language = gr.Dropdown(
+                     choices=[
+                         ("English", "en"),
+                         ("Hindi", "hi"),
+                         ("Spanish", "es"),
+                         ("French", "fr"),
+                         ("German", "de"),
+                         ("Italian", "it"),
+                         ("Portuguese", "pt"),
+                         ("Russian", "ru"),
+                         ("Japanese", "ja"),
+                         ("Korean", "ko"),
+                         ("Chinese (Simplified)", "zh-cn"),
+                     ],
+                     value="en",
+                     label="Target Language",
+                     type="value"
+                 )
+
+                 translate_btn = gr.Button("🚀 Start Translation", variant="primary", size="lg")
+
+                 gr.Markdown(
+                     """
+                     ### About
+                     This app uses:
+                     - **faster-whisper** for transcription
+                     - **Google Translate** for translation
+                     - **Chatterbox** for voice cloning TTS
+
+                     Transcription and voice cloning run locally in this app; translation calls the Google Translate service.
+                     """
+                 )
+
+             with gr.Column(scale=1):
+                 gr.Markdown("### 📥 Output")
+                 status_output = gr.Textbox(label="Status", lines=5, interactive=False)
+                 video_output = gr.Video(label="Translated Video")
+
+                 with gr.Accordion("📝 View Transcription & Translation", open=False):
+                     transcription_output = gr.Markdown()
+
+         # Connect the button to the processing function
+         translate_btn.click(
+             fn=process_video,
+             inputs=[video_input, target_language],
+             outputs=[video_output, status_output, transcription_output]
+         ).then(
+             fn=lambda: gr.Button(interactive=True),
+             outputs=[translate_btn]
+         )
+
+         # Disable button while processing (queue=False so it fires immediately)
+         translate_btn.click(
+             fn=lambda: gr.Button(interactive=False),
+             outputs=[translate_btn],
+             queue=False
+         )
+
+         gr.Markdown(
+             """
+             ---
+             **Note:** Processing time depends on video length and number of segments.
+             Large videos may take several minutes to process.
+             """
+         )
+
+     return demo
+
+ # ==================== Main ====================
+
+ if __name__ == "__main__":
+     # Load models at startup
+     logger.info("Initializing models...")
+     load_models()
+     logger.info("Models loaded successfully!")
+
+     # Create and launch interface
+     demo = create_interface()
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False
+     )
requirements.txt ADDED
@@ -0,0 +1,21 @@
+ # Core dependencies
+ numpy==1.25.2
+ Cython
+
+ # Audio/Video processing
+ ffmpeg-python
+ imageio-ffmpeg
+ moviepy==1.0.3
+
+ # PyTorch
+ torch
+ torchaudio
+
+ # Translation and transcription
+ deep-translator
+ faster-whisper
+ librosa
+ numba
+
+ # UI
+ gradio
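+
+ # Note: app.py also imports chatterbox (ChatterboxMultilingualTTS), which is
+ # not listed here; presumably it is installed separately in the Space's
+ # environment (e.g. via the chatterbox-tts package).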