dooyum committed (verified)
Commit 63ffd97 · 1 Parent(s): 65f0f6f

Update app.py

Files changed (1): app.py (+107 -346)

app.py CHANGED
@@ -2,7 +2,6 @@ import os
 import sys
 import subprocess
 import tempfile
-import logging
 from pathlib import Path
 from dotenv import load_dotenv
 import whisper
@@ -10,11 +9,11 @@ import gradio as gr
 import azure.cognitiveservices.speech as speechsdk
 import requests
 from pydub import AudioSegment
+from pydub.utils import make_chunks
 import shutil
 import io
 import asyncio
 import json
-from langdetect import detect
 
 # Limit OMP threads (fix libgomp issue)
 os.environ["OMP_NUM_THREADS"] = os.getenv("OMP_NUM_THREADS", "1")
@@ -42,9 +41,6 @@ if not AZURE_REGION:
 if missing:
     sys.exit(f"❌ Missing environment variables: {', '.join(missing)}")
 
-# Setup logging
-logging.basicConfig(filename="dubbing.log", level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-
 # --- Language map ---
 LANGUAGE_MAP = {
     "French": "fr",
@@ -56,34 +52,6 @@ LANGUAGE_MAP = {
     "Spanish": "es",
     "Polish": "pl",
     "Arabic": "ar",
-    "Chinese (Mandarin, Simplified)": "zh-Hans",
-    "Chinese (Mandarin, Traditional)": "zh-Hant",
-    "Czech": "cs",
-    "Danish": "da",
-    "English (US)": "en",
-    "English (UK)": "en",
-    "Estonian": "et",
-    "Finnish": "fi",
-    "Greek": "el",
-    "Hebrew": "he",
-    "Hindi": "hi",
-    "Hungarian": "hu",
-    "Indonesian": "id",
-    "Korean": "ko",
-    "Latvian": "lv",
-    "Lithuanian": "lt",
-    "Malay": "ms",
-    "Norwegian": "nb",
-    "Portuguese (Brazil)": "pt",
-    "Portuguese (Portugal)": "pt-pt",
-    "Romanian": "ro",
-    "Russian": "ru",
-    "Slovak": "sk",
-    "Slovenian": "sl",
-    "Thai": "th",
-    "Turkish": "tr",
-    "Ukrainian": "uk",
-    "Vietnamese": "vi",
 }
 
 # --- Helper function for SRT formatting ---
@@ -96,193 +64,51 @@ def _format_time(seconds):
     return f"{h:02}:{m:02}:{s:02},{ms:03}"
 
 # --- Async TTS helper function ---
-async def _synthesize_tts_async(speech_config, text, line_index):
-    """Synthesize TTS with better error handling and debugging"""
-    try:
-        # Create audio config to capture output
-        audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=False)
-        synthesizer = speechsdk.SpeechSynthesizer(
-            speech_config=speech_config,
-            audio_config=audio_config
-        )
-
-        # Log what we're trying to synthesize
-        logging.info(f"Line {line_index+1}: Attempting TTS for text: '{text}'")
-        print(f"🔊 Line {line_index+1}: Synthesizing: '{text}'")
-
-        # Use synchronous call within executor
-        loop = asyncio.get_running_loop()
-        result = await loop.run_in_executor(
-            None,
-            lambda: synthesizer.speak_text_async(text).get()
-        )
-
-        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
-            audio_data = result.audio_data
-            if not audio_data:
-                error_msg = f"Line {line_index+1}: Empty audio data returned"
-                logging.error(error_msg)
-                print(f"❌ {error_msg}")
-                # Fallback: create silent audio of estimated length
-                estimated_duration = len(text.split()) * 0.3  # 0.3s per word
-                return AudioSegment.silent(duration=int(estimated_duration * 1000))
-
-            logging.info(f"Line {line_index+1}: TTS synthesis successful")
-            print(f"✅ Line {line_index+1}: TTS synthesis successful")
-
-            # Convert to AudioSegment
-            try:
-                # Save to temp file for pydub processing
-                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-                    temp_wav.write(audio_data)
-                    temp_wav_path = temp_wav.name
-
-                # Load with pydub
-                audio_segment = AudioSegment.from_wav(temp_wav_path)
-
-                # Clean up temp file
-                os.unlink(temp_wav_path)
-
-                return audio_segment
-            except Exception as e:
-                error_msg = f"Line {line_index+1}: Failed to convert audio data: {str(e)}"
-                logging.error(error_msg)
-                print(f"❌ {error_msg}")
-                # Fallback: create silent audio of estimated length
-                estimated_duration = len(text.split()) * 0.3
-                return AudioSegment.silent(duration=int(estimated_duration * 1000))
-
-        else:
-            cancellation_details = speechsdk.SpeechSynthesisCancellationDetails(result)
-            error_msg = f"Line {line_index+1}: TTS failed - Reason: {cancellation_details.reason}, Error: {cancellation_details.error_details}"
-            logging.error(error_msg)
-            print(f"❌ {error_msg}")
-            # Fallback: create silent audio of estimated length
-            estimated_duration = len(text.split()) * 0.3
-            return AudioSegment.silent(duration=int(estimated_duration * 1000))
-
-    except Exception as e:
-        error_msg = f"Line {line_index+1}: TTS synthesis error: {str(e)}"
-        logging.error(error_msg)
-        print(f"❌ {error_msg}")
-        # Fallback: create silent audio of estimated length
-        estimated_duration = len(text.split()) * 0.3
-        return AudioSegment.silent(duration=int(estimated_duration * 1000))
-
-def translate_with_azure(texts, target_lang_code):
-    """Translate text using Azure Translator REST API"""
+async def _synthesize_tts_async(speech_config, text):
+    loop = asyncio.get_running_loop()
+    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
+
+    # Run blocking .get() in thread executor
+    result = await loop.run_in_executor(
+        None, lambda: synthesizer.speak_text_async(text).get()
+    )
+
+    if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
+        print(f"TTS synthesis failed with reason: {result.reason}")
+        return AudioSegment.silent(duration=1000)  # Return silent audio as fallback
+
+    audio_data = result.audio_data
+    if not audio_data:
+        print("No audio data received from TTS")
+        return AudioSegment.silent(duration=1000)
+
     try:
-        endpoint = f"https://{AZURE_TRANSLATOR_REGION}.api.cognitive.microsoft.com"
-        headers = {
-            "Ocp-Apim-Subscription-Key": AZURE_TRANSLATOR_KEY,
-            "Ocp-Apim-Subscription-Region": AZURE_TRANSLATOR_REGION,
-            "Content-Type": "application/json",
-            "Accept": "application/json"
-        }
-
-        # Prepare the request body
-        body = [{'text': text} for text in texts]
-
-        # Make the request
-        response = requests.post(
-            f"{endpoint}/translator/text/v3.0/translate?api-version=3.0&from=en&to={target_lang_code}",
-            headers=headers,
-            json=body
-        )
-        response.raise_for_status()
-
-        # Parse the response
-        result = response.json()
-        translated_texts = []
-
-        for item in result:
-            if 'translations' in item and len(item['translations']) > 0:
-                translated_texts.append(item['translations'][0]['text'])
-            else:
-                translated_texts.append("")  # Fallback for failed translations
+        # Save to temp file for pydub processing
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+            temp_wav.write(audio_data)
+            temp_wav_path = temp_wav.name
+
+        # Load with pydub
+        audio_segment = AudioSegment.from_wav(temp_wav_path)
 
-        return translated_texts
+        # Clean up temp file
+        os.unlink(temp_wav_path)
 
+        return audio_segment
     except Exception as e:
-        logging.error(f"Azure Translator error: {str(e)}")
-        print(f"❌ Azure Translator error: {str(e)}")
-        # Fallback: return original texts if translation fails
-        return texts
-
-def fix_outlier_lines(translated_lines, english_lines, target_lang_code):
-    """Fix lines that are not in the target language"""
-    corrected_lines = translated_lines.copy()
-    fixed_indices = []
-    expected_lang = target_lang_code.split("-")[0]
-
-    for i, line in enumerate(translated_lines):
-        if not line.strip() or len(line.strip()) < 3:
-            corrected_lines[i] = english_lines[i]  # Fallback for short/empty lines
-            logging.info(f"Line {i+1}: Used English fallback for short/empty line: {english_lines[i]}")
-            continue
+        print(f"Error processing TTS audio: {e}")
+        # Fallback: try to create silent audio of estimated length
        try:
-            detected_lang = detect(line)
-        except Exception:
-            detected_lang = None
-        if detected_lang != expected_lang:
-            # For Azure Translator, we'll retry the translation
-            retry_translation = translate_with_azure([english_lines[i]], target_lang_code)
-            if retry_translation and retry_translation[0]:
-                fixed_line = retry_translation[0].strip()
-                # Verify re-translated line
-                try:
-                    if detect(fixed_line) != expected_lang:
-                        logging.warning(f"Line {i+1}: Re-translation still not in {target_lang_code}: {fixed_line}")
-                        corrected_lines[i] = english_lines[i]  # Fallback to English
-                    else:
-                        corrected_lines[i] = fixed_line
-                        fixed_indices.append(i)
-                        logging.info(f"Line {i+1}: Fixed from {detected_lang} to {target_lang_code}: {fixed_line}")
-                except Exception:
-                    corrected_lines[i] = english_lines[i]  # Fallback to English
-                    logging.warning(f"Line {i+1}: Language detection failed for re-translated line: {fixed_line}")
-            else:
-                corrected_lines[i] = english_lines[i]  # Fallback to English
-                logging.warning(f"Line {i+1}: Re-translation failed for text: {english_lines[i]}")
-        else:
-            logging.info(f"Line {i+1}: Correct language detected: {detected_lang}")
-
-    return corrected_lines, fixed_indices
-
-def test_azure_tts_connection():
-    """Test Azure TTS connection before starting"""
-    try:
-        speech_config = speechsdk.SpeechConfig(subscription=AZURE_KEY, region=AZURE_REGION)
-        speech_config.speech_synthesis_voice_name = "en-US-JennyNeural"
-
-        synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
-        result = synthesizer.speak_text_async("Test connection").get()
-
-        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
-            print("✅ Azure TTS connection test successful")
-            logging.info("Azure TTS connection test successful")
-            return True
-        else:
-            print("❌ Azure TTS connection test failed")
-            logging.error("Azure TTS connection test failed")
-            return False
-    except Exception as e:
-        print(f"❌ Azure TTS connection error: {str(e)}")
-        logging.error(f"Azure TTS connection error: {str(e)}")
-        return False
+            estimated_duration = len(text.split()) * 0.3  # Rough estimate: 0.3s per word
+            return AudioSegment.silent(duration=int(estimated_duration * 1000))
+        except:
+            return AudioSegment.silent(duration=1000)
 
 # --- Main dubbing function ---
 async def dub_video(uploaded_video_path, target_lang_name, voice_gender):
-    print(f"🎬 Video={uploaded_video_path}, Lang={target_lang_name}, Voice={voice_gender}")
-    logging.info(f"Starting dubbing: video={uploaded_video_path}, lang={target_lang_name}, voice={voice_gender}")
-
-    # Test Azure TTS connection first
-    if not test_azure_tts_connection():
-        return None, None, "❌ Error: Azure TTS connection failed. Please check your Azure credentials and region."
-
+    print(f"Received inputs: video={uploaded_video_path}, lang={target_lang_name}, voice={voice_gender}")
     target_lang_code = LANGUAGE_MAP.get(target_lang_name)
    if not target_lang_code:
-        logging.error(f"Invalid language selected: {target_lang_name}")
         return None, None, "❌ Error: Invalid language selected."
 
     with tempfile.TemporaryDirectory() as temp_dir:
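
A note on the rewritten helper: it always resolves to a pydub AudioSegment, substituting silence when synthesis fails or returns no data, so the asyncio.gather call later in this diff never has to special-case errors. Running the blocking speak_text_async(...).get() inside run_in_executor is what lets several synthesis calls proceed concurrently. A minimal usage sketch, assuming the AZURE_KEY/AZURE_REGION values loaded above (the voice name and sample text are placeholders, not part of the commit):

    import asyncio
    import azure.cognitiveservices.speech as speechsdk

    async def demo():
        cfg = speechsdk.SpeechConfig(subscription=AZURE_KEY, region=AZURE_REGION)
        cfg.speech_synthesis_voice_name = "fr-FR-DeniseNeural"  # placeholder voice
        clip = await _synthesize_tts_async(cfg, "Bonjour tout le monde")
        print(len(clip))  # len() of an AudioSegment is its duration in milliseconds

    asyncio.run(demo())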
@@ -295,44 +121,67 @@ async def dub_video(uploaded_video_path, target_lang_name, voice_gender):
 
         shutil.copy(uploaded_video_path, video_in)
         print("🎧 Extracting audio...")
-        logging.info("Extracting audio from video")
         subprocess.run(["ffmpeg", "-y", "-i", video_in, "-ac", "1", "-ar", "16000", audio_wav])
 
         print("📝 Transcribing (Whisper)...")
-        logging.info("Transcribing audio with Whisper base model")
-        model = whisper.load_model("base")  # Using base for faster testing
+        model = whisper.load_model("large")
         result = model.transcribe(str(audio_wav), language="en")
         segments = result["segments"]
 
-        print(f"🌐 Translating to {target_lang_name} using Azure Translator...")
-        logging.info(f"Translating to {target_lang_name} using Azure Translator")
+        print(f"🌐 Translating to {target_lang_name}...")
         english_lines = [seg["text"].strip() for seg in segments]
-
-        # For testing, use simple translations or keep original
-        if not english_lines:
-            english_lines = ["Hello world", "This is a test"]
-
-        # Translate using Azure Translator
-        translated_lines = translate_with_azure(english_lines, target_lang_code)
-
+        translated_lines = []
+        endpoint = f"https://{AZURE_TRANSLATOR_REGION}.api.cognitive.microsoft.com"
+        headers = {
+            "Ocp-Apim-Subscription-Key": AZURE_TRANSLATOR_KEY,
+            "Ocp-Apim-Subscription-Region": AZURE_TRANSLATOR_REGION,
+            "Content-Type": "application/json",
+            "Accept": "application/json"
+        }
+        for line in english_lines:
+            if line:  # Only translate non-empty lines
+                body = [{"text": line}]
+                response = requests.post(
+                    f"{endpoint}/translator/text/v3.0/translate?api-version=3.0&from=en&to={target_lang_code}",
+                    headers=headers,
+                    json=body
+                )
+                if response.status_code == 200:
+                    translations = response.json()
+                    translated_text = translations[0]["translations"][0]["text"]
+                    translated_lines.append(translated_text)
+                else:
+                    print(f"Translation error: {response.status_code} - {response.text}")
+                    translated_lines.append(line)  # Fallback to original
+            else:
+                translated_lines.append("")
         print(f"Translated lines:\n{translated_lines}")
-        logging.info(f"Initial translation: {translated_lines}")
-
-        # Validate line count
-        if len(translated_lines) != len(english_lines):
-            logging.warning(f"Translation line count mismatch: got {len(translated_lines)}, expected {len(english_lines)}")
-            translated_lines = translated_lines[:len(english_lines)] + [""] * (len(english_lines) - len(translated_lines))
 
-        # --- Language detection + auto-fix ---
-        translated_lines, fixed_indices = fix_outlier_lines(
-            translated_lines=translated_lines,
-            english_lines=english_lines,
-            target_lang_code=target_lang_code
-        )
+        # --- LANGUAGE DETECTION + AUTO-CORRECTION ---
+        from langdetect import detect
+        for i, line in enumerate(translated_lines):
+            if line:  # Skip empty lines
+                detected_lang = detect(line)
+                if detected_lang != target_lang_code:
+                    print(f"⚠️ Warning: Detected {detected_lang}, correcting to {target_lang_code}...")
+                    try:
+                        body = [{"text": line}]
+                        response = requests.post(
+                            f"{endpoint}/translator/text/v3.0/translate?api-version=3.0&from={detected_lang}&to={target_lang_code}",
+                            headers=headers,
+                            json=body
+                        )
+                        if response.status_code == 200:
+                            translations = response.json()
+                            corrected_text = translations[0]["translations"][0]["text"]
+                            translated_lines[i] = corrected_text
+                            print(f"✅ Corrected: {corrected_text}")
+                        else:
+                            print(f"❌ Correction failed ({response.status_code}) - keeping original line.")
+                    except Exception as e:
+                        print(f"❌ Error correcting translation: {e}")
 
         print("🔊 Generating speech with Azure Neural TTS...")
-        logging.info(f"Generating TTS with voice: {voice_gender}")
-
         voice_map = {
             "fr": {"female": "fr-FR-DeniseNeural", "male": "fr-FR-HenriNeural"},
             "de": {"female": "de-DE-KatjaNeural", "male": "de-DE-ConradNeural"},
@@ -343,114 +192,24 @@ async def dub_video(uploaded_video_path, target_lang_name, voice_gender):
             "es": {"female": "es-ES-ElviraNeural", "male": "es-ES-AlvaroNeural"},
             "pl": {"female": "pl-PL-AgnieszkaNeural", "male": "pl-PL-MarekNeural"},
             "ar": {"female": "ar-SA-ZariyahNeural", "male": "ar-SA-HamedNeural"},
-            "zh-Hans": {"female": "zh-CN-XiaoxiaoNeural", "male": "zh-CN-YunyangNeural"},
-            "zh-Hant": {"female": "zh-TW-HsiaoChenNeural", "male": "zh-TW-YunJheNeural"},
-            "cs": {"female": "cs-CZ-VlastaNeural", "male": "cs-CZ-AntoninNeural"},
-            "da": {"female": "da-DK-ChristelNeural", "male": "da-DK-JeppeNeural"},
-            "en": {"female": "en-US-JennyNeural", "male": "en-US-GuyNeural"},
-            "et": {"female": "et-EE-AnuNeural", "male": "et-EE-KertNeural"},
-            "fi": {"female": "fi-FI-NooraNeural", "male": "fi-FI-HarriNeural"},
-            "el": {"female": "el-GR-AthinaNeural", "male": "el-GR-NestorasNeural"},
-            "he": {"female": "he-IL-HilaNeural", "male": "he-IL-AvriNeural"},
-            "hi": {"female": "hi-IN-SwaraNeural", "male": "hi-IN-MadhurNeural"},
-            "hu": {"female": "hu-HU-NoemiNeural", "male": "hu-HU-TamasNeural"},
-            "id": {"female": "id-ID-GadisNeural", "male": "id-ID-ArdiNeural"},
-            "ko": {"female": "ko-KR-SunHiNeural", "male": "ko-KR-InJoonNeural"},
-            "lv": {"female": "lv-LV-EveritaNeural", "male": "lv-LV-NilsNeural"},
-            "lt": {"female": "lt-LT-OnaNeural", "male": "lt-LT-LeonasNeural"},
-            "ms": {"female": "ms-MY-YasminNeural", "male": "ms-MY-OsmanNeural"},
-            "nb": {"female": "nb-NO-IselinNeural", "male": "nb-NO-FinnNeural"},
-            "pt": {"female": "pt-BR-FranciscaNeural", "male": "pt-BR-AntonioNeural"},
-            "pt-pt": {"female": "pt-PT-FernandaNeural", "male": "pt-PT-DuarteNeural"},
-            "ro": {"female": "ro-RO-AlinaNeural", "male": "ro-RO-EmilNeural"},
-            "ru": {"female": "ru-RU-DariyaNeural", "male": "ru-RU-DmitryNeural"},
-            "sk": {"female": "sk-SK-ViktoriaNeural", "male": "sk-SK-LukasNeural"},
-            "sl": {"female": "sl-SI-PetraNeural", "male": "sl-SI-RokNeural"},
-            "th": {"female": "th-TH-AcharaNeural", "male": "th-TH-NiwatNeural"},
-            "tr": {"female": "tr-TR-EmelNeural", "male": "tr-TR-AhmetNeural"},
-            "uk": {"female": "uk-UA-PolinaNeural", "male": "uk-UA-OstapNeural"},
-            "vi": {"female": "vi-VN-HoaiMyNeural", "male": "vi-VN-NamMinhNeural"},
         }
-
         selected_voice = voice_map.get(target_lang_code, {}).get(voice_gender)
         if not selected_voice:
-            # Fallback to English if voice not found
-            selected_voice = "en-US-JennyNeural" if voice_gender == "female" else "en-US-GuyNeural"
-            print(f"⚠️ Voice not found for {target_lang_name}, using fallback: {selected_voice}")
-            logging.warning(f"Voice not found for {target_lang_name}, using fallback: {selected_voice}")
-
+            return None, None, f"❌ Error: Voice for {target_lang_name} ({voice_gender}) not found."
         print(f"Using TTS voice: {selected_voice}")
-        logging.info(f"Using TTS voice: {selected_voice}")
-
-        # Configure speech with detailed error reporting
         speech_config = speechsdk.SpeechConfig(subscription=AZURE_KEY, region=AZURE_REGION)
         speech_config.speech_synthesis_voice_name = selected_voice
 
-        # Set output format to ensure compatibility
-        speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm)
-
-        # Generate TTS for each line with timeout
-        tasks = []
-        for i, text in enumerate(translated_lines):
-            if text.strip():  # Only process non-empty text
-                tasks.append(_synthesize_tts_async(speech_config, text, i))
-            else:
-                # For empty text, create a short silent segment
-                tasks.append(asyncio.sleep(0))
-
-        segment_audios = await asyncio.gather(*tasks, return_exceptions=True)
-
-        # Handle failed TTS segments
-        valid_audios = []
-        valid_segments = []
-        valid_lines = []
-
-        for i, (audio, seg, line) in enumerate(zip(segment_audios, segments, translated_lines)):
-            if audio is not None and not isinstance(audio, Exception):
-                valid_audios.append(audio)
-                valid_segments.append(seg)
-                valid_lines.append(line)
-                print(f"✅ Successfully processed line {i+1}")
-            else:
-                print(f"❌ Failed to process line {i+1}: '{line}'")
-                logging.warning(f"Line {i+1}: Skipping due to TTS failure: {line}")
-                # Add silent audio as fallback
-                estimated_duration = len(line.split()) * 0.3 * 1000 if line.strip() else 1000
-                valid_audios.append(AudioSegment.silent(duration=int(estimated_duration)))
-                valid_segments.append(seg)
-                valid_lines.append(line)
-
-        if not valid_audios:
-            error_msg = "❌ Error: No valid audio segments generated. Check Azure TTS configuration."
-            logging.error(error_msg)
-            return None, None, error_msg
-
-        print(f"✅ Successfully generated {len(valid_audios)} audio segments")
-        logging.info(f"Successfully generated {len(valid_audios)} audio segments")
-
-        # Adjust timestamps based on audio durations
-        adjusted_segments = []
-        current_time = 0
-        for i, audio in enumerate(valid_audios):
-            start = current_time / 1000
-            duration = len(audio)  # Duration in milliseconds
-            end = (current_time + duration) / 1000
-            adjusted_segments.append({"start": start, "end": end, "text": valid_lines[i]})
-            current_time += duration
-            logging.info(f"Segment {i+1}: Start={start:.2f}s, End={end:.2f}s, Text={valid_lines[i]}")
-
-        print("🎥 Merging dubbed audio into video...")
-        logging.info("Merging dubbed audio into video")
-
-        # Create silent audio of total duration
-        full_audio = AudioSegment.silent(duration=current_time)
-        for seg, segment_audio in zip(adjusted_segments, valid_audios):
+        tasks = [_synthesize_tts_async(speech_config, translated_text) for translated_text in translated_lines]
+        segment_audios = await asyncio.gather(*tasks)
+
+        full_audio = AudioSegment.silent(duration=segments[-1]["end"] * 1000)
+        for seg, segment_audio in zip(segments, segment_audios):
             start_ms = int(seg["start"] * 1000)
             full_audio = full_audio.overlay(segment_audio, position=start_ms)
 
+        print("🎥 Merging dubbed audio into video...")
         full_audio.export(str(dubbed_audio_path), format="wav")
-
-        # Merge audio with video
         subprocess.run([
             "ffmpeg", "-y",
             "-i", str(video_in),
@@ -458,23 +217,21 @@ async def dub_video(uploaded_video_path, target_lang_name, voice_gender):
             "-c:v", "copy",
             "-map", "0:v:0",
             "-map", "1:a:0",
-            "-shortest",  # Ensure output duration matches the shortest input
+            "-map", "-0:a",
             str(output_video_temp)
-        ], check=True)
+        ])
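
On the ffmpeg change above: with explicit -map options, ffmpeg's automatic stream selection is disabled, so -map 0:v:0 takes the video from input 0, -map 1:a:0 takes the audio from input 1, and the added negative map -map -0:a explicitly excludes the original audio track (arguably redundant once positive maps are given, but harmless). Dropping check=True also means a failed ffmpeg run no longer raises CalledProcessError. A commented sketch with placeholder paths (the second -i input, presumably the dubbed WAV, sits outside this hunk's context):

    import subprocess

    subprocess.run([
        "ffmpeg", "-y",
        "-i", "input_video.mp4",   # input 0: original video (video_in)
        "-i", "dubbed_audio.wav",  # input 1: synthesized dub track (assumed)
        "-c:v", "copy",            # copy the video stream without re-encoding
        "-map", "0:v:0",           # video from input 0
        "-map", "1:a:0",           # audio from input 1
        "-map", "-0:a",            # drop any audio from input 0
        "output_dubbed.mp4",
    ])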
464
 
465
  print("πŸ“„ Generating subtitle file...")
466
- logging.info("Generating subtitle file")
467
  srt_content = ""
468
- for i, seg in enumerate(adjusted_segments):
469
  start_time = _format_time(seg["start"])
470
  end_time = _format_time(seg["end"])
471
- srt_content += f"{i + 1}\n{start_time} --> {end_time}\n{seg['text']}\n\n"
 
 
472
  output_subtitles_temp.write_text(srt_content, encoding="utf-8")
473
 
474
- print("βœ… Dubbing completed successfully!")
475
- logging.info("Dubbing completed successfully")
476
-
477
- # Copy to output directory
478
  output_dir = Path(tempfile.mkdtemp(prefix="dubbed_output_"))
479
  output_video_path = output_dir / "output_dubbed.mp4"
480
  output_subtitles_path = output_dir / "subtitles.srt"
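
The subtitle loop above emits standard SRT blocks: a 1-based index, a HH:MM:SS,mmm --> HH:MM:SS,mmm range from _format_time, then the translated text and a blank line. Only the return statement of _format_time appears in this diff, but given that format string a plausible reconstruction looks like the following (a sketch, not the committed body):

    def _format_time(seconds):
        # Split float seconds into hours, minutes, seconds, milliseconds.
        ms = int((seconds - int(seconds)) * 1000)
        s = int(seconds) % 60
        m = (int(seconds) // 60) % 60
        h = int(seconds) // 3600
        return f"{h:02}:{m:02}:{s:02},{ms:03}"

    print(_format_time(3.5))    # 00:00:03,500
    print(_format_time(75.25))  # 00:01:15,250

A resulting subtitle entry would read:

    1
    00:00:00,000 --> 00:00:03,500
    <translated line>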
@@ -482,7 +239,7 @@ async def dub_video(uploaded_video_path, target_lang_name, voice_gender):
         shutil.copy(output_video_temp, output_video_path)
         shutil.copy(output_subtitles_temp, output_subtitles_path)
 
-        return str(output_video_path), str(output_subtitles_path), "✅ Dubbing completed successfully!"
+        return str(output_video_path), str(output_subtitles_path), "✅ Done! Your video and subtitles are ready."
 
 # --- Gradio UI setup ---
 with gr.Blocks(title="AI Video Dubber") as demo:
@@ -492,18 +249,22 @@ with gr.Blocks(title="AI Video Dubber") as demo:
     with gr.Row():
         with gr.Column():
             uploaded_video = gr.Video(label="📤 Upload your video")
+
             target_lang_choices = list(LANGUAGE_MAP.keys())
             target_lang_dropdown = gr.Dropdown(
                 label="🌍 Target language",
                 choices=target_lang_choices,
-                value="English (US)",
+                value=target_lang_choices[0],
             )
+
             voice_gender_dropdown = gr.Dropdown(
                 label="🎙️ Voice Gender",
                 choices=["female", "male"],
                 value="female"
             )
+
             run_button = gr.Button("🚀 Start Dubbing")
+
         with gr.Column():
             dubbed_video_out = gr.Video(label="Dubbed Video")
             download_subtitles = gr.File(label="Download Subtitle File")
@@ -516,4 +277,4 @@ with gr.Blocks(title="AI Video Dubber") as demo:
     )
 
 if __name__ == "__main__":
-    demo.launch(debug=True)
+    demo.launch()
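
The run_button.click wiring (old lines 510-515) falls outside the diff context and is unchanged by this commit. For orientation, a typical Blocks hookup for these components would look roughly like the sketch below; the status textbox name is hypothetical, inferred only from dub_video returning a third status-message value:

    # Hypothetical sketch; the actual wiring is elided from this diff.
    status_box = gr.Textbox(label="Status")  # assumed third output component
    run_button.click(
        fn=dub_video,
        inputs=[uploaded_video, target_lang_dropdown, voice_gender_dropdown],
        outputs=[dubbed_video_out, download_subtitles, status_box],
    )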
 