prasanacodes committed on
Commit
305d0a7
·
verified ·
1 Parent(s): 94ae67a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +137 -23
app.py CHANGED
@@ -6,6 +6,7 @@ import ffmpeg
6
  import nltk
7
  import re
8
  from deep_translator import MyMemoryTranslator
 
9
  import soundfile as sf
10
  from gradio_client import Client, handle_file
11
  from openvoice_cli.__main__ import tune_one
@@ -245,10 +246,20 @@ def merge_audio_video(video_path, audio_path, output_path="temp_merged.mp4"):
245
  print(f"βœ… Merged video saved to {output_path}")
246
  return output_path
247
 
248
- def main_run(video_path,target_lang):
249
  original_audio_file = extract_audio_from_video(video_path)
250
- original_text , pace = transcribe_audio(original_audio_file)
251
- translated_text = translate_local(original_text,target_lang)
 
 
 
 
 
 
 
 
 
 
252
  translated_audio = synthesize_speech(translated_text, target_lang, "Male", pace)
253
  synced_translated_audio = match_audio_duration(original_audio_file, translated_audio)
254
  cloned_synced_translated_audio = clone_voice(synced_translated_audio, original_audio_file)
@@ -256,12 +267,22 @@ def main_run(video_path,target_lang):
256
  print(f"βœ… Pipeline finished")
257
  return final_video_nobgm
258
 
259
- def audio_pipeline_run(audio_path,target_lang):
260
- original_text , pace = transcribe_audio(audio_path)
261
- translated_text = translate_local(original_text,target_lang)
 
 
 
 
 
 
 
 
 
 
262
  translated_audio = synthesize_speech(translated_text, target_lang, "Male", pace)
263
  synced_translated_audio = match_audio_duration(audio_path, translated_audio)
264
- cloned_synced_translated_audio = clone_voice(synced_translated_audio, original_audio_file)
265
  print(f"βœ… Pipeline finished")
266
  return cloned_synced_translated_audio
267
 
@@ -295,30 +316,123 @@ with gr.Blocks(title="Audio/Video Translation Toolkit") as app_interface:
295
  gr.Markdown("# πŸš€ Audio/Video Translation Toolkit")
296
  with gr.Tabs():
297
  with gr.Tab("🎬 Translate Video"):
298
- with gr.Row():
299
- with gr.Column():
300
  video_in = gr.Video(label="Input Video", height=500)
301
- lang_radio_vid = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
302
- submit_btn_vid = gr.Button("Translate Video", variant="primary")
303
-
304
- with gr.Column():
305
  video_out = gr.Video(label="Output Video", interactive=False, height=500)
 
 
 
 
 
 
 
 
 
 
306
 
307
- submit_btn_vid.click(fn=main_run, inputs=[video_in, lang_radio_vid], outputs=[video_out])
 
 
 
 
 
 
 
 
 
 
 
308
 
309
  with gr.Tab("🎡 Translate Audio"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
  with gr.Row():
 
 
 
 
 
 
 
 
311
  with gr.Column():
312
- audio_in_pipe = gr.Audio(type="filepath", label="Input Audio")
313
- lang_radio_aud = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
314
- submit_btn_aud = gr.Button("Translate Audio", variant="primary")
315
-
 
 
 
316
  with gr.Column():
317
- audio_out_pipe = gr.Audio(label="Output Audio", interactive=False)
318
-
319
- submit_btn_aud.click(fn=audio_pipeline_run, inputs=[audio_in_pipe, lang_radio_aud], outputs=[audio_out_pipe])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
- # --- Launch the App ---
322
  if __name__ == "__main__":
323
  # The launch() method creates a web server and makes the interface accessible.
324
- app_interface.launch()
 
6
  import nltk
7
  import re
8
  from deep_translator import MyMemoryTranslator
9
+ import num2words
10
  import soundfile as sf
11
  from gradio_client import Client, handle_file
12
  from openvoice_cli.__main__ import tune_one
 
246
  print(f"βœ… Merged video saved to {output_path}")
247
  return output_path
248
 
249
+ def main_run(video_path,target_lang,user_transcript=None, user_translation=None):
250
  original_audio_file = extract_audio_from_video(video_path)
251
+ if user_transcript:
252
+ original_text , pace = transcribe_audio(original_audio_file)
253
+ original_text = user_transcript
254
+ print(f"Using provided transcript: {original_text}")
255
+ else:
256
+ original_text , pace = transcribe_audio(original_audio_file)
257
+ if user_translation:
258
+ translated_text = user_translation
259
+ print(f"Using provided translation: {translated_text}")
260
+ else:
261
+ translated_text = translate_local(original_text,target_lang)
262
+ print(f"Translated Text: {translated_text}")
263
  translated_audio = synthesize_speech(translated_text, target_lang, "Male", pace)
264
  synced_translated_audio = match_audio_duration(original_audio_file, translated_audio)
265
  cloned_synced_translated_audio = clone_voice(synced_translated_audio, original_audio_file)
 
267
  print(f"βœ… Pipeline finished")
268
  return final_video_nobgm
269
 
270
+ def audio_pipeline_run(audio_path,target_lang,user_transcript=None, user_translation=None):
271
+ if user_transcript:
272
+ original_text , pace = transcribe_audio(audio_path)
273
+ original_text = user_transcript
274
+ print(f"Using provided transcript: {original_text}")
275
+ else:
276
+ original_text , pace = transcribe_audio(audio_path)
277
+ if user_translation:
278
+ translated_text = user_translation
279
+ print(f"Using provided translation: {translated_text}")
280
+ else:
281
+ translated_text = translate_local(original_text,target_lang)
282
+ print(f"Translated Text: {translated_text}")
283
  translated_audio = synthesize_speech(translated_text, target_lang, "Male", pace)
284
  synced_translated_audio = match_audio_duration(audio_path, translated_audio)
285
+ cloned_synced_translated_audio = clone_voice(synced_translated_audio, audio_path)
286
  print(f"βœ… Pipeline finished")
287
  return cloned_synced_translated_audio
288
 
 
316
  gr.Markdown("# πŸš€ Audio/Video Translation Toolkit")
317
  with gr.Tabs():
318
  with gr.Tab("🎬 Translate Video"):
319
+ with gr.Column():
320
+ with gr.Row():
321
  video_in = gr.Video(label="Input Video", height=500)
 
 
 
 
322
  video_out = gr.Video(label="Output Video", interactive=False, height=500)
323
+ with gr.Row():
324
+ # Radio buttons for selecting target language
325
+ # This allows users to choose one of the mutually exclusive options
326
+ lang_radio_vid = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
327
+ # Single-select option for mutually exclusive choices
328
+ option_select = gr.Radio(choices=["Use my Transcript", "Use my Translation"], label="Optional Input")
329
+ # Textboxes for user input, initially hidden
330
+ user_transcript_vid = gr.Textbox(label="Your English Transcript", lines=3, visible=False)
331
+ user_translation_vid = gr.Textbox(label="Your Translated Text", lines=3, visible=False)
332
+ submit_btn_vid = gr.Button("Translate Video", variant="primary")
333
 
334
+ # Toggle visibility based on selected option (only one can be active)
335
+ option_select.change(
336
+ fn=lambda choice: (
337
+ gr.update(visible=(choice == "Use my Transcript")),
338
+ gr.update(visible=(choice == "Use my Translation")),
339
+ ),
340
+ inputs=option_select,
341
+ outputs=[user_transcript_vid, user_translation_vid],
342
+ )
343
+
344
+ # Include the optional transcript/translation textboxes as inputs (they may be hidden)
345
+ submit_btn_vid.click(fn=main_run, inputs=[video_in, lang_radio_vid, user_transcript_vid, user_translation_vid], outputs=[video_out])
346
 
347
  with gr.Tab("🎡 Translate Audio"):
348
+ with gr.Column():
349
+ with gr.Row():
350
+ audio_in = gr.Audio(label="Input Audio")
351
+ audio_out = gr.Audio(label="Output Audio", interactive=False)
352
+ with gr.Row():
353
+ # Radio buttons for selecting target language
354
+ # This allows users to choose one of the mutually exclusive options
355
+ lang_radio_aud = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
356
+ # Single-select option for mutually exclusive choices
357
+ option_select = gr.Radio(choices=["Use my Transcript", "Use my Translation"], label="Optional Input")
358
+ # Textboxes for user input, initially hidden
359
+ user_transcript_aud = gr.Textbox(label="Your English Transcript", lines=3, visible=False)
360
+ user_translation_aud = gr.Textbox(label="Your Translated Text", lines=3, visible=False)
361
+ submit_btn_aud = gr.Button("Translate Audio", variant="primary")
362
+
363
+ # Toggle visibility based on selected option (only one can be active)
364
+ option_select.change(
365
+ fn=lambda choice: (
366
+ gr.update(visible=(choice == "Use my Transcript")),
367
+ gr.update(visible=(choice == "Use my Translation")),
368
+ ),
369
+ inputs=option_select,
370
+ outputs=[user_transcript_aud, user_translation_aud],
371
+ )
372
+ submit_btn_aud.click(fn=audio_pipeline_run, inputs=[audio_in, lang_radio_aud, user_transcript_aud, user_translation_aud], outputs=[audio_out])
373
+
374
+ with gr.Tab("βœ‚οΈ Extract Audio"):
375
  with gr.Row():
376
+ video_in_ext = gr.Video(label="Input Video", height=500)
377
+ audio_out_ext = gr.Audio(label="Extracted Audio")
378
+ btn_ext = gr.Button("Extract", variant="secondary")
379
+ btn_ext.click(fn=extract_audio_from_video, inputs=video_in_ext, outputs=audio_out_ext)
380
+
381
+ with gr.Tab("✍️ Transcribe"):
382
+ with gr.Row():
383
+ audio_in_trans = gr.Audio(type="filepath", label="Input Audio")
384
  with gr.Column():
385
+ text_out_trans = gr.Textbox(label="Transcription")
386
+ text_out_pace = gr.Textbox(label="Detected Pace")
387
+ btn_trans = gr.Button("Transcribe", variant="secondary")
388
+ btn_trans.click(lambda aud: transcribe_audio(aud), inputs=audio_in_trans, outputs=[text_out_trans, text_out_pace])
389
+
390
+ with gr.Tab("🌐 Translate Text"):
391
+ with gr.Row():
392
  with gr.Column():
393
+ text_in_tran = gr.Textbox(label="Text to Translate", lines=5)
394
+ lang_radio_tran = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
395
+ btn_tran = gr.Button("Translate", variant="secondary")
396
+ text_out_tran = gr.Textbox(label="Translated Text", lines=5, interactive=False)
397
+ btn_tran.click(fn=translate_local, inputs=[text_in_tran, lang_radio_tran], outputs=text_out_tran)
398
+
399
+ with gr.Tab("πŸ”Š Synthesize Speech"):
400
+ with gr.Column():
401
+ with gr.Row():
402
+ text_in_synth = gr.Textbox(label="Text to Synthesize", lines=5)
403
+ audio_out_synth = gr.Audio(label="Synthesized Speech")
404
+ with gr.Row():
405
+ lang_radio_tran = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
406
+ gender_radio_tran = gr.Radio(choices=["Male", "Female"], label="Speaker Gender", value="Male")
407
+ pace_radio_tran = gr.Radio(choices=["Very_Slow", "Slow", "Normal", "Fast", "Very_Fast"], label="Target Language", value="Normal")
408
+ btn_synth = gr.Button("Synthesize", variant="secondary")
409
+ btn_synth.click(fn=synthesize_speech, inputs=[text_in_synth,lang_radio_tran,gender_radio_tran,pace_radio_tran], outputs=audio_out_synth)
410
+
411
+ with gr.Tab("⏱️ Sync Duration"):
412
+ with gr.Row():
413
+ audio_in_sync1 = gr.Audio(type="filepath", label="Original Audio (for duration reference)")
414
+ audio_in_sync2 = gr.Audio(type="filepath", label="Translated Audio (to be resized)")
415
+ audio_out_sync = gr.Audio(label="Duration-Synced Audio")
416
+ btn_sync = gr.Button("Sync Duration", variant="secondary")
417
+ btn_sync.click(fn=match_audio_duration, inputs=[audio_in_sync1, audio_in_sync2], outputs=audio_out_sync)
418
+
419
+ with gr.Tab("🧬 Clone Voice"):
420
+ with gr.Row():
421
+ audio_in_clone1 = gr.Audio(type="filepath", label="Target Audio (e.g., Synthesized Speech)")
422
+ audio_in_clone2 = gr.Audio(type="filepath", label="Reference Audio (Original Speaker's Voice)")
423
+ audio_out_clone = gr.Audio(label="Cloned Voice Audio")
424
+ btn_clone = gr.Button("Clone Voice", variant="secondary")
425
+ btn_clone.click(fn=clone_voice, inputs=[audio_in_clone1, audio_in_clone2], outputs=audio_out_clone)
426
+
427
+ with gr.Tab("🎞️ Replace Audio"):
428
+ with gr.Row():
429
+ video_in_rep = gr.Video(label="Input Video", height=500)
430
+ audio_in_rep = gr.Audio(type="filepath", label="New Audio")
431
+ video_out_rep = gr.Video(label="Video with Replaced Audio", height=500)
432
+ btn_rep = gr.Button("Replace Audio", variant="secondary")
433
+ btn_rep.click(fn=merge_audio_video, inputs=[video_in_rep, audio_in_rep], outputs=video_out_rep)
434
 
435
+ # --- Launch the App ---
436
  if __name__ == "__main__":
437
  # The launch() method creates a web server and makes the interface accessible.
438
+ app_interface.launch()