|
|
| |
| |
| import numpy as np |
| import gradio as gr |
| import whisperx |
| import torch |
| from gtts import gTTS |
| import librosa |
| import edge_tts |
| import asyncio |
| import gc |
| from pydub import AudioSegment |
| from tqdm import tqdm |
| from deep_translator import GoogleTranslator |
| import os |
| from soni_translate.audio_segments import create_translated_audio |
| from soni_translate.text_to_speech import make_voice_gradio |
| from soni_translate.translate_segments import translate_text |
| |
|
|
# Markdown/HTML snippets rendered at the top of the web UI.
# The emoji below were restored from mis-decoded UTF-8 bytes.
# NOTE(review): the book glyph in `news`, the rocket glyph in `description`
# and the Colab badge image could not be fully recovered from the damaged
# text -- confirm them against the upstream project.
title = "<center><strong><font size='7'>📽️ SoniTranslate 🈷️</font></strong></center>"


news = """ ## 📖 News
        🔥 2023/07/01: Support (Thanks for [text](https://github.com)).
        """


description = """ ## Translate the audio of a video content from one language to another while preserving synchronization.


This is a demo on Github project 📽️ [SoniTranslate](https://github.com/R3gm/SoniTranslate).

📼 You can upload a video or provide a video link. The generation is **limited to 10 seconds** to prevent errors with the queue in cpu. If you use a GPU, you won't have any of these limitations.

🚀 For **translate a video of any duration** and faster results, you can use the Colab notebook with GPU.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb)

"""


tutorial = """ # 🔰 Instructions for use.

1. Upload a video on the first tab or use a video link on the second tab.

2. Choose the language in which you want to translate the video.

3. Specify the number of people speaking in the video and assign each one a text-to-speech voice suitable for the translation language.

4. Press the 'Translate' button to obtain the results.

"""
|
|
|
|
# Working directories for the per-segment speech files: 'audio' holds the
# synthesized clips and 'audio2/audio' receives their tempo-adjusted copies.
# exist_ok=True makes creation idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs('audio', exist_ok=True)
os.makedirs('audio2/audio', exist_ok=True)
|
|
| |
# Choose the compute device once at import time and derive the UI defaults
# (selectable compute types, default compute type, default Whisper model)
# from it.
device = "cuda" if torch.cuda.is_available() else "cpu"

_DEVICE_DEFAULTS = {
    "cuda": (['float16', 'float32'], 'float16', 'large-v1'),
    "cpu": (['float32'], 'float32', 'base'),
}
list_compute_type, compute_type_default, whisper_model_default = _DEVICE_DEFAULTS[device]

print('Working in: ', device)
|
|
|
|
| |
| |
|
|
| |
# Voice identifiers offered in the per-speaker dropdowns; each entry ends in
# "-Male" or "-Female". Laid out one locale per line.
list_tts = [
    'af-ZA-AdriNeural-Female', 'af-ZA-WillemNeural-Male',
    'am-ET-AmehaNeural-Male', 'am-ET-MekdesNeural-Female',
    'ar-AE-FatimaNeural-Female', 'ar-AE-HamdanNeural-Male',
    'ar-BH-AliNeural-Male', 'ar-BH-LailaNeural-Female',
    'ar-DZ-AminaNeural-Female', 'ar-DZ-IsmaelNeural-Male',
    'ar-EG-SalmaNeural-Female', 'ar-EG-ShakirNeural-Male',
    'ar-IQ-BasselNeural-Male', 'ar-IQ-RanaNeural-Female',
    'ar-JO-SanaNeural-Female', 'ar-JO-TaimNeural-Male',
    'ar-KW-FahedNeural-Male', 'ar-KW-NouraNeural-Female',
    'ar-LB-LaylaNeural-Female', 'ar-LB-RamiNeural-Male',
    'ar-LY-ImanNeural-Female', 'ar-LY-OmarNeural-Male',
    'ar-MA-JamalNeural-Male', 'ar-MA-MounaNeural-Female',
    'ar-OM-AbdullahNeural-Male', 'ar-OM-AyshaNeural-Female',
    'ar-QA-AmalNeural-Female', 'ar-QA-MoazNeural-Male',
    'ar-SA-HamedNeural-Male', 'ar-SA-ZariyahNeural-Female',
    'ar-SY-AmanyNeural-Female', 'ar-SY-LaithNeural-Male',
    'ar-TN-HediNeural-Male', 'ar-TN-ReemNeural-Female',
    'ar-YE-MaryamNeural-Female', 'ar-YE-SalehNeural-Male',
    'az-AZ-BabekNeural-Male', 'az-AZ-BanuNeural-Female',
    'bg-BG-BorislavNeural-Male', 'bg-BG-KalinaNeural-Female',
    'bn-BD-NabanitaNeural-Female', 'bn-BD-PradeepNeural-Male',
    'bn-IN-BashkarNeural-Male', 'bn-IN-TanishaaNeural-Female',
    'bs-BA-GoranNeural-Male', 'bs-BA-VesnaNeural-Female',
    'ca-ES-EnricNeural-Male', 'ca-ES-JoanaNeural-Female',
    'cs-CZ-AntoninNeural-Male', 'cs-CZ-VlastaNeural-Female',
    'cy-GB-AledNeural-Male', 'cy-GB-NiaNeural-Female',
    'da-DK-ChristelNeural-Female', 'da-DK-JeppeNeural-Male',
    'de-AT-IngridNeural-Female', 'de-AT-JonasNeural-Male',
    'de-CH-JanNeural-Male', 'de-CH-LeniNeural-Female',
    'de-DE-AmalaNeural-Female', 'de-DE-ConradNeural-Male', 'de-DE-KatjaNeural-Female', 'de-DE-KillianNeural-Male',
    'el-GR-AthinaNeural-Female', 'el-GR-NestorasNeural-Male',
    'en-AU-NatashaNeural-Female', 'en-AU-WilliamNeural-Male',
    'en-CA-ClaraNeural-Female', 'en-CA-LiamNeural-Male',
    'en-GB-LibbyNeural-Female', 'en-GB-MaisieNeural-Female', 'en-GB-RyanNeural-Male', 'en-GB-SoniaNeural-Female', 'en-GB-ThomasNeural-Male',
    'en-HK-SamNeural-Male', 'en-HK-YanNeural-Female',
    'en-IE-ConnorNeural-Male', 'en-IE-EmilyNeural-Female',
    'en-IN-NeerjaExpressiveNeural-Female', 'en-IN-NeerjaNeural-Female', 'en-IN-PrabhatNeural-Male',
    'en-KE-AsiliaNeural-Female', 'en-KE-ChilembaNeural-Male',
    'en-NG-AbeoNeural-Male', 'en-NG-EzinneNeural-Female',
    'en-NZ-MitchellNeural-Male', 'en-NZ-MollyNeural-Female',
    'en-PH-JamesNeural-Male', 'en-PH-RosaNeural-Female',
    'en-SG-LunaNeural-Female', 'en-SG-WayneNeural-Male',
    'en-TZ-ElimuNeural-Male', 'en-TZ-ImaniNeural-Female',
    'en-US-AnaNeural-Female', 'en-US-AriaNeural-Female', 'en-US-ChristopherNeural-Male', 'en-US-EricNeural-Male',
    'en-US-GuyNeural-Male', 'en-US-JennyNeural-Female', 'en-US-MichelleNeural-Female', 'en-US-RogerNeural-Male',
    'en-US-SteffanNeural-Male',
    'en-ZA-LeahNeural-Female', 'en-ZA-LukeNeural-Male',
    'es-AR-ElenaNeural-Female', 'es-AR-TomasNeural-Male',
    'es-BO-MarceloNeural-Male', 'es-BO-SofiaNeural-Female',
    'es-CL-CatalinaNeural-Female', 'es-CL-LorenzoNeural-Male',
    'es-CO-GonzaloNeural-Male', 'es-CO-SalomeNeural-Female',
    'es-CR-JuanNeural-Male', 'es-CR-MariaNeural-Female',
    'es-CU-BelkysNeural-Female', 'es-CU-ManuelNeural-Male',
    'es-DO-EmilioNeural-Male', 'es-DO-RamonaNeural-Female',
    'es-EC-AndreaNeural-Female', 'es-EC-LuisNeural-Male',
    'es-ES-AlvaroNeural-Male', 'es-ES-ElviraNeural-Female',
    'es-GQ-JavierNeural-Male', 'es-GQ-TeresaNeural-Female',
    'es-GT-AndresNeural-Male', 'es-GT-MartaNeural-Female',
    'es-HN-CarlosNeural-Male', 'es-HN-KarlaNeural-Female',
    'es-MX-DaliaNeural-Female', 'es-MX-JorgeNeural-Male',
    'es-NI-FedericoNeural-Male', 'es-NI-YolandaNeural-Female',
    'es-PA-MargaritaNeural-Female', 'es-PA-RobertoNeural-Male',
    'es-PE-AlexNeural-Male', 'es-PE-CamilaNeural-Female',
    'es-PR-KarinaNeural-Female', 'es-PR-VictorNeural-Male',
    'es-PY-MarioNeural-Male', 'es-PY-TaniaNeural-Female',
    'es-SV-LorenaNeural-Female', 'es-SV-RodrigoNeural-Male',
    'es-US-AlonsoNeural-Male', 'es-US-PalomaNeural-Female',
    'es-UY-MateoNeural-Male', 'es-UY-ValentinaNeural-Female',
    'es-VE-PaolaNeural-Female', 'es-VE-SebastianNeural-Male',
    'et-EE-AnuNeural-Female', 'et-EE-KertNeural-Male',
    'fa-IR-DilaraNeural-Female', 'fa-IR-FaridNeural-Male',
    'fi-FI-HarriNeural-Male', 'fi-FI-NooraNeural-Female',
    'fil-PH-AngeloNeural-Male', 'fil-PH-BlessicaNeural-Female',
    'fr-BE-CharlineNeural-Female', 'fr-BE-GerardNeural-Male',
    'fr-CA-AntoineNeural-Male', 'fr-CA-JeanNeural-Male', 'fr-CA-SylvieNeural-Female',
    'fr-CH-ArianeNeural-Female', 'fr-CH-FabriceNeural-Male',
    'fr-FR-DeniseNeural-Female', 'fr-FR-EloiseNeural-Female', 'fr-FR-HenriNeural-Male',
    'ga-IE-ColmNeural-Male', 'ga-IE-OrlaNeural-Female',
    'gl-ES-RoiNeural-Male', 'gl-ES-SabelaNeural-Female',
    'gu-IN-DhwaniNeural-Female', 'gu-IN-NiranjanNeural-Male',
    'he-IL-AvriNeural-Male', 'he-IL-HilaNeural-Female',
    'hi-IN-MadhurNeural-Male', 'hi-IN-SwaraNeural-Female',
    'hr-HR-GabrijelaNeural-Female', 'hr-HR-SreckoNeural-Male',
    'hu-HU-NoemiNeural-Female', 'hu-HU-TamasNeural-Male',
    'id-ID-ArdiNeural-Male', 'id-ID-GadisNeural-Female',
    'is-IS-GudrunNeural-Female', 'is-IS-GunnarNeural-Male',
    'it-IT-DiegoNeural-Male', 'it-IT-ElsaNeural-Female', 'it-IT-IsabellaNeural-Female',
    'ja-JP-KeitaNeural-Male', 'ja-JP-NanamiNeural-Female',
    'jv-ID-DimasNeural-Male', 'jv-ID-SitiNeural-Female',
    'ka-GE-EkaNeural-Female', 'ka-GE-GiorgiNeural-Male',
    'kk-KZ-AigulNeural-Female', 'kk-KZ-DauletNeural-Male',
    'km-KH-PisethNeural-Male', 'km-KH-SreymomNeural-Female',
    'kn-IN-GaganNeural-Male', 'kn-IN-SapnaNeural-Female',
    'ko-KR-InJoonNeural-Male', 'ko-KR-SunHiNeural-Female',
    'lo-LA-ChanthavongNeural-Male', 'lo-LA-KeomanyNeural-Female',
    'lt-LT-LeonasNeural-Male', 'lt-LT-OnaN' 'eural-Female',
    'lv-LV-EveritaNeural-Female', 'lv-LV-NilsNeural-Male',
    'mk-MK-AleksandarNeural-Male', 'mk-MK-MarijaNeural-Female',
    'ml-IN-MidhunNeural-Male', 'ml-IN-SobhanaNeural-Female',
    'mn-MN-BataaNeural-Male', 'mn-MN-YesuiNeural-Female',
    'mr-IN-AarohiNeural-Female', 'mr-IN-ManoharNeural-Male',
    'ms-MY-OsmanNeural-Male', 'ms-MY-YasminNeural-Female',
    'mt-MT-GraceNeural-Female', 'mt-MT-JosephNeural-Male',
    'my-MM-NilarNeural-Female', 'my-MM-ThihaNeural-Male',
    'nb-NO-FinnNeural-Male', 'nb-NO-PernilleNeural-Female',
    'ne-NP-HemkalaNeural-Female', 'ne-NP-SagarNeural-Male',
    'nl-BE-ArnaudNeural-Male', 'nl-BE-DenaNeural-Female',
    'nl-NL-ColetteNeural-Female', 'nl-NL-FennaNeural-Female', 'nl-NL-MaartenNeural-Male',
    'pl-PL-MarekNeural-Male', 'pl-PL-ZofiaNeural-Female',
    'ps-AF-GulNawazNeural-Male', 'ps-AF-LatifaNeural-Female',
    'pt-BR-AntonioNeural-Male', 'pt-BR-FranciscaNeural-Female',
    'pt-PT-DuarteNeural-Male', 'pt-PT-RaquelNeural-Female',
    'ro-RO-AlinaNeural-Female', 'ro-RO-EmilNeural-Male',
    'ru-RU-DmitryNeural-Male', 'ru-RU-SvetlanaNeural-Female',
    'si-LK-SameeraNeural-Male', 'si-LK-ThiliniNeural-Female',
    'sk-SK-LukasNeural-Male', 'sk-SK-ViktoriaNeural-Female',
    'sl-SI-PetraNeural-Female', 'sl-SI-RokNeural-Male',
    'so-SO-MuuseNeural-Male', 'so-SO-UbaxNeural-Female',
    'sq-AL-AnilaNeural-Female', 'sq-AL-IlirNeural-Male',
    'sr-RS-NicholasNeural-Male', 'sr-RS-SophieNeural-Female',
    'su-ID-JajangNeural-Male', 'su-ID-TutiNeural-Female',
    'sv-SE-MattiasNeural-Male', 'sv-SE-SofieNeural-Female',
    'sw-KE-RafikiNeural-Male', 'sw-KE-ZuriNeural-Female',
    'sw-TZ-DaudiNeural-Male', 'sw-TZ-RehemaNeural-Female',
    'ta-IN-PallaviNeural-Female', 'ta-IN-ValluvarNeural-Male',
    'ta-LK-KumarNeural-Male', 'ta-LK-SaranyaNeural-Female',
    'ta-MY-KaniNeural-Female', 'ta-MY-SuryaNeural-Male',
    'ta-SG-AnbuNeural-Male', 'ta-SG-VenbaNeural-Female',
    'te-IN-MohanNeural-Male', 'te-IN-ShrutiNeural-Female',
    'th-TH-NiwatNeural-Male', 'th-TH-PremwadeeNeural-Female',
    'tr-TR-AhmetNeural-Male', 'tr-TR-EmelNeural-Female',
    'uk-UA-OstapNeural-Male', 'uk-UA-PolinaNeural-Female',
    'ur-IN-GulNeural-Female', 'ur-IN-SalmanNeural-Male',
    'ur-PK-AsadNeural-Male', 'ur-PK-UzmaNeural-Female',
    'uz-UZ-MadinaNeural-Female', 'uz-UZ-SardorNeural-Male',
    'vi-VN-HoaiMyNeural-Female', 'vi-VN-NamMinhNeural-Male',
    'zh-CN-XiaoxiaoNeural-Female', 'zh-CN-XiaoyiNeural-Female', 'zh-CN-YunjianNeural-Male', 'zh-CN-YunxiNeural-Male',
    'zh-CN-YunxiaNeural-Male', 'zh-CN-YunyangNeural-Male',
    'zh-CN-liaoning-XiaobeiNeural-Female', 'zh-CN-shaanxi-XiaoniNeural-Female',
]
|
|
|
|
def _run_command(args):
    """Run an external program from an argument list, without a shell.

    Passing a list keeps user-supplied values (paths, URLs) from being
    interpreted by a shell. Like os.system, a non-zero exit status does not
    raise; unlike it, a missing executable raises FileNotFoundError.
    """
    import subprocess  # local import keeps this block self-contained

    return subprocess.run(args).returncode


def _remove_if_exists(path):
    """Delete the file at *path*; do nothing when it is already absent."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def _tempo_factor(duration_tts, duration_true):
    """Return the atempo factor that fits a synthesized clip into its slot.

    The ratio is clamped to [0.8, 2.1], ratios between 0.8 and 1.2 are
    treated as "close enough" (1.0), and the result is rounded to one decimal.
    """
    if duration_true <= 0:
        # Degenerate segment: avoid dividing by zero and use the largest
        # speed-up the clamp allows.
        return 2.1
    ratio = duration_tts / duration_true
    if ratio > 2.1:
        ratio = 2.1
    elif 0.8 <= ratio <= 1.2:
        ratio = 1.0
    elif ratio <= 0.79:
        ratio = 0.8
    return round(ratio + 0.0, 1)


def _prepare_media(video, video_path, audio_path):
    """Produce *video_path* (mp4) and *audio_path* (wav) from a file or a link.

    *video* is treated as a local file when it exists on disk and as a
    yt-dlp URL otherwise. On CPU only 10 seconds starting at 00:00:20 are kept.
    """
    cpu_only = device == 'cpu'

    if os.path.exists(video):
        print("### Start Video ###")
        transcode = ["ffmpeg", "-y", "-i", video]
        if cpu_only:
            print('10 s. Limited for CPU ')
            transcode += ["-ss", "00:00:20", "-t", "00:00:10"]
        transcode += ["-c:v", "libx264", "-c:a", "aac", "-strict", "experimental", video_path]
        _run_command(transcode)
        _run_command(["ffmpeg", "-y", "-i", video_path, "-vn", "-acodec", "pcm_s16le",
                      "-ar", "44100", "-ac", "2", audio_path])
        return

    print(f"### Start {video} ###")
    # The shell used to drop surrounding whitespace from the URL; do it here.
    url = video.strip()
    shared = ["--force-overwrites", "--max-downloads", "1", "--no-warnings",
              "--no-abort-on-error", "--ignore-no-formats-error"]
    # "--" ends option parsing so a link starting with "-" is not read as a flag.
    if cpu_only:
        print('10 s. Limited for CPU ')
        download = ["yt-dlp", "-f", "mp4", "--downloader", "ffmpeg",
                    "--downloader-args", "ffmpeg_i: -ss 00:00:20 -t 00:00:10",
                    *shared, "--restrict-filenames", "-o", video_path, "--", url]
        extract = ["ffmpeg", "-y", "-i", video_path, "-vn", "-acodec", "pcm_s16le",
                   "-ar", "44100", "-ac", "1", audio_path]
    else:
        download = ["yt-dlp", "-f", "mp4", *shared, "--restrict-filenames",
                    "-o", video_path, "--", url]
        extract = ["python", "-m", "yt_dlp", "--output", audio_path, *shared,
                   "--extract-audio", "--audio-format", "wav", "--", url]

    _run_command(download)
    _run_command(extract)


def _gtts_voice(text, language, filename):
    """Synthesize *text* with gTTS into *filename*, falling back to 'a' on error."""
    try:
        gTTS(text, lang=language).save(filename)
        print('Using GTTS')
    except Exception:
        # Best-effort placeholder so the segment still has an audio file.
        gTTS('a', lang=language).save(filename)
        print('ERROR AUDIO GTTS')


def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
                         TRANSLATE_AUDIO_TO, min_speakers, max_speakers,
                         tts_voice00, tts_voice01, tts_voice02, tts_voice03, tts_voice04, tts_voice05):
    """Dub a video into another language and return the path of the result.

    Steps: fetch/convert the media, transcribe and align it with whisperx,
    diarize the speakers, translate the segments, synthesize one clip per
    segment, tempo-fit each clip to its segment, assemble the dubbed track,
    mix it over the attenuated original audio and mux it with the video.

    Args:
        video: Path of a local video file, or a media link for yt-dlp.
        WHISPER_MODEL_SIZE: Model name passed to whisperx.load_model.
        batch_size: Batch size passed to the transcription call.
        compute_type: Compute type passed to whisperx.load_model.
        TRANSLATE_AUDIO_TO: Target language code for translation and gTTS.
        min_speakers: Lower bound handed to the diarization pipeline.
        max_speakers: Upper bound handed to the diarization pipeline.
        tts_voice00 .. tts_voice05: Voice for SPEAKER_00 .. SPEAKER_05; the
            string 'None' means "no voice assigned".

    Returns:
        The output file name, "diar_output.mp4" (written to the working directory).

    Note:
        All intermediate files use fixed names in the working directory, and
        the Hugging Face token is read from the "My_hf_token" environment
        variable.
    """
    import glob  # local import keeps this block self-contained

    YOUR_HF_TOKEN = os.getenv("My_hf_token")

    OutputFile = 'Video.mp4'
    audio_wav = "audio.wav"
    Output_name_file = "audio_dub_solo.wav"
    mix_audio = "audio_mix.mp3"
    video_output = "diar_output.mp4"

    # Clear leftovers from a previous run.
    for stale in (Output_name_file, OutputFile, audio_wav):
        _remove_if_exists(stale)

    _prepare_media(video, OutputFile, audio_wav)
    print("Set file complete.")

    # 1. Transcribe.
    model = whisperx.load_model(
        WHISPER_MODEL_SIZE,
        device,
        compute_type=compute_type
        )
    audio = whisperx.load_audio(audio_wav)
    result = model.transcribe(audio, batch_size=batch_size)
    # Drop the reference first, otherwise the collector has nothing to free.
    del model
    gc.collect()
    torch.cuda.empty_cache()
    print("Transcript complete")

    # 2. Align the transcript.
    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"],
        device=device
        )
    result = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=True,
        )
    del model_a
    gc.collect()
    torch.cuda.empty_cache()
    print("Align complete")

    # 3. Assign speakers.
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
    diarize_segments = diarize_model(
        audio_wav,
        min_speakers=min_speakers,
        max_speakers=max_speakers)
    result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
    del diarize_model
    gc.collect()
    torch.cuda.empty_cache()
    print("Diarize complete")

    # 4. Translate.
    result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
    print("Translation complete")

    audio_files = []

    speaker_to_voice = {
        'SPEAKER_00': tts_voice00,
        'SPEAKER_01': tts_voice01,
        'SPEAKER_02': tts_voice02,
        'SPEAKER_03': tts_voice03,
        'SPEAKER_04': tts_voice04,
        'SPEAKER_05': tts_voice05
    }

    # 5. Synthesize and tempo-fit one clip per segment.
    for segment in result_diarize['segments']:
        text = segment['text']
        start = segment['start']
        end = segment['end']

        if 'speaker' not in segment:
            segment['speaker'] = "SPEAKER_99"
            print("NO SPEAKER DETECT IN SEGMENT")
        speaker = segment['speaker']

        filename = f"audio/{start}.ogg"

        voice = speaker_to_voice.get(speaker, 'None')
        if voice != 'None':
            make_voice_gradio(text, voice, filename)
        else:
            # Covers SPEAKER_99 as well as any speaker without an assigned
            # voice, so every segment gets an audio file.
            _gtts_voice(text, TRANSLATE_AUDIO_TO, filename)

        duration_true = end - start
        duration_tts = librosa.get_duration(filename=filename)
        porcentaje = _tempo_factor(duration_tts, duration_true)

        _run_command(["ffmpeg", "-y", "-loglevel", "panic", "-i", filename,
                      "-filter:a", f"atempo={porcentaje}", f"audio2/{filename}"])

        # Reading the adjusted clip fails early if ffmpeg did not produce it.
        librosa.get_duration(filename=f"audio2/{filename}")
        audio_files.append(filename)

    # Replace the raw clips with their tempo-adjusted versions.
    for adjusted in glob.glob("audio2/audio/*.ogg"):
        os.replace(adjusted, os.path.join("audio", os.path.basename(adjusted)))

    # 6. Assemble the dubbed track.
    _remove_if_exists(Output_name_file)
    create_translated_audio(result_diarize, audio_files, Output_name_file)

    _remove_if_exists("audio_dub_stereo.wav")
    _run_command(["ffmpeg", "-i", Output_name_file, "-ac", "1", "audio_dub_stereo.wav"])

    # 7. Mix the dub over the attenuated original audio.
    _remove_if_exists(mix_audio)
    _run_command(["ffmpeg", "-y", "-i", audio_wav, "-i", "audio_dub_stereo.wav",
                  "-filter_complex",
                  "[0:0]volume=0.15[a];[1:0]volume=1.90[b];[a][b]amix=inputs=2:duration=longest",
                  "-c:a", "libmp3lame", mix_audio])

    # 8. Mux the mixed audio with the video stream.
    _remove_if_exists(video_output)
    _run_command(["ffmpeg", "-i", OutputFile, "-i", mix_audio, "-c:v", "copy", "-c:a", "copy",
                  "-map", "0:v", "-map", "1:a", "-shortest", video_output])

    return video_output
|
|
|
|
|
|
| import sys |
|
|
class Logger:
    """Stand-in for sys.stdout that copies every write into a log file.

    The stream that was sys.stdout at construction time is kept as
    ``terminal``; the log file is opened in write mode as ``log``.
    """

    def __init__(self, filename):
        self.terminal = sys.stdout
        self.log = open(filename, "w")

    def write(self, message):
        """Send *message* to the terminal stream, then to the log file."""
        for stream in (self.terminal, self.log):
            stream.write(message)

    def flush(self):
        """Flush the terminal stream, then the log file."""
        for stream in (self.terminal, self.log):
            stream.flush()

    def isatty(self):
        """Always report a non-interactive stream."""
        return False


# From here on, everything printed is also written to output.log.
sys.stdout = Logger("output.log")
|
|
def read_logs():
    """Flush stdout and return the full current text of output.log."""
    sys.stdout.flush()
    with open("output.log") as log_file:
        contents = log_file.read()
    return contents
|
|
|
|
# Default voice shown in the "TTS Speaker 1" .. "TTS Speaker 6" dropdowns.
_DEFAULT_VOICES = (
    'en-AU-WilliamNeural-Male',
    'en-CA-ClaraNeural-Female',
    'en-GB-ThomasNeural-Male',
    'en-GB-SoniaNeural-Female',
    'en-NZ-MitchellNeural-Male',
    'en-GB-MaisieNeural-Female',
)


def _translation_controls():
    """Create the option widgets that both tabs share.

    Must be called inside an open gradio layout context. Widgets are created
    in display order (language, speaker counts, voices, Whisper settings) and
    returned in the order translate_from_video expects them after the media
    argument.
    """
    gr.Markdown("Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
    target_language = gr.inputs.Dropdown(['en', 'fr', 'de', 'es', 'it', 'ja', 'zh', 'nl', 'uk', 'pt'], default='en', label='Translate audio to')

    gr.Markdown("Select how many people are speaking in the video.")
    fewest_speakers = gr.inputs.Slider(1, 6, default=1, label="min_speakers", step=1)
    most_speakers = gr.inputs.Slider(1, 6, default=2, label="max_speakers", step=1)

    gr.Markdown("Select the voice you want for each speaker.")
    voices = [
        gr.inputs.Dropdown(list_tts, default=preset, label=f'TTS Speaker {position}')
        for position, preset in enumerate(_DEFAULT_VOICES, start=1)
    ]

    gr.Markdown("Default configuration of Whisper.")
    whisper_size = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
    batch = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
    compute = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")

    return (whisper_size, batch, compute, target_language,
            fewest_speakers, most_speakers, *voices)


# NOTE(review): the nesting of the layout contexts below was reconstructed
# from the statement order -- confirm it against the deployed UI.
with gr.Blocks() as demo:
    for banner in (title, description, tutorial):
        gr.Markdown(banner)

    # Tab 1: translate an uploaded video file.
    with gr.Tab("Translate audio from video"):
        with gr.Row():
            with gr.Column():
                video_input = gr.Video()
                (WHISPER_MODEL_SIZE, batch_size, compute_type,
                 TRANSLATE_AUDIO_TO, min_speakers, max_speakers,
                 tts_voice00, tts_voice01, tts_voice02,
                 tts_voice03, tts_voice04, tts_voice05) = _translation_controls()

            with gr.Column(variant='compact'):
                with gr.Row():
                    video_button = gr.Button("TRANSLATE", )
                with gr.Row():
                    video_output = gr.Video()

        video_tab_inputs = [
            video_input,
            WHISPER_MODEL_SIZE, batch_size, compute_type,
            TRANSLATE_AUDIO_TO, min_speakers, max_speakers,
            tts_voice00, tts_voice01, tts_voice02,
            tts_voice03, tts_voice04, tts_voice05,
        ]

        gr.Examples(
            examples=[
                [
                    "./assets/Video_subtitled.mp4",
                    "base",
                    16,
                    "float32",
                    "en",
                    1,
                    2,
                    'en-AU-WilliamNeural-Male',
                    'en-CA-ClaraNeural-Female',
                    'en-GB-ThomasNeural-Male',
                    'en-GB-SoniaNeural-Female',
                    'en-NZ-MitchellNeural-Male',
                    'en-GB-MaisieNeural-Female',
                ],
            ],
            fn=translate_from_video,
            inputs=list(video_tab_inputs),
            outputs=[video_output],
            cache_examples=True,
        )

    # Tab 2: translate a video fetched from a link.
    with gr.Tab("Translate audio from video link"):
        with gr.Row():
            with gr.Column():
                link_input = gr.Textbox(label="Media link. Example: www.youtube.com/watch?v=g_9rPvbENUw", placeholder="URL goes here...")
                (bWHISPER_MODEL_SIZE, bbatch_size, bcompute_type,
                 bTRANSLATE_AUDIO_TO, bmin_speakers, bmax_speakers,
                 btts_voice00, btts_voice01, btts_voice02,
                 btts_voice03, btts_voice04, btts_voice05) = _translation_controls()

            with gr.Column(variant='compact'):
                with gr.Row():
                    text_button = gr.Button("TRANSLATE")
                with gr.Row():
                    link_output = gr.Video()

        link_tab_inputs = [
            link_input,
            bWHISPER_MODEL_SIZE, bbatch_size, bcompute_type,
            bTRANSLATE_AUDIO_TO, bmin_speakers, bmax_speakers,
            btts_voice00, btts_voice01, btts_voice02,
            btts_voice03, btts_voice04, btts_voice05,
        ]

        gr.Examples(
            examples=[
                [
                    "https://www.youtube.com/watch?v=5ZeHtRKHl7Y",
                    "base",
                    16,
                    "float32",
                    "en",
                    1,
                    2,
                    'en-CA-ClaraNeural-Female',
                    'en-AU-WilliamNeural-Male',
                    'en-GB-ThomasNeural-Male',
                    'en-GB-SoniaNeural-Female',
                    'en-NZ-MitchellNeural-Male',
                    'en-GB-MaisieNeural-Female',
                ],
            ],
            fn=translate_from_video,
            inputs=list(link_tab_inputs),
            outputs=[link_output],
            cache_examples=True,
        )

    # Log viewer: re-reads output.log once per second.
    with gr.Accordion("Logs"):
        logs = gr.Textbox()
        demo.load(read_logs, None, logs, every=1)

    # Both buttons run the same pipeline with their own tab's widgets.
    video_button.click(translate_from_video, inputs=list(video_tab_inputs), outputs=video_output)
    text_button.click(translate_from_video, inputs=list(link_tab_inputs), outputs=link_output)


demo.launch(enable_queue=True)
|
|
|
|
|
|
|
|
|
|