import gc
import os
import time

import numpy as np
import gradio as gr
import torch
import whisperx
import librosa
import edge_tts
from gtts import gTTS
from pydub import AudioSegment
from tqdm import tqdm
from deep_translator import GoogleTranslator

from soni_translate.audio_segments import create_translated_audio
from soni_translate.text_to_speech import make_voice
from soni_translate.translate_segments import translate_text


def translate_from_video(
    video,
    YOUR_HF_TOKEN,
    preview=False,
    WHISPER_MODEL_SIZE="large-v1",
    batch_size=16,
    compute_type="float16",
    SOURCE_LANGUAGE="Automatic detection",
    TRANSLATE_AUDIO_TO="en",
    min_speakers=1,
    max_speakers=2,
    tts_voice00="en-AU-WilliamNeural-Male",
    tts_voice01="en-CA-ClaraNeural-Female",
    tts_voice02="en-GB-ThomasNeural-Male",
    tts_voice03="en-GB-SoniaNeural-Female",
    tts_voice04="en-NZ-MitchellNeural-Male",
    tts_voice05="en-GB-MaisieNeural-Female",
    video_output="video_dub.mp4",
):
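    """Dub a video into TRANSLATE_AUDIO_TO and return the output path.

    Pipeline: transcribe the source audio with whisperx, align it for
    word-level timestamps, diarize to separate speakers, translate the
    segments, synthesize one TTS clip per segment (one voice per
    speaker), time-stretch each clip to fit its slot, mix the dubbed
    track over the original audio, and mux the result into the video.

    `video` may be a local path or a URL handled by yt-dlp. Returns
    None if the download fails or no speech is found.
    """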
    if YOUR_HF_TOKEN == "" or YOUR_HF_TOKEN is None:
        YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")

    # Working directories: per-segment TTS clips go in audio/, their
    # time-stretched copies in audio2/audio/.
    os.makedirs('audio', exist_ok=True)
    os.makedirs('audio2/audio', exist_ok=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute_type = "float32" if device == "cpu" else compute_type
    OutputFile = 'Video.mp4'
    audio_wav = "audio.wav"
    Output_name_file = "audio_dub_solo.ogg"
    mix_audio = "audio_mix.mp3"

    # Remove leftovers from a previous run.
    os.system("rm -f Video.mp4 audio.webm audio.wav")
    if os.path.exists(video):
        # Local file: re-encode to a known container and codec.
        if preview:
            print('Creating preview video, 10 seconds')
            os.system(f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4')
        else:
            os.system(f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4')

        # Extract a stereo 44.1 kHz PCM track for transcription.
        os.system("ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav")
    else:
        # The input is a URL: download with yt-dlp.
        if preview:
            print('Creating preview from link, 10 seconds')
            mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
            wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
            os.system(mp4_)
            os.system(wav_)
        else:
            mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
            wav_ = f'python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}'

            os.system(wav_)

            # Poll for up to two minutes until the audio download has
            # finished, then start the video download.
            for i in range(120):
                time.sleep(1)
                print('process audio')
                if os.path.exists(audio_wav) and not os.path.exists('audio.webm'):
                    time.sleep(1)
                    os.system(mp4_)
                    break
                if i == 119:
                    print('Error downloading the audio')
                    return

    print("Set file complete.")
    SOURCE_LANGUAGE = None if SOURCE_LANGUAGE == 'Automatic detection' else SOURCE_LANGUAGE

    # Transcribe with whisperx; language=None triggers auto-detection.
    model = whisperx.load_model(
        WHISPER_MODEL_SIZE,
        device,
        compute_type=compute_type,
        language=SOURCE_LANGUAGE,
    )
    audio = whisperx.load_audio(audio_wav)
    result = model.transcribe(audio, batch_size=batch_size)
    # Free the model before clearing the CUDA cache.
    del model
    gc.collect(); torch.cuda.empty_cache()
    print("Transcript complete")
    # Align the transcript to get accurate word-level timestamps.
    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"],
        device=device,
    )
    result = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=True,
    )
    del model_a
    gc.collect(); torch.cuda.empty_cache()
    print("Align complete")

    if not result['segments']:
        print('No active speech found in audio')
        return
    # Diarize so each word can be assigned to a speaker; this requires a
    # Hugging Face token with access to the pyannote models.
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
    diarize_segments = diarize_model(
        audio_wav,
        min_speakers=min_speakers,
        max_speakers=max_speakers,
    )
    result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
    del diarize_model
    gc.collect(); torch.cuda.empty_cache()
    print("Diarize complete")

    # Translate the segment texts into the target language.
    result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
    print("Translation complete")
    audio_files = []

    # Map each diarized speaker label to the requested TTS voice.
    speaker_to_voice = {
        'SPEAKER_00': tts_voice00,
        'SPEAKER_01': tts_voice01,
        'SPEAKER_02': tts_voice02,
        'SPEAKER_03': tts_voice03,
        'SPEAKER_04': tts_voice04,
        'SPEAKER_05': tts_voice05,
    }

    # Synthesize one clip per translated segment, one voice per speaker.
    for segment in tqdm(result_diarize['segments']):

        text = segment['text']
        start = segment['start']
        end = segment['end']
        # Segments the diarizer could not label fall back to SPEAKER_99
        # and are voiced with gTTS below.
        try:
            speaker = segment['speaker']
        except KeyError:
            segment['speaker'] = "SPEAKER_99"
            speaker = segment['speaker']
            print("NO SPEAKER DETECTED IN SEGMENT")

        # The segment start time gives each clip a unique filename.
        filename = f"audio/{start}.ogg"

        if speaker in speaker_to_voice and speaker_to_voice[speaker] != 'None':
            make_voice(text, speaker_to_voice[speaker], filename, TRANSLATE_AUDIO_TO)
        elif speaker == "SPEAKER_99":
            try:
                tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
                tts.save(filename)
                print('Using gTTS')
            except Exception:
                # gTTS failed; save a near-silent placeholder so the
                # segment keeps its slot on the timeline.
                tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
                tts.save(filename)
                print('Error: Audio will be replaced.')
        # Stretch or squeeze the TTS clip so it fits the original slot.
        duration_true = end - start
        duration_tts = librosa.get_duration(filename=filename)

        # Speed ratio between the synthesized clip and the slot it must fill.
        porcentaje = duration_tts / duration_true

        # Clamp the ratio and snap near-1.0 values to 1.0 so natural-length
        # clips are left untouched. (Note: older ffmpeg builds cap atempo
        # at 2.0, so the 2.1 ceiling may fail there.)
        if porcentaje > 2.1:
            porcentaje = 2.1
        elif porcentaje <= 1.2 and porcentaje >= 0.8:
            porcentaje = 1.0
        elif porcentaje <= 0.79:
            porcentaje = 0.8

        porcentaje = round(porcentaje, 1)

        # filename already begins with "audio/", so the stretched copy
        # lands in audio2/audio/.
        os.system(f"ffmpeg -y -loglevel panic -i {filename} -filter:a atempo={porcentaje} audio2/{filename}")

        duration_create = librosa.get_duration(filename=f"audio2/{filename}")
        audio_files.append(filename)
    # Overwrite the original clips with their time-stretched versions.
    os.system("mv -f audio2/audio/*.ogg audio/")

    # Assemble the dubbed track from the per-segment clips.
    os.system(f"rm -f {Output_name_file}")
    create_translated_audio(result_diarize, audio_files, Output_name_file)

    # Duck the original audio under the dubbed track with a sidechain
    # compressor, then merge the two into the final mix.
    os.system(f"rm -f {mix_audio}")
    os.system(f'ffmpeg -i {audio_wav} -i {Output_name_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg];[bg][mix]amerge[final]" -map [final] {mix_audio}')

    # Mux the mixed audio back into the video; streams are copied, not
    # re-encoded.
    os.system(f"rm -f {video_output}")
    os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")

    return video_output
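

# A minimal invocation sketch, not part of the original script. It assumes
# a local file named "input.mp4" (an illustrative name) and a Hugging Face
# token exported as YOUR_HF_TOKEN, which the function reads when the
# argument is empty.
if __name__ == "__main__":
    dubbed = translate_from_video(
        "input.mp4",           # local path or URL handled by yt-dlp
        YOUR_HF_TOKEN="",      # empty -> falls back to os.getenv("YOUR_HF_TOKEN")
        preview=True,          # dub only a 10-second excerpt for a quick check
        TRANSLATE_AUDIO_TO="en",
        video_output="video_dub.mp4",
    )
    print("Dubbed video:", dubbed)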