Spaces:
Sleeping
Sleeping
import os
import subprocess
import sys

import assemblyai as aai
import gradio as gr
import moviepy.editor as mp
import requests
import torch
from TTS.api import TTS
# Set before the TTS model is constructed below.
os.environ["COQUI_TOS_AGREED"] = "1"

# Download necessary models if not already present
# Maps the local file name of each checkpoint to the URL it is fetched from.
model_files = {
    "wav2lip.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth",
    "wav2lip_gan.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip_gan.pth",
    "resnet50.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/resnet50.pth",
    "mobilenet.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth",
    "s3fd.pth": "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth"
}

device = "cpu"

# Initialize TTS model
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)


def _download_file(url, destination, chunk_size=1 << 20, timeout=60):
    """Stream *url* into *destination*, creating the parent directory.

    The payload is written to ``destination + ".part"`` and renamed only
    after the whole body has been received, so an interrupted download never
    leaves a truncated file that a later run would mistake for a valid one.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    os.makedirs(os.path.dirname(destination) or ".", exist_ok=True)
    partial_path = destination + ".part"
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of saving an HTML error page as a checkpoint.
        response.raise_for_status()
        with open(partial_path, "wb") as out:
            for chunk in response.iter_content(chunk_size=chunk_size):
                out.write(chunk)
    os.replace(partial_path, destination)


# Download models
for filename, url in model_files.items():
    # NOTE(review): every key in model_files contains "pth", so this
    # condition always selects "checkpoints" and the "face_detection" branch
    # is never taken. Kept as-is; confirm where s3fd.pth is expected to live.
    file_path = os.path.join("checkpoints" if "pth" in filename else "face_detection", filename)
    if not os.path.exists(file_path):
        print(f"Downloading {filename}...")
        _download_file(url, file_path)
# Translation class
class translation:
    """Dub a video into another language and lip-sync the result.

    ``translate_video`` runs the whole pipeline: extract the audio track,
    transcribe it with AssemblyAI, translate the transcript with Microsoft
    Translator, synthesize the translated text with the module-level ``tts``
    model, and finally run the Wav2Lip inference script on the original video
    and the synthesized audio.

    All intermediate and final files use fixed names relative to the current
    working directory, so concurrent runs overwrite each other's files.
    """

    # Language display name -> language code passed to the external services.
    LANGUAGE_CODES = {'English': 'en', 'German': 'de', 'Italian': 'it', 'Spanish': 'es'}

    EXTRACTED_AUDIO_PATH = "output_audio.wav"
    SYNTHESIZED_AUDIO_PATH = "output_synth.wav"
    OUTPUT_VIDEO_PATH = "output_video.mp4"
    CHECKPOINT_PATH = "checkpoints/wav2lip_gan.pth"
    # Update this to the actual location of inference.py
    INFERENCE_SCRIPT_PATH = "inference.py"

    def __init__(self, video_path, original_language, target_language):
        """Store the job inputs and pre-compute both language codes.

        Args:
            video_path: path of the video file to translate.
            original_language: display name of the spoken language.
            target_language: display name of the language to dub into.
        """
        self.video_path = video_path
        self.original_language = original_language
        self.target_language = target_language
        # Set the codes up front so the individual pipeline steps can be
        # called without first going through translate_video().
        self.org_language_parameters(original_language)
        self.target_language_parameters(target_language)

    def org_language_parameters(self, original_language):
        """Set ``self.lan_code`` from a display name ('' when unknown)."""
        self.lan_code = self.LANGUAGE_CODES.get(original_language, '')

    def target_language_parameters(self, target_language):
        """Set ``self.tran_code`` from a display name ('' when unknown)."""
        self.tran_code = self.LANGUAGE_CODES.get(target_language, '')

    def extract_audio(self):
        """Write the video's audio track to a WAV file and return its path.

        Raises:
            ValueError: if the video has no audio track.
        """
        video = mp.VideoFileClip(self.video_path)
        try:
            if video.audio is None:
                raise ValueError(f"No audio track found in {self.video_path!r}")
            video.audio.write_audiofile(self.EXTRACTED_AUDIO_PATH)
        finally:
            # Release the reader resources held by the clip.
            video.close()
        return self.EXTRACTED_AUDIO_PATH

    def transcribe_audio(self, audio_path):
        """Transcribe *audio_path* with AssemblyAI and return the text.

        The API key is read from the ``ASSEMBLYAI_API_KEY`` environment
        variable.

        Raises:
            RuntimeError: if AssemblyAI reports the transcript as failed.
        """
        aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY")
        config = aai.TranscriptionConfig(language_code=self.lan_code)
        transcriber = aai.Transcriber(config=config)
        transcript = transcriber.transcribe(audio_path)
        if transcript.status == aai.TranscriptStatus.error:
            raise RuntimeError(f"Transcription failed: {transcript.error}")
        return transcript.text

    def translate_text(self, transcript_text):
        """Translate *transcript_text* from ``lan_code`` to ``tran_code``.

        The subscription key is read from ``MICROSOFT_TRANSLATOR_API_KEY``.
        The region is read from ``MICROSOFT_TRANSLATOR_REGION`` and defaults
        to ``southeastasia``.

        Raises:
            requests.HTTPError: if the service answers with an error status.
        """
        base_url = "https://api.cognitive.microsofttranslator.com/translate"
        headers = {
            "Ocp-Apim-Subscription-Key": os.getenv("MICROSOFT_TRANSLATOR_API_KEY"),
            "Content-Type": "application/json",
            "Ocp-Apim-Subscription-Region": os.getenv("MICROSOFT_TRANSLATOR_REGION", "southeastasia"),
        }
        params = {"api-version": "3.0", "from": self.lan_code, "to": self.tran_code}
        body = [{"text": transcript_text}]
        response = requests.post(base_url, headers=headers, params=params, json=body, timeout=60)
        # Surface the HTTP failure itself rather than a KeyError raised while
        # indexing an error payload.
        response.raise_for_status()
        return response.json()[0]["translations"][0]["text"]

    def generate_audio(self, translated_text):
        """Synthesize *translated_text* and return the output WAV path.

        The extracted original audio is passed as ``speaker_wav`` to the
        module-level ``tts`` model.
        """
        tts.tts_to_file(
            text=translated_text,
            speaker_wav=self.EXTRACTED_AUDIO_PATH,
            file_path=self.SYNTHESIZED_AUDIO_PATH,
            language=self.tran_code,
        )
        return self.SYNTHESIZED_AUDIO_PATH

    @staticmethod
    def _inference_command(face_path, audio_path, outfile="output_video.mp4",
                           checkpoint_path="checkpoints/wav2lip_gan.pth",
                           script_path="inference.py"):
        """Build the Wav2Lip inference command as an argument list.

        Each path is its own list element, so spaces or shell metacharacters
        in an uploaded file name cannot split or inject arguments.
        """
        return [
            sys.executable or "python",
            script_path,
            "--checkpoint_path", checkpoint_path,
            "--face", face_path,
            "--audio", audio_path,
            "--outfile", outfile,
        ]

    def translate_video(self):
        """Run the full pipeline and return the path of the dubbed video.

        Raises:
            subprocess.CalledProcessError: if the inference script exits with
                a non-zero status. This prevents returning a stale output
                file left over from a previous run.
        """
        audio_path = self.extract_audio()
        self.org_language_parameters(self.original_language)
        self.target_language_parameters(self.target_language)
        transcript_text = self.transcribe_audio(audio_path)
        translated_text = self.translate_text(transcript_text)
        translated_audio_path = self.generate_audio(translated_text)
        # Run Wav2Lip inference (update INFERENCE_SCRIPT_PATH to the path of inference.py)
        command = self._inference_command(
            self.video_path,
            translated_audio_path,
            outfile=self.OUTPUT_VIDEO_PATH,
            checkpoint_path=self.CHECKPOINT_PATH,
            script_path=self.INFERENCE_SCRIPT_PATH,
        )
        subprocess.run(command, check=True)
        return self.OUTPUT_VIDEO_PATH
# Gradio Interface
def app(video_path, original_language, target_language):
    """Gradio callback: translate one video and return the result path.

    Builds a ``translation`` job from the three inputs and returns whatever
    its ``translate_video`` method returns.
    """
    return translation(video_path, original_language, target_language).translate_video()
def _language_dropdown(label):
    """Return a dropdown listing the four supported language names."""
    return gr.Dropdown(["English", "German", "Italian", "Spanish"], label=label)


# Inputs are passed to ``app`` positionally, in this order.
_interface_inputs = [
    gr.Video(label="Video Path"),
    _language_dropdown("Original Language"),
    _language_dropdown("Targeted Language"),
]
_interface_output = gr.Video(label="Translated Video")

interface = gr.Interface(fn=app, inputs=_interface_inputs, outputs=_interface_output)
interface.launch()
| # import os | |
| # import requests | |
| # import gradio as gr | |
| # import moviepy.editor as mp | |
| # from TTS.api import TTS | |
| # import torch | |
| # import assemblyai as aai | |
| # os.environ["COQUI_TOS_AGREED"] = "1" | |
| # # Download necessary models if not already present | |
| # model_files = { | |
| # "wav2lip.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth", | |
| # "wav2lip_gan.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip_gan.pth", | |
| # "resnet50.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/resnet50.pth", | |
| # "mobilenet.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth", | |
| # "s3fd.pth": "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth" | |
| # } | |
| # device = "cpu" | |
| # tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) | |
| # for filename, url in model_files.items(): | |
| # file_path = os.path.join("checkpoints" if "pth" in filename else "face_detection", filename) | |
| # if not os.path.exists(file_path): | |
| # print(f"Downloading {filename}...") | |
| # r = requests.get(url) | |
| # with open(file_path, 'wb') as f: | |
| # f.write(r.content) | |
| # # Translation class | |
| # class translation: | |
| # def __init__(self, video_path, original_language, target_language): | |
| # self.video_path = video_path | |
| # self.original_language = original_language | |
| # self.target_language = target_language | |
| # def org_language_parameters(self, original_language): | |
| # language_codes = {'English': 'en', 'German': 'de', 'Italian': 'it', 'Spanish': 'es'} | |
| # self.lan_code = language_codes.get(original_language, '') | |
| # def target_language_parameters(self, target_language): | |
| # language_codes = {'English': 'en', 'German': 'de', 'Italian': 'it', 'Spanish': 'es'} | |
| # self.tran_code = language_codes.get(target_language, '') | |
| # def extract_audio(self): | |
| # video = mp.VideoFileClip(self.video_path) | |
| # audio = video.audio | |
| # audio_path = "output_audio.wav" | |
| # audio.write_audiofile(audio_path) | |
| # return audio_path | |
| # def transcribe_audio(self, audio_path): | |
| # aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY") | |
| # config = aai.TranscriptionConfig(language_code=self.lan_code) | |
| # transcriber = aai.Transcriber(config=config) | |
| # transcript = transcriber.transcribe(audio_path) | |
| # return transcript.text | |
| # def translate_text(self, transcript_text): | |
| # base_url = "https://api.cognitive.microsofttranslator.com/translate" | |
| # headers = { | |
| # "Ocp-Apim-Subscription-Key": os.getenv("MICROSOFT_TRANSLATOR_API_KEY"), | |
| # "Content-Type": "application/json", | |
| # "Ocp-Apim-Subscription-Region": "southeastasia" | |
| # } | |
| # params = {"api-version": "3.0", "from": self.lan_code, "to": self.tran_code} | |
| # body = [{"text": transcript_text}] | |
| # response = requests.post(base_url, headers=headers, params=params, json=body) | |
| # translation = response.json()[0]["translations"][0]["text"] | |
| # return translation | |
| # def generate_audio(self, translated_text): | |
| # tts.tts_to_file(text=translated_text, speaker_wav='output_audio.wav', file_path="output_synth.wav", language=self.tran_code) | |
| # return "output_synth.wav" | |
| # def translate_video(self): | |
| # audio_path = self.extract_audio() | |
| # self.org_language_parameters(self.original_language) | |
| # self.target_language_parameters(self.target_language) | |
| # transcript_text = self.transcribe_audio(audio_path) | |
| # translated_text = self.translate_text(transcript_text) | |
| # translated_audio_path = self.generate_audio(translated_text) | |
| # # Run Wav2Lip inference | |
| # os.system(f"python inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' --face {self.video_path} --audio {translated_audio_path} --outfile 'output_video.mp4'") | |
| # return 'output_video.mp4' | |
| # # Gradio Interface | |
| # def app(video_path, original_language, target_language): | |
| # translator = translation(video_path, original_language, target_language) | |
| # video_file = translator.translate_video() | |
| # return video_file | |
| # interface = gr.Interface( | |
| # fn=app, | |
| # inputs=[ | |
| # gr.Video(label="Video Path"), | |
| # gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Original Language"), | |
| # gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Targeted Language"), | |
| # ], | |
| # outputs=gr.Video(label="Translated Video") | |
| # ) | |
| # interface.launch() |