Spaces · Runtime error
Commit 88a4625 · 1 parent: f64cb13
Update appf.py
appf.py
CHANGED
@@ -1,75 +1,66 @@
 import gradio as gr
 import subprocess
-import whisper
-from googletrans import Translator
-import asyncio
-import edge_tts
 import os
+from googletrans import Translator
+from TTS.api import TTS
+from IPython.display import Audio, display
+import ffmpeg
+import whisper
 
-# Translate Text
-def translate_text(whisper_text, whisper_language, target_language):
-    language_mapping = {
-        'English': 'en',
-        'Spanish': 'es',
-        # ... (other mappings)
-    }
-    target_language_code = language_mapping[target_language]
-    translator = Translator()
-    translated_text = translator.translate(whisper_text, src=whisper_language, dest=target_language_code).text
-    return translated_text
-
-# Generate Voice
-async def generate_voice(translated_text, target_language):
-    VOICE_MAPPING = {
-        'English': 'en-GB-SoniaNeural',
-        'Spanish': 'es-ES-PabloNeural',
-        # ... (other mappings)
-    }
-    voice = VOICE_MAPPING[target_language]
-    communicate = edge_tts.Communicate(translated_text, voice)
-    await communicate.save("output_synth.wav")
-    return "output_synth.wav"
-
-def generate_lip_synced_video(video_path, output_audio_path):
-    # Your lip-synced video generation code here
-    # ...
-    return "output_high_qual.mp4"
-
-        f.write(video.read())
-
-    loop = asyncio.get_event_loop()
-    output_audio_path = loop.run_until_complete(generate_voice(translated_text, target_language))
-
-    output_video_path = generate_lip_synced_video(video_path, output_audio_path)
-
-# Gradio Interface
+def process_video(video, high_quality, target_language):
+    try:
+        output_filename = "resized_video.mp4"
+        if high_quality:
+            ffmpeg.input(video).output(output_filename, vf='scale=-1:720').run()
+            video_path = output_filename
+        else:
+            video_path = video
+
+        ffmpeg.input(video_path).output('output_audio.wav', acodec='pcm_s24le', ar=48000, map='a').run()
+
+        model = whisper.load_model("base")
+        result = model.transcribe("output_audio.wav")
+        whisper_text = result["text"]
+        whisper_language = result['language']
+
+        language_mapping = {
+            'English': 'en',
+            'Spanish': 'es',
+            'French': 'fr',
+            'German': 'de',
+            'Italian': 'it',
+            'Portuguese': 'pt',
+            'Polish': 'pl',
+            'Turkish': 'tr',
+            'Russian': 'ru',
+            'Dutch': 'nl',
+            'Czech': 'cs',
+            'Arabic': 'ar',
+            'Chinese (Simplified)': 'zh-cn'
+        }
+        target_language_code = language_mapping[target_language]
+        translator = Translator()
+        translated_text = translator.translate(whisper_text, src=whisper_language, dest=target_language_code).text
+
+        tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1", gpu=True)
+        tts.tts_to_file(translated_text, speaker_wav='output_audio.wav', file_path="output_synth.wav", language=target_language_code)
+
+        subprocess.run(f"python inference.py --face {video_path} --audio 'output_synth.wav' --outfile 'output_high_qual.mp4'", shell=True)
+
+        return "output_high_qual.mp4"
+
+    except Exception as e:
+        return str(e)
+
 iface = gr.Interface(
-    fn=process_video,
-    inputs=[
+    fn=process_video,
+    inputs=[
+        gr.Video(),
+        gr.inputs.Checkbox(label="High Quality"),
+        gr.inputs.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese (Simplified)"], label="Target Language for Dubbing")
+    ],
+    outputs=gr.outputs.File(),
     live=False
 )
+
+iface.launch(share=True)
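The new version folds the whole pipeline into one `process_video` function: ffmpeg extracts the audio track, whisper transcribes it and detects the source language, googletrans translates the transcript, XTTS synthesizes the dubbed speech cloned from the speaker's own extracted audio (`speaker_wav='output_audio.wav'`), and an external `inference.py` does the lip sync (its `--face`/`--audio`/`--outfile` flags look like the Wav2Lip inference script, though the commit never names it). A minimal local smoke test, bypassing the Gradio UI, might look like the sketch below; the input file name is a placeholder, and it assumes a CUDA GPU (the TTS model is loaded with `gpu=True`), ffmpeg on PATH, and an `inference.py` in the working directory.

# Hypothetical smoke test for process_video; not part of the commit.
# Assumes process_video is defined as in the new appf.py. Note that
# importing appf.py directly would also run iface.launch() at module
# level, so paste the function in or factor it into its own module first.
result = process_video("sample.mp4", high_quality=True, target_language="Spanish")
# Because of the broad except clause, result is either the output video
# path ("output_high_qual.mp4") or an error message string.
print(result)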