Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,7 +5,8 @@ import subprocess
|
|
| 5 |
import os, stat
|
| 6 |
import uuid
|
| 7 |
from googletrans import Translator
|
| 8 |
-
|
|
|
|
| 9 |
import ffmpeg
|
| 10 |
import json
|
| 11 |
from scipy.signal import wiener
|
|
@@ -24,7 +25,6 @@ from huggingface_hub import HfApi
|
|
| 24 |
import moviepy.editor as mp
|
| 25 |
|
| 26 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 27 |
-
os.environ["COQUI_TOS_AGREED"] = "1"
|
| 28 |
api = HfApi(token=HF_TOKEN)
|
| 29 |
repo_id = "artificialguybr/video-dubbing"
|
| 30 |
ZipFile("ffmpeg.zip").extractall()
|
|
@@ -121,6 +121,10 @@ def transcribe_audio(file_path):
|
|
| 121 |
|
| 122 |
return result
|
| 123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
@spaces.GPU
|
| 125 |
def process_video(radio, video, target_language, has_closeup_face):
|
| 126 |
try:
|
|
@@ -156,15 +160,34 @@ def process_video(radio, video, target_language, has_closeup_face):
|
|
| 156 |
print(f"Error encountered during transcription: {str(e)}")
|
| 157 |
raise
|
| 158 |
|
| 159 |
-
language_mapping = {
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
translator = Translator()
|
| 162 |
translated_text = translator.translate(whisper_text, dest=target_language_code).text
|
| 163 |
print(translated_text)
|
| 164 |
|
| 165 |
-
|
| 166 |
-
tts.to('cuda')
|
| 167 |
-
tts.tts_to_file(translated_text, speaker_wav=f"{run_uuid}_output_audio_final.wav", file_path=f"{run_uuid}_output_synth.wav", language=target_language_code)
|
| 168 |
|
| 169 |
pad_top = 0
|
| 170 |
pad_bottom = 15
|
|
@@ -228,7 +251,7 @@ iface = gr.Interface(
|
|
| 228 |
inputs=[
|
| 229 |
radio,
|
| 230 |
video,
|
| 231 |
-
gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese (Simplified)"], label="Target Language for Dubbing", value="Spanish"),
|
| 232 |
gr.Checkbox(
|
| 233 |
label="Video has a close-up face. Use Wav2lip.",
|
| 234 |
value=False,
|
|
@@ -246,10 +269,9 @@ with gr.Blocks() as demo:
|
|
| 246 |
radio.change(swap, inputs=[radio], outputs=video)
|
| 247 |
gr.Markdown("""
|
| 248 |
**Note:**
|
| 249 |
-
- Video limit is 1 minute. It will
|
| 250 |
- Generation may take up to 5 minutes.
|
| 251 |
-
-
|
| 252 |
-
- The tool uses open-source models for all models. It's a alpha version.
|
| 253 |
- Quality can be improved but would require more processing time per video. For scalability and hardware limitations, speed was chosen, not just quality.
|
| 254 |
- If you need more than 1 minute, duplicate the Space and change the limit on app.py.
|
| 255 |
- If you incorrectly mark the 'Video has a close-up face' checkbox, the dubbing may not work as expected.
|
|
|
|
| 5 |
import os, stat
|
| 6 |
import uuid
|
| 7 |
from googletrans import Translator
|
| 8 |
+
import edge_tts
|
| 9 |
+
import asyncio
|
| 10 |
import ffmpeg
|
| 11 |
import json
|
| 12 |
from scipy.signal import wiener
|
|
|
|
| 25 |
import moviepy.editor as mp
|
| 26 |
|
| 27 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
|
|
|
| 28 |
api = HfApi(token=HF_TOKEN)
|
| 29 |
repo_id = "artificialguybr/video-dubbing"
|
| 30 |
ZipFile("ffmpeg.zip").extractall()
|
|
|
|
| 121 |
|
| 122 |
return result
|
| 123 |
|
| 124 |
+
async def text_to_speech(text, voice, output_file):
    """Synthesize *text* with the given edge-tts voice and save it to disk.

    Args:
        text: The text to be spoken.
        voice: An edge-tts voice identifier (the caller passes names such as
            ``'es-ES-AlvaroNeural'``).
        output_file: Path the synthesized audio is written to.

    This is a coroutine; synchronous code must drive it with an event loop
    (the caller in this file uses ``asyncio.run``).
    """
    # NOTE(review): the caller hands in a ``.wav`` file name; edge-tts is
    # believed to write MP3-encoded audio regardless of the extension --
    # confirm that downstream tools detect the format from the content.
    await edge_tts.Communicate(text, voice).save(output_file)
|
| 127 |
+
|
| 128 |
@spaces.GPU
|
| 129 |
def process_video(radio, video, target_language, has_closeup_face):
|
| 130 |
try:
|
|
|
|
| 160 |
print(f"Error encountered during transcription: {str(e)}")
|
| 161 |
raise
|
| 162 |
|
| 163 |
+
language_mapping = {
|
| 164 |
+
'English': ('en', 'en-US-EricNeural'),
|
| 165 |
+
'Spanish': ('es', 'es-ES-AlvaroNeural'),
|
| 166 |
+
'French': ('fr', 'fr-FR-HenriNeural'),
|
| 167 |
+
'German': ('de', 'de-DE-ConradNeural'),
|
| 168 |
+
'Italian': ('it', 'it-IT-DiegoNeural'),
|
| 169 |
+
'Portuguese': ('pt', 'pt-PT-DuarteNeural'),
|
| 170 |
+
'Polish': ('pl', 'pl-PL-MarekNeural'),
|
| 171 |
+
'Turkish': ('tr', 'tr-TR-AhmetNeural'),
|
| 172 |
+
'Russian': ('ru', 'ru-RU-DmitryNeural'),
|
| 173 |
+
'Dutch': ('nl', 'nl-NL-MaartenNeural'),
|
| 174 |
+
'Czech': ('cs', 'cs-CZ-AntoninNeural'),
|
| 175 |
+
'Arabic': ('ar', 'ar-SA-HamedNeural'),
|
| 176 |
+
'Chinese (Simplified)': ('zh-CN', 'zh-CN-YunxiNeural'),
|
| 177 |
+
'Japanese': ('ja', 'ja-JP-KeitaNeural'),
|
| 178 |
+
'Korean': ('ko', 'ko-KR-InJoonNeural'),
|
| 179 |
+
'Hindi': ('hi', 'hi-IN-MadhurNeural'),
|
| 180 |
+
'Swedish': ('sv', 'sv-SE-MattiasNeural'),
|
| 181 |
+
'Danish': ('da', 'da-DK-JeppeNeural'),
|
| 182 |
+
'Finnish': ('fi', 'fi-FI-HarriNeural'),
|
| 183 |
+
'Greek': ('el', 'el-GR-NestorasNeural')
|
| 184 |
+
}
|
| 185 |
+
target_language_code, voice = language_mapping[target_language]
|
| 186 |
translator = Translator()
|
| 187 |
translated_text = translator.translate(whisper_text, dest=target_language_code).text
|
| 188 |
print(translated_text)
|
| 189 |
|
| 190 |
+
asyncio.run(text_to_speech(translated_text, voice, f"{run_uuid}_output_synth.wav"))
|
|
|
|
|
|
|
| 191 |
|
| 192 |
pad_top = 0
|
| 193 |
pad_bottom = 15
|
|
|
|
| 251 |
inputs=[
|
| 252 |
radio,
|
| 253 |
video,
|
| 254 |
+
gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese (Simplified)", "Japanese", "Korean", "Hindi", "Swedish", "Danish", "Finnish", "Greek"], label="Target Language for Dubbing", value="Spanish"),
|
| 255 |
gr.Checkbox(
|
| 256 |
label="Video has a close-up face. Use Wav2lip.",
|
| 257 |
value=False,
|
|
|
|
| 269 |
radio.change(swap, inputs=[radio], outputs=video)
|
| 270 |
gr.Markdown("""
|
| 271 |
**Note:**
|
| 272 |
+
- Video limit is 1 minute. It will dubbing all people using just one voice.
|
| 273 |
- Generation may take up to 5 minutes.
|
| 274 |
+
- The tool uses open-source models for all models. It's an alpha version.
|
|
|
|
| 275 |
- Quality can be improved but would require more processing time per video. For scalability and hardware limitations, speed was chosen, not just quality.
|
| 276 |
- If you need more than 1 minute, duplicate the Space and change the limit on app.py.
|
| 277 |
- If you incorrectly mark the 'Video has a close-up face' checkbox, the dubbing may not work as expected.
|