XtewaldX committed on
Commit
a6ee9c3
Β·
verified Β·
1 Parent(s): bdb4d21

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +212 -206
app.py CHANGED
@@ -2,225 +2,231 @@ import os
2
  import uuid
3
  import asyncio
4
  import subprocess
5
- import json
6
- from zipfile import ZipFile
7
- import stat
8
  import gradio as gr
9
- import ffmpeg
10
- import cv2
11
  import edge_tts
12
- from googletrans import Translator
13
- from huggingface_hub import HfApi
14
- import moviepy.editor as mp
15
- import spaces
16
-
17
- # Constants and initialization
18
- HF_TOKEN = os.environ.get("HF_TOKEN")
19
- REPO_ID = "artificialguybr/video-dubbing"
20
- MAX_VIDEO_DURATION = 60 # seconds
21
-
22
- api = HfApi(token=HF_TOKEN)
23
-
24
- # Extract and set permissions for ffmpeg
25
- ZipFile("ffmpeg.zip").extractall()
26
- st = os.stat('ffmpeg')
27
- os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
28
-
29
- language_mapping = {
30
- 'English': ('en', 'en-US-EricNeural'),
31
- 'Spanish': ('es', 'es-ES-AlvaroNeural'),
32
- 'French': ('fr', 'fr-FR-HenriNeural'),
33
- 'German': ('de', 'de-DE-ConradNeural'),
34
- 'Italian': ('it', 'it-IT-DiegoNeural'),
35
- 'Portuguese': ('pt', 'pt-PT-DuarteNeural'),
36
- 'Polish': ('pl', 'pl-PL-MarekNeural'),
37
- 'Turkish': ('tr', 'tr-TR-AhmetNeural'),
38
- 'Russian': ('ru', 'ru-RU-DmitryNeural'),
39
- 'Dutch': ('nl', 'nl-NL-MaartenNeural'),
40
- 'Czech': ('cs', 'cs-CZ-AntoninNeural'),
41
- 'Arabic': ('ar', 'ar-SA-HamedNeural'),
42
- 'Chinese (Simplified)': ('zh-CN', 'zh-CN-YunxiNeural'),
43
- 'Japanese': ('ja', 'ja-JP-KeitaNeural'),
44
- 'Korean': ('ko', 'ko-KR-InJoonNeural'),
45
- 'Hindi': ('hi', 'hi-IN-MadhurNeural'),
46
- 'Swedish': ('sv', 'sv-SE-MattiasNeural'),
47
- 'Danish': ('da', 'da-DK-JeppeNeural'),
48
- 'Finnish': ('fi', 'fi-FI-HarriNeural'),
49
- 'Greek': ('el', 'el-GR-NestorasNeural')
50
  }
51
 
52
- print("Starting the program...")
53
-
54
- def generate_unique_filename(extension):
55
- return f"{uuid.uuid4()}{extension}"
56
-
57
- def cleanup_files(*files):
58
- for file in files:
59
- if file and os.path.exists(file):
60
- os.remove(file)
61
- print(f"Removed file: {file}")
62
-
63
- @spaces.GPU(duration=90)
64
- def transcribe_audio(file_path):
65
- print(f"Starting transcription of file: {file_path}")
66
- temp_audio = None
67
-
68
- if file_path.endswith(('.mp4', '.avi', '.mov', '.flv')):
69
- print("Video file detected. Extracting audio...")
70
- try:
71
- video = mp.VideoFileClip(file_path)
72
- temp_audio = generate_unique_filename(".wav")
73
- video.audio.write_audiofile(temp_audio)
74
- file_path = temp_audio
75
- except Exception as e:
76
- print(f"Error extracting audio from video: {e}")
77
- raise
78
-
79
- output_file = generate_unique_filename(".json")
80
- command = [
81
- "insanely-fast-whisper",
82
- "--file-name", file_path,
83
- "--device-id", "0",
84
- "--model-name", "openai/whisper-large-v3",
85
- "--task", "transcribe",
86
- "--timestamp", "chunk",
87
- "--transcript-path", output_file
88
- ]
89
-
90
- try:
91
- result = subprocess.run(command, check=True, capture_output=True, text=True)
92
- print(f"Transcription output: {result.stdout}")
93
- except subprocess.CalledProcessError as e:
94
- print(f"Error running insanely-fast-whisper: {e}")
95
- raise
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  try:
98
- with open(output_file, "r") as f:
99
- transcription = json.load(f)
100
- except json.JSONDecodeError as e:
101
- print(f"Error decoding JSON: {e}")
102
- raise
103
-
104
- result = transcription.get("text", " ".join([chunk["text"] for chunk in transcription.get("chunks", [])]))
105
-
106
- cleanup_files(output_file, temp_audio)
107
-
108
- return result
109
-
110
- async def text_to_speech(text, voice, output_file):
111
- communicate = edge_tts.Communicate(text, voice)
112
- await communicate.save(output_file)
113
-
114
- @spaces.GPU
115
- def process_video(video, target_language, use_wav2lip):
116
- try:
117
- if target_language is None:
118
- raise ValueError("Please select a Target Language for Dubbing.")
119
-
120
- run_uuid = uuid.uuid4().hex[:6]
121
- output_filename = f"{run_uuid}_resized_video.mp4"
122
- ffmpeg.input(video).output(output_filename, vf='scale=-2:720').run()
123
-
124
- video_path = output_filename
125
-
126
- if not os.path.exists(video_path):
127
- raise FileNotFoundError(f"Error: {video_path} does not exist.")
128
-
129
- video_info = ffmpeg.probe(video_path)
130
- video_duration = float(video_info['streams'][0]['duration'])
131
-
132
- if video_duration > MAX_VIDEO_DURATION:
133
- cleanup_files(video_path)
134
- raise ValueError(f"Video duration exceeds {MAX_VIDEO_DURATION} seconds. Please upload a shorter video.")
135
-
136
- ffmpeg.input(video_path).output(f"{run_uuid}_output_audio.wav", acodec='pcm_s24le', ar=48000, map='a').run()
137
-
138
- subprocess.run(f"ffmpeg -y -i {run_uuid}_output_audio.wav -af lowpass=3000,highpass=100 {run_uuid}_output_audio_final.wav", shell=True, check=True)
139
-
140
- whisper_text = transcribe_audio(f"{run_uuid}_output_audio_final.wav")
141
- print(f"Transcription successful: {whisper_text}")
142
-
143
- target_language_code, voice = language_mapping[target_language]
144
- translator = Translator()
145
- translated_text = translator.translate(whisper_text, dest=target_language_code).text
146
- print(f"Translated text: {translated_text}")
147
-
148
- asyncio.run(text_to_speech(translated_text, voice, f"{run_uuid}_output_synth.wav"))
149
-
150
- if use_wav2lip:
151
- try:
152
- subprocess.run(f"python Wav2Lip/inference.py --checkpoint_path 'Wav2Lip/checkpoints/wav2lip_gan.pth' --face '{video_path}' --audio '{run_uuid}_output_synth.wav' --pads 0 15 0 0 --resize_factor 1 --nosmooth --outfile '{run_uuid}_output_video.mp4'", shell=True, check=True)
153
- except subprocess.CalledProcessError as e:
154
- print(f"Wav2Lip error: {str(e)}")
155
- gr.Warning("Wav2lip encountered an error. Falling back to simple audio replacement.")
156
- subprocess.run(f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4", shell=True, check=True)
157
- else:
158
- subprocess.run(f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4", shell=True, check=True)
159
-
160
- output_video_path = f"{run_uuid}_output_video.mp4"
161
- if not os.path.exists(output_video_path):
162
- raise FileNotFoundError(f"Error: {output_video_path} was not generated.")
163
 
164
- cleanup_files(
165
- f"{run_uuid}_resized_video.mp4",
166
- f"{run_uuid}_output_audio.wav",
167
- f"{run_uuid}_output_audio_final.wav",
168
- f"{run_uuid}_output_synth.wav"
 
 
 
 
169
  )
170
 
171
- return output_video_path, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
  except Exception as e:
174
- print(f"Error in process_video: {str(e)}")
175
- return None, f"Error: {str(e)}"
176
-
177
- # Gradio interface setup
178
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
179
- gr.Markdown("# AI Video Dubbing")
180
- gr.Markdown("This tool uses AI to dub videos into different languages. Upload a video, choose a target language, and get a dubbed version!")
181
-
 
182
  with gr.Row():
183
- with gr.Column(scale=2):
184
- video_input = gr.Video(label="Upload Video")
185
- target_language = gr.Dropdown(
186
- choices=list(language_mapping.keys()),
187
- label="Target Language for Dubbing",
188
- value="Spanish"
 
 
 
189
  )
190
- use_wav2lip = gr.Checkbox(
191
- label="Use Wav2Lip for lip sync",
192
- value=False,
193
- info="Enable this if the video has close-up faces. May not work for all videos."
 
 
 
 
194
  )
195
- submit_button = gr.Button("Process Video", variant="primary")
196
-
197
- with gr.Column(scale=2):
198
- output_video = gr.Video(label="Processed Video")
199
- error_message = gr.Textbox(label="Status/Error Message")
200
-
201
- submit_button.click(
202
- process_video,
203
- inputs=[video_input, target_language, use_wav2lip],
204
- outputs=[output_video, error_message]
205
- )
206
-
207
- gr.Markdown("""
208
- ## Notes:
209
- - Video limit is 1 minute. The tool will dub all speakers using a single voice.
210
- - Processing may take up to 5 minutes.
211
- - This is an alpha version using open-source models.
212
- - Quality vs. speed trade-off was made for scalability and hardware limitations.
213
- - For videos longer than 1 minute, please duplicate this Space and adjust the limit in the code.
214
- """)
215
-
216
- gr.Markdown("""
217
- ---
218
- Developed by [@artificialguybr](https://twitter.com/artificialguybr) using open-source tools.
219
- Special thanks to Hugging Face for GPU support and [@yeswondwer](https://twitter.com/@yeswondwerr) for the original code.
220
-
221
- Try our [Video Transcription and Translation](https://huggingface.co/spaces/artificialguybr/VIDEO-TRANSLATION-TRANSCRIPTION) tool!
222
- """)
223
-
224
- print("Launching Gradio interface...")
225
  demo.queue()
226
  demo.launch()
 
2
  import uuid
3
  import asyncio
4
  import subprocess
5
+ import shutil
6
+ import nest_asyncio
 
7
  import gradio as gr
 
 
8
  import edge_tts
9
+ from deep_translator import GoogleTranslator
10
+ from faster_whisper import WhisperModel
11
+
12
# Allow asyncio to run inside Gradio's existing event loop
# (nest_asyncio patches the loop so run_until_complete can nest).
nest_asyncio.apply()

# Load Whisper model once at startup so every request reuses it.
# small = good balance between speed and accuracy on CPU
# int8 = quantized for lower memory usage
model = WhisperModel("small", device="cpu", compute_type="int8")

# Supported target languages
# Format: "Display Name": ("translation_code", "edge_tts_voice_name")
# The translation code is fed to GoogleTranslator(target=...); the voice
# name must be a valid Microsoft Edge TTS neural voice.
# NOTE(review): voice names should be checked against `edge-tts --list-voices`
# before adding new entries — an unknown voice fails only at synthesis time.
languages = {
    "English": ("en", "en-US-EricNeural"),
    "Spanish": ("es", "es-ES-AlvaroNeural"),
    "French": ("fr", "fr-FR-HenriNeural"),
    "German": ("de", "de-DE-ConradNeural"),
    "Italian": ("it", "it-IT-DiegoNeural"),
    "Russian": ("ru", "ru-RU-DmitryNeural"),
}
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
def transcribe(audio):
    """
    Transcribe an audio file to text with the module-level faster-whisper model.

    Collects every recognized segment and returns them as one
    space-separated string (leading/trailing whitespace removed).
    """
    segments, _ = model.transcribe(audio)
    pieces = [segment.text for segment in segments]
    return " ".join(pieces).strip()
42
+
43
+
44
async def tts_async(text, voice, out):
    """
    Synthesize *text* with the given Microsoft Edge TTS neural voice.

    The generated audio is written to the file path *out*.
    """
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(out)
51
+
52
+
53
def run_tts(text, voice, out):
    """
    Run the async TTS coroutine synchronously.

    Gradio executes event handlers in worker threads; on Python >= 3.10
    asyncio.get_event_loop() raises RuntimeError in a thread that has no
    current event loop (and is deprecated for implicit loop creation), so
    we fall back to creating and installing a fresh loop. nest_asyncio
    (applied at import time) keeps run_until_complete usable even when a
    loop is already running.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # No current event loop in this (non-main) thread — make one.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    loop.run_until_complete(tts_async(text, voice, out))
60
+
61
+
62
def process(video, language, use_lipsync):
    """
    Main video dubbing pipeline.

    Steps:
      1. Resize the video to 480p (faster transcription and Wav2Lip).
      2. Extract a mono 16 kHz WAV audio track (Whisper's input format).
      3. Transcribe the audio with faster-whisper.
      4. Translate the transcript to the target language (source auto-detected).
      5. Synthesize translated speech with Edge TTS.
      6. Combine:
         - lip sync enabled: run Wav2Lip to animate mouth movements,
           falling back to a plain audio swap if Wav2Lip fails;
         - lip sync disabled: directly replace the audio track.

    Args:
        video: path of the uploaded video (gr.Video passes a filepath string).
        language: display name of the target language (a key of `languages`).
        use_lipsync: whether to run Wav2Lip.

    Returns:
        (output_video_path, status_message); output_video_path is None on error.
    """
    try:
        # Isolated per-job temp directory; the short UUID avoids path
        # collisions between concurrent users. The final video lives here,
        # so the directory is intentionally not deleted at the end.
        uid = uuid.uuid4().hex[:6]
        work_dir = f"/tmp/{uid}"
        os.makedirs(work_dir, exist_ok=True)

        # Copy the upload into our work directory so all intermediates
        # share one location.
        input_video = os.path.join(work_dir, "input.mp4")
        shutil.copy(video, input_video)

        # Step 1: resize to 480p. scale=-2:480 keeps the aspect ratio
        # (width rounded to an even number, as required by most encoders).
        resized = os.path.join(work_dir, "video.mp4")
        subprocess.run(
            ["ffmpeg", "-y", "-i", input_video, "-vf", "scale=-2:480", resized],
            check=True,
        )

        # Step 2: extract audio. -vn = no video, -ac 1 = mono,
        # -ar 16000 = 16 kHz — the format Whisper expects.
        audio = os.path.join(work_dir, "audio.wav")
        subprocess.run(
            ["ffmpeg", "-y", "-i", resized, "-vn", "-ac", "1", "-ar", "16000", audio],
            check=True,
        )

        # Step 3: speech-to-text.
        text = transcribe(audio)
        if not text:
            return None, "❌ Transcription failed or audio is silent."

        # Step 4: translate. source="auto" lets Google detect the original
        # language of the transcript.
        lang, voice = languages[language]
        translated = GoogleTranslator(source="auto", target=lang).translate(text)
        if not translated:
            return None, "❌ Translation failed."

        # Step 5: synthesize the dubbed speech (free Microsoft neural voices).
        speech = os.path.join(work_dir, "tts.wav")
        run_tts(translated, voice, speech)

        # Final output path.
        output = os.path.join(work_dir, "lipsync.mp4")

        # Step 6a: lip sync mode — Wav2Lip takes a face video + audio and
        # emits a lip-synced video. Any failure falls back to a plain swap.
        if use_lipsync:
            result = subprocess.run(
                [
                    "python", "Wav2Lip/inference.py",
                    "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
                    "--face", resized,      # input face video
                    "--audio", speech,      # new TTS audio
                    "--outfile", output,    # output lip-synced video
                ],
                capture_output=True,
                text=True,
            )

            if result.returncode != 0:
                # Surface Wav2Lip's output for debugging, then fall back.
                print(f"WAV2LIP STDERR: {result.stderr}")
                print(f"WAV2LIP STDOUT: {result.stdout}")
                _mux_audio(resized, speech, output)
                return output, f"⚠️ Wav2Lip failed, used audio replacement instead.\n{result.stderr}"

            return output, "✅ Done with lip sync!"

        # Step 6b: no lip sync — just replace the audio track.
        _mux_audio(resized, speech, output)
        return output, "✅ Done! (audio replacement, no lip sync)"

    except Exception as e:
        # Top-level boundary: report any unexpected failure in the status
        # box instead of crashing the Gradio handler.
        return None, f"❌ Error: {str(e)}"


def _mux_audio(video_path, audio_path, output_path):
    """
    Replace a video's audio track with *audio_path*, keeping the video stream.

    -c:v copy keeps the original video stream unchanged, -c:a aac encodes the
    new audio, -map 0:v:0 / -map 1:a:0 pick video from the first input and
    audio from the second. Argument-list form (shell=False) is used so file
    paths are never interpreted by a shell.
    """
    subprocess.run(
        [
            "ffmpeg", "-y",
            "-i", video_path,
            "-i", audio_path,
            "-c:v", "copy",
            "-c:a", "aac",
            "-map", "0:v:0",
            "-map", "1:a:0",
            output_path,
        ],
        check=True,
    )
190
+
191
+
192
# ---------------------------------------------------------------------------
# Gradio UI: upload + options on the left, result + status on the right.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🎬 AI Video Dubbing + Lip Sync")

    with gr.Row():
        with gr.Column():
            # Video upload widget — shows a preview before processing.
            video_input = gr.Video(label="Upload Video")

            # Target language selector, defaulting to Spanish.
            language_choice = gr.Dropdown(
                list(languages.keys()),
                value="Spanish",
                label="Target Language",
            )

            # Wav2Lip toggle — off by default (faster, works on any video);
            # only worth enabling for close-up face shots.
            lipsync_toggle = gr.Checkbox(
                label="Enable Lip Sync (Wav2Lip)",
                value=False,
                info="Enable if video has close-up face. Slower processing.",
            )

            process_button = gr.Button("β–Ά Process", variant="primary")

        with gr.Column():
            # Output video player and status/error message box.
            result_video = gr.Video(label="Result")
            status_message = gr.Textbox(label="Status", lines=3)

    # Wire the button to the dubbing pipeline.
    process_button.click(
        process,
        inputs=[video_input, language_choice, lipsync_toggle],
        outputs=[result_video, status_message],
    )

demo.queue()
demo.launch()