Files changed (1) hide show
  1. app.py +135 -208
app.py CHANGED
@@ -2,225 +2,152 @@ import os
2
  import uuid
3
  import asyncio
4
  import subprocess
5
- import json
6
- from zipfile import ZipFile
7
- import stat
8
  import gradio as gr
9
- import ffmpeg
10
- import cv2
11
  import edge_tts
12
- from googletrans import Translator
13
- from huggingface_hub import HfApi
14
- import moviepy.editor as mp
15
- import spaces
16
-
17
- # Constants and initialization
18
- HF_TOKEN = os.environ.get("HF_TOKEN")
19
- REPO_ID = "artificialguybr/video-dubbing"
20
- MAX_VIDEO_DURATION = 60 # seconds
21
-
22
- api = HfApi(token=HF_TOKEN)
23
-
24
- # Extract and set permissions for ffmpeg
25
- ZipFile("ffmpeg.zip").extractall()
26
- st = os.stat('ffmpeg')
27
- os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
28
-
29
- language_mapping = {
30
- 'English': ('en', 'en-US-EricNeural'),
31
- 'Spanish': ('es', 'es-ES-AlvaroNeural'),
32
- 'French': ('fr', 'fr-FR-HenriNeural'),
33
- 'German': ('de', 'de-DE-ConradNeural'),
34
- 'Italian': ('it', 'it-IT-DiegoNeural'),
35
- 'Portuguese': ('pt', 'pt-PT-DuarteNeural'),
36
- 'Polish': ('pl', 'pl-PL-MarekNeural'),
37
- 'Turkish': ('tr', 'tr-TR-AhmetNeural'),
38
- 'Russian': ('ru', 'ru-RU-DmitryNeural'),
39
- 'Dutch': ('nl', 'nl-NL-MaartenNeural'),
40
- 'Czech': ('cs', 'cs-CZ-AntoninNeural'),
41
- 'Arabic': ('ar', 'ar-SA-HamedNeural'),
42
- 'Chinese (Simplified)': ('zh-CN', 'zh-CN-YunxiNeural'),
43
- 'Japanese': ('ja', 'ja-JP-KeitaNeural'),
44
- 'Korean': ('ko', 'ko-KR-InJoonNeural'),
45
- 'Hindi': ('hi', 'hi-IN-MadhurNeural'),
46
- 'Swedish': ('sv', 'sv-SE-MattiasNeural'),
47
- 'Danish': ('da', 'da-DK-JeppeNeural'),
48
- 'Finnish': ('fi', 'fi-FI-HarriNeural'),
49
- 'Greek': ('el', 'el-GR-NestorasNeural')
50
  }
51
 
52
- print("Starting the program...")
53
-
54
- def generate_unique_filename(extension):
55
- return f"{uuid.uuid4()}{extension}"
56
-
57
- def cleanup_files(*files):
58
- for file in files:
59
- if file and os.path.exists(file):
60
- os.remove(file)
61
- print(f"Removed file: {file}")
62
-
63
- @spaces.GPU(duration=90)
64
- def transcribe_audio(file_path):
65
- print(f"Starting transcription of file: {file_path}")
66
- temp_audio = None
67
-
68
- if file_path.endswith(('.mp4', '.avi', '.mov', '.flv')):
69
- print("Video file detected. Extracting audio...")
70
- try:
71
- video = mp.VideoFileClip(file_path)
72
- temp_audio = generate_unique_filename(".wav")
73
- video.audio.write_audiofile(temp_audio)
74
- file_path = temp_audio
75
- except Exception as e:
76
- print(f"Error extracting audio from video: {e}")
77
- raise
78
-
79
- output_file = generate_unique_filename(".json")
80
- command = [
81
- "insanely-fast-whisper",
82
- "--file-name", file_path,
83
- "--device-id", "0",
84
- "--model-name", "openai/whisper-large-v3",
85
- "--task", "transcribe",
86
- "--timestamp", "chunk",
87
- "--transcript-path", output_file
88
- ]
89
-
90
- try:
91
- result = subprocess.run(command, check=True, capture_output=True, text=True)
92
- print(f"Transcription output: {result.stdout}")
93
- except subprocess.CalledProcessError as e:
94
- print(f"Error running insanely-fast-whisper: {e}")
95
- raise
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  try:
98
- with open(output_file, "r") as f:
99
- transcription = json.load(f)
100
- except json.JSONDecodeError as e:
101
- print(f"Error decoding JSON: {e}")
102
- raise
103
-
104
- result = transcription.get("text", " ".join([chunk["text"] for chunk in transcription.get("chunks", [])]))
105
-
106
- cleanup_files(output_file, temp_audio)
107
-
108
- return result
109
-
110
- async def text_to_speech(text, voice, output_file):
111
- communicate = edge_tts.Communicate(text, voice)
112
- await communicate.save(output_file)
113
-
114
- @spaces.GPU
115
- def process_video(video, target_language, use_wav2lip):
116
- try:
117
- if target_language is None:
118
- raise ValueError("Please select a Target Language for Dubbing.")
119
-
120
- run_uuid = uuid.uuid4().hex[:6]
121
- output_filename = f"{run_uuid}_resized_video.mp4"
122
- ffmpeg.input(video).output(output_filename, vf='scale=-2:720').run()
123
-
124
- video_path = output_filename
125
-
126
- if not os.path.exists(video_path):
127
- raise FileNotFoundError(f"Error: {video_path} does not exist.")
128
-
129
- video_info = ffmpeg.probe(video_path)
130
- video_duration = float(video_info['streams'][0]['duration'])
131
-
132
- if video_duration > MAX_VIDEO_DURATION:
133
- cleanup_files(video_path)
134
- raise ValueError(f"Video duration exceeds {MAX_VIDEO_DURATION} seconds. Please upload a shorter video.")
135
-
136
- ffmpeg.input(video_path).output(f"{run_uuid}_output_audio.wav", acodec='pcm_s24le', ar=48000, map='a').run()
137
-
138
- subprocess.run(f"ffmpeg -y -i {run_uuid}_output_audio.wav -af lowpass=3000,highpass=100 {run_uuid}_output_audio_final.wav", shell=True, check=True)
139
-
140
- whisper_text = transcribe_audio(f"{run_uuid}_output_audio_final.wav")
141
- print(f"Transcription successful: {whisper_text}")
142
-
143
- target_language_code, voice = language_mapping[target_language]
144
- translator = Translator()
145
- translated_text = translator.translate(whisper_text, dest=target_language_code).text
146
- print(f"Translated text: {translated_text}")
147
-
148
- asyncio.run(text_to_speech(translated_text, voice, f"{run_uuid}_output_synth.wav"))
149
-
150
- if use_wav2lip:
151
- try:
152
- subprocess.run(f"python Wav2Lip/inference.py --checkpoint_path 'Wav2Lip/checkpoints/wav2lip_gan.pth' --face '{video_path}' --audio '{run_uuid}_output_synth.wav' --pads 0 15 0 0 --resize_factor 1 --nosmooth --outfile '{run_uuid}_output_video.mp4'", shell=True, check=True)
153
- except subprocess.CalledProcessError as e:
154
- print(f"Wav2Lip error: {str(e)}")
155
- gr.Warning("Wav2lip encountered an error. Falling back to simple audio replacement.")
156
- subprocess.run(f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4", shell=True, check=True)
157
- else:
158
- subprocess.run(f"ffmpeg -i {video_path} -i {run_uuid}_output_synth.wav -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {run_uuid}_output_video.mp4", shell=True, check=True)
159
-
160
- output_video_path = f"{run_uuid}_output_video.mp4"
161
- if not os.path.exists(output_video_path):
162
- raise FileNotFoundError(f"Error: {output_video_path} was not generated.")
163
-
164
- cleanup_files(
165
- f"{run_uuid}_resized_video.mp4",
166
- f"{run_uuid}_output_audio.wav",
167
- f"{run_uuid}_output_audio_final.wav",
168
- f"{run_uuid}_output_synth.wav"
169
  )
170
 
171
- return output_video_path, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
  except Exception as e:
174
- print(f"Error in process_video: {str(e)}")
175
- return None, f"Error: {str(e)}"
176
-
177
- # Gradio interface setup
178
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
179
- gr.Markdown("# AI Video Dubbing")
180
- gr.Markdown("This tool uses AI to dub videos into different languages. Upload a video, choose a target language, and get a dubbed version!")
181
-
182
  with gr.Row():
183
- with gr.Column(scale=2):
184
- video_input = gr.Video(label="Upload Video")
185
- target_language = gr.Dropdown(
186
- choices=list(language_mapping.keys()),
187
- label="Target Language for Dubbing",
188
- value="Spanish"
189
- )
190
- use_wav2lip = gr.Checkbox(
191
- label="Use Wav2Lip for lip sync",
192
- value=False,
193
- info="Enable this if the video has close-up faces. May not work for all videos."
194
  )
195
- submit_button = gr.Button("Process Video", variant="primary")
196
-
197
- with gr.Column(scale=2):
198
- output_video = gr.Video(label="Processed Video")
199
- error_message = gr.Textbox(label="Status/Error Message")
200
-
201
- submit_button.click(
202
- process_video,
203
- inputs=[video_input, target_language, use_wav2lip],
204
- outputs=[output_video, error_message]
205
- )
206
-
207
- gr.Markdown("""
208
- ## Notes:
209
- - Video limit is 1 minute. The tool will dub all speakers using a single voice.
210
- - Processing may take up to 5 minutes.
211
- - This is an alpha version using open-source models.
212
- - Quality vs. speed trade-off was made for scalability and hardware limitations.
213
- - For videos longer than 1 minute, please duplicate this Space and adjust the limit in the code.
214
- """)
215
-
216
- gr.Markdown("""
217
- ---
218
- Developed by [@artificialguybr](https://twitter.com/artificialguybr) using open-source tools.
219
- Special thanks to Hugging Face for GPU support and [@yeswondwer](https://twitter.com/@yeswondwerr) for the original code.
220
-
221
- Try our [Video Transcription and Translation](https://huggingface.co/spaces/artificialguybr/VIDEO-TRANSLATION-TRANSCRIPTION) tool!
222
- """)
223
-
224
- print("Launching Gradio interface...")
225
  demo.queue()
226
  demo.launch()
 
2
  import uuid
3
  import asyncio
4
  import subprocess
5
+ import shutil
6
+ import nest_asyncio
 
7
  import gradio as gr
 
 
8
  import edge_tts
9
+ from deep_translator import GoogleTranslator
10
+ from faster_whisper import WhisperModel
11
+
12
# Patch the already-running event loop so run_until_complete() can be
# invoked from inside Gradio's own asyncio loop without raising.
nest_asyncio.apply()

# Whisper "small" on CPU with int8 quantization: a deliberate
# speed/memory compromise that is still adequate for dubbing transcripts.
model = WhisperModel("small", device="cpu", compute_type="int8")
17
+
18
# Target languages supported by the dubbing pipeline.
# Each entry maps a display name shown in the UI dropdown to a pair:
#   (ISO language code for translation, Edge TTS neural voice name).
# NOTE: insertion order matters — it defines the dropdown ordering.
languages = {
    "English": ("en", "en-US-EricNeural"),    # default US male voice
    "Spanish": ("es", "es-ES-AlvaroNeural"),  # UI default selection
    "French": ("fr", "fr-FR-HenriNeural"),
    "German": ("de", "de-DE-ConradNeural"),
    "Italian": ("it", "it-IT-DiegoNeural"),
    "Russian": ("ru", "ru-RU-DmitryNeural"),
}
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
def transcribe(audio):
    """Transcribe an audio file to plain text with the Whisper model.

    Args:
        audio: Path to an audio file readable by faster-whisper
            (the pipeline feeds it mono 16 kHz WAV).

    Returns:
        The full transcript as one string, stripped of leading/trailing
        whitespace; "" when no speech segments are recognized.
    """
    segments, _ = model.transcribe(audio)
    # join() builds the transcript in one pass instead of the original
    # quadratic `text += s.text + " "` loop; the final value is identical
    # (segments separated by a single added space, ends stripped).
    return " ".join(s.text for s in segments).strip()
36
+
37
+
38
async def tts_async(text, voice, out):
    """Synthesize *text* with the given Edge TTS voice and save it to *out*."""
    communicator = edge_tts.Communicate(text, voice)
    await communicator.save(out)
42
+
43
+
44
def run_tts(text, voice, out):
    """Synchronously drive the async TTS coroutine on the current event loop.

    Relies on nest_asyncio.apply() (module setup) so run_until_complete()
    is legal even while Gradio's own loop is already running.
    """
    asyncio.get_event_loop().run_until_complete(tts_async(text, voice, out))
48
+
49
+
50
def process(video, language):
    """Dub a video into the target language.

    Pipeline:
      1. Resize video to 480p
      2. Extract mono 16 kHz audio
      3. Transcribe audio to text (Whisper)
      4. Translate text to the target language
      5. Generate TTS speech (Edge TTS)
      6. Lip-sync with Wav2Lip (fallback: plain audio replacement)

    Args:
        video: Filepath of the uploaded video (gr.Video yields a path).
        language: Key into the module-level ``languages`` mapping.

    Returns:
        Tuple ``(output_path_or_None, status_message)`` — matches the
        (video, textbox) outputs wired up in the Gradio UI.
    """
    try:
        # gr.Video returns the file path directly
        video_path = video

        # Unique scratch directory so concurrent jobs never collide
        uid = uuid.uuid4().hex[:6]
        work_dir = f"/tmp/{uid}"
        os.makedirs(work_dir, exist_ok=True)

        # Copy the upload into the work directory
        input_video = os.path.join(work_dir, "input.mp4")
        shutil.copy(video_path, input_video)

        # Step 1: downscale to 480p for faster downstream processing
        resized = os.path.join(work_dir, "video.mp4")
        subprocess.run(
            ["ffmpeg", "-y", "-i", input_video, "-vf", "scale=-2:480", resized],
            check=True,
        )

        # Step 2: extract mono 16 kHz audio (the format Whisper expects)
        audio = os.path.join(work_dir, "audio.wav")
        subprocess.run(
            ["ffmpeg", "-y", "-i", resized, "-vn", "-ac", "1", "-ar", "16000", audio],
            check=True,
        )

        # Step 3: speech-to-text
        text = transcribe(audio)
        if not text:
            return None, "❌ Transcription failed or audio is silent."

        # Step 4: translate the transcript into the target language
        lang, voice = languages[language]
        translated = GoogleTranslator(source="auto", target=lang).translate(text)
        if not translated:
            return None, "❌ Translation failed."

        # Step 5: synthesize speech from the translated text
        speech = os.path.join(work_dir, "tts.wav")
        run_tts(translated, voice, speech)

        # Step 6: Wav2Lip lip sync; returncode checked manually so we can
        # fall back instead of raising
        output = os.path.join(work_dir, "lipsync.mp4")
        result = subprocess.run(
            [
                "python", "Wav2Lip/inference.py",
                "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
                "--face", resized,
                "--audio", speech,
                "--outfile", output,
            ],
            capture_output=True,
            text=True,
        )

        # If Wav2Lip failed — log the reason and fall back to muxing the
        # TTS track over the resized video.
        if result.returncode != 0:
            print(f"WAV2LIP STDERR: {result.stderr}")
            print(f"WAV2LIP STDOUT: {result.stdout}")
            # List form (shell=False): safe with spaces/special characters
            # in paths and consistent with the other ffmpeg calls above —
            # the original f-string + shell=True was injection-prone.
            subprocess.run(
                [
                    "ffmpeg", "-y", "-i", resized, "-i", speech,
                    "-c:v", "copy", "-c:a", "aac",
                    "-map", "0:v:0", "-map", "1:a:0", output,
                ],
                check=True,
            )
            return output, f"⚠️ Wav2Lip failed, used audio replacement instead.\n{result.stderr}"

        return output, "✅ Done!"

    except Exception as e:
        # Surface any failure to the UI status box rather than crashing
        return None, f"❌ Error: {str(e)}"
132
+
133
+
134
# Gradio UI — `demo` must keep this name: queue()/launch() below use it.
with gr.Blocks() as demo:
    gr.Markdown("# 🎬 AI Video Dubbing + Lip Sync")

    with gr.Row():
        with gr.Column():
            video_input = gr.Video(label="Upload Video")
            language_choice = gr.Dropdown(
                list(languages.keys()),
                value="Spanish",
                label="Target Language",
            )
            process_btn = gr.Button("▶ Process", variant="primary")
        with gr.Column():
            result_video = gr.Video(label="Result")
            status_box = gr.Textbox(label="Status", lines=3)

    process_btn.click(
        process,
        inputs=[video_input, language_choice],
        outputs=[result_video, status_box],
    )
151
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  demo.queue()
153
  demo.launch()