mohitrai76 committed on
Commit 924e4df · verified · 1 Parent(s): 03f7ea6

Update app.py

Files changed (1)
  1. app.py +313 -313

app.py CHANGED
@@ -1,314 +1,314 @@
The diff rewrites every line of app.py, but the only substantive change is in AudioProcessor.__init__: the hardcoded OpenRouter key, which was mistakenly passed to os.environ.get as the environment-variable name (so the lookup always returned None), is replaced by a lookup of OPENAI_API_KEY:

-        self.openrouter_api_key = os.environ.get("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")
+        self.openrouter_api_key = os.environ.get("OPENAI_API_KEY")
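
Because the key is now read from the environment, OPENAI_API_KEY must be set (for example as a Hugging Face Space secret) before the app starts; otherwise os.environ.get still returns None and every translation call fails. A minimal fail-fast sketch, not part of this commit:

    import os

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY is not set; add it as a Space secret or export it")

The full updated app.py:
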
import os
import shutil
import tempfile
import subprocess
from pathlib import Path
import numpy as np
import soundfile as sf
from pydub import AudioSegment
from faster_whisper import WhisperModel
from openai import OpenAI
import httpx
import asyncio
import gradio as gr

# --- Demucs-based vocal separation ---
def separate_vocals(input_path):
    """Use Demucs to separate vocals and background music"""
    temp_dir = tempfile.mkdtemp()
    try:
        output_dir = os.path.join(temp_dir, "separated")
        os.makedirs(output_dir, exist_ok=True)

        from demucs.separate import main as demucs_main
        import sys

        original_argv = sys.argv
        sys.argv = [
            "demucs",
            "--two-stems", "vocals",
            "-o", output_dir,
            input_path
        ]

        try:
            demucs_main()
        finally:
            sys.argv = original_argv

        base_name = Path(input_path).stem
        vocals_path = os.path.join(output_dir, "htdemucs", base_name, "vocals.wav")
        noise_path = os.path.join(output_dir, "htdemucs", base_name, "no_vocals.wav")

        if not os.path.exists(vocals_path) or not os.path.exists(noise_path):
            raise FileNotFoundError("Demucs output missing")

        return vocals_path, noise_path, temp_dir
    except Exception as e:
        print(f"Demucs error: {e}")
        shutil.rmtree(temp_dir, ignore_errors=True)
        return None, None, None
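
# Note: Demucs is driven through sys.argv rather than a Python API call; the
# "htdemucs" path component matches the current default Demucs model, whose
# weights are downloaded on first run. Callers must delete the returned
# temp_dir once vocals.wav and no_vocals.wav have been consumed.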

# --- AudioProcessor class ---
class AudioProcessor:
    def __init__(self, device="cpu"):
        self.whisper_model = WhisperModel("small.en", device=device)
        self.openrouter_api_key = os.environ.get("OPENAI_API_KEY")
        self.client = OpenAI(
            base_url="https://api.openai.com/v1",
            api_key=self.openrouter_api_key,
            http_client=httpx.Client(headers={
                "Authorization": f"Bearer {self.openrouter_api_key}",
                "HTTP-Referer": "https://github.com",
                "X-Title": "Audio Translation App"
            })
        )

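    # Note: the attribute name and the HTTP-Referer / X-Title headers appear
    # to be leftovers from an earlier OpenRouter setup; they are redundant
    # (but harmless) against api.openai.com. If OPENAI_API_KEY is unset,
    # os.environ.get returns None and the first chat completion will fail
    # with an authentication error.
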
    def transcribe_audio_with_pauses(self, audio_path):
        segments, _ = self.whisper_model.transcribe(audio_path, word_timestamps=True)
        previous_end = 0.0
        results = []

        for segment in segments:
            if segment.start > previous_end + 0.5:
                results.append((previous_end, segment.start, None))
            results.append((segment.start, segment.end, segment.text.strip()))
            previous_end = segment.end

        audio_duration = get_audio_duration(audio_path)
        if audio_duration and audio_duration > previous_end + 0.5:
            results.append((previous_end, audio_duration, None))

        return results

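    # The returned list holds (start, end, text) triples; gaps longer than
    # 0.5 s between Whisper segments, and any trailing gap, are emitted with
    # text=None so the chunk loop below can synthesise matching silence.
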
    def translate_text(self, text):
        try:
            print(f"Translating text: {text}")
            completion = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a professional translator from English to Hindi."
                    },
                    {
                        "role": "user",
                        "content": f"""Translate the following text to Hindi:
"{text}"
Guidelines:
1. Most important: every line of every segment must be in Hindi
2. Use natural conversational Hindi
3. Preserve meaning/context
4. Leave proper nouns unchanged
5. Match original word count
6. Output ONLY the translation
"""
                    }
                ],
                temperature=0.2,
                max_tokens=2000
            )
            translated = completion.choices[0].message.content.strip()
            print(f"Translated text: {translated}")
            return translated.split("Translation:")[0].strip().replace('"', '').replace("'", '')
        except Exception as e:
            print(f"Translation error: {e}")
            return None

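# Note: the split on "Translation:" guards against the model echoing a label
# despite guideline 6, and the replace() calls strip straight quotes from the
# output. On any API error the method returns None and the caller falls back
# to the untranslated English text.
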
# --- Helper functions ---
def get_audio_duration(audio_path):
    try:
        with sf.SoundFile(audio_path) as f:
            return len(f) / f.samplerate
    except Exception as e:
        print(f"Duration error: {e}")
        return None

async def synthesize_tts_to_wav(text, voice, output_wav_path):
    import edge_tts
    temp_mp3 = "temp_tts.mp3"
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(temp_mp3)

    audio = AudioSegment.from_file(temp_mp3)
    audio = audio.set_channels(1).set_frame_rate(22050)
    audio.export(output_wav_path, format="wav")
    os.remove(temp_mp3)

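# Note: edge_tts saves MP3, which pydub then converts to 22050 Hz mono WAV to
# match the silence chunks from generate_silence_wav. The fixed temp_tts.mp3
# name is not safe under concurrent requests, which is tolerable for a
# single-user demo.
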
def stretch_audio(input_wav, output_wav, target_duration):
    data, sr = sf.read(input_wav)
    if len(data) == 0:
        raise ValueError("Empty audio")

    # Rubber Band's -t is the stretch factor: output duration = input duration * ratio.
    tempo_ratio = target_duration / (len(data) / sr)
    result = subprocess.run([
        "rubberband", "-t", f"{tempo_ratio:.6f}",
        input_wav, output_wav
    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if result.returncode != 0:
        raise RuntimeError(f"Rubberband error: {result.stderr.decode()}")

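# Note: requires the Rubber Band CLI ("rubberband") on PATH; on Debian/Ubuntu
# it ships in the rubberband-cli package. Time stretching alone preserves
# pitch. The CLI's --pitch flag takes a shift in semitones, not a ratio, so
# the original "--pitch 1.0" argument raised every chunk by one semitone.
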
def generate_silence_wav(duration_s, output_path, sample_rate=22050):
    samples = np.zeros(int(duration_s * sample_rate), dtype=np.float32)
    sf.write(output_path, samples, sample_rate)

# --- Main Gradio Interface ---
async def process_audio_chunks(input_audio_path, voice="hi-IN-MadhurNeural"):
    audio_processor = AudioProcessor()

    print("🔎 Separating vocals and music using Demucs...")
    vocals_path, background_path, temp_dir = separate_vocals(input_audio_path)
    if not vocals_path:
        return None, None

    print("🔎 Transcribing vocals...")
    segments = audio_processor.transcribe_audio_with_pauses(vocals_path)
    print(f"Transcribed {len(segments)} segments.")

    chunk_files = []
    chunk_idx = 0

    for start, end, text in segments:
        duration = end - start
        chunk_idx += 1

        if text is None:
            filename = f"chunk_{chunk_idx:03d}_pause.wav"
            generate_silence_wav(duration, filename)
            chunk_files.append(filename)
        else:
            translated = audio_processor.translate_text(text) or text
            print(f"🔤 {chunk_idx}: Original: {text} → Translated: {translated}")

            raw_tts = f"chunk_{chunk_idx:03d}_raw.wav"
            stretched = f"chunk_{chunk_idx:03d}_stretched.wav"

            await synthesize_tts_to_wav(translated, voice, raw_tts)
            stretch_audio(raw_tts, stretched, duration)
            chunk_files.append(stretched)
            os.remove(raw_tts)

    combined_tts = AudioSegment.empty()
    for f in chunk_files:
        combined_tts += AudioSegment.from_wav(f)

    print("🎼 Adding original background music...")
    background_music = AudioSegment.from_wav(background_path)
    background_music = background_music[:len(combined_tts)]
    final_mix = combined_tts.overlay(background_music)

    output_path = "final_translated_with_music.wav"
    final_mix.export(output_path, format="wav")
    print(f"✅ Output saved as: {output_path}")

    final_audio_path = output_path
    final_background_path = background_path

    for f in chunk_files:
        os.remove(f)
    shutil.rmtree(temp_dir, ignore_errors=True)
    return final_audio_path, final_background_path

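# Note: final_background_path points inside temp_dir, which was just deleted,
# so callers should treat it only as a success flag, not a readable file.
# pydub aligns channels and frame rates when overlaying (AudioSegment._sync),
# which is what lets the 22050 Hz mono TTS mix with the 44100 Hz stereo
# Demucs background.
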
def gradio_interface(video_file, voice):
    try:
        # Create temporary directory for processing
        temp_dir = Path(tempfile.mkdtemp())
        input_video_path = temp_dir / "input_video.mp4"

        # Check if file is a video
        if os.path.splitext(video_file.name)[1].lower() not in ['.mp4', '.mov', '.avi', '.mkv']:
            raise ValueError("Invalid file type. Please upload a video file.")

        # Save the uploaded file to the temporary directory
        shutil.copyfile(video_file.name, input_video_path)

        # Extract audio from video
        audio_path, audio_temp_dir = extract_audio_from_video(str(input_video_path))
        if not audio_path:
            return None

        # Process audio chunks
        audio_output_path, background_path = asyncio.run(process_audio_chunks(audio_path, voice))

        if audio_output_path is None or background_path is None:
            return None

        # Combine with original video
        output_video_path = temp_dir / "translated_video.mp4"
        success = combine_video_audio(str(input_video_path), audio_output_path, str(output_video_path))

        if success:
            # Return the path to the output video
            return str(output_video_path)
        else:
            return None

    except Exception as e:
        print(f"Error processing video: {e}")
        return None
    finally:
        # Cleanup temporary files
        # Commented out for debugging purposes
        # shutil.rmtree(temp_dir, ignore_errors=True)
        pass

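# Note: the returned translated_video.mp4 lives inside temp_dir, so
# re-enabling the commented-out rmtree above would delete the file before
# Gradio can serve it. audio_temp_dir from extract_audio_from_video is never
# removed either, so temp directories accumulate across runs.
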
def extract_audio_from_video(video_path):
    """Extract audio from video file using ffmpeg"""
    temp_dir = tempfile.mkdtemp()
    audio_path = os.path.join(temp_dir, "extracted_audio.wav")

    try:
        subprocess.run([
            "ffmpeg", "-y", "-i", video_path,
            "-vn", "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
            audio_path
        ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio extraction failed")

        return audio_path, temp_dir
    except Exception as e:
        print(f"Audio extraction error: {e}")
        shutil.rmtree(temp_dir, ignore_errors=True)
        return None, None

def combine_video_audio(video_path, audio_path, output_path):
    """Combine original video with new audio track"""
    try:
        subprocess.run([
            "ffmpeg", "-y", "-i", video_path,
            "-i", audio_path,
            "-c:v", "copy", "-map", "0:v:0", "-map", "1:a:0",
            "-shortest", output_path
        ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return True
    except Exception as e:
        print(f"Video combining error: {e}")
        return False

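# Note: -c:v copy passes the video stream through untouched; the WAV track is
# re-encoded to the container's default audio codec (AAC for .mp4), and
# -shortest trims the output to the shorter of the two streams.
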
# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Video Dubbing Application")
    gr.Markdown("Upload a video and get a dubbed version with translated audio")

    with gr.Row():
        video_input = gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".avi", ".mkv"])
        voice_dropdown = gr.Dropdown(
            ["hi-IN-MadhurNeural", "hi-IN-RekhaNeural", "hi-IN-SwaraNeural"],
            label="Select Voice",
            value="hi-IN-MadhurNeural"
        )

    output_video = gr.Video(label="Dubbed Video")

    submit_btn = gr.Button("Start Dubbing")

    submit_btn.click(
        gradio_interface,
        inputs=[video_input, voice_dropdown],
        outputs=output_video
    )

demo.queue().launch(server_name="0.0.0.0", share=True)