mohitrai76 committed on
Commit
56c4c63
·
verified ·
1 Parent(s): 34a5831

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -133
app.py CHANGED
@@ -11,35 +11,7 @@ from openai import OpenAI
11
  import httpx
12
  import asyncio
13
  import gradio as gr
14
-
15
# --- Verify rubberband installation ---
def verify_rubberband():
    """Return the name of the rubberband CLI binary available on this system.

    Probes "rubberband" first, then the Debian/Ubuntu package name
    "rubberband-cli".

    Returns:
        The working command name as a string.

    Raises:
        RuntimeError: when neither binary can be executed.
    """
    try:
        try:
            subprocess.run(["rubberband", "--version"], check=True, capture_output=True)
            return "rubberband"
        except FileNotFoundError:
            # Fall back to the name the apt package installs.
            subprocess.run(["rubberband-cli", "--version"], check=True, capture_output=True)
            return "rubberband-cli"
    except Exception as e:
        raise RuntimeError(
            "Rubberband not found. Please ensure it's installed via apt:\n"
            "1. Add to space.yaml:\n"
            "   image:\n"
            "     apt:\n"
            "       packages:\n"
            "         - rubberband-cli\n"
            "2. Or install manually: sudo apt-get install rubberband-cli"
        ) from e

# Resolve the binary once at import time; None signals "unavailable" to callers.
try:
    RUBBERBAND_CMD = verify_rubberband()
    print(f"✅ Using rubberband command: {RUBBERBAND_CMD}")
except Exception as e:
    print(f"❌ {str(e)}")
    RUBBERBAND_CMD = None
43
 
44
  # --- Demucs-based vocal separation ---
45
  def separate_vocals(input_path):
@@ -82,7 +54,7 @@ def separate_vocals(input_path):
82
  class AudioProcessor:
83
  def __init__(self, device="cpu"):
84
  self.whisper_model = WhisperModel("small", device=device)
85
- self.openrouter_api_key="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"
86
  self.client = OpenAI(
87
  base_url="https://openrouter.ai/api/v1",
88
  api_key=self.openrouter_api_key,
@@ -113,37 +85,76 @@ class AudioProcessor:
113
  def translate_segments_batch(self, segments):
114
  """Translate all text segments in a single batch request"""
115
  try:
 
116
  text_segments = [seg for seg in segments if seg is not None]
 
117
  if not text_segments:
118
- return segments
119
-
120
  print(f"Translating {len(text_segments)} segments in batch...")
121
-
122
- prompt = f"""Translate the following text segments to Hindi while maintaining EXACTLY the same format:
 
 
123
  {chr(10).join(text_segments)}
124
-
125
- Rules:
126
- 1. Maintain original order and line count
127
- 2. Use natural Hindi
128
- 3. Preserve context
129
- 4. Leave proper nouns unchanged"""
130
-
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  completion = self.client.chat.completions.create(
132
  model="gpt-3.5-turbo",
133
  messages=[
134
- {"role": "system", "content": "Professional translator EN→HI"},
135
- {"role": "user", "content": prompt}
 
 
 
 
 
 
136
  ],
137
- temperature=0.1,
138
  max_tokens=2000
139
  )
140
-
141
- translations = completion.choices[0].message.content.strip().split('\n')
142
- return [translations.pop(0) if seg is not None else None for seg in segments]
143
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  except Exception as e:
145
- print(f"Translation error: {e}")
146
- return segments
147
 
148
  # --- Helper functions ---
149
  def get_audio_duration(audio_path):
@@ -154,7 +165,7 @@ def get_audio_duration(audio_path):
154
  print(f"Duration error: {e}")
155
  return None
156
 
157
- async def synthesize_tts_to_wav(text, voice, output_wav_path):
158
  import edge_tts
159
  temp_mp3 = "temp_tts.mp3"
160
  communicate = edge_tts.Communicate(text, voice)
@@ -162,135 +173,134 @@ async def synthesize_tts_to_wav(text, voice, output_wav_path):
162
 
163
  audio = AudioSegment.from_file(temp_mp3)
164
  audio = audio.set_channels(1).set_frame_rate(22050)
165
- audio.export(output_wav_path, format="wav")
 
166
  os.remove(temp_mp3)
 
167
 
168
- def stretch_audio(input_wav, output_wav, target_duration):
169
- """Time-stretch audio using rubberband with robust error handling"""
170
- if RUBBERBAND_CMD is None:
171
- raise RuntimeError("Rubberband not available - cannot process audio")
172
-
173
- try:
174
- data, sr = sf.read(input_wav)
175
- if len(data) == 0:
176
- raise ValueError("Empty audio file")
177
-
178
- tempo_ratio = target_duration / (len(data) / sr)
179
-
180
- result = subprocess.run(
181
- [RUBBERBAND_CMD, "-t", f"{tempo_ratio:.6f}", "--pitch", "1.0", input_wav, output_wav],
182
- stdout=subprocess.PIPE,
183
- stderr=subprocess.PIPE,
184
- text=True
185
- )
186
-
187
- if result.returncode != 0:
188
- error_msg = f"Rubberband failed (code {result.returncode}): {result.stderr}"
189
- print(error_msg)
190
- raise RuntimeError(error_msg)
191
-
192
- except Exception as e:
193
- print(f"Audio stretching failed: {e}")
194
- # Fallback: copy original if stretching fails
195
- shutil.copyfile(input_wav, output_wav)
196
- raise
197
 
198
  def generate_silence_wav(duration_s, output_path, sample_rate=22050):
199
  samples = np.zeros(int(duration_s * sample_rate), dtype=np.float32)
200
  sf.write(output_path, samples, sample_rate)
201
 
202
- # --- Main processing functions ---
 
 
 
 
 
203
  async def process_audio_chunks(input_audio_path, voice="hi-IN-MadhurNeural"):
204
- if RUBBERBAND_CMD is None:
205
- raise RuntimeError("System configuration error: Rubberband not available")
206
-
207
  audio_processor = AudioProcessor()
208
 
209
- print("🔎 Separating vocals...")
210
  vocals_path, background_path, temp_dir = separate_vocals(input_audio_path)
211
  if not vocals_path:
212
  return None, None
213
 
214
- print("🔎 Transcribing...")
215
  segments = audio_processor.transcribe_audio_with_pauses(vocals_path)
216
  print(f"Transcribed {len(segments)} segments.")
217
 
218
- translated_texts = audio_processor.translate_segments_batch(
219
- [seg[2] for seg in segments]
220
- )
221
-
 
 
222
  chunk_files = []
223
- for idx, ((start, end, _), translated) in enumerate(zip(segments, translated_texts)):
 
 
224
  duration = end - start
225
- chunk_id = f"{idx:03d}"
226
-
227
  if translated is None:
228
- chunk_path = f"chunk_{chunk_id}_pause.wav"
229
- generate_silence_wav(duration, chunk_path)
 
230
  else:
231
- print(f"🔤 {idx}: {translated}")
232
- raw_path = f"chunk_{chunk_id}_raw.wav"
233
- chunk_path = f"chunk_{chunk_id}_stretched.wav"
234
-
235
- await synthesize_tts_to_wav(translated, voice, raw_path)
236
- try:
237
- stretch_audio(raw_path, chunk_path, duration)
238
- except Exception:
239
- print(f"Using unstretched audio for chunk {idx}")
240
- os.remove(raw_path)
241
-
242
- chunk_files.append(chunk_path)
243
-
244
- # Combine all chunks
245
  combined_tts = AudioSegment.empty()
246
  for f in chunk_files:
247
  combined_tts += AudioSegment.from_wav(f)
248
- os.remove(f)
249
 
250
- # Mix with background
251
- background = AudioSegment.from_wav(background_path)[:len(combined_tts)]
252
- final_mix = combined_tts.overlay(background)
253
-
254
- output_path = "final_translated.wav"
 
255
  final_mix.export(output_path, format="wav")
256
-
 
 
 
 
 
257
  shutil.rmtree(temp_dir, ignore_errors=True)
258
- return output_path, background_path
259
 
260
  def gradio_interface(video_file, voice):
261
  try:
262
  # Create temporary directory for processing
263
  temp_dir = Path(tempfile.mkdtemp())
264
  input_video_path = temp_dir / "input_video.mp4"
265
-
266
  # Check if file is a video
267
  if not os.path.splitext(video_file.name)[1].lower() in ['.mp4', '.mov', '.avi', '.mkv']:
268
  raise ValueError("Invalid file type. Please upload a video file.")
269
-
270
  # Save the uploaded file to the temporary directory
271
  shutil.copyfile(video_file.name, input_video_path)
272
-
273
  # Extract audio from video
274
  audio_path, audio_temp_dir = extract_audio_from_video(str(input_video_path))
275
  if not audio_path:
276
  return None
277
-
278
  # Process audio chunks
279
  audio_output_path, background_path = asyncio.run(process_audio_chunks(audio_path, voice))
280
-
281
  if audio_output_path is None or background_path is None:
282
  return None
283
-
284
  # Combine with original video
285
  output_video_path = temp_dir / "translated_video.mp4"
286
  success = combine_video_audio(str(input_video_path), audio_output_path, str(output_video_path))
287
-
288
  if success:
289
  # Return the path to the output video
290
  return str(output_video_path)
291
  else:
292
  return None
293
-
294
  except Exception as e:
295
  print(f"Error processing video: {e}")
296
  return None
@@ -304,17 +314,17 @@ def extract_audio_from_video(video_path):
304
  """Extract audio from video file using ffmpeg"""
305
  temp_dir = tempfile.mkdtemp()
306
  audio_path = os.path.join(temp_dir, "extracted_audio.wav")
307
-
308
  try:
309
  subprocess.run([
310
  "ffmpeg", "-y", "-i", video_path,
311
  "-vn", "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
312
  audio_path
313
  ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
314
-
315
  if not os.path.exists(audio_path):
316
  raise FileNotFoundError("Audio extraction failed")
317
-
318
  return audio_path, temp_dir
319
  except Exception as e:
320
  print(f"Audio extraction error: {e}")
@@ -339,7 +349,7 @@ def combine_video_audio(video_path, audio_path, output_path):
339
  with gr.Blocks() as demo:
340
  gr.Markdown("# Video Dubbing Application")
341
  gr.Markdown("Upload a video and get a dubbed version with translated audio")
342
-
343
  with gr.Row():
344
  video_input = gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".avi", ".mkv"])
345
  voice_dropdown = gr.Dropdown(
@@ -347,14 +357,15 @@ with gr.Blocks() as demo:
347
  label="Select Voice",
348
  value="hi-IN-MadhurNeural"
349
  )
350
-
351
  output_video = gr.Video(label="Dubbed Video")
 
352
  submit_btn = gr.Button("Start Dubbing")
353
-
354
  submit_btn.click(
355
  gradio_interface,
356
  inputs=[video_input, voice_dropdown],
357
  outputs=output_video
358
  )
359
 
360
- demo.queue().launch(server_name="0.0.0.0", ssr_mode=False, debug=True)
 
11
  import httpx
12
  import asyncio
13
  import gradio as gr
14
+ import requests
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  # --- Demucs-based vocal separation ---
17
  def separate_vocals(input_path):
 
54
  class AudioProcessor:
55
  def __init__(self, device="cpu"):
56
  self.whisper_model = WhisperModel("small", device=device)
57
+ self.openrouter_api_key = "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"
58
  self.client = OpenAI(
59
  base_url="https://openrouter.ai/api/v1",
60
  api_key=self.openrouter_api_key,
 
85
def translate_segments_batch(self, segments):
    """Translate all non-None text segments to Hindi in one batched LLM call.

    Args:
        segments: list where each item is either a text string to translate
            or None (a pause placeholder that must be preserved as-is).

    Returns:
        A list aligned with *segments*: None entries stay None, text entries
        are replaced by their Hindi translation. On any failure the original
        list is returned unchanged (best-effort contract).
    """
    try:
        # Pauses are encoded as None; only real text is sent to the model.
        text_segments = [seg for seg in segments if seg is not None]

        if not text_segments:
            return segments  # Return original if no text to translate

        print(f"Translating {len(text_segments)} segments in batch...")

        # Prepare the prompt with clear formatting instructions
        prompt = f"""Translate the following Given language text segments to Hindi while maintaining EXACTLY the same format and order:

{chr(10).join(text_segments)}

IMPORTANT INSTRUCTIONS:
1. Maintain the EXACT same order and number of segments
2. Each line must be a separate translation
3. Use natural conversational Hindi
4. Preserve meaning/context
5. Leave proper nouns unchanged
6. Match original word count where possible
7. Output ONLY the translations, one per line, no numbers or bullet points
8. Do not add any additional text or explanations

Example Input:
Hello world
How are you?

Example Output:
नमस्ते दुनिया
आप कैसे हैं?
"""

        completion = self.client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional translator from Given language to Hindi. Translate exactly as requested."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=0.1,  # Lower temperature for more consistent results
            max_tokens=2000
        )

        translated_text = completion.choices[0].message.content.strip()
        # FIX: the prompt contains blank lines and an example block, and
        # models often echo blank separator lines — a naive split('\n')
        # then desynchronizes translations from segments. Strip each line
        # and drop empties so alignment is preserved.
        translations = [ln.strip() for ln in translated_text.split('\n') if ln.strip()]

        # Reconstruct the segment list, keeping pause markers in place.
        translated_segments = []
        translation_idx = 0
        for seg in segments:
            if seg is None:
                translated_segments.append(None)
            elif translation_idx < len(translations):
                translated_segments.append(translations[translation_idx])
                translation_idx += 1
            else:
                # Model returned fewer lines than expected: fall back to the
                # untranslated text rather than dropping the segment.
                translated_segments.append(seg)

        return translated_segments

    except Exception as e:
        print(f"Batch translation error: {e}")
        return segments  # Return original segments if translation fails
158
 
159
  # --- Helper functions ---
160
  def get_audio_duration(audio_path):
 
165
  print(f"Duration error: {e}")
166
  return None
167
 
168
async def synthesize_tts_to_wav(text, voice):
    """Synthesize *text* with edge-tts and return a mono 22.05 kHz WAV path.

    Args:
        text: the sentence to speak.
        voice: an edge-tts voice name (e.g. "hi-IN-MadhurNeural").

    Returns:
        Path to a newly created WAV file. The caller owns the file and is
        responsible for deleting it.
    """
    import edge_tts

    # FIX: unique temp names instead of hard-coded "temp_tts.mp3" /
    # "temp_tts.wav", so overlapping calls cannot clobber each other.
    mp3_fd, temp_mp3 = tempfile.mkstemp(suffix=".mp3")
    os.close(mp3_fd)  # edge-tts writes by path; only the name is needed
    wav_fd, output_wav = tempfile.mkstemp(suffix=".wav")
    os.close(wav_fd)

    try:
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(temp_mp3)

        audio = AudioSegment.from_file(temp_mp3)
        audio = audio.set_channels(1).set_frame_rate(22050)
        audio.export(output_wav, format="wav")
        return output_wav
    finally:
        # The intermediate MP3 is never needed after conversion.
        if os.path.exists(temp_mp3):
            os.remove(temp_mp3)
180
 
181
def stretch_audio(input_wav, target_duration, api_url="https://sox-api.onrender.com/stretch"):
    """Time-stretch *input_wav* to *target_duration* seconds via a remote SoX API.

    Args:
        input_wav: path to the source WAV file.
        target_duration: desired duration in seconds.
        api_url: stretch endpoint URL.

    Returns:
        Path to a newly created temporary WAV containing the stretched audio.
        The caller is responsible for deleting it.

    Raises:
        RuntimeError: if the API responds with a non-200 status.
    """
    # Upload the audio plus the desired duration.
    with open(input_wav, "rb") as f:
        files = {"file": f}
        data = {"target_duration": str(target_duration)}
        # FIX: a timeout so a stalled remote service cannot hang the whole
        # dubbing job indefinitely (requests.post has no default timeout).
        response = requests.post(api_url, files=files, data=data, timeout=600)

    # Check if the request was successful
    if response.status_code != 200:
        raise RuntimeError(f"API error: {response.status_code} - {response.text}")

    # FIX: mkstemp returns an open OS-level file descriptor; the original
    # discarded it ([1]), leaking one descriptor per chunk. Reuse the fd
    # for the write instead.
    fd, output_wav = tempfile.mkstemp(suffix=".wav")
    with os.fdopen(fd, "wb") as out:
        out.write(response.content)

    return output_wav
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
def generate_silence_wav(duration_s, output_path, sample_rate=22050):
    """Write *duration_s* seconds of float32 silence to *output_path* as WAV."""
    n_samples = int(duration_s * sample_rate)
    silence = np.zeros(n_samples, dtype=np.float32)
    sf.write(output_path, silence, sample_rate)
202
 
203
def cleanup_files(file_list):
    """Delete every path in *file_list* that exists; missing paths are skipped."""
    for path in filter(os.path.exists, file_list):
        os.remove(path)
207
+
208
+ # --- Main Gradio Interface ---
209
async def process_audio_chunks(input_audio_path, voice="hi-IN-MadhurNeural"):
    """Run the full dubbing pipeline for one audio file.

    Separates vocals from background, transcribes the vocals, batch-translates
    the text to Hindi, synthesizes TTS per segment (silence for pauses),
    stretches each chunk to the original segment timing, and finally overlays
    the original background music.

    Args:
        input_audio_path: path to the extracted source audio.
        voice: edge-tts voice used for synthesis.

    Returns:
        (final_audio_path, background_path) on success, (None, None) when the
        vocal separation step fails.
    """
    audio_processor = AudioProcessor()

    print("🔎 Separating vocals and music using Demucs...")
    vocals_path, background_path, temp_dir = separate_vocals(input_audio_path)
    if not vocals_path:
        return None, None

    print("🔎 Transcribing vocals...")
    segments = audio_processor.transcribe_audio_with_pauses(vocals_path)
    print(f"Transcribed {len(segments)} segments.")

    # Extract text segments (None marks a pause) and translate in one batch.
    segment_texts = [seg[2] if seg[2] is not None else None for seg in segments]
    translated_texts = audio_processor.translate_segments_batch(segment_texts)

    chunk_files = []
    for chunk_idx, ((start, end, _), translated) in enumerate(zip(segments, translated_texts), start=1):
        duration = end - start

        if translated is None:
            # Pause segment: emit matching silence to preserve timing.
            filename = f"chunk_{chunk_idx:03d}_pause.wav"
            generate_silence_wav(duration, filename)
            chunk_files.append(filename)
        else:
            print(f"🔤 {chunk_idx}: Translated: {translated}")

            # Synthesize TTS audio
            raw_tts = await synthesize_tts_to_wav(translated, voice)

            # FIX: stretching calls an external API; if it fails, fall back to
            # the unstretched TTS instead of aborting the whole job (restores
            # the fallback the previous implementation had).
            try:
                stretched = stretch_audio(raw_tts, duration)
            except Exception as e:
                print(f"Stretch failed for chunk {chunk_idx} ({e}); using unstretched audio")
                chunk_files.append(raw_tts)
            else:
                chunk_files.append(stretched)
                os.remove(raw_tts)

    # Concatenate all chunks in order.
    combined_tts = AudioSegment.empty()
    for f in chunk_files:
        combined_tts += AudioSegment.from_wav(f)

    print("🎼 Adding original background music...")
    background_music = AudioSegment.from_wav(background_path)
    background_music = background_music[:len(combined_tts)]
    final_mix = combined_tts.overlay(background_music)

    output_path = "final_translated_with_music.wav"
    final_mix.export(output_path, format="wav")
    print(f"✅ Output saved as: {output_path}")

    cleanup_files(chunk_files)
    # NOTE(review): background_path lives inside temp_dir, so the file is
    # gone after this rmtree — callers currently only None-check the path,
    # but reading it would fail; confirm before relying on it.
    shutil.rmtree(temp_dir, ignore_errors=True)
    return output_path, background_path
269
 
270
def gradio_interface(video_file, voice):
    """Gradio handler: dub an uploaded video and return the output video path.

    Args:
        video_file: uploaded file object exposing a ``.name`` path.
        voice: edge-tts voice name selected in the UI.

    Returns:
        Path to the dubbed video on success, or None on any failure
        (errors are printed, never raised, so the UI stays responsive).
    """
    audio_temp_dir = None
    try:
        # Create temporary directory for processing
        temp_dir = Path(tempfile.mkdtemp())
        input_video_path = temp_dir / "input_video.mp4"

        # Check if file is a video
        if not os.path.splitext(video_file.name)[1].lower() in ['.mp4', '.mov', '.avi', '.mkv']:
            raise ValueError("Invalid file type. Please upload a video file.")

        # Save the uploaded file to the temporary directory
        shutil.copyfile(video_file.name, input_video_path)

        # Extract audio from video
        audio_path, audio_temp_dir = extract_audio_from_video(str(input_video_path))
        if not audio_path:
            return None

        # Process audio chunks
        audio_output_path, background_path = asyncio.run(process_audio_chunks(audio_path, voice))

        if audio_output_path is None or background_path is None:
            return None

        # Combine with original video
        output_video_path = temp_dir / "translated_video.mp4"
        success = combine_video_audio(str(input_video_path), audio_output_path, str(output_video_path))

        # Return the path to the output video, or None on mux failure.
        # (temp_dir must outlive this call — the output video lives in it.)
        return str(output_video_path) if success else None

    except Exception as e:
        print(f"Error processing video: {e}")
        return None
    finally:
        # FIX: the extracted-audio temp dir was never removed, leaking one
        # directory (with a full WAV) per request.
        if audio_temp_dir:
            shutil.rmtree(audio_temp_dir, ignore_errors=True)
 
314
  """Extract audio from video file using ffmpeg"""
315
  temp_dir = tempfile.mkdtemp()
316
  audio_path = os.path.join(temp_dir, "extracted_audio.wav")
317
+
318
  try:
319
  subprocess.run([
320
  "ffmpeg", "-y", "-i", video_path,
321
  "-vn", "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
322
  audio_path
323
  ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
324
+
325
  if not os.path.exists(audio_path):
326
  raise FileNotFoundError("Audio extraction failed")
327
+
328
  return audio_path, temp_dir
329
  except Exception as e:
330
  print(f"Audio extraction error: {e}")
 
349
  with gr.Blocks() as demo:
350
  gr.Markdown("# Video Dubbing Application")
351
  gr.Markdown("Upload a video and get a dubbed version with translated audio")
352
+
353
  with gr.Row():
354
  video_input = gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".avi", ".mkv"])
355
  voice_dropdown = gr.Dropdown(
 
357
  label="Select Voice",
358
  value="hi-IN-MadhurNeural"
359
  )
360
+
361
  output_video = gr.Video(label="Dubbed Video")
362
+
363
  submit_btn = gr.Button("Start Dubbing")
364
+
365
  submit_btn.click(
366
  gradio_interface,
367
  inputs=[video_input, voice_dropdown],
368
  outputs=output_video
369
  )
370
 
371
+ demo.queue().launch(server_name="0.0.0.0", debug=True)