mohitrai76 committed on
Commit
56c4c63
·
verified ·
1 Parent(s): 34a5831

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -133
app.py CHANGED
@@ -11,35 +11,7 @@ from openai import OpenAI
11
  import httpx
12
  import asyncio
13
  import gradio as gr
14
-
15
# --- Verify rubberband installation ---
def verify_rubberband():
    """Return the name of the rubberband CLI binary available on this system.

    Probes "rubberband" first, then the Debian/Ubuntu package name
    "rubberband-cli".

    Returns:
        The working command name as a string.

    Raises:
        RuntimeError: when neither binary can be executed.
    """
    try:
        try:
            subprocess.run(["rubberband", "--version"], check=True, capture_output=True)
            return "rubberband"
        except FileNotFoundError:
            # Fall back to the name the apt package installs.
            subprocess.run(["rubberband-cli", "--version"], check=True, capture_output=True)
            return "rubberband-cli"
    except Exception as e:
        raise RuntimeError(
            "Rubberband not found. Please ensure it's installed via apt:\n"
            "1. Add to space.yaml:\n"
            "   image:\n"
            "     apt:\n"
            "       packages:\n"
            "         - rubberband-cli\n"
            "2. Or install manually: sudo apt-get install rubberband-cli"
        ) from e

# Resolve the binary once at import time; None signals "unavailable" to callers.
try:
    RUBBERBAND_CMD = verify_rubberband()
    print(f"✅ Using rubberband command: {RUBBERBAND_CMD}")
except Exception as e:
    print(f"❌ {str(e)}")
    RUBBERBAND_CMD = None
43
 
44
  # --- Demucs-based vocal separation ---
45
  def separate_vocals(input_path):
@@ -82,7 +54,7 @@ def separate_vocals(input_path):
82
  class AudioProcessor:
83
  def __init__(self, device="cpu"):
84
  self.whisper_model = WhisperModel("small", device=device)
85
- self.openrouter_api_key="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"
86
  self.client = OpenAI(
87
  base_url="https://openrouter.ai/api/v1",
88
  api_key=self.openrouter_api_key,
@@ -113,37 +85,76 @@ class AudioProcessor:
113
  def translate_segments_batch(self, segments):
114
  """Translate all text segments in a single batch request"""
115
  try:
 
116
  text_segments = [seg for seg in segments if seg is not None]
 
117
  if not text_segments:
118
- return segments
119
-
120
  print(f"Translating {len(text_segments)} segments in batch...")
121
-
122
- prompt = f"""Translate the following text segments to Hindi while maintaining EXACTLY the same format:
 
 
123
  {chr(10).join(text_segments)}
124
-
125
- Rules:
126
- 1. Maintain original order and line count
127
- 2. Use natural Hindi
128
- 3. Preserve context
129
- 4. Leave proper nouns unchanged"""
130
-
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  completion = self.client.chat.completions.create(
132
  model="gpt-3.5-turbo",
133
  messages=[
134
- {"role": "system", "content": "Professional translator EN→HI"},
135
- {"role": "user", "content": prompt}
 
 
 
 
 
 
136
  ],
137
- temperature=0.1,
138
  max_tokens=2000
139
  )
140
-
141
- translations = completion.choices[0].message.content.strip().split('\n')
142
- return [translations.pop(0) if seg is not None else None for seg in segments]
143
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  except Exception as e:
145
- print(f"Translation error: {e}")
146
- return segments
147
 
148
  # --- Helper functions ---
149
  def get_audio_duration(audio_path):
@@ -154,7 +165,7 @@ def get_audio_duration(audio_path):
154
  print(f"Duration error: {e}")
155
  return None
156
 
157
- async def synthesize_tts_to_wav(text, voice, output_wav_path):
158
  import edge_tts
159
  temp_mp3 = "temp_tts.mp3"
160
  communicate = edge_tts.Communicate(text, voice)
@@ -162,135 +173,134 @@ async def synthesize_tts_to_wav(text, voice, output_wav_path):
162
 
163
  audio = AudioSegment.from_file(temp_mp3)
164
  audio = audio.set_channels(1).set_frame_rate(22050)
165
- audio.export(output_wav_path, format="wav")
 
166
  os.remove(temp_mp3)
 
167
 
168
- def stretch_audio(input_wav, output_wav, target_duration):
169
- """Time-stretch audio using rubberband with robust error handling"""
170
- if RUBBERBAND_CMD is None:
171
- raise RuntimeError("Rubberband not available - cannot process audio")
172
-
173
- try:
174
- data, sr = sf.read(input_wav)
175
- if len(data) == 0:
176
- raise ValueError("Empty audio file")
177
-
178
- tempo_ratio = target_duration / (len(data) / sr)
179
-
180
- result = subprocess.run(
181
- [RUBBERBAND_CMD, "-t", f"{tempo_ratio:.6f}", "--pitch", "1.0", input_wav, output_wav],
182
- stdout=subprocess.PIPE,
183
- stderr=subprocess.PIPE,
184
- text=True
185
- )
186
-
187
- if result.returncode != 0:
188
- error_msg = f"Rubberband failed (code {result.returncode}): {result.stderr}"
189
- print(error_msg)
190
- raise RuntimeError(error_msg)
191
-
192
- except Exception as e:
193
- print(f"Audio stretching failed: {e}")
194
- # Fallback: copy original if stretching fails
195
- shutil.copyfile(input_wav, output_wav)
196
- raise
197
 
198
  def generate_silence_wav(duration_s, output_path, sample_rate=22050):
199
  samples = np.zeros(int(duration_s * sample_rate), dtype=np.float32)
200
  sf.write(output_path, samples, sample_rate)
201
 
202
- # --- Main processing functions ---
 
 
 
 
 
203
  async def process_audio_chunks(input_audio_path, voice="hi-IN-MadhurNeural"):
204
- if RUBBERBAND_CMD is None:
205
- raise RuntimeError("System configuration error: Rubberband not available")
206
-
207
  audio_processor = AudioProcessor()
208
 
209
- print("🔎 Separating vocals...")
210
  vocals_path, background_path, temp_dir = separate_vocals(input_audio_path)
211
  if not vocals_path:
212
  return None, None
213
 
214
- print("🔎 Transcribing...")
215
  segments = audio_processor.transcribe_audio_with_pauses(vocals_path)
216
  print(f"Transcribed {len(segments)} segments.")
217
 
218
- translated_texts = audio_processor.translate_segments_batch(
219
- [seg[2] for seg in segments]
220
- )
221
-
 
 
222
  chunk_files = []
223
- for idx, ((start, end, _), translated) in enumerate(zip(segments, translated_texts)):
 
 
224
  duration = end - start
225
- chunk_id = f"{idx:03d}"
226
-
227
  if translated is None:
228
- chunk_path = f"chunk_{chunk_id}_pause.wav"
229
- generate_silence_wav(duration, chunk_path)
 
230
  else:
231
- print(f"🔤 {idx}: {translated}")
232
- raw_path = f"chunk_{chunk_id}_raw.wav"
233
- chunk_path = f"chunk_{chunk_id}_stretched.wav"
234
-
235
- await synthesize_tts_to_wav(translated, voice, raw_path)
236
- try:
237
- stretch_audio(raw_path, chunk_path, duration)
238
- except Exception:
239
- print(f"Using unstretched audio for chunk {idx}")
240
- os.remove(raw_path)
241
-
242
- chunk_files.append(chunk_path)
243
-
244
- # Combine all chunks
245
  combined_tts = AudioSegment.empty()
246
  for f in chunk_files:
247
  combined_tts += AudioSegment.from_wav(f)
248
- os.remove(f)
249
 
250
- # Mix with background
251
- background = AudioSegment.from_wav(background_path)[:len(combined_tts)]
252
- final_mix = combined_tts.overlay(background)
253
-
254
- output_path = "final_translated.wav"
 
255
  final_mix.export(output_path, format="wav")
256
-
 
 
 
 
 
257
  shutil.rmtree(temp_dir, ignore_errors=True)
258
- return output_path, background_path
259
 
260
  def gradio_interface(video_file, voice):
261
  try:
262
  # Create temporary directory for processing
263
  temp_dir = Path(tempfile.mkdtemp())
264
  input_video_path = temp_dir / "input_video.mp4"
265
-
266
  # Check if file is a video
267
  if not os.path.splitext(video_file.name)[1].lower() in ['.mp4', '.mov', '.avi', '.mkv']:
268
  raise ValueError("Invalid file type. Please upload a video file.")
269
-
270
  # Save the uploaded file to the temporary directory
271
  shutil.copyfile(video_file.name, input_video_path)
272
-
273
  # Extract audio from video
274
  audio_path, audio_temp_dir = extract_audio_from_video(str(input_video_path))
275
  if not audio_path:
276
  return None
277
-
278
  # Process audio chunks
279
  audio_output_path, background_path = asyncio.run(process_audio_chunks(audio_path, voice))
280
-
281
  if audio_output_path is None or background_path is None:
282
  return None
283
-
284
  # Combine with original video
285
  output_video_path = temp_dir / "translated_video.mp4"
286
  success = combine_video_audio(str(input_video_path), audio_output_path, str(output_video_path))
287
-
288
  if success:
289
  # Return the path to the output video
290
  return str(output_video_path)
291
  else:
292
  return None
293
-
294
  except Exception as e:
295
  print(f"Error processing video: {e}")
296
  return None
@@ -304,17 +314,17 @@ def extract_audio_from_video(video_path):
304
  """Extract audio from video file using ffmpeg"""
305
  temp_dir = tempfile.mkdtemp()
306
  audio_path = os.path.join(temp_dir, "extracted_audio.wav")
307
-
308
  try:
309
  subprocess.run([
310
  "ffmpeg", "-y", "-i", video_path,
311
  "-vn", "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
312
  audio_path
313
  ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
314
-
315
  if not os.path.exists(audio_path):
316
  raise FileNotFoundError("Audio extraction failed")
317
-
318
  return audio_path, temp_dir
319
  except Exception as e:
320
  print(f"Audio extraction error: {e}")
@@ -339,7 +349,7 @@ def combine_video_audio(video_path, audio_path, output_path):
339
  with gr.Blocks() as demo:
340
  gr.Markdown("# Video Dubbing Application")
341
  gr.Markdown("Upload a video and get a dubbed version with translated audio")
342
-
343
  with gr.Row():
344
  video_input = gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".avi", ".mkv"])
345
  voice_dropdown = gr.Dropdown(
@@ -347,14 +357,15 @@ with gr.Blocks() as demo:
347
  label="Select Voice",
348
  value="hi-IN-MadhurNeural"
349
  )
350
-
351
  output_video = gr.Video(label="Dubbed Video")
 
352
  submit_btn = gr.Button("Start Dubbing")
353
-
354
  submit_btn.click(
355
  gradio_interface,
356
  inputs=[video_input, voice_dropdown],
357
  outputs=output_video
358
  )
359
 
360
- demo.queue().launch(server_name="0.0.0.0", ssr_mode=False, debug=True)
 
11
  import httpx
12
  import asyncio
13
  import gradio as gr
14
+ import requests
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  # --- Demucs-based vocal separation ---
17
  def separate_vocals(input_path):
 
54
  class AudioProcessor:
55
  def __init__(self, device="cpu"):
56
  self.whisper_model = WhisperModel("small", device=device)
57
+ self.openrouter_api_key = "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"
58
  self.client = OpenAI(
59
  base_url="https://openrouter.ai/api/v1",
60
  api_key=self.openrouter_api_key,
 
85
def translate_segments_batch(self, segments):
    """Translate all non-None text segments to Hindi in one batched LLM call.

    Args:
        segments: list where each item is either a text string to translate
            or None (a pause placeholder that must be preserved as-is).

    Returns:
        A list aligned with *segments*: None entries stay None, text entries
        are replaced by their Hindi translation. On any failure the original
        list is returned unchanged (best-effort contract).
    """
    try:
        # Pauses are encoded as None; only real text is sent to the model.
        text_segments = [seg for seg in segments if seg is not None]

        if not text_segments:
            return segments  # Return original if no text to translate

        print(f"Translating {len(text_segments)} segments in batch...")

        # Prepare the prompt with clear formatting instructions
        prompt = f"""Translate the following Given language text segments to Hindi while maintaining EXACTLY the same format and order:

{chr(10).join(text_segments)}

IMPORTANT INSTRUCTIONS:
1. Maintain the EXACT same order and number of segments
2. Each line must be a separate translation
3. Use natural conversational Hindi
4. Preserve meaning/context
5. Leave proper nouns unchanged
6. Match original word count where possible
7. Output ONLY the translations, one per line, no numbers or bullet points
8. Do not add any additional text or explanations

Example Input:
Hello world
How are you?

Example Output:
नमस्ते दुनिया
आप कैसे हैं?
"""

        completion = self.client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional translator from Given language to Hindi. Translate exactly as requested."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=0.1,  # Lower temperature for more consistent results
            max_tokens=2000
        )

        translated_text = completion.choices[0].message.content.strip()
        # FIX: the prompt contains blank lines and an example block, and
        # models often echo blank separator lines — a naive split('\n')
        # then desynchronizes translations from segments. Strip each line
        # and drop empties so alignment is preserved.
        translations = [ln.strip() for ln in translated_text.split('\n') if ln.strip()]

        # Reconstruct the segment list, keeping pause markers in place.
        translated_segments = []
        translation_idx = 0
        for seg in segments:
            if seg is None:
                translated_segments.append(None)
            elif translation_idx < len(translations):
                translated_segments.append(translations[translation_idx])
                translation_idx += 1
            else:
                # Model returned fewer lines than expected: fall back to the
                # untranslated text rather than dropping the segment.
                translated_segments.append(seg)

        return translated_segments

    except Exception as e:
        print(f"Batch translation error: {e}")
        return segments  # Return original segments if translation fails
158
 
159
  # --- Helper functions ---
160
  def get_audio_duration(audio_path):
 
165
  print(f"Duration error: {e}")
166
  return None
167
 
168
async def synthesize_tts_to_wav(text, voice):
    """Synthesize *text* with edge-tts and return a mono 22.05 kHz WAV path.

    Args:
        text: the sentence to speak.
        voice: an edge-tts voice name (e.g. "hi-IN-MadhurNeural").

    Returns:
        Path to a newly created WAV file. The caller owns the file and is
        responsible for deleting it.
    """
    import edge_tts

    # FIX: unique temp names instead of hard-coded "temp_tts.mp3" /
    # "temp_tts.wav", so overlapping calls cannot clobber each other.
    mp3_fd, temp_mp3 = tempfile.mkstemp(suffix=".mp3")
    os.close(mp3_fd)  # edge-tts writes by path; only the name is needed
    wav_fd, output_wav = tempfile.mkstemp(suffix=".wav")
    os.close(wav_fd)

    try:
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(temp_mp3)

        audio = AudioSegment.from_file(temp_mp3)
        audio = audio.set_channels(1).set_frame_rate(22050)
        audio.export(output_wav, format="wav")
        return output_wav
    finally:
        # The intermediate MP3 is never needed after conversion.
        if os.path.exists(temp_mp3):
            os.remove(temp_mp3)
180
 
181
def stretch_audio(input_wav, target_duration, api_url="https://sox-api.onrender.com/stretch"):
    """Time-stretch *input_wav* to *target_duration* seconds via a remote SoX API.

    Args:
        input_wav: path to the source WAV file.
        target_duration: desired duration in seconds.
        api_url: stretch endpoint URL.

    Returns:
        Path to a newly created temporary WAV containing the stretched audio.
        The caller is responsible for deleting it.

    Raises:
        RuntimeError: if the API responds with a non-200 status.
    """
    # Upload the audio plus the desired duration.
    with open(input_wav, "rb") as f:
        files = {"file": f}
        data = {"target_duration": str(target_duration)}
        # FIX: a timeout so a stalled remote service cannot hang the whole
        # dubbing job indefinitely (requests.post has no default timeout).
        response = requests.post(api_url, files=files, data=data, timeout=600)

    # Check if the request was successful
    if response.status_code != 200:
        raise RuntimeError(f"API error: {response.status_code} - {response.text}")

    # FIX: mkstemp returns an open OS-level file descriptor; the original
    # discarded it ([1]), leaking one descriptor per chunk. Reuse the fd
    # for the write instead.
    fd, output_wav = tempfile.mkstemp(suffix=".wav")
    with os.fdopen(fd, "wb") as out:
        out.write(response.content)

    return output_wav
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
def generate_silence_wav(duration_s, output_path, sample_rate=22050):
    """Write *duration_s* seconds of float32 silence to *output_path* as WAV."""
    n_samples = int(duration_s * sample_rate)
    silence = np.zeros(n_samples, dtype=np.float32)
    sf.write(output_path, silence, sample_rate)
202
 
203
def cleanup_files(file_list):
    """Delete every path in *file_list* that exists; missing paths are skipped."""
    for path in filter(os.path.exists, file_list):
        os.remove(path)
207
+
208
+ # --- Main Gradio Interface ---
209
async def process_audio_chunks(input_audio_path, voice="hi-IN-MadhurNeural"):
    """Run the full dubbing pipeline for one audio file.

    Separates vocals from background, transcribes the vocals, batch-translates
    the text to Hindi, synthesizes TTS per segment (silence for pauses),
    stretches each chunk to the original segment timing, and finally overlays
    the original background music.

    Args:
        input_audio_path: path to the extracted source audio.
        voice: edge-tts voice used for synthesis.

    Returns:
        (final_audio_path, background_path) on success, (None, None) when the
        vocal separation step fails.
    """
    audio_processor = AudioProcessor()

    print("🔎 Separating vocals and music using Demucs...")
    vocals_path, background_path, temp_dir = separate_vocals(input_audio_path)
    if not vocals_path:
        return None, None

    print("🔎 Transcribing vocals...")
    segments = audio_processor.transcribe_audio_with_pauses(vocals_path)
    print(f"Transcribed {len(segments)} segments.")

    # Extract text segments (None marks a pause) and translate in one batch.
    segment_texts = [seg[2] if seg[2] is not None else None for seg in segments]
    translated_texts = audio_processor.translate_segments_batch(segment_texts)

    chunk_files = []
    for chunk_idx, ((start, end, _), translated) in enumerate(zip(segments, translated_texts), start=1):
        duration = end - start

        if translated is None:
            # Pause segment: emit matching silence to preserve timing.
            filename = f"chunk_{chunk_idx:03d}_pause.wav"
            generate_silence_wav(duration, filename)
            chunk_files.append(filename)
        else:
            print(f"🔤 {chunk_idx}: Translated: {translated}")

            # Synthesize TTS audio
            raw_tts = await synthesize_tts_to_wav(translated, voice)

            # FIX: stretching calls an external API; if it fails, fall back to
            # the unstretched TTS instead of aborting the whole job (restores
            # the fallback the previous implementation had).
            try:
                stretched = stretch_audio(raw_tts, duration)
            except Exception as e:
                print(f"Stretch failed for chunk {chunk_idx} ({e}); using unstretched audio")
                chunk_files.append(raw_tts)
            else:
                chunk_files.append(stretched)
                os.remove(raw_tts)

    # Concatenate all chunks in order.
    combined_tts = AudioSegment.empty()
    for f in chunk_files:
        combined_tts += AudioSegment.from_wav(f)

    print("🎼 Adding original background music...")
    background_music = AudioSegment.from_wav(background_path)
    background_music = background_music[:len(combined_tts)]
    final_mix = combined_tts.overlay(background_music)

    output_path = "final_translated_with_music.wav"
    final_mix.export(output_path, format="wav")
    print(f"✅ Output saved as: {output_path}")

    cleanup_files(chunk_files)
    # NOTE(review): background_path lives inside temp_dir, so the file is
    # gone after this rmtree — callers currently only None-check the path,
    # but reading it would fail; confirm before relying on it.
    shutil.rmtree(temp_dir, ignore_errors=True)
    return output_path, background_path
269
 
270
def gradio_interface(video_file, voice):
    """Gradio handler: dub an uploaded video and return the output video path.

    Args:
        video_file: uploaded file object exposing a ``.name`` path.
        voice: edge-tts voice name selected in the UI.

    Returns:
        Path to the dubbed video on success, or None on any failure
        (errors are printed, never raised, so the UI stays responsive).
    """
    audio_temp_dir = None
    try:
        # Create temporary directory for processing
        temp_dir = Path(tempfile.mkdtemp())
        input_video_path = temp_dir / "input_video.mp4"

        # Check if file is a video
        if not os.path.splitext(video_file.name)[1].lower() in ['.mp4', '.mov', '.avi', '.mkv']:
            raise ValueError("Invalid file type. Please upload a video file.")

        # Save the uploaded file to the temporary directory
        shutil.copyfile(video_file.name, input_video_path)

        # Extract audio from video
        audio_path, audio_temp_dir = extract_audio_from_video(str(input_video_path))
        if not audio_path:
            return None

        # Process audio chunks
        audio_output_path, background_path = asyncio.run(process_audio_chunks(audio_path, voice))

        if audio_output_path is None or background_path is None:
            return None

        # Combine with original video
        output_video_path = temp_dir / "translated_video.mp4"
        success = combine_video_audio(str(input_video_path), audio_output_path, str(output_video_path))

        # Return the path to the output video, or None on mux failure.
        # (temp_dir must outlive this call — the output video lives in it.)
        return str(output_video_path) if success else None

    except Exception as e:
        print(f"Error processing video: {e}")
        return None
    finally:
        # FIX: the extracted-audio temp dir was never removed, leaking one
        # directory (with a full WAV) per request.
        if audio_temp_dir:
            shutil.rmtree(audio_temp_dir, ignore_errors=True)
 
314
  """Extract audio from video file using ffmpeg"""
315
  temp_dir = tempfile.mkdtemp()
316
  audio_path = os.path.join(temp_dir, "extracted_audio.wav")
317
+
318
  try:
319
  subprocess.run([
320
  "ffmpeg", "-y", "-i", video_path,
321
  "-vn", "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
322
  audio_path
323
  ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
324
+
325
  if not os.path.exists(audio_path):
326
  raise FileNotFoundError("Audio extraction failed")
327
+
328
  return audio_path, temp_dir
329
  except Exception as e:
330
  print(f"Audio extraction error: {e}")
 
349
  with gr.Blocks() as demo:
350
  gr.Markdown("# Video Dubbing Application")
351
  gr.Markdown("Upload a video and get a dubbed version with translated audio")
352
+
353
  with gr.Row():
354
  video_input = gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".avi", ".mkv"])
355
  voice_dropdown = gr.Dropdown(
 
357
  label="Select Voice",
358
  value="hi-IN-MadhurNeural"
359
  )
360
+
361
  output_video = gr.Video(label="Dubbed Video")
362
+
363
  submit_btn = gr.Button("Start Dubbing")
364
+
365
  submit_btn.click(
366
  gradio_interface,
367
  inputs=[video_input, voice_dropdown],
368
  outputs=output_video
369
  )
370
 
371
+ demo.queue().launch(server_name="0.0.0.0", debug=True)