Update app.py
app.py CHANGED (only line 56 differs; the full updated file follows below)
@@ -56 +56 @@
-        self.openrouter_api_key = os.environ.get("
+        self.openrouter_api_key = os.environ.get("OPENAI_API_KEY")
import os
import shutil
import tempfile
import subprocess
from pathlib import Path
import numpy as np
import soundfile as sf
from pydub import AudioSegment
from faster_whisper import WhisperModel
from openai import OpenAI
import httpx
import asyncio
import gradio as gr

# --- Demucs-based vocal separation ---
def separate_vocals(input_path):
    """Use Demucs to separate vocals and background music"""
    temp_dir = tempfile.mkdtemp()
    try:
        output_dir = os.path.join(temp_dir, "separated")
        os.makedirs(output_dir, exist_ok=True)

        from demucs.separate import main as demucs_main
        import sys

        # Demucs's CLI entry point parses sys.argv, so swap it in temporarily.
        original_argv = sys.argv
        sys.argv = [
            "demucs",
            "--two-stems", "vocals",
            "-o", output_dir,
            input_path
        ]

        try:
            demucs_main()
        finally:
            sys.argv = original_argv

        base_name = Path(input_path).stem
        vocals_path = os.path.join(output_dir, "htdemucs", base_name, "vocals.wav")
        noise_path = os.path.join(output_dir, "htdemucs", base_name, "no_vocals.wav")

        if not os.path.exists(vocals_path) or not os.path.exists(noise_path):
            raise FileNotFoundError("Demucs output missing")

        return vocals_path, noise_path, temp_dir
    except Exception as e:
        print(f"Demucs error: {e}")
        shutil.rmtree(temp_dir, ignore_errors=True)
        return None, None, None

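# A minimal usage sketch for separate_vocals (illustrative path, not part of the
# app flow; the caller owns the returned temp_dir and must remove it):
#
#     vocals, background, tmp = separate_vocals("some_audio.wav")
#     if vocals:
#         print("vocals stem:", vocals)
#         print("background stem:", background)
#         shutil.rmtree(tmp, ignore_errors=True)
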
# --- AudioProcessor class ---
class AudioProcessor:
    def __init__(self, device="cpu"):
        self.whisper_model = WhisperModel("small.en", device=device)
        # The attribute name and the HTTP-Referer/X-Title headers follow OpenRouter
        # conventions, but the client targets the standard OpenAI endpoint with an
        # OpenAI key; the extra headers are harmless there.
        self.openrouter_api_key = os.environ.get("OPENAI_API_KEY")
        self.client = OpenAI(
            base_url="https://api.openai.com/v1",
            api_key=self.openrouter_api_key,
            http_client=httpx.Client(headers={
                "Authorization": f"Bearer {self.openrouter_api_key}",
                "HTTP-Referer": "https://github.com",
                "X-Title": "Audio Translation App"
            })
        )

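# Configuration sketch: the key is expected in the environment before launch, e.g.
#
#     export OPENAI_API_KEY=...   # in the shell, or as a Space secret
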
    def transcribe_audio_with_pauses(self, audio_path):
        segments, _ = self.whisper_model.transcribe(audio_path, word_timestamps=True)
        previous_end = 0.0
        results = []

        for segment in segments:
            # Gaps longer than 0.5 s are kept as explicit pause entries (text=None).
            if segment.start > previous_end + 0.5:
                results.append((previous_end, segment.start, None))
            results.append((segment.start, segment.end, segment.text.strip()))
            previous_end = segment.end

        # Preserve any trailing silence after the last spoken segment.
        audio_duration = get_audio_duration(audio_path)
        if audio_duration and audio_duration > previous_end + 0.5:
            results.append((previous_end, audio_duration, None))

        return results

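    # The returned list alternates speech and pause entries as (start_s, end_s, text),
    # with text=None marking a pause; illustrative shape:
    #
    #     [(0.0, 2.3, "Hello there"), (2.3, 3.4, None), (3.4, 5.0, "How are you?")]
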
    def translate_text(self, text):
        try:
            print(f"Translating text: {text}")
            completion = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a professional translator from English to Hindi."
                    },
                    {
                        "role": "user",
                        "content": f"""Translate the following text to Hindi:
"{text}"
Guidelines:
1. Most important: every line of every segment must be in Hindi
2. Use natural conversational Hindi
3. Preserve meaning/context
4. Leave proper nouns unchanged
5. Match original word count
6. Output ONLY the translation
"""
                    }
                ],
                temperature=0.2,
                max_tokens=2000
            )
            translated = completion.choices[0].message.content.strip()
            print(f"Translated text: {translated}")
            # Drop anything from a literal "Translation:" marker onward, and strip stray quotes.
            return translated.split("Translation:")[0].strip().replace('"', '').replace("'", '')
        except Exception as e:
            print(f"Translation error: {e}")
            return None

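# Usage sketch (assumes OPENAI_API_KEY is set and the API is reachable):
#
#     processor = AudioProcessor()
#     hindi = processor.translate_text("Hello, how are you?")
#     print(hindi)  # a Hindi rendering, or None on failure
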
# --- Helper functions ---
def get_audio_duration(audio_path):
    try:
        with sf.SoundFile(audio_path) as f:
            return len(f) / f.samplerate
    except Exception as e:
        print(f"Duration error: {e}")
        return None

async def synthesize_tts_to_wav(text, voice, output_wav_path):
    import edge_tts
    temp_mp3 = "temp_tts.mp3"
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(temp_mp3)

    # edge-tts produces MP3; convert to mono 22.05 kHz WAV to match the silence chunks.
    audio = AudioSegment.from_file(temp_mp3)
    audio = audio.set_channels(1).set_frame_rate(22050)
    audio.export(output_wav_path, format="wav")
    os.remove(temp_mp3)

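# Standalone sketch (voice name taken from the dropdown choices below):
#
#     asyncio.run(synthesize_tts_to_wav("नमस्ते", "hi-IN-MadhurNeural", "hello_hi.wav"))
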
def stretch_audio(input_wav, output_wav, target_duration):
    data, sr = sf.read(input_wav)
    if len(data) == 0:
        raise ValueError("Empty audio")

    # rubberband's -t takes a duration ratio (output = input * ratio). Pitch is
    # preserved by default; its --pitch option is a shift in semitones, so no
    # pitch flag is passed here.
    tempo_ratio = target_duration / (len(data) / sr)
    result = subprocess.run([
        "rubberband", "-t", f"{tempo_ratio:.6f}",
        input_wav, output_wav
    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if result.returncode != 0:
        raise RuntimeError(f"Rubberband error: {result.stderr.decode()}")

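# Worked example of the ratio: a 2.0 s TTS clip aimed at a 3.0 s slot gives
# tempo_ratio = 3.0 / 2.0 = 1.5, i.e. the command run is:
#
#     rubberband -t 1.500000 chunk_001_raw.wav chunk_001_stretched.wav
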
def generate_silence_wav(duration_s, output_path, sample_rate=22050):
    samples = np.zeros(int(duration_s * sample_rate), dtype=np.float32)
    sf.write(output_path, samples, sample_rate)

# --- Main Gradio Interface ---
async def process_audio_chunks(input_audio_path, voice="hi-IN-MadhurNeural"):
    audio_processor = AudioProcessor()

    print("🔎 Separating vocals and music using Demucs...")
    vocals_path, background_path, temp_dir = separate_vocals(input_audio_path)
    if not vocals_path:
        return None, None

    print("🔎 Transcribing vocals...")
    segments = audio_processor.transcribe_audio_with_pauses(vocals_path)
    print(f"Transcribed {len(segments)} segments.")

    chunk_files = []
    chunk_idx = 0

    for start, end, text in segments:
        duration = end - start
        chunk_idx += 1

        if text is None:
            # Pause segment: emit silence of the same length.
            filename = f"chunk_{chunk_idx:03d}_pause.wav"
            generate_silence_wav(duration, filename)
            chunk_files.append(filename)
        else:
            # Fall back to the original text if translation fails.
            translated = audio_processor.translate_text(text) or text
            print(f"🔤 {chunk_idx}: Original: {text} → Translated: {translated}")

            raw_tts = f"chunk_{chunk_idx:03d}_raw.wav"
            stretched = f"chunk_{chunk_idx:03d}_stretched.wav"

            # Synthesize, then time-stretch the TTS to fit the original slot.
            await synthesize_tts_to_wav(translated, voice, raw_tts)
            stretch_audio(raw_tts, stretched, duration)
            chunk_files.append(stretched)
            os.remove(raw_tts)

    combined_tts = AudioSegment.empty()
    for f in chunk_files:
        combined_tts += AudioSegment.from_wav(f)

    print("🎼 Adding original background music...")
    background_music = AudioSegment.from_wav(background_path)
    background_music = background_music[:len(combined_tts)]
    final_mix = combined_tts.overlay(background_music)

    output_path = "final_translated_with_music.wav"
    final_mix.export(output_path, format="wav")
    print(f"✅ Output saved as: {output_path}")

    final_audio_path = output_path
    # Note: background_path points inside temp_dir (removed just below); the caller
    # only checks this value for None and does not reuse it as a file.
    final_background_path = background_path

    for f in chunk_files:
        os.remove(f)
    shutil.rmtree(temp_dir, ignore_errors=True)
    return final_audio_path, final_background_path

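# End-to-end sketch for the audio stage alone (assumes a WAV already extracted):
#
#     dubbed_wav, bg = asyncio.run(process_audio_chunks("extracted_audio.wav",
#                                                       voice="hi-IN-SwaraNeural"))
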
def gradio_interface(video_file, voice):
    try:
        # Create temporary directory for processing
        temp_dir = Path(tempfile.mkdtemp())
        input_video_path = temp_dir / "input_video.mp4"

        # Check if file is a video
        if os.path.splitext(video_file.name)[1].lower() not in ['.mp4', '.mov', '.avi', '.mkv']:
            raise ValueError("Invalid file type. Please upload a video file.")

        # Save the uploaded file to the temporary directory
        shutil.copyfile(video_file.name, input_video_path)

        # Extract audio from video; note audio_temp_dir is not cleaned up here.
        audio_path, audio_temp_dir = extract_audio_from_video(str(input_video_path))
        if not audio_path:
            return None

        # Process audio chunks (the Gradio callback is sync, so drive the async pipeline here)
        audio_output_path, background_path = asyncio.run(process_audio_chunks(audio_path, voice))

        if audio_output_path is None or background_path is None:
            return None

        # Combine with original video
        output_video_path = temp_dir / "translated_video.mp4"
        success = combine_video_audio(str(input_video_path), audio_output_path, str(output_video_path))

        if success:
            # Return the path to the output video
            return str(output_video_path)
        else:
            return None

    except Exception as e:
        print(f"Error processing video: {e}")
        return None
    finally:
        # Cleanup temporary files
        # Commented out for debugging purposes
        # shutil.rmtree(temp_dir, ignore_errors=True)
        pass

def extract_audio_from_video(video_path):
    """Extract audio from video file using ffmpeg"""
    temp_dir = tempfile.mkdtemp()
    audio_path = os.path.join(temp_dir, "extracted_audio.wav")

    try:
        # -vn drops the video stream; decode audio to 16-bit 44.1 kHz stereo PCM.
        subprocess.run([
            "ffmpeg", "-y", "-i", video_path,
            "-vn", "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
            audio_path
        ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio extraction failed")

        return audio_path, temp_dir
    except Exception as e:
        print(f"Audio extraction error: {e}")
        shutil.rmtree(temp_dir, ignore_errors=True)
        return None, None

def combine_video_audio(video_path, audio_path, output_path):
    """Combine original video with new audio track"""
    try:
        # Copy the video stream untouched; take video from input 0 and audio from
        # input 1; -shortest trims the output to the shorter of the two.
        subprocess.run([
            "ffmpeg", "-y", "-i", video_path,
            "-i", audio_path,
            "-c:v", "copy", "-map", "0:v:0", "-map", "1:a:0",
            "-shortest", output_path
        ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return True
    except Exception as e:
        print(f"Video combining error: {e}")
        return False

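# The subprocess call above is equivalent to running, with this app's file names:
#
#     ffmpeg -y -i input_video.mp4 -i final_translated_with_music.wav \
#            -c:v copy -map 0:v:0 -map 1:a:0 -shortest translated_video.mp4
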
# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Video Dubbing Application")
    gr.Markdown("Upload a video and get a dubbed version with translated audio")

    with gr.Row():
        video_input = gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".avi", ".mkv"])
        voice_dropdown = gr.Dropdown(
            ["hi-IN-MadhurNeural", "hi-IN-RekhaNeural", "hi-IN-SwaraNeural"],
            label="Select Voice",
            value="hi-IN-MadhurNeural"
        )

    output_video = gr.Video(label="Dubbed Video")

    submit_btn = gr.Button("Start Dubbing")

    submit_btn.click(
        gradio_interface,
        inputs=[video_input, voice_dropdown],
        outputs=output_video
    )

demo.queue().launch(server_name="0.0.0.0", share=True)