Update app.py
app.py CHANGED (only line 56 differs; the full updated file follows below)
@@ -56 +56 @@
-        self.openrouter_api_key = os.environ.get("
+        self.openrouter_api_key = os.environ.get("OPENAI_API_KEY")
import os
import shutil
import tempfile
import subprocess
from pathlib import Path
import numpy as np
import soundfile as sf
from pydub import AudioSegment
from faster_whisper import WhisperModel
from openai import OpenAI
import httpx
import asyncio
import gradio as gr

# --- Demucs-based vocal separation ---
def separate_vocals(input_path):
    """Use Demucs to separate vocals and background music"""
    temp_dir = tempfile.mkdtemp()
    try:
        output_dir = os.path.join(temp_dir, "separated")
        os.makedirs(output_dir, exist_ok=True)

        from demucs.separate import main as demucs_main
        import sys

        # Demucs's CLI entry point parses sys.argv, so swap it in temporarily.
        original_argv = sys.argv
        sys.argv = [
            "demucs",
            "--two-stems", "vocals",
            "-o", output_dir,
            input_path
        ]

        try:
            demucs_main()
        finally:
            sys.argv = original_argv

        base_name = Path(input_path).stem
        vocals_path = os.path.join(output_dir, "htdemucs", base_name, "vocals.wav")
        noise_path = os.path.join(output_dir, "htdemucs", base_name, "no_vocals.wav")

        if not os.path.exists(vocals_path) or not os.path.exists(noise_path):
            raise FileNotFoundError("Demucs output missing")

        return vocals_path, noise_path, temp_dir
    except Exception as e:
        print(f"Demucs error: {e}")
        shutil.rmtree(temp_dir, ignore_errors=True)
        return None, None, None

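# A minimal usage sketch for separate_vocals (illustrative path, not part of the
# app flow; the caller owns the returned temp_dir and must remove it):
#
#     vocals, background, tmp = separate_vocals("some_audio.wav")
#     if vocals:
#         print("vocals stem:", vocals)
#         print("background stem:", background)
#         shutil.rmtree(tmp, ignore_errors=True)
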
# --- AudioProcessor class ---
class AudioProcessor:
    def __init__(self, device="cpu"):
        self.whisper_model = WhisperModel("small.en", device=device)
        # The attribute name and the HTTP-Referer/X-Title headers follow OpenRouter
        # conventions, but the client targets the standard OpenAI endpoint with an
        # OpenAI key; the extra headers are harmless there.
        self.openrouter_api_key = os.environ.get("OPENAI_API_KEY")
        self.client = OpenAI(
            base_url="https://api.openai.com/v1",
            api_key=self.openrouter_api_key,
            http_client=httpx.Client(headers={
                "Authorization": f"Bearer {self.openrouter_api_key}",
                "HTTP-Referer": "https://github.com",
                "X-Title": "Audio Translation App"
            })
        )

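# Configuration sketch: the key is expected in the environment before launch, e.g.
#
#     export OPENAI_API_KEY=...   # in the shell, or as a Space secret
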
    def transcribe_audio_with_pauses(self, audio_path):
        segments, _ = self.whisper_model.transcribe(audio_path, word_timestamps=True)
        previous_end = 0.0
        results = []

        for segment in segments:
            # Gaps longer than 0.5 s are kept as explicit pause entries (text=None).
            if segment.start > previous_end + 0.5:
                results.append((previous_end, segment.start, None))
            results.append((segment.start, segment.end, segment.text.strip()))
            previous_end = segment.end

        # Preserve any trailing silence after the last spoken segment.
        audio_duration = get_audio_duration(audio_path)
        if audio_duration and audio_duration > previous_end + 0.5:
            results.append((previous_end, audio_duration, None))

        return results

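    # The returned list alternates speech and pause entries as (start_s, end_s, text),
    # with text=None marking a pause; illustrative shape:
    #
    #     [(0.0, 2.3, "Hello there"), (2.3, 3.4, None), (3.4, 5.0, "How are you?")]
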
    def translate_text(self, text):
        try:
            print(f"Translating text: {text}")
            completion = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a professional translator from English to Hindi."
                    },
                    {
                        "role": "user",
                        "content": f"""Translate the following text to Hindi:
"{text}"
Guidelines:
1. Most important: every line of every segment must be in Hindi
2. Use natural conversational Hindi
3. Preserve meaning/context
4. Leave proper nouns unchanged
5. Match original word count
6. Output ONLY the translation
"""
                    }
                ],
                temperature=0.2,
                max_tokens=2000
            )
            translated = completion.choices[0].message.content.strip()
            print(f"Translated text: {translated}")
            # Drop anything from a literal "Translation:" marker onward, and strip stray quotes.
            return translated.split("Translation:")[0].strip().replace('"', '').replace("'", '')
        except Exception as e:
            print(f"Translation error: {e}")
            return None

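# Usage sketch (assumes OPENAI_API_KEY is set and the API is reachable):
#
#     processor = AudioProcessor()
#     hindi = processor.translate_text("Hello, how are you?")
#     print(hindi)  # a Hindi rendering, or None on failure
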
# --- Helper functions ---
def get_audio_duration(audio_path):
    try:
        with sf.SoundFile(audio_path) as f:
            return len(f) / f.samplerate
    except Exception as e:
        print(f"Duration error: {e}")
        return None

async def synthesize_tts_to_wav(text, voice, output_wav_path):
    import edge_tts
    temp_mp3 = "temp_tts.mp3"
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(temp_mp3)

    # edge-tts produces MP3; convert to mono 22.05 kHz WAV to match the silence chunks.
    audio = AudioSegment.from_file(temp_mp3)
    audio = audio.set_channels(1).set_frame_rate(22050)
    audio.export(output_wav_path, format="wav")
    os.remove(temp_mp3)

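# Standalone sketch (voice name taken from the dropdown choices below):
#
#     asyncio.run(synthesize_tts_to_wav("नमस्ते", "hi-IN-MadhurNeural", "hello_hi.wav"))
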
def stretch_audio(input_wav, output_wav, target_duration):
    data, sr = sf.read(input_wav)
    if len(data) == 0:
        raise ValueError("Empty audio")

    # rubberband's -t takes a duration ratio (output = input * ratio). Pitch is
    # preserved by default; its --pitch option is a shift in semitones, so no
    # pitch flag is passed here.
    tempo_ratio = target_duration / (len(data) / sr)
    result = subprocess.run([
        "rubberband", "-t", f"{tempo_ratio:.6f}",
        input_wav, output_wav
    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if result.returncode != 0:
        raise RuntimeError(f"Rubberband error: {result.stderr.decode()}")

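# Worked example of the ratio: a 2.0 s TTS clip aimed at a 3.0 s slot gives
# tempo_ratio = 3.0 / 2.0 = 1.5, i.e. the command run is:
#
#     rubberband -t 1.500000 chunk_001_raw.wav chunk_001_stretched.wav
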
def generate_silence_wav(duration_s, output_path, sample_rate=22050):
    samples = np.zeros(int(duration_s * sample_rate), dtype=np.float32)
    sf.write(output_path, samples, sample_rate)

# --- Main Gradio Interface ---
async def process_audio_chunks(input_audio_path, voice="hi-IN-MadhurNeural"):
    audio_processor = AudioProcessor()

    print("🔎 Separating vocals and music using Demucs...")
    vocals_path, background_path, temp_dir = separate_vocals(input_audio_path)
    if not vocals_path:
        return None, None

    print("🔎 Transcribing vocals...")
    segments = audio_processor.transcribe_audio_with_pauses(vocals_path)
    print(f"Transcribed {len(segments)} segments.")

    chunk_files = []
    chunk_idx = 0

    for start, end, text in segments:
        duration = end - start
        chunk_idx += 1

        if text is None:
            # Pause segment: emit silence of the same length.
            filename = f"chunk_{chunk_idx:03d}_pause.wav"
            generate_silence_wav(duration, filename)
            chunk_files.append(filename)
        else:
            # Fall back to the original text if translation fails.
            translated = audio_processor.translate_text(text) or text
            print(f"🔤 {chunk_idx}: Original: {text} → Translated: {translated}")

            raw_tts = f"chunk_{chunk_idx:03d}_raw.wav"
            stretched = f"chunk_{chunk_idx:03d}_stretched.wav"

            # Synthesize, then time-stretch the TTS to fit the original slot.
            await synthesize_tts_to_wav(translated, voice, raw_tts)
            stretch_audio(raw_tts, stretched, duration)
            chunk_files.append(stretched)
            os.remove(raw_tts)

    combined_tts = AudioSegment.empty()
    for f in chunk_files:
        combined_tts += AudioSegment.from_wav(f)

    print("🎼 Adding original background music...")
    background_music = AudioSegment.from_wav(background_path)
    background_music = background_music[:len(combined_tts)]
    final_mix = combined_tts.overlay(background_music)

    output_path = "final_translated_with_music.wav"
    final_mix.export(output_path, format="wav")
    print(f"✅ Output saved as: {output_path}")

    final_audio_path = output_path
    # Note: background_path points inside temp_dir (removed just below); the caller
    # only checks this value for None and does not reuse it as a file.
    final_background_path = background_path

    for f in chunk_files:
        os.remove(f)
    shutil.rmtree(temp_dir, ignore_errors=True)
    return final_audio_path, final_background_path

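# End-to-end sketch for the audio stage alone (assumes a WAV already extracted):
#
#     dubbed_wav, bg = asyncio.run(process_audio_chunks("extracted_audio.wav",
#                                                       voice="hi-IN-SwaraNeural"))
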
def gradio_interface(video_file, voice):
    try:
        # Create temporary directory for processing
        temp_dir = Path(tempfile.mkdtemp())
        input_video_path = temp_dir / "input_video.mp4"

        # Check if file is a video
        if os.path.splitext(video_file.name)[1].lower() not in ['.mp4', '.mov', '.avi', '.mkv']:
            raise ValueError("Invalid file type. Please upload a video file.")

        # Save the uploaded file to the temporary directory
        shutil.copyfile(video_file.name, input_video_path)

        # Extract audio from video; note audio_temp_dir is not cleaned up here.
        audio_path, audio_temp_dir = extract_audio_from_video(str(input_video_path))
        if not audio_path:
            return None

        # Process audio chunks (the Gradio callback is sync, so drive the async pipeline here)
        audio_output_path, background_path = asyncio.run(process_audio_chunks(audio_path, voice))

        if audio_output_path is None or background_path is None:
            return None

        # Combine with original video
        output_video_path = temp_dir / "translated_video.mp4"
        success = combine_video_audio(str(input_video_path), audio_output_path, str(output_video_path))

        if success:
            # Return the path to the output video
            return str(output_video_path)
        else:
            return None

    except Exception as e:
        print(f"Error processing video: {e}")
        return None
    finally:
        # Cleanup temporary files
        # Commented out for debugging purposes
        # shutil.rmtree(temp_dir, ignore_errors=True)
        pass

def extract_audio_from_video(video_path):
    """Extract audio from video file using ffmpeg"""
    temp_dir = tempfile.mkdtemp()
    audio_path = os.path.join(temp_dir, "extracted_audio.wav")

    try:
        # -vn drops the video stream; decode audio to 16-bit 44.1 kHz stereo PCM.
        subprocess.run([
            "ffmpeg", "-y", "-i", video_path,
            "-vn", "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
            audio_path
        ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio extraction failed")

        return audio_path, temp_dir
    except Exception as e:
        print(f"Audio extraction error: {e}")
        shutil.rmtree(temp_dir, ignore_errors=True)
        return None, None

def combine_video_audio(video_path, audio_path, output_path):
    """Combine original video with new audio track"""
    try:
        # Copy the video stream untouched; take video from input 0 and audio from
        # input 1; -shortest trims the output to the shorter of the two.
        subprocess.run([
            "ffmpeg", "-y", "-i", video_path,
            "-i", audio_path,
            "-c:v", "copy", "-map", "0:v:0", "-map", "1:a:0",
            "-shortest", output_path
        ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return True
    except Exception as e:
        print(f"Video combining error: {e}")
        return False

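# The subprocess call above is equivalent to running, with this app's file names:
#
#     ffmpeg -y -i input_video.mp4 -i final_translated_with_music.wav \
#            -c:v copy -map 0:v:0 -map 1:a:0 -shortest translated_video.mp4
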
# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Video Dubbing Application")
    gr.Markdown("Upload a video and get a dubbed version with translated audio")

    with gr.Row():
        video_input = gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".avi", ".mkv"])
        voice_dropdown = gr.Dropdown(
            ["hi-IN-MadhurNeural", "hi-IN-RekhaNeural", "hi-IN-SwaraNeural"],
            label="Select Voice",
            value="hi-IN-MadhurNeural"
        )

    output_video = gr.Video(label="Dubbed Video")

    submit_btn = gr.Button("Start Dubbing")

    submit_btn.click(
        gradio_interface,
        inputs=[video_input, voice_dropdown],
        outputs=output_video
    )

demo.queue().launch(server_name="0.0.0.0", share=True)