Update app.py
app.py
CHANGED
@@ -8,7 +8,9 @@ import re
 from deep_translator import MyMemoryTranslator
 import soundfile as sf
 from gradio_client import Client, handle_file
-
+from openvoice_cli.__main__ import tune_one
+import pyrubberband as rb
+import librosa
 
 # You only need to run this download command once
 nltk.download('punkt_tab')
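A note on the new imports: pyrubberband is a thin wrapper that shells out to the `rubberband` command-line tool, so the binary must be present on the host (on a Hugging Face Space that typically means listing `rubberband-cli` in packages.txt, alongside `ffmpeg`). A minimal preflight sketch, with `ensure_rubberband` as a hypothetical helper name, not part of app.py:

# Preflight check: pyrubberband shells out to the `rubberband` CLI and fails at
# call time if it is missing. `ensure_rubberband` is a hypothetical helper.
import shutil

def ensure_rubberband():
    if shutil.which("rubberband") is None:
        raise RuntimeError(
            "rubberband CLI not found; install it first "
            "(e.g. apt-get install rubberband-cli, or add rubberband-cli "
            "to packages.txt on a Hugging Face Space)"
        )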
@@ -181,18 +183,79 @@ def synthesize_speech(synth_text, target_lang, gender="Male", pace="normal", out
     print("synthesize_speech saved to ", result)
     return result
 
+def match_audio_duration(original_path, translated_path, output_path="temp_audio_synced.wav"):
+    """
+    Matches the synthesized audio's duration to the original audio's duration.
+    """
+    print("\n[STEP 7/9] Syncing audio durations")
+    # Load the original audio
+    original_audio, original_sr = librosa.load(original_path, sr=None)
+    original_duration = librosa.get_duration(y=original_audio, sr=original_sr)
+    print(f"Original audio duration: {original_duration:.2f} seconds")
+
+    # Load the translated audio
+    translated_audio, translated_sr = librosa.load(translated_path, sr=None)
+    translated_duration = librosa.get_duration(y=translated_audio, sr=translated_sr)
+    print(f"Translated audio duration: {translated_duration:.2f} seconds")
+
+    # Compute the speed-up/slow-down rate:
+    # if rate > 1.0 the audio is sped up, if rate < 1.0 it is slowed down.
+    rate = translated_duration / original_duration
+    print(f"Stretch rate: {rate:.4f}")
+
+    # Apply the time-stretch using the high-quality rubberband library.
+    # The parameters are: audio data, sample rate, and the desired rate.
+    adjusted_audio = rb.time_stretch(translated_audio, translated_sr, rate=rate)
+
+    # Save the output; the sample rate remains the translated audio's original rate.
+    sf.write(output_path, adjusted_audio, translated_sr)
+    print(f"✅ Adjusted audio saved as: {output_path}")
+    return output_path
+
+def clone_voice(target_audio_path, reference_audio_path, target_lang, gender="Male", pace="normal", output_path="temp_audio_cloned.wav", device=None):
+    # Set parameters for single-file processing
+    ref_file = str('reference/' + target_lang.upper() + '/' + gender.upper() + '_' + pace.upper() + '.wav')
+    output_file = 'cloned_audio.wav'
+    device = device or 'cpu'  # or 'cuda:0' for GPU processing
+
+    print("Cloning voice")
+    # Convert the tone color of a single audio file
+    tune_one(input_file=target_audio_path, ref_file=ref_file, output_file=output_file, device=device)
+
+    return output_file
+
+def merge_audio_video(video_path, audio_path, output_path="temp_merged.mp4"):
+    """
+    Merges an audio file with a video file into a single output video.
+    """
+    print("\n[STEP] Merging audio and video...")
+    video_input = ffmpeg.input(video_path)
+    audio_input = ffmpeg.input(audio_path)
+    (
+        ffmpeg.output(video_input.video, audio_input.audio, output_path, vcodec='copy', acodec='aac', shortest=None)
+        .run(overwrite_output=True, quiet=True)
+    )
+    print(f"✅ Merged video saved to {output_path}")
+    return output_path
+
 def main_run(video_path,target_lang):
     original_audio_file = extract_audio_from_video(video_path)
     original_text , pace = transcribe_audio(original_audio_file)
     translated_text = translate_local(original_text,target_lang)
     translated_audio = synthesize_speech(translated_text, target_lang, "Male", pace)
-
+    synced_translated_audio = match_audio_duration(original_audio_file, translated_audio)
+    cloned_synced_translated_audio = clone_voice(synced_translated_audio, original_audio_file, target_lang, gender="Male", pace=pace)
+    final_video_nobgm = merge_audio_video(video_path, cloned_synced_translated_audio)
+    return final_video_nobgm
 
 def audio_pipeline_run(audio_path,target_lang):
     original_text , pace = transcribe_audio(audio_path)
     translated_text = translate_local(original_text,target_lang)
     translated_audio = synthesize_speech(translated_text, target_lang, "Male", pace)
-
+    synced_translated_audio = match_audio_duration(audio_path, translated_audio)
+    cloned_synced_translated_audio = clone_voice(synced_translated_audio, audio_path, target_lang, gender="Male", pace=pace)
+    return cloned_synced_translated_audio
 
 
 # --- Gradio Interface Definition ---
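A quick sanity check on the stretch-rate convention in `match_audio_duration`: the rate is translated duration over original duration, so a rate above 1.0 speeds the translated clip up until it fits the original. The sketch below verifies this with synthetic tones; it assumes numpy and pyrubberband are installed and is not part of app.py:

# Sanity check for the rate convention: a 3 s tone stretched with rate = 1.5
# should come out close to the 2 s "original".
import numpy as np
import pyrubberband as rb

sr = 22050
original = np.sin(2 * np.pi * 220 * np.linspace(0, 2.0, sr * 2))    # 2 s reference
translated = np.sin(2 * np.pi * 220 * np.linspace(0, 3.0, sr * 3))  # 3 s, too long

rate = (len(translated) / sr) / (len(original) / sr)  # 1.5 -> speed up
synced = rb.time_stretch(translated, sr, rate)
print(f"synced duration: {len(synced) / sr:.2f} s")   # ~2.00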
@@ -219,6 +282,7 @@ app_interface = gr.Interface(
     allow_flagging="never"
 )
 '''
+
 with gr.Blocks(title="Audio/Video Translation Toolkit") as app_interface:
     gr.Markdown("# 🚀 Audio/Video Translation Toolkit")
     with gr.Tabs():
@@ -228,14 +292,9 @@ with gr.Blocks(title="Audio/Video Translation Toolkit") as app_interface:
                 video_in = gr.Video(label="Input Video", height=500)
                 lang_radio_vid = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
                 submit_btn_vid = gr.Button("Translate Video", variant="primary")
-
-            with gr.Column():
-                video_out = gr.Audio(label="Output Audio", interactive=False)
-
-            '''
+
             with gr.Column():
                 video_out = gr.Video(label="Output Video", interactive=False, height=500)
-            '''
 
             submit_btn_vid.click(fn=main_run, inputs=[video_in, lang_radio_vid], outputs=[video_out])
 
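One easy-to-misread detail in `merge_audio_video`: in ffmpeg-python, a keyword argument whose value is None is rendered as a bare flag, so `shortest=None` puts `-shortest` on the command line and the merged file ends with the shorter of the two streams. A small check of that behaviour (hypothetical file names; `get_args` only builds the argument list, so nothing is executed):

# Confirm that shortest=None renders as the bare -shortest flag.
import ffmpeg

video_input = ffmpeg.input("in.mp4")   # placeholder paths
audio_input = ffmpeg.input("dub.wav")
stream = ffmpeg.output(video_input.video, audio_input.audio, "out.mp4",
                       vcodec='copy', acodec='aac', shortest=None)
args = stream.get_args()
assert "-shortest" in args and "None" not in args
print(args)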
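For a quick smoke test outside the Gradio UI, both pipelines can be driven directly. This assumes app.py is importable as a module (i.e. the interface launch is guarded); the sample paths are placeholders, not part of this commit:

# Hypothetical smoke test for the two pipelines.
from app import main_run, audio_pipeline_run

dubbed_video = main_run("samples/input.mp4", "Tamil")
print("dubbed video:", dubbed_video)

dubbed_audio = audio_pipeline_run("samples/input.wav", "Hindi")
print("dubbed audio:", dubbed_audio)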