Update app.py
Browse files
app.py
CHANGED
|
@@ -106,12 +106,13 @@ def transcribe_audio(audio_path):
|
|
| 106 |
result = transcriber(audio_path)
|
| 107 |
# The result is a dictionary, and we need the 'text' key
|
| 108 |
transcription = result["text"]
|
| 109 |
-
|
|
|
|
| 110 |
|
| 111 |
duration_s = get_audio_duration(audio_path)
|
| 112 |
wpm = compute_wpm(transcription, duration_s)
|
| 113 |
pace = categorize_wpm(wpm)
|
| 114 |
-
print(f"
|
| 115 |
|
| 116 |
return transcription, pace
|
| 117 |
|
|
@@ -163,7 +164,9 @@ def translate_local(text_to_translate, target_lang='ta-IN', device=None):
|
|
| 163 |
translated_chunks.append("") # Add an empty string on error
|
| 164 |
|
| 165 |
translated_text = " ".join(translated_chunks)
|
| 166 |
-
|
|
|
|
|
|
|
| 167 |
return translated_text
|
| 168 |
|
| 169 |
def synthesize_speech(synth_text, target_lang, gender="Male", pace="normal", output_path="temp_audio_synthesized.wav", device="cpu"):
|
|
@@ -218,20 +221,15 @@ def match_audio_duration(original_path, translated_path, output_path="temp_audio
|
|
| 218 |
# Save output
|
| 219 |
# The sample rate remains the same as the translated audio's original rate
|
| 220 |
sf.write(output_path, adjusted_audio, translated_sr)
|
| 221 |
-
print(f"✅ Adjusted audio saved as: {output_path}")
|
| 222 |
return output_path
|
| 223 |
|
| 224 |
-
def clone_voice(
|
| 225 |
-
# Set parameters for single file processing
|
| 226 |
-
ref_file = str('reference/'+target_lang.upper()+'/'+gender.upper()+'_'+pace.upper()+'.wav')
|
| 227 |
-
output_file = 'cloned_audio.wav'
|
| 228 |
-
device = 'cpu' # or 'cuda:0' for GPU processing
|
| 229 |
-
|
| 230 |
print("Cloning Voice")
|
| 231 |
# Convert the tone color of a single audio file
|
| 232 |
-
tune_one(input_file=
|
| 233 |
-
|
| 234 |
-
return
|
| 235 |
|
| 236 |
def merge_audio_video(video_path, audio_path, output_path="temp_merged.mp4"):
|
| 237 |
"""
|
|
@@ -253,8 +251,9 @@ def main_run(video_path,target_lang):
|
|
| 253 |
translated_text = translate_local(original_text,target_lang)
|
| 254 |
translated_audio = synthesize_speech(translated_text, target_lang, "Male", pace)
|
| 255 |
synced_translated_audio = match_audio_duration(original_audio_file, translated_audio)
|
| 256 |
-
cloned_synced_translated_audio = clone_voice(
|
| 257 |
final_video_nobgm = merge_audio_video(video_path, cloned_synced_translated_audio)
|
|
|
|
| 258 |
return final_video_nobgm
|
| 259 |
|
| 260 |
def audio_pipeline_run(audio_path,target_lang):
|
|
@@ -262,7 +261,8 @@ def audio_pipeline_run(audio_path,target_lang):
|
|
| 262 |
translated_text = translate_local(original_text,target_lang)
|
| 263 |
translated_audio = synthesize_speech(translated_text, target_lang, "Male", pace)
|
| 264 |
synced_translated_audio = match_audio_duration(audio_path, translated_audio)
|
| 265 |
-
cloned_synced_translated_audio = clone_voice(
|
|
|
|
| 266 |
return cloned_synced_translated_audio
|
| 267 |
|
| 268 |
|
|
|
|
| 106 |
result = transcriber(audio_path)
|
| 107 |
# The result is a dictionary, and we need the 'text' key
|
| 108 |
transcription = result["text"]
|
| 109 |
+
|
| 110 |
+
print(f"✅ Transcription successful: {transcription}")
|
| 111 |
|
| 112 |
duration_s = get_audio_duration(audio_path)
|
| 113 |
wpm = compute_wpm(transcription, duration_s)
|
| 114 |
pace = categorize_wpm(wpm)
|
| 115 |
+
print(f"✅ > Pace detected: {pace.upper()} ({wpm:.1f} WPM)")
|
| 116 |
|
| 117 |
return transcription, pace
|
| 118 |
|
|
|
|
| 164 |
translated_chunks.append("") # Add an empty string on error
|
| 165 |
|
| 166 |
translated_text = " ".join(translated_chunks)
|
| 167 |
+
|
| 168 |
+
print(f"✅ Translated Text to {target_lang} Successfully")
|
| 169 |
+
|
| 170 |
return translated_text
|
| 171 |
|
| 172 |
def synthesize_speech(synth_text, target_lang, gender="Male", pace="normal", output_path="temp_audio_synthesized.wav", device="cpu"):
|
|
|
|
| 221 |
# Save output
|
| 222 |
# The sample rate remains the same as the translated audio's original rate
|
| 223 |
sf.write(output_path, adjusted_audio, translated_sr)
|
| 224 |
+
print(f"✅ Duration Adjusted audio saved as: {output_path}")
|
| 225 |
return output_path
|
| 226 |
|
| 227 |
+
def clone_voice(translated_audio_path, original_audio_path, output_path="temp_audio_cloned.wav", device="cpu"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
print("Cloning Voice")
|
| 229 |
# Convert the tone color of a single audio file
|
| 230 |
+
tune_one(input_file=translated_audio_path, ref_file=original_audio_path, output_file=output_path, device=device)
|
| 231 |
+
print(f"✅ Voice cloned audio saved to {output_path}")
|
| 232 |
+
return output_path
|
| 233 |
|
| 234 |
def merge_audio_video(video_path, audio_path, output_path="temp_merged.mp4"):
|
| 235 |
"""
|
|
|
|
| 251 |
translated_text = translate_local(original_text,target_lang)
|
| 252 |
translated_audio = synthesize_speech(translated_text, target_lang, "Male", pace)
|
| 253 |
synced_translated_audio = match_audio_duration(original_audio_file, translated_audio)
|
| 254 |
+
cloned_synced_translated_audio = clone_voice(synced_translated_audio, original_audio_file)
|
| 255 |
final_video_nobgm = merge_audio_video(video_path, cloned_synced_translated_audio)
|
| 256 |
+
print(f"✅ Pipeline finished")
|
| 257 |
return final_video_nobgm
|
| 258 |
|
| 259 |
def audio_pipeline_run(audio_path,target_lang):
|
|
|
|
| 261 |
translated_text = translate_local(original_text,target_lang)
|
| 262 |
translated_audio = synthesize_speech(translated_text, target_lang, "Male", pace)
|
| 263 |
synced_translated_audio = match_audio_duration(audio_path, translated_audio)
|
| 264 |
+
cloned_synced_translated_audio = clone_voice(synced_translated_audio, original_audio_file)
|
| 265 |
+
print(f"✅ Pipeline finished")
|
| 266 |
return cloned_synced_translated_audio
|
| 267 |
|
| 268 |
|