Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -619,11 +619,17 @@ def tutorial():
|
|
| 619 |
- **"m_"**: Male
|
| 620 |
"""
|
| 621 |
with gr.Blocks() as demo2:
|
| 622 |
-
|
| 623 |
gr.Markdown(explanation) # Display the explanation
|
| 624 |
return demo2
|
| 625 |
|
| 626 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 627 |
#@title subtitle
|
| 628 |
import os
|
| 629 |
import re
|
|
@@ -632,7 +638,8 @@ import shutil
|
|
| 632 |
import platform
|
| 633 |
import datetime
|
| 634 |
import subprocess
|
| 635 |
-
|
|
|
|
| 636 |
import pysrt
|
| 637 |
import librosa
|
| 638 |
import soundfile as sf
|
|
@@ -640,11 +647,13 @@ from tqdm.auto import tqdm
|
|
| 640 |
from pydub import AudioSegment
|
| 641 |
from deep_translator import GoogleTranslator
|
| 642 |
|
| 643 |
-
|
| 644 |
# ---------------------- Utility Functions ----------------------
|
|
|
|
|
|
|
| 645 |
def get_current_time():
|
| 646 |
return datetime.datetime.now().strftime("%I_%M_%p")
|
| 647 |
|
|
|
|
| 648 |
def get_subtitle_Dub_path(srt_file_path, Language):
|
| 649 |
file_name = os.path.splitext(os.path.basename(srt_file_path))[0]
|
| 650 |
full_base_path = os.path.join(os.getcwd(), "TTS_DUB")
|
|
@@ -654,6 +663,7 @@ def get_subtitle_Dub_path(srt_file_path, Language):
|
|
| 654 |
new_path = os.path.join(full_base_path, f"{file_name}_{lang}_{random_string}.wav")
|
| 655 |
return new_path.replace("__", "_")
|
| 656 |
|
|
|
|
| 657 |
def clean_srt(input_path):
|
| 658 |
def clean_srt_line(text):
|
| 659 |
for bad in ["[", "]", "♫"]:
|
|
@@ -667,16 +677,20 @@ def clean_srt(input_path):
|
|
| 667 |
file.write(f"{sub.index}\n{sub.start} --> {sub.end}\n{clean_srt_line(sub.text)}\n\n")
|
| 668 |
return output_path
|
| 669 |
|
|
|
|
| 670 |
def translate_srt(input_path, target_language="Hindi", max_segments=500, chunk_size=4000):
|
| 671 |
output_path = input_path.replace(".srt", f"{target_language}.srt")
|
| 672 |
subs = pysrt.open(input_path, encoding='utf-8')
|
|
|
|
| 673 |
if len(subs) > max_segments:
|
| 674 |
gr.Warning(f"Too many segments: {len(subs)} > {max_segments}. Skipping translation.")
|
| 675 |
return input_path
|
| 676 |
|
|
|
|
| 677 |
original = [f"<#{i}>{s.text}" for i, s in enumerate(subs)]
|
| 678 |
full_text = "\n".join(original)
|
| 679 |
|
|
|
|
| 680 |
chunks, start = [], 0
|
| 681 |
while start < len(full_text):
|
| 682 |
end = start + chunk_size
|
|
@@ -688,20 +702,24 @@ def translate_srt(input_path, target_language="Hindi", max_segments=500, chunk_s
|
|
| 688 |
translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
|
| 689 |
translated_text = "\n".join(translated_chunks)
|
| 690 |
|
|
|
|
| 691 |
pattern = re.compile(r"<#(\d+)>(.*?)(?=<#\d+>|$)", re.DOTALL)
|
| 692 |
translated_dict = {int(i): txt.strip() for i, txt in pattern.findall(translated_text)}
|
| 693 |
|
|
|
|
| 694 |
for i, sub in enumerate(subs):
|
| 695 |
sub.text = translated_dict.get(i, sub.text)
|
| 696 |
|
| 697 |
subs.save(output_path, encoding='utf-8')
|
| 698 |
return output_path
|
| 699 |
|
|
|
|
| 700 |
def prepare_srt(srt_path, target_language, translate=False):
|
| 701 |
path = clean_srt(srt_path)
|
| 702 |
return translate_srt(path, target_language) if translate else path
|
| 703 |
|
| 704 |
-
|
|
|
|
| 705 |
def is_ffmpeg_installed():
|
| 706 |
ffmpeg_exe = "ffmpeg.exe" if platform.system() == "Windows" else "ffmpeg"
|
| 707 |
try:
|
|
@@ -711,6 +729,21 @@ def is_ffmpeg_installed():
|
|
| 711 |
gr.Warning("FFmpeg not found. Falling back to librosa for audio speedup.", duration=20)
|
| 712 |
return False, ffmpeg_exe
|
| 713 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 714 |
def speedup_audio_librosa(input_file, output_file, speedup_factor):
|
| 715 |
try:
|
| 716 |
y, sr = librosa.load(input_file, sr=None)
|
|
@@ -720,23 +753,29 @@ def speedup_audio_librosa(input_file, output_file, speedup_factor):
|
|
| 720 |
gr.Warning(f"Librosa speedup failed: {e}")
|
| 721 |
shutil.copy(input_file, output_file)
|
| 722 |
|
|
|
|
| 723 |
def change_speed(input_file, output_file, speedup_factor, use_ffmpeg, ffmpeg_path):
|
| 724 |
if use_ffmpeg:
|
| 725 |
try:
|
| 726 |
-
subprocess.run(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 727 |
except Exception as e:
|
| 728 |
gr.Error(f"FFmpeg speedup error: {e}")
|
| 729 |
speedup_audio_librosa(input_file, output_file, speedup_factor)
|
| 730 |
else:
|
| 731 |
speedup_audio_librosa(input_file, output_file, speedup_factor)
|
| 732 |
|
|
|
|
| 733 |
def remove_edge_silence(input_path, output_path):
|
| 734 |
y, sr = librosa.load(input_path, sr=None)
|
| 735 |
trimmed_audio, _ = librosa.effects.trim(y, top_db=30)
|
| 736 |
sf.write(output_path, trimmed_audio, sr)
|
| 737 |
return output_path
|
| 738 |
|
| 739 |
-
|
| 740 |
# ---------------------- Main Class ----------------------
|
| 741 |
class SRTDubbing:
|
| 742 |
def __init__(self, use_ffmpeg=True, ffmpeg_path="ffmpeg"):
|
|
@@ -747,16 +786,63 @@ class SRTDubbing:
|
|
| 747 |
os.makedirs(self.cache_dir, exist_ok=True)
|
| 748 |
|
| 749 |
@staticmethod
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 750 |
def convert_to_millisecond(t):
|
| 751 |
return t.hours * 3600000 + t.minutes * 60000 + t.seconds * 1000 + int(t.milliseconds)
|
| 752 |
|
| 753 |
-
|
| 754 |
-
def read_srt_file(file_path):
|
| 755 |
subs = pysrt.open(file_path, encoding='utf-8')
|
| 756 |
entries = []
|
| 757 |
prev_end = 0
|
| 758 |
for idx, sub in enumerate(subs, 1):
|
| 759 |
-
start
|
|
|
|
| 760 |
pause = start - prev_end if idx > 1 else start
|
| 761 |
entries.append({
|
| 762 |
'entry_number': idx,
|
|
@@ -768,55 +854,133 @@ class SRTDubbing:
|
|
| 768 |
'previous_pause': f"{idx}_before_pause.wav",
|
| 769 |
})
|
| 770 |
prev_end = end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 771 |
return entries
|
| 772 |
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 778 |
remove_edge_silence(path, temp)
|
| 779 |
-
# 📏 Load the trimmed audio and get its duration in milliseconds.
|
| 780 |
-
audio = AudioSegment.from_file(temp)
|
| 781 |
|
| 782 |
-
#
|
| 783 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 784 |
shutil.move(temp, audio_path)
|
| 785 |
return
|
| 786 |
|
| 787 |
-
# Step
|
| 788 |
-
if
|
| 789 |
-
path, _ = generate_and_save_audio(
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
| 795 |
-
|
| 796 |
-
|
| 797 |
remove_edge_silence(path, temp)
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
(audio + silence).export(audio_path, format="wav")
|
| 811 |
-
# ➡️ Fallback: If TTS already perfectly matches subtitle duration, save as-is.
|
| 812 |
else:
|
| 813 |
-
shutil.move(temp, audio_path)
|
|
|
|
| 814 |
|
| 815 |
@staticmethod
|
|
|
|
| 816 |
def make_silence(duration, path):
|
| 817 |
AudioSegment.silent(duration=duration).export(path, format="wav")
|
| 818 |
|
| 819 |
@staticmethod
|
|
|
|
| 820 |
def create_folder_for_srt(srt_file_path):
|
| 821 |
base = os.path.splitext(os.path.basename(srt_file_path))[0]
|
| 822 |
folder = f"./dummy/{base}_{str(uuid.uuid4())[:4]}"
|
|
@@ -824,27 +988,30 @@ class SRTDubbing:
|
|
| 824 |
return folder
|
| 825 |
|
| 826 |
@staticmethod
|
|
|
|
| 827 |
def concatenate_audio_files(paths, output):
|
| 828 |
audio = sum([AudioSegment.from_file(p) for p in paths], AudioSegment.silent(duration=0))
|
| 829 |
audio.export(output, format="wav")
|
| 830 |
|
| 831 |
-
|
|
|
|
| 832 |
entries = self.read_srt_file(srt_path)
|
| 833 |
folder = self.create_folder_for_srt(srt_path)
|
| 834 |
all_audio = []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 835 |
for entry in tqdm(entries):
|
| 836 |
self.make_silence(entry['pause_time'], os.path.join(folder, entry['previous_pause']))
|
| 837 |
all_audio.append(os.path.join(folder, entry['previous_pause']))
|
| 838 |
-
|
| 839 |
tts_path = os.path.join(folder, entry['audio_name'])
|
| 840 |
-
self.text_to_speech_srt(entry['text'], tts_path, language, voice, entry['end_time'] - entry['start_time'])
|
| 841 |
all_audio.append(tts_path)
|
| 842 |
-
|
| 843 |
self.concatenate_audio_files(all_audio, output_path)
|
| 844 |
|
| 845 |
-
|
| 846 |
# ---------------------- Entrypoint ----------------------
|
| 847 |
-
def srt_process(srt_path, Language="American English", voice_name="af_bella", translate=False):
|
| 848 |
if not srt_path.endswith(".srt"):
|
| 849 |
gr.Error("Please upload a valid .srt file", duration=5)
|
| 850 |
return None
|
|
@@ -853,8 +1020,16 @@ def srt_process(srt_path, Language="American English", voice_name="af_bella", tr
|
|
| 853 |
processed_srt = prepare_srt(srt_path, Language, translate)
|
| 854 |
output_path = get_subtitle_Dub_path(srt_path, Language)
|
| 855 |
|
| 856 |
-
SRTDubbing(use_ffmpeg, ffmpeg_path).srt_to_dub(processed_srt, output_path, Language, voice_name)
|
| 857 |
-
return output_path,output_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 858 |
|
| 859 |
def subtitle_ui():
|
| 860 |
with gr.Blocks() as demo:
|
|
@@ -862,9 +1037,9 @@ def subtitle_ui():
|
|
| 862 |
gr.Markdown(
|
| 863 |
"""
|
| 864 |
# Generate Audio File From Subtitle [Upload Only .srt file]
|
| 865 |
-
|
| 866 |
-
To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)
|
| 867 |
-
|
| 868 |
"""
|
| 869 |
)
|
| 870 |
with gr.Row():
|
|
@@ -874,19 +1049,20 @@ def subtitle_ui():
|
|
| 874 |
language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])
|
| 875 |
# with gr.Row():
|
| 876 |
voice = gr.Dropdown(
|
| 877 |
-
voice_names,
|
| 878 |
-
value='af_bella',
|
| 879 |
-
allow_custom_value=False,
|
| 880 |
-
label='🎙️ Choose VoicePack',
|
| 881 |
)
|
| 882 |
with gr.Row():
|
| 883 |
generate_btn_ = gr.Button('Generate', variant='primary')
|
| 884 |
|
| 885 |
with gr.Accordion('Other Settings', open=False):
|
|
|
|
| 886 |
translate_text = gr.Checkbox(value=False, label='🌐 Translate Subtitle to Selected Language')
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
|
| 890 |
with gr.Column():
|
| 891 |
audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
|
| 892 |
audio_file = gr.File(label='📥 Download Audio')
|
|
@@ -895,23 +1071,18 @@ def subtitle_ui():
|
|
| 895 |
autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
|
| 896 |
|
| 897 |
# srt_file.submit(
|
| 898 |
-
# srt_process,
|
| 899 |
-
# inputs=[srt_file, voice],
|
| 900 |
# outputs=[audio]
|
| 901 |
# )
|
| 902 |
generate_btn_.click(
|
| 903 |
-
srt_process,
|
| 904 |
-
inputs=[srt_file,language_name,voice,translate_text],
|
| 905 |
outputs=[audio,audio_file]
|
| 906 |
)
|
| 907 |
return demo
|
| 908 |
-
|
| 909 |
|
| 910 |
|
| 911 |
-
# Example usage:
|
| 912 |
-
# srt_file_path = "/content/me.srt"
|
| 913 |
-
# dub_audio_path = srt_process(srt_file_path, Language="American English", voice_name="af_bella", translate=False)
|
| 914 |
-
# print(f"Audio file saved at: {dub_audio_path}")
|
| 915 |
|
| 916 |
import click
|
| 917 |
@click.command()
|
|
@@ -937,4 +1108,4 @@ last_used_language = "a"
|
|
| 937 |
pipeline = KPipeline(lang_code=last_used_language)
|
| 938 |
temp_folder = create_audio_dir()
|
| 939 |
if __name__ == "__main__":
|
| 940 |
-
main()
|
|
|
|
| 619 |
- **"m_"**: Male
|
| 620 |
"""
|
| 621 |
with gr.Blocks() as demo2:
|
| 622 |
+
gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/Kokoro-TTS-Subtitle)")
|
| 623 |
gr.Markdown(explanation) # Display the explanation
|
| 624 |
return demo2
|
| 625 |
|
| 626 |
|
| 627 |
+
|
| 628 |
+
|
| 629 |
+
|
| 630 |
+
|
| 631 |
+
|
| 632 |
+
|
| 633 |
#@title subtitle
|
| 634 |
import os
|
| 635 |
import re
|
|
|
|
| 638 |
import platform
|
| 639 |
import datetime
|
| 640 |
import subprocess
|
| 641 |
+
import math
|
| 642 |
+
import json
|
| 643 |
import pysrt
|
| 644 |
import librosa
|
| 645 |
import soundfile as sf
|
|
|
|
| 647 |
from pydub import AudioSegment
|
| 648 |
from deep_translator import GoogleTranslator
|
| 649 |
|
|
|
|
| 650 |
# ---------------------- Utility Functions ----------------------
|
| 651 |
+
|
| 652 |
+
# Returns the current time formatted as HH_MM_AM/PM (for filenames or logs)
|
| 653 |
def get_current_time():
    """Return the current local time as a 12-hour ``HH_MM_AM/PM`` string."""
    now = datetime.datetime.now()
    return now.strftime("%I_%M_%p")
|
| 655 |
|
| 656 |
+
# Constructs an output file path for the final dubbed audio
|
| 657 |
def get_subtitle_Dub_path(srt_file_path, Language):
|
| 658 |
file_name = os.path.splitext(os.path.basename(srt_file_path))[0]
|
| 659 |
full_base_path = os.path.join(os.getcwd(), "TTS_DUB")
|
|
|
|
| 663 |
new_path = os.path.join(full_base_path, f"{file_name}_{lang}_{random_string}.wav")
|
| 664 |
return new_path.replace("__", "_")
|
| 665 |
|
| 666 |
+
# Removes noise characters like [♫] from the subtitle text and saves a cleaned SRT
|
| 667 |
def clean_srt(input_path):
|
| 668 |
def clean_srt_line(text):
|
| 669 |
for bad in ["[", "]", "♫"]:
|
|
|
|
| 677 |
file.write(f"{sub.index}\n{sub.start} --> {sub.end}\n{clean_srt_line(sub.text)}\n\n")
|
| 678 |
return output_path
|
| 679 |
|
| 680 |
+
# Translates subtitles using Deep Translator while preserving subtitle index order
|
| 681 |
def translate_srt(input_path, target_language="Hindi", max_segments=500, chunk_size=4000):
|
| 682 |
output_path = input_path.replace(".srt", f"{target_language}.srt")
|
| 683 |
subs = pysrt.open(input_path, encoding='utf-8')
|
| 684 |
+
# Refuse very large subtitle files so the free Google Translate endpoint is not flooded with requests (keeps it usable for everyone).
|
| 685 |
if len(subs) > max_segments:
|
| 686 |
gr.Warning(f"Too many segments: {len(subs)} > {max_segments}. Skipping translation.")
|
| 687 |
return input_path
|
| 688 |
|
| 689 |
+
# Annotate original subtitles with <#index> to preserve mapping during translation
|
| 690 |
original = [f"<#{i}>{s.text}" for i, s in enumerate(subs)]
|
| 691 |
full_text = "\n".join(original)
|
| 692 |
|
| 693 |
+
# Split into manageable chunks for Google Translate API
|
| 694 |
chunks, start = [], 0
|
| 695 |
while start < len(full_text):
|
| 696 |
end = start + chunk_size
|
|
|
|
| 702 |
translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
|
| 703 |
translated_text = "\n".join(translated_chunks)
|
| 704 |
|
| 705 |
+
# Rebuild subtitle dictionary after translation
|
| 706 |
pattern = re.compile(r"<#(\d+)>(.*?)(?=<#\d+>|$)", re.DOTALL)
|
| 707 |
translated_dict = {int(i): txt.strip() for i, txt in pattern.findall(translated_text)}
|
| 708 |
|
| 709 |
+
# Assign translated text back to subtitle entries
|
| 710 |
for i, sub in enumerate(subs):
|
| 711 |
sub.text = translated_dict.get(i, sub.text)
|
| 712 |
|
| 713 |
subs.save(output_path, encoding='utf-8')
|
| 714 |
return output_path
|
| 715 |
|
| 716 |
+
# Cleans and optionally translates an SRT file before dubbing
|
| 717 |
def prepare_srt(srt_path, target_language, translate=False):
    """Clean an SRT file and, when requested, translate the cleaned copy.

    Returns the path produced by ``clean_srt``, or the path produced by
    ``translate_srt`` when ``translate`` is truthy.
    """
    cleaned_path = clean_srt(srt_path)
    if not translate:
        return cleaned_path
    return translate_srt(cleaned_path, target_language)
|
| 720 |
|
| 721 |
+
# Checks if FFmpeg is available on the system; if not, warns user and returns fallback
|
| 722 |
+
# To change audio speed explicitly, we can use either FFmpeg or Librosa.
|
| 723 |
def is_ffmpeg_installed():
|
| 724 |
ffmpeg_exe = "ffmpeg.exe" if platform.system() == "Windows" else "ffmpeg"
|
| 725 |
try:
|
|
|
|
| 729 |
gr.Warning("FFmpeg not found. Falling back to librosa for audio speedup.", duration=20)
|
| 730 |
return False, ffmpeg_exe
|
| 731 |
|
| 732 |
+
# Because FFmpeg can handle speeds from 0.5× to 2.0× only
|
| 733 |
+
def atempo_chain(factor):
    """Build an FFmpeg ``atempo`` filter expression for a tempo factor.

    Every stage of the chain is kept inside the 0.5-2.0 window this file
    assumes a single ``atempo`` stage supports; factors outside that window
    are expressed as several stages joined with commas.

    Args:
        factor: Desired tempo multiplier (2.0 = twice as fast).

    Returns:
        A filter string such as ``"atempo=1.500"`` or
        ``"atempo=2.0,atempo=1.500"``.

    Raises:
        ValueError: If ``factor`` is not a positive, finite number.
    """
    # Zero, negatives and infinity would never leave the loops below, and
    # NaN would yield "atempo=nan"; the chained comparison is False for all
    # of them, so reject them up front.
    if not 0 < factor < float("inf"):
        raise ValueError(f"tempo factor must be a positive finite number, got {factor!r}")

    if 0.5 <= factor <= 2.0:
        return f"atempo={factor:.3f}"

    parts = []
    while factor > 2.0:
        parts.append("atempo=2.0")
        factor /= 2.0
    while factor < 0.5:
        parts.append("atempo=0.5")
        factor *= 2.0
    # Whatever remains is now inside the single-stage window.
    parts.append(f"atempo={factor:.3f}")
    return ",".join(parts)
|
| 745 |
+
|
| 746 |
+
# If FFmpeg is not found, we will use Librosa
|
| 747 |
def speedup_audio_librosa(input_file, output_file, speedup_factor):
|
| 748 |
try:
|
| 749 |
y, sr = librosa.load(input_file, sr=None)
|
|
|
|
| 753 |
gr.Warning(f"Librosa speedup failed: {e}")
|
| 754 |
shutil.copy(input_file, output_file)
|
| 755 |
|
| 756 |
+
# Change the audio speed if it exceeds the original SRT segment duration.
|
| 757 |
def change_speed(input_file, output_file, speedup_factor, use_ffmpeg, ffmpeg_path):
    """Re-time ``input_file`` by ``speedup_factor`` and write ``output_file``.

    FFmpeg is used when ``use_ffmpeg`` is truthy; if FFmpeg is disabled or
    the FFmpeg run fails for any reason, the librosa implementation is used
    instead.

    Args:
        input_file: Path of the audio file to re-time.
        output_file: Path the re-timed audio is written to.
        speedup_factor: Tempo multiplier handed to ``atempo_chain`` / librosa.
        use_ffmpeg: Whether to try FFmpeg first.
        ffmpeg_path: FFmpeg executable name or path.
    """
    if not use_ffmpeg:
        speedup_audio_librosa(input_file, output_file, speedup_factor)
        return

    try:
        subprocess.run(
            # "-y" goes before the output file (conventional option order)
            # so an existing output is overwritten without prompting.
            [ffmpeg_path, "-y", "-i", input_file, "-filter:a", atempo_chain(speedup_factor), output_file],
            check=True,
            # Detach stdin so FFmpeg can never block waiting for input.
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception as e:  # deliberate best-effort: any failure falls back to librosa
        # gr.Error is an exception class and does nothing unless raised;
        # since we recover here, surface the problem as a warning instead,
        # matching the other fallbacks in this file.
        gr.Warning(f"FFmpeg speedup error: {e}")
        speedup_audio_librosa(input_file, output_file, speedup_factor)
|
| 771 |
|
| 772 |
+
# Remove silence from the start and end of the audio.
|
| 773 |
def remove_edge_silence(input_path, output_path):
    """Trim leading and trailing silence and save the result.

    The audio is loaded at its native sample rate, trimmed with a 30 dB
    threshold, written to ``output_path``, and ``output_path`` is returned.
    """
    samples, sample_rate = librosa.load(input_path, sr=None)
    trimmed = librosa.effects.trim(samples, top_db=30)[0]
    sf.write(output_path, trimmed, sample_rate)
    return output_path
|
| 778 |
|
|
|
|
| 779 |
# ---------------------- Main Class ----------------------
|
| 780 |
class SRTDubbing:
|
| 781 |
def __init__(self, use_ffmpeg=True, ffmpeg_path="ffmpeg"):
|
|
|
|
| 786 |
os.makedirs(self.cache_dir, exist_ok=True)
|
| 787 |
|
| 788 |
@staticmethod
|
| 789 |
+
# Because our target is single-speaker SRT dubbing,
|
| 790 |
+
# we will calculate the speaker's average talking speed per second.
|
| 791 |
+
def get_avg_speaker_speed(srt_path):
    """Return the mean speaking rate of an SRT file in characters per second.

    Spaces are not counted. Cues with a non-positive duration or no
    characters are ignored. When no cue qualifies, 14 is returned.
    """
    rates = []
    for cue in pysrt.open(srt_path, encoding='utf-8'):
        seconds = (cue.end.ordinal - cue.start.ordinal) / 1000
        chars = len(cue.text.replace(" ", ""))
        if seconds <= 0 or chars <= 0:
            continue
        rates.append(chars / seconds)
    if not rates:
        return 14
    return sum(rates) / len(rates)
|
| 800 |
+
|
| 801 |
+
@staticmethod
|
| 802 |
+
# Calculate the speaker's default talking speed (e.g., 0.5x, 1x, 1.5x)
|
| 803 |
+
def get_speed_factor(srt_path, default_tts_rate=14):
    """Return the tempo factor that matches the TTS to the speaker's pace.

    The factor is the SRT's average characters-per-second divided by
    ``default_tts_rate``, truncated to two decimals.

    Args:
        srt_path: Path of the SRT file to analyse.
        default_tts_rate: Assumed TTS rate in characters per second.

    Returns:
        The truncated factor, or 1.0 (no adjustment) when
        ``default_tts_rate`` is not positive or the truncated factor would
        not be positive.
    """
    avg_rate = SRTDubbing.get_avg_speaker_speed(srt_path)
    if default_tts_rate <= 0:
        return 1.0
    speed_factor = math.floor(avg_rate / default_tts_rate * 100) / 100  # Truncate
    # Truncation turns very slow speech (< 1% of the TTS rate) into 0.0,
    # which is not a usable tempo; leave the audio untouched instead.
    return speed_factor if speed_factor > 0 else 1.0
|
| 807 |
+
|
| 808 |
+
@staticmethod
|
| 809 |
+
# Merge multiple SRT segments if the gap is small and total duration
|
| 810 |
+
# stays under N milliseconds
|
| 811 |
+
def merge_fast_entries(entries, max_pause_gap=1000, max_merged_duration_ms=8000):
    """Merge consecutive subtitle entries that are close together in time.

    An entry absorbs the following one while the gap between them is at most
    ``max_pause_gap`` ms and the merged span stays within
    ``max_merged_duration_ms`` ms. Merged texts are joined with a space; a
    comma is inserted only when the first part does not already end with
    punctuation. Input dictionaries are not modified.

    Args:
        entries: Dicts with at least ``start_time``, ``end_time`` (ms) and
            ``text`` keys, in chronological order.
        max_pause_gap: Largest gap in ms that still allows merging.
        max_merged_duration_ms: Longest allowed merged span in ms.

    Returns:
        A new list of entry dicts; every other key of a merged entry comes
        from the first entry of its group.
    """
    # Endings after which no joining comma is added (avoids "word,," and a
    # comma after non-Latin sentence terminators).
    no_comma_after = (".", "!", "?", ",", ";", ":", "…", "।", "。", "！", "？", "，", "、")

    merged = []
    i, n = 0, len(entries)
    while i < n:
        curr = entries[i].copy()
        j = i + 1
        while j < n:
            next_ = entries[j]
            gap = next_["start_time"] - curr["end_time"]
            new_duration = next_["end_time"] - curr["start_time"]
            if gap > max_pause_gap or new_duration > max_merged_duration_ms:
                break
            head = curr["text"].strip()
            tail = next_["text"].strip()
            if head and tail:
                if not head.endswith(no_comma_after):
                    head += ","
                curr["text"] = f"{head} {tail}"
            else:
                # One side is empty: keep the other as-is, with no stray
                # ", " prefix or dangling comma.
                curr["text"] = head or tail
            curr["end_time"] = next_["end_time"]
            j += 1
        merged.append(curr)
        i = j
    return merged
|
| 832 |
+
|
| 833 |
+
@staticmethod
|
| 834 |
+
# Convert SRT timestamp to milliseconds
|
| 835 |
def convert_to_millisecond(t):
    """Convert a timestamp with hours/minutes/seconds/milliseconds fields to ms."""
    whole_seconds = (t.hours * 60 + t.minutes) * 60 + t.seconds
    return whole_seconds * 1000 + int(t.milliseconds)
|
| 837 |
|
| 838 |
+
# Read SRT file and convert it to our required dictionary format for dubbing
|
| 839 |
+
def read_srt_file(self, file_path):
|
| 840 |
subs = pysrt.open(file_path, encoding='utf-8')
|
| 841 |
entries = []
|
| 842 |
prev_end = 0
|
| 843 |
for idx, sub in enumerate(subs, 1):
|
| 844 |
+
start = self.convert_to_millisecond(sub.start)
|
| 845 |
+
end = self.convert_to_millisecond(sub.end)
|
| 846 |
pause = start - prev_end if idx > 1 else start
|
| 847 |
entries.append({
|
| 848 |
'entry_number': idx,
|
|
|
|
| 854 |
'previous_pause': f"{idx}_before_pause.wav",
|
| 855 |
})
|
| 856 |
prev_end = end
|
| 857 |
+
|
| 858 |
+
entries = self.merge_fast_entries(entries)
|
| 859 |
+
|
| 860 |
+
## For debug
|
| 861 |
+
# with open("./old.json", "w", encoding="utf-8") as f:
|
| 862 |
+
# json.dump(entries, f, indent=2, ensure_ascii=False)
|
| 863 |
+
# with open("/content/new.json", "w", encoding="utf-8") as f:
|
| 864 |
+
# json.dump(entries, f, indent=2, ensure_ascii=False)
|
| 865 |
+
|
| 866 |
return entries
|
| 867 |
|
| 868 |
+
# For TTS, modify this function in the future to use a different TTS or voice cloning tool
|
| 869 |
+
# def text_to_speech_srt(self, text, audio_path, language, voice, actual_duration, default_speed_factor=None):
|
| 870 |
+
# temp = "./cache/temp.wav"
|
| 871 |
+
# if default_speed_factor is None:
|
| 872 |
+
# default_speed_factor = 1.0
|
| 873 |
+
|
| 874 |
+
# # Step 1: Generate clean TTS audio at 1.0x speed (avoid Kokoro noise issue)
|
| 875 |
+
# path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1.0, remove_silence=False, keep_silence_up_to=0.05)
|
| 876 |
+
|
| 877 |
+
# # Step 2: Always adjust the generated TTS to user's speaking speed
|
| 878 |
+
# if default_speed_factor != 1.0:
|
| 879 |
+
# temp_wav = path.replace(".wav", "_user_speed.wav")
|
| 880 |
+
# change_speed(path, temp_wav, default_speed_factor, self.use_ffmpeg, self.ffmpeg_path)
|
| 881 |
+
# path = temp_wav
|
| 882 |
+
|
| 883 |
+
# # Step 3: Trim edges
|
| 884 |
+
# remove_edge_silence(path, temp)
|
| 885 |
+
# audio = AudioSegment.from_file(temp)
|
| 886 |
+
|
| 887 |
+
# # Step 4: If no target duration given, save and exit
|
| 888 |
+
# if actual_duration == 0:
|
| 889 |
+
# shutil.move(temp, audio_path)
|
| 890 |
+
# return
|
| 891 |
+
|
| 892 |
+
# # Step 5: Try regeneration with silence removal if needed
|
| 893 |
+
# if len(audio) > actual_duration:
|
| 894 |
+
# path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1.0, remove_silence=True, keep_silence_up_to=0.05)
|
| 895 |
+
# if default_speed_factor != 1.0:
|
| 896 |
+
# temp_wav = path.replace(".wav", "_tight_user_speed.wav")
|
| 897 |
+
# change_speed(path, temp_wav, default_speed_factor, self.use_ffmpeg, self.ffmpeg_path)
|
| 898 |
+
# path = temp_wav
|
| 899 |
+
# remove_edge_silence(path, temp)
|
| 900 |
+
# audio = AudioSegment.from_file(temp)
|
| 901 |
+
|
| 902 |
+
# # Step 6: Final fallback — force compress audio to fit
|
| 903 |
+
# if len(audio) > actual_duration:
|
| 904 |
+
# factor = len(audio) / actual_duration
|
| 905 |
+
# final_temp = "./cache/speedup_temp.wav"
|
| 906 |
+
# change_speed(temp, final_temp, factor, self.use_ffmpeg, self.ffmpeg_path)
|
| 907 |
+
# shutil.move(final_temp, audio_path)
|
| 908 |
+
# elif len(audio) < actual_duration:
|
| 909 |
+
# silence = AudioSegment.silent(duration=actual_duration - len(audio))
|
| 910 |
+
# (audio + silence).export(audio_path, format="wav")
|
| 911 |
+
# else:
|
| 912 |
+
# shutil.move(temp, audio_path)
|
| 913 |
+
|
| 914 |
+
|
| 915 |
+
# For TTS, modify this function in the future to use a different TTS or voice cloning tool
|
| 916 |
+
def text_to_speech_srt(self, text, audio_path, language, voice, actual_duration, default_speed_factor=None):
    """Synthesize ``text`` and fit the clip into ``actual_duration`` ms.

    The clip is generated at speed 1.0, optionally re-timed by
    ``default_speed_factor``, trimmed of edge silence, and then sped up or
    padded with trailing silence until it matches the target length within
    ``TOLERANCE_MS``. The finished clip is written to ``audio_path``.

    Args:
        text: Subtitle text to speak.
        audio_path: Destination path of the final WAV clip.
        language: Forwarded to ``generate_and_save_audio`` as ``Language``.
        voice: Forwarded to ``generate_and_save_audio`` as ``voice``.
        actual_duration: Target length in milliseconds. Zero or less means
            there is no slot to fit, so the trimmed clip is saved unchanged.
        default_speed_factor: Tempo factor applied to every clip before
            fitting; ``None`` means 1.0 (no change).
    """
    TOLERANCE_MS = 30
    temp = os.path.join(self.cache_dir, "temp.wav")

    if default_speed_factor is None:
        default_speed_factor = 1.0

    def render(remove_silence, suffix):
        """Generate, re-time and trim one take into ``temp``; return its length in ms."""
        path, _ = generate_and_save_audio(
            text, Language=language, voice=voice,
            speed=1.0, remove_silence=remove_silence, keep_silence_up_to=0.05
        )
        if default_speed_factor != 1.0:
            retimed = path.replace(".wav", suffix)
            change_speed(path, retimed, default_speed_factor, self.use_ffmpeg, self.ffmpeg_path)
            path = retimed
        remove_edge_silence(path, temp)
        samples, sample_rate = sf.read(temp)
        # soundfile returns frames on the first axis, so len() is the frame
        # count for mono and multi-channel audio alike.
        return int(len(samples) / sample_rate * 1000)

    # Steps 1-4: first take, keeping the TTS engine's internal pauses.
    duration_ms = render(False, "_user.wav")

    # Step 5: nothing to fit (zero-length cue, which would otherwise divide
    # by zero below) or already close enough -> keep the take as it is.
    if actual_duration <= 0 or abs(duration_ms - actual_duration) <= TOLERANCE_MS:
        shutil.move(temp, audio_path)
        return

    # Step 6: too long -> regenerate with internal silence removed.
    if duration_ms > actual_duration:
        duration_ms = render(True, "_tight_user.wav")

    # Step 7: final correction.
    if duration_ms > actual_duration + TOLERANCE_MS:
        # Still too long: compress by exactly the overshoot ratio.
        factor = duration_ms / actual_duration
        corrected = os.path.join(self.cache_dir, "speed_final.wav")
        change_speed(temp, corrected, factor, self.use_ffmpeg, self.ffmpeg_path)
        shutil.move(corrected, audio_path)
    elif duration_ms < actual_duration - TOLERANCE_MS:
        # Too short: pad the tail with silence up to the target length.
        silence = AudioSegment.silent(duration=actual_duration - duration_ms)
        (AudioSegment.from_file(temp) + silence).export(audio_path, format="wav")
    else:
        shutil.move(temp, audio_path)
|
| 975 |
+
|
| 976 |
|
| 977 |
@staticmethod
|
| 978 |
+
# Insert silent gaps between two segments
|
| 979 |
def make_silence(duration, path):
    """Export a silent WAV clip of length ``duration`` to ``path``."""
    gap = AudioSegment.silent(duration=duration)
    gap.export(path, format="wav")
|
| 981 |
|
| 982 |
@staticmethod
|
| 983 |
+
# Srt save folder
|
| 984 |
def create_folder_for_srt(srt_file_path):
|
| 985 |
base = os.path.splitext(os.path.basename(srt_file_path))[0]
|
| 986 |
folder = f"./dummy/{base}_{str(uuid.uuid4())[:4]}"
|
|
|
|
| 988 |
return folder
|
| 989 |
|
| 990 |
@staticmethod
|
| 991 |
+
# Join the audio chunk files into a single file
|
| 992 |
def concatenate_audio_files(paths, output):
    """Load every file in ``paths``, join them in order and export a WAV to ``output``."""
    combined = AudioSegment.silent(duration=0)
    segments = [AudioSegment.from_file(p) for p in paths]
    for segment in segments:
        combined = combined + segment
    combined.export(output, format="wav")
|
| 995 |
|
| 996 |
+
# Utility function that calls the other functions to dub a whole SRT file
|
| 997 |
+
def srt_to_dub(self, srt_path, output_path, language, voice, speaker_talk_speed=True):
    """Dub a whole SRT file into one audio file at ``output_path``.

    For every entry a silent gap clip and a spoken clip are written into a
    per-SRT folder; all clips are then concatenated in order. When
    ``speaker_talk_speed`` is truthy the clips are re-timed by the factor
    from ``get_speed_factor``; otherwise the factor is 1.0.
    """
    entries = self.read_srt_file(srt_path)
    folder = self.create_folder_for_srt(srt_path)
    default_speed_factor = self.get_speed_factor(srt_path) if speaker_talk_speed else 1.0

    all_audio = []
    for entry in tqdm(entries):
        # Silence that precedes this entry.
        pause_path = os.path.join(folder, entry['previous_pause'])
        self.make_silence(entry['pause_time'], pause_path)
        all_audio.append(pause_path)

        # Spoken clip, fitted to the entry's own duration.
        tts_path = os.path.join(folder, entry['audio_name'])
        slot_ms = entry['end_time'] - entry['start_time']
        self.text_to_speech_srt(entry['text'], tts_path, language, voice, slot_ms, default_speed_factor)
        all_audio.append(tts_path)

    self.concatenate_audio_files(all_audio, output_path)
|
| 1012 |
|
|
|
|
| 1013 |
# ---------------------- Entrypoint ----------------------
|
| 1014 |
+
def srt_process(srt_path, Language="American English", voice_name="af_bella", translate=False,speaker_talk_speed=True):
|
| 1015 |
if not srt_path.endswith(".srt"):
|
| 1016 |
gr.Error("Please upload a valid .srt file", duration=5)
|
| 1017 |
return None
|
|
|
|
| 1020 |
processed_srt = prepare_srt(srt_path, Language, translate)
|
| 1021 |
output_path = get_subtitle_Dub_path(srt_path, Language)
|
| 1022 |
|
| 1023 |
+
SRTDubbing(use_ffmpeg, ffmpeg_path).srt_to_dub(processed_srt, output_path, Language, voice_name,speaker_talk_speed)
|
| 1024 |
+
return output_path, output_path
|
| 1025 |
+
|
| 1026 |
+
# Example usage
|
| 1027 |
+
# srt_file_path = "/content/last.srt" # @param {type: "string"}
|
| 1028 |
+
# dub_audio_path, _ = srt_process(srt_file_path, Language="American English", voice_name="af_bella", translate=False,speaker_talk_speed=False)
|
| 1029 |
+
# print(f"Audio file saved at: {dub_audio_path}")
|
| 1030 |
+
|
| 1031 |
+
|
| 1032 |
+
|
| 1033 |
|
| 1034 |
def subtitle_ui():
|
| 1035 |
with gr.Blocks() as demo:
|
|
|
|
| 1037 |
gr.Markdown(
|
| 1038 |
"""
|
| 1039 |
# Generate Audio File From Subtitle [Upload Only .srt file]
|
| 1040 |
+
|
| 1041 |
+
To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)
|
| 1042 |
+
|
| 1043 |
"""
|
| 1044 |
)
|
| 1045 |
with gr.Row():
|
|
|
|
| 1049 |
language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])
|
| 1050 |
# with gr.Row():
|
| 1051 |
voice = gr.Dropdown(
|
| 1052 |
+
voice_names,
|
| 1053 |
+
value='af_bella',
|
| 1054 |
+
allow_custom_value=False,
|
| 1055 |
+
label='🎙️ Choose VoicePack',
|
| 1056 |
)
|
| 1057 |
with gr.Row():
|
| 1058 |
generate_btn_ = gr.Button('Generate', variant='primary')
|
| 1059 |
|
| 1060 |
with gr.Accordion('Other Settings', open=False):
|
| 1061 |
+
speaker_speed_ = gr.Checkbox(value=True, label="⚡ Match With Apeaker's Average Talking Speed")
|
| 1062 |
translate_text = gr.Checkbox(value=False, label='🌐 Translate Subtitle to Selected Language')
|
| 1063 |
+
|
| 1064 |
+
|
| 1065 |
+
|
| 1066 |
with gr.Column():
|
| 1067 |
audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
|
| 1068 |
audio_file = gr.File(label='📥 Download Audio')
|
|
|
|
| 1071 |
autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
|
| 1072 |
|
| 1073 |
# srt_file.submit(
|
| 1074 |
+
# srt_process,
|
| 1075 |
+
# inputs=[srt_file, voice],
|
| 1076 |
# outputs=[audio]
|
| 1077 |
# )
|
| 1078 |
generate_btn_.click(
|
| 1079 |
+
srt_process,
|
| 1080 |
+
inputs=[srt_file,language_name,voice,translate_text,speaker_speed_],
|
| 1081 |
outputs=[audio,audio_file]
|
| 1082 |
)
|
| 1083 |
return demo
|
|
|
|
| 1084 |
|
| 1085 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1086 |
|
| 1087 |
import click
|
| 1088 |
@click.command()
|
|
|
|
| 1108 |
pipeline = KPipeline(lang_code=last_used_language)
|
| 1109 |
temp_folder = create_audio_dir()
|
| 1110 |
if __name__ == "__main__":
|
| 1111 |
+
main()
|