Update app.py
app.py CHANGED
@@ -1,3 +1,4 @@
 from KOKORO.models import build_model
 from KOKORO.utils import tts,tts_file_name,podcast
 import sys
@@ -6,11 +7,25 @@ import os
 os.system("python download_model.py")
 import torch
 import gc
 print("Loading model...")
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 print(f'Using device: {device}')
-
-MODEL = build_model('./KOKORO/fp16/kokoro-v0_19-half.pth', device)
 print("Model loaded successfully.")

 def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_path="temp.wav",remove_silence=False,minimum_silence=50):
@@ -22,7 +37,7 @@ def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_p
 model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
-current_model = model_list[

 def update_model(model_name):
     """
@@ -43,8 +58,21 @@ def update_model(model_name):
     return f"Model updated to {model_name}"

-
     """
     Converts text to speech using the specified parameters and ensures the model is updated only if necessary.
     """
@@ -54,6 +82,12 @@ def text_to_speech(text, model_name="kokoro-v0_19-half.pth", voice_name="af", sp
     minimum_silence = 0.05
     keep_silence = int(minimum_silence * 1000)
     save_at = tts_file_name(text)
     audio_path = tts_maker(
         text,
         voice_name,
@@ -96,7 +130,6 @@ def toggle_autoplay(autoplay):

 with gr.Blocks() as demo1:
     gr.Markdown("# Batched TTS")
-    gr.Markdown("Run on Your Local System [Kokoro-82M-WebUI](https://github.com/NeuralFalconYT/Kokoro-82M-WebUI)")
     with gr.Row():
         with gr.Column():
             text = gr.Textbox(
@@ -115,16 +148,17 @@ with gr.Blocks() as demo1:
             with gr.Row():
                 generate_btn = gr.Button('Generate', variant='primary')
             with gr.Accordion('Audio Settings', open=False):
-                model_name=gr.Dropdown(model_list,label="Model",value=model_list[
                 remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
                 minimum_silence = gr.Number(
                     label="Keep Silence Upto (In seconds)",
                     value=0.05
                 )
-
-                    minimum=0.25, maximum=2, value=1, step=0.1,
-                    label='⚡️Speed', info='Adjust the speaking speed'
-                )
                 # trim = gr.Slider(
                 #     minimum=0, maximum=1, value=0, step=0.1,
                 #     label='🔪 Trim', info='How much to cut from both ends of each segment'
@@ -134,6 +168,8 @@ with gr.Blocks() as demo1:
                     label='🔇 Pad Between', info='Silent Duration between segments [For Large Text]'
                 )

         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
             with gr.Accordion('Enable Autoplay', open=False):
@@ -142,16 +178,16 @@ with gr.Blocks() as demo1:
     text.submit(
         text_to_speech,
-        inputs=[text, model_name,voice, speed, pad_between, remove_silence, minimum_silence],
         outputs=[audio]
     )
     generate_btn.click(
         text_to_speech,
-        inputs=[text,model_name, voice, speed, pad_between, remove_silence, minimum_silence],
         outputs=[audio]
     )

-def podcast_maker(text,remove_silence=False,minimum_silence=50,model_name="kokoro-v0_19
     global MODEL,device
     update_model(model_name)
     if not minimum_silence:
@@ -258,13 +294,13 @@ def your_tts(text,audio_path,actual_duration,speed=1.0):
     global srt_voice_name
     model_name="kokoro-v0_19.pth"
     tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speed,trim=1.0)
-
     tts_audio = AudioSegment.from_file(tts_path)
     tts_duration = len(tts_audio)
     if tts_duration > actual_duration:
         speedup_factor = tts_duration / actual_duration
         tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speedup_factor,trim=1.0)
-
     shutil.copy(tts_path,audio_path)

@@ -321,6 +357,77 @@ def clean_srt(input_path):

 class SRTDubbing:
@@ -343,14 +450,15 @@ class SRTDubbing:
         if tts_duration > actual_duration:
             speedup_factor = tts_duration / actual_duration
             speedup_filename = "./cache/speedup_temp.wav"
             # Use ffmpeg to change audio speed
-            subprocess.run([
-
-
-
-
-
-            ], check=True)

             # Replace the original TTS audio with the sped-up version
             shutil.move(speedup_filename, audio_path)
@@ -456,10 +564,27 @@ class SRTDubbing:
         with open("entries.json", "w") as file:
             json.dump(entries, file, indent=4)
         return entries
-srt_voice_name="
-
-
-
     srt_dubbing = SRTDubbing()
     dub_save_path=get_subtitle_Dub_path(srt_file_path,dest_language)
     srt_dubbing.srt_to_dub(srt_file_path,dub_save_path,dest_language)
@@ -476,7 +601,7 @@ with gr.Blocks() as demo3:

     gr.Markdown(
         """
-    # Generate Audio File From Subtitle [

    To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)

@@ -495,7 +620,12 @@ with gr.Blocks() as demo3:
             )
             with gr.Row():
                 generate_btn_ = gr.Button('Generate', variant='primary')
-
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
             with gr.Accordion('Enable Autoplay', open=False):
@@ -509,24 +639,292 @@ with gr.Blocks() as demo3:
     # )
     generate_btn_.click(
         srt_process,
-        inputs=[srt_file,voice],
         outputs=[audio]
     )

 display_text = " \n".join(voice_list)

-with gr.Blocks() as
-    gr.Markdown("Run on Your Local System [Kokoro-82M-WebUI](https://github.com/NeuralFalconYT/Kokoro-82M-WebUI)")
     gr.Markdown(f"# Voice Names \n{display_text}")

-
-
-
-
-def main(debug
-    demo = gr.TabbedInterface([demo1, demo2,demo3,demo4], ["Batched TTS", "Multiple Speech-Type Generation","SRT Dubbing","Available Voice Names"],title="Kokoro TTS"
     demo.queue().launch(debug=debug, share=share)
     #Run on local network
@@ -559,4 +957,4 @@ if __name__ == "__main__":

     # save_at=f"./temp_audio/{os.path.basename(result)}"
     # shutil.move(result, save_at)
-    # print(f"Saved at {save_at}")
+
 from KOKORO.models import build_model
 from KOKORO.utils import tts,tts_file_name,podcast
 import sys
 os.system("python download_model.py")
 import torch
 import gc
+import platform
+import shutil
+base_path=os.getcwd()
+def clean_folder_before_start():
+    global base_path
+    folder_list=["dummy","TTS_DUB","kokoro_audio"]
+    for folder in folder_list:
+        if os.path.exists(f"{base_path}/{folder}"):
+            try:
+                shutil.rmtree(f"{base_path}/{folder}")
+            except:
+                pass
+        os.makedirs(f"{base_path}/{folder}", exist_ok=True)
+clean_folder_before_start()
+
 print("Loading model...")
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 print(f'Using device: {device}')
+MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
 print("Model loaded successfully.")

 def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_path="temp.wav",remove_silence=False,minimum_silence=50):

 model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
+current_model = model_list[0]

 def update_model(model_name):
     """

     return f"Model updated to {model_name}"

+def manage_files(file_path):
+    if os.path.exists(file_path):
+        file_extension = os.path.splitext(file_path)[1]  # Get file extension
+        file_size = os.path.getsize(file_path)  # Get file size in bytes
+        # Check if file is a valid .pt file and its size is ≤ 5 MB
+        if file_extension == ".pt" and file_size <= 5 * 1024 * 1024:
+            return True  # File is valid and kept
+        else:
+            os.remove(file_path)  # Delete invalid or oversized file
+            return False
+    return False  # File does not exist

+
+def text_to_speech(text, model_name="kokoro-v0_19.pth", voice_name="af", speed=1.0, pad_between_segments=0, remove_silence=True, minimum_silence=0.20,custom_voicepack=None,trim=0.0):
     """
     Converts text to speech using the specified parameters and ensures the model is updated only if necessary.
     """
     minimum_silence = 0.05
     keep_silence = int(minimum_silence * 1000)
     save_at = tts_file_name(text)
+    # print(voice_name,custom_voicepack)
+    if custom_voicepack:
+        if manage_files(custom_voicepack):
+            voice_name = custom_voicepack
+        else:
+            gr.Warning("Upload small size .pt file only. Using the Current voice pack instead.")
     audio_path = tts_maker(
         text,
         voice_name,
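The new `custom_voicepack` argument is only honoured when `manage_files` accepts the uploaded file (a `.pt` under 5 MB); otherwise the selected voice name is kept and a warning is shown. A minimal sketch of exercising the updated function directly, not part of the commit — the sample text and the `./my_voice.pt` path are hypothetical:

# Illustrative only: call text_to_speech with and without a custom voicepack.
sample = "Kokoro reads this sentence aloud."
default_wav = text_to_speech(sample, model_name="kokoro-v0_19.pth", voice_name="af_bella", speed=1.0)
custom_wav = text_to_speech(sample, custom_voicepack="./my_voice.pt")  # must be a .pt file under 5 MB
print(default_wav, custom_wav)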

 with gr.Blocks() as demo1:
     gr.Markdown("# Batched TTS")
     with gr.Row():
         with gr.Column():
             text = gr.Textbox(
             with gr.Row():
                 generate_btn = gr.Button('Generate', variant='primary')
             with gr.Accordion('Audio Settings', open=False):
+                model_name=gr.Dropdown(model_list,label="Model",value=model_list[0])
+                speed = gr.Slider(
+                    minimum=0.25, maximum=2, value=1, step=0.1,
+                    label='⚡️Speed', info='Adjust the speaking speed'
+                )
                 remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
                 minimum_silence = gr.Number(
                     label="Keep Silence Upto (In seconds)",
                     value=0.05
                 )
+
                 # trim = gr.Slider(
                 #     minimum=0, maximum=1, value=0, step=0.1,
                 #     label='🔪 Trim', info='How much to cut from both ends of each segment'
                     label='🔇 Pad Between', info='Silent Duration between segments [For Large Text]'
                 )

+                custom_voicepack = gr.File(label='Upload Custom VoicePack .pt file')
+
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
             with gr.Accordion('Enable Autoplay', open=False):

     text.submit(
         text_to_speech,
+        inputs=[text, model_name,voice, speed, pad_between, remove_silence, minimum_silence,custom_voicepack],
         outputs=[audio]
     )
     generate_btn.click(
         text_to_speech,
+        inputs=[text,model_name, voice, speed, pad_between, remove_silence, minimum_silence,custom_voicepack],
         outputs=[audio]
     )

+def podcast_maker(text,remove_silence=False,minimum_silence=50,model_name="kokoro-v0_19.pth"):
     global MODEL,device
     update_model(model_name)
     if not minimum_silence:

     global srt_voice_name
     model_name="kokoro-v0_19.pth"
     tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speed,trim=1.0)
+    # print(tts_path)
     tts_audio = AudioSegment.from_file(tts_path)
     tts_duration = len(tts_audio)
     if tts_duration > actual_duration:
         speedup_factor = tts_duration / actual_duration
         tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speedup_factor,trim=1.0)
+        # print(tts_path)
     shutil.copy(tts_path,audio_path)

+import librosa
+import soundfile as sf
+import subprocess
+
+def speedup_audio_librosa(input_file, output_file, speedup_factor):
+    try:
+        # Load the audio file
+        y, sr = librosa.load(input_file, sr=None)
+
+        # Use time stretching to speed up audio without changing pitch
+        y_stretched = librosa.effects.time_stretch(y, rate=speedup_factor)
+
+        # Save the output with the original sample rate
+        sf.write(output_file, y_stretched, sr)
+        # print(f"Speed up by {speedup_factor} completed successfully: {output_file}")
+
+    except Exception as e:
+        gr.Warning(f"Error during speedup with Librosa: {e}")
+        shutil.copy(input_file, output_file)
+
+
+def is_ffmpeg_installed():
+    if platform.system() == "Windows":
+        local_ffmpeg_path = os.path.join("./ffmpeg", "ffmpeg.exe")
+    else:
+        local_ffmpeg_path = "ffmpeg"
+    try:
+        subprocess.run([local_ffmpeg_path, "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
+        # print("FFmpeg is installed")
+        return True,local_ffmpeg_path
+    except (FileNotFoundError, subprocess.CalledProcessError):
+        # print("FFmpeg is not installed. Using 'librosa' for speedup audio in SRT dubbing")
+        gr.Warning("FFmpeg is not installed. Using 'librosa' for speedup audio in SRT dubbing",duration= 20)
+        return False,local_ffmpeg_path
+
+
+# ffmpeg -i test.wav -filter:a "atempo=2.0" ffmpeg.wav -y
+def change_speed(input_file, output_file, speedup_factor):
+    global use_ffmpeg,local_ffmpeg_path
+    if use_ffmpeg:
+        # print("Using FFmpeg for speedup")
+        try:
+            # subprocess.run([
+            #     local_ffmpeg_path,
+            #     "-i", input_file,
+            #     "-filter:a", f"atempo={speedup_factor}",
+            #     output_file,
+            #     "-y"
+            # ], check=True)
+            subprocess.run([
+                local_ffmpeg_path,
+                "-i", input_file,
+                "-filter:a", f"atempo={speedup_factor}",
+                output_file,
+                "-y"
+            ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        except Exception as e:
+            gr.Error(f"Error during speedup with FFmpeg: {e}")
+            speedup_audio_librosa(input_file, output_file, speedup_factor)
+    else:
+        # print("Using Librosa for speedup")
+        speedup_audio_librosa(input_file, output_file, speedup_factor)
+

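`change_speed` prefers FFmpeg's `atempo` filter and falls back to librosa time stretching when FFmpeg is missing or errors out; both approaches change tempo without changing pitch. A small sketch of using the new helpers standalone, not part of the commit — the file names are placeholders:

# Illustrative only: speed a clip up 1.5x with whichever backend is available.
use_ffmpeg, local_ffmpeg_path = is_ffmpeg_installed()
change_speed("input.wav", "fast.wav", 1.5)  # FFmpeg atempo=1.5, or librosa time_stretch(rate=1.5)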
 class SRTDubbing:
         if tts_duration > actual_duration:
             speedup_factor = tts_duration / actual_duration
             speedup_filename = "./cache/speedup_temp.wav"
+            change_speed(tts_filename, speedup_filename, speedup_factor)
             # Use ffmpeg to change audio speed
+            # subprocess.run([
+            #     "ffmpeg",
+            #     "-i", tts_filename,
+            #     "-filter:a", f"atempo={speedup_factor}",
+            #     speedup_filename,
+            #     "-y"
+            # ], check=True)

             # Replace the original TTS audio with the sped-up version
             shutil.move(speedup_filename, audio_path)

         with open("entries.json", "w") as file:
             json.dump(entries, file, indent=4)
         return entries
+srt_voice_name="af"
+use_ffmpeg,local_ffmpeg_path = is_ffmpeg_installed()
+# use_ffmpeg=False
+
+def srt_process(srt_file_path,voice_name,custom_voicepack=None,dest_language="en"):
+    global srt_voice_name,use_ffmpeg
+
+    if not srt_file_path.endswith(".srt"):
+        gr.Error("Please upload a valid .srt file",duration=5)
+        return None
+    if use_ffmpeg:
+        gr.Success("Using FFmpeg for audio speedup to sync with subtitle")
+    else:
+        gr.Warning("Install FFmpeg to ensure high-quality audio when speeding up the audio to sync with subtitle. Default Using 'librosa' for speedup",duration= 20)
+
+    if custom_voicepack:
+        if manage_files(custom_voicepack):
+            srt_voice_name = custom_voicepack
+        else:
+            srt_voice_name=voice_name
+            gr.Warning("Upload small size .pt file only. Using the Current voice pack instead.")
     srt_dubbing = SRTDubbing()
     dub_save_path=get_subtitle_Dub_path(srt_file_path,dest_language)
     srt_dubbing.srt_to_dub(srt_file_path,dub_save_path,dest_language)
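`srt_process` checks the `.srt` extension, optionally swaps in a validated custom voicepack, and then hands the file to `SRTDubbing`. A hypothetical call from Python, not part of the commit — in the UI these arguments come from the `srt_file`, `voice` and `custom_voicepack` components, and the subtitle path below is made up:

# Illustrative only: dub a subtitle file with the default voice.
dub_path = srt_process("movie_subtitles.srt", "af")
print(dub_path)  # path handed to the Gradio audio output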

     gr.Markdown(
         """
+    # Generate Audio File From Subtitle [Upload Only .srt file]

     To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)

             )
             with gr.Row():
                 generate_btn_ = gr.Button('Generate', variant='primary')
+
+            with gr.Accordion('Audio Settings', open=False):
+                custom_voicepack = gr.File(label='Upload Custom VoicePack .pt file')
+
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
             with gr.Accordion('Enable Autoplay', open=False):

     # )
     generate_btn_.click(
         srt_process,
+        inputs=[srt_file,voice,custom_voicepack],
         outputs=[audio]
     )

+
+#### Voice mixing
+# modified from here
+# https://huggingface.co/spaces/ysharma/Make_Custom_Voices_With_KokoroTTS
+def get_voices():
+    voices = {}
+    for i in os.listdir("./KOKORO/voices"):
+        if i.endswith(".pt"):
+            voice_name = i.replace(".pt", "")
+            voices[voice_name] = torch.load(f"./KOKORO/voices/{i}", weights_only=True).to(device)
+
+    slider_configs = {}
+
+    # Iterate through the predefined list of voices
+    for i in voices:
+        # Handle the default case for "af"
+        if i == "af":
+            slider_configs["af"]= "Default 👩🇺🇸"
+            continue
+        if i == "af_nicole":
+            slider_configs["af_nicole"]="Nicole 😏🇺🇸"
+            continue
+        if i == "af_bella":
+            slider_configs["af_bella"]="Bella 🤗🇺🇸"
+            continue
+
+        # Determine the country emoji
+        country = "🇺🇸" if i.startswith("a") else "🇬🇧"
+
+        # Determine the gender emoji and name
+        if "f_" in i:
+            display_name = f"{i.split('_')[-1].capitalize()} 👩{country}"
+        elif "m_" in i:
+            display_name = f"{i.split('_')[-1].capitalize()} 👨{country}"
+        else:
+            display_name = f"{i.capitalize()} 😐"
+
+        # Append the voice tuple to the list
+        slider_configs[i]= display_name
+
+    return voices, slider_configs
+
+voices, slider_configs = get_voices()
+
+
+def parse_voice_formula(formula):
+    global voices
+    """Parse the voice formula string and return the combined voice tensor."""
+    if not formula.strip():
+        raise ValueError("Empty voice formula")
+
+    # Initialize the weighted sum
+    weighted_sum = None
+
+    # Split the formula into terms
+    terms = formula.split('+')
+    weights=0
+    for term in terms:
+        # Parse each term (format: "voice_name * 0.333")
+        parts = term.strip().split('*')
+        if len(parts) != 2:
+            raise ValueError(f"Invalid term format: {term.strip()}. Should be 'voice_name * weight'")
+
+        voice_name = parts[0].strip()
+        weight = float(parts[1].strip())
+        weights+=weight
+        # print(voice_name)
+        # print(weight)
+        # Get the voice tensor
+        if voice_name not in voices:
+            raise ValueError(f"Unknown voice: {voice_name}")
+
+        voice_tensor = voices[voice_name]
+
+        # Add to weighted sum
+        if weighted_sum is None:
+            weighted_sum = weight * voice_tensor
+        else:
+            weighted_sum += weight * voice_tensor
+    return weighted_sum/weights
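`parse_voice_formula` expects a `+`-separated list of `voice_name * weight` terms and divides the weighted sum by the total weight, so unnormalised weights still produce a normalised blend. A hypothetical blend of two packs that ship with the Space, not part of the commit:

# Illustrative only: average two voicepacks and save the result as a new .pt file.
formula = "af_bella * 0.7 + af_nicole * 0.3"
mixed_voice = parse_voice_formula(formula)       # weight-normalised voice tensor
torch.save(mixed_voice, "./my_mixed_voice.pt")   # same idea as get_new_voice() below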
+
+
+def get_new_voice(formula):
+    # print(formula)
+    try:
+        # Parse the formula and get the combined voice tensor
+        weighted_voices = parse_voice_formula(formula)
+        voice_pack_name = "./weighted_normalised_voices.pt"
+        # Save and load the combined voice
+        torch.save(weighted_voices, voice_pack_name)
+        # print(f"Voice pack saved at: {voice_pack_name}")
+        return voice_pack_name
+    except Exception as e:
+        raise gr.Error(f"Failed to create voice: {str(e)}")
+
+
+def generate_voice_formula(*values):
+    """
+    Generate a formatted string showing the normalized voice combination.
+    Returns: String like "0.6 * voice1" or "0.4 * voice1 + 0.6 * voice2"
+    """
+    n = len(values) // 2
+    checkbox_values = values[:n]
+    slider_values = list(values[n:])
+    global slider_configs
+    # Get active sliders and their names
+    active_pairs = [(slider_values[i], slider_configs[i][0])
+                    for i in range(len(slider_configs))
+                    if checkbox_values[i]]
+
+    if not active_pairs:
+        return ""
+
+    # If only one voice is selected, use its actual value
+    if len(active_pairs) == 1:
+        value, name = active_pairs[0]
+        return f"{value:.3f} * {name}"
+
+    # Calculate sum for normalization of multiple voices
+    total_sum = sum(value for value, _ in active_pairs)
+
+    if total_sum == 0:
+        return ""
+
+    # Generate normalized formula for multiple voices
+    terms = []
+    for value, name in active_pairs:
+        normalized_value = value / total_sum
+        terms.append(f"{normalized_value:.3f} * {name}")
+
+    return " + ".join(terms)

+
+
+def create_voice_mix_ui():
+    with gr.Blocks() as demo:
+        gr.Markdown(
+            """
+            # Kokoro Voice Mixer
+            Select voices and adjust their weights to create a mixed voice.
+            """
+        )
+
+        voice_components = {}
+        voice_names = list(voices.keys())
+        female_voices = [name for name in voice_names if "f_" in name]
+        male_voices = [name for name in voice_names if "b_" in name]
+        neutral_voices = [name for name in voice_names if "f_" not in name and "b_" not in name]
+
+        # Define how many columns you want
+        num_columns = 3
+
+        # Function to generate UI
+        def generate_ui_row(voice_list):
+            num_voices = len(voice_list)
+            num_rows = (num_voices + num_columns - 1) // num_columns
+            for i in range(num_rows):
+                with gr.Row():
+                    for j in range(num_columns):
+                        index = i * num_columns + j
+                        if index < num_voices:
+                            voice_name = voice_list[index]
+                            with gr.Column():
+                                checkbox = gr.Checkbox(label=slider_configs[voice_name])
+                                weight_slider = gr.Slider(
+                                    minimum=0,
+                                    maximum=1,
+                                    value=1.0,
+                                    step=0.01,
+                                    interactive=False
+                                )
+                                voice_components[voice_name] = (checkbox, weight_slider)
+                                checkbox.change(
+                                    lambda x, slider=weight_slider: gr.update(interactive=x),
+                                    inputs=[checkbox],
+                                    outputs=[weight_slider]
+                                )
+
+        generate_ui_row(female_voices)
+        generate_ui_row(male_voices)
+        generate_ui_row(neutral_voices)
+
+        formula_inputs = []
+        for i in voice_components:
+            checkbox, slider = voice_components[i]
+            formula_inputs.append(checkbox)
+            formula_inputs.append(slider)
+
+        with gr.Row():
+            voice_formula = gr.Textbox(label="Voice Formula", interactive=False)
+
+        # Function to dynamically update the voice formula
+        def update_voice_formula(*args):
+            formula_parts = []
+            for i, (checkbox, slider) in enumerate(voice_components.values()):
+                if args[i * 2]:  # If checkbox is selected
+                    formula_parts.append(f"{list(voice_components.keys())[i]} * {args[i * 2 + 1]:.3f}")
+            return " + ".join(formula_parts)
+
+
+        # Update formula whenever any checkbox or slider changes
+        for checkbox, slider in voice_components.values():
+            checkbox.change(
+                update_voice_formula,
+                inputs=formula_inputs,
+                outputs=[voice_formula]
+            )
+            slider.change(
+                update_voice_formula,
+                inputs=formula_inputs,
+                outputs=[voice_formula]
+            )
+
+        with gr.Row():
+            voice_text = gr.Textbox(
+                label='Enter Text',
+                lines=3,
+                placeholder="Type your text here to preview the custom voice..."
+            )
+            voice_generator = gr.Button('Generate', variant='primary')
+            with gr.Accordion('Audio Settings', open=False):
+                model_name=gr.Dropdown(model_list,label="Model",value=model_list[0])
+                speed = gr.Slider(
+                    minimum=0.25, maximum=2, value=1, step=0.1,
+                    label='⚡️Speed', info='Adjust the speaking speed'
+                )
+                remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
+        with gr.Row():
+            voice_audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
+        with gr.Row():
+            mix_voice_download = gr.File(label="Download VoicePack")
+        with gr.Accordion('Enable Autoplay', open=False):
+            autoplay = gr.Checkbox(value=True, label='Autoplay')
+            autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[voice_audio])
+        def generate_custom_audio(text_input, formula_text, model_name, speed, remove_silence):
+            try:
+                new_voice_pack = get_new_voice(formula_text)
+                audio_output_path =text_to_speech(text=text_input, model_name=model_name, voice_name="af", speed=speed, pad_between_segments=0, remove_silence=remove_silence, minimum_silence=0.05,custom_voicepack=new_voice_pack,trim=0.0)
+                # audio_output_path = text_to_speech(text=text_input, model_name=model_name,voice_name="af", speed=1.0, custom_voicepack=new_voice_pack)
+                return audio_output_path,new_voice_pack
+            except Exception as e:
+                raise gr.Error(f"Failed to generate audio: {e}")
+
+
+        voice_generator.click(
+            generate_custom_audio,
+            inputs=[voice_text, voice_formula,model_name,speed,remove_silence],
+            outputs=[voice_audio,mix_voice_download]
+        )
+    return demo
+
+demo4 = create_voice_mix_ui()
+
+
 display_text = " \n".join(voice_list)

+with gr.Blocks() as demo5:
     gr.Markdown(f"# Voice Names \n{display_text}")
+

+import click
+@click.command()
+@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
+@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
+def main(debug, share):
+    demo = gr.TabbedInterface([demo1, demo2,demo3,demo4,demo5], ["Batched TTS", "Multiple Speech-Type Generation","SRT Dubbing","Voice Mix","Available Voice Names"],title="Kokoro TTS")

     demo.queue().launch(debug=debug, share=share)
     #Run on local network
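With the click wrapper in place, the two flags map straight onto Gradio's launch call via the `if __name__ == "__main__":` guard shown in the final hunk. A sketch of how the app would typically be started from a shell, not part of the commit:

# Illustrative only: both flags are optional.
#   python app.py                   -> run the interface locally
#   python app.py --debug --share   -> verbose Gradio logs plus a public share link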

     # save_at=f"./temp_audio/{os.path.basename(result)}"
     # shutil.move(result, save_at)
+    # print(f"Saved at {save_at}")