Build error
Update app.py
app.py
CHANGED
@@ -1,4 +1,3 @@
-
 # Initalize a pipeline
 from kokoro import KPipeline
 # from IPython.display import display, Audio
@@ -6,24 +5,38 @@ from kokoro import KPipeline
 import os
 from huggingface_hub import list_repo_files
 import uuid
-import re
+import re
 import gradio as gr


-#translate langauge
+#translate langauge
 from deep_translator import GoogleTranslator
-
-
-
-
-
-
-
-
-
-
-
-
+language_map_local = {
+    "American English": "en",
+    "British English": "en",
+    "Hindi": "hi",
+    "Spanish": "es",
+    "French": "fr",
+    "Italian": "it",
+    "Brazilian Portuguese": "pt",
+    "Japanese": "ja",
+    "Mandarin Chinese": "zh-CN"
+}
+def bulk_translate(text, target_language, chunk_size=500,MAX_ALLOWED_CHARACTERS = 10000):
+    if len(text)>=MAX_ALLOWED_CHARACTERS:
+        gr.Warning("[WARNING] Text too long — skipping translation to prevent Google Translate abuse.")
+        return text
+    # language_map_local = {
+    #     "American English": "en",
+    #     "British English": "en",
+    #     "Hindi": "hi",
+    #     "Spanish": "es",
+    #     "French": "fr",
+    #     "Italian": "it",
+    #     "Brazilian Portuguese": "pt",
+    #     "Japanese": "ja",
+    #     "Mandarin Chinese": "zh-CN"
+    # }
     # lang_code = GoogleTranslator().get_supported_languages(as_dict=True).get(target_language.lower())
     lang_code=language_map_local[target_language]
     sentences = re.split(r'(?<=[.!?])\s+', text) # Split text into sentences
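The rewritten bulk_translate guards against very long inputs and then relies on the sentence split shown above; the chunk-building loop itself sits in the unchanged lines between this hunk and the next. A minimal standalone sketch of the same idea, assuming a 500-character budget per request (translate_in_chunks is an illustrative name, not a function from the Space):

```python
import re
from deep_translator import GoogleTranslator

def translate_in_chunks(text, lang_code="hi", chunk_size=500):
    # Split on sentence boundaries so a chunk never cuts a sentence in half.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks, current = [], ""
    for sentence in sentences:
        # Start a new chunk once adding this sentence would exceed the budget.
        if current and len(current) + len(sentence) + 1 > chunk_size:
            chunks.append(current.strip())
            current = ""
        current += sentence + " "
    if current.strip():
        chunks.append(current.strip())
    # One request per chunk keeps each call small enough for the translator.
    translated = [GoogleTranslator(target=lang_code).translate(c) for c in chunks]
    return " ".join(translated).strip()
```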
@@ -43,7 +56,7 @@ def bulk_translate(text, target_language, chunk_size=500):
     translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
     result=" ".join(translated_chunks)
     return result.strip()
-
+
 # Language mapping dictionary
 language_map = {
     "American English": "a",
@@ -67,7 +80,7 @@ def update_pipeline(Language):
     # Only update if the language is different
     if new_lang != last_used_language:
         pipeline = KPipeline(lang_code=new_lang)
-        last_used_language = new_lang
+        last_used_language = new_lang
         try:
             pipeline = KPipeline(lang_code=new_lang)
             last_used_language = new_lang # Update last used language
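This hunk keeps last_used_language in sync so KPipeline is only rebuilt when the selected language actually changes. A hedged sketch of the same caching idea with an explicit dictionary instead of module-level globals (get_pipeline is an illustrative helper, not part of app.py):

```python
from kokoro import KPipeline

_pipelines = {}  # lang_code -> KPipeline, so each language is built at most once

def get_pipeline(lang_code: str) -> KPipeline:
    # Building a pipeline loads model assets, so reuse one when possible.
    if lang_code not in _pipelines:
        _pipelines[lang_code] = KPipeline(lang_code=lang_code)
    return _pipelines[lang_code]
```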
@@ -125,7 +138,7 @@ def clean_text(text):
         r'[\U00002702-\U000027B0]|' # Dingbats
         r'[\U0001F1E0-\U0001F1FF]' # Flags (iOS)
         r'', flags=re.UNICODE)
-
+
     text = emoji_pattern.sub(r'', text)

     # Remove multiple spaces and extra line breaks
@@ -139,13 +152,13 @@ def tts_file_name(text,language):
     text = re.sub(r'[^a-zA-Z\s]', '', text) # Retain only alphabets and spaces
     text = text.lower().strip() # Convert to lowercase and strip leading/trailing spaces
     text = text.replace(" ", "_") # Replace spaces with underscores
-    language=language.replace(" ", "_").strip()
+    language=language.replace(" ", "_").strip()
     # Truncate or handle empty text
     truncated_text = text[:20] if len(text) > 20 else text if len(text) > 0 else language
-
+
     # Generate a random string for uniqueness
     random_string = uuid.uuid4().hex[:8].upper()
-
+
     # Construct the file name
     file_name = f"{temp_folder}/{truncated_text}_{random_string}.wav"
     return file_name
@@ -166,7 +179,7 @@ def remove_silence_function(file_path,minimum_silence=50):
     audio_chunks = split_on_silence(sound,
                                     min_silence_len=100,
                                     silence_thresh=-45,
-                                    keep_silence=minimum_silence)
+                                    keep_silence=minimum_silence)
     # Putting the file back together
     combined = AudioSegment.empty()
     for chunk in audio_chunks:
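For context, the silence-removal path is built on pydub's split_on_silence; below is a small self-contained sketch using the same threshold values that appear in this hunk (the file names are placeholders):

```python
from pydub import AudioSegment
from pydub.silence import split_on_silence

def strip_long_silences(in_path="input.wav", out_path="output.wav", keep_ms=50):
    sound = AudioSegment.from_file(in_path)
    chunks = split_on_silence(
        sound,
        min_silence_len=100,   # silence shorter than 100 ms is left alone
        silence_thresh=-45,    # anything quieter than -45 dBFS counts as silence
        keep_silence=keep_ms,  # keep a short pad of silence around each chunk
    )
    combined = AudioSegment.empty()
    for chunk in chunks:
        combined += chunk      # glue the voiced chunks back together
    combined.export(out_path, format="wav")
    return out_path
```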
@@ -205,7 +218,7 @@ def generate_and_save_audio(text, Language="American English",voice="af_bella",
                 duration_sec = len(audio_np) / 24000
                 timestamps[i]["duration"] = duration_sec
                 wav_file.writeframes(audio_bytes)
-    if remove_silence:
+    if remove_silence:
         keep_silence = int(keep_silence_up_to * 1000)
         new_wave_file=remove_silence_function(save_path,minimum_silence=keep_silence)
         return new_wave_file,timestamps
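The duration bookkeeping above divides the sample count by 24000, i.e. the 24 kHz rate at which the audio is written. A quick arithmetic check of that formula (plain NumPy, no Space code):

```python
import numpy as np

SAMPLE_RATE = 24000                           # rate used in the diff: len(audio_np) / 24000

audio_np = np.zeros(36000, dtype=np.float32)  # stand-in for one generated segment
duration_sec = len(audio_np) / SAMPLE_RATE    # 36000 samples / 24000 Hz = 1.5 seconds
print(f"{duration_sec:.2f}s")                 # -> 1.50s
```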
@@ -257,7 +270,7 @@ def write_word_srt(word_level_timestamps, output_file="word.srt", skip_punctuati

     for entry in word_level_timestamps:
         word = entry["word"]
-
+
         # Skip punctuation if enabled
         if skip_punctuation and all(char in string.punctuation for char in word):
             continue
@@ -320,13 +333,13 @@ def write_sentence_srt(word_level_timestamps, output_file="subtitles.srt", max_w

         # Skip selected punctuation from remove_punctuation list
         if word in remove_punctuation:
-            continue
+            continue

         # Attach punctuation to the previous word
         if word in string.punctuation:
             if subtitle_words:
                 subtitle_words[-1] = (subtitle_words[-1][0] + word, subtitle_words[-1][1])
-            continue
+            continue

         # Start a new subtitle block if needed
         if start_time is None:
@@ -383,16 +396,16 @@ import re
 def fix_punctuation(text):
     # Remove spaces before punctuation marks (., ?, !, ,)
     text = re.sub(r'\s([.,?!])', r'\1', text)
-
+
     # Handle quotation marks: remove spaces before and after them
     text = text.replace('" ', '"')
     text = text.replace(' "', '"')
     text = text.replace('" ', '"')
-
+
     # Track quotation marks to add space after closing quotes
     track = 0
     result = []
-
+
     for index, char in enumerate(text):
         if char == '"':
             track += 1
@@ -495,10 +508,9 @@ def save_current_data():
     if os.path.exists("./last"):
         shutil.rmtree("./last")
     os.makedirs("./last",exist_ok=True)
-
-
+
 def KOKORO_TTS_API(text, Language="American English",voice="af_bella", speed=1,translate_text=False,remove_silence=False,keep_silence_up_to=0.05):
-    if translate_text:
+    if translate_text:
         text=bulk_translate(text, Language, chunk_size=500)
     save_path,timestamps=generate_and_save_audio(text=text, Language=Language,voice=voice, speed=speed,remove_silence=remove_silence,keep_silence_up_to=keep_silence_up_to)
     if remove_silence==False:
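With the new translate_text flag, KOKORO_TTS_API can translate the input before synthesis. A hedged usage sketch against the signature shown in this hunk; judging from the next hunk, the function returns the audio path twice followed by the word-level SRT, sentence SRT and timestamp JSON paths, or None for those three when silence removal is enabled:

```python
# Assumes the functions from app.py are importable in the current session.
audio_path, download_path, word_srt, sentence_srt, timestamps_json = KOKORO_TTS_API(
    text="Hello there, how are you today?",
    Language="Hindi",
    voice="af_bella",
    speed=1,
    translate_text=True,    # translate the English input to Hindi before TTS
    remove_silence=False,   # keep False to also get the SRT/JSON outputs
    keep_silence_up_to=0.05,
)
print(audio_path, word_srt, sentence_srt)
```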
@@ -516,16 +528,15 @@ def KOKORO_TTS_API(text, Language="American English",voice="af_bella", speed=1,t
         shutil.copy(normal_srt, "./last/")
         shutil.copy(json_file, "./last/")
         return save_path,save_path,word_level_srt,normal_srt,json_file
-    return save_path,save_path,None,None,None
-
-
+    return save_path,save_path,None,None,None



-def
-def toggle_autoplay(autoplay):
+def toggle_autoplay(autoplay):
     return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)
-
+lang_list = ['American English', 'British English', 'Hindi', 'Spanish', 'French', 'Italian', 'Brazilian Portuguese', 'Japanese', 'Mandarin Chinese']
+voice_names = get_voice_names("hexgrad/Kokoro-82M")
+def ui():
     # Define examples in the format you mentioned
     dummy_examples = [
         ["Hey, y'all, let’s grab some coffee and catch up!", "American English", "af_bella"],
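get_voice_names("hexgrad/Kokoro-82M") is now called once at module level, but its definition is outside the changed lines. A plausible sketch of such a helper with huggingface_hub.list_repo_files, under the assumption that the voices are stored as .pt files in a voices/ folder of the model repo (list_voice_names is a hypothetical stand-in, not the Space's actual function):

```python
from huggingface_hub import list_repo_files

def list_voice_names(repo_id="hexgrad/Kokoro-82M"):
    # Hypothetical helper: the real get_voice_names is not part of this diff.
    files = list_repo_files(repo_id)
    return sorted(
        f.split("/")[-1].removesuffix(".pt")
        for f in files
        if f.startswith("voices/") and f.endswith(".pt")
    )
```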
@@ -538,17 +549,16 @@ def ui():
         ["こんにちは、お元気ですか?", "Japanese", "jf_nezumi"],
         ["你好,你怎么样?", "Mandarin Chinese", "zf_xiaoni"]
     ]
-
+
     with gr.Blocks() as demo:
         # gr.Markdown("<center><h1 style='font-size: 40px;'>KOKORO TTS</h1></center>") # Larger title with CSS
-        gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
-
-        voice_names = get_voice_names("hexgrad/Kokoro-82M")
+        # gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
+

         with gr.Row():
             with gr.Column():
                 text = gr.Textbox(label='📝 Enter Text', lines=3)
-
+
                 with gr.Row():
                     language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])

@@ -588,7 +598,7 @@ def tutorial():
     # Markdown explanation for language code
     explanation = """
 ## Language Code Explanation:
-Example: `'af_bella'`
+Example: `'af_bella'`
 - **'a'** stands for **American English**.
 - **'f_'** stands for **Female** (If it were 'm_', it would mean Male).
 - **'bella'** refers to the specific voice.
@@ -609,11 +619,298 @@ def tutorial():
 - **"m_"**: Male
 """
     with gr.Blocks() as demo2:
-        gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
+        # gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
         gr.Markdown(explanation) # Display the explanation
     return demo2


+#@title subtitle
+import os
+import re
+import uuid
+import shutil
+import platform
+import datetime
+import subprocess
+
+import pysrt
+import librosa
+import soundfile as sf
+from tqdm.auto import tqdm
+from pydub import AudioSegment
+from deep_translator import GoogleTranslator
+
+
+# ---------------------- Utility Functions ----------------------
+def get_current_time():
+    return datetime.datetime.now().strftime("%I_%M_%p")
+
+def get_subtitle_Dub_path(srt_file_path, Language):
+    file_name = os.path.splitext(os.path.basename(srt_file_path))[0]
+    full_base_path = os.path.join(os.getcwd(), "TTS_DUB")
+    os.makedirs(full_base_path, exist_ok=True)
+    random_string = str(uuid.uuid4())[:6]
+    lang = language_map_local.get(Language, Language.replace(" ", "_"))
+    new_path = os.path.join(full_base_path, f"{file_name}_{lang}_{random_string}.wav")
+    return new_path.replace("__", "_")
+
+def clean_srt(input_path):
+    def clean_srt_line(text):
+        for bad in ["[", "]", "♫"]:
+            text = text.replace(bad, "")
+        return text.strip()
+
+    subs = pysrt.open(input_path, encoding='utf-8')
+    output_path = input_path.lower().replace(".srt", "") + "_.srt"
+    with open(output_path, "w", encoding='utf-8') as file:
+        for sub in subs:
+            file.write(f"{sub.index}\n{sub.start} --> {sub.end}\n{clean_srt_line(sub.text)}\n\n")
+    return output_path
+
+def translate_srt(input_path, target_language="Hindi", max_segments=500, chunk_size=4000):
+    output_path = input_path.replace(".srt", f"{target_language}.srt")
+    subs = pysrt.open(input_path, encoding='utf-8')
+    if len(subs) > max_segments:
+        gr.Warning(f"Too many segments: {len(subs)} > {max_segments}. Skipping translation.")
+        return input_path
+
+    original = [f"<#{i}>{s.text}" for i, s in enumerate(subs)]
+    full_text = "\n".join(original)
+
+    chunks, start = [], 0
+    while start < len(full_text):
+        end = start + chunk_size
+        split_point = full_text.rfind("<#", start, end) if end < len(full_text) else len(full_text)
+        chunks.append(full_text[start:split_point])
+        start = split_point
+
+    lang_code = language_map_local.get(target_language, "en")
+    translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
+    translated_text = "\n".join(translated_chunks)
+
+    pattern = re.compile(r"<#(\d+)>(.*?)(?=<#\d+>|$)", re.DOTALL)
+    translated_dict = {int(i): txt.strip() for i, txt in pattern.findall(translated_text)}
+
+    for i, sub in enumerate(subs):
+        sub.text = translated_dict.get(i, sub.text)
+
+    subs.save(output_path, encoding='utf-8')
+    return output_path
+
+def prepare_srt(srt_path, target_language, translate=False):
+    path = clean_srt(srt_path)
+    return translate_srt(path, target_language) if translate else path
+
+
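translate_srt keeps subtitle alignment by prefixing every cue with a literal <#index> tag before translation and re-extracting the tags afterwards. A compact illustration of that round trip on plain strings; the "translation" step is simulated with str.upper() so no network call is needed:

```python
import re

lines = ["Hello there.", "How are you?", "Goodbye."]

# Tag each line so its index survives translation as literal text.
tagged = "\n".join(f"<#{i}>{line}" for i, line in enumerate(lines))

# Pretend-translate; the real code sends `tagged` to GoogleTranslator in chunks.
translated = tagged.upper()

pattern = re.compile(r"<#(\d+)>(.*?)(?=<#\d+>|$)", re.DOTALL)
restored = {int(i): text.strip() for i, text in pattern.findall(translated)}
print(restored)  # {0: 'HELLO THERE.', 1: 'HOW ARE YOU?', 2: 'GOODBYE.'}
```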
+def is_ffmpeg_installed():
+    ffmpeg_exe = "ffmpeg.exe" if platform.system() == "Windows" else "ffmpeg"
+    try:
+        subprocess.run([ffmpeg_exe, "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+        return True, ffmpeg_exe
+    except Exception:
+        gr.Warning("FFmpeg not found. Falling back to librosa for audio speedup.", duration=20)
+        return False, ffmpeg_exe
+
+def speedup_audio_librosa(input_file, output_file, speedup_factor):
+    try:
+        y, sr = librosa.load(input_file, sr=None)
+        y_stretched = librosa.effects.time_stretch(y, rate=speedup_factor)
+        sf.write(output_file, y_stretched, sr)
+    except Exception as e:
+        gr.Warning(f"Librosa speedup failed: {e}")
+        shutil.copy(input_file, output_file)
+
+def change_speed(input_file, output_file, speedup_factor, use_ffmpeg, ffmpeg_path):
+    if use_ffmpeg:
+        try:
+            subprocess.run([ffmpeg_path, "-i", input_file, "-filter:a", f"atempo={speedup_factor}", output_file, "-y"], check=True)
+        except Exception as e:
+            gr.Error(f"FFmpeg speedup error: {e}")
+            speedup_audio_librosa(input_file, output_file, speedup_factor)
+    else:
+        speedup_audio_librosa(input_file, output_file, speedup_factor)
+
+def remove_edge_silence(input_path, output_path):
+    y, sr = librosa.load(input_path, sr=None)
+    trimmed_audio, _ = librosa.effects.trim(y, top_db=30)
+    sf.write(output_path, trimmed_audio, sr)
+    return output_path
+
+
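change_speed shells out to ffmpeg's atempo audio filter when ffmpeg is available and otherwise falls back to speedup_audio_librosa. A minimal, hedged sketch of the librosa fallback on its own (the file names and the 1.25 factor are placeholders):

```python
import librosa
import soundfile as sf

def speed_up(in_path="input.wav", out_path="fast.wav", factor=1.25):
    # rate > 1.0 shortens the audio while the time stretch preserves pitch,
    # which is what the subtitle-fitting logic below relies on.
    y, sr = librosa.load(in_path, sr=None)
    y_fast = librosa.effects.time_stretch(y, rate=factor)
    sf.write(out_path, y_fast, sr)
    return out_path
```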
+# ---------------------- Main Class ----------------------
+class SRTDubbing:
+    def __init__(self, use_ffmpeg=True, ffmpeg_path="ffmpeg"):
+        self.use_ffmpeg = use_ffmpeg
+        self.ffmpeg_path = ffmpeg_path
+        self.cache_dir = "./cache"
+        os.makedirs("./dummy", exist_ok=True)
+        os.makedirs(self.cache_dir, exist_ok=True)
+
+    @staticmethod
+    def convert_to_millisecond(t):
+        return t.hours * 3600000 + t.minutes * 60000 + t.seconds * 1000 + int(t.milliseconds)
+
+    @staticmethod
+    def read_srt_file(file_path):
+        subs = pysrt.open(file_path, encoding='utf-8')
+        entries = []
+        prev_end = 0
+        for idx, sub in enumerate(subs, 1):
+            start, end = SRTDubbing.convert_to_millisecond(sub.start), SRTDubbing.convert_to_millisecond(sub.end)
+            pause = start - prev_end if idx > 1 else start
+            entries.append({
+                'entry_number': idx,
+                'start_time': start,
+                'end_time': end,
+                'text': sub.text.strip(),
+                'pause_time': pause,
+                'audio_name': f"{idx}.wav",
+                'previous_pause': f"{idx}_before_pause.wav",
+            })
+            prev_end = end
+        return entries
+
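read_srt_file converts each pysrt timestamp to milliseconds and records the gap since the previous cue, so the dub can later insert a matching stretch of silence before each spoken segment. A small sketch of that arithmetic on an in-memory SRT (the cue times are chosen only for illustration):

```python
import pysrt

srt_text = """1
00:00:01,000 --> 00:00:02,500
Hello there.

2
00:00:04,000 --> 00:00:05,000
How are you?
"""

def to_ms(t):
    # Same conversion as SRTDubbing.convert_to_millisecond.
    return t.hours * 3600000 + t.minutes * 60000 + t.seconds * 1000 + t.milliseconds

subs = pysrt.SubRipFile.from_string(srt_text)
prev_end = 0
for idx, sub in enumerate(subs, 1):
    start, end = to_ms(sub.start), to_ms(sub.end)
    pause = start - prev_end if idx > 1 else start
    print(idx, "pause:", pause, "ms", "speech:", end - start, "ms")
    prev_end = end
# cue 1: pause 1000 ms, speech 1500 ms; cue 2: pause 1500 ms, speech 1000 ms
```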
+    def text_to_speech_srt(self, text, audio_path, language, voice, actual_duration):
+        temp = "./cache/temp.wav"
+        # Step 1: Generate initial audio
+        path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1, remove_silence=False, keep_silence_up_to=0.05)
+        # ✂️ Remove leading and trailing silence to make timing tight without trimming actual speech.
+        remove_edge_silence(path, temp)
+        # 📏 Load the trimmed audio and get its duration in milliseconds.
+        audio = AudioSegment.from_file(temp)
+
+        # ⏱️ If no duration is specified (edge case), use the TTS as-is without speed/timing adjustments.
+        if actual_duration == 0:
+            shutil.move(temp, audio_path)
+            return
+
+        # Step 2: If TTS audio is longer, retry with remove_silence=True
+        if len(audio) > actual_duration:
+            path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1, remove_silence=True, keep_silence_up_to=0.05)
+            remove_edge_silence(path, temp)
+            audio = AudioSegment.from_file(temp)
+
+        # Step 3: If still longer → speed up
+        if len(audio) > actual_duration:
+            factor = len(audio) / actual_duration
+            path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=factor, remove_silence=True, keep_silence_up_to=0.05)
+            remove_edge_silence(path, temp)
+            audio = AudioSegment.from_file(temp)
+
+        # Final Adjustment: Speed up via FFmpeg or librosa
+        if len(audio) > actual_duration:
+            factor = len(audio) / actual_duration
+            final_temp = "./cache/speedup_temp.wav"
+            change_speed(temp, final_temp, factor, self.use_ffmpeg, self.ffmpeg_path)
+            shutil.move(final_temp, audio_path)
+
+        # Add silence if too short
+        elif len(audio) < actual_duration:
+            silence = AudioSegment.silent(duration=actual_duration - len(audio))
+            (audio + silence).export(audio_path, format="wav")
+        # ➡️ Fallback: If TTS already perfectly matches subtitle duration, save as-is.
+        else:
+            shutil.move(temp, audio_path) #bad code
+
+    @staticmethod
+    def make_silence(duration, path):
+        AudioSegment.silent(duration=duration).export(path, format="wav")
+
+    @staticmethod
+    def create_folder_for_srt(srt_file_path):
+        base = os.path.splitext(os.path.basename(srt_file_path))[0]
+        folder = f"./dummy/{base}_{str(uuid.uuid4())[:4]}"
+        os.makedirs(folder, exist_ok=True)
+        return folder
+
+    @staticmethod
+    def concatenate_audio_files(paths, output):
+        audio = sum([AudioSegment.from_file(p) for p in paths], AudioSegment.silent(duration=0))
+        audio.export(output, format="wav")
+
+    def srt_to_dub(self, srt_path, output_path, language, voice):
+        entries = self.read_srt_file(srt_path)
+        folder = self.create_folder_for_srt(srt_path)
+        all_audio = []
+        for entry in tqdm(entries):
+            self.make_silence(entry['pause_time'], os.path.join(folder, entry['previous_pause']))
+            all_audio.append(os.path.join(folder, entry['previous_pause']))
+
+            tts_path = os.path.join(folder, entry['audio_name'])
+            self.text_to_speech_srt(entry['text'], tts_path, language, voice, entry['end_time'] - entry['start_time'])
+            all_audio.append(tts_path)
+
+        self.concatenate_audio_files(all_audio, output_path)
+
+
+# ---------------------- Entrypoint ----------------------
+def srt_process(srt_path, Language="American English", voice_name="af_bella", translate=False):
+    if not srt_path.endswith(".srt"):
+        gr.Error("Please upload a valid .srt file", duration=5)
+        return None
+
+    use_ffmpeg, ffmpeg_path = is_ffmpeg_installed()
+    processed_srt = prepare_srt(srt_path, Language, translate)
+    output_path = get_subtitle_Dub_path(srt_path, Language)
+
+    SRTDubbing(use_ffmpeg, ffmpeg_path).srt_to_dub(processed_srt, output_path, Language, voice_name)
+    return output_path
+
+def subtitle_ui():
+    with gr.Blocks() as demo:
+
+        gr.Markdown(
+            """
+            # Generate Audio File From Subtitle [Upload Only .srt file]
+
+            To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)
+
+            """
+        )
+        with gr.Row():
+            with gr.Column():
+                srt_file = gr.File(label='Upload .srt Subtitle File Only')
+                with gr.Row():
+                    language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])
+                with gr.Row():
+                    voice = gr.Dropdown(
+                        voice_names,
+                        value='af_bella',
+                        allow_custom_value=False,
+                        label='🎙️ Choose VoicePack',
+                    )
+                with gr.Row():
+                    generate_btn_ = gr.Button('Generate', variant='primary')
+
+                with gr.Accordion('Other Settings', open=False):
+                    translate_text = gr.Checkbox(value=False, label='🌐 Translate Text to Selected Language')
+
+
+
+            with gr.Column():
+                audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
+                with gr.Accordion('Enable Autoplay', open=False):
+                    autoplay = gr.Checkbox(value=True, label='Autoplay')
+                    autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+
+        # srt_file.submit(
+        #     srt_process,
+        #     inputs=[srt_file, voice],
+        #     outputs=[audio]
+        # )
+        generate_btn_.click(
+            srt_process,
+            inputs=[srt_file,language_name,voice,translate_text],
+            outputs=[audio]
+        )
+    return demo
+
+
+
+# Example usage:
+# srt_file_path = "/content/me.srt"
+# dub_audio_path = srt_process(srt_file_path, Language="American English", voice_name="af_bella", translate=False)
+# print(f"Audio file saved at: {dub_audio_path}")

 import click
 @click.command()
@@ -622,8 +919,9 @@ import click
 def main(debug, share):
 # def main(debug=True, share=True):
     demo1 = ui()
-    demo2 =
-
+    demo2 = subtitle_ui()
+    demo3 = tutorial()
+    demo = gr.TabbedInterface([demo1, demo2,demo3],["Multilingual TTS","SRT Dubbing","VoicePack Explanation"],title="Kokoro TTS")#,theme='JohnSmith9982/small_and_pretty')
     demo.queue().launch(debug=debug, share=share)
     # demo.queue().launch(debug=debug, share=share,server_port=9000)
     #Run on local network
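main now mounts the three Blocks apps as tabs of a gr.TabbedInterface. A stripped-down, runnable sketch of that wiring with placeholder tabs (the real tabs come from ui(), subtitle_ui() and tutorial(); the click options that feed debug and share sit outside this hunk):

```python
import gradio as gr

def make_tab(title):
    # Placeholder Blocks app standing in for one of the Space's tabs.
    with gr.Blocks() as demo:
        gr.Markdown(f"## {title}")
    return demo

app = gr.TabbedInterface(
    [make_tab("Multilingual TTS"), make_tab("SRT Dubbing"), make_tab("VoicePack Explanation")],
    ["Multilingual TTS", "SRT Dubbing", "VoicePack Explanation"],
    title="Kokoro TTS",
)

if __name__ == "__main__":
    app.queue().launch(debug=True, share=False)
```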
@@ -638,4 +936,4 @@ last_used_language = "a"
 pipeline = KPipeline(lang_code=last_used_language)
 temp_folder = create_audio_dir()
 if __name__ == "__main__":
-    main()
+    main()