# koko / app.py
# Source: Hugging Face Space file view (raw · history · blame), 3.01 kB.
# Last commit: "Update app.py" by iammraat (106ac57, verified).
import gradio as gr
import tempfile
import soundfile as sf
import numpy as np
from kokoro import KPipeline
import time
import nltk
# Sentence splitting relies on NLTK's Punkt tokenizer data. Look the data up
# first so the downloads only run when it is missing.
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    # Fetch both the "punkt_tab" and the "punkt" resources.
    nltk.download("punkt_tab")
    nltk.download("punkt")
from nltk.tokenize import sent_tokenize
# Single shared Kokoro pipeline, created once at import time.
pipeline = KPipeline(lang_code="a")

# Voice identifiers offered in the UI dropdown.
# NOTE(review): the "bf_"/"bm_" voices look like British-English voices while
# the pipeline is created with lang_code="a" -- confirm this mix is intended.
VOICES = [
    "af_heart",
    "af_bella",
    "af_nicole",
    "am_adam",
    "am_michael",
    "bf_emma",
    "bm_george",
]

# Sample rate (Hz) used when writing the WAV file and returning audio to the UI.
SR = 24000
def _to_float32_samples(audio):
    """Return *audio* as a flat float32 NumPy array, or None when it is None."""
    if audio is None:
        return None
    # Tensor-like objects (anything exposing ``detach``) are moved to host
    # memory first so the NumPy conversion cannot fail on device tensors.
    if hasattr(audio, "detach"):
        audio = audio.detach().cpu().numpy()
    # Flatten so every chunk is 1-D and can be concatenated safely.
    # Assumes the pipeline produces single-channel audio -- TODO confirm.
    return np.asarray(audio, dtype=np.float32).reshape(-1)


def tts_stream(text, voice):
    """Synthesize *text* sentence by sentence, streaming progress updates.

    This is a generator. Every item it yields is a 4-tuple
    ``(audio, wav_path, progress, status)`` matching the four UI outputs:

    * while working: ``(None, None, percent, "Processing sentence i/N...")``
    * on success: ``((SR, samples), path_to_wav, 100, "Completed!")``
    * on empty input: ``(None, None, 0, "Please enter text.")``
    * when no audio was produced: ``(None, None, 100, "No audio was generated.")``

    Args:
        text: Input text; ``None`` and whitespace-only input are treated as empty.
        voice: Voice identifier passed through to the Kokoro pipeline.
    """
    text = (text or "").strip()
    if not text:
        yield None, None, 0, "Please enter text."
        return

    # Split with NLTK rather than on raw punctuation.
    sentences = sent_tokenize(text)
    total = len(sentences)
    audio_chunks = []
    print(f"Split into {total} sentences.")

    for i, sentence in enumerate(sentences):
        if not sentence.strip():
            continue

        # The pipeline returns a generator of (graphemes, phonemes, audio)
        # results; only the audio is needed here.
        for _graphemes, _phonemes, audio in pipeline(sentence, voice=voice):
            samples = _to_float32_samples(audio)
            # Skip missing or empty chunks so concatenation below cannot fail.
            if samples is not None and samples.size:
                audio_chunks.append(samples)

        # Report progress to the UI after each sentence.
        progress = int((i + 1) / total * 100)
        yield None, None, progress, f"Processing sentence {i+1}/{total}..."
        # Short pause between updates (kept from the original as an
        # anti-timeout heartbeat).
        time.sleep(0.05)

    if not audio_chunks:
        # Nothing was synthesised: report it instead of emitting an empty WAV.
        yield None, None, 100, "No audio was generated."
        return

    final_audio = np.concatenate(audio_chunks)

    # Reserve a temp path and close the handle before writing: the original
    # left the handle open, which leaks a descriptor and prevents reopening
    # the file by name on Windows. delete=False keeps the file for download.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    sf.write(wav_path, final_audio, SR)

    # Audio goes to the player, the file path to the download component.
    yield (SR, final_audio), wav_path, 100, "Completed!"
# Two-column layout: inputs on the left, results on the right.
with gr.Blocks(title="Kokoro TTS (Smart Split)") as demo:
    gr.Markdown("## ⚡ Kokoro TTS – Smart Sentence Splitting")

    with gr.Row():
        # Left column: text, voice selection and the trigger button.
        with gr.Column():
            text = gr.Textbox(
                lines=12,
                label="Input text",
                placeholder="Paste long text here...",
            )
            voice = gr.Dropdown(
                choices=VOICES,
                value="af_heart",
                label="Voice",
            )
            run_btn = gr.Button("Generate", variant="primary")

        # Right column: playback, download, and read-only progress widgets.
        with gr.Column():
            audio_output = gr.Audio(label="Audio Output", interactive=False)
            file_download = gr.File(label="Download WAV")
            progress = gr.Slider(
                minimum=0,
                maximum=100,
                step=1,
                label="Progress",
                interactive=False,
            )
            status = gr.Textbox(label="Status", interactive=False)

    # Each tuple yielded by tts_stream maps onto these four outputs in order.
    run_btn.click(
        fn=tts_stream,
        inputs=[text, voice],
        outputs=[audio_output, file_download, progress, status],
    )

# Enable the queue before launching the app.
demo.queue().launch()