Spaces:
Sleeping
Sleeping
File size: 3,007 Bytes
77dc7d0 1e48f34 b3f30bd 3bb4d23 106ac57 77dc7d0 b3f30bd 77dc7d0 1e48f34 b3f30bd 1e48f34 b3f30bd 1e48f34 77dc7d0 3bb4d23 77dc7d0 3bb4d23 1e48f34 3bb4d23 77dc7d0 106ac57 3bb4d23 106ac57 b3f30bd 3bb4d23 106ac57 3bb4d23 77dc7d0 106ac57 b3f30bd 3bb4d23 106ac57 3bb4d23 106ac57 77dc7d0 106ac57 b3f30bd 106ac57 b3f30bd 106ac57 3bb4d23 dff9996 106ac57 3bb4d23 77dc7d0 dff9996 106ac57 b3f30bd 3bb4d23 b3f30bd 106ac57 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
import gradio as gr
import tempfile
import soundfile as sf
import numpy as np
from kokoro import KPipeline
import time
import nltk
# Sentence splitting relies on NLTK tokenizer data. Probe for the
# 'punkt_tab' resource first; only when it is missing are both the
# 'punkt_tab' and 'punkt' packages downloaded.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    for _package in ('punkt_tab', 'punkt'):
        nltk.download(_package)

# Imported after the data check above so the tokenizer data has been
# looked up (and fetched if missing) before it is first used.
from nltk.tokenize import sent_tokenize
# Single shared Kokoro pipeline, built once at import time and reused by
# every synthesis request.
# NOTE(review): lang_code "a" is presumably Kokoro's American-English code,
# while the "bf_"/"bm_" voices below look like a different accent family --
# confirm that mixing them in one pipeline is intended.
pipeline = KPipeline(lang_code="a")

# Sample rate (Hz) used when writing the WAV file and when handing audio to
# the player. Assumed to match the rate Kokoro produces -- TODO confirm.
SR = 24_000

# Voice identifiers offered in the UI dropdown.
VOICES = [
    "af_heart",
    "af_bella",
    "af_nicole",
    "am_adam",
    "am_michael",
    "bf_emma",
    "bm_george",
]
def tts_stream(text, voice):
    """Synthesize ``text`` sentence by sentence, streaming progress updates.

    Generator meant to be used as a Gradio event handler. Every yielded
    value is a 4-tuple ``(audio, wav_path, progress, status)``.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only input is
            rejected with a status message.
        voice: Voice identifier forwarded unchanged to ``pipeline``.

    Yields:
        * ``(None, None, 0, "Please enter text.")`` for empty input.
        * ``(None, None, percent, message)`` after each processed sentence.
        * ``((SR, samples), wav_path, 100, "Completed!")`` once all chunks
          are concatenated and written to a temporary WAV file, or
          ``(None, None, 100, "No audio was generated.")`` when the
          pipeline produced no audio at all.
    """
    text = (text or "").strip()
    if not text:
        yield None, None, 0, "Please enter text."
        return

    # Split with NLTK rather than on raw punctuation so abbreviations such
    # as "Dr." / "Mr." and quoted text do not produce bogus sentence breaks.
    sentences = sent_tokenize(text)
    total = len(sentences)
    audio_chunks = []

    print(f"Split into {total} sentences.")

    for index, sentence in enumerate(sentences, start=1):
        if not sentence.strip():
            continue

        # The pipeline returns a generator of (graphemes, phonemes, audio)
        # tuples; only the audio is kept.
        for _graphemes, _phonemes, audio in pipeline(sentence, voice=voice):
            audio_chunks.append(np.asarray(audio, dtype=np.float32))

        # Progress is measured in sentences, not in audio chunks.
        percent = int(index / total * 100)
        yield None, None, percent, f"Processing sentence {index}/{total}..."

        # Short pause between sentences, kept from the original as an
        # anti-timeout heartbeat.
        time.sleep(0.05)

    if not audio_chunks:
        # Nothing was synthesized: report it instead of writing an empty
        # WAV file and handing a zero-length array to the audio player.
        yield None, None, 100, "No audio was generated."
        return

    final_audio = np.concatenate(audio_chunks)

    # Reserve a path for the download button. The handle is closed before
    # writing so no file descriptor leaks and the file is not written while
    # still open. delete=False keeps the file so it can be served afterwards.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    sf.write(wav_path, final_audio, SR)

    # Audio goes to the player, the file path to the download component.
    yield (SR, final_audio), wav_path, 100, "Completed!"
# Gradio UI: inputs in the first column, results in the second.
# NOTE(review): the source indentation was lost; the nesting below is the
# most plausible layout -- confirm against the deployed app.
with gr.Blocks(title="Kokoro TTS (Smart Split)") as demo:
    gr.Markdown("## ⚡ Kokoro TTS – Smart Sentence Splitting")

    with gr.Row():
        # Input column: text, voice choice, and the trigger button.
        with gr.Column():
            text = gr.Textbox(
                label="Input text",
                placeholder="Paste long text here...",
                lines=12,
            )
            voice = gr.Dropdown(choices=VOICES, value="af_heart", label="Voice")
            run_btn = gr.Button(value="Generate", variant="primary")

        # Output column: player, downloadable file, and progress feedback.
        with gr.Column():
            audio_output = gr.Audio(label="Audio Output", interactive=False)
            file_download = gr.File(label="Download WAV")
            progress = gr.Slider(
                minimum=0,
                maximum=100,
                step=1,
                label="Progress",
                interactive=False,
            )
            status = gr.Textbox(label="Status", interactive=False)

    # The outputs are listed in the same order as the 4-tuples yielded by
    # tts_stream: (audio, file, progress, status).
    run_btn.click(
        tts_stream,
        [text, voice],
        [audio_output, file_download, progress, status],
    )

# Enable the request queue, then start the server.
demo.queue()
demo.launch()