File size: 3,007 Bytes
77dc7d0
1e48f34
 
 
b3f30bd
3bb4d23
106ac57
 
 
 
 
 
 
 
 
 
77dc7d0
b3f30bd
77dc7d0
1e48f34
b3f30bd
1e48f34
b3f30bd
1e48f34
77dc7d0
3bb4d23
77dc7d0
3bb4d23
1e48f34
 
3bb4d23
 
77dc7d0
106ac57
 
 
 
 
3bb4d23
 
106ac57
 
 
 
 
b3f30bd
3bb4d23
 
 
 
106ac57
3bb4d23
77dc7d0
106ac57
b3f30bd
3bb4d23
 
106ac57
 
3bb4d23
106ac57
77dc7d0
106ac57
 
b3f30bd
106ac57
 
 
 
b3f30bd
106ac57
3bb4d23
 
dff9996
106ac57
3bb4d23
77dc7d0
dff9996
106ac57
 
 
 
 
 
 
 
 
 
 
 
 
 
b3f30bd
3bb4d23
 
 
 
b3f30bd
 
106ac57
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import gradio as gr
import tempfile
import soundfile as sf
import numpy as np
from kokoro import KPipeline
import time
import nltk

# Ensure the Punkt sentence-tokenizer data is present locally; it is only
# fetched when the lookup below fails.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    for _nltk_resource in ('punkt_tab', 'punkt'):
        nltk.download(_nltk_resource)

from nltk.tokenize import sent_tokenize

# Single Kokoro pipeline instance, built once when the module is loaded and
# reused by every synthesis request.
pipeline = KPipeline(lang_code="a")

# Voice identifiers offered in the UI dropdown.
VOICES = [
    "af_heart",
    "af_bella",
    "af_nicole",
    "am_adam",
    "am_michael",
    "bf_emma",
    "bm_george",
]

# Sample rate (Hz) used both when writing the WAV file and when handing the
# waveform to the audio player.
SR = 24_000

def tts_stream(text, voice):
    """Synthesize ``text`` one sentence at a time, streaming progress updates.

    This is a generator meant to be used as a Gradio event handler. Every
    yielded item is a 4-tuple ``(audio, wav_path, progress, status)``:

    * While sentences are being processed, ``audio`` and ``wav_path`` are
      ``None``, ``progress`` is an integer percentage and ``status`` is a
      human-readable message.
    * The final item carries ``(SR, waveform)`` for the audio player and the
      path of a temporary ``.wav`` file for the download widget.

    Args:
        text: Input text. ``None`` or whitespace-only input produces a single
            "Please enter text." item and nothing else.
        voice: Voice identifier forwarded to the Kokoro pipeline.

    Yields:
        ``(audio, wav_path, progress, status)`` tuples as described above.
    """
    text = (text or "").strip()
    if not text:
        yield None, None, 0, "Please enter text."
        return

    # Split with NLTK's sentence tokenizer rather than on raw punctuation.
    # Blank entries are dropped up front so the progress percentage is
    # computed over the sentences that are actually synthesized.
    sentences = [s for s in sent_tokenize(text) if s.strip()]
    total = len(sentences)
    audio_chunks = []

    print(f"Split into {total} sentences.")

    for index, sentence in enumerate(sentences, start=1):
        # The pipeline call yields (gs, ps, audio) items; only the audio part
        # is needed here.
        for _gs, _ps, audio in pipeline(sentence, voice=voice):
            if audio is None:
                # Nothing to collect; converting None would not give a 1-D
                # chunk that np.concatenate can join with the others.
                continue
            audio_chunks.append(np.asarray(audio, dtype=np.float32))

        # Progress streaming to UI
        progress = int(index / total * 100)
        yield None, None, progress, f"Processing sentence {index}/{total}..."

        # Anti-timeout heartbeat
        time.sleep(0.05)

    if not audio_chunks:
        # No samples were produced: report it instead of writing an empty WAV
        # and handing an empty waveform to the audio player.
        yield None, None, 100, "No audio was generated."
        return

    final_audio = np.concatenate(audio_chunks)

    # Reserve a temp-file name for the download button. The handle is closed
    # immediately (delete=False keeps the file on disk) so no descriptor is
    # leaked and the file is not held open while soundfile writes to it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    sf.write(wav_path, final_audio, SR)

    # Return the audio to the player and the file for download
    yield (SR, final_audio), wav_path, 100, "Completed!"


# Build the interface: inputs in the first column, results in the second.
with gr.Blocks(title="Kokoro TTS (Smart Split)") as demo:
    gr.Markdown(value="## ⚡ Kokoro TTS – Smart Sentence Splitting")

    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                label="Input text",
                placeholder="Paste long text here...",
                lines=12,
            )
            voice = gr.Dropdown(choices=VOICES, value="af_heart", label="Voice")
            run_btn = gr.Button(value="Generate", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Audio Output", interactive=False)
            file_download = gr.File(label="Download WAV")
            progress = gr.Slider(
                minimum=0,
                maximum=100,
                step=1,
                label="Progress",
                interactive=False,
            )
            status = gr.Textbox(label="Status", interactive=False)

    # Each tuple yielded by tts_stream maps positionally onto these four
    # output components.
    run_btn.click(
        tts_stream,
        [text, voice],
        [audio_output, file_download, progress, status],
    )

# Enable the request queue, then start the server.
demo.queue()
demo.launch()