Your Name committed on
Commit
e3b97f2
·
1 Parent(s): f07d5a1
Files changed (2) hide show
  1. demo.py +256 -0
  2. requirements.txt +4 -0
demo.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import os
4
+ from kittentts import KittenTTS
5
+
6
# Sample rate (Hz) returned to the Gradio audio output alongside the waveform.
# NOTE(review): assumed to match the rate KittenTTS generates at - confirm.
SAMPLE_RATE = 24000

# Display label offered in the "Model" dropdown -> identifier passed to KittenTTS().
MODELS = {
    "Nano (15M - Fastest)": "KittenML/kitten-tts-nano-0.8-fp32",
    "Micro (40M - Balanced)": "KittenML/kitten-tts-micro-0.8",
    "Mini (80M - Best Quality)": "KittenML/kitten-tts-mini-0.8",
}

# Voice names offered in the "Voice" dropdown; forwarded as ``voice=`` to generate().
VOICES = [
    "Bella",
    "Jasper",
    "Luna",
    "Bruno",
    "Rosie",
    "Hugo",
    "Kiki",
    "Leo",
]
24
+
25
# Instantiate every model eagerly at import time; get_model() later serves
# lookups from this cache, keyed by the display label used in MODELS.
print("Loading models...")
_model_cache: dict[str, KittenTTS] = {}
for model_name, model_id in MODELS.items():
    print(f"Loading {model_name}...")
    _model_cache[model_name] = KittenTTS(model_id)
print("All models loaded!")
32
+
33
+
34
def get_model(model_name: str) -> KittenTTS:
    """Return the preloaded model stored under *model_name*.

    Args:
        model_name: Display label used as a key of ``_model_cache``.

    Returns:
        The ``KittenTTS`` instance created for that label at startup.

    Raises:
        KeyError: If no model was loaded under *model_name*.
    """
    return _model_cache[model_name]
36
+
37
+
38
def synthesize(text: str, model_name: str, voice: str, speed: float):
    """Generate speech for *text* and return it as a Gradio numpy-audio tuple.

    Args:
        text: Text to speak; leading/trailing whitespace is stripped.
        model_name: Label of the preloaded model to use (see ``get_model``).
        voice: Voice name forwarded to the model's ``generate`` call.
        speed: Speed value forwarded to ``generate`` when it accepts one.

    Returns:
        ``(SAMPLE_RATE, audio)`` where ``audio`` is the generated waveform with
        singleton axes removed and at least one dimension.

    Raises:
        gr.Error: If *text* is empty or contains only whitespace.
    """
    # Strip once and reuse, instead of re-stripping for every generate() call.
    cleaned = text.strip() if text else ""
    if not cleaned:
        raise gr.Error("Please enter some text.")

    tts = get_model(model_name)
    # Note: the speed parameter may not be supported in v0.8, so fall back to
    # a call without it when generate() rejects the keyword.
    try:
        audio = tts.generate(cleaned, voice=voice, speed=speed)
    except TypeError:
        # Fallback if speed is not supported
        audio = tts.generate(cleaned, voice=voice)

    # Expected shape is (1, samples) or (samples,): squeeze drops the singleton
    # axis, and atleast_1d keeps a single-sample result from becoming 0-d.
    audio = np.atleast_1d(np.squeeze(audio))

    return (SAMPLE_RATE, audio)
55
+
56
+
57
# Monochrome theme: neutral hues throughout, and every ``*_dark`` override
# repeats its light-mode value so both modes render the same colors.
theme = gr.themes.Base(
    primary_hue="neutral",
    secondary_hue="neutral",
    neutral_hue="neutral",
    font=gr.themes.GoogleFont("Inter"),
).set(
    body_background_fill="white",
    body_background_fill_dark="white",
    block_background_fill="white",
    block_background_fill_dark="white",
    block_border_color="#e5e5e5",
    block_border_color_dark="#e5e5e5",
    block_shadow="none",
    block_shadow_dark="none",
    button_primary_background_fill="#111111",
    button_primary_background_fill_hover="#333333",
    button_primary_text_color="white",
    button_primary_border_color="#111111",
    input_background_fill="white",
    input_background_fill_dark="white",
    input_border_color="#e5e5e5",
    slider_color="#111111",
    table_border_color="#e5e5e5",
    table_even_background_fill="white",
    table_odd_background_fill="white",
    table_row_focus="white",
)
84
+
85
# Custom stylesheet handed to demo.launch(); complements ``theme`` by forcing a
# light, black-on-white look with ``!important`` overrides.
css = """
/* Force light mode — prevents OS dark mode from affecting the page */
:root, html, body { color-scheme: light !important; }
body, .gradio-container, .main { background: white !important; }
.gradio-container { max-width: 860px !important; margin: 40px auto !important; }
footer { display: none !important; }

/* Force all text to black — no accent colors */
*, *::before, *::after {
    color: #111 !important;
    --body-text-color: #111 !important;
    --block-label-text-color: #111 !important;
    --block-title-text-color: #111 !important;
    --color-accent: #111 !important;
    --link-text-color: #111 !important;
    --link-text-color-hover: #111 !important;
    --link-text-color-visited: #111 !important;
    --link-text-color-active: #111 !important;
}

/* Exceptions — keep button text white */
button.primary, button[variant="primary"] { color: white !important; }

/* Error toast notification */
.toast-wrap, .toast-body, [class*="toast"] {
    background: white !important;
    border: 1px solid #e5e5e5 !important;
    box-shadow: 0 4px 12px rgba(0,0,0,0.08) !important;
}
[class*="toast"] .toast-title, [class*="toast"] .error,
.toast-wrap .error, span.error {
    color: #b91c1c !important;
    font-weight: 600 !important;
}
[class*="toast"] p, [class*="toast"] .toast-text {
    color: #555 !important;
}
/* Error badge inside output block */
.error-wrap, .error {
    background: #fef2f2 !important;
    border-color: #fca5a5 !important;
    color: #b91c1c !important;
}

/* Placeholder text */
::placeholder { color: #aaa !important; }

/* Backgrounds */
.block, .form, .wrap, .panel, .gap, .tabs { background: white !important; }

/* Block label tabs (e.g. "Output" on the audio component) */
[data-testid="block-label"] {
    background: white !important;
    color: #111 !important;
    border-color: #e5e5e5 !important;
}
[data-testid="block-label"] * { color: #111 !important; }

/* Dropdown closed state — gray on the full inner wrapper with its natural padding */
input[role="listbox"] {
    background: transparent !important;
}
.wrap-inner {
    background: #f7f7f7 !important;
    border-radius: 4px !important;
}

/* Dropdown popup list */
ul.options {
    background: #f7f7f7 !important;
    border: 1px solid #e5e5e5 !important;
    box-shadow: 0 4px 12px rgba(0,0,0,0.06) !important;
}
ul.options li {
    background: #f7f7f7 !important;
    color: #111 !important;
}
ul.options li:hover, ul.options li.selected {
    background: #eeeeee !important;
}

/* Examples table — force all borders to match */
.examples-holder, .table-wrap, table, thead, tbody, tr, td, th {
    background: white !important;
    border-color: #e5e5e5 !important;
}
.tr-head { box-shadow: none !important; }
tr:hover td { background: #f9f9f9 !important; }

/* Speed number input container and divider */
.tab-like-container, .tab-like-container *, input[type=number] {
    border-color: #e5e5e5 !important;
}
.reset-button {
    -webkit-appearance: none !important;
    appearance: none !important;
    border: none !important;
    background: white !important;
}

/* Slider track */
input[type=range]::-webkit-slider-runnable-track { background: #e5e5e5 !important; }
input[type=range]::-webkit-slider-thumb { background: #111 !important; }
"""
189
+
190
# UI layout: inputs in a wide left column, audio output on the right, then
# the click handler and a table of clickable examples.
# NOTE(review): nesting was reconstructed from a flattened diff view; confirm
# that the speed slider belongs below the model/voice row rather than inside it.
with gr.Blocks(title="KittenTTS Demo") as demo:
    gr.Markdown("# KittenTTS Demo")
    gr.Markdown('<img width="607" height="255" alt="KittenTTS Banner" src="https://github.com/user-attachments/assets/f4646722-ba78-4b25-8a65-81bacee0d4f6" />')
    gr.Markdown("Text-to-speech synthesis with multiple models and voices.")

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter text to synthesize…",
                lines=5,
            )
            with gr.Row():
                model_select = gr.Dropdown(
                    choices=list(MODELS.keys()),
                    value="Micro (40M - Balanced)",
                    label="Model",
                )
                voice_select = gr.Dropdown(
                    choices=VOICES,
                    value="Jasper",
                    label="Voice",
                )
            speed_slider = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.05,
                label="Speed",
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Output", type="numpy")

    # Input order must match synthesize(text, model_name, voice, speed).
    generate_btn.click(
        fn=synthesize,
        inputs=[text_input, model_select, voice_select, speed_slider],
        outputs=audio_output,
    )

    # Each example row fills the four inputs in the same order as above.
    gr.Examples(
        examples=[
            [
                "Space is a three-dimensional continuum containing positions and directions.",
                "Micro (40M - Balanced)",
                "Jasper",
                1.0,
            ],
            [
                "It begins with an 'Ugh!' Another mysterious stain appears on a favorite shirt. Every trick has been tried, but the stain persists.",
                "Mini (80M - Best Quality)",
                "Luna",
                1.0,
            ],
            [
                "Hello! Welcome to the KittenTTS demo. You can choose different voices and models to find the combination you like best.",
                "Nano (15M - Fastest)",
                "Bella",
                1.1,
            ],
        ],
        inputs=[text_input, model_select, voice_select, speed_slider],
    )
254
+
255
if __name__ == "__main__":
    # Serve on all network interfaces and apply the custom theme and stylesheet.
    # NOTE(review): passing theme/css to launch() relies on a Gradio release that
    # accepts them there (older releases take them on gr.Blocks) - gradio is
    # unpinned in requirements.txt, so confirm the installed version.
    demo.launch(server_name="0.0.0.0", theme=theme, css=css)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio
2
+ numpy
3
+ soundfile
4
+ kittentts @ https://github.com/KittenML/KittenTTS/releases/download/0.8/kittentts-0.8.0-py3-none-any.whl