GoutamSachdev committed on
Commit
0d8dbf9
·
verified ·
1 Parent(s): a65f203

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +199 -184
app.py CHANGED
@@ -1,184 +1,199 @@
1
from kokoro import KModel, KPipeline
import streamlit as st
import os
import random
import torch
import numpy as np  # NOTE(review): unused in this version
from io import BytesIO  # NOTE(review): unused in this version
import time

# True unless this Space runs under the original hexgrad/* account.
IS_DUPLICATE = not os.getenv('SPACE_ID', '').startswith('hexgrad/')
# Hard-coded: this Streamlit version is CPU-only.
CUDA_AVAILABLE = False

# Duplicated spaces get unlimited input; the original caps each generation.
CHAR_LIMIT = None if IS_DUPLICATE else 5000

# pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'ab'}
# Initialize models and pipelines only once using session state
if 'models' not in st.session_state:
    # Single CPU model keyed by False (no GPU entry in this version).
    st.session_state.models = {False: KModel(repo_id='hexgrad/Kokoro-82M').to('cpu').eval()}
if 'pipelines' not in st.session_state:
    st.session_state.pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False, repo_id='hexgrad/Kokoro-82M') for lang_code in 'ab'} # Add custom pronunciations
    st.session_state.pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
    st.session_state.pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
# Use the models and pipelines from session state
models = st.session_state.models
pipelines = st.session_state.pipelines

# NOTE(review): redundant — these same gold pronunciations were already set
# when the pipelines were first created in session state above.
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
36
-
37
def generate_audio(text, voice='af_heart', speed=1):
    """Synthesize the first segment of `text` on CPU.

    Returns ((24000, np_audio), phonemes, seconds) on success, or
    (None, '', 0) when the pipeline yields nothing or synthesis fails.
    """
    start_time = time.time()
    text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
    pipeline = pipelines[voice[0]]  # pipeline keyed by voice-id language prefix
    pack = pipeline.load_voice(voice)
    for _, ps, _ in pipeline(text, voice, speed):
        ref_s = pack[len(ps)-1]  # reference style indexed by phoneme length
        try:
            audio = models[False](ps, ref_s, speed)  # CPU-only model
            generation_time = time.time() - start_time
            # Only the first segment is synthesized; 24000 is the sample rate.
            return (24000, audio.numpy()), ps, generation_time
        except Exception as e:
            st.error(f"Error generating audio: {e}")
            return None, '', 0
    return None, '', 0
52
def tokenize_text(text, voice='af_heart'):
    """Return the phoneme string for the first segment of `text`, '' if none."""
    pipeline = pipelines[voice[0]]
    for _, ps, _ in pipeline(text, voice):
        return ps  # first segment only
    return ''
57
-
58
# Load random quotes
# NOTE(review): blank lines in en.txt become empty-string quotes here.
with open('en.txt', 'r') as r:
    random_quotes = [line.strip() for line in r]

def get_random_quote():
    """Return one random line from en.txt."""
    return random.choice(random_quotes)

def get_gatsby():
    """Return the bundled 5k-character Great Gatsby excerpt."""
    with open('gatsby5k.md', 'r') as r:
        return r.read().strip()

def get_frankenstein():
    """Return the bundled 5k-character Frankenstein excerpt."""
    with open('frankenstein5k.md', 'r') as r:
        return r.read().strip()
72
-
73
# Display label -> voice id. The id prefix encodes language ('a' American,
# 'b' British per the flags) and gender ('f'/'m'); callers key pipelines
# by id[0].
CHOICES = {
    '🇺🇸 🚺 Heart ❤️': 'af_heart',
    '🇺🇸 🚺 Bella 🔥': 'af_bella',
    '🇺🇸 🚺 Nicole 🎧': 'af_nicole',
    '🇺🇸 🚺 Aoede': 'af_aoede',
    '🇺🇸 🚺 Kore': 'af_kore',
    '🇺🇸 🚺 Sarah': 'af_sarah',
    '🇺🇸 🚺 Nova': 'af_nova',
    '🇺🇸 🚺 Sky': 'af_sky',
    '🇺🇸 🚺 Alloy': 'af_alloy',
    '🇺🇸 🚺 Jessica': 'af_jessica',
    '🇺🇸 🚺 River': 'af_river',
    '🇺🇸 🚹 Michael': 'am_michael',
    '🇺🇸 🚹 Fenrir': 'am_fenrir',
    '🇺🇸 🚹 Puck': 'am_puck',
    '🇺🇸 🚹 Echo': 'am_echo',
    '🇺🇸 🚹 Eric': 'am_eric',
    '🇺🇸 🚹 Liam': 'am_liam',
    '🇺🇸 🚹 Onyx': 'am_onyx',
    '🇺🇸 🚹 Santa': 'am_santa',
    '🇺🇸 🚹 Adam': 'am_adam',
    '🇬🇧 🚺 Emma': 'bf_emma',
    '🇬🇧 🚺 Isabella': 'bf_isabella',
    '🇬🇧 🚺 Alice': 'bf_alice',
    '🇬🇧 🚺 Lily': 'bf_lily',
    '🇬🇧 🚹 George': 'bm_george',
    '🇬🇧 🚹 Fable': 'bm_fable',
    '🇬🇧 🚹 Lewis': 'bm_lewis',
    '🇬🇧 🚹 Daniel': 'bm_daniel',
}

# Preload voices
# Warm every voice pack up front so first synthesis is not delayed by a load.
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)
107
-
108
# Markdown shown under the token output, documenting the inline
# pronunciation / stress markup accepted in input text.
TOKEN_NOTE = '''
💡 Customize pronunciation with Markdown link syntax and /slashes/ like `[Kokoro](/kˈOkəɹO/)`

💬 To adjust intonation, try punctuation `;:,.!?—…"()""` or stress `ˈ` and `ˌ`

⬇️ Lower stress `[1 level](-1)` or `[2 levels](-2)`

⬆️ Raise stress 1 level `[or](+2)` 2 levels (only works on less stressed, usually short words)
'''

# Assembled note shown on the Generate tab; extra lines added only when a
# character limit is in force.
STREAM_NOTE = ['⚠️ Streaming is not supported in this Streamlit version.']
if CHAR_LIMIT is not None:
    STREAM_NOTE.append(f'✂️ Each generation is capped at {CHAR_LIMIT} characters.')
    STREAM_NOTE.append('🚀 Want more characters? You can [use Kokoro directly](https://huggingface.co/hexgrad/Kokoro-82M#usage) or duplicate this space.')
STREAM_NOTE = '\n\n'.join(STREAM_NOTE)

# Page banner markdown.
BANNER_TEXT = '''
# Kokoro TTS

***Kokoro*** **is an open-weight TTS model with 82 million parameters.**

This demo only showcases English, but you can directly use the model to access other languages.
'''
131
-
132
# Streamlit UI
# NOTE(review): nesting below is reconstructed from context (indentation was
# lost in the diff rendering) — confirm against the original file.
st.set_page_config(page_title="Kokoro TTS", layout="wide")
st.markdown(BANNER_TEXT)

# Left column: inputs; right column: Generate/Tokens tabs.
col1, col2 = st.columns([1, 1])

with col1:
    text_input = st.text_area("Input Text", height=150,
        help=f"Up to ~500 characters per Generate, or {'∞' if CHAR_LIMIT is None else CHAR_LIMIT} characters per generation")

    voice_options = list(CHOICES.keys())
    voice_values = list(CHOICES.values())
    voice_index = voice_values.index('af_heart')  # default selection

    voice_selection = st.selectbox("Voice", voice_options, index=voice_index,
        help="Quality and availability vary by language")
    voice_value = CHOICES[voice_selection]  # map label back to voice id

    speed = st.slider("Speed", min_value=0.5, max_value=2.0, value=1.0, step=0.1)

    # NOTE(review): these sub-columns are created but never used.
    col1_1, col1_2, col1_3 = st.columns(3)

with col2:
    tab1, tab2 = st.tabs(["Generate", "Tokens"])

    with tab1:
        if st.button("Generate Audio", type="primary"):
            with st.spinner("Generating audio..."):
                audio_result, tokens, generation_time = generate_audio(text_input, voice_value, speed)
            if audio_result:
                sample_rate, audio_data = audio_result
                st.audio(audio_data, sample_rate=sample_rate)
                # Remember tokens so the Tokens tab can show them later.
                st.session_state.last_tokens = tokens
                st.info(f"Audio generated in {generation_time:.2f} seconds")
            else:
                st.error("Failed to generate audio")

        st.markdown(STREAM_NOTE)

    with tab2:
        if st.button("Tokenize"):
            tokens = tokenize_text(text_input, voice_value)
            st.text_area("Output Tokens", value=tokens, height=200,
                help="Tokens used to generate the audio, up to 510 context length.")
        elif 'last_tokens' in st.session_state:
            # Fall back to tokens from the last generation.
            st.text_area("Output Tokens", value=st.session_state.last_tokens, height=200,
                help="Tokens used to generate the audio, up to 510 context length.")
        else:
            st.text_area("Output Tokens", value="", height=200,
                help="Tokens used to generate the audio, up to 510 context length.")

        st.markdown(TOKEN_NOTE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import spaces
from kokoro import KModel, KPipeline
import gradio as gr
import os
import random
import torch

# True unless this Space runs under the original hexgrad/* account.
IS_DUPLICATE = not os.getenv('SPACE_ID', '').startswith('hexgrad/')
CUDA_AVAILABLE = torch.cuda.is_available()
if not IS_DUPLICATE:
    # Original space only: log library versions for debugging.
    import kokoro
    import misaki
    print('DEBUG', kokoro.__version__, CUDA_AVAILABLE, misaki.__version__)

# Duplicated spaces get unlimited input; the original caps each generation.
CHAR_LIMIT = None if IS_DUPLICATE else 5000
# One model per device, keyed by a use-gpu bool: CPU always, GPU when available.
models = {gpu: KModel().to('cuda' if gpu else 'cpu').eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}
# One G2P pipeline per language code: 'a' American, 'b' British English
# (matching the voice-id prefixes in CHOICES).
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'ab'}
# Custom gold pronunciation for the model's own name.
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
20
+
21
@spaces.GPU(duration=30)  # ZeroGPU: allocate a GPU for at most 30s per call
def forward_gpu(ps, ref_s, speed):
    """Run the GPU model on phonemes `ps` with reference style `ref_s`."""
    return models[True](ps, ref_s, speed)
24
+
25
def generate_first(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
    """Synthesize audio for the first segment of `text`.

    Returns ((24000, np_audio), phoneme_string) on success, or (None, '')
    when the pipeline yields no segments (e.g. empty input). Falls back to
    CPU when a GPU call raises a Gradio error (e.g. ZeroGPU quota).
    """
    text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
    pipeline = pipelines[voice[0]]  # pipeline keyed by voice-id language prefix
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE  # never attempt GPU without CUDA
    for _, ps, _ in pipeline(text, voice, speed):
        ref_s = pack[len(ps)-1]  # reference style indexed by phoneme length
        try:
            if use_gpu:
                audio = forward_gpu(ps, ref_s, speed)
            else:
                audio = models[False](ps, ref_s, speed)
        except gr.exceptions.Error as e:
            if use_gpu:
                # GPU allocation/quota failure: warn the user, retry on CPU.
                gr.Warning(str(e))
                gr.Info('Retrying with CPU. To avoid this error, change Hardware to CPU.')
                audio = models[False](ps, ref_s, speed)
            else:
                # Fix: gr.Error expects a message string, not the exception object.
                raise gr.Error(str(e))
        return (24000, audio.numpy()), ps
    return None, ''
46
+
47
# Arena API: CPU-only entry point used by the hidden Predict button.
def predict(text, voice='af_heart', speed=1):
    """Synthesize on CPU and return only the (sample_rate, audio) pair."""
    audio, _phonemes = generate_first(text, voice, speed, use_gpu=False)
    return audio
50
+
51
def tokenize_first(text, voice='af_heart'):
    """Return the phoneme string of the first segment of `text`, '' if none."""
    g2p_pipeline = pipelines[voice[0]]
    segments = (phonemes for _, phonemes, _ in g2p_pipeline(text, voice))
    return next(segments, '')
56
+
57
def generate_all(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
    """Yield (24000, np_audio) chunks for every segment of `text` (streaming).

    Falls back to CPU when a GPU call raises a Gradio error. After the first
    chunk, one silent sample is emitted as a streaming workaround.
    """
    text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
    pipeline = pipelines[voice[0]]  # pipeline keyed by voice-id language prefix
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE  # never attempt GPU without CUDA
    first = True
    for _, ps, _ in pipeline(text, voice, speed):
        ref_s = pack[len(ps)-1]  # reference style indexed by phoneme length
        try:
            if use_gpu:
                audio = forward_gpu(ps, ref_s, speed)
            else:
                audio = models[False](ps, ref_s, speed)
        except gr.exceptions.Error as e:
            if use_gpu:
                gr.Warning(str(e))
                gr.Info('Switching to CPU')
                audio = models[False](ps, ref_s, speed)
            else:
                # Fix: gr.Error expects a message string, not the exception object.
                raise gr.Error(str(e))
        yield 24000, audio.numpy()
        if first:
            first = False
            # Pad a single silent sample after the first chunk so the
            # streaming Audio component starts playback reliably.
            yield 24000, torch.zeros(1).numpy()
81
+
82
# Quote corpus for the "Random Quote" button. Fix: read as UTF-8 explicitly
# (locale default breaks on non-ASCII text on some platforms) and drop blank
# lines so the button never inserts an empty quote.
with open('en.txt', 'r', encoding='utf-8') as r:
    random_quotes = [line.strip() for line in r if line.strip()]

def get_random_quote():
    """Return one random line from en.txt."""
    return random.choice(random_quotes)

def get_gatsby():
    """Return the bundled 5k-character Great Gatsby excerpt."""
    with open('gatsby5k.md', 'r', encoding='utf-8') as r:
        return r.read().strip()

def get_frankenstein():
    """Return the bundled 5k-character Frankenstein excerpt."""
    with open('frankenstein5k.md', 'r', encoding='utf-8') as r:
        return r.read().strip()
95
+
96
# Display label -> voice id. The id prefix encodes language ('a' American,
# 'b' British per the flags) and gender ('f'/'m'); generate_first keys
# pipelines by id[0].
CHOICES = {
    '🇺🇸 🚺 Heart ❤️': 'af_heart',
    '🇺🇸 🚺 Bella 🔥': 'af_bella',
    '🇺🇸 🚺 Nicole 🎧': 'af_nicole',
    '🇺🇸 🚺 Aoede': 'af_aoede',
    '🇺🇸 🚺 Kore': 'af_kore',
    '🇺🇸 🚺 Sarah': 'af_sarah',
    '🇺🇸 🚺 Nova': 'af_nova',
    '🇺🇸 🚺 Sky': 'af_sky',
    '🇺🇸 🚺 Alloy': 'af_alloy',
    '🇺🇸 🚺 Jessica': 'af_jessica',
    '🇺🇸 🚺 River': 'af_river',
    '🇺🇸 🚹 Michael': 'am_michael',
    '🇺🇸 🚹 Fenrir': 'am_fenrir',
    '🇺🇸 🚹 Puck': 'am_puck',
    '🇺🇸 🚹 Echo': 'am_echo',
    '🇺🇸 🚹 Eric': 'am_eric',
    '🇺🇸 🚹 Liam': 'am_liam',
    '🇺🇸 🚹 Onyx': 'am_onyx',
    '🇺🇸 🚹 Santa': 'am_santa',
    '🇺🇸 🚹 Adam': 'am_adam',
    '🇬🇧 🚺 Emma': 'bf_emma',
    '🇬🇧 🚺 Isabella': 'bf_isabella',
    '🇬🇧 🚺 Alice': 'bf_alice',
    '🇬🇧 🚺 Lily': 'bf_lily',
    '🇬🇧 🚹 George': 'bm_george',
    '🇬🇧 🚹 Fable': 'bm_fable',
    '🇬🇧 🚹 Lewis': 'bm_lewis',
    '🇬🇧 🚹 Daniel': 'bm_daniel',
}
# Warm every voice pack up front so first synthesis is not delayed by a load.
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)
128
+
129
# Markdown shown in the Output Tokens accordion, documenting the inline
# pronunciation / stress markup accepted in input text.
TOKEN_NOTE = '''
💡 Customize pronunciation with Markdown link syntax and /slashes/ like `[Kokoro](/kˈOkəɹO/)`
💬 To adjust intonation, try punctuation `;:,.!?—…"()“”` or stress `ˈ` and `ˌ`
⬇️ Lower stress `[1 level](-1)` or `[2 levels](-2)`
⬆️ Raise stress 1 level `[or](+2)` 2 levels (only works on less stressed, usually short words)
'''
135
+
136
# NOTE(review): nesting below is reconstructed from context (indentation was
# lost in the diff rendering) — confirm against the original file.

# "Generate" tab: one-shot synthesis plus phoneme-token inspection.
with gr.Blocks() as generate_tab:
    out_audio = gr.Audio(label='Output Audio', interactive=False, streaming=False, autoplay=True)
    generate_btn = gr.Button('Generate', variant='primary')
    with gr.Accordion('Output Tokens', open=True):
        out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 context length.')
        tokenize_btn = gr.Button('Tokenize', variant='secondary')
        gr.Markdown(TOKEN_NOTE)
    # Hidden button exposing the Arena `predict` API.
    predict_btn = gr.Button('Predict', variant='secondary', visible=False)

# Note shown on the Stream tab; extra lines only when a char limit is active.
STREAM_NOTE = ['⚠️ There is an unknown Gradio bug that might yield no audio the first time you click `Stream`.']
if CHAR_LIMIT is not None:
    STREAM_NOTE.append(f'✂️ Each stream is capped at {CHAR_LIMIT} characters.')
    STREAM_NOTE.append('🚀 Want more characters? You can [use Kokoro directly](https://huggingface.co/hexgrad/Kokoro-82M#usage) or duplicate this space:')
STREAM_NOTE = '\n\n'.join(STREAM_NOTE)

# "Stream" tab: chunked playback with a stop control.
with gr.Blocks() as stream_tab:
    out_stream = gr.Audio(label='Output Audio Stream', interactive=False, streaming=True, autoplay=True)
    with gr.Row():
        stream_btn = gr.Button('Stream', variant='primary')
        stop_btn = gr.Button('Stop', variant='stop')
    with gr.Accordion('Note', open=True):
        gr.Markdown(STREAM_NOTE)
        gr.DuplicateButton()

BANNER_TEXT = '''
[***Kokoro*** **is an open-weight TTS model with 82 million parameters.**](https://huggingface.co/hexgrad/Kokoro-82M)
This demo only showcases English, but you can directly use the model to access other languages.
'''
# Hide the public API only on the official space.
API_OPEN = os.getenv('SPACE_ID') != 'hexgrad/Kokoro-TTS'
API_NAME = None if API_OPEN else False  # api_name=False disables the route
with gr.Blocks() as app:
    with gr.Row():
        gr.Markdown(BANNER_TEXT, container=True)
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label='Input Text', info=f"Up to ~500 characters per Generate, or {'∞' if CHAR_LIMIT is None else CHAR_LIMIT} characters per Stream")
            with gr.Row():
                voice = gr.Dropdown(list(CHOICES.items()), value='af_heart', label='Voice', info='Quality and availability vary by language')
                use_gpu = gr.Dropdown(
                    [('ZeroGPU 🚀', True), ('CPU 🐌', False)],
                    value=CUDA_AVAILABLE,
                    label='Hardware',
                    info='GPU is usually faster, but has a usage quota',
                    interactive=CUDA_AVAILABLE
                )
            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Speed')
            random_btn = gr.Button('🎲 Random Quote 💬', variant='secondary')
            with gr.Row():
                gatsby_btn = gr.Button('🥂 Gatsby 📕', variant='secondary')
                frankenstein_btn = gr.Button('💀 Frankenstein 📗', variant='secondary')
        with gr.Column():
            gr.TabbedInterface([generate_tab, stream_tab], ['Generate', 'Stream'])
    # Event wiring: sample-text buttons fill the textbox; generate/tokenize
    # feed it to the synthesis helpers; Stop cancels the stream event.
    random_btn.click(fn=get_random_quote, inputs=[], outputs=[text], api_name=API_NAME)
    gatsby_btn.click(fn=get_gatsby, inputs=[], outputs=[text], api_name=API_NAME)
    frankenstein_btn.click(fn=get_frankenstein, inputs=[], outputs=[text], api_name=API_NAME)
    generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu], outputs=[out_audio, out_ps], api_name=API_NAME)
    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps], api_name=API_NAME)
    stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream], api_name=API_NAME)
    stop_btn.click(fn=None, cancels=stream_event)
    predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio], api_name=API_NAME)

if __name__ == '__main__':
    app.queue(api_open=API_OPEN).launch(show_api=API_OPEN, ssr_mode=True)