Nymbo committed on
Commit
0289739
·
verified ·
1 Parent(s): 1a280d7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -44
app.py CHANGED
@@ -1,11 +1,11 @@
1
  import gradio as gr
 
2
  import os
3
  import tempfile
4
  import time
5
  import wave
6
  import struct
7
  import numpy as np
8
- import io
9
  from openai import OpenAI
10
  from elevenlabs.client import ElevenLabs
11
  from elevenlabs import stream, play
@@ -113,7 +113,7 @@ def get_elevenlabs_voices(api_key):
113
  "Sam": "yoZ06aMxZJJ28mfd3POQ"
114
  }
115
 
116
- # Kokoro TTS (CPU/GPU)
117
  _KOKORO_STATE = { "initialized": False, "device": "cpu", "model": None, "pipelines": {} }
118
 
119
  def _init_kokoro() -> None:
@@ -121,14 +121,8 @@ def _init_kokoro() -> None:
121
  return
122
  if KModel is None or KPipeline is None:
123
  raise gr.Error("Kokoro is not installed. Please add 'kokoro>=0.9.4' and 'torch' to requirements and install.")
124
- # Prefer CUDA if available, otherwise CPU
125
  device = "cpu"
126
- try:
127
- if torch is not None and hasattr(torch, "cuda") and torch.cuda.is_available():
128
- device = "cuda"
129
- except Exception:
130
- device = "cpu"
131
-
132
  model = KModel().to(device).eval()
133
  pipelines = {"a": KPipeline(lang_code="a", model=False)}
134
  try:
@@ -138,16 +132,6 @@ def _init_kokoro() -> None:
138
 
139
  _KOKORO_STATE.update({"initialized": True, "device": device, "model": model, "pipelines": pipelines})
140
 
141
- def _int16_wav_bytes(audio_int16: np.ndarray, sr: int = 24_000) -> bytes:
142
- """Encode a mono int16 numpy array to WAV bytes (single chunk)."""
143
- with io.BytesIO() as buf:
144
- with wave.open(buf, "wb") as wf:
145
- wf.setnchannels(1)
146
- wf.setsampwidth(2)
147
- wf.setframerate(sr)
148
- wf.writeframes(audio_int16.tobytes())
149
- return buf.getvalue()
150
-
151
  def get_kokoro_voices():
152
  """Get list of available Kokoro voice IDs."""
153
  try:
@@ -171,11 +155,33 @@ def get_kokoro_voices():
171
  "zf_xiaobei", "zf_xiaoni", "zf_xiaoxiao", "zf_xiaoyi", "zm_yunjian", "zm_yunxi", "zm_yunxia", "zm_yunyang"
172
  ]
173
 
174
- def kokoro_tts_stream(text: str, speed: float, voice: str):
175
- """Stream speech with Kokoro-82M by yielding WAV bytes per segment.
 
 
176
 
177
- Yields: bytes (WAV) chunks suitable for gr.Audio(streaming=True).
178
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  if not text or not text.strip():
180
  raise gr.Error("Please enter text to synthesize.")
181
 
@@ -186,51 +192,80 @@ def kokoro_tts_stream(text: str, speed: float, voice: str):
186
  if pipeline is None:
187
  raise gr.Error("Kokoro English pipeline not initialized.")
188
 
189
- sr = 24_000
190
  pack = pipeline.load_voice(voice)
191
 
192
  try:
193
- # Iterate lazily; do not materialize all segments
194
  for idx, (_, ps, _) in enumerate(pipeline(text, voice, speed)):
195
  ref_s = pack[len(ps) - 1]
196
  try:
197
- audio = model(ps, ref_s, float(speed)) # torch tensor possibly on GPU/CPU
198
  audio_np = audio.detach().cpu().numpy()
199
- audio_clipped = np.clip(audio_np, -1.0, 1.0)
200
- audio_int16 = (audio_clipped * 32767.0).astype(np.int16)
201
- yield _int16_wav_bytes(audio_int16, sr)
202
  except Exception as e:
203
  raise gr.Error(f"Error generating audio for segment {idx + 1}: {str(e)[:200]}...")
204
  except gr.Error:
205
  raise
206
  except Exception as e:
207
- raise gr.Error(f"Error during streaming generation: {str(e)[:200]}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
  # Main dispatcher function to handle all services
 
 
 
 
 
 
210
  def generate_tts(text, service, openai_api_key, openai_model, openai_voice,
211
  elevenlabs_api_key, elevenlabs_voice, voice_dict,
212
  kokoro_speed, kokoro_voice):
213
- """Route to appropriate TTS service.
214
-
215
- Acts as a generator to support streaming audio output.
216
- """
217
  if service == "Kokoro":
218
- for chunk in kokoro_tts_stream(text, kokoro_speed, kokoro_voice):
219
- yield chunk
220
- return
221
- elif service == "OpenAI":
222
- final_path = openai_tts(text, openai_model, openai_voice, openai_api_key)
223
- yield final_path
224
  return
 
 
 
225
  elif service == "ElevenLabs":
226
  voice_id = voice_dict.get(elevenlabs_voice, elevenlabs_voice)
227
- final_path = elevenlabs_tts(text, voice_id, elevenlabs_api_key)
228
- yield final_path
229
- return
230
  else:
231
- # Fallback in case of an unknown service
232
  raise gr.Error(f"Unknown service selected: {service}")
233
 
 
 
 
 
 
 
 
 
 
 
234
  # Function to update ElevenLabs voices when API key changes
235
  def update_elevenlabs_voices(api_key):
236
  """Update voice dropdown when API key is entered"""
@@ -337,6 +372,7 @@ with gr.Blocks(theme='Nymbo/Nymbo_Theme') as demo:
337
  label="Generated Speech",
338
  streaming=True,
339
  autoplay=True,
 
340
  )
341
 
342
  # ==========================
 
1
  import gradio as gr
2
+ import io
3
  import os
4
  import tempfile
5
  import time
6
  import wave
7
  import struct
8
  import numpy as np
 
9
  from openai import OpenAI
10
  from elevenlabs.client import ElevenLabs
11
  from elevenlabs import stream, play
 
113
  "Sam": "yoZ06aMxZJJ28mfd3POQ"
114
  }
115
 
116
+ # Kokoro TTS (CPU-only)
117
  _KOKORO_STATE = { "initialized": False, "device": "cpu", "model": None, "pipelines": {} }
118
 
119
  def _init_kokoro() -> None:
 
121
  return
122
  if KModel is None or KPipeline is None:
123
  raise gr.Error("Kokoro is not installed. Please add 'kokoro>=0.9.4' and 'torch' to requirements and install.")
124
+
125
  device = "cpu"
 
 
 
 
 
 
126
  model = KModel().to(device).eval()
127
  pipelines = {"a": KPipeline(lang_code="a", model=False)}
128
  try:
 
132
 
133
  _KOKORO_STATE.update({"initialized": True, "device": device, "model": model, "pipelines": pipelines})
134
 
 
 
 
 
 
 
 
 
 
 
135
  def get_kokoro_voices():
136
  """Get list of available Kokoro voice IDs."""
137
  try:
 
155
  "zf_xiaobei", "zf_xiaoni", "zf_xiaoxiao", "zf_xiaoyi", "zm_yunjian", "zm_yunxi", "zm_yunxia", "zm_yunyang"
156
  ]
157
 
158
+ def _audio_np_to_int16(audio_np: np.ndarray) -> np.ndarray:
159
+ audio_clipped = np.clip(audio_np, -1.0, 1.0)
160
+ return (audio_clipped * 32767.0).astype(np.int16)
161
+
162
 
163
+ def _write_wav_file(audio_int16: np.ndarray, sample_rate: int = 24_000) -> str:
164
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
165
+ path = tmp.name
166
+ with wave.open(path, "wb") as wf:
167
+ wf.setnchannels(1)
168
+ wf.setsampwidth(2)
169
+ wf.setframerate(sample_rate)
170
+ wf.writeframes(audio_int16.tobytes())
171
+ return path
172
+
173
+
174
+ def _wav_bytes_from_int16(audio_int16: np.ndarray, sample_rate: int = 24_000) -> bytes:
175
+ buffer = io.BytesIO()
176
+ with wave.open(buffer, "wb") as wf:
177
+ wf.setnchannels(1)
178
+ wf.setsampwidth(2)
179
+ wf.setframerate(sample_rate)
180
+ wf.writeframes(audio_int16.tobytes())
181
+ return buffer.getvalue()
182
+
183
+
184
+ def _kokoro_segment_generator(text: str, speed: float, voice: str):
185
  if not text or not text.strip():
186
  raise gr.Error("Please enter text to synthesize.")
187
 
 
192
  if pipeline is None:
193
  raise gr.Error("Kokoro English pipeline not initialized.")
194
 
 
195
  pack = pipeline.load_voice(voice)
196
 
197
  try:
 
198
  for idx, (_, ps, _) in enumerate(pipeline(text, voice, speed)):
199
  ref_s = pack[len(ps) - 1]
200
  try:
201
+ audio = model(ps, ref_s, float(speed))
202
  audio_np = audio.detach().cpu().numpy()
203
+ yield audio_np
 
 
204
  except Exception as e:
205
  raise gr.Error(f"Error generating audio for segment {idx + 1}: {str(e)[:200]}...")
206
  except gr.Error:
207
  raise
208
  except Exception as e:
209
+ raise gr.Error(f"Error during speech generation: {str(e)[:200]}...")
210
+
211
+
212
def kokoro_tts(text: str, speed: float, voice: str) -> str:
    """Synthesize *text* with Kokoro and return the path of a WAV file.

    Collects every generated segment, joins them into one waveform,
    and writes the result to a temp file. Raises gr.Error when the
    pipeline produces no audio at all.
    """
    sample_rate = 24_000
    pieces = list(_kokoro_segment_generator(text, speed, voice))
    if not pieces:
        raise gr.Error("No audio was generated.")

    if len(pieces) == 1:
        waveform = pieces[0]
    else:
        waveform = np.concatenate(pieces, axis=0)
    return _write_wav_file(_audio_np_to_int16(waveform), sample_rate)
221
+
222
+
223
def kokoro_tts_stream(text: str, speed: float, voice: str):
    """Yield one WAV-encoded bytes chunk per Kokoro segment of *text*.

    Suitable for gr.Audio(streaming=True). Raises gr.Error if the
    pipeline yields no segments at all.
    """
    sample_rate = 24_000
    segment_count = 0

    for segment in _kokoro_segment_generator(text, speed, voice):
        segment_count += 1
        yield _wav_bytes_from_int16(_audio_np_to_int16(segment), sample_rate)

    if segment_count == 0:
        raise gr.Error("No audio was generated.")
235
 
236
  # Main dispatcher function to handle all services
237
+ def _read_file_bytes(path: str) -> bytes:
238
+ with open(path, "rb") as file:
239
+ data = file.read()
240
+ return data
241
+
242
+
243
def generate_tts(text, service, openai_api_key, openai_model, openai_voice,
                 elevenlabs_api_key, elevenlabs_voice, voice_dict,
                 kokoro_speed, kokoro_voice):
    """Route to appropriate TTS service based on selection"""
    # Kokoro streams WAV chunks directly; pass them through unchanged.
    if service == "Kokoro":
        for chunk in kokoro_tts_stream(text, kokoro_speed, kokoro_voice):
            yield chunk
        return

    if service == "OpenAI":
        file_path = openai_tts(text, openai_model, openai_voice, openai_api_key)
    elif service == "ElevenLabs":
        resolved_voice = voice_dict.get(elevenlabs_voice, elevenlabs_voice)
        file_path = elevenlabs_tts(text, resolved_voice, elevenlabs_api_key)
    else:
        raise gr.Error(f"Unknown service selected: {service}")

    # Load the generated audio into memory, then always delete the
    # temp file — even if reading it fails.
    try:
        audio_bytes = _read_file_bytes(file_path)
    finally:
        try:
            os.remove(file_path)
        except OSError:
            pass

    yield audio_bytes
268
+
269
  # Function to update ElevenLabs voices when API key changes
270
  def update_elevenlabs_voices(api_key):
271
  """Update voice dropdown when API key is entered"""
 
372
  label="Generated Speech",
373
  streaming=True,
374
  autoplay=True,
375
+ show_download_button=True,
376
  )
377
 
378
  # ==========================