Revrse committed on
Commit
65621f7
·
verified ·
1 Parent(s): 9c2e92c

Upload 3 files

Browse files
Files changed (2) hide show
  1. app.py +129 -6
  2. requirements.txt +1 -0
app.py CHANGED
@@ -119,10 +119,40 @@ def generate_audio_piper(text: str, speed: float = 1.0):
119
  raise FileNotFoundError("Piper model not found")
120
 
121
  piper_voice = piper.PiperVoice.load(model_path)
122
- audio_data_np = piper_voice.synthesize(text)
123
 
124
- # Return as numpy array for Gradio
125
- return (piper_voice.config.sample_rate, audio_data_np)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
  except Exception as e:
128
  raise Exception(f"Piper TTS failed: {str(e)}")
@@ -156,6 +186,22 @@ def generate_audio_coqui(text: str, speed: float = 1.0):
156
  if hasattr(tts, 'synthesizer') and hasattr(tts.synthesizer, 'output_sample_rate'):
157
  sample_rate = tts.synthesizer.output_sample_rate
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  return (sample_rate, wav)
160
 
161
  except Exception as e:
@@ -173,6 +219,18 @@ def generate_audio_espeak(text: str, speed: float = 1.0):
173
  import soundfile as sf
174
  audio_data, sample_rate = sf.read(audio_file_path)
175
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  return (sample_rate, audio_data)
177
  except Exception as e:
178
  raise Exception(f"eSpeak TTS failed: {str(e)}")
@@ -203,6 +261,18 @@ def generate_audio_gtts(text: str, speed: float = 1.0):
203
  import soundfile as sf
204
  audio_data, sample_rate = sf.read(wav_buffer)
205
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  return (sample_rate, audio_data)
207
  except Exception as e:
208
  raise Exception(f"gTTS failed: {str(e)}")
@@ -224,6 +294,18 @@ def generate_audio_pyttsx3(text: str, speed: float = 1.0):
224
  import soundfile as sf
225
  audio_data, sample_rate = sf.read(audio_file_path)
226
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  os.unlink(audio_file_path)
228
  return (sample_rate, audio_data)
229
  except Exception as e:
@@ -263,6 +345,18 @@ def generate_audio_edge_tts(text: str, speed: float = 1.0):
263
  import soundfile as sf
264
  audio_array, sample_rate = sf.read(wav_buffer)
265
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  return (sample_rate, audio_array)
267
 
268
  except Exception as e:
@@ -295,7 +389,34 @@ def generate_speech(text: str, engine: str, speed: float = 1.0):
295
  else: # espeak
296
  sample_rate, audio_data = generate_audio_espeak(text, speed)
297
 
298
- return (sample_rate, audio_data), None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
 
300
  except Exception as e:
301
  return None, f"Error: {str(e)}"
@@ -341,7 +462,7 @@ with gr.Blocks(title="sub200 - Ultra Low Latency TTS", theme=gr.themes.Soft()) a
341
 
342
  generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
343
 
344
- audio_output = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)
345
  error_output = gr.Textbox(label="Status", visible=True)
346
 
347
  # Engine status
@@ -374,4 +495,6 @@ except:
374
  pass
375
 
376
  if __name__ == "__main__":
377
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
 
 
 
119
  raise FileNotFoundError("Piper model not found")
120
 
121
  piper_voice = piper.PiperVoice.load(model_path)
 
122
 
123
+ # synthesize() returns an iterable of AudioChunk objects
124
+ audio_chunks = piper_voice.synthesize(text)
125
+
126
+ # Collect all audio chunks and concatenate them
127
+ audio_arrays = []
128
+ sample_rate = piper_voice.config.sample_rate
129
+
130
+ for chunk in audio_chunks:
131
+ # Each chunk has an audio_float_array property
132
+ audio_arrays.append(chunk.audio_float_array)
133
+ # Use sample_rate from first chunk if available
134
+ if hasattr(chunk, 'sample_rate') and chunk.sample_rate:
135
+ sample_rate = chunk.sample_rate
136
+
137
+ # Concatenate all chunks into a single array
138
+ if audio_arrays:
139
+ audio_data_np = np.concatenate(audio_arrays)
140
+ else:
141
+ raise Exception("No audio chunks generated")
142
+
143
+ # Ensure it's a numpy array and float32
144
+ if not isinstance(audio_data_np, np.ndarray):
145
+ audio_data_np = np.array(audio_data_np, dtype=np.float32)
146
+
147
+ # Ensure audio is 1D (mono)
148
+ if len(audio_data_np.shape) > 1:
149
+ audio_data_np = audio_data_np.flatten()
150
+
151
+ # Convert to float32 if needed
152
+ if audio_data_np.dtype != np.float32:
153
+ audio_data_np = audio_data_np.astype(np.float32)
154
+
155
+ return (sample_rate, audio_data_np)
156
 
157
  except Exception as e:
158
  raise Exception(f"Piper TTS failed: {str(e)}")
 
186
  if hasattr(tts, 'synthesizer') and hasattr(tts.synthesizer, 'output_sample_rate'):
187
  sample_rate = tts.synthesizer.output_sample_rate
188
 
189
+ # Convert to numpy array if it's a tensor or list
190
+ if hasattr(wav, 'cpu'): # PyTorch tensor
191
+ wav = wav.cpu().numpy()
192
+ elif hasattr(wav, 'numpy'): # TensorFlow tensor
193
+ wav = wav.numpy()
194
+ elif not isinstance(wav, np.ndarray):
195
+ wav = np.array(wav, dtype=np.float32)
196
+
197
+ # Ensure audio is 1D (mono) and float32
198
+ if len(wav.shape) > 1:
199
+ wav = wav.flatten()
200
+
201
+ # Convert to float32 if needed
202
+ if wav.dtype != np.float32:
203
+ wav = wav.astype(np.float32)
204
+
205
  return (sample_rate, wav)
206
 
207
  except Exception as e:
 
219
  import soundfile as sf
220
  audio_data, sample_rate = sf.read(audio_file_path)
221
 
222
+ # Ensure it's a numpy array and float32
223
+ if not isinstance(audio_data, np.ndarray):
224
+ audio_data = np.array(audio_data, dtype=np.float32)
225
+
226
+ # Ensure audio is 1D (mono)
227
+ if len(audio_data.shape) > 1:
228
+ audio_data = audio_data.flatten()
229
+
230
+ # Convert to float32 if needed
231
+ if audio_data.dtype != np.float32:
232
+ audio_data = audio_data.astype(np.float32)
233
+
234
  return (sample_rate, audio_data)
235
  except Exception as e:
236
  raise Exception(f"eSpeak TTS failed: {str(e)}")
 
261
  import soundfile as sf
262
  audio_data, sample_rate = sf.read(wav_buffer)
263
 
264
+ # Ensure it's a numpy array and float32
265
+ if not isinstance(audio_data, np.ndarray):
266
+ audio_data = np.array(audio_data, dtype=np.float32)
267
+
268
+ # Ensure audio is 1D (mono)
269
+ if len(audio_data.shape) > 1:
270
+ audio_data = audio_data.flatten()
271
+
272
+ # Convert to float32 if needed
273
+ if audio_data.dtype != np.float32:
274
+ audio_data = audio_data.astype(np.float32)
275
+
276
  return (sample_rate, audio_data)
277
  except Exception as e:
278
  raise Exception(f"gTTS failed: {str(e)}")
 
294
  import soundfile as sf
295
  audio_data, sample_rate = sf.read(audio_file_path)
296
 
297
+ # Ensure it's a numpy array and float32
298
+ if not isinstance(audio_data, np.ndarray):
299
+ audio_data = np.array(audio_data, dtype=np.float32)
300
+
301
+ # Ensure audio is 1D (mono)
302
+ if len(audio_data.shape) > 1:
303
+ audio_data = audio_data.flatten()
304
+
305
+ # Convert to float32 if needed
306
+ if audio_data.dtype != np.float32:
307
+ audio_data = audio_data.astype(np.float32)
308
+
309
  os.unlink(audio_file_path)
310
  return (sample_rate, audio_data)
311
  except Exception as e:
 
345
  import soundfile as sf
346
  audio_array, sample_rate = sf.read(wav_buffer)
347
 
348
+ # Ensure it's a numpy array and float32
349
+ if not isinstance(audio_array, np.ndarray):
350
+ audio_array = np.array(audio_array, dtype=np.float32)
351
+
352
+ # Ensure audio is 1D (mono)
353
+ if len(audio_array.shape) > 1:
354
+ audio_array = audio_array.flatten()
355
+
356
+ # Convert to float32 if needed
357
+ if audio_array.dtype != np.float32:
358
+ audio_array = audio_array.astype(np.float32)
359
+
360
  return (sample_rate, audio_array)
361
 
362
  except Exception as e:
 
389
  else: # espeak
390
  sample_rate, audio_data = generate_audio_espeak(text, speed)
391
 
392
+ # Ensure audio_data is a numpy array (not a list)
393
+ if not isinstance(audio_data, np.ndarray):
394
+ audio_data = np.array(audio_data, dtype=np.float32)
395
+
396
+ # Ensure audio is 1D (mono)
397
+ if len(audio_data.shape) > 1:
398
+ audio_data = audio_data.flatten()
399
+
400
+ # Normalize audio to [-1, 1] range if needed
401
+ max_val = np.max(np.abs(audio_data))
402
+ if max_val > 1.0:
403
+ audio_data = audio_data / max_val
404
+
405
+ # Ensure it's still a numpy array after normalization
406
+ if not isinstance(audio_data, np.ndarray):
407
+ audio_data = np.array(audio_data, dtype=np.float32)
408
+
409
+ # Save to temporary file for Gradio Audio component
410
+ import tempfile
411
+ import soundfile as sf
412
+
413
+ with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
414
+ tmp_path = tmp.name
415
+
416
+ sf.write(tmp_path, audio_data, int(sample_rate))
417
+
418
+ # Return file path for Gradio Audio component
419
+ return tmp_path, None
420
 
421
  except Exception as e:
422
  return None, f"Error: {str(e)}"
 
462
 
463
  generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
464
 
465
+ audio_output = gr.Audio(label="Generated Audio", type="filepath", autoplay=True)
466
  error_output = gr.Textbox(label="Status", visible=True)
467
 
468
  # Engine status
 
495
  pass
496
 
497
  if __name__ == "__main__":
498
+ # Get port from environment (Hugging Face Spaces uses 7860, local uses 8000)
499
+ port = int(os.getenv("PORT", 8000))
500
+ demo.launch(server_name="0.0.0.0", server_port=port, share=False)
requirements.txt CHANGED
@@ -14,3 +14,4 @@ pydub==0.25.1
14
 
15
  # Note: numpy version is managed by TTS (1.22.0 for Python 3.10)
16
  # torch and torchaudio are pre-installed in HF Spaces base image
 
 
14
 
15
  # Note: numpy version is managed by TTS (1.22.0 for Python 3.10)
16
  # torch and torchaudio are pre-installed in HF Spaces base image
17
+ # pandas version is managed by Gradio (compatible version)