Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -149,7 +149,7 @@ class NeuTTSWrapper:
|
|
| 149 |
|
| 150 |
# 3. Infer full text
|
| 151 |
with torch.no_grad():
|
| 152 |
-
audio = self.tts_model.infer(text, ref_s)
|
| 153 |
return audio.cpu().numpy()
|
| 154 |
|
| 155 |
def stream_speech_blocking(self, text: str, ref_audio_path: str, speed: float, audio_format: str) -> Generator[bytes, None, None]:
|
|
@@ -172,7 +172,7 @@ class NeuTTSWrapper:
|
|
| 172 |
|
| 173 |
# Infer sentence
|
| 174 |
with torch.no_grad():
|
| 175 |
-
audio_chunk = self.tts_model.infer(sentence, ref_s)
|
| 176 |
|
| 177 |
# Convert and yield
|
| 178 |
yield self._convert_to_streamable_format(audio_chunk.cpu().numpy(), audio_format)
|
|
@@ -295,6 +295,7 @@ def cleanup_files_blocking():
|
|
| 295 |
@app.post("/synthesize", response_class=Response)
|
| 296 |
async def text_to_speech(
|
| 297 |
text: str = Form(...),
|
|
|
|
| 298 |
speed: float = Form(1.0, ge=0.5, le=2.0),
|
| 299 |
output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
|
| 300 |
reference_audio: UploadFile = File(...)):
|
|
@@ -321,7 +322,8 @@ async def text_to_speech(
|
|
| 321 |
audio_data = await run_blocking_task_async(
|
| 322 |
app.state.tts_wrapper.generate_speech_blocking,
|
| 323 |
text,
|
| 324 |
-
converted_wav_path # IMPORTANT: Pass the CONVERTED WAV path
|
|
|
|
| 325 |
)
|
| 326 |
|
| 327 |
# 4. Convert to requested format (Blocking, but usually fast)
|
|
@@ -365,6 +367,7 @@ async def text_to_speech(
|
|
| 365 |
@app.post("/synthesize/stream")
|
| 366 |
async def stream_text_to_speech_cloning(
|
| 367 |
text: str = Form(..., min_length=1, max_length=5000),
|
|
|
|
| 368 |
speed: float = Form(1.0, ge=0.5, le=2.0),
|
| 369 |
output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
|
| 370 |
reference_audio: UploadFile = File(...)):
|
|
@@ -397,6 +400,7 @@ async def stream_text_to_speech_cloning(
|
|
| 397 |
for chunk_bytes in app.state.tts_wrapper.stream_speech_blocking(
|
| 398 |
text,
|
| 399 |
path_to_delete, # Pass the CONVERTED WAV path
|
|
|
|
| 400 |
speed,
|
| 401 |
output_format
|
| 402 |
):
|
|
|
|
| 149 |
|
| 150 |
# 3. Infer full text
|
| 151 |
with torch.no_grad():
|
| 152 |
+
audio = self.tts_model.infer(text, ref_s, reference_text)
|
| 153 |
return audio.cpu().numpy()
|
| 154 |
|
| 155 |
def stream_speech_blocking(self, text: str, ref_audio_path: str, speed: float, audio_format: str) -> Generator[bytes, None, None]:
|
|
|
|
| 172 |
|
| 173 |
# Infer sentence
|
| 174 |
with torch.no_grad():
|
| 175 |
+
audio_chunk = self.tts_model.infer(sentence, ref_s, reference_text)
|
| 176 |
|
| 177 |
# Convert and yield
|
| 178 |
yield self._convert_to_streamable_format(audio_chunk.cpu().numpy(), audio_format)
|
|
|
|
| 295 |
@app.post("/synthesize", response_class=Response)
|
| 296 |
async def text_to_speech(
|
| 297 |
text: str = Form(...),
|
| 298 |
+
reference_text: str = Form(...),
|
| 299 |
speed: float = Form(1.0, ge=0.5, le=2.0),
|
| 300 |
output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
|
| 301 |
reference_audio: UploadFile = File(...)):
|
|
|
|
| 322 |
audio_data = await run_blocking_task_async(
|
| 323 |
app.state.tts_wrapper.generate_speech_blocking,
|
| 324 |
text,
|
| 325 |
+
converted_wav_path, # IMPORTANT: Pass the CONVERTED WAV path
|
| 326 |
+
reference_text
|
| 327 |
)
|
| 328 |
|
| 329 |
# 4. Convert to requested format (Blocking, but usually fast)
|
|
|
|
| 367 |
@app.post("/synthesize/stream")
|
| 368 |
async def stream_text_to_speech_cloning(
|
| 369 |
text: str = Form(..., min_length=1, max_length=5000),
|
| 370 |
+
reference_text: str = Form(...),
|
| 371 |
speed: float = Form(1.0, ge=0.5, le=2.0),
|
| 372 |
output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
|
| 373 |
reference_audio: UploadFile = File(...)):
|
|
|
|
| 400 |
for chunk_bytes in app.state.tts_wrapper.stream_speech_blocking(
|
| 401 |
text,
|
| 402 |
path_to_delete, # Pass the CONVERTED WAV path
|
| 403 |
+
reference_text,
|
| 404 |
speed,
|
| 405 |
output_format
|
| 406 |
):
|