Rajhuggingface4253 committed on
Commit
4932f88
·
verified ·
1 Parent(s): 7d77f94

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -3
app.py CHANGED
@@ -149,7 +149,7 @@ class NeuTTSWrapper:
149
 
150
  # 3. Infer full text
151
  with torch.no_grad():
152
- audio = self.tts_model.infer(text, ref_s)
153
  return audio.cpu().numpy()
154
 
155
  def stream_speech_blocking(self, text: str, ref_audio_path: str, speed: float, audio_format: str) -> Generator[bytes, None, None]:
@@ -172,7 +172,7 @@ class NeuTTSWrapper:
172
 
173
  # Infer sentence
174
  with torch.no_grad():
175
- audio_chunk = self.tts_model.infer(sentence, ref_s)
176
 
177
  # Convert and yield
178
  yield self._convert_to_streamable_format(audio_chunk.cpu().numpy(), audio_format)
@@ -295,6 +295,7 @@ def cleanup_files_blocking():
295
  @app.post("/synthesize", response_class=Response)
296
  async def text_to_speech(
297
  text: str = Form(...),
 
298
  speed: float = Form(1.0, ge=0.5, le=2.0),
299
  output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
300
  reference_audio: UploadFile = File(...)):
@@ -321,7 +322,8 @@ async def text_to_speech(
321
  audio_data = await run_blocking_task_async(
322
  app.state.tts_wrapper.generate_speech_blocking,
323
  text,
324
- converted_wav_path # IMPORTANT: Pass the CONVERTED WAV path
 
325
  )
326
 
327
  # 4. Convert to requested format (Blocking, but usually fast)
@@ -365,6 +367,7 @@ async def text_to_speech(
365
  @app.post("/synthesize/stream")
366
  async def stream_text_to_speech_cloning(
367
  text: str = Form(..., min_length=1, max_length=5000),
 
368
  speed: float = Form(1.0, ge=0.5, le=2.0),
369
  output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
370
  reference_audio: UploadFile = File(...)):
@@ -397,6 +400,7 @@ async def stream_text_to_speech_cloning(
397
  for chunk_bytes in app.state.tts_wrapper.stream_speech_blocking(
398
  text,
399
  path_to_delete, # Pass the CONVERTED WAV path
 
400
  speed,
401
  output_format
402
  ):
 
149
 
150
  # 3. Infer full text
151
  with torch.no_grad():
152
+ audio = self.tts_model.infer(text, ref_s, reference_text)
153
  return audio.cpu().numpy()
154
 
155
  def stream_speech_blocking(self, text: str, ref_audio_path: str, speed: float, audio_format: str) -> Generator[bytes, None, None]:
 
172
 
173
  # Infer sentence
174
  with torch.no_grad():
175
+ audio_chunk = self.tts_model.infer(sentence, ref_s, reference_text)
176
 
177
  # Convert and yield
178
  yield self._convert_to_streamable_format(audio_chunk.cpu().numpy(), audio_format)
 
295
  @app.post("/synthesize", response_class=Response)
296
  async def text_to_speech(
297
  text: str = Form(...),
298
+ reference_text: str = Form(...),
299
  speed: float = Form(1.0, ge=0.5, le=2.0),
300
  output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
301
  reference_audio: UploadFile = File(...)):
 
322
  audio_data = await run_blocking_task_async(
323
  app.state.tts_wrapper.generate_speech_blocking,
324
  text,
325
+ converted_wav_path, # IMPORTANT: Pass the CONVERTED WAV path
326
+ reference_text
327
  )
328
 
329
  # 4. Convert to requested format (Blocking, but usually fast)
 
367
  @app.post("/synthesize/stream")
368
  async def stream_text_to_speech_cloning(
369
  text: str = Form(..., min_length=1, max_length=5000),
370
+ reference_text: str = Form(...),
371
  speed: float = Form(1.0, ge=0.5, le=2.0),
372
  output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
373
  reference_audio: UploadFile = File(...)):
 
400
  for chunk_bytes in app.state.tts_wrapper.stream_speech_blocking(
401
  text,
402
  path_to_delete, # Pass the CONVERTED WAV path
403
+ reference_text,
404
  speed,
405
  output_format
406
  ):