Spaces:

Rajhuggingface4253
/

neu

Paused

App Files Files Community

Rajhuggingface4253 committed on Oct 18

Commit

4932f88

verified ·

1 Parent(s): 7d77f94

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -3

app.py CHANGED Viewed

@@ -149,7 +149,7 @@ class NeuTTSWrapper:
         # 3. Infer full text
         with torch.no_grad():
-            audio = self.tts_model.infer(text, ref_s)
         return audio.cpu().numpy()
     def stream_speech_blocking(self, text: str, ref_audio_path: str, speed: float, audio_format: str) -> Generator[bytes, None, None]:
@@ -172,7 +172,7 @@ class NeuTTSWrapper:
             # Infer sentence
             with torch.no_grad():
-                audio_chunk = self.tts_model.infer(sentence, ref_s)
             # Convert and yield
             yield self._convert_to_streamable_format(audio_chunk.cpu().numpy(), audio_format)
@@ -295,6 +295,7 @@ def cleanup_files_blocking():
 @app.post("/synthesize", response_class=Response)
 async def text_to_speech(
     text: str = Form(...),
     speed: float = Form(1.0, ge=0.5, le=2.0),
     output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
@@ -321,7 +322,8 @@ async def text_to_speech(
         audio_data = await run_blocking_task_async(
             app.state.tts_wrapper.generate_speech_blocking,
             text,
-            converted_wav_path # IMPORTANT: Pass the CONVERTED WAV path
         )
         # 4. Convert to requested format (Blocking, but usually fast)
@@ -365,6 +367,7 @@ async def text_to_speech(
 @app.post("/synthesize/stream")
 async def stream_text_to_speech_cloning(
     text: str = Form(..., min_length=1, max_length=5000),
     speed: float = Form(1.0, ge=0.5, le=2.0),
     output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
@@ -397,6 +400,7 @@ async def stream_text_to_speech_cloning(
                 for chunk_bytes in app.state.tts_wrapper.stream_speech_blocking(
                     text,
                     path_to_delete, # Pass the CONVERTED WAV path
                     speed,
                     output_format
                 ):

         # 3. Infer full text
         with torch.no_grad():
+            audio = self.tts_model.infer(text, ref_s, reference_text)
         return audio.cpu().numpy()
     def stream_speech_blocking(self, text: str, ref_audio_path: str, speed: float, audio_format: str) -> Generator[bytes, None, None]:
             # Infer sentence
             with torch.no_grad():
+                audio_chunk = self.tts_model.infer(sentence, ref_s, reference_text)
             # Convert and yield
             yield self._convert_to_streamable_format(audio_chunk.cpu().numpy(), audio_format)
 @app.post("/synthesize", response_class=Response)
 async def text_to_speech(
     text: str = Form(...),
+    reference_text: str = Form(...),
     speed: float = Form(1.0, ge=0.5, le=2.0),
     output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
         audio_data = await run_blocking_task_async(
             app.state.tts_wrapper.generate_speech_blocking,
             text,
+            converted_wav_path, # IMPORTANT: Pass the CONVERTED WAV path
+            reference_text
         )
         # 4. Convert to requested format (Blocking, but usually fast)
 @app.post("/synthesize/stream")
 async def stream_text_to_speech_cloning(
     text: str = Form(..., min_length=1, max_length=5000),
+    reference_text: str = Form(...),
     speed: float = Form(1.0, ge=0.5, le=2.0),
     output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
                 for chunk_bytes in app.state.tts_wrapper.stream_speech_blocking(
                     text,
                     path_to_delete, # Pass the CONVERTED WAV path
+                    reference_text,
                     speed,
                     output_format
                 ):