Spaces:

Rajhuggingface4253
/

neu

Paused

App Files Files Community

Rajhuggingface4253 committed on Oct 19

Commit

01b3f2d

verified ·

1 Parent(s): c0df123

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -79

app.py CHANGED Viewed

@@ -36,7 +36,7 @@ DEVICE = "cpu"
 MAX_WORKERS = 2
 tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
 SAMPLE_RATE = 24000
-CLEANUP_THRESHOLD = 3600 # 1 hour in seconds
 TEMP_AUDIO_DIR = "temp_audio"
 GENERATED_AUDIO_DIR = "generated_audio"
 os.makedirs(TEMP_AUDIO_DIR, exist_ok=True)
@@ -134,7 +134,12 @@ class NeuTTSWrapper:
         audio_buffer.seek(0)
         return audio_buffer.read()
     def generate_speech_blocking(self, text: str, ref_audio_path: str, reference_text: str) -> np.ndarray:
         """Blocking synthesis for standard endpoint."""
@@ -147,60 +152,32 @@ class NeuTTSWrapper:
             audio = self.tts_model.infer(text, ref_s, reference_text)
         return audio
-    def _split_into_streaming_chunks(self, text: str) -> list[str]:
-        """
-        Splits text into smaller, more manageable chunks for streaming.
-        """
-        sentences = []
-        current_sentence = ""
-        for char in text:
-            current_sentence += char
-            if char in '.!?;:':
-                sentences.append(current_sentence.strip())
-                current_sentence = ""
-        if current_sentence.strip():
-            sentences.append(current_sentence.strip())
-        if not sentences:
-            if ',' in text:
-                sentences = [chunk.strip() for chunk in text.split(',') if chunk.strip()]
-            else:
-                chunk_size = 100
-                sentences = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
-        return [s for s in sentences if s]
-    # --- NEW: Parallel Worker (Now a method of the class) ---
-    def _synthesize_chunk_blocking(self, sentence: str, ref_s: torch.Tensor, ref_text: str) -> np.ndarray:
-        """Worker function to synthesize a single chunk of text. Runs in a thread pool."""
-        with torch.no_grad():
-            # It now correctly calls the model stored in self.tts_model
-            audio_chunk = self.tts_model.infer(sentence, ref_s, ref_text)
-        return audio_chunk
-    # --- NEW: Parallel Streaming Generator (Now a method of the class) ---
-    async def stream_speech_parallel(self, text: str, ref_audio_path: str, ref_text: str, executor: ThreadPoolExecutor):
-        """
-        Performs streaming synthesis using a parallel producer-consumer pattern.
-        """
-        loop = asyncio.get_event_loop()
-        # It now correctly calls the model's encode_reference method
-        ref_s = await loop.run_in_executor(
-            executor, self.tts_model.encode_reference, ref_audio_path
-        )
-        # It now correctly calls its own text splitting method
-        sentences = self._split_into_streaming_chunks(text)
-        tasks = [
-            loop.run_in_executor(
-                # It now correctly calls its own worker method
-                executor, self._synthesize_chunk_blocking, sentence, ref_s, ref_text
-            )
-            for sentence in sentences
-        ]
-        for task in tasks:
-            audio_chunk = await task
-            yield audio_chunk
 # --- Asynchronous Offloading ---
@@ -391,49 +368,54 @@ async def text_to_speech(
 async def stream_text_to_speech_cloning(
     text: str = Form(..., min_length=1, max_length=5000),
     reference_text: str = Form(...),
-    speed: float = Form(1.0, ge=0.5, le=2.0), # Kept for API compatibility, not used in this logic
     output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
-    reference_audio: UploadFile = File(...)
-):
     """
-    High-performance parallel streaming endpoint using the local wrapper.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
     temp_ref_path = await save_upload_file_async(reference_audio)
-    converted_wav_path = None
     try:
         converted_wav_path = await run_blocking_task_async(
-            convert_to_wav_blocking, temp_ref_path
         )
         if os.path.exists(temp_ref_path):
             os.unlink(temp_ref_path)
-        async def stream_generator(path_to_delete: str):
             try:
-                # This now calls our new wrapper's parallel streaming method
-                async for audio_chunk in app.state.tts_wrapper.stream_speech_parallel(
-                    text=text,
-                    ref_audio_path=path_to_delete,
-                    ref_text=reference_text,
-                    executor=tts_executor
                 ):
-                    audio_buffer = io.BytesIO()
-                    sf.write(audio_buffer, audio_chunk, SAMPLE_RATE, format=output_format)
-                    audio_buffer.seek(0)
-                    yield audio_buffer.read()
             except Exception as e:
                 logger.error(f"Streaming generator error: {e}")
-                raise
             finally:
                 if os.path.exists(path_to_delete):
                     os.unlink(path_to_delete)
                     logger.info(f"Cleaned up converted file: {path_to_delete}")
         return StreamingResponse(
             stream_generator(converted_wav_path),
             media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
@@ -444,17 +426,20 @@ async def stream_text_to_speech_cloning(
                 "X-Accel-Buffering": "no"
             }
         )
     except Exception as e:
         logger.error(f"Streaming setup error: {e}")
         if os.path.exists(temp_ref_path):
             os.unlink(temp_ref_path)
         if converted_wav_path and os.path.exists(converted_wav_path):
             os.unlink(converted_wav_path)
         if isinstance(e, HTTPException):
              raise
         raise HTTPException(status_code=500, detail=f"Streaming synthesis failed: {e}")
 @app.get("/audio/{filename}")
 async def get_audio(filename: str):

 MAX_WORKERS = 2
 tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
 SAMPLE_RATE = 24000
+CLEANUP_THRESHOLD = 300 # 5 minutes in seconds
 TEMP_AUDIO_DIR = "temp_audio"
 GENERATED_AUDIO_DIR = "generated_audio"
 os.makedirs(TEMP_AUDIO_DIR, exist_ok=True)
         audio_buffer.seek(0)
         return audio_buffer.read()
+    def _split_text_into_chunks(self, text: str) -> list[str]:
+        """Simple sentence splitting for streaming (can be enhanced with regex)."""
+        sentences = [s.strip() for s in text.split('.') if s.strip()]
+        if not sentences:
+            sentences = [text.strip()]
+        return sentences
     def generate_speech_blocking(self, text: str, ref_audio_path: str, reference_text: str) -> np.ndarray:
         """Blocking synthesis for standard endpoint."""
             audio = self.tts_model.infer(text, ref_s, reference_text)
         return audio
+    def stream_speech_blocking(self, text: str, ref_audio_path: str, reference_text: str, speed: float, audio_format: str) -> Generator[bytes, None, None]:
+        """Sentence-by-Sentence Streaming (Blocking)."""
+        logger.info(f"Starting streaming synthesis for text length: {len(text)}")
+        ref_s = self.tts_model.encode_reference(ref_audio_path)
+        # 3. Split text
+        sentences = self._split_text_into_chunks(text)
+        # 4. Stream chunks
+        for i, sentence in enumerate(sentences):
+            if not sentence.strip():
+                continue
+            logger.debug(f"Generating streaming chunk {i+1}: '{sentence[:30]}...'")
+            # Infer sentence
+            with torch.no_grad():
+                audio_chunk = self.tts_model.infer(sentence, ref_s, reference_text)
+            # Convert and yield
+            yield self._convert_to_streamable_format(audio_chunk, audio_format)
+        logger.info("Streaming synthesis complete.")
 # --- Asynchronous Offloading ---
 async def stream_text_to_speech_cloning(
     text: str = Form(..., min_length=1, max_length=5000),
     reference_text: str = Form(...),
+    speed: float = Form(1.0, ge=0.5, le=2.0), # Kept for API compatibility; forwarded to stream_speech_blocking, which does not apply it
     output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
+    reference_audio: UploadFile = File(...)):
     """
+    Sentence-by-Sentence Streaming Endpoint.
+    Fixes race condition by moving cleanup into the streaming generator.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
+    # 1. Asynchronously save reference audio (non-blocking)
     temp_ref_path = await save_upload_file_async(reference_audio)
+    converted_wav_path = None # Initialize for cleanup
     try:
+        # 2. Convert the uploaded file (WebM, etc.) to a 24kHz WAV file
         converted_wav_path = await run_blocking_task_async(
+            convert_to_wav_blocking,
+            temp_ref_path
         )
+        # 2.5. CLEANUP ORIGINAL FILE IMMEDIATELY: It is no longer needed after conversion
         if os.path.exists(temp_ref_path):
             os.unlink(temp_ref_path)
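+        # 3. Define a synchronous generator; it is handed to StreamingResponse, not submitted to tts_executor
+        #    (NOTE(review): the framework is expected to iterate sync generators in its own worker threads - confirm)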
+        def stream_generator(path_to_delete: str):
             try:
+                # This logic uses the path_to_delete parameter, which is guaranteed to exist
+                for chunk_bytes in app.state.tts_wrapper.stream_speech_blocking(
+                    text,
+                    path_to_delete, # Pass the CONVERTED WAV path
+                    reference_text,
+                    speed,
+                    output_format
                 ):
+                    yield chunk_bytes
             except Exception as e:
+                # Log the error and raise it to stop the stream
                 logger.error(f"Streaming generator error: {e}")
+                raise # Re-raise to ensure the stream terminates
             finally:
+                # 4. **CRUCIAL FIX:** Clean up the converted file ONLY AFTER GENERATION IS DONE
                 if os.path.exists(path_to_delete):
                     os.unlink(path_to_delete)
                     logger.info(f"Cleaned up converted file: {path_to_delete}")
+        # Return StreamingResponse, passing the path to the generator
         return StreamingResponse(
             stream_generator(converted_wav_path),
             media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
                 "X-Accel-Buffering": "no"
             }
         )
     except Exception as e:
         logger.error(f"Streaming setup error: {e}")
+        # Clean up files only if the setup failed *before* starting the generator
         if os.path.exists(temp_ref_path):
             os.unlink(temp_ref_path)
         if converted_wav_path and os.path.exists(converted_wav_path):
             os.unlink(converted_wav_path)
+        # Reraise HTTPExceptions that may have come from the conversion step
         if isinstance(e, HTTPException):
              raise
         raise HTTPException(status_code=500, detail=f"Streaming synthesis failed: {e}")
+    # Note: The outer 'finally' block is now removed as its logic is handled in 2.5 and 4.
 @app.get("/audio/{filename}")
 async def get_audio(filename: str):