Spaces:

Rajhuggingface4253
/

neu

Paused

App Files Files Community

Rajhuggingface4253 committed on Oct 19

Commit

5d68bda

verified ·

1 Parent(s): 01b3f2d

Update app.py

Browse files

Files changed (1) hide show

app.py +136 -152

app.py CHANGED Viewed

@@ -18,7 +18,10 @@ from fastapi import FastAPI, HTTPException, UploadFile, File, Form, Query
 from fastapi.responses import Response, StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
 # Ensure the cloned neutts-air repository is in the path
 import sys
 sys.path.append(os.path.join(os.getcwd(), 'neutts-air'))
@@ -33,7 +36,7 @@ logger = logging.getLogger("NeuTTS-API")
 # Explicitly use CPU as per Dockerfile and Hugging Face free tier compatibility
 DEVICE = "cpu"
 # Configure Max Workers for concurrent synthesis threads (1-2 is safe for CPU-only)
-MAX_WORKERS = 2
 tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
 SAMPLE_RATE = 24000
 CLEANUP_THRESHOLD = 300 # 5 minutes in seconds (NOTE: comment previously said "1 hour"; confirm the intended value)
@@ -49,62 +52,43 @@ class TTSRequestModel(BaseModel):
     output_format: str = Field(default="wav", pattern="^(wav|mp3|flac)$")
-def convert_to_wav_blocking(input_path: str) -> str:
     """
-    NEW FUNCTION: Uses FFmpeg to convert any uploaded audio format (WebM, MP4, etc.)
-    to a 24kHz, 16-bit PCM WAV file, which is required by soundfile/libsndfile.
-    This function must run in the ThreadPoolExecutor.
     """
-    # Create a unique temporary filename for the converted WAV file
-    # We use tempfile.NamedTemporaryFile to safely create a path
-    # and then delete the file handle so ffmpeg can write to it.
-    with tempfile.NamedTemporaryFile(suffix=".wav", dir=TEMP_AUDIO_DIR, delete=False) as tmp:
-        output_path = tmp.name
-    logger.info(f"Converting '{os.path.basename(input_path)}' to WAV (24kHz, mono) at {os.path.basename(output_path)}")
-    # FFmpeg command details:
-    # -y: overwrite output file if it exists
-    # -i: input file path
-    # -f wav: output format is WAV
-    # -ar 24000: set sample rate to 24000 (required by NeuTTS)
-    # -ac 1: set audio channels to 1 (mono)
-    # -c:a pcm_s16le: set codec to uncompressed 16-bit PCM (standard WAV)
-    command = [
-        "ffmpeg",
-        "-y",
-        "-i", input_path,
         "-f", "wav",
-        "-ar", str(SAMPLE_RATE),
-        "-ac", "1",
-        "-c:a", "pcm_s16le",
-        output_path
     ]
-    try:
-        # Run the FFmpeg command
-        # Use a short timeout to prevent runaway processes
-        result = subprocess.run(command, check=True, capture_output=True, text=True, timeout=30)
-        logger.info(f"FFmpeg conversion successful.")
-        return output_path
-    except subprocess.CalledProcessError as e:
-        logger.error(f"FFmpeg conversion failed: {e.stderr}")
-        # Clean up the output path if FFmpeg failed to write it
-        if os.path.exists(output_path):
-            os.unlink(output_path)
         # Provide the last line of the FFmpeg error to the user
-        error_detail = e.stderr.splitlines()[-1] if e.stderr else "Unknown FFmpeg error."
         raise HTTPException(status_code=400, detail=f"Audio format conversion failed: {error_detail}")
-    except subprocess.TimeoutExpired:
-        logger.error("FFmpeg conversion timed out.")
-        if os.path.exists(output_path):
-            os.unlink(output_path)
-        raise HTTPException(status_code=504, detail="Audio conversion timed out after 30 seconds.")
-    except Exception as e:
-        logger.error(f"General conversion error: {e}")
-        if os.path.exists(output_path):
-            os.unlink(output_path)
-        raise HTTPException(status_code=500, detail="An unexpected error occurred during audio conversion.")
 # --- Model Wrapper and Logic ---
 class NeuTTSWrapper:
@@ -135,32 +119,50 @@ class NeuTTSWrapper:
         return audio_buffer.read()
     def _split_text_into_chunks(self, text: str) -> list[str]:
-        """Simple sentence splitting for streaming (can be enhanced with regex)."""
-        sentences = [s.strip() for s in text.split('.') if s.strip()]
-        if not sentences:
-            sentences = [text.strip()]
-        return sentences
-    def generate_speech_blocking(self, text: str, ref_audio_path: str, reference_text: str) -> np.ndarray:
-        """Blocking synthesis for standard endpoint."""
-        ref_s = self.tts_model.encode_reference(ref_audio_path)
         # 3. Infer full text
         with torch.no_grad():
             audio = self.tts_model.infer(text, ref_s, reference_text)
         return audio
-    def stream_speech_blocking(self, text: str, ref_audio_path: str, reference_text: str, speed: float, audio_format: str) -> Generator[bytes, None, None]:
-        """Sentence-by-Sentence Streaming (Blocking)."""
         logger.info(f"Starting streaming synthesis for text length: {len(text)}")
-        ref_s = self.tts_model.encode_reference(ref_audio_path)
-        # 3. Split text
         sentences = self._split_text_into_chunks(text)
         # 4. Stream chunks
@@ -170,11 +172,9 @@ class NeuTTSWrapper:
             logger.debug(f"Generating streaming chunk {i+1}: '{sentence[:30]}...'")
-            # Infer sentence
             with torch.no_grad():
                 audio_chunk = self.tts_model.infer(sentence, ref_s, reference_text)
-            # Convert and yield
             yield self._convert_to_streamable_format(audio_chunk, audio_format)
         logger.info("Streaming synthesis complete.")
@@ -300,69 +300,48 @@ async def text_to_speech(
     output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
     """
-    Standard blocking TTS endpoint with Multi-Format Output (Kokoro Feature).
-    Includes FFmpeg conversion for uploaded audio format compatibility.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
-    # 1. Asynchronously save reference audio (original upload)
-    temp_ref_path = await save_upload_file_async(reference_audio)
-    converted_wav_path = None # NEW: Initialize for cleanup
     start_time = time.time()
     try:
-        # 2. **NEW STEP**: Convert the uploaded file (WebM, etc.) to a 24kHz WAV file using FFmpeg
-        converted_wav_path = await run_blocking_task_async(
-            convert_to_wav_blocking,
-            temp_ref_path
-        )
-        # 3. Offload the ENTIRE blocking process (encode + infer) to a thread
         audio_data = await run_blocking_task_async(
             app.state.tts_wrapper.generate_speech_blocking,
             text,
-            converted_wav_path, # IMPORTANT: Pass the CONVERTED WAV path
             reference_text
         )
-        # 4. Convert to requested format (Blocking, but usually fast)
         audio_bytes = await run_blocking_task_async(
             app.state.tts_wrapper._convert_to_streamable_format,
             audio_data,
             output_format
         )
-        # 5. Save to disk (Original NeuTTS requirement)
-        audio_filename = f"tts_{time.time()}.{output_format}"
-        final_path = os.path.join(GENERATED_AUDIO_DIR, audio_filename)
-        await run_blocking_task_async(
-            lambda: open(final_path, 'wb').write(audio_bytes)
-        )
         processing_time = time.time() - start_time
         audio_duration = len(audio_data) / SAMPLE_RATE
         return Response(
             content=audio_bytes,
             media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
             headers={
-                "Content-Disposition": f"attachment; filename={audio_filename}",
                 "X-Processing-Time": f"{processing_time:.2f}s",
                 "X-Audio-Duration": f"{audio_duration:.2f}s"
             }
         )
     except Exception as e:
         logger.error(f"Synthesis error: {e}")
-        # Reraise HTTPExceptions that may have come from the conversion step
         if isinstance(e, HTTPException):
              raise
         raise HTTPException(status_code=500, detail=f"Synthesis failed: {e}")
-    finally:
-        # 6. Clean up BOTH the original file AND the converted WAV file
-        if os.path.exists(temp_ref_path):
-            os.unlink(temp_ref_path)
-        if converted_wav_path and os.path.exists(converted_wav_path):
-            os.unlink(converted_wav_path)
 @app.post("/synthesize/stream")
 async def stream_text_to_speech_cloning(
@@ -372,74 +351,79 @@ async def stream_text_to_speech_cloning(
     output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
     """
-    Sentence-by-Sentence Streaming Endpoint.
-    Fixes race condition by moving cleanup into the streaming generator.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
-    # 1. Asynchronously save reference audio (non-blocking)
-    temp_ref_path = await save_upload_file_async(reference_audio)
-    converted_wav_path = None # Initialize for cleanup
     try:
-        # 2. Convert the uploaded file (WebM, etc.) to a 24kHz WAV file
-        converted_wav_path = await run_blocking_task_async(
-            convert_to_wav_blocking,
-            temp_ref_path
-        )
-        # 2.5. CLEANUP ORIGINAL FILE IMMEDIATELY: It is no longer needed after conversion
-        if os.path.exists(temp_ref_path):
-            os.unlink(temp_ref_path)
-        # 3. Define the generator function, which will run in the thread pool
-        def stream_generator(path_to_delete: str):
-            try:
-                # This logic uses the path_to_delete parameter, which is guaranteed to exist
-                for chunk_bytes in app.state.tts_wrapper.stream_speech_blocking(
-                    text,
-                    path_to_delete, # Pass the CONVERTED WAV path
-                    reference_text,
-                    speed,
-                    output_format
-                ):
-                    yield chunk_bytes
-            except Exception as e:
-                # Log the error and raise it to stop the stream
-                logger.error(f"Streaming generator error: {e}")
-                raise # Re-raise to ensure the stream terminates
-            finally:
-                # 4. **CRUCIAL FIX:** Clean up the converted file ONLY AFTER GENERATION IS DONE
-                if os.path.exists(path_to_delete):
-                    os.unlink(path_to_delete)
-                    logger.info(f"Cleaned up converted file: {path_to_delete}")
-        # Return StreamingResponse, passing the path to the generator
         return StreamingResponse(
-            stream_generator(converted_wav_path),
-            media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
-            headers={
-                "Content-Disposition": "attachment; filename=tts_live_stream.mp3",
-                "Transfer-Encoding": "chunked",
-                "Cache-Control": "no-cache",
-                "X-Accel-Buffering": "no"
-            }
         )
     except Exception as e:
         logger.error(f"Streaming setup error: {e}")
-        # Clean up files only if the setup failed *before* starting the generator
-        if os.path.exists(temp_ref_path):
-            os.unlink(temp_ref_path)
-        if converted_wav_path and os.path.exists(converted_wav_path):
-            os.unlink(converted_wav_path)
-        # Reraise HTTPExceptions that may have come from the conversion step
         if isinstance(e, HTTPException):
              raise
         raise HTTPException(status_code=500, detail=f"Streaming synthesis failed: {e}")
-    # Note: The outer 'finally' block is now removed as its logic is handled in 2.5 and 4.
 @app.get("/audio/{filename}")
 async def get_audio(filename: str):

 from fastapi.responses import Response, StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
+import re
+import hashlib
+from functools import lru_cache
+import queue
 # Ensure the cloned neutts-air repository is in the path
 import sys
 sys.path.append(os.path.join(os.getcwd(), 'neutts-air'))
 # Explicitly use CPU as per Dockerfile and Hugging Face free tier compatibility
 DEVICE = "cpu"
 # Configure Max Workers for concurrent synthesis threads (1-2 is safe for CPU-only)
+MAX_WORKERS = 1
 tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
 SAMPLE_RATE = 24000
 CLEANUP_THRESHOLD = 300 # 5 minutes in seconds (NOTE: comment previously said "1 hour"; confirm the intended value)
     output_format: str = Field(default="wav", pattern="^(wav|mp3|flac)$")
+async def convert_to_wav_in_memory(upload_file: UploadFile) -> io.BytesIO:
     """
+    Converts uploaded audio to a 24kHz WAV in memory using FFmpeg pipes.
+    This avoids all intermediate disk I/O for maximum speed.
     """
+    ffmpeg_command = [
+        "ffmpeg",
+        "-i", "pipe:0",      # Read from stdin
         "-f", "wav",
+        "-ar", str(SAMPLE_RATE),
+        "-ac", "1",
+        "-c:a", "pcm_s16le",
+        "pipe:1"             # Write to stdout
     ]
+    # Start the subprocess with pipes for stdin, stdout, and stderr
+    proc = await asyncio.create_subprocess_exec(
+        *ffmpeg_command,
+        stdin=subprocess.PIPE,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE
+    )
+    # Read the entire upload into memory, hand it to ffmpeg's stdin in one call,
+    # and capture the resulting WAV data from its stdout
+    wav_data, stderr_data = await proc.communicate(input=await upload_file.read())
+    if proc.returncode != 0:
+        error_message = stderr_data.decode()
+        logger.error(f"In-memory conversion failed: {error_message}")
         # Provide the last line of the FFmpeg error to the user
+        error_detail = error_message.splitlines()[-1] if error_message else "Unknown FFmpeg error."
         raise HTTPException(status_code=400, detail=f"Audio format conversion failed: {error_detail}")
+    logger.info("In-memory FFmpeg conversion successful.")
+    # Return the raw WAV data in a BytesIO buffer, ready for the model
+    return io.BytesIO(wav_data)
 # --- Model Wrapper and Logic ---
 class NeuTTSWrapper:
         return audio_buffer.read()
     def _split_text_into_chunks(self, text: str) -> list[str]:
+        """
+        Splits text into sentences OR clauses using a robust regex.
+        This is fast, library-free, and now handles commas.
+        """
+        # This regex now finds all sequences of characters that are not a sentence-ending
+        # or clause-ending punctuation mark, followed by that punctuation.
+        # The only change is adding ',' to the character sets.
+        chunks = re.findall(r'[^.,!?]+[.,!?]*', text)
+        return [c.strip() for c in chunks if c.strip()]
+    @lru_cache(maxsize=32)
+    def _get_or_create_reference_encoding(self, audio_content_hash: str, audio_bytes: bytes) -> torch.Tensor:
+        """
+        Caches the expensive reference encoding operation using an in-memory LRU cache.
+        Note: lru_cache builds its key from every argument (self, the hash, and the raw audio bytes), not from the hash alone.
+        """
+        logger.info(f"Cache miss for hash: {audio_content_hash[:10]}... Encoding new reference.")
+        # The model's encode_reference can take a file-like object (BytesIO)
+        return self.tts_model.encode_reference(io.BytesIO(audio_bytes))
+    def generate_speech_blocking(self, text: str, ref_audio_bytes: bytes, reference_text: str) -> np.ndarray:
+        """Blocking synthesis using cached reference encoding."""
+        # 1. Hash the audio bytes to get a cache key
+        audio_hash = hashlib.sha256(ref_audio_bytes).hexdigest()
+        # 2. Get the encoding from the cache (or create it if new)
+        ref_s = self._get_or_create_reference_encoding(audio_hash, ref_audio_bytes)
         # 3. Infer full text
         with torch.no_grad():
             audio = self.tts_model.infer(text, ref_s, reference_text)
         return audio
+    def stream_speech_blocking(self, text: str, ref_audio_bytes: bytes, reference_text: str, speed: float, audio_format: str) -> Generator[bytes, None, None]:
+        """Sentence-by-Sentence Streaming using cached reference encoding."""
         logger.info(f"Starting streaming synthesis for text length: {len(text)}")
+        # 1. Hash the audio bytes once
+        audio_hash = hashlib.sha256(ref_audio_bytes).hexdigest()
+        # 2. Get the reference encoding from cache, once for the whole stream
+        ref_s = self._get_or_create_reference_encoding(audio_hash, ref_audio_bytes)
+        # 3. Split text using the new regex method
         sentences = self._split_text_into_chunks(text)
         # 4. Stream chunks
             logger.debug(f"Generating streaming chunk {i+1}: '{sentence[:30]}...'")
             with torch.no_grad():
                 audio_chunk = self.tts_model.infer(sentence, ref_s, reference_text)
             yield self._convert_to_streamable_format(audio_chunk, audio_format)
         logger.info("Streaming synthesis complete.")
     output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
     """
+    Standard blocking TTS endpoint with in-memory processing and caching.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
     start_time = time.time()
     try:
+        # 1. Convert the uploaded file to WAV directly in memory
+        converted_wav_buffer = await convert_to_wav_in_memory(reference_audio)
+        ref_audio_bytes = converted_wav_buffer.getvalue()
+        # 2. Offload the blocking AI process (now faster with caching)
         audio_data = await run_blocking_task_async(
             app.state.tts_wrapper.generate_speech_blocking,
             text,
+            ref_audio_bytes, # Pass bytes, not a path
             reference_text
         )
+        # 3. Convert to requested output format
         audio_bytes = await run_blocking_task_async(
             app.state.tts_wrapper._convert_to_streamable_format,
             audio_data,
             output_format
         )
         processing_time = time.time() - start_time
         audio_duration = len(audio_data) / SAMPLE_RATE
         return Response(
             content=audio_bytes,
             media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
             headers={
+                "Content-Disposition": f"attachment; filename=tts_output.{output_format}",
                 "X-Processing-Time": f"{processing_time:.2f}s",
                 "X-Audio-Duration": f"{audio_duration:.2f}s"
             }
         )
     except Exception as e:
         logger.error(f"Synthesis error: {e}")
         if isinstance(e, HTTPException):
              raise
         raise HTTPException(status_code=500, detail=f"Synthesis failed: {e}")
 @app.post("/synthesize/stream")
 async def stream_text_to_speech_cloning(
     output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
     """
+    Sentence-by-Sentence Streaming using a parallel producer-consumer pipeline
+    to ensure continuous, low-latency audio flow.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
     try:
+        # Initial audio conversion is still done once, in memory.
+        converted_wav_buffer = await convert_to_wav_in_memory(reference_audio)
+        ref_audio_bytes = converted_wav_buffer.getvalue()
+        def stream_generator():
+            # 1. Create a queue to communicate between the producer and consumer.
+            # A small maxsize acts as a "look-ahead" buffer.
+            q = queue.Queue(maxsize=2)
+            # 2. Define the PRODUCER (The "Grill Chef")
+            # This function runs in a background thread to generate audio continuously.
+            def producer():
+                try:
+                    # Get reference encoding once for the whole stream
+                    audio_hash = hashlib.sha256(ref_audio_bytes).hexdigest()
+                    ref_s = app.state.tts_wrapper._get_or_create_reference_encoding(audio_hash, ref_audio_bytes)
+                    sentences = app.state.tts_wrapper._split_text_into_chunks(text)
+                    for sentence in sentences:
+                        # Generate the raw audio (CPU-heavy part)
+                        with torch.no_grad():
+                            audio_chunk = app.state.tts_wrapper.tts_model.infer(sentence, ref_s, reference_text)
+                        # Put the finished audio (a numpy array) into the queue
+                        q.put(audio_chunk)
+                except Exception as e:
+                    logger.error(f"Error in producer thread: {e}")
+                    # If an error occurs, put the exception in the queue to notify the consumer
+                    q.put(e)
+                finally:
+                    # 3. Signal that production is finished by putting None in the queue
+                    q.put(None)
+            # 4. Start the producer in the background ThreadPoolExecutor
+            loop = asyncio.get_event_loop()
+            loop.run_in_executor(tts_executor, producer)
+            # 5. This generator itself acts as the CONSUMER (The "Finisher")
+            while True:
+                # Get the next audio chunk from the queue (this will wait if the queue is empty)
+                result = q.get()
+                # Check for the "end of stream" signal
+                if result is None:
+                    break
+                # Check if the producer sent an error
+                if isinstance(result, Exception):
+                    logger.error(f"Terminating stream due to producer error: {result}")
+                    raise result
+                # Convert the raw audio to the desired format and yield it to the user
+                yield app.state.tts_wrapper._convert_to_streamable_format(result, output_format)
+        # Return the StreamingResponse with our new high-performance generator
         return StreamingResponse(
+            stream_generator(),
+            media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}"
         )
     except Exception as e:
         logger.error(f"Streaming setup error: {e}")
         if isinstance(e, HTTPException):
              raise
         raise HTTPException(status_code=500, detail=f"Streaming synthesis failed: {e}")
 @app.get("/audio/{filename}")
 async def get_audio(filename: str):