Spaces:

Rajhuggingface4253
/

neu

Paused

App Files Files Community

Rajhuggingface4253 committed on Oct 20

Commit

24bb5f8

verified ·

1 Parent(s): b9c3cb2

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -108

app.py CHANGED Viewed

@@ -2,19 +2,16 @@ import os
 import io
 import asyncio
 import time
-import shutil
 import numpy as np
 import psutil
 import soundfile as sf
 import subprocess
-import tempfile
 from concurrent.futures import ThreadPoolExecutor
-from typing import Optional, Generator
 from contextlib import asynccontextmanager
 import logging
-import aiofiles
 import torch
-from fastapi import FastAPI, HTTPException, UploadFile, File, Form, Query
 from fastapi.responses import Response, StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
@@ -38,16 +35,10 @@ DEVICE = "cpu"
 MAX_WORKERS = 2
 tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
 SAMPLE_RATE = 24000
-CLEANUP_THRESHOLD = 300 # 5 minutes in seconds
-TEMP_AUDIO_DIR = "temp_audio"
-GENERATED_AUDIO_DIR = "generated_audio"
-os.makedirs(TEMP_AUDIO_DIR, exist_ok=True)
-os.makedirs(GENERATED_AUDIO_DIR, exist_ok=True)
 class TTSRequestModel(BaseModel):
     """Model for non-file inputs to synthesis and streaming."""
     text: str = Field(..., min_length=1, max_length=1000)
-    speed: float = Field(default=1.0, ge=0.5, le=2.0)
     output_format: str = Field(default="wav", pattern="^(wav|mp3|flac)$")
@@ -151,32 +142,6 @@ class NeuTTSWrapper:
             audio = self.tts_model.infer(text, ref_s, reference_text)
         return audio
-    def stream_speech_blocking(self, text: str, ref_audio_bytes: bytes, reference_text: str, speed: float, audio_format: str) -> Generator[bytes, None, None]:
-        """Sentence-by-Sentence Streaming using cached reference encoding."""
-        logger.info(f"Starting streaming synthesis for text length: {len(text)}")
-        # 1. Hash the audio bytes once
-        audio_hash = hashlib.sha256(ref_audio_bytes).hexdigest()
-        # 2. Get the reference encoding from cache, once for the whole stream
-        ref_s = self._get_or_create_reference_encoding(audio_hash, ref_audio_bytes)
-        # 3. Split text using the new regex method
-        sentences = self._split_text_into_chunks(text)
-        # 4. Stream chunks
-        for i, sentence in enumerate(sentences):
-            if not sentence.strip():
-                continue
-            logger.debug(f"Generating streaming chunk {i+1}: '{sentence[:30]}...'")
-            with torch.no_grad():
-                audio_chunk = self.tts_model.infer(sentence, ref_s, reference_text)
-            yield self._convert_to_streamable_format(audio_chunk, audio_format)
-        logger.info("Streaming synthesis complete.")
 # --- Asynchronous Offloading ---
@@ -188,18 +153,6 @@ async def run_blocking_task_async(func, *args, **kwargs):
         lambda: func(*args, **kwargs)
     )
-async def save_upload_file_async(upload_file: UploadFile) -> str:
-    """Asynchronously saves the UploadFile to disk."""
-    temp_filename = os.path.join(TEMP_AUDIO_DIR, f"{time.time()}_{upload_file.filename}")
-    try:
-        # Use asyncio to read the file chunks in a non-blocking manner
-        async with aiofiles.open(temp_filename, 'wb') as out_file:
-            while content := await upload_file.read(1024 * 1024):
-                await out_file.write(content)
-        return temp_filename
-    except Exception as e:
-        logger.error(f"Error saving file: {e}")
-        raise HTTPException(status_code=500, detail="Could not save reference audio file")
 # --- FastAPI Lifespan Manager (Kokoro Feature) ---
@@ -262,31 +215,6 @@ async def health_check():
         }
     }
-@app.delete("/cleanup")
-async def cleanup_files():
-    """Maintenance endpoint to remove old generated and temporary files."""
-    await run_blocking_task_async(cleanup_files_blocking)
-    return {"message": "Cleanup initiated successfully."}
-def cleanup_files_blocking():
-    """Blocking file cleanup logic (original NeuTTS feature)."""
-    now = time.time()
-    deleted_count = 0
-    for directory in [GENERATED_AUDIO_DIR, TEMP_AUDIO_DIR]:
-        for filename in os.listdir(directory):
-            filepath = os.path.join(directory, filename)
-            if os.path.isfile(filepath):
-                try:
-                    # Original cleanup logic: delete if older than CLEANUP_THRESHOLD
-                    if now - os.path.getctime(filepath) > CLEANUP_THRESHOLD:
-                        os.remove(filepath)
-                        deleted_count += 1
-                except Exception as e:
-                    logger.warning(f"Failed to delete {filepath}: {e}")
-    logger.info(f"Cleanup completed: {deleted_count} files removed.")
-    return deleted_count
 # --- Core Synthesis Endpoints ---
@@ -295,7 +223,6 @@ def cleanup_files_blocking():
 async def text_to_speech(
     text: str = Form(...),
     reference_text: str = Form(...),
-    speed: float = Form(1.0, ge=0.5, le=2.0),
     output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
     """
@@ -346,31 +273,30 @@ async def text_to_speech(
 async def stream_text_to_speech_cloning(
     text: str = Form(..., min_length=1, max_length=5000),
     reference_text: str = Form(...),
-    speed: float = Form(1.0, ge=0.5, le=2.0),
     output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
     """
     Sentence-by-Sentence Streaming using a high-performance, asyncio-native
-    producer-consumer pipeline.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
     async def stream_generator():
         loop = asyncio.get_event_loop()
-        q = asyncio.Queue(maxsize=2)
         async def producer():
             try:
                 converted_wav_buffer = await convert_to_wav_in_memory(reference_audio)
                 ref_audio_bytes = converted_wav_buffer.getvalue()
-                audio_hash = hashlib.sha256(ref_audio_bytes).hexdigest()
-                # ✅ Use LRU cache like blocking endpoint
                 ref_s = await loop.run_in_executor(
                     tts_executor,
                     app.state.tts_wrapper._get_or_create_reference_encoding,
-                    audio_hash,
                     ref_audio_bytes
                 )
@@ -381,32 +307,37 @@ async def stream_text_to_speech_cloning(
                         audio_chunk = app.state.tts_wrapper.tts_model.infer(sentence_text, ref_s, reference_text)
                     return app.state.tts_wrapper._convert_to_streamable_format(audio_chunk, output_format)
-                # Schedule all chunks to be processed in the background.
                 for sentence in sentences:
                     task = loop.run_in_executor(tts_executor, process_chunk, sentence)
-                    await q.put(task) # Put the FUTURE, not the result, in the queue.
             except Exception as e:
                 logger.error(f"Error in producer task: {e}")
                 await q.put(e)
             finally:
-                await q.put(None) # Signal that all tasks have been scheduled.
         producer_task = asyncio.create_task(producer())
-        # The CONSUMER's job is to wait for each result and yield it.
-        while True:
-            result = await q.get()
-            if result is None:
-                break
-            if isinstance(result, Exception):
-                logger.error(f"Terminating stream due to producer error: {result}")
-                raise result
-            # Await the result of the background task
-            chunk_bytes = await result
             yield chunk_bytes
         await producer_task
@@ -414,16 +345,3 @@ async def stream_text_to_speech_cloning(
         stream_generator(),
         media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}"
     )
-@app.get("/audio/{filename}")
-async def get_audio(filename: str):
-    """Original NeuTTS feature to serve generated audio files."""
-    file_path = os.path.join(GENERATED_AUDIO_DIR, filename)
-    if not os.path.exists(file_path):
-        raise HTTPException(status_code=404, detail="Audio file not found")
-    return Response(
-        content=open(file_path, "rb").read(),
-        media_type=f"audio/{filename.split('.')[-1]}", # Simple media type detection
-        headers={"Content-Disposition": f"attachment; filename={filename}"}
-    )

 import io
 import asyncio
 import time
 import numpy as np
 import psutil
 import soundfile as sf
 import subprocess
 from concurrent.futures import ThreadPoolExecutor
+from typing import Generator
 from contextlib import asynccontextmanager
 import logging
 import torch
+from fastapi import FastAPI, HTTPException, UploadFile, File, Form
 from fastapi.responses import Response, StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
 MAX_WORKERS = 2
 tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
 SAMPLE_RATE = 24000
 class TTSRequestModel(BaseModel):
     """Model for non-file inputs to synthesis and streaming."""
     text: str = Field(..., min_length=1, max_length=1000)
     output_format: str = Field(default="wav", pattern="^(wav|mp3|flac)$")
             audio = self.tts_model.infer(text, ref_s, reference_text)
         return audio
 # --- Asynchronous Offloading ---
         lambda: func(*args, **kwargs)
     )
 # --- FastAPI Lifespan Manager (Kokoro Feature) ---
         }
     }
 # --- Core Synthesis Endpoints ---
 async def text_to_speech(
     text: str = Form(...),
     reference_text: str = Form(...),
     output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
     """
 async def stream_text_to_speech_cloning(
     text: str = Form(..., min_length=1, max_length=5000),
     reference_text: str = Form(...),
     output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
     """
     Sentence-by-Sentence Streaming using a high-performance, asyncio-native
+    look-ahead pipeline. This ensures true overlap of CPU work and network I/O.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
     async def stream_generator():
         loop = asyncio.get_event_loop()
+        q = asyncio.Queue(maxsize=MAX_WORKERS + 1) # Queue size based on workers
         async def producer():
             try:
                 converted_wav_buffer = await convert_to_wav_in_memory(reference_audio)
                 ref_audio_bytes = converted_wav_buffer.getvalue()
+                # Perform the one-time voice encoding
+                audio_hash = hashlib.sha256(ref_audio_bytes).hexdigest()
                 ref_s = await loop.run_in_executor(
                     tts_executor,
                     app.state.tts_wrapper._get_or_create_reference_encoding,
+                    audio_hash,
                     ref_audio_bytes
                 )
                         audio_chunk = app.state.tts_wrapper.tts_model.infer(sentence_text, ref_s, reference_text)
                     return app.state.tts_wrapper._convert_to_streamable_format(audio_chunk, output_format)
+                # Schedule all chunks for background processing
                 for sentence in sentences:
                     task = loop.run_in_executor(tts_executor, process_chunk, sentence)
+                    await q.put(task)
             except Exception as e:
                 logger.error(f"Error in producer task: {e}")
                 await q.put(e)
             finally:
+                await q.put(None)
         producer_task = asyncio.create_task(producer())
+        # --- High-Performance Consumer with Look-Ahead ---
+        # Get the first task from the queue to start the process.
+        current_task = await q.get()
+        while current_task is not None:
+            # Simultaneously, get the NEXT task from the queue.
+            # This allows the next chunk to start processing while we wait for the current one.
+            next_task = await q.get()
+            # Now, wait for the CURRENT task to finish.
+            if isinstance(current_task, Exception):
+                raise current_task
+            chunk_bytes = await current_task
             yield chunk_bytes
+            # The next task becomes the current task for the next iteration.
+            current_task = next_task
         await producer_task
         stream_generator(),
         media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}"
     )