Update app.py
app.py
CHANGED
@@ -14,10 +14,19 @@ import torch
 from fastapi import FastAPI, HTTPException, UploadFile, File, Form
 from fastapi.responses import Response, StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel, Field
 import re
 import hashlib
 from functools import lru_cache
+
+# ONNX Runtime import
+try:
+    import onnxruntime as ort
+    ONNX_AVAILABLE = True
+    print("✅ ONNX Runtime available")  # logger is not defined yet at import time
+except ImportError:
+    ONNX_AVAILABLE = False
+    print("⚠️ ONNX Runtime not available, falling back to PyTorch")
+
 # Ensure the cloned neutts-air repository is in the path
 import sys
 sys.path.append(os.path.join(os.getcwd(), 'neutts-air'))
@@ -31,16 +40,16 @@ logger = logging.getLogger("NeuTTS-API")
 
 # Explicitly use CPU as per Dockerfile and Hugging Face free tier compatibility
 DEVICE = "cpu"
-# Configure Max Workers for concurrent synthesis threads (1-2 is safe for CPU-only)
-MAX_WORKERS = 2
-tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
-SAMPLE_RATE = 24000
 
-
-
-
-
+# ONNX Configuration
+USE_ONNX = True and ONNX_AVAILABLE  # Auto-disable if ONNX not available
+ONNX_MODEL_DIR = "onnx_models"
+os.makedirs(ONNX_MODEL_DIR, exist_ok=True)
 
+# Configure Max Workers for concurrent synthesis threads
+MAX_WORKERS = min(4, (os.cpu_count() or 2))
+tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
+SAMPLE_RATE = 24000
 
 async def convert_to_wav_in_memory(upload_file: UploadFile) -> io.BytesIO:
     """
@@ -79,24 +88,104 @@ async def convert_to_wav_in_memory(upload_file: UploadFile) -> io.BytesIO:
         logger.info("In-memory FFmpeg conversion successful.")
         # Return the raw WAV data in a BytesIO buffer, ready for the model
         return io.BytesIO(wav_data)
-
+
+# --- ONNX Optimized Model Wrapper ---
+
+class NeuTTSONNXWrapper:
+    """ONNX optimized wrapper for NeuTTS model inference"""
+
+    def __init__(self, onnx_model_path: str):
+        self.session_options = ort.SessionOptions()
+
+        # Optimize for CPU performance
+        self.session_options.intra_op_num_threads = os.cpu_count() or 4
+        self.session_options.inter_op_num_threads = 2
+        self.session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+        self.session_options.enable_profiling = False
+
+        # Use CPU execution provider
+        providers = ['CPUExecutionProvider']
+
+        self.session = ort.InferenceSession(
+            onnx_model_path,
+            sess_options=self.session_options,
+            providers=providers
+        )
+
+        # Get model metadata
+        self.input_names = [inp.name for inp in self.session.get_inputs()]
+        self.output_names = [out.name for out in self.session.get_outputs()]
+
+        logger.info(f"✅ ONNX model loaded: {onnx_model_path}")
+        logger.info(f"   Inputs: {self.input_names}")
+        logger.info(f"   Outputs: {self.output_names}")
 
 class NeuTTSWrapper:
-    def __init__(self, device: str = "cpu"):
+    def __init__(self, device: str = "cpu", use_onnx: bool = USE_ONNX):
         self.tts_model = None
         self.device = device
+        self.use_onnx = use_onnx
+        self.onnx_wrapper = None
         self.load_model()
 
     def load_model(self):
         try:
-            logger.info(f"Loading NeuTTSAir model on device: {self.device}")
-
-
+            logger.info(f"Loading NeuTTSAir model on device: {self.device} (ONNX: {self.use_onnx})")
+
+            # Configure phonemizer for better performance
+            os.environ['PHONEMIZER_OPTIMIZE'] = '1'
+            os.environ['PHONEMIZER_VERBOSE'] = '0'
+
+            # Use ONNX codec decoder for maximum speed if available
+            codec_repo = "neuphonic/neucodec-onnx-decoder" if self.use_onnx else "neuphonic/neucodec"
+
+            self.tts_model = NeuTTSAir(
+                backbone_device=self.device,
+                codec_device=self.device,
+                codec_repo=codec_repo
+            )
+
+            # Initialize ONNX if enabled
+            if self.use_onnx:
+                self._initialize_onnx()
+
             logger.info("✅ NeuTTSAir model loaded successfully.")
+
+            # Test phonemizer with sample text
+            self._test_phonemizer()
+
         except Exception as e:
             logger.error(f"❌ Model loading failed: {e}")
             raise
 
+    def _initialize_onnx(self):
+        """Initialize ONNX components for optimized inference"""
+        try:
+            # Check whether the ONNX model exists; if not, fall back to PyTorch
+            onnx_model_path = os.path.join(ONNX_MODEL_DIR, "neutts_backbone.onnx")
+
+            if os.path.exists(onnx_model_path):
+                self.onnx_wrapper = NeuTTSONNXWrapper(onnx_model_path)
+                logger.info("✅ ONNX optimization enabled")
+            else:
+                logger.warning("⚠️ ONNX model not found, using PyTorch backend")
+                self.use_onnx = False
+
+        except Exception as e:
+            logger.warning(f"⚠️ ONNX initialization failed: {e}, using PyTorch backend")
+            self.use_onnx = False
+
+    def _test_phonemizer(self):
+        """Test phonemizer with sample text to catch issues early."""
+        try:
+            test_text = "Hello world this is a test."
+            # This will trigger phonemizer initialization and catch config issues
+            with torch.no_grad():
+                _ = self.tts_model.infer(test_text, torch.randn(1, 512), test_text)
+            logger.info("✅ Phonemizer tested successfully")
+        except Exception as e:
+            logger.warning(f"⚠️ Phonemizer test had issues: {e}")
+
     def _convert_to_streamable_format(self, audio_data: np.ndarray, audio_format: str) -> bytes:
         """Converts NumPy audio array to streamable bytes in the specified format."""
         audio_buffer = io.BytesIO()
@@ -108,16 +197,87 @@ class NeuTTSWrapper:
         audio_buffer.seek(0)
         return audio_buffer.read()
 
+    def _preprocess_text_for_phonemizer(self, text: str) -> str:
+        """
+        Clean text for the phonemizer to prevent word-count mismatches.
+        This eliminates the warnings and significantly speeds up processing.
+        """
+        # Remove or replace problematic characters
+        text = re.sub(r'[^\w\s\.\,\!\?\-\'\"]', '', text)  # Keep only safe chars
+
+        # Normalize whitespace
+        text = ' '.join(text.split())
+
+        # Ensure proper sentence separation for the phonemizer
+        text = re.sub(r'\.\s*', '. ', text)  # Standardize periods
+        text = re.sub(r'\?\s*', '? ', text)  # Standardize question marks
+        text = re.sub(r'\!\s*', '! ', text)  # Standardize exclamation marks
+
+        return text.strip()
+
     def _split_text_into_chunks(self, text: str) -> list[str]:
         """
-
-
+        Enhanced text splitting that's phonemizer-friendly.
+        Pre-processes each chunk to avoid word-count mismatches.
         """
-        #
-
-
-
-
+        # First, preprocess the entire text
+        clean_text = self._preprocess_text_for_phonemizer(text)
+
+        # Use more robust sentence splitting
+        sentence_endings = r'[.!?]+'
+        chunks = []
+
+        # Split on sentence endings while preserving the endings
+        start = 0
+        for match in re.finditer(sentence_endings, clean_text):
+            end = match.end()
+            chunk = clean_text[start:end].strip()
+            if chunk:
+                chunks.append(chunk)
+            start = end
+
+        # Add any remaining text
+        if start < len(clean_text):
+            remaining = clean_text[start:].strip()
+            if remaining:
+                chunks.append(remaining)
+
+        # If no sentence endings were found, split by commas or length
+        if not chunks:
+            chunks = self._fallback_chunking(clean_text)
+
+        return [chunk for chunk in chunks if chunk.strip()]
+
+    def _fallback_chunking(self, text: str) -> list[str]:
+        """Fallback chunking when no sentence endings are found."""
+        # Split by commas first
+        comma_chunks = [chunk.strip() + ',' for chunk in text.split(',') if chunk.strip()]
+        if comma_chunks:
+            # Remove the trailing comma from the last chunk
+            if comma_chunks[-1].endswith(','):
+                comma_chunks[-1] = comma_chunks[-1][:-1]
+            return comma_chunks
+
+        # Fall back to length-based chunking
+        max_chunk_length = 150
+        words = text.split()
+        chunks = []
+        current_chunk = []
+
+        for word in words:
+            current_chunk.append(word)
+            if len(' '.join(current_chunk)) > max_chunk_length:
+                if len(current_chunk) > 1:
+                    chunks.append(' '.join(current_chunk[:-1]))
+                    current_chunk = [current_chunk[-1]]
+                else:
+                    chunks.append(' '.join(current_chunk))
+                    current_chunk = []
+
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+
+        return chunks
 
     @lru_cache(maxsize=32)
     def _get_or_create_reference_encoding(self, audio_content_hash: str, audio_bytes: bytes) -> torch.Tensor:
@@ -137,11 +297,58 @@ class NeuTTSWrapper:
         # 2. Get the encoding from the cache (or create it if new)
         ref_s = self._get_or_create_reference_encoding(audio_hash, ref_audio_bytes)
 
-        # 3. Infer full text
+        # 3. Infer full text (ONNX optimized if available)
         with torch.no_grad():
             audio = self.tts_model.infer(text, ref_s, reference_text)
         return audio
 
+# --- ONNX Conversion Function ---
+
+def convert_model_to_onnx():
+    """Convert the PyTorch model to ONNX format for optimized inference"""
+    try:
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        import torch.onnx
+
+        model_repo = "neuphonic/neutts-air"
+        onnx_path = os.path.join(ONNX_MODEL_DIR, "neutts_backbone.onnx")
+
+        logger.info("Starting ONNX conversion...")
+
+        # Load the original model
+        tokenizer = AutoTokenizer.from_pretrained(model_repo)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_repo,
+            torch_dtype=torch.float32  # Use float32 for better ONNX compatibility
+        ).cpu()
+        model.eval()
+
+        # Create a dummy input (typical sequence length)
+        dummy_input = torch.randint(0, tokenizer.vocab_size, (1, 512), dtype=torch.long)
+
+        # Export to ONNX
+        torch.onnx.export(
+            model,
+            dummy_input,
+            onnx_path,
+            input_names=['input_ids'],
+            output_names=['logits'],
+            dynamic_axes={
+                'input_ids': {0: 'batch_size', 1: 'sequence_length'},
+                'logits': {0: 'batch_size', 1: 'sequence_length'}
+            },
+            opset_version=14,
+            do_constant_folding=True,
+            export_params=True,
+            verbose=False
+        )
+
+        logger.info(f"✅ ONNX conversion successful: {onnx_path}")
+        return True
+
+    except Exception as e:
+        logger.error(f"❌ ONNX conversion failed: {e}")
+        return False
 
 # --- Asynchronous Offloading ---
 
@@ -153,17 +360,23 @@ async def run_blocking_task_async(func, *args, **kwargs):
         lambda: func(*args, **kwargs)
     )
 
-
-# --- FastAPI Lifespan Manager (Kokoro Feature) ---
+# --- FastAPI Lifespan Manager ---
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    """Modern lifespan management: initialize model on startup
+    """Modern lifespan management: initialize the model on startup with ONNX optimization."""
     try:
-
+        # Convert to ONNX on first run if enabled but the model doesn't exist yet
+        if USE_ONNX and not os.path.exists(os.path.join(ONNX_MODEL_DIR, "neutts_backbone.onnx")):
+            logger.info("First run: Converting model to ONNX for optimization...")
+            success = await run_blocking_task_async(convert_model_to_onnx)
+            if not success:
+                logger.warning("ONNX conversion failed, using PyTorch backend")
+
+        app.state.tts_wrapper = NeuTTSWrapper(device=DEVICE, use_onnx=USE_ONNX)
+
     except Exception as e:
         logger.error(f"Fatal startup error: {e}")
-        # Terminate the application if the model can't load
         tts_executor.shutdown(wait=False)
         raise RuntimeError("Model initialization failed.")
 
@@ -175,8 +388,8 @@ async def lifespan(app: FastAPI):
 
 # --- FastAPI Application Setup ---
 app = FastAPI(
-    title="NeuTTS Air Instant Cloning API",
-    version="2.
+    title="NeuTTS Air Instant Cloning API (ONNX Optimized)",
+    version="2.1.0-ONNX",
     docs_url="/docs",
     lifespan=lifespan
 )
@@ -188,23 +401,28 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-# ---
+# --- Endpoints ---
 
 @app.get("/")
 async def root():
-    return {"message": "NeuTTS Air API v2.
+    return {"message": "NeuTTS Air API v2.1 - ONNX Optimized for Speed"}
 
 @app.get("/health")
 async def health_check():
-    """Enhanced health check
+    """Enhanced health check with ONNX status."""
     mem = psutil.virtual_memory()
     disk = psutil.disk_usage('/')
 
+    onnx_status = "enabled" if USE_ONNX else "disabled"
+    if hasattr(app.state, 'tts_wrapper'):
+        onnx_status = "active" if app.state.tts_wrapper.use_onnx else "fallback"
+
     return {
         "status": "healthy",
         "model_loaded": hasattr(app.state, 'tts_wrapper') and app.state.tts_wrapper.tts_model is not None,
         "device": DEVICE,
         "concurrency_limit": MAX_WORKERS,
+        "onnx_optimization": onnx_status,
         "memory_usage": {
             "total_gb": round(mem.total / (1024**3), 2),
             "used_percent": mem.percent
@@ -215,8 +433,6 @@ async def health_check():
         }
     }
 
-
-
 # --- Core Synthesis Endpoints ---
 
 @app.post("/synthesize", response_class=Response)
@@ -226,7 +442,7 @@ async def text_to_speech(
     output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
     """
-    Standard blocking TTS endpoint with in-memory processing and
+    Standard blocking TTS endpoint with in-memory processing and ONNX optimization.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
@@ -237,11 +453,11 @@ async def text_to_speech(
         converted_wav_buffer = await convert_to_wav_in_memory(reference_audio)
         ref_audio_bytes = converted_wav_buffer.getvalue()
 
-        # 2. Offload the blocking AI process (
+        # 2. Offload the blocking AI process (ONNX optimized if available)
         audio_data = await run_blocking_task_async(
             app.state.tts_wrapper.generate_speech_blocking,
             text,
-            ref_audio_bytes,
+            ref_audio_bytes,
             reference_text
         )
 
@@ -254,13 +470,17 @@ async def text_to_speech(
 
         processing_time = time.time() - start_time
         audio_duration = len(audio_data) / SAMPLE_RATE
+
+        logger.info(f"✅ Synthesis completed in {processing_time:.2f}s (ONNX: {app.state.tts_wrapper.use_onnx})")
+
        return Response(
             content=audio_bytes,
             media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
             headers={
                 "Content-Disposition": f"attachment; filename=tts_output.{output_format}",
                 "X-Processing-Time": f"{processing_time:.2f}s",
-                "X-Audio-Duration": f"{audio_duration:.2f}s"
+                "X-Audio-Duration": f"{audio_duration:.2f}s",
+                "X-ONNX-Optimized": str(app.state.tts_wrapper.use_onnx)
             }
         )
     except Exception as e:
@@ -276,15 +496,14 @@ async def stream_text_to_speech_cloning(
     output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
     """
-    Sentence-by-Sentence Streaming
-    look-ahead pipeline. This ensures true overlap of CPU work and network I/O.
+    Sentence-by-Sentence Streaming with ONNX optimization.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
 
     async def stream_generator():
         loop = asyncio.get_event_loop()
-        q = asyncio.Queue(maxsize=MAX_WORKERS + 1)
+        q = asyncio.Queue(maxsize=MAX_WORKERS + 1)
 
         async def producer():
             try:
@@ -301,6 +520,7 @@ async def stream_text_to_speech_cloning(
                 )
 
                 sentences = app.state.tts_wrapper._split_text_into_chunks(text)
+                logger.info(f"Streaming {len(sentences)} chunks (ONNX: {app.state.tts_wrapper.use_onnx})")
 
                 def process_chunk(sentence_text):
                     with torch.no_grad():
@@ -321,27 +541,25 @@ async def stream_text_to_speech_cloning(
         producer_task = asyncio.create_task(producer())
 
         # --- High-Performance Consumer with Look-Ahead ---
-        # Get the first task from the queue to start the process.
         current_task = await q.get()
 
         while current_task is not None:
-            # Simultaneously, get the NEXT task from the queue.
-            # This allows the next chunk to start processing while we wait for the current one.
             next_task = await q.get()
 
-            # Now, wait for the CURRENT task to finish.
             if isinstance(current_task, Exception):
                 raise current_task
 
             chunk_bytes = await current_task
             yield chunk_bytes
 
-            # The next task becomes the current task for the next iteration.
             current_task = next_task
 
         await producer_task
 
     return StreamingResponse(
         stream_generator(),
-        media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}"
-    )
+        media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
+        headers={
+            "X-ONNX-Optimized": str(app.state.tts_wrapper.use_onnx)
+        }
+    )