Spaces:

Rajhuggingface4253
/

neu

Paused

App Files Files Community

Rajhuggingface4253 committed on Oct 17

Commit

f8d6527

verified ·

1 Parent(s): dc2764b

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -27

app.py CHANGED Viewed

@@ -13,7 +13,7 @@ from typing import Optional, Dict, Any
 from uuid import uuid4
 from pathlib import Path
-from fastapi import FastAPI, UploadFile, File, Form, HTTPException, BackgroundTasks
 from fastapi.responses import JSONResponse, StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
@@ -21,7 +21,7 @@ import psutil
 import logging
 import soundfile as sf
-# Add NeuTTS Air to path
 sys.path.insert(0, "/app/neutts-air")
 # Configure logging
@@ -31,18 +31,19 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
-# Configuration - OPTIMIZED FOR MEMORY
 class Config:
     MAX_TEXT_LENGTH = 1000
     MIN_AUDIO_DURATION = 2
     MAX_AUDIO_DURATION = 30
-    SAMPLE_RATE = 24000
     REFERENCE_SAMPLE_RATE = 16000
     CHUNK_SIZE = 8192
     MAX_CONCURRENT_REQUESTS = 2
     CACHE_MAX_FILES = 5  # Very small cache
     CACHE_MAX_SIZE_MB = 5  # Only 5MB cache
     TEMP_FILE_TIMEOUT = 300  # 5 minutes
 config = Config()
@@ -51,6 +52,9 @@ tts_model = None
 model_loading = False
 active_requests = 0
 # Small in-memory cache for recent requests
 audio_cache = {}
 cache_access_order = []
@@ -62,6 +66,7 @@ class MemoryOptimizedProcessor:
     async def process_reference_audio(upload_file: UploadFile) -> str:
         """Process reference audio and return temp file path - CLEANED AFTER USE"""
         temp_ref_path = f"/tmp/ref_{uuid4().hex}.wav"
         try:
             # Read file content
@@ -72,7 +77,7 @@ class MemoryOptimizedProcessor:
             async with aiofiles.open(temp_input, 'wb') as f:
                 await f.write(file_content)
-            # Convert to WAV using ffmpeg
             cmd = [
                 'ffmpeg', '-i', temp_input,
                 '-ac', '1',
@@ -87,7 +92,10 @@ class MemoryOptimizedProcessor:
                 stderr=asyncio.subprocess.PIPE
             )
-            await process.communicate()
             # Validate audio duration
             try:
@@ -102,18 +110,22 @@ class MemoryOptimizedProcessor:
             return temp_ref_path
-        except Exception as e:
             # Cleanup on error
-            for temp_file in [temp_input, temp_ref_path]:
-                if os.path.exists(temp_file):
-                    try:
-                        os.remove(temp_file)
-                    except:
-                        pass
             raise
         finally:
             # Always cleanup input temp file
-            if 'temp_input' in locals() and os.path.exists(temp_input):
                 try:
                     os.remove(temp_input)
                 except:
@@ -176,6 +188,7 @@ async def load_tts_model():
         # Import and initialize model
         from neuttsair.neutts import NeuTTSAir
         tts_model = NeuTTSAir(
             backbone_repo="neuphonic/neutts-air",
             backbone_device="cpu",
@@ -183,6 +196,20 @@ async def load_tts_model():
             codec_device="cpu"
         )
         logger.info("✅ NeuTTS Air model loaded successfully!")
     except Exception as e:
@@ -209,7 +236,10 @@ async def lifespan(app: FastAPI):
     logger.info("🛑 Shutting down NeuTTS Air API")
     global tts_model
     if tts_model is not None:
-        del tts_model
         tts_model = None
     gc.collect()
@@ -285,6 +315,20 @@ async def health_check():
             system_memory_gb=0
         )
 @app.post("/synthesize", response_model=TTSResponse)
 async def synthesize_speech(
     background_tasks: BackgroundTasks,
@@ -293,15 +337,22 @@ async def synthesize_speech(
     reference_audio: UploadFile = File(...)
 ):
     """
-    Efficient synthesis with streaming and minimal memory usage
     """
     global active_requests
     start_time = time.time()
     request_id = str(uuid4())[:8]
     temp_ref_path = None
     active_requests += 1
     try:
         if tts_model is None:
             raise HTTPException(status_code=503, detail="Model loading, please wait")
@@ -315,9 +366,15 @@ async def synthesize_speech(
         # Process reference audio - creates temp file
         temp_ref_path = await MemoryOptimizedProcessor.process_reference_audio(reference_audio)
-        # Perform TTS (this is where most memory is used)
-        ref_codes = tts_model.encode_reference(temp_ref_path)
-        wav_output = tts_model.infer(text, ref_codes, reference_text)
         # Generate audio ID and add to small cache
         audio_id = f"audio_{request_id}"
@@ -344,6 +401,7 @@ async def synthesize_speech(
         raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
     finally:
         active_requests -= 1
         # Schedule cleanup of temp reference file
         if temp_ref_path and os.path.exists(temp_ref_path):
             background_tasks.add_task(cleanup_temp_file, temp_ref_path)
@@ -355,12 +413,18 @@ async def synthesize_and_stream(
     reference_audio: UploadFile = File(...)
 ):
     """
-    Direct synthesis and streaming - no caching, minimal memory
     """
     global active_requests
     start_time = time.time()
     temp_ref_path = None
     active_requests += 1
     try:
@@ -370,9 +434,12 @@ async def synthesize_and_stream(
         # Process reference audio
         temp_ref_path = await MemoryOptimizedProcessor.process_reference_audio(reference_audio)
-        # Perform TTS
-        ref_codes = tts_model.encode_reference(temp_ref_path)
-        wav_output = tts_model.infer(text, ref_codes, reference_text)
         # Convert to WAV bytes in memory
         wav_buffer = io.BytesIO()
@@ -383,7 +450,7 @@ async def synthesize_and_stream(
         logger.info(f"Stream synthesis completed: {processing_time:.2f}s")
-        # Stream directly without storing
         async def generate_stream():
             chunk_size = config.CHUNK_SIZE
             for i in range(0, len(wav_bytes), chunk_size):
@@ -400,11 +467,14 @@ async def synthesize_and_stream(
             }
         )
     except Exception as e:
         logger.error(f"Stream synthesis error: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Stream synthesis failed: {str(e)}")
     finally:
         active_requests -= 1
         if temp_ref_path and os.path.exists(temp_ref_path):
             asyncio.create_task(cleanup_temp_file(temp_ref_path))

 from uuid import uuid4
 from pathlib import Path
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException, BackgroundTasks, Response
 from fastapi.responses import JSONResponse, StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
 import logging
 import soundfile as sf
+# Add NeuTTS Air to path (adjust if needed)
 sys.path.insert(0, "/app/neutts-air")
 # Configure logging
 )
 logger = logging.getLogger(__name__)
+# Configuration - OPTIMIZED FOR MEMORY & CPU
 class Config:
     MAX_TEXT_LENGTH = 1000
     MIN_AUDIO_DURATION = 2
     MAX_AUDIO_DURATION = 30
+    SAMPLE_RATE = 24000  # Consider 16000 if you need more speed/less CPU
     REFERENCE_SAMPLE_RATE = 16000
     CHUNK_SIZE = 8192
     MAX_CONCURRENT_REQUESTS = 2
     CACHE_MAX_FILES = 5  # Very small cache
     CACHE_MAX_SIZE_MB = 5  # Only 5MB cache
     TEMP_FILE_TIMEOUT = 300  # 5 minutes
+    ENABLE_QUANTIZATION = True  # Best-effort dynamic quantization
 config = Config()
 model_loading = False
 active_requests = 0
+# Concurrency control
+_infer_semaphore: asyncio.Semaphore = asyncio.Semaphore(config.MAX_CONCURRENT_REQUESTS)
 # Small in-memory cache for recent requests
 audio_cache = {}
 cache_access_order = []
     async def process_reference_audio(upload_file: UploadFile) -> str:
         """Process reference audio and return temp file path - CLEANED AFTER USE"""
         temp_ref_path = f"/tmp/ref_{uuid4().hex}.wav"
+        temp_input = None
         try:
             # Read file content
             async with aiofiles.open(temp_input, 'wb') as f:
                 await f.write(file_content)
+            # Convert to WAV using ffmpeg (installed in image)
             cmd = [
                 'ffmpeg', '-i', temp_input,
                 '-ac', '1',
                 stderr=asyncio.subprocess.PIPE
             )
+            _, stderr = await process.communicate()
+            if process.returncode != 0:
+                logger.warning(f"ffmpeg conversion failed: {stderr.decode('utf-8', errors='ignore')}")
+                raise ValueError("ffmpeg failed to convert reference audio")
             # Validate audio duration
             try:
             return temp_ref_path
+        except Exception:
             # Cleanup on error
+            if temp_input and os.path.exists(temp_input):
+                try:
+                    os.remove(temp_input)
+                except:
+                    pass
+            if os.path.exists(temp_ref_path):
+                try:
+                    os.remove(temp_ref_path)
+                except:
+                    pass
             raise
         finally:
             # Always cleanup input temp file
+            if temp_input and os.path.exists(temp_input):
                 try:
                     os.remove(temp_input)
                 except:
         # Import and initialize model
         from neuttsair.neutts import NeuTTSAir
+        # Force CPU devices so Hugging Face free tier works
         tts_model = NeuTTSAir(
             backbone_repo="neuphonic/neutts-air",
             backbone_device="cpu",
             codec_device="cpu"
         )
+        # Best-effort: dynamic quantization to speed up CPU inference
+        if config.ENABLE_QUANTIZATION:
+            try:
+                # quantize_dynamic is safe for models with Linear/RNN modules; skip if not compatible
+                if isinstance(tts_model, torch.nn.Module):
+                    tts_model = torch.quantization.quantize_dynamic(
+                        tts_model, {torch.nn.Linear}, dtype=torch.qint8
+                    )
+                    logger.info("✅ Applied dynamic quantization to model (best-effort).")
+                else:
+                    logger.info("Model is not an nn.Module; skipping dynamic quantization.")
+            except Exception as e:
+                logger.warning(f"Dynamic quantization failed (continuing without it): {e}")
         logger.info("✅ NeuTTS Air model loaded successfully!")
     except Exception as e:
     logger.info("🛑 Shutting down NeuTTS Air API")
     global tts_model
     if tts_model is not None:
+        try:
+            del tts_model
+        except:
+            pass
         tts_model = None
     gc.collect()
             system_memory_gb=0
         )
+async def _encode_reference_async(temp_ref_path: str):
+    """Wrap encode_reference to run off the event loop"""
+    def _encode():
+        with torch.inference_mode(), torch.no_grad():
+            return tts_model.encode_reference(temp_ref_path)
+    return await asyncio.to_thread(_encode)
+async def _infer_async(text: str, ref_codes, reference_text: str):
+    """Wrap infer to run off the event loop"""
+    def _infer():
+        with torch.inference_mode(), torch.no_grad():
+            return tts_model.infer(text, ref_codes, reference_text)
+    return await asyncio.to_thread(_infer)
 @app.post("/synthesize", response_model=TTSResponse)
 async def synthesize_speech(
     background_tasks: BackgroundTasks,
     reference_audio: UploadFile = File(...)
 ):
     """
+    Efficient synthesis with small-cache and minimal memory usage.
+    Uses a semaphore to limit concurrent CPU-bound inferences.
     """
     global active_requests
     start_time = time.time()
     request_id = str(uuid4())[:8]
     temp_ref_path = None
+    # Quick concurrency check
+    if _infer_semaphore.locked():
+        # All inference slots are taken; reject immediately instead of waiting
+        raise HTTPException(status_code=503, detail="Server busy - try again shortly")
+    await _infer_semaphore.acquire()
     active_requests += 1
     try:
         if tts_model is None:
             raise HTTPException(status_code=503, detail="Model loading, please wait")
         # Process reference audio - creates temp file
         temp_ref_path = await MemoryOptimizedProcessor.process_reference_audio(reference_audio)
+        # Encode reference (run in thread)
+        ref_codes = await _encode_reference_async(temp_ref_path)
+        # Perform TTS (run in thread)
+        wav_output = await _infer_async(text, ref_codes, reference_text)
+        if not isinstance(wav_output, np.ndarray):
+            # Defensive conversion if needed
+            wav_output = np.asarray(wav_output, dtype=np.float32)
         # Generate audio ID and add to small cache
         audio_id = f"audio_{request_id}"
         raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
     finally:
         active_requests -= 1
+        _infer_semaphore.release()
         # Schedule cleanup of temp reference file
         if temp_ref_path and os.path.exists(temp_ref_path):
             background_tasks.add_task(cleanup_temp_file, temp_ref_path)
     reference_audio: UploadFile = File(...)
 ):
     """
+    Direct synthesis and streaming - inference runs off the event loop
+    and the resulting audio is streamed as chunks once ready.
     """
     global active_requests
     start_time = time.time()
     temp_ref_path = None
+    # Quick concurrency check
+    if _infer_semaphore.locked():
+        raise HTTPException(status_code=503, detail="Server busy - try again shortly")
+    await _infer_semaphore.acquire()
     active_requests += 1
     try:
         # Process reference audio
         temp_ref_path = await MemoryOptimizedProcessor.process_reference_audio(reference_audio)
+        # Encode & infer in a worker thread (off the event loop); both calls are awaited
+        ref_codes = await _encode_reference_async(temp_ref_path)
+        wav_output = await _infer_async(text, ref_codes, reference_text)
+        if not isinstance(wav_output, np.ndarray):
+            wav_output = np.asarray(wav_output, dtype=np.float32)
         # Convert to WAV bytes in memory
         wav_buffer = io.BytesIO()
         logger.info(f"Stream synthesis completed: {processing_time:.2f}s")
+        # Stream directly without storing on disk
         async def generate_stream():
             chunk_size = config.CHUNK_SIZE
             for i in range(0, len(wav_bytes), chunk_size):
             }
         )
+    except HTTPException:
+        raise
     except Exception as e:
         logger.error(f"Stream synthesis error: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Stream synthesis failed: {str(e)}")
     finally:
         active_requests -= 1
+        _infer_semaphore.release()
         if temp_ref_path and os.path.exists(temp_ref_path):
             asyncio.create_task(cleanup_temp_file(temp_ref_path))