Spaces:

Rajhuggingface4253
/

neu

Paused

App Files Files Community

Rajhuggingface4253 commited on Oct 18

Commit

6caac4d

verified ·

1 Parent(s): 2ed48de

Update app.py

Browse files

Files changed (1) hide show

app.py +280 -688

app.py CHANGED Viewed

@@ -1,770 +1,362 @@
 import os
-import sys
 import time
-import gc
-import torch
 import numpy as np
-import asyncio
-import aiofiles
-import re
-import io
-from concurrent.futures import ThreadPoolExecutor
-from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Query
-from fastapi.responses import JSONResponse, FileResponse, StreamingResponse, Response
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel, Field
-from typing import Optional, Dict, Any, Generator, List
 import psutil
-import logging
 import soundfile as sf
 from contextlib import asynccontextmanager
-os.environ['HF_HOME'] = '/app/cache'
-os.environ['HUGGINGFACE_HUB_CACHE'] = '/app/cache'
-# Add NeuTTS Air to path
-sys.path.append("neutts-air")
 # Configure logging
 logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Device detection and optimization
-def get_best_device():
-    return "cuda" if torch.cuda.is_available() else "cpu"
-DEVICE = get_best_device()
-MAX_WORKERS = 1 if DEVICE == "cpu" else 2
 tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
-# Global model instance
-tts_model = None
-model_loading = False
-# Pydantic models
-class TTSRequest(BaseModel):
-    text: str = Field(..., min_length=1, max_length=5000)
-    reference_text: str = Field(..., min_length=1, max_length=1000)
-    reference_audio_path: Optional[str] = None
-    output_format: str = Field(default="wav")
-    speed: float = Field(default=1.0, ge=0.5, le=2.0)
-class StreamingRequest(BaseModel):
-    text: str = Field(..., min_length=1, max_length=5000)
-    reference_text: str = Field(..., min_length=1, max_length=1000)
-    reference_audio_path: str
-    speed: float = Field(default=1.0, ge=0.5, le=2.0)
-    chunk_size: int = Field(default=2048, ge=512, le=8192)
-class TTSResponse(BaseModel):
-    success: bool
-    audio_url: Optional[str] = None
-    message: Optional[str] = None
-    processing_time: Optional[float] = None
-    audio_duration: Optional[float] = None
-class HealthResponse(BaseModel):
-    status: str
-    model_loaded: bool
-    device: str
-    memory_usage: Dict[str, float]
-    disk_usage: Dict[str, float]
-    streaming_supported: bool = True
-def load_tts_model():
-    global tts_model, model_loading
-    if tts_model is not None or model_loading:
-        return
-    model_loading = True
-    try:
-        logger.info(f"Loading NeuTTS Air model on {DEVICE}...")
-        # Try to import with fallbacks
-        try:
-            from neuttsair.neutts import NeuTTSAir
-        except ImportError as e:
-            logger.error(f"Failed to import NeuTTS Air: {e}")
-            # Try alternative import path
-            sys.path.insert(0, "/app/neutts-air")
-            from neuttsair.neutts import NeuTTSAir
-        # Use appropriate device with fallback
-        device = DEVICE
         try:
-            tts_model = NeuTTSAir(
-                backbone_repo="neuphonic/neutts-air",
-                backbone_device=device,
-                codec_repo="neuphonic/neucodec",
-                codec_device=device
-            )
         except Exception as e:
-            logger.warning(f"Failed to load on {device}, falling back to CPU: {e}")
-            tts_model = NeuTTSAir(
-                backbone_repo="neuphonic/neutts-air",
-                backbone_device="cpu",
-                codec_repo="neuphonic/neucodec",
-                codec_device="cpu"
-            )
-        # Warm up the model
-        warm_up_model()
-        logger.info("NeuTTS Air model loaded successfully!")
-    except Exception as e:
-        logger.error(f"Failed to load model: {str(e)}")
-        model_loading = False
-        raise e
-    model_loading = False
-def warm_up_model():
-    """Warm up the model with a short inference"""
-    try:
-        if tts_model is None:
-            return
-        logger.info("Warming up model...")
-        # Create a temporary warm-up audio file
-        temp_dir = "temp_audio"
-        os.makedirs(temp_dir, exist_ok=True)
-        # Generate a simple sine wave as warm-up reference
-        import scipy.io.wavfile as wavfile
-        warmup_audio_path = os.path.join(temp_dir, "warmup_ref.wav")
-        # Create 1 second of 440Hz sine wave
-        sample_rate = 24000
-        t = np.linspace(0, 1, sample_rate)
-        audio_data = 0.3 * np.sin(2 * np.pi * 440 * t)
-        audio_data = (audio_data * 32767).astype(np.int16)
-        wavfile.write(warmup_audio_path, sample_rate, audio_data)
-        # Perform warm-up inference
-        ref_codes = tts_model.encode_reference(warmup_audio_path)
-        wav = tts_model.infer("Hello, this is a warm-up.", ref_codes, "Hello warm up")
-        # Clean up
-        if os.path.exists(warmup_audio_path):
-            os.remove(warmup_audio_path)
-        logger.info(f"Model warm-up completed! Generated audio length: {len(wav)}")
-    except Exception as e:
-        logger.warning(f"Model warm-up failed: {e}")
-def validate_audio_file(audio_path: str):
-    """
-    Enhanced audio validation with strict NeuTTS Air requirements
-    Reference: 3-15 seconds of clean, mono audio for optimal results
-    """
-    try:
-        import librosa
-        # Check file exists
-        if not os.path.exists(audio_path):
-            raise ValueError("Audio file not found")
-        # Check file size (roughly 10MB limit)
-        file_size = os.path.getsize(audio_path) / (1024 * 1024)  # MB
-        if file_size > 10:
-            raise ValueError(f"Audio file too large: {file_size:.1f}MB. Maximum 10MB allowed.")
-        # Load and validate audio properties
-        audio_data, sample_rate = librosa.load(audio_path, sr=None, mono=False)
-        audio_duration = librosa.get_duration(y=audio_data, sr=sample_rate)
-        # Enhanced validation rules based on NeuTTS Air specifications
-        if audio_duration < 3 or audio_duration > 15:
-            raise ValueError(f"Audio duration ({audio_duration:.1f}s) must be between 3-15 seconds for optimal voice cloning")
-        if len(audio_data.shape) > 1 and audio_data.shape[0] > 1:
-            logger.warning("Stereo audio detected. For best results, use mono audio")
-            # Convert to mono by averaging channels
-            audio_data = np.mean(audio_data, axis=0)
-        if sample_rate < 16000 or sample_rate > 44100:
-            logger.warning(f"Sample rate {sample_rate}Hz should ideally be between 16-44kHz")
-        # Check for sufficient audio quality (basic RMS check)
-        rms = np.sqrt(np.mean(audio_data**2))
-        if rms < 0.01:  # Too quiet
-            raise ValueError("Audio signal is too quiet. Please use a clearer recording.")
-        logger.info(f"Audio validation passed: {audio_duration:.1f}s, {sample_rate}Hz")
-        return audio_duration
-    except Exception as e:
-        logger.error(f"Audio validation failed: {str(e)}")
-        raise ValueError(f"Invalid audio file: {str(e)}")
-def intelligent_text_chunking(text: str) -> List[str]:
-    """
-    Intelligent text chunking for optimal streaming
-    Splits text into meaningful chunks for sequential processing
-    """
-    # Clean and normalize text
-    text = re.sub(r'\s+', ' ', text.strip())
-    # First, split by sentences (., !, ?)
-    sentences = re.split(r'(?<=[.!?])\s+', text)
-    chunks = []
-    for sentence in sentences:
-        sentence = sentence.strip()
-        if not sentence:
-            continue
-        # If sentence is too long, split by clauses (commas, semicolons)
-        if len(sentence) > 100:
-            clauses = re.split(r'(?<=[,;:])\s+', sentence)
-            for clause in clauses:
-                clause = clause.strip()
-                if clause:
-                    # If clause is still long, split by length
-                    if len(clause) > 80:
-                        words = clause.split()
-                        current_chunk = []
-                        current_length = 0
-                        for word in words:
-                            if current_length + len(word) + 1 > 80 and current_chunk:
-                                chunks.append(' '.join(current_chunk))
-                                current_chunk = [word]
-                                current_length = len(word)
-                            else:
-                                current_chunk.append(word)
-                                current_length += len(word) + 1
-                        if current_chunk:
-                            chunks.append(' '.join(current_chunk))
-                    else:
-                        chunks.append(clause)
-        else:
-            chunks.append(sentence)
-    # Ensure we have at least one chunk
-    if not chunks:
-        chunks = [text]
-    logger.info(f"Split text into {len(chunks)} chunks for streaming")
-    return chunks
-async def generate_chunk_audio(chunk_text: str, ref_codes: Any, reference_text: str, speed: float) -> np.ndarray:
-    """Generate audio for a single text chunk asynchronously"""
     loop = asyncio.get_event_loop()
     return await loop.run_in_executor(
         tts_executor,
-        tts_model.infer,
-        chunk_text, ref_codes, reference_text
     )
-async def convert_chunk_to_mp3(audio_chunk: np.ndarray) -> bytes:
-    """Convert audio chunk to MP3 format asynchronously"""
-    loop = asyncio.get_event_loop()
-    def _convert():
-        mp3_buffer = io.BytesIO()
-        sf.write(mp3_buffer, audio_chunk, 24000, format='mp3')
-        return mp3_buffer.getvalue()
-    return await loop.run_in_executor(tts_executor, _convert)
-def generate_silent_mp3_header(duration_ms: int = 100) -> bytes:
-    """Generate a short silent MP3 header for immediate playback"""
-    silent_audio = np.zeros(int(24000 * duration_ms / 1000))  # 100ms of silence
-    mp3_buffer = io.BytesIO()
-    sf.write(mp3_buffer, silent_audio, 24000, format='mp3')
-    return mp3_buffer.getvalue()
-async def true_realtime_generator(
-    text: str,
-    ref_codes: Any,
-    reference_text: str,
-    speed: float = 1.0
-) -> Generator[bytes, None, None]:
-    """
-    TRUE real-time streaming generator
-    Processes text line-by-line and streams MP3 chunks immediately
-    """
-    start_time = time.time()
     try:
-        logger.info("Starting TRUE real-time streaming generation...")
-        # Step 1: Send MP3 header for immediate browser playback
-        header_data = generate_silent_mp3_header()
-        yield header_data
-        logger.info("Sent MP3 header for immediate playback")
-        # Step 2: Intelligent text chunking
-        text_chunks = intelligent_text_chunking(text)
-        total_chunks = len(text_chunks)
-        logger.info(f"Processing {total_chunks} text chunks sequentially")
-        # Step 3: Process each chunk in sequence with immediate streaming
-        successful_chunks = 0
-        for chunk_index, chunk_text in enumerate(text_chunks, 1):
-            if not chunk_text.strip():
-                continue
-            chunk_start_time = time.time()
-            logger.info(f"Processing chunk {chunk_index}/{total_chunks}: '{chunk_text[:50]}...'")
-            try:
-                # Generate audio for this specific chunk
-                chunk_audio = await generate_chunk_audio(chunk_text, ref_codes, reference_text, speed)
-                # Convert to MP3 immediately
-                mp3_data = await convert_chunk_to_mp3(chunk_audio)
-                # Stream the MP3 chunk immediately
-                yield mp3_data
-                chunk_processing_time = time.time() - chunk_start_time
-                successful_chunks += 1
-                logger.info(f"✓ Streamed chunk {chunk_index}/{total_chunks} in {chunk_processing_time:.2f}s, size: {len(mp3_data)} bytes")
-                # Small delay to ensure smooth streaming (optional)
-                await asyncio.sleep(0.01)
-            except Exception as chunk_error:
-                logger.error(f"✗ Failed to process chunk {chunk_index}: {chunk_error}")
-                # Continue with next chunk instead of failing entirely
-                continue
-        total_processing_time = time.time() - start_time
-        logger.info(f"TRUE real-time streaming completed: {successful_chunks}/{total_chunks} chunks in {total_processing_time:.2f}s")
     except Exception as e:
-        logger.error(f"TRUE real-time streaming generator failed: {e}")
-        raise
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    """Modern lifespan management"""
     try:
-        load_tts_model()
-        logger.info(f"✅ NeuTTS Air model loaded on {DEVICE}")
     except Exception as e:
-        logger.error(f"❌ Model loading failed: {e}")
-        raise
-    yield
-    # Cleanup
     tts_executor.shutdown(wait=False)
-    # Clean up temporary files
-    await cleanup_audio_files()
 app = FastAPI(
-    title="NeuTTS Air API - Enhanced",
-    description="High-quality on-device Text-to-Speech with instant voice cloning and TRUE real-time streaming",
-    version="2.1.0",
-    docs_url="/docs",
     lifespan=lifespan
 )
-# CORS middleware
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
-    allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
-async def run_tts_async(text: str, ref_codes: Any, reference_text: str, speed: float = 1.0):
-    """Offload blocking TTS call to thread pool"""
-    loop = asyncio.get_event_loop()
-    return await loop.run_in_executor(
-        tts_executor,
-        tts_model.infer,
-        text, ref_codes, reference_text
-    )
-def encode_reference_async(audio_path: str):
-    """Encode reference audio in thread pool"""
-    loop = asyncio.get_event_loop()
-    return loop.run_in_executor(
-        tts_executor,
-        tts_model.encode_reference,
-        audio_path
-    )
 @app.get("/")
 async def root():
     return {
-        "message": "Enhanced NeuTTS Air API with TRUE Real-time Streaming!",
         "status": "healthy",
-        "version": "2.1.0",
-        "features": [
-            "voice_cloning",
-            "true_realtime_streaming",
-            "line_by_line_processing",
-            "multiple_formats",
-            "production_ready"
-        ]
     }
-@app.get("/health")
-async def health_check():
-    """Enhanced health check endpoint"""
-    try:
-        memory = psutil.virtual_memory()
-        disk = psutil.disk_usage('/')
-        return HealthResponse(
-            status="healthy",
-            model_loaded=tts_model is not None,
-            device=DEVICE,
-            memory_usage={
-                "total_gb": round(memory.total / (1024**3), 2),
-                "available_gb": round(memory.available / (1024**3), 2),
-                "used_percent": round(memory.percent, 2)
-            },
-            disk_usage={
-                "total_gb": round(disk.total / (1024**3), 2),
-                "free_gb": round(disk.free / (1024**3), 2),
-                "used_percent": round(disk.percent, 2)
-            }
-        )
-    except Exception as e:
-        return HealthResponse(
-            status="degraded",
-            model_loaded=tts_model is not None,
-            device=DEVICE,
-            memory_usage={"error": str(e)},
-            disk_usage={"error": str(e)}
-        )
-@app.post("/synthesize")
-async def synthesize_speech(
-    reference_text: str = Form(..., min_length=1, max_length=1000),
-    text: str = Form(..., min_length=1, max_length=5000),
-    reference_audio: UploadFile = File(...),
-    output_format: str = Form("wav"),
-    speed: float = Form(1.0)
 ):
     """
-    Standard synthesis endpoint with audio validation and multiple output formats
     """
     start_time = time.time()
-    if tts_model is None:
-        raise HTTPException(status_code=503, detail="Model not loaded yet")
-    temp_ref_path = None
     try:
-        # Save uploaded file temporarily
-        temp_dir = "temp_audio"
-        os.makedirs(temp_dir, exist_ok=True)
-        file_extension = os.path.splitext(reference_audio.filename)[1] or ".wav"
-        temp_ref_path = os.path.join(temp_dir, f"ref_{int(time.time())}{file_extension}")
-        async with aiofiles.open(temp_ref_path, 'wb') as out_file:
-            content = await reference_audio.read()
-            await out_file.write(content)
-        # Enhanced audio validation
-        audio_duration = validate_audio_file(temp_ref_path)
-        # Perform TTS
-        logger.info(f"Starting synthesis for text: {text[:50]}...")
-        # Encode reference and generate speech asynchronously
-        ref_codes = await encode_reference_async(temp_ref_path)
-        wav = await run_tts_async(text, ref_codes, reference_text, speed)
         processing_time = time.time() - start_time
-        output_audio_duration = len(wav) / 24000
-        logger.info(f"Synthesis completed in {processing_time:.2f}s")
-        # Handle different output formats
-        if output_format.lower() in ["mp3", "flac"]:
-            audio_buffer = io.BytesIO()
-            if output_format.lower() == "mp3":
-                sf.write(audio_buffer, wav, 24000, format='mp3')
-                media_type = "audio/mpeg"
-            else:
-                sf.write(audio_buffer, wav, 24000, format='flac')
-                media_type = "audio/flac"
-            audio_buffer.seek(0)
-            return Response(
-                content=audio_buffer.read(),
-                media_type=media_type,
-                headers={
-                    "Content-Disposition": f"attachment; filename=cloned_speech.{output_format}",
-                    "X-Processing-Time": str(round(processing_time, 2)),
-                    "X-Audio-Duration": str(round(output_audio_duration, 2))
-                }
-            )
-        else:
-            # Default WAV format
-            output_dir = "generated_audio"
-            os.makedirs(output_dir, exist_ok=True)
-            output_filename = f"output_{int(time.time())}.wav"
-            output_path = os.path.join(output_dir, output_filename)
-            sf.write(output_path, wav, 24000)
-            return TTSResponse(
-                success=True,
-                audio_url=f"/audio/{output_filename}",
-                message="Speech synthesized successfully",
-                processing_time=round(processing_time, 2),
-                audio_duration=round(output_audio_duration, 2)
-            )
-    except ValueError as e:
-        raise HTTPException(status_code=400, detail=str(e))
-    except Exception as e:
-        logger.error(f"Synthesis error: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
-    finally:
-        # Clean up temporary file
-        if temp_ref_path and os.path.exists(temp_ref_path):
-            try:
-                os.remove(temp_ref_path)
-            except:
-                pass
-@app.post("/synthesize/true-realtime")
-async def true_realtime_synthesis(request: StreamingRequest):
-    """
-    TRUE real-time streaming endpoint - processes text line-by-line and streams immediately
-    First audio chunk delivered in 2-3 seconds even for long texts
-    """
-    if tts_model is None:
-        raise HTTPException(status_code=503, detail="Model not loaded yet")
-    try:
-        # Validate reference audio exists and meets requirements
-        if not os.path.exists(request.reference_audio_path):
-            raise HTTPException(status_code=400, detail="Reference audio path not found")
-        validate_audio_file(request.reference_audio_path)
-        # Encode reference asynchronously (this happens once at the start)
-        ref_codes = await encode_reference_async(request.reference_audio_path)
-        start_time = time.time()
-        return StreamingResponse(
-            true_realtime_generator(
-                text=request.text,
-                ref_codes=ref_codes,
-                reference_text=request.reference_text,
-                speed=request.speed
-            ),
-            media_type="audio/mpeg",
             headers={
-                "Content-Disposition": "attachment; filename=realtime_speech.mp3",
-                "Transfer-Encoding": "chunked",
-                "X-Streaming-Type": "true-realtime-line-by-line",
-                "X-First-Chunk-ETA": "2-3s",
-                "Cache-Control": "no-cache",
-                "X-Start-Time": str(start_time)
             }
         )
     except Exception as e:
-        logger.error(f"TRUE real-time streaming error: {e}")
-        raise HTTPException(status_code=500, detail=f"TRUE real-time streaming failed: {str(e)}")
-# Legacy streaming endpoint (fake streaming) for backward compatibility
 @app.post("/synthesize/stream")
-async def legacy_stream_synthesis(request: StreamingRequest):
     """
-    Legacy streaming endpoint (fake streaming) - for backward compatibility
-    Use /synthesize/true-realtime for real streaming
     """
-    if tts_model is None:
-        raise HTTPException(status_code=503, detail="Model not loaded yet")
-    try:
-        if not os.path.exists(request.reference_audio_path):
-            raise HTTPException(status_code=400, detail="Reference audio path not found")
-        validate_audio_file(request.reference_audio_path)
-        ref_codes = await encode_reference_async(request.reference_audio_path)
-        # Legacy approach: generate complete audio then chunk
-        def legacy_stream_generator():
-            wav = tts_model.infer(request.text, ref_codes, request.reference_text)
-            audio_buffer = io.BytesIO()
-            sf.write(audio_buffer, wav, 24000, format='mp3')
-            audio_data = audio_buffer.getvalue()
-            # Stream in chunks
-            chunk_size = request.chunk_size
-            for i in range(0, len(audio_data), chunk_size):
-                yield audio_data[i:i + chunk_size]
-        return StreamingResponse(
-            legacy_stream_generator(),
-            media_type="audio/mpeg",
-            headers={
-                "Content-Disposition": "attachment; filename=legacy_stream.mp3",
-                "X-Streaming-Type": "legacy-chunked"
-            }
-        )
-    except Exception as e:
-        logger.error(f"Legacy streaming error: {e}")
-        raise HTTPException(status_code=500, detail=f"Legacy streaming failed: {str(e)}")
 @app.get("/audio/{filename}")
-async def get_audio_file(filename: str):
-    """Serve generated audio files"""
-    file_path = os.path.join("generated_audio", filename)
     if not os.path.exists(file_path):
         raise HTTPException(status_code=404, detail="Audio file not found")
-    return FileResponse(
-        file_path,
-        media_type="audio/wav",
-        filename=f"cloned_speech_{filename}"
-    )
-@app.post("/synthesize-with-url")
-async def synthesize_with_url(request: TTSRequest):
-    """
-    Enhanced synthesis with URL support and multiple formats
-    """
-    start_time = time.time()
-    if tts_model is None:
-        raise HTTPException(status_code=503, detail="Model not loaded yet")
-    if not request.reference_audio_path or not os.path.exists(request.reference_audio_path):
-        raise HTTPException(status_code=400, detail="Reference audio path not found")
-    try:
-        validate_audio_file(request.reference_audio_path)
-        # Perform TTS asynchronously
-        logger.info(f"Starting synthesis for text: {request.text[:50]}...")
-        ref_codes = await encode_reference_async(request.reference_audio_path)
-        wav = await run_tts_async(request.text, ref_codes, request.reference_text, request.speed)
-        processing_time = time.time() - start_time
-        audio_duration = len(wav) / 24000
-        # Handle output format
-        if request.output_format.lower() in ["mp3", "flac"]:
-            audio_buffer = io.BytesIO()
-            if request.output_format.lower() == "mp3":
-                sf.write(audio_buffer, wav, 24000, format='mp3')
-                media_type = "audio/mpeg"
-            else:
-                sf.write(audio_buffer, wav, 24000, format='flac')
-                media_type = "audio/flac"
-            audio_buffer.seek(0)
-            return Response(
-                content=audio_buffer.read(),
-                media_type=media_type,
-                headers={
-                    "Content-Disposition": f"attachment; filename=cloned_speech.{request.output_format}",
-                    "X-Processing-Time": str(round(processing_time, 2)),
-                    "X-Audio-Duration": str(round(audio_duration, 2))
-                }
-            )
-        else:
-            # Save as WAV
-            output_dir = "generated_audio"
-            os.makedirs(output_dir, exist_ok=True)
-            output_filename = f"output_{int(time.time())}.wav"
-            output_path = os.path.join(output_dir, output_filename)
-            sf.write(output_path, wav, 24000)
-            return TTSResponse(
-                success=True,
-                audio_url=f"/audio/{output_filename}",
-                message="Speech synthesized successfully",
-                processing_time=round(processing_time, 2),
-                audio_duration=round(audio_duration, 2)
-            )
-    except Exception as e:
-        logger.error(f"Synthesis error: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
-@app.delete("/cleanup")
-async def cleanup_audio_files():
-    """Enhanced cleanup with efficient file management"""
-    try:
-        output_dir = "generated_audio"
-        temp_dir = "temp_audio"
-        deleted_count = 0
-        current_time = time.time()
-        # Clean generated audio
-        if os.path.exists(output_dir):
-            for filename in os.listdir(output_dir):
-                file_path = os.path.join(output_dir, filename)
-                if os.path.isfile(file_path):
-                    file_age = current_time - os.path.getctime(file_path)
-                    if file_age > 3600:  # 1 hour
-                        os.remove(file_path)
-                        deleted_count += 1
-        # Clean temp audio (shorter retention)
-        if os.path.exists(temp_dir):
-            for filename in os.listdir(temp_dir):
-                file_path = os.path.join(temp_dir, filename)
-                if os.path.isfile(file_path):
-                    file_age = current_time - os.path.getctime(file_path)
-                    if file_age > 1800:  # 30 minutes for temp files
-                        os.remove(file_path)
-                        deleted_count += 1
-        # Force garbage collection
-        gc.collect()
-        return {
-            "message": f"Cleaned up {deleted_count} files",
-            "memory_cleaned": "true",
-            "next_cleanup": "in_1_hour"
-        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Cleanup failed: {str(e)}")
-# GET endpoint for simple synthesis
-@app.get("/synthesize")
-async def synthesize_speech_get(
-    text: str = Query(..., min_length=1, max_length=5000),
-    reference_text: str = Query(..., min_length=1, max_length=1000),
-    reference_audio_path: str = Query(...),
-    output_format: str = Query("wav"),
-    speed: float = Query(1.0)
-):
-    """GET endpoint for speech synthesis"""
-    request = TTSRequest(
-        text=text,
-        reference_text=reference_text,
-        reference_audio_path=reference_audio_path,
-        output_format=output_format,
-        speed=speed
     )
-    return await synthesize_with_url(request)
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=7860, workers=1)

 import os
+import io
+import asyncio
 import time
+import shutil
 import numpy as np
 import psutil
 import soundfile as sf
+from concurrent.futures import ThreadPoolExecutor
+from typing import Optional, Generator
 from contextlib import asynccontextmanager
+import logging
+import aiofiles
+import torch
+from fastapi import FastAPI, HTTPException, Response, StreamingResponse, UploadFile, File, Form
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+# Ensure the cloned neutts-air repository is in the path
+import sys
+sys.path.append(os.path.join(os.getcwd(), 'neutts-air'))
+from neuttsair.neutts import NeuTTSAir
 # Configure logging
 logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("NeuTTS-API")
+# --- Configuration & Utility Functions ---
+# Explicitly use CPU as per Dockerfile and Hugging Face free tier compatibility
+DEVICE = "cpu"
+# Configure Max Workers for concurrent synthesis threads (1-2 is safe for CPU-only)
+MAX_WORKERS = 2
 tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
+SAMPLE_RATE = 24000
+CLEANUP_THRESHOLD = 3600 # 1 hour in seconds
+TEMP_AUDIO_DIR = "temp_audio"
+GENERATED_AUDIO_DIR = "generated_audio"
+os.makedirs(TEMP_AUDIO_DIR, exist_ok=True)
+os.makedirs(GENERATED_AUDIO_DIR, exist_ok=True)
+class TTSRequestModel(BaseModel):
+    """Model for non-file inputs to synthesis and streaming."""
+    text: str = Field(..., min_length=1, max_length=1000)
+    speed: float = Field(default=1.0, ge=0.5, le=2.0)
+    output_format: str = Field(default="wav", pattern="^(wav|mp3|flac)$")
+# --- Model Wrapper and Logic ---
+class NeuTTSWrapper:
+    def __init__(self, device: str = "cpu"):
+        self.tts_model = None
+        self.device = device
+        self.load_model()
+    def load_model(self):
         try:
+            logger.info(f"Loading NeuTTSAir model on device: {self.device}")
+            # Ensure we respect the CPU configuration
+            self.tts_model = NeuTTSAir(backbone_device=self.device, codec_device=self.device)
+            logger.info("✅ NeuTTSAir model loaded successfully.")
         except Exception as e:
+            logger.error(f"❌ Model loading failed: {e}")
+            raise
+    def _convert_to_streamable_format(self, audio_data: np.ndarray, audio_format: str) -> bytes:
+        """Converts NumPy audio array to streamable bytes in the specified format."""
+        audio_buffer = io.BytesIO()
+        try:
+            sf.write(audio_buffer, audio_data, SAMPLE_RATE, format=audio_format)
+        except Exception as e:
+            logger.error(f"Failed to write audio data to format {audio_format}: {e}")
+            raise
+        audio_buffer.seek(0)
+        return audio_buffer.read()
+    def _split_text_into_chunks(self, text: str) -> list[str]:
+        """Simple sentence splitting for streaming (can be enhanced with regex)."""
+        sentences = [s.strip() for s in text.split('.') if s.strip()]
+        if not sentences:
+            sentences = [text.strip()]
+        return sentences
+    def generate_speech_blocking(self, text: str, ref_audio_path: str) -> np.ndarray:
+        """Blocking synthesis for standard endpoint."""
+        # 1. Load reference
+        reference_audio, sr = sf.read(ref_audio_path)
+        if sr != SAMPLE_RATE:
+            # Simple check/resize logic required if sample rate mismatch occurs
+            pass
+        # 2. Encode reference
+        ref_s = self.tts_model.encode_reference(reference_audio)
+        # 3. Infer full text
+        with torch.no_grad():
+            audio = self.tts_model.infer(text, ref_s, speed=1.0)
+        return audio.cpu().numpy()
+    def stream_speech_blocking(self, text: str, ref_audio_path: str, speed: float, audio_format: str) -> Generator[bytes, None, None]:
+        """Sentence-by-Sentence Streaming (Blocking)."""
+        logger.info(f"Starting streaming synthesis for text length: {len(text)}")
+        # 1. Load reference audio (ONLY ONCE)
+        reference_audio, sr = sf.read(ref_audio_path)
+        # 2. Encode reference (ONLY ONCE)
+        ref_s = self.tts_model.encode_reference(reference_audio)
+        # 3. Split text
+        sentences = self._split_text_into_chunks(text)
+        # 4. Stream chunks
+        for i, sentence in enumerate(sentences):
+            if not sentence.strip():
+                continue
+            logger.debug(f"Generating streaming chunk {i+1}: '{sentence[:30]}...'")
+            # Infer sentence
+            with torch.no_grad():
+                audio_chunk = self.tts_model.infer(sentence, ref_s, speed=speed)
+            # Convert and yield
+            yield self._convert_to_streamable_format(audio_chunk.cpu().numpy(), audio_format)
+        logger.info("Streaming synthesis complete.")
+# --- Asynchronous Offloading ---
+async def run_blocking_task_async(func, *args, **kwargs):
+    """Offloads a blocking function call to the ThreadPoolExecutor."""
     loop = asyncio.get_event_loop()
     return await loop.run_in_executor(
         tts_executor,
+        lambda: func(*args, **kwargs)
     )
+async def save_upload_file_async(upload_file: UploadFile) -> str:
+    """Asynchronously saves the UploadFile to disk."""
+    temp_filename = os.path.join(TEMP_AUDIO_DIR, f"{time.time()}_{upload_file.filename}")
     try:
+        # Use asyncio to read the file chunks in a non-blocking manner
+        async with aiofiles.open(temp_filename, 'wb') as out_file:
+            while content := await upload_file.read(1024 * 1024):
+                await out_file.write(content)
+        return temp_filename
     except Exception as e:
+        logger.error(f"Error saving file: {e}")
+        raise HTTPException(status_code=500, detail="Could not save reference audio file")
+# --- FastAPI Lifespan Manager (Kokoro Feature) ---
 @asynccontextmanager
 async def lifespan(app: FastAPI):
+    """Modern lifespan management: initialize model on startup, shutdown executor."""
     try:
+        app.state.tts_wrapper = NeuTTSWrapper(device=DEVICE)
     except Exception as e:
+        logger.error(f"Fatal startup error: {e}")
+        # Terminate the application if the model can't load
+        tts_executor.shutdown(wait=False)
+        raise RuntimeError("Model initialization failed.")
+    yield # Application serves requests
+    # Shutdown
+    logger.info("Shutting down ThreadPoolExecutor.")
     tts_executor.shutdown(wait=False)
+# --- FastAPI Application Setup ---
 app = FastAPI(
+    title="NeuTTS Air Instant Cloning API",
+    version="2.0.0-PROD-ENHANCED",
+    docs_url="/docs",
     lifespan=lifespan
 )
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_methods=["*"],
     allow_headers=["*"],
 )
+# --- New Endpoints and Enhancements ---
 @app.get("/")
 async def root():
+    return {"message": "NeuTTS Air API v2.0 - Ready for Instant Voice Cloning"}
+@app.get("/health")
+async def health_check():
+    """Enhanced health check (Kokoro Feature + Original Metrics)"""
+    mem = psutil.virtual_memory()
+    disk = psutil.disk_usage('/')
     return {
         "status": "healthy",
+        "model_loaded": hasattr(app.state, 'tts_wrapper') and app.state.tts_wrapper.tts_model is not None,
+        "device": DEVICE,
+        "concurrency_limit": MAX_WORKERS,
+        "memory_usage": {
+            "total_gb": round(mem.total / (1024**3), 2),
+            "used_percent": mem.percent
+        },
+        "disk_usage": {
+            "total_gb": round(disk.total / (1024**3), 2),
+            "used_percent": disk.percent
+        }
     }
+@app.delete("/cleanup")
+async def cleanup_files():
+    """Maintenance endpoint to remove old generated and temporary files."""
+    await run_blocking_task_async(cleanup_files_blocking)
+    return {"message": "Cleanup initiated successfully."}
+def cleanup_files_blocking():
+    """Blocking file cleanup logic (original NeuTTS feature)."""
+    now = time.time()
+    deleted_count = 0
+    for directory in [GENERATED_AUDIO_DIR, TEMP_AUDIO_DIR]:
+        for filename in os.listdir(directory):
+            filepath = os.path.join(directory, filename)
+            if os.path.isfile(filepath):
+                try:
+                    # Original cleanup logic: delete if older than CLEANUP_THRESHOLD
+                    if now - os.path.getctime(filepath) > CLEANUP_THRESHOLD:
+                        os.remove(filepath)
+                        deleted_count += 1
+                except Exception as e:
+                    logger.warning(f"Failed to delete {filepath}: {e}")
+    logger.info(f"Cleanup completed: {deleted_count} files removed.")
+    return deleted_count
+# --- Core Synthesis Endpoints ---
+@app.post("/synthesize", response_class=Response)
+async def text_to_speech(
+    text: str = Form(...),
+    speed: float = Form(1.0, ge=0.5, le=2.0),
+    output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
+    reference_audio: UploadFile = File(...)
 ):
     """
+    Standard blocking TTS endpoint with Multi-Format Output (Kokoro Feature).
+    Uses ThreadPoolExecutor for non-blocking API responsiveness.
     """
+    if not hasattr(app.state, 'tts_wrapper'):
+        raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
+    # 1. Asynchronously save reference audio
+    temp_ref_path = await save_upload_file_async(reference_audio)
     start_time = time.time()
     try:
+        # 2. Offload the ENTIRE blocking process (encode + infer) to a thread
+        audio_data = await run_blocking_task_async(
+            app.state.tts_wrapper.generate_speech_blocking,
+            text,
+            temp_ref_path
+        )
+        # 3. Convert to requested format (Blocking, but usually fast)
+        audio_bytes = await run_blocking_task_async(
+            app.state.tts_wrapper._convert_to_streamable_format,
+            audio_data,
+            output_format
+        )
+        # 4. Save to disk (Original NeuTTS requirement)
+        audio_filename = f"tts_{time.time()}.{output_format}"
+        final_path = os.path.join(GENERATED_AUDIO_DIR, audio_filename)
+        # We perform the file write operation in a blocking manner inside the thread pool.
+        await run_blocking_task_async(
+            lambda: open(final_path, 'wb').write(audio_bytes)
+        )
         processing_time = time.time() - start_time
+        audio_duration = len(audio_data) / SAMPLE_RATE
+        return Response(
+            content=audio_bytes,
+            media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
             headers={
+                "Content-Disposition": f"attachment; filename={audio_filename}",
+                "X-Processing-Time": f"{processing_time:.2f}s",
+                "X-Audio-Duration": f"{audio_duration:.2f}s"
             }
         )
     except Exception as e:
+        logger.error(f"Synthesis error: {e}")
+        raise HTTPException(status_code=500, detail=f"Synthesis failed: {e}")
+    finally:
+        # 5. Clean up the temporary reference file
+        if os.path.exists(temp_ref_path):
+            os.unlink(temp_ref_path)
 @app.post("/synthesize/stream")
+async def stream_text_to_speech_cloning(
+    text: str = Form(..., min_length=1, max_length=5000), # Increased limit for streaming
+    speed: float = Form(1.0, ge=0.5, le=2.0),
+    output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"), # MP3 is best for streaming
+    reference_audio: UploadFile = File(...)
+):
     """
+    Sentence-by-Sentence Streaming Endpoint (Kokoro Feature adaptation).
+    Performs encoding once, then synthesizes and streams chunks.
     """
+    if not hasattr(app.state, 'tts_wrapper'):
+        raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
+    # 1. Asynchronously save reference audio (non-blocking)
+    temp_ref_path = await save_upload_file_async(reference_audio)
+    # 2. Define the generator function, which will run in the thread pool implicitly
+    def stream_generator():
+        try:
+            # The entire streaming process runs blocking inside the thread pool
+            for chunk_bytes in app.state.tts_wrapper.stream_speech_blocking(
+                text,
+                temp_ref_path,
+                speed,
+                output_format
+            ):
+                yield chunk_bytes
+        except Exception as e:
+            logger.error(f"Streaming generator error: {e}")
+            # Raise an exception if necessary, though it might break the stream
+        finally:
+            # 3. Cleanup the temporary reference file after the stream is done
+            if os.path.exists(temp_ref_path):
+                os.unlink(temp_ref_path)
+    # The StreamingResponse handles the transfer encoding and chunking
+    return StreamingResponse(
+        stream_generator(),
+        media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
+        headers={
+            "Content-Disposition": "attachment; filename=tts_live_stream.mp3",
+            "Transfer-Encoding": "chunked",
+            "Cache-Control": "no-cache"
+        }
+    )
 @app.get("/audio/{filename}")
+async def get_audio(filename: str):
+    """Original NeuTTS feature to serve generated audio files."""
+    file_path = os.path.join(GENERATED_AUDIO_DIR, filename)
     if not os.path.exists(file_path):
         raise HTTPException(status_code=404, detail="Audio file not found")
+    return Response(
+        content=open(file_path, "rb").read(),
+        media_type=f"audio/{filename.split('.')[-1]}", # Simple media type detection
+        headers={"Content-Disposition": f"attachment; filename={filename}"}
     )