Spaces:
Paused
Paused
import asyncio
import gc
import io
import logging
import os
import re
import sys
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from contextlib import asynccontextmanager
from typing import Any, AsyncGenerator, Dict, Generator, List, Optional

import aiofiles
import numpy as np
import psutil
import soundfile as sf
import torch
from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, FileResponse, StreamingResponse, Response
from pydantic import BaseModel, Field
# Point both Hugging Face cache variables at the application's cache directory.
os.environ.update({
    'HF_HOME': '/app/cache',
    'HUGGINGFACE_HUB_CACHE': '/app/cache',
})

# Add NeuTTS Air to path
sys.path.append("neutts-air")

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Device detection and optimization
def get_best_device():
    """Return "cuda" when torch reports an available CUDA device, else "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"
DEVICE = get_best_device()
# One executor worker on CPU, two on any other device.
MAX_WORKERS = 2 if DEVICE != "cpu" else 1
tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)

# Global model instance
tts_model = None  # assigned by load_tts_model()
model_loading = False  # True while load_tts_model() is in progress
# Pydantic models
class TTSRequest(BaseModel):
    """Parameters for a non-streaming synthesis request.

    The reference audio is identified by a path on the server rather than an
    uploaded file.
    """

    # Text to synthesize (1-5000 characters).
    text: str = Field(..., min_length=1, max_length=5000)
    # Transcript of the reference audio (1-1000 characters).
    reference_text: str = Field(..., min_length=1, max_length=1000)
    # Server-side path of the reference audio; optional at the model level.
    reference_audio_path: Optional[str] = Field(default=None)
    # Requested output container; handlers treat anything but mp3/flac as wav.
    output_format: str = "wav"
    # Speed factor, constrained to 0.5-2.0.
    speed: float = Field(1.0, ge=0.5, le=2.0)
class StreamingRequest(BaseModel):
    """Parameters for the streaming synthesis endpoints."""

    # Text to synthesize (1-5000 characters).
    text: str = Field(..., min_length=1, max_length=5000)
    # Transcript of the reference audio (1-1000 characters).
    reference_text: str = Field(..., min_length=1, max_length=1000)
    # Server-side path of the reference audio (required).
    reference_audio_path: str = Field(...)
    # Speed factor, constrained to 0.5-2.0.
    speed: float = Field(1.0, ge=0.5, le=2.0)
    # Byte size of each streamed slice (512-8192); read by the legacy endpoint.
    chunk_size: int = Field(2048, ge=512, le=8192)
class TTSResponse(BaseModel):
    """JSON result returned when synthesized audio is saved as a WAV file."""

    # Whether synthesis succeeded.
    success: bool
    # Relative URL of the generated file (handlers use "/audio/<filename>").
    audio_url: Optional[str] = Field(default=None)
    # Human-readable status message.
    message: Optional[str] = Field(default=None)
    # Wall-clock processing time in seconds.
    processing_time: Optional[float] = Field(default=None)
    # Duration of the generated audio in seconds.
    audio_duration: Optional[float] = Field(default=None)
class HealthResponse(BaseModel):
    """Payload returned by the health check."""

    # "healthy" or "degraded".
    status: str
    # True once the global TTS model has been loaded.
    model_loaded: bool
    # Device name chosen at start-up ("cuda" or "cpu").
    device: str
    # Numeric memory statistics keyed by metric name.
    memory_usage: Dict[str, float]
    # Numeric disk statistics keyed by metric name.
    disk_usage: Dict[str, float]
    # Advertises streaming support; always defaults to True.
    streaming_supported: bool = Field(default=True)
def load_tts_model():
    """Load NeuTTS Air into the module-level ``tts_model`` and warm it up.

    Returns immediately when a model is already loaded or ``model_loading``
    is set. The model is first built on ``DEVICE``; if that raises, it is
    built again on the CPU. ``model_loading`` is cleared on every exit path.

    Raises:
        Exception: whatever the import or the fallback construction raised,
            after it has been logged.
    """
    global tts_model, model_loading

    nothing_to_do = tts_model is not None or model_loading
    if nothing_to_do:
        return

    model_loading = True
    try:
        logger.info(f"Loading NeuTTS Air model on {DEVICE}...")

        # Try to import with fallbacks
        try:
            from neuttsair.neutts import NeuTTSAir
        except ImportError as import_err:
            logger.error(f"Failed to import NeuTTS Air: {import_err}")
            # Try alternative import path
            sys.path.insert(0, "/app/neutts-air")
            from neuttsair.neutts import NeuTTSAir

        def build_on(target_device):
            # Backbone and codec are always placed on the same device.
            return NeuTTSAir(
                backbone_repo="neuphonic/neutts-air",
                backbone_device=target_device,
                codec_repo="neuphonic/neucodec",
                codec_device=target_device,
            )

        # Use appropriate device with fallback
        try:
            tts_model = build_on(DEVICE)
        except Exception as load_err:
            logger.warning(f"Failed to load on {DEVICE}, falling back to CPU: {load_err}")
            tts_model = build_on("cpu")

        # Warm up the model
        warm_up_model()
        logger.info("NeuTTS Air model loaded successfully!")
    except Exception as fatal_err:
        logger.error(f"Failed to load model: {str(fatal_err)}")
        raise
    finally:
        model_loading = False
def warm_up_model():
    """Warm up the model with a short inference.

    Writes a one-second 440 Hz sine wave to ``temp_audio/warmup_ref.wav``,
    encodes it as the reference and runs a single inference. Failures are
    logged as warnings and never propagate. The temporary WAV is removed in
    all cases, including when encoding or inference raises.
    """
    if tts_model is None:
        return

    temp_dir = "temp_audio"
    warmup_audio_path = os.path.join(temp_dir, "warmup_ref.wav")
    try:
        logger.info("Warming up model...")

        # Create a temporary warm-up audio file
        os.makedirs(temp_dir, exist_ok=True)

        # Generate a simple sine wave as warm-up reference
        import scipy.io.wavfile as wavfile

        # Create 1 second of 440Hz sine wave, scaled to 16-bit PCM
        sample_rate = 24000
        t = np.linspace(0, 1, sample_rate)
        audio_data = 0.3 * np.sin(2 * np.pi * 440 * t)
        audio_data = (audio_data * 32767).astype(np.int16)
        wavfile.write(warmup_audio_path, sample_rate, audio_data)

        # Perform warm-up inference
        ref_codes = tts_model.encode_reference(warmup_audio_path)
        wav = tts_model.infer("Hello, this is a warm-up.", ref_codes, "Hello warm up")

        logger.info(f"Model warm-up completed! Generated audio length: {len(wav)}")
    except Exception as e:
        logger.warning(f"Model warm-up failed: {e}")
    finally:
        # Clean up even when the warm-up failed part-way through.
        try:
            os.remove(warmup_audio_path)
        except FileNotFoundError:
            pass
        except OSError as cleanup_error:
            logger.warning(f"Could not remove warm-up file: {cleanup_error}")
def validate_audio_file(audio_path: str):
    """Validate a reference audio file and return its duration in seconds.

    Checks, in order: the file exists, it is at most 10 MB, its duration is
    between 3 and 15 seconds, and its RMS level is at least 0.01. Stereo
    input and sample rates outside 16000-44100 Hz only produce warnings.

    Args:
        audio_path: Path of the audio file to inspect.

    Returns:
        The duration reported by ``librosa.get_duration``.

    Raises:
        ValueError: for any failed check or any other error raised while
            loading the file. The original exception is attached as the cause.
    """
    try:
        import librosa

        # Check file exists
        if not os.path.exists(audio_path):
            raise ValueError("Audio file not found")

        # Check file size (roughly 10MB limit)
        file_size = os.path.getsize(audio_path) / (1024 * 1024)  # MB
        if file_size > 10:
            raise ValueError(f"Audio file too large: {file_size:.1f}MB. Maximum 10MB allowed.")

        # Load at the native sample rate, keeping the channels separate
        audio_data, sample_rate = librosa.load(audio_path, sr=None, mono=False)
        audio_duration = librosa.get_duration(y=audio_data, sr=sample_rate)

        # Enhanced validation rules based on NeuTTS Air specifications
        if audio_duration < 3 or audio_duration > 15:
            raise ValueError(f"Audio duration ({audio_duration:.1f}s) must be between 3-15 seconds for optimal voice cloning")

        if len(audio_data.shape) > 1 and audio_data.shape[0] > 1:
            logger.warning("Stereo audio detected. For best results, use mono audio")
            # Convert to mono by averaging channels (only affects the RMS check below)
            audio_data = np.mean(audio_data, axis=0)

        if sample_rate < 16000 or sample_rate > 44100:
            logger.warning(f"Sample rate {sample_rate}Hz should ideally be between 16-44kHz")

        # Check for sufficient audio quality (basic RMS check)
        rms = np.sqrt(np.mean(audio_data**2))
        if rms < 0.01:  # Too quiet
            raise ValueError("Audio signal is too quiet. Please use a clearer recording.")

        logger.info(f"Audio validation passed: {audio_duration:.1f}s, {sample_rate}Hz")
        return audio_duration
    except Exception as e:
        logger.error(f"Audio validation failed: {str(e)}")
        # Chain the cause so the underlying failure stays visible in tracebacks.
        raise ValueError(f"Invalid audio file: {str(e)}") from e
def intelligent_text_chunking(text: str) -> List[str]:
    """Split text into short pieces for sequential synthesis.

    Whitespace is collapsed first. The text is split after ``.``, ``!`` or
    ``?``. Sentences longer than 100 characters are split after ``,``, ``;``
    or ``:``. Clauses longer than 80 characters are packed word by word into
    pieces of roughly 80 characters. If nothing is produced, the normalized
    text is returned as the single chunk.
    """
    # Clean and normalize text
    normalized = re.sub(r'\s+', ' ', text.strip())

    pieces = []
    for raw_sentence in re.split(r'(?<=[.!?])\s+', normalized):
        sentence = raw_sentence.strip()
        if not sentence:
            continue

        # Short sentences are emitted unchanged.
        if len(sentence) <= 100:
            pieces.append(sentence)
            continue

        # Long sentence: split by clauses (commas, semicolons, colons)
        for raw_clause in re.split(r'(?<=[,;:])\s+', sentence):
            clause = raw_clause.strip()
            if not clause:
                continue

            if len(clause) <= 80:
                pieces.append(clause)
                continue

            # Clause is still long: pack words until the running length
            # (each word counted with one separator) would pass 80.
            pending_words = []
            pending_length = 0
            for word in clause.split():
                overflow = pending_length + len(word) + 1 > 80
                if overflow and pending_words:
                    pieces.append(' '.join(pending_words))
                    pending_words = [word]
                    pending_length = len(word)
                else:
                    pending_words.append(word)
                    pending_length += len(word) + 1
            if pending_words:
                pieces.append(' '.join(pending_words))

    # Ensure we have at least one chunk
    if not pieces:
        pieces = [normalized]

    logger.info(f"Split text into {len(pieces)} chunks for streaming")
    return pieces
async def generate_chunk_audio(chunk_text: str, ref_codes: Any, reference_text: str, speed: float) -> np.ndarray:
    """Run ``tts_model.infer`` for one text chunk on the TTS thread pool.

    ``speed`` is accepted but is not forwarded to ``tts_model.infer``.
    """
    running_loop = asyncio.get_running_loop()
    inference = running_loop.run_in_executor(
        tts_executor,
        tts_model.infer,
        chunk_text,
        ref_codes,
        reference_text,
    )
    return await inference
async def convert_chunk_to_mp3(audio_chunk: np.ndarray) -> bytes:
    """Encode an audio chunk as MP3 at 24000 Hz on the TTS thread pool."""

    def encode_to_mp3():
        # soundfile writes the encoded stream into the in-memory buffer.
        sink = io.BytesIO()
        sf.write(sink, audio_chunk, 24000, format='mp3')
        return sink.getvalue()

    running_loop = asyncio.get_running_loop()
    return await running_loop.run_in_executor(tts_executor, encode_to_mp3)
def generate_silent_mp3_header(duration_ms: int = 100) -> bytes:
    """Return MP3 bytes holding ``duration_ms`` milliseconds of silence at 24000 Hz."""
    sample_count = int(24000 * duration_ms / 1000)
    silence = np.zeros(sample_count)

    sink = io.BytesIO()
    sf.write(sink, silence, 24000, format='mp3')
    return sink.getvalue()
async def true_realtime_generator(
    text: str,
    ref_codes: Any,
    reference_text: str,
    speed: float = 1.0
) -> AsyncGenerator[bytes, None]:
    """Yield MP3 data chunk by chunk as each piece of text is synthesized.

    A short silent MP3 is yielded first. The text is then split with
    ``intelligent_text_chunking``, and each chunk is synthesized, encoded to
    MP3 and yielded in order. A chunk that fails is logged and skipped, so
    the stream continues with the next one.

    Args:
        text: Full text to synthesize.
        ref_codes: Encoded reference audio passed to the model.
        reference_text: Transcript of the reference audio.
        speed: Passed to ``generate_chunk_audio``, which does not forward it
            to the model.

    Yields:
        MP3-encoded bytes: the silent header, then one item per
        successfully processed chunk.
    """
    start_time = time.time()
    try:
        logger.info("Starting TRUE real-time streaming generation...")

        # Step 1: Send MP3 header for immediate browser playback
        header_data = generate_silent_mp3_header()
        yield header_data
        logger.info("Sent MP3 header for immediate playback")

        # Step 2: Intelligent text chunking
        text_chunks = intelligent_text_chunking(text)
        total_chunks = len(text_chunks)
        logger.info(f"Processing {total_chunks} text chunks sequentially")

        # Step 3: Process each chunk in sequence with immediate streaming
        successful_chunks = 0
        for chunk_index, chunk_text in enumerate(text_chunks, 1):
            if not chunk_text.strip():
                continue

            chunk_start_time = time.time()
            logger.info(f"Processing chunk {chunk_index}/{total_chunks}: '{chunk_text[:50]}...'")
            try:
                # Generate audio for this specific chunk
                chunk_audio = await generate_chunk_audio(chunk_text, ref_codes, reference_text, speed)
                # Convert to MP3 immediately
                mp3_data = await convert_chunk_to_mp3(chunk_audio)
                # Stream the MP3 chunk immediately
                yield mp3_data

                chunk_processing_time = time.time() - chunk_start_time
                successful_chunks += 1
                logger.info(f"✓ Streamed chunk {chunk_index}/{total_chunks} in {chunk_processing_time:.2f}s, size: {len(mp3_data)} bytes")

                # Small delay to ensure smooth streaming (optional)
                await asyncio.sleep(0.01)
            except Exception as chunk_error:
                logger.error(f"✗ Failed to process chunk {chunk_index}: {chunk_error}")
                # Continue with next chunk instead of failing entirely
                continue

        total_processing_time = time.time() - start_time
        logger.info(f"TRUE real-time streaming completed: {successful_chunks}/{total_chunks} chunks in {total_processing_time:.2f}s")
    except Exception as e:
        logger.error(f"TRUE real-time streaming generator failed: {e}")
        raise
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: load the model at start-up, clean up at shutdown.

    Start-up fails (the exception is re-raised) when the model cannot be
    loaded. At shutdown the TTS executor is stopped without waiting and old
    audio files are removed. A cleanup failure is logged rather than raised.
    """
    try:
        load_tts_model()
        logger.info(f"✅ NeuTTS Air model loaded on {DEVICE}")
    except Exception as e:
        logger.error(f"❌ Model loading failed: {e}")
        raise

    try:
        yield
    finally:
        # Cleanup
        tts_executor.shutdown(wait=False)
        # Clean up temporary files; cleanup_audio_files raises HTTPException
        # on failure, which has no meaning outside a request.
        try:
            await cleanup_audio_files()
        except Exception as cleanup_error:
            logger.warning(f"Shutdown cleanup failed: {cleanup_error}")
# Application object; the model is loaded and released through `lifespan`.
app = FastAPI(
    title="NeuTTS Air API - Enhanced",
    description="High-quality on-device Text-to-Speech with instant voice cloning and TRUE real-time streaming",
    version="2.1.0",
    docs_url="/docs",
    lifespan=lifespan,
)

# CORS middleware
# NOTE(review): wildcard origins are combined with allow_credentials=True;
# confirm this is intended before exposing the service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
async def run_tts_async(text: str, ref_codes: Any, reference_text: str, speed: float = 1.0):
    """Run the blocking ``tts_model.infer`` call on the TTS thread pool.

    ``speed`` is accepted but is not forwarded to ``tts_model.infer``.
    """
    running_loop = asyncio.get_running_loop()
    inference = running_loop.run_in_executor(
        tts_executor,
        tts_model.infer,
        text,
        ref_codes,
        reference_text,
    )
    return await inference
def encode_reference_async(audio_path: str):
    """Schedule ``tts_model.encode_reference`` on the TTS thread pool.

    This is a plain function: it returns the awaitable produced by
    ``run_in_executor``, which callers ``await`` to get the reference codes.
    """
    event_loop = asyncio.get_event_loop()
    pending_encoding = event_loop.run_in_executor(
        tts_executor,
        tts_model.encode_reference,
        audio_path,
    )
    return pending_encoding
# NOTE(review): no route decorator is visible above this handler in this
# view — confirm it is registered on `app`.
async def root():
    """Return a static description of the API: message, status, version, features."""
    feature_names = [
        "voice_cloning",
        "true_realtime_streaming",
        "line_by_line_processing",
        "multiple_formats",
        "production_ready"
    ]
    api_info = {
        "message": "Enhanced NeuTTS Air API with TRUE Real-time Streaming!",
        "status": "healthy",
        "version": "2.1.0",
        "features": feature_names,
    }
    return api_info
# NOTE(review): no route decorator is visible above this handler in this
# view — confirm it is registered on `app`.
async def health_check():
    """Report service health together with memory and disk statistics.

    Returns:
        A ``HealthResponse`` with status "healthy" and psutil-derived
        metrics, or status "degraded" with empty metric dictionaries when
        the metrics cannot be collected.
    """
    model_loaded = tts_model is not None
    try:
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage('/')
        return HealthResponse(
            status="healthy",
            model_loaded=model_loaded,
            device=DEVICE,
            memory_usage={
                "total_gb": round(memory.total / (1024**3), 2),
                "available_gb": round(memory.available / (1024**3), 2),
                "used_percent": round(memory.percent, 2)
            },
            disk_usage={
                "total_gb": round(disk.total / (1024**3), 2),
                "free_gb": round(disk.free / (1024**3), 2),
                "used_percent": round(disk.percent, 2)
            }
        )
    except Exception as e:
        # The metric fields are Dict[str, float], so an error string cannot
        # be stored in them; log the cause and return empty metrics instead.
        logger.error(f"Health check could not collect system metrics: {e}")
        return HealthResponse(
            status="degraded",
            model_loaded=model_loaded,
            device=DEVICE,
            memory_usage={},
            disk_usage={}
        )
# NOTE(review): no route decorator is visible above this handler in this
# view — confirm it is registered on `app`.
async def synthesize_speech(
    reference_text: str = Form(..., min_length=1, max_length=1000),
    text: str = Form(..., min_length=1, max_length=5000),
    reference_audio: UploadFile = File(...),
    output_format: str = Form("wav"),
    speed: float = Form(1.0)
):
    """Synthesize speech from an uploaded reference recording.

    The upload is saved to a uniquely named temporary file, validated,
    encoded as the reference and used for inference. "mp3" and "flac" are
    returned inline as the response body. Any other format is written as a
    WAV file under ``generated_audio`` and described by a ``TTSResponse``.

    Raises:
        HTTPException: 503 when the model is not loaded, 400 when a
            ``ValueError`` is raised (e.g. failed audio validation), 500 for
            any other failure.
    """
    start_time = time.time()
    if tts_model is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    temp_ref_path = None
    try:
        # Save uploaded file temporarily. The name is random so concurrent
        # requests arriving in the same second cannot overwrite each other.
        temp_dir = "temp_audio"
        os.makedirs(temp_dir, exist_ok=True)
        # The upload may carry no filename at all.
        file_extension = os.path.splitext(reference_audio.filename or "")[1] or ".wav"
        temp_ref_path = os.path.join(temp_dir, f"ref_{uuid.uuid4().hex}{file_extension}")
        async with aiofiles.open(temp_ref_path, 'wb') as out_file:
            content = await reference_audio.read()
            await out_file.write(content)

        # Enhanced audio validation (raises ValueError on rejection)
        validate_audio_file(temp_ref_path)

        # Perform TTS
        logger.info(f"Starting synthesis for text: {text[:50]}...")
        # Encode reference and generate speech asynchronously
        ref_codes = await encode_reference_async(temp_ref_path)
        wav = await run_tts_async(text, ref_codes, reference_text, speed)

        processing_time = time.time() - start_time
        output_audio_duration = len(wav) / 24000
        logger.info(f"Synthesis completed in {processing_time:.2f}s")

        # Handle different output formats
        requested_format = output_format.lower()
        if requested_format in ["mp3", "flac"]:
            audio_buffer = io.BytesIO()
            if requested_format == "mp3":
                sf.write(audio_buffer, wav, 24000, format='mp3')
                media_type = "audio/mpeg"
            else:
                sf.write(audio_buffer, wav, 24000, format='flac')
                media_type = "audio/flac"
            audio_buffer.seek(0)
            return Response(
                content=audio_buffer.read(),
                media_type=media_type,
                headers={
                    "Content-Disposition": f"attachment; filename=cloned_speech.{output_format}",
                    "X-Processing-Time": str(round(processing_time, 2)),
                    "X-Audio-Duration": str(round(output_audio_duration, 2))
                }
            )

        # Default WAV format, saved under a collision-free name
        output_dir = "generated_audio"
        os.makedirs(output_dir, exist_ok=True)
        output_filename = f"output_{uuid.uuid4().hex}.wav"
        output_path = os.path.join(output_dir, output_filename)
        sf.write(output_path, wav, 24000)
        return TTSResponse(
            success=True,
            audio_url=f"/audio/{output_filename}",
            message="Speech synthesized successfully",
            processing_time=round(processing_time, 2),
            audio_duration=round(output_audio_duration, 2)
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error(f"Synthesis error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
    finally:
        # Clean up temporary file (best effort, but report real failures)
        if temp_ref_path and os.path.exists(temp_ref_path):
            try:
                os.remove(temp_ref_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temporary file {temp_ref_path}: {cleanup_error}")
# NOTE(review): no route decorator is visible above this handler in this
# view — confirm it is registered on `app`.
async def true_realtime_synthesis(request: StreamingRequest):
    """Stream synthesized speech as MP3 while the text is processed chunk by chunk.

    The reference audio is validated and encoded once. The response body is
    then produced by ``true_realtime_generator``.

    Raises:
        HTTPException: 503 when the model is not loaded, 400 when the
            reference path is missing or a ``ValueError`` is raised (e.g.
            failed validation), 500 for any other failure before streaming
            starts.
    """
    if tts_model is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    try:
        # NOTE(review): reference_audio_path comes from the request body and
        # is opened on the server as-is — confirm callers are trusted.
        if not os.path.exists(request.reference_audio_path):
            raise HTTPException(status_code=400, detail="Reference audio path not found")
        validate_audio_file(request.reference_audio_path)

        # Encode reference asynchronously (this happens once at the start)
        ref_codes = await encode_reference_async(request.reference_audio_path)

        start_time = time.time()
        return StreamingResponse(
            true_realtime_generator(
                text=request.text,
                ref_codes=ref_codes,
                reference_text=request.reference_text,
                speed=request.speed
            ),
            media_type="audio/mpeg",
            headers={
                "Content-Disposition": "attachment; filename=realtime_speech.mp3",
                "Transfer-Encoding": "chunked",
                "X-Streaming-Type": "true-realtime-line-by-line",
                "X-First-Chunk-ETA": "2-3s",
                "Cache-Control": "no-cache",
                "X-Start-Time": str(start_time)
            }
        )
    except HTTPException:
        # Keep the deliberate status code (e.g. the 400 above) instead of
        # letting the generic handler below turn it into a 500.
        raise
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error(f"TRUE real-time streaming error: {e}")
        raise HTTPException(status_code=500, detail=f"TRUE real-time streaming failed: {str(e)}")
# Legacy streaming endpoint (fake streaming) for backward compatibility
# NOTE(review): no route decorator is visible above this handler in this
# view — confirm it is registered on `app`.
async def legacy_stream_synthesis(request: StreamingRequest):
    """Generate the complete audio first, then stream it in fixed-size slices.

    Kept for backward compatibility. The original docstring points to
    /synthesize/true-realtime for real streaming.

    Raises:
        HTTPException: 503 when the model is not loaded, 400 when the
            reference path is missing or a ``ValueError`` is raised (e.g.
            failed validation), 500 for any other failure before streaming
            starts.
    """
    if tts_model is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    try:
        if not os.path.exists(request.reference_audio_path):
            raise HTTPException(status_code=400, detail="Reference audio path not found")
        validate_audio_file(request.reference_audio_path)
        ref_codes = await encode_reference_async(request.reference_audio_path)

        # Legacy approach: generate complete audio then chunk
        def legacy_stream_generator():
            wav = tts_model.infer(request.text, ref_codes, request.reference_text)
            audio_buffer = io.BytesIO()
            sf.write(audio_buffer, wav, 24000, format='mp3')
            audio_data = audio_buffer.getvalue()
            # Stream in slices of the client-requested size
            chunk_size = request.chunk_size
            for offset in range(0, len(audio_data), chunk_size):
                yield audio_data[offset:offset + chunk_size]

        return StreamingResponse(
            legacy_stream_generator(),
            media_type="audio/mpeg",
            headers={
                "Content-Disposition": "attachment; filename=legacy_stream.mp3",
                "X-Streaming-Type": "legacy-chunked"
            }
        )
    except HTTPException:
        # Keep the deliberate status code instead of converting it to a 500.
        raise
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error(f"Legacy streaming error: {e}")
        raise HTTPException(status_code=500, detail=f"Legacy streaming failed: {str(e)}")
# NOTE(review): no route decorator is visible above this handler in this
# view — confirm it is registered on `app`.
async def get_audio_file(filename: str):
    """Serve a generated audio file from the ``generated_audio`` directory.

    Args:
        filename: Bare file name. Anything containing a path component is
            rejected.

    Raises:
        HTTPException: 404 when the name is not a plain file name or no
            such regular file exists.
    """
    # The name comes from the client: refuse anything that could address a
    # location other than a direct child of the output directory.
    is_plain_name = filename not in ("", ".", "..") and os.path.basename(filename) == filename
    if not is_plain_name:
        raise HTTPException(status_code=404, detail="Audio file not found")

    file_path = os.path.join("generated_audio", filename)
    # isfile (not exists) so that directories are never handed to FileResponse.
    if not os.path.isfile(file_path):
        raise HTTPException(status_code=404, detail="Audio file not found")

    return FileResponse(
        file_path,
        media_type="audio/wav",
        filename=f"cloned_speech_{filename}"
    )
# NOTE(review): no route decorator is visible above this handler in this
# view — confirm it is registered on `app`.
async def synthesize_with_url(request: TTSRequest):
    """Synthesize speech using a reference audio file already on the server.

    "mp3" and "flac" are returned inline as the response body. Any other
    format is written as a WAV file under ``generated_audio`` and described
    by a ``TTSResponse``.

    Raises:
        HTTPException: 503 when the model is not loaded, 400 when the
            reference path is missing or a ``ValueError`` is raised (e.g.
            failed validation), 500 for any other failure.
    """
    start_time = time.time()
    if tts_model is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    # NOTE(review): reference_audio_path comes from the request and is opened
    # on the server as-is — confirm callers are trusted.
    if not request.reference_audio_path or not os.path.exists(request.reference_audio_path):
        raise HTTPException(status_code=400, detail="Reference audio path not found")

    try:
        validate_audio_file(request.reference_audio_path)

        # Perform TTS asynchronously
        logger.info(f"Starting synthesis for text: {request.text[:50]}...")
        ref_codes = await encode_reference_async(request.reference_audio_path)
        wav = await run_tts_async(request.text, ref_codes, request.reference_text, request.speed)

        processing_time = time.time() - start_time
        audio_duration = len(wav) / 24000

        # Handle output format
        requested_format = request.output_format.lower()
        if requested_format in ["mp3", "flac"]:
            audio_buffer = io.BytesIO()
            if requested_format == "mp3":
                sf.write(audio_buffer, wav, 24000, format='mp3')
                media_type = "audio/mpeg"
            else:
                sf.write(audio_buffer, wav, 24000, format='flac')
                media_type = "audio/flac"
            audio_buffer.seek(0)
            return Response(
                content=audio_buffer.read(),
                media_type=media_type,
                headers={
                    "Content-Disposition": f"attachment; filename=cloned_speech.{request.output_format}",
                    "X-Processing-Time": str(round(processing_time, 2)),
                    "X-Audio-Duration": str(round(audio_duration, 2))
                }
            )

        # Save as WAV under a collision-free name: second-resolution
        # timestamps let concurrent requests overwrite each other's output.
        output_dir = "generated_audio"
        os.makedirs(output_dir, exist_ok=True)
        output_filename = f"output_{uuid.uuid4().hex}.wav"
        output_path = os.path.join(output_dir, output_filename)
        sf.write(output_path, wav, 24000)
        return TTSResponse(
            success=True,
            audio_url=f"/audio/{output_filename}",
            message="Speech synthesized successfully",
            processing_time=round(processing_time, 2),
            audio_duration=round(audio_duration, 2)
        )
    except HTTPException:
        raise
    except ValueError as e:
        # Same mapping as the upload endpoint: a ValueError is reported as 400.
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error(f"Synthesis error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
async def cleanup_audio_files():
    """Delete old audio files and force a garbage-collection pass.

    Files in ``generated_audio`` are removed once older than one hour, and
    files in ``temp_audio`` once older than 30 minutes (age measured with
    ``os.path.getctime``). Returns a summary dictionary.

    Raises:
        HTTPException: 500 when any step of the cleanup fails.
    """
    try:
        removed = 0
        now = time.time()

        # (directory, maximum age in seconds), processed in this order
        retention_rules = (
            ("generated_audio", 3600),  # 1 hour
            ("temp_audio", 1800),       # 30 minutes for temp files
        )
        for directory, max_age in retention_rules:
            if not os.path.exists(directory):
                continue
            for entry in os.listdir(directory):
                candidate = os.path.join(directory, entry)
                if not os.path.isfile(candidate):
                    continue
                if now - os.path.getctime(candidate) > max_age:
                    os.remove(candidate)
                    removed += 1

        # Force garbage collection
        gc.collect()

        summary = {
            "message": f"Cleaned up {removed} files",
            "memory_cleaned": "true",
            "next_cleanup": "in_1_hour"
        }
        return summary
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Cleanup failed: {str(e)}")
# GET endpoint for simple synthesis
# NOTE(review): no route decorator is visible above this handler in this
# view — confirm it is registered on `app`.
async def synthesize_speech_get(
    text: str = Query(..., min_length=1, max_length=5000),
    reference_text: str = Query(..., min_length=1, max_length=1000),
    reference_audio_path: str = Query(...),
    output_format: str = Query("wav"),
    speed: float = Query(1.0, ge=0.5, le=2.0)
):
    """GET variant of synthesis: builds a ``TTSRequest`` from query parameters.

    ``speed`` carries the same 0.5-2.0 bounds as ``TTSRequest.speed``, so an
    out-of-range value is rejected while the query is parsed instead of
    raising when the model is constructed inside the handler.

    Returns:
        Whatever ``synthesize_with_url`` returns for the assembled request.
    """
    request = TTSRequest(
        text=text,
        reference_text=reference_text,
        reference_audio_path=reference_audio_path,
        output_format=output_format,
        speed=speed
    )
    return await synthesize_with_url(request)
# Script entry point: serve the app with a single uvicorn worker.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
        workers=1,
    )