Rajhuggingface4253 committed on
Commit
c2ab408
·
verified ·
1 Parent(s): cd4b7dc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +483 -215
app.py CHANGED
@@ -5,60 +5,194 @@ import gc
5
  import torch
6
  import numpy as np
7
  import aiofiles
8
- from fastapi import FastAPI, UploadFile, File, Form, HTTPException
9
- from fastapi.responses import JSONResponse, FileResponse
 
 
 
 
 
 
 
 
10
  from fastapi.middleware.cors import CORSMiddleware
11
- from pydantic import BaseModel
12
- from typing import Optional, Dict, Any
13
  import psutil
14
  import logging
 
15
 
16
  # Add NeuTTS Air to path
17
- sys.path.append("neutts-air")
18
 
19
  # Configure logging
20
- logging.basicConfig(level=logging.INFO)
 
 
 
21
  logger = logging.getLogger(__name__)
22
 
23
- app = FastAPI(
24
- title="NeuTTS Air API",
25
- description="High-quality on-device Text-to-Speech with instant voice cloning",
26
- version="1.0.0"
27
- )
 
 
 
 
 
28
 
29
- # CORS middleware
30
- app.add_middleware(
31
- CORSMiddleware,
32
- allow_origins=["*"],
33
- allow_credentials=True,
34
- allow_methods=["*"],
35
- allow_headers=["*"],
36
- )
37
 
38
- # Global model instance
39
  tts_model = None
40
  model_loading = False
 
 
41
 
42
- # Pydantic models
43
- class TTSRequest(BaseModel):
44
- text: str
45
- reference_text: str
46
- reference_audio_path: Optional[str] = None
47
 
48
- class TTSResponse(BaseModel):
49
- success: bool
50
- audio_url: Optional[str] = None
51
- message: Optional[str] = None
52
- processing_time: Optional[float] = None
53
- audio_duration: Optional[float] = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- class HealthResponse(BaseModel):
56
- status: str
57
- model_loaded: bool
58
- memory_usage: Dict[str, float]
59
- disk_usage: Dict[str, float]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
- def load_tts_model():
 
62
  global tts_model, model_loading
63
 
64
  if tts_model is not None or model_loading:
@@ -68,16 +202,19 @@ def load_tts_model():
68
  try:
69
  logger.info("Loading NeuTTS Air model...")
70
 
71
- # Try to import with fallbacks
 
 
 
 
 
72
  try:
73
  from neuttsair.neutts import NeuTTSAir
74
  except ImportError as e:
75
  logger.error(f"Failed to import NeuTTS Air: {e}")
76
- # Try alternative import path
77
- sys.path.insert(0, "/app/neutts-air")
78
- from neuttsair.neutts import NeuTTSAir
79
 
80
- # Use CPU for Hugging Face free tier with fallback models
81
  tts_model = NeuTTSAir(
82
  backbone_repo="neuphonic/neutts-air",
83
  backbone_device="cpu",
@@ -89,62 +226,140 @@ def load_tts_model():
89
 
90
  except Exception as e:
91
  logger.error(f"Failed to load model: {str(e)}")
92
- model_loading = False
93
  raise e
 
 
 
 
 
 
 
 
94
 
95
- model_loading = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
- @app.on_event("startup")
98
- async def startup_event():
99
- """Load model on startup with error handling"""
100
- try:
101
- load_tts_model()
102
- except Exception as e:
103
- logger.error(f"Startup model loading failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  @app.get("/")
106
  async def root():
107
- return {"message": "NeuTTS Air API is running!", "status": "healthy"}
 
 
 
 
 
 
108
 
109
  @app.get("/health")
110
  async def health_check():
111
- """Health check endpoint"""
112
  try:
113
  memory = psutil.virtual_memory()
114
- disk = psutil.disk_usage('/')
115
 
116
  return HealthResponse(
117
  status="healthy",
118
  model_loaded=tts_model is not None,
 
 
119
  memory_usage={
120
  "total_gb": round(memory.total / (1024**3), 2),
121
  "available_gb": round(memory.available / (1024**3), 2),
122
  "used_percent": round(memory.percent, 2)
123
- },
124
- disk_usage={
125
- "total_gb": round(disk.total / (1024**3), 2),
126
- "free_gb": round(disk.free / (1024**3), 2),
127
- "used_percent": round(disk.percent, 2)
128
  }
129
  )
130
  except Exception as e:
131
  return HealthResponse(
132
  status="degraded",
133
  model_loaded=tts_model is not None,
134
- memory_usage={"error": str(e)},
135
- disk_usage={"error": str(e)}
 
136
  )
137
 
138
- @app.post("/synthesize")
139
  async def synthesize_speech(
140
  reference_text: str = Form(...),
141
  text: str = Form(...),
142
  reference_audio: UploadFile = File(...)
143
  ):
144
  """
145
- Synthesize speech using reference audio and text
146
  """
147
  start_time = time.time()
 
 
 
148
 
149
  if tts_model is None:
150
  raise HTTPException(status_code=503, detail="Model not loaded yet")
@@ -153,182 +368,235 @@ async def synthesize_speech(
153
  if not reference_text.strip() or not text.strip():
154
  raise HTTPException(status_code=400, detail="Text fields cannot be empty")
155
 
156
- if len(text) > 1000:
157
- raise HTTPException(status_code=400, detail="Text too long. Maximum 1000 characters allowed.")
158
-
159
- temp_ref_path = None
160
  try:
161
- # Save uploaded file temporarily
162
- temp_dir = "temp_audio"
163
- os.makedirs(temp_dir, exist_ok=True)
164
-
165
- file_extension = os.path.splitext(reference_audio.filename)[1] or ".wav"
166
- temp_ref_path = os.path.join(temp_dir, f"ref_{int(time.time())}{file_extension}")
167
 
168
- async with aiofiles.open(temp_ref_path, 'wb') as out_file:
169
- content = await reference_audio.read()
170
- await out_file.write(content)
171
 
172
- # Validate audio file
 
173
  try:
174
- import librosa
175
- audio_duration = librosa.get_duration(path=temp_ref_path)
176
- if audio_duration < 2 or audio_duration > 30:
177
- raise HTTPException(
178
- status_code=400,
179
- detail=f"Audio duration ({audio_duration:.1f}s) should be between 3-15 seconds"
180
- )
181
- except Exception as e:
182
- raise HTTPException(status_code=400, detail=f"Invalid audio file: {str(e)}")
183
-
184
- # Perform TTS
185
- logger.info(f"Starting synthesis for text: {text[:50]}...")
186
-
187
- # Encode reference
188
- ref_codes = tts_model.encode_reference(temp_ref_path)
189
-
190
- # Generate speech
191
- wav = tts_model.infer(text, ref_codes, reference_text)
192
-
193
- # Save output
194
- output_dir = "generated_audio"
195
- os.makedirs(output_dir, exist_ok=True)
196
- output_filename = f"output_{int(time.time())}.wav"
197
- output_path = os.path.join(output_dir, output_filename)
198
-
199
- import soundfile as sf
200
- sf.write(output_path, wav, 24000)
201
-
202
- processing_time = time.time() - start_time
203
- audio_duration = len(wav) / 24000
204
-
205
- logger.info(f"Synthesis completed in {processing_time:.2f}s")
206
-
207
- return TTSResponse(
208
- success=True,
209
- audio_url=f"/audio/{output_filename}",
210
- message="Speech synthesized successfully",
211
- processing_time=round(processing_time, 2),
212
- audio_duration=round(audio_duration, 2)
213
- )
214
-
215
  except Exception as e:
216
- logger.error(f"Synthesis error: {str(e)}")
217
  raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
218
 
219
- finally:
220
- # Clean up temporary file
221
- if temp_ref_path and os.path.exists(temp_ref_path):
222
- try:
223
- os.remove(temp_ref_path)
224
- except:
225
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
- @app.get("/audio/{filename}")
228
- async def get_audio_file(filename: str):
229
- """Serve generated audio files"""
230
- file_path = os.path.join("generated_audio", filename)
 
 
 
 
 
 
 
231
 
232
- if not os.path.exists(file_path):
233
- raise HTTPException(status_code=404, detail="Audio file not found")
 
 
234
 
235
- return FileResponse(
236
- file_path,
237
  media_type="audio/wav",
238
- filename=f"generated_speech_{filename}"
 
 
 
239
  )
240
 
241
- @app.post("/synthesize-with-url")
242
- async def synthesize_with_url(request: TTSRequest):
 
 
 
 
243
  """
244
- Synthesize speech using a pre-uploaded reference audio file path
245
  """
246
  start_time = time.time()
247
 
248
  if tts_model is None:
249
  raise HTTPException(status_code=503, detail="Model not loaded yet")
250
 
251
- if not request.reference_audio_path or not os.path.exists(request.reference_audio_path):
252
- raise HTTPException(status_code=400, detail="Reference audio path not found")
253
-
254
  try:
255
- # Validate audio file
256
- import librosa
257
- audio_duration = librosa.get_duration(path=request.reference_audio_path)
258
- if audio_duration < 2 or audio_duration > 30:
259
- raise HTTPException(
260
- status_code=400,
261
- detail=f"Audio duration ({audio_duration:.1f}s) should be between 3-15 seconds"
262
- )
263
-
264
- # Perform TTS
265
- logger.info(f"Starting synthesis for text: {request.text[:50]}...")
266
-
267
- # Encode reference
268
- ref_codes = tts_model.encode_reference(request.reference_audio_path)
269
-
270
- # Generate speech
271
- wav = tts_model.infer(request.text, ref_codes, request.reference_text)
272
-
273
- # Save output
274
- output_dir = "generated_audio"
275
- os.makedirs(output_dir, exist_ok=True)
276
- output_filename = f"output_{int(time.time())}.wav"
277
- output_path = os.path.join(output_dir, output_filename)
278
-
279
- import soundfile as sf
280
- sf.write(output_path, wav, 24000)
281
-
282
- processing_time = time.time() - start_time
283
- audio_duration = len(wav) / 24000
284
-
285
- return TTSResponse(
286
- success=True,
287
- audio_url=f"/audio/{output_filename}",
288
- message="Speech synthesized successfully",
289
- processing_time=round(processing_time, 2),
290
- audio_duration=round(audio_duration, 2)
291
- )
292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  except Exception as e:
294
- logger.error(f"Synthesis error: {str(e)}")
295
- raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
296
 
297
- @app.delete("/cleanup")
298
- async def cleanup_audio_files():
299
- """Clean up generated audio files older than 1 hour"""
300
- try:
301
- output_dir = "generated_audio"
302
- temp_dir = "temp_audio"
303
-
304
- deleted_count = 0
305
- current_time = time.time()
306
-
307
- # Clean generated audio
308
- if os.path.exists(output_dir):
309
- for filename in os.listdir(output_dir):
310
- file_path = os.path.join(output_dir, filename)
311
- if os.path.isfile(file_path):
312
- file_age = current_time - os.path.getctime(file_path)
313
- if file_age > 3600: # 1 hour
314
- os.remove(file_path)
315
- deleted_count += 1
316
-
317
- # Clean temp audio
318
- if os.path.exists(temp_dir):
319
- for filename in os.listdir(temp_dir):
320
- file_path = os.path.join(temp_dir, filename)
321
- if os.path.isfile(file_path):
322
- file_age = current_time - os.path.getctime(file_path)
323
- if file_age > 3600: # 1 hour
324
- os.remove(file_path)
325
- deleted_count += 1
326
-
327
- return {"message": f"Cleaned up {deleted_count} files"}
328
-
329
- except Exception as e:
330
- raise HTTPException(status_code=500, detail=f"Cleanup failed: {str(e)}")
 
 
 
 
 
 
 
 
331
 
332
  if __name__ == "__main__":
333
  import uvicorn
334
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
5
  import torch
6
  import numpy as np
7
  import aiofiles
8
+ import asyncio
9
+ import subprocess
10
+ import io
11
+ from contextlib import asynccontextmanager
12
+ from typing import Optional, Dict, Any, AsyncGenerator
13
+ from uuid import uuid4
14
+ from pathlib import Path
15
+
16
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException, BackgroundTasks, Request
17
+ from fastapi.responses import JSONResponse, StreamingResponse, Response
18
  from fastapi.middleware.cors import CORSMiddleware
19
+ from pydantic import BaseModel, Field
 
20
  import psutil
21
  import logging
22
+ import soundfile as sf
23
 
24
  # Add NeuTTS Air to path
25
+ sys.path.insert(0, "/app/neutts-air")
26
 
27
  # Configure logging
28
+ logging.basicConfig(
29
+ level=logging.INFO,
30
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
31
+ )
32
  logger = logging.getLogger(__name__)
33
 
34
+ # Configuration
35
class Config:
    """Static tuning knobs for the API, read through the module-level ``config``."""

    # Text limit; presumably counted in characters -- confirm against the
    # endpoint that enforces it.
    MAX_TEXT_LENGTH: int = 1000

    # Accepted reference-audio duration window, in seconds.
    MIN_AUDIO_DURATION: int = 2
    MAX_AUDIO_DURATION: int = 30

    # Sample rate used when writing synthesized audio.
    SAMPLE_RATE: int = 24_000
    # Sample rate ffmpeg resamples the uploaded reference audio to.
    REFERENCE_SAMPLE_RATE: int = 16_000

    # Number of bytes sent per chunk when streaming a response.
    CHUNK_SIZE: int = 4096

    # Upper bound on requests handled at the same time.
    MAX_CONCURRENT_REQUESTS: int = 3
    # NOTE(review): not referenced in the visible code; presumably seconds.
    REQUEST_TIMEOUT: int = 120


config = Config()
 
 
 
 
 
 
 
46
 
47
# Global model state. ``tts_model`` stays None until load_tts_model() has
# built the model; ``model_loading`` keeps a second load from starting.
tts_model = None
model_loading = False

# Request accounting used by the concurrency-limiting middleware.
active_requests = 0
request_semaphore = asyncio.Semaphore(config.MAX_CONCURRENT_REQUESTS)

# Settings for the in-memory audio cache (used instead of writing results to
# disk). The ``audio_cache`` object itself is created once, right after the
# AudioCache class is defined, so no placeholder dict is bound here.
CACHE_MAX_SIZE = 50  # Max cached audio files
CACHE_CLEANUP_INTERVAL = 300  # 5 minutes
 
57
 
58
class AudioCache:
    """In-memory, least-recently-used store for synthesized audio.

    ``cache`` maps an audio id to an entry dict with the keys ``audio``,
    ``sample_rate``, ``created_at`` and ``accessed_at``. ``access_order``
    lists ids from least to most recently used. Both attributes are plain
    containers because code outside the class reads and mutates them.
    """

    def __init__(self, max_size: int = 50):
        self.cache = {}
        self.max_size = max_size
        self.access_order = []

    async def store_audio(self, audio_id: str, audio_data: np.ndarray, sample_rate: int):
        """Store ``audio_data`` under ``audio_id``.

        A new id evicts the least recently used entry when the cache is
        full. Re-storing an existing id replaces the entry in place: nothing
        is evicted and the id appears only once in ``access_order``.
        """
        if audio_id in self.cache:
            # Replacing does not grow the cache; just drop the old position
            # so the id is not listed twice.
            self._discard_from_order(audio_id)
        elif len(self.cache) >= self.max_size:
            await self._remove_oldest()

        now = time.time()
        self.cache[audio_id] = {
            'audio': audio_data,
            'sample_rate': sample_rate,
            'created_at': now,
            'accessed_at': now
        }
        self.access_order.append(audio_id)

    async def get_audio(self, audio_id: str) -> Optional[Dict]:
        """Return the entry for ``audio_id`` and mark it most recently used.

        Returns None when the id is not cached.
        """
        entry = self.cache.get(audio_id)
        if entry is None:
            return None

        entry['accessed_at'] = time.time()
        # Move to end of access order
        self._discard_from_order(audio_id)
        self.access_order.append(audio_id)
        return entry

    def _discard_from_order(self, audio_id: str) -> None:
        """Remove one occurrence of ``audio_id`` from ``access_order`` if present."""
        try:
            self.access_order.remove(audio_id)
        except ValueError:
            pass

    async def _remove_oldest(self):
        """Evict the least recently used entry.

        Ids in ``access_order`` that no longer exist in ``cache`` are skipped,
        so one call frees one real entry whenever any tracked entry remains.
        """
        while self.access_order:
            oldest_id = self.access_order.pop(0)
            if oldest_id in self.cache:
                del self.cache[oldest_id]
                logger.debug(f"Removed cached audio: {oldest_id}")
                return
97
 
98
+ # Initialize cache
99
+ audio_cache = AudioCache(max_size=CACHE_MAX_SIZE)
100
+
101
class AudioStreamProcessor:
    """Helpers that convert and validate uploaded reference audio."""

    @staticmethod
    async def convert_audio_to_wav_memory(upload_file: UploadFile) -> tuple[bytes, float]:
        """Convert an uploaded audio file to mono 16-bit PCM WAV bytes.

        The upload is written to a uniquely named file under /tmp, transcoded
        by ffmpeg to ``config.REFERENCE_SAMPLE_RATE``, and read back into
        memory. Both temp files are removed before returning.

        Returns:
            ``(wav_bytes, duration_in_seconds)``.

        Raises:
            RuntimeError: ffmpeg exited with a non-zero status.
            Any other error is logged and re-raised unchanged.
        """
        try:
            # Read uploaded file into memory
            file_content = await upload_file.read()

            # ``filename`` can be None when the client sends no filename;
            # fall back to an empty suffix instead of failing in Path().
            suffix = Path(upload_file.filename or "").suffix
            temp_input_path = f"/tmp/input_{uuid4().hex}{suffix}"
            temp_output_path = f"/tmp/output_{uuid4().hex}.wav"

            try:
                # Write input to temp file (ffmpeg is given file paths here)
                async with aiofiles.open(temp_input_path, 'wb') as f:
                    await f.write(file_content)

                # Mono, resampled, signed 16-bit little-endian PCM; -y overwrites
                cmd = [
                    'ffmpeg', '-i', temp_input_path,
                    '-ac', '1',
                    '-ar', str(config.REFERENCE_SAMPLE_RATE),
                    '-acodec', 'pcm_s16le',
                    '-y', temp_output_path
                ]

                process = await asyncio.create_subprocess_exec(
                    *cmd,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE
                )

                _, stderr = await process.communicate()

                if process.returncode != 0:
                    # ffmpeg output may not be valid UTF-8; never let the
                    # error report itself raise a decode error.
                    raise RuntimeError(f"FFmpeg failed: {stderr.decode(errors='replace')}")

                # Read converted file into memory
                async with aiofiles.open(temp_output_path, 'rb') as f:
                    wav_data = await f.read()

                duration = await AudioStreamProcessor.get_audio_duration_memory(wav_data)

                return wav_data, duration

            finally:
                # Best-effort cleanup: a missing file is fine, anything else
                # is logged rather than silently swallowed.
                for temp_file in (temp_input_path, temp_output_path):
                    try:
                        os.remove(temp_file)
                    except FileNotFoundError:
                        pass
                    except OSError as cleanup_error:
                        logger.warning(f"Could not remove temp file {temp_file}: {cleanup_error}")

        except Exception as e:
            logger.error(f"Audio conversion failed: {e}")
            raise

    @staticmethod
    async def get_audio_duration_memory(audio_data: bytes) -> float:
        """Return the duration in seconds of in-memory audio data.

        soundfile is tried first; if it fails, the data is decoded with
        librosa and the duration is computed from the sample count.
        """
        try:
            # Use soundfile with BytesIO
            with sf.SoundFile(io.BytesIO(audio_data)) as audio_file:
                return len(audio_file) / audio_file.samplerate
        except Exception as e:
            logger.warning(f"SoundFile duration failed: {e}, using librosa")
            # Fallback to librosa (imported lazily, only when needed)
            import librosa
            audio_array, sr = librosa.load(io.BytesIO(audio_data), sr=None)
            return len(audio_array) / sr

    @staticmethod
    async def validate_audio_duration(duration: float):
        """Raise HTTP 400 unless ``duration`` is within the configured bounds."""
        if duration < config.MIN_AUDIO_DURATION:
            raise HTTPException(
                status_code=400,
                detail=f"Audio too short: {duration:.1f}s (minimum {config.MIN_AUDIO_DURATION}s)"
            )
        if duration > config.MAX_AUDIO_DURATION:
            raise HTTPException(
                status_code=400,
                detail=f"Audio too long: {duration:.1f}s (maximum {config.MAX_AUDIO_DURATION}s)"
            )
193
 
194
+ async def load_tts_model():
195
+ """Load TTS model asynchronously"""
196
  global tts_model, model_loading
197
 
198
  if tts_model is not None or model_loading:
 
202
  try:
203
  logger.info("Loading NeuTTS Air model...")
204
 
205
+ # Clear memory before loading
206
+ gc.collect()
207
+ if torch.cuda.is_available():
208
+ torch.cuda.empty_cache()
209
+
210
+ # Import model
211
  try:
212
  from neuttsair.neutts import NeuTTSAir
213
  except ImportError as e:
214
  logger.error(f"Failed to import NeuTTS Air: {e}")
215
+ raise
 
 
216
 
217
+ # Initialize model
218
  tts_model = NeuTTSAir(
219
  backbone_repo="neuphonic/neutts-air",
220
  backbone_device="cpu",
 
226
 
227
  except Exception as e:
228
  logger.error(f"Failed to load model: {str(e)}")
 
229
  raise e
230
+ finally:
231
+ model_loading = False
232
+
233
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup work before the app serves requests and clean up afterwards.

    Startup schedules the model load and the cache cleanup loop as background
    tasks. Shutdown cancels whatever is still running, waits for the tasks to
    finish, and drops the model reference.
    """
    global tts_model

    # Startup
    logger.info("🚀 Starting NeuTTS Air Streaming API")

    # Keep strong references: the event loop only holds weak references to
    # tasks, so an unreferenced task may be garbage-collected mid-run.
    background_tasks = [
        asyncio.create_task(load_tts_model()),
        asyncio.create_task(cache_cleanup_task()),
    ]

    try:
        yield
    finally:
        # Shutdown
        logger.info("🛑 Shutting down NeuTTS Air API")

        for task in background_tasks:
            task.cancel()
        # return_exceptions=True collects cancellations and any load failure
        # here, so shutdown itself does not raise.
        await asyncio.gather(*background_tasks, return_exceptions=True)

        if tts_model is not None:
            tts_model = None
            gc.collect()
254
 
255
+ app = FastAPI(
256
+ title="NeuTTS Air Streaming API",
257
+ description="High-quality on-device TTS with streaming and no disk usage",
258
+ version="2.0.0",
259
+ lifespan=lifespan
260
+ )
261
+
262
+ # CORS middleware
263
+ app.add_middleware(
264
+ CORSMiddleware,
265
+ allow_origins=["*"],
266
+ allow_credentials=True,
267
+ allow_methods=["*"],
268
+ allow_headers=["*"],
269
+ )
270
+
271
+ # Pydantic models
272
class TTSRequest(BaseModel):
    """Request body describing a synthesis job."""

    # Text to synthesize; the upper bound follows the shared config so the
    # limit is defined in one place.
    text: str = Field(..., min_length=1, max_length=config.MAX_TEXT_LENGTH)
    # Transcript of the reference audio.
    reference_text: str = Field(..., min_length=1, max_length=500)
    # Optional path to a reference audio file.
    reference_audio_path: Optional[str] = None
276
+
277
+ class TTSResponse(BaseModel):
278
+ success: bool
279
+ audio_id: Optional[str] = None
280
+ message: Optional[str] = None
281
+ processing_time: Optional[float] = None
282
+ audio_duration: Optional[float] = None
283
+ stream_url: Optional[str] = None
284
+
285
class HealthResponse(BaseModel):
    """Response body of the /health endpoint."""

    status: str
    model_loaded: bool
    active_requests: int
    cache_size: int
    # Numeric gauges on the healthy path. The degraded path reports
    # {"error": "<message>"}, so values cannot be restricted to float without
    # that response failing validation.
    memory_usage: Dict[str, Any]
291
+
292
+ # Async middleware for request limiting
293
@app.middleware("http")
async def limit_concurrent_requests(request: Request, call_next):
    """Reject requests with 429 once the concurrency cap is reached.

    Admitted requests are counted in ``active_requests`` while they run, and
    the handling time is logged when the downstream handler returns.
    """
    global active_requests

    at_capacity = active_requests >= config.MAX_CONCURRENT_REQUESTS
    if at_capacity:
        return JSONResponse(
            content={"detail": "Too many concurrent requests"},
            status_code=429,
        )

    async with request_semaphore:
        active_requests += 1
        try:
            began = time.time()
            response = await call_next(request)
            elapsed = time.time() - began
            logger.info(f"{request.method} {request.url.path} completed in {elapsed:.2f}s")
            return response
        finally:
            # Always release the slot, even if the handler raised.
            active_requests -= 1
313
 
314
  @app.get("/")
315
  async def root():
316
+ return {
317
+ "message": "NeuTTS Air Streaming API",
318
+ "status": "healthy",
319
+ "features": ["streaming", "no_disk_usage", "async", "in_memory_cache"],
320
+ "model_loaded": tts_model is not None,
321
+ "active_requests": active_requests
322
+ }
323
 
324
  @app.get("/health")
325
  async def health_check():
326
+ """Health check with memory usage"""
327
  try:
328
  memory = psutil.virtual_memory()
 
329
 
330
  return HealthResponse(
331
  status="healthy",
332
  model_loaded=tts_model is not None,
333
+ active_requests=active_requests,
334
+ cache_size=len(audio_cache.cache),
335
  memory_usage={
336
  "total_gb": round(memory.total / (1024**3), 2),
337
  "available_gb": round(memory.available / (1024**3), 2),
338
  "used_percent": round(memory.percent, 2)
 
 
 
 
 
339
  }
340
  )
341
  except Exception as e:
342
  return HealthResponse(
343
  status="degraded",
344
  model_loaded=tts_model is not None,
345
+ active_requests=active_requests,
346
+ cache_size=len(audio_cache.cache),
347
+ memory_usage={"error": str(e)}
348
  )
349
 
350
+ @app.post("/synthesize", response_model=TTSResponse)
351
  async def synthesize_speech(
352
  reference_text: str = Form(...),
353
  text: str = Form(...),
354
  reference_audio: UploadFile = File(...)
355
  ):
356
  """
357
+ Synthesize speech with streaming support and no disk usage
358
  """
359
  start_time = time.time()
360
+ request_id = str(uuid4())[:8]
361
+
362
+ logger.info(f"[{request_id}] Starting streaming synthesis")
363
 
364
  if tts_model is None:
365
  raise HTTPException(status_code=503, detail="Model not loaded yet")
 
368
  if not reference_text.strip() or not text.strip():
369
  raise HTTPException(status_code=400, detail="Text fields cannot be empty")
370
 
 
 
 
 
371
  try:
372
+ # Convert audio to WAV in memory
373
+ wav_data, audio_duration = await AudioStreamProcessor.convert_audio_to_wav_memory(reference_audio)
374
+ await AudioStreamProcessor.validate_audio_duration(audio_duration)
 
 
 
375
 
376
+ logger.info(f"[{request_id}] Audio validated: {audio_duration:.2f}s")
 
 
377
 
378
+ # Create temporary file for model processing (minimal disk usage)
379
+ temp_ref_path = f"/tmp/ref_{request_id}.wav"
380
  try:
381
+ async with aiofiles.open(temp_ref_path, 'wb') as f:
382
+ await f.write(wav_data)
383
+
384
+ # Perform TTS
385
+ logger.info(f"[{request_id}] Synthesizing: '{text[:50]}...'")
386
+
387
+ # Encode reference and generate speech
388
+ ref_codes = tts_model.encode_reference(temp_ref_path)
389
+ wav_output = tts_model.infer(text, ref_codes, reference_text)
390
+
391
+ # Generate audio ID for caching
392
+ audio_id = f"audio_{request_id}"
393
+
394
+ # Store in memory cache
395
+ await audio_cache.store_audio(audio_id, wav_output, config.SAMPLE_RATE)
396
+
397
+ processing_time = time.time() - start_time
398
+ output_duration = len(wav_output) / config.SAMPLE_RATE
399
+
400
+ logger.info(f"[{request_id}] Synthesis completed in {processing_time:.2f}s")
401
+
402
+ return TTSResponse(
403
+ success=True,
404
+ audio_id=audio_id,
405
+ message="Speech synthesized successfully",
406
+ processing_time=round(processing_time, 2),
407
+ audio_duration=round(output_duration, 2),
408
+ stream_url=f"/stream/{audio_id}"
409
+ )
410
+
411
+ finally:
412
+ # Cleanup temp file
413
+ if os.path.exists(temp_ref_path):
414
+ try:
415
+ os.remove(temp_ref_path)
416
+ except:
417
+ pass
418
+
419
+ except HTTPException:
420
+ raise
 
421
  except Exception as e:
422
+ logger.error(f"[{request_id}] Synthesis error: {str(e)}")
423
  raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
424
+
425
@app.get("/stream/{audio_id}")
async def stream_audio(audio_id: str):
    """
    Serve a cached audio clip as a chunked WAV response.

    Responds with 404 when the id is not in the in-memory cache.
    """
    entry = await audio_cache.get_audio(audio_id)
    if not entry:
        raise HTTPException(status_code=404, detail="Audio not found or expired")

    # Encode the cached samples as WAV entirely in memory.
    encoded = io.BytesIO()
    sf.write(encoded, entry['audio'], entry['sample_rate'], format='WAV')
    wav_bytes = encoded.getvalue()
    total = len(wav_bytes)

    async def generate_audio_stream():
        step = config.CHUNK_SIZE
        offset = 0
        while offset < total:
            yield wav_bytes[offset:offset + step]
            await asyncio.sleep(0.001)  # Small delay for proper streaming
            offset += step

    response_headers = {
        "Content-Disposition": f"attachment; filename=speech_{audio_id}.wav",
        "Cache-Control": "no-cache",
        "Content-Length": str(total)
    }
    return StreamingResponse(
        generate_audio_stream(),
        media_type="audio/wav",
        headers=response_headers
    )
459
 
460
@app.get("/download/{audio_id}")
async def download_audio(audio_id: str):
    """
    Return a cached audio clip as one complete WAV file.

    Responds with 404 when the id is not in the in-memory cache.
    """
    entry = await audio_cache.get_audio(audio_id)
    if not entry:
        raise HTTPException(status_code=404, detail="Audio not found or expired")

    # Encode the cached samples as WAV entirely in memory.
    encoded = io.BytesIO()
    sf.write(encoded, entry['audio'], entry['sample_rate'], format='WAV')
    payload = encoded.getvalue()

    response_headers = {
        "Content-Disposition": f"attachment; filename=speech_{audio_id}.wav",
        "Content-Length": str(len(payload))
    }
    return Response(
        content=payload,
        media_type="audio/wav",
        headers=response_headers
    )
485
 
486
@app.post("/synthesize-and-stream")
async def synthesize_and_stream(
    reference_text: str = Form(...),
    text: str = Form(...),
    reference_audio: UploadFile = File(...)
):
    """
    Synthesize speech and stream the resulting WAV in a single request.

    Raises:
        HTTPException 503: the model has not been loaded yet.
        HTTPException 400: a text field is blank, or the reference audio
            fails duration validation.
        HTTPException 500: any other failure during conversion or synthesis.
    """
    start_time = time.time()

    if tts_model is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    # Same input check as the /synthesize endpoint.
    if not reference_text.strip() or not text.strip():
        raise HTTPException(status_code=400, detail="Text fields cannot be empty")

    try:
        # Convert audio to WAV in memory
        wav_data, audio_duration = await AudioStreamProcessor.convert_audio_to_wav_memory(reference_audio)
        await AudioStreamProcessor.validate_audio_duration(audio_duration)

        # The model is given a file path, so the reference is written to a
        # short-lived temp file.
        temp_ref_path = f"/tmp/ref_stream_{uuid4().hex}.wav"
        try:
            async with aiofiles.open(temp_ref_path, 'wb') as f:
                await f.write(wav_data)

            # Perform TTS
            ref_codes = tts_model.encode_reference(temp_ref_path)
            wav_output = tts_model.infer(text, ref_codes, reference_text)
        finally:
            # Best-effort cleanup: a missing file is fine, other errors are logged.
            try:
                os.remove(temp_ref_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temp file {temp_ref_path}: {cleanup_error}")

        processing_time = time.time() - start_time
        logger.info(f"Real-time synthesis completed in {processing_time:.2f}s")

        # Convert to WAV bytes
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, wav_output, config.SAMPLE_RATE, format='WAV')
        wav_bytes = wav_buffer.getvalue()

        # Stream directly
        async def generate_stream():
            chunk_size = config.CHUNK_SIZE
            for i in range(0, len(wav_bytes), chunk_size):
                yield wav_bytes[i:i + chunk_size]
                await asyncio.sleep(0.001)

        return StreamingResponse(
            generate_stream(),
            media_type="audio/wav",
            headers={
                "Content-Disposition": "attachment; filename=speech_stream.wav",
                "Cache-Control": "no-cache",
                "X-Processing-Time": f"{processing_time:.2f}"
            }
        )

    except HTTPException:
        # Keep deliberate client errors (e.g. 400 from duration validation)
        # instead of rewrapping them as 500.
        raise
    except Exception as e:
        logger.error(f"Stream synthesis error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Stream synthesis failed: {str(e)}")
550
 
551
@app.delete("/cache/{audio_id}")
async def clear_cached_audio(audio_id: str):
    """Drop one cached clip; respond with 404 if the id is not cached."""
    if audio_id not in audio_cache.cache:
        raise HTTPException(status_code=404, detail="Audio not found in cache")

    del audio_cache.cache[audio_id]
    if audio_id in audio_cache.access_order:
        audio_cache.access_order.remove(audio_id)
    return {"message": f"Audio {audio_id} cleared from cache"}
561
+
562
@app.delete("/cache")
async def clear_all_cache():
    """Empty the audio cache and report how many entries were dropped."""
    dropped = len(audio_cache.cache)
    for container in (audio_cache.cache, audio_cache.access_order):
        container.clear()
    return {"message": f"Cleared all {dropped} cached audio files"}
569
+
570
async def cache_cleanup_task():
    """Run forever, periodically evicting entries not accessed for an hour.

    Each pass starts after sleeping CACHE_CLEANUP_INTERVAL seconds. Errors
    inside a pass are logged and the loop continues.
    """
    while True:
        await asyncio.sleep(CACHE_CLEANUP_INTERVAL)
        try:
            now = time.time()
            # Collect ids first so the dict is not mutated while iterating it.
            expired_ids = [
                audio_id
                for audio_id, data in audio_cache.cache.items()
                if now - data['accessed_at'] > 3600  # 1 hour
            ]

            for stale_id in expired_ids:
                if stale_id in audio_cache.cache:
                    del audio_cache.cache[stale_id]
                if stale_id in audio_cache.access_order:
                    audio_cache.access_order.remove(stale_id)

            if expired_ids:
                logger.info(f"Cache cleanup removed {len(expired_ids)} expired entries")

        except Exception as e:
            logger.error(f"Cache cleanup error: {e}")
593
 
594
  if __name__ == "__main__":
595
  import uvicorn
596
+ uvicorn.run(
597
+ app,
598
+ host="0.0.0.0",
599
+ port=7860,
600
+ workers=1,
601
+ log_level="info"
602
+ )