Rajhuggingface4253 commited on
Commit
308a219
·
verified ·
1 Parent(s): e3fd3e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +213 -421
app.py CHANGED
@@ -2,540 +2,332 @@ import os
2
  import io
3
  import asyncio
4
  import time
5
- import shutil
6
  import numpy as np
7
- import psutil
8
  import soundfile as sf
9
  import subprocess
10
  import tempfile
11
  from concurrent.futures import ThreadPoolExecutor
12
- from typing import Optional, Generator, AsyncGenerator
13
  from contextlib import asynccontextmanager
14
  import logging
15
  import aiofiles
16
  import torch
17
- from fastapi import FastAPI, HTTPException, UploadFile, File, Form, Query, BackgroundTasks
18
  from fastapi.responses import Response, StreamingResponse
19
  from fastapi.middleware.cors import CORSMiddleware
20
- from pydantic import BaseModel, Field
21
- import uuid
22
- from dataclasses import dataclass
23
- from queue import Queue, Empty
24
- import threading
25
 
26
- # Ensure the cloned neutts-air repository is in the path
27
- import sys
28
- sys.path.append(os.path.join(os.getcwd(), 'neutts-air'))
29
- from neuttsair.neutts import NeuTTSAir
30
-
31
- # Configure logging
32
- logging.basicConfig(
33
- level=logging.INFO,
34
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
35
- )
36
- logger = logging.getLogger("NeuTTS-API")
37
-
38
- # --- Configuration & Constants ---
39
  DEVICE = "cpu"
40
- MAX_WORKERS = 2
41
- tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
42
  SAMPLE_RATE = 24000
43
- CLEANUP_THRESHOLD = 300
 
44
  TEMP_AUDIO_DIR = "temp_audio"
45
- GENERATED_AUDIO_DIR = "generated_audio"
46
  os.makedirs(TEMP_AUDIO_DIR, exist_ok=True)
47
- os.makedirs(GENERATED_AUDIO_DIR, exist_ok=True)
48
 
49
- # --- Data Models ---
50
- class TTSRequestModel(BaseModel):
51
- text: str = Field(..., min_length=1, max_length=1000)
52
- speed: float = Field(default=1.0, ge=0.5, le=2.0)
53
- output_format: str = Field(default="wav", pattern="^(wav|mp3|flac)$")
54
-
55
- @dataclass
56
- class SynthesisTask:
57
- task_id: str
58
- text: str
59
- reference_audio_path: str
60
- reference_text: str
61
- output_format: str
62
- created_at: float
63
-
64
- # --- Enhanced Audio Conversion with Async Support ---
65
- async def convert_to_wav_async(input_path: str) -> str:
66
- """Asynchronous audio conversion using subprocess with async wrapper."""
67
- with tempfile.NamedTemporaryFile(suffix=".wav", dir=TEMP_AUDIO_DIR, delete=False) as tmp:
68
- output_path = tmp.name
69
-
70
- logger.info(f"Converting '{os.path.basename(input_path)}' to WAV")
71
-
72
- command = [
73
- "ffmpeg", "-y", "-i", input_path,
74
- "-f", "wav", "-ar", str(SAMPLE_RATE),
75
- "-ac", "1", "-c:a", "pcm_s16le", output_path
76
- ]
77
-
78
- try:
79
- # Run FFmpeg asynchronously
80
- process = await asyncio.create_subprocess_exec(
81
- *command,
82
- stdout=asyncio.subprocess.PIPE,
83
- stderr=asyncio.subprocess.PIPE
84
- )
85
-
86
- stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=30)
87
-
88
- if process.returncode != 0:
89
- error_detail = stderr.decode().splitlines()[-1] if stderr else "Unknown FFmpeg error"
90
- logger.error(f"FFmpeg conversion failed: {error_detail}")
91
- if os.path.exists(output_path):
92
- os.unlink(output_path)
93
- raise HTTPException(status_code=400, detail=f"Audio conversion failed: {error_detail}")
94
-
95
- logger.info("FFmpeg conversion successful")
96
- return output_path
97
-
98
- except asyncio.TimeoutError:
99
- logger.error("FFmpeg conversion timed out")
100
- if os.path.exists(output_path):
101
- os.unlink(output_path)
102
- raise HTTPException(status_code=504, detail="Audio conversion timed out")
103
- except Exception as e:
104
- logger.error(f"Conversion error: {e}")
105
- if os.path.exists(output_path):
106
- os.unlink(output_path)
107
- raise HTTPException(status_code=500, detail="Unexpected conversion error")
108
 
109
- # --- Enhanced Model Wrapper with Async Streaming ---
110
- class NeuTTSWrapper:
111
  def __init__(self, device: str = "cpu"):
112
  self.tts_model = None
113
  self.device = device
114
- self._model_lock = asyncio.Lock() # For thread-safe model access
115
  self.load_model()
116
 
117
  def load_model(self):
 
118
  try:
119
- logger.info(f"Loading NeuTTSAir model on device: {self.device}")
120
  self.tts_model = NeuTTSAir(backbone_device=self.device, codec_device=self.device)
121
- logger.info("✅ NeuTTSAir model loaded successfully")
122
  except Exception as e:
123
  logger.error(f"❌ Model loading failed: {e}")
124
  raise
125
 
126
- def _convert_to_streamable_format(self, audio_data: np.ndarray, audio_format: str) -> bytes:
127
- """Convert NumPy audio array to streamable bytes."""
128
- audio_buffer = io.BytesIO()
129
- try:
130
- sf.write(audio_buffer, audio_data, SAMPLE_RATE, format=audio_format)
131
- except Exception as e:
132
- logger.error(f"Failed to write audio data to format {audio_format}: {e}")
133
- raise
134
- audio_buffer.seek(0)
135
- return audio_buffer.read()
136
 
137
- def _split_text_into_chunks(self, text: str, max_chunk_length: int = 100) -> list[str]:
138
- """Enhanced text splitting for better streaming chunks."""
139
- # Simple sentence-based splitting with length limits
140
- sentences = []
141
- current_sentence = ""
142
 
143
- for word in text.split():
144
- test_sentence = f"{current_sentence} {word}".strip()
145
- if len(test_sentence) <= max_chunk_length:
146
- current_sentence = test_sentence
147
- else:
148
- if current_sentence:
149
- sentences.append(current_sentence)
150
- current_sentence = word
151
 
152
- if current_sentence:
153
- sentences.append(current_sentence)
154
-
155
- return sentences or [text]
156
-
157
- async def generate_speech_async(self, text: str, ref_audio_path: str, reference_text: str) -> np.ndarray:
158
- """Asynchronous speech generation with proper locking."""
159
- async with self._model_lock:
160
- return await asyncio.get_event_loop().run_in_executor(
161
- tts_executor,
162
- self._generate_speech_blocking,
163
- text, ref_audio_path, reference_text
164
- )
165
-
166
- def _generate_speech_blocking(self, text: str, ref_audio_path: str, reference_text: str) -> np.ndarray:
167
- """Blocking speech generation (runs in thread pool)."""
168
- ref_s = self.tts_model.encode_reference(ref_audio_path)
169
  with torch.no_grad():
170
  audio = self.tts_model.infer(text, ref_s, reference_text)
 
 
171
  return audio
172
 
173
- async def stream_speech_async(
174
- self,
175
- text: str,
176
- ref_audio_path: str,
177
- reference_text: str,
178
- audio_format: str
179
- ) -> AsyncGenerator[bytes, None]:
180
- """True asynchronous streaming with immediate chunk delivery."""
181
- logger.info(f"Starting true streaming synthesis for text length: {len(text)}")
182
 
183
- # Encode reference once (this is the only blocking part we need to do first)
184
- async with self._model_lock:
185
- ref_s = await asyncio.get_event_loop().run_in_executor(
186
- tts_executor,
187
- self.tts_model.encode_reference,
188
- ref_audio_path
189
- )
190
 
191
- # Split text into chunks for streaming
192
- sentences = self._split_text_into_chunks(text)
193
- logger.info(f"Split text into {len(sentences)} chunks for streaming")
194
 
195
- # Stream each chunk asynchronously
196
- for i, sentence in enumerate(sentences):
197
- if not sentence.strip():
198
- continue
199
-
200
- logger.debug(f"Generating streaming chunk {i+1}: '{sentence[:30]}...'")
201
-
202
- # Generate this chunk asynchronously
203
- audio_chunk = await asyncio.get_event_loop().run_in_executor(
204
- tts_executor,
205
- self._infer_chunk,
206
- sentence, ref_s, reference_text
207
- )
208
 
209
- # Convert and yield immediately
210
- chunk_bytes = await asyncio.get_event_loop().run_in_executor(
211
- tts_executor,
212
- self._convert_to_streamable_format,
213
- audio_chunk, audio_format
214
- )
215
-
216
- yield chunk_bytes
217
- logger.debug(f"Yielded chunk {i+1} ({len(chunk_bytes)} bytes)")
218
 
219
- logger.info("Streaming synthesis complete")
 
220
 
221
- def _infer_chunk(self, sentence: str, ref_s, reference_text: str) -> np.ndarray:
222
- """Infer a single chunk (runs in thread pool)."""
223
- with torch.no_grad():
224
- return self.tts_model.infer(sentence, ref_s, reference_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
- # --- Async Utility Functions ---
227
- async def save_upload_file_async(upload_file: UploadFile) -> str:
228
- """Asynchronously saves the UploadFile to disk."""
229
- temp_filename = os.path.join(TEMP_AUDIO_DIR, f"{time.time()}_{upload_file.filename}")
230
- try:
231
- async with aiofiles.open(temp_filename, 'wb') as out_file:
232
- while content := await upload_file.read(1024 * 1024):
233
- await out_file.write(content)
234
- return temp_filename
235
- except Exception as e:
236
- logger.error(f"Error saving file: {e}")
237
- raise HTTPException(status_code=500, detail="Could not save reference audio file")
238
 
239
- async def cleanup_file_async(file_path: str):
240
- """Asynchronously clean up a file."""
241
- try:
242
- if os.path.exists(file_path):
243
- os.unlink(file_path)
244
- logger.debug(f"Cleaned up file: {file_path}")
245
- except Exception as e:
246
- logger.warning(f"Failed to cleanup file {file_path}: {e}")
247
 
248
- async def scheduled_cleanup_task():
249
- """Runs the cleanup task periodically in the background."""
250
- while True:
251
- await asyncio.sleep(CLEANUP_THRESHOLD) # Wait for the defined period (e.g., 1 hour)
252
- logger.info("Running scheduled cleanup of old audio files...")
253
- try:
254
- await cleanup_files_async()
255
- except Exception as e:
256
- logger.error(f"Scheduled cleanup task failed: {e}")
257
- # --- FastAPI Lifespan Manager ---
258
  @asynccontextmanager
259
  async def lifespan(app: FastAPI):
260
- """Modern lifespan management."""
261
- try:
262
- app.state.tts_wrapper = NeuTTSWrapper(device=DEVICE)
263
- app.state.synthesis_tasks = {} # Track active tasks
264
- asyncio.create_task(scheduled_cleanup_task())
265
- logger.info("✅ Application startup complete")
266
- except Exception as e:
267
- logger.error(f"Fatal startup error: {e}")
268
- tts_executor.shutdown(wait=False)
269
- raise RuntimeError("Model initialization failed")
270
-
271
- yield
272
-
273
- logger.info("Shutting down ThreadPoolExecutor")
274
- tts_executor.shutdown(wait=True)
275
 
276
- # --- FastAPI Application Setup ---
277
- app = FastAPI(
278
- title="NeuTTS Air Instant Cloning API - Enhanced",
279
- version="3.0.0-PROD-STREAMING",
280
- docs_url="/docs",
281
- lifespan=lifespan
282
- )
283
-
284
- app.add_middleware(
285
- CORSMiddleware,
286
- allow_origins=["*"],
287
- allow_methods=["*"],
288
- allow_headers=["*"],
289
- )
290
-
291
- # --- Enhanced Endpoints ---
292
- @app.get("/")
293
- async def root():
294
- return {"message": "NeuTTS Air API v3.0 - True Streaming Ready"}
295
 
296
- @app.get("/health")
297
- async def health_check():
298
- """Enhanced health check with streaming metrics."""
299
- mem = psutil.virtual_memory()
300
- disk = psutil.disk_usage('/')
301
-
302
- active_tasks = len(getattr(app.state, 'synthesis_tasks', {}))
303
-
304
  return {
305
- "status": "healthy",
306
- "model_loaded": hasattr(app.state, 'tts_wrapper') and app.state.tts_wrapper.tts_model is not None,
307
  "device": DEVICE,
308
- "concurrency_limit": MAX_WORKERS,
309
- "active_synthesis_tasks": active_tasks,
310
- "memory_usage": {
311
- "total_gb": round(mem.total / (1024**3), 2),
312
- "used_percent": mem.percent
313
- },
314
- "disk_usage": {
315
- "total_gb": round(disk.total / (1024**3), 2),
316
- "used_percent": disk.percent
317
- }
318
  }
319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  @app.post("/synthesize", response_class=Response)
321
- async def text_to_speech(
322
  text: str = Form(...),
323
  reference_text: str = Form(...),
324
- speed: float = Form(1.0, ge=0.5, le=2.0),
325
- output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
326
  reference_audio: UploadFile = File(...),
327
  background_tasks: BackgroundTasks = None
328
  ):
329
- """
330
- Enhanced standard TTS endpoint with better async handling.
331
- """
332
- if not hasattr(app.state, 'tts_wrapper'):
333
- raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
334
-
335
  start_time = time.time()
336
- temp_ref_path = None
337
- converted_wav_path = None
338
 
339
  try:
340
- # 1. Save uploaded file
341
- temp_ref_path = await save_upload_file_async(reference_audio)
342
-
343
- # 2. Convert to WAV
344
- converted_wav_path = await convert_to_wav_async(temp_ref_path)
345
 
346
- # 3. Generate speech asynchronously
347
- audio_data = await app.state.tts_wrapper.generate_speech_async(
348
- text, converted_wav_path, reference_text
 
 
349
  )
350
 
351
- # 4. Convert to requested format
352
- audio_bytes = await asyncio.get_event_loop().run_in_executor(
353
- tts_executor,
354
- app.state.tts_wrapper._convert_to_streamable_format,
355
- audio_data, output_format
356
- )
357
-
358
- # 5. Save to disk (optional - can be disabled in production)
359
- audio_filename = f"tts_{int(time.time())}.{output_format}"
360
- final_path = os.path.join(GENERATED_AUDIO_DIR, audio_filename)
361
-
362
- async with aiofiles.open(final_path, 'wb') as f:
363
- await f.write(audio_bytes)
364
 
365
- processing_time = time.time() - start_time
366
- audio_duration = len(audio_data) / SAMPLE_RATE
367
 
368
  return Response(
369
  content=audio_bytes,
370
  media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
371
  headers={
372
- "Content-Disposition": f"attachment; filename={audio_filename}",
373
- "X-Processing-Time": f"{processing_time:.2f}s",
374
- "X-Audio-Duration": f"{audio_duration:.2f}s",
375
- "X-First-Chunk-Time": f"{processing_time:.2f}s" # For comparison
376
  }
377
  )
378
 
379
  except Exception as e:
380
  logger.error(f"Synthesis error: {e}")
381
- if isinstance(e, HTTPException):
382
- raise
383
- raise HTTPException(status_code=500, detail=f"Synthesis failed: {e}")
384
  finally:
385
- # Schedule cleanup in background
386
- if background_tasks:
387
- if temp_ref_path:
388
- background_tasks.add_task(cleanup_file_async, temp_ref_path)
389
- if converted_wav_path:
390
- background_tasks.add_task(cleanup_file_async, converted_wav_path)
391
- else:
392
- # Fallback synchronous cleanup
393
- if temp_ref_path and os.path.exists(temp_ref_path):
394
- os.unlink(temp_ref_path)
395
- if converted_wav_path and os.path.exists(converted_wav_path):
396
- os.unlink(converted_wav_path)
397
 
398
  @app.post("/synthesize/stream")
399
- async def stream_text_to_speech(
400
- text: str = Form(..., min_length=1, max_length=5000),
401
  reference_text: str = Form(...),
402
- speed: float = Form(1.0, ge=0.5, le=2.0),
403
- output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
404
  reference_audio: UploadFile = File(...)
405
  ):
406
- """
407
- TRUE Streaming Endpoint - delivers audio chunks as they're generated.
408
- """
409
- if not hasattr(app.state, 'tts_wrapper'):
410
- raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
411
-
412
- temp_ref_path = None
413
- converted_wav_path = None
414
 
415
  try:
416
- # 1. Save and convert reference audio
417
- temp_ref_path = await save_upload_file_async(reference_audio)
418
- converted_wav_path = await convert_to_wav_async(temp_ref_path)
419
-
420
- # 2. Clean up original file immediately
421
- if temp_ref_path and os.path.exists(temp_ref_path):
422
- await cleanup_file_async(temp_ref_path)
423
- temp_ref_path = None
424
 
425
- # 3. Create async generator for streaming
426
- async def generate_audio_stream():
427
- """Async generator that yields audio chunks as they're produced."""
428
  try:
429
- first_chunk_time = time.time()
430
  chunk_count = 0
431
 
432
- async for chunk_bytes in app.state.tts_wrapper.stream_speech_async(
433
- text, converted_wav_path, reference_text, output_format
434
- ):
 
 
435
  chunk_count += 1
436
 
437
- # Log timing for first chunk
438
- if chunk_count == 1:
439
- first_chunk_time = time.time() - first_chunk_time
440
- logger.info(f"First audio chunk delivered in {first_chunk_time:.2f}s")
441
 
442
- yield chunk_bytes
 
 
 
 
443
 
 
 
 
 
 
 
444
  except Exception as e:
445
- logger.error(f"Stream generation error: {e}")
446
  raise
447
  finally:
448
- # Clean up converted file when streaming is complete
449
- if converted_wav_path and os.path.exists(converted_wav_path):
450
- await cleanup_file_async(converted_wav_path)
451
 
452
- # 4. Return streaming response
453
  return StreamingResponse(
454
- generate_audio_stream(),
455
  media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
456
  headers={
457
- "Content-Disposition": "attachment; filename=tts_live_stream.mp3",
458
  "Transfer-Encoding": "chunked",
459
  "Cache-Control": "no-cache",
460
- "X-Accel-Buffering": "no",
461
  "X-Streaming": "true"
462
  }
463
  )
464
 
465
  except Exception as e:
466
- logger.error(f"Streaming setup error: {e}")
467
- # Cleanup on error
468
- if temp_ref_path and os.path.exists(temp_ref_path):
469
- await cleanup_file_async(temp_ref_path)
470
- if converted_wav_path and os.path.exists(converted_wav_path):
471
- await cleanup_file_async(converted_wav_path)
472
-
473
- if isinstance(e, HTTPException):
474
- raise
475
- raise HTTPException(status_code=500, detail=f"Streaming setup failed: {e}")
476
-
477
- @app.get("/audio/{filename}")
478
- async def get_audio(filename: str):
479
- """Serve generated audio files."""
480
- file_path = os.path.join(GENERATED_AUDIO_DIR, filename)
481
- if not os.path.exists(file_path):
482
- raise HTTPException(status_code=404, detail="Audio file not found")
483
-
484
- # Use async file reading for better performance
485
- async with aiofiles.open(file_path, "rb") as f:
486
- content = await f.read()
487
-
488
- return Response(
489
- content=content,
490
- media_type=f"audio/{filename.split('.')[-1]}",
491
- headers={"Content-Disposition": f"attachment; filename={filename}"}
492
- )
493
-
494
- @app.delete("/cleanup")
495
- async def cleanup_files():
496
- """Enhanced cleanup endpoint."""
497
- deleted_count = await cleanup_files_async()
498
- return {"message": f"Cleanup completed: {deleted_count} files removed"}
499
-
500
- async def cleanup_files_async():
501
- """Async file cleanup."""
502
- now = time.time()
503
- deleted_count = 0
504
-
505
- for directory in [GENERATED_AUDIO_DIR, TEMP_AUDIO_DIR]:
506
- if not os.path.exists(directory):
507
- continue
508
-
509
- for filename in os.listdir(directory):
510
- filepath = os.path.join(directory, filename)
511
- if os.path.isfile(filepath):
512
- try:
513
- if now - os.path.getctime(filepath) > CLEANUP_THRESHOLD:
514
- await cleanup_file_async(filepath)
515
- deleted_count += 1
516
- except Exception as e:
517
- logger.warning(f"Failed to delete {filepath}: {e}")
518
-
519
- logger.info(f"Cleanup completed: {deleted_count} files removed")
520
- return deleted_count
521
 
522
- # Performance monitoring endpoint
523
- @app.get("/metrics")
524
- async def get_metrics():
525
- """Performance metrics endpoint."""
526
- return {
527
- "active_threads": threading.active_count(),
528
- "executor_queue_size": tts_executor._work_queue.qsize() if hasattr(tts_executor, '_work_queue') else 0,
529
- "memory_usage_mb": psutil.Process().memory_info().rss / 1024 / 1024
530
- }
531
 
532
  if __name__ == "__main__":
533
  import uvicorn
534
  uvicorn.run(
535
- "app:app",
536
  host="0.0.0.0",
537
  port=7860,
538
- workers=1, # Multiple workers not supported with in-memory model
539
  loop="asyncio",
540
- access_log=True
 
541
  )
 
2
  import io
3
  import asyncio
4
  import time
 
5
  import numpy as np
 
6
  import soundfile as sf
7
  import subprocess
8
  import tempfile
9
  from concurrent.futures import ThreadPoolExecutor
10
+ from typing import Optional, AsyncGenerator
11
  from contextlib import asynccontextmanager
12
  import logging
13
  import aiofiles
14
  import torch
15
+ from fastapi import FastAPI, HTTPException, UploadFile, File, Form, BackgroundTasks
16
  from fastapi.responses import Response, StreamingResponse
17
  from fastapi.middleware.cors import CORSMiddleware
 
 
 
 
 
18
 
19
+ # Performance-focused configuration
 
 
 
 
 
 
 
 
 
 
 
 
20
  DEVICE = "cpu"
21
+ MAX_WORKERS = 1 # Reduced for CPU efficiency
 
22
  SAMPLE_RATE = 24000
23
+
24
+ # Minimal storage - no persistent files
25
  TEMP_AUDIO_DIR = "temp_audio"
 
26
  os.makedirs(TEMP_AUDIO_DIR, exist_ok=True)
 
27
 
28
+ # Performance logging
29
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
30
+ logger = logging.getLogger("NeuTTS-Perf")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ class HighPerformanceTTSWrapper:
 
33
  def __init__(self, device: str = "cpu"):
34
  self.tts_model = None
35
  self.device = device
36
+ self._ref_cache = {} # Cache encoded references
37
  self.load_model()
38
 
39
  def load_model(self):
40
+ """Load model once and keep in memory."""
41
  try:
42
+ logger.info("🚀 Loading NeuTTSAir model...")
43
  self.tts_model = NeuTTSAir(backbone_device=self.device, codec_device=self.device)
44
+ logger.info("✅ Model loaded successfully")
45
  except Exception as e:
46
  logger.error(f"❌ Model loading failed: {e}")
47
  raise
48
 
49
+ def encode_reference_audio(self, audio_path: str) -> torch.Tensor:
50
+ """Encode reference audio with caching."""
51
+ cache_key = f"{os.path.getsize(audio_path)}_{os.path.getmtime(audio_path)}"
52
+ if cache_key in self._ref_cache:
53
+ return self._ref_cache[cache_key]
54
+
55
+ ref_s = self.tts_model.encode_reference(audio_path)
56
+ self._ref_cache[cache_key] = ref_s
57
+ return ref_s
 
58
 
59
+ def synthesize_complete(self, text: str, ref_audio_path: str, reference_text: str) -> np.ndarray:
60
+ """High-performance complete synthesis."""
61
+ start_time = time.time()
 
 
62
 
63
+ # Encode reference
64
+ ref_s = self.encode_reference_audio(ref_audio_path)
 
 
 
 
 
 
65
 
66
+ # Synthesize complete audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  with torch.no_grad():
68
  audio = self.tts_model.infer(text, ref_s, reference_text)
69
+
70
+ logger.info(f"🎯 Complete synthesis: {time.time() - start_time:.2f}s")
71
  return audio
72
 
73
+ def synthesize_streaming(self, text: str, ref_audio_path: str, reference_text: str) -> AsyncGenerator[np.ndarray, None]:
74
+ """True streaming synthesis with optimal chunking."""
75
+ start_time = time.time()
 
 
 
 
 
 
76
 
77
+ # Encode reference once
78
+ ref_s = self.encode_reference_audio(ref_audio_path)
79
+ encoding_time = time.time() - start_time
80
+ logger.info(f"🔧 Reference encoded: {encoding_time:.2f}s")
 
 
 
81
 
82
+ # Smart text chunking for optimal performance
83
+ chunks = self._optimized_text_chunking(text)
84
+ logger.info(f"📝 Split into {len(chunks)} chunks")
85
 
86
+ # Stream chunks
87
+ for i, chunk in enumerate(chunks):
88
+ chunk_start = time.time()
89
+ with torch.no_grad():
90
+ audio_chunk = self.tts_model.infer(chunk, ref_s, reference_text)
 
 
 
 
 
 
 
 
91
 
92
+ chunk_time = time.time() - chunk_start
93
+ logger.info(f"🎵 Chunk {i+1}/{len(chunks)}: {chunk_time:.2f}s")
94
+ yield audio_chunk
 
 
 
 
 
 
95
 
96
+ total_time = time.time() - start_time
97
+ logger.info(f"✅ Streaming complete: {total_time:.2f}s")
98
 
99
+ def _optimized_text_chunking(self, text: str, max_chars: int = 200) -> list[str]:
100
+ """Optimized chunking for TTS performance."""
101
+ if len(text) <= max_chars:
102
+ return [text]
103
+
104
+ # Split by sentences first, then by length
105
+ sentences = [s.strip() for s in text.split('.') if s.strip()]
106
+ chunks = []
107
+ current_chunk = ""
108
+
109
+ for sentence in sentences:
110
+ if len(current_chunk) + len(sentence) + 1 <= max_chars:
111
+ current_chunk += (" " + sentence) if current_chunk else sentence
112
+ else:
113
+ if current_chunk:
114
+ chunks.append(current_chunk)
115
+ current_chunk = sentence
116
+
117
+ if current_chunk:
118
+ chunks.append(current_chunk)
119
+
120
+ return chunks if chunks else [text]
121
 
122
+ def audio_to_bytes(self, audio_data: np.ndarray, audio_format: str) -> bytes:
123
+ """Convert audio to bytes efficiently."""
124
+ audio_buffer = io.BytesIO()
125
+ sf.write(audio_buffer, audio_data, SAMPLE_RATE, format=audio_format)
126
+ return audio_buffer.getvalue()
 
 
 
 
 
 
 
127
 
128
+ # Global instances for performance
129
+ tts_wrapper = HighPerformanceTTSWrapper(device=DEVICE)
130
+ executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
 
 
 
 
 
131
 
132
+ # FastAPI app with minimal overhead
 
 
 
 
 
 
 
 
 
133
  @asynccontextmanager
134
  async def lifespan(app: FastAPI):
135
+ yield # Model already loaded
136
+ executor.shutdown(wait=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
+ app = FastAPI(lifespan=lifespan, title="NeuTTS High-Performance API")
139
+ app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
+ # Performance monitoring
142
+ @app.get("/performance")
143
+ async def performance_status():
 
 
 
 
 
144
  return {
145
+ "status": "operational",
146
+ "model_loaded": tts_wrapper.tts_model is not None,
147
  "device": DEVICE,
148
+ "max_workers": MAX_WORKERS,
149
+ "reference_cache_size": len(tts_wrapper._ref_cache)
 
 
 
 
 
 
 
 
150
  }
151
 
152
+ # High-performance file operations
153
+ async def save_and_convert_audio(upload_file: UploadFile) -> str:
154
+ """Save and convert audio in one efficient operation."""
155
+ # Create temp file
156
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir=TEMP_AUDIO_DIR) as tmp:
157
+ temp_wav_path = tmp.name
158
+
159
+ try:
160
+ # Save uploaded file temporarily
161
+ temp_upload_path = f"{temp_wav_path}.upload"
162
+ async with aiofiles.open(temp_upload_path, 'wb') as f:
163
+ content = await upload_file.read() # Read once
164
+ await f.write(content)
165
+
166
+ # Convert to WAV using subprocess (most efficient)
167
+ cmd = [
168
+ "ffmpeg", "-y", "-i", temp_upload_path,
169
+ "-f", "wav", "-ar", str(SAMPLE_RATE), "-ac", "1",
170
+ "-c:a", "pcm_s16le", temp_wav_path
171
+ ]
172
+
173
+ process = await asyncio.create_subprocess_exec(*cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
174
+ await process.wait()
175
+
176
+ # Cleanup upload file
177
+ if os.path.exists(temp_upload_path):
178
+ os.unlink(temp_upload_path)
179
+
180
+ return temp_wav_path
181
+
182
+ except Exception as e:
183
+ # Cleanup on error
184
+ if os.path.exists(temp_wav_path):
185
+ os.unlink(temp_wav_path)
186
+ if 'temp_upload_path' in locals() and os.path.exists(temp_upload_path):
187
+ os.unlink(temp_upload_path)
188
+ raise e
189
+
190
+ async def cleanup_file(path: str):
191
+ """Async file cleanup."""
192
+ try:
193
+ if os.path.exists(path):
194
+ os.unlink(path)
195
+ except:
196
+ pass
197
+
198
+ # High-performance endpoints
199
  @app.post("/synthesize", response_class=Response)
200
+ async def synthesize_speech(
201
  text: str = Form(...),
202
  reference_text: str = Form(...),
203
+ output_format: str = Form("wav"),
 
204
  reference_audio: UploadFile = File(...),
205
  background_tasks: BackgroundTasks = None
206
  ):
207
+ """High-performance complete synthesis."""
 
 
 
 
 
208
  start_time = time.time()
209
+ temp_path = None
 
210
 
211
  try:
212
+ # 1. Process audio (fast)
213
+ temp_path = await save_and_convert_audio(reference_audio)
214
+ process_time = time.time() - start_time
215
+ logger.info(f"📁 Audio processed: {process_time:.2f}s")
 
216
 
217
+ # 2. Synthesize (blocking but efficient)
218
+ audio_data = await asyncio.get_event_loop().run_in_executor(
219
+ executor,
220
+ tts_wrapper.synthesize_complete,
221
+ text, temp_path, reference_text
222
  )
223
 
224
+ # 3. Convert to bytes
225
+ audio_bytes = tts_wrapper.audio_to_bytes(audio_data, output_format)
 
 
 
 
 
 
 
 
 
 
 
226
 
227
+ total_time = time.time() - start_time
228
+ logger.info(f"✅ Complete request: {total_time:.2f}s")
229
 
230
  return Response(
231
  content=audio_bytes,
232
  media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
233
  headers={
234
+ "X-Processing-Time": f"{total_time:.2f}s",
235
+ "X-Audio-Length": f"{len(audio_data)/SAMPLE_RATE:.2f}s"
 
 
236
  }
237
  )
238
 
239
  except Exception as e:
240
  logger.error(f"Synthesis error: {e}")
241
+ raise HTTPException(status_code=500, detail=str(e))
 
 
242
  finally:
243
+ if temp_path:
244
+ if background_tasks:
245
+ background_tasks.add_task(cleanup_file, temp_path)
246
+ else:
247
+ await cleanup_file(temp_path)
 
 
 
 
 
 
 
248
 
249
  @app.post("/synthesize/stream")
250
+ async def stream_speech(
251
+ text: str = Form(...),
252
  reference_text: str = Form(...),
253
+ output_format: str = Form("mp3"),
 
254
  reference_audio: UploadFile = File(...)
255
  ):
256
+ """True streaming with immediate delivery."""
257
+ start_time = time.time()
258
+ temp_path = None
 
 
 
 
 
259
 
260
  try:
261
+ # Process audio first
262
+ temp_path = await save_and_convert_audio(reference_audio)
263
+ setup_time = time.time() - start_time
264
+ logger.info(f"🎯 Streaming setup: {setup_time:.2f}s")
 
 
 
 
265
 
266
+ async def generate_stream():
267
+ """True streaming generator."""
 
268
  try:
269
+ first_chunk_sent = False
270
  chunk_count = 0
271
 
272
+ # Get the async generator
273
+ audio_chunks = tts_wrapper.synthesize_streaming(text, temp_path, reference_text)
274
+
275
+ # Stream chunks immediately as they're generated
276
+ async for audio_chunk in audio_chunks:
277
  chunk_count += 1
278
 
279
+ # Convert to bytes
280
+ chunk_bytes = tts_wrapper.audio_to_bytes(audio_chunk, output_format)
 
 
281
 
282
+ # Track first chunk timing
283
+ if not first_chunk_sent:
284
+ first_chunk_time = time.time() - start_time
285
+ logger.info(f"🚀 FIRST CHUNK SENT: {first_chunk_time:.2f}s")
286
+ first_chunk_sent = True
287
 
288
+ logger.info(f"📦 Yielding chunk {chunk_count} ({len(chunk_bytes)} bytes)")
289
+ yield chunk_bytes
290
+
291
+ total_time = time.time() - start_time
292
+ logger.info(f"🎉 Streaming completed: {total_time:.2f}s, {chunk_count} chunks")
293
+
294
  except Exception as e:
295
+ logger.error(f"Stream error: {e}")
296
  raise
297
  finally:
298
+ # Cleanup
299
+ if temp_path:
300
+ await cleanup_file(temp_path)
301
 
 
302
  return StreamingResponse(
303
+ generate_stream(),
304
  media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
305
  headers={
306
+ "Content-Disposition": "attachment; filename=stream.mp3",
307
  "Transfer-Encoding": "chunked",
308
  "Cache-Control": "no-cache",
 
309
  "X-Streaming": "true"
310
  }
311
  )
312
 
313
  except Exception as e:
314
+ logger.error(f"Stream setup error: {e}")
315
+ if temp_path:
316
+ await cleanup_file(temp_path)
317
+ raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
+ @app.get("/")
320
+ async def root():
321
+ return {"message": "NeuTTS High-Performance API - Optimized for Speed"}
 
 
 
 
 
 
322
 
323
  if __name__ == "__main__":
324
  import uvicorn
325
  uvicorn.run(
326
+ app,
327
  host="0.0.0.0",
328
  port=7860,
329
+ workers=1,
330
  loop="asyncio",
331
+ access_log=False, # Disable access logs for performance
332
+ log_level="warning"
333
  )