Rajhuggingface4253 committed on
Commit 69ddef5 · verified · 1 Parent(s): ab0a603

Update app.py

Files changed (1)
  1. app.py +9 -19
app.py CHANGED
@@ -35,7 +35,7 @@ logger = logging.getLogger("NeuTTS-API")
 # Explicitly use CPU as per Dockerfile and Hugging Face free tier compatibility
 DEVICE = "cpu"
 # Configure Max Workers for concurrent synthesis threads (1-2 is safe for CPU-only)
-MAX_WORKERS = 2
+MAX_WORKERS = 3
 tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
 SAMPLE_RATE = 24000
 CLEANUP_THRESHOLD = 300 # 1 hour in seconds
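Side note on this hunk: the value moves from 2 to 3 while the adjacent comment still calls 1-2 safe for CPU-only deployments. If the extra worker oversubscribes the CPU, one mitigation (not part of this commit; the vCPU count below is a placeholder) is to cap PyTorch's intra-op thread count so the pool's workers share the cores predictably:

```python
import torch
from concurrent.futures import ThreadPoolExecutor

MAX_WORKERS = 3   # value set by this commit
VCPU_COUNT = 2    # placeholder: set to the vCPUs the Space actually exposes

# Each synthesis worker runs torch inference; keeping
# MAX_WORKERS * torch threads near VCPU_COUNT avoids oversubscription.
torch.set_num_threads(max(1, VCPU_COUNT // MAX_WORKERS))

tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
```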
@@ -94,7 +94,6 @@ class NeuTTSWrapper:
     def __init__(self, device: str = "cpu"):
         self.tts_model = None
         self.device = device
-        self.encoding_cache = {}
         self.load_model()
 
     def load_model(self):
@@ -352,7 +351,7 @@ async def stream_text_to_speech_cloning(
     reference_audio: UploadFile = File(...)):
     """
     Sentence-by-Sentence Streaming using a high-performance, asyncio-native
-    producer-consumer pipeline. This overlaps CPU-bound AI work with network I/O.
+    producer-consumer pipeline.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
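For readers of the diff: the docstring's "producer-consumer pipeline" refers to the pattern visible in the next hunk, where a bounded asyncio.Queue decouples CPU-bound synthesis from network I/O. A minimal, self-contained sketch of that pattern (placeholder names only; the real code runs NeuTTS inference and encodes audio instead of the stand-in below) might look like:

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=2)

def synthesize(sentence: str) -> bytes:
    # Stand-in for the CPU-bound tts_model.infer(...) call.
    return sentence.encode("utf-8")

async def stream_sentences(sentences):
    loop = asyncio.get_event_loop()
    q: asyncio.Queue = asyncio.Queue(maxsize=2)  # small buffer keeps memory bounded

    async def producer():
        try:
            for s in sentences:
                # Schedule the CPU-bound work on the thread pool and enqueue the
                # future, so synthesizing the next chunk overlaps sending this one.
                await q.put(loop.run_in_executor(executor, synthesize, s))
        except Exception as exc:
            await q.put(exc)      # surface producer errors to the consumer
        finally:
            await q.put(None)     # sentinel: nothing more will be produced

    asyncio.create_task(producer())

    while True:
        item = await q.get()
        if item is None:
            break
        if isinstance(item, Exception):
            raise item
        yield await item          # wait for this chunk, then hand it to the caller

async def main():
    async for chunk in stream_sentences(["First sentence.", "Second sentence."]):
        print(len(chunk), "bytes")

asyncio.run(main())
```

Because each chunk is scheduled before the previous one finishes streaming, synthesis of sentence N+1 overlaps the network transfer of sentence N, which is the overlap the trimmed docstring sentence described.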
@@ -361,29 +360,22 @@ async def stream_text_to_speech_cloning(
     loop = asyncio.get_event_loop()
     q = asyncio.Queue(maxsize=2)
 
-    # The PRODUCER's job is to quickly schedule work, not wait for it.
     async def producer():
         try:
             converted_wav_buffer = await convert_to_wav_in_memory(reference_audio)
             ref_audio_bytes = converted_wav_buffer.getvalue()
             audio_hash = hashlib.sha256(ref_audio_bytes).hexdigest()
 
-            # Check cache for reference encoding
-            if audio_hash in app.state.tts_wrapper.encoding_cache:
-                logger.info(f"Streaming Cache HIT for hash: {audio_hash[:10]}...")
-                ref_s = app.state.tts_wrapper.encoding_cache[audio_hash]
-            else:
-                logger.info(f"Streaming Cache MISS for hash: {audio_hash[:10]}...")
-                ref_s = await loop.run_in_executor(
-                    tts_executor,
-                    app.state.tts_wrapper.get_reference_encoding,
-                    ref_audio_bytes
-                )
-                app.state.tts_wrapper.encoding_cache[audio_hash] = ref_s
+            # Use LRU cache like blocking endpoint
+            ref_s = await loop.run_in_executor(
+                tts_executor,
+                app.state.tts_wrapper._get_or_create_reference_encoding,
+                audio_hash,
+                ref_audio_bytes
+            )
 
             sentences = app.state.tts_wrapper._split_text_into_chunks(text)
 
-            # This function does the heavy lifting for one chunk.
             def process_chunk(sentence_text):
                 with torch.no_grad():
                     audio_chunk = app.state.tts_wrapper.tts_model.infer(sentence_text, ref_s, reference_text)
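The producer now delegates to a `_get_or_create_reference_encoding(audio_hash, ref_audio_bytes)` helper in place of the ad-hoc dict removed from `__init__`. That helper's implementation is not part of this diff; an LRU-bounded version of the old cache could look roughly like the sketch below (the class name, attribute name, size limit, and the stand-in encoder are assumptions):

```python
from collections import OrderedDict

class EncodingCacheSketch:
    """Illustrative only: an LRU-bounded reference-encoding cache."""

    MAX_CACHED_ENCODINGS = 8  # assumed bound; not shown in the diff

    def __init__(self):
        self._encoding_lru: "OrderedDict[str, object]" = OrderedDict()

    def get_reference_encoding(self, ref_audio_bytes: bytes):
        # Stand-in for the real encoder call on the NeuTTS wrapper.
        return len(ref_audio_bytes)

    def _get_or_create_reference_encoding(self, audio_hash: str, ref_audio_bytes: bytes):
        if audio_hash in self._encoding_lru:
            # Hit: refresh recency so this entry is evicted last.
            self._encoding_lru.move_to_end(audio_hash)
            return self._encoding_lru[audio_hash]
        # Miss: encode once, store, and evict the least recently used entry.
        ref_s = self.get_reference_encoding(ref_audio_bytes)
        self._encoding_lru[audio_hash] = ref_s
        if len(self._encoding_lru) > self.MAX_CACHED_ENCODINGS:
            self._encoding_lru.popitem(last=False)
        return ref_s
```

Unlike the removed `encoding_cache` dict, this variant bounds memory, which matters when many different reference clips are uploaded to a long-running Space.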
@@ -408,7 +400,6 @@ async def stream_text_to_speech_cloning(
             if result is None:
                 break
 
-            # Check if the item in the queue is a task (future) or an exception
             if isinstance(result, Exception):
                 logger.error(f"Terminating stream due to producer error: {result}")
                 raise result
@@ -423,7 +414,6 @@ async def stream_text_to_speech_cloning(
         stream_generator(),
         media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}"
     )
-    # Note: The outer 'finally' block is now removed as its logic is handled in 2.5 and 4.
 
 @app.get("/audio/{filename}")
 async def get_audio(filename: str):
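As a usage sketch, a client can consume the streaming clone endpoint chunk by chunk. The URL and route path below are assumptions (the @app.post(...) decorator is not part of this diff); only the `text`, `reference_text`, `reference_audio`, and `output_format` fields appear in the changed code, so adjust names to the actual app:

```python
import requests

# Hypothetical URL and route; the endpoint decorator is not shown in this diff.
url = "http://localhost:7860/stream-clone"

with open("reference.wav", "rb") as ref:
    resp = requests.post(
        url,
        data={
            "text": "Hello there, this is a streamed clone test.",
            "reference_text": "Transcript of the reference clip.",
            "output_format": "mp3",
        },
        files={"reference_audio": ("reference.wav", ref, "audio/wav")},
        stream=True,  # consume the StreamingResponse incrementally
    )
    resp.raise_for_status()
    with open("output.mp3", "wb") as out:
        for chunk in resp.iter_content(chunk_size=8192):
            out.write(chunk)
```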
 