Rajhuggingface4253 commited on
Commit
b307da8
·
verified ·
1 Parent(s): 2c4e22c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -44
app.py CHANGED
@@ -21,12 +21,11 @@ from pydantic import BaseModel, Field
21
  import re
22
  import hashlib
23
  from functools import lru_cache
24
- import queue
25
  # Ensure the cloned neutts-air repository is in the path
26
  import sys
27
  sys.path.append(os.path.join(os.getcwd(), 'neutts-air'))
28
  from neuttsair.neutts import NeuTTSAir
29
- from threading import Thread
30
  # Configure logging
31
  logging.basicConfig(level=logging.INFO)
32
  logger = logging.getLogger("NeuTTS-API")
@@ -36,7 +35,7 @@ logger = logging.getLogger("NeuTTS-API")
36
  # Explicitly use CPU as per Dockerfile and Hugging Face free tier compatibility
37
  DEVICE = "cpu"
38
  # Configure Max Workers for concurrent synthesis threads (1-2 is safe for CPU-only)
39
- MAX_WORKERS = 1
40
  tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
41
  SAMPLE_RATE = 24000
42
  CLEANUP_THRESHOLD = 300 # 1 hour in seconds
@@ -351,56 +350,33 @@ async def stream_text_to_speech_cloning(
351
  output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
352
  reference_audio: UploadFile = File(...)):
353
  """
354
- Sentence-by-Sentence Streaming using a parallel producer-consumer pipeline
355
- to ensure continuous, low-latency audio flow.
356
  """
357
  if not hasattr(app.state, 'tts_wrapper'):
358
  raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
359
 
360
  try:
 
361
  converted_wav_buffer = await convert_to_wav_in_memory(reference_audio)
362
  ref_audio_bytes = converted_wav_buffer.getvalue()
363
 
 
364
  def stream_generator():
365
- q = queue.Queue(maxsize=2)
366
-
367
- def producer():
368
- try:
369
- audio_hash = hashlib.sha256(ref_audio_bytes).hexdigest()
370
- ref_s = app.state.tts_wrapper._get_or_create_reference_encoding(audio_hash, ref_audio_bytes)
371
-
372
- sentences = app.state.tts_wrapper._split_text_into_chunks(text)
373
-
374
- for sentence in sentences:
375
- with torch.no_grad():
376
- audio_chunk = app.state.tts_wrapper.tts_model.infer(sentence, ref_s, reference_text)
377
- q.put(audio_chunk)
378
-
379
- except Exception as e:
380
- logger.error(f"Error in producer thread: {e}")
381
- q.put(e)
382
- finally:
383
- q.put(None)
384
-
385
- # === THIS IS THE FIX ===
386
- # Start the producer in a standard, separate thread.
387
- # This avoids the asyncio loop error.
388
- producer_thread = Thread(target=producer)
389
- producer_thread.start()
390
- # =======================
391
-
392
- while True:
393
- result = q.get()
394
-
395
- if result is None:
396
- break
397
-
398
- if isinstance(result, Exception):
399
- logger.error(f"Terminating stream due to producer error: {result}")
400
- raise result
401
-
402
- yield app.state.tts_wrapper._convert_to_streamable_format(result, output_format)
403
-
404
  return StreamingResponse(
405
  stream_generator(),
406
  media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}"
@@ -411,6 +387,7 @@ async def stream_text_to_speech_cloning(
411
  if isinstance(e, HTTPException):
412
  raise
413
  raise HTTPException(status_code=500, detail=f"Streaming synthesis failed: {e}")
 
414
 
415
  @app.get("/audio/{filename}")
416
  async def get_audio(filename: str):
 
21
  import re
22
  import hashlib
23
  from functools import lru_cache
 
24
  # Ensure the cloned neutts-air repository is in the path
25
  import sys
26
  sys.path.append(os.path.join(os.getcwd(), 'neutts-air'))
27
  from neuttsair.neutts import NeuTTSAir
28
+
29
  # Configure logging
30
  logging.basicConfig(level=logging.INFO)
31
  logger = logging.getLogger("NeuTTS-API")
 
35
  # Explicitly use CPU as per Dockerfile and Hugging Face free tier compatibility
36
  DEVICE = "cpu"
37
  # Configure Max Workers for concurrent synthesis threads (1-2 is safe for CPU-only)
38
+ MAX_WORKERS = 2
39
  tts_executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
40
  SAMPLE_RATE = 24000
41
  CLEANUP_THRESHOLD = 300 # 1 hour in seconds
 
350
  output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
351
  reference_audio: UploadFile = File(...)):
352
  """
353
+ Sentence-by-Sentence Streaming with in-memory processing and caching.
 
354
  """
355
  if not hasattr(app.state, 'tts_wrapper'):
356
  raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
357
 
358
  try:
359
+ # 1. Convert the uploaded file to WAV directly in memory
360
  converted_wav_buffer = await convert_to_wav_in_memory(reference_audio)
361
  ref_audio_bytes = converted_wav_buffer.getvalue()
362
 
363
+ # 2. The generator now runs in the thread pool, using the audio bytes
364
  def stream_generator():
365
+ try:
366
+ for chunk_bytes in app.state.tts_wrapper.stream_speech_blocking(
367
+ text,
368
+ ref_audio_bytes, # Pass bytes, not a path
369
+ reference_text,
370
+ speed,
371
+ output_format
372
+ ):
373
+ yield chunk_bytes
374
+ except Exception as e:
375
+ logger.error(f"Streaming generator error: {e}")
376
+ # This ensures the stream terminates on an error
377
+ raise
378
+
379
+ # Return StreamingResponse with the generator
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  return StreamingResponse(
381
  stream_generator(),
382
  media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}"
 
387
  if isinstance(e, HTTPException):
388
  raise
389
  raise HTTPException(status_code=500, detail=f"Streaming synthesis failed: {e}")
390
+ # Note: The outer 'finally' block is now removed as its logic is handled in 2.5 and 4.
391
 
392
  @app.get("/audio/{filename}")
393
  async def get_audio(filename: str):