Rajhuggingface4253 committed on
Commit
b01e4fa
·
verified ·
1 Parent(s): ac0fe7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -52
app.py CHANGED
@@ -6,6 +6,8 @@ import shutil
6
  import numpy as np
7
  import psutil
8
  import soundfile as sf
 
 
9
  from concurrent.futures import ThreadPoolExecutor
10
  from typing import Optional, Generator
11
  from contextlib import asynccontextmanager
@@ -46,6 +48,63 @@ class TTSRequestModel(BaseModel):
46
  speed: float = Field(default=1.0, ge=0.5, le=2.0)
47
  output_format: str = Field(default="wav", pattern="^(wav|mp3|flac)$")
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # --- Model Wrapper and Logic ---
50
 
51
  class NeuTTSWrapper:
@@ -245,45 +304,49 @@ async def text_to_speech(
245
  text: str = Form(...),
246
  speed: float = Form(1.0, ge=0.5, le=2.0),
247
  output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
248
- reference_audio: UploadFile = File(...)
249
- ):
250
  """
251
  Standard blocking TTS endpoint with Multi-Format Output (Kokoro Feature).
252
- Uses ThreadPoolExecutor for non-blocking API responsiveness.
253
  """
254
  if not hasattr(app.state, 'tts_wrapper'):
255
  raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
256
-
257
- # 1. Asynchronously save reference audio
258
  temp_ref_path = await save_upload_file_async(reference_audio)
 
259
  start_time = time.time()
260
-
261
  try:
262
- # 2. Offload the ENTIRE blocking process (encode + infer) to a thread
 
 
 
 
 
 
263
  audio_data = await run_blocking_task_async(
264
  app.state.tts_wrapper.generate_speech_blocking,
265
  text,
266
- temp_ref_path
267
  )
268
-
269
- # 3. Convert to requested format (Blocking, but usually fast)
270
  audio_bytes = await run_blocking_task_async(
271
  app.state.tts_wrapper._convert_to_streamable_format,
272
  audio_data,
273
  output_format
274
  )
275
-
276
- # 4. Save to disk (Original NeuTTS requirement)
277
  audio_filename = f"tts_{time.time()}.{output_format}"
278
  final_path = os.path.join(GENERATED_AUDIO_DIR, audio_filename)
279
- # We perform the file write operation in a blocking manner inside the thread pool.
280
  await run_blocking_task_async(
281
  lambda: open(final_path, 'wb').write(audio_bytes)
282
  )
283
-
284
  processing_time = time.time() - start_time
285
  audio_duration = len(audio_data) / SAMPLE_RATE
286
-
287
  return Response(
288
  content=audio_bytes,
289
  media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
@@ -293,61 +356,80 @@ async def text_to_speech(
293
  "X-Audio-Duration": f"{audio_duration:.2f}s"
294
  }
295
  )
296
-
297
  except Exception as e:
298
  logger.error(f"Synthesis error: {e}")
 
 
 
299
  raise HTTPException(status_code=500, detail=f"Synthesis failed: {e}")
300
  finally:
301
- # 5. Clean up the temporary reference file
302
  if os.path.exists(temp_ref_path):
303
  os.unlink(temp_ref_path)
 
 
304
 
305
  @app.post("/synthesize/stream")
306
  async def stream_text_to_speech_cloning(
307
- text: str = Form(..., min_length=1, max_length=5000), # Increased limit for streaming
308
  speed: float = Form(1.0, ge=0.5, le=2.0),
309
- output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"), # MP3 is best for streaming
310
- reference_audio: UploadFile = File(...)
311
- ):
312
  """
313
  Sentence-by-Sentence Streaming Endpoint (Kokoro Feature adaptation).
314
- Performs encoding once, then synthesizes and streams chunks.
315
  """
316
  if not hasattr(app.state, 'tts_wrapper'):
317
  raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
318
-
319
  # 1. Asynchronously save reference audio (non-blocking)
320
  temp_ref_path = await save_upload_file_async(reference_audio)
 
321
 
322
- # 2. Define the generator function, which will run in the thread pool implicitly
323
- def stream_generator():
324
- try:
325
- # The entire streaming process runs blocking inside the thread pool
326
- for chunk_bytes in app.state.tts_wrapper.stream_speech_blocking(
327
- text,
328
- temp_ref_path,
329
- speed,
330
- output_format
331
- ):
332
- yield chunk_bytes
333
- except Exception as e:
334
- logger.error(f"Streaming generator error: {e}")
335
- # Raise an exception if necessary, though it might break the stream
336
- finally:
337
- # 3. Cleanup the temporary reference file after the stream is done
338
- if os.path.exists(temp_ref_path):
339
- os.unlink(temp_ref_path)
340
-
341
- # The StreamingResponse handles the transfer encoding and chunking
342
- return StreamingResponse(
343
- stream_generator(),
344
- media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
345
- headers={
346
- "Content-Disposition": "attachment; filename=tts_live_stream.mp3",
347
- "Transfer-Encoding": "chunked",
348
- "Cache-Control": "no-cache"
349
- }
350
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
  @app.get("/audio/{filename}")
353
  async def get_audio(filename: str):
 
6
  import numpy as np
7
  import psutil
8
  import soundfile as sf
9
+ import subprocess
10
+ import tempfile
11
  from concurrent.futures import ThreadPoolExecutor
12
  from typing import Optional, Generator
13
  from contextlib import asynccontextmanager
 
48
  speed: float = Field(default=1.0, ge=0.5, le=2.0)
49
  output_format: str = Field(default="wav", pattern="^(wav|mp3|flac)$")
50
 
51
+
52
def convert_to_wav_blocking(input_path: str) -> str:
    """
    Convert an uploaded audio file to a mono, 16-bit PCM WAV at SAMPLE_RATE using FFmpeg.

    This call blocks while the FFmpeg subprocess runs (up to 30 seconds), so it
    should be dispatched through the thread pool rather than called directly
    from an async handler.

    Args:
        input_path: Path of the audio file to convert (any container/codec
            FFmpeg can decode, e.g. WebM or MP4).

    Returns:
        Path of the newly written WAV file inside TEMP_AUDIO_DIR. The caller
        owns this file and is responsible for deleting it.

    Raises:
        HTTPException: 400 if FFmpeg exits with a non-zero status, 504 if it
            exceeds the 30 second timeout, 500 for any other failure.
    """
    # Reserve a unique output name. The handle is closed when the `with` block
    # exits; delete=False keeps the (empty) file so FFmpeg can overwrite it (-y).
    with tempfile.NamedTemporaryFile(suffix=".wav", dir=TEMP_AUDIO_DIR, delete=False) as tmp:
        output_path = tmp.name

    logger.info(f"Converting '{os.path.basename(input_path)}' to WAV (24kHz, mono) at {os.path.basename(output_path)}")

    command = [
        "ffmpeg",
        "-y",                    # overwrite the reserved output file
        "-i", input_path,        # input file
        "-f", "wav",             # WAV container
        "-ar", str(SAMPLE_RATE), # target sample rate
        "-ac", "1",              # mono
        "-c:a", "pcm_s16le",     # uncompressed 16-bit PCM
        output_path,
    ]

    def _discard_output() -> None:
        # Never leave a partial or empty WAV behind after a failed conversion.
        if os.path.exists(output_path):
            os.unlink(output_path)

    try:
        subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
            timeout=30,                # guard against runaway FFmpeg processes
            stdin=subprocess.DEVNULL,  # FFmpeg must not read the server's stdin
        )
    except subprocess.CalledProcessError as e:
        logger.error(f"FFmpeg conversion failed: {e.stderr}")
        _discard_output()
        # Surface only the last line of FFmpeg's stderr to the client.
        error_detail = e.stderr.splitlines()[-1] if e.stderr else "Unknown FFmpeg error."
        raise HTTPException(status_code=400, detail=f"Audio format conversion failed: {error_detail}") from e
    except subprocess.TimeoutExpired as e:
        logger.error("FFmpeg conversion timed out.")
        _discard_output()
        raise HTTPException(status_code=504, detail="Audio conversion timed out after 30 seconds.") from e
    except Exception as e:
        logger.error(f"General conversion error: {e}")
        _discard_output()
        raise HTTPException(status_code=500, detail="An unexpected error occurred during audio conversion.") from e

    logger.info("FFmpeg conversion successful.")
    return output_path
108
  # --- Model Wrapper and Logic ---
109
 
110
  class NeuTTSWrapper:
 
304
  text: str = Form(...),
305
  speed: float = Form(1.0, ge=0.5, le=2.0),
306
  output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
307
+ reference_audio: UploadFile = File(...)):
 
308
  """
309
  Standard blocking TTS endpoint with Multi-Format Output (Kokoro Feature).
310
+ Includes FFmpeg conversion for uploaded audio format compatibility.
311
  """
312
  if not hasattr(app.state, 'tts_wrapper'):
313
  raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
314
+
315
+ # 1. Asynchronously save reference audio (original upload)
316
  temp_ref_path = await save_upload_file_async(reference_audio)
317
+ converted_wav_path = None # NEW: Initialize for cleanup
318
  start_time = time.time()
319
+
320
  try:
321
+ # 2. **NEW STEP**: Convert the uploaded file (WebM, etc.) to a 24kHz WAV file using FFmpeg
322
+ converted_wav_path = await run_blocking_task_async(
323
+ convert_to_wav_blocking,
324
+ temp_ref_path
325
+ )
326
+
327
+ # 3. Offload the ENTIRE blocking process (encode + infer) to a thread
328
  audio_data = await run_blocking_task_async(
329
  app.state.tts_wrapper.generate_speech_blocking,
330
  text,
331
+ converted_wav_path # IMPORTANT: Pass the CONVERTED WAV path
332
  )
333
+
334
+ # 4. Convert to requested format (Blocking, but usually fast)
335
  audio_bytes = await run_blocking_task_async(
336
  app.state.tts_wrapper._convert_to_streamable_format,
337
  audio_data,
338
  output_format
339
  )
340
+
341
+ # 5. Save to disk (Original NeuTTS requirement)
342
  audio_filename = f"tts_{time.time()}.{output_format}"
343
  final_path = os.path.join(GENERATED_AUDIO_DIR, audio_filename)
 
344
  await run_blocking_task_async(
345
  lambda: open(final_path, 'wb').write(audio_bytes)
346
  )
347
+
348
  processing_time = time.time() - start_time
349
  audio_duration = len(audio_data) / SAMPLE_RATE
 
350
  return Response(
351
  content=audio_bytes,
352
  media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
 
356
  "X-Audio-Duration": f"{audio_duration:.2f}s"
357
  }
358
  )
 
359
  except Exception as e:
360
  logger.error(f"Synthesis error: {e}")
361
+ # Reraise HTTPExceptions that may have come from the conversion step
362
+ if isinstance(e, HTTPException):
363
+ raise
364
  raise HTTPException(status_code=500, detail=f"Synthesis failed: {e}")
365
  finally:
366
+ # 6. Clean up BOTH the original file AND the converted WAV file
367
  if os.path.exists(temp_ref_path):
368
  os.unlink(temp_ref_path)
369
+ if converted_wav_path and os.path.exists(converted_wav_path):
370
+ os.unlink(converted_wav_path)
371
 
372
  @app.post("/synthesize/stream")
373
  async def stream_text_to_speech_cloning(
374
+ text: str = Form(..., min_length=1, max_length=5000),
375
  speed: float = Form(1.0, ge=0.5, le=2.0),
376
+ output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
377
+ reference_audio: UploadFile = File(...)):
 
378
  """
379
  Sentence-by-Sentence Streaming Endpoint (Kokoro Feature adaptation).
380
+ Includes FFmpeg conversion for uploaded audio format compatibility.
381
  """
382
  if not hasattr(app.state, 'tts_wrapper'):
383
  raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
384
+
385
  # 1. Asynchronously save reference audio (non-blocking)
386
  temp_ref_path = await save_upload_file_async(reference_audio)
387
+ converted_wav_path = None # NEW: Initialize for cleanup
388
 
389
+ try:
390
+ # 2. **NEW STEP**: Convert the uploaded file (WebM, etc.) to a 24kHz WAV file using FFmpeg
391
+ converted_wav_path = await run_blocking_task_async(
392
+ convert_to_wav_blocking,
393
+ temp_ref_path
394
+ )
395
+
396
+ # 3. Define the generator function, which will run in the thread pool implicitly
397
+ def stream_generator():
398
+ try:
399
+ # The entire streaming process runs blocking inside the thread pool
400
+ for chunk_bytes in app.state.tts_wrapper.stream_speech_blocking(
401
+ text,
402
+ converted_wav_path, # IMPORTANT: Pass the CONVERTED WAV path
403
+ speed,
404
+ output_format
405
+ ):
406
+ yield chunk_bytes
407
+ except Exception as e:
408
+ logger.error(f"Streaming generator error: {e}")
409
+ # Note: Cleanup for converted_wav_path is handled in the main finally block below.
410
+
411
+ # The StreamingResponse is returned immediately to start the stream
412
+ return StreamingResponse(
413
+ stream_generator(),
414
+ media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
415
+ headers={
416
+ "Content-Disposition": "attachment; filename=tts_live_stream.mp3",
417
+ "Transfer-Encoding": "chunked",
418
+ "Cache-Control": "no-cache"
419
+ }
420
+ )
421
+ except Exception as e:
422
+ logger.error(f"Streaming setup error: {e}")
423
+ # Reraise HTTPExceptions that may have come from the conversion step
424
+ if isinstance(e, HTTPException):
425
+ raise
426
+ raise HTTPException(status_code=500, detail=f"Streaming synthesis failed: {e}")
427
+ finally:
428
+ # 4. Clean up BOTH the original file AND the converted WAV file
429
+ if os.path.exists(temp_ref_path):
430
+ os.unlink(temp_ref_path)
431
+ if converted_wav_path and os.path.exists(converted_wav_path):
432
+ os.unlink(converted_wav_path)
433
 
434
  @app.get("/audio/{filename}")
435
  async def get_audio(filename: str):