Spaces:

Rajhuggingface4253
/

neu

Paused

App Files Files Community

Rajhuggingface4253 committed on Oct 18

Commit

b01e4fa

verified ·

1 Parent(s): ac0fe7c

Update app.py

Browse files

Files changed (1) hide show

app.py +134 -52

app.py CHANGED Viewed

@@ -6,6 +6,8 @@ import shutil
 import numpy as np
 import psutil
 import soundfile as sf
 from concurrent.futures import ThreadPoolExecutor
 from typing import Optional, Generator
 from contextlib import asynccontextmanager
@@ -46,6 +48,63 @@ class TTSRequestModel(BaseModel):
     speed: float = Field(default=1.0, ge=0.5, le=2.0)
     output_format: str = Field(default="wav", pattern="^(wav|mp3|flac)$")
 # --- Model Wrapper and Logic ---
 class NeuTTSWrapper:
@@ -245,45 +304,49 @@ async def text_to_speech(
     text: str = Form(...),
     speed: float = Form(1.0, ge=0.5, le=2.0),
     output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
-    reference_audio: UploadFile = File(...)
-):
     """
     Standard blocking TTS endpoint with Multi-Format Output (Kokoro Feature).
-    Uses ThreadPoolExecutor for non-blocking API responsiveness.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
-    # 1. Asynchronously save reference audio
     temp_ref_path = await save_upload_file_async(reference_audio)
     start_time = time.time()
     try:
-        # 2. Offload the ENTIRE blocking process (encode + infer) to a thread
         audio_data = await run_blocking_task_async(
             app.state.tts_wrapper.generate_speech_blocking,
             text,
-            temp_ref_path
         )
-        # 3. Convert to requested format (Blocking, but usually fast)
         audio_bytes = await run_blocking_task_async(
             app.state.tts_wrapper._convert_to_streamable_format,
             audio_data,
             output_format
         )
-        # 4. Save to disk (Original NeuTTS requirement)
         audio_filename = f"tts_{time.time()}.{output_format}"
         final_path = os.path.join(GENERATED_AUDIO_DIR, audio_filename)
-        # We perform the file write operation in a blocking manner inside the thread pool.
         await run_blocking_task_async(
             lambda: open(final_path, 'wb').write(audio_bytes)
         )
         processing_time = time.time() - start_time
         audio_duration = len(audio_data) / SAMPLE_RATE
         return Response(
             content=audio_bytes,
             media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
@@ -293,61 +356,80 @@ async def text_to_speech(
                 "X-Audio-Duration": f"{audio_duration:.2f}s"
             }
         )
     except Exception as e:
         logger.error(f"Synthesis error: {e}")
         raise HTTPException(status_code=500, detail=f"Synthesis failed: {e}")
     finally:
-        # 5. Clean up the temporary reference file
         if os.path.exists(temp_ref_path):
             os.unlink(temp_ref_path)
 @app.post("/synthesize/stream")
 async def stream_text_to_speech_cloning(
-    text: str = Form(..., min_length=1, max_length=5000), # Increased limit for streaming
     speed: float = Form(1.0, ge=0.5, le=2.0),
-    output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"), # MP3 is best for streaming
-    reference_audio: UploadFile = File(...)
-):
     """
     Sentence-by-Sentence Streaming Endpoint (Kokoro Feature adaptation).
-    Performs encoding once, then synthesizes and streams chunks.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
     # 1. Asynchronously save reference audio (non-blocking)
     temp_ref_path = await save_upload_file_async(reference_audio)
-    # 2. Define the generator function, which will run in the thread pool implicitly
-    def stream_generator():
-        try:
-            # The entire streaming process runs blocking inside the thread pool
-            for chunk_bytes in app.state.tts_wrapper.stream_speech_blocking(
-                text,
-                temp_ref_path,
-                speed,
-                output_format
-            ):
-                yield chunk_bytes
-        except Exception as e:
-            logger.error(f"Streaming generator error: {e}")
-            # Raise an exception if necessary, though it might break the stream
-        finally:
-            # 3. Cleanup the temporary reference file after the stream is done
-            if os.path.exists(temp_ref_path):
-                os.unlink(temp_ref_path)
-    # The StreamingResponse handles the transfer encoding and chunking
-    return StreamingResponse(
-        stream_generator(),
-        media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
-        headers={
-            "Content-Disposition": "attachment; filename=tts_live_stream.mp3",
-            "Transfer-Encoding": "chunked",
-            "Cache-Control": "no-cache"
-        }
-    )
 @app.get("/audio/{filename}")
 async def get_audio(filename: str):

 import numpy as np
 import psutil
 import soundfile as sf
+import subprocess
+import tempfile
 from concurrent.futures import ThreadPoolExecutor
 from typing import Optional, Generator
 from contextlib import asynccontextmanager
     speed: float = Field(default=1.0, ge=0.5, le=2.0)
     output_format: str = Field(default="wav", pattern="^(wav|mp3|flac)$")
def convert_to_wav_blocking(input_path: str, timeout_seconds: int = 30) -> str:
    """
    Convert an uploaded audio file (WebM, MP4, etc.) to a WAV file with FFmpeg.

    The output is mono, 16-bit PCM, resampled to SAMPLE_RATE, and is written to
    a uniquely named file inside TEMP_AUDIO_DIR. This function blocks on a
    subprocess, so call it through the thread pool rather than directly from
    the event loop.

    Args:
        input_path: Path of the uploaded audio file to convert.
        timeout_seconds: Maximum time FFmpeg may run before it is aborted.

    Returns:
        Path of the converted WAV file. The caller owns it and must delete it.

    Raises:
        HTTPException: 400 if FFmpeg rejects the input, 504 if the conversion
            exceeds the timeout, 500 for any other failure (for example the
            ffmpeg binary cannot be started).
    """
    # Reserve a unique output path. The handle is closed when the `with` block
    # exits (delete=False keeps the file), so FFmpeg can overwrite it via -y.
    with tempfile.NamedTemporaryFile(suffix=".wav", dir=TEMP_AUDIO_DIR, delete=False) as tmp:
        output_path = tmp.name
    logger.info(f"Converting '{os.path.basename(input_path)}' to WAV (24kHz, mono) at {os.path.basename(output_path)}")
    # FFmpeg options:
    #   -y               overwrite the (already created, empty) output file
    #   -i               input file path
    #   -f wav           force the WAV container
    #   -ar SAMPLE_RATE  resample to the rate used by the rest of this service
    #   -ac 1            downmix to mono
    #   -c:a pcm_s16le   uncompressed 16-bit little-endian PCM
    command = [
        "ffmpeg",
        "-y",
        "-i", input_path,
        "-f", "wav",
        "-ar", str(SAMPLE_RATE),
        "-ac", "1",
        "-c:a", "pcm_s16le",
        output_path,
    ]
    converted = False
    try:
        # An argument list (no shell) keeps the uploaded file name from being
        # interpreted by a shell; the timeout guards against runaway processes.
        subprocess.run(command, check=True, capture_output=True, text=True, timeout=timeout_seconds)
        converted = True
        logger.info("FFmpeg conversion successful.")
        return output_path
    except subprocess.CalledProcessError as e:
        logger.error(f"FFmpeg conversion failed: {e.stderr}")
        # Surface only the last non-empty stderr line; it normally carries the
        # actual reason, while the preceding lines are banner/config output.
        stderr_lines = (e.stderr or "").strip().splitlines()
        error_detail = stderr_lines[-1] if stderr_lines else "Unknown FFmpeg error."
        raise HTTPException(status_code=400, detail=f"Audio format conversion failed: {error_detail}") from e
    except subprocess.TimeoutExpired as e:
        logger.error("FFmpeg conversion timed out.")
        raise HTTPException(
            status_code=504,
            detail=f"Audio conversion timed out after {timeout_seconds} seconds.",
        ) from e
    except Exception as e:
        logger.error(f"General conversion error: {e}")
        raise HTTPException(status_code=500, detail="An unexpected error occurred during audio conversion.") from e
    finally:
        # Single cleanup path for every failure mode: never leave a partial or
        # empty WAV behind when the conversion did not complete.
        if not converted and os.path.exists(output_path):
            try:
                os.unlink(output_path)
            except OSError as cleanup_error:
                # Do not let a cleanup problem mask the real conversion error.
                logger.warning(f"Could not remove temporary file {output_path}: {cleanup_error}")
 # --- Model Wrapper and Logic ---
 class NeuTTSWrapper:
     text: str = Form(...),
     speed: float = Form(1.0, ge=0.5, le=2.0),
     output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
+    reference_audio: UploadFile = File(...)):
     """
     Standard blocking TTS endpoint with Multi-Format Output (Kokoro Feature).
+    Includes FFmpeg conversion for uploaded audio format compatibility.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
+    # 1. Asynchronously save reference audio (original upload)
     temp_ref_path = await save_upload_file_async(reference_audio)
+    converted_wav_path = None # NEW: Initialize for cleanup
     start_time = time.time()
     try:
+        # 2. **NEW STEP**: Convert the uploaded file (WebM, etc.) to a 24kHz WAV file using FFmpeg
+        converted_wav_path = await run_blocking_task_async(
+            convert_to_wav_blocking,
+            temp_ref_path
+        )
+        # 3. Offload the ENTIRE blocking process (encode + infer) to a thread
         audio_data = await run_blocking_task_async(
             app.state.tts_wrapper.generate_speech_blocking,
             text,
+            converted_wav_path # IMPORTANT: Pass the CONVERTED WAV path
         )
+        # 4. Convert to requested format (Blocking, but usually fast)
         audio_bytes = await run_blocking_task_async(
             app.state.tts_wrapper._convert_to_streamable_format,
             audio_data,
             output_format
         )
+        # 5. Save to disk (Original NeuTTS requirement)
         audio_filename = f"tts_{time.time()}.{output_format}"
         final_path = os.path.join(GENERATED_AUDIO_DIR, audio_filename)
         await run_blocking_task_async(
             lambda: open(final_path, 'wb').write(audio_bytes)
         )
         processing_time = time.time() - start_time
         audio_duration = len(audio_data) / SAMPLE_RATE
         return Response(
             content=audio_bytes,
             media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
                 "X-Audio-Duration": f"{audio_duration:.2f}s"
             }
         )
     except Exception as e:
         logger.error(f"Synthesis error: {e}")
+        # Reraise HTTPExceptions that may have come from the conversion step
+        if isinstance(e, HTTPException):
+             raise
         raise HTTPException(status_code=500, detail=f"Synthesis failed: {e}")
     finally:
+        # 6. Clean up BOTH the original file AND the converted WAV file
         if os.path.exists(temp_ref_path):
             os.unlink(temp_ref_path)
+        if converted_wav_path and os.path.exists(converted_wav_path):
+            os.unlink(converted_wav_path)
 @app.post("/synthesize/stream")
 async def stream_text_to_speech_cloning(
     text: str = Form(..., min_length=1, max_length=5000),
     speed: float = Form(1.0, ge=0.5, le=2.0),
     output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
     """
     Sentence-by-Sentence Streaming Endpoint (Kokoro Feature adaptation).
     Includes FFmpeg conversion for uploaded audio format compatibility.

     The uploaded reference audio is saved, converted to WAV, and handed to
     the TTS wrapper's stream_speech_blocking generator, whose chunks are
     streamed back to the client.

     Raises:
         HTTPException: 503 if the model wrapper is not loaded; whatever
             status convert_to_wav_blocking raised if conversion fails;
             500 for any other failure while preparing the stream.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")
     # 1. Asynchronously save reference audio (non-blocking)
     temp_ref_path = await save_upload_file_async(reference_audio)
     try:
         # 2. Convert the uploaded file (WebM, etc.) to a WAV file using FFmpeg
         converted_wav_path = await run_blocking_task_async(
             convert_to_wav_blocking,
             temp_ref_path
         )
     except HTTPException as e:
         # Keep the status code chosen by the conversion step.
         logger.error(f"Streaming setup error: {e}")
         raise
     except Exception as e:
         logger.error(f"Streaming setup error: {e}")
         raise HTTPException(status_code=500, detail=f"Streaming synthesis failed: {e}") from e
     finally:
         # The raw upload is not needed once conversion has finished or failed.
         if os.path.exists(temp_ref_path):
             os.unlink(temp_ref_path)

     # 3. Define the generator that produces the audio chunks.
     def stream_generator():
         try:
             for chunk_bytes in app.state.tts_wrapper.stream_speech_blocking(
                 text,
                 converted_wav_path,  # IMPORTANT: Pass the CONVERTED WAV path
                 speed,
                 output_format
             ):
                 yield chunk_bytes
         except Exception as e:
             # The response has already started, so the error can only be
             # logged; the stream simply ends early for the client.
             logger.error(f"Streaming generator error: {e}")
         finally:
             # The converted WAV must outlive this handler: the generator is
             # only iterated AFTER the StreamingResponse has been returned, so
             # deleting the file in the handler would remove it before
             # synthesis reads it. Delete it here, once streaming is over.
             # NOTE(review): a generator that is never started will not reach
             # this block; confirm whether a periodic temp-dir sweep exists.
             if os.path.exists(converted_wav_path):
                 os.unlink(converted_wav_path)

     # The StreamingResponse is returned immediately to start the stream
     return StreamingResponse(
         stream_generator(),
         media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
         headers={
             # File extension follows the requested format (was always .mp3).
             "Content-Disposition": f"attachment; filename=tts_live_stream.{output_format}",
             "Transfer-Encoding": "chunked",
             "Cache-Control": "no-cache"
         }
     )
 @app.get("/audio/{filename}")
 async def get_audio(filename: str):