Spaces:

wolf1997
/

text_to_speech_api

Sleeping

App Files Files Community

wolf1997 committed on May 21, 2025

Commit

157a15c

verified ·

1 Parent(s): 74d6f76

Upload 3 files

Browse files

Files changed (3) hide show

Dockerfile +36 -0
app.py +87 -0
requirements.txt +7 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,36 @@

+# Use Python 3.11.7 slim image as base
+FROM python:3.11.7-slim
+# Install system dependencies required for soundfile.
+# This must happen while still root: apt-get cannot run as the unprivileged user created below.
+RUN apt-get update && apt-get install -y \
+    libsndfile1 \
+    && rm -rf /var/lib/apt/lists/*
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+# Switch to the "user" user (everything below runs unprivileged)
+USER user
+# Set home to the user's home directory and put user-level pip scripts on PATH
+ENV HOME=/home/user \
+	PATH=/home/user/.local/bin:$PATH
+# Set the working directory to the app folder inside the user's home directory
+WORKDIR $HOME/app
+# Copy requirements first to leverage Docker cache (owned by the runtime user)
+COPY --chown=user requirements.txt .
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the application code, owned by the runtime user so the app can write next to it
+COPY --chown=user . .
+# Expose the port the app runs on
+EXPOSE 8000
+# Command to run the application
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

app.py ADDED Viewed

	@@ -0,0 +1,87 @@

+from fastapi import FastAPI, HTTPException
+from fastapi.responses import Response
+from pydantic import BaseModel
+from kokoro import KPipeline
+import soundfile as sf
+import torch
+import os
+import uuid
+import numpy as np
+import io
+from typing import Optional
+pipeline = KPipeline(lang_code='a')  # built once at import time and shared by every request; 'a' presumably selects American English — confirm against kokoro docs
+app = FastAPI(title="Text to Speech API")  # ASGI application object; served by uvicorn as "app:app"
+class TextToSpeechRequest(BaseModel):  # JSON request body accepted by POST /tts
+    text: str  # required: the text to convert to speech
+    language: Optional[str] = "en"  # optional language code, defaults to "en"; NOTE(review): the /tts handler never reads this field
+    slow: Optional[bool] = False  # optional flag, defaults to False; NOTE(review): the /tts handler never reads this field
+def tensor_to_audio_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24000) -> bytes:
+    """
+    Convert a float audio tensor to bytes.
+    Args:
+        audio_tensor (torch.Tensor): Input audio tensor of shape (samples,) or (channels, samples)
+        sample_rate (int): Sample rate of the audio in Hz. Default is 24000.
+    Returns:
+        bytes: Audio data in bytes format
+    """
+    # Ensure tensor is on CPU and convert to numpy
+    audio_np = audio_tensor.detach().cpu().numpy()
+    # Handle different input shapes
+    if len(audio_np.shape) == 1:
+        # Mono audio (samples,)
+        audio_np = audio_np.reshape(1, -1)
+    elif len(audio_np.shape) > 2:
+        raise ValueError(f"Expected 1D or 2D tensor, got shape {audio_np.shape}")
+    # Create a bytes buffer
+    buffer = io.BytesIO()
+    # Write audio data to buffer using soundfile
+    sf.write(buffer, audio_np.T, sample_rate, format='WAV')
+    # Get the bytes from the buffer
+    audio_bytes = buffer.getvalue()
+    buffer.close()
+    return audio_bytes
+@app.post("/tts")
+async def text_to_speech(request: TextToSpeechRequest):
+    try:
+        generator = pipeline(request.text, voice='af_heart')
+        for i, (gs, ps, audio) in enumerate(generator):
+            audio_tensor = audio
+        audio_bytes = tensor_to_audio_bytes(audio_tensor)
+        # Return audio bytes directly with appropriate headers
+        return Response(
+            content=audio_bytes,
+            media_type="audio/wav",
+            headers={
+                "Content-Disposition": "attachment; filename=speech.wav"
+            }
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/")
+async def root():
+    return {"message": "Welcome to the Text to Speech API. Use POST /tts to convert text to speech. the body should be a json with the following fields: {'text': 'text to convert to speech', 'language': 'language code (optional, default is en)', 'slow': 'boolean (optional, default is False)'}"}
+if __name__ == "__main__":  # only when executed directly as a script, not when imported as "app:app"
+    import uvicorn  # imported lazily: needed only for this direct-run path
+    uvicorn.run(app, host="0.0.0.0", port=8000)  # listen on all interfaces, port 8000

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+fastapi==0.115.12
+kokoro==0.9.4
+numpy==2.2.6
+pydantic==2.11.4
+soundfile==0.13.1
+torch==2.7.0
+uvicorn==0.34.2