waxz commited on
Commit ·
1574efa
1
Parent(s): 1068b6c
add project
Browse files- Dockerfile +42 -0
- README.md +34 -0
- app.py +204 -0
- requirements.txt +3 -0
Dockerfile
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use Python 3.10 slim image for a balance of size and compatibility
FROM python:3.10-slim

# Install system dependencies
# libsndfile1 and ffmpeg are often required for audio processing (scipy/numpy/onnx)
RUN apt-get update && apt-get install -y \
    libsndfile1 \
    ffmpeg \
    git \
    && rm -rf /var/lib/apt/lists/*

# Set up a new user named "user" with user ID 1000
# Hugging Face Spaces strictly require running as non-root (ID 1000)
RUN useradd -m -u 1000 user

# Switch to the "user" context
USER user

# Set environment variables
# Putting ~/.local/bin on PATH makes user-level pip installs runnable.
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Copy requirements first to leverage Docker cache
COPY --chown=user requirements.txt requirements.txt

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code
COPY --chown=user . $HOME/app

# Expose the port that Hugging Face expects
EXPOSE 7860

# Start the application
# We map host to 0.0.0.0 and port to 7860
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
|
@@ -9,3 +9,37 @@ short_description: openai api style tts engine
|
|
| 9 |
---
|
| 10 |
|
| 11 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
| 12 |
+
# tts-proxy
|
| 13 |
+
A simple openai api style tts server based on supertonic.
|
| 14 |
+
|
| 15 |
+
- https://huggingface.co/spaces/Supertone/supertonic
|
| 16 |
+
- https://github.com/supertone-inc/supertonic/tree/main/py
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
## install dependencies
|
| 20 |
+
|
| 21 |
+
```bash
|
| 22 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 23 |
+
```
|
| 24 |
+
|
| 25 |
+
```bash
|
| 26 |
+
uv venv -p 3.10
|
| 27 |
+
source .venv/bin/activate
|
| 28 |
+
uv pip install -r ./requirements.txt
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
## run server
|
| 32 |
+
|
| 33 |
+
```bash
|
| 34 |
+
python server.py
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
## run client
|
| 38 |
+
|
| 39 |
+
```bash
|
| 40 |
+
curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
|
| 41 |
+
"model": "tts-1",
|
| 42 |
+
"input": "Hello, this is Supertonic running locally!",
|
| 43 |
+
"voice": "F1"
|
| 44 |
+
}' --output ./test.wav
|
| 45 |
+
```
|
app.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import io
|
| 3 |
+
import time
|
| 4 |
+
import re
|
| 5 |
+
import asyncio
|
| 6 |
+
import numpy as np
|
| 7 |
+
import argparse
|
| 8 |
+
import uvicorn
|
| 9 |
+
import sys
|
| 10 |
+
import struct
|
| 11 |
+
from contextlib import asynccontextmanager
|
| 12 |
+
from fastapi import FastAPI, HTTPException
|
| 13 |
+
from fastapi.responses import StreamingResponse
|
| 14 |
+
from pydantic import BaseModel
|
| 15 |
+
from typing import Optional, Literal
|
| 16 |
+
from supertonic import TTS
|
| 17 |
+
|
| 18 |
+
# -----------------------------------------------------------------------------
|
| 19 |
+
# 1. Utility Functions
|
| 20 |
+
# -----------------------------------------------------------------------------
|
| 21 |
+
|
| 22 |
+
def split_text_into_sentences(text: str):
    """
    Split *text* into sentence-sized chunks for streaming synthesis.

    Sentence-ending punctuation ([.?!], possibly repeated) stays attached
    to the sentence it terminates; any trailing text without terminal
    punctuation becomes a final chunk.
    """
    pieces = re.split(r'([.?!]+)', text)
    sentences = []
    buffer = []
    for piece in pieces:
        buffer.append(piece)
        # A piece containing terminal punctuation closes the current sentence.
        if re.search(r'[.?!]', piece):
            candidate = "".join(buffer).strip()
            if candidate:
                sentences.append(candidate)
            buffer = []
    tail = "".join(buffer).strip()
    if tail:
        sentences.append(tail)
    return sentences
| 38 |
+
|
| 39 |
+
def create_wav_header(sample_rate: int, channels: int = 1, bits_per_sample: int = 16):
    """
    Build a 44-byte RIFF/WAVE header for PCM audio whose RIFF and data
    size fields are 0xFFFFFFFF ("unknown"), so browsers/clients treat
    the response as an open-ended stream.
    """
    byte_rate = sample_rate * channels * bits_per_sample // 8
    block_align = channels * bits_per_sample // 8
    unknown_size = 0xFFFFFFFF

    # One pack call: RIFF chunk, 'fmt ' subchunk (PCM = format tag 1,
    # subchunk size 16), then the 'data' subchunk header.
    return struct.pack(
        '<4sI4s4sIHHIIHH4sI',
        b'RIFF', unknown_size, b'WAVE',
        b'fmt ', 16, 1, channels, sample_rate,
        byte_rate, block_align, bits_per_sample,
        b'data', unknown_size,
    )
| 62 |
+
|
| 63 |
+
def float_to_pcm16(audio_array):
    """Convert float audio in [-1.0, 1.0] to little-endian int16 PCM bytes."""
    samples = np.array(audio_array)
    # Multi-channel / batched output is serialized as a flat sample stream.
    if samples.ndim > 1:
        samples = samples.flatten()
    clipped = np.clip(samples, -1.0, 1.0)
    return (clipped * 32767).astype(np.int16).tobytes()
| 71 |
+
|
| 72 |
+
# -----------------------------------------------------------------------------
|
| 73 |
+
# 2. Streaming Engine with Fallback Logic
|
| 74 |
+
# -----------------------------------------------------------------------------
|
| 75 |
+
|
| 76 |
+
class StreamingEngine:
    """
    Wraps the Supertonic TTS model and produces streaming WAV audio.

    A single asyncio.Lock serializes synthesis, so only one heavy TTS job
    runs at a time even when several requests stream concurrently.
    """

    def __init__(self, onnx_dir: str, voice_dir: str):
        # NOTE(review): onnx_dir is stored but the model is loaded via
        # auto_download, and voice_dir is unused — kept for interface
        # compatibility with callers.
        self.onnx_dir = onnx_dir
        self.model = None
        self.sample_rate = 24000  # fallback until the model reports its rate
        self.lock = asyncio.Lock()

        # Default fallback voice
        self.default_voice = "F1"

        # Mapping OpenAI voice names to Supertonic IDs
        self.voice_mapping = {
            "alloy": "F1",
            "echo": "M1",
            "fable": "M2",
            "onyx": "M3",
            "nova": "F2",
            "shimmer": "F3"
        }

        print("Loading Supertonic model...")
        try:
            self.model = TTS(auto_download=True)
            self.sample_rate = self.model.sample_rate
            print(f"Model Loaded. Rate: {self.sample_rate}")
        except Exception as e:
            # Without a model the server is useless; fail fast at startup.
            print(f"Error initializing model: {e}")
            sys.exit(1)

    def get_style_safe(self, voice_name: str):
        """
        Safely retrieve a voice style embedding.

        1. Normalizes and maps OpenAI names (alloy -> F1).
        2. Tries to load the mapped (or original) voice.
        3. On failure, falls back to the default voice; re-raises only if
           even the default cannot be loaded.

        Returns (style, resolved_voice_name).
        """
        # 1. Normalize and Map
        clean_name = voice_name.lower().strip()
        target_name = self.voice_mapping.get(clean_name, voice_name)  # map or keep original

        # 2. Try to get style
        try:
            # Note: We rely on supertonic throwing an error if name is invalid
            style = self.model.get_voice_style(voice_name=target_name)
            return style, target_name
        except Exception:
            # 3. Fallback
            print(f"WARNING: Voice '{voice_name}' (mapped to '{target_name}') not found. Using '{self.default_voice}'.")
            try:
                style = self.model.get_voice_style(voice_name=self.default_voice)
                return style, self.default_voice
            except Exception as e:
                print(f"CRITICAL: Default voice '{self.default_voice}' also failed.")
                raise e

    async def stream_generator(self, text: str, voice_name: str, speed: float):
        """
        Async generator: yields a streaming WAV header, then PCM16 bytes
        for each sentence of *text* as it is synthesized.

        *speed* is currently accepted but not forwarded to the model.
        """
        # Resolve the voice style ONCE before the loop so the embedding
        # is not recomputed for every sentence.
        try:
            style, resolved_name = self.get_style_safe(voice_name)
        except Exception as e:
            print(f"Error resolving voice: {e}")
            return

        yield create_wav_header(self.sample_rate)

        chunks = split_text_into_sentences(text)
        print(f"Streaming '{text[:20]}...' using voice: {resolved_name}")

        # get_running_loop() is the supported API inside a coroutine;
        # get_event_loop() is deprecated in this context since Python 3.10.
        loop = asyncio.get_running_loop()

        for chunk in chunks:
            # The lock guarantees only one heavy TTS task runs globally;
            # synthesis itself is pushed to the default thread executor.
            async with self.lock:
                audio_float, _ = await loop.run_in_executor(
                    None,
                    self.model.synthesize,
                    chunk,
                    style
                    # speed  # Add speed here if your supertonic version supports it
                )

            # Conversion and yield happen outside the lock to release it sooner.
            pcm_bytes = float_to_pcm16(audio_float)
            yield pcm_bytes
| 160 |
+
|
| 161 |
+
# -----------------------------------------------------------------------------
|
| 162 |
+
# 3. API Setup
|
| 163 |
+
# -----------------------------------------------------------------------------
|
| 164 |
+
|
| 165 |
+
# Global engine instance; populated by the FastAPI lifespan handler at startup.
engine = None

class SpeechRequest(BaseModel):
    # Request schema mirroring OpenAI's /v1/audio/speech body (subset).
    model: Optional[str] = "tts-1"  # accepted for API compatibility; not used to select a model
    input: str  # text to synthesize
    voice: str = "F1"  # Defaults to F1, but handles 'alloy' etc via mapping
    response_format: Optional[str] = "wav"  # accepted for compatibility; output is always streamed WAV
    speed: Optional[float] = 1.0  # accepted but not currently forwarded to the model
| 173 |
+
|
| 174 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Build the TTS engine once at startup; runs until app shutdown."""
    global engine
    # Model loading happens here (may download weights); if it fails the
    # engine constructor exits the process.
    engine = StreamingEngine("assets/onnx", "assets/voice_styles")
    yield
    print("Engine shutting down")

app = FastAPI(lifespan=lifespan)
| 182 |
+
|
| 183 |
+
@app.post("/v1/audio/speech")
async def text_to_speech(request: SpeechRequest):
    """OpenAI-compatible speech endpoint: streams WAV audio for the input text."""
    global engine
    if engine is None:
        raise HTTPException(500, "Engine not loaded")

    audio_stream = engine.stream_generator(request.input, request.voice, request.speed)
    return StreamingResponse(audio_stream, media_type="audio/wav")
| 193 |
+
|
| 194 |
+
@app.get("/v1/models")
async def list_models():
    """Minimal OpenAI-compatible model listing."""
    available = [{"id": "tts-1", "owned_by": "supertonic"}]
    return {"data": available}
| 197 |
+
|
| 198 |
+
if __name__ == "__main__":
    # CLI entry point for running outside Docker (the container uses
    # uvicorn directly on port 7860 instead).
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8000)
    args = parser.parse_args()

    uvicorn.run(app, host=args.host, port=args.port)
requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
supertonic
|
| 2 |
+
uvicorn
|
| 3 |
+
fastapi
|