waxz committed on
Commit
1574efa
·
1 Parent(s): 1068b6c

add project

Browse files
Files changed (4) hide show
  1. Dockerfile +42 -0
  2. README.md +34 -0
  3. app.py +204 -0
  4. requirements.txt +3 -0
Dockerfile ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Python 3.10 slim: small base image that still supports manylinux wheels.
FROM python:3.10-slim

# System packages: libsndfile1 and ffmpeg for audio handling
# (scipy/numpy/onnx backends), git for pip VCS installs.
RUN apt-get update && apt-get install -y \
    libsndfile1 \
    ffmpeg \
    git \
    && rm -rf /var/lib/apt/lists/*

# Hugging Face Spaces require the container to run as non-root UID 1000.
RUN useradd -m -u 1000 user

# Drop privileges for everything that follows.
USER user

# Make user-local pip installs resolvable on PATH.
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# All application files live under the user's home.
WORKDIR $HOME/app

# Install dependencies first so code-only changes don't bust the pip layer cache.
COPY --chown=user requirements.txt requirements.txt

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy the application source.
COPY --chown=user . $HOME/app


# Hugging Face Spaces route traffic to port 7860.
EXPOSE 7860

# Serve the FastAPI app on all interfaces at the expected port.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -9,3 +9,37 @@ short_description: openai api style tts engine
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
12
+ # tts-proxy
13
+ A simple OpenAI-API-style TTS server based on Supertonic.
14
+
15
+ - https://huggingface.co/spaces/Supertone/supertonic
16
+ - https://github.com/supertone-inc/supertonic/tree/main/py
17
+
18
+
19
+ ## install dependencies
20
+
21
+ ```bash
22
+ curl -LsSf https://astral.sh/uv/install.sh | sh
23
+ ```
24
+
25
+ ```bash
26
+ uv venv -p 3.10
27
+ source .venv/bin/activate
28
+ uv pip install -r ./requirements.txt
29
+ ```
30
+
31
+ ## run server
32
+
33
+ ```bash
34
+ python app.py
35
+ ```
36
+
37
+ ## run client
38
+
39
+ ```bash
40
+ curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
41
+ "model": "tts-1",
42
+ "input": "Hello, this is Supertonic running locally!",
43
+ "voice": "F1"
44
+ }' --output ./test.wav
45
+ ```
app.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import time
4
+ import re
5
+ import asyncio
6
+ import numpy as np
7
+ import argparse
8
+ import uvicorn
9
+ import sys
10
+ import struct
11
+ from contextlib import asynccontextmanager
12
+ from fastapi import FastAPI, HTTPException
13
+ from fastapi.responses import StreamingResponse
14
+ from pydantic import BaseModel
15
+ from typing import Optional, Literal
16
+ from supertonic import TTS
17
+
18
+ # -----------------------------------------------------------------------------
19
+ # 1. Utility Functions
20
+ # -----------------------------------------------------------------------------
21
+
22
def split_text_into_sentences(text: str):
    """Split *text* into sentence-sized chunks for incremental synthesis.

    Splits on runs of ``.``, ``?`` or ``!`` (the punctuation stays attached
    to its sentence); any trailing text without terminal punctuation is
    emitted as a final chunk. Whitespace around each chunk is stripped.
    """
    pieces = re.split(r'([.?!]+)', text)
    chunks = []
    buffer = ""
    for piece in pieces:
        buffer += piece
        # A captured punctuation run closes the current sentence.
        if re.search(r'[.?!]', piece):
            stripped = buffer.strip()
            if stripped:
                chunks.append(stripped)
            buffer = ""
    tail = buffer.strip()
    if tail:
        chunks.append(tail)
    return chunks
38
+
39
def create_wav_header(sample_rate: int, channels: int = 1, bits_per_sample: int = 16):
    """Build a 44-byte PCM WAV header with "unknown" sizes (0xFFFFFFFF).

    The bogus RIFF/data sizes make browsers and clients treat the payload
    as an endless stream rather than a fixed-length file.
    """
    byte_rate = sample_rate * channels * bits_per_sample // 8
    block_align = channels * bits_per_sample // 8

    # fmt chunk: size=16, audio format=1 (PCM), then the layout fields.
    fmt_chunk = struct.pack(
        '<IHHIIHH',
        16,              # fmt chunk size
        1,               # PCM
        channels,
        sample_rate,
        byte_rate,
        block_align,
        bits_per_sample,
    )

    return b''.join([
        b'RIFF',
        struct.pack('<I', 0xFFFFFFFF),  # total size unknown (streaming)
        b'WAVE',
        b'fmt ',
        fmt_chunk,
        b'data',
        struct.pack('<I', 0xFFFFFFFF),  # data size unknown (streaming)
    ])
62
+
63
def float_to_pcm16(audio_array):
    """Convert float audio in [-1.0, 1.0] to little-endian int16 PCM bytes.

    Accepts any array-like (list, ndarray, multi-dimensional); values are
    clipped to [-1, 1] before scaling so out-of-range samples can't wrap.
    Returns raw ``bytes`` suitable for a WAV ``data`` chunk.
    """
    # asarray/ravel avoid the unconditional copies that np.array/flatten make;
    # clip still returns a fresh array, so the input is never mutated.
    samples = np.asarray(audio_array)
    if samples.ndim > 1:
        samples = samples.ravel()
    samples = np.clip(samples, -1.0, 1.0)
    return (samples * 32767).astype(np.int16).tobytes()
71
+
72
+ # -----------------------------------------------------------------------------
73
+ # 2. Streaming Engine with Fallback Logic
74
+ # -----------------------------------------------------------------------------
75
+
76
class StreamingEngine:
    """Owns the Supertonic TTS model and produces streaming WAV audio.

    A single ``asyncio.Lock`` serializes synthesis so only one heavy TTS
    job runs at a time, even across concurrent requests.
    """

    def __init__(self, onnx_dir: str, voice_dir: str):
        # NOTE(review): onnx_dir is stored but never read, and voice_dir is
        # ignored entirely — the model auto-downloads its own assets.
        # Confirm whether these parameters are still needed.
        self.onnx_dir = onnx_dir
        self.model = None
        self.sample_rate = 24000  # placeholder; replaced by the model's real rate
        self.lock = asyncio.Lock()

        # Default fallback voice
        self.default_voice = "F1"

        # Mapping OpenAI voice names to Supertonic IDs
        self.voice_mapping = {
            "alloy": "F1",
            "echo": "M1",
            "fable": "M2",
            "onyx": "M3",
            "nova": "F2",
            "shimmer": "F3"
        }

        print("Loading Supertonic model...")
        try:
            self.model = TTS(auto_download=True)
            self.sample_rate = self.model.sample_rate
            print(f"Model Loaded. Rate: {self.sample_rate}")
        except Exception as e:
            # The server is useless without a model; fail fast so the host
            # restarts the container instead of serving errors forever.
            print(f"Error initializing model: {e}")
            sys.exit(1)

    def get_style_safe(self, voice_name: str):
        """Resolve *voice_name* to a ``(style, resolved_name)`` pair.

        1. Maps OpenAI names ('alloy' -> 'F1') after lowercasing/stripping.
        2. Tries to load that style (supertonic raises on unknown names).
        3. Falls back to the default voice; re-raises only if the default
           itself cannot be loaded.
        """
        # 1. Normalize and Map
        clean_name = voice_name.lower().strip()
        target_name = self.voice_mapping.get(clean_name, voice_name)  # map or keep original

        # 2. Try to get style
        try:
            # Note: We rely on supertonic throwing an error if name is invalid
            style = self.model.get_voice_style(voice_name=target_name)
            return style, target_name
        except Exception:
            # 3. Fallback
            print(f"WARNING: Voice '{voice_name}' (mapped to '{target_name}') not found. Using '{self.default_voice}'.")
            try:
                style = self.model.get_voice_style(voice_name=self.default_voice)
                return style, self.default_voice
            except Exception as e:
                print(f"CRITICAL: Default voice '{self.default_voice}' also failed.")
                raise e

    async def stream_generator(self, text: str, voice_name: str, speed: float):
        """Yield a streaming WAV header, then PCM16 bytes per sentence.

        *speed* is accepted for OpenAI API compatibility but is currently
        not forwarded to synthesize (see the commented argument below).
        """
        # Resolve the voice style ONCE before the loop so the embedding is
        # not recomputed for every sentence.
        try:
            style, resolved_name = self.get_style_safe(voice_name)
        except Exception as e:
            print(f"Error resolving voice: {e}")
            return

        yield create_wav_header(self.sample_rate)

        chunks = split_text_into_sentences(text)
        print(f"Streaming '{text[:20]}...' using voice: {resolved_name}")

        # Fix: asyncio.get_event_loop() inside a coroutine is deprecated
        # since Python 3.10; get_running_loop() is the correct call here.
        loop = asyncio.get_running_loop()

        for chunk in chunks:
            # The lock guarantees only one heavy TTS task runs globally.
            async with self.lock:
                audio_float, _ = await loop.run_in_executor(
                    None,
                    self.model.synthesize,
                    chunk,
                    style
                    # speed  # Add speed here if your supertonic version supports it
                )

            pcm_bytes = float_to_pcm16(audio_float)
            yield pcm_bytes
160
+
161
+ # -----------------------------------------------------------------------------
162
+ # 3. API Setup
163
+ # -----------------------------------------------------------------------------
164
+
165
# Global engine instance; created once at startup by the FastAPI lifespan hook.
engine = None
166
+
167
class SpeechRequest(BaseModel):
    """Request body for POST /v1/audio/speech (OpenAI-compatible)."""
    model: Optional[str] = "tts-1"  # accepted for API compatibility; not used for selection
    input: str  # text to synthesize
    voice: str = "F1"  # Defaults to F1, but handles 'alloy' etc via mapping
    response_format: Optional[str] = "wav"  # NOTE(review): only WAV streaming is implemented
    speed: Optional[float] = 1.0  # currently not forwarded to the synthesizer
173
+
174
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: build the global TTS engine before serving.

    Construction blocks startup (and exits the process on model failure),
    so the app never serves requests without a loaded model.
    NOTE(review): the two path arguments appear unused by StreamingEngine,
    which auto-downloads its assets — confirm they are still needed.
    """
    global engine
    engine = StreamingEngine("assets/onnx", "assets/voice_styles")
    yield
    print("Engine shutting down")
180
+
181
# ASGI application; the lifespan handler builds/tears down the global engine.
app = FastAPI(lifespan=lifespan)
182
+
183
@app.post("/v1/audio/speech")
async def text_to_speech(request: SpeechRequest):
    """Synthesize *request.input* and stream it back as WAV audio.

    Raises 500 if the engine failed to load, 400 on empty input.
    """
    # `global` is only needed for writes; reading the module-level engine
    # works without it.
    if not engine:
        raise HTTPException(500, "Engine not loaded")

    # Fix: previously an empty/whitespace input streamed a useless
    # header-only WAV; reject it up front instead.
    if not request.input or not request.input.strip():
        raise HTTPException(400, "Field 'input' must be non-empty text")

    # Fix: speed is Optional — an explicit JSON null would otherwise be
    # forwarded as None.
    speed = request.speed if request.speed is not None else 1.0

    return StreamingResponse(
        engine.stream_generator(request.input, request.voice, speed),
        media_type="audio/wav"
    )
193
+
194
@app.get("/v1/models")
async def list_models():
    """Minimal OpenAI-style model listing: the single fixed TTS model."""
    available = [{"id": "tts-1", "owned_by": "supertonic"}]
    return {"data": available}
197
+
198
if __name__ == "__main__":
    # Parse optional host/port overrides, then launch the ASGI server.
    cli = argparse.ArgumentParser()
    cli.add_argument("--host", default="0.0.0.0")
    cli.add_argument("--port", type=int, default=8000)
    opts = cli.parse_args()

    uvicorn.run(app, host=opts.host, port=opts.port)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ supertonic
2
+ uvicorn
3
+ fastapi