waxz committed on
Commit
f20a8ad
·
1 Parent(s): 24244a4

update model

Browse files
Files changed (9) hide show
  1. README.md +12 -4
  2. app.py +49 -230
  3. base_model.py +77 -0
  4. kokoro_model.py +74 -0
  5. requirements.txt +3 -0
  6. supertonic_model.py +89 -0
  7. test/run_kokoro.py +10 -0
  8. test/speech.sh +27 -0
  9. utils.py +114 -0
README.md CHANGED
@@ -8,7 +8,6 @@ pinned: false
8
  short_description: openai api style tts engine
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
12
  # tts-proxy
13
A simple OpenAI-API-style TTS server based on Supertonic.
14
 
@@ -32,7 +31,8 @@ uv pip install -r ./requirements.txt
32
 
33
  ```bash
34
  export API_KEY=yourapi
35
- python server.py
 
36
  ```
37
 
38
  ## run client
@@ -40,7 +40,15 @@ python server.py
40
  ```bash
41
  curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -H "Authorization: Bearer yourapi" -d '{
42
  "model": "tts-1",
43
- "input": "Hello, this is Supertonic running locally!",
44
- "voice": "F1"
 
45
  }' --output ./test.wav
 
 
 
 
 
 
 
46
  ```
 
8
  short_description: openai api style tts engine
9
  ---
10
 
 
11
  # tts-proxy
12
A simple OpenAI-API-style TTS server based on Supertonic.
13
 
 
31
 
32
  ```bash
33
  export API_KEY=yourapi
34
+ export MODELS="{'tts-2':'supertonic','tts-1':'kokoro'}"
35
+ python app.py
36
  ```
37
 
38
  ## run client
 
40
  ```bash
41
  curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -H "Authorization: Bearer yourapi" -d '{
42
  "model": "tts-1",
43
+ "input": "Hello World! Come Here!",
44
+ "voice": "F1",
45
+ "format": "wav"
46
  }' --output ./test.wav
47
+
48
+ curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -H "Authorization: Bearer yourapi" -d '{
49
+ "model": "tts-1",
50
+ "input": "Hello World! Come Here!",
51
+ "voice": "F1",
52
+ "format": "mp3"
53
+ }' --output ./test.mp3
54
  ```
app.py CHANGED
@@ -1,22 +1,16 @@
1
  import os
2
- import io
3
- import time
4
- import re
5
- import asyncio
6
- import numpy as np
7
  import argparse
8
  import uvicorn
9
  import sys
10
- import struct
11
  import secrets
 
12
  from contextlib import asynccontextmanager
13
  from fastapi import FastAPI, HTTPException, Security, status, Depends
14
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
15
- from fastapi.responses import StreamingResponse
16
  from pydantic import BaseModel
17
  from typing import Optional, Literal
18
- from supertonic import TTS
19
-
20
 
21
 
22
 
@@ -56,235 +50,21 @@ async def verify_api_key(credentials: HTTPAuthorizationCredentials = Security(se
56
  # 2. Text & Audio Utilities
57
  # -----------------------------------------------------------------------------
58
 
59
- def split_text_into_sentences(text: str, min_chunk_size: int = 150):
60
- """
61
- Smart splitting for low-latency streaming.
62
-
63
- Logic:
64
- 1. Split text into "atomic" sentences (preserving punctuation).
65
- 2. ALWAYS yield the first sentence immediately (fastest time-to-first-byte).
66
- 3. For subsequent text, combine small sentences into larger chunks
67
- (up to min_chunk_size) to improve GPU efficiency and sentence flow.
68
- """
69
- if not text:
70
- return []
71
-
72
- # 1. Clean up extra whitespace (newlines become spaces for flow)
73
- # We want a continuous stream of text for better merging
74
- text = re.sub(r'\s+', ' ', text).strip()
75
-
76
- # 2. Split into atomic sentences
77
- # Pattern: Split by [.?!:;] but keep the delimiter.
78
- # We look for punctuation followed by a space or end of string.
79
- raw_parts = re.split(r'([.?!:;]+)(?=\s|$)', text)
80
-
81
- atomic_sentences = []
82
- current_atomic = ""
83
-
84
- # Re-assemble the split parts (e.g. "Hello" + "." -> "Hello.")
85
- for part in raw_parts:
86
- if re.match(r'^[.?!:;]+$', part):
87
- current_atomic += part
88
- if current_atomic.strip():
89
- atomic_sentences.append(current_atomic.strip())
90
- current_atomic = ""
91
- else:
92
- current_atomic += part
93
-
94
- if current_atomic.strip():
95
- atomic_sentences.append(current_atomic.strip())
96
-
97
- # 3. Batching Logic
98
- final_chunks = []
99
- current_buffer = ""
100
- first_sentence_sent = False
101
-
102
- for sentence in atomic_sentences:
103
- # CASE A: The very first sentence.
104
- # Send it immediately, no matter how short, to start audio playback.
105
- if not first_sentence_sent:
106
- final_chunks.append(sentence)
107
- first_sentence_sent = True
108
- continue
109
-
110
- # CASE B: Subsequent sentences.
111
- # Add to buffer.
112
- if current_buffer:
113
- current_buffer += " " + sentence
114
- else:
115
- current_buffer = sentence
116
-
117
- # If buffer is long enough, flush it.
118
- # This prevents generating audio for tiny fragments like "No." or "Ok."
119
- if len(current_buffer) >= min_chunk_size:
120
- final_chunks.append(current_buffer)
121
- current_buffer = ""
122
-
123
- # Flush any remaining text in the buffer
124
- if current_buffer:
125
- final_chunks.append(current_buffer)
126
-
127
- return final_chunks
128
-
129
-
130
- # -----------------------------------------------------------------------------
131
- # 1. Utility Functions
132
- # -----------------------------------------------------------------------------
133
-
134
-
135
- def create_wav_header(sample_rate: int, channels: int = 1, bits_per_sample: int = 16):
136
- """
137
- Generates a generic WAV header with "unknown" file size (0xFFFFFFFF)
138
- so browsers/clients treat it as a stream.
139
- """
140
- byte_rate = sample_rate * channels * bits_per_sample // 8
141
- block_align = channels * bits_per_sample // 8
142
-
143
- header = b'RIFF'
144
- header += struct.pack('<I', 0xFFFFFFFF)
145
- header += b'WAVE'
146
- header += b'fmt '
147
- header += struct.pack('<I', 16)
148
- header += struct.pack('<H', 1)
149
- header += struct.pack('<H', channels)
150
- header += struct.pack('<I', sample_rate)
151
- header += struct.pack('<I', byte_rate)
152
- header += struct.pack('<H', block_align)
153
- header += struct.pack('<H', bits_per_sample)
154
- header += b'data'
155
- header += struct.pack('<I', 0xFFFFFFFF)
156
-
157
- return header
158
-
159
- def float_to_pcm16(audio_array):
160
- """Converts float32 audio to int16 bytes."""
161
- audio_array = np.array(audio_array)
162
- if len(audio_array.shape) > 1:
163
- audio_array = audio_array.flatten()
164
- audio_array = np.clip(audio_array, -1.0, 1.0)
165
- audio_int16 = (audio_array * 32767).astype(np.int16)
166
- return audio_int16.tobytes()
167
-
168
  # -----------------------------------------------------------------------------
169
  # 2. Streaming Engine with Fallback Logic
170
  # -----------------------------------------------------------------------------
171
 
172
- class StreamingEngine:
173
- def __init__(self):
174
- self.model = None
175
- self.sample_rate = 441000
176
- self.lock = asyncio.Lock()
177
-
178
- # Default fallback voice
179
- self.default_voice = "F1"
180
-
181
- # Mapping OpenAI voice names to Supertonic IDs
182
- self.voice_mapping = {
183
- "alloy": "F1",
184
- "echo": "M1",
185
- "fable": "M2",
186
- "onyx": "M3",
187
- "nova": "F2",
188
- "shimmer": "F3"
189
- }
190
-
191
- print(f"Loading Supertonic model...")
192
- try:
193
- self.tts = TTS(auto_download=True)
194
- self.text_processor = self.tts.model.text_processor
195
- self.sample_rate = self.tts.sample_rate
196
- print(f"Model Loaded. Rate: {self.sample_rate}")
197
- except Exception as e:
198
- print(f"Error initializing model: {e}")
199
- sys.exit(1)
200
-
201
- def get_style_safe(self, voice_name: str):
202
- """
203
- Safely retrieves a voice style.
204
- 1. Checks mapping (alloy -> F1).
205
- 2. Tries to load.
206
- 3. If fails, returns default (F1).
207
- """
208
- # 1. Normalize and Map
209
- clean_name = voice_name.lower().strip()
210
- target_name = self.voice_mapping.get(clean_name, voice_name) # map or keep original
211
-
212
- # 2. Try to get style
213
- try:
214
- # Note: We rely on supertonic throwing an error if name is invalid
215
- style = self.tts.get_voice_style(voice_name=target_name)
216
- return style, target_name
217
- except Exception:
218
- # 3. Fallback
219
- print(f"WARNING: Voice '{voice_name}' (mapped to '{target_name}') not found. Using '{self.default_voice}'.")
220
- try:
221
- style = self.tts.get_voice_style(voice_name=self.default_voice)
222
- return style, self.default_voice
223
- except Exception as e:
224
- print(f"CRITICAL: Default voice '{self.default_voice}' also failed.")
225
- raise e
226
-
227
- async def stream_generator(self, text: str, voice_name: str, speed: float):
228
- # 1. Resolve Voice Style ONCE before the loop
229
- # We do this here so we don't re-calculate embedding for every sentence
230
- try:
231
- style, resolved_name = self.get_style_safe(voice_name)
232
- except Exception as e:
233
- print(f"Error resolving voice: {e}")
234
- return
235
-
236
- yield create_wav_header(self.sample_rate)
237
-
238
-
239
- is_valid, unsupported = self.text_processor.validate_text(text)
240
-
241
- if not is_valid:
242
- print(f" ⚠️ Contains {len(unsupported)} unsupported character(s): {unsupported[:5]}")
243
- pattern = f"[{re.escape(''.join(unsupported))}]"
244
- preprocessed = re.sub(pattern, "", text) #self.text_processor._preprocess_text(text)
245
- if preprocessed != text:
246
- print(f" After preprocessing: {preprocessed[:50]}...")
247
- text = preprocessed
248
-
249
- else:
250
- print(" ✓ All characters supported")
251
-
252
-
253
- #chunks = split_text_into_sentences(text)
254
- chunks = split_text_into_sentences(text, min_chunk_size=150)
255
-
256
-
257
-
258
-
259
- print(f"Streaming '{text[:20]}...' using voice: {resolved_name}")
260
-
261
- loop = asyncio.get_event_loop()
262
-
263
- for i, chunk in enumerate(chunks):
264
- # async with self.lock guarantees only one heavy TTS task runs globally
265
- async with self.lock:
266
- audio_float, _ = await loop.run_in_executor(
267
- None,
268
- self.tts.synthesize,
269
- chunk,
270
- style
271
- # speed # Add speed here if your supertonic version supports it
272
- )
273
-
274
- pcm_bytes = float_to_pcm16(audio_float)
275
- yield pcm_bytes
276
-
277
  # -----------------------------------------------------------------------------
278
  # 3. API Setup
279
  # -----------------------------------------------------------------------------
280
 
281
- engine = None
282
 
283
  class SpeechRequest(BaseModel):
284
  model: Optional[str] = "tts-1"
285
  input: str
286
- voice: str = "F1" # Defaults to F1, but handles 'alloy' etc via mapping
287
- response_format: Optional[str] = "wav"
288
  speed: Optional[float] = 1.0
289
 
290
 
@@ -296,8 +76,28 @@ async def lifespan(app: FastAPI):
296
  print("\n!!! WARNING: API_KEY not set. API is open to the public. !!!\n")
297
  else:
298
  print(f"\n*** Secure Mode: API Key protection enabled. ***\n")
299
-
300
- engine = StreamingEngine()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  yield
302
 
303
 
@@ -312,9 +112,28 @@ async def text_to_speech(request: SpeechRequest):
312
  if not engine:
313
  raise HTTPException(500, "Engine not loaded")
314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  return StreamingResponse(
316
- engine.stream_generator(request.input, request.voice, request.speed),
317
- media_type="audio/wav"
318
  )
319
 
320
  @app.get("/v1/models")
 
1
  import os
 
 
 
 
 
2
  import argparse
3
  import uvicorn
4
  import sys
 
5
  import secrets
6
+ import json
7
  from contextlib import asynccontextmanager
8
  from fastapi import FastAPI, HTTPException, Security, status, Depends
9
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
10
+ from fastapi.responses import StreamingResponse, Response
11
  from pydantic import BaseModel
12
  from typing import Optional, Literal
13
+ import supertonic_model,kokoro_model
 
14
 
15
 
16
 
 
50
  # 2. Text & Audio Utilities
51
  # -----------------------------------------------------------------------------
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  # -----------------------------------------------------------------------------
54
  # 2. Streaming Engine with Fallback Logic
55
  # -----------------------------------------------------------------------------
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  # -----------------------------------------------------------------------------
58
  # 3. API Setup
59
  # -----------------------------------------------------------------------------
60
 
61
+ engine = {}
62
 
63
  class SpeechRequest(BaseModel):
64
  model: Optional[str] = "tts-1"
65
  input: str
66
+ voice: str = "alloy" # Default 'alloy'
67
+ format: Optional[str] = "wav"
68
  speed: Optional[float] = 1.0
69
 
70
 
 
76
  print("\n!!! WARNING: API_KEY not set. API is open to the public. !!!\n")
77
  else:
78
  print(f"\n*** Secure Mode: API Key protection enabled. ***\n")
79
+
80
+ MODELS = None
81
+ if not os.getenv("MODELS"):
82
+ print(f"\n!!! WARNING: MODELS not set")
83
+ sys.exit(0)
84
+ else:
85
+ MODELS = os.getenv("MODELS")
86
+
87
+ print(f"\n!!! WARNING: eval {MODELS}")
88
+ try:
89
+ MODELS = eval(MODELS)
90
+ except:
91
+ print(f"\n!!! WARNING: eval {MODELS} failed")
92
+ sys.exit(0)
93
+
94
+ print(f"\n*** Load {MODELS}. ***\n")
95
+ for k,v in MODELS.items():
96
+ print(f"Mapping {k}-->{v}")
97
+ if "supertonic" == v:
98
+ engine[k] = supertonic_model.StreamingEngine(f"{k}-->{v}")
99
+ if "kokoro" == v:
100
+ engine[k] = kokoro_model.StreamingEngine(f"{k}-->{v}")
101
  yield
102
 
103
 
 
112
  if not engine:
113
  raise HTTPException(500, "Engine not loaded")
114
 
115
+ print(f"request:{request}")
116
+ format = request.format
117
+ model = request.model
118
+ if format not in ["wav", "mp3"]:
119
+ format = "wav"
120
+ if model not in engine.keys():
121
+ print(f"!!!WARNING {model} not found")
122
+
123
+ content = {
124
+ "ok": False,
125
+ "message": f"!!!WARNING {model} not found"
126
+ }
127
+
128
+ content = json.dumps(content)
129
+
130
+ return Response(content=content, status_code=404,media_type="application/json")
131
+
132
+
133
+
134
  return StreamingResponse(
135
+ engine[model].stream_generator(request.input, request.voice, request.speed, format),
136
+ media_type=f"audio/{format}"
137
  )
138
 
139
  @app.get("/v1/models")
base_model.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import utils
3
+
4
class BaseEngine:
    """Abstract base class for streaming TTS engines.

    Subclasses implement load_model() and generate() (and optionally
    get_style_safe() / preprocess_text()); stream_generator() then turns the
    synthesized float audio into a streaming WAV or MP3 byte sequence.
    """

    def __init__(self, name):
        # Serializes synthesis: only one heavy TTS job runs at a time.
        self.lock = asyncio.Lock()
        self.name = name
        self.tts = None
        # Default rate; load_model() is expected to overwrite it.
        self.sample_rate = 24000

        print(f"Init model {self.name}")
        self.load_model()

    def load_model(self):
        """Load the underlying TTS model; must set self.tts and self.sample_rate."""
        raise NotImplementedError("Subclass must implement abstract method")

    def get_style_safe(self, voice_name: str):
        """Resolve a requested voice name to an engine-specific style/voice id."""
        raise NotImplementedError("Subclass must implement abstract method")

    def generate(self, chunks: str, voice_name: str, speed: float):
        """Synchronously yield float audio arrays for one text chunk.

        This is CPU-blocking and is always invoked from a worker thread.
        """
        raise NotImplementedError("Subclass must implement abstract method")

    def preprocess_text(self, text: str):
        """Default no-op cleanup; subclasses may strip unsupported characters."""
        return text

    async def stream_generator(self, text: str, voice_name: str, speed: float, format: str):
        """Yield encoded audio bytes (streaming WAV or MP3) for *text*."""
        encoder = None

        if format == "wav":
            # Streaming WAV: header advertises an "unknown" (0xFFFFFFFF) size.
            yield utils.create_wav_header(self.sample_rate)
        elif format == "mp3":
            encoder = utils.create_mp3_encoder(sample_rate=self.sample_rate)

        # Voice resolution is optional for engines that keep the base stub.
        try:
            voice_name = self.get_style_safe(voice_name)
        except NotImplementedError:
            pass

        text = self.preprocess_text(text)
        chunks = utils.split_text_into_sentences(text, min_chunk_size=150)

        loop = asyncio.get_event_loop()

        for chunk in chunks:
            async with self.lock:
                # FIX: generate() is a generator function, so passing it to
                # run_in_executor() only *created* the generator object in the
                # worker thread; the blocking synthesis then ran on the event
                # loop while iterating it. Drain the generator inside the
                # executor so the heavy work truly happens off the loop.
                segments = await loop.run_in_executor(
                    None,
                    lambda c=chunk: list(self.generate(c, voice_name, speed)),
                )

                for audio in segments:
                    if format == "wav":
                        yield utils.float_to_pcm16(audio)
                    elif format == "mp3":
                        mp3_bytes = utils.float_to_mp3(audio, encoder)
                        if len(mp3_bytes) > 0:
                            yield mp3_bytes

        # MP3 encoders buffer whole frames; flush the remainder at the end.
        if format == "mp3" and encoder is not None:
            final_data = encoder.flush()
            if len(final_data) > 0:
                # lameenc returns a bytearray; Starlette requires bytes chunks.
                yield bytes(final_data)
kokoro_model.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import re
3
+ import asyncio
4
+ from kokoro import KPipeline
5
+ import base_model
6
+
7
class StreamingEngine(base_model.BaseEngine):
    """Kokoro-backed TTS engine exposing the BaseEngine streaming contract."""

    def __init__(self, name):
        # Voice configuration must exist before super().__init__()
        # triggers load_model().
        self.default_voice = "af_heart"
        # OpenAI-style voice names -> Kokoro voice ids.
        self.voice_mapping = {
            "alloy": "af_heart",
            "echo": "af_bella",
            "fable": "af_nicole",
            "onyx": "af_aoede",
            "nova": "af_aoede",
            "shimmer": "af_aoede",
        }

        super().__init__(name)

    def load_model(self):
        """Load the Kokoro pipeline; raise instead of exiting on failure."""
        try:
            self.tts = KPipeline(lang_code='a')
            # Output rate used for the WAV header / MP3 encoder downstream.
            self.sample_rate = 24000
            print(f"Model Loaded. Rate: {self.sample_rate}")
        except Exception as e:
            # Raise rather than sys.exit() so the server can decide how to react.
            print(f"Error initializing model {self.name}: {e}")
            raise RuntimeError(f"Failed to load model {self.name}") from e

    def get_style_safe(self, voice_name: str):
        """Map an OpenAI-style voice name to a Kokoro voice id (default on miss)."""
        clean_name = voice_name.lower().strip()
        target_name = self.voice_mapping.get(clean_name, self.default_voice)
        print(f"Found voice {target_name}")
        return target_name

    def preprocess_text(self, text):
        """Return *text* unchanged (empty string for falsy input).

        FIX: removed an unreachable filtering branch that was guarded by a
        hard-coded ``is_valid = True`` — it could never execute and only
        obscured the fact that Kokoro receives the raw text.
        """
        if not text:
            return ""
        return text

    def generate(self, chunks: str, voice_name: str, speed: float):
        """Yield float audio arrays synthesized by the Kokoro pipeline."""
        generator = self.tts(chunks, voice=voice_name, speed=speed)
        for gs, ps, audio in generator:
            yield audio.numpy()
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
  supertonic
 
 
 
2
  uvicorn
3
  fastapi
 
1
  supertonic
2
+ kokoro
3
+ pip
4
+ lameenc
5
  uvicorn
6
  fastapi
supertonic_model.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import re
3
+ import asyncio
4
+ from supertonic import TTS
5
+ import base_model
6
+
7
class StreamingEngine(base_model.BaseEngine):
    """Supertonic-backed TTS engine implementing the BaseEngine contract."""

    def __init__(self, name):
        # Voice configuration must be in place before super().__init__()
        # invokes load_model().
        self.default_voice = "F1"
        # OpenAI-style voice names mapped onto Supertonic voice ids.
        self.voice_mapping = {
            "alloy": "F1",
            "echo": "M1",
            "fable": "M2",
            "onyx": "M3",
            "nova": "F2",
            "shimmer": "F3",
        }

        super().__init__(name)

    def load_model(self):
        """Download/load the Supertonic model; raise on failure instead of exiting."""
        try:
            self.tts = TTS(auto_download=True)
            self.text_processor = self.tts.model.text_processor
            self.sample_rate = self.tts.sample_rate
            print(f"Model Loaded. Rate: {self.sample_rate}")
        except Exception as e:
            print(f"Error initializing model {self.name}: {e}")
            raise RuntimeError(f"Failed to load model {self.name}") from e

    def get_style_safe(self, voice_name: str):
        """Resolve a voice name to a Supertonic style, falling back to the default."""
        target_name = self.voice_mapping.get(voice_name.lower().strip(), self.default_voice)
        print(f"Found voice {target_name}")

        try:
            return self.tts.get_voice_style(voice_name=target_name)
        except Exception:
            print(f"WARNING: Voice '{voice_name}' (mapped to '{target_name}') not found. Using '{self.default_voice}'.")

        try:
            return self.tts.get_voice_style(voice_name=self.default_voice)
        except Exception as e:
            print(f"CRITICAL: Default voice '{self.default_voice}' also failed.")
            raise e

    def preprocess_text(self, text):
        """Strip characters Supertonic's text processor cannot synthesize."""
        if not text:
            return ""

        is_valid, unsupported = self.text_processor.validate_text(text)

        if is_valid:
            print(" ✓ All characters supported")
            return text

        print(f" ⚠️ Contains {len(unsupported)} unsupported character(s): {unsupported[:5]}")
        stripped = re.sub(f"[{re.escape(''.join(unsupported))}]", "", text)
        if stripped != text:
            print(f" After preprocessing: {stripped[:50]}...")
            text = stripped

        return text

    def generate(self, chunks: str, voice_name: str, speed: float):
        """Yield one float audio array synthesized from *chunks*.

        NOTE(review): *speed* is currently ignored — presumably this
        supertonic build's synthesize() takes no speed argument; confirm
        against the installed version and wire it through if supported.
        """
        audio, _ = self.tts.synthesize(chunks, voice_name)
        yield audio
test/run_kokoro.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Smoke test: synthesize a paragraph with Kokoro and write one WAV file per
# generated chunk into the current directory.
from kokoro import KPipeline
import soundfile as sf
# NOTE(review): lang_code='a' — presumably American English; confirm against
# the Kokoro pipeline documentation.
pipeline = KPipeline(lang_code='a')
text = '''
[Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.
'''
generator = pipeline(text, voice='af_heart')
for i, (gs, ps, audio) in enumerate(generator):
    print(i, gs, ps)
    sf.write(f'{i}.wav', audio, 24000)  # write each chunk at 24 kHz
test/speech.sh ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -H "Authorization: Bearer yourapi" -d '{
2
+ "model": "tts-2",
3
+ "input": "Supertonic is a lightning-fast, on-device text-to-speech system designed for extreme performance with minimal computational overhead. Powered by ONNX Runtime, it runs entirely on your device—no cloud, no API calls, no privacy concerns.",
4
+ "voice": "alloy",
5
+ "format": "wav"
6
+ }' --output ./v2.wav
7
+
8
+ curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -H "Authorization: Bearer yourapi" -d '{
9
+ "model": "tts-2",
10
+ "input": "Supertonic is a lightning-fast, on-device text-to-speech system designed for extreme performance with minimal computational overhead. Powered by ONNX Runtime, it runs entirely on your device—no cloud, no API calls, no privacy concerns.",
11
+ "voice": "alloy",
12
+ "format": "mp3"
13
+ }' --output ./v2.mp3
14
+
15
+ curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -H "Authorization: Bearer yourapi" -d '{
16
+ "model": "tts-1",
17
+ "input": "[Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.",
18
+ "voice": "alloy",
19
+ "format": "wav"
20
+ }' --output ./v1.wav
21
+
22
+ curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -H "Authorization: Bearer yourapi" -d '{
23
+ "model": "tts-1",
24
+ "input": "[Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.",
25
+ "voice": "alloy",
26
+ "format": "mp3"
27
+ }' --output ./v1.mp3
utils.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import re
3
+ import struct
4
+ import lameenc
5
+
6
def split_text_into_sentences(text: str, min_chunk_size: int = 150):
    """Split *text* into chunks for low-latency streaming synthesis.

    The first sentence is emitted on its own (fast time-to-first-byte);
    later sentences are merged until a chunk reaches *min_chunk_size*
    characters, so tiny fragments are not synthesized individually.
    """
    if not text:
        return []

    # Collapse all whitespace (including newlines) into single spaces.
    normalized = re.sub(r'\s+', ' ', text).strip()

    # Split on sentence-final punctuation while keeping it attached.
    pieces = re.split(r'([.?!:;]+)(?=\s|$)', normalized)

    sentences = []
    pending = ""
    for piece in pieces:
        pending += piece
        # A punctuation-only piece terminates the sentence being assembled.
        if re.fullmatch(r'[.?!:;]+', piece) and pending.strip():
            sentences.append(pending.strip())
            pending = ""
    if pending.strip():
        sentences.append(pending.strip())

    # Batch: first sentence alone, then buffer up to min_chunk_size.
    chunks = []
    buffer = ""
    for index, sentence in enumerate(sentences):
        if index == 0:
            chunks.append(sentence)
            continue

        buffer = f"{buffer} {sentence}" if buffer else sentence
        if len(buffer) >= min_chunk_size:
            chunks.append(buffer)
            buffer = ""

    if buffer:
        chunks.append(buffer)

    return chunks
55
+
56
def create_wav_header(sample_rate: int, channels: int = 1, bits_per_sample: int = 16):
    """Build a 44-byte streaming WAV (RIFF/PCM) header.

    The RIFF and data sizes are set to 0xFFFFFFFF ("unknown") so clients
    treat the payload as an endless stream.
    """
    bytes_per_frame = channels * bits_per_sample // 8
    bytes_per_second = sample_rate * bytes_per_frame

    # fmt chunk payload: PCM (1), channels, rate, byte rate, block align, bits.
    fmt_payload = struct.pack(
        '<HHIIHH',
        1,
        channels,
        sample_rate,
        bytes_per_second,
        bytes_per_frame,
        bits_per_sample,
    )

    return (
        b'RIFF' + struct.pack('<I', 0xFFFFFFFF)
        + b'WAVE'
        + b'fmt ' + struct.pack('<I', len(fmt_payload)) + fmt_payload
        + b'data' + struct.pack('<I', 0xFFFFFFFF)
    )
75
+
76
def float_to_pcm16(audio_array):
    """Convert float audio samples in [-1, 1] to little-endian int16 bytes."""
    samples = np.asarray(audio_array)

    # Collapse any multi-dimensional input to a flat sample stream.
    if samples.ndim > 1:
        samples = samples.ravel()

    # Clip out-of-range samples to avoid integer wrap-around distortion,
    # then scale to the signed 16-bit range.
    clipped = np.clip(samples, -1.0, 1.0)
    return (clipped * 32767).astype(np.int16).tobytes()
88
+
89
def create_mp3_encoder(sample_rate=44100, channels=1, bit_rate=128, quality=5):
    # Build a lameenc encoder configured for streaming PCM16 input;
    # float_to_mp3() feeds it and the caller flushes it at end of stream.
    # NOTE(review): quality appears to be LAME's algorithm-quality knob
    # (lower = better) — confirm against the lameenc documentation.
    encoder = lameenc.Encoder()
    encoder.set_bit_rate(bit_rate)
    encoder.set_in_sample_rate(sample_rate)
    encoder.set_channels(channels)
    encoder.set_quality(quality)
    return encoder
96
+
97
+
98
def float_to_mp3(audio_array, encoder):
    """Encode float audio samples to MP3 bytes via *encoder*.

    Converts float [-1, 1] samples to int16 PCM, then feeds them to the
    (lameenc-style) encoder. May return b'' while the encoder buffers.
    """
    samples = np.asarray(audio_array)
    if samples.ndim > 1:
        samples = samples.ravel()

    # Clip, then scale to signed 16-bit PCM.
    pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

    # The encoder hands back a bytearray; FastAPI/Starlette streaming
    # requires strictly 'bytes', so cast before returning.
    return bytes(encoder.encode(pcm.tobytes()))