Xenova HF Staff commited on
Commit
2beb552
·
verified ·
1 Parent(s): bf785c9

Upload 18 files

Browse files
eslint.config.js ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// ESLint flat config: TypeScript + React (hooks / fast-refresh) rules.
import js from "@eslint/js";
import globals from "globals";
import reactHooks from "eslint-plugin-react-hooks";
import reactRefresh from "eslint-plugin-react-refresh";
import tseslint from "typescript-eslint";
import { defineConfig, globalIgnores } from "eslint/config";

// Build output is generated code — never lint it.
const ignoreBuildOutput = globalIgnores(["dist"]);

// Shared rule set for all TypeScript sources (browser globals, ES2020 syntax).
const typescriptSources = {
  files: ["**/*.{ts,tsx}"],
  extends: [
    js.configs.recommended,
    tseslint.configs.recommended,
    reactHooks.configs.flat.recommended,
    reactRefresh.configs.vite,
  ],
  languageOptions: {
    ecmaVersion: 2020,
    globals: globals.browser,
  },
};

export default defineConfig([ignoreBuildOutput, typescriptSources]);
index.html ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!doctype html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <!-- Favicon is an inline data-URI SVG (lightning-bolt emoji), so no .ico asset is shipped -->
    <link
      rel="icon"
      href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 100 100%22><text y=%22.9em%22 font-size=%2290%22>⚡️</text></svg>"
    />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Supertonic WebGPU</title>
  </head>
  <body>
    <!-- React mount point; Vite serves the TSX entry as an ES module -->
    <div id="root"></div>
    <script type="module" src="/src/main.tsx"></script>
  </body>
</html>
package.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "supertonic-webgpu",
3
+ "private": true,
4
+ "version": "0.0.0",
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite",
8
+ "build": "tsc -b && vite build",
9
+ "lint": "eslint .",
10
+ "preview": "vite preview"
11
+ },
12
+ "dependencies": {
13
+ "@huggingface/transformers": "^3.8.0",
14
+ "@tailwindcss/vite": "^4.1.17",
15
+ "lucide-react": "^0.554.0",
16
+ "react": "^19.2.0",
17
+ "react-dom": "^19.2.0",
18
+ "tailwindcss": "^4.1.17"
19
+ },
20
+ "devDependencies": {
21
+ "@eslint/js": "^9.39.1",
22
+ "@types/node": "^24.10.0",
23
+ "@types/react": "^19.2.2",
24
+ "@types/react-dom": "^19.2.2",
25
+ "@vitejs/plugin-react": "^5.1.0",
26
+ "eslint": "^9.39.1",
27
+ "eslint-plugin-react-hooks": "^7.0.1",
28
+ "eslint-plugin-react-refresh": "^0.4.24",
29
+ "globals": "^16.5.0",
30
+ "typescript": "~5.9.3",
31
+ "typescript-eslint": "^8.46.3",
32
+ "vite": "^7.2.2"
33
+ }
34
+ }
public/the-great-gatsby.txt ADDED
The diff for this file is too large to render. See raw diff
 
src/App.tsx ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState, useEffect, useRef } from "react";
2
+ import { Zap, AlignLeft, Quote, Type, FileText, Check, X, Dices } from "lucide-react";
3
+ import { useTTS } from "./components/TTSContext";
4
+ import { TTSProvider } from "./components/TTSProvider";
5
+ import { streamTTS, createAudioBlob } from "./tts";
6
+ import { SAMPLE_RATE, EXAMPLE_SENTENCES } from "./constants";
7
+ import { AudioResult } from "./components/AudioResult";
8
+ import { Controls } from "./components/Controls";
9
+
10
+ const AppContent = () => {
11
+ const [text, setText] = useState(
12
+ "Introducing Supertonic WebGPU: blazingly fast text-to-speech running 100% locally in your browser.",
13
+ );
14
+ const [activeTab, setActiveTab] = useState<string | null>("Freeform");
15
+ const [isGenerating, setIsGenerating] = useState(false);
16
+ const [showResults, setShowResults] = useState(false);
17
+ const [quality, setQuality] = useState(5);
18
+ const [speed, setSpeed] = useState(1.0);
19
+ const [voice, setVoice] = useState("Female");
20
+
21
+ const { pipelineReady, tts, speakerEmbeddings, downloadProgress } = useTTS();
22
+
23
+ const [stats, setStats] = useState({
24
+ firstLatency: null as number | null,
25
+ processingTime: 0,
26
+ charsPerSec: 0,
27
+ rtf: 0,
28
+ totalDuration: 0,
29
+ currentDuration: 0,
30
+ });
31
+ const [generationProgress, setGenerationProgress] = useState(0);
32
+ const [isPlaying, setIsPlaying] = useState(false);
33
+
34
+ const audioContextRef = useRef<AudioContext | null>(null);
35
+ const nextPlayTimeRef = useRef<number>(0);
36
+ const fullAudioBufferRef = useRef<Float32Array[]>([]);
37
+ const playbackStartTimeRef = useRef<number>(0);
38
+ const playbackAnimationFrameRef = useRef<number>(0);
39
+ const activeSourceNodesRef = useRef<AudioBufferSourceNode[]>([]);
40
+ const isPlaybackInterruptedRef = useRef(false);
41
+ const stopGenerationRef = useRef(false);
42
+
43
+ const [exampleTexts, setExampleTexts] = useState<Record<string, string | string[]>>(EXAMPLE_SENTENCES);
44
+
45
+ useEffect(() => {
46
+ fetch("/the-great-gatsby.txt")
47
+ .then((res) => res.text())
48
+ .then((text) => {
49
+ setExampleTexts((prev) => ({ ...prev, "Full story": text }));
50
+ })
51
+ .catch((e) => console.error("Failed to load story", e));
52
+ }, []);
53
+
54
+ useEffect(() => {
55
+ return () => {
56
+ if (audioContextRef.current) {
57
+ audioContextRef.current.close();
58
+ }
59
+ cancelAnimationFrame(playbackAnimationFrameRef.current);
60
+ };
61
+ }, []);
62
+
63
+ useEffect(() => {
64
+ const updatePlaybackUI = () => {
65
+ if (isPlaying && audioContextRef.current) {
66
+ const ctx = audioContextRef.current;
67
+ const elapsed = ctx.currentTime - playbackStartTimeRef.current;
68
+
69
+ // If reached end of current known duration
70
+ if (elapsed >= stats.totalDuration && !isGenerating && stats.totalDuration > 0) {
71
+ setIsPlaying(false);
72
+ setStats((prev) => ({
73
+ ...prev,
74
+ currentDuration: prev.totalDuration,
75
+ })); // Snap to end
76
+ return;
77
+ }
78
+
79
+ setStats((prev) => ({
80
+ ...prev,
81
+ currentDuration: Math.min(elapsed, prev.totalDuration),
82
+ }));
83
+
84
+ playbackAnimationFrameRef.current = requestAnimationFrame(updatePlaybackUI);
85
+ }
86
+ };
87
+
88
+ if (isPlaying) {
89
+ playbackAnimationFrameRef.current = requestAnimationFrame(updatePlaybackUI);
90
+ } else {
91
+ cancelAnimationFrame(playbackAnimationFrameRef.current);
92
+ }
93
+ }, [isPlaying, isGenerating, stats.totalDuration]);
94
+
95
+ const handleExampleClick = (type: string) => {
96
+ setActiveTab(type);
97
+ let selection = exampleTexts[type];
98
+ if (Array.isArray(selection)) {
99
+ setText(selection[Math.floor(Math.random() * selection.length)]);
100
+ return;
101
+ }
102
+ setText(selection);
103
+ };
104
+
105
+ const stopAllAudio = () => {
106
+ activeSourceNodesRef.current.forEach((node) => {
107
+ try {
108
+ node.stop();
109
+ } catch (e) {}
110
+ });
111
+ activeSourceNodesRef.current = [];
112
+ };
113
+
114
+ const handleStop = () => {
115
+ stopGenerationRef.current = true;
116
+ };
117
+
118
+ const handleGenerate = async () => {
119
+ if (isGenerating) return;
120
+
121
+ stopAllAudio();
122
+
123
+ setShowResults(true);
124
+ setIsGenerating(true);
125
+ setGenerationProgress(0);
126
+ stopGenerationRef.current = false;
127
+ setStats({
128
+ firstLatency: null,
129
+ processingTime: 0,
130
+ charsPerSec: 0,
131
+ rtf: 0,
132
+ totalDuration: 0,
133
+ currentDuration: 0,
134
+ });
135
+ fullAudioBufferRef.current = [];
136
+ isPlaybackInterruptedRef.current = false;
137
+
138
+ if (!audioContextRef.current) {
139
+ audioContextRef.current = new (window.AudioContext || (window as any).webkitAudioContext)();
140
+ }
141
+ const ctx = audioContextRef.current;
142
+ if (ctx.state === "suspended") {
143
+ await ctx.resume();
144
+ }
145
+
146
+ nextPlayTimeRef.current = ctx.currentTime + 0.1;
147
+ playbackStartTimeRef.current = nextPlayTimeRef.current;
148
+ setIsPlaying(true);
149
+
150
+ const startTime = performance.now();
151
+ let processedChars = 0;
152
+ let generatedAudioSeconds = 0;
153
+
154
+ try {
155
+ if (!tts.current || !speakerEmbeddings.current) throw new Error("TTS pipeline not ready");
156
+ const selectedEmbedding = speakerEmbeddings.current[voice];
157
+
158
+ for await (const result of streamTTS(text, tts.current, selectedEmbedding, quality, speed)) {
159
+ if (stopGenerationRef.current) {
160
+ break;
161
+ }
162
+
163
+ const now = performance.now();
164
+ const elapsedSec = (now - startTime) / 1000;
165
+
166
+ setStats((prev) => ({
167
+ ...prev,
168
+ firstLatency: prev.firstLatency === null ? elapsedSec : prev.firstLatency,
169
+ processingTime: elapsedSec,
170
+ }));
171
+
172
+ const chunkDuration = result.audio.audio.length / result.audio.sampling_rate;
173
+ generatedAudioSeconds += chunkDuration;
174
+
175
+ fullAudioBufferRef.current.push(result.audio.audio);
176
+
177
+ // Only schedule streaming playback if user hasn't interrupted
178
+ if (!isPlaybackInterruptedRef.current) {
179
+ const buffer = ctx.createBuffer(1, result.audio.audio.length, result.audio.sampling_rate);
180
+ buffer.copyToChannel(result.audio.audio as any, 0);
181
+
182
+ const source = ctx.createBufferSource();
183
+ source.buffer = buffer;
184
+ source.connect(ctx.destination);
185
+ source.start(nextPlayTimeRef.current);
186
+
187
+ activeSourceNodesRef.current.push(source);
188
+ source.onended = () => {
189
+ const idx = activeSourceNodesRef.current.indexOf(source);
190
+ if (idx > -1) activeSourceNodesRef.current.splice(idx, 1);
191
+ };
192
+
193
+ nextPlayTimeRef.current += buffer.duration;
194
+ }
195
+
196
+ processedChars += result.text.length;
197
+ const currentRtf = elapsedSec / generatedAudioSeconds;
198
+ const currentCharsPerSec = processedChars / elapsedSec;
199
+
200
+ setStats((prev) => ({
201
+ ...prev,
202
+ charsPerSec: currentCharsPerSec,
203
+ rtf: currentRtf,
204
+ totalDuration: generatedAudioSeconds,
205
+ }));
206
+
207
+ setGenerationProgress((result.index / result.total) * 100);
208
+ }
209
+ } catch (e) {
210
+ console.error("Generation failed", e);
211
+ } finally {
212
+ setIsGenerating(false);
213
+ isPlaybackInterruptedRef.current = false; // Reset after completion
214
+ }
215
+ };
216
+
217
+ const handleSeek = (percentage: number) => {
218
+ if (!audioContextRef.current || fullAudioBufferRef.current.length === 0) return;
219
+
220
+ const ctx = audioContextRef.current;
221
+
222
+ isPlaybackInterruptedRef.current = true;
223
+ stopAllAudio();
224
+
225
+ const seekTime = stats.totalDuration * percentage;
226
+
227
+ let currentTimeInAudio = 0;
228
+ let nextPlayTime = ctx.currentTime;
229
+
230
+ // Reset startTime such that (currentTime - startTime) = seekTime
231
+ playbackStartTimeRef.current = ctx.currentTime - seekTime;
232
+
233
+ for (const chunk of fullAudioBufferRef.current) {
234
+ const chunkDuration = chunk.length / SAMPLE_RATE;
235
+ const chunkEndTime = currentTimeInAudio + chunkDuration;
236
+
237
+ if (chunkEndTime > seekTime) {
238
+ // This chunk needs to be played
239
+ const offsetInChunk = Math.max(0, seekTime - currentTimeInAudio);
240
+ const durationToPlay = chunkDuration - offsetInChunk;
241
+
242
+ const buffer = ctx.createBuffer(1, chunk.length, SAMPLE_RATE);
243
+ buffer.copyToChannel(chunk as any, 0);
244
+
245
+ const source = ctx.createBufferSource();
246
+ source.buffer = buffer;
247
+ source.connect(ctx.destination);
248
+
249
+ source.start(nextPlayTime, offsetInChunk);
250
+
251
+ activeSourceNodesRef.current.push(source);
252
+ source.onended = () => {
253
+ const idx = activeSourceNodesRef.current.indexOf(source);
254
+ if (idx > -1) activeSourceNodesRef.current.splice(idx, 1);
255
+ };
256
+
257
+ nextPlayTime += durationToPlay;
258
+ }
259
+
260
+ currentTimeInAudio += chunkDuration;
261
+ }
262
+
263
+ if (ctx.state === "suspended") ctx.resume();
264
+ setIsPlaying(true);
265
+ };
266
+
267
+ const handleDownload = () => {
268
+ if (fullAudioBufferRef.current.length === 0) return;
269
+ const blob = createAudioBlob(fullAudioBufferRef.current, SAMPLE_RATE);
270
+ const url = URL.createObjectURL(blob);
271
+ const a = document.createElement("a");
272
+ a.href = url;
273
+ a.download = "audio.wav";
274
+ a.click();
275
+ URL.revokeObjectURL(url);
276
+ };
277
+
278
+ const togglePlay = async () => {
279
+ if (!audioContextRef.current) return;
280
+
281
+ if (isPlaying) {
282
+ setIsPlaying(false);
283
+ audioContextRef.current.suspend();
284
+ } else {
285
+ setIsPlaying(true);
286
+ audioContextRef.current.resume();
287
+
288
+ // If we finished playing and hit play again, replay from start
289
+ if (!isGenerating && stats.currentDuration >= stats.totalDuration) {
290
+ handleSeek(0);
291
+ } else if (!isGenerating && fullAudioBufferRef.current.length > 0 && activeSourceNodesRef.current.length === 0) {
292
+ // This handles the case where we paused/stopped but haven't technically reached "end" OR we are resuming replay
293
+ const currentProgress = stats.totalDuration > 0 ? stats.currentDuration / stats.totalDuration : 0;
294
+ handleSeek(currentProgress);
295
+ }
296
+ }
297
+ };
298
+
299
+ const canGenerate = text.length >= 10 && pipelineReady;
300
+
301
+ return (
302
+ <div className="min-h-screen bg-[#F2F2F2] font-sans text-gray-900 selection:bg-yellow-200 flex items-center justify-center py-10">
303
+ <div className="w-full max-w-7xl px-4 md:px-6">
304
+ <div className="text-center mb-10">
305
+ <h3 className="text-4xl md:text-6xl font-medium text-gray-900 tracking-tight">Supertonic WebGPU</h3>
306
+ <h4 className="text-gray-600 mt-3 text-2xl md:text-3xl font-light">
307
+ Generate speech directly in your browser
308
+ </h4>
309
+ </div>
310
+
311
+ <div className="bg-white rounded-2xl shadow-2xl overflow-hidden border border-gray-100 max-w-7xl mx-auto p-2">
312
+ <div className="hidden md:grid grid-cols-1 md:grid-cols-2 border-b border-gray-100 bg-white relative rounded-t-xl">
313
+ <div className="px-8 py-6 flex items-center justify-center">
314
+ <div className="text-3xl font-normal text-gray-800">Text</div>
315
+ </div>
316
+
317
+ <div className="px-8 py-6 flex flex-col items-center justify-center relative bg-gray-50/30 md:bg-white">
318
+ <div className="text-3xl font-normal text-gray-800 mb-2">Speech</div>
319
+ </div>
320
+
321
+ <div className="absolute left-1/2 top-1/2 -translate-x-1/2 -translate-y-1/2 bg-white p-3 rounded-full z-10 shadow-sm border border-gray-50">
322
+ <Zap className="text-yellow-400 fill-yellow-400 drop-shadow-sm" size={32} />
323
+ </div>
324
+ </div>
325
+
326
+ <div className="flex flex-col md:flex-row min-h-[450px]">
327
+ <div className="w-full md:w-1/2 p-8 border-r border-gray-100 flex flex-col bg-white relative">
328
+ <textarea
329
+ className="w-full flex-grow text-xl md:text-2xl text-gray-800 placeholder-gray-300 outline-none resize-none font-light leading-relaxed bg-transparent"
330
+ placeholder="This text-to-speech system runs entirely in your browser, providing fast and private operation without sending any data to external servers."
331
+ value={text}
332
+ onChange={(e) => {
333
+ setText(e.target.value);
334
+ setActiveTab("Freeform");
335
+ }}
336
+ spellCheck={false}
337
+ />
338
+
339
+ <div className="mt-auto w-full">
340
+ <div className="flex justify-end mb-2">
341
+ <div className="flex items-center gap-2 text-xs md:text-sm font-mono text-gray-400">
342
+ {text.length > 0 ? text.length : 0} chars
343
+ {text.length >= 10 ? (
344
+ <Check size={14} className="text-green-500" />
345
+ ) : (
346
+ <X size={14} className="text-red-500" />
347
+ )}
348
+ </div>
349
+ </div>
350
+
351
+ <div className="pt-6 flex flex-wrap items-center border-t border-gray-100 text-gray-500">
352
+ <div className="flex gap-3 md:gap-5 text-sm md:text-base overflow-x-auto pb-2 md:pb-0 w-full">
353
+ {Object.keys(exampleTexts).map((key) => (
354
+ <button
355
+ key={key}
356
+ onClick={() => handleExampleClick(key)}
357
+ className={`flex items-center gap-1.5 transition whitespace-nowrap ${activeTab === key ? "text-blue-600 font-semibold border-b-2 border-blue-500 pb-0.5" : "hover:text-gray-900"}`}
358
+ >
359
+ {key === "Quote" && <Quote size={16} />}
360
+ {key === "Paragraph" && <AlignLeft size={16} />}
361
+ {key === "Full story" && <FileText size={16} />}
362
+ {key === "Random" && <Dices size={16} />}
363
+ {key === "Freeform" && <Type size={16} />}
364
+ {key}
365
+ </button>
366
+ ))}
367
+ </div>
368
+ </div>
369
+ </div>
370
+ </div>
371
+
372
+ <Controls
373
+ quality={quality}
374
+ setQuality={setQuality}
375
+ speed={speed}
376
+ setSpeed={setSpeed}
377
+ voice={voice}
378
+ setVoice={setVoice}
379
+ onGenerate={handleGenerate}
380
+ onStop={handleStop}
381
+ isGenerating={isGenerating}
382
+ canGenerate={canGenerate}
383
+ pipelineReady={pipelineReady}
384
+ progress={generationProgress}
385
+ loadingProgress={downloadProgress}
386
+ />
387
+ </div>
388
+
389
+ {showResults && (
390
+ <div className="px-4 pb-4">
391
+ <AudioResult
392
+ stats={stats}
393
+ progressPercentage={generationProgress}
394
+ isGenerating={isGenerating}
395
+ isPlaying={isPlaying}
396
+ onTogglePlay={togglePlay}
397
+ onDownload={handleDownload}
398
+ onSeek={handleSeek}
399
+ />
400
+ </div>
401
+ )}
402
+ </div>
403
+ </div>
404
+ </div>
405
+ );
406
+ };
407
+
408
+ const App = () => {
409
+ return (
410
+ <TTSProvider>
411
+ <AppContent />
412
+ </TTSProvider>
413
+ );
414
+ };
415
+
416
+ export default App;
src/components/AudioResult.tsx ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React, { useRef } from "react";
2
+ import { Play, Pause, Download } from "lucide-react";
3
+
4
/** Props for the audio results bar shown beneath the generator. */
interface AudioResultProps {
  stats: {
    firstLatency: number | null; // seconds until the first audio chunk (null until one arrives)
    processingTime: number; // total generation wall time, seconds
    charsPerSec: number; // generator throughput
    rtf: number; // real-time factor (processing time / audio duration)
    totalDuration: number; // total generated audio length, seconds
    currentDuration: number; // current playback position, seconds
  };
  progressPercentage: number;
  isGenerating: boolean;
  isPlaying: boolean;
  onTogglePlay: () => void;
  onDownload: () => void;
  onSeek: (percentage: number) => void;
}

/**
 * Results bar: generation statistics plus a minimal audio player
 * (play/pause, clickable progress bar for seeking, WAV download).
 * Pure presentation — all audio state lives in the parent component.
 */
export const AudioResult = ({
  stats,
  progressPercentage,
  isGenerating,
  isPlaying,
  onTogglePlay,
  onDownload,
  onSeek,
}: AudioResultProps) => {
  const progressBarRef = useRef<HTMLDivElement>(null);

  // Format seconds as "m:ss.ss" with a zero-padded seconds field.
  const formatTime = (secs: number) => {
    const minutes = Math.floor(secs / 60);
    const seconds = (secs % 60).toFixed(2);
    return `${minutes}:${seconds.padStart(5, "0")}`;
  };

  // Playback position as a 0-100 percentage, clamped while audio is still streaming in.
  const playbackProgress =
    stats.totalDuration > 0 ? Math.min(100, (stats.currentDuration / stats.totalDuration) * 100) : 0;

  // Translate a click on the progress bar into a 0-1 seek fraction.
  const handleSeekClick = (e: React.MouseEvent<HTMLDivElement>) => {
    if (progressBarRef.current) {
      const rect = progressBarRef.current.getBoundingClientRect();
      const x = e.clientX - rect.left;
      const percentage = Math.max(0, Math.min(1, x / rect.width));
      onSeek(percentage);
    }
  };

  return (
    <div className="mt-8 relative rounded-xl overflow-hidden animate-in fade-in slide-in-from-top-4 duration-500 border border-gray-200">
      {/* Generation progress is rendered as a background fill behind the whole bar */}
      <div
        className="absolute top-0 left-0 bottom-0 bg-[#E5E7EB] transition-all duration-500 ease-out -z-10"
        style={{ width: `${progressPercentage}%` }}
      />
      <div className="absolute top-0 left-0 w-full h-full bg-gray-50 -z-20" />

      <div className="p-4 flex flex-col md:flex-row items-center gap-6 relative z-10">
        <div className="flex flex-col min-w-[80px]">
          <span className="text-blue-600 font-semibold text-lg leading-tight">Supertonic</span>
          <span className="text-gray-600 text-sm font-medium">On-Device</span>
        </div>

        {/* Stats: processing time (first latency / total), chars per second, RTF */}
        <div className="flex-1 grid grid-cols-3 gap-x-8 gap-y-1 text-center border-l border-r border-gray-300 px-4 md:px-8">
          <div className="flex flex-col items-center">
            <div className="font-mono text-gray-900 text-lg tracking-tight">
              {stats.firstLatency !== null ? (
                <>
                  <span className="text-[10px] text-gray-500 font-sans font-bold uppercase mr-1 align-middle">
                    First
                  </span>
                  {stats.firstLatency.toFixed(2)}
                  <span className="text-sm text-gray-500">s</span>
                  <span className="mx-1 text-gray-400">/</span>
                </>
              ) : null}
              {stats.processingTime.toFixed(2)}
              <span className="text-sm text-gray-500">s</span>
            </div>
            <div className="text-[10px] text-gray-500 uppercase font-bold tracking-wider mt-1">Processing Time ↓</div>
          </div>

          <div className="flex flex-col items-center">
            <div className="font-mono text-gray-900 text-lg tracking-tight">
              {stats.charsPerSec > 0 ? stats.charsPerSec.toFixed(1) : "-"}
            </div>
            <div className="text-[10px] text-gray-500 uppercase font-bold tracking-wider mt-1">Chars/sec ↑</div>
          </div>

          <div className="flex flex-col items-center">
            <div className="font-mono text-gray-900 text-lg tracking-tight">
              {stats.rtf > 0 ? stats.rtf.toFixed(3) : "-"}
              <span className="text-sm text-gray-500">x</span>
            </div>
            <div className="text-[10px] text-gray-500 uppercase font-bold tracking-wider mt-1">RTF ↓</div>
          </div>
        </div>

        {/* Player: play/pause, elapsed, seekable progress bar, total, download */}
        <div className="flex items-center gap-4 min-w-[300px] w-full md:w-auto">
          <button
            onClick={onTogglePlay}
            className={`w-10 h-10 rounded-full flex items-center justify-center transition-colors flex-shrink-0 text-gray-800 bg-white hover:bg-gray-100 shadow-sm`}
          >
            {isPlaying ? (
              <Pause size={18} fill="currentColor" />
            ) : (
              <Play size={18} fill="currentColor" className="ml-0.5" />
            )}
          </button>

          <span className="font-mono text-xs text-gray-600 w-[50px] text-right">
            {formatTime(stats.currentDuration)}
          </span>

          <div
            ref={progressBarRef}
            className="relative flex-1 h-1.5 bg-gray-300 rounded-full overflow-hidden min-w-[100px] cursor-pointer hover:h-2 transition-all group"
            onClick={handleSeekClick}
          >
            <div
              className="absolute top-0 left-0 h-full bg-blue-500 rounded-full group-hover:bg-blue-600"
              style={{ width: `${playbackProgress}%` }}
            />
          </div>

          <span className="font-mono text-xs text-gray-600 w-[50px]">{formatTime(stats.totalDuration)}</span>

          {/* Download is disabled mid-generation: the WAV would be incomplete */}
          <button
            onClick={onDownload}
            disabled={isGenerating}
            className={`p-2 rounded-full text-gray-700 transition-colors ${isGenerating ? "opacity-50 cursor-not-allowed" : "hover:bg-gray-200"}`}
          >
            <Download size={18} />
          </button>
        </div>
      </div>
    </div>
  );
};
src/components/Controls.tsx ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Zap, Square } from "lucide-react";
2
+
3
/** Props for the generation controls panel. */
interface ControlsProps {
  quality: number; // diffusion/step count passed to the TTS pipeline
  setQuality: (value: number) => void;
  speed: number; // speech-rate multiplier
  setSpeed: (value: number) => void;
  voice: string; // selected voice label ("Female" | "Male")
  setVoice: (value: string) => void;
  onGenerate: () => void;
  onStop: () => void;
  isGenerating: boolean;
  canGenerate: boolean;
  pipelineReady: boolean;
  progress?: number; // generation progress percentage, shown while generating
  loadingProgress: number; // model download percentage, shown before the pipeline is ready
}

/**
 * Right-hand panel: voice selection, quality and speed sliders, and the
 * Generate/Stop buttons. Stateless — all values and callbacks come from props.
 */
export const Controls = ({
  quality,
  setQuality,
  speed,
  setSpeed,
  voice,
  setVoice,
  onGenerate,
  onStop,
  isGenerating,
  canGenerate,
  pipelineReady,
  progress,
  loadingProgress,
}: ControlsProps) => {
  return (
    <div className="w-full md:w-1/2 p-8 bg-[#F9FAFB] flex flex-col gap-8 border-t md:border-t-0">
      {/* Voice picker */}
      <div className="flex items-center gap-6">
        <span className="font-semibold text-gray-900">Voice:</span>
        <div className="flex gap-4 text-sm">
          {["Female", "Male"].map((v) => (
            <button
              key={v}
              onClick={() => setVoice(v)}
              className={`pb-1 transition-all font-medium border-b-2 ${
                voice === v ? "text-blue-600 border-blue-600" : "text-gray-400 hover:text-gray-600 border-transparent"
              }`}
            >
              {v}
            </button>
          ))}
        </div>
      </div>

      {/* Quality slider: 1-50 inference steps */}
      <div>
        <div className="flex justify-between mb-3 items-end">
          <span className="font-semibold text-gray-900 text-sm">
            Quality (Steps): <span className="text-base">{quality}</span>
          </span>
          <span className="text-gray-400 text-xs italic">Higher = Better quality but slower</span>
        </div>
        <input
          type="range"
          min="1"
          max="50"
          value={quality}
          onChange={(e) => setQuality(parseInt(e.target.value))}
          className="w-full h-1.5 bg-gray-300 rounded-lg appearance-none cursor-pointer accent-gray-900 hover:accent-blue-600"
        />
        <div className="w-full flex justify-center mt-1">
          <div className="w-0.5 h-1 bg-gray-300"></div>
        </div>
      </div>

      {/* Speed slider: 0.80x - 1.20x in 0.01 steps */}
      <div>
        <div className="flex justify-between mb-3 items-end">
          <span className="font-semibold text-gray-900 text-sm">
            Speed: <span className="text-base">{speed.toFixed(2)}x</span>
          </span>
          <span className="text-gray-400 text-xs italic">Higher = faster speech</span>
        </div>
        <input
          type="range"
          min="0.8"
          max="1.2"
          step="0.01"
          value={speed}
          onChange={(e) => setSpeed(parseFloat(e.target.value))}
          className="w-full h-1.5 bg-gray-300 rounded-lg appearance-none cursor-pointer accent-gray-900 hover:accent-blue-600"
        />
        <div className="w-full flex justify-center mt-1">
          <div className="w-0.5 h-1 bg-gray-300"></div>
        </div>
      </div>

      {/* Generate button doubles as the model-loading indicator; Stop appears only mid-generation */}
      <div className="mt-auto pt-4 flex gap-2">
        <button
          onClick={onGenerate}
          disabled={isGenerating || !canGenerate}
          className={`
            flex-1 py-4 rounded-lg font-bold text-lg flex items-center justify-center gap-3 shadow-sm transition-all
            ${
              isGenerating || !canGenerate
                ? "bg-gray-200 text-gray-400 cursor-not-allowed"
                : "bg-yellow-400 text-gray-900 hover:bg-yellow-300 active:scale-[0.99]"
            }
          `}
        >
          {isGenerating ? (
            <>
              <div className="animate-spin rounded-full h-5 w-5 border-b-2 border-gray-400"></div>
              <span>
                Generating... {progress !== undefined && <span className="font-mono">({Math.round(progress)}%)</span>}
              </span>
            </>
          ) : (
            <>
              <Zap size={20} className={!canGenerate ? "fill-gray-400" : "fill-black"} />
              {pipelineReady ? (
                "Generate Speech"
              ) : (
                <span>
                  Loading Model...
                  {loadingProgress > 0 && <span className="font-mono"> ({Math.round(loadingProgress)}%)</span>}
                </span>
              )}
            </>
          )}
        </button>
        {isGenerating && (
          <button
            onClick={onStop}
            className="px-6 rounded-lg font-bold text-lg flex items-center justify-center shadow-sm transition-all bg-red-100 text-red-600 hover:bg-red-200 active:scale-[0.99]"
          >
            <Square size={20} fill="currentColor" />
          </button>
        )}
      </div>
    </div>
  );
};
src/components/TTSContext.tsx ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { createContext, useContext } from "react";
2
+ import type { RefObject } from "react";
3
+ import type { TextToAudioPipeline } from "@huggingface/transformers";
4
+
5
+ export interface TTSContextType {
6
+ pipelineReady: boolean;
7
+ downloadProgress: number;
8
+ tts: RefObject<TextToAudioPipeline | null>;
9
+ speakerEmbeddings: RefObject<Record<string, Float32Array> | null>;
10
+ }
11
+
12
+ export const TTSContext = createContext<TTSContextType | undefined>(undefined);
13
+
14
+ export const useTTS = () => {
15
+ const context = useContext(TTSContext);
16
+ if (context === undefined) {
17
+ throw new Error("useTTS must be used within a TTSProvider");
18
+ }
19
+ return context;
20
+ };
src/components/TTSProvider.tsx ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState, useEffect, useRef } from "react";
2
+ import { loadPipeline, loadEmbeddings } from "../tts";
3
+ import { TTSContext } from "./TTSContext";
4
+
5
+ import type { ReactNode } from "react";
6
+ import type { TextToAudioPipeline } from "@huggingface/transformers";
7
+
8
+ export const TTSProvider = ({ children }: { children: ReactNode }) => {
9
+ const [pipelineReady, setPipelineReady] = useState(false);
10
+ const [downloadProgress, setDownloadProgress] = useState(0);
11
+ const tts = useRef<TextToAudioPipeline | null>(null);
12
+ const speakerEmbeddings = useRef<Record<string, Float32Array> | null>(null);
13
+
14
+ useEffect(() => {
15
+ if (pipelineReady) return;
16
+
17
+ const progressMap = new Map<string, number>();
18
+ const onProgress = (info: any) => {
19
+ if (info.status === "progress" && info.file.endsWith(".onnx_data")) {
20
+ progressMap.set(info.file, info.loaded / info.total);
21
+ const total = Array.from(progressMap.values()).reduce((a, b) => a + b, 0);
22
+ setDownloadProgress((total / 3) * 100); // 3 model files to download
23
+ }
24
+ };
25
+
26
+ Promise.all([loadPipeline(onProgress), loadEmbeddings()]).then(([pipeline, embeddings]) => {
27
+ tts.current = pipeline;
28
+ speakerEmbeddings.current = embeddings;
29
+ setPipelineReady(true);
30
+ });
31
+ }, [pipelineReady]);
32
+
33
+ return (
34
+ <TTSContext.Provider
35
+ value={{
36
+ pipelineReady,
37
+ downloadProgress,
38
+ tts,
39
+ speakerEmbeddings,
40
+ }}
41
+ >
42
+ {children}
43
+ </TTSContext.Provider>
44
+ );
45
+ };
src/constants.ts ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Sampling rate (Hz) used for AudioContext buffers and WAV export in App.tsx.
export const SAMPLE_RATE = 44100;

// Preset texts for the example tabs, keyed by tab label.
// - "Full story" starts empty and is filled at runtime with fetched text (App.tsx).
// - "Random" is a pool; the UI picks one entry at random per click.
// - "Freeform" is empty: the user types their own text.
// NOTE: insertion order matters — Object.keys() order is the tab render order.
export const EXAMPLE_SENTENCES = {
  Quote: '"It is not death that a man should fear, but he should fear never beginning to live."',
  Paragraph:
    "The concept of artificial intelligence has captivated human imagination for decades. From early science fiction to modern practical applications, AI has evolved from a dream into a tangible reality that shapes our daily lives.",
  "Full story": "",
  Random: [
    "The startup secured $5.2M in venture capital, a huge leap from their initial $450K seed round.",
    "The train delay was announced at 4:45 PM on Wed, Apr 3, 2024 due to track maintenance.",
    "You can reach the hotel front desk at (212) 555-0142 ext. 402 anytime.",
    "Our drone battery lasts 2.3h when flying at 30kph with full camera payload.",
    "The recipe calls for 250g of flour, 150ml of milk, and 2 large eggs.",
    "Her favorite painting is Starry Night by Vincent van Gogh, created in 1889.",
    "The new smartphone model features a 6.5-inch OLED display and a 48MP camera.",
    "The hiking trail to the summit is approximately 8.4 miles round trip with a 2,000 feet elevation gain.",
    "The novel explores themes of identity, freedom, and the human condition in a dystopian future.",
    "The symphony orchestra will perform Beethoven's 9th Symphony this Saturday at 7 PM.",
    "Hugging Face was founded in 2016 and has become a leading platform for open-source machine learning.",
  ],
  Freeform: "",
};
src/index.css ADDED
@@ -0,0 +1 @@
 
 
1
+ @import "tailwindcss";
src/main.tsx ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
import { StrictMode } from "react";
import { createRoot } from "react-dom/client";
import "./index.css";
import App from "./App.tsx";

// Fail fast with a clear message if the mount point is missing, instead of
// relying on a non-null assertion (`!`) and a cryptic crash inside createRoot.
const container = document.getElementById("root");
if (!container) {
  throw new Error('Root element "#root" not found in index.html');
}

createRoot(container).render(
  <StrictMode>
    <App />
  </StrictMode>,
);
src/splitter.ts ADDED
@@ -0,0 +1,417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Returns true if the character is considered a sentence terminator.
3
+ * This includes ASCII (".", "!", "?") and common Unicode terminators.
4
+ * NOTE: We also include newlines here, as this is favourable for text-to-speech systems.
5
+ * @param c The character to test.
6
+ * @param includeNewlines Whether to treat newlines as terminators.
7
+ */
8
+ function isSentenceTerminator(c: string, includeNewlines: boolean = true): boolean {
9
+ return ".!?…。?!".includes(c) || (includeNewlines && c === "\n");
10
+ }
11
+
12
+ /**
13
+ * Returns true if the character should be attached to the sentence terminator,
14
+ * such as closing quotes or brackets.
15
+ * @param c The character to test.
16
+ */
17
+ function isTrailingChar(c: string): boolean {
18
+ return "\"')]}」』".includes(c);
19
+ }
20
+
21
+ /**
22
+ * Extracts a token (a contiguous sequence of non–whitespace characters)
23
+ * from the buffer starting at the given index.
24
+ * @param buffer The input text.
25
+ * @param start The starting index.
26
+ * @returns The extracted token.
27
+ */
28
+ function getTokenFromBuffer(buffer: string, start: number): string {
29
+ let end = start;
30
+ while (end < buffer.length && !/\s/.test(buffer[end])) {
31
+ ++end;
32
+ }
33
+ return buffer.substring(start, end);
34
+ }
35
+
36
+ // List of common abbreviations. Note that strings with single letters joined by periods
37
+ // (e.g., "i.e", "e.g", "u.s.a", "u.s") are handled separately.
38
+ const ABBREVIATIONS: Set<string> = new Set([
39
+ "mr",
40
+ "mrs",
41
+ "ms",
42
+ "dr",
43
+ "prof",
44
+ "sr",
45
+ "jr",
46
+ "sgt",
47
+ "col",
48
+ "gen",
49
+ "rep",
50
+ "sen",
51
+ "gov",
52
+ "lt",
53
+ "maj",
54
+ "capt",
55
+ "st",
56
+ "mt",
57
+ "etc",
58
+ "co",
59
+ "inc",
60
+ "ltd",
61
+ "dept",
62
+ "vs",
63
+ "p",
64
+ "pg",
65
+ "jan",
66
+ "feb",
67
+ "mar",
68
+ "apr",
69
+ "jun",
70
+ "jul",
71
+ "aug",
72
+ "sep",
73
+ "sept",
74
+ "oct",
75
+ "nov",
76
+ "dec",
77
+ "sun",
78
+ "mon",
79
+ "tu",
80
+ "tue",
81
+ "tues",
82
+ "wed",
83
+ "th",
84
+ "thu",
85
+ "thur",
86
+ "thurs",
87
+ "fri",
88
+ "sat",
89
+ ]);
90
+
91
+ /**
92
+ * Determines if the given token (or series of initials) is a known abbreviation.
93
+ * @param token The token to check.
94
+ */
95
+ function isAbbreviation(token: string): boolean {
96
+ // Remove possessive endings and trailing periods.
97
+ token = token.replace(/['’]s$/i, "").replace(/\.+$/, "");
98
+ return ABBREVIATIONS.has(token.toLowerCase());
99
+ }
100
+
101
+ // Map of closing punctuation to their corresponding opening punctuation.
102
+ const MATCHING: Map<string, string> = new Map([
103
+ [")", "("],
104
+ ["]", "["],
105
+ ["}", "{"],
106
+ ["》", "《"],
107
+ ["〉", "〈"],
108
+ ["›", "‹"],
109
+ ["»", "«"],
110
+ ["〉", "〈"],
111
+ ["」", "「"],
112
+ ["』", "『"],
113
+ ["〕", "〔"],
114
+ ["】", "【"],
115
+ ]);
116
+
117
+ // Set of opening punctuation characters.
118
+ const OPENING: Set<string> = new Set(MATCHING.values());
119
+
120
/**
 * Updates the nesting stack used to track quotes and paired punctuation.
 * Supports standard pairs (", ', (), [], {}) and CJK quotes/brackets (e.g. 「」, 『』).
 * An apostrophe between letters (contractions) or at the end of a word
 * (possessives) is ignored so that it is not mistaken for an opening quote.
 * @param c The current character.
 * @param stack The current nesting stack (mutated in place).
 * @param i The index of the character in the buffer.
 * @param buffer The full text being processed.
 */
function updateStack(c: string, stack: string[], i: number, buffer: string): void {
  // Handle standard quotes (which use the same character for open and close).
  if (c === '"' || c === "'") {
    // Ignore an apostrophe if it's between letters (e.g., in contractions like "don't").
    if (
      c === "'" &&
      i > 0 &&
      i < buffer.length - 1 &&
      /[A-Za-z]/.test(buffer[i - 1]) &&
      /[A-Za-z]/.test(buffer[i + 1])
    ) {
      return;
    }

    // Ignore an apostrophe at the end of a word (e.g., possessive "wives'") —
    // unless a single quote is currently open, in which case this is its closer.
    if (c === "'" && i > 0 && /[A-Za-z]/.test(buffer[i - 1]) && (!stack.length || stack.at(-1) !== "'")) {
      return;
    }

    // If the quote is already in the stack, it means we are closing it.
    // We search from the top of the stack down.
    const stackIndex = stack.lastIndexOf(c);
    if (stackIndex !== -1) {
      // We found the matching opening quote.
      // If it's not at the top (e.g. stack is ['"', "'"] and c is '"'),
      // it means the intermediate quotes (like the single quote) were likely
      // apostrophes/contractions that were misidentified as opening quotes.
      // We "close" them all by unwinding the stack to this point.
      stack.splice(stackIndex);
    } else {
      // No matching open quote — treat this as an opening quote.
      stack.push(c);
    }
    return;
  }
  // Handle opening punctuation (brackets/guillemets with distinct openers).
  if (OPENING.has(c)) {
    stack.push(c);
    return;
  }
  // Handle closing punctuation: pop only if it matches the most recent opener,
  // so an unbalanced closer is silently ignored.
  const expectedOpening = MATCHING.get(c);
  if (expectedOpening && stack.length && stack.at(-1) === expectedOpening) {
    stack.pop();
  }
}
174
+
175
+ /**
176
+ * A simple stream-based text splitter that emits complete sentences.
177
+ */
178
+ export class TextSplitterStream implements AsyncIterable<string>, Iterable<string> {
179
+ private _buffer: string;
180
+ private _sentences: string[];
181
+ private _resolver: (() => void) | null;
182
+ private _closed: boolean;
183
+
184
+ constructor() {
185
+ this._buffer = "";
186
+ this._sentences = [];
187
+ this._resolver = null;
188
+ this._closed = false;
189
+ }
190
+
191
+ /**
192
+ * Push one or more text chunks into the stream.
193
+ * @param texts Text fragments to process.
194
+ */
195
+ push(...texts: string[]): void {
196
+ for (const txt of texts) {
197
+ this._buffer += txt;
198
+ this._process();
199
+ }
200
+ }
201
+
202
+ /**
203
+ * Closes the stream, signaling that no more text will be pushed.
204
+ * This will flush any remaining text in the buffer as a sentence
205
+ * and allow the consuming process to finish processing the stream.
206
+ */
207
+ close(): void {
208
+ if (this._closed) {
209
+ throw new Error("Stream is already closed.");
210
+ }
211
+ this._closed = true;
212
+ this.flush();
213
+ }
214
+
215
+ /**
216
+ * Flushes any remaining text in the buffer as a sentence.
217
+ */
218
+ flush(): void {
219
+ const remainder = this._buffer.trim();
220
+ if (remainder.length > 0) {
221
+ this._sentences.push(remainder);
222
+ }
223
+ this._buffer = "";
224
+ this._resolve();
225
+ }
226
+
227
+ /**
228
+ * Resolve the pending promise to signal that sentences are available.
229
+ */
230
+ private _resolve(): void {
231
+ if (this._resolver) {
232
+ this._resolver();
233
+ this._resolver = null;
234
+ }
235
+ }
236
+
237
+ /**
238
+ * Processes the internal buffer to extract complete sentences.
239
+ * If the potential sentence boundary is at the end of the current buffer,
240
+ * it waits for more text before splitting.
241
+ */
242
+ private _process(): void {
243
+ let sentenceStart = 0;
244
+ const buffer = this._buffer;
245
+ const len = buffer.length;
246
+ let i = 0;
247
+ let stack: string[] = [];
248
+
249
+ // Helper to scan from the current index over trailing terminators and punctuation.
250
+ const scanBoundary = (idx: number): { end: number; nextNonSpace: number } => {
251
+ let end = idx;
252
+ // Consume contiguous sentence terminators (excluding newlines).
253
+ while (end + 1 < len && isSentenceTerminator(buffer[end + 1], false)) {
254
+ ++end;
255
+ }
256
+ // Consume trailing characters (e.g., closing quotes/brackets).
257
+ while (end + 1 < len && isTrailingChar(buffer[end + 1])) {
258
+ ++end;
259
+ }
260
+ let nextNonSpace = end + 1;
261
+ while (nextNonSpace < len && /\s/.test(buffer[nextNonSpace])) {
262
+ ++nextNonSpace;
263
+ }
264
+ return { end, nextNonSpace };
265
+ };
266
+
267
+ while (i < len) {
268
+ const c = buffer[i];
269
+ updateStack(c, stack, i, buffer);
270
+
271
+ // Only consider splitting if we're not inside any nested structure.
272
+ if (stack.length === 0 && isSentenceTerminator(c)) {
273
+ const currentSegment = buffer.slice(sentenceStart, i);
274
+ // Skip splitting for likely numbered lists (e.g., "1." or "\n2.").
275
+ if (/(^|\n)\d+$/.test(currentSegment)) {
276
+ ++i;
277
+ continue;
278
+ }
279
+
280
+ const { end: boundaryEnd, nextNonSpace } = scanBoundary(i);
281
+
282
+ // If the terminator is not a newline and there's no extra whitespace,
283
+ // we might be in the middle of a token (e.g., "$9.99"), so skip splitting.
284
+ if (i === nextNonSpace - 1 && c !== "\n") {
285
+ ++i;
286
+ continue;
287
+ }
288
+
289
+ // Wait for more text if there's no non-whitespace character yet.
290
+ if (nextNonSpace === len) {
291
+ break;
292
+ }
293
+
294
+ // Determine the token immediately preceding the terminator.
295
+ let tokenStart = i - 1;
296
+ while (tokenStart >= 0 && /\S/.test(buffer[tokenStart])) {
297
+ tokenStart--;
298
+ }
299
+ tokenStart = Math.max(sentenceStart, tokenStart + 1);
300
+ const token = getTokenFromBuffer(buffer, tokenStart);
301
+ if (!token) {
302
+ ++i;
303
+ continue;
304
+ }
305
+
306
+ // --- URL/email protection ---
307
+ // If the token appears to be a URL or email (contains "://" or "@")
308
+ // and does not already end with a terminator, skip splitting.
309
+ if (
310
+ (/https?[,:]\/\//.test(token) || token.includes("@")) &&
311
+ token.at(-1) &&
312
+ !isSentenceTerminator(token.at(-1)!)
313
+ ) {
314
+ i = tokenStart + token.length;
315
+ continue;
316
+ }
317
+
318
+ // --- Abbreviation protection ---
319
+ if (isAbbreviation(token)) {
320
+ ++i;
321
+ continue;
322
+ }
323
+
324
+ // --- Middle initials heuristic ---
325
+ // If the token is a series of single-letter initials (each ending in a period)
326
+ // and is followed by a capitalized word, assume it's part of a name.
327
+ if (/^([A-Za-z]\.)+$/.test(token) && nextNonSpace < len && /[A-Z]/.test(buffer[nextNonSpace])) {
328
+ ++i;
329
+ continue;
330
+ }
331
+
332
+ // --- Lookahead heuristic ---
333
+ // If the terminator is a period and the next non–whitespace character is lowercase,
334
+ // assume it is not the end of a sentence.
335
+ if (c === "." && nextNonSpace < len && /[a-z]/.test(buffer[nextNonSpace])) {
336
+ ++i;
337
+ continue;
338
+ }
339
+
340
+ // Special case: ellipsis that stands alone should be merged with the following sentence.
341
+ const sentence = buffer.substring(sentenceStart, boundaryEnd + 1).trim();
342
+ if (sentence === "..." || sentence === "��") {
343
+ ++i;
344
+ continue;
345
+ }
346
+
347
+ // Accept the sentence boundary.
348
+ if (sentence) {
349
+ this._sentences.push(sentence);
350
+ }
351
+ // Move to the next sentence.
352
+ i = sentenceStart = boundaryEnd + 1;
353
+ continue;
354
+ }
355
+ ++i;
356
+ }
357
+
358
+ // Remove the processed portion of the buffer.
359
+ this._buffer = buffer.substring(sentenceStart);
360
+
361
+ // Resolve any pending promise if sentences are available.
362
+ if (this._sentences.length > 0) {
363
+ this._resolve();
364
+ }
365
+ }
366
+
367
+ /**
368
+ * Async iterator to yield sentences as they become available.
369
+ */
370
+ async *[Symbol.asyncIterator](): AsyncGenerator<string, void, void> {
371
+ if (this._resolver) {
372
+ throw new Error("Another iterator is already active.");
373
+ }
374
+ while (true) {
375
+ if (this._sentences.length > 0) {
376
+ // We use shift()! because we checked length > 0, so it cannot be undefined
377
+ yield this._sentences.shift()!;
378
+ } else if (this._closed) {
379
+ // No more text will be pushed.
380
+ break;
381
+ } else {
382
+ // Wait for more text.
383
+ await new Promise<void>((resolve) => {
384
+ this._resolver = resolve;
385
+ });
386
+ }
387
+ }
388
+ }
389
+
390
+ /**
391
+ * Synchronous iterator that flushes the buffer and returns all sentences.
392
+ */
393
+ [Symbol.iterator](): Iterator<string> {
394
+ this.flush();
395
+ const iterator = this._sentences[Symbol.iterator]();
396
+ this._sentences = [];
397
+ return iterator;
398
+ }
399
+
400
+ /**
401
+ * Returns the array of sentences currently available.
402
+ */
403
+ get sentences(): string[] {
404
+ return this._sentences;
405
+ }
406
+ }
407
+
408
+ /**
409
+ * Splits the input text into an array of sentences.
410
+ * @param text The text to split.
411
+ * @returns An array of sentences.
412
+ */
413
+ export function split(text: string): string[] {
414
+ const splitter = new TextSplitterStream();
415
+ splitter.push(text);
416
+ return [...splitter];
417
+ }
src/tts.ts ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { pipeline, RawAudio, TextToAudioPipeline } from "@huggingface/transformers";
2
+ import { split } from "./splitter";
3
+
4
// Hugging Face model repo containing the ONNX-exported Supertonic TTS model.
const MODEL_ID = "onnx-community/Supertonic-TTS-ONNX";
// Base URL for the raw speaker-embedding binaries hosted alongside the model.
const VOICES_URL = `https://huggingface.co/${MODEL_ID}/resolve/main/voices/`;

// Module-level caches: the pipeline and embeddings are created at most once,
// even when the loader functions are called concurrently from several places.
let pipelinePromise: Promise<TextToAudioPipeline> | null = null;
let embeddingsPromise: Promise<Record<string, Float32Array>> | null = null;
9
+
10
+ export async function loadPipeline(progressCallback: (info: any) => void) {
11
+ // @ts-ignore
12
+ return (pipelinePromise ??= pipeline("text-to-speech", MODEL_ID, {
13
+ device: "webgpu",
14
+ progress_callback: progressCallback,
15
+ }) as Promise<TextToAudioPipeline>);
16
+ }
17
+
18
+ export async function loadEmbeddings() {
19
+ return (embeddingsPromise ??= (async () => {
20
+ const [female, male] = await Promise.all([
21
+ fetch(`${VOICES_URL}F1.bin`).then((r) => r.arrayBuffer()),
22
+ fetch(`${VOICES_URL}M1.bin`).then((r) => r.arrayBuffer()),
23
+ ]);
24
+ return {
25
+ Female: new Float32Array(female),
26
+ Male: new Float32Array(male),
27
+ };
28
+ })());
29
+ }
30
+
31
/** One synthesized chunk yielded by streamTTS. */
export interface StreamResult {
  // performance.now() timestamp taken when this chunk's synthesis finished.
  time: number;
  // Generated audio for this chunk (padded with trailing silence for all but the last chunk).
  audio: RawAudio;
  // The text that was synthesized for this chunk.
  text: string;
  // 1-based index of this chunk.
  index: number;
  // Total number of chunks for the full input text.
  total: number;
}
38
+
39
/**
 * Splits text into sentence-based segments whose lengths fall (mostly)
 * between minCharacters and maxCharacters.
 *
 * Sentences (from split()) are accumulated into a space-joined buffer; the
 * buffer is emitted once it reaches minCharacters. A buffer that grows past
 * maxCharacters is hard-sliced at exactly maxCharacters — note this can cut
 * mid-word when a short carry-over plus the next sentence exceeds the limit.
 * @param text The input text.
 * @param minCharacters Minimum segment length before the buffer is emitted.
 * @param maxCharacters Maximum allowed segment length.
 * @returns The list of segments (a final sub-minCharacters remainder is still emitted).
 * @throws {Error} If a single sentence alone exceeds maxCharacters.
 */
function splitWithConstraints(text: string, { minCharacters = 1, maxCharacters = Infinity } = {}): string[] {
  if (!text) return [];
  const rawLines = split(text);
  const result: string[] = [];
  let currentBuffer = "";

  for (const rawLine of rawLines) {
    const line = rawLine.trim();
    if (!line) continue;
    // A single sentence longer than the limit cannot be placed at all.
    if (line.length > maxCharacters) {
      throw new Error(`A single segment exceeds the maximum character limit of ${maxCharacters} characters.`);
    }

    // Join sentences with a single space.
    if (currentBuffer) currentBuffer += " ";
    currentBuffer += line;

    // Hard-split any overflow at exactly maxCharacters (may cut mid-word).
    while (currentBuffer.length > maxCharacters) {
      result.push(currentBuffer.slice(0, maxCharacters));
      currentBuffer = currentBuffer.slice(maxCharacters);
    }
    // Emit the buffer once it is long enough; otherwise carry it forward.
    if (currentBuffer.length >= minCharacters) {
      result.push(currentBuffer);
      currentBuffer = "";
    }
  }
  // Emit any trailing remainder, even if shorter than minCharacters.
  if (currentBuffer) result.push(currentBuffer);
  return result;
}
67
+
68
/**
 * Streams synthesized speech for `text`, one segment at a time.
 *
 * The text is split into 100–1000 character segments; each segment is
 * synthesized sequentially and yielded as a StreamResult so the UI can start
 * playback before the whole input is done.
 * @param text The full input text.
 * @param tts The loaded text-to-audio pipeline.
 * @param speaker_embeddings Raw speaker-embedding vector selecting the voice.
 * @param quality Passed to the pipeline as `num_inference_steps`.
 * @param speed Playback/synthesis speed factor forwarded to the pipeline.
 */
export async function* streamTTS(
  text: string,
  tts: TextToAudioPipeline,
  speaker_embeddings: Float32Array,
  quality: number,
  speed: number,
): AsyncGenerator<StreamResult> {
  const chunks = splitWithConstraints(text, {
    minCharacters: 100,
    maxCharacters: 1000,
  });

  // splitWithConstraints returns [] for empty input; fall back to the raw text
  // so the loop below still runs once.
  if (chunks.length === 0) chunks.push(text);

  for (let i = 0; i < chunks.length; ++i) {
    const chunk = chunks[i];
    if (!chunk.trim()) continue;

    const output = (await tts(chunk, {
      speaker_embeddings,
      num_inference_steps: quality,
      speed,
    })) as RawAudio;

    if (i < chunks.length - 1) {
      // Add 0.5s silence between chunks for more natural flow.
      // NOTE: this mutates output.audio in place, replacing it with a
      // zero-padded copy (Float32Array is zero-initialized).
      const silenceSamples = Math.floor(0.5 * output.sampling_rate);
      const padded = new Float32Array(output.audio.length + silenceSamples);
      padded.set(output.audio);
      output.audio = padded;
    }
    yield {
      time: performance.now(),
      audio: output,
      text: chunk,
      index: i + 1,
      total: chunks.length,
    };
  }
}
108
+
109
+ export function createAudioBlob(chunks: Float32Array[], sampling_rate: number): Blob {
110
+ const totalLength = chunks.reduce((acc, chunk) => acc + chunk.length, 0);
111
+ const result = new Float32Array(totalLength);
112
+ let offset = 0;
113
+ for (const chunk of chunks) {
114
+ result.set(chunk, offset);
115
+ offset += chunk.length;
116
+ }
117
+
118
+ const audio = new RawAudio(result, sampling_rate);
119
+ const blob = audio.toBlob();
120
+ return blob;
121
+ }
tsconfig.app.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo",
4
+ "target": "ES2022",
5
+ "useDefineForClassFields": true,
6
+ "lib": ["ES2022", "DOM", "DOM.Iterable"],
7
+ "module": "ESNext",
8
+ "types": ["vite/client"],
9
+ "skipLibCheck": true,
10
+
11
+ /* Bundler mode */
12
+ "moduleResolution": "bundler",
13
+ "allowImportingTsExtensions": true,
14
+ "verbatimModuleSyntax": true,
15
+ "moduleDetection": "force",
16
+ "noEmit": true,
17
+ "jsx": "react-jsx",
18
+
19
+ /* Linting */
20
+ "strict": true,
21
+ "noUnusedLocals": true,
22
+ "noUnusedParameters": true,
23
+ "erasableSyntaxOnly": true,
24
+ "noFallthroughCasesInSwitch": true,
25
+ "noUncheckedSideEffectImports": true
26
+ },
27
+ "include": ["src"]
28
+ }
tsconfig.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "references": [{ "path": "./tsconfig.app.json" }, { "path": "./tsconfig.node.json" }]
4
+ }
tsconfig.node.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo",
4
+ "target": "ES2023",
5
+ "lib": ["ES2023"],
6
+ "module": "ESNext",
7
+ "types": ["node"],
8
+ "skipLibCheck": true,
9
+
10
+ /* Bundler mode */
11
+ "moduleResolution": "bundler",
12
+ "allowImportingTsExtensions": true,
13
+ "verbatimModuleSyntax": true,
14
+ "moduleDetection": "force",
15
+ "noEmit": true,
16
+
17
+ /* Linting */
18
+ "strict": true,
19
+ "noUnusedLocals": true,
20
+ "noUnusedParameters": true,
21
+ "erasableSyntaxOnly": true,
22
+ "noFallthroughCasesInSwitch": true,
23
+ "noUncheckedSideEffectImports": true
24
+ },
25
+ "include": ["vite.config.ts"]
26
+ }
vite.config.ts ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
import { defineConfig } from "vite";
import react from "@vitejs/plugin-react";
import tailwindcss from "@tailwindcss/vite";

// https://vite.dev/config/
// Build configuration: the React plugin plus Tailwind CSS's first-party Vite plugin.
export default defineConfig({
  plugins: [react(), tailwindcss()],
});