miraclemind's picture
Preprocesses text for TTS by normalizing, cleaning, and correcting punctuation. Based on helpers in https://github.com/supertone-inc/supertonic
f76dade
raw
history blame
15.1 kB
import { useState, useEffect, useRef } from "react";
import { Zap, AlignLeft, Quote, Type, FileText, Check, X, Dices } from "lucide-react";
import { useTTS } from "./components/TTSContext";
import { TTSProvider } from "./components/TTSProvider";
import { streamTTS, createAudioBlob } from "./tts";
import { preprocessText } from "./text-preprocessor";
import { SAMPLE_RATE, EXAMPLE_SENTENCES } from "./constants";
import { AudioResult } from "./components/AudioResult";
import { Controls } from "./components/Controls";
const AppContent = () => {
const [text, setText] = useState(
"Introducing Supertonic WebGPU: blazingly fast text-to-speech running 100% locally in your browser.",
);
const [activeTab, setActiveTab] = useState<string | null>("Freeform");
const [isGenerating, setIsGenerating] = useState(false);
const [showResults, setShowResults] = useState(false);
const [quality, setQuality] = useState(5);
const [speed, setSpeed] = useState(1.0);
const [voice, setVoice] = useState("Female");
const { pipelineReady, tts, speakerEmbeddings, downloadProgress } = useTTS();
const [stats, setStats] = useState({
firstLatency: null as number | null,
processingTime: 0,
charsPerSec: 0,
rtf: 0,
totalDuration: 0,
currentDuration: 0,
});
const [generationProgress, setGenerationProgress] = useState(0);
const [isPlaying, setIsPlaying] = useState(false);
const audioContextRef = useRef<AudioContext | null>(null);
const nextPlayTimeRef = useRef<number>(0);
const fullAudioBufferRef = useRef<Float32Array[]>([]);
const playbackStartTimeRef = useRef<number>(0);
const playbackAnimationFrameRef = useRef<number>(0);
const activeSourceNodesRef = useRef<AudioBufferSourceNode[]>([]);
const isPlaybackInterruptedRef = useRef(false);
const stopGenerationRef = useRef(false);
const [exampleTexts, setExampleTexts] = useState<Record<string, string | string[]>>(EXAMPLE_SENTENCES);
useEffect(() => {
fetch("/the-great-gatsby.txt")
.then((res) => res.text())
.then((text) => {
setExampleTexts((prev) => ({ ...prev, "Full story": text }));
})
.catch((e) => console.error("Failed to load story", e));
}, []);
useEffect(() => {
return () => {
if (audioContextRef.current) {
audioContextRef.current.close();
}
cancelAnimationFrame(playbackAnimationFrameRef.current);
};
}, []);
useEffect(() => {
const updatePlaybackUI = () => {
if (isPlaying && audioContextRef.current) {
const ctx = audioContextRef.current;
const elapsed = ctx.currentTime - playbackStartTimeRef.current;
// If reached end of current known duration
if (elapsed >= stats.totalDuration && !isGenerating && stats.totalDuration > 0) {
setIsPlaying(false);
setStats((prev) => ({
...prev,
currentDuration: prev.totalDuration,
})); // Snap to end
return;
}
setStats((prev) => ({
...prev,
currentDuration: Math.min(elapsed, prev.totalDuration),
}));
playbackAnimationFrameRef.current = requestAnimationFrame(updatePlaybackUI);
}
};
if (isPlaying) {
playbackAnimationFrameRef.current = requestAnimationFrame(updatePlaybackUI);
} else {
cancelAnimationFrame(playbackAnimationFrameRef.current);
}
}, [isPlaying, isGenerating, stats.totalDuration]);
const handleExampleClick = (type: string) => {
setActiveTab(type);
let selection = exampleTexts[type];
if (Array.isArray(selection)) {
setText(selection[Math.floor(Math.random() * selection.length)]);
return;
}
setText(selection);
};
const stopAllAudio = () => {
activeSourceNodesRef.current.forEach((node) => {
try {
node.stop();
} catch (e) {}
});
activeSourceNodesRef.current = [];
};
const handleStop = () => {
stopGenerationRef.current = true;
};
const handleGenerate = async () => {
if (isGenerating) return;
stopAllAudio();
setShowResults(true);
setIsGenerating(true);
setGenerationProgress(0);
stopGenerationRef.current = false;
setStats({
firstLatency: null,
processingTime: 0,
charsPerSec: 0,
rtf: 0,
totalDuration: 0,
currentDuration: 0,
});
fullAudioBufferRef.current = [];
isPlaybackInterruptedRef.current = false;
if (!audioContextRef.current) {
audioContextRef.current = new (window.AudioContext || (window as any).webkitAudioContext)();
}
const ctx = audioContextRef.current;
if (ctx.state === "suspended") {
await ctx.resume();
}
nextPlayTimeRef.current = ctx.currentTime + 0.1;
playbackStartTimeRef.current = nextPlayTimeRef.current;
setIsPlaying(true);
const startTime = performance.now();
let processedChars = 0;
let generatedAudioSeconds = 0;
try {
if (!tts.current || !speakerEmbeddings.current) throw new Error("TTS pipeline not ready");
const selectedEmbedding = speakerEmbeddings.current[voice];
const preprocessedText = preprocessText(text);
for await (const result of streamTTS(preprocessedText, tts.current, selectedEmbedding, quality, speed)) {
if (stopGenerationRef.current) {
break;
}
const now = performance.now();
const elapsedSec = (now - startTime) / 1000;
setStats((prev) => ({
...prev,
firstLatency: prev.firstLatency === null ? elapsedSec : prev.firstLatency,
processingTime: elapsedSec,
}));
const chunkDuration = result.audio.audio.length / result.audio.sampling_rate;
generatedAudioSeconds += chunkDuration;
fullAudioBufferRef.current.push(result.audio.audio);
// Only schedule streaming playback if user hasn't interrupted
if (!isPlaybackInterruptedRef.current) {
const buffer = ctx.createBuffer(1, result.audio.audio.length, result.audio.sampling_rate);
buffer.copyToChannel(result.audio.audio as any, 0);
const source = ctx.createBufferSource();
source.buffer = buffer;
source.connect(ctx.destination);
source.start(nextPlayTimeRef.current);
activeSourceNodesRef.current.push(source);
source.onended = () => {
const idx = activeSourceNodesRef.current.indexOf(source);
if (idx > -1) activeSourceNodesRef.current.splice(idx, 1);
};
nextPlayTimeRef.current += buffer.duration;
}
processedChars += result.text.length;
const currentRtf = elapsedSec / generatedAudioSeconds;
const currentCharsPerSec = processedChars / elapsedSec;
setStats((prev) => ({
...prev,
charsPerSec: currentCharsPerSec,
rtf: currentRtf,
totalDuration: generatedAudioSeconds,
}));
setGenerationProgress((result.index / result.total) * 100);
}
} catch (e) {
console.error("Generation failed", e);
} finally {
setIsGenerating(false);
isPlaybackInterruptedRef.current = false; // Reset after completion
}
};
const handleSeek = (percentage: number) => {
if (!audioContextRef.current || fullAudioBufferRef.current.length === 0) return;
const ctx = audioContextRef.current;
isPlaybackInterruptedRef.current = true;
stopAllAudio();
const seekTime = stats.totalDuration * percentage;
let currentTimeInAudio = 0;
let nextPlayTime = ctx.currentTime;
// Reset startTime such that (currentTime - startTime) = seekTime
playbackStartTimeRef.current = ctx.currentTime - seekTime;
for (const chunk of fullAudioBufferRef.current) {
const chunkDuration = chunk.length / SAMPLE_RATE;
const chunkEndTime = currentTimeInAudio + chunkDuration;
if (chunkEndTime > seekTime) {
// This chunk needs to be played
const offsetInChunk = Math.max(0, seekTime - currentTimeInAudio);
const durationToPlay = chunkDuration - offsetInChunk;
const buffer = ctx.createBuffer(1, chunk.length, SAMPLE_RATE);
buffer.copyToChannel(chunk as any, 0);
const source = ctx.createBufferSource();
source.buffer = buffer;
source.connect(ctx.destination);
source.start(nextPlayTime, offsetInChunk);
activeSourceNodesRef.current.push(source);
source.onended = () => {
const idx = activeSourceNodesRef.current.indexOf(source);
if (idx > -1) activeSourceNodesRef.current.splice(idx, 1);
};
nextPlayTime += durationToPlay;
}
currentTimeInAudio += chunkDuration;
}
if (ctx.state === "suspended") ctx.resume();
setIsPlaying(true);
};
const handleDownload = () => {
if (fullAudioBufferRef.current.length === 0) return;
const blob = createAudioBlob(fullAudioBufferRef.current, SAMPLE_RATE);
const url = URL.createObjectURL(blob);
const a = document.createElement("a");
a.href = url;
a.download = "audio.wav";
a.click();
URL.revokeObjectURL(url);
};
const togglePlay = async () => {
if (!audioContextRef.current) return;
if (isPlaying) {
setIsPlaying(false);
audioContextRef.current.suspend();
} else {
setIsPlaying(true);
audioContextRef.current.resume();
// If we finished playing and hit play again, replay from start
if (!isGenerating && stats.currentDuration >= stats.totalDuration) {
handleSeek(0);
} else if (!isGenerating && fullAudioBufferRef.current.length > 0 && activeSourceNodesRef.current.length === 0) {
// This handles the case where we paused/stopped but haven't technically reached "end" OR we are resuming replay
const currentProgress = stats.totalDuration > 0 ? stats.currentDuration / stats.totalDuration : 0;
handleSeek(currentProgress);
}
}
};
const canGenerate = text.length >= 10 && pipelineReady;
return (
<div className="min-h-screen bg-[#F2F2F2] font-sans text-gray-900 selection:bg-yellow-200 flex items-center justify-center py-10">
<div className="w-full max-w-7xl px-4 md:px-6">
<div className="text-center mb-10">
<h3 className="text-4xl md:text-6xl font-medium text-gray-900 tracking-tight">Supertonic WebGPU</h3>
<h4 className="text-gray-600 mt-3 text-2xl md:text-3xl font-light">
Generate speech directly in your browser
</h4>
</div>
<div className="bg-white rounded-2xl shadow-2xl overflow-hidden border border-gray-100 max-w-7xl mx-auto p-2">
<div className="hidden md:grid grid-cols-1 md:grid-cols-2 border-b border-gray-100 bg-white relative rounded-t-xl">
<div className="px-8 py-6 flex items-center justify-center">
<div className="text-3xl font-normal text-gray-800">Text</div>
</div>
<div className="px-8 py-6 flex flex-col items-center justify-center relative bg-gray-50/30 md:bg-white">
<div className="text-3xl font-normal text-gray-800 mb-2">Speech</div>
</div>
<div className="absolute left-1/2 top-1/2 -translate-x-1/2 -translate-y-1/2 bg-white p-3 rounded-full z-10 shadow-sm border border-gray-50">
<Zap className="text-yellow-400 fill-yellow-400 drop-shadow-sm" size={32} />
</div>
</div>
<div className="flex flex-col md:flex-row min-h-[450px]">
<div className="w-full md:w-1/2 p-8 border-r border-gray-100 flex flex-col bg-white relative">
<textarea
className="w-full flex-grow text-xl md:text-2xl text-gray-800 placeholder-gray-300 outline-none resize-none font-light leading-relaxed bg-transparent"
placeholder="This text-to-speech system runs entirely in your browser, providing fast and private operation without sending any data to external servers."
value={text}
onChange={(e) => {
setText(e.target.value);
setActiveTab("Freeform");
}}
spellCheck={false}
/>
<div className="mt-auto w-full">
<div className="flex justify-end mb-2">
<div className="flex items-center gap-2 text-xs md:text-sm font-mono text-gray-400">
{text.length > 0 ? text.length : 0} chars
{text.length >= 10 ? (
<Check size={14} className="text-green-500" />
) : (
<X size={14} className="text-red-500" />
)}
</div>
</div>
<div className="pt-6 flex flex-wrap items-center border-t border-gray-100 text-gray-500">
<div className="flex gap-3 md:gap-5 text-sm md:text-base overflow-x-auto pb-2 md:pb-0 w-full">
{Object.keys(exampleTexts).map((key) => (
<button
key={key}
onClick={() => handleExampleClick(key)}
className={`flex items-center gap-1.5 transition whitespace-nowrap ${activeTab === key ? "text-blue-600 font-semibold border-b-2 border-blue-500 pb-0.5" : "hover:text-gray-900"}`}
>
{key === "Quote" && <Quote size={16} />}
{key === "Paragraph" && <AlignLeft size={16} />}
{key === "Full story" && <FileText size={16} />}
{key === "Random" && <Dices size={16} />}
{key === "Freeform" && <Type size={16} />}
{key}
</button>
))}
</div>
</div>
</div>
</div>
<Controls
quality={quality}
setQuality={setQuality}
speed={speed}
setSpeed={setSpeed}
voice={voice}
setVoice={setVoice}
onGenerate={handleGenerate}
onStop={handleStop}
isGenerating={isGenerating}
canGenerate={canGenerate}
pipelineReady={pipelineReady}
progress={generationProgress}
loadingProgress={downloadProgress}
/>
</div>
{showResults && (
<div className="px-4 pb-4">
<AudioResult
stats={stats}
progressPercentage={generationProgress}
isGenerating={isGenerating}
isPlaying={isPlaying}
onTogglePlay={togglePlay}
onDownload={handleDownload}
onSeek={handleSeek}
/>
</div>
)}
</div>
</div>
</div>
);
};
const App = () => {
return (
<TTSProvider>
<AppContent />
</TTSProvider>
);
};
export default App;