Spaces:

webml-community
/

Supertonic-TTS-WebGPU

Running

Preprocesses text for TTS by normalizing, cleaning, and correcting punctuation. Based on helpers in https://github.com/supertone-inc/supertonic

f76dade 15 days ago

raw

history blame

15.1 kB

	import { useState, useEffect, useRef } from "react";
	import { Zap, AlignLeft, Quote, Type, FileText, Check, X, Dices } from "lucide-react";
	import { useTTS } from "./components/TTSContext";
	import { TTSProvider } from "./components/TTSProvider";
	import { streamTTS, createAudioBlob } from "./tts";
	import { preprocessText } from "./text-preprocessor";
	import { SAMPLE_RATE, EXAMPLE_SENTENCES } from "./constants";
	import { AudioResult } from "./components/AudioResult";
	import { Controls } from "./components/Controls";

	const AppContent = () => {
	// Text to synthesize; seeded with a short product blurb.
	const [text, setText] = useState(
		"Introducing Supertonic WebGPU: blazingly fast text-to-speech running 100% locally in your browser.",
	);
	// Key of the highlighted example tab; editing the textarea switches it back to "Freeform".
	const [activeTab, setActiveTab] = useState<string | null>("Freeform");
	// True while handleGenerate is streaming chunks from the TTS pipeline.
	const [isGenerating, setIsGenerating] = useState(false);
	// Whether the <AudioResult> panel is rendered (turned on by the first generation).
	const [showResults, setShowResults] = useState(false);
	// Synthesis settings edited through <Controls> and forwarded to streamTTS.
	const [quality, setQuality] = useState(5);
	const [speed, setSpeed] = useState(1.0);
	// Key used to look up the speaker embedding in speakerEmbeddings.current.
	const [voice, setVoice] = useState("Female");

	// Pipeline handles and loading state shared through the TTS context.
	const { pipelineReady, tts, speakerEmbeddings, downloadProgress } = useTTS();

	// Live metrics shown by <AudioResult>. Times and durations are in seconds;
	// rtf is processing time divided by generated audio time.
	const [stats, setStats] = useState({
		firstLatency: null as number | null,
		processingTime: 0,
		charsPerSec: 0,
		rtf: 0,
		totalDuration: 0,
		currentDuration: 0,
	});
	// Generation progress as a percentage (result.index / result.total * 100).
	const [generationProgress, setGenerationProgress] = useState(0);
	const [isPlaying, setIsPlaying] = useState(false);

	// Audio context, created lazily on the first generation.
	const audioContextRef = useRef<AudioContext | null>(null);
	// Audio-context time at which the next streamed chunk should start.
	const nextPlayTimeRef = useRef<number>(0);
	// Every chunk generated so far; used for seeking, replay and download.
	const fullAudioBufferRef = useRef<Float32Array[]>([]);
	// Audio-context time that corresponds to playback position 0.
	const playbackStartTimeRef = useRef<number>(0);
	// Handle of the pending requestAnimationFrame that drives the progress UI.
	const playbackAnimationFrameRef = useRef<number>(0);
	// Source nodes that are scheduled or playing, so they can be stopped together.
	const activeSourceNodesRef = useRef<AudioBufferSourceNode[]>([]);
	// Set by handleSeek so that chunks still being generated are not auto-scheduled.
	const isPlaybackInterruptedRef = useRef(false);
	// Set by handleStop; checked by handleGenerate between chunks.
	const stopGenerationRef = useRef(false);

	// Example texts by tab label: a fixed string, or a pool one entry is drawn from at random.
	const [exampleTexts, setExampleTexts] = useState<Record<string, string | string[]>>(EXAMPLE_SENTENCES);

	// Load the long-form example once on mount and expose it as the "Full story" tab.
	useEffect(() => {
		const controller = new AbortController();

		fetch("/the-great-gatsby.txt", { signal: controller.signal })
			.then((res) => {
				// fetch() only rejects on network errors; without this check the body of a
				// 404/500 response would be stored as the story text.
				if (!res.ok) throw new Error(`HTTP ${res.status} ${res.statusText}`);
				return res.text();
			})
			.then((story) => {
				setExampleTexts((prev) => ({ ...prev, "Full story": story }));
			})
			.catch((e: unknown) => {
				// An abort triggered by unmounting is expected, not a failure.
				if (controller.signal.aborted) return;
				console.error("Failed to load story", e);
			});

		// Cancel the request if the component unmounts before it completes.
		return () => controller.abort();
	}, []);

	// On unmount: close the audio context (if one was ever created) and cancel
	// the pending progress animation frame.
	useEffect(
		() => () => {
			audioContextRef.current?.close();
			cancelAnimationFrame(playbackAnimationFrameRef.current);
		},
		[],
	);

	// Drives the playback progress UI with a requestAnimationFrame loop while playing.
	//
	// The effect re-runs whenever isPlaying, isGenerating or stats.totalDuration
	// changes (i.e. on every generated chunk). Each run must cancel the loop started
	// by the previous run; otherwise stale loops that captured `isPlaying = true`
	// keep rescheduling themselves and calling setStats forever.
	useEffect(() => {
		if (!isPlaying) {
			cancelAnimationFrame(playbackAnimationFrameRef.current);
			return;
		}

		let frameId = 0;

		const scheduleNextFrame = () => {
			frameId = requestAnimationFrame(updatePlaybackUI);
			// Mirror the handle into the ref so the unmount cleanup can cancel it too.
			playbackAnimationFrameRef.current = frameId;
		};

		const updatePlaybackUI = () => {
			const ctx = audioContextRef.current;
			if (!ctx) return;

			const elapsed = ctx.currentTime - playbackStartTimeRef.current;

			// Reached the end of all audio and no more chunks are on the way.
			if (elapsed >= stats.totalDuration && !isGenerating && stats.totalDuration > 0) {
				setIsPlaying(false);
				// Snap the position to the end.
				setStats((prev) => ({
					...prev,
					currentDuration: prev.totalDuration,
				}));
				return;
			}

			// Clamp so the position never runs past the audio generated so far.
			setStats((prev) => ({
				...prev,
				currentDuration: Math.min(elapsed, prev.totalDuration),
			}));

			scheduleNextFrame();
		};

		scheduleNextFrame();

		// Stop this run's loop before the next run (or unmount) starts a new one.
		return () => cancelAnimationFrame(frameId);
	}, [isPlaying, isGenerating, stats.totalDuration]);

	/**
	 * Activates an example tab and loads its text into the editor.
	 *
	 * @param type Key into `exampleTexts`. A string entry is used as-is; an array
	 *   entry has one element picked at random.
	 */
	const handleExampleClick = (type: string) => {
		setActiveTab(type);

		const selection = exampleTexts[type];
		const nextText = Array.isArray(selection)
			? selection[Math.floor(Math.random() * selection.length)]
			: selection;

		// An unknown key or an empty pool yields undefined; keep the current text
		// rather than putting a non-string into state (which would break text.length).
		if (typeof nextText !== "string") return;

		setText(nextText);
	};

	/** Stops every tracked source node and then clears the tracking list. */
	const stopAllAudio = () => {
		for (const sourceNode of activeSourceNodesRef.current) {
			try {
				sourceNode.stop();
			} catch (_ignored) {
				// Best effort: a node that cannot be stopped is simply dropped.
			}
		}
		activeSourceNodesRef.current = [];
	};

	/**
	 * Requests cancellation of the running generation. Only raises a flag;
	 * handleGenerate checks it at the top of each streamed chunk and breaks out.
	 */
	const handleStop = (): void => {
		stopGenerationRef.current = true;
	};

	/**
	 * Synthesizes the current text chunk by chunk, scheduling each chunk for
	 * playback as it arrives and updating the live statistics.
	 *
	 * All setup that can fail (pipeline check, audio context creation/resume) runs
	 * inside the try block so that `isGenerating` is always reset. If no audio was
	 * produced at all (error, or stopped before the first chunk) the playing state
	 * is cleared as well.
	 */
	const handleGenerate = async () => {
		if (isGenerating) return;

		stopAllAudio();

		setShowResults(true);
		setIsGenerating(true);
		setGenerationProgress(0);
		stopGenerationRef.current = false;
		setStats({
			firstLatency: null,
			processingTime: 0,
			charsPerSec: 0,
			rtf: 0,
			totalDuration: 0,
			currentDuration: 0,
		});
		fullAudioBufferRef.current = [];
		isPlaybackInterruptedRef.current = false;

		let processedChars = 0;
		let generatedAudioSeconds = 0;

		try {
			if (!tts.current || !speakerEmbeddings.current) throw new Error("TTS pipeline not ready");
			const selectedEmbedding = speakerEmbeddings.current[voice];

			if (!audioContextRef.current) {
				// Fall back to the prefixed constructor where the standard one is missing.
				audioContextRef.current = new (window.AudioContext || (window as any).webkitAudioContext)();
			}
			const ctx = audioContextRef.current;
			if (ctx.state === "suspended") {
				await ctx.resume();
			}

			// Small lead time before the first chunk is due to start.
			nextPlayTimeRef.current = ctx.currentTime + 0.1;
			playbackStartTimeRef.current = nextPlayTimeRef.current;
			setIsPlaying(true);

			const startTime = performance.now();

			const preprocessedText = preprocessText(text);
			for await (const result of streamTTS(preprocessedText, tts.current, selectedEmbedding, quality, speed)) {
				if (stopGenerationRef.current) {
					break;
				}

				const elapsedSec = (performance.now() - startTime) / 1000;

				const samples = result.audio.audio;
				generatedAudioSeconds += samples.length / result.audio.sampling_rate;
				fullAudioBufferRef.current.push(samples);

				// Only schedule streaming playback if the user hasn't seeked in the meantime.
				if (!isPlaybackInterruptedRef.current) {
					const buffer = ctx.createBuffer(1, samples.length, result.audio.sampling_rate);
					buffer.copyToChannel(samples as any, 0);

					const source = ctx.createBufferSource();
					source.buffer = buffer;
					source.connect(ctx.destination);

					// If this chunk arrived after its scheduled slot, a start time in the
					// past would play immediately and the following chunk would overlap it.
					// Start at "now" instead and move the progress clock by the same gap so
					// the UI position stays aligned with what is audible.
					const startAt = Math.max(nextPlayTimeRef.current, ctx.currentTime);
					playbackStartTimeRef.current += startAt - nextPlayTimeRef.current;
					source.start(startAt);

					activeSourceNodesRef.current.push(source);
					source.onended = () => {
						const idx = activeSourceNodesRef.current.indexOf(source);
						if (idx > -1) activeSourceNodesRef.current.splice(idx, 1);
					};

					nextPlayTimeRef.current = startAt + buffer.duration;
				}

				processedChars += result.text.length;
				// Guard the ratios so an empty chunk or a zero elapsed time cannot
				// produce Infinity/NaN in the displayed stats.
				const currentRtf = generatedAudioSeconds > 0 ? elapsedSec / generatedAudioSeconds : 0;
				const currentCharsPerSec = elapsedSec > 0 ? processedChars / elapsedSec : 0;
				const totalSoFar = generatedAudioSeconds;

				setStats((prev) => ({
					...prev,
					// Latency is recorded once, when the first chunk arrives.
					firstLatency: prev.firstLatency === null ? elapsedSec : prev.firstLatency,
					processingTime: elapsedSec,
					charsPerSec: currentCharsPerSec,
					rtf: currentRtf,
					totalDuration: totalSoFar,
				}));

				setGenerationProgress((result.index / result.total) * 100);
			}
		} catch (e) {
			console.error("Generation failed", e);
		} finally {
			setIsGenerating(false);
			isPlaybackInterruptedRef.current = false; // Reset after completion
			// Nothing to play: don't leave the UI (and its animation loop) in "playing".
			if (generatedAudioSeconds === 0) setIsPlaying(false);
		}
	};

	/**
	 * Restarts playback of the already generated audio from a relative position.
	 *
	 * @param percentage Position as a fraction of `stats.totalDuration`
	 *   (0 = start, 1 = end). Out-of-range values are clamped to [0, 1] and
	 *   non-finite values are treated as 0.
	 */
	const handleSeek = (percentage: number) => {
		const ctx = audioContextRef.current;
		const chunks = fullAudioBufferRef.current;
		if (!ctx || chunks.length === 0) return;

		// An unclamped or NaN fraction would yield a negative/NaN seek time and
		// corrupt the playback clock used by the progress UI.
		const fraction = Number.isFinite(percentage) ? Math.min(1, Math.max(0, percentage)) : 0;

		// Tell a running generation not to auto-schedule further chunks.
		isPlaybackInterruptedRef.current = true;
		stopAllAudio();

		const seekTime = stats.totalDuration * fraction;

		let currentTimeInAudio = 0;
		let nextPlayTime = ctx.currentTime;

		// Reset the start time such that (currentTime - startTime) = seekTime.
		playbackStartTimeRef.current = ctx.currentTime - seekTime;

		for (const chunk of chunks) {
			const chunkDuration = chunk.length / SAMPLE_RATE;
			const chunkEndTime = currentTimeInAudio + chunkDuration;

			// Chunks that end before the seek position are skipped entirely.
			if (chunkEndTime > seekTime) {
				// Only the first scheduled chunk can have a non-zero offset.
				const offsetInChunk = Math.max(0, seekTime - currentTimeInAudio);
				const durationToPlay = chunkDuration - offsetInChunk;

				const buffer = ctx.createBuffer(1, chunk.length, SAMPLE_RATE);
				buffer.copyToChannel(chunk as any, 0);

				const source = ctx.createBufferSource();
				source.buffer = buffer;
				source.connect(ctx.destination);

				source.start(nextPlayTime, offsetInChunk);

				activeSourceNodesRef.current.push(source);
				source.onended = () => {
					const idx = activeSourceNodesRef.current.indexOf(source);
					if (idx > -1) activeSourceNodesRef.current.splice(idx, 1);
				};

				nextPlayTime += durationToPlay;
			}

			currentTimeInAudio += chunkDuration;
		}

		if (ctx.state === "suspended") {
			void ctx.resume().catch((e: unknown) => console.error("Failed to resume audio", e));
		}
		setIsPlaying(true);
	};

	/**
	 * Offers everything generated so far as a download named "audio.wav".
	 * Does nothing when no audio has been generated.
	 */
	const handleDownload = () => {
		if (fullAudioBufferRef.current.length === 0) return;

		const blob = createAudioBlob(fullAudioBufferRef.current, SAMPLE_RATE);
		const url = URL.createObjectURL(blob);

		const a = document.createElement("a");
		a.href = url;
		a.download = "audio.wav";
		a.click();

		// Revoke on a later task: revoking synchronously right after click() can
		// invalidate the URL before the browser has started reading it.
		setTimeout(() => URL.revokeObjectURL(url), 0);
	};

	/**
	 * Toggles between paused and playing.
	 *
	 * Pausing suspends the audio context. Resuming resumes it and, when no
	 * generation is running, re-schedules audio through handleSeek: from the start
	 * if playback had reached the end, or from the current position if no source
	 * nodes are active any more.
	 */
	const togglePlay = async () => {
		const ctx = audioContextRef.current;
		if (!ctx) return;

		if (isPlaying) {
			setIsPlaying(false);
			void ctx.suspend().catch((e: unknown) => console.error("Failed to suspend audio", e));
			return;
		}

		// No audio exists and none is being generated: switching to "playing" here
		// would leave the UI stuck in that state, since nothing ever ends playback.
		if (!isGenerating && fullAudioBufferRef.current.length === 0) return;

		setIsPlaying(true);
		void ctx.resume().catch((e: unknown) => console.error("Failed to resume audio", e));

		// While generating, resuming the context is enough; chunks are already scheduled.
		if (isGenerating) return;

		if (stats.currentDuration >= stats.totalDuration) {
			// Finished playing and play was hit again: replay from the start.
			handleSeek(0);
		} else if (activeSourceNodesRef.current.length === 0) {
			// Stopped before the end with nothing scheduled: continue from the current position.
			const currentProgress = stats.totalDuration > 0 ? stats.currentDuration / stats.totalDuration : 0;
			handleSeek(currentProgress);
		}
	};

	// Generation is enabled only when the text has at least 10 characters and the
	// pipeline reports ready. The same 10-character threshold drives the
	// check/cross icon next to the character counter below.
	const canGenerate = text.length >= 10 && pipelineReady;

	// Layout: page title, then a card with a "Text | Speech" header (hidden below
	// the md breakpoint), the text editor with example tabs next to <Controls>,
	// and, once a generation has been started, the <AudioResult> panel.
	return (
	<div className="min-h-screen bg-[#F2F2F2] font-sans text-gray-900 selection:bg-yellow-200 flex items-center justify-center py-10">
	<div className="w-full max-w-7xl px-4 md:px-6">
	<div className="text-center mb-10">
	<h3 className="text-4xl md:text-6xl font-medium text-gray-900 tracking-tight">Supertonic WebGPU</h3>
	<h4 className="text-gray-600 mt-3 text-2xl md:text-3xl font-light">
	Generate speech directly in your browser
	</h4>
	</div>

	<div className="bg-white rounded-2xl shadow-2xl overflow-hidden border border-gray-100 max-w-7xl mx-auto p-2">
	<div className="hidden md:grid grid-cols-1 md:grid-cols-2 border-b border-gray-100 bg-white relative rounded-t-xl">
	<div className="px-8 py-6 flex items-center justify-center">
	<div className="text-3xl font-normal text-gray-800">Text</div>
	</div>

	<div className="px-8 py-6 flex flex-col items-center justify-center relative bg-gray-50/30 md:bg-white">
	<div className="text-3xl font-normal text-gray-800 mb-2">Speech</div>
	</div>

	<div className="absolute left-1/2 top-1/2 -translate-x-1/2 -translate-y-1/2 bg-white p-3 rounded-full z-10 shadow-sm border border-gray-50">
	<Zap className="text-yellow-400 fill-yellow-400 drop-shadow-sm" size={32} />
	</div>
	</div>

	<div className="flex flex-col md:flex-row min-h-[450px]">
	<div className="w-full md:w-1/2 p-8 border-r border-gray-100 flex flex-col bg-white relative">
	<textarea
	className="w-full flex-grow text-xl md:text-2xl text-gray-800 placeholder-gray-300 outline-none resize-none font-light leading-relaxed bg-transparent"
	placeholder="This text-to-speech system runs entirely in your browser, providing fast and private operation without sending any data to external servers."
	value={text}
	/* Any manual edit switches the highlighted tab back to "Freeform". */
	onChange={(e) => {
	setText(e.target.value);
	setActiveTab("Freeform");
	}}
	spellCheck={false}
	/>

	<div className="mt-auto w-full">
	<div className="flex justify-end mb-2">
	<div className="flex items-center gap-2 text-xs md:text-sm font-mono text-gray-400">
	{text.length > 0 ? text.length : 0} chars
	{text.length >= 10 ? (
	// Long enough to generate.
	<Check size={14} className="text-green-500" />
	) : (
	<X size={14} className="text-red-500" />
	)}
	</div>
	</div>

	<div className="pt-6 flex flex-wrap items-center border-t border-gray-100 text-gray-500">
	<div className="flex gap-3 md:gap-5 text-sm md:text-base overflow-x-auto pb-2 md:pb-0 w-full">
	{Object.keys(exampleTexts).map((key) => (
	// One tab per example key; the icon is chosen by matching the key's label.
	<button
	key={key}
	onClick={() => handleExampleClick(key)}
	className={`flex items-center gap-1.5 transition whitespace-nowrap ${activeTab === key ? "text-blue-600 font-semibold border-b-2 border-blue-500 pb-0.5" : "hover:text-gray-900"}`}
	>
	{key === "Quote" && <Quote size={16} />}
	{key === "Paragraph" && <AlignLeft size={16} />}
	{key === "Full story" && <FileText size={16} />}
	{key === "Random" && <Dices size={16} />}
	{key === "Freeform" && <Type size={16} />}
	{key}
	</button>
	))}
	</div>
	</div>
	</div>
	</div>

	<Controls
	quality={quality}
	setQuality={setQuality}
	speed={speed}
	setSpeed={setSpeed}
	voice={voice}
	setVoice={setVoice}
	onGenerate={handleGenerate}
	onStop={handleStop}
	isGenerating={isGenerating}
	canGenerate={canGenerate}
	pipelineReady={pipelineReady}
	/* progress is the generation percentage; loadingProgress comes from the TTS context. */
	progress={generationProgress}
	loadingProgress={downloadProgress}
	/>
	</div>

	{showResults && (
	// Rendered only after a generation has been started.
	<div className="px-4 pb-4">
	<AudioResult
	stats={stats}
	progressPercentage={generationProgress}
	isGenerating={isGenerating}
	isPlaying={isPlaying}
	onTogglePlay={togglePlay}
	onDownload={handleDownload}
	onSeek={handleSeek}
	/>
	</div>
	)}
	</div>
	</div>
	</div>
	);
	};

	/** Application root: renders AppContent inside TTSProvider so that useTTS() has a provider above it. */
	const App = () => (
		<TTSProvider>
			<AppContent />
		</TTSProvider>
	);

	export default App;