import { useEffect, useMemo, useRef, useState } from "react"; import { motion } from "motion/react"; const LANGUAGES = [ ["Hindi", "नमस्ते, आप कैसे हैं?"], ["Bengali", "নমস্কার, আপনি কেমন আছেন?"], ["Marathi", "नमस्कार, तुम्ही कसे आहात?"], ["Telugu", "నమస్కారం, మీరు ఎలా ఉన్నారు?"], ["Kannada", "ನಮಸ್ಕಾರ, ನೀವು ಹೇಗಿದ್ದೀರಿ?"], ["Tamil", "வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்?"], ["Malayalam", "നമസ്കാരം, സുഖമാണോ?"], ["Gujarati", "નમસ્તે, તમે કેમ છો?"], ["Punjabi", "ਸਤ ਸ੍ਰੀ ਅਕਾਲ, ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ?"], ["Assamese", "নমস্কাৰ, আপুনি কেনে আছে?"], ["Bhojpuri", "नमस्कार, राउर का हाल बा?"], ["Magahi", "नमस्कार, तू कैसन हे?"], ["Maithili", "नमस्कार, अहाँ कोना छी?"], ["Chhattisgarhi", "नमस्कार, आप कइसन हन?"], ["Bodo", "नमस्कार, नोँ बेसेबा डंनो?"], ["Dogri", "नमस्ते, तुसें कि’यां ओ?"], ["Nepali", "नमस्ते, तपाईं कस्तो हुनुहुन्छ?"], ["Sanskrit", "नमस्कारः, भवान् कथमस्ति?"], ["English (Indian)", "Hello, how are you?"], ]; const LANGUAGE_DETAILS = { Hindi: { script: "Devanagari", region: "North India" }, Bengali: { script: "Bengali", region: "Eastern India" }, Marathi: { script: "Devanagari", region: "Maharashtra" }, Telugu: { script: "Telugu", region: "Andhra Pradesh + Telangana" }, Kannada: { script: "Kannada", region: "Karnataka" }, Tamil: { script: "Tamil", region: "Tamil Nadu" }, Malayalam: { script: "Malayalam", region: "Kerala" }, Gujarati: { script: "Gujarati", region: "Gujarat" }, Punjabi: { script: "Gurmukhi", region: "Punjab" }, Assamese: { script: "Assamese", region: "Assam" }, Bhojpuri: { script: "Devanagari", region: "Bihar + Eastern UP" }, Magahi: { script: "Devanagari", region: "Bihar" }, Maithili: { script: "Devanagari", region: "Mithila" }, Chhattisgarhi: { script: "Devanagari", region: "Chhattisgarh" }, Bodo: { script: "Devanagari", region: "Northeast India" }, Dogri: { script: "Devanagari", region: "Jammu" }, Nepali: { script: "Devanagari", region: "Nepal + India" }, Sanskrit: { script: "Devanagari", region: "Classical Indic" }, "English (Indian)": { script: "Latin", region: "Indian English" }, }; const VOICES = LANGUAGES.flatMap(([lang]) => [ `${lang} (Female)`, `${lang} (Male)`, ]); const DTYPES = [ { value: "q4f16", label: "q4f16", note: "~1.95 GB · fastest cold start" }, { value: "q8", label: "q8", note: "~4.32 GB · cleaner, slower preload" }, ]; const STACK_FACTS = [ { label: "Model", value: "Svāra-TTS v1" }, { label: "Codec", value: "SNAC 24 kHz" }, { label: "Runtime", value: "WebGPU + Transformers.js" }, ]; function withEmotionTag(text, tag) { return `${text.replace(/\s*<[^>]+>\s*$/u, "").trim()} ${tag}`; } export default function App() { const worker = useRef(null); const runtimeReadyRef = useRef(false); const loadedDtypesRef = useRef([]); const [selectedVoice, setSelectedVoice] = useState("Hindi (Female)"); const [inputText, setInputText] = useState(LANGUAGES[0][1]); const [dtype, setDtype] = useState("q4f16"); const [status, setStatus] = useState(null); const [error, setError] = useState(null); const [runtimeReady, setRuntimeReady] = useState(false); const [loadingDtype, setLoadingDtype] = useState(null); const [loadedDtypes, setLoadedDtypes] = useState([]); const [loadingMessage, setLoadingMessage] = useState( "Detecting WebGPU support...", ); const [results, setResults] = useState([]); const selectedLanguage = selectedVoice.split(" (")[0]; const selectedGender = selectedVoice.includes("(Male)") ? "Male" : "Female"; const languageDetail = LANGUAGE_DETAILS[selectedLanguage] ?? { script: "Indic", region: "South Asia", }; const currentSample = LANGUAGES.find(([lang]) => lang === selectedLanguage)?.[1] ?? inputText; const currentDtype = DTYPES.find((entry) => entry.value === dtype) ?? DTYPES[0]; const isCurrentDtypeLoaded = loadedDtypes.includes(dtype); const isLoadingCurrentDtype = status === "loading" && loadingDtype === currentDtype.value; const promptChips = useMemo( () => [ { label: "Sample line", value: currentSample }, { label: "Sample + ", value: withEmotionTag(currentSample, "") }, { label: "Sample + ", value: withEmotionTag(currentSample, ""), }, ], [currentSample], ); useEffect(() => { runtimeReadyRef.current = runtimeReady; }, [runtimeReady]); useEffect(() => { loadedDtypesRef.current = loadedDtypes; }, [loadedDtypes]); useEffect(() => { worker.current ??= new Worker(new URL("./worker.js", import.meta.url), { type: "module", }); const onMessageReceived = (e) => { switch (e.data.status) { case "feature-success": runtimeReadyRef.current = true; setRuntimeReady(true); setError(null); setStatus("idle"); setLoadingMessage( "WebGPU is available. Load a model when you want to start local synthesis.", ); break; case "feature-error": setError(e.data.data); break; case "loading": setError(null); if (loadedDtypesRef.current.includes(e.data.dtype)) { setLoadingDtype(null); setStatus("running"); } else { setLoadingDtype(e.data.dtype); setLoadingMessage( e.data.dtype === "q8" ? "Loading q8 weights (~4.32 GB, sharded). First run can take a minute..." : "Loading q4f16 weights (~1.95 GB). First run downloads once, then stays cached...", ); setStatus("loading"); } break; case "ready": setLoadingDtype(null); setError(null); setLoadedDtypes((prev) => { if (prev.includes(e.data.dtype)) return prev; const next = [...prev, e.data.dtype]; loadedDtypesRef.current = next; return next; }); setStatus("ready"); break; case "complete": setResults((prev) => [ { text: e.data.text, src: e.data.audio, voice: e.data.voice, dtype: e.data.dtype, createdAt: new Date().toLocaleTimeString([], { hour: "numeric", minute: "2-digit", }), }, ...prev, ]); setError(null); setStatus("ready"); break; case "error": setLoadingDtype(null); setError(e.data.data); setStatus( loadedDtypesRef.current.includes(e.data.dtype) ? "ready" : runtimeReadyRef.current ? "idle" : null, ); break; } }; worker.current.addEventListener("message", onMessageReceived); worker.current.addEventListener("error", (event) => console.error(event)); return () => { worker.current.removeEventListener("message", onMessageReceived); }; }, []); const handleSubmit = (event) => { event.preventDefault(); if (!isCurrentDtypeLoaded) return; setStatus("running"); setError(null); worker.current.postMessage({ type: "generate", text: inputText.trim(), speaker_id: selectedVoice, dtype, }); }; const handleLoadModel = () => { if (!runtimeReady || isCurrentDtypeLoaded) return; setError(null); setLoadingDtype(dtype); setLoadingMessage( dtype === "q8" ? "Loading q8 weights (~4.32 GB, sharded). First run can take a minute..." : "Loading q4f16 weights (~1.95 GB). First run downloads once, then stays cached...", ); setStatus("loading"); worker.current?.postMessage({ type: "preload", dtype }); }; const onLanguageChange = (lang) => { const sample = LANGUAGES.find(([entry]) => entry === lang)?.[1] ?? inputText; setInputText(sample); setSelectedVoice(`${lang} (Female)`); }; const onDtypeChange = (next) => { if (next === dtype) return; setDtype(next); setError(null); setLoadingDtype(null); setStatus(loadedDtypesRef.current.includes(next) ? "ready" : "idle"); }; let statusHeadline = "Checking browser runtime"; let statusBody = loadingMessage; if (error) { statusHeadline = runtimeReady ? "Load issue" : "Startup issue"; statusBody = error; } else if (status === "running") { statusHeadline = "Rendering speech locally"; statusBody = `Synthesizing with ${selectedVoice} on ${currentDtype.label}.`; } else if (status === "loading") { statusHeadline = "Loading model weights"; } else if (runtimeReady && !isCurrentDtypeLoaded) { statusHeadline = "Ready to load model"; statusBody = `${currentDtype.label} is a one-time ${currentDtype.note.split("·")[0].trim()} download. Tap Load model to cache it in this browser.`; } else if (isCurrentDtypeLoaded) { statusHeadline = "Model ready in this browser"; statusBody = `${selectedVoice} is ready on ${currentDtype.label}. Everything runs locally after the one-time model download.`; } const statusActivityLabel = status === "running" ? "Generating audio..." : status === "loading" ? "Loading in the background" : null; const statusCardBusy = !error && ( status === "loading" || status === "running" || status === null ); const loadButtonLabel = isLoadingCurrentDtype ? `Loading ${currentDtype.label}...` : `Load ${currentDtype.label}`; return (
Svāra TTS · WebGPU

Svāra

स्वरा · Indic text-to-speech in the browser

A warmer frontend for the same local synthesis engine: 19 languages, 38 voices, SNAC decoding, and no server round-trip once the model is cached in this browser.

Session

{statusHeadline}

{statusBody}

{statusActivityLabel && !error && ( )} {runtimeReady && !isCurrentDtypeLoaded && status !== "loading" && (

Model load is explicit in this build.

{loadedDtypes.length > 0 ? `Cached here: ${loadedDtypes.join(", ")}` : "Nothing cached in this browser session yet."}
)}
19 languages 38 voices 24 kHz mono Runs locally

Compose

Switch language, adjust voice, synthesize