Spaces:
Running
Running
| import { useEffect, useMemo, useRef, useState } from "react"; | |
| import { motion } from "motion/react"; | |
| const LANGUAGES = [ | |
| ["Hindi", "नमस्ते, आप कैसे हैं?"], | |
| ["Bengali", "নমস্কার, আপনি কেমন আছেন?"], | |
| ["Marathi", "नमस्कार, तुम्ही कसे आहात?"], | |
| ["Telugu", "నమస్కారం, మీరు ఎలా ఉన్నారు?"], | |
| ["Kannada", "ನಮಸ್ಕಾರ, ನೀವು ಹೇಗಿದ್ದೀರಿ?"], | |
| ["Tamil", "வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்?"], | |
| ["Malayalam", "നമസ്കാരം, സുഖമാണോ?"], | |
| ["Gujarati", "નમસ્તે, તમે કેમ છો?"], | |
| ["Punjabi", "ਸਤ ਸ੍ਰੀ ਅਕਾਲ, ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ?"], | |
| ["Assamese", "নমস্কাৰ, আপুনি কেনে আছে?"], | |
| ["Bhojpuri", "नमस्कार, राउर का हाल बा?"], | |
| ["Magahi", "नमस्कार, तू कैसन हे?"], | |
| ["Maithili", "नमस्कार, अहाँ कोना छी?"], | |
| ["Chhattisgarhi", "नमस्कार, आप कइसन हन?"], | |
| ["Bodo", "नमस्कार, नोँ बेसेबा डंनो?"], | |
| ["Dogri", "नमस्ते, तुसें कि’यां ओ?"], | |
| ["Nepali", "नमस्ते, तपाईं कस्तो हुनुहुन्छ?"], | |
| ["Sanskrit", "नमस्कारः, भवान् कथमस्ति?"], | |
| ["English (Indian)", "Hello, how are you?"], | |
| ]; | |
| const LANGUAGE_DETAILS = { | |
| Hindi: { script: "Devanagari", region: "North India" }, | |
| Bengali: { script: "Bengali", region: "Eastern India" }, | |
| Marathi: { script: "Devanagari", region: "Maharashtra" }, | |
| Telugu: { script: "Telugu", region: "Andhra Pradesh + Telangana" }, | |
| Kannada: { script: "Kannada", region: "Karnataka" }, | |
| Tamil: { script: "Tamil", region: "Tamil Nadu" }, | |
| Malayalam: { script: "Malayalam", region: "Kerala" }, | |
| Gujarati: { script: "Gujarati", region: "Gujarat" }, | |
| Punjabi: { script: "Gurmukhi", region: "Punjab" }, | |
| Assamese: { script: "Assamese", region: "Assam" }, | |
| Bhojpuri: { script: "Devanagari", region: "Bihar + Eastern UP" }, | |
| Magahi: { script: "Devanagari", region: "Bihar" }, | |
| Maithili: { script: "Devanagari", region: "Mithila" }, | |
| Chhattisgarhi: { script: "Devanagari", region: "Chhattisgarh" }, | |
| Bodo: { script: "Devanagari", region: "Northeast India" }, | |
| Dogri: { script: "Devanagari", region: "Jammu" }, | |
| Nepali: { script: "Devanagari", region: "Nepal + India" }, | |
| Sanskrit: { script: "Devanagari", region: "Classical Indic" }, | |
| "English (Indian)": { script: "Latin", region: "Indian English" }, | |
| }; | |
| const VOICES = LANGUAGES.flatMap(([lang]) => [ | |
| `${lang} (Female)`, | |
| `${lang} (Male)`, | |
| ]); | |
| const DTYPES = [ | |
| { value: "q4f16", label: "q4f16", note: "~1.95 GB · fastest cold start" }, | |
| { value: "q8", label: "q8", note: "~4.32 GB · cleaner, slower preload" }, | |
| ]; | |
| const STACK_FACTS = [ | |
| { label: "Model", value: "Svāra-TTS v1" }, | |
| { label: "Codec", value: "SNAC 24 kHz" }, | |
| { label: "Runtime", value: "WebGPU + Transformers.js" }, | |
| ]; | |
| function withEmotionTag(text, tag) { | |
| return `${text.replace(/\s*<[^>]+>\s*$/u, "").trim()} ${tag}`; | |
| } | |
| export default function App() { | |
| const worker = useRef(null); | |
| const runtimeReadyRef = useRef(false); | |
| const loadedDtypesRef = useRef([]); | |
| const [selectedVoice, setSelectedVoice] = useState("Hindi (Female)"); | |
| const [inputText, setInputText] = useState(LANGUAGES[0][1]); | |
| const [dtype, setDtype] = useState("q4f16"); | |
| const [status, setStatus] = useState(null); | |
| const [error, setError] = useState(null); | |
| const [runtimeReady, setRuntimeReady] = useState(false); | |
| const [loadingDtype, setLoadingDtype] = useState(null); | |
| const [loadedDtypes, setLoadedDtypes] = useState([]); | |
| const [loadingMessage, setLoadingMessage] = useState( | |
| "Detecting WebGPU support...", | |
| ); | |
| const [results, setResults] = useState([]); | |
| const selectedLanguage = selectedVoice.split(" (")[0]; | |
| const selectedGender = selectedVoice.includes("(Male)") ? "Male" : "Female"; | |
| const languageDetail = LANGUAGE_DETAILS[selectedLanguage] ?? { | |
| script: "Indic", | |
| region: "South Asia", | |
| }; | |
| const currentSample = | |
| LANGUAGES.find(([lang]) => lang === selectedLanguage)?.[1] ?? inputText; | |
| const currentDtype = DTYPES.find((entry) => entry.value === dtype) ?? DTYPES[0]; | |
| const isCurrentDtypeLoaded = loadedDtypes.includes(dtype); | |
| const isLoadingCurrentDtype = | |
| status === "loading" && loadingDtype === currentDtype.value; | |
| const promptChips = useMemo( | |
| () => [ | |
| { label: "Sample line", value: currentSample }, | |
| { label: "Sample + <sad>", value: withEmotionTag(currentSample, "<sad>") }, | |
| { | |
| label: "Sample + <happy>", | |
| value: withEmotionTag(currentSample, "<happy>"), | |
| }, | |
| ], | |
| [currentSample], | |
| ); | |
| useEffect(() => { | |
| runtimeReadyRef.current = runtimeReady; | |
| }, [runtimeReady]); | |
| useEffect(() => { | |
| loadedDtypesRef.current = loadedDtypes; | |
| }, [loadedDtypes]); | |
| useEffect(() => { | |
| worker.current ??= new Worker(new URL("./worker.js", import.meta.url), { | |
| type: "module", | |
| }); | |
| const onMessageReceived = (e) => { | |
| switch (e.data.status) { | |
| case "feature-success": | |
| runtimeReadyRef.current = true; | |
| setRuntimeReady(true); | |
| setError(null); | |
| setStatus("idle"); | |
| setLoadingMessage( | |
| "WebGPU is available. Load a model when you want to start local synthesis.", | |
| ); | |
| break; | |
| case "feature-error": | |
| setError(e.data.data); | |
| break; | |
| case "loading": | |
| setError(null); | |
| if (loadedDtypesRef.current.includes(e.data.dtype)) { | |
| setLoadingDtype(null); | |
| setStatus("running"); | |
| } else { | |
| setLoadingDtype(e.data.dtype); | |
| setLoadingMessage( | |
| e.data.dtype === "q8" | |
| ? "Loading q8 weights (~4.32 GB, sharded). First run can take a minute..." | |
| : "Loading q4f16 weights (~1.95 GB). First run downloads once, then stays cached...", | |
| ); | |
| setStatus("loading"); | |
| } | |
| break; | |
| case "ready": | |
| setLoadingDtype(null); | |
| setError(null); | |
| setLoadedDtypes((prev) => { | |
| if (prev.includes(e.data.dtype)) return prev; | |
| const next = [...prev, e.data.dtype]; | |
| loadedDtypesRef.current = next; | |
| return next; | |
| }); | |
| setStatus("ready"); | |
| break; | |
| case "complete": | |
| setResults((prev) => [ | |
| { | |
| text: e.data.text, | |
| src: e.data.audio, | |
| voice: e.data.voice, | |
| dtype: e.data.dtype, | |
| createdAt: new Date().toLocaleTimeString([], { | |
| hour: "numeric", | |
| minute: "2-digit", | |
| }), | |
| }, | |
| ...prev, | |
| ]); | |
| setError(null); | |
| setStatus("ready"); | |
| break; | |
| case "error": | |
| setLoadingDtype(null); | |
| setError(e.data.data); | |
| setStatus( | |
| loadedDtypesRef.current.includes(e.data.dtype) | |
| ? "ready" | |
| : runtimeReadyRef.current | |
| ? "idle" | |
| : null, | |
| ); | |
| break; | |
| } | |
| }; | |
| worker.current.addEventListener("message", onMessageReceived); | |
| worker.current.addEventListener("error", (event) => console.error(event)); | |
| return () => { | |
| worker.current.removeEventListener("message", onMessageReceived); | |
| }; | |
| }, []); | |
| const handleSubmit = (event) => { | |
| event.preventDefault(); | |
| if (!isCurrentDtypeLoaded) return; | |
| setStatus("running"); | |
| setError(null); | |
| worker.current.postMessage({ | |
| type: "generate", | |
| text: inputText.trim(), | |
| speaker_id: selectedVoice, | |
| dtype, | |
| }); | |
| }; | |
| const handleLoadModel = () => { | |
| if (!runtimeReady || isCurrentDtypeLoaded) return; | |
| setError(null); | |
| setLoadingDtype(dtype); | |
| setLoadingMessage( | |
| dtype === "q8" | |
| ? "Loading q8 weights (~4.32 GB, sharded). First run can take a minute..." | |
| : "Loading q4f16 weights (~1.95 GB). First run downloads once, then stays cached...", | |
| ); | |
| setStatus("loading"); | |
| worker.current?.postMessage({ type: "preload", dtype }); | |
| }; | |
| const onLanguageChange = (lang) => { | |
| const sample = LANGUAGES.find(([entry]) => entry === lang)?.[1] ?? inputText; | |
| setInputText(sample); | |
| setSelectedVoice(`${lang} (Female)`); | |
| }; | |
| const onDtypeChange = (next) => { | |
| if (next === dtype) return; | |
| setDtype(next); | |
| setError(null); | |
| setLoadingDtype(null); | |
| setStatus(loadedDtypesRef.current.includes(next) ? "ready" : "idle"); | |
| }; | |
| let statusHeadline = "Checking browser runtime"; | |
| let statusBody = loadingMessage; | |
| if (error) { | |
| statusHeadline = runtimeReady ? "Load issue" : "Startup issue"; | |
| statusBody = error; | |
| } else if (status === "running") { | |
| statusHeadline = "Rendering speech locally"; | |
| statusBody = `Synthesizing with ${selectedVoice} on ${currentDtype.label}.`; | |
| } else if (status === "loading") { | |
| statusHeadline = "Loading model weights"; | |
| } else if (runtimeReady && !isCurrentDtypeLoaded) { | |
| statusHeadline = "Ready to load model"; | |
| statusBody = `${currentDtype.label} is a one-time ${currentDtype.note.split("·")[0].trim()} download. Tap Load model to cache it in this browser.`; | |
| } else if (isCurrentDtypeLoaded) { | |
| statusHeadline = "Model ready in this browser"; | |
| statusBody = `${selectedVoice} is ready on ${currentDtype.label}. Everything runs locally after the one-time model download.`; | |
| } | |
| const statusActivityLabel = status === "running" | |
| ? "Generating audio..." | |
| : status === "loading" | |
| ? "Loading in the background" | |
| : null; | |
| const statusCardBusy = !error && ( | |
| status === "loading" || status === "running" || status === null | |
| ); | |
| const loadButtonLabel = isLoadingCurrentDtype | |
| ? `Loading ${currentDtype.label}...` | |
| : `Load ${currentDtype.label}`; | |
| return ( | |
| <div className="app-shell"> | |
| <div className="ornament ornament-top" aria-hidden="true"> | |
| <img src="/warli-strip.svg" alt="" /> | |
| </div> | |
| <main className="app-main"> | |
| <header className="hero"> | |
| <span className="hero-kicker">Svāra TTS · WebGPU</span> | |
| <h1 className="hero-title">Svāra</h1> | |
| <span className="hero-subline">स्वरा · Indic text-to-speech in the browser</span> | |
| <p className="hero-copy"> | |
| A warmer frontend for the same local synthesis engine: 19 languages, | |
| 38 voices, SNAC decoding, and no server round-trip once the model is | |
| cached in this browser. | |
| </p> | |
| <div className="hero-links"> | |
| <a | |
| href="https://huggingface.co/kenpath/svara-tts-v1" | |
| target="_blank" | |
| rel="noreferrer" | |
| > | |
| Base model | |
| </a> | |
| <a | |
| href="https://huggingface.co/shreyask/svara-tts-v1-ONNX" | |
| target="_blank" | |
| rel="noreferrer" | |
| > | |
| ONNX export | |
| </a> | |
| <a | |
| href="https://huggingface.co/onnx-community/snac_24khz-ONNX" | |
| target="_blank" | |
| rel="noreferrer" | |
| > | |
| SNAC codec | |
| </a> | |
| </div> | |
| </header> | |
| <section | |
| className={`card status-card ${statusCardBusy ? "is-busy" : ""}`} | |
| > | |
| <div className="status-main"> | |
| <p className="section-kicker">Session</p> | |
| <h2>{statusHeadline}</h2> | |
| <p className={`status-copy ${error ? "is-error" : ""}`}> | |
| {statusBody} | |
| </p> | |
| {statusActivityLabel && !error && ( | |
| <div className="inline-loader" aria-hidden="true"> | |
| <span className="inline-loader-dot"></span> | |
| <span className="inline-loader-label">{statusActivityLabel}</span> | |
| </div> | |
| )} | |
| {runtimeReady && !isCurrentDtypeLoaded && status !== "loading" && ( | |
| <div className="model-gate"> | |
| <div> | |
| <p className="model-gate-copy"> | |
| Model load is explicit in this build. | |
| </p> | |
| <span className="model-gate-sub"> | |
| {loadedDtypes.length > 0 | |
| ? `Cached here: ${loadedDtypes.join(", ")}` | |
| : "Nothing cached in this browser session yet."} | |
| </span> | |
| </div> | |
| <button | |
| type="button" | |
| className="primary-button load-button" | |
| onClick={handleLoadModel} | |
| disabled={!runtimeReady || status === "running"} | |
| > | |
| {loadButtonLabel} | |
| </button> | |
| </div> | |
| )} | |
| </div> | |
| <div className="pill-row"> | |
| <span className="pill">19 languages</span> | |
| <span className="pill">38 voices</span> | |
| <span className="pill">24 kHz mono</span> | |
| <span className="pill">Runs locally</span> | |
| </div> | |
| </section> | |
| <div className="workspace"> | |
| <section className="card composer-card"> | |
| <div className="card-header"> | |
| <div> | |
| <p className="section-kicker">Compose</p> | |
| <h2>Switch language, adjust voice, synthesize</h2> | |
| </div> | |
| <button | |
| type="button" | |
| className="ghost-button" | |
| onClick={() => setInputText(currentSample)} | |
| > | |
| Use sample | |
| </button> | |
| </div> | |
| <form onSubmit={handleSubmit} className="composer-form"> | |
| <div className="control-grid"> | |
| <label className="field"> | |
| <span className="field-label">Language</span> | |
| <select | |
| value={selectedLanguage} | |
| onChange={(event) => onLanguageChange(event.target.value)} | |
| > | |
| {LANGUAGES.map(([lang]) => ( | |
| <option key={lang} value={lang}> | |
| {lang} | |
| </option> | |
| ))} | |
| </select> | |
| </label> | |
| <label className="field"> | |
| <span className="field-label">Voice</span> | |
| <select | |
| value={selectedVoice} | |
| onChange={(event) => setSelectedVoice(event.target.value)} | |
| > | |
| {VOICES.filter((voice) => voice.startsWith(`${selectedLanguage} (`)).map( | |
| (voice) => ( | |
| <option key={voice} value={voice}> | |
| {voice.split("(")[1].replace(")", "")} | |
| </option> | |
| ), | |
| )} | |
| </select> | |
| </label> | |
| <label className="field field-wide"> | |
| <span className="field-label">Quantization</span> | |
| <select | |
| value={dtype} | |
| onChange={(event) => onDtypeChange(event.target.value)} | |
| disabled={status === "running" || status === "loading"} | |
| > | |
| {DTYPES.map((entry) => ( | |
| <option key={entry.value} value={entry.value}> | |
| {entry.label} | |
| </option> | |
| ))} | |
| </select> | |
| <small className="field-note">{currentDtype.note}</small> | |
| </label> | |
| </div> | |
| <label className="field"> | |
| <div className="label-row"> | |
| <span className="field-label">Prompt</span> | |
| <span className="field-meta"> | |
| {languageDetail.script} · {languageDetail.region} | |
| </span> | |
| </div> | |
| <textarea | |
| placeholder="Enter text in any supported Indian language..." | |
| value={inputText} | |
| onChange={(event) => setInputText(event.target.value)} | |
| rows={Math.min(8, Math.max(4, inputText.split("\n").length))} | |
| /> | |
| </label> | |
| <div className="chip-bar"> | |
| {promptChips.map((chip) => ( | |
| <button | |
| key={chip.label} | |
| type="button" | |
| className="utility-chip" | |
| onClick={() => setInputText(chip.value)} | |
| > | |
| {chip.label} | |
| </button> | |
| ))} | |
| </div> | |
| <div className="composer-footer"> | |
| <p className="helper-copy"> | |
| Emotion tags can be appended at the end of the sentence, for | |
| example <code><sad></code> or <code><happy></code>. | |
| Use <code>q8</code> if you want the cleanest output and can | |
| afford the larger one-time download. | |
| </p> | |
| <button | |
| type="submit" | |
| className="primary-button" | |
| disabled={ | |
| status !== "ready" || | |
| !isCurrentDtypeLoaded || | |
| inputText.trim() === "" | |
| } | |
| > | |
| {status === "running" | |
| ? "Generating audio..." | |
| : isLoadingCurrentDtype | |
| ? `Loading ${currentDtype.label}...` | |
| : !isCurrentDtypeLoaded | |
| ? "Load model to continue" | |
| : "Generate speech"} | |
| </button> | |
| </div> | |
| </form> | |
| </section> | |
| <aside className="sidebar"> | |
| <section className="card inspector-card"> | |
| <p className="section-kicker">Inspector</p> | |
| <h3 className="inspector-title">{selectedVoice}</h3> | |
| <dl className="compact-meta-grid"> | |
| <div className="compact-meta"> | |
| <dt>Script</dt> | |
| <dd>{languageDetail.script}</dd> | |
| </div> | |
| <div className="compact-meta"> | |
| <dt>Region</dt> | |
| <dd>{languageDetail.region}</dd> | |
| </div> | |
| <div className="compact-meta"> | |
| <dt>Type</dt> | |
| <dd>{selectedGender}</dd> | |
| </div> | |
| <div className="compact-meta"> | |
| <dt>Quant</dt> | |
| <dd>{currentDtype.label}</dd> | |
| </div> | |
| </dl> | |
| <div className="stack-chip-list"> | |
| {STACK_FACTS.map((fact) => ( | |
| <div key={fact.label} className="stack-chip"> | |
| <span>{fact.label}</span> | |
| <strong>{fact.value}</strong> | |
| </div> | |
| ))} | |
| </div> | |
| <details className="debug-notes"> | |
| <summary>Usage notes</summary> | |
| <ul className="note-list note-list-compact"> | |
| <li>Model and codec are browser-cached after the first load.</li> | |
| <li>Short prompts are the best way to compare voices and quant levels.</li> | |
| <li>The results archive below preserves each render with the actual voice used.</li> | |
| </ul> | |
| </details> | |
| </section> | |
| </aside> | |
| </div> | |
| {results.length > 0 && ( | |
| <section className="results-section"> | |
| <div className="results-header"> | |
| <div> | |
| <p className="section-kicker">Archive</p> | |
| <h2>Generated takes</h2> | |
| </div> | |
| <span className="results-meta">Newest first</span> | |
| </div> | |
| <div className="results-grid"> | |
| {results.map((result, index) => ( | |
| <motion.article | |
| key={`${result.voice}-${result.createdAt}-${index}`} | |
| initial={{ y: 24, opacity: 0 }} | |
| animate={{ y: 0, opacity: 1 }} | |
| transition={{ duration: 0.35, delay: index * 0.04 }} | |
| className="card result-card" | |
| > | |
| <div className="result-head"> | |
| <div> | |
| <h3>{result.voice}</h3> | |
| <p>{result.createdAt}</p> | |
| </div> | |
| <span className="result-pill">{result.dtype}</span> | |
| </div> | |
| <p className="result-text">{result.text}</p> | |
| <audio controls src={result.src} className="result-audio"> | |
| Your browser does not support the audio element. | |
| </audio> | |
| </motion.article> | |
| ))} | |
| </div> | |
| </section> | |
| )} | |
| </main> | |
| <div className="ornament ornament-bottom" aria-hidden="true"> | |
| <img src="/warli-strip.svg" alt="" /> | |
| </div> | |
| </div> | |
| ); | |
| } | |