Spaces:

build-small-hackathon
/

tiny-army

Running

App Files Files Community

tiny-army / web /ttsQwen3.js

polats's picture

Voice: provider-driven per-hero voices; settings is provider-only

e648dca 5 days ago

history blame contribute delete

5.07 kB

	// TTS engine: Qwen3-TTS Voice Design via Alibaba DashScope, proxied through our backend
	// (/qwen-tts) so the API key stays server-side. The "voice" here is a free-form natural-
	// language DESCRIPTION (the persona's `voice` field, or a preset), used as DashScope's
	// `voice_prompt`. NETWORKED — not local-first (clearly labeled). mode 'pcm'.
	import { decodeAudio } from '/web/ttsAudio.js'

	// Endpoint: default is our Space backend (/qwen-tts → DashScope). A `?tts=<base>` query
	// param (persisted to localStorage) points it at a self-run local server instead —
	// the LeLab-style bridge: hosted UI → Qwen3-TTS on YOUR GPU, off the grid. `?tts=`
	// (empty) clears the override. e.g. ?tts=http://localhost:8800
	// localStorage key under which the `?tts=<base>` override is persisted.
	const TTS_STORE = 'tinyarmy.ttsBase'

	/**
	 * Resolve the base URL that `/qwen-tts` requests are sent to.
	 *
	 * If the page URL carries a `tts` query parameter it is applied first: a non-empty
	 * value is stored (trailing slashes stripped), an empty value removes the stored
	 * override. The stored value is then returned, again without trailing slashes.
	 *
	 * @returns {string} the override base, or '' (same-origin) when none is stored or
	 *   when `location` / `localStorage` cannot be accessed.
	 */
	function ttsBase() {
	  const stripTrailingSlashes = (s) => s.replace(/\/+$/, '')
	  try {
	    const q = new URLSearchParams(location.search).get('tts')
	    // `null` means the parameter is absent: leave any stored override untouched.
	    if (q !== null) {
	      if (q) localStorage.setItem(TTS_STORE, stripTrailingSlashes(q))
	      else localStorage.removeItem(TTS_STORE)
	    }
	    return stripTrailingSlashes(localStorage.getItem(TTS_STORE) || '')
	  } catch {
	    // Best-effort by design: any failure falls back to same-origin.
	    return ''
	  }
	}

	// `desc()` returns the instruct string. 'persona' uses the dynamically-set description.
	// Description used by the 'persona' voice; read lazily through its `desc()` closure.
	let _desc = ''

	// Selectable voices. Each entry's `desc()` returns the free-form description string
	// that is sent as the request's `instruct` field.
	const VOICES = [
	  { id: 'persona', label: '✨ Persona voice (designed)', desc: () => _desc },
	  { id: 'veteran', label: 'Gruff veteran', desc: () => 'A gravelly, battle-worn male baritone — slow, deliberate, weary, with a wry edge.' },
	  { id: 'herald', label: 'Bright herald', desc: () => 'A clear, bright young male voice — brisk, energetic, projecting and confident.' },
	  { id: 'medic', label: 'Steady medic', desc: () => 'A calm, warm female voice — measured pace, clear articulation, reassuring.' },
	  { id: 'rogue', label: 'Sly rogue', desc: () => 'A low, smooth voice with a sly, amused lilt — unhurried, with a dangerous edge.' },
	]

	/**
	 * Look up a voice by id.
	 * @param {string} id - voice id to find.
	 * @returns the matching entry, or the first entry ('persona') when the id is unknown.
	 */
	const get = (id) => VOICES.find((v) => v.id === id) || VOICES[0]

	/**
	 * Report whether the page is served from a loopback / local host name.
	 *
	 * Matches `localhost`, `127.0.0.1`, `::1` (with or without brackets) and `0.0.0.0`,
	 * case-insensitively, against the whole of `location.hostname`.
	 *
	 * @returns {boolean} true for a local host name; false otherwise, including when
	 *   `location` is unavailable.
	 */
	export const isLocalhost = () => {
	  try {
	    // Unescaped `|` is alternation; an escaped `\|` would only match a literal pipe.
	    return /^(localhost|127\.0\.0\.1|\[?::1\]?|0\.0\.0\.0)$/i.test(location.hostname)
	  } catch {
	    return false
	  }
	}

	// POST to `${base}/qwen-tts` → raw WAV ArrayBuffer. base '' = same-origin.
	/**
	 * POST a synthesis request to `${base}/qwen-tts` and return the response body.
	 *
	 * @param {string} base - endpoint base URL; '' targets the same origin.
	 * @param {string} text - the text to speak.
	 * @param {string} voiceId - id resolved through `get()`; its `desc()` becomes `instruct`.
	 * @returns {Promise<ArrayBuffer>} the raw response bytes.
	 * @throws {Error} when the response is not ok; the message carries the HTTP status
	 *   and the first 140 characters of the response text.
	 */
	async function postSynthWav(base, text, voiceId) {
	  // A missing/empty description degrades to '' rather than throwing on `.trim()`.
	  const instruct = (get(voiceId).desc() || '').trim()
	  const resp = await fetch(`${base}/qwen-tts`, {
	    method: 'POST',
	    headers: { 'Content-Type': 'application/json' },
	    body: JSON.stringify({ text, instruct, language: 'English' }),
	  })
	  if (!resp.ok) throw new Error(`Qwen3-TTS ${resp.status}: ${(await resp.text()).slice(0, 140)}`)
	  return resp.arrayBuffer()
	}
	// Same request as postSynthWav, but the returned bytes are passed through
	// decodeAudio and that result is what the caller receives.
	const postSynth = async (base, text, voiceId) => {
	  const wavBytes = await postSynthWav(base, text, voiceId)
	  return decodeAudio(wavBytes)
	}

	// Voice CLONE: synth `text` using a reference WAV (the last created voice) so the timbre
	// stays identical — only the words change. ref is an ArrayBuffer; sent as base64.
	/**
	 * Base64-encode the bytes of an ArrayBuffer.
	 * @param {ArrayBuffer} ab - the bytes to encode.
	 * @returns {string} the base64 text.
	 */
	function abToB64(ab) {
	  const bytes = new Uint8Array(ab)
	  // Convert in 0x8000-byte slices so no single fromCharCode call receives an
	  // oversized argument list.
	  const SLICE = 0x8000
	  const pieces = []
	  let offset = 0
	  while (offset < bytes.length) {
	    pieces.push(String.fromCharCode.apply(null, bytes.subarray(offset, offset + SLICE)))
	    offset += SLICE
	  }
	  return btoa(pieces.join(''))
	}
	/**
	 * POST a voice-clone request to `${base}/qwen-tts` and return the response body.
	 *
	 * @param {string} base - endpoint base URL; '' targets the same origin.
	 * @param {string} text - the text to speak.
	 * @param {ArrayBuffer} refAb - reference audio; sent base64-encoded as `ref_audio`.
	 * @param {string} [refText] - sent as `ref_text`; '' when falsy.
	 * @param {string} [instruct] - voice description; '' when falsy.
	 * @returns {Promise<ArrayBuffer>} the raw response bytes.
	 * @throws {Error} when the response is not ok; the message carries the HTTP status
	 *   and the first 140 characters of the response text.
	 */
	async function postClone(base, text, refAb, refText, instruct) {
	  const resp = await fetch(`${base}/qwen-tts`, {
	    method: 'POST',
	    headers: { 'Content-Type': 'application/json' },
	    // instruct lets prod (DashScope, no clone model) gracefully re-design from the
	    // description instead of cloning; local mode uses ref_audio to clone the timbre.
	    body: JSON.stringify({
	      text,
	      ref_audio: abToB64(refAb),
	      ref_text: refText || '',
	      instruct: instruct || '',
	      language: 'English',
	    }),
	  })
	  if (!resp.ok) throw new Error(`Qwen3-TTS clone ${resp.status}: ${(await resp.text()).slice(0, 140)}`)
	  return resp.arrayBuffer()
	}

	// Properties shared by both engine variants; each variant spreads this object and
	// adds its own id, label, availability and synth functions.
	const common = {
	  mode: 'pcm',
	  needsDownload: false,
	  networked: true,
	  design: true, // designs a voice from a free-form description (the persona's `voice`)
	  listVoices: () => VOICES,
	  defaultVoice: 'persona',
	  ensure: async () => { /* nothing to load — server-side */ },
	  // Stores the trimmed description in the module-level `_desc`, so both variants
	  // see the same value; falsy input resets it to ''.
	  setDesc(d) { _desc = (d || '').trim() },
	}

	// CLOUD: the hosted backend (/qwen-tts → DashScope). `?tts=` can still bridge it.
	export const engine = {
	  ...common,
	  id: 'qwen3',
	  label: 'Qwen3-TTS · Voice Design (cloud)',
	  // Always offered.
	  available() {
	    return true
	  },
	  // Each call re-reads ttsBase(), so a changed override applies to the next request.
	  synth(text, voiceId) {
	    return postSynth(ttsBase(), text, voiceId)
	  },
	  synthWav(text, voiceId) {
	    return postSynthWav(ttsBase(), text, voiceId)
	  },
	  cloneWav(text, refAb, refText, instruct) {
	    return postClone(ttsBase(), text, refAb, refText, instruct)
	  },
	  // Short label for the active backend: the override's host when one is set and
	  // parses as a URL, otherwise the DashScope label.
	  backendLabel() {
	    const cloudLabel = '☁ DashScope'
	    const base = ttsBase()
	    if (!base) return cloudLabel
	    try {
	      return '🖥 ' + new URL(base).host
	    } catch {
	      return cloudLabel
	    }
	  },
	}

	// LOCAL: same-origin /qwen-tts on a locally-run app.py (TINY_TTS_MODE=local → the open
	// weights on your GPU). Only offered on localhost; disabled with a note in prod.
	export const engineLocal = {
	  ...common,
	  id: 'qwen3local',
	  label: 'Qwen3-TTS · local (your GPU)',
	  // Offered only when isLocalhost() reports a local host name.
	  available() {
	    return isLocalhost()
	  },
	  note: 'run the project locally',
	  // Base '' keeps the request on the same origin; no override is consulted.
	  synth(text, voiceId) {
	    return postSynth('', text, voiceId)
	  },
	  backendLabel() {
	    return '🖥 local model'
	  },
	}