Spaces:

build-small-hackathon
/

tiny-army

Running

polats Claude Opus 4.8 (1M context) committed 5 days ago

Commit

72160ec

1 Parent(s): 308478f

Persona: red section headings, icon buttons, edit-aware voice + clone-on-play

- Result laid out in sections (line + red heading): About, Quote, Voice design.
- ▶ play icon sits on the Quote heading; 🎙 create/recreate sits on the Voice design
heading; both simplified to icons (play is neutral, not red).
- Editing the quote or voice marks the cached audio stale → a red badge on ▶.
- Tapping ▶ when badged does NOT redesign the voice: it CLONES the last voice file
(Qwen3-TTS Base model, keeping the exact timbre) for the new line, plays it, saves
over the old file, and clears the badge. Otherwise ▶ replays the cached file.
🎙 is the explicit "new voice from the description" path.
- Backend: TINY_TTS_MODE=local lazily loads the Base model and clones via
generate_voice_clone, passing the reference audio as a (numpy, sr) tuple. Prod (DashScope)
has no clone model, so it receives the voice description and re-designs gracefully instead.
Verified the clone end-to-end on the GPU (5s).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

Files changed (5) hide show

app.py +38 -12
web/personaPanel.js +58 -29
web/shell/persona.css +22 -12
web/tts.js +5 -0
web/ttsQwen3.js +19 -0

app.py CHANGED Viewed

@@ -246,26 +246,47 @@ _DASHSCOPE_URL = _DASHSCOPE_BASE + "/api/v1/services/audio/tts/customization"
 # origin so no CORS/cert dance — the LeLab pattern). Needs `pip install qwen-tts torch
 # soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
 TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
-_local_tts = None
 _local_tts_lock = threading.Lock()
 def _local_voice_design(text, instruct, language="English"):
     global _local_tts
-    import io
     with _local_tts_lock:  # one GPU model can't decode in parallel
         if _local_tts is None:
-            import torch
-            from qwen_tts import Qwen3TTSModel
-            mid = os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign")
-            dev = "cuda:0" if torch.cuda.is_available() else "cpu"
-            dt = torch.bfloat16 if dev != "cpu" else torch.float32
-            _local_tts = Qwen3TTSModel.from_pretrained(mid, device_map=dev, dtype=dt)
-        import soundfile as sf
         wavs, sr = _local_tts.generate_voice_design(
             text=text, language=language, instruct=instruct or "A clear, natural voice at a moderate pace.")
-    out = io.BytesIO()
-    sf.write(out, wavs[0], sr, format="WAV")
     return out.getvalue()
@@ -303,11 +324,16 @@ async def qwen_tts(request: Request):
     text = (body.get("text") or "").strip()
     instruct = (body.get("instruct") or "").strip()
     language = body.get("language") or "English"
     if not text:
         return Response("text required", status_code=400)
     if TTS_MODE == "local":  # in-process open weights (dev)
         try:
-            wav = await asyncio.to_thread(_local_voice_design, text, instruct, language)
         except Exception as e:  # noqa: BLE001 — surface a clear setup hint
             return Response(f"local TTS error (pip install qwen-tts torch soundfile?): {e}", status_code=500)
         return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})

 # origin so no CORS/cert dance — the LeLab pattern). Needs `pip install qwen-tts torch
 # soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
 TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
+_local_tts = None       # VoiceDesign model
+_local_clone = None     # Base model (voice clone) — lazy, only if a clone is requested
 _local_tts_lock = threading.Lock()
+def _load(which):
+    import torch
+    from qwen_tts import Qwen3TTSModel
+    mid = os.environ.get(
+        "QWEN_TTS_MODEL" if which == "design" else "QWEN_TTS_CLONE_MODEL",
+        "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign" if which == "design" else "Qwen/Qwen3-TTS-12Hz-1.7B-Base")
+    dev = "cuda:0" if torch.cuda.is_available() else "cpu"
+    dt = torch.bfloat16 if dev != "cpu" else torch.float32
+    return Qwen3TTSModel.from_pretrained(mid, device_map=dev, dtype=dt)
 def _local_voice_design(text, instruct, language="English"):
     global _local_tts
+    import io, soundfile as sf
     with _local_tts_lock:  # one GPU model can't decode in parallel
         if _local_tts is None:
+            _local_tts = _load("design")
         wavs, sr = _local_tts.generate_voice_design(
             text=text, language=language, instruct=instruct or "A clear, natural voice at a moderate pace.")
+    out = io.BytesIO(); sf.write(out, wavs[0], sr, format="WAV")
+    return out.getvalue()
+def _local_voice_clone(text, ref_audio_b64, ref_text, language="English"):
+    # Keep the SAME timbre as a previously-created voice by cloning from its audio (the
+    # "Voice Design → Clone" workflow). qwen-tts wants ref_audio as a (numpy, sr) tuple
+    # (a raw base64 string gets mistaken for a file path), so decode the WAV here.
+    global _local_clone
+    import io, soundfile as sf
+    ref_np, ref_sr = sf.read(io.BytesIO(base64.b64decode(ref_audio_b64)))
+    with _local_tts_lock:
+        if _local_clone is None:
+            _local_clone = _load("clone")
+        wavs, sr = _local_clone.generate_voice_clone(
+            text=text, language=language, ref_audio=(ref_np, ref_sr), ref_text=ref_text or "")
+    out = io.BytesIO(); sf.write(out, wavs[0], sr, format="WAV")
     return out.getvalue()
     text = (body.get("text") or "").strip()
     instruct = (body.get("instruct") or "").strip()
     language = body.get("language") or "English"
+    ref_audio = body.get("ref_audio")  # base64 WAV → clone (keep timbre, new words)
+    ref_text = body.get("ref_text") or ""
     if not text:
         return Response("text required", status_code=400)
     if TTS_MODE == "local":  # in-process open weights (dev)
         try:
+            if ref_audio:
+                wav = await asyncio.to_thread(_local_voice_clone, text, ref_audio, ref_text, language)
+            else:
+                wav = await asyncio.to_thread(_local_voice_design, text, instruct, language)
         except Exception as e:  # noqa: BLE001 — surface a clear setup hint
             return Response(f"local TTS error (pip install qwen-tts torch soundfile?): {e}", status_code=500)
         return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})

web/personaPanel.js CHANGED Viewed

@@ -7,7 +7,7 @@ import { streamChat, ensureModel, currentModel, currentModelId, getEngineId, bac
 import { extractLivePersona } from '/web/personaStream.js'
 import { parsePersonaJson } from '/web/personaParse.js'
 import { PERSONA_SYSTEM, personaUserPrompt, stripThink, stripThinkFinal, noThink } from '/web/personaPrompts.js'
-import { createVoiceWav, playWav, stopPreview } from '/web/tts.js'
 import { listPersonas, savePersona, removePersona, onRosterChange, putAudio, getAudio } from '/web/personaStore.js'
 const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
@@ -36,30 +36,50 @@ export function mountPersonaPanel(host) {
   const tagsEl = el('div', { class: 'persona-tags' })
   const aboutEl = el('div', { class: 'persona-about persona-edit', 'data-ph': 'Their story…' })
   const quoteEl = el('blockquote', { class: 'persona-quote persona-edit', 'data-ph': 'A line they say…' })
-  const replayBtn = el('button', { class: 'persona-replay', type: 'button', title: 'Replay voice', style: 'display:none' }, '▶')
-  const quoteRow = el('div', { class: 'persona-quote-row' }, [quoteEl, replayBtn])
-  const voiceLabel = el('div', { class: 'persona-voice-lbl' }, '🎙 Voice design')
   const voiceEl = el('div', { class: 'persona-voice-desc persona-edit', 'data-ph': 'How they sound…' })
-  const createBtn = el('button', { class: 'persona-go persona-go-alt persona-create', type: 'button', style: 'display:none' }, '🎙 Create voice')
   const thinkEl = el('pre', { class: 'persona-think' })
   const copyBtn = el('button', { class: 'persona-copy', type: 'button' }, '📋 Copy debug')
   const thinkWrap = el('details', { class: 'persona-think-wrap' },
     [el('summary', {}, 'model output / debug (raw)'), copyBtn, thinkEl])
   const controls = el('aside', { class: 'persona-controls' }, [
     el('label', { class: 'persona-label' }, 'Class'), sel,
     el('label', { class: 'persona-label' }, 'Seed'), seed,
     btn, stats, status,
     el('label', { class: 'persona-label persona-roster-label' }, 'Barracks (saved)'), rosterEl,
   ])
-  const result = el('div', { class: 'persona-result' },
-    [nameEl, tagsEl, aboutEl, quoteRow, voiceLabel, voiceEl, el('div', { class: 'persona-actions' }, [createBtn]), thinkWrap])
   host.appendChild(el('div', { class: 'persona-view' }, [controls, result]))
   let lastPersona = null  // the persona currently shown
   let savedId = null      // its roster id (set the moment it's shown — always saved)
   let working = false
   function autosave() {
     if (!lastPersona) return
     const rec = savePersona({ ...lastPersona, id: savedId, unitClass: lastPersona.unitClass || sel.value, seed: lastPersona.seed || seed.value })
@@ -77,6 +97,7 @@ export function mountPersonaPanel(host) {
       if ((lastPersona[field] || '') === v) return
       lastPersona[field] = v
       autosave()
     })
   }
   editable(nameEl, 'name', { single: true })
@@ -97,42 +118,50 @@ export function mountPersonaPanel(host) {
     aboutEl.textContent = p.about || ''
     quoteEl.textContent = p.quote || ''
     voiceEl.textContent = p.voice || ''
-    createBtn.style.display = ''
-    // Show replay if we have a cached voice file for this saved persona.
-    const has = savedId ? !!(await getAudio(savedId)) : false
-    replayBtn.style.display = has ? '' : 'none'
-    createBtn.textContent = has ? '🎙 Recreate voice' : '🎙 Create voice'
   }
-  // 🎙 Create voice — synth the QUOTE in the designed voice, cache the WAV, play it.
   async function createVoice() {
     if (working || !lastPersona) return
-    const line = (lastPersona.quote || '').trim() || (lastPersona.about || '').trim() || `${lastPersona.name} reporting for duty.`
     if (!lastPersona.voice) { status.textContent = 'add a voice design first'; return }
     autosave() // ensure an id to key the audio
-    working = true; const prevTxt = createBtn.textContent; createBtn.textContent = '🎙 designing…'; createBtn.disabled = true
-    const prev = status.textContent
     try {
       const wav = await createVoiceWav(lastPersona.voice, line)
       await putAudio(savedId, new Blob([wav], { type: 'audio/wav' }))
-      try { await playWav(wav.slice(0)) } catch { /* autoplay blocked — replay button still works */ }
-      replayBtn.style.display = ''
-      createBtn.textContent = '🎙 Recreate voice'
       status.textContent = prev
-    } catch (e) {
-      status.textContent = `voice failed: ${e.message || e}`
-      createBtn.textContent = prevTxt
-    } finally { working = false; createBtn.disabled = false }
   }
   createBtn.addEventListener('click', createVoice)
-  async function replay() {
-    if (!savedId) return
     const blob = await getAudio(savedId)
-    if (!blob) return createVoice()
-    try { await playWav(await blob.arrayBuffer()) } catch { /* ignore */ }
   }
-  replayBtn.addEventListener('click', replay)
   // ── Barracks roster (saved soldiers) ──────────────────────────────────────
   function renderRoster(personas) {
@@ -180,7 +209,7 @@ export function mountPersonaPanel(host) {
     if (window.innerWidth <= 768) result.scrollIntoView({ behavior: 'smooth', block: 'start' })
     nameEl.textContent = '…'; aboutEl.textContent = ''; tagsEl.replaceChildren()
     quoteEl.textContent = ''; voiceEl.textContent = ''
-    createBtn.style.display = 'none'; replayBtn.style.display = 'none'; lastPersona = null; savedId = null
     stopPreview()
     thinkEl.textContent = ''; thinkWrap.open = true; stats.textContent = ''
     let acc = ''

 import { extractLivePersona } from '/web/personaStream.js'
 import { parsePersonaJson } from '/web/personaParse.js'
 import { PERSONA_SYSTEM, personaUserPrompt, stripThink, stripThinkFinal, noThink } from '/web/personaPrompts.js'
+import { createVoiceWav, cloneVoiceWav, playWav, stopPreview } from '/web/tts.js'
 import { listPersonas, savePersona, removePersona, onRosterChange, putAudio, getAudio } from '/web/personaStore.js'
 const CLASSES = ['Warrior', 'Ranger', 'Monk', 'Assassin', 'Mage', 'Paladin', 'Cleric', 'Knight']
   const tagsEl = el('div', { class: 'persona-tags' })
   const aboutEl = el('div', { class: 'persona-about persona-edit', 'data-ph': 'Their story…' })
   const quoteEl = el('blockquote', { class: 'persona-quote persona-edit', 'data-ph': 'A line they say…' })
   const voiceEl = el('div', { class: 'persona-voice-desc persona-edit', 'data-ph': 'How they sound…' })
+  // ▶ play sits on the Quote heading; 🎙 create sits on the Voice design heading.
+  const playBtn = el('button', { class: 'persona-ico persona-play', type: 'button', title: 'Play voice', style: 'display:none' }, '▶')
+  const createBtn = el('button', { class: 'persona-ico persona-create', type: 'button', title: 'Create voice', style: 'display:none' }, '🎙')
   const thinkEl = el('pre', { class: 'persona-think' })
   const copyBtn = el('button', { class: 'persona-copy', type: 'button' }, '📋 Copy debug')
   const thinkWrap = el('details', { class: 'persona-think-wrap' },
     [el('summary', {}, 'model output / debug (raw)'), copyBtn, thinkEl])
+  // A section header: a top line + a small red heading, with an optional action on the right.
+  const secHead = (title, action) =>
+    el('div', { class: 'persona-sec' }, [el('div', { class: 'persona-sec-title' }, title), action || el('span')])
   const controls = el('aside', { class: 'persona-controls' }, [
     el('label', { class: 'persona-label' }, 'Class'), sel,
     el('label', { class: 'persona-label' }, 'Seed'), seed,
     btn, stats, status,
     el('label', { class: 'persona-label persona-roster-label' }, 'Barracks (saved)'), rosterEl,
   ])
+  const result = el('div', { class: 'persona-result' }, [
+    nameEl, tagsEl,
+    secHead('About'), aboutEl,
+    secHead('Quote', playBtn), quoteEl,
+    secHead('Voice design', createBtn), voiceEl,
+    thinkWrap,
+  ])
   host.appendChild(el('div', { class: 'persona-view' }, [controls, result]))
   let lastPersona = null  // the persona currently shown
   let savedId = null      // its roster id (set the moment it's shown — always saved)
+  let hasVoice = false    // a cached voice file exists for this persona
   let working = false
+  // The line the voice actually says (quote, else about, else a fallback).
+  const lineFor = (p) => (p.quote || '').trim() || (p.about || '').trim() || `${p.name || 'A soldier'} reporting for duty.`
+  // Cached audio is stale if the line or the voice design changed since it was made.
+  const isDirty = () => hasVoice && lastPersona && (lineFor(lastPersona) !== lastPersona.voiceQuote || (lastPersona.voice || '') !== (lastPersona.voiceDesignUsed || ''))
+  function updateVoiceUI() {
+    playBtn.style.display = hasVoice ? '' : 'none'
+    playBtn.classList.toggle('badged', isDirty())
+    createBtn.style.display = lastPersona ? '' : 'none'
+    createBtn.title = hasVoice ? 'Recreate voice' : 'Create voice'
+  }
   function autosave() {
     if (!lastPersona) return
     const rec = savePersona({ ...lastPersona, id: savedId, unitClass: lastPersona.unitClass || sel.value, seed: lastPersona.seed || seed.value })
       if ((lastPersona[field] || '') === v) return
       lastPersona[field] = v
       autosave()
+      if (field === 'quote' || field === 'voice') updateVoiceUI() // may go stale → badge
     })
   }
   editable(nameEl, 'name', { single: true })
     aboutEl.textContent = p.about || ''
     quoteEl.textContent = p.quote || ''
     voiceEl.textContent = p.voice || ''
+    hasVoice = savedId ? !!(await getAudio(savedId)) : false
+    updateVoiceUI()
   }
+  // 🎙 Create / Recreate voice — DESIGN a fresh voice from the description and cache it.
   async function createVoice() {
     if (working || !lastPersona) return
     if (!lastPersona.voice) { status.textContent = 'add a voice design first'; return }
     autosave() // ensure an id to key the audio
+    const line = lineFor(lastPersona)
+    working = true; createBtn.classList.add('busy'); createBtn.disabled = true
+    const prev = status.textContent; status.textContent = 'designing the voice…'
     try {
       const wav = await createVoiceWav(lastPersona.voice, line)
       await putAudio(savedId, new Blob([wav], { type: 'audio/wav' }))
+      lastPersona.voiceQuote = line; lastPersona.voiceDesignUsed = lastPersona.voice
+      hasVoice = true; autosave()
+      try { await playWav(wav.slice(0)) } catch { /* autoplay blocked — ▶ still works */ }
       status.textContent = prev
+    } catch (e) { status.textContent = `voice failed: ${e.message || e}` }
+    finally { working = false; createBtn.classList.remove('busy'); createBtn.disabled = false; updateVoiceUI() }
   }
   createBtn.addEventListener('click', createVoice)
+  // ▶ Play — plays the cached file. If the quote/voice changed since (badge), re-render
+  // the new line by CLONING the last voice (keeps the same timbre), then save over it.
+  async function play() {
+    if (working || !hasVoice || !savedId) return
     const blob = await getAudio(savedId)
+    if (!blob) { hasVoice = false; updateVoiceUI(); return }
+    if (!isDirty()) { try { await playWav(await blob.arrayBuffer()) } catch { /* ignore */ } return }
+    working = true; playBtn.classList.add('busy'); playBtn.disabled = true
+    const prev = status.textContent; status.textContent = 'updating the voice…'
+    try {
+      const line = lineFor(lastPersona)
+      const newWav = await cloneVoiceWav(await blob.arrayBuffer(), lastPersona.voiceQuote || '', line, lastPersona.voice || '')
+      try { await playWav(newWav.slice(0)) } catch { /* ignore */ }
+      await putAudio(savedId, new Blob([newWav], { type: 'audio/wav' })) // save over
+      lastPersona.voiceQuote = line; lastPersona.voiceDesignUsed = lastPersona.voice
+      autosave(); status.textContent = prev
+    } catch (e) { status.textContent = `voice update failed: ${e.message || e}` }
+    finally { working = false; playBtn.classList.remove('busy'); playBtn.disabled = false; updateVoiceUI() }
   }
+  playBtn.addEventListener('click', play)
   // ── Barracks roster (saved soldiers) ──────────────────────────────────────
   function renderRoster(personas) {
     if (window.innerWidth <= 768) result.scrollIntoView({ behavior: 'smooth', block: 'start' })
     nameEl.textContent = '…'; aboutEl.textContent = ''; tagsEl.replaceChildren()
     quoteEl.textContent = ''; voiceEl.textContent = ''
+    createBtn.style.display = 'none'; playBtn.style.display = 'none'; lastPersona = null; savedId = null; hasVoice = false
     stopPreview()
     thinkEl.textContent = ''; thinkWrap.open = true; stats.textContent = ''
     let acc = ''

web/shell/persona.css CHANGED Viewed

@@ -62,29 +62,39 @@
   font-size: 17px; line-height: 1.6; max-width: 60ch; color: var(--p-ink);
   white-space: pre-wrap;
 }
-.persona-voice-lbl {
-  font-family: var(--p-mono); font-size: 10px; letter-spacing: .14em; text-transform: uppercase;
-  color: var(--p-muted); margin-top: 16px;
 }
 .persona-voice-desc {
   font-family: var(--p-mono); font-size: 12px; line-height: 1.5; color: var(--p-muted);
-  max-width: 60ch; margin-top: 4px; font-style: italic;
 }
-.persona-quote-row { display: flex; align-items: flex-start; gap: 10px; margin-top: 16px; }
 .persona-quote {
-  flex: 1; margin: 0; padding: 6px 0 6px 16px; border-left: 3px solid var(--p-transmit);
   font-family: 'Fraunces', Georgia, serif; font-size: 21px; font-style: italic;
   line-height: 1.35; color: var(--p-ink); max-width: 54ch;
 }
 .persona-quote:not(:empty)::before { content: '“'; }
 .persona-quote:not(:empty)::after { content: '”'; }
-.persona-replay {
-  flex-shrink: 0; cursor: pointer; margin-top: 4px;
-  font-size: 13px !important; color: var(--p-paper) !important; background: var(--p-transmit) !important;
-  border: 1.5px solid var(--p-transmit) !important; border-radius: 0 !important; padding: 6px 11px !important; line-height: 1;
 }
-.persona-replay:hover { background: var(--p-ink) !important; border-color: var(--p-ink) !important; }
-.persona-actions { display: flex; flex-wrap: wrap; gap: 10px; margin-top: 14px; }
 /* Click-to-edit fields (name / about / quote / voice) — auto-saved on blur. */
 .persona-edit { cursor: text; border-radius: 0; outline: none; transition: background .12s; }

   font-size: 17px; line-height: 1.6; max-width: 60ch; color: var(--p-ink);
   white-space: pre-wrap;
 }
+/* ── Section headers (line + red heading, action on the right) ─────────────── */
+.persona-sec {
+  display: flex; align-items: center; justify-content: space-between; gap: 10px;
+  margin-top: 20px; padding-top: 9px; border-top: 1px solid var(--p-ink);
+}
+.persona-sec-title {
+  font-family: var(--p-mono); font-size: 11px; font-weight: 500; letter-spacing: .2em;
+  text-transform: uppercase; color: var(--p-transmit);
 }
 .persona-voice-desc {
   font-family: var(--p-mono); font-size: 12px; line-height: 1.5; color: var(--p-muted);
+  max-width: 60ch; margin-top: 8px; font-style: italic;
 }
 .persona-quote {
+  margin: 8px 0 0; padding: 4px 0 4px 16px; border-left: 3px solid var(--p-transmit);
   font-family: 'Fraunces', Georgia, serif; font-size: 21px; font-style: italic;
   line-height: 1.35; color: var(--p-ink); max-width: 54ch;
 }
 .persona-quote:not(:empty)::before { content: '“'; }
 .persona-quote:not(:empty)::after { content: '”'; }
+/* Simple icon buttons on the section headers. */
+.persona-ico {
+  position: relative; cursor: pointer; flex-shrink: 0; line-height: 1;
+  font-size: 13px !important; color: var(--p-ink) !important; background: var(--p-card) !important;
+  border: 1.5px solid var(--p-ink) !important; border-radius: 0 !important; padding: 5px 10px !important;
+}
+.persona-ico:hover { background: var(--p-paper-2) !important; }
+.persona-ico.busy { opacity: .55; cursor: default; }
+.persona-play.badged::after { /* "voice changed — tap to refresh" badge */
+  content: ''; position: absolute; top: -4px; right: -4px; width: 9px; height: 9px;
+  background: var(--p-transmit); border: 1.5px solid var(--p-card); border-radius: 50%;
 }
 /* Click-to-edit fields (name / about / quote / voice) — auto-saved on blur. */
 .persona-edit { cursor: text; border-radius: 0; outline: none; transition: background .12s; }

web/tts.js CHANGED Viewed

@@ -30,6 +30,11 @@ export async function createVoiceWav(desc, text) {
   qwen3.setDesc(desc)
   return qwen3.synthWav(text, 'persona')
 }
 export async function playWav(arrayBuffer) {
   const { audio, sampleRate } = await decodeAudio(arrayBuffer)
   return playSamples(audio, sampleRate)

   qwen3.setDesc(desc)
   return qwen3.synthWav(text, 'persona')
 }
+// Clone `text` from a reference voice file (keep timbre, change words). `desc` is the
+// voice design — a fallback so prod (no clone model) can re-design instead. Returns WAV.
+export async function cloneVoiceWav(refArrayBuffer, refText, text, desc) {
+  return qwen3.cloneWav(text, refArrayBuffer, refText, desc)
+}
 export async function playWav(arrayBuffer) {
   const { audio, sampleRate } = await decodeAudio(arrayBuffer)
   return playSamples(audio, sampleRate)

web/ttsQwen3.js CHANGED Viewed

@@ -47,6 +47,24 @@ async function postSynthWav(base, text, voiceId) {
 }
 const postSynth = async (base, text, voiceId) => decodeAudio(await postSynthWav(base, text, voiceId))
 const common = {
   mode: 'pcm', needsDownload: false, networked: true,
   listVoices: () => VOICES, defaultVoice: 'persona',
@@ -62,6 +80,7 @@ export const engine = {
   available: () => true,
   synth: (text, voiceId) => postSynth(ttsBase(), text, voiceId),
   synthWav: (text, voiceId) => postSynthWav(ttsBase(), text, voiceId),
   backendLabel: () => { const b = ttsBase(); try { return b ? '🖥 ' + new URL(b).host : '☁ DashScope' } catch { return '☁ DashScope' } },
 }

 }
 const postSynth = async (base, text, voiceId) => decodeAudio(await postSynthWav(base, text, voiceId))
+// Voice CLONE: synth `text` using a reference WAV (the last created voice) so the timbre
+// stays identical — only the words change. ref is an ArrayBuffer; sent as base64.
+function abToB64(ab) {
+  let s = ''; const u = new Uint8Array(ab); const C = 0x8000
+  for (let i = 0; i < u.length; i += C) s += String.fromCharCode.apply(null, u.subarray(i, i + C))
+  return btoa(s)
+}
+async function postClone(base, text, refAb, refText, instruct) {
+  const resp = await fetch(`${base}/qwen-tts`, {
+    method: 'POST', headers: { 'Content-Type': 'application/json' },
+    // instruct lets prod (DashScope, no clone model) gracefully re-design from the
+    // description instead of cloning; local mode uses ref_audio to clone the timbre.
+    body: JSON.stringify({ text, ref_audio: abToB64(refAb), ref_text: refText || '', instruct: instruct || '', language: 'English' }),
+  })
+  if (!resp.ok) throw new Error(`Qwen3-TTS clone ${resp.status}: ${(await resp.text()).slice(0, 140)}`)
+  return resp.arrayBuffer()
+}
 const common = {
   mode: 'pcm', needsDownload: false, networked: true,
   listVoices: () => VOICES, defaultVoice: 'persona',
   available: () => true,
   synth: (text, voiceId) => postSynth(ttsBase(), text, voiceId),
   synthWav: (text, voiceId) => postSynthWav(ttsBase(), text, voiceId),
+  cloneWav: (text, refAb, refText, instruct) => postClone(ttsBase(), text, refAb, refText, instruct),
   backendLabel: () => { const b = ttsBase(); try { return b ? '🖥 ' + new URL(b).host : '☁ DashScope' } catch { return '☁ DashScope' } },
 }