Spaces:
Running on Zero
Update app.py
app.py CHANGED
@@ -3,18 +3,9 @@ os.environ.setdefault("OMP_NUM_THREADS", "1")
 os.environ.setdefault("MKL_NUM_THREADS", "1")
 os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")

-import sys
-import re
-import time
-import json
-import base64
-import hashlib
-import tempfile
-import subprocess
-import inspect
 from typing import Iterator, Iterable, Optional, Tuple, Any, List
 from dataclasses import dataclass
-import pathlib

 import spaces
 import gradio as gr
@@ -23,7 +14,7 @@ import numpy as np
 from huggingface_hub import hf_hub_download
 from scipy.io.wavfile import write

-# ----------
 REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
 REPO_DIR = "coqui-ai-TTS"
 if not os.path.exists(REPO_DIR):
@@ -36,10 +27,9 @@ from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence

-# ---------- model files ----------
 repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
-model_dir = "./model"
-os.makedirs(model_dir, exist_ok=True)
 for fname in ("model.pth", "config.json", "vocab.json", "voice.wav"):
     if not os.path.exists(os.path.join(model_dir, fname)):
         hf_hub_download(repo_id, filename=fname, local_dir=model_dir)
@@ -48,7 +38,7 @@ config_file = os.path.join(model_dir, "config.json")
 vocab_file = os.path.join(model_dir, "vocab.json")
 default_voice_file = os.path.join(model_dir, "voice.wav")

-# ---------- load
 config = XttsConfig(); config.load_json(config_file)
 XTTS_MODEL = Xtts.init_from_config(config)
 XTTS_MODEL.load_checkpoint(config, checkpoint_path=checkpoint_file, vocab_path=vocab_file, use_deepspeed=False)
@@ -60,35 +50,36 @@ if device.startswith("cuda"):
     torch.backends.cudnn.allow_tf32 = True
     torch.backends.cudnn.benchmark = True
     torch.set_float32_matmul_precision("high")
 XTTS_MODEL.to(device).eval()
 sampling_rate = int(XTTS_MODEL.config.audio["sample_rate"])

 tokenizer = VoiceBpeTokenizer(vocab_file=vocab_file)
 XTTS_MODEL.tokenizer = tokenizer

-# ---------- defaults ----------
-DEF_MIN_BUFFER_S
-DEF_FIRST_CHUNK_S
-DEF_TOKENS_PER_STEP
-DEF_ENABLE_TEXT_SPLIT
 DEF_FIRST_SEGMENT_LIMIT = 160
 FADE_S = 0.004

-DEF_CLIENT_PREROLL
-DEF_CLIENT_LOWWM
-MAX_CLIENT_PREROLL
-STEP_CLIENT_PREROLL

-# ---------- audio utils ----------
-def _seconds_to_samples(sec: float, sr: int) -> int:
-    return max(1, int(sec * sr))

 def _to_np_audio(x) -> np.ndarray:
     if isinstance(x, dict) and "wav" in x: x = x["wav"]
     if isinstance(x, torch.Tensor):
         if x.dtype != torch.float32: x = x.float()
         return x.detach().cpu().contiguous().view(-1).numpy()
-    x = np.asarray(x);
     return x.astype(np.float32, copy=False) if x.dtype != np.float32 else x

 def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> np.ndarray:
@@ -98,7 +89,7 @@ def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> np.ndarray:
     fade_n = min(_seconds_to_samples(fade_s, sr), a.size, b.size)
     if fade_n <= 1: return np.concatenate([a, b], axis=0)
     fade_out = np.linspace(1.0, 0.0, fade_n, dtype=np.float32); fade_in = 1.0 - fade_out
-    head = a[:-fade_n]; tail =
     return np.concatenate([head, tail, rest], axis=0)

 def _bpe_prefixes(text: str, lang: str, step_tokens: int):
@@ -108,8 +99,7 @@ def _bpe_prefixes(text: str, lang: str, step_tokens: int):
         if n % step_tokens != 0: yield tokenizer.decode(ids, lang=lang); return
     except Exception: pass
     pseudo = re.findall(r"\S+|\s+", text); acc = ""
-    for i in range(0, len(pseudo), step_tokens):
-        acc = "".join(pseudo[: i + step_tokens]); yield acc
     if acc.strip() != text.strip(): yield text

 def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent, speaker_embedding, **gen_kwargs):
@@ -164,11 +154,14 @@ def init_stream_support():
     Xtts.sample_stream = NewTTSGenerationMixin.sample_stream
 init_stream_support()

-# ---------- latents cache ----------
-PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
 @dataclass(frozen=True)
 class LatentsMeta:
     model_id: str; gpt_cond_len: int; max_ref_len: int; sound_norm_refs: bool; xtts_git: str | None = None
 LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
 GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}

@@ -178,11 +171,13 @@ def _latents_key(path: str | None, meta: LatentsMeta) -> str:
     return hashlib.md5((base + "|" + meta_str).encode("utf-8")).hexdigest()

 def _latents_disk_path(key: str) -> pathlib.Path: return PERSIST_LATENTS_DIR / f"{key}.pt"
 def _save_latents_to_disk(key: str, gpt, spk): torch.save({"gpt_cond_latent": gpt.cpu(), "speaker_embedding": spk.cpu()}, _latents_disk_path(key))
 def _load_latents_from_disk(key: str):
-    p=_latents_disk_path(key)
     if not p.exists(): return None
-    obj=torch.load(p, map_location="cpu"); return obj["gpt_cond_latent"], obj["speaker_embedding"]

 def _compute_latents_cpu(path: str | None):
     with torch.inference_mode():
@@ -208,7 +203,7 @@ def _latents_for(path: str | None, *, to_device: Optional[str] = None):
 try: _ = _latents_for(default_voice_file)
 except Exception as e: print(f"[warn] precompute default voice latents failed: {e}")

-# ---------- stream packing ----------
 def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
     if not chunks: return np.zeros((0,), dtype=np.float32)
     out = chunks[0]
@@ -228,7 +223,7 @@ def _pcm_f32_to_b64(x: np.ndarray) -> str:
     if x.dtype != np.float32: x = x.astype(np.float32, copy=False)
     return base64.b64encode(x.tobytes()).decode("ascii")

-# ----------
 _SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
 _WS = re.compile(r"\s+")
 def _fast_split(text: str, limit: int) -> List[str]:
@@ -272,17 +267,20 @@ def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int, first_seg
     except Exception: pass
     return parts + (rest or [text_for_rest])

-# ---------- TTS endpoint ----------
 @spaces.GPU(duration=60)
-def text_to_speech(
-
-
-
-
-
     t0 = time.perf_counter()
     if not belarusian_story or str(belarusian_story).strip() == "":
         raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
     if not speaker_audio_file or (not isinstance(speaker_audio_file, str) and getattr(speaker_audio_file, "name", "") == ""):
         speaker_audio_file = default_voice_file

@@ -313,13 +311,16 @@ def text_to_speech(belarusian_story, speaker_audio_file=None,
     yield ("", None, None, json.dumps(server_metrics))

     full_audio_chunks=[]; first_chunk_seen=False; t_gen0=time.perf_counter()
     for part in texts:
         gen = XTTS_MODEL.generate(
             text=part, do_stream=True, language=lang_short,
             gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
-            min_buffer_s=float(first_chunk_s),
             stream_chunk_size_s=float(first_chunk_s),
-            temperature=0.1, length_penalty=1.0, repetition_penalty=10.0,
         )
         for buf in _chunker(gen, sampling_rate, float(min_buffer_s)):
             if not first_chunk_seen:
@@ -349,7 +350,7 @@ def text_to_speech(belarusian_story, speaker_audio_file=None,

     yield ("__STOP__", tmp.name, tmp.name, json.dumps(server_metrics))

-# ---------- UI ----------
 examples=[["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", "Nestarka.wav"]]

 with gr.Blocks() as demo:
@@ -364,7 +365,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         ui_preroll = gr.Slider(0.08, 0.40, value=DEF_CLIENT_PREROLL, step=0.01,
                                label="PREROLL (сек.)", elem_id="preroll_slider", interactive=True)
-        ui_lowwm = gr.Slider(0.02, 0.15, value=DEF_CLIENT_LOWWM,
                              label="Ніжні ўзровень (сек.)", elem_id="lowwm_slider", interactive=True)
     with gr.Row():
         apply_btn = gr.Button("Прымяніць налады прайгравальніка")
@@ -383,15 +384,7 @@ with gr.Blocks() as demo:
         play_btn = gr.Button("▶️ Play (stream)")
         stop_btn = gr.Button("⏹ Stop (stream)")
         run_btn = gr.Button("Згенераваць")
-    gr.Markdown(f"**
-
-    # невялікі CSS на ўсякі выпадак
-    gr.HTML("""
-    <style>
-    #preroll_slider input[type="range"],
-    #lowwm_slider input[type="range"] { pointer-events:auto !important; cursor: default !important; }
-    </style>
-    """)

     log_panel = gr.HTML(value='<div id="wa-log" style="font-family:system-ui;font-size:12px;white-space:pre-line">[лог пусты]</div>',
                         label="Лагі плэера")
@@ -403,7 +396,7 @@ with gr.Blocks() as demo:
     final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
     play_final_btn = gr.Button("▶️ Play Final")

-    # ---- AudioWorklet processor
     AUDIO_WORKLET_PROCESSOR = r"""
 class StreamBufferProcessor extends AudioWorkletProcessor {
   constructor() {
@@ -415,6 +408,7 @@ class StreamBufferProcessor extends AudioWorkletProcessor {
     this.thresholdSamples = 0;
     this.lowWatermarkSamples = 0;
     this.underrunSent = false;
     this.port.onmessage = (e) => {
       const msg = e.data || {};
       if (msg.type === 'push') {
@@ -426,12 +420,19 @@ class StreamBufferProcessor extends AudioWorkletProcessor {
       } else if (msg.type === 'set_thresholds') {
         this.thresholdSamples = msg.thresholdSamples|0;
         this.lowWatermarkSamples = msg.lowWatermarkSamples|0;
       }
     };
   }
   process(inputs, outputs, parameters) {
     const out = outputs[0][0];
     let i = 0;
     if (!this.started) {
       if (this.bufferedSamples >= this.thresholdSamples) {
         this.started = true;
@@ -441,6 +442,7 @@ class StreamBufferProcessor extends AudioWorkletProcessor {
         return true;
       }
     }
     while (i < out.length) {
       if (this.queue.length === 0) {
         if (!this.underrunSent) { this.underrunSent = true; this.port.postMessage({ type:'underrun' }); }
@@ -466,16 +468,15 @@ registerProcessor('stream-buffer', StreamBufferProcessor);
 registerProcessor('stream-buffer', StreamBufferProcessor);
 """

-# ---- INIT + player ----
 INIT_RESET_AND_PLAY_JS = f"""
 () => {{
-  const sampleRate = {sampling_rate};
   const AC = window.AudioContext || window.webkitAudioContext;
   if (!AC) return;

-  function getLocalFloat(key,
     try {{ const v = parseFloat(localStorage.getItem(key)); if (isFinite(v) && v > 0) return v; }} catch(e) {{}}
-    return
   }}

   const DEFAULT_PREROLL = {DEF_CLIENT_PREROLL};
@@ -486,132 +487,155 @@ registerProcessor('stream-buffer', StreamBufferProcessor);
   let PREROLL_S = getLocalFloat("tts_preroll_s", DEFAULT_PREROLL);
   let LOW_WM_S = getLocalFloat("tts_lowwm_s", DEFAULT_LOWWM);

   function toSec(ms) {{ return (ms/1000); }}
   function logUpdate() {{
-    const el = document.getElementById('wa-log');
-
-    const m = window.__wa.meta;
     const lines = [];
     lines.push("Клік (Згенераваць): 0.000 s");
-    if (
-
-
-
-      lines.push("
-      lines.push("Затрымка (чанк→аўдыя): " + ((m.t_first_audio_ms - m.t_first_push_ms)/1000).toFixed(3) + " s");
       }}
     }}
-
-
-    lines.push("
-    lines.push("
-    lines.push("
-    lines.push("
-    lines.push("
-    lines.push("
-
-
-      const est = Math.max(0, m.click_to_first_chunk_s - s.until_first_chunk_total_s);
       lines.push(""); lines.push("Ацэнка чаргі ZeroGPU + сеткі: " + est.toFixed(3) + " s");
     }}
-    lines.push("");
     lines.push("PREROLL: " + PREROLL_S.toFixed(3) + " s | LOW WM: " + LOW_WM_S.toFixed(3) + " s");
     el.textContent = lines.join("\\n");
   }}

-
-  const blob = new Blob([`{AUDIO_WORKLET_PROCESSOR}`], {{ type: 'application/javascript' }});
-  const url = URL.createObjectURL(blob);
-
-  const meta = {{ t_click_ms: performance.now(), t_first_push_ms: null, t_first_audio_ms: null, server: null, click_to_first_chunk_s: null }};
-
-  let workletNode = null, gate = null, connected = false, queuedSamples = 0, underrunSeen = false;
-
-  async function setup() {{
     await ctx.audioWorklet.addModule(url);
     workletNode = new AudioWorkletNode(ctx, 'stream-buffer');
     gate = ctx.createGain(); gate.gain.value = 1.0;
     workletNode.connect(gate);

-    workletNode.port.postMessage({{
-      type: 'set_thresholds',
-      thresholdSamples: Math.floor(PREROLL_S * sampleRate),
-      lowWatermarkSamples: Math.floor(LOW_WM_S * sampleRate),
-    }});
-
     workletNode.port.onmessage = (e) => {{
       const msg = e.data || {{}};
-      if (msg.type === '
         if (meta.t_first_audio_ms === null) {{ meta.t_first_audio_ms = performance.now(); logUpdate(); }}
-      }} else if (msg.type === 'underrun') {{
     }};

     window.__wa = {{
       ctx, workletNode, gate,
       get playing() {{ return connected; }},
-      get eos() {{ return false; }},
-      set eos(v) {{}},
       meta,
       push: async (f32) => {{
         try {{ await ctx.resume(); }} catch(e) {{}}
-
-
-        if (!
-
       }},
-      stop: () => {{ if (connected) {{ try {{ gate.disconnect(); }} catch(e) {{}} connected
       reset: () => {{
-        try {{
-
         if (workletNode) {{
-          workletNode.port.postMessage({{ type:
-
         }}
-        if (connected) {{ try {{ gate.disconnect(); }} catch(e) {{}} connected
         meta.t_first_push_ms = null; meta.t_first_audio_ms = null; meta.click_to_first_chunk_s = null; logUpdate();
       }},
       updateLog: logUpdate,
     }};
-
-
 }}
 """.replace("{AUDIO_WORKLET_PROCESSOR}", AUDIO_WORKLET_PROCESSOR)

 STOP_JS = "() => { if (window.__wa) window.__wa.stop(); }"
 PLAY_JS = "() => { if (window.__wa) { try { window.__wa.ctx.resume(); } catch(e){}; if (!window.__wa.playing) { try { window.__wa.gate.connect(window.__wa.ctx.destination); } catch(e){} } window.__wa.updateLog && window.__wa.updateLog(); } }"

-# ---- Apply/Reset client settings (live) ----
 APPLY_JS = """
 () => {
-  const
-  const
-  const p = pWrap ? pWrap.querySelector('input[type="range"]') : null;
-  const l = lWrap ? lWrap.querySelector('input[type="range"]') : null;
   const pr = p && p.value ? parseFloat(p.value) : 0.18;
   const lw = l && l.value ? parseFloat(l.value) : 0.06;
-
-  if (window.__wa && window.__wa.workletNode && window.__wa.ctx) {
-    const sr = window.__wa.ctx.sampleRate || 24000;
-    window.__wa.workletNode.port.postMessage({ type:'set_thresholds', thresholdSamples: Math.floor(pr*sr), lowWatermarkSamples: Math.floor(lw*sr) });
-    window.__wa.updateLog && window.__wa.updateLog();
-  }
 }
 """
-RESET_JS = "(() => { try { localStorage.removeItem('tts_preroll_s'); localStorage.removeItem('tts_lowwm_s'); } catch(e) {} })()"

-
-ENABLE_SLIDERS_JS = """
-() => {
-  ['preroll_slider','lowwm_slider'].forEach(id => {
-    const wrap = document.getElementById(id);
-    if (!wrap) return;
-    const inp = wrap.querySelector('input[type="range"]');
-    if (inp) { inp.disabled = false; inp.removeAttribute('readonly'); inp.style.pointerEvents='auto'; inp.style.cursor='default'; }
-  });
-}
-"""

-# ---- streaming
 PUSH_JS = """
 (b64) => {
   if (!window.__wa || !b64) return;
@@ -629,28 +653,23 @@ registerProcessor('stream-buffer', StreamBufferProcessor);
   try { if (js) { const obj = JSON.parse(js); window.__wa.meta.server = obj; window.__wa.updateLog && window.__wa.updateLog(); } } catch (e) {}
 }
 """
-PLAY_FINAL_JS = ""
-() => { const host = document.getElementById('final-audio'); if (!host) return; const audio = host.querySelector('audio'); if (audio) { try { audio.play(); } catch(e) {} } }
-"""

-#
 apply_btn.click(fn=None, inputs=[], outputs=[], js=APPLY_JS)
 reset_btn.click(fn=None, inputs=[], outputs=[], js=RESET_JS)
-
 play_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_JS)
 stop_btn.click(fn=None, inputs=[], outputs=[], js=STOP_JS)

 run_btn.click(fn=None, inputs=[], outputs=[], js=INIT_RESET_AND_PLAY_JS)
-run_btn.click(fn=text_to_speech,
               outputs=[stream_pipe, final_file, final_audio, log_pipe])

 stream_pipe.change(fn=None, inputs=[stream_pipe], outputs=[], js=PUSH_JS)
 log_pipe.change(fn=None, inputs=[log_pipe], outputs=[], js=LOG_JS)
 play_final_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_FINAL_JS)

-# <<< enable sliders right after app loads >>>
-demo.load(fn=None, inputs=None, outputs=None, js=ENABLE_SLIDERS_JS)
-
 gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)

 if __name__ == "__main__":
 os.environ.setdefault("MKL_NUM_THREADS", "1")
 os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")

+import sys, re, time, json, base64, hashlib, tempfile, subprocess, inspect, pathlib
 from typing import Iterator, Iterable, Optional, Tuple, Any, List
 from dataclasses import dataclass

 import spaces
 import gradio as gr

 from huggingface_hub import hf_hub_download
 from scipy.io.wavfile import write

+# ----------------- clone fork -----------------
 REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
 REPO_DIR = "coqui-ai-TTS"
 if not os.path.exists(REPO_DIR):

 from TTS.tts.models.xtts import Xtts
 from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence

+# ----------------- model files ----------------
 repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
+model_dir = "./model"; os.makedirs(model_dir, exist_ok=True)
 for fname in ("model.pth", "config.json", "vocab.json", "voice.wav"):
     if not os.path.exists(os.path.join(model_dir, fname)):
         hf_hub_download(repo_id, filename=fname, local_dir=model_dir)

 vocab_file = os.path.join(model_dir, "vocab.json")
 default_voice_file = os.path.join(model_dir, "voice.wav")

+# ----------------- load XTTS ------------------
 config = XttsConfig(); config.load_json(config_file)
 XTTS_MODEL = Xtts.init_from_config(config)
 XTTS_MODEL.load_checkpoint(config, checkpoint_path=checkpoint_file, vocab_path=vocab_file, use_deepspeed=False)

     torch.backends.cudnn.allow_tf32 = True
     torch.backends.cudnn.benchmark = True
     torch.set_float32_matmul_precision("high")
+
 XTTS_MODEL.to(device).eval()
 sampling_rate = int(XTTS_MODEL.config.audio["sample_rate"])

 tokenizer = VoiceBpeTokenizer(vocab_file=vocab_file)
 XTTS_MODEL.tokenizer = tokenizer

+# ----------------- defaults -------------------
+DEF_MIN_BUFFER_S = 0.06
+DEF_FIRST_CHUNK_S = 0.03
+DEF_TOKENS_PER_STEP = 1
+DEF_ENABLE_TEXT_SPLIT = True
 DEF_FIRST_SEGMENT_LIMIT = 160
 FADE_S = 0.004

+DEF_CLIENT_PREROLL = 0.18
+DEF_CLIENT_LOWWM = 0.06
+MAX_CLIENT_PREROLL = 0.40
+STEP_CLIENT_PREROLL = 0.04

+# ----------------- audio utils ----------------
+def _seconds_to_samples(sec: float, sr: int) -> int: return max(1, int(sec * sr))
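These client defaults map directly onto the sample counts the player works with. A standalone sketch of that conversion, assuming the common XTTS v2 rate of 24 kHz (the app itself reads the real value from `XTTS_MODEL.config.audio["sample_rate"]`):

```python
# Sketch only: same helper as above, repeated so the snippet runs on its own.
def _seconds_to_samples(sec: float, sr: int) -> int:
    return max(1, int(sec * sr))

sr = 24000  # assumed sample rate; the app uses the model config value
preroll_samples = _seconds_to_samples(0.18, sr)  # DEF_CLIENT_PREROLL, ~0.18 s ≈ 4320 samples
low_wm_samples = _seconds_to_samples(0.06, sr)   # DEF_CLIENT_LOWWM,   ~0.06 s ≈ 1440 samples
print(preroll_samples, low_wm_samples)
```

The browser side does the equivalent computation with `Math.floor(PREROLL_S * ctx.sampleRate)` before handing the thresholds to the AudioWorklet.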
 def _to_np_audio(x) -> np.ndarray:
     if isinstance(x, dict) and "wav" in x: x = x["wav"]
     if isinstance(x, torch.Tensor):
         if x.dtype != torch.float32: x = x.float()
         return x.detach().cpu().contiguous().view(-1).numpy()
+    x = np.asarray(x);
+    if x.ndim > 1: x = x.reshape(-1)
     return x.astype(np.float32, copy=False) if x.dtype != np.float32 else x

 def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> np.ndarray:

     fade_n = min(_seconds_to_samples(fade_s, sr), a.size, b.size)
     if fade_n <= 1: return np.concatenate([a, b], axis=0)
     fade_out = np.linspace(1.0, 0.0, fade_n, dtype=np.float32); fade_in = 1.0 - fade_out
+    head = a[:-fade_n]; tail = a[-fade_n:] * fade_out + b[:fade_n] * fade_in; rest = b[fade_n:]
     return np.concatenate([head, tail, rest], axis=0)
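The crossfade above overlaps the tail of one chunk with the head of the next over `fade_s` seconds, using a linear fade-out/fade-in. A small standalone check of that join, assuming a 24 kHz stream and the `FADE_S = 0.004` default:

```python
import numpy as np

def _seconds_to_samples(sec, sr): return max(1, int(sec * sr))

def _crossfade_concat(a, b, sr, fade_s):
    # same linear blend as in app.py
    fade_n = min(_seconds_to_samples(fade_s, sr), a.size, b.size)
    if fade_n <= 1: return np.concatenate([a, b])
    fade_out = np.linspace(1.0, 0.0, fade_n, dtype=np.float32); fade_in = 1.0 - fade_out
    head = a[:-fade_n]; tail = a[-fade_n:] * fade_out + b[:fade_n] * fade_in; rest = b[fade_n:]
    return np.concatenate([head, tail, rest])

sr = 24000                                   # assumed sample rate
a = np.full(2400, 0.5, dtype=np.float32)     # two 100 ms chunks
b = np.full(2400, -0.5, dtype=np.float32)
out = _crossfade_concat(a, b, sr, fade_s=0.004)
print(out.size)  # 2400 + 2400 - 96: the 4 ms overlap (96 samples) is shared
```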
 def _bpe_prefixes(text: str, lang: str, step_tokens: int):

         if n % step_tokens != 0: yield tokenizer.decode(ids, lang=lang); return
     except Exception: pass
     pseudo = re.findall(r"\S+|\s+", text); acc = ""
+    for i in range(0, len(pseudo), step_tokens): acc = "".join(pseudo[: i + step_tokens]); yield acc
     if acc.strip() != text.strip(): yield text

 def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent, speaker_embedding, **gen_kwargs):

     Xtts.sample_stream = NewTTSGenerationMixin.sample_stream
 init_stream_support()

+# ----------------- latents cache ---------------
+PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
+PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
+
 @dataclass(frozen=True)
 class LatentsMeta:
     model_id: str; gpt_cond_len: int; max_ref_len: int; sound_norm_refs: bool; xtts_git: str | None = None
+
 LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
 GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}

     return hashlib.md5((base + "|" + meta_str).encode("utf-8")).hexdigest()

 def _latents_disk_path(key: str) -> pathlib.Path: return PERSIST_LATENTS_DIR / f"{key}.pt"
+
 def _save_latents_to_disk(key: str, gpt, spk): torch.save({"gpt_cond_latent": gpt.cpu(), "speaker_embedding": spk.cpu()}, _latents_disk_path(key))
+
 def _load_latents_from_disk(key: str):
+    p = _latents_disk_path(key)
     if not p.exists(): return None
+    obj = torch.load(p, map_location="cpu"); return obj["gpt_cond_latent"], obj["speaker_embedding"]
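The disk cache stores one `.pt` file per md5 key produced by `_latents_key` from the reference-audio path plus `LatentsMeta`. A minimal sketch of the save/load round trip, using a made-up key and dummy tensor shapes (the real shapes come from `XTTS_MODEL.get_conditioning_latents`):

```python
import pathlib
import torch

PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)

def _latents_disk_path(key): return PERSIST_LATENTS_DIR / f"{key}.pt"

key = "0123456789abcdef0123456789abcdef"  # hypothetical md5-style key for illustration
gpt = torch.zeros(1, 32, 1024)            # stand-in for gpt_cond_latent
spk = torch.zeros(1, 512, 1)              # stand-in for speaker_embedding

# save exactly as _save_latents_to_disk does
torch.save({"gpt_cond_latent": gpt.cpu(), "speaker_embedding": spk.cpu()}, _latents_disk_path(key))

# load exactly as _load_latents_from_disk does
obj = torch.load(_latents_disk_path(key), map_location="cpu")
print(obj["gpt_cond_latent"].shape, obj["speaker_embedding"].shape)
```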
 def _compute_latents_cpu(path: str | None):
     with torch.inference_mode():

 try: _ = _latents_for(default_voice_file)
 except Exception as e: print(f"[warn] precompute default voice latents failed: {e}")

+# ----------------- stream packing --------------
 def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
     if not chunks: return np.zeros((0,), dtype=np.float32)
     out = chunks[0]

     if x.dtype != np.float32: x = x.astype(np.float32, copy=False)
     return base64.b64encode(x.tobytes()).decode("ascii")
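`_pcm_f32_to_b64` is the wire format of the stream: raw float32 samples (native byte order), base64-encoded, pushed to the browser through `stream_pipe`. The decode happens in JavaScript in this app; a Python sketch of the same round trip just to show it is lossless:

```python
import base64
import numpy as np

def _pcm_f32_to_b64(x: np.ndarray) -> str:
    # same packing as app.py
    if x.dtype != np.float32: x = x.astype(np.float32, copy=False)
    return base64.b64encode(x.tobytes()).decode("ascii")

chunk = np.linspace(-1.0, 1.0, 8, dtype=np.float32)
b64 = _pcm_f32_to_b64(chunk)

# counterpart decode (the browser does this with atob + Float32Array)
decoded = np.frombuffer(base64.b64decode(b64), dtype=np.float32)
assert np.array_equal(chunk, decoded)
```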
+# ----------------- split text -----------------
 _SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
 _WS = re.compile(r"\s+")
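`_fast_split` itself is outside this excerpt, but the `_SENT_END` pattern above is what it splits on: sentence terminators (including `…`) plus optional closing quotes/brackets, captured so the delimiter is kept. A quick illustration of that behaviour:

```python
import re

_SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")

text = "Прывітанне! Гэта тэст. Трэці сказ"
parts = _SENT_END.split(text)
print(parts)  # ['Прывітанне', '! ', 'Гэта тэст', '. ', 'Трэці сказ']

# Because the terminator is captured, sentences can be re-assembled intact:
sentences = ["".join(parts[i:i + 2]).strip() for i in range(0, len(parts), 2) if parts[i].strip()]
print(sentences)  # ['Прывітанне!', 'Гэта тэст.', 'Трэці сказ']
```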
 def _fast_split(text: str, limit: int) -> List[str]:

     except Exception: pass
     return parts + (rest or [text_for_rest])

+# ----------------- TTS endpoint ---------------
 @spaces.GPU(duration=60)
+def text_to_speech(
+    belarusian_story, speaker_audio_file=None,
+    min_buffer_s: float = DEF_MIN_BUFFER_S,
+    first_chunk_s: float = DEF_FIRST_CHUNK_S,
+    enable_text_splitting: bool = DEF_ENABLE_TEXT_SPLIT,
+    tokens_per_step: int = DEF_TOKENS_PER_STEP,
+    first_segment_limit: int = DEF_FIRST_SEGMENT_LIMIT,
+):
     t0 = time.perf_counter()
     if not belarusian_story or str(belarusian_story).strip() == "":
         raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
+
     if not speaker_audio_file or (not isinstance(speaker_audio_file, str) and getattr(speaker_audio_file, "name", "") == ""):
         speaker_audio_file = default_voice_file

     yield ("", None, None, json.dumps(server_metrics))

     full_audio_chunks=[]; first_chunk_seen=False; t_gen0=time.perf_counter()
+
     for part in texts:
         gen = XTTS_MODEL.generate(
             text=part, do_stream=True, language=lang_short,
             gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
+            min_buffer_s=float(first_chunk_s),
+            tokens_per_step=int(tokens_per_step),
             stream_chunk_size_s=float(first_chunk_s),
+            temperature=0.1, length_penalty=1.0, repetition_penalty=10.0,
+            top_k=10, top_p=0.3,
         )
         for buf in _chunker(gen, sampling_rate, float(min_buffer_s)):
             if not first_chunk_seen:

     yield ("__STOP__", tmp.name, tmp.name, json.dumps(server_metrics))
+# ----------------- UI ------------------------
 examples=[["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", "Nestarka.wav"]]

 with gr.Blocks() as demo:

     with gr.Row():
         ui_preroll = gr.Slider(0.08, 0.40, value=DEF_CLIENT_PREROLL, step=0.01,
                                label="PREROLL (сек.)", elem_id="preroll_slider", interactive=True)
+        ui_lowwm = gr.Slider(0.02, 0.15, value=DEF_CLIENT_LOWWM, step=0.005,
                              label="Ніжні ўзровень (сек.)", elem_id="lowwm_slider", interactive=True)
     with gr.Row():
         apply_btn = gr.Button("Прымяніць налады прайгравальніка")

         play_btn = gr.Button("▶️ Play (stream)")
         stop_btn = gr.Button("⏹ Stop (stream)")
         run_btn = gr.Button("Згенераваць")
+    gr.Markdown(f"**Model SR:** {sampling_rate} Hz")

     log_panel = gr.HTML(value='<div id="wa-log" style="font-family:system-ui;font-size:12px;white-space:pre-line">[лог пусты]</div>',
                         label="Лагі плэера")

     final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
     play_final_btn = gr.Button("▶️ Play Final")

+    # ---------- AudioWorklet processor (with handshake) ----------
     AUDIO_WORKLET_PROCESSOR = r"""
 class StreamBufferProcessor extends AudioWorkletProcessor {
   constructor() {

     this.thresholdSamples = 0;
     this.lowWatermarkSamples = 0;
     this.underrunSent = false;
+
     this.port.onmessage = (e) => {
       const msg = e.data || {};
       if (msg.type === 'push') {

       } else if (msg.type === 'set_thresholds') {
         this.thresholdSamples = msg.thresholdSamples|0;
         this.lowWatermarkSamples = msg.lowWatermarkSamples|0;
+        // handshake back to main
+        this.port.postMessage({ type: 'thresholds_ready',
+                                thresholdSamples: this.thresholdSamples,
+                                lowWatermarkSamples: this.lowWatermarkSamples,
+                                ctxSR: sampleRate });
       }
     };
   }
+
   process(inputs, outputs, parameters) {
     const out = outputs[0][0];
     let i = 0;
+
     if (!this.started) {
       if (this.bufferedSamples >= this.thresholdSamples) {
         this.started = true;

         return true;
       }
     }
+
     while (i < out.length) {
       if (this.queue.length === 0) {
         if (!this.underrunSent) { this.underrunSent = true; this.port.postMessage({ type:'underrun' }); }

 registerProcessor('stream-buffer', StreamBufferProcessor);
 """

+    # ---------- INIT + player (wait-for-thresholds) ----------
     INIT_RESET_AND_PLAY_JS = f"""
 () => {{

   const AC = window.AudioContext || window.webkitAudioContext;
   if (!AC) return;

+  function getLocalFloat(key, defVal) {{
     try {{ const v = parseFloat(localStorage.getItem(key)); if (isFinite(v) && v > 0) return v; }} catch(e) {{}}
+    return defVal;
   }}

   const DEFAULT_PREROLL = {DEF_CLIENT_PREROLL};

   let PREROLL_S = getLocalFloat("tts_preroll_s", DEFAULT_PREROLL);
   let LOW_WM_S = getLocalFloat("tts_lowwm_s", DEFAULT_LOWWM);

+  const blob = new Blob([`{AUDIO_WORKLET_PROCESSOR}`], {{ type: 'application/javascript' }});
+  const url = URL.createObjectURL(blob);
+
+  const ctx = new AC({{ sampleRate: {sampling_rate} }});
+  const meta = {{
+    t_click_ms: performance.now(), t_first_push_ms: null, t_first_audio_ms: null,
+    server: null, click_to_first_chunk_s: null, ctx_sr: ctx.sampleRate,
+    thresholds: null
+  }};
+
+  let workletNode = null, gate = null, connected = false;
+  let ready = false;        // WAIT for thresholds_ready
+  const pending = [];       // queue chunks before ready
+  let underrunSeen = false;
+
   function toSec(ms) {{ return (ms/1000); }}
+  function p3(x) {{ return (x==null)?'n/a':x.toFixed(3)+' s'; }}
   function logUpdate() {{
+    const el = document.getElementById('wa-log'); if (!el) return;
+    const s = meta.server || {{}};
     const lines = [];
     lines.push("Клік (Згенераваць): 0.000 s");
+    if (meta.t_first_push_ms) {{
+      lines.push("Першы чанк прыйшоў: " + (toSec(meta.t_first_push_ms - meta.t_click_ms)).toFixed(3) + " s");
+      if (meta.t_first_audio_ms) {{
+        lines.push("Пачатак прайгравання: " + (toSec(meta.t_first_audio_ms - meta.t_click_ms)).toFixed(3) + " s");
+        lines.push("Затрымка (чанк→аўдыя): " + (toSec(meta.t_first_audio_ms - meta.t_first_push_ms)).toFixed(3) + " s");
       }}
     }}
+    lines.push("");
+    lines.push("— Серверныя метрыкі —");
+    lines.push("Latents (умоўны голас): " + p3(s.latents_s));
+    lines.push("Падзел тэксту: " + p3(s.text_split_s));
+    lines.push("Ініт→1-ы чанк: " + p3(s.gen_init_to_first_chunk_s));
+    lines.push("Усё да 1-га чанка: " + p3(s.until_first_chunk_total_s));
+    lines.push("Іншая серверная апрац.: " + p3(s.server_unaccounted_before_first_chunk_s));
+    lines.push("Запіс WAV: " + p3(s.file_write_s));
+    if (meta.click_to_first_chunk_s !== null && s.until_first_chunk_total_s !== null) {{
+      const est = Math.max(0, meta.click_to_first_chunk_s - s.until_first_chunk_total_s);
       lines.push(""); lines.push("Ацэнка чаргі ZeroGPU + сеткі: " + est.toFixed(3) + " s");
     }}
+    lines.push("");
+    lines.push("Статус стриму: " + (connected ? "playing" : "stopped"));
     lines.push("PREROLL: " + PREROLL_S.toFixed(3) + " s | LOW WM: " + LOW_WM_S.toFixed(3) + " s");
+    lines.push("ctx.sampleRate: " + meta.ctx_sr + " Hz");
+    if (meta.thresholds) {{
+      lines.push("thresholdSamples: " + meta.thresholds.thresholdSamples + " | lowWM: " + meta.thresholds.lowWatermarkSamples);
+    }}
     el.textContent = lines.join("\\n");
   }}

+  (async () => {{
     await ctx.audioWorklet.addModule(url);
     workletNode = new AudioWorkletNode(ctx, 'stream-buffer');
     gate = ctx.createGain(); gate.gain.value = 1.0;
     workletNode.connect(gate);

     workletNode.port.onmessage = (e) => {{
       const msg = e.data || {{}};
+      if (msg.type === 'thresholds_ready') {{
+        ready = true; meta.thresholds = {{ thresholdSamples: msg.thresholdSamples, lowWatermarkSamples: msg.lowWatermarkSamples }};
+        // flush pending
+        for (const f32 of pending) {{
+          workletNode.port.postMessage({{ type:'push', buffer:f32.buffer }}, [f32.buffer]);
+        }}
+        pending.length = 0;
+        logUpdate();
+      }} else if (msg.type === 'first_audio') {{
         if (meta.t_first_audio_ms === null) {{ meta.t_first_audio_ms = performance.now(); logUpdate(); }}
+      }} else if (msg.type === 'underrun') {{
+        underrunSeen = true;
+      }}
     }};

+    // send thresholds using **ctx.sampleRate**
+    workletNode.port.postMessage({{
+      type: 'set_thresholds',
+      thresholdSamples: Math.floor(PREROLL_S * ctx.sampleRate),
+      lowWatermarkSamples: Math.floor(LOW_WM_S * ctx.sampleRate),
+    }});
+
     window.__wa = {{
       ctx, workletNode, gate,
       get playing() {{ return connected; }},
       meta,
       push: async (f32) => {{
         try {{ await ctx.resume(); }} catch(e) {{}}
+        if (!meta.t_first_push_ms) {{ meta.t_first_push_ms = performance.now(); meta.click_to_first_chunk_s = (meta.t_first_push_ms - meta.t_click_ms)/1000; }}
+        // if thresholds not ready yet — buffer locally
+        if (!ready) {{ pending.push(f32); }}
+        else {{ workletNode.port.postMessage({{ type:'push', buffer:f32.buffer }}, [f32.buffer]); }}
+        if (!connected) {{ try {{ gate.connect(ctx.destination); connected = true; }} catch(e) {{}} }}
+        logUpdate();
       }},
+      stop: () => {{ if (connected) {{ try {{ gate.disconnect(); }} catch(e) {{}} connected=false; logUpdate(); }} }},
       reset: () => {{
+        try {{
+          if (underrunSeen) {{
+            const cur = Math.min({MAX_CLIENT_PREROLL}, PREROLL_S + {STEP_CLIENT_PREROLL});
+            localStorage.setItem("tts_preroll_s", String(cur));
+          }}
+        }} catch(e) {{}}
         if (workletNode) {{
+          workletNode.port.postMessage({{ type:'reset' }});
+          ready = false; pending.length = 0;
+          workletNode.port.postMessage({{
+            type:'set_thresholds',
+            thresholdSamples: Math.floor(PREROLL_S * ctx.sampleRate),
+            lowWatermarkSamples: Math.floor(LOW_WM_S * ctx.sampleRate),
+          }});
         }}
+        if (connected) {{ try {{ gate.disconnect(); }} catch(e) {{}} connected=false; }}
         meta.t_first_push_ms = null; meta.t_first_audio_ms = null; meta.click_to_first_chunk_s = null; logUpdate();
       }},
       updateLog: logUpdate,
+      applyClient: (pr, lw) => {{
+        PREROLL_S = pr; LOW_WM_S = lw;
+        try {{ localStorage.setItem("tts_preroll_s", String(pr)); localStorage.setItem("tts_lowwm_s", String(lw)); }} catch(e) {{}}
+        if (workletNode) {{
+          workletNode.port.postMessage({{
+            type:'set_thresholds',
+            thresholdSamples: Math.floor(PREROLL_S * ctx.sampleRate),
+            lowWatermarkSamples: Math.floor(LOW_WM_S * ctx.sampleRate),
+          }});
+        }}
+        logUpdate();
+      }}
     }};
+    logUpdate();
+  }} )();
 }}
 """.replace("{AUDIO_WORKLET_PROCESSOR}", AUDIO_WORKLET_PROCESSOR)

 STOP_JS = "() => { if (window.__wa) window.__wa.stop(); }"
 PLAY_JS = "() => { if (window.__wa) { try { window.__wa.ctx.resume(); } catch(e){}; if (!window.__wa.playing) { try { window.__wa.gate.connect(window.__wa.ctx.destination); } catch(e){} } window.__wa.updateLog && window.__wa.updateLog(); } }"

 APPLY_JS = """
 () => {
+  const p = document.getElementById('preroll_slider')?.querySelector('input[type="range"]');
+  const l = document.getElementById('lowwm_slider')?.querySelector('input[type="range"]');
   const pr = p && p.value ? parseFloat(p.value) : 0.18;
   const lw = l && l.value ? parseFloat(l.value) : 0.06;
+  if (window.__wa && window.__wa.applyClient) { window.__wa.applyClient(pr, lw); }
 }
 """

+RESET_JS = "(() => { try { localStorage.removeItem('tts_preroll_s'); localStorage.removeItem('tts_lowwm_s'); } catch(e) {} })()"

+# -------- streaming + logs --------
 PUSH_JS = """
 (b64) => {
   if (!window.__wa || !b64) return;

   try { if (js) { const obj = JSON.parse(js); window.__wa.meta.server = obj; window.__wa.updateLog && window.__wa.updateLog(); } } catch (e) {}
 }
 """
+PLAY_FINAL_JS = "(() => { const el=document.getElementById('final-audio'); const a=el?.querySelector('audio'); if (a) { try{a.play();}catch(e){} } })()"

+# wiring
 apply_btn.click(fn=None, inputs=[], outputs=[], js=APPLY_JS)
 reset_btn.click(fn=None, inputs=[], outputs=[], js=RESET_JS)
 play_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_JS)
 stop_btn.click(fn=None, inputs=[], outputs=[], js=STOP_JS)

 run_btn.click(fn=None, inputs=[], outputs=[], js=INIT_RESET_AND_PLAY_JS)
+run_btn.click(fn=text_to_speech,
+              inputs=[inp_text, inp_voice, ui_minbuf, ui_firstch, ui_split, ui_tokens, ui_firstseg],
               outputs=[stream_pipe, final_file, final_audio, log_pipe])

 stream_pipe.change(fn=None, inputs=[stream_pipe], outputs=[], js=PUSH_JS)
 log_pipe.change(fn=None, inputs=[log_pipe], outputs=[], js=LOG_JS)
 play_final_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_FINAL_JS)

 gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)

 if __name__ == "__main__":