archivartaunik commited on
Commit
6da2e8a
·
verified ·
1 Parent(s): 3e4a584

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +285 -439
app.py CHANGED
@@ -23,9 +23,9 @@ import numpy as np
23
  from huggingface_hub import hf_hub_download
24
  from scipy.io.wavfile import write
25
 
26
- # =========================================================
27
- # 0) Repo + imports для XTTS (coqui fork)
28
- # =========================================================
29
  REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
30
  REPO_DIR = "coqui-ai-TTS"
31
 
@@ -40,28 +40,29 @@ from TTS.tts.configs.xtts_config import XttsConfig
40
  from TTS.tts.models.xtts import Xtts
41
  from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
42
 
43
- # =========================================================
44
- # 1) Мадэльныя файлы
45
- # =========================================================
46
  repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
47
  model_dir = "./model"
48
  os.makedirs(model_dir, exist_ok=True)
49
 
 
 
 
 
 
50
  for fname in ("model.pth", "config.json", "vocab.json", "voice.wav"):
51
  fpath = os.path.join(model_dir, fname)
52
  if not os.path.exists(fpath):
53
  hf_hub_download(repo_id, filename=fname, local_dir=model_dir)
54
 
55
- checkpoint_file = os.path.join(model_dir, "model.pth")
56
- config_file = os.path.join(model_dir, "config.json")
57
- vocab_file = os.path.join(model_dir, "vocab.json")
58
- default_voice_file = os.path.join(model_dir, "voice.wav")
59
-
60
- # =========================================================
61
- # 2) Загрузка мадэлі + CUDA налады
62
- # =========================================================
63
- config = XttsConfig(); config.load_json(config_file)
64
- XTTS_MODEL: Xtts = Xtts.init_from_config(config)
65
  XTTS_MODEL.load_checkpoint(
66
  config,
67
  checkpoint_path=checkpoint_file,
@@ -81,23 +82,20 @@ if device.startswith("cuda"):
81
  XTTS_MODEL.to(device).eval()
82
  sampling_rate = int(XTTS_MODEL.config.audio["sample_rate"])
83
 
84
- # tokenizer
85
  tokenizer = VoiceBpeTokenizer(vocab_file=vocab_file)
86
  XTTS_MODEL.tokenizer = tokenizer
87
 
88
  # =========================================================
89
- # 3) Канстанты стриму
90
  # =========================================================
91
- MIN_BUFFER_S = 0.020
92
- RUNTIME_FIRST_CHUNK_S = 0.010
93
  FADE_S = 0.004
94
  TOKENS_PER_STEP = 1
95
  ENABLE_TEXT_SPLITTING = True
96
- FIRST_SEGMENT_LIMIT = 120
97
 
98
- # =========================================================
99
- # 4) Аўдыя-ўтыліты
100
- # =========================================================
101
  def _seconds_to_samples(sec: float, sr: int) -> int:
102
  return max(1, int(sec * sr))
103
 
@@ -117,59 +115,18 @@ def _to_np_audio(x) -> np.ndarray:
117
  return x
118
 
119
  def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> np.ndarray:
120
- if a.size == 0:
121
- return b.astype(np.float32, copy=False)
122
- if b.size == 0:
123
- return a.astype(np.float32, copy=False)
124
- a = a.astype(np.float32, copy=False)
125
- b = b.astype(np.float32, copy=False)
126
  fade_n = min(_seconds_to_samples(fade_s, sr), a.size, b.size)
127
- if fade_n <= 1:
128
- return np.concatenate([a, b], axis=0)
129
  fade_out = np.linspace(1.0, 0.0, fade_n, endpoint=True, dtype=np.float32)
130
- fade_in = 1.0 - fade_out
131
  head = a[:-fade_n]
132
  tail = (a[-fade_n:] * fade_out) + (b[:fade_n] * fade_in)
133
  rest = b[fade_n:]
134
  return np.concatenate([head, tail, rest], axis=0)
135
 
136
- def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
137
- if not chunks:
138
- return np.zeros((0,), dtype=np.float32)
139
- out = chunks[0]
140
- for i in range(1, len(chunks)):
141
- out = _crossfade_concat(out, chunks[i], sampling_rate, FADE_S)
142
- return out
143
-
144
- def _chunker(chunks: Iterable[np.ndarray], sr: int, target_s: float) -> Iterable[np.ndarray]:
145
- target_samples = _seconds_to_samples(target_s, sr)
146
- buf = np.zeros((0,), dtype=np.float32)
147
- first = True
148
- for c in chunks:
149
- c = _to_np_audio(c)
150
- if c.size == 0:
151
- continue
152
- if first:
153
- buf = c if buf.size == 0 else np.concatenate([buf, c], axis=0)
154
- first = False
155
- else:
156
- buf = c if buf.size == 0 else _crossfade_concat(buf, c, sr, FADE_S)
157
- if buf.size >= target_samples:
158
- yield buf
159
- buf = np.zeros((0,), dtype=np.float32)
160
- if buf.size:
161
- yield buf
162
-
163
- def _pcm_f32_to_int16_b64(x: np.ndarray) -> str:
164
- if x.dtype != np.float32:
165
- x = x.astype(np.float32, copy=False)
166
- y = np.clip(x, -1.0, 0.9999695)
167
- i16 = (y * 32767.0).astype("<i2", copy=False)
168
- return base64.b64encode(i16.tobytes()).decode("ascii")
169
-
170
- # =========================================================
171
- # 5) BPE-prefix і стрим-генерацыя з fallback
172
- # =========================================================
173
  def _bpe_prefixes(text: str, lang: str, step_tokens: int):
174
  try:
175
  ids = tokenizer.encode(text, lang=lang)
@@ -192,7 +149,7 @@ def _bpe_prefixes(text: str, lang: str, step_tokens: int):
192
  def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any, **gen_kwargs) -> Iterator[np.ndarray]:
193
  sig = inspect.signature(model.inference_stream)
194
  call_kwargs = dict(text=text, language=language, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding)
195
- for k in ("temperature", "length_penalty", "repetition_penalty", "top_k", "top_p", "stream_chunk_size_s"):
196
  if k in gen_kwargs and k in sig.parameters:
197
  call_kwargs[k] = gen_kwargs[k]
198
  autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
@@ -207,76 +164,42 @@ def _fallback_incremental(model: Xtts, text: str, language: str, gpt_cond_latent
207
  autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
208
  with torch.inference_mode(), autocast_ctx:
209
  out = model.inference(
210
- text=prefix,
211
- language=language,
212
- gpt_cond_latent=gpt_cond_latent,
213
- speaker_embedding=speaker_embedding,
214
  temperature=gen_kwargs.get("temperature", 0.1),
215
- length_penalty=1.0,
216
- repetition_penalty=10.0,
217
- top_k=gen_kwargs.get("top_k", 10),
218
- top_p=gen_kwargs.get("top_p", 0.3),
219
  )
220
  wav = _to_np_audio(out)
221
- new_part = wav[emitted:]
222
- emitted = wav.size
223
- if new_part.size:
224
- yield new_part
225
 
226
  class NewTTSGenerationMixin:
227
  @torch.inference_mode()
228
- def generate(
229
- self: Xtts,
230
- text: Optional[str] = None,
231
- *,
232
- do_stream: bool = False,
233
- language: str = "be",
234
- gpt_cond_latent: Any = None,
235
- speaker_embedding: Any = None,
236
- min_buffer_s: float = MIN_BUFFER_S,
237
- tokens_per_step: int = TOKENS_PER_STEP,
238
- **gen_kwargs,
239
- ):
240
  assert isinstance(text, str) and text.strip(), "text is required"
241
  if not do_stream:
242
  autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
243
  with autocast_ctx:
244
  out = self.inference(
245
- text=text,
246
- language=language,
247
- gpt_cond_latent=gpt_cond_latent,
248
- speaker_embedding=speaker_embedding,
249
  temperature=gen_kwargs.get("temperature", 0.1),
250
- length_penalty=1.0,
251
- repetition_penalty=10.0,
252
- top_k=10,
253
- top_p=0.3,
254
  )
255
  return _to_np_audio(out)
256
  return self.sample_stream(
257
- text=text,
258
- language=language,
259
- gpt_cond_latent=gpt_cond_latent,
260
- speaker_embedding=speaker_embedding,
261
- min_buffer_s=min_buffer_s,
262
- tokens_per_step=tokens_per_step,
263
- **gen_kwargs,
264
  )
265
 
266
  @torch.inference_mode()
267
- def sample_stream(
268
- self: Xtts,
269
- *,
270
- text: str,
271
- language: str,
272
- gpt_cond_latent: Any,
273
- speaker_embedding: Any,
274
- min_buffer_s: float = MIN_BUFFER_S,
275
- tokens_per_step: int = TOKENS_PER_STEP,
276
- **gen_kwargs,
277
- ) -> Iterator[np.ndarray]:
278
- local_kwargs = dict(gen_kwargs)
279
- local_kwargs.setdefault("stream_chunk_size_s", float(min_buffer_s))
280
  if hasattr(self, "inference_stream"):
281
  for chunk in _native_stream(self, text, language, gpt_cond_latent, speaker_embedding, **local_kwargs):
282
  yield chunk
@@ -290,9 +213,9 @@ def init_stream_support():
290
 
291
  init_stream_support()
292
 
293
- # =========================================================
294
- # 6) Кэш латэнтаў (CPU/GPU) + дыск
295
- # =========================================================
296
  PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
297
  PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
298
 
@@ -312,16 +235,13 @@ def _latents_key(path: str | None, meta: LatentsMeta) -> str:
312
  base = f"{os.path.abspath(path)}:{os.path.getmtime(path)}:{os.path.getsize(path)}"
313
  else:
314
  base = "default_voice"
315
- meta_str = json.dumps(
316
- {
317
- "model_id": meta.model_id,
318
- "gpt_cond_len": meta.gpt_cond_len,
319
- "max_ref_len": meta.max_ref_len,
320
- "sound_norm_refs": meta.sound_norm_refs,
321
- "xtts_git": meta.xtts_git,
322
- },
323
- sort_keys=True,
324
- )
325
  return hashlib.md5((base + "|" + meta_str).encode("utf-8")).hexdigest()
326
 
327
  def _latents_disk_path(key: str) -> pathlib.Path:
@@ -332,8 +252,7 @@ def _save_latents_to_disk(key: str, gpt_cond_latent: torch.Tensor, speaker_embed
332
 
333
  def _load_latents_from_disk(key: str) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
334
  p = _latents_disk_path(key)
335
- if not p.exists():
336
- return None
337
  obj = torch.load(p, map_location="cpu")
338
  return obj["gpt_cond_latent"], obj["speaker_embedding"]
339
 
@@ -378,74 +297,83 @@ def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[
378
  return g2, s2
379
  return g, s
380
 
381
- # warm-up
382
  try:
383
  _ = _latents_for(default_voice_file)
384
- if device.startswith("cuda"):
385
- g_gpu, s_gpu = _latents_for(default_voice_file, to_device=device)
386
- with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
387
- _ = XTTS_MODEL.inference(
388
- text=".", language="be",
389
- gpt_cond_latent=g_gpu, speaker_embedding=s_gpu,
390
- temperature=0.1, top_k=1, top_p=0.1,
391
- )
392
  except Exception as e:
393
- print(f"[warn] warm-up failed: {e}")
394
 
395
- # =========================================================
396
- # 7) Падзел тэксту
397
- # =========================================================
398
- _SENT_END = re.compile(r'([\.!\?…]+[»")\]]*\s+)')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  _WS = re.compile(r"\s+")
400
 
401
  def _fast_split(text: str, limit: int) -> List[str]:
402
  text = text.strip()
403
- if not text:
404
- return []
405
  parts = []
406
  start = 0
407
  for m in _SENT_END.finditer(text):
408
  end = m.end()
409
  parts.append(text[start:end].strip())
410
  start = end
411
- if start < len(text):
412
- parts.append(text[start:].strip())
413
  chunks = []
414
  cur = ""
415
  for s in parts:
416
  if len(cur) + 1 + len(s) <= limit:
417
  cur = (cur + " " + s).strip() if cur else s
418
  else:
419
- if cur:
420
- chunks.append(cur)
421
  if len(s) <= limit:
422
  cur = s
423
  else:
424
- w = _WS.split(s)
425
- acc = ""
426
  for tok in w:
427
  if len(acc) + 1 + len(tok) <= limit:
428
  acc = (acc + " " + tok).strip() if acc else tok
429
  else:
430
- if acc:
431
- chunks.append(acc)
432
  acc = tok
433
- if acc:
434
- cur = acc
435
- else:
436
- cur = ""
437
- if cur:
438
- chunks.append(cur)
439
  return [c for c in chunks if c]
440
 
441
  def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[str]:
442
  text_in = text_in.strip()
443
- if not text_in:
444
- return []
445
  parts: List[str] = []
446
  if len(text_in) > FIRST_SEGMENT_LIMIT:
447
  head = text_in[:FIRST_SEGMENT_LIMIT]
448
- m = re.search(r'.*[\.!\?…»)]', head)
449
  if m and len(m.group(0)) > 30:
450
  head = m.group(0)
451
  tail = text_in[len(head):].lstrip()
@@ -453,33 +381,38 @@ def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[s
453
  text_for_rest = tail
454
  else:
455
  text_for_rest = text_in
456
- if not text_for_rest:
457
- return parts or [text_in]
458
 
459
  rest = _fast_split(text_for_rest, chunk_limit)
460
  if not rest or sum(len(x) for x in rest) < int(0.6 * len(text_for_rest)):
461
  try:
462
  rest2 = split_sentence(text_for_rest, lang=lang_short, text_split_length=chunk_limit)
463
  rest2 = [s.strip() for s in rest2 if s and s.strip()]
464
- if rest2:
465
- rest = rest2
466
  except Exception:
467
  pass
468
  return parts + (rest or [text_for_rest])
469
 
470
- # =========================================================
471
- # 8) TTS стрим (выдае JSON-пакеты ў Textbox)
472
- # =========================================================
473
  @spaces.GPU(duration=60)
474
  def text_to_speech(belarusian_story, speaker_audio_file=None):
 
 
 
 
 
 
 
475
  t0 = time.perf_counter()
476
 
477
  if not belarusian_story or str(belarusian_story).strip() == "":
478
- yield (json.dumps({"seq": 0, "b64": "", "log": None, "stop": False}), None, None)
479
  raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
480
 
481
  if not speaker_audio_file or (
482
- not isinstance(speaker_audio_file, str) and getattr(speaker_audio_file, "name", "") == ""
 
483
  ):
484
  speaker_audio_file = default_voice_file
485
 
@@ -487,17 +420,16 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
487
  lang_short = "be"
488
  chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
489
 
490
- # Latents
491
  t_lat0 = time.perf_counter()
492
- to_dev = device if device.startswith("cuda") else None
493
  gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
494
  t_lat1 = time.perf_counter()
495
 
496
  # Split
497
  t_split0 = time.perf_counter()
498
  texts = _split_text_smart(text_in, lang_short, chunk_limit) if ENABLE_TEXT_SPLITTING else [text_in]
499
- if not texts:
500
- texts = [text_in]
501
  t_split1 = time.perf_counter()
502
 
503
  server_metrics = {
@@ -508,9 +440,7 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
508
  "server_unaccounted_before_first_chunk_s": None,
509
  "file_write_s": None,
510
  }
511
-
512
- seq = 0
513
- yield (json.dumps({"seq": seq, "b64": "", "log": server_metrics, "stop": False}), None, None)
514
 
515
  full_audio_chunks: List[np.ndarray] = []
516
  first_chunk_seen = False
@@ -518,339 +448,255 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
518
 
519
  for part in texts:
520
  gen = XTTS_MODEL.generate(
521
- text=part,
522
- do_stream=True,
523
- language=lang_short,
524
- gpt_cond_latent=gpt_cond_latent,
525
- speaker_embedding=speaker_embedding,
526
  min_buffer_s=RUNTIME_FIRST_CHUNK_S,
527
  tokens_per_step=TOKENS_PER_STEP,
528
  stream_chunk_size_s=RUNTIME_FIRST_CHUNK_S,
529
- temperature=0.1,
530
- length_penalty=1.0,
531
- repetition_penalty=10.0,
532
- top_k=10,
533
- top_p=0.3,
534
  )
535
  for buf in _chunker(gen, sampling_rate, MIN_BUFFER_S):
536
  if not first_chunk_seen:
537
  t_first = time.perf_counter()
538
  server_metrics["gen_init_to_first_chunk_s"] = (t_first - t_gen0)
539
  server_metrics["until_first_chunk_total_s"] = (t_first - t0)
540
- known = (
541
- server_metrics["latents_s"]
542
- + server_metrics["text_split_s"]
543
- + server_metrics["gen_init_to_first_chunk_s"]
544
- )
545
  other = server_metrics["until_first_chunk_total_s"] - known
546
  server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, other)
547
  first_chunk_seen = True
548
- seq += 1
549
- yield (json.dumps({"seq": seq, "b64": _pcm_f32_to_int16_b64(buf), "log": server_metrics, "stop": False}), None, None)
550
  else:
551
- seq += 1
552
- yield (json.dumps({"seq": seq, "b64": _pcm_f32_to_int16_b64(buf), "log": None, "stop": False}), None, None)
553
  full_audio_chunks.append(buf)
554
 
555
- final_file_path = None
556
- final_audio_path = None
557
- if full_audio_chunks:
558
- t_w0 = time.perf_counter()
559
- full_audio = _merge_for_file(full_audio_chunks)
560
- try:
561
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
562
- write(tmp.name, sampling_rate, full_audio.astype(np.float32))
563
- final_file_path = tmp.name
564
- final_audio_path = tmp.name
565
- except Exception as e:
566
- server_metrics["_file_error"] = str(e)
567
- finally:
568
- t_w1 = time.perf_counter()
569
- server_metrics["file_write_s"] = (t_w1 - t_w0)
570
-
571
- seq += 1
572
- yield (json.dumps({"seq": seq, "b64": "__STOP__", "log": server_metrics, "stop": True}), final_file_path, final_audio_path)
573
 
574
- # =========================================================
575
- # 9) UI + AudioWorklet з polling (без gr.Audio streaming)
576
- # =========================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
577
  examples = [
578
- [
579
- "Прывітанне! Гэта праверка жывога струменя беларускага TTS.",
580
- "Nestarka.wav",
581
- ],
582
  ]
583
 
584
  with gr.Blocks() as demo:
585
- gr.Markdown("## Belarusian TTS — нізкая латэнтнасць (AudioWorklet) + фінальны WAV (SSR OFF, polling)")
586
 
587
  with gr.Row():
588
  inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
589
  inp_voice = gr.Audio(type="filepath", label="Прыклад голасу (6–10 сек)", interactive=True)
590
 
591
  with gr.Row():
592
- run_btn = gr.Button("Згенераваць / Старт стриму")
593
- stop_btn = gr.Button("⏹ Спыніць прайграванне")
594
- play_btn = gr.Button("▶️ Працягнуць прайграванне")
595
- gr.Markdown(f"**Sample rate:** {sampling_rate} Hz | **Stream format:** INT16LE(base64)")
596
 
597
  log_panel = gr.HTML(
598
  value='<div id="wa-log" style="font-family:system-ui;font-size:12px;white-space:pre-line">[лог пусты]</div>',
599
  label="Лагі плэера",
600
  )
601
 
602
- # ВАЖНА: робім stream_pipe бачным у DOM (visible=True), але хаваем праз CSS
603
- stream_pipe = gr.Textbox(value="", visible=True, label="stream_pipe", elem_id="stream-pipe")
604
- final_file = gr.File(label="Згенераваны WAV (спампаваць)")
 
605
  final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
606
  play_final_btn = gr.Button("▶️ Play Final")
607
 
608
- # CSS — схаваць stream_pipe і актыўнасці
609
- gr.HTML("""
610
- <style>
611
- #stream-pipe { position:absolute; left:-99999px; width:1px; height:1px; opacity:0; pointer-events:none; }
612
- </style>
613
- """)
614
-
615
- # --------- Frontend JS (пастаянны polling + AudioWorklet) ----------
616
- FRONT_HTML = f"""
617
- <script>
618
- (function() {{
619
  const sampleRate = {sampling_rate};
620
- const POLL_MS = 30;
621
  const AC = window.AudioContext || window.webkitAudioContext;
 
 
 
 
622
 
623
  function toSec(ms) {{ return (ms/1000); }}
624
- function fmtS(x) {{ return (x==null) ? 'n/a' : (x.toFixed ? x.toFixed(3) : x) + ' s'; }}
625
 
626
- function updateLog() {{
627
  const el = document.getElementById('wa-log');
628
  if (!el || !window.__wa || !window.__wa.meta) return;
629
  const m = window.__wa.meta;
630
  const lines = [];
631
- lines.push('Клік (Старт): ' + (m.t_click_ms ? '0.000 s' : 'n/a'));
 
632
  let click_to_first_chunk_s = null;
633
  if (m.t_first_push_ms) {{
634
  click_to_first_chunk_s = toSec(m.t_first_push_ms - m.t_click_ms);
635
- lines.push('Першы чанк прыйшоў: ' + click_to_first_chunk_s.toFixed(3) + ' s');
636
  if (m.t_first_audio_ms) {{
637
- lines.push('Пачатак прайгравання: ' + (toSec(m.t_first_audio_ms - m.t_click_ms)).toFixed(3) + ' s');
638
- lines.push('Затрымка (чанк→аўдыя): ' + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + ' s');
639
  }}
640
  }}
 
641
  const s = (m.server || {{}});
642
- lines.push('');
643
- lines.push('— Серверныя метрыкі —');
644
- lines.push('Latents (умоўны голас): ' + fmtS(s.latents_s));
645
- lines.push('Падзел тэксту: ' + fmtS(s.text_split_s));
646
- lines.push('Ініт→1-ы чанк: ' + fmtS(s.gen_init_to_first_chunk_s));
647
- lines.push('Усё да 1-га чанка: ' + fmtS(s.until_first_chunk_total_s));
648
- lines.push('Іншая серверная апрац.: ' + fmtS(s.server_unaccounted_before_first_chunk_s));
649
- lines.push('Запіс WAV: ' + fmtS(s.file_write_s));
650
-
651
- if (m.t_first_push_ms && s.until_first_chunk_total_s != null) {{
652
  let est_queue_net = click_to_first_chunk_s - s.until_first_chunk_total_s;
653
  if (!isFinite(est_queue_net) || est_queue_net < 0) est_queue_net = 0;
654
- lines.push('');
655
- lines.push('Ацэнка чаргі ZeroGPU + сеткі: ' + est_queue_net.toFixed(3) + ' s');
656
  }} else {{
657
- lines.push('');
658
- lines.push('Ацэнка чаргі ZeroGPU + сеткі: n/a');
659
  }}
660
- lines.push('');
661
- lines.push('Статус стриму: ' + (window.__wa.playing ? 'playing' : 'stopped'));
662
- el.textContent = lines.join('\\n');
663
- }}
664
 
665
- async function ensureWorklet(ctx) {{
666
- const code = `
667
- class PushPlayerProcessor extends AudioWorkletProcessor {{
668
- constructor() {{
669
- super();
670
- this.queue = [];
671
- this.readIndex = 0;
672
- this.port.onmessage = (e) => {{
673
- const d = e.data || {{}};
674
- if (d.type === 'push' && d.buffer) {{
675
- const f32 = new Float32Array(d.buffer);
676
- this.queue.push(f32);
677
- }} else if (d.type === 'reset') {{
678
- this.queue.length = 0;
679
- this.readIndex = 0;
680
- }}
681
- }};
682
- }}
683
- process(inputs, outputs) {{
684
- const out = outputs[0][0];
685
- let i = 0;
686
- while (i < out.length) {{
687
- if (this.queue.length === 0) {{ out[i++] = 0.0; continue; }}
688
- const cur = this.queue[0];
689
- const remaining = cur.length - this.readIndex;
690
- const take = Math.min(remaining, out.length - i);
691
- out.set(cur.subarray(this.readIndex, this.readIndex + take), i);
692
- i += take; this.readIndex += take;
693
- if (this.readIndex >= cur.length) {{ this.queue.shift(); this.readIndex = 0; }}
694
- }}
695
- return true;
696
- }}
697
- }}
698
- registerProcessor('push-player', PushPlayerProcessor);
699
- `;
700
- const blob = new Blob([code], {{ type: 'application/javascript' }});
701
- const url = URL.createObjectURL(blob);
702
- await ctx.audioWorklet.addModule(url);
703
  }}
704
 
705
- async function ensurePlayer() {{
706
- if (window.__wa) return window.__wa;
707
- if (!AC) return null;
708
  const ctx = new AC({{ sampleRate }});
709
- try {{ await ctx.resume(); }} catch (e) {{}}
710
- await ensureWorklet(ctx);
711
- const node = new AudioWorkletNode(ctx, 'push-player');
712
- node.connect(ctx.destination);
713
-
714
- let playing = true;
715
 
716
  const meta = {{
717
- t_click_ms: null,
718
  t_first_push_ms: null,
719
  t_first_audio_ms: null,
720
- server: null
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721
  }};
 
722
 
723
- const api = {{
724
  ctx, node,
725
  get playing() {{ return playing; }},
726
- start: async () => {{ try {{ await ctx.resume(); }} catch(e) {{}} playing = True; updateLog(); }},
727
- stop: () => {{ try {{ ctx.suspend(); }} catch(e){{}} playing = false; updateLog(); }},
728
- reset: () => {{
729
- try {{ node.port.postMessage({{ type: 'reset' }}); }} catch(e) {{}}
730
- meta.t_first_push_ms = null;
731
- meta.t_first_audio_ms = null;
732
- updateLog();
733
- }},
734
- push: (f32) {{
735
- try {{ node.port.postMessage({{ type: 'push', buffer: f32.buffer }}, [f32.buffer]); }} catch (e) {{}}
736
  if (!meta.t_first_push_ms) {{
737
  meta.t_first_push_ms = performance.now();
738
- if (!meta.t_first_audio_ms) meta.t_first_audio_ms = meta.t_first_push_ms + 10;
739
- updateLog();
 
 
 
740
  }}
741
- if (!playing) api.start();
742
  }},
743
- meta
 
 
 
 
 
 
 
 
744
  }};
745
- window.__wa = api;
746
- updateLog();
747
- return api;
748
- }}
749
-
750
- function getPipeEl() {{
751
- // Textbox у Gradio мае textarea унутры div#stream-pipe
752
- const root = document.getElementById('stream-pipe');
753
- if (!root) return null;
754
- const ta = root.querySelector('textarea');
755
- if (ta) return ta;
756
- const inp = root.querySelector('input');
757
- if (inp) return inp;
758
- return root;
759
  }}
 
 
760
 
761
- function startPolling() {{
762
- const pipe = getPipeEl();
763
- if (!pipe) return;
764
- const apiPromise = ensurePlayer();
765
- let lastSeq = -1;
766
-
767
- setInterval(async () => {{
768
- const api = await apiPromise;
769
- if (!api) return;
770
-
771
- const txt = (pipe.value !== undefined) ? pipe.value : (pipe.innerText || pipe.textContent || '');
772
- if (!txt) return;
 
 
 
 
773
 
774
- let pkt = null;
775
- try {{ pkt = JSON.parse(txt); }} catch(e) {{ return; }}
776
- if (!pkt || typeof pkt.seq !== 'number') return;
777
- if (pkt.seq <= lastSeq) return;
778
- lastSeq = pkt.seq;
 
 
 
 
 
 
 
779
 
780
- if (pkt.log) {{
781
- api.meta.server = pkt.log;
782
- updateLog();
783
- }}
 
 
 
 
784
 
785
- if (pkt.b64 === '__STOP__') {{
786
- api.stop();
787
- updateLog();
788
- return;
789
- }}
790
 
791
- if (typeof pkt.b64 === 'string' && pkt.b64.length > 0) {{
792
- const bin = atob(pkt.b64);
793
- const len = bin.length;
794
- const buf = new ArrayBuffer(len);
795
- const view = new Uint8Array(buf);
796
- for (let i=0;i<len;i++) view[i] = bin.charCodeAt(i);
797
- const i16 = new Int16Array(buf);
798
- const f32 = new Float32Array(i16.length);
799
- for (let i=0;i<i16.length;i++) {{
800
- let s = i16[i];
801
- f32[i] = Math.max(-1, s / 32768);
802
- }}
803
- api.push(f32);
804
- }}
805
- }}, POLL_MS);
806
- }}
807
 
808
- window.__wa_start_click = async function() {{
809
- const api = await ensurePlayer();
810
- api.meta.t_click_ms = performance.now();
811
- updateLog();
812
- }};
813
- window.__wa_stop = async function() {{
814
- const api = await ensurePlayer();
815
- api.stop();
816
- }};
817
- window.__wa_play = async function() {{
818
- const api = await ensurePlayer();
819
- api.start();
820
- }};
821
- window.__wa_play_final = function() {{
822
- const host = document.getElementById('final-audio');
823
- if (!host) return;
824
- const audio = host.querySelector('audio');
825
- if (audio) {{ try {{ audio.play(); }} catch(e) {{}} }}
826
- }};
827
-
828
- // Стартуем polling пасля загрузкі
829
- startPolling();
830
- }})();
831
- </script>
832
- """
833
- gr.HTML(FRONT_HTML)
834
-
835
- # Падзеі
836
- run_btn.click(
837
- fn=lambda: None,
838
- inputs=[],
839
- outputs=[],
840
- js="window.__wa_start_click && window.__wa_start_click();"
841
- )
842
- run_btn.click(
843
- fn=text_to_speech,
844
- inputs=[inp_text, inp_voice],
845
- outputs=[stream_pipe, final_file, final_audio],
846
- )
847
 
848
- stop_btn.click(fn=None, inputs=[], outputs=[], js="window.__wa_stop && window.__wa_stop();")
849
- play_btn.click(fn=None, inputs=[], outputs=[], js="window.__wa_play && window.__wa_play();")
850
- play_final_btn.click(fn=None, inputs=[], outputs=[], js="window.__wa_play_final && window.__wa_play_final();")
851
 
852
  gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)
853
 
854
- # чарга + запуск (SSR OFF)
855
  if __name__ == "__main__":
856
- demo.queue(max_size=8).launch(ssr_mode=False)
 
23
  from huggingface_hub import hf_hub_download
24
  from scipy.io.wavfile import write
25
 
26
+ # ---------------------------------------------------------
27
+ # 1) coqui-ai-TTS fork
28
+ # ---------------------------------------------------------
29
  REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
30
  REPO_DIR = "coqui-ai-TTS"
31
 
 
40
  from TTS.tts.models.xtts import Xtts
41
  from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
42
 
43
+ # ---------------------------------------------------------
44
+ # 2) мадэльныя файлы
45
+ # ---------------------------------------------------------
46
  repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
47
  model_dir = "./model"
48
  os.makedirs(model_dir, exist_ok=True)
49
 
50
+ checkpoint_file = os.path.join(model_dir, "model.pth")
51
+ config_file = os.path.join(model_dir, "config.json")
52
+ vocab_file = os.path.join(model_dir, "vocab.json")
53
+ default_voice_file = os.path.join(model_dir, "voice.wav")
54
+
55
  for fname in ("model.pth", "config.json", "vocab.json", "voice.wav"):
56
  fpath = os.path.join(model_dir, fname)
57
  if not os.path.exists(fpath):
58
  hf_hub_download(repo_id, filename=fname, local_dir=model_dir)
59
 
60
+ # ---------------------------------------------------------
61
+ # 3) загрузка мадэлі
62
+ # ---------------------------------------------------------
63
+ config = XttsConfig()
64
+ config.load_json(config_file)
65
+ XTTS_MODEL = Xtts.init_from_config(config)
 
 
 
 
66
  XTTS_MODEL.load_checkpoint(
67
  config,
68
  checkpoint_path=checkpoint_file,
 
82
  XTTS_MODEL.to(device).eval()
83
  sampling_rate = int(XTTS_MODEL.config.audio["sample_rate"])
84
 
 
85
  tokenizer = VoiceBpeTokenizer(vocab_file=vocab_file)
86
  XTTS_MODEL.tokenizer = tokenizer
87
 
88
  # =========================================================
89
+ # 4) Streaming-канфіг
90
  # =========================================================
91
+ MIN_BUFFER_S = 0.03 # бяспечны выхадны буфер для плэера
92
+ RUNTIME_FIRST_CHUNK_S = 0.02 # унутраны чанк у генерацыі
93
  FADE_S = 0.004
94
  TOKENS_PER_STEP = 1
95
  ENABLE_TEXT_SPLITTING = True
96
+ FIRST_SEGMENT_LIMIT = 160 # стабільная прасадыя для 1-га сегмента
97
 
98
+ # -------------------- утыліты аўдыя ----------------------
 
 
99
  def _seconds_to_samples(sec: float, sr: int) -> int:
100
  return max(1, int(sec * sr))
101
 
 
115
  return x
116
 
117
  def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> np.ndarray:
118
+ if a.size == 0: return b.astype(np.float32, copy=False)
119
+ if b.size == 0: return a.astype(np.float32, copy=False)
120
+ a = a.astype(np.float32, copy=False); b = b.astype(np.float32, copy=False)
 
 
 
121
  fade_n = min(_seconds_to_samples(fade_s, sr), a.size, b.size)
122
+ if fade_n <= 1: return np.concatenate([a, b], axis=0)
 
123
  fade_out = np.linspace(1.0, 0.0, fade_n, endpoint=True, dtype=np.float32)
124
+ fade_in = 1.0 - fade_out
125
  head = a[:-fade_n]
126
  tail = (a[-fade_n:] * fade_out) + (b[:fade_n] * fade_in)
127
  rest = b[fade_n:]
128
  return np.concatenate([head, tail, rest], axis=0)
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  def _bpe_prefixes(text: str, lang: str, step_tokens: int):
131
  try:
132
  ids = tokenizer.encode(text, lang=lang)
 
149
  def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any, **gen_kwargs) -> Iterator[np.ndarray]:
150
  sig = inspect.signature(model.inference_stream)
151
  call_kwargs = dict(text=text, language=language, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding)
152
+ for k in ("temperature","length_penalty","repetition_penalty","top_k","top_p","stream_chunk_size_s"):
153
  if k in gen_kwargs and k in sig.parameters:
154
  call_kwargs[k] = gen_kwargs[k]
155
  autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
 
164
  autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
165
  with torch.inference_mode(), autocast_ctx:
166
  out = model.inference(
167
+ text=prefix, language=language,
168
+ gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
 
 
169
  temperature=gen_kwargs.get("temperature", 0.1),
170
+ length_penalty=1.0, repetition_penalty=10.0,
171
+ top_k=gen_kwargs.get("top_k", 10), top_p=gen_kwargs.get("top_p", 0.3),
 
 
172
  )
173
  wav = _to_np_audio(out)
174
+ new_part = wav[emitted:]; emitted = wav.size
175
+ if new_part.size: yield new_part
 
 
176
 
177
  class NewTTSGenerationMixin:
178
  @torch.inference_mode()
179
+ def generate(self: Xtts, text: Optional[str] = None, *, do_stream: bool = False, language: str = "be",
180
+ gpt_cond_latent: Any = None, speaker_embedding: Any = None,
181
+ min_buffer_s: float = MIN_BUFFER_S, tokens_per_step: int = TOKENS_PER_STEP, **gen_kwargs):
 
 
 
 
 
 
 
 
 
182
  assert isinstance(text, str) and text.strip(), "text is required"
183
  if not do_stream:
184
  autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
185
  with autocast_ctx:
186
  out = self.inference(
187
+ text=text, language=language,
188
+ gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
 
 
189
  temperature=gen_kwargs.get("temperature", 0.1),
190
+ length_penalty=1.0, repetition_penalty=10.0,
191
+ top_k=10, top_p=0.3,
 
 
192
  )
193
  return _to_np_audio(out)
194
  return self.sample_stream(
195
+ text=text, language=language, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
196
+ min_buffer_s=min_buffer_s, tokens_per_step=tokens_per_step, **gen_kwargs
 
 
 
 
 
197
  )
198
 
199
  @torch.inference_mode()
200
+ def sample_stream(self: Xtts, *, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any,
201
+ min_buffer_s: float = MIN_BUFFER_S, tokens_per_step: int = TOKENS_PER_STEP, **gen_kwargs) -> Iterator[np.ndarray]:
202
+ local_kwargs = dict(gen_kwargs); local_kwargs.setdefault("stream_chunk_size_s", float(min_buffer_s))
 
 
 
 
 
 
 
 
 
 
203
  if hasattr(self, "inference_stream"):
204
  for chunk in _native_stream(self, text, language, gpt_cond_latent, speaker_embedding, **local_kwargs):
205
  yield chunk
 
213
 
214
# NOTE(review): presumably installs the NewTTSGenerationMixin methods
# (generate/sample_stream) onto the XTTS model class — confirm in its
# definition earlier in this file.
init_stream_support()
215
 
216
# ---------------------------------------------------------
# 5) Persistent latents cache (CPU) + GPU cache
# ---------------------------------------------------------
# On-disk directory for per-voice conditioning latents, so repeated
# requests for the same reference voice skip recomputation.
PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
221
 
 
235
  base = f"{os.path.abspath(path)}:{os.path.getmtime(path)}:{os.path.getsize(path)}"
236
  else:
237
  base = "default_voice"
238
+ meta_str = json.dumps({
239
+ "model_id": meta.model_id,
240
+ "gpt_cond_len": meta.gpt_cond_len,
241
+ "max_ref_len": meta.max_ref_len,
242
+ "sound_norm_refs": meta.sound_norm_refs,
243
+ "xtts_git": meta.xtts_git,
244
+ }, sort_keys=True)
 
 
 
245
  return hashlib.md5((base + "|" + meta_str).encode("utf-8")).hexdigest()
246
 
247
  def _latents_disk_path(key: str) -> pathlib.Path:
 
252
 
253
def _load_latents_from_disk(key: str) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
    """Load cached (gpt_cond_latent, speaker_embedding) for *key*.

    Returns None on a cache miss. A corrupt or unreadable cache file is
    also treated as a miss (with a warning) instead of crashing the
    request — the latents are simply recomputed by the caller.
    """
    p = _latents_disk_path(key)
    if not p.exists():
        return None
    try:
        obj = torch.load(p, map_location="cpu")
        return obj["gpt_cond_latent"], obj["speaker_embedding"]
    except Exception as e:  # best-effort cache: never fatal
        print(f"[warn] failed to load cached latents {p}: {e}")
        return None
258
 
 
297
  return g2, s2
298
  return g, s
299
 
300
# Warm the latents cache for the default voice at import time (on CPU),
# so the first user request does not pay the conditioning cost.
try:
    _ = _latents_for(default_voice_file)
except Exception as e:
    # Best-effort: a failure here only means latents get computed lazily
    # on the first request instead.
    print(f"[warn] precompute default voice latents failed: {e}")
305
 
306
# ---------------------------------------------------------
# 6) Buffers + base64
# ---------------------------------------------------------
def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
    """Concatenate streamed audio chunks into one waveform.

    Adjacent chunks are joined with a short crossfade (FADE_S) to avoid
    audible clicks at the seams. Returns an empty float32 array for an
    empty input.
    """
    if not chunks:
        return np.zeros((0,), dtype=np.float32)
    merged = chunks[0]
    for nxt in chunks[1:]:
        merged = _crossfade_concat(merged, nxt, sampling_rate, FADE_S)
    return merged
315
+
316
def _chunker(chunks: Iterable[np.ndarray], sr: int, target_s: float) -> Iterable[np.ndarray]:
    """Regroup small audio pieces into buffers of at least *target_s* seconds.

    Pieces are crossfaded together (FADE_S) as they accumulate; once the
    buffer reaches the target length it is yielded and the accumulator is
    reset. Any non-empty remainder is flushed at the end.
    """
    need = _seconds_to_samples(target_s, sr)
    acc = np.zeros((0,), dtype=np.float32)
    for piece in chunks:
        piece = _to_np_audio(piece)
        if piece.size == 0:
            continue
        if acc.size == 0:
            acc = piece
        else:
            acc = _crossfade_concat(acc, piece, sr, FADE_S)
        if acc.size >= need:
            yield acc
            acc = np.zeros((0,), dtype=np.float32)
    if acc.size:
        yield acc
327
+
328
+ def _pcm_f32_to_b64(x: np.ndarray) -> str:
329
+ if x.dtype != np.float32: x = x.astype(np.float32, copy=False)
330
+ return base64.b64encode(x.tobytes()).decode("ascii")
331
+
332
+ # ---------------------------------------------------------
333
+ # 7) падзел тэксту: хуткі + fallback
334
+ # ---------------------------------------------------------
335
+ _SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
336
  _WS = re.compile(r"\s+")
337
 
338
  def _fast_split(text: str, limit: int) -> List[str]:
339
  text = text.strip()
340
+ if not text: return []
 
341
  parts = []
342
  start = 0
343
  for m in _SENT_END.finditer(text):
344
  end = m.end()
345
  parts.append(text[start:end].strip())
346
  start = end
347
+ if start < len(text): parts.append(text[start:].strip())
 
348
  chunks = []
349
  cur = ""
350
  for s in parts:
351
  if len(cur) + 1 + len(s) <= limit:
352
  cur = (cur + " " + s).strip() if cur else s
353
  else:
354
+ if cur: chunks.append(cur)
 
355
  if len(s) <= limit:
356
  cur = s
357
  else:
358
+ w = _WS.split(s); acc = ""
 
359
  for tok in w:
360
  if len(acc) + 1 + len(tok) <= limit:
361
  acc = (acc + " " + tok).strip() if acc else tok
362
  else:
363
+ if acc: chunks.append(acc)
 
364
  acc = tok
365
+ if acc: cur = acc
366
+ else: cur = ""
367
+ if cur: chunks.append(cur)
 
 
 
368
  return [c for c in chunks if c]
369
 
370
  def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[str]:
371
  text_in = text_in.strip()
372
+ if not text_in: return []
 
373
  parts: List[str] = []
374
  if len(text_in) > FIRST_SEGMENT_LIMIT:
375
  head = text_in[:FIRST_SEGMENT_LIMIT]
376
+ m = re.search(r".*[\.!\?…»)]", head)
377
  if m and len(m.group(0)) > 30:
378
  head = m.group(0)
379
  tail = text_in[len(head):].lstrip()
 
381
  text_for_rest = tail
382
  else:
383
  text_for_rest = text_in
384
+ if not text_for_rest: return parts or [text_in]
 
385
 
386
  rest = _fast_split(text_for_rest, chunk_limit)
387
  if not rest or sum(len(x) for x in rest) < int(0.6 * len(text_for_rest)):
388
  try:
389
  rest2 = split_sentence(text_for_rest, lang=lang_short, text_split_length=chunk_limit)
390
  rest2 = [s.strip() for s in rest2 if s and s.strip()]
391
+ if rest2: rest = rest2
 
392
  except Exception:
393
  pass
394
  return parts + (rest or [text_for_rest])
395
 
396
+ # ---------------------------------------------------------
397
+ # 8) TTS стрим + фінальны файл + лагі
398
+ # ---------------------------------------------------------
399
  @spaces.GPU(duration=60)
400
  def text_to_speech(belarusian_story, speaker_audio_file=None):
401
+ """
402
+ Выхады:
403
+ 1) stream_pipe — base64(PCM float32) чанкі, у фінале "__STOP__"
404
+ 2) final_file — шлях да WAV
405
+ 3) final_audio — шлях да WAV для прайгравання
406
+ 4) log_pipe — JSON з сервернымі метрыкамі (секунды)
407
+ """
408
  t0 = time.perf_counter()
409
 
410
  if not belarusian_story or str(belarusian_story).strip() == "":
 
411
  raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
412
 
413
  if not speaker_audio_file or (
414
+ not isinstance(speaker_audio_file, str)
415
+ and getattr(speaker_audio_file, "name", "") == ""
416
  ):
417
  speaker_audio_file = default_voice_file
418
 
 
420
  lang_short = "be"
421
  chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
422
 
423
+ # Latents (кэш CPU/GPU)
424
  t_lat0 = time.perf_counter()
425
+ to_dev = "cuda:0" if torch.cuda.is_available() else None
426
  gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
427
  t_lat1 = time.perf_counter()
428
 
429
  # Split
430
  t_split0 = time.perf_counter()
431
  texts = _split_text_smart(text_in, lang_short, chunk_limit) if ENABLE_TEXT_SPLITTING else [text_in]
432
+ if not texts: texts = [text_in]
 
433
  t_split1 = time.perf_counter()
434
 
435
  server_metrics = {
 
440
  "server_unaccounted_before_first_chunk_s": None,
441
  "file_write_s": None,
442
  }
443
+ yield ("", None, None, json.dumps(server_metrics))
 
 
444
 
445
  full_audio_chunks: List[np.ndarray] = []
446
  first_chunk_seen = False
 
448
 
449
  for part in texts:
450
  gen = XTTS_MODEL.generate(
451
+ text=part, do_stream=True, language=lang_short,
452
+ gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
 
 
 
453
  min_buffer_s=RUNTIME_FIRST_CHUNK_S,
454
  tokens_per_step=TOKENS_PER_STEP,
455
  stream_chunk_size_s=RUNTIME_FIRST_CHUNK_S,
456
+ temperature=0.1, length_penalty=1.0, repetition_penalty=10.0,
457
+ top_k=10, top_p=0.3,
 
 
 
458
  )
459
  for buf in _chunker(gen, sampling_rate, MIN_BUFFER_S):
460
  if not first_chunk_seen:
461
  t_first = time.perf_counter()
462
  server_metrics["gen_init_to_first_chunk_s"] = (t_first - t_gen0)
463
  server_metrics["until_first_chunk_total_s"] = (t_first - t0)
464
+ known = server_metrics["latents_s"] + server_metrics["text_split_s"] + server_metrics["gen_init_to_first_chunk_s"]
 
 
 
 
465
  other = server_metrics["until_first_chunk_total_s"] - known
466
  server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, other)
467
  first_chunk_seen = True
468
+ yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
 
469
  else:
470
+ yield (_pcm_f32_to_b64(buf), None, None, None)
 
471
  full_audio_chunks.append(buf)
472
 
473
+ if not full_audio_chunks:
474
+ yield ("__STOP__", None, None, json.dumps(server_metrics)); return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
475
 
476
+ t_w0 = time.perf_counter()
477
+ full_audio = _merge_for_file(full_audio_chunks)
478
+ tmp = None
479
+ try:
480
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
481
+ write(tmp.name, sampling_rate, full_audio.astype(np.float32))
482
+ except Exception as e:
483
+ raise gr.Error(f"Памылка пры запісе фінальнага WAV: {e}")
484
+ finally:
485
+ t_w1 = time.perf_counter()
486
+ server_metrics["file_write_s"] = (t_w1 - t_w0)
487
+
488
+ yield ("__STOP__", tmp.name, tmp.name, json.dumps(server_metrics))
489
+
490
# ---------------------------------------------------------
# 9) UI (logs in seconds + Play Final; no underruns)
# ---------------------------------------------------------
# Gradio example rows: (text, reference-voice wav path) pairs.
examples = [
    ["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", "Nestarka.wav"],
]
496
 
497
with gr.Blocks() as demo:
    gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")

    # Inputs: text to synthesize + optional reference-voice recording.
    with gr.Row():
        inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
        inp_voice = gr.Audio(type="filepath", label="Прыклад голасу (6–10 сек)", interactive=True)

    with gr.Row():
        play_btn = gr.Button("▶️ Play (stream)")
        stop_btn = gr.Button("⏹ Stop (stream)")
        run_btn = gr.Button("Згенераваць")
        gr.Markdown(f"**Sample rate:** {sampling_rate} Hz")

    # Client-side log panel; its text content is rewritten by the JS
    # logger defined below (element id "wa-log").
    log_panel = gr.HTML(
        value='<div id="wa-log" style="font-family:system-ui;font-size:12px;white-space:pre-line">[лог пусты]</div>',
        label="Лагі плэера",
    )

    # Hidden transport widgets: the server generator streams base64 PCM
    # into stream_pipe and JSON metrics into log_pipe; their `change`
    # events run JS handlers in the browser.
    stream_pipe = gr.Textbox(value="", visible=False, label="stream_pipe")
    log_pipe = gr.Textbox(value="", visible=False, label="log_pipe")

    final_file = gr.File(label="Згенераваны WAV (спампаваць)")
    final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
    play_final_btn = gr.Button("▶️ Play Final")

    # Browser-side streaming player kept on window.__wa: a Web Audio
    # ScriptProcessorNode drains a queue of Float32Array chunks; playback
    # starts only after PRIME_CHUNKS chunks are queued, to avoid underruns.
    # NOTE(review): ScriptProcessorNode is deprecated in favor of
    # AudioWorklet, though still supported by current browsers.
    INIT_RESET_AND_PLAY_JS = f"""
    () => {{
      const sampleRate = {sampling_rate};
      const AC = window.AudioContext || window.webkitAudioContext;
      if (!AC) return;

      const PRIME_CHUNKS = 2; // мін. к-ць чанкаў перад стартаваннем гуку
      let primeCounter = 0;

      function toSec(ms) {{ return (ms/1000); }}
      function fmtS(x) {{ return (x===null||x===undefined) ? "n/a" : x.toFixed(3) + " s"; }}

      function logUpdate() {{
        const el = document.getElementById('wa-log');
        if (!el || !window.__wa || !window.__wa.meta) return;
        const m = window.__wa.meta;
        const lines = [];
        lines.push("Клік (Згенераваць): 0.000 s");

        let click_to_first_chunk_s = null;
        if (m.t_first_push_ms) {{
          click_to_first_chunk_s = toSec(m.t_first_push_ms - m.t_click_ms);
          lines.push("Першы чанк прыйшоў: " + click_to_first_chunk_s.toFixed(3) + " s");
          if (m.t_first_audio_ms) {{
            lines.push("Пачатак прайгравання: " + (toSec(m.t_first_audio_ms - m.t_click_ms)).toFixed(3) + " s");
            lines.push("Затрымка (чанк→аўдыя): " + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + " s");
          }}
        }}

        const s = (m.server || {{}});
        lines.push("");
        lines.push("— Серверныя метрыкі —");
        lines.push("Latents (умоўны голас): " + fmtS(s.latents_s));
        lines.push("Падзел тэксту: " + fmtS(s.text_split_s));
        lines.push("Ініт→1-ы чанк: " + fmtS(s.gen_init_to_first_chunk_s));
        lines.push("Усё да 1-га чанка: " + fmtS(s.until_first_chunk_total_s));
        lines.push("Іншая серверная апрац.: " + fmtS(s.server_unaccounted_before_first_chunk_s));
        lines.push("Запіс WAV: " + fmtS(s.file_write_s));

        if (click_to_first_chunk_s !== null && s.until_first_chunk_total_s !== null) {{
          let est_queue_net = click_to_first_chunk_s - s.until_first_chunk_total_s;
          if (!isFinite(est_queue_net) || est_queue_net < 0) est_queue_net = 0;
          lines.push("");
          lines.push("Ацэнка чаргі ZeroGPU + сеткі: " + est_queue_net.toFixed(3) + " s");
        }} else {{
          lines.push("");
          lines.push("Ацэнка чаргі ZeroGPU + сеткі: n/a");
        }}

        lines.push("");
        lines.push("Статус стриму: " + (window.__wa.playing ? "playing" : "stopped"));
        el.textContent = lines.join("\\n");
        try {{ console.log(lines.join("\\n")); }} catch (e) {{}}
      }}

      if (!window.__wa) {{
        const ctx = new AC({{ sampleRate }});
        const bufferSize = 2048; // большы буфер = менш underrun’аў
        const node = ctx.createScriptProcessor(bufferSize, 0, 1);
        let queue = [];
        let playing = false;
        let eos = false;

        const meta = {{
          t_click_ms: performance.now(),
          t_first_push_ms: null,
          t_first_audio_ms: null,
          server: null,
        }};

        node.onaudioprocess = (e) => {{
          const out = e.outputBuffer.getChannelData(0);
          let i = 0;
          while (i < out.length) {{
            if (queue.length === 0 || !playing) {{ out[i++] = 0.0; continue; }}
            let cur = queue[0];
            const take = Math.min(cur.length, out.length - i);
            if (meta.t_first_audio_ms === null) {{
              meta.t_first_audio_ms = performance.now();
              logUpdate();
            }}
            out.set(cur.subarray(0, take), i);
            i += take;
            if (take === cur.length) queue.shift();
            else queue[0] = cur.subarray(take);
          }}
          if (eos && queue.length === 0 && playing) {{
            playing = false;
            logUpdate();
          }}
        }};
        node.connect(ctx.destination);

        window.__wa = {{
          ctx, node,
          get playing() {{ return playing; }},
          get eos() {{ return eos; }},
          set eos(v) {{ eos = v; }},
          meta,
          push: (f32) => {{
            queue.push(f32);
            if (!meta.t_first_push_ms) {{
              meta.t_first_push_ms = performance.now();
              logUpdate();
            }}
            if (!playing && queue.length >= PRIME_CHUNKS) {{
              // стартуем толькі калі ёсць мінімум 2 чанкі ў чарзе
              window.__wa.start();
            }}
          }},
          start: async () => {{ try {{ await ctx.resume(); }} catch(e){{}} playing = true; logUpdate(); }},
          stop: () => {{ playing = false; logUpdate(); }},
          reset: () => {{
            playing = false; eos = false; queue = [];
            primeCounter = 0;
            meta.t_first_push_ms = null; meta.t_first_audio_ms = null;
            logUpdate();
          }},
          updateLog: logUpdate,
        }};
      }} else {{
        window.__wa.reset();
        window.__wa.meta.t_click_ms = performance.now();
      }}
    }}
    """

    # Simple start/stop controls for the streaming player.
    STOP_JS = "() => { if (window.__wa) window.__wa.stop(); }"
    PLAY_JS = "() => { if (window.__wa) window.__wa.start(); }"

    # Decode a base64 chunk from stream_pipe into Float32Array and queue
    # it; the "__STOP__" sentinel marks end-of-stream.
    PUSH_JS = """
    (b64) => {
      if (!window.__wa || !b64) return;
      if (b64 === "__STOP__") { window.__wa.eos = true; window.__wa.updateLog && window.__wa.updateLog(); return; }
      const bin = atob(b64);
      const len = bin.length;
      const buf = new ArrayBuffer(len);
      const view = new Uint8Array(buf);
      for (let i=0;i<len;i++) view[i] = bin.charCodeAt(i);
      const f32 = new Float32Array(buf);
      window.__wa.push(f32);
    }
    """

    # Attach server-side metrics (JSON from log_pipe) to the player's
    # meta and refresh the on-page log.
    LOG_JS = """
    (js) => {
      if (!window.__wa) return;
      try {
        if (js) {
          const obj = JSON.parse(js);
          window.__wa.meta.server = obj;
          window.__wa.updateLog && window.__wa.updateLog();
        }
      } catch (e) {}
    }
    """

    # Play the final rendered WAV via the hidden <audio> element inside
    # the "final-audio" Gradio component.
    PLAY_FINAL_JS = """
    () => {
      const host = document.getElementById('final-audio');
      if (!host) return;
      const audio = host.querySelector('audio');
      if (audio) { try { audio.play(); } catch(e) {} }
    }
    """

    # Wire UI events. Note run_btn has two handlers: a pure-JS one that
    # resets/initializes the browser player, and the server generator.
    play_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_JS)
    stop_btn.click(fn=None, inputs=[], outputs=[], js=STOP_JS)

    run_btn.click(fn=None, inputs=[], outputs=[], js=INIT_RESET_AND_PLAY_JS)
    run_btn.click(fn=text_to_speech, inputs=[inp_text, inp_voice], outputs=[stream_pipe, final_file, final_audio, log_pipe])

    stream_pipe.change(fn=None, inputs=[stream_pipe], outputs=[], js=PUSH_JS)
    log_pipe.change(fn=None, inputs=[log_pipe], outputs=[], js=LOG_JS)

    play_final_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_FINAL_JS)

    gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)
700
 
 
701
if __name__ == "__main__":
    # Start the Gradio app.
    demo.launch()