BexttsStream

Running on Zero

App Files Community

archivartaunik committed on Nov 16, 2025

Commit

99298aa

verified ·

1 Parent(s): 91d0f80

Update app.py

Browse files

Files changed (1) hide show

app.py +145 -337

app.py CHANGED Viewed

@@ -88,12 +88,15 @@ XTTS_MODEL.tokenizer = tokenizer
 # =========================================================
 # 4) Streaming-канфіг
 # =========================================================
-MIN_BUFFER_S = 0.03            # бяспечны выхадны буфер для плэера
-RUNTIME_FIRST_CHUNK_S = 0.02   # унутраны чанк у генерацыі
 FADE_S       = 0.004
 TOKENS_PER_STEP = 1
 ENABLE_TEXT_SPLITTING = True
-FIRST_SEGMENT_LIMIT = 160      # стабільная прасадыя для 1-га сегмента
 # -------------------- утыліты аўдыя ----------------------
 def _seconds_to_samples(sec: float, sr: int) -> int:
@@ -127,25 +130,6 @@ def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> n
     rest = b[fade_n:]
     return np.concatenate([head, tail, rest], axis=0)
-def _bpe_prefixes(text: str, lang: str, step_tokens: int):
-    try:
-        ids = tokenizer.encode(text, lang=lang)
-        n = len(ids)
-        for k in range(step_tokens, n + 1, step_tokens):
-            yield tokenizer.decode(ids[:k], lang=lang)
-        if n % step_tokens != 0:
-            yield tokenizer.decode(ids, lang=lang)
-        return
-    except Exception:
-        pass
-    pseudo_tokens = re.findall(r"\S+|\s+", text)
-    acc = ""
-    for i in range(0, len(pseudo_tokens), step_tokens):
-        acc = "".join(pseudo_tokens[: i + step_tokens])
-        yield acc
-    if acc.strip() != text.strip():
-        yield text
 def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any, **gen_kwargs) -> Iterator[np.ndarray]:
     sig = inspect.signature(model.inference_stream)
     call_kwargs = dict(text=text, language=language, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding)
@@ -158,22 +142,6 @@ def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent: Any,
         for out in generator:
             yield _to_np_audio(out)
-def _fallback_incremental(model: Xtts, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any, tokens_per_step: int, **gen_kwargs) -> Iterator[np.ndarray]:
-    emitted = 0
-    for prefix in _bpe_prefixes(text, language, tokens_per_step):
-        autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
-        with torch.inference_mode(), autocast_ctx:
-            out = model.inference(
-                text=prefix, language=language,
-                gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
-                temperature=gen_kwargs.get("temperature", 0.1),
-                length_penalty=1.0, repetition_penalty=10.0,
-                top_k=gen_kwargs.get("top_k", 10), top_p=gen_kwargs.get("top_p", 0.3),
-            )
-        wav = _to_np_audio(out)
-        new_part = wav[emitted:]; emitted = wav.size
-        if new_part.size: yield new_part
 class NewTTSGenerationMixin:
     @torch.inference_mode()
     def generate(self: Xtts, text: Optional[str] = None, *, do_stream: bool = False, language: str = "be",
@@ -204,8 +172,7 @@ class NewTTSGenerationMixin:
             for chunk in _native_stream(self, text, language, gpt_cond_latent, speaker_embedding, **local_kwargs):
                 yield chunk
             return
-        for chunk in _fallback_incremental(self, text, language, gpt_cond_latent, speaker_embedding, tokens_per_step, **gen_kwargs):
-            yield chunk
 def init_stream_support():
     Xtts.generate = NewTTSGenerationMixin.generate
@@ -220,35 +187,18 @@ PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
 PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
 @dataclass(frozen=True)
-class LatentsMeta:
-    model_id: str
-    gpt_cond_len: int
-    max_ref_len: int
-    sound_norm_refs: bool
-    xtts_git: str | None = None
 LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
 GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}
 def _latents_key(path: str | None, meta: LatentsMeta) -> str:
-    if path and os.path.exists(path):
-        base = f"{os.path.abspath(path)}:{os.path.getmtime(path)}:{os.path.getsize(path)}"
-    else:
-        base = "default_voice"
-    meta_str = json.dumps({
-        "model_id": meta.model_id,
-        "gpt_cond_len": meta.gpt_cond_len,
-        "max_ref_len": meta.max_ref_len,
-        "sound_norm_refs": meta.sound_norm_refs,
-        "xtts_git": meta.xtts_git,
-    }, sort_keys=True)
     return hashlib.md5((base + "|" + meta_str).encode("utf-8")).hexdigest()
-def _latents_disk_path(key: str) -> pathlib.Path:
-    return PERSIST_LATENTS_DIR / f"{key}.pt"
-def _save_latents_to_disk(key: str, gpt_cond_latent: torch.Tensor, speaker_embedding: torch.Tensor):
-    torch.save({"gpt_cond_latent": gpt_cond_latent.cpu(), "speaker_embedding": speaker_embedding.cpu()}, _latents_disk_path(key))
 def _load_latents_from_disk(key: str) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
     p = _latents_disk_path(key)
@@ -258,49 +208,27 @@ def _load_latents_from_disk(key: str) -> Optional[Tuple[torch.Tensor, torch.Tens
 def _compute_latents_cpu(path: str | None) -> Tuple[torch.Tensor, torch.Tensor]:
     with torch.inference_mode():
-        g, s = XTTS_MODEL.get_conditioning_latents(
-            audio_path=path,
-            gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
-            max_ref_length=XTTS_MODEL.config.max_ref_len,
-            sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
-        )
     return g.cpu(), s.cpu()
 def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[torch.Tensor, torch.Tensor]:
-    meta = LatentsMeta(
-        model_id=repo_id,
-        gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
-        max_ref_len=XTTS_MODEL.config.max_ref_len,
-        sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
-        xtts_git=None,
-    )
     key = _latents_key(path, meta)
-    if key in LATENT_CACHE:
-        g, s = LATENT_CACHE[key]
-    else:
-        loaded = _load_latents_from_disk(key)
-        if loaded is None:
-            g, s = _compute_latents_cpu(path)
-            _save_latents_to_disk(key, g, s)
-        else:
-            g, s = loaded
-        LATENT_CACHE[key] = (g, s)
     if to_device and to_device.startswith("cuda"):
         dev_key = (key, to_device)
-        if dev_key in GPU_LATENT_CACHE:
-            return GPU_LATENT_CACHE[dev_key]
-        g2 = g.to(to_device, non_blocking=True)
-        s2 = s.to(to_device, non_blocking=True)
         GPU_LATENT_CACHE[dev_key] = (g2, s2)
         return g2, s2
     return g, s
-try:
-    _ = _latents_for(default_voice_file)
-except Exception as e:
-    print(f"[warn] precompute default voice latents failed: {e}")
 # ---------------------------------------------------------
 # 6) буферы + base64
@@ -308,113 +236,65 @@ except Exception as e:
 def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
     if not chunks: return np.zeros((0,), dtype=np.float32)
     out = chunks[0]
-    for i in range(1, len(chunks)):
-        out = _crossfade_concat(out, chunks[i], sampling_rate, FADE_S)
     return out
-def _chunker(chunks: Iterable[np.ndarray], sr: int, target_s: float) -> Iterable[np.ndarray]:
-    target_samples = _seconds_to_samples(target_s, sr)
-    buf = np.zeros((0,), dtype=np.float32)
     for c in chunks:
-        c = _to_np_audio(c)
-        if c.size == 0: continue
-        buf = c if buf.size == 0 else np.concatenate([buf, c], axis=0) # <--- Спрошчанае зліццё для буфера
-        if buf.size >= target_samples:
-            yield buf
-            buf = np.zeros((0,), dtype=np.float32)
-    if buf.size: yield buf
-def _pcm_f32_to_b64(x: np.ndarray) -> str:
-    if x.dtype != np.float32: x = x.astype(np.float32, copy=False)
-    return base64.b64encode(x.tobytes()).decode("ascii")
 # ---------------------------------------------------------
-# 7) падзел тэксту: хуткі + fallback
 # ---------------------------------------------------------
-_SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
-_WS = re.compile(r"\s+")
-def _fast_split(text: str, limit: int) -> List[str]:
-    text = text.strip()
-    if not text: return []
-    parts = []
-    start = 0
-    for m in _SENT_END.finditer(text):
-        end = m.end()
-        parts.append(text[start:end].strip())
-        start = end
-    if start < len(text): parts.append(text[start:].strip())
-    chunks = []
-    cur = ""
-    for s in parts:
-        if len(cur) + 1 + len(s) <= limit:
-            cur = (cur + " " + s).strip() if cur else s
-        else:
-            if cur: chunks.append(cur)
-            if len(s) <= limit:
-                cur = s
-            else:
-                w = _WS.split(s); acc = ""
-                for tok in w:
-                    if len(acc) + 1 + len(tok) <= limit:
-                        acc = (acc + " " + tok).strip() if acc else tok
-                    else:
-                        if acc: chunks.append(acc)
-                        acc = tok
-                if acc: cur = acc
-                else: cur = ""
-    if cur: chunks.append(cur)
-    return [c for c in chunks if c]
 def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[str]:
     text_in = text_in.strip()
     if not text_in: return []
-    parts: List[str] = []
-    if len(text_in) > FIRST_SEGMENT_LIMIT:
-        head = text_in[:FIRST_SEGMENT_LIMIT]
-        m = re.search(r".*[\.!\?…»)]", head)
-        if m and len(m.group(0)) > 30:
-            head = m.group(0)
-        tail = text_in[len(head):].lstrip()
-        parts.append(head)
-        text_for_rest = tail
-    else:
-        text_for_rest = text_in
-    if not text_for_rest: return parts or [text_in]
-    rest = _fast_split(text_for_rest, chunk_limit)
-    if not rest or sum(len(x) for x in rest) < int(0.6 * len(text_for_rest)):
-        try:
-            rest2 = split_sentence(text_for_rest, lang=lang_short, text_split_length=chunk_limit)
-            rest2 = [s.strip() for s in rest2 if s and s.strip()]
-            if rest2: rest = rest2
-        except Exception:
-            pass
-    return parts + (rest or [text_for_rest])
 # ---------------------------------------------------------
-# 8) TTS — стрим + фінальны файл + лагі
 # ---------------------------------------------------------
 @spaces.GPU(duration=60)
-def text_to_speech(belarusian_story, speaker_audio_file=None):
     t0 = time.perf_counter()
-    if not belarusian_story or str(belarusian_story).strip() == "":
-        raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
-    if not speaker_audio_file or (
-        not isinstance(speaker_audio_file, str)
-        and getattr(speaker_audio_file, "name", "") == ""
-    ):
-        speaker_audio_file = default_voice_file
-    text_in = str(belarusian_story).strip()
-    lang_short = "be"
     chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
     t_lat0 = time.perf_counter()
-    to_dev = "cuda:0" if torch.cuda.is_available() else None
-    gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
     t_lat1 = time.perf_counter()
     t_split0 = time.perf_counter()
@@ -423,39 +303,28 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
     t_split1 = time.perf_counter()
     server_metrics = {
-        "latents_s": (t_lat1 - t_lat0),
-        "text_split_s": (t_split1 - t_split0),
-        "gen_init_to_first_chunk_s": None,
-        "until_first_chunk_total_s": None,
-        "server_unaccounted_before_first_chunk_s": None,
-        "file_write_s": None,
     }
     yield ("", None, None, json.dumps(server_metrics))
-    full_audio_chunks: List[np.ndarray] = []
-    first_chunk_seen = False
     t_gen0 = time.perf_counter()
-    # <--- ВЫПРАЎЛЕННЕ: вернута простая і надзейная логіка апрацоўкі стрыму
     for part in texts:
         gen = XTTS_MODEL.generate(
             text=part, do_stream=True, language=lang_short,
             gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
-            min_buffer_s=RUNTIME_FIRST_CHUNK_S,
-            tokens_per_step=TOKENS_PER_STEP,
-            stream_chunk_size_s=RUNTIME_FIRST_CHUNK_S,
-            temperature=0.1, length_penalty=1.0, repetition_penalty=10.0,
-            top_k=10, top_p=0.3,
         )
-        # Мы выкарыстоўваем _chunker для ўсяго патоку, каб забяспечыць стабільны памер чанкаў і пазбегнуць паўз
-        for buf in _chunker(gen, sampling_rate, MIN_BUFFER_S):
             if not first_chunk_seen:
                 t_first = time.perf_counter()
-                server_metrics["gen_init_to_first_chunk_s"] = (t_first - t_gen0)
-                server_metrics["until_first_chunk_total_s"] = (t_first - t0)
-                known = server_metrics["latents_s"] + server_metrics["text_split_s"] + server_metrics["gen_init_to_first_chunk_s"]
-                other = server_metrics["until_first_chunk_total_s"] - known
-                server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, other)
                 first_chunk_seen = True
                 yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
             else:
@@ -467,49 +336,36 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
     t_w0 = time.perf_counter()
     full_audio = _merge_for_file(full_audio_chunks)
-    tmp = None
-    try:
-        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-        write(tmp.name, sampling_rate, full_audio.astype(np.float32))
-    except Exception as e:
-        raise gr.Error(f"Памылка пры запісе фінальнага WAV: {e}")
-    finally:
-        t_w1 = time.perf_counter()
-        server_metrics["file_write_s"] = (t_w1 - t_w0)
     yield ("__STOP__", tmp.name, tmp.name, json.dumps(server_metrics))
 # ---------------------------------------------------------
-# 9) UI (JavaScript застаецца без змен, ён працаваў правільна)
 # ---------------------------------------------------------
-examples = [
-    ["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", None],
-]
 with gr.Blocks() as demo:
     gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")
     with gr.Row():
         inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
         inp_voice = gr.Audio(type="filepath", label="Прыклад голасу (6–10 сек)", interactive=True)
     with gr.Row():
-        play_btn = gr.Button("▶️ Play (stream)")
-        stop_btn = gr.Button("⏹ Stop (stream)")
         run_btn = gr.Button("Згенераваць")
         gr.Markdown(f"**Sample rate:** {sampling_rate} Hz")
-    log_panel = gr.HTML(
-        value='<div id="wa-log" style="font-family:system-ui;font-size:12px;white-space:pre-line">[лог пусты]</div>',
-        label="Лагі плэера",
-    )
-    stream_pipe = gr.Textbox(value="", visible=False, label="stream_pipe")
-    log_pipe    = gr.Textbox(value="", visible=False, label="log_pipe")
-    final_file  = gr.File(label="Згенераваны WAV (спампаваць)")
-    final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
-    play_final_btn = gr.Button("▶️ Play Final")
     INIT_RESET_AND_PLAY_JS = f"""
 () => {{
@@ -517,9 +373,6 @@ with gr.Blocks() as demo:
   const AC = window.AudioContext || window.webkitAudioContext;
   if (!AC) return;
-  const PRIME_CHUNKS = 1;
-  let primeCounter = 0;
   function toSec(ms) {{ return (ms/1000); }}
   function fmtS(x) {{ return (x===null||x===undefined) ? "n/a" : x.toFixed(3) + " s"; }}
@@ -527,60 +380,49 @@ with gr.Blocks() as demo:
     const el = document.getElementById('wa-log');
     if (!el || !window.__wa || !window.__wa.meta) return;
     const m = window.__wa.meta;
     const lines = [];
     lines.push("Клік (Згенераваць): 0.000 s");
-    let click_to_first_chunk_s = null;
     if (m.t_first_push_ms) {{
-      click_to_first_chunk_s = toSec(m.t_first_push_ms - m.t_click_ms);
       lines.push("Першы чанк прыйшоў:   " + click_to_first_chunk_s.toFixed(3) + " s");
       if (m.t_first_audio_ms) {{
         lines.push("Пачатак прайгравання: " + (toSec(m.t_first_audio_ms - m.t_click_ms)).toFixed(3) + " s");
         lines.push("Затрымка (чанк→аўдыя): " + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + " s");
       }}
     }}
-    const s = (m.server || {{}});
-    lines.push("");
-    lines.push("— Серверныя метрыкі —");
     lines.push("Latents (умоўны голас):  " + fmtS(s.latents_s));
     lines.push("Падзел тэксту:           " + fmtS(s.text_split_s));
     lines.push("Ініт→1-ы чанк:           " + fmtS(s.gen_init_to_first_chunk_s));
     lines.push("Усё да 1-га чанка:       " + fmtS(s.until_first_chunk_total_s));
     lines.push("Іншая серверная апрац.:  " + fmtS(s.server_unaccounted_before_first_chunk_s));
     lines.push("Запіс WAV:               " + fmtS(s.file_write_s));
-    if (click_to_first_chunk_s !== null && s.until_first_chunk_total_s !== null) {{
-      let est_queue_net = click_to_first_chunk_s - s.until_first_chunk_total_s;
-      if (!isFinite(est_queue_net) || est_queue_net < 0) est_queue_net = 0;
-      lines.push("");
-      lines.push("Ацэнка чаргі ZeroGPU + сеткі: " + est_queue_net.toFixed(3) + " s");
-    }} else {{
-      lines.push("");
-      lines.push("Ацэнка чаргі ZeroGPU + сеткі: n/a");
     }}
-    lines.push("");
-    lines.push("Статус стриму: " + (window.__wa.playing ? "playing" : "stopped"));
-    el.textContent = lines.join("\\n");
-    try {{ console.log(lines.join("\\n")); }} catch (e) {{}}
   }}
   if (!window.__wa) {{
     const ctx = new AC({{ sampleRate }});
-    const bufferSize = 2048;
-    const node = ctx.createScriptProcessor(bufferSize, 0, 1);
-    let queue = [];
-    let playing = false;
-    let eos = false;
-    const meta = {{
-      t_click_ms: performance.now(),
-      t_first_push_ms: null,
-      t_first_audio_ms: null,
-      server: null,
-    }};
     node.onaudioprocess = (e) => {{
       const out = e.outputBuffer.getChannelData(0);
       let i = 0;
@@ -588,106 +430,72 @@ with gr.Blocks() as demo:
         if (queue.length === 0 || !playing) {{ out[i++] = 0.0; continue; }}
         let cur = queue[0];
         const take = Math.min(cur.length, out.length - i);
-        if (meta.t_first_audio_ms === null) {{
-          meta.t_first_audio_ms = performance.now();
-          logUpdate();
-        }}
         out.set(cur.subarray(0, take), i);
         i += take;
-        if (take === cur.length) queue.shift();
-        else queue[0] = cur.subarray(take);
-      }}
-      if (eos && queue.length === 0 && playing) {{
-        playing = false;
-        logUpdate();
       }}
     }};
     node.connect(ctx.destination);
     window.__wa = {{
-      ctx, node,
-      get playing() {{ return playing; }},
-      get eos() {{ return eos; }},
-      set eos(v) {{ eos = v; }},
-      meta,
       push: (f32) => {{
         queue.push(f32);
-        if (!meta.t_first_push_ms) {{
-          meta.t_first_push_ms = performance.now();
-          logUpdate();
-        }}
-        if (!playing && queue.length >= PRIME_CHUNKS) {{
-          window.__wa.start();
-        }}
       }},
-      start: async () => {{ try {{ await ctx.resume(); }} catch(e){{}} playing = true; logUpdate(); }},
-      stop: () => {{ playing = false; logUpdate(); }},
       reset: () => {{
         playing = false; eos = false; queue = [];
-        primeCounter = 0;
         meta.t_first_push_ms = null; meta.t_first_audio_ms = null;
         logUpdate();
       }},
       updateLog: logUpdate,
     }};
-  }} else {{
-    window.__wa.reset();
-    window.__wa.meta.t_click_ms = performance.now();
   }}
 }}
 """
-    STOP_JS = "() => { if (window.__wa) window.__wa.stop(); }"
-    PLAY_JS = "() => { if (window.__wa) window.__wa.start(); }"
     PUSH_JS = """
 (b64) => {
   if (!window.__wa || !b64) return;
-  if (b64 === "__STOP__") { window.__wa.eos = true; window.__wa.updateLog && window.__wa.updateLog(); return; }
   const bin = atob(b64);
-  const len = bin.length;
-  const buf = new ArrayBuffer(len);
   const view = new Uint8Array(buf);
-  for (let i=0;i<len;i++) view[i] = bin.charCodeAt(i);
   const f32 = new Float32Array(buf);
   window.__wa.push(f32);
 }
 """
     LOG_JS = """
 (js) => {
-  if (!window.__wa) return;
   try {
-    if (js) {
-      const obj = JSON.parse(js);
-      window.__wa.meta.server = obj;
-      window.__wa.updateLog && window.__wa.updateLog();
-    }
   } catch (e) {}
 }
 """
-    PLAY_FINAL_JS = """
-() => {
-  const host = document.getElementById('final-audio');
-  if (!host) return;
-  const audio = host.querySelector('audio');
-  if (audio) { try { audio.play(); } catch(e) {} }
-}
-"""
-    play_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_JS)
-    stop_btn.click(fn=None, inputs=[], outputs=[], js=STOP_JS)
-    run_btn.click(fn=None, inputs=[], outputs=[], js=INIT_RESET_AND_PLAY_JS)
-    run_btn.click(fn=text_to_speech, inputs=[inp_text, inp_voice], outputs=[stream_pipe, final_file, final_audio, log_pipe])
-    stream_pipe.change(fn=None, inputs=[stream_pipe], outputs=[], js=PUSH_JS)
-    log_pipe.change(fn=None, inputs=[log_pipe], outputs=[], js=LOG_JS)
-    play_final_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_FINAL_JS)
-    gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)
 if __name__ == "__main__":
     demo.launch()

 # =========================================================
 # 4) Streaming-канфіг
 # =========================================================
+# Значэнні па змаўчанні, якія цяпер будуць перавызначацца з UI
+INITIAL_MIN_BUFFER_S = 0.25
+MIN_BUFFER_S = 0.1
+RUNTIME_FIRST_CHUNK_S = 0.02
 FADE_S       = 0.004
 TOKENS_PER_STEP = 1
 ENABLE_TEXT_SPLITTING = True
+FIRST_SEGMENT_LIMIT = 160
 # -------------------- утыліты аўдыя ----------------------
 def _seconds_to_samples(sec: float, sr: int) -> int:
     rest = b[fade_n:]
     return np.concatenate([head, tail, rest], axis=0)
 def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any, **gen_kwargs) -> Iterator[np.ndarray]:
     sig = inspect.signature(model.inference_stream)
     call_kwargs = dict(text=text, language=language, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding)
         for out in generator:
             yield _to_np_audio(out)
 class NewTTSGenerationMixin:
     @torch.inference_mode()
     def generate(self: Xtts, text: Optional[str] = None, *, do_stream: bool = False, language: str = "be",
             for chunk in _native_stream(self, text, language, gpt_cond_latent, speaker_embedding, **local_kwargs):
                 yield chunk
             return
+        raise NotImplementedError("Fallback streaming is not implemented")
 def init_stream_support():
     Xtts.generate = NewTTSGenerationMixin.generate
 PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
 @dataclass(frozen=True)
+class LatentsMeta: model_id: str; gpt_cond_len: int; max_ref_len: int; sound_norm_refs: bool; xtts_git: str | None = None
 LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
 GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}
 def _latents_key(path: str | None, meta: LatentsMeta) -> str:
+    base = f"{os.path.abspath(path)}:{os.path.getmtime(path)}:{os.path.getsize(path)}" if path and os.path.exists(path) else "default_voice"
+    meta_str = json.dumps(meta.__dict__, sort_keys=True)
     return hashlib.md5((base + "|" + meta_str).encode("utf-8")).hexdigest()
+def _latents_disk_path(key: str) -> pathlib.Path: return PERSIST_LATENTS_DIR / f"{key}.pt"
+def _save_latents_to_disk(key: str, g: torch.Tensor, s: torch.Tensor): torch.save({"gpt_cond_latent": g.cpu(), "speaker_embedding": s.cpu()}, _latents_disk_path(key))
 def _load_latents_from_disk(key: str) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
     p = _latents_disk_path(key)
 def _compute_latents_cpu(path: str | None) -> Tuple[torch.Tensor, torch.Tensor]:
     with torch.inference_mode():
+        g, s = XTTS_MODEL.get_conditioning_latents(audio_path=path)
     return g.cpu(), s.cpu()
 def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+    meta = LatentsMeta(model_id=repo_id, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_len=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs, xtts_git=None)
     key = _latents_key(path, meta)
+    g, s = LATENT_CACHE.get(key) or _load_latents_from_disk(key) or (None, None)
+    if g is None:
+        g, s = _compute_latents_cpu(path)
+        _save_latents_to_disk(key, g, s)
+    LATENT_CACHE[key] = (g, s)
     if to_device and to_device.startswith("cuda"):
         dev_key = (key, to_device)
+        if dev_key in GPU_LATENT_CACHE: return GPU_LATENT_CACHE[dev_key]
+        g2, s2 = g.to(to_device, non_blocking=True), s.to(to_device, non_blocking=True)
         GPU_LATENT_CACHE[dev_key] = (g2, s2)
         return g2, s2
     return g, s
+try: _ = _latents_for(default_voice_file)
+except Exception as e: print(f"[warn] precompute default voice latents failed: {e}")
 # ---------------------------------------------------------
 # 6) буферы + base64
 def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
     if not chunks: return np.zeros((0,), dtype=np.float32)
     out = chunks[0]
+    for i in range(1, len(chunks)): out = _crossfade_concat(out, chunks[i], sampling_rate, FADE_S)
     return out
+def _chunker_with_initial_buffer(chunks: Iterable[np.ndarray], sr: int, initial_target_s: float, target_s: float) -> Iterable[np.ndarray]:
+    is_first = True
+    target_samples = _seconds_to_samples(initial_target_s, sr)
+    buffer_list, buffer_len = [], 0
     for c in chunks:
+        c_np = _to_np_audio(c)
+        if c_np.size == 0: continue
+        buffer_list.append(c_np); buffer_len += c_np.size
+        if buffer_len >= target_samples:
+            full_chunk = np.concatenate(buffer_list, axis=0)
+            yield full_chunk
+            buffer_list, buffer_len = [], 0
+            if is_first: is_first = False; target_samples = _seconds_to_samples(target_s, sr)
+    if buffer_len > 0: yield np.concatenate(buffer_list, axis=0)
+def _pcm_f32_to_b64(x: np.ndarray) -> str: return base64.b64encode(x.astype(np.float32).tobytes()).decode("ascii")
 # ---------------------------------------------------------
+# 7) падзел тэксту
 # ---------------------------------------------------------
 def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[str]:
+    # (Функцыя засталася без змен)
+    _SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
+    def _fast_split(text: str, limit: int) -> List[str]:
+        text = text.strip()
+        if not text: return []
+        parts = [s.strip() for s in _SENT_END.split(text) if s and s.strip()]
+        chunks, cur = [], ""
+        for s in parts:
+            if len(cur) + 1 + len(s) <= limit: cur = (cur + " " + s).strip() if cur else s
+            else:
+                if cur: chunks.append(cur)
+                cur = s
+        if cur: chunks.append(cur)
+        return chunks
     text_in = text_in.strip()
     if not text_in: return []
+    try:
+        return [s.strip() for s in split_sentence(text_in, lang=lang_short, text_split_length=chunk_limit) if s and s.strip()]
+    except Exception:
+        return _fast_split(text_in, chunk_limit)
 # ---------------------------------------------------------
+# 8) TTS — стрым-функцыя
 # ---------------------------------------------------------
 @spaces.GPU(duration=60)
+def text_to_speech(belarusian_story, speaker_audio_file, initial_buffer_s, subsequent_buffer_s):
     t0 = time.perf_counter()
+    if not belarusian_story or str(belarusian_story).strip() == "": raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
+    speaker_audio_file = speaker_audio_file or default_voice_file
+    text_in, lang_short = str(belarusian_story).strip(), "be"
     chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
     t_lat0 = time.perf_counter()
+    gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=device)
     t_lat1 = time.perf_counter()
     t_split0 = time.perf_counter()
     t_split1 = time.perf_counter()
     server_metrics = {
+        "latents_s": t_lat1 - t_lat0, "text_split_s": t_split1 - t_split0,
+        "initial_buffer_s": initial_buffer_s, "subsequent_buffer_s": subsequent_buffer_s,
     }
     yield ("", None, None, json.dumps(server_metrics))
+    full_audio_chunks, first_chunk_seen = [], False
     t_gen0 = time.perf_counter()
     for part in texts:
         gen = XTTS_MODEL.generate(
             text=part, do_stream=True, language=lang_short,
             gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
+            stream_chunk_size_s=RUNTIME_FIRST_CHUNK_S, temperature=0.1,
+            length_penalty=1.0, repetition_penalty=10.0, top_k=10, top_p=0.3,
         )
+        chunk_iterator = _chunker_with_initial_buffer(gen, sampling_rate, initial_buffer_s, subsequent_buffer_s)
+        for buf in chunk_iterator:
             if not first_chunk_seen:
                 t_first = time.perf_counter()
+                server_metrics["gen_init_to_first_chunk_s"] = t_first - t_gen0
+                server_metrics["until_first_chunk_total_s"] = t_first - t0
+                known = sum(v for k, v in server_metrics.items() if k.endswith("_s"))
+                server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, (t_first - t0) - known)
                 first_chunk_seen = True
                 yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
             else:
     t_w0 = time.perf_counter()
     full_audio = _merge_for_file(full_audio_chunks)
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+        write(tmp.name, sampling_rate, full_audio)
+    t_w1 = time.perf_counter()
+    server_metrics["file_write_s"] = t_w1 - t_w0
     yield ("__STOP__", tmp.name, tmp.name, json.dumps(server_metrics))
 # ---------------------------------------------------------
+# 9) UI
 # ---------------------------------------------------------
+examples = [["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", None, 0.25, 0.1]]
 with gr.Blocks() as demo:
     gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")
     with gr.Row():
         inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
         inp_voice = gr.Audio(type="filepath", label="Прыклад голасу (6–10 сек)", interactive=True)
+    # <--- ВЫПРАЎЛЕННЕ: Дададзены слайдары для налад
+    with gr.Accordion("Дадатковыя налады стрымінгу", open=True):
+        initial_buffer_slider = gr.Slider(minimum=0.1, maximum=1.0, value=INITIAL_MIN_BUFFER_S, step=0.05, label="Пачатковы буфер (с)", info="Большае значэнне памяншае рызыку паўзы на старце, але трохі павялічвае пачатковую затрымку.")
+        subsequent_buffer_slider = gr.Slider(minimum=0.05, maximum=0.5, value=MIN_BUFFER_S, step=0.01, label="Наступны буфер (с)", info="Меншае значэнне дае меншую агульную затрымку, але патрабуе больш стабільнай працы мадэлі.")
     with gr.Row():
         run_btn = gr.Button("Згенераваць")
         gr.Markdown(f"**Sample rate:** {sampling_rate} Hz")
+    log_panel = gr.HTML(value='<div id="wa-log" style="font-family:monospace;font-size:12px;white-space:pre-line">[лог пусты]</div>', label="Лагі плэера")
+    stream_pipe, log_pipe = gr.Textbox(visible=False), gr.Textbox(visible=False)
+    final_file = gr.File(label="Згенераваны WAV (спампаваць)")
+    final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False)
     INIT_RESET_AND_PLAY_JS = f"""
 () => {{
   const AC = window.AudioContext || window.webkitAudioContext;
   if (!AC) return;
   function toSec(ms) {{ return (ms/1000); }}
   function fmtS(x) {{ return (x===null||x===undefined) ? "n/a" : x.toFixed(3) + " s"; }}
     const el = document.getElementById('wa-log');
     if (!el || !window.__wa || !window.__wa.meta) return;
     const m = window.__wa.meta;
+    const s = (m.server || {{}});
     const lines = [];
     lines.push("Клік (Згенераваць): 0.000 s");
     if (m.t_first_push_ms) {{
+      const click_to_first_chunk_s = toSec(m.t_first_push_ms - m.t_click_ms);
       lines.push("Першы чанк прыйшоў:   " + click_to_first_chunk_s.toFixed(3) + " s");
       if (m.t_first_audio_ms) {{
         lines.push("Пачатак прайгравання: " + (toSec(m.t_first_audio_ms - m.t_click_ms)).toFixed(3) + " s");
         lines.push("Затрымка (чанк→аўдыя): " + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + " s");
       }}
     }}
+    // <--- ВЫПРАЎЛЕННЕ: Новы блок логаў
+    lines.push("\\n— Налады стрыму —");
+    lines.push("Пачатковы буфер (запыт):  " + fmtS(s.initial_buffer_s));
+    lines.push("Наступны буфер (запыт):   " + fmtS(s.subsequent_buffer_s));
+    if (m.chunk_durations && m.chunk_durations.length > 0) {{
+        lines.push("Працягласць 1-га чанка:    " + m.chunk_durations[0] + " s");
+        lines.push("Атрымана чанкаў:          " + m.chunk_durations.length);
+    }}
+    lines.push("\\n— Серверныя метрыкі —");
     lines.push("Latents (умоўны голас):  " + fmtS(s.latents_s));
     lines.push("Падзел тэксту:           " + fmtS(s.text_split_s));
     lines.push("Ініт→1-ы чанк:           " + fmtS(s.gen_init_to_first_chunk_s));
     lines.push("Усё да 1-га чанка:       " + fmtS(s.until_first_chunk_total_s));
     lines.push("Іншая серверная апрац.:  " + fmtS(s.server_unaccounted_before_first_chunk_s));
     lines.push("Запіс WAV:               " + fmtS(s.file_write_s));
+    if (m.t_first_push_ms && s.until_first_chunk_total_s) {{
+        let est_queue_net = toSec(m.t_first_push_ms - m.t_click_ms) - s.until_first_chunk_total_s;
+        lines.push("\\nАцэнка чаргі ZeroGPU + сеткі: " + Math.max(0, est_queue_net).toFixed(3) + " s");
     }}
+    lines.push("\\nСтатус стриму: " + (window.__wa.playing ? "playing" : "stopped"));
+    el.innerHTML = lines.join("\\n");
   }}
   if (!window.__wa) {{
     const ctx = new AC({{ sampleRate }});
+    const node = ctx.createScriptProcessor(4096, 0, 1);
+    let queue = [], playing = false, eos = false;
+    const meta = {{ t_click_ms: performance.now(), chunk_durations: [] }};
     node.onaudioprocess = (e) => {{
       const out = e.outputBuffer.getChannelData(0);
       let i = 0;
         if (queue.length === 0 || !playing) {{ out[i++] = 0.0; continue; }}
         let cur = queue[0];
         const take = Math.min(cur.length, out.length - i);
+        if (meta.t_first_audio_ms === null) {{ meta.t_first_audio_ms = performance.now(); logUpdate(); }}
         out.set(cur.subarray(0, take), i);
         i += take;
+        if (take === cur.length) queue.shift(); else queue[0] = cur.subarray(take);
       }}
+      if (eos && queue.length === 0 && playing) {{ playing = false; logUpdate(); }}
     }};
     node.connect(ctx.destination);
     window.__wa = {{
+      ctx, node, meta, playing, eos,
       push: (f32) => {{
         queue.push(f32);
+        if (!meta.t_first_push_ms) {{ meta.t_first_push_ms = performance.now(); }}
+        if (!playing && queue.length >= 1) {{ playing = true; try{{ctx.resume()}}catch(e){{}} }}
+        logUpdate();
       }},
       reset: () => {{
         playing = false; eos = false; queue = [];
+        meta.t_click_ms = performance.now();
         meta.t_first_push_ms = null; meta.t_first_audio_ms = null;
+        meta.chunk_durations = []; meta.server = null;
         logUpdate();
       }},
       updateLog: logUpdate,
     }};
   }}
+  window.__wa.reset();
 }}
 """
     # Browser-side handler attached to stream_pipe.change; it receives the
     # textbox value as its only argument.
     #   - No player (window.__wa) or an empty value: do nothing.
     #   - "__STOP__": set window.__wa.eos, refresh the log and return.
     #   - Anything else: base64-decode to bytes, view the bytes as a
     #     Float32Array, record the chunk duration (sample count divided by
     #     the AudioContext sample rate, kept as a 3-decimal string) in
     #     meta.chunk_durations, then queue the samples via window.__wa.push.
     # NOTE(review): assumes the payload is raw float32 PCM whose byte length
     # is a multiple of 4 -- confirm against the server-side encoder.
     # The "//" comment inside the string is runtime text and is left as is;
     # it reads "FIX: add the chunk-duration calculation to the logs".
     PUSH_JS = """
 (b64) => {
   if (!window.__wa || !b64) return;
+  if (b64 === "__STOP__") { window.__wa.eos = true; window.__wa.updateLog(); return; }
   const bin = atob(b64);
+  const buf = new ArrayBuffer(bin.length);
   const view = new Uint8Array(buf);
+  for (let i=0; i<bin.length; i++) view[i] = bin.charCodeAt(i);
   const f32 = new Float32Array(buf);
+  // <--- ВЫПРАЎЛЕННЕ: Дадаем разлік працягласці чанка ў логі
+  const duration = f32.length / window.__wa.ctx.sampleRate;
+  window.__wa.meta.chunk_durations.push(duration.toFixed(3));
   window.__wa.push(f32);
 }
 """
     # Browser-side handler attached to log_pipe.change. It parses the textbox
     # value as JSON, stores the result in window.__wa.meta.server and
     # refreshes the log panel. Any exception raised inside the try block
     # (invalid JSON, or an error while refreshing) is silently ignored.
     LOG_JS = """
 (js) => {
+  if (!window.__wa || !js) return;
   try {
+    window.__wa.meta.server = JSON.parse(js);
+    window.__wa.updateLog();
   } catch (e) {}
 }
 """
+    # <--- FIX: pass the slider values to the backend
     # Two independent listeners on the same button. The first has fn=None and
     # runs only the browser-side reset script; the second calls text_to_speech.
     # NOTE(review): `_js` is presumably the Gradio 3.x keyword (newer releases
     # name it `js`) -- confirm against the pinned Gradio version.
+    run_btn.click(fn=None, _js=INIT_RESET_AND_PLAY_JS)
     # Output order: stream_pipe, final_file, final_audio, log_pipe.
+    run_btn.click(
+        fn=text_to_speech,
+        inputs=[inp_text, inp_voice, initial_buffer_slider, subsequent_buffer_slider],
+        outputs=[stream_pipe, final_file, final_audio, log_pipe]
+    )
     # Hidden pipes: each value change is handed to its browser-side handler.
+    stream_pipe.change(fn=None, inputs=[stream_pipe], _js=PUSH_JS)
+    log_pipe.change(fn=None, inputs=[log_pipe], _js=LOG_JS)
     # Clickable examples fill the same four inputs; example caching is disabled.
+    gr.Examples(examples=examples, inputs=[inp_text, inp_voice, initial_buffer_slider, subsequent_buffer_slider], cache_examples=False)
 # Start the Gradio app only when this file is executed as a script.
 if __name__ == "__main__":
     demo.launch()