BexttsStream

Running on Zero

App Files Files Community

archivartaunik committed on Nov 16, 2025

Commit

36c434c

verified ·

1 Parent(s): 0f31af2

Update app.py

Browse files

Files changed (1) hide show

app.py +528 -343

app.py CHANGED Viewed

@@ -3,9 +3,18 @@ os.environ.setdefault("OMP_NUM_THREADS", "1")
 os.environ.setdefault("MKL_NUM_THREADS", "1")
 os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
-import sys, re, time, json, base64, hashlib, tempfile, subprocess, inspect, pathlib
-from typing import Iterable, Optional, Tuple, List
 from dataclasses import dataclass
 import spaces
 import gradio as gr
@@ -14,11 +23,13 @@ import numpy as np
 from huggingface_hub import hf_hub_download
 from scipy.io.wavfile import write
-# ----------------- 1. Кланаванне рэпазіторыя і ўсталяванне залежнасцяў -----------------
 REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
 REPO_DIR = "coqui-ai-TTS"
 if not os.path.exists(REPO_DIR):
-    print(f"Кланаванне рэпазіторыя {REPO_URL}...")
     subprocess.run(["git", "clone", REPO_URL, REPO_DIR], check=True)
 repo_root = os.path.abspath(REPO_DIR)
@@ -29,29 +40,38 @@ from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
-# ----------------- 2. Загрузка файлаў мадэлі -----------------
-print("Загрузка файлаў мадэлі XTTSv2...")
 repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
 model_dir = "./model"
 os.makedirs(model_dir, exist_ok=True)
-for fname in ("model.pth", "config.json", "vocab.json", "voice.wav"):
-    if not os.path.exists(os.path.join(model_dir, fname)):
-        hf_hub_download(repo_id, filename=fname, local_dir=model_dir)
 checkpoint_file = os.path.join(model_dir, "model.pth")
 config_file = os.path.join(model_dir, "config.json")
 vocab_file = os.path.join(model_dir, "vocab.json")
 default_voice_file = os.path.join(model_dir, "voice.wav")
-# ----------------- 3. Ініцыялізацыя мадэлі XTTS ------------------
-print("Ініцыялізацыя мадэлі XTTS...")
 config = XttsConfig()
 config.load_json(config_file)
 XTTS_MODEL = Xtts.init_from_config(config)
-XTTS_MODEL.load_checkpoint(config, checkpoint_path=checkpoint_file, vocab_path=vocab_file, use_deepspeed=False)
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
-print(f"Мадэль будзе працаваць на: {device.upper()}")
 torch.set_num_threads(1)
 if device.startswith("cuda"):
     torch.backends.cuda.matmul.allow_tf32 = True
@@ -64,35 +84,35 @@ sampling_rate = int(XTTS_MODEL.config.audio["sample_rate"])
 tokenizer = VoiceBpeTokenizer(vocab_file=vocab_file)
 XTTS_MODEL.tokenizer = tokenizer
-print("Мадэль паспяхова загружана.")
-# ----------------- Канфігурацыя стрымінгу -------------------
-FORCE_FALLBACK_STREAM = True
-# Серверныя налады
-DEF_MIN_BUFFER_S = 0.14
-DEF_FIRST_CHUNK_S = 0.10
-DEF_TOKENS_PER_STEP = 2
-DEF_ENABLE_TEXT_SPLIT = False
-DEF_FIRST_SEGMENT_LIMIT = 160
-FADE_S = 0.004
-# Кліенцкія налады
-DEF_CLIENT_PREROLL = 0.30
-DEF_CLIENT_LOWWM = 0.06
-MAX_CLIENT_PREROLL = 0.40
-STEP_CLIENT_PREROLL = 0.04
-# ----------------- Дапаможныя функцыі для аўдыя ----------------
-def _seconds_to_samples(sec: float, sr: int) -> int: return max(1, int(sec * sr))
 def _to_np_audio(x) -> np.ndarray:
-    if isinstance(x, dict) and "wav" in x: x = x["wav"]
     if isinstance(x, torch.Tensor):
-        if x.dtype != torch.float32: x = x.float()
-        return x.detach().cpu().contiguous().view(-1).numpy()
     x = np.asarray(x)
-    if x.ndim > 1: x = x.reshape(-1)
-    return x.astype(np.float32, copy=False) if x.dtype != np.float32 else x
 def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> np.ndarray:
     if a.size == 0: return b.astype(np.float32, copy=False)
@@ -100,418 +120,583 @@ def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> n
     a = a.astype(np.float32, copy=False); b = b.astype(np.float32, copy=False)
     fade_n = min(_seconds_to_samples(fade_s, sr), a.size, b.size)
     if fade_n <= 1: return np.concatenate([a, b], axis=0)
-    fade_out = np.linspace(1.0, 0.0, fade_n, dtype=np.float32); fade_in = 1.0 - fade_out
-    head = a[:-fade_n]; tail = a[-fade_n:] * fade_out + b[:fade_n] * fade_in; rest = b[fade_n:]
     return np.concatenate([head, tail, rest], axis=0)
-# ----------------- Логіка стрымінгу -----------------
 def _bpe_prefixes(text: str, lang: str, step_tokens: int):
     try:
-        ids = tokenizer.encode(text, lang=lang); n = len(ids)
-        for k in range(step_tokens, n + 1, step_tokens): yield tokenizer.decode(ids[:k], lang=lang)
-        if n % step_tokens != 0: yield tokenizer.decode(ids, lang=lang); return
-    except Exception: pass
-    pseudo = re.findall(r"\S+|\s+", text); acc = ""
-    for i in range(0, len(pseudo), step_tokens): acc = "".join(pseudo[: i + step_tokens]); yield acc
-    if acc.strip() != text.strip(): yield text
-def _fallback_incremental(model: Xtts, text: str, language: str, gpt_cond_latent, speaker_embedding, tokens_per_step: int, **gen_kwargs):
     emitted = 0
-    step = 0
     for prefix in _bpe_prefixes(text, language, tokens_per_step):
-        t0 = time.perf_counter()
-        autocast_ctx = torch.autocast("cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
         with torch.inference_mode(), autocast_ctx:
             out = model.inference(
-                text=prefix, language=language, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
-                temperature=gen_kwargs.get("temperature", 0.1), length_penalty=1.0, repetition_penalty=10.0,
-                top_k=10, top_p=0.3,
             )
         wav = _to_np_audio(out)
-        new_part = wav[emitted:]
-        emitted = wav.size
-        t1 = time.perf_counter()
-        yield {"__DBG__": f"[srv] fb_step={step} tps={tokens_per_step} new_s={new_part.size/sampling_rate:.3f} "
-                          f"total_s={emitted/sampling_rate:.3f} dt_inf={t1-t0:.3f}s"}
-        step += 1
-        if new_part.size:
-            yield new_part
 class NewTTSGenerationMixin:
     @torch.inference_mode()
     def generate(self: Xtts, text: Optional[str] = None, *, do_stream: bool = False, language: str = "be",
-                 gpt_cond_latent=None, speaker_embedding=None, **gen_kwargs):
-        assert isinstance(text, str) and text.strip()
         if not do_stream:
-            autocast_ctx = torch.autocast("cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
             with autocast_ctx:
-                out = self.inference(text=text, language=language, gpt_cond_latent=gpt_cond_latent,
-                                     speaker_embedding=speaker_embedding, temperature=gen_kwargs.get("temperature", 0.1),
-                                     length_penalty=1.0, repetition_penalty=10.0, top_k=10, top_p=0.3)
             return _to_np_audio(out)
-        return self.sample_stream(text=text, language=language, gpt_cond_latent=gpt_cond_latent,
-                                  speaker_embedding=speaker_embedding, **gen_kwargs)
     @torch.inference_mode()
-    def sample_stream(self: Xtts, *, text: str, language: str, gpt_cond_latent, speaker_embedding, **gen_kwargs):
-        if FORCE_FALLBACK_STREAM or not hasattr(self, "inference_stream"):
-            yield from _fallback_incremental(self, text, language, gpt_cond_latent, speaker_embedding, **gen_kwargs)
-        else: # Native stream is not used, but kept for reference
-            local_kwargs = dict(gen_kwargs)
-            autocast_ctx = torch.autocast("cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
-            with torch.inference_mode(), autocast_ctx:
-                for out in self.inference_stream(text, language, gpt_cond_latent, speaker_embedding, **local_kwargs):
-                    yield _to_np_audio(out)
 def init_stream_support():
     Xtts.generate = NewTTSGenerationMixin.generate
     Xtts.sample_stream = NewTTSGenerationMixin.sample_stream
 init_stream_support()
-# ----------------- Кэшаванне "голасу" (latents) -----------------
 PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
 PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
 @dataclass(frozen=True)
 class LatentsMeta:
-    model_id: str; gpt_cond_len: int; max_ref_len: int; sound_norm_refs: bool
 LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
 GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}
 def _latents_key(path: str | None, meta: LatentsMeta) -> str:
-    base = f"{os.path.abspath(path)}:{os.path.getmtime(path)}:{os.path.getsize(path)}" if path and os.path.exists(path) else "default_voice"
-    meta_str = json.dumps(meta.__dict__, sort_keys=True)
     return hashlib.md5((base + "|" + meta_str).encode("utf-8")).hexdigest()
-def _latents_disk_path(key: str) -> pathlib.Path: return PERSIST_LATENTS_DIR / f"{key}.pt"
-def _save_latents_to_disk(key: str, gpt, spk): torch.save({"gpt_cond_latent": gpt.cpu(), "speaker_embedding": spk.cpu()}, _latents_disk_path(key))
-def _load_latents_from_disk(key: str):
     p = _latents_disk_path(key)
     if not p.exists(): return None
-    obj = torch.load(p, map_location="cpu"); return obj["gpt_cond_latent"], obj["speaker_embedding"]
-def _compute_latents_cpu(path: str | None):
     with torch.inference_mode():
-        g, s = XTTS_MODEL.get_conditioning_latents(audio_path=path, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_length=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs)
     return g.cpu(), s.cpu()
-def _latents_for(path: str | None, *, to_device: Optional[str] = None):
-    meta = LatentsMeta(repo_id, XTTS_MODEL.config.gpt_cond_len, XTTS_MODEL.config.max_ref_len, XTTS_MODEL.config.sound_norm_refs)
     key = _latents_key(path, meta)
-    if key in LATENT_CACHE: g,s = LATENT_CACHE[key]
     else:
         loaded = _load_latents_from_disk(key)
-        if loaded is None: g,s = _compute_latents_cpu(path); _save_latents_to_disk(key, g, s)
-        else: g,s = loaded
-        LATENT_CACHE[key]=(g,s)
     if to_device and to_device.startswith("cuda"):
-        dev_key=(key,to_device)
-        if dev_key in GPU_LATENT_CACHE: return GPU_LATENT_CACHE[dev_key]
-        g2=g.to(to_device, non_blocking=True); s2=s.to(to_device, non_blocking=True); GPU_LATENT_CACHE[dev_key]=(g2,s2); return g2,s2
-    return g,s
-try: _ = _latents_for(default_voice_file)
-except Exception as e: print(f"[warn] Памылка пры папярэднім разліку голасу па змаўчанні: {e}")
-# ----------------- Утыліты для апрацоўкі стрыму -----------------
 def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
     if not chunks: return np.zeros((0,), dtype=np.float32)
     out = chunks[0]
-    for i in range(1, len(chunks)): out = _crossfade_concat(out, chunks[i], sampling_rate, FADE_S)
     return out
 def _chunker(chunks: Iterable[np.ndarray], sr: int, target_s: float) -> Iterable[np.ndarray]:
-    target_samples = _seconds_to_samples(target_s, sr); buf = np.zeros((0,), dtype=np.float32)
     for c in chunks:
-        if isinstance(c, dict) and "__DBG__" in c: yield c; continue
         c = _to_np_audio(c)
         if c.size == 0: continue
         buf = c if buf.size == 0 else _crossfade_concat(buf, c, sr, FADE_S)
-        if buf.size >= target_samples: yield buf; buf = np.zeros((0,), dtype=np.float32)
     if buf.size: yield buf
 def _pcm_f32_to_b64(x: np.ndarray) -> str:
     if x.dtype != np.float32: x = x.astype(np.float32, copy=False)
     return base64.b64encode(x.tobytes()).decode("ascii")
-# ----------------- Утыліты для падзелу тэксту -----------------
 _SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
 _WS = re.compile(r"\s+")
 def _fast_split(text: str, limit: int) -> List[str]:
-    text = text.strip(); parts=[]; chunks=[]; cur=""
     if not text: return []
-    start=0
-    for m in _SENT_END.finditer(text): parts.append(text[start:m.end()].strip()); start=m.end()
     if start < len(text): parts.append(text[start:].strip())
     for s in parts:
-        if len(cur)+1+len(s) <= limit: cur = (cur+" "+s).strip() if cur else s
         else:
             if cur: chunks.append(cur)
-            if len(s)<=limit: cur=s
             else:
-                w=_WS.split(s); acc=""
                 for tok in w:
-                    if len(acc)+1+len(tok)<=limit: acc=(acc+" "+tok).strip() if acc else tok
                     else:
-                        if acc: chunks.append(acc); acc=tok
-                cur=acc
     if cur: chunks.append(cur)
     return [c for c in chunks if c]
-def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int, first_segment_limit: int) -> List[str]:
     text_in = text_in.strip()
     if not text_in: return []
-    parts=[]
-    if len(text_in)>first_segment_limit:
-        head=text_in[:first_segment_limit]; m=re.search(r".*[\.!\?…»)]", head)
-        if m and len(m.group(0))>30: head=m.group(0)
-        tail=text_in[len(head):].lstrip(); parts.append(head); text_for_rest=tail
-    else: text_for_rest=text_in
     if not text_for_rest: return parts or [text_in]
-    rest=_fast_split(text_for_rest, chunk_limit)
-    if not rest or sum(len(x) for x in rest) < int(0.6*len(text_for_rest)):
         try:
             rest2 = split_sentence(text_for_rest, lang=lang_short, text_split_length=chunk_limit)
             rest2 = [s.strip() for s in rest2 if s and s.strip()]
             if rest2: rest = rest2
-        except Exception: pass
     return parts + (rest or [text_for_rest])
-# ----------------- 4. Асноўная функцыя для Gradio -----------------
 @spaces.GPU(duration=60)
-def text_to_speech(
-    belarusian_story, speaker_audio_file=None,
-    min_buffer_s: float = DEF_MIN_BUFFER_S,
-    first_chunk_s: float = DEF_FIRST_CHUNK_S,
-    enable_text_splitting: bool = DEF_ENABLE_TEXT_SPLIT,
-    tokens_per_step: int = DEF_TOKENS_PER_STEP,
-    first_segment_limit: int = DEF_FIRST_SEGMENT_LIMIT,
-):
-    print("--- Python function 'text_to_speech' STARTED ---") # Дыягнастычнае паведамленне
     t0 = time.perf_counter()
     if not belarusian_story or str(belarusian_story).strip() == "":
-        raise gr.Error("Увядзіце тэкст для агучвання.")
-    if not speaker_audio_file or (not isinstance(speaker_audio_file, str) and getattr(speaker_audio_file, "name", "") == ""):
         speaker_audio_file = default_voice_file
     text_in = str(belarusian_story).strip()
     lang_short = "be"
     chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
     t_lat0 = time.perf_counter()
     to_dev = "cuda:0" if torch.cuda.is_available() else None
     gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
     t_lat1 = time.perf_counter()
     t_split0 = time.perf_counter()
-    texts = [text_in] if not enable_text_splitting else (_split_text_smart(text_in, lang_short, chunk_limit, int(first_segment_limit)) or [text_in])
     t_split1 = time.perf_counter()
-    server_metrics = {"latents_s": (t_lat1 - t_lat0), "text_split_s": (t_split1 - t_split0), "gen_init_to_first_chunk_s": None, "until_first_chunk_total_s": None, "server_unaccounted_before_first_chunk_s": None, "file_write_s": None, "backend": "fallback", "sr": sampling_rate, "min_buffer_s": float(min_buffer_s), "first_chunk_s": float(first_chunk_s), "tokens_per_step": int(tokens_per_step)}
-    yield ("", None, None, json.dumps(server_metrics), "[srv] start")
-    full_audio_chunks = []
     try:
-        first_chunk_seen=False; t_gen0=time.perf_counter()
-        chunk_idx = 0; last_emit_t = time.perf_counter(); cum_sec = 0.0
-        for part in texts:
-            yield (None, None, None, None, f"[srv] part_start chars={len(part)}")
-            gen = XTTS_MODEL.generate(text=part, do_stream=True, language=lang_short, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding, tokens_per_step=int(tokens_per_step))
-            for piece in _chunker(gen, sampling_rate, float(min_buffer_s)):
-                if isinstance(piece, dict) and "__DBG__" in piece:
-                    yield (None, None, None, None, piece["__DBG__"]); continue
-                now = time.perf_counter()
-                dt_emit = now - last_emit_t; last_emit_t = now
-                buf = _to_np_audio(piece)
-                if buf.size == 0: continue
-                sec = buf.size / sampling_rate; chunk_idx += 1; cum_sec += sec
-                full_audio_chunks.append(buf)
-                if not first_chunk_seen:
-                    t_first = time.perf_counter()
-                    server_metrics["gen_init_to_first_chunk_s"] = (t_first - t_gen0)
-                    server_metrics["until_first_chunk_total_s"] = (t_first - t0)
-                    known = server_metrics["latents_s"] + server_metrics["text_split_s"] + server_metrics["gen_init_to_first_chunk_s"]
-                    server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, server_metrics["until_first_chunk_total_s"] - known)
-                    first_chunk_seen=True
-                    yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics), f"[srv] first_chunk idx=1 sec={sec:.3f} cum={cum_sec:.3f} dt_emit={dt_emit:.3f}")
-                else:
-                    yield (_pcm_f32_to_b64(buf), None, None, None, f"[srv] chunk idx={chunk_idx} sec={sec:.3f} cum={cum_sec:.3f} dt_emit={dt_emit:.3f}")
-            yield (None, None, None, None, "[srv] part_end")
     finally:
-        if not full_audio_chunks:
-            yield ("__STOP__", None, None, json.dumps(server_metrics), "[srv] no_chunks"); return
-        t_w0 = time.perf_counter()
-        full_audio = _merge_for_file(full_audio_chunks)
-        tmp_path = None
-        try:
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-                write(tmp.name, sampling_rate, full_audio.astype(np.float32)); tmp_path = tmp.name
-        except Exception as e: raise gr.Error(f"Памылка пры запісе фінальнага WAV: {e}")
-        finally:
-            t_w1 = time.perf_counter(); server_metrics["file_write_s"] = (t_w1 - t_w0)
-        yield ("__STOP__", tmp_path, tmp_path, json.dumps(server_metrics), f"[srv] file_ready dur={full_audio.size/sampling_rate:.3f}s")
-# ----------------- 5. Карыстальніцкі інтэрфейс (UI) Gradio ------------------------
-examples=[["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", "Nestarka.wav"]]
 with gr.Blocks() as demo:
-    gr.Markdown("## Беларускі TTS — Стрымінг + фінальны файл")
     with gr.Row():
         inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
         inp_voice = gr.Audio(type="filepath", label="Прыклад голасу (6–10 сек)", interactive=True)
-    with gr.Accordion("Налады", open=False):
-        gr.Markdown("### Кліенцкія (прайграванне ў браўзеры)");
-        with gr.Row():
-            ui_preroll = gr.Slider(0.08, 0.40, value=DEF_CLIENT_PREROLL, step=0.01, label="PREROLL (сек.)", elem_id="preroll_slider", interactive=True)
-            ui_lowwm = gr.Slider(0.02, 0.15, value=DEF_CLIENT_LOWWM, step=0.005, label="Ніжні ўзровень (сек.)", elem_id="lowwm_slider", interactive=True)
-        with gr.Row(): apply_btn = gr.Button("Прымяніць налады"); reset_btn = gr.Button("Скінуць налады")
-        gr.Markdown("### Серверныя (генерацыя гуку)")
-        with gr.Row():
-            ui_minbuf = gr.Slider(0.03, 0.25, value=DEF_MIN_BUFFER_S, step=0.005, label="Памер сервернага чанка (сек.)", interactive=True)
-            ui_firstch = gr.Slider(0.02, 0.16, value=DEF_FIRST_CHUNK_S, step=0.005, label="Памер першага чанка (сек.)", interactive=True)
-        with gr.Row():
-            ui_tokens = gr.Slider(1, 6, value=DEF_TOKENS_PER_STEP, step=1, label="Tokens per step (fallback)", interactive=True)
-            ui_split = gr.Checkbox(value=DEF_ENABLE_TEXT_SPLIT, label="Падзяляць тэкст на сказы", interactive=True)
-            ui_firstseg = gr.Slider(80, 300, value=DEF_FIRST_SEGMENT_LIMIT, step=5, label="Ліміт для першага сегменту", interactive=True)
     with gr.Row():
-        run_btn = gr.Button("▶️ Згенераваць і прайграць (стрым)")
-        stop_btn = gr.Button("⏹ Спыніць")
-        gr.Markdown(f"**Частата дыскрэтызацыі:** {sampling_rate} Гц")
-    # ВЫРАШЭННЕ ПРАБЛЕМЫ: Дадаем схаваны кампанент-трыгер
-    # Кнопка будзе запісваць у яго значэнне, а змена гэтага значэння запусціць Python-код.
-    # Гэта робіць ланцужок падзей JS -> Python больш надзейным.
-    js_trigger = gr.Textbox(value="0", visible=False)
-    log_panel = gr.HTML(value='<div id="wa-log" style="font-family:system-ui;font-size:12px;white-space:pre-line">[лог пусты]</div>', label="Статыстыка")
-    log_debug = gr.HTML(value='<div id="wa-dbg" style="font-family:ui-monospace,Menlo,Consolas,monospace;font-size:12px;white-space:pre;max-height:260px;overflow:auto;border:1px solid #ddd;padding:6px;border-radius:6px;">[дыягностыка пустая]</div>', label="Дыягностыка стрыму")
-    stream_pipe = gr.Textbox(value="", visible=False); log_pipe = gr.Textbox(value="", visible=False); srv_dbg_pipe = gr.Textbox(value="", visible=False)
     final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
-    final_file  = gr.File(label="Спампаваць згенераваны WAV")
-    AUDIO_WORKLET_PROCESSOR = r"""
-class StreamBufferProcessor extends AudioWorkletProcessor{constructor(){super();this.queue=[];this.readIndex=0;this.bufferedSamples=0;this.started=!1;this.wasStartedOnce=!1;this.thresholdSamples=0;this.lowWatermarkSamples=0;this.underrunSent=!1;this.bufferingStarted=!1;this.postDbg=a=>{try{this.port.postMessage({type:"dbg",...a})}catch(b){}};this.port.onmessage=a=>{const b=a.data||{};"push"===b.type?(a=new Float32Array(b.buffer),this.queue.push(a),this.bufferedSamples+=a.length,this.bufferingStarted||0>=this.bufferedSamples||(this.bufferingStarted=!0,this.port.postMessage({type:"buffer_start",bufferedSamples:this.bufferedSamples,ctxSR:sampleRate})),this.postDbg({ev:"push",len:a.length,buffered:this.bufferedSamples,qlen:this.queue.length})):"reset"===b.type?(this.queue=[],this.readIndex=0,this.bufferedSamples=0,this.started=!1,this.wasStartedOnce=!1,this.underrunSent=!1,this.bufferingStarted=!1,this.postDbg({ev:"reset"})):"set_thresholds"===b.type&&(this.thresholdSamples=b.thresholdSamples|0,this.lowWatermarkSamples=b.lowWatermarkSamples|0,this.port.postMessage({type:"thresholds_ready",thresholdSamples:this.thresholdSamples,lowWatermarkSamples:this.lowWatermarkSamples,ctxSR:sampleRate}),this.postDbg({ev:"thresholds",thr:this.thresholdSamples,lowwm:this.lowWatermarkSamples}))}}process(a,b,c){const d=b[0][0];let e=0;if(!this.started){if(this.bufferedSamples>= 
(this.wasStartedOnce?Math.max(128,this.lowWatermarkSamples):this.thresholdSamples)){if(this.started=!0,b=this.wasStartedOnce?"lowwm_restart":"preroll_start",!this.wasStartedOnce&&(this.wasStartedOnce=!0,this.port.postMessage({type:"first_audio",bufferedSamples:this.bufferedSamples,ctxSR:sampleRate})),this.postDbg({ev:"start",reason:b,buffered:this.bufferedSamples}),0===this.queue.length)return!0}else{for(;e<d.length;)d[e++]=0;return!0}}for(;e<d.length;){if(0===this.queue.length){this.underrunSent||(this.underrunSent=!0,this.port.postMessage({type:"underrun"}));this.started=!1;this.postDbg({ev:"stop",reason:"empty_queue"});for(;e<d.length;)d[e++]=0;return!0}b=this.queue[0];c=b.length-this.readIndex;const f=Math.min(c,d.length-e);d.set(b.subarray(this.readIndex,this.readIndex+f),e);e+=f;this.readIndex+=f;this.bufferedSamples-=f;this.readIndex>=b.length&&(this.queue.shift(),this.readIndex=0)}return!0}}registerProcessor("stream-buffer",StreamBufferProcessor);
-"""
-    INIT_AND_RUN_JS = """
-() => {
   const AC = window.AudioContext || window.webkitAudioContext;
-  if (!AC) { console.error("Web Audio API не падтрымліваецца"); return "error"; }
-  if (window.__wa && window.__wa.reset) {
-    window.__wa.reset();
-    return "reset_" + new Date().getTime();
-  }
-  function getLocalFloat(key, defVal) {
-    try { const v = parseFloat(localStorage.getItem(key)); if (isFinite(v) && v > 0) return v; } catch(e) {}
-    return defVal;
-  }
-  const DEFAULT_PREROLL = __DEF_PR__, MAX_PREROLL = __MAX_PR__, STEP_PREROLL = __STEP_PR__, DEFAULT_LOWWM = __DEF_LW__;
-  let PREROLL_S = getLocalFloat("tts_preroll_s", DEFAULT_PREROLL);
-  let LOW_WM_S  = getLocalFloat("tts_lowwm_s", DEFAULT_LOWWM);
-  const blob = new Blob([`__AW_CODE__`], { type: 'application/javascript' });
-  const url  = URL.createObjectURL(blob);
-  const ctx = new AC({ sampleRate: __SR__ });
-  const meta = { t_click_ms: performance.now(), t_first_push_ms: null, t_first_audio_ms: null, server: null };
-  let workletNode = null, gate = null, connected = false, ready = false;
-  const pending = [], dbgLines = [];
-  function dbg(obj) {
-    try {
-      const ts = ((performance.now())/1000).toFixed(3), s = (typeof obj === 'string') ? obj : JSON.stringify(obj);
-      dbgLines.push(ts + " " + s);
-      while (dbgLines.length > 200) dbgLines.shift();
-      const el = document.getElementById('wa-dbg'); if (el) { el.textContent = dbgLines.join("\\n"); el.scrollTop = el.scrollHeight; }
-      console.log("[AW]", s);
-    } catch(e) {}
-  }
-  function p3(x) { return (x==null)?'n/a':x.toFixed(3)+' s'; }
-  function logUpdate() {
-    const el = document.getElementById('wa-log'); if (!el) return;
-    const s = meta.server || {};
-    el.textContent = [
-      "Клік -> Першы чанк: " + p3(meta.t_first_push_ms ? (meta.t_first_push_ms - meta.t_click_ms)/1000 : null),
-      "Клік -> Пачатак гуку: " + p3(meta.t_first_audio_ms ? (meta.t_first_audio_ms - meta.t_click_ms)/1000 : null),
-      "", "--- Сервер ---", "Latents (голас): " + p3(s.latents_s), "Ініт. генер. -> 1-ы чанк: " + p3(s.gen_init_to_first_chunk_s),
-      "Агульны час да 1-га чанка: " + p3(s.until_first_chunk_total_s),
-      "Ацэнка сеткі/чаргі: " + p3((meta.t_first_push_ms&&s.until_first_chunk_total_s)?(meta.t_first_push_ms/1000-meta.t_click_ms/1000-s.until_first_chunk_total_s):null),
-      "", "--- Кліент ---", "Статус: " + (connected ? "playing" : "stopped"), "PREROLL: " + PREROLL_S.toFixed(3) + " s | LOW WM: " + LOW_WM_S.toFixed(3) + " s",
-    ].join("\\n");
-  }
-  (async () => {
-    await ctx.audioWorklet.addModule(url);
-    workletNode = new AudioWorkletNode(ctx, 'stream-buffer');
-    gate = ctx.createGain(); gate.gain.value = 1.0;
-    workletNode.connect(gate);
-    workletNode.port.onmessage = (e) => {
-      const msg = e.data || {};
-      if (msg.type === 'thresholds_ready') {
-        ready = true;
-        for (const f32 of pending) { workletNode.port.postMessage({ type:'push', buffer:f32.buffer }, [f32.buffer]); }
-        pending.length = 0; logUpdate(); dbg({ev:'thresholds_ready', ...msg});
-      } else if (msg.type === 'buffer_start' && meta.t_first_push_ms === null) { meta.t_first_push_ms = performance.now(); dbg({ev:'buffer_start'});
-      } else if (msg.type === 'first_audio' && meta.t_first_audio_ms === null) { meta.t_first_audio_ms = performance.now(); logUpdate(); dbg({ev:'first_audio'});
-      } else if (msg.type === 'dbg') { dbg(msg); }
-    };
-    workletNode.port.postMessage({ type: 'set_thresholds', thresholdSamples: Math.floor(PREROLL_S * ctx.sampleRate), lowWatermarkSamples: Math.floor(LOW_WM_S * ctx.sampleRate) });
-    window.__wa = {
-      push: async (f32) => {
-        try { await ctx.resume(); } catch(e) {}
-        if (!ready) { pending.push(f32); } else { workletNode.port.postMessage({ type:'push', buffer:f32.buffer }, [f32.buffer]); }
-        if (!connected) { try { gate.connect(ctx.destination); connected = true; } catch(e){} }
-        logUpdate();
-      },
-      stop: () => { if (connected) { try { gate.disconnect(); } catch(e) {} connected=false; logUpdate(); } },
-      reset: () => {
-        if(connected){ gate.disconnect(); connected=false; }
-        workletNode.port.postMessage({ type:'reset' });
-        meta.t_click_ms = performance.now(); meta.t_first_push_ms = null; meta.t_first_audio_ms = null;
         logUpdate();
-      },
-      applyClient: (pr, lw) => {
-        PREROLL_S = pr; LOW_WM_S = lw;
-        try { localStorage.setItem("tts_preroll_s", String(pr)); localStorage.setItem("tts_lowwm_s", String(lw)); } catch(e) {}
-        if (workletNode) { workletNode.port.postMessage({ type:'set_thresholds', thresholdSamples: Math.floor(PREROLL_S * ctx.sampleRate), lowWatermarkSamples: Math.floor(LOW_WM_S * ctx.sampleRate) }); }
         logUpdate();
-      }, meta, updateLog: logUpdate
-    };
-    logUpdate();
-  })();
-  return "init_" + new Date().getTime();
-}
 """
     STOP_JS = "() => { if (window.__wa) window.__wa.stop(); }"
-    APPLY_JS = """
-() => {
-  const p = document.getElementById('preroll_slider')?.querySelector('input[type="range"]');
-  const l = document.getElementById('lowwm_slider')?.querySelector('input[type="range"]');
-  const pr = p && p.value ? parseFloat(p.value) : 0.30;
-  const lw = l && l.value ? parseFloat(l.value) : 0.06;
-  if (window.__wa && window.__wa.applyClient) { window.__wa.applyClient(pr, lw); }
-}"""
-    RESET_JS = "(() => { try { localStorage.removeItem('tts_preroll_s'); localStorage.removeItem('tts_lowwm_s'); } catch(e) {} window.location.reload(); })()"
     PUSH_JS = """
 (b64) => {
   if (!window.__wa || !b64) return;
-  if (b64 === "__STOP__") { window.__wa.updateLog && window.__wa.updateLog(); return; }
-  const bin = atob(b64); const len = bin.length; const buf = new ArrayBuffer(len); const view = new Uint8Array(buf);
   for (let i=0;i<len;i++) view[i] = bin.charCodeAt(i);
-  const f32 = new Float32Array(buf); window.__wa.push(f32);
-}"""
     LOG_JS = """
-(js) => { if (!window.__wa) return; try { if (js) { const obj = JSON.parse(js); window.__wa.meta.server = obj; window.__wa.updateLog && window.__wa.updateLog(); } } catch (e) {} }
 """
-    SRV_DBG_JS = """
-(line) => {
-  if (!line) return; try { const el = document.getElementById('wa-dbg'); if (!el) return;
-    const prev = el.textContent.startsWith("[") ? "" : el.textContent; const lines = (prev ? prev.split("\\n") : []);
-    lines.push(line); while (lines.length > 200) lines.shift();
-    el.textContent = lines.join("\\n"); el.scrollTop = el.scrollHeight; console.log("[SRV]", line);
-  } catch(e) {}
-}"""
-    INIT_AND_RUN_JS = INIT_AND_RUN_JS.replace("__AW_CODE__", AUDIO_WORKLET_PROCESSOR).replace("__SR__", str(sampling_rate)).replace("__DEF_PR__", str(DEF_CLIENT_PREROLL)).replace("__MAX_PR__", str(MAX_CLIENT_PREROLL)).replace("__STEP_PR__", str(STEP_CLIENT_PREROLL)).replace("__DEF_LW__", str(DEF_CLIENT_LOWWM))
-    # ВЫПРАЎЛЕНАЯ ПРЫВЯЗКА ПАДЗЕЙ
-    # 1. Націск кнопкі выклікае JS, які рыхтуе плэер і вяртае ўнікальнае значэнне ў схаванае поле `js_trigger`.
-    run_btn.click(fn=None, js=INIT_AND_RUN_JS, outputs=[js_trigger])
-    # 2. Змена значэння ў `js_trigger` запускае асноўную функцыю `text_to_speech` на Python.
-    run_event = js_trigger.change(
-        fn=text_to_speech,
-        inputs=[inp_text, inp_voice, ui_minbuf, ui_firstch, ui_split, ui_tokens, ui_firstseg],
-        outputs=[stream_pipe, final_file, final_audio, log_pipe, srv_dbg_pipe]
-    )
-    # 3. Кнопка "Спыніць" адмяняе падзею, запушчаную трыгерам, і спыняе плэер.
-    stop_btn.click(fn=None, js=STOP_JS, cancels=[run_event])
-    apply_btn.click(fn=None, js=APPLY_JS)
-    reset_btn.click(fn=None, js=RESET_JS)
-    stream_pipe.change(fn=None, inputs=[stream_pipe], js=PUSH_JS)
-    log_pipe.change(fn=None, inputs=[log_pipe], js=LOG_JS)
-    srv_dbg_pipe.change(fn=None, inputs=[srv_dbg_pipe], js=SRV_DBG_JS)
-    gr.Examples(examples=examples, inputs=[inp_text, inp_voice])
 if __name__ == "__main__":
-    demo.launch()

 os.environ.setdefault("MKL_NUM_THREADS", "1")
 os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
+import sys
+import re
+import time
+import json
+import base64
+import hashlib
+import tempfile
+import subprocess
+import inspect
+from typing import Iterator, Iterable, Optional, Tuple, Any, List
 from dataclasses import dataclass
+import pathlib
 import spaces
 import gradio as gr
 from huggingface_hub import hf_hub_download
 from scipy.io.wavfile import write
+# ---------------------------------------------------------
+# 1) coqui-ai-TTS fork
+# ---------------------------------------------------------
 REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
 REPO_DIR = "coqui-ai-TTS"
 if not os.path.exists(REPO_DIR):
     subprocess.run(["git", "clone", REPO_URL, REPO_DIR], check=True)
 repo_root = os.path.abspath(REPO_DIR)
 from TTS.tts.models.xtts import Xtts
 from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
+# ---------------------------------------------------------
+# 2) мадэльныя файлы
+# ---------------------------------------------------------
 repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
 model_dir = "./model"
 os.makedirs(model_dir, exist_ok=True)
 checkpoint_file = os.path.join(model_dir, "model.pth")
 config_file = os.path.join(model_dir, "config.json")
 vocab_file = os.path.join(model_dir, "vocab.json")
 default_voice_file = os.path.join(model_dir, "voice.wav")
+for fname in ("model.pth", "config.json", "vocab.json", "voice.wav"):
+    fpath = os.path.join(model_dir, fname)
+    if not os.path.exists(fpath):
+        hf_hub_download(repo_id, filename=fname, local_dir=model_dir)
+# ---------------------------------------------------------
+# 3) загрузка мадэлі
+# ---------------------------------------------------------
 config = XttsConfig()
 config.load_json(config_file)
 XTTS_MODEL = Xtts.init_from_config(config)
+XTTS_MODEL.load_checkpoint(
+    config,
+    checkpoint_path=checkpoint_file,
+    vocab_path=vocab_file,
+    use_deepspeed=False,
+)
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch.set_num_threads(1)
 if device.startswith("cuda"):
     torch.backends.cuda.matmul.allow_tf32 = True
 tokenizer = VoiceBpeTokenizer(vocab_file=vocab_file)
 XTTS_MODEL.tokenizer = tokenizer
+# =========================================================
+# 4) Streaming-канфіг
+# =========================================================
+MIN_BUFFER_S = 0.03            # бяспечны выхадны буфер для плэера
+RUNTIME_FIRST_CHUNK_S = 0.02   # унутраны чанк у генерацыі
+FADE_S       = 0.004
+TOKENS_PER_STEP = 1
+ENABLE_TEXT_SPLITTING = True
+FIRST_SEGMENT_LIMIT = 160      # стабільная прасадыя для 1-га сегмента
+# -------------------- утыліты аўдыя ----------------------
def _seconds_to_samples(sec: float, sr: int) -> int:
    """Convert a duration in seconds to a whole number of samples.

    The product ``sec * sr`` is truncated by ``int()`` and the result is
    clamped so that it is never smaller than one sample.
    """
    n_samples = int(sec * sr)
    if n_samples < 1:
        return 1
    return n_samples
 def _to_np_audio(x) -> np.ndarray:
     """Normalize a model output to a flat float32 NumPy array.

     Accepts a dict carrying the audio under the ``"wav"`` key, a torch
     tensor, or anything ``np.asarray`` understands. Tensors are cast to
     float32, detached, moved to the CPU and flattened; other inputs are
     flattened when they have more than one dimension and cast to float32.
     """
     payload = x["wav"] if isinstance(x, dict) and "wav" in x else x
     if isinstance(payload, torch.Tensor):
         tensor = payload if payload.dtype == torch.float32 else payload.float()
         return tensor.detach().cpu().contiguous().view(-1).numpy()
     arr = np.asarray(payload)
     if arr.ndim > 1:
         arr = arr.reshape(-1)
     return arr if arr.dtype == np.float32 else arr.astype(np.float32, copy=False)
 def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> np.ndarray:
     """Join *a* and *b*, blending the end of *a* into the start of *b*.

     The overlap is ``fade_s`` seconds converted to samples, capped by the
     length of either input. The overlapping samples are mixed with a linear
     ramp, so the result is shorter than ``a.size + b.size`` by the overlap.
     When *a* is empty, *b* is returned (as float32); when the overlap is one
     sample or less the inputs are simply concatenated.
     """
     if a.size == 0:
         return b.astype(np.float32, copy=False)
     left = a.astype(np.float32, copy=False)
     right = b.astype(np.float32, copy=False)
     overlap = min(_seconds_to_samples(fade_s, sr), left.size, right.size)
     if overlap <= 1:
         return np.concatenate([left, right], axis=0)
     # Linear ramp 1 -> 0 for the outgoing signal; the incoming one gets the complement.
     ramp_down = np.linspace(1.0, 0.0, overlap, endpoint=True, dtype=np.float32)
     ramp_up = 1.0 - ramp_down
     blended = (left[-overlap:] * ramp_down) + (right[:overlap] * ramp_up)
     return np.concatenate([left[:-overlap], blended, right[overlap:]], axis=0)
 def _bpe_prefixes(text: str, lang: str, step_tokens: int):
     """Yield growing prefixes of *text*, each ``step_tokens`` tokens longer.

     The module-level ``tokenizer`` is tried first: the text is encoded and
     every ``step_tokens``-th prefix of the id sequence is decoded back to a
     string, ending with the full sequence. If the tokenizer path fails for
     any reason, the text is split into alternating runs of non-whitespace
     and whitespace and prefixes are built from those pseudo-tokens instead.

     All tokenizer prefixes are computed before the first one is yielded, so a
     failure part-way through cannot make the fallback re-emit prefixes that
     were already produced. ``step_tokens`` below 1 is treated as 1.
     """
     step = max(1, step_tokens)
     try:
         ids = tokenizer.encode(text, lang=lang)
         total = len(ids)
         prefixes = [tokenizer.decode(ids[:k], lang=lang) for k in range(step, total + 1, step)]
         if total % step != 0:
             prefixes.append(tokenizer.decode(ids, lang=lang))
     except Exception:
         # Best effort: any tokenizer problem switches to the whitespace fallback below.
         prefixes = None
     if prefixes is not None:
         yield from prefixes
         return
     pseudo_tokens = re.findall(r"\S+|\s+", text)
     acc = ""
     for i in range(0, len(pseudo_tokens), step):
         acc = "".join(pseudo_tokens[: i + step])
         yield acc
     if acc.strip() != text.strip():
         yield text
def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any, **gen_kwargs) -> Iterator[np.ndarray]:
    """Stream audio chunks from the model's own ``inference_stream``.

    Only the generation options that were both supplied in ``gen_kwargs`` and
    are named in the signature of ``model.inference_stream`` are forwarded.
    Every item produced by the model is converted with ``_to_np_audio``
    before being yielded. Generation runs under ``torch.inference_mode`` and,
    when the module-level ``device`` is a CUDA device, float16 autocast.
    """
    accepted = inspect.signature(model.inference_stream).parameters
    optional_names = ("temperature", "length_penalty", "repetition_penalty", "top_k", "top_p", "stream_chunk_size_s")
    call_kwargs = {
        "text": text,
        "language": language,
        "gpt_cond_latent": gpt_cond_latent,
        "speaker_embedding": speaker_embedding,
    }
    for name in optional_names:
        if name in gen_kwargs and name in accepted:
            call_kwargs[name] = gen_kwargs[name]
    amp = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
    with torch.inference_mode(), amp:
        for out in model.inference_stream(**call_kwargs):
            yield _to_np_audio(out)
def _fallback_incremental(model: Xtts, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any, tokens_per_step: int, **gen_kwargs) -> Iterator[np.ndarray]:
    """Emulate streaming by re-synthesizing ever longer prefixes of *text*.

    For each prefix from ``_bpe_prefixes`` the model's ``inference`` is run on
    the whole prefix, and only the samples beyond what was already emitted are
    yielded. ``temperature``, ``top_k`` and ``top_p`` are read from
    ``gen_kwargs`` (defaults 0.1 / 10 / 0.3); the length and repetition
    penalties are fixed at 1.0 and 10.0.
    """
    already_sent = 0
    for prefix in _bpe_prefixes(text, language, tokens_per_step):
        amp = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
        with torch.inference_mode(), amp:
            result = model.inference(
                text=prefix,
                language=language,
                gpt_cond_latent=gpt_cond_latent,
                speaker_embedding=speaker_embedding,
                temperature=gen_kwargs.get("temperature", 0.1),
                length_penalty=1.0,
                repetition_penalty=10.0,
                top_k=gen_kwargs.get("top_k", 10),
                top_p=gen_kwargs.get("top_p", 0.3),
            )
        wav = _to_np_audio(result)
        # Keep only the part of this rendering that has not been emitted yet.
        fresh = wav[already_sent:]
        already_sent = wav.size
        if fresh.size:
            yield fresh
 class NewTTSGenerationMixin:
     """Provides ``generate`` and ``sample_stream`` for an XTTS model.

     The methods are written against the ``Xtts`` interface (``inference`` and,
     when available, ``inference_stream``); ``init_stream_support`` copies them
     onto the ``Xtts`` class.
     """

     @torch.inference_mode()
     def generate(self: Xtts, text: Optional[str] = None, *, do_stream: bool = False, language: str = "be",
                  gpt_cond_latent: Any = None, speaker_embedding: Any = None,
                  min_buffer_s: float = MIN_BUFFER_S, tokens_per_step: int = TOKENS_PER_STEP, **gen_kwargs):
         """Synthesize *text* either in one shot or as a stream of chunks.

         With ``do_stream=False`` the full utterance is rendered through
         ``self.inference`` and returned as a flat float32 array. With
         ``do_stream=True`` the iterator from ``sample_stream`` is returned.

         Sampling options (``temperature``, ``length_penalty``,
         ``repetition_penalty``, ``top_k``, ``top_p``) are taken from
         ``gen_kwargs`` in both modes; when absent, the one-shot path uses
         0.1 / 1.0 / 10.0 / 10 / 0.3.
         """
         assert isinstance(text, str) and text.strip(), "text is required"
         if not do_stream:
             autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
             with autocast_ctx:
                 out = self.inference(
                     text=text, language=language,
                     gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
                     temperature=gen_kwargs.get("temperature", 0.1),
                     # Previously hard-coded; caller overrides are now honoured
                     # the same way ``temperature`` already was.
                     length_penalty=gen_kwargs.get("length_penalty", 1.0),
                     repetition_penalty=gen_kwargs.get("repetition_penalty", 10.0),
                     top_k=gen_kwargs.get("top_k", 10),
                     top_p=gen_kwargs.get("top_p", 0.3),
                 )
             return _to_np_audio(out)
         return self.sample_stream(
             text=text, language=language, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
             min_buffer_s=min_buffer_s, tokens_per_step=tokens_per_step, **gen_kwargs
         )

     @torch.inference_mode()
     def sample_stream(self: Xtts, *, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any,
                       min_buffer_s: float = MIN_BUFFER_S, tokens_per_step: int = TOKENS_PER_STEP, **gen_kwargs) -> Iterator[np.ndarray]:
         """Yield audio chunks for *text*.

         Uses ``_native_stream`` when the model exposes ``inference_stream``;
         in that case ``stream_chunk_size_s`` defaults to ``min_buffer_s``
         unless the caller supplied it. Otherwise falls back to
         ``_fallback_incremental`` with the original ``gen_kwargs``.
         """
         local_kwargs = dict(gen_kwargs)
         local_kwargs.setdefault("stream_chunk_size_s", float(min_buffer_s))
         if hasattr(self, "inference_stream"):
             yield from _native_stream(self, text, language, gpt_cond_latent, speaker_embedding, **local_kwargs)
             return
         yield from _fallback_incremental(self, text, language, gpt_cond_latent, speaker_embedding, tokens_per_step, **gen_kwargs)
 def init_stream_support():
     """Attach ``generate`` and ``sample_stream`` from the mixin to ``Xtts``."""
     for method_name in ("generate", "sample_stream"):
         setattr(Xtts, method_name, getattr(NewTTSGenerationMixin, method_name))
 init_stream_support()
+# ---------------------------------------------------------
+# 5) пастаянны кэш латэнтаў (CPU) + GPU-кэш
+# ---------------------------------------------------------
 PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
 PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
 @dataclass(frozen=True)
 class LatentsMeta:
     """Immutable settings record that is hashed into the latents cache key.

     ``_latents_key`` serializes every field below, so changing any of them
     produces a different cache entry.
     """

     # Identifier of the model the latents belong to.
     model_id: str
     # Conditioning settings, stored exactly as supplied by the caller.
     gpt_cond_len: int
     max_ref_len: int
     sound_norm_refs: bool
     # Optional revision marker; defaults to None.
     xtts_git: Optional[str] = None
 LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
 GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}
 def _latents_key(path: str | None, meta: LatentsMeta) -> str:
     """Build the cache key for a reference audio file and model settings.

     For an existing file the key covers its absolute path, modification time
     and size; otherwise the fixed marker ``"default_voice"`` is used. The
     fields of *meta* are appended as sorted JSON and the whole string is
     reduced to an MD5 hex digest.
     """
     fingerprint = "default_voice"
     if path and os.path.exists(path):
         fingerprint = f"{os.path.abspath(path)}:{os.path.getmtime(path)}:{os.path.getsize(path)}"
     meta_fields = ("model_id", "gpt_cond_len", "max_ref_len", "sound_norm_refs", "xtts_git")
     meta_json = json.dumps({name: getattr(meta, name) for name in meta_fields}, sort_keys=True)
     digest = hashlib.md5((fingerprint + "|" + meta_json).encode("utf-8"))
     return digest.hexdigest()
def _latents_disk_path(key: str) -> pathlib.Path:
    """Return the file inside ``PERSIST_LATENTS_DIR`` that stores latents for *key*."""
    return PERSIST_LATENTS_DIR.joinpath(key + ".pt")
def _save_latents_to_disk(key: str, gpt_cond_latent: torch.Tensor, speaker_embedding: torch.Tensor):
    """Persist the two latent tensors for *key* as CPU tensors.

    The payload is first written to a temporary sibling file and then moved
    over the final path with ``os.replace``, so an interrupted write never
    leaves a truncated cache file under the real name.
    """
    target = _latents_disk_path(key)
    payload = {"gpt_cond_latent": gpt_cond_latent.cpu(), "speaker_embedding": speaker_embedding.cpu()}
    # The process id keeps concurrent writers from sharing a temp file.
    tmp_path = target.with_name(f"{target.name}.{os.getpid()}.tmp")
    try:
        torch.save(payload, tmp_path)
        os.replace(tmp_path, target)
    finally:
        # After a successful replace this is a no-op; after a failure it removes the leftover.
        tmp_path.unlink(missing_ok=True)
def _load_latents_from_disk(key: str) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
    """Load cached latents for *key* from disk.

    Returns ``(gpt_cond_latent, speaker_embedding)`` mapped to the CPU, or
    ``None`` when there is no cache file or the file cannot be read (for
    example a truncated write or a payload without the expected keys). An
    unreadable file is reported and treated as a cache miss so the caller can
    recompute instead of failing.
    """
    p = _latents_disk_path(key)
    if not p.exists():
        return None
    try:
        # NOTE(review): torch.load may unpickle arbitrary objects depending on the
        # torch version; the cache directory must only contain files written here.
        obj = torch.load(p, map_location="cpu")
        return obj["gpt_cond_latent"], obj["speaker_embedding"]
    except Exception as e:
        print(f"[warn] ignoring unreadable latents cache file {p}: {e}")
        return None
def _compute_latents_cpu(path: str | None) -> Tuple[torch.Tensor, torch.Tensor]:
    """Compute conditioning latents for the audio at *path* and return them on the CPU.

    Calls ``XTTS_MODEL.get_conditioning_latents`` under ``torch.inference_mode``
    with the conditioning settings read from the model's own config.
    """
    cfg = XTTS_MODEL.config
    with torch.inference_mode():
        cond_latent, spk_embedding = XTTS_MODEL.get_conditioning_latents(
            audio_path=path,
            gpt_cond_len=cfg.gpt_cond_len,
            max_ref_length=cfg.max_ref_len,
            sound_norm_refs=cfg.sound_norm_refs,
        )
    return cond_latent.cpu(), spk_embedding.cpu()
def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[torch.Tensor, torch.Tensor]:
    """Return conditioning latents for *path*, using the layered caches.

    Lookup order: in-memory ``LATENT_CACHE``, then the on-disk cache, then a
    fresh computation (which is also written to disk). When ``to_device``
    names a CUDA device, the tensors are moved there once and remembered in
    ``GPU_LATENT_CACHE``; otherwise the CPU tensors are returned.
    """
    cfg = XTTS_MODEL.config
    meta = LatentsMeta(
        model_id=repo_id,
        gpt_cond_len=cfg.gpt_cond_len,
        max_ref_len=cfg.max_ref_len,
        sound_norm_refs=cfg.sound_norm_refs,
        xtts_git=None,
    )
    key = _latents_key(path, meta)

    cpu_pair = LATENT_CACHE.get(key)
    if cpu_pair is None:
        cpu_pair = _load_latents_from_disk(key)
        if cpu_pair is None:
            cond, spk = _compute_latents_cpu(path)
            _save_latents_to_disk(key, cond, spk)
            cpu_pair = (cond, spk)
        LATENT_CACHE[key] = (cpu_pair[0], cpu_pair[1])
    g, s = cpu_pair

    wants_cuda = bool(to_device) and to_device.startswith("cuda")
    if not wants_cuda:
        return g, s

    dev_key = (key, to_device)
    on_device = GPU_LATENT_CACHE.get(dev_key)
    if on_device is None:
        on_device = (g.to(to_device, non_blocking=True), s.to(to_device, non_blocking=True))
        GPU_LATENT_CACHE[dev_key] = on_device
    return on_device
+# аўтападлік для default voice (CPU) — без дадатковых запытаў
+try:
+    _ = _latents_for(default_voice_file)
+except Exception as e:
+    print(f"[warn] precompute default voice latents failed: {e}")
+# ---------------------------------------------------------
+# 6) буферы + base64
+# ---------------------------------------------------------
 def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
     """Fold all chunks into one array, cross-fading at every junction.

     An empty list gives an empty float32 array; a single chunk is returned
     unchanged. Junctions use ``_crossfade_concat`` with the module-level
     ``sampling_rate`` and ``FADE_S``.
     """
     if not chunks:
         return np.zeros((0,), dtype=np.float32)
     merged, *remaining = chunks
     for nxt in remaining:
         merged = _crossfade_concat(merged, nxt, sampling_rate, FADE_S)
     return merged
 def _chunker(chunks: Iterable[np.ndarray], sr: int, target_s: float) -> Iterable[np.ndarray]:
     """Regroup incoming audio pieces into buffers of at least ``target_s`` seconds.

     Pieces are normalized with ``_to_np_audio``, empty ones are skipped, and
     the rest are accumulated (cross-faded with ``FADE_S``) until the buffer
     reaches the target length; it is then yielded and a new buffer is started.
     Whatever remains at the end is yielded as a final, possibly shorter, buffer.
     """
     threshold = _seconds_to_samples(target_s, sr)
     pending = np.zeros((0,), dtype=np.float32)
     for raw in chunks:
         piece = _to_np_audio(raw)
         if piece.size == 0:
             continue
         if pending.size == 0:
             pending = piece
         else:
             pending = _crossfade_concat(pending, piece, sr, FADE_S)
         if pending.size < threshold:
             continue
         yield pending
         pending = np.zeros((0,), dtype=np.float32)
     if pending.size:
         yield pending
 def _pcm_f32_to_b64(x: np.ndarray) -> str:
     """Encode samples as base64 text of their raw float32 bytes.

     Input of another dtype is cast to float32 first; the bytes are taken
     with ``ndarray.tobytes()`` and the base64 result is returned as ASCII.
     """
     samples = x if x.dtype == np.float32 else x.astype(np.float32, copy=False)
     raw = samples.tobytes()
     return str(base64.b64encode(raw), "ascii")
+# ---------------------------------------------------------
+# 7) падзел тэксту: хуткі + fallback
+# ---------------------------------------------------------
 # A run of sentence-ending punctuation, optional closing quotes/brackets, then whitespace.
 _SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
 # Any run of whitespace; used to break an over-long sentence into words.
 _WS = re.compile(r"\s+")


 def _fast_split(text: str, limit: int) -> List[str]:
     """Split *text* into chunks of at most ``limit`` characters where possible.

     The text is first cut after each match of ``_SENT_END``. Consecutive
     sentences are then packed greedily, joined by a single space, while the
     result stays within ``limit``. A sentence that is itself longer than
     ``limit`` is packed word by word in the same way; a single word longer
     than ``limit`` is kept whole.
     """
     text = text.strip()
     if not text:
         return []

     # Pass 1: cut the text into sentences.
     sentences = []
     cursor = 0
     for match in _SENT_END.finditer(text):
         stop = match.end()
         sentences.append(text[cursor:stop].strip())
         cursor = stop
     if cursor < len(text):
         sentences.append(text[cursor:].strip())

     # Pass 2: greedily pack sentences (or words of over-long sentences).
     packed = []
     current = ""
     for sentence in sentences:
         if len(current) + 1 + len(sentence) <= limit:
             current = f"{current} {sentence}".strip() if current else sentence
             continue
         if current:
             packed.append(current)
         if len(sentence) <= limit:
             current = sentence
             continue
         line = ""
         for word in _WS.split(sentence):
             if len(line) + 1 + len(word) <= limit:
                 line = f"{line} {word}".strip() if line else word
                 continue
             if line:
                 packed.append(line)
             line = word
         # The unfinished last line keeps accumulating with the next sentence.
         current = line
     if current:
         packed.append(current)
     return [chunk for chunk in packed if chunk]
def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[str]:
    """Split *text_in* into synthesis segments, with a short first segment.

    When the text is longer than ``FIRST_SEGMENT_LIMIT`` characters, the first
    segment is cut from that leading window: at the last sentence-ending
    character if that leaves more than 30 characters, otherwise at the last
    whitespace (again only if more than 30 characters remain), otherwise at
    the window boundary. The remainder is split with ``_fast_split``; if that
    returns nothing or loses more than 40% of the characters, the tokenizer's
    ``split_sentence`` is tried as a best-effort replacement.

    Returns an empty list for blank input.
    """
    text_in = text_in.strip()
    if not text_in:
        return []
    parts: List[str] = []
    text_for_rest = text_in
    if len(text_in) > FIRST_SEGMENT_LIMIT:
        window = text_in[:FIRST_SEGMENT_LIMIT]
        cut = len(window)
        # DOTALL keeps the match anchored at offset 0 even when the window
        # contains newlines, so ``m.end()`` is a valid cut position in text_in.
        m = re.search(r".*[\.!\?…»)]", window, flags=re.DOTALL)
        if m and m.end() > 30:
            cut = m.end()
        elif not text_in[cut].isspace():
            # No usable sentence end: avoid cutting through the middle of a word.
            last_ws = max((i for i, ch in enumerate(window) if ch.isspace()), default=-1)
            if last_ws > 30:
                cut = last_ws
        parts.append(text_in[:cut].rstrip())
        text_for_rest = text_in[cut:].lstrip()
    if not text_for_rest:
        return parts or [text_in]
    rest = _fast_split(text_for_rest, chunk_limit)
    if not rest or sum(len(x) for x in rest) < int(0.6 * len(text_for_rest)):
        try:
            rest2 = split_sentence(text_for_rest, lang=lang_short, text_split_length=chunk_limit)
            rest2 = [s.strip() for s in rest2 if s and s.strip()]
            if rest2:
                rest = rest2
        except Exception as e:
            # Best effort only: keep whatever the fast splitter produced.
            print(f"[warn] split_sentence fallback failed: {e}")
    return parts + (rest or [text_for_rest])
+# ---------------------------------------------------------
+# 8) TTS — стрим + фінальны файл + лагі
+# ---------------------------------------------------------
 @spaces.GPU(duration=60)
 def text_to_speech(belarusian_story, speaker_audio_file=None):
     """
     Stream synthesized speech for *belarusian_story*, then deliver a WAV file.

     This is a generator. Every yielded 4-tuple carries, in order:
       1) stream_pipe - a base64(PCM float32) chunk; "__STOP__" in the last item
       2) final_file  - path to the WAV file (last item only, otherwise None)
       3) final_audio - the same path, for playback (last item only)
       4) log_pipe    - JSON with server-side timings in seconds, or None

     When no speaker file is supplied, the default voice file is used.

     Raises:
         gr.Error: if the text is empty or the final WAV cannot be written.
     """
     t0 = time.perf_counter()
     if not belarusian_story or str(belarusian_story).strip() == "":
         raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
     if not speaker_audio_file or (
         not isinstance(speaker_audio_file, str)
         and getattr(speaker_audio_file, "name", "") == ""
     ):
         speaker_audio_file = default_voice_file
     text_in = str(belarusian_story).strip()
     lang_short = "be"
     chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)

     # Latents (CPU/GPU cache)
     t_lat0 = time.perf_counter()
     to_dev = "cuda:0" if torch.cuda.is_available() else None
     gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
     t_lat1 = time.perf_counter()

     # Text splitting
     t_split0 = time.perf_counter()
     texts = _split_text_smart(text_in, lang_short, chunk_limit) if ENABLE_TEXT_SPLITTING else [text_in]
     if not texts:
         texts = [text_in]
     t_split1 = time.perf_counter()

     server_metrics = {
         "latents_s": (t_lat1 - t_lat0),
         "text_split_s": (t_split1 - t_split0),
         "gen_init_to_first_chunk_s": None,
         "until_first_chunk_total_s": None,
         "server_unaccounted_before_first_chunk_s": None,
         "file_write_s": None,
     }
     # First item: no audio yet, only the timings known so far.
     yield ("", None, None, json.dumps(server_metrics))

     full_audio_chunks: List[np.ndarray] = []
     first_chunk_seen = False
     t_gen0 = time.perf_counter()
     for part in texts:
         gen = XTTS_MODEL.generate(
             text=part, do_stream=True, language=lang_short,
             gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
             min_buffer_s=RUNTIME_FIRST_CHUNK_S,
             tokens_per_step=TOKENS_PER_STEP,
             stream_chunk_size_s=RUNTIME_FIRST_CHUNK_S,
             temperature=0.1, length_penalty=1.0, repetition_penalty=10.0,
             top_k=10, top_p=0.3,
         )
         for buf in _chunker(gen, sampling_rate, MIN_BUFFER_S):
             if not first_chunk_seen:
                 # Time-to-first-chunk metrics are filled in exactly once.
                 t_first = time.perf_counter()
                 server_metrics["gen_init_to_first_chunk_s"] = (t_first - t_gen0)
                 server_metrics["until_first_chunk_total_s"] = (t_first - t0)
                 known = server_metrics["latents_s"] + server_metrics["text_split_s"] + server_metrics["gen_init_to_first_chunk_s"]
                 other = server_metrics["until_first_chunk_total_s"] - known
                 server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, other)
                 first_chunk_seen = True
                 yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
             else:
                 yield (_pcm_f32_to_b64(buf), None, None, None)
             full_audio_chunks.append(buf)

     if not full_audio_chunks:
         yield ("__STOP__", None, None, json.dumps(server_metrics))
         return

     t_w0 = time.perf_counter()
     full_audio = _merge_for_file(full_audio_chunks)
     try:
         # Reserve a unique name and close the handle right away: the WAV is
         # written by path below, so keeping the handle open would only leak it.
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
             wav_path = tmp.name
         write(wav_path, sampling_rate, full_audio.astype(np.float32))
     except Exception as e:
         raise gr.Error(f"Памылка пры запісе фінальнага WAV: {e}") from e
     finally:
         t_w1 = time.perf_counter()
         server_metrics["file_write_s"] = (t_w1 - t_w0)
     yield ("__STOP__", wav_path, wav_path, json.dumps(server_metrics))
+# ---------------------------------------------------------
+# 9) UI (лагі ў секундах + Play Final; без underrun’аў)
+# ---------------------------------------------------------
+examples = [
+    ["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", "Nestarka.wav"],
+]
 with gr.Blocks() as demo:
+    gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")
     with gr.Row():
         inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
         inp_voice = gr.Audio(type="filepath", label="Прыклад голасу (6–10 сек)", interactive=True)
     with gr.Row():
+        play_btn = gr.Button("▶️ Play (stream)")
+        stop_btn = gr.Button("⏹ Stop (stream)")
+        run_btn = gr.Button("Згенераваць")
+        gr.Markdown(f"**Sample rate:** {sampling_rate} Hz")
+    log_panel = gr.HTML(
+        value='<div id="wa-log" style="font-family:system-ui;font-size:12px;white-space:pre-line">[лог пусты]</div>',
+        label="Лагі плэера",
+    )
+    stream_pipe = gr.Textbox(value="", visible=False, label="stream_pipe")
+    log_pipe    = gr.Textbox(value="", visible=False, label="log_pipe")
+    final_file  = gr.File(label="Згенераваны WAV (спампаваць)")
     final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
+    play_final_btn = gr.Button("▶️ Play Final")
+    INIT_RESET_AND_PLAY_JS = f"""
+() => {{
+  const sampleRate = {sampling_rate};
   const AC = window.AudioContext || window.webkitAudioContext;
+  if (!AC) return;
+  const PRIME_CHUNKS = 2;       // мін. к-ць чанкаў перад стартаваннем гуку
+  let primeCounter = 0;
+  function toSec(ms) {{ return (ms/1000); }}
+  function fmtS(x) {{ return (x===null||x===undefined) ? "n/a" : x.toFixed(3) + " s"; }}
+  function logUpdate() {{
+    const el = document.getElementById('wa-log');
+    if (!el || !window.__wa || !window.__wa.meta) return;
+    const m = window.__wa.meta;
+    const lines = [];
+    lines.push("Клік (Згенераваць): 0.000 s");
+    let click_to_first_chunk_s = null;
+    if (m.t_first_push_ms) {{
+      click_to_first_chunk_s = toSec(m.t_first_push_ms - m.t_click_ms);
+      lines.push("Першы чанк прыйшоў:   " + click_to_first_chunk_s.toFixed(3) + " s");
+      if (m.t_first_audio_ms) {{
+        lines.push("Пачатак прайгравання: " + (toSec(m.t_first_audio_ms - m.t_click_ms)).toFixed(3) + " s");
+        lines.push("Затрымка (чанк→аўдыя): " + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + " s");
+      }}
+    }}
+    const s = (m.server || {{}});
+    lines.push("");
+    lines.push("— Серверныя метрыкі —");
+    lines.push("Latents (умоўны голас):  " + fmtS(s.latents_s));
+    lines.push("Падзел тэксту:           " + fmtS(s.text_split_s));
+    lines.push("Ініт→1-ы чанк:           " + fmtS(s.gen_init_to_first_chunk_s));
+    lines.push("Усё да 1-га чанка:       " + fmtS(s.until_first_chunk_total_s));
+    lines.push("Іншая серверная апрац.:  " + fmtS(s.server_unaccounted_before_first_chunk_s));
+    lines.push("Запіс WAV:               " + fmtS(s.file_write_s));
+    if (click_to_first_chunk_s !== null && s.until_first_chunk_total_s !== null) {{
+      let est_queue_net = click_to_first_chunk_s - s.until_first_chunk_total_s;
+      if (!isFinite(est_queue_net) || est_queue_net < 0) est_queue_net = 0;
+      lines.push("");
+      lines.push("Ацэнка чаргі ZeroGPU + сеткі: " + est_queue_net.toFixed(3) + " s");
+    }} else {{
+      lines.push("");
+      lines.push("Ацэнка чаргі ZeroGPU + сеткі: n/a");
+    }}
+    lines.push("");
+    lines.push("Статус стриму: " + (window.__wa.playing ? "playing" : "stopped"));
+    el.textContent = lines.join("\\n");
+    try {{ console.log(lines.join("\\n")); }} catch (e) {{}}
+  }}
+  if (!window.__wa) {{
+    const ctx = new AC({{ sampleRate }});
+    const bufferSize = 2048; // большы буфер = менш underrun’аў
+    const node = ctx.createScriptProcessor(bufferSize, 0, 1);
+    let queue = [];
+    let playing = false;
+    let eos = false;
+    const meta = {{
+      t_click_ms: performance.now(),
+      t_first_push_ms: null,
+      t_first_audio_ms: null,
+      server: null,
+    }};
+    node.onaudioprocess = (e) => {{
+      const out = e.outputBuffer.getChannelData(0);
+      let i = 0;
+      while (i < out.length) {{
+        if (queue.length === 0 || !playing) {{ out[i++] = 0.0; continue; }}
+        let cur = queue[0];
+        const take = Math.min(cur.length, out.length - i);
+        if (meta.t_first_audio_ms === null) {{
+          meta.t_first_audio_ms = performance.now();
+          logUpdate();
+        }}
+        out.set(cur.subarray(0, take), i);
+        i += take;
+        if (take === cur.length) queue.shift();
+        else queue[0] = cur.subarray(take);
+      }}
+      if (eos && queue.length === 0 && playing) {{
+        playing = false;
         logUpdate();
+      }}
+    }};
+    node.connect(ctx.destination);
+    window.__wa = {{
+      ctx, node,
+      get playing() {{ return playing; }},
+      get eos() {{ return eos; }},
+      set eos(v) {{ eos = v; }},
+      meta,
+      push: (f32) => {{
+        queue.push(f32);
+        if (!meta.t_first_push_ms) {{
+          meta.t_first_push_ms = performance.now();
+          logUpdate();
+        }}
+        if (!playing && queue.length >= PRIME_CHUNKS) {{
+          // стартуем толькі калі ёсць мінімум 2 чанкі ў чарзе
+          window.__wa.start();
+        }}
+      }},
+      start: async () => {{ try {{ await ctx.resume(); }} catch(e){{}} playing = true; logUpdate(); }},
+      stop: () => {{ playing = false; logUpdate(); }},
+      reset: () => {{
+        playing = false; eos = false; queue = [];
+        primeCounter = 0;
+        meta.t_first_push_ms = null; meta.t_first_audio_ms = null;
         logUpdate();
+      }},
+      updateLog: logUpdate,
+    }};
+  }} else {{
+    window.__wa.reset();
+    window.__wa.meta.t_click_ms = performance.now();
+  }}
+}}
 """
     STOP_JS = "() => { if (window.__wa) window.__wa.stop(); }"
+    PLAY_JS = "() => { if (window.__wa) window.__wa.start(); }"
     PUSH_JS = """
 (b64) => {
   if (!window.__wa || !b64) return;
+  if (b64 === "__STOP__") { window.__wa.eos = true; window.__wa.updateLog && window.__wa.updateLog(); return; }
+  const bin = atob(b64);
+  const len = bin.length;
+  const buf = new ArrayBuffer(len);
+  const view = new Uint8Array(buf);
   for (let i=0;i<len;i++) view[i] = bin.charCodeAt(i);
+  const f32 = new Float32Array(buf);
+  window.__wa.push(f32);
+}
+"""
     LOG_JS = """
+(js) => {
+  if (!window.__wa) return;
+  try {
+    if (js) {
+      const obj = JSON.parse(js);
+      window.__wa.meta.server = obj;
+      window.__wa.updateLog && window.__wa.updateLog();
+    }
+  } catch (e) {}
+}
 """
+    PLAY_FINAL_JS = """
+() => {
+  const host = document.getElementById('final-audio');
+  if (!host) return;
+  const audio = host.querySelector('audio');
+  if (audio) { try { audio.play(); } catch(e) {} }
+}
+"""
+    play_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_JS)
+    stop_btn.click(fn=None, inputs=[], outputs=[], js=STOP_JS)
+    run_btn.click(fn=None, inputs=[], outputs=[], js=INIT_RESET_AND_PLAY_JS)
+    run_btn.click(fn=text_to_speech, inputs=[inp_text, inp_voice], outputs=[stream_pipe, final_file, final_audio, log_pipe])
+    stream_pipe.change(fn=None, inputs=[stream_pipe], outputs=[], js=PUSH_JS)
+    log_pipe.change(fn=None, inputs=[log_pipe], outputs=[], js=LOG_JS)
+    play_final_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_FINAL_JS)
+    gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)
 if __name__ == "__main__":
+    demo.launch()