archivartaunik committed on
Commit
36c434c
·
verified ·
1 Parent(s): 0f31af2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +528 -343
app.py CHANGED
@@ -3,9 +3,18 @@ os.environ.setdefault("OMP_NUM_THREADS", "1")
3
  os.environ.setdefault("MKL_NUM_THREADS", "1")
4
  os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
5
 
6
- import sys, re, time, json, base64, hashlib, tempfile, subprocess, inspect, pathlib
7
- from typing import Iterable, Optional, Tuple, List
 
 
 
 
 
 
 
 
8
  from dataclasses import dataclass
 
9
 
10
  import spaces
11
  import gradio as gr
@@ -14,11 +23,13 @@ import numpy as np
14
  from huggingface_hub import hf_hub_download
15
  from scipy.io.wavfile import write
16
 
17
- # ----------------- 1. Кланаванне рэпазіторыя і ўсталяванне залежнасцяў -----------------
 
 
18
  REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
19
  REPO_DIR = "coqui-ai-TTS"
 
20
  if not os.path.exists(REPO_DIR):
21
- print(f"Кланаванне рэпазіторыя {REPO_URL}...")
22
  subprocess.run(["git", "clone", REPO_URL, REPO_DIR], check=True)
23
 
24
  repo_root = os.path.abspath(REPO_DIR)
@@ -29,29 +40,38 @@ from TTS.tts.configs.xtts_config import XttsConfig
29
  from TTS.tts.models.xtts import Xtts
30
  from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
31
 
32
- # ----------------- 2. Загрузка файлаў мадэлі -----------------
33
- print("Загрузка файлаў мадэлі XTTSv2...")
 
34
  repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
35
  model_dir = "./model"
36
  os.makedirs(model_dir, exist_ok=True)
37
- for fname in ("model.pth", "config.json", "vocab.json", "voice.wav"):
38
- if not os.path.exists(os.path.join(model_dir, fname)):
39
- hf_hub_download(repo_id, filename=fname, local_dir=model_dir)
40
 
41
  checkpoint_file = os.path.join(model_dir, "model.pth")
42
  config_file = os.path.join(model_dir, "config.json")
43
  vocab_file = os.path.join(model_dir, "vocab.json")
44
  default_voice_file = os.path.join(model_dir, "voice.wav")
45
 
46
- # ----------------- 3. Ініцыялізацыя мадэлі XTTS ------------------
47
- print("Ініцыялізацыя мадэлі XTTS...")
 
 
 
 
 
 
48
  config = XttsConfig()
49
  config.load_json(config_file)
50
  XTTS_MODEL = Xtts.init_from_config(config)
51
- XTTS_MODEL.load_checkpoint(config, checkpoint_path=checkpoint_file, vocab_path=vocab_file, use_deepspeed=False)
 
 
 
 
 
52
 
53
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
54
- print(f"Мадэль будзе працаваць на: {device.upper()}")
55
  torch.set_num_threads(1)
56
  if device.startswith("cuda"):
57
  torch.backends.cuda.matmul.allow_tf32 = True
@@ -64,35 +84,35 @@ sampling_rate = int(XTTS_MODEL.config.audio["sample_rate"])
64
 
65
  tokenizer = VoiceBpeTokenizer(vocab_file=vocab_file)
66
  XTTS_MODEL.tokenizer = tokenizer
67
- print("Мадэль паспяхова загружана.")
68
-
69
- # ----------------- Канфігурацыя стрымінгу -------------------
70
- FORCE_FALLBACK_STREAM = True
71
-
72
- # Серверныя налады
73
- DEF_MIN_BUFFER_S = 0.14
74
- DEF_FIRST_CHUNK_S = 0.10
75
- DEF_TOKENS_PER_STEP = 2
76
- DEF_ENABLE_TEXT_SPLIT = False
77
- DEF_FIRST_SEGMENT_LIMIT = 160
78
- FADE_S = 0.004
79
-
80
- # Кліенцкія налады
81
- DEF_CLIENT_PREROLL = 0.30
82
- DEF_CLIENT_LOWWM = 0.06
83
- MAX_CLIENT_PREROLL = 0.40
84
- STEP_CLIENT_PREROLL = 0.04
85
-
86
- # ----------------- Дапаможныя функцыі для аўдыя ----------------
87
- def _seconds_to_samples(sec: float, sr: int) -> int: return max(1, int(sec * sr))
88
  def _to_np_audio(x) -> np.ndarray:
89
- if isinstance(x, dict) and "wav" in x: x = x["wav"]
 
90
  if isinstance(x, torch.Tensor):
91
- if x.dtype != torch.float32: x = x.float()
92
- return x.detach().cpu().contiguous().view(-1).numpy()
 
 
93
  x = np.asarray(x)
94
- if x.ndim > 1: x = x.reshape(-1)
95
- return x.astype(np.float32, copy=False) if x.dtype != np.float32 else x
 
 
 
96
 
97
  def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> np.ndarray:
98
  if a.size == 0: return b.astype(np.float32, copy=False)
@@ -100,418 +120,583 @@ def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> n
100
  a = a.astype(np.float32, copy=False); b = b.astype(np.float32, copy=False)
101
  fade_n = min(_seconds_to_samples(fade_s, sr), a.size, b.size)
102
  if fade_n <= 1: return np.concatenate([a, b], axis=0)
103
- fade_out = np.linspace(1.0, 0.0, fade_n, dtype=np.float32); fade_in = 1.0 - fade_out
104
- head = a[:-fade_n]; tail = a[-fade_n:] * fade_out + b[:fade_n] * fade_in; rest = b[fade_n:]
 
 
 
105
  return np.concatenate([head, tail, rest], axis=0)
106
 
107
- # ----------------- Логіка стрымінгу -----------------
108
  def _bpe_prefixes(text: str, lang: str, step_tokens: int):
109
  try:
110
- ids = tokenizer.encode(text, lang=lang); n = len(ids)
111
- for k in range(step_tokens, n + 1, step_tokens): yield tokenizer.decode(ids[:k], lang=lang)
112
- if n % step_tokens != 0: yield tokenizer.decode(ids, lang=lang); return
113
- except Exception: pass
114
- pseudo = re.findall(r"\S+|\s+", text); acc = ""
115
- for i in range(0, len(pseudo), step_tokens): acc = "".join(pseudo[: i + step_tokens]); yield acc
116
- if acc.strip() != text.strip(): yield text
117
-
118
- def _fallback_incremental(model: Xtts, text: str, language: str, gpt_cond_latent, speaker_embedding, tokens_per_step: int, **gen_kwargs):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  emitted = 0
120
- step = 0
121
  for prefix in _bpe_prefixes(text, language, tokens_per_step):
122
- t0 = time.perf_counter()
123
- autocast_ctx = torch.autocast("cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
124
  with torch.inference_mode(), autocast_ctx:
125
  out = model.inference(
126
- text=prefix, language=language, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
127
- temperature=gen_kwargs.get("temperature", 0.1), length_penalty=1.0, repetition_penalty=10.0,
128
- top_k=10, top_p=0.3,
 
 
129
  )
130
  wav = _to_np_audio(out)
131
- new_part = wav[emitted:]
132
- emitted = wav.size
133
- t1 = time.perf_counter()
134
- yield {"__DBG__": f"[srv] fb_step={step} tps={tokens_per_step} new_s={new_part.size/sampling_rate:.3f} "
135
- f"total_s={emitted/sampling_rate:.3f} dt_inf={t1-t0:.3f}s"}
136
- step += 1
137
- if new_part.size:
138
- yield new_part
139
 
140
  class NewTTSGenerationMixin:
141
  @torch.inference_mode()
142
  def generate(self: Xtts, text: Optional[str] = None, *, do_stream: bool = False, language: str = "be",
143
- gpt_cond_latent=None, speaker_embedding=None, **gen_kwargs):
144
- assert isinstance(text, str) and text.strip()
 
145
  if not do_stream:
146
- autocast_ctx = torch.autocast("cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
147
  with autocast_ctx:
148
- out = self.inference(text=text, language=language, gpt_cond_latent=gpt_cond_latent,
149
- speaker_embedding=speaker_embedding, temperature=gen_kwargs.get("temperature", 0.1),
150
- length_penalty=1.0, repetition_penalty=10.0, top_k=10, top_p=0.3)
 
 
 
 
151
  return _to_np_audio(out)
152
- return self.sample_stream(text=text, language=language, gpt_cond_latent=gpt_cond_latent,
153
- speaker_embedding=speaker_embedding, **gen_kwargs)
 
 
 
154
  @torch.inference_mode()
155
- def sample_stream(self: Xtts, *, text: str, language: str, gpt_cond_latent, speaker_embedding, **gen_kwargs):
156
- if FORCE_FALLBACK_STREAM or not hasattr(self, "inference_stream"):
157
- yield from _fallback_incremental(self, text, language, gpt_cond_latent, speaker_embedding, **gen_kwargs)
158
- else: # Native stream is not used, but kept for reference
159
- local_kwargs = dict(gen_kwargs)
160
- autocast_ctx = torch.autocast("cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
161
- with torch.inference_mode(), autocast_ctx:
162
- for out in self.inference_stream(text, language, gpt_cond_latent, speaker_embedding, **local_kwargs):
163
- yield _to_np_audio(out)
164
 
165
  def init_stream_support():
166
  Xtts.generate = NewTTSGenerationMixin.generate
167
  Xtts.sample_stream = NewTTSGenerationMixin.sample_stream
 
168
  init_stream_support()
169
 
170
- # ----------------- Кэшаванне "голасу" (latents) -----------------
 
 
171
  PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
172
  PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
 
173
  @dataclass(frozen=True)
174
  class LatentsMeta:
175
- model_id: str; gpt_cond_len: int; max_ref_len: int; sound_norm_refs: bool
 
 
 
 
 
176
  LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
177
  GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}
 
178
  def _latents_key(path: str | None, meta: LatentsMeta) -> str:
179
- base = f"{os.path.abspath(path)}:{os.path.getmtime(path)}:{os.path.getsize(path)}" if path and os.path.exists(path) else "default_voice"
180
- meta_str = json.dumps(meta.__dict__, sort_keys=True)
 
 
 
 
 
 
 
 
 
181
  return hashlib.md5((base + "|" + meta_str).encode("utf-8")).hexdigest()
182
- def _latents_disk_path(key: str) -> pathlib.Path: return PERSIST_LATENTS_DIR / f"{key}.pt"
183
- def _save_latents_to_disk(key: str, gpt, spk): torch.save({"gpt_cond_latent": gpt.cpu(), "speaker_embedding": spk.cpu()}, _latents_disk_path(key))
184
- def _load_latents_from_disk(key: str):
 
 
 
 
 
185
  p = _latents_disk_path(key)
186
  if not p.exists(): return None
187
- obj = torch.load(p, map_location="cpu"); return obj["gpt_cond_latent"], obj["speaker_embedding"]
188
- def _compute_latents_cpu(path: str | None):
 
 
189
  with torch.inference_mode():
190
- g, s = XTTS_MODEL.get_conditioning_latents(audio_path=path, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_length=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs)
 
 
 
 
 
191
  return g.cpu(), s.cpu()
192
- def _latents_for(path: str | None, *, to_device: Optional[str] = None):
193
- meta = LatentsMeta(repo_id, XTTS_MODEL.config.gpt_cond_len, XTTS_MODEL.config.max_ref_len, XTTS_MODEL.config.sound_norm_refs)
 
 
 
 
 
 
 
194
  key = _latents_key(path, meta)
195
- if key in LATENT_CACHE: g,s = LATENT_CACHE[key]
 
 
196
  else:
197
  loaded = _load_latents_from_disk(key)
198
- if loaded is None: g,s = _compute_latents_cpu(path); _save_latents_to_disk(key, g, s)
199
- else: g,s = loaded
200
- LATENT_CACHE[key]=(g,s)
 
 
 
 
201
  if to_device and to_device.startswith("cuda"):
202
- dev_key=(key,to_device)
203
- if dev_key in GPU_LATENT_CACHE: return GPU_LATENT_CACHE[dev_key]
204
- g2=g.to(to_device, non_blocking=True); s2=s.to(to_device, non_blocking=True); GPU_LATENT_CACHE[dev_key]=(g2,s2); return g2,s2
205
- return g,s
206
- try: _ = _latents_for(default_voice_file)
207
- except Exception as e: print(f"[warn] Памылка пры папярэднім разліку голасу па змаўчанні: {e}")
208
-
209
- # ----------------- Утыліты для апрацоўкі стрыму -----------------
 
 
 
 
 
 
 
 
 
 
210
  def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
211
  if not chunks: return np.zeros((0,), dtype=np.float32)
212
  out = chunks[0]
213
- for i in range(1, len(chunks)): out = _crossfade_concat(out, chunks[i], sampling_rate, FADE_S)
 
214
  return out
 
215
  def _chunker(chunks: Iterable[np.ndarray], sr: int, target_s: float) -> Iterable[np.ndarray]:
216
- target_samples = _seconds_to_samples(target_s, sr); buf = np.zeros((0,), dtype=np.float32)
 
217
  for c in chunks:
218
- if isinstance(c, dict) and "__DBG__" in c: yield c; continue
219
  c = _to_np_audio(c)
220
  if c.size == 0: continue
221
  buf = c if buf.size == 0 else _crossfade_concat(buf, c, sr, FADE_S)
222
- if buf.size >= target_samples: yield buf; buf = np.zeros((0,), dtype=np.float32)
 
 
223
  if buf.size: yield buf
 
224
  def _pcm_f32_to_b64(x: np.ndarray) -> str:
225
  if x.dtype != np.float32: x = x.astype(np.float32, copy=False)
226
  return base64.b64encode(x.tobytes()).decode("ascii")
227
 
228
- # ----------------- Утыліты для падзелу тэксту -----------------
 
 
229
  _SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
230
  _WS = re.compile(r"\s+")
 
231
  def _fast_split(text: str, limit: int) -> List[str]:
232
- text = text.strip(); parts=[]; chunks=[]; cur=""
233
  if not text: return []
234
- start=0
235
- for m in _SENT_END.finditer(text): parts.append(text[start:m.end()].strip()); start=m.end()
 
 
 
 
236
  if start < len(text): parts.append(text[start:].strip())
 
 
237
  for s in parts:
238
- if len(cur)+1+len(s) <= limit: cur = (cur+" "+s).strip() if cur else s
 
239
  else:
240
  if cur: chunks.append(cur)
241
- if len(s)<=limit: cur=s
 
242
  else:
243
- w=_WS.split(s); acc=""
244
  for tok in w:
245
- if len(acc)+1+len(tok)<=limit: acc=(acc+" "+tok).strip() if acc else tok
 
246
  else:
247
- if acc: chunks.append(acc); acc=tok
248
- cur=acc
 
 
249
  if cur: chunks.append(cur)
250
  return [c for c in chunks if c]
251
- def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int, first_segment_limit: int) -> List[str]:
 
252
  text_in = text_in.strip()
253
  if not text_in: return []
254
- parts=[]
255
- if len(text_in)>first_segment_limit:
256
- head=text_in[:first_segment_limit]; m=re.search(r".*[\.!\?…»)]", head)
257
- if m and len(m.group(0))>30: head=m.group(0)
258
- tail=text_in[len(head):].lstrip(); parts.append(head); text_for_rest=tail
259
- else: text_for_rest=text_in
 
 
 
 
 
260
  if not text_for_rest: return parts or [text_in]
261
- rest=_fast_split(text_for_rest, chunk_limit)
262
- if not rest or sum(len(x) for x in rest) < int(0.6*len(text_for_rest)):
 
263
  try:
264
  rest2 = split_sentence(text_for_rest, lang=lang_short, text_split_length=chunk_limit)
265
  rest2 = [s.strip() for s in rest2 if s and s.strip()]
266
  if rest2: rest = rest2
267
- except Exception: pass
 
268
  return parts + (rest or [text_for_rest])
269
 
270
- # ----------------- 4. Асноўная функцыя для Gradio -----------------
 
 
271
  @spaces.GPU(duration=60)
272
- def text_to_speech(
273
- belarusian_story, speaker_audio_file=None,
274
- min_buffer_s: float = DEF_MIN_BUFFER_S,
275
- first_chunk_s: float = DEF_FIRST_CHUNK_S,
276
- enable_text_splitting: bool = DEF_ENABLE_TEXT_SPLIT,
277
- tokens_per_step: int = DEF_TOKENS_PER_STEP,
278
- first_segment_limit: int = DEF_FIRST_SEGMENT_LIMIT,
279
- ):
280
- print("--- Python function 'text_to_speech' STARTED ---") # Дыягнастычнае паведамленне
281
  t0 = time.perf_counter()
 
282
  if not belarusian_story or str(belarusian_story).strip() == "":
283
- raise gr.Error("Увядзіце тэкст для агучвання.")
284
- if not speaker_audio_file or (not isinstance(speaker_audio_file, str) and getattr(speaker_audio_file, "name", "") == ""):
 
 
 
 
285
  speaker_audio_file = default_voice_file
 
286
  text_in = str(belarusian_story).strip()
287
  lang_short = "be"
288
  chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
 
 
289
  t_lat0 = time.perf_counter()
290
  to_dev = "cuda:0" if torch.cuda.is_available() else None
291
  gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
292
  t_lat1 = time.perf_counter()
 
 
293
  t_split0 = time.perf_counter()
294
- texts = [text_in] if not enable_text_splitting else (_split_text_smart(text_in, lang_short, chunk_limit, int(first_segment_limit)) or [text_in])
 
295
  t_split1 = time.perf_counter()
296
- server_metrics = {"latents_s": (t_lat1 - t_lat0), "text_split_s": (t_split1 - t_split0), "gen_init_to_first_chunk_s": None, "until_first_chunk_total_s": None, "server_unaccounted_before_first_chunk_s": None, "file_write_s": None, "backend": "fallback", "sr": sampling_rate, "min_buffer_s": float(min_buffer_s), "first_chunk_s": float(first_chunk_s), "tokens_per_step": int(tokens_per_step)}
297
- yield ("", None, None, json.dumps(server_metrics), "[srv] start")
298
- full_audio_chunks = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  try:
300
- first_chunk_seen=False; t_gen0=time.perf_counter()
301
- chunk_idx = 0; last_emit_t = time.perf_counter(); cum_sec = 0.0
302
- for part in texts:
303
- yield (None, None, None, None, f"[srv] part_start chars={len(part)}")
304
- gen = XTTS_MODEL.generate(text=part, do_stream=True, language=lang_short, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding, tokens_per_step=int(tokens_per_step))
305
- for piece in _chunker(gen, sampling_rate, float(min_buffer_s)):
306
- if isinstance(piece, dict) and "__DBG__" in piece:
307
- yield (None, None, None, None, piece["__DBG__"]); continue
308
- now = time.perf_counter()
309
- dt_emit = now - last_emit_t; last_emit_t = now
310
- buf = _to_np_audio(piece)
311
- if buf.size == 0: continue
312
- sec = buf.size / sampling_rate; chunk_idx += 1; cum_sec += sec
313
- full_audio_chunks.append(buf)
314
- if not first_chunk_seen:
315
- t_first = time.perf_counter()
316
- server_metrics["gen_init_to_first_chunk_s"] = (t_first - t_gen0)
317
- server_metrics["until_first_chunk_total_s"] = (t_first - t0)
318
- known = server_metrics["latents_s"] + server_metrics["text_split_s"] + server_metrics["gen_init_to_first_chunk_s"]
319
- server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, server_metrics["until_first_chunk_total_s"] - known)
320
- first_chunk_seen=True
321
- yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics), f"[srv] first_chunk idx=1 sec={sec:.3f} cum={cum_sec:.3f} dt_emit={dt_emit:.3f}")
322
- else:
323
- yield (_pcm_f32_to_b64(buf), None, None, None, f"[srv] chunk idx={chunk_idx} sec={sec:.3f} cum={cum_sec:.3f} dt_emit={dt_emit:.3f}")
324
- yield (None, None, None, None, "[srv] part_end")
325
  finally:
326
- if not full_audio_chunks:
327
- yield ("__STOP__", None, None, json.dumps(server_metrics), "[srv] no_chunks"); return
328
- t_w0 = time.perf_counter()
329
- full_audio = _merge_for_file(full_audio_chunks)
330
- tmp_path = None
331
- try:
332
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
333
- write(tmp.name, sampling_rate, full_audio.astype(np.float32)); tmp_path = tmp.name
334
- except Exception as e: raise gr.Error(f"Памылка пры запісе фінальнага WAV: {e}")
335
- finally:
336
- t_w1 = time.perf_counter(); server_metrics["file_write_s"] = (t_w1 - t_w0)
337
- yield ("__STOP__", tmp_path, tmp_path, json.dumps(server_metrics), f"[srv] file_ready dur={full_audio.size/sampling_rate:.3f}s")
338
-
339
- # ----------------- 5. Карыстальніцкі інтэрфейс (UI) Gradio ------------------------
340
- examples=[["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", "Nestarka.wav"]]
341
  with gr.Blocks() as demo:
342
- gr.Markdown("## Беларускі TTS — Стрымінг + фінальны файл")
 
343
  with gr.Row():
344
  inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
345
  inp_voice = gr.Audio(type="filepath", label="Прыклад голасу (6–10 сек)", interactive=True)
346
- with gr.Accordion("Налады", open=False):
347
- gr.Markdown("### Кліенцкія (прайграванне ў браўзеры)");
348
- with gr.Row():
349
- ui_preroll = gr.Slider(0.08, 0.40, value=DEF_CLIENT_PREROLL, step=0.01, label="PREROLL (сек.)", elem_id="preroll_slider", interactive=True)
350
- ui_lowwm = gr.Slider(0.02, 0.15, value=DEF_CLIENT_LOWWM, step=0.005, label="Ніжні ўзровень (сек.)", elem_id="lowwm_slider", interactive=True)
351
- with gr.Row(): apply_btn = gr.Button("Прымяніць налады"); reset_btn = gr.Button("Скінуць налады")
352
- gr.Markdown("### Серверныя (генерацыя гуку)")
353
- with gr.Row():
354
- ui_minbuf = gr.Slider(0.03, 0.25, value=DEF_MIN_BUFFER_S, step=0.005, label="Памер сервернага чанка (сек.)", interactive=True)
355
- ui_firstch = gr.Slider(0.02, 0.16, value=DEF_FIRST_CHUNK_S, step=0.005, label="Памер першага чанка (сек.)", interactive=True)
356
- with gr.Row():
357
- ui_tokens = gr.Slider(1, 6, value=DEF_TOKENS_PER_STEP, step=1, label="Tokens per step (fallback)", interactive=True)
358
- ui_split = gr.Checkbox(value=DEF_ENABLE_TEXT_SPLIT, label="Падзяляць тэкст на сказы", interactive=True)
359
- ui_firstseg = gr.Slider(80, 300, value=DEF_FIRST_SEGMENT_LIMIT, step=5, label="Ліміт для першага сегменту", interactive=True)
360
  with gr.Row():
361
- run_btn = gr.Button("▶️ Згенераваць і прайграць (стрым)")
362
- stop_btn = gr.Button("⏹ Спыніць")
363
- gr.Markdown(f"**Частата дыскрэтызацыі:** {sampling_rate} Гц")
364
-
365
- # ВЫРАШЭННЕ ПРАБЛЕМЫ: Дадаем схаваны кампанент-трыгер
366
- # Кнопка будзе запісваць у яго значэнне, а змена гэтага значэння запусціць Python-код.
367
- # Гэта робіць ланцужок падзей JS -> Python больш надзейным.
368
- js_trigger = gr.Textbox(value="0", visible=False)
369
-
370
- log_panel = gr.HTML(value='<div id="wa-log" style="font-family:system-ui;font-size:12px;white-space:pre-line">[лог пусты]</div>', label="Статыстыка")
371
- log_debug = gr.HTML(value='<div id="wa-dbg" style="font-family:ui-monospace,Menlo,Consolas,monospace;font-size:12px;white-space:pre;max-height:260px;overflow:auto;border:1px solid #ddd;padding:6px;border-radius:6px;">[дыягностыка пустая]</div>', label="Дыягностыка стрыму")
372
- stream_pipe = gr.Textbox(value="", visible=False); log_pipe = gr.Textbox(value="", visible=False); srv_dbg_pipe = gr.Textbox(value="", visible=False)
 
 
373
  final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
374
- final_file = gr.File(label="Спампаваць згенераваны WAV")
375
-
376
- AUDIO_WORKLET_PROCESSOR = r"""
377
- class StreamBufferProcessor extends AudioWorkletProcessor{constructor(){super();this.queue=[];this.readIndex=0;this.bufferedSamples=0;this.started=!1;this.wasStartedOnce=!1;this.thresholdSamples=0;this.lowWatermarkSamples=0;this.underrunSent=!1;this.bufferingStarted=!1;this.postDbg=a=>{try{this.port.postMessage({type:"dbg",...a})}catch(b){}};this.port.onmessage=a=>{const b=a.data||{};"push"===b.type?(a=new Float32Array(b.buffer),this.queue.push(a),this.bufferedSamples+=a.length,this.bufferingStarted||0>=this.bufferedSamples||(this.bufferingStarted=!0,this.port.postMessage({type:"buffer_start",bufferedSamples:this.bufferedSamples,ctxSR:sampleRate})),this.postDbg({ev:"push",len:a.length,buffered:this.bufferedSamples,qlen:this.queue.length})):"reset"===b.type?(this.queue=[],this.readIndex=0,this.bufferedSamples=0,this.started=!1,this.wasStartedOnce=!1,this.underrunSent=!1,this.bufferingStarted=!1,this.postDbg({ev:"reset"})):"set_thresholds"===b.type&&(this.thresholdSamples=b.thresholdSamples|0,this.lowWatermarkSamples=b.lowWatermarkSamples|0,this.port.postMessage({type:"thresholds_ready",thresholdSamples:this.thresholdSamples,lowWatermarkSamples:this.lowWatermarkSamples,ctxSR:sampleRate}),this.postDbg({ev:"thresholds",thr:this.thresholdSamples,lowwm:this.lowWatermarkSamples}))}}process(a,b,c){const d=b[0][0];let e=0;if(!this.started){if(this.bufferedSamples>= 
(this.wasStartedOnce?Math.max(128,this.lowWatermarkSamples):this.thresholdSamples)){if(this.started=!0,b=this.wasStartedOnce?"lowwm_restart":"preroll_start",!this.wasStartedOnce&&(this.wasStartedOnce=!0,this.port.postMessage({type:"first_audio",bufferedSamples:this.bufferedSamples,ctxSR:sampleRate})),this.postDbg({ev:"start",reason:b,buffered:this.bufferedSamples}),0===this.queue.length)return!0}else{for(;e<d.length;)d[e++]=0;return!0}}for(;e<d.length;){if(0===this.queue.length){this.underrunSent||(this.underrunSent=!0,this.port.postMessage({type:"underrun"}));this.started=!1;this.postDbg({ev:"stop",reason:"empty_queue"});for(;e<d.length;)d[e++]=0;return!0}b=this.queue[0];c=b.length-this.readIndex;const f=Math.min(c,d.length-e);d.set(b.subarray(this.readIndex,this.readIndex+f),e);e+=f;this.readIndex+=f;this.bufferedSamples-=f;this.readIndex>=b.length&&(this.queue.shift(),this.readIndex=0)}return!0}}registerProcessor("stream-buffer",StreamBufferProcessor);
378
- """
379
- INIT_AND_RUN_JS = """
380
- () => {
381
  const AC = window.AudioContext || window.webkitAudioContext;
382
- if (!AC) { console.error("Web Audio API не падтрымліваецца"); return "error"; }
383
- if (window.__wa && window.__wa.reset) {
384
- window.__wa.reset();
385
- return "reset_" + new Date().getTime();
386
- }
387
- function getLocalFloat(key, defVal) {
388
- try { const v = parseFloat(localStorage.getItem(key)); if (isFinite(v) && v > 0) return v; } catch(e) {}
389
- return defVal;
390
- }
391
- const DEFAULT_PREROLL = __DEF_PR__, MAX_PREROLL = __MAX_PR__, STEP_PREROLL = __STEP_PR__, DEFAULT_LOWWM = __DEF_LW__;
392
- let PREROLL_S = getLocalFloat("tts_preroll_s", DEFAULT_PREROLL);
393
- let LOW_WM_S = getLocalFloat("tts_lowwm_s", DEFAULT_LOWWM);
394
- const blob = new Blob([`__AW_CODE__`], { type: 'application/javascript' });
395
- const url = URL.createObjectURL(blob);
396
- const ctx = new AC({ sampleRate: __SR__ });
397
- const meta = { t_click_ms: performance.now(), t_first_push_ms: null, t_first_audio_ms: null, server: null };
398
- let workletNode = null, gate = null, connected = false, ready = false;
399
- const pending = [], dbgLines = [];
400
- function dbg(obj) {
401
- try {
402
- const ts = ((performance.now())/1000).toFixed(3), s = (typeof obj === 'string') ? obj : JSON.stringify(obj);
403
- dbgLines.push(ts + " " + s);
404
- while (dbgLines.length > 200) dbgLines.shift();
405
- const el = document.getElementById('wa-dbg'); if (el) { el.textContent = dbgLines.join("\\n"); el.scrollTop = el.scrollHeight; }
406
- console.log("[AW]", s);
407
- } catch(e) {}
408
- }
409
- function p3(x) { return (x==null)?'n/a':x.toFixed(3)+' s'; }
410
- function logUpdate() {
411
- const el = document.getElementById('wa-log'); if (!el) return;
412
- const s = meta.server || {};
413
- el.textContent = [
414
- "Клік -> Першы чанк: " + p3(meta.t_first_push_ms ? (meta.t_first_push_ms - meta.t_click_ms)/1000 : null),
415
- "Клік -> Пачатак гуку: " + p3(meta.t_first_audio_ms ? (meta.t_first_audio_ms - meta.t_click_ms)/1000 : null),
416
- "", "--- Сервер ---", "Latents (голас): " + p3(s.latents_s), "Ініт. генер. -> 1-ы чанк: " + p3(s.gen_init_to_first_chunk_s),
417
- "Агульны час да 1-га чанка: " + p3(s.until_first_chunk_total_s),
418
- "Ацэнка сеткі/чаргі: " + p3((meta.t_first_push_ms&&s.until_first_chunk_total_s)?(meta.t_first_push_ms/1000-meta.t_click_ms/1000-s.until_first_chunk_total_s):null),
419
- "", "--- Кліент ---", "Статус: " + (connected ? "playing" : "stopped"), "PREROLL: " + PREROLL_S.toFixed(3) + " s | LOW WM: " + LOW_WM_S.toFixed(3) + " s",
420
- ].join("\\n");
421
- }
422
- (async () => {
423
- await ctx.audioWorklet.addModule(url);
424
- workletNode = new AudioWorkletNode(ctx, 'stream-buffer');
425
- gate = ctx.createGain(); gate.gain.value = 1.0;
426
- workletNode.connect(gate);
427
- workletNode.port.onmessage = (e) => {
428
- const msg = e.data || {};
429
- if (msg.type === 'thresholds_ready') {
430
- ready = true;
431
- for (const f32 of pending) { workletNode.port.postMessage({ type:'push', buffer:f32.buffer }, [f32.buffer]); }
432
- pending.length = 0; logUpdate(); dbg({ev:'thresholds_ready', ...msg});
433
- } else if (msg.type === 'buffer_start' && meta.t_first_push_ms === null) { meta.t_first_push_ms = performance.now(); dbg({ev:'buffer_start'});
434
- } else if (msg.type === 'first_audio' && meta.t_first_audio_ms === null) { meta.t_first_audio_ms = performance.now(); logUpdate(); dbg({ev:'first_audio'});
435
- } else if (msg.type === 'dbg') { dbg(msg); }
436
- };
437
- workletNode.port.postMessage({ type: 'set_thresholds', thresholdSamples: Math.floor(PREROLL_S * ctx.sampleRate), lowWatermarkSamples: Math.floor(LOW_WM_S * ctx.sampleRate) });
438
- window.__wa = {
439
- push: async (f32) => {
440
- try { await ctx.resume(); } catch(e) {}
441
- if (!ready) { pending.push(f32); } else { workletNode.port.postMessage({ type:'push', buffer:f32.buffer }, [f32.buffer]); }
442
- if (!connected) { try { gate.connect(ctx.destination); connected = true; } catch(e){} }
443
- logUpdate();
444
- },
445
- stop: () => { if (connected) { try { gate.disconnect(); } catch(e) {} connected=false; logUpdate(); } },
446
- reset: () => {
447
- if(connected){ gate.disconnect(); connected=false; }
448
- workletNode.port.postMessage({ type:'reset' });
449
- meta.t_click_ms = performance.now(); meta.t_first_push_ms = null; meta.t_first_audio_ms = null;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
  logUpdate();
451
- },
452
- applyClient: (pr, lw) => {
453
- PREROLL_S = pr; LOW_WM_S = lw;
454
- try { localStorage.setItem("tts_preroll_s", String(pr)); localStorage.setItem("tts_lowwm_s", String(lw)); } catch(e) {}
455
- if (workletNode) { workletNode.port.postMessage({ type:'set_thresholds', thresholdSamples: Math.floor(PREROLL_S * ctx.sampleRate), lowWatermarkSamples: Math.floor(LOW_WM_S * ctx.sampleRate) }); }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
  logUpdate();
457
- }, meta, updateLog: logUpdate
458
- };
459
- logUpdate();
460
- })();
461
- return "init_" + new Date().getTime();
462
- }
 
 
463
  """
 
464
  STOP_JS = "() => { if (window.__wa) window.__wa.stop(); }"
465
- APPLY_JS = """
466
- () => {
467
- const p = document.getElementById('preroll_slider')?.querySelector('input[type="range"]');
468
- const l = document.getElementById('lowwm_slider')?.querySelector('input[type="range"]');
469
- const pr = p && p.value ? parseFloat(p.value) : 0.30;
470
- const lw = l && l.value ? parseFloat(l.value) : 0.06;
471
- if (window.__wa && window.__wa.applyClient) { window.__wa.applyClient(pr, lw); }
472
- }"""
473
- RESET_JS = "(() => { try { localStorage.removeItem('tts_preroll_s'); localStorage.removeItem('tts_lowwm_s'); } catch(e) {} window.location.reload(); })()"
474
  PUSH_JS = """
475
  (b64) => {
476
  if (!window.__wa || !b64) return;
477
- if (b64 === "__STOP__") { window.__wa.updateLog && window.__wa.updateLog(); return; }
478
- const bin = atob(b64); const len = bin.length; const buf = new ArrayBuffer(len); const view = new Uint8Array(buf);
 
 
 
479
  for (let i=0;i<len;i++) view[i] = bin.charCodeAt(i);
480
- const f32 = new Float32Array(buf); window.__wa.push(f32);
481
- }"""
 
 
 
482
  LOG_JS = """
483
- (js) => { if (!window.__wa) return; try { if (js) { const obj = JSON.parse(js); window.__wa.meta.server = obj; window.__wa.updateLog && window.__wa.updateLog(); } } catch (e) {} }
 
 
 
 
 
 
 
 
 
484
  """
485
- SRV_DBG_JS = """
486
- (line) => {
487
- if (!line) return; try { const el = document.getElementById('wa-dbg'); if (!el) return;
488
- const prev = el.textContent.startsWith("[") ? "" : el.textContent; const lines = (prev ? prev.split("\\n") : []);
489
- lines.push(line); while (lines.length > 200) lines.shift();
490
- el.textContent = lines.join("\\n"); el.scrollTop = el.scrollHeight; console.log("[SRV]", line);
491
- } catch(e) {}
492
- }"""
493
- INIT_AND_RUN_JS = INIT_AND_RUN_JS.replace("__AW_CODE__", AUDIO_WORKLET_PROCESSOR).replace("__SR__", str(sampling_rate)).replace("__DEF_PR__", str(DEF_CLIENT_PREROLL)).replace("__MAX_PR__", str(MAX_CLIENT_PREROLL)).replace("__STEP_PR__", str(STEP_CLIENT_PREROLL)).replace("__DEF_LW__", str(DEF_CLIENT_LOWWM))
494
-
495
- # ВЫПРАЎЛЕНАЯ ПРЫВЯЗКА ПАДЗЕЙ
496
- # 1. Націск кнопкі выклікае JS, які рыхтуе плэер і вяртае ўнікальнае значэнне ў схаванае поле `js_trigger`.
497
- run_btn.click(fn=None, js=INIT_AND_RUN_JS, outputs=[js_trigger])
498
-
499
- # 2. Змена значэння ў `js_trigger` запускае асноўную функцыю `text_to_speech` на Python.
500
- run_event = js_trigger.change(
501
- fn=text_to_speech,
502
- inputs=[inp_text, inp_voice, ui_minbuf, ui_firstch, ui_split, ui_tokens, ui_firstseg],
503
- outputs=[stream_pipe, final_file, final_audio, log_pipe, srv_dbg_pipe]
504
- )
505
-
506
- # 3. Кнопка "Спыніць" адмяняе падзею, запушчаную трыгерам, і спыняе плэер.
507
- stop_btn.click(fn=None, js=STOP_JS, cancels=[run_event])
508
 
509
- apply_btn.click(fn=None, js=APPLY_JS)
510
- reset_btn.click(fn=None, js=RESET_JS)
511
- stream_pipe.change(fn=None, inputs=[stream_pipe], js=PUSH_JS)
512
- log_pipe.change(fn=None, inputs=[log_pipe], js=LOG_JS)
513
- srv_dbg_pipe.change(fn=None, inputs=[srv_dbg_pipe], js=SRV_DBG_JS)
514
- gr.Examples(examples=examples, inputs=[inp_text, inp_voice])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
 
516
  if __name__ == "__main__":
517
- demo.launch()
 
3
  os.environ.setdefault("MKL_NUM_THREADS", "1")
4
  os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
5
 
6
+ import sys
7
+ import re
8
+ import time
9
+ import json
10
+ import base64
11
+ import hashlib
12
+ import tempfile
13
+ import subprocess
14
+ import inspect
15
+ from typing import Iterator, Iterable, Optional, Tuple, Any, List
16
  from dataclasses import dataclass
17
+ import pathlib
18
 
19
  import spaces
20
  import gradio as gr
 
23
  from huggingface_hub import hf_hub_download
24
  from scipy.io.wavfile import write
25
 
26
+ # ---------------------------------------------------------
27
+ # 1) coqui-ai-TTS fork
28
+ # ---------------------------------------------------------
29
  REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
30
  REPO_DIR = "coqui-ai-TTS"
31
+
32
  if not os.path.exists(REPO_DIR):
 
33
  subprocess.run(["git", "clone", REPO_URL, REPO_DIR], check=True)
34
 
35
  repo_root = os.path.abspath(REPO_DIR)
 
40
  from TTS.tts.models.xtts import Xtts
41
  from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
42
 
43
+ # ---------------------------------------------------------
44
+ # 2) мадэльныя файлы
45
+ # ---------------------------------------------------------
46
# Hugging Face repository hosting the model artifacts and the local folder
# they are downloaded into.
repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
model_dir = "./model"
os.makedirs(model_dir, exist_ok=True)

checkpoint_file = os.path.join(model_dir, "model.pth")
config_file = os.path.join(model_dir, "config.json")
vocab_file = os.path.join(model_dir, "vocab.json")
default_voice_file = os.path.join(model_dir, "voice.wav")

# Download only the artifacts that are not already present locally.
for fpath in (checkpoint_file, config_file, vocab_file, default_voice_file):
    if os.path.exists(fpath):
        continue
    fname = os.path.basename(fpath)
    hf_hub_download(repo_id, filename=fname, local_dir=model_dir)
59
+
60
+ # ---------------------------------------------------------
61
+ # 3) загрузка мадэлі
62
+ # ---------------------------------------------------------
63
  config = XttsConfig()
64
  config.load_json(config_file)
65
  XTTS_MODEL = Xtts.init_from_config(config)
66
+ XTTS_MODEL.load_checkpoint(
67
+ config,
68
+ checkpoint_path=checkpoint_file,
69
+ vocab_path=vocab_file,
70
+ use_deepspeed=False,
71
+ )
72
 
73
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
74
+
75
  torch.set_num_threads(1)
76
  if device.startswith("cuda"):
77
  torch.backends.cuda.matmul.allow_tf32 = True
 
84
 
85
  tokenizer = VoiceBpeTokenizer(vocab_file=vocab_file)
86
  XTTS_MODEL.tokenizer = tokenizer
87
+
88
+ # =========================================================
89
+ # 4) Streaming-канфіг
90
+ # =========================================================
91
# Seconds of audio accumulated before a buffer is handed to the player.
MIN_BUFFER_S = 0.03
# Chunk size (seconds) requested from the generator itself.
RUNTIME_FIRST_CHUNK_S = 0.02
# Cross-fade length (seconds) used when joining consecutive chunks.
FADE_S = 0.004
TOKENS_PER_STEP = 1
ENABLE_TEXT_SPLITTING = True
# Maximum characters in the first text segment (kept short for stable prosody).
FIRST_SEGMENT_LIMIT = 160
97
+
98
+ # -------------------- утыліты аўдыя ----------------------
99
+ def _seconds_to_samples(sec: float, sr: int) -> int:
100
+ return max(1, int(sec * sr))
101
+
 
 
 
 
 
 
102
  def _to_np_audio(x) -> np.ndarray:
103
+ if isinstance(x, dict) and "wav" in x:
104
+ x = x["wav"]
105
  if isinstance(x, torch.Tensor):
106
+ if x.dtype != torch.float32:
107
+ x = x.float()
108
+ x = x.detach().cpu().contiguous().view(-1)
109
+ return x.numpy()
110
  x = np.asarray(x)
111
+ if x.ndim > 1:
112
+ x = x.reshape(-1)
113
+ if x.dtype != np.float32:
114
+ x = x.astype(np.float32, copy=False)
115
+ return x
116
 
117
  def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> np.ndarray:
118
  if a.size == 0: return b.astype(np.float32, copy=False)
 
120
  a = a.astype(np.float32, copy=False); b = b.astype(np.float32, copy=False)
121
  fade_n = min(_seconds_to_samples(fade_s, sr), a.size, b.size)
122
  if fade_n <= 1: return np.concatenate([a, b], axis=0)
123
+ fade_out = np.linspace(1.0, 0.0, fade_n, endpoint=True, dtype=np.float32)
124
+ fade_in = 1.0 - fade_out
125
+ head = a[:-fade_n]
126
+ tail = (a[-fade_n:] * fade_out) + (b[:fade_n] * fade_in)
127
+ rest = b[fade_n:]
128
  return np.concatenate([head, tail, rest], axis=0)
129
 
 
130
  def _bpe_prefixes(text: str, lang: str, step_tokens: int):
131
  try:
132
+ ids = tokenizer.encode(text, lang=lang)
133
+ n = len(ids)
134
+ for k in range(step_tokens, n + 1, step_tokens):
135
+ yield tokenizer.decode(ids[:k], lang=lang)
136
+ if n % step_tokens != 0:
137
+ yield tokenizer.decode(ids, lang=lang)
138
+ return
139
+ except Exception:
140
+ pass
141
+ pseudo_tokens = re.findall(r"\S+|\s+", text)
142
+ acc = ""
143
+ for i in range(0, len(pseudo_tokens), step_tokens):
144
+ acc = "".join(pseudo_tokens[: i + step_tokens])
145
+ yield acc
146
+ if acc.strip() != text.strip():
147
+ yield text
148
+
149
def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any, **gen_kwargs) -> Iterator[np.ndarray]:
    """Stream audio chunks from the model's own ``inference_stream``.

    Only the generation options that ``inference_stream`` declares in its
    signature are forwarded; the rest of ``gen_kwargs`` is ignored. Each
    chunk is converted to a flat float32 array.
    """
    accepted = inspect.signature(model.inference_stream).parameters
    tunables = ("temperature", "length_penalty", "repetition_penalty", "top_k", "top_p", "stream_chunk_size_s")

    call_kwargs = {
        "text": text,
        "language": language,
        "gpt_cond_latent": gpt_cond_latent,
        "speaker_embedding": speaker_embedding,
    }
    call_kwargs.update(
        {name: gen_kwargs[name] for name in tunables if name in gen_kwargs and name in accepted}
    )

    # Half-precision autocast is enabled only when the app runs on CUDA.
    use_cuda = device.startswith("cuda")
    autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=use_cuda)
    with torch.inference_mode(), autocast_ctx:
        for out in model.inference_stream(**call_kwargs):
            yield _to_np_audio(out)
160
+
161
def _fallback_incremental(model: Xtts, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any, tokens_per_step: int, **gen_kwargs) -> Iterator[np.ndarray]:
    """Emulate streaming when the model has no ``inference_stream``.

    Every growing prefix of *text* is synthesized from scratch and only the
    samples beyond what was already emitted are yielded, so the cost grows
    with the number of prefixes.

    Args:
        tokens_per_step: Tokens added per prefix (see ``_bpe_prefixes``).
        **gen_kwargs: Optional ``temperature``, ``length_penalty``,
            ``repetition_penalty``, ``top_k`` and ``top_p``; each falls back
            to the value this function has always used.
    """
    use_cuda = device.startswith("cuda")
    emitted = 0
    for prefix in _bpe_prefixes(text, language, tokens_per_step):
        autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=use_cuda)
        with torch.inference_mode(), autocast_ctx:
            out = model.inference(
                text=prefix, language=language,
                gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
                temperature=gen_kwargs.get("temperature", 0.1),
                # Honor caller-supplied penalties instead of silently ignoring them.
                length_penalty=gen_kwargs.get("length_penalty", 1.0),
                repetition_penalty=gen_kwargs.get("repetition_penalty", 10.0),
                top_k=gen_kwargs.get("top_k", 10), top_p=gen_kwargs.get("top_p", 0.3),
            )
        wav = _to_np_audio(out)
        # Advance the cursor only forwards: if a longer prefix happens to give
        # shorter audio, moving it back would replay samples later.
        if wav.size > emitted:
            yield wav[emitted:]
            emitted = wav.size
 
 
 
 
 
 
176
 
177
class NewTTSGenerationMixin:
    """Generation helpers that ``init_stream_support`` installs on ``Xtts``.

    ``generate`` is the single entry point: it returns the whole utterance as
    one array, or an iterator of chunks when ``do_stream`` is true.
    """

    @torch.inference_mode()
    def generate(self: Xtts, text: Optional[str] = None, *, do_stream: bool = False, language: str = "be",
                 gpt_cond_latent: Any = None, speaker_embedding: Any = None,
                 min_buffer_s: float = MIN_BUFFER_S, tokens_per_step: int = TOKENS_PER_STEP, **gen_kwargs):
        """Synthesize *text*.

        Args:
            text: Non-empty text to speak.
            do_stream: When true, return the iterator from ``sample_stream``.
            language: Language code passed to the model.
            gpt_cond_latent, speaker_embedding: Voice conditioning latents.
            min_buffer_s: Default stream chunk size in seconds (streaming only).
            tokens_per_step: Prefix growth for the fallback streamer.
            **gen_kwargs: Optional ``temperature``, ``length_penalty``,
                ``repetition_penalty``, ``top_k``, ``top_p`` (and
                ``stream_chunk_size_s`` when streaming).

        Returns:
            A flat float32 array, or an iterator of such arrays when streaming.

        Raises:
            ValueError: If *text* is not a non-empty string.
        """
        # Explicit check rather than ``assert``, which is removed under ``python -O``.
        if not isinstance(text, str) or not text.strip():
            raise ValueError("text is required")

        if do_stream:
            return self.sample_stream(
                text=text, language=language, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
                min_buffer_s=min_buffer_s, tokens_per_step=tokens_per_step, **gen_kwargs
            )

        autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
        with autocast_ctx:
            out = self.inference(
                text=text, language=language,
                gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
                temperature=gen_kwargs.get("temperature", 0.1),
                # Sampling options follow gen_kwargs, as in the streaming
                # paths; the defaults are the previously hard-coded values.
                length_penalty=gen_kwargs.get("length_penalty", 1.0),
                repetition_penalty=gen_kwargs.get("repetition_penalty", 10.0),
                top_k=gen_kwargs.get("top_k", 10),
                top_p=gen_kwargs.get("top_p", 0.3),
            )
        return _to_np_audio(out)

    @torch.inference_mode()
    def sample_stream(self: Xtts, *, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any,
                      min_buffer_s: float = MIN_BUFFER_S, tokens_per_step: int = TOKENS_PER_STEP, **gen_kwargs) -> Iterator[np.ndarray]:
        """Yield audio chunks for *text*.

        Uses the model's ``inference_stream`` when it exists, otherwise the
        prefix re-synthesis fallback. ``min_buffer_s`` becomes the
        ``stream_chunk_size_s`` of the native stream unless the caller passed
        that option explicitly.
        """
        if hasattr(self, "inference_stream"):
            native_kwargs = dict(gen_kwargs)
            native_kwargs.setdefault("stream_chunk_size_s", float(min_buffer_s))
            yield from _native_stream(self, text, language, gpt_cond_latent, speaker_embedding, **native_kwargs)
            return
        yield from _fallback_incremental(self, text, language, gpt_cond_latent, speaker_embedding, tokens_per_step, **gen_kwargs)
209
 
210
def init_stream_support():
    """Install ``generate`` and ``sample_stream`` from the mixin onto ``Xtts``."""
    for method_name in ("generate", "sample_stream"):
        setattr(Xtts, method_name, getattr(NewTTSGenerationMixin, method_name))


init_stream_support()
215
 
216
+ # ---------------------------------------------------------
217
+ # 5) пастаянны кэш латэнтаў (CPU) + GPU-кэш
218
+ # ---------------------------------------------------------
219
  PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
220
  PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
221
+
222
@dataclass(frozen=True)
class LatentsMeta:
    """Immutable settings that are hashed into the latents cache key."""

    model_id: str
    gpt_cond_len: int
    max_ref_len: int
    sound_norm_refs: bool
    xtts_git: Optional[str] = None
229
+
230
  LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
231
  GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}
232
+
233
def _latents_key(path: str | None, meta: LatentsMeta) -> str:
    """Build the cache key for a reference voice file and model settings.

    An existing file is identified by absolute path, modification time and
    size; anything else is treated as the default voice. The key is the MD5
    hex digest of that identity joined with the JSON-serialized settings.
    """
    if path and os.path.exists(path):
        base = ":".join(
            (os.path.abspath(path), str(os.path.getmtime(path)), str(os.path.getsize(path)))
        )
    else:
        base = "default_voice"

    field_names = ("model_id", "gpt_cond_len", "max_ref_len", "sound_norm_refs", "xtts_git")
    meta_str = json.dumps({name: getattr(meta, name) for name in field_names}, sort_keys=True)

    digest = hashlib.md5(f"{base}|{meta_str}".encode("utf-8"))
    return digest.hexdigest()
246
+
247
def _latents_disk_path(key: str) -> pathlib.Path:
    """Return the ``<key>.pt`` cache file path inside ``PERSIST_LATENTS_DIR``."""
    return PERSIST_LATENTS_DIR.joinpath(key + ".pt")
249
+
250
def _save_latents_to_disk(key: str, gpt_cond_latent: torch.Tensor, speaker_embedding: torch.Tensor):
    """Persist CPU copies of the latents for *key* in the on-disk cache.

    The data is written to a temporary file in the cache directory and then
    renamed over the final path, so an interrupted write cannot leave a
    truncated cache entry behind.
    """
    target = _latents_disk_path(key)
    payload = {
        "gpt_cond_latent": gpt_cond_latent.cpu(),
        "speaker_embedding": speaker_embedding.cpu(),
    }

    fd, tmp_name = tempfile.mkstemp(dir=str(target.parent), suffix=".tmp")
    os.close(fd)
    try:
        torch.save(payload, tmp_name)
        os.replace(tmp_name, target)
    except BaseException:
        # Do not leave the temporary file behind when saving failed.
        try:
            os.unlink(tmp_name)
        except OSError:
            pass
        raise
252
+
253
def _load_latents_from_disk(key: str) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
    """Load cached latents for *key* onto the CPU.

    Returns:
        ``(gpt_cond_latent, speaker_embedding)``, or ``None`` when no usable
        cache file exists. An unreadable or malformed file is reported and
        treated as a miss, so the caller recomputes the latents.
    """
    p = _latents_disk_path(key)
    if not p.exists():
        return None
    try:
        # NOTE(review): torch.load may unpickle arbitrary objects depending on
        # the torch version; this directory should only hold files written by
        # this app.
        obj = torch.load(p, map_location="cpu")
        return obj["gpt_cond_latent"], obj["speaker_embedding"]
    except Exception as exc:
        print(f"[warn] ignoring unreadable latents cache {p.name}: {exc}")
        return None
258
+
259
def _compute_latents_cpu(path: str | None) -> Tuple[torch.Tensor, torch.Tensor]:
    """Compute conditioning latents for the reference audio at *path*.

    Runs the model's ``get_conditioning_latents`` with the model's configured
    settings and returns CPU copies of both tensors.
    """
    cfg = XTTS_MODEL.config
    with torch.inference_mode():
        latents = XTTS_MODEL.get_conditioning_latents(
            audio_path=path,
            gpt_cond_len=cfg.gpt_cond_len,
            max_ref_length=cfg.max_ref_len,
            sound_norm_refs=cfg.sound_norm_refs,
        )
    gpt_latent, spk_embedding = latents
    return gpt_latent.cpu(), spk_embedding.cpu()
268
+
269
def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[torch.Tensor, torch.Tensor]:
    """Return conditioning latents for a voice file using the layered caches.

    Lookup order is the in-memory CPU cache, the on-disk cache, then a fresh
    computation (which is also saved to disk). When ``to_device`` names a
    CUDA device the tensors are moved there once and kept in a per-device
    cache; otherwise the CPU tensors are returned.
    """
    cfg = XTTS_MODEL.config
    meta = LatentsMeta(
        model_id=repo_id,
        gpt_cond_len=cfg.gpt_cond_len,
        max_ref_len=cfg.max_ref_len,
        sound_norm_refs=cfg.sound_norm_refs,
        xtts_git=None,
    )
    key = _latents_key(path, meta)

    cpu_pair = LATENT_CACHE.get(key)
    if cpu_pair is None:
        cpu_pair = _load_latents_from_disk(key)
        if cpu_pair is None:
            cpu_pair = _compute_latents_cpu(path)
            _save_latents_to_disk(key, cpu_pair[0], cpu_pair[1])
        LATENT_CACHE[key] = (cpu_pair[0], cpu_pair[1])
    g, s = cpu_pair

    wants_cuda = bool(to_device) and to_device.startswith("cuda")
    if not wants_cuda:
        return g, s

    dev_key = (key, to_device)
    on_device = GPU_LATENT_CACHE.get(dev_key)
    if on_device is None:
        on_device = (g.to(to_device, non_blocking=True), s.to(to_device, non_blocking=True))
        GPU_LATENT_CACHE[dev_key] = on_device
    return on_device
299
+
300
+ # аўтападлік для default voice (CPU) — без дадатковых запытаў
301
# Warm the caches for the bundled default voice at import time. A failure is
# only reported; the latents are then computed on the first request instead.
try:
    _latents_for(default_voice_file)
except Exception as exc:
    print(f"[warn] precompute default voice latents failed: {exc}")
305
+
306
+ # ---------------------------------------------------------
307
+ # 6) буферы + base64
308
+ # ---------------------------------------------------------
309
def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
    """Join all streamed chunks into one signal, cross-fading each seam.

    A cross-fade only touches the last ``FADE_S`` seconds of the audio
    gathered so far, so everything before that window is set aside and joined
    once at the end. This gives the same samples as cross-fading onto the
    full accumulated signal each time, without copying it for every chunk.

    Returns:
        The merged signal, or an empty float32 array when *chunks* is empty.
    """
    if not chunks:
        return np.zeros((0,), dtype=np.float32)
    if len(chunks) == 1:
        return chunks[0]

    fade_n = _seconds_to_samples(FADE_S, sampling_rate)
    finished: List[np.ndarray] = []  # audio no later cross-fade can modify
    tail = chunks[0].astype(np.float32, copy=False)
    for chunk in chunks[1:]:
        if tail.size > fade_n:
            finished.append(tail[:-fade_n])
            tail = tail[-fade_n:]
        tail = _crossfade_concat(tail, chunk, sampling_rate, FADE_S)
    finished.append(tail)
    return finished[0] if len(finished) == 1 else np.concatenate(finished, axis=0)
315
+
316
def _chunker(chunks: Iterable[np.ndarray], sr: int, target_s: float) -> Iterable[np.ndarray]:
    """Regroup incoming audio chunks into buffers of at least ``target_s`` seconds.

    Incoming pieces are cross-faded onto a pending buffer; once it reaches
    the target length it is yielded and a new buffer is started. Empty pieces
    are skipped and any remainder is yielded at the end.
    """
    threshold = _seconds_to_samples(target_s, sr)
    pending = np.zeros((0,), dtype=np.float32)
    for raw in chunks:
        piece = _to_np_audio(raw)
        if piece.size == 0:
            continue
        if pending.size == 0:
            pending = piece
        else:
            pending = _crossfade_concat(pending, piece, sr, FADE_S)
        if pending.size < threshold:
            continue
        yield pending
        pending = np.zeros((0,), dtype=np.float32)
    if pending.size:
        yield pending
327
+
328
  def _pcm_f32_to_b64(x: np.ndarray) -> str:
329
  if x.dtype != np.float32: x = x.astype(np.float32, copy=False)
330
  return base64.b64encode(x.tobytes()).decode("ascii")
331
 
332
+ # ---------------------------------------------------------
333
+ # 7) падзел тэксту: хуткі + fallback
334
+ # ---------------------------------------------------------
335
  _SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
336
  _WS = re.compile(r"\s+")
337
+
338
def _fast_split(text: str, limit: int) -> List[str]:
    """Split *text* into chunks of at most *limit* characters where possible.

    The text is cut after sentence terminators, then sentences are packed
    greedily into chunks. A sentence longer than *limit* is packed word by
    word; a single word longer than *limit* is kept whole.
    """
    text = text.strip()
    if not text:
        return []

    # Pass 1: cut after every sentence terminator.
    sentences = []
    cursor = 0
    for match in _SENT_END.finditer(text):
        stop = match.end()
        sentences.append(text[cursor:stop].strip())
        cursor = stop
    if cursor < len(text):
        sentences.append(text[cursor:].strip())

    # Pass 2: pack sentences into chunks.
    packed = []
    current = ""
    for sentence in sentences:
        if len(current) + 1 + len(sentence) <= limit:
            current = (current + " " + sentence).strip() if current else sentence
            continue

        if current:
            packed.append(current)
        if len(sentence) <= limit:
            current = sentence
            continue

        # The sentence alone is too long: pack it word by word.
        line = ""
        for word in _WS.split(sentence):
            if len(line) + 1 + len(word) <= limit:
                line = (line + " " + word).strip() if line else word
            else:
                if line:
                    packed.append(line)
                line = word
        current = line

    if current:
        packed.append(current)
    return [chunk for chunk in packed if chunk]
369
+
370
def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[str]:
    """Split text into synthesis segments, with a short first segment.

    For text longer than ``FIRST_SEGMENT_LIMIT`` the first segment is cut
    from the first ``FIRST_SEGMENT_LIMIT`` characters: at the last sentence
    terminator when that leaves more than 30 characters, otherwise at the
    last whitespace so a word is not cut in half. The remainder goes through
    ``_fast_split``; if that returns nothing, or pieces covering less than
    60% of the remainder, the library's ``split_sentence`` is tried instead.

    Returns:
        The ordered list of segments; empty for blank input.
    """
    text_in = text_in.strip()
    if not text_in:
        return []

    parts: List[str] = []
    text_for_rest = text_in
    if len(text_in) > FIRST_SEGMENT_LIMIT:
        head = text_in[:FIRST_SEGMENT_LIMIT]
        # DOTALL anchors the match at offset 0 even when the head contains
        # newlines, so m.end() is a valid offset into text_in and no text is
        # lost or duplicated when cutting the tail.
        m = re.search(r".*[\.!\?…»)]", head, flags=re.DOTALL)
        if m and m.end() > 30:
            head = head[:m.end()]
        elif not text_in[FIRST_SEGMENT_LIMIT].isspace():
            # No usable sentence end and the hard cut falls inside a word:
            # back up to the last whitespace if enough text remains.
            cut = max((i for i, ch in enumerate(head) if ch.isspace()), default=-1)
            if cut > 30:
                head = head[:cut].rstrip()
        parts.append(head)
        text_for_rest = text_in[len(head):].lstrip()

    if not text_for_rest:
        return parts or [text_in]

    rest = _fast_split(text_for_rest, chunk_limit)
    if not rest or sum(len(x) for x in rest) < int(0.6 * len(text_for_rest)):
        try:
            rest2 = split_sentence(text_for_rest, lang=lang_short, text_split_length=chunk_limit)
            rest2 = [s.strip() for s in rest2 if s and s.strip()]
            if rest2:
                rest = rest2
        except Exception:
            # Best effort: keep the fast-split result if the library fails.
            pass
    return parts + (rest or [text_for_rest])
395
 
396
+ # ---------------------------------------------------------
397
+ # 8) TTS — стрим + фінальны файл + лагі
398
+ # ---------------------------------------------------------
399
@spaces.GPU(duration=60)
def text_to_speech(belarusian_story, speaker_audio_file=None):
    """Synthesize speech for the text and stream it to the UI.

    This is a generator. Every yielded 4-tuple maps onto the Gradio outputs
    ``(stream_pipe, final_file, final_audio, log_pipe)``:

    * ``stream_pipe``: a base64 float32 PCM chunk, ``""`` for the initial
      metrics-only update, or ``"__STOP__"`` when the stream is over.
    * ``final_file`` / ``final_audio``: path of the complete WAV, set only
      in the last update.
    * ``log_pipe``: JSON with server timings in seconds, or ``None``.

    Args:
        belarusian_story: Text to speak.
        speaker_audio_file: Reference voice recording; the bundled default
            voice is used when it is missing.

    Raises:
        gr.Error: If the text is empty or the final WAV cannot be written.
    """
    t0 = time.perf_counter()

    if not belarusian_story or str(belarusian_story).strip() == "":
        raise gr.Error("Увядзі хоць нейкі тэкст 🙂")

    if not speaker_audio_file or (
        not isinstance(speaker_audio_file, str)
        and getattr(speaker_audio_file, "name", "") == ""
    ):
        speaker_audio_file = default_voice_file

    text_in = str(belarusian_story).strip()
    lang_short = "be"
    chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)

    # Voice latents (served from the CPU/GPU caches when possible).
    t_lat0 = time.perf_counter()
    to_dev = "cuda:0" if torch.cuda.is_available() else None
    gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
    t_lat1 = time.perf_counter()

    # Text segmentation.
    t_split0 = time.perf_counter()
    texts = _split_text_smart(text_in, lang_short, chunk_limit) if ENABLE_TEXT_SPLITTING else [text_in]
    if not texts:
        texts = [text_in]
    t_split1 = time.perf_counter()

    server_metrics = {
        "latents_s": (t_lat1 - t_lat0),
        "text_split_s": (t_split1 - t_split0),
        "gen_init_to_first_chunk_s": None,
        "until_first_chunk_total_s": None,
        "server_unaccounted_before_first_chunk_s": None,
        "file_write_s": None,
    }
    # Publish the timings known so far before generation starts.
    yield ("", None, None, json.dumps(server_metrics))

    full_audio_chunks: List[np.ndarray] = []
    first_chunk_seen = False
    t_gen0 = time.perf_counter()

    for part in texts:
        gen = XTTS_MODEL.generate(
            text=part, do_stream=True, language=lang_short,
            gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
            min_buffer_s=RUNTIME_FIRST_CHUNK_S,
            tokens_per_step=TOKENS_PER_STEP,
            stream_chunk_size_s=RUNTIME_FIRST_CHUNK_S,
            temperature=0.1, length_penalty=1.0, repetition_penalty=10.0,
            top_k=10, top_p=0.3,
        )
        for buf in _chunker(gen, sampling_rate, MIN_BUFFER_S):
            if not first_chunk_seen:
                t_first = time.perf_counter()
                server_metrics["gen_init_to_first_chunk_s"] = (t_first - t_gen0)
                server_metrics["until_first_chunk_total_s"] = (t_first - t0)
                known = server_metrics["latents_s"] + server_metrics["text_split_s"] + server_metrics["gen_init_to_first_chunk_s"]
                other = server_metrics["until_first_chunk_total_s"] - known
                server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, other)
                first_chunk_seen = True
                yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
            else:
                yield (_pcm_f32_to_b64(buf), None, None, None)
            full_audio_chunks.append(buf)

    if not full_audio_chunks:
        yield ("__STOP__", None, None, json.dumps(server_metrics))
        return

    t_w0 = time.perf_counter()
    full_audio = _merge_for_file(full_audio_chunks)
    try:
        # Only the unique path is needed: close the handle right away so the
        # descriptor is not leaked and the file can be reopened for writing.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            wav_path = tmp.name
        write(wav_path, sampling_rate, full_audio.astype(np.float32))
    except Exception as e:
        raise gr.Error(f"Памылка пры запісе фінальнага WAV: {e}") from e
    finally:
        server_metrics["file_write_s"] = (time.perf_counter() - t_w0)

    yield ("__STOP__", wav_path, wav_path, json.dumps(server_metrics))
489
+
490
+ # ---------------------------------------------------------
491
+ # 9) UI (лагі ў секундах + Play Final; без underrun’аў)
492
+ # ---------------------------------------------------------
493
+ examples = [
494
+ ["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", "Nestarka.wav"],
495
+ ]
496
+
 
 
 
497
# UI: the streaming player lives in the browser (window.__wa). The Python
# generator feeds it through two hidden textboxes: stream_pipe carries base64
# PCM chunks and log_pipe carries the server timing JSON.
with gr.Blocks() as demo:
    gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")

    with gr.Row():
        inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
        inp_voice = gr.Audio(type="filepath", label="Прыклад голасу (6–10 сек)", interactive=True)

    with gr.Row():
        play_btn = gr.Button("▶️ Play (stream)")
        stop_btn = gr.Button("⏹ Stop (stream)")
        run_btn = gr.Button("Згенераваць")
        gr.Markdown(f"**Sample rate:** {sampling_rate} Hz")

    log_panel = gr.HTML(
        value='<div id="wa-log" style="font-family:system-ui;font-size:12px;white-space:pre-line">[лог пусты]</div>',
        label="Лагі плэера",
    )

    stream_pipe = gr.Textbox(value="", visible=False, label="stream_pipe")
    log_pipe = gr.Textbox(value="", visible=False, label="log_pipe")

    final_file = gr.File(label="Згенераваны WAV (спампаваць)")
    final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
    play_final_btn = gr.Button("▶️ Play Final")

    # Runs on every click of the generate button: creates the player on the
    # first click and resets it on later ones. __SR__ is replaced below with
    # the model sample rate.
    INIT_RESET_AND_PLAY_JS = """
    () => {
      const sampleRate = __SR__;
      const AC = window.AudioContext || window.webkitAudioContext;
      if (!AC) return;

      // Queued chunks required before playback starts automatically.
      const PRIME_CHUNKS = 2;

      function toSec(ms) { return ms / 1000; }
      function isNum(x) { return typeof x === "number" && isFinite(x); }
      function fmtS(x) { return isNum(x) ? x.toFixed(3) + " s" : "n/a"; }

      function logUpdate() {
        const el = document.getElementById('wa-log');
        if (!el || !window.__wa || !window.__wa.meta) return;
        const m = window.__wa.meta;
        const lines = [];
        lines.push("Клік (Згенераваць): 0.000 s");

        let click_to_first_chunk_s = null;
        if (m.t_first_push_ms) {
          click_to_first_chunk_s = toSec(m.t_first_push_ms - m.t_click_ms);
          lines.push("Першы чанк прыйшоў: " + click_to_first_chunk_s.toFixed(3) + " s");
          if (m.t_first_audio_ms) {
            lines.push("Пачатак прайгравання: " + (toSec(m.t_first_audio_ms - m.t_click_ms)).toFixed(3) + " s");
            lines.push("Затрымка (чанк→аўдыя): " + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + " s");
          }
        }

        const s = (m.server || {});
        lines.push("");
        lines.push("— Серверныя метрыкі —");
        lines.push("Latents (умоўны голас): " + fmtS(s.latents_s));
        lines.push("Падзел тэксту: " + fmtS(s.text_split_s));
        lines.push("Ініт→1-ы чанк: " + fmtS(s.gen_init_to_first_chunk_s));
        lines.push("Усё да 1-га чанка: " + fmtS(s.until_first_chunk_total_s));
        lines.push("Іншая серверная апрац.: " + fmtS(s.server_unaccounted_before_first_chunk_s));
        lines.push("Запіс WAV: " + fmtS(s.file_write_s));

        lines.push("");
        // Only estimate once both numbers exist; a missing server metric
        // must show as n/a instead of a misleading 0.000.
        if (click_to_first_chunk_s !== null && isNum(s.until_first_chunk_total_s)) {
          const est_queue_net = Math.max(0, click_to_first_chunk_s - s.until_first_chunk_total_s);
          lines.push("Ацэнка чаргі ZeroGPU + сеткі: " + est_queue_net.toFixed(3) + " s");
        } else {
          lines.push("Ацэнка чаргі ZeroGPU + сеткі: n/a");
        }

        lines.push("");
        lines.push("Статус стриму: " + (window.__wa.playing ? "playing" : "stopped"));
        el.textContent = lines.join("\\n");
        try { console.log(lines.join("\\n")); } catch (e) {}
      }

      if (!window.__wa) {
        const ctx = new AC({ sampleRate });
        const bufferSize = 2048;  // larger buffer = fewer underruns
        const node = ctx.createScriptProcessor(bufferSize, 0, 1);
        let queue = [];
        let playing = false;
        let eos = false;
        // True after the user pressed Stop, so incoming chunks do not
        // silently restart playback.
        let userStopped = false;

        const meta = {
          t_click_ms: performance.now(),
          t_first_push_ms: null,
          t_first_audio_ms: null,
          server: null,
        };

        node.onaudioprocess = (e) => {
          const out = e.outputBuffer.getChannelData(0);
          let i = 0;
          while (i < out.length) {
            if (queue.length === 0 || !playing) { out.fill(0.0, i); break; }
            const cur = queue[0];
            const take = Math.min(cur.length, out.length - i);
            if (meta.t_first_audio_ms === null) {
              meta.t_first_audio_ms = performance.now();
              logUpdate();
            }
            out.set(cur.subarray(0, take), i);
            i += take;
            if (take === cur.length) queue.shift();
            else queue[0] = cur.subarray(take);
          }
          if (eos && queue.length === 0 && playing) {
            playing = false;
            logUpdate();
          }
        };
        node.connect(ctx.destination);

        const start = async () => {
          try { await ctx.resume(); } catch (e) {}
          userStopped = false;
          playing = true;
          logUpdate();
        };

        window.__wa = {
          ctx, node,
          get playing() { return playing; },
          get eos() { return eos; },
          set eos(v) {
            eos = v;
            // A stream with fewer than PRIME_CHUNKS chunks would otherwise
            // never start playing.
            if (v && !playing && !userStopped && queue.length > 0) start();
          },
          meta,
          push: (f32) => {
            queue.push(f32);
            if (!meta.t_first_push_ms) {
              meta.t_first_push_ms = performance.now();
              logUpdate();
            }
            if (!playing && !userStopped && queue.length >= PRIME_CHUNKS) start();
          },
          start,
          stop: () => { userStopped = true; playing = false; logUpdate(); },
          reset: () => {
            playing = false; eos = false; userStopped = false; queue = [];
            meta.t_first_push_ms = null; meta.t_first_audio_ms = null;
            logUpdate();
          },
          updateLog: logUpdate,
        };
      } else {
        window.__wa.reset();
        window.__wa.meta.t_click_ms = performance.now();
      }
    }
    """.replace("__SR__", str(sampling_rate))

    STOP_JS = "() => { if (window.__wa) window.__wa.stop(); }"
    PLAY_JS = "() => { if (window.__wa) window.__wa.start(); }"

    # Decodes one base64 chunk from stream_pipe and queues it in the player.
    PUSH_JS = """
    (b64) => {
      if (!window.__wa || !b64) return;
      if (b64 === "__STOP__") { window.__wa.eos = true; window.__wa.updateLog && window.__wa.updateLog(); return; }
      const bin = atob(b64);
      const len = bin.length;
      if (len % 4 !== 0) return;  // not a whole number of float32 samples
      const buf = new ArrayBuffer(len);
      const view = new Uint8Array(buf);
      for (let i = 0; i < len; i++) view[i] = bin.charCodeAt(i);
      window.__wa.push(new Float32Array(buf));
    }
    """

    # Stores the server metrics JSON from log_pipe and refreshes the log panel.
    LOG_JS = """
    (js) => {
      if (!window.__wa) return;
      try {
        if (js) {
          const obj = JSON.parse(js);
          window.__wa.meta.server = obj;
          window.__wa.updateLog && window.__wa.updateLog();
        }
      } catch (e) {}
    }
    """

    PLAY_FINAL_JS = """
    () => {
      const host = document.getElementById('final-audio');
      if (!host) return;
      const audio = host.querySelector('audio');
      if (audio) { try { audio.play(); } catch(e) {} }
    }
    """

    play_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_JS)
    stop_btn.click(fn=None, inputs=[], outputs=[], js=STOP_JS)

    # Two listeners on the same click: the first prepares or resets the
    # browser player, the second starts the Python generator.
    run_btn.click(fn=None, inputs=[], outputs=[], js=INIT_RESET_AND_PLAY_JS)
    run_btn.click(fn=text_to_speech, inputs=[inp_text, inp_voice], outputs=[stream_pipe, final_file, final_audio, log_pipe])

    stream_pipe.change(fn=None, inputs=[stream_pipe], outputs=[], js=PUSH_JS)
    log_pipe.change(fn=None, inputs=[log_pipe], outputs=[], js=LOG_JS)

    play_final_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_FINAL_JS)

    gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)
700
 
701
  if __name__ == "__main__":
702
+ demo.launch()