archivartaunik commited on
Commit
6da2e8a
·
verified ·
1 Parent(s): 3e4a584

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +285 -439
app.py CHANGED
@@ -23,9 +23,9 @@ import numpy as np
23
  from huggingface_hub import hf_hub_download
24
  from scipy.io.wavfile import write
25
 
26
- # =========================================================
27
- # 0) Repo + imports для XTTS (coqui fork)
28
- # =========================================================
29
  REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
30
  REPO_DIR = "coqui-ai-TTS"
31
 
@@ -40,28 +40,29 @@ from TTS.tts.configs.xtts_config import XttsConfig
40
  from TTS.tts.models.xtts import Xtts
41
  from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
42
 
43
- # =========================================================
44
- # 1) Мадэльныя файлы
45
- # =========================================================
46
  repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
47
  model_dir = "./model"
48
  os.makedirs(model_dir, exist_ok=True)
49
 
 
 
 
 
 
50
  for fname in ("model.pth", "config.json", "vocab.json", "voice.wav"):
51
  fpath = os.path.join(model_dir, fname)
52
  if not os.path.exists(fpath):
53
  hf_hub_download(repo_id, filename=fname, local_dir=model_dir)
54
 
55
- checkpoint_file = os.path.join(model_dir, "model.pth")
56
- config_file = os.path.join(model_dir, "config.json")
57
- vocab_file = os.path.join(model_dir, "vocab.json")
58
- default_voice_file = os.path.join(model_dir, "voice.wav")
59
-
60
- # =========================================================
61
- # 2) Загрузка мадэлі + CUDA налады
62
- # =========================================================
63
- config = XttsConfig(); config.load_json(config_file)
64
- XTTS_MODEL: Xtts = Xtts.init_from_config(config)
65
  XTTS_MODEL.load_checkpoint(
66
  config,
67
  checkpoint_path=checkpoint_file,
@@ -81,23 +82,20 @@ if device.startswith("cuda"):
81
  XTTS_MODEL.to(device).eval()
82
  sampling_rate = int(XTTS_MODEL.config.audio["sample_rate"])
83
 
84
- # tokenizer
85
  tokenizer = VoiceBpeTokenizer(vocab_file=vocab_file)
86
  XTTS_MODEL.tokenizer = tokenizer
87
 
88
  # =========================================================
89
- # 3) Канстанты стриму
90
  # =========================================================
91
- MIN_BUFFER_S = 0.020
92
- RUNTIME_FIRST_CHUNK_S = 0.010
93
  FADE_S = 0.004
94
  TOKENS_PER_STEP = 1
95
  ENABLE_TEXT_SPLITTING = True
96
- FIRST_SEGMENT_LIMIT = 120
97
 
98
- # =========================================================
99
- # 4) Аўдыя-ўтыліты
100
- # =========================================================
101
  def _seconds_to_samples(sec: float, sr: int) -> int:
102
  return max(1, int(sec * sr))
103
 
@@ -117,59 +115,18 @@ def _to_np_audio(x) -> np.ndarray:
117
  return x
118
 
119
  def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> np.ndarray:
120
- if a.size == 0:
121
- return b.astype(np.float32, copy=False)
122
- if b.size == 0:
123
- return a.astype(np.float32, copy=False)
124
- a = a.astype(np.float32, copy=False)
125
- b = b.astype(np.float32, copy=False)
126
  fade_n = min(_seconds_to_samples(fade_s, sr), a.size, b.size)
127
- if fade_n <= 1:
128
- return np.concatenate([a, b], axis=0)
129
  fade_out = np.linspace(1.0, 0.0, fade_n, endpoint=True, dtype=np.float32)
130
- fade_in = 1.0 - fade_out
131
  head = a[:-fade_n]
132
  tail = (a[-fade_n:] * fade_out) + (b[:fade_n] * fade_in)
133
  rest = b[fade_n:]
134
  return np.concatenate([head, tail, rest], axis=0)
135
 
136
- def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
137
- if not chunks:
138
- return np.zeros((0,), dtype=np.float32)
139
- out = chunks[0]
140
- for i in range(1, len(chunks)):
141
- out = _crossfade_concat(out, chunks[i], sampling_rate, FADE_S)
142
- return out
143
-
144
- def _chunker(chunks: Iterable[np.ndarray], sr: int, target_s: float) -> Iterable[np.ndarray]:
145
- target_samples = _seconds_to_samples(target_s, sr)
146
- buf = np.zeros((0,), dtype=np.float32)
147
- first = True
148
- for c in chunks:
149
- c = _to_np_audio(c)
150
- if c.size == 0:
151
- continue
152
- if first:
153
- buf = c if buf.size == 0 else np.concatenate([buf, c], axis=0)
154
- first = False
155
- else:
156
- buf = c if buf.size == 0 else _crossfade_concat(buf, c, sr, FADE_S)
157
- if buf.size >= target_samples:
158
- yield buf
159
- buf = np.zeros((0,), dtype=np.float32)
160
- if buf.size:
161
- yield buf
162
-
163
- def _pcm_f32_to_int16_b64(x: np.ndarray) -> str:
164
- if x.dtype != np.float32:
165
- x = x.astype(np.float32, copy=False)
166
- y = np.clip(x, -1.0, 0.9999695)
167
- i16 = (y * 32767.0).astype("<i2", copy=False)
168
- return base64.b64encode(i16.tobytes()).decode("ascii")
169
-
170
- # =========================================================
171
- # 5) BPE-prefix і стрим-генерацыя з fallback
172
- # =========================================================
173
  def _bpe_prefixes(text: str, lang: str, step_tokens: int):
174
  try:
175
  ids = tokenizer.encode(text, lang=lang)
@@ -192,7 +149,7 @@ def _bpe_prefixes(text: str, lang: str, step_tokens: int):
192
  def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any, **gen_kwargs) -> Iterator[np.ndarray]:
193
  sig = inspect.signature(model.inference_stream)
194
  call_kwargs = dict(text=text, language=language, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding)
195
- for k in ("temperature", "length_penalty", "repetition_penalty", "top_k", "top_p", "stream_chunk_size_s"):
196
  if k in gen_kwargs and k in sig.parameters:
197
  call_kwargs[k] = gen_kwargs[k]
198
  autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
@@ -207,76 +164,42 @@ def _fallback_incremental(model: Xtts, text: str, language: str, gpt_cond_latent
207
  autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
208
  with torch.inference_mode(), autocast_ctx:
209
  out = model.inference(
210
- text=prefix,
211
- language=language,
212
- gpt_cond_latent=gpt_cond_latent,
213
- speaker_embedding=speaker_embedding,
214
  temperature=gen_kwargs.get("temperature", 0.1),
215
- length_penalty=1.0,
216
- repetition_penalty=10.0,
217
- top_k=gen_kwargs.get("top_k", 10),
218
- top_p=gen_kwargs.get("top_p", 0.3),
219
  )
220
  wav = _to_np_audio(out)
221
- new_part = wav[emitted:]
222
- emitted = wav.size
223
- if new_part.size:
224
- yield new_part
225
 
226
  class NewTTSGenerationMixin:
227
  @torch.inference_mode()
228
- def generate(
229
- self: Xtts,
230
- text: Optional[str] = None,
231
- *,
232
- do_stream: bool = False,
233
- language: str = "be",
234
- gpt_cond_latent: Any = None,
235
- speaker_embedding: Any = None,
236
- min_buffer_s: float = MIN_BUFFER_S,
237
- tokens_per_step: int = TOKENS_PER_STEP,
238
- **gen_kwargs,
239
- ):
240
  assert isinstance(text, str) and text.strip(), "text is required"
241
  if not do_stream:
242
  autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
243
  with autocast_ctx:
244
  out = self.inference(
245
- text=text,
246
- language=language,
247
- gpt_cond_latent=gpt_cond_latent,
248
- speaker_embedding=speaker_embedding,
249
  temperature=gen_kwargs.get("temperature", 0.1),
250
- length_penalty=1.0,
251
- repetition_penalty=10.0,
252
- top_k=10,
253
- top_p=0.3,
254
  )
255
  return _to_np_audio(out)
256
  return self.sample_stream(
257
- text=text,
258
- language=language,
259
- gpt_cond_latent=gpt_cond_latent,
260
- speaker_embedding=speaker_embedding,
261
- min_buffer_s=min_buffer_s,
262
- tokens_per_step=tokens_per_step,
263
- **gen_kwargs,
264
  )
265
 
266
  @torch.inference_mode()
267
- def sample_stream(
268
- self: Xtts,
269
- *,
270
- text: str,
271
- language: str,
272
- gpt_cond_latent: Any,
273
- speaker_embedding: Any,
274
- min_buffer_s: float = MIN_BUFFER_S,
275
- tokens_per_step: int = TOKENS_PER_STEP,
276
- **gen_kwargs,
277
- ) -> Iterator[np.ndarray]:
278
- local_kwargs = dict(gen_kwargs)
279
- local_kwargs.setdefault("stream_chunk_size_s", float(min_buffer_s))
280
  if hasattr(self, "inference_stream"):
281
  for chunk in _native_stream(self, text, language, gpt_cond_latent, speaker_embedding, **local_kwargs):
282
  yield chunk
@@ -290,9 +213,9 @@ def init_stream_support():
290
 
291
  init_stream_support()
292
 
293
- # =========================================================
294
- # 6) Кэш латэнтаў (CPU/GPU) + дыск
295
- # =========================================================
296
  PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
297
  PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
298
 
@@ -312,16 +235,13 @@ def _latents_key(path: str | None, meta: LatentsMeta) -> str:
312
  base = f"{os.path.abspath(path)}:{os.path.getmtime(path)}:{os.path.getsize(path)}"
313
  else:
314
  base = "default_voice"
315
- meta_str = json.dumps(
316
- {
317
- "model_id": meta.model_id,
318
- "gpt_cond_len": meta.gpt_cond_len,
319
- "max_ref_len": meta.max_ref_len,
320
- "sound_norm_refs": meta.sound_norm_refs,
321
- "xtts_git": meta.xtts_git,
322
- },
323
- sort_keys=True,
324
- )
325
  return hashlib.md5((base + "|" + meta_str).encode("utf-8")).hexdigest()
326
 
327
  def _latents_disk_path(key: str) -> pathlib.Path:
@@ -332,8 +252,7 @@ def _save_latents_to_disk(key: str, gpt_cond_latent: torch.Tensor, speaker_embed
332
 
333
  def _load_latents_from_disk(key: str) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
334
  p = _latents_disk_path(key)
335
- if not p.exists():
336
- return None
337
  obj = torch.load(p, map_location="cpu")
338
  return obj["gpt_cond_latent"], obj["speaker_embedding"]
339
 
@@ -378,74 +297,83 @@ def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[
378
  return g2, s2
379
  return g, s
380
 
381
- # warm-up
382
  try:
383
  _ = _latents_for(default_voice_file)
384
- if device.startswith("cuda"):
385
- g_gpu, s_gpu = _latents_for(default_voice_file, to_device=device)
386
- with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
387
- _ = XTTS_MODEL.inference(
388
- text=".", language="be",
389
- gpt_cond_latent=g_gpu, speaker_embedding=s_gpu,
390
- temperature=0.1, top_k=1, top_p=0.1,
391
- )
392
  except Exception as e:
393
- print(f"[warn] warm-up failed: {e}")
394
 
395
- # =========================================================
396
- # 7) Падзел тэксту
397
- # =========================================================
398
- _SENT_END = re.compile(r'([\.!\?…]+[»")\]]*\s+)')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  _WS = re.compile(r"\s+")
400
 
401
  def _fast_split(text: str, limit: int) -> List[str]:
402
  text = text.strip()
403
- if not text:
404
- return []
405
  parts = []
406
  start = 0
407
  for m in _SENT_END.finditer(text):
408
  end = m.end()
409
  parts.append(text[start:end].strip())
410
  start = end
411
- if start < len(text):
412
- parts.append(text[start:].strip())
413
  chunks = []
414
  cur = ""
415
  for s in parts:
416
  if len(cur) + 1 + len(s) <= limit:
417
  cur = (cur + " " + s).strip() if cur else s
418
  else:
419
- if cur:
420
- chunks.append(cur)
421
  if len(s) <= limit:
422
  cur = s
423
  else:
424
- w = _WS.split(s)
425
- acc = ""
426
  for tok in w:
427
  if len(acc) + 1 + len(tok) <= limit:
428
  acc = (acc + " " + tok).strip() if acc else tok
429
  else:
430
- if acc:
431
- chunks.append(acc)
432
  acc = tok
433
- if acc:
434
- cur = acc
435
- else:
436
- cur = ""
437
- if cur:
438
- chunks.append(cur)
439
  return [c for c in chunks if c]
440
 
441
  def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[str]:
442
  text_in = text_in.strip()
443
- if not text_in:
444
- return []
445
  parts: List[str] = []
446
  if len(text_in) > FIRST_SEGMENT_LIMIT:
447
  head = text_in[:FIRST_SEGMENT_LIMIT]
448
- m = re.search(r'.*[\.!\?…»)]', head)
449
  if m and len(m.group(0)) > 30:
450
  head = m.group(0)
451
  tail = text_in[len(head):].lstrip()
@@ -453,33 +381,38 @@ def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[s
453
  text_for_rest = tail
454
  else:
455
  text_for_rest = text_in
456
- if not text_for_rest:
457
- return parts or [text_in]
458
 
459
  rest = _fast_split(text_for_rest, chunk_limit)
460
  if not rest or sum(len(x) for x in rest) < int(0.6 * len(text_for_rest)):
461
  try:
462
  rest2 = split_sentence(text_for_rest, lang=lang_short, text_split_length=chunk_limit)
463
  rest2 = [s.strip() for s in rest2 if s and s.strip()]
464
- if rest2:
465
- rest = rest2
466
  except Exception:
467
  pass
468
  return parts + (rest or [text_for_rest])
469
 
470
- # =========================================================
471
- # 8) TTS стрим (выдае JSON-пакеты ў Textbox)
472
- # =========================================================
473
  @spaces.GPU(duration=60)
474
  def text_to_speech(belarusian_story, speaker_audio_file=None):
 
 
 
 
 
 
 
475
  t0 = time.perf_counter()
476
 
477
  if not belarusian_story or str(belarusian_story).strip() == "":
478
- yield (json.dumps({"seq": 0, "b64": "", "log": None, "stop": False}), None, None)
479
  raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
480
 
481
  if not speaker_audio_file or (
482
- not isinstance(speaker_audio_file, str) and getattr(speaker_audio_file, "name", "") == ""
 
483
  ):
484
  speaker_audio_file = default_voice_file
485
 
@@ -487,17 +420,16 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
487
  lang_short = "be"
488
  chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
489
 
490
- # Latents
491
  t_lat0 = time.perf_counter()
492
- to_dev = device if device.startswith("cuda") else None
493
  gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
494
  t_lat1 = time.perf_counter()
495
 
496
  # Split
497
  t_split0 = time.perf_counter()
498
  texts = _split_text_smart(text_in, lang_short, chunk_limit) if ENABLE_TEXT_SPLITTING else [text_in]
499
- if not texts:
500
- texts = [text_in]
501
  t_split1 = time.perf_counter()
502
 
503
  server_metrics = {
@@ -508,9 +440,7 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
508
  "server_unaccounted_before_first_chunk_s": None,
509
  "file_write_s": None,
510
  }
511
-
512
- seq = 0
513
- yield (json.dumps({"seq": seq, "b64": "", "log": server_metrics, "stop": False}), None, None)
514
 
515
  full_audio_chunks: List[np.ndarray] = []
516
  first_chunk_seen = False
@@ -518,339 +448,255 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
518
 
519
  for part in texts:
520
  gen = XTTS_MODEL.generate(
521
- text=part,
522
- do_stream=True,
523
- language=lang_short,
524
- gpt_cond_latent=gpt_cond_latent,
525
- speaker_embedding=speaker_embedding,
526
  min_buffer_s=RUNTIME_FIRST_CHUNK_S,
527
  tokens_per_step=TOKENS_PER_STEP,
528
  stream_chunk_size_s=RUNTIME_FIRST_CHUNK_S,
529
- temperature=0.1,
530
- length_penalty=1.0,
531
- repetition_penalty=10.0,
532
- top_k=10,
533
- top_p=0.3,
534
  )
535
  for buf in _chunker(gen, sampling_rate, MIN_BUFFER_S):
536
  if not first_chunk_seen:
537
  t_first = time.perf_counter()
538
  server_metrics["gen_init_to_first_chunk_s"] = (t_first - t_gen0)
539
  server_metrics["until_first_chunk_total_s"] = (t_first - t0)
540
- known = (
541
- server_metrics["latents_s"]
542
- + server_metrics["text_split_s"]
543
- + server_metrics["gen_init_to_first_chunk_s"]
544
- )
545
  other = server_metrics["until_first_chunk_total_s"] - known
546
  server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, other)
547
  first_chunk_seen = True
548
- seq += 1
549
- yield (json.dumps({"seq": seq, "b64": _pcm_f32_to_int16_b64(buf), "log": server_metrics, "stop": False}), None, None)
550
  else:
551
- seq += 1
552
- yield (json.dumps({"seq": seq, "b64": _pcm_f32_to_int16_b64(buf), "log": None, "stop": False}), None, None)
553
  full_audio_chunks.append(buf)
554
 
555
- final_file_path = None
556
- final_audio_path = None
557
- if full_audio_chunks:
558
- t_w0 = time.perf_counter()
559
- full_audio = _merge_for_file(full_audio_chunks)
560
- try:
561
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
562
- write(tmp.name, sampling_rate, full_audio.astype(np.float32))
563
- final_file_path = tmp.name
564
- final_audio_path = tmp.name
565
- except Exception as e:
566
- server_metrics["_file_error"] = str(e)
567
- finally:
568
- t_w1 = time.perf_counter()
569
- server_metrics["file_write_s"] = (t_w1 - t_w0)
570
-
571
- seq += 1
572
- yield (json.dumps({"seq": seq, "b64": "__STOP__", "log": server_metrics, "stop": True}), final_file_path, final_audio_path)
573
 
574
- # =========================================================
575
- # 9) UI + AudioWorklet з polling (без gr.Audio streaming)
576
- # =========================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
577
  examples = [
578
- [
579
- "Прывітанне! Гэта праверка жывога струменя беларускага TTS.",
580
- "Nestarka.wav",
581
- ],
582
  ]
583
 
584
  with gr.Blocks() as demo:
585
- gr.Markdown("## Belarusian TTS — нізкая латэнтнасць (AudioWorklet) + фінальны WAV (SSR OFF, polling)")
586
 
587
  with gr.Row():
588
  inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
589
  inp_voice = gr.Audio(type="filepath", label="Прыклад голасу (6–10 сек)", interactive=True)
590
 
591
  with gr.Row():
592
- run_btn = gr.Button("Згенераваць / Старт стриму")
593
- stop_btn = gr.Button("⏹ Спыніць прайграванне")
594
- play_btn = gr.Button("▶️ Працягнуць прайграванне")
595
- gr.Markdown(f"**Sample rate:** {sampling_rate} Hz | **Stream format:** INT16LE(base64)")
596
 
597
  log_panel = gr.HTML(
598
  value='<div id="wa-log" style="font-family:system-ui;font-size:12px;white-space:pre-line">[лог пусты]</div>',
599
  label="Лагі плэера",
600
  )
601
 
602
- # ВАЖНА: робім stream_pipe бачным у DOM (visible=True), але хаваем праз CSS
603
- stream_pipe = gr.Textbox(value="", visible=True, label="stream_pipe", elem_id="stream-pipe")
604
- final_file = gr.File(label="Згенераваны WAV (спампаваць)")
 
605
  final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
606
  play_final_btn = gr.Button("▶️ Play Final")
607
 
608
- # CSS — схаваць stream_pipe і актыўнасці
609
- gr.HTML("""
610
- <style>
611
- #stream-pipe { position:absolute; left:-99999px; width:1px; height:1px; opacity:0; pointer-events:none; }
612
- </style>
613
- """)
614
-
615
- # --------- Frontend JS (пастаянны polling + AudioWorklet) ----------
616
- FRONT_HTML = f"""
617
- <script>
618
- (function() {{
619
  const sampleRate = {sampling_rate};
620
- const POLL_MS = 30;
621
  const AC = window.AudioContext || window.webkitAudioContext;
 
 
 
 
622
 
623
  function toSec(ms) {{ return (ms/1000); }}
624
- function fmtS(x) {{ return (x==null) ? 'n/a' : (x.toFixed ? x.toFixed(3) : x) + ' s'; }}
625
 
626
- function updateLog() {{
627
  const el = document.getElementById('wa-log');
628
  if (!el || !window.__wa || !window.__wa.meta) return;
629
  const m = window.__wa.meta;
630
  const lines = [];
631
- lines.push('Клік (Старт): ' + (m.t_click_ms ? '0.000 s' : 'n/a'));
 
632
  let click_to_first_chunk_s = null;
633
  if (m.t_first_push_ms) {{
634
  click_to_first_chunk_s = toSec(m.t_first_push_ms - m.t_click_ms);
635
- lines.push('Першы чанк прыйшоў: ' + click_to_first_chunk_s.toFixed(3) + ' s');
636
  if (m.t_first_audio_ms) {{
637
- lines.push('Пачатак прайгравання: ' + (toSec(m.t_first_audio_ms - m.t_click_ms)).toFixed(3) + ' s');
638
- lines.push('Затрымка (чанк→аўдыя): ' + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + ' s');
639
  }}
640
  }}
 
641
  const s = (m.server || {{}});
642
- lines.push('');
643
- lines.push('— Серверныя метрыкі —');
644
- lines.push('Latents (умоўны голас): ' + fmtS(s.latents_s));
645
- lines.push('Падзел тэксту: ' + fmtS(s.text_split_s));
646
- lines.push('Ініт→1-ы чанк: ' + fmtS(s.gen_init_to_first_chunk_s));
647
- lines.push('Усё да 1-га чанка: ' + fmtS(s.until_first_chunk_total_s));
648
- lines.push('Іншая серверная апрац.: ' + fmtS(s.server_unaccounted_before_first_chunk_s));
649
- lines.push('Запіс WAV: ' + fmtS(s.file_write_s));
650
-
651
- if (m.t_first_push_ms && s.until_first_chunk_total_s != null) {{
652
  let est_queue_net = click_to_first_chunk_s - s.until_first_chunk_total_s;
653
  if (!isFinite(est_queue_net) || est_queue_net < 0) est_queue_net = 0;
654
- lines.push('');
655
- lines.push('Ацэнка чаргі ZeroGPU + сеткі: ' + est_queue_net.toFixed(3) + ' s');
656
  }} else {{
657
- lines.push('');
658
- lines.push('Ацэнка чаргі ZeroGPU + сеткі: n/a');
659
  }}
660
- lines.push('');
661
- lines.push('Статус стриму: ' + (window.__wa.playing ? 'playing' : 'stopped'));
662
- el.textContent = lines.join('\\n');
663
- }}
664
 
665
- async function ensureWorklet(ctx) {{
666
- const code = `
667
- class PushPlayerProcessor extends AudioWorkletProcessor {{
668
- constructor() {{
669
- super();
670
- this.queue = [];
671
- this.readIndex = 0;
672
- this.port.onmessage = (e) => {{
673
- const d = e.data || {{}};
674
- if (d.type === 'push' && d.buffer) {{
675
- const f32 = new Float32Array(d.buffer);
676
- this.queue.push(f32);
677
- }} else if (d.type === 'reset') {{
678
- this.queue.length = 0;
679
- this.readIndex = 0;
680
- }}
681
- }};
682
- }}
683
- process(inputs, outputs) {{
684
- const out = outputs[0][0];
685
- let i = 0;
686
- while (i < out.length) {{
687
- if (this.queue.length === 0) {{ out[i++] = 0.0; continue; }}
688
- const cur = this.queue[0];
689
- const remaining = cur.length - this.readIndex;
690
- const take = Math.min(remaining, out.length - i);
691
- out.set(cur.subarray(this.readIndex, this.readIndex + take), i);
692
- i += take; this.readIndex += take;
693
- if (this.readIndex >= cur.length) {{ this.queue.shift(); this.readIndex = 0; }}
694
- }}
695
- return true;
696
- }}
697
- }}
698
- registerProcessor('push-player', PushPlayerProcessor);
699
- `;
700
- const blob = new Blob([code], {{ type: 'application/javascript' }});
701
- const url = URL.createObjectURL(blob);
702
- await ctx.audioWorklet.addModule(url);
703
  }}
704
 
705
- async function ensurePlayer() {{
706
- if (window.__wa) return window.__wa;
707
- if (!AC) return null;
708
  const ctx = new AC({{ sampleRate }});
709
- try {{ await ctx.resume(); }} catch (e) {{}}
710
- await ensureWorklet(ctx);
711
- const node = new AudioWorkletNode(ctx, 'push-player');
712
- node.connect(ctx.destination);
713
-
714
- let playing = true;
715
 
716
  const meta = {{
717
- t_click_ms: null,
718
  t_first_push_ms: null,
719
  t_first_audio_ms: null,
720
- server: null
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721
  }};
 
722
 
723
- const api = {{
724
  ctx, node,
725
  get playing() {{ return playing; }},
726
- start: async () => {{ try {{ await ctx.resume(); }} catch(e) {{}} playing = True; updateLog(); }},
727
- stop: () => {{ try {{ ctx.suspend(); }} catch(e){{}} playing = false; updateLog(); }},
728
- reset: () => {{
729
- try {{ node.port.postMessage({{ type: 'reset' }}); }} catch(e) {{}}
730
- meta.t_first_push_ms = null;
731
- meta.t_first_audio_ms = null;
732
- updateLog();
733
- }},
734
- push: (f32) {{
735
- try {{ node.port.postMessage({{ type: 'push', buffer: f32.buffer }}, [f32.buffer]); }} catch (e) {{}}
736
  if (!meta.t_first_push_ms) {{
737
  meta.t_first_push_ms = performance.now();
738
- if (!meta.t_first_audio_ms) meta.t_first_audio_ms = meta.t_first_push_ms + 10;
739
- updateLog();
 
 
 
740
  }}
741
- if (!playing) api.start();
742
  }},
743
- meta
 
 
 
 
 
 
 
 
744
  }};
745
- window.__wa = api;
746
- updateLog();
747
- return api;
748
- }}
749
-
750
- function getPipeEl() {{
751
- // Textbox у Gradio мае textarea унутры div#stream-pipe
752
- const root = document.getElementById('stream-pipe');
753
- if (!root) return null;
754
- const ta = root.querySelector('textarea');
755
- if (ta) return ta;
756
- const inp = root.querySelector('input');
757
- if (inp) return inp;
758
- return root;
759
  }}
 
 
760
 
761
- function startPolling() {{
762
- const pipe = getPipeEl();
763
- if (!pipe) return;
764
- const apiPromise = ensurePlayer();
765
- let lastSeq = -1;
766
-
767
- setInterval(async () => {{
768
- const api = await apiPromise;
769
- if (!api) return;
770
-
771
- const txt = (pipe.value !== undefined) ? pipe.value : (pipe.innerText || pipe.textContent || '');
772
- if (!txt) return;
 
 
 
 
773
 
774
- let pkt = null;
775
- try {{ pkt = JSON.parse(txt); }} catch(e) {{ return; }}
776
- if (!pkt || typeof pkt.seq !== 'number') return;
777
- if (pkt.seq <= lastSeq) return;
778
- lastSeq = pkt.seq;
 
 
 
 
 
 
 
779
 
780
- if (pkt.log) {{
781
- api.meta.server = pkt.log;
782
- updateLog();
783
- }}
 
 
 
 
784
 
785
- if (pkt.b64 === '__STOP__') {{
786
- api.stop();
787
- updateLog();
788
- return;
789
- }}
790
 
791
- if (typeof pkt.b64 === 'string' && pkt.b64.length > 0) {{
792
- const bin = atob(pkt.b64);
793
- const len = bin.length;
794
- const buf = new ArrayBuffer(len);
795
- const view = new Uint8Array(buf);
796
- for (let i=0;i<len;i++) view[i] = bin.charCodeAt(i);
797
- const i16 = new Int16Array(buf);
798
- const f32 = new Float32Array(i16.length);
799
- for (let i=0;i<i16.length;i++) {{
800
- let s = i16[i];
801
- f32[i] = Math.max(-1, s / 32768);
802
- }}
803
- api.push(f32);
804
- }}
805
- }}, POLL_MS);
806
- }}
807
 
808
- window.__wa_start_click = async function() {{
809
- const api = await ensurePlayer();
810
- api.meta.t_click_ms = performance.now();
811
- updateLog();
812
- }};
813
- window.__wa_stop = async function() {{
814
- const api = await ensurePlayer();
815
- api.stop();
816
- }};
817
- window.__wa_play = async function() {{
818
- const api = await ensurePlayer();
819
- api.start();
820
- }};
821
- window.__wa_play_final = function() {{
822
- const host = document.getElementById('final-audio');
823
- if (!host) return;
824
- const audio = host.querySelector('audio');
825
- if (audio) {{ try {{ audio.play(); }} catch(e) {{}} }}
826
- }};
827
-
828
- // Стартуем polling пасля загрузкі
829
- startPolling();
830
- }})();
831
- </script>
832
- """
833
- gr.HTML(FRONT_HTML)
834
-
835
- # Падзеі
836
- run_btn.click(
837
- fn=lambda: None,
838
- inputs=[],
839
- outputs=[],
840
- js="window.__wa_start_click && window.__wa_start_click();"
841
- )
842
- run_btn.click(
843
- fn=text_to_speech,
844
- inputs=[inp_text, inp_voice],
845
- outputs=[stream_pipe, final_file, final_audio],
846
- )
847
 
848
- stop_btn.click(fn=None, inputs=[], outputs=[], js="window.__wa_stop && window.__wa_stop();")
849
- play_btn.click(fn=None, inputs=[], outputs=[], js="window.__wa_play && window.__wa_play();")
850
- play_final_btn.click(fn=None, inputs=[], outputs=[], js="window.__wa_play_final && window.__wa_play_final();")
851
 
852
  gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)
853
 
854
- # чарга + запуск (SSR OFF)
855
  if __name__ == "__main__":
856
- demo.queue(max_size=8).launch(ssr_mode=False)
 
23
  from huggingface_hub import hf_hub_download
24
  from scipy.io.wavfile import write
25
 
26
+ # ---------------------------------------------------------
27
+ # 1) coqui-ai-TTS fork
28
+ # ---------------------------------------------------------
29
  REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
30
  REPO_DIR = "coqui-ai-TTS"
31
 
 
40
  from TTS.tts.models.xtts import Xtts
41
  from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
42
 
43
+ # ---------------------------------------------------------
44
+ # 2) мадэльныя файлы
45
+ # ---------------------------------------------------------
46
  repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
47
  model_dir = "./model"
48
  os.makedirs(model_dir, exist_ok=True)
49
 
50
+ checkpoint_file = os.path.join(model_dir, "model.pth")
51
+ config_file = os.path.join(model_dir, "config.json")
52
+ vocab_file = os.path.join(model_dir, "vocab.json")
53
+ default_voice_file = os.path.join(model_dir, "voice.wav")
54
+
55
  for fname in ("model.pth", "config.json", "vocab.json", "voice.wav"):
56
  fpath = os.path.join(model_dir, fname)
57
  if not os.path.exists(fpath):
58
  hf_hub_download(repo_id, filename=fname, local_dir=model_dir)
59
 
60
+ # ---------------------------------------------------------
61
+ # 3) загрузка мадэлі
62
+ # ---------------------------------------------------------
63
+ config = XttsConfig()
64
+ config.load_json(config_file)
65
+ XTTS_MODEL = Xtts.init_from_config(config)
 
 
 
 
66
  XTTS_MODEL.load_checkpoint(
67
  config,
68
  checkpoint_path=checkpoint_file,
 
82
  XTTS_MODEL.to(device).eval()
83
  sampling_rate = int(XTTS_MODEL.config.audio["sample_rate"])
84
 
 
85
  tokenizer = VoiceBpeTokenizer(vocab_file=vocab_file)
86
  XTTS_MODEL.tokenizer = tokenizer
87
 
88
  # =========================================================
89
+ # 4) Streaming-канфіг
90
  # =========================================================
91
+ MIN_BUFFER_S = 0.03 # бяспечны выхадны буфер для плэера
92
+ RUNTIME_FIRST_CHUNK_S = 0.02 # унутраны чанк у генерацыі
93
  FADE_S = 0.004
94
  TOKENS_PER_STEP = 1
95
  ENABLE_TEXT_SPLITTING = True
96
+ FIRST_SEGMENT_LIMIT = 160 # стабільная прасадыя для 1-га сегмента
97
 
98
+ # -------------------- утыліты аўдыя ----------------------
 
 
99
  def _seconds_to_samples(sec: float, sr: int) -> int:
100
  return max(1, int(sec * sr))
101
 
 
115
  return x
116
 
117
  def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> np.ndarray:
118
+ if a.size == 0: return b.astype(np.float32, copy=False)
119
+ if b.size == 0: return a.astype(np.float32, copy=False)
120
+ a = a.astype(np.float32, copy=False); b = b.astype(np.float32, copy=False)
 
 
 
121
  fade_n = min(_seconds_to_samples(fade_s, sr), a.size, b.size)
122
+ if fade_n <= 1: return np.concatenate([a, b], axis=0)
 
123
  fade_out = np.linspace(1.0, 0.0, fade_n, endpoint=True, dtype=np.float32)
124
+ fade_in = 1.0 - fade_out
125
  head = a[:-fade_n]
126
  tail = (a[-fade_n:] * fade_out) + (b[:fade_n] * fade_in)
127
  rest = b[fade_n:]
128
  return np.concatenate([head, tail, rest], axis=0)
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  def _bpe_prefixes(text: str, lang: str, step_tokens: int):
131
  try:
132
  ids = tokenizer.encode(text, lang=lang)
 
149
  def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any, **gen_kwargs) -> Iterator[np.ndarray]:
150
  sig = inspect.signature(model.inference_stream)
151
  call_kwargs = dict(text=text, language=language, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding)
152
+ for k in ("temperature","length_penalty","repetition_penalty","top_k","top_p","stream_chunk_size_s"):
153
  if k in gen_kwargs and k in sig.parameters:
154
  call_kwargs[k] = gen_kwargs[k]
155
  autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
 
164
  autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
165
  with torch.inference_mode(), autocast_ctx:
166
  out = model.inference(
167
+ text=prefix, language=language,
168
+ gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
 
 
169
  temperature=gen_kwargs.get("temperature", 0.1),
170
+ length_penalty=1.0, repetition_penalty=10.0,
171
+ top_k=gen_kwargs.get("top_k", 10), top_p=gen_kwargs.get("top_p", 0.3),
 
 
172
  )
173
  wav = _to_np_audio(out)
174
+ new_part = wav[emitted:]; emitted = wav.size
175
+ if new_part.size: yield new_part
 
 
176
 
177
  class NewTTSGenerationMixin:
178
  @torch.inference_mode()
179
+ def generate(self: Xtts, text: Optional[str] = None, *, do_stream: bool = False, language: str = "be",
180
+ gpt_cond_latent: Any = None, speaker_embedding: Any = None,
181
+ min_buffer_s: float = MIN_BUFFER_S, tokens_per_step: int = TOKENS_PER_STEP, **gen_kwargs):
 
 
 
 
 
 
 
 
 
182
  assert isinstance(text, str) and text.strip(), "text is required"
183
  if not do_stream:
184
  autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
185
  with autocast_ctx:
186
  out = self.inference(
187
+ text=text, language=language,
188
+ gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
 
 
189
  temperature=gen_kwargs.get("temperature", 0.1),
190
+ length_penalty=1.0, repetition_penalty=10.0,
191
+ top_k=10, top_p=0.3,
 
 
192
  )
193
  return _to_np_audio(out)
194
  return self.sample_stream(
195
+ text=text, language=language, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
196
+ min_buffer_s=min_buffer_s, tokens_per_step=tokens_per_step, **gen_kwargs
 
 
 
 
 
197
  )
198
 
199
  @torch.inference_mode()
200
+ def sample_stream(self: Xtts, *, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any,
201
+ min_buffer_s: float = MIN_BUFFER_S, tokens_per_step: int = TOKENS_PER_STEP, **gen_kwargs) -> Iterator[np.ndarray]:
202
+ local_kwargs = dict(gen_kwargs); local_kwargs.setdefault("stream_chunk_size_s", float(min_buffer_s))
 
 
 
 
 
 
 
 
 
 
203
  if hasattr(self, "inference_stream"):
204
  for chunk in _native_stream(self, text, language, gpt_cond_latent, speaker_embedding, **local_kwargs):
205
  yield chunk
 
213
 
214
# NOTE(review): presumably installs the NewTTSGenerationMixin methods
# (generate/sample_stream) onto the XTTS model class — confirm in its
# definition earlier in this file.
init_stream_support()
215
 
216
# ---------------------------------------------------------
# 5) Persistent latents cache (CPU) + GPU cache
# ---------------------------------------------------------
# On-disk directory for per-voice conditioning latents, so repeated
# requests for the same reference voice skip recomputation.
PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
221
 
 
235
  base = f"{os.path.abspath(path)}:{os.path.getmtime(path)}:{os.path.getsize(path)}"
236
  else:
237
  base = "default_voice"
238
+ meta_str = json.dumps({
239
+ "model_id": meta.model_id,
240
+ "gpt_cond_len": meta.gpt_cond_len,
241
+ "max_ref_len": meta.max_ref_len,
242
+ "sound_norm_refs": meta.sound_norm_refs,
243
+ "xtts_git": meta.xtts_git,
244
+ }, sort_keys=True)
 
 
 
245
  return hashlib.md5((base + "|" + meta_str).encode("utf-8")).hexdigest()
246
 
247
  def _latents_disk_path(key: str) -> pathlib.Path:
 
252
 
253
def _load_latents_from_disk(key: str) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
    """Load cached (gpt_cond_latent, speaker_embedding) for *key*.

    Returns None on a cache miss. A corrupt or unreadable cache file is
    also treated as a miss (with a warning) instead of crashing the
    request — the latents are simply recomputed by the caller.
    """
    p = _latents_disk_path(key)
    if not p.exists():
        return None
    try:
        obj = torch.load(p, map_location="cpu")
        return obj["gpt_cond_latent"], obj["speaker_embedding"]
    except Exception as e:  # best-effort cache: never fatal
        print(f"[warn] failed to load cached latents {p}: {e}")
        return None
258
 
 
297
  return g2, s2
298
  return g, s
299
 
300
# Warm the latents cache for the default voice at import time (on CPU),
# so the first user request does not pay the conditioning cost.
try:
    _ = _latents_for(default_voice_file)
except Exception as e:
    # Best-effort: a failure here only means latents get computed lazily
    # on the first request instead.
    print(f"[warn] precompute default voice latents failed: {e}")
305
 
306
# ---------------------------------------------------------
# 6) Buffers + base64
# ---------------------------------------------------------
def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
    """Concatenate streamed audio chunks into one waveform.

    Adjacent chunks are joined with a short crossfade (FADE_S) to avoid
    audible clicks at the seams. Returns an empty float32 array for an
    empty input.
    """
    if not chunks:
        return np.zeros((0,), dtype=np.float32)
    merged = chunks[0]
    for nxt in chunks[1:]:
        merged = _crossfade_concat(merged, nxt, sampling_rate, FADE_S)
    return merged
315
+
316
def _chunker(chunks: Iterable[np.ndarray], sr: int, target_s: float) -> Iterable[np.ndarray]:
    """Regroup small audio pieces into buffers of at least *target_s* seconds.

    Pieces are crossfaded together (FADE_S) as they accumulate; once the
    buffer reaches the target length it is yielded and the accumulator is
    reset. Any non-empty remainder is flushed at the end.
    """
    need = _seconds_to_samples(target_s, sr)
    acc = np.zeros((0,), dtype=np.float32)
    for piece in chunks:
        piece = _to_np_audio(piece)
        if piece.size == 0:
            continue
        if acc.size == 0:
            acc = piece
        else:
            acc = _crossfade_concat(acc, piece, sr, FADE_S)
        if acc.size >= need:
            yield acc
            acc = np.zeros((0,), dtype=np.float32)
    if acc.size:
        yield acc
327
+
328
+ def _pcm_f32_to_b64(x: np.ndarray) -> str:
329
+ if x.dtype != np.float32: x = x.astype(np.float32, copy=False)
330
+ return base64.b64encode(x.tobytes()).decode("ascii")
331
+
332
+ # ---------------------------------------------------------
333
+ # 7) падзел тэксту: хуткі + fallback
334
+ # ---------------------------------------------------------
335
+ _SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
336
  _WS = re.compile(r"\s+")
337
 
338
  def _fast_split(text: str, limit: int) -> List[str]:
339
  text = text.strip()
340
+ if not text: return []
 
341
  parts = []
342
  start = 0
343
  for m in _SENT_END.finditer(text):
344
  end = m.end()
345
  parts.append(text[start:end].strip())
346
  start = end
347
+ if start < len(text): parts.append(text[start:].strip())
 
348
  chunks = []
349
  cur = ""
350
  for s in parts:
351
  if len(cur) + 1 + len(s) <= limit:
352
  cur = (cur + " " + s).strip() if cur else s
353
  else:
354
+ if cur: chunks.append(cur)
 
355
  if len(s) <= limit:
356
  cur = s
357
  else:
358
+ w = _WS.split(s); acc = ""
 
359
  for tok in w:
360
  if len(acc) + 1 + len(tok) <= limit:
361
  acc = (acc + " " + tok).strip() if acc else tok
362
  else:
363
+ if acc: chunks.append(acc)
 
364
  acc = tok
365
+ if acc: cur = acc
366
+ else: cur = ""
367
+ if cur: chunks.append(cur)
 
 
 
368
  return [c for c in chunks if c]
369
 
370
  def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[str]:
371
  text_in = text_in.strip()
372
+ if not text_in: return []
 
373
  parts: List[str] = []
374
  if len(text_in) > FIRST_SEGMENT_LIMIT:
375
  head = text_in[:FIRST_SEGMENT_LIMIT]
376
+ m = re.search(r".*[\.!\?…»)]", head)
377
  if m and len(m.group(0)) > 30:
378
  head = m.group(0)
379
  tail = text_in[len(head):].lstrip()
 
381
  text_for_rest = tail
382
  else:
383
  text_for_rest = text_in
384
+ if not text_for_rest: return parts or [text_in]
 
385
 
386
  rest = _fast_split(text_for_rest, chunk_limit)
387
  if not rest or sum(len(x) for x in rest) < int(0.6 * len(text_for_rest)):
388
  try:
389
  rest2 = split_sentence(text_for_rest, lang=lang_short, text_split_length=chunk_limit)
390
  rest2 = [s.strip() for s in rest2 if s and s.strip()]
391
+ if rest2: rest = rest2
 
392
  except Exception:
393
  pass
394
  return parts + (rest or [text_for_rest])
395
 
396
+ # ---------------------------------------------------------
397
+ # 8) TTS стрим + фінальны файл + лагі
398
+ # ---------------------------------------------------------
399
  @spaces.GPU(duration=60)
400
  def text_to_speech(belarusian_story, speaker_audio_file=None):
401
+ """
402
+ Выхады:
403
+ 1) stream_pipe — base64(PCM float32) чанкі, у фінале "__STOP__"
404
+ 2) final_file — шлях да WAV
405
+ 3) final_audio — шлях да WAV для прайгравання
406
+ 4) log_pipe — JSON з сервернымі метрыкамі (секунды)
407
+ """
408
  t0 = time.perf_counter()
409
 
410
  if not belarusian_story or str(belarusian_story).strip() == "":
 
411
  raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
412
 
413
  if not speaker_audio_file or (
414
+ not isinstance(speaker_audio_file, str)
415
+ and getattr(speaker_audio_file, "name", "") == ""
416
  ):
417
  speaker_audio_file = default_voice_file
418
 
 
420
  lang_short = "be"
421
  chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
422
 
423
+ # Latents (кэш CPU/GPU)
424
  t_lat0 = time.perf_counter()
425
+ to_dev = "cuda:0" if torch.cuda.is_available() else None
426
  gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
427
  t_lat1 = time.perf_counter()
428
 
429
  # Split
430
  t_split0 = time.perf_counter()
431
  texts = _split_text_smart(text_in, lang_short, chunk_limit) if ENABLE_TEXT_SPLITTING else [text_in]
432
+ if not texts: texts = [text_in]
 
433
  t_split1 = time.perf_counter()
434
 
435
  server_metrics = {
 
440
  "server_unaccounted_before_first_chunk_s": None,
441
  "file_write_s": None,
442
  }
443
+ yield ("", None, None, json.dumps(server_metrics))
 
 
444
 
445
  full_audio_chunks: List[np.ndarray] = []
446
  first_chunk_seen = False
 
448
 
449
  for part in texts:
450
  gen = XTTS_MODEL.generate(
451
+ text=part, do_stream=True, language=lang_short,
452
+ gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
 
 
 
453
  min_buffer_s=RUNTIME_FIRST_CHUNK_S,
454
  tokens_per_step=TOKENS_PER_STEP,
455
  stream_chunk_size_s=RUNTIME_FIRST_CHUNK_S,
456
+ temperature=0.1, length_penalty=1.0, repetition_penalty=10.0,
457
+ top_k=10, top_p=0.3,
 
 
 
458
  )
459
  for buf in _chunker(gen, sampling_rate, MIN_BUFFER_S):
460
  if not first_chunk_seen:
461
  t_first = time.perf_counter()
462
  server_metrics["gen_init_to_first_chunk_s"] = (t_first - t_gen0)
463
  server_metrics["until_first_chunk_total_s"] = (t_first - t0)
464
+ known = server_metrics["latents_s"] + server_metrics["text_split_s"] + server_metrics["gen_init_to_first_chunk_s"]
 
 
 
 
465
  other = server_metrics["until_first_chunk_total_s"] - known
466
  server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, other)
467
  first_chunk_seen = True
468
+ yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
 
469
  else:
470
+ yield (_pcm_f32_to_b64(buf), None, None, None)
 
471
  full_audio_chunks.append(buf)
472
 
473
+ if not full_audio_chunks:
474
+ yield ("__STOP__", None, None, json.dumps(server_metrics)); return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
475
 
476
+ t_w0 = time.perf_counter()
477
+ full_audio = _merge_for_file(full_audio_chunks)
478
+ tmp = None
479
+ try:
480
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
481
+ write(tmp.name, sampling_rate, full_audio.astype(np.float32))
482
+ except Exception as e:
483
+ raise gr.Error(f"Памылка пры запісе фінальнага WAV: {e}")
484
+ finally:
485
+ t_w1 = time.perf_counter()
486
+ server_metrics["file_write_s"] = (t_w1 - t_w0)
487
+
488
+ yield ("__STOP__", tmp.name, tmp.name, json.dumps(server_metrics))
489
+
490
# ---------------------------------------------------------
# 9) UI (logs in seconds + Play Final; no underruns)
# ---------------------------------------------------------
# Gradio example rows: (text, reference-voice wav path) pairs.
examples = [
    ["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", "Nestarka.wav"],
]
496
 
497
with gr.Blocks() as demo:
    gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")

    # Inputs: text to synthesize + optional reference-voice recording.
    with gr.Row():
        inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
        inp_voice = gr.Audio(type="filepath", label="Прыклад голасу (6–10 сек)", interactive=True)

    with gr.Row():
        play_btn = gr.Button("▶️ Play (stream)")
        stop_btn = gr.Button("⏹ Stop (stream)")
        run_btn = gr.Button("Згенераваць")
        gr.Markdown(f"**Sample rate:** {sampling_rate} Hz")

    # Client-side log panel; its text content is rewritten by the JS
    # logger defined below (element id "wa-log").
    log_panel = gr.HTML(
        value='<div id="wa-log" style="font-family:system-ui;font-size:12px;white-space:pre-line">[лог пусты]</div>',
        label="Лагі плэера",
    )

    # Hidden transport widgets: the server generator streams base64 PCM
    # into stream_pipe and JSON metrics into log_pipe; their `change`
    # events run JS handlers in the browser.
    stream_pipe = gr.Textbox(value="", visible=False, label="stream_pipe")
    log_pipe = gr.Textbox(value="", visible=False, label="log_pipe")

    final_file = gr.File(label="Згенераваны WAV (спампаваць)")
    final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
    play_final_btn = gr.Button("▶️ Play Final")

    # Browser-side streaming player kept on window.__wa: a Web Audio
    # ScriptProcessorNode drains a queue of Float32Array chunks; playback
    # starts only after PRIME_CHUNKS chunks are queued, to avoid underruns.
    # NOTE(review): ScriptProcessorNode is deprecated in favor of
    # AudioWorklet, though still supported by current browsers.
    INIT_RESET_AND_PLAY_JS = f"""
    () => {{
      const sampleRate = {sampling_rate};
      const AC = window.AudioContext || window.webkitAudioContext;
      if (!AC) return;

      const PRIME_CHUNKS = 2; // мін. к-ць чанкаў перад стартаваннем гуку
      let primeCounter = 0;

      function toSec(ms) {{ return (ms/1000); }}
      function fmtS(x) {{ return (x===null||x===undefined) ? "n/a" : x.toFixed(3) + " s"; }}

      function logUpdate() {{
        const el = document.getElementById('wa-log');
        if (!el || !window.__wa || !window.__wa.meta) return;
        const m = window.__wa.meta;
        const lines = [];
        lines.push("Клік (Згенераваць): 0.000 s");

        let click_to_first_chunk_s = null;
        if (m.t_first_push_ms) {{
          click_to_first_chunk_s = toSec(m.t_first_push_ms - m.t_click_ms);
          lines.push("Першы чанк прыйшоў: " + click_to_first_chunk_s.toFixed(3) + " s");
          if (m.t_first_audio_ms) {{
            lines.push("Пачатак прайгравання: " + (toSec(m.t_first_audio_ms - m.t_click_ms)).toFixed(3) + " s");
            lines.push("Затрымка (чанк→аўдыя): " + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + " s");
          }}
        }}

        const s = (m.server || {{}});
        lines.push("");
        lines.push("— Серверныя метрыкі —");
        lines.push("Latents (умоўны голас): " + fmtS(s.latents_s));
        lines.push("Падзел тэксту: " + fmtS(s.text_split_s));
        lines.push("Ініт→1-ы чанк: " + fmtS(s.gen_init_to_first_chunk_s));
        lines.push("Усё да 1-га чанка: " + fmtS(s.until_first_chunk_total_s));
        lines.push("Іншая серверная апрац.: " + fmtS(s.server_unaccounted_before_first_chunk_s));
        lines.push("Запіс WAV: " + fmtS(s.file_write_s));

        if (click_to_first_chunk_s !== null && s.until_first_chunk_total_s !== null) {{
          let est_queue_net = click_to_first_chunk_s - s.until_first_chunk_total_s;
          if (!isFinite(est_queue_net) || est_queue_net < 0) est_queue_net = 0;
          lines.push("");
          lines.push("Ацэнка чаргі ZeroGPU + сеткі: " + est_queue_net.toFixed(3) + " s");
        }} else {{
          lines.push("");
          lines.push("Ацэнка чаргі ZeroGPU + сеткі: n/a");
        }}

        lines.push("");
        lines.push("Статус стриму: " + (window.__wa.playing ? "playing" : "stopped"));
        el.textContent = lines.join("\\n");
        try {{ console.log(lines.join("\\n")); }} catch (e) {{}}
      }}

      if (!window.__wa) {{
        const ctx = new AC({{ sampleRate }});
        const bufferSize = 2048; // большы буфер = менш underrun’аў
        const node = ctx.createScriptProcessor(bufferSize, 0, 1);
        let queue = [];
        let playing = false;
        let eos = false;

        const meta = {{
          t_click_ms: performance.now(),
          t_first_push_ms: null,
          t_first_audio_ms: null,
          server: null,
        }};

        node.onaudioprocess = (e) => {{
          const out = e.outputBuffer.getChannelData(0);
          let i = 0;
          while (i < out.length) {{
            if (queue.length === 0 || !playing) {{ out[i++] = 0.0; continue; }}
            let cur = queue[0];
            const take = Math.min(cur.length, out.length - i);
            if (meta.t_first_audio_ms === null) {{
              meta.t_first_audio_ms = performance.now();
              logUpdate();
            }}
            out.set(cur.subarray(0, take), i);
            i += take;
            if (take === cur.length) queue.shift();
            else queue[0] = cur.subarray(take);
          }}
          if (eos && queue.length === 0 && playing) {{
            playing = false;
            logUpdate();
          }}
        }};
        node.connect(ctx.destination);

        window.__wa = {{
          ctx, node,
          get playing() {{ return playing; }},
          get eos() {{ return eos; }},
          set eos(v) {{ eos = v; }},
          meta,
          push: (f32) => {{
            queue.push(f32);
            if (!meta.t_first_push_ms) {{
              meta.t_first_push_ms = performance.now();
              logUpdate();
            }}
            if (!playing && queue.length >= PRIME_CHUNKS) {{
              // стартуем толькі калі ёсць мінімум 2 чанкі ў чарзе
              window.__wa.start();
            }}
          }},
          start: async () => {{ try {{ await ctx.resume(); }} catch(e){{}} playing = true; logUpdate(); }},
          stop: () => {{ playing = false; logUpdate(); }},
          reset: () => {{
            playing = false; eos = false; queue = [];
            primeCounter = 0;
            meta.t_first_push_ms = null; meta.t_first_audio_ms = null;
            logUpdate();
          }},
          updateLog: logUpdate,
        }};
      }} else {{
        window.__wa.reset();
        window.__wa.meta.t_click_ms = performance.now();
      }}
    }}
    """

    # Simple start/stop controls for the streaming player.
    STOP_JS = "() => { if (window.__wa) window.__wa.stop(); }"
    PLAY_JS = "() => { if (window.__wa) window.__wa.start(); }"

    # Decode a base64 chunk from stream_pipe into Float32Array and queue
    # it; the "__STOP__" sentinel marks end-of-stream.
    PUSH_JS = """
    (b64) => {
      if (!window.__wa || !b64) return;
      if (b64 === "__STOP__") { window.__wa.eos = true; window.__wa.updateLog && window.__wa.updateLog(); return; }
      const bin = atob(b64);
      const len = bin.length;
      const buf = new ArrayBuffer(len);
      const view = new Uint8Array(buf);
      for (let i=0;i<len;i++) view[i] = bin.charCodeAt(i);
      const f32 = new Float32Array(buf);
      window.__wa.push(f32);
    }
    """

    # Attach server-side metrics (JSON from log_pipe) to the player's
    # meta and refresh the on-page log.
    LOG_JS = """
    (js) => {
      if (!window.__wa) return;
      try {
        if (js) {
          const obj = JSON.parse(js);
          window.__wa.meta.server = obj;
          window.__wa.updateLog && window.__wa.updateLog();
        }
      } catch (e) {}
    }
    """

    # Play the final rendered WAV via the hidden <audio> element inside
    # the "final-audio" Gradio component.
    PLAY_FINAL_JS = """
    () => {
      const host = document.getElementById('final-audio');
      if (!host) return;
      const audio = host.querySelector('audio');
      if (audio) { try { audio.play(); } catch(e) {} }
    }
    """

    # Wire UI events. Note run_btn has two handlers: a pure-JS one that
    # resets/initializes the browser player, and the server generator.
    play_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_JS)
    stop_btn.click(fn=None, inputs=[], outputs=[], js=STOP_JS)

    run_btn.click(fn=None, inputs=[], outputs=[], js=INIT_RESET_AND_PLAY_JS)
    run_btn.click(fn=text_to_speech, inputs=[inp_text, inp_voice], outputs=[stream_pipe, final_file, final_audio, log_pipe])

    stream_pipe.change(fn=None, inputs=[stream_pipe], outputs=[], js=PUSH_JS)
    log_pipe.change(fn=None, inputs=[log_pipe], outputs=[], js=LOG_JS)

    play_final_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_FINAL_JS)

    gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)
700
 
 
701
if __name__ == "__main__":
    # Start the Gradio app.
    demo.launch()