archivartaunik committed on
Commit
b547c0a
·
verified ·
1 Parent(s): 2bd6cf6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -72
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import os
2
- # Абмежуем «шум» патокаў, каб пазбегнуць thrashing
3
  os.environ.setdefault("OMP_NUM_THREADS", "1")
4
  os.environ.setdefault("MKL_NUM_THREADS", "1")
5
  os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
@@ -25,7 +24,7 @@ from huggingface_hub import hf_hub_download
25
  from scipy.io.wavfile import write
26
 
27
  # ---------------------------------------------------------
28
- # 1) Клануем і падключаем coqui-ai-TTS (fork з падтрымкай BE)
29
  # ---------------------------------------------------------
30
  REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
31
  REPO_DIR = "coqui-ai-TTS"
@@ -42,7 +41,7 @@ from TTS.tts.models.xtts import Xtts
42
  from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
43
 
44
  # ---------------------------------------------------------
45
- # 2) Файлы мадэлі
46
  # ---------------------------------------------------------
47
  repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
48
  model_dir = "./model"
@@ -59,7 +58,7 @@ for fname in ("model.pth", "config.json", "vocab.json", "voice.wav"):
59
  hf_hub_download(repo_id, filename=fname, local_dir=model_dir)
60
 
61
  # ---------------------------------------------------------
62
- # 3) Загрузка мадэлі і токенайзера
63
  # ---------------------------------------------------------
64
  config = XttsConfig()
65
  config.load_json(config_file)
@@ -73,7 +72,6 @@ XTTS_MODEL.load_checkpoint(
73
 
74
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
75
 
76
- # GPU/CPU налады
77
  torch.set_num_threads(1)
78
  if device.startswith("cuda"):
79
  torch.backends.cuda.matmul.allow_tf32 = True
@@ -90,12 +88,12 @@ XTTS_MODEL.tokenizer = tokenizer
90
  # =========================================================
91
  # 4) Streaming-канфіг
92
  # =========================================================
93
- MIN_BUFFER_S = 0.02 # агульны мікс-буфер (стабільны для плэера)
94
- RUNTIME_FIRST_CHUNK_S = 0.015 # канкрэтна для 1-га чанка ў генерацыі (хутчэй)
95
  FADE_S = 0.004
96
  TOKENS_PER_STEP = 1
97
  ENABLE_TEXT_SPLITTING = True
98
- FIRST_SEGMENT_LIMIT = 120 # кароткі першы сегмент (хутчэйшы першы чанк)
99
 
100
  # -------------------- утыліты аўдыя ----------------------
101
  def _seconds_to_samples(sec: float, sr: int) -> int:
@@ -216,7 +214,7 @@ def init_stream_support():
216
  init_stream_support()
217
 
218
  # ---------------------------------------------------------
219
- # 5) Пастаянны кэш латэнтаў (CPU) + GPU-кэш (без дадатковых запытаў)
220
  # ---------------------------------------------------------
221
  PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
222
  PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
@@ -229,8 +227,8 @@ class LatentsMeta:
229
  sound_norm_refs: bool
230
  xtts_git: str | None = None
231
 
232
- LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} # RAM CPU
233
- GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {} # RAM GPU
234
 
235
  def _latents_key(path: str | None, meta: LatentsMeta) -> str:
236
  if path and os.path.exists(path):
@@ -250,18 +248,11 @@ def _latents_disk_path(key: str) -> pathlib.Path:
250
  return PERSIST_LATENTS_DIR / f"{key}.pt"
251
 
252
  def _save_latents_to_disk(key: str, gpt_cond_latent: torch.Tensor, speaker_embedding: torch.Tensor):
253
- torch.save(
254
- {
255
- "gpt_cond_latent": gpt_cond_latent.cpu(),
256
- "speaker_embedding": speaker_embedding.cpu(),
257
- },
258
- _latents_disk_path(key),
259
- )
260
 
261
  def _load_latents_from_disk(key: str) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
262
  p = _latents_disk_path(key)
263
- if not p.exists():
264
- return None
265
  obj = torch.load(p, map_location="cpu")
266
  return obj["gpt_cond_latent"], obj["speaker_embedding"]
267
 
@@ -276,7 +267,6 @@ def _compute_latents_cpu(path: str | None) -> Tuple[torch.Tensor, torch.Tensor]:
276
  return g.cpu(), s.cpu()
277
 
278
  def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[torch.Tensor, torch.Tensor]:
279
- """Вяртае латэнты з RAM/дыска; калі няма — палічыць на CPU і захаваць. Пры патрэбе — кэшуе і на GPU."""
280
  meta = LatentsMeta(
281
  model_id=repo_id,
282
  gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
@@ -286,21 +276,17 @@ def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[
286
  )
287
  key = _latents_key(path, meta)
288
 
289
- # 1) CPU RAM
290
  if key in LATENT_CACHE:
291
  g, s = LATENT_CACHE[key]
292
  else:
293
- # 2) дыск
294
  loaded = _load_latents_from_disk(key)
295
  if loaded is None:
296
- # 3) палічыць на CPU і захаваць
297
  g, s = _compute_latents_cpu(path)
298
  _save_latents_to_disk(key, g, s)
299
  else:
300
  g, s = loaded
301
  LATENT_CACHE[key] = (g, s)
302
 
303
- # 4) GPU-кэш (калі патрэбны)
304
  if to_device and to_device.startswith("cuda"):
305
  dev_key = (key, to_device)
306
  if dev_key in GPU_LATENT_CACHE:
@@ -309,19 +295,16 @@ def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[
309
  s2 = s.to(to_device, non_blocking=True)
310
  GPU_LATENT_CACHE[dev_key] = (g2, s2)
311
  return g2, s2
312
-
313
  return g, s
314
 
315
- # Аўтаматычна падлічым латэнты на CPU для default voice (адзін раз, без GPU-запытаў)
316
- DEFAULT_VOICE_PRECOMPUTE = True
317
- if DEFAULT_VOICE_PRECOMPUTE:
318
- try:
319
- _ = _latents_for(default_voice_file) # CPU-разлік і захаванне, калі няма
320
- except Exception as e:
321
- print(f"[warn] precompute default voice latents failed: {e}")
322
 
323
  # ---------------------------------------------------------
324
- # 6) Хэлперы: буферы + base64
325
  # ---------------------------------------------------------
326
  def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
327
  if not chunks: return np.zeros((0,), dtype=np.float32)
@@ -347,7 +330,7 @@ def _pcm_f32_to_b64(x: np.ndarray) -> str:
347
  return base64.b64encode(x.tobytes()).decode("ascii")
348
 
349
  # ---------------------------------------------------------
350
- # 7) Хуткі падзел тэксту: lightweight + fallback
351
  # ---------------------------------------------------------
352
  _SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
353
  _WS = re.compile(r"\s+")
@@ -361,8 +344,7 @@ def _fast_split(text: str, limit: int) -> List[str]:
361
  end = m.end()
362
  parts.append(text[start:end].strip())
363
  start = end
364
- if start < len(text):
365
- parts.append(text[start:].strip())
366
  chunks = []
367
  cur = ""
368
  for s in parts:
@@ -373,8 +355,7 @@ def _fast_split(text: str, limit: int) -> List[str]:
373
  if len(s) <= limit:
374
  cur = s
375
  else:
376
- w = _WS.split(s)
377
- acc = ""
378
  for tok in w:
379
  if len(acc) + 1 + len(tok) <= limit:
380
  acc = (acc + " " + tok).strip() if acc else tok
@@ -400,9 +381,7 @@ def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[s
400
  text_for_rest = tail
401
  else:
402
  text_for_rest = text_in
403
-
404
- if not text_for_rest:
405
- return parts or [text_in]
406
 
407
  rest = _fast_split(text_for_rest, chunk_limit)
408
  if not rest or sum(len(x) for x in rest) < int(0.6 * len(text_for_rest)):
@@ -412,11 +391,10 @@ def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[s
412
  if rest2: rest = rest2
413
  except Exception:
414
  pass
415
-
416
  return parts + (rest or [text_for_rest])
417
 
418
  # ---------------------------------------------------------
419
- # 8) Асноўная функцыя TTS — стрим + фінальны файл + лагі
420
  # ---------------------------------------------------------
421
  @spaces.GPU(duration=60)
422
  def text_to_speech(belarusian_story, speaker_audio_file=None):
@@ -432,7 +410,6 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
432
  if not belarusian_story or str(belarusian_story).strip() == "":
433
  raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
434
 
435
- # Голас па змаўчанні
436
  if not speaker_audio_file or (
437
  not isinstance(speaker_audio_file, str)
438
  and getattr(speaker_audio_file, "name", "") == ""
@@ -443,13 +420,13 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
443
  lang_short = "be"
444
  chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
445
 
446
- # --- Latents з CPU-/дыск-кэшу; GPU-кэш (без extra запытаў) ---
447
  t_lat0 = time.perf_counter()
448
  to_dev = "cuda:0" if torch.cuda.is_available() else None
449
  gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
450
  t_lat1 = time.perf_counter()
451
 
452
- # --- Split (хуткі) ---
453
  t_split0 = time.perf_counter()
454
  texts = _split_text_smart(text_in, lang_short, chunk_limit) if ENABLE_TEXT_SPLITTING else [text_in]
455
  if not texts: texts = [text_in]
@@ -463,10 +440,8 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
463
  "server_unaccounted_before_first_chunk_s": None,
464
  "file_write_s": None,
465
  }
466
- # пачатковыя метрыкі
467
  yield ("", None, None, json.dumps(server_metrics))
468
 
469
- # --- Генерацыя і стрим ---
470
  full_audio_chunks: List[np.ndarray] = []
471
  first_chunk_seen = False
472
  t_gen0 = time.perf_counter()
@@ -475,13 +450,12 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
475
  gen = XTTS_MODEL.generate(
476
  text=part, do_stream=True, language=lang_short,
477
  gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
478
- min_buffer_s=RUNTIME_FIRST_CHUNK_S, # 1-ы чанк хутчэй
479
  tokens_per_step=TOKENS_PER_STEP,
480
- stream_chunk_size_s=RUNTIME_FIRST_CHUNK_S, # важна для native stream
481
  temperature=0.1, length_penalty=1.0, repetition_penalty=10.0,
482
  top_k=10, top_p=0.3,
483
  )
484
- # На выхад у плэер — больш стабільны буфер MIN_BUFFER_S
485
  for buf in _chunker(gen, sampling_rate, MIN_BUFFER_S):
486
  if not first_chunk_seen:
487
  t_first = time.perf_counter()
@@ -496,7 +470,6 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
496
  yield (_pcm_f32_to_b64(buf), None, None, None)
497
  full_audio_chunks.append(buf)
498
 
499
- # --- Фінал: WAV ---
500
  if not full_audio_chunks:
501
  yield ("__STOP__", None, None, json.dumps(server_metrics)); return
502
 
@@ -515,14 +488,14 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
515
  yield ("__STOP__", tmp.name, tmp.name, json.dumps(server_metrics))
516
 
517
  # ---------------------------------------------------------
518
- # 9) UI: лагі ў секундах + Play Final
519
  # ---------------------------------------------------------
520
  examples = [
521
  ["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", "Nestarka.wav"],
522
  ]
523
 
524
  with gr.Blocks() as demo:
525
- gr.Markdown("## Belarusian TTS — Streaming (GPU-кэш latents, хуткі 1-ы чанк) + фінальны файл")
526
 
527
  with gr.Row():
528
  inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
@@ -552,6 +525,9 @@ with gr.Blocks() as demo:
552
  const AC = window.AudioContext || window.webkitAudioContext;
553
  if (!AC) return;
554
 
 
 
 
555
  function toSec(ms) {{ return (ms/1000); }}
556
  function fmtS(x) {{ return (x===null||x===undefined) ? "n/a" : x.toFixed(3) + " s"; }}
557
 
@@ -600,7 +576,7 @@ with gr.Blocks() as demo:
600
 
601
  if (!window.__wa) {{
602
  const ctx = new AC({{ sampleRate }});
603
- const bufferSize = 1024;
604
  const node = ctx.createScriptProcessor(bufferSize, 0, 1);
605
  let queue = [];
606
  let playing = false;
@@ -642,18 +618,31 @@ with gr.Blocks() as demo:
642
  get eos() {{ return eos; }},
643
  set eos(v) {{ eos = v; }},
644
  meta,
645
- push: (f32) => {{ queue.push(f32); }},
 
 
 
 
 
 
 
 
 
 
646
  start: async () => {{ try {{ await ctx.resume(); }} catch(e){{}} playing = true; logUpdate(); }},
647
  stop: () => {{ playing = false; logUpdate(); }},
648
- reset: () => {{ playing = false; eos = false; queue = []; meta.t_first_push_ms = null; meta.t_first_audio_ms = null; logUpdate(); }},
 
 
 
 
 
649
  updateLog: logUpdate,
650
  }};
651
  }} else {{
652
  window.__wa.reset();
653
  window.__wa.meta.t_click_ms = performance.now();
654
  }}
655
-
656
- window.__wa.start();
657
  }}
658
  """
659
 
@@ -663,16 +652,7 @@ with gr.Blocks() as demo:
663
  PUSH_JS = """
664
  (b64) => {
665
  if (!window.__wa || !b64) return;
666
- const meta = window.__wa.meta || {};
667
- if (b64 === "__STOP__") {
668
- window.__wa.eos = true;
669
- window.__wa.updateLog && window.__wa.updateLog();
670
- return;
671
- }
672
- if (!meta.t_first_push_ms) {
673
- meta.t_first_push_ms = performance.now();
674
- window.__wa.updateLog && window.__wa.updateLog();
675
- }
676
  const bin = atob(b64);
677
  const len = bin.length;
678
  const buf = new ArrayBuffer(len);
@@ -689,7 +669,7 @@ with gr.Blocks() as demo:
689
  try {
690
  if (js) {
691
  const obj = JSON.parse(js);
692
- window.__wa.meta.server = obj; // значэнні ў секундах
693
  window.__wa.updateLog && window.__wa.updateLog();
694
  }
695
  } catch (e) {}
@@ -705,7 +685,6 @@ with gr.Blocks() as demo:
705
  }
706
  """
707
 
708
- # кнопкі
709
  play_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_JS)
710
  stop_btn.click(fn=None, inputs=[], outputs=[], js=STOP_JS)
711
 
 
1
  import os
 
2
  os.environ.setdefault("OMP_NUM_THREADS", "1")
3
  os.environ.setdefault("MKL_NUM_THREADS", "1")
4
  os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
 
24
  from scipy.io.wavfile import write
25
 
26
  # ---------------------------------------------------------
27
+ # 1) coqui-ai-TTS fork
28
  # ---------------------------------------------------------
29
  REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
30
  REPO_DIR = "coqui-ai-TTS"
 
41
  from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
42
 
43
  # ---------------------------------------------------------
44
+ # 2) мадэльныя файлы
45
  # ---------------------------------------------------------
46
  repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
47
  model_dir = "./model"
 
58
  hf_hub_download(repo_id, filename=fname, local_dir=model_dir)
59
 
60
  # ---------------------------------------------------------
61
+ # 3) загрузка мадэлі
62
  # ---------------------------------------------------------
63
  config = XttsConfig()
64
  config.load_json(config_file)
 
72
 
73
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
74
 
 
75
  torch.set_num_threads(1)
76
  if device.startswith("cuda"):
77
  torch.backends.cuda.matmul.allow_tf32 = True
 
88
  # =========================================================
89
  # 4) Streaming-канфіг
90
  # =========================================================
91
+ MIN_BUFFER_S = 0.03 # бяспечны выхадны буфер для плэера
92
+ RUNTIME_FIRST_CHUNK_S = 0.02 # унутраны чанк у генерацыі
93
  FADE_S = 0.004
94
  TOKENS_PER_STEP = 1
95
  ENABLE_TEXT_SPLITTING = True
96
+ FIRST_SEGMENT_LIMIT = 160 # стабільная прасодыя для 1-га сегмента
97
 
98
  # -------------------- утыліты аўдыя ----------------------
99
  def _seconds_to_samples(sec: float, sr: int) -> int:
 
214
  init_stream_support()
215
 
216
  # ---------------------------------------------------------
217
+ # 5) пастаянны кэш латэнтаў (CPU) + GPU-кэш
218
  # ---------------------------------------------------------
219
  PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
220
  PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
 
227
  sound_norm_refs: bool
228
  xtts_git: str | None = None
229
 
230
+ LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
231
+ GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}
232
 
233
  def _latents_key(path: str | None, meta: LatentsMeta) -> str:
234
  if path and os.path.exists(path):
 
248
  return PERSIST_LATENTS_DIR / f"{key}.pt"
249
 
250
  def _save_latents_to_disk(key: str, gpt_cond_latent: torch.Tensor, speaker_embedding: torch.Tensor):
251
+ torch.save({"gpt_cond_latent": gpt_cond_latent.cpu(), "speaker_embedding": speaker_embedding.cpu()}, _latents_disk_path(key))
 
 
 
 
 
 
252
 
253
  def _load_latents_from_disk(key: str) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
254
  p = _latents_disk_path(key)
255
+ if not p.exists(): return None
 
256
  obj = torch.load(p, map_location="cpu")
257
  return obj["gpt_cond_latent"], obj["speaker_embedding"]
258
 
 
267
  return g.cpu(), s.cpu()
268
 
269
  def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[torch.Tensor, torch.Tensor]:
 
270
  meta = LatentsMeta(
271
  model_id=repo_id,
272
  gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
 
276
  )
277
  key = _latents_key(path, meta)
278
 
 
279
  if key in LATENT_CACHE:
280
  g, s = LATENT_CACHE[key]
281
  else:
 
282
  loaded = _load_latents_from_disk(key)
283
  if loaded is None:
 
284
  g, s = _compute_latents_cpu(path)
285
  _save_latents_to_disk(key, g, s)
286
  else:
287
  g, s = loaded
288
  LATENT_CACHE[key] = (g, s)
289
 
 
290
  if to_device and to_device.startswith("cuda"):
291
  dev_key = (key, to_device)
292
  if dev_key in GPU_LATENT_CACHE:
 
295
  s2 = s.to(to_device, non_blocking=True)
296
  GPU_LATENT_CACHE[dev_key] = (g2, s2)
297
  return g2, s2
 
298
  return g, s
299
 
300
+ # аўтападлік для default voice (CPU) без дадатковых запытаў
301
+ try:
302
+ _ = _latents_for(default_voice_file)
303
+ except Exception as e:
304
+ print(f"[warn] precompute default voice latents failed: {e}")
 
 
305
 
306
  # ---------------------------------------------------------
307
+ # 6) буферы + base64
308
  # ---------------------------------------------------------
309
  def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
310
  if not chunks: return np.zeros((0,), dtype=np.float32)
 
330
  return base64.b64encode(x.tobytes()).decode("ascii")
331
 
332
  # ---------------------------------------------------------
333
+ # 7) падзел тэксту: хуткі + fallback
334
  # ---------------------------------------------------------
335
  _SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
336
  _WS = re.compile(r"\s+")
 
344
  end = m.end()
345
  parts.append(text[start:end].strip())
346
  start = end
347
+ if start < len(text): parts.append(text[start:].strip())
 
348
  chunks = []
349
  cur = ""
350
  for s in parts:
 
355
  if len(s) <= limit:
356
  cur = s
357
  else:
358
+ w = _WS.split(s); acc = ""
 
359
  for tok in w:
360
  if len(acc) + 1 + len(tok) <= limit:
361
  acc = (acc + " " + tok).strip() if acc else tok
 
381
  text_for_rest = tail
382
  else:
383
  text_for_rest = text_in
384
+ if not text_for_rest: return parts or [text_in]
 
 
385
 
386
  rest = _fast_split(text_for_rest, chunk_limit)
387
  if not rest or sum(len(x) for x in rest) < int(0.6 * len(text_for_rest)):
 
391
  if rest2: rest = rest2
392
  except Exception:
393
  pass
 
394
  return parts + (rest or [text_for_rest])
395
 
396
  # ---------------------------------------------------------
397
+ # 8) TTS — стрим + фінальны файл + лагі
398
  # ---------------------------------------------------------
399
  @spaces.GPU(duration=60)
400
  def text_to_speech(belarusian_story, speaker_audio_file=None):
 
410
  if not belarusian_story or str(belarusian_story).strip() == "":
411
  raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
412
 
 
413
  if not speaker_audio_file or (
414
  not isinstance(speaker_audio_file, str)
415
  and getattr(speaker_audio_file, "name", "") == ""
 
420
  lang_short = "be"
421
  chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
422
 
423
+ # Latents (кэш CPU/GPU)
424
  t_lat0 = time.perf_counter()
425
  to_dev = "cuda:0" if torch.cuda.is_available() else None
426
  gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
427
  t_lat1 = time.perf_counter()
428
 
429
+ # Split
430
  t_split0 = time.perf_counter()
431
  texts = _split_text_smart(text_in, lang_short, chunk_limit) if ENABLE_TEXT_SPLITTING else [text_in]
432
  if not texts: texts = [text_in]
 
440
  "server_unaccounted_before_first_chunk_s": None,
441
  "file_write_s": None,
442
  }
 
443
  yield ("", None, None, json.dumps(server_metrics))
444
 
 
445
  full_audio_chunks: List[np.ndarray] = []
446
  first_chunk_seen = False
447
  t_gen0 = time.perf_counter()
 
450
  gen = XTTS_MODEL.generate(
451
  text=part, do_stream=True, language=lang_short,
452
  gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
453
+ min_buffer_s=RUNTIME_FIRST_CHUNK_S,
454
  tokens_per_step=TOKENS_PER_STEP,
455
+ stream_chunk_size_s=RUNTIME_FIRST_CHUNK_S,
456
  temperature=0.1, length_penalty=1.0, repetition_penalty=10.0,
457
  top_k=10, top_p=0.3,
458
  )
 
459
  for buf in _chunker(gen, sampling_rate, MIN_BUFFER_S):
460
  if not first_chunk_seen:
461
  t_first = time.perf_counter()
 
470
  yield (_pcm_f32_to_b64(buf), None, None, None)
471
  full_audio_chunks.append(buf)
472
 
 
473
  if not full_audio_chunks:
474
  yield ("__STOP__", None, None, json.dumps(server_metrics)); return
475
 
 
488
  yield ("__STOP__", tmp.name, tmp.name, json.dumps(server_metrics))
489
 
490
  # ---------------------------------------------------------
491
+ # 9) UI (лагі ў секундах + Play Final; без underrun’аў)
492
  # ---------------------------------------------------------
493
  examples = [
494
  ["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", "Nestarka.wav"],
495
  ]
496
 
497
  with gr.Blocks() as demo:
498
+ gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")
499
 
500
  with gr.Row():
501
  inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
 
525
  const AC = window.AudioContext || window.webkitAudioContext;
526
  if (!AC) return;
527
 
528
+ const PRIME_CHUNKS = 2; // мін. к-ць чанкаў перад стартаваннем гуку
529
+ let primeCounter = 0;
530
+
531
  function toSec(ms) {{ return (ms/1000); }}
532
  function fmtS(x) {{ return (x===null||x===undefined) ? "n/a" : x.toFixed(3) + " s"; }}
533
 
 
576
 
577
  if (!window.__wa) {{
578
  const ctx = new AC({{ sampleRate }});
579
+ const bufferSize = 2048; // большы буфер = менш underrun’аў
580
  const node = ctx.createScriptProcessor(bufferSize, 0, 1);
581
  let queue = [];
582
  let playing = false;
 
618
  get eos() {{ return eos; }},
619
  set eos(v) {{ eos = v; }},
620
  meta,
621
+ push: (f32) => {{
622
+ queue.push(f32);
623
+ if (!meta.t_first_push_ms) {{
624
+ meta.t_first_push_ms = performance.now();
625
+ logUpdate();
626
+ }}
627
+ if (!playing && queue.length >= PRIME_CHUNKS) {{
628
+ // стартуем толькі калі ёсць мінімум 2 чанкі ў чарзе
629
+ window.__wa.start();
630
+ }}
631
+ }},
632
  start: async () => {{ try {{ await ctx.resume(); }} catch(e){{}} playing = true; logUpdate(); }},
633
  stop: () => {{ playing = false; logUpdate(); }},
634
+ reset: () => {{
635
+ playing = false; eos = false; queue = [];
636
+ primeCounter = 0;
637
+ meta.t_first_push_ms = null; meta.t_first_audio_ms = null;
638
+ logUpdate();
639
+ }},
640
  updateLog: logUpdate,
641
  }};
642
  }} else {{
643
  window.__wa.reset();
644
  window.__wa.meta.t_click_ms = performance.now();
645
  }}
 
 
646
  }}
647
  """
648
 
 
652
  PUSH_JS = """
653
  (b64) => {
654
  if (!window.__wa || !b64) return;
655
+ if (b64 === "__STOP__") { window.__wa.eos = true; window.__wa.updateLog && window.__wa.updateLog(); return; }
 
 
 
 
 
 
 
 
 
656
  const bin = atob(b64);
657
  const len = bin.length;
658
  const buf = new ArrayBuffer(len);
 
669
  try {
670
  if (js) {
671
  const obj = JSON.parse(js);
672
+ window.__wa.meta.server = obj;
673
  window.__wa.updateLog && window.__wa.updateLog();
674
  }
675
  } catch (e) {}
 
685
  }
686
  """
687
 
 
688
  play_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_JS)
689
  stop_btn.click(fn=None, inputs=[], outputs=[], js=STOP_JS)
690