archivartaunik committed on
Commit
99298aa
·
verified ·
1 Parent(s): 91d0f80

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +145 -337
app.py CHANGED
@@ -88,12 +88,15 @@ XTTS_MODEL.tokenizer = tokenizer
88
  # =========================================================
89
  # 4) Streaming-канфіг
90
  # =========================================================
91
- MIN_BUFFER_S = 0.03 # бяспечны выхадны буфер для плэера
92
- RUNTIME_FIRST_CHUNK_S = 0.02 # унутраны чанк у генерацыі
 
 
 
93
  FADE_S = 0.004
94
  TOKENS_PER_STEP = 1
95
  ENABLE_TEXT_SPLITTING = True
96
- FIRST_SEGMENT_LIMIT = 160 # стабільная прасадыя для 1-га сегмента
97
 
98
  # -------------------- утыліты аўдыя ----------------------
99
  def _seconds_to_samples(sec: float, sr: int) -> int:
@@ -127,25 +130,6 @@ def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> n
127
  rest = b[fade_n:]
128
  return np.concatenate([head, tail, rest], axis=0)
129
 
130
- def _bpe_prefixes(text: str, lang: str, step_tokens: int):
131
- try:
132
- ids = tokenizer.encode(text, lang=lang)
133
- n = len(ids)
134
- for k in range(step_tokens, n + 1, step_tokens):
135
- yield tokenizer.decode(ids[:k], lang=lang)
136
- if n % step_tokens != 0:
137
- yield tokenizer.decode(ids, lang=lang)
138
- return
139
- except Exception:
140
- pass
141
- pseudo_tokens = re.findall(r"\S+|\s+", text)
142
- acc = ""
143
- for i in range(0, len(pseudo_tokens), step_tokens):
144
- acc = "".join(pseudo_tokens[: i + step_tokens])
145
- yield acc
146
- if acc.strip() != text.strip():
147
- yield text
148
-
149
  def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any, **gen_kwargs) -> Iterator[np.ndarray]:
150
  sig = inspect.signature(model.inference_stream)
151
  call_kwargs = dict(text=text, language=language, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding)
@@ -158,22 +142,6 @@ def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent: Any,
158
  for out in generator:
159
  yield _to_np_audio(out)
160
 
161
- def _fallback_incremental(model: Xtts, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any, tokens_per_step: int, **gen_kwargs) -> Iterator[np.ndarray]:
162
- emitted = 0
163
- for prefix in _bpe_prefixes(text, language, tokens_per_step):
164
- autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda"))
165
- with torch.inference_mode(), autocast_ctx:
166
- out = model.inference(
167
- text=prefix, language=language,
168
- gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
169
- temperature=gen_kwargs.get("temperature", 0.1),
170
- length_penalty=1.0, repetition_penalty=10.0,
171
- top_k=gen_kwargs.get("top_k", 10), top_p=gen_kwargs.get("top_p", 0.3),
172
- )
173
- wav = _to_np_audio(out)
174
- new_part = wav[emitted:]; emitted = wav.size
175
- if new_part.size: yield new_part
176
-
177
  class NewTTSGenerationMixin:
178
  @torch.inference_mode()
179
  def generate(self: Xtts, text: Optional[str] = None, *, do_stream: bool = False, language: str = "be",
@@ -204,8 +172,7 @@ class NewTTSGenerationMixin:
204
  for chunk in _native_stream(self, text, language, gpt_cond_latent, speaker_embedding, **local_kwargs):
205
  yield chunk
206
  return
207
- for chunk in _fallback_incremental(self, text, language, gpt_cond_latent, speaker_embedding, tokens_per_step, **gen_kwargs):
208
- yield chunk
209
 
210
  def init_stream_support():
211
  Xtts.generate = NewTTSGenerationMixin.generate
@@ -220,35 +187,18 @@ PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
220
  PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
221
 
222
  @dataclass(frozen=True)
223
- class LatentsMeta:
224
- model_id: str
225
- gpt_cond_len: int
226
- max_ref_len: int
227
- sound_norm_refs: bool
228
- xtts_git: str | None = None
229
 
230
  LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
231
  GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}
232
 
233
  def _latents_key(path: str | None, meta: LatentsMeta) -> str:
234
- if path and os.path.exists(path):
235
- base = f"{os.path.abspath(path)}:{os.path.getmtime(path)}:{os.path.getsize(path)}"
236
- else:
237
- base = "default_voice"
238
- meta_str = json.dumps({
239
- "model_id": meta.model_id,
240
- "gpt_cond_len": meta.gpt_cond_len,
241
- "max_ref_len": meta.max_ref_len,
242
- "sound_norm_refs": meta.sound_norm_refs,
243
- "xtts_git": meta.xtts_git,
244
- }, sort_keys=True)
245
  return hashlib.md5((base + "|" + meta_str).encode("utf-8")).hexdigest()
246
 
247
- def _latents_disk_path(key: str) -> pathlib.Path:
248
- return PERSIST_LATENTS_DIR / f"{key}.pt"
249
-
250
- def _save_latents_to_disk(key: str, gpt_cond_latent: torch.Tensor, speaker_embedding: torch.Tensor):
251
- torch.save({"gpt_cond_latent": gpt_cond_latent.cpu(), "speaker_embedding": speaker_embedding.cpu()}, _latents_disk_path(key))
252
 
253
  def _load_latents_from_disk(key: str) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
254
  p = _latents_disk_path(key)
@@ -258,49 +208,27 @@ def _load_latents_from_disk(key: str) -> Optional[Tuple[torch.Tensor, torch.Tens
258
 
259
  def _compute_latents_cpu(path: str | None) -> Tuple[torch.Tensor, torch.Tensor]:
260
  with torch.inference_mode():
261
- g, s = XTTS_MODEL.get_conditioning_latents(
262
- audio_path=path,
263
- gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
264
- max_ref_length=XTTS_MODEL.config.max_ref_len,
265
- sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
266
- )
267
  return g.cpu(), s.cpu()
268
 
269
  def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[torch.Tensor, torch.Tensor]:
270
- meta = LatentsMeta(
271
- model_id=repo_id,
272
- gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
273
- max_ref_len=XTTS_MODEL.config.max_ref_len,
274
- sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
275
- xtts_git=None,
276
- )
277
  key = _latents_key(path, meta)
278
-
279
- if key in LATENT_CACHE:
280
- g, s = LATENT_CACHE[key]
281
- else:
282
- loaded = _load_latents_from_disk(key)
283
- if loaded is None:
284
- g, s = _compute_latents_cpu(path)
285
- _save_latents_to_disk(key, g, s)
286
- else:
287
- g, s = loaded
288
- LATENT_CACHE[key] = (g, s)
289
-
290
  if to_device and to_device.startswith("cuda"):
291
  dev_key = (key, to_device)
292
- if dev_key in GPU_LATENT_CACHE:
293
- return GPU_LATENT_CACHE[dev_key]
294
- g2 = g.to(to_device, non_blocking=True)
295
- s2 = s.to(to_device, non_blocking=True)
296
  GPU_LATENT_CACHE[dev_key] = (g2, s2)
297
  return g2, s2
298
  return g, s
299
 
300
- try:
301
- _ = _latents_for(default_voice_file)
302
- except Exception as e:
303
- print(f"[warn] precompute default voice latents failed: {e}")
304
 
305
  # ---------------------------------------------------------
306
  # 6) буферы + base64
@@ -308,113 +236,65 @@ except Exception as e:
308
  def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
309
  if not chunks: return np.zeros((0,), dtype=np.float32)
310
  out = chunks[0]
311
- for i in range(1, len(chunks)):
312
- out = _crossfade_concat(out, chunks[i], sampling_rate, FADE_S)
313
  return out
314
 
315
- def _chunker(chunks: Iterable[np.ndarray], sr: int, target_s: float) -> Iterable[np.ndarray]:
316
- target_samples = _seconds_to_samples(target_s, sr)
317
- buf = np.zeros((0,), dtype=np.float32)
 
318
  for c in chunks:
319
- c = _to_np_audio(c)
320
- if c.size == 0: continue
321
- buf = c if buf.size == 0 else np.concatenate([buf, c], axis=0) # <--- Спрошчанае зліццё для буфера
322
- if buf.size >= target_samples:
323
- yield buf
324
- buf = np.zeros((0,), dtype=np.float32)
325
- if buf.size: yield buf
326
-
327
- def _pcm_f32_to_b64(x: np.ndarray) -> str:
328
- if x.dtype != np.float32: x = x.astype(np.float32, copy=False)
329
- return base64.b64encode(x.tobytes()).decode("ascii")
330
 
331
  # ---------------------------------------------------------
332
- # 7) падзел тэксту: хуткі + fallback
333
  # ---------------------------------------------------------
334
- _SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
335
- _WS = re.compile(r"\s+")
336
-
337
- def _fast_split(text: str, limit: int) -> List[str]:
338
- text = text.strip()
339
- if not text: return []
340
- parts = []
341
- start = 0
342
- for m in _SENT_END.finditer(text):
343
- end = m.end()
344
- parts.append(text[start:end].strip())
345
- start = end
346
- if start < len(text): parts.append(text[start:].strip())
347
- chunks = []
348
- cur = ""
349
- for s in parts:
350
- if len(cur) + 1 + len(s) <= limit:
351
- cur = (cur + " " + s).strip() if cur else s
352
- else:
353
- if cur: chunks.append(cur)
354
- if len(s) <= limit:
355
- cur = s
356
- else:
357
- w = _WS.split(s); acc = ""
358
- for tok in w:
359
- if len(acc) + 1 + len(tok) <= limit:
360
- acc = (acc + " " + tok).strip() if acc else tok
361
- else:
362
- if acc: chunks.append(acc)
363
- acc = tok
364
- if acc: cur = acc
365
- else: cur = ""
366
- if cur: chunks.append(cur)
367
- return [c for c in chunks if c]
368
-
369
  def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  text_in = text_in.strip()
371
  if not text_in: return []
372
- parts: List[str] = []
373
- if len(text_in) > FIRST_SEGMENT_LIMIT:
374
- head = text_in[:FIRST_SEGMENT_LIMIT]
375
- m = re.search(r".*[\.!\?…»)]", head)
376
- if m and len(m.group(0)) > 30:
377
- head = m.group(0)
378
- tail = text_in[len(head):].lstrip()
379
- parts.append(head)
380
- text_for_rest = tail
381
- else:
382
- text_for_rest = text_in
383
- if not text_for_rest: return parts or [text_in]
384
-
385
- rest = _fast_split(text_for_rest, chunk_limit)
386
- if not rest or sum(len(x) for x in rest) < int(0.6 * len(text_for_rest)):
387
- try:
388
- rest2 = split_sentence(text_for_rest, lang=lang_short, text_split_length=chunk_limit)
389
- rest2 = [s.strip() for s in rest2 if s and s.strip()]
390
- if rest2: rest = rest2
391
- except Exception:
392
- pass
393
- return parts + (rest or [text_for_rest])
394
 
395
  # ---------------------------------------------------------
396
- # 8) TTS — стрим + фінальны файл + лагі
397
  # ---------------------------------------------------------
398
  @spaces.GPU(duration=60)
399
- def text_to_speech(belarusian_story, speaker_audio_file=None):
400
  t0 = time.perf_counter()
401
-
402
- if not belarusian_story or str(belarusian_story).strip() == "":
403
- raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
404
-
405
- if not speaker_audio_file or (
406
- not isinstance(speaker_audio_file, str)
407
- and getattr(speaker_audio_file, "name", "") == ""
408
- ):
409
- speaker_audio_file = default_voice_file
410
-
411
- text_in = str(belarusian_story).strip()
412
- lang_short = "be"
413
  chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
414
 
415
  t_lat0 = time.perf_counter()
416
- to_dev = "cuda:0" if torch.cuda.is_available() else None
417
- gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
418
  t_lat1 = time.perf_counter()
419
 
420
  t_split0 = time.perf_counter()
@@ -423,39 +303,28 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
423
  t_split1 = time.perf_counter()
424
 
425
  server_metrics = {
426
- "latents_s": (t_lat1 - t_lat0),
427
- "text_split_s": (t_split1 - t_split0),
428
- "gen_init_to_first_chunk_s": None,
429
- "until_first_chunk_total_s": None,
430
- "server_unaccounted_before_first_chunk_s": None,
431
- "file_write_s": None,
432
  }
433
  yield ("", None, None, json.dumps(server_metrics))
434
 
435
- full_audio_chunks: List[np.ndarray] = []
436
- first_chunk_seen = False
437
  t_gen0 = time.perf_counter()
438
-
439
- # <--- ВЫПРАЎЛЕННЕ: вернута простая і надзейная логіка апрацоўкі стрыму
440
  for part in texts:
441
  gen = XTTS_MODEL.generate(
442
  text=part, do_stream=True, language=lang_short,
443
  gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
444
- min_buffer_s=RUNTIME_FIRST_CHUNK_S,
445
- tokens_per_step=TOKENS_PER_STEP,
446
- stream_chunk_size_s=RUNTIME_FIRST_CHUNK_S,
447
- temperature=0.1, length_penalty=1.0, repetition_penalty=10.0,
448
- top_k=10, top_p=0.3,
449
  )
450
- # Мы выкарыстоўваем _chunker для ўсяго патоку, каб забяспечыць стабільны памер чанкаў і пазбегнуць паўз
451
- for buf in _chunker(gen, sampling_rate, MIN_BUFFER_S):
452
  if not first_chunk_seen:
453
  t_first = time.perf_counter()
454
- server_metrics["gen_init_to_first_chunk_s"] = (t_first - t_gen0)
455
- server_metrics["until_first_chunk_total_s"] = (t_first - t0)
456
- known = server_metrics["latents_s"] + server_metrics["text_split_s"] + server_metrics["gen_init_to_first_chunk_s"]
457
- other = server_metrics["until_first_chunk_total_s"] - known
458
- server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, other)
459
  first_chunk_seen = True
460
  yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
461
  else:
@@ -467,49 +336,36 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
467
 
468
  t_w0 = time.perf_counter()
469
  full_audio = _merge_for_file(full_audio_chunks)
470
- tmp = None
471
- try:
472
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
473
- write(tmp.name, sampling_rate, full_audio.astype(np.float32))
474
- except Exception as e:
475
- raise gr.Error(f"Памылка пры запісе фінальнага WAV: {e}")
476
- finally:
477
- t_w1 = time.perf_counter()
478
- server_metrics["file_write_s"] = (t_w1 - t_w0)
479
-
480
  yield ("__STOP__", tmp.name, tmp.name, json.dumps(server_metrics))
481
 
482
  # ---------------------------------------------------------
483
- # 9) UI (JavaScript застаецца без змен, ён працаваў правільна)
484
  # ---------------------------------------------------------
485
- examples = [
486
- ["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", None],
487
- ]
488
 
489
  with gr.Blocks() as demo:
490
  gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")
491
-
492
  with gr.Row():
493
  inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
494
  inp_voice = gr.Audio(type="filepath", label="Прыклад голасу (6–10 сек)", interactive=True)
 
 
 
 
 
495
 
496
  with gr.Row():
497
- play_btn = gr.Button("▶️ Play (stream)")
498
- stop_btn = gr.Button("⏹ Stop (stream)")
499
  run_btn = gr.Button("Згенераваць")
500
  gr.Markdown(f"**Sample rate:** {sampling_rate} Hz")
501
-
502
- log_panel = gr.HTML(
503
- value='<div id="wa-log" style="font-family:system-ui;font-size:12px;white-space:pre-line">[лог пусты]</div>',
504
- label="Лагі плэера",
505
- )
506
-
507
- stream_pipe = gr.Textbox(value="", visible=False, label="stream_pipe")
508
- log_pipe = gr.Textbox(value="", visible=False, label="log_pipe")
509
-
510
- final_file = gr.File(label="Згенераваны WAV (спампаваць)")
511
- final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
512
- play_final_btn = gr.Button("▶️ Play Final")
513
 
514
  INIT_RESET_AND_PLAY_JS = f"""
515
  () => {{
@@ -517,9 +373,6 @@ with gr.Blocks() as demo:
517
  const AC = window.AudioContext || window.webkitAudioContext;
518
  if (!AC) return;
519
 
520
- const PRIME_CHUNKS = 1;
521
- let primeCounter = 0;
522
-
523
  function toSec(ms) {{ return (ms/1000); }}
524
  function fmtS(x) {{ return (x===null||x===undefined) ? "n/a" : x.toFixed(3) + " s"; }}
525
 
@@ -527,60 +380,49 @@ with gr.Blocks() as demo:
527
  const el = document.getElementById('wa-log');
528
  if (!el || !window.__wa || !window.__wa.meta) return;
529
  const m = window.__wa.meta;
 
530
  const lines = [];
531
  lines.push("Клік (Згенераваць): 0.000 s");
532
-
533
- let click_to_first_chunk_s = null;
534
  if (m.t_first_push_ms) {{
535
- click_to_first_chunk_s = toSec(m.t_first_push_ms - m.t_click_ms);
536
  lines.push("Першы чанк прыйшоў: " + click_to_first_chunk_s.toFixed(3) + " s");
537
  if (m.t_first_audio_ms) {{
538
  lines.push("Пачатак прайгравання: " + (toSec(m.t_first_audio_ms - m.t_click_ms)).toFixed(3) + " s");
539
  lines.push("Затрымка (чанк→аўдыя): " + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + " s");
540
  }}
541
  }}
 
 
 
 
 
 
 
 
 
542
 
543
- const s = (m.server || {{}});
544
- lines.push("");
545
- lines.push("— Серверныя метрыкі —");
546
  lines.push("Latents (умоўны голас): " + fmtS(s.latents_s));
547
  lines.push("Падзел тэксту: " + fmtS(s.text_split_s));
548
  lines.push("Ініт→1-ы чанк: " + fmtS(s.gen_init_to_first_chunk_s));
549
  lines.push("Усё да 1-га чанка: " + fmtS(s.until_first_chunk_total_s));
550
  lines.push("Іншая серверная апрац.: " + fmtS(s.server_unaccounted_before_first_chunk_s));
551
  lines.push("Запіс WAV: " + fmtS(s.file_write_s));
552
-
553
- if (click_to_first_chunk_s !== null && s.until_first_chunk_total_s !== null) {{
554
- let est_queue_net = click_to_first_chunk_s - s.until_first_chunk_total_s;
555
- if (!isFinite(est_queue_net) || est_queue_net < 0) est_queue_net = 0;
556
- lines.push("");
557
- lines.push("Ацэнка чаргі ZeroGPU + сеткі: " + est_queue_net.toFixed(3) + " s");
558
- }} else {{
559
- lines.push("");
560
- lines.push("Ацэнка чаргі ZeroGPU + сеткі: n/a");
561
  }}
562
-
563
- lines.push("");
564
- lines.push("Статус стриму: " + (window.__wa.playing ? "playing" : "stopped"));
565
- el.textContent = lines.join("\\n");
566
- try {{ console.log(lines.join("\\n")); }} catch (e) {{}}
567
  }}
568
 
569
  if (!window.__wa) {{
570
  const ctx = new AC({{ sampleRate }});
571
- const bufferSize = 2048;
572
- const node = ctx.createScriptProcessor(bufferSize, 0, 1);
573
- let queue = [];
574
- let playing = false;
575
- let eos = false;
576
-
577
- const meta = {{
578
- t_click_ms: performance.now(),
579
- t_first_push_ms: null,
580
- t_first_audio_ms: null,
581
- server: null,
582
- }};
583
-
584
  node.onaudioprocess = (e) => {{
585
  const out = e.outputBuffer.getChannelData(0);
586
  let i = 0;
@@ -588,106 +430,72 @@ with gr.Blocks() as demo:
588
  if (queue.length === 0 || !playing) {{ out[i++] = 0.0; continue; }}
589
  let cur = queue[0];
590
  const take = Math.min(cur.length, out.length - i);
591
- if (meta.t_first_audio_ms === null) {{
592
- meta.t_first_audio_ms = performance.now();
593
- logUpdate();
594
- }}
595
  out.set(cur.subarray(0, take), i);
596
  i += take;
597
- if (take === cur.length) queue.shift();
598
- else queue[0] = cur.subarray(take);
599
- }}
600
- if (eos && queue.length === 0 && playing) {{
601
- playing = false;
602
- logUpdate();
603
  }}
 
604
  }};
605
  node.connect(ctx.destination);
606
-
607
  window.__wa = {{
608
- ctx, node,
609
- get playing() {{ return playing; }},
610
- get eos() {{ return eos; }},
611
- set eos(v) {{ eos = v; }},
612
- meta,
613
  push: (f32) => {{
614
  queue.push(f32);
615
- if (!meta.t_first_push_ms) {{
616
- meta.t_first_push_ms = performance.now();
617
- logUpdate();
618
- }}
619
- if (!playing && queue.length >= PRIME_CHUNKS) {{
620
- window.__wa.start();
621
- }}
622
  }},
623
- start: async () => {{ try {{ await ctx.resume(); }} catch(e){{}} playing = true; logUpdate(); }},
624
- stop: () => {{ playing = false; logUpdate(); }},
625
  reset: () => {{
626
  playing = false; eos = false; queue = [];
627
- primeCounter = 0;
628
  meta.t_first_push_ms = null; meta.t_first_audio_ms = null;
 
629
  logUpdate();
630
  }},
631
  updateLog: logUpdate,
632
  }};
633
- }} else {{
634
- window.__wa.reset();
635
- window.__wa.meta.t_click_ms = performance.now();
636
  }}
 
637
  }}
638
  """
639
-
640
- STOP_JS = "() => { if (window.__wa) window.__wa.stop(); }"
641
- PLAY_JS = "() => { if (window.__wa) window.__wa.start(); }"
642
-
643
  PUSH_JS = """
644
  (b64) => {
645
  if (!window.__wa || !b64) return;
646
- if (b64 === "__STOP__") { window.__wa.eos = true; window.__wa.updateLog && window.__wa.updateLog(); return; }
647
  const bin = atob(b64);
648
- const len = bin.length;
649
- const buf = new ArrayBuffer(len);
650
  const view = new Uint8Array(buf);
651
- for (let i=0;i<len;i++) view[i] = bin.charCodeAt(i);
652
  const f32 = new Float32Array(buf);
 
 
 
 
 
653
  window.__wa.push(f32);
654
  }
655
  """
656
-
657
  LOG_JS = """
658
  (js) => {
659
- if (!window.__wa) return;
660
  try {
661
- if (js) {
662
- const obj = JSON.parse(js);
663
- window.__wa.meta.server = obj;
664
- window.__wa.updateLog && window.__wa.updateLog();
665
- }
666
  } catch (e) {}
667
  }
668
  """
 
 
 
 
 
 
 
 
 
669
 
670
- PLAY_FINAL_JS = """
671
- () => {
672
- const host = document.getElementById('final-audio');
673
- if (!host) return;
674
- const audio = host.querySelector('audio');
675
- if (audio) { try { audio.play(); } catch(e) {} }
676
- }
677
- """
678
-
679
- play_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_JS)
680
- stop_btn.click(fn=None, inputs=[], outputs=[], js=STOP_JS)
681
-
682
- run_btn.click(fn=None, inputs=[], outputs=[], js=INIT_RESET_AND_PLAY_JS)
683
- run_btn.click(fn=text_to_speech, inputs=[inp_text, inp_voice], outputs=[stream_pipe, final_file, final_audio, log_pipe])
684
-
685
- stream_pipe.change(fn=None, inputs=[stream_pipe], outputs=[], js=PUSH_JS)
686
- log_pipe.change(fn=None, inputs=[log_pipe], outputs=[], js=LOG_JS)
687
-
688
- play_final_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_FINAL_JS)
689
-
690
- gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)
691
 
692
  if __name__ == "__main__":
693
  demo.launch()
 
88
  # =========================================================
89
  # 4) Streaming-канфіг
90
  # =========================================================
91
+ # Значэнні па змаўчанні, якія цяпер будуць перавызначацца з UI
92
+ INITIAL_MIN_BUFFER_S = 0.25
93
+ MIN_BUFFER_S = 0.1
94
+
95
+ RUNTIME_FIRST_CHUNK_S = 0.02
96
  FADE_S = 0.004
97
  TOKENS_PER_STEP = 1
98
  ENABLE_TEXT_SPLITTING = True
99
+ FIRST_SEGMENT_LIMIT = 160
100
 
101
  # -------------------- утыліты аўдыя ----------------------
102
  def _seconds_to_samples(sec: float, sr: int) -> int:
 
130
  rest = b[fade_n:]
131
  return np.concatenate([head, tail, rest], axis=0)
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent: Any, speaker_embedding: Any, **gen_kwargs) -> Iterator[np.ndarray]:
134
  sig = inspect.signature(model.inference_stream)
135
  call_kwargs = dict(text=text, language=language, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding)
 
142
  for out in generator:
143
  yield _to_np_audio(out)
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  class NewTTSGenerationMixin:
146
  @torch.inference_mode()
147
  def generate(self: Xtts, text: Optional[str] = None, *, do_stream: bool = False, language: str = "be",
 
172
  for chunk in _native_stream(self, text, language, gpt_cond_latent, speaker_embedding, **local_kwargs):
173
  yield chunk
174
  return
175
+ raise NotImplementedError("Fallback streaming is not implemented")
 
176
 
177
  def init_stream_support():
178
  Xtts.generate = NewTTSGenerationMixin.generate
 
187
  PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
188
 
189
  @dataclass(frozen=True)
190
+ class LatentsMeta: model_id: str; gpt_cond_len: int; max_ref_len: int; sound_norm_refs: bool; xtts_git: str | None = None
 
 
 
 
 
191
 
192
  LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
193
  GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}
194
 
195
  def _latents_key(path: str | None, meta: LatentsMeta) -> str:
196
+ base = f"{os.path.abspath(path)}:{os.path.getmtime(path)}:{os.path.getsize(path)}" if path and os.path.exists(path) else "default_voice"
197
+ meta_str = json.dumps(meta.__dict__, sort_keys=True)
 
 
 
 
 
 
 
 
 
198
  return hashlib.md5((base + "|" + meta_str).encode("utf-8")).hexdigest()
199
 
200
def _latents_disk_path(key: str) -> pathlib.Path:
    """Return the on-disk location of the latents stored under ``key``.

    The file lives directly in ``PERSIST_LATENTS_DIR`` and is named
    ``<key>.pt``.
    """
    filename = f"{key}.pt"
    return PERSIST_LATENTS_DIR / filename
201
def _save_latents_to_disk(key: str, g: torch.Tensor, s: torch.Tensor):
    """Persist one pair of voice latents under ``key``.

    Args:
        key: Cache key; the destination file is ``_latents_disk_path(key)``.
        g: GPT conditioning latent, stored under ``"gpt_cond_latent"``.
        s: Speaker embedding, stored under ``"speaker_embedding"``.

    Both tensors are moved to the CPU before ``torch.save`` is called.
    """
    payload = {"gpt_cond_latent": g.cpu(), "speaker_embedding": s.cpu()}
    torch.save(payload, _latents_disk_path(key))
 
 
 
202
 
203
  def _load_latents_from_disk(key: str) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
204
  p = _latents_disk_path(key)
 
208
 
209
def _compute_latents_cpu(path: str | None) -> Tuple[torch.Tensor, torch.Tensor]:
    """Compute the conditioning latents for a reference recording.

    The conditioning parameters are taken from ``XTTS_MODEL.config``. The
    latent cache key is built from the same three config values (see
    ``_latents_for``), so they must be passed here explicitly: otherwise the
    latents would be computed with the library defaults while being cached
    as if the configured values had been used.

    Args:
        path: Reference audio file handed to
            ``XTTS_MODEL.get_conditioning_latents`` as ``audio_path``.

    Returns:
        ``(gpt_cond_latent, speaker_embedding)``, both moved to the CPU.
    """
    cfg = XTTS_MODEL.config
    with torch.inference_mode():
        g, s = XTTS_MODEL.get_conditioning_latents(
            audio_path=path,
            gpt_cond_len=cfg.gpt_cond_len,
            max_ref_length=cfg.max_ref_len,
            sound_norm_refs=cfg.sound_norm_refs,
        )
    return g.cpu(), s.cpu()
213
 
214
def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[torch.Tensor, torch.Tensor]:
    """Return the voice latents for ``path``, using every cache layer.

    Lookup order: the in-memory ``LATENT_CACHE``, then the file written by
    ``_save_latents_to_disk``, and only then a fresh computation (which is
    also written to disk). The pair is stored in ``LATENT_CACHE`` afterwards.

    Args:
        path: Reference audio file (forwarded to ``_latents_key`` and
            ``_compute_latents_cpu``).
        to_device: Optional device string. Only values starting with
            ``"cuda"`` trigger a transfer; those copies are memoized per
            ``(key, device)`` in ``GPU_LATENT_CACHE``.

    Returns:
        ``(gpt_cond_latent, speaker_embedding)`` - the device copies when a
        CUDA device was requested, otherwise the cached host tensors.
    """
    cfg = XTTS_MODEL.config
    meta = LatentsMeta(
        model_id=repo_id,
        gpt_cond_len=cfg.gpt_cond_len,
        max_ref_len=cfg.max_ref_len,
        sound_norm_refs=cfg.sound_norm_refs,
        xtts_git=None,
    )
    key = _latents_key(path, meta)

    # A cached pair is a non-empty tuple and therefore truthy; ``None``
    # (a miss) falls through to the next layer.
    cached = LATENT_CACHE.get(key) or _load_latents_from_disk(key)
    if cached:
        g, s = cached
    else:
        g, s = _compute_latents_cpu(path)
        _save_latents_to_disk(key, g, s)
    LATENT_CACHE[key] = (g, s)

    wants_cuda = bool(to_device) and to_device.startswith("cuda")
    if not wants_cuda:
        return g, s

    dev_key = (key, to_device)
    if dev_key in GPU_LATENT_CACHE:
        return GPU_LATENT_CACHE[dev_key]

    g_dev = g.to(to_device, non_blocking=True)
    s_dev = s.to(to_device, non_blocking=True)
    GPU_LATENT_CACHE[dev_key] = (g_dev, s_dev)
    return g_dev, s_dev
229
 
230
# Warm the latent caches for the default voice when the module is loaded.
# Failure is deliberately non-fatal: it is reported and start-up continues.
try:
    _ = _latents_for(default_voice_file)
except Exception as exc:
    print(f"[warn] precompute default voice latents failed: {exc}")
 
 
232
 
233
  # ---------------------------------------------------------
234
  # 6) буферы + base64
 
236
  def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
237
  if not chunks: return np.zeros((0,), dtype=np.float32)
238
  out = chunks[0]
239
+ for i in range(1, len(chunks)): out = _crossfade_concat(out, chunks[i], sampling_rate, FADE_S)
 
240
  return out
241
 
242
def _chunker_with_initial_buffer(chunks: Iterable[np.ndarray], sr: int, initial_target_s: float, target_s: float) -> Iterable[np.ndarray]:
    """Re-chunk a stream of audio pieces into buffers of a minimum length.

    Incoming pieces are accumulated until their total sample count reaches
    the current threshold, then emitted as one concatenated array. The
    threshold starts at ``initial_target_s`` and switches to ``target_s``
    once the first buffer has been emitted.

    Args:
        chunks: Source pieces; each is converted with ``_to_np_audio`` and
            empty ones are skipped.
        sr: Sample rate used to convert seconds to samples.
        initial_target_s: Minimum duration of the first emitted buffer.
        target_s: Minimum duration of every later buffer.

    Yields:
        Concatenated buffers; the final one may be shorter than the
        threshold because whatever is left over is flushed at the end.
    """
    threshold = _seconds_to_samples(initial_target_s, sr)
    awaiting_first = True
    pending = []
    pending_samples = 0

    for raw in chunks:
        audio = _to_np_audio(raw)
        if audio.size == 0:
            continue

        pending.append(audio)
        pending_samples += audio.size
        if pending_samples < threshold:
            continue

        yield np.concatenate(pending, axis=0)
        pending = []
        pending_samples = 0
        if awaiting_first:
            # The start-up buffer is out; use the steady-state size from now on.
            awaiting_first = False
            threshold = _seconds_to_samples(target_s, sr)

    if pending_samples > 0:
        yield np.concatenate(pending, axis=0)
256
+
257
+ def _pcm_f32_to_b64(x: np.ndarray) -> str: return base64.b64encode(x.astype(np.float32).tobytes()).decode("ascii")
258
 
259
  # ---------------------------------------------------------
260
+ # 7) падзел тэксту
261
  # ---------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[str]:
263
+ # (Функцыя засталася без змен)
264
+ _SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
265
+ def _fast_split(text: str, limit: int) -> List[str]:
266
+ text = text.strip()
267
+ if not text: return []
268
+ parts = [s.strip() for s in _SENT_END.split(text) if s and s.strip()]
269
+ chunks, cur = [], ""
270
+ for s in parts:
271
+ if len(cur) + 1 + len(s) <= limit: cur = (cur + " " + s).strip() if cur else s
272
+ else:
273
+ if cur: chunks.append(cur)
274
+ cur = s
275
+ if cur: chunks.append(cur)
276
+ return chunks
277
+
278
  text_in = text_in.strip()
279
  if not text_in: return []
280
+ try:
281
+ return [s.strip() for s in split_sentence(text_in, lang=lang_short, text_split_length=chunk_limit) if s and s.strip()]
282
+ except Exception:
283
+ return _fast_split(text_in, chunk_limit)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
  # ---------------------------------------------------------
286
+ # 8) TTS — стрым-функцыя
287
  # ---------------------------------------------------------
288
  @spaces.GPU(duration=60)
289
+ def text_to_speech(belarusian_story, speaker_audio_file, initial_buffer_s, subsequent_buffer_s):
290
  t0 = time.perf_counter()
291
+ if not belarusian_story or str(belarusian_story).strip() == "": raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
292
+ speaker_audio_file = speaker_audio_file or default_voice_file
293
+ text_in, lang_short = str(belarusian_story).strip(), "be"
 
 
 
 
 
 
 
 
 
294
  chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
295
 
296
  t_lat0 = time.perf_counter()
297
+ gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=device)
 
298
  t_lat1 = time.perf_counter()
299
 
300
  t_split0 = time.perf_counter()
 
303
  t_split1 = time.perf_counter()
304
 
305
  server_metrics = {
306
+ "latents_s": t_lat1 - t_lat0, "text_split_s": t_split1 - t_split0,
307
+ "initial_buffer_s": initial_buffer_s, "subsequent_buffer_s": subsequent_buffer_s,
 
 
 
 
308
  }
309
  yield ("", None, None, json.dumps(server_metrics))
310
 
311
+ full_audio_chunks, first_chunk_seen = [], False
 
312
  t_gen0 = time.perf_counter()
 
 
313
  for part in texts:
314
  gen = XTTS_MODEL.generate(
315
  text=part, do_stream=True, language=lang_short,
316
  gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
317
+ stream_chunk_size_s=RUNTIME_FIRST_CHUNK_S, temperature=0.1,
318
+ length_penalty=1.0, repetition_penalty=10.0, top_k=10, top_p=0.3,
 
 
 
319
  )
320
+ chunk_iterator = _chunker_with_initial_buffer(gen, sampling_rate, initial_buffer_s, subsequent_buffer_s)
321
+ for buf in chunk_iterator:
322
  if not first_chunk_seen:
323
  t_first = time.perf_counter()
324
+ server_metrics["gen_init_to_first_chunk_s"] = t_first - t_gen0
325
+ server_metrics["until_first_chunk_total_s"] = t_first - t0
326
+ known = sum(v for k, v in server_metrics.items() if k.endswith("_s"))
327
+ server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, (t_first - t0) - known)
 
328
  first_chunk_seen = True
329
  yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
330
  else:
 
336
 
337
  t_w0 = time.perf_counter()
338
  full_audio = _merge_for_file(full_audio_chunks)
339
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
340
+ write(tmp.name, sampling_rate, full_audio)
341
+ t_w1 = time.perf_counter()
342
+ server_metrics["file_write_s"] = t_w1 - t_w0
 
 
 
 
 
 
343
  yield ("__STOP__", tmp.name, tmp.name, json.dumps(server_metrics))
344
 
345
  # ---------------------------------------------------------
346
+ # 9) UI
347
  # ---------------------------------------------------------
348
+ examples = [["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", None, 0.25, 0.1]]
 
 
349
 
350
  with gr.Blocks() as demo:
351
  gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")
 
352
  with gr.Row():
353
  inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
354
  inp_voice = gr.Audio(type="filepath", label="Прыклад голасу (6–10 сек)", interactive=True)
355
+
356
+ # <--- ВЫПРАЎЛЕННЕ: Дададзены слайдары для налад
357
+ with gr.Accordion("Дадатковыя налады стрымінгу", open=True):
358
+ initial_buffer_slider = gr.Slider(minimum=0.1, maximum=1.0, value=INITIAL_MIN_BUFFER_S, step=0.05, label="Пачатковы буфер (с)", info="Большае з��ачэнне памяншае рызыку паўзы на старце, але трохі павялічвае пачатковую затрымку.")
359
+ subsequent_buffer_slider = gr.Slider(minimum=0.05, maximum=0.5, value=MIN_BUFFER_S, step=0.01, label="Наступны буфер (с)", info="Меншае значэнне дае меншую агульную затрымку, але патрабуе больш стабільнай працы мадэлі.")
360
 
361
  with gr.Row():
 
 
362
  run_btn = gr.Button("Згенераваць")
363
  gr.Markdown(f"**Sample rate:** {sampling_rate} Hz")
364
+
365
+ log_panel = gr.HTML(value='<div id="wa-log" style="font-family:monospace;font-size:12px;white-space:pre-line">[лог пусты]</div>', label="Лагі плэера")
366
+ stream_pipe, log_pipe = gr.Textbox(visible=False), gr.Textbox(visible=False)
367
+ final_file = gr.File(label="Згенераваны WAV (спампаваць)")
368
+ final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False)
 
 
 
 
 
 
 
369
 
370
  INIT_RESET_AND_PLAY_JS = f"""
371
  () => {{
 
373
  const AC = window.AudioContext || window.webkitAudioContext;
374
  if (!AC) return;
375
 
 
 
 
376
  function toSec(ms) {{ return (ms/1000); }}
377
  function fmtS(x) {{ return (x===null||x===undefined) ? "n/a" : x.toFixed(3) + " s"; }}
378
 
 
380
  const el = document.getElementById('wa-log');
381
  if (!el || !window.__wa || !window.__wa.meta) return;
382
  const m = window.__wa.meta;
383
+ const s = (m.server || {{}});
384
  const lines = [];
385
  lines.push("Клік (Згенераваць): 0.000 s");
 
 
386
  if (m.t_first_push_ms) {{
387
+ const click_to_first_chunk_s = toSec(m.t_first_push_ms - m.t_click_ms);
388
  lines.push("Першы чанк прыйшоў: " + click_to_first_chunk_s.toFixed(3) + " s");
389
  if (m.t_first_audio_ms) {{
390
  lines.push("Пачатак прайгравання: " + (toSec(m.t_first_audio_ms - m.t_click_ms)).toFixed(3) + " s");
391
  lines.push("Затрымка (чанк→аўдыя): " + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + " s");
392
  }}
393
  }}
394
+
395
+ // <--- ВЫПРАЎЛЕННЕ: Новы блок логаў
396
+ lines.push("\\n— Налады стрыму —");
397
+ lines.push("Пачатковы буфер (запыт): " + fmtS(s.initial_buffer_s));
398
+ lines.push("Наступны буфер (запыт): " + fmtS(s.subsequent_buffer_s));
399
+ if (m.chunk_durations && m.chunk_durations.length > 0) {{
400
+ lines.push("Працягласць 1-га чанка: " + m.chunk_durations[0] + " s");
401
+ lines.push("Атрымана чанкаў: " + m.chunk_durations.length);
402
+ }}
403
 
404
+ lines.push("\\n— Серверныя метрыкі —");
 
 
405
  lines.push("Latents (умоўны голас): " + fmtS(s.latents_s));
406
  lines.push("Падзел тэксту: " + fmtS(s.text_split_s));
407
  lines.push("Ініт→1-ы чанк: " + fmtS(s.gen_init_to_first_chunk_s));
408
  lines.push("Усё да 1-га чанка: " + fmtS(s.until_first_chunk_total_s));
409
  lines.push("Іншая серверная апрац.: " + fmtS(s.server_unaccounted_before_first_chunk_s));
410
  lines.push("Запіс WAV: " + fmtS(s.file_write_s));
411
+
412
+ if (m.t_first_push_ms && s.until_first_chunk_total_s) {{
413
+ let est_queue_net = toSec(m.t_first_push_ms - m.t_click_ms) - s.until_first_chunk_total_s;
414
+ lines.push("\\nАцэнка чаргі ZeroGPU + сеткі: " + Math.max(0, est_queue_net).toFixed(3) + " s");
 
 
 
 
 
415
  }}
416
+
417
+ lines.push("\\nСтатус стриму: " + (window.__wa.playing ? "playing" : "stopped"));
418
+ el.innerHTML = lines.join("\\n");
 
 
419
  }}
420
 
421
  if (!window.__wa) {{
422
  const ctx = new AC({{ sampleRate }});
423
+ const node = ctx.createScriptProcessor(4096, 0, 1);
424
+ let queue = [], playing = false, eos = false;
425
+ const meta = {{ t_click_ms: performance.now(), chunk_durations: [] }};
 
 
 
 
 
 
 
 
 
 
426
  node.onaudioprocess = (e) => {{
427
  const out = e.outputBuffer.getChannelData(0);
428
  let i = 0;
 
430
  if (queue.length === 0 || !playing) {{ out[i++] = 0.0; continue; }}
431
  let cur = queue[0];
432
  const take = Math.min(cur.length, out.length - i);
433
+ if (meta.t_first_audio_ms === null) {{ meta.t_first_audio_ms = performance.now(); logUpdate(); }}
 
 
 
434
  out.set(cur.subarray(0, take), i);
435
  i += take;
436
+ if (take === cur.length) queue.shift(); else queue[0] = cur.subarray(take);
 
 
 
 
 
437
  }}
438
+ if (eos && queue.length === 0 && playing) {{ playing = false; logUpdate(); }}
439
  }};
440
  node.connect(ctx.destination);
 
441
  window.__wa = {{
442
+ ctx, node, meta, playing, eos,
 
 
 
 
443
  push: (f32) => {{
444
  queue.push(f32);
445
+ if (!meta.t_first_push_ms) {{ meta.t_first_push_ms = performance.now(); }}
446
+ if (!playing && queue.length >= 1) {{ playing = true; try{{ctx.resume()}}catch(e){{}} }}
447
+ logUpdate();
 
 
 
 
448
  }},
 
 
449
  reset: () => {{
450
  playing = false; eos = false; queue = [];
451
+ meta.t_click_ms = performance.now();
452
  meta.t_first_push_ms = null; meta.t_first_audio_ms = null;
453
+ meta.chunk_durations = []; meta.server = null;
454
  logUpdate();
455
  }},
456
  updateLog: logUpdate,
457
  }};
 
 
 
458
  }}
459
+ window.__wa.reset();
460
  }}
461
  """
 
 
 
 
462
  PUSH_JS = """
463
  (b64) => {
464
  if (!window.__wa || !b64) return;
465
+ if (b64 === "__STOP__") { window.__wa.eos = true; window.__wa.updateLog(); return; }
466
  const bin = atob(b64);
467
+ const buf = new ArrayBuffer(bin.length);
 
468
  const view = new Uint8Array(buf);
469
+ for (let i=0; i<bin.length; i++) view[i] = bin.charCodeAt(i);
470
  const f32 = new Float32Array(buf);
471
+
472
+ // <--- ВЫПРАЎЛЕННЕ: Дадаем разлік працягласці чанка ў логі
473
+ const duration = f32.length / window.__wa.ctx.sampleRate;
474
+ window.__wa.meta.chunk_durations.push(duration.toFixed(3));
475
+
476
  window.__wa.push(f32);
477
  }
478
  """
 
479
  LOG_JS = """
480
  (js) => {
481
+ if (!window.__wa || !js) return;
482
  try {
483
+ window.__wa.meta.server = JSON.parse(js);
484
+ window.__wa.updateLog();
 
 
 
485
  } catch (e) {}
486
  }
487
  """
488
+ # <--- ВЫПРАЎЛЕННЕ: Перадаем значэнні са слайдараў у бэкэнд
489
+ run_btn.click(fn=None, _js=INIT_RESET_AND_PLAY_JS)
490
+ run_btn.click(
491
+ fn=text_to_speech,
492
+ inputs=[inp_text, inp_voice, initial_buffer_slider, subsequent_buffer_slider],
493
+ outputs=[stream_pipe, final_file, final_audio, log_pipe]
494
+ )
495
+ stream_pipe.change(fn=None, inputs=[stream_pipe], _js=PUSH_JS)
496
+ log_pipe.change(fn=None, inputs=[log_pipe], _js=LOG_JS)
497
 
498
+ gr.Examples(examples=examples, inputs=[inp_text, inp_voice, initial_buffer_slider, subsequent_buffer_slider], cache_examples=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
499
 
500
  if __name__ == "__main__":
501
  demo.launch()