archivartaunik committed on
Commit
c3bdefc
·
verified ·
1 Parent(s): d823d7e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +166 -147
app.py CHANGED
@@ -3,18 +3,9 @@ os.environ.setdefault("OMP_NUM_THREADS", "1")
3
  os.environ.setdefault("MKL_NUM_THREADS", "1")
4
  os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
5
 
6
- import sys
7
- import re
8
- import time
9
- import json
10
- import base64
11
- import hashlib
12
- import tempfile
13
- import subprocess
14
- import inspect
15
  from typing import Iterator, Iterable, Optional, Tuple, Any, List
16
  from dataclasses import dataclass
17
- import pathlib
18
 
19
  import spaces
20
  import gradio as gr
@@ -23,7 +14,7 @@ import numpy as np
23
  from huggingface_hub import hf_hub_download
24
  from scipy.io.wavfile import write
25
 
26
- # ---------- coqui-ai-TTS fork ----------
27
  REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
28
  REPO_DIR = "coqui-ai-TTS"
29
  if not os.path.exists(REPO_DIR):
@@ -36,10 +27,9 @@ from TTS.tts.configs.xtts_config import XttsConfig
36
  from TTS.tts.models.xtts import Xtts
37
  from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
38
 
39
- # ---------- model files ----------
40
  repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
41
- model_dir = "./model"
42
- os.makedirs(model_dir, exist_ok=True)
43
  for fname in ("model.pth", "config.json", "vocab.json", "voice.wav"):
44
  if not os.path.exists(os.path.join(model_dir, fname)):
45
  hf_hub_download(repo_id, filename=fname, local_dir=model_dir)
@@ -48,7 +38,7 @@ config_file = os.path.join(model_dir, "config.json")
48
  vocab_file = os.path.join(model_dir, "vocab.json")
49
  default_voice_file = os.path.join(model_dir, "voice.wav")
50
 
51
- # ---------- load model ----------
52
  config = XttsConfig(); config.load_json(config_file)
53
  XTTS_MODEL = Xtts.init_from_config(config)
54
  XTTS_MODEL.load_checkpoint(config, checkpoint_path=checkpoint_file, vocab_path=vocab_file, use_deepspeed=False)
@@ -60,35 +50,36 @@ if device.startswith("cuda"):
60
  torch.backends.cudnn.allow_tf32 = True
61
  torch.backends.cudnn.benchmark = True
62
  torch.set_float32_matmul_precision("high")
 
63
  XTTS_MODEL.to(device).eval()
64
  sampling_rate = int(XTTS_MODEL.config.audio["sample_rate"])
65
 
66
  tokenizer = VoiceBpeTokenizer(vocab_file=vocab_file)
67
  XTTS_MODEL.tokenizer = tokenizer
68
 
69
- # ---------- defaults ----------
70
- DEF_MIN_BUFFER_S = 0.06
71
- DEF_FIRST_CHUNK_S = 0.03
72
- DEF_TOKENS_PER_STEP = 1
73
- DEF_ENABLE_TEXT_SPLIT = True
74
  DEF_FIRST_SEGMENT_LIMIT = 160
75
  FADE_S = 0.004
76
 
77
- DEF_CLIENT_PREROLL = 0.18
78
- DEF_CLIENT_LOWWM = 0.06
79
- MAX_CLIENT_PREROLL = 0.32
80
- STEP_CLIENT_PREROLL = 0.04
81
 
82
- # ---------- audio utils ----------
83
- def _seconds_to_samples(sec: float, sr: int) -> int:
84
- return max(1, int(sec * sr))
85
 
86
  def _to_np_audio(x) -> np.ndarray:
87
  if isinstance(x, dict) and "wav" in x: x = x["wav"]
88
  if isinstance(x, torch.Tensor):
89
  if x.dtype != torch.float32: x = x.float()
90
  return x.detach().cpu().contiguous().view(-1).numpy()
91
- x = np.asarray(x); x = x.reshape(-1) if x.ndim > 1 else x
 
92
  return x.astype(np.float32, copy=False) if x.dtype != np.float32 else x
93
 
94
  def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> np.ndarray:
@@ -98,7 +89,7 @@ def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> n
98
  fade_n = min(_seconds_to_samples(fade_s, sr), a.size, b.size)
99
  if fade_n <= 1: return np.concatenate([a, b], axis=0)
100
  fade_out = np.linspace(1.0, 0.0, fade_n, dtype=np.float32); fade_in = 1.0 - fade_out
101
- head = a[:-fade_n]; tail = (a[-fade_n:] * fade_out) + (b[:fade_n] * fade_in); rest = b[fade_n:]
102
  return np.concatenate([head, tail, rest], axis=0)
103
 
104
  def _bpe_prefixes(text: str, lang: str, step_tokens: int):
@@ -108,8 +99,7 @@ def _bpe_prefixes(text: str, lang: str, step_tokens: int):
108
  if n % step_tokens != 0: yield tokenizer.decode(ids, lang=lang); return
109
  except Exception: pass
110
  pseudo = re.findall(r"\S+|\s+", text); acc = ""
111
- for i in range(0, len(pseudo), step_tokens):
112
- acc = "".join(pseudo[: i + step_tokens]); yield acc
113
  if acc.strip() != text.strip(): yield text
114
 
115
  def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent, speaker_embedding, **gen_kwargs):
@@ -164,11 +154,14 @@ def init_stream_support():
164
  Xtts.sample_stream = NewTTSGenerationMixin.sample_stream
165
  init_stream_support()
166
 
167
- # ---------- latents cache ----------
168
- PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache"); PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
 
 
169
  @dataclass(frozen=True)
170
  class LatentsMeta:
171
  model_id: str; gpt_cond_len: int; max_ref_len: int; sound_norm_refs: bool; xtts_git: str | None = None
 
172
  LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
173
  GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}
174
 
@@ -178,11 +171,13 @@ def _latents_key(path: str | None, meta: LatentsMeta) -> str:
178
  return hashlib.md5((base + "|" + meta_str).encode("utf-8")).hexdigest()
179
 
180
  def _latents_disk_path(key: str) -> pathlib.Path: return PERSIST_LATENTS_DIR / f"{key}.pt"
 
181
  def _save_latents_to_disk(key: str, gpt, spk): torch.save({"gpt_cond_latent": gpt.cpu(), "speaker_embedding": spk.cpu()}, _latents_disk_path(key))
 
182
  def _load_latents_from_disk(key: str):
183
- p=_latents_disk_path(key);
184
  if not p.exists(): return None
185
- obj=torch.load(p, map_location="cpu"); return obj["gpt_cond_latent"], obj["speaker_embedding"]
186
 
187
  def _compute_latents_cpu(path: str | None):
188
  with torch.inference_mode():
@@ -208,7 +203,7 @@ def _latents_for(path: str | None, *, to_device: Optional[str] = None):
208
  try: _ = _latents_for(default_voice_file)
209
  except Exception as e: print(f"[warn] precompute default voice latents failed: {e}")
210
 
211
- # ---------- stream packing ----------
212
  def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
213
  if not chunks: return np.zeros((0,), dtype=np.float32)
214
  out = chunks[0]
@@ -228,7 +223,7 @@ def _pcm_f32_to_b64(x: np.ndarray) -> str:
228
  if x.dtype != np.float32: x = x.astype(np.float32, copy=False)
229
  return base64.b64encode(x.tobytes()).decode("ascii")
230
 
231
- # ---------- text split ----------
232
  _SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
233
  _WS = re.compile(r"\s+")
234
  def _fast_split(text: str, limit: int) -> List[str]:
@@ -272,17 +267,20 @@ def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int, first_seg
272
  except Exception: pass
273
  return parts + (rest or [text_for_rest])
274
 
275
- # ---------- TTS endpoint ----------
276
  @spaces.GPU(duration=60)
277
- def text_to_speech(belarusian_story, speaker_audio_file=None,
278
- min_buffer_s: float = DEF_MIN_BUFFER_S,
279
- first_chunk_s: float = DEF_FIRST_CHUNK_S,
280
- enable_text_splitting: bool = DEF_ENABLE_TEXT_SPLIT,
281
- tokens_per_step: int = DEF_TOKENS_PER_STEP,
282
- first_segment_limit: int = DEF_FIRST_SEGMENT_LIMIT):
 
 
283
  t0 = time.perf_counter()
284
  if not belarusian_story or str(belarusian_story).strip() == "":
285
  raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
 
286
  if not speaker_audio_file or (not isinstance(speaker_audio_file, str) and getattr(speaker_audio_file, "name", "") == ""):
287
  speaker_audio_file = default_voice_file
288
 
@@ -313,13 +311,16 @@ def text_to_speech(belarusian_story, speaker_audio_file=None,
313
  yield ("", None, None, json.dumps(server_metrics))
314
 
315
  full_audio_chunks=[]; first_chunk_seen=False; t_gen0=time.perf_counter()
 
316
  for part in texts:
317
  gen = XTTS_MODEL.generate(
318
  text=part, do_stream=True, language=lang_short,
319
  gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
320
- min_buffer_s=float(first_chunk_s), tokens_per_step=int(tokens_per_step),
 
321
  stream_chunk_size_s=float(first_chunk_s),
322
- temperature=0.1, length_penalty=1.0, repetition_penalty=10.0, top_k=10, top_p=0.3,
 
323
  )
324
  for buf in _chunker(gen, sampling_rate, float(min_buffer_s)):
325
  if not first_chunk_seen:
@@ -349,7 +350,7 @@ def text_to_speech(belarusian_story, speaker_audio_file=None,
349
 
350
  yield ("__STOP__", tmp.name, tmp.name, json.dumps(server_metrics))
351
 
352
- # ---------- UI ----------
353
  examples=[["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", "Nestarka.wav"]]
354
 
355
  with gr.Blocks() as demo:
@@ -364,7 +365,7 @@ with gr.Blocks() as demo:
364
  with gr.Row():
365
  ui_preroll = gr.Slider(0.08, 0.40, value=DEF_CLIENT_PREROLL, step=0.01,
366
  label="PREROLL (сек.)", elem_id="preroll_slider", interactive=True)
367
- ui_lowwm = gr.Slider(0.02, 0.15, value=DEF_CLIENT_LOWWM, step=0.005,
368
  label="Ніжні ўзровень (сек.)", elem_id="lowwm_slider", interactive=True)
369
  with gr.Row():
370
  apply_btn = gr.Button("Прымяніць налады прайгравальніка")
@@ -383,15 +384,7 @@ with gr.Blocks() as demo:
383
  play_btn = gr.Button("▶️ Play (stream)")
384
  stop_btn = gr.Button("⏹ Stop (stream)")
385
  run_btn = gr.Button("Згенераваць")
386
- gr.Markdown(f"**Sample rate:** {sampling_rate} Hz")
387
-
388
- # невялікі CSS на ўсякі выпадак
389
- gr.HTML("""
390
- <style>
391
- #preroll_slider input[type="range"],
392
- #lowwm_slider input[type="range"] { pointer-events:auto !important; cursor: default !important; }
393
- </style>
394
- """)
395
 
396
  log_panel = gr.HTML(value='<div id="wa-log" style="font-family:system-ui;font-size:12px;white-space:pre-line">[лог пусты]</div>',
397
  label="Лагі плэера")
@@ -403,7 +396,7 @@ with gr.Blocks() as demo:
403
  final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
404
  play_final_btn = gr.Button("▶️ Play Final")
405
 
406
- # ---- AudioWorklet processor JS ----
407
  AUDIO_WORKLET_PROCESSOR = r"""
408
  class StreamBufferProcessor extends AudioWorkletProcessor {
409
  constructor() {
@@ -415,6 +408,7 @@ class StreamBufferProcessor extends AudioWorkletProcessor {
415
  this.thresholdSamples = 0;
416
  this.lowWatermarkSamples = 0;
417
  this.underrunSent = false;
 
418
  this.port.onmessage = (e) => {
419
  const msg = e.data || {};
420
  if (msg.type === 'push') {
@@ -426,12 +420,19 @@ class StreamBufferProcessor extends AudioWorkletProcessor {
426
  } else if (msg.type === 'set_thresholds') {
427
  this.thresholdSamples = msg.thresholdSamples|0;
428
  this.lowWatermarkSamples = msg.lowWatermarkSamples|0;
 
 
 
 
 
429
  }
430
  };
431
  }
 
432
  process(inputs, outputs, parameters) {
433
  const out = outputs[0][0];
434
  let i = 0;
 
435
  if (!this.started) {
436
  if (this.bufferedSamples >= this.thresholdSamples) {
437
  this.started = true;
@@ -441,6 +442,7 @@ class StreamBufferProcessor extends AudioWorkletProcessor {
441
  return true;
442
  }
443
  }
 
444
  while (i < out.length) {
445
  if (this.queue.length === 0) {
446
  if (!this.underrunSent) { this.underrunSent = true; this.port.postMessage({ type:'underrun' }); }
@@ -466,16 +468,15 @@ class StreamBufferProcessor extends AudioWorkletProcessor {
466
  registerProcessor('stream-buffer', StreamBufferProcessor);
467
  """
468
 
469
- # ---- INIT + player ----
470
  INIT_RESET_AND_PLAY_JS = f"""
471
  () => {{
472
- const sampleRate = {sampling_rate};
473
  const AC = window.AudioContext || window.webkitAudioContext;
474
  if (!AC) return;
475
 
476
- function getLocalFloat(key, fallback) {{
477
  try {{ const v = parseFloat(localStorage.getItem(key)); if (isFinite(v) && v > 0) return v; }} catch(e) {{}}
478
- return fallback;
479
  }}
480
 
481
  const DEFAULT_PREROLL = {DEF_CLIENT_PREROLL};
@@ -486,132 +487,155 @@ registerProcessor('stream-buffer', StreamBufferProcessor);
486
  let PREROLL_S = getLocalFloat("tts_preroll_s", DEFAULT_PREROLL);
487
  let LOW_WM_S = getLocalFloat("tts_lowwm_s", DEFAULT_LOWWM);
488
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
  function toSec(ms) {{ return (ms/1000); }}
 
490
  function logUpdate() {{
491
- const el = document.getElementById('wa-log');
492
- if (!el || !window.__wa || !window.__wa.meta) return;
493
- const m = window.__wa.meta;
494
  const lines = [];
495
  lines.push("Клік (Згенераваць): 0.000 s");
496
- if (m.t_first_push_ms) {{
497
- const click_to_first = (m.t_first_push_ms - m.t_click_ms)/1000;
498
- lines.push("Першы чанк прыйшоў: " + click_to_first.toFixed(3) + " s");
499
- if (m.t_first_audio_ms) {{
500
- lines.push("Пачатак прайгравання: " + ((m.t_first_audio_ms - m.t_click_ms)/1000).toFixed(3) + " s");
501
- lines.push("Затрымка (чанк→аўдыя): " + ((m.t_first_audio_ms - m.t_first_push_ms)/1000).toFixed(3) + " s");
502
  }}
503
  }}
504
- const s = (m.server || {{}});
505
- function p(v){{return (v==null)?"n/a":v.toFixed(3)+" s";}}
506
- lines.push(""); lines.push("— Серверныя метрыкі —");
507
- lines.push("Latents (умоўны голас): " + p(s.latents_s));
508
- lines.push("Падзел тэксту: " + p(s.text_split_s));
509
- lines.push("Ініт→1-ы чанк: " + p(s.gen_init_to_first_chunk_s));
510
- lines.push("Усё да 1-га чанка: " + p(s.until_first_chunk_total_s));
511
- lines.push("Іншая серверная апрац.: " + p(s.server_unaccounted_before_first_chunk_s));
512
- lines.push("Запіс WAV: " + p(s.file_write_s));
513
- if (m.click_to_first_chunk_s !== null && s.until_first_chunk_total_s !== null) {{
514
- const est = Math.max(0, m.click_to_first_chunk_s - s.until_first_chunk_total_s);
515
  lines.push(""); lines.push("Ацэнка чаргі ZeroGPU + сеткі: " + est.toFixed(3) + " s");
516
  }}
517
- lines.push(""); lines.push("Статус стриму: " + (window.__wa.playing ? "playing" : "stopped"));
 
518
  lines.push("PREROLL: " + PREROLL_S.toFixed(3) + " s | LOW WM: " + LOW_WM_S.toFixed(3) + " s");
 
 
 
 
519
  el.textContent = lines.join("\\n");
520
  }}
521
 
522
- const ctx = new AC({{ sampleRate }});
523
- const blob = new Blob([`{AUDIO_WORKLET_PROCESSOR}`], {{ type: 'application/javascript' }});
524
- const url = URL.createObjectURL(blob);
525
-
526
- const meta = {{ t_click_ms: performance.now(), t_first_push_ms: null, t_first_audio_ms: null, server: null, click_to_first_chunk_s: null }};
527
-
528
- let workletNode = null, gate = null, connected = false, queuedSamples = 0, underrunSeen = false;
529
-
530
- async function setup() {{
531
  await ctx.audioWorklet.addModule(url);
532
  workletNode = new AudioWorkletNode(ctx, 'stream-buffer');
533
  gate = ctx.createGain(); gate.gain.value = 1.0;
534
  workletNode.connect(gate);
535
 
536
- workletNode.port.postMessage({{
537
- type: 'set_thresholds',
538
- thresholdSamples: Math.floor(PREROLL_S * sampleRate),
539
- lowWatermarkSamples: Math.floor(LOW_WM_S * sampleRate),
540
- }});
541
-
542
  workletNode.port.onmessage = (e) => {{
543
  const msg = e.data || {{}};
544
- if (msg.type === 'first_audio') {{
 
 
 
 
 
 
 
 
545
  if (meta.t_first_audio_ms === null) {{ meta.t_first_audio_ms = performance.now(); logUpdate(); }}
546
- }} else if (msg.type === 'underrun') {{ underrunSeen = true; }}
 
 
547
  }};
548
 
 
 
 
 
 
 
 
549
  window.__wa = {{
550
  ctx, workletNode, gate,
551
  get playing() {{ return connected; }},
552
- get eos() {{ return false; }},
553
- set eos(v) {{}},
554
  meta,
555
  push: async (f32) => {{
556
  try {{ await ctx.resume(); }} catch(e) {{}}
557
- workletNode.port.postMessage({{ type: 'push', buffer: f32.buffer }}, [f32.buffer]);
558
- queuedSamples += f32.length;
559
- if (!meta.t_first_push_ms) {{ meta.t_first_push_ms = performance.now(); meta.click_to_first_chunk_s = (meta.t_first_push_ms - meta.t_click_ms)/1000; logUpdate(); }}
560
- if (!connected) {{ try {{ gate.connect(ctx.destination); connected = true; }} catch(e) {{}} logUpdate(); }}
 
 
561
  }},
562
- stop: () => {{ if (connected) {{ try {{ gate.disconnect(); }} catch(e) {{}} connected = false; logUpdate(); }} }},
563
  reset: () => {{
564
- try {{ if (underrunSeen) {{ const cur = Math.min({MAX_CLIENT_PREROLL}, PREROLL_S + {STEP_CLIENT_PREROLL}); localStorage.setItem("tts_preroll_s", String(cur)); }} }} catch(e) {{}}
565
- queuedSamples = 0; underrunSeen = False;
 
 
 
 
566
  if (workletNode) {{
567
- workletNode.port.postMessage({{ type: 'reset' }});
568
- workletNode.port.postMessage({{ type:'set_thresholds', thresholdSamples: Math.floor(PREROLL_S*sampleRate), lowWatermarkSamples: Math.floor(LOW_WM_S*sampleRate) }});
 
 
 
 
 
569
  }}
570
- if (connected) {{ try {{ gate.disconnect(); }} catch(e) {{}} connected = false; }}
571
  meta.t_first_push_ms = null; meta.t_first_audio_ms = null; meta.click_to_first_chunk_s = null; logUpdate();
572
  }},
573
  updateLog: logUpdate,
 
 
 
 
 
 
 
 
 
 
 
 
574
  }};
575
- }}
576
- setup();
577
  }}
578
  """.replace("{AUDIO_WORKLET_PROCESSOR}", AUDIO_WORKLET_PROCESSOR)
579
 
580
  STOP_JS = "() => { if (window.__wa) window.__wa.stop(); }"
581
  PLAY_JS = "() => { if (window.__wa) { try { window.__wa.ctx.resume(); } catch(e){}; if (!window.__wa.playing) { try { window.__wa.gate.connect(window.__wa.ctx.destination); } catch(e){} } window.__wa.updateLog && window.__wa.updateLog(); } }"
582
 
583
- # ---- Apply/Reset client settings (live) ----
584
  APPLY_JS = """
585
  () => {
586
- const pWrap = document.getElementById('preroll_slider');
587
- const lWrap = document.getElementById('lowwm_slider');
588
- const p = pWrap ? pWrap.querySelector('input[type="range"]') : null;
589
- const l = lWrap ? lWrap.querySelector('input[type="range"]') : null;
590
  const pr = p && p.value ? parseFloat(p.value) : 0.18;
591
  const lw = l && l.value ? parseFloat(l.value) : 0.06;
592
- try { localStorage.setItem("tts_preroll_s", String(pr)); localStorage.setItem("tts_lowwm_s", String(lw)); } catch(e) {}
593
- if (window.__wa && window.__wa.workletNode && window.__wa.ctx) {
594
- const sr = window.__wa.ctx.sampleRate || 24000;
595
- window.__wa.workletNode.port.postMessage({ type:'set_thresholds', thresholdSamples: Math.floor(pr*sr), lowWatermarkSamples: Math.floor(lw*sr) });
596
- window.__wa.updateLog && window.__wa.updateLog();
597
- }
598
  }
599
  """
600
- RESET_JS = "(() => { try { localStorage.removeItem('tts_preroll_s'); localStorage.removeItem('tts_lowwm_s'); } catch(e) {} })()"
601
 
602
- # ---- enable sliders on load (fix 'forbidden' cursor) ----
603
- ENABLE_SLIDERS_JS = """
604
- () => {
605
- ['preroll_slider','lowwm_slider'].forEach(id => {
606
- const wrap = document.getElementById(id);
607
- if (!wrap) return;
608
- const inp = wrap.querySelector('input[type="range"]');
609
- if (inp) { inp.disabled = false; inp.removeAttribute('readonly'); inp.style.pointerEvents='auto'; inp.style.cursor='default'; }
610
- });
611
- }
612
- """
613
 
614
- # ---- streaming JS ----
615
  PUSH_JS = """
616
  (b64) => {
617
  if (!window.__wa || !b64) return;
@@ -629,28 +653,23 @@ registerProcessor('stream-buffer', StreamBufferProcessor);
629
  try { if (js) { const obj = JSON.parse(js); window.__wa.meta.server = obj; window.__wa.updateLog && window.__wa.updateLog(); } } catch (e) {}
630
  }
631
  """
632
- PLAY_FINAL_JS = """
633
- () => { const host = document.getElementById('final-audio'); if (!host) return; const audio = host.querySelector('audio'); if (audio) { try { audio.play(); } catch(e) {} } }
634
- """
635
 
636
- # ---- wiring ----
637
  apply_btn.click(fn=None, inputs=[], outputs=[], js=APPLY_JS)
638
  reset_btn.click(fn=None, inputs=[], outputs=[], js=RESET_JS)
639
-
640
  play_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_JS)
641
  stop_btn.click(fn=None, inputs=[], outputs=[], js=STOP_JS)
642
 
643
  run_btn.click(fn=None, inputs=[], outputs=[], js=INIT_RESET_AND_PLAY_JS)
644
- run_btn.click(fn=text_to_speech, inputs=[inp_text, inp_voice, ui_minbuf, ui_firstch, ui_split, ui_tokens, ui_firstseg],
 
645
  outputs=[stream_pipe, final_file, final_audio, log_pipe])
646
 
647
  stream_pipe.change(fn=None, inputs=[stream_pipe], outputs=[], js=PUSH_JS)
648
  log_pipe.change(fn=None, inputs=[log_pipe], outputs=[], js=LOG_JS)
649
  play_final_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_FINAL_JS)
650
 
651
- # <<< enable sliders right after app loads >>>
652
- demo.load(fn=None, inputs=None, outputs=None, js=ENABLE_SLIDERS_JS)
653
-
654
  gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)
655
 
656
  if __name__ == "__main__":
 
3
  os.environ.setdefault("MKL_NUM_THREADS", "1")
4
  os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
5
 
6
+ import sys, re, time, json, base64, hashlib, tempfile, subprocess, inspect, pathlib
 
 
 
 
 
 
 
 
7
  from typing import Iterator, Iterable, Optional, Tuple, Any, List
8
  from dataclasses import dataclass
 
9
 
10
  import spaces
11
  import gradio as gr
 
14
  from huggingface_hub import hf_hub_download
15
  from scipy.io.wavfile import write
16
 
17
+ # ----------------- clone fork -----------------
18
  REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
19
  REPO_DIR = "coqui-ai-TTS"
20
  if not os.path.exists(REPO_DIR):
 
27
  from TTS.tts.models.xtts import Xtts
28
  from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
29
 
30
+ # ----------------- model files ----------------
31
  repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
32
+ model_dir = "./model"; os.makedirs(model_dir, exist_ok=True)
 
33
  for fname in ("model.pth", "config.json", "vocab.json", "voice.wav"):
34
  if not os.path.exists(os.path.join(model_dir, fname)):
35
  hf_hub_download(repo_id, filename=fname, local_dir=model_dir)
 
38
  vocab_file = os.path.join(model_dir, "vocab.json")
39
  default_voice_file = os.path.join(model_dir, "voice.wav")
40
 
41
+ # ----------------- load XTTS ------------------
42
  config = XttsConfig(); config.load_json(config_file)
43
  XTTS_MODEL = Xtts.init_from_config(config)
44
  XTTS_MODEL.load_checkpoint(config, checkpoint_path=checkpoint_file, vocab_path=vocab_file, use_deepspeed=False)
 
50
  torch.backends.cudnn.allow_tf32 = True
51
  torch.backends.cudnn.benchmark = True
52
  torch.set_float32_matmul_precision("high")
53
+
54
  XTTS_MODEL.to(device).eval()
55
  sampling_rate = int(XTTS_MODEL.config.audio["sample_rate"])
56
 
57
  tokenizer = VoiceBpeTokenizer(vocab_file=vocab_file)
58
  XTTS_MODEL.tokenizer = tokenizer
59
 
60
+ # ----------------- defaults -------------------
61
+ DEF_MIN_BUFFER_S = 0.06
62
+ DEF_FIRST_CHUNK_S = 0.03
63
+ DEF_TOKENS_PER_STEP = 1
64
+ DEF_ENABLE_TEXT_SPLIT = True
65
  DEF_FIRST_SEGMENT_LIMIT = 160
66
  FADE_S = 0.004
67
 
68
+ DEF_CLIENT_PREROLL = 0.18
69
+ DEF_CLIENT_LOWWM = 0.06
70
+ MAX_CLIENT_PREROLL = 0.40
71
+ STEP_CLIENT_PREROLL = 0.04
72
 
73
+ # ----------------- audio utils ----------------
74
+ def _seconds_to_samples(sec: float, sr: int) -> int: return max(1, int(sec * sr))
 
75
 
76
  def _to_np_audio(x) -> np.ndarray:
77
  if isinstance(x, dict) and "wav" in x: x = x["wav"]
78
  if isinstance(x, torch.Tensor):
79
  if x.dtype != torch.float32: x = x.float()
80
  return x.detach().cpu().contiguous().view(-1).numpy()
81
+ x = np.asarray(x);
82
+ if x.ndim > 1: x = x.reshape(-1)
83
  return x.astype(np.float32, copy=False) if x.dtype != np.float32 else x
84
 
85
  def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> np.ndarray:
 
89
  fade_n = min(_seconds_to_samples(fade_s, sr), a.size, b.size)
90
  if fade_n <= 1: return np.concatenate([a, b], axis=0)
91
  fade_out = np.linspace(1.0, 0.0, fade_n, dtype=np.float32); fade_in = 1.0 - fade_out
92
+ head = a[:-fade_n]; tail = a[-fade_n:] * fade_out + b[:fade_n] * fade_in; rest = b[fade_n:]
93
  return np.concatenate([head, tail, rest], axis=0)
94
 
95
  def _bpe_prefixes(text: str, lang: str, step_tokens: int):
 
99
  if n % step_tokens != 0: yield tokenizer.decode(ids, lang=lang); return
100
  except Exception: pass
101
  pseudo = re.findall(r"\S+|\s+", text); acc = ""
102
+ for i in range(0, len(pseudo), step_tokens): acc = "".join(pseudo[: i + step_tokens]); yield acc
 
103
  if acc.strip() != text.strip(): yield text
104
 
105
  def _native_stream(model: Xtts, text: str, language: str, gpt_cond_latent, speaker_embedding, **gen_kwargs):
 
154
  Xtts.sample_stream = NewTTSGenerationMixin.sample_stream
155
  init_stream_support()
156
 
157
+ # ----------------- latents cache ---------------
158
+ PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
159
+ PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
160
+
161
  @dataclass(frozen=True)
162
  class LatentsMeta:
163
  model_id: str; gpt_cond_len: int; max_ref_len: int; sound_norm_refs: bool; xtts_git: str | None = None
164
+
165
  LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
166
  GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}
167
 
 
171
  return hashlib.md5((base + "|" + meta_str).encode("utf-8")).hexdigest()
172
 
173
  def _latents_disk_path(key: str) -> pathlib.Path: return PERSIST_LATENTS_DIR / f"{key}.pt"
174
+
175
  def _save_latents_to_disk(key: str, gpt, spk): torch.save({"gpt_cond_latent": gpt.cpu(), "speaker_embedding": spk.cpu()}, _latents_disk_path(key))
176
+
177
  def _load_latents_from_disk(key: str):
178
+ p = _latents_disk_path(key)
179
  if not p.exists(): return None
180
+ obj = torch.load(p, map_location="cpu"); return obj["gpt_cond_latent"], obj["speaker_embedding"]
181
 
182
  def _compute_latents_cpu(path: str | None):
183
  with torch.inference_mode():
 
203
  try: _ = _latents_for(default_voice_file)
204
  except Exception as e: print(f"[warn] precompute default voice latents failed: {e}")
205
 
206
+ # ----------------- stream packing --------------
207
  def _merge_for_file(chunks: List[np.ndarray]) -> np.ndarray:
208
  if not chunks: return np.zeros((0,), dtype=np.float32)
209
  out = chunks[0]
 
223
  if x.dtype != np.float32: x = x.astype(np.float32, copy=False)
224
  return base64.b64encode(x.tobytes()).decode("ascii")
225
 
226
+ # ----------------- split text -----------------
227
  _SENT_END = re.compile(r"([\.!\?…]+[»\")\]]*\s+)")
228
  _WS = re.compile(r"\s+")
229
  def _fast_split(text: str, limit: int) -> List[str]:
 
267
  except Exception: pass
268
  return parts + (rest or [text_for_rest])
269
 
270
+ # ----------------- TTS endpoint ---------------
271
  @spaces.GPU(duration=60)
272
+ def text_to_speech(
273
+ belarusian_story, speaker_audio_file=None,
274
+ min_buffer_s: float = DEF_MIN_BUFFER_S,
275
+ first_chunk_s: float = DEF_FIRST_CHUNK_S,
276
+ enable_text_splitting: bool = DEF_ENABLE_TEXT_SPLIT,
277
+ tokens_per_step: int = DEF_TOKENS_PER_STEP,
278
+ first_segment_limit: int = DEF_FIRST_SEGMENT_LIMIT,
279
+ ):
280
  t0 = time.perf_counter()
281
  if not belarusian_story or str(belarusian_story).strip() == "":
282
  raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
283
+
284
  if not speaker_audio_file or (not isinstance(speaker_audio_file, str) and getattr(speaker_audio_file, "name", "") == ""):
285
  speaker_audio_file = default_voice_file
286
 
 
311
  yield ("", None, None, json.dumps(server_metrics))
312
 
313
  full_audio_chunks=[]; first_chunk_seen=False; t_gen0=time.perf_counter()
314
+
315
  for part in texts:
316
  gen = XTTS_MODEL.generate(
317
  text=part, do_stream=True, language=lang_short,
318
  gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
319
+ min_buffer_s=float(first_chunk_s),
320
+ tokens_per_step=int(tokens_per_step),
321
  stream_chunk_size_s=float(first_chunk_s),
322
+ temperature=0.1, length_penalty=1.0, repetition_penalty=10.0,
323
+ top_k=10, top_p=0.3,
324
  )
325
  for buf in _chunker(gen, sampling_rate, float(min_buffer_s)):
326
  if not first_chunk_seen:
 
350
 
351
  yield ("__STOP__", tmp.name, tmp.name, json.dumps(server_metrics))
352
 
353
+ # ----------------- UI ------------------------
354
  examples=[["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", "Nestarka.wav"]]
355
 
356
  with gr.Blocks() as demo:
 
365
  with gr.Row():
366
  ui_preroll = gr.Slider(0.08, 0.40, value=DEF_CLIENT_PREROLL, step=0.01,
367
  label="PREROLL (сек.)", elem_id="preroll_slider", interactive=True)
368
+ ui_lowwm = gr.Slider(0.02, 0.15, value=DEF_CLIENT_LOWWM, step=0.005,
369
  label="Ніжні ўзровень (сек.)", elem_id="lowwm_slider", interactive=True)
370
  with gr.Row():
371
  apply_btn = gr.Button("Прымяніць налады прайгравальніка")
 
384
  play_btn = gr.Button("▶️ Play (stream)")
385
  stop_btn = gr.Button("⏹ Stop (stream)")
386
  run_btn = gr.Button("Згенераваць")
387
+ gr.Markdown(f"**Model SR:** {sampling_rate} Hz")
 
 
 
 
 
 
 
 
388
 
389
  log_panel = gr.HTML(value='<div id="wa-log" style="font-family:system-ui;font-size:12px;white-space:pre-line">[лог пусты]</div>',
390
  label="Лагі плэера")
 
396
  final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
397
  play_final_btn = gr.Button("▶️ Play Final")
398
 
399
+ # ---------- AudioWorklet processor (with handshake) ----------
400
  AUDIO_WORKLET_PROCESSOR = r"""
401
  class StreamBufferProcessor extends AudioWorkletProcessor {
402
  constructor() {
 
408
  this.thresholdSamples = 0;
409
  this.lowWatermarkSamples = 0;
410
  this.underrunSent = false;
411
+
412
  this.port.onmessage = (e) => {
413
  const msg = e.data || {};
414
  if (msg.type === 'push') {
 
420
  } else if (msg.type === 'set_thresholds') {
421
  this.thresholdSamples = msg.thresholdSamples|0;
422
  this.lowWatermarkSamples = msg.lowWatermarkSamples|0;
423
+ // handshake back to main
424
+ this.port.postMessage({ type: 'thresholds_ready',
425
+ thresholdSamples: this.thresholdSamples,
426
+ lowWatermarkSamples: this.lowWatermarkSamples,
427
+ ctxSR: sampleRate });
428
  }
429
  };
430
  }
431
+
432
  process(inputs, outputs, parameters) {
433
  const out = outputs[0][0];
434
  let i = 0;
435
+
436
  if (!this.started) {
437
  if (this.bufferedSamples >= this.thresholdSamples) {
438
  this.started = true;
 
442
  return true;
443
  }
444
  }
445
+
446
  while (i < out.length) {
447
  if (this.queue.length === 0) {
448
  if (!this.underrunSent) { this.underrunSent = true; this.port.postMessage({ type:'underrun' }); }
 
468
  registerProcessor('stream-buffer', StreamBufferProcessor);
469
  """
470
 
471
+ # ---------- INIT + player (wait-for-thresholds) ----------
472
  INIT_RESET_AND_PLAY_JS = f"""
473
  () => {{
 
474
  const AC = window.AudioContext || window.webkitAudioContext;
475
  if (!AC) return;
476
 
477
+ function getLocalFloat(key, defVal) {{
478
  try {{ const v = parseFloat(localStorage.getItem(key)); if (isFinite(v) && v > 0) return v; }} catch(e) {{}}
479
+ return defVal;
480
  }}
481
 
482
  const DEFAULT_PREROLL = {DEF_CLIENT_PREROLL};
 
487
  let PREROLL_S = getLocalFloat("tts_preroll_s", DEFAULT_PREROLL);
488
  let LOW_WM_S = getLocalFloat("tts_lowwm_s", DEFAULT_LOWWM);
489
 
490
+ const blob = new Blob([`{AUDIO_WORKLET_PROCESSOR}`], {{ type: 'application/javascript' }});
491
+ const url = URL.createObjectURL(blob);
492
+
493
+ const ctx = new AC({{ sampleRate: {sampling_rate} }});
494
+ const meta = {{
495
+ t_click_ms: performance.now(), t_first_push_ms: null, t_first_audio_ms: null,
496
+ server: null, click_to_first_chunk_s: null, ctx_sr: ctx.sampleRate,
497
+ thresholds: null
498
+ }};
499
+
500
+ let workletNode = null, gate = null, connected = false;
501
+ let ready = false; // WAIT for thresholds_ready
502
+ const pending = []; // queue chunks before ready
503
+ let underrunSeen = false;
504
+
505
  function toSec(ms) {{ return (ms/1000); }}
506
+ function p3(x) {{ return (x==null)?'n/a':x.toFixed(3)+' s'; }}
507
  function logUpdate() {{
508
+ const el = document.getElementById('wa-log'); if (!el) return;
509
+ const s = meta.server || {{}};
 
510
  const lines = [];
511
  lines.push("Клік (Згенераваць): 0.000 s");
512
+ if (meta.t_first_push_ms) {{
513
+ lines.push("Першы чанк прыйшоў: " + (toSec(meta.t_first_push_ms - meta.t_click_ms)).toFixed(3) + " s");
514
+ if (meta.t_first_audio_ms) {{
515
+ lines.push("Пачатак прайгравання: " + (toSec(meta.t_first_audio_ms - meta.t_click_ms)).toFixed(3) + " s");
516
+ lines.push("Затрымка анк→аўдыя): " + (toSec(meta.t_first_audio_ms - meta.t_first_push_ms)).toFixed(3) + " s");
 
517
  }}
518
  }}
519
+ lines.push("");
520
+ lines.push(" Серверныя метрыкі —");
521
+ lines.push("Latents (умоўны голас): " + p3(s.latents_s));
522
+ lines.push("Падзел тэксту: " + p3(s.text_split_s));
523
+ lines.push("Ініт→1-ы чанк: " + p3(s.gen_init_to_first_chunk_s));
524
+ lines.push("Усё да 1-га чанка: " + p3(s.until_first_chunk_total_s));
525
+ lines.push("Іншая серверная апрац.: " + p3(s.server_unaccounted_before_first_chunk_s));
526
+ lines.push("Запіс WAV: " + p3(s.file_write_s));
527
+ if (meta.click_to_first_chunk_s !== null && s.until_first_chunk_total_s !== null) {{
528
+ const est = Math.max(0, meta.click_to_first_chunk_s - s.until_first_chunk_total_s);
 
529
  lines.push(""); lines.push("Ацэнка чаргі ZeroGPU + сеткі: " + est.toFixed(3) + " s");
530
  }}
531
+ lines.push("");
532
+ lines.push("Статус стриму: " + (connected ? "playing" : "stopped"));
533
  lines.push("PREROLL: " + PREROLL_S.toFixed(3) + " s | LOW WM: " + LOW_WM_S.toFixed(3) + " s");
534
+ lines.push("ctx.sampleRate: " + meta.ctx_sr + " Hz");
535
+ if (meta.thresholds) {{
536
+ lines.push("thresholdSamples: " + meta.thresholds.thresholdSamples + " | lowWM: " + meta.thresholds.lowWatermarkSamples);
537
+ }}
538
  el.textContent = lines.join("\\n");
539
  }}
540
 
541
+ (async () => {{
 
 
 
 
 
 
 
 
542
  await ctx.audioWorklet.addModule(url);
543
  workletNode = new AudioWorkletNode(ctx, 'stream-buffer');
544
  gate = ctx.createGain(); gate.gain.value = 1.0;
545
  workletNode.connect(gate);
546
 
 
 
 
 
 
 
547
  workletNode.port.onmessage = (e) => {{
548
  const msg = e.data || {{}};
549
+ if (msg.type === 'thresholds_ready') {{
550
+ ready = true; meta.thresholds = {{ thresholdSamples: msg.thresholdSamples, lowWatermarkSamples: msg.lowWatermarkSamples }};
551
+ // flush pending
552
+ for (const f32 of pending) {{
553
+ workletNode.port.postMessage({{ type:'push', buffer:f32.buffer }}, [f32.buffer]);
554
+ }}
555
+ pending.length = 0;
556
+ logUpdate();
557
+ }} else if (msg.type === 'first_audio') {{
558
  if (meta.t_first_audio_ms === null) {{ meta.t_first_audio_ms = performance.now(); logUpdate(); }}
559
+ }} else if (msg.type === 'underrun') {{
560
+ underrunSeen = true;
561
+ }}
562
  }};
563
 
564
+ // send thresholds using **ctx.sampleRate**
565
+ workletNode.port.postMessage({{
566
+ type: 'set_thresholds',
567
+ thresholdSamples: Math.floor(PREROLL_S * ctx.sampleRate),
568
+ lowWatermarkSamples: Math.floor(LOW_WM_S * ctx.sampleRate),
569
+ }});
570
+
571
  window.__wa = {{
572
  ctx, workletNode, gate,
573
  get playing() {{ return connected; }},
 
 
574
  meta,
575
  push: async (f32) => {{
576
  try {{ await ctx.resume(); }} catch(e) {{}}
577
+ if (!meta.t_first_push_ms) {{ meta.t_first_push_ms = performance.now(); meta.click_to_first_chunk_s = (meta.t_first_push_ms - meta.t_click_ms)/1000; }}
578
+ // if thresholds not ready yet — buffer locally
579
+ if (!ready) {{ pending.push(f32); }}
580
+ else {{ workletNode.port.postMessage({{ type:'push', buffer:f32.buffer }}, [f32.buffer]); }}
581
+ if (!connected) {{ try {{ gate.connect(ctx.destination); connected = true; }} catch(e) {{}} }}
582
+ logUpdate();
583
  }},
584
+ stop: () => {{ if (connected) {{ try {{ gate.disconnect(); }} catch(e) {{}} connected=false; logUpdate(); }} }},
585
  reset: () => {{
586
+ try {{
587
+ if (underrunSeen) {{
588
+ const cur = Math.min({MAX_CLIENT_PREROLL}, PREROLL_S + {STEP_CLIENT_PREROLL});
589
+ localStorage.setItem("tts_preroll_s", String(cur));
590
+ }}
591
+ }} catch(e) {{}}
592
  if (workletNode) {{
593
+ workletNode.port.postMessage({{ type:'reset' }});
594
+ ready = false; pending.length = 0;
595
+ workletNode.port.postMessage({{
596
+ type:'set_thresholds',
597
+ thresholdSamples: Math.floor(PREROLL_S * ctx.sampleRate),
598
+ lowWatermarkSamples: Math.floor(LOW_WM_S * ctx.sampleRate),
599
+ }});
600
  }}
601
+ if (connected) {{ try {{ gate.disconnect(); }} catch(e) {{}} connected=false; }}
602
  meta.t_first_push_ms = null; meta.t_first_audio_ms = null; meta.click_to_first_chunk_s = null; logUpdate();
603
  }},
604
  updateLog: logUpdate,
605
+ applyClient: (pr, lw) => {{
606
+ PREROLL_S = pr; LOW_WM_S = lw;
607
+ try {{ localStorage.setItem("tts_preroll_s", String(pr)); localStorage.setItem("tts_lowwm_s", String(lw)); }} catch(e) {{}}
608
+ if (workletNode) {{
609
+ workletNode.port.postMessage({{
610
+ type:'set_thresholds',
611
+ thresholdSamples: Math.floor(PREROLL_S * ctx.sampleRate),
612
+ lowWatermarkSamples: Math.floor(LOW_WM_S * ctx.sampleRate),
613
+ }});
614
+ }}
615
+ logUpdate();
616
+ }}
617
  }};
618
+ logUpdate();
619
+ } )();
620
  }}
621
  """.replace("{AUDIO_WORKLET_PROCESSOR}", AUDIO_WORKLET_PROCESSOR)
622
 
623
  # Browser-side handler for the Stop button: calls window.__wa.stop() when the
  # player object exists, otherwise does nothing.
  STOP_JS = "() => { if (window.__wa) window.__wa.stop(); }"
624
  # Browser-side handler for the Play button: resumes the AudioContext, connects the
  # gate node to the destination when the player reports it is not playing, then
  # refreshes the log.
  # NOTE(review): `playing` is a getter over the player's private `connected` flag,
  # which only push/stop/reset assign. This handler connects the gate directly, so the
  # flag appears to stay false afterwards — confirm that stop() still disconnects and
  # that the status line is correct after Play.
  PLAY_JS = "() => { if (window.__wa) { try { window.__wa.ctx.resume(); } catch(e){}; if (!window.__wa.playing) { try { window.__wa.gate.connect(window.__wa.ctx.destination); } catch(e){} } window.__wa.updateLog && window.__wa.updateLog(); } }"
625
 
 
626
  APPLY_JS = """
627
  () => {
628
+ const p = document.getElementById('preroll_slider')?.querySelector('input[type="range"]');
629
+ const l = document.getElementById('lowwm_slider')?.querySelector('input[type="range"]');
 
 
630
  const pr = p && p.value ? parseFloat(p.value) : 0.18;
631
  const lw = l && l.value ? parseFloat(l.value) : 0.06;
632
+ if (window.__wa && window.__wa.applyClient) { window.__wa.applyClient(pr, lw); }
 
 
 
 
 
633
  }
634
  """
 
635
 
636
# Browser-side handler for the reset button: removes the persisted client preroll and
# low-watermark overrides from localStorage (storage errors are ignored).
# Written as a function expression, like the other handlers, rather than an
# already-invoked IIFE, so that the value passed through `js=` is callable.
RESET_JS = "() => { try { localStorage.removeItem('tts_preroll_s'); localStorage.removeItem('tts_lowwm_s'); } catch(e) {} }"
 
 
 
 
 
 
 
 
 
 
637
 
638
+ # -------- streaming + logs --------
639
  PUSH_JS = """
640
  (b64) => {
641
  if (!window.__wa || !b64) return;
 
653
  try { if (js) { const obj = JSON.parse(js); window.__wa.meta.server = obj; window.__wa.updateLog && window.__wa.updateLog(); } } catch (e) {}
654
  }
655
  """
656
# Browser-side handler for the "play final" button: finds the <audio> element under the
# element with id "final-audio" and starts playback.
# Written as a function expression (not an already-invoked IIFE) so that the value
# passed through `js=` is callable. play() returns a promise in current browsers, so
# its rejection is swallowed explicitly; try/catch alone only covers synchronous errors.
PLAY_FINAL_JS = "() => { const el = document.getElementById('final-audio'); const a = el?.querySelector('audio'); if (a) { try { const p = a.play(); if (p && p.catch) p.catch(() => {}); } catch(e) {} } }"
 
 
657
 
658
+ # wiring
659
  apply_btn.click(fn=None, inputs=[], outputs=[], js=APPLY_JS)
660
  reset_btn.click(fn=None, inputs=[], outputs=[], js=RESET_JS)
 
661
  play_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_JS)
662
  stop_btn.click(fn=None, inputs=[], outputs=[], js=STOP_JS)
663
 
664
  run_btn.click(fn=None, inputs=[], outputs=[], js=INIT_RESET_AND_PLAY_JS)
665
+ run_btn.click(fn=text_to_speech,
666
+ inputs=[inp_text, inp_voice, ui_minbuf, ui_firstch, ui_split, ui_tokens, ui_firstseg],
667
  outputs=[stream_pipe, final_file, final_audio, log_pipe])
668
 
669
  stream_pipe.change(fn=None, inputs=[stream_pipe], outputs=[], js=PUSH_JS)
670
  log_pipe.change(fn=None, inputs=[log_pipe], outputs=[], js=LOG_JS)
671
  play_final_btn.click(fn=None, inputs=[], outputs=[], js=PLAY_FINAL_JS)
672
 
 
 
 
673
  gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)
674
 
675
  if __name__ == "__main__":