archivartaunik committed on
Commit
3e4a584
·
verified ·
1 Parent(s): 8c8bf98

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -47
app.py CHANGED
@@ -71,7 +71,6 @@ XTTS_MODEL.load_checkpoint(
71
 
72
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
73
 
74
- # налады
75
  torch.set_num_threads(1)
76
  if device.startswith("cuda"):
77
  torch.backends.cuda.matmul.allow_tf32 = True
@@ -87,7 +86,7 @@ tokenizer = VoiceBpeTokenizer(vocab_file=vocab_file)
87
  XTTS_MODEL.tokenizer = tokenizer
88
 
89
  # =========================================================
90
- # 3) Канстанты стриму (меншыя затрымкі)
91
  # =========================================================
92
  MIN_BUFFER_S = 0.020
93
  RUNTIME_FIRST_CHUNK_S = 0.010
@@ -97,7 +96,7 @@ ENABLE_TEXT_SPLITTING = True
97
  FIRST_SEGMENT_LIMIT = 120
98
 
99
  # =========================================================
100
- # 4) Утыліты аўдыя (стрим у INT16 base64)
101
  # =========================================================
102
  def _seconds_to_samples(sec: float, sr: int) -> int:
103
  return max(1, int(sec * sr))
@@ -165,7 +164,7 @@ def _pcm_f32_to_int16_b64(x: np.ndarray) -> str:
165
  if x.dtype != np.float32:
166
  x = x.astype(np.float32, copy=False)
167
  y = np.clip(x, -1.0, 0.9999695)
168
- i16 = (y * 32767.0).astype("<i2", copy=False) # little-endian int16
169
  return base64.b64encode(i16.tobytes()).decode("ascii")
170
 
171
  # =========================================================
@@ -394,7 +393,7 @@ except Exception as e:
394
  print(f"[warn] warm-up failed: {e}")
395
 
396
  # =========================================================
397
- # 7) Падзел тэксту (хуткі + fallback)
398
  # =========================================================
399
  _SENT_END = re.compile(r'([\.!\?…]+[»")\]]*\s+)')
400
  _WS = re.compile(r"\s+")
@@ -469,16 +468,13 @@ def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[s
469
  return parts + (rest or [text_for_rest])
470
 
471
  # =========================================================
472
- # 8) TTS стрим + лагі
473
- # — замянілі механіку: Python выдае JSON-пакеты ў адзін схаваны Textbox,
474
- # фронт (JS у gr.HTML) опускае іх праз polling у AudioWorklet
475
  # =========================================================
476
  @spaces.GPU(duration=60)
477
  def text_to_speech(belarusian_story, speaker_audio_file=None):
478
  t0 = time.perf_counter()
479
 
480
  if not belarusian_story or str(belarusian_story).strip() == "":
481
- # пусты пакет, каб фронт не зламаўся
482
  yield (json.dumps({"seq": 0, "b64": "", "log": None, "stop": False}), None, None)
483
  raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
484
 
@@ -491,7 +487,7 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
491
  lang_short = "be"
492
  chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
493
 
494
- # Latents (кэш CPU/GPU)
495
  t_lat0 = time.perf_counter()
496
  to_dev = device if device.startswith("cuda") else None
497
  gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
@@ -514,7 +510,6 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
514
  }
515
 
516
  seq = 0
517
- # пачатковы пакет — паведамляем фронту, што ўсё гатова
518
  yield (json.dumps({"seq": seq, "b64": "", "log": server_metrics, "stop": False}), None, None)
519
 
520
  full_audio_chunks: List[np.ndarray] = []
@@ -568,18 +563,16 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
568
  final_file_path = tmp.name
569
  final_audio_path = tmp.name
570
  except Exception as e:
571
- # паведамляем пра памылку праз лаг
572
  server_metrics["_file_error"] = str(e)
573
  finally:
574
  t_w1 = time.perf_counter()
575
  server_metrics["file_write_s"] = (t_w1 - t_w0)
576
 
577
- # стоп-пакет
578
  seq += 1
579
  yield (json.dumps({"seq": seq, "b64": "__STOP__", "log": server_metrics, "stop": True}), final_file_path, final_audio_path)
580
 
581
  # =========================================================
582
- # 9) UI + AudioWorklet з polling (без SharedArrayBuffer, без gr.Audio streaming)
583
  # =========================================================
584
  examples = [
585
  [
@@ -606,12 +599,19 @@ with gr.Blocks() as demo:
606
  label="Лагі плэера",
607
  )
608
 
609
- # Схаваныя каналы: stream JSON + выніковы файл + аўдыя
610
- stream_pipe = gr.Textbox(value="", visible=False, label="stream_pipe", elem_id="stream-pipe")
611
  final_file = gr.File(label="Згенераваны WAV (спампаваць)")
612
  final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
613
  play_final_btn = gr.Button("▶️ Play Final")
614
 
 
 
 
 
 
 
 
615
  # --------- Frontend JS (пастаянны polling + AudioWorklet) ----------
616
  FRONT_HTML = f"""
617
  <script>
@@ -623,7 +623,6 @@ with gr.Blocks() as demo:
623
  function toSec(ms) {{ return (ms/1000); }}
624
  function fmtS(x) {{ return (x==null) ? 'n/a' : (x.toFixed ? x.toFixed(3) : x) + ' s'; }}
625
 
626
- // Лагі
627
  function updateLog() {{
628
  const el = document.getElementById('wa-log');
629
  if (!el || !window.__wa || !window.__wa.meta) return;
@@ -664,13 +663,22 @@ with gr.Blocks() as demo:
664
  }}
665
 
666
  async function ensureWorklet(ctx) {{
667
- // AudioWorklet код у выглядзе радка
668
  const code = `
669
  class PushPlayerProcessor extends AudioWorkletProcessor {{
670
  constructor() {{
671
  super();
672
  this.queue = [];
673
  this.readIndex = 0;
 
 
 
 
 
 
 
 
 
 
674
  }}
675
  process(inputs, outputs) {{
676
  const out = outputs[0][0];
@@ -694,7 +702,6 @@ registerProcessor('push-player', PushPlayerProcessor);
694
  await ctx.audioWorklet.addModule(url);
695
  }}
696
 
697
- // Ініцыялізацыя плэера
698
  async function ensurePlayer() {{
699
  if (window.__wa) return window.__wa;
700
  if (!AC) return null;
@@ -705,7 +712,7 @@ registerProcessor('push-player', PushPlayerProcessor);
705
  node.connect(ctx.destination);
706
 
707
  let playing = true;
708
- let seq_seen = -1;
709
  const meta = {{
710
  t_click_ms: null,
711
  t_first_push_ms: null,
@@ -716,20 +723,19 @@ registerProcessor('push-player', PushPlayerProcessor);
716
  const api = {{
717
  ctx, node,
718
  get playing() {{ return playing; }},
719
- start: async () => {{ try {{ await ctx.resume(); }} catch(e) {{}} playing = true; updateLog(); }},
720
- stop: () => {{ playing = false; updateLog(); }},
721
  reset: () => {{
722
- seq_seen = -1;
723
  meta.t_first_push_ms = null;
724
  meta.t_first_audio_ms = null;
725
  updateLog();
726
  }},
727
- push: (f32) => {{
728
  try {{ node.port.postMessage({{ type: 'push', buffer: f32.buffer }}, [f32.buffer]); }} catch (e) {{}}
729
  if (!meta.t_first_push_ms) {{
730
  meta.t_first_push_ms = performance.now();
731
- // адразу адзначым пачатак гуку
732
- if (!meta.t_first_audio_ms && playing) meta.t_first_audio_ms = meta.t_first_push_ms + 10;
733
  updateLog();
734
  }}
735
  if (!playing) api.start();
@@ -741,26 +747,35 @@ registerProcessor('push-player', PushPlayerProcessor);
741
  return api;
742
  }}
743
 
744
- // Polling чытанне з скрытага Textbox
745
- async function pollLoop() {{
746
- const api = await ensurePlayer();
747
- if (!api) return;
748
- const box = document.querySelector('#stream-pipe textarea, #stream-pipe input') || document.getElementById('stream-pipe');
749
- if (!box) return;
 
 
 
 
750
 
751
- function parsePacket(txt) {{
752
- try {{ return JSON.parse(txt); }} catch(e) {{ return null; }}
753
- }}
 
 
754
 
755
- // асноўны цыкл
756
- setInterval(() => {{
757
- const txt = (box.value !== undefined) ? box.value : (box.innerText || box.textContent || '');
758
- const pkt = parsePacket(txt);
759
- if (!pkt || typeof pkt.seq !== 'number') return;
760
- if (!api.meta.t_click_ms) api.meta.t_click_ms = performance.now();
761
 
762
- if (pkt.seq <= (api._seq_seen || -1)) return;
763
- api._seq_seen = pkt.seq;
 
 
 
 
 
 
764
 
765
  if (pkt.log) {{
766
  api.meta.server = pkt.log;
@@ -774,7 +789,6 @@ registerProcessor('push-player', PushPlayerProcessor);
774
  }}
775
 
776
  if (typeof pkt.b64 === 'string' && pkt.b64.length > 0) {{
777
- // распакаваць INT16LE->F32
778
  const bin = atob(pkt.b64);
779
  const len = bin.length;
780
  const buf = new ArrayBuffer(len);
@@ -791,7 +805,6 @@ registerProcessor('push-player', PushPlayerProcessor);
791
  }}, POLL_MS);
792
  }}
793
 
794
- // кнопкі кіравання
795
  window.__wa_start_click = async function() {{
796
  const api = await ensurePlayer();
797
  api.meta.t_click_ms = performance.now();
@@ -812,8 +825,8 @@ registerProcessor('push-player', PushPlayerProcessor);
812
  if (audio) {{ try {{ audio.play(); }} catch(e) {{}} }}
813
  }};
814
 
815
- // запуск polling адразу пры загрузцы блока
816
- pollLoop();
817
  }})();
818
  </script>
819
  """
 
71
 
72
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
73
 
 
74
  torch.set_num_threads(1)
75
  if device.startswith("cuda"):
76
  torch.backends.cuda.matmul.allow_tf32 = True
 
86
  XTTS_MODEL.tokenizer = tokenizer
87
 
88
  # =========================================================
89
+ # 3) Канстанты стриму
90
  # =========================================================
91
  MIN_BUFFER_S = 0.020
92
  RUNTIME_FIRST_CHUNK_S = 0.010
 
96
  FIRST_SEGMENT_LIMIT = 120
97
 
98
  # =========================================================
99
+ # 4) Аўдыя-ўтыліты
100
  # =========================================================
101
  def _seconds_to_samples(sec: float, sr: int) -> int:
102
  return max(1, int(sec * sr))
 
164
  if x.dtype != np.float32:
165
  x = x.astype(np.float32, copy=False)
166
  y = np.clip(x, -1.0, 0.9999695)
167
+ i16 = (y * 32767.0).astype("<i2", copy=False)
168
  return base64.b64encode(i16.tobytes()).decode("ascii")
169
 
170
  # =========================================================
 
393
  print(f"[warn] warm-up failed: {e}")
394
 
395
  # =========================================================
396
+ # 7) Падзел тэксту
397
  # =========================================================
398
  _SENT_END = re.compile(r'([\.!\?…]+[»")\]]*\s+)')
399
  _WS = re.compile(r"\s+")
 
468
  return parts + (rest or [text_for_rest])
469
 
470
  # =========================================================
471
+ # 8) TTS стрим (выдае JSON-пакеты ў Textbox)
 
 
472
  # =========================================================
473
  @spaces.GPU(duration=60)
474
  def text_to_speech(belarusian_story, speaker_audio_file=None):
475
  t0 = time.perf_counter()
476
 
477
  if not belarusian_story or str(belarusian_story).strip() == "":
 
478
  yield (json.dumps({"seq": 0, "b64": "", "log": None, "stop": False}), None, None)
479
  raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
480
 
 
487
  lang_short = "be"
488
  chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
489
 
490
+ # Latents
491
  t_lat0 = time.perf_counter()
492
  to_dev = device if device.startswith("cuda") else None
493
  gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
 
510
  }
511
 
512
  seq = 0
 
513
  yield (json.dumps({"seq": seq, "b64": "", "log": server_metrics, "stop": False}), None, None)
514
 
515
  full_audio_chunks: List[np.ndarray] = []
 
563
  final_file_path = tmp.name
564
  final_audio_path = tmp.name
565
  except Exception as e:
 
566
  server_metrics["_file_error"] = str(e)
567
  finally:
568
  t_w1 = time.perf_counter()
569
  server_metrics["file_write_s"] = (t_w1 - t_w0)
570
 
 
571
  seq += 1
572
  yield (json.dumps({"seq": seq, "b64": "__STOP__", "log": server_metrics, "stop": True}), final_file_path, final_audio_path)
573
 
574
  # =========================================================
575
+ # 9) UI + AudioWorklet з polling (без gr.Audio streaming)
576
  # =========================================================
577
  examples = [
578
  [
 
599
  label="Лагі плэера",
600
  )
601
 
602
+ # ВАЖНА: робім stream_pipe бачным у DOM (visible=True), але хаваем праз CSS
603
+ stream_pipe = gr.Textbox(value="", visible=True, label="stream_pipe", elem_id="stream-pipe")
604
  final_file = gr.File(label="Згенераваны WAV (спампаваць)")
605
  final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
606
  play_final_btn = gr.Button("▶️ Play Final")
607
 
608
+ # CSS — схаваць stream_pipe і актыўнасці
609
+ gr.HTML("""
610
+ <style>
611
+ #stream-pipe { position:absolute; left:-99999px; width:1px; height:1px; opacity:0; pointer-events:none; }
612
+ </style>
613
+ """)
614
+
615
  # --------- Frontend JS (пастаянны polling + AudioWorklet) ----------
616
  FRONT_HTML = f"""
617
  <script>
 
623
  function toSec(ms) {{ return (ms/1000); }}
624
  function fmtS(x) {{ return (x==null) ? 'n/a' : (x.toFixed ? x.toFixed(3) : x) + ' s'; }}
625
 
 
626
  function updateLog() {{
627
  const el = document.getElementById('wa-log');
628
  if (!el || !window.__wa || !window.__wa.meta) return;
 
663
  }}
664
 
665
  async function ensureWorklet(ctx) {{
 
666
  const code = `
667
  class PushPlayerProcessor extends AudioWorkletProcessor {{
668
  constructor() {{
669
  super();
670
  this.queue = [];
671
  this.readIndex = 0;
672
+ this.port.onmessage = (e) => {{
673
+ const d = e.data || {{}};
674
+ if (d.type === 'push' && d.buffer) {{
675
+ const f32 = new Float32Array(d.buffer);
676
+ this.queue.push(f32);
677
+ }} else if (d.type === 'reset') {{
678
+ this.queue.length = 0;
679
+ this.readIndex = 0;
680
+ }}
681
+ }};
682
  }}
683
  process(inputs, outputs) {{
684
  const out = outputs[0][0];
 
702
  await ctx.audioWorklet.addModule(url);
703
  }}
704
 
 
705
  async function ensurePlayer() {{
706
  if (window.__wa) return window.__wa;
707
  if (!AC) return null;
 
712
  node.connect(ctx.destination);
713
 
714
  let playing = true;
715
+
716
  const meta = {{
717
  t_click_ms: null,
718
  t_first_push_ms: null,
 
723
  const api = {{
724
  ctx, node,
725
  get playing() {{ return playing; }},
726
+ start: async () => {{ try {{ await ctx.resume(); }} catch(e) {{}} playing = true; updateLog(); }},
727
+ stop: () => {{ try {{ ctx.suspend(); }} catch(e){{}} playing = false; updateLog(); }},
728
  reset: () => {{
729
+ try {{ node.port.postMessage({{ type: 'reset' }}); }} catch(e) {{}}
730
  meta.t_first_push_ms = null;
731
  meta.t_first_audio_ms = null;
732
  updateLog();
733
  }},
734
+ push: (f32) => {{
735
  try {{ node.port.postMessage({{ type: 'push', buffer: f32.buffer }}, [f32.buffer]); }} catch (e) {{}}
736
  if (!meta.t_first_push_ms) {{
737
  meta.t_first_push_ms = performance.now();
738
+ if (!meta.t_first_audio_ms) meta.t_first_audio_ms = meta.t_first_push_ms + 10;
 
739
  updateLog();
740
  }}
741
  if (!playing) api.start();
 
747
  return api;
748
  }}
749
 
750
+ function getPipeEl() {{
751
+ // Textbox у Gradio мае textarea унутры div#stream-pipe
752
+ const root = document.getElementById('stream-pipe');
753
+ if (!root) return null;
754
+ const ta = root.querySelector('textarea');
755
+ if (ta) return ta;
756
+ const inp = root.querySelector('input');
757
+ if (inp) return inp;
758
+ return root;
759
+ }}
760
 
761
+ function startPolling() {{
762
+ const pipe = getPipeEl();
763
+ if (!pipe) return;
764
+ const apiPromise = ensurePlayer();
765
+ let lastSeq = -1;
766
 
767
+ setInterval(async () => {{
768
+ const api = await apiPromise;
769
+ if (!api) return;
 
 
 
770
 
771
+ const txt = (pipe.value !== undefined) ? pipe.value : (pipe.innerText || pipe.textContent || '');
772
+ if (!txt) return;
773
+
774
+ let pkt = null;
775
+ try {{ pkt = JSON.parse(txt); }} catch(e) {{ return; }}
776
+ if (!pkt || typeof pkt.seq !== 'number') return;
777
+ if (pkt.seq <= lastSeq) return;
778
+ lastSeq = pkt.seq;
779
 
780
  if (pkt.log) {{
781
  api.meta.server = pkt.log;
 
789
  }}
790
 
791
  if (typeof pkt.b64 === 'string' && pkt.b64.length > 0) {{
 
792
  const bin = atob(pkt.b64);
793
  const len = bin.length;
794
  const buf = new ArrayBuffer(len);
 
805
  }}, POLL_MS);
806
  }}
807
 
 
808
  window.__wa_start_click = async function() {{
809
  const api = await ensurePlayer();
810
  api.meta.t_click_ms = performance.now();
 
825
  if (audio) {{ try {{ audio.play(); }} catch(e) {{}} }}
826
  }};
827
 
828
+ // Стартуем polling пасля загрузкі
829
+ startPolling();
830
  }})();
831
  </script>
832
  """