archivartaunik committed on
Commit
04019fb
·
verified ·
1 Parent(s): 34bd026

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -232
app.py CHANGED
@@ -11,7 +11,6 @@ import base64
11
  import hashlib
12
  import tempfile
13
  import subprocess
14
- import inspect
15
  from typing import Iterator, Iterable, Optional, Tuple, Any, List
16
  from dataclasses import dataclass
17
  import pathlib
@@ -28,13 +27,9 @@ from scipy.io.wavfile import write
28
  # ---------------------------------------------------------
29
  REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
30
  REPO_DIR = "coqui-ai-TTS"
31
-
32
- if not os.path.exists(REPO_DIR):
33
- subprocess.run(["git", "clone", REPO_URL, REPO_DIR], check=True)
34
-
35
  repo_root = os.path.abspath(REPO_DIR)
36
- if repo_root not in sys.path:
37
- sys.path.insert(0, repo_root)
38
 
39
  from TTS.tts.configs.xtts_config import XttsConfig
40
  from TTS.tts.models.xtts import Xtts
@@ -44,81 +39,40 @@ from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
44
  # 2) мадэльныя файлы
45
  # ---------------------------------------------------------
46
  repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
47
- model_dir = "./model"
48
- os.makedirs(model_dir, exist_ok=True)
49
-
50
  for fname in ("model.pth", "config.json", "vocab.json", "voice.wav"):
51
- fpath = os.path.join(model_dir, fname)
52
- if not os.path.exists(fpath):
53
- hf_hub_download(repo_id, filename=fname, local_dir=model_dir)
54
 
55
  # ---------------------------------------------------------
56
  # 3) загрузка мадэлі
57
  # ---------------------------------------------------------
58
  config = XttsConfig()
59
- config.load_json(os.path.join(model_dir, "config.json"))
60
  XTTS_MODEL = Xtts.init_from_config(config)
61
- XTTS_MODEL.load_checkpoint(
62
- config,
63
- checkpoint_path=os.path.join(model_dir, "model.pth"),
64
- vocab_path=os.path.join(model_dir, "vocab.json"),
65
- use_deepspeed=False,
66
- )
67
 
68
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
69
-
70
- torch.set_num_threads(1)
71
  if device.startswith("cuda"):
 
72
  torch.backends.cuda.matmul.allow_tf32 = True
73
  torch.backends.cudnn.allow_tf32 = True
74
- torch.backends.cudnn.benchmark = True
75
- torch.set_float32_matmul_precision("high")
76
-
77
- XTTS_MODEL.to(device).eval()
78
  sampling_rate = int(XTTS_MODEL.config.audio["sample_rate"])
79
 
80
- tokenizer = VoiceBpeTokenizer(vocab_file=os.path.join(model_dir, "vocab.json"))
81
- XTTS_MODEL.tokenizer = tokenizer
82
-
83
  # =========================================================
84
  # 4) Streaming-канфіг і дапаўненні
85
  # =========================================================
86
- INITIAL_MIN_BUFFER_S = 0.35
87
- MIN_BUFFER_S = 0.1
88
- FADE_S = 0.004
89
  ENABLE_TEXT_SPLITTING = True
90
 
91
  def _to_np_audio(x) -> np.ndarray:
92
  if isinstance(x, dict) and "wav" in x: x = x["wav"]
93
- if isinstance(x, torch.Tensor): x = x.detach().cpu().float().contiguous().view(-1)
94
  x = np.asarray(x, dtype=np.float32)
95
- if x.ndim > 1: x = x.reshape(-1)
96
- return x
97
-
98
- def _native_stream(model: Xtts, **kwargs) -> Iterator[np.ndarray]:
99
- # <--- ВЫПРАЎЛЕННЕ: Выдалены непадтрымоўваемы параметр `stream_chunk_size_s`
100
- # Ён перадаваўся няяўна праз kwargs, таму мы проста не будзем яго дадаваць
101
- with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda")):
102
- for out in model.inference_stream(**kwargs):
103
- yield _to_np_audio(out)
104
-
105
- class NewTTSGenerationMixin:
106
- @torch.inference_mode()
107
- def generate(self: Xtts, text: str, do_stream: bool = False, **kwargs):
108
- if not do_stream:
109
- with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda")):
110
- return _to_np_audio(self.inference(text=text, **kwargs))
111
- return self.sample_stream(text=text, **kwargs)
112
-
113
- @torch.inference_mode()
114
- def sample_stream(self: Xtts, **kwargs) -> Iterator[np.ndarray]:
115
- yield from _native_stream(self, **kwargs)
116
-
117
- def init_stream_support():
118
- Xtts.generate = NewTTSGenerationMixin.generate
119
- Xtts.sample_stream = NewTTSGenerationMixin.sample_stream
120
-
121
- init_stream_support()
122
 
123
  # ---------------------------------------------------------
124
  # 5) пастаянны кэш латэнтаў
@@ -129,7 +83,7 @@ PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
129
  class LatentsMeta: model_id: str; gpt_cond_len: int; max_ref_len: int; sound_norm_refs: bool
130
  LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
131
  GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}
132
- default_voice_file = os.path.join(model_dir, "voice.wav")
133
 
134
  def _latents_key(path: str | None, meta: LatentsMeta) -> str:
135
  base = f"{os.path.abspath(path)}:{os.path.getmtime(path)}:{os.path.getsize(path)}" if path and os.path.exists(path) else "default_voice"
@@ -138,8 +92,8 @@ def _latents_key(path: str | None, meta: LatentsMeta) -> str:
138
  def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[torch.Tensor, torch.Tensor]:
139
  meta = LatentsMeta(model_id=repo_id, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_len=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs)
140
  key = _latents_key(path, meta)
141
- if key in LATENT_CACHE: g, s = LATENT_CACHE[key]
142
- else:
143
  disk_path = PERSIST_LATENTS_DIR / f"{key}.pt"
144
  if disk_path.exists():
145
  data = torch.load(disk_path, map_location="cpu"); g, s = data["gpt_cond_latent"], data["speaker_embedding"]
@@ -155,228 +109,204 @@ def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[
155
  GPU_LATENT_CACHE[dev_key] = (g, s)
156
  return g, s
157
 
158
- # ---------------------------------------------------------
159
- # 6) "Прагрэў" прыкладання пры запуску
160
- # ---------------------------------------------------------
161
- print("Application warmup started...")
162
- t_warmup_start = time.perf_counter()
163
- try:
164
- default_latents = _latents_for(default_voice_file, to_device=device)
165
- print(f"Default voice latents cached and moved to {device}.")
166
- _ = split_sentence("Прывітанне, свет.", lang="be")
167
- print("Text splitter warmed up.")
168
- with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16, enabled=device.startswith("cuda")):
169
- _ = XTTS_MODEL.inference(" ", "be", default_latents[0], default_latents[1])
170
- print("Main TTS model warmed up.")
171
- except Exception as e:
172
- print(f"An error occurred during application warmup: {e}")
173
- t_warmup_end = time.perf_counter()
174
- print(f"Application warmup finished in {t_warmup_end - t_warmup_start:.2f} seconds.")
175
 
176
  # ---------------------------------------------------------
177
  # 7) Дапаможныя функцыі для стрыму
178
  # ---------------------------------------------------------
179
  def _seconds_to_samples(sec: float, sr: int) -> int: return max(1, int(sec * sr))
180
- def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> np.ndarray:
181
- if a.size == 0: return b
182
- if b.size == 0: return a
183
- fade_n = min(_seconds_to_samples(fade_s, sr), a.size, b.size)
184
- if fade_n <= 1: return np.concatenate([a, b])
185
- fade_out, fade_in = np.linspace(1.0, 0.0, fade_n, dtype=np.float32), np.linspace(0.0, 1.0, fade_n, dtype=np.float32)
186
- tail = (a[-fade_n:] * fade_out) + (b[:fade_n] * fade_in)
187
- return np.concatenate([a[:-fade_n], tail, b[fade_n:]])
188
-
189
- def _chunker_with_initial_buffer(chunks: Iterable[np.ndarray], sr: int, initial_target_s: float, target_s: float) -> Iterable[np.ndarray]:
 
 
 
190
  is_first, target_samples = True, _seconds_to_samples(initial_target_s, sr)
191
- buffer_list, buffer_len = [], 0
192
  for c_np in map(_to_np_audio, chunks):
193
  if c_np.size == 0: continue
194
- buffer_list.append(c_np); buffer_len += c_np.size
195
- if buffer_len >= target_samples:
196
- yield np.concatenate(buffer_list)
197
- buffer_list, buffer_len = [], 0
198
  if is_first: is_first = False; target_samples = _seconds_to_samples(target_s, sr)
199
- if buffer_len > 0: yield np.concatenate(buffer_list)
200
 
201
  def _pcm_f32_to_b64(x: np.ndarray) -> str: return base64.b64encode(x.tobytes()).decode("ascii")
202
 
203
- def _split_text_smart(text_in: str, lang: str, limit: int) -> List[str]:
204
- # <--- ВЫПРАЎЛЕННЕ: Больш надзейная логіка падзелу тэксту
205
- try:
206
- sentences = split_sentence(text_in, lang=lang)
207
- chunks = []
208
- current_chunk = ""
209
- for sentence in sentences:
210
- if len(current_chunk) + len(sentence) + 1 > limit:
211
- if current_chunk: chunks.append(current_chunk)
212
- current_chunk = sentence
213
- else:
214
- current_chunk = (current_chunk + " " + sentence).strip()
215
- if current_chunk: chunks.append(current_chunk)
216
- # Калі нейкі кавалак усё роўна занадта доўгі, прымусова яго падзелім
217
- final_chunks = []
218
- for chunk in chunks:
219
- if len(chunk) > limit:
220
- final_chunks.extend([chunk[i:i+limit] for i in range(0, len(chunk), limit)])
221
- else:
222
- final_chunks.append(chunk)
223
- return [c.strip() for c in final_chunks if c and c.strip()]
224
- except Exception as e:
225
- print(f"Error in text splitter: {e}. Falling back to basic split.")
226
- return [text_in[i:i+limit] for i in range(0, len(text_in), limit)]
227
 
228
  # ---------------------------------------------------------
229
  # 8) TTS — асноўная функцыя
230
  # ---------------------------------------------------------
231
  @spaces.GPU(duration=120)
232
- def text_to_speech(belarusian_story, speaker_audio_file, initial_buffer_s, subsequent_buffer_s):
233
- t0 = time.perf_counter()
234
- if not belarusian_story or not str(belarusian_story).strip(): raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
235
 
236
- t_lat0 = time.perf_counter()
237
- gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file or default_voice_file, to_device=device)
238
- t_lat1 = time.perf_counter()
239
 
240
- t_split0 = time.perf_counter()
241
  char_limit = XTTS_MODEL.tokenizer.char_limits.get("be", 250)
242
- texts = _split_text_smart(str(belarusian_story).strip(), "be", char_limit) if ENABLE_TEXT_SPLITTING else [str(belarusian_story).strip()]
243
- t_split1 = time.perf_counter()
244
-
245
- server_metrics = {
246
- "latents_s": t_lat1 - t_lat0, "text_split_s": t_split1 - t_split0,
247
- "initial_buffer_s": initial_buffer_s, "subsequent_buffer_s": subsequent_buffer_s,
248
- }
249
  yield ("", None, None, json.dumps(server_metrics))
250
-
251
- full_audio_chunks, first_chunk_seen = [], False
252
- t_gen0 = time.perf_counter()
253
- for part in texts:
254
- gen = XTTS_MODEL.generate(
255
- text=part, do_stream=True, language="be",
256
- gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
257
- temperature=0.1, length_penalty=1.0, repetition_penalty=10.0, top_k=10, top_p=0.3,
 
258
  )
259
- chunk_iterator = _chunker_with_initial_buffer(gen, sampling_rate, initial_buffer_s, subsequent_buffer_s)
260
- for buf in chunk_iterator:
261
- if not first_chunk_seen:
262
- t_first = time.perf_counter()
263
- server_metrics["gen_init_to_first_chunk_s"] = t_first - t_gen0
264
- server_metrics["until_first_chunk_total_s"] = t_first - t0
265
- known = sum(v for k, v in server_metrics.items() if isinstance(v, (int, float)) and k.endswith('_s'))
266
- server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, server_metrics["until_first_chunk_total_s"] - known)
267
- first_chunk_seen = True
268
- yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
269
- else:
270
- yield (_pcm_f32_to_b64(buf), None, None, None)
271
- full_audio_chunks.append(buf)
272
 
273
  if not full_audio_chunks:
274
  yield ("__STOP__", None, None, json.dumps(server_metrics)); return
275
 
276
- t_w0 = time.perf_counter()
277
- full_audio = full_audio_chunks[0]
278
- for i in range(1, len(full_audio_chunks)):
279
- full_audio = _crossfade_concat(full_audio, full_audio_chunks[i], sampling_rate, FADE_S)
280
-
281
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
282
  write(tmp.name, sampling_rate, full_audio)
283
- server_metrics["file_write_s"] = time.perf_counter() - t_w0
284
  yield ("__STOP__", tmp.name, tmp.name, json.dumps(server_metrics))
285
 
286
  # ---------------------------------------------------------
287
  # 9) UI
288
  # ---------------------------------------------------------
289
  examples = [["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", None, INITIAL_MIN_BUFFER_S, MIN_BUFFER_S]]
290
-
291
  with gr.Blocks() as demo:
292
  gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")
293
  with gr.Row():
294
  inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
295
- inp_voice = gr.Audio(type="filepath", label="Прыклад голасу (6–10 сек)", interactive=True)
296
  with gr.Accordion("Дадатковыя налады стрымінгу", open=True):
297
- initial_buffer_slider = gr.Slider(minimum=0.1, maximum=1.0, value=INITIAL_MIN_BUFFER_S, step=0.05, label="Пачатковы буфер (с)")
298
  subsequent_buffer_slider = gr.Slider(minimum=0.05, maximum=0.5, value=MIN_BUFFER_S, step=0.01, label="Наступны буфер (с)")
299
- with gr.Row():
300
- run_btn = gr.Button("Згенераваць")
301
- gr.Markdown(f"**Sample rate:** {sampling_rate} Hz")
302
  log_panel = gr.HTML(value='<div id="wa-log" style="font-family:monospace;font-size:12px;white-space:pre-line">[лог пусты]</div>', label="Лагі плэера")
303
  stream_pipe, log_pipe, final_file, final_audio = gr.Textbox(visible=False), gr.Textbox(visible=False), gr.File(label="Згенераваны WAV"), gr.Audio(label="Фінальнае аўдыя", type="filepath")
304
 
305
- INIT_RESET_JS = f"""
306
  () => {{
307
  const sampleRate = {sampling_rate};
308
- if (window.__wa) {{ window.__wa.reset(); return; }}
309
- const AC = window.AudioContext || window.webkitAudioContext;
310
- if (!AC) return;
311
- const ctx = new AC({{ sampleRate }});
312
- const node = ctx.createScriptProcessor(4096, 0, 1);
313
- let queue = [], playing = false, eos = false;
314
- const meta = {{ t_click_ms: performance.now(), chunk_durations: [] }};
315
- node.onaudioprocess = (e) => {{
316
- const out = e.outputBuffer.getChannelData(0); let i = 0;
317
- while (i < out.length) {{
318
- if (queue.length === 0 || !playing) {{ out[i++] = 0.0; continue; }}
319
- let cur = queue[0];
320
- const take = Math.min(cur.length, out.length - i);
321
- if (meta.t_first_audio_ms === null) {{ meta.t_first_audio_ms = performance.now(); logUpdate(); }}
322
- out.set(cur.subarray(0, take), i); i += take;
323
- if (take === cur.length) queue.shift(); else queue[0] = cur.subarray(take);
324
- }}
325
- if (eos && queue.length === 0 && playing) {{ playing = false; logUpdate(); }}
326
- }};
327
- node.connect(ctx.destination);
328
- function fmtS(x) {{ return x === null || x === undefined ? "n/a" : x.toFixed(3) + " s"; }}
329
- function logUpdate() {{
330
- const el = document.getElementById('wa-log'); if (!el) return;
331
- const s = meta.server || {{}};
332
- const lines = ["Клік (Згенераваць): 0.000 s"];
333
- if (meta.t_first_push_ms) {{
334
- lines.push("Першы чанк прыйшоў: " + fmtS((meta.t_first_push_ms - meta.t_click_ms) / 1000));
335
- if (meta.t_first_audio_ms) {{
336
- lines.push("Пачатак прайгравання: " + fmtS((meta.t_first_audio_ms - meta.t_click_ms) / 1000));
337
- lines.push("Затрымка (чанк→аўдыя): " + fmtS((meta.t_first_audio_ms - meta.t_first_push_ms) / 1000));
 
 
 
 
338
  }}
 
 
 
 
 
 
339
  }}
340
- lines.push("\\n— Налады стрыму —", "Пачатковы буфер (запыт): " + fmtS(s.initial_buffer_s), "Наступны буфер (запыт): " + fmtS(s.subsequent_buffer_s));
341
- if (meta.chunk_durations.length > 0) {{ lines.push("Працягласць 1-га чанка: " + meta.chunk_durations[0] + " s", "Атрымана чанкаў: " + meta.chunk_durations.length); }}
342
- lines.push("\\n— Серверныя метрыкі —", "Latents (умоўны голас): " + fmtS(s.latents_s), "Падзел тэксту: " + fmtS(s.text_split_s), "Ініт→1-ы чанк: " + fmtS(s.gen_init_to_first_chunk_s), "Усё да 1-га чанка: " + fmtS(s.until_first_chunk_total_s), "Іншая серверная апрац.: " + fmtS(s.server_unaccounted_before_first_chunk_s), "Запіс WAV: " + fmtS(s.file_write_s));
343
- if (meta.t_first_push_ms && s.until_first_chunk_total_s) {{ lines.push("\\nАцэнка чаргі ZeroGPU + сеткі: " + fmtS(Math.max(0, (meta.t_first_push_ms - meta.t_click_ms) / 1000 - s.until_first_chunk_total_s))); }}
344
- lines.push("\\nСтатус стриму: " + (playing ? "playing" : "stopped"));
345
- el.innerHTML = lines.join("\\n");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  }}
347
- window.__wa = {{
348
- ctx, meta,
349
- push: (b64) => {{
350
- if (b64 === "__STOP__") {{ eos = true; logUpdate(); return; }}
351
- const bin = atob(b64); const buf = new ArrayBuffer(bin.length); const view = new Uint8Array(buf);
352
- for (let i=0; i<bin.length; i++) view[i] = bin.charCodeAt(i);
353
- const f32 = new Float32Array(buf);
354
- meta.chunk_durations.push((f32.length / ctx.sampleRate).toFixed(3));
355
- queue.push(f32);
356
- if (!meta.t_first_push_ms) meta.t_first_push_ms = performance.now();
357
- if (!playing && queue.length >= 1) {{ playing = true; try{{ctx.resume()}}catch(e){{}} }}
358
- logUpdate();
359
- }},
360
- update_server_metrics: (js) => {{ if(js) meta.server = JSON.parse(js); logUpdate(); }},
361
- reset: () => {{
362
- playing = false; eos = false; queue = [];
363
- meta.t_click_ms = performance.now(); meta.t_first_push_ms = null; meta.t_first_audio_ms = null;
364
- meta.chunk_durations = []; meta.server = null; logUpdate();
365
- }},
366
- }};
367
- }}
368
  """
369
- PUSH_JS = "(b64) => { if (window.__wa) window.__wa.push(b64); }"
370
- LOG_JS = "(js) => { if (window.__wa) window.__wa.update_server_metrics(js); }"
371
-
372
- run_btn.click(fn=None, js=INIT_RESET_JS)
373
- run_btn.click(
374
  fn=text_to_speech,
375
  inputs=[inp_text, inp_voice, initial_buffer_slider, subsequent_buffer_slider],
376
  outputs=[stream_pipe, final_file, final_audio, log_pipe]
377
  )
378
- stream_pipe.change(fn=None, inputs=[stream_pipe], js=PUSH_JS)
379
- log_pipe.change(fn=None, inputs=[log_pipe], js=LOG_JS)
380
  gr.Examples(examples=examples, inputs=[inp_text, inp_voice, initial_buffer_slider, subsequent_buffer_slider], cache_examples=False)
381
 
382
  if __name__ == "__main__":
 
11
  import hashlib
12
  import tempfile
13
  import subprocess
 
14
  from typing import Iterator, Iterable, Optional, Tuple, Any, List
15
  from dataclasses import dataclass
16
  import pathlib
 
27
  # ---------------------------------------------------------
28
  REPO_URL = "https://github.com/tuteishygpt/coqui-ai-TTS.git"
29
  REPO_DIR = "coqui-ai-TTS"
30
+ if not os.path.exists(REPO_DIR): subprocess.run(["git", "clone", REPO_URL, REPO_DIR], check=True)
 
 
 
31
  repo_root = os.path.abspath(REPO_DIR)
32
+ if repo_root not in sys.path: sys.path.insert(0, repo_root)
 
33
 
34
  from TTS.tts.configs.xtts_config import XttsConfig
35
  from TTS.tts.models.xtts import Xtts
 
39
  # 2) мадэльныя файлы
40
  # ---------------------------------------------------------
41
  repo_id = "archivartaunik/BE_XTTS_V2_10ep250k"
42
+ model_dir = pathlib.Path("./model")
43
+ model_dir.mkdir(exist_ok=True)
 
44
  for fname in ("model.pth", "config.json", "vocab.json", "voice.wav"):
45
+ if not (model_dir / fname).exists(): hf_hub_download(repo_id, filename=fname, local_dir=model_dir)
 
 
46
 
47
  # ---------------------------------------------------------
48
  # 3) загрузка мадэлі
49
  # ---------------------------------------------------------
50
  config = XttsConfig()
51
+ config.load_json(str(model_dir / "config.json"))
52
  XTTS_MODEL = Xtts.init_from_config(config)
53
+ XTTS_MODEL.load_checkpoint(config, checkpoint_path=str(model_dir / "model.pth"), vocab_path=str(model_dir / "vocab.json"), use_deepspeed=False)
 
 
 
 
 
54
 
55
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
 
56
  if device.startswith("cuda"):
57
+ torch.cuda.manual_seed(0)
58
  torch.backends.cuda.matmul.allow_tf32 = True
59
  torch.backends.cudnn.allow_tf32 = True
60
+ XTTS_MODEL.to(device)
 
 
 
61
  sampling_rate = int(XTTS_MODEL.config.audio["sample_rate"])
62
 
 
 
 
63
  # =========================================================
64
  # 4) Streaming-канфіг і дапаўненні
65
  # =========================================================
66
+ INITIAL_MIN_BUFFER_S = 0.40 # Рэкамендаванае значэнне для балансу
67
+ MIN_BUFFER_S = 0.15
68
+ FADE_S = 0.005
69
  ENABLE_TEXT_SPLITTING = True
70
 
71
  def _to_np_audio(x) -> np.ndarray:
72
  if isinstance(x, dict) and "wav" in x: x = x["wav"]
73
+ if isinstance(x, torch.Tensor): x = x.detach().cpu().float().contiguous().view(-1).numpy()
74
  x = np.asarray(x, dtype=np.float32)
75
+ return x.reshape(-1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  # ---------------------------------------------------------
78
  # 5) пастаянны кэш латэнтаў
 
83
  class LatentsMeta: model_id: str; gpt_cond_len: int; max_ref_len: int; sound_norm_refs: bool
84
  LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
85
  GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}
86
+ default_voice_file = str(model_dir / "voice.wav")
87
 
88
  def _latents_key(path: str | None, meta: LatentsMeta) -> str:
89
  base = f"{os.path.abspath(path)}:{os.path.getmtime(path)}:{os.path.getsize(path)}" if path and os.path.exists(path) else "default_voice"
 
92
  def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[torch.Tensor, torch.Tensor]:
93
  meta = LatentsMeta(model_id=repo_id, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_len=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs)
94
  key = _latents_key(path, meta)
95
+ g, s = LATENT_CACHE.get(key) or (None, None)
96
+ if g is None:
97
  disk_path = PERSIST_LATENTS_DIR / f"{key}.pt"
98
  if disk_path.exists():
99
  data = torch.load(disk_path, map_location="cpu"); g, s = data["gpt_cond_latent"], data["speaker_embedding"]
 
109
  GPU_LATENT_CACHE[dev_key] = (g, s)
110
  return g, s
111
 
112
+ # "Прагрэў" кэша для голасу па змаўчанні пры запуску
113
+ try: _latents_for(default_voice_file, to_device=device)
114
+ except Exception as e: print(f"Warning: Could not pre-cache default voice: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  # ---------------------------------------------------------
117
  # 7) Дапаможныя функцыі для стрыму
118
  # ---------------------------------------------------------
119
  def _seconds_to_samples(sec: float, sr: int) -> int: return max(1, int(sec * sr))
120
+ def _crossfade_concat(chunks: List[np.ndarray], sr: int, fade_s: float) -> np.ndarray:
121
+ if not chunks: return np.array([], dtype=np.float32)
122
+ result = chunks[0]
123
+ for i in range(1, len(chunks)):
124
+ b = chunks[i]
125
+ fade_n = min(_seconds_to_samples(fade_s, sr), result.size, b.size)
126
+ if fade_n <= 1: result = np.concatenate([result, b]); continue
127
+ fade_out, fade_in = np.linspace(1.0, 0.0, fade_n, dtype=np.float32), np.linspace(0.0, 1.0, fade_n, dtype=np.float32)
128
+ tail = (result[-fade_n:] * fade_out) + (b[:fade_n] * fade_in)
129
+ result = np.concatenate([result[:-fade_n], tail, b[fade_n:]])
130
+ return result
131
+
132
+ def _chunker(chunks: Iterable[np.ndarray], sr: int, initial_target_s: float, target_s: float) -> Iterator[np.ndarray]:
133
  is_first, target_samples = True, _seconds_to_samples(initial_target_s, sr)
134
+ buffer = np.array([], dtype=np.float32)
135
  for c_np in map(_to_np_audio, chunks):
136
  if c_np.size == 0: continue
137
+ buffer = np.concatenate([buffer, c_np])
138
+ if buffer.size >= target_samples:
139
+ yield buffer
140
+ buffer = np.array([], dtype=np.float32)
141
  if is_first: is_first = False; target_samples = _seconds_to_samples(target_s, sr)
142
+ if buffer.size > 0: yield buffer
143
 
144
  def _pcm_f32_to_b64(x: np.ndarray) -> str: return base64.b64encode(x.tobytes()).decode("ascii")
145
 
146
+ def _split_text_smart(text: str, lang: str, limit: int) -> List[str]:
147
+ try: sentences = split_sentence(text, lang=lang)
148
+ except Exception: sentences = [text]
149
+ chunks, current_chunk = [], ""
150
+ for sentence in sentences:
151
+ if len(current_chunk) + len(sentence) + 1 > limit and current_chunk:
152
+ chunks.append(current_chunk); current_chunk = ""
153
+ current_chunk = (current_chunk + " " + sentence).strip()
154
+ if current_chunk: chunks.append(current_chunk)
155
+ final_chunks = []
156
+ for chunk in chunks:
157
+ if len(chunk) > limit: final_chunks.extend(chunk[i:i+limit] for i in range(0, len(chunk), limit))
158
+ else: final_chunks.append(chunk)
159
+ return [c.strip() for c in final_chunks if c.strip()]
 
 
 
 
 
 
 
 
 
 
160
 
161
  # ---------------------------------------------------------
162
  # 8) TTS — асноўная функцыя
163
  # ---------------------------------------------------------
164
  @spaces.GPU(duration=120)
165
+ def text_to_speech(text_input, speaker_audio, initial_buffer_s, subsequent_buffer_s):
166
+ t_start_req = time.perf_counter()
167
+ if not text_input or not str(text_input).strip(): raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
168
 
169
+ t_lat_0 = time.perf_counter()
170
+ gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio or default_voice_file, to_device=device)
171
+ t_lat_1 = time.perf_counter()
172
 
173
+ t_split_0 = time.perf_counter()
174
  char_limit = XTTS_MODEL.tokenizer.char_limits.get("be", 250)
175
+ texts = _split_text_smart(str(text_input).strip(), "be", char_limit) if ENABLE_TEXT_SPLITTING else [str(text_input).strip()]
176
+ t_split_1 = time.perf_counter()
177
+
178
+ # Фаза 1: Адпраўка пачатковых метрык неадкладна
179
+ server_metrics = { "latents_s": t_lat_1 - t_lat_0, "text_split_s": t_split_1 - t_split_0, "initial_buffer_s": initial_buffer_s, "subsequent_buffer_s": subsequent_buffer_s }
 
 
180
  yield ("", None, None, json.dumps(server_metrics))
181
+
182
+ # Фаза 2: Генерацыя і стрымінг
183
+ full_audio_chunks, first_chunk_sent = [], False
184
+ t_gen_start = time.perf_counter()
185
+
186
+ all_chunks_iterator = (
187
+ chunk for part in texts for chunk in XTTS_MODEL.inference_stream(
188
+ text=part, language="be", gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding,
189
+ temperature=0.2, length_penalty=1.0, repetition_penalty=10.0, top_k=20, top_p=0.85
190
  )
191
+ )
192
+
193
+ for audio_chunk in _chunker(all_chunks_iterator, sampling_rate, initial_buffer_s, subsequent_buffer_s):
194
+ if not first_chunk_sent:
195
+ t_first_chunk_ready = time.perf_counter()
196
+ server_metrics["gen_init_to_first_chunk_s"] = t_first_chunk_ready - t_gen_start
197
+ server_metrics["until_first_chunk_total_s"] = t_first_chunk_ready - t_start_req
198
+ yield (_pcm_f32_to_b64(audio_chunk), None, None, json.dumps(server_metrics))
199
+ first_chunk_sent = True
200
+ else:
201
+ yield (_pcm_f32_to_b64(audio_chunk), None, None, None)
202
+ full_audio_chunks.append(audio_chunk)
 
203
 
204
  if not full_audio_chunks:
205
  yield ("__STOP__", None, None, json.dumps(server_metrics)); return
206
 
207
+ t_write_0 = time.perf_counter()
208
+ full_audio = _crossfade_concat(full_audio_chunks, sampling_rate, FADE_S)
 
 
 
209
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
210
  write(tmp.name, sampling_rate, full_audio)
211
+ server_metrics["file_write_s"] = time.perf_counter() - t_write_0
212
  yield ("__STOP__", tmp.name, tmp.name, json.dumps(server_metrics))
213
 
214
  # ---------------------------------------------------------
215
  # 9) UI
216
  # ---------------------------------------------------------
217
  examples = [["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", None, INITIAL_MIN_BUFFER_S, MIN_BUFFER_S]]
 
218
  with gr.Blocks() as demo:
219
  gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")
220
  with gr.Row():
221
  inp_text = gr.Textbox(lines=5, label="Тэкст на беларускай мове")
222
+ inp_voice = gr.Audio(type="filepath", label="Прыклад голасу (6–10 сек)")
223
  with gr.Accordion("Дадатковыя налады стрымінгу", open=True):
224
+ initial_buffer_slider = gr.Slider(minimum=0.1, maximum=1.5, value=INITIAL_MIN_BUFFER_S, step=0.05, label="Пачатковы буфер (с)")
225
  subsequent_buffer_slider = gr.Slider(minimum=0.05, maximum=0.5, value=MIN_BUFFER_S, step=0.01, label="Наступны буфер (с)")
226
+ with gr.Row(): run_btn = gr.Button("Згенераваць"); gr.Markdown(f"**Sample rate:** {sampling_rate} Hz")
 
 
227
  log_panel = gr.HTML(value='<div id="wa-log" style="font-family:monospace;font-size:12px;white-space:pre-line">[лог пусты]</div>', label="Лагі плэера")
228
  stream_pipe, log_pipe, final_file, final_audio = gr.Textbox(visible=False), gr.Textbox(visible=False), gr.File(label="Згенераваны WAV"), gr.Audio(label="Фінальнае аўдыя", type="filepath")
229
 
230
+ JS_CODE = f"""
231
  () => {{
232
  const sampleRate = {sampling_rate};
233
+ // Ініцыялізацыя або скід стану плэера
234
+ function initOrResetPlayer() {{
235
+ if (window.__wa) {{ window.__wa.reset(); return; }}
236
+ const AC = window.AudioContext || window.webkitAudioContext;
237
+ if (!AC) {{ console.error("AudioContext is not supported."); return; }}
238
+ const ctx = new AC({{ sampleRate }});
239
+ const node = ctx.createScriptProcessor(4096, 1, 1);
240
+ let queue = [], playing = false, eos = false;
241
+ let meta = {{ t_click_ms: performance.now(), chunk_durations: [] }};
242
+
243
+ node.onaudioprocess = (e) => {{
244
+ const out = e.outputBuffer.getChannelData(0); let i = 0;
245
+ while (i < out.length) {{
246
+ if (queue.length === 0 || !playing) {{ out[i++] = 0.0; continue; }}
247
+ let cur = queue[0];
248
+ const take = Math.min(cur.length, out.length - i);
249
+ if (meta.t_first_audio_ms === null) {{ meta.t_first_audio_ms = performance.now(); logUpdate(); }}
250
+ out.set(cur.subarray(0, take), i); i += take;
251
+ if (take === cur.length) queue.shift(); else queue[0] = cur.subarray(take);
252
+ }}
253
+ if (eos && queue.length === 0 && playing) {{ playing = false; logUpdate(); }}
254
+ }};
255
+ node.connect(ctx.destination);
256
+
257
+ function fmtS(x) {{ return x === null || x === undefined ? "n/a" : x.toFixed(3) + " s"; }}
258
+ function logUpdate() {{
259
+ const el = document.getElementById('wa-log'); if (!el) return;
260
+ const s = meta.server || {{}}; const lines = ["Клік (Згенераваць): 0.000 s"];
261
+ if (meta.t_first_push_ms) {{
262
+ lines.push("Першы чанк прыйшоў: " + fmtS((meta.t_first_push_ms - meta.t_click_ms) / 1000));
263
+ if (meta.t_first_audio_ms) {{
264
+ lines.push("Пачатак прайгравання: " + fmtS((meta.t_first_audio_ms - meta.t_click_ms) / 1000));
265
+ lines.push("Затрымка (чанк→аўдыя): " + fmtS((meta.t_first_audio_ms - meta.t_first_push_ms) / 1000));
266
+ }}
267
  }}
268
+ lines.push("\\n— Налады стрыму —", "Пачатковы буфер (запыт): " + fmtS(s.initial_buffer_s), "Наступны буфер (запыт): " + fmtS(s.subsequent_buffer_s));
269
+ if (meta.chunk_durations.length > 0) {{ lines.push("Працягласць 1-га чанка: " + meta.chunk_durations[0] + " s", "Атрымана чанкаў: " + meta.chunk_durations.length); }}
270
+ lines.push("\\n— Серверныя метрыкі —", "Latents (умоўны голас): " + fmtS(s.latents_s), "Падзел тэксту: " + fmtS(s.text_split_s), "Ініт→1-ы чанк: " + fmtS(s.gen_init_to_first_chunk_s), "Усё да 1-га чанка: " + fmtS(s.until_first_chunk_total_s));
271
+ if (meta.t_first_push_ms && s.until_first_chunk_total_s) {{ lines.push("\\nАцэнка чаргі ZeroGPU + сеткі: " + fmtS(Math.max(0, (meta.t_first_push_ms - meta.t_click_ms) / 1000 - s.until_first_chunk_total_s))); }}
272
+ lines.push("\\nСтатус стриму: " + (playing ? "playing" : "stopped"));
273
+ el.innerHTML = lines.join("\\n");
274
  }}
275
+
276
+ window.__wa = {{
277
+ push: (b64) => {{
278
+ if (b64 === "__STOP__") {{ eos = true; logUpdate(); return; }}
279
+ const bin = atob(b64); const buf = new ArrayBuffer(bin.length); const view = new Uint8Array(buf);
280
+ for (let i=0; i<bin.length; i++) view[i] = bin.charCodeAt(i);
281
+ const f32 = new Float32Array(buf);
282
+ if (meta.chunk_durations.length === 0 && f32.length > 0) meta.t_first_push_ms = performance.now();
283
+ meta.chunk_durations.push((f32.length / ctx.sampleRate).toFixed(3));
284
+ queue.push(f32);
285
+ if (!playing && queue.length > 0) {{ playing = true; if(ctx.state === "suspended") ctx.resume(); }}
286
+ logUpdate();
287
+ }},
288
+ update_server_metrics: (js) => {{ if(js) meta.server = JSON.parse(js); logUpdate(); }},
289
+ reset: () => {{
290
+ playing = false; eos = false; queue.length = 0;
291
+ meta = {{ t_click_ms: performance.now(), chunk_durations: [], server: null }}; logUpdate();
292
+ }},
293
+ }};
294
+ }}
295
+ // Асноўная функцыя, якая выклікаецца па падзеі
296
+ return function(...args) {{
297
+ initOrResetPlayer();
298
+ // Вяртаем аргументы для Gradio, каб ён працягнуў ланцужок выклікаў
299
+ return args;
300
  }}
301
+ }}()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
  """
303
+ run_btn.click(fn=None, js=JS_CODE, inputs=None, outputs=None).then(
 
 
 
 
304
  fn=text_to_speech,
305
  inputs=[inp_text, inp_voice, initial_buffer_slider, subsequent_buffer_slider],
306
  outputs=[stream_pipe, final_file, final_audio, log_pipe]
307
  )
308
+ stream_pipe.change(fn=None, inputs=[stream_pipe], js="(b64) => { if(window.__wa) window.__wa.push(b64); }")  # browser-only handler: fn must be None because `window` does not exist in server-side Python
309
+ log_pipe.change(fn=None, inputs=[log_pipe], js="(js) => { if(window.__wa) window.__wa.update_server_metrics(js); }")  # browser-only handler: fn must be None because `window` does not exist in server-side Python
310
  gr.Examples(examples=examples, inputs=[inp_text, inp_voice, initial_buffer_slider, subsequent_buffer_slider], cache_examples=False)
311
 
312
  if __name__ == "__main__":