archivartaunik committed on
Commit
6ddb476
·
verified ·
1 Parent(s): 7982f36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -26
app.py CHANGED
@@ -88,13 +88,12 @@ XTTS_MODEL.tokenizer = tokenizer
88
  # =========================================================
89
  # 4) Streaming-канфіг
90
  # =========================================================
91
- INITIAL_MIN_BUFFER_S = 0.25
92
  MIN_BUFFER_S = 0.1
93
  RUNTIME_FIRST_CHUNK_S = 0.02
94
  FADE_S = 0.004
95
  TOKENS_PER_STEP = 1
96
  ENABLE_TEXT_SPLITTING = True
97
- FIRST_SEGMENT_LIMIT = 160
98
 
99
  # -------------------- утыліты аўдыя ----------------------
100
  def _seconds_to_samples(sec: float, sr: int) -> int:
@@ -156,7 +155,7 @@ def init_stream_support():
156
  init_stream_support()
157
 
158
  # ---------------------------------------------------------
159
- # 5) пастаянны кэш латэнтаў (CPU) + GPU-кэш
160
  # ---------------------------------------------------------
161
  PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
162
  PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
@@ -228,16 +227,8 @@ def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[s
228
  text_in = text_in.strip()
229
  if not text_in: return []
230
  try:
231
- sentences = split_sentence(text_in, lang=lang_short)
232
- chunks, current_chunk = [], ""
233
- for sentence in sentences:
234
- if len(current_chunk) + len(sentence) + 1 <= chunk_limit: current_chunk += " " + sentence
235
- else:
236
- if current_chunk: chunks.append(current_chunk.strip())
237
- current_chunk = sentence
238
- if current_chunk: chunks.append(current_chunk.strip())
239
- return [c for c in chunks if c]
240
- except Exception: return [text_in]
241
 
242
  # ---------------------------------------------------------
243
  # 8) TTS — стрым-функцыя
@@ -262,6 +253,21 @@ def text_to_speech(belarusian_story, speaker_audio_file, initial_buffer_s, subse
262
  "latents_s": t_lat1 - t_lat0, "text_split_s": t_split1 - t_split0,
263
  "initial_buffer_s": initial_buffer_s, "subsequent_buffer_s": subsequent_buffer_s,
264
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  yield ("", None, None, json.dumps(server_metrics))
266
 
267
  full_audio_chunks, first_chunk_seen = [], False
@@ -277,9 +283,9 @@ def text_to_speech(belarusian_story, speaker_audio_file, initial_buffer_s, subse
277
  if not first_chunk_seen:
278
  t_first = time.perf_counter()
279
  server_metrics["gen_init_to_first_chunk_s"] = t_first - t_gen0
280
- server_metrics["until_first_chunk_total_s"] = t_first - t0
281
  known = sum(v for k, v in server_metrics.items() if isinstance(v, (int, float)) and k.endswith('_s'))
282
- server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, (t_first - t0) - known)
283
  first_chunk_seen = True
284
  yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
285
  else:
@@ -300,7 +306,7 @@ def text_to_speech(belarusian_story, speaker_audio_file, initial_buffer_s, subse
300
  # ---------------------------------------------------------
301
  # 9) UI
302
  # ---------------------------------------------------------
303
- examples = [["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", None, 0.25, 0.1]]
304
 
305
  with gr.Blocks() as demo:
306
  gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")
@@ -326,7 +332,6 @@ with gr.Blocks() as demo:
326
  const sampleRate = {sampling_rate};
327
  const AC = window.AudioContext || window.webkitAudioContext;
328
  if (!AC) return;
329
-
330
  function toSec(ms) {{ return (ms/1000); }}
331
  function fmtS(x) {{ return (x===null||x===undefined) ? "n/a" : x.toFixed(3) + " s"; }}
332
 
@@ -345,7 +350,6 @@ with gr.Blocks() as demo:
345
  lines.push("Затрымка (чанк→аўдыя): " + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + " s");
346
  }}
347
  }}
348
-
349
  lines.push("\\n— Налады стрыму —");
350
  lines.push("Пачатковы буфер (запыт): " + fmtS(s.initial_buffer_s));
351
  lines.push("Наступны буфер (запыт): " + fmtS(s.subsequent_buffer_s));
@@ -353,20 +357,18 @@ with gr.Blocks() as demo:
353
  lines.push("Працягласць 1-га чанка: " + m.chunk_durations[0] + " s");
354
  lines.push("Атрымана чанкаў: " + m.chunk_durations.length);
355
  }}
356
-
357
  lines.push("\\n— Серверныя метрыкі —");
358
  lines.push("Latents (умоўны голас): " + fmtS(s.latents_s));
359
  lines.push("Падзел тэксту: " + fmtS(s.text_split_s));
360
- lines.push("Ініт→1-ы чанк: " + fmtS(s.gen_init_to_first_chunk_s));
 
361
  lines.push("Усё да 1-га чанка: " + fmtS(s.until_first_chunk_total_s));
362
  lines.push("Іншая серверная апрац.: " + fmtS(s.server_unaccounted_before_first_chunk_s));
363
  lines.push("Запіс WAV: " + fmtS(s.file_write_s));
364
-
365
  if (m.t_first_push_ms && s.until_first_chunk_total_s) {{
366
  let est_queue_net = toSec(m.t_first_push_ms - m.t_click_ms) - s.until_first_chunk_total_s;
367
  lines.push("\\nАцэнка чаргі ZeroGPU + сеткі: " + Math.max(0, est_queue_net).toFixed(3) + " s");
368
  }}
369
-
370
  lines.push("\\nСтатус стриму: " + (window.__wa.playing ? "playing" : "stopped"));
371
  el.innerHTML = lines.join("\\n");
372
  }}
@@ -423,10 +425,8 @@ with gr.Blocks() as demo:
423
  const view = new Uint8Array(buf);
424
  for (let i=0; i<bin.length; i++) view[i] = bin.charCodeAt(i);
425
  const f32 = new Float32Array(buf);
426
-
427
  const duration = f32.length / window.__wa.ctx.sampleRate;
428
  window.__wa.meta.chunk_durations.push(duration.toFixed(3));
429
-
430
  window.__wa.push(f32);
431
  }
432
  """
@@ -439,7 +439,6 @@ with gr.Blocks() as demo:
439
  } catch (e) {}
440
  }
441
  """
442
- # <--- ВЫПРАЎЛЕННЕ: выкарыстаны правільны параметр `js` замест `_js`
443
  run_btn.click(fn=None, js=INIT_RESET_AND_PLAY_JS)
444
  run_btn.click(
445
  fn=text_to_speech,
@@ -448,7 +447,6 @@ with gr.Blocks() as demo:
448
  )
449
  stream_pipe.change(fn=None, inputs=[stream_pipe], js=PUSH_JS)
450
  log_pipe.change(fn=None, inputs=[log_pipe], js=LOG_JS)
451
-
452
  gr.Examples(examples=examples, inputs=[inp_text, inp_voice, initial_buffer_slider, subsequent_buffer_slider], cache_examples=False)
453
 
454
  if __name__ == "__main__":
 
88
  # =========================================================
89
  # 4) Streaming-канфіг
90
  # =========================================================
91
+ INITIAL_MIN_BUFFER_S = 0.35 # <--- Рэкамендуемае значэнне пасля выпраўлення
92
  MIN_BUFFER_S = 0.1
93
  RUNTIME_FIRST_CHUNK_S = 0.02
94
  FADE_S = 0.004
95
  TOKENS_PER_STEP = 1
96
  ENABLE_TEXT_SPLITTING = True
 
97
 
98
  # -------------------- утыліты аўдыя ----------------------
99
  def _seconds_to_samples(sec: float, sr: int) -> int:
 
155
  init_stream_support()
156
 
157
  # ---------------------------------------------------------
158
+ # 5) пастаянны кэш латэнтаў
159
  # ---------------------------------------------------------
160
  PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
161
  PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
 
227
  text_in = text_in.strip()
228
  if not text_in: return []
229
  try:
230
+ return [s.strip() for s in split_sentence(text_in, lang=lang_short) if s and s.strip()]
231
+ except: return [text_in]
 
 
 
 
 
 
 
 
232
 
233
  # ---------------------------------------------------------
234
  # 8) TTS — стрым-функцыя
 
253
  "latents_s": t_lat1 - t_lat0, "text_split_s": t_split1 - t_split0,
254
  "initial_buffer_s": initial_buffer_s, "subsequent_buffer_s": subsequent_buffer_s,
255
  }
256
+
257
+ # <--- НОВЫ БЛОК: "Прагрэў" мадэлі ---
258
+ t_warmup0 = time.perf_counter()
259
+ try:
260
+ # Робім кароткі пусты выклік, каб "прагрэць" мадэль (JIT-кампіляцыя і г.д.)
261
+ # Гэта аплачвае "кошт запуску" перад пачаткам рэальнай генерацыі.
262
+ _ = XTTS_MODEL.inference(
263
+ text=" ", language=lang_short, gpt_cond_latent=gpt_cond_latent,
264
+ speaker_embedding=speaker_embedding, temperature=0.1, length_penalty=1.0,
265
+ )
266
+ except Exception as e: print(f"[warn] Model warmup inference failed: {e}")
267
+ t_warmup1 = time.perf_counter()
268
+ server_metrics["warmup_s"] = t_warmup1 - t_warmup0
269
+ # -----------------------------------------
270
+
271
  yield ("", None, None, json.dumps(server_metrics))
272
 
273
  full_audio_chunks, first_chunk_seen = [], False
 
283
  if not first_chunk_seen:
284
  t_first = time.perf_counter()
285
  server_metrics["gen_init_to_first_chunk_s"] = t_first - t_gen0
286
+ server_metrics["until_first_chunk_total_s"] = t_first - t0 + server_metrics["warmup_s"]
287
  known = sum(v for k, v in server_metrics.items() if isinstance(v, (int, float)) and k.endswith('_s'))
288
+ server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, (t_first - t0 + server_metrics["warmup_s"]) - known)
289
  first_chunk_seen = True
290
  yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
291
  else:
 
306
  # ---------------------------------------------------------
307
  # 9) UI
308
  # ---------------------------------------------------------
309
+ examples = [["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", None, INITIAL_MIN_BUFFER_S, MIN_BUFFER_S]]
310
 
311
  with gr.Blocks() as demo:
312
  gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")
 
332
  const sampleRate = {sampling_rate};
333
  const AC = window.AudioContext || window.webkitAudioContext;
334
  if (!AC) return;
 
335
  function toSec(ms) {{ return (ms/1000); }}
336
  function fmtS(x) {{ return (x===null||x===undefined) ? "n/a" : x.toFixed(3) + " s"; }}
337
 
 
350
  lines.push("Затрымка (чанк→аўдыя): " + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + " s");
351
  }}
352
  }}
 
353
  lines.push("\\n— Налады стрыму —");
354
  lines.push("Пачатковы буфер (запыт): " + fmtS(s.initial_buffer_s));
355
  lines.push("Наступны буфер (запыт): " + fmtS(s.subsequent_buffer_s));
 
357
  lines.push("Працягласць 1-га чанка: " + m.chunk_durations[0] + " s");
358
  lines.push("Атрымана чанкаў: " + m.chunk_durations.length);
359
  }}
 
360
  lines.push("\\n— Серверныя метрыкі —");
361
  lines.push("Latents (умоўны голас): " + fmtS(s.latents_s));
362
  lines.push("Падзел тэксту: " + fmtS(s.text_split_s));
363
+ lines.push("Прагрэў мадэлі: " + fmtS(s.warmup_s)); // <--- Новы радок у логах
364
+ lines.push("Ініт→1-ы чанк (пасля прагрэву): " + fmtS(s.gen_init_to_first_chunk_s)); // <--- Зменены подпіс
365
  lines.push("Усё да 1-га чанка: " + fmtS(s.until_first_chunk_total_s));
366
  lines.push("Іншая серверная апрац.: " + fmtS(s.server_unaccounted_before_first_chunk_s));
367
  lines.push("Запіс WAV: " + fmtS(s.file_write_s));
 
368
  if (m.t_first_push_ms && s.until_first_chunk_total_s) {{
369
  let est_queue_net = toSec(m.t_first_push_ms - m.t_click_ms) - s.until_first_chunk_total_s;
370
  lines.push("\\nАцэнка чаргі ZeroGPU + сеткі: " + Math.max(0, est_queue_net).toFixed(3) + " s");
371
  }}
 
372
  lines.push("\\nСтатус стриму: " + (window.__wa.playing ? "playing" : "stopped"));
373
  el.innerHTML = lines.join("\\n");
374
  }}
 
425
  const view = new Uint8Array(buf);
426
  for (let i=0; i<bin.length; i++) view[i] = bin.charCodeAt(i);
427
  const f32 = new Float32Array(buf);
 
428
  const duration = f32.length / window.__wa.ctx.sampleRate;
429
  window.__wa.meta.chunk_durations.push(duration.toFixed(3));
 
430
  window.__wa.push(f32);
431
  }
432
  """
 
439
  } catch (e) {}
440
  }
441
  """
 
442
  run_btn.click(fn=None, js=INIT_RESET_AND_PLAY_JS)
443
  run_btn.click(
444
  fn=text_to_speech,
 
447
  )
448
  stream_pipe.change(fn=None, inputs=[stream_pipe], js=PUSH_JS)
449
  log_pipe.change(fn=None, inputs=[log_pipe], js=LOG_JS)
 
450
  gr.Examples(examples=examples, inputs=[inp_text, inp_voice, initial_buffer_slider, subsequent_buffer_slider], cache_examples=False)
451
 
452
  if __name__ == "__main__":