archivartaunik committed on
Commit
0744763
·
verified ·
1 Parent(s): 36c434c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -27
app.py CHANGED
@@ -12,6 +12,7 @@ import hashlib
12
  import tempfile
13
  import subprocess
14
  import inspect
 
15
  from typing import Iterator, Iterable, Optional, Tuple, Any, List
16
  from dataclasses import dataclass
17
  import pathlib
@@ -297,7 +298,6 @@ def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[
297
  return g2, s2
298
  return g, s
299
 
300
- # аўтападлік для default voice (CPU) — без дадатковых запытаў
301
  try:
302
  _ = _latents_for(default_voice_file)
303
  except Exception as e:
@@ -398,13 +398,6 @@ def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[s
398
  # ---------------------------------------------------------
399
  @spaces.GPU(duration=60)
400
  def text_to_speech(belarusian_story, speaker_audio_file=None):
401
- """
402
- Выхады:
403
- 1) stream_pipe — base64(PCM float32) чанкі, у фінале "__STOP__"
404
- 2) final_file — шлях да WAV
405
- 3) final_audio — шлях да WAV для прайгравання
406
- 4) log_pipe — JSON з сервернымі метрыкамі (секунды)
407
- """
408
  t0 = time.perf_counter()
409
 
410
  if not belarusian_story or str(belarusian_story).strip() == "":
@@ -420,13 +413,11 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
420
  lang_short = "be"
421
  chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
422
 
423
- # Latents (кэш CPU/GPU)
424
  t_lat0 = time.perf_counter()
425
  to_dev = "cuda:0" if torch.cuda.is_available() else None
426
  gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
427
  t_lat1 = time.perf_counter()
428
 
429
- # Split
430
  t_split0 = time.perf_counter()
431
  texts = _split_text_smart(text_in, lang_short, chunk_limit) if ENABLE_TEXT_SPLITTING else [text_in]
432
  if not texts: texts = [text_in]
@@ -456,19 +447,42 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
456
  temperature=0.1, length_penalty=1.0, repetition_penalty=10.0,
457
  top_k=10, top_p=0.3,
458
  )
459
- for buf in _chunker(gen, sampling_rate, MIN_BUFFER_S):
460
- if not first_chunk_seen:
461
- t_first = time.perf_counter()
462
- server_metrics["gen_init_to_first_chunk_s"] = (t_first - t_gen0)
463
- server_metrics["until_first_chunk_total_s"] = (t_first - t0)
464
- known = server_metrics["latents_s"] + server_metrics["text_split_s"] + server_metrics["gen_init_to_first_chunk_s"]
465
- other = server_metrics["until_first_chunk_total_s"] - known
466
- server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, other)
467
- first_chunk_seen = True
468
- yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
469
- else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
  yield (_pcm_f32_to_b64(buf), None, None, None)
471
- full_audio_chunks.append(buf)
 
 
 
472
 
473
  if not full_audio_chunks:
474
  yield ("__STOP__", None, None, json.dumps(server_metrics)); return
@@ -491,7 +505,7 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
491
  # 9) UI (лагі ў секундах + Play Final; без underrun’аў)
492
  # ---------------------------------------------------------
493
  examples = [
494
- ["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", "Nestarka.wav"],
495
  ]
496
 
497
  with gr.Blocks() as demo:
@@ -525,7 +539,7 @@ with gr.Blocks() as demo:
525
  const AC = window.AudioContext || window.webkitAudioContext;
526
  if (!AC) return;
527
 
528
- const PRIME_CHUNKS = 2; // мін. к-ць чанкаў перад стартаваннем гуку
529
  let primeCounter = 0;
530
 
531
  function toSec(ms) {{ return (ms/1000); }}
@@ -576,7 +590,7 @@ with gr.Blocks() as demo:
576
 
577
  if (!window.__wa) {{
578
  const ctx = new AC({{ sampleRate }});
579
- const bufferSize = 2048; // большы буфер = менш underrun’аў
580
  const node = ctx.createScriptProcessor(bufferSize, 0, 1);
581
  let queue = [];
582
  let playing = false;
@@ -625,7 +639,6 @@ with gr.Blocks() as demo:
625
  logUpdate();
626
  }}
627
  if (!playing && queue.length >= PRIME_CHUNKS) {{
628
- // стартуем толькі калі ёсць мінімум 2 чанкі ў чарзе
629
  window.__wa.start();
630
  }}
631
  }},
@@ -699,4 +712,4 @@ with gr.Blocks() as demo:
699
  gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)
700
 
701
  if __name__ == "__main__":
702
- demo.launch()
 
12
  import tempfile
13
  import subprocess
14
  import inspect
15
+ import itertools
16
  from typing import Iterator, Iterable, Optional, Tuple, Any, List
17
  from dataclasses import dataclass
18
  import pathlib
 
298
  return g2, s2
299
  return g, s
300
 
 
301
  try:
302
  _ = _latents_for(default_voice_file)
303
  except Exception as e:
 
398
  # ---------------------------------------------------------
399
  @spaces.GPU(duration=60)
400
  def text_to_speech(belarusian_story, speaker_audio_file=None):
 
 
 
 
 
 
 
401
  t0 = time.perf_counter()
402
 
403
  if not belarusian_story or str(belarusian_story).strip() == "":
 
413
  lang_short = "be"
414
  chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
415
 
 
416
  t_lat0 = time.perf_counter()
417
  to_dev = "cuda:0" if torch.cuda.is_available() else None
418
  gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
419
  t_lat1 = time.perf_counter()
420
 
 
421
  t_split0 = time.perf_counter()
422
  texts = _split_text_smart(text_in, lang_short, chunk_limit) if ENABLE_TEXT_SPLITTING else [text_in]
423
  if not texts: texts = [text_in]
 
447
  temperature=0.1, length_penalty=1.0, repetition_penalty=10.0,
448
  top_k=10, top_p=0.3,
449
  )
450
+
451
+ # <--- ВЫПРАЎЛЕННЕ: Аптымізаваная логіка адпраўкі чанкаў
452
+ # Адпраўляем першы кавалак аўдыя неадкладна, астатнія групуем праз _chunker
453
+ gen_iterator = iter(gen)
454
+ try:
455
+ first_raw_chunk = next(gen_iterator)
456
+ if first_raw_chunk.size > 0:
457
+ # Адпраўка першага чанка
458
+ if not first_chunk_seen:
459
+ t_first = time.perf_counter()
460
+ server_metrics["gen_init_to_first_chunk_s"] = (t_first - t_gen0)
461
+ server_metrics["until_first_chunk_total_s"] = (t_first - t0)
462
+ known = server_metrics["latents_s"] + server_metrics["text_split_s"] + server_metrics["gen_init_to_first_chunk_s"]
463
+ other = server_metrics["until_first_chunk_total_s"] - known
464
+ server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, other)
465
+ first_chunk_seen = True
466
+ yield (_pcm_f32_to_b64(first_raw_chunk), None, None, json.dumps(server_metrics))
467
+ else:
468
+ yield (_pcm_f32_to_b64(first_raw_chunk), None, None, None)
469
+ full_audio_chunks.append(first_raw_chunk)
470
+
471
+ # Апрацоўка астатніх чанкаў праз _chunker
472
+ # Мы злучаем першы чанк (які ўжо адпраўлены) з астатнім генератарам,
473
+ # каб _chunker мог правільна зрабіць cross-fade, калі спатрэбіцца.
474
+ remaining_gen = itertools.chain([first_raw_chunk], gen_iterator)
475
+ for buf in _chunker(remaining_gen, sampling_rate, MIN_BUFFER_S):
476
+ # Калі першы чанк быў меншы за MIN_BUFFER_S, _chunker можа зноў яго вярнуць.
477
+ # Правяраем, ці не той гэта самы аб'ект, каб не адправіць двойчы.
478
+ if buf is first_raw_chunk and len(full_audio_chunks) > 0 and np.array_equal(buf, full_audio_chunks[-1]):
479
+ continue
480
+
481
  yield (_pcm_f32_to_b64(buf), None, None, None)
482
+ full_audio_chunks.append(buf)
483
+
484
+ except StopIteration:
485
+ continue # Генератар быў пусты
486
 
487
  if not full_audio_chunks:
488
  yield ("__STOP__", None, None, json.dumps(server_metrics)); return
 
505
  # 9) UI (лагі ў секундах + Play Final; без underrun’аў)
506
  # ---------------------------------------------------------
507
  examples = [
508
+ ["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", None],
509
  ]
510
 
511
  with gr.Blocks() as demo:
 
539
  const AC = window.AudioContext || window.webkitAudioContext;
540
  if (!AC) return;
541
 
542
+ const PRIME_CHUNKS = 1; // <--- ВЫПРАЎЛЕННЕ: Пачынаем прайграванне пасля 1-га чанка, а не 2-х
543
  let primeCounter = 0;
544
 
545
  function toSec(ms) {{ return (ms/1000); }}
 
590
 
591
  if (!window.__wa) {{
592
  const ctx = new AC({{ sampleRate }});
593
+ const bufferSize = 2048;
594
  const node = ctx.createScriptProcessor(bufferSize, 0, 1);
595
  let queue = [];
596
  let playing = false;
 
639
  logUpdate();
640
  }}
641
  if (!playing && queue.length >= PRIME_CHUNKS) {{
 
642
  window.__wa.start();
643
  }}
644
  }},
 
712
  gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)
713
 
714
  if __name__ == "__main__":
715
+ demo.launch()