Spaces:
Running on Zero
Running on Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,6 +12,7 @@ import hashlib
|
|
| 12 |
import tempfile
|
| 13 |
import subprocess
|
| 14 |
import inspect
|
|
|
|
| 15 |
from typing import Iterator, Iterable, Optional, Tuple, Any, List
|
| 16 |
from dataclasses import dataclass
|
| 17 |
import pathlib
|
|
@@ -297,7 +298,6 @@ def _latents_for(path: str | None, *, to_device: Optional[str] = None) -> Tuple[
|
|
| 297 |
return g2, s2
|
| 298 |
return g, s
|
| 299 |
|
| 300 |
-
# аўтападлік для default voice (CPU) — без дадатковых запытаў
|
| 301 |
try:
|
| 302 |
_ = _latents_for(default_voice_file)
|
| 303 |
except Exception as e:
|
|
@@ -398,13 +398,6 @@ def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[s
|
|
| 398 |
# ---------------------------------------------------------
|
| 399 |
@spaces.GPU(duration=60)
|
| 400 |
def text_to_speech(belarusian_story, speaker_audio_file=None):
|
| 401 |
-
"""
|
| 402 |
-
Выхады:
|
| 403 |
-
1) stream_pipe — base64(PCM float32) чанкі, у фінале "__STOP__"
|
| 404 |
-
2) final_file — шлях да WAV
|
| 405 |
-
3) final_audio — шлях да WAV для прайгравання
|
| 406 |
-
4) log_pipe — JSON з сервернымі метрыкамі (секунды)
|
| 407 |
-
"""
|
| 408 |
t0 = time.perf_counter()
|
| 409 |
|
| 410 |
if not belarusian_story or str(belarusian_story).strip() == "":
|
|
@@ -420,13 +413,11 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
|
|
| 420 |
lang_short = "be"
|
| 421 |
chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
|
| 422 |
|
| 423 |
-
# Latents (кэш CPU/GPU)
|
| 424 |
t_lat0 = time.perf_counter()
|
| 425 |
to_dev = "cuda:0" if torch.cuda.is_available() else None
|
| 426 |
gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
|
| 427 |
t_lat1 = time.perf_counter()
|
| 428 |
|
| 429 |
-
# Split
|
| 430 |
t_split0 = time.perf_counter()
|
| 431 |
texts = _split_text_smart(text_in, lang_short, chunk_limit) if ENABLE_TEXT_SPLITTING else [text_in]
|
| 432 |
if not texts: texts = [text_in]
|
|
@@ -456,19 +447,42 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
|
|
| 456 |
temperature=0.1, length_penalty=1.0, repetition_penalty=10.0,
|
| 457 |
top_k=10, top_p=0.3,
|
| 458 |
)
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
yield (_pcm_f32_to_b64(buf), None, None, None)
|
| 471 |
-
|
|
|
|
|
|
|
|
|
|
| 472 |
|
| 473 |
if not full_audio_chunks:
|
| 474 |
yield ("__STOP__", None, None, json.dumps(server_metrics)); return
|
|
@@ -491,7 +505,7 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
|
|
| 491 |
# 9) UI (лагі ў секундах + Play Final; без underrun’аў)
|
| 492 |
# ---------------------------------------------------------
|
| 493 |
examples = [
|
| 494 |
-
["Прывітанне! Гэта праверка жывога струменя беларускага TTS.",
|
| 495 |
]
|
| 496 |
|
| 497 |
with gr.Blocks() as demo:
|
|
@@ -525,7 +539,7 @@ with gr.Blocks() as demo:
|
|
| 525 |
const AC = window.AudioContext || window.webkitAudioContext;
|
| 526 |
if (!AC) return;
|
| 527 |
|
| 528 |
-
const PRIME_CHUNKS =
|
| 529 |
let primeCounter = 0;
|
| 530 |
|
| 531 |
function toSec(ms) {{ return (ms/1000); }}
|
|
@@ -576,7 +590,7 @@ with gr.Blocks() as demo:
|
|
| 576 |
|
| 577 |
if (!window.__wa) {{
|
| 578 |
const ctx = new AC({{ sampleRate }});
|
| 579 |
-
const bufferSize = 2048;
|
| 580 |
const node = ctx.createScriptProcessor(bufferSize, 0, 1);
|
| 581 |
let queue = [];
|
| 582 |
let playing = false;
|
|
@@ -625,7 +639,6 @@ with gr.Blocks() as demo:
|
|
| 625 |
logUpdate();
|
| 626 |
}}
|
| 627 |
if (!playing && queue.length >= PRIME_CHUNKS) {{
|
| 628 |
-
// стартуем толькі калі ёсць мінімум 2 чанкі ў чарзе
|
| 629 |
window.__wa.start();
|
| 630 |
}}
|
| 631 |
}},
|
|
@@ -699,4 +712,4 @@ with gr.Blocks() as demo:
|
|
| 699 |
gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)
|
| 700 |
|
| 701 |
if __name__ == "__main__":
|
| 702 |
-
demo.launch()
|
|
|
|
| 12 |
import tempfile
|
| 13 |
import subprocess
|
| 14 |
import inspect
|
| 15 |
+
import itertools
|
| 16 |
from typing import Iterator, Iterable, Optional, Tuple, Any, List
|
| 17 |
from dataclasses import dataclass
|
| 18 |
import pathlib
|
|
|
|
| 298 |
return g2, s2
|
| 299 |
return g, s
|
| 300 |
|
|
|
|
| 301 |
try:
|
| 302 |
_ = _latents_for(default_voice_file)
|
| 303 |
except Exception as e:
|
|
|
|
| 398 |
# ---------------------------------------------------------
|
| 399 |
@spaces.GPU(duration=60)
|
| 400 |
def text_to_speech(belarusian_story, speaker_audio_file=None):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
t0 = time.perf_counter()
|
| 402 |
|
| 403 |
if not belarusian_story or str(belarusian_story).strip() == "":
|
|
|
|
| 413 |
lang_short = "be"
|
| 414 |
chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
|
| 415 |
|
|
|
|
| 416 |
t_lat0 = time.perf_counter()
|
| 417 |
to_dev = "cuda:0" if torch.cuda.is_available() else None
|
| 418 |
gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
|
| 419 |
t_lat1 = time.perf_counter()
|
| 420 |
|
|
|
|
| 421 |
t_split0 = time.perf_counter()
|
| 422 |
texts = _split_text_smart(text_in, lang_short, chunk_limit) if ENABLE_TEXT_SPLITTING else [text_in]
|
| 423 |
if not texts: texts = [text_in]
|
|
|
|
| 447 |
temperature=0.1, length_penalty=1.0, repetition_penalty=10.0,
|
| 448 |
top_k=10, top_p=0.3,
|
| 449 |
)
|
| 450 |
+
|
| 451 |
+
# <--- ВЫПРАЎЛЕННЕ: Аптымізаваная логіка адпраўкі чанкаў
|
| 452 |
+
# Адпраўляем першы кавалак аўдыя неадкладна, астатнія групуем праз _chunker
|
| 453 |
+
gen_iterator = iter(gen)
|
| 454 |
+
try:
|
| 455 |
+
first_raw_chunk = next(gen_iterator)
|
| 456 |
+
if first_raw_chunk.size > 0:
|
| 457 |
+
# Адпраўка першага чанка
|
| 458 |
+
if not first_chunk_seen:
|
| 459 |
+
t_first = time.perf_counter()
|
| 460 |
+
server_metrics["gen_init_to_first_chunk_s"] = (t_first - t_gen0)
|
| 461 |
+
server_metrics["until_first_chunk_total_s"] = (t_first - t0)
|
| 462 |
+
known = server_metrics["latents_s"] + server_metrics["text_split_s"] + server_metrics["gen_init_to_first_chunk_s"]
|
| 463 |
+
other = server_metrics["until_first_chunk_total_s"] - known
|
| 464 |
+
server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, other)
|
| 465 |
+
first_chunk_seen = True
|
| 466 |
+
yield (_pcm_f32_to_b64(first_raw_chunk), None, None, json.dumps(server_metrics))
|
| 467 |
+
else:
|
| 468 |
+
yield (_pcm_f32_to_b64(first_raw_chunk), None, None, None)
|
| 469 |
+
full_audio_chunks.append(first_raw_chunk)
|
| 470 |
+
|
| 471 |
+
# Апрацоўка астатніх чанкаў праз _chunker
|
| 472 |
+
# Мы злучаем першы чанк (які ўжо адпраўлены) з астатнім генератарам,
|
| 473 |
+
# каб _chunker мог правільна зрабіць cross-fade, калі спатрэбіцца.
|
| 474 |
+
remaining_gen = itertools.chain([first_raw_chunk], gen_iterator)
|
| 475 |
+
for buf in _chunker(remaining_gen, sampling_rate, MIN_BUFFER_S):
|
| 476 |
+
# Калі першы чанк быў меншы за MIN_BUFFER_S, _chunker можа зноў яго вярнуць.
|
| 477 |
+
# Правяраем, ці не той гэта самы аб'ект, каб не адправіць двойчы.
|
| 478 |
+
if buf is first_raw_chunk and len(full_audio_chunks) > 0 and np.array_equal(buf, full_audio_chunks[-1]):
|
| 479 |
+
continue
|
| 480 |
+
|
| 481 |
yield (_pcm_f32_to_b64(buf), None, None, None)
|
| 482 |
+
full_audio_chunks.append(buf)
|
| 483 |
+
|
| 484 |
+
except StopIteration:
|
| 485 |
+
continue # Генератар быў пусты
|
| 486 |
|
| 487 |
if not full_audio_chunks:
|
| 488 |
yield ("__STOP__", None, None, json.dumps(server_metrics)); return
|
|
|
|
| 505 |
# 9) UI (лагі ў секундах + Play Final; без underrun’аў)
|
| 506 |
# ---------------------------------------------------------
|
| 507 |
examples = [
|
| 508 |
+
["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", None],
|
| 509 |
]
|
| 510 |
|
| 511 |
with gr.Blocks() as demo:
|
|
|
|
| 539 |
const AC = window.AudioContext || window.webkitAudioContext;
|
| 540 |
if (!AC) return;
|
| 541 |
|
| 542 |
+
const PRIME_CHUNKS = 1; // <--- ВЫПРАЎЛЕННЕ: Пачынаем прайграванне пасля 1-га чанка, а не 2-х
|
| 543 |
let primeCounter = 0;
|
| 544 |
|
| 545 |
function toSec(ms) {{ return (ms/1000); }}
|
|
|
|
| 590 |
|
| 591 |
if (!window.__wa) {{
|
| 592 |
const ctx = new AC({{ sampleRate }});
|
| 593 |
+
const bufferSize = 2048;
|
| 594 |
const node = ctx.createScriptProcessor(bufferSize, 0, 1);
|
| 595 |
let queue = [];
|
| 596 |
let playing = false;
|
|
|
|
| 639 |
logUpdate();
|
| 640 |
}}
|
| 641 |
if (!playing && queue.length >= PRIME_CHUNKS) {{
|
|
|
|
| 642 |
window.__wa.start();
|
| 643 |
}}
|
| 644 |
}},
|
|
|
|
| 712 |
gr.Examples(examples=examples, inputs=[inp_text, inp_voice], fn=None, cache_examples=False)
|
| 713 |
|
| 714 |
if __name__ == "__main__":
|
| 715 |
+
demo.launch()
|