Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -71,7 +71,6 @@ XTTS_MODEL.load_checkpoint(
|
|
| 71 |
|
| 72 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 73 |
|
| 74 |
-
# налады
|
| 75 |
torch.set_num_threads(1)
|
| 76 |
if device.startswith("cuda"):
|
| 77 |
torch.backends.cuda.matmul.allow_tf32 = True
|
|
@@ -87,7 +86,7 @@ tokenizer = VoiceBpeTokenizer(vocab_file=vocab_file)
|
|
| 87 |
XTTS_MODEL.tokenizer = tokenizer
|
| 88 |
|
| 89 |
# =========================================================
|
| 90 |
-
# 3) Канстанты стриму
|
| 91 |
# =========================================================
|
| 92 |
MIN_BUFFER_S = 0.020
|
| 93 |
RUNTIME_FIRST_CHUNK_S = 0.010
|
|
@@ -97,7 +96,7 @@ ENABLE_TEXT_SPLITTING = True
|
|
| 97 |
FIRST_SEGMENT_LIMIT = 120
|
| 98 |
|
| 99 |
# =========================================================
|
| 100 |
-
# 4)
|
| 101 |
# =========================================================
|
| 102 |
def _seconds_to_samples(sec: float, sr: int) -> int:
|
| 103 |
return max(1, int(sec * sr))
|
|
@@ -165,7 +164,7 @@ def _pcm_f32_to_int16_b64(x: np.ndarray) -> str:
|
|
| 165 |
if x.dtype != np.float32:
|
| 166 |
x = x.astype(np.float32, copy=False)
|
| 167 |
y = np.clip(x, -1.0, 0.9999695)
|
| 168 |
-
i16 = (y * 32767.0).astype("<i2", copy=False)
|
| 169 |
return base64.b64encode(i16.tobytes()).decode("ascii")
|
| 170 |
|
| 171 |
# =========================================================
|
|
@@ -394,7 +393,7 @@ except Exception as e:
|
|
| 394 |
print(f"[warn] warm-up failed: {e}")
|
| 395 |
|
| 396 |
# =========================================================
|
| 397 |
-
# 7) Падзел тэксту
|
| 398 |
# =========================================================
|
| 399 |
_SENT_END = re.compile(r'([\.!\?…]+[»")\]]*\s+)')
|
| 400 |
_WS = re.compile(r"\s+")
|
|
@@ -469,16 +468,13 @@ def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[s
|
|
| 469 |
return parts + (rest or [text_for_rest])
|
| 470 |
|
| 471 |
# =========================================================
|
| 472 |
-
# 8) TTS стрим
|
| 473 |
-
# — замянілі механіку: Python выдае JSON-пакеты ў адзін схаваны Textbox,
|
| 474 |
-
# фронт (JS у gr.HTML) опускае іх праз polling у AudioWorklet
|
| 475 |
# =========================================================
|
| 476 |
@spaces.GPU(duration=60)
|
| 477 |
def text_to_speech(belarusian_story, speaker_audio_file=None):
|
| 478 |
t0 = time.perf_counter()
|
| 479 |
|
| 480 |
if not belarusian_story or str(belarusian_story).strip() == "":
|
| 481 |
-
# пусты пакет, каб фронт не зламаўся
|
| 482 |
yield (json.dumps({"seq": 0, "b64": "", "log": None, "stop": False}), None, None)
|
| 483 |
raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
|
| 484 |
|
|
@@ -491,7 +487,7 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
|
|
| 491 |
lang_short = "be"
|
| 492 |
chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
|
| 493 |
|
| 494 |
-
# Latents
|
| 495 |
t_lat0 = time.perf_counter()
|
| 496 |
to_dev = device if device.startswith("cuda") else None
|
| 497 |
gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
|
|
@@ -514,7 +510,6 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
|
|
| 514 |
}
|
| 515 |
|
| 516 |
seq = 0
|
| 517 |
-
# пачатковы пакет — паведамляем фронту, што ўсё гатова
|
| 518 |
yield (json.dumps({"seq": seq, "b64": "", "log": server_metrics, "stop": False}), None, None)
|
| 519 |
|
| 520 |
full_audio_chunks: List[np.ndarray] = []
|
|
@@ -568,18 +563,16 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
|
|
| 568 |
final_file_path = tmp.name
|
| 569 |
final_audio_path = tmp.name
|
| 570 |
except Exception as e:
|
| 571 |
-
# паведамляем пра памылку праз лаг
|
| 572 |
server_metrics["_file_error"] = str(e)
|
| 573 |
finally:
|
| 574 |
t_w1 = time.perf_counter()
|
| 575 |
server_metrics["file_write_s"] = (t_w1 - t_w0)
|
| 576 |
|
| 577 |
-
# стоп-пакет
|
| 578 |
seq += 1
|
| 579 |
yield (json.dumps({"seq": seq, "b64": "__STOP__", "log": server_metrics, "stop": True}), final_file_path, final_audio_path)
|
| 580 |
|
| 581 |
# =========================================================
|
| 582 |
-
# 9) UI + AudioWorklet з polling (без
|
| 583 |
# =========================================================
|
| 584 |
examples = [
|
| 585 |
[
|
|
@@ -606,12 +599,19 @@ with gr.Blocks() as demo:
|
|
| 606 |
label="Лагі плэера",
|
| 607 |
)
|
| 608 |
|
| 609 |
-
#
|
| 610 |
-
stream_pipe = gr.Textbox(value="", visible=
|
| 611 |
final_file = gr.File(label="Згенераваны WAV (спампаваць)")
|
| 612 |
final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
|
| 613 |
play_final_btn = gr.Button("▶️ Play Final")
|
| 614 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
# --------- Frontend JS (пастаянны polling + AudioWorklet) ----------
|
| 616 |
FRONT_HTML = f"""
|
| 617 |
<script>
|
|
@@ -623,7 +623,6 @@ with gr.Blocks() as demo:
|
|
| 623 |
function toSec(ms) {{ return (ms/1000); }}
|
| 624 |
function fmtS(x) {{ return (x==null) ? 'n/a' : (x.toFixed ? x.toFixed(3) : x) + ' s'; }}
|
| 625 |
|
| 626 |
-
// Лагі
|
| 627 |
function updateLog() {{
|
| 628 |
const el = document.getElementById('wa-log');
|
| 629 |
if (!el || !window.__wa || !window.__wa.meta) return;
|
|
@@ -664,13 +663,22 @@ with gr.Blocks() as demo:
|
|
| 664 |
}}
|
| 665 |
|
| 666 |
async function ensureWorklet(ctx) {{
|
| 667 |
-
// AudioWorklet код у выглядзе радка
|
| 668 |
const code = `
|
| 669 |
class PushPlayerProcessor extends AudioWorkletProcessor {{
|
| 670 |
constructor() {{
|
| 671 |
super();
|
| 672 |
this.queue = [];
|
| 673 |
this.readIndex = 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 674 |
}}
|
| 675 |
process(inputs, outputs) {{
|
| 676 |
const out = outputs[0][0];
|
|
@@ -694,7 +702,6 @@ registerProcessor('push-player', PushPlayerProcessor);
|
|
| 694 |
await ctx.audioWorklet.addModule(url);
|
| 695 |
}}
|
| 696 |
|
| 697 |
-
// Ініцыялізацыя плэера
|
| 698 |
async function ensurePlayer() {{
|
| 699 |
if (window.__wa) return window.__wa;
|
| 700 |
if (!AC) return null;
|
|
@@ -705,7 +712,7 @@ registerProcessor('push-player', PushPlayerProcessor);
|
|
| 705 |
node.connect(ctx.destination);
|
| 706 |
|
| 707 |
let playing = true;
|
| 708 |
-
|
| 709 |
const meta = {{
|
| 710 |
t_click_ms: null,
|
| 711 |
t_first_push_ms: null,
|
|
@@ -716,20 +723,19 @@ registerProcessor('push-player', PushPlayerProcessor);
|
|
| 716 |
const api = {{
|
| 717 |
ctx, node,
|
| 718 |
get playing() {{ return playing; }},
|
| 719 |
-
start: async () => {{ try {{ await ctx.resume(); }} catch(e) {{}} playing =
|
| 720 |
-
stop: () => {{ playing = false; updateLog(); }},
|
| 721 |
reset: () => {{
|
| 722 |
-
|
| 723 |
meta.t_first_push_ms = null;
|
| 724 |
meta.t_first_audio_ms = null;
|
| 725 |
updateLog();
|
| 726 |
}},
|
| 727 |
-
push: (f32)
|
| 728 |
try {{ node.port.postMessage({{ type: 'push', buffer: f32.buffer }}, [f32.buffer]); }} catch (e) {{}}
|
| 729 |
if (!meta.t_first_push_ms) {{
|
| 730 |
meta.t_first_push_ms = performance.now();
|
| 731 |
-
|
| 732 |
-
if (!meta.t_first_audio_ms && playing) meta.t_first_audio_ms = meta.t_first_push_ms + 10;
|
| 733 |
updateLog();
|
| 734 |
}}
|
| 735 |
if (!playing) api.start();
|
|
@@ -741,26 +747,35 @@ registerProcessor('push-player', PushPlayerProcessor);
|
|
| 741 |
return api;
|
| 742 |
}}
|
| 743 |
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
const
|
| 747 |
-
if (!
|
| 748 |
-
const
|
| 749 |
-
if (
|
|
|
|
|
|
|
|
|
|
|
|
|
| 750 |
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
|
|
|
|
|
|
| 754 |
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
const pkt = parsePacket(txt);
|
| 759 |
-
if (!pkt || typeof pkt.seq !== 'number') return;
|
| 760 |
-
if (!api.meta.t_click_ms) api.meta.t_click_ms = performance.now();
|
| 761 |
|
| 762 |
-
|
| 763 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 764 |
|
| 765 |
if (pkt.log) {{
|
| 766 |
api.meta.server = pkt.log;
|
|
@@ -774,7 +789,6 @@ registerProcessor('push-player', PushPlayerProcessor);
|
|
| 774 |
}}
|
| 775 |
|
| 776 |
if (typeof pkt.b64 === 'string' && pkt.b64.length > 0) {{
|
| 777 |
-
// распакаваць INT16LE->F32
|
| 778 |
const bin = atob(pkt.b64);
|
| 779 |
const len = bin.length;
|
| 780 |
const buf = new ArrayBuffer(len);
|
|
@@ -791,7 +805,6 @@ registerProcessor('push-player', PushPlayerProcessor);
|
|
| 791 |
}}, POLL_MS);
|
| 792 |
}}
|
| 793 |
|
| 794 |
-
// кнопкі кіравання
|
| 795 |
window.__wa_start_click = async function() {{
|
| 796 |
const api = await ensurePlayer();
|
| 797 |
api.meta.t_click_ms = performance.now();
|
|
@@ -812,8 +825,8 @@ registerProcessor('push-player', PushPlayerProcessor);
|
|
| 812 |
if (audio) {{ try {{ audio.play(); }} catch(e) {{}} }}
|
| 813 |
}};
|
| 814 |
|
| 815 |
-
//
|
| 816 |
-
|
| 817 |
}})();
|
| 818 |
</script>
|
| 819 |
"""
|
|
|
|
| 71 |
|
| 72 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 73 |
|
|
|
|
| 74 |
torch.set_num_threads(1)
|
| 75 |
if device.startswith("cuda"):
|
| 76 |
torch.backends.cuda.matmul.allow_tf32 = True
|
|
|
|
| 86 |
XTTS_MODEL.tokenizer = tokenizer
|
| 87 |
|
| 88 |
# =========================================================
|
| 89 |
+
# 3) Канстанты стриму
|
| 90 |
# =========================================================
|
| 91 |
MIN_BUFFER_S = 0.020
|
| 92 |
RUNTIME_FIRST_CHUNK_S = 0.010
|
|
|
|
| 96 |
FIRST_SEGMENT_LIMIT = 120
|
| 97 |
|
| 98 |
# =========================================================
|
| 99 |
+
# 4) Аўдыя-ўтыліты
|
| 100 |
# =========================================================
|
| 101 |
def _seconds_to_samples(sec: float, sr: int) -> int:
|
| 102 |
return max(1, int(sec * sr))
|
|
|
|
| 164 |
if x.dtype != np.float32:
|
| 165 |
x = x.astype(np.float32, copy=False)
|
| 166 |
y = np.clip(x, -1.0, 0.9999695)
|
| 167 |
+
i16 = (y * 32767.0).astype("<i2", copy=False)
|
| 168 |
return base64.b64encode(i16.tobytes()).decode("ascii")
|
| 169 |
|
| 170 |
# =========================================================
|
|
|
|
| 393 |
print(f"[warn] warm-up failed: {e}")
|
| 394 |
|
| 395 |
# =========================================================
|
| 396 |
+
# 7) Падзел тэксту
|
| 397 |
# =========================================================
|
| 398 |
_SENT_END = re.compile(r'([\.!\?…]+[»")\]]*\s+)')
|
| 399 |
_WS = re.compile(r"\s+")
|
|
|
|
| 468 |
return parts + (rest or [text_for_rest])
|
| 469 |
|
| 470 |
# =========================================================
|
| 471 |
+
# 8) TTS стрим (выдае JSON-пакеты ў Textbox)
|
|
|
|
|
|
|
| 472 |
# =========================================================
|
| 473 |
@spaces.GPU(duration=60)
|
| 474 |
def text_to_speech(belarusian_story, speaker_audio_file=None):
|
| 475 |
t0 = time.perf_counter()
|
| 476 |
|
| 477 |
if not belarusian_story or str(belarusian_story).strip() == "":
|
|
|
|
| 478 |
yield (json.dumps({"seq": 0, "b64": "", "log": None, "stop": False}), None, None)
|
| 479 |
raise gr.Error("Увядзі хоць нейкі тэкст 🙂")
|
| 480 |
|
|
|
|
| 487 |
lang_short = "be"
|
| 488 |
chunk_limit = getattr(XTTS_MODEL.tokenizer, "char_limits", {}).get(lang_short, 250)
|
| 489 |
|
| 490 |
+
# Latents
|
| 491 |
t_lat0 = time.perf_counter()
|
| 492 |
to_dev = device if device.startswith("cuda") else None
|
| 493 |
gpt_cond_latent, speaker_embedding = _latents_for(speaker_audio_file, to_device=to_dev)
|
|
|
|
| 510 |
}
|
| 511 |
|
| 512 |
seq = 0
|
|
|
|
| 513 |
yield (json.dumps({"seq": seq, "b64": "", "log": server_metrics, "stop": False}), None, None)
|
| 514 |
|
| 515 |
full_audio_chunks: List[np.ndarray] = []
|
|
|
|
| 563 |
final_file_path = tmp.name
|
| 564 |
final_audio_path = tmp.name
|
| 565 |
except Exception as e:
|
|
|
|
| 566 |
server_metrics["_file_error"] = str(e)
|
| 567 |
finally:
|
| 568 |
t_w1 = time.perf_counter()
|
| 569 |
server_metrics["file_write_s"] = (t_w1 - t_w0)
|
| 570 |
|
|
|
|
| 571 |
seq += 1
|
| 572 |
yield (json.dumps({"seq": seq, "b64": "__STOP__", "log": server_metrics, "stop": True}), final_file_path, final_audio_path)
|
| 573 |
|
| 574 |
# =========================================================
|
| 575 |
+
# 9) UI + AudioWorklet з polling (без gr.Audio streaming)
|
| 576 |
# =========================================================
|
| 577 |
examples = [
|
| 578 |
[
|
|
|
|
| 599 |
label="Лагі плэера",
|
| 600 |
)
|
| 601 |
|
| 602 |
+
# ВАЖНА: робім stream_pipe бачным у DOM (visible=True), але хаваем праз CSS
|
| 603 |
+
stream_pipe = gr.Textbox(value="", visible=True, label="stream_pipe", elem_id="stream-pipe")
|
| 604 |
final_file = gr.File(label="Згенераваны WAV (спампаваць)")
|
| 605 |
final_audio = gr.Audio(label="Фінальнае аўдыя", type="filepath", interactive=False, elem_id="final-audio")
|
| 606 |
play_final_btn = gr.Button("▶️ Play Final")
|
| 607 |
|
| 608 |
+
# CSS — схаваць stream_pipe і актыўнасці
|
| 609 |
+
gr.HTML("""
|
| 610 |
+
<style>
|
| 611 |
+
#stream-pipe { position:absolute; left:-99999px; width:1px; height:1px; opacity:0; pointer-events:none; }
|
| 612 |
+
</style>
|
| 613 |
+
""")
|
| 614 |
+
|
| 615 |
# --------- Frontend JS (пастаянны polling + AudioWorklet) ----------
|
| 616 |
FRONT_HTML = f"""
|
| 617 |
<script>
|
|
|
|
| 623 |
function toSec(ms) {{ return (ms/1000); }}
|
| 624 |
function fmtS(x) {{ return (x==null) ? 'n/a' : (x.toFixed ? x.toFixed(3) : x) + ' s'; }}
|
| 625 |
|
|
|
|
| 626 |
function updateLog() {{
|
| 627 |
const el = document.getElementById('wa-log');
|
| 628 |
if (!el || !window.__wa || !window.__wa.meta) return;
|
|
|
|
| 663 |
}}
|
| 664 |
|
| 665 |
async function ensureWorklet(ctx) {{
|
|
|
|
| 666 |
const code = `
|
| 667 |
class PushPlayerProcessor extends AudioWorkletProcessor {{
|
| 668 |
constructor() {{
|
| 669 |
super();
|
| 670 |
this.queue = [];
|
| 671 |
this.readIndex = 0;
|
| 672 |
+
this.port.onmessage = (e) => {{
|
| 673 |
+
const d = e.data || {{}};
|
| 674 |
+
if (d.type === 'push' && d.buffer) {{
|
| 675 |
+
const f32 = new Float32Array(d.buffer);
|
| 676 |
+
this.queue.push(f32);
|
| 677 |
+
}} else if (d.type === 'reset') {{
|
| 678 |
+
this.queue.length = 0;
|
| 679 |
+
this.readIndex = 0;
|
| 680 |
+
}}
|
| 681 |
+
}};
|
| 682 |
}}
|
| 683 |
process(inputs, outputs) {{
|
| 684 |
const out = outputs[0][0];
|
|
|
|
| 702 |
await ctx.audioWorklet.addModule(url);
|
| 703 |
}}
|
| 704 |
|
|
|
|
| 705 |
async function ensurePlayer() {{
|
| 706 |
if (window.__wa) return window.__wa;
|
| 707 |
if (!AC) return null;
|
|
|
|
| 712 |
node.connect(ctx.destination);
|
| 713 |
|
| 714 |
let playing = true;
|
| 715 |
+
|
| 716 |
const meta = {{
|
| 717 |
t_click_ms: null,
|
| 718 |
t_first_push_ms: null,
|
|
|
|
| 723 |
const api = {{
|
| 724 |
ctx, node,
|
| 725 |
get playing() {{ return playing; }},
|
| 726 |
+
start: async () => {{ try {{ await ctx.resume(); }} catch(e) {{}} playing = True; updateLog(); }},
|
| 727 |
+
stop: () => {{ try {{ ctx.suspend(); }} catch(e){{}} playing = false; updateLog(); }},
|
| 728 |
reset: () => {{
|
| 729 |
+
try {{ node.port.postMessage({{ type: 'reset' }}); }} catch(e) {{}}
|
| 730 |
meta.t_first_push_ms = null;
|
| 731 |
meta.t_first_audio_ms = null;
|
| 732 |
updateLog();
|
| 733 |
}},
|
| 734 |
+
push: (f32) {{
|
| 735 |
try {{ node.port.postMessage({{ type: 'push', buffer: f32.buffer }}, [f32.buffer]); }} catch (e) {{}}
|
| 736 |
if (!meta.t_first_push_ms) {{
|
| 737 |
meta.t_first_push_ms = performance.now();
|
| 738 |
+
if (!meta.t_first_audio_ms) meta.t_first_audio_ms = meta.t_first_push_ms + 10;
|
|
|
|
| 739 |
updateLog();
|
| 740 |
}}
|
| 741 |
if (!playing) api.start();
|
|
|
|
| 747 |
return api;
|
| 748 |
}}
|
| 749 |
|
| 750 |
+
function getPipeEl() {{
|
| 751 |
+
// Textbox у Gradio мае textarea унутры div#stream-pipe
|
| 752 |
+
const root = document.getElementById('stream-pipe');
|
| 753 |
+
if (!root) return null;
|
| 754 |
+
const ta = root.querySelector('textarea');
|
| 755 |
+
if (ta) return ta;
|
| 756 |
+
const inp = root.querySelector('input');
|
| 757 |
+
if (inp) return inp;
|
| 758 |
+
return root;
|
| 759 |
+
}}
|
| 760 |
|
| 761 |
+
function startPolling() {{
|
| 762 |
+
const pipe = getPipeEl();
|
| 763 |
+
if (!pipe) return;
|
| 764 |
+
const apiPromise = ensurePlayer();
|
| 765 |
+
let lastSeq = -1;
|
| 766 |
|
| 767 |
+
setInterval(async () => {{
|
| 768 |
+
const api = await apiPromise;
|
| 769 |
+
if (!api) return;
|
|
|
|
|
|
|
|
|
|
| 770 |
|
| 771 |
+
const txt = (pipe.value !== undefined) ? pipe.value : (pipe.innerText || pipe.textContent || '');
|
| 772 |
+
if (!txt) return;
|
| 773 |
+
|
| 774 |
+
let pkt = null;
|
| 775 |
+
try {{ pkt = JSON.parse(txt); }} catch(e) {{ return; }}
|
| 776 |
+
if (!pkt || typeof pkt.seq !== 'number') return;
|
| 777 |
+
if (pkt.seq <= lastSeq) return;
|
| 778 |
+
lastSeq = pkt.seq;
|
| 779 |
|
| 780 |
if (pkt.log) {{
|
| 781 |
api.meta.server = pkt.log;
|
|
|
|
| 789 |
}}
|
| 790 |
|
| 791 |
if (typeof pkt.b64 === 'string' && pkt.b64.length > 0) {{
|
|
|
|
| 792 |
const bin = atob(pkt.b64);
|
| 793 |
const len = bin.length;
|
| 794 |
const buf = new ArrayBuffer(len);
|
|
|
|
| 805 |
}}, POLL_MS);
|
| 806 |
}}
|
| 807 |
|
|
|
|
| 808 |
window.__wa_start_click = async function() {{
|
| 809 |
const api = await ensurePlayer();
|
| 810 |
api.meta.t_click_ms = performance.now();
|
|
|
|
| 825 |
if (audio) {{ try {{ audio.play(); }} catch(e) {{}} }}
|
| 826 |
}};
|
| 827 |
|
| 828 |
+
// Стартуем polling пасля загрузкі
|
| 829 |
+
startPolling();
|
| 830 |
}})();
|
| 831 |
</script>
|
| 832 |
"""
|