Spaces:
Running on Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -88,13 +88,12 @@ XTTS_MODEL.tokenizer = tokenizer
|
|
| 88 |
# =========================================================
|
| 89 |
# 4) Streaming-канфіг
|
| 90 |
# =========================================================
|
| 91 |
-
INITIAL_MIN_BUFFER_S = 0.
|
| 92 |
MIN_BUFFER_S = 0.1
|
| 93 |
RUNTIME_FIRST_CHUNK_S = 0.02
|
| 94 |
FADE_S = 0.004
|
| 95 |
TOKENS_PER_STEP = 1
|
| 96 |
ENABLE_TEXT_SPLITTING = True
|
| 97 |
-
FIRST_SEGMENT_LIMIT = 160
|
| 98 |
|
| 99 |
# -------------------- утыліты аўдыя ----------------------
|
| 100 |
def _seconds_to_samples(sec: float, sr: int) -> int:
|
|
@@ -156,7 +155,7 @@ def init_stream_support():
|
|
| 156 |
init_stream_support()
|
| 157 |
|
| 158 |
# ---------------------------------------------------------
|
| 159 |
-
# 5) пастаянны кэш латэнтаў
|
| 160 |
# ---------------------------------------------------------
|
| 161 |
PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
|
| 162 |
PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
|
|
@@ -228,16 +227,8 @@ def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[s
|
|
| 228 |
text_in = text_in.strip()
|
| 229 |
if not text_in: return []
|
| 230 |
try:
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
for sentence in sentences:
|
| 234 |
-
if len(current_chunk) + len(sentence) + 1 <= chunk_limit: current_chunk += " " + sentence
|
| 235 |
-
else:
|
| 236 |
-
if current_chunk: chunks.append(current_chunk.strip())
|
| 237 |
-
current_chunk = sentence
|
| 238 |
-
if current_chunk: chunks.append(current_chunk.strip())
|
| 239 |
-
return [c for c in chunks if c]
|
| 240 |
-
except Exception: return [text_in]
|
| 241 |
|
| 242 |
# ---------------------------------------------------------
|
| 243 |
# 8) TTS — стрым-функцыя
|
|
@@ -262,6 +253,21 @@ def text_to_speech(belarusian_story, speaker_audio_file, initial_buffer_s, subse
|
|
| 262 |
"latents_s": t_lat1 - t_lat0, "text_split_s": t_split1 - t_split0,
|
| 263 |
"initial_buffer_s": initial_buffer_s, "subsequent_buffer_s": subsequent_buffer_s,
|
| 264 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
yield ("", None, None, json.dumps(server_metrics))
|
| 266 |
|
| 267 |
full_audio_chunks, first_chunk_seen = [], False
|
|
@@ -277,9 +283,9 @@ def text_to_speech(belarusian_story, speaker_audio_file, initial_buffer_s, subse
|
|
| 277 |
if not first_chunk_seen:
|
| 278 |
t_first = time.perf_counter()
|
| 279 |
server_metrics["gen_init_to_first_chunk_s"] = t_first - t_gen0
|
| 280 |
-
server_metrics["until_first_chunk_total_s"] = t_first - t0
|
| 281 |
known = sum(v for k, v in server_metrics.items() if isinstance(v, (int, float)) and k.endswith('_s'))
|
| 282 |
-
server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, (t_first - t0) - known)
|
| 283 |
first_chunk_seen = True
|
| 284 |
yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
|
| 285 |
else:
|
|
@@ -300,7 +306,7 @@ def text_to_speech(belarusian_story, speaker_audio_file, initial_buffer_s, subse
|
|
| 300 |
# ---------------------------------------------------------
|
| 301 |
# 9) UI
|
| 302 |
# ---------------------------------------------------------
|
| 303 |
-
examples = [["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", None,
|
| 304 |
|
| 305 |
with gr.Blocks() as demo:
|
| 306 |
gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")
|
|
@@ -326,7 +332,6 @@ with gr.Blocks() as demo:
|
|
| 326 |
const sampleRate = {sampling_rate};
|
| 327 |
const AC = window.AudioContext || window.webkitAudioContext;
|
| 328 |
if (!AC) return;
|
| 329 |
-
|
| 330 |
function toSec(ms) {{ return (ms/1000); }}
|
| 331 |
function fmtS(x) {{ return (x===null||x===undefined) ? "n/a" : x.toFixed(3) + " s"; }}
|
| 332 |
|
|
@@ -345,7 +350,6 @@ with gr.Blocks() as demo:
|
|
| 345 |
lines.push("Затрымка (чанк→аўдыя): " + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + " s");
|
| 346 |
}}
|
| 347 |
}}
|
| 348 |
-
|
| 349 |
lines.push("\\n— Налады стрыму —");
|
| 350 |
lines.push("Пачатковы буфер (запыт): " + fmtS(s.initial_buffer_s));
|
| 351 |
lines.push("Наступны буфер (запыт): " + fmtS(s.subsequent_buffer_s));
|
|
@@ -353,20 +357,18 @@ with gr.Blocks() as demo:
|
|
| 353 |
lines.push("Працягласць 1-га чанка: " + m.chunk_durations[0] + " s");
|
| 354 |
lines.push("Атрымана чанкаў: " + m.chunk_durations.length);
|
| 355 |
}}
|
| 356 |
-
|
| 357 |
lines.push("\\n— Серверныя метрыкі —");
|
| 358 |
lines.push("Latents (умоўны голас): " + fmtS(s.latents_s));
|
| 359 |
lines.push("Падзел тэксту: " + fmtS(s.text_split_s));
|
| 360 |
-
lines.push("
|
|
|
|
| 361 |
lines.push("Усё да 1-га чанка: " + fmtS(s.until_first_chunk_total_s));
|
| 362 |
lines.push("Іншая серверная апрац.: " + fmtS(s.server_unaccounted_before_first_chunk_s));
|
| 363 |
lines.push("Запіс WAV: " + fmtS(s.file_write_s));
|
| 364 |
-
|
| 365 |
if (m.t_first_push_ms && s.until_first_chunk_total_s) {{
|
| 366 |
let est_queue_net = toSec(m.t_first_push_ms - m.t_click_ms) - s.until_first_chunk_total_s;
|
| 367 |
lines.push("\\nАцэнка чаргі ZeroGPU + сеткі: " + Math.max(0, est_queue_net).toFixed(3) + " s");
|
| 368 |
}}
|
| 369 |
-
|
| 370 |
lines.push("\\nСтатус стриму: " + (window.__wa.playing ? "playing" : "stopped"));
|
| 371 |
el.innerHTML = lines.join("\\n");
|
| 372 |
}}
|
|
@@ -423,10 +425,8 @@ with gr.Blocks() as demo:
|
|
| 423 |
const view = new Uint8Array(buf);
|
| 424 |
for (let i=0; i<bin.length; i++) view[i] = bin.charCodeAt(i);
|
| 425 |
const f32 = new Float32Array(buf);
|
| 426 |
-
|
| 427 |
const duration = f32.length / window.__wa.ctx.sampleRate;
|
| 428 |
window.__wa.meta.chunk_durations.push(duration.toFixed(3));
|
| 429 |
-
|
| 430 |
window.__wa.push(f32);
|
| 431 |
}
|
| 432 |
"""
|
|
@@ -439,7 +439,6 @@ with gr.Blocks() as demo:
|
|
| 439 |
} catch (e) {}
|
| 440 |
}
|
| 441 |
"""
|
| 442 |
-
# <--- ВЫПРАЎЛЕННЕ: выкарыстаны правільны параметр `js` замест `_js`
|
| 443 |
run_btn.click(fn=None, js=INIT_RESET_AND_PLAY_JS)
|
| 444 |
run_btn.click(
|
| 445 |
fn=text_to_speech,
|
|
@@ -448,7 +447,6 @@ with gr.Blocks() as demo:
|
|
| 448 |
)
|
| 449 |
stream_pipe.change(fn=None, inputs=[stream_pipe], js=PUSH_JS)
|
| 450 |
log_pipe.change(fn=None, inputs=[log_pipe], js=LOG_JS)
|
| 451 |
-
|
| 452 |
gr.Examples(examples=examples, inputs=[inp_text, inp_voice, initial_buffer_slider, subsequent_buffer_slider], cache_examples=False)
|
| 453 |
|
| 454 |
if __name__ == "__main__":
|
|
|
|
| 88 |
# =========================================================
|
| 89 |
# 4) Streaming-канфіг
|
| 90 |
# =========================================================
|
| 91 |
+
INITIAL_MIN_BUFFER_S = 0.35 # <--- Рэкамендаванае значэнне пасля выпраўлення
|
| 92 |
MIN_BUFFER_S = 0.1
|
| 93 |
RUNTIME_FIRST_CHUNK_S = 0.02
|
| 94 |
FADE_S = 0.004
|
| 95 |
TOKENS_PER_STEP = 1
|
| 96 |
ENABLE_TEXT_SPLITTING = True
|
|
|
|
| 97 |
|
| 98 |
# -------------------- утыліты аўдыя ----------------------
|
| 99 |
def _seconds_to_samples(sec: float, sr: int) -> int:
|
|
|
|
| 155 |
init_stream_support()
|
| 156 |
|
| 157 |
# ---------------------------------------------------------
|
| 158 |
+
# 5) пастаянны кэш латэнтаў
|
| 159 |
# ---------------------------------------------------------
|
| 160 |
PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
|
| 161 |
PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 227 |
text_in = text_in.strip()
|
| 228 |
if not text_in: return []
|
| 229 |
try:
|
| 230 |
+
return [s.strip() for s in split_sentence(text_in, lang=lang_short) if s and s.strip()]
|
| 231 |
+
except: return [text_in]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
|
| 233 |
# ---------------------------------------------------------
|
| 234 |
# 8) TTS — стрым-функцыя
|
|
|
|
| 253 |
"latents_s": t_lat1 - t_lat0, "text_split_s": t_split1 - t_split0,
|
| 254 |
"initial_buffer_s": initial_buffer_s, "subsequent_buffer_s": subsequent_buffer_s,
|
| 255 |
}
|
| 256 |
+
|
| 257 |
+
# <--- НОВЫ БЛОК: "Прагрэў" мадэлі ---
|
| 258 |
+
t_warmup0 = time.perf_counter()
|
| 259 |
+
try:
|
| 260 |
+
# Робім кароткі пусты выклік, каб "прагрэць" мадэль (JIT-кампіляцыя і г.д.)
|
| 261 |
+
# Гэта аплачвае "кошт запуску" перад пачаткам рэальнай генерацыі.
|
| 262 |
+
_ = XTTS_MODEL.inference(
|
| 263 |
+
text=" ", language=lang_short, gpt_cond_latent=gpt_cond_latent,
|
| 264 |
+
speaker_embedding=speaker_embedding, temperature=0.1, length_penalty=1.0,
|
| 265 |
+
)
|
| 266 |
+
except Exception as e: print(f"[warn] Model warmup inference failed: {e}")
|
| 267 |
+
t_warmup1 = time.perf_counter()
|
| 268 |
+
server_metrics["warmup_s"] = t_warmup1 - t_warmup0
|
| 269 |
+
# -----------------------------------------
|
| 270 |
+
|
| 271 |
yield ("", None, None, json.dumps(server_metrics))
|
| 272 |
|
| 273 |
full_audio_chunks, first_chunk_seen = [], False
|
|
|
|
| 283 |
if not first_chunk_seen:
|
| 284 |
t_first = time.perf_counter()
|
| 285 |
server_metrics["gen_init_to_first_chunk_s"] = t_first - t_gen0
|
| 286 |
+
server_metrics["until_first_chunk_total_s"] = t_first - t0 + server_metrics["warmup_s"]
|
| 287 |
known = sum(v for k, v in server_metrics.items() if isinstance(v, (int, float)) and k.endswith('_s'))
|
| 288 |
+
server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, (t_first - t0 + server_metrics["warmup_s"]) - known)
|
| 289 |
first_chunk_seen = True
|
| 290 |
yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
|
| 291 |
else:
|
|
|
|
| 306 |
# ---------------------------------------------------------
|
| 307 |
# 9) UI
|
| 308 |
# ---------------------------------------------------------
|
| 309 |
+
examples = [["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", None, INITIAL_MIN_BUFFER_S, MIN_BUFFER_S]]
|
| 310 |
|
| 311 |
with gr.Blocks() as demo:
|
| 312 |
gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")
|
|
|
|
| 332 |
const sampleRate = {sampling_rate};
|
| 333 |
const AC = window.AudioContext || window.webkitAudioContext;
|
| 334 |
if (!AC) return;
|
|
|
|
| 335 |
function toSec(ms) {{ return (ms/1000); }}
|
| 336 |
function fmtS(x) {{ return (x===null||x===undefined) ? "n/a" : x.toFixed(3) + " s"; }}
|
| 337 |
|
|
|
|
| 350 |
lines.push("Затрымка (чанк→аўдыя): " + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + " s");
|
| 351 |
}}
|
| 352 |
}}
|
|
|
|
| 353 |
lines.push("\\n— Налады стрыму —");
|
| 354 |
lines.push("Пачатковы буфер (запыт): " + fmtS(s.initial_buffer_s));
|
| 355 |
lines.push("Наступны буфер (запыт): " + fmtS(s.subsequent_buffer_s));
|
|
|
|
| 357 |
lines.push("Працягласць 1-га чанка: " + m.chunk_durations[0] + " s");
|
| 358 |
lines.push("Атрымана чанкаў: " + m.chunk_durations.length);
|
| 359 |
}}
|
|
|
|
| 360 |
lines.push("\\n— Серверныя метрыкі —");
|
| 361 |
lines.push("Latents (умоўны голас): " + fmtS(s.latents_s));
|
| 362 |
lines.push("Падзел тэксту: " + fmtS(s.text_split_s));
|
| 363 |
+
lines.push("Прагрэў мадэлі: " + fmtS(s.warmup_s)); // <--- Новы радок у логах
|
| 364 |
+
lines.push("Ініт→1-ы чанк (пасля прагрэву): " + fmtS(s.gen_init_to_first_chunk_s)); // <--- Зменены подпіс
|
| 365 |
lines.push("Усё да 1-га чанка: " + fmtS(s.until_first_chunk_total_s));
|
| 366 |
lines.push("Іншая серверная апрац.: " + fmtS(s.server_unaccounted_before_first_chunk_s));
|
| 367 |
lines.push("Запіс WAV: " + fmtS(s.file_write_s));
|
|
|
|
| 368 |
if (m.t_first_push_ms && s.until_first_chunk_total_s) {{
|
| 369 |
let est_queue_net = toSec(m.t_first_push_ms - m.t_click_ms) - s.until_first_chunk_total_s;
|
| 370 |
lines.push("\\nАцэнка чаргі ZeroGPU + сеткі: " + Math.max(0, est_queue_net).toFixed(3) + " s");
|
| 371 |
}}
|
|
|
|
| 372 |
lines.push("\\nСтатус стриму: " + (window.__wa.playing ? "playing" : "stopped"));
|
| 373 |
el.innerHTML = lines.join("\\n");
|
| 374 |
}}
|
|
|
|
| 425 |
const view = new Uint8Array(buf);
|
| 426 |
for (let i=0; i<bin.length; i++) view[i] = bin.charCodeAt(i);
|
| 427 |
const f32 = new Float32Array(buf);
|
|
|
|
| 428 |
const duration = f32.length / window.__wa.ctx.sampleRate;
|
| 429 |
window.__wa.meta.chunk_durations.push(duration.toFixed(3));
|
|
|
|
| 430 |
window.__wa.push(f32);
|
| 431 |
}
|
| 432 |
"""
|
|
|
|
| 439 |
} catch (e) {}
|
| 440 |
}
|
| 441 |
"""
|
|
|
|
| 442 |
run_btn.click(fn=None, js=INIT_RESET_AND_PLAY_JS)
|
| 443 |
run_btn.click(
|
| 444 |
fn=text_to_speech,
|
|
|
|
| 447 |
)
|
| 448 |
stream_pipe.change(fn=None, inputs=[stream_pipe], js=PUSH_JS)
|
| 449 |
log_pipe.change(fn=None, inputs=[log_pipe], js=LOG_JS)
|
|
|
|
| 450 |
gr.Examples(examples=examples, inputs=[inp_text, inp_voice, initial_buffer_slider, subsequent_buffer_slider], cache_examples=False)
|
| 451 |
|
| 452 |
if __name__ == "__main__":
|