BexttsStream

Running on Zero

App Files Files Community

archivartaunik committed on Nov 16, 2025

Commit

6ddb476

verified ·

1 Parent(s): 7982f36

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -26

app.py CHANGED Viewed

@@ -88,13 +88,12 @@ XTTS_MODEL.tokenizer = tokenizer
 # =========================================================
 # 4) Streaming-канфіг
 # =========================================================
-INITIAL_MIN_BUFFER_S = 0.25
 MIN_BUFFER_S = 0.1
 RUNTIME_FIRST_CHUNK_S = 0.02
 FADE_S       = 0.004
 TOKENS_PER_STEP = 1
 ENABLE_TEXT_SPLITTING = True
-FIRST_SEGMENT_LIMIT = 160
 # -------------------- утыліты аўдыя ----------------------
 def _seconds_to_samples(sec: float, sr: int) -> int:
@@ -156,7 +155,7 @@ def init_stream_support():
 init_stream_support()
 # ---------------------------------------------------------
-# 5) пастаянны кэш латэнтаў (CPU) + GPU-кэш
 # ---------------------------------------------------------
 PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
 PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
@@ -228,16 +227,8 @@ def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[s
     text_in = text_in.strip()
     if not text_in: return []
     try:
-        sentences = split_sentence(text_in, lang=lang_short)
-        chunks, current_chunk = [], ""
-        for sentence in sentences:
-            if len(current_chunk) + len(sentence) + 1 <= chunk_limit: current_chunk += " " + sentence
-            else:
-                if current_chunk: chunks.append(current_chunk.strip())
-                current_chunk = sentence
-        if current_chunk: chunks.append(current_chunk.strip())
-        return [c for c in chunks if c]
-    except Exception: return [text_in]
 # ---------------------------------------------------------
 # 8) TTS — стрым-функцыя
@@ -262,6 +253,21 @@ def text_to_speech(belarusian_story, speaker_audio_file, initial_buffer_s, subse
         "latents_s": t_lat1 - t_lat0, "text_split_s": t_split1 - t_split0,
         "initial_buffer_s": initial_buffer_s, "subsequent_buffer_s": subsequent_buffer_s,
     }
     yield ("", None, None, json.dumps(server_metrics))
     full_audio_chunks, first_chunk_seen = [], False
@@ -277,9 +283,9 @@ def text_to_speech(belarusian_story, speaker_audio_file, initial_buffer_s, subse
             if not first_chunk_seen:
                 t_first = time.perf_counter()
                 server_metrics["gen_init_to_first_chunk_s"] = t_first - t_gen0
-                server_metrics["until_first_chunk_total_s"] = t_first - t0
                 known = sum(v for k, v in server_metrics.items() if isinstance(v, (int, float)) and k.endswith('_s'))
-                server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, (t_first - t0) - known)
                 first_chunk_seen = True
                 yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
             else:
@@ -300,7 +306,7 @@ def text_to_speech(belarusian_story, speaker_audio_file, initial_buffer_s, subse
 # ---------------------------------------------------------
 # 9) UI
 # ---------------------------------------------------------
-examples = [["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", None, 0.25, 0.1]]
 with gr.Blocks() as demo:
     gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")
@@ -326,7 +332,6 @@ with gr.Blocks() as demo:
   const sampleRate = {sampling_rate};
   const AC = window.AudioContext || window.webkitAudioContext;
   if (!AC) return;
   function toSec(ms) {{ return (ms/1000); }}
   function fmtS(x) {{ return (x===null||x===undefined) ? "n/a" : x.toFixed(3) + " s"; }}
@@ -345,7 +350,6 @@ with gr.Blocks() as demo:
         lines.push("Затрымка (чанк→аўдыя): " + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + " s");
       }}
     }}
     lines.push("\\n— Налады стрыму —");
     lines.push("Пачатковы буфер (запыт):  " + fmtS(s.initial_buffer_s));
     lines.push("Наступны буфер (запыт):   " + fmtS(s.subsequent_buffer_s));
@@ -353,20 +357,18 @@ with gr.Blocks() as demo:
         lines.push("Працягласць 1-га чанка:    " + m.chunk_durations[0] + " s");
         lines.push("Атрымана чанкаў:          " + m.chunk_durations.length);
     }}
     lines.push("\\n— Серверныя метрыкі —");
     lines.push("Latents (умоўны голас):  " + fmtS(s.latents_s));
     lines.push("Падзел тэксту:           " + fmtS(s.text_split_s));
-    lines.push("Ініт→1-ы чанк:           " + fmtS(s.gen_init_to_first_chunk_s));
     lines.push("Усё да 1-га чанка:       " + fmtS(s.until_first_chunk_total_s));
     lines.push("Іншая серверная апрац.:  " + fmtS(s.server_unaccounted_before_first_chunk_s));
     lines.push("Запіс WAV:               " + fmtS(s.file_write_s));
     if (m.t_first_push_ms && s.until_first_chunk_total_s) {{
         let est_queue_net = toSec(m.t_first_push_ms - m.t_click_ms) - s.until_first_chunk_total_s;
         lines.push("\\nАцэнка чаргі ZeroGPU + сеткі: " + Math.max(0, est_queue_net).toFixed(3) + " s");
     }}
     lines.push("\\nСтатус стриму: " + (window.__wa.playing ? "playing" : "stopped"));
     el.innerHTML = lines.join("\\n");
   }}
@@ -423,10 +425,8 @@ with gr.Blocks() as demo:
   const view = new Uint8Array(buf);
   for (let i=0; i<bin.length; i++) view[i] = bin.charCodeAt(i);
   const f32 = new Float32Array(buf);
   const duration = f32.length / window.__wa.ctx.sampleRate;
   window.__wa.meta.chunk_durations.push(duration.toFixed(3));
   window.__wa.push(f32);
 }
 """
@@ -439,7 +439,6 @@ with gr.Blocks() as demo:
   } catch (e) {}
 }
 """
-    # <--- ВЫПРАЎЛЕННЕ: выкарыстаны правільны параметр `js` замест `_js`
     run_btn.click(fn=None, js=INIT_RESET_AND_PLAY_JS)
     run_btn.click(
         fn=text_to_speech,
@@ -448,7 +447,6 @@ with gr.Blocks() as demo:
     )
     stream_pipe.change(fn=None, inputs=[stream_pipe], js=PUSH_JS)
     log_pipe.change(fn=None, inputs=[log_pipe], js=LOG_JS)
     gr.Examples(examples=examples, inputs=[inp_text, inp_voice, initial_buffer_slider, subsequent_buffer_slider], cache_examples=False)
 if __name__ == "__main__":

 # =========================================================
 # 4) Streaming-канфіг
 # =========================================================
+INITIAL_MIN_BUFFER_S = 0.35 # <--- Рэкамендуемае значэнне пасля выпраўлення
 MIN_BUFFER_S = 0.1
 RUNTIME_FIRST_CHUNK_S = 0.02
 FADE_S       = 0.004
 TOKENS_PER_STEP = 1
 ENABLE_TEXT_SPLITTING = True
 # -------------------- утыліты аўдыя ----------------------
 def _seconds_to_samples(sec: float, sr: int) -> int:
 init_stream_support()
 # ---------------------------------------------------------
+# 5) пастаянны кэш латэнтаў
 # ---------------------------------------------------------
 PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
 PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
     text_in = text_in.strip()
     if not text_in: return []
     try:
+        return [s.strip() for s in split_sentence(text_in, lang=lang_short) if s and s.strip()]
+    except: return [text_in]
 # ---------------------------------------------------------
 # 8) TTS — стрым-функцыя
         "latents_s": t_lat1 - t_lat0, "text_split_s": t_split1 - t_split0,
         "initial_buffer_s": initial_buffer_s, "subsequent_buffer_s": subsequent_buffer_s,
     }
+    # <--- НОВЫ БЛОК: "Прагрэў" мадэлі ---
+    t_warmup0 = time.perf_counter()
+    try:
+        # Робім кароткі пусты выклік, каб "прагрэць" мадэль (JIT-кампіляцыя і г.д.)
+        # Гэта аплачвае "кошт запуску" перад пачаткам рэальнай генерацыі.
+        _ = XTTS_MODEL.inference(
+            text=" ", language=lang_short, gpt_cond_latent=gpt_cond_latent,
+            speaker_embedding=speaker_embedding, temperature=0.1, length_penalty=1.0,
+        )
+    except Exception as e: print(f"[warn] Model warmup inference failed: {e}")
+    t_warmup1 = time.perf_counter()
+    server_metrics["warmup_s"] = t_warmup1 - t_warmup0
+    # -----------------------------------------
     yield ("", None, None, json.dumps(server_metrics))
     full_audio_chunks, first_chunk_seen = [], False
             if not first_chunk_seen:
                 t_first = time.perf_counter()
                 server_metrics["gen_init_to_first_chunk_s"] = t_first - t_gen0
+                server_metrics["until_first_chunk_total_s"] = t_first - t0 + server_metrics["warmup_s"]
                 known = sum(v for k, v in server_metrics.items() if isinstance(v, (int, float)) and k.endswith('_s'))
+                server_metrics["server_unaccounted_before_first_chunk_s"] = max(0.0, (t_first - t0 + server_metrics["warmup_s"]) - known)
                 first_chunk_seen = True
                 yield (_pcm_f32_to_b64(buf), None, None, json.dumps(server_metrics))
             else:
 # ---------------------------------------------------------
 # 9) UI
 # ---------------------------------------------------------
+examples = [["Прывітанне! Гэта праверка жывога струменя беларускага TTS.", None, INITIAL_MIN_BUFFER_S, MIN_BUFFER_S]]
 with gr.Blocks() as demo:
     gr.Markdown("## Belarusian TTS — Streaming (стабільны старт) + фінальны файл")
   const sampleRate = {sampling_rate};
   const AC = window.AudioContext || window.webkitAudioContext;
   if (!AC) return;
   function toSec(ms) {{ return (ms/1000); }}
   function fmtS(x) {{ return (x===null||x===undefined) ? "n/a" : x.toFixed(3) + " s"; }}
         lines.push("Затрымка (чанк→аўдыя): " + (toSec(m.t_first_audio_ms - m.t_first_push_ms)).toFixed(3) + " s");
       }}
     }}
     lines.push("\\n— Налады стрыму —");
     lines.push("Пачатковы буфер (запыт):  " + fmtS(s.initial_buffer_s));
     lines.push("Наступны буфер (запыт):   " + fmtS(s.subsequent_buffer_s));
         lines.push("Працягласць 1-га чанка:    " + m.chunk_durations[0] + " s");
         lines.push("Атрымана чанкаў:          " + m.chunk_durations.length);
     }}
     lines.push("\\n— Серверныя метрыкі —");
     lines.push("Latents (умоўны голас):  " + fmtS(s.latents_s));
     lines.push("Падзел тэксту:           " + fmtS(s.text_split_s));
+    lines.push("Прагрэў мадэлі:          " + fmtS(s.warmup_s)); // <--- Новы радок у логах
+    lines.push("Ініт→1-ы чанк (пасля прагрэву): " + fmtS(s.gen_init_to_first_chunk_s)); // <--- Зменены подпіс
     lines.push("Усё да 1-га чанка:       " + fmtS(s.until_first_chunk_total_s));
     lines.push("Іншая серверная апрац.:  " + fmtS(s.server_unaccounted_before_first_chunk_s));
     lines.push("Запіс WAV:               " + fmtS(s.file_write_s));
     if (m.t_first_push_ms && s.until_first_chunk_total_s) {{
         let est_queue_net = toSec(m.t_first_push_ms - m.t_click_ms) - s.until_first_chunk_total_s;
         lines.push("\\nАцэнка чаргі ZeroGPU + сеткі: " + Math.max(0, est_queue_net).toFixed(3) + " s");
     }}
     lines.push("\\nСтатус стриму: " + (window.__wa.playing ? "playing" : "stopped"));
     el.innerHTML = lines.join("\\n");
   }}
   const view = new Uint8Array(buf);
   for (let i=0; i<bin.length; i++) view[i] = bin.charCodeAt(i);
   const f32 = new Float32Array(buf);
   const duration = f32.length / window.__wa.ctx.sampleRate;
   window.__wa.meta.chunk_durations.push(duration.toFixed(3));
   window.__wa.push(f32);
 }
 """
   } catch (e) {}
 }
 """
     run_btn.click(fn=None, js=INIT_RESET_AND_PLAY_JS)
     run_btn.click(
         fn=text_to_speech,
     )
     stream_pipe.change(fn=None, inputs=[stream_pipe], js=PUSH_JS)
     log_pipe.change(fn=None, inputs=[log_pipe], js=LOG_JS)
     gr.Examples(examples=examples, inputs=[inp_text, inp_voice, initial_buffer_slider, subsequent_buffer_slider], cache_examples=False)
 if __name__ == "__main__":