Spaces:
Running on Zero
Running on Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# Калі запускаеце ў чыстым асяроддзі (раскаментуйце):
|
| 2 |
-
# !pip install -q gradio spaces huggingface_hub torch scipy
|
| 3 |
|
| 4 |
import os
|
| 5 |
import sys
|
|
@@ -8,7 +8,7 @@ import time
|
|
| 8 |
import tempfile
|
| 9 |
import subprocess
|
| 10 |
import inspect
|
| 11 |
-
from typing import Iterator, Iterable, Optional, Tuple, Any
|
| 12 |
|
| 13 |
import spaces
|
| 14 |
import gradio as gr
|
|
@@ -76,18 +76,34 @@ tokenizer = VoiceBpeTokenizer(vocab_file=vocab_file)
|
|
| 76 |
XTTS_MODEL.tokenizer = tokenizer
|
| 77 |
|
| 78 |
# =========================================================
|
| 79 |
-
# 4) «Як у
|
| 80 |
# =========================================================
|
| 81 |
|
| 82 |
# Канстанты латэнтнасці/буферу
|
| 83 |
MIN_BUFFER_S = 0.050 # ~50 ms цэлявы буфер для аўдыя
|
| 84 |
-
|
| 85 |
TOKENS_PER_STEP = 4 # памер кроку «токенаў» у fallback (BPE/субсловы)
|
| 86 |
|
| 87 |
def _seconds_to_samples(sec: float, sr: int) -> int:
|
| 88 |
return max(1, int(sec * sr))
|
| 89 |
|
| 90 |
-
def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_ms: float) -> np.ndarray:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
"""Плыўна зліць два кавалкі без клікаў."""
|
| 92 |
if a.size == 0:
|
| 93 |
return b.astype(np.float32, copy=False)
|
|
@@ -95,7 +111,7 @@ def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_ms: float) ->
|
|
| 95 |
return a.astype(np.float32, copy=False)
|
| 96 |
a = a.astype(np.float32, copy=False)
|
| 97 |
b = b.astype(np.float32, copy=False)
|
| 98 |
-
fade_n = min(_seconds_to_samples(
|
| 99 |
if fade_n <= 1:
|
| 100 |
return np.concatenate([a, b], axis=0)
|
| 101 |
fade_out = np.linspace(1.0, 0.0, fade_n, endpoint=True, dtype=np.float32)
|
|
@@ -107,9 +123,8 @@ def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_ms: float) ->
|
|
| 107 |
|
| 108 |
def _bpe_prefixes(text: str, lang: str, step_tokens: int) -> Iterable[str]:
|
| 109 |
"""
|
| 110 |
-
|
| 111 |
"""
|
| 112 |
-
# 1) BPE праз VoiceBpeTokenizer, калі падтрымліваецца
|
| 113 |
try:
|
| 114 |
ids = tokenizer.encode(text, lang=lang)
|
| 115 |
n = len(ids)
|
|
@@ -120,7 +135,6 @@ def _bpe_prefixes(text: str, lang: str, step_tokens: int) -> Iterable[str]:
|
|
| 120 |
return
|
| 121 |
except Exception:
|
| 122 |
pass
|
| 123 |
-
# 2) Падстрахоўка: «словы+раздзяляльнікі»
|
| 124 |
pseudo_tokens = re.findall(r"\S+|\s+", text)
|
| 125 |
acc = ""
|
| 126 |
for i in range(0, len(pseudo_tokens), step_tokens):
|
|
@@ -138,7 +152,7 @@ def _native_stream(
|
|
| 138 |
**gen_kwargs,
|
| 139 |
) -> Iterator[np.ndarray]:
|
| 140 |
"""
|
| 141 |
-
Натыўны паток, калі ў форку ёсць model.inference_stream(...)-> iterator of PCM/ndarray.
|
| 142 |
"""
|
| 143 |
sig = inspect.signature(model.inference_stream)
|
| 144 |
call_kwargs = dict(
|
|
@@ -147,18 +161,15 @@ def _native_stream(
|
|
| 147 |
gpt_cond_latent=gpt_cond_latent,
|
| 148 |
speaker_embedding=speaker_embedding,
|
| 149 |
)
|
| 150 |
-
# Перадаём тыповыя параметры генерацыі, калі яны ёсць у подпісе
|
| 151 |
for k in ("temperature", "length_penalty", "repetition_penalty", "top_k", "top_p"):
|
| 152 |
if k in gen_kwargs and k in sig.parameters:
|
| 153 |
call_kwargs[k] = gen_kwargs[k]
|
| 154 |
-
# Памер стрим-чанка (секунды), калі ёсць у подпісе
|
| 155 |
if "stream_chunk_size_s" in sig.parameters:
|
| 156 |
call_kwargs["stream_chunk_size_s"] = float(gen_kwargs.get("min_buffer_s", MIN_BUFFER_S))
|
| 157 |
|
| 158 |
generator = model.inference_stream(**call_kwargs)
|
| 159 |
for out in generator:
|
| 160 |
-
|
| 161 |
-
yield arr.astype(np.float32, copy=False)
|
| 162 |
|
| 163 |
def _fallback_incremental(
|
| 164 |
model: Xtts,
|
|
@@ -175,7 +186,7 @@ def _fallback_incremental(
|
|
| 175 |
emitted = 0
|
| 176 |
for prefix in _bpe_prefixes(text, language, tokens_per_step):
|
| 177 |
with torch.no_grad():
|
| 178 |
-
|
| 179 |
text=prefix,
|
| 180 |
language=language,
|
| 181 |
gpt_cond_latent=gpt_cond_latent,
|
|
@@ -185,7 +196,8 @@ def _fallback_incremental(
|
|
| 185 |
repetition_penalty=gen_kwargs.get("repetition_penalty", 10.0),
|
| 186 |
top_k=gen_kwargs.get("top_k", 10),
|
| 187 |
top_p=gen_kwargs.get("top_p", 0.3),
|
| 188 |
-
)
|
|
|
|
| 189 |
new_part = wav[emitted:]
|
| 190 |
emitted = wav.size
|
| 191 |
if new_part.size:
|
|
@@ -193,8 +205,7 @@ def _fallback_incremental(
|
|
| 193 |
|
| 194 |
class NewTTSGenerationMixin:
|
| 195 |
"""
|
| 196 |
-
«Як у transformers-stream-generator»: дадаём generate() і sample_stream()
|
| 197 |
-
у мадэль Xtts. return: або поўны wav (ndarray), або ітэратар чанкаў (ndarray).
|
| 198 |
"""
|
| 199 |
|
| 200 |
@torch.inference_mode()
|
|
@@ -210,12 +221,7 @@ class NewTTSGenerationMixin:
|
|
| 210 |
tokens_per_step: int = TOKENS_PER_STEP,
|
| 211 |
**gen_kwargs,
|
| 212 |
):
|
| 213 |
-
"""
|
| 214 |
-
Калі do_stream=False -> вяртае поўны wav (ndarray).
|
| 215 |
-
Калі do_stream=True -> вяртае генератар чанкаў wav (Iterator[np.ndarray]).
|
| 216 |
-
"""
|
| 217 |
assert isinstance(text, str) and text.strip(), "text is required"
|
| 218 |
-
# Блакіруючы рэжым — адным махам
|
| 219 |
if not do_stream:
|
| 220 |
out = self.inference(
|
| 221 |
text=text,
|
|
@@ -228,9 +234,8 @@ class NewTTSGenerationMixin:
|
|
| 228 |
top_k=gen_kwargs.get("top_k", 10),
|
| 229 |
top_p=gen_kwargs.get("top_p", 0.3),
|
| 230 |
)
|
| 231 |
-
return
|
| 232 |
|
| 233 |
-
# Стрымінгавы рэжым — як у прыкладзе: асобны генератар
|
| 234 |
return self.sample_stream(
|
| 235 |
text=text,
|
| 236 |
language=language,
|
|
@@ -253,27 +258,20 @@ class NewTTSGenerationMixin:
|
|
| 253 |
tokens_per_step: int = TOKENS_PER_STEP,
|
| 254 |
**gen_kwargs,
|
| 255 |
) -> Iterator[np.ndarray]:
|
| 256 |
-
"""
|
| 257 |
-
Вяртае генератар чанкаў wav. Стараемся даваць маленькія кавалкі як мага часцей.
|
| 258 |
-
"""
|
| 259 |
-
# 1) Калі ёсць натыўны паток — проста перасылаем яго
|
| 260 |
if hasattr(self, "inference_stream"):
|
| 261 |
for chunk in _native_stream(
|
| 262 |
self, text, language, gpt_cond_latent, speaker_embedding, min_buffer_s=min_buffer_s, **gen_kwargs
|
| 263 |
):
|
| 264 |
-
# тут мы не чакаем — верхні слой сам злімітуе плынь буферам
|
| 265 |
yield chunk
|
| 266 |
return
|
| 267 |
|
| 268 |
-
# 2) Інакш — інкрементальны fallback па токенах
|
| 269 |
for chunk in _fallback_incremental(
|
| 270 |
self, text, language, gpt_cond_latent, speaker_embedding, tokens_per_step, **gen_kwargs
|
| 271 |
):
|
| 272 |
yield chunk
|
| 273 |
|
| 274 |
-
|
| 275 |
def init_stream_support():
|
| 276 |
-
"""Прапатчыць Xtts, дадаўшы generate/sample_stream
|
| 277 |
Xtts.generate = NewTTSGenerationMixin.generate
|
| 278 |
Xtts.sample_stream = NewTTSGenerationMixin.sample_stream
|
| 279 |
|
|
@@ -289,19 +287,18 @@ def _yield_buffered_chunks_for_gradio(
|
|
| 289 |
target_s: float = MIN_BUFFER_S,
|
| 290 |
) -> Iterator[Tuple[int, np.ndarray]]:
|
| 291 |
"""
|
| 292 |
-
Назапашваем невялікі буфер (~50 ms), каб плэер Gradio паспеў «дагуляць»
|
| 293 |
-
і не накладваў наступны чанк.
|
| 294 |
"""
|
| 295 |
target_samples = _seconds_to_samples(target_s, sr)
|
| 296 |
buf = np.zeros((0,), dtype=np.float32)
|
| 297 |
for c in chunks:
|
| 298 |
-
c =
|
| 299 |
if c.size == 0:
|
| 300 |
continue
|
| 301 |
if buf.size == 0:
|
| 302 |
buf = c
|
| 303 |
else:
|
| 304 |
-
buf = _crossfade_concat(buf, c, sr,
|
| 305 |
if buf.size >= target_samples:
|
| 306 |
yield (sr, buf)
|
| 307 |
time.sleep(buf.size / float(sr))
|
|
@@ -311,14 +308,13 @@ def _yield_buffered_chunks_for_gradio(
|
|
| 311 |
time.sleep(buf.size / float(sr))
|
| 312 |
|
| 313 |
# ---------------------------------------------------------
|
| 314 |
-
# 6) Асноўная функцыя TTS для Gradio (
|
| 315 |
# ---------------------------------------------------------
|
| 316 |
@spaces.GPU(duration=60)
|
| 317 |
def text_to_speech(belarusian_story, speaker_audio_file=None):
|
| 318 |
"""
|
| 319 |
Streaming для gr.Audio:
|
| 320 |
-
-
|
| 321 |
-
- аддаём невялікія чанкі (sr, chunk) з мінімальнай затрымкай;
|
| 322 |
- у фінале — шлях да поўнага WAV.
|
| 323 |
"""
|
| 324 |
if not belarusian_story or str(belarusian_story).strip() == "":
|
|
@@ -342,7 +338,6 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
|
|
| 342 |
except Exception as e:
|
| 343 |
raise gr.Error(f"Памылка пры атрыманні латэнтаў голасу: {e}")
|
| 344 |
|
| 345 |
-
# --- Генератар па аналагіі з .generate(... do_stream=True) ---
|
| 346 |
generator = XTTS_MODEL.generate(
|
| 347 |
text=str(belarusian_story).strip(),
|
| 348 |
do_stream=True,
|
|
@@ -358,20 +353,18 @@ def text_to_speech(belarusian_story, speaker_audio_file=None):
|
|
| 358 |
top_p=0.3,
|
| 359 |
)
|
| 360 |
|
| 361 |
-
|
| 362 |
-
full_audio_chunks: list[np.ndarray] = []
|
| 363 |
|
| 364 |
-
# Аддаём у Gradio дробныя порцыі з невялікім буферам і рэальным «сном»
|
| 365 |
for sr, chunk in _yield_buffered_chunks_for_gradio(generator, sampling_rate, MIN_BUFFER_S):
|
| 366 |
full_audio_chunks.append(chunk)
|
| 367 |
yield (sr, chunk)
|
| 368 |
|
| 369 |
-
# Гатовы поўны WAV
|
| 370 |
if not full_audio_chunks:
|
| 371 |
raise gr.Error("Нічога не згенеравана. Праверце ўваходныя даныя або лагі.")
|
|
|
|
| 372 |
full_audio = full_audio_chunks[0]
|
| 373 |
for i in range(1, len(full_audio_chunks)):
|
| 374 |
-
full_audio = _crossfade_concat(full_audio, full_audio_chunks[i], sampling_rate,
|
| 375 |
|
| 376 |
try:
|
| 377 |
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
|
@@ -426,7 +419,7 @@ analytics_script = """
|
|
| 426 |
with gr.Blocks() as demo:
|
| 427 |
gr.HTML(analytics_script)
|
| 428 |
gr.Interface(
|
| 429 |
-
fn=text_to_speech,
|
| 430 |
inputs=[
|
| 431 |
gr.Textbox(lines=5, label="Тэкст на беларускай мове"),
|
| 432 |
gr.Audio(
|
|
@@ -436,7 +429,7 @@ with gr.Blocks() as demo:
|
|
| 436 |
),
|
| 437 |
],
|
| 438 |
outputs=gr.Audio(
|
| 439 |
-
type="filepath",
|
| 440 |
label="Згенераванае аўдыя (па токенах, мінімальная затрымка)",
|
| 441 |
autoplay=True,
|
| 442 |
),
|
|
|
|
| 1 |
# Калі запускаеце ў чыстым асяроддзі (раскаментуйце):
|
| 2 |
+
# !pip install -q gradio spaces huggingface_hub torch scipy gitpython
|
| 3 |
|
| 4 |
import os
|
| 5 |
import sys
|
|
|
|
| 8 |
import tempfile
|
| 9 |
import subprocess
|
| 10 |
import inspect
|
| 11 |
+
from typing import Iterator, Iterable, Optional, Tuple, Any, List
|
| 12 |
|
| 13 |
import spaces
|
| 14 |
import gradio as gr
|
|
|
|
| 76 |
XTTS_MODEL.tokenizer = tokenizer
|
| 77 |
|
| 78 |
# =========================================================
|
| 79 |
+
# 4) «Як у transformers-stream-generator»: патч Xtts.generate/sample_stream
|
| 80 |
# =========================================================
|
| 81 |
|
| 82 |
# Канстанты латэнтнасці/буферу
|
| 83 |
MIN_BUFFER_S = 0.050 # ~50 ms цэлявы буфер для аўдыя
|
| 84 |
+
FADE_S = 0.008 # кароткі cross-fade паміж чанкамі
|
| 85 |
TOKENS_PER_STEP = 4 # памер кроку «токенаў» у fallback (BPE/субсловы)
|
| 86 |
|
| 87 |
def _seconds_to_samples(sec: float, sr: int) -> int:
|
| 88 |
return max(1, int(sec * sr))
|
| 89 |
|
| 90 |
+
def _to_np_audio(x) -> np.ndarray:
    """Convert an audio container to a flat float32 NumPy array on the CPU.

    Args:
        x: One of
            * a dict holding the waveform under the ``"wav"`` key,
            * a ``torch.Tensor`` on any device and of any dtype,
            * anything ``np.asarray`` can convert (ndarray, list, scalar).

    Returns:
        A 1-D ``np.float32`` array. Multi-dimensional inputs are flattened in
        C order; scalar / 0-d inputs come back with shape ``(1,)`` so callers
        can always slice the result and read ``.size``.

    Note:
        Integer input is cast to float32 as-is; it is not rescaled.
    """
    # Unwrap dict-shaped outputs that carry the waveform under "wav".
    if isinstance(x, dict) and "wav" in x:
        x = x["wav"]

    if isinstance(x, torch.Tensor):
        # detach() drops autograd tracking so .numpy() is allowed; the single
        # .to() call moves the data to the CPU and casts to float32 at once.
        x = x.detach().to(device="cpu", dtype=torch.float32).numpy()

    # Always flatten: this also turns 0-d values into shape (1,), which the
    # previous `ndim > 1` check let through unflattened.
    return np.asarray(x, dtype=np.float32).reshape(-1)
|
| 105 |
+
|
| 106 |
+
def _crossfade_concat(a: np.ndarray, b: np.ndarray, sr: int, fade_s: float) -> np.ndarray:
|
| 107 |
"""Плыўна зліць два кавалкі без клікаў."""
|
| 108 |
if a.size == 0:
|
| 109 |
return b.astype(np.float32, copy=False)
|
|
|
|
| 111 |
return a.astype(np.float32, copy=False)
|
| 112 |
a = a.astype(np.float32, copy=False)
|
| 113 |
b = b.astype(np.float32, copy=False)
|
| 114 |
+
fade_n = min(_seconds_to_samples(fade_s, sr), a.size, b.size)
|
| 115 |
if fade_n <= 1:
|
| 116 |
return np.concatenate([a, b], axis=0)
|
| 117 |
fade_out = np.linspace(1.0, 0.0, fade_n, endpoint=True, dtype=np.float32)
|
|
|
|
| 123 |
|
| 124 |
def _bpe_prefixes(text: str, lang: str, step_tokens: int) -> Iterable[str]:
|
| 125 |
"""
|
| 126 |
+
Прэфіксы па BPE/субсловах; калі encode/decode недаступны — псэўда-токены (словы+прабелы).
|
| 127 |
"""
|
|
|
|
| 128 |
try:
|
| 129 |
ids = tokenizer.encode(text, lang=lang)
|
| 130 |
n = len(ids)
|
|
|
|
| 135 |
return
|
| 136 |
except Exception:
|
| 137 |
pass
|
|
|
|
| 138 |
pseudo_tokens = re.findall(r"\S+|\s+", text)
|
| 139 |
acc = ""
|
| 140 |
for i in range(0, len(pseudo_tokens), step_tokens):
|
|
|
|
| 152 |
**gen_kwargs,
|
| 153 |
) -> Iterator[np.ndarray]:
|
| 154 |
"""
|
| 155 |
+
Натыўны паток, калі ў форку ёсць model.inference_stream(...)-> iterator of PCM/ndarray/torch.Tensor.
|
| 156 |
"""
|
| 157 |
sig = inspect.signature(model.inference_stream)
|
| 158 |
call_kwargs = dict(
|
|
|
|
| 161 |
gpt_cond_latent=gpt_cond_latent,
|
| 162 |
speaker_embedding=speaker_embedding,
|
| 163 |
)
|
|
|
|
| 164 |
for k in ("temperature", "length_penalty", "repetition_penalty", "top_k", "top_p"):
|
| 165 |
if k in gen_kwargs and k in sig.parameters:
|
| 166 |
call_kwargs[k] = gen_kwargs[k]
|
|
|
|
| 167 |
if "stream_chunk_size_s" in sig.parameters:
|
| 168 |
call_kwargs["stream_chunk_size_s"] = float(gen_kwargs.get("min_buffer_s", MIN_BUFFER_S))
|
| 169 |
|
| 170 |
generator = model.inference_stream(**call_kwargs)
|
| 171 |
for out in generator:
|
| 172 |
+
yield _to_np_audio(out)
|
|
|
|
| 173 |
|
| 174 |
def _fallback_incremental(
|
| 175 |
model: Xtts,
|
|
|
|
| 186 |
emitted = 0
|
| 187 |
for prefix in _bpe_prefixes(text, language, tokens_per_step):
|
| 188 |
with torch.no_grad():
|
| 189 |
+
out = model.inference(
|
| 190 |
text=prefix,
|
| 191 |
language=language,
|
| 192 |
gpt_cond_latent=gpt_cond_latent,
|
|
|
|
| 196 |
repetition_penalty=gen_kwargs.get("repetition_penalty", 10.0),
|
| 197 |
top_k=gen_kwargs.get("top_k", 10),
|
| 198 |
top_p=gen_kwargs.get("top_p", 0.3),
|
| 199 |
+
)
|
| 200 |
+
wav = _to_np_audio(out)
|
| 201 |
new_part = wav[emitted:]
|
| 202 |
emitted = wav.size
|
| 203 |
if new_part.size:
|
|
|
|
| 205 |
|
| 206 |
class NewTTSGenerationMixin:
|
| 207 |
"""
|
| 208 |
+
«Як у transformers-stream-generator»: дадаём generate() і sample_stream() у Xtts.
|
|
|
|
| 209 |
"""
|
| 210 |
|
| 211 |
@torch.inference_mode()
|
|
|
|
| 221 |
tokens_per_step: int = TOKENS_PER_STEP,
|
| 222 |
**gen_kwargs,
|
| 223 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
assert isinstance(text, str) and text.strip(), "text is required"
|
|
|
|
| 225 |
if not do_stream:
|
| 226 |
out = self.inference(
|
| 227 |
text=text,
|
|
|
|
| 234 |
top_k=gen_kwargs.get("top_k", 10),
|
| 235 |
top_p=gen_kwargs.get("top_p", 0.3),
|
| 236 |
)
|
| 237 |
+
return _to_np_audio(out)
|
| 238 |
|
|
|
|
| 239 |
return self.sample_stream(
|
| 240 |
text=text,
|
| 241 |
language=language,
|
|
|
|
| 258 |
tokens_per_step: int = TOKENS_PER_STEP,
|
| 259 |
**gen_kwargs,
|
| 260 |
) -> Iterator[np.ndarray]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
if hasattr(self, "inference_stream"):
|
| 262 |
for chunk in _native_stream(
|
| 263 |
self, text, language, gpt_cond_latent, speaker_embedding, min_buffer_s=min_buffer_s, **gen_kwargs
|
| 264 |
):
|
|
|
|
| 265 |
yield chunk
|
| 266 |
return
|
| 267 |
|
|
|
|
| 268 |
for chunk in _fallback_incremental(
|
| 269 |
self, text, language, gpt_cond_latent, speaker_embedding, tokens_per_step, **gen_kwargs
|
| 270 |
):
|
| 271 |
yield chunk
|
| 272 |
|
|
|
|
| 273 |
def init_stream_support():
|
| 274 |
+
"""Прапатчыць Xtts, дадаўшы generate/sample_stream."""
|
| 275 |
Xtts.generate = NewTTSGenerationMixin.generate
|
| 276 |
Xtts.sample_stream = NewTTSGenerationMixin.sample_stream
|
| 277 |
|
|
|
|
| 287 |
target_s: float = MIN_BUFFER_S,
|
| 288 |
) -> Iterator[Tuple[int, np.ndarray]]:
|
| 289 |
"""
|
| 290 |
+
Назапашваем невялікі буфер (~50 ms), каб плэер Gradio паспеў «дагуляць».
|
|
|
|
| 291 |
"""
|
| 292 |
target_samples = _seconds_to_samples(target_s, sr)
|
| 293 |
buf = np.zeros((0,), dtype=np.float32)
|
| 294 |
for c in chunks:
|
| 295 |
+
c = _to_np_audio(c)
|
| 296 |
if c.size == 0:
|
| 297 |
continue
|
| 298 |
if buf.size == 0:
|
| 299 |
buf = c
|
| 300 |
else:
|
| 301 |
+
buf = _crossfade_concat(buf, c, sr, FADE_S)
|
| 302 |
if buf.size >= target_samples:
|
| 303 |
yield (sr, buf)
|
| 304 |
time.sleep(buf.size / float(sr))
|
|
|
|
| 308 |
time.sleep(buf.size / float(sr))
|
| 309 |
|
| 310 |
# ---------------------------------------------------------
|
| 311 |
+
# 6) Асноўная функцыя TTS для Gradio (у стылі .generate(do_stream=True))
|
| 312 |
# ---------------------------------------------------------
|
| 313 |
@spaces.GPU(duration=60)
|
| 314 |
def text_to_speech(belarusian_story, speaker_audio_file=None):
|
| 315 |
"""
|
| 316 |
Streaming для gr.Audio:
|
| 317 |
+
- model.generate(..., do_stream=True) -> чанкі (sr, chunk) з мінімальнай затрымкай;
|
|
|
|
| 318 |
- у фінале — шлях да поўнага WAV.
|
| 319 |
"""
|
| 320 |
if not belarusian_story or str(belarusian_story).strip() == "":
|
|
|
|
| 338 |
except Exception as e:
|
| 339 |
raise gr.Error(f"Памылка пры атрыманні латэнтаў голасу: {e}")
|
| 340 |
|
|
|
|
| 341 |
generator = XTTS_MODEL.generate(
|
| 342 |
text=str(belarusian_story).strip(),
|
| 343 |
do_stream=True,
|
|
|
|
| 353 |
top_p=0.3,
|
| 354 |
)
|
| 355 |
|
| 356 |
+
full_audio_chunks: List[np.ndarray] = []
|
|
|
|
| 357 |
|
|
|
|
| 358 |
for sr, chunk in _yield_buffered_chunks_for_gradio(generator, sampling_rate, MIN_BUFFER_S):
|
| 359 |
full_audio_chunks.append(chunk)
|
| 360 |
yield (sr, chunk)
|
| 361 |
|
|
|
|
| 362 |
if not full_audio_chunks:
|
| 363 |
raise gr.Error("Нічога не згенеравана. Праверце ўваходныя даныя або лагі.")
|
| 364 |
+
|
| 365 |
full_audio = full_audio_chunks[0]
|
| 366 |
for i in range(1, len(full_audio_chunks)):
|
| 367 |
+
full_audio = _crossfade_concat(full_audio, full_audio_chunks[i], sampling_rate, FADE_S)
|
| 368 |
|
| 369 |
try:
|
| 370 |
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
|
|
|
| 419 |
with gr.Blocks() as demo:
|
| 420 |
gr.HTML(analytics_script)
|
| 421 |
gr.Interface(
|
| 422 |
+
fn=text_to_speech,
|
| 423 |
inputs=[
|
| 424 |
gr.Textbox(lines=5, label="Тэкст на беларускай мове"),
|
| 425 |
gr.Audio(
|
|
|
|
| 429 |
),
|
| 430 |
],
|
| 431 |
outputs=gr.Audio(
|
| 432 |
+
type="filepath",
|
| 433 |
label="Згенераванае аўдыя (па токенах, мінімальная затрымка)",
|
| 434 |
autoplay=True,
|
| 435 |
),
|