archivartaunik commited on
Commit
0485426
·
verified ·
1 Parent(s): a31f67a

Upload 8 files

Browse files
Files changed (8) hide show
  1. README.md +47 -0
  2. app.py +7 -0
  3. assistant.py +109 -0
  4. config.py +26 -0
  5. gemini_client.py +72 -0
  6. requirements.txt +7 -0
  7. tts_model.py +360 -0
  8. ui.py +506 -0
README.md ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Belarusian XTTS + Gemini Voice Assistant (Hugging Face Space)
2
+
3
+ Праект для запуску на **Hugging Face Spaces** (тып: Gradio, **ZeroGPU**),
4
+ які аб'ядноўвае:
5
+ - форк Coqui XTTS ад **tuteishygpt** (`coqui-ai-TTS`)
6
+ - Belarusian XTTS-мадэль з Hugging Face
7
+ - Google Gemini 2.5 Flash-Lite
8
+ - Gradio UI з аўтаматычным мікрафонам, стрымінгам і бардж-ін.
9
+
10
+ ## Як разгарнуць на Spaces
11
+
12
+ 1. Ствары Space:
13
+ - Space type: **Gradio**
14
+ - Hardware: **ZeroGPU**
15
+ 2. Загрузі ў рэпазіторый усе файлы з гэтага архіва:
16
+ - `app.py`
17
+ - `ui.py`
18
+ - `assistant.py`
19
+ - `gemini_client.py`
20
+ - `tts_model.py`
21
+ - `config.py`
22
+ - `requirements.txt`
23
+ - `README.md`
24
+ 3. У наладах Space:
25
+ - зайдзі ў **Settings → Variables and secrets**
26
+ - дадай:
27
+ - `GEMINI_API_KEY` — твай API-ключ Google Gemini
28
+
29
+ Пасля аўтаматычнага `pip install -r requirements.txt` Space павінен запусціць `app.py`,
30
+ дзе ўжо створаны Gradio Blocks.
31
+
32
+ ## Лакальны запуск (не абавязкова)
33
+
34
+ Калі хочаш праганяць лакальна перад загрузкай у Space:
35
+
36
+ ```bash
37
+ python -m venv .venv
38
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
39
+ pip install --upgrade pip
40
+ pip install -r requirements.txt
41
+
42
+ export GEMINI_API_KEY="ТВОЙ_API_KEY" # або праз .env / PowerShell
43
+ python app.py
44
+ ```
45
+
46
+ На ZeroGPU код будзе працаваць на CPU (калі `torch.cuda.is_available()` = False),
47
+ але лагіка для GPU таксама застаецца на выпадак іншых тыпаў машын.
app.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
from ui import create_app

# Gradio Blocks app, built once at import time so HF Spaces can serve it.
demo = create_app()


def _run_local() -> None:
    """Start the app locally with the request queue enabled (streaming needs it)."""
    demo.queue().launch()


if __name__ == "__main__":
    _run_local()
assistant.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import tempfile
4
+ from typing import Generator, Tuple, Optional
5
+
6
+ import gradio as gr
7
+
8
+ from gemini_client import gemini_answer_from_audio, reset_gemini_dialog
9
+ from tts_model import text_to_speech
10
+
11
+
12
def voice_assistant_from_mic(
    mic_audio_path: str,
    speaker_audio: Optional[str],
) -> Generator[Tuple[str, Optional[str]], None, None]:
    """
    Pipeline: recorded mic file -> Gemini reply text -> streamed TTS chunks.

    Yields whatever text_to_speech yields (b64 audio, metrics-or-None pairs).
    Raises gr.Error when there is no recording or Gemini returned nothing.
    """
    if not mic_audio_path:
        raise gr.Error("Спачатку запішы фразу з мікрафона 🙂")

    reply_text = gemini_answer_from_audio(mic_audio_path)
    if not reply_text:
        raise gr.Error("Gemini нічога не адказаў або ўзнікла памылка 🙃")

    # Delegate streaming straight to the TTS generator.
    yield from text_to_speech(
        text_input=reply_text,
        speaker_audio=speaker_audio,
    )
31
+
32
+
33
def safe_voice_assistant(mic_path: str, ref_voice: Optional[str]):
    """
    Streaming wrapper around voice_assistant_from_mic that always yields a
    (stream, log) pair, whatever shape the inner generator produces.
    Yields a single no-op update pair when there is no mic input.
    """
    if not mic_path:
        yield gr.update(), gr.update()
        return

    def _as_pair(value):
        # Normalise an arbitrary yielded item to exactly two elements:
        # sequences are truncated/padded with None, scalars become (x, None).
        if isinstance(value, (list, tuple)):
            padded = list(value[:2]) + [None, None]
            return padded[0], padded[1]
        return value, None

    for produced in voice_assistant_from_mic(mic_path, ref_voice):
        yield _as_pair(produced)
52
+
53
+
54
def handle_b64_wav(b64_audio: str, ref_voice: Optional[str]):
    """
    Accept a base64-encoded WAV from the frontend, persist it to a temporary
    .wav file and stream it through safe_voice_assistant, relaying every
    yielded item. The temp file is removed when streaming finishes.
    """
    print("handle_b64_wav: called, has_audio =", bool(b64_audio), flush=True)

    if not b64_audio:
        print("handle_b64_wav: empty audio, returning no-op", flush=True)
        yield gr.update(), gr.update()
        return

    try:
        audio_bytes = base64.b64decode(b64_audio)
        print("handle_b64_wav: decoded bytes =", len(audio_bytes), flush=True)
    except Exception as e:
        # Bad payload from the browser: log and yield a no-op update.
        print("handle_b64_wav: ERROR in b64decode:", e, flush=True)
        yield gr.update(), gr.update()
        return

    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    try:
        tmp.write(audio_bytes)
    finally:
        tmp.close()

    try:
        yield from safe_voice_assistant(tmp.name, ref_voice)
    finally:
        # Best-effort cleanup of the temporary recording.
        try:
            os.remove(tmp.name)
        except OSError:
            pass
86
+
87
+
88
def reset_session() -> str:
    """
    Clear the Gemini dialog history and return status-panel HTML confirming
    that a fresh conversation has started.
    """
    try:
        reset_gemini_dialog()
    except NameError:
        # Defensive: keep the UI responsive even if the helper is absent.
        pass

    status_html = """
    <div id='wa-log' style='font-family:system-ui;font-size:14px;'>
      <div style='display:flex;align-items:center;gap:10px;'>
        <div style='width:10px;height:10px;border-radius:999px;background:#6366F1;box-shadow:0 0 0 4px #6366F122;'></div>
        <div>
          <div style='font-weight:600;'>🧹 Новая размова</div>
          <div style='font-size:12px;opacity:0.8;'>
            Гісторыя размовы ачышчана. Можаш пачаць з чыстага ліста 🙂
          </div>
        </div>
      </div>
    </div>
    """
    return status_html
config.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# ---- Coqui / HF ----
# Hugging Face repo holding the Belarusian XTTS checkpoint and assets.
REPO_ID = "archivartaunik/BE_XTTS_V2_10ep250k"

# Local directory the model files are downloaded into.
MODEL_DIR = os.path.abspath("./model")
CHECKPOINT_FILE = os.path.join(MODEL_DIR, "model.pth")      # model weights
CONFIG_FILE = os.path.join(MODEL_DIR, "config.json")        # XTTS config
VOCAB_FILE = os.path.join(MODEL_DIR, "vocab.json")          # tokenizer vocab
DEFAULT_VOICE_FILE = os.path.join(MODEL_DIR, "voice.wav")   # default reference voice

# ---- Streaming / TTS ----
INITIAL_MIN_BUFFER_S = 0.20  # first audio buffer, seconds
MIN_BUFFER_S = 0.05          # subsequent buffers, seconds
FADE_S = 0.005               # fade length, seconds (not referenced in tts_model.py — TODO confirm use)
ENABLE_TEXT_SPLITTING = True  # split long input text into sentence chunks before synthesis
FIRST_SEGMENT_LIMIT = 160    # characters in the first (fast-start) segment

# ---- Gemini ----
# NOTE(review): placeholder default means a missing env var only fails at
# request time; never commit a real key here.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "PASTE_YOUR_KEY_HERE")
GEMINI_MODEL = "gemini-2.5-flash-lite"

# System prompt (Belarusian): "You are a Belarusian-speaking voice
# assistant. Answer briefly, naturally, and in Belarusian."
DEFAULT_SYSTEM_PROMPT = (
    "Ты галасавы беларускамоўны асістэнт. "
    "Адказвай коратка, натуральна і па-беларуску. "
)
gemini_client.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List

from google import genai
from google.genai import types

from config import GEMINI_API_KEY, GEMINI_MODEL, DEFAULT_SYSTEM_PROMPT

# Module-level conversation memory: alternating user/model turns that are
# replayed with every request so Gemini keeps context between questions.
DIALOG_HISTORY: List[types.Content] = []

# Warn (at import time) when the API key is missing or still the placeholder.
if not GEMINI_API_KEY or GEMINI_API_KEY == "PASTE_YOUR_KEY_HERE":
    print("⚠️ Увага: задай GEMINI_API_KEY перад выкарыстаннем голасавага асістэнта.")

# Single shared client instance, created once at import time.
gemini_client = genai.Client(api_key=GEMINI_API_KEY)
14
+
15
+
16
def reset_gemini_dialog() -> str:
    """Drop all accumulated Gemini conversation turns and report success."""
    global DIALOG_HISTORY
    # Rebind (rather than mutate) so any in-flight reference to the old
    # history list is left untouched.
    DIALOG_HISTORY = []
    return "🧹 Гісторыя размовы Gemini ачышчана. Можаш пачынаць новую размову."
23
+
24
+
25
def gemini_answer_from_audio(
    audio_path: str,
    system_prompt: str = DEFAULT_SYSTEM_PROMPT,
) -> str:
    """
    Send a recorded audio file to Gemini and return its text reply.

    The request replays the whole DIALOG_HISTORY for context; on success
    both the user audio turn and the model reply are appended to it.
    Returns "" on empty input, API failure, or an empty model reply —
    in those cases the history is left untouched.
    """
    global DIALOG_HISTORY

    if not audio_path:
        return ""

    # Raw file bytes are sent inline with a hard-coded WAV mime type;
    # assumes the recording really is WAV — TODO confirm for other formats.
    with open(audio_path, "rb") as f:
        audio_bytes = f.read()

    user_turn = types.Content(
        role="user",
        parts=[
            types.Part.from_bytes(
                data=audio_bytes,
                mime_type="audio/wav",
            ),
        ],
    )

    try:
        response = gemini_client.models.generate_content(
            model=GEMINI_MODEL,
            contents=[*DIALOG_HISTORY, user_turn],
            config=types.GenerateContentConfig(
                system_instruction=system_prompt,
            ),
        )
    except Exception as e:
        # Best-effort: a failed API call degrades to an empty answer.
        print("Памылка Gemini:", e)
        return ""

    text = (getattr(response, "text", "") or "").strip()
    if not text:
        return ""

    assistant_turn = types.Content(
        role="model",
        parts=[types.Part.from_text(text=text)],
    )
    # NOTE(review): history grows without bound across a long session.
    DIALOG_HISTORY.extend([user_turn, assistant_turn])

    return text
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ git+https://github.com/tuteishygpt/coqui-ai-TTS.git#egg=TTS
2
+ torch
3
+ numpy
4
+ gradio
5
+ huggingface_hub
6
+ scipy
7
+ google-genai
tts_model.py ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import re
import time
import json
import base64
import pathlib
import hashlib
from dataclasses import dataclass
from typing import Iterable, Iterator, Optional, Tuple, List, Any

import numpy as np
import torch
import gradio as gr
from huggingface_hub import hf_hub_download

# IMPORTANT: this must be the TTS fork from GitHub:
# git+https://github.com/tuteishygpt/coqui-ai-TTS.git#egg=TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence

from config import (
    REPO_ID,
    MODEL_DIR,
    CHECKPOINT_FILE,
    CONFIG_FILE,
    VOCAB_FILE,
    DEFAULT_VOICE_FILE,
    INITIAL_MIN_BUFFER_S,
    MIN_BUFFER_S,
    ENABLE_TEXT_SPLITTING,
    FIRST_SEGMENT_LIMIT,
)

# Keep native (BLAS/OpenMP) thread pools minimal for stability on ZeroGPU / CPU.
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")

# ---------------------------------------------------------
# 1) Download model files (skipped for files already present locally)
# ---------------------------------------------------------
os.makedirs(MODEL_DIR, exist_ok=True)
for fname in ("model.pth", "config.json", "vocab.json", "voice.wav"):
    fpath = os.path.join(MODEL_DIR, fname)
    if not os.path.exists(fpath):
        hf_hub_download(REPO_ID, filename=fname, local_dir=MODEL_DIR)

# ---------------------------------------------------------
# 2) Initialise the XTTS model
# ---------------------------------------------------------
config = XttsConfig()
config.load_json(CONFIG_FILE)
XTTS_MODEL = Xtts.init_from_config(config)
XTTS_MODEL.load_checkpoint(
    config,
    checkpoint_path=CHECKPOINT_FILE,
    vocab_path=VOCAB_FILE,
    use_deepspeed=False,
)

# Prefer GPU when available; on ZeroGPU this may resolve to "cpu".
device = "cuda:0" if torch.cuda.is_available() else "cpu"

torch.set_num_threads(1)
if device.startswith("cuda"):
    # TF32 + cudnn benchmark trade bit-exactness for speed on Ampere+ GPUs.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
    try:
        torch.set_float32_matmul_precision("high")
    except Exception:
        # Older torch versions lack this API; ignore.
        pass

XTTS_MODEL.to(device).eval()
# Output sample rate as declared by the model config.
sampling_rate = int(XTTS_MODEL.config.audio["sample_rate"])

# Replace the checkpoint tokenizer with one built from the downloaded vocab.
tokenizer = VoiceBpeTokenizer(vocab_file=VOCAB_FILE)
XTTS_MODEL.tokenizer = tokenizer

# ---------------------------------------------------------
# 3) Latents cache (persisted on disk between restarts)
# ---------------------------------------------------------
PERSIST_LATENTS_DIR = pathlib.Path("./latents_cache")
PERSIST_LATENTS_DIR.mkdir(parents=True, exist_ok=True)
85
+
86
+
87
@dataclass(frozen=True)
class LatentsMeta:
    """Cache-key ingredients: any change in these invalidates cached latents."""
    model_id: str          # HF repo id of the model the latents belong to
    gpt_cond_len: int      # conditioning length taken from the XTTS config
    max_ref_len: int       # max reference-audio length from the XTTS config
    sound_norm_refs: bool  # whether reference audio normalisation was enabled


# In-process cache: key -> (gpt_cond_latent, speaker_embedding), kept on CPU.
LATENT_CACHE: dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
# Per-device copies of the same latents, keyed by (cache_key, device string).
GPU_LATENT_CACHE: dict[Tuple[str, str], Tuple[torch.Tensor, torch.Tensor]] = {}
97
+
98
+
99
def _latents_key(path: Optional[str], meta: LatentsMeta) -> str:
    """
    Build a stable cache key for a reference-voice file plus model settings.

    The key is an MD5 hex digest over the file identity (absolute path,
    mtime, size — or the literal "default_voice" when no usable path is
    given) joined with the JSON-serialised LatentsMeta fields.
    """
    if path and os.path.exists(path):
        identity = f"{os.path.abspath(path)}:{os.path.getmtime(path)}:{os.path.getsize(path)}"
    else:
        identity = "default_voice"

    payload = identity + "|" + json.dumps(meta.__dict__, sort_keys=True)
    return hashlib.md5(payload.encode("utf-8")).hexdigest()
108
+
109
+
110
def _latents_for(path: Optional[str], *, to_device: Optional[str] = None) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Return (gpt_cond_latent, speaker_embedding) for a reference-voice file.

    Lookup order: in-memory CPU cache -> on-disk .pt file -> fresh
    computation via XTTS_MODEL.get_conditioning_latents (then persisted to
    disk). When `to_device` is given, per-device copies are memoised in
    GPU_LATENT_CACHE and returned instead of the CPU tensors.
    """
    cfg = XTTS_MODEL.config
    # Key the cache on model settings too, so stale latents are never reused.
    meta = LatentsMeta(
        model_id=REPO_ID,
        gpt_cond_len=cfg.gpt_cond_len,
        max_ref_len=cfg.max_ref_len,
        sound_norm_refs=cfg.sound_norm_refs,
    )
    key = _latents_key(path, meta)
    g, s = LATENT_CACHE.get(key) or (None, None)
    if g is None:
        disk_path = PERSIST_LATENTS_DIR / f"{key}.pt"
        if disk_path.exists():
            data = torch.load(disk_path, map_location="cpu")
            g, s = data["gpt_cond_latent"], data["speaker_embedding"]
        else:
            print(f"Разлік латэнтаў для {path or 'стандартнага голасу'}...")
            with torch.inference_mode():
                g_cpu, s_cpu = XTTS_MODEL.get_conditioning_latents(audio_path=path)
            # Keep the canonical copy on CPU so it can be saved and shared.
            g, s = g_cpu.cpu(), s_cpu.cpu()
            torch.save({"gpt_cond_latent": g, "speaker_embedding": s}, disk_path)
            print("Латэнты захаваны ў кэш.")
        LATENT_CACHE[key] = (g, s)
    if to_device:
        dev_key = (key, to_device)
        if dev_key in GPU_LATENT_CACHE:
            return GPU_LATENT_CACHE[dev_key]
        g, s = g.to(to_device, non_blocking=True), s.to(to_device, non_blocking=True)
        GPU_LATENT_CACHE[dev_key] = (g, s)
    return g, s


# Warm the cache for the bundled default voice at import time (best-effort:
# any failure is logged and startup continues).
try:
    if os.path.exists(DEFAULT_VOICE_FILE):
        _latents_for(DEFAULT_VOICE_FILE, to_device=device)
        print("Стандартны голас паспяхова пракэшаваны.")
except Exception as e:
    print(f"Папярэджанне: не атрымалася папярэдне кэшаваць стандартны голас: {e}")
148
+
149
+ # ---------------------------------------------------------
150
+ # 4) Дапаможнікі па аўдыё
151
+ # ---------------------------------------------------------
152
def _to_np_audio(x: Any) -> np.ndarray:
    """Coerce a model output chunk (dict / torch.Tensor / array-like) to a flat float32 array."""
    if isinstance(x, dict) and "wav" in x:
        x = x["wav"]
    if isinstance(x, torch.Tensor):
        x = x.detach().cpu().float().contiguous().view(-1).numpy()
    return np.asarray(x, dtype=np.float32).reshape(-1)


def _seconds_to_samples(sec: float, sr: int) -> int:
    """Duration in seconds -> sample count (never below 1)."""
    return max(1, int(sec * sr))


def _chunker(
    chunks: Iterable[np.ndarray],
    sr: int,
    initial_target_s: float,
    target_s: float,
) -> Iterator[np.ndarray]:
    """
    Re-buffer a stream of audio chunks: accumulate until roughly
    `initial_target_s` seconds for the first emission, then `target_s`
    thereafter; whatever remains at the end is flushed as-is.
    """
    first_pending = True
    goal = _seconds_to_samples(initial_target_s, sr)
    floor_first = _seconds_to_samples(0.12, sr)  # at least 120 ms before the first emit
    floor_rest = _seconds_to_samples(0.05, sr)   # at least 50 ms afterwards
    pending: list[np.ndarray] = []
    pending_len = 0

    for raw in chunks:
        piece = _to_np_audio(raw)
        if piece.size == 0:
            continue
        pending.append(piece)
        pending_len += piece.size
        required = goal if pending_len < goal else 0
        floor = floor_first if first_pending else floor_rest
        if pending_len >= max(floor, required):
            yield np.concatenate(pending)
            pending, pending_len = [], 0
            if first_pending:
                first_pending = False
                goal = _seconds_to_samples(target_s, sr)

    if pending:
        # Flush the trailing partial buffer.
        yield np.concatenate(pending)


def _pcm_f32_to_b64(x: np.ndarray) -> str:
    """Raw float32 PCM bytes -> base64 ASCII string."""
    return base64.b64encode(x.tobytes()).decode("ascii")
193
+
194
+ # ---------------------------------------------------------
195
+ # 5) Падзел тэксту
196
+ # ---------------------------------------------------------
197
# Sentence terminator: one or more of . ! ? … optionally followed by closing
# quotes/brackets, then whitespace. FIX: the pattern is single-quoted —
# the original double-quoted string was terminated early by the literal '"'
# inside the character class, which is a SyntaxError.
_SENT_END = re.compile(r'([\.\!\?…]+[»")\]]*\s+)')
_WS = re.compile(r"\s+")


def _fast_split(text: str, limit: int) -> List[str]:
    """
    Cheap sentence-based splitter.

    Splits `text` at sentence terminators, then greedily packs sentences
    into chunks of at most `limit` characters; a sentence longer than the
    limit is further broken on whitespace (a single word longer than the
    limit is kept whole). Returns [] for empty/whitespace-only input.
    """
    text = text.strip()
    if not text:
        return []
    parts, start = [], 0
    for m in _SENT_END.finditer(text):
        end = m.end()
        parts.append(text[start:end].strip())
        start = end
    if start < len(text):
        parts.append(text[start:].strip())
    chunks, cur = [], ""
    for s in parts:
        if len(cur) + 1 + len(s) <= limit:
            # The sentence still fits into the current chunk.
            cur = (cur + " " + s).strip() if cur else s
        else:
            if cur:
                chunks.append(cur)
            if len(s) <= limit:
                cur = s
            else:
                # Oversized sentence: re-pack word by word.
                w = _WS.split(s)
                acc = ""
                for tok in w:
                    if len(acc) + 1 + len(tok) <= limit:
                        acc = (acc + " " + tok).strip() if acc else tok
                    else:
                        if acc:
                            chunks.append(acc)
                        acc = tok
                cur = acc or ""
    if cur:
        chunks.append(cur)
    return [c for c in chunks if c]
235
+
236
+
237
def _split_text_smart(text_in: str, lang_short: str, chunk_limit: int) -> List[str]:
    """
    Split text for streaming TTS.

    First carves off a short head segment (<= FIRST_SEGMENT_LIMIT chars,
    preferably ending at a sentence boundary) so the first audio chunk is
    produced quickly, then splits the remainder with _fast_split. Falls
    back to the tokenizer's split_sentence when the fast splitter keeps
    less than 60% of the characters.
    """
    text_in = text_in.strip()
    if not text_in:
        return []
    parts: List[str] = []
    if len(text_in) > FIRST_SEGMENT_LIMIT:
        head = text_in[:FIRST_SEGMENT_LIMIT]
        # Prefer ending the head at the last sentence boundary, but only
        # if that still leaves a reasonably long (>30 chars) first segment.
        m = re.search(r".*[\.\!\?…»)]", head)
        if m and len(m.group(0)) > 30:
            head = m.group(0)
        tail = text_in[len(head):].lstrip()
        parts.append(head)
        text_for_rest = tail
    else:
        text_for_rest = text_in
    if not text_for_rest:
        return parts or [text_in]

    rest = _fast_split(text_for_rest, chunk_limit)
    if not rest or sum(len(x) for x in rest) < int(0.6 * len(text_for_rest)):
        try:
            rest2 = split_sentence(text_for_rest, lang=lang_short, text_split_length=chunk_limit)
            rest2 = [s.strip() for s in rest2 if s and s.strip()]
            if rest2:
                rest = rest2
        except Exception:
            # The fallback splitter is best-effort; keep the fast result.
            pass
    return parts + (rest or [text_for_rest])
265
+
266
+ # ---------------------------------------------------------
267
+ # 6) Асноўная функцыя TTS (стрымінг)
268
+ # ---------------------------------------------------------
269
def text_to_speech(
    text_input: str,
    speaker_audio: Optional[str],
    initial_buffer_s: Optional[float] = None,
    subsequent_buffer_s: Optional[float] = None,
):
    """
    Streaming TTS generator: yields (b64_float32_pcm, metrics_json_or_None).

    The first yielded message carries only metrics (empty audio string);
    the first audio chunk carries updated time-to-first-audio metrics and
    every later chunk carries None in the metrics slot.

    Parameters: `text_input` — Belarusian text (must be non-empty, else
    gr.Error); `speaker_audio` — optional reference-voice path (falls back
    to DEFAULT_VOICE_FILE); buffer arguments default to config values.
    """
    if initial_buffer_s is None:
        initial_buffer_s = INITIAL_MIN_BUFFER_S
    if subsequent_buffer_s is None:
        subsequent_buffer_s = MIN_BUFFER_S

    t_start_req = time.perf_counter()
    if not text_input or not str(text_input).strip():
        raise gr.Error("Увядзі хоць нейкі тэкст 🙂")

    # Conditioning latents (cached; moved to the active device).
    t_lat_0 = time.perf_counter()
    gpt_cond_latent, speaker_embedding = _latents_for(
        speaker_audio or DEFAULT_VOICE_FILE,
        to_device=device,
    )
    t_lat_1 = time.perf_counter()

    # Text splitting.
    t_split_0 = time.perf_counter()
    char_limit_cfg = getattr(XTTS_MODEL, "tokenizer", None)
    be_limit = 250
    if char_limit_cfg and hasattr(char_limit_cfg, "char_limits"):
        # The tokenizer may define a per-language character limit for "be".
        be_limit = char_limit_cfg.char_limits.get("be", 250)
    char_limit = min(180, be_limit)
    texts = (
        _split_text_smart(str(text_input).strip(), "be", char_limit)
        if ENABLE_TEXT_SPLITTING
        else [str(text_input).strip()]
    )
    t_split_1 = time.perf_counter()

    server_metrics = {
        "latents_s": t_lat_1 - t_lat_0,
        "text_split_s": t_split_1 - t_split_0,
        "initial_buffer_s": initial_buffer_s,
        "subsequent_buffer_s": subsequent_buffer_s,
    }

    # First message — metrics only, no audio yet.
    yield ("", json.dumps(server_metrics))

    first_chunk_sent = False
    t_gen_start = time.perf_counter()

    # autocast is enabled only on CUDA; on CPU it is a no-op.
    with torch.inference_mode(), torch.autocast(
        device_type="cuda",
        dtype=torch.float16,
        enabled=str(device).startswith("cuda"),
    ):
        # Lazily chain the streamed chunks of every text segment.
        all_chunks_iterator = (
            _to_np_audio(chunk)
            for part in texts
            for chunk in XTTS_MODEL.inference_stream(
                text=part,
                language="be",
                gpt_cond_latent=gpt_cond_latent,
                speaker_embedding=speaker_embedding,
                temperature=0.25,
                length_penalty=0.9,
                repetition_penalty=7.0,
                top_k=10,
                top_p=0.80,
            )
        )

        for audio_chunk in _chunker(
            all_chunks_iterator,
            sampling_rate,
            initial_buffer_s,
            subsequent_buffer_s,
        ):
            if not first_chunk_sent:
                # Attach time-to-first-audio metrics to the first chunk.
                t_first_chunk_ready = time.perf_counter()
                server_metrics["gen_init_to_first_chunk_s"] = (
                    t_first_chunk_ready - t_gen_start
                )
                server_metrics["until_first_chunk_total_s"] = (
                    t_first_chunk_ready - t_start_req
                )
                yield (_pcm_f32_to_b64(audio_chunk), json.dumps(server_metrics))
                first_chunk_sent = True
            else:
                yield (_pcm_f32_to_b64(audio_chunk), None)
ui.py ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from tts_model import sampling_rate, text_to_speech
4
+ from assistant import handle_b64_wav, reset_session
5
+
6
+
7
+ def create_app():
8
+ with gr.Blocks() as demo:
9
+ gr.Markdown("## Belarusian TTS — Streaming (форк coqui-ai-TTS + Gemini)")
10
+
11
+ # CSS
12
+ gr.HTML(
13
+ """
14
+ <style>
15
+ /* Мікрафон */
16
+ #mic-input {
17
+ display: flex;
18
+ justify-content: center;
19
+ align-items: center;
20
+ }
21
+ #mic-input {
22
+ position: relative !important;
23
+ width: 80px !important;
24
+ height: 80px !important;
25
+ border-radius: 999px !important;
26
+ border: none !important;
27
+ cursor: pointer !important;
28
+ background: radial-gradient(circle at 30% 30%, #f97316, #b91c1c) !important;
29
+ box-shadow: 0 10px 25px rgba(0,0,0,.35) !important;
30
+ display: flex !important;
31
+ align-items: center !important;
32
+ justify-content: center !important;
33
+ padding: 0 !important;
34
+ font-size: 0 !important;
35
+ color: transparent !important;
36
+ }
37
+ #mic-input::before {
38
+ content: "" !important;
39
+ position: absolute !important;
40
+ inset: -6px !important;
41
+ border-radius: 999px !important;
42
+ border: 2px solid #f97316 !important;
43
+ animation: mic-pulse 1.6s infinite !important;
44
+ }
45
+ #mic-input::after {
46
+ content: "🎙️" !important;
47
+ font-size: 34px !important;
48
+ color: #fee2e2 !important;
49
+ }
50
+ @keyframes mic-pulse {
51
+ 0% { transform: scale(1); opacity: 0.9; }
52
+ 70% { transform: scale(1.35); opacity: 0; }
53
+ 100% { transform: scale(1.35); opacity: 0; }
54
+ }
55
+
56
+ /* Кнопка «Пачаць новую размову» */
57
+ #reset-btn {
58
+ margin-top: 10px !important;
59
+ padding: 10px 18px !important;
60
+ border-radius: 999px !important;
61
+ border: none !important;
62
+ cursor: pointer !important;
63
+ font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif !important;
64
+ font-size: 14px !important;
65
+ font-weight: 600 !important;
66
+ background: linear-gradient(135deg, #4f46e5, #0ea5e9) !important;
67
+ color: #eff6ff !important;
68
+ box-shadow: 0 6px 18px rgba(15,23,42,0.35) !important;
69
+ display: inline-flex !important;
70
+ align-items: center !important;
71
+ gap: 8px !important;
72
+ }
73
+ #reset-btn::before {
74
+ content: "🧹" !important;
75
+ font-size: 16px !important;
76
+ }
77
+ #reset-btn:hover {
78
+ filter: brightness(1.05);
79
+ transform: translateY(-1px);
80
+ }
81
+ #reset-btn:active {
82
+ filter: brightness(0.98);
83
+ transform: translateY(0);
84
+ }
85
+ </style>
86
+ """
87
+ )
88
+
89
+ # Верхні радок
90
+ with gr.Row():
91
+ gr.Column(scale=1)
92
+ with gr.Column(scale=1):
93
+ gr.Markdown("### 🎙️ Мікрафон — спытайся ў асістэнта")
94
+
95
+ mic_btn = gr.Button("", elem_id="mic-input")
96
+ dummy_in = gr.Textbox(visible=False)
97
+
98
+ reset_btn = gr.Button("Пачаць новую размову", elem_id="reset-btn")
99
+ gr.Column(scale=1)
100
+
101
+ # Статус
102
+ log_panel = gr.HTML(
103
+ value="""
104
+ <div id='wa-log' style='font-family:system-ui;font-size:14px;'>
105
+ <div style='display:flex;align-items:center;gap:10px;'>
106
+ <div style='width:10px;height:10px;border-radius:999px;background:#10B981;box-shadow:0 0 0 4px #10B98122;'></div>
107
+ <div>
108
+ <div style='font-weight:600;'>✅ Гатова</div>
109
+ <div style='font-size:12px;opacity:0.8;'>Няма актыўнай генерацыі. Можаш задаць запыт.</div>
110
+ </div>
111
+ </div>
112
+ </div>
113
+ """,
114
+ label="Статус асістэнта",
115
+ )
116
+
117
+ reset_btn.click(
118
+ fn=reset_session,
119
+ inputs=None,
120
+ outputs=log_panel,
121
+ )
122
+
123
+ stream_pipe = gr.Textbox(visible=False)
124
+ log_pipe = gr.Textbox(visible=False)
125
+
126
+ # Ніжні блок: TTS
127
+ with gr.Accordion("Налады TTS (па жаданні)", open=False):
128
+ with gr.Row():
129
+ inp_text = gr.Textbox(
130
+ lines=3,
131
+ label="Тэкст на беларускай мове",
132
+ )
133
+ inp_voice = gr.Audio(
134
+ type="filepath",
135
+ label="Прыклад голасу (6–10 сек)",
136
+ )
137
+ with gr.Row():
138
+ run_btn = gr.Button("Згенераваць (тэкст → голас)")
139
+ gr.Markdown(f"**Частата дыскрэтызацыі:** {int(sampling_rate)} Гц")
140
+
141
+ run_btn.click(
142
+ fn=text_to_speech,
143
+ inputs=[inp_text, inp_voice],
144
+ outputs=[stream_pipe, log_pipe],
145
+ )
146
+
147
+ # -------- JS_PLAYER --------
148
+ JS_PLAYER = """
149
+ (b64) => {
150
+ if (!window.__wa) {
151
+ const sampleRate = SAMPLE_RATE_HZ;
152
+ const AC = window.AudioContext || window.webkitAudioContext;
153
+ if (!AC) { console.error('AudioContext is not supported.'); return; }
154
+ const ctx = new AC({ sampleRate });
155
+ const node = ctx.createScriptProcessor(2048, 1, 1);
156
+
157
+ let queue = [];
158
+ let playing = false;
159
+ let hasPlayedOnce = false;
160
+ let idleSince = null;
161
+ let autoMicStarted = false;
162
+ let stoppedByMic = false;
163
+
164
+ function autoStartMic() {
165
+ setTimeout(() => {
166
+ const btn = document.getElementById('mic-input');
167
+ if (!btn) return;
168
+ btn.click();
169
+ }, 300);
170
+ }
171
+
172
+ function logUpdate(state) {
173
+ const el = document.getElementById('wa-log');
174
+ if (!el) return;
175
+
176
+ let title = '✅ Гатова';
177
+ let subtitle = 'Можаш задаць наступнае пытанне.';
178
+ let color = '#10B981';
179
+
180
+ if (state === 'speaking') {
181
+ title = '🗣️ Кажу';
182
+ subtitle = 'Ідзе прайграванне згенераванага голасу.';
183
+ color = '#3B82F6';
184
+ } else if (state === 'buffering') {
185
+ title = '👂 Слухаю';
186
+ subtitle = 'Прылятаюць новыя аўдыя-чанкі ад мадэлі…';
187
+ color = '#6366F1';
188
+ } else if (state === 'idle') {
189
+ title = '✅ Гатова';
190
+ subtitle = 'Стрымінг завершаны. Можаш пытацца яшчэ 🙂';
191
+ color = '#10B981';
192
+ }
193
+
194
+ const html = `
195
+ <div style="display:flex;align-items:center;gap:10px;">
196
+ <div style="width:10px;height:10px;border-radius:999px;background:${color};box-shadow:0 0 0 4px ${color}22;"></div>
197
+ <div>
198
+ <div style="font-weight:600;">${title}</div>
199
+ <div style="font-size:12px;opacity:0.8;">${subtitle}</div>
200
+ </div>
201
+ </div>`;
202
+ el.innerHTML = html;
203
+ }
204
+
205
+ node.onaudioprocess = (e) => {
206
+ const out = e.outputBuffer.getChannelData(0);
207
+ let i = 0;
208
+
209
+ while (i < out.length) {
210
+ if (queue.length === 0) {
211
+ out[i++] = 0.0;
212
+ continue;
213
+ }
214
+ let cur = queue[0];
215
+ const take = Math.min(cur.length, out.length - i);
216
+ out.set(cur.subarray(0, take), i);
217
+ i += take;
218
+ if (take === cur.length) {
219
+ queue.shift();
220
+ } else {
221
+ queue[0] = cur.subarray(take);
222
+ }
223
+ }
224
+
225
+ const now = performance.now();
226
+
227
+ if (queue.length === 0) {
228
+ if (playing) {
229
+ playing = false;
230
+ logUpdate('idle');
231
+ }
232
+ if (hasPlayedOnce && !stoppedByMic && !autoMicStarted) {
233
+ if (idleSince === null) idleSince = now;
234
+ if (now - idleSince > 700) {
235
+ autoMicStarted = true;
236
+ autoStartMic();
237
+ }
238
+ }
239
+ } else {
240
+ if (!playing) {
241
+ playing = true;
242
+ logUpdate('speaking');
243
+ } else {
244
+ logUpdate('buffering');
245
+ }
246
+ idleSince = null;
247
+ }
248
+ };
249
+
250
+ node.connect(ctx.destination);
251
+
252
+ window.__wa = {
253
+ push: (b64in) => {
254
+ if (!b64in) return;
255
+
256
+ stoppedByMic = false;
257
+ autoMicStarted = false;
258
+
259
+ try {
260
+ const bin = atob(b64in);
261
+ const buf = new ArrayBuffer(bin.length);
262
+ const view = new Uint8Array(buf);
263
+ for (let i = 0; i < bin.length; i++) view[i] = bin.charCodeAt(i);
264
+ const f32 = new Float32Array(buf);
265
+ if (f32.length > 0) {
266
+ queue.push(f32);
267
+ hasPlayedOnce = true;
268
+ }
269
+ } catch (err) {
270
+ console.error('Failed to decode audio chunk', err);
271
+ }
272
+
273
+ if (!playing && queue.length > 0 && ctx.state === 'suspended') {
274
+ ctx.resume();
275
+ }
276
+ if (!playing && queue.length > 0) {
277
+ playing = true;
278
+ logUpdate('speaking');
279
+ } else if (playing) {
280
+ logUpdate('buffering');
281
+ }
282
+ },
283
+
284
+ stop: () => {
285
+ queue.length = 0;
286
+ playing = false;
287
+ stoppedByMic = true;
288
+ idleSince = null;
289
+ autoMicStarted = false;
290
+ logUpdate('idle');
291
+ if (ctx && ctx.state === 'running') {
292
+ ctx.suspend();
293
+ }
294
+ },
295
+
296
+ update_server_metrics: (js) => {},
297
+
298
+ reset: () => {
299
+ queue.length = 0;
300
+ playing = false;
301
+ hasPlayedOnce = false;
302
+ idleSince = null;
303
+ autoMicStarted = false;
304
+ stoppedByMic = false;
305
+ logUpdate('idle');
306
+ },
307
+
308
+ ctx: ctx
309
+ };
310
+
311
+ logUpdate('idle');
312
+ }
313
+
314
+ if (!b64) return;
315
+ if (window.__wa) {
316
+ window.__wa.push(b64);
317
+ }
318
+ }
319
+ """
320
+ JS_PLAYER = JS_PLAYER.replace("SAMPLE_RATE_HZ", str(int(sampling_rate)))
321
+
322
+ stream_pipe.change(fn=None, inputs=[stream_pipe], js=JS_PLAYER)
323
+ log_pipe.change(
324
+ fn=None,
325
+ inputs=[log_pipe],
326
+ js="(js) => { if(window.__wa) window.__wa.update_server_metrics(js); }",
327
+ )
328
+
329
# -------- JS_RECORDER --------
# Browser-side microphone recorder, executed by Gradio as the `js=` handler of
# the mic button.  It:
#   * requests getUserMedia audio and taps it through a ScriptProcessorNode;
#   * does simple RMS-based voice activity detection (voiceThreshold);
#   * stops the TTS player (window.__wa.stop) on barge-in, i.e. when speech
#     is first detected while audio is playing;
#   * after `silenceMsAfterSpeech` ms of post-speech silence, merges the
#     captured Float32 buffers, encodes a 16-bit mono PCM WAV at the
#     AudioContext's native sample rate, and resolves base64(WAV);
#   * resolves null if no speech is detected within `maxTotalMs` ms.
# The resolved [b64, ref_voice] pair becomes the Python inputs of the click
# handler (b64 replaces the `dummy` input).
# NOTE(review): `maxTotalMs` is only enforced while speech has NOT started —
# once speechStarted is true, a continuously-noisy environment keeps the
# recording open indefinitely; confirm whether a hard cap is wanted.
JS_RECORDER = """
(dummy, ref_voice) => {
  return new Promise(async (resolve, reject) => {
    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
      console.error("getUserMedia not supported");
      resolve([null, ref_voice]);
      return;
    }

    let stream = null;
    let ctx = null;
    let source = null;
    let processor = null;
    let stopped = false;
    let speechStarted = false;
    let lastSpeechTime = 0;
    let startTime = performance.now();
    const silenceMsAfterSpeech = 800;
    const maxTotalMs = 15000;
    const voiceThreshold = 0.012;
    let recordedBuffers = [];

    function setLog(title, subtitle, color) {
      const el = document.getElementById('wa-log');
      if (!el) return;
      const html =
        '<div style="display:flex;align-items:center;gap:10px;">' +
        '<div style="width:10px;height:10px;border-radius:999px;background:' + color +
        ';box-shadow:0 0 0 4px ' + color + '22;"></div>' +
        '<div>' +
        '<div style="font-weight:600;">' + title + '</div>' +
        '<div style="font-size:12px;opacity:0.8;">' + subtitle + '</div>' +
        '</div>' +
        '</div>';
      el.innerHTML = html;
    }

    function cleanup() {
      try { if (processor) { processor.disconnect(); processor.onaudioprocess = null; } } catch (e) {}
      try { if (source) { source.disconnect(); } } catch (e) {}
      if (stream) { stream.getTracks().forEach(t => t.stop()); }
      if (ctx && ctx.state !== 'closed') { ctx.close(); }
    }

    function stopWithResolve(b64) {
      if (stopped) return;
      stopped = true;
      cleanup();
      resolve([b64, ref_voice]);
    }

    function encodeWAV(samples, sampleRate) {
      const buffer = new ArrayBuffer(44 + samples.length * 2);
      const view = new DataView(buffer);

      function writeString(view, offset, string) {
        for (let i = 0; i < string.length; i++) {
          view.setUint8(offset + i, string.charCodeAt(i));
        }
      }

      let offset = 0;
      writeString(view, offset, 'RIFF'); offset += 4;
      view.setUint32(offset, 36 + samples.length * 2, true); offset += 4;
      writeString(view, offset, 'WAVE'); offset += 4;
      writeString(view, offset, 'fmt '); offset += 4;
      view.setUint32(offset, 16, true); offset += 4;
      view.setUint16(offset, 1, true); offset += 2;
      view.setUint16(offset, 1, true); offset += 2;
      view.setUint32(offset, sampleRate, true); offset += 4;
      view.setUint32(offset, sampleRate * 2, true); offset += 4;
      view.setUint16(offset, 2, true); offset += 2;
      view.setUint16(offset, 16, true); offset += 2;
      writeString(view, offset, 'data'); offset += 4;
      view.setUint32(offset, samples.length * 2, true); offset += 4;

      let index = 44;
      for (let i = 0; i < samples.length; i++, index += 2) {
        let s = Math.max(-1, Math.min(1, samples[i]));
        view.setInt16(index, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
      }

      return buffer;
    }

    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    } catch (err) {
      console.error('Mic error', err);
      setLog('⚠️ Памылка мікрафона', 'Не атрымалася атрымаць доступ да мікрафона.', '#EF4444');
      stopWithResolve(null);
      return;
    }

    setLog('🎤 Слухаю', 'Сесія актыўная. Пачынай гаварыць.', '#6366F1');

    const AC = window.AudioContext || window.webkitAudioContext;
    ctx = new AC();
    source = ctx.createMediaStreamSource(stream);
    const bufferSize = 2048;
    processor = ctx.createScriptProcessor(bufferSize, 1, 1);

    source.connect(processor);
    processor.connect(ctx.destination);

    processor.onaudioprocess = (e) => {
      if (stopped) return;
      const input = e.inputBuffer.getChannelData(0);
      if (!input) return;

      let sum = 0.0;
      for (let i = 0; i < input.length; i++) {
        const s = input[i];
        sum += s * s;
      }
      const rms = Math.sqrt(sum / input.length);
      const now = performance.now();

      if (rms > voiceThreshold) {
        if (!speechStarted) {
          speechStarted = true;
          lastSpeechTime = now;
          recordedBuffers = [];

          if (window.__wa && typeof window.__wa.stop === 'function') {
            window.__wa.stop();
          }
          setLog('👂 Слухаю', 'Фіксую тваю фразу…', '#6366F1');
        } else {
          lastSpeechTime = now;
        }
        recordedBuffers.push(new Float32Array(input));
      } else {
        if (speechStarted && (now - lastSpeechTime > silenceMsAfterSpeech)) {
          if (!recordedBuffers.length) {
            setLog('✅ Гатова', 'Голас не зафіксаваны.', '#10B981');
            stopWithResolve(null);
            return;
          }
          let totalLength = 0;
          for (const b of recordedBuffers) totalLength += b.length;
          const merged = new Float32Array(totalLength);
          let offset = 0;
          for (const b of recordedBuffers) {
            merged.set(b, offset);
            offset += b.length;
          }
          const wavBuffer = encodeWAV(merged, ctx.sampleRate);
          const bytes = new Uint8Array(wavBuffer);
          let binary = '';
          for (let i = 0; i < bytes.byteLength; i++) {
            binary += String.fromCharCode(bytes[i]);
          }
          const b64 = btoa(binary);
          setLog('🤖 Думаю', 'Адпраўляю фразу асістэнту…', '#F59E0B');
          stopWithResolve(b64);
          return;
        }
      }

      if (!speechStarted && (now - startTime > maxTotalMs)) {
        setLog('✅ Гатова', 'Голас не зафіксаваны.', '#10B981');
        stopWithResolve(null);
      }
    };
  });
}
"""
498
+
499
# Browser -> server wiring: clicking the mic button first runs JS_RECORDER in
# the browser; its resolved [b64_wav, ref_voice] pair becomes the two inputs
# of handle_b64_wav (the recorded base64 WAV replaces `dummy_in`).  The
# handler's outputs feed the hidden stream/log pipes, which in turn drive the
# client-side player and status display wired above.
mic_btn.click(
    fn=handle_b64_wav,
    inputs=[dummy_in, inp_voice],
    outputs=[stream_pipe, log_pipe],
    js=JS_RECORDER,
)

# Hand the fully wired Blocks app back to the caller (app.py launches it).
return demo