| import asyncio, json, os, time |
| from typing import Optional, List |
|
|
| import numpy as np |
| from faster_whisper import WhisperModel |
| from fastapi import FastAPI, WebSocket, WebSocketDisconnect |
| from fastapi.responses import JSONResponse |
| import httpx |
| import uvicorn |
|
|
| |
| |
| |
# --- Whisper model configuration (all overridable via environment) ---
MODEL_NAME = os.environ.get("WHISPER_MODEL", "Systran/faster-whisper-tiny.en")
DEVICE = os.environ.get("WHISPER_DEVICE", "cpu")
COMPUTE_TYPE = os.environ.get("WHISPER_COMPUTE", "int8")

# --- Streaming parameters ---
SAMPLE_RATE = int(os.environ.get("STT_SAMPLE_RATE", "16000"))  # expected client PCM rate (Hz)
CHUNK_MS = int(os.environ.get("STT_CHUNK_MS", "20"))  # duration of one PCM frame (ms)
FINAL_SILENCE_SEC = float(os.environ.get("FINAL_SILENCE_SEC", "0.8"))  # silence that ends an utterance
MAX_BUFFER_SEC = float(os.environ.get("MAX_BUFFER_SEC", "30.0"))  # rolling audio buffer cap (seconds)

# --- Optional downstream notification target ("brain" service) ---
BRAIN_URL = (os.environ.get("BRAIN_PROCESS_URL") or "").strip()
BRAIN_SECRET = (os.environ.get("BRAIN_SHARED_SECRET") or "").strip()
_NOTIFY = bool(BRAIN_URL)  # only notify when a target URL is configured

# Shared async HTTP client for final-transcript notifications.
_client = httpx.AsyncClient(
    timeout=httpx.Timeout(connect=8.0, read=20.0, write=8.0, pool=8.0)
)
|
|
| |
| |
| |
def pcm16_to_float32(b: bytes) -> np.ndarray:
    """Decode little-endian signed 16-bit PCM bytes to float32 samples in [-1, 1)."""
    samples = np.frombuffer(b, dtype=np.int16).astype(np.float32)
    samples /= 32768.0
    return samples
|
|
def float32_concat(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Join two sample buffers, skipping the copy when either side is empty."""
    if a.size == 0:
        return b
    return a if b.size == 0 else np.concatenate([a, b])
|
|
| |
| |
| |
class SpeakerEmbedder:
    """Toy speaker-embedding helper.

    Computes a 2-D (mean, std) embedding of an audio signal; `model_name`
    and `device` are stored but unused by this placeholder implementation.
    One reference embedding can be stored via `enroll()`.
    """

    def __init__(self, model_name: Optional[str] = None, device: str = "cpu"):
        # BUG FIX: the constructor was named `_init_` (single underscores),
        # so Python never called it and instances lacked these attributes.
        self.model_name = model_name or "toy"
        self.device = device
        self.enrolled: Optional[np.ndarray] = None  # reference embedding, set by enroll()

    def embed(self, audio_f32: np.ndarray, sr: int) -> Optional[np.ndarray]:
        """Return a float32 (mean, std) embedding, or None for empty/missing audio."""
        if audio_f32 is None or audio_f32.size == 0:
            return None
        v = audio_f32.astype(np.float32)
        return np.array([float(v.mean()), float(v.std())], dtype=np.float32)

    def enroll(self, audio_f32: np.ndarray, sr: int) -> bool:
        """Store the embedding of `audio_f32` as the enrolled reference.

        Returns True on success, False when the audio produced no embedding.
        """
        e = self.embed(audio_f32, sr)
        if e is None:
            return False
        self.enrolled = e
        return True
|
|
# Module-level embedder singleton (not yet wired into the WS pipeline).
_embedder: Optional[SpeakerEmbedder] = SpeakerEmbedder()
|
|
| |
| |
| |
# Load the Whisper model at import time so the first request pays no warm-up cost.
print(f"[STT] Loading {MODEL_NAME} on {DEVICE} ({COMPUTE_TYPE})", flush=True)
model = WhisperModel(MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)

app = FastAPI(title="ActualSTT (Whisper)")
|
|
@app.get("/")
def root():
    """Landing endpoint: confirm the service is up and point at the WS API."""
    payload = {
        "ok": True,
        "tip": "Use WS /ws/stt; send {'event':'init','rate':16000} then PCM16 frames",
    }
    return JSONResponse(payload)
|
|
@app.get("/health")
def health():
    """Health/status endpoint exposing the active model and stream configuration."""
    stream_config = {
        "sample_rate": SAMPLE_RATE,
        "chunk_ms": CHUNK_MS,
        "final_silence_sec": FINAL_SILENCE_SEC,
        "max_buffer_sec": MAX_BUFFER_SEC,
    }
    return {
        "ok": True,
        "engine": "faster-whisper",
        "model": MODEL_NAME,
        "device": DEVICE,
        "compute": COMPUTE_TYPE,
        "config": stream_config,
        "tip": "Use WS /ws/stt; send init then raw PCM16 LE frames",
    }
|
|
| |
| |
| |
def transcribe_block(audio_f32: np.ndarray, language: str, prompt: Optional[str]) -> str:
    """Run one greedy Whisper pass over `audio_f32` and join the segment texts."""
    segments, _info = model.transcribe(
        audio_f32,
        language=(language or "en"),
        task="transcribe",
        beam_size=1,  # greedy decode keeps per-chunk latency low
        vad_filter=False,
        temperature=0.0,
        no_speech_threshold=0.3,
        initial_prompt=(prompt or None),
    )
    texts = [seg.text.strip() for seg in segments if seg.text]
    return " ".join(texts).strip()
|
|
async def _notify_brain(text: str):
    """Best-effort POST of a final transcript to the brain service.

    Silently returns when notification is disabled or the text is blank;
    failures are logged, never raised.
    """
    stripped = text.strip()
    if not (_NOTIFY and stripped):
        return
    headers = {"x-auth": BRAIN_SECRET} if BRAIN_SECRET else {}
    try:
        await _client.post(BRAIN_URL, json={"text": stripped}, headers=headers)
    except Exception as e:
        print(f"[STT->Brain] notify failed: {e}", flush=True)
|
|
| |
async def safe_receive(ws: WebSocket):
    """Receive one WS message, normalizing the server's RuntimeError-on-disconnect
    into WebSocketDisconnect so callers handle a single exception type."""
    try:
        return await ws.receive()
    except WebSocketDisconnect:
        raise
    except RuntimeError as err:
        if "disconnect" not in str(err).lower():
            raise
        raise WebSocketDisconnect()
|
|
| |
| |
| |
@app.websocket("/ws/stt")
async def ws_stt(ws: WebSocket):
    """Streaming speech-to-text endpoint.

    Protocol:
      1. Client sends a JSON text frame: {"event":"init","rate":16000,
         "language":"en","prompt":"..."}; rate must equal SAMPLE_RATE.
      2. Server replies {"event":"ready","sr":<rate>}.
      3. Client streams raw PCM16-LE binary frames.
      4. Server emits {"event":"interim",...} while speech continues and
         {"event":"final",...} after FINAL_SILENCE_SEC of silence; finals
         are also forwarded to the brain service when configured.
    """
    await ws.accept()

    # --- handshake -------------------------------------------------------
    try:
        init_msg = await ws.receive_text()
    except WebSocketDisconnect:
        return
    except Exception:
        await ws.close(code=1002)
        return

    try:
        init = json.loads(init_msg)
        if init.get("event") != "init":
            # ValueError instead of `assert`: asserts vanish under `python -O`.
            raise ValueError("first message must be {'event':'init','rate':16000}")
        client_sr = int(init.get("rate", SAMPLE_RATE))
        language = (init.get("language") or "en").strip()
        prompt = (init.get("prompt") or "").strip() or None
        if client_sr != SAMPLE_RATE:
            await ws.send_json({"event": "error", "detail": f"Expected rate {SAMPLE_RATE}, got {client_sr}"})
            await ws.close(code=1002)
            return
    except Exception as e:
        await ws.send_json({"event": "error", "detail": f"Bad init: {e}"})
        await ws.close(code=1002)
        return

    # BUG FIX: send_json() serializes its argument itself; the original wrapped
    # payloads in json.dumps(), so clients received a JSON *string* instead of
    # an object (the error path above already passed a dict). Same fix applied
    # to the interim/final events below.
    await ws.send_json({"event": "ready", "sr": SAMPLE_RATE})

    # --- per-connection streaming state ----------------------------------
    audio_f32 = np.zeros((0,), dtype=np.float32)  # rolling PCM buffer
    last_emit_text = ""                           # last interim hypothesis sent
    last_audio_ts = time.time()                   # wall clock of last audio frame
    FRAME_BYTES = int(SAMPLE_RATE * (CHUNK_MS / 1000.0) * 2)  # bytes per PCM16 frame
    notify_tasks: set = set()  # hold refs so fire-and-forget tasks aren't GC'd

    async def producer():
        """Watch for end-of-utterance silence and emit 'final' events."""
        nonlocal last_emit_text, last_audio_ts
        try:
            while True:
                now = time.time()
                if last_emit_text and now - last_audio_ts >= FINAL_SILENCE_SEC:
                    try:
                        await ws.send_json({"event": "final", "text": last_emit_text})
                    except Exception:
                        pass
                    # BUG FIX: keep a reference to the notify task; bare
                    # create_task() results can be garbage-collected mid-flight.
                    t = asyncio.create_task(_notify_brain(last_emit_text))
                    notify_tasks.add(t)
                    t.add_done_callback(notify_tasks.discard)
                    last_emit_text = ""
                await asyncio.sleep(0.05)
        except asyncio.CancelledError:
            raise  # cooperate with cancellation from the handler below
        except WebSocketDisconnect:
            pass
        except Exception:
            pass

    async def consumer():
        """Receive PCM frames, maintain the rolling buffer, emit interim text."""
        nonlocal audio_f32, last_emit_text, last_audio_ts
        try:
            while True:
                msg = await safe_receive(ws)
                if msg.get("bytes") is not None:
                    chunk = msg["bytes"]
                    # Append only complete frames; a trailing partial frame is dropped.
                    for off in range(0, len(chunk), FRAME_BYTES):
                        sub = chunk[off:off + FRAME_BYTES]
                        if len(sub) == FRAME_BYTES:
                            audio_f32 = float32_concat(audio_f32, pcm16_to_float32(sub))
                            last_audio_ts = time.time()

                    # Bound memory: keep at most MAX_BUFFER_SEC of audio.
                    max_len = int(MAX_BUFFER_SEC * SAMPLE_RATE)
                    if audio_f32.size > max_len:
                        audio_f32 = audio_f32[-max_len:]

                    # Transcribe only the most recent ~1 s for low latency.
                    tail_len = SAMPLE_RATE
                    tail = audio_f32[-tail_len:] if audio_f32.size > tail_len else audio_f32
                    if tail.size >= int(0.8 * SAMPLE_RATE):
                        try:
                            txt = transcribe_block(tail, language, prompt)
                            if txt and txt != last_emit_text:
                                # Send only the newly-recognized suffix when the
                                # new hypothesis extends the previous one.
                                new_part = txt[len(last_emit_text):].strip() if txt.startswith(last_emit_text) else txt
                                if new_part:
                                    try:
                                        await ws.send_json({"event": "interim", "text": new_part})
                                    except Exception:
                                        pass
                                last_emit_text = txt
                        except Exception:
                            # Best-effort: a failed decode must not kill the stream.
                            pass
                else:
                    # Non-binary frame (text or close sentinel): end the session.
                    break
        except WebSocketDisconnect:
            pass
        except Exception:
            pass
        finally:
            try:
                await ws.close()
            except Exception:
                pass

    # BUG FIX: the original awaited gather(producer(), consumer()); producer()
    # loops forever, so the handler never returned after the client left and
    # leaked one polling task per connection. Run the consumer to completion,
    # then cancel the silence watcher.
    producer_task = asyncio.create_task(producer())
    try:
        await consumer()
    finally:
        producer_task.cancel()
        try:
            await producer_task
        except asyncio.CancelledError:
            pass
|
|
| |
| |
| |
# Single worker on purpose: the Whisper model and per-connection audio
# buffers live in this process's memory.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")), workers=1)