Spaces:

ashishkblink
/

neural_voice_AI

Running

App Files Files Community

ashishkblink committed on Jan 15

Commit

1a3eaef

verified ·

1 Parent(s): 0ef9177

Update pipe_method3.py

Browse files

Files changed (1) hide show

pipe_method3.py +160 -46

pipe_method3.py CHANGED Viewed

@@ -1,15 +1,7 @@
 """
 Twilio Media Streams (bidirectional) + Vosk + OpenAI Answer + Piper -> Twilio playback
-What this version does:
-- NO intent / NO clarify JSON
-- Logs only:
-  STT_FINAL> ...
-  LLM_ANS> ...
-  TTS> ...
-- Generation-id safe TTS (no self-cancel on Railway)
-- Better phone clarity using ffmpeg filters (highpass/lowpass/compand)
-- Proper 20ms pacing + keepalive marks to prevent WS idle timeouts
 """
 import asyncio
@@ -17,18 +9,18 @@ import base64
 import json
 import logging
 import os
-import re
 import tempfile
 import time
 import audioop
 import subprocess
-import threading
 from dataclasses import dataclass, field
 from typing import Optional, List, Dict
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
-from fastapi.responses import PlainTextResponse, Response
 from fastapi.middleware.cors import CORSMiddleware
 from vosk import Model, KaldiRecognizer
 from openai import OpenAI
@@ -38,9 +30,11 @@ from openai import OpenAI
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 log = logging.getLogger("app")
 def P(tag: str, msg: str):
     print(f"{tag} {msg}", flush=True)
 # ----------------------------
 # Env
 # ----------------------------
@@ -54,7 +48,8 @@ PIPER_BIN = os.getenv("PIPER_BIN", "piper").strip()
 PIPER_MODEL_PATH = os.getenv("PIPER_MODEL_PATH", "").strip()
 HOST = "0.0.0.0"
-PORT = int(os.getenv("PORT", "7860"))
 # ----------------------------
 # FastAPI
@@ -68,6 +63,37 @@ app.add_middleware(
     allow_headers=["*"],
 )
 # ----------------------------
 # Audio / Twilio
 # ----------------------------
@@ -76,15 +102,17 @@ INPUT_RATE = 8000
 STT_RATE = 16000
 BYTES_PER_20MS_MULAW = int(INPUT_RATE * (FRAME_MS / 1000.0))  # 160 bytes @ 8kHz, 20ms
 # ----------------------------
 # VAD settings
 # ----------------------------
-RMS_SPEECH_THRESHOLD = 450
 SPEECH_START_FRAMES = 3
 SPEECH_END_SILENCE_FRAMES = 40  # 800ms
 MAX_UTTERANCE_MS = 12000
 PARTIAL_EMIT_EVERY_MS = 250
 # ----------------------------
 # LLM prompt
 # ----------------------------
@@ -94,14 +122,17 @@ SYSTEM_PROMPT = (
     "No filler. No greetings unless user greets first."
 )
 # ----------------------------
 # Cached Vosk model
 # ----------------------------
 _VOSK_MODEL = None
 def now_ms() -> int:
     return int(time.time() * 1000)
 def build_twiml(stream_url: str) -> str:
     return f"""<?xml version="1.0" encoding="UTF-8"?>
 <Response>
@@ -112,6 +143,7 @@ def build_twiml(stream_url: str) -> str:
 </Response>
 """
 def split_mulaw_frames(mulaw_bytes: bytes) -> List[bytes]:
     frames = []
     for i in range(0, len(mulaw_bytes), BYTES_PER_20MS_MULAW):
@@ -121,6 +153,7 @@ def split_mulaw_frames(mulaw_bytes: bytes) -> List[bytes]:
         frames.append(chunk)
     return frames
 async def drain_queue(q: asyncio.Queue):
     try:
         while True:
@@ -129,6 +162,67 @@ async def drain_queue(q: asyncio.Queue):
     except asyncio.QueueEmpty:
         return
 # ----------------------------
 # OpenAI
 # ----------------------------
@@ -137,10 +231,10 @@ def openai_client() -> OpenAI:
         raise RuntimeError("OPENAI_API_KEY not set")
     return OpenAI(api_key=OPENAI_API_KEY)
 def openai_answer_blocking(history: List[Dict], user_text: str) -> str:
     client = openai_client()
     msgs = [{"role": "system", "content": SYSTEM_PROMPT}]
-    # short tail context
     tail = history[-6:] if len(history) > 1 else []
     msgs.extend(tail)
     msgs.append({"role": "user", "content": user_text})
@@ -154,8 +248,9 @@ def openai_answer_blocking(history: List[Dict], user_text: str) -> str:
     ans = (resp.choices[0].message.content or "").strip()
     return ans
 # ----------------------------
-# Piper TTS -> 8k mulaw (clarity improved)
 # ----------------------------
 def piper_tts_to_mulaw(text: str) -> bytes:
     if not PIPER_MODEL_PATH:
@@ -180,11 +275,6 @@ def piper_tts_to_mulaw(text: str) -> bytes:
         if r1.returncode != 0:
             raise RuntimeError(f"piper rc={r1.returncode} stderr={r1.stderr.decode('utf-8','ignore')[:500]}")
-        # Phone-clarity filter chain:
-        # - highpass removes rumble
-        # - lowpass removes harshness
-        # - compand evens volume (helps “clarity” on phone)
-        # - dynaudnorm is avoided (can pump / distort at 8k)
         af = "highpass=f=200,lowpass=f=3400,compand=attacks=0:decays=0.3:points=-80/-80|-20/-10|0/-3"
         r2 = subprocess.run(
@@ -201,7 +291,6 @@ def piper_tts_to_mulaw(text: str) -> bytes:
         with open(mulaw_path, "rb") as f:
             data = f.read()
-        P("TTS>", f"audio_bytes={len(data)}")
         return data
     finally:
         for p in (wav_path, mulaw_path):
@@ -210,6 +299,7 @@ def piper_tts_to_mulaw(text: str) -> bytes:
             except Exception:
                 pass
 # ----------------------------
 # Call state
 # ----------------------------
@@ -219,6 +309,7 @@ class CancelFlag:
     def set(self):
         self.is_set = True
 @dataclass
 class CallState:
     call_id: str
@@ -255,8 +346,9 @@ class CallState:
         self.tts_generation_id += 1
         return self.tts_generation_id
 # ----------------------------
-# Keepalive marks (prevents WS ping timeout)
 # ----------------------------
 async def twilio_keepalive(ws: WebSocket, st: CallState):
     try:
@@ -270,12 +362,12 @@ async def twilio_keepalive(ws: WebSocket, st: CallState):
                     "streamSid": st.stream_sid,
                     "mark": {"name": name},
                 }))
-                P("TWILIO>", f"keepalive_mark={name}")
     except asyncio.CancelledError:
         return
     except Exception as e:
         P("SYS>", f"keepalive_error={e}")
 # ----------------------------
 # HTTP
 # ----------------------------
@@ -283,6 +375,7 @@ async def twilio_keepalive(ws: WebSocket, st: CallState):
 async def health():
     return {"ok": True}
 @app.post("/voice")
 async def voice(request: Request):
     stream_url = TWILIO_STREAM_URL
@@ -295,12 +388,14 @@ async def voice(request: Request):
         return PlainTextResponse("TWILIO_STREAM_URL not set and host not found", status_code=500)
     return Response(content=build_twiml(stream_url), media_type="application/xml")
 @app.get("/voice")
 async def voice_get(request: Request):
     return await voice(request)
 # ----------------------------
-# WebSocket /stream
 # ----------------------------
 @app.websocket("/stream")
 async def stream(ws: WebSocket):
@@ -330,10 +425,19 @@ async def stream(ws: WebSocket):
                 st.stream_sid = msg["start"]["streamSid"]
                 P("TWILIO>", f"start streamSid={st.stream_sid}")
                 if st.keepalive_task is None:
                     st.keepalive_task = asyncio.create_task(twilio_keepalive(ws, st))
-                # optional short greeting
                 asyncio.create_task(speak_text(ws, st, "Hi! How can I help?"))
             elif event == "media":
@@ -351,8 +455,8 @@ async def stream(ws: WebSocket):
                 await vad_and_stt(ws, st, pcm16_16k, is_speech)
             elif event == "mark":
-                name = (msg.get("mark") or {}).get("name")
-                P("TWILIO>", f"mark_received={name}")
             elif event == "stop":
                 P("TWILIO>", "stop")
@@ -364,12 +468,16 @@ async def stream(ws: WebSocket):
         P("SYS>", f"ws_error={e}")
         log.exception("ws_error")
     finally:
         if st.keepalive_task:
             st.keepalive_task.cancel()
         if st.outbound_task:
             st.outbound_task.cancel()
         P("SYS>", "ws_closed")
 # ----------------------------
 # VAD + STT
 # ----------------------------
@@ -395,7 +503,6 @@ async def vad_and_stt(ws: WebSocket, st: CallState, pcm16_16k: bytes, is_speech:
     st.rec.AcceptWaveform(pcm16_16k)
-    # partial logging only (you said UI later)
     if t - st.last_partial_emit_ms >= PARTIAL_EMIT_EVERY_MS:
         st.last_partial_emit_ms = t
         try:
@@ -403,9 +510,12 @@ async def vad_and_stt(ws: WebSocket, st: CallState, pcm16_16k: bytes, is_speech:
             partial = (pj.get("partial") or "").strip()
         except Exception:
             partial = ""
         if partial and partial != st.last_partial:
             st.last_partial = partial
             P("STT_PART>", partial)
     if (t - st.utter_start_ms) > MAX_UTTERANCE_MS:
         await finalize_utterance(ws, st, "max_utterance")
@@ -419,6 +529,7 @@ async def vad_and_stt(ws: WebSocket, st: CallState, pcm16_16k: bytes, is_speech:
     if st.silence_count >= SPEECH_END_SILENCE_FRAMES:
         await finalize_utterance(ws, st, f"vad_silence_{SPEECH_END_SILENCE_FRAMES*FRAME_MS}ms")
 async def finalize_utterance(ws: WebSocket, st: CallState, reason: str):
     if not st.in_speech:
         return
@@ -438,6 +549,8 @@ async def finalize_utterance(ws: WebSocket, st: CallState, reason: str):
         return
     P("STT_FINAL>", f"{user_text}  ({reason})")
     async def bot_job():
         async with st.bot_lock:
@@ -445,13 +558,13 @@ async def finalize_utterance(ws: WebSocket, st: CallState, reason: str):
     asyncio.create_task(bot_job())
 # ----------------------------
 # LLM Answer -> Speak
 # ----------------------------
 async def answer_and_speak(ws: WebSocket, st: CallState, user_text: str):
     st.cancel_llm = CancelFlag(False)
-    # store user
     st.history.append({"role": "user", "content": user_text})
     st.history = st.history[:1] + st.history[-8:]
@@ -466,13 +579,15 @@ async def answer_and_speak(ws: WebSocket, st: CallState, user_text: str):
         ans = "Sorry, I didn’t catch that."
     P("LLM_ANS>", ans)
-    # store assistant
     st.history.append({"role": "assistant", "content": ans})
     st.history = st.history[:1] + st.history[-8:]
     await speak_text(ws, st, ans)
 # ----------------------------
 # Barge-in (clear + drain)
 # ----------------------------
@@ -490,27 +605,20 @@ async def barge_in(ws: WebSocket, st: CallState):
     await drain_queue(st.outbound_q)
     st.bot_speaking = False
 # ----------------------------
-# Speak / TTS with generation-id
 # ----------------------------
 async def speak_text(ws: WebSocket, st: CallState, text: str):
     gen = st.bump_tts_generation()
-    # clear previous audio
-    if st.stream_sid:
-        try:
-            await ws.send_text(json.dumps({"event": "clear", "streamSid": st.stream_sid}))
-            P("TWILIO>", "sent_clear")
-        except Exception:
-            pass
-    await drain_queue(st.outbound_q)
     await tts_enqueue(st, text, gen)
 async def tts_enqueue(st: CallState, text: str, gen: int):
     my_gen = gen
     st.bot_speaking = True
     P("TTS>", f"text={text} gen={my_gen}")
     loop = asyncio.get_running_loop()
     try:
@@ -521,17 +629,22 @@ async def tts_enqueue(st: CallState, text: str, gen: int):
         return
     if my_gen != st.tts_generation_id:
-        P("TTS>", f"discard_gen my_gen={my_gen} current_gen={st.tts_generation_id}")
         return
     for fr in split_mulaw_frames(mulaw_bytes):
         if my_gen != st.tts_generation_id:
-            P("TTS>", f"discard_midstream my_gen={my_gen} current_gen={st.tts_generation_id}")
             return
         await st.outbound_q.put(base64.b64encode(fr).decode("ascii"))
     await st.outbound_q.put("__END_CHUNK__")
 async def outbound_sender(ws: WebSocket, st: CallState):
     try:
         while True:
@@ -563,10 +676,11 @@ async def outbound_sender(ws: WebSocket, st: CallState):
         P("SYS>", f"outbound_sender_error={e}")
         log.exception("outbound_sender_error")
 # ----------------------------
 # main
 # ----------------------------
 if __name__ == "__main__":
     import uvicorn
     P("SYS>", f"starting {HOST}:{PORT}")
-    uvicorn.run(app, host=HOST, port=PORT)

 """
 Twilio Media Streams (bidirectional) + Vosk + OpenAI Answer + Piper -> Twilio playback
++ Live UI (web_demo) showing STT/LLM in realtime
++ Multi-call UI support (separate calls by streamSid)
 """
 import asyncio
 import json
 import logging
 import os
 import tempfile
 import time
 import audioop
 import subprocess
 from dataclasses import dataclass, field
 from typing import Optional, List, Dict
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
+from fastapi.responses import PlainTextResponse, Response, FileResponse
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
 from vosk import Model, KaldiRecognizer
 from openai import OpenAI
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 log = logging.getLogger("app")
 def P(tag: str, msg: str):
     print(f"{tag} {msg}", flush=True)
 # ----------------------------
 # Env
 # ----------------------------
 PIPER_MODEL_PATH = os.getenv("PIPER_MODEL_PATH", "").strip()
 HOST = "0.0.0.0"
+PORT = int(os.getenv("PORT", "7860"))  # HF uses 7860
 # ----------------------------
 # FastAPI
     allow_headers=["*"],
 )
+# ----------------------------
+# Frontend serving (web_demo/dist)
+# ----------------------------
+FRONTEND_DIR = os.path.join(os.getcwd(), "web_demo", "dist")
+ASSETS_DIR = os.path.join(FRONTEND_DIR, "assets")
+if os.path.isdir(ASSETS_DIR):
+    app.mount("/assets", StaticFiles(directory=ASSETS_DIR), name="assets")
+@app.get("/")
+async def serve_index():
+    """Serve the built frontend entry page.
+
+    Returns index.html from FRONTEND_DIR when it exists; otherwise a plain-text
+    404 telling the operator that the UI bundle is missing.
+    """
+    entry_page = os.path.join(FRONTEND_DIR, "index.html")
+    # Guard clause: bail out early when the bundle has not been built.
+    if not os.path.isfile(entry_page):
+        return PlainTextResponse("UI not built. Ensure web_demo/dist exists.", status_code=404)
+    return FileResponse(entry_page)
+# SPA fallback (optional): if you later add routes like /dashboard
+@app.get("/{path:path}")
+async def serve_spa_fallback(path: str):
+    """Serve a built frontend file, falling back to index.html for SPA routes.
+
+    Args:
+        path: Request path captured by the catch-all route (client-controlled).
+
+    Returns:
+        The matching file inside FRONTEND_DIR if one exists, else index.html,
+        else a plain-text 404 response.
+    """
+    # NOTE(review): this catch-all is registered before the GET routes declared
+    # later in the file (e.g. /ui/calls, /voice). Route matching is
+    # order-sensitive, so confirm those endpoints are still reachable via GET.
+    root = os.path.realpath(FRONTEND_DIR)
+    # `path` is untrusted: "../" segments or a leading "/" would let a plain
+    # os.path.join escape FRONTEND_DIR, so resolve the result and require that
+    # it stays inside the frontend directory before serving it.
+    candidate = os.path.realpath(os.path.join(root, path))
+    if candidate.startswith(root + os.sep) and os.path.isfile(candidate):
+        return FileResponse(candidate)
+    index_path = os.path.join(FRONTEND_DIR, "index.html")
+    if os.path.isfile(index_path):
+        return FileResponse(index_path)
+    return PlainTextResponse("Not Found", status_code=404)
 # ----------------------------
 # Audio / Twilio
 # ----------------------------
 STT_RATE = 16000
 BYTES_PER_20MS_MULAW = int(INPUT_RATE * (FRAME_MS / 1000.0))  # 160 bytes @ 8kHz, 20ms
 # ----------------------------
 # VAD settings
 # ----------------------------
+RMS_SPEECH_THRESHOLD = 550
 SPEECH_START_FRAMES = 3
 SPEECH_END_SILENCE_FRAMES = 40  # 800ms
 MAX_UTTERANCE_MS = 12000
 PARTIAL_EMIT_EVERY_MS = 250
 # ----------------------------
 # LLM prompt
 # ----------------------------
     "No filler. No greetings unless user greets first."
 )
 # ----------------------------
 # Cached Vosk model
 # ----------------------------
 _VOSK_MODEL = None
 def now_ms() -> int:
     return int(time.time() * 1000)
 def build_twiml(stream_url: str) -> str:
     return f"""<?xml version="1.0" encoding="UTF-8"?>
 <Response>
 </Response>
 """
 def split_mulaw_frames(mulaw_bytes: bytes) -> List[bytes]:
     frames = []
     for i in range(0, len(mulaw_bytes), BYTES_PER_20MS_MULAW):
         frames.append(chunk)
     return frames
 async def drain_queue(q: asyncio.Queue):
     try:
         while True:
     except asyncio.QueueEmpty:
         return
+# ----------------------------
+# UI live dashboard (multi-call)
+# ----------------------------
+_UI_CLIENTS = set()
+_UI_LOCK = asyncio.Lock()
+ACTIVE_CALLS: Dict[str, Dict] = {}     # key: streamSid
+ACTIVE_LOCK = asyncio.Lock()
+async def ui_broadcast(event: str, data: dict):
+    """Send one event envelope to every connected dashboard client.
+
+    Args:
+        event: Event name placed in the envelope's "event" field.
+        data: Payload placed in the envelope's "data" field.
+
+    Clients whose send fails are removed from _UI_CLIENTS.
+    """
+    envelope = {"event": event, "data": data, "ts_ms": now_ms()}
+    # Serialise once, outside the per-client try: an encoding failure is not a
+    # client failure and must not evict every connected dashboard.
+    try:
+        payload = json.dumps(envelope)
+    except (TypeError, ValueError) as e:
+        P("SYS>", f"ui_broadcast_encode_error={e}")
+        return
+    async with _UI_LOCK:
+        dead = []
+        # Iterate over a copy so the set is never mutated mid-iteration.
+        for c in list(_UI_CLIENTS):
+            try:
+                await c.send_text(payload)
+            except Exception:
+                dead.append(c)
+        _UI_CLIENTS.difference_update(dead)
+async def upsert_call(stream_sid: str, **fields):
+    """Merge `fields` into the ACTIVE_CALLS row for `stream_sid`.
+
+    The row is created when absent. Nothing happens when `stream_sid` is
+    empty or None.
+    """
+    if stream_sid:
+        async with ACTIVE_LOCK:
+            # setdefault inserts an empty row on first sight, then merges in place.
+            ACTIVE_CALLS.setdefault(stream_sid, {}).update(fields)
+async def remove_call(stream_sid: str):
+    """Delete the ACTIVE_CALLS row for `stream_sid`, if there is one.
+
+    Nothing happens when `stream_sid` is empty/None or has no row.
+    """
+    if stream_sid:
+        async with ACTIVE_LOCK:
+            if stream_sid in ACTIVE_CALLS:
+                del ACTIVE_CALLS[stream_sid]
+@app.websocket("/ui/ws")
+async def ui_ws(ws: WebSocket):
+    """Register a dashboard client and keep it registered until it disconnects.
+
+    Incoming messages are read and ignored; the socket is used only for
+    server-to-client broadcasts.
+    """
+    await ws.accept()
+    async with _UI_LOCK:
+        _UI_CLIENTS.add(ws)
+    try:
+        # Block on receive rather than sleeping: a disconnect only surfaces
+        # through the receive channel, so a sleep loop would never observe it
+        # and this handler would never finish.
+        while True:
+            message = await ws.receive()
+            if message.get("type") == "websocket.disconnect":
+                break
+    except WebSocketDisconnect:
+        pass
+    finally:
+        async with _UI_LOCK:
+            _UI_CLIENTS.discard(ws)
+@app.get("/ui/calls")
+async def ui_calls():
+    """Return a snapshot of ACTIVE_CALLS, with each row shallow-copied."""
+    snapshot = {}
+    async with ACTIVE_LOCK:
+        for sid, row in ACTIVE_CALLS.items():
+            # Copy each row so the response is detached from the live table.
+            snapshot[sid] = dict(row)
+    return snapshot
 # ----------------------------
 # OpenAI
 # ----------------------------
         raise RuntimeError("OPENAI_API_KEY not set")
     return OpenAI(api_key=OPENAI_API_KEY)
 def openai_answer_blocking(history: List[Dict], user_text: str) -> str:
     client = openai_client()
     msgs = [{"role": "system", "content": SYSTEM_PROMPT}]
     tail = history[-6:] if len(history) > 1 else []
     msgs.extend(tail)
     msgs.append({"role": "user", "content": user_text})
     ans = (resp.choices[0].message.content or "").strip()
     return ans
 # ----------------------------
+# Piper TTS -> 8k mulaw
 # ----------------------------
 def piper_tts_to_mulaw(text: str) -> bytes:
     if not PIPER_MODEL_PATH:
         if r1.returncode != 0:
             raise RuntimeError(f"piper rc={r1.returncode} stderr={r1.stderr.decode('utf-8','ignore')[:500]}")
         af = "highpass=f=200,lowpass=f=3400,compand=attacks=0:decays=0.3:points=-80/-80|-20/-10|0/-3"
         r2 = subprocess.run(
         with open(mulaw_path, "rb") as f:
             data = f.read()
         return data
     finally:
         for p in (wav_path, mulaw_path):
             except Exception:
                 pass
 # ----------------------------
 # Call state
 # ----------------------------
     def set(self):
         self.is_set = True
 @dataclass
 class CallState:
     call_id: str
         self.tts_generation_id += 1
         return self.tts_generation_id
 # ----------------------------
+# Keepalive marks
 # ----------------------------
 async def twilio_keepalive(ws: WebSocket, st: CallState):
     try:
                     "streamSid": st.stream_sid,
                     "mark": {"name": name},
                 }))
     except asyncio.CancelledError:
         return
     except Exception as e:
         P("SYS>", f"keepalive_error={e}")
 # ----------------------------
 # HTTP
 # ----------------------------
 async def health():
     return {"ok": True}
 @app.post("/voice")
 async def voice(request: Request):
     stream_url = TWILIO_STREAM_URL
         return PlainTextResponse("TWILIO_STREAM_URL not set and host not found", status_code=500)
     return Response(content=build_twiml(stream_url), media_type="application/xml")
 @app.get("/voice")
 async def voice_get(request: Request):
     return await voice(request)
 # ----------------------------
+# WebSocket /stream (Twilio)
 # ----------------------------
 @app.websocket("/stream")
 async def stream(ws: WebSocket):
                 st.stream_sid = msg["start"]["streamSid"]
                 P("TWILIO>", f"start streamSid={st.stream_sid}")
+                await upsert_call(
+                    st.stream_sid,
+                    call_id=st.call_id,
+                    started_ms=now_ms(),
+                    last_seen_ms=now_ms(),
+                    last_event="start",
+                )
+                await ui_broadcast("call_start", {"streamSid": st.stream_sid, "call_id": st.call_id})
                 if st.keepalive_task is None:
                     st.keepalive_task = asyncio.create_task(twilio_keepalive(ws, st))
+                # greeting (optional)
                 asyncio.create_task(speak_text(ws, st, "Hi! How can I help?"))
             elif event == "media":
                 await vad_and_stt(ws, st, pcm16_16k, is_speech)
             elif event == "mark":
+                # ignore; keepalive
+                pass
             elif event == "stop":
                 P("TWILIO>", "stop")
         P("SYS>", f"ws_error={e}")
         log.exception("ws_error")
     finally:
+        if st.stream_sid:
+            await remove_call(st.stream_sid)
+            await ui_broadcast("call_end", {"streamSid": st.stream_sid})
         if st.keepalive_task:
             st.keepalive_task.cancel()
         if st.outbound_task:
             st.outbound_task.cancel()
         P("SYS>", "ws_closed")
 # ----------------------------
 # VAD + STT
 # ----------------------------
     st.rec.AcceptWaveform(pcm16_16k)
     if t - st.last_partial_emit_ms >= PARTIAL_EMIT_EVERY_MS:
         st.last_partial_emit_ms = t
         try:
             partial = (pj.get("partial") or "").strip()
         except Exception:
             partial = ""
         if partial and partial != st.last_partial:
             st.last_partial = partial
             P("STT_PART>", partial)
+            await upsert_call(st.stream_sid, last_seen_ms=t, last_event="stt_partial")
+            await ui_broadcast("stt_partial", {"streamSid": st.stream_sid, "text": partial})
     if (t - st.utter_start_ms) > MAX_UTTERANCE_MS:
         await finalize_utterance(ws, st, "max_utterance")
     if st.silence_count >= SPEECH_END_SILENCE_FRAMES:
         await finalize_utterance(ws, st, f"vad_silence_{SPEECH_END_SILENCE_FRAMES*FRAME_MS}ms")
 async def finalize_utterance(ws: WebSocket, st: CallState, reason: str):
     if not st.in_speech:
         return
         return
     P("STT_FINAL>", f"{user_text}  ({reason})")
+    await upsert_call(st.stream_sid, last_seen_ms=now_ms(), last_event="stt_final", last_user_text=user_text)
+    await ui_broadcast("stt_final", {"streamSid": st.stream_sid, "text": user_text, "reason": reason})
     async def bot_job():
         async with st.bot_lock:
     asyncio.create_task(bot_job())
 # ----------------------------
 # LLM Answer -> Speak
 # ----------------------------
 async def answer_and_speak(ws: WebSocket, st: CallState, user_text: str):
     st.cancel_llm = CancelFlag(False)
     st.history.append({"role": "user", "content": user_text})
     st.history = st.history[:1] + st.history[-8:]
         ans = "Sorry, I didn’t catch that."
     P("LLM_ANS>", ans)
+    await upsert_call(st.stream_sid, last_seen_ms=now_ms(), last_event="llm_ans", last_bot_text=ans)
+    await ui_broadcast("llm_ans", {"streamSid": st.stream_sid, "text": ans})
     st.history.append({"role": "assistant", "content": ans})
     st.history = st.history[:1] + st.history[-8:]
     await speak_text(ws, st, ans)
 # ----------------------------
 # Barge-in (clear + drain)
 # ----------------------------
     await drain_queue(st.outbound_q)
     st.bot_speaking = False
 # ----------------------------
+# Speak / TTS (no clear here; clear only on barge-in)
 # ----------------------------
 async def speak_text(ws: WebSocket, st: CallState, text: str):
     gen = st.bump_tts_generation()
     await tts_enqueue(st, text, gen)
 async def tts_enqueue(st: CallState, text: str, gen: int):
     my_gen = gen
     st.bot_speaking = True
     P("TTS>", f"text={text} gen={my_gen}")
+    await ui_broadcast("tts", {"streamSid": st.stream_sid, "text": text, "gen": my_gen})
     loop = asyncio.get_running_loop()
     try:
         return
     if my_gen != st.tts_generation_id:
         return
+    # enqueue audio frames
     for fr in split_mulaw_frames(mulaw_bytes):
         if my_gen != st.tts_generation_id:
             return
         await st.outbound_q.put(base64.b64encode(fr).decode("ascii"))
+    # add a short silence tail to prevent cutoff
+    silence = base64.b64encode(b"\xFF" * BYTES_PER_20MS_MULAW).decode("ascii")
+    for _ in range(6):  # ~120ms
+        await st.outbound_q.put(silence)
     await st.outbound_q.put("__END_CHUNK__")
 async def outbound_sender(ws: WebSocket, st: CallState):
     try:
         while True:
         P("SYS>", f"outbound_sender_error={e}")
         log.exception("outbound_sender_error")
 # ----------------------------
 # main
 # ----------------------------
 if __name__ == "__main__":
     import uvicorn
     P("SYS>", f"starting {HOST}:{PORT}")
+    uvicorn.run(app, host=HOST, port=PORT)