Spaces:

ashishkblink
/

neuralvoiceGPU

Sleeping

App Files Files Community

ashishkblink committed on Jan 21

Commit

9e1ff8d

verified ·

1 Parent(s): 01cb0e6

Update pipe_method3.py

Browse files

Files changed (1) hide show

pipe_method3.py +36 -73

pipe_method3.py CHANGED Viewed

@@ -1,18 +1,9 @@
 """
 Twilio Media Streams (bidirectional) + Vosk + OpenAI Answer + Piper -> Twilio playback
-What this version does:
-- NO intent / NO clarify JSON
-- Logs only:
-  STT_FINAL> ...
-  LLM_ANS> ...
-  TTS> ...
-- Generation-id safe TTS
-- Better phone clarity using ffmpeg filters (highpass/lowpass/compand)
-- Proper 20ms pacing + keepalive marks to prevent WS idle timeouts
 HF Spaces notes:
 - DO NOT run uvicorn here
-- This FastAPI app is mounted at /twilio in app.py
   so routes here must be relative:
     POST /voice   -> /twilio/voice
     WS   /stream  -> /twilio/stream
@@ -36,6 +27,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from vosk import Model, KaldiRecognizer
 from openai import OpenAI
 # ----------------------------
 # Logging
 # ----------------------------
@@ -95,18 +87,12 @@ LAST_STATE = {
     "updated_ms": 0,
 }
-# ----------------------------
-# LLM prompt
-# ----------------------------
 SYSTEM_PROMPT = (
     "You are a phone-call assistant. "
     "Reply in 1 short sentence (max 15 words). "
     "No filler. No greetings unless user greets first."
 )
-# ----------------------------
-# Cached Vosk model
-# ----------------------------
 _VOSK_MODEL = None
 def now_ms() -> int:
@@ -160,11 +146,10 @@ def openai_answer_blocking(history: List[Dict], user_text: str) -> str:
         temperature=0.3,
         max_tokens=80,
     )
-    ans = (resp.choices[0].message.content or "").strip()
-    return ans
 # ----------------------------
-# Piper TTS -> 8k mulaw (clarity improved)
 # ----------------------------
 def piper_tts_to_mulaw(text: str) -> bytes:
     if not PIPER_MODEL_PATH:
@@ -189,6 +174,7 @@ def piper_tts_to_mulaw(text: str) -> bytes:
         if r1.returncode != 0:
             raise RuntimeError(f"piper rc={r1.returncode} stderr={r1.stderr.decode('utf-8','ignore')[:500]}")
         af = "highpass=f=200,lowpass=f=3400,compand=attacks=0:decays=0.3:points=-80/-80|-20/-10|0/-3"
         r2 = subprocess.run(
@@ -203,10 +189,7 @@ def piper_tts_to_mulaw(text: str) -> bytes:
             raise RuntimeError(f"ffmpeg rc={r2.returncode} stderr={r2.stderr.decode('utf-8','ignore')[:500]}")
         with open(mulaw_path, "rb") as f:
-            data = f.read()
-        P("TTS>", f"audio_bytes={len(data)}")
-        return data
     finally:
         for p in (wav_path, mulaw_path):
             try:
@@ -214,6 +197,7 @@ def piper_tts_to_mulaw(text: str) -> bytes:
             except Exception:
                 pass
 # ----------------------------
 # Call state
 # ----------------------------
@@ -228,7 +212,6 @@ class CallState:
     call_id: str
     stream_sid: str = ""
-    # vad
     in_speech: bool = False
     speech_start_count: int = 0
     silence_count: int = 0
@@ -236,22 +219,18 @@ class CallState:
     rec: Optional[KaldiRecognizer] = None
-    # partials
     last_partial: str = ""
     last_partial_emit_ms: int = 0
-    # outbound
     outbound_q: asyncio.Queue = field(default_factory=lambda: asyncio.Queue(maxsize=50000))
     outbound_task: Optional[asyncio.Task] = None
     keepalive_task: Optional[asyncio.Task] = None
     mark_i: int = 0
-    # speaking / generation
     bot_speaking: bool = False
     cancel_llm: CancelFlag = field(default_factory=CancelFlag)
     tts_generation_id: int = 0
-    # conversation history
     history: List[Dict] = field(default_factory=list)
     bot_lock: asyncio.Lock = field(default_factory=asyncio.Lock)
@@ -259,9 +238,7 @@ class CallState:
         self.tts_generation_id += 1
         return self.tts_generation_id
-# ----------------------------
-# Keepalive marks (prevents WS ping timeout)
-# ----------------------------
 async def twilio_keepalive(ws: WebSocket, st: CallState):
     try:
         while True:
@@ -280,39 +257,39 @@ async def twilio_keepalive(ws: WebSocket, st: CallState):
     except Exception as e:
         P("SYS>", f"keepalive_error={e}")
 # ----------------------------
-# HTTP (RELATIVE ROUTES)
-# Final URLs will be:
-#   POST /twilio/voice
-#   WS   /twilio/stream
 # ----------------------------
-@app.get("/health")
-async def health():
-    return {"ok": True}
 @app.post("/voice")
 async def voice(request: Request):
     stream_url = TWILIO_STREAM_URL
     if not stream_url:
         host = request.headers.get("host")
         if host:
-            # because this app is mounted at /twilio
             stream_url = f"wss://{host}/twilio/stream"
             P("SYS>", f"auto_stream_url={stream_url}")
     if not stream_url:
         return PlainTextResponse("TWILIO_STREAM_URL not set and host not found", status_code=500)
     return Response(content=build_twiml(stream_url), media_type="application/xml")
-@app.get("/voice")
-async def voice_get(request: Request):
-    return await voice(request)
-@app.get("/debug/last")
-async def debug_last():
-    return LAST_STATE
 # ----------------------------
-# WebSocket /stream (RELATIVE)
 # ----------------------------
 @app.websocket("/stream")
 async def stream(ws: WebSocket):
@@ -360,10 +337,6 @@ async def stream(ws: WebSocket):
                 await vad_and_stt(ws, st, pcm16_16k, is_speech)
-            elif event == "mark":
-                name = (msg.get("mark") or {}).get("name")
-                P("TWILIO>", f"mark_received={name}")
             elif event == "stop":
                 P("TWILIO>", "stop")
                 break
@@ -380,6 +353,7 @@ async def stream(ws: WebSocket):
             st.outbound_task.cancel()
         P("SYS>", "ws_closed")
 # ----------------------------
 # VAD + STT
 # ----------------------------
@@ -428,6 +402,7 @@ async def vad_and_stt(ws: WebSocket, st: CallState, pcm16_16k: bytes, is_speech:
     if st.silence_count >= SPEECH_END_SILENCE_FRAMES:
         await finalize_utterance(ws, st, f"vad_silence_{SPEECH_END_SILENCE_FRAMES*FRAME_MS}ms")
 async def finalize_utterance(ws: WebSocket, st: CallState, reason: str):
     if not st.in_speech:
         return
@@ -454,12 +429,8 @@ async def finalize_utterance(ws: WebSocket, st: CallState, reason: str):
     asyncio.create_task(bot_job())
-# ----------------------------
-# LLM Answer -> Speak
-# ----------------------------
-async def answer_and_speak(ws: WebSocket, st: CallState, user_text: str):
-    st.cancel_llm = CancelFlag(False)
     st.history.append({"role": "user", "content": user_text})
     st.history = st.history[:1] + st.history[-8:]
@@ -469,9 +440,7 @@ async def answer_and_speak(ws: WebSocket, st: CallState, user_text: str):
         return openai_answer_blocking(st.history, user_text)
     ans = await loop.run_in_executor(None, worker)
-    ans = (ans or "").strip()
-    if not ans:
-        ans = "Sorry, I didn’t catch that."
     P("LLM_ANS>", ans)
@@ -480,11 +449,8 @@ async def answer_and_speak(ws: WebSocket, st: CallState, user_text: str):
     await speak_text(ws, st, ans)
-# ----------------------------
-# Barge-in (clear + drain)
-# ----------------------------
 async def barge_in(ws: WebSocket, st: CallState):
-    st.cancel_llm.set()
     st.bump_tts_generation()
     if st.stream_sid:
@@ -497,9 +463,7 @@ async def barge_in(ws: WebSocket, st: CallState):
     await drain_queue(st.outbound_q)
     st.bot_speaking = False
-# ----------------------------
-# Speak / TTS with generation-id
-# ----------------------------
 async def speak_text(ws: WebSocket, st: CallState, text: str):
     gen = st.bump_tts_generation()
@@ -509,14 +473,14 @@ async def speak_text(ws: WebSocket, st: CallState, text: str):
             P("TWILIO>", "sent_clear")
         except Exception:
             pass
-    await drain_queue(st.outbound_q)
     await tts_enqueue(st, text, gen)
 async def tts_enqueue(st: CallState, text: str, gen: int):
-    my_gen = gen
     st.bot_speaking = True
-    P("TTS>", f"text={text} gen={my_gen}")
     loop = asyncio.get_running_loop()
     try:
@@ -526,18 +490,17 @@ async def tts_enqueue(st: CallState, text: str, gen: int):
         st.bot_speaking = False
         return
-    if my_gen != st.tts_generation_id:
-        P("TTS>", f"discard_gen my_gen={my_gen} current_gen={st.tts_generation_id}")
         return
     for fr in split_mulaw_frames(mulaw_bytes):
-        if my_gen != st.tts_generation_id:
-            P("TTS>", f"discard_midstream my_gen={my_gen} current_gen={st.tts_generation_id}")
             return
         await st.outbound_q.put(base64.b64encode(fr).decode("ascii"))
     await st.outbound_q.put("__END_CHUNK__")
 async def outbound_sender(ws: WebSocket, st: CallState):
     try:
         while True:

 """
 Twilio Media Streams (bidirectional) + Vosk + OpenAI Answer + Piper -> Twilio playback
 HF Spaces notes:
 - DO NOT run uvicorn here
+- This app is mounted at /twilio in app.py
   so routes here must be relative:
     POST /voice   -> /twilio/voice
     WS   /stream  -> /twilio/stream
 from vosk import Model, KaldiRecognizer
 from openai import OpenAI
 # ----------------------------
 # Logging
 # ----------------------------
     "updated_ms": 0,
 }
 SYSTEM_PROMPT = (
     "You are a phone-call assistant. "
     "Reply in 1 short sentence (max 15 words). "
     "No filler. No greetings unless user greets first."
 )
 _VOSK_MODEL = None
 def now_ms() -> int:
         temperature=0.3,
         max_tokens=80,
     )
+    return (resp.choices[0].message.content or "").strip()
 # ----------------------------
+# Piper TTS -> 8k mulaw
 # ----------------------------
 def piper_tts_to_mulaw(text: str) -> bytes:
     if not PIPER_MODEL_PATH:
         if r1.returncode != 0:
             raise RuntimeError(f"piper rc={r1.returncode} stderr={r1.stderr.decode('utf-8','ignore')[:500]}")
+        # clarity filter for phone audio
         af = "highpass=f=200,lowpass=f=3400,compand=attacks=0:decays=0.3:points=-80/-80|-20/-10|0/-3"
         r2 = subprocess.run(
             raise RuntimeError(f"ffmpeg rc={r2.returncode} stderr={r2.stderr.decode('utf-8','ignore')[:500]}")
         with open(mulaw_path, "rb") as f:
+            return f.read()
     finally:
         for p in (wav_path, mulaw_path):
             try:
             except Exception:
                 pass
 # ----------------------------
 # Call state
 # ----------------------------
     call_id: str
     stream_sid: str = ""
     in_speech: bool = False
     speech_start_count: int = 0
     silence_count: int = 0
     rec: Optional[KaldiRecognizer] = None
     last_partial: str = ""
     last_partial_emit_ms: int = 0
     outbound_q: asyncio.Queue = field(default_factory=lambda: asyncio.Queue(maxsize=50000))
     outbound_task: Optional[asyncio.Task] = None
     keepalive_task: Optional[asyncio.Task] = None
     mark_i: int = 0
     bot_speaking: bool = False
     cancel_llm: CancelFlag = field(default_factory=CancelFlag)
     tts_generation_id: int = 0
     history: List[Dict] = field(default_factory=list)
     bot_lock: asyncio.Lock = field(default_factory=asyncio.Lock)
         self.tts_generation_id += 1
         return self.tts_generation_id
 async def twilio_keepalive(ws: WebSocket, st: CallState):
     try:
         while True:
     except Exception as e:
         P("SYS>", f"keepalive_error={e}")
 # ----------------------------
+# Twilio Voice Webhook (FIXES 405)
+# Accept POST + GET + trailing slash
 # ----------------------------
 @app.post("/voice")
+@app.post("/voice/")
+@app.get("/voice")
+@app.get("/voice/")
 async def voice(request: Request):
     stream_url = TWILIO_STREAM_URL
+    # Auto-build if not set
     if not stream_url:
         host = request.headers.get("host")
         if host:
+            # mounted at /twilio, so final ws is /twilio/stream
             stream_url = f"wss://{host}/twilio/stream"
             P("SYS>", f"auto_stream_url={stream_url}")
     if not stream_url:
         return PlainTextResponse("TWILIO_STREAM_URL not set and host not found", status_code=500)
     return Response(content=build_twiml(stream_url), media_type="application/xml")
+@app.get("/health")
+async def health():
+    return {"ok": True}
 # ----------------------------
+# WebSocket /stream (mounted => /twilio/stream)
 # ----------------------------
 @app.websocket("/stream")
 async def stream(ws: WebSocket):
                 await vad_and_stt(ws, st, pcm16_16k, is_speech)
             elif event == "stop":
                 P("TWILIO>", "stop")
                 break
             st.outbound_task.cancel()
         P("SYS>", "ws_closed")
 # ----------------------------
 # VAD + STT
 # ----------------------------
     if st.silence_count >= SPEECH_END_SILENCE_FRAMES:
         await finalize_utterance(ws, st, f"vad_silence_{SPEECH_END_SILENCE_FRAMES*FRAME_MS}ms")
 async def finalize_utterance(ws: WebSocket, st: CallState, reason: str):
     if not st.in_speech:
         return
     asyncio.create_task(bot_job())
+async def answer_and_speak(ws: WebSocket, st: CallState, user_text: str):
     st.history.append({"role": "user", "content": user_text})
     st.history = st.history[:1] + st.history[-8:]
         return openai_answer_blocking(st.history, user_text)
     ans = await loop.run_in_executor(None, worker)
+    ans = (ans or "").strip() or "Sorry, I didn’t catch that."
     P("LLM_ANS>", ans)
     await speak_text(ws, st, ans)
 async def barge_in(ws: WebSocket, st: CallState):
     st.bump_tts_generation()
     if st.stream_sid:
     await drain_queue(st.outbound_q)
     st.bot_speaking = False
 async def speak_text(ws: WebSocket, st: CallState, text: str):
     gen = st.bump_tts_generation()
             P("TWILIO>", "sent_clear")
         except Exception:
             pass
+    await drain_queue(st.outbound_q)
     await tts_enqueue(st, text, gen)
 async def tts_enqueue(st: CallState, text: str, gen: int):
     st.bot_speaking = True
+    P("TTS>", f"text={text} gen={gen}")
     loop = asyncio.get_running_loop()
     try:
         st.bot_speaking = False
         return
+    if gen != st.tts_generation_id:
         return
     for fr in split_mulaw_frames(mulaw_bytes):
+        if gen != st.tts_generation_id:
             return
         await st.outbound_q.put(base64.b64encode(fr).decode("ascii"))
     await st.outbound_q.put("__END_CHUNK__")
 async def outbound_sender(ws: WebSocket, st: CallState):
     try:
         while True: