ashishkblink committed on
Commit
0f49668
·
verified ·
1 Parent(s): 134cdf0

Update pipe_method3.py

Browse files
Files changed (1) hide show
  1. pipe_method3.py +52 -19
pipe_method3.py CHANGED
@@ -1,9 +1,13 @@
1
  """
2
  Twilio Media Streams (bidirectional) + Vosk + OpenAI Answer + Piper -> Twilio playback
3
- Spaces-safe version:
4
- - Does NOT start uvicorn
5
- - Does NOT bind ports
6
- - Only exposes FastAPI `app` for mounting
 
 
 
 
7
  """
8
 
9
  import asyncio
@@ -35,6 +39,7 @@ log = logging.getLogger("twilio")
35
  def P(tag: str, msg: str):
36
  print(f"{tag} {msg}", flush=True)
37
 
 
38
  # ----------------------------
39
  # Env
40
  # ----------------------------
@@ -47,8 +52,9 @@ OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini").strip()
47
  PIPER_BIN = os.getenv("PIPER_BIN", "piper").strip()
48
  PIPER_MODEL_PATH = os.getenv("PIPER_MODEL_PATH", "").strip()
49
 
 
50
  # ----------------------------
51
- # FastAPI (Twilio sub-app)
52
  # ----------------------------
53
  app = FastAPI()
54
  app.add_middleware(
@@ -59,6 +65,7 @@ app.add_middleware(
59
  allow_headers=["*"],
60
  )
61
 
 
62
  # ----------------------------
63
  # Audio / Twilio
64
  # ----------------------------
@@ -67,6 +74,7 @@ INPUT_RATE = 8000
67
  STT_RATE = 16000
68
  BYTES_PER_20MS_MULAW = int(INPUT_RATE * (FRAME_MS / 1000.0)) # 160 bytes @ 8kHz, 20ms
69
 
 
70
  # ----------------------------
71
  # VAD settings
72
  # ----------------------------
@@ -76,6 +84,16 @@ SPEECH_END_SILENCE_FRAMES = 40 # 800ms
76
  MAX_UTTERANCE_MS = 12000
77
  PARTIAL_EMIT_EVERY_MS = 250
78
 
 
 
 
 
 
 
 
 
 
 
79
  # ----------------------------
80
  # LLM prompt
81
  # ----------------------------
@@ -85,22 +103,17 @@ SYSTEM_PROMPT = (
85
  "No filler. No greetings unless user greets first."
86
  )
87
 
88
- LAST_STATE = {
89
- "connected": False,
90
- "last_stt": "",
91
- "last_llm": "",
92
- "last_tts": "",
93
- "updated_ms": 0,
94
- }
95
 
96
  # ----------------------------
97
  # Cached Vosk model
98
  # ----------------------------
99
  _VOSK_MODEL = None
100
 
 
101
  def now_ms() -> int:
102
  return int(time.time() * 1000)
103
 
 
104
  def build_twiml(stream_url: str) -> str:
105
  return f"""<?xml version="1.0" encoding="UTF-8"?>
106
  <Response>
@@ -111,6 +124,7 @@ def build_twiml(stream_url: str) -> str:
111
  </Response>
112
  """
113
 
 
114
  def split_mulaw_frames(mulaw_bytes: bytes) -> List[bytes]:
115
  frames = []
116
  for i in range(0, len(mulaw_bytes), BYTES_PER_20MS_MULAW):
@@ -120,6 +134,7 @@ def split_mulaw_frames(mulaw_bytes: bytes) -> List[bytes]:
120
  frames.append(chunk)
121
  return frames
122
 
 
123
  async def drain_queue(q: asyncio.Queue):
124
  try:
125
  while True:
@@ -128,6 +143,7 @@ async def drain_queue(q: asyncio.Queue):
128
  except asyncio.QueueEmpty:
129
  return
130
 
 
131
  # ----------------------------
132
  # OpenAI
133
  # ----------------------------
@@ -136,6 +152,7 @@ def openai_client() -> OpenAI:
136
  raise RuntimeError("OPENAI_API_KEY not set")
137
  return OpenAI(api_key=OPENAI_API_KEY)
138
 
 
139
  def openai_answer_blocking(history: List[Dict], user_text: str) -> str:
140
  client = openai_client()
141
  msgs = [{"role": "system", "content": SYSTEM_PROMPT}]
@@ -151,6 +168,7 @@ def openai_answer_blocking(history: List[Dict], user_text: str) -> str:
151
  )
152
  return (resp.choices[0].message.content or "").strip()
153
 
 
154
  # ----------------------------
155
  # Piper TTS -> 8k mulaw
156
  # ----------------------------
@@ -202,6 +220,7 @@ def piper_tts_to_mulaw(text: str) -> bytes:
202
  except Exception:
203
  pass
204
 
 
205
  # ----------------------------
206
  # Call state
207
  # ----------------------------
@@ -211,6 +230,7 @@ class CancelFlag:
211
  def set(self):
212
  self.is_set = True
213
 
 
214
  @dataclass
215
  class CallState:
216
  call_id: str
@@ -242,6 +262,7 @@ class CallState:
242
  self.tts_generation_id += 1
243
  return self.tts_generation_id
244
 
 
245
  # ----------------------------
246
  # Keepalive marks
247
  # ----------------------------
@@ -263,35 +284,43 @@ async def twilio_keepalive(ws: WebSocket, st: CallState):
263
  except Exception as e:
264
  P("SYS>", f"keepalive_error={e}")
265
 
 
266
  # ----------------------------
267
- # HTTP
268
  # ----------------------------
269
  @app.get("/health")
270
  async def health():
271
  return {"ok": True}
272
 
 
273
  @app.post("/voice")
274
  async def voice(request: Request):
275
  stream_url = TWILIO_STREAM_URL
276
  if not stream_url:
277
  host = request.headers.get("host")
278
  if host:
 
279
  stream_url = f"wss://{host}/twilio/stream"
280
  P("SYS>", f"auto_stream_url={stream_url}")
 
281
  if not stream_url:
282
  return PlainTextResponse("TWILIO_STREAM_URL not set and host not found", status_code=500)
 
283
  return Response(content=build_twiml(stream_url), media_type="application/xml")
284
 
 
285
  @app.get("/voice")
286
  async def voice_get(request: Request):
287
  return await voice(request)
288
 
 
289
  @app.get("/debug/last")
290
  async def debug_last():
291
  return LAST_STATE
292
 
 
293
  # ----------------------------
294
- # WebSocket
295
  # ----------------------------
296
  @app.websocket("/stream")
297
  async def stream(ws: WebSocket):
@@ -355,6 +384,7 @@ async def stream(ws: WebSocket):
355
  st.outbound_task.cancel()
356
  P("SYS>", "ws_closed")
357
 
 
358
  # ----------------------------
359
  # VAD + STT
360
  # ----------------------------
@@ -403,6 +433,7 @@ async def vad_and_stt(ws: WebSocket, st: CallState, pcm16_16k: bytes, is_speech:
403
  if st.silence_count >= SPEECH_END_SILENCE_FRAMES:
404
  await finalize_utterance(ws, st, f"vad_silence_{SPEECH_END_SILENCE_FRAMES*FRAME_MS}ms")
405
 
 
406
  async def finalize_utterance(ws: WebSocket, st: CallState, reason: str):
407
  if not st.in_speech:
408
  return
@@ -429,19 +460,18 @@ async def finalize_utterance(ws: WebSocket, st: CallState, reason: str):
429
 
430
  asyncio.create_task(bot_job())
431
 
 
432
  # ----------------------------
433
- # LLM Answer -> Speak
434
  # ----------------------------
435
  async def answer_and_speak(ws: WebSocket, st: CallState, user_text: str):
436
- st.cancel_llm = CancelFlag(False)
437
-
438
  st.history.append({"role": "user", "content": user_text})
439
  st.history = st.history[:1] + st.history[-8:]
440
 
441
  loop = asyncio.get_running_loop()
442
  ans = await loop.run_in_executor(None, openai_answer_blocking, st.history, user_text)
443
-
444
  ans = (ans or "").strip() or "Sorry, I didn’t catch that."
 
445
  P("LLM_ANS>", ans)
446
 
447
  st.history.append({"role": "assistant", "content": ans})
@@ -449,11 +479,11 @@ async def answer_and_speak(ws: WebSocket, st: CallState, user_text: str):
449
 
450
  await speak_text(ws, st, ans)
451
 
 
452
  # ----------------------------
453
  # Barge-in
454
  # ----------------------------
455
  async def barge_in(ws: WebSocket, st: CallState):
456
- st.cancel_llm.set()
457
  st.bump_tts_generation()
458
 
459
  if st.stream_sid:
@@ -466,6 +496,7 @@ async def barge_in(ws: WebSocket, st: CallState):
466
  await drain_queue(st.outbound_q)
467
  st.bot_speaking = False
468
 
 
469
  # ----------------------------
470
  # Speak / TTS
471
  # ----------------------------
@@ -482,6 +513,7 @@ async def speak_text(ws: WebSocket, st: CallState, text: str):
482
  await drain_queue(st.outbound_q)
483
  await tts_enqueue(st, text, gen)
484
 
 
485
  async def tts_enqueue(st: CallState, text: str, gen: int):
486
  st.bot_speaking = True
487
  P("TTS>", f"text={text} gen={gen}")
@@ -506,6 +538,7 @@ async def tts_enqueue(st: CallState, text: str, gen: int):
506
 
507
  await st.outbound_q.put("__END_CHUNK__")
508
 
 
509
  async def outbound_sender(ws: WebSocket, st: CallState):
510
  try:
511
  while True:
 
1
  """
2
  Twilio Media Streams (bidirectional) + Vosk + OpenAI Answer + Piper -> Twilio playback
3
+
4
+ Spaces-safe changes:
5
+ - NO uvicorn.run()
6
+ - NO port binding
7
+ - Routes are RELATIVE because this app is mounted at /twilio by app.py
8
+ So:
9
+ POST /voice => /twilio/voice
10
+ WS /stream => /twilio/stream
11
  """
12
 
13
  import asyncio
 
39
def P(tag: str, msg: str):
    """Emit one tagged log line, flushed immediately so it shows up in real time."""
    line = f"{tag} {msg}"
    print(line, flush=True)
41
 
42
+
43
  # ----------------------------
44
  # Env
45
  # ----------------------------
 
52
  PIPER_BIN = os.getenv("PIPER_BIN", "piper").strip()
53
  PIPER_MODEL_PATH = os.getenv("PIPER_MODEL_PATH", "").strip()
54
 
55
+
56
  # ----------------------------
57
+ # FastAPI (this is a sub-app)
58
  # ----------------------------
59
  app = FastAPI()
60
  app.add_middleware(
 
65
  allow_headers=["*"],
66
  )
67
 
68
+
69
  # ----------------------------
70
  # Audio / Twilio
71
  # ----------------------------
 
74
  STT_RATE = 16000
75
  BYTES_PER_20MS_MULAW = int(INPUT_RATE * (FRAME_MS / 1000.0)) # 160 bytes @ 8kHz, 20ms
76
 
77
+
78
  # ----------------------------
79
  # VAD settings
80
  # ----------------------------
 
84
  MAX_UTTERANCE_MS = 12000
85
  PARTIAL_EMIT_EVERY_MS = 250
86
 
87
+
88
# Most recent pipeline snapshot, exposed read-only via GET /debug/last.
# NOTE(review): the writers are not visible in this view — presumably the
# STT/LLM/TTS stages update these fields during a call; confirm against the
# stream handler before relying on field meanings.
LAST_STATE = {
    "connected": False,  # presumably: a Twilio media-stream WS is attached — TODO confirm
    "last_stt": "",      # presumably: last recognized user utterance — TODO confirm
    "last_llm": "",      # presumably: last LLM answer text — TODO confirm
    "last_tts": "",      # presumably: last text handed to Piper TTS — TODO confirm
    "updated_ms": 0,     # presumably: epoch-ms of last update (see now_ms) — TODO confirm
}
95
+
96
+
97
  # ----------------------------
98
  # LLM prompt
99
  # ----------------------------
 
103
  "No filler. No greetings unless user greets first."
104
  )
105
 
 
 
 
 
 
 
 
106
 
107
  # ----------------------------
108
  # Cached Vosk model
109
  # ----------------------------
110
  _VOSK_MODEL = None
111
 
112
+
113
def now_ms() -> int:
    """Return the current Unix time as whole milliseconds (truncated, not rounded)."""
    seconds = time.time()
    return int(seconds * 1000)
115
 
116
+
117
  def build_twiml(stream_url: str) -> str:
118
  return f"""<?xml version="1.0" encoding="UTF-8"?>
119
  <Response>
 
124
  </Response>
125
  """
126
 
127
+
128
  def split_mulaw_frames(mulaw_bytes: bytes) -> List[bytes]:
129
  frames = []
130
  for i in range(0, len(mulaw_bytes), BYTES_PER_20MS_MULAW):
 
134
  frames.append(chunk)
135
  return frames
136
 
137
+
138
  async def drain_queue(q: asyncio.Queue):
139
  try:
140
  while True:
 
143
  except asyncio.QueueEmpty:
144
  return
145
 
146
+
147
  # ----------------------------
148
  # OpenAI
149
  # ----------------------------
 
152
  raise RuntimeError("OPENAI_API_KEY not set")
153
  return OpenAI(api_key=OPENAI_API_KEY)
154
 
155
+
156
  def openai_answer_blocking(history: List[Dict], user_text: str) -> str:
157
  client = openai_client()
158
  msgs = [{"role": "system", "content": SYSTEM_PROMPT}]
 
168
  )
169
  return (resp.choices[0].message.content or "").strip()
170
 
171
+
172
  # ----------------------------
173
  # Piper TTS -> 8k mulaw
174
  # ----------------------------
 
220
  except Exception:
221
  pass
222
 
223
+
224
  # ----------------------------
225
  # Call state
226
  # ----------------------------
 
230
  def set(self):
231
  self.is_set = True
232
 
233
+
234
  @dataclass
235
  class CallState:
236
  call_id: str
 
262
  self.tts_generation_id += 1
263
  return self.tts_generation_id
264
 
265
+
266
  # ----------------------------
267
  # Keepalive marks
268
  # ----------------------------
 
284
  except Exception as e:
285
  P("SYS>", f"keepalive_error={e}")
286
 
287
+
288
  # ----------------------------
289
+ # HTTP (RELATIVE ROUTES)
290
  # ----------------------------
291
@app.get("/health")
async def health():
    """Liveness probe for the mounted Twilio sub-app; always reports healthy."""
    status = {"ok": True}
    return status
294
 
295
+
296
@app.post("/voice")
async def voice(request: Request):
    """Twilio voice webhook: answer with TwiML that opens our media stream.

    The stream URL comes from TWILIO_STREAM_URL when configured; otherwise
    it is derived from the request's Host header. Returns a 500 plain-text
    response when neither source yields a URL.
    """
    stream_url = TWILIO_STREAM_URL
    if not stream_url:
        host = request.headers.get("host")
        if not host:
            # No env override and no Host header: nothing to build a URL from.
            return PlainTextResponse("TWILIO_STREAM_URL not set and host not found", status_code=500)
        # IMPORTANT: mounted at /twilio in app.py
        stream_url = f"wss://{host}/twilio/stream"
        P("SYS>", f"auto_stream_url={stream_url}")
    return Response(content=build_twiml(stream_url), media_type="application/xml")
310
 
311
+
312
@app.get("/voice")
async def voice_get(request: Request):
    """GET alias for the Twilio webhook; delegates to the POST /voice handler."""
    twiml_response = await voice(request)
    return twiml_response
315
 
316
+
317
@app.get("/debug/last")
async def debug_last():
    """Return the module-level LAST_STATE dict as JSON for quick debugging.

    NOTE(review): this exposes the live mutable dict — callers see whatever
    the pipeline last wrote into it.
    """
    return LAST_STATE
320
 
321
+
322
  # ----------------------------
323
+ # WebSocket (RELATIVE ROUTE)
324
  # ----------------------------
325
  @app.websocket("/stream")
326
  async def stream(ws: WebSocket):
 
384
  st.outbound_task.cancel()
385
  P("SYS>", "ws_closed")
386
 
387
+
388
  # ----------------------------
389
  # VAD + STT
390
  # ----------------------------
 
433
  if st.silence_count >= SPEECH_END_SILENCE_FRAMES:
434
  await finalize_utterance(ws, st, f"vad_silence_{SPEECH_END_SILENCE_FRAMES*FRAME_MS}ms")
435
 
436
+
437
  async def finalize_utterance(ws: WebSocket, st: CallState, reason: str):
438
  if not st.in_speech:
439
  return
 
460
 
461
  asyncio.create_task(bot_job())
462
 
463
+
464
  # ----------------------------
465
+ # LLM -> Speak
466
  # ----------------------------
467
  async def answer_and_speak(ws: WebSocket, st: CallState, user_text: str):
 
 
468
  st.history.append({"role": "user", "content": user_text})
469
  st.history = st.history[:1] + st.history[-8:]
470
 
471
  loop = asyncio.get_running_loop()
472
  ans = await loop.run_in_executor(None, openai_answer_blocking, st.history, user_text)
 
473
  ans = (ans or "").strip() or "Sorry, I didn’t catch that."
474
+
475
  P("LLM_ANS>", ans)
476
 
477
  st.history.append({"role": "assistant", "content": ans})
 
479
 
480
  await speak_text(ws, st, ans)
481
 
482
+
483
  # ----------------------------
484
  # Barge-in
485
  # ----------------------------
486
  async def barge_in(ws: WebSocket, st: CallState):
 
487
  st.bump_tts_generation()
488
 
489
  if st.stream_sid:
 
496
  await drain_queue(st.outbound_q)
497
  st.bot_speaking = False
498
 
499
+
500
  # ----------------------------
501
  # Speak / TTS
502
  # ----------------------------
 
513
  await drain_queue(st.outbound_q)
514
  await tts_enqueue(st, text, gen)
515
 
516
+
517
  async def tts_enqueue(st: CallState, text: str, gen: int):
518
  st.bot_speaking = True
519
  P("TTS>", f"text={text} gen={gen}")
 
538
 
539
  await st.outbound_q.put("__END_CHUNK__")
540
 
541
+
542
  async def outbound_sender(ws: WebSocket, st: CallState):
543
  try:
544
  while True: