ashishkblink committed on
Commit
0f49668
·
verified ·
1 Parent(s): 134cdf0

Update pipe_method3.py

Browse files
Files changed (1) hide show
  1. pipe_method3.py +52 -19
pipe_method3.py CHANGED
@@ -1,9 +1,13 @@
1
  """
2
  Twilio Media Streams (bidirectional) + Vosk + OpenAI Answer + Piper -> Twilio playback
3
- Spaces-safe version:
4
- - Does NOT start uvicorn
5
- - Does NOT bind ports
6
- - Only exposes FastAPI `app` for mounting
 
 
 
 
7
  """
8
 
9
  import asyncio
@@ -35,6 +39,7 @@ log = logging.getLogger("twilio")
35
  def P(tag: str, msg: str):
36
  print(f"{tag} {msg}", flush=True)
37
 
 
38
  # ----------------------------
39
  # Env
40
  # ----------------------------
@@ -47,8 +52,9 @@ OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini").strip()
47
  PIPER_BIN = os.getenv("PIPER_BIN", "piper").strip()
48
  PIPER_MODEL_PATH = os.getenv("PIPER_MODEL_PATH", "").strip()
49
 
 
50
  # ----------------------------
51
- # FastAPI (Twilio sub-app)
52
  # ----------------------------
53
  app = FastAPI()
54
  app.add_middleware(
@@ -59,6 +65,7 @@ app.add_middleware(
59
  allow_headers=["*"],
60
  )
61
 
 
62
  # ----------------------------
63
  # Audio / Twilio
64
  # ----------------------------
@@ -67,6 +74,7 @@ INPUT_RATE = 8000
67
  STT_RATE = 16000
68
  BYTES_PER_20MS_MULAW = int(INPUT_RATE * (FRAME_MS / 1000.0)) # 160 bytes @ 8kHz, 20ms
69
 
 
70
  # ----------------------------
71
  # VAD settings
72
  # ----------------------------
@@ -76,6 +84,16 @@ SPEECH_END_SILENCE_FRAMES = 40 # 800ms
76
  MAX_UTTERANCE_MS = 12000
77
  PARTIAL_EMIT_EVERY_MS = 250
78
 
 
 
 
 
 
 
 
 
 
 
79
  # ----------------------------
80
  # LLM prompt
81
  # ----------------------------
@@ -85,22 +103,17 @@ SYSTEM_PROMPT = (
85
  "No filler. No greetings unless user greets first."
86
  )
87
 
88
- LAST_STATE = {
89
- "connected": False,
90
- "last_stt": "",
91
- "last_llm": "",
92
- "last_tts": "",
93
- "updated_ms": 0,
94
- }
95
 
96
  # ----------------------------
97
  # Cached Vosk model
98
  # ----------------------------
99
  _VOSK_MODEL = None
100
 
 
101
  def now_ms() -> int:
102
  return int(time.time() * 1000)
103
 
 
104
  def build_twiml(stream_url: str) -> str:
105
  return f"""<?xml version="1.0" encoding="UTF-8"?>
106
  <Response>
@@ -111,6 +124,7 @@ def build_twiml(stream_url: str) -> str:
111
  </Response>
112
  """
113
 
 
114
  def split_mulaw_frames(mulaw_bytes: bytes) -> List[bytes]:
115
  frames = []
116
  for i in range(0, len(mulaw_bytes), BYTES_PER_20MS_MULAW):
@@ -120,6 +134,7 @@ def split_mulaw_frames(mulaw_bytes: bytes) -> List[bytes]:
120
  frames.append(chunk)
121
  return frames
122
 
 
123
  async def drain_queue(q: asyncio.Queue):
124
  try:
125
  while True:
@@ -128,6 +143,7 @@ async def drain_queue(q: asyncio.Queue):
128
  except asyncio.QueueEmpty:
129
  return
130
 
 
131
  # ----------------------------
132
  # OpenAI
133
  # ----------------------------
@@ -136,6 +152,7 @@ def openai_client() -> OpenAI:
136
  raise RuntimeError("OPENAI_API_KEY not set")
137
  return OpenAI(api_key=OPENAI_API_KEY)
138
 
 
139
  def openai_answer_blocking(history: List[Dict], user_text: str) -> str:
140
  client = openai_client()
141
  msgs = [{"role": "system", "content": SYSTEM_PROMPT}]
@@ -151,6 +168,7 @@ def openai_answer_blocking(history: List[Dict], user_text: str) -> str:
151
  )
152
  return (resp.choices[0].message.content or "").strip()
153
 
 
154
  # ----------------------------
155
  # Piper TTS -> 8k mulaw
156
  # ----------------------------
@@ -202,6 +220,7 @@ def piper_tts_to_mulaw(text: str) -> bytes:
202
  except Exception:
203
  pass
204
 
 
205
  # ----------------------------
206
  # Call state
207
  # ----------------------------
@@ -211,6 +230,7 @@ class CancelFlag:
211
  def set(self):
212
  self.is_set = True
213
 
 
214
  @dataclass
215
  class CallState:
216
  call_id: str
@@ -242,6 +262,7 @@ class CallState:
242
  self.tts_generation_id += 1
243
  return self.tts_generation_id
244
 
 
245
  # ----------------------------
246
  # Keepalive marks
247
  # ----------------------------
@@ -263,35 +284,43 @@ async def twilio_keepalive(ws: WebSocket, st: CallState):
263
  except Exception as e:
264
  P("SYS>", f"keepalive_error={e}")
265
 
 
266
  # ----------------------------
267
- # HTTP
268
  # ----------------------------
269
  @app.get("/health")
270
  async def health():
271
  return {"ok": True}
272
 
 
273
  @app.post("/voice")
274
  async def voice(request: Request):
275
  stream_url = TWILIO_STREAM_URL
276
  if not stream_url:
277
  host = request.headers.get("host")
278
  if host:
 
279
  stream_url = f"wss://{host}/twilio/stream"
280
  P("SYS>", f"auto_stream_url={stream_url}")
 
281
  if not stream_url:
282
  return PlainTextResponse("TWILIO_STREAM_URL not set and host not found", status_code=500)
 
283
  return Response(content=build_twiml(stream_url), media_type="application/xml")
284
 
 
285
  @app.get("/voice")
286
  async def voice_get(request: Request):
287
  return await voice(request)
288
 
 
289
  @app.get("/debug/last")
290
  async def debug_last():
291
  return LAST_STATE
292
 
 
293
  # ----------------------------
294
- # WebSocket
295
  # ----------------------------
296
  @app.websocket("/stream")
297
  async def stream(ws: WebSocket):
@@ -355,6 +384,7 @@ async def stream(ws: WebSocket):
355
  st.outbound_task.cancel()
356
  P("SYS>", "ws_closed")
357
 
 
358
  # ----------------------------
359
  # VAD + STT
360
  # ----------------------------
@@ -403,6 +433,7 @@ async def vad_and_stt(ws: WebSocket, st: CallState, pcm16_16k: bytes, is_speech:
403
  if st.silence_count >= SPEECH_END_SILENCE_FRAMES:
404
  await finalize_utterance(ws, st, f"vad_silence_{SPEECH_END_SILENCE_FRAMES*FRAME_MS}ms")
405
 
 
406
  async def finalize_utterance(ws: WebSocket, st: CallState, reason: str):
407
  if not st.in_speech:
408
  return
@@ -429,19 +460,18 @@ async def finalize_utterance(ws: WebSocket, st: CallState, reason: str):
429
 
430
  asyncio.create_task(bot_job())
431
 
 
432
  # ----------------------------
433
- # LLM Answer -> Speak
434
  # ----------------------------
435
  async def answer_and_speak(ws: WebSocket, st: CallState, user_text: str):
436
- st.cancel_llm = CancelFlag(False)
437
-
438
  st.history.append({"role": "user", "content": user_text})
439
  st.history = st.history[:1] + st.history[-8:]
440
 
441
  loop = asyncio.get_running_loop()
442
  ans = await loop.run_in_executor(None, openai_answer_blocking, st.history, user_text)
443
-
444
  ans = (ans or "").strip() or "Sorry, I didn’t catch that."
 
445
  P("LLM_ANS>", ans)
446
 
447
  st.history.append({"role": "assistant", "content": ans})
@@ -449,11 +479,11 @@ async def answer_and_speak(ws: WebSocket, st: CallState, user_text: str):
449
 
450
  await speak_text(ws, st, ans)
451
 
 
452
  # ----------------------------
453
  # Barge-in
454
  # ----------------------------
455
  async def barge_in(ws: WebSocket, st: CallState):
456
- st.cancel_llm.set()
457
  st.bump_tts_generation()
458
 
459
  if st.stream_sid:
@@ -466,6 +496,7 @@ async def barge_in(ws: WebSocket, st: CallState):
466
  await drain_queue(st.outbound_q)
467
  st.bot_speaking = False
468
 
 
469
  # ----------------------------
470
  # Speak / TTS
471
  # ----------------------------
@@ -482,6 +513,7 @@ async def speak_text(ws: WebSocket, st: CallState, text: str):
482
  await drain_queue(st.outbound_q)
483
  await tts_enqueue(st, text, gen)
484
 
 
485
  async def tts_enqueue(st: CallState, text: str, gen: int):
486
  st.bot_speaking = True
487
  P("TTS>", f"text={text} gen={gen}")
@@ -506,6 +538,7 @@ async def tts_enqueue(st: CallState, text: str, gen: int):
506
 
507
  await st.outbound_q.put("__END_CHUNK__")
508
 
 
509
  async def outbound_sender(ws: WebSocket, st: CallState):
510
  try:
511
  while True:
 
1
  """
2
  Twilio Media Streams (bidirectional) + Vosk + OpenAI Answer + Piper -> Twilio playback
3
+
4
+ Spaces-safe changes:
5
+ - NO uvicorn.run()
6
+ - NO port binding
7
+ - Routes are RELATIVE because this app is mounted at /twilio by app.py
8
+ So:
9
+ POST /voice => /twilio/voice
10
+ WS /stream => /twilio/stream
11
  """
12
 
13
  import asyncio
 
39
def P(tag: str, msg: str):
    """Emit one tagged log line, flushed immediately so it shows up in real time."""
    line = f"{tag} {msg}"
    print(line, flush=True)
41
 
42
+
43
  # ----------------------------
44
  # Env
45
  # ----------------------------
 
52
  PIPER_BIN = os.getenv("PIPER_BIN", "piper").strip()
53
  PIPER_MODEL_PATH = os.getenv("PIPER_MODEL_PATH", "").strip()
54
 
55
+
56
  # ----------------------------
57
+ # FastAPI (this is a sub-app)
58
  # ----------------------------
59
  app = FastAPI()
60
  app.add_middleware(
 
65
  allow_headers=["*"],
66
  )
67
 
68
+
69
  # ----------------------------
70
  # Audio / Twilio
71
  # ----------------------------
 
74
  STT_RATE = 16000
75
  BYTES_PER_20MS_MULAW = int(INPUT_RATE * (FRAME_MS / 1000.0)) # 160 bytes @ 8kHz, 20ms
76
 
77
+
78
  # ----------------------------
79
  # VAD settings
80
  # ----------------------------
 
84
  MAX_UTTERANCE_MS = 12000
85
  PARTIAL_EMIT_EVERY_MS = 250
86
 
87
+
88
# Most recent pipeline snapshot, exposed read-only via GET /debug/last.
# NOTE(review): the writers are not visible in this view — presumably the
# STT/LLM/TTS stages update these fields during a call; confirm against the
# stream handler before relying on field meanings.
LAST_STATE = {
    "connected": False,  # presumably: a Twilio media-stream WS is attached — TODO confirm
    "last_stt": "",      # presumably: last recognized user utterance — TODO confirm
    "last_llm": "",      # presumably: last LLM answer text — TODO confirm
    "last_tts": "",      # presumably: last text handed to Piper TTS — TODO confirm
    "updated_ms": 0,     # presumably: epoch-ms of last update (see now_ms) — TODO confirm
}
95
+
96
+
97
  # ----------------------------
98
  # LLM prompt
99
  # ----------------------------
 
103
  "No filler. No greetings unless user greets first."
104
  )
105
 
 
 
 
 
 
 
 
106
 
107
  # ----------------------------
108
  # Cached Vosk model
109
  # ----------------------------
110
  _VOSK_MODEL = None
111
 
112
+
113
def now_ms() -> int:
    """Return the current Unix time as whole milliseconds (truncated, not rounded)."""
    seconds = time.time()
    return int(seconds * 1000)
115
 
116
+
117
  def build_twiml(stream_url: str) -> str:
118
  return f"""<?xml version="1.0" encoding="UTF-8"?>
119
  <Response>
 
124
  </Response>
125
  """
126
 
127
+
128
  def split_mulaw_frames(mulaw_bytes: bytes) -> List[bytes]:
129
  frames = []
130
  for i in range(0, len(mulaw_bytes), BYTES_PER_20MS_MULAW):
 
134
  frames.append(chunk)
135
  return frames
136
 
137
+
138
  async def drain_queue(q: asyncio.Queue):
139
  try:
140
  while True:
 
143
  except asyncio.QueueEmpty:
144
  return
145
 
146
+
147
  # ----------------------------
148
  # OpenAI
149
  # ----------------------------
 
152
  raise RuntimeError("OPENAI_API_KEY not set")
153
  return OpenAI(api_key=OPENAI_API_KEY)
154
 
155
+
156
  def openai_answer_blocking(history: List[Dict], user_text: str) -> str:
157
  client = openai_client()
158
  msgs = [{"role": "system", "content": SYSTEM_PROMPT}]
 
168
  )
169
  return (resp.choices[0].message.content or "").strip()
170
 
171
+
172
  # ----------------------------
173
  # Piper TTS -> 8k mulaw
174
  # ----------------------------
 
220
  except Exception:
221
  pass
222
 
223
+
224
  # ----------------------------
225
  # Call state
226
  # ----------------------------
 
230
  def set(self):
231
  self.is_set = True
232
 
233
+
234
  @dataclass
235
  class CallState:
236
  call_id: str
 
262
  self.tts_generation_id += 1
263
  return self.tts_generation_id
264
 
265
+
266
  # ----------------------------
267
  # Keepalive marks
268
  # ----------------------------
 
284
  except Exception as e:
285
  P("SYS>", f"keepalive_error={e}")
286
 
287
+
288
  # ----------------------------
289
+ # HTTP (RELATIVE ROUTES)
290
  # ----------------------------
291
@app.get("/health")
async def health():
    """Liveness probe for the mounted Twilio sub-app; always reports healthy."""
    status = {"ok": True}
    return status
294
 
295
+
296
@app.post("/voice")
async def voice(request: Request):
    """Twilio voice webhook: answer with TwiML that opens our media stream.

    The stream URL comes from TWILIO_STREAM_URL when configured; otherwise
    it is derived from the request's Host header. Returns a 500 plain-text
    response when neither source yields a URL.
    """
    stream_url = TWILIO_STREAM_URL
    if not stream_url:
        host = request.headers.get("host")
        if not host:
            # No env override and no Host header: nothing to build a URL from.
            return PlainTextResponse("TWILIO_STREAM_URL not set and host not found", status_code=500)
        # IMPORTANT: mounted at /twilio in app.py
        stream_url = f"wss://{host}/twilio/stream"
        P("SYS>", f"auto_stream_url={stream_url}")
    return Response(content=build_twiml(stream_url), media_type="application/xml")
310
 
311
+
312
@app.get("/voice")
async def voice_get(request: Request):
    """GET alias for the Twilio webhook; delegates to the POST /voice handler."""
    twiml_response = await voice(request)
    return twiml_response
315
 
316
+
317
@app.get("/debug/last")
async def debug_last():
    """Return the module-level LAST_STATE dict as JSON for quick debugging.

    NOTE(review): this exposes the live mutable dict — callers see whatever
    the pipeline last wrote into it.
    """
    return LAST_STATE
320
 
321
+
322
  # ----------------------------
323
+ # WebSocket (RELATIVE ROUTE)
324
  # ----------------------------
325
  @app.websocket("/stream")
326
  async def stream(ws: WebSocket):
 
384
  st.outbound_task.cancel()
385
  P("SYS>", "ws_closed")
386
 
387
+
388
  # ----------------------------
389
  # VAD + STT
390
  # ----------------------------
 
433
  if st.silence_count >= SPEECH_END_SILENCE_FRAMES:
434
  await finalize_utterance(ws, st, f"vad_silence_{SPEECH_END_SILENCE_FRAMES*FRAME_MS}ms")
435
 
436
+
437
  async def finalize_utterance(ws: WebSocket, st: CallState, reason: str):
438
  if not st.in_speech:
439
  return
 
460
 
461
  asyncio.create_task(bot_job())
462
 
463
+
464
  # ----------------------------
465
+ # LLM -> Speak
466
  # ----------------------------
467
  async def answer_and_speak(ws: WebSocket, st: CallState, user_text: str):
 
 
468
  st.history.append({"role": "user", "content": user_text})
469
  st.history = st.history[:1] + st.history[-8:]
470
 
471
  loop = asyncio.get_running_loop()
472
  ans = await loop.run_in_executor(None, openai_answer_blocking, st.history, user_text)
 
473
  ans = (ans or "").strip() or "Sorry, I didn’t catch that."
474
+
475
  P("LLM_ANS>", ans)
476
 
477
  st.history.append({"role": "assistant", "content": ans})
 
479
 
480
  await speak_text(ws, st, ans)
481
 
482
+
483
  # ----------------------------
484
  # Barge-in
485
  # ----------------------------
486
  async def barge_in(ws: WebSocket, st: CallState):
 
487
  st.bump_tts_generation()
488
 
489
  if st.stream_sid:
 
496
  await drain_queue(st.outbound_q)
497
  st.bot_speaking = False
498
 
499
+
500
  # ----------------------------
501
  # Speak / TTS
502
  # ----------------------------
 
513
  await drain_queue(st.outbound_q)
514
  await tts_enqueue(st, text, gen)
515
 
516
+
517
  async def tts_enqueue(st: CallState, text: str, gen: int):
518
  st.bot_speaking = True
519
  P("TTS>", f"text={text} gen={gen}")
 
538
 
539
  await st.outbound_q.put("__END_CHUNK__")
540
 
541
+
542
  async def outbound_sender(ws: WebSocket, st: CallState):
543
  try:
544
  while True: