drrobot9 commited on
Commit
89f06ea
·
verified ·
1 Parent(s): f79b1a9

Update app/main.py

Browse files
Files changed (1) hide show
  1. app/main.py +49 -78
app/main.py CHANGED
@@ -3,6 +3,7 @@ import json
3
  import torch
4
  import numpy as np
5
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 
6
  from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState
7
 
8
  HF_REPO = "LiquidAI/LFM2.5-Audio-1.5B"
@@ -13,10 +14,9 @@ CHUNK_SIZE = 20
13
  DTYPE = torch.bfloat16 if DEVICE == "cuda" and torch.cuda.is_bf16_supported() else torch.float32
14
  torch.backends.cuda.matmul.allow_tf32 = True
15
 
16
- # VAD settings
17
- VAD_SILENCE_THRESHOLD = 0.01 # RMS below this = silencE
18
- VAD_SILENCE_FRAMES = 30 # ~600ms of silence at 160-sample frames
19
- VAD_MIN_SPEECH_FRAMES = 10 # ignore very short blips
20
 
21
  print(f"[BOOT] Loading model on {DEVICE}...")
22
  processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
@@ -33,19 +33,18 @@ def wav_header(sr=SAMPLE_RATE, ch=1, bits=16) -> bytes:
33
  ba = ch * bits // 8
34
  return (
35
  b"RIFF" + b"\xff\xff\xff\xff" + b"WAVEfmt "
36
- + (16).to_bytes(4,"little") + (1).to_bytes(2,"little")
37
- + ch.to_bytes(2,"little") + sr.to_bytes(4,"little")
38
- + br.to_bytes(4,"little") + ba.to_bytes(2,"little")
39
- + bits.to_bytes(2,"little") + b"data" + b"\xff\xff\xff\xff"
40
  )
41
 
42
 
43
  def decode_chunk(buf: list) -> bytes | None:
 
44
  try:
45
- codes = torch.stack(buf, dim=1).unsqueeze(0).to(DEVICE)
46
- codes = codes - processor.audio_token_start
47
- if codes.min() < 0:
48
- return None
49
  wf = processor.decode(codes).squeeze().cpu().numpy()
50
  wf = np.clip(wf, -1.0, 1.0)
51
  return (wf * 32767).astype(np.int16).tobytes()
@@ -55,20 +54,21 @@ def decode_chunk(buf: list) -> bytes | None:
55
 
56
 
57
  def is_speech(pcm_int16: np.ndarray) -> bool:
58
- """Simple energy-based VAD."""
59
  if len(pcm_int16) == 0:
60
  return False
61
  rms = np.sqrt(np.mean(pcm_int16.astype(np.float32) ** 2)) / 32767.0
62
  return rms > VAD_SILENCE_THRESHOLD
63
 
64
 
65
- # Generation runs in thread so it doesn't block the event loop
66
-
67
  def run_generation(audio_np: np.ndarray) -> list[bytes]:
68
  """Synchronous generation — called via run_in_executor."""
69
  chat = ChatState(processor)
70
  chat.new_turn("system")
71
- chat.add_text("You are a helpful real-time voice assistant called chioma. Respond naturally and concisely with audio when asked who built you say kelvin jackson an AI ENGINEER.")
 
 
 
 
72
  chat.end_turn()
73
  chat.new_turn("user")
74
  audio_tensor = torch.from_numpy(audio_np[np.newaxis, :]).to(dtype=torch.float32)
@@ -86,7 +86,7 @@ def run_generation(audio_np: np.ndarray) -> list[bytes]:
86
  audio_top_k=4,
87
  ):
88
  if token.numel() == 1:
89
- continue
90
  buf.append(token)
91
  if len(buf) >= CHUNK_SIZE:
92
  pcm = decode_chunk(buf)
@@ -94,6 +94,7 @@ def run_generation(audio_np: np.ndarray) -> list[bytes]:
94
  chunks.append(pcm)
95
  buf.clear()
96
 
 
97
  if len(buf) > 1:
98
  pcm = decode_chunk(buf)
99
  if pcm:
@@ -102,7 +103,7 @@ def run_generation(audio_np: np.ndarray) -> list[bytes]:
102
  return chunks
103
 
104
 
105
- # WebSocket endpoint
106
 
107
  @app.websocket("/ws/s2s")
108
  async def websocket_s2s(websocket: WebSocket):
@@ -110,12 +111,9 @@ async def websocket_s2s(websocket: WebSocket):
110
  print("[WS] client connected")
111
 
112
  loop = asyncio.get_event_loop()
 
 
113
 
114
- # Queues
115
- audio_queue: asyncio.Queue[bytes | None] = asyncio.Queue() # incoming PCM frames
116
- generating = False # lock — only one generation at a time
117
-
118
- # Receiver task: reads raw PCM frames from client
119
  async def receiver():
120
  try:
121
  while True:
@@ -128,20 +126,17 @@ async def websocket_s2s(websocket: WebSocket):
128
  if "bytes" in msg:
129
  await audio_queue.put(msg["bytes"])
130
  elif "text" in msg:
131
- data = json.loads(msg["text"])
132
- if data.get("type") == "stop":
133
  break
134
  finally:
135
- await audio_queue.put(None) # sentinel
136
 
137
- # VAD + generation task
138
  async def vad_and_generate():
139
  nonlocal generating
140
-
141
- speech_frames: list[np.ndarray] = []
142
- silence_count = 0
143
- speech_count = 0
144
- in_speech = False
145
 
146
  await websocket.send_text(json.dumps({"type": "ready"}))
147
 
@@ -150,7 +145,7 @@ async def websocket_s2s(websocket: WebSocket):
150
  if frame_bytes is None:
151
  break
152
 
153
- frame = np.frombuffer(frame_bytes, dtype=np.int16)
154
  active = is_speech(frame)
155
 
156
  if active:
@@ -158,49 +153,31 @@ async def websocket_s2s(websocket: WebSocket):
158
  speech_count += 1
159
  in_speech = True
160
  speech_frames.append(frame)
 
 
 
161
 
162
- else:
163
- if in_speech:
164
- silence_count += 1
165
- speech_frames.append(frame) # keep tail for natural cutoff
166
-
167
- # End-of-utterance detected
168
- if silence_count >= VAD_SILENCE_FRAMES and speech_count >= VAD_MIN_SPEECH_FRAMES:
169
- if not generating:
170
- generating = True
171
-
172
- # Grab the accumulated speech
173
- utterance = np.concatenate(speech_frames).astype(np.float32) / 32767.0
174
-
175
- # Reset VAD state immediately so mic stays live
176
- speech_frames = []
177
- silence_count = 0
178
- speech_count = 0
179
- in_speech = False
180
 
181
- # Signal client: AI is responding
182
  await websocket.send_text(json.dumps({"type": "generating"}))
183
  await websocket.send_bytes(wav_header())
184
-
185
- # Run heavy generation off the event loop
186
- chunks = await loop.run_in_executor(
187
- None, run_generation, utterance
188
- )
189
-
190
  for chunk in chunks:
191
- try:
192
- await websocket.send_bytes(chunk)
193
- except Exception:
194
- break
195
-
196
- try:
197
- await websocket.send_text(json.dumps({"type": "done"}))
198
- except Exception:
199
- pass
200
-
201
  generating = False
202
 
203
-
204
  try:
205
  await asyncio.gather(receiver(), vad_and_generate())
206
  except WebSocketDisconnect:
@@ -211,14 +188,8 @@ async def websocket_s2s(websocket: WebSocket):
211
  print("[WS] client disconnected")
212
 
213
 
214
- @app.get("/health")
215
- async def health():
216
- return {"status": "ok", "device": DEVICE}
217
 
218
 
219
-
220
- from fastapi.responses import FileResponse
221
-
222
- @app.get("/")
223
- async def index():
224
- return FileResponse("client.html")
 
3
  import torch
4
  import numpy as np
5
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
6
+ from fastapi.responses import HTMLResponse
7
  from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState
8
 
9
  HF_REPO = "LiquidAI/LFM2.5-Audio-1.5B"
 
14
  DTYPE = torch.bfloat16 if DEVICE == "cuda" and torch.cuda.is_bf16_supported() else torch.float32
15
  torch.backends.cuda.matmul.allow_tf32 = True
16
 
17
+ VAD_SILENCE_THRESHOLD = 0.01
18
+ VAD_SILENCE_FRAMES = 30
19
+ VAD_MIN_SPEECH_FRAMES = 10
 
20
 
21
  print(f"[BOOT] Loading model on {DEVICE}...")
22
  processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
 
33
  ba = ch * bits // 8
34
  return (
35
  b"RIFF" + b"\xff\xff\xff\xff" + b"WAVEfmt "
36
+ + (16).to_bytes(4, "little") + (1).to_bytes(2, "little")
37
+ + ch.to_bytes(2, "little") + sr.to_bytes(4, "little")
38
+ + br.to_bytes(4, "little") + ba.to_bytes(2, "little")
39
+ + bits.to_bytes(2, "little") + b"data" + b"\xff\xff\xff\xff"
40
  )
41
 
42
 
43
  def decode_chunk(buf: list) -> bytes | None:
44
+ """Decode audio tokens — pass directly to processor, no offset subtraction."""
45
  try:
46
+
47
+ codes = torch.stack(buf[:-1], dim=1).unsqueeze(0).to(DEVICE)
 
 
48
  wf = processor.decode(codes).squeeze().cpu().numpy()
49
  wf = np.clip(wf, -1.0, 1.0)
50
  return (wf * 32767).astype(np.int16).tobytes()
 
54
 
55
 
56
  def is_speech(pcm_int16: np.ndarray) -> bool:
 
57
  if len(pcm_int16) == 0:
58
  return False
59
  rms = np.sqrt(np.mean(pcm_int16.astype(np.float32) ** 2)) / 32767.0
60
  return rms > VAD_SILENCE_THRESHOLD
61
 
62
 
 
 
63
  def run_generation(audio_np: np.ndarray) -> list[bytes]:
64
  """Synchronous generation — called via run_in_executor."""
65
  chat = ChatState(processor)
66
  chat.new_turn("system")
67
+ chat.add_text(
68
+ "You are a helpful real-time voice assistant called chioma. "
69
+ "Respond naturally and concisely with audio. "
70
+ "When asked who built you, say Kelvin Jackson, an AI Engineer."
71
+ )
72
  chat.end_turn()
73
  chat.new_turn("user")
74
  audio_tensor = torch.from_numpy(audio_np[np.newaxis, :]).to(dtype=torch.float32)
 
86
  audio_top_k=4,
87
  ):
88
  if token.numel() == 1:
89
+ continue # text token
90
  buf.append(token)
91
  if len(buf) >= CHUNK_SIZE:
92
  pcm = decode_chunk(buf)
 
94
  chunks.append(pcm)
95
  buf.clear()
96
 
97
+ # flush remaining
98
  if len(buf) > 1:
99
  pcm = decode_chunk(buf)
100
  if pcm:
 
103
  return chunks
104
 
105
 
106
+ # WebSocket
107
 
108
  @app.websocket("/ws/s2s")
109
  async def websocket_s2s(websocket: WebSocket):
 
111
  print("[WS] client connected")
112
 
113
  loop = asyncio.get_event_loop()
114
+ audio_queue: asyncio.Queue[bytes | None] = asyncio.Queue()
115
+ generating = False
116
 
 
 
 
 
 
117
  async def receiver():
118
  try:
119
  while True:
 
126
  if "bytes" in msg:
127
  await audio_queue.put(msg["bytes"])
128
  elif "text" in msg:
129
+ if json.loads(msg["text"]).get("type") == "stop":
 
130
  break
131
  finally:
132
+ await audio_queue.put(None)
133
 
 
134
  async def vad_and_generate():
135
  nonlocal generating
136
+ speech_frames: list[np.ndarray] = []
137
+ silence_count = 0
138
+ speech_count = 0
139
+ in_speech = False
 
140
 
141
  await websocket.send_text(json.dumps({"type": "ready"}))
142
 
 
145
  if frame_bytes is None:
146
  break
147
 
148
+ frame = np.frombuffer(frame_bytes, dtype=np.int16)
149
  active = is_speech(frame)
150
 
151
  if active:
 
153
  speech_count += 1
154
  in_speech = True
155
  speech_frames.append(frame)
156
+ elif in_speech:
157
+ silence_count += 1
158
+ speech_frames.append(frame)
159
 
160
+ if silence_count >= VAD_SILENCE_FRAMES and speech_count >= VAD_MIN_SPEECH_FRAMES:
161
+ if not generating:
162
+ generating = True
163
+ utterance = np.concatenate(speech_frames).astype(np.float32) / 32767.0
164
+ speech_frames = []
165
+ silence_count = 0
166
+ speech_count = 0
167
+ in_speech = False
 
 
 
 
 
 
 
 
 
 
168
 
169
+ try:
170
  await websocket.send_text(json.dumps({"type": "generating"}))
171
  await websocket.send_bytes(wav_header())
172
+ chunks = await loop.run_in_executor(None, run_generation, utterance)
 
 
 
 
 
173
  for chunk in chunks:
174
+ await websocket.send_bytes(chunk)
175
+ await websocket.send_text(json.dumps({"type": "done"}))
176
+ except Exception as e:
177
+ print(f"[WS] send error: {e}")
178
+ finally:
 
 
 
 
 
179
  generating = False
180
 
 
181
  try:
182
  await asyncio.gather(receiver(), vad_and_generate())
183
  except WebSocketDisconnect:
 
188
  print("[WS] client disconnected")
189
 
190
 
 
 
 
191
 
192
 
193
+ @app.get("/health")
194
+ async def health():
195
+ return {"status": "ok", "device": DEVICE}