Spaces:

Remostartdev
/

STREAM_TTS

Paused

App Files Files Community

drrobot9 committed on Mar 11

Commit

89f06ea

verified ·

1 Parent(s): f79b1a9

Update app/main.py

Browse files

Files changed (1) hide show

app/main.py +49 -78

app/main.py CHANGED Viewed

@@ -3,6 +3,7 @@ import json
 import torch
 import numpy as np
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState
 HF_REPO = "LiquidAI/LFM2.5-Audio-1.5B"
@@ -13,10 +14,9 @@ CHUNK_SIZE = 20
 DTYPE = torch.bfloat16 if DEVICE == "cuda" and torch.cuda.is_bf16_supported() else torch.float32
 torch.backends.cuda.matmul.allow_tf32 = True
-# VAD settings
-VAD_SILENCE_THRESHOLD = 0.01      # RMS below this = silence
-VAD_SILENCE_FRAMES    = 30        # ~600ms of silence at 160-sample frames
-VAD_MIN_SPEECH_FRAMES = 10        # ignore very short blips
 print(f"[BOOT] Loading model on {DEVICE}...")
 processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
@@ -33,19 +33,18 @@ def wav_header(sr=SAMPLE_RATE, ch=1, bits=16) -> bytes:
     ba = ch * bits // 8
     return (
         b"RIFF" + b"\xff\xff\xff\xff" + b"WAVEfmt "
-        + (16).to_bytes(4,"little") + (1).to_bytes(2,"little")
-        + ch.to_bytes(2,"little")   + sr.to_bytes(4,"little")
-        + br.to_bytes(4,"little")   + ba.to_bytes(2,"little")
-        + bits.to_bytes(2,"little") + b"data" + b"\xff\xff\xff\xff"
     )
 def decode_chunk(buf: list) -> bytes | None:
     try:
-        codes = torch.stack(buf, dim=1).unsqueeze(0).to(DEVICE)
-        codes = codes - processor.audio_token_start
-        if codes.min() < 0:
-            return None
         wf = processor.decode(codes).squeeze().cpu().numpy()
         wf = np.clip(wf, -1.0, 1.0)
         return (wf * 32767).astype(np.int16).tobytes()
@@ -55,20 +54,21 @@ def decode_chunk(buf: list) -> bytes | None:
 def is_speech(pcm_int16: np.ndarray) -> bool:
-    """Simple energy-based VAD."""
     if len(pcm_int16) == 0:
         return False
     rms = np.sqrt(np.mean(pcm_int16.astype(np.float32) ** 2)) / 32767.0
     return rms > VAD_SILENCE_THRESHOLD
-#  Generation runs in thread so it doesn't block the event loop
 def run_generation(audio_np: np.ndarray) -> list[bytes]:
     """Synchronous generation — called via run_in_executor."""
     chat = ChatState(processor)
     chat.new_turn("system")
-    chat.add_text("You are a helpful real-time voice assistant called chioma. Respond naturally and concisely with audio when asked who built you say kelvin jackson an AI ENGINEER.")
     chat.end_turn()
     chat.new_turn("user")
     audio_tensor = torch.from_numpy(audio_np[np.newaxis, :]).to(dtype=torch.float32)
@@ -86,7 +86,7 @@ def run_generation(audio_np: np.ndarray) -> list[bytes]:
             audio_top_k=4,
         ):
             if token.numel() == 1:
-                continue
             buf.append(token)
             if len(buf) >= CHUNK_SIZE:
                 pcm = decode_chunk(buf)
@@ -94,6 +94,7 @@ def run_generation(audio_np: np.ndarray) -> list[bytes]:
                     chunks.append(pcm)
                 buf.clear()
     if len(buf) > 1:
         pcm = decode_chunk(buf)
         if pcm:
@@ -102,7 +103,7 @@ def run_generation(audio_np: np.ndarray) -> list[bytes]:
     return chunks
-# WebSocket endpoint
 @app.websocket("/ws/s2s")
 async def websocket_s2s(websocket: WebSocket):
@@ -110,12 +111,9 @@ async def websocket_s2s(websocket: WebSocket):
     print("[WS] client connected")
     loop = asyncio.get_event_loop()
-    # Queues
-    audio_queue: asyncio.Queue[bytes | None] = asyncio.Queue()  # incoming PCM frames
-    generating = False   # lock — only one generation at a time
-    #  Receiver task: reads raw PCM frames from client
     async def receiver():
         try:
             while True:
@@ -128,20 +126,17 @@ async def websocket_s2s(websocket: WebSocket):
                 if "bytes" in msg:
                     await audio_queue.put(msg["bytes"])
                 elif "text" in msg:
-                    data = json.loads(msg["text"])
-                    if data.get("type") == "stop":
                         break
         finally:
-            await audio_queue.put(None)  # sentinel
-    #  VAD + generation task
     async def vad_and_generate():
         nonlocal generating
-        speech_frames:  list[np.ndarray] = []
-        silence_count   = 0
-        speech_count    = 0
-        in_speech       = False
         await websocket.send_text(json.dumps({"type": "ready"}))
@@ -150,7 +145,7 @@ async def websocket_s2s(websocket: WebSocket):
             if frame_bytes is None:
                 break
-            frame = np.frombuffer(frame_bytes, dtype=np.int16)
             active = is_speech(frame)
             if active:
@@ -158,49 +153,31 @@ async def websocket_s2s(websocket: WebSocket):
                 speech_count += 1
                 in_speech = True
                 speech_frames.append(frame)
-            else:
-                if in_speech:
-                    silence_count += 1
-                    speech_frames.append(frame)  # keep tail for natural cutoff
-                    # End-of-utterance detected
-                    if silence_count >= VAD_SILENCE_FRAMES and speech_count >= VAD_MIN_SPEECH_FRAMES:
-                        if not generating:
-                            generating = True
-                            # Grab the accumulated speech
-                            utterance = np.concatenate(speech_frames).astype(np.float32) / 32767.0
-                            # Reset VAD state immediately so mic stays live
-                            speech_frames  = []
-                            silence_count  = 0
-                            speech_count   = 0
-                            in_speech      = False
-                            # Signal client: AI is responding
                             await websocket.send_text(json.dumps({"type": "generating"}))
                             await websocket.send_bytes(wav_header())
-                            # Run heavy generation off the event loop
-                            chunks = await loop.run_in_executor(
-                                None, run_generation, utterance
-                            )
                             for chunk in chunks:
-                                try:
-                                    await websocket.send_bytes(chunk)
-                                except Exception:
-                                    break
-                            try:
-                                await websocket.send_text(json.dumps({"type": "done"}))
-                            except Exception:
-                                pass
                             generating = False
     try:
         await asyncio.gather(receiver(), vad_and_generate())
     except WebSocketDisconnect:
@@ -211,14 +188,8 @@ async def websocket_s2s(websocket: WebSocket):
         print("[WS] client disconnected")
-@app.get("/health")
-async def health():
-    return {"status": "ok", "device": DEVICE}
-from fastapi.responses import FileResponse
-@app.get("/")
-async def index():
-    return FileResponse("client.html")

 import torch
 import numpy as np
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+from fastapi.responses import HTMLResponse
 from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState
 HF_REPO = "LiquidAI/LFM2.5-Audio-1.5B"
 DTYPE = torch.bfloat16 if DEVICE == "cuda" and torch.cuda.is_bf16_supported() else torch.float32
 torch.backends.cuda.matmul.allow_tf32 = True
+VAD_SILENCE_THRESHOLD = 0.01
+VAD_SILENCE_FRAMES    = 30
+VAD_MIN_SPEECH_FRAMES = 10
 print(f"[BOOT] Loading model on {DEVICE}...")
 processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
     ba = ch * bits // 8
     return (
         b"RIFF" + b"\xff\xff\xff\xff" + b"WAVEfmt "
+        + (16).to_bytes(4, "little") + (1).to_bytes(2, "little")
+        + ch.to_bytes(2, "little")   + sr.to_bytes(4, "little")
+        + br.to_bytes(4, "little")   + ba.to_bytes(2, "little")
+        + bits.to_bytes(2, "little") + b"data" + b"\xff\xff\xff\xff"
     )
 def decode_chunk(buf: list) -> bytes | None:
+    """Decode audio tokens — pass directly to processor, no offset subtraction."""
     try:
+        codes = torch.stack(buf[:-1], dim=1).unsqueeze(0).to(DEVICE)
         wf = processor.decode(codes).squeeze().cpu().numpy()
         wf = np.clip(wf, -1.0, 1.0)
         return (wf * 32767).astype(np.int16).tobytes()
 def is_speech(pcm_int16: np.ndarray) -> bool:
     """Energy gate: report whether a PCM frame is louder than the VAD threshold.

     An empty frame is never speech. Otherwise the frame's root-mean-square is
     computed in float32, divided by 32767.0, and compared against
     VAD_SILENCE_THRESHOLD.
     """
     if len(pcm_int16) == 0:
         return False
     samples = pcm_int16.astype(np.float32)
     mean_power = np.mean(samples ** 2)
     level = np.sqrt(mean_power) / 32767.0
     return level > VAD_SILENCE_THRESHOLD
 def run_generation(audio_np: np.ndarray) -> list[bytes]:
     """Synchronous generation — called via run_in_executor."""
     chat = ChatState(processor)
     chat.new_turn("system")
+    chat.add_text(
+        "You are a helpful real-time voice assistant called chioma. "
+        "Respond naturally and concisely with audio. "
+        "When asked who built you, say Kelvin Jackson, an AI Engineer."
+    )
     chat.end_turn()
     chat.new_turn("user")
     audio_tensor = torch.from_numpy(audio_np[np.newaxis, :]).to(dtype=torch.float32)
             audio_top_k=4,
         ):
             if token.numel() == 1:
+                continue  # text token
             buf.append(token)
             if len(buf) >= CHUNK_SIZE:
                 pcm = decode_chunk(buf)
                     chunks.append(pcm)
                 buf.clear()
+    # flush remaining
     if len(buf) > 1:
         pcm = decode_chunk(buf)
         if pcm:
     return chunks
+# WebSocket
 @app.websocket("/ws/s2s")
 async def websocket_s2s(websocket: WebSocket):
     print("[WS] client connected")
     loop = asyncio.get_event_loop()
+    audio_queue: asyncio.Queue[bytes | None] = asyncio.Queue()
+    generating = False
     async def receiver():
         try:
             while True:
                 if "bytes" in msg:
                     await audio_queue.put(msg["bytes"])
                 elif "text" in msg:
+                    if json.loads(msg["text"]).get("type") == "stop":
                         break
         finally:
+            await audio_queue.put(None)
     async def vad_and_generate():
         nonlocal generating
+        speech_frames: list[np.ndarray] = []
+        silence_count = 0
+        speech_count  = 0
+        in_speech     = False
         await websocket.send_text(json.dumps({"type": "ready"}))
             if frame_bytes is None:
                 break
+            frame  = np.frombuffer(frame_bytes, dtype=np.int16)
             active = is_speech(frame)
             if active:
                 speech_count += 1
                 in_speech = True
                 speech_frames.append(frame)
+            elif in_speech:
+                silence_count += 1
+                speech_frames.append(frame)
+                if silence_count >= VAD_SILENCE_FRAMES and speech_count >= VAD_MIN_SPEECH_FRAMES:
+                    if not generating:
+                        generating = True
+                        utterance = np.concatenate(speech_frames).astype(np.float32) / 32767.0
+                        speech_frames = []
+                        silence_count = 0
+                        speech_count  = 0
+                        in_speech     = False
+                        try:
                             await websocket.send_text(json.dumps({"type": "generating"}))
                             await websocket.send_bytes(wav_header())
+                            chunks = await loop.run_in_executor(None, run_generation, utterance)
                             for chunk in chunks:
+                                await websocket.send_bytes(chunk)
+                            await websocket.send_text(json.dumps({"type": "done"}))
+                        except Exception as e:
+                            print(f"[WS] send error: {e}")
+                        finally:
                             generating = False
     try:
         await asyncio.gather(receiver(), vad_and_generate())
     except WebSocketDisconnect:
         print("[WS] client disconnected")
+@app.get("/health")
+async def health():
+    return {"status": "ok", "device": DEVICE}