Spaces: Runtime error

Update app/main.py

app/main.py CHANGED (+165 -88)
@@ -3,11 +3,7 @@ import json
 import torch
 import numpy as np
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
-from liquid_audio import (
-    LFM2AudioModel,
-    LFM2AudioProcessor,
-    ChatState,
-)
+from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState
 
 HF_REPO = "LiquidAI/LFM2.5-Audio-1.5B"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -17,131 +13,212 @@ CHUNK_SIZE = 20
 DTYPE = torch.bfloat16 if DEVICE == "cuda" and torch.cuda.is_bf16_supported() else torch.float32
 torch.backends.cuda.matmul.allow_tf32 = True
 
-
+# VAD settings
+VAD_SILENCE_THRESHOLD = 0.01  # RMS below this = silence
+VAD_SILENCE_FRAMES = 30       # ~600ms of silence at 160-sample frames
+VAD_MIN_SPEECH_FRAMES = 10    # ignore very short blips
 
+print(f"[BOOT] Loading model on {DEVICE}...")
 processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
-model
-
+model = LFM2AudioModel.from_pretrained(HF_REPO).to(device=DEVICE, dtype=DTYPE).eval()
 print("[BOOT] Model loaded")
 
-app = FastAPI(title="LFM2.5
+app = FastAPI(title="LFM2.5 Real-Time S2S", version="4.0")
+
 
+# Helpers
 
-def wav_header(sr=
-    byte_rate = sr * ch * bits // 8
-    block_align = ch * bits // 8
+def wav_header(sr=SAMPLE_RATE, ch=1, bits=16) -> bytes:
+    br = sr * ch * bits // 8
+    ba = ch * bits // 8
     return (
-        b"RIFF"
-        + b"\xff\xff\xff\xff"
-        + b"WAVEfmt "
-        + (16).to_bytes(4, "little")
-        + (1).to_bytes(2, "little")
-        + ch.to_bytes(2, "little")
-        + sr.to_bytes(4, "little")
-        + byte_rate.to_bytes(4, "little")
-        + block_align.to_bytes(2, "little")
-        + bits.to_bytes(2, "little")
-        + b"data"
-        + b"\xff\xff\xff\xff"
+        b"RIFF" + b"\xff\xff\xff\xff" + b"WAVEfmt "
+        + (16).to_bytes(4, "little") + (1).to_bytes(2, "little")
+        + ch.to_bytes(2, "little") + sr.to_bytes(4, "little")
+        + br.to_bytes(4, "little") + ba.to_bytes(2, "little")
+        + bits.to_bytes(2, "little") + b"data" + b"\xff\xff\xff\xff"
    )
 
 
+def decode_chunk(buf: list) -> bytes | None:
+    try:
+        codes = torch.stack(buf, dim=1).unsqueeze(0).to(DEVICE)
+        codes = codes - processor.audio_token_start
+        if codes.min() < 0:
+            return None
+        wf = processor.decode(codes).squeeze().cpu().numpy()
+        wf = np.clip(wf, -1.0, 1.0)
+        return (wf * 32767).astype(np.int16).tobytes()
+    except Exception as e:
+        print(f"[WARN] decode: {e}")
+        return None
+
+
+def is_speech(pcm_int16: np.ndarray) -> bool:
+    """Simple energy-based VAD."""
+    if len(pcm_int16) == 0:
+        return False
+    rms = np.sqrt(np.mean(pcm_int16.astype(np.float32) ** 2)) / 32767.0
+    return rms > VAD_SILENCE_THRESHOLD
+
+
+# Generation runs in a thread so it doesn't block the event loop
+
+def run_generation(audio_np: np.ndarray) -> list[bytes]:
+    """Synchronous generation, called via run_in_executor."""
-
     chat = ChatState(processor)
-
     chat.new_turn("system")
-    chat.add_text("Respond
+    chat.add_text("You are a helpful real-time voice assistant called Chioma. Respond naturally and concisely with audio. When asked who built you, say Kelvin Jackson, an AI engineer.")
     chat.end_turn()
-
     chat.new_turn("user")
     audio_tensor = torch.from_numpy(audio_np[np.newaxis, :]).to(dtype=torch.float32)
     chat.add_audio(audio_tensor, sampling_rate=SAMPLE_RATE)
     chat.end_turn()
-
     chat.new_turn("assistant")
 
-
-
-    audio_buffer = []
-
+    chunks = []
+    buf = []
     with torch.inference_mode():
         for token in model.generate_interleaved(
             **chat,
-            max_new_tokens=
+            max_new_tokens=2048,
             audio_temperature=0.8,
             audio_top_k=4,
         ):
-            # numel()==1 means text token
            if token.numel() == 1:
                continue
+            buf.append(token)
+            if len(buf) >= CHUNK_SIZE:
+                pcm = decode_chunk(buf)
+                if pcm:
+                    chunks.append(pcm)
+                buf.clear()
 
-
-
-            audio_codes = (
-                torch.stack(audio_buffer, dim=1)
-                .unsqueeze(0)
-                .to(DEVICE)
-            )
-            try:
-                waveform = processor.decode(audio_codes)
-                waveform = waveform.squeeze().cpu().numpy()
-                waveform = np.clip(waveform, -1.0, 1.0)
-                audio_int16 = (waveform * 32767).astype(np.int16)
-                await websocket.send_bytes(audio_int16.tobytes())
-            except Exception as e:
-                print(f"[WARN] decode error: {e}")
-            finally:
-                audio_buffer.clear()
-
-    # flush remaining
-    if len(audio_buffer) > 1:
-        audio_codes = (
-            torch.stack(audio_buffer, dim=1)
-            .unsqueeze(0)
-            .to(DEVICE)
-        )
-        try:
-            waveform = processor.decode(audio_codes)
-            waveform = waveform.squeeze().cpu().numpy()
-            waveform = np.clip(waveform, -1.0, 1.0)
-            audio_int16 = (waveform * 32767).astype(np.int16)
-            await websocket.send_bytes(audio_int16.tobytes())
-        except Exception as e:
-            print(f"[WARN] flush decode error: {e}")
+    if len(buf) > 1:
+        pcm = decode_chunk(buf)
+        if pcm:
+            chunks.append(pcm)
 
-    await websocket.send_text(json.dumps({"type": "done"}))
+    return chunks
 
 
+# WebSocket endpoint
 @app.websocket("/ws/s2s")
 async def websocket_s2s(websocket: WebSocket):
     await websocket.accept()
+    print("[WS] client connected")
 
-
-    audio_bytes = bytearray()
+    loop = asyncio.get_event_loop()
 
-
-
-
-    if "text" in message:
-        payload = json.loads(message["text"])
-
-        if payload["type"] == "start":
-            audio_bytes.clear()
-
-
-
-
-
-
-
-
+    # Queues
+    audio_queue: asyncio.Queue[bytes | None] = asyncio.Queue()  # incoming PCM frames
+    generating = False  # lock: only one generation at a time
 
+    # Receiver task: reads raw PCM frames from client
+    async def receiver():
+        try:
+            while True:
+                try:
+                    msg = await websocket.receive()
+                except RuntimeError:
+                    break
+                if msg.get("type") == "websocket.disconnect":
+                    break
+                if "bytes" in msg:
+                    await audio_queue.put(msg["bytes"])
+                elif "text" in msg:
+                    data = json.loads(msg["text"])
+                    if data.get("type") == "stop":
+                        break
+        finally:
+            await audio_queue.put(None)  # sentinel
+
+    # VAD + generation task
+    async def vad_and_generate():
+        nonlocal generating
+
+        speech_frames: list[np.ndarray] = []
+        silence_count = 0
+        speech_count = 0
+        in_speech = False
+
+        await websocket.send_text(json.dumps({"type": "ready"}))
+
+        while True:
+            frame_bytes = await audio_queue.get()
+            if frame_bytes is None:
+                break
+
+            frame = np.frombuffer(frame_bytes, dtype=np.int16)
+            active = is_speech(frame)
+
+            if active:
+                silence_count = 0
+                speech_count += 1
+                in_speech = True
+                speech_frames.append(frame)
+            else:
+                if in_speech:
+                    silence_count += 1
+                    speech_frames.append(frame)  # keep tail for natural cutoff
+
+            # End-of-utterance detected
+            if silence_count >= VAD_SILENCE_FRAMES and speech_count >= VAD_MIN_SPEECH_FRAMES:
+                if not generating:
+                    generating = True
+
+                    # Grab the accumulated speech
+                    utterance = np.concatenate(speech_frames).astype(np.float32) / 32767.0
+
+                    # Reset VAD state immediately so mic stays live
+                    speech_frames = []
+                    silence_count = 0
+                    speech_count = 0
+                    in_speech = False
+
+                    # Signal client: AI is responding
+                    await websocket.send_text(json.dumps({"type": "generating"}))
+                    await websocket.send_bytes(wav_header())
+
+                    # Run heavy generation off the event loop
+                    chunks = await loop.run_in_executor(
+                        None, run_generation, utterance
+                    )
+
+                    for chunk in chunks:
+                        try:
+                            await websocket.send_bytes(chunk)
+                        except Exception:
+                            break
+
+                    try:
+                        await websocket.send_text(json.dumps({"type": "done"}))
+                    except Exception:
+                        pass
+
+                    generating = False
+
+    try:
+        await asyncio.gather(receiver(), vad_and_generate())
     except WebSocketDisconnect:
+        pass
+    except Exception as e:
+        print(f"[WS] error: {e}")
+    finally:
         print("[WS] client disconnected")
 
 
 @app.get("/health")
 async def health():
-    return {"status": "ok", "device": DEVICE}
+    return {"status": "ok", "device": DEVICE}
+
+
+from fastapi.responses import FileResponse
+
+@app.get("/")
+async def index():
+    return FileResponse("client.html")
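
The rewritten endpoint implies a simple wire contract: connect to /ws/s2s, wait for {"type": "ready"}, stream raw little-endian int16 PCM frames, then read back {"type": "generating"}, a WAV header, PCM audio chunks, and finally {"type": "done"}. Below is a minimal test-client sketch for that contract. The port (7860), the 16 kHz sample rate, the 160-sample frame size, and the websockets dependency are illustrative assumptions, not values confirmed by this diff; match SAMPLE_RATE and the frame size to the constants in app/main.py before using it.

# test_client.py: a minimal sketch of a client for /ws/s2s.
# Assumed values (not confirmed by the commit): local server on port 7860,
# SAMPLE_RATE = 16000, 160-sample int16 PCM frames.
import asyncio
import json

import numpy as np
import websockets  # pip install websockets

SAMPLE_RATE = 16000   # must equal the server's SAMPLE_RATE
FRAME = 160           # one VAD frame, per the comments in app/main.py

async def main():
    async with websockets.connect("ws://localhost:7860/ws/s2s") as ws:
        print(json.loads(await ws.recv()))  # expect {"type": "ready"}

        # One second of noise as fake speech, then one second of silence so
        # the server's energy VAD sees an end of utterance.
        rng = np.random.default_rng(0)
        speech = (rng.normal(0.0, 0.2, SAMPLE_RATE).clip(-1, 1) * 32767).astype(np.int16)
        pcm = np.concatenate([speech, np.zeros(SAMPLE_RATE, dtype=np.int16)])

        for i in range(0, len(pcm), FRAME):
            await ws.send(pcm[i:i + FRAME].tobytes())
            await asyncio.sleep(FRAME / SAMPLE_RATE)  # pace like a live mic

        # Reply stream: "generating" JSON, WAV header bytes, PCM chunks, "done".
        audio = bytearray()
        while True:
            msg = await ws.recv()
            if isinstance(msg, bytes):
                audio.extend(msg)
            else:
                event = json.loads(msg)
                print(event)
                if event.get("type") == "done":
                    break

        with open("reply.wav", "wb") as f:
            f.write(audio)

asyncio.run(main())

Note that wav_header() writes 0xFFFFFFFF placeholder sizes into the RIFF chunks, so the saved file is a "streaming" WAV; most players tolerate the unknown length.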
|