Spaces:

Remostartdev
/

STREAM_TTS

Runtime error

App Files Files Community

drrobot9 committed 21 days ago

Commit

4e7f8bc

verified ·

1 Parent(s): 85a874c

Update app/main.py

Browse files

Files changed (1) hide show

app/main.py +76 -98

app/main.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import asyncio
 import json
 import torch
@@ -11,157 +13,133 @@ from liquid_audio import (
 HF_REPO = "LiquidAI/LFM2.5-Audio-1.5B"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-SAMPLE_RATE = 24_000
-CHUNK_SIZE = 6
-if DEVICE == "cuda" and torch.cuda.is_bf16_supported():
-    DTYPE = torch.bfloat16
-else:
-    DTYPE = torch.float32
 torch.backends.cuda.matmul.allow_tf32 = True
-print(f"[BOOT] Loading model on {DEVICE} with dtype {DTYPE}...")
 processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
-model = LFM2AudioModel.from_pretrained(HF_REPO).to(dtype=DTYPE, device=DEVICE).eval()
-print(f"[BOOT] LFM2.5 Loaded on {DEVICE}")
-app = FastAPI(title="LFM2.5 WebSocket TTS", version="2.0.0")
-def wav_header(sample_rate: int, channels: int = 1, bits: int = 16) -> bytes:
-    byte_rate = sample_rate * channels * bits // 8
-    block_align = channels * bits // 8
     return (
         b"RIFF"
-        + (b"\xff\xff\xff\xff")
         + b"WAVEfmt "
         + (16).to_bytes(4, "little")
         + (1).to_bytes(2, "little")
-        + channels.to_bytes(2, "little")
-        + sample_rate.to_bytes(4, "little")
         + byte_rate.to_bytes(4, "little")
         + block_align.to_bytes(2, "little")
         + bits.to_bytes(2, "little")
         + b"data"
-        + (b"\xff\xff\xff\xff")
     )
-#  Stream core
-async def stream_lfm_tts(websocket: WebSocket, text: str):
     chat = ChatState(processor)
     chat.new_turn("system")
-    chat.add_text("Respond with interleaved text and audio.")
     chat.end_turn()
     chat.new_turn("user")
-    chat.add_text(text)
     chat.end_turn()
     chat.new_turn("assistant")
-    await websocket.send_bytes(wav_header(SAMPLE_RATE))
     audio_buffer = []
-    stop_flag = False
-    async def listen_for_stop():
-        nonlocal stop_flag
-        try:
-            while True:
-                msg = await websocket.receive_text()
-                data = json.loads(msg)
-                if data.get("type") == "stop":
-                    stop_flag = True
-                    break
-        except Exception:
-            stop_flag = True
-    listener_task = asyncio.create_task(listen_for_stop())
-    try:
-        with torch.inference_mode():
-            for token in model.generate_interleaved(
-                **chat,
-                max_new_tokens=4096,
-                audio_temperature=0.8,
-                audio_top_k=4,
-            ):
-                if stop_flag:
-                    break
-                if token.numel() == 1:
-                    continue
                 audio_buffer.append(token)
-                if len(audio_buffer) >= CHUNK_SIZE:
-                    audio_codes = (
-                        torch.stack(audio_buffer, dim=1)
-                        .unsqueeze(0)
-                        .to(DEVICE)
-                    )
-                    waveform = processor.decode(audio_codes)
-                    waveform = waveform.squeeze().cpu().numpy()
-                    waveform = np.clip(waveform, -1.0, 1.0)
-                    audio_int16 = (waveform * 32767.0).astype(np.int16)
-                    await websocket.send_bytes(audio_int16.tobytes())
                     audio_buffer.clear()
-        # flush remaining
-        if not stop_flag and len(audio_buffer) > 1:
-            audio_codes = (
-                torch.stack(audio_buffer[:-1], dim=1)
-                .unsqueeze(0)
-                .to(DEVICE)
-            )
-            waveform = processor.decode(audio_codes)
-            waveform = waveform.squeeze().cpu().numpy()
-            waveform = np.clip(waveform, -1.0, 1.0)
-            audio_int16 = (waveform * 32767.0).astype(np.int16)
-            await websocket.send_bytes(audio_int16.tobytes())
-        await websocket.send_text(json.dumps({"type": "done"}))
-    finally:
-        listener_task.cancel()
-# WebSocket endpoint
-@app.websocket("/ws/tts")
-async def websocket_tts(websocket: WebSocket):
     await websocket.accept()
     try:
         while True:
-            message = await websocket.receive_text()
-            payload = json.loads(message)
-            if payload.get("type") == "start":
-                text = payload.get("text", "").strip()
-                if not text:
-                    await websocket.send_text(json.dumps({
-                        "type": "error",
-                        "message": "Text is empty"
-                    }))
-                    continue
-                await stream_lfm_tts(websocket, text)
-    except WebSocketDisconnect:
-        print("[WS] Client disconnected")
 @app.get("/health")

+# app/main.py
 import asyncio
 import json
 import torch
 HF_REPO = "LiquidAI/LFM2.5-Audio-1.5B"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+SAMPLE_RATE = 24000
+CHUNK_SIZE = 20
+DTYPE = torch.bfloat16 if DEVICE == "cuda" and torch.cuda.is_bf16_supported() else torch.float32
 torch.backends.cuda.matmul.allow_tf32 = True
+print(f"[BOOT] Loading model on {DEVICE}...")
 processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
+model = LFM2AudioModel.from_pretrained(HF_REPO).to(device=DEVICE, dtype=DTYPE).eval()
+print("[BOOT] Model loaded")
+app = FastAPI(title="LFM2.5 Speech-to-Speech", version="3.0")
def wav_header(sr=24000, ch=1, bits=16):
    """Return a 44-byte RIFF/WAVE header describing a PCM stream.

    Args:
        sr: Sample rate in Hz.
        ch: Number of channels.
        bits: Bits per sample.

    Returns:
        The header as ``bytes``. Both length fields (the RIFF chunk size and
        the ``data`` chunk size) are written as ``0xFFFFFFFF`` placeholders
        rather than real lengths.
    """
    bytes_per_second = sr * ch * bits // 8
    bytes_per_frame = ch * bits // 8
    size_placeholder = b"\xff\xff\xff\xff"

    fields = [
        b"RIFF",
        size_placeholder,
        b"WAVEfmt ",
        (16).to_bytes(4, "little"),  # size of the fmt chunk body
        (1).to_bytes(2, "little"),  # format code 1
        ch.to_bytes(2, "little"),
        sr.to_bytes(4, "little"),
        bytes_per_second.to_bytes(4, "little"),
        bytes_per_frame.to_bytes(2, "little"),
        bits.to_bytes(2, "little"),
        b"data",
        size_placeholder,
    ]
    return b"".join(fields)
async def _send_audio_frames(websocket, frames):
    """Decode a batch of audio-code frames and send them as 16-bit PCM bytes.

    A batch that fails to decode is logged and dropped so one bad chunk does
    not abort the whole response.
    """
    audio_codes = torch.stack(frames, dim=1).unsqueeze(0).to(DEVICE)
    try:
        waveform = processor.decode(audio_codes)
    except Exception as exc:  # best-effort: skip this chunk, keep streaming
        print(f"[S2S] audio decode failed, dropping {len(frames)} frames: {exc}")
        return
    # .float() guards against a reduced-precision decoder output, which
    # .numpy() cannot convert -- NOTE(review): confirm the dtype decode() returns.
    waveform = waveform.squeeze().float().cpu().numpy()
    waveform = np.clip(waveform, -1.0, 1.0)
    audio_int16 = (waveform * 32767).astype(np.int16)
    await websocket.send_bytes(audio_int16.tobytes())


async def generate_response(websocket: WebSocket, audio_np: np.ndarray):
    """Generate a spoken reply to the user's audio and stream it over the socket.

    Sends, in order: a WAV header (binary), zero or more binary PCM chunks,
    and finally a ``{"type": "done"}`` text message.

    Args:
        websocket: Accepted connection the reply is written to.
        audio_np: The user's utterance as float samples; it is handed to
            ``chat.add_audio`` together with ``SAMPLE_RATE``.

    Note:
        The generation loop is a plain synchronous ``for``; the event loop
        only gets control back at the ``await`` points where a chunk is sent.
    """
    chat = ChatState(processor)

    chat.new_turn("system")
    chat.add_text("Respond conversationally with audio.")
    chat.end_turn()

    chat.new_turn("user")
    chat.add_audio(audio_np, sample_rate=SAMPLE_RATE)
    chat.end_turn()

    chat.new_turn("assistant")

    await websocket.send_bytes(wav_header())

    audio_buffer = []
    with torch.inference_mode():
        for token in model.generate_interleaved(
            **chat,
            max_new_tokens=4096,
            audio_temperature=0.8,
            audio_top_k=4,
        ):
            # Single-element tokens are skipped; only multi-element tokens
            # are treated as audio frames. Calling .item() on those raises,
            # so they are buffered directly with no scalar range check.
            if token.numel() == 1:
                continue
            audio_buffer.append(token)

            if len(audio_buffer) >= CHUNK_SIZE:
                await _send_audio_frames(websocket, audio_buffer)
                audio_buffer.clear()

        # Flush whatever is left so the tail of the reply is not lost. The
        # last frame is dropped, as the previous version of this handler did
        # -- presumably it is an end-of-audio marker; verify against the
        # model's documentation.
        if len(audio_buffer) > 1:
            await _send_audio_frames(websocket, audio_buffer[:-1])

    await websocket.send_text(json.dumps({"type": "done"}))
@app.websocket("/ws/s2s")
async def websocket_s2s(websocket: WebSocket):
    """Speech-to-speech endpoint.

    Protocol, as implemented here:
      * text ``{"type": "start"}`` clears the buffered audio;
      * binary messages are appended to the buffer and read as int16 samples;
      * text ``{"type": "end"}`` converts the buffer to float32 and streams a
        generated reply via ``generate_response``.

    Malformed JSON or an ``end`` with no audio is answered with a
    ``{"type": "error", ...}`` text message and the connection stays open.
    """
    await websocket.accept()
    audio_bytes = bytearray()
    try:
        while True:
            message = await websocket.receive()

            # receive() hands back the disconnect message instead of raising;
            # calling it again afterwards raises RuntimeError, so stop here.
            if message.get("type") == "websocket.disconnect":
                print("[WS] client disconnected")
                break

            # A key may be present with a None value, so test the value.
            chunk = message.get("bytes")
            if chunk is not None:
                audio_bytes.extend(chunk)
                continue

            text = message.get("text")
            if text is None:
                continue

            try:
                payload = json.loads(text)
            except json.JSONDecodeError:
                await websocket.send_text(json.dumps({
                    "type": "error",
                    "message": "Invalid JSON",
                }))
                continue

            kind = payload.get("type") if isinstance(payload, dict) else None
            if kind == "start":
                audio_bytes.clear()
            elif kind == "end":
                # Whole int16 samples only; a trailing odd byte is ignored
                # instead of making frombuffer raise.
                sample_count = len(audio_bytes) // 2
                if sample_count == 0:
                    await websocket.send_text(json.dumps({
                        "type": "error",
                        "message": "Audio is empty",
                    }))
                    continue
                audio_np = np.frombuffer(
                    audio_bytes, dtype=np.int16, count=sample_count
                ).astype(np.float32)
                audio_np /= 32767.0
                await generate_response(websocket, audio_np)
    except WebSocketDisconnect:
        # Can still surface from a send while a reply is being streamed.
        print("[WS] client disconnected")
 @app.get("/health")