SalexAI commited on
Commit
0a9dfed
·
verified ·
1 Parent(s): ce52252

Update app/main.py

Browse files
Files changed (1) hide show
  1. app/main.py +180 -15
app/main.py CHANGED
@@ -1,33 +1,198 @@
1
- from fastapi import FastAPI
2
- from fastapi.responses import JSONResponse
 
 
 
 
3
 
4
  import numpy as np
5
- from fastrtc import Stream, ReplyOnPause
 
 
 
 
 
 
 
 
 
6
 
7
  app = FastAPI()
8
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- # Simple echo handler to verify your stream works end-to-end
11
- def echo(audio: tuple[int, np.ndarray]):
12
- # audio is (sample_rate, int16 numpy array)
13
- yield audio
 
 
 
 
 
 
 
 
 
14
 
15
 
16
  stream = Stream(
17
- handler=ReplyOnPause(echo), # VAD-ish turn-taking
18
  modality="audio",
19
  mode="send-receive",
 
20
  )
21
 
22
- # Mount FastRTC endpoints onto this FastAPI app
23
  stream.mount(app)
24
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
 
 
 
26
  @app.get("/")
27
  async def root():
28
- return JSONResponse(
29
- {
30
- "ok": True,
31
- "message": "FastRTC mounted. Use the mounted endpoints for WebRTC/WebSocket.",
32
- }
33
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import base64
3
+ import json
4
+ import os
5
+ import uuid
6
+ from typing import AsyncGenerator, Literal, Optional
7
 
8
  import numpy as np
9
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
10
+ from fastapi.responses import JSONResponse, StreamingResponse
11
+ from dotenv import load_dotenv
12
+
13
+ from fastrtc import AdditionalOutputs, AsyncStreamHandler, Stream, wait_for_item
14
+
15
+ # ---- Gemini (optional for later; right now we keep your echo handler working) ----
16
+ # You can plug Gemini back in once bridge works.
17
+
18
+ load_dotenv()
19
 
20
  app = FastAPI()
21
 
22
+ # ---------------------------
23
+ # Minimal VAD echo handler (server is already booting with this)
24
+ # ---------------------------
25
class EchoHandler(AsyncStreamHandler):
    """Minimal echo handler: every received audio frame is queued back out unchanged.

    Used to verify the FastRTC send-receive pipeline end-to-end before a real
    model (e.g. Gemini) is plugged in.
    """

    def __init__(self, expected_layout: Literal["mono"] = "mono", output_sample_rate: int = 24000):
        super().__init__(expected_layout=expected_layout, output_sample_rate=output_sample_rate, input_sample_rate=16000)
        # Keep the constructor args so copy() can clone an identically-configured
        # handler instead of silently reverting to defaults.
        self._expected_layout: Literal["mono"] = expected_layout
        self._output_sample_rate: int = output_sample_rate
        # Frames (sample_rate, int16 ndarray) or AdditionalOutputs waiting to be emitted.
        self.out_q: asyncio.Queue[tuple[int, np.ndarray] | AdditionalOutputs] = asyncio.Queue()

    def copy(self):
        # BUG FIX: previously returned EchoHandler() with default arguments,
        # dropping any non-default layout / output sample rate configuration.
        return EchoHandler(self._expected_layout, self._output_sample_rate)

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        """Normalize an incoming frame to a mono int16 array and enqueue it for echo."""
        sr, audio = frame
        audio = np.asarray(audio)
        if audio.ndim == 2:
            audio = audio.squeeze()
        if audio.dtype != np.int16:
            audio = audio.astype(np.int16)

        # Echo back immediately as "audio"; reshape(1, -1) matches the
        # (channels, samples) layout the rest of this file uses.
        self.out_q.put_nowait((sr, audio.reshape(1, -1)))

    async def emit(self):
        """Return the next queued item, waiting until one is available."""
        return await wait_for_item(self.out_q)
46
 
47
 
48
# Global FastRTC stream: the handler instance is cloned per connection
# (via its .copy() method) by FastRTC itself.
stream = Stream(
    handler=EchoHandler(),
    modality="audio",
    mode="send-receive",
    # NOTE(review): FastRTC's additional_inputs normally takes UI components,
    # not bare strings — confirm the installed build accepts this placeholder.
    additional_inputs=["voice_name"],  # placeholder for later
)

# Attach FastRTC's WebRTC/WebSocket endpoints onto this FastAPI app.
stream.mount(app)
56
 
57
+ # ---------------------------
58
+ # Helpers
59
+ # ---------------------------
60
def b64_to_int16(b64: str) -> np.ndarray:
    """Decode base64-encoded PCM16 bytes into a 1-D int16 numpy array."""
    return np.frombuffer(base64.b64decode(b64), dtype=np.int16)
63
+
64
def int16_to_b64(audio: np.ndarray) -> str:
    """Encode an audio array as base64 PCM16, casting to int16 if necessary."""
    pcm = audio if audio.dtype == np.int16 else audio.astype(np.int16)
    return base64.b64encode(pcm.tobytes()).decode("utf-8")
68
+
69
 
70
+ # ---------------------------
71
+ # Basic endpoints
72
+ # ---------------------------
73
@app.get("/")
async def root():
    """Liveness message confirming the FastRTC endpoints are mounted."""
    payload = {
        "ok": True,
        "message": "FastRTC mounted. Use the mounted endpoints for WebRTC/WebSocket.",
    }
    return payload
76
+
77
@app.get("/health")
async def health():
    """Trivial health-check endpoint."""
    return dict(ok=True)
80
+
81
@app.get("/webrtc/new")
async def webrtc_new():
    """
    Mint a webrtc_id to use with /outputs or /ws bridge.
    """
    # FastRTC creates its connection state lazily on first use; all we do here
    # is hand the client a stable identifier to reference later.
    fresh_id = uuid.uuid4()
    return {"webrtc_id": str(fresh_id)}
90
+
91
@app.get("/outputs")
async def outputs(webrtc_id: str):
    """Server-sent-events feed forwarding AdditionalOutputs for one webrtc_id."""

    async def event_stream():
        async for item in stream.output_stream(webrtc_id):
            data = item.args[0] if item.args else None
            yield "event: output\ndata: " + json.dumps(data) + "\n\n"

    return StreamingResponse(event_stream(), media_type="text/event-stream")
98
+
99
+
100
+ # ---------------------------
101
+ # Scratch-friendly WebSocket bridge
102
+ # ---------------------------
103
@app.websocket("/ws")
async def ws_bridge(ws: WebSocket):
    """Scratch-friendly WebSocket bridge in front of the FastRTC stream.

    Client -> server JSON frames:
      {"type": "connect", "webrtc_id"?: str, "voice"?: str} — bind a connection,
          start the output-forwarding loops, reply with {"type": "ready", ...}.
      {"type": "audio", "data": <b64 pcm16>, "rate"?: int} — feed audio in.
      {"type": "close"} — close the socket.

    Server -> client JSON frames:
      {"type": "audio_delta", "rate": int, "data": <b64 pcm16>}
      {"type": "output", "data": ...}  (AdditionalOutputs payloads)
      {"type": "ready" | "error", ...}
    """
    await ws.accept()

    webrtc_id: Optional[str] = None
    # BUG FIX: asyncio.gather() returns a Future, not a Task — the previous
    # Optional[asyncio.Task] annotation was wrong.
    out_task: Optional[asyncio.Future] = None

    async def send_outputs_loop(conn_id: str):
        # Forward AdditionalOutputs coming out of FastRTC as JSON.
        try:
            async for item in stream.output_stream(conn_id):
                msg = item.args[0] if item.args else None
                await ws.send_text(json.dumps({"type": "output", "data": msg}))
        except Exception:
            # Best-effort: the socket may already be gone; ws_bridge's finally
            # block handles cleanup.
            pass

    async def send_audio_loop(conn_id: str):
        # output_stream yields AdditionalOutputs only, so raw audio tuples are
        # read from stream.stream_output(...) instead.
        try:
            async for out in stream.stream_output(conn_id):
                # out can be (sr, np.ndarray) or AdditionalOutputs
                if isinstance(out, AdditionalOutputs):
                    continue
                sr, audio = out
                audio = np.asarray(audio)
                if audio.ndim == 2:
                    audio = audio.squeeze()
                if audio.dtype != np.int16:
                    audio = audio.astype(np.int16)
                await ws.send_text(json.dumps({
                    "type": "audio_delta",
                    "rate": int(sr),
                    "data": int16_to_b64(audio)
                }))
        except Exception:
            pass

    try:
        while True:
            raw = await ws.receive_text()
            msg = json.loads(raw)
            t = msg.get("type")

            if t == "connect":
                # create or use provided webrtc_id
                webrtc_id = msg.get("webrtc_id") or str(uuid.uuid4())

                # optionally set voice / other inputs (stored for handler)
                voice = msg.get("voice") or "Puck"
                try:
                    await stream.set_input(webrtc_id, voice)
                except Exception:
                    # if set_input isn't supported in your exact FastRTC build, ignore
                    pass

                # (Re)start output loops. BUG FIX: a second "connect" used to
                # be ignored (loops stayed bound to the old webrtc_id); now the
                # old loops are cancelled and fresh ones started, with the id
                # passed explicitly so closures can't see a stale rebind.
                if out_task is not None:
                    out_task.cancel()
                out_task = asyncio.gather(
                    send_audio_loop(webrtc_id),
                    send_outputs_loop(webrtc_id),
                    return_exceptions=True,
                )

                await ws.send_text(json.dumps({"type": "ready", "webrtc_id": webrtc_id}))
                continue

            if t == "audio":
                if not webrtc_id:
                    await ws.send_text(json.dumps({"type": "error", "message": "Not connected. Send {type:'connect'} first."}))
                    continue

                b64 = msg.get("data")
                rate = int(msg.get("rate") or 16000)

                if not isinstance(b64, str) or not b64:
                    continue

                audio = b64_to_int16(b64)

                # FastRTC expects (sample_rate, np.ndarray); (1, n) = mono frame.
                await stream.send_input(webrtc_id, (rate, audio.reshape(1, -1)))
                continue

            if t == "close":
                await ws.close()
                return

            await ws.send_text(json.dumps({"type": "error", "message": f"Unknown type: {t}"}))

    except WebSocketDisconnect:
        pass
    finally:
        # Stop the forwarding loops when the socket goes away.
        try:
            if out_task:
                out_task.cancel()
        except Exception:
            pass