Spaces:

SalexAI
/

api

Sleeping

App Files Files Community

SalexAI commited on Feb 13

Commit

fc77df5

verified ·

1 Parent(s): ed9acde

Update app/main.py

Browse files

Files changed (1) hide show

app/main.py +145 -106

app/main.py CHANGED Viewed

@@ -1,161 +1,200 @@
 import asyncio
-import base64
 import json
 import uuid
-from typing import Optional, Literal
 import numpy as np
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from fastapi.responses import JSONResponse
-from fastrtc import Stream, AsyncStreamHandler, wait_for_item, AdditionalOutputs
 app = FastAPI()
-# ---------------------------
-# A tiny headless audio handler (echo) to validate the pipe.
-# Swap this out later for Gemini / other realtime models.
-# ---------------------------
-class EchoHandler(AsyncStreamHandler):
-    def __init__(self, expected_layout: Literal["mono"] = "mono", output_sample_rate: int = 24000):
-        super().__init__(expected_layout=expected_layout, output_sample_rate=output_sample_rate, input_sample_rate=16000)
-        self.out_q: asyncio.Queue[tuple[int, np.ndarray] | AdditionalOutputs] = asyncio.Queue()
-    def copy(self):
-        return EchoHandler()
-    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
-        sr, audio = frame
-        audio = np.asarray(audio)
-        if audio.ndim == 2:
-            audio = audio.squeeze()
-        if audio.dtype != np.int16:
-            audio = audio.astype(np.int16)
-        # Echo straight back
-        self.out_q.put_nowait((sr, audio.reshape(1, -1)))
-    async def emit(self):
-        return await wait_for_item(self.out_q)
-# IMPORTANT: no additional_inputs here (strings crash in 0.0.34)
-stream = Stream(
-    handler=EchoHandler(),
     modality="audio",
     mode="send-receive",
 )
-# This mounts FastRTC’s internal routes; we’re also adding /ws below.
-stream.mount(app)
-# ---------------------------
-# Helpers
-# ---------------------------
-def b64_to_int16(b64: str) -> np.ndarray:
-    raw = base64.b64decode(b64)
-    return np.frombuffer(raw, dtype=np.int16)
-def int16_to_b64(audio: np.ndarray) -> str:
-    if audio.dtype != np.int16:
-        audio = audio.astype(np.int16)
-    return base64.b64encode(audio.tobytes()).decode("utf-8")
 @app.get("/")
 async def root():
-    return {"ok": True, "message": "FastRTC mounted. Headless mode. Use /ws for Scratch."}
-@app.get("/health")
-async def health():
-    return {"ok": True}
-# ---------------------------
-# Scratch-friendly WS bridge (no WebRTC needed client-side)
-# ---------------------------
 @app.websocket("/ws")
-async def ws_bridge(ws: WebSocket):
     await ws.accept()
     session_id: Optional[str] = None
-    pump_task: Optional[asyncio.Task] = None
-    async def pump_outputs():
-        """
-        Pull audio outputs from FastRTC and forward to client.
-        NOTE: FastRTC 0.0.34 uses fetch_output() polling style.
-        """
-        try:
-            while True:
-                out = await stream.fetch_output(session_id)
-                if out is None:
-                    await asyncio.sleep(0.01)
-                    continue
-                if isinstance(out, AdditionalOutputs):
-                    payload = out.args[0] if out.args else None
-                    await ws.send_text(json.dumps({"type": "output", "data": payload}))
-                    continue
-                sr, audio = out
-                audio = np.asarray(audio)
-                if audio.ndim == 2:
-                    audio = audio.squeeze()
-                if audio.dtype != np.int16:
-                    audio = audio.astype(np.int16)
-                await ws.send_text(json.dumps({
-                    "type": "audio_delta",
-                    "rate": int(sr),
-                    "data": int16_to_b64(audio),
-                }))
-        except Exception:
-            return
     try:
         while True:
             raw = await ws.receive_text()
-            msg = json.loads(raw)
-            t = msg.get("type")
-            if t == "connect":
                 session_id = msg.get("session_id") or str(uuid.uuid4())
-                # start pump once
-                if pump_task is None:
-                    pump_task = asyncio.create_task(pump_outputs())
-                await ws.send_text(json.dumps({"type": "ready", "session_id": session_id}))
                 continue
-            if t == "audio":
-                if not session_id:
-                    await ws.send_text(json.dumps({"type": "error", "message": "Send {type:'connect'} first."}))
                     continue
-                b64 = msg.get("data")
-                rate = int(msg.get("rate") or 16000)
-                if not isinstance(b64, str) or not b64:
                     continue
-                audio = b64_to_int16(b64)
-                await stream.send_input(session_id, (rate, audio.reshape(1, -1)))
                 continue
-            if t == "close":
-                await ws.close()
-                return
-            await ws.send_text(json.dumps({"type": "error", "message": f"Unknown type: {t}"}))
     except WebSocketDisconnect:
-        pass
-    finally:
         try:
-            if pump_task:
-                pump_task.cancel()
         except Exception:
             pass

+from __future__ import annotations
 import asyncio
 import json
+import os
 import uuid
+from typing import Any, Dict, Optional
 import numpy as np
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from fastapi.responses import JSONResponse
+from fastrtc import Stream, ReplyOnPause, get_stt_model, get_tts_model
+from .gemini_text import (
+    gemini_chat_turn,
+    get_session,
+    deliver_function_result,
+)
 app = FastAPI()
+# ----------------------------
+# FastRTC Voice Chat (VAD + STT + TTS)
+# ----------------------------
+# These are CPU-friendly, but still heavy on Spaces. Keep them global.
+STT_MODEL_NAME = os.getenv("FASTRTC_STT_MODEL", "moonshine/tiny")
+TTS_MODEL_NAME = os.getenv("FASTRTC_TTS_MODEL", "kokoro")
+stt = get_stt_model(model=STT_MODEL_NAME)
+tts = get_tts_model(model=TTS_MODEL_NAME)
+def _voice_reply_fn(audio: tuple[int, np.ndarray]):
+    """
+    Called when the user pauses (VAD). Returns streamed audio frames (TTS).
+    """
+    # audio is (sample_rate, int16 mono ndarray)
+    # FastRTC STT expects "audio" in the same tuple form per docs examples.
+    user_text = stt.stt(audio).strip()
+    if not user_text:
+        return
+    # For voice sessions we create a synthetic session_id (not Scratch ws session)
+    # because FastRTC’s ReplyOnPause fn signature doesn’t expose the RTC session id.
+    # This keeps a stable conversation state per-process, but not per-user.
+    #
+    # If you need per-user memory for voice, we can switch to a stateful StreamHandler later.
+    voice_session_id = "voice-global"
+    async def run():
+        # No tool bounce for voice by default (still supported via same session registry if you want)
+        async def noop_emit(_evt: dict):
+            return
+        text = await gemini_chat_turn(
+            session_id=voice_session_id,
+            user_text=user_text,
+            emit_event=noop_emit,
+            model=os.getenv("GEMINI_TEXT_MODEL", "gemini-2.0-flash"),
+        )
+        return text
+    text = asyncio.get_event_loop().run_until_complete(run())
+    # Stream TTS back
+    for chunk in tts.stream_tts_sync(text):
+        # chunk is already an audio frame compatible with FastRTC
+        yield chunk
+voice_stream = Stream(
     modality="audio",
     mode="send-receive",
+    handler=ReplyOnPause(_voice_reply_fn),
 )
+# Mount FastRTC endpoints (WebRTC + WebSocket) under /rtc
+voice_stream.mount(app, path="/rtc")
+# ----------------------------
+# Scratch-friendly WebSocket API (text + function calling)
+# ----------------------------
 @app.get("/")
 async def root():
+    return JSONResponse(
+        {
+            "ok": True,
+            "service": "salexai-api",
+            "ws": "/ws",
+            "fastrtc": "/rtc",
+            "notes": [
+                "Use /ws for Scratch JSON chat + function calling.",
+                "Use /rtc for FastRTC voice chat endpoints (VAD/STT/TTS handled by FastRTC).",
+            ],
+        }
+    )
 @app.websocket("/ws")
+async def ws_endpoint(ws: WebSocket):
     await ws.accept()
     session_id: Optional[str] = None
+    async def emit(evt: dict):
+        await ws.send_text(json.dumps(evt))
     try:
         while True:
             raw = await ws.receive_text()
+            msg = json.loads(raw) if raw else {}
+            mtype = msg.get("type")
+            if mtype == "connect":
                 session_id = msg.get("session_id") or str(uuid.uuid4())
+                get_session(session_id)  # ensure exists
+                await emit({"type": "ready", "session_id": session_id})
+                continue
+            if not session_id:
+                await emit({"type": "error", "message": "Not connected. Send {type:'connect'} first."})
+                continue
+            # -------- function registry --------
+            if mtype == "add_function":
+                name = str(msg.get("name") or "").strip()
+                schema = msg.get("schema") or {}
+                if not name:
+                    await emit({"type": "error", "message": "add_function missing name"})
+                    continue
+                s = get_session(session_id)
+                s.functions[name] = schema
+                await emit({"type": "function_added", "name": name})
                 continue
+            if mtype == "remove_function":
+                name = str(msg.get("name") or "").strip()
+                s = get_session(session_id)
+                if name in s.functions:
+                    s.functions.pop(name, None)
+                    await emit({"type": "function_removed", "name": name})
+                else:
+                    await emit({"type": "warning", "message": f"Function not found: {name}"})
+                continue
+            if mtype == "list_functions":
+                s = get_session(session_id)
+                await emit({"type": "functions", "items": list(s.functions.keys())})
+                continue
+            # Client returns tool results
+            if mtype == "function_result":
+                call_id = msg.get("call_id")
+                result = msg.get("result")
+                if not call_id:
+                    await emit({"type": "error", "message": "function_result missing call_id"})
                     continue
+                ok = deliver_function_result(session_id, call_id, result)
+                if not ok:
+                    await emit({"type": "warning", "message": f"No pending call_id: {call_id}"})
+                else:
+                    await emit({"type": "function_result_ack", "call_id": call_id})
+                continue
+            # -------- chat --------
+            if mtype == "send":
+                text = str(msg.get("text") or "")
+                if not text.strip():
+                    await emit({"type": "error", "message": "Empty text"})
                     continue
+                try:
+                    assistant_text = await gemini_chat_turn(
+                        session_id=session_id,
+                        user_text=text,
+                        emit_event=emit,  # this is where tool calls get emitted
+                        model=os.getenv("GEMINI_TEXT_MODEL", "gemini-2.0-flash"),
+                    )
+                    await emit({"type": "assistant", "text": assistant_text})
+                except Exception as e:
+                    await emit({"type": "error", "message": f"Gemini error: {e}"})
                 continue
+            await emit({"type": "error", "message": f"Unknown type: {mtype}"})
     except WebSocketDisconnect:
+        return
+    except Exception as e:
         try:
+            await emit({"type": "error", "message": f"WS crashed: {e}"})
         except Exception:
             pass