SalexAI committed on
Commit
176ee90
·
verified ·
1 Parent(s): 8ce42f6

Update app/main.py

Browse files
Files changed (1) hide show
  1. app/main.py +68 -95
app/main.py CHANGED
@@ -11,9 +11,8 @@ import websockets
11
 
12
  load_dotenv()
13
 
14
- app = FastAPI(title="Gemini Live Native-Audio WS Proxy", version="2.0.0")
15
 
16
- # Gemini Live API WebSocket endpoint (v1beta, BidiGenerateContent)
17
  GEMINI_LIVE_WS_URL = (
18
  "wss://generativelanguage.googleapis.com/ws/"
19
  "google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent"
@@ -21,31 +20,54 @@ GEMINI_LIVE_WS_URL = (
21
 
22
  API_KEY = os.getenv("GEMINI_API_KEY", "").strip()
23
 
24
- # Defaults (override via HF Space variables)
25
- DEFAULT_MODEL = os.getenv("GEMINI_MODEL", "models/gemini-2.0-flash-live-001")
26
- DEFAULT_SYSTEM = os.getenv("GEMINI_SYSTEM_INSTRUCTION", "You are a helpful assistant for a school coding club.")
 
 
 
 
 
 
27
  DEFAULT_TEMPERATURE = float(os.getenv("GEMINI_TEMPERATURE", "0.7"))
28
  DEFAULT_MAX_TOKENS = int(os.getenv("GEMINI_MAX_OUTPUT_TOKENS", "1024"))
29
 
30
- # Native-audio config defaults
31
  DEFAULT_VOICE = os.getenv("GEMINI_VOICE_NAME", "Kore")
32
- # input audio: most common is 16k PCM16 mono
33
  DEFAULT_INPUT_RATE = int(os.getenv("GEMINI_INPUT_AUDIO_RATE", "16000"))
34
- # output audio: docs commonly mention 24k PCM16
35
  DEFAULT_OUTPUT_RATE = int(os.getenv("GEMINI_OUTPUT_AUDIO_RATE", "24000"))
36
 
37
- # Debug passthrough (set to "1" to enable)
38
  DEBUG_GEMINI_RAW = os.getenv("DEBUG_GEMINI_RAW", "0").strip() == "1"
39
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  @app.get("/health")
42
  async def health():
 
43
  ok = bool(API_KEY)
44
  return JSONResponse(
45
  {
46
  "ok": ok,
47
  "has_api_key": ok,
48
- "model": DEFAULT_MODEL,
49
  "voice": DEFAULT_VOICE,
50
  "input_rate": DEFAULT_INPUT_RATE,
51
  "output_rate": DEFAULT_OUTPUT_RATE,
@@ -64,9 +86,6 @@ def _extract_text_parts(content: Dict[str, Any]) -> str:
64
 
65
 
66
  def _extract_inline_audio_parts(content: Dict[str, Any]) -> List[Dict[str, str]]:
67
- """
68
- Returns list of {"mime": "...", "data": "base64..."} for any inlineData parts.
69
- """
70
  parts = content.get("parts") or []
71
  out: List[Dict[str, str]] = []
72
  for p in parts:
@@ -86,14 +105,13 @@ async def _gemini_ws_connect(setup_payload: Dict[str, Any]):
86
  ws = await websockets.connect(
87
  GEMINI_LIVE_WS_URL,
88
  extra_headers=headers,
89
- max_size=16 * 1024 * 1024,
90
  ping_interval=20,
91
  ping_timeout=20,
92
  )
93
 
94
  await ws.send(json.dumps(setup_payload))
95
 
96
- # wait for setupComplete
97
  while True:
98
  raw = await ws.recv()
99
  msg = json.loads(raw)
@@ -105,28 +123,6 @@ async def _gemini_ws_connect(setup_payload: Dict[str, Any]):
105
 
106
  @app.websocket("/ws")
107
  async def ws_proxy(client_ws: WebSocket):
108
- """
109
- Client protocol (native-audio + VAD friendly):
110
- -> {"type":"configure", "model": "...", "system_instruction": "...", "temperature": 0.7,
111
- "max_output_tokens": 1024, "voice": "Kore", "input_rate": 16000}
112
- (optional, must be first; else defaults are used)
113
-
114
- -> {"type":"audio","data":"<base64 pcm16 mono>","rate":16000}
115
- (send repeatedly while user is speaking)
116
-
117
- -> {"type":"audio_end"}
118
- (send when VAD decides user stopped speaking; triggers assistant response)
119
-
120
- -> {"type":"text","text":"..."} (optional helper; NOT the main mode for native audio)
121
-
122
- Server -> client:
123
- <- {"type":"ready"}
124
- <- {"type":"text_delta","text":"..."} (assistant text parts, if any)
125
- <- {"type":"audio_delta","mime":"...","data":"..."} (assistant audio chunks)
126
- <- {"type":"turn_complete"}
127
- <- {"type":"error","message":"..."}
128
- <- {"type":"gemini_raw","message":{...}} (only if DEBUG_GEMINI_RAW=1)
129
- """
130
  await client_ws.accept()
131
 
132
  if not API_KEY:
@@ -134,60 +130,57 @@ async def ws_proxy(client_ws: WebSocket):
134
  await client_ws.close(code=1011)
135
  return
136
 
137
- # --- Phase 1: accept optional configure before connecting to Gemini ---
138
  cfg = {
139
- "model": DEFAULT_MODEL,
140
- "system_instruction": DEFAULT_SYSTEM,
141
  "temperature": DEFAULT_TEMPERATURE,
142
  "max_output_tokens": DEFAULT_MAX_TOKENS,
143
- "voice": DEFAULT_VOICE,
144
  "input_rate": DEFAULT_INPUT_RATE,
145
  }
146
 
147
- async def _wait_for_optional_config(timeout_s: float = 1.2):
148
- try:
149
- raw = await asyncio.wait_for(client_ws.receive_text(), timeout=timeout_s)
150
- except asyncio.TimeoutError:
151
- return
152
- except Exception:
153
- return
154
-
155
- data = json.loads(raw)
156
- if data.get("type") != "configure":
157
- # if first message is not configure, we treat it as "not configure"
158
- # and stash it for later by putting it into a queue (simple: handle inline)
159
- return data
160
-
161
- # apply config
162
- if isinstance(data.get("model"), str) and data["model"].strip():
163
- cfg["model"] = data["model"].strip()
164
- if isinstance(data.get("system_instruction"), str) and data["system_instruction"].strip():
165
- cfg["system_instruction"] = data["system_instruction"].strip()
166
- if data.get("temperature") is not None:
167
  try:
168
- cfg["temperature"] = float(data["temperature"])
 
169
  except Exception:
170
  pass
171
- if data.get("max_output_tokens") is not None:
172
  try:
173
- cfg["max_output_tokens"] = int(data["max_output_tokens"])
 
174
  except Exception:
175
  pass
176
- if isinstance(data.get("voice"), str) and data["voice"].strip():
177
- cfg["voice"] = data["voice"].strip()
178
- if data.get("input_rate") is not None:
179
  try:
180
- cfg["input_rate"] = int(data["input_rate"])
 
181
  except Exception:
182
  pass
183
 
184
- await client_ws.send_text(json.dumps({"type": "configured"}))
185
- return None
 
 
 
 
 
186
 
187
- first_non_config = await _wait_for_optional_config()
 
188
 
189
- # --- Phase 2: connect to Gemini with native-audio setup ---
190
- # NOTE: For native-audio models, AUDIO modality is required.
191
  setup_payload = {
192
  "setup": {
193
  "model": cfg["model"],
@@ -203,7 +196,6 @@ async def ws_proxy(client_ws: WebSocket):
203
  }
204
  },
205
  },
206
- # Enable transcripts so Scratch can display text while audio plays
207
  "inputAudioTranscription": {},
208
  "outputAudioTranscription": {},
209
  "systemInstruction": {
@@ -218,15 +210,12 @@ async def ws_proxy(client_ws: WebSocket):
218
 
219
  try:
220
  gemini_ws = await _gemini_ws_connect(setup_payload)
221
- await client_ws.send_text(json.dumps({"type": "ready"}))
222
  except Exception as e:
223
  await client_ws.send_text(json.dumps({"type": "error", "message": f"Gemini setup failed: {e}"}))
224
  await client_ws.close(code=1011)
225
  return
226
 
227
- # If we consumed a non-config first message, we need to handle it.
228
- pending_first = first_non_config
229
-
230
  async def forward_client_to_gemini():
231
  nonlocal pending_first
232
  try:
@@ -245,7 +234,6 @@ async def ws_proxy(client_ws: WebSocket):
245
  return
246
 
247
  if t == "audio":
248
- # expects base64 PCM16 mono
249
  b64 = data.get("data")
250
  rate = data.get("rate", cfg["input_rate"])
251
  if not isinstance(b64, str) or not b64:
@@ -267,14 +255,10 @@ async def ws_proxy(client_ws: WebSocket):
267
  continue
268
 
269
  if t == "audio_end":
270
- # tell Gemini the input stream ended for this turn
271
- payload = {"realtimeInput": {"audioStreamEnd": True}}
272
- await gemini_ws.send(json.dumps(payload))
273
  continue
274
 
275
  if t == "text":
276
- # Optional helper: send text as a turn (some native-audio sessions still accept it),
277
- # but for voice-first you should mainly use audio.
278
  text = data.get("text", "")
279
  if isinstance(text, str) and text.strip():
280
  payload = {
@@ -286,11 +270,6 @@ async def ws_proxy(client_ws: WebSocket):
286
  await gemini_ws.send(json.dumps(payload))
287
  continue
288
 
289
- # Advanced passthrough
290
- if t == "live_raw" and isinstance(data.get("payload"), dict):
291
- await gemini_ws.send(json.dumps(data["payload"]))
292
- continue
293
-
294
  await client_ws.send_text(json.dumps({"type": "error", "message": f"Unknown message type: {t}"}))
295
 
296
  except WebSocketDisconnect:
@@ -315,19 +294,16 @@ async def ws_proxy(client_ws: WebSocket):
315
  if isinstance(server_content, dict):
316
  model_turn = server_content.get("modelTurn")
317
  if isinstance(model_turn, dict):
318
- # text parts
319
  txt = _extract_text_parts(model_turn)
320
  if txt:
321
  await client_ws.send_text(json.dumps({"type": "text_delta", "text": txt}))
322
 
323
- # audio parts (inlineData)
324
  audios = _extract_inline_audio_parts(model_turn)
325
  for a in audios:
326
  await client_ws.send_text(
327
  json.dumps({"type": "audio_delta", "mime": a["mime"], "data": a["data"]})
328
  )
329
 
330
- # Some implementations also include transcription fields; pass through if present
331
  out_tx = server_content.get("outputTranscription")
332
  if isinstance(out_tx, dict) and isinstance(out_tx.get("text"), str):
333
  await client_ws.send_text(
@@ -337,9 +313,6 @@ async def ws_proxy(client_ws: WebSocket):
337
  if server_content.get("generationComplete") is True:
338
  await client_ws.send_text(json.dumps({"type": "turn_complete"}))
339
 
340
- if "goAway" in msg:
341
- await client_ws.send_text(json.dumps({"type": "go_away", "goAway": msg["goAway"]}))
342
-
343
  except Exception as e:
344
  stop_event.set()
345
  try:
 
11
 
12
  load_dotenv()
13
 
14
+ app = FastAPI(title="Gemini Live Native-Audio WS Proxy", version="2.1.0")
15
 
 
16
  GEMINI_LIVE_WS_URL = (
17
  "wss://generativelanguage.googleapis.com/ws/"
18
  "google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent"
 
20
 
21
  API_KEY = os.getenv("GEMINI_API_KEY", "").strip()
22
 
23
+ # IMPORTANT: pick a REAL default model here (must support Live + native audio)
24
+ # Put your known-working native audio model id below:
25
+ FALLBACK_NATIVE_AUDIO_MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"
26
+
27
+ DEFAULT_MODEL = os.getenv("GEMINI_MODEL", FALLBACK_NATIVE_AUDIO_MODEL)
28
+ DEFAULT_SYSTEM = os.getenv(
29
+ "GEMINI_SYSTEM_INSTRUCTION",
30
+ "You are a helpful assistant for a school coding club."
31
+ )
32
  DEFAULT_TEMPERATURE = float(os.getenv("GEMINI_TEMPERATURE", "0.7"))
33
  DEFAULT_MAX_TOKENS = int(os.getenv("GEMINI_MAX_OUTPUT_TOKENS", "1024"))
34
 
 
35
  DEFAULT_VOICE = os.getenv("GEMINI_VOICE_NAME", "Kore")
 
36
  DEFAULT_INPUT_RATE = int(os.getenv("GEMINI_INPUT_AUDIO_RATE", "16000"))
 
37
  DEFAULT_OUTPUT_RATE = int(os.getenv("GEMINI_OUTPUT_AUDIO_RATE", "24000"))
38
 
 
39
  DEBUG_GEMINI_RAW = os.getenv("DEBUG_GEMINI_RAW", "0").strip() == "1"
40
 
41
 
42
+ def _clean_str(x: Any) -> str:
43
+ if not isinstance(x, str):
44
+ return ""
45
+ return x.strip()
46
+
47
+
48
+ def _is_bad_model(s: str) -> bool:
49
+ s2 = (s or "").strip().lower()
50
+ return (not s2) or (s2 in {"undefined", "null", "none"})
51
+
52
+
53
def _safe_model(model: Any) -> str:
    """Resolve *model* to a usable model id string.

    Tries the given value first, then DEFAULT_MODEL; the first candidate
    that is not blank/nullish (per _is_bad_model) wins. Falls back to the
    hard-coded FALLBACK_NATIVE_AUDIO_MODEL when both are unusable.
    """
    for candidate in (model, DEFAULT_MODEL):
        cleaned = _clean_str(candidate)
        if not _is_bad_model(cleaned):
            return cleaned
    return FALLBACK_NATIVE_AUDIO_MODEL
60
+
61
+
62
  @app.get("/health")
63
  async def health():
64
+ model = _safe_model(DEFAULT_MODEL)
65
  ok = bool(API_KEY)
66
  return JSONResponse(
67
  {
68
  "ok": ok,
69
  "has_api_key": ok,
70
+ "model": model,
71
  "voice": DEFAULT_VOICE,
72
  "input_rate": DEFAULT_INPUT_RATE,
73
  "output_rate": DEFAULT_OUTPUT_RATE,
 
86
 
87
 
88
  def _extract_inline_audio_parts(content: Dict[str, Any]) -> List[Dict[str, str]]:
 
 
 
89
  parts = content.get("parts") or []
90
  out: List[Dict[str, str]] = []
91
  for p in parts:
 
105
  ws = await websockets.connect(
106
  GEMINI_LIVE_WS_URL,
107
  extra_headers=headers,
108
+ max_size=32 * 1024 * 1024,
109
  ping_interval=20,
110
  ping_timeout=20,
111
  )
112
 
113
  await ws.send(json.dumps(setup_payload))
114
 
 
115
  while True:
116
  raw = await ws.recv()
117
  msg = json.loads(raw)
 
123
 
124
  @app.websocket("/ws")
125
  async def ws_proxy(client_ws: WebSocket):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  await client_ws.accept()
127
 
128
  if not API_KEY:
 
130
  await client_ws.close(code=1011)
131
  return
132
 
133
+ # Defaults per connection
134
  cfg = {
135
+ "model": _safe_model(DEFAULT_MODEL),
136
+ "system_instruction": _clean_str(DEFAULT_SYSTEM) or "You are helpful.",
137
  "temperature": DEFAULT_TEMPERATURE,
138
  "max_output_tokens": DEFAULT_MAX_TOKENS,
139
+ "voice": _clean_str(DEFAULT_VOICE) or "Kore",
140
  "input_rate": DEFAULT_INPUT_RATE,
141
  }
142
 
143
+ # Wait briefly for optional configure (FIRST message)
144
+ pending_first: Optional[Dict[str, Any]] = None
145
+ try:
146
+ raw = await asyncio.wait_for(client_ws.receive_text(), timeout=1.2)
147
+ first = json.loads(raw)
148
+ if isinstance(first, dict) and first.get("type") == "configure":
149
+ cfg["model"] = _safe_model(first.get("model"))
150
+ si = _clean_str(first.get("system_instruction"))
151
+ if si:
152
+ cfg["system_instruction"] = si
 
 
 
 
 
 
 
 
 
 
153
  try:
154
+ if first.get("temperature") is not None:
155
+ cfg["temperature"] = float(first["temperature"])
156
  except Exception:
157
  pass
 
158
  try:
159
+ if first.get("max_output_tokens") is not None:
160
+ cfg["max_output_tokens"] = int(first["max_output_tokens"])
161
  except Exception:
162
  pass
163
+ v = _clean_str(first.get("voice"))
164
+ if v:
165
+ cfg["voice"] = v
166
  try:
167
+ if first.get("input_rate") is not None:
168
+ cfg["input_rate"] = int(first["input_rate"])
169
  except Exception:
170
  pass
171
 
172
+ await client_ws.send_text(json.dumps({"type": "configured"}))
173
+ else:
174
+ pending_first = first if isinstance(first, dict) else None
175
+ except asyncio.TimeoutError:
176
+ pass
177
+ except Exception:
178
+ pass
179
 
180
+ # FINAL guard (this prevents "undefined" ever reaching Gemini)
181
+ cfg["model"] = _safe_model(cfg["model"])
182
 
183
+ # Build native-audio session setup
 
184
  setup_payload = {
185
  "setup": {
186
  "model": cfg["model"],
 
196
  }
197
  },
198
  },
 
199
  "inputAudioTranscription": {},
200
  "outputAudioTranscription": {},
201
  "systemInstruction": {
 
210
 
211
  try:
212
  gemini_ws = await _gemini_ws_connect(setup_payload)
213
+ await client_ws.send_text(json.dumps({"type": "ready", "model": cfg["model"]}))
214
  except Exception as e:
215
  await client_ws.send_text(json.dumps({"type": "error", "message": f"Gemini setup failed: {e}"}))
216
  await client_ws.close(code=1011)
217
  return
218
 
 
 
 
219
  async def forward_client_to_gemini():
220
  nonlocal pending_first
221
  try:
 
234
  return
235
 
236
  if t == "audio":
 
237
  b64 = data.get("data")
238
  rate = data.get("rate", cfg["input_rate"])
239
  if not isinstance(b64, str) or not b64:
 
255
  continue
256
 
257
  if t == "audio_end":
258
+ await gemini_ws.send(json.dumps({"realtimeInput": {"audioStreamEnd": True}}))
 
 
259
  continue
260
 
261
  if t == "text":
 
 
262
  text = data.get("text", "")
263
  if isinstance(text, str) and text.strip():
264
  payload = {
 
270
  await gemini_ws.send(json.dumps(payload))
271
  continue
272
 
 
 
 
 
 
273
  await client_ws.send_text(json.dumps({"type": "error", "message": f"Unknown message type: {t}"}))
274
 
275
  except WebSocketDisconnect:
 
294
  if isinstance(server_content, dict):
295
  model_turn = server_content.get("modelTurn")
296
  if isinstance(model_turn, dict):
 
297
  txt = _extract_text_parts(model_turn)
298
  if txt:
299
  await client_ws.send_text(json.dumps({"type": "text_delta", "text": txt}))
300
 
 
301
  audios = _extract_inline_audio_parts(model_turn)
302
  for a in audios:
303
  await client_ws.send_text(
304
  json.dumps({"type": "audio_delta", "mime": a["mime"], "data": a["data"]})
305
  )
306
 
 
307
  out_tx = server_content.get("outputTranscription")
308
  if isinstance(out_tx, dict) and isinstance(out_tx.get("text"), str):
309
  await client_ws.send_text(
 
313
  if server_content.get("generationComplete") is True:
314
  await client_ws.send_text(json.dumps({"type": "turn_complete"}))
315
 
 
 
 
316
  except Exception as e:
317
  stop_event.set()
318
  try: