Spaces:

SalexAI
/

api

Sleeping

App Files Files Community

SalexAI committed on Feb 12

Commit

9e04ed9

verified ·

1 Parent(s): 6a84946

Update app/main.py

Browse files

Files changed (1) hide show

app/main.py +34 -136

app/main.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import asyncio
-import base64
 import json
 import os
 from typing import AsyncGenerator, Literal
@@ -9,91 +8,36 @@ from dotenv import load_dotenv
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
-from fastrtc import (
-    AdditionalOutputs,
-    AsyncStreamHandler,
-    Stream,
-    wait_for_item,
-)
-from google import genai
-from google.genai.types import (
-    LiveConnectConfig,
-    PrebuiltVoiceConfig,
-    SpeechConfig,
-    VoiceConfig,
-)
-load_dotenv()
-# ---------------------------
-# Config (env vars)
-# ---------------------------
-# Put this in your HF Space "Secrets":
-#   GEMINI_API_KEY = "..."
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
-# Gemini realtime model (this is the one FastRTC uses in their Gemini demo Space)
-# You can change this later to another Live-capable model.
 GEMINI_LIVE_MODEL = os.getenv("GEMINI_LIVE_MODEL", "gemini-2.0-flash-exp")
-# Voice name (FastRTC Gemini demo uses "Puck" by default)
 DEFAULT_VOICE = os.getenv("GEMINI_VOICE", "Puck")
-# Sample rates
 OUTPUT_SAMPLE_RATE = int(os.getenv("OUTPUT_SAMPLE_RATE", "24000"))
-INPUT_SAMPLE_RATE = int(os.getenv("INPUT_SAMPLE_RATE", "16000"))  # matches the demo Space
-def _encode_pcm16_mono_to_b64(data: np.ndarray) -> str:
-    """
-    Encodes int16 mono PCM to base64 for any custom debug endpoints.
-    """
-    if data.dtype != np.int16:
-        data = data.astype(np.int16)
-    return base64.b64encode(data.tobytes()).decode("utf-8")
 class GeminiLiveAudioHandler(AsyncStreamHandler):
-    """
-    FastRTC AsyncStreamHandler that connects to Gemini Live and streams AUDIO back.
-    This is adapted from the official FastRTC Gemini demo Space code.
-    """
-    def __init__(
-        self,
-        expected_layout: Literal["mono"] = "mono",
-        output_sample_rate: int = OUTPUT_SAMPLE_RATE,
-    ) -> None:
-        super().__init__(
-            expected_layout=expected_layout,
-            output_sample_rate=output_sample_rate,
-            input_sample_rate=INPUT_SAMPLE_RATE,
-        )
         self.input_queue: asyncio.Queue[bytes] = asyncio.Queue()
         self.output_queue: asyncio.Queue[tuple[int, np.ndarray] | AdditionalOutputs] = asyncio.Queue()
         self.quit = asyncio.Event()
     def copy(self) -> "GeminiLiveAudioHandler":
-        # FastRTC uses .copy() to clone per-connection handlers
-        return GeminiLiveAudioHandler(
-            expected_layout="mono",
-            output_sample_rate=self.output_sample_rate,
-        )
     async def start_up(self) -> None:
-        """
-        Connect to Gemini Live, then continuously:
-          - read user audio from self.stream()
-          - receive model audio chunks and push them to output_queue
-        """
-        # Optional: allow per-connection overrides via "additional_inputs"
-        # We wait for args to be set (FastRTC API docs show wait_for_args usage).
         await self.wait_for_args()
-        # latest_args includes metadata at [0]; any custom inputs start at [1]
-        # We'll accept: voice_name (str) as the single custom arg, fallback to DEFAULT_VOICE.
         voice_name = DEFAULT_VOICE
         try:
             if len(self.latest_args) >= 2 and isinstance(self.latest_args[1], str) and self.latest_args[1].strip():
@@ -101,21 +45,14 @@ class GeminiLiveAudioHandler(AsyncStreamHandler):
         except Exception:
             pass
-        api_key = GEMINI_API_KEY
-        if not api_key:
-            # Fail early with a helpful message in the client.
-            await self.output_queue.put(
-                AdditionalOutputs({"type": "error", "message": "Missing GEMINI_API_KEY env var on the server."})
-            )
             return
-        client = genai.Client(
-            api_key=api_key,
-            http_options={"api_version": "v1alpha"},  # matches FastRTC Gemini demo Space :contentReference[oaicite:7]{index=7}
-        )
         config = LiveConnectConfig(
-            response_modalities=["AUDIO"],  # AUDIO-only mode :contentReference[oaicite:8]{index=8}
             speech_config=SpeechConfig(
                 voice_config=VoiceConfig(
                     prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=voice_name)
@@ -124,104 +61,65 @@ class GeminiLiveAudioHandler(AsyncStreamHandler):
         )
         async with client.aio.live.connect(model=GEMINI_LIVE_MODEL, config=config) as session:
-            # session.start_stream takes an async generator of bytes
             async for audio in session.start_stream(stream=self._stream_pcm(), mime_type="audio/pcm"):
-                if audio.data:
-                    # Gemini returns pcm16 bytes; convert to int16 array
                     arr = np.frombuffer(audio.data, dtype=np.int16)
-                    # FastRTC expects (sample_rate, np.ndarray) shaped like (1, n) or (n,) depending on handler usage.
                     self.output_queue.put_nowait((self.output_sample_rate, arr.reshape(1, -1)))
     async def _stream_pcm(self) -> AsyncGenerator[bytes, None]:
-        """
-        Provides PCM bytes to Gemini Live continuously.
-        """
         while not self.quit.is_set():
             try:
                 chunk = await asyncio.wait_for(self.input_queue.get(), timeout=0.1)
                 yield chunk
-            except (asyncio.TimeoutError, TimeoutError):
                 pass
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
-        """
-        Called by FastRTC as audio frames arrive from the client.
-        """
         _, audio = frame
-        # Expect mono, int16-ish. Convert safely.
         audio = np.asarray(audio)
         if audio.ndim == 2:
             audio = audio.squeeze()
         if audio.dtype != np.int16:
             audio = audio.astype(np.int16)
-        # Push raw PCM16 bytes to Gemini stream
         self.input_queue.put_nowait(audio.tobytes())
-    async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
-        """
-        Called by FastRTC to get the next outbound chunk (audio or structured outputs).
-        """
         return await wait_for_item(self.output_queue)
     async def shutdown(self) -> None:
         self.quit.set()
-# ---------------------------
-# FastRTC Stream + FastAPI
-# ---------------------------
-# We expose one additional input: voice name
-# Clients can set it via Stream.set_input(...) patterns described in the FastRTC API docs.
 stream = Stream(
     handler=GeminiLiveAudioHandler(),
     modality="audio",
     mode="send-receive",
-    additional_inputs=[
-        # Keep it simple: one string
-        # (FastRTC examples often use Gradio components here; in API mode we’ll set via set_input)
-        # We still define it so handler.wait_for_args() has something to wait on.
-        "voice_name"
-    ],
 )
 app = FastAPI()
-# Mount FastRTC endpoints onto FastAPI (this is the core feature).
 stream.mount(app)
-# ---------------------------
-# Optional: server-side outputs stream (SSE)
-# Works well for Scratch/JS clients that want text/meta without WebRTC.
-# FastRTC docs show using stream.output_stream(webrtc_id).
-# The talk-to-openai Space uses the same approach.
-# ---------------------------
 @app.get("/outputs")
 async def outputs(webrtc_id: str):
     async def event_stream():
         async for out in stream.output_stream(webrtc_id):
-            # out is an AdditionalOutputs instance
-            # Serialize it as SSE "output" events
             payload = json.dumps(out.args[0] if out.args else None)
             yield f"event: output\ndata: {payload}\n\n"
     return StreamingResponse(event_stream(), media_type="text/event-stream")
-@app.get("/health")
-async def health():
-    return {
-        "ok": True,
-        "provider": "gemini_live_audio",
-        "model": GEMINI_LIVE_MODEL,
-        "output_sample_rate": OUTPUT_SAMPLE_RATE,
-        "input_sample_rate": INPUT_SAMPLE_RATE,
-    }
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")))

 import asyncio
 import json
 import os
 from typing import AsyncGenerator, Literal
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
+load_dotenv()
+# Import gradio first so if it fails, it fails loudly & early
+import gradio as gr  # noqa
+from fastrtc import AdditionalOutputs, AsyncStreamHandler, Stream, wait_for_item
+from google import genai
+from google.genai.types import LiveConnectConfig, PrebuiltVoiceConfig, SpeechConfig, VoiceConfig
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
 GEMINI_LIVE_MODEL = os.getenv("GEMINI_LIVE_MODEL", "gemini-2.0-flash-exp")
 DEFAULT_VOICE = os.getenv("GEMINI_VOICE", "Puck")
 OUTPUT_SAMPLE_RATE = int(os.getenv("OUTPUT_SAMPLE_RATE", "24000"))
+INPUT_SAMPLE_RATE = int(os.getenv("INPUT_SAMPLE_RATE", "16000"))
 class GeminiLiveAudioHandler(AsyncStreamHandler):
+    def __init__(self, expected_layout: Literal["mono"] = "mono", output_sample_rate: int = OUTPUT_SAMPLE_RATE):
+        super().__init__(expected_layout=expected_layout, output_sample_rate=output_sample_rate, input_sample_rate=INPUT_SAMPLE_RATE)
         self.input_queue: asyncio.Queue[bytes] = asyncio.Queue()
         self.output_queue: asyncio.Queue[tuple[int, np.ndarray] | AdditionalOutputs] = asyncio.Queue()
         self.quit = asyncio.Event()
     def copy(self) -> "GeminiLiveAudioHandler":
+        return GeminiLiveAudioHandler(expected_layout="mono", output_sample_rate=self.output_sample_rate)
     async def start_up(self) -> None:
         await self.wait_for_args()
         voice_name = DEFAULT_VOICE
         try:
             if len(self.latest_args) >= 2 and isinstance(self.latest_args[1], str) and self.latest_args[1].strip():
         except Exception:
             pass
+        if not GEMINI_API_KEY:
+            await self.output_queue.put(AdditionalOutputs({"type": "error", "message": "Missing GEMINI_API_KEY on server."}))
             return
+        client = genai.Client(api_key=GEMINI_API_KEY, http_options={"api_version": "v1alpha"})
         config = LiveConnectConfig(
+            response_modalities=["AUDIO"],
             speech_config=SpeechConfig(
                 voice_config=VoiceConfig(
                     prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=voice_name)
         )
         async with client.aio.live.connect(model=GEMINI_LIVE_MODEL, config=config) as session:
             async for audio in session.start_stream(stream=self._stream_pcm(), mime_type="audio/pcm"):
+                if getattr(audio, "data", None):
                     arr = np.frombuffer(audio.data, dtype=np.int16)
                     self.output_queue.put_nowait((self.output_sample_rate, arr.reshape(1, -1)))
     async def _stream_pcm(self) -> AsyncGenerator[bytes, None]:
         while not self.quit.is_set():
             try:
                 chunk = await asyncio.wait_for(self.input_queue.get(), timeout=0.1)
                 yield chunk
+            except asyncio.TimeoutError:
                 pass
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         _, audio = frame
         audio = np.asarray(audio)
         if audio.ndim == 2:
             audio = audio.squeeze()
         if audio.dtype != np.int16:
             audio = audio.astype(np.int16)
         self.input_queue.put_nowait(audio.tobytes())
+    async def emit(self):
         return await wait_for_item(self.output_queue)
     async def shutdown(self) -> None:
         self.quit.set()
 stream = Stream(
     handler=GeminiLiveAudioHandler(),
     modality="audio",
     mode="send-receive",
+    additional_inputs=["voice_name"],
 )
 app = FastAPI()
 stream.mount(app)
+@app.get("/health")
+async def health():
+    return {"ok": True, "model": GEMINI_LIVE_MODEL}
+@app.get("/versions")
+async def versions():
+    import fastrtc
+    return {
+        "gradio": getattr(gr, "__version__", "unknown"),
+        "fastrtc": getattr(fastrtc, "__version__", "unknown"),
+        "python": f"{os.sys.version_info.major}.{os.sys.version_info.minor}.{os.sys.version_info.micro}",
+    }
 @app.get("/outputs")
 async def outputs(webrtc_id: str):
     async def event_stream():
         async for out in stream.output_stream(webrtc_id):
             payload = json.dumps(out.args[0] if out.args else None)
             yield f"event: output\ndata: {payload}\n\n"
     return StreamingResponse(event_stream(), media_type="text/event-stream")