IFMedTechdemo committed on
Commit
f40004d
·
verified ·
1 Parent(s): 6dbfad3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +208 -0
app.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio Application for Gemini Live API with Audio + Video Streaming
3
+
4
+ Installation:
5
+ pip install "fastrtc[vad, tts]" gradio google-genai python-dotenv websockets pillow opencv-python numpy
6
+ """
7
+
8
+ import asyncio
9
+ import base64
10
+ import io
11
+ import os
12
+ import time
13
+ import numpy as np
14
+ import cv2
15
+ import websockets
16
+ from PIL import Image
17
+ from dotenv import load_dotenv
18
+ from google import genai
19
+ from fastrtc import AsyncAudioVideoStreamHandler, wait_for_item, WebRTCError, Stream, get_hf_turn_credentials
20
+ import gradio as gr
21
+
22
+ load_dotenv()
23
+
24
+ # Encoder functions for Gemini API
25
def encode_audio(data: np.ndarray) -> dict:
    """Package a mono int16 PCM buffer as a Gemini realtime audio part.

    Args:
        data: Raw PCM samples (int16, mono).

    Returns:
        A dict with ``mime_type`` and the base64-encoded PCM bytes, in the
        shape the Live API expects for streamed audio input.
    """
    pcm_bytes = data.tobytes()
    encoded = base64.b64encode(pcm_bytes).decode("UTF-8")
    return {"mime_type": "audio/pcm", "data": encoded}
31
+
32
def encode_image(data: np.ndarray) -> dict:
    """Encode a video frame as a base64 JPEG part for Gemini.

    The frame is downscaled to fit within 1024x1024 (aspect ratio preserved)
    to keep the streamed payload small.

    Args:
        data: Frame as an H x W (grayscale) or H x W x C array.
            NOTE(review): 3-channel frames are assumed BGR-ordered (OpenCV
            convention) — confirm against what fastrtc actually delivers.

    Returns:
        ``{"mime_type": "image/jpeg", "data": <base64 string>}``
    """
    # Convert assumed-BGR frames to RGB so PIL interprets colors correctly.
    if len(data.shape) == 3 and data.shape[2] == 3:
        data = cv2.cvtColor(data, cv2.COLOR_BGR2RGB)

    with io.BytesIO() as output_bytes:
        pil_image = Image.fromarray(data)
        pil_image.thumbnail((1024, 1024))
        # JPEG cannot store an alpha channel; convert e.g. RGBA/LA frames
        # instead of letting save() raise OSError.
        if pil_image.mode not in ("RGB", "L"):
            pil_image = pil_image.convert("RGB")
        pil_image.save(output_bytes, "JPEG")
        bytes_data = output_bytes.getvalue()
    base64_str = base64.b64encode(bytes_data).decode("utf-8")
    return {"mime_type": "image/jpeg", "data": base64_str}
45
+
46
+ # Main handler class
47
class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
    """Bridges a fastrtc audio/video stream to a Gemini Live API session.

    Audio flow: microphone chunks arrive via receive() and are forwarded to
    Gemini; Gemini's audio replies are queued by start_up()'s receive loop
    and drained by emit(). Video flow: webcam/screen frames arrive via
    video_receive(), are echoed back through video_emit(), and sampled to
    Gemini at roughly 1 FPS.
    """

    def __init__(self) -> None:
        super().__init__(
            expected_layout="mono",
            output_sample_rate=24000,
            input_sample_rate=16000,
        )
        # Gemini audio replies waiting to be played back (int16 (1, n) arrays).
        self.audio_queue = asyncio.Queue()
        # Incoming camera/screen frames waiting to be re-displayed.
        self.video_queue = asyncio.Queue()
        # Live API session; None until start_up() connects.
        self.session = None
        # Timestamp of the last frame sent to Gemini (for ~1 FPS throttling).
        self.last_frame_time = 0.0
        # Signals the start_up() receive loop to stop.
        self.quit = asyncio.Event()

    def copy(self) -> "GeminiLiveHandler":
        # fastrtc instantiates one handler per connection from this prototype.
        return GeminiLiveHandler()

    async def start_up(self):
        """Connect to the Gemini Live API and pump its responses.

        Waits for the Gradio inputs, opens the Live session, then loops
        receiving Gemini turns until shutdown() sets ``self.quit`` or the
        websocket closes normally.

        Raises:
            WebRTCError: if no API key was provided.
        """
        await self.wait_for_args()

        # Positional Gradio inputs: [1]=API key, [2]=system instruction,
        # [3]=video mode ("camera" | "screen" | "none", read in video_receive).
        api_key = self.latest_args[1]
        system_instruction = self.latest_args[2]

        if not api_key:
            raise WebRTCError("Gemini API Key is required. Get one at https://aistudio.google.com/apikey")

        # Initialize Gemini client
        client = genai.Client(
            api_key=api_key,
            http_options={"api_version": "v1beta"},
        )

        # Configure Gemini session
        config = {
            "response_modalities": ["AUDIO"],
            "system_instruction": system_instruction or "You are a helpful AI assistant. Be conversational and engaging.",
            "speech_config": {
                "voice_config": {
                    "prebuilt_voice_config": {"voice_name": "Zephyr"}
                }
            },
            # Compress older context so long conversations stay within limits.
            "context_window_compression": {
                "trigger_tokens": 25600,
                "sliding_window": {"target_tokens": 12800},
            },
        }

        # Start Live API session
        async with client.aio.live.connect(
            model="models/gemini-2.5-flash-native-audio-preview-12-2025",
            config=config,
        ) as session:
            self.session = session

            # Pump Gemini responses until shutdown or connection close.
            while not self.quit.is_set():
                turn = self.session.receive()
                try:
                    async for response in turn:
                        if data := response.data:
                            # Raw 24 kHz int16 PCM -> (1, n) array for playback.
                            audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
                            self.audio_queue.put_nowait(audio)
                        if text := response.text:
                            print(f"Gemini: {text}")
                except websockets.exceptions.ConnectionClosedOK:
                    print("Gemini session closed")
                    break

    async def video_receive(self, frame: np.ndarray):
        """Queue an incoming frame for display and sample it to Gemini."""
        self.video_queue.put_nowait(frame)

        # Send frames to Gemini at most ~1 FPS, and only when video is enabled.
        video_mode = self.latest_args[3]
        if video_mode != "none" and self.session and (time.time() - self.last_frame_time > 1.0):
            self.last_frame_time = time.time()
            await self.session.send(input=encode_image(frame))

    async def video_emit(self) -> np.ndarray:
        """Return the next frame to display, or a black frame if none yet."""
        frame = await wait_for_item(self.video_queue, 0.01)
        if frame is not None:
            return frame
        # Fallback: blank 640x480 frame until real frames arrive.
        return np.zeros((480, 640, 3), dtype=np.uint8)

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        """Forward a microphone chunk ``(sample_rate, samples)`` to Gemini."""
        _, array = frame
        array = array.squeeze()
        audio_message = encode_audio(array)

        if self.session:
            await self.session.send(input=audio_message)

    async def emit(self):
        """Return Gemini's next audio chunk as (sample_rate, array), or None."""
        array = await wait_for_item(self.audio_queue, 0.01)
        if array is not None:
            return (self.output_sample_rate, array)
        return None

    async def shutdown(self) -> None:
        """Stop the receive loop and close the Live API session."""
        # Signal quit unconditionally so start_up() stops even if the
        # session never finished connecting (original only signalled when a
        # session existed, which could leave the loop running).
        self.quit.set()
        if self.session:
            await self.session.close()
            # Drop the reference so receive()/video_receive() stop sending
            # to a closed session.
            self.session = None
        self.quit.clear()
161
+
162
# Create the Gradio application.
# UI components passed to the handler as positional latest_args:
# [1]=API key, [2]=system instruction, [3]=video mode.
_header = gr.Markdown(
    "## 🎙️ Gemini Live API - Real-Time Voice & Vision\n\n"
    "Talk to Gemini with real-time audio and video streaming. "
    "Get your API key at [Google AI Studio](https://aistudio.google.com/apikey).\n\n"
    "**Features:** Voice conversation, webcam/screen sharing, low-latency responses"
)
_api_key_input = gr.Textbox(
    label="Gemini API Key",
    type="password",
    placeholder="Enter your Gemini API key",
)
_system_instruction_input = gr.Textbox(
    label="System Instruction",
    value="You are a helpful AI assistant. Be conversational, engaging, and concise.",
    lines=3,
)
_video_mode_input = gr.Radio(
    choices=["camera", "screen", "none"],
    value="camera",
    label="Video Mode",
    info="camera: webcam feed | screen: screen capture | none: audio only",
)

stream = Stream(
    handler=GeminiLiveHandler(),
    modality="audio-video",
    mode="send-receive",
    server_rtc_configuration=get_hf_turn_credentials(ttl=600 * 10000),
    rtc_configuration=get_hf_turn_credentials(),
    additional_inputs=[
        _header,
        _api_key_input,
        _system_instruction_input,
        _video_mode_input,
    ],
    ui_args={
        "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
        "pulse_color": "rgb(66, 133, 244)",
        "icon_button_color": "rgb(66, 133, 244)",
        "title": "Gemini Live - Voice & Vision",
    },
    time_limit=900,  # 15-minute cap per session
    concurrency_limit=10,
)
202
+
203
+ if __name__ == "__main__":
204
+ stream.ui.launch(
205
+ server_name="0.0.0.0",
206
+ server_port=7860,
207
+ share=True, # Creates public URL for sharing
208
+ )