Spaces:

IFMedTech
/

audio_chat

Runtime error

App Files Files Community

IFMedTechdemo committed on Dec 23, 2025

Commit

f8857ac

verified ·

1 Parent(s): 08a0719

Update app.py

Browse files

Files changed (1) hide show

app.py +93 -1

app.py CHANGED Viewed

@@ -113,4 +113,96 @@ class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
                             audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
                             self.audio_queue.put_nowait(audio)
                         if text := response.text:
-                            print(f

                             audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
                             self.audio_queue.put_nowait(audio)
                         if text := response.text:
+                            print(f"Gemini: {text}")
+                except websockets.exceptions.ConnectionClosedOK:
+                    print("Gemini session closed")
+                    break
+    # Video: receive frames from webcam/screen
+    async def video_receive(self, frame: np.ndarray):
+        """Process incoming video frames"""
+        self.video_queue.put_nowait(frame)
+        # Send frame to Gemini at ~1 FPS
+        video_mode = self.latest_args[2]
+        if video_mode != "none" and self.session and (time.time() - self.last_frame_time > 1.0):
+            self.last_frame_time = time.time()
+            await self.session.send(input=encode_image(frame))
+    async def video_emit(self) -> np.ndarray:
+        """Return video frames to display"""
+        frame = await wait_for_item(self.video_queue, 0.01)
+        if frame is not None:
+            return frame
+        # Fallback frame
+        return np.zeros((480, 640, 3), dtype=np.uint8)
+    # Audio: forward microphone audio to Gemini
+    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
+        """Process incoming audio from microphone"""
+        _, array = frame
+        array = array.squeeze()
+        audio_message = encode_audio(array)
+        if self.session:
+            await self.session.send(input=audio_message)
+    # Audio: emit Gemini's audio response
+    async def emit(self):
+        """Send Gemini's audio to speakers"""
+        array = await wait_for_item(self.audio_queue, 0.01)
+        if array is not None:
+            return (self.output_sample_rate, array)
+        return array
+    async def shutdown(self) -> None:
+        """Clean up session"""
+        if self.session:
+            self.quit.set()
+            await self.session.close()
+            self.quit.clear()
+# Create the Gradio application
+stream = Stream(
+    handler=GeminiLiveHandler(),
+    modality="audio-video",
+    mode="send-receive",
+    server_rtc_configuration=get_hf_turn_credentials(ttl=600*10000),
+    rtc_configuration=get_hf_turn_credentials(),
+    additional_inputs=[
+        gr.Markdown(
+            "## 🎙️ Gemini Live API - Real-Time Voice & Vision\n\n"
+            "Talk to Gemini with real-time audio and video streaming.\n\n"
+            "**Features:** Voice conversation, webcam/screen sharing, low-latency responses, "
+            "interruption support, and natural voice interactions.\n\n"
+            "**Note:** API key is configured in Space settings (not visible to users)."
+        ),
+        gr.Textbox(
+            label="System Instruction",
+            value="You are a helpful AI assistant. Be conversational, engaging, and concise.",
+            lines=3,
+            info="Customize how Gemini should behave"
+        ),
+        gr.Radio(
+            choices=["camera", "screen", "none"],
+            value="camera",
+            label="Video Mode",
+            info="camera: webcam feed | screen: screen capture | none: audio only"
+        ),
+    ],
+    ui_args={
+        "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
+        "pulse_color": "rgb(66, 133, 244)",
+        "icon_button_color": "rgb(66, 133, 244)",
+        "title": "Gemini Live - Voice & Vision",
+    },
+    time_limit=900,  # 15 minutes
+    concurrency_limit=10,
+)
+if __name__ == "__main__":
+    stream.ui.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=True,
+    )