Spaces:

IFMedTechdemo
/

Audio_chat_bot

Sleeping

App Files Community

IFMedTechdemo committed on Dec 23, 2025

Commit

719147e

verified ·

1 Parent(s): d469346

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -96

app.py CHANGED Viewed

@@ -1,8 +1,5 @@
 """
 Gradio Application for Gemini Live API with Audio + Video Streaming
-Installation:
-pip install "fastrtc[vad, tts]" gradio google-genai python-dotenv websockets pillow opencv-python numpy
 """
 import asyncio
@@ -28,7 +25,6 @@ def encode_audio(data: np.ndarray) -> dict:
 def encode_image(data: np.ndarray) -> dict:
     """Encode image data as JPEG for Gemini."""
-    # Convert BGR to RGB if needed
     if len(data.shape) == 3 and data.shape[2] == 3:
         data = cv2.cvtColor(data, cv2.COLOR_BGR2RGB)
@@ -40,7 +36,6 @@ def encode_image(data: np.ndarray) -> dict:
         base64_str = str(base64.b64encode(bytes_data), "utf-8")
         return {"mime_type": "image/jpeg", "data": base64_str}
-# Main handler class
 class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
     def __init__(self) -> None:
         super().__init__(
@@ -61,108 +56,78 @@ class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
         """Initialize Gemini Live API session"""
         await self.wait_for_args()
-        # Get API key from HF Spaces secrets (environment variable)
         api_key = os.environ.get("GEMINI_API_KEY")
         if not api_key:
-            raise WebRTCError(
-                "Gemini API Key not found. "
-                "Please add GEMINI_API_KEY in Space Settings > Variables and Secrets. "
-                "Get your key at https://aistudio.google.com/apikey"
-            )
-        # Get system instruction and video mode from Gradio inputs
         system_instruction = self.latest_args[1]
-        video_mode = self.latest_args[2]
-        # Initialize Gemini client
         client = genai.Client(
             api_key=api_key,
             http_options={"api_version": "v1beta"}
         )
-        # Configure Gemini session
         config = {
             "response_modalities": ["AUDIO"],
-            "system_instruction": system_instruction or "You are a helpful AI assistant. Be conversational and engaging.",
             "speech_config": {
-                "voice_config": {
-                    "prebuilt_voice_config": {"voice_name": "Zephyr"}
-                }
-            },
-            "context_window_compression": {
-                "trigger_tokens": 25600,
-                "sliding_window": {"target_tokens": 12800}
             }
         }
-        # Start Live API session
         async with client.aio.live.connect(
-            model="models/gemini-2.5-flash-native-audio-preview-12-2025",
             config=config,
         ) as session:
             self.session = session
-            # Listen for responses from Gemini
-            while not self.quit.is_set():
-                turn = self.session.receive()
-                try:
-                    async for response in turn:
-                        if data := response.data:
-                            # Convert audio bytes to numpy array
-                            audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
-                            self.audio_queue.put_nowait(audio)
-                        if text := response.text:
-                            print(f"Gemini: {text}")
-                except websockets.exceptions.ConnectionClosedOK:
-                    print("Gemini session closed")
-                    break
-    # Video: receive frames from webcam/screen
     async def video_receive(self, frame: np.ndarray):
-        """Process incoming video frames"""
         self.video_queue.put_nowait(frame)
-        # Send frame to Gemini at ~1 FPS
         video_mode = self.latest_args[2]
         if video_mode != "none" and self.session and (time.time() - self.last_frame_time > 1.0):
             self.last_frame_time = time.time()
             await self.session.send(input=encode_image(frame))
     async def video_emit(self) -> np.ndarray:
-        """Return video frames to display"""
         frame = await wait_for_item(self.video_queue, 0.01)
-        if frame is not None:
-            return frame
-        # Fallback frame
-        return np.zeros((480, 640, 3), dtype=np.uint8)
-    # Audio: forward microphone audio to Gemini
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
-        """Process incoming audio from microphone"""
         _, array = frame
         array = array.squeeze()
-        audio_message = encode_audio(array)
         if self.session:
-            await self.session.send(input=audio_message)
-    # Audio: emit Gemini's audio response
     async def emit(self):
-        """Send Gemini's audio to speakers"""
         array = await wait_for_item(self.audio_queue, 0.01)
-        if array is not None:
-            return (self.output_sample_rate, array)
-        return array
     async def shutdown(self) -> None:
-        """Clean up session"""
         if self.session:
             self.quit.set()
             await self.session.close()
             self.quit.clear()
-# Create the Gradio application
 stream = Stream(
     handler=GeminiLiveHandler(),
     modality="audio-video",
@@ -170,39 +135,15 @@ stream = Stream(
     server_rtc_configuration=get_cloudflare_turn_credentials(),
     rtc_configuration=get_cloudflare_turn_credentials(),
     additional_inputs=[
-        gr.Markdown(
-            "## 🎙️ Gemini Live API - Real-Time Voice & Vision\n\n"
-            "Talk to Gemini with real-time audio and video streaming.\n\n"
-            "**Features:** Voice conversation, webcam/screen sharing, low-latency responses, "
-            "interruption support, and natural voice interactions.\n\n"
-            "**Note:** API key is configured in Space settings (not visible to users)."
-        ),
-        gr.Textbox(
-            label="System Instruction",
-            value="You are a helpful AI assistant. Be conversational, engaging, and concise.",
-            lines=3,
-            info="Customize how Gemini should behave"
-        ),
-        gr.Radio(
-            choices=["camera", "screen", "none"],
-            value="camera",
-            label="Video Mode",
-            info="camera: webcam feed | screen: screen capture | none: audio only"
-        ),
     ],
     ui_args={
-        "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
-        "pulse_color": "rgb(66, 133, 244)",
-        "icon_button_color": "rgb(66, 133, 244)",
-        "title": "Gemini Live - Voice & Vision",
-    },
-    time_limit=900,  # 15 minutes
-    concurrency_limit=10,
 )
 if __name__ == "__main__":
-    stream.ui.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=True,
-    )

 """
 Gradio Application for Gemini Live API with Audio + Video Streaming
 """
 import asyncio
 def encode_image(data: np.ndarray) -> dict:
     """Encode image data as JPEG for Gemini."""
     if len(data.shape) == 3 and data.shape[2] == 3:
         data = cv2.cvtColor(data, cv2.COLOR_BGR2RGB)
         base64_str = str(base64.b64encode(bytes_data), "utf-8")
         return {"mime_type": "image/jpeg", "data": base64_str}
 class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
     def __init__(self) -> None:
         super().__init__(
         """Initialize Gemini Live API session"""
         await self.wait_for_args()
         api_key = os.environ.get("GEMINI_API_KEY")
         if not api_key:
+            raise WebRTCError("Gemini API Key not found in Space secrets.")
         system_instruction = self.latest_args[1]
         client = genai.Client(
             api_key=api_key,
             http_options={"api_version": "v1beta"}
         )
         config = {
             "response_modalities": ["AUDIO"],
+            "system_instruction": system_instruction or "You are a helpful assistant.",
             "speech_config": {
+                "voice_config": {"prebuilt_voice_config": {"voice_name": "Zephyr"}}
             }
         }
+        # Use the standard preview model gemini-2.0-flash-exp
         async with client.aio.live.connect(
+            model="gemini-2.0-flash-exp",
             config=config,
         ) as session:
             self.session = session
+            # --- ADDED: Initial greeting to make the bot speak first ---
+            await self.session.send(input="Hello! I'm connected and ready to help.", end_of_turn=True)
+            # Listen for responses continuously
+            try:
+                async for response in self.session.receive():
+                    if self.quit.is_set():
+                        break
+                    if data := response.data:
+                        audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
+                        self.audio_queue.put_nowait(audio)
+                    if text := response.text:
+                        print(f"Gemini: {text}")
+            except Exception as e:
+                print(f"Session error: {e}")
     async def video_receive(self, frame: np.ndarray):
         self.video_queue.put_nowait(frame)
         video_mode = self.latest_args[2]
         if video_mode != "none" and self.session and (time.time() - self.last_frame_time > 1.0):
             self.last_frame_time = time.time()
             await self.session.send(input=encode_image(frame))
     async def video_emit(self) -> np.ndarray:
         frame = await wait_for_item(self.video_queue, 0.01)
+        return frame if frame is not None else np.zeros((480, 640, 3), dtype=np.uint8)
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         _, array = frame
         array = array.squeeze()
         if self.session:
+            await self.session.send(input=encode_audio(array))
     async def emit(self):
         array = await wait_for_item(self.audio_queue, 0.01)
+        return (self.output_sample_rate, array) if array is not None else None
     async def shutdown(self) -> None:
         if self.session:
             self.quit.set()
             await self.session.close()
             self.quit.clear()
 stream = Stream(
     handler=GeminiLiveHandler(),
     modality="audio-video",
     server_rtc_configuration=get_cloudflare_turn_credentials(),
     rtc_configuration=get_cloudflare_turn_credentials(),
     additional_inputs=[
+        gr.Markdown("## 🎙️ Gemini Live - Real-Time Voice & Vision\n\nClick the **Connect/Start** button to begin."),
+        gr.Textbox(label="System Instruction", value="You are a helpful and concise AI assistant."),
+        gr.Radio(choices=["camera", "screen", "none"], value="camera", label="Video Mode"),
     ],
     ui_args={
+        "title": "Gemini Live Assistant",
+        "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png"
+    }
 )
 if __name__ == "__main__":
+    stream.ui.launch(server_name="0.0.0.0", server_port=7860)