Spaces:

IFMedTech
/

audio_chat

Paused

App Files Community

IFMedTechdemo committed on Dec 23, 2025

Commit

0d25cdd

verified ·

1 Parent(s): f40004d

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -104

app.py CHANGED Viewed

@@ -14,13 +14,10 @@ import numpy as np
 import cv2
 import websockets
 from PIL import Image
-from dotenv import load_dotenv
 from google import genai
 from fastrtc import AsyncAudioVideoStreamHandler, wait_for_item, WebRTCError, Stream, get_hf_turn_credentials
 import gradio as gr
-load_dotenv()
 # Encoder functions for Gemini API
 def encode_audio(data: np.ndarray) -> dict:
     """Encode audio data (int16 mono) for Gemini."""
@@ -64,13 +61,19 @@ class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
         """Initialize Gemini Live API session"""
         await self.wait_for_args()
-        # Get API key and system instruction from Gradio inputs
-        api_key = self.latest_args[1]  # From gr.Textbox
-        system_instruction = self.latest_args[2]  # From gr.Textbox
-        video_mode = self.latest_args[3]  # From gr.Radio
         if not api_key:
-            raise WebRTCError("Gemini API Key is required. Get one at https://aistudio.google.com/apikey")
         # Initialize Gemini client
         client = genai.Client(
@@ -110,99 +113,4 @@ class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
                             audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
                             self.audio_queue.put_nowait(audio)
                         if text := response.text:
-                            print(f"Gemini: {text}")
-                except websockets.exceptions.ConnectionClosedOK:
-                    print("Gemini session closed")
-                    break
-    # Video: receive frames from webcam/screen
-    async def video_receive(self, frame: np.ndarray):
-        """Process incoming video frames"""
-        self.video_queue.put_nowait(frame)
-        # Send frame to Gemini at ~1 FPS
-        video_mode = self.latest_args[3]
-        if video_mode != "none" and self.session and (time.time() - self.last_frame_time > 1.0):
-            self.last_frame_time = time.time()
-            await self.session.send(input=encode_image(frame))
-    async def video_emit(self) -> np.ndarray:
-        """Return video frames to display"""
-        frame = await wait_for_item(self.video_queue, 0.01)
-        if frame is not None:
-            return frame
-        # Fallback frame
-        return np.zeros((480, 640, 3), dtype=np.uint8)
-    # Audio: forward microphone audio to Gemini
-    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
-        """Process incoming audio from microphone"""
-        _, array = frame
-        array = array.squeeze()
-        audio_message = encode_audio(array)
-        if self.session:
-            await self.session.send(input=audio_message)
-    # Audio: emit Gemini's audio response
-    async def emit(self):
-        """Send Gemini's audio to speakers"""
-        array = await wait_for_item(self.audio_queue, 0.01)
-        if array is not None:
-            return (self.output_sample_rate, array)
-        return array
-    async def shutdown(self) -> None:
-        """Clean up session"""
-        if self.session:
-            self.quit.set()
-            await self.session.close()
-            self.quit.clear()
-# Create the Gradio application
-stream = Stream(
-    handler=GeminiLiveHandler(),
-    modality="audio-video",
-    mode="send-receive",
-    server_rtc_configuration=get_hf_turn_credentials(ttl=600*10000),
-    rtc_configuration=get_hf_turn_credentials(),
-    additional_inputs=[
-        gr.Markdown(
-            "## 🎙️ Gemini Live API - Real-Time Voice & Vision\n\n"
-            "Talk to Gemini with real-time audio and video streaming. "
-            "Get your API key at [Google AI Studio](https://aistudio.google.com/apikey).\n\n"
-            "**Features:** Voice conversation, webcam/screen sharing, low-latency responses"
-        ),
-        gr.Textbox(
-            label="Gemini API Key",
-            type="password",
-            placeholder="Enter your Gemini API key",
-        ),
-        gr.Textbox(
-            label="System Instruction",
-            value="You are a helpful AI assistant. Be conversational, engaging, and concise.",
-            lines=3,
-        ),
-        gr.Radio(
-            choices=["camera", "screen", "none"],
-            value="camera",
-            label="Video Mode",
-            info="camera: webcam feed | screen: screen capture | none: audio only"
-        ),
-    ],
-    ui_args={
-        "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
-        "pulse_color": "rgb(66, 133, 244)",
-        "icon_button_color": "rgb(66, 133, 244)",
-        "title": "Gemini Live - Voice & Vision",
-    },
-    time_limit=900,  # 15 minutes
-    concurrency_limit=10,
-)
-if __name__ == "__main__":
-    stream.ui.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=True,  # Creates public URL for sharing
-    )

 import cv2
 import websockets
 from PIL import Image
 from google import genai
 from fastrtc import AsyncAudioVideoStreamHandler, wait_for_item, WebRTCError, Stream, get_hf_turn_credentials
 import gradio as gr
 # Encoder functions for Gemini API
 def encode_audio(data: np.ndarray) -> dict:
     """Encode audio data (int16 mono) for Gemini."""
         """Initialize Gemini Live API session"""
         await self.wait_for_args()
+        # Get API key from HF Spaces secrets (environment variable)
+        api_key = os.environ.get("GEMINI_API_KEY")
         if not api_key:
+            raise WebRTCError(
+                "Gemini API Key not found. "
+                "Please add GEMINI_API_KEY in Space Settings > Variables and Secrets. "
+                "Get your key at https://aistudio.google.com/apikey"
+            )
+        # Get system instruction and video mode from Gradio inputs
+        system_instruction = self.latest_args[1]
+        video_mode = self.latest_args[2]
         # Initialize Gemini client
         client = genai.Client(
                             audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
                             self.audio_queue.put_nowait(audio)
                         if text := response.text:
+                            print(f