IFMedTechdemo committed on
Commit
0d25cdd
·
verified ·
1 Parent(s): f40004d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -104
app.py CHANGED
@@ -14,13 +14,10 @@ import numpy as np
14
  import cv2
15
  import websockets
16
  from PIL import Image
17
- from dotenv import load_dotenv
18
  from google import genai
19
  from fastrtc import AsyncAudioVideoStreamHandler, wait_for_item, WebRTCError, Stream, get_hf_turn_credentials
20
  import gradio as gr
21
 
22
- load_dotenv()
23
-
24
  # Encoder functions for Gemini API
25
  def encode_audio(data: np.ndarray) -> dict:
26
  """Encode audio data (int16 mono) for Gemini."""
@@ -64,13 +61,19 @@ class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
64
  """Initialize Gemini Live API session"""
65
  await self.wait_for_args()
66
 
67
- # Get API key and system instruction from Gradio inputs
68
- api_key = self.latest_args[1] # From gr.Textbox
69
- system_instruction = self.latest_args[2] # From gr.Textbox
70
- video_mode = self.latest_args[3] # From gr.Radio
71
 
72
  if not api_key:
73
- raise WebRTCError("Gemini API Key is required. Get one at https://aistudio.google.com/apikey")
 
 
 
 
 
 
 
 
74
 
75
  # Initialize Gemini client
76
  client = genai.Client(
@@ -110,99 +113,4 @@ class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
110
  audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
111
  self.audio_queue.put_nowait(audio)
112
  if text := response.text:
113
- print(f"Gemini: {text}")
114
- except websockets.exceptions.ConnectionClosedOK:
115
- print("Gemini session closed")
116
- break
117
-
118
- # Video: receive frames from webcam/screen
119
- async def video_receive(self, frame: np.ndarray):
120
- """Process incoming video frames"""
121
- self.video_queue.put_nowait(frame)
122
-
123
- # Send frame to Gemini at ~1 FPS
124
- video_mode = self.latest_args[3]
125
- if video_mode != "none" and self.session and (time.time() - self.last_frame_time > 1.0):
126
- self.last_frame_time = time.time()
127
- await self.session.send(input=encode_image(frame))
128
-
129
- async def video_emit(self) -> np.ndarray:
130
- """Return video frames to display"""
131
- frame = await wait_for_item(self.video_queue, 0.01)
132
- if frame is not None:
133
- return frame
134
- # Fallback frame
135
- return np.zeros((480, 640, 3), dtype=np.uint8)
136
-
137
- # Audio: forward microphone audio to Gemini
138
- async def receive(self, frame: tuple[int, np.ndarray]) -> None:
139
- """Process incoming audio from microphone"""
140
- _, array = frame
141
- array = array.squeeze()
142
- audio_message = encode_audio(array)
143
-
144
- if self.session:
145
- await self.session.send(input=audio_message)
146
-
147
- # Audio: emit Gemini's audio response
148
- async def emit(self):
149
- """Send Gemini's audio to speakers"""
150
- array = await wait_for_item(self.audio_queue, 0.01)
151
- if array is not None:
152
- return (self.output_sample_rate, array)
153
- return array
154
-
155
- async def shutdown(self) -> None:
156
- """Clean up session"""
157
- if self.session:
158
- self.quit.set()
159
- await self.session.close()
160
- self.quit.clear()
161
-
162
- # Create the Gradio application
163
- stream = Stream(
164
- handler=GeminiLiveHandler(),
165
- modality="audio-video",
166
- mode="send-receive",
167
- server_rtc_configuration=get_hf_turn_credentials(ttl=600*10000),
168
- rtc_configuration=get_hf_turn_credentials(),
169
- additional_inputs=[
170
- gr.Markdown(
171
- "## 🎙️ Gemini Live API - Real-Time Voice & Vision\n\n"
172
- "Talk to Gemini with real-time audio and video streaming. "
173
- "Get your API key at [Google AI Studio](https://aistudio.google.com/apikey).\n\n"
174
- "**Features:** Voice conversation, webcam/screen sharing, low-latency responses"
175
- ),
176
- gr.Textbox(
177
- label="Gemini API Key",
178
- type="password",
179
- placeholder="Enter your Gemini API key",
180
- ),
181
- gr.Textbox(
182
- label="System Instruction",
183
- value="You are a helpful AI assistant. Be conversational, engaging, and concise.",
184
- lines=3,
185
- ),
186
- gr.Radio(
187
- choices=["camera", "screen", "none"],
188
- value="camera",
189
- label="Video Mode",
190
- info="camera: webcam feed | screen: screen capture | none: audio only"
191
- ),
192
- ],
193
- ui_args={
194
- "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
195
- "pulse_color": "rgb(66, 133, 244)",
196
- "icon_button_color": "rgb(66, 133, 244)",
197
- "title": "Gemini Live - Voice & Vision",
198
- },
199
- time_limit=900, # 15 minutes
200
- concurrency_limit=10,
201
- )
202
-
203
- if __name__ == "__main__":
204
- stream.ui.launch(
205
- server_name="0.0.0.0",
206
- server_port=7860,
207
- share=True, # Creates public URL for sharing
208
- )
 
14
  import cv2
15
  import websockets
16
  from PIL import Image
 
17
  from google import genai
18
  from fastrtc import AsyncAudioVideoStreamHandler, wait_for_item, WebRTCError, Stream, get_hf_turn_credentials
19
  import gradio as gr
20
 
 
 
21
  # Encoder functions for Gemini API
22
  def encode_audio(data: np.ndarray) -> dict:
23
  """Encode audio data (int16 mono) for Gemini."""
 
61
  """Initialize Gemini Live API session"""
62
  await self.wait_for_args()
63
 
64
+ # Get API key from HF Spaces secrets (environment variable)
65
+ api_key = os.environ.get("GEMINI_API_KEY")
 
 
66
 
67
  if not api_key:
68
+ raise WebRTCError(
69
+ "Gemini API Key not found. "
70
+ "Please add GEMINI_API_KEY in Space Settings > Variables and Secrets. "
71
+ "Get your key at https://aistudio.google.com/apikey"
72
+ )
73
+
74
+ # Get system instruction and video mode from Gradio inputs
75
+ system_instruction = self.latest_args[1]
76
+ video_mode = self.latest_args[2]
77
 
78
  # Initialize Gemini client
79
  client = genai.Client(
 
113
  audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
114
  self.audio_queue.put_nowait(audio)
115
  if text := response.text:
116
+ print(f