IFMedTechdemo committed on
Commit
f8857ac
·
verified ·
1 Parent(s): 08a0719

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -1
app.py CHANGED
@@ -113,4 +113,96 @@ class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
113
  audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
114
  self.audio_queue.put_nowait(audio)
115
  if text := response.text:
116
- print(f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
114
  self.audio_queue.put_nowait(audio)
115
  if text := response.text:
116
+ print(f"Gemini: {text}")
117
+ except websockets.exceptions.ConnectionClosedOK:
118
+ print("Gemini session closed")
119
+ break
120
+
121
+ # Video: receive frames from webcam/screen
122
async def video_receive(self, frame: np.ndarray):
    """Queue an incoming video frame and occasionally forward it to Gemini.

    Every frame is pushed onto ``self.video_queue``. A frame is additionally
    encoded and sent over ``self.session`` when the video mode is not
    ``"none"``, a session exists, and more than one second has passed since
    ``self.last_frame_time``.

    Args:
        frame: The video frame received from the client.
    """
    self.video_queue.put_nowait(frame)

    # NOTE(review): index 2 is assumed to be the "Video Mode" input; confirm
    # against the order of the stream's additional inputs.
    if self.latest_args[2] == "none":
        return
    if not self.session:
        return

    # Throttle uploads: at most one frame per second goes to the session.
    elapsed = time.time() - self.last_frame_time
    if not (elapsed > 1.0):
        return

    self.last_frame_time = time.time()
    encoded = encode_image(frame)
    await self.session.send(input=encoded)
131
+
132
async def video_emit(self) -> np.ndarray:
    """Return the next frame to display.

    Waits on ``self.video_queue`` via ``wait_for_item`` (called with 0.01).
    When that yields ``None``, an all-zero 480x640x3 ``uint8`` frame is
    returned instead.
    """
    queued = await wait_for_item(self.video_queue, 0.01)
    if queued is None:
        # Nothing queued: hand back a black placeholder frame.
        return np.zeros((480, 640, 3), dtype=np.uint8)
    return queued
139
+
140
+ # Audio: forward microphone audio to Gemini
141
async def receive(self, frame: tuple[int, np.ndarray]) -> None:
    """Forward one microphone audio frame to the Gemini session.

    Args:
        frame: A two-item tuple; the first item is ignored (presumably the
            sample rate) and the second is the sample array, which is
            squeezed, encoded with ``encode_audio`` and sent.

    Returns:
        None. The frame is dropped when no session is active.
    """
    # Check for a session first so frames that would be dropped anyway are
    # not squeezed and encoded for nothing.
    if not self.session:
        return

    _, samples = frame
    audio_message = encode_audio(samples.squeeze())
    await self.session.send(input=audio_message)
149
+
150
+ # Audio: emit Gemini's audio response
151
async def emit(self):
    """Return the next chunk of Gemini audio for playback.

    Waits on ``self.audio_queue`` via ``wait_for_item`` (called with 0.01).

    Returns:
        ``(self.output_sample_rate, chunk)`` when a chunk was obtained,
        otherwise ``None``.
    """
    chunk = await wait_for_item(self.audio_queue, 0.01)
    if chunk is None:
        return None
    return (self.output_sample_rate, chunk)
157
+
158
async def shutdown(self) -> None:
    """Close the active Gemini session, if there is one.

    Sets ``self.quit`` before closing the session and clears it afterwards.
    Does nothing when ``self.session`` is falsy.

    Raises:
        Whatever ``self.session.close()`` raises; ``self.quit`` is cleared
        before the exception propagates.
    """
    if not self.session:
        return

    self.quit.set()
    try:
        await self.session.close()
    finally:
        # Always reset the flag: if close() fails, a quit event left set
        # would leak into any later use of this handler.
        self.quit.clear()
164
+
165
# Static text and styling for the UI, kept apart from the Stream wiring.
_INTRO_MARKDOWN = (
    "## 🎙️ Gemini Live API - Real-Time Voice & Vision\n\n"
    "Talk to Gemini with real-time audio and video streaming.\n\n"
    "**Features:** Voice conversation, webcam/screen sharing, low-latency responses, "
    "interruption support, and natural voice interactions.\n\n"
    "**Note:** API key is configured in Space settings (not visible to users)."
)

_UI_ARGS = {
    "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
    "pulse_color": "rgb(66, 133, 244)",
    "icon_button_color": "rgb(66, 133, 244)",
    "title": "Gemini Live - Voice & Vision",
}

# Build the Gradio application: a send-receive audio/video stream backed by
# GeminiLiveHandler.
# NOTE(review): both credential calls run once, when this module is imported;
# confirm the returned TURN credentials stay valid for the app's lifetime.
# NOTE(review): the Markdown block is listed as an input, so it may occupy a
# slot in the handler's ``latest_args``; verify the indexes used there.
stream = Stream(
    handler=GeminiLiveHandler(),
    modality="audio-video",
    mode="send-receive",
    server_rtc_configuration=get_hf_turn_credentials(ttl=600 * 10000),
    rtc_configuration=get_hf_turn_credentials(),
    additional_inputs=[
        gr.Markdown(_INTRO_MARKDOWN),
        gr.Textbox(
            label="System Instruction",
            value="You are a helpful AI assistant. Be conversational, engaging, and concise.",
            lines=3,
            info="Customize how Gemini should behave",
        ),
        gr.Radio(
            choices=["camera", "screen", "none"],
            value="camera",
            label="Video Mode",
            info="camera: webcam feed | screen: screen capture | none: audio only",
        ),
    ],
    ui_args=_UI_ARGS,
    time_limit=900,  # 15 minutes
    concurrency_limit=10,
)

if __name__ == "__main__":
    # Listen on all interfaces, port 7860, with a Gradio share link requested.
    stream.ui.launch(server_name="0.0.0.0", server_port=7860, share=True)