IFMedTechdemo committed on
Commit
719147e
·
verified ·
1 Parent(s): d469346

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -96
app.py CHANGED
@@ -1,8 +1,5 @@
1
  """
2
  Gradio Application for Gemini Live API with Audio + Video Streaming
3
-
4
- Installation:
5
- pip install "fastrtc[vad, tts]" gradio google-genai python-dotenv websockets pillow opencv-python numpy
6
  """
7
 
8
  import asyncio
@@ -28,7 +25,6 @@ def encode_audio(data: np.ndarray) -> dict:
28
 
29
  def encode_image(data: np.ndarray) -> dict:
30
  """Encode image data as JPEG for Gemini."""
31
- # Convert BGR to RGB if needed
32
  if len(data.shape) == 3 and data.shape[2] == 3:
33
  data = cv2.cvtColor(data, cv2.COLOR_BGR2RGB)
34
 
@@ -40,7 +36,6 @@ def encode_image(data: np.ndarray) -> dict:
40
  base64_str = str(base64.b64encode(bytes_data), "utf-8")
41
  return {"mime_type": "image/jpeg", "data": base64_str}
42
 
43
- # Main handler class
44
  class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
45
  def __init__(self) -> None:
46
  super().__init__(
@@ -61,108 +56,78 @@ class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
61
  """Initialize Gemini Live API session"""
62
  await self.wait_for_args()
63
 
64
- # Get API key from HF Spaces secrets (environment variable)
65
  api_key = os.environ.get("GEMINI_API_KEY")
66
-
67
  if not api_key:
68
- raise WebRTCError(
69
- "Gemini API Key not found. "
70
- "Please add GEMINI_API_KEY in Space Settings > Variables and Secrets. "
71
- "Get your key at https://aistudio.google.com/apikey"
72
- )
73
-
74
- # Get system instruction and video mode from Gradio inputs
75
  system_instruction = self.latest_args[1]
76
- video_mode = self.latest_args[2]
77
 
78
- # Initialize Gemini client
79
  client = genai.Client(
80
  api_key=api_key,
81
  http_options={"api_version": "v1beta"}
82
  )
83
-
84
- # Configure Gemini session
85
  config = {
86
  "response_modalities": ["AUDIO"],
87
- "system_instruction": system_instruction or "You are a helpful AI assistant. Be conversational and engaging.",
88
  "speech_config": {
89
- "voice_config": {
90
- "prebuilt_voice_config": {"voice_name": "Zephyr"}
91
- }
92
- },
93
- "context_window_compression": {
94
- "trigger_tokens": 25600,
95
- "sliding_window": {"target_tokens": 12800}
96
  }
97
  }
98
-
99
- # Start Live API session
100
  async with client.aio.live.connect(
101
- model="models/gemini-2.5-flash-native-audio-preview-12-2025",
102
  config=config,
103
  ) as session:
104
  self.session = session
105
 
106
- # Listen for responses from Gemini
107
- while not self.quit.is_set():
108
- turn = self.session.receive()
109
- try:
110
- async for response in turn:
111
- if data := response.data:
112
- # Convert audio bytes to numpy array
113
- audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
114
- self.audio_queue.put_nowait(audio)
115
- if text := response.text:
116
- print(f"Gemini: {text}")
117
- except websockets.exceptions.ConnectionClosedOK:
118
- print("Gemini session closed")
119
- break
120
-
121
- # Video: receive frames from webcam/screen
 
 
122
  async def video_receive(self, frame: np.ndarray):
123
- """Process incoming video frames"""
124
  self.video_queue.put_nowait(frame)
125
-
126
- # Send frame to Gemini at ~1 FPS
127
  video_mode = self.latest_args[2]
 
128
  if video_mode != "none" and self.session and (time.time() - self.last_frame_time > 1.0):
129
  self.last_frame_time = time.time()
130
  await self.session.send(input=encode_image(frame))
131
 
132
  async def video_emit(self) -> np.ndarray:
133
- """Return video frames to display"""
134
  frame = await wait_for_item(self.video_queue, 0.01)
135
- if frame is not None:
136
- return frame
137
- # Fallback frame
138
- return np.zeros((480, 640, 3), dtype=np.uint8)
139
 
140
- # Audio: forward microphone audio to Gemini
141
  async def receive(self, frame: tuple[int, np.ndarray]) -> None:
142
- """Process incoming audio from microphone"""
143
  _, array = frame
144
  array = array.squeeze()
145
- audio_message = encode_audio(array)
146
-
147
  if self.session:
148
- await self.session.send(input=audio_message)
149
 
150
- # Audio: emit Gemini's audio response
151
  async def emit(self):
152
- """Send Gemini's audio to speakers"""
153
  array = await wait_for_item(self.audio_queue, 0.01)
154
- if array is not None:
155
- return (self.output_sample_rate, array)
156
- return array
157
 
158
  async def shutdown(self) -> None:
159
- """Clean up session"""
160
  if self.session:
161
  self.quit.set()
162
  await self.session.close()
163
  self.quit.clear()
164
 
165
- # Create the Gradio application
166
  stream = Stream(
167
  handler=GeminiLiveHandler(),
168
  modality="audio-video",
@@ -170,39 +135,15 @@ stream = Stream(
170
  server_rtc_configuration=get_cloudflare_turn_credentials(),
171
  rtc_configuration=get_cloudflare_turn_credentials(),
172
  additional_inputs=[
173
- gr.Markdown(
174
- "## 🎙️ Gemini Live API - Real-Time Voice & Vision\n\n"
175
- "Talk to Gemini with real-time audio and video streaming.\n\n"
176
- "**Features:** Voice conversation, webcam/screen sharing, low-latency responses, "
177
- "interruption support, and natural voice interactions.\n\n"
178
- "**Note:** API key is configured in Space settings (not visible to users)."
179
- ),
180
- gr.Textbox(
181
- label="System Instruction",
182
- value="You are a helpful AI assistant. Be conversational, engaging, and concise.",
183
- lines=3,
184
- info="Customize how Gemini should behave"
185
- ),
186
- gr.Radio(
187
- choices=["camera", "screen", "none"],
188
- value="camera",
189
- label="Video Mode",
190
- info="camera: webcam feed | screen: screen capture | none: audio only"
191
- ),
192
  ],
193
  ui_args={
194
- "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
195
- "pulse_color": "rgb(66, 133, 244)",
196
- "icon_button_color": "rgb(66, 133, 244)",
197
- "title": "Gemini Live - Voice & Vision",
198
- },
199
- time_limit=900, # 15 minutes
200
- concurrency_limit=10,
201
  )
202
 
203
  if __name__ == "__main__":
204
- stream.ui.launch(
205
- server_name="0.0.0.0",
206
- server_port=7860,
207
- share=True,
208
- )
 
1
  """
2
  Gradio Application for Gemini Live API with Audio + Video Streaming
 
 
 
3
  """
4
 
5
  import asyncio
 
25
 
26
  def encode_image(data: np.ndarray) -> dict:
27
  """Encode image data as JPEG for Gemini."""
 
28
  if len(data.shape) == 3 and data.shape[2] == 3:
29
  data = cv2.cvtColor(data, cv2.COLOR_BGR2RGB)
30
 
 
36
  base64_str = str(base64.b64encode(bytes_data), "utf-8")
37
  return {"mime_type": "image/jpeg", "data": base64_str}
38
 
 
39
  class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
40
  def __init__(self) -> None:
41
  super().__init__(
 
56
  """Initialize Gemini Live API session"""
57
  await self.wait_for_args()
58
 
 
59
  api_key = os.environ.get("GEMINI_API_KEY")
 
60
  if not api_key:
61
+ raise WebRTCError("Gemini API Key not found in Space secrets.")
62
+
 
 
 
 
 
63
  system_instruction = self.latest_args[1]
 
64
 
 
65
  client = genai.Client(
66
  api_key=api_key,
67
  http_options={"api_version": "v1beta"}
68
  )
69
+
 
70
  config = {
71
  "response_modalities": ["AUDIO"],
72
+ "system_instruction": system_instruction or "You are a helpful assistant.",
73
  "speech_config": {
74
+ "voice_config": {"prebuilt_voice_config": {"voice_name": "Zephyr"}}
 
 
 
 
 
 
75
  }
76
  }
77
+
78
+ # Use the standard preview model gemini-2.0-flash-exp
79
  async with client.aio.live.connect(
80
+ model="gemini-2.0-flash-exp",
81
  config=config,
82
  ) as session:
83
  self.session = session
84
 
85
+ # --- ADDED: Initial greeting to make the bot speak first ---
86
+ await self.session.send(input="Hello! I'm connected and ready to help.", end_of_turn=True)
87
+
88
+ # Listen for responses continuously
89
+ try:
90
+ async for response in self.session.receive():
91
+ if self.quit.is_set():
92
+ break
93
+
94
+ if data := response.data:
95
+ audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
96
+ self.audio_queue.put_nowait(audio)
97
+
98
+ if text := response.text:
99
+ print(f"Gemini: {text}")
100
+ except Exception as e:
101
+ print(f"Session error: {e}")
102
+
103
  async def video_receive(self, frame: np.ndarray):
 
104
  self.video_queue.put_nowait(frame)
 
 
105
  video_mode = self.latest_args[2]
106
+
107
  if video_mode != "none" and self.session and (time.time() - self.last_frame_time > 1.0):
108
  self.last_frame_time = time.time()
109
  await self.session.send(input=encode_image(frame))
110
 
111
  async def video_emit(self) -> np.ndarray:
 
112
  frame = await wait_for_item(self.video_queue, 0.01)
113
+ return frame if frame is not None else np.zeros((480, 640, 3), dtype=np.uint8)
 
 
 
114
 
 
115
  async def receive(self, frame: tuple[int, np.ndarray]) -> None:
 
116
  _, array = frame
117
  array = array.squeeze()
 
 
118
  if self.session:
119
+ await self.session.send(input=encode_audio(array))
120
 
 
121
  async def emit(self):
 
122
  array = await wait_for_item(self.audio_queue, 0.01)
123
+ return (self.output_sample_rate, array) if array is not None else None
 
 
124
 
125
  async def shutdown(self) -> None:
 
126
  if self.session:
127
  self.quit.set()
128
  await self.session.close()
129
  self.quit.clear()
130
 
 
131
  stream = Stream(
132
  handler=GeminiLiveHandler(),
133
  modality="audio-video",
 
135
  server_rtc_configuration=get_cloudflare_turn_credentials(),
136
  rtc_configuration=get_cloudflare_turn_credentials(),
137
  additional_inputs=[
138
+ gr.Markdown("## 🎙️ Gemini Live - Real-Time Voice & Vision\n\nClick the **Connect/Start** button to begin."),
139
+ gr.Textbox(label="System Instruction", value="You are a helpful and concise AI assistant."),
140
+ gr.Radio(choices=["camera", "screen", "none"], value="camera", label="Video Mode"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  ],
142
  ui_args={
143
+ "title": "Gemini Live Assistant",
144
+ "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png"
145
+ }
 
 
 
 
146
  )
147
 
148
  if __name__ == "__main__":
149
+ stream.ui.launch(server_name="0.0.0.0", server_port=7860)