IFMedTechdemo committed on
Commit
f40004d
·
verified ·
1 Parent(s): 6dbfad3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +208 -0
app.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio Application for Gemini Live API with Audio + Video Streaming
3
+
4
+ Installation:
5
+ pip install "fastrtc[vad, tts]" gradio google-genai python-dotenv websockets pillow opencv-python numpy
6
+ """
7
+
8
+ import asyncio
9
+ import base64
10
+ import io
11
+ import os
12
+ import time
13
+ import numpy as np
14
+ import cv2
15
+ import websockets
16
+ from PIL import Image
17
+ from dotenv import load_dotenv
18
+ from google import genai
19
+ from fastrtc import AsyncAudioVideoStreamHandler, wait_for_item, WebRTCError, Stream, get_hf_turn_credentials
20
+ import gradio as gr
21
+
22
+ load_dotenv()
23
+
24
+ # Encoder functions for Gemini API
25
def encode_audio(data: np.ndarray) -> dict:
    """Package a mono int16 PCM buffer as a Gemini realtime audio part.

    Args:
        data: Raw PCM samples (int16, mono).

    Returns:
        A dict with ``mime_type`` and the base64-encoded PCM bytes, in the
        shape the Live API expects for streamed audio input.
    """
    pcm_bytes = data.tobytes()
    encoded = base64.b64encode(pcm_bytes).decode("UTF-8")
    return {"mime_type": "audio/pcm", "data": encoded}
31
+
32
def encode_image(data: np.ndarray) -> dict:
    """Encode a video frame as a base64 JPEG part for Gemini.

    The frame is downscaled to fit within 1024x1024 (aspect ratio preserved)
    to keep the streamed payload small.

    Args:
        data: Frame as an H x W (grayscale) or H x W x C array.
            NOTE(review): 3-channel frames are assumed BGR-ordered (OpenCV
            convention) — confirm against what fastrtc actually delivers.

    Returns:
        ``{"mime_type": "image/jpeg", "data": <base64 string>}``
    """
    # Convert assumed-BGR frames to RGB so PIL interprets colors correctly.
    if len(data.shape) == 3 and data.shape[2] == 3:
        data = cv2.cvtColor(data, cv2.COLOR_BGR2RGB)

    with io.BytesIO() as output_bytes:
        pil_image = Image.fromarray(data)
        pil_image.thumbnail((1024, 1024))
        # JPEG cannot store an alpha channel; convert e.g. RGBA/LA frames
        # instead of letting save() raise OSError.
        if pil_image.mode not in ("RGB", "L"):
            pil_image = pil_image.convert("RGB")
        pil_image.save(output_bytes, "JPEG")
        bytes_data = output_bytes.getvalue()
    base64_str = base64.b64encode(bytes_data).decode("utf-8")
    return {"mime_type": "image/jpeg", "data": base64_str}
45
+
46
+ # Main handler class
47
class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
    """Bridges a fastrtc audio/video stream to a Gemini Live API session.

    Audio flow: microphone chunks arrive via receive() and are forwarded to
    Gemini; Gemini's audio replies are queued by start_up()'s receive loop
    and drained by emit(). Video flow: webcam/screen frames arrive via
    video_receive(), are echoed back through video_emit(), and sampled to
    Gemini at roughly 1 FPS.
    """

    def __init__(self) -> None:
        super().__init__(
            expected_layout="mono",
            output_sample_rate=24000,
            input_sample_rate=16000,
        )
        # Gemini audio replies waiting to be played back (int16 (1, n) arrays).
        self.audio_queue = asyncio.Queue()
        # Incoming camera/screen frames waiting to be re-displayed.
        self.video_queue = asyncio.Queue()
        # Live API session; None until start_up() connects.
        self.session = None
        # Timestamp of the last frame sent to Gemini (for ~1 FPS throttling).
        self.last_frame_time = 0.0
        # Signals the start_up() receive loop to stop.
        self.quit = asyncio.Event()

    def copy(self) -> "GeminiLiveHandler":
        # fastrtc instantiates one handler per connection from this prototype.
        return GeminiLiveHandler()

    async def start_up(self):
        """Connect to the Gemini Live API and pump its responses.

        Waits for the Gradio inputs, opens the Live session, then loops
        receiving Gemini turns until shutdown() sets ``self.quit`` or the
        websocket closes normally.

        Raises:
            WebRTCError: if no API key was provided.
        """
        await self.wait_for_args()

        # Positional Gradio inputs: [1]=API key, [2]=system instruction,
        # [3]=video mode ("camera" | "screen" | "none", read in video_receive).
        api_key = self.latest_args[1]
        system_instruction = self.latest_args[2]

        if not api_key:
            raise WebRTCError("Gemini API Key is required. Get one at https://aistudio.google.com/apikey")

        # Initialize Gemini client
        client = genai.Client(
            api_key=api_key,
            http_options={"api_version": "v1beta"},
        )

        # Configure Gemini session
        config = {
            "response_modalities": ["AUDIO"],
            "system_instruction": system_instruction or "You are a helpful AI assistant. Be conversational and engaging.",
            "speech_config": {
                "voice_config": {
                    "prebuilt_voice_config": {"voice_name": "Zephyr"}
                }
            },
            # Compress older context so long conversations stay within limits.
            "context_window_compression": {
                "trigger_tokens": 25600,
                "sliding_window": {"target_tokens": 12800},
            },
        }

        # Start Live API session
        async with client.aio.live.connect(
            model="models/gemini-2.5-flash-native-audio-preview-12-2025",
            config=config,
        ) as session:
            self.session = session

            # Pump Gemini responses until shutdown or connection close.
            while not self.quit.is_set():
                turn = self.session.receive()
                try:
                    async for response in turn:
                        if data := response.data:
                            # Raw 24 kHz int16 PCM -> (1, n) array for playback.
                            audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
                            self.audio_queue.put_nowait(audio)
                        if text := response.text:
                            print(f"Gemini: {text}")
                except websockets.exceptions.ConnectionClosedOK:
                    print("Gemini session closed")
                    break

    async def video_receive(self, frame: np.ndarray):
        """Queue an incoming frame for display and sample it to Gemini."""
        self.video_queue.put_nowait(frame)

        # Send frames to Gemini at most ~1 FPS, and only when video is enabled.
        video_mode = self.latest_args[3]
        if video_mode != "none" and self.session and (time.time() - self.last_frame_time > 1.0):
            self.last_frame_time = time.time()
            await self.session.send(input=encode_image(frame))

    async def video_emit(self) -> np.ndarray:
        """Return the next frame to display, or a black frame if none yet."""
        frame = await wait_for_item(self.video_queue, 0.01)
        if frame is not None:
            return frame
        # Fallback: blank 640x480 frame until real frames arrive.
        return np.zeros((480, 640, 3), dtype=np.uint8)

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        """Forward a microphone chunk ``(sample_rate, samples)`` to Gemini."""
        _, array = frame
        array = array.squeeze()
        audio_message = encode_audio(array)

        if self.session:
            await self.session.send(input=audio_message)

    async def emit(self):
        """Return Gemini's next audio chunk as (sample_rate, array), or None."""
        array = await wait_for_item(self.audio_queue, 0.01)
        if array is not None:
            return (self.output_sample_rate, array)
        return None

    async def shutdown(self) -> None:
        """Stop the receive loop and close the Live API session."""
        # Signal quit unconditionally so start_up() stops even if the
        # session never finished connecting (original only signalled when a
        # session existed, which could leave the loop running).
        self.quit.set()
        if self.session:
            await self.session.close()
            # Drop the reference so receive()/video_receive() stop sending
            # to a closed session.
            self.session = None
        self.quit.clear()
161
+
162
# Create the Gradio application.
# UI components passed to the handler as positional latest_args:
# [1]=API key, [2]=system instruction, [3]=video mode.
_header = gr.Markdown(
    "## 🎙️ Gemini Live API - Real-Time Voice & Vision\n\n"
    "Talk to Gemini with real-time audio and video streaming. "
    "Get your API key at [Google AI Studio](https://aistudio.google.com/apikey).\n\n"
    "**Features:** Voice conversation, webcam/screen sharing, low-latency responses"
)
_api_key_input = gr.Textbox(
    label="Gemini API Key",
    type="password",
    placeholder="Enter your Gemini API key",
)
_system_instruction_input = gr.Textbox(
    label="System Instruction",
    value="You are a helpful AI assistant. Be conversational, engaging, and concise.",
    lines=3,
)
_video_mode_input = gr.Radio(
    choices=["camera", "screen", "none"],
    value="camera",
    label="Video Mode",
    info="camera: webcam feed | screen: screen capture | none: audio only",
)

stream = Stream(
    handler=GeminiLiveHandler(),
    modality="audio-video",
    mode="send-receive",
    server_rtc_configuration=get_hf_turn_credentials(ttl=600 * 10000),
    rtc_configuration=get_hf_turn_credentials(),
    additional_inputs=[
        _header,
        _api_key_input,
        _system_instruction_input,
        _video_mode_input,
    ],
    ui_args={
        "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
        "pulse_color": "rgb(66, 133, 244)",
        "icon_button_color": "rgb(66, 133, 244)",
        "title": "Gemini Live - Voice & Vision",
    },
    time_limit=900,  # 15-minute cap per session
    concurrency_limit=10,
)
202
+
203
+ if __name__ == "__main__":
204
+ stream.ui.launch(
205
+ server_name="0.0.0.0",
206
+ server_port=7860,
207
+ share=True, # Creates public URL for sharing
208
+ )