arthrod commited on
Commit
a83e06b
Β·
verified Β·
1 Parent(s): 23bc8aa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +340 -910
app.py CHANGED
@@ -1,572 +1,193 @@
1
- # Environment variable for API key
2
-
3
- """Real-Time Screen Assistant - Premium Edition with Complete Frontend Integration
4
-
5
- This is the PREMIUM, BEST WORKING version with comprehensive real-time handlers:
6
- 1. Continuous audio flow from user β†’ model
7
- 2. Model audio output β†’ user
8
- 3. Screen data streaming β†’ model
9
- 4. Text responses from system β†’ user
10
-
11
- Features:
12
- - Google GenAI Live API integration with enhanced configuration
13
- - Real-time audio/video streaming via FastRTC
14
- - Voice activity detection with intelligent filtering
15
- - Continuous screen capture with adaptive throttling
16
- - AI response delivery system (audio + text)
17
- - Background task management with proper cleanup
18
- - Enhanced error handling and recovery
19
- - 300s timeout for real-time behavior
20
- """
21
-
22
- import asyncio
23
  import os
24
- import time
25
- import sys
26
- from collections import deque
27
-
28
- import cv2
29
- import gradio as gr
30
- import numpy as np
31
- import numpy.typing as npt
32
- from fastrtc import AsyncAudioVideoStreamHandler, ReplyOnPause, Stream, get_cloudflare_turn_credentials_async
33
- from google import genai
34
- from google.genai import types
35
- from gradio.events import Dependency
36
-
37
-
38
  import asyncio
39
- import os
40
  import time
41
- import sys
42
- from collections import deque
43
-
44
  import cv2
45
  import gradio as gr
46
- import numpy as np
47
- import numpy.typing as npt
48
- from fastrtc import AsyncAudioVideoStreamHandler, ReplyOnPause, Stream, get_cloudflare_turn_credentials_async
49
  from google import genai
50
  from google.genai import types
51
- import gradio as gr
52
- from gradio.components.base import Component
53
- from gradio.data_classes import FileData, GradioModel
54
- from typing import Optional, Literal, Any
55
- import tempfile
56
- import os
57
- import json
58
 
59
  # Environment variable for API key
60
  API_KEY = os.getenv("GEMINI_API_KEY", "")
61
 
62
- class ScreenRecorderData(GradioModel):
63
- video: Optional[FileData] = None
64
- duration: Optional[float] = None
65
- audio_enabled: bool = True
66
- status: Literal["recording", "stopped", "error"] = "stopped"
67
-
68
- class Config:
69
- json_encoders = {
70
- FileData: lambda v: v.model_dump() if v else None
71
- }
72
-
73
-
74
- class ScreenRecorder(Component):
75
- """
76
- Custom Gradio component for comprehensive screen recording functionality.
77
- """
78
-
79
- data_model = ScreenRecorderData
80
-
81
- EVENTS = [
82
- "record_start",
83
- "record_stop",
84
- "stream_update",
85
- "change"
86
- ]
87
-
88
- def __init__(
89
- self,
90
- value=None,
91
- audio_enabled: bool = True,
92
- webcam_overlay: bool = False,
93
- webcam_position: Literal["top-left", "top-right", "bottom-left", "bottom-right"] = "bottom-right",
94
- recording_format: str = "webm",
95
- max_duration: Optional[int] = None,
96
- interactive: bool = True,
97
- **kwargs
98
- ):
99
- self.audio_enabled = audio_enabled
100
- self.webcam_overlay = webcam_overlay
101
- self.webcam_position = webcam_position
102
- self.recording_format = recording_format
103
- self.max_duration = max_duration
104
- self._status = "stopped"
105
-
106
- super().__init__(
107
- value=value,
108
- interactive=interactive,
109
- **kwargs
110
- )
111
-
112
- def example_payload(self) -> dict:
113
- """
114
- The example inputs for this component for API usage. Must be JSON-serializable.
115
- """
116
- return {
117
- "video": {
118
- "path": "https://sample-videos.com/zip/10/mp4/SampleVideo_360x240_1mb.mp4",
119
- "orig_name": "example_recording.webm",
120
- "size": 1024000
121
- },
122
- "duration": 30.5,
123
- "audio_enabled": True,
124
- "status": "stopped"
125
- }
126
-
127
- def example_value(self) -> ScreenRecorderData:
128
- """
129
- An example value for this component for the default app.
130
- """
131
- return ScreenRecorderData(
132
- video=FileData(
133
- path="https://sample-videos.com/zip/10/mp4/SampleVideo_360x240_1mb.mp4",
134
- orig_name="example_recording.webm",
135
- size=1024000
136
- ),
137
- duration=30.5,
138
- audio_enabled=True,
139
- status="stopped"
140
- )
141
-
142
- def flag(self, x, flag_dir: str = "") -> str:
143
- """
144
- Write the component's value to a format for flagging (CSV storage).
145
- """
146
- if x is None:
147
- return ""
148
-
149
- if isinstance(x, ScreenRecorderData) and x.video:
150
- return f"Recording: {x.video.orig_name} ({x.duration}s) - Status: {x.status}"
151
-
152
- if isinstance(x, dict) and "video" in x:
153
- duration = x.get("duration", "unknown")
154
- status = x.get("status", "unknown")
155
- video_name = x["video"].get("orig_name", "unknown") if x["video"] else "none"
156
- return f"Recording: {video_name} ({duration}s) - Status: {status}"
157
-
158
- return str(x)
159
-
160
- def preprocess(self, payload) -> Optional[ScreenRecorderData]:
161
- """Process incoming recording data from frontend."""
162
- if payload is None:
163
- return None
164
-
165
- if isinstance(payload, dict):
166
- if payload.get("status") == "error": # Early exit for errors from frontend
167
- raise gr.Error(f"Recording failed on frontend: {payload.get('error', 'Unknown error')}")
168
-
169
- # If 'video' field is a string, assume it's JSON and parse it.
170
- if "video" in payload and isinstance(payload["video"], str):
171
- try:
172
- video_json_string = payload["video"]
173
- if video_json_string.strip().startswith("{") and video_json_string.strip().endswith("}"):
174
- payload["video"] = json.loads(video_json_string)
175
- # If it's a string but not our expected JSON (e.g. 'null', or empty string, or simple path)
176
- # json.loads would fail or Pydantic validation later will catch it if structure is wrong.
177
- # For 'null' string, json.loads results in None for payload["video"].
178
- elif video_json_string.lower() == 'null':
179
- payload["video"] = None
180
- else:
181
- # This case implies a string that isn't a JSON object or 'null',
182
- # e.g. a direct file path string, which FileData might not directly accept
183
- # if it expects a dict. Pydantic will raise error later if type is incompatible.
184
- gr.Warning(f"Video data is a string but not a recognized JSON object or 'null': {video_json_string[:100]}")
185
- # To be safe, if it's not a JSON object string, we might want to error or handle specifically
186
- # For now, let Pydantic try to handle it or fail.
187
-
188
- except json.JSONDecodeError:
189
- raise gr.Error(f"Invalid JSON for video data: {payload['video'][:100]}")
190
-
191
- # --- Validations from here ---
192
- video_data = payload.get("video") # Use .get() for safety, as 'video' might be absent or None
193
-
194
- if video_data is not None: # Only validate video_data if it exists
195
- if not isinstance(video_data, dict):
196
- # This can happen if payload["video"] was a string like "some_path.webm" and not parsed to dict
197
- # Or if it was parsed to something unexpected.
198
- raise gr.Error(f"Video data is not a dictionary after processing: {type(video_data)}. Value: {str(video_data)[:100]}")
199
-
200
- if video_data.get("size", 0) == 0:
201
- gr.Warning("Received recording with zero size. This might be an empty recording or an issue with data capture.")
202
- # Depending on requirements, could raise gr.Error here.
203
-
204
- max_size = 500 * 1024 * 1024 # 500MB
205
- if video_data.get("size", 0) > max_size:
206
- raise gr.Error(f"Recording file too large ({video_data.get('size', 0)} bytes). Maximum allowed: {max_size} bytes.")
207
- # If video_data is None (e.g. 'video': null was sent, or 'video' key missing),
208
- # ScreenRecorderData will have video=None, which is allowed by Optional[FileData].
209
-
210
- duration = payload.get("duration", 0)
211
- if duration <= 0 and video_data is not None: # Only warn about duration if there's video data
212
- gr.Warning("Recording duration is 0 or invalid. The recording might be corrupted.")
213
-
214
- try:
215
- return ScreenRecorderData(**payload)
216
- except Exception as e: # Catch Pydantic validation errors or other issues during model instantiation
217
- # Log the payload for easier debugging if there's a Pydantic error
218
- # Be careful with logging sensitive data in production.
219
- # print(f"Error creating ScreenRecorderData. Payload: {payload}")
220
- raise gr.Error(f"Error creating ScreenRecorderData from payload: {e}")
221
-
222
- elif isinstance(payload, ScreenRecorderData): # If it's already the correct type
223
- return payload
224
-
225
- gr.Warning(f"Unexpected payload format: {type(payload)}. Payload: {str(payload)[:200]}")
226
- return None
227
-
228
- # def postprocess(self, value) -> Optional[dict]:
229
- # """Process outgoing data to frontend."""
230
- # if value is None:
231
- # return {"status": "stopped"} # Ensure valid empty state
232
-
233
- # try:
234
- # if isinstance(value, ScreenRecorderData):
235
- # return value.model_dump()
236
- # elif isinstance(value, dict):
237
- # return value
238
- # return None
239
- # except Exception as e:
240
- # return {"status": "error", "error": str(e)}
241
-
242
-
243
- def postprocess(self, value) -> Optional[dict]:
244
- """Process outgoing data to frontend."""
245
- print(f'value in postprocess: {value}')
246
- if value is None:
247
- return None
248
-
249
- try:
250
- # If it's already a dict, return as is
251
- if isinstance(value, dict):
252
- return value
253
-
254
- # If it's a ScreenRecorderData object, convert to dict
255
- if hasattr(value, 'model_dump'):
256
- return value.model_dump()
257
-
258
- # Handle string values
259
- if isinstance(value, str):
260
- return {"video": {"path": value}}
261
-
262
- return None
263
-
264
- except Exception as e:
265
- print(f'Error in postprocess: {e}')
266
- return None
267
-
268
-
269
- # try:
270
- # if isinstance(value, ScreenRecorderData):
271
- # # Ensure video data exists before sending
272
- # if not value.video:
273
- # return {"status": "error", "error": "No video recorded"}
274
-
275
- # return {
276
- # "video": value.video,
277
- # "duration": value.duration,
278
- # "audio_enabled": value.audio_enabled,
279
- # "status": value.status
280
- # }
281
-
282
- # # Handle raw dict format from frontend
283
- # if isinstance(value, dict):
284
- # return {
285
- # "video": FileData(**value.get("video", {})),
286
- # "duration": value.get("duration"),
287
- # "audio_enabled": value.get("audio_enabled", True),
288
- # "status": value.get("status", "stopped")
289
- # }
290
-
291
- # except Exception as e:
292
- # return {"status": "error", "error": str(e)}
293
-
294
- # return {"status": "stopped"}
295
-
296
- def as_example(self, input_data):
297
- """Handle example data display."""
298
- if input_data is None:
299
- return None
300
-
301
- if isinstance(input_data, (ScreenRecorderData, dict)):
302
- return input_data
303
-
304
- # Convert simple video path to proper format
305
- if isinstance(input_data, str):
306
- return {
307
- "video": {
308
- "path": input_data,
309
- "orig_name": os.path.basename(input_data),
310
- "size": 0
311
- },
312
- "duration": None,
313
- "audio_enabled": self.audio_enabled,
314
- "status": "stopped"
315
- }
316
-
317
- return input_data
318
-
319
- def update_status(self, status: Literal["recording", "stopped", "error"]):
320
- """Update the internal status of the recorder."""
321
- self._status = status
322
-
323
- def get_status(self) -> str:
324
- """Get the current status of the recorder."""
325
- return self._status
326
-
327
 
328
- class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
329
- """Premium Real-time screen assistant with complete frontend integration.
330
-
331
- Real-time Frontend Integration Features:
332
- - Continuous audio streaming with voice activity detection
333
- - Real-time screen capture with intelligent throttling
334
- - AI audio response processing and delivery
335
- - Text response handling and display
336
- - Background task management
337
- - Enhanced error recovery
338
- """
339
 
340
  def __init__(self):
341
- super().__init__(
342
- expected_layout="mono",
343
- output_sample_rate=24000,
344
- input_sample_rate=16000
345
- )
346
  self.session = None
347
  self.last_frame_time = 0
348
  self.audio_queue = asyncio.Queue()
349
  self.text_queue = asyncio.Queue()
350
  self.connected = False
351
- self.frame_interval = 1.0 # Adaptive frame interval
352
-
353
- # Enhanced features for premium version
354
- self.conversation_history = deque(maxlen=20) # Keep last 20 exchanges
355
- self.background_tasks = set() # Track background tasks
356
- self.voice_activity_threshold = 0.01 # Voice activity detection threshold
357
- self.consecutive_silent_frames = 0
358
- self.max_silent_frames = 10 # Filter out silence
359
-
360
- # Performance optimization
361
- self.last_audio_level = 0.0
362
- self.frame_skip_counter = 0
363
- self.adaptive_quality = True
364
 
365
  async def start_up(self):
366
- """Enhanced startup with premium configuration"""
367
  try:
368
- current_api_key = os.getenv("GEMINI_API_KEY", "")
369
- if not current_api_key:
370
  print("❌ No GEMINI_API_KEY found in environment")
371
  return
372
 
373
- # Initialize client with premium configuration
374
- client = genai.Client(
375
- api_key=current_api_key,
376
- http_options={"api_version": "v1alpha"}
377
- )
378
 
379
- # PREMIUM: Enhanced configuration with all features enabled
380
  config = {
381
  "response_modalities": ["AUDIO", "TEXT"],
382
  "input_audio_transcription": {"model": "latest"},
383
  "output_audio_transcription": {"model": "latest"},
384
- "system_instruction": {
385
- "parts": [{
386
- "text": (
387
- "You are an expert real-time screen assistant with premium capabilities. "
388
- "You can see the user's screen continuously and hear their voice in real-time. "
389
- "Provide intelligent, proactive assistance based on what you observe. "
390
- "Give clear, actionable guidance for software usage, coding, troubleshooting, "
391
- "and any tasks you see the user working on. Be concise but comprehensive. "
392
- "Respond with both voice and text when helpful."
393
- )
394
- }]
395
- },
396
- "generation_config": {
397
- "response_mime_type": "text/plain",
398
- "temperature": 0.7,
399
- "max_output_tokens": 512
400
- }
401
  }
402
 
403
- # Connect with enhanced configuration
404
- self.session = await client.aio.live.connect(
405
- model="gemini-2.0-flash-live-preview",
406
- config=config
407
- )
408
 
409
  self.connected = True
410
- print("βœ… Connected to Google GenAI Live API (Premium Mode)")
411
 
412
- # Start enhanced response handler
 
413
  response_task = asyncio.create_task(self._handle_responses())
 
414
  self.background_tasks.add(response_task)
 
415
  response_task.add_done_callback(self.background_tasks.discard)
 
416
 
417
  except Exception as e:
418
  print(f"❌ Failed to connect to GenAI: {e}")
419
  self.connected = False
420
 
421
  async def _handle_responses(self):
422
- """Enhanced response handler with conversation history"""
423
  try:
424
- async for response in self.session.receive():
425
- if not self.connected:
426
- break
427
 
428
- try:
429
- # Handle audio responses (premium feature)
430
- if hasattr(response, 'data') and response.data:
431
- audio_array = np.frombuffer(response.data, dtype=np.int16)
432
- if len(audio_array) > 0:
433
- audio_array = audio_array.reshape(1, -1)
434
- await self.audio_queue.put(audio_array)
435
-
436
- # Handle text responses with conversation history
437
- if hasattr(response, 'text') and response.text:
438
- print(f"πŸ€– AI: {response.text}")
439
-
440
- # Add to conversation history
441
- self.conversation_history.append({
442
- "timestamp": time.time(),
443
- "type": "ai_response",
444
- "content": response.text
445
- })
446
-
447
- # Queue for frontend delivery
448
- await self.text_queue.put(response.text)
449
-
450
- # Handle structured responses (premium)
451
- if hasattr(response, 'server_content') and response.server_content:
452
- if hasattr(response.server_content, 'model_turn'):
453
- model_turn = response.server_content.model_turn
454
- if hasattr(model_turn, 'parts'):
455
- for part in model_turn.parts:
456
- if hasattr(part, 'text') and part.text:
457
- print(f"πŸ€– AI: {part.text}")
458
- await self.text_queue.put(part.text)
459
 
460
- except Exception as e:
461
- print(f"⚠️ Response processing error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
462
 
463
  except Exception as e:
464
- print(f"❌ Response handler error: {e}")
465
 
466
- async def receive(self, frame: tuple[int, npt.NDArray[np.int16]]):
467
- """PREMIUM: Enhanced audio processing with voice activity detection"""
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  if not self.connected or not self.session:
469
  return
470
 
471
  try:
472
  _, audio_np = frame
473
 
474
- # PREMIUM: Voice activity detection
475
- audio_level = np.abs(audio_np.astype(np.float32)).mean()
476
- self.last_audio_level = audio_level
 
477
 
478
- # Filter out silence and background noise
479
- if audio_level < self.voice_activity_threshold:
480
- self.consecutive_silent_frames += 1
481
- if self.consecutive_silent_frames < self.max_silent_frames:
482
- return # Skip silent frames
483
- else:
484
- self.consecutive_silent_frames = 0
485
-
486
- # Convert and send audio
487
- audio_bytes = audio_np.tobytes()
488
-
489
- # PREMIUM: Send with metadata
490
- await self.session.send_realtime_input(
491
- input=types.Blob(
492
- data=audio_bytes,
493
- mime_type="audio/pcm;rate=16000"
494
- )
495
- )
496
-
497
- # Track user interaction
498
- self.conversation_history.append({
499
- "timestamp": time.time(),
500
- "type": "user_audio",
501
- "audio_level": float(audio_level)
502
- })
503
 
504
  except Exception as e:
505
- print(f"❌ Error sending audio: {e}")
506
 
507
- async def video_receive(self, frame: npt.NDArray[np.float32]):
508
- """PREMIUM: Enhanced screen capture with adaptive throttling"""
509
  if not self.connected or not self.session:
510
  return
511
 
512
  try:
513
- # PREMIUM: Adaptive frame throttling based on activity
514
  current_time = time.time()
515
 
516
- # Adaptive interval based on user activity
517
- if hasattr(self, 'last_audio_level') and self.last_audio_level > 0.05:
518
- # More frequent updates during active conversation
519
- adaptive_interval = self.frame_interval * 0.5
 
 
 
 
 
 
520
  else:
521
- # Standard interval during quiet periods
522
- adaptive_interval = self.frame_interval
523
 
524
- if current_time - self.last_frame_time < adaptive_interval:
525
  return
526
 
527
  self.last_frame_time = current_time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
 
529
- # PREMIUM: Enhanced frame processing
530
- if frame.dtype == np.float32:
531
- frame_uint8 = (frame * 255).astype(np.uint8)
532
- else:
533
- frame_uint8 = frame.astype(np.uint8)
534
-
535
- # Validate frame
536
- if frame_uint8.size == 0 or frame_uint8.shape[0] == 0 or frame_uint8.shape[1] == 0:
537
- return
538
-
539
- # PREMIUM: Adaptive quality encoding
540
- quality = 85 if self.adaptive_quality and self.last_audio_level > 0.02 else 75
541
-
542
- try:
543
- success, jpg_bytes = cv2.imencode('.jpg', frame_uint8, [cv2.IMWRITE_JPEG_QUALITY, quality])
544
- if not success:
545
- return
546
- except cv2.error:
547
  return
548
 
549
- # Send enhanced frame data
550
- await self.session.send_realtime_input(
551
- input=types.Blob(
552
- data=jpg_bytes.tobytes(),
553
- mime_type="image/jpeg"
554
- )
555
- )
556
 
557
- # Track screen activity
558
- self.conversation_history.append({
559
- "timestamp": time.time(),
560
- "type": "screen_frame",
561
- "quality": quality,
562
- "size": len(jpg_bytes)
563
- })
564
 
565
  except Exception as e:
566
- print(f"❌ Error sending video frame: {e}")
567
 
568
  async def emit(self):
569
- """PREMIUM: Enhanced audio emission with queue management"""
570
  try:
571
  audio_chunk = self.audio_queue.get_nowait()
572
  return (24000, audio_chunk)
@@ -574,40 +195,42 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
574
  return None
575
 
576
  async def get_latest_text(self):
577
- """PREMIUM: Get latest text response from AI"""
578
  try:
579
  text = self.text_queue.get_nowait()
580
  return text
581
  except asyncio.QueueEmpty:
582
  return None
583
 
584
- def copy(self):
585
- """Enhanced copy method with state preservation"""
586
- new_instance = RealTimeScreenAssistant()
587
- new_instance.frame_interval = self.frame_interval
588
- new_instance.voice_activity_threshold = self.voice_activity_threshold
589
- new_instance.adaptive_quality = self.adaptive_quality
590
- return new_instance
591
-
592
- async def video_emit(self):
593
- """Video emit method for FastRTC compatibility"""
594
- return None
595
-
596
  async def shutdown(self):
597
- """PREMIUM: Enhanced shutdown with complete cleanup"""
598
  self.connected = False
599
 
600
- # Cancel all background tasks
601
- for task in self.background_tasks.copy():
602
- if not task.done():
603
- task.cancel()
 
 
 
 
604
 
605
- # Wait for task cleanup
606
- if self.background_tasks:
607
- await asyncio.gather(*self.background_tasks, return_exceptions=True)
608
- self.background_tasks.clear()
609
 
610
- # Clean up queues
 
 
 
 
 
 
 
 
 
 
 
 
611
  while not self.audio_queue.empty():
612
  try:
613
  self.audio_queue.get_nowait()
@@ -620,453 +243,260 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
620
  except asyncio.QueueEmpty:
621
  break
622
 
623
- # Clear conversation history
624
- self.conversation_history.clear()
625
 
626
- # Close session
627
- if self.session:
628
- try:
629
- await self.session.close()
630
- print("πŸ”΄ Disconnected from GenAI Live API")
631
- except Exception as e:
632
- print(f"❌ Error during shutdown: {e}")
633
 
634
- self.session = None
 
635
 
636
- # Global state for premium app
637
- app_state = {
638
- "stream": None,
639
- "handler": None,
640
- "connected": False,
641
- "last_status": "Ready to connect",
642
- "stats": {"audio_sent": 0, "frames_sent": 0, "responses_received": 0}
643
- }
644
 
645
- def initialize_real_time_assistant():
646
- """PREMIUM: Enhanced stream initialization"""
647
  try:
648
- handler = RealTimeScreenAssistant()
 
649
  app_state["handler"] = handler
650
 
651
- # PREMIUM: Enhanced stream configuration with fallback
652
- try:
653
- # Try with ReplyOnPause (requires fastrtc[vad])
654
- stream = Stream(
655
- handler=ReplyOnPause(handler), # Voice activity detection
656
- modality="audio-video",
657
- mode="send-receive",
658
- rtc_configuration=get_cloudflare_turn_credentials_async,
659
- time_limit=300, # 5 minutes - real-time optimized
660
- ui_args={
661
- "title": "Premium Real-Time Assistant",
662
- "subtitle": "Audio-Video Streaming with Gemini 2.0",
663
- "hide_title": False
664
- }
665
- )
666
- print("βœ… Stream created with ReplyOnPause (VAD enabled)")
667
- except Exception as vad_error:
668
- print(f"⚠️ ReplyOnPause failed: {vad_error}")
669
- print("πŸ”„ Falling back to basic handler...")
670
- # Fallback to basic handler without VAD
671
- stream = Stream(
672
- handler=handler, # Direct handler without VAD
673
- modality="audio-video",
674
- mode="send-receive",
675
- rtc_configuration=get_cloudflare_turn_credentials_async,
676
- time_limit=300,
677
- ui_args={
678
- "title": "Real-Time Assistant (Basic)",
679
- "subtitle": "Audio-Video Streaming with Gemini 2.0",
680
- "hide_title": False
681
- }
682
- )
683
- print("βœ… Stream created with basic handler")
684
 
685
  app_state["stream"] = stream
686
  return stream
687
 
688
  except Exception as e:
689
- print(f"❌ Error creating stream: {e}")
690
  return None
691
 
692
- async def handle_connect_async():
693
- """PREMIUM: Enhanced async connection handler"""
694
- current_api_key = os.getenv("GEMINI_API_KEY", "")
695
- if not current_api_key:
696
  return "❌ Please set GEMINI_API_KEY environment variable"
697
 
698
  if app_state["connected"]:
699
  return "βœ… Already connected - session is active"
700
 
701
- try:
702
- if app_state["handler"]:
703
- await app_state["handler"].start_up()
704
- app_state["connected"] = True
705
- app_state["last_status"] = "Connected to GenAI Live API"
706
- return "βœ… Connected to GenAI Live API - Ready for real-time interaction!"
707
- else:
708
- return "❌ Handler not initialized"
709
- except Exception as e:
710
- app_state["connected"] = False
711
- return f"❌ Connection failed: {str(e)}"
712
 
713
- def handle_connect():
714
- """Sync wrapper for connection"""
715
- import threading
716
 
717
- app_state["connected"] = True # Optimistic update for UI
718
- app_state["last_status"] = "Initiating connection..."
719
 
720
- # Start async connection in a separate thread to avoid event loop issues
721
- def run_async_connection():
722
- try:
723
- import asyncio
724
- loop = asyncio.new_event_loop()
725
- asyncio.set_event_loop(loop)
726
- result = loop.run_until_complete(handle_connect_async())
727
- print(f"Connection result: {result}")
728
- loop.close()
729
- except Exception as e:
730
- print(f"Connection error: {e}")
731
- app_state["connected"] = False
732
- app_state["last_status"] = f"Connection failed: {e}"
733
 
734
- thread = threading.Thread(target=run_async_connection, daemon=True)
735
- thread.start()
 
 
736
 
737
- return "πŸ”„ Initiating connection to GenAI Live API..."
738
 
739
  async def handle_disconnect_async():
740
- """PREMIUM: Enhanced async disconnect handler"""
741
  if app_state["handler"] and app_state["connected"]:
742
  try:
743
  await app_state["handler"].shutdown()
744
  app_state["connected"] = False
 
745
  app_state["handler"] = None
746
- app_state["last_status"] = "Disconnected"
747
  return "πŸ”΄ Disconnected from AI assistant"
748
  except Exception as e:
749
- return f"❌ Error during disconnect: {str(e)}"
750
- return "Already disconnected"
751
 
752
- def handle_disconnect():
753
- """Sync wrapper for disconnect"""
754
- import threading
755
 
756
- app_state["connected"] = False # Immediate update for UI
757
 
758
- # Start async disconnect in a separate thread to avoid event loop issues
759
- def run_async_disconnect():
760
- try:
761
- import asyncio
762
- loop = asyncio.new_event_loop()
763
- asyncio.set_event_loop(loop)
764
- result = loop.run_until_complete(handle_disconnect_async())
765
- print(f"Disconnect result: {result}")
766
- loop.close()
767
- except Exception as e:
768
- print(f"Disconnect error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
769
 
770
- thread = threading.Thread(target=run_async_disconnect, daemon=True)
771
- thread.start()
772
 
773
- return "πŸ”„ Disconnecting from AI assistant..."
774
 
775
- def get_connection_status():
776
- """PREMIUM: Get detailed connection status"""
777
- if app_state["connected"]:
778
- stats = app_state["stats"]
779
- return f"🟒 Connected | Audio: {stats['audio_sent']} | Frames: {stats['frames_sent']} | Responses: {stats['responses_received']}"
780
- else:
781
- return f"πŸ”΄ Disconnected | Status: {app_state['last_status']}"
782
 
783
- def create_interface():
784
- """PREMIUM: Enhanced interface with complete real-time integration"""
785
- # Initialize premium stream
786
- stream = initialize_real_time_assistant()
787
 
788
  with gr.Blocks(
789
- title="Real-Time Screen Assistant - Premium Edition",
790
- theme=gr.themes.Soft()
 
 
 
 
 
 
 
791
  ) as demo:
792
-
793
- gr.Markdown("# πŸš€ Real-Time Screen Assistant - Premium Edition")
794
  gr.Markdown("""
795
- **🎯 PREMIUM AI with complete real-time frontend integration!**
796
-
797
- **Real-time Frontend Integration Features:**
798
- βœ… **Continuous audio flow** - Voice activity detection, noise filtering
799
- βœ… **Model audio output** - AI voice responses with queue management
800
- βœ… **Screen data streaming** - Adaptive capture with intelligent throttling
801
- βœ… **Text response delivery** - Real-time text display with conversation history
802
-
803
- **Enhanced Premium Features:**
804
- - 🧠 Enhanced GenAI configuration with full modalities
805
- - πŸŽ™οΈ Intelligent voice activity detection
806
- - πŸ“Ή Adaptive screen capture (300s real-time timeout)
807
- - πŸ”„ Background task management with cleanup
808
- - πŸ“Š Performance monitoring and optimization
809
- - πŸ›‘οΈ Enhanced error handling and recovery
810
  """)
811
 
812
- # PREMIUM: Enhanced status display
813
- with gr.Row():
814
- status_display = gr.Textbox(
815
- label="πŸ”΄ Connection Status",
816
- value="Ready to connect - Premium features enabled",
817
- interactive=False
818
- )
819
- stats_display = gr.Textbox(
820
- label="πŸ“Š Performance Stats",
821
- value="Audio: 0 | Frames: 0 | Responses: 0",
822
- interactive=False
823
- )
824
-
825
- # PREMIUM: Enhanced control panel
826
- with gr.Row():
827
- connect_btn = gr.Button("πŸ”— Connect (Premium)", variant="primary")
828
- disconnect_btn = gr.Button("πŸ”΄ Disconnect", variant="stop")
829
-
830
- with gr.Row():
831
- mic_test_btn = gr.Button("πŸŽ™οΈ Test Microphone", variant="secondary")
832
- screen_share_btn = gr.Button("πŸ–₯️ Share Screen", variant="secondary")
833
-
834
- # --- Backend logic for mic test and screen sharing ---
835
- def backend_mic_test():
836
- # Simulate a backend mic test (could be extended to record/playback)
837
- if app_state.get("handler") and app_state.get("connected"):
838
- return "πŸŽ™οΈ Microphone is active and streaming to backend."
839
- return "⚠️ Please connect first to test microphone."
840
-
841
- def backend_screen_share():
842
- # Simulate backend screen sharing trigger
843
- if app_state.get("handler") and app_state.get("connected"):
844
- # In a real implementation, you might set a flag or trigger a backend event
845
- return "πŸ–₯️ Screen sharing is active and streaming to backend."
846
- return "⚠️ Please connect first to share your screen."
847
-
848
- # PREMIUM: Real-time streaming interface
849
- gr.Markdown("### πŸ“‘ Premium Real-Time Stream")
850
-
851
- if stream:
852
- # Create streaming interface with enhanced configuration
853
- audio_stream = gr.Audio(
854
- streaming=True,
855
- autoplay=False,
856
- show_download_button=False,
857
- label="πŸŽ™οΈ Microphone Input (Voice Activity Detection)",
858
- interactive=True
859
- )
860
-
861
- # PREMIUM: Integrated ScreenRecorder component
862
- screen_recorder = ScreenRecorder(
863
- audio_enabled=True,
864
- webcam_overlay=True,
865
- webcam_position="bottom-right",
866
- recording_format="webm",
867
- max_duration=300, # 5 minutes - real-time optimized
868
- label="πŸ–₯️ Screen Recorder (Premium)",
869
- interactive=True
870
- )
871
-
872
- # PREMIUM: Connect streaming handlers
873
- audio_stream.stream(
874
- fn=lambda audio: app_state["handler"].receive(audio) if app_state["handler"] and app_state["connected"] else None,
875
- inputs=[audio_stream],
876
- outputs=[],
877
- time_limit=300, # Real-time optimized
878
- concurrency_limit=5
879
- )
880
-
881
- # PREMIUM: AI response display
882
- ai_response_display = gr.Textbox(
883
- label="πŸ€– AI Response Stream",
884
- value="AI responses will appear here...",
885
- interactive=False,
886
- max_lines=10
887
- )
888
-
889
- # PREMIUM: Audio output
890
- ai_audio_output = gr.Audio(
891
- label="πŸ”Š AI Voice Response",
892
- autoplay=True,
893
- streaming=True
894
- )
895
-
896
- # Connect AI response handlers
897
- ai_audio_output.stream(
898
- fn=lambda: app_state["handler"].emit() if app_state["handler"] and app_state["connected"] else None,
899
- inputs=[],
900
- outputs=[ai_audio_output],
901
- time_limit=300
902
- )
903
-
904
- # Connect screen recorder to video handler
905
- def handle_screen_recording(recording_data):
906
- """Handle screen recording data and send to AI"""
907
- if not recording_data or not app_state["handler"] or not app_state["connected"]:
908
- return "⚠️ Not connected to AI or no recording data"
909
-
910
- try:
911
- # If we have video data, process it for the AI
912
- if recording_data and recording_data.get('video'):
913
- # For real-time processing, we could extract frames
914
- # For now, just acknowledge the recording
915
- duration = recording_data.get('duration', 0)
916
- size = recording_data.get('size', 0)
917
- print(f"πŸ“Ή Screen recording received: {duration}s, {size} bytes")
918
-
919
- # Update stats
920
- app_state["stats"]["frames_sent"] += 1
921
-
922
- return f"βœ… Screen recording processed: {duration:.1f}s"
923
- else:
924
- return "⚠️ No video data in recording"
925
-
926
- except Exception as e:
927
- print(f"❌ Error processing screen recording: {e}")
928
- return f"❌ Error: {e}"
929
-
930
- screen_recorder.change(
931
- fn=handle_screen_recording,
932
- inputs=[screen_recorder],
933
- outputs=[ai_response_display],
934
- show_progress=False
935
- )
936
 
 
 
 
 
 
 
 
 
 
 
 
937
  else:
938
- gr.HTML("<div>⚠️ Premium stream initialization failed - Check console for errors</div>")
939
 
940
- # PREMIUM: Enhanced instructions
941
- with gr.Accordion("πŸ“‹ Premium Instructions", open=True):
942
  gr.Markdown("""
943
- **How to use the Premium Real-Time Assistant:**
944
-
945
- 1. **Connect**: Click "Connect (Premium)" to start enhanced AI session
946
- 2. **Permissions**: Allow microphone and camera access when prompted
947
- 3. **Voice Interaction**: Speak naturally - voice activity detection filters noise
948
- 4. **Screen Sharing**: Click "Share Screen" for continuous screen analysis
949
- 5. **Real-time Responses**: Receive both voice and text responses immediately
950
- 6. **Monitor Performance**: Check stats display for real-time metrics
951
-
952
- **Premium Features Active:**
953
- - βœ… **Continuous Audio Flow**: Voice activity detection with noise filtering
954
- - βœ… **Model Audio Output**: AI voice responses with smart queue management
955
- - βœ… **Screen Data Streaming**: Adaptive capture with 1 FPS optimization
956
- - βœ… **Text Response Delivery**: Real-time text with conversation history
957
- - βœ… **Background Task Management**: Proper async task handling and cleanup
958
- - βœ… **Enhanced Error Recovery**: Robust connection management
 
 
 
 
959
  """)
960
 
961
- # PREMIUM: Technical details
962
- with gr.Accordion("πŸ”§ Premium Technical Features", open=False):
963
  gr.Markdown("""
964
- **Real-Time Frontend Integration Implementation:**
965
-
966
- **1. Continuous Audio Flow (User β†’ Model):**
967
- ```python
968
- # Voice activity detection with threshold filtering
969
- audio_level = np.abs(audio_np.astype(np.float32)).mean()
970
- if audio_level < voice_activity_threshold:
971
- return # Filter silence
972
-
973
- # Enhanced send with metadata
974
- await session.send_realtime_input(input=types.Blob(...))
975
- ```
976
-
977
- **2. Model Audio Output (Model β†’ User):**
978
- ```python
979
- # AI response processing with queue management
980
- audio_array = np.frombuffer(response.data, dtype=np.int16)
981
- await audio_queue.put(audio_array.reshape(1, -1))
982
- ```
983
-
984
- **3. Screen Data Streaming (Screen β†’ Model):**
985
- ```python
986
- # Adaptive throttling based on activity
987
- adaptive_interval = frame_interval * (0.5 if active else 1.0)
988
- # Quality optimization: 85% for active, 75% for quiet
989
- ```
990
-
991
- **4. Text Response Delivery (System β†’ User):**
992
- ```python
993
- # Conversation history with timestamps
994
- conversation_history.append({
995
- "timestamp": time.time(),
996
- "type": "ai_response",
997
- "content": response.text
998
- })
999
- ```
1000
-
1001
- **Premium Optimizations:**
1002
- - Background task management with proper cleanup
1003
- - Enhanced error handling and recovery
1004
- - Performance monitoring and adaptive quality
1005
- - 300s timeout optimized for real-time behavior
1006
- """)
1007
-
1008
- # Wire up premium controls
1009
- connect_btn.click(
1010
- fn=handle_connect,
1011
- outputs=[status_display]
1012
- )
1013
-
1014
- disconnect_btn.click(
1015
- fn=handle_disconnect,
1016
- outputs=[status_display]
1017
- )
1018
-
1019
- mic_test_btn.click(
1020
- fn=backend_mic_test,
1021
- outputs=[status_display]
1022
- )
1023
-
1024
- screen_share_btn.click(
1025
- fn=backend_screen_share,
1026
- outputs=[status_display]
1027
- )
1028
-
1029
- # Initial load of connection status
1030
- demo.load(
1031
- fn=get_connection_status,
1032
- outputs=[stats_display]
1033
- )
1034
-
1035
- return demo
1036
-
1037
- # Main execution
1038
- if __name__ == "__main__":
1039
- print("πŸš€ Real-Time Screen Assistant - PREMIUM EDITION")
1040
- print("=" * 60)
1041
- print("βœ… Complete real-time frontend integration:")
1042
- print(" 1. Continuous audio flow (user β†’ model)")
1043
- print(" 2. Model audio output (model β†’ user)")
1044
- print(" 3. Screen data streaming (screen β†’ model)")
1045
- print(" 4. Text response delivery (system β†’ user)")
1046
- print("βœ… Enhanced features:")
1047
- print(" - Voice activity detection with noise filtering")
1048
- print(" - Adaptive screen capture with quality optimization")
1049
- print(" - Background task management with cleanup")
1050
- print(" - Enhanced error handling and recovery")
1051
- print(" - 300s timeout optimized for real-time behavior")
1052
-
1053
- if not API_KEY:
1054
- print("\n⚠️ No GEMINI_API_KEY environment variable found")
1055
- print("Please set your Google AI API key:")
1056
- print("export GEMINI_API_KEY='your-api-key-here'")
1057
- else:
1058
- print(f"\nβœ… API key configured (Premium Mode)")
1059
-
1060
- print("\nπŸš€ Starting Premium Real-Time Assistant...")
1061
-
1062
- try:
1063
- demo = create_interface()
1064
- demo.launch(
1065
- server_name="0.0.0.0",
1066
- server_port=7860,
1067
- share=False,
1068
- show_error=True
1069
- )
1070
- except Exception as e:
1071
- print(f"❌ Failed to launch: {e}")
1072
- print("Ensure all dependencies are installed: pip install -r requirements.txt")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import asyncio
 
3
  import time
4
+ import numpy as np
 
 
5
  import cv2
6
  import gradio as gr
7
+ from fastrtc import Stream, AsyncAudioVideoStreamHandler, get_cloudflare_turn_credentials_async, ReplyOnPause
 
 
8
  from google import genai
9
  from google.genai import types
 
 
 
 
 
 
 
10
 
11
  # Environment variable for API key
12
  API_KEY = os.getenv("GEMINI_API_KEY", "")
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
+ class EnhancedScreenAssistantHandler(AsyncAudioVideoStreamHandler):
16
+ """Enhanced real-time screen assistant with voice activity detection"""
 
 
 
 
 
 
 
 
 
17
 
18
  def __init__(self):
19
+ super().__init__(input_audio_type="mono", output_sample_rate=24000, input_sample_rate=16000)
 
 
 
 
20
  self.session = None
21
  self.last_frame_time = 0
22
  self.audio_queue = asyncio.Queue()
23
  self.text_queue = asyncio.Queue()
24
  self.connected = False
25
+ self.frame_interval = 1.0 # Send one frame per second
26
+ self.conversation_history = []
 
 
 
 
 
 
 
 
 
 
 
27
 
28
    async def start_up(self):
        """Initialize the Google GenAI Live session and start background tasks.

        Reads the module-level API_KEY; on any failure the handler is left in
        a disconnected state rather than raising (errors are printed).
        """
        try:
            if not API_KEY:
                print("❌ No GEMINI_API_KEY found in environment")
                return

            # Initialize Google GenAI client with alpha API access
            client = genai.Client(api_key=API_KEY, http_options={"api_version": "v1alpha"})

            # Enhanced configuration for live session
            # NOTE(review): this requests both AUDIO and TEXT modalities; some
            # Live API versions accept only one response modality — confirm.
            config = {
                "response_modalities": ["AUDIO", "TEXT"],
                "input_audio_transcription": {"model": "latest"},
                "output_audio_transcription": {"model": "latest"},
                "system_instruction": (
                    "You are an expert real-time screen assistant. You can see the user's screen "
                    "and hear their voice. Provide clear, actionable guidance based on what you observe. "
                    "Be proactive - if you see the user struggling or notice something important, "
                    "offer helpful suggestions even without being asked. Keep responses concise but thorough. "
                    "When giving instructions, be specific about what to click, where to look, "
                    "and what to expect next."
                ),
                "generation_config": {"response_mime_type": "text/plain", "temperature": 0.7, "max_output_tokens": 512},
            }

            # Connect to Live API
            # NOTE(review): in recent google-genai SDKs, aio.live.connect() is an
            # async context manager; confirm that awaiting it directly yields a
            # usable session object with this SDK version.
            self.session = await client.aio.live.connect(model="gemini-2.0-flash-live-preview", config=config)

            self.connected = True
            print("βœ… Connected to Google GenAI Live API with enhanced configuration")

            # Start background tasks with proper management: keep strong
            # references so they are not garbage-collected mid-flight, and
            # auto-remove them from the registry when they finish.
            self.background_tasks = set()
            response_task = asyncio.create_task(self._handle_responses())
            context_task = asyncio.create_task(self._periodic_context_update())
            self.background_tasks.add(response_task)
            self.background_tasks.add(context_task)
            response_task.add_done_callback(self.background_tasks.discard)
            context_task.add_done_callback(self.background_tasks.discard)

        except Exception as e:
            print(f"❌ Failed to connect to GenAI: {e}")
            self.connected = False
72
 
73
  async def _handle_responses(self):
74
+ """Handle incoming responses from AI with enhanced processing"""
75
  try:
76
+ current_text = ""
 
 
77
 
78
+ async for msg in self.session.receive():
79
+ if msg.data: # Audio response from AI
80
+ # Convert raw PCM bytes to numpy array for FastRTC
81
+ audio_array = np.frombuffer(msg.data, dtype=np.int16)
82
+ if len(audio_array) > 0:
83
+ audio_array = audio_array.reshape(1, -1) # Shape: (1, N)
84
+ await self.audio_queue.put(audio_array)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
+ if msg.text: # Text response from AI
87
+ current_text += msg.text
88
+ print(f"πŸ€– AI: {msg.text}")
89
+
90
+ # Add to conversation history when response is complete
91
+ if msg.text.endswith((".", "!", "?", "\n")):
92
+ self.conversation_history.append({"role": "assistant", "content": current_text.strip(), "timestamp": time.time()})
93
+ current_text = ""
94
+
95
+ # Keep conversation history manageable
96
+ if len(self.conversation_history) > 20:
97
+ self.conversation_history = self.conversation_history[-15:]
98
+
99
+ await self.text_queue.put(msg.text)
100
 
101
  except Exception as e:
102
+ print(f"❌ Error handling AI responses: {e}")
103
 
104
+ async def _periodic_context_update(self):
105
+ """Periodically send context updates to maintain session state"""
106
+ while self.connected:
107
+ await asyncio.sleep(30) # Update every 30 seconds
108
+
109
+ if self.session and len(self.conversation_history) > 0:
110
+ try:
111
+ # Send a subtle context maintenance message
112
+ context_msg = "Continue monitoring and providing assistance as needed."
113
+ await self.session.send_realtime_input(text=context_msg)
114
+ except Exception as e:
115
+ print(f"⚠️ Context update failed: {e}")
116
+
117
+ async def receive(self, frame: tuple[int, np.ndarray]):
118
+ """Handle incoming audio with voice activity detection"""
119
  if not self.connected or not self.session:
120
  return
121
 
122
  try:
123
  _, audio_np = frame
124
 
125
+ # Basic voice activity detection
126
+ audio_level = np.abs(audio_np).mean()
127
+ if audio_level > 0.01: # Threshold for voice activity
128
+ audio_bytes = audio_np.tobytes()
129
 
130
+ # Send audio to Google GenAI Live API
131
+ await self.session.send_realtime_input(media=types.Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  except Exception as e:
134
+ print(f"❌ Error processing audio: {e}")
135
 
136
    async def video_receive(self, frame: np.ndarray):
        """Handle incoming video frames with intelligent frame selection.

        Throttles outgoing frames to roughly one per second, halving the
        interval when consecutive frames differ (activity), then resizes,
        JPEG-encodes, and forwards the frame to the Live API session.
        """
        if not self.connected or not self.session:
            return

        try:
            current_time = time.time()

            # Adaptive frame rate based on activity
            # Send frames more frequently if there's likely activity
            # NOTE(review): mean-abs pixel diff is on a 0-255 scale, so a 0.1
            # threshold is exceeded by almost any change/noise — in practice
            # the fast interval is nearly always selected; confirm intent.
            frame_diff_threshold = 0.1
            if hasattr(self, "last_frame"):
                frame_diff = np.abs(frame.astype(float) - self.last_frame.astype(float)).mean()
                if frame_diff > frame_diff_threshold:
                    # More activity detected, reduce interval
                    effective_interval = self.frame_interval * 0.5
                else:
                    effective_interval = self.frame_interval
            else:
                # First frame ever: no baseline to diff against yet.
                effective_interval = self.frame_interval

            # Drop the frame entirely if we sent one too recently.
            if current_time - self.last_frame_time < effective_interval:
                return

            self.last_frame_time = current_time
            self.last_frame = frame.copy()

            # Resize frame for efficiency while maintaining quality
            height, width = frame.shape[:2]
            if width > 1280:
                scale = 1280 / width
                new_width = 1280
                new_height = int(height * scale)
                frame = cv2.resize(frame, (new_width, new_height), interpolation=cv2.INTER_AREA)

            # Encode frame as JPEG with optimized quality
            success, jpg_bytes = cv2.imencode(
                ".jpg",
                frame,
                [cv2.IMWRITE_JPEG_QUALITY, 75],  # Balanced quality/size
            )

            if not success:
                # Encoding failed; silently skip this frame.
                return

            # Send frame to Google GenAI
            await self.session.send_realtime_input(media=types.Blob(data=jpg_bytes.tobytes(), mime_type="image/jpeg"))

            # len(jpg_bytes) is the encoded byte count (1-D uint8 ndarray).
            print(f"πŸ“Έ Sent frame ({frame.shape[1]}x{frame.shape[0]}, {len(jpg_bytes)} bytes)")

        except Exception as e:
            print(f"❌ Error processing video frame: {e}")
188
 
189
  async def emit(self):
190
+ """Provide audio output back to user with queue management"""
191
  try:
192
  audio_chunk = self.audio_queue.get_nowait()
193
  return (24000, audio_chunk)
 
195
  return None
196
 
197
  async def get_latest_text(self):
198
+ """Get latest text response for UI updates"""
199
  try:
200
  text = self.text_queue.get_nowait()
201
  return text
202
  except asyncio.QueueEmpty:
203
  return None
204
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  async def shutdown(self):
206
+ """Enhanced cleanup with proper resource management"""
207
  self.connected = False
208
 
209
+ if self.session:
210
+ try:
211
+ # Send goodbye message
212
+ await self.session.send_realtime_input(text="Session ending. Thank you!")
213
+
214
+ await asyncio.sleep(0.5) # Brief delay for message to send
215
+ await self.session.close()
216
+ print("πŸ”΄ Cleanly disconnected from GenAI Live API")
217
 
218
+ except Exception as e:
219
+ print(f"⚠️ Error during shutdown: {e}")
 
 
220
 
221
+ # Cancel all background tasks properly
222
+ if hasattr(self, "background_tasks"):
223
+ for task in self.background_tasks.copy():
224
+ if not task.done():
225
+ task.cancel()
226
+
227
+ # Wait for all tasks to complete or be cancelled
228
+ if self.background_tasks:
229
+ await asyncio.gather(*self.background_tasks, return_exceptions=True)
230
+
231
+ self.background_tasks.clear()
232
+
233
+ # Clear queues
234
  while not self.audio_queue.empty():
235
  try:
236
  self.audio_queue.get_nowait()
 
243
  except asyncio.QueueEmpty:
244
  break
245
 
246
+ self.session = None
247
+ self.conversation_history = []
248
 
 
 
 
 
 
 
 
249
 
250
# Global state management
# Shared by all UI callbacks: the FastRTC stream, the active handler, and
# coarse connection / screen-share flags reflected in the status display.
app_state = {"stream": None, "handler": None, "connected": False, "screen_sharing": False}
252
 
 
 
 
 
 
 
 
 
253
 
254
def initialize_stream():
    """Initialize the FastRTC stream with enhanced configuration.

    Creates the handler, registers it in app_state, and builds the Stream.
    Returns the Stream on success or None on failure (error is printed).
    """
    try:
        # Create enhanced handler
        handler = EnhancedScreenAssistantHandler()
        # app_state keeps the raw handler; the Stream wraps it below.
        app_state["handler"] = handler

        # Create stream with optimized settings for HF Spaces
        stream = Stream(
            handler=ReplyOnPause(handler),  # Add voice activity detection
            # NOTE(review): ReplyOnPause is an audio-oriented wrapper; confirm
            # it is compatible with an audio-video handler and this modality.
            modality="audio-video",
            mode="send-receive",
            rtc_configuration=get_cloudflare_turn_credentials_async,
            time_limit=600,  # 10 minute session limit
            ui_args={
                "audio_controls": True,
                "video_controls": True,
            },
        )

        app_state["stream"] = stream
        return stream

    except Exception as e:
        print(f"❌ Error initializing stream: {e}")
        return None
280
 
281
+
282
def handle_connect():
    """Validate prerequisites and mark the session as connecting.

    Returns a status string for the UI; only flips the connected flag when
    an API key is set, no session is already live, and a handler exists.
    """
    if not API_KEY:
        return "❌ Please set GEMINI_API_KEY environment variable"

    if app_state["connected"]:
        return "βœ… Already connected - session is active"

    if not app_state["handler"]:
        return "❌ Stream not initialized - please refresh the page"

    app_state["connected"] = True
    return "βœ… Connecting to AI... Please allow microphone and camera permissions"
 
 
295
 
 
 
296
 
297
def handle_screen_share():
    """Flip the screen-sharing flag and report the new state to the UI."""
    sharing = not app_state["screen_sharing"]
    app_state["screen_sharing"] = sharing

    return (
        "πŸ–₯️ Screen sharing started - AI can now see your screen"
        if sharing
        else "πŸ“± Switched back to camera view"
    )
305
 
 
306
 
307
async def handle_disconnect_async():
    """Tear down the active AI session and reset the shared state flags.

    Returns a status string for the UI; errors from shutdown are reported
    rather than raised.
    """
    if not (app_state["handler"] and app_state["connected"]):
        return "Already disconnected"

    try:
        await app_state["handler"].shutdown()
        app_state["connected"] = False
        app_state["screen_sharing"] = False
        app_state["handler"] = None
        return "πŸ”΄ Disconnected from AI assistant"
    except Exception as e:
        return f"⚠️ Disconnect error: {e}"
 
 
320
 
 
321
 
322
def handle_disconnect():
    """Sync wrapper for the async disconnection handler.

    Schedules the async teardown without blocking the UI callback and flips
    the visible flags immediately so the status updates at once.
    """
    # BUG FIX: the previous guard used hasattr() on a dict, which is always
    # False for keys, so it never consulted the existing task (and the
    # .get(..., {}).done() fallback would raise AttributeError if reached).
    # Use a key lookup so a still-running disconnect is not scheduled twice.
    existing = app_state.get("disconnect_task")
    if existing is None or existing.done():
        app_state["disconnect_task"] = asyncio.create_task(handle_disconnect_async())
        app_state["connected"] = False  # Immediately mark as disconnected
        app_state["screen_sharing"] = False

    return "πŸ”„ Disconnecting... Please wait..."
333
+
334
+
335
+ # Enhanced JavaScript for screen sharing
336
+ enhanced_screen_share_js = """
337
+ async function toggleScreenShare() {
338
+ try {
339
+ const videoElements = document.querySelectorAll('video');
340
+ const webrtcVideo = Array.from(videoElements).find(video =>
341
+ video.srcObject && video.srcObject.getVideoTracks().length > 0
342
+ );
343
+
344
+ if (!webrtcVideo) {
345
+ return "❌ Could not find video element";
346
+ }
347
+
348
+ const currentTrack = webrtcVideo.srcObject.getVideoTracks()[0];
349
+ const isScreenShare = currentTrack && currentTrack.label.includes('screen');
350
+
351
+ if (isScreenShare) {
352
+ // Switch back to camera
353
+ const cameraStream = await navigator.mediaDevices.getUserMedia({
354
+ video: { width: 640, height: 480 },
355
+ audio: false
356
+ });
357
+
358
+ const videoTrack = cameraStream.getVideoTracks()[0];
359
+ webrtcVideo.srcObject.removeTrack(currentTrack);
360
+ webrtcVideo.srcObject.addTrack(videoTrack);
361
+
362
+ currentTrack.stop();
363
+ return "πŸ“± Switched to camera view";
364
+
365
+ } else {
366
+ // Switch to screen share
367
+ const screenStream = await navigator.mediaDevices.getDisplayMedia({
368
+ video: {
369
+ mediaSource: 'screen',
370
+ width: { ideal: 1280, max: 1920 },
371
+ height: { ideal: 720, max: 1080 },
372
+ frameRate: { ideal: 2, max: 5 } // Low frame rate for efficiency
373
+ },
374
+ audio: false
375
+ });
376
+
377
+ const videoTrack = screenStream.getVideoTracks()[0];
378
+ webrtcVideo.srcObject.removeTrack(currentTrack);
379
+ webrtcVideo.srcObject.addTrack(videoTrack);
380
+
381
+ // Handle when screen sharing ends
382
+ videoTrack.onended = () => {
383
+ console.log('Screen sharing ended by user');
384
+ // Automatically switch back to camera
385
+ navigator.mediaDevices.getUserMedia({video: true, audio: false})
386
+ .then(cameraStream => {
387
+ const cameraTrack = cameraStream.getVideoTracks()[0];
388
+ webrtcVideo.srcObject.addTrack(cameraTrack);
389
+ });
390
+ };
391
+
392
+ currentTrack.stop();
393
+ return "πŸ–₯️ Screen sharing active";
394
+ }
395
+
396
+ } catch (error) {
397
+ console.error('Screen sharing error:', error);
398
+ if (error.name === 'NotAllowedError') {
399
+ return "❌ Screen sharing permission denied";
400
+ } else if (error.name === 'NotFoundError') {
401
+ return "❌ No screen available to share";
402
+ } else {
403
+ return `❌ Error: ${error.message}`;
404
+ }
405
+ }
406
+ }
407
 
408
+ return toggleScreenShare();
409
+ """
410
 
 
411
 
412
+ def create_main_interface():
413
+ """Create the enhanced main interface"""
 
 
 
 
 
414
 
415
+ # Initialize stream
416
+ stream = initialize_stream()
 
 
417
 
418
  with gr.Blocks(
419
+ title="Enhanced Real-Time Screen Assistant",
420
+ theme=gr.themes.Soft(),
421
+ css="""
422
+ .status-connected { background: linear-gradient(90deg, #4CAF50, #45a049); color: white; }
423
+ .status-disconnected { background: linear-gradient(90deg, #f44336, #da190b); color: white; }
424
+ .status-warning { background: linear-gradient(90deg, #ff9800, #f57c00); color: white; }
425
+ .control-row { margin: 10px 0; }
426
+ .stream-container { border: 2px solid #ddd; border-radius: 10px; padding: 20px; margin: 20px 0; }
427
+ """,
428
  ) as demo:
429
+ gr.Markdown("# πŸ–₯️ Enhanced Real-Time Screen Assistant")
 
430
  gr.Markdown("""
431
+ **Advanced AI assistant with live screen sharing, voice interaction, and real-time guidance**
432
+
433
+ Powered by Google's Gemini Live API and FastRTC for ultra-low latency communication.
 
 
 
 
 
 
 
 
 
 
 
 
434
  """)
435
 
436
+ # Status display
437
+ status_display = gr.Textbox(
438
+ label="πŸ” Status",
439
+ value="Ready to connect - Click Connect to start your AI session",
440
+ interactive=False,
441
+ elem_classes=["status-disconnected"],
442
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
443
 
444
+ # Control buttons
445
+ with gr.Row(elem_classes=["control-row"]):
446
+ connect_btn = gr.Button("πŸ”— Connect to AI", variant="primary", size="lg")
447
+ screen_btn = gr.Button("πŸ–₯️ Toggle Screen Share", variant="secondary", size="lg")
448
+ disconnect_btn = gr.Button("πŸ”΄ Disconnect", variant="stop", size="lg")
449
+
450
+ # Stream container
451
+ if stream and stream.ui:
452
+ with gr.Group(elem_classes=["stream-container"]):
453
+ gr.Markdown("### πŸ“‘ Live Stream")
454
+ stream_interface = stream.ui
455
  else:
456
+ stream_interface = gr.HTML("<div>⚠️ Stream initialization failed - check console for errors</div>")
457
 
458
+ # Usage instructions
459
+ with gr.Accordion("πŸ“‹ How to Use This Assistant", open=True):
460
  gr.Markdown("""
461
+ **Getting Started:**
462
+ 1. **Connect**: Click "Connect to AI" to establish the AI session
463
+ 2. **Permissions**: Allow microphone and camera access in your browser
464
+ 3. **Screen Share**: Click "Toggle Screen Share" to let the AI see your screen
465
+ 4. **Interact**: Simply speak naturally - the AI will respond with voice and can see your screen
466
+
467
+ **What the AI can help with:**
468
+ - πŸ–₯️ **Software tutorials**: "Show me how to use this feature"
469
+ - πŸ”§ **Troubleshooting**: "Why isn't this working?"
470
+ - πŸ“Š **Data analysis**: "Help me understand this chart"
471
+ - 🎨 **Design feedback**: "How can I improve this layout?"
472
+ - πŸ“ **Writing assistance**: "Help me edit this document"
473
+ - 🌐 **Web navigation**: "Guide me through this website"
474
+
475
+ **Voice Commands:**
476
+ - "What am I looking at?"
477
+ - "What should I do next?"
478
+ - "Explain this to me"
479
+ - "Help me fix this error"
480
+ - "Is this the right approach?"
481
  """)
482
 
483
+ # Advanced features
484
+ with gr.Accordion("βš™οΈ Advanced Features", open=False):
485
  gr.Markdown("""
486
+ **Technical Capabilities:**
487
+ - πŸŽ™οΈ **Voice Activity Detection**: AI responds when you finish speaking
488
+ - πŸ“Έ **Intelligent Frame Sampling**: Optimized screen capture (1-2 FPS)
489
+ - 🧠 **Context Awareness**: AI remembers your conversation history
490
+ - πŸ”„ **Adaptive Quality**: Automatically adjusts based on connection
491
+ - ⚑ **Ultra-Low Latency**: Typical response time under 500ms
492
+
493
+ **Privacy & Security:**
494
+ - πŸ”’ All data encrypted in transit (WebRTC + TLS)
495
+ - 🏠 Processing by Google's secure AI infrastructure
496
+ - 🚫 No permanent storage of your screen or voice data
497
+ - πŸ‘€ Each session is completely isolated and private
498
+
499
+ **Optimization for Hugging Face Spaces:**
500
+ - ☁️ Cloudflare TURN servers for reliable connectivity
501
+ - πŸ”§ Automatic resource management and cleanup
502
+ - ⏱️ Session timeout prot