Spaces:

IFMedTech
/

audio_chat

Paused

App Files Files Community

IFMedTechdemo commited on Dec 23, 2025

Commit

0f20060

verified ·

1 Parent(s): 719147e

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -83

app.py CHANGED Viewed

@@ -1,7 +1,3 @@
-"""
-Gradio Application for Gemini Live API with Audio + Video Streaming
-"""
 import asyncio
 import base64
 import io
@@ -9,103 +5,66 @@ import os
 import time
 import numpy as np
 import cv2
-import websockets
 from PIL import Image
 from google import genai
-from fastrtc import AsyncAudioVideoStreamHandler, wait_for_item, WebRTCError, Stream, get_cloudflare_turn_credentials
 import gradio as gr
-# Encoder functions for Gemini API
 def encode_audio(data: np.ndarray) -> dict:
-    """Encode audio data (int16 mono) for Gemini."""
-    return {
-        "mime_type": "audio/pcm",
-        "data": base64.b64encode(data.tobytes()).decode("UTF-8"),
-    }
 def encode_image(data: np.ndarray) -> dict:
-    """Encode image data as JPEG for Gemini."""
     if len(data.shape) == 3 and data.shape[2] == 3:
         data = cv2.cvtColor(data, cv2.COLOR_BGR2RGB)
     with io.BytesIO() as output_bytes:
         pil_image = Image.fromarray(data)
         pil_image.thumbnail([1024, 1024])
         pil_image.save(output_bytes, "JPEG")
-        bytes_data = output_bytes.getvalue()
-        base64_str = str(base64.b64encode(bytes_data), "utf-8")
-        return {"mime_type": "image/jpeg", "data": base64_str}
 class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
     def __init__(self) -> None:
-        super().__init__(
-            expected_layout="mono",
-            output_sample_rate=24000,
-            input_sample_rate=16000,
-        )
         self.audio_queue = asyncio.Queue()
         self.video_queue = asyncio.Queue()
         self.session = None
-        self.last_frame_time = 0.0
         self.quit = asyncio.Event()
     def copy(self) -> "GeminiLiveHandler":
         return GeminiLiveHandler()
     async def start_up(self):
-        """Initialize Gemini Live API session"""
         await self.wait_for_args()
         api_key = os.environ.get("GEMINI_API_KEY")
         if not api_key:
-            raise WebRTCError("Gemini API Key not found in Space secrets.")
         system_instruction = self.latest_args[1]
-        client = genai.Client(
-            api_key=api_key,
-            http_options={"api_version": "v1beta"}
-        )
         config = {
             "response_modalities": ["AUDIO"],
-            "system_instruction": system_instruction or "You are a helpful assistant.",
-            "speech_config": {
-                "voice_config": {"prebuilt_voice_config": {"voice_name": "Zephyr"}}
-            }
         }
-        # Use the standard preview model gemini-2.0-flash-exp
-        async with client.aio.live.connect(
-            model="gemini-2.0-flash-exp",
-            config=config,
-        ) as session:
-            self.session = session
-            # --- ADDED: Initial greeting to make the bot speak first ---
-            await self.session.send(input="Hello! I'm connected and ready to help.", end_of_turn=True)
-            # Listen for responses continuously
-            try:
                 async for response in self.session.receive():
-                    if self.quit.is_set():
-                        break
                     if data := response.data:
-                        audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
-                        self.audio_queue.put_nowait(audio)
-                    if text := response.text:
-                        print(f"Gemini: {text}")
-            except Exception as e:
-                print(f"Session error: {e}")
     async def video_receive(self, frame: np.ndarray):
         self.video_queue.put_nowait(frame)
-        video_mode = self.latest_args[2]
-        if video_mode != "none" and self.session and (time.time() - self.last_frame_time > 1.0):
-            self.last_frame_time = time.time()
             await self.session.send(input=encode_image(frame))
     async def video_emit(self) -> np.ndarray:
@@ -113,10 +72,8 @@ class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
         return frame if frame is not None else np.zeros((480, 640, 3), dtype=np.uint8)
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
-        _, array = frame
-        array = array.squeeze()
         if self.session:
-            await self.session.send(input=encode_audio(array))
     async def emit(self):
         array = await wait_for_item(self.audio_queue, 0.01)
@@ -126,24 +83,45 @@ class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
         if self.session:
             self.quit.set()
             await self.session.close()
-            self.quit.clear()
-stream = Stream(
-    handler=GeminiLiveHandler(),
-    modality="audio-video",
-    mode="send-receive",
-    server_rtc_configuration=get_cloudflare_turn_credentials(),
-    rtc_configuration=get_cloudflare_turn_credentials(),
-    additional_inputs=[
-        gr.Markdown("## 🎙️ Gemini Live - Real-Time Voice & Vision\n\nClick the **Connect/Start** button to begin."),
-        gr.Textbox(label="System Instruction", value="You are a helpful and concise AI assistant."),
-        gr.Radio(choices=["camera", "screen", "none"], value="camera", label="Video Mode"),
-    ],
-    ui_args={
-        "title": "Gemini Live Assistant",
-        "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png"
-    }
-)
 if __name__ == "__main__":
-    stream.ui.launch(server_name="0.0.0.0", server_port=7860)

 import asyncio
 import base64
 import io
 import time
 import numpy as np
 import cv2
 from PIL import Image
 from google import genai
+from fastrtc import AsyncAudioVideoStreamHandler, wait_for_item, WebRTCError, WebRTC, get_cloudflare_turn_credentials
 import gradio as gr
+# --- Encoder Helpers ---
 def encode_audio(data: np.ndarray) -> dict:
+    return {"mime_type": "audio/pcm", "data": base64.b64encode(data.tobytes()).decode("UTF-8")}
 def encode_image(data: np.ndarray) -> dict:
     if len(data.shape) == 3 and data.shape[2] == 3:
         data = cv2.cvtColor(data, cv2.COLOR_BGR2RGB)
     with io.BytesIO() as output_bytes:
         pil_image = Image.fromarray(data)
         pil_image.thumbnail([1024, 1024])
         pil_image.save(output_bytes, "JPEG")
+        return {"mime_type": "image/jpeg", "data": base64.b64encode(output_bytes.getvalue()).decode("utf-8")}
+# --- Gemini Handler ---
 class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
     def __init__(self) -> None:
+        super().__init__(expected_layout="mono", output_sample_rate=24000, input_sample_rate=16000)
         self.audio_queue = asyncio.Queue()
         self.video_queue = asyncio.Queue()
         self.session = None
         self.quit = asyncio.Event()
     def copy(self) -> "GeminiLiveHandler":
         return GeminiLiveHandler()
     async def start_up(self):
         await self.wait_for_args()
         api_key = os.environ.get("GEMINI_API_KEY")
         if not api_key:
+            raise WebRTCError("API Key missing! Please set GEMINI_API_KEY in Secrets.")
         system_instruction = self.latest_args[1]
+        client = genai.Client(api_key=api_key, http_options={"api_version": "v1beta"})
         config = {
             "response_modalities": ["AUDIO"],
+            "system_instruction": system_instruction or "You are a helpful AI assistant.",
+            "speech_config": {"voice_config": {"prebuilt_voice_config": {"voice_name": "Zephyr"}}}
         }
+        try:
+            async with client.aio.live.connect(model="gemini-2.0-flash-exp", config=config) as session:
+                self.session = session
+                # Bot speaks first to confirm connection
+                await self.session.send(input="Hello! I am connected and ready. How can I help?", end_of_turn=True)
                 async for response in self.session.receive():
+                    if self.quit.is_set(): break
                     if data := response.data:
+                        self.audio_queue.put_nowait(np.frombuffer(data, dtype=np.int16).reshape(1, -1))
+        except Exception as e:
+            raise WebRTCError(f"Connection Error: {str(e)}")
     async def video_receive(self, frame: np.ndarray):
         self.video_queue.put_nowait(frame)
+        if self.latest_args[2] != "none" and self.session:
             await self.session.send(input=encode_image(frame))
     async def video_emit(self) -> np.ndarray:
         return frame if frame is not None else np.zeros((480, 640, 3), dtype=np.uint8)
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         if self.session:
+            await self.session.send(input=encode_audio(frame[1].squeeze()))
     async def emit(self):
         array = await wait_for_item(self.audio_queue, 0.01)
         if self.session:
             self.quit.set()
             await self.session.close()
+# --- Custom UI ---
+with gr.Blocks() as demo:
+    gr.Markdown("# 🎙️ Gemini Live: Voice & Vision")
+    with gr.Row():
+        with gr.Column(scale=1):
+            instruction = gr.Textbox(label="System Instruction", value="Be helpful and concise.")
+            # 1. User selects mode
+            mode = gr.Radio(choices=["camera", "screen", "none"], label="Select Video Mode")
+            # 2. Start button is hidden until mode is selected
+            start_btn = gr.Button("🚀 Start Conversation", variant="primary", visible=False)
+        with gr.Column(scale=2):
+            # 3. WebRTC component is hidden until Start is clicked
+            webrtc = WebRTC(
+                label="Gemini Live Stream",
+                modality="audio-video",
+                mode="send-receive",
+                visible=False,
+                rtc_configuration=get_cloudflare_turn_credentials()
+            )
+    # Show start button once a radio option is picked
+    mode.change(lambda x: gr.update(visible=True) if x else gr.update(visible=False), [mode], [start_btn])
+    # When Start is clicked, show the video/audio interface
+    def on_start():
+        return gr.update(visible=True)
+    start_btn.click(on_start, None, [webrtc])
+    # Connect the WebRTC stream to the handler
+    webrtc.stream(
+        fn=GeminiLiveHandler(),
+        inputs=[webrtc, instruction, mode],
+        outputs=[webrtc],
+        time_limit=900
+    )
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)