File size: 5,050 Bytes
f40004d
 
 
 
 
 
 
 
 
0f20060
f40004d
 
0f20060
f40004d
0f20060
f40004d
 
 
 
 
 
 
 
0f20060
f40004d
0f20060
f40004d
 
0f20060
f40004d
 
 
 
 
 
 
 
 
 
0d25cdd
f40004d
0f20060
719147e
0d25cdd
0f20060
f40004d
 
 
0f20060
 
f40004d
719147e
0f20060
 
 
 
 
719147e
0f20060
719147e
0f20060
 
 
719147e
f8857ac
 
0f20060
f8857ac
 
 
 
719147e
f8857ac
 
 
0f20060
f8857ac
 
 
719147e
f8857ac
 
 
 
 
0f20060
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8857ac
 
0f20060
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import asyncio
import base64
import io
import os
import time
import numpy as np
import cv2
from PIL import Image
from google import genai
from fastrtc import AsyncAudioVideoStreamHandler, wait_for_item, WebRTCError, WebRTC, get_cloudflare_turn_credentials
import gradio as gr

# --- Encoder Helpers ---
def encode_audio(data: np.ndarray) -> dict:
    """Pack a raw PCM audio buffer into the base64 payload dict sent to Gemini Live."""
    raw_bytes = data.tobytes()
    payload = base64.b64encode(raw_bytes).decode("UTF-8")
    return {"mime_type": "audio/pcm", "data": payload}

def encode_image(data: np.ndarray) -> dict:
    """Encode a video frame as a base64 JPEG payload for the Gemini Live API.

    Three-channel frames are assumed to arrive in OpenCV's BGR channel order
    (NOTE(review): confirm against the fastrtc frame source) and are converted
    to RGB before encoding. Four-channel (BGRA) frames previously crashed
    PIL's JPEG writer ("cannot write mode RGBA as JPEG"); the alpha channel is
    now dropped during the same conversion. Grayscale (2-D) frames pass
    through unchanged.

    Args:
        data: HxW, HxWx3 (BGR) or HxWx4 (BGRA) uint8 frame.

    Returns:
        {"mime_type": "image/jpeg", "data": <base64 str>} — the wire format
        consumed by session.send() in GeminiLiveHandler.
    """
    if data.ndim == 3:
        if data.shape[2] == 3:
            data = cv2.cvtColor(data, cv2.COLOR_BGR2RGB)
        elif data.shape[2] == 4:
            # JPEG cannot carry an alpha channel; drop it while fixing the order.
            data = cv2.cvtColor(data, cv2.COLOR_BGRA2RGB)
    with io.BytesIO() as output_bytes:
        pil_image = Image.fromarray(data)
        # Bound the payload size sent over the live session.
        pil_image.thumbnail((1024, 1024))
        pil_image.save(output_bytes, "JPEG")
        return {"mime_type": "image/jpeg", "data": base64.b64encode(output_bytes.getvalue()).decode("utf-8")}

# --- Gemini Handler ---
class GeminiLiveHandler(AsyncAudioVideoStreamHandler):
    """fastrtc handler bridging a browser WebRTC stream to a Gemini Live session.

    Audio flow: browser mic -> receive() -> Gemini; Gemini replies -> audio_queue
    -> emit() -> browser. Video flow: browser camera -> video_receive() -> Gemini
    (when a video mode is selected) and echoed back via video_emit().
    latest_args comes from the webrtc.stream(inputs=[webrtc, instruction, mode])
    wiring later in this file: [1] is the system-instruction text, [2] the video mode.
    """
    def __init__(self) -> None:
        # 16 kHz mono in (what Gemini Live accepts), 24 kHz out (what it produces).
        super().__init__(expected_layout="mono", output_sample_rate=24000, input_sample_rate=16000)
        # Audio chunks received from Gemini, awaiting emission to the browser.
        self.audio_queue = asyncio.Queue()
        # Frames received from the browser, awaiting echo via video_emit().
        self.video_queue = asyncio.Queue()
        # Live API session; None until start_up() connects.
        self.session = None
        # Set by shutdown() to stop the receive loop in start_up().
        self.quit = asyncio.Event()

    def copy(self) -> "GeminiLiveHandler":
        """Return a fresh handler instance (fastrtc creates one per connection)."""
        return GeminiLiveHandler()

    async def start_up(self):
        """Connect to Gemini Live and pump its audio responses into audio_queue.

        Runs for the lifetime of the connection; the `async with` keeps the
        session open while receive()/emit() use it concurrently.

        Raises:
            WebRTCError: if GEMINI_API_KEY is unset or the connection fails.
        """
        # Block until the UI inputs (instruction, mode) have arrived.
        await self.wait_for_args()
        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise WebRTCError("API Key missing! Please set GEMINI_API_KEY in Secrets.")

        # latest_args[1] is the System Instruction textbox (see webrtc.stream inputs).
        system_instruction = self.latest_args[1]
        client = genai.Client(api_key=api_key, http_options={"api_version": "v1beta"})
        
        config = {
            "response_modalities": ["AUDIO"],
            "system_instruction": system_instruction or "You are a helpful AI assistant.",
            "speech_config": {"voice_config": {"prebuilt_voice_config": {"voice_name": "Zephyr"}}}
        }

        try:
            async with client.aio.live.connect(model="gemini-2.0-flash-exp", config=config) as session:
                self.session = session
                # Bot speaks first to confirm connection
                await self.session.send(input="Hello! I am connected and ready. How can I help?", end_of_turn=True)
                async for response in self.session.receive():
                    if self.quit.is_set(): break
                    # response.data is raw 16-bit PCM; reshape to (1, n) mono frames.
                    if data := response.data:
                        self.audio_queue.put_nowait(np.frombuffer(data, dtype=np.int16).reshape(1, -1))
        except Exception as e:
            raise WebRTCError(f"Connection Error: {str(e)}")

    async def video_receive(self, frame: np.ndarray):
        """Queue the incoming frame for echo, and forward it to Gemini if a video mode is active."""
        self.video_queue.put_nowait(frame)
        # latest_args[2] is the video-mode radio ("camera"/"screen"/"none").
        # NOTE(review): assumes latest_args is populated before frames arrive — confirm.
        if self.latest_args[2] != "none" and self.session:
            await self.session.send(input=encode_image(frame))

    async def video_emit(self) -> np.ndarray:
        """Return the next queued frame, or a black 480x640 frame if none is ready in 10 ms."""
        frame = await wait_for_item(self.video_queue, 0.01)
        return frame if frame is not None else np.zeros((480, 640, 3), dtype=np.uint8)

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        """Forward one (sample_rate, samples) mic frame to Gemini as PCM."""
        if self.session:
            await self.session.send(input=encode_audio(frame[1].squeeze()))

    async def emit(self):
        """Return the next (sample_rate, chunk) of Gemini audio, or None if nothing is ready in 10 ms."""
        array = await wait_for_item(self.audio_queue, 0.01)
        return (self.output_sample_rate, array) if array is not None else None

    async def shutdown(self) -> None:
        """Signal the receive loop to stop and close the live session."""
        if self.session:
            self.quit.set()
            await self.session.close()

# --- Custom UI ---
# --- Custom UI ---
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ Gemini Live: Voice & Vision")

    with gr.Row():
        with gr.Column(scale=1):
            instruction = gr.Textbox(label="System Instruction", value="Be helpful and concise.")
            # 1. User selects mode (read by the handler as latest_args[2]).
            mode = gr.Radio(choices=["camera", "screen", "none"], label="Select Video Mode")
            # 2. Start button is hidden until mode is selected
            start_btn = gr.Button("🚀 Start Conversation", variant="primary", visible=False)

        with gr.Column(scale=2):
            # 3. WebRTC component is hidden until Start is clicked
            webrtc = WebRTC(
                label="Gemini Live Stream",
                modality="audio-video",
                mode="send-receive",
                visible=False,
                # Pass the *function*, not its result: Cloudflare TURN credentials
                # are short-lived, so fetching them once at app build time leaves
                # later sessions with expired credentials. fastrtc calls the
                # function per connection when given a callable.
                rtc_configuration=get_cloudflare_turn_credentials
            )

    # Show start button once a radio option is picked
    mode.change(lambda x: gr.update(visible=True) if x else gr.update(visible=False), [mode], [start_btn])

    # When Start is clicked, show the video/audio interface
    def on_start():
        """Reveal the hidden WebRTC component."""
        return gr.update(visible=True)

    start_btn.click(on_start, None, [webrtc])

    # Connect the WebRTC stream to the handler; these inputs become
    # latest_args inside it ([0]=webrtc, [1]=instruction, [2]=mode).
    webrtc.stream(
        fn=GeminiLiveHandler(),
        inputs=[webrtc, instruction, mode],
        outputs=[webrtc],
        time_limit=900
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)