import gradio as gr
import cv2
import numpy as np
import os
from utils import (
    register_new_face,
    process_video_frame,
    generate_gemini_response,
    draw_overlays
)

# --- Global State Initialization ---
# In a real deployment, you might use a database.
# For this demo, we use Gradio State for session-specific storage.


def create_app():
    """Build and return the Gradio Blocks UI for the identity-aware chat demo.

    All heavy lifting (face recognition, Gemini calls, overlay drawing) is
    delegated to helpers in ``utils``; this function only declares components
    and wires events.
    """
    with gr.Blocks(title="Gemini Live Identity Chat", theme=gr.themes.Soft()) as demo:
        # --- Per-session state ---
        # Mapping of registered name -> face encoding (schema owned by utils).
        known_faces_state = gr.State(value={})
        # Name of the person currently recognized in the webcam feed.
        current_user_state = gr.State(value="Unknown")
        # Conversation history; chatbot below uses type="messages", so this
        # presumably holds role/content dicts — format is defined in utils.
        history_state = gr.State(value=[])
        # Most recent webcam frame, kept so it can accompany a voice query.
        last_frame_state = gr.State(value=None)

        # --- Header ---
        with gr.Row(elem_classes="header"):
            gr.Markdown(
                """
                # 🎙️ Gemini Live Identity Chat
                [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
                """
            )

        # --- Main layout: three tabs ---
        with gr.Tabs():

            # TAB 1: live webcam + voice chat
            with gr.Tab("💬 Live Interaction"):
                with gr.Row():
                    # Left column: vision feed and recognized identity.
                    with gr.Column(scale=1):
                        gr.Markdown("### 👁️ Vision & Identity")
                        input_webcam = gr.Image(
                            label="Live Feed",
                            sources=["webcam"],
                            streaming=True,
                            type="numpy"
                        )
                        user_status = gr.Markdown(
                            value="**👤 Detected:** Unknown",
                            elem_id="status-box"
                        )
                        use_vision_toggle = gr.Checkbox(
                            label="👀 Allow Gemini to see this video frame",
                            value=False,
                            info="If checked, the current image will be sent with your audio."
                        )

                    # Right column: conversation transcript + voice input.
                    with gr.Column(scale=2):
                        gr.Markdown("### 🗣️ Conversation")
                        chatbot = gr.Chatbot(
                            label="Chat History",
                            height=500,
                            type="messages",
                            avatar_images=(None, "https://www.gstatic.com/lamda/images/gemini_sparkle_v002_d4735304ff6292a690345.svg")
                        )
                        with gr.Row():
                            audio_input = gr.Audio(
                                sources=["microphone"],
                                type="filepath",
                                label="Voice Input (Recording stops automatically)",
                                editable=False
                            )
                            clear_btn = gr.Button("Clear Conversation", variant="secondary")

            # TAB 2: enroll a new face into the in-memory database.
            with gr.Tab("👤 Registration"):
                gr.Markdown("### Register a New Face")
                with gr.Row():
                    with gr.Column():
                        reg_name = gr.Textbox(label="Name", placeholder="Enter your name")
                        reg_image = gr.Image(
                            label="Upload Photo",
                            sources=["upload", "webcam"],
                            type="numpy"
                        )
                        reg_btn = gr.Button("Register Face", variant="primary")
                    with gr.Column():
                        gr.Markdown("### Registered Users")
                        registered_list = gr.JSON(label="Database", value={})

            # TAB 3: runtime configuration (API key, persona prompt).
            with gr.Tab("⚙️ Settings"):
                gr.Markdown("### App Configuration")
                api_key_input = gr.Textbox(
                    label="Gemini API Key",
                    type="password",
                    placeholder="Paste your Google AI Studio Key here",
                    info="Required for chat functionality."
                )
                system_prompt_input = gr.Textbox(
                    label="System Persona",
                    value="You are a helpful, conversational assistant. Keep responses concise.",
                    lines=3
                )

        # --- Event wiring ---

        # 1. Face-recognition loop: each streamed frame is annotated and the
        #    detected identity / last frame are updated as session state.
        input_webcam.stream(
            fn=process_video_frame,
            inputs=[input_webcam, known_faces_state],
            outputs=[input_webcam, current_user_state, user_status, last_frame_state],
            time_limit=None,
            stream_every=0.1  # Limit FPS for performance
        )

        # 2. Voice chat: fires when the user stops recording; the audio input
        #    component is among the outputs so it can be cleared after sending.
        audio_input.stop_recording(
            fn=generate_gemini_response,
            inputs=[
                audio_input,
                history_state,
                current_user_state,
                api_key_input,
                system_prompt_input,
                use_vision_toggle,
                last_frame_state
            ],
            outputs=[history_state, chatbot, audio_input]
        )

        # 3. Registration: stores the new encoding and resets the form fields.
        reg_btn.click(
            fn=register_new_face,
            inputs=[reg_name, reg_image, known_faces_state],
            outputs=[known_faces_state, registered_list, reg_name, reg_image]
        )

        # 4. Clear chat: reset both the stored history and the visible widget.
        def clear_history():
            return [], []

        clear_btn.click(clear_history, None, [history_state, chatbot])

    return demo


if __name__ == "__main__":
    demo = create_app()
    demo.launch()