# Hugging Face Space upload metadata (kamcio1989, commit 618cf4f verified),
# preserved as a comment so the file remains valid Python.
import gradio as gr
import cv2
import numpy as np
import os
from utils import (
register_new_face,
process_video_frame,
generate_gemini_response,
draw_overlays
)
# --- Global State Initialization ---
# In a real deployment, you might use a database.
# For this demo, we use Gradio State for session-specific storage.
def create_app():
    """Build and return the Gradio Blocks UI for the identity-aware chat demo.

    The app has three tabs:
      1. Live Interaction — webcam face recognition + voice chat with Gemini.
      2. Registration — enroll a new face under a name.
      3. Settings — Gemini API key and system persona.

    All mutable data lives in per-session ``gr.State`` objects, so nothing is
    shared between users.

    Returns:
        gr.Blocks: the assembled (unlaunched) Gradio app.
    """
    with gr.Blocks(title="Gemini Live Identity Chat", theme=gr.themes.Soft()) as demo:
        # --- State Variables ---
        # known_faces: dict {name: encoding}
        known_faces_state = gr.State(value={})
        # current_user: str — name of the most recently recognized person
        current_user_state = gr.State(value="Unknown")
        # chat_history: list of [user_msg, bot_msg]
        history_state = gr.State(value=[])
        # current_frame: stores the last webcam frame for multimodal queries
        last_frame_state = gr.State(value=None)

        # --- Header ---
        with gr.Row(elem_classes="header"):
            gr.Markdown(
                """
# 🎙️ Gemini Live Identity Chat
[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
"""
            )

        # --- Main Layout ---
        with gr.Tabs():
            # TAB 1: Live Interaction
            with gr.Tab("💬 Live Interaction"):
                with gr.Row():
                    # Left Column: Vision & Identity
                    with gr.Column(scale=1):
                        gr.Markdown("### 👁️ Vision & Identity")
                        # Input webcam for face recognition
                        input_webcam = gr.Image(
                            label="Live Feed",
                            sources=["webcam"],
                            streaming=True,
                            type="numpy"
                        )
                        # Status display
                        user_status = gr.Markdown(
                            value="**👤 Detected:** Unknown",
                            elem_id="status-box"
                        )
                        # Multimodal toggle
                        use_vision_toggle = gr.Checkbox(
                            label="👀 Allow Gemini to see this video frame",
                            value=False,
                            info="If checked, the current image will be sent with your audio."
                        )
                    # Right Column: Chat
                    with gr.Column(scale=2):
                        gr.Markdown("### 🗣️ Conversation")
                        chatbot = gr.Chatbot(
                            label="Chat History",
                            height=500,
                            type="messages",
                            avatar_images=(None, "https://www.gstatic.com/lamda/images/gemini_sparkle_v002_d4735304ff6292a690345.svg")
                        )
                        with gr.Row():
                            audio_input = gr.Audio(
                                sources=["microphone"],
                                type="filepath",
                                label="Voice Input (Recording stops automatically)",
                                editable=False
                            )
                        clear_btn = gr.Button("Clear Conversation", variant="secondary")

            # TAB 2: Registration
            with gr.Tab("👤 Registration"):
                gr.Markdown("### Register a New Face")
                with gr.Row():
                    with gr.Column():
                        reg_name = gr.Textbox(label="Name", placeholder="Enter your name")
                        reg_image = gr.Image(label="Upload Photo", sources=["upload", "webcam"], type="numpy")
                        reg_btn = gr.Button("Register Face", variant="primary")
                    with gr.Column():
                        gr.Markdown("### Registered Users")
                        registered_list = gr.JSON(label="Database", value={})

            # TAB 3: Configuration
            with gr.Tab("⚙️ Settings"):
                gr.Markdown("### App Configuration")
                api_key_input = gr.Textbox(
                    label="Gemini API Key",
                    type="password",
                    placeholder="Paste your Google AI Studio Key here",
                    info="Required for chat functionality."
                )
                system_prompt_input = gr.Textbox(
                    label="System Persona",
                    value="You are a helpful, conversational assistant. Keep responses concise.",
                    lines=3
                )

        # --- Event Wiring ---
        # 1. Face Recognition Loop
        # This stream processes frames, updates the 'current_user', and
        # returns the annotated image.
        input_webcam.stream(
            fn=process_video_frame,
            inputs=[input_webcam, known_faces_state],
            outputs=[input_webcam, current_user_state, user_status, last_frame_state],
            time_limit=None,
            stream_every=0.1  # Limit FPS for performance
        )

        # 2. Audio Chat Interaction
        # Triggered when the user stops recording audio.
        audio_input.stop_recording(
            fn=generate_gemini_response,
            inputs=[
                audio_input,
                history_state,
                current_user_state,
                api_key_input,
                system_prompt_input,
                use_vision_toggle,
                last_frame_state
            ],
            outputs=[history_state, chatbot, audio_input]  # Clear audio input after sending
        )

        # 3. Registration Logic
        reg_btn.click(
            fn=register_new_face,
            inputs=[reg_name, reg_image, known_faces_state],
            outputs=[known_faces_state, registered_list, reg_name, reg_image]
        )

        # 4. Clear Chat
        def clear_history():
            # Reset both the server-side history state and the visible chatbot.
            return [], []

        clear_btn.click(clear_history, None, [history_state, chatbot])

    return demo
if __name__ == "__main__":
    # Build the UI and start the Gradio server in one step.
    create_app().launch()