import gradio as gr
import cv2
import numpy as np
import os
from utils import (
register_new_face,
process_video_frame,
generate_gemini_response,
draw_overlays
)
# --- Global State Initialization ---
# In a real deployment, you might use a database.
# For this demo, we use Gradio State for session-specific storage.
def create_app():
    """Build and return the Gradio Blocks app for the identity-aware chat demo.

    Layout: three tabs (live webcam chat, face registration, settings) plus
    session-scoped ``gr.State`` holders. All heavy lifting (face recognition,
    Gemini calls) is delegated to the callbacks imported from ``utils``.

    Returns:
        gr.Blocks: the assembled, not-yet-launched Gradio application.
    """
    # NOTE(review): the emoji in the UI strings below appear mojibake'd by the
    # scrape this file was recovered from; preserved byte-for-byte. Confirm the
    # intended characters against the original repository.
    with gr.Blocks(title="Gemini Live Identity Chat", theme=gr.themes.Soft()) as demo:
        # --- Session-scoped state (per-browser-tab; not persisted) ---
        known_faces_state = gr.State(value={})          # {name: face encoding}
        current_user_state = gr.State(value="Unknown")  # name of last recognized user
        history_state = gr.State(value=[])              # running chat history
        last_frame_state = gr.State(value=None)         # last webcam frame, for multimodal queries

        # --- Header ---
        with gr.Row(elem_classes="header"):
            gr.Markdown(
                """
                # ποΈ Gemini Live Identity Chat
                [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
                """
            )

        # --- Main Layout ---
        with gr.Tabs():
            # TAB 1: Live Interaction
            with gr.Tab("π¬ Live Interaction"):
                with gr.Row():
                    # Left column: webcam feed + identity readout
                    with gr.Column(scale=1):
                        gr.Markdown("### ποΈ Vision & Identity")
                        input_webcam = gr.Image(
                            label="Live Feed",
                            sources=["webcam"],
                            streaming=True,
                            type="numpy",
                        )
                        user_status = gr.Markdown(
                            value="**π€ Detected:** Unknown",
                            elem_id="status-box",
                        )
                        use_vision_toggle = gr.Checkbox(
                            label="π Allow Gemini to see this video frame",
                            value=False,
                            info="If checked, the current image will be sent with your audio.",
                        )
                    # Right column: chat transcript + voice input
                    with gr.Column(scale=2):
                        gr.Markdown("### π£οΈ Conversation")
                        chatbot = gr.Chatbot(
                            label="Chat History",
                            height=500,
                            type="messages",
                            avatar_images=(None, "https://www.gstatic.com/lamda/images/gemini_sparkle_v002_d4735304ff6292a690345.svg"),
                        )
                        with gr.Row():
                            audio_input = gr.Audio(
                                sources=["microphone"],
                                type="filepath",
                                label="Voice Input (Recording stops automatically)",
                                editable=False,
                            )
                            clear_btn = gr.Button("Clear Conversation", variant="secondary")

            # TAB 2: Registration
            with gr.Tab("π€ Registration"):
                gr.Markdown("### Register a New Face")
                with gr.Row():
                    with gr.Column():
                        reg_name = gr.Textbox(label="Name", placeholder="Enter your name")
                        reg_image = gr.Image(label="Upload Photo", sources=["upload", "webcam"], type="numpy")
                        reg_btn = gr.Button("Register Face", variant="primary")
                    with gr.Column():
                        gr.Markdown("### Registered Users")
                        registered_list = gr.JSON(label="Database", value={})

            # TAB 3: Configuration
            with gr.Tab("βοΈ Settings"):
                gr.Markdown("### App Configuration")
                api_key_input = gr.Textbox(
                    label="Gemini API Key",
                    type="password",
                    placeholder="Paste your Google AI Studio Key here",
                    info="Required for chat functionality.",
                )
                system_prompt_input = gr.Textbox(
                    label="System Persona",
                    value="You are a helpful, conversational assistant. Keep responses concise.",
                    lines=3,
                )

        # --- Event Wiring ---
        # 1. Face-recognition loop: each webcam frame is processed, the
        #    annotated image is echoed back, and the recognized identity
        #    plus the raw frame are stored for later multimodal queries.
        input_webcam.stream(
            fn=process_video_frame,
            inputs=[input_webcam, known_faces_state],
            outputs=[input_webcam, current_user_state, user_status, last_frame_state],
            time_limit=None,
            stream_every=0.1,  # throttle to ~10 FPS for performance
        )

        # 2. Audio chat: fires when the user stops recording.
        audio_input.stop_recording(
            fn=generate_gemini_response,
            inputs=[
                audio_input,
                history_state,
                current_user_state,
                api_key_input,
                system_prompt_input,
                use_vision_toggle,
                last_frame_state,
            ],
            outputs=[history_state, chatbot, audio_input],  # audio_input cleared after sending
        )

        # 3. Registration: stores the encoding and resets the form fields.
        reg_btn.click(
            fn=register_new_face,
            inputs=[reg_name, reg_image, known_faces_state],
            outputs=[known_faces_state, registered_list, reg_name, reg_image],
        )

        # 4. Clear chat: empties both the state list and the visible transcript.
        def clear_history():
            """Return empty values for history_state and the chatbot display."""
            return [], []

        clear_btn.click(clear_history, None, [history_state, chatbot])

    return demo
if __name__ == "__main__":
    # Build the UI and start the local Gradio server.
    # (Removed a stray trailing "|" scrape artifact from the launch line.)
    demo = create_app()
    demo.launch()