File size: 6,347 Bytes
618cf4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import gradio as gr
import cv2
import numpy as np
import os
from utils import (
    register_new_face,
    process_video_frame,
    generate_gemini_response,
    draw_overlays
)

# --- Global State Initialization ---
# In a real deployment, you might use a database. 
# For this demo, we use Gradio State for session-specific storage.

def create_app():
    """Build and return the Gradio Blocks UI for the Gemini Live Identity Chat demo.

    The app has three tabs:
      1. Live Interaction -- streaming webcam face recognition plus voice chat.
      2. Registration     -- enroll a new face into the session-local database.
      3. Settings         -- Gemini API key and system-persona configuration.

    All mutable data (face encodings, current user, chat history, last webcam
    frame) lives in per-session ``gr.State`` objects, so nothing is shared
    between browser sessions. All event handlers are imported from ``utils``.

    Returns:
        gr.Blocks: the assembled (but not yet launched) Gradio app.
    """
    with gr.Blocks(title="Gemini Live Identity Chat", theme=gr.themes.Soft()) as demo:
        
        # --- State Variables (per-session; reset on page reload) ---
        # known_faces: dict {name: encoding}
        # (encoding format is whatever utils.register_new_face stores -- opaque here)
        known_faces_state = gr.State(value={}) 
        # current_user: display name of the most recently recognized person
        current_user_state = gr.State(value="Unknown")
        # chat_history: conversation record threaded through generate_gemini_response.
        # NOTE(review): the Chatbot below is type="messages", so entries are
        # presumably {"role": ..., "content": ...} dicts, not [user, bot] pairs --
        # confirm against the history handling in utils.
        history_state = gr.State(value=[])
        # current_frame: last webcam frame, attached to a query only when the
        # "allow Gemini to see" toggle is checked
        last_frame_state = gr.State(value=None)

        # --- Header ---
        with gr.Row(elem_classes="header"):
            gr.Markdown(
                """
                # πŸŽ™οΈ Gemini Live Identity Chat
                [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
                """
            )

        # --- Main Layout ---
        with gr.Tabs():
            
            # TAB 1: Live Interaction
            with gr.Tab("πŸ’¬ Live Interaction"):
                with gr.Row():
                    # Left Column: Vision & Identity
                    with gr.Column(scale=1):
                        gr.Markdown("### πŸ‘οΈ Vision & Identity")
                        
                        # Webcam feed; streamed frames drive face recognition and
                        # the component is also reused as the annotated output.
                        input_webcam = gr.Image(
                            label="Live Feed", 
                            sources=["webcam"], 
                            streaming=True,
                            type="numpy"
                        )
                        
                        # Markdown status line updated by the recognition stream
                        user_status = gr.Markdown(
                            value="**πŸ‘€ Detected:** Unknown", 
                            elem_id="status-box"
                        )
                        
                        # Opt-in toggle: when checked, the last captured frame is
                        # sent along with the user's audio for multimodal queries.
                        use_vision_toggle = gr.Checkbox(
                            label="πŸ‘€ Allow Gemini to see this video frame",
                            value=False,
                            info="If checked, the current image will be sent with your audio."
                        )

                    # Right Column: Chat
                    with gr.Column(scale=2):
                        gr.Markdown("### πŸ—£οΈ Conversation")
                        
                        # type="messages" expects role/content dicts from the handler
                        chatbot = gr.Chatbot(
                            label="Chat History",
                            height=500,
                            type="messages",
                            avatar_images=(None, "https://www.gstatic.com/lamda/images/gemini_sparkle_v002_d4735304ff6292a690345.svg")
                        )
                        
                        with gr.Row():
                            # filepath type: the handler receives a path to the
                            # recorded audio file, not raw samples
                            audio_input = gr.Audio(
                                sources=["microphone"], 
                                type="filepath",
                                label="Voice Input (Recording stops automatically)",
                                editable=False
                            )
                        
                        clear_btn = gr.Button("Clear Conversation", variant="secondary")

            # TAB 2: Registration
            with gr.Tab("πŸ‘€ Registration"):
                gr.Markdown("### Register a New Face")
                with gr.Row():
                    with gr.Column():
                        reg_name = gr.Textbox(label="Name", placeholder="Enter your name")
                        reg_image = gr.Image(label="Upload Photo", sources=["upload", "webcam"], type="numpy")
                        reg_btn = gr.Button("Register Face", variant="primary")
                    
                    with gr.Column():
                        # Read-only view of the session's face database; updated
                        # by the registration handler below.
                        gr.Markdown("### Registered Users")
                        registered_list = gr.JSON(label="Database", value={})

            # TAB 3: Configuration
            with gr.Tab("βš™οΈ Settings"):
                gr.Markdown("### App Configuration")
                # Key is kept client-side in the component and passed to the
                # handler per request; it is never stored in server state.
                api_key_input = gr.Textbox(
                    label="Gemini API Key", 
                    type="password", 
                    placeholder="Paste your Google AI Studio Key here",
                    info="Required for chat functionality."
                )
                
                system_prompt_input = gr.Textbox(
                    label="System Persona", 
                    value="You are a helpful, conversational assistant. Keep responses concise.",
                    lines=3
                )

        # --- Event Wiring ---

        # 1. Face Recognition Loop
        # Each streamed frame goes through process_video_frame, which must return
        # (annotated_frame, user_name, status_markdown, raw_frame) to match the
        # four outputs below. The annotated frame is written back onto the same
        # webcam component.
        input_webcam.stream(
            fn=process_video_frame,
            inputs=[input_webcam, known_faces_state],
            outputs=[input_webcam, current_user_state, user_status, last_frame_state],
            time_limit=None,
            stream_every=0.1  # Limit FPS for performance
        )

        # 2. Audio Chat Interaction
        # Triggered when the user stops recording audio. The handler receives the
        # identity, API key, persona, and (optionally) the last frame, and must
        # return (new_history, chatbot_value, audio_reset).
        audio_input.stop_recording(
            fn=generate_gemini_response,
            inputs=[
                audio_input, 
                history_state, 
                current_user_state, 
                api_key_input, 
                system_prompt_input,
                use_vision_toggle,
                last_frame_state
            ],
            outputs=[history_state, chatbot, audio_input] # Clear audio input after sending
        )

        # 3. Registration Logic
        # register_new_face must return (updated_faces, json_view, name_reset,
        # image_reset) to match the outputs; it both updates state and clears
        # the input fields.
        reg_btn.click(
            fn=register_new_face,
            inputs=[reg_name, reg_image, known_faces_state],
            outputs=[known_faces_state, registered_list, reg_name, reg_image]
        )
        
        # 4. Clear Chat
        # Resets both the state history and the visible chatbot in one event.
        def clear_history():
            return [], []
        clear_btn.click(clear_history, None, [history_state, chatbot])

    return demo

# Launch the app with default server settings when run as a script.
if __name__ == "__main__":
    create_app().launch()