File size: 6,140 Bytes
618cf4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import face_recognition
import cv2
import numpy as np
import google.generativeai as genai
import gradio as gr
from PIL import Image

def register_new_face(name, image, known_faces):
    """
    Register a new face encoding into the known-faces state dictionary.

    Args:
        name: Display name to associate with the face.
        image: Numpy image array from the Gradio image input.
        known_faces: Dict mapping name -> 128-d face encoding.

    Returns:
        Tuple of (updated known_faces dict, list of registered names,
        "" to clear the name textbox, None to clear the image input).

    Raises:
        gr.Error: If name/image is missing or no face is detected.
    """
    if not name or image is None:
        raise gr.Error("Please provide both a name and an image.")

    # Gradio numpy image inputs arrive in RGB channel order already, so the
    # previous BGR->RGB conversion swapped channels and hurt recognition.
    # Only normalize grayscale / alpha-channel inputs here.
    if image.ndim == 2:
        # Grayscale input: expand to 3-channel RGB.
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    elif image.shape[2] == 4:
        # Drop the alpha channel without reordering R/G/B.
        image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)

    # dlib (used by face_recognition) requires a C-contiguous uint8 buffer.
    image = np.ascontiguousarray(image, dtype=np.uint8)

    encodings = face_recognition.face_encodings(image)

    if not encodings:
        raise gr.Error("No face detected in the image. Please try another photo.")

    # Copy so the previous Gradio state object is not mutated in place.
    new_known_faces = known_faces.copy()
    new_known_faces[name] = encodings[0]

    # Return updated state, updated name list, and clear both inputs.
    return new_known_faces, list(new_known_faces.keys()), "", None

def process_video_frame(frame, known_faces):
    """
    Detect and recognize faces in a single webcam frame.

    Args:
        frame: Numpy image array from the webcam stream, or None.
        known_faces: Dict mapping name -> 128-d face encoding.

    Returns:
        Tuple of (annotated frame, detected name, status markdown string,
        unmodified copy of the incoming frame for multimodal context).
    """
    if frame is None:
        return None, "Unknown", "**👤 Detected:** Unknown", None

    # Preserve a clean copy before any overlays are drawn.
    original_frame = frame.copy()

    # Detect at quarter resolution for speed; draw_overlays scales back up.
    small = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)
    # NOTE(review): no explicit color conversion here — presumably Gradio
    # already supplies RGB; dlib just needs a contiguous buffer.
    small = np.ascontiguousarray(small)

    face_locations = face_recognition.face_locations(small)
    face_encodings = face_recognition.face_encodings(small, face_locations)

    # Hoist the registry views out of the per-face loop.
    registered_names = list(known_faces.keys()) if known_faces else []
    registered_encodings = list(known_faces.values()) if known_faces else []

    detected_name = "Unknown"
    face_names = []

    for encoding in face_encodings:
        label = "Unknown"
        if registered_encodings:
            matches = face_recognition.compare_faces(registered_encodings, encoding)
            distances = face_recognition.face_distance(registered_encodings, encoding)
            if len(distances) > 0:
                best = np.argmin(distances)
                if matches[best]:
                    label = registered_names[best]

        face_names.append(label)
        if label != "Unknown":
            detected_name = label

    annotated_frame = draw_overlays(frame, face_locations, face_names)

    # Highlight the name in green when someone known is on camera.
    if detected_name == "Unknown":
        status_md = f"**👤 Detected:** {detected_name}"
    else:
        status_md = f"**👤 Detected:** <span style='color: green'>{detected_name}</span>"

    return annotated_frame, detected_name, status_md, original_frame

def draw_overlays(frame, face_locations, face_names, scale=4):
    """Draw bounding boxes and name labels on the frame.

    Args:
        frame: Image array to draw on (modified in place).
        face_locations: Iterable of (top, right, bottom, left) tuples as
            returned by face_recognition on the downscaled frame.
        face_names: Matched name per face ("Unknown" when unmatched).
        scale: Factor to scale coordinates back to full resolution.
            Defaults to 4 because detection runs on a 1/4-size frame;
            parameterized so callers using a different resize still work.

    Returns:
        The same frame array with overlays drawn.
    """
    for (top, right, bottom, left), name in zip(face_locations, face_names):
        # Scale detection coordinates back up to the full-size frame.
        top *= scale
        right *= scale
        bottom *= scale
        left *= scale

        # Green box for recognized faces, red for unknown.
        color = (0, 255, 0) if name != "Unknown" else (0, 0, 255)
        cv2.rectangle(frame, (left, top), (right, bottom), color, 2)

        # Filled label bar along the bottom edge of the box.
        cv2.rectangle(frame, (left, bottom - 35), (right, bottom), color, cv2.FILLED)
        font = cv2.FONT_HERSHEY_DUPLEX
        cv2.putText(frame, name, (left + 6, bottom - 6), font, 0.7, (255, 255, 255), 1)

    return frame

def generate_gemini_response(audio_path, history, user_name, api_key, system_prompt, use_vision, image_frame):
    """
    Send recorded audio (and optionally the current video frame) to Gemini
    and append the exchange to the chat history.

    Args:
        audio_path: Filepath of the recorded audio, or None/"" if absent.
        history: Chatbot history in 'messages' format, or None on first run.
        user_name: Name from face recognition ("Unknown" if unrecognized).
        api_key: Gemini API key from the settings tab.
        system_prompt: System instruction prepended to every request.
        use_vision: Whether to attach the current video frame.
        image_frame: Numpy RGB frame to attach when use_vision is True.

    Returns:
        Tuple of (history, history, None) — the duplicated history feeds
        both the Chatbot and the state; None clears the audio input.
    """
    # Normalize to a fresh list: guards against Gradio passing None as the
    # initial state (which previously crashed on .append) and avoids
    # mutating the caller's list in place.
    history = list(history) if history else []

    if not api_key:
        gr.Warning("Please enter your Gemini API Key in the Settings tab.")
        return history, history, None

    if not audio_path:
        return history, history, None

    try:
        genai.configure(api_key=api_key)
        # Flash model keeps latency low for conversational use.
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")

        # Inject the recognized speaker's identity into the prompt.
        identity_context = ""
        if user_name and user_name != "Unknown":
            identity_context = f"The user speaking is named {user_name}. "

        full_prompt = f"{system_prompt} {identity_context} Listen to the audio and respond."
        content_parts = [full_prompt]

        # Upload audio via the File API — safer than inlining bytes for audio.
        # NOTE(review): uploaded files are never deleted here; a high-traffic
        # deployment should clean them up (e.g. genai.delete_file) afterwards.
        myfile = genai.upload_file(audio_path)
        content_parts.append(myfile)

        # Attach the camera frame when multimodal vision is enabled.
        if use_vision and image_frame is not None:
            pil_img = Image.fromarray(image_frame)
            content_parts.append(pil_img)
            content_parts[0] += " The user has also provided a video frame of what they are looking at."

        response = model.generate_content(content_parts)
        response_text = response.text

        # Chatbot 'messages' format; a placeholder stands in for the user
        # turn since no local speech-to-text is performed.
        history.append({"role": "user", "content": "🎤 [Audio Sent]"})
        history.append({"role": "assistant", "content": response_text})

        return history, history, None  # None clears the audio input

    except Exception as e:
        # UI boundary: surface the failure in the chat instead of crashing.
        error_msg = f"Error: {str(e)}"
        history.append({"role": "assistant", "content": error_msg})
        return history, history, None