Spaces:
Build error
Build error
File size: 6,140 Bytes
618cf4f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
import face_recognition
import cv2
import numpy as np
import google.generativeai as genai
import gradio as gr
from PIL import Image
def register_new_face(name, image, known_faces):
    """
    Register a new face encoding into the known-faces state dictionary.

    Args:
        name: Display name to associate with the face.
        image: Numpy image array from Gradio (H x W x C, or H x W grayscale).
        known_faces: Dict mapping names to face encodings (not mutated).

    Returns:
        Tuple of (updated known_faces dict, list of registered names,
        "" to clear the name textbox, None to clear the image input).

    Raises:
        gr.Error: If name/image is missing or no face is detected.
    """
    if not name or image is None:
        raise gr.Error("Please provide both a name and an image.")

    # Normalize channel layout for face_recognition (expects 3-channel RGB).
    # Guard against grayscale input first: a 2-D array has no third axis,
    # so indexing image.shape[2] would raise IndexError.
    if image.ndim == 2:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    elif image.shape[2] == 4:
        image = cv2.cvtColor(image, cv2.COLOR_BGRA2RGB)
    else:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    encodings = face_recognition.face_encodings(image)
    if not encodings:
        raise gr.Error("No face detected in the image. Please try another photo.")

    # Copy instead of mutating so the Gradio state update is explicit.
    new_known_faces = known_faces.copy()
    new_known_faces[name] = encodings[0]

    # Return updated state, updated name list, and clear both inputs.
    return new_known_faces, list(new_known_faces.keys()), "", None
def process_video_frame(frame, known_faces):
    """
    Run face detection and recognition on a single webcam frame.

    Returns a 4-tuple: (annotated frame, detected user name, status
    markdown string, unannotated copy of the frame for vision context).
    """
    if frame is None:
        return None, "Unknown", "**👤 Detected:** Unknown", None

    # Keep a clean copy before any boxes are drawn, for multimodal use.
    clean_copy = frame.copy()

    # Detect on a quarter-resolution frame to keep per-frame cost low.
    shrunk = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)
    rgb_shrunk = np.ascontiguousarray(shrunk)

    locations = face_recognition.face_locations(rgb_shrunk)
    encodings = face_recognition.face_encodings(rgb_shrunk, locations)

    # Hoist the registered names/encodings out of the per-face loop.
    registered_names = list(known_faces.keys()) if known_faces else []
    registered_encodings = list(known_faces.values()) if known_faces else []

    labels = []
    current_user = "Unknown"
    for encoding in encodings:
        label = "Unknown"
        if registered_encodings:
            hits = face_recognition.compare_faces(registered_encodings, encoding)
            distances = face_recognition.face_distance(registered_encodings, encoding)
            if len(distances) > 0:
                best = np.argmin(distances)
                if hits[best]:
                    label = registered_names[best]
        labels.append(label)
        if label != "Unknown":
            current_user = label

    annotated = draw_overlays(frame, locations, labels)

    # Recognized users are highlighted in green in the status line.
    if current_user == "Unknown":
        status = f"**👤 Detected:** {current_user}"
    else:
        status = f"**👤 Detected:** <span style='color: green'>{current_user}</span>"

    return annotated, current_user, status, clean_copy
def draw_overlays(frame, face_locations, face_names, scale=4):
    """Draw labeled bounding boxes on *frame* (mutates and returns it).

    Args:
        frame: Full-resolution numpy image to annotate in place.
        face_locations: (top, right, bottom, left) boxes measured on the
            downscaled detection frame.
        face_names: One name per box ("Unknown" for unmatched faces).
        scale: Factor to scale box coordinates back up to *frame* size.
            Defaults to 4, matching detection on a 1/4-size frame.

    Returns:
        The annotated frame (same array object that was passed in).
    """
    for (top, right, bottom, left), name in zip(face_locations, face_names):
        # Scale coordinates back up to the full-resolution frame.
        top *= scale
        right *= scale
        bottom *= scale
        left *= scale
        # Green box for recognized faces, red for unknown.
        color = (0, 255, 0) if name != "Unknown" else (0, 0, 255)
        cv2.rectangle(frame, (left, top), (right, bottom), color, 2)
        # Filled strip along the bottom edge of the box holds the label.
        cv2.rectangle(frame, (left, bottom - 35), (right, bottom), color, cv2.FILLED)
        font = cv2.FONT_HERSHEY_DUPLEX
        cv2.putText(frame, name, (left + 6, bottom - 6), font, 0.7, (255, 255, 255), 1)
    return frame
def generate_gemini_response(audio_path, history, user_name, api_key, system_prompt, use_vision, image_frame):
    """
    Send recorded audio (and optionally a video frame) to Gemini and
    return the updated chat history.

    Args:
        audio_path: Filesystem path to the recorded audio clip, or None.
        history: Chat history in Gradio 'messages' format (list of dicts),
            or None. The caller's list is never mutated.
        user_name: Name of the recognized speaker, or "Unknown".
        api_key: Gemini API key; a warning is shown if missing.
        system_prompt: System instructions prepended to the prompt.
        use_vision: Whether to attach *image_frame* as visual context.
        image_frame: RGB numpy frame to attach when use_vision is True.

    Returns:
        (history, history, None) — the updated history twice (chatbot
        component + state) and None to clear the audio input component.
    """
    # Copy defensively: the original mutated the caller's list in place
    # and would crash on a None history.
    history = [] if history is None else list(history)

    if not api_key:
        gr.Warning("Please enter your Gemini API Key in the Settings tab.")
        return history, history, None
    if not audio_path:
        return history, history, None

    try:
        genai.configure(api_key=api_key)
        # Flash model keeps latency low for a conversational loop.
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")

        # Inject the recognized identity so replies can address the user.
        identity_context = ""
        if user_name and user_name != "Unknown":
            identity_context = f"The user speaking is named {user_name}. "
        full_prompt = f"{system_prompt} {identity_context} Listen to the audio and respond."

        content_parts = [full_prompt]

        # Upload the audio via the File API — safer than inlining raw bytes
        # for arbitrary-length recordings. NOTE(review): uploaded files are
        # not deleted here; manage their lifecycle in a high-traffic app.
        myfile = genai.upload_file(audio_path)
        content_parts.append(myfile)

        # Optionally attach the current camera frame for visual context.
        if use_vision and image_frame is not None:
            content_parts.append(Image.fromarray(image_frame))
            content_parts[0] += " The user has also provided a video frame of what they are looking at."

        response = model.generate_content(content_parts)

        # The user turn is a placeholder — no local speech-to-text is done.
        history.append({"role": "user", "content": "🎤 [Audio Sent]"})
        history.append({"role": "assistant", "content": response.text})
        return history, history, None  # None clears the audio input
    except Exception as e:
        # Surface failures inside the chat instead of crashing the UI.
        history.append({"role": "assistant", "content": f"Error: {str(e)}"})
        return history, history, None