# utils.py
import face_recognition
import cv2
import numpy as np
import google.generativeai as genai
import gradio as gr
from PIL import Image
def register_new_face(name, image, known_faces):
"""
Registers a new face encoding into the state dictionary.
"""
if not name or image is None:
raise gr.Error("Please provide both a name and an image.")
    # Gradio provides RGB (or RGBA) numpy arrays, which is what face_recognition
    # expects; only the alpha channel needs stripping
    if image.ndim == 3 and image.shape[2] == 4:
        image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
    image = np.ascontiguousarray(image)
encodings = face_recognition.face_encodings(image)
if not encodings:
raise gr.Error("No face detected in the image. Please try another photo.")
# Update state
new_known_faces = known_faces.copy()
new_known_faces[name] = encodings[0]
# Return updated state, updated JSON list, and clear inputs
return new_known_faces, list(new_known_faces.keys()), "", None
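
# How register_new_face might be wired into a Gradio Blocks app. This is a
# minimal sketch; the component names below (name_box, photo_input, ...) are
# illustrative assumptions, not part of the original app.
def build_registration_demo():
    with gr.Blocks() as demo:
        known_faces_state = gr.State({})
        name_box = gr.Textbox(label="Name")
        photo_input = gr.Image(type="numpy", label="Photo")
        registered = gr.JSON(label="Registered users")
        # register_new_face returns (state, name list, "", None), which maps
        # onto the four outputs below and clears the inputs on success
        gr.Button("Register").click(
            register_new_face,
            inputs=[name_box, photo_input, known_faces_state],
            outputs=[known_faces_state, registered, name_box, photo_input],
        )
    return demo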
def process_video_frame(frame, known_faces):
"""
Processes a video frame: detects faces, recognizes them, draws boxes,
and updates the current detected user.
"""
    if frame is None:
        return None, "Unknown", "**👀 Detected:** Unknown", None
# Store original for multimodal context (before drawing boxes)
original_frame = frame.copy()
    # Resize to 1/4 size for faster detection; draw_overlays scales coords back up
    small_frame = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)
    # Gradio webcam frames are already RGB; face_recognition just needs a
    # C-contiguous array, so no color conversion is required here
    rgb_small_frame = np.ascontiguousarray(small_frame)
# Find faces
face_locations = face_recognition.face_locations(rgb_small_frame)
face_encodings = face_recognition.face_encodings(rgb_small_frame, face_locations)
detected_name = "Unknown"
face_names = []
for face_encoding in face_encodings:
name = "Unknown"
if known_faces:
known_names = list(known_faces.keys())
known_encodings_list = list(known_faces.values())
matches = face_recognition.compare_faces(known_encodings_list, face_encoding)
face_distances = face_recognition.face_distance(known_encodings_list, face_encoding)
if len(face_distances) > 0:
best_match_index = np.argmin(face_distances)
if matches[best_match_index]:
name = known_names[best_match_index]
face_names.append(name)
if name != "Unknown":
detected_name = name
# Draw results on frame
annotated_frame = draw_overlays(frame, face_locations, face_names)
    status_md = f"**👀 Detected:** {detected_name}"
    if detected_name != "Unknown":
        status_md = f"**👀 Detected:** <span style='color: green'>{detected_name}</span>"
return annotated_frame, detected_name, status_md, original_frame
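
# A stricter alternative to the compare_faces + argmin logic above: accept a
# match only when the best face distance falls below an explicit threshold.
# The 0.6 default mirrors face_recognition's own tolerance. This helper is a
# sketch and is not called by process_video_frame.
def match_face(face_encoding, known_faces, tolerance=0.6):
    if not known_faces:
        return "Unknown"
    names = list(known_faces.keys())
    distances = face_recognition.face_distance(list(known_faces.values()), face_encoding)
    best = int(np.argmin(distances))
    return names[best] if distances[best] <= tolerance else "Unknown"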
def draw_overlays(frame, face_locations, face_names):
"""Helper to draw boxes and names on the frame"""
# Scale back up face locations since we detected on 1/4 size
for (top, right, bottom, left), name in zip(face_locations, face_names):
top *= 4
right *= 4
bottom *= 4
left *= 4
# Draw a box around the face
        # Frame is RGB here, so red for "Unknown" is (255, 0, 0), not OpenCV's BGR (0, 0, 255)
        color = (0, 255, 0) if name != "Unknown" else (255, 0, 0)
cv2.rectangle(frame, (left, top), (right, bottom), color, 2)
# Draw a label with a name below the face
cv2.rectangle(frame, (left, bottom - 35), (right, bottom), color, cv2.FILLED)
font = cv2.FONT_HERSHEY_DUPLEX
cv2.putText(frame, name, (left + 6, bottom - 6), font, 0.7, (255, 255, 255), 1)
return frame
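
# Streaming wiring sketch for process_video_frame. Assumes Gradio 4's
# streaming Image events; component names are illustrative, not the app's.
def build_recognition_demo():
    with gr.Blocks() as demo:
        known_faces_state = gr.State({})
        webcam = gr.Image(sources=["webcam"], streaming=True, label="Webcam")
        annotated = gr.Image(label="Annotated")
        detected_name = gr.State("Unknown")
        status = gr.Markdown("**👀 Detected:** Unknown")
        last_frame = gr.State(None)
        # Each incoming frame fans out to the four return values of the handler
        webcam.stream(
            process_video_frame,
            inputs=[webcam, known_faces_state],
            outputs=[annotated, detected_name, status, last_frame],
        )
    return demo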
def generate_gemini_response(audio_path, history, user_name, api_key, system_prompt, use_vision, image_frame):
"""
Sends audio (and optionally image) to Gemini and returns the response.
"""
if not api_key:
gr.Warning("Please enter your Gemini API Key in the Settings tab.")
return history, history, None
if not audio_path:
return history, history, None
try:
genai.configure(api_key=api_key)
# Use flash for speed
model = genai.GenerativeModel(model_name="gemini-1.5-flash")
# Prepare prompt
identity_context = ""
if user_name and user_name != "Unknown":
identity_context = f"The user speaking is named {user_name}. "
full_prompt = f"{system_prompt} {identity_context} Listen to the audio and respond."
# Prepare content parts
content_parts = [full_prompt]
        # Add audio. Gradio's type="filepath" audio component hands over a path
        # to a wav/mp3 file; the File API is the safest way to attach audio
        # (inline bytes also work for small files, depending on SDK version).
        # Note: uploads are not deleted here; see delete_uploaded_file below.
        myfile = genai.upload_file(audio_path)
content_parts.append(myfile)
# Add Image if multimodal enabled
if use_vision and image_frame is not None:
# Convert numpy array to PIL Image
pil_img = Image.fromarray(image_frame)
content_parts.append(pil_img)
content_parts[0] += " The user has also provided a video frame of what they are looking at."
# Generate
response = model.generate_content(content_parts)
response_text = response.text
# Update History (Gradio Chatbot 'messages' format)
# User message is just a placeholder for "Audio Sent" since we don't STT locally
history.append({"role": "user", "content": "🎀 [Audio Sent]"})
history.append({"role": "assistant", "content": response_text})
return history, history, None # Return None to clear the audio input
except Exception as e:
error_msg = f"Error: {str(e)}"
history.append({"role": "assistant", "content": error_msg})
return history, history, None
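
# The File API upload in generate_gemini_response is never deleted; Gemini's
# File API purges uploads on its own after about 48 hours, but a busy Space
# may want to clean up eagerly. A best-effort sketch using genai.delete_file:
def delete_uploaded_file(uploaded_file):
    try:
        genai.delete_file(uploaded_file.name)
    except Exception:
        pass  # cleanup is best-effort; the file expires server-side anyway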