import face_recognition
import cv2
import numpy as np
import google.generativeai as genai
import gradio as gr
from PIL import Image


def register_new_face(name, image, known_faces):
    """
    Registers a new face encoding into the state dictionary.
    """
    if not name or image is None:
        raise gr.Error("Please provide both a name and an image.")

    # Gradio supplies RGB (or RGBA) numpy arrays; drop the alpha channel if
    # present so face_recognition receives a 3-channel RGB image.
    if image.ndim == 3 and image.shape[2] == 4:
        image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)

    encodings = face_recognition.face_encodings(image)
    if not encodings:
        raise gr.Error("No face detected in the image. Please try another photo.")

    # Update state without mutating the original dict
    new_known_faces = known_faces.copy()
    new_known_faces[name] = encodings[0]

    # Return updated state, the updated name list, and clear both inputs
    return new_known_faces, list(new_known_faces.keys()), "", None


def process_video_frame(frame, known_faces):
    """
    Processes a video frame: detects faces, recognizes them, draws boxes,
    and updates the currently detected user.
    """
    if frame is None:
        return None, "Unknown", "**👤 Detected:** Unknown", None

    # Keep a clean copy for multimodal context (before drawing boxes)
    original_frame = frame.copy()

    # Resize to 1/4 size for faster processing
    small_frame = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)

    # Gradio webcam frames already arrive as RGB; face_recognition (dlib)
    # just needs a contiguous uint8 array.
    rgb_small_frame = np.ascontiguousarray(small_frame)

    # Find faces and compute their encodings
    face_locations = face_recognition.face_locations(rgb_small_frame)
    face_encodings = face_recognition.face_encodings(rgb_small_frame, face_locations)

    detected_name = "Unknown"
    face_names = []
    for face_encoding in face_encodings:
        name = "Unknown"
        if known_faces:
            known_names = list(known_faces.keys())
            known_encodings_list = list(known_faces.values())
            # compare_faces uses its default tolerance (0.6); pass a lower
            # value for stricter matching if false positives appear.
            matches = face_recognition.compare_faces(known_encodings_list, face_encoding)
            face_distances = face_recognition.face_distance(known_encodings_list, face_encoding)
            if len(face_distances) > 0:
                best_match_index = np.argmin(face_distances)
                if matches[best_match_index]:
                    name = known_names[best_match_index]

        face_names.append(name)
        if name != "Unknown":
            detected_name = name

    # Draw results on the frame
    annotated_frame = draw_overlays(frame, face_locations, face_names)
    status_md = f"**👤 Detected:** {detected_name}"

    return annotated_frame, detected_name, status_md, original_frame


def draw_overlays(frame, face_locations, face_names):
    """Helper to draw boxes and name labels on the frame."""
    # Scale face locations back up since detection ran on a 1/4-size frame
    for (top, right, bottom, left), name in zip(face_locations, face_names):
        top *= 4
        right *= 4
        bottom *= 4
        left *= 4

        # Box around the face: green for known, red for unknown
        # (frames are RGB here, so red is (255, 0, 0), not OpenCV-BGR (0, 0, 255))
        color = (0, 255, 0) if name != "Unknown" else (255, 0, 0)
        cv2.rectangle(frame, (left, top), (right, bottom), color, 2)

        # Label with the name below the face
        cv2.rectangle(frame, (left, bottom - 35), (right, bottom), color, cv2.FILLED)
        font = cv2.FONT_HERSHEY_DUPLEX
        cv2.putText(frame, name, (left + 6, bottom - 6), font, 0.7, (255, 255, 255), 1)

    return frame


def generate_gemini_response(audio_path, history, user_name, api_key,
                             system_prompt, use_vision, image_frame):
    """
    Sends audio (and optionally an image) to Gemini and returns the response.
    """
    if not api_key:
        gr.Warning("Please enter your Gemini API Key in the Settings tab.")
        return history, history, None
    if not audio_path:
        return history, history, None

    try:
        genai.configure(api_key=api_key)

        # Use Flash for speed
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")

        # Prepare the prompt, injecting the recognized identity if available
        identity_context = ""
        if user_name and user_name != "Unknown":
            identity_context = f"The user speaking is named {user_name}. "
        full_prompt = f"{system_prompt} {identity_context} Listen to the audio and respond."

        content_parts = [full_prompt]

        # Add audio. Gradio's Audio component with type="filepath" yields a
        # path to a wav/mp3 file. For this demo we upload it via the File API,
        # which is the safer route for audio across SDK versions.
        # Note: in a production high-traffic app, manage the uploaded file's
        # lifecycle carefully (see the cleanup sketch below).
        myfile = genai.upload_file(audio_path)
        content_parts.append(myfile)

        # Add the current video frame if multimodal vision is enabled
        if use_vision and image_frame is not None:
            pil_img = Image.fromarray(image_frame)
            content_parts.append(pil_img)
            content_parts[0] += " The user has also provided a video frame of what they are looking at."

        # Generate
        response = model.generate_content(content_parts)
        response_text = response.text

        # Update history (Gradio Chatbot "messages" format). The user turn is
        # a placeholder since speech is not transcribed locally.
        history.append({"role": "user", "content": "🎤 [Audio Sent]"})
        history.append({"role": "assistant", "content": response_text})

        # Third return value is None to clear the audio input
        return history, history, None
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        history.append({"role": "assistant", "content": error_msg})
        return history, history, None