Spaces:
Build error
Build error
| import face_recognition | |
| import cv2 | |
| import numpy as np | |
| import google.generativeai as genai | |
| import gradio as gr | |
| from PIL import Image | |
def register_new_face(name, image, known_faces):
    """
    Register a new face encoding into the known-faces state dict.

    Args:
        name: Display name to associate with the face.
        image: numpy image from Gradio's Image component. Gradio supplies
            RGB (H, W, 3) or RGBA (H, W, 4) arrays; grayscale (H, W) is
            also tolerated. (NOTE(review): the original converted with
            COLOR_BGR2RGB, which swaps R/B on an already-RGB Gradio image.)
        known_faces: dict mapping name -> 128-d face encoding (state).

    Returns:
        (updated_state_dict, list_of_names, "", None) — the last two clear
        the name textbox and image input in the UI.

    Raises:
        gr.Error: if name/image is missing or no face is found.
    """
    if not name or image is None:
        raise gr.Error("Please provide both a name and an image.")
    # Normalise to 3-channel RGB for face_recognition (dlib needs RGB uint8).
    if image.ndim == 2:
        # Grayscale: replicate into 3 channels instead of crashing on shape[2].
        image = np.stack([image] * 3, axis=-1)
    elif image.shape[2] == 4:
        # Gradio gives RGBA when the source has an alpha channel.
        image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
    image = np.ascontiguousarray(image)
    encodings = face_recognition.face_encodings(image)
    if not encodings:
        raise gr.Error("No face detected in the image. Please try another photo.")
    # Copy so the previous Gradio state object is not mutated in place.
    new_known_faces = known_faces.copy()
    new_known_faces[name] = encodings[0]
    # Return updated state, updated name list, and clear both inputs.
    return new_known_faces, list(new_known_faces.keys()), "", None
def process_video_frame(frame, known_faces):
    """
    Process one video frame: detect faces, match them against known
    encodings, draw boxes, and report the current detected user.

    Args:
        frame: numpy image from the webcam stream (RGB or RGBA), or None.
        known_faces: dict mapping name -> 128-d face encoding (state).

    Returns:
        (annotated_frame, detected_name, status_markdown, original_frame)
        where original_frame is the unannotated copy kept for the
        multimodal (vision) context.
    """
    if frame is None:
        return None, "Unknown", "**π€ Detected:** Unknown", None
    # Keep a clean copy for multimodal context (before drawing boxes).
    original_frame = frame.copy()
    # dlib/face_recognition requires 3-channel RGB; drop an alpha channel
    # if the browser delivers RGBA frames, otherwise it raises.
    work_frame = frame[:, :, :3] if frame.ndim == 3 and frame.shape[2] == 4 else frame
    # Detect on a quarter-scale copy for speed; draw_overlays scales back up.
    small_frame = cv2.resize(work_frame, (0, 0), fx=0.25, fy=0.25)
    # Gradio streams RGB already; just make the buffer contiguous for dlib.
    rgb_small_frame = np.ascontiguousarray(small_frame)
    face_locations = face_recognition.face_locations(rgb_small_frame)
    face_encodings = face_recognition.face_encodings(rgb_small_frame, face_locations)
    detected_name = "Unknown"
    face_names = []
    for face_encoding in face_encodings:
        name = "Unknown"
        if known_faces:
            known_names = list(known_faces.keys())
            known_encodings_list = list(known_faces.values())
            matches = face_recognition.compare_faces(known_encodings_list, face_encoding)
            face_distances = face_recognition.face_distance(known_encodings_list, face_encoding)
            if len(face_distances) > 0:
                # Pick the closest known face, but only accept it if it is
                # also within compare_faces' match tolerance.
                best_match_index = np.argmin(face_distances)
                if matches[best_match_index]:
                    name = known_names[best_match_index]
        face_names.append(name)
        if name != "Unknown":
            # Last recognized face wins as "the" detected user.
            detected_name = name
    annotated_frame = draw_overlays(frame, face_locations, face_names)
    status_md = f"**π€ Detected:** {detected_name}"
    if detected_name != "Unknown":
        status_md = f"**π€ Detected:** <span style='color: green'>{detected_name}</span>"
    return annotated_frame, detected_name, status_md, original_frame
def draw_overlays(frame, face_locations, face_names):
    """Draw a bounding box and name label for each face onto *frame*.

    Detection ran on a quarter-scale copy, so every coordinate is scaled
    back up by 4 before drawing. Returns the (mutated) frame.
    """
    scale = 4
    font = cv2.FONT_HERSHEY_DUPLEX
    for (top, right, bottom, left), label in zip(face_locations, face_names):
        top, right, bottom, left = (v * scale for v in (top, right, bottom, left))
        # Green for a recognized face, red for an unknown one (BGR order).
        box_color = (0, 0, 255) if label == "Unknown" else (0, 255, 0)
        cv2.rectangle(frame, (left, top), (right, bottom), box_color, 2)
        # Filled strip along the bottom edge of the box to carry the name.
        cv2.rectangle(frame, (left, bottom - 35), (right, bottom), box_color, cv2.FILLED)
        cv2.putText(frame, label, (left + 6, bottom - 6), font, 0.7, (255, 255, 255), 1)
    return frame
def generate_gemini_response(audio_path, history, user_name, api_key, system_prompt, use_vision, image_frame):
    """
    Send the recorded audio (and optionally a video frame) to Gemini and
    append the exchange to the chat history.

    Args:
        audio_path: filepath of the recorded audio (Gradio type="filepath").
        history: chat history in Gradio 'messages' format (list of dicts).
        user_name: name from face recognition, or "Unknown".
        api_key: Gemini API key from the Settings tab.
        system_prompt: base system instruction prepended to every request.
        use_vision: whether to attach image_frame as multimodal context.
        image_frame: RGB numpy frame, or None.

    Returns:
        (history, history, None) — history twice for chatbot + state
        outputs; None clears the audio input.
    """
    if not api_key:
        gr.Warning("Please enter your Gemini API Key in the Settings tab.")
        return history, history, None
    if not audio_path:
        return history, history, None
    uploaded_file = None
    try:
        genai.configure(api_key=api_key)
        # Flash model for low latency.
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")
        identity_context = ""
        if user_name and user_name != "Unknown":
            identity_context = f"The user speaking is named {user_name}. "
        full_prompt = f"{system_prompt} {identity_context} Listen to the audio and respond."
        content_parts = [full_prompt]
        # Audio goes through the File API, which is the safe route for
        # audio payloads across SDK versions.
        uploaded_file = genai.upload_file(audio_path)
        content_parts.append(uploaded_file)
        # Optionally attach the current camera frame as vision context.
        if use_vision and image_frame is not None:
            pil_img = Image.fromarray(image_frame)
            content_parts.append(pil_img)
            content_parts[0] += " The user has also provided a video frame of what they are looking at."
        response = model.generate_content(content_parts)
        response_text = response.text
        # No local STT, so the user turn is a placeholder marker.
        history.append({"role": "user", "content": "π€ [Audio Sent]"})
        history.append({"role": "assistant", "content": response_text})
        return history, history, None  # None clears the audio input
    except Exception as e:
        # UI boundary: surface the error in the chat instead of crashing.
        error_msg = f"Error: {str(e)}"
        history.append({"role": "assistant", "content": error_msg})
        return history, history, None
    finally:
        # Fix: the original leaked every uploaded audio file on the Gemini
        # File API. Delete it once the request finishes (best effort —
        # cleanup failure must not mask the real response or error).
        if uploaded_file is not None:
            try:
                genai.delete_file(uploaded_file.name)
            except Exception:
                pass