# utils.py
import face_recognition
import cv2
import numpy as np
import google.generativeai as genai
import gradio as gr
from PIL import Image
def register_new_face(name, image, known_faces):
"""
Registers a new face encoding into the state dictionary.
"""
if not name or image is None:
raise gr.Error("Please provide both a name and an image.")
    # Gradio provides RGB (or RGBA) numpy arrays, which is what face_recognition
    # expects; only the alpha channel needs stripping
    if image.ndim == 3 and image.shape[2] == 4:
        image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
    image = np.ascontiguousarray(image)
encodings = face_recognition.face_encodings(image)
if not encodings:
raise gr.Error("No face detected in the image. Please try another photo.")
# Update state
new_known_faces = known_faces.copy()
new_known_faces[name] = encodings[0]
# Return updated state, updated JSON list, and clear inputs
return new_known_faces, list(new_known_faces.keys()), "", None
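
# How register_new_face might be wired into a Gradio Blocks app. This is a
# minimal sketch; the component names below (name_box, photo_input, ...) are
# illustrative assumptions, not part of the original app.
def build_registration_demo():
    with gr.Blocks() as demo:
        known_faces_state = gr.State({})
        name_box = gr.Textbox(label="Name")
        photo_input = gr.Image(type="numpy", label="Photo")
        registered = gr.JSON(label="Registered users")
        # register_new_face returns (state, name list, "", None), which maps
        # onto the four outputs below and clears the inputs on success
        gr.Button("Register").click(
            register_new_face,
            inputs=[name_box, photo_input, known_faces_state],
            outputs=[known_faces_state, registered, name_box, photo_input],
        )
    return demo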
def process_video_frame(frame, known_faces):
"""
Processes a video frame: detects faces, recognizes them, draws boxes,
and updates the current detected user.
"""
    if frame is None:
        return None, "Unknown", "**👀 Detected:** Unknown", None
# Store original for multimodal context (before drawing boxes)
original_frame = frame.copy()
    # Resize to 1/4 size for faster detection; draw_overlays scales coords back up
    small_frame = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)
    # Gradio webcam frames are already RGB; face_recognition just needs a
    # C-contiguous array, so no color conversion is required here
    rgb_small_frame = np.ascontiguousarray(small_frame)
# Find faces
face_locations = face_recognition.face_locations(rgb_small_frame)
face_encodings = face_recognition.face_encodings(rgb_small_frame, face_locations)
detected_name = "Unknown"
face_names = []
for face_encoding in face_encodings:
name = "Unknown"
if known_faces:
known_names = list(known_faces.keys())
known_encodings_list = list(known_faces.values())
matches = face_recognition.compare_faces(known_encodings_list, face_encoding)
face_distances = face_recognition.face_distance(known_encodings_list, face_encoding)
if len(face_distances) > 0:
best_match_index = np.argmin(face_distances)
if matches[best_match_index]:
name = known_names[best_match_index]
face_names.append(name)
if name != "Unknown":
detected_name = name
# Draw results on frame
annotated_frame = draw_overlays(frame, face_locations, face_names)
    status_md = f"**👀 Detected:** {detected_name}"
    if detected_name != "Unknown":
        status_md = f"**👀 Detected:** <span style='color: green'>{detected_name}</span>"
return annotated_frame, detected_name, status_md, original_frame
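
# A stricter alternative to the compare_faces + argmin logic above: accept a
# match only when the best face distance falls below an explicit threshold.
# The 0.6 default mirrors face_recognition's own tolerance. This helper is a
# sketch and is not called by process_video_frame.
def match_face(face_encoding, known_faces, tolerance=0.6):
    if not known_faces:
        return "Unknown"
    names = list(known_faces.keys())
    distances = face_recognition.face_distance(list(known_faces.values()), face_encoding)
    best = int(np.argmin(distances))
    return names[best] if distances[best] <= tolerance else "Unknown"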
def draw_overlays(frame, face_locations, face_names):
"""Helper to draw boxes and names on the frame"""
# Scale back up face locations since we detected on 1/4 size
for (top, right, bottom, left), name in zip(face_locations, face_names):
top *= 4
right *= 4
bottom *= 4
left *= 4
# Draw a box around the face
        # Frame is RGB here, so red for "Unknown" is (255, 0, 0), not OpenCV's BGR (0, 0, 255)
        color = (0, 255, 0) if name != "Unknown" else (255, 0, 0)
cv2.rectangle(frame, (left, top), (right, bottom), color, 2)
# Draw a label with a name below the face
cv2.rectangle(frame, (left, bottom - 35), (right, bottom), color, cv2.FILLED)
font = cv2.FONT_HERSHEY_DUPLEX
cv2.putText(frame, name, (left + 6, bottom - 6), font, 0.7, (255, 255, 255), 1)
return frame
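
# Streaming wiring sketch for process_video_frame. Assumes Gradio 4's
# streaming Image events; component names are illustrative, not the app's.
def build_recognition_demo():
    with gr.Blocks() as demo:
        known_faces_state = gr.State({})
        webcam = gr.Image(sources=["webcam"], streaming=True, label="Webcam")
        annotated = gr.Image(label="Annotated")
        detected_name = gr.State("Unknown")
        status = gr.Markdown("**👀 Detected:** Unknown")
        last_frame = gr.State(None)
        # Each incoming frame fans out to the four return values of the handler
        webcam.stream(
            process_video_frame,
            inputs=[webcam, known_faces_state],
            outputs=[annotated, detected_name, status, last_frame],
        )
    return demo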
def generate_gemini_response(audio_path, history, user_name, api_key, system_prompt, use_vision, image_frame):
"""
Sends audio (and optionally image) to Gemini and returns the response.
"""
if not api_key:
gr.Warning("Please enter your Gemini API Key in the Settings tab.")
return history, history, None
if not audio_path:
return history, history, None
try:
genai.configure(api_key=api_key)
# Use flash for speed
model = genai.GenerativeModel(model_name="gemini-1.5-flash")
# Prepare prompt
identity_context = ""
if user_name and user_name != "Unknown":
identity_context = f"The user speaking is named {user_name}. "
full_prompt = f"{system_prompt} {identity_context} Listen to the audio and respond."
# Prepare content parts
content_parts = [full_prompt]
        # Add audio. Gradio's type="filepath" audio component hands over a path
        # to a wav/mp3 file; the File API is the safest way to attach audio
        # (inline bytes also work for small files, depending on SDK version).
        # Note: uploads are not deleted here; see delete_uploaded_file below.
        myfile = genai.upload_file(audio_path)
content_parts.append(myfile)
# Add Image if multimodal enabled
if use_vision and image_frame is not None:
# Convert numpy array to PIL Image
pil_img = Image.fromarray(image_frame)
content_parts.append(pil_img)
content_parts[0] += " The user has also provided a video frame of what they are looking at."
# Generate
response = model.generate_content(content_parts)
response_text = response.text
# Update History (Gradio Chatbot 'messages' format)
# User message is just a placeholder for "Audio Sent" since we don't STT locally
history.append({"role": "user", "content": "🎀 [Audio Sent]"})
history.append({"role": "assistant", "content": response_text})
return history, history, None # Return None to clear the audio input
except Exception as e:
error_msg = f"Error: {str(e)}"
history.append({"role": "assistant", "content": error_msg})
return history, history, None
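
# The File API upload in generate_gemini_response is never deleted; Gemini's
# File API purges uploads on its own after about 48 hours, but a busy Space
# may want to clean up eagerly. A best-effort sketch using genai.delete_file:
def delete_uploaded_file(uploaded_file):
    try:
        genai.delete_file(uploaded_file.name)
    except Exception:
        pass  # cleanup is best-effort; the file expires server-side anyway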