# molecularmax's picture
# Remove SCRFD and OpenCV DNN models for reliable HF Spaces deployment
# d6a6f38
import gradio as gr
import numpy as np
import cv2
import mediapipe as mp
from facenet_pytorch import MTCNN
import torch
from insightface.app import FaceAnalysis
from ultralytics import YOLO
import os
import glob
from PIL import Image
import json
from datetime import datetime
# ---------------------------------------------------------------------------
# Model initialization (runs once at import time)
# ---------------------------------------------------------------------------

# MediaPipe face-detection solution handles.
mp_face_detection = mp.solutions.face_detection
mp_drawing = mp.solutions.drawing_utils

# Initialize MTCNN on GPU when available, otherwise CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
mtcnn = MTCNN(keep_all=True, device=device, min_face_size=20)

# Initialize InsightFace (RetinaFace); detection module only, CPU provider
# for portability on HF Spaces.
face_app = FaceAnalysis(allowed_modules=['detection'], providers=['CPUExecutionProvider'])
face_app.prepare(ctx_id=0, det_size=(640, 640))

# Global flag: True when a face-specific YOLO checkpoint is loaded,
# False when falling back to the general COCO model (person class).
yolo_face_mode = False

# Initialize YOLOv8 face detector.
# Note: a face-specific model such as yolov8n-face.pt is preferred when present.
try:
    if os.path.exists('yolov8n-face.pt'):
        yolo_model = YOLO('yolov8n-face.pt')
        yolo_face_mode = True
        print("Loaded YOLOv8 face-specific model")
    else:
        # Fall back to the general model; person boxes are adapted to faces later.
        yolo_model = YOLO('yolov8n.pt')
        yolo_face_mode = False
        print("Loaded general YOLOv8 model - will adapt person detections for faces")
except Exception as e:
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still
    # propagate, and the actual load error is surfaced instead of hidden.
    yolo_model = None
    yolo_face_mode = False
    print(f"YOLOv8 model not available ({e}). YOLO detection will be disabled.")

# Mapping of display name -> internal detector identifier.
face_detectors = {
    "MediaPipe": "mediapipe",
    "MTCNN": "mtcnn",
    "RetinaFace": "retinaface",
    "YOLOv8": "yolo"
}
def create_detection_legend():
    """Create a legend image mapping box colors to detector names.

    Returns:
        np.ndarray: BGR legend image showing the four active detectors,
        the ground-truth color, and the consensus agreement color scale.
    """
    # Canvas sized so the per-model rows plus the consensus section fit
    # (the original 280px height clipped the agreement labels).
    legend_height = 300
    legend_width = 350
    legend = np.ones((legend_height, legend_width, 3), dtype=np.uint8) * 255
    # Subtle vertical gradient background.
    for y in range(legend_height):
        legend[y, :] = [255 - int(y * 0.1), 255 - int(y * 0.1), 255]
    # Title.
    cv2.putText(legend, "Detection Legend", (legend_width//2 - 70, 25),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 0), 2)
    # Only the four active detectors plus ground truth are listed
    # (SCRFD and OpenCV DNN were removed from the ensemble).
    colors = {
        "MediaPipe": (0, 255, 0),      # Green
        "MTCNN": (0, 0, 255),          # Red
        "RetinaFace": (255, 255, 0),   # Yellow
        "YOLOv8": (255, 0, 255),       # Magenta
        "Ground Truth": (0, 255, 255)  # Cyan
    }
    y_offset = 60
    for name, color in colors.items():
        # Filled swatch with a black border.
        cv2.rectangle(legend, (15, y_offset - 15), (35, y_offset + 5), color, -1)
        cv2.rectangle(legend, (15, y_offset - 15), (35, y_offset + 5), (0, 0, 0), 1)
        # Text with a 1px shadow for readability.
        cv2.putText(legend, name, (46, y_offset + 1),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (100, 100, 100), 1)
        cv2.putText(legend, name, (45, y_offset),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 1)
        y_offset += 30
    # Consensus-view agreement color scale.
    cv2.putText(legend, "Consensus View (Agreement Level):", (15, y_offset + 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 2)
    y_offset += 35
    agreement_colors = [
        ("1/4 (25%)", (0, 0, 255)),    # Red
        ("2/4 (50%)", (0, 165, 255)),  # Orange
        ("3/4 (75%)", (0, 255, 255)),  # Yellow
        ("4/4 (100%)", (0, 255, 0))    # Green
    ]
    for i, (label, color) in enumerate(agreement_colors):
        x_pos = 45 + i * 70
        cv2.rectangle(legend, (x_pos, y_offset), (x_pos + 20, y_offset + 20), color, -1)
        cv2.rectangle(legend, (x_pos, y_offset), (x_pos + 20, y_offset + 20), (0, 0, 0), 1)
        cv2.putText(legend, label, (x_pos - 10, y_offset + 35),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 0), 1)
    # Single exit point; the original had a duplicate unreachable `return`.
    return legend
def get_example_images():
    """Collect up to 12 sample image paths from likely example folders."""
    candidate_dirs = ["faces", "examples", "samples", "."]
    patterns = ["*.jpg", "*.jpeg", "*.png", "*.webp"]
    found = []
    for directory in candidate_dirs:
        if not os.path.exists(directory):
            continue
        for pattern in patterns:
            found.extend(glob.glob(os.path.join(directory, pattern)))
    # De-duplicate, order deterministically, and cap at 12 entries.
    return sorted(set(found))[:12]
def detect_faces_mediapipe(image, confidence_threshold=0.5):
    """Run MediaPipe full-range face detection on a BGR image.

    Returns:
        (boxes, confidences): lists of [x, y, w, h] boxes (clamped to the
        image bounds) and their detection scores.
    """
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    with mp_face_detection.FaceDetection(
            min_detection_confidence=confidence_threshold,
            model_selection=1  # full-range model
    ) as face_detection:
        results = face_detection.process(rgb_image)
    if not results.detections:
        return [], []
    bboxes = []
    confidences = []
    h, w, _ = image.shape
    for detection in results.detections:
        rel = detection.location_data.relative_bounding_box
        # Convert relative coordinates to pixels, clamped to image bounds.
        x = max(0, int(rel.xmin * w))
        y = max(0, int(rel.ymin * h))
        box_w = min(int(rel.width * w), w - x)
        box_h = min(int(rel.height * h), h - y)
        bboxes.append([x, y, box_w, box_h])
        confidences.append(detection.score[0] if detection.score else confidence_threshold)
    return bboxes, confidences
def detect_faces_mtcnn(image, confidence_threshold=0.5):
    """Run MTCNN face detection on a BGR image.

    Returns:
        (boxes, confidences): [x, y, w, h] boxes and float probabilities,
        keeping only detections at or above `confidence_threshold`.
    """
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    boxes, probs = mtcnn.detect(rgb_image)
    if boxes is None:
        return [], []
    result_boxes, result_confidences = [], []
    for box, prob in zip(boxes, probs):
        if prob < confidence_threshold:
            continue  # drop low-confidence detections
        x1, y1, x2, y2 = map(int, box)
        result_boxes.append([x1, y1, x2 - x1, y2 - y1])
        result_confidences.append(float(prob))
    return result_boxes, result_confidences
def detect_faces_retinaface(image, confidence_threshold=0.5):
    """Run RetinaFace (via InsightFace) detection on a BGR image.

    Returns:
        (boxes, confidences): [x, y, w, h] boxes and float det_scores,
        keeping only detections at or above `confidence_threshold`.
    """
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    result_boxes, result_confidences = [], []
    for face in face_app.get(rgb_image):
        score = face.det_score
        if score < confidence_threshold:
            continue
        x1, y1, x2, y2 = face.bbox.astype(int)
        result_boxes.append([x1, y1, x2 - x1, y2 - y1])
        result_confidences.append(float(score))
    return result_boxes, result_confidences
def detect_faces_yolo(image, confidence_threshold=0.5):
    """Detect faces with YOLOv8.

    In face mode the model's class-0 boxes are used directly; with the
    general COCO model, person boxes (also class 0) are adapted by taking
    the top 30% of the box as an approximate face region and scaling the
    confidence down.

    Returns:
        (boxes, confidences): [x, y, w, h] boxes and scores; empty lists
        when the model failed to load at startup.
    """
    if yolo_model is None:
        return [], []
    results = yolo_model(image, conf=confidence_threshold)
    boxes = []
    confidences = []
    for r in results:
        for box in r.boxes:
            # Class 0 is "face" for the face model and "person" for COCO;
            # all other classes are ignored either way, so the two original
            # duplicated branches collapse into one.
            if int(box.cls) != 0:
                continue
            x1, y1, x2, y2 = box.xyxy[0].tolist()
            w = x2 - x1
            h = y2 - y1
            conf = float(box.conf)
            if not yolo_face_mode:
                # Heuristic: the face occupies roughly the top 30% of a
                # person box (original comment said "1/3" but used 0.3).
                h = h * 0.3
                conf *= 0.7  # reduce confidence for adapted detections
            boxes.append([int(x1), int(y1), int(w), int(h)])
            confidences.append(conf)
    return boxes, confidences
def draw_ground_truth(image, ground_truth_boxes):
    """Return a copy of `image` with ground-truth boxes rendered in cyan."""
    annotated = image.copy()
    for idx, (x, y, w, h) in enumerate(ground_truth_boxes):
        # Light cyan fill via alpha blending.
        overlay = annotated.copy()
        cv2.rectangle(overlay, (x, y), (x + w, y + h), (0, 255, 255), -1)
        cv2.addWeighted(overlay, 0.1, annotated, 0.9, 0, annotated)
        # White outline behind the cyan border for contrast.
        cv2.rectangle(annotated, (x - 1, y - 1), (x + w + 1, y + h + 1), (255, 255, 255), 3)
        cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 255), 2)
        # Label on a white strip above the box.
        label = f"GT {idx+1}"
        label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
        cv2.rectangle(annotated, (x, y - 25), (x + label_size[0] + 5, y - 2), (255, 255, 255), -1)
        cv2.putText(annotated, label, (x + 2, y - 8),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
    return annotated
def iou(bbox1, bbox2):
    """Intersection-over-Union of two [x, y, w, h] boxes (0.0 when disjoint)."""
    ax1, ay1, aw, ah = bbox1
    bx1, by1, bw, bh = bbox2
    # Corner coordinates.
    ax2, ay2 = ax1 + aw, ay1 + ah
    bx2, by2 = bx1 + bw, by1 + bh
    # Overlap rectangle (may be empty).
    ix1, iy1 = max(ax1, bx1), max(ay1, by1)
    ix2, iy2 = min(ax2, bx2), min(ay2, by2)
    if ix2 < ix1 or iy2 < iy1:
        return 0.0
    inter = (ix2 - ix1) * (iy2 - iy1)
    union = aw * ah + bw * bh - inter
    return inter / union if union > 0 else 0.0
def filter_overlapping_boxes(detections_dict, threshold=0.5):
    """Merge detections from multiple detectors into unique face candidates.

    Two boxes are treated as the same face when their IoU exceeds
    `threshold` (the first stored box is kept as the representative).
    Each merged entry records the contributing detectors, their individual
    confidences, and the running average confidence.
    """
    merged = []
    for detector_name, (boxes, confidences) in detections_dict.items():
        for box, conf in zip(boxes, confidences):
            match = None
            for candidate in merged:
                if iou(box, candidate['box']) > threshold:
                    match = candidate
                    break
            if match is None:
                merged.append({
                    'box': box,
                    'detectors': [detector_name],
                    'confidences': {detector_name: conf},
                    'avg_confidence': conf
                })
            elif detector_name not in match['detectors']:
                # Only count each detector once per merged face.
                match['detectors'].append(detector_name)
                match['confidences'][detector_name] = conf
                match['avg_confidence'] = np.mean(list(match['confidences'].values()))
    return [
        {
            'box': info['box'],
            'detector_count': len(set(info['detectors'])),
            'detectors': list(set(info['detectors'])),
            'avg_confidence': info['avg_confidence'],
            'confidences': info['confidences']
        }
        for info in merged
    ]
def evaluate_image_quality(image):
    """Score image quality (0-100) and collect human-readable issues.

    Returns:
        (quality_ok, message, score): quality_ok is True when score >= 50.
    """
    h, w = image.shape[:2]
    issues = []
    score = 100  # start from a perfect score, deduct per issue
    if h < 100 or w < 100:
        issues.append(f"Image too small ({w}x{h}, min 100x100)")
        score -= 30
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Laplacian variance as a sharpness proxy: low variance == blurry.
    blur_value = cv2.Laplacian(gray, cv2.CV_64F).var()
    if blur_value < 50:
        issues.append(f"Image blurry (score: {blur_value:.1f}, min 50)")
        score -= 20
    brightness = np.mean(gray)
    if brightness < 30:
        issues.append(f"Image too dark (brightness: {brightness:.1f})")
        score -= 20
    elif brightness > 225:
        issues.append(f"Image too bright (brightness: {brightness:.1f})")
        score -= 20
    contrast = gray.std()
    if contrast < 20:
        issues.append(f"Low contrast (std: {contrast:.1f})")
        score -= 15
    # Mean absolute Laplacian as a rough high-frequency noise estimate.
    noise_level = np.mean(np.abs(cv2.Laplacian(gray, cv2.CV_64F)))
    if noise_level > 20:
        issues.append(f"High noise level ({noise_level:.1f})")
        score -= 10
    message = f"Quality score: {score}/100"
    if issues:
        message += " - Issues: " + "; ".join(issues)
    return score >= 50, message, score
def calculate_metrics_with_ground_truth(unique_faces, ground_truth_faces, iou_threshold=0.5):
    """Compute precision/recall/F1/IoU of merged detections vs ground truth.

    Detections are matched greedily, in their given order, to the unmatched
    ground-truth box with the highest IoU at or above `iou_threshold`.
    """
    if not ground_truth_faces:
        # Without ground truth every detection is a false positive and the
        # ratio metrics are undefined.
        return {
            "precision": None,
            "recall": None,
            "f1_score": None,
            "true_positives": 0,
            "false_positives": len(unique_faces),
            "false_negatives": 0,
            "avg_iou": None,
            "avg_confidence": np.mean([f['avg_confidence'] for f in unique_faces]) if unique_faces else 0
        }
    det_boxes = [face['box'] for face in unique_faces]
    det_confs = [face['avg_confidence'] for face in unique_faces]
    matched = [False] * len(ground_truth_faces)
    iou_scores = []
    tp_confidences = []
    tp = 0
    for det_box, det_conf in zip(det_boxes, det_confs):
        best_iou, best_idx = 0, -1
        # Best unmatched ground-truth box for this detection.
        for gt_idx, gt_box in enumerate(ground_truth_faces):
            if matched[gt_idx]:
                continue
            overlap = iou(det_box, gt_box)
            if overlap >= iou_threshold and overlap > best_iou:
                best_iou, best_idx = overlap, gt_idx
        if best_idx >= 0:
            tp += 1
            matched[best_idx] = True
            iou_scores.append(best_iou)
            tp_confidences.append(det_conf)
    fp = len(det_boxes) - tp
    fn = matched.count(False)
    precision = tp / len(det_boxes) if len(det_boxes) > 0 else 0
    recall = tp / len(ground_truth_faces) if len(ground_truth_faces) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "true_positives": tp,
        "false_positives": fp,
        "false_negatives": fn,
        "avg_iou": np.mean(iou_scores) if iou_scores else None,
        "avg_confidence": np.mean(tp_confidences) if tp_confidences else 0
    }
def parse_ground_truth(ground_truth_str):
    """Parse ground-truth boxes from newline-separated "x,y,w,h" text.

    Invalid lines are reported and skipped individually, so one bad line
    no longer discards every previously parsed box (the original wrapped
    the whole loop in one try and returned [] on the first parse error).

    Returns:
        list of [x, y, w, h] int lists (possibly empty).
    """
    if not ground_truth_str or ground_truth_str.strip() == "":
        return []
    ground_truth_boxes = []
    for line_num, line in enumerate(ground_truth_str.strip().split("\n"), 1):
        if not line.strip():
            continue
        # Parse as "x,y,w,h"; report and skip unparseable lines.
        try:
            coords = [int(x.strip()) for x in line.split(",")]
        except ValueError as e:
            print(f"Error parsing ground truth: {str(e)}")
            continue
        if len(coords) != 4:
            print(f"Line {line_num}: Expected 4 coordinates, got {len(coords)}")
            continue
        x, y, w, h = coords
        if w > 0 and h > 0:  # dimensions must be positive
            ground_truth_boxes.append(coords)
        else:
            print(f"Line {line_num}: Invalid dimensions (w={w}, h={h})")
    return ground_truth_boxes
def create_comparison_grid(images_dict, max_cols=3):
    """Tile labelled images into a single comparison grid image.

    Args:
        images_dict: mapping of title -> BGR image (None values are skipped
            when drawing).
        max_cols: maximum number of grid columns.

    Returns:
        np.ndarray grid image; a small blank canvas when no images given.
    """
    images = list(images_dict.values())
    titles = list(images_dict.keys())
    n_images = len(images)
    if n_images == 0:
        # Guard: the original divided by zero columns on an empty dict.
        return np.ones((100, 100, 3), dtype=np.uint8) * 240
    n_cols = min(n_images, max_cols)
    n_rows = (n_images + n_cols - 1) // n_cols
    # Cell dimensions: half-scale of the first image.
    img_h, img_w = images[0].shape[:2]
    scale = 0.5
    cell_w = int(img_w * scale)
    cell_h = int(img_h * scale)
    # Canvas with 10px gutters and 40px per-row title strips.
    grid_w = cell_w * n_cols + 10 * (n_cols + 1)
    grid_h = cell_h * n_rows + 40 * n_rows + 10
    grid = np.ones((grid_h, grid_w, 3), dtype=np.uint8) * 240
    for idx, (img, title) in enumerate(zip(images, titles)):
        if img is None:
            continue
        row = idx // n_cols
        col = idx % n_cols
        img_resized = cv2.resize(img, (cell_w, cell_h))
        x = col * (cell_w + 10) + 10
        y = row * (cell_h + 40) + 30
        grid[y:y+cell_h, x:x+cell_w] = img_resized
        cv2.putText(grid, title, (x, y-10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 2)
    return grid
def process_image(image, min_detector_agreement, ground_truth_str, quality_check, confidence_threshold,
                  mediapipe_enabled, mtcnn_enabled, retinaface_enabled, yolo_enabled):
    """Process the image with selected face detectors and provide enhanced metrics.

    Runs every enabled detector, merges overlapping detections into unique
    faces, evaluates against optional ground truth, and renders the
    visualizations consumed by the Gradio UI.

    Returns a 9-tuple:
        (all-detections image, metrics text, verdict image, original image,
         verdict text, per-detector status dict, ground-truth image,
         legend image, comparison grid).
    """
    if image is None:
        return None, "No image uploaded", None, None, "❌ REJECTED: No image provided", None, None, None, None
    # Create legend image (fall back to a blank canvas on failure)
    try:
        legend_image = create_detection_legend()
    except Exception as e:
        print(f"Error creating legend: {str(e)}")
        legend_image = np.ones((250, 350, 3), dtype=np.uint8) * 255
    # Parse ground truth boxes
    ground_truth_boxes = parse_ground_truth(ground_truth_str)
    # Make a copy to avoid modifying the original
    image_copy = image.copy()
    # Check image quality if enabled; reject early on poor quality
    quality_score = 100
    if quality_check:
        quality_ok, quality_message, quality_score = evaluate_image_quality(image)
        if not quality_ok:
            return None, quality_message, None, None, f"❌ REJECTED: {quality_message}", None, None, legend_image, None
    # Detect faces with different methods
    detections = {}
    detector_results = {}
    processing_times = {}
    # Run each detector with timing
    import time
    # Build detector functions dict based on selected models
    detector_functions = {}
    model_enabled = {
        "MediaPipe": mediapipe_enabled,
        "MTCNN": mtcnn_enabled,
        "RetinaFace": retinaface_enabled,
        "YOLOv8": yolo_enabled
    }
    detector_func_map = {
        "MediaPipe": detect_faces_mediapipe,
        "MTCNN": detect_faces_mtcnn,
        "RetinaFace": detect_faces_retinaface,
        "YOLOv8": detect_faces_yolo
    }
    # Only include enabled detectors
    for detector_name, func in detector_func_map.items():
        if model_enabled[detector_name]:
            detector_functions[detector_name] = func
    # Update face_detectors dict for dynamic counting
    enabled_face_detectors = {k: v for k, v in face_detectors.items() if model_enabled[k]}
    for detector_name, detect_func in detector_functions.items():
        try:
            start_time = time.time()
            boxes, confidences = detect_func(image, confidence_threshold)
            processing_times[detector_name] = (time.time() - start_time) * 1000  # ms
            detections[detector_name] = (boxes, confidences)
            detector_results[detector_name] = {
                "status": "βœ… Success",
                "detections": len(boxes),
                "avg_confidence": np.mean(confidences) if confidences else 0,
                "time_ms": f"{processing_times[detector_name]:.1f}"
            }
        except Exception as e:
            # A failing detector is reported but does not abort the run
            detections[detector_name] = ([], [])
            detector_results[detector_name] = {
                "status": f"❌ Error: {str(e)}",
                "detections": 0,
                "avg_confidence": 0,
                "time_ms": "N/A"
            }
            print(f"{detector_name} error: {str(e)}")
    # Filter overlapping boxes with lower threshold for better merging
    unique_faces = filter_overlapping_boxes(detections, threshold=0.3)
    # Sort by confidence
    unique_faces = sorted(unique_faces, key=lambda x: x['avg_confidence'], reverse=True)
    # Create visualizations with better colors
    image_with_boxes = image_copy.copy()
    colors = {
        "MediaPipe": (0, 255, 0),    # Green
        "MTCNN": (0, 0, 255),        # Red
        "RetinaFace": (255, 255, 0), # Yellow
        "YOLOv8": (255, 0, 255)      # Magenta
    }
    # Draw all detections with colored boxes and better visibility
    for detector, (boxes, confidences) in detections.items():
        for box, conf in zip(boxes, confidences):
            x, y, w, h = box
            # Draw white background for better contrast
            cv2.rectangle(image_with_boxes, (x-1, y-1), (x+w+1, y+h+1), (255, 255, 255), 3)
            # Draw colored box
            cv2.rectangle(image_with_boxes, (x, y), (x+w, y+h), colors[detector], 2)
            # Add label with white background
            label = f"{detector} ({conf:.2f})"
            label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
            cv2.rectangle(image_with_boxes, (x, y-22), (x + label_size[0] + 4, y-2), (255, 255, 255), -1)
            cv2.putText(image_with_boxes, label, (x+2, y-7),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors[detector], 2)
    # Add ground truth boxes if provided
    if ground_truth_boxes:
        image_with_boxes = draw_ground_truth(image_with_boxes, ground_truth_boxes)
    # Prepare enhanced metrics
    metrics = {
        "πŸ“Š Detection Summary": "",
        "Total unique faces": len(unique_faces),
        "Total detections": sum(len(boxes) for boxes, _ in detections.values()),
        "": "",
        "πŸ” Detector Performance": ""
    }
    # Add metrics for enabled detectors only
    for detector_name in detector_functions.keys():
        if detector_name in detections:
            detection_count = len(detections[detector_name][0])
            processing_time = processing_times.get(detector_name, 0)
            metrics[detector_name] = f"{detection_count} faces ({processing_time:.1f}ms)"
    # Add ground truth metrics if provided
    if ground_truth_boxes:
        gt_metrics = calculate_metrics_with_ground_truth(unique_faces, ground_truth_boxes)
        metrics.update({
            " ": "",
            "πŸ“ Ground Truth Evaluation": "",
            "Ground truth faces": len(ground_truth_boxes),
            "True positives": gt_metrics["true_positives"],
            "False positives": gt_metrics["false_positives"],
            "False negatives": gt_metrics["false_negatives"],
            "Precision": f"{gt_metrics['precision']:.3f}" if gt_metrics['precision'] is not None else "N/A",
            "Recall": f"{gt_metrics['recall']:.3f}" if gt_metrics['recall'] is not None else "N/A",
            "F1 Score": f"{gt_metrics['f1_score']:.3f}" if gt_metrics['f1_score'] is not None else "N/A",
            "Average IoU": f"{gt_metrics['avg_iou']:.3f}" if gt_metrics['avg_iou'] is not None else "N/A"
        })
    # Add image quality metrics
    if quality_check:
        metrics.update({
            " ": "",
            "πŸ–ΌοΈ Image Quality": "",
            "Quality Score": f"{quality_score}/100"
        })
    # Calculate confidence scores for each face
    face_confidence = {}
    num_enabled_detectors = len(enabled_face_detectors)
    for i, face in enumerate(unique_faces):
        detector_ratio = face['detector_count'] / num_enabled_detectors
        conf_details = {
            "Agreement": f"{face['detector_count']}/{num_enabled_detectors} detectors",
            "Avg Confidence": f"{face['avg_confidence']:.3f}",
            "Detected by": ", ".join(face['detectors'])
        }
        # Add individual detector confidences
        for det in face['detectors']:
            if det in face['confidences']:
                conf_details[f"{det} conf"] = f"{face['confidences'][det]:.3f}"
        face_confidence[f"Face {i+1}"] = conf_details
    # Create metrics text
    metrics_text = "\n".join([f"{k}: {v}" for k, v in metrics.items()])
    # Create detailed face info text
    agreement_text = "\n🎯 Face Detection Details:\n"
    for face_id, conf in face_confidence.items():
        agreement_text += f"\n{face_id}:\n"
        for metric, value in conf.items():
            agreement_text += f" {metric}: {value}\n"
    # Check acceptance criteria
    accepted = True
    verdict_details = []
    # Check if we have enough detectors agreeing
    valid_faces = [face for face in unique_faces if face['detector_count'] >= min_detector_agreement]
    # If ground truth is provided, check against it
    if ground_truth_boxes:
        metrics_verdict = calculate_metrics_with_ground_truth(unique_faces, ground_truth_boxes)
        precision_threshold = 0.7
        recall_threshold = 0.7
        if metrics_verdict["precision"] < precision_threshold or metrics_verdict["recall"] < recall_threshold:
            accepted = False
            verdict_details.append(f"Detection quality below threshold")
            verdict_details.append(f"Precision: {metrics_verdict['precision']:.2f}, Recall: {metrics_verdict['recall']:.2f}")
    # Check detector agreement
    if len(valid_faces) == 0:
        accepted = False
        verdict_details.append(f"No faces with {min_detector_agreement}+ detector agreement")
    else:
        verdict_details.append(f"{len(valid_faces)} faces with {min_detector_agreement}+ detector agreement")
    # Create verdict display
    verdict = " | ".join(verdict_details)
    if accepted:
        verdict_text = f"βœ… ACCEPTED: {verdict}"
        verdict_color = (0, 200, 0)  # Green for accepted
    else:
        verdict_text = f"❌ REJECTED: {verdict}"
        verdict_color = (0, 0, 200)  # Red for rejected
    # Final output text
    final_text = f"{metrics_text}\n{agreement_text}"
    # Generate consensus result image with better visibility
    result_image = image_copy.copy()
    # Draw white background rectangles first for better contrast
    for face in unique_faces:
        x, y, w, h = face['box']
        cv2.rectangle(result_image, (x-2, y-2), (x+w+2, y+h+2), (255, 255, 255), 4)
    for i, face in enumerate(unique_faces):
        x, y, w, h = face['box']
        # Color based on detector agreement with more distinct colors
        agreement = face['detector_count'] / num_enabled_detectors
        if agreement <= 0.25:
            color = (0, 0, 255)  # Red for low agreement
        elif agreement <= 0.5:
            color = (0, 165, 255)  # Orange
        elif agreement <= 0.75:
            color = (0, 255, 255)  # Yellow
        else:
            color = (0, 255, 0)  # Green for high agreement
        # Draw box with fixed thick line for visibility
        thickness = 3
        cv2.rectangle(result_image, (x, y), (x+w, y+h), color, thickness)
        # Add label with better background
        # NOTE(review): denominator is len(face_detectors) (always 4), not
        # num_enabled_detectors — the label looks wrong when models are
        # disabled; confirm intended behavior before changing.
        label = f"F{i+1} ({face['detector_count']}/{len(face_detectors)}) {face['avg_confidence']:.2f}"
        label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
        # White background for label
        cv2.rectangle(result_image, (x, y-25), (x + label_size[0] + 6, y-2), (255, 255, 255), -1)
        # Black text for contrast
        cv2.putText(result_image, label, (x+3, y-8),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
    # Add ground truth boxes with enhanced visibility
    if ground_truth_boxes:
        for i, box in enumerate(ground_truth_boxes):
            x, y, w, h = box
            # White background for visibility
            cv2.rectangle(result_image, (x-2, y-2), (x+w+2, y+h+2), (255, 255, 255), 4)
            cv2.rectangle(result_image, (x, y), (x+w, y+h), (0, 255, 255), 3)
            # Label with white background
            label = f"GT {i+1}"
            label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
            cv2.rectangle(result_image, (x, y+h+2), (x + label_size[0] + 6, y+h+25), (255, 255, 255), -1)
            cv2.putText(result_image, label, (x+3, y+h+18),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
    # Add verdict overlay (semi-transparent white banner top-left)
    verdict_image = result_image.copy()
    overlay = verdict_image.copy()
    cv2.rectangle(overlay, (10, 10), (len(verdict_text) * 12, 50), (255, 255, 255), -1)
    cv2.addWeighted(overlay, 0.7, verdict_image, 0.3, 0, verdict_image)
    cv2.putText(verdict_image, verdict_text, (15, 35),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, verdict_color, 2)
    # Create ground truth visualization
    if ground_truth_boxes:
        gt_image = draw_ground_truth(image_copy, ground_truth_boxes)
    else:
        gt_image = None
    # Create comparison grid
    comparison_images = {
        "Original": image_copy,
        "All Detections": image_with_boxes,
        "Consensus": result_image
    }
    if gt_image is not None:
        comparison_images["Ground Truth"] = gt_image
    comparison_grid = create_comparison_grid(comparison_images)
    return (image_with_boxes, final_text, verdict_image, image, verdict_text,
            detector_results, gt_image, legend_image, comparison_grid)
# ---------------------------------------------------------------------------
# Gradio interface with enhanced styling
# ---------------------------------------------------------------------------
css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.gr-button-primary {
    background-color: #2563eb !important;
}
.verdict-box textarea {
    font-size: 1.2em !important;
    font-weight: bold !important;
}
"""
with gr.Blocks(title="GUARD Robustness Face Detection Ensemble", css=css) as demo:
    gr.Markdown("""
    # 🎯 GUARD Robustness
    ### See the results from an ensemble of face detectors and if they would pass the P/F criteria
    """)
    # Get sample image paths (shown as clickable examples when present)
    examples = get_example_images()
    with gr.Row():
        with gr.Column(scale=1):
            # Input section
            input_image = gr.Image(type="numpy", label="πŸ“Έ Upload Image")
            # Display examples if available
            if examples:
                gr.Examples(
                    examples=examples,
                    inputs=input_image,
                    examples_per_page=6,
                    label="πŸ“ Example Images"
                )
            with gr.Accordion("βš™οΈ Detection Settings", open=True):
                gr.Markdown("**Select Face Detection Models:**")
                with gr.Row():
                    mediapipe_enabled = gr.Checkbox(value=True, label="MediaPipe", scale=1)
                    mtcnn_enabled = gr.Checkbox(value=True, label="MTCNN", scale=1)
                    retinaface_enabled = gr.Checkbox(value=True, label="RetinaFace", scale=1)
                    yolo_enabled = gr.Checkbox(value=True, label="YOLOv8", scale=1)
                min_detector_agreement = gr.Slider(
                    minimum=1, maximum=4, value=2, step=1,
                    label="Minimum Detector Agreement",
                    info="Number of detectors that must agree on a face"
                )
                confidence_threshold = gr.Slider(
                    minimum=0.1, maximum=0.9, value=0.5, step=0.1,
                    label="Confidence Threshold",
                    info="Minimum confidence score for detections"
                )
                quality_check = gr.Checkbox(
                    value=True,
                    label="Enable Image Quality Check",
                    info="Check for blur, brightness, and contrast issues"
                )
                ground_truth = gr.Textbox(
                    label="Ground Truth Faces (Optional)",
                    placeholder="Enter face coordinates (x,y,w,h), one per line:\n100,150,50,60\n200,250,45,55",
                    lines=4,
                    info="Provide ground truth for accuracy evaluation"
                )
            submit_btn = gr.Button("πŸš€ Detect Faces", variant="primary", size="lg")
        with gr.Column(scale=2):
            # Results section
            verdict_box = gr.Textbox(
                label="πŸ“‹ Verdict",
                lines=1,
                elem_classes=["verdict-box"]
            )
            with gr.Tabs():
                with gr.Tab("🎯 Consensus Result"):
                    consensus_image = gr.Image(label="Consensus Detection with Verdict")
                    gr.Markdown("*Boxes colored by agreement level: Red (low) β†’ Yellow β†’ Green (high)*")
                with gr.Tab("πŸ” All Detections"):
                    output_image = gr.Image(label="All Model Detections")
                    metrics_text = gr.Textbox(label="Detection Metrics & Analysis", lines=20)
                with gr.Tab("πŸ“Š Comparison Grid"):
                    comparison_grid = gr.Image(label="Side-by-side Comparison")
                with gr.Tab("πŸ“Έ Original"):
                    original_image = gr.Image(label="Original Image")
                with gr.Tab("βœ”οΈ Ground Truth"):
                    ground_truth_image = gr.Image(label="Ground Truth Visualization")
                with gr.Tab("🎨 Legend"):
                    legend_image = gr.Image(label="Detection Box Legend")
                with gr.Tab("πŸ“ˆ Detector Status"):
                    detector_status = gr.Json(label="Detector Performance Details")
    # Footer
    gr.Markdown("""
    ---
    ### πŸ“– About
    This demo compares four reliable face detection models:
    - **MediaPipe**: Google's lightweight face detection
    - **MTCNN**: Multi-task Cascaded CNNs
    - **RetinaFace**: State-of-the-art face detection via InsightFace
    - **YOLOv8**: Latest YOLO architecture adapted for face detection
    Select which models to run and compare their performance. The consensus view shows faces colored by detector agreement level.
    """)
    # Process button handler
    def process_handler(image, min_detector_agreement, ground_truth_str, quality_check, confidence_threshold,
                        mediapipe_enabled, mtcnn_enabled, retinaface_enabled, yolo_enabled):
        """Wrap process_image, converting missing input and exceptions into UI-friendly 9-element outputs."""
        if image is None:
            legend_img = create_detection_legend()
            return [None, "No image selected", None, None,
                    "❌ REJECTED: No image provided", None, None, legend_img, None]
        try:
            return process_image(image, min_detector_agreement, ground_truth_str,
                                 quality_check, confidence_threshold,
                                 mediapipe_enabled, mtcnn_enabled, retinaface_enabled, yolo_enabled)
        except Exception as e:
            print(f"Error processing image: {str(e)}")
            legend_img = create_detection_legend()
            return [None, f"Error: {str(e)}", None, image,
                    f"❌ ERROR: {str(e)}", None, None, legend_img, None]
    # Wire the button; inputs/outputs order must match process_image's signature
    # and its 9-tuple return order.
    submit_btn.click(
        fn=process_handler,
        inputs=[input_image, min_detector_agreement, ground_truth, quality_check, confidence_threshold,
                mediapipe_enabled, mtcnn_enabled, retinaface_enabled, yolo_enabled],
        outputs=[output_image, metrics_text, consensus_image, original_image,
                 verdict_box, detector_status, ground_truth_image, legend_image, comparison_grid]
    )
# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)