# NOTE: removed non-code scrape residue ("Spaces:" / "Runtime error" page header)
# that preceded the source — it was not part of the program.
| import gradio as gr | |
| import numpy as np | |
| import cv2 | |
| import mediapipe as mp | |
| from facenet_pytorch import MTCNN | |
| import torch | |
| from insightface.app import FaceAnalysis | |
| from ultralytics import YOLO | |
| import os | |
| import glob | |
| from PIL import Image | |
| import json | |
| from datetime import datetime | |
# Load MediaPipe face detector
mp_face_detection = mp.solutions.face_detection
mp_drawing = mp.solutions.drawing_utils

# Initialize MTCNN (GPU when available)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
mtcnn = MTCNN(keep_all=True, device=device, min_face_size=20)

# Initialize InsightFace (RetinaFace), detection module only
face_app = FaceAnalysis(allowed_modules=['detection'], providers=['CPUExecutionProvider'])
face_app.prepare(ctx_id=0, det_size=(640, 640))

# Global flag: True when face-specific YOLO weights are loaded,
# False when the general COCO model is used (person boxes adapted later).
yolo_face_mode = False

# Initialize YOLOv8 face detector.
# Note: a face-specific model like yolov8n-face.pt is preferred if available.
try:
    if os.path.exists('yolov8n-face.pt'):
        yolo_model = YOLO('yolov8n-face.pt')
        yolo_face_mode = True
        print("Loaded YOLOv8 face-specific model")
    else:
        # Fall back to general model
        yolo_model = YOLO('yolov8n.pt')
        yolo_face_mode = False
        print("Loaded general YOLOv8 model - will adapt person detections for faces")
except Exception:  # was a bare `except:` — keep the app alive, but never mask SystemExit/KeyboardInterrupt
    yolo_model = None
    yolo_face_mode = False
    print("YOLOv8 model not found. YOLO detection will be disabled.")

# Maps UI display names to internal detector identifiers.
face_detectors = {
    "MediaPipe": "mediapipe",
    "MTCNN": "mtcnn",
    "RetinaFace": "retinaface",
    "YOLOv8": "yolo"
}
def create_detection_legend():
    """Create a legend image mapping detector names to box colors.

    Returns:
        np.ndarray: BGR uint8 image listing each model's color plus the
        agreement-level color scale used by the consensus view.

    Fixes vs. original: removed a duplicated, unreachable ``return legend``
    statement, and enlarged the canvas — the previous 280px height clipped
    the agreement-scale swatches (drawn down to roughly y=345) off-image.
    """
    # Canvas tall enough for 7 model rows plus the agreement color scale.
    legend_height = 360
    legend_width = 350
    legend = np.ones((legend_height, legend_width, 3), dtype=np.uint8) * 255

    # Add a subtle vertical gradient background.
    for y in range(legend_height):
        legend[y, :] = [255 - int(y * 0.1), 255 - int(y * 0.1), 255]

    # Title
    cv2.putText(legend, "Detection Legend", (legend_width // 2 - 70, 25),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 0), 2)

    # Per-model colors.
    # NOTE(review): SCRFD and OpenCV DNN are listed here but have no detector
    # implemented in this file — confirm whether they should remain.
    colors = {
        "MediaPipe": (0, 255, 0),     # Green
        "MTCNN": (0, 0, 255),         # Red
        "RetinaFace": (255, 255, 0),  # Yellow
        "SCRFD": (255, 128, 0),       # Orange
        "YOLOv8": (255, 0, 255),      # Magenta
        "OpenCV DNN": (128, 0, 255),  # Purple
        "Ground Truth": (0, 255, 255) # Cyan
    }
    y_offset = 60
    for name, color in colors.items():
        # Colored swatch with a black border.
        cv2.rectangle(legend, (15, y_offset - 15), (35, y_offset + 5), color, -1)
        cv2.rectangle(legend, (15, y_offset - 15), (35, y_offset + 5), (0, 0, 0), 1)
        # Text with a light shadow for readability.
        cv2.putText(legend, name, (46, y_offset + 1),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (100, 100, 100), 1)
        cv2.putText(legend, name, (45, y_offset),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 1)
        y_offset += 30

    # Heat colors used by the consensus view.
    cv2.putText(legend, "Consensus View (Agreement Level):", (15, y_offset + 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 2)
    y_offset += 35
    agreement_colors = [
        ("1/4 (25%)", (0, 0, 255)),    # Red
        ("2/4 (50%)", (0, 165, 255)),  # Orange
        ("3/4 (75%)", (0, 255, 255)),  # Yellow
        ("4/4 (100%)", (0, 255, 0))    # Green
    ]
    for i, (label, color) in enumerate(agreement_colors):
        x_pos = 45 + i * 70
        cv2.rectangle(legend, (x_pos, y_offset), (x_pos + 20, y_offset + 20), color, -1)
        cv2.rectangle(legend, (x_pos, y_offset), (x_pos + 20, y_offset + 20), (0, 0, 0), 1)
        cv2.putText(legend, label, (x_pos - 10, y_offset + 35),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 0), 1)

    return legend
def get_example_images():
    """Collect up to 12 example image paths from common sample folders.

    Scans ``faces``, ``examples``, ``samples`` and the current directory for
    jpg/jpeg/png/webp files; de-duplicates and sorts for stable ordering.
    """
    found = []
    for folder in ("faces", "examples", "samples", "."):
        if not os.path.exists(folder):
            continue
        for pattern in ("*.jpg", "*.jpeg", "*.png", "*.webp"):
            found.extend(glob.glob(os.path.join(folder, pattern)))
    # De-duplicate, sort, and cap at 12 entries.
    return sorted(set(found))[:12]
def detect_faces_mediapipe(image, confidence_threshold=0.5):
    """Run MediaPipe full-range face detection.

    Args:
        image: BGR image (numpy array).
        confidence_threshold: minimum detection confidence.

    Returns:
        Tuple ``(boxes, confidences)`` with boxes as ``[x, y, w, h]`` in
        pixels, clamped to the image bounds.
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    with mp_face_detection.FaceDetection(
        min_detection_confidence=confidence_threshold,
        model_selection=1  # full-range model
    ) as detector:
        results = detector.process(rgb)
    if not results.detections:
        return [], []
    img_h, img_w, _ = image.shape
    out_boxes, out_scores = [], []
    for det in results.detections:
        rel = det.location_data.relative_bounding_box
        # Convert relative coordinates to pixels and clamp to the frame.
        x0 = max(0, int(rel.xmin * img_w))
        y0 = max(0, int(rel.ymin * img_h))
        bw = min(int(rel.width * img_w), img_w - x0)
        bh = min(int(rel.height * img_h), img_h - y0)
        out_boxes.append([x0, y0, bw, bh])
        # Fall back to the threshold when no score is reported.
        out_scores.append(det.score[0] if det.score else confidence_threshold)
    return out_boxes, out_scores
def detect_faces_mtcnn(image, confidence_threshold=0.5):
    """Run MTCNN detection; return ``([x, y, w, h] boxes, confidences)``.

    Detections below ``confidence_threshold`` are discarded.
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    boxes, probs = mtcnn.detect(rgb)
    if boxes is None:
        return [], []
    out_boxes, out_scores = [], []
    for corners, score in zip(boxes, probs):
        if score < confidence_threshold:
            continue
        # MTCNN returns [x1, y1, x2, y2]; convert to [x, y, w, h].
        x1, y1, x2, y2 = (int(c) for c in corners)
        out_boxes.append([x1, y1, x2 - x1, y2 - y1])
        out_scores.append(float(score))
    return out_boxes, out_scores
def detect_faces_retinaface(image, confidence_threshold=0.5):
    """Run RetinaFace (via InsightFace); return ``([x, y, w, h]``, scores).

    Uses the module-level ``face_app`` and keeps only detections whose
    ``det_score`` meets ``confidence_threshold``.
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    out_boxes, out_scores = [], []
    for face in face_app.get(rgb):
        score = face.det_score
        if score < confidence_threshold:
            continue
        # bbox is [x1, y1, x2, y2] floats; convert to int [x, y, w, h].
        x1, y1, x2, y2 = face.bbox.astype(int)
        out_boxes.append([x1, y1, x2 - x1, y2 - y1])
        out_scores.append(float(score))
    return out_boxes, out_scores
def detect_faces_yolo(image, confidence_threshold=0.5):
    """Detect faces with YOLOv8, or adapt person boxes from the COCO model.

    Returns ``([x, y, w, h] boxes, confidences)``; empty lists when no YOLO
    model could be loaded at startup.
    """
    if yolo_model is None:
        return [], []
    out_boxes, out_scores = [], []
    for result in yolo_model(image, conf=confidence_threshold):
        for det in result.boxes:
            # Class 0 is "face" in the face-specific weights and "person"
            # in the general COCO weights; everything else is ignored.
            if int(det.cls) != 0:
                continue
            x1, y1, x2, y2 = det.xyxy[0].tolist()
            if yolo_face_mode:
                out_boxes.append([int(x1), int(y1), int(x2 - x1), int(y2 - y1)])
                out_scores.append(float(det.conf))
            else:
                # Heuristic: the top ~30% of a person box approximates the
                # face region; discount the confidence accordingly.
                out_boxes.append([int(x1), int(y1), int(x2 - x1), int((y2 - y1) * 0.3)])
                out_scores.append(float(det.conf) * 0.7)
    return out_boxes, out_scores
def draw_ground_truth(image, ground_truth_boxes):
    """Return a copy of ``image`` with ground-truth boxes drawn in cyan.

    Each box gets a light cyan fill, a white halo behind a cyan border, and
    a numbered "GT n" label on a white background.
    """
    canvas = image.copy()
    for idx, (x, y, w, h) in enumerate(ground_truth_boxes):
        # Light cyan fill via alpha blending.
        tint = canvas.copy()
        cv2.rectangle(tint, (x, y), (x+w, y+h), (0, 255, 255), -1)
        cv2.addWeighted(tint, 0.1, canvas, 0.9, 0, canvas)
        # White outline behind the cyan border for contrast.
        cv2.rectangle(canvas, (x-1, y-1), (x+w+1, y+h+1), (255, 255, 255), 3)
        cv2.rectangle(canvas, (x, y), (x+w, y+h), (0, 255, 255), 2)
        # Numbered label on a white strip above the box.
        label = f"GT {idx+1}"
        (text_w, _), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
        cv2.rectangle(canvas, (x, y-25), (x + text_w + 5, y-2), (255, 255, 255), -1)
        cv2.putText(canvas, label, (x+2, y-8),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
    return canvas
def iou(bbox1, bbox2):
    """Intersection-over-Union of two ``[x, y, w, h]`` boxes.

    Returns 0.0 for disjoint boxes and for degenerate (zero-area) inputs.
    """
    ax, ay, aw, ah = bbox1
    bx, by, bw, bh = bbox2
    # Intersection corners in [x1, y1, x2, y2] space.
    ix1 = max(ax, bx)
    iy1 = max(ay, by)
    ix2 = min(ax + aw, bx + bw)
    iy2 = min(ay + ah, by + bh)
    # No overlap at all.
    if ix2 < ix1 or iy2 < iy1:
        return 0.0
    inter = (ix2 - ix1) * (iy2 - iy1)
    union = aw * ah + bw * bh - inter
    return inter / union if union > 0 else 0.0
def filter_overlapping_boxes(detections_dict, threshold=0.5):
    """Merge overlapping detections from different detectors into unique faces.

    Args:
        detections_dict: mapping of detector name -> (boxes, confidences),
            where boxes are [x, y, w, h].
        threshold: IoU above which two boxes are treated as the same face.

    Returns:
        List of dicts with keys 'box', 'detector_count', 'detectors',
        'avg_confidence' and 'confidences' (per-detector scores).
    """
    all_boxes = []
    # Greedy merge: each incoming box is attached to the first already-stored
    # box it overlaps (so the result depends on dict/iteration order);
    # otherwise it starts a new entry.
    for detector_name, (boxes, confidences) in detections_dict.items():
        for box, conf in zip(boxes, confidences):
            box_found = False
            for existing_box in all_boxes:
                if iou(box, existing_box['box']) > threshold:
                    # Credit each detector at most once per merged box.
                    # NOTE(review): when the SAME detector produces two
                    # overlapping boxes, this branch neither merges nor
                    # breaks, so the duplicate falls through and is appended
                    # as a new entry below — confirm that is intended.
                    if detector_name not in existing_box['detectors']:
                        existing_box['detectors'].append(detector_name)
                        existing_box['confidences'][detector_name] = conf
                        existing_box['avg_confidence'] = np.mean(list(existing_box['confidences'].values()))
                        box_found = True
                        break
            if not box_found:
                all_boxes.append({
                    'box': box,
                    'detectors': [detector_name],
                    'confidences': {detector_name: conf},
                    'avg_confidence': conf
                })
    # Re-shape into the public result format. The set() here is a safeguard
    # only — detectors are already kept unique by the check above.
    unique_boxes = []
    for box_info in all_boxes:
        unique_detectors = list(set(box_info['detectors']))
        unique_boxes.append({
            'box': box_info['box'],
            'detector_count': len(unique_detectors),
            'detectors': unique_detectors,
            'avg_confidence': box_info['avg_confidence'],
            'confidences': box_info['confidences']
        })
    return unique_boxes
def evaluate_image_quality(image):
    """Score basic image quality: size, blur, brightness, contrast, noise.

    Starts at 100 and subtracts a fixed penalty per detected issue.

    Returns:
        Tuple ``(quality_ok, message, score)`` where ``quality_ok`` is True
        when the score is at least 50/100.
    """
    h, w = image.shape[:2]
    issues = []
    score = 100  # perfect until proven otherwise

    if h < 100 or w < 100:
        issues.append(f"Image too small ({w}x{h}, min 100x100)")
        score -= 30

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Variance of the Laplacian: low values indicate blur.
    blur_value = cv2.Laplacian(gray, cv2.CV_64F).var()
    if blur_value < 50:
        issues.append(f"Image blurry (score: {blur_value:.1f}, min 50)")
        score -= 20

    brightness = np.mean(gray)
    if brightness < 30:
        issues.append(f"Image too dark (brightness: {brightness:.1f})")
        score -= 20
    elif brightness > 225:
        issues.append(f"Image too bright (brightness: {brightness:.1f})")
        score -= 20

    contrast = gray.std()
    if contrast < 20:
        issues.append(f"Low contrast (std: {contrast:.1f})")
        score -= 15

    # Mean absolute Laplacian as a rough high-frequency noise proxy.
    noise_level = np.mean(np.abs(cv2.Laplacian(gray, cv2.CV_64F)))
    if noise_level > 20:
        issues.append(f"High noise level ({noise_level:.1f})")
        score -= 10

    message = f"Quality score: {score}/100"
    if issues:
        message += " - Issues: " + "; ".join(issues)
    return score >= 50, message, score
def calculate_metrics_with_ground_truth(unique_faces, ground_truth_faces, iou_threshold=0.5):
    """Calculate enhanced detection metrics using ground truth faces.

    Args:
        unique_faces: merged detections from filter_overlapping_boxes()
            (dicts with at least 'box' and 'avg_confidence').
        ground_truth_faces: list of [x, y, w, h] reference boxes.
        iou_threshold: minimum IoU for a detection to count as a match.

    Returns:
        Dict with precision/recall/f1_score, TP/FP/FN counts, the average
        IoU over matched pairs, and the average confidence of true positives.
    """
    # Without ground truth every detection is counted as a false positive
    # and the ratio metrics are undefined (None).
    if not ground_truth_faces:
        return {
            "precision": None,
            "recall": None,
            "f1_score": None,
            "true_positives": 0,
            "false_positives": len(unique_faces),
            "false_negatives": 0,
            "avg_iou": None,
            "avg_confidence": np.mean([f['avg_confidence'] for f in unique_faces]) if unique_faces else 0
        }
    # Extract just the boxes from unique_faces
    detection_boxes = [face['box'] for face in unique_faces]
    detection_confidences = [face['avg_confidence'] for face in unique_faces]
    # Initialize counters
    true_positives = 0
    detected_gt = [False] * len(ground_truth_faces)  # one flag per GT box
    iou_scores = []
    tp_confidences = []
    # Greedy, order-dependent matching: each detection claims the best
    # still-unmatched ground-truth box above the IoU threshold.
    for det_idx, det_box in enumerate(detection_boxes):
        max_iou = 0
        max_idx = -1
        # Find best matching ground truth box
        for i, gt_box in enumerate(ground_truth_faces):
            if not detected_gt[i]:  # Only consider unmatched ground truth
                curr_iou = iou(det_box, gt_box)
                if curr_iou > max_iou and curr_iou >= iou_threshold:
                    max_iou = curr_iou
                    max_idx = i
        # If we found a match
        if max_idx >= 0:
            true_positives += 1
            detected_gt[max_idx] = True
            iou_scores.append(max_iou)
            tp_confidences.append(detection_confidences[det_idx])
    # Calculate metrics
    false_positives = len(detection_boxes) - true_positives
    false_negatives = detected_gt.count(False)
    precision = true_positives / len(detection_boxes) if len(detection_boxes) > 0 else 0
    recall = true_positives / len(ground_truth_faces) if len(ground_truth_faces) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "true_positives": true_positives,
        "false_positives": false_positives,
        "false_negatives": false_negatives,
        "avg_iou": np.mean(iou_scores) if iou_scores else None,
        "avg_confidence": np.mean(tp_confidences) if tp_confidences else 0
    }
def parse_ground_truth(ground_truth_str):
    """Parse ground-truth boxes from text, one "x,y,w,h" line per box.

    Malformed lines are reported and skipped, and valid lines are kept.
    (Previously the try wrapped the whole loop, so a single unparseable line
    aborted parsing and silently discarded every already-parsed box — at
    odds with the per-line error messages.)

    Args:
        ground_truth_str: multi-line string of comma-separated integers,
            or None/empty.

    Returns:
        List of [x, y, w, h] integer boxes (possibly empty).
    """
    if not ground_truth_str or ground_truth_str.strip() == "":
        return []
    ground_truth_boxes = []
    for line_num, line in enumerate(ground_truth_str.strip().split("\n"), 1):
        if not line.strip():
            continue
        # Try to parse as "x,y,w,h"; skip (don't abort) on bad input.
        try:
            coords = [int(x.strip()) for x in line.split(",")]
        except ValueError as e:
            print(f"Error parsing ground truth: {str(e)}")
            continue
        if len(coords) == 4:
            x, y, w, h = coords
            if w > 0 and h > 0:  # Validate positive dimensions
                ground_truth_boxes.append(coords)
            else:
                print(f"Line {line_num}: Invalid dimensions (w={w}, h={h})")
        else:
            print(f"Line {line_num}: Expected 4 coordinates, got {len(coords)}")
    return ground_truth_boxes
def create_comparison_grid(images_dict, max_cols=3):
    """Lay out named images on a single labelled comparison grid.

    Args:
        images_dict: mapping of title -> BGR image; None entries are skipped.
        max_cols: maximum number of grid columns.

    Returns:
        BGR np.ndarray grid, or a small blank canvas when there is nothing
        to show. (The original raised ZeroDivisionError on an empty dict and
        crashed on ``.shape`` when the first image was None; None entries
        also left blank cells in the layout.)
    """
    # Drop missing images up front so grid geometry matches real content.
    items = [(title, img) for title, img in images_dict.items() if img is not None]
    if not items:
        return np.ones((100, 100, 3), dtype=np.uint8) * 240

    n_images = len(items)
    n_cols = min(n_images, max_cols)
    n_rows = (n_images + n_cols - 1) // n_cols

    # All cells share the first image's aspect ratio, scaled down by half.
    img_h, img_w = items[0][1].shape[:2]
    scale = 0.5
    cell_w = int(img_w * scale)
    cell_h = int(img_h * scale)

    # Canvas: 10px gutters between columns, 40px per row for titles.
    grid_w = cell_w * n_cols + 10 * (n_cols + 1)
    grid_h = cell_h * n_rows + 40 * n_rows + 10
    grid = np.ones((grid_h, grid_w, 3), dtype=np.uint8) * 240

    for idx, (title, img) in enumerate(items):
        row, col = divmod(idx, n_cols)
        img_resized = cv2.resize(img, (cell_w, cell_h))
        x = col * (cell_w + 10) + 10
        y = row * (cell_h + 40) + 30
        grid[y:y + cell_h, x:x + cell_w] = img_resized
        # Title just above each cell.
        cv2.putText(grid, title, (x, y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 2)
    return grid
def process_image(image, min_detector_agreement, ground_truth_str, quality_check, confidence_threshold,
                  mediapipe_enabled, mtcnn_enabled, retinaface_enabled, yolo_enabled):
    """Process the image with selected face detectors and provide enhanced metrics.

    (Docstring closer fixed: the original ended with ``""`` instead of
    ``\"\"\"``, which made the whole module unparseable.)

    Returns a 9-tuple matching the Gradio outputs:
        (all-detections image, metrics/details text, consensus image with
        verdict, original image, verdict text, per-detector status dict,
        ground-truth image or None, legend image, comparison grid)
    """
    if image is None:
        return None, "No image uploaded", None, None, "β REJECTED: No image provided", None, None, None, None
    # Create legend image (fall back to a blank canvas on failure).
    try:
        legend_image = create_detection_legend()
    except Exception as e:
        print(f"Error creating legend: {str(e)}")
        legend_image = np.ones((250, 350, 3), dtype=np.uint8) * 255
    # Parse ground truth boxes
    ground_truth_boxes = parse_ground_truth(ground_truth_str)
    # Make a copy to avoid modifying the original
    image_copy = image.copy()
    # Check image quality if enabled; reject early on poor quality.
    quality_score = 100
    if quality_check:
        quality_ok, quality_message, quality_score = evaluate_image_quality(image)
        if not quality_ok:
            return None, quality_message, None, None, f"β REJECTED: {quality_message}", None, None, legend_image, None
    # Detect faces with different methods
    detections = {}
    detector_results = {}
    processing_times = {}
    # Run each detector with timing
    import time
    # Build detector functions dict based on selected models
    detector_functions = {}
    model_enabled = {
        "MediaPipe": mediapipe_enabled,
        "MTCNN": mtcnn_enabled,
        "RetinaFace": retinaface_enabled,
        "YOLOv8": yolo_enabled
    }
    detector_func_map = {
        "MediaPipe": detect_faces_mediapipe,
        "MTCNN": detect_faces_mtcnn,
        "RetinaFace": detect_faces_retinaface,
        "YOLOv8": detect_faces_yolo
    }
    # Only include enabled detectors
    for detector_name, func in detector_func_map.items():
        if model_enabled[detector_name]:
            detector_functions[detector_name] = func
    # Subset of the module-level face_detectors dict, used for dynamic counts.
    enabled_face_detectors = {k: v for k, v in face_detectors.items() if model_enabled[k]}
    # Run every enabled detector; a failing detector is recorded but does
    # not abort the others.
    for detector_name, detect_func in detector_functions.items():
        try:
            start_time = time.time()
            boxes, confidences = detect_func(image, confidence_threshold)
            processing_times[detector_name] = (time.time() - start_time) * 1000  # ms
            detections[detector_name] = (boxes, confidences)
            detector_results[detector_name] = {
                "status": "β Success",
                "detections": len(boxes),
                "avg_confidence": np.mean(confidences) if confidences else 0,
                "time_ms": f"{processing_times[detector_name]:.1f}"
            }
        except Exception as e:
            detections[detector_name] = ([], [])
            detector_results[detector_name] = {
                "status": f"β Error: {str(e)}",
                "detections": 0,
                "avg_confidence": 0,
                "time_ms": "N/A"
            }
            print(f"{detector_name} error: {str(e)}")
    # Filter overlapping boxes with lower threshold for better merging
    unique_faces = filter_overlapping_boxes(detections, threshold=0.3)
    # Sort by confidence (highest first)
    unique_faces = sorted(unique_faces, key=lambda x: x['avg_confidence'], reverse=True)
    # Create visualizations with better colors (BGR tuples)
    image_with_boxes = image_copy.copy()
    colors = {
        "MediaPipe": (0, 255, 0),    # Green
        "MTCNN": (0, 0, 255),        # Red
        "RetinaFace": (255, 255, 0), # Yellow
        "YOLOv8": (255, 0, 255)      # Magenta
    }
    # Draw all detections with colored boxes and better visibility
    for detector, (boxes, confidences) in detections.items():
        for box, conf in zip(boxes, confidences):
            x, y, w, h = box
            # Draw white background for better contrast
            cv2.rectangle(image_with_boxes, (x-1, y-1), (x+w+1, y+h+1), (255, 255, 255), 3)
            # Draw colored box
            cv2.rectangle(image_with_boxes, (x, y), (x+w, y+h), colors[detector], 2)
            # Add label with white background
            label = f"{detector} ({conf:.2f})"
            label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
            cv2.rectangle(image_with_boxes, (x, y-22), (x + label_size[0] + 4, y-2), (255, 255, 255), -1)
            cv2.putText(image_with_boxes, label, (x+2, y-7),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors[detector], 2)
    # Add ground truth boxes if provided
    if ground_truth_boxes:
        image_with_boxes = draw_ground_truth(image_with_boxes, ground_truth_boxes)
    # Prepare enhanced metrics.
    # NOTE(review): the ""/" " keys are blank separator rows for the rendered
    # text; the " " key is reused by the updates below, so when both ground
    # truth and quality check are active the later update overwrites it.
    metrics = {
        "π Detection Summary": "",
        "Total unique faces": len(unique_faces),
        "Total detections": sum(len(boxes) for boxes, _ in detections.values()),
        "": "",
        "π Detector Performance": ""
    }
    # Add metrics for enabled detectors only
    for detector_name in detector_functions.keys():
        if detector_name in detections:
            detection_count = len(detections[detector_name][0])
            processing_time = processing_times.get(detector_name, 0)
            metrics[detector_name] = f"{detection_count} faces ({processing_time:.1f}ms)"
    # Add ground truth metrics if provided
    if ground_truth_boxes:
        gt_metrics = calculate_metrics_with_ground_truth(unique_faces, ground_truth_boxes)
        metrics.update({
            " ": "",
            "π Ground Truth Evaluation": "",
            "Ground truth faces": len(ground_truth_boxes),
            "True positives": gt_metrics["true_positives"],
            "False positives": gt_metrics["false_positives"],
            "False negatives": gt_metrics["false_negatives"],
            "Precision": f"{gt_metrics['precision']:.3f}" if gt_metrics['precision'] is not None else "N/A",
            "Recall": f"{gt_metrics['recall']:.3f}" if gt_metrics['recall'] is not None else "N/A",
            "F1 Score": f"{gt_metrics['f1_score']:.3f}" if gt_metrics['f1_score'] is not None else "N/A",
            "Average IoU": f"{gt_metrics['avg_iou']:.3f}" if gt_metrics['avg_iou'] is not None else "N/A"
        })
    # Add image quality metrics
    if quality_check:
        metrics.update({
            " ": "",
            "πΌοΈ Image Quality": "",
            "Quality Score": f"{quality_score}/100"
        })
    # Calculate confidence scores for each face.
    # NOTE(review): if no model checkbox is enabled, num_enabled_detectors is
    # 0 and the divisions below raise ZeroDivisionError — confirm the UI
    # prevents an all-unchecked submission.
    face_confidence = {}
    num_enabled_detectors = len(enabled_face_detectors)
    for i, face in enumerate(unique_faces):
        detector_ratio = face['detector_count'] / num_enabled_detectors
        conf_details = {
            "Agreement": f"{face['detector_count']}/{num_enabled_detectors} detectors",
            "Avg Confidence": f"{face['avg_confidence']:.3f}",
            "Detected by": ", ".join(face['detectors'])
        }
        # Add individual detector confidences
        for det in face['detectors']:
            if det in face['confidences']:
                conf_details[f"{det} conf"] = f"{face['confidences'][det]:.3f}"
        face_confidence[f"Face {i+1}"] = conf_details
    # Create metrics text
    metrics_text = "\n".join([f"{k}: {v}" for k, v in metrics.items()])
    # Create detailed face info text
    agreement_text = "\nπ― Face Detection Details:\n"
    for face_id, conf in face_confidence.items():
        agreement_text += f"\n{face_id}:\n"
        for metric, value in conf.items():
            agreement_text += f" {metric}: {value}\n"
    # Check acceptance criteria
    accepted = True
    verdict_details = []
    # Check if we have enough detectors agreeing
    valid_faces = [face for face in unique_faces if face['detector_count'] >= min_detector_agreement]
    # If ground truth is provided, check against it
    if ground_truth_boxes:
        metrics_verdict = calculate_metrics_with_ground_truth(unique_faces, ground_truth_boxes)
        precision_threshold = 0.7
        recall_threshold = 0.7
        if metrics_verdict["precision"] < precision_threshold or metrics_verdict["recall"] < recall_threshold:
            accepted = False
            verdict_details.append(f"Detection quality below threshold")
            verdict_details.append(f"Precision: {metrics_verdict['precision']:.2f}, Recall: {metrics_verdict['recall']:.2f}")
    # Check detector agreement
    if len(valid_faces) == 0:
        accepted = False
        verdict_details.append(f"No faces with {min_detector_agreement}+ detector agreement")
    else:
        verdict_details.append(f"{len(valid_faces)} faces with {min_detector_agreement}+ detector agreement")
    # Create verdict display
    verdict = " | ".join(verdict_details)
    if accepted:
        verdict_text = f"β ACCEPTED: {verdict}"
        verdict_color = (0, 200, 0)  # Green for accepted
    else:
        verdict_text = f"β REJECTED: {verdict}"
        verdict_color = (0, 0, 200)  # Red for rejected
    # Final output text
    final_text = f"{metrics_text}\n{agreement_text}"
    # Generate consensus result image with better visibility
    result_image = image_copy.copy()
    # Draw white background rectangles first for better contrast
    for face in unique_faces:
        x, y, w, h = face['box']
        cv2.rectangle(result_image, (x-2, y-2), (x+w+2, y+h+2), (255, 255, 255), 4)
    for i, face in enumerate(unique_faces):
        x, y, w, h = face['box']
        # Color based on detector agreement with more distinct colors
        agreement = face['detector_count'] / num_enabled_detectors
        if agreement <= 0.25:
            color = (0, 0, 255)  # Red for low agreement
        elif agreement <= 0.5:
            color = (0, 165, 255)  # Orange
        elif agreement <= 0.75:
            color = (0, 255, 255)  # Yellow
        else:
            color = (0, 255, 0)  # Green for high agreement
        # Draw box with fixed thick line for visibility
        thickness = 3
        cv2.rectangle(result_image, (x, y), (x+w, y+h), color, thickness)
        # Add label with better background.
        # NOTE(review): the denominator here is the full roster
        # len(face_detectors) (always 4), not num_enabled_detectors used for
        # the color above — confirm whether it should match.
        label = f"F{i+1} ({face['detector_count']}/{len(face_detectors)}) {face['avg_confidence']:.2f}"
        label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
        # White background for label
        cv2.rectangle(result_image, (x, y-25), (x + label_size[0] + 6, y-2), (255, 255, 255), -1)
        # Black text for contrast
        cv2.putText(result_image, label, (x+3, y-8),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
    # Add ground truth boxes with enhanced visibility
    if ground_truth_boxes:
        for i, box in enumerate(ground_truth_boxes):
            x, y, w, h = box
            # White background for visibility
            cv2.rectangle(result_image, (x-2, y-2), (x+w+2, y+h+2), (255, 255, 255), 4)
            cv2.rectangle(result_image, (x, y), (x+w, y+h), (0, 255, 255), 3)
            # Label with white background (below the box, unlike detections)
            label = f"GT {i+1}"
            label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
            cv2.rectangle(result_image, (x, y+h+2), (x + label_size[0] + 6, y+h+25), (255, 255, 255), -1)
            cv2.putText(result_image, label, (x+3, y+h+18),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
    # Add verdict overlay (semi-transparent white strip at the top)
    verdict_image = result_image.copy()
    overlay = verdict_image.copy()
    cv2.rectangle(overlay, (10, 10), (len(verdict_text) * 12, 50), (255, 255, 255), -1)
    cv2.addWeighted(overlay, 0.7, verdict_image, 0.3, 0, verdict_image)
    cv2.putText(verdict_image, verdict_text, (15, 35),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, verdict_color, 2)
    # Create ground truth visualization
    if ground_truth_boxes:
        gt_image = draw_ground_truth(image_copy, ground_truth_boxes)
    else:
        gt_image = None
    # Create comparison grid
    comparison_images = {
        "Original": image_copy,
        "All Detections": image_with_boxes,
        "Consensus": result_image
    }
    if gt_image is not None:
        comparison_images["Ground Truth"] = gt_image
    comparison_grid = create_comparison_grid(comparison_images)
    return (image_with_boxes, final_text, verdict_image, image, verdict_text,
            detector_results, gt_image, legend_image, comparison_grid)
# Define the Gradio interface with enhanced styling.
# Custom CSS: base font, primary-button color, and an emphasised verdict box.
css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.gr-button-primary {
    background-color: #2563eb !important;
}
.verdict-box textarea {
    font-size: 1.2em !important;
    font-weight: bold !important;
}
"""
# Build the Gradio UI: left column = inputs/settings, right column = tabbed
# results. All components are wired to process_handler via the submit button.
with gr.Blocks(title="GUARD Robustness Face Detection Ensemble", css=css) as demo:
    gr.Markdown("""
    # π― GUARD Robustness
    ### See the results from an ensemble of face detectors and if they would pass the P/F criteria
    """)
    # Get sample image paths
    examples = get_example_images()
    with gr.Row():
        with gr.Column(scale=1):
            # Input section
            input_image = gr.Image(type="numpy", label="πΈ Upload Image")
            # Display examples if available
            if examples:
                gr.Examples(
                    examples=examples,
                    inputs=input_image,
                    examples_per_page=6,
                    label="π Example Images"
                )
            with gr.Accordion("βοΈ Detection Settings", open=True):
                gr.Markdown("**Select Face Detection Models:**")
                with gr.Row():
                    mediapipe_enabled = gr.Checkbox(value=True, label="MediaPipe", scale=1)
                    mtcnn_enabled = gr.Checkbox(value=True, label="MTCNN", scale=1)
                    retinaface_enabled = gr.Checkbox(value=True, label="RetinaFace", scale=1)
                    yolo_enabled = gr.Checkbox(value=True, label="YOLOv8", scale=1)
                min_detector_agreement = gr.Slider(
                    minimum=1, maximum=4, value=2, step=1,
                    label="Minimum Detector Agreement",
                    info="Number of detectors that must agree on a face"
                )
                confidence_threshold = gr.Slider(
                    minimum=0.1, maximum=0.9, value=0.5, step=0.1,
                    label="Confidence Threshold",
                    info="Minimum confidence score for detections"
                )
                quality_check = gr.Checkbox(
                    value=True,
                    label="Enable Image Quality Check",
                    info="Check for blur, brightness, and contrast issues"
                )
                ground_truth = gr.Textbox(
                    label="Ground Truth Faces (Optional)",
                    placeholder="Enter face coordinates (x,y,w,h), one per line:\n100,150,50,60\n200,250,45,55",
                    lines=4,
                    info="Provide ground truth for accuracy evaluation"
                )
            submit_btn = gr.Button("π Detect Faces", variant="primary", size="lg")
        with gr.Column(scale=2):
            # Results section
            verdict_box = gr.Textbox(
                label="π Verdict",
                lines=1,
                elem_classes=["verdict-box"]
            )
            with gr.Tabs():
                with gr.Tab("π― Consensus Result"):
                    consensus_image = gr.Image(label="Consensus Detection with Verdict")
                    gr.Markdown("*Boxes colored by agreement level: Red (low) β Yellow β Green (high)*")
                with gr.Tab("π All Detections"):
                    output_image = gr.Image(label="All Model Detections")
                    metrics_text = gr.Textbox(label="Detection Metrics & Analysis", lines=20)
                with gr.Tab("π Comparison Grid"):
                    comparison_grid = gr.Image(label="Side-by-side Comparison")
                with gr.Tab("πΈ Original"):
                    original_image = gr.Image(label="Original Image")
                with gr.Tab("βοΈ Ground Truth"):
                    ground_truth_image = gr.Image(label="Ground Truth Visualization")
                with gr.Tab("π¨ Legend"):
                    legend_image = gr.Image(label="Detection Box Legend")
                with gr.Tab("π Detector Status"):
                    detector_status = gr.Json(label="Detector Performance Details")
    # Footer
    gr.Markdown("""
    ---
    ### π About
    This demo compares four reliable face detection models:
    - **MediaPipe**: Google's lightweight face detection
    - **MTCNN**: Multi-task Cascaded CNNs
    - **RetinaFace**: State-of-the-art face detection via InsightFace
    - **YOLOv8**: Latest YOLO architecture adapted for face detection
    Select which models to run and compare their performance. The consensus view shows faces colored by detector agreement level.
    """)
    # Process button handler
    def process_handler(image, min_detector_agreement, ground_truth_str, quality_check, confidence_threshold,
                        mediapipe_enabled, mtcnn_enabled, retinaface_enabled, yolo_enabled):
        """UI wrapper around process_image(): guards against a missing image
        and converts unexpected exceptions into an error verdict instead of
        crashing the Gradio callback. Returns the same 9 outputs."""
        if image is None:
            legend_img = create_detection_legend()
            return [None, "No image selected", None, None,
                    "β REJECTED: No image provided", None, None, legend_img, None]
        try:
            return process_image(image, min_detector_agreement, ground_truth_str,
                                 quality_check, confidence_threshold,
                                 mediapipe_enabled, mtcnn_enabled, retinaface_enabled, yolo_enabled)
        except Exception as e:
            print(f"Error processing image: {str(e)}")
            legend_img = create_detection_legend()
            return [None, f"Error: {str(e)}", None, image,
                    f"β ERROR: {str(e)}", None, None, legend_img, None]
    # Wire the button; output order must match the 9-tuple returned by
    # process_image / process_handler.
    submit_btn.click(
        fn=process_handler,
        inputs=[input_image, min_detector_agreement, ground_truth, quality_check, confidence_threshold,
                mediapipe_enabled, mtcnn_enabled, retinaface_enabled, yolo_enabled],
        outputs=[output_image, metrics_text, consensus_image, original_image,
                 verdict_box, detector_status, ground_truth_image, legend_image, comparison_grid]
    )
# Launch the app
if __name__ == "__main__":
    # share=True also exposes a temporary public Gradio link.
    demo.launch(share=True)