# Face_Image_Mosaic_Generator / face_dataset_analyzer.py
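"""Analyze a directory of face images for photo-mosaic generation.

Reports file sizes, image dimensions, brightness and color statistics,
a color diversity score, and mosaic-grid suitability, then saves a
summary figure to 'face_dataset_analysis.png'.
"""
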
import numpy as np
from PIL import Image
import os
import glob
import matplotlib.pyplot as plt
from collections import Counter


def analyze_face_dataset(faces_directory="Real_Images/", sample_size=None):
    """Comprehensive analysis of the face dataset"""
    print(f"🔍 ANALYZING FACE DATASET: {faces_directory}")
    print("=" * 50)

    if not os.path.exists(faces_directory):
        print(f"❌ Directory '{faces_directory}' not found!")
        return

    # Load all face images
    face_extensions = ['*.png', '*.jpg', '*.jpeg', '*.bmp', '*.tiff']
    face_files = []
    for extension in face_extensions:
        face_files.extend(glob.glob(os.path.join(faces_directory, extension)))

    if len(face_files) == 0:
        print(f"❌ No face images found in {faces_directory}")
        return

    print(f"📊 Found {len(face_files)} face images")

    # Sample if dataset is very large
    if sample_size and len(face_files) > sample_size:
        face_files = face_files[:sample_size]
        print(f"📝 Analyzing sample of {sample_size} images for performance")

    # Initialize analysis data
    face_data = []
    image_sizes = []
    file_sizes = []
    color_stats = []
    brightness_values = []
    dominant_colors = []

    print("\n🔄 Processing images...")
    for i, face_path in enumerate(face_files):
        try:
            # Load image
            image = Image.open(face_path)

            # Get file size
            file_size = os.path.getsize(face_path) / 1024  # KB
            file_sizes.append(file_size)

            # Get image dimensions
            width, height = image.size
            image_sizes.append((width, height))

            # Convert to RGB and numpy array
            if image.mode != 'RGB':
                image = image.convert('RGB')
            img_array = np.array(image)

            # Calculate color statistics
            avg_color = np.mean(img_array, axis=(0, 1))
            brightness = np.mean(avg_color)
            color_stats.append(avg_color)
            brightness_values.append(brightness)
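            # Note: 'brightness' is the unweighted mean of the R, G, B channel
            # averages, not perceptual luma (e.g., Rec. 601 weighting).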

            # Get dominant color
            dominant_color = get_dominant_color(img_array)
            dominant_colors.append(dominant_color)

            # Store face data
            face_data.append({
                'filename': os.path.basename(face_path),
                'size': (width, height),
                'file_size_kb': file_size,
                'avg_color': avg_color,
                'brightness': brightness,
                'dominant_color': dominant_color
            })

            if (i + 1) % 50 == 0:
                print(f" Processed {i + 1}/{len(face_files)} images...")

        except Exception as e:
            print(f" ⚠️ Error processing {face_path}: {e}")

    print(f"\n✅ Analysis complete! Processed {len(face_data)} valid images")

    # Guard against the case where every image failed to load; the report
    # functions assume at least one valid image
    if not face_data:
        print("❌ No images could be processed.")
        return

    # Generate comprehensive analysis
    generate_analysis_report(face_data, face_files, faces_directory)

    # Create visualizations
    create_analysis_visualizations(face_data)

    return face_data


def get_dominant_color(image_array):
    """Extract dominant color using uniform quantization"""
    # Apply uniform quantization to reduce color space
    step_size = 256 // 8  # 8 levels per channel
    quantized = (image_array // step_size) * step_size + step_size // 2
    quantized = np.clip(quantized, 0, 255)

    # Flatten to get list of colors
    colors = quantized.reshape(-1, 3)

    # Find most frequent color
    unique_colors, counts = np.unique(colors, axis=0, return_counts=True)
    most_frequent_idx = np.argmax(counts)
    return unique_colors[most_frequent_idx].astype(float)
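
# With step_size = 32, each channel value is snapped to the center of its
# bucket: {16, 48, 80, 112, 144, 176, 208, 240}. That leaves at most
# 8**3 = 512 distinct colors, which keeps the np.unique() vote cheap.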


def generate_analysis_report(face_data, all_files, directory):
    """Generate detailed analysis report"""
    print("\n" + "=" * 60)
    print("📈 COMPREHENSIVE FACE DATASET ANALYSIS")
    print("=" * 60)

    # Basic statistics
    print("\n📁 DATASET OVERVIEW:")
    print(f"Directory: {directory}")
    print(f"Total Images Found: {len(all_files)}")
    print(f"Successfully Processed: {len(face_data)}")
    print(f"Success Rate: {100 * len(face_data) / len(all_files):.1f}%")

    # File size analysis
    file_sizes = [face['file_size_kb'] for face in face_data]
    print("\n💾 FILE SIZE ANALYSIS:")
    print(f"Average File Size: {np.mean(file_sizes):.1f} KB")
    print(f"File Size Range: {np.min(file_sizes):.1f} - {np.max(file_sizes):.1f} KB")
    print(f"Total Dataset Size: {np.sum(file_sizes) / 1024:.1f} MB")

    # Image dimension analysis
    widths = [face['size'][0] for face in face_data]
    heights = [face['size'][1] for face in face_data]
    print("\n🖼️ IMAGE DIMENSION ANALYSIS:")
    print(f"Width Range: {np.min(widths)} - {np.max(widths)} pixels")
    print(f"Height Range: {np.min(heights)} - {np.max(heights)} pixels")
    print(f"Average Dimensions: {np.mean(widths):.0f} × {np.mean(heights):.0f}")

    # Find common aspect ratios
    aspect_ratios = [w / h for w, h in zip(widths, heights)]
    aspect_ratio_rounded = [round(ar, 2) for ar in aspect_ratios]
    common_ratios = Counter(aspect_ratio_rounded).most_common(5)
    print("\n📐 COMMON ASPECT RATIOS:")
    for ratio, count in common_ratios:
        percentage = 100 * count / len(face_data)
        print(f" {ratio:.2f} : 1 → {count} images ({percentage:.1f}%)")

    # Color analysis
    brightnesses = [face['brightness'] for face in face_data]
    avg_colors = np.array([face['avg_color'] for face in face_data])
    print("\n🎨 COLOR ANALYSIS:")
    print(f"Brightness Range: {np.min(brightnesses):.1f} - {np.max(brightnesses):.1f}")
    print(f"Average Brightness: {np.mean(brightnesses):.1f} ± {np.std(brightnesses):.1f}")

    # RGB channel analysis
    print("\nRGB Channel Averages:")
    print(f" Red Channel: {np.mean(avg_colors[:, 0]):.1f} ± {np.std(avg_colors[:, 0]):.1f}")
    print(f" Green Channel: {np.mean(avg_colors[:, 1]):.1f} ± {np.std(avg_colors[:, 1]):.1f}")
    print(f" Blue Channel: {np.mean(avg_colors[:, 2]):.1f} ± {np.std(avg_colors[:, 2]):.1f}")

    # Diversity analysis
    color_diversity = calculate_color_diversity(avg_colors)
    print("\n🌈 DIVERSITY METRICS:")
    print(f"Color Diversity Score: {color_diversity:.2f} (0=uniform, 1=maximum diversity)")

    # Mosaic suitability assessment
    assess_mosaic_suitability(face_data)


def calculate_color_diversity(colors):
    """Calculate color diversity score"""
    # Calculate pairwise distances between all colors
    n = len(colors)
    total_distance = 0
    max_possible_distance = np.sqrt(3 * 255**2)  # Maximum RGB distance

    for i in range(n):
        for j in range(i + 1, n):
            distance = np.sqrt(np.sum((colors[i] - colors[j])**2))
            total_distance += distance

    # Normalize by maximum possible distance and number of pairs
    if n > 1:
        avg_distance = total_distance / (n * (n - 1) / 2)
        diversity_score = avg_distance / max_possible_distance
    else:
        diversity_score = 0

    return diversity_score
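
# The diversity score is the mean pairwise Euclidean distance between the
# images' average colors, normalized by the RGB-cube diagonal 255 * sqrt(3):
#     score = (2 / (n * (n - 1))) * sum_{i<j} ||c_i - c_j|| / (255 * sqrt(3))
# The double loop is O(n^2) in the number of images; an equivalent vectorized
# sketch (assuming the (n, 3) `colors` array fits in memory) would be:
#     diffs = colors[:, None, :] - colors[None, :, :]
#     dists = np.sqrt((diffs ** 2).sum(-1))
#     score = dists[np.triu_indices(len(colors), 1)].mean() / (255 * np.sqrt(3))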


def assess_mosaic_suitability(face_data):
    """Assess how suitable the dataset is for mosaic generation"""
    print("\n🎯 MOSAIC SUITABILITY ASSESSMENT:")

    num_faces = len(face_data)
    brightnesses = [face['brightness'] for face in face_data]
    brightness_std = np.std(brightnesses)

    # Quantity assessment
    if num_faces < 50:
        quantity_rating = "Poor"
        quantity_advice = "Add more faces for better variety"
    elif num_faces < 200:
        quantity_rating = "Fair"
        quantity_advice = "Good start, more faces will improve results"
    elif num_faces < 500:
        quantity_rating = "Good"
        quantity_advice = "Excellent quantity for most mosaics"
    else:
        quantity_rating = "Excellent"
        quantity_advice = "Outstanding variety available"

    # Diversity assessment
    if brightness_std < 20:
        diversity_rating = "Poor"
        diversity_advice = "Add faces with more varied lighting"
    elif brightness_std < 40:
        diversity_rating = "Fair"
        diversity_advice = "Decent range, could use more variety"
    elif brightness_std < 60:
        diversity_rating = "Good"
        diversity_advice = "Good brightness diversity"
    else:
        diversity_rating = "Excellent"
        diversity_advice = "Excellent brightness range"

    print(f" Quantity Rating: {quantity_rating} ({num_faces} faces)")
    print(f" Advice: {quantity_advice}")
    print(f" Diversity Rating: {diversity_rating} (σ={brightness_std:.1f})")
    print(f" Advice: {diversity_advice}")

    # Grid size recommendations
    recommended_grids = []
    if num_faces >= 64:
        recommended_grids.append("8×8 (64 tiles)")
    if num_faces >= 256:
        recommended_grids.append("16×16 (256 tiles)")
    if num_faces >= 1024:
        recommended_grids.append("32×32 (1,024 tiles)")
    if num_faces >= 2500:
        recommended_grids.append("50×50+ (2,500+ tiles)")

    if recommended_grids:
        print("\n📏 RECOMMENDED GRID SIZES:")
        for grid in recommended_grids:
            print(f" ✅ {grid}")
    else:
        print("\n⚠️ Dataset too small for most grid sizes. Add more faces!")


def create_analysis_visualizations(face_data):
    """Create comprehensive visualizations"""
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('Face Dataset Analysis', fontsize=16, fontweight='bold')

    # 1. Brightness distribution
    brightnesses = [face['brightness'] for face in face_data]
    axes[0, 0].hist(brightnesses, bins=30, color='skyblue', alpha=0.7, edgecolor='black')
    axes[0, 0].set_title('Brightness Distribution')
    axes[0, 0].set_xlabel('Average Brightness')
    axes[0, 0].set_ylabel('Number of Faces')
    axes[0, 0].grid(True, alpha=0.3)

    # 2. File size distribution
    file_sizes = [face['file_size_kb'] for face in face_data]
    axes[0, 1].hist(file_sizes, bins=25, color='lightcoral', alpha=0.7, edgecolor='black')
    axes[0, 1].set_title('File Size Distribution')
    axes[0, 1].set_xlabel('File Size (KB)')
    axes[0, 1].set_ylabel('Number of Faces')
    axes[0, 1].grid(True, alpha=0.3)

    # 3. Image dimensions scatter plot
    widths = [face['size'][0] for face in face_data]
    heights = [face['size'][1] for face in face_data]
    axes[0, 2].scatter(widths, heights, alpha=0.6, color='green')
    axes[0, 2].set_title('Image Dimensions')
    axes[0, 2].set_xlabel('Width (pixels)')
    axes[0, 2].set_ylabel('Height (pixels)')
    axes[0, 2].grid(True, alpha=0.3)

    # 4. RGB color distribution
    avg_colors = np.array([face['avg_color'] for face in face_data])
    axes[1, 0].hist(avg_colors[:, 0], bins=20, alpha=0.7, color='red', label='Red')
    axes[1, 0].hist(avg_colors[:, 1], bins=20, alpha=0.7, color='green', label='Green')
    axes[1, 0].hist(avg_colors[:, 2], bins=20, alpha=0.7, color='blue', label='Blue')
    axes[1, 0].set_title('RGB Channel Distribution')
    axes[1, 0].set_xlabel('Color Value')
    axes[1, 0].set_ylabel('Frequency')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)

    # 5. Dominant colors visualization
    dominant_colors = np.array([face['dominant_color'] for face in face_data])
    sample_indices = np.random.choice(len(dominant_colors), min(100, len(dominant_colors)), replace=False)
    sample_colors = dominant_colors[sample_indices] / 255.0  # Normalize for display

    # Create color palette visualization
    palette_size = int(np.sqrt(len(sample_colors)))
    palette = np.zeros((palette_size, palette_size, 3))
    for i in range(palette_size):
        for j in range(palette_size):
            idx = i * palette_size + j
            if idx < len(sample_colors):
                palette[i, j] = sample_colors[idx]

    axes[1, 1].imshow(palette)
    axes[1, 1].set_title('Sample Dominant Colors')
    axes[1, 1].set_xticks([])
    axes[1, 1].set_yticks([])
    # 6. Mosaic grid suitability
    grid_sizes = [8, 16, 24, 32, 48, 64]
    suitability_scores = []
    for grid_size in grid_sizes:
        total_tiles = grid_size * grid_size
        # Coverage ratio: fraction of tiles that could get a unique face,
        # capped at 1.0 once the dataset has enough faces for full coverage
        score = min(1.0, len(face_data) / total_tiles)
        suitability_scores.append(score)

    bars = axes[1, 2].bar(grid_sizes, suitability_scores, color='gold', alpha=0.7, edgecolor='black')
    axes[1, 2].set_title('Grid Size Suitability')
    axes[1, 2].set_xlabel('Grid Size (N×N)')
    axes[1, 2].set_ylabel('Coverage Ratio')
    axes[1, 2].set_ylim(0, 1.1)
    axes[1, 2].grid(True, alpha=0.3)

    # Add text labels on bars
    for bar, score in zip(bars, suitability_scores):
        if score >= 1.0:
            label = "Full"
            color = 'darkgreen'
        elif score >= 0.5:
            label = f"{score:.1f}"
            color = 'darkorange'
        else:
            label = f"{score:.2f}"
            color = 'darkred'
        axes[1, 2].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02,
                        label, ha='center', va='bottom', fontweight='bold', color=color)

    plt.tight_layout()
    plt.savefig('face_dataset_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("\n📊 Visualizations saved as 'face_dataset_analysis.png'")


def recommend_optimizations(face_data):
    """Provide specific recommendations for improving the dataset"""
    print("\n💡 OPTIMIZATION RECOMMENDATIONS:")
    print("-" * 40)

    num_faces = len(face_data)
    brightnesses = [face['brightness'] for face in face_data]
    avg_colors = np.array([face['avg_color'] for face in face_data])

    # Check for gaps in brightness spectrum
    brightness_bins = np.histogram(brightnesses, bins=10)[0]
    empty_bins = np.sum(brightness_bins == 0)
    if empty_bins > 2:
        print("🔍 Add faces with different lighting conditions")
        print(f" Found {empty_bins} gaps in brightness spectrum")

    # Check color diversity
    color_std = np.std(avg_colors, axis=0)
    if np.mean(color_std) < 30:
        print("🌈 Add more diverse face colors/ethnicities")
        print(" Current color variation is limited")

    # Check quantity recommendations
    target_grids = [(32, 1024), (48, 2304), (64, 4096)]
    for grid_size, needed_tiles in target_grids:
        if num_faces < needed_tiles:
            additional_needed = needed_tiles - num_faces
            print(f"📈 For {grid_size}×{grid_size} mosaics: Add {additional_needed} more faces")

    print("\n🎯 Priority Actions:")
    print(" 1. Collect faces with varied lighting (bright, normal, dark)")
    print(" 2. Include diverse ethnicities and skin tones")
    print(" 3. Add faces with different backgrounds")
    print(" 4. Maintain consistent image quality")


if __name__ == "__main__":
    # Run comprehensive analysis
    print("Starting face dataset analysis...")

    # Analyze the dataset
    face_data = analyze_face_dataset("Real_Images/", sample_size=1000)  # Limit for performance

    if face_data:
        # Provide optimization recommendations
        recommend_optimizations(face_data)

        print("\n🎉 Analysis Complete!")
        print("Check 'face_dataset_analysis.png' for visual analysis")
        print("Your dataset is ready for mosaic generation!")
    else:
        print("❌ Analysis failed. Please check your Real_Images/ directory.")