Spaces:
Build error
Build error
| #!/usr/bin/env python3 | |
| """ | |
| AI Model Image Evaluation Tool | |
| A comprehensive tool for evaluating and comparing AI-generated images across multiple models. | |
| Works both in HuggingFace Spaces and locally without internet connection. | |
| """ | |
| import os | |
| import sys | |
| import json | |
| import numpy as np | |
| import pandas as pd | |
| from pathlib import Path | |
| from typing import Dict, List, Tuple, Optional, Any | |
| import argparse | |
| from datetime import datetime | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
| # Core dependencies | |
| try: | |
| from PIL import Image, ImageStat, ImageFilter, ImageEnhance | |
| import cv2 | |
| from skimage import measure, filters, feature, exposure | |
| from skimage.metrics import structural_similarity as ssim | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| except ImportError as e: | |
| print(f"Missing required package: {e}") | |
| sys.exit(1) | |
| # Optional dependencies for enhanced evaluation | |
| try: | |
| import torch | |
| import torchvision.transforms as transforms | |
| from transformers import CLIPProcessor, CLIPModel | |
| CLIP_AVAILABLE = True | |
| except ImportError: | |
| CLIP_AVAILABLE = False | |
| print("CLIP not available - aesthetic scoring will be limited") | |
| try: | |
| import gradio as gr | |
| GRADIO_AVAILABLE = True | |
| except ImportError: | |
| GRADIO_AVAILABLE = False | |
| print("Gradio not available - web interface disabled") | |
class ImageMetrics:
    """Calculate image quality and aesthetic metrics for PIL images.

    The low-level metrics (sharpness, contrast, ...) are returned on their
    natural scales. The aggregate scores (``calculate_aesthetic_score_*`` and
    ``calculate_technical_quality``) are weighted sums scaled by 100.
    """

    CLIP_MODEL_NAME = "openai/clip-vit-base-patch32"

    def __init__(self):
        """Load the CLIP model and processor when the optional deps exist.

        Both attributes stay ``None`` when CLIP is unavailable or fails to
        load; aesthetic scoring then uses the fallback heuristic.
        """
        self.clip_model = None
        self.clip_processor = None
        if not CLIP_AVAILABLE:
            return
        try:
            self.clip_model = CLIPModel.from_pretrained(self.CLIP_MODEL_NAME)
            self.clip_processor = CLIPProcessor.from_pretrained(self.CLIP_MODEL_NAME)
        except Exception as e:
            # Best effort: never keep a half-initialised pair around.
            self.clip_model = None
            self.clip_processor = None
            print(f"Could not load CLIP model - using fallback aesthetic scoring ({e})")

    @staticmethod
    def _rgb_array(image: Image.Image) -> np.ndarray:
        """Return *image* as an RGB pixel array, converting the mode if needed."""
        if image.mode != 'RGB':
            image = image.convert('RGB')
        return np.array(image)

    def calculate_sharpness(self, image: Image.Image) -> float:
        """Calculate image sharpness as the variance of the Laplacian."""
        gray = cv2.cvtColor(self._rgb_array(image), cv2.COLOR_RGB2GRAY)
        return float(cv2.Laplacian(gray, cv2.CV_64F).var())

    def calculate_contrast(self, image: Image.Image) -> float:
        """Calculate RMS contrast (standard deviation of the luminance)."""
        gray = np.array(image.convert('L'))
        return float(gray.std())

    def calculate_brightness(self, image: Image.Image) -> float:
        """Calculate average brightness (mean of the luminance)."""
        gray = np.array(image.convert('L'))
        return float(gray.mean())

    def calculate_saturation(self, image: Image.Image) -> float:
        """Calculate color saturation as the mean of the HSV S channel."""
        hsv = cv2.cvtColor(self._rgb_array(image), cv2.COLOR_RGB2HSV)
        return float(hsv[:, :, 1].mean())

    def calculate_colorfulness(self, image: Image.Image) -> float:
        """Calculate the Hasler and Süsstrunk colorfulness metric.

        Returns ``sqrt(std_rg^2 + std_yb^2) + 0.3 * sqrt(mean_rg^2 + mean_yb^2)``
        where ``rg = R - G`` and ``yb = 0.5 * (R + G) - B``.
        """
        # Work in float: subtracting/adding uint8 channels wraps modulo 256
        # and would corrupt both opponent-colour channels.
        img = self._rgb_array(image).astype(np.float64)
        red, green, blue = img[:, :, 0], img[:, :, 1], img[:, :, 2]
        rg = red - green
        yb = 0.5 * (red + green) - blue
        std_root = np.sqrt(np.std(rg) ** 2 + np.std(yb) ** 2)
        mean_root = np.sqrt(np.mean(rg) ** 2 + np.mean(yb) ** 2)
        return float(std_root + 0.3 * mean_root)

    def calculate_noise_level(self, image: Image.Image) -> float:
        """Estimate noise level as the std of a 3x3 high-pass filter response."""
        gray = cv2.cvtColor(self._rgb_array(image), cv2.COLOR_RGB2GRAY)
        kernel = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]])
        # NOTE(review): ddepth=-1 keeps the uint8 input depth, so negative
        # responses should saturate to 0. Kept as-is because the /50 scaling
        # in calculate_technical_quality is tuned to this output - confirm
        # before switching to a float depth.
        filtered = cv2.filter2D(gray, -1, kernel)
        return float(filtered.std())

    def calculate_dynamic_range(self, image: Image.Image) -> float:
        """Calculate dynamic range (max minus min luminance)."""
        gray = np.array(image.convert('L'))
        return float(int(gray.max()) - int(gray.min()))

    def calculate_edge_density(self, image: Image.Image) -> float:
        """Calculate the fraction of pixels marked as edges by Canny (50, 150)."""
        gray = cv2.cvtColor(self._rgb_array(image), cv2.COLOR_RGB2GRAY)
        edges = cv2.Canny(gray, 50, 150)
        return float(np.count_nonzero(edges) / edges.size)

    def calculate_composition_score(self, image: Image.Image) -> float:
        """Simple composition score from rule-of-thirds interest and balance.

        Interest is the mean luminance std in a window of up to 20x20 pixels
        around each of the four rule-of-thirds intersections. Balance is
        ``1 / (1 + |mean(left half) - mean(right half)|)``. The result is
        ``(interest + balance * 100) / 2``.
        """
        gray = np.array(image.convert('L'))
        h, w = gray.shape

        # Rule of thirds intersection points as (row, col)
        third_h, third_w = h // 3, w // 3
        intersections = [
            (third_h, third_w), (third_h, 2 * third_w),
            (2 * third_h, third_w), (2 * third_h, 2 * third_w),
        ]

        # Local luminance variation around each intersection
        interest_score = 0.0
        for y, x in intersections:
            region = gray[max(0, y - 10):min(h, y + 10), max(0, x - 10):min(w, x + 10)]
            if region.size > 0:
                interest_score += float(region.std())

        # Balance score (difference between left/right halves)
        if w < 2:
            # A one-pixel-wide image has an empty left half; its mean would
            # be NaN, so treat the image as perfectly balanced.
            balance_score = 1.0
        else:
            left_half = gray[:, :w // 2].mean()
            right_half = gray[:, w // 2:].mean()
            balance_score = 1.0 / (1.0 + abs(left_half - right_half))

        return float((interest_score / 4.0 + balance_score * 100) / 2)

    def calculate_aesthetic_score_clip(self, image: Image.Image) -> float:
        """Calculate an aesthetic score from CLIP image/text similarity.

        The image is compared against positive and negative prompts. The
        score is the positive share of the softmax probability mass, times
        100. Falls back to ``calculate_aesthetic_score_fallback`` when CLIP
        is not loaded or inference fails.
        """
        if self.clip_model is None or self.clip_processor is None:
            return self.calculate_aesthetic_score_fallback(image)

        try:
            # Aesthetic prompts
            positive_prompts = [
                "beautiful", "aesthetic", "artistic", "high quality", "masterpiece",
                "visually appealing", "well composed", "stunning", "gorgeous"
            ]
            negative_prompts = [
                "ugly", "low quality", "blurry", "distorted", "amateur",
                "poorly composed", "unappealing", "bad", "terrible"
            ]

            inputs = self.clip_processor(
                text=positive_prompts + negative_prompts,
                images=image,
                return_tensors="pt",
                padding=True
            )

            with torch.no_grad():
                outputs = self.clip_model(**inputs)
                logits_per_image = outputs.logits_per_image
                probs = logits_per_image.softmax(dim=-1)

            # Average positive vs negative sentiment
            split = len(positive_prompts)
            positive_score = probs[0][:split].mean().item()
            negative_score = probs[0][split:].mean().item()
            return positive_score / (positive_score + negative_score) * 100
        except Exception as e:
            print(f"CLIP aesthetic scoring failed: {e}")
            return self.calculate_aesthetic_score_fallback(image)

    def calculate_aesthetic_score_fallback(self, image: Image.Image) -> float:
        """Fallback aesthetic score built from the traditional metrics.

        Sharpness, contrast, colorfulness and composition are each divided by
        a fixed constant, capped at 1.0, and combined with weights
        0.3 / 0.25 / 0.25 / 0.2, then scaled by 100.
        """
        sharpness = min(self.calculate_sharpness(image) / 1000, 1.0)
        contrast = min(self.calculate_contrast(image) / 50, 1.0)
        colorfulness = min(self.calculate_colorfulness(image) / 100, 1.0)
        composition = min(self.calculate_composition_score(image) / 100, 1.0)

        # Weighted combination
        aesthetic_score = (
            sharpness * 0.3 +
            contrast * 0.25 +
            colorfulness * 0.25 +
            composition * 0.2
        ) * 100
        return aesthetic_score

    def calculate_technical_quality(self, image: Image.Image) -> float:
        """Overall technical quality score.

        Weighted sum of capped sharpness (0.4), capped contrast (0.3), a noise
        penalty (0.2) and dynamic range (0.1), scaled by 100.
        """
        sharpness = min(self.calculate_sharpness(image) / 1000, 1.0)
        contrast = min(self.calculate_contrast(image) / 50, 1.0)
        # More noise means a lower score; the penalty bottoms out at 0.
        noise_penalty = max(0, 1.0 - self.calculate_noise_level(image) / 50)
        dynamic_range = min(self.calculate_dynamic_range(image) / 255, 1.0)
        return (sharpness * 0.4 + contrast * 0.3 + noise_penalty * 0.2 + dynamic_range * 0.1) * 100
class ModelEvaluator:
    """Main evaluation system for comparing AI model outputs.

    Holds an ``ImageMetrics`` calculator and the list of per-image result
    dicts produced by the most recent ``evaluate_batch`` call.
    """

    def __init__(self):
        self.metrics_calculator = ImageMetrics()
        self.results = []

    def evaluate_image(self, image_path: str, model_name: str, prompt: str = "") -> Optional[Dict[str, Any]]:
        """Evaluate a single image and return its metrics.

        Args:
            image_path: Path of the image file to score.
            model_name: Label of the model that generated the image.
            prompt: Prompt used for generation; stored verbatim.

        Returns:
            A dict of metadata, raw metrics and scores, or ``None`` when the
            image cannot be opened or scored (the error is printed).
        """
        try:
            # Close the file handle promptly; convert() returns an
            # independent in-memory image.
            with Image.open(image_path) as raw:
                image = raw.convert('RGB')

            calc = self.metrics_calculator
            width, height = image.size

            metrics = {
                'model_name': model_name,
                'image_path': image_path,
                'prompt': prompt,
                'file_size_kb': os.path.getsize(image_path) / 1024,
                'resolution': f"{width}x{height}",
                'aspect_ratio': round(width / height, 2),
                # Technical metrics
                'sharpness': round(calc.calculate_sharpness(image), 2),
                'contrast': round(calc.calculate_contrast(image), 2),
                'brightness': round(calc.calculate_brightness(image), 2),
                'saturation': round(calc.calculate_saturation(image), 2),
                'colorfulness': round(calc.calculate_colorfulness(image), 2),
                'noise_level': round(calc.calculate_noise_level(image), 2),
                'dynamic_range': round(calc.calculate_dynamic_range(image), 2),
                'edge_density': round(calc.calculate_edge_density(image), 4),
                # Quality scores
                'technical_quality': round(calc.calculate_technical_quality(image), 2),
                'composition_score': round(calc.calculate_composition_score(image), 2),
                'aesthetic_score': round(calc.calculate_aesthetic_score_clip(image), 2),
                # Overall score (weighted combination), filled in below
                'overall_score': 0.0
            }

            # Overall score: 40% technical, 40% aesthetic, 20% composition
            metrics['overall_score'] = round(
                metrics['technical_quality'] * 0.4 +
                metrics['aesthetic_score'] * 0.4 +
                metrics['composition_score'] * 0.2, 2
            )
            return metrics
        except Exception as e:
            # Batch boundary: one unreadable image must not abort the run.
            print(f"Error evaluating {image_path}: {e}")
            return None

    def evaluate_batch(self, image_paths: List[str], model_names: List[str],
                       prompts: Optional[List[str]] = None) -> pd.DataFrame:
        """Evaluate multiple images and return a comparison DataFrame.

        Images and model names are paired positionally. Missing prompts
        default to the empty string. Images that fail evaluation are skipped.
        The successful results also replace ``self.results``.
        """
        if prompts is None:
            prompts = [""] * len(image_paths)

        self.results = []
        for i, (img_path, model_name) in enumerate(zip(image_paths, model_names)):
            prompt = prompts[i] if i < len(prompts) else ""
            result = self.evaluate_image(img_path, model_name, prompt)
            if result is not None:
                self.results.append(result)
        return pd.DataFrame(self.results)

    def create_comparison_report(self, df: pd.DataFrame, output_path: Optional[str] = None) -> str:
        """Create a detailed comparison report.

        Args:
            df: Results as returned by ``evaluate_batch``.
            output_path: When given, the report is also written to this file.

        Returns:
            The report text, or a short notice when *df* is empty.
        """
        if df.empty:
            return "No valid results to compare."

        report = []
        report.append("# AI Model Image Evaluation Report")
        report.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report.append(f"Total images evaluated: {len(df)}")
        report.append("")

        # Summary statistics
        report.append("## Summary Statistics")
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        summary = df[numeric_cols].describe()
        report.append(summary.to_string())
        report.append("")

        # Model rankings
        report.append("## Model Rankings by Overall Score")
        model_scores = df.groupby('model_name')['overall_score'].agg(['mean', 'std', 'count'])
        model_scores = model_scores.sort_values('mean', ascending=False)
        report.append(model_scores.to_string())
        report.append("")

        # Best performing models by category (single best image per category)
        categories = ['technical_quality', 'aesthetic_score', 'composition_score', 'sharpness', 'colorfulness']
        report.append("## Best Performing Models by Category")
        for category in categories:
            if category in df.columns:
                best_model = df.loc[df[category].idxmax()]
                report.append(f"**{category.replace('_', ' ').title()}**: {best_model['model_name']} ({best_model[category]:.2f})")
        report.append("")

        # Detailed comparison table
        report.append("## Detailed Results")
        comparison_cols = ['model_name', 'overall_score', 'technical_quality', 'aesthetic_score',
                           'composition_score', 'sharpness', 'contrast', 'colorfulness']
        comparison_df = df[comparison_cols].round(2)
        report.append(comparison_df.to_string(index=False))

        report_text = "\n".join(report)

        if output_path:
            # Explicit encoding so model names and prompts with non-ASCII
            # characters are written the same way on every platform.
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(report_text)
            print(f"Report saved to {output_path}")
        return report_text

    def create_visualization(self, df: pd.DataFrame, output_dir: str = "."):
        """Create visualization plots for the comparison.

        Draws a 2x2 figure (mean overall score per model, technical vs
        aesthetic scatter, overall-score histograms, metric correlation
        heatmap) and saves it as ``model_comparison.png`` in *output_dir*.
        """
        if df.empty:
            print("No data to visualize")
            return

        # The seaborn style was renamed to 'seaborn-v0_8' in Matplotlib 3.6;
        # use whichever name this installation knows, otherwise keep the
        # current style.
        for style_name in ('seaborn-v0_8', 'seaborn'):
            if style_name in plt.style.available:
                plt.style.use(style_name)
                break

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # Overall score comparison
        model_scores = df.groupby('model_name')['overall_score'].mean().sort_values(ascending=True)
        axes[0, 0].barh(model_scores.index, model_scores.values)
        axes[0, 0].set_title('Overall Score by Model')
        axes[0, 0].set_xlabel('Score')

        models = df['model_name'].unique()

        # Scatter plot: Technical vs Aesthetic
        for model in models:
            model_data = df[df['model_name'] == model]
            axes[0, 1].scatter(model_data['technical_quality'], model_data['aesthetic_score'],
                               label=model, alpha=0.7, s=60)
        axes[0, 1].set_xlabel('Technical Quality')
        axes[0, 1].set_ylabel('Aesthetic Score')
        axes[0, 1].set_title('Technical vs Aesthetic Quality')
        axes[0, 1].legend()

        # Distribution of overall scores
        for model in models:
            model_data = df[df['model_name'] == model]['overall_score']
            axes[1, 0].hist(model_data, alpha=0.6, label=model, bins=10)
        axes[1, 0].set_xlabel('Overall Score')
        axes[1, 0].set_ylabel('Frequency')
        axes[1, 0].set_title('Distribution of Overall Scores')
        axes[1, 0].legend()

        # Correlation heatmap of metrics
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        correlation_matrix = df[numeric_cols].corr()
        sns.heatmap(correlation_matrix, ax=axes[1, 1], cmap='coolwarm', center=0,
                    square=True, cbar_kws={'shrink': 0.8})
        axes[1, 1].set_title('Metrics Correlation Heatmap')

        plt.tight_layout()

        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        plot_path = os.path.join(output_dir, 'model_comparison.png')
        fig.savefig(plot_path, dpi=300, bbox_inches='tight')
        # Close this figure explicitly so repeated calls do not pile up
        # open figures.
        plt.close(fig)
        print(f"Visualization saved to {plot_path}")
def create_gradio_interface():
    """Create the Gradio web interface.

    Returns:
        The ``gr.Blocks`` interface, or ``None`` when Gradio is not installed.
    """
    import tempfile  # only the web UI needs scratch space

    if not GRADIO_AVAILABLE:
        print("Gradio not available - cannot create web interface")
        return None

    evaluator = ModelEvaluator()

    def _as_image_path(item, index, scratch_dir):
        """Return a filesystem path for one uploaded item.

        The upload component hands the callback file paths or file-like
        wrappers exposing ``.name``, not PIL images. In-memory images
        (anything with ``.save``) are still accepted and written into
        *scratch_dir*.
        """
        if isinstance(item, (str, os.PathLike)):
            return os.fspath(item)
        if hasattr(item, "save"):
            path = os.path.join(scratch_dir, f"temp_image_{index}.png")
            item.save(path)
            return path
        name = getattr(item, "name", None)
        if name:
            return name
        raise TypeError(f"Unsupported upload type: {type(item).__name__}")

    def evaluate_images(images, model_names, prompts=""):
        """Gradio callback: score the uploads and return (report, table, plot path)."""
        try:
            if not images or not (model_names or "").strip():
                return "Please provide images and model names", None, None

            # Parse model names and prompts
            model_list = [name.strip() for name in model_names.split(',') if name.strip()]
            prompt_list = [p.strip() for p in prompts.split('\n')] if prompts else [""] * len(images)

            # Validate before touching the filesystem so an early return
            # cannot leave temporary files behind.
            if len(model_list) == 1 and len(images) > 1:
                model_list = model_list * len(images)
            elif len(model_list) != len(images):
                return f"Number of model names ({len(model_list)}) must match number of images ({len(images)})", None, None

            # The scratch directory and anything saved into it is removed
            # when the block exits, including on errors.
            with tempfile.TemporaryDirectory() as scratch_dir:
                image_paths = [
                    _as_image_path(item, i, scratch_dir)
                    for i, item in enumerate(images)
                ]
                df = evaluator.evaluate_batch(image_paths, model_list, prompt_list)

            if df.empty:
                return "No images could be evaluated", None, None

            # Create report and visualization
            report = evaluator.create_comparison_report(df)
            evaluator.create_visualization(df, ".")
            return report, df, "model_comparison.png"
        except Exception as e:
            # UI boundary: report the problem in the report box instead of
            # failing the request.
            return f"Error during evaluation: {str(e)}", None, None

    # Create interface
    with gr.Blocks(title="AI Model Image Evaluator") as interface:
        gr.Markdown("# AI Model Image Evaluation Tool")
        gr.Markdown("Upload images from different AI models to compare their quality, aesthetics, and technical metrics.")

        with gr.Row():
            with gr.Column():
                images_input = gr.File(file_count="multiple", file_types=["image"], label="Upload Images")
                model_names_input = gr.Textbox(
                    label="Model Names",
                    placeholder="model1, model2, model3 (comma-separated)",
                    info="If you provide one name for multiple images, it will be applied to all"
                )
                prompts_input = gr.Textbox(
                    label="Prompts (Optional)",
                    placeholder="One prompt per line",
                    lines=3
                )
                evaluate_btn = gr.Button("Evaluate Images", variant="primary")

        with gr.Tabs():
            with gr.TabItem("Report"):
                report_output = gr.Textbox(label="Evaluation Report", lines=25, max_lines=50)
            with gr.TabItem("Data Table"):
                dataframe_output = gr.Dataframe(label="Detailed Results")
            with gr.TabItem("Visualization"):
                plot_output = gr.Image(label="Comparison Visualization")

        # Examples
        gr.Examples(
            examples=[
                [[], "DALL-E, Midjourney, Stable Diffusion", "beautiful landscape\nportrait of a cat\nabstract art"],
                [[], "model_v1, model_v2", ""],
            ],
            inputs=[images_input, model_names_input, prompts_input],
        )

        evaluate_btn.click(
            evaluate_images,
            inputs=[images_input, model_names_input, prompts_input],
            outputs=[report_output, dataframe_output, plot_output]
        )

    return interface
def main():
    """Command-line entry point.

    With no arguments, or when the ``SPACE_ID`` environment variable is set,
    the web interface is launched on 0.0.0.0:7860. Otherwise the ``evaluate``
    and ``web`` sub-commands are dispatched.

    Returns:
        Process exit status: 0 on success, 1 when the requested action could
        not be carried out.
    """
    parser = argparse.ArgumentParser(description="AI Model Image Evaluation Tool")
    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # CLI evaluation command
    eval_parser = subparsers.add_parser('evaluate', help='Evaluate images from command line')
    eval_parser.add_argument('--images', nargs='+', required=True, help='Paths to images')
    eval_parser.add_argument('--models', nargs='+', required=True, help='Model names')
    eval_parser.add_argument('--prompts', nargs='*', help='Prompts used (optional)')
    eval_parser.add_argument('--output', help='Output directory for results')

    # Web interface command
    web_parser = subparsers.add_parser('web', help='Launch web interface')
    web_parser.add_argument('--port', type=int, default=7860, help='Port for web interface')
    web_parser.add_argument('--share', action='store_true', help='Create public link')

    # Check if running in HuggingFace Spaces or no args provided
    if len(sys.argv) == 1 or os.getenv('SPACE_ID'):
        # Auto-launch web interface for HF Spaces or when no args
        print("Auto-launching web interface...")
        interface = create_gradio_interface()
        if interface is None:
            print("Web interface not available - Gradio not installed")
            return 1
        interface.launch(server_name="0.0.0.0", server_port=7860)
        return 0

    args = parser.parse_args()

    if args.command == 'evaluate':
        evaluator = ModelEvaluator()

        # A single model name is applied to every image; otherwise the
        # counts must match one-to-one.
        if len(args.models) == 1 and len(args.images) > 1:
            args.models = args.models * len(args.images)
        elif len(args.models) != len(args.images):
            print(f"Error: Number of models ({len(args.models)}) must match number of images ({len(args.images)})")
            return 1

        prompts = args.prompts if args.prompts else [""] * len(args.images)

        # Evaluate images
        print("Evaluating images...")
        df = evaluator.evaluate_batch(args.images, args.models, prompts)
        if df.empty:
            print("No images could be evaluated")
            return 1

        # Create output directory
        output_dir = args.output or "evaluation_results"
        os.makedirs(output_dir, exist_ok=True)

        # Generate report and visualization
        report_path = os.path.join(output_dir, "evaluation_report.txt")
        csv_path = os.path.join(output_dir, "detailed_results.csv")
        evaluator.create_comparison_report(df, report_path)
        df.to_csv(csv_path, index=False)
        evaluator.create_visualization(df, output_dir)

        print(f"Results saved to {output_dir}/")
        print("\nTop 3 Models by Overall Score:")
        # Rank models, not individual images: average each model's images
        # first so one model cannot take several of the three slots.
        top_models = (
            df.groupby('model_name', as_index=False)['overall_score']
            .mean()
            .nlargest(3, 'overall_score')
        )
        print(top_models.to_string(index=False))
        return 0

    if args.command == 'web':
        interface = create_gradio_interface()
        if interface is None:
            print("Web interface not available - Gradio not installed")
            return 1
        print(f"Launching web interface on port {args.port}...")
        interface.launch(server_port=args.port, share=args.share)
        return 0

    parser.print_help()
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so shell callers can detect failures.
    sys.exit(main())