# image-evaluator / evaluator.py
# Source: HuggingFace Space by ilkerzg — commit 3f51434 ("Revert to Qwen2.5-VL-7B-Instruct")
"""
Image Evaluator Core Logic
Contains the main evaluation classes:
- ImageEvaluator: For text-to-image generation quality assessment
- EditEvaluator: For image editing quality assessment
"""
import re
import math
import time
from typing import Optional, List, Dict, Any
from dataclasses import dataclass, field
from PIL import Image
from metrics import (
parse_json_robust,
calculate_sharpness,
calculate_colorfulness,
calculate_contrast,
calculate_ssim,
calculate_psnr,
calculate_clip_score,
calculate_lpips,
score_to_grade,
geometric_mean,
)
@dataclass
class PrimitiveResult:
    """Result for a single Soft-TIFA primitive."""
    content: str  # the visual element text, e.g. "a red car"
    type: str  # category: object/attribute/count/relation/action/scene/style
    question: str  # VQA question posed to the VLM for this primitive
    answer: str  # "yes" or "no", parsed from the VLM response
    score: float  # confidence-weighted correctness in [0, 1]
    reasoning: Optional[str] = None  # brief VLM explanation, if any
@dataclass
class SoftTIFAResult:
    """Soft-TIFA evaluation result."""
    primitives_count: int  # number of primitives actually evaluated
    atom_score: float  # arithmetic mean of primitive scores, [0, 1]
    prompt_score: float  # 70/30 blend of arithmetic and geometric means
    passed: bool  # True when prompt_score >= 0.7
    primitive_results: List[PrimitiveResult]  # per-primitive details
@dataclass
class VLMAssessmentResult:
    """VLM-as-Judge assessment result."""
    technical_quality: float  # 1-10 judge rating
    aesthetic_appeal: float  # 1-10 judge rating
    realism: float  # 1-10 judge rating
    semantic_accuracy: Optional[float]  # 1-10; None when no prompt was given
    artifacts_detected: List[str]  # human-readable artifact descriptions
    artifacts_severity: str  # e.g. "none"/"minor"/"moderate"/"major"/"unknown"
    overall: float  # 1-10 holistic rating
    reasoning: Optional[str] = None  # judge's explanation for the overall score
@dataclass
class TechnicalMetricsResult:
    """Technical metrics result; each field is None when its metric failed."""
    clip_score: Optional[float] = None  # CLIP text-image alignment (needs a prompt)
    sharpness: Optional[float] = None
    colorfulness: Optional[float] = None
    contrast: Optional[float] = None
@dataclass
class ScoreBreakdown:
    """Detailed score breakdown by category; values in [0, 1], None when unavailable."""
    prompt_alignment: Optional[float] = None
    technical_quality: Optional[float] = None
    aesthetic_appeal: Optional[float] = None
    realism: Optional[float] = None
    artifacts: Optional[float] = None
@dataclass
class AggregatedScore:
    """Comprehensive aggregated scoring."""
    overall: float  # weighted average over available categories, [0, 1]
    grade: str  # letter grade derived from overall (metrics.score_to_grade)
    passed: bool  # True when overall >= 0.7
    confidence: float  # fraction of scoring categories that were measurable
    breakdown: ScoreBreakdown
    weights_used: Dict[str, float]  # renormalized weights of contributing categories
    recommendation: str  # human-readable improvement advice
@dataclass
class ImageEvalResult:
    """Complete image evaluation result."""
    score: AggregatedScore  # final aggregated verdict
    soft_tifa: Optional[SoftTIFAResult] = None  # None when skipped or no prompt
    vlm_assessment: Optional[VLMAssessmentResult] = None  # None when skipped
    technical_metrics: Optional[TechnicalMetricsResult] = None  # None when skipped
    evaluation_time: float = 0.0  # wall-clock seconds
@dataclass
class InstructionFollowingResult:
    """Instruction following evaluation result."""
    edit_primitives: List[Dict]  # atomic edits decomposed from the instruction
    primitive_scores: List[Dict]  # per-edit {"edit", "score", ...} entries
    overall_score: float  # mean per-edit score normalized to [0, 1]
    reasoning: Optional[str] = None  # populated only by the holistic fallback path
@dataclass
class PreservationResult:
    """Preservation evaluation result; None fields mean the metric failed."""
    lpips_score: Optional[float] = None  # raw LPIPS distance (lower = more similar)
    ssim_score: Optional[float] = None
    psnr_score: Optional[float] = None
    overall_score: float = 0.0  # combined similarity in [0, 1]
@dataclass
class EditQualityResult:
    """Edit quality assessment result."""
    technical_score: float  # 1-10 judge rating
    aesthetic_score: float  # 1-10 judge rating
    coherence_score: float  # 1-10 judge rating
    artifacts: List[str]  # detected artifact descriptions
    artifact_severity: str  # e.g. "none"/"minor"/"moderate"/"major"/"unknown"
    overall_score: float  # normalized + severity-penalized score in [0, 1]
    reasoning: Optional[str] = None  # optional judge explanation
@dataclass
class EditScoreBreakdown:
    """Detailed score breakdown for editing evaluation; values in [0, 1]."""
    instruction_following: Optional[float] = None
    preservation: Optional[float] = None
    edit_quality: Optional[float] = None
    artifacts: Optional[float] = None
@dataclass
class EditAggregatedScore:
    """Comprehensive aggregated scoring for editing."""
    overall: float  # fixed-weight blend of the four edit categories, [0, 1]
    grade: str  # letter grade derived from overall
    passed: bool  # True when overall >= 0.7
    confidence: float  # grows with the number of verified edit primitives
    breakdown: EditScoreBreakdown
    weights_used: Dict[str, float]  # the fixed category weights applied
    recommendation: str  # human-readable improvement advice
@dataclass
class EditEvalResult:
    """Complete edit evaluation result."""
    score: EditAggregatedScore  # final aggregated verdict
    instruction_following: Optional[InstructionFollowingResult] = None
    preservation: Optional[PreservationResult] = None
    edit_quality: Optional[EditQualityResult] = None
    evaluation_time: float = 0.0  # wall-clock seconds
@dataclass
class ComparisonRanking:
    """Ranking result for a single criterion."""
    criterion: str  # e.g. "prompt_alignment"
    ranking: List[int]  # 1-based image numbers, best to worst
    scores: List[float]  # normalized [0, 1] scores, one per image in input order
    reasoning: Optional[str] = None  # VLM justification when available
@dataclass
class ComparisonResult:
    """Complete comparison result for multiple images."""
    num_images: int  # 2-4
    prompt: str  # the prompt the images were judged against
    overall_ranking: List[int]  # 1-based image numbers, best to worst
    overall_scores: List[float]  # normalized [0, 1] scores per image
    rankings_by_criterion: Dict[str, ComparisonRanking]
    winner_index: int  # 0-based index into the input image list
    winner_reasoning: str
    individual_scores: List[AggregatedScore]  # per-image aggregated scores
    individual_results: List["ImageEvalResult"] = field(default_factory=list)  # full per-image evaluations
    evaluation_time: float = 0.0  # wall-clock seconds
class ImageEvaluator:
"""
AI-Generated Image Quality Evaluator
Evaluates AI-generated images using:
- Soft-TIFA: Atomic prompt decomposition for precise alignment scoring
- VLM-as-Judge: Human-like holistic assessment with reasoning
- Technical Metrics: Sharpness, colorfulness, contrast, CLIP score
"""
    def __init__(self, device: str = "cuda"):
        """Initialize evaluator with models.

        Loads Qwen2.5-VL-7B-Instruct as the VLM judge and OpenAI CLIP
        ViT-B-32 for text-image alignment.

        Args:
            device: Preferred device ("cuda" by default); silently falls
                back to "cpu" when CUDA is unavailable.
        """
        import torch
        from transformers import AutoModelForImageTextToText, AutoProcessor
        self.device = device if torch.cuda.is_available() else "cpu"
        # Load Qwen2.5-VL-7B-Instruct
        model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
        # device_map="auto" lets accelerate place/shard the VLM independently
        # of self.device (which only governs the CLIP tower below).
        self.vlm_model = AutoModelForImageTextToText.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )
        self.vlm_processor = AutoProcessor.from_pretrained(
            model_name,
            trust_remote_code=True,
        )
        # Load CLIP for text-image alignment
        import open_clip
        self.clip_model, _, self.clip_preprocess = open_clip.create_model_and_transforms(
            'ViT-B-32', pretrained='openai'
        )
        self.clip_model = self.clip_model.to(self.device).eval()
        self.clip_tokenizer = open_clip.get_tokenizer('ViT-B-32')
    def _vlm_generate(self, image: Image.Image, prompt: str) -> str:
        """Generate a deterministic VLM response for one image + text prompt.

        Args:
            image: PIL image passed to the vision tower.
            prompt: Text instruction shown alongside the image.

        Returns:
            Decoded response text (special tokens stripped).
        """
        import torch
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        # Render the chat template to a string; tokenization happens below.
        text = self.vlm_processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.vlm_processor(
            text=[text],
            images=[image],
            return_tensors="pt",
        ).to(self.vlm_model.device)
        with torch.no_grad():
            outputs = self.vlm_model.generate(
                **inputs,
                max_new_tokens=1024,
                do_sample=False,  # greedy decoding for reproducible judgments
            )
        # Slice off the prompt tokens so only newly generated text is decoded.
        generated = outputs[0][inputs.input_ids.shape[1]:]
        return self.vlm_processor.decode(generated, skip_special_tokens=True)
def _vlm_text_generate(self, prompt: str) -> str:
"""Generate response from VLM (text only)."""
import torch
messages = [{"role": "user", "content": prompt}]
text = self.vlm_processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
inputs = self.vlm_processor(
text=[text],
return_tensors="pt",
).to(self.vlm_model.device)
with torch.no_grad():
outputs = self.vlm_model.generate(
**inputs,
max_new_tokens=1024,
do_sample=False,
)
generated = outputs[0][inputs.input_ids.shape[1]:]
return self.vlm_processor.decode(generated, skip_special_tokens=True)
    def evaluate_soft_tifa(self, image: Image.Image, prompt: str) -> SoftTIFAResult:
        """Run Soft-TIFA evaluation with atomic prompt decomposition.

        Decomposes the prompt into visual primitives via a text-only VLM
        call, asks one yes/no VQA question per primitive, and aggregates the
        confidence-weighted answers into an alignment score in [0, 1].
        """
        # Step 1: Decompose prompt into primitives
        decomposition_prompt = f'''Decompose this text-to-image prompt into atomic visual primitives.
Prompt: "{prompt}"
For each primitive, identify:
- content: The specific visual element (e.g., "a red car", "sunset sky")
- type: One of [object, attribute, count, relation, action, scene, style]
- importance: How critical (0.5-1.0)
Example for "A cat sitting on a red chair":
[
{{"content": "cat", "type": "object", "importance": 1.0}},
{{"content": "chair", "type": "object", "importance": 0.9}},
{{"content": "red chair", "type": "attribute", "importance": 0.8}},
{{"content": "cat sitting on chair", "type": "relation", "importance": 0.9}}
]
Return ONLY valid JSON array for the given prompt:'''
        decomp_response = self._vlm_text_generate(decomposition_prompt)
        primitives = parse_json_robust(decomp_response, fallback=[])
        if not primitives or not isinstance(primitives, list):
            # Decomposition failed: return an empty failing result, don't raise.
            return SoftTIFAResult(
                primitives_count=0,
                atom_score=0.0,
                prompt_score=0.0,
                passed=False,
                primitive_results=[],
            )
        # Step 2: Evaluate each primitive via VQA
        primitive_results = []
        # One question template per primitive type; unknown types fall back to "object".
        vqa_templates = {
            "object": "Is there a {content} in this image?",
            "attribute": "Does the image show {content}?",
            "count": "Are there {content}?",
            "relation": "Is it true that {content}?",
            "action": "Is {content} happening in this image?",
            "scene": "Does this image depict {content}?",
            "style": "Is this image in {content} style?",
        }
        for prim in primitives[:20]:  # Limit to 20 primitives to bound VLM calls
            content = prim.get("content", "")
            ptype = prim.get("type", "object")
            # NOTE(review): the "importance" weight requested from decomposition
            # is currently ignored; all primitives contribute equally below.
            template = vqa_templates.get(ptype, vqa_templates["object"])
            question = template.format(content=content)
            vqa_prompt = f"""{question}
Answer Yes or No with confidence (0-100%).
Format: [Yes/No] (confidence: X%) - brief reasoning"""
            response = self._vlm_generate(image, vqa_prompt)
            # Parse response
            answer = "no"
            confidence = 0.5  # neutral default when no confidence figure is found
            reasoning = None
            response_lower = response.lower().strip()
            if response_lower.startswith("yes") or "[yes]" in response_lower:
                answer = "yes"
            conf_match = re.search(r'confidence[:\s]*(\d+)%?', response_lower)
            if conf_match:
                confidence = float(conf_match.group(1)) / 100.0
            if "-" in response:
                # Text after the first dash is treated as free-form reasoning.
                parts = response.split("-", 1)
                if len(parts) > 1:
                    reasoning = parts[1].strip()[:200]
            # Calculate score: a confident "yes" scores high, a confident "no" low.
            score = confidence if answer == "yes" else (1.0 - confidence)
            primitive_results.append(PrimitiveResult(
                content=content,
                type=ptype,
                question=question,
                answer=answer,
                score=score,
                reasoning=reasoning,
            ))
        # Aggregate scores: the arithmetic mean rewards partial matches, the
        # geometric mean punishes any single badly-failed primitive; blend 70/30.
        if primitive_results:
            atom_score = sum(r.score for r in primitive_results) / len(primitive_results)
            geo_mean = geometric_mean([r.score for r in primitive_results])
            prompt_score = 0.7 * atom_score + 0.3 * geo_mean
        else:
            atom_score = 0.0
            prompt_score = 0.0
        return SoftTIFAResult(
            primitives_count=len(primitive_results),
            atom_score=atom_score,
            prompt_score=prompt_score,
            passed=prompt_score >= 0.7,  # pass threshold
            primitive_results=primitive_results,
        )
    def evaluate_vlm_judge(self, image: Image.Image, prompt: Optional[str]) -> VLMAssessmentResult:
        """Run VLM-as-Judge holistic assessment.

        Asks the VLM for 1-10 ratings across quality dimensions as JSON.
        Returns neutral 5.0 scores when the reply cannot be parsed.
        """
        # Prompt-dependent pieces are injected only when a prompt exists.
        prompt_context = f'Original prompt: "{prompt}"' if prompt else ""
        semantic_field = '"semantic_accuracy": {"score": 8, "reasoning": "matches prompt well"},' if prompt else ""
        eval_prompt = f"""Evaluate this AI-generated image on multiple dimensions.
{prompt_context}
Rate each dimension from 1-10:
- **Technical Quality**: Sharpness, noise level, color accuracy, resolution
- **Aesthetic Appeal**: Composition, color harmony, visual balance, style
- **Realism**: Physical plausibility, lighting consistency, proportions
{('- **Semantic Accuracy**: How well it matches the prompt' if prompt else '')}
- **AI Artifacts**: Detect issues like distorted faces/hands, extra limbs, text errors
Example output:
{{
"technical_quality": {{"score": 8, "reasoning": "sharp with good colors"}},
"aesthetic_appeal": {{"score": 7, "reasoning": "balanced composition"}},
"realism": {{"score": 6, "reasoning": "slightly off proportions"}},
{semantic_field}
"artifacts": {{"detected": ["slightly distorted fingers"], "severity": "minor"}},
"overall": {{"score": 7, "reasoning": "good quality with minor issues"}}
}}
Now evaluate this image and return ONLY valid JSON:"""
        response = self._vlm_generate(image, eval_prompt)
        data = parse_json_robust(response, fallback=None)
        if data and isinstance(data, dict):
            try:
                def get_score(key: str, default: float = 5.0) -> float:
                    # Accept both {"score": X, ...} objects and bare numbers.
                    val = data.get(key, {})
                    if isinstance(val, dict):
                        return float(val.get("score", default))
                    return float(val) if val else default
                artifacts = data.get("artifacts", {})
                if isinstance(artifacts, dict):
                    detected = artifacts.get("detected", [])
                    severity = artifacts.get("severity", "unknown")
                else:
                    detected = []
                    severity = "unknown"
                return VLMAssessmentResult(
                    technical_quality=get_score("technical_quality"),
                    aesthetic_appeal=get_score("aesthetic_appeal"),
                    realism=get_score("realism"),
                    semantic_accuracy=get_score("semantic_accuracy") if prompt else None,
                    artifacts_detected=detected if isinstance(detected, list) else [],
                    artifacts_severity=severity if isinstance(severity, str) else "unknown",
                    overall=get_score("overall"),
                    reasoning=data.get("overall", {}).get("reasoning") if isinstance(data.get("overall"), dict) else None,
                )
            except (KeyError, TypeError, ValueError):
                # Malformed values fall through to the neutral fallback below.
                pass
        # Fallback: neutral midpoint scores when the judge reply was unusable.
        return VLMAssessmentResult(
            technical_quality=5.0,
            aesthetic_appeal=5.0,
            realism=5.0,
            semantic_accuracy=5.0 if prompt else None,
            artifacts_detected=[],
            artifacts_severity="unknown",
            overall=5.0,
        )
def evaluate_technical_metrics(self, image: Image.Image, prompt: Optional[str]) -> TechnicalMetricsResult:
"""Calculate technical quality metrics."""
sharpness = None
colorfulness_score = None
contrast_score = None
clip_score = None
try:
sharpness = calculate_sharpness(image)
except Exception:
pass
try:
colorfulness_score = calculate_colorfulness(image)
except Exception:
pass
try:
contrast_score = calculate_contrast(image)
except Exception:
pass
if prompt:
clip_score = calculate_clip_score(
image, prompt,
self.clip_model, self.clip_preprocess, self.clip_tokenizer,
self.device
)
return TechnicalMetricsResult(
clip_score=clip_score,
sharpness=sharpness,
colorfulness=colorfulness_score,
contrast=contrast_score,
)
    def _calculate_aggregated_score(
        self,
        soft_tifa: Optional[SoftTIFAResult],
        vlm: Optional[VLMAssessmentResult],
        technical: Optional[TechnicalMetricsResult],
        has_prompt: bool,
    ) -> AggregatedScore:
        """Calculate comprehensive aggregated score.

        Averages the available signals within each category, then combines
        categories with fixed weights renormalized over whichever categories
        actually produced a value.
        """
        # Prompt alignment: mean of Soft-TIFA, VLM semantic rating, and CLIP.
        prompt_alignment_scores = []
        if soft_tifa:
            prompt_alignment_scores.append(soft_tifa.prompt_score)
        if vlm and vlm.semantic_accuracy is not None:
            prompt_alignment_scores.append(vlm.semantic_accuracy / 10.0)  # 1-10 -> [0, 1]
        if technical and technical.clip_score is not None:
            prompt_alignment_scores.append(technical.clip_score)
        prompt_alignment = sum(prompt_alignment_scores) / len(prompt_alignment_scores) if prompt_alignment_scores else None
        # Technical quality: sharpness/contrast metrics plus the VLM rating.
        # NOTE(review): assumes calculate_sharpness/contrast return values
        # normalized to [0, 1] — verify against metrics.py.
        tech_scores = []
        if technical:
            if technical.sharpness is not None:
                tech_scores.append(technical.sharpness)
            if technical.contrast is not None:
                tech_scores.append(technical.contrast)
        if vlm:
            tech_scores.append(vlm.technical_quality / 10.0)
        technical_quality = sum(tech_scores) / len(tech_scores) if tech_scores else None
        # Aesthetic appeal: colorfulness metric plus the VLM rating.
        aesthetic_scores = []
        if technical and technical.colorfulness is not None:
            aesthetic_scores.append(technical.colorfulness)
        if vlm:
            aesthetic_scores.append(vlm.aesthetic_appeal / 10.0)
        aesthetic_appeal = sum(aesthetic_scores) / len(aesthetic_scores) if aesthetic_scores else None
        # Realism comes from the VLM alone.
        realism = vlm.realism / 10.0 if vlm else None
        # Artifacts: map the judge's severity label onto a [0, 1] score.
        artifacts_score = None
        if vlm:
            severity_map = {"none": 1.0, "minor": 0.85, "moderate": 0.6, "major": 0.3, "unknown": 0.7}
            artifacts_score = severity_map.get(vlm.artifacts_severity, 0.7)
        # Calculate weighted overall over the categories that produced a value.
        score_map = {
            "prompt_alignment": prompt_alignment,
            "technical_quality": technical_quality,
            "aesthetic_appeal": aesthetic_appeal,
            "realism": realism,
            "artifacts": artifacts_score,
        }
        category_weights = {
            "prompt_alignment": 0.45 if has_prompt else 0.0,  # Highest priority
            "technical_quality": 0.20,
            "aesthetic_appeal": 0.15,
            "realism": 0.10,
            "artifacts": 0.10,
        }
        weighted_sum = 0.0
        total_weight = 0.0
        for key, score in score_map.items():
            if score is not None:
                weight = category_weights[key]
                weighted_sum += score * weight
                total_weight += weight
        overall = weighted_sum / total_weight if total_weight > 0 else 0.0
        # Confidence: fraction of possible categories that were measurable.
        max_metrics = 5 if has_prompt else 4
        available_metrics = sum(1 for s in score_map.values() if s is not None)
        confidence = available_metrics / max_metrics
        # Recommendation
        recommendation = self._generate_recommendation(score_map, overall)
        # Report the effective (renormalized) weight of each contributing category.
        normalized_weights = {k: v / total_weight for k, v in category_weights.items() if score_map.get(k) is not None}
        return AggregatedScore(
            overall=round(overall, 3),
            grade=score_to_grade(overall),
            passed=overall >= 0.7,
            confidence=round(confidence, 2),
            breakdown=ScoreBreakdown(
                prompt_alignment=round(prompt_alignment, 3) if prompt_alignment is not None else None,
                technical_quality=round(technical_quality, 3) if technical_quality is not None else None,
                aesthetic_appeal=round(aesthetic_appeal, 3) if aesthetic_appeal is not None else None,
                realism=round(realism, 3) if realism is not None else None,
                artifacts=round(artifacts_score, 3) if artifacts_score is not None else None,
            ),
            weights_used=normalized_weights,
            recommendation=recommendation,
        )
def _generate_recommendation(self, scores: Dict, overall: float) -> str:
"""Generate recommendation based on scores."""
weakest = None
weakest_score = 1.0
for key, score in scores.items():
if score is not None and score < weakest_score:
weakest_score = score
weakest = key
if overall >= 0.85:
return "Excellent quality image. Ready for production use."
elif overall >= 0.70:
if weakest and weakest_score < 0.7:
suggestions = {
"prompt_alignment": "Consider regenerating with clearer prompt.",
"technical_quality": "Image has quality issues. Try higher resolution.",
"aesthetic_appeal": "Composition could be improved.",
"realism": "Physical inconsistencies detected.",
"artifacts": "AI artifacts present. Consider regeneration.",
}
return f"Good overall. Improvement: {suggestions.get(weakest, weakest)}"
return "Good quality image. Minor improvements possible."
elif overall >= 0.50:
return f"Moderate quality. Main issue: {weakest.replace('_', ' ') if weakest else 'overall'}."
else:
return "Low quality. Regeneration strongly recommended."
    def evaluate(
        self,
        image: Image.Image,
        prompt: Optional[str] = None,
        include_soft_tifa: bool = True,
        include_vlm: bool = True,
        include_technical: bool = True,
    ) -> ImageEvalResult:
        """
        Evaluate an AI-generated image.
        Args:
            image: PIL Image to evaluate
            prompt: Optional text prompt used to generate the image
            include_soft_tifa: Run Soft-TIFA evaluation (requires prompt)
            include_vlm: Run VLM-as-Judge assessment
            include_technical: Calculate technical metrics
        Returns:
            ImageEvalResult with all evaluation components
        """
        start_time = time.time()
        soft_tifa_result = None
        vlm_result = None
        technical_result = None
        # Soft-TIFA needs a prompt to decompose; silently skipped without one.
        if include_soft_tifa and prompt:
            soft_tifa_result = self.evaluate_soft_tifa(image, prompt)
        if include_vlm:
            vlm_result = self.evaluate_vlm_judge(image, prompt)
        if include_technical:
            technical_result = self.evaluate_technical_metrics(image, prompt)
        # Blend whichever components ran into one weighted score.
        aggregated = self._calculate_aggregated_score(
            soft_tifa=soft_tifa_result,
            vlm=vlm_result,
            technical=technical_result,
            has_prompt=prompt is not None,
        )
        return ImageEvalResult(
            score=aggregated,
            soft_tifa=soft_tifa_result,
            vlm_assessment=vlm_result,
            technical_metrics=technical_result,
            evaluation_time=time.time() - start_time,
        )
def _vlm_generate_multi_image(self, images: List[Image.Image], prompt: str) -> str:
"""Generate response from VLM with multiple images."""
import torch
content = []
for i, img in enumerate(images):
content.append({"type": "image", "image": img})
content.append({"type": "text", "text": prompt})
messages = [{"role": "user", "content": content}]
text = self.vlm_processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
inputs = self.vlm_processor(
text=[text],
images=images,
return_tensors="pt",
).to(self.vlm_model.device)
with torch.no_grad():
outputs = self.vlm_model.generate(
**inputs,
max_new_tokens=2048,
do_sample=False,
)
generated = outputs[0][inputs.input_ids.shape[1]:]
return self.vlm_processor.decode(generated, skip_special_tokens=True)
    def compare_images(
        self,
        images: List[Image.Image],
        prompt: str,
    ) -> ComparisonResult:
        """
        Compare multiple images (2-4) against a prompt.

        First scores each image individually, then asks the VLM for a direct
        multi-image comparison; VLM rankings override the individual-score
        rankings when the reply parses cleanly.

        Args:
            images: List of 2-4 PIL Images to compare
            prompt: The text prompt to evaluate against
        Returns:
            ComparisonResult with rankings and scores
        Raises:
            ValueError: If fewer than 2 or more than 4 images are given.
        """
        start_time = time.time()
        num_images = len(images)
        if num_images < 2 or num_images > 4:
            raise ValueError("Must provide 2-4 images for comparison")
        # Step 1: Get individual scores for each image
        individual_results = []
        for img in images:
            result = self.evaluate(img, prompt, include_soft_tifa=True, include_vlm=True, include_technical=True)
            individual_results.append(result)
        individual_scores = [r.score for r in individual_results]
        # Step 2: Direct multi-image comparison via VLM
        image_labels = ", ".join([f"Image {i+1}" for i in range(num_images)])
        comparison_prompt = f'''You are comparing {num_images} AI-generated images for the prompt: "{prompt}"
The images are labeled {image_labels} (in order from left to right).
Evaluate and rank ALL images for each criterion. Return a JSON object with rankings (1=best, {num_images}=worst) and scores (0-10).
Criteria:
1. prompt_alignment: How well does each image match the prompt?
2. technical_quality: Sharpness, clarity, no artifacts or distortions
3. aesthetic_appeal: Composition, color harmony, visual appeal
4. realism: Physical plausibility, lighting, proportions
Example output format:
{{
"prompt_alignment": {{"ranking": [2, 1, 3], "scores": [9, 8, 6], "reasoning": "Image 2 captures all elements..."}},
"technical_quality": {{"ranking": [1, 2, 3], "scores": [8, 7, 5], "reasoning": "Image 1 is sharpest..."}},
"aesthetic_appeal": {{"ranking": [2, 1, 3], "scores": [9, 8, 6], "reasoning": "Image 2 has best composition..."}},
"realism": {{"ranking": [1, 2, 3], "scores": [8, 7, 5], "reasoning": "Image 1 looks most natural..."}},
"overall": {{"ranking": [2, 1, 3], "scores": [8.5, 7.5, 5.5], "winner": 2, "reasoning": "Image 2 best balances all criteria..."}}
}}
Return ONLY valid JSON:'''
        response = self._vlm_generate_multi_image(images, comparison_prompt)
        data = parse_json_robust(response, fallback=None)
        # Parse comparison results
        rankings_by_criterion = {}
        overall_scores = [r.score.overall for r in individual_results]
        # Default: determine winner from individual scores (used when the VLM
        # comparison reply is missing or malformed).
        winner_index = max(range(num_images), key=lambda i: overall_scores[i])
        # Create ranking from individual scores (sorted indices, highest first)
        sorted_indices = sorted(range(num_images), key=lambda i: overall_scores[i], reverse=True)
        overall_ranking = [i + 1 for i in sorted_indices]  # Convert to 1-indexed
        winner_reasoning = f"Image {winner_index + 1} achieved the highest overall score ({overall_scores[winner_index]:.3f})."
        # Build rankings_by_criterion from individual score breakdowns
        criteria_map = {
            "prompt_alignment": lambda s: s.breakdown.prompt_alignment,
            "technical_quality": lambda s: s.breakdown.technical_quality,
            "aesthetic_appeal": lambda s: s.breakdown.aesthetic_appeal,
            "realism": lambda s: s.breakdown.realism,
        }
        for criterion, getter in criteria_map.items():
            crit_scores = []
            for score in individual_scores:
                val = getter(score)
                crit_scores.append(val if val is not None else 0.5)  # neutral when missing
            # Create ranking for this criterion
            sorted_idx = sorted(range(num_images), key=lambda i: crit_scores[i], reverse=True)
            ranking = [i + 1 for i in sorted_idx]
            rankings_by_criterion[criterion] = ComparisonRanking(
                criterion=criterion,
                ranking=ranking,
                scores=crit_scores,
                reasoning=None,
            )
        # Try to use VLM comparison if available (overrides individual-based rankings)
        if data and isinstance(data, dict):
            criteria = ["prompt_alignment", "technical_quality", "aesthetic_appeal", "realism"]
            for criterion in criteria:
                crit_data = data.get(criterion, {})
                if isinstance(crit_data, dict):
                    ranking = crit_data.get("ranking", [])
                    scores = crit_data.get("scores", [])
                    reasoning = crit_data.get("reasoning", "")
                    # Accept only a complete ranking + score list covering all images.
                    if ranking and scores and len(ranking) == num_images and len(scores) == num_images:
                        # Normalize scores to 0-1
                        norm_scores = [s / 10.0 for s in scores]
                        rankings_by_criterion[criterion] = ComparisonRanking(
                            criterion=criterion,
                            ranking=ranking,
                            scores=norm_scores,
                            reasoning=reasoning,
                        )
            # Overall from VLM
            overall_data = data.get("overall", {})
            if isinstance(overall_data, dict):
                vlm_ranking = overall_data.get("ranking", [])
                raw_scores = overall_data.get("scores", [])
                vlm_winner = overall_data.get("winner")
                vlm_reasoning = overall_data.get("reasoning", "")
                if vlm_ranking and len(vlm_ranking) == num_images:
                    overall_ranking = vlm_ranking
                if raw_scores and len(raw_scores) == num_images:
                    # Values > 1 are assumed to be on the 0-10 scale and rescaled.
                    overall_scores = [s / 10.0 if s > 1 else s for s in raw_scores]
                if vlm_winner and 1 <= vlm_winner <= num_images:
                    winner_index = vlm_winner - 1  # VLM reports a 1-based label
                if vlm_reasoning:
                    winner_reasoning = vlm_reasoning
        return ComparisonResult(
            num_images=num_images,
            prompt=prompt,
            overall_ranking=overall_ranking,
            overall_scores=overall_scores,
            rankings_by_criterion=rankings_by_criterion,
            winner_index=max(0, min(winner_index, num_images - 1)),  # defensive clamp
            winner_reasoning=winner_reasoning,
            individual_scores=individual_scores,
            individual_results=individual_results,
            evaluation_time=time.time() - start_time,
        )
class EditEvaluator:
"""
Image Editing Evaluator
Evaluates instruction-based image editing using:
- Instruction Following: Were the requested edits applied?
- Preservation: Were non-edited regions maintained?
- Edit Quality: Is the edit seamless and high-quality?
"""
    def __init__(self, device: str = "cuda"):
        """Initialize evaluator with models.

        Loads the Qwen2.5-VL judge plus an AlexNet-backed LPIPS network for
        perceptual preservation scoring.

        Args:
            device: Preferred device ("cuda" by default); silently falls
                back to "cpu" when CUDA is unavailable.
        """
        import torch
        from transformers import AutoModelForImageTextToText, AutoProcessor
        import lpips
        self.device = device if torch.cuda.is_available() else "cpu"
        # Load Qwen2.5-VL-7B-Instruct
        model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
        self.vlm_model = AutoModelForImageTextToText.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )
        self.vlm_processor = AutoProcessor.from_pretrained(
            model_name,
            trust_remote_code=True,
        )
        # Load LPIPS (AlexNet backbone)
        self.lpips_model = lpips.LPIPS(net='alex').to(self.device)
    def _vlm_generate(self, image: Image.Image, prompt: str) -> str:
        """Generate a deterministic VLM response for one image + text prompt.

        NOTE: duplicated from ImageEvaluator._vlm_generate; keep in sync.

        Args:
            image: PIL image passed to the vision tower.
            prompt: Text instruction shown alongside the image.

        Returns:
            Decoded response text (special tokens stripped).
        """
        import torch
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        text = self.vlm_processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.vlm_processor(
            text=[text],
            images=[image],
            return_tensors="pt",
        ).to(self.vlm_model.device)
        with torch.no_grad():
            outputs = self.vlm_model.generate(
                **inputs,
                max_new_tokens=1024,
                do_sample=False,  # greedy decoding for reproducible judgments
            )
        # Decode only the newly generated tokens.
        generated = outputs[0][inputs.input_ids.shape[1]:]
        return self.vlm_processor.decode(generated, skip_special_tokens=True)
    def _vlm_text_generate(self, prompt: str) -> str:
        """Generate a deterministic VLM response from text only.

        NOTE: duplicated from ImageEvaluator._vlm_text_generate; keep in sync.

        Args:
            prompt: The user message to send.

        Returns:
            Decoded response text (special tokens stripped).
        """
        import torch
        messages = [{"role": "user", "content": prompt}]
        text = self.vlm_processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.vlm_processor(
            text=[text],
            return_tensors="pt",
        ).to(self.vlm_model.device)
        with torch.no_grad():
            outputs = self.vlm_model.generate(
                **inputs,
                max_new_tokens=1024,
                do_sample=False,  # greedy decoding for reproducible judgments
            )
        # Decode only the newly generated tokens.
        generated = outputs[0][inputs.input_ids.shape[1]:]
        return self.vlm_processor.decode(generated, skip_special_tokens=True)
    def evaluate_instruction_following(self, edited_image: Image.Image, instruction: str) -> InstructionFollowingResult:
        """Evaluate if editing instruction was followed.

        Decomposes the instruction into atomic edits via a text-only VLM
        call, then asks the VLM to verify each edit on the edited image
        (0-10). Falls back to a single holistic 0-10 judgment when
        decomposition fails.
        """
        decomp_prompt = f'''Analyze this image editing instruction and decompose into atomic edits.
Instruction: "{instruction}"
Example for "Change the sky to sunset and add a bird":
{{
"edits": [
{{"content": "change sky color to sunset", "type": "modify", "target": "sky", "expected_result": "orange/purple sunset sky"}},
{{"content": "add a bird", "type": "add", "target": "sky area", "expected_result": "visible bird in the scene"}}
]
}}
Return ONLY valid JSON for the given instruction:'''
        decomp_response = self._vlm_text_generate(decomp_prompt)
        data = parse_json_robust(decomp_response, fallback={})
        edits = data.get("edits", []) if isinstance(data, dict) else []
        if not edits or not isinstance(edits, list):
            # Fallback: evaluate holistically with a single VLM judgment.
            verify_prompt = f'''Evaluate if this image correctly shows the result of the edit:
Edit instruction: "{instruction}"
Rate success from 0-10.
Format: Score: X/10 - Reasoning'''
            response = self._vlm_generate(edited_image, verify_prompt)
            score_match = re.search(r'[Ss]core[:\s]*(\d+(?:\.\d+)?)\s*/\s*10', response)
            score = float(score_match.group(1)) if score_match else 5.0  # neutral default
            return InstructionFollowingResult(
                edit_primitives=[{"content": instruction, "type": "unknown"}],
                primitive_scores=[{"edit": instruction, "score": score}],
                overall_score=score / 10.0,
                reasoning=response[:200] if response else None,
            )
        # Evaluate each edit individually (bounded to 10 VLM calls).
        primitive_scores = []
        for edit in edits[:10]:
            content = edit.get("content", "")
            target = edit.get("target", "the image")
            expected = edit.get("expected_result", content)
            verify_prompt = f'''Verify if this edit was applied:
Edit: {content}
Target: {target}
Expected: {expected}
Rate from 0-10.
Format: Score: X/10 - Reasoning'''
            response = self._vlm_generate(edited_image, verify_prompt)
            score_match = re.search(r'[Ss]core[:\s]*(\d+(?:\.\d+)?)\s*/\s*10', response)
            score = float(score_match.group(1)) if score_match else 5.0
            primitive_scores.append({
                "edit": content,
                "score": score,
                "reasoning": response[:100] if response else None,
            })
        # Mean of per-edit 0-10 scores, normalized to [0, 1] on return.
        overall = sum(p["score"] for p in primitive_scores) / len(primitive_scores) if primitive_scores else 0
        # NOTE(review): unlike the fallback path, no top-level `reasoning` is
        # attached here; per-edit reasoning lives inside primitive_scores.
        return InstructionFollowingResult(
            edit_primitives=edits[:10],
            primitive_scores=primitive_scores,
            overall_score=overall / 10.0,
        )
def evaluate_preservation(self, source_image: Image.Image, edited_image: Image.Image) -> PreservationResult:
"""Evaluate if non-edited regions were preserved."""
scores = []
# LPIPS
lpips_score = calculate_lpips(source_image, edited_image, self.lpips_model, self.device)
lpips_similarity = max(0, 1 - lpips_score) if lpips_score is not None else None
if lpips_similarity is not None:
scores.append(lpips_similarity)
# SSIM
ssim_score = None
try:
ssim_score = calculate_ssim(source_image, edited_image)
scores.append(ssim_score)
except Exception:
pass
# PSNR
psnr_score = None
try:
psnr_score = calculate_psnr(source_image, edited_image)
scores.append(psnr_score)
except Exception:
pass
# Combined score
if scores:
if lpips_similarity is not None and len(scores) > 1:
preservation_score = lpips_similarity * 0.5 + sum(s for s in scores if s != lpips_similarity) / (len(scores) - 1) * 0.5
else:
preservation_score = sum(scores) / len(scores)
else:
preservation_score = 0.5
return PreservationResult(
lpips_score=lpips_score,
ssim_score=ssim_score,
psnr_score=psnr_score,
overall_score=preservation_score,
)
    def evaluate_edit_quality(self, edited_image: Image.Image, instruction: str) -> EditQualityResult:
        """Evaluate the quality of the edit.

        Asks the VLM for 1-10 technical/aesthetic/coherence ratings plus an
        artifact list, and combines them (with a severity penalty) into an
        overall [0, 1] score. Falls back to neutral scores on parse failure.
        """
        eval_prompt = f'''Evaluate the quality of this edited image.
Edit instruction: "{instruction}"
Rate each dimension 1-10:
- **Technical**: Seamless blending? Resolution consistent? No visible edit boundaries?
- **Aesthetic**: Natural looking? Color harmony maintained? Visually pleasing?
- **Coherence**: Physically plausible? Lighting/shadows consistent? Proper perspective?
- **Artifacts**: List any issues (blur, color bleeding, unnatural edges, etc.)
Example output:
{{
"technical": {{"score": 8}},
"aesthetic": {{"score": 7}},
"coherence": {{"score": 8}},
"artifacts": {{"detected": ["slight blur at edge"], "severity": "minor"}}
}}
Return ONLY valid JSON:'''
        response = self._vlm_generate(edited_image, eval_prompt)
        data = parse_json_robust(response, fallback=None)
        if data and isinstance(data, dict):
            try:
                def get_score(key: str, default: float = 5.0) -> float:
                    # Accept both {"score": X} objects and bare numbers.
                    val = data.get(key, {})
                    if isinstance(val, dict):
                        return float(val.get("score", default))
                    return float(val) if val else default
                technical = get_score("technical")
                aesthetic = get_score("aesthetic")
                coherence = get_score("coherence")
                artifacts_data = data.get("artifacts", {})
                if isinstance(artifacts_data, dict):
                    artifacts = artifacts_data.get("detected", [])
                    severity = artifacts_data.get("severity", "unknown")
                else:
                    artifacts = []
                    severity = "unknown"
                # Mean of the three 1-10 ratings normalized to [0, 1]...
                overall = (technical + aesthetic + coherence) / 30.0
                # ...then penalized by artifact severity (unrecognized labels -> 0.9).
                severity_penalties = {"major": 0.7, "moderate": 0.85, "minor": 0.95, "none": 1.0}
                overall *= severity_penalties.get(severity, 0.9)
                return EditQualityResult(
                    technical_score=technical,
                    aesthetic_score=aesthetic,
                    coherence_score=coherence,
                    artifacts=artifacts if isinstance(artifacts, list) else [],
                    artifact_severity=severity if isinstance(severity, str) else "unknown",
                    overall_score=overall,
                )
            except (KeyError, TypeError, ValueError):
                # Malformed values fall through to the neutral fallback below.
                pass
        # Fallback: neutral midpoint result when the reply was unusable.
        return EditQualityResult(
            technical_score=5.0,
            aesthetic_score=5.0,
            coherence_score=5.0,
            artifacts=[],
            artifact_severity="unknown",
            overall_score=0.5,
        )
    def _calculate_edit_aggregated_score(
        self,
        instruction_result: InstructionFollowingResult,
        preservation_result: PreservationResult,
        quality_result: EditQualityResult,
    ) -> EditAggregatedScore:
        """Calculate comprehensive aggregated score for editing.

        Fixed-weight blend of instruction following, preservation, edit
        quality and an artifact-severity score.
        """
        weights = {
            "instruction_following": 0.35,
            "preservation": 0.25,
            "edit_quality": 0.25,
            "artifacts": 0.15,
        }
        instruction_score = instruction_result.overall_score
        preservation_score = preservation_result.overall_score
        edit_quality_score = quality_result.overall_score
        # Map the judge's severity label onto a [0, 1] artifact score.
        severity_map = {"none": 1.0, "minor": 0.85, "moderate": 0.6, "major": 0.3, "unknown": 0.7}
        artifacts_score = severity_map.get(quality_result.artifact_severity, 0.7)
        overall = (
            instruction_score * weights["instruction_following"] +
            preservation_score * weights["preservation"] +
            edit_quality_score * weights["edit_quality"] +
            artifacts_score * weights["artifacts"]
        )
        # Confidence grows with the number of verified edit primitives, capped at 1.
        num_primitives = len(instruction_result.primitive_scores)
        confidence = min(1.0, 0.5 + (num_primitives * 0.1))
        recommendation = self._generate_edit_recommendation(
            instruction_score, preservation_score, edit_quality_score, artifacts_score, overall
        )
        return EditAggregatedScore(
            overall=round(overall, 3),
            grade=score_to_grade(overall),
            passed=overall >= 0.7,
            confidence=round(confidence, 2),
            breakdown=EditScoreBreakdown(
                instruction_following=round(instruction_score, 3),
                preservation=round(preservation_score, 3),
                edit_quality=round(edit_quality_score, 3),
                artifacts=round(artifacts_score, 3),
            ),
            weights_used=weights,
            recommendation=recommendation,
        )
def _generate_edit_recommendation(
self,
instruction: float,
preservation: float,
quality: float,
artifacts: float,
overall: float,
) -> str:
"""Generate recommendation for edit quality."""
issues = []
if instruction < 0.6:
issues.append("instruction not fully followed")
if preservation < 0.6:
issues.append("too much content changed")
if quality < 0.6:
issues.append("edit quality issues")
if artifacts < 0.7:
issues.append("visible artifacts")
if overall >= 0.85:
return "Excellent edit. Ready for use."
elif overall >= 0.70:
if issues:
return f"Good edit with minor issues: {', '.join(issues[:2])}."
return "Good quality edit. Minor improvements possible."
elif overall >= 0.50:
if issues:
return f"Moderate quality. Issues: {', '.join(issues)}."
return "Moderate quality. Consider regenerating."
else:
return f"Low quality. Issues: {', '.join(issues) if issues else 'multiple problems'}."
    def evaluate(
        self,
        source_image: Image.Image,
        edited_image: Image.Image,
        instruction: str,
    ) -> EditEvalResult:
        """
        Evaluate an image editing result.
        Args:
            source_image: Original image before editing
            edited_image: Image after editing
            instruction: The editing instruction that was applied
        Returns:
            EditEvalResult with all evaluation components
        """
        start_time = time.time()
        # Three pillars: was the edit applied, was everything else preserved,
        # and is the result technically/aesthetically sound.
        instruction_result = self.evaluate_instruction_following(edited_image, instruction)
        preservation_result = self.evaluate_preservation(source_image, edited_image)
        quality_result = self.evaluate_edit_quality(edited_image, instruction)
        aggregated = self._calculate_edit_aggregated_score(
            instruction_result=instruction_result,
            preservation_result=preservation_result,
            quality_result=quality_result,
        )
        return EditEvalResult(
            score=aggregated,
            instruction_following=instruction_result,
            preservation=preservation_result,
            edit_quality=quality_result,
            evaluation_time=time.time() - start_time,
        )