Spaces:
Running on Zero
Running on Zero
Add image evaluator with Soft-TIFA and VLM-as-Judge
Browse files- README.md +52 -6
- app.py +575 -4
- evaluator.py +1049 -0
- metrics.py +285 -0
- requirements.txt +22 -0
README.md
CHANGED
|
@@ -1,12 +1,58 @@
|
|
| 1 |
---
|
| 2 |
-
title: Image Evaluator
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Fal Image Evaluator
|
| 3 |
+
emoji: "🎨"
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
suggested_hardware: a10g-small
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# Fal Image Evaluator
|
| 15 |
+
|
| 16 |
+
AI image quality assessment using Soft-TIFA, VLM-as-Judge, and technical metrics.
|
| 17 |
+
|
| 18 |
+
## Features
|
| 19 |
+
|
| 20 |
+
### Image Evaluation
|
| 21 |
+
- **Soft-TIFA**: Atomic prompt decomposition + VQA verification
|
| 22 |
+
- **VLM-as-Judge**: Holistic quality assessment (technical, aesthetic, realism)
|
| 23 |
+
- **Technical Metrics**: CLIP score, sharpness, colorfulness, contrast
|
| 24 |
+
|
| 25 |
+
### Edit Evaluation
|
| 26 |
+
- **Instruction Following**: Verify edit primitives were applied
|
| 27 |
+
- **Preservation**: LPIPS, SSIM, PSNR for non-edited region preservation
|
| 28 |
+
- **Edit Quality**: Seamlessness, coherence, artifact detection
|
| 29 |
+
|
| 30 |
+
## Local Development
|
| 31 |
+
|
| 32 |
+
```bash
|
| 33 |
+
pip install -r requirements.txt
|
| 34 |
+
python app.py
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
## File Structure
|
| 38 |
+
|
| 39 |
+
```
|
| 40 |
+
├── app.py # Gradio interface
|
| 41 |
+
├── evaluator.py # Core evaluation logic
|
| 42 |
+
├── metrics.py # Helper functions for metrics
|
| 43 |
+
└── requirements.txt # Dependencies
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
## Components
|
| 47 |
+
|
| 48 |
+
| Component | Description |
|
| 49 |
+
|-----------|-------------|
|
| 50 |
+
| Soft-TIFA | Decomposes prompts into atomic primitives, verifies via VQA |
|
| 51 |
+
| VLM-as-Judge | Qwen2.5-VL-7B for holistic image assessment |
|
| 52 |
+
| CLIP Score | Text-image alignment using OpenCLIP |
|
| 53 |
+
| LPIPS | Learned perceptual similarity |
|
| 54 |
+
| SSIM/PSNR | Structural and pixel-level similarity |
|
| 55 |
+
|
| 56 |
+
## License
|
| 57 |
+
|
| 58 |
+
MIT
|
app.py
CHANGED
|
@@ -1,7 +1,578 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
def greet(name):
|
| 4 |
-
return "Hello " + name + "!!"
|
| 5 |
|
| 6 |
-
|
| 7 |
-
demo.launch()
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Fal Image Evaluator - Gradio App for HuggingFace Spaces
|
| 3 |
+
|
| 4 |
+
AI image quality assessment using:
|
| 5 |
+
- Soft-TIFA for prompt alignment
|
| 6 |
+
- VLM-as-Judge for holistic assessment
|
| 7 |
+
- Technical metrics (sharpness, colorfulness, contrast, CLIP)
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
import gradio as gr
|
| 11 |
+
from PIL import Image
|
| 12 |
+
from typing import Optional
|
| 13 |
+
import time
|
| 14 |
+
|
| 15 |
+
# Global evaluators (loaded on first use)
|
| 16 |
+
image_evaluator = None
|
| 17 |
+
edit_evaluator = None
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def get_image_evaluator():
    """Return the process-wide ImageEvaluator, constructing it on first use.

    The import and model construction are deferred so the app starts fast;
    the first evaluation request pays the load cost.
    """
    global image_evaluator
    if image_evaluator is not None:
        return image_evaluator
    from evaluator import ImageEvaluator
    image_evaluator = ImageEvaluator()
    return image_evaluator
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def get_edit_evaluator():
    """Return the process-wide EditEvaluator, constructing it on first use.

    Mirrors get_image_evaluator: lazy import + lazy construction so model
    loading happens on the first edit-evaluation request, not at startup.
    """
    global edit_evaluator
    if edit_evaluator is not None:
        return edit_evaluator
    from evaluator import EditEvaluator
    edit_evaluator = EditEvaluator()
    return edit_evaluator
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def format_score_html(score: float, label: str) -> str:
    """Render *score* as a colored HTML <span> followed by *label*.

    The color encodes the band the score falls in:
    green >= 0.85, lime >= 0.70, yellow >= 0.50, red below that.
    """
    # Ordered (cutoff, color) bands; first match wins, red is the fallback.
    bands = (
        (0.85, "#22c55e"),  # green
        (0.70, "#84cc16"),  # lime
        (0.50, "#eab308"),  # yellow
    )
    color = next((c for cutoff, c in bands if score >= cutoff), "#ef4444")  # red fallback
    return f'<span style="color: {color}; font-weight: bold;">{score:.3f}</span> {label}'
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def format_grade_badge(grade: str, passed: bool) -> str:
    """Render *grade* as an HTML badge plus a PASSED/FAILED label.

    Both the badge background and the label text color are green when
    *passed* is truthy and red otherwise.
    """
    status_color = "#22c55e" if passed else "#ef4444"
    status_text = "PASSED" if passed else "FAILED"
    return f'''
    <div style="display: inline-flex; align-items: center; gap: 8px;">
        <span style="background: {status_color}; color: white; padding: 4px 12px; border-radius: 4px; font-weight: bold; font-size: 1.2em;">
            {grade}
        </span>
        <span style="color: {status_color};">
            {status_text}
        </span>
    </div>
    '''
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def evaluate_image(
    image: Image.Image,
    prompt: str,
    include_soft_tifa: bool,
    include_vlm: bool,
    include_technical: bool,
    progress=gr.Progress()
) -> tuple:
    """Evaluate an AI-generated image and return five HTML result panels.

    Args:
        image: The image to assess (None triggers a friendly error message).
        prompt: The generation prompt; optional, but Soft-TIFA needs it.
        include_soft_tifa: Run Soft-TIFA prompt-alignment scoring.
        include_vlm: Run the VLM-as-Judge holistic assessment.
        include_technical: Compute CLIP/sharpness/colorfulness/contrast.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        A 5-tuple of HTML strings: (overall, breakdown, soft_tifa, vlm,
        technical).  On any error the first element carries the message and
        the remaining four are empty, keeping the output arity stable for
        the Gradio callback.
    """

    def _error(message: str) -> tuple:
        # Single definition of the error-shaped return value (was repeated
        # three times inline).
        return (message, "", "", "", "")

    def _metric_card(name: str, value: float) -> str:
        # One breakdown card with a colored 0-100% bar.  Extracted because
        # the color-threshold + bar markup was duplicated per metric.
        bar_width = int(value * 100)
        if value >= 0.7:
            bar_color = "#22c55e"
        elif value >= 0.5:
            bar_color = "#eab308"
        else:
            bar_color = "#ef4444"
        return f"""
        <div style="background: #374151; padding: 12px; border-radius: 6px;">
            <div style="color: #9ca3af; font-size: 0.85em; margin-bottom: 4px;">{name}</div>
            <div style="font-weight: bold; color: #f3f4f6; margin-bottom: 6px;">{value:.3f}</div>
            <div style="background: #1f2937; border-radius: 4px; height: 6px; overflow: hidden;">
                <div style="background: {bar_color}; width: {bar_width}%; height: 100%;"></div>
            </div>
        </div>
        """

    if image is None:
        return _error("Please upload an image.")

    progress(0.1, desc="Loading models...")

    try:
        evaluator = get_image_evaluator()
    except Exception as e:
        return _error(f"Error loading models: {str(e)}")

    progress(0.2, desc="Starting evaluation...")

    prompt_text = prompt.strip() if prompt else None

    try:
        result = evaluator.evaluate(
            image=image,
            prompt=prompt_text,
            # Soft-TIFA is meaningless without a prompt, so it is forced off
            # when no prompt was supplied.
            include_soft_tifa=include_soft_tifa and bool(prompt_text),
            include_vlm=include_vlm,
            include_technical=include_technical,
        )
    except Exception as e:
        return _error(f"Evaluation error: {str(e)}")

    progress(0.9, desc="Formatting results...")

    # --- Overall score panel -------------------------------------------
    score = result.score
    overall_html = f"""
    <div style="padding: 16px; background: #1f2937; border-radius: 8px; margin-bottom: 16px;">
        <h2 style="margin: 0 0 12px 0; color: #f3f4f6;">Overall Score</h2>
        <div style="font-size: 2.5em; font-weight: bold; color: #60a5fa; margin-bottom: 8px;">
            {score.overall:.3f}
        </div>
        {format_grade_badge(score.grade, score.passed)}
        <div style="margin-top: 12px; color: #9ca3af;">
            Confidence: {score.confidence:.0%} | Time: {result.evaluation_time:.1f}s
        </div>
        <div style="margin-top: 8px; padding: 8px; background: #374151; border-radius: 4px; color: #d1d5db;">
            {score.recommendation}
        </div>
    </div>
    """

    # --- Breakdown panel: one card per non-None sub-score --------------
    breakdown = score.breakdown
    metrics = [
        ("Prompt Alignment", breakdown.prompt_alignment),
        ("Technical Quality", breakdown.technical_quality),
        ("Aesthetic Appeal", breakdown.aesthetic_appeal),
        ("Realism", breakdown.realism),
        ("Artifacts (inv)", breakdown.artifacts),
    ]
    breakdown_html = (
        "<div style='display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 12px;'>"
        + "".join(_metric_card(name, value) for name, value in metrics if value is not None)
        + "</div>"
    )

    # --- Soft-TIFA panel (only when that component ran) -----------------
    soft_tifa_html = ""
    if result.soft_tifa:
        st = result.soft_tifa
        soft_tifa_html = f"""
        <div style="padding: 12px; background: #1f2937; border-radius: 8px;">
            <h3 style="margin: 0 0 8px 0; color: #f3f4f6;">Soft-TIFA Results</h3>
            <div style="color: #9ca3af; margin-bottom: 8px;">
                Primitives: {st.primitives_count} | Atom Score: {st.atom_score:.3f} | Prompt Score: {st.prompt_score:.3f}
            </div>
        """
        if st.primitive_results:
            soft_tifa_html += "<div style='max-height: 200px; overflow-y: auto;'>"
            # Cap at 10 rows to keep the panel compact.
            for pr in st.primitive_results[:10]:
                icon = "o" if pr.score >= 0.7 else "x"
                color = "#22c55e" if pr.score >= 0.7 else "#ef4444"
                soft_tifa_html += f"""
                <div style="padding: 6px; border-bottom: 1px solid #374151; display: flex; justify-content: space-between;">
                    <span style="color: {color};">[{icon}] {pr.content}</span>
                    <span style="color: #9ca3af;">{pr.score:.2f}</span>
                </div>
                """
            soft_tifa_html += "</div>"
        soft_tifa_html += "</div>"

    # --- VLM-as-Judge panel ---------------------------------------------
    vlm_html = ""
    if result.vlm_assessment:
        vlm = result.vlm_assessment
        vlm_html = f"""
        <div style="padding: 12px; background: #1f2937; border-radius: 8px;">
            <h3 style="margin: 0 0 8px 0; color: #f3f4f6;">VLM-as-Judge Assessment</h3>
            <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 8px; color: #d1d5db;">
                <div>Technical: {vlm.technical_quality:.1f}/10</div>
                <div>Aesthetic: {vlm.aesthetic_appeal:.1f}/10</div>
                <div>Realism: {vlm.realism:.1f}/10</div>
                <div>Overall: {vlm.overall:.1f}/10</div>
            </div>
        """
        if vlm.artifacts_detected:
            vlm_html += f"""
            <div style="margin-top: 8px; padding: 8px; background: #7f1d1d; border-radius: 4px;">
                <strong>Artifacts ({vlm.artifacts_severity}):</strong> {', '.join(vlm.artifacts_detected[:5])}
            </div>
            """
        if vlm.reasoning:
            # Truncate the free-text reasoning to keep the card readable.
            vlm_html += f"""
            <div style="margin-top: 8px; color: #9ca3af; font-style: italic;">
                "{vlm.reasoning[:200]}"
            </div>
            """
        vlm_html += "</div>"

    # --- Technical metrics panel ----------------------------------------
    technical_html = ""
    if result.technical_metrics:
        tm = result.technical_metrics
        technical_html = f"""
        <div style="padding: 12px; background: #1f2937; border-radius: 8px;">
            <h3 style="margin: 0 0 8px 0; color: #f3f4f6;">Technical Metrics</h3>
            <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 8px; color: #d1d5db;">
        """
        if tm.clip_score is not None:
            technical_html += f"<div>CLIP Score: {tm.clip_score:.3f}</div>"
        if tm.sharpness is not None:
            technical_html += f"<div>Sharpness: {tm.sharpness:.3f}</div>"
        if tm.colorfulness is not None:
            technical_html += f"<div>Colorfulness: {tm.colorfulness:.3f}</div>"
        if tm.contrast is not None:
            technical_html += f"<div>Contrast: {tm.contrast:.3f}</div>"
        technical_html += "</div></div>"

    return (
        overall_html,
        breakdown_html,
        soft_tifa_html,
        vlm_html,
        technical_html,
    )
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def evaluate_edit(
    source_image: Image.Image,
    edited_image: Image.Image,
    instruction: str,
    progress=gr.Progress()
) -> tuple:
    """Evaluate an image-editing result and return five HTML result panels.

    Args:
        source_image: The image before editing.
        edited_image: The image after editing.
        instruction: The edit instruction that was applied.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        A 5-tuple of HTML strings: (overall, breakdown, instruction
        following, preservation, edit quality).  On any error the first
        element carries the message and the remaining four are empty,
        keeping the output arity stable for the Gradio callback.
    """

    def _error(message: str) -> tuple:
        # Single definition of the error-shaped return value.
        return (message, "", "", "", "")

    def _metric_card(name: str, value: float) -> str:
        # One breakdown card with a colored 0-100% bar (same thresholds as
        # the image-evaluation breakdown).
        bar_width = int(value * 100)
        if value >= 0.7:
            bar_color = "#22c55e"
        elif value >= 0.5:
            bar_color = "#eab308"
        else:
            bar_color = "#ef4444"
        return f"""
        <div style="background: #374151; padding: 12px; border-radius: 6px;">
            <div style="color: #9ca3af; font-size: 0.85em; margin-bottom: 4px;">{name}</div>
            <div style="font-weight: bold; color: #f3f4f6; margin-bottom: 6px;">{value:.3f}</div>
            <div style="background: #1f2937; border-radius: 4px; height: 6px; overflow: hidden;">
                <div style="background: {bar_color}; width: {bar_width}%; height: 100%;"></div>
            </div>
        </div>
        """

    if source_image is None or edited_image is None:
        return _error("Please upload both source and edited images.")

    # Bug fix: the old guard called instruction.strip() unconditionally,
    # which raises AttributeError when Gradio passes None for an untouched
    # textbox.  Check for None/empty first.
    if not instruction or not instruction.strip():
        return _error("Please enter the editing instruction.")

    progress(0.1, desc="Loading models...")

    try:
        evaluator = get_edit_evaluator()
    except Exception as e:
        return _error(f"Error loading models: {str(e)}")

    progress(0.2, desc="Starting evaluation...")

    try:
        result = evaluator.evaluate(
            source_image=source_image,
            edited_image=edited_image,
            instruction=instruction.strip(),
        )
    except Exception as e:
        return _error(f"Evaluation error: {str(e)}")

    progress(0.9, desc="Formatting results...")

    # --- Overall score panel -------------------------------------------
    score = result.score
    overall_html = f"""
    <div style="padding: 16px; background: #1f2937; border-radius: 8px; margin-bottom: 16px;">
        <h2 style="margin: 0 0 12px 0; color: #f3f4f6;">Edit Quality Score</h2>
        <div style="font-size: 2.5em; font-weight: bold; color: #60a5fa; margin-bottom: 8px;">
            {score.overall:.3f}
        </div>
        {format_grade_badge(score.grade, score.passed)}
        <div style="margin-top: 12px; color: #9ca3af;">
            Confidence: {score.confidence:.0%} | Time: {result.evaluation_time:.1f}s
        </div>
        <div style="margin-top: 8px; padding: 8px; background: #374151; border-radius: 4px; color: #d1d5db;">
            {score.recommendation}
        </div>
    </div>
    """

    # --- Breakdown panel: one card per non-None sub-score --------------
    breakdown = score.breakdown
    metrics = [
        ("Instruction Following", breakdown.instruction_following),
        ("Preservation", breakdown.preservation),
        ("Edit Quality", breakdown.edit_quality),
        ("Artifacts (inv)", breakdown.artifacts),
    ]
    breakdown_html = (
        "<div style='display: grid; grid-template-columns: repeat(2, 1fr); gap: 12px;'>"
        + "".join(_metric_card(name, value) for name, value in metrics if value is not None)
        + "</div>"
    )

    # --- Instruction-following panel ------------------------------------
    instruction_html = ""
    if result.instruction_following:
        inst = result.instruction_following
        instruction_html = f"""
        <div style="padding: 12px; background: #1f2937; border-radius: 8px;">
            <h3 style="margin: 0 0 8px 0; color: #f3f4f6;">Instruction Following</h3>
            <div style="color: #9ca3af; margin-bottom: 8px;">
                Overall: {inst.overall_score:.3f}
            </div>
        """
        if inst.primitive_scores:
            instruction_html += "<div style='max-height: 150px; overflow-y: auto;'>"
            # Cap at 5 rows; primitive scores arrive on a 0-10 scale.
            for ps in inst.primitive_scores[:5]:
                score_val = ps.get("score", 0) / 10.0
                icon = "o" if score_val >= 0.7 else "x"
                color = "#22c55e" if score_val >= 0.7 else "#ef4444"
                instruction_html += f"""
                <div style="padding: 6px; border-bottom: 1px solid #374151;">
                    <span style="color: {color};">[{icon}]</span> {ps.get('edit', 'N/A')}: {ps.get('score', 0):.1f}/10
                </div>
                """
            instruction_html += "</div>"
        instruction_html += "</div>"

    # --- Preservation panel (LPIPS / SSIM / PSNR) ------------------------
    preservation_html = ""
    if result.preservation:
        pres = result.preservation
        preservation_html = f"""
        <div style="padding: 12px; background: #1f2937; border-radius: 8px;">
            <h3 style="margin: 0 0 8px 0; color: #f3f4f6;">Preservation Metrics</h3>
            <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 8px; color: #d1d5db;">
        """
        if pres.lpips_score is not None:
            preservation_html += f"<div>LPIPS: {pres.lpips_score:.3f} (lower=better)</div>"
        if pres.ssim_score is not None:
            preservation_html += f"<div>SSIM: {pres.ssim_score:.3f}</div>"
        if pres.psnr_score is not None:
            preservation_html += f"<div>PSNR: {pres.psnr_score:.3f}</div>"
        preservation_html += f"<div><strong>Combined: {pres.overall_score:.3f}</strong></div>"
        preservation_html += "</div></div>"

    # --- Edit-quality panel ----------------------------------------------
    quality_html = ""
    if result.edit_quality:
        eq = result.edit_quality
        quality_html = f"""
        <div style="padding: 12px; background: #1f2937; border-radius: 8px;">
            <h3 style="margin: 0 0 8px 0; color: #f3f4f6;">Edit Quality Assessment</h3>
            <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 8px; color: #d1d5db;">
                <div>Technical: {eq.technical_score:.1f}/10</div>
                <div>Aesthetic: {eq.aesthetic_score:.1f}/10</div>
                <div>Coherence: {eq.coherence_score:.1f}/10</div>
                <div>Overall: {eq.overall_score:.3f}</div>
            </div>
        """
        if eq.artifacts:
            quality_html += f"""
            <div style="margin-top: 8px; padding: 8px; background: #7f1d1d; border-radius: 4px;">
                <strong>Artifacts ({eq.artifact_severity}):</strong> {', '.join(eq.artifacts[:5])}
            </div>
            """
        quality_html += "</div>"

    return (
        overall_html,
        breakdown_html,
        instruction_html,
        preservation_html,
        quality_html,
    )
| 430 |
+
|
| 431 |
+
|
| 432 |
+
# ---------------------------------------------------------------------------
# Gradio interface: two tabs sharing one page — text-to-image evaluation and
# edit-pair evaluation.  Each tab wires its inputs into the matching
# evaluate_* callback, which returns five HTML panels.
# ---------------------------------------------------------------------------
with gr.Blocks(
    title="Fal Image Evaluator",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="slate",
    ),
    css="""
    .gradio-container { max-width: 1200px !important; }
    .score-display { font-size: 1.5em; font-weight: bold; }
    """
) as demo:

    gr.Markdown("""
    # Fal Image Evaluator

    AI image quality assessment using **Soft-TIFA**, **VLM-as-Judge**, and technical metrics.

    - **Image Evaluation**: Assess text-to-image generation quality
    - **Edit Evaluation**: Assess image editing quality
    """)

    with gr.Tabs():
        # ---- Tab 1: single-image evaluation -----------------------------
        with gr.TabItem("Image Evaluation"):
            with gr.Row():
                with gr.Column(scale=1):
                    image_input = gr.Image(
                        label="Upload Image",
                        type="pil",
                        height=400,
                    )
                    prompt_input = gr.Textbox(
                        label="Generation Prompt (optional but recommended)",
                        placeholder="Enter the prompt used to generate this image...",
                        lines=3,
                    )

                    # Component toggles: each checkbox enables one scorer.
                    with gr.Row():
                        soft_tifa_check = gr.Checkbox(
                            label="Soft-TIFA",
                            value=True,
                            info="Prompt alignment (requires prompt)"
                        )
                        vlm_check = gr.Checkbox(
                            label="VLM-as-Judge",
                            value=True,
                            info="Holistic assessment"
                        )
                        technical_check = gr.Checkbox(
                            label="Technical Metrics",
                            value=True,
                            info="CLIP, sharpness, etc."
                        )

                    evaluate_btn = gr.Button("Evaluate Image", variant="primary", size="lg")

                with gr.Column(scale=1):
                    overall_output = gr.HTML(label="Overall Score")
                    breakdown_output = gr.HTML(label="Score Breakdown")

            # Detail panels sit below the input/summary columns.
            with gr.Row():
                soft_tifa_output = gr.HTML(label="Soft-TIFA Details")
                vlm_output = gr.HTML(label="VLM Assessment")
                technical_output = gr.HTML(label="Technical Metrics")

            evaluate_btn.click(
                fn=evaluate_image,
                inputs=[
                    image_input,
                    prompt_input,
                    soft_tifa_check,
                    vlm_check,
                    technical_check,
                ],
                outputs=[
                    overall_output,
                    breakdown_output,
                    soft_tifa_output,
                    vlm_output,
                    technical_output,
                ],
            )

        # ---- Tab 2: edit-pair evaluation --------------------------------
        with gr.TabItem("Edit Evaluation"):
            with gr.Row():
                with gr.Column(scale=1):
                    with gr.Row():
                        source_input = gr.Image(
                            label="Source Image (Before)",
                            type="pil",
                            height=300,
                        )
                        edited_input = gr.Image(
                            label="Edited Image (After)",
                            type="pil",
                            height=300,
                        )

                    instruction_input = gr.Textbox(
                        label="Edit Instruction",
                        placeholder="Enter the editing instruction that was applied...",
                        lines=2,
                    )

                    edit_btn = gr.Button("Evaluate Edit", variant="primary", size="lg")

                with gr.Column(scale=1):
                    edit_overall_output = gr.HTML(label="Overall Score")
                    edit_breakdown_output = gr.HTML(label="Score Breakdown")

            with gr.Row():
                instruction_output = gr.HTML(label="Instruction Following")
                preservation_output = gr.HTML(label="Preservation")
                quality_output = gr.HTML(label="Edit Quality")

            edit_btn.click(
                fn=evaluate_edit,
                inputs=[
                    source_input,
                    edited_input,
                    instruction_input,
                ],
                outputs=[
                    edit_overall_output,
                    edit_breakdown_output,
                    instruction_output,
                    preservation_output,
                    quality_output,
                ],
            )

    gr.Markdown("""
    ---
    **Components:**
    - **Soft-TIFA**: Decomposes prompts into atomic primitives and verifies each via VQA
    - **VLM-as-Judge**: Uses Qwen2.5-VL for holistic quality assessment
    - **Technical Metrics**: CLIP score, sharpness (Laplacian), colorfulness, contrast
    - **Preservation Metrics**: LPIPS, SSIM, PSNR for edit evaluation

    *Powered by Qwen2.5-VL-7B-Instruct*
    """)


if __name__ == "__main__":
    demo.launch()
evaluator.py
ADDED
|
@@ -0,0 +1,1049 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Image Evaluator Core Logic
|
| 3 |
+
|
| 4 |
+
Contains the main evaluation classes:
|
| 5 |
+
- ImageEvaluator: For text-to-image generation quality assessment
|
| 6 |
+
- EditEvaluator: For image editing quality assessment
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import re
|
| 10 |
+
import math
|
| 11 |
+
import time
|
| 12 |
+
from typing import Optional, List, Dict, Any
|
| 13 |
+
from dataclasses import dataclass, field
|
| 14 |
+
from PIL import Image
|
| 15 |
+
|
| 16 |
+
from metrics import (
|
| 17 |
+
parse_json_robust,
|
| 18 |
+
calculate_sharpness,
|
| 19 |
+
calculate_colorfulness,
|
| 20 |
+
calculate_contrast,
|
| 21 |
+
calculate_ssim,
|
| 22 |
+
calculate_psnr,
|
| 23 |
+
calculate_clip_score,
|
| 24 |
+
calculate_lpips,
|
| 25 |
+
score_to_grade,
|
| 26 |
+
geometric_mean,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
|
@dataclass
class PrimitiveResult:
    """Result for a single Soft-TIFA primitive (one atomic visual fact)."""
    content: str                     # visual element text, e.g. "a red car"
    type: str                        # one of: object/attribute/count/relation/action/scene/style
    question: str                    # VQA question asked about this primitive
    answer: str                      # parsed VLM answer ("yes" or "no")
    score: float                     # soft score in [0, 1]
    reasoning: Optional[str] = None  # brief VLM justification, if parseable


@dataclass
class SoftTIFAResult:
    """Soft-TIFA evaluation result."""
    primitives_count: int     # number of primitives actually evaluated
    atom_score: float         # mean of per-primitive soft scores
    prompt_score: float       # blended prompt-alignment score in [0, 1]
    passed: bool              # prompt_score >= 0.7
    primitive_results: List[PrimitiveResult]  # per-primitive details


@dataclass
class VLMAssessmentResult:
    """VLM-as-Judge assessment result (dimension scores on a 1-10 scale)."""
    technical_quality: float
    aesthetic_appeal: float
    realism: float
    semantic_accuracy: Optional[float]  # None when no prompt was provided
    artifacts_detected: List[str]       # artifact descriptions reported by the VLM
    artifacts_severity: str             # e.g. "none"/"minor"/"moderate"/"major"/"unknown"
    overall: float
    reasoning: Optional[str] = None     # VLM's overall reasoning, if present


@dataclass
class TechnicalMetricsResult:
    """Technical metrics result; None means a metric could not be computed."""
    clip_score: Optional[float] = None   # CLIP text-image similarity (only computed with a prompt)
    sharpness: Optional[float] = None
    colorfulness: Optional[float] = None
    contrast: Optional[float] = None


@dataclass
class ScoreBreakdown:
    """Detailed score breakdown by category (None = category unavailable)."""
    prompt_alignment: Optional[float] = None
    technical_quality: Optional[float] = None
    aesthetic_appeal: Optional[float] = None
    realism: Optional[float] = None
    artifacts: Optional[float] = None


@dataclass
class AggregatedScore:
    """Comprehensive aggregated scoring."""
    overall: float                  # weighted overall score in [0, 1]
    grade: str                      # letter grade derived from overall (score_to_grade)
    passed: bool                    # overall >= 0.7
    confidence: float               # fraction of score categories that were available
    breakdown: ScoreBreakdown
    weights_used: Dict[str, float]  # normalized weights actually applied
    recommendation: str             # human-readable next-step suggestion


@dataclass
class ImageEvalResult:
    """Complete image evaluation result."""
    score: AggregatedScore
    soft_tifa: Optional[SoftTIFAResult] = None
    vlm_assessment: Optional[VLMAssessmentResult] = None
    technical_metrics: Optional[TechnicalMetricsResult] = None
    evaluation_time: float = 0.0  # wall-clock seconds for the whole evaluation
| 103 |
+
|
| 104 |
+
|
@dataclass
class InstructionFollowingResult:
    """Instruction following evaluation result."""
    edit_primitives: List[Dict]      # atomic edits decomposed from the instruction
    primitive_scores: List[Dict]     # per-edit verification entries
    overall_score: float             # aggregate in [0, 1]
    reasoning: Optional[str] = None  # optional free-text justification


@dataclass
class PreservationResult:
    """Preservation evaluation result (None = metric unavailable)."""
    lpips_score: Optional[float] = None
    ssim_score: Optional[float] = None
    psnr_score: Optional[float] = None
    overall_score: float = 0.0  # aggregate preservation score


@dataclass
class EditQualityResult:
    """Edit quality assessment result."""
    technical_score: float
    aesthetic_score: float
    coherence_score: float
    artifacts: List[str]             # artifact descriptions reported by the VLM
    artifact_severity: str
    overall_score: float
    reasoning: Optional[str] = None


@dataclass
class EditScoreBreakdown:
    """Detailed score breakdown for editing evaluation (None = unavailable)."""
    instruction_following: Optional[float] = None
    preservation: Optional[float] = None
    edit_quality: Optional[float] = None
    artifacts: Optional[float] = None


@dataclass
class EditAggregatedScore:
    """Comprehensive aggregated scoring for editing."""
    overall: float
    grade: str
    passed: bool
    confidence: float
    breakdown: EditScoreBreakdown
    weights_used: Dict[str, float]
    recommendation: str


@dataclass
class EditEvalResult:
    """Complete edit evaluation result."""
    score: EditAggregatedScore
    instruction_following: Optional[InstructionFollowingResult] = None
    preservation: Optional[PreservationResult] = None
    edit_quality: Optional[EditQualityResult] = None
    evaluation_time: float = 0.0  # wall-clock seconds
| 164 |
+
|
| 165 |
+
|
class ImageEvaluator:
    """
    AI-Generated Image Quality Evaluator

    Evaluates AI-generated images using:
    - Soft-TIFA: Atomic prompt decomposition for precise alignment scoring
    - VLM-as-Judge: Human-like holistic assessment with reasoning
    - Technical Metrics: Sharpness, colorfulness, contrast, CLIP score
    """

    def __init__(self, device: str = "cuda"):
        """Initialize evaluator with models.

        Args:
            device: Preferred device; silently falls back to "cpu" when CUDA
                is unavailable. Note the VLM itself is placed by
                ``device_map="auto"``, so ``self.device`` governs only CLIP.
        """
        # Imports are local so merely importing this module stays cheap.
        import torch
        from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

        self.device = device if torch.cuda.is_available() else "cpu"

        # Load Qwen2.5-VL for VLM-as-Judge and Soft-TIFA
        model_name = "Qwen/Qwen2.5-VL-7B-Instruct"

        self.vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16,  # fp16 halves memory; requires fp16-capable hardware
        )
        self.vlm_processor = AutoProcessor.from_pretrained(model_name)

        # Load CLIP for text-image alignment
        import open_clip
        self.clip_model, _, self.clip_preprocess = open_clip.create_model_and_transforms(
            'ViT-B-32', pretrained='openai'
        )
        # eval() disables dropout etc.; CLIP is inference-only here.
        self.clip_model = self.clip_model.to(self.device).eval()
        self.clip_tokenizer = open_clip.get_tokenizer('ViT-B-32')
| 200 |
+
|
| 201 |
+
def _vlm_generate(self, image: Image.Image, prompt: str) -> str:
|
| 202 |
+
"""Generate response from VLM with image."""
|
| 203 |
+
import torch
|
| 204 |
+
|
| 205 |
+
messages = [
|
| 206 |
+
{
|
| 207 |
+
"role": "user",
|
| 208 |
+
"content": [
|
| 209 |
+
{"type": "image", "image": image},
|
| 210 |
+
{"type": "text", "text": prompt},
|
| 211 |
+
],
|
| 212 |
+
}
|
| 213 |
+
]
|
| 214 |
+
|
| 215 |
+
text = self.vlm_processor.apply_chat_template(
|
| 216 |
+
messages, tokenize=False, add_generation_prompt=True
|
| 217 |
+
)
|
| 218 |
+
|
| 219 |
+
inputs = self.vlm_processor(
|
| 220 |
+
text=[text],
|
| 221 |
+
images=[image],
|
| 222 |
+
return_tensors="pt",
|
| 223 |
+
).to(self.vlm_model.device)
|
| 224 |
+
|
| 225 |
+
with torch.no_grad():
|
| 226 |
+
outputs = self.vlm_model.generate(
|
| 227 |
+
**inputs,
|
| 228 |
+
max_new_tokens=1024,
|
| 229 |
+
do_sample=False,
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
generated = outputs[0][inputs.input_ids.shape[1]:]
|
| 233 |
+
return self.vlm_processor.decode(generated, skip_special_tokens=True)
|
| 234 |
+
|
| 235 |
+
def _vlm_text_generate(self, prompt: str) -> str:
|
| 236 |
+
"""Generate response from VLM (text only)."""
|
| 237 |
+
import torch
|
| 238 |
+
|
| 239 |
+
messages = [{"role": "user", "content": prompt}]
|
| 240 |
+
|
| 241 |
+
text = self.vlm_processor.apply_chat_template(
|
| 242 |
+
messages, tokenize=False, add_generation_prompt=True
|
| 243 |
+
)
|
| 244 |
+
|
| 245 |
+
inputs = self.vlm_processor(
|
| 246 |
+
text=[text],
|
| 247 |
+
return_tensors="pt",
|
| 248 |
+
).to(self.vlm_model.device)
|
| 249 |
+
|
| 250 |
+
with torch.no_grad():
|
| 251 |
+
outputs = self.vlm_model.generate(
|
| 252 |
+
**inputs,
|
| 253 |
+
max_new_tokens=1024,
|
| 254 |
+
do_sample=False,
|
| 255 |
+
)
|
| 256 |
+
|
| 257 |
+
generated = outputs[0][inputs.input_ids.shape[1]:]
|
| 258 |
+
return self.vlm_processor.decode(generated, skip_special_tokens=True)
|
| 259 |
+
|
| 260 |
+
def evaluate_soft_tifa(self, image: Image.Image, prompt: str) -> SoftTIFAResult:
|
| 261 |
+
"""Run Soft-TIFA evaluation with atomic prompt decomposition."""
|
| 262 |
+
# Step 1: Decompose prompt into primitives
|
| 263 |
+
decomposition_prompt = f'''Decompose this text-to-image prompt into atomic visual primitives.
|
| 264 |
+
|
| 265 |
+
Prompt: "{prompt}"
|
| 266 |
+
|
| 267 |
+
For each primitive, identify:
|
| 268 |
+
- content: The specific visual element (e.g., "a red car", "sunset sky")
|
| 269 |
+
- type: One of [object, attribute, count, relation, action, scene, style]
|
| 270 |
+
- importance: How critical (0.5-1.0)
|
| 271 |
+
|
| 272 |
+
Example for "A cat sitting on a red chair":
|
| 273 |
+
[
|
| 274 |
+
{{"content": "cat", "type": "object", "importance": 1.0}},
|
| 275 |
+
{{"content": "chair", "type": "object", "importance": 0.9}},
|
| 276 |
+
{{"content": "red chair", "type": "attribute", "importance": 0.8}},
|
| 277 |
+
{{"content": "cat sitting on chair", "type": "relation", "importance": 0.9}}
|
| 278 |
+
]
|
| 279 |
+
|
| 280 |
+
Return ONLY valid JSON array for the given prompt:'''
|
| 281 |
+
|
| 282 |
+
decomp_response = self._vlm_text_generate(decomposition_prompt)
|
| 283 |
+
primitives = parse_json_robust(decomp_response, fallback=[])
|
| 284 |
+
|
| 285 |
+
if not primitives or not isinstance(primitives, list):
|
| 286 |
+
return SoftTIFAResult(
|
| 287 |
+
primitives_count=0,
|
| 288 |
+
atom_score=0.0,
|
| 289 |
+
prompt_score=0.0,
|
| 290 |
+
passed=False,
|
| 291 |
+
primitive_results=[],
|
| 292 |
+
)
|
| 293 |
+
|
| 294 |
+
# Step 2: Evaluate each primitive via VQA
|
| 295 |
+
primitive_results = []
|
| 296 |
+
vqa_templates = {
|
| 297 |
+
"object": "Is there a {content} in this image?",
|
| 298 |
+
"attribute": "Does the image show {content}?",
|
| 299 |
+
"count": "Are there {content}?",
|
| 300 |
+
"relation": "Is it true that {content}?",
|
| 301 |
+
"action": "Is {content} happening in this image?",
|
| 302 |
+
"scene": "Does this image depict {content}?",
|
| 303 |
+
"style": "Is this image in {content} style?",
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
for prim in primitives[:20]: # Limit to 20 primitives
|
| 307 |
+
content = prim.get("content", "")
|
| 308 |
+
ptype = prim.get("type", "object")
|
| 309 |
+
|
| 310 |
+
template = vqa_templates.get(ptype, vqa_templates["object"])
|
| 311 |
+
question = template.format(content=content)
|
| 312 |
+
|
| 313 |
+
vqa_prompt = f"""{question}
|
| 314 |
+
Answer Yes or No with confidence (0-100%).
|
| 315 |
+
Format: [Yes/No] (confidence: X%) - brief reasoning"""
|
| 316 |
+
|
| 317 |
+
response = self._vlm_generate(image, vqa_prompt)
|
| 318 |
+
|
| 319 |
+
# Parse response
|
| 320 |
+
answer = "no"
|
| 321 |
+
confidence = 0.5
|
| 322 |
+
reasoning = None
|
| 323 |
+
|
| 324 |
+
response_lower = response.lower().strip()
|
| 325 |
+
if response_lower.startswith("yes") or "[yes]" in response_lower:
|
| 326 |
+
answer = "yes"
|
| 327 |
+
|
| 328 |
+
conf_match = re.search(r'confidence[:\s]*(\d+)%?', response_lower)
|
| 329 |
+
if conf_match:
|
| 330 |
+
confidence = float(conf_match.group(1)) / 100.0
|
| 331 |
+
|
| 332 |
+
if "-" in response:
|
| 333 |
+
parts = response.split("-", 1)
|
| 334 |
+
if len(parts) > 1:
|
| 335 |
+
reasoning = parts[1].strip()[:200]
|
| 336 |
+
|
| 337 |
+
# Calculate score
|
| 338 |
+
score = confidence if answer == "yes" else (1.0 - confidence)
|
| 339 |
+
|
| 340 |
+
primitive_results.append(PrimitiveResult(
|
| 341 |
+
content=content,
|
| 342 |
+
type=ptype,
|
| 343 |
+
question=question,
|
| 344 |
+
answer=answer,
|
| 345 |
+
score=score,
|
| 346 |
+
reasoning=reasoning,
|
| 347 |
+
))
|
| 348 |
+
|
| 349 |
+
# Aggregate scores
|
| 350 |
+
if primitive_results:
|
| 351 |
+
atom_score = sum(r.score for r in primitive_results) / len(primitive_results)
|
| 352 |
+
geo_mean = geometric_mean([r.score for r in primitive_results])
|
| 353 |
+
prompt_score = 0.7 * atom_score + 0.3 * geo_mean
|
| 354 |
+
else:
|
| 355 |
+
atom_score = 0.0
|
| 356 |
+
prompt_score = 0.0
|
| 357 |
+
|
| 358 |
+
return SoftTIFAResult(
|
| 359 |
+
primitives_count=len(primitive_results),
|
| 360 |
+
atom_score=atom_score,
|
| 361 |
+
prompt_score=prompt_score,
|
| 362 |
+
passed=prompt_score >= 0.7,
|
| 363 |
+
primitive_results=primitive_results,
|
| 364 |
+
)
|
| 365 |
+
|
    def evaluate_vlm_judge(self, image: Image.Image, prompt: Optional[str]) -> VLMAssessmentResult:
        """Run VLM-as-Judge holistic assessment.

        Asks the VLM to rate the image 1-10 on several dimensions and report
        AI artifacts as JSON. Falls back to neutral 5.0 scores when the reply
        cannot be parsed as the expected structure.
        """
        # Prompt context and the semantic-accuracy example field are only
        # included when a generation prompt is available.
        prompt_context = f'Original prompt: "{prompt}"' if prompt else ""
        semantic_field = '"semantic_accuracy": {"score": 8, "reasoning": "matches prompt well"},' if prompt else ""

        eval_prompt = f"""Evaluate this AI-generated image on multiple dimensions.
{prompt_context}

Rate each dimension from 1-10:
- **Technical Quality**: Sharpness, noise level, color accuracy, resolution
- **Aesthetic Appeal**: Composition, color harmony, visual balance, style
- **Realism**: Physical plausibility, lighting consistency, proportions
{('- **Semantic Accuracy**: How well it matches the prompt' if prompt else '')}
- **AI Artifacts**: Detect issues like distorted faces/hands, extra limbs, text errors

Example output:
{{
"technical_quality": {{"score": 8, "reasoning": "sharp with good colors"}},
"aesthetic_appeal": {{"score": 7, "reasoning": "balanced composition"}},
"realism": {{"score": 6, "reasoning": "slightly off proportions"}},
{semantic_field}
"artifacts": {{"detected": ["slightly distorted fingers"], "severity": "minor"}},
"overall": {{"score": 7, "reasoning": "good quality with minor issues"}}
}}

Now evaluate this image and return ONLY valid JSON:"""

        response = self._vlm_generate(image, eval_prompt)
        data = parse_json_robust(response, fallback=None)

        if data and isinstance(data, dict):
            try:
                def get_score(key: str, default: float = 5.0) -> float:
                    # Accepts both {"score": X, ...} objects and bare numbers;
                    # missing or falsy values fall back to the neutral default.
                    val = data.get(key, {})
                    if isinstance(val, dict):
                        return float(val.get("score", default))
                    return float(val) if val else default

                artifacts = data.get("artifacts", {})
                if isinstance(artifacts, dict):
                    detected = artifacts.get("detected", [])
                    severity = artifacts.get("severity", "unknown")
                else:
                    detected = []
                    severity = "unknown"

                return VLMAssessmentResult(
                    technical_quality=get_score("technical_quality"),
                    aesthetic_appeal=get_score("aesthetic_appeal"),
                    realism=get_score("realism"),
                    semantic_accuracy=get_score("semantic_accuracy") if prompt else None,
                    artifacts_detected=detected if isinstance(detected, list) else [],
                    artifacts_severity=severity if isinstance(severity, str) else "unknown",
                    overall=get_score("overall"),
                    reasoning=data.get("overall", {}).get("reasoning") if isinstance(data.get("overall"), dict) else None,
                )
            except (KeyError, TypeError, ValueError):
                # Malformed JSON structure: fall through to the neutral result.
                pass

        # Fallback: neutral mid-scale scores when the VLM reply is unusable.
        return VLMAssessmentResult(
            technical_quality=5.0,
            aesthetic_appeal=5.0,
            realism=5.0,
            semantic_accuracy=5.0 if prompt else None,
            artifacts_detected=[],
            artifacts_severity="unknown",
            overall=5.0,
        )
| 435 |
+
|
| 436 |
+
def evaluate_technical_metrics(self, image: Image.Image, prompt: Optional[str]) -> TechnicalMetricsResult:
|
| 437 |
+
"""Calculate technical quality metrics."""
|
| 438 |
+
sharpness = None
|
| 439 |
+
colorfulness_score = None
|
| 440 |
+
contrast_score = None
|
| 441 |
+
clip_score = None
|
| 442 |
+
|
| 443 |
+
try:
|
| 444 |
+
sharpness = calculate_sharpness(image)
|
| 445 |
+
except Exception:
|
| 446 |
+
pass
|
| 447 |
+
|
| 448 |
+
try:
|
| 449 |
+
colorfulness_score = calculate_colorfulness(image)
|
| 450 |
+
except Exception:
|
| 451 |
+
pass
|
| 452 |
+
|
| 453 |
+
try:
|
| 454 |
+
contrast_score = calculate_contrast(image)
|
| 455 |
+
except Exception:
|
| 456 |
+
pass
|
| 457 |
+
|
| 458 |
+
if prompt:
|
| 459 |
+
clip_score = calculate_clip_score(
|
| 460 |
+
image, prompt,
|
| 461 |
+
self.clip_model, self.clip_preprocess, self.clip_tokenizer,
|
| 462 |
+
self.device
|
| 463 |
+
)
|
| 464 |
+
|
| 465 |
+
return TechnicalMetricsResult(
|
| 466 |
+
clip_score=clip_score,
|
| 467 |
+
sharpness=sharpness,
|
| 468 |
+
colorfulness=colorfulness_score,
|
| 469 |
+
contrast=contrast_score,
|
| 470 |
+
)
|
| 471 |
+
|
    def _calculate_aggregated_score(
        self,
        soft_tifa: Optional[SoftTIFAResult],
        vlm: Optional[VLMAssessmentResult],
        technical: Optional[TechnicalMetricsResult],
        has_prompt: bool,
    ) -> AggregatedScore:
        """Calculate comprehensive aggregated score.

        Each category score is the mean of whichever sub-signals are available
        (VLM 1-10 scores are rescaled to [0, 1]); missing categories stay None
        and their weight is simply excluded from the weighted average.
        """
        # Prompt alignment: Soft-TIFA, VLM semantic accuracy, and CLIP score.
        prompt_alignment_scores = []
        if soft_tifa:
            prompt_alignment_scores.append(soft_tifa.prompt_score)
        if vlm and vlm.semantic_accuracy is not None:
            prompt_alignment_scores.append(vlm.semantic_accuracy / 10.0)
        if technical and technical.clip_score is not None:
            prompt_alignment_scores.append(technical.clip_score)

        prompt_alignment = sum(prompt_alignment_scores) / len(prompt_alignment_scores) if prompt_alignment_scores else None

        # Technical quality: sharpness + contrast + VLM technical score.
        # NOTE(review): assumes calculate_sharpness/calculate_contrast already
        # return values normalized to [0, 1] — confirm against metrics.py.
        tech_scores = []
        if technical:
            if technical.sharpness is not None:
                tech_scores.append(technical.sharpness)
            if technical.contrast is not None:
                tech_scores.append(technical.contrast)
        if vlm:
            tech_scores.append(vlm.technical_quality / 10.0)

        technical_quality = sum(tech_scores) / len(tech_scores) if tech_scores else None

        # Aesthetic appeal: colorfulness + VLM aesthetic score.
        aesthetic_scores = []
        if technical and technical.colorfulness is not None:
            aesthetic_scores.append(technical.colorfulness)
        if vlm:
            aesthetic_scores.append(vlm.aesthetic_appeal / 10.0)

        aesthetic_appeal = sum(aesthetic_scores) / len(aesthetic_scores) if aesthetic_scores else None

        # Realism comes from the VLM judge only.
        realism = vlm.realism / 10.0 if vlm else None

        # Artifacts: map reported severity to a penalty score; unrecognized
        # severities get the same value as "unknown".
        artifacts_score = None
        if vlm:
            severity_map = {"none": 1.0, "minor": 0.85, "moderate": 0.6, "major": 0.3, "unknown": 0.7}
            artifacts_score = severity_map.get(vlm.artifacts_severity, 0.7)

        # Calculate weighted overall over the available categories.
        score_map = {
            "prompt_alignment": prompt_alignment,
            "technical_quality": technical_quality,
            "aesthetic_appeal": aesthetic_appeal,
            "realism": realism,
            "artifacts": artifacts_score,
        }

        # Prompt alignment is only weighted when a prompt exists.
        category_weights = {
            "prompt_alignment": 0.30 if has_prompt else 0.0,
            "technical_quality": 0.25,
            "aesthetic_appeal": 0.20,
            "realism": 0.15,
            "artifacts": 0.10,
        }

        weighted_sum = 0.0
        total_weight = 0.0

        for key, score in score_map.items():
            if score is not None:
                weight = category_weights[key]
                weighted_sum += score * weight
                total_weight += weight

        overall = weighted_sum / total_weight if total_weight > 0 else 0.0

        # Confidence: fraction of possible categories that produced a score
        # (semantic alignment is only possible when a prompt was given).
        max_metrics = 5 if has_prompt else 4
        available_metrics = sum(1 for s in score_map.values() if s is not None)
        confidence = available_metrics / max_metrics

        # Recommendation
        recommendation = self._generate_recommendation(score_map, overall)

        # Normalized weights over the categories that actually contributed.
        # NOTE(review): relies on total_weight > 0 whenever any score is not
        # None; holds because prompt-dependent scores are None without a prompt.
        normalized_weights = {k: v / total_weight for k, v in category_weights.items() if score_map.get(k) is not None}

        return AggregatedScore(
            overall=round(overall, 3),
            grade=score_to_grade(overall),
            passed=overall >= 0.7,
            confidence=round(confidence, 2),
            breakdown=ScoreBreakdown(
                prompt_alignment=round(prompt_alignment, 3) if prompt_alignment is not None else None,
                technical_quality=round(technical_quality, 3) if technical_quality is not None else None,
                aesthetic_appeal=round(aesthetic_appeal, 3) if aesthetic_appeal is not None else None,
                realism=round(realism, 3) if realism is not None else None,
                artifacts=round(artifacts_score, 3) if artifacts_score is not None else None,
            ),
            weights_used=normalized_weights,
            recommendation=recommendation,
        )
| 575 |
+
|
| 576 |
+
def _generate_recommendation(self, scores: Dict, overall: float) -> str:
|
| 577 |
+
"""Generate recommendation based on scores."""
|
| 578 |
+
weakest = None
|
| 579 |
+
weakest_score = 1.0
|
| 580 |
+
|
| 581 |
+
for key, score in scores.items():
|
| 582 |
+
if score is not None and score < weakest_score:
|
| 583 |
+
weakest_score = score
|
| 584 |
+
weakest = key
|
| 585 |
+
|
| 586 |
+
if overall >= 0.85:
|
| 587 |
+
return "Excellent quality image. Ready for production use."
|
| 588 |
+
elif overall >= 0.70:
|
| 589 |
+
if weakest and weakest_score < 0.7:
|
| 590 |
+
suggestions = {
|
| 591 |
+
"prompt_alignment": "Consider regenerating with clearer prompt.",
|
| 592 |
+
"technical_quality": "Image has quality issues. Try higher resolution.",
|
| 593 |
+
"aesthetic_appeal": "Composition could be improved.",
|
| 594 |
+
"realism": "Physical inconsistencies detected.",
|
| 595 |
+
"artifacts": "AI artifacts present. Consider regeneration.",
|
| 596 |
+
}
|
| 597 |
+
return f"Good overall. Improvement: {suggestions.get(weakest, weakest)}"
|
| 598 |
+
return "Good quality image. Minor improvements possible."
|
| 599 |
+
elif overall >= 0.50:
|
| 600 |
+
return f"Moderate quality. Main issue: {weakest.replace('_', ' ') if weakest else 'overall'}."
|
| 601 |
+
else:
|
| 602 |
+
return "Low quality. Regeneration strongly recommended."
|
| 603 |
+
|
| 604 |
+
def evaluate(
|
| 605 |
+
self,
|
| 606 |
+
image: Image.Image,
|
| 607 |
+
prompt: Optional[str] = None,
|
| 608 |
+
include_soft_tifa: bool = True,
|
| 609 |
+
include_vlm: bool = True,
|
| 610 |
+
include_technical: bool = True,
|
| 611 |
+
) -> ImageEvalResult:
|
| 612 |
+
"""
|
| 613 |
+
Evaluate an AI-generated image.
|
| 614 |
+
|
| 615 |
+
Args:
|
| 616 |
+
image: PIL Image to evaluate
|
| 617 |
+
prompt: Optional text prompt used to generate the image
|
| 618 |
+
include_soft_tifa: Run Soft-TIFA evaluation (requires prompt)
|
| 619 |
+
include_vlm: Run VLM-as-Judge assessment
|
| 620 |
+
include_technical: Calculate technical metrics
|
| 621 |
+
|
| 622 |
+
Returns:
|
| 623 |
+
ImageEvalResult with all evaluation components
|
| 624 |
+
"""
|
| 625 |
+
start_time = time.time()
|
| 626 |
+
|
| 627 |
+
soft_tifa_result = None
|
| 628 |
+
vlm_result = None
|
| 629 |
+
technical_result = None
|
| 630 |
+
|
| 631 |
+
if include_soft_tifa and prompt:
|
| 632 |
+
soft_tifa_result = self.evaluate_soft_tifa(image, prompt)
|
| 633 |
+
|
| 634 |
+
if include_vlm:
|
| 635 |
+
vlm_result = self.evaluate_vlm_judge(image, prompt)
|
| 636 |
+
|
| 637 |
+
if include_technical:
|
| 638 |
+
technical_result = self.evaluate_technical_metrics(image, prompt)
|
| 639 |
+
|
| 640 |
+
aggregated = self._calculate_aggregated_score(
|
| 641 |
+
soft_tifa=soft_tifa_result,
|
| 642 |
+
vlm=vlm_result,
|
| 643 |
+
technical=technical_result,
|
| 644 |
+
has_prompt=prompt is not None,
|
| 645 |
+
)
|
| 646 |
+
|
| 647 |
+
return ImageEvalResult(
|
| 648 |
+
score=aggregated,
|
| 649 |
+
soft_tifa=soft_tifa_result,
|
| 650 |
+
vlm_assessment=vlm_result,
|
| 651 |
+
technical_metrics=technical_result,
|
| 652 |
+
evaluation_time=time.time() - start_time,
|
| 653 |
+
)
|
| 654 |
+
|
| 655 |
+
|
class EditEvaluator:
    """
    Image Editing Evaluator

    Evaluates instruction-based image editing using:
    - Instruction Following: Were the requested edits applied?
    - Preservation: Were non-edited regions maintained?
    - Edit Quality: Is the edit seamless and high-quality?
    """

    def __init__(self, device: str = "cuda"):
        """Initialize evaluator with models.

        Args:
            device: Preferred device; silently falls back to "cpu" when CUDA
                is unavailable. The VLM itself is placed by
                ``device_map="auto"``; ``self.device`` governs LPIPS only.
        """
        # Imports are local so merely importing this module stays cheap.
        import torch
        from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
        import lpips

        self.device = device if torch.cuda.is_available() else "cpu"

        # Load Qwen2.5-VL
        model_name = "Qwen/Qwen2.5-VL-7B-Instruct"

        self.vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16,  # fp16 halves memory; requires fp16-capable hardware
        )
        self.vlm_processor = AutoProcessor.from_pretrained(model_name)

        # Load LPIPS (AlexNet backbone) for perceptual preservation scoring
        self.lpips_model = lpips.LPIPS(net='alex').to(self.device)
| 686 |
+
|
| 687 |
+
def _vlm_generate(self, image: Image.Image, prompt: str) -> str:
|
| 688 |
+
"""Generate response from VLM with image."""
|
| 689 |
+
import torch
|
| 690 |
+
|
| 691 |
+
messages = [
|
| 692 |
+
{
|
| 693 |
+
"role": "user",
|
| 694 |
+
"content": [
|
| 695 |
+
{"type": "image", "image": image},
|
| 696 |
+
{"type": "text", "text": prompt},
|
| 697 |
+
],
|
| 698 |
+
}
|
| 699 |
+
]
|
| 700 |
+
|
| 701 |
+
text = self.vlm_processor.apply_chat_template(
|
| 702 |
+
messages, tokenize=False, add_generation_prompt=True
|
| 703 |
+
)
|
| 704 |
+
|
| 705 |
+
inputs = self.vlm_processor(
|
| 706 |
+
text=[text],
|
| 707 |
+
images=[image],
|
| 708 |
+
return_tensors="pt",
|
| 709 |
+
).to(self.vlm_model.device)
|
| 710 |
+
|
| 711 |
+
with torch.no_grad():
|
| 712 |
+
outputs = self.vlm_model.generate(
|
| 713 |
+
**inputs,
|
| 714 |
+
max_new_tokens=1024,
|
| 715 |
+
do_sample=False,
|
| 716 |
+
)
|
| 717 |
+
|
| 718 |
+
generated = outputs[0][inputs.input_ids.shape[1]:]
|
| 719 |
+
return self.vlm_processor.decode(generated, skip_special_tokens=True)
|
| 720 |
+
|
| 721 |
+
def _vlm_text_generate(self, prompt: str) -> str:
|
| 722 |
+
"""Generate response from VLM (text only)."""
|
| 723 |
+
import torch
|
| 724 |
+
|
| 725 |
+
messages = [{"role": "user", "content": prompt}]
|
| 726 |
+
|
| 727 |
+
text = self.vlm_processor.apply_chat_template(
|
| 728 |
+
messages, tokenize=False, add_generation_prompt=True
|
| 729 |
+
)
|
| 730 |
+
|
| 731 |
+
inputs = self.vlm_processor(
|
| 732 |
+
text=[text],
|
| 733 |
+
return_tensors="pt",
|
| 734 |
+
).to(self.vlm_model.device)
|
| 735 |
+
|
| 736 |
+
with torch.no_grad():
|
| 737 |
+
outputs = self.vlm_model.generate(
|
| 738 |
+
**inputs,
|
| 739 |
+
max_new_tokens=1024,
|
| 740 |
+
do_sample=False,
|
| 741 |
+
)
|
| 742 |
+
|
| 743 |
+
generated = outputs[0][inputs.input_ids.shape[1]:]
|
| 744 |
+
return self.vlm_processor.decode(generated, skip_special_tokens=True)
|
| 745 |
+
|
| 746 |
+
def evaluate_instruction_following(self, edited_image: Image.Image, instruction: str) -> InstructionFollowingResult:
    """Evaluate if editing instruction was followed.

    Decomposes the instruction into atomic edits via the VLM, then verifies
    each edit against the edited image. Falls back to a single holistic
    check when decomposition fails.

    Args:
        edited_image: The image produced by the editing pipeline.
        instruction: The natural-language edit instruction to verify.

    Returns:
        InstructionFollowingResult with per-primitive scores and an
        overall score normalized to 0-1.
    """

    def parse_score(response: str) -> float:
        """Extract 'Score: X/10' from a VLM response; neutral 5.0 if absent."""
        match = re.search(r'[Ss]core[:\s]*(\d+(?:\.\d+)?)\s*/\s*10', response)
        return float(match.group(1)) if match else 5.0

    decomp_prompt = f'''Analyze this image editing instruction and decompose into atomic edits.

Instruction: "{instruction}"

Example for "Change the sky to sunset and add a bird":
{{
  "edits": [
    {{"content": "change sky color to sunset", "type": "modify", "target": "sky", "expected_result": "orange/purple sunset sky"}},
    {{"content": "add a bird", "type": "add", "target": "sky area", "expected_result": "visible bird in the scene"}}
  ]
}}

Return ONLY valid JSON for the given instruction:'''

    decomp_response = self._vlm_text_generate(decomp_prompt)
    data = parse_json_robust(decomp_response, fallback={})
    edits = data.get("edits", []) if isinstance(data, dict) else []

    if not edits or not isinstance(edits, list):
        # Fallback: decomposition failed, so evaluate the edit holistically.
        verify_prompt = f'''Evaluate if this image correctly shows the result of the edit:

Edit instruction: "{instruction}"

Rate success from 0-10.
Format: Score: X/10 - Reasoning'''

        response = self._vlm_generate(edited_image, verify_prompt)
        score = parse_score(response)

        return InstructionFollowingResult(
            edit_primitives=[{"content": instruction, "type": "unknown"}],
            primitive_scores=[{"edit": instruction, "score": score}],
            overall_score=score / 10.0,
            reasoning=response[:200] if response else None,
        )

    # Verify each atomic edit (cap at 10 to bound the number of VLM calls).
    primitive_scores = []
    for edit in edits[:10]:
        # Guard against the VLM returning bare strings instead of dicts.
        if not isinstance(edit, dict):
            edit = {"content": str(edit)}
        content = edit.get("content", "")
        target = edit.get("target", "the image")
        expected = edit.get("expected_result", content)

        verify_prompt = f'''Verify if this edit was applied:

Edit: {content}
Target: {target}
Expected: {expected}

Rate from 0-10.
Format: Score: X/10 - Reasoning'''

        response = self._vlm_generate(edited_image, verify_prompt)
        score = parse_score(response)

        primitive_scores.append({
            "edit": content,
            "score": score,
            "reasoning": response[:100] if response else None,
        })

    overall = sum(p["score"] for p in primitive_scores) / len(primitive_scores) if primitive_scores else 0

    return InstructionFollowingResult(
        edit_primitives=edits[:10],
        primitive_scores=primitive_scores,
        overall_score=overall / 10.0,
    )
|
| 819 |
+
|
| 820 |
+
def evaluate_preservation(self, source_image: Image.Image, edited_image: Image.Image) -> PreservationResult:
    """Evaluate if non-edited regions were preserved.

    Combines LPIPS (perceptual), SSIM (structural) and PSNR (pixel-level)
    similarity between source and edited image into one 0-1 score.

    Args:
        source_image: Image before editing.
        edited_image: Image after editing.

    Returns:
        PreservationResult with the individual metrics and a combined score.
    """
    # LPIPS is a distance (lower = more similar); convert to a similarity.
    lpips_score = calculate_lpips(source_image, edited_image, self.lpips_model, self.device)
    lpips_similarity = max(0, 1 - lpips_score) if lpips_score is not None else None

    # SSIM (best-effort; metric failures degrade to "unavailable").
    ssim_score = None
    try:
        ssim_score = calculate_ssim(source_image, edited_image)
    except Exception:
        pass

    # PSNR (best-effort).
    psnr_score = None
    try:
        psnr_score = calculate_psnr(source_image, edited_image)
    except Exception:
        pass

    pixel_scores = [s for s in (ssim_score, psnr_score) if s is not None]

    # Weight LPIPS at 50% when available, averaging the pixel metrics for the
    # other half. The previous implementation filtered "other" scores by value
    # (s != lpips_similarity), which silently dropped any pixel metric that
    # happened to equal the LPIPS similarity; tracking the lists separately
    # avoids that.
    if lpips_similarity is not None and pixel_scores:
        preservation_score = lpips_similarity * 0.5 + (sum(pixel_scores) / len(pixel_scores)) * 0.5
    elif lpips_similarity is not None:
        preservation_score = lpips_similarity
    elif pixel_scores:
        preservation_score = sum(pixel_scores) / len(pixel_scores)
    else:
        # No metric succeeded: neutral mid-scale fallback.
        preservation_score = 0.5

    return PreservationResult(
        lpips_score=lpips_score,
        ssim_score=ssim_score,
        psnr_score=psnr_score,
        overall_score=preservation_score,
    )
|
| 861 |
+
|
| 862 |
+
def evaluate_edit_quality(self, edited_image: Image.Image, instruction: str) -> EditQualityResult:
    """Evaluate the quality of the edit."""
    eval_prompt = f'''Evaluate the quality of this edited image.

Edit instruction: "{instruction}"

Rate each dimension 1-10:
- **Technical**: Seamless blending? Resolution consistent? No visible edit boundaries?
- **Aesthetic**: Natural looking? Color harmony maintained? Visually pleasing?
- **Coherence**: Physically plausible? Lighting/shadows consistent? Proper perspective?
- **Artifacts**: List any issues (blur, color bleeding, unnatural edges, etc.)

Example output:
{{
"technical": {{"score": 8}},
"aesthetic": {{"score": 7}},
"coherence": {{"score": 8}},
"artifacts": {{"detected": ["slight blur at edge"], "severity": "minor"}}
}}

Return ONLY valid JSON:'''

    response = self._vlm_generate(edited_image, eval_prompt)
    parsed = parse_json_robust(response, fallback=None)

    if parsed and isinstance(parsed, dict):
        try:
            def read_score(field: str, default: float = 5.0) -> float:
                # Accept both {"score": N} objects and bare numbers.
                raw = parsed.get(field, {})
                if isinstance(raw, dict):
                    return float(raw.get("score", default))
                return float(raw) if raw else default

            technical = read_score("technical")
            aesthetic = read_score("aesthetic")
            coherence = read_score("coherence")

            artifacts_info = parsed.get("artifacts", {})
            if isinstance(artifacts_info, dict):
                found = artifacts_info.get("detected", [])
                severity = artifacts_info.get("severity", "unknown")
            else:
                found = []
                severity = "unknown"

            # Average the three dimensions onto 0-1, then penalize by
            # reported artifact severity (unknown severities get 0.9).
            combined = (technical + aesthetic + coherence) / 30.0
            penalty_by_severity = {"major": 0.7, "moderate": 0.85, "minor": 0.95, "none": 1.0}
            combined *= penalty_by_severity.get(severity, 0.9)

            return EditQualityResult(
                technical_score=technical,
                aesthetic_score=aesthetic,
                coherence_score=coherence,
                artifacts=found if isinstance(found, list) else [],
                artifact_severity=severity if isinstance(severity, str) else "unknown",
                overall_score=combined,
            )
        except (KeyError, TypeError, ValueError):
            pass

    # Unparseable VLM output: return a neutral mid-scale result.
    return EditQualityResult(
        technical_score=5.0,
        aesthetic_score=5.0,
        coherence_score=5.0,
        artifacts=[],
        artifact_severity="unknown",
        overall_score=0.5,
    )
|
| 930 |
+
|
| 931 |
+
def _calculate_edit_aggregated_score(
    self,
    instruction_result: InstructionFollowingResult,
    preservation_result: PreservationResult,
    quality_result: EditQualityResult,
) -> EditAggregatedScore:
    """Calculate comprehensive aggregated score for editing."""
    weights = {
        "instruction_following": 0.35,
        "preservation": 0.25,
        "edit_quality": 0.25,
        "artifacts": 0.15,
    }

    # Map the reported artifact severity onto a 0-1 sub-score.
    severity_to_score = {"none": 1.0, "minor": 0.85, "moderate": 0.6, "major": 0.3, "unknown": 0.7}

    components = {
        "instruction_following": instruction_result.overall_score,
        "preservation": preservation_result.overall_score,
        "edit_quality": quality_result.overall_score,
        "artifacts": severity_to_score.get(quality_result.artifact_severity, 0.7),
    }

    # Weighted sum over the four components.
    overall = sum(components[name] * weight for name, weight in weights.items())

    # Confidence grows with the number of verified edit primitives, capped at 1.
    confidence = min(1.0, 0.5 + 0.1 * len(instruction_result.primitive_scores))

    recommendation = self._generate_edit_recommendation(
        components["instruction_following"],
        components["preservation"],
        components["edit_quality"],
        components["artifacts"],
        overall,
    )

    return EditAggregatedScore(
        overall=round(overall, 3),
        grade=score_to_grade(overall),
        passed=overall >= 0.7,
        confidence=round(confidence, 2),
        breakdown=EditScoreBreakdown(
            instruction_following=round(components["instruction_following"], 3),
            preservation=round(components["preservation"], 3),
            edit_quality=round(components["edit_quality"], 3),
            artifacts=round(components["artifacts"], 3),
        ),
        weights_used=weights,
        recommendation=recommendation,
    )
|
| 980 |
+
|
| 981 |
+
def _generate_edit_recommendation(
|
| 982 |
+
self,
|
| 983 |
+
instruction: float,
|
| 984 |
+
preservation: float,
|
| 985 |
+
quality: float,
|
| 986 |
+
artifacts: float,
|
| 987 |
+
overall: float,
|
| 988 |
+
) -> str:
|
| 989 |
+
"""Generate recommendation for edit quality."""
|
| 990 |
+
issues = []
|
| 991 |
+
|
| 992 |
+
if instruction < 0.6:
|
| 993 |
+
issues.append("instruction not fully followed")
|
| 994 |
+
if preservation < 0.6:
|
| 995 |
+
issues.append("too much content changed")
|
| 996 |
+
if quality < 0.6:
|
| 997 |
+
issues.append("edit quality issues")
|
| 998 |
+
if artifacts < 0.7:
|
| 999 |
+
issues.append("visible artifacts")
|
| 1000 |
+
|
| 1001 |
+
if overall >= 0.85:
|
| 1002 |
+
return "Excellent edit. Ready for use."
|
| 1003 |
+
elif overall >= 0.70:
|
| 1004 |
+
if issues:
|
| 1005 |
+
return f"Good edit with minor issues: {', '.join(issues[:2])}."
|
| 1006 |
+
return "Good quality edit. Minor improvements possible."
|
| 1007 |
+
elif overall >= 0.50:
|
| 1008 |
+
if issues:
|
| 1009 |
+
return f"Moderate quality. Issues: {', '.join(issues)}."
|
| 1010 |
+
return "Moderate quality. Consider regenerating."
|
| 1011 |
+
else:
|
| 1012 |
+
return f"Low quality. Issues: {', '.join(issues) if issues else 'multiple problems'}."
|
| 1013 |
+
|
| 1014 |
+
def evaluate(
    self,
    source_image: Image.Image,
    edited_image: Image.Image,
    instruction: str,
) -> EditEvalResult:
    """
    Evaluate an image editing result.

    Args:
        source_image: Original image before editing
        edited_image: Image after editing
        instruction: The editing instruction that was applied

    Returns:
        EditEvalResult with all evaluation components
    """
    started = time.time()

    # Run the three independent evaluation passes.
    following = self.evaluate_instruction_following(edited_image, instruction)
    preserved = self.evaluate_preservation(source_image, edited_image)
    quality = self.evaluate_edit_quality(edited_image, instruction)

    # Combine into the weighted aggregate and report wall-clock time.
    aggregated = self._calculate_edit_aggregated_score(
        instruction_result=following,
        preservation_result=preserved,
        quality_result=quality,
    )

    return EditEvalResult(
        score=aggregated,
        instruction_following=following,
        preservation=preserved,
        edit_quality=quality,
        evaluation_time=time.time() - started,
    )
|
metrics.py
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Image Quality Metrics
|
| 3 |
+
|
| 4 |
+
Helper functions for calculating various image quality metrics:
|
| 5 |
+
- Technical metrics: sharpness, colorfulness, contrast
|
| 6 |
+
- Preservation metrics: SSIM, PSNR
|
| 7 |
+
- JSON parsing utilities for LLM outputs
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import re
|
| 11 |
+
import json
|
| 12 |
+
import math
|
| 13 |
+
import numpy as np
|
| 14 |
+
from typing import Any, Optional
|
| 15 |
+
from PIL import Image
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def parse_json_robust(text: str, fallback: Any = None) -> Any:
    """
    Robustly parse JSON from LLM output.

    Handles common issues like markdown code fences, surrounding prose,
    single-quoted keys, and trailing commas.

    Args:
        text: Raw model output that should contain JSON somewhere.
        fallback: Value returned when no JSON can be recovered.

    Returns:
        The parsed object/array, or ``fallback`` on failure.
    """
    if not text:
        return fallback

    # Try direct parse first.
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Remove markdown code fences and retry.
    text = re.sub(r'^```(?:json)?\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\s*```$', '', text, flags=re.MULTILINE)
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Extract the first balanced {...} or [...] span. The previous regex
    # approach matched arrays non-greedily and before objects, so an object
    # containing an array (embedded in prose) returned only the inner array,
    # and nested arrays were truncated mid-span.
    candidate = _first_balanced_json_span(text)
    if candidate is not None:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            # Try fixing common LLM mistakes: single-quoted keys and
            # trailing commas before a closing bracket.
            fixed = re.sub(r"'([^']*)':", r'"\1":', candidate)
            fixed = re.sub(r',\s*([}\]])', r'\1', fixed)
            try:
                return json.loads(fixed)
            except json.JSONDecodeError:
                pass

    return fallback


def _first_balanced_json_span(text: str) -> Optional[str]:
    """Return the first balanced ``{...}`` or ``[...]`` span in *text*, or None.

    Tracks double-quoted strings (with backslash escapes) so brackets inside
    string values do not affect the depth count.
    """
    closers = {'{': '}', '[': ']'}
    start = next((i for i, ch in enumerate(text) if ch in closers), None)
    if start is None:
        return None

    open_ch = text[start]
    close_ch = closers[open_ch]
    depth = 0
    in_string = False
    escaped = False
    for pos in range(start, len(text)):
        ch = text[pos]
        if in_string:
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == open_ch:
            depth += 1
        elif ch == close_ch:
            depth -= 1
            if depth == 0:
                return text[start:pos + 1]
    # Unbalanced input: no complete span found.
    return None
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def calculate_sharpness(image: Image.Image) -> float:
    """
    Calculate image sharpness as the variance of the Laplacian response.
    Higher variance means more high-frequency detail (a sharper image).
    Returns a score normalized to 0-1.
    """
    import torch
    import torchvision.transforms as T
    import torch.nn.functional as F

    # Standardize: 512x512 grayscale tensor in [0, 1].
    to_gray = T.Compose([
        T.Resize((512, 512)),
        T.Grayscale(),
        T.ToTensor(),
    ])
    gray = to_gray(image)

    # 4-neighbor Laplacian kernel, shaped (out=1, in=1, 3, 3) for conv2d.
    kernel = torch.tensor(
        [[0.0, 1.0, 0.0],
         [1.0, -4.0, 1.0],
         [0.0, 1.0, 0.0]]
    ).reshape(1, 1, 3, 3)

    response = F.conv2d(gray.unsqueeze(0), kernel, padding=1)

    # Variance of the Laplacian; typical range ~0-0.1, with 0.05 mapping
    # to a full score.
    return min(1.0, response.var().item() / 0.05)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def calculate_colorfulness(image: Image.Image) -> float:
    """
    Colorfulness metric of Hasler and Susstrunk, normalized to 0-1.
    Grayscale inputs score 0.0.
    """
    arr = np.array(image)

    # Anything without three channels is treated as grayscale.
    if arr.ndim < 3 or arr.shape[2] < 3:
        return 0.0

    red = arr[:, :, 0].astype(float)
    green = arr[:, :, 1].astype(float)
    blue = arr[:, :, 2].astype(float)

    # Opponent color channels: red-green and yellow-blue.
    rg = np.abs(red - green)
    yb = np.abs(0.5 * (red + green) - blue)

    std_component = np.sqrt(np.std(rg) ** 2 + np.std(yb) ** 2)
    mean_component = np.sqrt(np.mean(rg) ** 2 + np.mean(yb) ** 2)

    raw = std_component + 0.3 * mean_component

    # Empirical range is roughly 0-100; vivid images land around 40-60.
    return min(1.0, raw / 100.0)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def calculate_contrast(image: Image.Image) -> float:
    """
    Contrast as the standard deviation of grayscale luminance,
    normalized by 128 (half the 8-bit range) and capped at 1.
    """
    luminance = np.array(image.convert('L'))
    normalized = np.std(luminance) / 128.0
    return min(1.0, normalized)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def calculate_ssim(image1: Image.Image, image2: Image.Image, window_size: int = 11) -> float:
    """
    Global (single-window) SSIM between two images, in 0-1 (1 = identical).

    NOTE: ``window_size`` is accepted for API compatibility but unused here —
    statistics are computed over the whole downscaled image rather than a
    sliding window.
    """
    # Compare in grayscale at a fixed 256x256 resolution.
    gray1 = np.array(image1.convert('L').resize((256, 256)), dtype=np.float64)
    gray2 = np.array(image2.convert('L').resize((256, 256)), dtype=np.float64)

    # Stabilizing constants for an 8-bit dynamic range (L = 255).
    c1 = (0.01 * 255) ** 2
    c2 = (0.03 * 255) ** 2

    mean1, mean2 = np.mean(gray1), np.mean(gray2)
    var1, var2 = np.var(gray1), np.var(gray2)
    covariance = np.mean((gray1 - mean1) * (gray2 - mean2))

    # Standard SSIM formula over global statistics.
    ssim_value = ((2 * mean1 * mean2 + c1) * (2 * covariance + c2)) / (
        (mean1 ** 2 + mean2 ** 2 + c1) * (var1 + var2 + c2)
    )
    return float(max(0, min(1, ssim_value)))
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def calculate_psnr(image1: Image.Image, image2: Image.Image) -> float:
    """
    Peak Signal-to-Noise Ratio between two images, mapped from the
    20-50 dB range onto 0-1. Identical images return 1.0.
    """
    # Compare at a fixed 256x256 resolution.
    a = np.array(image1.resize((256, 256)), dtype=np.float64)
    b = np.array(image2.resize((256, 256)), dtype=np.float64)

    mse = np.mean((a - b) ** 2)
    if mse == 0:
        # Zero error means infinite PSNR -> perfect score.
        return 1.0

    psnr_db = 10 * np.log10((255 ** 2) / mse)

    # Linear map: 20 dB -> 0.0, 50 dB -> 1.0, clamped.
    return float(max(0, min(1, (psnr_db - 20) / 30)))
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def calculate_clip_score(
    image: Image.Image,
    text: str,
    clip_model,
    clip_preprocess,
    clip_tokenizer,
    device: str = "cuda"
) -> Optional[float]:
    """
    Cosine similarity between CLIP image and text embeddings, higher = better
    text-image alignment. Returns None if any step of the pipeline fails.
    """
    import torch

    try:
        pixel_batch = clip_preprocess(image).unsqueeze(0).to(device)
        token_batch = clip_tokenizer([text]).to(device)

        with torch.no_grad():
            img_emb = clip_model.encode_image(pixel_batch)
            txt_emb = clip_model.encode_text(token_batch)

        # L2-normalize so the dot product is a cosine similarity.
        img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
        txt_emb = txt_emb / txt_emb.norm(dim=-1, keepdim=True)

        return float((img_emb @ txt_emb.T).item())
    except Exception:
        # Best-effort metric: model/device failures degrade to "unavailable".
        return None
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def calculate_lpips(
    image1: Image.Image,
    image2: Image.Image,
    lpips_model,
    device: str = "cuda"
) -> Optional[float]:
    """
    LPIPS perceptual distance between two images (lower = more similar).
    Returns None when the computation fails.
    """
    import torch
    import torchvision.transforms as T

    try:
        # Standardize: 512x512 tensors in [0, 1].
        to_unit = T.Compose([
            T.Resize((512, 512)),
            T.ToTensor(),
        ])

        # LPIPS expects inputs scaled to [-1, 1].
        x1 = to_unit(image1).unsqueeze(0).to(device) * 2 - 1
        x2 = to_unit(image2).unsqueeze(0).to(device) * 2 - 1

        with torch.no_grad():
            distance = float(lpips_model(x1, x2).item())

        return distance
    except Exception:
        # Best-effort metric: failures degrade to "unavailable".
        return None
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def score_to_grade(score: float) -> str:
    """Convert numeric score (0-1) to letter grade ("A+" down to "F")."""
    # Descending cutoffs; first threshold the score meets wins.
    bands = (
        (0.95, "A+"), (0.90, "A"), (0.85, "A-"),
        (0.80, "B+"), (0.75, "B"), (0.70, "B-"),
        (0.65, "C+"), (0.60, "C"), (0.55, "C-"),
        (0.50, "D"),
    )
    for cutoff, grade in bands:
        if score >= cutoff:
            return grade
    return "F"
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def geometric_mean(scores: list[float]) -> float:
    """Geometric mean of scores; zeros are clamped to 0.01 so log stays defined."""
    if not scores:
        return 0.0

    # Sum of logs of the clamped values, then exponentiate the average.
    total_log = 0.0
    for s in scores:
        total_log += math.log(max(s, 0.01))
    return math.exp(total_log / len(scores))
|
requirements.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core dependencies for HuggingFace Spaces
|
| 2 |
+
gradio>=4.0.0
|
| 3 |
+
|
| 4 |
+
# PyTorch (HF Spaces provides CUDA)
|
| 5 |
+
torch>=2.1.0
|
| 6 |
+
torchvision>=0.16.0
|
| 7 |
+
|
| 8 |
+
# Transformers for Qwen2.5-VL
|
| 9 |
+
transformers>=4.45.0
|
| 10 |
+
accelerate>=0.25.0
|
| 11 |
+
qwen-vl-utils>=0.0.8
|
| 12 |
+
|
| 13 |
+
# Image processing
|
| 14 |
+
Pillow>=10.0.0
|
| 15 |
+
numpy>=1.24.0
|
| 16 |
+
|
| 17 |
+
# Metrics
|
| 18 |
+
lpips>=0.1.4
|
| 19 |
+
open-clip-torch>=2.24.0
|
| 20 |
+
|
| 21 |
+
# Utilities
|
| 22 |
+
httpx>=0.25.0
|