"""
Image Evaluator - Gradio App for HuggingFace Spaces
AI image quality assessment using:
- Soft-TIFA for prompt alignment
- VLM-as-Judge for holistic assessment
- Multi-image comparison
- Technical metrics (sharpness, colorfulness, contrast, CLIP)
Powered by Qwen2.5-VL-7B
"""
import gradio as gr
from PIL import Image
from typing import Optional, List
import time
import spaces
# Module-level cache for the lazily constructed evaluator.
evaluator = None

def get_evaluator():
    """Return the shared ImageEvaluator, building it on first call.

    The import is deferred so the heavy model stack is only loaded when
    an evaluation is actually requested.
    """
    global evaluator
    if evaluator is not None:
        return evaluator
    from evaluator import ImageEvaluator
    evaluator = ImageEvaluator()
    return evaluator
# Dark theme CSS
# Custom stylesheet injected via gr.Blocks(css=DARK_CSS): zinc/near-black
# palette with a blue accent, plus helper classes used by the result
# templates (result cards, score/grade/rank badges, progress bars, and the
# pulsing skeleton loaders shown before the first evaluation).
DARK_CSS = """
/* Base dark theme */
.gradio-container {
background-color: #09090b !important;
color: #fafafa !important;
}
/* Main blocks */
.dark {
--background-fill-primary: #09090b !important;
--background-fill-secondary: #18181b !important;
--border-color-primary: #27272a !important;
--text-color: #fafafa !important;
--text-color-subdued: #a1a1aa !important;
}
/* Cards and panels */
.panel, .form, .block {
background-color: #18181b !important;
border: 1px solid #27272a !important;
border-radius: 12px !important;
}
/* Input fields */
input, textarea, select {
background-color: #18181b !important;
border: 1px solid #27272a !important;
color: #fafafa !important;
border-radius: 8px !important;
}
input:focus, textarea:focus {
border-color: #3b82f6 !important;
outline: none !important;
}
/* Buttons */
.primary {
background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%) !important;
border: none !important;
color: white !important;
}
.secondary {
background-color: #27272a !important;
border: 1px solid #3f3f46 !important;
color: #fafafa !important;
}
/* Tabs */
.tab-nav {
background-color: #09090b !important;
border-bottom: 1px solid #27272a !important;
}
.tab-nav button {
color: #a1a1aa !important;
background: transparent !important;
}
.tab-nav button.selected {
color: #3b82f6 !important;
border-bottom: 2px solid #3b82f6 !important;
}
/* Image upload */
.image-container {
background-color: #18181b !important;
border: 2px dashed #27272a !important;
border-radius: 12px !important;
}
/* Labels */
label, .label-wrap {
color: #a1a1aa !important;
font-weight: 500 !important;
}
/* Checkboxes */
.checkbox-group {
background-color: #18181b !important;
padding: 12px !important;
border-radius: 8px !important;
}
/* Scrollbar */
::-webkit-scrollbar {
width: 8px;
height: 8px;
}
::-webkit-scrollbar-track {
background: #18181b;
}
::-webkit-scrollbar-thumb {
background: #3f3f46;
border-radius: 4px;
}
::-webkit-scrollbar-thumb:hover {
background: #52525b;
}
/* Result cards */
.result-card {
background: #18181b;
border: 1px solid #27272a;
border-radius: 12px;
padding: 20px;
margin: 8px 0;
}
/* Score display */
.score-large {
font-size: 3em;
font-weight: 700;
background: linear-gradient(135deg, #3b82f6 0%, #8b5cf6 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}
/* Winner badge */
.winner-badge {
background: linear-gradient(135deg, #f59e0b 0%, #d97706 100%);
color: #000;
padding: 4px 12px;
border-radius: 20px;
font-weight: 600;
font-size: 0.85em;
}
/* Rank badge */
.rank-badge {
display: inline-flex;
align-items: center;
justify-content: center;
width: 28px;
height: 28px;
border-radius: 50%;
font-weight: 700;
font-size: 0.9em;
}
.rank-1 { background: linear-gradient(135deg, #fbbf24 0%, #f59e0b 100%); color: #000; }
.rank-2 { background: linear-gradient(135deg, #94a3b8 0%, #64748b 100%); color: #fff; }
.rank-3 { background: linear-gradient(135deg, #c2855a 0%, #a16207 100%); color: #fff; }
.rank-4 { background: #3f3f46; color: #a1a1aa; }
/* Progress bars */
.progress-bar {
background: #27272a;
border-radius: 4px;
height: 6px;
overflow: hidden;
}
.progress-fill {
height: 100%;
border-radius: 4px;
transition: width 0.3s ease;
}
.progress-green { background: linear-gradient(90deg, #22c55e 0%, #16a34a 100%); }
.progress-yellow { background: linear-gradient(90deg, #eab308 0%, #ca8a04 100%); }
.progress-red { background: linear-gradient(90deg, #ef4444 0%, #dc2626 100%); }
.progress-blue { background: linear-gradient(90deg, #3b82f6 0%, #2563eb 100%); }
/* Skeleton loading animation */
@keyframes skeleton-pulse {
0%, 100% { opacity: 0.4; }
50% { opacity: 0.7; }
}
.skeleton {
background: linear-gradient(90deg, #27272a 0%, #3f3f46 50%, #27272a 100%);
background-size: 200% 100%;
animation: skeleton-pulse 1.5s ease-in-out infinite;
border-radius: 6px;
}
.skeleton-text {
height: 14px;
margin: 8px 0;
}
.skeleton-title {
height: 20px;
width: 60%;
margin-bottom: 16px;
}
.skeleton-score {
height: 48px;
width: 120px;
margin: 12px auto;
}
.skeleton-bar {
height: 6px;
width: 100%;
margin: 8px 0;
}
.skeleton-card {
background: #18181b;
border: 1px solid #27272a;
border-radius: 12px;
padding: 20px;
}
"""
# Skeleton placeholders for loading states
# These pre-fill the gr.HTML output panels before the first evaluation runs.
# NOTE(review): every constant below is a newline-only string in this copy —
# the placeholder markup (presumably .skeleton-* divs matching the CSS above)
# appears to have been stripped during extraction; confirm against the
# original source before relying on these.
SKELETON_OVERALL = '''
'''
SKELETON_BREAKDOWN = '''
'''
SKELETON_SOFT_TIFA = '''
'''
SKELETON_VLM = '''
'''
SKELETON_TECHNICAL = '''
'''
SKELETON_WINNER = '''
'''
SKELETON_RANKINGS = '''
'''
SKELETON_INDIVIDUAL = '''
'''
SKELETON_EDIT = '''
'''
SKELETON_DETAILS = '''
'''
def format_score_card(score: float, label: str, show_bar: bool = True) -> str:
    """Render one labelled metric score (0-1 scale) as an HTML snippet.

    Colour bucket by score: >= 0.7 green, >= 0.5 yellow, otherwise red.
    NOTE(review): the surrounding markup appears to have been stripped from
    this copy of the file, so color_class/text_color are no longer referenced
    by the remaining template; they are still computed so behaviour matches
    the original exactly.
    """
    if score >= 0.7:
        color_class, text_color = "progress-green", "#22c55e"
    elif score >= 0.5:
        color_class, text_color = "progress-yellow", "#eab308"
    else:
        color_class, text_color = "progress-red", "#ef4444"
    if show_bar:
        bar_html = f'''
'''
    else:
        bar_html = ""
    return f'''
{label}
{score:.3f}
{bar_html}
'''
def get_grade_colors(grade: str) -> tuple:
    """Return a (start, end) hex-colour pair for a letter grade.

    Grades map along a red-to-green gradient scale (F=red, D=orange,
    C=yellow, B=lime, A=green); any unrecognised grade falls back to
    neutral grays. Lookup is case-insensitive.
    """
    palette = {
        "A": ("#22c55e", "#16a34a"),
        "B": ("#84cc16", "#65a30d"),
        "C": ("#eab308", "#ca8a04"),
        "D": ("#f97316", "#ea580c"),
        "F": ("#ef4444", "#dc2626"),
    }
    fallback = ("#71717a", "#52525b")
    return palette.get(grade.upper(), fallback)
def get_grade_status(grade: str) -> str:
    """Map a letter grade to its human-readable status label.

    Case-insensitive; unknown grades yield "UNKNOWN".
    """
    labels = {
        "A": "EXCELLENT",
        "B": "GOOD",
        "C": "AVERAGE",
        "D": "BELOW AVG",
        "F": "POOR",
    }
    return labels.get(grade.upper(), "UNKNOWN")
def format_grade_badge(grade: str, passed: bool) -> str:
    """Render a letter grade as a coloured badge snippet.

    ``passed`` is part of the public signature but is not used by the
    remaining template. NOTE(review): the badge markup appears stripped
    from this copy; ``bg`` builds the gradient but is no longer referenced
    in the returned string — kept to preserve behaviour.
    """
    start, end = get_grade_colors(grade)
    status = get_grade_status(grade)
    bg = f"linear-gradient(135deg, {start} 0%, {end} 100%)"
    return f'''
{grade}
{status}
'''
def format_rank_badge(rank: int) -> str:
    """Return the ranking position rendered as badge text.

    NOTE(review): the badge markup appears stripped from this copy; only
    the bare number survives in the template, so this reduces to string
    conversion (identical to the original ``f'{rank}'`` for ints).
    """
    return str(rank)
@spaces.GPU(duration=300)  # ZeroGPU: allocate a GPU for up to 300 s per call
def evaluate_single(
    image: Image.Image,
    prompt: str,
    include_soft_tifa: bool,
    include_vlm: bool,
    include_technical: bool,
    progress=gr.Progress()
) -> tuple:
    """Evaluate a single AI-generated image.

    Args:
        image: Image to assess; None yields a user-facing error message.
        prompt: Generation prompt. Soft-TIFA is skipped when it is blank.
        include_soft_tifa: Run Soft-TIFA prompt-alignment scoring.
        include_vlm: Run the VLM-as-Judge holistic assessment.
        include_technical: Compute technical metrics (CLIP, sharpness, ...).
        progress: Gradio progress tracker (callable default is the idiom
            Gradio expects for progress reporting).

    Returns:
        5-tuple of HTML fragments: (overall, breakdown, soft_tifa, vlm,
        technical). On failure the first element carries the error text and
        the remaining four are empty strings.

    NOTE(review): the HTML tags inside the f-string templates below appear
    to have been stripped from this copy of the file; only text content
    survives. Templates are preserved as found.
    """
    if image is None:
        return ("Please upload an image.", "", "", "", "")
    progress(0.1, desc="Loading models...")
    try:
        eval_instance = get_evaluator()
    except Exception as e:
        # Broad catch is deliberate: surface model-load failures in the UI.
        return (f"Error loading models: {str(e)}", "", "", "", "")
    progress(0.2, desc="Evaluating image...")
    # Blank/whitespace-only prompts are treated as "no prompt".
    prompt_text = prompt.strip() if prompt else None
    try:
        result = eval_instance.evaluate(
            image=image,
            prompt=prompt_text,
            # Soft-TIFA is meaningless without a prompt to align against.
            include_soft_tifa=include_soft_tifa and bool(prompt_text),
            include_vlm=include_vlm,
            include_technical=include_technical,
        )
    except Exception as e:
        return (f"Evaluation error: {str(e)}", "", "", "", "")
    progress(0.9, desc="Formatting results...")
    score = result.score
    # Overall score card
    overall_html = f'''
Overall Score
{score.overall:.3f}
{format_grade_badge(score.grade, score.passed)}
Confidence: {score.confidence:.0%} | Time: {result.evaluation_time:.1f}s
{score.recommendation}
'''
    # Breakdown scores (entries whose value is None are omitted)
    breakdown = score.breakdown
    metrics = [
        ("Prompt Alignment", breakdown.prompt_alignment),
        ("Technical Quality", breakdown.technical_quality),
        ("Aesthetic Appeal", breakdown.aesthetic_appeal),
        ("Realism", breakdown.realism),
        ("Artifacts (inv)", breakdown.artifacts),
    ]
    breakdown_html = ''
    for name, value in metrics:
        if value is not None:
            breakdown_html += format_score_card(value, name)
    breakdown_html += '\n'  # NOTE(review): closing markup stripped; newline remains
    # Soft-TIFA results
    soft_tifa_html = ""
    if result.soft_tifa:
        st = result.soft_tifa
        soft_tifa_html = f'''
Soft-TIFA Analysis
Primitives: {st.primitives_count} | Atom: {st.atom_score:.3f} | Prompt: {st.prompt_score:.3f}
'''
        # Show at most the first 10 primitive checks; 0.7 is the pass mark.
        for pr in st.primitive_results[:10]:
            icon = "●" if pr.score >= 0.7 else "○"
            # `color` presumably fed the stripped markup; kept as found.
            color = "#22c55e" if pr.score >= 0.7 else "#ef4444"
            soft_tifa_html += f'''
{icon} {pr.content}
{pr.score:.2f}
'''
        soft_tifa_html += '\n'
    # VLM Assessment
    vlm_html = ""
    if result.vlm_assessment:
        vlm = result.vlm_assessment
        vlm_html = f'''
VLM Assessment
Technical: {vlm.technical_quality:.1f}/10
Aesthetic: {vlm.aesthetic_appeal:.1f}/10
Realism: {vlm.realism:.1f}/10
Overall: {vlm.overall:.1f}/10
'''
        if vlm.artifacts_detected:
            # List at most the first 5 detected artifacts.
            vlm_html += f'''
Artifacts ({vlm.artifacts_severity}): {', '.join(vlm.artifacts_detected[:5])}
'''
        vlm_html += '\n'
    # Technical metrics (each individual metric is optional)
    tech_html = ""
    if result.technical_metrics:
        tm = result.technical_metrics
        tech_html = f'''
Technical Metrics
'''
        if tm.clip_score is not None:
            tech_html += f'\nCLIP: {tm.clip_score:.3f}\n'
        if tm.sharpness is not None:
            tech_html += f'\nSharpness: {tm.sharpness:.3f}\n'
        if tm.colorfulness is not None:
            tech_html += f'\nColorfulness: {tm.colorfulness:.3f}\n'
        if tm.contrast is not None:
            tech_html += f'\nContrast: {tm.contrast:.3f}\n'
        tech_html += '\n'
    return (overall_html, breakdown_html, soft_tifa_html, vlm_html, tech_html)
@spaces.GPU(duration=600)  # ZeroGPU: longer budget — up to 4 images per call
def compare_images(
    img1: Image.Image,
    img2: Image.Image,
    img3: Image.Image,
    img4: Image.Image,
    prompt: str,
    progress=gr.Progress()
) -> tuple:
    """Compare 2-4 images against a prompt.

    Args:
        img1: First candidate image (None slots are ignored).
        img2: Second candidate image.
        img3: Optional third candidate image.
        img4: Optional fourth candidate image.
        prompt: Prompt every image is judged against (required, non-blank).
        progress: Gradio progress tracker.

    Returns:
        4-tuple of HTML fragments: (winner, rankings table, individual
        scores summary, detailed per-image breakdown). On failure the first
        element carries the error text and the rest are empty strings.

    NOTE(review): HTML tags inside the templates appear stripped from this
    copy of the file; text content and structure are preserved as found.
    """
    images = [img for img in [img1, img2, img3, img4] if img is not None]
    if len(images) < 2:
        return ("Please upload at least 2 images to compare.", "", "", "")
    if not prompt.strip():
        return ("Please enter a prompt to compare images against.", "", "", "")
    progress(0.1, desc="Loading models...")
    try:
        eval_instance = get_evaluator()
    except Exception as e:
        return (f"Error loading models: {str(e)}", "", "", "")
    progress(0.2, desc="Comparing images...")
    try:
        result = eval_instance.compare_images(images, prompt.strip())
    except Exception as e:
        return (f"Comparison error: {str(e)}", "", "", "")
    progress(0.9, desc="Formatting results...")
    # Winner announcement (winner_index is 0-based; display is 1-based)
    winner_html = f'''
Winner
Image {result.winner_index + 1}
BEST OVERALL
{result.winner_reasoning}
Evaluation time: {result.evaluation_time:.1f}s
'''
    # Rankings table header row
    rankings_html = '''
Rankings by Criterion
| Criterion |
'''
    for i in range(result.num_images):
        rankings_html += f'Image {i+1} | '
    rankings_html += '\n'  # NOTE(review): closing markup stripped; newline remains
    # Display label (with weight) for each ranked criterion key.
    criteria_labels = {
        "prompt_alignment": "Prompt Alignment (45%)",
        "technical_quality": "Technical Quality (20%)",
        "aesthetic_appeal": "Aesthetic Appeal (15%)",
        "realism": "Realism (10%)"
    }
    for criterion, label in criteria_labels.items():
        if criterion in result.rankings_by_criterion:
            ranking_data = result.rankings_by_criterion[criterion]
            rankings_html += f'| {label} | '
            for i in range(result.num_images):
                # `ranking` lists 1-based image numbers in rank order; fall
                # back to the image's own position if it is missing.
                rank = ranking_data.ranking.index(i + 1) + 1 if (i + 1) in ranking_data.ranking else i + 1
                # Defensive 0.5 default if fewer scores than images came back.
                score = ranking_data.scores[i] if i < len(ranking_data.scores) else 0.5
                rankings_html += f'''
{format_rank_badge(rank)}
{score:.3f}
|
'''
            rankings_html += '\n'
    # Overall row (same fallback logic as per-criterion rows)
    rankings_html += '| Overall | '
    for i in range(result.num_images):
        rank = result.overall_ranking.index(i + 1) + 1 if (i + 1) in result.overall_ranking else i + 1
        score = result.overall_scores[i] if i < len(result.overall_scores) else 0.5
        is_winner = i == result.winner_index
        rankings_html += f'''
{format_rank_badge(rank)}
{'WINNER' if is_winner else ''}
{score:.3f}
|
'''
    rankings_html += '\n'
    # Individual scores summary
    individual_html = '''
Individual Scores Summary
'''
    for i, score in enumerate(result.individual_scores):
        is_winner = i == result.winner_index
        # These presumably styled the stripped markup; kept as found.
        border_color = "#f59e0b" if is_winner else "#27272a"
        grade_color1, grade_color2 = get_grade_colors(score.grade)
        grade_bg = f"linear-gradient(135deg, {grade_color1} 0%, {grade_color2} 100%)"
        # NOTE(review): hoisted out of the f-string — the original conditional
        # spanned lines inside a replacement field; value preserved ('\nWINNER\n').
        winner_tag = '\nWINNER\n' if is_winner else ''
        individual_html += f'''
Image {i+1}
{score.overall:.3f}
{score.grade}
{winner_tag}
'''
    individual_html += '\n'
    # Detailed breakdown per image
    details_html = '''
Detailed Breakdown
'''
    for i, (score, eval_result) in enumerate(zip(result.individual_scores, result.individual_results)):
        is_winner = i == result.winner_index
        border_color = "#f59e0b" if is_winner else "#3f3f46"
        details_html += f'''
Image {i+1} {'WINNER' if is_winner else ''}
{score.overall:.3f}
'''
        # Breakdown metrics with per-metric accent colours (None skipped)
        metrics = [
            ("Prompt", score.breakdown.prompt_alignment, "#3b82f6"),
            ("Technical", score.breakdown.technical_quality, "#22c55e"),
            ("Aesthetic", score.breakdown.aesthetic_appeal, "#8b5cf6"),
            ("Realism", score.breakdown.realism, "#06b6d4"),
            ("Artifacts", score.breakdown.artifacts, "#f59e0b"),
        ]
        for name, value, color in metrics:
            if value is not None:
                bar_width = int(value * 100)  # percent width for the progress bar
                details_html += f'''
'''
        details_html += '\n'
        # VLM Assessment if available
        if eval_result.vlm_assessment:
            vlm = eval_result.vlm_assessment
            details_html += f'''
VLM Assessment
Tech: {vlm.technical_quality:.1f}
Aesthetic: {vlm.aesthetic_appeal:.1f}
Realism: {vlm.realism:.1f}
Overall: {vlm.overall:.1f}
'''
            if vlm.artifacts_detected:
                # At most the first 3 artifacts in the compact view.
                details_html += f'''
Artifacts ({vlm.artifacts_severity}): {', '.join(vlm.artifacts_detected[:3])}
'''
            details_html += '\n'
        # Soft-TIFA summary if available
        if eval_result.soft_tifa:
            st = eval_result.soft_tifa
            details_html += f'''
Soft-TIFA
Primitives: {st.primitives_count} | Atom: {st.atom_score:.3f} | Prompt: {st.prompt_score:.3f}
'''
            details_html += '\n'
        details_html += '\n'
    return (winner_html, rankings_html, individual_html, details_html)
# Build Gradio interface
# Three tabs: single-image evaluation, multi-image comparison, and edit
# evaluation. Result panels are pre-filled with skeleton placeholders.
# NOTE(review): markup inside gr.HTML(...) literals appears stripped in
# this copy; text content preserved as found.
with gr.Blocks(title="Image Evaluator", css=DARK_CSS, theme=gr.themes.Base()) as demo:
    # Page header
    gr.HTML('''
Image Evaluator
AI image quality assessment powered by Qwen2.5-VL-7B
''')
    with gr.Tabs():
        # Single Image Evaluation Tab
        with gr.TabItem("Single Evaluation"):
            with gr.Row():
                with gr.Column(scale=1):
                    # Left column: inputs and evaluation toggles
                    image_input = gr.Image(label="Upload Image", type="pil", height=400)
                    prompt_input = gr.Textbox(
                        label="Generation Prompt",
                        placeholder="Enter the prompt used to generate this image...",
                        lines=3
                    )
                    with gr.Row():
                        soft_tifa_check = gr.Checkbox(label="Soft-TIFA", value=True, info="Prompt alignment")
                        vlm_check = gr.Checkbox(label="VLM Judge", value=True, info="Holistic assessment")
                        technical_check = gr.Checkbox(label="Technical", value=True, info="CLIP, sharpness")
                    evaluate_btn = gr.Button("Evaluate Image", variant="primary", size="lg")
                with gr.Column(scale=1):
                    # Right column: result panels (skeletons until first run)
                    overall_output = gr.HTML(value=SKELETON_OVERALL)
                    breakdown_output = gr.HTML(value=SKELETON_BREAKDOWN)
                    soft_tifa_output = gr.HTML(value=SKELETON_SOFT_TIFA)
                    with gr.Row():
                        vlm_output = gr.HTML(value=SKELETON_VLM)
                        technical_output = gr.HTML(value=SKELETON_TECHNICAL)
            # Outputs are positional: must match evaluate_single's 5-tuple.
            evaluate_btn.click(
                fn=evaluate_single,
                inputs=[image_input, prompt_input, soft_tifa_check, vlm_check, technical_check],
                outputs=[overall_output, breakdown_output, soft_tifa_output, vlm_output, technical_output],
            )
        # Compare Images Tab
        with gr.TabItem("Compare Images"):
            gr.HTML('''
Upload 2-4 images to compare them against a prompt. The AI will rank them across multiple criteria.
''')
            with gr.Row():
                img1 = gr.Image(label="Image 1", type="pil", height=250)
                img2 = gr.Image(label="Image 2", type="pil", height=250)
                img3 = gr.Image(label="Image 3 (optional)", type="pil", height=250)
                img4 = gr.Image(label="Image 4 (optional)", type="pil", height=250)
            compare_prompt = gr.Textbox(
                label="Comparison Prompt",
                placeholder="Enter the prompt to compare images against...",
                lines=2
            )
            compare_btn = gr.Button("Compare Images", variant="primary", size="lg")
            winner_output = gr.HTML(value=SKELETON_WINNER)
            rankings_output = gr.HTML(value=SKELETON_RANKINGS)
            individual_output = gr.HTML(value=SKELETON_INDIVIDUAL)
            details_output = gr.HTML(value=SKELETON_DETAILS)
            # Outputs match compare_images' 4-tuple, in order.
            compare_btn.click(
                fn=compare_images,
                inputs=[img1, img2, img3, img4, compare_prompt],
                outputs=[winner_output, rankings_output, individual_output, details_output],
            )
        # Edit Evaluation Tab
        with gr.TabItem("Edit Evaluation"):
            gr.HTML('''
Evaluate image editing quality by comparing source and edited images.
''')
            with gr.Row():
                with gr.Column():
                    source_input = gr.Image(label="Source Image (Before)", type="pil", height=300)
                with gr.Column():
                    edited_input = gr.Image(label="Edited Image (After)", type="pil", height=300)
            edit_instruction = gr.Textbox(
                label="Edit Instruction",
                placeholder="Enter the editing instruction that was applied...",
                lines=2
            )
            edit_btn = gr.Button("Evaluate Edit", variant="primary", size="lg")
            edit_output = gr.HTML(value=SKELETON_EDIT)

            @spaces.GPU(duration=300)  # ZeroGPU: up to 300 s per edit evaluation
            def evaluate_edit_handler(source, edited, instruction, progress=gr.Progress()):
                """Evaluate edit quality; returns one HTML fragment (result or error)."""
                if source is None or edited is None:
                    return "Please upload both source and edited images."
                if not instruction.strip():
                    return "Please enter the edit instruction."
                progress(0.1, desc="Loading models...")
                try:
                    # Separate evaluator class for before/after edit scoring;
                    # constructed per call rather than cached like ImageEvaluator.
                    from evaluator import EditEvaluator
                    edit_eval = EditEvaluator()
                except Exception as e:
                    return f"Error loading models: {str(e)}"
                progress(0.3, desc="Evaluating edit...")
                try:
                    result = edit_eval.evaluate(source, edited, instruction.strip())
                except Exception as e:
                    return f"Evaluation error: {str(e)}"
                score = result.score
                # `or 0` guards against None breakdown fields in format_score_card.
                return f'''
Edit Quality Score
{score.overall:.3f}
{format_grade_badge(score.grade, score.passed)}
{format_score_card(score.breakdown.instruction_following or 0, "Instruction Following")}
{format_score_card(score.breakdown.preservation or 0, "Preservation")}
{format_score_card(score.breakdown.edit_quality or 0, "Edit Quality")}
{format_score_card(score.breakdown.artifacts or 0, "Artifacts (inv)")}
{score.recommendation}
Evaluation time: {result.evaluation_time:.1f}s
'''

            edit_btn.click(
                fn=evaluate_edit_handler,
                inputs=[source_input, edited_input, edit_instruction],
                outputs=[edit_output],
            )
    # Footer
    gr.HTML('''
Powered by Qwen2.5-VL-7B | Soft-TIFA | CLIP | LPIPS
''')

if __name__ == "__main__":
    demo.launch()