""" Image Evaluator - Gradio App for HuggingFace Spaces AI image quality assessment using: - Soft-TIFA for prompt alignment - VLM-as-Judge for holistic assessment - Multi-image comparison - Technical metrics (sharpness, colorfulness, contrast, CLIP) Powered by Qwen2.5-VL-7B """ import gradio as gr from PIL import Image from typing import Optional, List import time import spaces # Global evaluator (loaded on first use) evaluator = None def get_evaluator(): """Lazy load evaluator.""" global evaluator if evaluator is None: from evaluator import ImageEvaluator evaluator = ImageEvaluator() return evaluator # Dark theme CSS DARK_CSS = """ /* Base dark theme */ .gradio-container { background-color: #09090b !important; color: #fafafa !important; } /* Main blocks */ .dark { --background-fill-primary: #09090b !important; --background-fill-secondary: #18181b !important; --border-color-primary: #27272a !important; --text-color: #fafafa !important; --text-color-subdued: #a1a1aa !important; } /* Cards and panels */ .panel, .form, .block { background-color: #18181b !important; border: 1px solid #27272a !important; border-radius: 12px !important; } /* Input fields */ input, textarea, select { background-color: #18181b !important; border: 1px solid #27272a !important; color: #fafafa !important; border-radius: 8px !important; } input:focus, textarea:focus { border-color: #3b82f6 !important; outline: none !important; } /* Buttons */ .primary { background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%) !important; border: none !important; color: white !important; } .secondary { background-color: #27272a !important; border: 1px solid #3f3f46 !important; color: #fafafa !important; } /* Tabs */ .tab-nav { background-color: #09090b !important; border-bottom: 1px solid #27272a !important; } .tab-nav button { color: #a1a1aa !important; background: transparent !important; } .tab-nav button.selected { color: #3b82f6 !important; border-bottom: 2px solid #3b82f6 !important; } /* Image upload */ .image-container { background-color: #18181b !important; border: 2px dashed #27272a !important; border-radius: 12px !important; } /* Labels */ label, .label-wrap { color: #a1a1aa !important; font-weight: 500 !important; } /* Checkboxes */ .checkbox-group { background-color: #18181b !important; padding: 12px !important; border-radius: 8px !important; } /* Scrollbar */ ::-webkit-scrollbar { width: 8px; height: 8px; } ::-webkit-scrollbar-track { background: #18181b; } ::-webkit-scrollbar-thumb { background: #3f3f46; border-radius: 4px; } ::-webkit-scrollbar-thumb:hover { background: #52525b; } /* Result cards */ .result-card { background: #18181b; border: 1px solid #27272a; border-radius: 12px; padding: 20px; margin: 8px 0; } /* Score display */ .score-large { font-size: 3em; font-weight: 700; background: linear-gradient(135deg, #3b82f6 0%, #8b5cf6 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; } /* Winner badge */ .winner-badge { background: linear-gradient(135deg, #f59e0b 0%, #d97706 100%); color: #000; padding: 4px 12px; border-radius: 20px; font-weight: 600; font-size: 0.85em; } /* Rank badge */ .rank-badge { display: inline-flex; align-items: center; justify-content: center; width: 28px; height: 28px; border-radius: 50%; font-weight: 700; font-size: 0.9em; } .rank-1 { background: linear-gradient(135deg, #fbbf24 0%, #f59e0b 100%); color: #000; } .rank-2 { background: linear-gradient(135deg, #94a3b8 0%, #64748b 100%); color: #fff; } .rank-3 { background: linear-gradient(135deg, #c2855a 0%, #a16207 100%); color: #fff; } .rank-4 { background: #3f3f46; color: #a1a1aa; } /* Progress bars */ .progress-bar { background: #27272a; border-radius: 4px; height: 6px; overflow: hidden; } .progress-fill { height: 100%; border-radius: 4px; transition: width 0.3s ease; } .progress-green { background: linear-gradient(90deg, #22c55e 0%, #16a34a 100%); } .progress-yellow { background: linear-gradient(90deg, #eab308 0%, #ca8a04 100%); } .progress-red { background: linear-gradient(90deg, #ef4444 0%, #dc2626 100%); } .progress-blue { background: linear-gradient(90deg, #3b82f6 0%, #2563eb 100%); } /* Skeleton loading animation */ @keyframes skeleton-pulse { 0%, 100% { opacity: 0.4; } 50% { opacity: 0.7; } } .skeleton { background: linear-gradient(90deg, #27272a 0%, #3f3f46 50%, #27272a 100%); background-size: 200% 100%; animation: skeleton-pulse 1.5s ease-in-out infinite; border-radius: 6px; } .skeleton-text { height: 14px; margin: 8px 0; } .skeleton-title { height: 20px; width: 60%; margin-bottom: 16px; } .skeleton-score { height: 48px; width: 120px; margin: 12px auto; } .skeleton-bar { height: 6px; width: 100%; margin: 8px 0; } .skeleton-card { background: #18181b; border: 1px solid #27272a; border-radius: 12px; padding: 20px; } """ # Skeleton placeholders for loading states SKELETON_OVERALL = '''
''' SKELETON_BREAKDOWN = '''
''' SKELETON_SOFT_TIFA = '''
''' SKELETON_VLM = '''
''' SKELETON_TECHNICAL = '''
''' SKELETON_WINNER = '''
''' SKELETON_RANKINGS = '''
''' SKELETON_INDIVIDUAL = '''
''' SKELETON_EDIT = '''
''' SKELETON_DETAILS = '''
''' def format_score_card(score: float, label: str, show_bar: bool = True) -> str: """Format a score as a styled card.""" if score >= 0.7: color_class = "progress-green" text_color = "#22c55e" elif score >= 0.5: color_class = "progress-yellow" text_color = "#eab308" else: color_class = "progress-red" text_color = "#ef4444" bar_html = f'''
''' if show_bar else "" return f'''
{label}
{score:.3f}
{bar_html}
''' def get_grade_colors(grade: str) -> tuple: """Get colors for a grade (F=red, D=orange, C=yellow, B=lime, A=green).""" grade_colors = { "A": ("#22c55e", "#16a34a"), "B": ("#84cc16", "#65a30d"), "C": ("#eab308", "#ca8a04"), "D": ("#f97316", "#ea580c"), "F": ("#ef4444", "#dc2626"), } return grade_colors.get(grade.upper(), ("#71717a", "#52525b")) def get_grade_status(grade: str) -> str: """Get status text for a grade.""" statuses = { "A": "EXCELLENT", "B": "GOOD", "C": "AVERAGE", "D": "BELOW AVG", "F": "POOR", } return statuses.get(grade.upper(), "UNKNOWN") def format_grade_badge(grade: str, passed: bool) -> str: """Format grade as a badge with color gradient from F (red) to A (green).""" color1, color2 = get_grade_colors(grade) status = get_grade_status(grade) bg = f"linear-gradient(135deg, {color1} 0%, {color2} 100%)" return f'''
{grade} {status}
''' def format_rank_badge(rank: int) -> str: """Format ranking position as badge.""" return f'{rank}' @spaces.GPU(duration=300) def evaluate_single( image: Image.Image, prompt: str, include_soft_tifa: bool, include_vlm: bool, include_technical: bool, progress=gr.Progress() ) -> tuple: """Evaluate a single AI-generated image.""" if image is None: return ("Please upload an image.", "", "", "", "") progress(0.1, desc="Loading models...") try: eval_instance = get_evaluator() except Exception as e: return (f"Error loading models: {str(e)}", "", "", "", "") progress(0.2, desc="Evaluating image...") prompt_text = prompt.strip() if prompt else None try: result = eval_instance.evaluate( image=image, prompt=prompt_text, include_soft_tifa=include_soft_tifa and bool(prompt_text), include_vlm=include_vlm, include_technical=include_technical, ) except Exception as e: return (f"Evaluation error: {str(e)}", "", "", "", "") progress(0.9, desc="Formatting results...") score = result.score # Overall score card overall_html = f'''
Overall Score
{score.overall:.3f}
{format_grade_badge(score.grade, score.passed)}
Confidence: {score.confidence:.0%}  |  Time: {result.evaluation_time:.1f}s
{score.recommendation}
''' # Breakdown scores breakdown = score.breakdown metrics = [ ("Prompt Alignment", breakdown.prompt_alignment), ("Technical Quality", breakdown.technical_quality), ("Aesthetic Appeal", breakdown.aesthetic_appeal), ("Realism", breakdown.realism), ("Artifacts (inv)", breakdown.artifacts), ] breakdown_html = '
' for name, value in metrics: if value is not None: breakdown_html += format_score_card(value, name) breakdown_html += '
' # Soft-TIFA results soft_tifa_html = "" if result.soft_tifa: st = result.soft_tifa soft_tifa_html = f'''
Soft-TIFA Analysis
Primitives: {st.primitives_count}  |  Atom: {st.atom_score:.3f}  |  Prompt: {st.prompt_score:.3f}
''' for pr in st.primitive_results[:10]: icon = "●" if pr.score >= 0.7 else "○" color = "#22c55e" if pr.score >= 0.7 else "#ef4444" soft_tifa_html += f'''
{icon} {pr.content} {pr.score:.2f}
''' soft_tifa_html += '
' # VLM Assessment vlm_html = "" if result.vlm_assessment: vlm = result.vlm_assessment vlm_html = f'''
VLM Assessment
Technical: {vlm.technical_quality:.1f}/10
Aesthetic: {vlm.aesthetic_appeal:.1f}/10
Realism: {vlm.realism:.1f}/10
Overall: {vlm.overall:.1f}/10
''' if vlm.artifacts_detected: vlm_html += f'''
Artifacts ({vlm.artifacts_severity}): {', '.join(vlm.artifacts_detected[:5])}
''' vlm_html += '
' # Technical metrics tech_html = "" if result.technical_metrics: tm = result.technical_metrics tech_html = f'''
Technical Metrics
''' if tm.clip_score is not None: tech_html += f'
CLIP: {tm.clip_score:.3f}
' if tm.sharpness is not None: tech_html += f'
Sharpness: {tm.sharpness:.3f}
' if tm.colorfulness is not None: tech_html += f'
Colorfulness: {tm.colorfulness:.3f}
' if tm.contrast is not None: tech_html += f'
Contrast: {tm.contrast:.3f}
' tech_html += '
' return (overall_html, breakdown_html, soft_tifa_html, vlm_html, tech_html) @spaces.GPU(duration=600) def compare_images( img1: Image.Image, img2: Image.Image, img3: Image.Image, img4: Image.Image, prompt: str, progress=gr.Progress() ) -> tuple: """Compare 2-4 images against a prompt.""" images = [img for img in [img1, img2, img3, img4] if img is not None] if len(images) < 2: return ("Please upload at least 2 images to compare.", "", "", "") if not prompt.strip(): return ("Please enter a prompt to compare images against.", "", "", "") progress(0.1, desc="Loading models...") try: eval_instance = get_evaluator() except Exception as e: return (f"Error loading models: {str(e)}", "", "", "") progress(0.2, desc="Comparing images...") try: result = eval_instance.compare_images(images, prompt.strip()) except Exception as e: return (f"Comparison error: {str(e)}", "", "", "") progress(0.9, desc="Formatting results...") # Winner announcement winner_html = f'''
Winner
Image {result.winner_index + 1}
BEST OVERALL
{result.winner_reasoning}
Evaluation time: {result.evaluation_time:.1f}s
''' # Rankings table rankings_html = '''
Rankings by Criterion
''' for i in range(result.num_images): rankings_html += f'' rankings_html += '' criteria_labels = { "prompt_alignment": "Prompt Alignment (45%)", "technical_quality": "Technical Quality (20%)", "aesthetic_appeal": "Aesthetic Appeal (15%)", "realism": "Realism (10%)" } for criterion, label in criteria_labels.items(): if criterion in result.rankings_by_criterion: ranking_data = result.rankings_by_criterion[criterion] rankings_html += f'' for i in range(result.num_images): rank = ranking_data.ranking.index(i + 1) + 1 if (i + 1) in ranking_data.ranking else i + 1 score = ranking_data.scores[i] if i < len(ranking_data.scores) else 0.5 rankings_html += f''' ''' rankings_html += '' # Overall row rankings_html += '' for i in range(result.num_images): rank = result.overall_ranking.index(i + 1) + 1 if (i + 1) in result.overall_ranking else i + 1 score = result.overall_scores[i] if i < len(result.overall_scores) else 0.5 is_winner = i == result.winner_index rankings_html += f''' ''' rankings_html += '
CriterionImage {i+1}
{label} {format_rank_badge(rank)}
{score:.3f}
Overall {format_rank_badge(rank)} {'WINNER' if is_winner else ''}
{score:.3f}
' # Individual scores summary individual_html = '''
Individual Scores Summary
''' for i, score in enumerate(result.individual_scores): is_winner = i == result.winner_index border_color = "#f59e0b" if is_winner else "#27272a" grade_color1, grade_color2 = get_grade_colors(score.grade) grade_bg = f"linear-gradient(135deg, {grade_color1} 0%, {grade_color2} 100%)" individual_html += f'''
Image {i+1}
{score.overall:.3f}
{score.grade}
{'
WINNER
' if is_winner else ''}
''' individual_html += '
' # Detailed breakdown per image details_html = '''
Detailed Breakdown
''' for i, (score, eval_result) in enumerate(zip(result.individual_scores, result.individual_results)): is_winner = i == result.winner_index border_color = "#f59e0b" if is_winner else "#3f3f46" details_html += f'''
Image {i+1} {'WINNER' if is_winner else ''}
{score.overall:.3f}
''' # Breakdown metrics metrics = [ ("Prompt", score.breakdown.prompt_alignment, "#3b82f6"), ("Technical", score.breakdown.technical_quality, "#22c55e"), ("Aesthetic", score.breakdown.aesthetic_appeal, "#8b5cf6"), ("Realism", score.breakdown.realism, "#06b6d4"), ("Artifacts", score.breakdown.artifacts, "#f59e0b"), ] for name, value, color in metrics: if value is not None: bar_width = int(value * 100) details_html += f'''
{name}
{value:.3f}
''' details_html += '
' # VLM Assessment if available if eval_result.vlm_assessment: vlm = eval_result.vlm_assessment details_html += f'''
VLM Assessment
Tech: {vlm.technical_quality:.1f} Aesthetic: {vlm.aesthetic_appeal:.1f} Realism: {vlm.realism:.1f} Overall: {vlm.overall:.1f}
''' if vlm.artifacts_detected: details_html += f'''
Artifacts ({vlm.artifacts_severity}): {', '.join(vlm.artifacts_detected[:3])}
''' details_html += '
' # Soft-TIFA summary if available if eval_result.soft_tifa: st = eval_result.soft_tifa details_html += f'''
Soft-TIFA
Primitives: {st.primitives_count} | Atom: {st.atom_score:.3f} | Prompt: {st.prompt_score:.3f}
''' details_html += '
' details_html += '
' return (winner_html, rankings_html, individual_html, details_html) # Build Gradio interface with gr.Blocks(title="Image Evaluator", css=DARK_CSS, theme=gr.themes.Base()) as demo: gr.HTML('''

Image Evaluator

AI image quality assessment powered by Qwen2.5-VL-7B

''') with gr.Tabs(): # Single Image Evaluation Tab with gr.TabItem("Single Evaluation"): with gr.Row(): with gr.Column(scale=1): image_input = gr.Image(label="Upload Image", type="pil", height=400) prompt_input = gr.Textbox( label="Generation Prompt", placeholder="Enter the prompt used to generate this image...", lines=3 ) with gr.Row(): soft_tifa_check = gr.Checkbox(label="Soft-TIFA", value=True, info="Prompt alignment") vlm_check = gr.Checkbox(label="VLM Judge", value=True, info="Holistic assessment") technical_check = gr.Checkbox(label="Technical", value=True, info="CLIP, sharpness") evaluate_btn = gr.Button("Evaluate Image", variant="primary", size="lg") with gr.Column(scale=1): overall_output = gr.HTML(value=SKELETON_OVERALL) breakdown_output = gr.HTML(value=SKELETON_BREAKDOWN) soft_tifa_output = gr.HTML(value=SKELETON_SOFT_TIFA) with gr.Row(): vlm_output = gr.HTML(value=SKELETON_VLM) technical_output = gr.HTML(value=SKELETON_TECHNICAL) evaluate_btn.click( fn=evaluate_single, inputs=[image_input, prompt_input, soft_tifa_check, vlm_check, technical_check], outputs=[overall_output, breakdown_output, soft_tifa_output, vlm_output, technical_output], ) # Compare Images Tab with gr.TabItem("Compare Images"): gr.HTML('''
Upload 2-4 images to compare them against a prompt. The AI will rank them across multiple criteria.
''') with gr.Row(): img1 = gr.Image(label="Image 1", type="pil", height=250) img2 = gr.Image(label="Image 2", type="pil", height=250) img3 = gr.Image(label="Image 3 (optional)", type="pil", height=250) img4 = gr.Image(label="Image 4 (optional)", type="pil", height=250) compare_prompt = gr.Textbox( label="Comparison Prompt", placeholder="Enter the prompt to compare images against...", lines=2 ) compare_btn = gr.Button("Compare Images", variant="primary", size="lg") winner_output = gr.HTML(value=SKELETON_WINNER) rankings_output = gr.HTML(value=SKELETON_RANKINGS) individual_output = gr.HTML(value=SKELETON_INDIVIDUAL) details_output = gr.HTML(value=SKELETON_DETAILS) compare_btn.click( fn=compare_images, inputs=[img1, img2, img3, img4, compare_prompt], outputs=[winner_output, rankings_output, individual_output, details_output], ) # Edit Evaluation Tab with gr.TabItem("Edit Evaluation"): gr.HTML('''
Evaluate image editing quality by comparing source and edited images.
''') with gr.Row(): with gr.Column(): source_input = gr.Image(label="Source Image (Before)", type="pil", height=300) with gr.Column(): edited_input = gr.Image(label="Edited Image (After)", type="pil", height=300) edit_instruction = gr.Textbox( label="Edit Instruction", placeholder="Enter the editing instruction that was applied...", lines=2 ) edit_btn = gr.Button("Evaluate Edit", variant="primary", size="lg") edit_output = gr.HTML(value=SKELETON_EDIT) @spaces.GPU(duration=300) def evaluate_edit_handler(source, edited, instruction, progress=gr.Progress()): if source is None or edited is None: return "Please upload both source and edited images." if not instruction.strip(): return "Please enter the edit instruction." progress(0.1, desc="Loading models...") try: from evaluator import EditEvaluator edit_eval = EditEvaluator() except Exception as e: return f"Error loading models: {str(e)}" progress(0.3, desc="Evaluating edit...") try: result = edit_eval.evaluate(source, edited, instruction.strip()) except Exception as e: return f"Evaluation error: {str(e)}" score = result.score return f'''
Edit Quality Score
{score.overall:.3f}
{format_grade_badge(score.grade, score.passed)}
{format_score_card(score.breakdown.instruction_following or 0, "Instruction Following")} {format_score_card(score.breakdown.preservation or 0, "Preservation")} {format_score_card(score.breakdown.edit_quality or 0, "Edit Quality")} {format_score_card(score.breakdown.artifacts or 0, "Artifacts (inv)")}
{score.recommendation}
Evaluation time: {result.evaluation_time:.1f}s
''' edit_btn.click( fn=evaluate_edit_handler, inputs=[source_input, edited_input, edit_instruction], outputs=[edit_output], ) gr.HTML('''
Powered by Qwen2.5-VL-7B  |  Soft-TIFA  |  CLIP  |  LPIPS
''') if __name__ == "__main__": demo.launch()