ilkerzg committed
Commit 2425579 · unverified · 1 Parent(s): 5781757

Add image evaluator with Soft-TIFA and VLM-as-Judge

Files changed (5):
  1. README.md +52 -6
  2. app.py +575 -4
  3. evaluator.py +1049 -0
  4. metrics.py +285 -0
  5. requirements.txt +22 -0
README.md CHANGED
@@ -1,12 +1,58 @@
  ---
- title: Image Evaluator
- emoji: 🚀
- colorFrom: red
- colorTo: indigo
  sdk: gradio
- sdk_version: 6.2.0
  app_file: app.py
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: Fal Image Evaluator
+ emoji: "🎨"
+ colorFrom: blue
+ colorTo: purple
  sdk: gradio
+ sdk_version: 4.44.0
  app_file: app.py
  pinned: false
+ license: mit
+ suggested_hardware: a10g-small
  ---

+ # Fal Image Evaluator
+
+ AI image quality assessment using Soft-TIFA, VLM-as-Judge, and technical metrics.
+
+ ## Features
+
+ ### Image Evaluation
+ - **Soft-TIFA**: Atomic prompt decomposition + VQA verification
+ - **VLM-as-Judge**: Holistic quality assessment (technical, aesthetic, realism)
+ - **Technical Metrics**: CLIP score, sharpness, colorfulness, contrast
+
+ ### Edit Evaluation
+ - **Instruction Following**: Verify edit primitives were applied
+ - **Preservation**: LPIPS, SSIM, PSNR for non-edited region preservation
+ - **Edit Quality**: Seamlessness, coherence, artifact detection
+
+ ## Local Development
+
+ ```bash
+ pip install -r requirements.txt
+ python app.py
+ ```
+
+ ## File Structure
+
+ ```
+ ├── app.py            # Gradio interface
+ ├── evaluator.py      # Core evaluation logic
+ ├── metrics.py        # Helper functions for metrics
+ └── requirements.txt  # Dependencies
+ ```
+
+ ## Components
+
+ | Component | Description |
+ |-----------|-------------|
+ | Soft-TIFA | Decomposes prompts into atomic primitives, verifies via VQA |
+ | VLM-as-Judge | Qwen2.5-VL-7B for holistic image assessment |
+ | CLIP Score | Text-image alignment using OpenCLIP |
+ | LPIPS | Learned perceptual similarity |
+ | SSIM/PSNR | Structural and pixel-level similarity |
+
+ ## License
+
+ MIT
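The Soft-TIFA row in the Components table describes a decompose-then-verify pipeline whose per-primitive VQA scores are aggregated into an atom score and a prompt score (the fields surfaced by the app). `metrics.py` is not shown in this diff, so the following is only a sketch of a plausible aggregation; the function name and weighting scheme are assumptions, not the repository's implementation:

```python
from math import prod


def soft_tifa_scores(primitive_scores, importances):
    """Aggregate per-primitive VQA scores (each in [0, 1]).

    atom_score: importance-weighted arithmetic mean, giving partial
    credit per primitive. prompt_score: unweighted geometric mean, so
    one badly failed primitive drags the whole prompt down harder than
    an average would.
    """
    assert primitive_scores and len(primitive_scores) == len(importances)
    total_w = sum(importances)
    atom_score = sum(s * w for s, w in zip(primitive_scores, importances)) / total_w
    prompt_score = prod(primitive_scores) ** (1.0 / len(primitive_scores))
    return atom_score, prompt_score
```

With scores `[1.0, 0.9, 0.2]` and importances `[1.0, 0.9, 0.8]`, the atom score stays moderate while the geometric-mean prompt score is pulled down sharply by the single 0.2 primitive, which is the behavior a "one missing object fails the prompt" metric wants.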
app.py CHANGED
@@ -1,7 +1,578 @@
  import gradio as gr

- def greet(name):
-     return "Hello " + name + "!!"

- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ """
+ Fal Image Evaluator - Gradio App for HuggingFace Spaces
+
+ AI image quality assessment using:
+ - Soft-TIFA for prompt alignment
+ - VLM-as-Judge for holistic assessment
+ - Technical metrics (sharpness, colorfulness, contrast, CLIP)
+ """
+
  import gradio as gr
+ from PIL import Image
+ from typing import Optional
+ import time
+
+ # Global evaluators (loaded on first use)
+ image_evaluator = None
+ edit_evaluator = None
+
+
+ def get_image_evaluator():
+     """Lazy load image evaluator."""
+     global image_evaluator
+     if image_evaluator is None:
+         from evaluator import ImageEvaluator
+         image_evaluator = ImageEvaluator()
+     return image_evaluator
+
+
+ def get_edit_evaluator():
+     """Lazy load edit evaluator."""
+     global edit_evaluator
+     if edit_evaluator is None:
+         from evaluator import EditEvaluator
+         edit_evaluator = EditEvaluator()
+     return edit_evaluator
+
+
+ def format_score_html(score: float, label: str) -> str:
+     """Format a score as colored HTML."""
+     if score >= 0.85:
+         color = "#22c55e"  # green
+     elif score >= 0.70:
+         color = "#84cc16"  # lime
+     elif score >= 0.50:
+         color = "#eab308"  # yellow
+     else:
+         color = "#ef4444"  # red
+
+     return f'<span style="color: {color}; font-weight: bold;">{score:.3f}</span> {label}'
+
+
+ def format_grade_badge(grade: str, passed: bool) -> str:
+     """Format grade as a badge."""
+     if passed:
+         bg_color = "#22c55e"
+     else:
+         bg_color = "#ef4444"
+
+     return f'''
+     <div style="display: inline-flex; align-items: center; gap: 8px;">
+         <span style="background: {bg_color}; color: white; padding: 4px 12px; border-radius: 4px; font-weight: bold; font-size: 1.2em;">
+             {grade}
+         </span>
+         <span style="color: {'#22c55e' if passed else '#ef4444'};">
+             {'PASSED' if passed else 'FAILED'}
+         </span>
+     </div>
+     '''
+
+
+ def evaluate_image(
+     image: Image.Image,
+     prompt: str,
+     include_soft_tifa: bool,
+     include_vlm: bool,
+     include_technical: bool,
+     progress=gr.Progress(),
+ ) -> tuple:
+     """Evaluate an AI-generated image."""
+     if image is None:
+         return (
+             "Please upload an image.",
+             "", "", "", ""
+         )
+
+     progress(0.1, desc="Loading models...")
+
+     try:
+         evaluator = get_image_evaluator()
+     except Exception as e:
+         return (
+             f"Error loading models: {str(e)}",
+             "", "", "", ""
+         )
+
+     progress(0.2, desc="Starting evaluation...")
+
+     prompt_text = prompt.strip() if prompt else None
+
+     try:
+         result = evaluator.evaluate(
+             image=image,
+             prompt=prompt_text,
+             include_soft_tifa=include_soft_tifa and bool(prompt_text),
+             include_vlm=include_vlm,
+             include_technical=include_technical,
+         )
+     except Exception as e:
+         return (
+             f"Evaluation error: {str(e)}",
+             "", "", "", ""
+         )
+
+     progress(0.9, desc="Formatting results...")
+
+     # Format overall score
+     score = result.score
+     overall_html = f"""
+     <div style="padding: 16px; background: #1f2937; border-radius: 8px; margin-bottom: 16px;">
+         <h2 style="margin: 0 0 12px 0; color: #f3f4f6;">Overall Score</h2>
+         <div style="font-size: 2.5em; font-weight: bold; color: #60a5fa; margin-bottom: 8px;">
+             {score.overall:.3f}
+         </div>
+         {format_grade_badge(score.grade, score.passed)}
+         <div style="margin-top: 12px; color: #9ca3af;">
+             Confidence: {score.confidence:.0%} | Time: {result.evaluation_time:.1f}s
+         </div>
+         <div style="margin-top: 8px; padding: 8px; background: #374151; border-radius: 4px; color: #d1d5db;">
+             {score.recommendation}
+         </div>
+     </div>
+     """
+
+     # Format breakdown
+     breakdown = score.breakdown
+     breakdown_html = "<div style='display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 12px;'>"
+
+     metrics = [
+         ("Prompt Alignment", breakdown.prompt_alignment),
+         ("Technical Quality", breakdown.technical_quality),
+         ("Aesthetic Appeal", breakdown.aesthetic_appeal),
+         ("Realism", breakdown.realism),
+         ("Artifacts (inv)", breakdown.artifacts),
+     ]
+
+     for name, value in metrics:
+         if value is not None:
+             bar_width = int(value * 100)
+             if value >= 0.7:
+                 bar_color = "#22c55e"
+             elif value >= 0.5:
+                 bar_color = "#eab308"
+             else:
+                 bar_color = "#ef4444"
+
+             breakdown_html += f"""
+             <div style="background: #374151; padding: 12px; border-radius: 6px;">
+                 <div style="color: #9ca3af; font-size: 0.85em; margin-bottom: 4px;">{name}</div>
+                 <div style="font-weight: bold; color: #f3f4f6; margin-bottom: 6px;">{value:.3f}</div>
+                 <div style="background: #1f2937; border-radius: 4px; height: 6px; overflow: hidden;">
+                     <div style="background: {bar_color}; width: {bar_width}%; height: 100%;"></div>
+                 </div>
+             </div>
+             """
+
+     breakdown_html += "</div>"
+
+     # Format Soft-TIFA details
+     soft_tifa_html = ""
+     if result.soft_tifa:
+         st = result.soft_tifa
+         soft_tifa_html = f"""
+         <div style="padding: 12px; background: #1f2937; border-radius: 8px;">
+             <h3 style="margin: 0 0 8px 0; color: #f3f4f6;">Soft-TIFA Results</h3>
+             <div style="color: #9ca3af; margin-bottom: 8px;">
+                 Primitives: {st.primitives_count} | Atom Score: {st.atom_score:.3f} | Prompt Score: {st.prompt_score:.3f}
+             </div>
+         """
+
+         if st.primitive_results:
+             soft_tifa_html += "<div style='max-height: 200px; overflow-y: auto;'>"
+             for pr in st.primitive_results[:10]:
+                 icon = "o" if pr.score >= 0.7 else "x"
+                 color = "#22c55e" if pr.score >= 0.7 else "#ef4444"
+                 soft_tifa_html += f"""
+                 <div style="padding: 6px; border-bottom: 1px solid #374151; display: flex; justify-content: space-between;">
+                     <span style="color: {color};">[{icon}] {pr.content}</span>
+                     <span style="color: #9ca3af;">{pr.score:.2f}</span>
+                 </div>
+                 """
+             soft_tifa_html += "</div>"
+
+         soft_tifa_html += "</div>"
+
+     # Format VLM assessment
+     vlm_html = ""
+     if result.vlm_assessment:
+         vlm = result.vlm_assessment
+         vlm_html = f"""
+         <div style="padding: 12px; background: #1f2937; border-radius: 8px;">
+             <h3 style="margin: 0 0 8px 0; color: #f3f4f6;">VLM-as-Judge Assessment</h3>
+             <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 8px; color: #d1d5db;">
+                 <div>Technical: {vlm.technical_quality:.1f}/10</div>
+                 <div>Aesthetic: {vlm.aesthetic_appeal:.1f}/10</div>
+                 <div>Realism: {vlm.realism:.1f}/10</div>
+                 <div>Overall: {vlm.overall:.1f}/10</div>
+             </div>
+         """
+
+         if vlm.artifacts_detected:
+             vlm_html += f"""
+             <div style="margin-top: 8px; padding: 8px; background: #7f1d1d; border-radius: 4px;">
+                 <strong>Artifacts ({vlm.artifacts_severity}):</strong> {', '.join(vlm.artifacts_detected[:5])}
+             </div>
+             """
+
+         if vlm.reasoning:
+             vlm_html += f"""
+             <div style="margin-top: 8px; color: #9ca3af; font-style: italic;">
+                 "{vlm.reasoning[:200]}"
+             </div>
+             """
+
+         vlm_html += "</div>"
+
+     # Format technical metrics
+     technical_html = ""
+     if result.technical_metrics:
+         tm = result.technical_metrics
+         technical_html = f"""
+         <div style="padding: 12px; background: #1f2937; border-radius: 8px;">
+             <h3 style="margin: 0 0 8px 0; color: #f3f4f6;">Technical Metrics</h3>
+             <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 8px; color: #d1d5db;">
+         """
+
+         if tm.clip_score is not None:
+             technical_html += f"<div>CLIP Score: {tm.clip_score:.3f}</div>"
+         if tm.sharpness is not None:
+             technical_html += f"<div>Sharpness: {tm.sharpness:.3f}</div>"
+         if tm.colorfulness is not None:
+             technical_html += f"<div>Colorfulness: {tm.colorfulness:.3f}</div>"
+         if tm.contrast is not None:
+             technical_html += f"<div>Contrast: {tm.contrast:.3f}</div>"
+
+         technical_html += "</div></div>"
+
+     return (
+         overall_html,
+         breakdown_html,
+         soft_tifa_html,
+         vlm_html,
+         technical_html
+     )
+
+
+ def evaluate_edit(
+     source_image: Image.Image,
+     edited_image: Image.Image,
+     instruction: str,
+     progress=gr.Progress(),
+ ) -> tuple:
+     """Evaluate an image editing result."""
+     if source_image is None or edited_image is None:
+         return (
+             "Please upload both source and edited images.",
+             "", "", "", ""
+         )
+
+     if not instruction.strip():
+         return (
+             "Please enter the editing instruction.",
+             "", "", "", ""
+         )
+
+     progress(0.1, desc="Loading models...")
+
+     try:
+         evaluator = get_edit_evaluator()
+     except Exception as e:
+         return (
+             f"Error loading models: {str(e)}",
+             "", "", "", ""
+         )
+
+     progress(0.2, desc="Starting evaluation...")
+
+     try:
+         result = evaluator.evaluate(
+             source_image=source_image,
+             edited_image=edited_image,
+             instruction=instruction.strip(),
+         )
+     except Exception as e:
+         return (
+             f"Evaluation error: {str(e)}",
+             "", "", "", ""
+         )
+
+     progress(0.9, desc="Formatting results...")
+
+     # Format overall score
+     score = result.score
+     overall_html = f"""
+     <div style="padding: 16px; background: #1f2937; border-radius: 8px; margin-bottom: 16px;">
+         <h2 style="margin: 0 0 12px 0; color: #f3f4f6;">Edit Quality Score</h2>
+         <div style="font-size: 2.5em; font-weight: bold; color: #60a5fa; margin-bottom: 8px;">
+             {score.overall:.3f}
+         </div>
+         {format_grade_badge(score.grade, score.passed)}
+         <div style="margin-top: 12px; color: #9ca3af;">
+             Confidence: {score.confidence:.0%} | Time: {result.evaluation_time:.1f}s
+         </div>
+         <div style="margin-top: 8px; padding: 8px; background: #374151; border-radius: 4px; color: #d1d5db;">
+             {score.recommendation}
+         </div>
+     </div>
+     """
+
+     # Format breakdown
+     breakdown = score.breakdown
+     breakdown_html = "<div style='display: grid; grid-template-columns: repeat(2, 1fr); gap: 12px;'>"
+
+     metrics = [
+         ("Instruction Following", breakdown.instruction_following),
+         ("Preservation", breakdown.preservation),
+         ("Edit Quality", breakdown.edit_quality),
+         ("Artifacts (inv)", breakdown.artifacts),
+     ]
+
+     for name, value in metrics:
+         if value is not None:
+             bar_width = int(value * 100)
+             if value >= 0.7:
+                 bar_color = "#22c55e"
+             elif value >= 0.5:
+                 bar_color = "#eab308"
+             else:
+                 bar_color = "#ef4444"
+
+             breakdown_html += f"""
+             <div style="background: #374151; padding: 12px; border-radius: 6px;">
+                 <div style="color: #9ca3af; font-size: 0.85em; margin-bottom: 4px;">{name}</div>
+                 <div style="font-weight: bold; color: #f3f4f6; margin-bottom: 6px;">{value:.3f}</div>
+                 <div style="background: #1f2937; border-radius: 4px; height: 6px; overflow: hidden;">
+                     <div style="background: {bar_color}; width: {bar_width}%; height: 100%;"></div>
+                 </div>
+             </div>
+             """
+
+     breakdown_html += "</div>"
+
+     # Format instruction following details
+     instruction_html = ""
+     if result.instruction_following:
+         inst = result.instruction_following
+         instruction_html = f"""
+         <div style="padding: 12px; background: #1f2937; border-radius: 8px;">
+             <h3 style="margin: 0 0 8px 0; color: #f3f4f6;">Instruction Following</h3>
+             <div style="color: #9ca3af; margin-bottom: 8px;">
+                 Overall: {inst.overall_score:.3f}
+             </div>
+         """
+
+         if inst.primitive_scores:
+             instruction_html += "<div style='max-height: 150px; overflow-y: auto;'>"
+             for ps in inst.primitive_scores[:5]:
+                 score_val = ps.get("score", 0) / 10.0
+                 icon = "o" if score_val >= 0.7 else "x"
+                 color = "#22c55e" if score_val >= 0.7 else "#ef4444"
+                 instruction_html += f"""
+                 <div style="padding: 6px; border-bottom: 1px solid #374151;">
+                     <span style="color: {color};">[{icon}]</span> {ps.get('edit', 'N/A')}: {ps.get('score', 0):.1f}/10
+                 </div>
+                 """
+             instruction_html += "</div>"
+
+         instruction_html += "</div>"
+
+     # Format preservation details
+     preservation_html = ""
+     if result.preservation:
+         pres = result.preservation
+         preservation_html = f"""
+         <div style="padding: 12px; background: #1f2937; border-radius: 8px;">
+             <h3 style="margin: 0 0 8px 0; color: #f3f4f6;">Preservation Metrics</h3>
+             <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 8px; color: #d1d5db;">
+         """
+
+         if pres.lpips_score is not None:
+             preservation_html += f"<div>LPIPS: {pres.lpips_score:.3f} (lower=better)</div>"
+         if pres.ssim_score is not None:
+             preservation_html += f"<div>SSIM: {pres.ssim_score:.3f}</div>"
+         if pres.psnr_score is not None:
+             preservation_html += f"<div>PSNR: {pres.psnr_score:.3f}</div>"
+
+         preservation_html += f"<div><strong>Combined: {pres.overall_score:.3f}</strong></div>"
+         preservation_html += "</div></div>"
+
+     # Format edit quality details
+     quality_html = ""
+     if result.edit_quality:
+         eq = result.edit_quality
+         quality_html = f"""
+         <div style="padding: 12px; background: #1f2937; border-radius: 8px;">
+             <h3 style="margin: 0 0 8px 0; color: #f3f4f6;">Edit Quality Assessment</h3>
+             <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 8px; color: #d1d5db;">
+                 <div>Technical: {eq.technical_score:.1f}/10</div>
+                 <div>Aesthetic: {eq.aesthetic_score:.1f}/10</div>
+                 <div>Coherence: {eq.coherence_score:.1f}/10</div>
+                 <div>Overall: {eq.overall_score:.3f}</div>
+             </div>
+         """
+
+         if eq.artifacts:
+             quality_html += f"""
+             <div style="margin-top: 8px; padding: 8px; background: #7f1d1d; border-radius: 4px;">
+                 <strong>Artifacts ({eq.artifact_severity}):</strong> {', '.join(eq.artifacts[:5])}
+             </div>
+             """
+
+         quality_html += "</div>"
+
+     return (
+         overall_html,
+         breakdown_html,
+         instruction_html,
+         preservation_html,
+         quality_html
+     )
+
+
+ # Create Gradio interface
+ with gr.Blocks(
+     title="Fal Image Evaluator",
+     theme=gr.themes.Soft(
+         primary_hue="blue",
+         secondary_hue="slate",
+     ),
+     css="""
+     .gradio-container { max-width: 1200px !important; }
+     .score-display { font-size: 1.5em; font-weight: bold; }
+     """
+ ) as demo:
+
+     gr.Markdown("""
+     # Fal Image Evaluator
+
+     AI image quality assessment using **Soft-TIFA**, **VLM-as-Judge**, and technical metrics.
+
+     - **Image Evaluation**: Assess text-to-image generation quality
+     - **Edit Evaluation**: Assess image editing quality
+     """)
+
+     with gr.Tabs():
+         # Image Evaluation Tab
+         with gr.TabItem("Image Evaluation"):
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     image_input = gr.Image(
+                         label="Upload Image",
+                         type="pil",
+                         height=400,
+                     )
+                     prompt_input = gr.Textbox(
+                         label="Generation Prompt (optional but recommended)",
+                         placeholder="Enter the prompt used to generate this image...",
+                         lines=3,
+                     )
+
+                     with gr.Row():
+                         soft_tifa_check = gr.Checkbox(
+                             label="Soft-TIFA",
+                             value=True,
+                             info="Prompt alignment (requires prompt)"
+                         )
+                         vlm_check = gr.Checkbox(
+                             label="VLM-as-Judge",
+                             value=True,
+                             info="Holistic assessment"
+                         )
+                         technical_check = gr.Checkbox(
+                             label="Technical Metrics",
+                             value=True,
+                             info="CLIP, sharpness, etc."
+                         )
+
+                     evaluate_btn = gr.Button("Evaluate Image", variant="primary", size="lg")
+
+                 with gr.Column(scale=1):
+                     overall_output = gr.HTML(label="Overall Score")
+                     breakdown_output = gr.HTML(label="Score Breakdown")
+
+             with gr.Row():
+                 soft_tifa_output = gr.HTML(label="Soft-TIFA Details")
+                 vlm_output = gr.HTML(label="VLM Assessment")
+                 technical_output = gr.HTML(label="Technical Metrics")
+
+             evaluate_btn.click(
+                 fn=evaluate_image,
+                 inputs=[
+                     image_input,
+                     prompt_input,
+                     soft_tifa_check,
+                     vlm_check,
+                     technical_check,
+                 ],
+                 outputs=[
+                     overall_output,
+                     breakdown_output,
+                     soft_tifa_output,
+                     vlm_output,
+                     technical_output,
+                 ],
+             )
+
+         # Edit Evaluation Tab
+         with gr.TabItem("Edit Evaluation"):
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     with gr.Row():
+                         source_input = gr.Image(
+                             label="Source Image (Before)",
+                             type="pil",
+                             height=300,
+                         )
+                         edited_input = gr.Image(
+                             label="Edited Image (After)",
+                             type="pil",
+                             height=300,
+                         )
+
+                     instruction_input = gr.Textbox(
+                         label="Edit Instruction",
+                         placeholder="Enter the editing instruction that was applied...",
+                         lines=2,
+                     )
+
+                     edit_btn = gr.Button("Evaluate Edit", variant="primary", size="lg")
+
+                 with gr.Column(scale=1):
+                     edit_overall_output = gr.HTML(label="Overall Score")
+                     edit_breakdown_output = gr.HTML(label="Score Breakdown")
+
+             with gr.Row():
+                 instruction_output = gr.HTML(label="Instruction Following")
+                 preservation_output = gr.HTML(label="Preservation")
+                 quality_output = gr.HTML(label="Edit Quality")
+
+             edit_btn.click(
+                 fn=evaluate_edit,
+                 inputs=[
+                     source_input,
+                     edited_input,
+                     instruction_input,
+                 ],
+                 outputs=[
+                     edit_overall_output,
+                     edit_breakdown_output,
+                     instruction_output,
+                     preservation_output,
+                     quality_output,
+                 ],
+             )
+
+     gr.Markdown("""
+     ---
+     **Components:**
+     - **Soft-TIFA**: Decomposes prompts into atomic primitives and verifies each via VQA
+     - **VLM-as-Judge**: Uses Qwen2.5-VL for holistic quality assessment
+     - **Technical Metrics**: CLIP score, sharpness (Laplacian), colorfulness, contrast
+     - **Preservation Metrics**: LPIPS, SSIM, PSNR for edit evaluation
+
+     *Powered by Qwen2.5-VL-7B-Instruct*
+     """)
+
+
+ if __name__ == "__main__":
+     demo.launch()
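Among the technical metrics listed in the app footer, "sharpness (Laplacian)" conventionally means the variance of a Laplacian-filtered grayscale image. Since `metrics.py` is not included above, here is a minimal NumPy sketch of that measure; treat it as illustrative, not as the repository's `calculate_sharpness`:

```python
import numpy as np

# 3x3 discrete Laplacian kernel (4-neighbor form)
LAPLACIAN = np.array([[0, 1, 0],
                      [1, -4, 1],
                      [0, 1, 0]], dtype=np.float64)


def laplacian_sharpness(gray: np.ndarray) -> float:
    """Variance of the Laplacian response over valid pixels.

    High-frequency detail (edges) produces a wide spread of Laplacian
    values, so a sharper image yields a larger variance; a blurred or
    flat image yields a value near zero.
    """
    h, w = gray.shape
    out = np.zeros((h - 2, w - 2), dtype=np.float64)
    # Explicit cross-correlation with the 3x3 kernel (valid region only)
    for dy in range(3):
        for dx in range(3):
            out += LAPLACIAN[dy, dx] * gray[dy:dy + h - 2, dx:dx + w - 2]
    return float(out.var())
```

A constant image scores exactly 0, while a high-contrast pattern such as a checkerboard scores high; real implementations often use `cv2.Laplacian(img, cv2.CV_64F).var()` and then normalize the raw variance into [0, 1].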
evaluator.py ADDED
@@ -0,0 +1,1049 @@
+ """
+ Image Evaluator Core Logic
+
+ Contains the main evaluation classes:
+ - ImageEvaluator: For text-to-image generation quality assessment
+ - EditEvaluator: For image editing quality assessment
+ """
+
+ import re
+ import math
+ import time
+ from typing import Optional, List, Dict, Any
+ from dataclasses import dataclass, field
+ from PIL import Image
+
+ from metrics import (
+     parse_json_robust,
+     calculate_sharpness,
+     calculate_colorfulness,
+     calculate_contrast,
+     calculate_ssim,
+     calculate_psnr,
+     calculate_clip_score,
+     calculate_lpips,
+     score_to_grade,
+     geometric_mean,
+ )
+
+
+ @dataclass
+ class PrimitiveResult:
+     """Result for a single Soft-TIFA primitive."""
+     content: str
+     type: str
+     question: str
+     answer: str
+     score: float
+     reasoning: Optional[str] = None
+
+
+ @dataclass
+ class SoftTIFAResult:
+     """Soft-TIFA evaluation result."""
+     primitives_count: int
+     atom_score: float
+     prompt_score: float
+     passed: bool
+     primitive_results: List[PrimitiveResult]
+
+
+ @dataclass
+ class VLMAssessmentResult:
+     """VLM-as-Judge assessment result."""
+     technical_quality: float
+     aesthetic_appeal: float
+     realism: float
+     semantic_accuracy: Optional[float]
+     artifacts_detected: List[str]
+     artifacts_severity: str
+     overall: float
+     reasoning: Optional[str] = None
+
+
+ @dataclass
+ class TechnicalMetricsResult:
+     """Technical metrics result."""
+     clip_score: Optional[float] = None
+     sharpness: Optional[float] = None
+     colorfulness: Optional[float] = None
+     contrast: Optional[float] = None
+
+
+ @dataclass
+ class ScoreBreakdown:
+     """Detailed score breakdown by category."""
+     prompt_alignment: Optional[float] = None
+     technical_quality: Optional[float] = None
+     aesthetic_appeal: Optional[float] = None
+     realism: Optional[float] = None
+     artifacts: Optional[float] = None
+
+
+ @dataclass
+ class AggregatedScore:
+     """Comprehensive aggregated scoring."""
+     overall: float
+     grade: str
+     passed: bool
+     confidence: float
+     breakdown: ScoreBreakdown
+     weights_used: Dict[str, float]
+     recommendation: str
+
+
+ @dataclass
+ class ImageEvalResult:
+     """Complete image evaluation result."""
+     score: AggregatedScore
+     soft_tifa: Optional[SoftTIFAResult] = None
+     vlm_assessment: Optional[VLMAssessmentResult] = None
+     technical_metrics: Optional[TechnicalMetricsResult] = None
+     evaluation_time: float = 0.0
+
+
+ @dataclass
+ class InstructionFollowingResult:
+     """Instruction following evaluation result."""
+     edit_primitives: List[Dict]
+     primitive_scores: List[Dict]
+     overall_score: float
+     reasoning: Optional[str] = None
+
+
+ @dataclass
+ class PreservationResult:
+     """Preservation evaluation result."""
+     lpips_score: Optional[float] = None
+     ssim_score: Optional[float] = None
+     psnr_score: Optional[float] = None
+     overall_score: float = 0.0
+
+
+ @dataclass
+ class EditQualityResult:
+     """Edit quality assessment result."""
+     technical_score: float
+     aesthetic_score: float
+     coherence_score: float
+     artifacts: List[str]
+     artifact_severity: str
+     overall_score: float
+     reasoning: Optional[str] = None
+
+
+ @dataclass
+ class EditScoreBreakdown:
+     """Detailed score breakdown for editing evaluation."""
+     instruction_following: Optional[float] = None
+     preservation: Optional[float] = None
+     edit_quality: Optional[float] = None
+     artifacts: Optional[float] = None
+
+
+ @dataclass
+ class EditAggregatedScore:
+     """Comprehensive aggregated scoring for editing."""
+     overall: float
+     grade: str
+     passed: bool
+     confidence: float
+     breakdown: EditScoreBreakdown
+     weights_used: Dict[str, float]
+     recommendation: str
+
+
+ @dataclass
+ class EditEvalResult:
+     """Complete edit evaluation result."""
+     score: EditAggregatedScore
+     instruction_following: Optional[InstructionFollowingResult] = None
+     preservation: Optional[PreservationResult] = None
+     edit_quality: Optional[EditQualityResult] = None
+     evaluation_time: float = 0.0
+
+
+ class ImageEvaluator:
+     """
+     AI-Generated Image Quality Evaluator
+
+     Evaluates AI-generated images using:
+     - Soft-TIFA: Atomic prompt decomposition for precise alignment scoring
+     - VLM-as-Judge: Human-like holistic assessment with reasoning
+     - Technical Metrics: Sharpness, colorfulness, contrast, CLIP score
+     """
+
+     def __init__(self, device: str = "cuda"):
+         """Initialize evaluator with models."""
+         import torch
+         from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+
+         self.device = device if torch.cuda.is_available() else "cpu"
+
+         # Load Qwen2.5-VL for VLM-as-Judge and Soft-TIFA
+         model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
+
+         self.vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+             model_name,
+             device_map="auto",
+             torch_dtype=torch.float16,
+         )
+         self.vlm_processor = AutoProcessor.from_pretrained(model_name)
+
+         # Load CLIP for text-image alignment
+         import open_clip
+         self.clip_model, _, self.clip_preprocess = open_clip.create_model_and_transforms(
+             'ViT-B-32', pretrained='openai'
+         )
+         self.clip_model = self.clip_model.to(self.device).eval()
+         self.clip_tokenizer = open_clip.get_tokenizer('ViT-B-32')
+
+     def _vlm_generate(self, image: Image.Image, prompt: str) -> str:
+         """Generate response from VLM with image."""
+         import torch
+
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "image", "image": image},
+                     {"type": "text", "text": prompt},
+                 ],
+             }
+         ]
+
+         text = self.vlm_processor.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+
+         inputs = self.vlm_processor(
+             text=[text],
+             images=[image],
+             return_tensors="pt",
+         ).to(self.vlm_model.device)
+
+         with torch.no_grad():
+             outputs = self.vlm_model.generate(
+                 **inputs,
+                 max_new_tokens=1024,
+                 do_sample=False,
+             )
+
+         generated = outputs[0][inputs.input_ids.shape[1]:]
+         return self.vlm_processor.decode(generated, skip_special_tokens=True)
+
+     def _vlm_text_generate(self, prompt: str) -> str:
+         """Generate response from VLM (text only)."""
+         import torch
+
+         messages = [{"role": "user", "content": prompt}]
+
+         text = self.vlm_processor.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+
+         inputs = self.vlm_processor(
+             text=[text],
+             return_tensors="pt",
+         ).to(self.vlm_model.device)
+
+         with torch.no_grad():
+             outputs = self.vlm_model.generate(
+                 **inputs,
+                 max_new_tokens=1024,
+                 do_sample=False,
+             )
+
+         generated = outputs[0][inputs.input_ids.shape[1]:]
+         return self.vlm_processor.decode(generated, skip_special_tokens=True)
+
+     def evaluate_soft_tifa(self, image: Image.Image, prompt: str) -> SoftTIFAResult:
+         """Run Soft-TIFA evaluation with atomic prompt decomposition."""
+         # Step 1: Decompose prompt into primitives
+         decomposition_prompt = f'''Decompose this text-to-image prompt into atomic visual primitives.
+
+ Prompt: "{prompt}"
+
+ For each primitive, identify:
+ - content: The specific visual element (e.g., "a red car", "sunset sky")
+ - type: One of [object, attribute, count, relation, action, scene, style]
+ - importance: How critical (0.5-1.0)
+
+ Example for "A cat sitting on a red chair":
+ [
+   {{"content": "cat", "type": "object", "importance": 1.0}},
+   {{"content": "chair", "type": "object", "importance": 0.9}},
+   {{"content": "red chair", "type": "attribute", "importance": 0.8}},
+   {{"content": "cat sitting on chair", "type": "relation", "importance": 0.9}}
+ ]
+
+ Return ONLY valid JSON array for the given prompt:'''
+
+         decomp_response = self._vlm_text_generate(decomposition_prompt)
+         primitives = parse_json_robust(decomp_response, fallback=[])
+
+         if not primitives or not isinstance(primitives, list):
+             return SoftTIFAResult(
+                 primitives_count=0,
+                 atom_score=0.0,
+                 prompt_score=0.0,
+                 passed=False,
+                 primitive_results=[],
+             )
+
+         # Step 2: Evaluate each primitive via VQA
+         primitive_results = []
+         vqa_templates = {
+             "object": "Is there a {content} in this image?",
+             "attribute": "Does the image show {content}?",
+             "count": "Are there {content}?",
+             "relation": "Is it true that {content}?",
+             "action": "Is {content} happening in this image?",
+             "scene": "Does this image depict {content}?",
+             "style": "Is this image in {content} style?",
+         }
+
+         for prim in primitives[:20]:  # Limit to 20 primitives
+             content = prim.get("content", "")
+             ptype = prim.get("type", "object")
+
+             template = vqa_templates.get(ptype, vqa_templates["object"])
+             question = template.format(content=content)
+
+             vqa_prompt = f"""{question}
+ Answer Yes or No with confidence (0-100%).
315
+ Format: [Yes/No] (confidence: X%) - brief reasoning"""
316
+
317
+ response = self._vlm_generate(image, vqa_prompt)
318
+
319
+ # Parse response
320
+ answer = "no"
321
+ confidence = 0.5
322
+ reasoning = None
323
+
324
+ response_lower = response.lower().strip()
325
+ if response_lower.startswith("yes") or "[yes]" in response_lower:
326
+ answer = "yes"
327
+
328
+ conf_match = re.search(r'confidence[:\s]*(\d+)%?', response_lower)
329
+ if conf_match:
330
+ confidence = float(conf_match.group(1)) / 100.0
331
+
332
+ if "-" in response:
333
+ parts = response.split("-", 1)
334
+ if len(parts) > 1:
335
+ reasoning = parts[1].strip()[:200]
336
+
337
+ # Calculate score
338
+ score = confidence if answer == "yes" else (1.0 - confidence)
339
+
340
+ primitive_results.append(PrimitiveResult(
341
+ content=content,
342
+ type=ptype,
343
+ question=question,
344
+ answer=answer,
345
+ score=score,
346
+ reasoning=reasoning,
347
+ ))
348
+
349
+ # Aggregate scores
350
+ if primitive_results:
351
+ atom_score = sum(r.score for r in primitive_results) / len(primitive_results)
352
+ geo_mean = geometric_mean([r.score for r in primitive_results])
353
+ prompt_score = 0.7 * atom_score + 0.3 * geo_mean
354
+ else:
355
+ atom_score = 0.0
356
+ prompt_score = 0.0
357
+
358
+ return SoftTIFAResult(
359
+ primitives_count=len(primitive_results),
360
+ atom_score=atom_score,
361
+ prompt_score=prompt_score,
362
+ passed=prompt_score >= 0.7,
363
+ primitive_results=primitive_results,
364
+ )
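The aggregation at the end of `evaluate_soft_tifa` blends the arithmetic and geometric means of the per-primitive scores. A minimal standalone sketch, with `geometric_mean` re-implemented locally (an assumption about the helper's behavior, not the actual helper):

```python
import math

def geometric_mean(values):
    # Local stand-in for the helper used above: nth root of the product,
    # with a small floor so zero scores don't blow up the log.
    if not values:
        return 0.0
    return math.exp(sum(math.log(max(v, 1e-9)) for v in values) / len(values))

def soft_tifa_prompt_score(primitive_scores):
    """Blend: 70% arithmetic mean, 30% geometric mean. The geometric
    term punishes prompts where any single primitive scores near zero."""
    if not primitive_scores:
        return 0.0
    atom = sum(primitive_scores) / len(primitive_scores)
    return 0.7 * atom + 0.3 * geometric_mean(primitive_scores)
```

A prompt where every primitive verifies scores 1.0; one completely failed primitive drags the geometric term toward zero even when the average stays moderate.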
+
+    def evaluate_vlm_judge(self, image: Image.Image, prompt: Optional[str]) -> VLMAssessmentResult:
+        """Run VLM-as-Judge holistic assessment."""
+        prompt_context = f'Original prompt: "{prompt}"' if prompt else ""
+        semantic_field = '"semantic_accuracy": {"score": 8, "reasoning": "matches prompt well"},' if prompt else ""
+
+        eval_prompt = f"""Evaluate this AI-generated image on multiple dimensions.
+{prompt_context}
+
+Rate each dimension from 1-10:
+- **Technical Quality**: Sharpness, noise level, color accuracy, resolution
+- **Aesthetic Appeal**: Composition, color harmony, visual balance, style
+- **Realism**: Physical plausibility, lighting consistency, proportions
+{('- **Semantic Accuracy**: How well it matches the prompt' if prompt else '')}
+- **AI Artifacts**: Detect issues like distorted faces/hands, extra limbs, text errors
+
+Example output:
+{{
+  "technical_quality": {{"score": 8, "reasoning": "sharp with good colors"}},
+  "aesthetic_appeal": {{"score": 7, "reasoning": "balanced composition"}},
+  "realism": {{"score": 6, "reasoning": "slightly off proportions"}},
+  {semantic_field}
+  "artifacts": {{"detected": ["slightly distorted fingers"], "severity": "minor"}},
+  "overall": {{"score": 7, "reasoning": "good quality with minor issues"}}
+}}
+
+Now evaluate this image and return ONLY valid JSON:"""
+
+        response = self._vlm_generate(image, eval_prompt)
+        data = parse_json_robust(response, fallback=None)
+
+        if data and isinstance(data, dict):
+            try:
+                def get_score(key: str, default: float = 5.0) -> float:
+                    val = data.get(key, {})
+                    if isinstance(val, dict):
+                        return float(val.get("score", default))
+                    return float(val) if val else default
+
+                artifacts = data.get("artifacts", {})
+                if isinstance(artifacts, dict):
+                    detected = artifacts.get("detected", [])
+                    severity = artifacts.get("severity", "unknown")
+                else:
+                    detected = []
+                    severity = "unknown"
+
+                return VLMAssessmentResult(
+                    technical_quality=get_score("technical_quality"),
+                    aesthetic_appeal=get_score("aesthetic_appeal"),
+                    realism=get_score("realism"),
+                    semantic_accuracy=get_score("semantic_accuracy") if prompt else None,
+                    artifacts_detected=detected if isinstance(detected, list) else [],
+                    artifacts_severity=severity if isinstance(severity, str) else "unknown",
+                    overall=get_score("overall"),
+                    reasoning=data.get("overall", {}).get("reasoning") if isinstance(data.get("overall"), dict) else None,
+                )
+            except (KeyError, TypeError, ValueError):
+                pass
+
+        # Fallback: neutral scores when the VLM response is unparseable
+        return VLMAssessmentResult(
+            technical_quality=5.0,
+            aesthetic_appeal=5.0,
+            realism=5.0,
+            semantic_accuracy=5.0 if prompt else None,
+            artifacts_detected=[],
+            artifacts_severity="unknown",
+            overall=5.0,
+        )
+
+    def evaluate_technical_metrics(self, image: Image.Image, prompt: Optional[str]) -> TechnicalMetricsResult:
+        """Calculate technical quality metrics."""
+        sharpness = None
+        colorfulness_score = None
+        contrast_score = None
+        clip_score = None
+
+        try:
+            sharpness = calculate_sharpness(image)
+        except Exception:
+            pass
+
+        try:
+            colorfulness_score = calculate_colorfulness(image)
+        except Exception:
+            pass
+
+        try:
+            contrast_score = calculate_contrast(image)
+        except Exception:
+            pass
+
+        if prompt:
+            clip_score = calculate_clip_score(
+                image, prompt,
+                self.clip_model, self.clip_preprocess, self.clip_tokenizer,
+                self.device
+            )
+
+        return TechnicalMetricsResult(
+            clip_score=clip_score,
+            sharpness=sharpness,
+            colorfulness=colorfulness_score,
+            contrast=contrast_score,
+        )
+
+    def _calculate_aggregated_score(
+        self,
+        soft_tifa: Optional[SoftTIFAResult],
+        vlm: Optional[VLMAssessmentResult],
+        technical: Optional[TechnicalMetricsResult],
+        has_prompt: bool,
+    ) -> AggregatedScore:
+        """Calculate the comprehensive aggregated score."""
+        # Prompt alignment: average of Soft-TIFA, VLM semantic accuracy, and CLIP
+        prompt_alignment_scores = []
+        if soft_tifa:
+            prompt_alignment_scores.append(soft_tifa.prompt_score)
+        if vlm and vlm.semantic_accuracy is not None:
+            prompt_alignment_scores.append(vlm.semantic_accuracy / 10.0)
+        if technical and technical.clip_score is not None:
+            prompt_alignment_scores.append(technical.clip_score)
+
+        prompt_alignment = sum(prompt_alignment_scores) / len(prompt_alignment_scores) if prompt_alignment_scores else None
+
+        # Technical quality
+        tech_scores = []
+        if technical:
+            if technical.sharpness is not None:
+                tech_scores.append(technical.sharpness)
+            if technical.contrast is not None:
+                tech_scores.append(technical.contrast)
+        if vlm:
+            tech_scores.append(vlm.technical_quality / 10.0)
+
+        technical_quality = sum(tech_scores) / len(tech_scores) if tech_scores else None
+
+        # Aesthetic appeal
+        aesthetic_scores = []
+        if technical and technical.colorfulness is not None:
+            aesthetic_scores.append(technical.colorfulness)
+        if vlm:
+            aesthetic_scores.append(vlm.aesthetic_appeal / 10.0)
+
+        aesthetic_appeal = sum(aesthetic_scores) / len(aesthetic_scores) if aesthetic_scores else None
+
+        # Realism
+        realism = vlm.realism / 10.0 if vlm else None
+
+        # Artifacts: map the severity label to a score
+        artifacts_score = None
+        if vlm:
+            severity_map = {"none": 1.0, "minor": 0.85, "moderate": 0.6, "major": 0.3, "unknown": 0.7}
+            artifacts_score = severity_map.get(vlm.artifacts_severity, 0.7)
+
+        # Weighted overall: missing metrics drop out and the weights renormalize
+        score_map = {
+            "prompt_alignment": prompt_alignment,
+            "technical_quality": technical_quality,
+            "aesthetic_appeal": aesthetic_appeal,
+            "realism": realism,
+            "artifacts": artifacts_score,
+        }
+
+        category_weights = {
+            "prompt_alignment": 0.30 if has_prompt else 0.0,
+            "technical_quality": 0.25,
+            "aesthetic_appeal": 0.20,
+            "realism": 0.15,
+            "artifacts": 0.10,
+        }
+
+        weighted_sum = 0.0
+        total_weight = 0.0
+
+        for key, score in score_map.items():
+            if score is not None:
+                weight = category_weights[key]
+                weighted_sum += score * weight
+                total_weight += weight
+
+        overall = weighted_sum / total_weight if total_weight > 0 else 0.0
+
+        # Confidence: the fraction of metrics that were actually computed
+        max_metrics = 5 if has_prompt else 4
+        available_metrics = sum(1 for s in score_map.values() if s is not None)
+        confidence = available_metrics / max_metrics
+
+        recommendation = self._generate_recommendation(score_map, overall)
+
+        # Normalized weights (guard against division by zero when no metric ran)
+        normalized_weights = (
+            {k: v / total_weight for k, v in category_weights.items() if score_map.get(k) is not None}
+            if total_weight > 0 else {}
+        )
+
+        return AggregatedScore(
+            overall=round(overall, 3),
+            grade=score_to_grade(overall),
+            passed=overall >= 0.7,
+            confidence=round(confidence, 2),
+            breakdown=ScoreBreakdown(
+                prompt_alignment=round(prompt_alignment, 3) if prompt_alignment is not None else None,
+                technical_quality=round(technical_quality, 3) if technical_quality is not None else None,
+                aesthetic_appeal=round(aesthetic_appeal, 3) if aesthetic_appeal is not None else None,
+                realism=round(realism, 3) if realism is not None else None,
+                artifacts=round(artifacts_score, 3) if artifacts_score is not None else None,
+            ),
+            weights_used=normalized_weights,
+            recommendation=recommendation,
+        )
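The weight handling in `_calculate_aggregated_score` can be isolated: missing (None) metrics simply drop out, and the remaining weights renormalize. A minimal sketch of that rule:

```python
def weighted_overall(score_map, weights):
    """Weighted mean over available (non-None) metrics; weights of
    missing metrics are excluded, so the result stays in [0, 1]."""
    weighted_sum = 0.0
    total_weight = 0.0
    for key, score in score_map.items():
        if score is not None:
            weighted_sum += score * weights[key]
            total_weight += weights[key]
    return weighted_sum / total_weight if total_weight > 0 else 0.0
```

With only one metric available, the overall score equals that metric, because its weight renormalizes to 1.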
+
+    def _generate_recommendation(self, scores: Dict, overall: float) -> str:
+        """Generate a recommendation based on the weakest scoring category."""
+        weakest = None
+        weakest_score = 1.0
+
+        for key, score in scores.items():
+            if score is not None and score < weakest_score:
+                weakest_score = score
+                weakest = key
+
+        if overall >= 0.85:
+            return "Excellent quality image. Ready for production use."
+        elif overall >= 0.70:
+            if weakest and weakest_score < 0.7:
+                suggestions = {
+                    "prompt_alignment": "Consider regenerating with a clearer prompt.",
+                    "technical_quality": "Image has quality issues. Try a higher resolution.",
+                    "aesthetic_appeal": "Composition could be improved.",
+                    "realism": "Physical inconsistencies detected.",
+                    "artifacts": "AI artifacts present. Consider regeneration.",
+                }
+                return f"Good overall. Improvement: {suggestions.get(weakest, weakest)}"
+            return "Good quality image. Minor improvements possible."
+        elif overall >= 0.50:
+            return f"Moderate quality. Main issue: {weakest.replace('_', ' ') if weakest else 'overall'}."
+        else:
+            return "Low quality. Regeneration strongly recommended."
+
+    def evaluate(
+        self,
+        image: Image.Image,
+        prompt: Optional[str] = None,
+        include_soft_tifa: bool = True,
+        include_vlm: bool = True,
+        include_technical: bool = True,
+    ) -> ImageEvalResult:
+        """
+        Evaluate an AI-generated image.
+
+        Args:
+            image: PIL Image to evaluate
+            prompt: Optional text prompt used to generate the image
+            include_soft_tifa: Run Soft-TIFA evaluation (requires prompt)
+            include_vlm: Run VLM-as-Judge assessment
+            include_technical: Calculate technical metrics
+
+        Returns:
+            ImageEvalResult with all evaluation components
+        """
+        start_time = time.time()
+
+        soft_tifa_result = None
+        vlm_result = None
+        technical_result = None
+
+        if include_soft_tifa and prompt:
+            soft_tifa_result = self.evaluate_soft_tifa(image, prompt)
+
+        if include_vlm:
+            vlm_result = self.evaluate_vlm_judge(image, prompt)
+
+        if include_technical:
+            technical_result = self.evaluate_technical_metrics(image, prompt)
+
+        aggregated = self._calculate_aggregated_score(
+            soft_tifa=soft_tifa_result,
+            vlm=vlm_result,
+            technical=technical_result,
+            has_prompt=prompt is not None,
+        )
+
+        return ImageEvalResult(
+            score=aggregated,
+            soft_tifa=soft_tifa_result,
+            vlm_assessment=vlm_result,
+            technical_metrics=technical_result,
+            evaluation_time=time.time() - start_time,
+        )
+
+
+class EditEvaluator:
+    """
+    Image Editing Evaluator
+
+    Evaluates instruction-based image editing using:
+    - Instruction Following: Were the requested edits applied?
+    - Preservation: Were non-edited regions maintained?
+    - Edit Quality: Is the edit seamless and high-quality?
+    """
+
+    def __init__(self, device: str = "cuda"):
+        """Initialize the evaluator and load models."""
+        import torch
+        from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+        import lpips
+
+        self.device = device if torch.cuda.is_available() else "cpu"
+
+        # Load Qwen2.5-VL
+        model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
+
+        self.vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_name,
+            device_map="auto",
+            torch_dtype=torch.float16,
+        )
+        self.vlm_processor = AutoProcessor.from_pretrained(model_name)
+
+        # Load LPIPS
+        self.lpips_model = lpips.LPIPS(net='alex').to(self.device)
+
+    def _vlm_generate(self, image: Image.Image, prompt: str) -> str:
+        """Generate a response from the VLM given an image and a text prompt."""
+        import torch
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        ]
+
+        text = self.vlm_processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+
+        inputs = self.vlm_processor(
+            text=[text],
+            images=[image],
+            return_tensors="pt",
+        ).to(self.vlm_model.device)
+
+        with torch.no_grad():
+            outputs = self.vlm_model.generate(
+                **inputs,
+                max_new_tokens=1024,
+                do_sample=False,
+            )
+
+        generated = outputs[0][inputs.input_ids.shape[1]:]
+        return self.vlm_processor.decode(generated, skip_special_tokens=True)
+
+    def _vlm_text_generate(self, prompt: str) -> str:
+        """Generate a response from the VLM (text only)."""
+        import torch
+
+        messages = [{"role": "user", "content": prompt}]
+
+        text = self.vlm_processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+
+        inputs = self.vlm_processor(
+            text=[text],
+            return_tensors="pt",
+        ).to(self.vlm_model.device)
+
+        with torch.no_grad():
+            outputs = self.vlm_model.generate(
+                **inputs,
+                max_new_tokens=1024,
+                do_sample=False,
+            )
+
+        generated = outputs[0][inputs.input_ids.shape[1]:]
+        return self.vlm_processor.decode(generated, skip_special_tokens=True)
+
+    def evaluate_instruction_following(self, edited_image: Image.Image, instruction: str) -> InstructionFollowingResult:
+        """Evaluate whether the editing instruction was followed."""
+        decomp_prompt = f'''Analyze this image editing instruction and decompose into atomic edits.
+
+Instruction: "{instruction}"
+
+Example for "Change the sky to sunset and add a bird":
+{{
+  "edits": [
+    {{"content": "change sky color to sunset", "type": "modify", "target": "sky", "expected_result": "orange/purple sunset sky"}},
+    {{"content": "add a bird", "type": "add", "target": "sky area", "expected_result": "visible bird in the scene"}}
+  ]
+}}
+
+Return ONLY valid JSON for the given instruction:'''
+
+        decomp_response = self._vlm_text_generate(decomp_prompt)
+        data = parse_json_robust(decomp_response, fallback={})
+        edits = data.get("edits", []) if isinstance(data, dict) else []
+
+        if not edits or not isinstance(edits, list):
+            # Fallback: evaluate the instruction holistically
+            verify_prompt = f'''Evaluate if this image correctly shows the result of the edit:
+
+Edit instruction: "{instruction}"
+
+Rate success from 0-10.
+Format: Score: X/10 - Reasoning'''
+
+            response = self._vlm_generate(edited_image, verify_prompt)
+            score_match = re.search(r'[Ss]core[:\s]*(\d+(?:\.\d+)?)\s*/\s*10', response)
+            score = float(score_match.group(1)) if score_match else 5.0
+
+            return InstructionFollowingResult(
+                edit_primitives=[{"content": instruction, "type": "unknown"}],
+                primitive_scores=[{"edit": instruction, "score": score}],
+                overall_score=score / 10.0,
+                reasoning=response[:200] if response else None,
+            )
+
+        # Verify each decomposed edit against the edited image
+        primitive_scores = []
+        for edit in edits[:10]:
+            content = edit.get("content", "")
+            target = edit.get("target", "the image")
+            expected = edit.get("expected_result", content)
+
+            verify_prompt = f'''Verify if this edit was applied:
+
+Edit: {content}
+Target: {target}
+Expected: {expected}
+
+Rate from 0-10.
+Format: Score: X/10 - Reasoning'''
+
+            response = self._vlm_generate(edited_image, verify_prompt)
+            score_match = re.search(r'[Ss]core[:\s]*(\d+(?:\.\d+)?)\s*/\s*10', response)
+            score = float(score_match.group(1)) if score_match else 5.0
+
+            primitive_scores.append({
+                "edit": content,
+                "score": score,
+                "reasoning": response[:100] if response else None,
+            })
+
+        overall = sum(p["score"] for p in primitive_scores) / len(primitive_scores) if primitive_scores else 0.0
+
+        return InstructionFollowingResult(
+            edit_primitives=edits[:10],
+            primitive_scores=primitive_scores,
+            overall_score=overall / 10.0,
+        )
+
+    def evaluate_preservation(self, source_image: Image.Image, edited_image: Image.Image) -> PreservationResult:
+        """Evaluate whether non-edited regions were preserved."""
+        other_scores = []
+
+        # LPIPS (perceptual distance; converted to a similarity in [0, 1])
+        lpips_score = calculate_lpips(source_image, edited_image, self.lpips_model, self.device)
+        lpips_similarity = max(0, 1 - lpips_score) if lpips_score is not None else None
+
+        # SSIM
+        ssim_score = None
+        try:
+            ssim_score = calculate_ssim(source_image, edited_image)
+            other_scores.append(ssim_score)
+        except Exception:
+            pass
+
+        # PSNR
+        psnr_score = None
+        try:
+            psnr_score = calculate_psnr(source_image, edited_image)
+            other_scores.append(psnr_score)
+        except Exception:
+            pass
+
+        # Combined score: LPIPS similarity gets half the weight,
+        # the pixel-level metrics (SSIM, PSNR) share the other half
+        if lpips_similarity is not None and other_scores:
+            preservation_score = lpips_similarity * 0.5 + (sum(other_scores) / len(other_scores)) * 0.5
+        elif lpips_similarity is not None:
+            preservation_score = lpips_similarity
+        elif other_scores:
+            preservation_score = sum(other_scores) / len(other_scores)
+        else:
+            preservation_score = 0.5
+
+        return PreservationResult(
+            lpips_score=lpips_score,
+            ssim_score=ssim_score,
+            psnr_score=psnr_score,
+            overall_score=preservation_score,
+        )
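The combination rule in `evaluate_preservation` — LPIPS similarity carries half the weight, while SSIM and PSNR share the other half — can be sketched in isolation:

```python
def combine_preservation(lpips_similarity, other_scores):
    """50% LPIPS similarity + 50% mean of the remaining pixel metrics
    (SSIM, PSNR); degrades gracefully when metrics are unavailable."""
    others = [s for s in other_scores if s is not None]
    if lpips_similarity is not None and others:
        return 0.5 * lpips_similarity + 0.5 * sum(others) / len(others)
    if lpips_similarity is not None:
        return lpips_similarity
    if others:
        return sum(others) / len(others)
    return 0.5  # neutral fallback when nothing was measurable
```

The perceptual metric is weighted more heavily than any single pixel metric because LPIPS tolerates small spatial shifts that SSIM and PSNR punish.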
+
+    def evaluate_edit_quality(self, edited_image: Image.Image, instruction: str) -> EditQualityResult:
+        """Evaluate the quality of the edit itself."""
+        eval_prompt = f'''Evaluate the quality of this edited image.
+
+Edit instruction: "{instruction}"
+
+Rate each dimension 1-10:
+- **Technical**: Seamless blending? Resolution consistent? No visible edit boundaries?
+- **Aesthetic**: Natural looking? Color harmony maintained? Visually pleasing?
+- **Coherence**: Physically plausible? Lighting/shadows consistent? Proper perspective?
+- **Artifacts**: List any issues (blur, color bleeding, unnatural edges, etc.)
+
+Example output:
+{{
+  "technical": {{"score": 8}},
+  "aesthetic": {{"score": 7}},
+  "coherence": {{"score": 8}},
+  "artifacts": {{"detected": ["slight blur at edge"], "severity": "minor"}}
+}}
+
+Return ONLY valid JSON:'''
+
+        response = self._vlm_generate(edited_image, eval_prompt)
+        data = parse_json_robust(response, fallback=None)
+
+        if data and isinstance(data, dict):
+            try:
+                def get_score(key: str, default: float = 5.0) -> float:
+                    val = data.get(key, {})
+                    if isinstance(val, dict):
+                        return float(val.get("score", default))
+                    return float(val) if val else default
+
+                technical = get_score("technical")
+                aesthetic = get_score("aesthetic")
+                coherence = get_score("coherence")
+
+                artifacts_data = data.get("artifacts", {})
+                if isinstance(artifacts_data, dict):
+                    artifacts = artifacts_data.get("detected", [])
+                    severity = artifacts_data.get("severity", "unknown")
+                else:
+                    artifacts = []
+                    severity = "unknown"
+
+                # Average the three dimensions, then apply an artifact penalty
+                overall = (technical + aesthetic + coherence) / 30.0
+                severity_penalties = {"major": 0.7, "moderate": 0.85, "minor": 0.95, "none": 1.0}
+                overall *= severity_penalties.get(severity, 0.9)
+
+                return EditQualityResult(
+                    technical_score=technical,
+                    aesthetic_score=aesthetic,
+                    coherence_score=coherence,
+                    artifacts=artifacts if isinstance(artifacts, list) else [],
+                    artifact_severity=severity if isinstance(severity, str) else "unknown",
+                    overall_score=overall,
+                )
+            except (KeyError, TypeError, ValueError):
+                pass
+
+        # Fallback: neutral scores when the VLM response is unparseable
+        return EditQualityResult(
+            technical_score=5.0,
+            aesthetic_score=5.0,
+            coherence_score=5.0,
+            artifacts=[],
+            artifact_severity="unknown",
+            overall_score=0.5,
+        )
+
+    def _calculate_edit_aggregated_score(
+        self,
+        instruction_result: InstructionFollowingResult,
+        preservation_result: PreservationResult,
+        quality_result: EditQualityResult,
+    ) -> EditAggregatedScore:
+        """Calculate the comprehensive aggregated score for an edit."""
+        weights = {
+            "instruction_following": 0.35,
+            "preservation": 0.25,
+            "edit_quality": 0.25,
+            "artifacts": 0.15,
+        }
+
+        instruction_score = instruction_result.overall_score
+        preservation_score = preservation_result.overall_score
+        edit_quality_score = quality_result.overall_score
+
+        severity_map = {"none": 1.0, "minor": 0.85, "moderate": 0.6, "major": 0.3, "unknown": 0.7}
+        artifacts_score = severity_map.get(quality_result.artifact_severity, 0.7)
+
+        overall = (
+            instruction_score * weights["instruction_following"] +
+            preservation_score * weights["preservation"] +
+            edit_quality_score * weights["edit_quality"] +
+            artifacts_score * weights["artifacts"]
+        )
+
+        # Confidence grows with the number of verified edit primitives
+        num_primitives = len(instruction_result.primitive_scores)
+        confidence = min(1.0, 0.5 + (num_primitives * 0.1))
+
+        recommendation = self._generate_edit_recommendation(
+            instruction_score, preservation_score, edit_quality_score, artifacts_score, overall
+        )
+
+        return EditAggregatedScore(
+            overall=round(overall, 3),
+            grade=score_to_grade(overall),
+            passed=overall >= 0.7,
+            confidence=round(confidence, 2),
+            breakdown=EditScoreBreakdown(
+                instruction_following=round(instruction_score, 3),
+                preservation=round(preservation_score, 3),
+                edit_quality=round(edit_quality_score, 3),
+                artifacts=round(artifacts_score, 3),
+            ),
+            weights_used=weights,
+            recommendation=recommendation,
+        )
+
+    def _generate_edit_recommendation(
+        self,
+        instruction: float,
+        preservation: float,
+        quality: float,
+        artifacts: float,
+        overall: float,
+    ) -> str:
+        """Generate a recommendation for edit quality."""
+        issues = []
+
+        if instruction < 0.6:
+            issues.append("instruction not fully followed")
+        if preservation < 0.6:
+            issues.append("too much content changed")
+        if quality < 0.6:
+            issues.append("edit quality issues")
+        if artifacts < 0.7:
+            issues.append("visible artifacts")
+
+        if overall >= 0.85:
+            return "Excellent edit. Ready for use."
+        elif overall >= 0.70:
+            if issues:
+                return f"Good edit with minor issues: {', '.join(issues[:2])}."
+            return "Good quality edit. Minor improvements possible."
+        elif overall >= 0.50:
+            if issues:
+                return f"Moderate quality. Issues: {', '.join(issues)}."
+            return "Moderate quality. Consider regenerating."
+        else:
+            return f"Low quality. Issues: {', '.join(issues) if issues else 'multiple problems'}."
+
+    def evaluate(
+        self,
+        source_image: Image.Image,
+        edited_image: Image.Image,
+        instruction: str,
+    ) -> EditEvalResult:
+        """
+        Evaluate an image editing result.
+
+        Args:
+            source_image: Original image before editing
+            edited_image: Image after editing
+            instruction: The editing instruction that was applied
+
+        Returns:
+            EditEvalResult with all evaluation components
+        """
+        start_time = time.time()
+
+        instruction_result = self.evaluate_instruction_following(edited_image, instruction)
+        preservation_result = self.evaluate_preservation(source_image, edited_image)
+        quality_result = self.evaluate_edit_quality(edited_image, instruction)
+
+        aggregated = self._calculate_edit_aggregated_score(
+            instruction_result=instruction_result,
+            preservation_result=preservation_result,
+            quality_result=quality_result,
+        )
+
+        return EditEvalResult(
+            score=aggregated,
+            instruction_following=instruction_result,
+            preservation=preservation_result,
+            edit_quality=quality_result,
+            evaluation_time=time.time() - start_time,
+        )
metrics.py ADDED
@@ -0,0 +1,285 @@
+"""
+Image Quality Metrics
+
+Helper functions for calculating various image quality metrics:
+- Technical metrics: sharpness, colorfulness, contrast
+- Preservation metrics: SSIM, PSNR
+- JSON parsing utilities for LLM outputs
+"""
+
+import re
+import json
+import math
+import numpy as np
+from typing import Any, Optional
+from PIL import Image
+
+
+def parse_json_robust(text: str, fallback: Any = None) -> Any:
+    """
+    Robustly parse JSON from LLM output.
+    Handles common issues like markdown code blocks and surrounding text.
+    """
+    if not text:
+        return fallback
+
+    # Try a direct parse first
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        pass
+
+    # Remove markdown code fences
+    text = re.sub(r'^```(?:json)?\s*', '', text, flags=re.MULTILINE)
+    text = re.sub(r'\s*```$', '', text, flags=re.MULTILINE)
+
+    # Try to find a JSON array
+    array_match = re.search(r'\[[\s\S]*?\](?=\s*$|\s*[,}\]]|$)', text)
+    if array_match:
+        try:
+            return json.loads(array_match.group())
+        except json.JSONDecodeError:
+            pass
+
+    # Try to find a JSON object
+    obj_match = re.search(r'\{[\s\S]*\}', text)
+    if obj_match:
+        try:
+            return json.loads(obj_match.group())
+        except json.JSONDecodeError:
+            # Try fixing common issues
+            json_str = obj_match.group()
+            # Replace single-quoted keys with double-quoted keys
+            json_str = re.sub(r"'([^']*)':", r'"\1":', json_str)
+            # Remove trailing commas
+            json_str = re.sub(r',\s*([}\]])', r'\1', json_str)
+            try:
+                return json.loads(json_str)
+            except json.JSONDecodeError:
+                pass
+
+    return fallback
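A condensed sketch of the same fallback chain, exercised on a typical VLM reply wrapped in a markdown fence (the `extract_json` name is illustrative, not part of the module above):

```python
import json
import re

def extract_json(text):
    """Condensed fallback chain: direct parse, then strip markdown
    fences, then grab the first {...} object found in the text."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    stripped = re.sub(r'^```(?:json)?\s*|\s*```$', '', text.strip(), flags=re.MULTILINE)
    match = re.search(r'\{[\s\S]*\}', stripped)
    return json.loads(match.group()) if match else None
```

This ordering matters: trying the raw text first avoids mangling responses that are already valid JSON.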
62
+
63
+
64
+ def calculate_sharpness(image: Image.Image) -> float:
65
+ """
66
+ Calculate image sharpness using Laplacian variance.
67
+ Higher values = sharper image.
68
+ Returns normalized score 0-1.
69
+ """
70
+ import torch
71
+ import torchvision.transforms as T
72
+ import torch.nn.functional as F
73
+
74
+ # Convert to grayscale tensor
75
+ transform = T.Compose([
76
+ T.Resize((512, 512)),
77
+ T.Grayscale(),
78
+ T.ToTensor(),
79
+ ])
80
+ tensor = transform(image)
81
+
82
+ # Laplacian kernel
83
+ laplacian = torch.tensor([
84
+ [0, 1, 0],
85
+ [1, -4, 1],
86
+ [0, 1, 0]
87
+ ], dtype=torch.float32).unsqueeze(0).unsqueeze(0)
88
+
89
+ # Apply convolution
90
+ output = F.conv2d(tensor.unsqueeze(0), laplacian, padding=1)
91
+
92
+ # Variance of Laplacian
93
+ variance = output.var().item()
94
+
95
+ # Normalize (typical range 0-0.1, cap at 0.05 for max score)
96
+ return min(1.0, variance / 0.05)
97
+
98
+
99
+ def calculate_colorfulness(image: Image.Image) -> float:
100
+ """
101
+ Calculate image colorfulness using Hasler and Susstrunk's method.
102
+ Returns normalized score 0-1.
103
+ """
104
+ img = np.array(image)
105
+
106
+ if len(img.shape) < 3 or img.shape[2] < 3:
107
+ return 0.0 # Grayscale image
108
+
109
+ R, G, B = img[:, :, 0], img[:, :, 1], img[:, :, 2]
110
+
111
+ rg = np.abs(R.astype(float) - G.astype(float))
112
+ yb = np.abs(0.5 * (R.astype(float) + G.astype(float)) - B.astype(float))
113
+
114
+ rg_mean, rg_std = np.mean(rg), np.std(rg)
115
+ yb_mean, yb_std = np.mean(yb), np.std(yb)
116
+
117
+ std_root = np.sqrt(rg_std ** 2 + yb_std ** 2)
118
+ mean_root = np.sqrt(rg_mean ** 2 + yb_mean ** 2)
119
+
120
+ colorfulness = std_root + 0.3 * mean_root
121
+
122
+ # Normalize (typical range 0-100, good images around 40-60)
123
+ return min(1.0, colorfulness / 100.0)
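The same Hasler-Susstrunk statistic can be computed directly on a NumPy array, without the PIL wrapper or the final /100 normalization (the `colorfulness_raw` name is illustrative):

```python
import numpy as np

def colorfulness_raw(rgb):
    """Hasler-Susstrunk colorfulness for an HxWx3 array (un-normalized).
    Gray images score 0; saturated images score higher."""
    R = rgb[..., 0].astype(float)
    G = rgb[..., 1].astype(float)
    B = rgb[..., 2].astype(float)
    rg = np.abs(R - G)               # red-green opponent channel
    yb = np.abs(0.5 * (R + G) - B)   # yellow-blue opponent channel
    std_root = np.hypot(np.std(rg), np.std(yb))
    mean_root = np.hypot(np.mean(rg), np.mean(yb))
    return float(std_root + 0.3 * mean_root)
```

A uniform gray image yields 0 because both opponent channels are identically zero; a saturated solid color already scores well above the 40-60 band typical of "good" photos.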
124
+
125
+
126
+ def calculate_contrast(image: Image.Image) -> float:
+     """
+     Calculate image contrast using standard deviation of luminance.
+     Returns normalized score 0-1.
+     """
+     img = np.array(image.convert('L'))  # Convert to grayscale
+     contrast = np.std(img) / 128.0  # Normalize by half of max value
+     return min(1.0, contrast)
+
+
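The std/128 normalization can be verified on two extremes (standalone sketch, not part of the commit): a 0/255 checkerboard has luminance std 127.5 and so scores just under 1.0, while any flat image scores 0.

```python
import numpy as np

# Contrast as std of luminance, normalized by 128 (half the 8-bit range)
checker = np.indices((8, 8)).sum(axis=0) % 2 * 255  # alternating 0/255
flat = np.full((8, 8), 200)
print(np.std(checker) / 128.0, np.std(flat) / 128.0)  # ~0.996 vs 0.0
```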
+ def calculate_ssim(image1: Image.Image, image2: Image.Image, window_size: int = 11) -> float:
+     """
+     Calculate a simplified, global Structural Similarity Index (SSIM)
+     between two images from whole-image statistics. No sliding window is
+     used; `window_size` is kept for API compatibility but unused here.
+     Returns score 0-1 where 1 means identical.
+     """
+     # Convert to grayscale numpy arrays
+     img1 = np.array(image1.convert('L').resize((256, 256)), dtype=np.float64)
+     img2 = np.array(image2.convert('L').resize((256, 256)), dtype=np.float64)
+
+     # Constants for numerical stability
+     C1 = (0.01 * 255) ** 2
+     C2 = (0.03 * 255) ** 2
+
+     # Calculate means
+     mu1 = np.mean(img1)
+     mu2 = np.mean(img2)
+
+     # Calculate variances and covariance
+     sigma1_sq = np.var(img1)
+     sigma2_sq = np.var(img2)
+     sigma12 = np.mean((img1 - mu1) * (img2 - mu2))
+
+     # SSIM formula
+     numerator = (2 * mu1 * mu2 + C1) * (2 * sigma12 + C2)
+     denominator = (mu1 ** 2 + mu2 ** 2 + C1) * (sigma1_sq + sigma2_sq + C2)
+
+     ssim = numerator / denominator
+     return float(max(0, min(1, ssim)))
+
+
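Two boundary cases make the global-SSIM formula easy to check (standalone sketch using the same constants and statistics as the function above): comparing an image with itself gives exactly 1, and comparing it with its negative drives the covariance term negative, so the raw SSIM drops below 0 (which the module then clamps to 0).

```python
import numpy as np

def global_ssim(a: np.ndarray, b: np.ndarray) -> float:
    """Whole-image SSIM on float64 arrays in [0, 255], no clamping."""
    C1, C2 = (0.01 * 255) ** 2, (0.03 * 255) ** 2
    mu1, mu2 = a.mean(), b.mean()
    s1, s2 = a.var(), b.var()
    s12 = ((a - mu1) * (b - mu2)).mean()  # covariance
    return ((2 * mu1 * mu2 + C1) * (2 * s12 + C2)) / (
        (mu1 ** 2 + mu2 ** 2 + C1) * (s1 + s2 + C2))

rng = np.random.default_rng(0)
img = rng.integers(0, 256, (64, 64)).astype(np.float64)
print(global_ssim(img, img))        # identical images -> 1.0
print(global_ssim(img, 255 - img))  # negative image -> covariance flips sign
```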
+ def calculate_psnr(image1: Image.Image, image2: Image.Image) -> float:
+     """
+     Calculate Peak Signal-to-Noise Ratio between two images.
+     Returns normalized score 0-1.
+     """
+     # Convert to matching mode and size so the arrays are comparable
+     img1 = np.array(image1.convert('RGB').resize((256, 256)), dtype=np.float64)
+     img2 = np.array(image2.convert('RGB').resize((256, 256)), dtype=np.float64)
+
+     # Calculate MSE
+     mse = np.mean((img1 - img2) ** 2)
+
+     if mse == 0:
+         return 1.0  # Identical images
+
+     # PSNR in dB
+     psnr = 10 * np.log10((255 ** 2) / mse)
+
+     # Normalize (20-50 dB range maps to 0-1)
+     normalized = (psnr - 20) / 30
+     return float(max(0, min(1, normalized)))
+
+
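The 20-50 dB normalization can be checked by hand (standalone sketch, not part of the commit): a uniform per-pixel error of `e` grey levels gives MSE = e², so PSNR = 20·log10(255/e).

```python
import math

def psnr_db(mse: float) -> float:
    """PSNR in dB for 8-bit images with the given MSE."""
    return 10 * math.log10(255 ** 2 / mse)

def normalize(psnr: float) -> float:
    """Map the 20-50 dB range onto a 0-1 score."""
    return max(0.0, min(1.0, (psnr - 20) / 30))

# Uniform error of 1 grey level: MSE = 1 -> ~48.1 dB -> score ~0.94
print(psnr_db(1.0), normalize(psnr_db(1.0)))
# Uniform error of 25 grey levels: MSE = 625 -> ~20.2 dB -> score ~0.006
print(psnr_db(625.0), normalize(psnr_db(625.0)))
```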
+ def calculate_clip_score(
+     image: Image.Image,
+     text: str,
+     clip_model,
+     clip_preprocess,
+     clip_tokenizer,
+     device: str = "cuda"
+ ) -> Optional[float]:
+     """
+     Calculate CLIP text-image alignment score.
+     Returns the raw cosine similarity in [-1, 1]; higher means better
+     alignment (well-matched pairs typically land around 0.25-0.35).
+     """
+     import torch
+
+     try:
+         image_input = clip_preprocess(image).unsqueeze(0).to(device)
+         text_input = clip_tokenizer([text]).to(device)
+
+         with torch.no_grad():
+             image_features = clip_model.encode_image(image_input)
+             text_features = clip_model.encode_text(text_input)
+
+             image_features = image_features / image_features.norm(dim=-1, keepdim=True)
+             text_features = text_features / text_features.norm(dim=-1, keepdim=True)
+
+             score = (image_features @ text_features.T).item()
+
+         return float(score)
+     except Exception:
+         return None
+
+
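The score is just the cosine similarity of the two L2-normalized embeddings, which is why its theoretical range is [-1, 1] rather than [0, 1]. A minimal sketch with toy vectors in place of real CLIP features:

```python
import numpy as np

def cosine(u: np.ndarray, v: np.ndarray) -> float:
    """Cosine similarity of two vectors after L2 normalization."""
    u = u / np.linalg.norm(u)
    v = v / np.linalg.norm(v)
    return float(u @ v)

aligned = cosine(np.array([3.0, 4.0]), np.array([6.0, 8.0]))      # same direction
orthogonal = cosine(np.array([1.0, 0.0]), np.array([0.0, 1.0]))   # unrelated
print(aligned, orthogonal)  # 1.0 and 0.0
```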
+ def calculate_lpips(
+     image1: Image.Image,
+     image2: Image.Image,
+     lpips_model,
+     device: str = "cuda"
+ ) -> Optional[float]:
+     """
+     Calculate LPIPS perceptual distance between two images.
+     Returns distance value (lower = more similar).
+     """
+     import torch
+     import torchvision.transforms as T
+
+     try:
+         transform = T.Compose([
+             T.Resize((512, 512)),
+             T.ToTensor(),
+         ])
+
+         # LPIPS expects inputs scaled to [-1, 1]
+         tensor1 = transform(image1).unsqueeze(0).to(device) * 2 - 1
+         tensor2 = transform(image2).unsqueeze(0).to(device) * 2 - 1
+
+         with torch.no_grad():
+             lpips_score = float(lpips_model(tensor1, tensor2).item())
+
+         return lpips_score
+     except Exception:
+         return None
+
+
+ def score_to_grade(score: float) -> str:
+     """Convert numeric score (0-1) to letter grade."""
+     if score >= 0.95:
+         return "A+"
+     elif score >= 0.90:
+         return "A"
+     elif score >= 0.85:
+         return "A-"
+     elif score >= 0.80:
+         return "B+"
+     elif score >= 0.75:
+         return "B"
+     elif score >= 0.70:
+         return "B-"
+     elif score >= 0.65:
+         return "C+"
+     elif score >= 0.60:
+         return "C"
+     elif score >= 0.55:
+         return "C-"
+     elif score >= 0.50:
+         return "D"
+     else:
+         return "F"
+
+
+ def geometric_mean(scores: list[float]) -> float:
+     """Calculate geometric mean of scores, clamping zeros to avoid log(0)."""
+     if not scores:
+         return 0.0
+
+     # Clamp to a small minimum so a single zero doesn't zero out the mean
+     clamped = [max(s, 0.01) for s in scores]
+     log_sum = sum(math.log(s) for s in clamped)
+     return math.exp(log_sum / len(clamped))
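The point of aggregating with a geometric rather than arithmetic mean is that one weak dimension drags the combined score down sharply; a standalone restatement of the function above makes this concrete:

```python
import math

def geometric_mean(scores):
    """Geometric mean with scores clamped to 0.01 to avoid log(0)."""
    if not scores:
        return 0.0
    clamped = [max(s, 0.01) for s in scores]
    return math.exp(sum(math.log(s) for s in clamped) / len(clamped))

scores = [0.9, 0.9, 0.1]             # two strong dimensions, one weak
arith = sum(scores) / len(scores)    # 0.633: the weak score is averaged away
geo = geometric_mean(scores)         # ~0.433: the weak score dominates
print(arith, geo)
```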
requirements.txt ADDED
@@ -0,0 +1,22 @@
+ # Core dependencies for HuggingFace Spaces
+ gradio>=4.0.0
+
+ # PyTorch (HF Spaces provides CUDA)
+ torch>=2.1.0
+ torchvision>=0.16.0
+
+ # Transformers for Qwen2.5-VL
+ transformers>=4.45.0
+ accelerate>=0.25.0
+ qwen-vl-utils>=0.0.8
+
+ # Image processing
+ Pillow>=10.0.0
+ numpy>=1.24.0
+
+ # Metrics
+ lpips>=0.1.4
+ open-clip-torch>=2.24.0
+
+ # Utilities
+ httpx>=0.25.0