"""
Image Evaluator Core Logic

Contains the main evaluation classes:
- ImageEvaluator: For text-to-image generation quality assessment
- EditEvaluator: For image editing quality assessment
"""

import re
import math
import time
from typing import Optional, List, Dict, Any
from dataclasses import dataclass, field
from PIL import Image

from metrics import (
    parse_json_robust,
    calculate_sharpness,
    calculate_colorfulness,
    calculate_contrast,
    calculate_ssim,
    calculate_psnr,
    calculate_clip_score,
    calculate_lpips,
    score_to_grade,
    geometric_mean,
)


@dataclass
class PrimitiveResult:
    """Outcome of checking one Soft-TIFA primitive against the image.

    ``ImageEvaluator.evaluate_soft_tifa`` creates one instance per primitive
    it asks the VLM about.
    """
    content: str  # Visual element taken from the decomposed prompt
    type: str  # Primitive category; selects the VQA question template
    question: str  # Yes/no question that was put to the VLM
    answer: str  # Parsed answer: "yes" or "no"
    score: float  # Stated confidence for "yes", 1 - confidence for "no"
    reasoning: Optional[str] = None  # Explanation extracted from the reply, if any


@dataclass
class SoftTIFAResult:
    """Aggregate Soft-TIFA result for one image/prompt pair.

    All scores are 0.0 and ``primitive_results`` is empty when the prompt
    could not be decomposed into primitives.
    """
    primitives_count: int  # Number of primitives that were actually evaluated
    atom_score: float  # Arithmetic mean of the per-primitive scores
    prompt_score: float  # 0.7 * atom_score + 0.3 * geometric mean of the scores
    passed: bool  # True when prompt_score >= 0.7
    primitive_results: List[PrimitiveResult]  # Per-primitive details, in evaluation order


@dataclass
class VLMAssessmentResult:
    """Holistic VLM-as-Judge assessment of a single image.

    The numeric fields hold the ratings the VLM was asked to give on a 1-10
    scale; ``ImageEvaluator.evaluate_vlm_judge`` substitutes 5.0 for values it
    cannot read.
    """
    technical_quality: float
    aesthetic_appeal: float
    realism: float
    semantic_accuracy: Optional[float]  # None when no prompt was supplied
    artifacts_detected: List[str]  # Artifact descriptions reported by the VLM
    artifacts_severity: str  # Aggregation recognises none/minor/moderate/major/unknown
    overall: float
    reasoning: Optional[str] = None  # The VLM's reasoning for the overall rating


@dataclass
class TechnicalMetricsResult:
    """Signal-level metrics computed by the helpers imported from ``metrics``.

    A field is None when the metric was not computed or its computation
    failed.

    NOTE(review): the aggregator averages these values with 0-1 scores, so the
    helpers presumably return values normalised to 0-1 — confirm in
    ``metrics``.
    """
    clip_score: Optional[float] = None  # Only computed when a prompt is supplied
    sharpness: Optional[float] = None
    colorfulness: Optional[float] = None
    contrast: Optional[float] = None


@dataclass
class ScoreBreakdown:
    """Per-category scores behind an ``AggregatedScore``.

    Each value is rounded to three decimals; a field is None when no input
    was available for that category.
    """
    prompt_alignment: Optional[float] = None  # Mean of Soft-TIFA, VLM semantic accuracy / 10 and CLIP score
    technical_quality: Optional[float] = None  # Mean of sharpness, contrast and VLM technical quality / 10
    aesthetic_appeal: Optional[float] = None  # Mean of colorfulness and VLM aesthetic appeal / 10
    realism: Optional[float] = None  # VLM realism rating / 10
    artifacts: Optional[float] = None  # Score mapped from the VLM's artifact severity label


@dataclass
class AggregatedScore:
    """Final weighted score for one image, with supporting detail."""
    overall: float  # Weighted mean of the available categories, rounded to 3 decimals
    grade: str  # Label returned by score_to_grade(overall)
    passed: bool  # True when overall >= 0.7
    confidence: float  # Share of expected categories that produced a score, rounded to 2 decimals
    breakdown: ScoreBreakdown  # Per-category scores
    weights_used: Dict[str, float]  # Renormalised weights of the categories that had a score
    recommendation: str  # Human-readable advice derived from the scores


@dataclass
class ImageEvalResult:
    """Everything ``ImageEvaluator.evaluate`` produced for one image.

    The optional sections are None when the corresponding stage was disabled
    or, for Soft-TIFA, when no prompt was given.
    """
    score: AggregatedScore  # Always present
    soft_tifa: Optional[SoftTIFAResult] = None
    vlm_assessment: Optional[VLMAssessmentResult] = None
    technical_metrics: Optional[TechnicalMetricsResult] = None
    evaluation_time: float = 0.0  # Elapsed seconds for the whole evaluation


@dataclass
class InstructionFollowingResult:
    """Instruction-following part of an edit evaluation.

    NOTE(review): the code that fills this in is outside the visible part of
    the file; the field notes below are inferred from the names — confirm.
    """
    edit_primitives: List[Dict]  # presumably the individual edits extracted from the instruction
    primitive_scores: List[Dict]  # presumably one scoring record per edit primitive
    overall_score: float
    reasoning: Optional[str] = None


@dataclass
class PreservationResult:
    """Preservation part of an edit evaluation (similarity to the source image).

    Every metric defaults to None and the overall score to 0.0.

    NOTE(review): scales and direction (higher/lower is better) of the
    individual metrics are decided by code outside the visible part of the
    file — confirm before interpreting them.
    """
    lpips_score: Optional[float] = None
    ssim_score: Optional[float] = None
    psnr_score: Optional[float] = None
    overall_score: float = 0.0


@dataclass
class EditQualityResult:
    """Edit-quality part of an edit evaluation.

    NOTE(review): the producer of this record is outside the visible part of
    the file; score scales are not established here — confirm.
    """
    technical_score: float
    aesthetic_score: float
    coherence_score: float
    artifacts: List[str]  # presumably descriptions of detected artifacts
    artifact_severity: str  # presumably a severity label such as "minor"; verify against producer
    overall_score: float
    reasoning: Optional[str] = None


@dataclass
class EditScoreBreakdown:
    """Per-category scores behind an ``EditAggregatedScore``.

    Every field defaults to None.

    NOTE(review): presumably None means "category not evaluated", mirroring
    ``ScoreBreakdown`` — confirm against the editing aggregator.
    """
    instruction_following: Optional[float] = None
    preservation: Optional[float] = None
    edit_quality: Optional[float] = None
    artifacts: Optional[float] = None


@dataclass
class EditAggregatedScore:
    """Final score for one edit evaluation; same shape as ``AggregatedScore``.

    NOTE(review): thresholds and weighting are applied by code outside the
    visible part of the file — confirm before relying on specific values.
    """
    overall: float
    grade: str
    passed: bool
    confidence: float
    breakdown: EditScoreBreakdown  # Per-category scores
    weights_used: Dict[str, float]  # Weight applied to each category name
    recommendation: str


@dataclass
class EditEvalResult:
    """Complete result of one edit evaluation.

    Only ``score`` is required; each optional section defaults to None.
    """
    score: EditAggregatedScore
    instruction_following: Optional[InstructionFollowingResult] = None
    preservation: Optional[PreservationResult] = None
    edit_quality: Optional[EditQualityResult] = None
    evaluation_time: float = 0.0  # presumably elapsed seconds, as in ImageEvalResult — confirm


@dataclass
class ComparisonRanking:
    """Ranking of the compared images for a single criterion."""
    criterion: str  # Criterion name, e.g. "prompt_alignment"
    ranking: List[int]  # 1-based image numbers, best to worst
    scores: List[float]  # One score per image, in input order
    reasoning: Optional[str] = None  # VLM explanation; None for rankings derived from individual scores


@dataclass
class ComparisonResult:
    """Result of ``ImageEvaluator.compare_images`` for a set of images.

    Rankings use 1-based image numbers (Image 1 is the first input image),
    whereas ``winner_index`` is a 0-based index into the input list.
    """
    num_images: int
    prompt: str
    overall_ranking: List[int]  # 1-based image numbers, best to worst
    overall_scores: List[float]  # One overall score per image, in input order
    rankings_by_criterion: Dict[str, ComparisonRanking]  # Keyed by criterion name
    winner_index: int  # 0-based index of the winning image
    winner_reasoning: str
    individual_scores: List[AggregatedScore]  # Aggregated score of each image's standalone evaluation
    individual_results: List["ImageEvalResult"] = field(default_factory=list)  # Full standalone evaluation per image
    evaluation_time: float = 0.0  # Elapsed seconds for the whole comparison


class ImageEvaluator:
    """
    AI-Generated Image Quality Evaluator

    Evaluates AI-generated images using:
    - Soft-TIFA: Atomic prompt decomposition for precise alignment scoring
    - VLM-as-Judge: Human-like holistic assessment with reasoning
    - Technical Metrics: Sharpness, colorfulness, contrast, CLIP score
    """

    def __init__(self, device: str = "cuda"):
        """Load the VLM judge and the CLIP model used for scoring.

        Args:
            device: Preferred device for the CLIP model. It is honoured only
                when ``torch.cuda.is_available()`` is true; otherwise "cpu"
                is used. The VLM is placed via ``device_map="auto"``.
        """
        import torch
        from transformers import AutoModelForImageTextToText, AutoProcessor

        cuda_ready = torch.cuda.is_available()
        self.device = device if cuda_ready else "cpu"

        # Qwen2.5-VL-7B-Instruct is used for every VLM call in this class.
        vlm_checkpoint = "Qwen/Qwen2.5-VL-7B-Instruct"
        vlm_options = dict(
            device_map="auto",
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )
        self.vlm_model = AutoModelForImageTextToText.from_pretrained(
            vlm_checkpoint, **vlm_options
        )
        self.vlm_processor = AutoProcessor.from_pretrained(
            vlm_checkpoint, trust_remote_code=True
        )

        # CLIP (ViT-B-32, OpenAI weights) for text-image alignment.
        import open_clip
        clip_arch = 'ViT-B-32'
        clip_net, _, clip_transform = open_clip.create_model_and_transforms(
            clip_arch, pretrained='openai'
        )
        self.clip_preprocess = clip_transform
        self.clip_model = clip_net.to(self.device).eval()
        self.clip_tokenizer = open_clip.get_tokenizer(clip_arch)

    def _vlm_generate(self, image: Image.Image, prompt: str) -> str:
        """Ask the VLM about a single image and return its reply.

        Args:
            image: Image shown to the model.
            prompt: Text instruction accompanying the image.

        Returns:
            The decoded continuation only (the prompt tokens are sliced off),
            with special tokens removed.
        """
        import torch

        user_turn = {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        }
        chat_text = self.vlm_processor.apply_chat_template(
            [user_turn], tokenize=False, add_generation_prompt=True
        )

        batch = self.vlm_processor(
            text=[chat_text],
            images=[image],
            return_tensors="pt",
        )
        batch = batch.to(self.vlm_model.device)

        # Sampling is disabled; no gradients are needed for inference.
        with torch.no_grad():
            sequences = self.vlm_model.generate(
                **batch,
                max_new_tokens=1024,
                do_sample=False,
            )

        # generate() returns prompt + continuation; keep the continuation.
        prompt_length = batch.input_ids.shape[1]
        reply_tokens = sequences[0][prompt_length:]
        return self.vlm_processor.decode(reply_tokens, skip_special_tokens=True)

    def _vlm_text_generate(self, prompt: str) -> str:
        """Send a text-only prompt to the VLM and return its reply.

        Args:
            prompt: The user message.

        Returns:
            The decoded continuation only (the prompt tokens are sliced off),
            with special tokens removed.
        """
        import torch

        conversation = [{"role": "user", "content": prompt}]
        chat_text = self.vlm_processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )

        batch = self.vlm_processor(
            text=[chat_text],
            return_tensors="pt",
        )
        batch = batch.to(self.vlm_model.device)

        # Sampling is disabled; no gradients are needed for inference.
        with torch.no_grad():
            sequences = self.vlm_model.generate(
                **batch,
                max_new_tokens=1024,
                do_sample=False,
            )

        # generate() returns prompt + continuation; keep the continuation.
        prompt_length = batch.input_ids.shape[1]
        reply_tokens = sequences[0][prompt_length:]
        return self.vlm_processor.decode(reply_tokens, skip_special_tokens=True)

    def evaluate_soft_tifa(self, image: Image.Image, prompt: str) -> SoftTIFAResult:
        """Run Soft-TIFA evaluation with atomic prompt decomposition.

        The prompt is decomposed by the VLM into visual primitives, each
        primitive is turned into a yes/no question about the image, and the
        per-primitive scores are combined.

        Args:
            image: Generated image to check.
            prompt: Text prompt the image was generated from.

        Returns:
            A ``SoftTIFAResult``. ``atom_score`` is the arithmetic mean of the
            primitive scores and ``prompt_score`` is
            ``0.7 * atom_score + 0.3 * geometric_mean``; ``passed`` requires
            ``prompt_score >= 0.7``. When decomposition yields nothing usable
            the result has ``primitives_count == 0`` and zero scores.

        NOTE(review): the decomposition prompt asks for an ``importance`` per
        primitive, but the aggregation below does not use it.
        """
        # Step 1: Decompose prompt into primitives
        decomposition_prompt = f'''Decompose this text-to-image prompt into atomic visual primitives.

Prompt: "{prompt}"

For each primitive, identify:
- content: The specific visual element (e.g., "a red car", "sunset sky")
- type: One of [object, attribute, count, relation, action, scene, style]
- importance: How critical (0.5-1.0)

Example for "A cat sitting on a red chair":
[
  {{"content": "cat", "type": "object", "importance": 1.0}},
  {{"content": "chair", "type": "object", "importance": 0.9}},
  {{"content": "red chair", "type": "attribute", "importance": 0.8}},
  {{"content": "cat sitting on chair", "type": "relation", "importance": 0.9}}
]

Return ONLY valid JSON array for the given prompt:'''

        decomp_response = self._vlm_text_generate(decomposition_prompt)
        primitives = parse_json_robust(decomp_response, fallback=[])

        if not primitives or not isinstance(primitives, list):
            return SoftTIFAResult(
                primitives_count=0,
                atom_score=0.0,
                prompt_score=0.0,
                passed=False,
                primitive_results=[],
            )

        # Step 2: Evaluate each primitive via VQA
        vqa_templates = {
            "object": "Is there a {content} in this image?",
            "attribute": "Does the image show {content}?",
            "count": "Are there {content}?",
            "relation": "Is it true that {content}?",
            "action": "Is {content} happening in this image?",
            "scene": "Does this image depict {content}?",
            "style": "Is this image in {content} style?",
        }

        primitive_results = []
        for prim in primitives[:20]:  # Limit to 20 primitives
            # The decomposition is model output: skip anything that is not a
            # JSON object or that carries no content to ask about.
            if not isinstance(prim, dict):
                continue
            content = str(prim.get("content") or "").strip()
            if not content:
                continue

            ptype = prim.get("type", "object")
            if not isinstance(ptype, str):
                # Non-string types (e.g. a list) cannot be used as a dict key.
                ptype = "object"

            # Unknown type strings fall back to the "object" question.
            template = vqa_templates.get(ptype, vqa_templates["object"])
            question = template.format(content=content)

            vqa_prompt = f"""{question}
Answer Yes or No with confidence (0-100%).
Format: [Yes/No] (confidence: X%) - brief reasoning"""

            response = self._vlm_generate(image, vqa_prompt)
            answer, confidence, reasoning = self._parse_vqa_response(response)

            # A confident "no" is as informative as a confident "yes".
            score = confidence if answer == "yes" else (1.0 - confidence)

            primitive_results.append(PrimitiveResult(
                content=content,
                type=ptype,
                question=question,
                answer=answer,
                score=score,
                reasoning=reasoning,
            ))

        # Aggregate scores
        if primitive_results:
            scores = [r.score for r in primitive_results]
            atom_score = sum(scores) / len(scores)
            prompt_score = 0.7 * atom_score + 0.3 * geometric_mean(scores)
        else:
            atom_score = 0.0
            prompt_score = 0.0

        return SoftTIFAResult(
            primitives_count=len(primitive_results),
            atom_score=atom_score,
            prompt_score=prompt_score,
            passed=prompt_score >= 0.7,
            primitive_results=primitive_results,
        )

    @staticmethod
    def _parse_vqa_response(response: str) -> tuple:
        """Extract ``(answer, confidence, reasoning)`` from a VQA reply.

        The reply is expected to look like
        ``[Yes/No] (confidence: X%) - brief reasoning`` but is parsed
        leniently. ``answer`` is "yes" or "no" (default "no"), ``confidence``
        is in [0, 1] (default 0.5), and ``reasoning`` is the text after the
        first "-" capped at 200 characters, or None.
        """
        lowered = response.lower().strip()
        answer = "yes" if lowered.startswith("yes") or "[yes]" in lowered else "no"

        confidence = 0.5
        match = re.search(r'confidence[:\s]*(\d+(?:\.\d+)?)\s*(%?)', lowered)
        if match:
            raw_number, percent_sign = match.group(1), match.group(2)
            value = float(raw_number)
            if not percent_sign and "." in raw_number and value <= 1.0:
                # "confidence: 0.9" is a fraction, not 0.9 percent.
                confidence = value
            else:
                confidence = value / 100.0
            # Keep the derived score inside [0, 1] even for replies like 150%.
            confidence = min(1.0, max(0.0, confidence))

        reasoning = None
        if "-" in response:
            reasoning = response.split("-", 1)[1].strip()[:200]

        return answer, confidence, reasoning

    def evaluate_vlm_judge(self, image: Image.Image, prompt: Optional[str]) -> VLMAssessmentResult:
        """Run VLM-as-Judge holistic assessment.

        Args:
            image: Image to assess.
            prompt: Generation prompt; when falsy, semantic accuracy is not
                requested and ``semantic_accuracy`` is None in the result.

        Returns:
            A ``VLMAssessmentResult`` with ratings clamped to [0, 10]. Any
            rating that is missing or unreadable becomes 5.0 on its own,
            without discarding the fields that did parse. The severity label
            is lower-cased so that labels such as "Minor" match the
            aggregator's lookup. If the reply is not a JSON object at all,
            every rating is 5.0 and the severity is "unknown".
        """
        prompt_context = f'Original prompt: "{prompt}"' if prompt else ""
        semantic_field = '"semantic_accuracy": {"score": 8, "reasoning": "matches prompt well"},' if prompt else ""

        eval_prompt = f"""Evaluate this AI-generated image on multiple dimensions.
{prompt_context}

Rate each dimension from 1-10:
- **Technical Quality**: Sharpness, noise level, color accuracy, resolution
- **Aesthetic Appeal**: Composition, color harmony, visual balance, style
- **Realism**: Physical plausibility, lighting consistency, proportions
{('- **Semantic Accuracy**: How well it matches the prompt' if prompt else '')}
- **AI Artifacts**: Detect issues like distorted faces/hands, extra limbs, text errors

Example output:
{{
  "technical_quality": {{"score": 8, "reasoning": "sharp with good colors"}},
  "aesthetic_appeal": {{"score": 7, "reasoning": "balanced composition"}},
  "realism": {{"score": 6, "reasoning": "slightly off proportions"}},
  {semantic_field}
  "artifacts": {{"detected": ["slightly distorted fingers"], "severity": "minor"}},
  "overall": {{"score": 7, "reasoning": "good quality with minor issues"}}
}}

Now evaluate this image and return ONLY valid JSON:"""

        response = self._vlm_generate(image, eval_prompt)
        data = parse_json_robust(response, fallback=None)

        if not data or not isinstance(data, dict):
            # Nothing usable came back: neutral ratings, unknown severity.
            return VLMAssessmentResult(
                technical_quality=5.0,
                aesthetic_appeal=5.0,
                realism=5.0,
                semantic_accuracy=5.0 if prompt else None,
                artifacts_detected=[],
                artifacts_severity="unknown",
                overall=5.0,
            )

        def get_score(key: str, default: float = 5.0) -> float:
            """Read one rating, accepting ``{"score": x}`` or a bare number."""
            val = data.get(key)
            if isinstance(val, dict):
                val = val.get("score")
            if val is None or isinstance(val, bool):
                return default
            try:
                rating = float(val)
            except (TypeError, ValueError):
                return default
            if not math.isfinite(rating):
                return default
            # Downstream code divides by 10 and expects a 0-1 result.
            return min(10.0, max(0.0, rating))

        artifacts = data.get("artifacts")
        detected = artifacts.get("detected", []) if isinstance(artifacts, dict) else []
        severity = artifacts.get("severity", "unknown") if isinstance(artifacts, dict) else "unknown"

        if isinstance(detected, str):
            # A single description given as a plain string instead of a list.
            detected = [detected] if detected.strip() else []
        elif isinstance(detected, list):
            detected = [str(item) for item in detected]
        else:
            detected = []

        if isinstance(severity, str) and severity.strip():
            severity = severity.strip().lower()
        else:
            severity = "unknown"

        overall_entry = data.get("overall")
        reasoning = overall_entry.get("reasoning") if isinstance(overall_entry, dict) else None

        return VLMAssessmentResult(
            technical_quality=get_score("technical_quality"),
            aesthetic_appeal=get_score("aesthetic_appeal"),
            realism=get_score("realism"),
            semantic_accuracy=get_score("semantic_accuracy") if prompt else None,
            artifacts_detected=detected,
            artifacts_severity=severity,
            overall=get_score("overall"),
            reasoning=reasoning,
        )

    def evaluate_technical_metrics(self, image: Image.Image, prompt: Optional[str]) -> TechnicalMetricsResult:
        """Calculate technical quality metrics.

        Every metric is best-effort: a metric that raises is logged and left
        as None so the remaining metrics and the rest of the evaluation still
        run. This now includes the CLIP score, which previously was the only
        metric that could abort the whole evaluation.

        Args:
            image: Image to measure.
            prompt: Generation prompt; the CLIP score is computed only when
                it is truthy.

        Returns:
            A ``TechnicalMetricsResult`` whose fields are None for metrics
            that were skipped or failed.
        """
        import logging

        logger = logging.getLogger(__name__)

        def best_effort(label, metric_fn, *args):
            """Run one metric, returning None (and logging) if it raises."""
            try:
                return metric_fn(*args)
            except Exception:
                logger.warning(
                    "Technical metric %s failed; leaving it unset", label, exc_info=True
                )
                return None

        sharpness = best_effort("sharpness", calculate_sharpness, image)
        colorfulness_score = best_effort("colorfulness", calculate_colorfulness, image)
        contrast_score = best_effort("contrast", calculate_contrast, image)

        clip_score = None
        if prompt:
            clip_score = best_effort(
                "clip_score", calculate_clip_score,
                image, prompt,
                self.clip_model, self.clip_preprocess, self.clip_tokenizer,
                self.device,
            )

        return TechnicalMetricsResult(
            clip_score=clip_score,
            sharpness=sharpness,
            colorfulness=colorfulness_score,
            contrast=contrast_score,
        )

    def _calculate_aggregated_score(
        self,
        soft_tifa: Optional[SoftTIFAResult],
        vlm: Optional[VLMAssessmentResult],
        technical: Optional[TechnicalMetricsResult],
        has_prompt: bool,
    ) -> AggregatedScore:
        """Calculate comprehensive aggregated score.

        Each category score is the plain mean of whichever inputs are
        available; the overall score is the weighted mean of the categories
        that have a score, with the weights renormalised over those
        categories.

        Args:
            soft_tifa: Soft-TIFA result, or None if not run. A result with
                ``primitives_count == 0`` is ignored: it means decomposition
                failed, not that the image contradicts the prompt.
            vlm: VLM assessment, or None if not run. Its 0-10 ratings are
                divided by 10.
            technical: Technical metrics, or None if not run.
            has_prompt: Whether a prompt was supplied; without one, prompt
                alignment gets weight 0 and is not expected for confidence.

        Returns:
            The ``AggregatedScore``; ``overall`` is 0.0 when no category has
            both a score and a positive weight.

        NOTE(review): technical metric values are averaged as-is with 0-1
        scores, which assumes the ``metrics`` helpers return 0-1 values —
        confirm.
        """
        def mean_or_none(values):
            return sum(values) / len(values) if values else None

        # Prompt alignment scores
        prompt_alignment_scores = []
        if soft_tifa and soft_tifa.primitives_count > 0:
            prompt_alignment_scores.append(soft_tifa.prompt_score)
        if vlm and vlm.semantic_accuracy is not None:
            prompt_alignment_scores.append(vlm.semantic_accuracy / 10.0)
        if technical and technical.clip_score is not None:
            prompt_alignment_scores.append(technical.clip_score)
        prompt_alignment = mean_or_none(prompt_alignment_scores)

        # Technical quality scores
        tech_scores = []
        if technical:
            if technical.sharpness is not None:
                tech_scores.append(technical.sharpness)
            if technical.contrast is not None:
                tech_scores.append(technical.contrast)
        if vlm:
            tech_scores.append(vlm.technical_quality / 10.0)
        technical_quality = mean_or_none(tech_scores)

        # Aesthetic appeal scores
        aesthetic_scores = []
        if technical and technical.colorfulness is not None:
            aesthetic_scores.append(technical.colorfulness)
        if vlm:
            aesthetic_scores.append(vlm.aesthetic_appeal / 10.0)
        aesthetic_appeal = mean_or_none(aesthetic_scores)

        # Realism
        realism = vlm.realism / 10.0 if vlm else None

        # Artifacts
        artifacts_score = None
        if vlm:
            severity_map = {"none": 1.0, "minor": 0.85, "moderate": 0.6, "major": 0.3, "unknown": 0.7}
            # The label comes from the VLM, so match it case-insensitively;
            # unrecognised labels score like "unknown".
            severity_key = str(vlm.artifacts_severity).strip().lower()
            artifacts_score = severity_map.get(severity_key, 0.7)

        # Calculate weighted overall
        score_map = {
            "prompt_alignment": prompt_alignment,
            "technical_quality": technical_quality,
            "aesthetic_appeal": aesthetic_appeal,
            "realism": realism,
            "artifacts": artifacts_score,
        }

        category_weights = {
            "prompt_alignment": 0.45 if has_prompt else 0.0,  # Highest priority
            "technical_quality": 0.20,
            "aesthetic_appeal": 0.15,
            "realism": 0.10,
            "artifacts": 0.10,
        }

        available = [key for key, score in score_map.items() if score is not None]
        weighted_sum = sum(score_map[key] * category_weights[key] for key in available)
        total_weight = sum(category_weights[key] for key in available)

        overall = weighted_sum / total_weight if total_weight > 0 else 0.0

        # Confidence: share of the expected categories that produced a score.
        # Capped at 1.0 because prompt alignment may be present even though
        # has_prompt is False, which would otherwise give 5 / 4.
        max_metrics = 5 if has_prompt else 4
        confidence = min(1.0, len(available) / max_metrics)

        # Recommendation
        recommendation = self._generate_recommendation(score_map, overall)

        # Normalized weights; empty when nothing carried weight, so that a
        # zero total can never be divided by.
        if total_weight > 0:
            normalized_weights = {key: category_weights[key] / total_weight for key in available}
        else:
            normalized_weights = {}

        def rounded(value):
            return round(value, 3) if value is not None else None

        return AggregatedScore(
            overall=round(overall, 3),
            grade=score_to_grade(overall),
            passed=overall >= 0.7,
            confidence=round(confidence, 2),
            breakdown=ScoreBreakdown(
                prompt_alignment=rounded(prompt_alignment),
                technical_quality=rounded(technical_quality),
                aesthetic_appeal=rounded(aesthetic_appeal),
                realism=rounded(realism),
                artifacts=rounded(artifacts_score),
            ),
            weights_used=normalized_weights,
            recommendation=recommendation,
        )

    def _generate_recommendation(self, scores: Dict, overall: float) -> str:
        """Generate recommendation based on scores."""
        weakest = None
        weakest_score = 1.0

        for key, score in scores.items():
            if score is not None and score < weakest_score:
                weakest_score = score
                weakest = key

        if overall >= 0.85:
            return "Excellent quality image. Ready for production use."
        elif overall >= 0.70:
            if weakest and weakest_score < 0.7:
                suggestions = {
                    "prompt_alignment": "Consider regenerating with clearer prompt.",
                    "technical_quality": "Image has quality issues. Try higher resolution.",
                    "aesthetic_appeal": "Composition could be improved.",
                    "realism": "Physical inconsistencies detected.",
                    "artifacts": "AI artifacts present. Consider regeneration.",
                }
                return f"Good overall. Improvement: {suggestions.get(weakest, weakest)}"
            return "Good quality image. Minor improvements possible."
        elif overall >= 0.50:
            return f"Moderate quality. Main issue: {weakest.replace('_', ' ') if weakest else 'overall'}."
        else:
            return "Low quality. Regeneration strongly recommended."

    def evaluate(
        self,
        image: Image.Image,
        prompt: Optional[str] = None,
        include_soft_tifa: bool = True,
        include_vlm: bool = True,
        include_technical: bool = True,
    ) -> ImageEvalResult:
        """
        Evaluate an AI-generated image.

        An empty prompt is treated exactly like a missing one: Soft-TIFA is
        skipped and prompt alignment is neither weighted nor expected when
        the confidence is computed.

        Args:
            image: PIL Image to evaluate
            prompt: Optional text prompt used to generate the image
            include_soft_tifa: Run Soft-TIFA evaluation (requires prompt)
            include_vlm: Run VLM-as-Judge assessment
            include_technical: Calculate technical metrics

        Returns:
            ImageEvalResult with all evaluation components; sections that
            were not run are None, and ``evaluation_time`` is the elapsed
            time in seconds.
        """
        # Monotonic clock: elapsed time must not be affected by wall-clock
        # adjustments during a long model call.
        start_time = time.perf_counter()
        has_prompt = bool(prompt)

        soft_tifa_result = None
        vlm_result = None
        technical_result = None

        if include_soft_tifa and has_prompt:
            soft_tifa_result = self.evaluate_soft_tifa(image, prompt)

        if include_vlm:
            vlm_result = self.evaluate_vlm_judge(image, prompt)

        if include_technical:
            technical_result = self.evaluate_technical_metrics(image, prompt)

        aggregated = self._calculate_aggregated_score(
            soft_tifa=soft_tifa_result,
            vlm=vlm_result,
            technical=technical_result,
            has_prompt=has_prompt,
        )

        return ImageEvalResult(
            score=aggregated,
            soft_tifa=soft_tifa_result,
            vlm_assessment=vlm_result,
            technical_metrics=technical_result,
            evaluation_time=time.perf_counter() - start_time,
        )

    def _vlm_generate_multi_image(self, images: List[Image.Image], prompt: str) -> str:
        """Ask the VLM about several images at once and return its reply.

        Args:
            images: Images shown to the model, in the order given.
            prompt: Text instruction placed after the images.

        Returns:
            The decoded continuation only (the prompt tokens are sliced off),
            with special tokens removed.
        """
        import torch

        # One image entry per input, followed by a single text entry.
        parts = [{"type": "image", "image": picture} for picture in images]
        parts.append({"type": "text", "text": prompt})
        conversation = [{"role": "user", "content": parts}]

        chat_text = self.vlm_processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )

        batch = self.vlm_processor(
            text=[chat_text],
            images=images,
            return_tensors="pt",
        )
        batch = batch.to(self.vlm_model.device)

        # Sampling is disabled; no gradients are needed for inference.
        with torch.no_grad():
            sequences = self.vlm_model.generate(
                **batch,
                max_new_tokens=2048,
                do_sample=False,
            )

        # generate() returns prompt + continuation; keep the continuation.
        prompt_length = batch.input_ids.shape[1]
        reply_tokens = sequences[0][prompt_length:]
        return self.vlm_processor.decode(reply_tokens, skip_special_tokens=True)

    def compare_images(
        self,
        images: List[Image.Image],
        prompt: str,
    ) -> ComparisonResult:
        """
        Compare multiple images (2-4) against a prompt.

        Every image is first evaluated on its own; rankings derived from
        those scores are the baseline. The VLM is then shown all images
        together, and whatever part of its answer is well-formed replaces the
        corresponding baseline values. Malformed parts (wrong length,
        non-numeric entries, rankings that are not a permutation of the image
        numbers) are ignored instead of raising.

        Args:
            images: List of 2-4 PIL Images to compare
            prompt: The text prompt to evaluate against

        Returns:
            ComparisonResult with rankings and scores. Rankings hold 1-based
            image numbers ordered best to worst, score lists hold one value
            per image in input order, and ``winner_index`` is 0-based.

        Raises:
            ValueError: If fewer than 2 or more than 4 images are given.
        """
        start_time = time.perf_counter()
        num_images = len(images)

        if num_images < 2 or num_images > 4:
            raise ValueError("Must provide 2-4 images for comparison")

        def rank_descending(values):
            """1-based image numbers ordered by descending value."""
            order = sorted(range(num_images), key=lambda i: values[i], reverse=True)
            return [i + 1 for i in order]

        def as_image_number(value):
            """Return value as an int in 1..num_images, else None."""
            if isinstance(value, bool):
                return None
            try:
                number = int(str(value).strip())
            except ValueError:
                return None
            return number if 1 <= number <= num_images else None

        def clean_ranking(candidate):
            """Return candidate as a permutation of 1..num_images, else None."""
            if not isinstance(candidate, list) or len(candidate) != num_images:
                return None
            numbers = [as_image_number(item) for item in candidate]
            if None in numbers or len(set(numbers)) != num_images:
                return None
            return numbers

        def clean_scores(candidate, allow_unit_scale=False):
            """Return per-image scores normalised to [0, 1], else None.

            Scores are expected on a 0-10 scale. With ``allow_unit_scale`` a
            list whose maximum is <= 1 is taken to be on a 0-1 scale already;
            the decision is made for the whole list so that a 1/10 score next
            to an 8/10 score is never mistaken for a perfect 1.0.
            """
            if not isinstance(candidate, list) or len(candidate) != num_images:
                return None
            values = []
            for raw in candidate:
                if isinstance(raw, bool):
                    return None
                try:
                    value = float(raw)
                except (TypeError, ValueError):
                    return None
                if not math.isfinite(value):
                    return None
                values.append(value)
            divisor = 1.0 if allow_unit_scale and max(values) <= 1.0 else 10.0
            return [min(1.0, max(0.0, value / divisor)) for value in values]

        # Step 1: Get individual scores for each image
        individual_results = [
            self.evaluate(img, prompt, include_soft_tifa=True, include_vlm=True, include_technical=True)
            for img in images
        ]
        individual_scores = [r.score for r in individual_results]

        # Step 2: Direct multi-image comparison via VLM
        image_labels = ", ".join([f"Image {i+1}" for i in range(num_images)])
        comparison_prompt = f'''You are comparing {num_images} AI-generated images for the prompt: "{prompt}"

The images are labeled {image_labels} (in order from left to right).

Evaluate and rank ALL images for each criterion. Return a JSON object where, for each criterion:
- "ranking" lists the image numbers ordered from best to worst
- "scores" lists one 0-10 score per image in image order (Image 1 first)

Criteria:
1. prompt_alignment: How well does each image match the prompt?
2. technical_quality: Sharpness, clarity, no artifacts or distortions
3. aesthetic_appeal: Composition, color harmony, visual appeal
4. realism: Physical plausibility, lighting, proportions

Example output format (shown for 3 images; use {num_images} entries per list):
{{
  "prompt_alignment": {{"ranking": [2, 1, 3], "scores": [8, 9, 6], "reasoning": "Image 2 captures all elements..."}},
  "technical_quality": {{"ranking": [1, 2, 3], "scores": [8, 7, 5], "reasoning": "Image 1 is sharpest..."}},
  "aesthetic_appeal": {{"ranking": [2, 1, 3], "scores": [8, 9, 6], "reasoning": "Image 2 has best composition..."}},
  "realism": {{"ranking": [1, 2, 3], "scores": [8, 7, 5], "reasoning": "Image 1 looks most natural..."}},
  "overall": {{"ranking": [2, 1, 3], "scores": [7.5, 8.5, 5.5], "winner": 2, "reasoning": "Image 2 best balances all criteria..."}}
}}

Return ONLY valid JSON:'''

        response = self._vlm_generate_multi_image(images, comparison_prompt)
        data = parse_json_robust(response, fallback=None)

        # Baseline: everything derived from the individual evaluations.
        individual_overall = [r.score.overall for r in individual_results]
        overall_scores = individual_overall
        overall_ranking = rank_descending(individual_overall)
        baseline_winner = max(range(num_images), key=lambda i: individual_overall[i])
        winner_index = baseline_winner
        winner_reasoning = (
            f"Image {winner_index + 1} achieved the highest overall score "
            f"({individual_overall[winner_index]:.3f})."
        )

        criteria = ["prompt_alignment", "technical_quality", "aesthetic_appeal", "realism"]
        rankings_by_criterion = {}
        for criterion in criteria:
            # A category with no score is ranked as a neutral 0.5.
            crit_scores = []
            for score in individual_scores:
                val = getattr(score.breakdown, criterion)
                crit_scores.append(val if val is not None else 0.5)

            rankings_by_criterion[criterion] = ComparisonRanking(
                criterion=criterion,
                ranking=rank_descending(crit_scores),
                scores=crit_scores,
                reasoning=None,
            )

        # Try to use VLM comparison if available (overrides individual-based rankings)
        if data and isinstance(data, dict):
            for criterion in criteria:
                crit_data = data.get(criterion)
                if not isinstance(crit_data, dict):
                    continue
                ranking = clean_ranking(crit_data.get("ranking"))
                norm_scores = clean_scores(crit_data.get("scores"))
                if ranking is None or norm_scores is None:
                    continue
                reasoning = crit_data.get("reasoning", "")
                rankings_by_criterion[criterion] = ComparisonRanking(
                    criterion=criterion,
                    ranking=ranking,
                    scores=norm_scores,
                    reasoning=reasoning if isinstance(reasoning, str) else None,
                )

            # Overall from VLM
            overall_data = data.get("overall")
            if isinstance(overall_data, dict):
                vlm_ranking = clean_ranking(overall_data.get("ranking"))
                vlm_scores = clean_scores(overall_data.get("scores"), allow_unit_scale=True)
                vlm_winner = as_image_number(overall_data.get("winner"))
                vlm_reasoning = overall_data.get("reasoning", "")

                if vlm_ranking is not None:
                    overall_ranking = vlm_ranking
                if vlm_scores is not None:
                    overall_scores = vlm_scores

                if vlm_winner is not None:
                    winner_index = vlm_winner - 1
                elif vlm_ranking is not None:
                    # No explicit winner: keep the winner consistent with the
                    # ranking that is being reported.
                    winner_index = vlm_ranking[0] - 1

                if isinstance(vlm_reasoning, str) and vlm_reasoning:
                    winner_reasoning = vlm_reasoning
                elif winner_index != baseline_winner:
                    # The baseline sentence names a different image.
                    winner_reasoning = f"Image {winner_index + 1} was ranked first in the direct VLM comparison."

        return ComparisonResult(
            num_images=num_images,
            prompt=prompt,
            overall_ranking=overall_ranking,
            overall_scores=overall_scores,
            rankings_by_criterion=rankings_by_criterion,
            winner_index=winner_index,
            winner_reasoning=winner_reasoning,
            individual_scores=individual_scores,
            individual_results=individual_results,
            evaluation_time=time.perf_counter() - start_time,
        )


class EditEvaluator:
    """
    Image Editing Evaluator

    Evaluates instruction-based image editing using:
    - Instruction Following: Were the requested edits applied?
    - Preservation: Were non-edited regions maintained?
    - Edit Quality: Is the edit seamless and high-quality?
    """

    def __init__(self, device: str = "cuda"):
        """Load the vision-language judge, its processor and the LPIPS network.

        Args:
            device: Device requested for the LPIPS network. It is replaced by
                "cpu" whenever torch reports that CUDA is unavailable. The
                VLM itself is loaded with ``device_map="auto"``.
        """
        import torch
        from transformers import AutoModelForImageTextToText, AutoProcessor
        import lpips

        if torch.cuda.is_available():
            self.device = device
        else:
            self.device = "cpu"

        # Qwen2.5-VL-7B-Instruct serves as the judge model
        checkpoint = "Qwen/Qwen2.5-VL-7B-Instruct"
        model_options = {
            "device_map": "auto",
            "torch_dtype": torch.bfloat16,
            "trust_remote_code": True,
        }
        self.vlm_model = AutoModelForImageTextToText.from_pretrained(checkpoint, **model_options)
        self.vlm_processor = AutoProcessor.from_pretrained(checkpoint, trust_remote_code=True)

        # LPIPS with the AlexNet backbone, moved to the resolved device
        perceptual_net = lpips.LPIPS(net='alex')
        self.lpips_model = perceptual_net.to(self.device)

    def _vlm_generate(self, image: Image.Image, prompt: str) -> str:
        """Ask the VLM a question about a single image and return its reply.

        Args:
            image: Image placed in the user turn alongside the text.
            prompt: Text of the user turn.

        Returns:
            The decoded continuation (special tokens skipped), produced with
            greedy decoding and at most 1024 new tokens.
        """
        import torch

        user_turn = {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        }
        chat_text = self.vlm_processor.apply_chat_template(
            [user_turn], tokenize=False, add_generation_prompt=True
        )

        batch = self.vlm_processor(text=[chat_text], images=[image], return_tensors="pt")
        batch = batch.to(self.vlm_model.device)

        with torch.no_grad():
            sequences = self.vlm_model.generate(**batch, max_new_tokens=1024, do_sample=False)

        # Skip the first input-length tokens so only the newly generated part is decoded.
        prompt_length = batch.input_ids.shape[1]
        continuation = sequences[0][prompt_length:]
        return self.vlm_processor.decode(continuation, skip_special_tokens=True)

    def _vlm_text_generate(self, prompt: str) -> str:
        """Send a text-only prompt to the VLM and return its reply.

        Args:
            prompt: Text of the single user turn.

        Returns:
            The decoded continuation (special tokens skipped), produced with
            greedy decoding and at most 1024 new tokens.
        """
        import torch

        conversation = [{"role": "user", "content": prompt}]
        chat_text = self.vlm_processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )

        batch = self.vlm_processor(text=[chat_text], return_tensors="pt")
        batch = batch.to(self.vlm_model.device)

        with torch.no_grad():
            sequences = self.vlm_model.generate(**batch, max_new_tokens=1024, do_sample=False)

        # Skip the first input-length tokens so only the newly generated part is decoded.
        prompt_length = batch.input_ids.shape[1]
        continuation = sequences[0][prompt_length:]
        return self.vlm_processor.decode(continuation, skip_special_tokens=True)

    def evaluate_instruction_following(self, edited_image: Image.Image, instruction: str) -> InstructionFollowingResult:
        """Evaluate if editing instruction was followed."""
        decomp_prompt = f'''Analyze this image editing instruction and decompose into atomic edits.

Instruction: "{instruction}"

Example for "Change the sky to sunset and add a bird":
{{
  "edits": [
    {{"content": "change sky color to sunset", "type": "modify", "target": "sky", "expected_result": "orange/purple sunset sky"}},
    {{"content": "add a bird", "type": "add", "target": "sky area", "expected_result": "visible bird in the scene"}}
  ]
}}

Return ONLY valid JSON for the given instruction:'''

        decomp_response = self._vlm_text_generate(decomp_prompt)
        data = parse_json_robust(decomp_response, fallback={})
        edits = data.get("edits", []) if isinstance(data, dict) else []

        if not edits or not isinstance(edits, list):
            # Fallback: evaluate holistically
            verify_prompt = f'''Evaluate if this image correctly shows the result of the edit:

Edit instruction: "{instruction}"

Rate success from 0-10.
Format: Score: X/10 - Reasoning'''

            response = self._vlm_generate(edited_image, verify_prompt)
            score_match = re.search(r'[Ss]core[:\s]*(\d+(?:\.\d+)?)\s*/\s*10', response)
            score = float(score_match.group(1)) if score_match else 5.0

            return InstructionFollowingResult(
                edit_primitives=[{"content": instruction, "type": "unknown"}],
                primitive_scores=[{"edit": instruction, "score": score}],
                overall_score=score / 10.0,
                reasoning=response[:200] if response else None,
            )

        # Evaluate each edit
        primitive_scores = []
        for edit in edits[:10]:
            content = edit.get("content", "")
            target = edit.get("target", "the image")
            expected = edit.get("expected_result", content)

            verify_prompt = f'''Verify if this edit was applied:

Edit: {content}
Target: {target}
Expected: {expected}

Rate from 0-10.
Format: Score: X/10 - Reasoning'''

            response = self._vlm_generate(edited_image, verify_prompt)
            score_match = re.search(r'[Ss]core[:\s]*(\d+(?:\.\d+)?)\s*/\s*10', response)
            score = float(score_match.group(1)) if score_match else 5.0

            primitive_scores.append({
                "edit": content,
                "score": score,
                "reasoning": response[:100] if response else None,
            })

        overall = sum(p["score"] for p in primitive_scores) / len(primitive_scores) if primitive_scores else 0

        return InstructionFollowingResult(
            edit_primitives=edits[:10],
            primitive_scores=primitive_scores,
            overall_score=overall / 10.0,
        )

    def evaluate_preservation(self, source_image: Image.Image, edited_image: Image.Image) -> PreservationResult:
        """Evaluate if non-edited regions were preserved.

        Combines up to three source-vs-edited similarity signals:
        LPIPS (converted to a similarity as ``max(0, 1 - lpips)``), SSIM and
        PSNR. When LPIPS and at least one other metric are available the
        result is ``0.5 * lpips_similarity + 0.5 * mean(other metrics)``;
        otherwise it is the mean of whatever is available, or 0.5 when no
        metric could be computed.

        Args:
            source_image: Original image before editing.
            edited_image: Image after editing.

        Returns:
            PreservationResult carrying the raw metric values (``None`` for
            metrics that could not be computed) and the combined score.
        """
        # LPIPS (a None result is treated as "metric unavailable")
        lpips_score = calculate_lpips(source_image, edited_image, self.lpips_model, self.device)
        lpips_similarity = max(0, 1 - lpips_score) if lpips_score is not None else None

        # SSIM / PSNR are collected separately from LPIPS. The mean of these
        # "other" metrics must not be derived by filtering on value: a metric
        # that happens to equal the LPIPS similarity (e.g. identical images,
        # where both are 1.0) would be dropped from the sum but still counted
        # in the divisor.
        other_scores = []

        # SSIM (best-effort: a failure only removes this metric)
        ssim_score = None
        try:
            ssim_score = calculate_ssim(source_image, edited_image)
        except Exception:
            ssim_score = None
        if ssim_score is not None:
            other_scores.append(ssim_score)

        # PSNR (best-effort: a failure only removes this metric)
        # NOTE(review): the value is averaged with SSIM as if it were on a
        # 0-1 scale; confirm calculate_psnr returns a normalized value, not dB.
        psnr_score = None
        try:
            psnr_score = calculate_psnr(source_image, edited_image)
        except Exception:
            psnr_score = None
        if psnr_score is not None:
            other_scores.append(psnr_score)

        # Combined score
        if lpips_similarity is not None and other_scores:
            others_mean = sum(other_scores) / len(other_scores)
            preservation_score = lpips_similarity * 0.5 + others_mean * 0.5
        elif lpips_similarity is not None:
            preservation_score = lpips_similarity
        elif other_scores:
            preservation_score = sum(other_scores) / len(other_scores)
        else:
            preservation_score = 0.5

        return PreservationResult(
            lpips_score=lpips_score,
            ssim_score=ssim_score,
            psnr_score=psnr_score,
            overall_score=preservation_score,
        )

    def evaluate_edit_quality(self, edited_image: Image.Image, instruction: str) -> EditQualityResult:
        """Evaluate the quality of the edit."""
        eval_prompt = f'''Evaluate the quality of this edited image.

Edit instruction: "{instruction}"

Rate each dimension 1-10:
- **Technical**: Seamless blending? Resolution consistent? No visible edit boundaries?
- **Aesthetic**: Natural looking? Color harmony maintained? Visually pleasing?
- **Coherence**: Physically plausible? Lighting/shadows consistent? Proper perspective?
- **Artifacts**: List any issues (blur, color bleeding, unnatural edges, etc.)

Example output:
{{
  "technical": {{"score": 8}},
  "aesthetic": {{"score": 7}},
  "coherence": {{"score": 8}},
  "artifacts": {{"detected": ["slight blur at edge"], "severity": "minor"}}
}}

Return ONLY valid JSON:'''

        response = self._vlm_generate(edited_image, eval_prompt)
        data = parse_json_robust(response, fallback=None)

        if data and isinstance(data, dict):
            try:
                def get_score(key: str, default: float = 5.0) -> float:
                    val = data.get(key, {})
                    if isinstance(val, dict):
                        return float(val.get("score", default))
                    return float(val) if val else default

                technical = get_score("technical")
                aesthetic = get_score("aesthetic")
                coherence = get_score("coherence")

                artifacts_data = data.get("artifacts", {})
                if isinstance(artifacts_data, dict):
                    artifacts = artifacts_data.get("detected", [])
                    severity = artifacts_data.get("severity", "unknown")
                else:
                    artifacts = []
                    severity = "unknown"

                overall = (technical + aesthetic + coherence) / 30.0
                severity_penalties = {"major": 0.7, "moderate": 0.85, "minor": 0.95, "none": 1.0}
                overall *= severity_penalties.get(severity, 0.9)

                return EditQualityResult(
                    technical_score=technical,
                    aesthetic_score=aesthetic,
                    coherence_score=coherence,
                    artifacts=artifacts if isinstance(artifacts, list) else [],
                    artifact_severity=severity if isinstance(severity, str) else "unknown",
                    overall_score=overall,
                )
            except (KeyError, TypeError, ValueError):
                pass

        return EditQualityResult(
            technical_score=5.0,
            aesthetic_score=5.0,
            coherence_score=5.0,
            artifacts=[],
            artifact_severity="unknown",
            overall_score=0.5,
        )

    def _calculate_edit_aggregated_score(
        self,
        instruction_result: InstructionFollowingResult,
        preservation_result: PreservationResult,
        quality_result: EditQualityResult,
    ) -> EditAggregatedScore:
        """Fold the three component results into one weighted score.

        Weights: instruction following 0.35, preservation 0.25, edit quality
        0.25, artifacts 0.15. The artifacts component is looked up from the
        reported artifact severity (0.7 for unrecognized labels). Confidence
        starts at 0.5 and grows by 0.1 per scored primitive, capped at 1.0.
        The result passes when the overall score is at least 0.7.
        """
        weights = dict(
            instruction_following=0.35,
            preservation=0.25,
            edit_quality=0.25,
            artifacts=0.15,
        )

        follow = instruction_result.overall_score
        preserve = preservation_result.overall_score
        quality = quality_result.overall_score

        severity_to_score = {"none": 1.0, "minor": 0.85, "moderate": 0.6, "major": 0.3, "unknown": 0.7}
        artifact_component = severity_to_score.get(quality_result.artifact_severity, 0.7)

        # Explicit left-to-right addition keeps the floating-point result stable.
        total = (
            follow * weights["instruction_following"] +
            preserve * weights["preservation"] +
            quality * weights["edit_quality"] +
            artifact_component * weights["artifacts"]
        )

        primitive_count = len(instruction_result.primitive_scores)
        certainty = min(1.0, 0.5 + (primitive_count * 0.1))

        advice = self._generate_edit_recommendation(
            follow, preserve, quality, artifact_component, total
        )

        breakdown = EditScoreBreakdown(
            instruction_following=round(follow, 3),
            preservation=round(preserve, 3),
            edit_quality=round(quality, 3),
            artifacts=round(artifact_component, 3),
        )
        return EditAggregatedScore(
            overall=round(total, 3),
            grade=score_to_grade(total),
            passed=total >= 0.7,
            confidence=round(certainty, 2),
            breakdown=breakdown,
            weights_used=weights,
            recommendation=advice,
        )

    def _generate_edit_recommendation(
        self,
        instruction: float,
        preservation: float,
        quality: float,
        artifacts: float,
        overall: float,
    ) -> str:
        """Generate recommendation for edit quality."""
        issues = []

        if instruction < 0.6:
            issues.append("instruction not fully followed")
        if preservation < 0.6:
            issues.append("too much content changed")
        if quality < 0.6:
            issues.append("edit quality issues")
        if artifacts < 0.7:
            issues.append("visible artifacts")

        if overall >= 0.85:
            return "Excellent edit. Ready for use."
        elif overall >= 0.70:
            if issues:
                return f"Good edit with minor issues: {', '.join(issues[:2])}."
            return "Good quality edit. Minor improvements possible."
        elif overall >= 0.50:
            if issues:
                return f"Moderate quality. Issues: {', '.join(issues)}."
            return "Moderate quality. Consider regenerating."
        else:
            return f"Low quality. Issues: {', '.join(issues) if issues else 'multiple problems'}."

    def evaluate(
        self,
        source_image: Image.Image,
        edited_image: Image.Image,
        instruction: str,
    ) -> EditEvalResult:
        """
        Run the full editing evaluation for one source/edited image pair.

        Instruction following, preservation and edit quality are evaluated in
        that order, then combined into an aggregated score.

        Args:
            source_image: Original image before editing
            edited_image: Image after editing
            instruction: The editing instruction that was applied

        Returns:
            EditEvalResult holding the aggregated score, the three component
            results and the elapsed wall-clock time in seconds
        """
        began_at = time.time()

        following = self.evaluate_instruction_following(edited_image, instruction)
        preserved = self.evaluate_preservation(source_image, edited_image)
        edit_quality = self.evaluate_edit_quality(edited_image, instruction)

        combined = self._calculate_edit_aggregated_score(
            instruction_result=following,
            preservation_result=preserved,
            quality_result=edit_quality,
        )

        elapsed = time.time() - began_at
        return EditEvalResult(
            score=combined,
            instruction_following=following,
            preservation=preserved,
            edit_quality=edit_quality,
            evaluation_time=elapsed,
        )