"""
Validation Evaluator for UI Validation Use Case
Evaluates predicted validation results (true/false) against expected results.
Extracts reasoning from both predicted and expected outputs for LLM-as-judge feedback.
"""
from typing import Dict, Any, Optional
import re
import json
import logging
try:
    from .base_evaluator import BaseEvaluator
except ImportError:
    # For standalone testing
    import sys
    from pathlib import Path
    sys.path.insert(0, str(Path(__file__).parent.parent.parent))
    from gepa_optimizer.evaluation.base_evaluator import BaseEvaluator

class ValidationEvaluator(BaseEvaluator):
    """
    Evaluator for the validation use case (true/false results).

    Features:
    - Normalizes boolean formats ("true"/"True"/"1" → True, "false"/"False"/"0" → False)
    - Extracts reasoning from both predicted and expected outputs (REQUIRED for LLM-as-judge)
    - Binary scoring: correct boolean = 1.0, wrong = 0.0
    - Returns reasoning in evaluation results for LLM-as-judge feedback
    """

    def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
        """
        Initialize the validation evaluator.

        Args:
            metric_weights: Weights for evaluation metrics.
                Default: {"output_match": 1.0}
        """
        default_weights = {
            "output_match": 1.0  # Binary boolean comparison
        }
        weights = metric_weights or default_weights
        super().__init__(metric_weights=weights)
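
    # Construction sketch (illustrative, not part of the original file): the
    # single default metric means arguments are rarely needed.
    #
    #   evaluator = ValidationEvaluator()                        # default weights
    #   evaluator = ValidationEvaluator({"output_match": 1.0})   # equivalent, explicit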

    def evaluate(self, predicted: str, expected: str) -> Dict[str, Any]:
        """
        Evaluate a predicted validation result against the expected result.

        Scoring strategy:
        1. Normalize both predicted and expected to booleans
        2. Compare the booleans (exact match required)
        3. Extract reasoning from both (for LLM-as-judge)
        4. Return 1.0 on a match, 0.0 otherwise (binary scoring)

        Args:
            predicted: LLM's output (may include "true"/"false" plus reasoning)
            expected: Expected output (should be "true" or "false", may include reasoning)

        Returns:
            Dictionary with evaluation metrics, extracted booleans, and reasoning:
            {
                "output_match": 1.0 or 0.0,
                "composite_score": 1.0 or 0.0,
                "predicted_output": str,
                "expected_output": str,
                "predicted_boolean": True/False,
                "expected_boolean": True/False,
                "predicted_reasoning": str,  # REQUIRED for LLM-as-judge
                "expected_reasoning": str,   # REQUIRED for LLM-as-judge
                "evaluation_reason": str
            }
        """
        if not predicted or not expected:
            return {
                "output_match": 0.0,
                "composite_score": 0.0,
                "predicted_output": str(predicted).strip() if predicted else "",
                "expected_output": str(expected).strip() if expected else "",
                "predicted_boolean": None,
                "expected_boolean": None,
                "predicted_reasoning": "",
                "expected_reasoning": "",
                "evaluation_reason": "❌ Empty or missing input/output"
            }
        predicted_str = str(predicted).strip()
        expected_str = str(expected).strip()

        # 1. Extract boolean and reasoning from the predicted output
        pred_bool = self._normalize_to_bool(predicted_str)
        pred_reasoning = self._extract_reasoning(predicted_str)

        # 2. Extract boolean and reasoning from the expected output
        exp_bool = self._normalize_to_bool(expected_str)
        exp_reasoning = self._extract_reasoning(expected_str)

        # NEW: Detect output structure for both expected and predicted
        expected_structure = self._detect_output_structure(expected_str)
        predicted_structure = self._detect_output_structure(predicted_str)

        # Compare structures
        structure_match = (expected_structure['format'] == predicted_structure['format'])

        # 3. Compare booleans (binary scoring)
        if pred_bool is None or exp_bool is None:
            # Could not extract a boolean from one or both
            score = 0.0
            reason = "❌ Could not extract boolean value"
            if pred_bool is None:
                reason += " from predicted output"
            if exp_bool is None:
                reason += " from expected output"
        else:
            # Both booleans extracted successfully - compare
            score = 1.0 if pred_bool == exp_bool else 0.0
            if score == 1.0:
                reason = f"✅ Correct! Result matches (both are {exp_bool})"
                # NEW: Add a note if the structure doesn't match
                if not structure_match:
                    reason += f" (but format differs: expected {expected_structure['format']}, got {predicted_structure['format']})"
            else:
                reason = f"❌ Wrong result (predicted: {pred_bool}, expected: {exp_bool})"

        # 4. Log evaluation details
        self.logger.info(f"\n{'=' * 70}")
        self.logger.info("VALIDATION EVALUATION")
        self.logger.info(f"{'=' * 70}")
        self.logger.info(f"  Expected:  '{expected_str[:100]}...' → {exp_bool}")
        self.logger.info(f"  Predicted: '{predicted_str[:100]}...' → {pred_bool}")
        self.logger.info(f"  {'-' * 66}")
        self.logger.info(f"  SCORE: {score:.2f} - {reason}")
        if pred_reasoning:
            self.logger.info(f"  Predicted Reasoning: {pred_reasoning[:150]}...")
        if exp_reasoning:
            self.logger.info(f"  Expected Reasoning: {exp_reasoning[:150]}...")
        # NEW: Log structure comparison
        self.logger.info(f"  Expected Format: {expected_structure['format']} (reasoning: {expected_structure['reasoning_quality']})")
        self.logger.info(f"  Predicted Format: {predicted_structure['format']} (reasoning: {predicted_structure['reasoning_quality']})")
        if not structure_match:
            self.logger.warning("  ⚠️ OUTPUT STRUCTURE MISMATCH!")
        self.logger.info(f"{'=' * 70}\n")

        return {
            "output_match": score,
            "composite_score": score,  # This is what GEPA uses
            "predicted_output": predicted_str,
            "expected_output": expected_str,
            "predicted_boolean": pred_bool,
            "expected_boolean": exp_bool,
            "predicted_reasoning": pred_reasoning,  # REQUIRED for LLM-as-judge
            "expected_reasoning": exp_reasoning,    # REQUIRED for LLM-as-judge
            "evaluation_reason": reason,
            # NEW: Structure metadata for LLM-as-judge
            "expected_structure": expected_structure,
            "predicted_structure": predicted_structure,
            "output_structure_match": structure_match,
            "expected_has_reasoning": expected_structure['has_reasoning'],
            "predicted_has_reasoning": predicted_structure['has_reasoning'],
            "reasoning_quality_gap": f"{expected_structure['reasoning_quality']} → {predicted_structure['reasoning_quality']}"
        }
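
    # Illustrative call (a sketch; the values shown are what the logic above
    # produces for matching boolean-only inputs):
    #
    #   evaluator.evaluate("true", "true")
    #   → {"output_match": 1.0, "composite_score": 1.0,
    #      "predicted_boolean": True, "expected_boolean": True,
    #      "output_structure_match": True, ...}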

    def _normalize_to_bool(self, value: str) -> Optional[bool]:
        """
        Normalize various formats to a boolean.

        Handles:
        - "true", "True", "TRUE" → True
        - "false", "False", "FALSE" → False
        - "1" → True, "0" → False
        - "yes" → True, "no" → False
        - "correct" → True, "incorrect" → False
        - JSON: {"result": true} → True
        - Text containing a boolean: "The result is true because..." → True

        Args:
            value: String that may contain a boolean value

        Returns:
            Boolean value, or None if it cannot be determined
        """
        if not value:
            return None
        value_lower = value.lower().strip()

        # Direct boolean strings
        if value_lower in ("true", "1", "yes", "correct", "valid", "pass"):
            return True
        if value_lower in ("false", "0", "no", "incorrect", "invalid", "fail"):
            return False

        # JSON format: {"action": "TRUE"}, {"action": "FALSE"}, or {"action": "LOADING"}.
        # This handles the production prompt's JSON output format.
        # Match both quoted and unquoted values, case-insensitive.
        action_match = re.search(r'["\']?action["\']?\s*:\s*["\']?(true|false|loading)["\']?', value_lower)
        if action_match:
            action_value = action_match.group(1)
            if action_value == "true":
                return True
            elif action_value == "false":
                return False
            elif action_value == "loading":
                # Treat LOADING as False for validation purposes (screen not ready)
                return False

        # Also try to parse a full JSON structure if present (more robust)
        try:
            json_start = value.find('{')
            if json_start != -1:
                # Try progressively shorter substrings until one parses as JSON
                for end_idx in range(len(value), json_start, -1):
                    try:
                        data = json.loads(value[json_start:end_idx])
                        # Check for the "action" field (production prompt format)
                        if "action" in data:
                            action_val = str(data["action"]).upper()
                            if action_val == "TRUE":
                                return True
                            elif action_val == "FALSE":
                                return False
                            elif action_val == "LOADING":
                                return False  # Treat as False
                        # Check for the "result" field (alternative format)
                        if "result" in data:
                            result_val = data["result"]
                            if isinstance(result_val, bool):
                                return result_val
                            elif isinstance(result_val, str):
                                return result_val.lower() in ("true", "1", "yes")
                    except (json.JSONDecodeError, KeyError, ValueError):
                        continue
        except Exception:
            pass  # Fall through to the other extraction methods

        # JSON-style format: {"result": true} or {"result": false}
        json_match = re.search(r'["\']?result["\']?\s*:\s*(true|false)', value_lower)
        if json_match:
            return json_match.group(1) == "true"

        # Pattern: "result: true" or "result true"
        pattern_match = re.search(r'result[:\s]+(true|false)', value_lower)
        if pattern_match:
            return pattern_match.group(1) == "true"

        # Pattern: "is true" or "is false" (standalone)
        is_match = re.search(r'\b(is|are)\s+(true|false)\b', value_lower)
        if is_match:
            return is_match.group(2) == "true"

        # Pattern: "true" or "false" as a standalone word (not inside other words)
        standalone_match = re.search(r'\b(true|false)\b', value_lower)
        if standalone_match:
            return standalone_match.group(1) == "true"

        # Last resort: substring search, using whichever of "true"/"false" appears first
        true_pos = value_lower.find("true")
        false_pos = value_lower.find("false")
        if true_pos != -1 and false_pos != -1:
            return true_pos < false_pos
        elif true_pos != -1:
            return True
        elif false_pos != -1:
            return False

        # Cannot determine
        return None
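
    # Illustrative normalization behavior (a sketch derived from the rules
    # above, not an exhaustive specification):
    #
    #   _normalize_to_bool("TRUE")                  → True   (direct string)
    #   _normalize_to_bool('{"action": "LOADING"}') → False  (screen not ready)
    #   _normalize_to_bool("The result is true")    → True   ("is true" pattern)
    #   _normalize_to_bool("maybe")                 → None   (cannot determine)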

    def _detect_output_structure(self, output: str) -> Dict[str, Any]:
        """
        Dynamically detect the structure/components of the output.

        This detects:
        - Boolean result presence
        - Reasoning/explanation presence and quality
        - Output format (boolean only, boolean + reasoning, etc.)

        Args:
            output: Output string to analyze

        Returns:
            Dictionary with structure information:
            {
                "has_boolean": bool,
                "has_reasoning": bool,
                "reasoning_length": int,
                "reasoning_quality": str,  # "missing", "minimal", "adequate", "detailed"
                "format": str  # "boolean_only", "boolean_with_reasoning", "reasoning_only", "unknown", "empty"
            }
        """
        if not output:
            return {
                "has_boolean": False,
                "has_reasoning": False,
                "reasoning_length": 0,
                "reasoning_quality": "missing",
                "format": "empty"
            }
        output_clean = output.strip()

        # Detect boolean
        has_boolean = self._normalize_to_bool(output_clean) is not None

        # Extract reasoning
        reasoning = self._extract_reasoning(output_clean)
        has_reasoning = len(reasoning) > 15  # More than 15 chars to count as reasoning
        reasoning_length = len(reasoning)

        # Classify reasoning quality by length
        if reasoning_length == 0:
            reasoning_quality = "missing"
        elif reasoning_length < 30:
            reasoning_quality = "minimal"   # Just a few words
        elif reasoning_length < 100:
            reasoning_quality = "adequate"  # Brief explanation
        else:
            reasoning_quality = "detailed"  # Full explanation

        # Determine format
        if has_boolean and has_reasoning:
            output_format = "boolean_with_reasoning"
        elif has_boolean and not has_reasoning:
            output_format = "boolean_only"
        elif not has_boolean and has_reasoning:
            output_format = "reasoning_only"
        else:
            output_format = "unknown"

        return {
            "has_boolean": has_boolean,
            "has_reasoning": has_reasoning,
            "reasoning_length": reasoning_length,
            "reasoning_quality": reasoning_quality,
            "format": output_format
        }
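
    # Illustrative classification (a sketch of the dict returned above for a
    # bare boolean output):
    #
    #   _detect_output_structure("true")
    #   → {"has_boolean": True, "has_reasoning": False, "reasoning_length": 0,
    #      "reasoning_quality": "missing", "format": "boolean_only"}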

    def _extract_reasoning(self, output: str) -> str:
        """
        Extract the reasoning/explanation from an output string.

        This is REQUIRED for LLM-as-judge feedback. The reasoning helps
        the judge understand why the result was true/false and compare
        predicted vs expected reasoning.

        Args:
            output: Full output string that may contain reasoning

        Returns:
            Extracted reasoning text, or an empty string if none is found
        """
        if not output:
            return ""

        # Patterns that introduce an explicit reasoning section
        reasoning_patterns = [
            r'[Rr]eason[:\s]+(.*?)(?:\n\n|\Z)',       # "Reason: ..."
            r'[Ee]xplanation[:\s]+(.*?)(?:\n\n|\Z)',  # "Explanation: ..."
            r'[Bb]ecause[:\s]+(.*?)(?:\n\n|\Z)',      # "Because: ..."
            r'[Ww]hy[:\s]+(.*?)(?:\n\n|\Z)',          # "Why: ..."
            r'[Dd]etails[:\s]+(.*?)(?:\n\n|\Z)',      # "Details: ..."
        ]

        # Try each pattern
        for pattern in reasoning_patterns:
            match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
            if match:
                reasoning = match.group(1).strip()
                if len(reasoning) > 20:  # Only return if substantial
                    return reasoning

        # If there is no explicit reasoning section, check whether the output has
        # substantial text after the boolean (it likely contains reasoning)
        bool_match = re.search(r'\b(true|false)\b', output.lower())
        if bool_match:
            # Get the text after the boolean
            bool_pos = bool_match.end()
            remaining = output[bool_pos:].strip()
            # If the remaining text is substantial (more than just punctuation), use it
            if len(remaining) > 30:
                # Clean up common prefixes
                remaining = re.sub(r'^[:\s.,;!?-]+', '', remaining)
                if remaining:
                    return remaining

        # If the output is long and is not just a bare boolean, it may be all reasoning
        if len(output) > 100 and not re.match(r'^\s*(true|false)\s*$', output, re.IGNORECASE):
            # Return the first 500 chars as reasoning
            return output[:500].strip()

        # No reasoning found
        return ""

    def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
        """
        Get summary statistics for a batch of evaluations.

        Args:
            results: List of evaluation result dictionaries

        Returns:
            Summary statistics, including accuracy and the true/false distribution
        """
        if not results:
            return {
                "total_samples": 0,
                "accuracy": 0.0,
                "correct_predictions": 0,
                "incorrect_predictions": 0,
                "true_predictions": 0,
                "false_predictions": 0
            }

        total = len(results)
        correct = sum(1 for r in results if r.get("output_match", 0.0) == 1.0)
        accuracy = correct / total

        # Count true/false predictions
        true_preds = sum(1 for r in results if r.get("predicted_boolean") is True)
        false_preds = sum(1 for r in results if r.get("predicted_boolean") is False)

        return {
            "total_samples": total,
            "accuracy": accuracy,
            "correct_predictions": correct,
            "incorrect_predictions": total - correct,
            "true_predictions": true_preds,
            "false_predictions": false_preds
        }
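
    # Illustrative summary over two hypothetical evaluation dicts:
    #
    #   get_evaluation_summary([
    #       {"output_match": 1.0, "predicted_boolean": True},
    #       {"output_match": 0.0, "predicted_boolean": False},
    #   ])
    #   → {"total_samples": 2, "accuracy": 0.5, "correct_predictions": 1,
    #      "incorrect_predictions": 1, "true_predictions": 1, "false_predictions": 1}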

# Example usage and testing
if __name__ == "__main__":
    print("Testing Validation Evaluator...")
    evaluator = ValidationEvaluator()

    # Test cases: (predicted, expected, should_match)
    test_cases = [
        ("true", "true", True),
        ("false", "false", True),
        ("True", "true", True),
        ("FALSE", "false", True),
        ("1", "true", True),
        ("0", "false", True),
        ("true", "false", False),
        ("false", "true", False),
        ("The result is true because the button is visible", "true", True),
        ("The result is false because the element is not found", "false", True),
        ('{"result": true, "reasoning": "Button is visible"}', "true", True),
        ("Result: true\n\nReasoning: The submit button is clearly visible at the bottom of the screen.", "true", True),
        ("", "true", False),
        ("invalid", "true", False),
    ]

    print("\nRunning test cases:")
    print("-" * 80)
    results = []
    for predicted, expected, should_match in test_cases:
        result = evaluator.evaluate(predicted, expected)
        match = result["composite_score"] == 1.0
        status = "✅" if match == should_match else "❌"
        pred_bool = result.get("predicted_boolean", "?")
        exp_bool = result.get("expected_boolean", "?")
        pred_reason = result.get("predicted_reasoning", "")[:50]
        print(f"{status} Predicted: '{predicted[:40]}...' → {pred_bool}")
        print(f"   Expected: '{expected}' → {exp_bool}")
        print(f"   Match: {match} (should be {should_match})")
        if pred_reason:
            print(f"   Reasoning: {pred_reason}...")
        print()
        results.append(result)

    # Summary
    print("\nSummary:")
    summary = evaluator.get_evaluation_summary(results)
    print(f"  Total: {summary['total_samples']}")
    print(f"  Correct: {summary['correct_predictions']}")
    print(f"  Accuracy: {summary['accuracy']:.1%}")
    print(f"  True predictions: {summary['true_predictions']}")
    print(f"  False predictions: {summary['false_predictions']}")