""" Validation Evaluator for UI Validation Use Case Evaluates predicted validation results (true/false) against expected results. Extracts reasoning from both predicted and expected outputs for LLM-as-judge feedback. """ from typing import Dict, Any, Optional import re import logging try: from .base_evaluator import BaseEvaluator except ImportError: # For standalone testing import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from gepa_optimizer.evaluation.base_evaluator import BaseEvaluator class ValidationEvaluator(BaseEvaluator): """ Evaluator for validation use case (true/false results). Features: - Normalizes boolean formats ("true"/"True"/"1" → True, "false"/"False"/"0" → False) - Extracts reasoning from both predicted and expected outputs (REQUIRED for LLM-as-judge) - Binary scoring: correct boolean = 1.0, wrong = 0.0 - Returns reasoning in evaluation results for LLM-as-judge feedback """ def __init__(self, metric_weights: Optional[Dict[str, float]] = None): """ Initialize validation evaluator. Args: metric_weights: Weights for evaluation metrics Default: {"output_match": 1.0} """ default_weights = { "output_match": 1.0 # Binary boolean comparison } weights = metric_weights or default_weights super().__init__(metric_weights=weights) def evaluate(self, predicted: str, expected: str) -> Dict[str, float]: """ Evaluate predicted validation result against expected result. Scoring Strategy: 1. Normalize both predicted and expected to boolean 2. Compare booleans (exact match required) 3. Extract reasoning from both (for LLM-as-judge) 4. Return 1.0 if match, 0.0 otherwise (binary scoring) Args: predicted: LLM's output (may include "true"/"false" + reasoning) expected: Expected output (should be "true" or "false", may include reasoning) Returns: Dictionary with evaluation metrics, extracted booleans, and reasoning: { "output_match": 1.0 or 0.0, "composite_score": 1.0 or 0.0, "predicted_output": str, "expected_output": str, "predicted_boolean": True/False, "expected_boolean": True/False, "predicted_reasoning": str, # REQUIRED for LLM-as-judge "expected_reasoning": str, # REQUIRED for LLM-as-judge "evaluation_reason": str } """ if not predicted or not expected: return { "output_match": 0.0, "composite_score": 0.0, "predicted_output": str(predicted).strip() if predicted else "", "expected_output": str(expected).strip() if expected else "", "predicted_boolean": None, "expected_boolean": None, "predicted_reasoning": "", "expected_reasoning": "", "evaluation_reason": "āŒ Empty or missing input/output" } predicted_str = str(predicted).strip() expected_str = str(expected).strip() # 1. Extract boolean from predicted output pred_bool = self._normalize_to_bool(predicted_str) pred_reasoning = self._extract_reasoning(predicted_str) # 2. Extract boolean from expected output exp_bool = self._normalize_to_bool(expected_str) exp_reasoning = self._extract_reasoning(expected_str) # šŸ”„ NEW: Detect output structure for both expected and predicted expected_structure = self._detect_output_structure(expected_str) predicted_structure = self._detect_output_structure(predicted_str) # Compare structures structure_match = (expected_structure['format'] == predicted_structure['format']) # 3. Compare booleans (binary scoring) if pred_bool is None or exp_bool is None: # Could not extract boolean from one or both score = 0.0 reason = "āŒ Could not extract boolean value" if pred_bool is None: reason += " from predicted output" if exp_bool is None: reason += " from expected output" else: # Both booleans extracted successfully - compare score = 1.0 if pred_bool == exp_bool else 0.0 if score == 1.0: reason = f"āœ… Correct! Result matches (both are {exp_bool})" # šŸ”„ NEW: Add note if structure doesn't match if not structure_match: reason += f" (but format differs: expected {expected_structure['format']}, got {predicted_structure['format']})" else: reason = f"āŒ Wrong result (predicted: {pred_bool}, expected: {exp_bool})" # 4. Log evaluation details self.logger.info(f"\n{'─'*70}") self.logger.info(f"šŸ“Š VALIDATION EVALUATION") self.logger.info(f"{'─'*70}") self.logger.info(f" Expected: '{expected_str[:100]}...' → {exp_bool}") self.logger.info(f" Predicted: '{predicted_str[:100]}...' → {pred_bool}") self.logger.info(f" {'─'*66}") self.logger.info(f" šŸŽÆ SCORE: {score:.2f} - {reason}") if pred_reasoning: self.logger.info(f" šŸ“ Predicted Reasoning: {pred_reasoning[:150]}...") if exp_reasoning: self.logger.info(f" šŸ“ Expected Reasoning: {exp_reasoning[:150]}...") # šŸ”„ NEW: Log structure comparison self.logger.info(f" šŸ“ Expected Format: {expected_structure['format']} (reasoning: {expected_structure['reasoning_quality']})") self.logger.info(f" šŸ“ Predicted Format: {predicted_structure['format']} (reasoning: {predicted_structure['reasoning_quality']})") if not structure_match: self.logger.warning(f" āš ļø OUTPUT STRUCTURE MISMATCH!") self.logger.info(f"{'─'*70}\n") return { "output_match": score, "composite_score": score, # This is what GEPA uses "predicted_output": predicted_str, "expected_output": expected_str, "predicted_boolean": pred_bool, "expected_boolean": exp_bool, "predicted_reasoning": pred_reasoning, # REQUIRED for LLM-as-judge "expected_reasoning": exp_reasoning, # REQUIRED for LLM-as-judge "evaluation_reason": reason, # šŸ”„ NEW: Structure metadata for LLM-as-judge "expected_structure": expected_structure, "predicted_structure": predicted_structure, "output_structure_match": structure_match, "expected_has_reasoning": expected_structure['has_reasoning'], "predicted_has_reasoning": predicted_structure['has_reasoning'], "reasoning_quality_gap": expected_structure['reasoning_quality'] + " → " + predicted_structure['reasoning_quality'] } def _normalize_to_bool(self, value: str) -> Optional[bool]: """ Normalize various formats to boolean. Handles: - "true", "True", "TRUE" → True - "false", "False", "FALSE" → False - "1", "0" → True, False - "yes", "no" → True, False - "correct", "incorrect" → True, False - JSON: {"result": true} → True - Text with boolean: "The result is true because..." → True Args: value: String that may contain a boolean value Returns: Boolean value or None if cannot be determined """ if not value: return None value_lower = value.lower().strip() # Direct boolean strings if value_lower in ("true", "1", "yes", "correct", "valid", "pass"): return True if value_lower in ("false", "0", "no", "incorrect", "invalid", "fail"): return False # JSON format: {"action": "TRUE"} or {"action": "FALSE"} or {"action": "LOADING"} # This handles the production prompt's JSON output format # Match both quoted and unquoted values, case-insensitive action_match = re.search(r'["\']?action["\']?\s*:\s*["\']?(true|false|loading)["\']?', value_lower) if action_match: action_value = action_match.group(1).lower() if action_value == "true": return True elif action_value == "false": return False elif action_value == "loading": # Treat LOADING as False for validation purposes (screen not ready) return False # Also try to parse full JSON structure if present (more robust) try: import json # Try to find and parse JSON object json_start = value.find('{') if json_start != -1: # Try to extract JSON from the response for end_idx in range(len(value), json_start, -1): try: json_str = value[json_start:end_idx] data = json.loads(json_str) # Check for "action" field (production prompt format) if "action" in data: action_val = str(data["action"]).upper() if action_val == "TRUE": return True elif action_val == "FALSE": return False elif action_val == "LOADING": return False # Treat as False # Check for "result" field (alternative format) if "result" in data: result_val = data["result"] if isinstance(result_val, bool): return result_val elif isinstance(result_val, str): return result_val.lower() in ("true", "1", "yes") except (json.JSONDecodeError, KeyError, ValueError): continue except Exception: pass # Fall through to other extraction methods # JSON format: {"result": true} or {"result": false} json_match = re.search(r'["\']?result["\']?\s*:\s*(true|false)', value_lower) if json_match: return json_match.group(1) == "true" # Pattern: "result is true" or "result: true" pattern_match = re.search(r'result[:\s]+(true|false)', value_lower) if pattern_match: return pattern_match.group(1) == "true" # Pattern: "is true" or "is false" (standalone) is_match = re.search(r'\b(is|are)\s+(true|false)\b', value_lower) if is_match: return is_match.group(2) == "true" # Pattern: "true" or "false" as standalone word (not in other words) standalone_match = re.search(r'\b(true|false)\b', value_lower) if standalone_match: return standalone_match.group(1) == "true" # Last resort: check if "true" appears before "false" in text true_pos = value_lower.find("true") false_pos = value_lower.find("false") if true_pos != -1 and false_pos != -1: # Both found - use the one that appears first return true_pos < false_pos elif true_pos != -1: return True elif false_pos != -1: return False # Cannot determine return None def _detect_output_structure(self, output: str) -> Dict[str, Any]: """ Dynamically detect the structure/components of the output. This detects: - Boolean result presence - Reasoning/explanation presence and quality - Output format (boolean only, boolean+reasoning, etc.) Args: output: Output string to analyze Returns: Dictionary with structure information: { "has_boolean": bool, "has_reasoning": bool, "reasoning_length": int, "reasoning_quality": str, # "missing", "minimal", "adequate", "detailed" "format": str # "boolean_only", "boolean_with_reasoning", "unknown" } """ if not output: return { "has_boolean": False, "has_reasoning": False, "reasoning_length": 0, "reasoning_quality": "missing", "format": "empty" } output_clean = output.strip() # Detect boolean has_boolean = self._normalize_to_bool(output_clean) is not None # Extract reasoning reasoning = self._extract_reasoning(output_clean) has_reasoning = len(reasoning) > 15 # Minimum 15 chars to count as reasoning reasoning_length = len(reasoning) # Classify reasoning quality if reasoning_length == 0: reasoning_quality = "missing" elif reasoning_length < 30: reasoning_quality = "minimal" # Just a few words elif reasoning_length < 100: reasoning_quality = "adequate" # Brief explanation else: reasoning_quality = "detailed" # Full explanation # Determine format if has_boolean and has_reasoning: output_format = "boolean_with_reasoning" elif has_boolean and not has_reasoning: output_format = "boolean_only" elif not has_boolean and has_reasoning: output_format = "reasoning_only" else: output_format = "unknown" return { "has_boolean": has_boolean, "has_reasoning": has_reasoning, "reasoning_length": reasoning_length, "reasoning_quality": reasoning_quality, "format": output_format } def _extract_reasoning(self, output: str) -> str: """ Extract reasoning/explanation from output string. This is REQUIRED for LLM-as-judge feedback. The reasoning helps the judge understand why the result was true/false and compare predicted vs expected reasoning. Args: output: Full output string that may contain reasoning Returns: Extracted reasoning text, or empty string if not found """ if not output: return "" # Patterns to find reasoning sections reasoning_patterns = [ r'[Rr]eason[:\s]+(.*?)(?:\n\n|\Z)', # "Reason: ..." r'[Ee]xplanation[:\s]+(.*?)(?:\n\n|\Z)', # "Explanation: ..." r'[Bb]ecause[:\s]+(.*?)(?:\n\n|\Z)', # "Because: ..." r'[Ww]hy[:\s]+(.*?)(?:\n\n|\Z)', # "Why: ..." r'[Dd]etails[:\s]+(.*?)(?:\n\n|\Z)', # "Details: ..." ] # Try each pattern for pattern in reasoning_patterns: match = re.search(pattern, output, re.DOTALL | re.IGNORECASE) if match: reasoning = match.group(1).strip() if len(reasoning) > 20: # Only return if substantial return reasoning # If no explicit reasoning section, check if output has substantial text # after boolean (likely contains reasoning) bool_match = re.search(r'\b(true|false)\b', output.lower()) if bool_match: # Get text after the boolean bool_pos = bool_match.end() remaining = output[bool_pos:].strip() # If remaining text is substantial (more than just punctuation), use it if len(remaining) > 30: # Clean up common prefixes remaining = re.sub(r'^[:\s.,;!?-]+', '', remaining) if remaining: return remaining # If output is long and doesn't start with boolean, might be all reasoning if len(output) > 100 and not re.match(r'^\s*(true|false)\s*$', output, re.IGNORECASE): # Return first 500 chars as reasoning return output[:500].strip() # No reasoning found return "" def get_evaluation_summary(self, results: list) -> Dict[str, Any]: """ Get summary statistics for a batch of evaluations. Args: results: List of evaluation result dictionaries Returns: Summary statistics including accuracy, true/false distribution """ if not results: return { "total_samples": 0, "accuracy": 0.0, "correct_predictions": 0, "incorrect_predictions": 0, "true_predictions": 0, "false_predictions": 0 } total = len(results) correct = sum(1 for r in results if r.get("output_match", 0.0) == 1.0) accuracy = correct / total if total > 0 else 0.0 # Count true/false predictions true_preds = sum(1 for r in results if r.get("predicted_boolean") is True) false_preds = sum(1 for r in results if r.get("predicted_boolean") is False) return { "total_samples": total, "accuracy": accuracy, "correct_predictions": correct, "incorrect_predictions": total - correct, "true_predictions": true_preds, "false_predictions": false_preds } # Example usage and testing if __name__ == "__main__": print("šŸš€ Testing Validation Evaluator...") evaluator = ValidationEvaluator() # Test cases test_cases = [ # (predicted, expected, should_match) ("true", "true", True), ("false", "false", True), ("True", "true", True), ("FALSE", "false", True), ("1", "true", True), ("0", "false", True), ("true", "false", False), ("false", "true", False), ("The result is true because the button is visible", "true", True), ("The result is false because the element is not found", "false", True), ('{"result": true, "reasoning": "Button is visible"}', "true", True), ("Result: true\n\nReasoning: The submit button is clearly visible at the bottom of the screen.", "true", True), ("", "true", False), ("invalid", "true", False), ] print("\nšŸ“ Running test cases:") print("-" * 80) results = [] for predicted, expected, should_match in test_cases: result = evaluator.evaluate(predicted, expected) match = result["composite_score"] == 1.0 status = "āœ…" if match == should_match else "āŒ" pred_bool = result.get("predicted_boolean", "?") exp_bool = result.get("expected_boolean", "?") pred_reason = result.get("predicted_reasoning", "")[:50] print(f"{status} Predicted: '{predicted[:40]}...' → {pred_bool}") print(f" Expected: '{expected}' → {exp_bool}") print(f" Match: {match} (should be {should_match})") if pred_reason: print(f" Reasoning: {pred_reason}...") print() results.append(result) # Summary print("\nšŸ“Š Summary:") summary = evaluator.get_evaluation_summary(results) print(f" Total: {summary['total_samples']}") print(f" Correct: {summary['correct_predictions']}") print(f" Accuracy: {summary['accuracy']:.1%}") print(f" True predictions: {summary['true_predictions']}") print(f" False predictions: {summary['false_predictions']}")