Spaces:
Sleeping
Sleeping
| """ | |
| Validation Evaluator for UI Validation Use Case | |
| Evaluates predicted validation results (true/false) against expected results. | |
| Extracts reasoning from both predicted and expected outputs for LLM-as-judge feedback. | |
| """ | |
| from typing import Dict, Any, Optional | |
| import re | |
| import logging | |
| try: | |
| from .base_evaluator import BaseEvaluator | |
| except ImportError: | |
| # For standalone testing | |
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).parent.parent.parent)) | |
| from gepa_optimizer.evaluation.base_evaluator import BaseEvaluator | |
class ValidationEvaluator(BaseEvaluator):
    """
    Evaluator for the UI-validation use case (true/false results).

    Features:
    - Normalizes boolean formats ("true"/"True"/"1" -> True, "false"/"False"/"0" -> False)
    - Extracts reasoning from both predicted and expected outputs (REQUIRED for LLM-as-judge)
    - Binary scoring: correct boolean = 1.0, wrong = 0.0
    - Returns reasoning in evaluation results for LLM-as-judge feedback

    NOTE(review): the captured source had all non-ASCII characters mojibake'd
    ("β", "π", ...). They have been restored to the most plausible originals
    (e.g. "✅"/"❌"/"→") — confirm against repository history before merging.
    """

    def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
        """
        Initialize validation evaluator.

        Args:
            metric_weights: Weights for evaluation metrics.
                Default: {"output_match": 1.0}
        """
        default_weights = {
            "output_match": 1.0  # Binary boolean comparison
        }
        weights = metric_weights or default_weights
        super().__init__(metric_weights=weights)

    def evaluate(self, predicted: str, expected: str) -> Dict[str, Any]:
        """
        Evaluate predicted validation result against expected result.

        Scoring Strategy:
        1. Normalize both predicted and expected to boolean
        2. Compare booleans (exact match required)
        3. Extract reasoning from both (for LLM-as-judge)
        4. Return 1.0 if match, 0.0 otherwise (binary scoring)

        Args:
            predicted: LLM's output (may include "true"/"false" + reasoning)
            expected: Expected output (should be "true" or "false", may include reasoning)

        Returns:
            Dictionary with evaluation metrics, extracted booleans, reasoning,
            and output-structure metadata. "composite_score" is the value GEPA
            consumes; the booleans may be None when extraction failed.
        """
        # Guard: empty/missing input short-circuits to a zero score.
        if not predicted or not expected:
            return {
                "output_match": 0.0,
                "composite_score": 0.0,
                "predicted_output": str(predicted).strip() if predicted else "",
                "expected_output": str(expected).strip() if expected else "",
                "predicted_boolean": None,
                "expected_boolean": None,
                "predicted_reasoning": "",
                "expected_reasoning": "",
                "evaluation_reason": "❌ Empty or missing input/output"
            }

        predicted_str = str(predicted).strip()
        expected_str = str(expected).strip()

        # 1. Extract boolean + reasoning from predicted output
        pred_bool = self._normalize_to_bool(predicted_str)
        pred_reasoning = self._extract_reasoning(predicted_str)

        # 2. Extract boolean + reasoning from expected output
        exp_bool = self._normalize_to_bool(expected_str)
        exp_reasoning = self._extract_reasoning(expected_str)

        # Detect output structure for both expected and predicted
        expected_structure = self._detect_output_structure(expected_str)
        predicted_structure = self._detect_output_structure(predicted_str)

        # Compare structures (format string equality only)
        structure_match = (expected_structure['format'] == predicted_structure['format'])

        # 3. Compare booleans (binary scoring)
        if pred_bool is None or exp_bool is None:
            # Could not extract boolean from one or both sides
            score = 0.0
            reason = "❌ Could not extract boolean value"
            if pred_bool is None:
                reason += " from predicted output"
            if exp_bool is None:
                reason += " from expected output"
        else:
            # Both booleans extracted successfully - compare
            score = 1.0 if pred_bool == exp_bool else 0.0
            if score == 1.0:
                reason = f"✅ Correct! Result matches (both are {exp_bool})"
                # Add a note when the result is right but the format differs
                if not structure_match:
                    reason += f" (but format differs: expected {expected_structure['format']}, got {predicted_structure['format']})"
            else:
                reason = f"❌ Wrong result (predicted: {pred_bool}, expected: {exp_bool})"

        # 4. Log evaluation details (truncated previews to keep logs readable)
        self.logger.info(f"\n{'─'*70}")
        self.logger.info(f"🔍 VALIDATION EVALUATION")
        self.logger.info(f"{'─'*70}")
        self.logger.info(f"  Expected:  '{expected_str[:100]}...' → {exp_bool}")
        self.logger.info(f"  Predicted: '{predicted_str[:100]}...' → {pred_bool}")
        self.logger.info(f"  {'─'*66}")
        self.logger.info(f"  🎯 SCORE: {score:.2f} - {reason}")
        if pred_reasoning:
            self.logger.info(f"  📝 Predicted Reasoning: {pred_reasoning[:150]}...")
        if exp_reasoning:
            self.logger.info(f"  📝 Expected Reasoning: {exp_reasoning[:150]}...")
        # Log structure comparison
        self.logger.info(f"  📋 Expected Format: {expected_structure['format']} (reasoning: {expected_structure['reasoning_quality']})")
        self.logger.info(f"  📋 Predicted Format: {predicted_structure['format']} (reasoning: {predicted_structure['reasoning_quality']})")
        if not structure_match:
            self.logger.warning(f"  ⚠️ OUTPUT STRUCTURE MISMATCH!")
        self.logger.info(f"{'─'*70}\n")

        return {
            "output_match": score,
            "composite_score": score,  # This is what GEPA uses
            "predicted_output": predicted_str,
            "expected_output": expected_str,
            "predicted_boolean": pred_bool,
            "expected_boolean": exp_bool,
            "predicted_reasoning": pred_reasoning,  # REQUIRED for LLM-as-judge
            "expected_reasoning": exp_reasoning,    # REQUIRED for LLM-as-judge
            "evaluation_reason": reason,
            # Structure metadata for LLM-as-judge
            "expected_structure": expected_structure,
            "predicted_structure": predicted_structure,
            "output_structure_match": structure_match,
            "expected_has_reasoning": expected_structure['has_reasoning'],
            "predicted_has_reasoning": predicted_structure['has_reasoning'],
            "reasoning_quality_gap": expected_structure['reasoning_quality'] + " → " + predicted_structure['reasoning_quality']
        }

    def _normalize_to_bool(self, value: str) -> Optional[bool]:
        """
        Normalize various formats to boolean.

        Handles:
        - "true", "True", "TRUE" -> True
        - "false", "False", "FALSE" -> False
        - "1", "0" -> True, False
        - "yes", "no" -> True, False
        - "correct", "incorrect" -> True, False
        - JSON: {"result": true} or {"action": "TRUE"} -> True
        - Text with boolean: "The result is true because..." -> True

        Args:
            value: String that may contain a boolean value

        Returns:
            Boolean value, or None if it cannot be determined
        """
        if not value:
            return None

        value_lower = value.lower().strip()

        # Direct boolean strings
        if value_lower in ("true", "1", "yes", "correct", "valid", "pass"):
            return True
        if value_lower in ("false", "0", "no", "incorrect", "invalid", "fail"):
            return False

        # JSON format: {"action": "TRUE"} or {"action": "FALSE"} or {"action": "LOADING"}
        # This handles the production prompt's JSON output format.
        # Match both quoted and unquoted values, case-insensitive.
        action_match = re.search(r'["\']?action["\']?\s*:\s*["\']?(true|false|loading)["\']?', value_lower)
        if action_match:
            action_value = action_match.group(1).lower()
            if action_value == "true":
                return True
            elif action_value == "false":
                return False
            elif action_value == "loading":
                # Treat LOADING as False for validation purposes (screen not ready)
                return False

        # Also try to parse a full JSON structure if present (more robust).
        # raw_decode() parses the first complete JSON value starting at the
        # brace and ignores trailing text — a single linear pass instead of
        # the previous O(n^2) try-every-truncation loop.
        try:
            import json
            json_start = value.find('{')
            if json_start != -1:
                try:
                    data, _ = json.JSONDecoder().raw_decode(value[json_start:])
                except (json.JSONDecodeError, ValueError):
                    data = None
                if isinstance(data, dict):
                    # Check for "action" field (production prompt format)
                    if "action" in data:
                        action_val = str(data["action"]).upper()
                        if action_val == "TRUE":
                            return True
                        elif action_val == "FALSE":
                            return False
                        elif action_val == "LOADING":
                            return False  # Treat as False
                    # Check for "result" field (alternative format)
                    if "result" in data:
                        result_val = data["result"]
                        if isinstance(result_val, bool):
                            return result_val
                        elif isinstance(result_val, str):
                            return result_val.lower() in ("true", "1", "yes")
        except Exception:
            pass  # Fall through to other extraction methods

        # JSON format: {"result": true} or {"result": false}
        json_match = re.search(r'["\']?result["\']?\s*:\s*(true|false)', value_lower)
        if json_match:
            return json_match.group(1) == "true"

        # Pattern: "result is true" or "result: true"
        pattern_match = re.search(r'result[:\s]+(true|false)', value_lower)
        if pattern_match:
            return pattern_match.group(1) == "true"

        # Pattern: "is true" or "is false" (standalone)
        is_match = re.search(r'\b(is|are)\s+(true|false)\b', value_lower)
        if is_match:
            return is_match.group(2) == "true"

        # Pattern: "true" or "false" as standalone word (not in other words)
        standalone_match = re.search(r'\b(true|false)\b', value_lower)
        if standalone_match:
            return standalone_match.group(1) == "true"

        # Last resort: whichever of "true"/"false" appears first in the text
        true_pos = value_lower.find("true")
        false_pos = value_lower.find("false")
        if true_pos != -1 and false_pos != -1:
            # Both found - use the one that appears first
            return true_pos < false_pos
        elif true_pos != -1:
            return True
        elif false_pos != -1:
            return False

        # Cannot determine
        return None

    def _detect_output_structure(self, output: str) -> Dict[str, Any]:
        """
        Dynamically detect the structure/components of the output.

        This detects:
        - Boolean result presence
        - Reasoning/explanation presence and quality
        - Output format (boolean only, boolean+reasoning, etc.)

        Args:
            output: Output string to analyze

        Returns:
            Dictionary with structure information:
            {
                "has_boolean": bool,
                "has_reasoning": bool,
                "reasoning_length": int,
                "reasoning_quality": str,  # "missing", "minimal", "adequate", "detailed"
                "format": str  # "boolean_only", "boolean_with_reasoning", ...
            }
        """
        if not output:
            return {
                "has_boolean": False,
                "has_reasoning": False,
                "reasoning_length": 0,
                "reasoning_quality": "missing",
                "format": "empty"
            }

        output_clean = output.strip()

        # Detect boolean
        has_boolean = self._normalize_to_bool(output_clean) is not None

        # Extract reasoning
        reasoning = self._extract_reasoning(output_clean)
        # NOTE: has_reasoning requires > 15 chars, but reasoning_quality below
        # labels 1-29 chars "minimal" — a 1-15 char string is "minimal" yet
        # has_reasoning=False. Preserved as-is; confirm this is intentional.
        has_reasoning = len(reasoning) > 15  # Minimum 15 chars to count as reasoning
        reasoning_length = len(reasoning)

        # Classify reasoning quality by length buckets
        if reasoning_length == 0:
            reasoning_quality = "missing"
        elif reasoning_length < 30:
            reasoning_quality = "minimal"   # Just a few words
        elif reasoning_length < 100:
            reasoning_quality = "adequate"  # Brief explanation
        else:
            reasoning_quality = "detailed"  # Full explanation

        # Determine format
        if has_boolean and has_reasoning:
            output_format = "boolean_with_reasoning"
        elif has_boolean and not has_reasoning:
            output_format = "boolean_only"
        elif not has_boolean and has_reasoning:
            output_format = "reasoning_only"
        else:
            output_format = "unknown"

        return {
            "has_boolean": has_boolean,
            "has_reasoning": has_reasoning,
            "reasoning_length": reasoning_length,
            "reasoning_quality": reasoning_quality,
            "format": output_format
        }

    def _extract_reasoning(self, output: str) -> str:
        """
        Extract reasoning/explanation from an output string.

        This is REQUIRED for LLM-as-judge feedback. The reasoning helps
        the judge understand why the result was true/false and compare
        predicted vs expected reasoning.

        Args:
            output: Full output string that may contain reasoning

        Returns:
            Extracted reasoning text, or empty string if not found
        """
        if not output:
            return ""

        # Patterns to find explicitly labeled reasoning sections
        reasoning_patterns = [
            r'[Rr]eason[:\s]+(.*?)(?:\n\n|\Z)',       # "Reason: ..."
            r'[Ee]xplanation[:\s]+(.*?)(?:\n\n|\Z)',  # "Explanation: ..."
            r'[Bb]ecause[:\s]+(.*?)(?:\n\n|\Z)',      # "Because: ..."
            r'[Ww]hy[:\s]+(.*?)(?:\n\n|\Z)',          # "Why: ..."
            r'[Dd]etails[:\s]+(.*?)(?:\n\n|\Z)',      # "Details: ..."
        ]

        # Try each pattern
        for pattern in reasoning_patterns:
            match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
            if match:
                reasoning = match.group(1).strip()
                if len(reasoning) > 20:  # Only return if substantial
                    return reasoning

        # If no explicit reasoning section, check if output has substantial text
        # after the boolean (likely contains reasoning)
        bool_match = re.search(r'\b(true|false)\b', output.lower())
        if bool_match:
            # Get text after the boolean
            bool_pos = bool_match.end()
            remaining = output[bool_pos:].strip()
            # If remaining text is substantial (more than just punctuation), use it
            if len(remaining) > 30:
                # Clean up common prefixes
                remaining = re.sub(r'^[:\s.,;!?-]+', '', remaining)
                if remaining:
                    return remaining

        # If output is long and doesn't start with boolean, might be all reasoning
        if len(output) > 100 and not re.match(r'^\s*(true|false)\s*$', output, re.IGNORECASE):
            # Return first 500 chars as reasoning
            return output[:500].strip()

        # No reasoning found
        return ""

    def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
        """
        Get summary statistics for a batch of evaluations.

        Args:
            results: List of evaluation result dictionaries (as returned by
                evaluate())

        Returns:
            Summary statistics including accuracy and true/false distribution
        """
        if not results:
            return {
                "total_samples": 0,
                "accuracy": 0.0,
                "correct_predictions": 0,
                "incorrect_predictions": 0,
                "true_predictions": 0,
                "false_predictions": 0
            }

        total = len(results)
        correct = sum(1 for r in results if r.get("output_match", 0.0) == 1.0)
        accuracy = correct / total if total > 0 else 0.0

        # Count true/false predictions ("is True"/"is False" excludes None)
        true_preds = sum(1 for r in results if r.get("predicted_boolean") is True)
        false_preds = sum(1 for r in results if r.get("predicted_boolean") is False)

        return {
            "total_samples": total,
            "accuracy": accuracy,
            "correct_predictions": correct,
            "incorrect_predictions": total - correct,
            "true_predictions": true_preds,
            "false_predictions": false_preds
        }
# Example usage and testing.
# NOTE(review): the captured source had its emoji characters mojibake'd;
# they are restored here to plausible originals — confirm against history.
if __name__ == "__main__":
    print("🧪 Testing Validation Evaluator...")

    evaluator = ValidationEvaluator()

    # Test cases: (predicted, expected, should_match)
    test_cases = [
        ("true", "true", True),
        ("false", "false", True),
        ("True", "true", True),
        ("FALSE", "false", True),
        ("1", "true", True),
        ("0", "false", True),
        ("true", "false", False),
        ("false", "true", False),
        ("The result is true because the button is visible", "true", True),
        ("The result is false because the element is not found", "false", True),
        ('{"result": true, "reasoning": "Button is visible"}', "true", True),
        ("Result: true\n\nReasoning: The submit button is clearly visible at the bottom of the screen.", "true", True),
        ("", "true", False),
        ("invalid", "true", False),
    ]

    print("\n📋 Running test cases:")
    print("-" * 80)

    results = []
    for predicted, expected, should_match in test_cases:
        result = evaluator.evaluate(predicted, expected)
        match = result["composite_score"] == 1.0
        # Status marks whether the evaluator agreed with the expectation
        status = "✅" if match == should_match else "❌"
        pred_bool = result.get("predicted_boolean", "?")
        exp_bool = result.get("expected_boolean", "?")
        pred_reason = result.get("predicted_reasoning", "")[:50]
        print(f"{status} Predicted: '{predicted[:40]}...' → {pred_bool}")
        print(f"   Expected: '{expected}' → {exp_bool}")
        print(f"   Match: {match} (should be {should_match})")
        if pred_reason:
            print(f"   Reasoning: {pred_reason}...")
        print()
        results.append(result)

    # Summary
    print("\n📊 Summary:")
    summary = evaluator.get_evaluation_summary(results)
    print(f"  Total: {summary['total_samples']}")
    print(f"  Correct: {summary['correct_predictions']}")
    print(f"  Accuracy: {summary['accuracy']:.1%}")
    print(f"  True predictions: {summary['true_predictions']}")
    print(f"  False predictions: {summary['false_predictions']}")