# Deployed to HF Spaces — "Deploy Universal Prompt Optimizer to HF Spaces (clean)", commit cacd4d0
"""
Validation Evaluator for UI Validation Use Case
Evaluates predicted validation results (true/false) against expected results.
Extracts reasoning from both predicted and expected outputs for LLM-as-judge feedback.
"""
import json
import logging
import re
from typing import Any, Dict, Optional
try:
from .base_evaluator import BaseEvaluator
except ImportError:
# For standalone testing
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from gepa_optimizer.evaluation.base_evaluator import BaseEvaluator
class ValidationEvaluator(BaseEvaluator):
    """
    Evaluator for validation use case (true/false results).

    Features:
    - Normalizes boolean formats ("true"/"True"/"1" → True, "false"/"False"/"0" → False)
    - Extracts reasoning from both predicted and expected outputs (REQUIRED for LLM-as-judge)
    - Binary scoring: correct boolean = 1.0, wrong = 0.0
    - Returns reasoning in evaluation results for LLM-as-judge feedback
    """

    def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
        """
        Initialize validation evaluator.

        Args:
            metric_weights: Weights for evaluation metrics.
                Default: {"output_match": 1.0}
        """
        default_weights = {
            "output_match": 1.0  # Binary boolean comparison
        }
        weights = metric_weights or default_weights
        super().__init__(metric_weights=weights)

    def evaluate(self, predicted: str, expected: str) -> Dict[str, Any]:
        """
        Evaluate predicted validation result against expected result.

        Scoring Strategy:
        1. Normalize both predicted and expected to boolean
        2. Compare booleans (exact match required)
        3. Extract reasoning from both (for LLM-as-judge)
        4. Return 1.0 if match, 0.0 otherwise (binary scoring)

        Args:
            predicted: LLM's output (may include "true"/"false" + reasoning)
            expected: Expected output (should be "true" or "false", may include reasoning)

        Returns:
            Dictionary with evaluation metrics, extracted booleans, and reasoning:
            {
                "output_match": 1.0 or 0.0,
                "composite_score": 1.0 or 0.0,      # what GEPA consumes
                "predicted_output": str,
                "expected_output": str,
                "predicted_boolean": True/False/None,
                "expected_boolean": True/False/None,
                "predicted_reasoning": str,          # REQUIRED for LLM-as-judge
                "expected_reasoning": str,           # REQUIRED for LLM-as-judge
                "evaluation_reason": str,
                # structure metadata (absent on the empty-input early return):
                "expected_structure": dict,
                "predicted_structure": dict,
                "output_structure_match": bool,
                "expected_has_reasoning": bool,
                "predicted_has_reasoning": bool,
                "reasoning_quality_gap": str
            }
        """
        if not predicted or not expected:
            # NOTE: this early return omits the structure-metadata keys;
            # downstream consumers must treat those keys as optional.
            return {
                "output_match": 0.0,
                "composite_score": 0.0,
                "predicted_output": str(predicted).strip() if predicted else "",
                "expected_output": str(expected).strip() if expected else "",
                "predicted_boolean": None,
                "expected_boolean": None,
                "predicted_reasoning": "",
                "expected_reasoning": "",
                "evaluation_reason": "❌ Empty or missing input/output"
            }

        predicted_str = str(predicted).strip()
        expected_str = str(expected).strip()

        # 1. Extract boolean + reasoning from predicted output
        pred_bool = self._normalize_to_bool(predicted_str)
        pred_reasoning = self._extract_reasoning(predicted_str)

        # 2. Extract boolean + reasoning from expected output
        exp_bool = self._normalize_to_bool(expected_str)
        exp_reasoning = self._extract_reasoning(expected_str)

        # Detect output structure for both expected and predicted
        expected_structure = self._detect_output_structure(expected_str)
        predicted_structure = self._detect_output_structure(predicted_str)
        structure_match = (expected_structure['format'] == predicted_structure['format'])

        # 3. Compare booleans (binary scoring)
        if pred_bool is None or exp_bool is None:
            # Could not extract boolean from one or both
            score = 0.0
            reason = "❌ Could not extract boolean value"
            if pred_bool is None:
                reason += " from predicted output"
            if exp_bool is None:
                reason += " from expected output"
        else:
            # Both booleans extracted successfully - compare
            score = 1.0 if pred_bool == exp_bool else 0.0
            if score == 1.0:
                reason = f"✅ Correct! Result matches (both are {exp_bool})"
                # Note a format divergence even when the boolean is right
                if not structure_match:
                    reason += f" (but format differs: expected {expected_structure['format']}, got {predicted_structure['format']})"
            else:
                reason = f"❌ Wrong result (predicted: {pred_bool}, expected: {exp_bool})"

        # 4. Log evaluation details
        self.logger.info(f"\n{'─'*70}")
        self.logger.info("📊 VALIDATION EVALUATION")
        self.logger.info(f"{'─'*70}")
        self.logger.info(f"  Expected:  '{expected_str[:100]}...' → {exp_bool}")
        self.logger.info(f"  Predicted: '{predicted_str[:100]}...' → {pred_bool}")
        self.logger.info(f"  {'─'*66}")
        self.logger.info(f"  🎯 SCORE: {score:.2f} - {reason}")
        if pred_reasoning:
            self.logger.info(f"  📝 Predicted Reasoning: {pred_reasoning[:150]}...")
        if exp_reasoning:
            self.logger.info(f"  📝 Expected Reasoning: {exp_reasoning[:150]}...")
        # Log structure comparison
        self.logger.info(f"  📐 Expected Format: {expected_structure['format']} (reasoning: {expected_structure['reasoning_quality']})")
        self.logger.info(f"  📐 Predicted Format: {predicted_structure['format']} (reasoning: {predicted_structure['reasoning_quality']})")
        if not structure_match:
            self.logger.warning("  ⚠️ OUTPUT STRUCTURE MISMATCH!")
        self.logger.info(f"{'─'*70}\n")

        return {
            "output_match": score,
            "composite_score": score,  # This is what GEPA uses
            "predicted_output": predicted_str,
            "expected_output": expected_str,
            "predicted_boolean": pred_bool,
            "expected_boolean": exp_bool,
            "predicted_reasoning": pred_reasoning,  # REQUIRED for LLM-as-judge
            "expected_reasoning": exp_reasoning,  # REQUIRED for LLM-as-judge
            "evaluation_reason": reason,
            # Structure metadata for LLM-as-judge
            "expected_structure": expected_structure,
            "predicted_structure": predicted_structure,
            "output_structure_match": structure_match,
            "expected_has_reasoning": expected_structure['has_reasoning'],
            "predicted_has_reasoning": predicted_structure['has_reasoning'],
            "reasoning_quality_gap": expected_structure['reasoning_quality'] + " → " + predicted_structure['reasoning_quality']
        }

    def _normalize_to_bool(self, value: str) -> Optional[bool]:
        """
        Normalize various formats to boolean.

        Handles:
        - "true", "True", "TRUE" → True
        - "false", "False", "FALSE" → False
        - "1", "0" → True, False
        - "yes", "no" → True, False
        - "correct", "incorrect" → True, False
        - JSON: {"result": true} or {"action": "TRUE"} → True
        - Text with boolean: "The result is true because..." → True

        Args:
            value: String that may contain a boolean value

        Returns:
            Boolean value, or None if it cannot be determined
        """
        if not value:
            return None

        value_lower = value.lower().strip()

        # Direct boolean strings
        if value_lower in ("true", "1", "yes", "correct", "valid", "pass"):
            return True
        if value_lower in ("false", "0", "no", "incorrect", "invalid", "fail"):
            return False

        # JSON format: {"action": "TRUE"} / {"action": "FALSE"} / {"action": "LOADING"}
        # (production prompt's JSON output format); matches quoted and unquoted
        # values, case-insensitively because the haystack is already lowercased.
        action_match = re.search(r'["\']?action["\']?\s*:\s*["\']?(true|false|loading)["\']?', value_lower)
        if action_match:
            # LOADING means the screen is not ready → treat as False
            return action_match.group(1) == "true"

        # More robust path: parse the first JSON object embedded in the text.
        # raw_decode parses the leading JSON value in a single pass and ignores
        # trailing text (the previous implementation retried json.loads on every
        # suffix truncation, which was O(n^2) in the response length).
        json_start = value.find('{')
        if json_start != -1:
            data = None
            try:
                data, _ = json.JSONDecoder().raw_decode(value, json_start)
            except (json.JSONDecodeError, ValueError):
                pass  # Fall through to other extraction methods
            if isinstance(data, dict):
                # Check for "action" field (production prompt format)
                if "action" in data:
                    action_val = str(data["action"]).upper()
                    if action_val == "TRUE":
                        return True
                    if action_val in ("FALSE", "LOADING"):
                        return False  # LOADING treated as False
                # Check for "result" field (alternative format)
                if "result" in data:
                    result_val = data["result"]
                    if isinstance(result_val, bool):
                        return result_val
                    if isinstance(result_val, str):
                        return result_val.lower() in ("true", "1", "yes")

        # JSON-ish text: {"result": true} or {"result": false}
        json_match = re.search(r'["\']?result["\']?\s*:\s*(true|false)', value_lower)
        if json_match:
            return json_match.group(1) == "true"

        # Pattern: "result is true" or "result: true"
        pattern_match = re.search(r'result[:\s]+(true|false)', value_lower)
        if pattern_match:
            return pattern_match.group(1) == "true"

        # Pattern: "is true" or "are false" (standalone)
        is_match = re.search(r'\b(is|are)\s+(true|false)\b', value_lower)
        if is_match:
            return is_match.group(2) == "true"

        # Pattern: "true" or "false" as a standalone word (not inside other words)
        standalone_match = re.search(r'\b(true|false)\b', value_lower)
        if standalone_match:
            return standalone_match.group(1) == "true"

        # Last resort: whichever of "true"/"false" appears first in the text
        true_pos = value_lower.find("true")
        false_pos = value_lower.find("false")
        if true_pos != -1 and false_pos != -1:
            return true_pos < false_pos
        elif true_pos != -1:
            return True
        elif false_pos != -1:
            return False

        # Cannot determine
        return None

    def _detect_output_structure(self, output: str) -> Dict[str, Any]:
        """
        Dynamically detect the structure/components of the output.

        Detects:
        - Boolean result presence
        - Reasoning/explanation presence and quality
        - Output format (boolean only, boolean+reasoning, etc.)

        Args:
            output: Output string to analyze

        Returns:
            Dictionary with structure information:
            {
                "has_boolean": bool,
                "has_reasoning": bool,
                "reasoning_length": int,
                "reasoning_quality": str,  # "missing", "minimal", "adequate", "detailed"
                "format": str  # "boolean_only", "boolean_with_reasoning", "reasoning_only", "unknown", "empty"
            }
        """
        if not output:
            return {
                "has_boolean": False,
                "has_reasoning": False,
                "reasoning_length": 0,
                "reasoning_quality": "missing",
                "format": "empty"
            }

        output_clean = output.strip()

        # Detect boolean
        has_boolean = self._normalize_to_bool(output_clean) is not None

        # Extract reasoning
        reasoning = self._extract_reasoning(output_clean)
        has_reasoning = len(reasoning) > 15  # Minimum 15 chars to count as reasoning
        reasoning_length = len(reasoning)

        # Classify reasoning quality
        if reasoning_length == 0:
            reasoning_quality = "missing"
        elif reasoning_length < 30:
            reasoning_quality = "minimal"  # Just a few words
        elif reasoning_length < 100:
            reasoning_quality = "adequate"  # Brief explanation
        else:
            reasoning_quality = "detailed"  # Full explanation

        # Determine format
        if has_boolean and has_reasoning:
            output_format = "boolean_with_reasoning"
        elif has_boolean and not has_reasoning:
            output_format = "boolean_only"
        elif not has_boolean and has_reasoning:
            output_format = "reasoning_only"
        else:
            output_format = "unknown"

        return {
            "has_boolean": has_boolean,
            "has_reasoning": has_reasoning,
            "reasoning_length": reasoning_length,
            "reasoning_quality": reasoning_quality,
            "format": output_format
        }

    def _extract_reasoning(self, output: str) -> str:
        """
        Extract reasoning/explanation from output string.

        This is REQUIRED for LLM-as-judge feedback. The reasoning helps the
        judge understand why the result was true/false and compare predicted
        vs expected reasoning.

        Args:
            output: Full output string that may contain reasoning

        Returns:
            Extracted reasoning text, or empty string if not found
        """
        if not output:
            return ""

        # Patterns to find explicit reasoning sections
        reasoning_patterns = [
            r'[Rr]eason[:\s]+(.*?)(?:\n\n|\Z)',       # "Reason: ..."
            r'[Ee]xplanation[:\s]+(.*?)(?:\n\n|\Z)',  # "Explanation: ..."
            r'[Bb]ecause[:\s]+(.*?)(?:\n\n|\Z)',      # "Because: ..."
            r'[Ww]hy[:\s]+(.*?)(?:\n\n|\Z)',          # "Why: ..."
            r'[Dd]etails[:\s]+(.*?)(?:\n\n|\Z)',      # "Details: ..."
        ]

        # Try each pattern
        for pattern in reasoning_patterns:
            match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
            if match:
                reasoning = match.group(1).strip()
                if len(reasoning) > 20:  # Only return if substantial
                    return reasoning

        # No explicit reasoning section: substantial text after the boolean
        # likely contains reasoning.
        bool_match = re.search(r'\b(true|false)\b', output, re.IGNORECASE)
        if bool_match:
            remaining = output[bool_match.end():].strip()
            # If remaining text is substantial (more than just punctuation), use it
            if len(remaining) > 30:
                # Clean up common prefixes
                remaining = re.sub(r'^[:\s.,;!?-]+', '', remaining)
                if remaining:
                    return remaining

        # If output is long and isn't just a bare boolean, it might be all reasoning
        if len(output) > 100 and not re.match(r'^\s*(true|false)\s*$', output, re.IGNORECASE):
            # Return first 500 chars as reasoning
            return output[:500].strip()

        # No reasoning found
        return ""

    def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
        """
        Get summary statistics for a batch of evaluations.

        Args:
            results: List of evaluation result dictionaries (as produced by evaluate())

        Returns:
            Summary statistics including accuracy and true/false distribution
        """
        if not results:
            return {
                "total_samples": 0,
                "accuracy": 0.0,
                "correct_predictions": 0,
                "incorrect_predictions": 0,
                "true_predictions": 0,
                "false_predictions": 0
            }

        total = len(results)
        correct = sum(1 for r in results if r.get("output_match", 0.0) == 1.0)
        accuracy = correct / total if total > 0 else 0.0

        # Count true/false predictions (None/missing booleans count in neither)
        true_preds = sum(1 for r in results if r.get("predicted_boolean") is True)
        false_preds = sum(1 for r in results if r.get("predicted_boolean") is False)

        return {
            "total_samples": total,
            "accuracy": accuracy,
            "correct_predictions": correct,
            "incorrect_predictions": total - correct,
            "true_predictions": true_preds,
            "false_predictions": false_preds
        }
# Example usage and testing
if __name__ == "__main__":
    # Smoke-test the evaluator with representative inputs.
    print("🚀 Testing Validation Evaluator...")
    evaluator = ValidationEvaluator()

    # Test cases: (predicted, expected, should_match)
    test_cases = [
        ("true", "true", True),
        ("false", "false", True),
        ("True", "true", True),
        ("FALSE", "false", True),
        ("1", "true", True),
        ("0", "false", True),
        ("true", "false", False),
        ("false", "true", False),
        ("The result is true because the button is visible", "true", True),
        ("The result is false because the element is not found", "false", True),
        ('{"result": true, "reasoning": "Button is visible"}', "true", True),
        ("Result: true\n\nReasoning: The submit button is clearly visible at the bottom of the screen.", "true", True),
        ("", "true", False),
        ("invalid", "true", False),
    ]

    print("\n📝 Running test cases:")
    print("-" * 80)

    results = []
    for predicted, expected, should_match in test_cases:
        result = evaluator.evaluate(predicted, expected)
        match = result["composite_score"] == 1.0
        # ✅ when the evaluator agreed with the expectation, ❌ otherwise
        status = "✅" if match == should_match else "❌"
        pred_bool = result.get("predicted_boolean", "?")
        exp_bool = result.get("expected_boolean", "?")
        pred_reason = result.get("predicted_reasoning", "")[:50]
        print(f"{status} Predicted: '{predicted[:40]}...' → {pred_bool}")
        print(f"   Expected: '{expected}' → {exp_bool}")
        print(f"   Match: {match} (should be {should_match})")
        if pred_reason:
            print(f"   Reasoning: {pred_reason}...")
        print()
        results.append(result)

    # Summary
    print("\n📊 Summary:")
    summary = evaluator.get_evaluation_summary(results)
    print(f"   Total: {summary['total_samples']}")
    print(f"   Correct: {summary['correct_predictions']}")
    print(f"   Accuracy: {summary['accuracy']:.1%}")
    print(f"   True predictions: {summary['true_predictions']}")
    print(f"   False predictions: {summary['false_predictions']}")