Spaces:
Sleeping
Sleeping
| """ | |
| Validation Evaluator for UI Validation Use Case | |
| Evaluates predicted validation results (true/false) against expected results. | |
| Extracts reasoning from both predicted and expected outputs for LLM-as-judge feedback. | |
| """ | |
| from typing import Dict, Any, Optional | |
| import re | |
| import logging | |
| try: | |
| from .base_evaluator import BaseEvaluator | |
| except ImportError: | |
| # For standalone testing | |
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).parent.parent.parent)) | |
| from gepa_optimizer.evaluation.base_evaluator import BaseEvaluator | |
class ValidationEvaluator(BaseEvaluator):
    """
    Evaluator for the UI-validation use case (true/false results).

    Features:
    - Normalizes boolean formats ("true"/"True"/"1" -> True, "false"/"False"/"0" -> False)
    - Extracts reasoning from both predicted and expected outputs (REQUIRED for LLM-as-judge)
    - Binary scoring: correct boolean = 1.0, wrong = 0.0
    - Returns reasoning in evaluation results for LLM-as-judge feedback

    NOTE(review): the captured source had all non-ASCII characters mojibake'd
    ("β", "π", ...). They have been restored to the most plausible originals
    (e.g. "✅"/"❌"/"→") — confirm against repository history before merging.
    """

    def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
        """
        Initialize validation evaluator.

        Args:
            metric_weights: Weights for evaluation metrics.
                Default: {"output_match": 1.0}
        """
        default_weights = {
            "output_match": 1.0  # Binary boolean comparison
        }
        weights = metric_weights or default_weights
        super().__init__(metric_weights=weights)

    def evaluate(self, predicted: str, expected: str) -> Dict[str, Any]:
        """
        Evaluate predicted validation result against expected result.

        Scoring Strategy:
        1. Normalize both predicted and expected to boolean
        2. Compare booleans (exact match required)
        3. Extract reasoning from both (for LLM-as-judge)
        4. Return 1.0 if match, 0.0 otherwise (binary scoring)

        Args:
            predicted: LLM's output (may include "true"/"false" + reasoning)
            expected: Expected output (should be "true" or "false", may include reasoning)

        Returns:
            Dictionary with evaluation metrics, extracted booleans, reasoning,
            and output-structure metadata. "composite_score" is the value GEPA
            consumes; the booleans may be None when extraction failed.
        """
        # Guard: empty/missing input short-circuits to a zero score.
        if not predicted or not expected:
            return {
                "output_match": 0.0,
                "composite_score": 0.0,
                "predicted_output": str(predicted).strip() if predicted else "",
                "expected_output": str(expected).strip() if expected else "",
                "predicted_boolean": None,
                "expected_boolean": None,
                "predicted_reasoning": "",
                "expected_reasoning": "",
                "evaluation_reason": "❌ Empty or missing input/output"
            }

        predicted_str = str(predicted).strip()
        expected_str = str(expected).strip()

        # 1. Extract boolean + reasoning from predicted output
        pred_bool = self._normalize_to_bool(predicted_str)
        pred_reasoning = self._extract_reasoning(predicted_str)

        # 2. Extract boolean + reasoning from expected output
        exp_bool = self._normalize_to_bool(expected_str)
        exp_reasoning = self._extract_reasoning(expected_str)

        # Detect output structure for both expected and predicted
        expected_structure = self._detect_output_structure(expected_str)
        predicted_structure = self._detect_output_structure(predicted_str)

        # Compare structures (format string equality only)
        structure_match = (expected_structure['format'] == predicted_structure['format'])

        # 3. Compare booleans (binary scoring)
        if pred_bool is None or exp_bool is None:
            # Could not extract boolean from one or both sides
            score = 0.0
            reason = "❌ Could not extract boolean value"
            if pred_bool is None:
                reason += " from predicted output"
            if exp_bool is None:
                reason += " from expected output"
        else:
            # Both booleans extracted successfully - compare
            score = 1.0 if pred_bool == exp_bool else 0.0
            if score == 1.0:
                reason = f"✅ Correct! Result matches (both are {exp_bool})"
                # Add a note when the result is right but the format differs
                if not structure_match:
                    reason += f" (but format differs: expected {expected_structure['format']}, got {predicted_structure['format']})"
            else:
                reason = f"❌ Wrong result (predicted: {pred_bool}, expected: {exp_bool})"

        # 4. Log evaluation details (truncated previews to keep logs readable)
        self.logger.info(f"\n{'─'*70}")
        self.logger.info(f"🔍 VALIDATION EVALUATION")
        self.logger.info(f"{'─'*70}")
        self.logger.info(f"  Expected:  '{expected_str[:100]}...' → {exp_bool}")
        self.logger.info(f"  Predicted: '{predicted_str[:100]}...' → {pred_bool}")
        self.logger.info(f"  {'─'*66}")
        self.logger.info(f"  🎯 SCORE: {score:.2f} - {reason}")
        if pred_reasoning:
            self.logger.info(f"  📝 Predicted Reasoning: {pred_reasoning[:150]}...")
        if exp_reasoning:
            self.logger.info(f"  📝 Expected Reasoning: {exp_reasoning[:150]}...")
        # Log structure comparison
        self.logger.info(f"  📋 Expected Format: {expected_structure['format']} (reasoning: {expected_structure['reasoning_quality']})")
        self.logger.info(f"  📋 Predicted Format: {predicted_structure['format']} (reasoning: {predicted_structure['reasoning_quality']})")
        if not structure_match:
            self.logger.warning(f"  ⚠️ OUTPUT STRUCTURE MISMATCH!")
        self.logger.info(f"{'─'*70}\n")

        return {
            "output_match": score,
            "composite_score": score,  # This is what GEPA uses
            "predicted_output": predicted_str,
            "expected_output": expected_str,
            "predicted_boolean": pred_bool,
            "expected_boolean": exp_bool,
            "predicted_reasoning": pred_reasoning,  # REQUIRED for LLM-as-judge
            "expected_reasoning": exp_reasoning,    # REQUIRED for LLM-as-judge
            "evaluation_reason": reason,
            # Structure metadata for LLM-as-judge
            "expected_structure": expected_structure,
            "predicted_structure": predicted_structure,
            "output_structure_match": structure_match,
            "expected_has_reasoning": expected_structure['has_reasoning'],
            "predicted_has_reasoning": predicted_structure['has_reasoning'],
            "reasoning_quality_gap": expected_structure['reasoning_quality'] + " → " + predicted_structure['reasoning_quality']
        }

    def _normalize_to_bool(self, value: str) -> Optional[bool]:
        """
        Normalize various formats to boolean.

        Handles:
        - "true", "True", "TRUE" -> True
        - "false", "False", "FALSE" -> False
        - "1", "0" -> True, False
        - "yes", "no" -> True, False
        - "correct", "incorrect" -> True, False
        - JSON: {"result": true} or {"action": "TRUE"} -> True
        - Text with boolean: "The result is true because..." -> True

        Args:
            value: String that may contain a boolean value

        Returns:
            Boolean value, or None if it cannot be determined
        """
        if not value:
            return None

        value_lower = value.lower().strip()

        # Direct boolean strings
        if value_lower in ("true", "1", "yes", "correct", "valid", "pass"):
            return True
        if value_lower in ("false", "0", "no", "incorrect", "invalid", "fail"):
            return False

        # JSON format: {"action": "TRUE"} or {"action": "FALSE"} or {"action": "LOADING"}
        # This handles the production prompt's JSON output format.
        # Match both quoted and unquoted values, case-insensitive.
        action_match = re.search(r'["\']?action["\']?\s*:\s*["\']?(true|false|loading)["\']?', value_lower)
        if action_match:
            action_value = action_match.group(1).lower()
            if action_value == "true":
                return True
            elif action_value == "false":
                return False
            elif action_value == "loading":
                # Treat LOADING as False for validation purposes (screen not ready)
                return False

        # Also try to parse a full JSON structure if present (more robust).
        # raw_decode() parses the first complete JSON value starting at the
        # brace and ignores trailing text — a single linear pass instead of
        # the previous O(n^2) try-every-truncation loop.
        try:
            import json
            json_start = value.find('{')
            if json_start != -1:
                try:
                    data, _ = json.JSONDecoder().raw_decode(value[json_start:])
                except (json.JSONDecodeError, ValueError):
                    data = None
                if isinstance(data, dict):
                    # Check for "action" field (production prompt format)
                    if "action" in data:
                        action_val = str(data["action"]).upper()
                        if action_val == "TRUE":
                            return True
                        elif action_val == "FALSE":
                            return False
                        elif action_val == "LOADING":
                            return False  # Treat as False
                    # Check for "result" field (alternative format)
                    if "result" in data:
                        result_val = data["result"]
                        if isinstance(result_val, bool):
                            return result_val
                        elif isinstance(result_val, str):
                            return result_val.lower() in ("true", "1", "yes")
        except Exception:
            pass  # Fall through to other extraction methods

        # JSON format: {"result": true} or {"result": false}
        json_match = re.search(r'["\']?result["\']?\s*:\s*(true|false)', value_lower)
        if json_match:
            return json_match.group(1) == "true"

        # Pattern: "result is true" or "result: true"
        pattern_match = re.search(r'result[:\s]+(true|false)', value_lower)
        if pattern_match:
            return pattern_match.group(1) == "true"

        # Pattern: "is true" or "is false" (standalone)
        is_match = re.search(r'\b(is|are)\s+(true|false)\b', value_lower)
        if is_match:
            return is_match.group(2) == "true"

        # Pattern: "true" or "false" as standalone word (not in other words)
        standalone_match = re.search(r'\b(true|false)\b', value_lower)
        if standalone_match:
            return standalone_match.group(1) == "true"

        # Last resort: whichever of "true"/"false" appears first in the text
        true_pos = value_lower.find("true")
        false_pos = value_lower.find("false")
        if true_pos != -1 and false_pos != -1:
            # Both found - use the one that appears first
            return true_pos < false_pos
        elif true_pos != -1:
            return True
        elif false_pos != -1:
            return False

        # Cannot determine
        return None

    def _detect_output_structure(self, output: str) -> Dict[str, Any]:
        """
        Dynamically detect the structure/components of the output.

        This detects:
        - Boolean result presence
        - Reasoning/explanation presence and quality
        - Output format (boolean only, boolean+reasoning, etc.)

        Args:
            output: Output string to analyze

        Returns:
            Dictionary with structure information:
            {
                "has_boolean": bool,
                "has_reasoning": bool,
                "reasoning_length": int,
                "reasoning_quality": str,  # "missing", "minimal", "adequate", "detailed"
                "format": str  # "boolean_only", "boolean_with_reasoning", ...
            }
        """
        if not output:
            return {
                "has_boolean": False,
                "has_reasoning": False,
                "reasoning_length": 0,
                "reasoning_quality": "missing",
                "format": "empty"
            }

        output_clean = output.strip()

        # Detect boolean
        has_boolean = self._normalize_to_bool(output_clean) is not None

        # Extract reasoning
        reasoning = self._extract_reasoning(output_clean)
        # NOTE: has_reasoning requires > 15 chars, but reasoning_quality below
        # labels 1-29 chars "minimal" — a 1-15 char string is "minimal" yet
        # has_reasoning=False. Preserved as-is; confirm this is intentional.
        has_reasoning = len(reasoning) > 15  # Minimum 15 chars to count as reasoning
        reasoning_length = len(reasoning)

        # Classify reasoning quality by length buckets
        if reasoning_length == 0:
            reasoning_quality = "missing"
        elif reasoning_length < 30:
            reasoning_quality = "minimal"   # Just a few words
        elif reasoning_length < 100:
            reasoning_quality = "adequate"  # Brief explanation
        else:
            reasoning_quality = "detailed"  # Full explanation

        # Determine format
        if has_boolean and has_reasoning:
            output_format = "boolean_with_reasoning"
        elif has_boolean and not has_reasoning:
            output_format = "boolean_only"
        elif not has_boolean and has_reasoning:
            output_format = "reasoning_only"
        else:
            output_format = "unknown"

        return {
            "has_boolean": has_boolean,
            "has_reasoning": has_reasoning,
            "reasoning_length": reasoning_length,
            "reasoning_quality": reasoning_quality,
            "format": output_format
        }

    def _extract_reasoning(self, output: str) -> str:
        """
        Extract reasoning/explanation from an output string.

        This is REQUIRED for LLM-as-judge feedback. The reasoning helps
        the judge understand why the result was true/false and compare
        predicted vs expected reasoning.

        Args:
            output: Full output string that may contain reasoning

        Returns:
            Extracted reasoning text, or empty string if not found
        """
        if not output:
            return ""

        # Patterns to find explicitly labeled reasoning sections
        reasoning_patterns = [
            r'[Rr]eason[:\s]+(.*?)(?:\n\n|\Z)',       # "Reason: ..."
            r'[Ee]xplanation[:\s]+(.*?)(?:\n\n|\Z)',  # "Explanation: ..."
            r'[Bb]ecause[:\s]+(.*?)(?:\n\n|\Z)',      # "Because: ..."
            r'[Ww]hy[:\s]+(.*?)(?:\n\n|\Z)',          # "Why: ..."
            r'[Dd]etails[:\s]+(.*?)(?:\n\n|\Z)',      # "Details: ..."
        ]

        # Try each pattern
        for pattern in reasoning_patterns:
            match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
            if match:
                reasoning = match.group(1).strip()
                if len(reasoning) > 20:  # Only return if substantial
                    return reasoning

        # If no explicit reasoning section, check if output has substantial text
        # after the boolean (likely contains reasoning)
        bool_match = re.search(r'\b(true|false)\b', output.lower())
        if bool_match:
            # Get text after the boolean
            bool_pos = bool_match.end()
            remaining = output[bool_pos:].strip()
            # If remaining text is substantial (more than just punctuation), use it
            if len(remaining) > 30:
                # Clean up common prefixes
                remaining = re.sub(r'^[:\s.,;!?-]+', '', remaining)
                if remaining:
                    return remaining

        # If output is long and doesn't start with boolean, might be all reasoning
        if len(output) > 100 and not re.match(r'^\s*(true|false)\s*$', output, re.IGNORECASE):
            # Return first 500 chars as reasoning
            return output[:500].strip()

        # No reasoning found
        return ""

    def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
        """
        Get summary statistics for a batch of evaluations.

        Args:
            results: List of evaluation result dictionaries (as returned by
                evaluate())

        Returns:
            Summary statistics including accuracy and true/false distribution
        """
        if not results:
            return {
                "total_samples": 0,
                "accuracy": 0.0,
                "correct_predictions": 0,
                "incorrect_predictions": 0,
                "true_predictions": 0,
                "false_predictions": 0
            }

        total = len(results)
        correct = sum(1 for r in results if r.get("output_match", 0.0) == 1.0)
        accuracy = correct / total if total > 0 else 0.0

        # Count true/false predictions ("is True"/"is False" excludes None)
        true_preds = sum(1 for r in results if r.get("predicted_boolean") is True)
        false_preds = sum(1 for r in results if r.get("predicted_boolean") is False)

        return {
            "total_samples": total,
            "accuracy": accuracy,
            "correct_predictions": correct,
            "incorrect_predictions": total - correct,
            "true_predictions": true_preds,
            "false_predictions": false_preds
        }
# Example usage and testing.
# NOTE(review): the captured source had its emoji characters mojibake'd;
# they are restored here to plausible originals — confirm against history.
if __name__ == "__main__":
    print("🧪 Testing Validation Evaluator...")

    evaluator = ValidationEvaluator()

    # Test cases: (predicted, expected, should_match)
    test_cases = [
        ("true", "true", True),
        ("false", "false", True),
        ("True", "true", True),
        ("FALSE", "false", True),
        ("1", "true", True),
        ("0", "false", True),
        ("true", "false", False),
        ("false", "true", False),
        ("The result is true because the button is visible", "true", True),
        ("The result is false because the element is not found", "false", True),
        ('{"result": true, "reasoning": "Button is visible"}', "true", True),
        ("Result: true\n\nReasoning: The submit button is clearly visible at the bottom of the screen.", "true", True),
        ("", "true", False),
        ("invalid", "true", False),
    ]

    print("\n📋 Running test cases:")
    print("-" * 80)

    results = []
    for predicted, expected, should_match in test_cases:
        result = evaluator.evaluate(predicted, expected)
        match = result["composite_score"] == 1.0
        # Status marks whether the evaluator agreed with the expectation
        status = "✅" if match == should_match else "❌"
        pred_bool = result.get("predicted_boolean", "?")
        exp_bool = result.get("expected_boolean", "?")
        pred_reason = result.get("predicted_reasoning", "")[:50]
        print(f"{status} Predicted: '{predicted[:40]}...' → {pred_bool}")
        print(f"   Expected: '{expected}' → {exp_bool}")
        print(f"   Match: {match} (should be {should_match})")
        if pred_reason:
            print(f"   Reasoning: {pred_reason}...")
        print()
        results.append(result)

    # Summary
    print("\n📊 Summary:")
    summary = evaluator.get_evaluation_summary(results)
    print(f"  Total: {summary['total_samples']}")
    print(f"  Correct: {summary['correct_predictions']}")
    print(f"  Accuracy: {summary['accuracy']:.1%}")
    print(f"  True predictions: {summary['true_predictions']}")
    print(f"  False predictions: {summary['false_predictions']}")