Spaces:
Sleeping
Sleeping
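"""
Element-number match evaluator.

Compares the element number extracted from the predicted output against the
element number extracted from the expected output and scores the pair as a
binary match (1.0 or 0.0). Outputs may be a bare number ("4"), a labelled
number ("Element: 4") or a longer sentence that contains the number.
When the expected output contains no number at all, the two outputs are
compared as case-insensitive strings instead.

GEPA is expected to discover the correct output format through evolution
and feedback.
"""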
| from typing import Dict, Any | |
| try: | |
| from .base_evaluator import BaseEvaluator | |
| except ImportError: | |
| # For standalone testing | |
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).parent.parent.parent)) | |
| from gepa_optimizer.evaluation.base_evaluator import BaseEvaluator | |
class ScrollElementEvaluator(BaseEvaluator):
    """
    Binary evaluator that compares the element number found in two outputs.

    Each output is searched for an element number, first in the explicit
    "Element: N" / "element N" form and otherwise as the first standalone
    number in the text. The two numbers are compared as integers and the
    pair scores 1.0 on a match and 0.0 otherwise (no partial credit).

    When the expected output contains no number at all, the evaluator falls
    back to a case-insensitive comparison of the two stripped strings.
    """

    # Explicit form, matched case-insensitively: "Element: 4", "element 4".
    # Any text matched by the stricter r'\belement\s+(\d+)\b' form is also
    # matched by this pattern, so a separate second strategy is not needed.
    _ELEMENT_PATTERN = r'element[:\s]+(\d+)'
    # Last resort: the first standalone run of digits anywhere in the text.
    _NUMBER_PATTERN = r'\b(\d+)\b'

    def __init__(self, metric_weights: Dict[str, float] = None):
        """
        Initialize the evaluator.

        Args:
            metric_weights: Weights for evaluation metrics, forwarded to
                BaseEvaluator. A falsy value (None or an empty dict) selects
                the default {"output_match": 1.0}.
        """
        default_weights = {
            "output_match": 1.0,
        }
        weights = metric_weights or default_weights
        super().__init__(metric_weights=weights)

    @classmethod
    def _find_element_match(cls, text: str):
        """Return the regex match holding the element number in *text*, or None."""
        import re

        found = re.search(cls._ELEMENT_PATTERN, text, re.IGNORECASE)
        if found is None:
            # No labelled element; take the first number in the text.
            found = re.search(cls._NUMBER_PATTERN, text)
        return found

    def evaluate(self, predicted: str, expected: str) -> Dict[str, float]:
        """
        Score a prediction against the expected output (1.0 or 0.0).

        Scoring:
            1. Both inputs are converted with str() and stripped. If either is
               None or blank after stripping, the score is 0.0.
            2. An element number is extracted from each text.
            3. If the expected text has no number, the stripped strings are
               compared case-insensitively.
            4. If only the predicted text has no number, the score is 0.0.
            5. Otherwise the two numbers are compared as integers, so "04"
               and "4" are treated as the same element.

        Args:
            predicted: Model output; may include a verbose explanation.
            expected: Reference output; may include a verbose explanation.

        Returns:
            Dictionary with the numeric scores "content_match", "output_match"
            and "composite_score" (all equal), plus the string fields
            "predicted_output", "expected_output", "predicted_element",
            "expected_element" (the extracted digits, or "None") and
            "evaluation_reason".
        """
        import logging

        logger = logging.getLogger(__name__)

        # Normalise BEFORE testing for emptiness: a whitespace-only pair must
        # not reach the exact-match fallback (where "" == "" would score 1.0),
        # and a non-string id such as the integer 0 must not count as missing.
        predicted_str = "" if predicted is None else str(predicted).strip()
        expected_str = "" if expected is None else str(expected).strip()

        if not predicted_str or not expected_str:
            return {
                "content_match": 0.0,
                "output_match": 0.0,
                "composite_score": 0.0,
                "predicted_output": predicted_str,
                "expected_output": expected_str,
                "predicted_element": "None",
                "expected_element": "None",
                "evaluation_reason": "β Empty or missing input/output"
            }

        pred_match = self._find_element_match(predicted_str)
        exp_match = self._find_element_match(expected_str)
        pred_element = pred_match.group(1) if pred_match else "None"
        exp_element = exp_match.group(1) if exp_match else "None"

        if exp_match is None:
            # Expected has no number at all: fall back to exact text match.
            same_text = predicted_str.lower() == expected_str.lower()
            content_score = 1.0 if same_text else 0.0
        elif pred_match is None:
            # A number was expected but the prediction contains none.
            content_score = 0.0
        else:
            try:
                pred_num = int(pred_element)
                exp_num = int(exp_element)
            except (ValueError, TypeError) as e:
                # int() can still refuse a digit run (for example one longer
                # than the interpreter's integer-string limit); compare the
                # raw digit strings in that case.
                logger.warning(
                    "Could not convert elements to integers: %s, using string comparison",
                    e,
                )
                content_score = 1.0 if pred_element == exp_element else 0.0
            else:
                content_score = 1.0 if pred_num == exp_num else 0.0
                if pred_num != exp_num:
                    logger.debug(
                        "Element mismatch: predicted=%s, expected=%s",
                        pred_num,
                        exp_num,
                    )

        # Scoring is binary, so the composite score equals the content score.
        composite_score = content_score
        if content_score == 1.0:
            reason = "β Correct! Element number matches"
        elif pred_match and exp_match:
            reason = "β Wrong element number (predicted different element)"
        else:
            reason = "β Missing or invalid element number"

        # Detailed logging for transparency
        logger.info("\n%s", 'β' * 70)
        logger.info("π EVALUATION DETAILS")
        logger.info("%s", 'β' * 70)
        logger.info(" Expected: '%s' (Element: %s)", expected_str, exp_element)
        logger.info(" Predicted: '%s' (Element: %s)", predicted_str, pred_element)
        logger.info(" %s", 'β' * 66)
        logger.info(" π― SCORE: %.2f - %s", composite_score, reason)
        logger.info("%s\n", 'β' * 70)

        return {
            "content_match": content_score,
            "output_match": composite_score,  # This is what GEPA uses
            "composite_score": composite_score,
            "predicted_output": predicted_str,
            "expected_output": expected_str,
            "predicted_element": pred_element,
            "expected_element": exp_element,
            "evaluation_reason": reason
        }

    def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
        """
        Get summary statistics for a batch of evaluations.

        Args:
            results: List of evaluation result dictionaries. A result counts
                as correct when its "output_match" value equals 1.0; a result
                without that key counts as incorrect.

        Returns:
            Dictionary with "total_samples", "accuracy",
            "correct_predictions" and "incorrect_predictions". The same keys
            are returned for an empty or missing batch, with all values zero.
        """
        if not results:
            return {
                "total_samples": 0,
                "accuracy": 0.0,
                "correct_predictions": 0,
                "incorrect_predictions": 0
            }

        total = len(results)
        correct = sum(1 for r in results if r.get("output_match", 0.0) == 1.0)
        return {
            "total_samples": total,
            "accuracy": correct / total,
            "correct_predictions": correct,
            "incorrect_predictions": total - correct
        }
# Demo / smoke test: runs a fixed set of cases and prints the outcome of each.
if __name__ == "__main__":
    print("π Testing Scroll Element Evaluator...")
    scorer = ScrollElementEvaluator()

    # Each scenario is (predicted, expected, should_match).
    scenarios = [
        ("4", "4", True),
        ("Element: 4", "4", True),
        ("Element 4", "4", True),
        ("The element to interact with is 4", "4", True),
        ("Element ID: 4", "4", True),
        ("Click on element 4 to scroll", "4", True),
        ("5", "4", False),
        ("Element: 5", "4", False),
        ("No element found", "4", False),
        ("", "4", False),
    ]

    print("\nπ Running test cases:")
    print("-" * 80)

    collected = []
    for candidate, target, want_match in scenarios:
        outcome = scorer.evaluate(candidate, target)
        got_match = outcome["composite_score"] == 1.0
        # The marker shows whether the evaluator agreed with the expectation.
        if got_match == want_match:
            marker = "β "
        else:
            marker = "β"
        print(f"{marker} Predicted: '{candidate}' | Expected: '{target}' | Match: {got_match}")
        collected.append(outcome)

    # Aggregate statistics over every evaluated scenario.
    print("\nπ Summary:")
    stats = scorer.get_evaluation_summary(collected)
    print(f" Total: {stats['total_samples']}")
    print(f" Correct: {stats['correct_predictions']}")
    print(f" Accuracy: {stats['accuracy']:.1%}")