File size: 20,332 Bytes
cacd4d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
"""
Validation Evaluator for UI Validation Use Case

Evaluates predicted validation results (true/false) against expected results.
Extracts reasoning from both predicted and expected outputs for LLM-as-judge feedback.
"""

import json
import logging
import re
from typing import Any, Dict, Optional

try:
    from .base_evaluator import BaseEvaluator
except ImportError:
    # For standalone testing
    import sys
    from pathlib import Path
    sys.path.insert(0, str(Path(__file__).parent.parent.parent))
    from gepa_optimizer.evaluation.base_evaluator import BaseEvaluator


class ValidationEvaluator(BaseEvaluator):
    """
    Evaluator for validation use case (true/false results).
    
    Features:
    - Normalizes boolean formats ("true"/"True"/"1" β†’ True, "false"/"False"/"0" β†’ False)
    - Extracts reasoning from both predicted and expected outputs (REQUIRED for LLM-as-judge)
    - Binary scoring: correct boolean = 1.0, wrong = 0.0
    - Returns reasoning in evaluation results for LLM-as-judge feedback
    """
    
    def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
        """
        Initialize validation evaluator.
        
        Args:
            metric_weights: Weights for evaluation metrics
                          Default: {"output_match": 1.0}
        """
        default_weights = {
            "output_match": 1.0  # Binary boolean comparison
        }
        
        weights = metric_weights or default_weights
        super().__init__(metric_weights=weights)
    
    def evaluate(self, predicted: str, expected: str) -> Dict[str, float]:
        """
        Evaluate predicted validation result against expected result.
        
        Scoring Strategy:
        1. Normalize both predicted and expected to boolean
        2. Compare booleans (exact match required)
        3. Extract reasoning from both (for LLM-as-judge)
        4. Return 1.0 if match, 0.0 otherwise (binary scoring)
        
        Args:
            predicted: LLM's output (may include "true"/"false" + reasoning)
            expected: Expected output (should be "true" or "false", may include reasoning)
            
        Returns:
            Dictionary with evaluation metrics, extracted booleans, and reasoning:
            {
                "output_match": 1.0 or 0.0,
                "composite_score": 1.0 or 0.0,
                "predicted_output": str,
                "expected_output": str,
                "predicted_boolean": True/False,
                "expected_boolean": True/False,
                "predicted_reasoning": str,  # REQUIRED for LLM-as-judge
                "expected_reasoning": str,   # REQUIRED for LLM-as-judge
                "evaluation_reason": str,
                "expected_structure": dict,
                "predicted_structure": dict,
                "output_structure_match": bool,
                "expected_has_reasoning": bool,
                "predicted_has_reasoning": bool,
                "reasoning_quality_gap": str
            }
            The same set of keys is returned for empty inputs, so callers can
            index any key without special-casing the failure path.
        """
        if not predicted or not expected:
            # FIX: return the same schema as the success path (previously the
            # structure-metadata keys were missing here, which could raise
            # KeyError in downstream consumers on empty inputs).
            predicted_str = str(predicted).strip() if predicted else ""
            expected_str = str(expected).strip() if expected else ""
            predicted_structure = self._detect_output_structure(predicted_str)
            expected_structure = self._detect_output_structure(expected_str)
            return {
                "output_match": 0.0,
                "composite_score": 0.0,
                "predicted_output": predicted_str,
                "expected_output": expected_str,
                "predicted_boolean": None,
                "expected_boolean": None,
                "predicted_reasoning": "",
                "expected_reasoning": "",
                "evaluation_reason": "❌ Empty or missing input/output",
                "expected_structure": expected_structure,
                "predicted_structure": predicted_structure,
                "output_structure_match": (expected_structure['format'] == predicted_structure['format']),
                "expected_has_reasoning": expected_structure['has_reasoning'],
                "predicted_has_reasoning": predicted_structure['has_reasoning'],
                "reasoning_quality_gap": expected_structure['reasoning_quality'] + " β†’ " + predicted_structure['reasoning_quality']
            }
        
        predicted_str = str(predicted).strip()
        expected_str = str(expected).strip()
        
        # 1. Extract boolean from predicted output
        pred_bool = self._normalize_to_bool(predicted_str)
        pred_reasoning = self._extract_reasoning(predicted_str)
        
        # 2. Extract boolean from expected output
        exp_bool = self._normalize_to_bool(expected_str)
        exp_reasoning = self._extract_reasoning(expected_str)
        
        # Detect output structure for both expected and predicted
        expected_structure = self._detect_output_structure(expected_str)
        predicted_structure = self._detect_output_structure(predicted_str)
        
        # Compare structures
        structure_match = (expected_structure['format'] == predicted_structure['format'])
        
        # 3. Compare booleans (binary scoring)
        if pred_bool is None or exp_bool is None:
            # Could not extract boolean from one or both
            score = 0.0
            reason = "❌ Could not extract boolean value"
            if pred_bool is None:
                reason += " from predicted output"
            if exp_bool is None:
                reason += " from expected output"
        else:
            # Both booleans extracted successfully - compare
            score = 1.0 if pred_bool == exp_bool else 0.0
            if score == 1.0:
                reason = f"βœ… Correct! Result matches (both are {exp_bool})"
                # Add note if structure doesn't match
                if not structure_match:
                    reason += f" (but format differs: expected {expected_structure['format']}, got {predicted_structure['format']})"
            else:
                reason = f"❌ Wrong result (predicted: {pred_bool}, expected: {exp_bool})"
        
        # 4. Log evaluation details
        self.logger.info(f"\n{'─'*70}")
        self.logger.info(f"πŸ“Š VALIDATION EVALUATION")
        self.logger.info(f"{'─'*70}")
        self.logger.info(f"   Expected: '{expected_str[:100]}...' β†’ {exp_bool}")
        self.logger.info(f"   Predicted: '{predicted_str[:100]}...' β†’ {pred_bool}")
        self.logger.info(f"   {'─'*66}")
        self.logger.info(f"   🎯 SCORE: {score:.2f} - {reason}")
        if pred_reasoning:
            self.logger.info(f"   πŸ“ Predicted Reasoning: {pred_reasoning[:150]}...")
        if exp_reasoning:
            self.logger.info(f"   πŸ“ Expected Reasoning: {exp_reasoning[:150]}...")
        # Log structure comparison
        self.logger.info(f"   πŸ“ Expected Format: {expected_structure['format']} (reasoning: {expected_structure['reasoning_quality']})")
        self.logger.info(f"   πŸ“ Predicted Format: {predicted_structure['format']} (reasoning: {predicted_structure['reasoning_quality']})")
        if not structure_match:
            self.logger.warning(f"   ⚠️  OUTPUT STRUCTURE MISMATCH!")
        self.logger.info(f"{'─'*70}\n")
        
        return {
            "output_match": score,
            "composite_score": score,  # This is what GEPA uses
            "predicted_output": predicted_str,
            "expected_output": expected_str,
            "predicted_boolean": pred_bool,
            "expected_boolean": exp_bool,
            "predicted_reasoning": pred_reasoning,  # REQUIRED for LLM-as-judge
            "expected_reasoning": exp_reasoning,     # REQUIRED for LLM-as-judge
            "evaluation_reason": reason,
            # Structure metadata for LLM-as-judge
            "expected_structure": expected_structure,
            "predicted_structure": predicted_structure,
            "output_structure_match": structure_match,
            "expected_has_reasoning": expected_structure['has_reasoning'],
            "predicted_has_reasoning": predicted_structure['has_reasoning'],
            "reasoning_quality_gap": expected_structure['reasoning_quality'] + " β†’ " + predicted_structure['reasoning_quality']
        }
    
    def _normalize_to_bool(self, value: str) -> Optional[bool]:
        """
        Normalize various formats to boolean.
        
        Handles:
        - "true", "True", "TRUE" β†’ True
        - "false", "False", "FALSE" β†’ False
        - "1", "0" β†’ True, False
        - "yes", "no" β†’ True, False
        - "correct", "incorrect" β†’ True, False
        - JSON: {"result": true} β†’ True
        - Text with boolean: "The result is true because..." β†’ True
        
        Args:
            value: String that may contain a boolean value
            
        Returns:
            Boolean value or None if cannot be determined
        """
        if not value:
            return None
        
        value_lower = value.lower().strip()
        
        # Direct boolean strings
        if value_lower in ("true", "1", "yes", "correct", "valid", "pass"):
            return True
        if value_lower in ("false", "0", "no", "incorrect", "invalid", "fail"):
            return False
        
        # JSON format: {"action": "TRUE"} or {"action": "FALSE"} or {"action": "LOADING"}
        # This handles the production prompt's JSON output format
        # Match both quoted and unquoted values, case-insensitive
        action_match = re.search(r'["\']?action["\']?\s*:\s*["\']?(true|false|loading)["\']?', value_lower)
        if action_match:
            action_value = action_match.group(1).lower()
            if action_value == "true":
                return True
            elif action_value == "false":
                return False
            elif action_value == "loading":
                # Treat LOADING as False for validation purposes (screen not ready)
                return False
        
        # Also try to parse a full JSON structure if present (more robust).
        # FIX: json.JSONDecoder.raw_decode parses the first complete JSON value
        # starting at json_start and ignores trailing text in one O(n) pass,
        # replacing the previous O(n^2) truncate-and-retry loop.
        json_start = value.find('{')
        if json_start != -1:
            try:
                data, _ = json.JSONDecoder().raw_decode(value[json_start:])
            except json.JSONDecodeError:
                data = None  # Fall through to other extraction methods
            if isinstance(data, dict):
                # Check for "action" field (production prompt format)
                if "action" in data:
                    action_val = str(data["action"]).upper()
                    if action_val == "TRUE":
                        return True
                    elif action_val == "FALSE":
                        return False
                    elif action_val == "LOADING":
                        return False  # Treat as False
                # Check for "result" field (alternative format)
                if "result" in data:
                    result_val = data["result"]
                    if isinstance(result_val, bool):
                        return result_val
                    elif isinstance(result_val, str):
                        return result_val.lower() in ("true", "1", "yes")
        
        # JSON format: {"result": true} or {"result": false}
        json_match = re.search(r'["\']?result["\']?\s*:\s*(true|false)', value_lower)
        if json_match:
            return json_match.group(1) == "true"
        
        # Pattern: "result is true" or "result: true"
        pattern_match = re.search(r'result[:\s]+(true|false)', value_lower)
        if pattern_match:
            return pattern_match.group(1) == "true"
        
        # Pattern: "is true" or "is false" (standalone)
        is_match = re.search(r'\b(is|are)\s+(true|false)\b', value_lower)
        if is_match:
            return is_match.group(2) == "true"
        
        # Pattern: "true" or "false" as standalone word (not in other words)
        standalone_match = re.search(r'\b(true|false)\b', value_lower)
        if standalone_match:
            return standalone_match.group(1) == "true"
        
        # Last resort: check if "true" appears before "false" in text
        # (substring search, intentionally loose - everything stricter failed)
        true_pos = value_lower.find("true")
        false_pos = value_lower.find("false")
        
        if true_pos != -1 and false_pos != -1:
            # Both found - use the one that appears first
            return true_pos < false_pos
        elif true_pos != -1:
            return True
        elif false_pos != -1:
            return False
        
        # Cannot determine
        return None
    
    def _detect_output_structure(self, output: str) -> Dict[str, Any]:
        """
        Dynamically detect the structure/components of the output.
        
        This detects:
        - Boolean result presence
        - Reasoning/explanation presence and quality
        - Output format (boolean only, boolean+reasoning, etc.)
        
        Args:
            output: Output string to analyze
            
        Returns:
            Dictionary with structure information:
            {
                "has_boolean": bool,
                "has_reasoning": bool,
                "reasoning_length": int,
                "reasoning_quality": str,  # "missing", "minimal", "adequate", "detailed"
                "format": str  # "boolean_only", "boolean_with_reasoning", "unknown"
            }
        """
        if not output:
            return {
                "has_boolean": False,
                "has_reasoning": False,
                "reasoning_length": 0,
                "reasoning_quality": "missing",
                "format": "empty"
            }
        
        output_clean = output.strip()
        
        # Detect boolean
        has_boolean = self._normalize_to_bool(output_clean) is not None
        
        # Extract reasoning
        reasoning = self._extract_reasoning(output_clean)
        has_reasoning = len(reasoning) > 15  # Minimum 15 chars to count as reasoning
        reasoning_length = len(reasoning)
        
        # Classify reasoning quality
        # NOTE: quality is classified from raw length, so a 1-15 char snippet
        # is "minimal" even though has_reasoning is False.
        if reasoning_length == 0:
            reasoning_quality = "missing"
        elif reasoning_length < 30:
            reasoning_quality = "minimal"  # Just a few words
        elif reasoning_length < 100:
            reasoning_quality = "adequate"  # Brief explanation
        else:
            reasoning_quality = "detailed"  # Full explanation
        
        # Determine format
        if has_boolean and has_reasoning:
            output_format = "boolean_with_reasoning"
        elif has_boolean and not has_reasoning:
            output_format = "boolean_only"
        elif not has_boolean and has_reasoning:
            output_format = "reasoning_only"
        else:
            output_format = "unknown"
        
        return {
            "has_boolean": has_boolean,
            "has_reasoning": has_reasoning,
            "reasoning_length": reasoning_length,
            "reasoning_quality": reasoning_quality,
            "format": output_format
        }
    
    def _extract_reasoning(self, output: str) -> str:
        """
        Extract reasoning/explanation from output string.
        
        This is REQUIRED for LLM-as-judge feedback. The reasoning helps
        the judge understand why the result was true/false and compare
        predicted vs expected reasoning.
        
        Args:
            output: Full output string that may contain reasoning
            
        Returns:
            Extracted reasoning text, or empty string if not found
        """
        if not output:
            return ""
        
        # Patterns to find reasoning sections
        reasoning_patterns = [
            r'[Rr]eason[:\s]+(.*?)(?:\n\n|\Z)',  # "Reason: ..."
            r'[Ee]xplanation[:\s]+(.*?)(?:\n\n|\Z)',  # "Explanation: ..."
            r'[Bb]ecause[:\s]+(.*?)(?:\n\n|\Z)',  # "Because: ..."
            r'[Ww]hy[:\s]+(.*?)(?:\n\n|\Z)',  # "Why: ..."
            r'[Dd]etails[:\s]+(.*?)(?:\n\n|\Z)',  # "Details: ..."
        ]
        
        # Try each pattern
        for pattern in reasoning_patterns:
            match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
            if match:
                reasoning = match.group(1).strip()
                if len(reasoning) > 20:  # Only return if substantial
                    return reasoning
        
        # If no explicit reasoning section, check if output has substantial text
        # after boolean (likely contains reasoning)
        bool_match = re.search(r'\b(true|false)\b', output.lower())
        if bool_match:
            # Get text after the boolean
            bool_pos = bool_match.end()
            remaining = output[bool_pos:].strip()
            
            # If remaining text is substantial (more than just punctuation), use it
            if len(remaining) > 30:
                # Clean up common prefixes
                remaining = re.sub(r'^[:\s.,;!?-]+', '', remaining)
                if remaining:
                    return remaining
        
        # If output is long and doesn't start with boolean, might be all reasoning
        if len(output) > 100 and not re.match(r'^\s*(true|false)\s*$', output, re.IGNORECASE):
            # Return first 500 chars as reasoning
            return output[:500].strip()
        
        # No reasoning found
        return ""
    
    def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
        """
        Get summary statistics for a batch of evaluations.
        
        Args:
            results: List of evaluation result dictionaries
            
        Returns:
            Summary statistics including accuracy, true/false distribution
        """
        if not results:
            return {
                "total_samples": 0,
                "accuracy": 0.0,
                "correct_predictions": 0,
                "incorrect_predictions": 0,
                "true_predictions": 0,
                "false_predictions": 0
            }
        
        total = len(results)
        correct = sum(1 for r in results if r.get("output_match", 0.0) == 1.0)
        accuracy = correct / total if total > 0 else 0.0
        
        # Count true/false predictions (identity check: None must not count)
        true_preds = sum(1 for r in results if r.get("predicted_boolean") is True)
        false_preds = sum(1 for r in results if r.get("predicted_boolean") is False)
        
        return {
            "total_samples": total,
            "accuracy": accuracy,
            "correct_predictions": correct,
            "incorrect_predictions": total - correct,
            "true_predictions": true_preds,
            "false_predictions": false_preds
        }


# Example usage and testing
if __name__ == "__main__":
    print("πŸš€ Testing Validation Evaluator...")
    
    validator = ValidationEvaluator()
    
    # Each tuple is (predicted, expected, should_match)
    cases = [
        ("true", "true", True),
        ("false", "false", True),
        ("True", "true", True),
        ("FALSE", "false", True),
        ("1", "true", True),
        ("0", "false", True),
        ("true", "false", False),
        ("false", "true", False),
        ("The result is true because the button is visible", "true", True),
        ("The result is false because the element is not found", "false", True),
        ('{"result": true, "reasoning": "Button is visible"}', "true", True),
        ("Result: true\n\nReasoning: The submit button is clearly visible at the bottom of the screen.", "true", True),
        ("", "true", False),
        ("invalid", "true", False),
    ]
    
    print("\nπŸ“ Running test cases:")
    print("-" * 80)
    
    outcomes = []
    for pred_text, exp_text, want_match in cases:
        outcome = validator.evaluate(pred_text, exp_text)
        outcomes.append(outcome)
        
        got_match = outcome["composite_score"] == 1.0
        marker = "βœ…" if got_match == want_match else "❌"
        pred_flag = outcome.get("predicted_boolean", "?")
        exp_flag = outcome.get("expected_boolean", "?")
        reasoning_snippet = outcome.get("predicted_reasoning", "")[:50]
        
        print(f"{marker} Predicted: '{pred_text[:40]}...' β†’ {pred_flag}")
        print(f"   Expected: '{exp_text}' β†’ {exp_flag}")
        print(f"   Match: {got_match} (should be {want_match})")
        if reasoning_snippet:
            print(f"   Reasoning: {reasoning_snippet}...")
        print()
    
    # Aggregate statistics across all cases
    print("\nπŸ“Š Summary:")
    stats = validator.get_evaluation_summary(outcomes)
    print(f"   Total: {stats['total_samples']}")
    print(f"   Correct: {stats['correct_predictions']}")
    print(f"   Accuracy: {stats['accuracy']:.1%}")
    print(f"   True predictions: {stats['true_predictions']}")
    print(f"   False predictions: {stats['false_predictions']}")