File size: 7,089 Bytes
cacd4d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
"""
Data validation utilities for GEPA optimizer
"""

from typing import List, Dict, Any, Optional, Tuple
import logging

logger = logging.getLogger(__name__)

class DataValidator:
    """
    Validates datasets for completeness and GEPA compatibility.

    Provides checks for raw datasets (lists of dicts carrying 'input' /
    'output' keys), GEPA-formatted data, train/validation splits, and a
    summary-statistics helper.
    """

    def __init__(self):
        # Field names a raw item must / may carry. Kept as instance
        # attributes so subclasses can override the expected schema.
        self.required_fields = ['input', 'output']
        self.optional_fields = ['metadata', 'id', 'tags']

    def validate_dataset(self, dataset: List[Dict[str, Any]]) -> Tuple[bool, List[str]]:
        """
        Validate an entire dataset.

        Args:
            dataset: List of data items to validate

        Returns:
            Tuple[bool, List[str]]: (is_valid, list_of_errors)
        """
        errors = []

        # The type check must run before the emptiness check: a truthy
        # non-list (a dict, or a string that would be iterated char by
        # char below) would otherwise slip past the first guard.
        if not isinstance(dataset, list):
            errors.append("Dataset must be a list")
            return False, errors

        if not dataset:
            errors.append("Dataset is empty")
            return False, errors

        # Validate each item, tagging errors with the item index.
        for idx, item in enumerate(dataset):
            errors.extend(self.validate_item(item, idx))

        # At least 2 items are needed so a train/val split leaves both
        # sides non-empty.
        if len(dataset) < 2:
            errors.append("Dataset should have at least 2 items for proper train/val split")

        # Log validation results; lazy %-args avoid building the message
        # when the level is disabled.
        if errors:
            logger.warning("Dataset validation failed with %d errors", len(errors))
        else:
            logger.info("Dataset validation passed for %d items", len(dataset))

        return len(errors) == 0, errors

    def validate_item(self, item: Dict[str, Any], index: Optional[int] = None) -> List[str]:
        """
        Validate a single dataset item.

        Args:
            item: Single data item to validate
            index: Optional item index for error reporting

        Returns:
            List[str]: List of validation errors (empty when valid)
        """
        errors = []
        item_ref = f"item {index}" if index is not None else "item"

        # A non-dict item cannot be inspected further; report and stop.
        if not isinstance(item, dict):
            errors.append(f"{item_ref}: Must be a dictionary")
            return errors

        # 'input' is required, must be a string, and must be non-blank.
        if 'input' not in item:
            errors.append(f"{item_ref}: Missing required 'input' field")
        elif not isinstance(item['input'], str):
            errors.append(f"{item_ref}: 'input' field must be a string")
        elif not item['input'].strip():
            errors.append(f"{item_ref}: 'input' field cannot be empty")

        # 'output' may be absent or empty, but when present must be a string.
        if 'output' in item:
            if not isinstance(item['output'], str):
                errors.append(f"{item_ref}: 'output' field must be a string")

        # 'metadata', when present, must be a dictionary.
        if 'metadata' in item and not isinstance(item['metadata'], dict):
            errors.append(f"{item_ref}: 'metadata' field must be a dictionary")

        return errors

    def validate_gepa_format(self, gepa_data: List[Dict[str, Any]]) -> Tuple[bool, List[str]]:
        """
        Validate data in GEPA format.

        GEPA items require 'input', 'expected_output', and a dict-valued
        'metadata' field.

        Args:
            gepa_data: Data in GEPA format

        Returns:
            Tuple[bool, List[str]]: (is_valid, list_of_errors)
        """
        errors = []

        if not gepa_data:
            errors.append("GEPA dataset is empty")
            return False, errors

        for idx, item in enumerate(gepa_data):
            # Guard non-dict entries: `'input' not in item` would raise
            # TypeError on e.g. an int instead of reporting an error.
            if not isinstance(item, dict):
                errors.append(f"GEPA item {idx}: Must be a dictionary")
                continue

            if 'input' not in item:
                errors.append(f"GEPA item {idx}: Missing 'input' field")

            if 'expected_output' not in item:
                errors.append(f"GEPA item {idx}: Missing 'expected_output' field")

            if 'metadata' not in item:
                errors.append(f"GEPA item {idx}: Missing 'metadata' field")
            elif not isinstance(item['metadata'], dict):
                errors.append(f"GEPA item {idx}: 'metadata' must be a dictionary")

        return len(errors) == 0, errors

    def validate_split(self, trainset: List[Dict], valset: List[Dict]) -> Tuple[bool, List[str]]:
        """
        Validate a train/validation split.

        Flags empty sides and badly skewed proportions (train share must
        fall within [50%, 95%] of the combined data).

        Args:
            trainset: Training data
            valset: Validation data

        Returns:
            Tuple[bool, List[str]]: (is_valid, list_of_errors)
        """
        errors = []

        if not trainset:
            errors.append("Training set is empty")

        if not valset:
            errors.append("Validation set is empty")

        # Check proportions; skip when both sides are empty to avoid
        # dividing by zero.
        total_size = len(trainset) + len(valset)
        if total_size > 0:
            train_ratio = len(trainset) / total_size
            if train_ratio < 0.5:
                errors.append(f"Training set too small: {train_ratio:.2%} of total data")
            elif train_ratio > 0.95:
                errors.append(f"Validation set too small: {1-train_ratio:.2%} of total data")

        return len(errors) == 0, errors

    def get_dataset_stats(self, dataset: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Get statistics about the dataset.

        Args:
            dataset: Dataset to analyze

        Returns:
            Dict[str, Any]: Counts, average input/output lengths, and a
            'valid' heuristic flag (non-empty and <50% blank inputs).
        """
        if not dataset:
            return {'total_items': 0, 'valid': False}

        stats = {
            'total_items': len(dataset),
            # Counts items whose 'output' is truthy (non-empty string).
            'has_output': sum(1 for item in dataset if item.get('output')),
            'avg_input_length': 0,
            'avg_output_length': 0,
            'empty_inputs': 0,
            'empty_outputs': 0
        }

        input_lengths = []
        output_lengths = []

        # Non-dict items and non-string fields are silently skipped here;
        # validate_dataset() is the place that reports them.
        for item in dataset:
            if isinstance(item, dict):
                input_text = item.get('input', '')
                output_text = item.get('output', '')

                if isinstance(input_text, str):
                    input_lengths.append(len(input_text))
                    if not input_text.strip():
                        stats['empty_inputs'] += 1

                if isinstance(output_text, str):
                    output_lengths.append(len(output_text))
                    if not output_text.strip():
                        stats['empty_outputs'] += 1

        if input_lengths:
            stats['avg_input_length'] = sum(input_lengths) / len(input_lengths)

        if output_lengths:
            stats['avg_output_length'] = sum(output_lengths) / len(output_lengths)

        # Heuristic validity flag: non-empty and fewer than 50% blank inputs.
        stats['valid'] = (
            stats['total_items'] > 0 and
            stats['empty_inputs'] < stats['total_items'] * 0.5
        )

        return stats