File size: 10,249 Bytes
cacd4d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
"""
Universal converter for dataset to GEPA format with 3-way split (train/val/test)
"""

import os
import json
from typing import Any, List, Tuple, Union, Dict, Optional
from pathlib import Path
import pandas as pd
import logging

from .loaders import DataLoader
from ..utils.exceptions import DatasetError
from ..models.config import DataSplitConfig

logger = logging.getLogger(__name__)

class UniversalConverter:
    """
    Universal converter for datasets to GEPA format.
    
    Handles 3-way splitting (train/val/test) with configurable ratios and
    graceful handling of small datasets.
    """
    
    def __init__(self, data_split_config: Optional[DataSplitConfig] = None):
        """
        Initialize converter with optional split configuration.
        
        Args:
            data_split_config: Configuration for train/val/test splits.
                             If None, uses default 60/20/20 split.
        """
        # Extensions _load_from_path will accept; loading itself is delegated
        # to DataLoader, so this list must stay in sync with its capabilities.
        self.supported_extensions = [
            '.csv', '.json', '.jsonl', '.txt', '.md',
            '.png', '.jpg', '.jpeg'
        ]
        self.loader = DataLoader()
        self.data_split_config = data_split_config or DataSplitConfig()

    def convert(
        self, 
        dataset: Union[List[Any], str, Any, Dict[str, Any]],
        split_config: Optional[DataSplitConfig] = None
    ) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Convert any dataset to GEPA format with 3-way split (train/val/test).
        
        Args:
            dataset: Input dataset in any supported format
            split_config: Optional split configuration (overrides instance config)
            
        Returns:
            Tuple of (trainset, valset, testset) where:
            - trainset: Used for reflection/feedback (Dfeedback in GEPA paper)
            - valset: Used for Pareto selection (Dpareto in GEPA paper)
            - testset: Held-out for final evaluation (not passed to GEPA)
            
        Raises:
            DatasetError: If dataset cannot be converted or is too small
        """
        try:
            # Use provided split config or instance default
            config = split_config or self.data_split_config
            
            # Handle UI tree dataset format
            if isinstance(dataset, dict) and dataset.get('type') == 'ui_tree_dataset':
                return self.convert_ui_tree_dataset(
                    dataset.get('json_dir', 'json_tree'),
                    dataset.get('screenshots_dir', 'screenshots'),
                    split_config=config
                )
            elif isinstance(dataset, str):
                data = self._load_from_path(dataset)
            elif hasattr(dataset, 'to_dict'):  # pandas DataFrame
                data = dataset.to_dict(orient='records')
            elif isinstance(dataset, list):
                data = dataset
            else:
                # Single scalar/object: treat as a one-item dataset
                data = [dataset]

            logger.info(f"Normalized data length: {len(data)}")
            standardized = self._standardize(data)
            train, val, test = self._split_three_way(standardized, config)
            return train, val, test
        except (FileNotFoundError, ValueError, TypeError) as e:
            # Chain the original exception so the root cause survives wrapping
            raise DatasetError(f"Failed to convert dataset: {str(e)}") from e

    def _load_from_path(self, path: str) -> List[Any]:
        """Load data from file path.
        
        Args:
            path: Path to a file with one of the supported extensions.
            
        Returns:
            Single-element list wrapping whatever DataLoader.load() returns.
            
        Raises:
            FileNotFoundError: If the path does not exist.
            DatasetError: If the extension is not supported.
        """
        p = Path(path)
        if not p.exists():
            raise FileNotFoundError(f"File not found: {path}")
        
        ext = p.suffix.lower()
        if ext in self.supported_extensions:
            # NOTE(review): the loaded content is wrapped in a list, so a file
            # containing many records becomes ONE item here — presumably
            # _standardize/DataLoader handle that; verify against DataLoader.
            return [self.loader.load(p)]
        else:
            raise DatasetError(f"Unsupported file extension: {ext}")

    def _standardize(self, data: List[Any]) -> List[dict]:
        """Standardize data to input/output format
        
        Handles both UI tree JSON format and simple text inputs.
        UI tree format should have: {'screenshot': str, 'ui_tree': dict, 'expected_output': str}
        Simple format can be: {'input': str, 'output': str} or {'question': str, 'answer': str} etc.
        """
        out = []
        for item in data:
            if not isinstance(item, dict):
                # Coerce scalars/objects into a minimal dict with empty output
                item = {'input': str(item)}
                
            # Handle UI tree JSON format
            if 'ui_tree' in item and 'screenshot' in item:
                ui_tree = item['ui_tree']
                input_text = ui_tree.get('text', '')
                output_text = item.get('expected_output', '')
                image = item.get('screenshot', '')
                out.append({'input': input_text, 'output': output_text, 'image': image})
            # Handle simple text format
            else:
                inp = self._extract(item, ['input', 'question', 'text', 'prompt']) or ''
                outp = self._extract(item, ['output', 'result', 'response', 'answer', 'expected_output']) or ''
                image = self._extract(item, ['image', 'image_base64', 'screenshot']) or ''
                out.append({'input': inp, 'output': outp, 'image': image})
                
        return out

    def _extract(self, d: dict, keys: List[str]) -> Union[str, None]:
        """Return the value for the first key present in d, or None.
        
        Note: a key present with a falsy value (e.g. '') is still returned;
        callers apply `or ''` to normalize.
        """
        for k in keys:
            if k in d:
                return d[k]
        return None

    def _split_three_way(
        self, 
        data: List[dict], 
        config: DataSplitConfig
    ) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Split data into train, validation, and test sets.
        
        Args:
            data: Standardized dataset
            config: Split configuration with ratios and strategies
            
        Returns:
            Tuple of (train, val, test) datasets
            
        Raises:
            DatasetError: If dataset is empty or too small for configured splits
        """
        dataset_size = len(data)
        
        # Guard: the percentage logging below divides by dataset_size, and an
        # empty dataset can never be split meaningfully anyway.
        if dataset_size == 0:
            raise DatasetError("Cannot split an empty dataset")
        
        # 🔥 NEW: Log adaptive strategy if being used
        if config.small_dataset_strategy == 'adaptive':
            train_ratio, val_ratio, test_ratio = config.get_adaptive_ratios(dataset_size)
            logger.info(
                f"📊 Adaptive dataset splitting (strategy: adaptive, size: {dataset_size}): "
                f"ratios = {train_ratio*100:.0f}%/{val_ratio*100:.0f}%/{test_ratio*100:.0f}% "
                f"(prioritizes validation for reliable candidate ranking)"
            )
        
        # Get split indices from config
        try:
            train_end, val_end, test_end, _ = config.get_split_indices(dataset_size)
        except ValueError as e:
            logger.error(f"Dataset split error: {e}")
            raise DatasetError(str(e)) from e
        
        # Perform the split
        train = data[:train_end]
        val = data[train_end:val_end]
        test = data[val_end:test_end]
        
        # Log split information with strategy
        strategy_note = ""
        if config.small_dataset_strategy == 'adaptive':
            strategy_note = " (adaptive)"
        logger.info(
            f"Dataset split{strategy_note}: {len(train)} train ({len(train)/dataset_size*100:.1f}%), "
            f"{len(val)} val ({len(val)/dataset_size*100:.1f}%), "
            f"{len(test)} test ({len(test)/dataset_size*100:.1f}%)"
        )
        
        # Validate splits are not empty
        if len(train) == 0:
            raise DatasetError("Training set is empty after split")
        if len(val) == 0:
            logger.warning("Validation set is empty - this may cause issues with Pareto selection")
            val = [train[-1]]  # Use last training sample as fallback
        if len(test) == 0:
            logger.warning("Test set is empty - final evaluation will not be performed")
        
        return train, val, test
    
    def _split(self, data: List[dict], ratio: float = 0.8) -> Tuple[List[dict], List[dict]]:
        """
        DEPRECATED: Legacy 2-way split for backwards compatibility.
        
        Use _split_three_way() instead for production code.
        
        Args:
            data: Standardized dataset
            ratio: Train ratio (0.0-1.0)
            
        Returns:
            Tuple of (train, val) datasets
        """
        import warnings
        warnings.warn(
            "_split() is deprecated. Use _split_three_way() for 3-way splitting.",
            DeprecationWarning,
            stacklevel=2
        )
        
        # max(1, ...) guarantees at least one training sample
        split = max(1, int(len(data) * ratio))
        train = data[:split]
        val = data[split:] or data[-1:]  # Ensure val is not empty
        return train, val

    def convert_ui_tree_dataset(
        self, 
        json_dir: str, 
        screenshots_dir: str,
        split_config: Optional[DataSplitConfig] = None
    ) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Convert UI tree dataset (JSON + screenshots) to GEPA format with 3-way split.
        
        Args:
            json_dir: Directory containing JSON files
            screenshots_dir: Directory containing screenshot images
            split_config: Optional split configuration (overrides instance config)
            
        Returns:
            Tuple of (train_data, val_data, test_data) in GEPA format
            
        Raises:
            DatasetError: If dataset cannot be loaded or is invalid
        """
        try:
            # Load paired dataset
            dataset = self.loader.load_ui_tree_dataset(json_dir, screenshots_dir)
            
            if not dataset:
                raise DatasetError("No valid image-JSON pairs found")
            
            logger.info(f"Loaded {len(dataset)} UI tree samples")
            
            # Use provided config or instance default
            config = split_config or self.data_split_config
            
            # Split into train/val/test
            train, val, test = self._split_three_way(dataset, config)
            
            logger.info(
                f"Split UI tree dataset: {len(train)} train, "
                f"{len(val)} validation, {len(test)} test"
            )
            return train, val, test
            
        except DatasetError:
            # Already the right exception type; don't double-wrap the message
            raise
        except Exception as e:
            raise DatasetError(f"Failed to convert UI tree dataset: {str(e)}") from e