"""Universal converter for dataset to GEPA format with 3-way split (train/val/test)."""
import os
import json
from typing import Any, List, Tuple, Union, Dict, Optional
from pathlib import Path
import pandas as pd
import logging

from .loaders import DataLoader
from ..utils.exceptions import DatasetError
from ..models.config import DataSplitConfig

logger = logging.getLogger(__name__)


class UniversalConverter:
    """
    Universal converter for datasets to GEPA format.

    Handles 3-way splitting (train/val/test) with configurable ratios
    and graceful handling of small datasets.
    """

    def __init__(self, data_split_config: Optional[DataSplitConfig] = None):
        """
        Initialize converter with optional split configuration.

        Args:
            data_split_config: Configuration for train/val/test splits.
                               If None, uses default 60/20/20 split.
        """
        self.supported_extensions = [
            '.csv', '.json', '.jsonl', '.txt', '.md', '.png', '.jpg', '.jpeg'
        ]
        self.loader = DataLoader()
        self.data_split_config = data_split_config or DataSplitConfig()

    def convert(
        self,
        dataset: Union[List[Any], str, Any, Dict[str, Any]],
        split_config: Optional[DataSplitConfig] = None
    ) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Convert any dataset to GEPA format with 3-way split (train/val/test).

        Args:
            dataset: Input dataset in any supported format
            split_config: Optional split configuration (overrides instance config)

        Returns:
            Tuple of (trainset, valset, testset) where:
            - trainset: Used for reflection/feedback (Dfeedback in GEPA paper)
            - valset: Used for Pareto selection (Dpareto in GEPA paper)
            - testset: Held-out for final evaluation (not passed to GEPA)

        Raises:
            DatasetError: If dataset cannot be converted or is too small
        """
        try:
            # Use provided split config or instance default
            config = split_config or self.data_split_config

            # UI tree datasets arrive as a marker dict, not raw records.
            if isinstance(dataset, dict) and dataset.get('type') == 'ui_tree_dataset':
                return self.convert_ui_tree_dataset(
                    dataset.get('json_dir', 'json_tree'),
                    dataset.get('screenshots_dir', 'screenshots'),
                    split_config=config
                )
            elif isinstance(dataset, str):
                data = self._load_from_path(dataset)
            elif hasattr(dataset, 'to_dict'):  # pandas DataFrame (duck-typed)
                data = dataset.to_dict(orient='records')
            elif isinstance(dataset, list):
                data = dataset
            else:
                # Single record: wrap so downstream code always sees a list.
                data = [dataset]

            logger.info(f"Normalized data length: {len(data)}")
            standardized = self._standardize(data)
            train, val, test = self._split_three_way(standardized, config)
            return train, val, test
        except (FileNotFoundError, ValueError, TypeError) as e:
            # Chain the cause so the original traceback survives the re-raise.
            raise DatasetError(f"Failed to convert dataset: {str(e)}") from e

    def _load_from_path(self, path: str) -> List[Any]:
        """Load data from file path.

        Returns a single-element list wrapping whatever the loader produced.

        Raises:
            FileNotFoundError: If the path does not exist.
            DatasetError: If the file extension is not supported.
        """
        p = Path(path)
        if not p.exists():
            raise FileNotFoundError(f"File not found: {path}")
        ext = p.suffix.lower()
        if ext in self.supported_extensions:
            return [self.loader.load(p)]
        else:
            raise DatasetError(f"Unsupported file extension: {ext}")

    def _standardize(self, data: List[Any]) -> List[dict]:
        """Standardize data to input/output format.

        Handles both UI tree JSON format and simple text inputs.
        UI tree format should have:
            {'screenshot': str, 'ui_tree': dict, 'expected_output': str}
        Simple format can be:
            {'input': str, 'output': str} or {'question': str, 'answer': str} etc.
        """
        out = []
        for item in data:
            if not isinstance(item, dict):
                # Non-dict records become bare text inputs with no expected output.
                item = {'input': str(item)}

            # Handle UI tree JSON format
            if 'ui_tree' in item and 'screenshot' in item:
                ui_tree = item['ui_tree']
                input_text = ui_tree.get('text', '')
                output_text = item.get('expected_output', '')
                image = item.get('screenshot', '')
                out.append({'input': input_text, 'output': output_text, 'image': image})
            # Handle simple text format
            else:
                inp = self._extract(item, ['input', 'question', 'text', 'prompt']) or ''
                outp = self._extract(item, ['output', 'result', 'response', 'answer', 'expected_output']) or ''
                image = self._extract(item, ['image', 'image_base64', 'screenshot']) or ''
                out.append({'input': inp, 'output': outp, 'image': image})
        return out

    def _extract(self, d: dict, keys: List[str]) -> Union[str, None]:
        """Extract value by trying multiple keys; returns the first present key's value."""
        for k in keys:
            if k in d:
                return d[k]
        return None

    def _split_three_way(
        self,
        data: List[dict],
        config: DataSplitConfig
    ) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Split data into train, validation, and test sets.

        Args:
            data: Standardized dataset
            config: Split configuration with ratios and strategies

        Returns:
            Tuple of (train, val, test) datasets

        Raises:
            DatasetError: If dataset is too small for configured splits
        """
        dataset_size = len(data)

        # Log adaptive strategy if being used (ratios recomputed only for logging).
        if config.small_dataset_strategy == 'adaptive':
            train_ratio, val_ratio, test_ratio = config.get_adaptive_ratios(dataset_size)
            logger.info(
                f"📊 Adaptive dataset splitting (strategy: adaptive, size: {dataset_size}): "
                f"ratios = {train_ratio*100:.0f}%/{val_ratio*100:.0f}%/{test_ratio*100:.0f}% "
                f"(prioritizes validation for reliable candidate ranking)"
            )

        # Get split indices from config; expected to reject datasets that are too small.
        try:
            train_end, val_end, test_end, _ = config.get_split_indices(dataset_size)
        except ValueError as e:
            logger.error(f"Dataset split error: {e}")
            raise DatasetError(str(e)) from e

        # Perform the split
        train = data[:train_end]
        val = data[train_end:val_end]
        test = data[val_end:test_end]

        # Log split information with strategy; guard against division by zero in
        # case get_split_indices ever accepts an empty dataset.
        strategy_note = ""
        if config.small_dataset_strategy == 'adaptive':
            strategy_note = " (adaptive)"
        pct = 100.0 / dataset_size if dataset_size else 0.0
        logger.info(
            f"Dataset split{strategy_note}: {len(train)} train ({len(train)*pct:.1f}%), "
            f"{len(val)} val ({len(val)*pct:.1f}%), "
            f"{len(test)} test ({len(test)*pct:.1f}%)"
        )

        # Validate splits are not empty
        if len(train) == 0:
            raise DatasetError("Training set is empty after split")
        if len(val) == 0:
            logger.warning("Validation set is empty - this may cause issues with Pareto selection")
            val = [train[-1]]  # Use last training sample as fallback
        if len(test) == 0:
            logger.warning("Test set is empty - final evaluation will not be performed")

        return train, val, test

    def _split(self, data: List[dict], ratio: float = 0.8) -> Tuple[List[dict], List[dict]]:
        """
        DEPRECATED: Legacy 2-way split for backwards compatibility.

        Use _split_three_way() instead for production code.

        Args:
            data: Standardized dataset
            ratio: Train ratio (0.0-1.0)

        Returns:
            Tuple of (train, val) datasets
        """
        import warnings
        warnings.warn(
            "_split() is deprecated. Use _split_three_way() for 3-way splitting.",
            DeprecationWarning,
            stacklevel=2
        )
        split = max(1, int(len(data) * ratio))
        train = data[:split]
        val = data[split:] or data[-1:]  # Ensure val is not empty
        return train, val

    def convert_ui_tree_dataset(
        self,
        json_dir: str,
        screenshots_dir: str,
        split_config: Optional[DataSplitConfig] = None
    ) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Convert UI tree dataset (JSON + screenshots) to GEPA format with 3-way split.

        Args:
            json_dir: Directory containing JSON files
            screenshots_dir: Directory containing screenshot images
            split_config: Optional split configuration (overrides instance config)

        Returns:
            Tuple of (train_data, val_data, test_data) in GEPA format

        Raises:
            DatasetError: If dataset cannot be loaded or is invalid
        """
        try:
            # Load paired dataset
            dataset = self.loader.load_ui_tree_dataset(json_dir, screenshots_dir)

            if not dataset:
                raise DatasetError("No valid image-JSON pairs found")

            logger.info(f"Loaded {len(dataset)} UI tree samples")

            # Use provided config or instance default
            config = split_config or self.data_split_config

            # Split into train/val/test
            train, val, test = self._split_three_way(dataset, config)

            logger.info(
                f"Split UI tree dataset: {len(train)} train, "
                f"{len(val)} validation, {len(test)} test"
            )

            return train, val, test
        except DatasetError:
            # Re-raise our own errors unchanged — avoids double-wrapping messages
            # like "Failed to convert UI tree dataset: No valid image-JSON pairs found".
            raise
        except Exception as e:
            raise DatasetError(f"Failed to convert UI tree dataset: {str(e)}") from e