Spaces:
Sleeping
Sleeping
| """ | |
| Universal converter for dataset to GEPA format with 3-way split (train/val/test) | |
| """ | |
| import os | |
| import json | |
| from typing import Any, List, Tuple, Union, Dict, Optional | |
| from pathlib import Path | |
| import pandas as pd | |
| import logging | |
| from .loaders import DataLoader | |
| from ..utils.exceptions import DatasetError | |
| from ..models.config import DataSplitConfig | |
| logger = logging.getLogger(__name__) | |
class UniversalConverter:
    """
    Universal converter for datasets to GEPA format.
    Handles 3-way splitting (train/val/test) with configurable ratios and
    graceful handling of small datasets.
    """

    def __init__(self, data_split_config: Optional[DataSplitConfig] = None):
        """
        Initialize converter with optional split configuration.

        Args:
            data_split_config: Configuration for train/val/test splits.
                If None, uses default 60/20/20 split.
        """
        # Extensions DataLoader can ingest; anything else is rejected by
        # _load_from_path with a DatasetError.
        self.supported_extensions = [
            '.csv', '.json', '.jsonl', '.txt', '.md',
            '.png', '.jpg', '.jpeg'
        ]
        self.loader = DataLoader()
        self.data_split_config = data_split_config or DataSplitConfig()

    def convert(
        self,
        dataset: Union[List[Any], str, Any, Dict[str, Any]],
        split_config: Optional[DataSplitConfig] = None
    ) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Convert any dataset to GEPA format with 3-way split (train/val/test).

        Args:
            dataset: Input dataset in any supported format
            split_config: Optional split configuration (overrides instance config)

        Returns:
            Tuple of (trainset, valset, testset) where:
            - trainset: Used for reflection/feedback (Dfeedback in GEPA paper)
            - valset: Used for Pareto selection (Dpareto in GEPA paper)
            - testset: Held-out for final evaluation (not passed to GEPA)

        Raises:
            DatasetError: If dataset cannot be converted or is too small
        """
        try:
            # Use provided split config or instance default
            config = split_config or self.data_split_config
            # Handle UI tree dataset format (marker dict produced upstream)
            if isinstance(dataset, dict) and dataset.get('type') == 'ui_tree_dataset':
                return self.convert_ui_tree_dataset(
                    dataset.get('json_dir', 'json_tree'),
                    dataset.get('screenshots_dir', 'screenshots'),
                    split_config=config
                )
            elif isinstance(dataset, str):
                data = self._load_from_path(dataset)
            elif hasattr(dataset, 'to_dict'):  # pandas DataFrame
                data = dataset.to_dict(orient='records')
            elif isinstance(dataset, list):
                data = dataset
            else:
                # Single object: wrap so downstream always sees a list
                data = [dataset]
            logger.info(f"Normalized data length: {len(data)}")
            standardized = self._standardize(data)
            train, val, test = self._split_three_way(standardized, config)
            return train, val, test
        except (FileNotFoundError, ValueError, TypeError) as e:
            # Chain the cause so the original traceback survives the re-wrap.
            raise DatasetError(f"Failed to convert dataset: {str(e)}") from e

    def _load_from_path(self, path: str) -> List[Any]:
        """Load data from file path.

        Returns the loaded payload wrapped in a single-element list so it
        matches the list shape expected by _standardize.

        Raises:
            FileNotFoundError: If the path does not exist.
            DatasetError: If the extension is not in supported_extensions.
        """
        p = Path(path)
        if not p.exists():
            raise FileNotFoundError(f"File not found: {path}")
        ext = p.suffix.lower()
        if ext in self.supported_extensions:
            return [self.loader.load(p)]
        else:
            raise DatasetError(f"Unsupported file extension: {ext}")

    def _standardize(self, data: List[Any]) -> List[dict]:
        """Standardize data to input/output format

        Handles both UI tree JSON format and simple text inputs.
        UI tree format should have: {'screenshot': str, 'ui_tree': dict, 'expected_output': str}
        Simple format can be: {'input': str, 'output': str} or {'question': str, 'answer': str} etc.
        """
        out = []
        for item in data:
            if not isinstance(item, dict):
                # Non-dict samples (plain strings, numbers, ...) become bare inputs
                item = {'input': str(item)}
            # Handle UI tree JSON format
            if 'ui_tree' in item and 'screenshot' in item:
                ui_tree = item['ui_tree']
                input_text = ui_tree.get('text', '')
                output_text = item.get('expected_output', '')
                image = item.get('screenshot', '')
                out.append({'input': input_text, 'output': output_text, 'image': image})
            # Handle simple text format
            else:
                # `or ''` intentionally coerces falsy values (None, '') to ''
                inp = self._extract(item, ['input', 'question', 'text', 'prompt']) or ''
                outp = self._extract(item, ['output', 'result', 'response', 'answer', 'expected_output']) or ''
                image = self._extract(item, ['image', 'image_base64', 'screenshot']) or ''
                out.append({'input': inp, 'output': outp, 'image': image})
        return out

    def _extract(self, d: dict, keys: List[str]) -> Union[str, None]:
        """Extract value by trying multiple keys; None when no key matches."""
        for k in keys:
            if k in d:
                return d[k]
        return None

    def _split_three_way(
        self,
        data: List[dict],
        config: DataSplitConfig
    ) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Split data into train, validation, and test sets.

        Args:
            data: Standardized dataset
            config: Split configuration with ratios and strategies

        Returns:
            Tuple of (train, val, test) datasets

        Raises:
            DatasetError: If dataset is too small for configured splits or
                the training set would be empty.
        """
        dataset_size = len(data)
        # Log adaptive strategy if being used
        if config.small_dataset_strategy == 'adaptive':
            train_ratio, val_ratio, test_ratio = config.get_adaptive_ratios(dataset_size)
            logger.info(
                f"📊 Adaptive dataset splitting (strategy: adaptive, size: {dataset_size}): "
                f"ratios = {train_ratio*100:.0f}%/{val_ratio*100:.0f}%/{test_ratio*100:.0f}% "
                f"(prioritizes validation for reliable candidate ranking)"
            )
        # Get split indices from config
        try:
            train_end, val_end, test_end, _ = config.get_split_indices(dataset_size)
        except ValueError as e:
            logger.error(f"Dataset split error: {e}")
            # Chain so callers can see the underlying ValueError.
            raise DatasetError(str(e)) from e
        # Perform the split (contiguous slices, no shuffling here)
        train = data[:train_end]
        val = data[train_end:val_end]
        test = data[val_end:test_end]
        # Log split information with strategy
        strategy_note = ""
        if config.small_dataset_strategy == 'adaptive':
            strategy_note = " (adaptive)"
        logger.info(
            f"Dataset split{strategy_note}: {len(train)} train ({len(train)/dataset_size*100:.1f}%), "
            f"{len(val)} val ({len(val)/dataset_size*100:.1f}%), "
            f"{len(test)} test ({len(test)/dataset_size*100:.1f}%)"
        )
        # Validate splits are not empty
        if len(train) == 0:
            raise DatasetError("Training set is empty after split")
        if len(val) == 0:
            logger.warning("Validation set is empty - this may cause issues with Pareto selection")
            # Fallback shares the last training sample (same object in both sets)
            val = [train[-1]]
        if len(test) == 0:
            logger.warning("Test set is empty - final evaluation will not be performed")
        return train, val, test

    def _split(self, data: List[dict], ratio: float = 0.8) -> Tuple[List[dict], List[dict]]:
        """
        DEPRECATED: Legacy 2-way split for backwards compatibility.
        Use _split_three_way() instead for production code.

        Args:
            data: Standardized dataset
            ratio: Train ratio (0.0-1.0)

        Returns:
            Tuple of (train, val) datasets
        """
        # Local import: warnings only needed on this deprecated path.
        import warnings
        warnings.warn(
            "_split() is deprecated. Use _split_three_way() for 3-way splitting.",
            DeprecationWarning,
            stacklevel=2
        )
        split = max(1, int(len(data) * ratio))
        train = data[:split]
        val = data[split:] or data[-1:]  # Ensure val is not empty
        return train, val

    def convert_ui_tree_dataset(
        self,
        json_dir: str,
        screenshots_dir: str,
        split_config: Optional[DataSplitConfig] = None
    ) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Convert UI tree dataset (JSON + screenshots) to GEPA format with 3-way split.

        Args:
            json_dir: Directory containing JSON files
            screenshots_dir: Directory containing screenshot images
            split_config: Optional split configuration (overrides instance config)

        Returns:
            Tuple of (train_data, val_data, test_data) in GEPA format

        Raises:
            DatasetError: If dataset cannot be loaded or is invalid
        """
        try:
            # Load paired dataset
            dataset = self.loader.load_ui_tree_dataset(json_dir, screenshots_dir)
            if not dataset:
                raise DatasetError("No valid image-JSON pairs found")
            logger.info(f"Loaded {len(dataset)} UI tree samples")
            # Use provided config or instance default
            config = split_config or self.data_split_config
            # Split into train/val/test
            train, val, test = self._split_three_way(dataset, config)
            logger.info(
                f"Split UI tree dataset: {len(train)} train, "
                f"{len(val)} validation, {len(test)} test"
            )
            return train, val, test
        except DatasetError:
            # Already a domain error (possibly raised above) - don't double-wrap.
            raise
        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise DatasetError(f"Failed to convert UI tree dataset: {str(e)}") from e