"""
Universal converter for datasets to GEPA format with a 3-way split (train/val/test)
"""
import logging
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

from .loaders import DataLoader
from ..utils.exceptions import DatasetError
from ..models.config import DataSplitConfig
logger = logging.getLogger(__name__)
class UniversalConverter:
"""
Universal converter for datasets to GEPA format.
Handles 3-way splitting (train/val/test) with configurable ratios and
graceful handling of small datasets.
"""
def __init__(self, data_split_config: Optional[DataSplitConfig] = None):
"""
Initialize converter with optional split configuration.
Args:
data_split_config: Configuration for train/val/test splits.
                If None, uses the default 60/20/20 split.
"""
self.supported_extensions = [
'.csv', '.json', '.jsonl', '.txt', '.md',
'.png', '.jpg', '.jpeg'
]
self.loader = DataLoader()
self.data_split_config = data_split_config or DataSplitConfig()
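
    # A minimal construction sketch. Hedged: the DataSplitConfig field names
    # below (train_ratio/val_ratio/test_ratio) are assumptions about
    # ..models.config, not confirmed API.
    #
    #     converter = UniversalConverter()  # default 60/20/20 split
    #     custom = DataSplitConfig(train_ratio=0.7, val_ratio=0.2, test_ratio=0.1)
    #     converter = UniversalConverter(data_split_config=custom)
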
def convert(
self,
dataset: Union[List[Any], str, Any, Dict[str, Any]],
split_config: Optional[DataSplitConfig] = None
) -> Tuple[List[dict], List[dict], List[dict]]:
"""
Convert any dataset to GEPA format with 3-way split (train/val/test).
Args:
dataset: Input dataset in any supported format
split_config: Optional split configuration (overrides instance config)
Returns:
            Tuple of (trainset, valset, testset) where:
            - trainset: Used for reflection/feedback (D_feedback in the GEPA paper)
            - valset: Used for Pareto selection (D_pareto in the GEPA paper)
            - testset: Held out for final evaluation (not passed to GEPA)
Raises:
DatasetError: If dataset cannot be converted or is too small
"""
try:
# Use provided split config or instance default
config = split_config or self.data_split_config
            # Handle the UI tree dataset descriptor (paired JSON trees + screenshots)
            if isinstance(dataset, dict) and dataset.get('type') == 'ui_tree_dataset':
return self.convert_ui_tree_dataset(
dataset.get('json_dir', 'json_tree'),
dataset.get('screenshots_dir', 'screenshots'),
split_config=config
)
elif isinstance(dataset, str):
data = self._load_from_path(dataset)
elif hasattr(dataset, 'to_dict'): # pandas DataFrame
data = dataset.to_dict(orient='records')
elif isinstance(dataset, list):
data = dataset
else:
data = [dataset]
logger.info(f"Normalized data length: {len(data)}")
standardized = self._standardize(data)
train, val, test = self._split_three_way(standardized, config)
return train, val, test
        except (FileNotFoundError, ValueError, TypeError) as e:
            raise DatasetError(f"Failed to convert dataset: {e}") from e
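
    # Usage sketch for convert(): every supported input shape collapses to the
    # same (train, val, test) triple. The path and records below are
    # illustrative only.
    #
    #     converter = UniversalConverter()
    #     train, val, test = converter.convert("data/examples.jsonl")
    #     train, val, test = converter.convert([
    #         {"question": "2+2?", "answer": "4"},
    #     ])
    #     # train -> GEPA reflection/feedback, val -> Pareto selection,
    #     # test  -> held out for final evaluation
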
def _load_from_path(self, path: str) -> List[Any]:
"""Load data from file path"""
p = Path(path)
if not p.exists():
raise FileNotFoundError(f"File not found: {path}")
ext = p.suffix.lower()
if ext in self.supported_extensions:
return [self.loader.load(p)]
else:
raise DatasetError(f"Unsupported file extension: {ext}")
def _standardize(self, data: List[Any]) -> List[dict]:
"""Standardize data to input/output format
Handles both UI tree JSON format and simple text inputs.
UI tree format should have: {'screenshot': str, 'ui_tree': dict, 'expected_output': str}
Simple format can be: {'input': str, 'output': str} or {'question': str, 'answer': str} etc.
"""
out = []
for item in data:
if not isinstance(item, dict):
item = {'input': str(item)}
# Handle UI tree JSON format
if 'ui_tree' in item and 'screenshot' in item:
ui_tree = item['ui_tree']
input_text = ui_tree.get('text', '')
output_text = item.get('expected_output', '')
image = item.get('screenshot', '')
out.append({'input': input_text, 'output': output_text, 'image': image})
# Handle simple text format
else:
inp = self._extract(item, ['input', 'question', 'text', 'prompt']) or ''
outp = self._extract(item, ['output', 'result', 'response', 'answer', 'expected_output']) or ''
image = self._extract(item, ['image', 'image_base64', 'screenshot']) or ''
out.append({'input': inp, 'output': outp, 'image': image})
return out
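
    # Standardization sketch: both supported shapes map onto the same record.
    #
    #     {"question": "2+2?", "answer": "4"}
    #         -> {"input": "2+2?", "output": "4", "image": ""}
    #     {"ui_tree": {"text": "Submit"}, "screenshot": "img.png",
    #      "expected_output": "click"}
    #         -> {"input": "Submit", "output": "click", "image": "img.png"}
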
def _extract(self, d: dict, keys: List[str]) -> Union[str, None]:
"""Extract value by trying multiple keys"""
for k in keys:
if k in d:
return d[k]
return None
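
    # Key-priority sketch: the first matching key wins, so
    # _extract({"question": "Q", "input": "I"}, ["input", "question"]) -> "I".
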
def _split_three_way(
self,
data: List[dict],
config: DataSplitConfig
) -> Tuple[List[dict], List[dict], List[dict]]:
"""
Split data into train, validation, and test sets.
Args:
data: Standardized dataset
config: Split configuration with ratios and strategies
Returns:
Tuple of (train, val, test) datasets
Raises:
ValueError: If dataset is too small for configured splits
"""
dataset_size = len(data)
        # Log the ratios chosen when the adaptive small-dataset strategy is active
if config.small_dataset_strategy == 'adaptive':
train_ratio, val_ratio, test_ratio = config.get_adaptive_ratios(dataset_size)
logger.info(
f"📊 Adaptive dataset splitting (strategy: adaptive, size: {dataset_size}): "
f"ratios = {train_ratio*100:.0f}%/{val_ratio*100:.0f}%/{test_ratio*100:.0f}% "
f"(prioritizes validation for reliable candidate ranking)"
)
# Get split indices from config
try:
train_end, val_end, test_end, _ = config.get_split_indices(dataset_size)
        except ValueError as e:
            logger.error(f"Dataset split error: {e}")
            raise DatasetError(str(e)) from e
# Perform the split
train = data[:train_end]
val = data[train_end:val_end]
test = data[val_end:test_end]
# Log split information with strategy
strategy_note = ""
if config.small_dataset_strategy == 'adaptive':
strategy_note = " (adaptive)"
logger.info(
f"Dataset split{strategy_note}: {len(train)} train ({len(train)/dataset_size*100:.1f}%), "
f"{len(val)} val ({len(val)/dataset_size*100:.1f}%), "
f"{len(test)} test ({len(test)/dataset_size*100:.1f}%)"
)
# Validate splits are not empty
if len(train) == 0:
raise DatasetError("Training set is empty after split")
        if len(val) == 0:
            logger.warning(
                "Validation set is empty - falling back to the last training "
                "sample (this duplicates one example across train and val and "
                "may skew Pareto selection)"
            )
            val = [train[-1]]
if len(test) == 0:
logger.warning("Test set is empty - final evaluation will not be performed")
return train, val, test
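
    # Split sketch. Assuming get_split_indices() returns contiguous slice
    # boundaries (an assumption about DataSplitConfig, matching how the
    # indices are used above), a 10-item dataset at the default 60/20/20
    # ratios splits as:
    #
    #     train = data[:6]    # items 0-5, feedback set
    #     val   = data[6:8]   # items 6-7, Pareto set
    #     test  = data[8:10]  # items 8-9, held out
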
def _split(self, data: List[dict], ratio: float = 0.8) -> Tuple[List[dict], List[dict]]:
"""
DEPRECATED: Legacy 2-way split for backwards compatibility.
Use _split_three_way() instead for production code.
Args:
data: Standardized dataset
ratio: Train ratio (0.0-1.0)
Returns:
Tuple of (train, val) datasets
"""
        warnings.warn(
            "_split() is deprecated. Use _split_three_way() for 3-way splitting.",
            DeprecationWarning,
            stacklevel=2
        )
split = max(1, int(len(data) * ratio))
train = data[:split]
val = data[split:] or data[-1:] # Ensure val is not empty
return train, val
def convert_ui_tree_dataset(
self,
json_dir: str,
screenshots_dir: str,
split_config: Optional[DataSplitConfig] = None
) -> Tuple[List[dict], List[dict], List[dict]]:
"""
Convert UI tree dataset (JSON + screenshots) to GEPA format with 3-way split.
Args:
json_dir: Directory containing JSON files
screenshots_dir: Directory containing screenshot images
split_config: Optional split configuration (overrides instance config)
Returns:
Tuple of (train_data, val_data, test_data) in GEPA format
Raises:
DatasetError: If dataset cannot be loaded or is invalid
"""
try:
# Load paired dataset
dataset = self.loader.load_ui_tree_dataset(json_dir, screenshots_dir)
if not dataset:
raise DatasetError("No valid image-JSON pairs found")
logger.info(f"Loaded {len(dataset)} UI tree samples")
# Use provided config or instance default
config = split_config or self.data_split_config
# Split into train/val/test
train, val, test = self._split_three_way(dataset, config)
logger.info(
f"Split UI tree dataset: {len(train)} train, "
f"{len(val)} validation, {len(test)} test"
)
return train, val, test
        except DatasetError:
            raise
        except Exception as e:
            raise DatasetError(f"Failed to convert UI tree dataset: {e}") from e
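

if __name__ == "__main__":
    # Smoke-test sketch, not part of the library API. Run via `python -m`
    # since this module uses relative imports. The toy records use keys that
    # _standardize() already accepts; actual split sizes depend on the
    # DataSplitConfig defaults.
    logging.basicConfig(level=logging.INFO)
    toy = [{"input": f"q{i}", "output": f"a{i}"} for i in range(10)]
    converter = UniversalConverter()
    train, val, test = converter.convert(toy)
    print(f"train={len(train)} val={len(val)} test={len(test)}")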