Spaces:

Suhasdev
/

Universal-prompt-Optimizer

Sleeping

File size: 21,084 Bytes

cacd4d0

"""
Configuration models for GEPA Optimizer
"""

import os
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any, Union, Tuple

@dataclass
class ModelConfig:
    """Connection and sampling settings for a single LLM provider.

    Attributes:
        provider: Provider identifier, e.g. "openai", "anthropic",
            "huggingface", "vllm". Required.
        model_name: Model name as understood by the provider. Required.
        api_key: API key for the provider. Required. Excluded from the
            generated ``repr()`` so the secret does not leak into logs or
            tracebacks; it is still returned by ``to_dict()``.
        base_url: Optional custom endpoint URL.
        temperature, max_tokens, top_p, frequency_penalty, presence_penalty:
            Sampling parameters; stored as given, not range-checked here.
    """
    provider: str
    model_name: str
    api_key: str = field(repr=False)  # still required; only hidden from repr()
    base_url: Optional[str] = None
    temperature: float = 0.7
    max_tokens: int = 2048
    top_p: float = 1.0
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0

    def __post_init__(self):
        """Reject empty values for the three required fields.

        Raises:
            ValueError: If ``provider``, ``model_name`` or ``api_key`` is falsy.
        """
        if not self.provider:
            raise ValueError("Provider is required (e.g., 'openai', 'anthropic', 'huggingface')")
        if not self.model_name:
            raise ValueError("Model name is required (e.g., 'gpt-4', 'claude-3-opus')")
        if not self.api_key:
            raise ValueError(f"API key is required for {self.provider} provider")

    @classmethod
    def from_string(cls, model_string: str) -> 'ModelConfig':
        """Build a config from ``"provider/model"`` or a bare ``"model"`` string.

        Only the first ``/`` separates provider from model name, so model
        names may themselves contain slashes. A string without ``/`` is
        treated as an OpenAI model. The API key is read from the environment.

        Args:
            model_string: Model specification such as ``"openai/gpt-4"`` or ``"gpt-4"``.

        Returns:
            A ``ModelConfig`` with default sampling parameters.

        Raises:
            ValueError: If no API key is found in the environment, or if the
                parsed provider/model name is empty.
        """
        provider, separator, model_name = model_string.partition("/")
        if not separator:
            # No provider prefix: default to OpenAI.
            provider, model_name = "openai", model_string

        api_key = cls._get_api_key_for_provider(provider)
        if not api_key:
            # Name the variable that is actually looked up (e.g. REPLICATE_API_TOKEN),
            # not just the generic <PROVIDER>_API_KEY pattern.
            raise ValueError(
                f"No API key found for {provider}. "
                f"Please set {cls._env_var_for_provider(provider)} environment variable"
            )

        return cls(provider=provider, model_name=model_name, api_key=api_key)

    @classmethod
    def from_dict(cls, config_dict: dict) -> 'ModelConfig':
        """Build a config from a mapping of field names to values.

        Keys are passed straight to the constructor, so an unknown key raises
        ``TypeError`` and missing required keys raise as well.
        """
        return cls(**config_dict)

    def to_dict(self) -> dict:
        """Return all fields as a plain dictionary.

        The result includes ``api_key`` in clear text; avoid logging it.
        """
        return {
            'provider': self.provider,
            'model_name': self.model_name,
            'api_key': self.api_key,
            'base_url': self.base_url,
            'temperature': self.temperature,
            'max_tokens': self.max_tokens,
            'top_p': self.top_p,
            'frequency_penalty': self.frequency_penalty,
            'presence_penalty': self.presence_penalty,
        }

    @staticmethod
    def _env_var_for_provider(provider: str) -> str:
        """Return the name of the environment variable holding *provider*'s key.

        Known providers are matched case-insensitively; anything else falls
        back to the generic ``<PROVIDER>_API_KEY`` pattern.
        """
        known_env_vars = {
            "openai": "OPENAI_API_KEY",
            "anthropic": "ANTHROPIC_API_KEY",
            "huggingface": "HUGGINGFACE_API_KEY",
            "cohere": "COHERE_API_KEY",
            "ai21": "AI21_API_KEY",
            "together": "TOGETHER_API_KEY",
            "replicate": "REPLICATE_API_TOKEN",
            "groq": "GROQ_API_KEY",
            "ollama": "OLLAMA_API_KEY",
        }
        return known_env_vars.get(provider.lower(), f"{provider.upper()}_API_KEY")

    @staticmethod
    def _get_api_key_for_provider(provider: str) -> Optional[str]:
        """Read the API key for *provider* from the environment.

        Returns:
            The value of the provider's environment variable, or ``None`` if
            it is not set.
        """
        return os.getenv(ModelConfig._env_var_for_provider(provider))

@dataclass
class DataSplitConfig:
    """Configuration for splitting a dataset into train/val/test sets.

    With the default ``small_dataset_strategy='adaptive'`` the ratios used by
    ``get_split_indices`` are chosen from the dataset size and the configured
    ``*_ratio`` fields are NOT used (for any dataset size):

    - Small datasets (< 15 samples): 70/25/5
    - Medium datasets (15-49 samples): 60/20/20
    - Large datasets (50+ samples): 70/15/15

    With any other strategy the configured ratios are used as-is.
    """

    # Split ratios (must sum to 1.0 within +/-0.01). Only honoured when the
    # strategy is not 'adaptive'.
    train_ratio: float = 0.6  # training / reflection examples
    val_ratio: float = 0.2    # validation / candidate selection
    test_ratio: float = 0.2   # held-out final evaluation (0 allowed)

    # Minimum number of samples per split.
    min_train_samples: int = 3
    min_val_samples: int = 3
    min_test_samples: int = 1  # only referenced in an error message

    # How to behave when the dataset is too small for a full three-way split:
    # 'adaptive', 'duplicate_val', 'no_test' or 'error'.
    small_dataset_strategy: str = 'adaptive'

    def __post_init__(self):
        """Validate ratios and strategy name.

        Raises:
            ValueError: If the ratios do not sum to ~1.0, if train/val ratio
                is not strictly positive, if the test ratio is negative, or
                if the strategy name is unknown.
        """
        total = self.train_ratio + self.val_ratio + self.test_ratio
        if not (0.99 <= total <= 1.01):  # Allow small floating point errors
            raise ValueError(
                f"Split ratios must sum to 1.0, got {total:.3f} "
                f"(train={self.train_ratio}, val={self.val_ratio}, test={self.test_ratio})"
            )

        if self.train_ratio <= 0 or self.val_ratio <= 0 or self.test_ratio < 0:
            raise ValueError("Split ratios must be positive (test_ratio can be 0 to disable)")

        if self.small_dataset_strategy not in {'adaptive', 'duplicate_val', 'no_test', 'error'}:
            raise ValueError(
                f"Invalid small_dataset_strategy: {self.small_dataset_strategy}. "
                f"Must be 'adaptive', 'duplicate_val', 'no_test', or 'error'"
            )

    def get_adaptive_ratios(self, dataset_size: int) -> Tuple[float, float, float]:
        """Return the size-dependent split ratios used by the adaptive strategy.

        Args:
            dataset_size: Total number of samples in the dataset.

        Returns:
            Tuple of (train_ratio, val_ratio, test_ratio):
            (0.70, 0.25, 0.05) below 15 samples, (0.60, 0.20, 0.20) for
            15-49 samples, and (0.70, 0.15, 0.15) for 50 or more.
        """
        if dataset_size < 15:
            return (0.70, 0.25, 0.05)
        if dataset_size < 50:
            return (0.60, 0.20, 0.20)
        return (0.70, 0.15, 0.15)

    def get_split_indices(self, dataset_size: int) -> Tuple[int, int, int, int]:
        """Calculate the end indices of the train/val/test slices.

        The returned indices always satisfy
        ``0 <= train_end <= val_end <= test_end == dataset_size``, so they can
        be used directly as slice bounds: ``data[:train_end]``,
        ``data[train_end:val_end]``, ``data[val_end:test_end]``.

        Args:
            dataset_size: Total number of samples in the dataset.

        Returns:
            Tuple of (train_end, val_end, test_end, dataset_size).

        Raises:
            ValueError: If ``dataset_size`` is negative, or if the strategy is
                'error' and the dataset cannot accommodate the split.
        """
        if dataset_size < 0:
            raise ValueError(f"dataset_size must be non-negative, got {dataset_size}")

        strategy = self.small_dataset_strategy
        if strategy == 'adaptive':
            train_ratio, val_ratio, _ = self.get_adaptive_ratios(dataset_size)
        else:
            train_ratio, val_ratio = self.train_ratio, self.val_ratio

        min_required = self.min_train_samples + self.min_val_samples
        if strategy == 'error' and dataset_size < min_required:
            raise ValueError(
                f"Dataset too small ({dataset_size} samples). "
                f"Need at least {self.min_train_samples + self.min_val_samples} samples."
            )

        # Honour the per-split minimums, but never point past the end of the
        # dataset: without the clamp a 2-sample dataset yields train_end == 3
        # and val_end == 2, i.e. non-monotonic indices.
        train_end = min(
            dataset_size,
            max(self.min_train_samples, int(dataset_size * train_ratio)),
        )
        val_end = train_end + max(self.min_val_samples, int(dataset_size * val_ratio))

        # NOTE(review): '>=' also triggers when test_ratio is 0 and train+val
        # land exactly on dataset_size; confirm that is intended for the
        # 'error' and 'duplicate_val' strategies.
        if val_end >= dataset_size:
            if strategy in {'adaptive', 'duplicate_val'}:
                # Validation gets at most min_val_samples (bounded by the
                # dataset); whatever remains after it is the test slice.
                val_end = min(dataset_size, train_end + self.min_val_samples)
            elif strategy == 'no_test':
                # Everything after train is validation; the test slice is empty.
                val_end = dataset_size
            else:  # 'error'
                raise ValueError(
                    f"Dataset too small ({dataset_size} samples) for train/val/test split. "
                    f"Need at least {self.min_train_samples + self.min_val_samples + self.min_test_samples} samples."
                )

        test_end = dataset_size
        return train_end, val_end, test_end, dataset_size

@dataclass
class OptimizationConfig:
    """Configuration for a GEPA optimization run.

    The two models and the three budget parameters have no defaults and must
    be supplied by the caller. During ``__post_init__`` string model
    specifications are converted to ``ModelConfig`` objects (API keys are read
    from the environment), ``reflection_minibatch_size`` is defaulted from
    ``reflection_examples``, values are range-checked, and ``log_level`` is
    normalized to upper case.
    """

    # Core models - required. A string is parsed as "provider/model-name"
    # (or a bare model name, which defaults to the "openai" provider).
    model: Union[str, ModelConfig]
    reflection_model: Union[str, ModelConfig]

    # Optimization budget - required, must be positive.
    max_iterations: int
    max_metric_calls: int
    batch_size: int

    # Dataset splitting configuration
    data_split: DataSplitConfig = field(default_factory=DataSplitConfig)

    # Reflection settings (separate from evaluation batch_size); must be 1-10.
    reflection_examples: int = 3

    # Optional optimization settings
    early_stopping: bool = True
    learning_rate: float = 0.01

    # Multi-objective optimization
    multi_objective: bool = False
    objectives: List[str] = field(default_factory=lambda: ["accuracy"])

    # Advanced settings
    custom_metrics: Optional[Dict[str, Any]] = None
    use_cache: bool = True
    parallel_evaluation: bool = False

    # Backwards compatibility (deprecated): use data_split instead.
    train_split_ratio: Optional[float] = None
    min_dataset_size: int = 2

    # Cost and budget - user controlled
    max_cost_usd: Optional[float] = None
    timeout_seconds: Optional[int] = None

    # GEPA-specific optimization parameters
    candidate_selection_strategy: str = 'pareto'
    skip_perfect_score: bool = False
    reflection_minibatch_size: Optional[int] = None  # falls back to reflection_examples
    perfect_score: float = 1.0
    module_selector: str = 'round_robin'
    verbose: bool = True

    # Evaluate the final prompt on the held-out test set
    evaluate_on_test: bool = True

    # LLEGO genetic operator parameters (optional).
    # Based on ICLR 2025 paper: "Decision Tree Induction Through LLMs via
    # Semantically-Aware Evolution".
    use_llego_operators: bool = False

    # Hybrid mode: combine GEPA reflection with LLEGO operators.
    # Requires use_llego_operators=True.
    enable_gepa_reflection_with_llego: bool = False
    num_gepa_reflection_candidates: int = 3  # must be 1-5

    # Fitness-guided crossover parameters
    alpha: float = 0.05  # fitness extrapolation factor
    n_crossover: int = 2  # offspring from crossover per iteration

    # Diversity-guided mutation parameters
    tau: float = 8.0  # diversity temperature
    nu: int = 3  # parent arity
    n_mutation: int = 2  # offspring from mutation per iteration

    # Population management (for genetic operators)
    population_size: int = 8

    # LLM-as-Judge configuration
    use_llm_as_judge: bool = True
    llm_as_judge_threshold: float = 0.8  # use the judge for scores below this
    llm_as_judge_model: Optional[ModelConfig] = None  # not parsed from strings here

    # Logging level: "DEBUG", "INFO", "WARNING", "ERROR" (case-insensitive on input)
    log_level: str = "INFO"

    def __post_init__(self):
        """Normalize and validate the configuration.

        Raises:
            ValueError: If a model cannot be resolved, a required parameter
                is ``None``, or a value is out of range.
        """
        # Deprecated two-way ratio: convert to a three-way split, dividing the
        # remainder evenly between validation and test. This replaces any
        # data_split passed by the caller. A value of exactly 0.8 is ignored
        # like None (presumably the legacy default - confirm).
        if self.train_split_ratio is not None and self.train_split_ratio != 0.8:
            import warnings
            warnings.warn(
                "train_split_ratio is deprecated. Use data_split=DataSplitConfig(...) instead. "
                "Converting to 3-way split with your ratio.",
                DeprecationWarning,
                stacklevel=2
            )
            remainder = 1.0 - self.train_split_ratio
            self.data_split = DataSplitConfig(
                train_ratio=self.train_split_ratio,
                val_ratio=remainder * 0.5,
                test_ratio=remainder * 0.5
            )

        # Convert string models to ModelConfig objects
        self.model = self._parse_model_config(self.model, "model")
        self.reflection_model = self._parse_model_config(self.reflection_model, "reflection_model")

        if self.reflection_minibatch_size is None:
            self.reflection_minibatch_size = self.reflection_examples

        self._validate_required_params()
        self._validate_ranges()

        # Validation above is case-insensitive; store the canonical
        # upper-case name so consumers never see e.g. "info".
        self.log_level = self.log_level.upper()

    def _parse_model_config(self, model: Union[str, ModelConfig], field_name: str) -> ModelConfig:
        """Return *model* as a ``ModelConfig``, parsing it if it is a string.

        Args:
            model: A ``ModelConfig`` (returned unchanged) or a string of the
                form ``"provider/model-name"``; a string without ``/`` uses
                the "openai" provider.
            field_name: Name of the field being parsed, used in error messages.

        Raises:
            ValueError: If no API key is found for the provider, or *model*
                is neither a string nor a ``ModelConfig``.
        """
        if isinstance(model, ModelConfig):
            return model

        if not isinstance(model, str):
            raise ValueError(f"{field_name} must be either a string or ModelConfig object")

        if "/" in model:
            provider, model_name = model.split("/", 1)
        else:
            provider, model_name = "openai", model

        api_key = self._get_api_key_for_provider(provider)
        if not api_key:
            raise ValueError(
                f"No API key found for {provider}. Please set environment variable "
                f"or provide ModelConfig with api_key for {field_name}"
            )

        return ModelConfig(
            provider=provider,
            model_name=model_name,
            api_key=api_key
        )

    def _get_api_key_for_provider(self, provider: str) -> Optional[str]:
        """Look up the provider's API key via ``ModelConfig``."""
        return ModelConfig._get_api_key_for_provider(provider)

    def _validate_required_params(self):
        """Ensure the budget parameters were not passed as ``None``.

        Raises:
            ValueError: If ``max_iterations``, ``max_metric_calls`` or
                ``batch_size`` is ``None``.
        """
        required_fields = {
            "max_iterations": self.max_iterations,
            "max_metric_calls": self.max_metric_calls,
            "batch_size": self.batch_size,
        }

        for field_name, value in required_fields.items():
            if value is None:
                raise ValueError(f"{field_name} is required and must be specified by user")

    def _validate_ranges(self):
        """Check numeric ranges, hybrid-mode consistency and the log level.

        Raises:
            ValueError: On the first parameter found out of range.
        """
        if self.max_iterations <= 0:
            raise ValueError("max_iterations must be positive")

        if self.max_metric_calls <= 0:
            raise ValueError("max_metric_calls must be positive")

        if self.batch_size <= 0:
            raise ValueError("batch_size must be positive")

        if self.reflection_examples <= 0 or self.reflection_examples > 10:
            raise ValueError("reflection_examples must be between 1 and 10 (recommended: 2-5)")

        if self.reflection_minibatch_size <= 0:
            raise ValueError("reflection_minibatch_size must be positive")

        if hasattr(self.model, 'max_tokens') and self.model.max_tokens <= 0:
            raise ValueError("model.max_tokens must be a positive integer")

        # Hybrid mode only makes sense on top of the LLEGO operators.
        if self.enable_gepa_reflection_with_llego and not self.use_llego_operators:
            raise ValueError("enable_gepa_reflection_with_llego requires use_llego_operators=True")

        if self.num_gepa_reflection_candidates <= 0 or self.num_gepa_reflection_candidates > 5:
            raise ValueError("num_gepa_reflection_candidates must be between 1 and 5 (recommended: 3 for balanced exploration)")

        # Compared case-insensitively.
        valid_log_levels = ["DEBUG", "INFO", "WARNING", "ERROR"]
        if self.log_level.upper() not in valid_log_levels:
            raise ValueError(f"log_level must be one of {valid_log_levels}, got: {self.log_level}")

    def validate_api_connectivity(self) -> Dict[str, bool]:
        """Report whether each model has the information needed to call its API.

        No network request is made: a model is reported ``True`` when its
        ``api_key``, ``provider`` and ``model_name`` are all truthy.

        Returns:
            Mapping with keys ``"model"`` and ``"reflection_model"``.
        """
        results = {}

        for label, model_config in (("model", self.model), ("reflection_model", self.reflection_model)):
            try:
                # Placeholder for a real connectivity probe.
                has_credentials = bool(
                    model_config.api_key and model_config.provider and model_config.model_name
                )
            except Exception:
                # Best-effort check: anything unexpected counts as "not ready".
                has_credentials = False
            results[label] = has_credentials

        return results

    def get_estimated_cost(self) -> Dict[str, Any]:
        """Return the budget figures that drive cost.

        No price is computed; ``estimated_cost_range`` is a placeholder string.
        """
        return {
            "max_calls": self.max_metric_calls,
            "estimated_cost_range": "To be calculated based on provider pricing",
            "cost_factors": {
                "model_calls": self.max_metric_calls,
                "reflection_calls": self.max_iterations,
                "batch_size": self.batch_size
            }
        }

    @classmethod
    def create_example_config(cls, provider: str = "openai") -> str:
        """Return example configuration code as text.

        Args:
            provider: One of "openai", "anthropic" or "mixed"; any other
                value falls back to the OpenAI example.
        """
        examples = {
            "openai": '''
# Example OpenAI Configuration
config = OptimizationConfig(
    model="openai/gpt-4-turbo",  # or ModelConfig(...)
    reflection_model="openai/gpt-4-turbo",
    max_iterations=50,  # Your choice based on budget
    max_metric_calls=300,  # Your choice based on budget
    batch_size=8,  # Your choice based on memory
    early_stopping=True,
    learning_rate=0.01
)
''',
            "anthropic": '''
# Example Anthropic Configuration
config = OptimizationConfig(
    model=ModelConfig(
        provider="anthropic",
        model_name="claude-3-opus-20240229",
        api_key="your-anthropic-key",
        temperature=0.7
    ),
    reflection_model="anthropic/claude-3-sonnet-20240229",
    max_iterations=30,
    max_metric_calls=200,
    batch_size=4
)
''',
            "mixed": '''
# Example Mixed Providers Configuration
config = OptimizationConfig(
    model="openai/gpt-4-turbo",  # Main model
    reflection_model="anthropic/claude-3-opus",  # Reflection model
    max_iterations=25,
    max_metric_calls=250,
    batch_size=6,
    max_cost_usd=100.0,  # Budget limit
    timeout_seconds=3600  # 1 hour limit
)
'''
        }

        return examples.get(provider, examples["openai"])