#!/usr/bin/env python3
"""
Configuration Models for Perturbation Testing

Provides Pydantic models for configuring:
- Jailbreak testing
- Counterfactual bias testing
- Execution settings
- Overall perturbation test configuration
"""

from typing import List, Optional, Literal, Dict, Any

from pydantic import BaseModel, Field


# NOTE: class docstrings below are kept short on purpose. Pydantic copies a
# model's docstring into the "description" of its generated JSON schema, so
# longer explanations live in `#` comments instead.


class ExecutionConfig(BaseModel):
    """Execution configuration for concurrent testing."""

    # Every numeric field is range-checked by Pydantic via ge/le; a value
    # outside the range raises a ValidationError at construction time.
    max_workers: int = Field(
        default=5,
        ge=1,
        le=20,
        description="Maximum concurrent workers",
    )
    max_retries: int = Field(
        default=3,
        ge=1,
        le=10,
        description="Maximum retry attempts",
    )
    # NOTE(review): base_delay (up to 10.0) may legally exceed max_delay
    # (down to 1.0); no cross-field check is enforced here. Confirm that the
    # retry code clamps to max_delay.
    base_delay: float = Field(
        default=1.0,
        ge=0.1,
        le=10.0,
        description="Base delay for exponential backoff (seconds)",
    )
    max_delay: float = Field(
        default=60.0,
        ge=1.0,
        le=300.0,
        description="Maximum delay between retries (seconds)",
    )
    rate_limit_per_minute: int = Field(
        default=60,
        ge=10,
        le=500,
        description="Maximum requests per minute",
    )


class JailbreakTestConfig(BaseModel):
    """Configuration for jailbreak testing."""

    enabled: bool = Field(
        default=True,
        description="Whether jailbreak testing is enabled",
    )
    num_techniques: int = Field(
        default=10,
        ge=1,
        le=50,
        description="Number of jailbreak techniques to test per relation",
    )
    # None means "no category filter".
    technique_categories: Optional[List[str]] = Field(
        default=None,
        description="Filter techniques by category: ['DAN', 'Omega', 'Developer Mode', etc.]",
    )
    random_seed: Optional[int] = Field(
        default=None,
        description="Random seed for reproducible technique selection",
    )
    prompt_source: str = Field(
        default="standard",
        description="Prompt source: 'standard' or name of custom uploaded prompts",
    )
    # The dict schema of each custom prompt is not defined in this module.
    custom_prompts: Optional[List[Dict[str, Any]]] = Field(
        default=None,
        description="Custom jailbreak prompts to use instead of dataset",
    )


class DemographicConfig(BaseModel):
    """Configuration for a demographic group."""

    # Both fields are required free-form strings; no value set is enforced.
    gender: str = Field(description="Gender: male, female, non-binary, etc.")
    race: str = Field(description="Race/ethnicity: White, Black, Asian, Hispanic, etc.")

    def __str__(self) -> str:
        # Human-readable label, e.g. "female Black".
        return f"{self.gender} {self.race}"


class CounterfactualBiasTestConfig(BaseModel):
    """Configuration for counterfactual bias testing."""

    enabled: bool = Field(
        default=True,
        description="Whether counterfactual bias testing is enabled",
    )
    # A list literal is used as the default (rather than default_factory) so
    # that the default stays visible in the generated JSON schema. Pydantic
    # copies non-hashable defaults per instance, so instances do not share
    # this list.
    demographics: List[DemographicConfig] = Field(
        default=[
            DemographicConfig(gender="male", race="White"),
            DemographicConfig(gender="female", race="White"),
            DemographicConfig(gender="male", race="Black"),
            DemographicConfig(gender="female", race="Black"),
        ],
        description="Demographics to test",
    )
    include_baseline: bool = Field(
        default=True,
        description="Include baseline (no demographic) for comparison",
    )
    comparison_mode: Literal["all_pairs", "vs_baseline", "both"] = Field(
        default="both",
        description="Comparison mode: all_pairs, vs_baseline, or both",
    )
    # Values are not validated here; presumably they are keys of
    # EXTENDED_DEMOGRAPHICS -- verify against the consumer.
    extended_dimensions: Optional[List[str]] = Field(
        default=None,
        description="Additional dimensions: ['age', 'disability', 'socioeconomic']",
    )


class PerturbationTestConfig(BaseModel):
    """Overall perturbation test configuration."""

    # General settings
    model: str = Field(
        default="gpt-4o-mini",
        description="LLM model for testing",
    )
    judge_model: str = Field(
        default="gpt-4o-mini",
        description="LLM model for evaluation/judging",
    )
    max_relations: Optional[int] = Field(
        default=None,
        description="Maximum relations to test (None = all)",
    )

    # Execution configuration
    execution: ExecutionConfig = Field(
        default_factory=ExecutionConfig,
        description="Concurrent execution settings",
    )

    # Test-specific configurations
    jailbreak: JailbreakTestConfig = Field(
        default_factory=JailbreakTestConfig,
        description="Jailbreak testing configuration",
    )
    counterfactual_bias: CounterfactualBiasTestConfig = Field(
        default_factory=CounterfactualBiasTestConfig,
        description="Counterfactual bias testing configuration",
    )


# Preset configurations
#
# These instances are built once at import time. Callers should obtain them
# through get_preset_config(), which hands out deep copies so that the shared
# presets are never mutated.
PRESET_CONFIGS: Dict[str, PerturbationTestConfig] = {
    "quick": PerturbationTestConfig(
        max_relations=3,
        execution=ExecutionConfig(max_workers=3),
        jailbreak=JailbreakTestConfig(num_techniques=3),
        counterfactual_bias=CounterfactualBiasTestConfig(
            demographics=[
                DemographicConfig(gender="male", race="White"),
                DemographicConfig(gender="female", race="Black"),
            ],
            comparison_mode="vs_baseline",
        ),
    ),
    "standard": PerturbationTestConfig(
        max_relations=10,
        execution=ExecutionConfig(max_workers=5),
        jailbreak=JailbreakTestConfig(num_techniques=10),
        counterfactual_bias=CounterfactualBiasTestConfig(
            comparison_mode="both",
        ),
    ),
    "comprehensive": PerturbationTestConfig(
        max_relations=None,  # None = test all relations
        execution=ExecutionConfig(max_workers=10, max_retries=5),
        jailbreak=JailbreakTestConfig(num_techniques=20),
        counterfactual_bias=CounterfactualBiasTestConfig(
            demographics=[
                DemographicConfig(gender="male", race="White"),
                DemographicConfig(gender="female", race="White"),
                DemographicConfig(gender="male", race="Black"),
                DemographicConfig(gender="female", race="Black"),
                DemographicConfig(gender="male", race="Asian"),
                DemographicConfig(gender="female", race="Asian"),
                DemographicConfig(gender="male", race="Hispanic"),
                DemographicConfig(gender="female", race="Hispanic"),
                DemographicConfig(gender="non-binary", race="White"),
            ],
            extended_dimensions=["age"],
            comparison_mode="both",
        ),
    ),
}


# Extended demographics for optional use
#
# Maps a dimension name to the descriptive phrases available for it.
EXTENDED_DEMOGRAPHICS: Dict[str, List[str]] = {
    "age": [
        "young (20s)",
        "middle-aged (40s)",
        "elderly (70s)",
    ],
    "disability": [
        "with no disability",
        "with a physical disability",
        "with a visual impairment",
    ],
    "socioeconomic": [
        "from a wealthy background",
        "from a middle-class background",
        "from a low-income background",
    ],
}


def get_preset_config(preset_name: str) -> PerturbationTestConfig:
    """
    Get a preset configuration by name.

    The returned object is a deep copy, so callers may modify it freely
    without affecting the shared entry in PRESET_CONFIGS.

    Args:
        preset_name: One of 'quick', 'standard', 'comprehensive'

    Returns:
        PerturbationTestConfig for the preset

    Raises:
        ValueError: If preset name is invalid
    """
    if preset_name not in PRESET_CONFIGS:
        raise ValueError(
            f"Unknown preset: {preset_name}. "
            f"Available: {list(PRESET_CONFIGS)}"
        )
    return PRESET_CONFIGS[preset_name].model_copy(deep=True)


def create_config_from_dict(config_dict: Dict[str, Any]) -> PerturbationTestConfig:
    """
    Create a configuration from a dictionary.

    Keys are passed as keyword arguments to PerturbationTestConfig, so nested
    sections ('execution', 'jailbreak', 'counterfactual_bias') may be given
    as plain dictionaries and are validated by Pydantic.

    Args:
        config_dict: Configuration dictionary

    Returns:
        PerturbationTestConfig instance

    Raises:
        pydantic.ValidationError: If a value fails field validation
    """
    return PerturbationTestConfig(**config_dict)