wu981526092's picture
Add comprehensive perturbation testing system with E2E tests
795b72e
#!/usr/bin/env python3
"""
Configuration Models for Perturbation Testing
Provides Pydantic models for configuring:
- Jailbreak testing
- Counterfactual bias testing
- Execution settings
- Overall perturbation test configuration
"""
from typing import List, Optional, Literal, Dict, Any
from pydantic import BaseModel, Field
class ExecutionConfig(BaseModel):
    """Limits that govern how tests are run concurrently and retried.

    Each field carries inclusive ``ge``/``le`` bounds, so out-of-range
    values are rejected when the model is validated.

    Attributes:
        max_workers: Cap on concurrent workers; 1 to 20, default 5.
        max_retries: Cap on retry attempts; 1 to 10, default 3.
        base_delay: Base delay in seconds for exponential backoff;
            0.1 to 10.0, default 1.0.
        max_delay: Largest delay in seconds between retries;
            1.0 to 300.0, default 60.0.
        rate_limit_per_minute: Cap on requests per minute;
            10 to 500, default 60.
    """

    max_workers: int = Field(
        description="Maximum concurrent workers", default=5, ge=1, le=20
    )
    max_retries: int = Field(
        description="Maximum retry attempts", default=3, ge=1, le=10
    )

    # NOTE(review): the two delay fields are bounded independently; nothing
    # in this class checks that base_delay <= max_delay.
    base_delay: float = Field(
        description="Base delay for exponential backoff (seconds)",
        default=1.0,
        ge=0.1,
        le=10.0,
    )
    max_delay: float = Field(
        description="Maximum delay between retries (seconds)",
        default=60.0,
        ge=1.0,
        le=300.0,
    )

    rate_limit_per_minute: int = Field(
        description="Maximum requests per minute", default=60, ge=10, le=500
    )
class JailbreakTestConfig(BaseModel):
    """Settings for the jailbreak portion of a perturbation test.

    Attributes:
        enabled: Turns jailbreak testing on or off; default True.
        num_techniques: How many jailbreak techniques to try per relation;
            1 to 50, default 10.
        technique_categories: Optional list of category names used to
            filter techniques; None (the default) applies no filter.
        random_seed: Optional seed for reproducible technique selection;
            default None.
        prompt_source: ``"standard"`` (the default) or the name of a set
            of custom uploaded prompts.
        custom_prompts: Optional list of prompt dictionaries to use
            instead of the dataset; default None.
    """

    enabled: bool = Field(
        description="Whether jailbreak testing is enabled", default=True
    )
    num_techniques: int = Field(
        description="Number of jailbreak techniques to test per relation",
        default=10,
        ge=1,
        le=50,
    )
    technique_categories: Optional[List[str]] = Field(
        description="Filter techniques by category: ['DAN', 'Omega', 'Developer Mode', etc.]",
        default=None,
    )
    random_seed: Optional[int] = Field(
        description="Random seed for reproducible technique selection",
        default=None,
    )
    prompt_source: str = Field(
        description="Prompt source: 'standard' or name of custom uploaded prompts",
        default="standard",
    )
    # The element schema of each prompt dict is not defined in this class.
    custom_prompts: Optional[List[Dict[str, Any]]] = Field(
        description="Custom jailbreak prompts to use instead of dataset",
        default=None,
    )
class DemographicConfig(BaseModel):
    """A single demographic group identified by gender and race/ethnicity.

    Both fields are required free-form strings; no fixed vocabulary is
    enforced here.
    """

    gender: str = Field(description="Gender: male, female, non-binary, etc.")
    race: str = Field(
        description="Race/ethnicity: White, Black, Asian, Hispanic, etc."
    )

    def __str__(self) -> str:
        """Return the group as ``"<gender> <race>"``."""
        return "{} {}".format(self.gender, self.race)
class CounterfactualBiasTestConfig(BaseModel):
    """Settings for the counterfactual bias portion of a perturbation test.

    Attributes:
        enabled: Turns counterfactual bias testing on or off; default True.
        demographics: Groups to test. Defaults to four groups, in this
            order: male White, female White, male Black, female Black.
        include_baseline: Whether a no-demographic baseline is included
            for comparison; default True.
        comparison_mode: One of ``"all_pairs"``, ``"vs_baseline"`` or
            ``"both"`` (the default).
        extended_dimensions: Optional list of extra dimension names such
            as ``"age"``, ``"disability"`` or ``"socioeconomic"``;
            default None.
    """

    enabled: bool = Field(
        description="Whether counterfactual bias testing is enabled",
        default=True,
    )
    demographics: List[DemographicConfig] = Field(
        description="Demographics to test",
        # Race-major iteration yields: male/female White, then male/female Black.
        default=[
            DemographicConfig(gender=gender, race=race)
            for race in ("White", "Black")
            for gender in ("male", "female")
        ],
    )
    include_baseline: bool = Field(
        description="Include baseline (no demographic) for comparison",
        default=True,
    )
    comparison_mode: Literal["all_pairs", "vs_baseline", "both"] = Field(
        description="Comparison mode: all_pairs, vs_baseline, or both",
        default="both",
    )
    extended_dimensions: Optional[List[str]] = Field(
        description="Additional dimensions: ['age', 'disability', 'socioeconomic']",
        default=None,
    )
class PerturbationTestConfig(BaseModel):
    """Top-level configuration for a perturbation test run.

    Combines general model settings with the nested execution, jailbreak
    and counterfactual-bias configurations. Every field has a default, so
    ``PerturbationTestConfig()`` is a complete configuration.

    Attributes:
        model: Name of the LLM under test; default ``"gpt-4o-mini"``.
        judge_model: Name of the LLM used for evaluation/judging;
            default ``"gpt-4o-mini"``.
        max_relations: Upper limit on relations to test. ``None`` (the
            default) means all relations; negative values are rejected.
        execution: Concurrent execution settings; a fresh
            ``ExecutionConfig`` is built when omitted.
        jailbreak: Jailbreak testing settings; a fresh
            ``JailbreakTestConfig`` is built when omitted.
        counterfactual_bias: Counterfactual bias testing settings; a
            fresh ``CounterfactualBiasTestConfig`` is built when omitted.
    """

    # General settings
    model: str = Field(
        default="gpt-4o-mini",
        description="LLM model for testing",
    )
    judge_model: str = Field(
        default="gpt-4o-mini",
        description="LLM model for evaluation/judging",
    )
    max_relations: Optional[int] = Field(
        default=None,
        # A negative limit has no meaning as a count. Like the other numeric
        # fields in this module, reject it at validation time instead of
        # passing it downstream. None still means "no limit".
        ge=0,
        description="Maximum relations to test (None = all)",
    )

    # Execution configuration
    execution: ExecutionConfig = Field(
        default_factory=ExecutionConfig,
        description="Concurrent execution settings",
    )

    # Test-specific configurations
    jailbreak: JailbreakTestConfig = Field(
        default_factory=JailbreakTestConfig,
        description="Jailbreak testing configuration",
    )
    counterfactual_bias: CounterfactualBiasTestConfig = Field(
        default_factory=CounterfactualBiasTestConfig,
        description="Counterfactual bias testing configuration",
    )
# Preset configurations
#
# Three ready-made PerturbationTestConfig instances keyed by name:
#   "quick"         - 3 relations, 3 workers, 3 techniques, two demographic
#                     groups compared against the baseline only.
#   "standard"      - 10 relations, 5 workers, 10 techniques, default
#                     demographics, both comparison modes.
#   "comprehensive" - no relation limit, 10 workers with 5 retries,
#                     20 techniques, nine demographic groups plus the "age"
#                     extended dimension, both comparison modes.
PRESET_CONFIGS = dict(
    quick=PerturbationTestConfig(
        max_relations=3,
        execution=ExecutionConfig(max_workers=3),
        jailbreak=JailbreakTestConfig(num_techniques=3),
        counterfactual_bias=CounterfactualBiasTestConfig(
            demographics=[
                DemographicConfig(gender="male", race="White"),
                DemographicConfig(gender="female", race="Black"),
            ],
            comparison_mode="vs_baseline",
        ),
    ),
    standard=PerturbationTestConfig(
        max_relations=10,
        execution=ExecutionConfig(max_workers=5),
        jailbreak=JailbreakTestConfig(num_techniques=10),
        counterfactual_bias=CounterfactualBiasTestConfig(
            comparison_mode="both",
        ),
    ),
    comprehensive=PerturbationTestConfig(
        max_relations=None,
        execution=ExecutionConfig(max_workers=10, max_retries=5),
        jailbreak=JailbreakTestConfig(num_techniques=20),
        counterfactual_bias=CounterfactualBiasTestConfig(
            demographics=[
                # male/female for each race, in race-major order ...
                *(
                    DemographicConfig(gender=gender, race=race)
                    for race in ("White", "Black", "Asian", "Hispanic")
                    for gender in ("male", "female")
                ),
                # ... followed by the single non-binary entry.
                DemographicConfig(gender="non-binary", race="White"),
            ],
            extended_dimensions=["age"],
            comparison_mode="both",
        ),
    ),
)
# Extended demographics for optional use
#
# Maps a dimension name to the list of descriptive phrases available for
# that dimension. The keys are the same three names listed in the
# description of CounterfactualBiasTestConfig.extended_dimensions.
EXTENDED_DEMOGRAPHICS = dict(
    age=["young (20s)", "middle-aged (40s)", "elderly (70s)"],
    disability=[
        "with no disability",
        "with a physical disability",
        "with a visual impairment",
    ],
    socioeconomic=[
        "from a wealthy background",
        "from a middle-class background",
        "from a low-income background",
    ],
)
def get_preset_config(preset_name: str) -> PerturbationTestConfig:
    """
    Look up a built-in preset and return an independent copy of it.

    Args:
        preset_name: Key into PRESET_CONFIGS, i.e. one of 'quick',
            'standard', 'comprehensive'

    Returns:
        A deep copy of the stored PerturbationTestConfig, so changes made
        by the caller do not affect the shared preset

    Raises:
        ValueError: If preset_name is not a key of PRESET_CONFIGS
    """
    preset = PRESET_CONFIGS.get(preset_name)
    if preset is None:
        available = list(PRESET_CONFIGS.keys())
        raise ValueError(f"Unknown preset: {preset_name}. Available: {available}")

    # Deep copy keeps the module-level preset immutable from the caller's side.
    return preset.model_copy(deep=True)
def create_config_from_dict(config_dict: Dict[str, Any]) -> PerturbationTestConfig:
    """
    Build a PerturbationTestConfig from a plain dictionary.

    The dictionary's items are passed to PerturbationTestConfig as keyword
    arguments, so its keys must be strings. Fields missing from the
    dictionary take the model's declared defaults.

    Args:
        config_dict: Mapping of field names to values

    Returns:
        The constructed PerturbationTestConfig instance
    """
    config = PerturbationTestConfig(**config_dict)
    return config