""" Experiment Configuration Defines configuration for controlled experiments with statistical rigor. """ from __future__ import annotations from dataclasses import dataclass, field from enum import Enum from typing import List, Dict, Any, Optional from pathlib import Path import json class PlannerMode(str, Enum): """Planning mode for generation.""" DIRECT = "direct" # Raw prompt → generators SINGLE_PLANNER = "single_planner" # 1 LLM call → plan → generators COUNCIL = "council" # 3 LLM calls → merge → generators EXTENDED_PROMPT = "extended_prompt" # 1 LLM call with 3× token budget class PerturbationType(str, Enum): """Perturbation conditions for sensitivity testing.""" BASELINE = "baseline" # Normal generation WRONG_IMAGE = "wrong_image" # Image from different prompt WRONG_AUDIO = "wrong_audio" # Audio from different prompt SEMANTIC_SHIFT_25 = "semantic_shift_25" # 25% semantic mismatch SEMANTIC_SHIFT_50 = "semantic_shift_50" # 50% semantic mismatch SEMANTIC_SHIFT_75 = "semantic_shift_75" # 75% semantic mismatch RANDOM_IMAGE = "random_image" # Completely random image RANDOM_AUDIO = "random_audio" # Completely random audio @dataclass class ExperimentConfig: """ Configuration for controlled experiments. Supports: - Multiple planning modes (RQ2: planning effect) - Multiple perturbation conditions (RQ1: MSCI sensitivity) - Statistical parameters for hypothesis testing """ # Experiment identity name: str = "default_experiment" description: str = "" # Sample sizes n_prompts: int = 50 # Minimum for statistical power n_seeds: int = 3 # Replications per prompt # Experimental conditions modes: List[PlannerMode] = field( default_factory=lambda: [PlannerMode.DIRECT, PlannerMode.SINGLE_PLANNER] ) perturbations: List[PerturbationType] = field( default_factory=lambda: [ PerturbationType.BASELINE, PerturbationType.WRONG_IMAGE, PerturbationType.WRONG_AUDIO, ] ) # Statistical parameters alpha: float = 0.05 # Significance level power: float = 0.80 # Target statistical power min_effect_size: float = 0.5 # Minimum Cohen's d to detect # Execution parameters deterministic: bool = True base_seed: int = 42 use_ollama: bool = True output_dir: str = "runs/experiments" # Resource tracking track_tokens: bool = True track_time: bool = True @property def total_runs(self) -> int: """Total number of experimental runs.""" return self.n_prompts * self.n_seeds * len(self.modes) * len(self.perturbations) @property def conditions(self) -> List[str]: """List of all condition keys (mode_perturbation).""" return [ f"{mode.value}_{pert.value}" for mode in self.modes for pert in self.perturbations ] def required_sample_size(self, effect_size: Optional[float] = None) -> int: """ Compute required sample size for given effect size using power analysis. For paired t-test with alpha=0.05, power=0.80: - d=0.5 (medium): N≈34 - d=0.8 (large): N≈15 - d=0.3 (small): N≈90 Uses approximation: N ≈ 2 * ((z_alpha + z_beta) / d)^2 """ from scipy import stats d = effect_size or self.min_effect_size z_alpha = stats.norm.ppf(1 - self.alpha / 2) z_beta = stats.norm.ppf(self.power) n = 2 * ((z_alpha + z_beta) / d) ** 2 return int(n) + 1 # Round up def validate(self) -> List[str]: """Validate configuration and return any warnings.""" warnings = [] required_n = self.required_sample_size() if self.n_prompts < required_n: warnings.append( f"Sample size ({self.n_prompts}) may be underpowered. " f"Recommended: {required_n} for effect size d={self.min_effect_size}" ) if self.n_seeds < 2: warnings.append( "n_seeds < 2: No replication variance can be estimated" ) if self.alpha > 0.10: warnings.append( f"High alpha ({self.alpha}): Increased false positive risk" ) return warnings def to_dict(self) -> Dict[str, Any]: """Convert to dictionary for serialization.""" return { "name": self.name, "description": self.description, "n_prompts": self.n_prompts, "n_seeds": self.n_seeds, "modes": [m.value for m in self.modes], "perturbations": [p.value for p in self.perturbations], "alpha": self.alpha, "power": self.power, "min_effect_size": self.min_effect_size, "deterministic": self.deterministic, "base_seed": self.base_seed, "use_ollama": self.use_ollama, "output_dir": self.output_dir, "track_tokens": self.track_tokens, "track_time": self.track_time, "total_runs": self.total_runs, "conditions": self.conditions, } @classmethod def from_dict(cls, data: Dict[str, Any]) -> "ExperimentConfig": """Create from dictionary.""" # Convert string enums back if "modes" in data: data["modes"] = [PlannerMode(m) for m in data["modes"]] if "perturbations" in data: data["perturbations"] = [PerturbationType(p) for p in data["perturbations"]] # Remove computed fields data.pop("total_runs", None) data.pop("conditions", None) return cls(**data) def save(self, path: Path): """Save configuration to JSON file.""" path = Path(path) path.parent.mkdir(parents=True, exist_ok=True) with path.open("w", encoding="utf-8") as f: json.dump(self.to_dict(), f, indent=2) @classmethod def load(cls, path: Path) -> "ExperimentConfig": """Load configuration from JSON file.""" with Path(path).open("r", encoding="utf-8") as f: return cls.from_dict(json.load(f)) # Preset configurations for common experiment types PRESETS = { "rq1_sensitivity": ExperimentConfig( name="RQ1: MSCI Sensitivity", description="Test if MSCI is sensitive to controlled semantic perturbations", n_prompts=50, n_seeds=3, modes=[PlannerMode.SINGLE_PLANNER], perturbations=[ PerturbationType.BASELINE, PerturbationType.WRONG_IMAGE, PerturbationType.WRONG_AUDIO, PerturbationType.SEMANTIC_SHIFT_25, PerturbationType.SEMANTIC_SHIFT_50, PerturbationType.SEMANTIC_SHIFT_75, ], ), "rq2_planning": ExperimentConfig( name="RQ2: Planning Effect", description="Test if structured planning improves cross-modal alignment", n_prompts=50, n_seeds=3, modes=[ PlannerMode.DIRECT, PlannerMode.SINGLE_PLANNER, PlannerMode.COUNCIL, PlannerMode.EXTENDED_PROMPT, ], perturbations=[PerturbationType.BASELINE], ), "full_ablation": ExperimentConfig( name="Full Ablation Study", description="Complete ablation across all modes and perturbations", n_prompts=50, n_seeds=3, modes=[ PlannerMode.DIRECT, PlannerMode.SINGLE_PLANNER, PlannerMode.COUNCIL, PlannerMode.EXTENDED_PROMPT, ], perturbations=[ PerturbationType.BASELINE, PerturbationType.WRONG_IMAGE, PerturbationType.WRONG_AUDIO, ], ), "quick_test": ExperimentConfig( name="Quick Test", description="Small-scale test run", n_prompts=5, n_seeds=1, modes=[PlannerMode.DIRECT, PlannerMode.SINGLE_PLANNER], perturbations=[PerturbationType.BASELINE, PerturbationType.WRONG_IMAGE], ), }