""" TestTime RLVR Configuration AZR 기반 TestTime RLVR을 위한 설정 클래스 """ from dataclasses import dataclass from typing import Optional, List, Dict, Any import torch @dataclass class TestTimeConfig: """TestTime RLVR 전용 설정""" # ============================================================================ # 기본 모델 설정 (AZR 기반) # ============================================================================ model_name: str = "Qwen/Qwen2.5-7B" device: str = "auto" torch_dtype: torch.dtype = torch.bfloat16 use_flash_attention: bool = True enable_gradient_checkpointing: bool = True # ============================================================================ # TestTime 학습 설정 # ============================================================================ max_adaptation_steps: int = 10 # AZR 대비 짧은 적응 학습 adaptation_batch_size: int = 1 # 소규모 배치 gradient_accumulation_steps: int = 4 learning_rate: float = 1e-6 # AZR과 동일 # ============================================================================ # 반복 제어 설정 # ============================================================================ max_cycles: int = 3 # 최대 반복 횟수 min_improvement_threshold: float = 0.05 # 최소 개선 임계값 early_stopping_patience: int = 2 # Early stopping # ============================================================================ # IPO 추출 설정 # ============================================================================ max_ipo_triples: int = 10 # 추출할 최대 트리플 수 python_executor_timeout: int = 5 # AZR보다 짧은 타임아웃 validate_triples: bool = True # 트리플 검증 여부 # ============================================================================ # 다중 프로그램 생성 설정 # ============================================================================ num_program_variations: int = 4 # 생성할 다양한 프로그램 수 baseline_evaluation_rounds: int = 5 # 베이스라인 성능 측정 횟수 diverse_generation_temperature: float = 0.7 # 다양한 프로그램 생성용 temperature baseline_generation_temperature: float = 0.05 # 베이스라인 측정용 temperature # ============================================================================ # 태스크 생성 설정 # ============================================================================ task_distribution: Dict[str, float] = None # induction:deduction:abduction 비율 max_tasks_per_type: int = 5 # 타입별 최대 태스크 수 use_azr_templates: bool = True # AZR 템플릿 사용 skip_task_evaluation: bool = True # Task evaluation(4단계) 스킵 여부 (VeRL에서 수행) # ============================================================================ # 보상 설정 (AZR 기반) # ============================================================================ use_accuracy_reward: bool = True use_improvement_reward: bool = True # TestTime 전용 개선도 보상 use_complexity_reward: bool = True accuracy_weight: float = 1.0 improvement_weight: float = 0.5 # 개선도 가중치 complexity_weight: float = 0.1 # ============================================================================ # 로깅 설정 # ============================================================================ log_level: str = "INFO" save_intermediate_results: bool = True log_ipo_details: bool = True log_task_details: bool = True log_training_metrics: bool = True # ============================================================================ # 메모리 최적화 설정 (AZR 기반) # ============================================================================ gpu_memory_utilization: float = 0.4 max_workers: int = 2 # Python executor workers use_memory_efficient_attention: bool = True def __post_init__(self): """설정 후처리""" if self.task_distribution is None: # 기본 태스크 분포: 균등 분배 self.task_distribution = { "induction": 0.33, "deduction": 0.33, "abduction": 0.34 } # device 자동 설정 if self.device == "auto": self.device = "cuda" if torch.cuda.is_available() else "cpu" # dtype 설정 if self.device == "cpu": self.torch_dtype = torch.float32 def to_dict(self) -> Dict[str, Any]: """설정을 딕셔너리로 변환""" return { "model_name": self.model_name, "device": self.device, "torch_dtype": str(self.torch_dtype), "max_adaptation_steps": self.max_adaptation_steps, "max_cycles": self.max_cycles, "learning_rate": self.learning_rate, "task_distribution": self.task_distribution, "reward_weights": { "accuracy": self.accuracy_weight, "improvement": self.improvement_weight, "complexity": self.complexity_weight } } @classmethod def from_dict(cls, config_dict: Dict[str, Any]) -> 'TestTimeConfig': """딕셔너리에서 설정 로드""" return cls(**config_dict) @dataclass class BenchmarkConfig: """벤치마크별 설정""" name: str # "humaneval", "mbpp", "livecodebase" data_path: str problem_prefix: str # "HumanEval", "Mbpp" start_index: int = 0 # MBPP는 2부터 시작 max_problems: int = 5 # 테스트할 문제 수 # 벤치마크별 특화 설정 test_timeout: int = 10 use_plus_version: bool = True # HumanEval+, MBPP+ 사용 @classmethod def get_humaneval_config(cls) -> 'BenchmarkConfig': return cls( name="humaneval", data_path="evaluation/code_eval/data/HumanEvalPlus.jsonl", problem_prefix="HumanEval", start_index=0, max_problems=5 ) @classmethod def get_mbpp_config(cls) -> 'BenchmarkConfig': return cls( name="mbpp", data_path="evaluation/code_eval/data/MbppPlus.jsonl", problem_prefix="Mbpp", start_index=2, # MBPP는 2번부터 max_problems=5 )