{ "optimizer_name": "AdamW", "optimizer_class": "torch.optim.AdamW", "hyperparameters": { "learning_rate": 3.0e-4, "betas": [0.9, 0.95], "eps": 1.0e-8, "weight_decay": 0.1, "amsgrad": false, "maximize": false, "foreach": null, "capturable": false, "differentiable": false, "fused": true }, "learning_rate_schedule": { "scheduler_type": "cosine_with_warmup", "warmup_steps": 2000, "total_steps": 875000, "min_lr": 3.0e-5, "max_lr": 3.0e-4, "warmup_init_lr": 0.0, "cycle_mult": 1.0, "last_epoch": -1 }, "gradient_configuration": { "max_grad_norm": 1.0, "gradient_accumulation_steps": 8, "gradient_checkpointing": true, "gradient_checkpointing_policy": "full_deterministic" }, "mixed_precision": { "enabled": true, "dtype": "bfloat16", "loss_scale": "dynamic", "initial_scale_power": 16, "scale_growth_factor": 2.0, "backoff_factor": 0.5, "scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1.0 }, "optimization_stages": [ { "stage": "pretraining", "steps": 750000, "learning_rate": 3.0e-4, "weight_decay": 0.1, "scheduler": "cosine", "warmup_steps": 2000, "description": "Initial pretraining phase on diverse corpus" }, { "stage": "domain_adaptation", "steps": 80000, "learning_rate": 1.0e-4, "weight_decay": 0.1, "scheduler": "constant", "warmup_steps": 0, "description": "Continued pretraining on domain-specific data" }, { "stage": "instruction_tuning", "steps": 45000, "learning_rate": 5.0e-5, "weight_decay": 0.01, "scheduler": "linear_decay", "warmup_steps": 500, "description": "Fine-tuning for instruction following" } ], "parameter_groups": [ { "name": "embeddings", "modules": ["model.embed_tokens", "lm_head"], "learning_rate_multiplier": 1.0, "weight_decay": 0.1 }, { "name": "attention", "modules": [ "self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj" ], "learning_rate_multiplier": 1.0, "weight_decay": 0.1 }, { "name": "mlp", "modules": [ "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj" ], "learning_rate_multiplier": 1.0, "weight_decay": 0.1 }, { "name": "layer_norms", "modules": [ "input_layernorm", "post_attention_layernorm", "model.norm" ], "learning_rate_multiplier": 1.0, "weight_decay": 0.0 } ], "advanced_techniques": { "layer_wise_lr_decay": { "enabled": false, "decay_rate": 0.95, "description": "Apply learning rate decay by layer depth" }, "warmup_schedule": { "type": "linear", "steps": 2000, "start_lr": 0.0, "target_lr": 3.0e-4 }, "gradient_clipping": { "method": "norm", "max_norm": 1.0, "norm_type": 2.0 }, "optimizer_state_sharding": { "enabled": true, "strategy": "zero_stage_2", "offload_optimizer": false, "offload_params": false } }, "memory_optimization": { "cpu_offload": { "enabled": false, "offload_optimizer_states": false, "offload_params": false, "pin_memory": true }, "activation_checkpointing": { "enabled": true, "checkpoint_every_n_layers": 1, "use_reentrant": false }, "zero_optimization": { "stage": 2, "offload_optimizer": false, "offload_param": false, "overlap_comm": true, "contiguous_gradients": true, "reduce_bucket_size": 5.0e8, "allgather_bucket_size": 5.0e8, "sub_group_size": 1.0e9, "round_robin_gradients": false } }, "monitoring": { "log_optimizer_states": true, "log_learning_rate": true, "log_gradient_norm": true, "log_parameter_norm": true, "log_interval": 100, "tracked_metrics": [ "lr", "grad_norm", "param_norm", "loss_scale", "overflow_count", "step_time", "samples_per_second", "tokens_per_second" ] }, "convergence_criteria": { "max_steps": 875000, "early_stopping": { "enabled": false, "patience": 10000, "min_delta": 0.001, "monitor": "eval_loss" }, "plateau_detection": { "enabled": true, "patience": 5000, "threshold": 0.001, "cooldown": 1000 } }, "stability_features": { "loss_spike_detection": { "enabled": true, "threshold": 2.0, "window_size": 100, "action": "skip_update" }, "gradient_overflow_detection": { "enabled": true, "max_overflow_count": 10, "action": "reduce_loss_scale" }, "nan_inf_detection": { "enabled": true, "check_frequency": 100, "action": "rollback_checkpoint" } }, "distributed_optimization": { "backend": "nccl", "gradient_as_bucket_view": true, "static_graph": false, "ddp_bucket_cap_mb": 25, "find_unused_parameters": false, "broadcast_buffers": true, "communication_optimization": { "fp16_reduce_scatter": false, "bf16_reduce_scatter": true, "bucket_size_multiplier": 1.0, "overlap_grad_reduce": true, "use_multi_stream": true } }, "checkpointing": { "save_optimizer_states": true, "save_scheduler_states": true, "save_rng_states": true, "checkpoint_format": "pytorch", "async_save": true, "save_interval_steps": 5000, "keep_last_n_checkpoints": 10 } }