Create optimizer_config.json

Browse files

Files changed (1) hide show

optimizer_config.json +256 -0

optimizer_config.json ADDED Viewed

	@@ -0,0 +1,256 @@

+{
+  "optimizer_name": "AdamW",
+  "optimizer_class": "torch.optim.AdamW",
+  "hyperparameters": {
+    "learning_rate": 3.0e-4,
+    "betas": [0.9, 0.95],
+    "eps": 1.0e-8,
+    "weight_decay": 0.1,
+    "amsgrad": false,
+    "maximize": false,
+    "foreach": null,
+    "capturable": false,
+    "differentiable": false,
+    "fused": true
+  },
+  "learning_rate_schedule": {
+    "scheduler_type": "cosine_with_warmup",
+    "warmup_steps": 2000,
+    "total_steps": 875000,
+    "min_lr": 3.0e-5,
+    "max_lr": 3.0e-4,
+    "warmup_init_lr": 0.0,
+    "cycle_mult": 1.0,
+    "last_epoch": -1
+  },
+  "gradient_configuration": {
+    "max_grad_norm": 1.0,
+    "gradient_accumulation_steps": 8,
+    "gradient_checkpointing": true,
+    "gradient_checkpointing_policy": "full_deterministic"
+  },
+  "mixed_precision": {
+    "enabled": true,
+    "dtype": "bfloat16",
+    "loss_scale": "dynamic",
+    "initial_scale_power": 16,
+    "scale_growth_factor": 2.0,
+    "backoff_factor": 0.5,
+    "scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1.0
+  },
+  "optimization_stages": [
+    {
+      "stage": "pretraining",
+      "steps": 750000,
+      "learning_rate": 3.0e-4,
+      "weight_decay": 0.1,
+      "scheduler": "cosine",
+      "warmup_steps": 2000,
+      "description": "Initial pretraining phase on a diverse corpus"
+    },
+    {
+      "stage": "domain_adaptation",
+      "steps": 80000,
+      "learning_rate": 1.0e-4,
+      "weight_decay": 0.1,
+      "scheduler": "constant",
+      "warmup_steps": 0,
+      "description": "Continued pretraining on domain-specific data"
+    },
+    {
+      "stage": "instruction_tuning",
+      "steps": 45000,
+      "learning_rate": 5.0e-5,
+      "weight_decay": 0.01,
+      "scheduler": "linear_decay",
+      "warmup_steps": 500,
+      "description": "Fine-tuning for instruction following"
+    }
+  ],
+  "parameter_groups": [
+    {
+      "name": "embeddings",
+      "modules": ["model.embed_tokens", "lm_head"],
+      "learning_rate_multiplier": 1.0,
+      "weight_decay": 0.1
+    },
+    {
+      "name": "attention",
+      "modules": [
+        "self_attn.q_proj",
+        "self_attn.k_proj",
+        "self_attn.v_proj",
+        "self_attn.o_proj"
+      ],
+      "learning_rate_multiplier": 1.0,
+      "weight_decay": 0.1
+    },
+    {
+      "name": "mlp",
+      "modules": [
+        "mlp.gate_proj",
+        "mlp.up_proj",
+        "mlp.down_proj"
+      ],
+      "learning_rate_multiplier": 1.0,
+      "weight_decay": 0.1
+    },
+    {
+      "name": "layer_norms",
+      "modules": [
+        "input_layernorm",
+        "post_attention_layernorm",
+        "model.norm"
+      ],
+      "learning_rate_multiplier": 1.0,
+      "weight_decay": 0.0
+    }
+  ],
+  "advanced_techniques": {
+    "layer_wise_lr_decay": {
+      "enabled": false,
+      "decay_rate": 0.95,
+      "description": "Apply learning rate decay by layer depth"
+    },
+    "warmup_schedule": {
+      "type": "linear",
+      "steps": 2000,
+      "start_lr": 0.0,
+      "target_lr": 3.0e-4
+    },
+    "gradient_clipping": {
+      "method": "norm",
+      "max_norm": 1.0,
+      "norm_type": 2.0
+    },
+    "optimizer_state_sharding": {
+      "enabled": true,
+      "strategy": "zero_stage_2",
+      "offload_optimizer": false,
+      "offload_params": false
+    }
+  },
+  "memory_optimization": {
+    "cpu_offload": {
+      "enabled": false,
+      "offload_optimizer_states": false,
+      "offload_params": false,
+      "pin_memory": true
+    },
+    "activation_checkpointing": {
+      "enabled": true,
+      "checkpoint_every_n_layers": 1,
+      "use_reentrant": false
+    },
+    "zero_optimization": {
+      "stage": 2,
+      "offload_optimizer": false,
+      "offload_param": false,
+      "overlap_comm": true,
+      "contiguous_gradients": true,
+      "reduce_bucket_size": 5.0e8,
+      "allgather_bucket_size": 5.0e8,
+      "sub_group_size": 1.0e9,
+      "round_robin_gradients": false
+    }
+  },
+  "monitoring": {
+    "log_optimizer_states": true,
+    "log_learning_rate": true,
+    "log_gradient_norm": true,
+    "log_parameter_norm": true,
+    "log_interval": 100,
+    "tracked_metrics": [
+      "lr",
+      "grad_norm",
+      "param_norm",
+      "loss_scale",
+      "overflow_count",
+      "step_time",
+      "samples_per_second",
+      "tokens_per_second"
+    ]
+  },
+  "convergence_criteria": {
+    "max_steps": 875000,
+    "early_stopping": {
+      "enabled": false,
+      "patience": 10000,
+      "min_delta": 0.001,
+      "monitor": "eval_loss"
+    },
+    "plateau_detection": {
+      "enabled": true,
+      "patience": 5000,
+      "threshold": 0.001,
+      "cooldown": 1000
+    }
+  },
+  "stability_features": {
+    "loss_spike_detection": {
+      "enabled": true,
+      "threshold": 2.0,
+      "window_size": 100,
+      "action": "skip_update"
+    },
+    "gradient_overflow_detection": {
+      "enabled": true,
+      "max_overflow_count": 10,
+      "action": "reduce_loss_scale"
+    },
+    "nan_inf_detection": {
+      "enabled": true,
+      "check_frequency": 100,
+      "action": "rollback_checkpoint"
+    }
+  },
+  "distributed_optimization": {
+    "backend": "nccl",
+    "gradient_as_bucket_view": true,
+    "static_graph": false,
+    "ddp_bucket_cap_mb": 25,
+    "find_unused_parameters": false,
+    "broadcast_buffers": true,
+    "communication_optimization": {
+      "fp16_reduce_scatter": false,
+      "bf16_reduce_scatter": true,
+      "bucket_size_multiplier": 1.0,
+      "overlap_grad_reduce": true,
+      "use_multi_stream": true
+    }
+  },
+  "checkpointing": {
+    "save_optimizer_states": true,
+    "save_scheduler_states": true,
+    "save_rng_states": true,
+    "checkpoint_format": "pytorch",
+    "async_save": true,
+    "save_interval_steps": 5000,
+    "keep_last_n_checkpoints": 10
+  }
+}