model_name = "llama3"
flavor = "Comma7B"
tokenizer_name = "common-pile/comma-v0.1-2t"

# job
job_name = "munin-7b-open-stage3"
wandb_project = "munin-7b-open-stage3"
enable_wandb = false

# parallelism
num_nodes = 1
data_parallel_shard_degree = 8
data_parallel_replicate_degree = 1

# training settings
train_batch_size = 8
gradient_accumulation_steps = 2
gradient_accumulation_sync_each_step = true
seq_len = 4096
train_num_steps = 18926 # 37852 // 2
scheduler = "linear_warmup_constant_sqrt_decay"
warmup_steps = 500
cooldown_steps = 18426
checkpoint_interval = 1000
forced_load_path = "/work/training/maester/jobs/munin-7b-open-stage2/checkpoints/step-18926/"
compile = true
enable_cut_cross_entropy = false
ac_mode = "none"
selective_ac_option = "op"
[dataset]
bos_token = 2
eos_token = 1
data_dirs = [
    "/work/production/data/munin-open-dyna-0-of-1-cp-2-of-16-train/",
]
dataset_weights = "1.0"
[opt_cfg] # must specify *all* fields here, will not merge with defaults
lr = 1e-5
betas = [0.9, 0.95]
weight_decay = 0.1
eps = 1e-9
fused = true