| accum_freq: 1 |
| attn_activation: None |
| attn_name: torch_attn |
| attn_seq_scalar: None |
| attn_seq_scalar_alpha: None |
| average: None |
| average_coefficients: None |
| averagers: None |
| beta1: 0.9 |
| beta2: 0.95 |
| checkpoint_path: checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-hellaswag-v8.5-open_lm_1b_swiglutorch-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000/checkpoints |
| copy_codebase: False |
| data_key: json.gz |
| data_tolerate_error_p: 0.09 |
| data_tolerate_num_ckpts: 0 |
| dataset_manifest: ['/home/awettig/pli/dclm/dclm-pool-1b-1x/tokenized/h-uniform_by_topic-v3.8-lgbm-hellaswag-v8.5/manifest.jsonl'] |
| dataset_resampled: False |
| dataset_type: auto |
| ddp_static_graph: False |
| debug: False |
| delete_previous_checkpoint: False |
| device: cuda:0 |
| disable_buffer: False |
| dist_backend: nccl |
| dist_url: env:// |
| distill_model: None |
| distill_pretrained: None |
| distributed: True |
| epochs: 5 |
| epochs_cooldown: None |
| eps: 1e-08 |
| experimental_meta_device: False |
| failed_checkpoint_path: checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-hellaswag-v8.5-open_lm_1b_swiglutorch-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000/checkpoints_failed |
| ffn_type: swiglu |
| force_distributed: False |
| force_min_lr: 0.0 |
| fsdp: True |
| fsdp_amp: True |
| fsdp_backward_prefetch: False |
| fsdp_checkpoint: False |
| fsdp_cpu_offload: False |
| fsdp_hybrid: False |
| fsdp_hybrid_o2: False |
| fsdp_limit_all_gathers: True |
| fsdp_pure_bf16: False |
| fsdp_use_orig_params: True |
| global_batch_size: 256 |
| global_val_batch_size: None |
| grad_checkpointing: False |
| grad_clip_norm: 1.0 |
| hf_fsdp_block: None |
| hf_model: None |
| hf_seq_len: None |
| ignore_parse_errors: False |
| load_pretrained_state: False |
| local_rank: 0 |
| log_avg_model_training_loss: 0 |
| log_every_n_steps: 20 |
| log_level: 20 |
| log_local: False |
| log_logit_mean: True |
| log_path: checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-hellaswag-v8.5-open_lm_1b_swiglutorch-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000/out.log |
| logs: checkpoints |
| lr: 0.003 |
| lr_cooldown_end: 3e-05 |
| lr_cooldown_power: 1.0 |
| lr_scheduler: cosine |
| model: open_lm_1b_swiglutorch |
| model_norm: gain_only_lp_layer_norm |
| moe_capacity_factor: 1.25 |
| moe_expert_model_parallelism: False |
| moe_freq: 0 |
| moe_loss_weight: 0.1 |
| moe_num_experts: None |
| moe_top_k: 2 |
| moe_weight_parallelism: False |
| multiple_data_passes: False |
| name: dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-hellaswag-v8.5-open_lm_1b_swiglutorch-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000 |
| no_set_device_rank: False |
| optimizer: adamw |
| per_gpu_batch_size: 8 |
| positional_embedding_type: rotary |
| precision: amp_bfloat16 |
| preset_world_size: None |
| pretrained: None |
| qk_norm: True |
| rank: 0 |
| remote_sync: None |
| remote_sync_frequency: 300 |
| remote_sync_protocol: s3 |
| report_to: wandb |
| resume: None |
| save_frequency: 1 |
| save_most_recent: False |
| seed: 124 |
| seq_len: 2048 |
| skip_scheduler: False |
| squash_mask_left: False |
| target_mask_individual: None |
| target_mask_left: None |
| tensorboard: False |
| tensorboard_path: |
| torchcompile: True |
| torchscript: False |
| trace: False |
| train_data: None |
| train_data_mix_weights: None |
| train_data_upsampling_factors: None |
| train_num_samples: 2812100 |
| use_bn_sync: False |
| use_bnb_linear: None |
| val_data: None |
| val_data_key: None |
| val_frequency: 1 |
| val_iter_ci: 10000 |
| val_max_pop_ci: None |
| val_num_samples: None |
| val_seq_ci: False |
| val_tok_ci: False |
| vocab_size: 50432 |
| wandb: True |
| wandb_notes: |
| wandb_project_name: dcnlp |
| warmup: 5000 |
| wd: 0.033 |
| workers: 1 |
| world_size: 32 |
| z_loss_coefficient: 0.0001 |
|
|