# NOTE(review): the lines above this config in the original paste were
# file-viewer residue (a "File size: 3,307 Bytes" header, commit hash
# 00bc05d, and a copied line-number gutter 1-129), not part of the YAML
# document. They have been collapsed into this comment so the file parses.
---
# Checkpointing: save every 10k steps; resume model weights from the
# "pre-decay" checkpoint but deliberately do NOT restore optimizer or
# LR-scheduler state (load_lr_scheduler/load_optimizer are false).
checkpoints:
  checkpoint_interval: 10000
  checkpoints_path: checkpoints/360M_UT_original_withcos_early0.01
  checkpoints_path_is_shared_file_system: false
  # Fresh LR schedule and optimizer moments on resume — only weights carry over.
  load_lr_scheduler: false
  load_optimizer: false
  resume_checkpoint_path: /mnt/bn/ridger1/yxy/SmolLM2-nanotron-ckpt/360M/pre-decay
  save_final_state: false
  save_initial_state: false
# Training data: a single "stable phase" stage mixing six tokenized dataset
# folders with explicit sampling weights (weights listed positionally,
# matching dataset_folder order; they sum to 1.02 — presumably renormalized
# by the loader, TODO confirm).
data_stages:
- data:
    dataset:
      dataset_folder:
      - /mnt/bn/ridger1/yxy/datasets/finewebedu-dedup
      - /mnt/bn/ridger1/datasets/cosmopedia-v2
      - /mnt/bn/ridger1/yxy/datasets/megamath/megamath-text-code-block
      - /mnt/bn/ridger1/yxy/datasets/megamath/megamath-qa
      - /mnt/bn/ridger1/datasets/megamath/megamath-translated-code
      - /mnt/bn/ridger1/yxy/datasets/megamath/megamath-web-pro
      dataset_weights:
      - 0.545
      - 0.08
      - 0.25
      - 0.035
      - 0.035
      - 0.075
      # 2 bytes per stored token — consistent with the 49,152-entry vocab
      # fitting in uint16.
      token_size_in_bytes: 2
      tokenizer_name: HuggingFaceTB/cosmo2-tokenizer
      vocab_size: 49152
    num_loading_workers: 1
    seed: 42
  name: stable phase
  start_training_step: 1
# Run identity and bookkeeping.
# NOTE(review): the run name says "early0.1" but checkpoints_path and
# model_config.early_ratio use 0.01 — confirm which value this run intends.
general:
  benchmark_csv_path: null
  # 51,200,000 samples at step 100,000 matches the effective batch of
  # dp(64) x micro_batch(8) x grad_accum(1) = 512 samples/step.
  consumed_train_samples: 51200000
  ignore_sanity_checks: true
  project: diffUT
  run: 360M_32_with_cos_constant_and_decay_early0.1
  seed: 8
  step: 100000
# Evaluation harness disabled for this run.
lighteval: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
# Model: Llama-style decoder (is_llama_config) in bfloat16 with
# flash-attention-2; hidden 960 / 15 heads gives a head dim of 64, with
# 5 KV heads (grouped-query attention, 3:1 Q:KV ratio).
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    # 0.04166... = 1/sqrt(576)? — does not equal 1/sqrt(960)~=0.0323;
    # NOTE(review): confirm this std is intentional for hidden_size 960.
    std: 0.041666666666666664
  make_vocab_size_divisible_by: 1
  model_config:
    _attn_implementation: flash_attention_2
    attention_bias: false
    bos_token_id: 0
    # Custom knob (not in stock Llama configs) — presumably tied to the
    # "early" exit/loss behavior named in the run; verify against the
    # model implementation.
    early_ratio: 0.01
    eos_token_id: 0
    hidden_act: silu
    hidden_size: 960
    initializer_range: 0.02
    intermediate_size: 2560
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 15
    num_hidden_layers: 32
    num_key_value_heads: 5
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_interleaved: false
    rope_scaling: null
    rope_theta: 10000.0
    tie_word_embeddings: true
    # Custom universal-transformer-style knobs: layers 5..25 appear to be
    # subject to "unroll" type 1 — semantics live in the model code;
    # TODO confirm whether unroll_end is inclusive.
    unroll: true
    unroll_end: 25
    unroll_start: 5
    unroll_type: 1
    use_cache: true
    vocab_size: 49152
    # z-loss is configured but disabled (enabled: false).
    z_loss_coefficient: 0.0001
    z_loss_enabled: false
# Optimizer: fused AdamW with fp32 grad accumulation, grad clipping at 1.0,
# and ZeRO stage-1 sharding.
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    # Peak LR 3e-3; linear warmup for 1,024 steps, then linear decay
    # starting at step 10,000 over 100,000 steps down to 3e-4 (10% of peak).
    learning_rate: 0.003
    lr_decay_starting_step: 10000
    lr_decay_steps: 100000
    lr_decay_style: linear
    lr_warmup_steps: 1024
    lr_warmup_style: linear
    min_decay_lr: 0.0003
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  weight_decay_exclude_named_params: []
  zero_stage: 1
# Parallelism: pure data parallelism across 64 replicas (tp=1, pp=1,
# context/expert parallel disabled) — 64 devices total.
parallelism:
  context_parallel_size: 1
  dp: 64
  expert_parallel_size: 1
  moe_layer_recompute: false
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  # TP comms settings are inert with tp=1 but kept for config completeness.
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
profiler: null
s3_upload: null
tokenizer:
  tokenizer_max_length: null
  # Same tokenizer as data_stages.dataset.tokenizer_name — keep in sync.
  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
  tokenizer_revision: null
# Batch geometry: 8 sequences of 2,048 tokens per replica per step, no
# gradient accumulation -> 512 sequences (~1.05M tokens) per global step;
# train for 102,400 steps, validating every 10,000.
tokens:
  batch_accumulation_per_replica: 1
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 8
  sequence_length: 2048
  train_steps: 102400
  val_check_interval: 10000