RuoliuYang committed on
Commit
c2009bc
·
verified ·
1 Parent(s): a73dc09

stage2 size-250k ckpt-50 weights 2026-06-05T06:21:19+02:00

Browse files
size-250k/checkpoint-50/global_step50/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6da2d836710d765b2e377f5bc371a2ef9923ad262c8444368f8e184119e98a25
3
- size 11419196428
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c6df94d48a55930873043b7db6e55470712fa446e76f5887e2a3699e1f52330
3
+ size 5709600220
size-250k/checkpoint-50/global_step50/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5b83577ae26fe70f736fedb8be08db07b07ac758bb37a0731a7696298220182
3
- size 11419197708
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11d04ea2b72463c6748b61ade8b3e0583bfaeb31caa6ae8a51e144f154d64b06
3
+ size 5709601436
size-250k/checkpoint-50/global_step50/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e08133b31e18d6e9c3970fa46a394906e7fefcef434e3dcfcdf9d70c4315f426
3
- size 11419197772
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66d0710840122111b243dcb6d8bb60e1e75901a387d0521be277ce7397b579f7
3
+ size 5709601372
size-250k/checkpoint-50/global_step50/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d13a86f94cc28b4d9aec1e37ff1c0fb9327feb9d4dc8604e103065d6f50f645
3
- size 11419197772
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08fce7d0fc589dab48f362a8cbefb4aeb2ba70306d4d03c68fa589c64fc42633
3
+ size 5709601436
size-250k/checkpoint-50/global_step50/mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c04e7eebf616784fbf1877c846062e0163bc76a1193f4ea4df9ef1f5659b6e34
3
  size 17932200534
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e00021ffceb23ffe6e59486b3e833867153667ef2287eaaeaac66983a02a5018
3
  size 17932200534
size-250k/checkpoint-50/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49f6d65708d75aabbb9b7a464147b4617c10972b6a165014c3bb34b86ef892c7
3
  size 4965419112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80b087a72c88e78fd6e634e47cac307e1b6b2c2224614270b871a8204e74b284
3
  size 4965419112
size-250k/checkpoint-50/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a93b7eabf126483f4b4bea2a7e4fb1b9525a0ffac49ca32132e6b40c87649e0
3
  size 4991495816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60d6d56c2688898b50559464514eaae9ba0e1c7de579e6dcde3c7b5d0ff4844e
3
  size 4991495816
size-250k/checkpoint-50/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d68df264eac72bc211170b4b8654ccc496bf4fc33fed5e6c75a2eaf6b60e33f2
3
  size 4932751040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4d419383d0f641ccc922e57f2bc9fb18ee0975e649154af0acb64c8886a5c18
3
  size 4932751040
size-250k/checkpoint-50/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:936228ee8e1fc9e62eeb1cdc0f91bc14a93de09bf790ab5cc25d886c01d85288
3
  size 1689100192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18ee7f5faccd3ef0cf65f389d3910aca239500fc2c8e073a61ba648451dc9124
3
  size 1689100192
size-250k/checkpoint-50/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.02598837020433356,
6
  "eval_steps": 500,
7
  "global_step": 50,
8
  "is_hyper_param_search": false,
@@ -10,59 +10,59 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "alignment_loss": 0.818257,
14
- "epoch": 0.005197674040866712,
15
- "grad_norm": 10616.4033203125,
16
  "learning_rate": 9e-06,
17
- "loss": 600.6838,
18
- "mean_token_accuracy": 0.6212009839713574,
19
  "num_tokens": 1084700.0,
20
  "step": 10,
21
- "teacher_ce_loss": 37.7183
22
  },
23
  {
24
- "alignment_loss": 0.894285,
25
- "epoch": 0.010395348081733424,
26
- "grad_norm": 3175.2373046875,
27
  "learning_rate": 9.97655028660761e-06,
28
- "loss": 163.3203,
29
- "mean_token_accuracy": 0.7600021116435528,
30
  "num_tokens": 2173682.0,
31
  "step": 20,
32
- "teacher_ce_loss": 9.920976
33
  },
34
  {
35
- "alignment_loss": 0.892784,
36
- "epoch": 0.015593022122600136,
37
- "grad_norm": 3032.060546875,
38
  "learning_rate": 9.950495049504951e-06,
39
- "loss": 112.184,
40
- "mean_token_accuracy": 0.7797912888228893,
41
  "num_tokens": 3254544.0,
42
  "step": 30,
43
- "teacher_ce_loss": 6.994861
44
  },
45
  {
46
- "alignment_loss": 0.871766,
47
- "epoch": 0.02079069616346685,
48
- "grad_norm": 2499.0107421875,
49
  "learning_rate": 9.924439812402293e-06,
50
- "loss": 80.4524,
51
- "mean_token_accuracy": 0.8553705904632807,
52
  "num_tokens": 4366899.0,
53
  "step": 40,
54
- "teacher_ce_loss": 4.613683
55
  },
56
  {
57
- "alignment_loss": 0.839332,
58
- "epoch": 0.02598837020433356,
59
- "grad_norm": 844.4440307617188,
60
  "learning_rate": 9.898384575299636e-06,
61
- "loss": 55.5309,
62
- "mean_token_accuracy": 0.8646903920918703,
63
  "num_tokens": 5454040.0,
64
  "step": 50,
65
- "teacher_ce_loss": 3.528176
66
  }
67
  ],
68
  "logging_steps": 10,
@@ -82,7 +82,7 @@
82
  "attributes": {}
83
  }
84
  },
85
- "total_flos": 2.5347401902876262e+17,
86
  "train_batch_size": 1,
87
  "trial_name": null,
88
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.02598752598752599,
6
  "eval_steps": 500,
7
  "global_step": 50,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "alignment_loss": 0.795723,
14
+ "epoch": 0.005197505197505198,
15
+ "grad_norm": 9890.51953125,
16
  "learning_rate": 9e-06,
17
+ "loss": 595.1739,
18
+ "mean_token_accuracy": 0.6256996631622315,
19
  "num_tokens": 1084700.0,
20
  "step": 10,
21
+ "teacher_ce_loss": 72.813206
22
  },
23
  {
24
+ "alignment_loss": 0.901524,
25
+ "epoch": 0.010395010395010396,
26
+ "grad_norm": 3162.57958984375,
27
  "learning_rate": 9.97655028660761e-06,
28
+ "loss": 163.0194,
29
+ "mean_token_accuracy": 0.7504611149430275,
30
  "num_tokens": 2173682.0,
31
  "step": 20,
32
+ "teacher_ce_loss": 19.007352
33
  },
34
  {
35
+ "alignment_loss": 0.910772,
36
+ "epoch": 0.015592515592515593,
37
+ "grad_norm": 3037.57958984375,
38
  "learning_rate": 9.950495049504951e-06,
39
+ "loss": 112.238,
40
+ "mean_token_accuracy": 0.7754241786897182,
41
  "num_tokens": 3254544.0,
42
  "step": 30,
43
+ "teacher_ce_loss": 14.432665
44
  },
45
  {
46
+ "alignment_loss": 0.885737,
47
+ "epoch": 0.02079002079002079,
48
+ "grad_norm": 2462.610595703125,
49
  "learning_rate": 9.924439812402293e-06,
50
+ "loss": 80.2902,
51
+ "mean_token_accuracy": 0.8474745027720928,
52
  "num_tokens": 4366899.0,
53
  "step": 40,
54
+ "teacher_ce_loss": 9.699339
55
  },
56
  {
57
+ "alignment_loss": 0.867464,
58
+ "epoch": 0.02598752598752599,
59
+ "grad_norm": 817.1935424804688,
60
  "learning_rate": 9.898384575299636e-06,
61
+ "loss": 55.5484,
62
+ "mean_token_accuracy": 0.8571618065237999,
63
  "num_tokens": 5454040.0,
64
  "step": 50,
65
+ "teacher_ce_loss": 7.613436
66
  }
67
  ],
68
  "logging_steps": 10,
 
82
  "attributes": {}
83
  }
84
  },
85
+ "total_flos": 2.5347401473379533e+17,
86
  "train_batch_size": 1,
87
  "trial_name": null,
88
  "trial_params": null
size-250k/checkpoint-50/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e861a7238207eaec94b085e7f17b60738ef5e4b9c421e456a6d3c521acae754
3
  size 10570
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca14d8a6b91e7eb5dce22e1701967a5af6244e13996a2c7240340eeaaeed1e14
3
  size 10570