RuoliuYang commited on
Commit
accc275
·
verified ·
1 Parent(s): 0538b26

stage2 size-250k ckpt-100 weights 2026-06-05T06:41:39+02:00

Browse files
size-250k/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dfa30c25552aa127b5fcfea14428003246d43ef8b5ad012fd5b8124f5d7db81a
3
- size 11419196428
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aefc71d94b209f7cd29c0116a29f05ca2e3f6eef5ea711434fc9cd79662393e0
3
+ size 5709600220
size-250k/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02b08e4cede6c5a3f9e2948c4204a4a7a5fd31e523be05c438f5524c0db896a9
3
- size 11419197708
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebf99788094f8c9bb48d4a2fda249406ba047d0be5078b1f68212e035633299b
3
+ size 5709601436
size-250k/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6adec87ee310f4e4117cb29c1234d8ea803f793f2b206f95aafed6e12838eef
3
- size 11419197772
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dba23acc32dc6029bd269f5d5227a741c2b1415937b618b3867385c67f0aa30e
3
+ size 5709601372
size-250k/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46c31f6cc991abac922ba18860567507f762964ed573433572bcd541b28496ba
3
- size 11419197772
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eeb5200a4ae28ede6faccac6fe0a0e03bf9e1eaaa302dde6f2780eaf8e8e8a3b
3
+ size 5709601436
size-250k/checkpoint-100/global_step100/mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:201ecf5e5a4a4a3d381a519f5440199ea67b47d074f7f5001a319e29926332ff
3
  size 17932200534
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd35decc59726c9d8341004e7d9129796996e2536a1e636f4d4943f95a4459a5
3
  size 17932200534
size-250k/checkpoint-100/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:54e7032c6e2999eb7a7f99dedc3babff40d9b00b289a9045ef013f9c40b0c355
3
  size 4965419112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e13adca318a79871de5ecfba6647f2755364d93dbec9569ce8627d6b5f06263e
3
  size 4965419112
size-250k/checkpoint-100/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4aef0804a5889a575ec8e62308b3f652dffd7e3a13aae340d368955a9784a490
3
  size 4991495816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d53ded58f4de861066edafa7447316f1f19e22c0c65bd73cb44070b7e48cbfc
3
  size 4991495816
size-250k/checkpoint-100/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a98e656f206bef577b91d13739d58854d25e71db175b93ddbe055f018c492a78
3
  size 4932751040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c23abe77a027e2733c5f0a82127c9e9d6c03ed5f54bd46e27545a2aeeef62473
3
  size 4932751040
size-250k/checkpoint-100/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37fb75387a15635259276329298c49a5d214c4eb9ff32664d4edcca127e48446
3
  size 1689100192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:636226e0264f48f6f489a47650baae72527f51d1a4773e7e4697f2232c422f5a
3
  size 1689100192
size-250k/checkpoint-100/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.05197674040866712,
6
  "eval_steps": 500,
7
  "global_step": 100,
8
  "is_hyper_param_search": false,
@@ -10,114 +10,114 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "alignment_loss": 0.818257,
14
- "epoch": 0.005197674040866712,
15
- "grad_norm": 10616.4033203125,
16
  "learning_rate": 9e-06,
17
- "loss": 600.6838,
18
- "mean_token_accuracy": 0.6212009839713574,
19
  "num_tokens": 1084700.0,
20
  "step": 10,
21
- "teacher_ce_loss": 37.7183
22
  },
23
  {
24
- "alignment_loss": 0.894285,
25
- "epoch": 0.010395348081733424,
26
- "grad_norm": 3175.2373046875,
27
  "learning_rate": 9.97655028660761e-06,
28
- "loss": 163.3203,
29
- "mean_token_accuracy": 0.7600021116435528,
30
  "num_tokens": 2173682.0,
31
  "step": 20,
32
- "teacher_ce_loss": 9.920976
33
  },
34
  {
35
- "alignment_loss": 0.892784,
36
- "epoch": 0.015593022122600136,
37
- "grad_norm": 3032.060546875,
38
  "learning_rate": 9.950495049504951e-06,
39
- "loss": 112.184,
40
- "mean_token_accuracy": 0.7797912888228893,
41
  "num_tokens": 3254544.0,
42
  "step": 30,
43
- "teacher_ce_loss": 6.994861
44
  },
45
  {
46
- "alignment_loss": 0.871766,
47
- "epoch": 0.02079069616346685,
48
- "grad_norm": 2499.0107421875,
49
  "learning_rate": 9.924439812402293e-06,
50
- "loss": 80.4524,
51
- "mean_token_accuracy": 0.8553705904632807,
52
  "num_tokens": 4366899.0,
53
  "step": 40,
54
- "teacher_ce_loss": 4.613683
55
  },
56
  {
57
- "alignment_loss": 0.839332,
58
- "epoch": 0.02598837020433356,
59
- "grad_norm": 844.4440307617188,
60
  "learning_rate": 9.898384575299636e-06,
61
- "loss": 55.5309,
62
- "mean_token_accuracy": 0.8646903920918703,
63
  "num_tokens": 5454040.0,
64
  "step": 50,
65
- "teacher_ce_loss": 3.528176
66
  },
67
  {
68
- "alignment_loss": 0.852953,
69
- "epoch": 0.031186044245200273,
70
- "grad_norm": 493.1343994140625,
71
  "learning_rate": 9.872329338196979e-06,
72
- "loss": 47.6412,
73
- "mean_token_accuracy": 0.8714054178446531,
74
  "num_tokens": 6535542.0,
75
  "step": 60,
76
- "teacher_ce_loss": 2.624911
77
  },
78
  {
79
- "alignment_loss": 0.850321,
80
- "epoch": 0.036383718286066985,
81
- "grad_norm": 502.66070556640625,
82
  "learning_rate": 9.84627410109432e-06,
83
- "loss": 45.5853,
84
- "mean_token_accuracy": 0.8716515514999628,
85
  "num_tokens": 7587360.0,
86
  "step": 70,
87
- "teacher_ce_loss": 2.632225
88
  },
89
  {
90
- "alignment_loss": 0.855101,
91
- "epoch": 0.0415813923269337,
92
- "grad_norm": 432.3569030761719,
93
  "learning_rate": 9.820218863991662e-06,
94
- "loss": 43.1104,
95
- "mean_token_accuracy": 0.8750173676759004,
96
  "num_tokens": 8677343.0,
97
  "step": 80,
98
- "teacher_ce_loss": 2.711963
99
  },
100
  {
101
- "alignment_loss": 0.835686,
102
- "epoch": 0.04677906636780041,
103
- "grad_norm": 481.1368713378906,
104
  "learning_rate": 9.794163626889005e-06,
105
- "loss": 42.8872,
106
- "mean_token_accuracy": 0.8726172130554914,
107
  "num_tokens": 9750881.0,
108
  "step": 90,
109
- "teacher_ce_loss": 2.858154
110
  },
111
  {
112
- "alignment_loss": 0.819627,
113
- "epoch": 0.05197674040866712,
114
- "grad_norm": 509.544921875,
115
  "learning_rate": 9.768108389786348e-06,
116
- "loss": 46.8516,
117
- "mean_token_accuracy": 0.8589316807687283,
118
  "num_tokens": 10855717.0,
119
  "step": 100,
120
- "teacher_ce_loss": 2.724989
121
  }
122
  ],
123
  "logging_steps": 10,
@@ -137,7 +137,7 @@
137
  "attributes": {}
138
  }
139
  },
140
- "total_flos": 5.045144881127752e+17,
141
  "train_batch_size": 1,
142
  "trial_name": null,
143
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.05197505197505198,
6
  "eval_steps": 500,
7
  "global_step": 100,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "alignment_loss": 0.795723,
14
+ "epoch": 0.005197505197505198,
15
+ "grad_norm": 9890.51953125,
16
  "learning_rate": 9e-06,
17
+ "loss": 595.1739,
18
+ "mean_token_accuracy": 0.6256996631622315,
19
  "num_tokens": 1084700.0,
20
  "step": 10,
21
+ "teacher_ce_loss": 72.813206
22
  },
23
  {
24
+ "alignment_loss": 0.901524,
25
+ "epoch": 0.010395010395010396,
26
+ "grad_norm": 3162.57958984375,
27
  "learning_rate": 9.97655028660761e-06,
28
+ "loss": 163.0194,
29
+ "mean_token_accuracy": 0.7504611149430275,
30
  "num_tokens": 2173682.0,
31
  "step": 20,
32
+ "teacher_ce_loss": 19.007352
33
  },
34
  {
35
+ "alignment_loss": 0.910772,
36
+ "epoch": 0.015592515592515593,
37
+ "grad_norm": 3037.57958984375,
38
  "learning_rate": 9.950495049504951e-06,
39
+ "loss": 112.238,
40
+ "mean_token_accuracy": 0.7754241786897182,
41
  "num_tokens": 3254544.0,
42
  "step": 30,
43
+ "teacher_ce_loss": 14.432665
44
  },
45
  {
46
+ "alignment_loss": 0.885737,
47
+ "epoch": 0.02079002079002079,
48
+ "grad_norm": 2462.610595703125,
49
  "learning_rate": 9.924439812402293e-06,
50
+ "loss": 80.2902,
51
+ "mean_token_accuracy": 0.8474745027720928,
52
  "num_tokens": 4366899.0,
53
  "step": 40,
54
+ "teacher_ce_loss": 9.699339
55
  },
56
  {
57
+ "alignment_loss": 0.867464,
58
+ "epoch": 0.02598752598752599,
59
+ "grad_norm": 817.1935424804688,
60
  "learning_rate": 9.898384575299636e-06,
61
+ "loss": 55.5484,
62
+ "mean_token_accuracy": 0.8571618065237999,
63
  "num_tokens": 5454040.0,
64
  "step": 50,
65
+ "teacher_ce_loss": 7.613436
66
  },
67
  {
68
+ "alignment_loss": 0.869904,
69
+ "epoch": 0.031185031185031187,
70
+ "grad_norm": 497.1650085449219,
71
  "learning_rate": 9.872329338196979e-06,
72
+ "loss": 47.8751,
73
+ "mean_token_accuracy": 0.8671012565493583,
74
  "num_tokens": 6535542.0,
75
  "step": 60,
76
+ "teacher_ce_loss": 5.361066
77
  },
78
  {
79
+ "alignment_loss": 0.882581,
80
+ "epoch": 0.036382536382536385,
81
+ "grad_norm": 504.9124755859375,
82
  "learning_rate": 9.84627410109432e-06,
83
+ "loss": 45.7668,
84
+ "mean_token_accuracy": 0.8666120573878289,
85
  "num_tokens": 7587360.0,
86
  "step": 70,
87
+ "teacher_ce_loss": 5.355153
88
  },
89
  {
90
+ "alignment_loss": 0.882264,
91
+ "epoch": 0.04158004158004158,
92
+ "grad_norm": 413.635498046875,
93
  "learning_rate": 9.820218863991662e-06,
94
+ "loss": 43.0118,
95
+ "mean_token_accuracy": 0.868953762203455,
96
  "num_tokens": 8677343.0,
97
  "step": 80,
98
+ "teacher_ce_loss": 4.836935
99
  },
100
  {
101
+ "alignment_loss": 0.846325,
102
+ "epoch": 0.04677754677754678,
103
+ "grad_norm": 446.23834228515625,
104
  "learning_rate": 9.794163626889005e-06,
105
+ "loss": 42.9171,
106
+ "mean_token_accuracy": 0.8668660171329975,
107
  "num_tokens": 9750881.0,
108
  "step": 90,
109
+ "teacher_ce_loss": 5.504833
110
  },
111
  {
112
+ "alignment_loss": 0.849805,
113
+ "epoch": 0.05197505197505198,
114
+ "grad_norm": 515.4571533203125,
115
  "learning_rate": 9.768108389786348e-06,
116
+ "loss": 47.1901,
117
+ "mean_token_accuracy": 0.8482094191014766,
118
  "num_tokens": 10855717.0,
119
  "step": 100,
120
+ "teacher_ce_loss": 5.504737
121
  }
122
  ],
123
  "logging_steps": 10,
 
137
  "attributes": {}
138
  }
139
  },
140
+ "total_flos": 5.045144795228406e+17,
141
  "train_batch_size": 1,
142
  "trial_name": null,
143
  "trial_params": null
size-250k/checkpoint-100/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e861a7238207eaec94b085e7f17b60738ef5e4b9c421e456a6d3c521acae754
3
  size 10570
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca14d8a6b91e7eb5dce22e1701967a5af6244e13996a2c7240340eeaaeed1e14
3
  size 10570