Ksjsjjdj committed on
Commit
da4d7f9
·
verified ·
1 Parent(s): c4fa8a4

Auto-save flat update: checkpoint-100

Browse files
Files changed (6) hide show
  1. config.json +1 -1
  2. model.safetensors +1 -1
  3. optimizer.pt +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +43 -183
  6. training_args.bin +1 -1
config.json CHANGED
@@ -13,7 +13,7 @@
13
  "hidden_size": 256,
14
  "initializer_range": 0.02,
15
  "intermediate_size": 1024,
16
- "max_position_embeddings": 1024,
17
  "max_window_layers": 28,
18
  "mlp_bias": false,
19
  "model_type": "qwen2",
 
13
  "hidden_size": 256,
14
  "initializer_range": 0.02,
15
  "intermediate_size": 1024,
16
+ "max_position_embeddings": 512,
17
  "max_window_layers": 28,
18
  "mlp_bias": false,
19
  "model_type": "qwen2",
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c633054e8f62adb0c63a3f768c6f64adee2e808946e35fd25e0a4ff8fe9886cc
3
  size 4398536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc2024b06d6d177a54f5f514c8784fc3039cc0904d37d02dd58522a0d3362322
3
  size 4398536
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be20755d8a8f3672d8753c50dea6913d0e7fe725cef330124867906dd6d1d499
3
  size 8806533
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faee95f783ecf608464997a8200a1558c89d78e05ee6481dc796ec32c4bbbdca
3
  size 8806533
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac7010ca38527d647c6bec40d9e474292bd22ba1a7391c34323926f03e67d0ef
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d82c58c32b204ed6cf1be47fcccac4a2997bdd7e1431fe3a6ec925f0a86a9891
3
  size 1465
trainer_state.json CHANGED
@@ -1,292 +1,152 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.02,
5
  "eval_steps": 500,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0005,
13
- "grad_norm": 1.354914903640747,
14
  "learning_rate": 0.0001,
15
- "loss": 5.3068,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.001,
20
- "grad_norm": 1.0461070537567139,
21
  "learning_rate": 0.0002,
22
- "loss": 5.0784,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.0015,
27
- "grad_norm": 0.7310259938240051,
28
  "learning_rate": 0.0001998998998998999,
29
- "loss": 4.8251,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.002,
34
- "grad_norm": 0.82170170545578,
35
  "learning_rate": 0.0001997997997997998,
36
- "loss": 4.6949,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.0025,
41
- "grad_norm": 0.9640143513679504,
42
  "learning_rate": 0.0001996996996996997,
43
- "loss": 4.5294,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.003,
48
- "grad_norm": 0.6337556838989258,
49
  "learning_rate": 0.0001995995995995996,
50
- "loss": 4.3776,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.0035,
55
- "grad_norm": 0.5715162754058838,
56
  "learning_rate": 0.0001994994994994995,
57
- "loss": 4.251,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.004,
62
- "grad_norm": 0.47545069456100464,
63
  "learning_rate": 0.0001993993993993994,
64
- "loss": 4.142,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.0045,
69
- "grad_norm": 0.43138620257377625,
70
  "learning_rate": 0.00019929929929929932,
71
- "loss": 4.0538,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.005,
76
- "grad_norm": 0.41834330558776855,
77
  "learning_rate": 0.0001991991991991992,
78
- "loss": 3.9896,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.0055,
83
- "grad_norm": 0.3807925283908844,
84
  "learning_rate": 0.00019909909909909912,
85
- "loss": 3.9316,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.006,
90
- "grad_norm": 0.4051252603530884,
91
  "learning_rate": 0.000198998998998999,
92
- "loss": 3.8816,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.0065,
97
- "grad_norm": 0.3600367307662964,
98
  "learning_rate": 0.0001988988988988989,
99
- "loss": 3.8327,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.007,
104
- "grad_norm": 0.3089018762111664,
105
  "learning_rate": 0.0001987987987987988,
106
- "loss": 3.7908,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.0075,
111
- "grad_norm": 0.2999509572982788,
112
  "learning_rate": 0.0001986986986986987,
113
- "loss": 3.7632,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.008,
118
- "grad_norm": 0.29107317328453064,
119
  "learning_rate": 0.0001985985985985986,
120
- "loss": 3.7366,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.0085,
125
- "grad_norm": 0.3126203417778015,
126
  "learning_rate": 0.0001984984984984985,
127
- "loss": 3.7243,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.009,
132
- "grad_norm": 0.3028947710990906,
133
  "learning_rate": 0.0001983983983983984,
134
- "loss": 3.6909,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.0095,
139
- "grad_norm": 0.3013005554676056,
140
  "learning_rate": 0.00019829829829829833,
141
- "loss": 3.6686,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.01,
146
- "grad_norm": 0.26517948508262634,
147
  "learning_rate": 0.0001981981981981982,
148
- "loss": 3.6513,
149
  "step": 100
150
- },
151
- {
152
- "epoch": 0.0105,
153
- "grad_norm": 0.283869206905365,
154
- "learning_rate": 0.00019809809809809813,
155
- "loss": 3.6389,
156
- "step": 105
157
- },
158
- {
159
- "epoch": 0.011,
160
- "grad_norm": 0.3128926455974579,
161
- "learning_rate": 0.000197997997997998,
162
- "loss": 3.6236,
163
- "step": 110
164
- },
165
- {
166
- "epoch": 0.0115,
167
- "grad_norm": 0.3017060458660126,
168
- "learning_rate": 0.0001978978978978979,
169
- "loss": 3.6056,
170
- "step": 115
171
- },
172
- {
173
- "epoch": 0.012,
174
- "grad_norm": 0.3050957918167114,
175
- "learning_rate": 0.0001977977977977978,
176
- "loss": 3.5945,
177
- "step": 120
178
- },
179
- {
180
- "epoch": 0.0125,
181
- "grad_norm": 0.39496731758117676,
182
- "learning_rate": 0.0001976976976976977,
183
- "loss": 3.576,
184
- "step": 125
185
- },
186
- {
187
- "epoch": 0.013,
188
- "grad_norm": 0.39083537459373474,
189
- "learning_rate": 0.0001975975975975976,
190
- "loss": 3.5746,
191
- "step": 130
192
- },
193
- {
194
- "epoch": 0.0135,
195
- "grad_norm": 0.30549755692481995,
196
- "learning_rate": 0.0001974974974974975,
197
- "loss": 3.5464,
198
- "step": 135
199
- },
200
- {
201
- "epoch": 0.014,
202
- "grad_norm": 0.30516958236694336,
203
- "learning_rate": 0.00019739739739739739,
204
- "loss": 3.5528,
205
- "step": 140
206
- },
207
- {
208
- "epoch": 0.0145,
209
- "grad_norm": 0.28228166699409485,
210
- "learning_rate": 0.0001972972972972973,
211
- "loss": 3.5414,
212
- "step": 145
213
- },
214
- {
215
- "epoch": 0.015,
216
- "grad_norm": 0.2340458333492279,
217
- "learning_rate": 0.0001971971971971972,
218
- "loss": 3.5297,
219
- "step": 150
220
- },
221
- {
222
- "epoch": 0.0155,
223
- "grad_norm": 0.3061468005180359,
224
- "learning_rate": 0.00019709709709709713,
225
- "loss": 3.5114,
226
- "step": 155
227
- },
228
- {
229
- "epoch": 0.016,
230
- "grad_norm": 0.3535705804824829,
231
- "learning_rate": 0.00019699699699699701,
232
- "loss": 3.4996,
233
- "step": 160
234
- },
235
- {
236
- "epoch": 0.0165,
237
- "grad_norm": 0.3399507403373718,
238
- "learning_rate": 0.0001968968968968969,
239
- "loss": 3.4855,
240
- "step": 165
241
- },
242
- {
243
- "epoch": 0.017,
244
- "grad_norm": 0.26981884241104126,
245
- "learning_rate": 0.00019679679679679681,
246
- "loss": 3.4712,
247
- "step": 170
248
- },
249
- {
250
- "epoch": 0.0175,
251
- "grad_norm": 0.3286713659763336,
252
- "learning_rate": 0.0001966966966966967,
253
- "loss": 3.4543,
254
- "step": 175
255
- },
256
- {
257
- "epoch": 0.018,
258
- "grad_norm": 0.31991562247276306,
259
- "learning_rate": 0.00019659659659659661,
260
- "loss": 3.4302,
261
- "step": 180
262
- },
263
- {
264
- "epoch": 0.0185,
265
- "grad_norm": 0.40395843982696533,
266
- "learning_rate": 0.0001964964964964965,
267
- "loss": 3.4062,
268
- "step": 185
269
- },
270
- {
271
- "epoch": 0.019,
272
- "grad_norm": 0.3666783571243286,
273
- "learning_rate": 0.0001963963963963964,
274
- "loss": 3.379,
275
- "step": 190
276
- },
277
- {
278
- "epoch": 0.0195,
279
- "grad_norm": 0.3933778703212738,
280
- "learning_rate": 0.0001962962962962963,
281
- "loss": 3.3496,
282
- "step": 195
283
- },
284
- {
285
- "epoch": 0.02,
286
- "grad_norm": 0.34942948818206787,
287
- "learning_rate": 0.00019619619619619621,
288
- "loss": 3.3259,
289
- "step": 200
290
  }
291
  ],
292
  "logging_steps": 5,
@@ -306,7 +166,7 @@
306
  "attributes": {}
307
  }
308
  },
309
- "total_flos": 82584168038400.0,
310
  "train_batch_size": 4,
311
  "trial_name": null,
312
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.01,
5
  "eval_steps": 500,
6
+ "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0005,
13
+ "grad_norm": 1.3692148923873901,
14
  "learning_rate": 0.0001,
15
+ "loss": 5.3023,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.001,
20
+ "grad_norm": 1.0570337772369385,
21
  "learning_rate": 0.0002,
22
+ "loss": 5.0871,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.0015,
27
+ "grad_norm": 0.7336458563804626,
28
  "learning_rate": 0.0001998998998998999,
29
+ "loss": 4.8384,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.002,
34
+ "grad_norm": 0.729788601398468,
35
  "learning_rate": 0.0001997997997997998,
36
+ "loss": 4.7071,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.0025,
41
+ "grad_norm": 0.8077158331871033,
42
  "learning_rate": 0.0001996996996996997,
43
+ "loss": 4.5564,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.003,
48
+ "grad_norm": 0.6561239361763,
49
  "learning_rate": 0.0001995995995995996,
50
+ "loss": 4.4024,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.0035,
55
+ "grad_norm": 0.5824812650680542,
56
  "learning_rate": 0.0001994994994994995,
57
+ "loss": 4.2921,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.004,
62
+ "grad_norm": 0.5250737071037292,
63
  "learning_rate": 0.0001993993993993994,
64
+ "loss": 4.1845,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.0045,
69
+ "grad_norm": 0.5088778734207153,
70
  "learning_rate": 0.00019929929929929932,
71
+ "loss": 4.0933,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.005,
76
+ "grad_norm": 0.4544166326522827,
77
  "learning_rate": 0.0001991991991991992,
78
+ "loss": 4.0118,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.0055,
83
+ "grad_norm": 0.41549327969551086,
84
  "learning_rate": 0.00019909909909909912,
85
+ "loss": 3.9531,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.006,
90
+ "grad_norm": 0.3995205760002136,
91
  "learning_rate": 0.000198998998998999,
92
+ "loss": 3.8955,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.0065,
97
+ "grad_norm": 0.3810112178325653,
98
  "learning_rate": 0.0001988988988988989,
99
+ "loss": 3.8356,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.007,
104
+ "grad_norm": 0.3959825932979584,
105
  "learning_rate": 0.0001987987987987988,
106
+ "loss": 3.8059,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.0075,
111
+ "grad_norm": 0.34660765528678894,
112
  "learning_rate": 0.0001986986986986987,
113
+ "loss": 3.786,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.008,
118
+ "grad_norm": 0.35685837268829346,
119
  "learning_rate": 0.0001985985985985986,
120
+ "loss": 3.7469,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.0085,
125
+ "grad_norm": 0.3709333539009094,
126
  "learning_rate": 0.0001984984984984985,
127
+ "loss": 3.7236,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.009,
132
+ "grad_norm": 0.3217354118824005,
133
  "learning_rate": 0.0001983983983983984,
134
+ "loss": 3.7075,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.0095,
139
+ "grad_norm": 0.42025989294052124,
140
  "learning_rate": 0.00019829829829829833,
141
+ "loss": 3.682,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.01,
146
+ "grad_norm": 0.35580873489379883,
147
  "learning_rate": 0.0001981981981981982,
148
+ "loss": 3.6626,
149
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  }
151
  ],
152
  "logging_steps": 5,
 
166
  "attributes": {}
167
  }
168
  },
169
+ "total_flos": 20646042009600.0,
170
  "train_batch_size": 4,
171
  "trial_name": null,
172
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87f7094c9781b5c9394410d447866dce36653e1a7dc4508ca501767ea42b00ab
3
  size 5713
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28b22be76b34e68797fdb33f5525668e9c928c7650cec1eef415c99efec1ffeb
3
  size 5713