SystemAdmin123 commited on
Commit
7d571b6
·
verified ·
1 Parent(s): 5c842c0

Training in progress, step 100, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03438d87162fa3ae59956221d36f82a9321e50965d77bf13e46151829d5f6e80
3
  size 250490408
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6afd96f0a4efb226214a2f98fc2930b95faa7d2409bdbba1791318c01e36cad
3
  size 250490408
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:482c76b7e2b2dd94f963053f2b889f177b57ab782f96200808e7548078b9c06c
3
- size 255266042
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9766749cabbd0fe5ad2cb84cb5b1e21a853eda9b1ed07a7c96fd6bf1b05d3c04
3
+ size 255265850
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57d0f1b7067c6220f6a5ca48baeecd587b394151cc545f90b6339e9e610bf246
3
- size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0676c143653a967526628bc2f1d4644cea97dd18a780321d518070502832502d
3
+ size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d72484dfd8551a0f67ea47f28366dfee127e1515d7d6d10f8ca73a95bec2695e
3
- size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:465d2d2aca677e6044b1b9bb8917a886a23650c3ce9b3ee9246d24ecb5324a05
3
+ size 15024
last-checkpoint/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c747745535681b443a3cae9777675e8f1d7f8e4fdcc2e923572fbc2e7ee62ede
3
+ size 15024
last-checkpoint/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9410529b5a855657bd3a08b9e2887f922f2b19dd1ea9123054c798d54f882e1
3
+ size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e4c78e85c4ba926d25150d4aaddeaf5728dcb066f4afc01202e3e56f29a5487
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a60c7d771c1fd156acee762fba03c724cb41829a3f71df370ecd1d20b134982
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,371 +1,97 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 33.333333333333336,
5
- "eval_steps": 50,
6
- "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.08333333333333333,
13
- "eval_loss": 3.261073350906372,
14
- "eval_runtime": 10.6205,
15
- "eval_samples_per_second": 141.33,
16
- "eval_steps_per_second": 2.26,
17
  "step": 1
18
  },
19
- {
20
- "epoch": 0.8333333333333334,
21
- "grad_norm": 8.0625,
22
- "learning_rate": 6.666666666666667e-05,
23
- "loss": 6.5691,
24
- "step": 10
25
- },
26
  {
27
  "epoch": 1.6666666666666665,
28
- "grad_norm": 3.890625,
29
- "learning_rate": 0.00013333333333333334,
30
- "loss": 5.8025,
31
- "step": 20
32
- },
33
- {
34
- "epoch": 2.5,
35
- "grad_norm": 4.375,
36
- "learning_rate": 0.0002,
37
- "loss": 5.5103,
38
- "step": 30
39
  },
40
  {
41
  "epoch": 3.3333333333333335,
42
- "grad_norm": 4.78125,
43
- "learning_rate": 0.00019984815164333163,
44
- "loss": 5.3092,
45
- "step": 40
46
- },
47
- {
48
- "epoch": 4.166666666666667,
49
- "grad_norm": 4.1875,
50
- "learning_rate": 0.00019939306773179497,
51
- "loss": 5.0693,
52
- "step": 50
53
- },
54
- {
55
- "epoch": 4.166666666666667,
56
- "eval_loss": 3.1816465854644775,
57
- "eval_runtime": 10.8479,
58
- "eval_samples_per_second": 138.368,
59
- "eval_steps_per_second": 2.212,
60
- "step": 50
61
  },
62
  {
63
  "epoch": 5.0,
64
- "grad_norm": 7.0625,
65
- "learning_rate": 0.00019863613034027224,
66
- "loss": 4.772,
67
- "step": 60
68
- },
69
- {
70
- "epoch": 5.833333333333333,
71
- "grad_norm": 4.53125,
72
- "learning_rate": 0.00019757963826274357,
73
- "loss": 4.671,
74
- "step": 70
75
  },
76
  {
77
  "epoch": 6.666666666666667,
78
- "grad_norm": 5.46875,
79
- "learning_rate": 0.00019622680003092503,
80
- "loss": 4.4709,
81
- "step": 80
82
- },
83
- {
84
- "epoch": 7.5,
85
- "grad_norm": 3.796875,
86
- "learning_rate": 0.00019458172417006347,
87
- "loss": 4.3646,
88
- "step": 90
89
- },
90
- {
91
- "epoch": 8.333333333333334,
92
- "grad_norm": 3.875,
93
- "learning_rate": 0.00019264940672148018,
94
- "loss": 4.2165,
95
- "step": 100
96
  },
97
  {
98
  "epoch": 8.333333333333334,
99
- "eval_loss": 3.1671667098999023,
100
- "eval_runtime": 10.6557,
101
- "eval_samples_per_second": 140.864,
102
- "eval_steps_per_second": 2.252,
103
- "step": 100
104
- },
105
- {
106
- "epoch": 9.166666666666666,
107
- "grad_norm": 5.0625,
108
- "learning_rate": 0.00019043571606975777,
109
- "loss": 4.1085,
110
- "step": 110
111
  },
112
  {
113
  "epoch": 10.0,
114
- "grad_norm": 4.59375,
115
- "learning_rate": 0.0001879473751206489,
116
- "loss": 4.0046,
117
- "step": 120
118
- },
119
- {
120
- "epoch": 10.833333333333334,
121
- "grad_norm": 4.875,
122
- "learning_rate": 0.00018519194088383273,
123
- "loss": 3.8989,
124
- "step": 130
125
  },
126
  {
127
  "epoch": 11.666666666666666,
128
- "grad_norm": 3.90625,
129
- "learning_rate": 0.0001821777815225245,
130
- "loss": 3.7925,
131
- "step": 140
132
- },
133
- {
134
- "epoch": 12.5,
135
- "grad_norm": 3.9375,
136
- "learning_rate": 0.00017891405093963938,
137
- "loss": 3.7077,
138
- "step": 150
139
- },
140
- {
141
- "epoch": 12.5,
142
- "eval_loss": 3.263496160507202,
143
- "eval_runtime": 10.1832,
144
- "eval_samples_per_second": 147.399,
145
- "eval_steps_per_second": 2.357,
146
- "step": 150
147
  },
148
  {
149
  "epoch": 13.333333333333334,
150
- "grad_norm": 4.0,
151
- "learning_rate": 0.00017541066097768963,
152
- "loss": 3.6203,
153
- "step": 160
154
- },
155
- {
156
- "epoch": 14.166666666666666,
157
- "grad_norm": 4.21875,
158
- "learning_rate": 0.00017167825131684513,
159
- "loss": 3.533,
160
- "step": 170
161
  },
162
  {
163
  "epoch": 15.0,
164
- "grad_norm": 4.59375,
165
- "learning_rate": 0.00016772815716257412,
166
- "loss": 3.4567,
167
- "step": 180
168
- },
169
- {
170
- "epoch": 15.833333333333334,
171
- "grad_norm": 5.125,
172
- "learning_rate": 0.00016357237482099684,
173
- "loss": 3.3726,
174
- "step": 190
175
- },
176
- {
177
- "epoch": 16.666666666666668,
178
- "grad_norm": 4.03125,
179
- "learning_rate": 0.00015922352526649803,
180
- "loss": 3.3007,
181
- "step": 200
182
- },
183
- {
184
- "epoch": 16.666666666666668,
185
- "eval_loss": 3.2971582412719727,
186
- "eval_runtime": 10.605,
187
- "eval_samples_per_second": 141.537,
188
- "eval_steps_per_second": 2.263,
189
- "step": 200
190
- },
191
- {
192
- "epoch": 17.5,
193
- "grad_norm": 3.984375,
194
- "learning_rate": 0.00015469481581224272,
195
- "loss": 3.2145,
196
- "step": 210
197
- },
198
- {
199
- "epoch": 18.333333333333332,
200
- "grad_norm": 4.0,
201
- "learning_rate": 0.00015000000000000001,
202
- "loss": 3.1833,
203
- "step": 220
204
- },
205
- {
206
- "epoch": 19.166666666666668,
207
- "grad_norm": 4.28125,
208
- "learning_rate": 0.00014515333583108896,
209
- "loss": 3.1072,
210
- "step": 230
211
- },
212
- {
213
- "epoch": 20.0,
214
- "grad_norm": 3.328125,
215
- "learning_rate": 0.00014016954246529696,
216
- "loss": 3.0609,
217
- "step": 240
218
- },
219
- {
220
- "epoch": 20.833333333333332,
221
- "grad_norm": 4.0,
222
- "learning_rate": 0.00013506375551927547,
223
- "loss": 3.0201,
224
- "step": 250
225
- },
226
- {
227
- "epoch": 20.833333333333332,
228
- "eval_loss": 3.0752980709075928,
229
- "eval_runtime": 10.4283,
230
- "eval_samples_per_second": 143.936,
231
- "eval_steps_per_second": 2.301,
232
- "step": 250
233
- },
234
- {
235
- "epoch": 21.666666666666668,
236
- "grad_norm": 4.03125,
237
- "learning_rate": 0.00012985148110016947,
238
- "loss": 2.9827,
239
- "step": 260
240
- },
241
- {
242
- "epoch": 22.5,
243
- "grad_norm": 3.953125,
244
- "learning_rate": 0.00012454854871407994,
245
- "loss": 2.9238,
246
- "step": 270
247
- },
248
- {
249
- "epoch": 23.333333333333332,
250
- "grad_norm": 4.25,
251
- "learning_rate": 0.00011917106319237386,
252
- "loss": 2.8875,
253
- "step": 280
254
- },
255
- {
256
- "epoch": 24.166666666666668,
257
- "grad_norm": 3.90625,
258
- "learning_rate": 0.00011373535578184082,
259
- "loss": 2.836,
260
- "step": 290
261
- },
262
- {
263
- "epoch": 25.0,
264
- "grad_norm": 3.703125,
265
- "learning_rate": 0.00010825793454723325,
266
- "loss": 2.8044,
267
- "step": 300
268
- },
269
- {
270
- "epoch": 25.0,
271
- "eval_loss": 3.1132636070251465,
272
- "eval_runtime": 10.3535,
273
- "eval_samples_per_second": 144.976,
274
- "eval_steps_per_second": 2.318,
275
- "step": 300
276
- },
277
- {
278
- "epoch": 25.833333333333332,
279
- "grad_norm": 3.84375,
280
- "learning_rate": 0.00010275543423681621,
281
- "loss": 2.7693,
282
- "step": 310
283
- },
284
- {
285
- "epoch": 26.666666666666668,
286
- "grad_norm": 4.25,
287
- "learning_rate": 9.724456576318381e-05,
288
- "loss": 2.7404,
289
- "step": 320
290
- },
291
- {
292
- "epoch": 27.5,
293
  "grad_norm": 3.28125,
294
- "learning_rate": 9.174206545276677e-05,
295
- "loss": 2.7013,
296
- "step": 330
297
- },
298
- {
299
- "epoch": 28.333333333333332,
300
- "grad_norm": 3.734375,
301
- "learning_rate": 8.626464421815919e-05,
302
- "loss": 2.6912,
303
- "step": 340
304
- },
305
- {
306
- "epoch": 29.166666666666668,
307
- "grad_norm": 3.546875,
308
- "learning_rate": 8.082893680762619e-05,
309
- "loss": 2.6459,
310
- "step": 350
311
- },
312
- {
313
- "epoch": 29.166666666666668,
314
- "eval_loss": 3.04976749420166,
315
- "eval_runtime": 10.3469,
316
- "eval_samples_per_second": 145.068,
317
- "eval_steps_per_second": 2.32,
318
- "step": 350
319
- },
320
- {
321
- "epoch": 30.0,
322
- "grad_norm": 3.6875,
323
- "learning_rate": 7.54514512859201e-05,
324
- "loss": 2.6366,
325
- "step": 360
326
- },
327
- {
328
- "epoch": 30.833333333333332,
329
- "grad_norm": 2.890625,
330
- "learning_rate": 7.014851889983057e-05,
331
- "loss": 2.6056,
332
- "step": 370
333
- },
334
- {
335
- "epoch": 31.666666666666668,
336
- "grad_norm": 4.03125,
337
- "learning_rate": 6.493624448072457e-05,
338
- "loss": 2.5957,
339
- "step": 380
340
- },
341
- {
342
- "epoch": 32.5,
343
- "grad_norm": 2.9375,
344
- "learning_rate": 5.983045753470308e-05,
345
- "loss": 2.5768,
346
- "step": 390
347
- },
348
- {
349
- "epoch": 33.333333333333336,
350
- "grad_norm": 2.75,
351
- "learning_rate": 5.484666416891109e-05,
352
- "loss": 2.5792,
353
- "step": 400
354
  },
355
  {
356
- "epoch": 33.333333333333336,
357
- "eval_loss": 3.1012673377990723,
358
- "eval_runtime": 10.582,
359
- "eval_samples_per_second": 141.844,
360
- "eval_steps_per_second": 2.268,
361
- "step": 400
362
  }
363
  ],
364
  "logging_steps": 10,
365
- "max_steps": 600,
366
  "num_input_tokens_seen": 0,
367
- "num_train_epochs": 50,
368
- "save_steps": 50,
369
  "stateful_callbacks": {
370
  "TrainerControl": {
371
  "args": {
@@ -373,12 +99,12 @@
373
  "should_evaluate": false,
374
  "should_log": false,
375
  "should_save": true,
376
- "should_training_stop": false
377
  },
378
  "attributes": {}
379
  }
380
  },
381
- "total_flos": 2.592225988129587e+16,
382
  "train_batch_size": 32,
383
  "trial_name": null,
384
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 16.666666666666668,
5
+ "eval_steps": 200,
6
+ "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.16666666666666666,
13
+ "eval_loss": 3.2664246559143066,
14
+ "eval_runtime": 5.1158,
15
+ "eval_samples_per_second": 293.406,
16
+ "eval_steps_per_second": 2.346,
17
  "step": 1
18
  },
 
 
 
 
 
 
 
19
  {
20
  "epoch": 1.6666666666666665,
21
+ "grad_norm": 3.84375,
22
+ "learning_rate": 0.00019863613034027224,
23
+ "loss": 6.2672,
24
+ "step": 10
 
 
 
 
 
 
 
25
  },
26
  {
27
  "epoch": 3.3333333333333335,
28
+ "grad_norm": 3.5,
29
+ "learning_rate": 0.0001879473751206489,
30
+ "loss": 5.5486,
31
+ "step": 20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  },
33
  {
34
  "epoch": 5.0,
35
+ "grad_norm": 2.71875,
36
+ "learning_rate": 0.00016772815716257412,
37
+ "loss": 5.2045,
38
+ "step": 30
 
 
 
 
 
 
 
39
  },
40
  {
41
  "epoch": 6.666666666666667,
42
+ "grad_norm": 3.03125,
43
+ "learning_rate": 0.00014016954246529696,
44
+ "loss": 5.0295,
45
+ "step": 40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  },
47
  {
48
  "epoch": 8.333333333333334,
49
+ "grad_norm": 4.25,
50
+ "learning_rate": 0.00010825793454723325,
51
+ "loss": 4.8139,
52
+ "step": 50
 
 
 
 
 
 
 
 
53
  },
54
  {
55
  "epoch": 10.0,
56
+ "grad_norm": 2.921875,
57
+ "learning_rate": 7.54514512859201e-05,
58
+ "loss": 4.6461,
59
+ "step": 60
 
 
 
 
 
 
 
60
  },
61
  {
62
  "epoch": 11.666666666666666,
63
+ "grad_norm": 2.34375,
64
+ "learning_rate": 4.530518418775733e-05,
65
+ "loss": 4.5564,
66
+ "step": 70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  },
68
  {
69
  "epoch": 13.333333333333334,
70
+ "grad_norm": 2.265625,
71
+ "learning_rate": 2.1085949060360654e-05,
72
+ "loss": 4.4932,
73
+ "step": 80
 
 
 
 
 
 
 
74
  },
75
  {
76
  "epoch": 15.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  "grad_norm": 3.28125,
78
+ "learning_rate": 5.418275829936537e-06,
79
+ "loss": 4.4706,
80
+ "step": 90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  },
82
  {
83
+ "epoch": 16.666666666666668,
84
+ "grad_norm": 1.953125,
85
+ "learning_rate": 0.0,
86
+ "loss": 4.4708,
87
+ "step": 100
 
88
  }
89
  ],
90
  "logging_steps": 10,
91
+ "max_steps": 100,
92
  "num_input_tokens_seen": 0,
93
+ "num_train_epochs": 17,
94
+ "save_steps": 200,
95
  "stateful_callbacks": {
96
  "TrainerControl": {
97
  "args": {
 
99
  "should_evaluate": false,
100
  "should_log": false,
101
  "should_save": true,
102
+ "should_training_stop": true
103
  },
104
  "attributes": {}
105
  }
106
  },
107
+ "total_flos": 1.2971582112661504e+16,
108
  "train_batch_size": 32,
109
  "trial_name": null,
110
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:701b8fdcf7ee9a32ee2820bf97b3ab785ffbf4aa2b41bcd64bfa018085f95834
3
  size 6840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91e32bef4fa18f7434f34b3ab9a59a991d12d4e7c7fd850b74f33c05529273df
3
  size 6840