DejanX13 committed on
Commit
c2c5740
·
verified ·
1 Parent(s): e9d57ee

Upload trainer_state.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. trainer_state.json +262 -255
trainer_state.json CHANGED
@@ -1,443 +1,450 @@
1
  {
2
  "best_global_step": 450,
3
- "best_metric": 0.570094108581543,
4
- "best_model_checkpoint": "./vit-results/checkpoint-200",
5
  "epoch": 10.0,
6
  "eval_steps": 50,
7
- "global_step": 480,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.20833333333333334,
14
- "grad_norm": 1.4851477146148682,
15
- "learning_rate": 1.9625e-05,
16
- "loss": 1.3347,
17
  "step": 10
18
  },
19
  {
20
- "epoch": 0.4166666666666667,
21
- "grad_norm": 2.0887389183044434,
22
- "learning_rate": 1.9208333333333337e-05,
23
- "loss": 1.2739,
24
  "step": 20
25
  },
26
  {
27
- "epoch": 0.625,
28
- "grad_norm": 1.6611489057540894,
29
- "learning_rate": 1.8791666666666668e-05,
30
- "loss": 1.2122,
31
  "step": 30
32
  },
33
  {
34
- "epoch": 0.8333333333333334,
35
- "grad_norm": 1.7925909757614136,
36
- "learning_rate": 1.8375e-05,
37
- "loss": 1.181,
38
  "step": 40
39
  },
40
  {
41
- "epoch": 1.0416666666666667,
42
- "grad_norm": 1.7047886848449707,
43
- "learning_rate": 1.7958333333333334e-05,
44
- "loss": 1.1578,
45
  "step": 50
46
  },
47
  {
48
- "epoch": 1.0416666666666667,
49
- "eval_accuracy": 0.4625,
50
- "eval_loss": 1.135870099067688,
51
- "eval_runtime": 1.2659,
52
- "eval_samples_per_second": 63.197,
53
- "eval_steps_per_second": 7.9,
54
  "step": 50
55
  },
56
  {
57
- "epoch": 1.25,
58
- "grad_norm": 1.962847352027893,
59
- "learning_rate": 1.754166666666667e-05,
60
- "loss": 1.0607,
61
  "step": 60
62
  },
63
  {
64
- "epoch": 1.4583333333333333,
65
- "grad_norm": 2.020244836807251,
66
- "learning_rate": 1.7125e-05,
67
- "loss": 1.0081,
68
  "step": 70
69
  },
70
  {
71
- "epoch": 1.6666666666666665,
72
- "grad_norm": 2.294292449951172,
73
- "learning_rate": 1.6708333333333334e-05,
74
- "loss": 0.9995,
75
  "step": 80
76
  },
77
  {
78
- "epoch": 1.875,
79
- "grad_norm": 2.0403995513916016,
80
- "learning_rate": 1.629166666666667e-05,
81
- "loss": 1.0403,
82
  "step": 90
83
  },
84
  {
85
- "epoch": 2.0833333333333335,
86
- "grad_norm": 3.781327724456787,
87
- "learning_rate": 1.5875e-05,
88
- "loss": 0.9551,
89
  "step": 100
90
  },
91
  {
92
- "epoch": 2.0833333333333335,
93
- "eval_accuracy": 0.725,
94
- "eval_loss": 1.0034687519073486,
95
- "eval_runtime": 1.2636,
96
- "eval_samples_per_second": 63.309,
97
- "eval_steps_per_second": 7.914,
98
  "step": 100
99
  },
100
  {
101
- "epoch": 2.2916666666666665,
102
- "grad_norm": 2.135753631591797,
103
- "learning_rate": 1.5458333333333334e-05,
104
- "loss": 0.8584,
105
  "step": 110
106
  },
107
  {
108
- "epoch": 2.5,
109
- "grad_norm": 1.9046008586883545,
110
- "learning_rate": 1.5041666666666667e-05,
111
- "loss": 0.8578,
112
  "step": 120
113
  },
114
  {
115
- "epoch": 2.7083333333333335,
116
- "grad_norm": 1.7979331016540527,
117
- "learning_rate": 1.4625e-05,
118
- "loss": 0.7959,
119
  "step": 130
120
  },
121
  {
122
- "epoch": 2.9166666666666665,
123
- "grad_norm": 2.755568742752075,
124
- "learning_rate": 1.4208333333333336e-05,
125
- "loss": 0.8676,
126
  "step": 140
127
  },
128
  {
129
- "epoch": 3.125,
130
- "grad_norm": 1.752200961112976,
131
- "learning_rate": 1.3791666666666667e-05,
132
- "loss": 0.7547,
133
  "step": 150
134
  },
135
  {
136
- "epoch": 3.125,
137
- "eval_accuracy": 0.7625,
138
- "eval_loss": 0.8992247581481934,
139
- "eval_runtime": 1.2311,
140
- "eval_samples_per_second": 64.981,
141
- "eval_steps_per_second": 8.123,
142
  "step": 150
143
  },
144
  {
145
- "epoch": 3.3333333333333335,
146
- "grad_norm": 1.913486361503601,
147
- "learning_rate": 1.3375e-05,
148
- "loss": 0.7207,
149
  "step": 160
150
  },
151
  {
152
- "epoch": 3.5416666666666665,
153
- "grad_norm": 2.5411376953125,
154
- "learning_rate": 1.2958333333333334e-05,
155
- "loss": 0.6804,
156
  "step": 170
157
  },
158
  {
159
- "epoch": 3.75,
160
- "grad_norm": 1.997448444366455,
161
- "learning_rate": 1.2541666666666667e-05,
162
- "loss": 0.6733,
163
  "step": 180
164
  },
165
  {
166
- "epoch": 3.9583333333333335,
167
- "grad_norm": 1.744899868965149,
168
- "learning_rate": 1.2125e-05,
169
- "loss": 0.6074,
170
  "step": 190
171
  },
172
  {
173
- "epoch": 4.166666666666667,
174
- "grad_norm": 1.6227153539657593,
175
- "learning_rate": 1.1708333333333334e-05,
176
- "loss": 0.5998,
177
  "step": 200
178
  },
179
  {
180
- "epoch": 4.166666666666667,
181
- "eval_accuracy": 0.8,
182
- "eval_loss": 0.7827270030975342,
183
- "eval_runtime": 1.2396,
184
- "eval_samples_per_second": 64.536,
185
- "eval_steps_per_second": 8.067,
186
  "step": 200
187
  },
188
  {
189
- "epoch": 4.375,
190
- "grad_norm": 1.6534216403961182,
191
- "learning_rate": 1.1291666666666667e-05,
192
- "loss": 0.5243,
193
  "step": 210
194
  },
195
  {
196
- "epoch": 4.583333333333333,
197
- "grad_norm": 1.8691354990005493,
198
- "learning_rate": 1.0875e-05,
199
- "loss": 0.5312,
200
  "step": 220
201
  },
202
  {
203
- "epoch": 4.791666666666667,
204
- "grad_norm": 1.7466825246810913,
205
- "learning_rate": 1.0458333333333335e-05,
206
- "loss": 0.492,
207
  "step": 230
208
  },
209
  {
210
- "epoch": 5.0,
211
- "grad_norm": 3.337895154953003,
212
- "learning_rate": 1.0041666666666667e-05,
213
- "loss": 0.4978,
214
  "step": 240
215
  },
216
  {
217
- "epoch": 5.208333333333333,
218
- "grad_norm": 2.0019006729125977,
219
- "learning_rate": 9.625e-06,
220
- "loss": 0.4259,
221
  "step": 250
222
  },
223
  {
224
- "epoch": 5.208333333333333,
225
- "eval_accuracy": 0.8125,
226
- "eval_loss": 0.6670618653297424,
227
- "eval_runtime": 1.2584,
228
- "eval_samples_per_second": 63.572,
229
- "eval_steps_per_second": 7.946,
230
  "step": 250
231
  },
232
  {
233
- "epoch": 5.416666666666667,
234
- "grad_norm": 1.431734323501587,
235
- "learning_rate": 9.208333333333333e-06,
236
- "loss": 0.4032,
237
  "step": 260
238
  },
239
  {
240
- "epoch": 5.625,
241
- "grad_norm": 2.39288592338562,
242
- "learning_rate": 8.791666666666667e-06,
243
- "loss": 0.4095,
244
  "step": 270
245
  },
246
  {
247
- "epoch": 5.833333333333333,
248
- "grad_norm": 2.3530094623565674,
249
- "learning_rate": 8.375e-06,
250
- "loss": 0.3499,
251
  "step": 280
252
  },
253
  {
254
- "epoch": 6.041666666666667,
255
- "grad_norm": 1.5345488786697388,
256
- "learning_rate": 7.958333333333333e-06,
257
- "loss": 0.333,
258
  "step": 290
259
  },
260
  {
261
- "epoch": 6.25,
262
- "grad_norm": 2.1422934532165527,
263
- "learning_rate": 7.541666666666667e-06,
264
- "loss": 0.3336,
265
  "step": 300
266
  },
267
  {
268
- "epoch": 6.25,
269
- "eval_accuracy": 0.725,
270
- "eval_loss": 0.6925244927406311,
271
- "eval_runtime": 1.3296,
272
- "eval_samples_per_second": 60.169,
273
- "eval_steps_per_second": 7.521,
274
  "step": 300
275
  },
276
  {
277
- "epoch": 6.458333333333333,
278
- "grad_norm": 1.0821270942687988,
279
- "learning_rate": 7.125e-06,
280
- "loss": 0.2689,
281
  "step": 310
282
  },
283
  {
284
- "epoch": 6.666666666666667,
285
- "grad_norm": 1.5213534832000732,
286
- "learning_rate": 6.708333333333333e-06,
287
- "loss": 0.2875,
288
  "step": 320
289
  },
290
  {
291
- "epoch": 6.875,
292
- "grad_norm": 2.103844165802002,
293
- "learning_rate": 6.291666666666667e-06,
294
- "loss": 0.2881,
295
  "step": 330
296
  },
297
  {
298
- "epoch": 7.083333333333333,
299
- "grad_norm": 1.164125680923462,
300
- "learning_rate": 5.8750000000000005e-06,
301
- "loss": 0.25,
302
  "step": 340
303
  },
304
  {
305
- "epoch": 7.291666666666667,
306
- "grad_norm": 0.9860062599182129,
307
- "learning_rate": 5.458333333333333e-06,
308
- "loss": 0.2409,
309
  "step": 350
310
  },
311
  {
312
- "epoch": 7.291666666666667,
313
  "eval_accuracy": 0.8125,
314
- "eval_loss": 0.5735878348350525,
315
- "eval_runtime": 1.2591,
316
- "eval_samples_per_second": 63.536,
317
- "eval_steps_per_second": 7.942,
318
  "step": 350
319
  },
320
  {
321
- "epoch": 7.5,
322
- "grad_norm": 0.9531723856925964,
323
- "learning_rate": 5.041666666666667e-06,
324
- "loss": 0.2265,
325
  "step": 360
326
  },
327
  {
328
- "epoch": 7.708333333333333,
329
- "grad_norm": 1.6497358083724976,
330
- "learning_rate": 4.625000000000001e-06,
331
- "loss": 0.2423,
332
  "step": 370
333
  },
334
  {
335
- "epoch": 7.916666666666667,
336
- "grad_norm": 1.4591480493545532,
337
- "learning_rate": 4.208333333333333e-06,
338
- "loss": 0.2121,
339
  "step": 380
340
  },
341
  {
342
- "epoch": 8.125,
343
- "grad_norm": 0.8170286417007446,
344
- "learning_rate": 3.7916666666666666e-06,
345
- "loss": 0.2122,
346
  "step": 390
347
  },
348
  {
349
- "epoch": 8.333333333333334,
350
- "grad_norm": 0.8556548953056335,
351
- "learning_rate": 3.3750000000000003e-06,
352
- "loss": 0.1982,
353
  "step": 400
354
  },
355
  {
356
- "epoch": 8.333333333333334,
357
  "eval_accuracy": 0.8125,
358
- "eval_loss": 0.5838413238525391,
359
- "eval_runtime": 1.3799,
360
- "eval_samples_per_second": 57.977,
361
- "eval_steps_per_second": 7.247,
362
  "step": 400
363
  },
364
  {
365
- "epoch": 8.541666666666666,
366
- "grad_norm": 0.9361312389373779,
367
- "learning_rate": 2.9583333333333335e-06,
368
- "loss": 0.1997,
369
  "step": 410
370
  },
371
  {
372
- "epoch": 8.75,
373
- "grad_norm": 0.8666655421257019,
374
- "learning_rate": 2.5416666666666668e-06,
375
- "loss": 0.1902,
376
  "step": 420
377
  },
378
  {
379
- "epoch": 8.958333333333334,
380
- "grad_norm": 0.8868537545204163,
381
- "learning_rate": 2.125e-06,
382
- "loss": 0.1861,
383
  "step": 430
384
  },
385
  {
386
- "epoch": 9.166666666666666,
387
- "grad_norm": 1.152140498161316,
388
- "learning_rate": 1.7083333333333334e-06,
389
- "loss": 0.1866,
390
  "step": 440
391
  },
392
  {
393
- "epoch": 9.375,
394
- "grad_norm": 0.7092148661613464,
395
- "learning_rate": 1.2916666666666669e-06,
396
- "loss": 0.1732,
397
  "step": 450
398
  },
399
  {
400
- "epoch": 9.375,
401
- "eval_accuracy": 0.8125,
402
- "eval_loss": 0.570094108581543,
403
- "eval_runtime": 1.4058,
404
- "eval_samples_per_second": 56.905,
405
- "eval_steps_per_second": 7.113,
406
  "step": 450
407
  },
408
  {
409
- "epoch": 9.583333333333334,
410
- "grad_norm": 0.7921653985977173,
411
- "learning_rate": 8.75e-07,
412
- "loss": 0.1803,
413
  "step": 460
414
  },
415
  {
416
- "epoch": 9.791666666666666,
417
- "grad_norm": 0.735058605670929,
418
- "learning_rate": 4.583333333333333e-07,
419
- "loss": 0.1927,
420
  "step": 470
421
  },
422
  {
423
- "epoch": 10.0,
424
- "grad_norm": 1.2600841522216797,
425
- "learning_rate": 4.166666666666667e-08,
426
- "loss": 0.174,
427
  "step": 480
428
  },
429
  {
430
  "epoch": 10.0,
431
- "step": 480,
432
- "total_flos": 5.866248766604083e+17,
433
- "train_loss": 0.5593519407014053,
434
- "train_runtime": 335.8262,
435
- "train_samples_per_second": 22.541,
436
- "train_steps_per_second": 1.429
 
 
 
 
 
 
 
437
  }
438
  ],
439
  "logging_steps": 10,
440
- "max_steps": 480,
441
  "num_input_tokens_seen": 0,
442
  "num_train_epochs": 10,
443
  "save_steps": 100,
@@ -453,7 +460,7 @@
453
  "attributes": {}
454
  }
455
  },
456
- "total_flos": 5.866248766604083e+17,
457
  "train_batch_size": 16,
458
  "trial_name": null,
459
  "trial_params": null
 
1
  {
2
  "best_global_step": 450,
3
+ "best_metric": 0.5457363724708557,
4
+ "best_model_checkpoint": "./vit-results/checkpoint-400",
5
  "epoch": 10.0,
6
  "eval_steps": 50,
7
+ "global_step": 490,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.20408163265306123,
14
+ "grad_norm": 1.3306224346160889,
15
+ "learning_rate": 1.963265306122449e-05,
16
+ "loss": 1.3374,
17
  "step": 10
18
  },
19
  {
20
+ "epoch": 0.40816326530612246,
21
+ "grad_norm": 1.4784319400787354,
22
+ "learning_rate": 1.922448979591837e-05,
23
+ "loss": 1.265,
24
  "step": 20
25
  },
26
  {
27
+ "epoch": 0.6122448979591837,
28
+ "grad_norm": 2.0332705974578857,
29
+ "learning_rate": 1.8816326530612246e-05,
30
+ "loss": 1.2081,
31
  "step": 30
32
  },
33
  {
34
+ "epoch": 0.8163265306122449,
35
+ "grad_norm": 1.4771761894226074,
36
+ "learning_rate": 1.8408163265306125e-05,
37
+ "loss": 1.2104,
38
  "step": 40
39
  },
40
  {
41
+ "epoch": 1.0204081632653061,
42
+ "grad_norm": 1.5638868808746338,
43
+ "learning_rate": 1.8e-05,
44
+ "loss": 1.1453,
45
  "step": 50
46
  },
47
  {
48
+ "epoch": 1.0204081632653061,
49
+ "eval_accuracy": 0.475,
50
+ "eval_loss": 1.1232645511627197,
51
+ "eval_runtime": 1.3068,
52
+ "eval_samples_per_second": 61.216,
53
+ "eval_steps_per_second": 7.652,
54
  "step": 50
55
  },
56
  {
57
+ "epoch": 1.2244897959183674,
58
+ "grad_norm": 2.1241655349731445,
59
+ "learning_rate": 1.759183673469388e-05,
60
+ "loss": 1.0284,
61
  "step": 60
62
  },
63
  {
64
+ "epoch": 1.4285714285714286,
65
+ "grad_norm": 2.0577926635742188,
66
+ "learning_rate": 1.7183673469387755e-05,
67
+ "loss": 1.0428,
68
  "step": 70
69
  },
70
  {
71
+ "epoch": 1.6326530612244898,
72
+ "grad_norm": 1.7612333297729492,
73
+ "learning_rate": 1.6775510204081634e-05,
74
+ "loss": 1.008,
75
  "step": 80
76
  },
77
  {
78
+ "epoch": 1.836734693877551,
79
+ "grad_norm": 2.2085816860198975,
80
+ "learning_rate": 1.6367346938775513e-05,
81
+ "loss": 1.0095,
82
  "step": 90
83
  },
84
  {
85
+ "epoch": 2.0408163265306123,
86
+ "grad_norm": 1.5611677169799805,
87
+ "learning_rate": 1.595918367346939e-05,
88
+ "loss": 0.9155,
89
  "step": 100
90
  },
91
  {
92
+ "epoch": 2.0408163265306123,
93
+ "eval_accuracy": 0.7,
94
+ "eval_loss": 0.964668869972229,
95
+ "eval_runtime": 1.2647,
96
+ "eval_samples_per_second": 63.255,
97
+ "eval_steps_per_second": 7.907,
98
  "step": 100
99
  },
100
  {
101
+ "epoch": 2.2448979591836733,
102
+ "grad_norm": 2.1504409313201904,
103
+ "learning_rate": 1.5551020408163265e-05,
104
+ "loss": 0.8396,
105
  "step": 110
106
  },
107
  {
108
+ "epoch": 2.4489795918367347,
109
+ "grad_norm": 1.9358028173446655,
110
+ "learning_rate": 1.5142857142857144e-05,
111
+ "loss": 0.8618,
112
  "step": 120
113
  },
114
  {
115
+ "epoch": 2.6530612244897958,
116
+ "grad_norm": 1.5132843255996704,
117
+ "learning_rate": 1.4734693877551021e-05,
118
+ "loss": 0.8077,
119
  "step": 130
120
  },
121
  {
122
+ "epoch": 2.857142857142857,
123
+ "grad_norm": 2.5402004718780518,
124
+ "learning_rate": 1.43265306122449e-05,
125
+ "loss": 0.7742,
126
  "step": 140
127
  },
128
  {
129
+ "epoch": 3.061224489795918,
130
+ "grad_norm": 2.259558916091919,
131
+ "learning_rate": 1.3918367346938776e-05,
132
+ "loss": 0.7638,
133
  "step": 150
134
  },
135
  {
136
+ "epoch": 3.061224489795918,
137
+ "eval_accuracy": 0.75,
138
+ "eval_loss": 0.8326537013053894,
139
+ "eval_runtime": 1.2728,
140
+ "eval_samples_per_second": 62.853,
141
+ "eval_steps_per_second": 7.857,
142
  "step": 150
143
  },
144
  {
145
+ "epoch": 3.2653061224489797,
146
+ "grad_norm": 2.010672092437744,
147
+ "learning_rate": 1.3510204081632654e-05,
148
+ "loss": 0.672,
149
  "step": 160
150
  },
151
  {
152
+ "epoch": 3.4693877551020407,
153
+ "grad_norm": 1.8449556827545166,
154
+ "learning_rate": 1.3102040816326531e-05,
155
+ "loss": 0.6367,
156
  "step": 170
157
  },
158
  {
159
+ "epoch": 3.673469387755102,
160
+ "grad_norm": 1.8838822841644287,
161
+ "learning_rate": 1.2693877551020409e-05,
162
+ "loss": 0.6125,
163
  "step": 180
164
  },
165
  {
166
+ "epoch": 3.877551020408163,
167
+ "grad_norm": 2.325084686279297,
168
+ "learning_rate": 1.2285714285714288e-05,
169
+ "loss": 0.5768,
170
  "step": 190
171
  },
172
  {
173
+ "epoch": 4.081632653061225,
174
+ "grad_norm": 2.084455966949463,
175
+ "learning_rate": 1.1877551020408165e-05,
176
+ "loss": 0.537,
177
  "step": 200
178
  },
179
  {
180
+ "epoch": 4.081632653061225,
181
+ "eval_accuracy": 0.7875,
182
+ "eval_loss": 0.7291887402534485,
183
+ "eval_runtime": 1.2754,
184
+ "eval_samples_per_second": 62.726,
185
+ "eval_steps_per_second": 7.841,
186
  "step": 200
187
  },
188
  {
189
+ "epoch": 4.285714285714286,
190
+ "grad_norm": 2.3336734771728516,
191
+ "learning_rate": 1.146938775510204e-05,
192
+ "loss": 0.5,
193
  "step": 210
194
  },
195
  {
196
+ "epoch": 4.489795918367347,
197
+ "grad_norm": 2.6175546646118164,
198
+ "learning_rate": 1.1061224489795918e-05,
199
+ "loss": 0.4665,
200
  "step": 220
201
  },
202
  {
203
+ "epoch": 4.6938775510204085,
204
+ "grad_norm": 1.6590876579284668,
205
+ "learning_rate": 1.0653061224489796e-05,
206
+ "loss": 0.448,
207
  "step": 230
208
  },
209
  {
210
+ "epoch": 4.8979591836734695,
211
+ "grad_norm": 1.8714507818222046,
212
+ "learning_rate": 1.0244897959183675e-05,
213
+ "loss": 0.4491,
214
  "step": 240
215
  },
216
  {
217
+ "epoch": 5.1020408163265305,
218
+ "grad_norm": 2.239349365234375,
219
+ "learning_rate": 9.836734693877552e-06,
220
+ "loss": 0.3957,
221
  "step": 250
222
  },
223
  {
224
+ "epoch": 5.1020408163265305,
225
+ "eval_accuracy": 0.825,
226
+ "eval_loss": 0.6328426599502563,
227
+ "eval_runtime": 1.2652,
228
+ "eval_samples_per_second": 63.23,
229
+ "eval_steps_per_second": 7.904,
230
  "step": 250
231
  },
232
  {
233
+ "epoch": 5.3061224489795915,
234
+ "grad_norm": 2.4021127223968506,
235
+ "learning_rate": 9.42857142857143e-06,
236
+ "loss": 0.3601,
237
  "step": 260
238
  },
239
  {
240
+ "epoch": 5.510204081632653,
241
+ "grad_norm": 1.2185922861099243,
242
+ "learning_rate": 9.020408163265307e-06,
243
+ "loss": 0.3431,
244
  "step": 270
245
  },
246
  {
247
+ "epoch": 5.714285714285714,
248
+ "grad_norm": 1.0699914693832397,
249
+ "learning_rate": 8.612244897959184e-06,
250
+ "loss": 0.3646,
251
  "step": 280
252
  },
253
  {
254
+ "epoch": 5.918367346938775,
255
+ "grad_norm": 1.5863635540008545,
256
+ "learning_rate": 8.204081632653062e-06,
257
+ "loss": 0.3129,
258
  "step": 290
259
  },
260
  {
261
+ "epoch": 6.122448979591836,
262
+ "grad_norm": 1.0668057203292847,
263
+ "learning_rate": 7.79591836734694e-06,
264
+ "loss": 0.2842,
265
  "step": 300
266
  },
267
  {
268
+ "epoch": 6.122448979591836,
269
+ "eval_accuracy": 0.7875,
270
+ "eval_loss": 0.5894995927810669,
271
+ "eval_runtime": 1.2447,
272
+ "eval_samples_per_second": 64.272,
273
+ "eval_steps_per_second": 8.034,
274
  "step": 300
275
  },
276
  {
277
+ "epoch": 6.326530612244898,
278
+ "grad_norm": 1.0372223854064941,
279
+ "learning_rate": 7.387755102040817e-06,
280
+ "loss": 0.2725,
281
  "step": 310
282
  },
283
  {
284
+ "epoch": 6.530612244897959,
285
+ "grad_norm": 1.855797529220581,
286
+ "learning_rate": 6.979591836734695e-06,
287
+ "loss": 0.2596,
288
  "step": 320
289
  },
290
  {
291
+ "epoch": 6.73469387755102,
292
+ "grad_norm": 1.3400880098342896,
293
+ "learning_rate": 6.571428571428572e-06,
294
+ "loss": 0.2579,
295
  "step": 330
296
  },
297
  {
298
+ "epoch": 6.938775510204081,
299
+ "grad_norm": 1.0782897472381592,
300
+ "learning_rate": 6.163265306122449e-06,
301
+ "loss": 0.2394,
302
  "step": 340
303
  },
304
  {
305
+ "epoch": 7.142857142857143,
306
+ "grad_norm": 1.1473782062530518,
307
+ "learning_rate": 5.755102040816327e-06,
308
+ "loss": 0.2266,
309
  "step": 350
310
  },
311
  {
312
+ "epoch": 7.142857142857143,
313
  "eval_accuracy": 0.8125,
314
+ "eval_loss": 0.5921751260757446,
315
+ "eval_runtime": 1.2621,
316
+ "eval_samples_per_second": 63.388,
317
+ "eval_steps_per_second": 7.924,
318
  "step": 350
319
  },
320
  {
321
+ "epoch": 7.346938775510204,
322
+ "grad_norm": 0.8374194502830505,
323
+ "learning_rate": 5.3469387755102045e-06,
324
+ "loss": 0.2043,
325
  "step": 360
326
  },
327
  {
328
+ "epoch": 7.551020408163265,
329
+ "grad_norm": 0.9697467684745789,
330
+ "learning_rate": 4.938775510204082e-06,
331
+ "loss": 0.2076,
332
  "step": 370
333
  },
334
  {
335
+ "epoch": 7.755102040816326,
336
+ "grad_norm": 1.7021687030792236,
337
+ "learning_rate": 4.530612244897959e-06,
338
+ "loss": 0.2111,
339
  "step": 380
340
  },
341
  {
342
+ "epoch": 7.959183673469388,
343
+ "grad_norm": 0.8718199729919434,
344
+ "learning_rate": 4.122448979591837e-06,
345
+ "loss": 0.188,
346
  "step": 390
347
  },
348
  {
349
+ "epoch": 8.16326530612245,
350
+ "grad_norm": 0.7752737998962402,
351
+ "learning_rate": 3.7142857142857146e-06,
352
+ "loss": 0.1798,
353
  "step": 400
354
  },
355
  {
356
+ "epoch": 8.16326530612245,
357
  "eval_accuracy": 0.8125,
358
+ "eval_loss": 0.5628954172134399,
359
+ "eval_runtime": 1.291,
360
+ "eval_samples_per_second": 61.967,
361
+ "eval_steps_per_second": 7.746,
362
  "step": 400
363
  },
364
  {
365
+ "epoch": 8.36734693877551,
366
+ "grad_norm": 1.137776494026184,
367
+ "learning_rate": 3.3061224489795924e-06,
368
+ "loss": 0.1778,
369
  "step": 410
370
  },
371
  {
372
+ "epoch": 8.571428571428571,
373
+ "grad_norm": 0.7163811326026917,
374
+ "learning_rate": 2.8979591836734694e-06,
375
+ "loss": 0.1697,
376
  "step": 420
377
  },
378
  {
379
+ "epoch": 8.775510204081632,
380
+ "grad_norm": 0.860792875289917,
381
+ "learning_rate": 2.489795918367347e-06,
382
+ "loss": 0.1803,
383
  "step": 430
384
  },
385
  {
386
+ "epoch": 8.979591836734693,
387
+ "grad_norm": 0.8092204928398132,
388
+ "learning_rate": 2.0816326530612247e-06,
389
+ "loss": 0.1792,
390
  "step": 440
391
  },
392
  {
393
+ "epoch": 9.183673469387756,
394
+ "grad_norm": 3.376288890838623,
395
+ "learning_rate": 1.6734693877551023e-06,
396
+ "loss": 0.1746,
397
  "step": 450
398
  },
399
  {
400
+ "epoch": 9.183673469387756,
401
+ "eval_accuracy": 0.825,
402
+ "eval_loss": 0.5457363724708557,
403
+ "eval_runtime": 1.2999,
404
+ "eval_samples_per_second": 61.544,
405
+ "eval_steps_per_second": 7.693,
406
  "step": 450
407
  },
408
  {
409
+ "epoch": 9.387755102040817,
410
+ "grad_norm": 0.6227843165397644,
411
+ "learning_rate": 1.2653061224489795e-06,
412
+ "loss": 0.1631,
413
  "step": 460
414
  },
415
  {
416
+ "epoch": 9.591836734693878,
417
+ "grad_norm": 0.7905530333518982,
418
+ "learning_rate": 8.571428571428572e-07,
419
+ "loss": 0.1603,
420
  "step": 470
421
  },
422
  {
423
+ "epoch": 9.795918367346939,
424
+ "grad_norm": 0.6496562361717224,
425
+ "learning_rate": 4.489795918367347e-07,
426
+ "loss": 0.1581,
427
  "step": 480
428
  },
429
  {
430
  "epoch": 10.0,
431
+ "grad_norm": 0.9721047282218933,
432
+ "learning_rate": 4.0816326530612253e-08,
433
+ "loss": 0.1594,
434
+ "step": 490
435
+ },
436
+ {
437
+ "epoch": 10.0,
438
+ "step": 490,
439
+ "total_flos": 6.013486186109338e+17,
440
+ "train_loss": 0.5295110111333886,
441
+ "train_runtime": 345.9048,
442
+ "train_samples_per_second": 22.434,
443
+ "train_steps_per_second": 1.417
444
  }
445
  ],
446
  "logging_steps": 10,
447
+ "max_steps": 490,
448
  "num_input_tokens_seen": 0,
449
  "num_train_epochs": 10,
450
  "save_steps": 100,
 
460
  "attributes": {}
461
  }
462
  },
463
+ "total_flos": 6.013486186109338e+17,
464
  "train_batch_size": 16,
465
  "trial_name": null,
466
  "trial_params": null