Prot10 committed on
Commit
8ba28c8
·
1 Parent(s): cfe9388

End of training

Browse files
Files changed (5) hide show
  1. README.md +2 -2
  2. all_results.json +11 -11
  3. eval_results.json +6 -6
  4. train_results.json +6 -6
  5. trainer_state.json +442 -316
README.md CHANGED
@@ -17,8 +17,8 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 1.7374
21
- - Accuracy: 0.3736
22
 
23
  ## Model description
24
 
 
17
 
18
  This model is a fine-tuned version of [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 1.6048
21
+ - Accuracy: 0.3929
22
 
23
  ## Model description
24
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 27.91,
3
- "eval_accuracy": 0.42657342657342656,
4
- "eval_loss": 1.6125619411468506,
5
- "eval_runtime": 7.9305,
6
- "eval_samples_per_second": 54.095,
7
- "eval_steps_per_second": 1.765,
8
- "total_flos": 2.9422949721372426e+18,
9
- "train_loss": 0.9840216811498006,
10
- "train_runtime": 1797.847,
11
- "train_samples_per_second": 22.694,
12
- "train_steps_per_second": 0.167
13
  }
 
1
  {
2
+ "epoch": 29.54,
3
+ "eval_accuracy": 0.39285714285714285,
4
+ "eval_loss": 1.604812502861023,
5
+ "eval_runtime": 5.7371,
6
+ "eval_samples_per_second": 63.447,
7
+ "eval_steps_per_second": 2.092,
8
+ "total_flos": 4.691568687003814e+18,
9
+ "train_loss": 0.9943243801593781,
10
+ "train_runtime": 2664.7299,
11
+ "train_samples_per_second": 23.068,
12
+ "train_steps_per_second": 0.18
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 27.91,
3
- "eval_accuracy": 0.42657342657342656,
4
- "eval_loss": 1.6125619411468506,
5
- "eval_runtime": 7.9305,
6
- "eval_samples_per_second": 54.095,
7
- "eval_steps_per_second": 1.765
8
  }
 
1
  {
2
+ "epoch": 29.54,
3
+ "eval_accuracy": 0.39285714285714285,
4
+ "eval_loss": 1.604812502861023,
5
+ "eval_runtime": 5.7371,
6
+ "eval_samples_per_second": 63.447,
7
+ "eval_steps_per_second": 2.092
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 27.91,
3
- "total_flos": 2.9422949721372426e+18,
4
- "train_loss": 0.9840216811498006,
5
- "train_runtime": 1797.847,
6
- "train_samples_per_second": 22.694,
7
- "train_steps_per_second": 0.167
8
  }
 
1
  {
2
+ "epoch": 29.54,
3
+ "total_flos": 4.691568687003814e+18,
4
+ "train_loss": 0.9943243801593781,
5
+ "train_runtime": 2664.7299,
6
+ "train_samples_per_second": 23.068,
7
+ "train_steps_per_second": 0.18
8
  }
trainer_state.json CHANGED
@@ -1,460 +1,586 @@
1
  {
2
- "best_metric": 0.42657342657342656,
3
- "best_model_checkpoint": "vit-base-patch16-224-for-pre_evaluation/checkpoint-236",
4
- "epoch": 27.906976744186046,
5
  "eval_steps": 500,
6
- "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.93,
13
- "learning_rate": 1.6666666666666667e-05,
14
- "loss": 1.6266,
15
  "step": 10
16
  },
17
  {
18
- "epoch": 0.93,
19
- "eval_accuracy": 0.3006993006993007,
20
- "eval_loss": 1.5307577848434448,
21
- "eval_runtime": 10.578,
22
- "eval_samples_per_second": 40.556,
23
- "eval_steps_per_second": 1.324,
24
- "step": 10
25
  },
26
  {
27
- "epoch": 1.86,
28
- "learning_rate": 3.3333333333333335e-05,
29
- "loss": 1.4973,
30
  "step": 20
31
  },
32
  {
33
- "epoch": 1.95,
34
- "eval_accuracy": 0.3123543123543124,
35
- "eval_loss": 1.5000418424606323,
36
- "eval_runtime": 7.5932,
37
- "eval_samples_per_second": 56.498,
38
- "eval_steps_per_second": 1.844,
39
- "step": 21
40
- },
41
- {
42
- "epoch": 2.79,
43
- "learning_rate": 5e-05,
44
- "loss": 1.4636,
45
  "step": 30
46
  },
47
  {
48
- "epoch": 2.98,
49
- "eval_accuracy": 0.34498834498834496,
50
- "eval_loss": 1.472951054573059,
51
- "eval_runtime": 11.6515,
52
- "eval_samples_per_second": 36.819,
53
- "eval_steps_per_second": 1.202,
54
  "step": 32
55
  },
56
  {
57
- "epoch": 3.72,
58
- "learning_rate": 4.814814814814815e-05,
59
- "loss": 1.4185,
60
  "step": 40
61
  },
62
  {
63
- "epoch": 4.0,
64
- "eval_accuracy": 0.30536130536130535,
65
- "eval_loss": 1.5051661729812622,
66
- "eval_runtime": 7.0051,
67
- "eval_samples_per_second": 61.241,
68
- "eval_steps_per_second": 1.999,
69
- "step": 43
70
  },
71
  {
72
- "epoch": 4.65,
73
- "learning_rate": 4.62962962962963e-05,
74
- "loss": 1.4147,
75
  "step": 50
76
  },
77
  {
78
- "epoch": 4.93,
79
- "eval_accuracy": 0.34965034965034963,
80
- "eval_loss": 1.4455597400665283,
81
- "eval_runtime": 6.741,
82
- "eval_samples_per_second": 63.641,
83
- "eval_steps_per_second": 2.077,
84
- "step": 53
85
- },
86
- {
87
- "epoch": 5.58,
88
- "learning_rate": 4.4444444444444447e-05,
89
- "loss": 1.337,
90
  "step": 60
91
  },
92
  {
93
- "epoch": 5.95,
94
- "eval_accuracy": 0.34265734265734266,
95
- "eval_loss": 1.4735510349273682,
96
- "eval_runtime": 7.0594,
97
- "eval_samples_per_second": 60.77,
98
- "eval_steps_per_second": 1.983,
99
- "step": 64
100
  },
101
  {
102
- "epoch": 6.51,
103
- "learning_rate": 4.259259259259259e-05,
104
- "loss": 1.2869,
105
  "step": 70
106
  },
107
  {
108
- "epoch": 6.98,
109
- "eval_accuracy": 0.37995337995337997,
110
- "eval_loss": 1.4170001745224,
111
- "eval_runtime": 7.3984,
112
- "eval_samples_per_second": 57.985,
113
- "eval_steps_per_second": 1.892,
114
- "step": 75
115
- },
116
- {
117
- "epoch": 7.44,
118
- "learning_rate": 4.074074074074074e-05,
119
- "loss": 1.241,
120
  "step": 80
121
  },
122
  {
123
- "epoch": 8.0,
124
- "eval_accuracy": 0.372960372960373,
125
- "eval_loss": 1.5077800750732422,
126
- "eval_runtime": 7.0787,
127
- "eval_samples_per_second": 60.604,
128
- "eval_steps_per_second": 1.978,
129
- "step": 86
130
  },
131
  {
132
- "epoch": 8.37,
133
- "learning_rate": 3.888888888888889e-05,
134
- "loss": 1.2261,
135
  "step": 90
136
  },
137
  {
138
- "epoch": 8.93,
139
- "eval_accuracy": 0.38927738927738925,
140
- "eval_loss": 1.4629043340682983,
141
- "eval_runtime": 7.4136,
142
- "eval_samples_per_second": 57.867,
143
- "eval_steps_per_second": 1.888,
144
- "step": 96
145
  },
146
  {
147
- "epoch": 9.3,
148
- "learning_rate": 3.7037037037037037e-05,
149
- "loss": 1.1967,
150
  "step": 100
151
  },
152
  {
153
- "epoch": 9.95,
154
- "eval_accuracy": 0.41025641025641024,
155
- "eval_loss": 1.4139227867126465,
156
- "eval_runtime": 7.4631,
157
- "eval_samples_per_second": 57.483,
158
- "eval_steps_per_second": 1.876,
159
- "step": 107
160
- },
161
- {
162
- "epoch": 10.23,
163
- "learning_rate": 3.518518518518519e-05,
164
- "loss": 1.1153,
165
  "step": 110
166
  },
167
  {
168
- "epoch": 10.98,
169
- "eval_accuracy": 0.4195804195804196,
170
- "eval_loss": 1.4337666034698486,
171
- "eval_runtime": 7.0965,
172
- "eval_samples_per_second": 60.452,
173
- "eval_steps_per_second": 1.973,
174
- "step": 118
175
  },
176
  {
177
- "epoch": 11.16,
178
- "learning_rate": 3.3333333333333335e-05,
179
- "loss": 1.063,
180
  "step": 120
181
  },
182
  {
183
- "epoch": 12.0,
184
- "eval_accuracy": 0.3939393939393939,
185
- "eval_loss": 1.4642902612686157,
186
- "eval_runtime": 7.3068,
187
- "eval_samples_per_second": 58.713,
188
- "eval_steps_per_second": 1.916,
189
- "step": 129
190
- },
191
- {
192
- "epoch": 12.09,
193
- "learning_rate": 3.148148148148148e-05,
194
- "loss": 1.0434,
195
  "step": 130
196
  },
197
  {
198
- "epoch": 12.93,
199
- "eval_accuracy": 0.40792540792540793,
200
- "eval_loss": 1.4725754261016846,
201
- "eval_runtime": 7.4877,
202
- "eval_samples_per_second": 57.294,
203
- "eval_steps_per_second": 1.87,
204
- "step": 139
205
  },
206
  {
207
- "epoch": 13.02,
208
- "learning_rate": 2.962962962962963e-05,
209
- "loss": 1.0067,
210
  "step": 140
211
  },
212
  {
213
- "epoch": 13.95,
214
- "learning_rate": 2.777777777777778e-05,
215
- "loss": 0.9849,
216
- "step": 150
 
 
 
217
  },
218
  {
219
- "epoch": 13.95,
220
- "eval_accuracy": 0.4149184149184149,
221
- "eval_loss": 1.528671145439148,
222
- "eval_runtime": 7.2601,
223
- "eval_samples_per_second": 59.09,
224
- "eval_steps_per_second": 1.928,
225
  "step": 150
226
  },
227
  {
228
- "epoch": 14.88,
229
- "learning_rate": 2.5925925925925925e-05,
230
- "loss": 0.9285,
231
  "step": 160
232
  },
233
  {
234
- "epoch": 14.98,
235
- "eval_accuracy": 0.3986013986013986,
236
- "eval_loss": 1.552985429763794,
237
- "eval_runtime": 7.0224,
238
- "eval_samples_per_second": 61.09,
239
- "eval_steps_per_second": 1.994,
240
- "step": 161
241
  },
242
  {
243
- "epoch": 15.81,
244
- "learning_rate": 2.4074074074074074e-05,
245
- "loss": 0.8724,
246
  "step": 170
247
  },
248
  {
249
- "epoch": 16.0,
250
- "eval_accuracy": 0.3939393939393939,
251
- "eval_loss": 1.5016406774520874,
252
- "eval_runtime": 7.1273,
253
- "eval_samples_per_second": 60.191,
254
- "eval_steps_per_second": 1.964,
255
- "step": 172
256
  },
257
  {
258
- "epoch": 16.74,
259
- "learning_rate": 2.2222222222222223e-05,
260
- "loss": 0.8063,
261
  "step": 180
262
  },
263
  {
264
- "epoch": 16.93,
265
- "eval_accuracy": 0.40559440559440557,
266
- "eval_loss": 1.5184847116470337,
267
- "eval_runtime": 7.1223,
268
- "eval_samples_per_second": 60.233,
269
- "eval_steps_per_second": 1.966,
270
- "step": 182
271
- },
272
- {
273
- "epoch": 17.67,
274
- "learning_rate": 2.037037037037037e-05,
275
- "loss": 0.8206,
276
  "step": 190
277
  },
278
  {
279
- "epoch": 17.95,
280
- "eval_accuracy": 0.4172494172494173,
281
- "eval_loss": 1.5447591543197632,
282
- "eval_runtime": 7.0975,
283
- "eval_samples_per_second": 60.444,
284
- "eval_steps_per_second": 1.973,
285
- "step": 193
286
  },
287
  {
288
- "epoch": 18.6,
289
- "learning_rate": 1.8518518518518518e-05,
290
- "loss": 0.7396,
291
  "step": 200
292
  },
293
  {
294
- "epoch": 18.98,
295
- "eval_accuracy": 0.40326340326340326,
296
- "eval_loss": 1.554015874862671,
297
- "eval_runtime": 6.78,
298
- "eval_samples_per_second": 63.275,
299
- "eval_steps_per_second": 2.065,
300
- "step": 204
301
- },
302
- {
303
- "epoch": 19.53,
304
- "learning_rate": 1.6666666666666667e-05,
305
- "loss": 0.7437,
306
  "step": 210
307
  },
308
  {
309
- "epoch": 20.0,
310
- "eval_accuracy": 0.40093240093240096,
311
- "eval_loss": 1.579628586769104,
312
- "eval_runtime": 6.8036,
313
- "eval_samples_per_second": 63.055,
314
- "eval_steps_per_second": 2.058,
315
- "step": 215
316
  },
317
  {
318
- "epoch": 20.47,
319
- "learning_rate": 1.4814814814814815e-05,
320
- "loss": 0.7103,
321
  "step": 220
322
  },
323
  {
324
- "epoch": 20.93,
325
- "eval_accuracy": 0.40326340326340326,
326
- "eval_loss": 1.6218039989471436,
327
- "eval_runtime": 7.5951,
328
- "eval_samples_per_second": 56.484,
329
- "eval_steps_per_second": 1.843,
330
- "step": 225
331
  },
332
  {
333
- "epoch": 21.4,
334
- "learning_rate": 1.2962962962962962e-05,
335
- "loss": 0.6861,
336
  "step": 230
337
  },
338
  {
339
- "epoch": 21.95,
340
- "eval_accuracy": 0.42657342657342656,
341
- "eval_loss": 1.6125619411468506,
342
- "eval_runtime": 7.0894,
343
- "eval_samples_per_second": 60.512,
344
- "eval_steps_per_second": 1.975,
345
- "step": 236
346
- },
347
- {
348
- "epoch": 22.33,
349
- "learning_rate": 1.1111111111111112e-05,
350
- "loss": 0.6798,
351
  "step": 240
352
  },
353
  {
354
- "epoch": 22.98,
355
- "eval_accuracy": 0.42657342657342656,
356
- "eval_loss": 1.6051045656204224,
357
- "eval_runtime": 7.0114,
358
- "eval_samples_per_second": 61.186,
359
- "eval_steps_per_second": 1.997,
360
- "step": 247
361
  },
362
  {
363
- "epoch": 23.26,
364
- "learning_rate": 9.259259259259259e-06,
365
- "loss": 0.6358,
366
  "step": 250
367
  },
368
  {
369
- "epoch": 24.0,
370
- "eval_accuracy": 0.41025641025641024,
371
- "eval_loss": 1.6141223907470703,
372
- "eval_runtime": 6.9962,
373
- "eval_samples_per_second": 61.319,
374
- "eval_steps_per_second": 2.001,
375
- "step": 258
376
- },
377
- {
378
- "epoch": 24.19,
379
- "learning_rate": 7.4074074074074075e-06,
380
- "loss": 0.6189,
381
  "step": 260
382
  },
383
  {
384
- "epoch": 24.93,
385
- "eval_accuracy": 0.40559440559440557,
386
- "eval_loss": 1.6705572605133057,
387
- "eval_runtime": 6.7024,
388
- "eval_samples_per_second": 64.007,
389
- "eval_steps_per_second": 2.089,
390
- "step": 268
391
  },
392
  {
393
- "epoch": 25.12,
394
- "learning_rate": 5.555555555555556e-06,
395
- "loss": 0.6176,
396
  "step": 270
397
  },
398
  {
399
- "epoch": 25.95,
400
- "eval_accuracy": 0.4219114219114219,
401
- "eval_loss": 1.6404389142990112,
402
- "eval_runtime": 6.7355,
403
- "eval_samples_per_second": 63.692,
404
- "eval_steps_per_second": 2.079,
405
- "step": 279
406
  },
407
  {
408
- "epoch": 26.05,
409
- "learning_rate": 3.7037037037037037e-06,
410
- "loss": 0.575,
411
  "step": 280
412
  },
413
  {
414
- "epoch": 26.98,
415
- "learning_rate": 1.8518518518518519e-06,
416
- "loss": 0.5756,
417
  "step": 290
418
  },
419
  {
420
- "epoch": 26.98,
421
- "eval_accuracy": 0.4172494172494173,
422
- "eval_loss": 1.660282850265503,
423
- "eval_runtime": 6.8437,
424
- "eval_samples_per_second": 62.685,
425
- "eval_steps_per_second": 2.046,
426
- "step": 290
427
  },
428
  {
429
- "epoch": 27.91,
430
- "learning_rate": 0.0,
431
- "loss": 0.5887,
432
  "step": 300
433
  },
434
  {
435
- "epoch": 27.91,
436
- "eval_accuracy": 0.42424242424242425,
437
- "eval_loss": 1.6626534461975098,
438
- "eval_runtime": 6.7064,
439
- "eval_samples_per_second": 63.969,
440
- "eval_steps_per_second": 2.088,
441
- "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
442
  },
443
  {
444
- "epoch": 27.91,
445
- "step": 300,
446
- "total_flos": 2.9422949721372426e+18,
447
- "train_loss": 0.9840216811498006,
448
- "train_runtime": 1797.847,
449
- "train_samples_per_second": 22.694,
450
- "train_steps_per_second": 0.167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
  }
452
  ],
453
  "logging_steps": 10,
454
- "max_steps": 300,
455
  "num_train_epochs": 30,
456
  "save_steps": 500,
457
- "total_flos": 2.9422949721372426e+18,
458
  "trial_name": null,
459
  "trial_params": null
460
  }
 
1
  {
2
+ "best_metric": 0.39285714285714285,
3
+ "best_model_checkpoint": "vit-base-patch16-224-for-pre_evaluation/checkpoint-308",
4
+ "epoch": 29.53846153846154,
5
  "eval_steps": 500,
6
+ "global_step": 480,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.62,
13
+ "learning_rate": 1.0416666666666668e-05,
14
+ "loss": 1.5774,
15
  "step": 10
16
  },
17
  {
18
+ "epoch": 0.98,
19
+ "eval_accuracy": 0.3021978021978022,
20
+ "eval_loss": 1.510853886604309,
21
+ "eval_runtime": 6.2821,
22
+ "eval_samples_per_second": 57.943,
23
+ "eval_steps_per_second": 1.91,
24
+ "step": 16
25
  },
26
  {
27
+ "epoch": 1.23,
28
+ "learning_rate": 2.0833333333333336e-05,
29
+ "loss": 1.5237,
30
  "step": 20
31
  },
32
  {
33
+ "epoch": 1.85,
34
+ "learning_rate": 3.125e-05,
35
+ "loss": 1.4794,
 
 
 
 
 
 
 
 
 
36
  "step": 30
37
  },
38
  {
39
+ "epoch": 1.97,
40
+ "eval_accuracy": 0.3241758241758242,
41
+ "eval_loss": 1.494186282157898,
42
+ "eval_runtime": 6.3421,
43
+ "eval_samples_per_second": 57.395,
44
+ "eval_steps_per_second": 1.892,
45
  "step": 32
46
  },
47
  {
48
+ "epoch": 2.46,
49
+ "learning_rate": 4.166666666666667e-05,
50
+ "loss": 1.4536,
51
  "step": 40
52
  },
53
  {
54
+ "epoch": 2.95,
55
+ "eval_accuracy": 0.31868131868131866,
56
+ "eval_loss": 1.4943327903747559,
57
+ "eval_runtime": 5.7214,
58
+ "eval_samples_per_second": 63.621,
59
+ "eval_steps_per_second": 2.097,
60
+ "step": 48
61
  },
62
  {
63
+ "epoch": 3.08,
64
+ "learning_rate": 4.976851851851852e-05,
65
+ "loss": 1.4643,
66
  "step": 50
67
  },
68
  {
69
+ "epoch": 3.69,
70
+ "learning_rate": 4.8611111111111115e-05,
71
+ "loss": 1.421,
 
 
 
 
 
 
 
 
 
72
  "step": 60
73
  },
74
  {
75
+ "epoch": 4.0,
76
+ "eval_accuracy": 0.34065934065934067,
77
+ "eval_loss": 1.4246565103530884,
78
+ "eval_runtime": 5.9355,
79
+ "eval_samples_per_second": 61.326,
80
+ "eval_steps_per_second": 2.022,
81
+ "step": 65
82
  },
83
  {
84
+ "epoch": 4.31,
85
+ "learning_rate": 4.745370370370371e-05,
86
+ "loss": 1.4268,
87
  "step": 70
88
  },
89
  {
90
+ "epoch": 4.92,
91
+ "learning_rate": 4.62962962962963e-05,
92
+ "loss": 1.3882,
 
 
 
 
 
 
 
 
 
93
  "step": 80
94
  },
95
  {
96
+ "epoch": 4.98,
97
+ "eval_accuracy": 0.34615384615384615,
98
+ "eval_loss": 1.4944226741790771,
99
+ "eval_runtime": 6.3968,
100
+ "eval_samples_per_second": 56.903,
101
+ "eval_steps_per_second": 1.876,
102
+ "step": 81
103
  },
104
  {
105
+ "epoch": 5.54,
106
+ "learning_rate": 4.5138888888888894e-05,
107
+ "loss": 1.3579,
108
  "step": 90
109
  },
110
  {
111
+ "epoch": 5.97,
112
+ "eval_accuracy": 0.35714285714285715,
113
+ "eval_loss": 1.4180346727371216,
114
+ "eval_runtime": 6.5996,
115
+ "eval_samples_per_second": 55.154,
116
+ "eval_steps_per_second": 1.818,
117
+ "step": 97
118
  },
119
  {
120
+ "epoch": 6.15,
121
+ "learning_rate": 4.3981481481481486e-05,
122
+ "loss": 1.3075,
123
  "step": 100
124
  },
125
  {
126
+ "epoch": 6.77,
127
+ "learning_rate": 4.282407407407408e-05,
128
+ "loss": 1.2838,
 
 
 
 
 
 
 
 
 
129
  "step": 110
130
  },
131
  {
132
+ "epoch": 6.95,
133
+ "eval_accuracy": 0.36813186813186816,
134
+ "eval_loss": 1.4692732095718384,
135
+ "eval_runtime": 5.8126,
136
+ "eval_samples_per_second": 62.623,
137
+ "eval_steps_per_second": 2.064,
138
+ "step": 113
139
  },
140
  {
141
+ "epoch": 7.38,
142
+ "learning_rate": 4.166666666666667e-05,
143
+ "loss": 1.2877,
144
  "step": 120
145
  },
146
  {
147
+ "epoch": 8.0,
148
+ "learning_rate": 4.0509259259259265e-05,
149
+ "loss": 1.2695,
 
 
 
 
 
 
 
 
 
150
  "step": 130
151
  },
152
  {
153
+ "epoch": 8.0,
154
+ "eval_accuracy": 0.3434065934065934,
155
+ "eval_loss": 1.4359365701675415,
156
+ "eval_runtime": 5.7907,
157
+ "eval_samples_per_second": 62.859,
158
+ "eval_steps_per_second": 2.072,
159
+ "step": 130
160
  },
161
  {
162
+ "epoch": 8.62,
163
+ "learning_rate": 3.935185185185186e-05,
164
+ "loss": 1.2016,
165
  "step": 140
166
  },
167
  {
168
+ "epoch": 8.98,
169
+ "eval_accuracy": 0.3598901098901099,
170
+ "eval_loss": 1.4656463861465454,
171
+ "eval_runtime": 6.0359,
172
+ "eval_samples_per_second": 60.306,
173
+ "eval_steps_per_second": 1.988,
174
+ "step": 146
175
  },
176
  {
177
+ "epoch": 9.23,
178
+ "learning_rate": 3.8194444444444444e-05,
179
+ "loss": 1.2361,
 
 
 
180
  "step": 150
181
  },
182
  {
183
+ "epoch": 9.85,
184
+ "learning_rate": 3.7037037037037037e-05,
185
+ "loss": 1.2087,
186
  "step": 160
187
  },
188
  {
189
+ "epoch": 9.97,
190
+ "eval_accuracy": 0.33791208791208793,
191
+ "eval_loss": 1.4549881219863892,
192
+ "eval_runtime": 6.5216,
193
+ "eval_samples_per_second": 55.814,
194
+ "eval_steps_per_second": 1.84,
195
+ "step": 162
196
  },
197
  {
198
+ "epoch": 10.46,
199
+ "learning_rate": 3.587962962962963e-05,
200
+ "loss": 1.206,
201
  "step": 170
202
  },
203
  {
204
+ "epoch": 10.95,
205
+ "eval_accuracy": 0.3516483516483517,
206
+ "eval_loss": 1.5055769681930542,
207
+ "eval_runtime": 5.8519,
208
+ "eval_samples_per_second": 62.202,
209
+ "eval_steps_per_second": 2.051,
210
+ "step": 178
211
  },
212
  {
213
+ "epoch": 11.08,
214
+ "learning_rate": 3.472222222222222e-05,
215
+ "loss": 1.1296,
216
  "step": 180
217
  },
218
  {
219
+ "epoch": 11.69,
220
+ "learning_rate": 3.3564814814814815e-05,
221
+ "loss": 1.1236,
 
 
 
 
 
 
 
 
 
222
  "step": 190
223
  },
224
  {
225
+ "epoch": 12.0,
226
+ "eval_accuracy": 0.3434065934065934,
227
+ "eval_loss": 1.5003132820129395,
228
+ "eval_runtime": 6.3872,
229
+ "eval_samples_per_second": 56.989,
230
+ "eval_steps_per_second": 1.879,
231
+ "step": 195
232
  },
233
  {
234
+ "epoch": 12.31,
235
+ "learning_rate": 3.240740740740741e-05,
236
+ "loss": 1.0955,
237
  "step": 200
238
  },
239
  {
240
+ "epoch": 12.92,
241
+ "learning_rate": 3.125e-05,
242
+ "loss": 1.0534,
 
 
 
 
 
 
 
 
 
243
  "step": 210
244
  },
245
  {
246
+ "epoch": 12.98,
247
+ "eval_accuracy": 0.3269230769230769,
248
+ "eval_loss": 1.5192676782608032,
249
+ "eval_runtime": 6.8957,
250
+ "eval_samples_per_second": 52.786,
251
+ "eval_steps_per_second": 1.74,
252
+ "step": 211
253
  },
254
  {
255
+ "epoch": 13.54,
256
+ "learning_rate": 3.0092592592592593e-05,
257
+ "loss": 1.0024,
258
  "step": 220
259
  },
260
  {
261
+ "epoch": 13.97,
262
+ "eval_accuracy": 0.36813186813186816,
263
+ "eval_loss": 1.4890482425689697,
264
+ "eval_runtime": 5.7451,
265
+ "eval_samples_per_second": 63.358,
266
+ "eval_steps_per_second": 2.089,
267
+ "step": 227
268
  },
269
  {
270
+ "epoch": 14.15,
271
+ "learning_rate": 2.8935185185185186e-05,
272
+ "loss": 0.9924,
273
  "step": 230
274
  },
275
  {
276
+ "epoch": 14.77,
277
+ "learning_rate": 2.777777777777778e-05,
278
+ "loss": 0.9767,
 
 
 
 
 
 
 
 
 
279
  "step": 240
280
  },
281
  {
282
+ "epoch": 14.95,
283
+ "eval_accuracy": 0.3434065934065934,
284
+ "eval_loss": 1.5628184080123901,
285
+ "eval_runtime": 6.5373,
286
+ "eval_samples_per_second": 55.68,
287
+ "eval_steps_per_second": 1.836,
288
+ "step": 243
289
  },
290
  {
291
+ "epoch": 15.38,
292
+ "learning_rate": 2.6620370370370372e-05,
293
+ "loss": 0.9337,
294
  "step": 250
295
  },
296
  {
297
+ "epoch": 16.0,
298
+ "learning_rate": 2.5462962962962965e-05,
299
+ "loss": 0.9201,
 
 
 
 
 
 
 
 
 
300
  "step": 260
301
  },
302
  {
303
+ "epoch": 16.0,
304
+ "eval_accuracy": 0.3516483516483517,
305
+ "eval_loss": 1.6305893659591675,
306
+ "eval_runtime": 6.6234,
307
+ "eval_samples_per_second": 54.957,
308
+ "eval_steps_per_second": 1.812,
309
+ "step": 260
310
  },
311
  {
312
+ "epoch": 16.62,
313
+ "learning_rate": 2.4305555555555558e-05,
314
+ "loss": 0.9136,
315
  "step": 270
316
  },
317
  {
318
+ "epoch": 16.98,
319
+ "eval_accuracy": 0.3626373626373626,
320
+ "eval_loss": 1.5715110301971436,
321
+ "eval_runtime": 5.8274,
322
+ "eval_samples_per_second": 62.463,
323
+ "eval_steps_per_second": 2.059,
324
+ "step": 276
325
  },
326
  {
327
+ "epoch": 17.23,
328
+ "learning_rate": 2.314814814814815e-05,
329
+ "loss": 0.8228,
330
  "step": 280
331
  },
332
  {
333
+ "epoch": 17.85,
334
+ "learning_rate": 2.1990740740740743e-05,
335
+ "loss": 0.8566,
336
  "step": 290
337
  },
338
  {
339
+ "epoch": 17.97,
340
+ "eval_accuracy": 0.36538461538461536,
341
+ "eval_loss": 1.5965826511383057,
342
+ "eval_runtime": 5.8771,
343
+ "eval_samples_per_second": 61.935,
344
+ "eval_steps_per_second": 2.042,
345
+ "step": 292
346
  },
347
  {
348
+ "epoch": 18.46,
349
+ "learning_rate": 2.0833333333333336e-05,
350
+ "loss": 0.8273,
351
  "step": 300
352
  },
353
  {
354
+ "epoch": 18.95,
355
+ "eval_accuracy": 0.39285714285714285,
356
+ "eval_loss": 1.604812502861023,
357
+ "eval_runtime": 6.2505,
358
+ "eval_samples_per_second": 58.235,
359
+ "eval_steps_per_second": 1.92,
360
+ "step": 308
361
+ },
362
+ {
363
+ "epoch": 19.08,
364
+ "learning_rate": 1.967592592592593e-05,
365
+ "loss": 0.8217,
366
+ "step": 310
367
+ },
368
+ {
369
+ "epoch": 19.69,
370
+ "learning_rate": 1.8518518518518518e-05,
371
+ "loss": 0.7825,
372
+ "step": 320
373
  },
374
  {
375
+ "epoch": 20.0,
376
+ "eval_accuracy": 0.38461538461538464,
377
+ "eval_loss": 1.6174668073654175,
378
+ "eval_runtime": 5.8325,
379
+ "eval_samples_per_second": 62.409,
380
+ "eval_steps_per_second": 2.057,
381
+ "step": 325
382
+ },
383
+ {
384
+ "epoch": 20.31,
385
+ "learning_rate": 1.736111111111111e-05,
386
+ "loss": 0.8128,
387
+ "step": 330
388
+ },
389
+ {
390
+ "epoch": 20.92,
391
+ "learning_rate": 1.6203703703703704e-05,
392
+ "loss": 0.736,
393
+ "step": 340
394
+ },
395
+ {
396
+ "epoch": 20.98,
397
+ "eval_accuracy": 0.39285714285714285,
398
+ "eval_loss": 1.652581810951233,
399
+ "eval_runtime": 6.2966,
400
+ "eval_samples_per_second": 57.809,
401
+ "eval_steps_per_second": 1.906,
402
+ "step": 341
403
+ },
404
+ {
405
+ "epoch": 21.54,
406
+ "learning_rate": 1.5046296296296297e-05,
407
+ "loss": 0.7008,
408
+ "step": 350
409
+ },
410
+ {
411
+ "epoch": 21.97,
412
+ "eval_accuracy": 0.37362637362637363,
413
+ "eval_loss": 1.65627920627594,
414
+ "eval_runtime": 6.0612,
415
+ "eval_samples_per_second": 60.054,
416
+ "eval_steps_per_second": 1.98,
417
+ "step": 357
418
+ },
419
+ {
420
+ "epoch": 22.15,
421
+ "learning_rate": 1.388888888888889e-05,
422
+ "loss": 0.7074,
423
+ "step": 360
424
+ },
425
+ {
426
+ "epoch": 22.77,
427
+ "learning_rate": 1.2731481481481482e-05,
428
+ "loss": 0.6714,
429
+ "step": 370
430
+ },
431
+ {
432
+ "epoch": 22.95,
433
+ "eval_accuracy": 0.3901098901098901,
434
+ "eval_loss": 1.7319426536560059,
435
+ "eval_runtime": 6.3113,
436
+ "eval_samples_per_second": 57.674,
437
+ "eval_steps_per_second": 1.901,
438
+ "step": 373
439
+ },
440
+ {
441
+ "epoch": 23.38,
442
+ "learning_rate": 1.1574074074074075e-05,
443
+ "loss": 0.6723,
444
+ "step": 380
445
+ },
446
+ {
447
+ "epoch": 24.0,
448
+ "learning_rate": 1.0416666666666668e-05,
449
+ "loss": 0.7039,
450
+ "step": 390
451
+ },
452
+ {
453
+ "epoch": 24.0,
454
+ "eval_accuracy": 0.39285714285714285,
455
+ "eval_loss": 1.686637282371521,
456
+ "eval_runtime": 5.7311,
457
+ "eval_samples_per_second": 63.514,
458
+ "eval_steps_per_second": 2.094,
459
+ "step": 390
460
+ },
461
+ {
462
+ "epoch": 24.62,
463
+ "learning_rate": 9.259259259259259e-06,
464
+ "loss": 0.628,
465
+ "step": 400
466
+ },
467
+ {
468
+ "epoch": 24.98,
469
+ "eval_accuracy": 0.3791208791208791,
470
+ "eval_loss": 1.7022507190704346,
471
+ "eval_runtime": 5.7107,
472
+ "eval_samples_per_second": 63.74,
473
+ "eval_steps_per_second": 2.101,
474
+ "step": 406
475
+ },
476
+ {
477
+ "epoch": 25.23,
478
+ "learning_rate": 8.101851851851852e-06,
479
+ "loss": 0.6386,
480
+ "step": 410
481
+ },
482
+ {
483
+ "epoch": 25.85,
484
+ "learning_rate": 6.944444444444445e-06,
485
+ "loss": 0.6182,
486
+ "step": 420
487
+ },
488
+ {
489
+ "epoch": 25.97,
490
+ "eval_accuracy": 0.3901098901098901,
491
+ "eval_loss": 1.7301020622253418,
492
+ "eval_runtime": 5.9062,
493
+ "eval_samples_per_second": 61.63,
494
+ "eval_steps_per_second": 2.032,
495
+ "step": 422
496
+ },
497
+ {
498
+ "epoch": 26.46,
499
+ "learning_rate": 5.787037037037038e-06,
500
+ "loss": 0.5957,
501
+ "step": 430
502
+ },
503
+ {
504
+ "epoch": 26.95,
505
+ "eval_accuracy": 0.38461538461538464,
506
+ "eval_loss": 1.7156624794006348,
507
+ "eval_runtime": 6.0021,
508
+ "eval_samples_per_second": 60.646,
509
+ "eval_steps_per_second": 1.999,
510
+ "step": 438
511
+ },
512
+ {
513
+ "epoch": 27.08,
514
+ "learning_rate": 4.6296296296296296e-06,
515
+ "loss": 0.595,
516
+ "step": 440
517
+ },
518
+ {
519
+ "epoch": 27.69,
520
+ "learning_rate": 3.4722222222222224e-06,
521
+ "loss": 0.5973,
522
+ "step": 450
523
+ },
524
+ {
525
+ "epoch": 28.0,
526
+ "eval_accuracy": 0.3708791208791209,
527
+ "eval_loss": 1.7478266954421997,
528
+ "eval_runtime": 6.4181,
529
+ "eval_samples_per_second": 56.715,
530
+ "eval_steps_per_second": 1.87,
531
+ "step": 455
532
+ },
533
+ {
534
+ "epoch": 28.31,
535
+ "learning_rate": 2.3148148148148148e-06,
536
+ "loss": 0.5767,
537
+ "step": 460
538
+ },
539
+ {
540
+ "epoch": 28.92,
541
+ "learning_rate": 1.1574074074074074e-06,
542
+ "loss": 0.5655,
543
+ "step": 470
544
+ },
545
+ {
546
+ "epoch": 28.98,
547
+ "eval_accuracy": 0.37362637362637363,
548
+ "eval_loss": 1.7376786470413208,
549
+ "eval_runtime": 5.7918,
550
+ "eval_samples_per_second": 62.847,
551
+ "eval_steps_per_second": 2.072,
552
+ "step": 471
553
+ },
554
+ {
555
+ "epoch": 29.54,
556
+ "learning_rate": 0.0,
557
+ "loss": 0.5631,
558
+ "step": 480
559
+ },
560
+ {
561
+ "epoch": 29.54,
562
+ "eval_accuracy": 0.37362637362637363,
563
+ "eval_loss": 1.737407922744751,
564
+ "eval_runtime": 6.3261,
565
+ "eval_samples_per_second": 57.54,
566
+ "eval_steps_per_second": 1.897,
567
+ "step": 480
568
+ },
569
+ {
570
+ "epoch": 29.54,
571
+ "step": 480,
572
+ "total_flos": 4.691568687003814e+18,
573
+ "train_loss": 0.9943243801593781,
574
+ "train_runtime": 2664.7299,
575
+ "train_samples_per_second": 23.068,
576
+ "train_steps_per_second": 0.18
577
  }
578
  ],
579
  "logging_steps": 10,
580
+ "max_steps": 480,
581
  "num_train_epochs": 30,
582
  "save_steps": 500,
583
+ "total_flos": 4.691568687003814e+18,
584
  "trial_name": null,
585
  "trial_params": null
586
  }