bombshelll committed on
Commit
d41640d
·
verified ·
1 Parent(s): e8da135

End of training

Browse files
Files changed (6) hide show
  1. README.md +2 -2
  2. all_results.json +11 -11
  3. eval_results.json +6 -6
  4. test_results.json +8 -0
  5. train_results.json +6 -6
  6. trainer_state.json +190 -419
README.md CHANGED
@@ -18,8 +18,8 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.1909
22
- - Accuracy: 0.9548
23
 
24
  ## Model description
25
 
 
18
 
19
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.2461
22
+ - Accuracy: 0.9273
23
 
24
  ## Model description
25
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 19.27710843373494,
3
- "eval_accuracy": 0.9661016949152542,
4
- "eval_loss": 0.13029339909553528,
5
- "eval_runtime": 1.306,
6
- "eval_samples_per_second": 225.886,
7
- "eval_steps_per_second": 7.657,
8
- "total_flos": 1.2721899193419387e+18,
9
- "train_loss": 0.1996179285645485,
10
- "train_runtime": 565.0512,
11
- "train_samples_per_second": 93.974,
12
- "train_steps_per_second": 0.708
13
  }
 
1
  {
2
+ "epoch": 14.838709677419354,
3
+ "eval_accuracy": 0.9272727272727272,
4
+ "eval_loss": 0.2460637390613556,
5
+ "eval_runtime": 0.974,
6
+ "eval_samples_per_second": 169.402,
7
+ "eval_steps_per_second": 6.16,
8
+ "total_flos": 1.0886233115316142e+18,
9
+ "train_loss": 0.21913787420245184,
10
+ "train_runtime": 552.9922,
11
+ "train_samples_per_second": 80.019,
12
+ "train_steps_per_second": 0.624
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 19.27710843373494,
3
- "eval_accuracy": 0.9661016949152542,
4
- "eval_loss": 0.13029339909553528,
5
- "eval_runtime": 1.306,
6
- "eval_samples_per_second": 225.886,
7
- "eval_steps_per_second": 7.657
8
  }
 
1
  {
2
+ "epoch": 14.838709677419354,
3
+ "eval_accuracy": 0.964824120603015,
4
+ "eval_loss": 0.15059159696102142,
5
+ "eval_runtime": 2.7989,
6
+ "eval_samples_per_second": 213.295,
7
+ "eval_steps_per_second": 6.788
8
  }
test_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 14.838709677419354,
3
+ "eval_accuracy": 0.9272727272727272,
4
+ "eval_loss": 0.2460637390613556,
5
+ "eval_runtime": 0.974,
6
+ "eval_samples_per_second": 169.402,
7
+ "eval_steps_per_second": 6.16
8
+ }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 19.27710843373494,
3
- "total_flos": 1.2721899193419387e+18,
4
- "train_loss": 0.1996179285645485,
5
- "train_runtime": 565.0512,
6
- "train_samples_per_second": 93.974,
7
- "train_steps_per_second": 0.708
8
  }
 
1
  {
2
+ "epoch": 14.838709677419354,
3
+ "total_flos": 1.0886233115316142e+18,
4
+ "train_loss": 0.21913787420245184,
5
+ "train_runtime": 552.9922,
6
+ "train_samples_per_second": 80.019,
7
+ "train_steps_per_second": 0.624
8
  }
trainer_state.json CHANGED
@@ -1,498 +1,269 @@
1
  {
2
- "best_metric": 0.9661016949152542,
3
- "best_model_checkpoint": "/kaggle/working/swin-brain-abnormalities-classification/checkpoint-311",
4
- "epoch": 19.27710843373494,
5
  "eval_steps": 500,
6
- "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.4819277108433735,
13
- "grad_norm": 7.259073257446289,
14
- "learning_rate": 1.25e-05,
15
- "loss": 1.1246,
16
- "step": 10
17
  },
18
  {
19
- "epoch": 0.963855421686747,
20
- "grad_norm": 4.209974765777588,
21
- "learning_rate": 2.5e-05,
22
- "loss": 0.7845,
23
- "step": 20
 
 
24
  },
25
  {
26
- "epoch": 0.963855421686747,
27
- "eval_accuracy": 0.7661016949152543,
28
- "eval_loss": 0.5746350288391113,
29
- "eval_runtime": 1.2982,
30
- "eval_samples_per_second": 227.238,
31
- "eval_steps_per_second": 7.703,
32
- "step": 20
33
  },
34
  {
35
- "epoch": 1.4457831325301205,
36
- "grad_norm": 8.389396667480469,
37
- "learning_rate": 3.7500000000000003e-05,
38
- "loss": 0.6339,
39
- "step": 30
 
 
40
  },
41
  {
42
- "epoch": 1.927710843373494,
43
- "grad_norm": 5.584402561187744,
44
- "learning_rate": 5e-05,
45
- "loss": 0.4587,
46
- "step": 40
47
  },
48
  {
49
- "epoch": 1.9759036144578315,
50
- "eval_accuracy": 0.8779661016949153,
51
- "eval_loss": 0.29308223724365234,
52
- "eval_runtime": 1.2711,
53
- "eval_samples_per_second": 232.081,
54
- "eval_steps_per_second": 7.867,
55
- "step": 41
56
- },
57
- {
58
- "epoch": 2.4096385542168672,
59
- "grad_norm": 13.30373477935791,
60
- "learning_rate": 4.8611111111111115e-05,
61
- "loss": 0.3783,
62
- "step": 50
63
- },
64
- {
65
- "epoch": 2.891566265060241,
66
- "grad_norm": 14.480766296386719,
67
- "learning_rate": 4.722222222222222e-05,
68
- "loss": 0.3004,
69
- "step": 60
70
- },
71
- {
72
- "epoch": 2.9879518072289155,
73
- "eval_accuracy": 0.8949152542372881,
74
- "eval_loss": 0.2784439027309418,
75
- "eval_runtime": 1.2869,
76
- "eval_samples_per_second": 229.239,
77
- "eval_steps_per_second": 7.771,
78
- "step": 62
79
- },
80
- {
81
- "epoch": 3.3734939759036147,
82
- "grad_norm": 22.313514709472656,
83
- "learning_rate": 4.5833333333333334e-05,
84
- "loss": 0.2702,
85
- "step": 70
86
- },
87
- {
88
- "epoch": 3.855421686746988,
89
- "grad_norm": 15.409673690795898,
90
- "learning_rate": 4.4444444444444447e-05,
91
- "loss": 0.2379,
92
- "step": 80
93
  },
94
  {
95
  "epoch": 4.0,
96
- "eval_accuracy": 0.9355932203389831,
97
- "eval_loss": 0.1557122766971588,
98
- "eval_runtime": 1.3066,
99
- "eval_samples_per_second": 225.78,
100
- "eval_steps_per_second": 7.654,
101
- "step": 83
102
- },
103
- {
104
- "epoch": 4.337349397590361,
105
- "grad_norm": 8.086126327514648,
106
- "learning_rate": 4.305555555555556e-05,
107
- "loss": 0.2492,
108
- "step": 90
109
- },
110
- {
111
- "epoch": 4.8192771084337345,
112
- "grad_norm": 13.449581146240234,
113
- "learning_rate": 4.166666666666667e-05,
114
- "loss": 0.1845,
115
- "step": 100
116
- },
117
- {
118
- "epoch": 4.9638554216867465,
119
- "eval_accuracy": 0.9491525423728814,
120
- "eval_loss": 0.15197788178920746,
121
- "eval_runtime": 1.2703,
122
- "eval_samples_per_second": 232.229,
123
- "eval_steps_per_second": 7.872,
124
- "step": 103
125
- },
126
- {
127
- "epoch": 5.301204819277109,
128
- "grad_norm": 11.823051452636719,
129
- "learning_rate": 4.027777777777778e-05,
130
- "loss": 0.1818,
131
- "step": 110
132
  },
133
  {
134
- "epoch": 5.783132530120482,
135
- "grad_norm": 7.886294364929199,
136
- "learning_rate": 3.888888888888889e-05,
137
- "loss": 0.1445,
138
- "step": 120
 
 
139
  },
140
  {
141
- "epoch": 5.975903614457831,
142
- "eval_accuracy": 0.9525423728813559,
143
- "eval_loss": 0.14502111077308655,
144
- "eval_runtime": 1.2821,
145
- "eval_samples_per_second": 230.1,
146
- "eval_steps_per_second": 7.8,
147
- "step": 124
148
  },
149
  {
150
- "epoch": 6.265060240963855,
151
- "grad_norm": 9.61337661743164,
152
- "learning_rate": 3.7500000000000003e-05,
153
- "loss": 0.1449,
154
- "step": 130
 
 
155
  },
156
  {
157
- "epoch": 6.746987951807229,
158
- "grad_norm": 10.163887977600098,
159
- "learning_rate": 3.611111111111111e-05,
160
- "loss": 0.1557,
161
- "step": 140
162
  },
163
  {
164
- "epoch": 6.9879518072289155,
165
- "eval_accuracy": 0.9525423728813559,
166
- "eval_loss": 0.11894461512565613,
167
- "eval_runtime": 1.2782,
168
- "eval_samples_per_second": 230.788,
169
- "eval_steps_per_second": 7.823,
170
- "step": 145
171
  },
172
  {
173
- "epoch": 7.228915662650603,
174
- "grad_norm": 5.782637596130371,
175
- "learning_rate": 3.472222222222222e-05,
176
- "loss": 0.1577,
177
- "step": 150
178
  },
179
  {
180
- "epoch": 7.710843373493976,
181
- "grad_norm": 8.481064796447754,
182
- "learning_rate": 3.3333333333333335e-05,
183
- "loss": 0.1503,
184
- "step": 160
 
 
185
  },
186
  {
187
  "epoch": 8.0,
188
- "eval_accuracy": 0.9559322033898305,
189
- "eval_loss": 0.12011975049972534,
190
- "eval_runtime": 1.2771,
191
- "eval_samples_per_second": 230.985,
192
- "eval_steps_per_second": 7.83,
193
- "step": 166
194
- },
195
- {
196
- "epoch": 8.19277108433735,
197
- "grad_norm": 11.080423355102539,
198
- "learning_rate": 3.194444444444444e-05,
199
- "loss": 0.1716,
200
- "step": 170
201
- },
202
- {
203
- "epoch": 8.674698795180722,
204
- "grad_norm": 6.277684211730957,
205
- "learning_rate": 3.055555555555556e-05,
206
- "loss": 0.1446,
207
- "step": 180
208
- },
209
- {
210
- "epoch": 8.963855421686747,
211
- "eval_accuracy": 0.9627118644067797,
212
- "eval_loss": 0.12793326377868652,
213
- "eval_runtime": 1.2696,
214
- "eval_samples_per_second": 232.353,
215
- "eval_steps_per_second": 7.876,
216
  "step": 186
217
  },
218
  {
219
- "epoch": 9.156626506024097,
220
- "grad_norm": 5.818056106567383,
221
- "learning_rate": 2.916666666666667e-05,
222
- "loss": 0.1256,
223
- "step": 190
224
- },
225
- {
226
- "epoch": 9.638554216867469,
227
- "grad_norm": 5.736883163452148,
228
- "learning_rate": 2.777777777777778e-05,
229
- "loss": 0.1368,
230
- "step": 200
231
  },
232
  {
233
- "epoch": 9.975903614457831,
234
- "eval_accuracy": 0.9593220338983051,
235
- "eval_loss": 0.13930343091487885,
236
- "eval_runtime": 1.2699,
237
- "eval_samples_per_second": 232.298,
238
- "eval_steps_per_second": 7.875,
239
- "step": 207
240
  },
241
  {
242
- "epoch": 10.120481927710843,
243
- "grad_norm": 5.910589218139648,
244
- "learning_rate": 2.6388888888888892e-05,
245
- "loss": 0.1273,
246
- "step": 210
 
 
247
  },
248
  {
249
- "epoch": 10.602409638554217,
250
- "grad_norm": 9.146703720092773,
251
- "learning_rate": 2.5e-05,
252
- "loss": 0.111,
253
- "step": 220
254
  },
255
  {
256
- "epoch": 10.987951807228916,
257
- "eval_accuracy": 0.9627118644067797,
258
- "eval_loss": 0.17713582515716553,
259
- "eval_runtime": 1.2537,
260
- "eval_samples_per_second": 235.307,
261
- "eval_steps_per_second": 7.976,
262
- "step": 228
263
  },
264
  {
265
- "epoch": 11.08433734939759,
266
- "grad_norm": 4.925355434417725,
267
- "learning_rate": 2.361111111111111e-05,
268
- "loss": 0.1125,
269
- "step": 230
270
  },
271
  {
272
- "epoch": 11.566265060240964,
273
- "grad_norm": 4.107492923736572,
274
- "learning_rate": 2.2222222222222223e-05,
275
- "loss": 0.118,
276
- "step": 240
 
 
277
  },
278
  {
279
  "epoch": 12.0,
280
- "eval_accuracy": 0.9627118644067797,
281
- "eval_loss": 0.15914401412010193,
282
- "eval_runtime": 1.2854,
283
- "eval_samples_per_second": 229.507,
284
- "eval_steps_per_second": 7.78,
285
- "step": 249
286
- },
287
- {
288
- "epoch": 12.048192771084338,
289
- "grad_norm": 7.796498775482178,
290
- "learning_rate": 2.0833333333333336e-05,
291
- "loss": 0.0915,
292
- "step": 250
293
- },
294
- {
295
- "epoch": 12.53012048192771,
296
- "grad_norm": 9.677573204040527,
297
- "learning_rate": 1.9444444444444445e-05,
298
- "loss": 0.099,
299
- "step": 260
300
- },
301
- {
302
- "epoch": 12.963855421686747,
303
- "eval_accuracy": 0.9593220338983051,
304
- "eval_loss": 0.15266619622707367,
305
- "eval_runtime": 1.2662,
306
- "eval_samples_per_second": 232.985,
307
- "eval_steps_per_second": 7.898,
308
- "step": 269
309
  },
310
  {
311
- "epoch": 13.012048192771084,
312
- "grad_norm": 4.379421710968018,
313
- "learning_rate": 1.8055555555555555e-05,
314
- "loss": 0.1159,
315
- "step": 270
316
- },
317
- {
318
- "epoch": 13.493975903614459,
319
- "grad_norm": 4.8903326988220215,
320
- "learning_rate": 1.6666666666666667e-05,
321
- "loss": 0.1205,
322
- "step": 280
323
- },
324
- {
325
- "epoch": 13.975903614457831,
326
- "grad_norm": 11.284186363220215,
327
- "learning_rate": 1.527777777777778e-05,
328
- "loss": 0.0888,
329
- "step": 290
330
- },
331
- {
332
- "epoch": 13.975903614457831,
333
- "eval_accuracy": 0.9559322033898305,
334
- "eval_loss": 0.16676990687847137,
335
- "eval_runtime": 1.2901,
336
- "eval_samples_per_second": 228.665,
337
- "eval_steps_per_second": 7.751,
338
- "step": 290
339
- },
340
- {
341
- "epoch": 14.457831325301205,
342
- "grad_norm": 3.1499440670013428,
343
- "learning_rate": 1.388888888888889e-05,
344
- "loss": 0.0899,
345
- "step": 300
346
- },
347
- {
348
- "epoch": 14.939759036144578,
349
- "grad_norm": 3.247986316680908,
350
- "learning_rate": 1.25e-05,
351
- "loss": 0.0768,
352
- "step": 310
353
- },
354
- {
355
- "epoch": 14.987951807228916,
356
- "eval_accuracy": 0.9661016949152542,
357
- "eval_loss": 0.13029339909553528,
358
- "eval_runtime": 1.2781,
359
- "eval_samples_per_second": 230.82,
360
- "eval_steps_per_second": 7.824,
361
- "step": 311
362
- },
363
- {
364
- "epoch": 15.421686746987952,
365
- "grad_norm": 6.322991371154785,
366
- "learning_rate": 1.1111111111111112e-05,
367
- "loss": 0.0927,
368
- "step": 320
369
- },
370
- {
371
- "epoch": 15.903614457831326,
372
- "grad_norm": 6.686095714569092,
373
- "learning_rate": 9.722222222222223e-06,
374
- "loss": 0.0776,
375
- "step": 330
376
- },
377
- {
378
- "epoch": 16.0,
379
- "eval_accuracy": 0.9661016949152542,
380
- "eval_loss": 0.1429983526468277,
381
- "eval_runtime": 1.2624,
382
- "eval_samples_per_second": 233.689,
383
- "eval_steps_per_second": 7.922,
384
- "step": 332
385
- },
386
- {
387
- "epoch": 16.3855421686747,
388
- "grad_norm": 7.175258159637451,
389
- "learning_rate": 8.333333333333334e-06,
390
- "loss": 0.0702,
391
- "step": 340
392
- },
393
- {
394
- "epoch": 16.867469879518072,
395
- "grad_norm": 3.372356414794922,
396
- "learning_rate": 6.944444444444445e-06,
397
- "loss": 0.0853,
398
- "step": 350
399
- },
400
- {
401
- "epoch": 16.96385542168675,
402
- "eval_accuracy": 0.9593220338983051,
403
- "eval_loss": 0.1605215072631836,
404
- "eval_runtime": 1.2716,
405
- "eval_samples_per_second": 231.992,
406
- "eval_steps_per_second": 7.864,
407
- "step": 352
408
- },
409
- {
410
- "epoch": 17.349397590361445,
411
- "grad_norm": 5.981749534606934,
412
- "learning_rate": 5.555555555555556e-06,
413
- "loss": 0.0721,
414
- "step": 360
415
- },
416
- {
417
- "epoch": 17.83132530120482,
418
- "grad_norm": 4.273797988891602,
419
- "learning_rate": 4.166666666666667e-06,
420
- "loss": 0.07,
421
- "step": 370
422
  },
423
  {
424
- "epoch": 17.97590361445783,
425
- "eval_accuracy": 0.9593220338983051,
426
- "eval_loss": 0.16592465341091156,
427
- "eval_runtime": 1.2755,
428
- "eval_samples_per_second": 231.286,
429
- "eval_steps_per_second": 7.84,
430
- "step": 373
431
  },
432
  {
433
- "epoch": 18.313253012048193,
434
- "grad_norm": 5.308500289916992,
435
- "learning_rate": 2.777777777777778e-06,
436
- "loss": 0.0885,
437
- "step": 380
 
 
438
  },
439
  {
440
- "epoch": 18.795180722891565,
441
- "grad_norm": 4.930727958679199,
442
- "learning_rate": 1.388888888888889e-06,
443
- "loss": 0.0705,
444
- "step": 390
445
  },
446
  {
447
- "epoch": 18.987951807228917,
448
- "eval_accuracy": 0.9593220338983051,
449
- "eval_loss": 0.14548562467098236,
450
- "eval_runtime": 1.2629,
451
- "eval_samples_per_second": 233.598,
452
- "eval_steps_per_second": 7.919,
453
- "step": 394
454
  },
455
  {
456
- "epoch": 19.27710843373494,
457
- "grad_norm": 4.126744747161865,
458
  "learning_rate": 0.0,
459
- "loss": 0.0712,
460
- "step": 400
461
  },
462
  {
463
- "epoch": 19.27710843373494,
464
- "eval_accuracy": 0.9593220338983051,
465
- "eval_loss": 0.14513157308101654,
466
- "eval_runtime": 1.4146,
467
- "eval_samples_per_second": 208.538,
468
- "eval_steps_per_second": 7.069,
469
- "step": 400
470
  },
471
  {
472
- "epoch": 19.27710843373494,
473
- "step": 400,
474
- "total_flos": 1.2721899193419387e+18,
475
- "train_loss": 0.1996179285645485,
476
- "train_runtime": 565.0512,
477
- "train_samples_per_second": 93.974,
478
- "train_steps_per_second": 0.708
479
  }
480
  ],
481
- "logging_steps": 10,
482
- "max_steps": 400,
483
  "num_input_tokens_seen": 0,
484
- "num_train_epochs": 20,
485
  "save_steps": 500,
486
  "stateful_callbacks": {
487
- "CustomEarlyStoppingCallback": {
488
- "args": {
489
- "early_stopping_patience": 1,
490
- "early_stopping_threshold": 0.0
491
- },
492
- "attributes": {
493
- "early_stopping_patience_counter": 0
494
- }
495
- },
496
  "TrainerControl": {
497
  "args": {
498
  "should_epoch_stop": false,
@@ -504,7 +275,7 @@
504
  "attributes": {}
505
  }
506
  },
507
- "total_flos": 1.2721899193419387e+18,
508
  "train_batch_size": 32,
509
  "trial_name": null,
510
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.964824120603015,
3
+ "best_model_checkpoint": "/kaggle/working/swin-brain-abnormalities-classification/checkpoint-279",
4
+ "epoch": 14.838709677419354,
5
  "eval_steps": 500,
6
+ "global_step": 345,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.989247311827957,
13
+ "grad_norm": 12.819618225097656,
14
+ "learning_rate": 3.285714285714286e-05,
15
+ "loss": 0.928,
16
+ "step": 23
17
  },
18
  {
19
+ "epoch": 0.989247311827957,
20
+ "eval_accuracy": 0.7705192629815746,
21
+ "eval_loss": 0.6080142259597778,
22
+ "eval_runtime": 2.7919,
23
+ "eval_samples_per_second": 213.832,
24
+ "eval_steps_per_second": 6.805,
25
+ "step": 23
26
  },
27
  {
28
+ "epoch": 1.978494623655914,
29
+ "grad_norm": 15.878138542175293,
30
+ "learning_rate": 4.822580645161291e-05,
31
+ "loss": 0.508,
32
+ "step": 46
 
 
33
  },
34
  {
35
+ "epoch": 1.978494623655914,
36
+ "eval_accuracy": 0.916247906197655,
37
+ "eval_loss": 0.24024777114391327,
38
+ "eval_runtime": 2.7697,
39
+ "eval_samples_per_second": 215.545,
40
+ "eval_steps_per_second": 6.86,
41
+ "step": 46
42
  },
43
  {
44
+ "epoch": 2.967741935483871,
45
+ "grad_norm": 15.8612699508667,
46
+ "learning_rate": 4.451612903225807e-05,
47
+ "loss": 0.3178,
48
+ "step": 69
49
  },
50
  {
51
+ "epoch": 2.967741935483871,
52
+ "eval_accuracy": 0.9246231155778895,
53
+ "eval_loss": 0.21210229396820068,
54
+ "eval_runtime": 2.7786,
55
+ "eval_samples_per_second": 214.86,
56
+ "eval_steps_per_second": 6.838,
57
+ "step": 69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  },
59
  {
60
  "epoch": 4.0,
61
+ "grad_norm": 18.1905574798584,
62
+ "learning_rate": 4.0645161290322584e-05,
63
+ "loss": 0.2338,
64
+ "step": 93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  },
66
  {
67
+ "epoch": 4.0,
68
+ "eval_accuracy": 0.9363484087102177,
69
+ "eval_loss": 0.20449711382389069,
70
+ "eval_runtime": 2.7356,
71
+ "eval_samples_per_second": 218.234,
72
+ "eval_steps_per_second": 6.945,
73
+ "step": 93
74
  },
75
  {
76
+ "epoch": 4.989247311827957,
77
+ "grad_norm": 7.354104042053223,
78
+ "learning_rate": 3.6935483870967746e-05,
79
+ "loss": 0.1788,
80
+ "step": 116
 
 
81
  },
82
  {
83
+ "epoch": 4.989247311827957,
84
+ "eval_accuracy": 0.9296482412060302,
85
+ "eval_loss": 0.24434839189052582,
86
+ "eval_runtime": 2.754,
87
+ "eval_samples_per_second": 216.777,
88
+ "eval_steps_per_second": 6.899,
89
+ "step": 116
90
  },
91
  {
92
+ "epoch": 5.978494623655914,
93
+ "grad_norm": 12.372457504272461,
94
+ "learning_rate": 3.322580645161291e-05,
95
+ "loss": 0.1675,
96
+ "step": 139
97
  },
98
  {
99
+ "epoch": 5.978494623655914,
100
+ "eval_accuracy": 0.9430485762144054,
101
+ "eval_loss": 0.14566932618618011,
102
+ "eval_runtime": 2.7575,
103
+ "eval_samples_per_second": 216.503,
104
+ "eval_steps_per_second": 6.89,
105
+ "step": 139
106
  },
107
  {
108
+ "epoch": 6.967741935483871,
109
+ "grad_norm": 13.181761741638184,
110
+ "learning_rate": 2.9516129032258067e-05,
111
+ "loss": 0.155,
112
+ "step": 162
113
  },
114
  {
115
+ "epoch": 6.967741935483871,
116
+ "eval_accuracy": 0.9514237855946399,
117
+ "eval_loss": 0.17081618309020996,
118
+ "eval_runtime": 2.8126,
119
+ "eval_samples_per_second": 212.256,
120
+ "eval_steps_per_second": 6.755,
121
+ "step": 162
122
  },
123
  {
124
  "epoch": 8.0,
125
+ "grad_norm": 43.71896743774414,
126
+ "learning_rate": 2.5645161290322582e-05,
127
+ "loss": 0.1316,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  "step": 186
129
  },
130
  {
131
+ "epoch": 8.0,
132
+ "eval_accuracy": 0.9530988274706867,
133
+ "eval_loss": 0.1554775983095169,
134
+ "eval_runtime": 2.7304,
135
+ "eval_samples_per_second": 218.645,
136
+ "eval_steps_per_second": 6.959,
137
+ "step": 186
 
 
 
 
 
138
  },
139
  {
140
+ "epoch": 8.989247311827956,
141
+ "grad_norm": 8.03194808959961,
142
+ "learning_rate": 2.1935483870967744e-05,
143
+ "loss": 0.1099,
144
+ "step": 209
 
 
145
  },
146
  {
147
+ "epoch": 8.989247311827956,
148
+ "eval_accuracy": 0.9530988274706867,
149
+ "eval_loss": 0.17324857413768768,
150
+ "eval_runtime": 2.7989,
151
+ "eval_samples_per_second": 213.295,
152
+ "eval_steps_per_second": 6.788,
153
+ "step": 209
154
  },
155
  {
156
+ "epoch": 9.978494623655914,
157
+ "grad_norm": 5.151728630065918,
158
+ "learning_rate": 1.8225806451612903e-05,
159
+ "loss": 0.1121,
160
+ "step": 232
161
  },
162
  {
163
+ "epoch": 9.978494623655914,
164
+ "eval_accuracy": 0.9581239530988275,
165
+ "eval_loss": 0.13575538992881775,
166
+ "eval_runtime": 2.7641,
167
+ "eval_samples_per_second": 215.983,
168
+ "eval_steps_per_second": 6.874,
169
+ "step": 232
170
  },
171
  {
172
+ "epoch": 10.967741935483872,
173
+ "grad_norm": 14.949158668518066,
174
+ "learning_rate": 1.4516129032258066e-05,
175
+ "loss": 0.1007,
176
+ "step": 255
177
  },
178
  {
179
+ "epoch": 10.967741935483872,
180
+ "eval_accuracy": 0.9514237855946399,
181
+ "eval_loss": 0.21547764539718628,
182
+ "eval_runtime": 2.7877,
183
+ "eval_samples_per_second": 214.157,
184
+ "eval_steps_per_second": 6.816,
185
+ "step": 255
186
  },
187
  {
188
  "epoch": 12.0,
189
+ "grad_norm": 4.855920314788818,
190
+ "learning_rate": 1.064516129032258e-05,
191
+ "loss": 0.0951,
192
+ "step": 279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  },
194
  {
195
+ "epoch": 12.0,
196
+ "eval_accuracy": 0.964824120603015,
197
+ "eval_loss": 0.15059159696102142,
198
+ "eval_runtime": 2.7534,
199
+ "eval_samples_per_second": 216.826,
200
+ "eval_steps_per_second": 6.901,
201
+ "step": 279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  },
203
  {
204
+ "epoch": 12.989247311827956,
205
+ "grad_norm": 5.928957462310791,
206
+ "learning_rate": 6.935483870967742e-06,
207
+ "loss": 0.0841,
208
+ "step": 302
 
 
209
  },
210
  {
211
+ "epoch": 12.989247311827956,
212
+ "eval_accuracy": 0.9530988274706867,
213
+ "eval_loss": 0.1921372413635254,
214
+ "eval_runtime": 2.8151,
215
+ "eval_samples_per_second": 212.072,
216
+ "eval_steps_per_second": 6.749,
217
+ "step": 302
218
  },
219
  {
220
+ "epoch": 13.978494623655914,
221
+ "grad_norm": 2.9877490997314453,
222
+ "learning_rate": 3.225806451612903e-06,
223
+ "loss": 0.0778,
224
+ "step": 325
225
  },
226
  {
227
+ "epoch": 13.978494623655914,
228
+ "eval_accuracy": 0.9530988274706867,
229
+ "eval_loss": 0.20410552620887756,
230
+ "eval_runtime": 2.7514,
231
+ "eval_samples_per_second": 216.98,
232
+ "eval_steps_per_second": 6.906,
233
+ "step": 325
234
  },
235
  {
236
+ "epoch": 14.838709677419354,
237
+ "grad_norm": 5.737996578216553,
238
  "learning_rate": 0.0,
239
+ "loss": 0.0768,
240
+ "step": 345
241
  },
242
  {
243
+ "epoch": 14.838709677419354,
244
+ "eval_accuracy": 0.9547738693467337,
245
+ "eval_loss": 0.1908799260854721,
246
+ "eval_runtime": 2.96,
247
+ "eval_samples_per_second": 201.69,
248
+ "eval_steps_per_second": 6.419,
249
+ "step": 345
250
  },
251
  {
252
+ "epoch": 14.838709677419354,
253
+ "step": 345,
254
+ "total_flos": 1.0886233115316142e+18,
255
+ "train_loss": 0.21913787420245184,
256
+ "train_runtime": 552.9922,
257
+ "train_samples_per_second": 80.019,
258
+ "train_steps_per_second": 0.624
259
  }
260
  ],
261
+ "logging_steps": 1,
262
+ "max_steps": 345,
263
  "num_input_tokens_seen": 0,
264
+ "num_train_epochs": 15,
265
  "save_steps": 500,
266
  "stateful_callbacks": {
 
 
 
 
 
 
 
 
 
267
  "TrainerControl": {
268
  "args": {
269
  "should_epoch_stop": false,
 
275
  "attributes": {}
276
  }
277
  },
278
+ "total_flos": 1.0886233115316142e+18,
279
  "train_batch_size": 32,
280
  "trial_name": null,
281
  "trial_params": null