bombshelll commited on
Commit
15b8df9
·
verified ·
1 Parent(s): cd376b7

End of training

Browse files
Files changed (5) hide show
  1. README.md +2 -2
  2. all_results.json +11 -11
  3. eval_results.json +6 -6
  4. train_results.json +6 -6
  5. trainer_state.json +427 -94
README.md CHANGED
@@ -18,8 +18,8 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.1451
22
- - Accuracy: 0.9593
23
 
24
  ## Model description
25
 
 
18
 
19
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.1303
22
+ - Accuracy: 0.9661
23
 
24
  ## Model description
25
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 4.0,
3
- "eval_accuracy": 0.9853556485355649,
4
- "eval_loss": 0.04226452112197876,
5
- "eval_runtime": 2.1424,
6
- "eval_samples_per_second": 223.11,
7
- "eval_steps_per_second": 7.001,
8
- "total_flos": 4.2732416264483635e+17,
9
- "train_loss": 0.27906332854871396,
10
- "train_runtime": 197.2966,
11
- "train_samples_per_second": 435.689,
12
- "train_steps_per_second": 3.345
13
  }
 
1
  {
2
+ "epoch": 19.27710843373494,
3
+ "eval_accuracy": 0.9661016949152542,
4
+ "eval_loss": 0.13029339909553528,
5
+ "eval_runtime": 1.306,
6
+ "eval_samples_per_second": 225.886,
7
+ "eval_steps_per_second": 7.657,
8
+ "total_flos": 1.2721899193419387e+18,
9
+ "train_loss": 0.1996179285645485,
10
+ "train_runtime": 565.0512,
11
+ "train_samples_per_second": 93.974,
12
+ "train_steps_per_second": 0.708
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.0,
3
- "eval_accuracy": 0.9853556485355649,
4
- "eval_loss": 0.04226452112197876,
5
- "eval_runtime": 2.1424,
6
- "eval_samples_per_second": 223.11,
7
- "eval_steps_per_second": 7.001
8
  }
 
1
  {
2
+ "epoch": 19.27710843373494,
3
+ "eval_accuracy": 0.9661016949152542,
4
+ "eval_loss": 0.13029339909553528,
5
+ "eval_runtime": 1.306,
6
+ "eval_samples_per_second": 225.886,
7
+ "eval_steps_per_second": 7.657
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.0,
3
- "total_flos": 4.2732416264483635e+17,
4
- "train_loss": 0.27906332854871396,
5
- "train_runtime": 197.2966,
6
- "train_samples_per_second": 435.689,
7
- "train_steps_per_second": 3.345
8
  }
 
1
  {
2
+ "epoch": 19.27710843373494,
3
+ "total_flos": 1.2721899193419387e+18,
4
+ "train_loss": 0.1996179285645485,
5
+ "train_runtime": 565.0512,
6
+ "train_samples_per_second": 93.974,
7
+ "train_steps_per_second": 0.708
8
  }
trainer_state.json CHANGED
@@ -1,152 +1,485 @@
1
  {
2
- "best_metric": 0.9853556485355649,
3
- "best_model_checkpoint": "/kaggle/working/swin-brain-abnormalities-classification/checkpoint-135",
4
- "epoch": 4.0,
5
  "eval_steps": 500,
6
- "global_step": 135,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.2962962962962963,
13
- "grad_norm": 6.951941013336182,
14
- "learning_rate": 7.5757575757575764e-06,
15
- "loss": 0.6951,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.5925925925925926,
20
- "grad_norm": 3.8463668823242188,
21
- "learning_rate": 1.5151515151515153e-05,
22
- "loss": 0.5921,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.8888888888888888,
27
- "grad_norm": 3.810600519180298,
28
- "learning_rate": 2.272727272727273e-05,
29
- "loss": 0.4614,
30
- "step": 30
 
 
31
  },
32
  {
33
- "epoch": 0.9777777777777777,
34
- "eval_accuracy": 0.8619246861924686,
35
- "eval_loss": 0.3314987123012543,
36
- "eval_runtime": 2.4698,
37
- "eval_samples_per_second": 193.534,
38
- "eval_steps_per_second": 6.073,
39
- "step": 33
40
  },
41
  {
42
- "epoch": 1.1851851851851851,
43
- "grad_norm": 7.118009567260742,
44
- "learning_rate": 3.0303030303030306e-05,
45
- "loss": 0.3518,
46
  "step": 40
47
  },
48
  {
49
- "epoch": 1.4814814814814814,
50
- "grad_norm": 11.195390701293945,
51
- "learning_rate": 3.787878787878788e-05,
52
- "loss": 0.3006,
 
 
 
 
 
 
 
 
 
53
  "step": 50
54
  },
55
  {
56
- "epoch": 1.7777777777777777,
57
- "grad_norm": 4.766914367675781,
58
- "learning_rate": 4.545454545454546e-05,
59
- "loss": 0.231,
60
  "step": 60
61
  },
62
  {
63
- "epoch": 1.9851851851851852,
64
- "eval_accuracy": 0.8807531380753139,
65
- "eval_loss": 0.2779073119163513,
66
- "eval_runtime": 2.1596,
67
- "eval_samples_per_second": 221.342,
68
- "eval_steps_per_second": 6.946,
69
- "step": 67
70
  },
71
  {
72
- "epoch": 2.074074074074074,
73
- "grad_norm": 6.157414436340332,
74
- "learning_rate": 4.966329966329967e-05,
75
- "loss": 0.2006,
76
  "step": 70
77
  },
78
  {
79
- "epoch": 2.3703703703703702,
80
- "grad_norm": 6.762674331665039,
81
- "learning_rate": 4.882154882154882e-05,
82
- "loss": 0.1684,
83
  "step": 80
84
  },
85
  {
86
- "epoch": 2.6666666666666665,
87
- "grad_norm": 19.68504524230957,
88
- "learning_rate": 4.797979797979798e-05,
89
- "loss": 0.1645,
 
 
 
 
 
 
 
 
 
90
  "step": 90
91
  },
92
  {
93
- "epoch": 2.962962962962963,
94
- "grad_norm": 3.5722227096557617,
95
- "learning_rate": 4.713804713804714e-05,
96
- "loss": 0.1631,
97
  "step": 100
98
  },
99
  {
100
- "epoch": 2.9925925925925925,
101
- "eval_accuracy": 0.8807531380753139,
102
- "eval_loss": 0.2769422233104706,
103
- "eval_runtime": 2.1679,
104
- "eval_samples_per_second": 220.494,
105
- "eval_steps_per_second": 6.919,
106
- "step": 101
107
  },
108
  {
109
- "epoch": 3.259259259259259,
110
- "grad_norm": 6.03804349899292,
111
- "learning_rate": 4.62962962962963e-05,
112
- "loss": 0.1491,
113
  "step": 110
114
  },
115
  {
116
- "epoch": 3.5555555555555554,
117
- "grad_norm": 7.634627342224121,
118
- "learning_rate": 4.545454545454546e-05,
119
- "loss": 0.113,
120
  "step": 120
121
  },
122
  {
123
- "epoch": 3.851851851851852,
124
- "grad_norm": 6.576891899108887,
125
- "learning_rate": 4.4612794612794616e-05,
126
- "loss": 0.1141,
 
 
 
 
 
 
 
 
 
127
  "step": 130
128
  },
129
  {
130
- "epoch": 4.0,
131
- "eval_accuracy": 0.9853556485355649,
132
- "eval_loss": 0.04226452112197876,
133
- "eval_runtime": 2.1274,
134
- "eval_samples_per_second": 224.686,
135
- "eval_steps_per_second": 7.051,
136
- "step": 135
137
  },
138
  {
139
- "epoch": 4.0,
140
- "step": 135,
141
- "total_flos": 4.2732416264483635e+17,
142
- "train_loss": 0.27906332854871396,
143
- "train_runtime": 197.2966,
144
- "train_samples_per_second": 435.689,
145
- "train_steps_per_second": 3.345
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  }
147
  ],
148
  "logging_steps": 10,
149
- "max_steps": 660,
150
  "num_input_tokens_seen": 0,
151
  "num_train_epochs": 20,
152
  "save_steps": 500,
@@ -171,7 +504,7 @@
171
  "attributes": {}
172
  }
173
  },
174
- "total_flos": 4.2732416264483635e+17,
175
  "train_batch_size": 32,
176
  "trial_name": null,
177
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.9661016949152542,
3
+ "best_model_checkpoint": "/kaggle/working/swin-brain-abnormalities-classification/checkpoint-311",
4
+ "epoch": 19.27710843373494,
5
  "eval_steps": 500,
6
+ "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.4819277108433735,
13
+ "grad_norm": 7.259073257446289,
14
+ "learning_rate": 1.25e-05,
15
+ "loss": 1.1246,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.963855421686747,
20
+ "grad_norm": 4.209974765777588,
21
+ "learning_rate": 2.5e-05,
22
+ "loss": 0.7845,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.963855421686747,
27
+ "eval_accuracy": 0.7661016949152543,
28
+ "eval_loss": 0.5746350288391113,
29
+ "eval_runtime": 1.2982,
30
+ "eval_samples_per_second": 227.238,
31
+ "eval_steps_per_second": 7.703,
32
+ "step": 20
33
  },
34
  {
35
+ "epoch": 1.4457831325301205,
36
+ "grad_norm": 8.389396667480469,
37
+ "learning_rate": 3.7500000000000003e-05,
38
+ "loss": 0.6339,
39
+ "step": 30
 
 
40
  },
41
  {
42
+ "epoch": 1.927710843373494,
43
+ "grad_norm": 5.584402561187744,
44
+ "learning_rate": 5e-05,
45
+ "loss": 0.4587,
46
  "step": 40
47
  },
48
  {
49
+ "epoch": 1.9759036144578315,
50
+ "eval_accuracy": 0.8779661016949153,
51
+ "eval_loss": 0.29308223724365234,
52
+ "eval_runtime": 1.2711,
53
+ "eval_samples_per_second": 232.081,
54
+ "eval_steps_per_second": 7.867,
55
+ "step": 41
56
+ },
57
+ {
58
+ "epoch": 2.4096385542168672,
59
+ "grad_norm": 13.30373477935791,
60
+ "learning_rate": 4.8611111111111115e-05,
61
+ "loss": 0.3783,
62
  "step": 50
63
  },
64
  {
65
+ "epoch": 2.891566265060241,
66
+ "grad_norm": 14.480766296386719,
67
+ "learning_rate": 4.722222222222222e-05,
68
+ "loss": 0.3004,
69
  "step": 60
70
  },
71
  {
72
+ "epoch": 2.9879518072289155,
73
+ "eval_accuracy": 0.8949152542372881,
74
+ "eval_loss": 0.2784439027309418,
75
+ "eval_runtime": 1.2869,
76
+ "eval_samples_per_second": 229.239,
77
+ "eval_steps_per_second": 7.771,
78
+ "step": 62
79
  },
80
  {
81
+ "epoch": 3.3734939759036147,
82
+ "grad_norm": 22.313514709472656,
83
+ "learning_rate": 4.5833333333333334e-05,
84
+ "loss": 0.2702,
85
  "step": 70
86
  },
87
  {
88
+ "epoch": 3.855421686746988,
89
+ "grad_norm": 15.409673690795898,
90
+ "learning_rate": 4.4444444444444447e-05,
91
+ "loss": 0.2379,
92
  "step": 80
93
  },
94
  {
95
+ "epoch": 4.0,
96
+ "eval_accuracy": 0.9355932203389831,
97
+ "eval_loss": 0.1557122766971588,
98
+ "eval_runtime": 1.3066,
99
+ "eval_samples_per_second": 225.78,
100
+ "eval_steps_per_second": 7.654,
101
+ "step": 83
102
+ },
103
+ {
104
+ "epoch": 4.337349397590361,
105
+ "grad_norm": 8.086126327514648,
106
+ "learning_rate": 4.305555555555556e-05,
107
+ "loss": 0.2492,
108
  "step": 90
109
  },
110
  {
111
+ "epoch": 4.8192771084337345,
112
+ "grad_norm": 13.449581146240234,
113
+ "learning_rate": 4.166666666666667e-05,
114
+ "loss": 0.1845,
115
  "step": 100
116
  },
117
  {
118
+ "epoch": 4.9638554216867465,
119
+ "eval_accuracy": 0.9491525423728814,
120
+ "eval_loss": 0.15197788178920746,
121
+ "eval_runtime": 1.2703,
122
+ "eval_samples_per_second": 232.229,
123
+ "eval_steps_per_second": 7.872,
124
+ "step": 103
125
  },
126
  {
127
+ "epoch": 5.301204819277109,
128
+ "grad_norm": 11.823051452636719,
129
+ "learning_rate": 4.027777777777778e-05,
130
+ "loss": 0.1818,
131
  "step": 110
132
  },
133
  {
134
+ "epoch": 5.783132530120482,
135
+ "grad_norm": 7.886294364929199,
136
+ "learning_rate": 3.888888888888889e-05,
137
+ "loss": 0.1445,
138
  "step": 120
139
  },
140
  {
141
+ "epoch": 5.975903614457831,
142
+ "eval_accuracy": 0.9525423728813559,
143
+ "eval_loss": 0.14502111077308655,
144
+ "eval_runtime": 1.2821,
145
+ "eval_samples_per_second": 230.1,
146
+ "eval_steps_per_second": 7.8,
147
+ "step": 124
148
+ },
149
+ {
150
+ "epoch": 6.265060240963855,
151
+ "grad_norm": 9.61337661743164,
152
+ "learning_rate": 3.7500000000000003e-05,
153
+ "loss": 0.1449,
154
  "step": 130
155
  },
156
  {
157
+ "epoch": 6.746987951807229,
158
+ "grad_norm": 10.163887977600098,
159
+ "learning_rate": 3.611111111111111e-05,
160
+ "loss": 0.1557,
161
+ "step": 140
 
 
162
  },
163
  {
164
+ "epoch": 6.9879518072289155,
165
+ "eval_accuracy": 0.9525423728813559,
166
+ "eval_loss": 0.11894461512565613,
167
+ "eval_runtime": 1.2782,
168
+ "eval_samples_per_second": 230.788,
169
+ "eval_steps_per_second": 7.823,
170
+ "step": 145
171
+ },
172
+ {
173
+ "epoch": 7.228915662650603,
174
+ "grad_norm": 5.782637596130371,
175
+ "learning_rate": 3.472222222222222e-05,
176
+ "loss": 0.1577,
177
+ "step": 150
178
+ },
179
+ {
180
+ "epoch": 7.710843373493976,
181
+ "grad_norm": 8.481064796447754,
182
+ "learning_rate": 3.3333333333333335e-05,
183
+ "loss": 0.1503,
184
+ "step": 160
185
+ },
186
+ {
187
+ "epoch": 8.0,
188
+ "eval_accuracy": 0.9559322033898305,
189
+ "eval_loss": 0.12011975049972534,
190
+ "eval_runtime": 1.2771,
191
+ "eval_samples_per_second": 230.985,
192
+ "eval_steps_per_second": 7.83,
193
+ "step": 166
194
+ },
195
+ {
196
+ "epoch": 8.19277108433735,
197
+ "grad_norm": 11.080423355102539,
198
+ "learning_rate": 3.194444444444444e-05,
199
+ "loss": 0.1716,
200
+ "step": 170
201
+ },
202
+ {
203
+ "epoch": 8.674698795180722,
204
+ "grad_norm": 6.277684211730957,
205
+ "learning_rate": 3.055555555555556e-05,
206
+ "loss": 0.1446,
207
+ "step": 180
208
+ },
209
+ {
210
+ "epoch": 8.963855421686747,
211
+ "eval_accuracy": 0.9627118644067797,
212
+ "eval_loss": 0.12793326377868652,
213
+ "eval_runtime": 1.2696,
214
+ "eval_samples_per_second": 232.353,
215
+ "eval_steps_per_second": 7.876,
216
+ "step": 186
217
+ },
218
+ {
219
+ "epoch": 9.156626506024097,
220
+ "grad_norm": 5.818056106567383,
221
+ "learning_rate": 2.916666666666667e-05,
222
+ "loss": 0.1256,
223
+ "step": 190
224
+ },
225
+ {
226
+ "epoch": 9.638554216867469,
227
+ "grad_norm": 5.736883163452148,
228
+ "learning_rate": 2.777777777777778e-05,
229
+ "loss": 0.1368,
230
+ "step": 200
231
+ },
232
+ {
233
+ "epoch": 9.975903614457831,
234
+ "eval_accuracy": 0.9593220338983051,
235
+ "eval_loss": 0.13930343091487885,
236
+ "eval_runtime": 1.2699,
237
+ "eval_samples_per_second": 232.298,
238
+ "eval_steps_per_second": 7.875,
239
+ "step": 207
240
+ },
241
+ {
242
+ "epoch": 10.120481927710843,
243
+ "grad_norm": 5.910589218139648,
244
+ "learning_rate": 2.6388888888888892e-05,
245
+ "loss": 0.1273,
246
+ "step": 210
247
+ },
248
+ {
249
+ "epoch": 10.602409638554217,
250
+ "grad_norm": 9.146703720092773,
251
+ "learning_rate": 2.5e-05,
252
+ "loss": 0.111,
253
+ "step": 220
254
+ },
255
+ {
256
+ "epoch": 10.987951807228916,
257
+ "eval_accuracy": 0.9627118644067797,
258
+ "eval_loss": 0.17713582515716553,
259
+ "eval_runtime": 1.2537,
260
+ "eval_samples_per_second": 235.307,
261
+ "eval_steps_per_second": 7.976,
262
+ "step": 228
263
+ },
264
+ {
265
+ "epoch": 11.08433734939759,
266
+ "grad_norm": 4.925355434417725,
267
+ "learning_rate": 2.361111111111111e-05,
268
+ "loss": 0.1125,
269
+ "step": 230
270
+ },
271
+ {
272
+ "epoch": 11.566265060240964,
273
+ "grad_norm": 4.107492923736572,
274
+ "learning_rate": 2.2222222222222223e-05,
275
+ "loss": 0.118,
276
+ "step": 240
277
+ },
278
+ {
279
+ "epoch": 12.0,
280
+ "eval_accuracy": 0.9627118644067797,
281
+ "eval_loss": 0.15914401412010193,
282
+ "eval_runtime": 1.2854,
283
+ "eval_samples_per_second": 229.507,
284
+ "eval_steps_per_second": 7.78,
285
+ "step": 249
286
+ },
287
+ {
288
+ "epoch": 12.048192771084338,
289
+ "grad_norm": 7.796498775482178,
290
+ "learning_rate": 2.0833333333333336e-05,
291
+ "loss": 0.0915,
292
+ "step": 250
293
+ },
294
+ {
295
+ "epoch": 12.53012048192771,
296
+ "grad_norm": 9.677573204040527,
297
+ "learning_rate": 1.9444444444444445e-05,
298
+ "loss": 0.099,
299
+ "step": 260
300
+ },
301
+ {
302
+ "epoch": 12.963855421686747,
303
+ "eval_accuracy": 0.9593220338983051,
304
+ "eval_loss": 0.15266619622707367,
305
+ "eval_runtime": 1.2662,
306
+ "eval_samples_per_second": 232.985,
307
+ "eval_steps_per_second": 7.898,
308
+ "step": 269
309
+ },
310
+ {
311
+ "epoch": 13.012048192771084,
312
+ "grad_norm": 4.379421710968018,
313
+ "learning_rate": 1.8055555555555555e-05,
314
+ "loss": 0.1159,
315
+ "step": 270
316
+ },
317
+ {
318
+ "epoch": 13.493975903614459,
319
+ "grad_norm": 4.8903326988220215,
320
+ "learning_rate": 1.6666666666666667e-05,
321
+ "loss": 0.1205,
322
+ "step": 280
323
+ },
324
+ {
325
+ "epoch": 13.975903614457831,
326
+ "grad_norm": 11.284186363220215,
327
+ "learning_rate": 1.527777777777778e-05,
328
+ "loss": 0.0888,
329
+ "step": 290
330
+ },
331
+ {
332
+ "epoch": 13.975903614457831,
333
+ "eval_accuracy": 0.9559322033898305,
334
+ "eval_loss": 0.16676990687847137,
335
+ "eval_runtime": 1.2901,
336
+ "eval_samples_per_second": 228.665,
337
+ "eval_steps_per_second": 7.751,
338
+ "step": 290
339
+ },
340
+ {
341
+ "epoch": 14.457831325301205,
342
+ "grad_norm": 3.1499440670013428,
343
+ "learning_rate": 1.388888888888889e-05,
344
+ "loss": 0.0899,
345
+ "step": 300
346
+ },
347
+ {
348
+ "epoch": 14.939759036144578,
349
+ "grad_norm": 3.247986316680908,
350
+ "learning_rate": 1.25e-05,
351
+ "loss": 0.0768,
352
+ "step": 310
353
+ },
354
+ {
355
+ "epoch": 14.987951807228916,
356
+ "eval_accuracy": 0.9661016949152542,
357
+ "eval_loss": 0.13029339909553528,
358
+ "eval_runtime": 1.2781,
359
+ "eval_samples_per_second": 230.82,
360
+ "eval_steps_per_second": 7.824,
361
+ "step": 311
362
+ },
363
+ {
364
+ "epoch": 15.421686746987952,
365
+ "grad_norm": 6.322991371154785,
366
+ "learning_rate": 1.1111111111111112e-05,
367
+ "loss": 0.0927,
368
+ "step": 320
369
+ },
370
+ {
371
+ "epoch": 15.903614457831326,
372
+ "grad_norm": 6.686095714569092,
373
+ "learning_rate": 9.722222222222223e-06,
374
+ "loss": 0.0776,
375
+ "step": 330
376
+ },
377
+ {
378
+ "epoch": 16.0,
379
+ "eval_accuracy": 0.9661016949152542,
380
+ "eval_loss": 0.1429983526468277,
381
+ "eval_runtime": 1.2624,
382
+ "eval_samples_per_second": 233.689,
383
+ "eval_steps_per_second": 7.922,
384
+ "step": 332
385
+ },
386
+ {
387
+ "epoch": 16.3855421686747,
388
+ "grad_norm": 7.175258159637451,
389
+ "learning_rate": 8.333333333333334e-06,
390
+ "loss": 0.0702,
391
+ "step": 340
392
+ },
393
+ {
394
+ "epoch": 16.867469879518072,
395
+ "grad_norm": 3.372356414794922,
396
+ "learning_rate": 6.944444444444445e-06,
397
+ "loss": 0.0853,
398
+ "step": 350
399
+ },
400
+ {
401
+ "epoch": 16.96385542168675,
402
+ "eval_accuracy": 0.9593220338983051,
403
+ "eval_loss": 0.1605215072631836,
404
+ "eval_runtime": 1.2716,
405
+ "eval_samples_per_second": 231.992,
406
+ "eval_steps_per_second": 7.864,
407
+ "step": 352
408
+ },
409
+ {
410
+ "epoch": 17.349397590361445,
411
+ "grad_norm": 5.981749534606934,
412
+ "learning_rate": 5.555555555555556e-06,
413
+ "loss": 0.0721,
414
+ "step": 360
415
+ },
416
+ {
417
+ "epoch": 17.83132530120482,
418
+ "grad_norm": 4.273797988891602,
419
+ "learning_rate": 4.166666666666667e-06,
420
+ "loss": 0.07,
421
+ "step": 370
422
+ },
423
+ {
424
+ "epoch": 17.97590361445783,
425
+ "eval_accuracy": 0.9593220338983051,
426
+ "eval_loss": 0.16592465341091156,
427
+ "eval_runtime": 1.2755,
428
+ "eval_samples_per_second": 231.286,
429
+ "eval_steps_per_second": 7.84,
430
+ "step": 373
431
+ },
432
+ {
433
+ "epoch": 18.313253012048193,
434
+ "grad_norm": 5.308500289916992,
435
+ "learning_rate": 2.777777777777778e-06,
436
+ "loss": 0.0885,
437
+ "step": 380
438
+ },
439
+ {
440
+ "epoch": 18.795180722891565,
441
+ "grad_norm": 4.930727958679199,
442
+ "learning_rate": 1.388888888888889e-06,
443
+ "loss": 0.0705,
444
+ "step": 390
445
+ },
446
+ {
447
+ "epoch": 18.987951807228917,
448
+ "eval_accuracy": 0.9593220338983051,
449
+ "eval_loss": 0.14548562467098236,
450
+ "eval_runtime": 1.2629,
451
+ "eval_samples_per_second": 233.598,
452
+ "eval_steps_per_second": 7.919,
453
+ "step": 394
454
+ },
455
+ {
456
+ "epoch": 19.27710843373494,
457
+ "grad_norm": 4.126744747161865,
458
+ "learning_rate": 0.0,
459
+ "loss": 0.0712,
460
+ "step": 400
461
+ },
462
+ {
463
+ "epoch": 19.27710843373494,
464
+ "eval_accuracy": 0.9593220338983051,
465
+ "eval_loss": 0.14513157308101654,
466
+ "eval_runtime": 1.4146,
467
+ "eval_samples_per_second": 208.538,
468
+ "eval_steps_per_second": 7.069,
469
+ "step": 400
470
+ },
471
+ {
472
+ "epoch": 19.27710843373494,
473
+ "step": 400,
474
+ "total_flos": 1.2721899193419387e+18,
475
+ "train_loss": 0.1996179285645485,
476
+ "train_runtime": 565.0512,
477
+ "train_samples_per_second": 93.974,
478
+ "train_steps_per_second": 0.708
479
  }
480
  ],
481
  "logging_steps": 10,
482
+ "max_steps": 400,
483
  "num_input_tokens_seen": 0,
484
  "num_train_epochs": 20,
485
  "save_steps": 500,
 
504
  "attributes": {}
505
  }
506
  },
507
+ "total_flos": 1.2721899193419387e+18,
508
  "train_batch_size": 32,
509
  "trial_name": null,
510
  "trial_params": null