RobertoSonic commited on
Commit
e3fa80a
·
verified ·
1 Parent(s): abe7917

End of training

Browse files
README.md CHANGED
@@ -18,8 +18,8 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [microsoft/swinv2-tiny-patch4-window8-256](https://huggingface.co/microsoft/swinv2-tiny-patch4-window8-256) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.6521
22
- - Accuracy: 0.88
23
 
24
  ## Model description
25
 
 
18
 
19
  This model is a fine-tuned version of [microsoft/swinv2-tiny-patch4-window8-256](https://huggingface.co/microsoft/swinv2-tiny-patch4-window8-256) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.5977
22
+ - Accuracy: 0.9029
23
 
24
  ## Model description
25
 
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 56.0,
3
+ "eval_accuracy": 0.9028571428571428,
4
+ "eval_loss": 0.597748875617981,
5
+ "eval_runtime": 1.9786,
6
+ "eval_samples_per_second": 88.445,
7
+ "eval_steps_per_second": 5.559,
8
+ "total_flos": 1.7108328318259692e+18,
9
+ "train_loss": 0.19382851386354083,
10
+ "train_runtime": 1219.347,
11
+ "train_samples_per_second": 46.205,
12
+ "train_steps_per_second": 0.689
13
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 56.0,
3
+ "eval_accuracy": 0.9028571428571428,
4
+ "eval_loss": 0.597748875617981,
5
+ "eval_runtime": 1.9786,
6
+ "eval_samples_per_second": 88.445,
7
+ "eval_steps_per_second": 5.559
8
+ }
runs/May23_06-57-00_cba75054b839/events.out.tfevents.1747984700.cba75054b839.2385.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9f2c5aa30d6a473d28062d5e548e57e195db3d97f5aec9feeefb9177ac54c96
3
+ size 411
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 56.0,
3
+ "total_flos": 1.7108328318259692e+18,
4
+ "train_loss": 0.19382851386354083,
5
+ "train_runtime": 1219.347,
6
+ "train_samples_per_second": 46.205,
7
+ "train_steps_per_second": 0.689
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 750,
3
+ "best_metric": 0.9028571428571428,
4
+ "best_model_checkpoint": "swinv2-tiny-patch4-window8-256-dmae-humeda-DAV71/checkpoint-750",
5
+ "epoch": 56.0,
6
+ "eval_steps": 500,
7
+ "global_step": 840,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.6779661016949152,
14
+ "grad_norm": 3.3897862434387207,
15
+ "learning_rate": 6.428571428571429e-06,
16
+ "loss": 1.1246,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 1.0,
21
+ "eval_accuracy": 0.41714285714285715,
22
+ "eval_loss": 1.05587899684906,
23
+ "eval_runtime": 2.0285,
24
+ "eval_samples_per_second": 86.269,
25
+ "eval_steps_per_second": 5.423,
26
+ "step": 15
27
+ },
28
+ {
29
+ "epoch": 1.3389830508474576,
30
+ "grad_norm": 4.0562825202941895,
31
+ "learning_rate": 1.3571428571428572e-05,
32
+ "loss": 1.0075,
33
+ "step": 20
34
+ },
35
+ {
36
+ "epoch": 2.0,
37
+ "grad_norm": 4.147009372711182,
38
+ "learning_rate": 2.0714285714285715e-05,
39
+ "loss": 0.8728,
40
+ "step": 30
41
+ },
42
+ {
43
+ "epoch": 2.0,
44
+ "eval_accuracy": 0.6971428571428572,
45
+ "eval_loss": 0.746044933795929,
46
+ "eval_runtime": 2.3392,
47
+ "eval_samples_per_second": 74.811,
48
+ "eval_steps_per_second": 4.702,
49
+ "step": 30
50
+ },
51
+ {
52
+ "epoch": 2.6779661016949152,
53
+ "grad_norm": 8.447574615478516,
54
+ "learning_rate": 2.7857142857142858e-05,
55
+ "loss": 0.6663,
56
+ "step": 40
57
+ },
58
+ {
59
+ "epoch": 3.0,
60
+ "eval_accuracy": 0.8057142857142857,
61
+ "eval_loss": 0.4562816619873047,
62
+ "eval_runtime": 1.9074,
63
+ "eval_samples_per_second": 91.749,
64
+ "eval_steps_per_second": 5.767,
65
+ "step": 45
66
+ },
67
+ {
68
+ "epoch": 3.3389830508474576,
69
+ "grad_norm": 11.307677268981934,
70
+ "learning_rate": 3.5000000000000004e-05,
71
+ "loss": 0.5432,
72
+ "step": 50
73
+ },
74
+ {
75
+ "epoch": 4.0,
76
+ "grad_norm": 8.582294464111328,
77
+ "learning_rate": 4.214285714285714e-05,
78
+ "loss": 0.4632,
79
+ "step": 60
80
+ },
81
+ {
82
+ "epoch": 4.0,
83
+ "eval_accuracy": 0.8285714285714286,
84
+ "eval_loss": 0.40758973360061646,
85
+ "eval_runtime": 1.8904,
86
+ "eval_samples_per_second": 92.573,
87
+ "eval_steps_per_second": 5.819,
88
+ "step": 60
89
+ },
90
+ {
91
+ "epoch": 4.677966101694915,
92
+ "grad_norm": 9.387711524963379,
93
+ "learning_rate": 4.928571428571428e-05,
94
+ "loss": 0.4278,
95
+ "step": 70
96
+ },
97
+ {
98
+ "epoch": 5.0,
99
+ "eval_accuracy": 0.84,
100
+ "eval_loss": 0.3669876456260681,
101
+ "eval_runtime": 2.6198,
102
+ "eval_samples_per_second": 66.799,
103
+ "eval_steps_per_second": 4.199,
104
+ "step": 75
105
+ },
106
+ {
107
+ "epoch": 5.338983050847458,
108
+ "grad_norm": 8.6371431350708,
109
+ "learning_rate": 5.642857142857143e-05,
110
+ "loss": 0.3608,
111
+ "step": 80
112
+ },
113
+ {
114
+ "epoch": 6.0,
115
+ "grad_norm": 11.995199203491211,
116
+ "learning_rate": 5.96031746031746e-05,
117
+ "loss": 0.361,
118
+ "step": 90
119
+ },
120
+ {
121
+ "epoch": 6.0,
122
+ "eval_accuracy": 0.8457142857142858,
123
+ "eval_loss": 0.36241406202316284,
124
+ "eval_runtime": 1.9339,
125
+ "eval_samples_per_second": 90.49,
126
+ "eval_steps_per_second": 5.688,
127
+ "step": 90
128
+ },
129
+ {
130
+ "epoch": 6.677966101694915,
131
+ "grad_norm": 10.142115592956543,
132
+ "learning_rate": 5.880952380952381e-05,
133
+ "loss": 0.3742,
134
+ "step": 100
135
+ },
136
+ {
137
+ "epoch": 7.0,
138
+ "eval_accuracy": 0.8628571428571429,
139
+ "eval_loss": 0.3504450023174286,
140
+ "eval_runtime": 1.8843,
141
+ "eval_samples_per_second": 92.872,
142
+ "eval_steps_per_second": 5.838,
143
+ "step": 105
144
+ },
145
+ {
146
+ "epoch": 7.338983050847458,
147
+ "grad_norm": 11.923420906066895,
148
+ "learning_rate": 5.801587301587302e-05,
149
+ "loss": 0.2351,
150
+ "step": 110
151
+ },
152
+ {
153
+ "epoch": 8.0,
154
+ "grad_norm": 4.905794620513916,
155
+ "learning_rate": 5.722222222222223e-05,
156
+ "loss": 0.3313,
157
+ "step": 120
158
+ },
159
+ {
160
+ "epoch": 8.0,
161
+ "eval_accuracy": 0.8628571428571429,
162
+ "eval_loss": 0.2962282598018646,
163
+ "eval_runtime": 2.4213,
164
+ "eval_samples_per_second": 72.274,
165
+ "eval_steps_per_second": 4.543,
166
+ "step": 120
167
+ },
168
+ {
169
+ "epoch": 8.677966101694915,
170
+ "grad_norm": 8.893570899963379,
171
+ "learning_rate": 5.642857142857143e-05,
172
+ "loss": 0.2977,
173
+ "step": 130
174
+ },
175
+ {
176
+ "epoch": 9.0,
177
+ "eval_accuracy": 0.8685714285714285,
178
+ "eval_loss": 0.33206191658973694,
179
+ "eval_runtime": 1.9246,
180
+ "eval_samples_per_second": 90.929,
181
+ "eval_steps_per_second": 5.716,
182
+ "step": 135
183
+ },
184
+ {
185
+ "epoch": 9.338983050847457,
186
+ "grad_norm": 12.26343822479248,
187
+ "learning_rate": 5.563492063492064e-05,
188
+ "loss": 0.3156,
189
+ "step": 140
190
+ },
191
+ {
192
+ "epoch": 10.0,
193
+ "grad_norm": 6.5488362312316895,
194
+ "learning_rate": 5.4841269841269845e-05,
195
+ "loss": 0.2589,
196
+ "step": 150
197
+ },
198
+ {
199
+ "epoch": 10.0,
200
+ "eval_accuracy": 0.8628571428571429,
201
+ "eval_loss": 0.34251242876052856,
202
+ "eval_runtime": 1.9199,
203
+ "eval_samples_per_second": 91.152,
204
+ "eval_steps_per_second": 5.73,
205
+ "step": 150
206
+ },
207
+ {
208
+ "epoch": 10.677966101694915,
209
+ "grad_norm": 5.538010120391846,
210
+ "learning_rate": 5.404761904761905e-05,
211
+ "loss": 0.2477,
212
+ "step": 160
213
+ },
214
+ {
215
+ "epoch": 11.0,
216
+ "eval_accuracy": 0.8457142857142858,
217
+ "eval_loss": 0.39819851517677307,
218
+ "eval_runtime": 1.9172,
219
+ "eval_samples_per_second": 91.281,
220
+ "eval_steps_per_second": 5.738,
221
+ "step": 165
222
+ },
223
+ {
224
+ "epoch": 11.338983050847457,
225
+ "grad_norm": 6.982760906219482,
226
+ "learning_rate": 5.333333333333333e-05,
227
+ "loss": 0.2527,
228
+ "step": 170
229
+ },
230
+ {
231
+ "epoch": 12.0,
232
+ "grad_norm": 8.375303268432617,
233
+ "learning_rate": 5.253968253968254e-05,
234
+ "loss": 0.2187,
235
+ "step": 180
236
+ },
237
+ {
238
+ "epoch": 12.0,
239
+ "eval_accuracy": 0.8514285714285714,
240
+ "eval_loss": 0.5953956246376038,
241
+ "eval_runtime": 2.0286,
242
+ "eval_samples_per_second": 86.268,
243
+ "eval_steps_per_second": 5.423,
244
+ "step": 180
245
+ },
246
+ {
247
+ "epoch": 12.677966101694915,
248
+ "grad_norm": 7.4971723556518555,
249
+ "learning_rate": 5.174603174603175e-05,
250
+ "loss": 0.2342,
251
+ "step": 190
252
+ },
253
+ {
254
+ "epoch": 13.0,
255
+ "eval_accuracy": 0.8514285714285714,
256
+ "eval_loss": 0.3745245933532715,
257
+ "eval_runtime": 1.8701,
258
+ "eval_samples_per_second": 93.58,
259
+ "eval_steps_per_second": 5.882,
260
+ "step": 195
261
+ },
262
+ {
263
+ "epoch": 13.338983050847457,
264
+ "grad_norm": 6.758434772491455,
265
+ "learning_rate": 5.095238095238095e-05,
266
+ "loss": 0.2354,
267
+ "step": 200
268
+ },
269
+ {
270
+ "epoch": 14.0,
271
+ "grad_norm": 4.655900001525879,
272
+ "learning_rate": 5.015873015873016e-05,
273
+ "loss": 0.2444,
274
+ "step": 210
275
+ },
276
+ {
277
+ "epoch": 14.0,
278
+ "eval_accuracy": 0.8628571428571429,
279
+ "eval_loss": 0.5219993591308594,
280
+ "eval_runtime": 2.5324,
281
+ "eval_samples_per_second": 69.106,
282
+ "eval_steps_per_second": 4.344,
283
+ "step": 210
284
+ },
285
+ {
286
+ "epoch": 14.677966101694915,
287
+ "grad_norm": 8.788654327392578,
288
+ "learning_rate": 4.936507936507937e-05,
289
+ "loss": 0.2067,
290
+ "step": 220
291
+ },
292
+ {
293
+ "epoch": 15.0,
294
+ "eval_accuracy": 0.8457142857142858,
295
+ "eval_loss": 0.44333723187446594,
296
+ "eval_runtime": 1.9312,
297
+ "eval_samples_per_second": 90.617,
298
+ "eval_steps_per_second": 5.696,
299
+ "step": 225
300
+ },
301
+ {
302
+ "epoch": 15.338983050847457,
303
+ "grad_norm": 8.221491813659668,
304
+ "learning_rate": 4.8571428571428576e-05,
305
+ "loss": 0.2097,
306
+ "step": 230
307
+ },
308
+ {
309
+ "epoch": 16.0,
310
+ "grad_norm": 5.007316589355469,
311
+ "learning_rate": 4.777777777777778e-05,
312
+ "loss": 0.1882,
313
+ "step": 240
314
+ },
315
+ {
316
+ "epoch": 16.0,
317
+ "eval_accuracy": 0.8628571428571429,
318
+ "eval_loss": 0.3937312960624695,
319
+ "eval_runtime": 1.922,
320
+ "eval_samples_per_second": 91.052,
321
+ "eval_steps_per_second": 5.723,
322
+ "step": 240
323
+ },
324
+ {
325
+ "epoch": 16.677966101694913,
326
+ "grad_norm": 9.630002975463867,
327
+ "learning_rate": 4.6984126984126986e-05,
328
+ "loss": 0.199,
329
+ "step": 250
330
+ },
331
+ {
332
+ "epoch": 17.0,
333
+ "eval_accuracy": 0.8628571428571429,
334
+ "eval_loss": 0.5102602243423462,
335
+ "eval_runtime": 2.5902,
336
+ "eval_samples_per_second": 67.563,
337
+ "eval_steps_per_second": 4.247,
338
+ "step": 255
339
+ },
340
+ {
341
+ "epoch": 17.338983050847457,
342
+ "grad_norm": 5.763312339782715,
343
+ "learning_rate": 4.6190476190476194e-05,
344
+ "loss": 0.174,
345
+ "step": 260
346
+ },
347
+ {
348
+ "epoch": 18.0,
349
+ "grad_norm": 2.7853012084960938,
350
+ "learning_rate": 4.53968253968254e-05,
351
+ "loss": 0.1565,
352
+ "step": 270
353
+ },
354
+ {
355
+ "epoch": 18.0,
356
+ "eval_accuracy": 0.8857142857142857,
357
+ "eval_loss": 0.36082425713539124,
358
+ "eval_runtime": 1.8955,
359
+ "eval_samples_per_second": 92.326,
360
+ "eval_steps_per_second": 5.803,
361
+ "step": 270
362
+ },
363
+ {
364
+ "epoch": 18.677966101694913,
365
+ "grad_norm": 7.840061187744141,
366
+ "learning_rate": 4.4603174603174604e-05,
367
+ "loss": 0.2068,
368
+ "step": 280
369
+ },
370
+ {
371
+ "epoch": 19.0,
372
+ "eval_accuracy": 0.88,
373
+ "eval_loss": 0.3678865134716034,
374
+ "eval_runtime": 1.914,
375
+ "eval_samples_per_second": 91.43,
376
+ "eval_steps_per_second": 5.747,
377
+ "step": 285
378
+ },
379
+ {
380
+ "epoch": 19.338983050847457,
381
+ "grad_norm": 10.269192695617676,
382
+ "learning_rate": 4.3809523809523805e-05,
383
+ "loss": 0.1742,
384
+ "step": 290
385
+ },
386
+ {
387
+ "epoch": 20.0,
388
+ "grad_norm": 11.602302551269531,
389
+ "learning_rate": 4.301587301587302e-05,
390
+ "loss": 0.194,
391
+ "step": 300
392
+ },
393
+ {
394
+ "epoch": 20.0,
395
+ "eval_accuracy": 0.8457142857142858,
396
+ "eval_loss": 0.5581231117248535,
397
+ "eval_runtime": 2.5904,
398
+ "eval_samples_per_second": 67.556,
399
+ "eval_steps_per_second": 4.246,
400
+ "step": 300
401
+ },
402
+ {
403
+ "epoch": 20.677966101694913,
404
+ "grad_norm": 4.199820518493652,
405
+ "learning_rate": 4.222222222222222e-05,
406
+ "loss": 0.1654,
407
+ "step": 310
408
+ },
409
+ {
410
+ "epoch": 21.0,
411
+ "eval_accuracy": 0.8685714285714285,
412
+ "eval_loss": 0.5074398517608643,
413
+ "eval_runtime": 1.9301,
414
+ "eval_samples_per_second": 90.667,
415
+ "eval_steps_per_second": 5.699,
416
+ "step": 315
417
+ },
418
+ {
419
+ "epoch": 21.338983050847457,
420
+ "grad_norm": 8.24092960357666,
421
+ "learning_rate": 4.142857142857143e-05,
422
+ "loss": 0.1393,
423
+ "step": 320
424
+ },
425
+ {
426
+ "epoch": 22.0,
427
+ "grad_norm": 6.02392053604126,
428
+ "learning_rate": 4.063492063492063e-05,
429
+ "loss": 0.1986,
430
+ "step": 330
431
+ },
432
+ {
433
+ "epoch": 22.0,
434
+ "eval_accuracy": 0.88,
435
+ "eval_loss": 0.4395482540130615,
436
+ "eval_runtime": 1.9058,
437
+ "eval_samples_per_second": 91.826,
438
+ "eval_steps_per_second": 5.772,
439
+ "step": 330
440
+ },
441
+ {
442
+ "epoch": 22.677966101694913,
443
+ "grad_norm": 5.646173000335693,
444
+ "learning_rate": 3.9841269841269846e-05,
445
+ "loss": 0.1257,
446
+ "step": 340
447
+ },
448
+ {
449
+ "epoch": 23.0,
450
+ "eval_accuracy": 0.8685714285714285,
451
+ "eval_loss": 0.42931947112083435,
452
+ "eval_runtime": 2.3278,
453
+ "eval_samples_per_second": 75.179,
454
+ "eval_steps_per_second": 4.726,
455
+ "step": 345
456
+ },
457
+ {
458
+ "epoch": 23.338983050847457,
459
+ "grad_norm": 7.199140548706055,
460
+ "learning_rate": 3.904761904761905e-05,
461
+ "loss": 0.1364,
462
+ "step": 350
463
+ },
464
+ {
465
+ "epoch": 24.0,
466
+ "grad_norm": 2.1807098388671875,
467
+ "learning_rate": 3.8253968253968256e-05,
468
+ "loss": 0.1976,
469
+ "step": 360
470
+ },
471
+ {
472
+ "epoch": 24.0,
473
+ "eval_accuracy": 0.8571428571428571,
474
+ "eval_loss": 0.4932045638561249,
475
+ "eval_runtime": 1.9503,
476
+ "eval_samples_per_second": 89.731,
477
+ "eval_steps_per_second": 5.64,
478
+ "step": 360
479
+ },
480
+ {
481
+ "epoch": 24.677966101694913,
482
+ "grad_norm": 3.774115562438965,
483
+ "learning_rate": 3.746031746031746e-05,
484
+ "loss": 0.1563,
485
+ "step": 370
486
+ },
487
+ {
488
+ "epoch": 25.0,
489
+ "eval_accuracy": 0.8857142857142857,
490
+ "eval_loss": 0.42544516921043396,
491
+ "eval_runtime": 1.9489,
492
+ "eval_samples_per_second": 89.793,
493
+ "eval_steps_per_second": 5.644,
494
+ "step": 375
495
+ },
496
+ {
497
+ "epoch": 25.338983050847457,
498
+ "grad_norm": 5.813924789428711,
499
+ "learning_rate": 3.666666666666667e-05,
500
+ "loss": 0.1537,
501
+ "step": 380
502
+ },
503
+ {
504
+ "epoch": 26.0,
505
+ "grad_norm": 5.670418739318848,
506
+ "learning_rate": 3.5873015873015874e-05,
507
+ "loss": 0.0985,
508
+ "step": 390
509
+ },
510
+ {
511
+ "epoch": 26.0,
512
+ "eval_accuracy": 0.8685714285714285,
513
+ "eval_loss": 0.5096610188484192,
514
+ "eval_runtime": 2.2632,
515
+ "eval_samples_per_second": 77.323,
516
+ "eval_steps_per_second": 4.86,
517
+ "step": 390
518
+ },
519
+ {
520
+ "epoch": 26.677966101694913,
521
+ "grad_norm": 7.973656177520752,
522
+ "learning_rate": 3.507936507936508e-05,
523
+ "loss": 0.1238,
524
+ "step": 400
525
+ },
526
+ {
527
+ "epoch": 27.0,
528
+ "eval_accuracy": 0.8514285714285714,
529
+ "eval_loss": 0.7264113426208496,
530
+ "eval_runtime": 2.2954,
531
+ "eval_samples_per_second": 76.239,
532
+ "eval_steps_per_second": 4.792,
533
+ "step": 405
534
+ },
535
+ {
536
+ "epoch": 27.338983050847457,
537
+ "grad_norm": 5.4732866287231445,
538
+ "learning_rate": 3.4285714285714284e-05,
539
+ "loss": 0.1385,
540
+ "step": 410
541
+ },
542
+ {
543
+ "epoch": 28.0,
544
+ "grad_norm": 4.48883056640625,
545
+ "learning_rate": 3.34920634920635e-05,
546
+ "loss": 0.1577,
547
+ "step": 420
548
+ },
549
+ {
550
+ "epoch": 28.0,
551
+ "eval_accuracy": 0.8571428571428571,
552
+ "eval_loss": 0.4826878011226654,
553
+ "eval_runtime": 1.9183,
554
+ "eval_samples_per_second": 91.227,
555
+ "eval_steps_per_second": 5.734,
556
+ "step": 420
557
+ },
558
+ {
559
+ "epoch": 28.677966101694913,
560
+ "grad_norm": 4.5706787109375,
561
+ "learning_rate": 3.26984126984127e-05,
562
+ "loss": 0.1271,
563
+ "step": 430
564
+ },
565
+ {
566
+ "epoch": 29.0,
567
+ "eval_accuracy": 0.8685714285714285,
568
+ "eval_loss": 0.530450165271759,
569
+ "eval_runtime": 2.147,
570
+ "eval_samples_per_second": 81.509,
571
+ "eval_steps_per_second": 5.123,
572
+ "step": 435
573
+ },
574
+ {
575
+ "epoch": 29.338983050847457,
576
+ "grad_norm": 8.417387962341309,
577
+ "learning_rate": 3.190476190476191e-05,
578
+ "loss": 0.1171,
579
+ "step": 440
580
+ },
581
+ {
582
+ "epoch": 30.0,
583
+ "grad_norm": 1.3924190998077393,
584
+ "learning_rate": 3.111111111111111e-05,
585
+ "loss": 0.1002,
586
+ "step": 450
587
+ },
588
+ {
589
+ "epoch": 30.0,
590
+ "eval_accuracy": 0.8628571428571429,
591
+ "eval_loss": 0.5888301134109497,
592
+ "eval_runtime": 1.8837,
593
+ "eval_samples_per_second": 92.902,
594
+ "eval_steps_per_second": 5.84,
595
+ "step": 450
596
+ },
597
+ {
598
+ "epoch": 30.677966101694913,
599
+ "grad_norm": 4.39676570892334,
600
+ "learning_rate": 3.031746031746032e-05,
601
+ "loss": 0.1268,
602
+ "step": 460
603
+ },
604
+ {
605
+ "epoch": 31.0,
606
+ "eval_accuracy": 0.8571428571428571,
607
+ "eval_loss": 0.6432701945304871,
608
+ "eval_runtime": 1.9204,
609
+ "eval_samples_per_second": 91.126,
610
+ "eval_steps_per_second": 5.728,
611
+ "step": 465
612
+ },
613
+ {
614
+ "epoch": 31.338983050847457,
615
+ "grad_norm": 6.017373561859131,
616
+ "learning_rate": 2.9523809523809523e-05,
617
+ "loss": 0.1077,
618
+ "step": 470
619
+ },
620
+ {
621
+ "epoch": 32.0,
622
+ "grad_norm": 5.33542013168335,
623
+ "learning_rate": 2.873015873015873e-05,
624
+ "loss": 0.1153,
625
+ "step": 480
626
+ },
627
+ {
628
+ "epoch": 32.0,
629
+ "eval_accuracy": 0.8342857142857143,
630
+ "eval_loss": 0.8394165635108948,
631
+ "eval_runtime": 2.2924,
632
+ "eval_samples_per_second": 76.34,
633
+ "eval_steps_per_second": 4.798,
634
+ "step": 480
635
+ },
636
+ {
637
+ "epoch": 32.67796610169491,
638
+ "grad_norm": 13.854134559631348,
639
+ "learning_rate": 2.7936507936507936e-05,
640
+ "loss": 0.1191,
641
+ "step": 490
642
+ },
643
+ {
644
+ "epoch": 33.0,
645
+ "eval_accuracy": 0.84,
646
+ "eval_loss": 0.747542142868042,
647
+ "eval_runtime": 1.9178,
648
+ "eval_samples_per_second": 91.249,
649
+ "eval_steps_per_second": 5.736,
650
+ "step": 495
651
+ },
652
+ {
653
+ "epoch": 33.33898305084746,
654
+ "grad_norm": 8.244441986083984,
655
+ "learning_rate": 2.7142857142857144e-05,
656
+ "loss": 0.1271,
657
+ "step": 500
658
+ },
659
+ {
660
+ "epoch": 34.0,
661
+ "grad_norm": 2.1750755310058594,
662
+ "learning_rate": 2.634920634920635e-05,
663
+ "loss": 0.1184,
664
+ "step": 510
665
+ },
666
+ {
667
+ "epoch": 34.0,
668
+ "eval_accuracy": 0.8742857142857143,
669
+ "eval_loss": 0.4883846938610077,
670
+ "eval_runtime": 1.8782,
671
+ "eval_samples_per_second": 93.173,
672
+ "eval_steps_per_second": 5.857,
673
+ "step": 510
674
+ },
675
+ {
676
+ "epoch": 34.67796610169491,
677
+ "grad_norm": 9.822646141052246,
678
+ "learning_rate": 2.5555555555555557e-05,
679
+ "loss": 0.1332,
680
+ "step": 520
681
+ },
682
+ {
683
+ "epoch": 35.0,
684
+ "eval_accuracy": 0.8857142857142857,
685
+ "eval_loss": 0.5834174752235413,
686
+ "eval_runtime": 2.1992,
687
+ "eval_samples_per_second": 79.573,
688
+ "eval_steps_per_second": 5.002,
689
+ "step": 525
690
+ },
691
+ {
692
+ "epoch": 35.33898305084746,
693
+ "grad_norm": 5.659489154815674,
694
+ "learning_rate": 2.4761904761904762e-05,
695
+ "loss": 0.0804,
696
+ "step": 530
697
+ },
698
+ {
699
+ "epoch": 36.0,
700
+ "grad_norm": 5.710267066955566,
701
+ "learning_rate": 2.396825396825397e-05,
702
+ "loss": 0.1071,
703
+ "step": 540
704
+ },
705
+ {
706
+ "epoch": 36.0,
707
+ "eval_accuracy": 0.8571428571428571,
708
+ "eval_loss": 0.627878725528717,
709
+ "eval_runtime": 1.9363,
710
+ "eval_samples_per_second": 90.377,
711
+ "eval_steps_per_second": 5.681,
712
+ "step": 540
713
+ },
714
+ {
715
+ "epoch": 36.67796610169491,
716
+ "grad_norm": 6.440234661102295,
717
+ "learning_rate": 2.3174603174603175e-05,
718
+ "loss": 0.0886,
719
+ "step": 550
720
+ },
721
+ {
722
+ "epoch": 37.0,
723
+ "eval_accuracy": 0.8628571428571429,
724
+ "eval_loss": 0.6998600363731384,
725
+ "eval_runtime": 1.9136,
726
+ "eval_samples_per_second": 91.451,
727
+ "eval_steps_per_second": 5.748,
728
+ "step": 555
729
+ },
730
+ {
731
+ "epoch": 37.33898305084746,
732
+ "grad_norm": 4.174771785736084,
733
+ "learning_rate": 2.238095238095238e-05,
734
+ "loss": 0.0845,
735
+ "step": 560
736
+ },
737
+ {
738
+ "epoch": 38.0,
739
+ "grad_norm": 0.6355146765708923,
740
+ "learning_rate": 2.1587301587301585e-05,
741
+ "loss": 0.0744,
742
+ "step": 570
743
+ },
744
+ {
745
+ "epoch": 38.0,
746
+ "eval_accuracy": 0.8685714285714285,
747
+ "eval_loss": 0.729452908039093,
748
+ "eval_runtime": 2.1953,
749
+ "eval_samples_per_second": 79.716,
750
+ "eval_steps_per_second": 5.011,
751
+ "step": 570
752
+ },
753
+ {
754
+ "epoch": 38.67796610169491,
755
+ "grad_norm": 12.643016815185547,
756
+ "learning_rate": 2.0793650793650793e-05,
757
+ "loss": 0.1274,
758
+ "step": 580
759
+ },
760
+ {
761
+ "epoch": 39.0,
762
+ "eval_accuracy": 0.8914285714285715,
763
+ "eval_loss": 0.6137147545814514,
764
+ "eval_runtime": 1.9072,
765
+ "eval_samples_per_second": 91.758,
766
+ "eval_steps_per_second": 5.768,
767
+ "step": 585
768
+ },
769
+ {
770
+ "epoch": 39.33898305084746,
771
+ "grad_norm": 8.217287063598633,
772
+ "learning_rate": 1.9999999999999998e-05,
773
+ "loss": 0.0905,
774
+ "step": 590
775
+ },
776
+ {
777
+ "epoch": 40.0,
778
+ "grad_norm": 2.896934986114502,
779
+ "learning_rate": 1.9206349206349206e-05,
780
+ "loss": 0.0795,
781
+ "step": 600
782
+ },
783
+ {
784
+ "epoch": 40.0,
785
+ "eval_accuracy": 0.8742857142857143,
786
+ "eval_loss": 0.5706229209899902,
787
+ "eval_runtime": 2.3574,
788
+ "eval_samples_per_second": 74.235,
789
+ "eval_steps_per_second": 4.666,
790
+ "step": 600
791
+ },
792
+ {
793
+ "epoch": 40.67796610169491,
794
+ "grad_norm": 8.571166038513184,
795
+ "learning_rate": 1.841269841269841e-05,
796
+ "loss": 0.0962,
797
+ "step": 610
798
+ },
799
+ {
800
+ "epoch": 41.0,
801
+ "eval_accuracy": 0.8857142857142857,
802
+ "eval_loss": 0.6100188493728638,
803
+ "eval_runtime": 2.0136,
804
+ "eval_samples_per_second": 86.908,
805
+ "eval_steps_per_second": 5.463,
806
+ "step": 615
807
+ },
808
+ {
809
+ "epoch": 41.33898305084746,
810
+ "grad_norm": 4.700484752655029,
811
+ "learning_rate": 1.761904761904762e-05,
812
+ "loss": 0.0749,
813
+ "step": 620
814
+ },
815
+ {
816
+ "epoch": 42.0,
817
+ "grad_norm": 2.4115490913391113,
818
+ "learning_rate": 1.6825396825396824e-05,
819
+ "loss": 0.094,
820
+ "step": 630
821
+ },
822
+ {
823
+ "epoch": 42.0,
824
+ "eval_accuracy": 0.8742857142857143,
825
+ "eval_loss": 0.6148616075515747,
826
+ "eval_runtime": 1.9168,
827
+ "eval_samples_per_second": 91.298,
828
+ "eval_steps_per_second": 5.739,
829
+ "step": 630
830
+ },
831
+ {
832
+ "epoch": 42.67796610169491,
833
+ "grad_norm": 4.732550144195557,
834
+ "learning_rate": 1.6031746031746033e-05,
835
+ "loss": 0.0945,
836
+ "step": 640
837
+ },
838
+ {
839
+ "epoch": 43.0,
840
+ "eval_accuracy": 0.88,
841
+ "eval_loss": 0.5688998699188232,
842
+ "eval_runtime": 2.0595,
843
+ "eval_samples_per_second": 84.974,
844
+ "eval_steps_per_second": 5.341,
845
+ "step": 645
846
+ },
847
+ {
848
+ "epoch": 43.33898305084746,
849
+ "grad_norm": 4.677188396453857,
850
+ "learning_rate": 1.5238095238095238e-05,
851
+ "loss": 0.0851,
852
+ "step": 650
853
+ },
854
+ {
855
+ "epoch": 44.0,
856
+ "grad_norm": 6.458128452301025,
857
+ "learning_rate": 1.4444444444444444e-05,
858
+ "loss": 0.0584,
859
+ "step": 660
860
+ },
861
+ {
862
+ "epoch": 44.0,
863
+ "eval_accuracy": 0.8742857142857143,
864
+ "eval_loss": 0.7018650770187378,
865
+ "eval_runtime": 2.3309,
866
+ "eval_samples_per_second": 75.079,
867
+ "eval_steps_per_second": 4.719,
868
+ "step": 660
869
+ },
870
+ {
871
+ "epoch": 44.67796610169491,
872
+ "grad_norm": 4.308237552642822,
873
+ "learning_rate": 1.365079365079365e-05,
874
+ "loss": 0.0676,
875
+ "step": 670
876
+ },
877
+ {
878
+ "epoch": 45.0,
879
+ "eval_accuracy": 0.88,
880
+ "eval_loss": 0.6934124231338501,
881
+ "eval_runtime": 1.9251,
882
+ "eval_samples_per_second": 90.902,
883
+ "eval_steps_per_second": 5.714,
884
+ "step": 675
885
+ },
886
+ {
887
+ "epoch": 45.33898305084746,
888
+ "grad_norm": 2.8312790393829346,
889
+ "learning_rate": 1.2857142857142857e-05,
890
+ "loss": 0.0893,
891
+ "step": 680
892
+ },
893
+ {
894
+ "epoch": 46.0,
895
+ "grad_norm": 7.0031328201293945,
896
+ "learning_rate": 1.2063492063492064e-05,
897
+ "loss": 0.0763,
898
+ "step": 690
899
+ },
900
+ {
901
+ "epoch": 46.0,
902
+ "eval_accuracy": 0.8914285714285715,
903
+ "eval_loss": 0.6047118902206421,
904
+ "eval_runtime": 2.0296,
905
+ "eval_samples_per_second": 86.224,
906
+ "eval_steps_per_second": 5.42,
907
+ "step": 690
908
+ },
909
+ {
910
+ "epoch": 46.67796610169491,
911
+ "grad_norm": 8.401297569274902,
912
+ "learning_rate": 1.126984126984127e-05,
913
+ "loss": 0.0762,
914
+ "step": 700
915
+ },
916
+ {
917
+ "epoch": 47.0,
918
+ "eval_accuracy": 0.88,
919
+ "eval_loss": 0.6063617467880249,
920
+ "eval_runtime": 1.8566,
921
+ "eval_samples_per_second": 94.259,
922
+ "eval_steps_per_second": 5.925,
923
+ "step": 705
924
+ },
925
+ {
926
+ "epoch": 47.33898305084746,
927
+ "grad_norm": 1.462274432182312,
928
+ "learning_rate": 1.0476190476190475e-05,
929
+ "loss": 0.0563,
930
+ "step": 710
931
+ },
932
+ {
933
+ "epoch": 48.0,
934
+ "grad_norm": 1.8739376068115234,
935
+ "learning_rate": 9.682539682539682e-06,
936
+ "loss": 0.0696,
937
+ "step": 720
938
+ },
939
+ {
940
+ "epoch": 48.0,
941
+ "eval_accuracy": 0.8685714285714285,
942
+ "eval_loss": 0.7335702776908875,
943
+ "eval_runtime": 1.911,
944
+ "eval_samples_per_second": 91.576,
945
+ "eval_steps_per_second": 5.756,
946
+ "step": 720
947
+ },
948
+ {
949
+ "epoch": 48.67796610169491,
950
+ "grad_norm": 2.9189000129699707,
951
+ "learning_rate": 8.888888888888888e-06,
952
+ "loss": 0.0555,
953
+ "step": 730
954
+ },
955
+ {
956
+ "epoch": 49.0,
957
+ "eval_accuracy": 0.8742857142857143,
958
+ "eval_loss": 0.6598544120788574,
959
+ "eval_runtime": 1.9333,
960
+ "eval_samples_per_second": 90.519,
961
+ "eval_steps_per_second": 5.69,
962
+ "step": 735
963
+ },
964
+ {
965
+ "epoch": 49.33898305084746,
966
+ "grad_norm": 3.1225035190582275,
967
+ "learning_rate": 8.095238095238095e-06,
968
+ "loss": 0.1129,
969
+ "step": 740
970
+ },
971
+ {
972
+ "epoch": 50.0,
973
+ "grad_norm": 2.0588467121124268,
974
+ "learning_rate": 7.301587301587301e-06,
975
+ "loss": 0.0572,
976
+ "step": 750
977
+ },
978
+ {
979
+ "epoch": 50.0,
980
+ "eval_accuracy": 0.9028571428571428,
981
+ "eval_loss": 0.597748875617981,
982
+ "eval_runtime": 2.541,
983
+ "eval_samples_per_second": 68.87,
984
+ "eval_steps_per_second": 4.329,
985
+ "step": 750
986
+ },
987
+ {
988
+ "epoch": 50.67796610169491,
989
+ "grad_norm": 2.578906536102295,
990
+ "learning_rate": 6.507936507936508e-06,
991
+ "loss": 0.0648,
992
+ "step": 760
993
+ },
994
+ {
995
+ "epoch": 51.0,
996
+ "eval_accuracy": 0.88,
997
+ "eval_loss": 0.6257001757621765,
998
+ "eval_runtime": 1.8911,
999
+ "eval_samples_per_second": 92.541,
1000
+ "eval_steps_per_second": 5.817,
1001
+ "step": 765
1002
+ },
1003
+ {
1004
+ "epoch": 51.33898305084746,
1005
+ "grad_norm": 3.871882677078247,
1006
+ "learning_rate": 5.7142857142857145e-06,
1007
+ "loss": 0.0521,
1008
+ "step": 770
1009
+ },
1010
+ {
1011
+ "epoch": 52.0,
1012
+ "grad_norm": 3.6807923316955566,
1013
+ "learning_rate": 4.92063492063492e-06,
1014
+ "loss": 0.0705,
1015
+ "step": 780
1016
+ },
1017
+ {
1018
+ "epoch": 52.0,
1019
+ "eval_accuracy": 0.8857142857142857,
1020
+ "eval_loss": 0.6653619408607483,
1021
+ "eval_runtime": 1.8778,
1022
+ "eval_samples_per_second": 93.193,
1023
+ "eval_steps_per_second": 5.858,
1024
+ "step": 780
1025
+ },
1026
+ {
1027
+ "epoch": 52.67796610169491,
1028
+ "grad_norm": 6.3525519371032715,
1029
+ "learning_rate": 4.126984126984127e-06,
1030
+ "loss": 0.0646,
1031
+ "step": 790
1032
+ },
1033
+ {
1034
+ "epoch": 53.0,
1035
+ "eval_accuracy": 0.8685714285714285,
1036
+ "eval_loss": 0.6813338994979858,
1037
+ "eval_runtime": 1.8783,
1038
+ "eval_samples_per_second": 93.171,
1039
+ "eval_steps_per_second": 5.856,
1040
+ "step": 795
1041
+ },
1042
+ {
1043
+ "epoch": 53.33898305084746,
1044
+ "grad_norm": 5.389460563659668,
1045
+ "learning_rate": 3.3333333333333333e-06,
1046
+ "loss": 0.0463,
1047
+ "step": 800
1048
+ },
1049
+ {
1050
+ "epoch": 54.0,
1051
+ "grad_norm": 0.5001619458198547,
1052
+ "learning_rate": 2.5396825396825395e-06,
1053
+ "loss": 0.0795,
1054
+ "step": 810
1055
+ },
1056
+ {
1057
+ "epoch": 54.0,
1058
+ "eval_accuracy": 0.8742857142857143,
1059
+ "eval_loss": 0.6209337711334229,
1060
+ "eval_runtime": 2.6137,
1061
+ "eval_samples_per_second": 66.955,
1062
+ "eval_steps_per_second": 4.209,
1063
+ "step": 810
1064
+ },
1065
+ {
1066
+ "epoch": 54.67796610169491,
1067
+ "grad_norm": 9.643752098083496,
1068
+ "learning_rate": 1.746031746031746e-06,
1069
+ "loss": 0.0828,
1070
+ "step": 820
1071
+ },
1072
+ {
1073
+ "epoch": 55.0,
1074
+ "eval_accuracy": 0.8742857142857143,
1075
+ "eval_loss": 0.6456648707389832,
1076
+ "eval_runtime": 1.8674,
1077
+ "eval_samples_per_second": 93.711,
1078
+ "eval_steps_per_second": 5.89,
1079
+ "step": 825
1080
+ },
1081
+ {
1082
+ "epoch": 55.33898305084746,
1083
+ "grad_norm": 7.717844486236572,
1084
+ "learning_rate": 9.523809523809523e-07,
1085
+ "loss": 0.0916,
1086
+ "step": 830
1087
+ },
1088
+ {
1089
+ "epoch": 56.0,
1090
+ "grad_norm": 3.563279390335083,
1091
+ "learning_rate": 1.5873015873015872e-07,
1092
+ "loss": 0.0674,
1093
+ "step": 840
1094
+ },
1095
+ {
1096
+ "epoch": 56.0,
1097
+ "eval_accuracy": 0.88,
1098
+ "eval_loss": 0.6521316766738892,
1099
+ "eval_runtime": 1.8761,
1100
+ "eval_samples_per_second": 93.278,
1101
+ "eval_steps_per_second": 5.863,
1102
+ "step": 840
1103
+ },
1104
+ {
1105
+ "epoch": 56.0,
1106
+ "step": 840,
1107
+ "total_flos": 1.7108328318259692e+18,
1108
+ "train_loss": 0.19382851386354083,
1109
+ "train_runtime": 1219.347,
1110
+ "train_samples_per_second": 46.205,
1111
+ "train_steps_per_second": 0.689
1112
+ }
1113
+ ],
1114
+ "logging_steps": 10,
1115
+ "max_steps": 840,
1116
+ "num_input_tokens_seen": 0,
1117
+ "num_train_epochs": 60,
1118
+ "save_steps": 500,
1119
+ "stateful_callbacks": {
1120
+ "TrainerControl": {
1121
+ "args": {
1122
+ "should_epoch_stop": false,
1123
+ "should_evaluate": false,
1124
+ "should_log": false,
1125
+ "should_save": true,
1126
+ "should_training_stop": true
1127
+ },
1128
+ "attributes": {}
1129
+ }
1130
+ },
1131
+ "total_flos": 1.7108328318259692e+18,
1132
+ "train_batch_size": 16,
1133
+ "trial_name": null,
1134
+ "trial_params": null
1135
+ }