zireael08 commited on
Commit
0aba07d
·
verified ·
1 Parent(s): 116c7df

zireael08/swin-msldv2

Browse files
README.md CHANGED
@@ -23,7 +23,7 @@ model-index:
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
- value: 0.9929701230228472
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -33,8 +33,8 @@ should probably proofread and complete it, then remove this comment. -->
33
 
34
  This model is a fine-tuned version of [microsoft/swinv2-tiny-patch4-window8-256](https://huggingface.co/microsoft/swinv2-tiny-patch4-window8-256) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
- - Loss: 0.0139
37
- - Accuracy: 0.9930
38
 
39
  ## Model description
40
 
@@ -53,7 +53,7 @@ More information needed
53
  ### Training hyperparameters
54
 
55
  The following hyperparameters were used during training:
56
- - learning_rate: 0.0002
57
  - train_batch_size: 32
58
  - eval_batch_size: 64
59
  - seed: 42
@@ -66,16 +66,16 @@ The following hyperparameters were used during training:
66
 
67
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
68
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
69
- | 0.6791 | 1.0 | 83 | 0.4035 | 0.8389 |
70
- | 0.5440 | 2.0 | 166 | 0.4195 | 0.8301 |
71
- | 0.5461 | 3.0 | 249 | 0.2318 | 0.9062 |
72
- | 0.3638 | 4.0 | 332 | 0.1876 | 0.9186 |
73
- | 0.2937 | 5.0 | 415 | 0.1239 | 0.9540 |
74
- | 0.1980 | 6.0 | 498 | 0.0752 | 0.9770 |
75
- | 0.1727 | 7.0 | 581 | 0.0600 | 0.9823 |
76
- | 0.0900 | 8.0 | 664 | 0.0722 | 0.9858 |
77
- | 0.0441 | 9.0 | 747 | 0.0640 | 0.9876 |
78
- | 0.1195 | 10.0 | 830 | 0.0693 | 0.9841 |
79
 
80
 
81
  ### Framework versions
 
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
+ value: 0.9982425307557118
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
33
 
34
  This model is a fine-tuned version of [microsoft/swinv2-tiny-patch4-window8-256](https://huggingface.co/microsoft/swinv2-tiny-patch4-window8-256) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
+ - Loss: 0.0098
37
+ - Accuracy: 0.9982
38
 
39
  ## Model description
40
 
 
53
  ### Training hyperparameters
54
 
55
  The following hyperparameters were used during training:
56
+ - learning_rate: 0.0001
57
  - train_batch_size: 32
58
  - eval_batch_size: 64
59
  - seed: 42
 
66
 
67
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
68
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
69
+ | 0.6367 | 1.0 | 83 | 0.4612 | 0.8248 |
70
+ | 0.4656 | 2.0 | 166 | 0.3608 | 0.8496 |
71
+ | 0.4911 | 3.0 | 249 | 0.1344 | 0.9646 |
72
+ | 0.1630 | 4.0 | 332 | 0.1347 | 0.9575 |
73
+ | 0.1872 | 5.0 | 415 | 0.1106 | 0.9628 |
74
+ | 0.1801 | 6.0 | 498 | 0.0968 | 0.9823 |
75
+ | 0.1453 | 7.0 | 581 | 0.1196 | 0.9717 |
76
+ | 0.0787 | 8.0 | 664 | 0.0838 | 0.9894 |
77
+ | 0.0353 | 9.0 | 747 | 0.0801 | 0.9912 |
78
+ | 0.0878 | 10.0 | 830 | 0.0818 | 0.9912 |
79
 
80
 
81
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.9929701230228472,
4
- "eval_loss": 0.013884289190173149,
5
- "eval_runtime": 5.6893,
6
- "eval_samples_per_second": 100.013,
7
- "eval_steps_per_second": 1.582,
8
  "total_flos": 8.593274471605862e+17,
9
- "train_loss": 0.33146001000002207,
10
- "train_runtime": 922.6735,
11
- "train_samples_per_second": 28.623,
12
- "train_steps_per_second": 0.9
13
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.9982425307557118,
4
+ "eval_loss": 0.009822744876146317,
5
+ "eval_runtime": 6.0857,
6
+ "eval_samples_per_second": 93.498,
7
+ "eval_steps_per_second": 1.479,
8
  "total_flos": 8.593274471605862e+17,
9
+ "train_loss": 0.29410572172288435,
10
+ "train_runtime": 896.4382,
11
+ "train_samples_per_second": 29.461,
12
+ "train_steps_per_second": 0.926
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.9929701230228472,
4
- "eval_loss": 0.013884289190173149,
5
- "eval_runtime": 5.6893,
6
- "eval_samples_per_second": 100.013,
7
- "eval_steps_per_second": 1.582
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.9982425307557118,
4
+ "eval_loss": 0.009822744876146317,
5
+ "eval_runtime": 6.0857,
6
+ "eval_samples_per_second": 93.498,
7
+ "eval_steps_per_second": 1.479
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88893b360ace28496cff1e3fb3284cbcf229d4e60a790e577d565d9fbe374109
3
  size 110362448
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe2399d36d2c712a58c79aa8eada79a504907fecd03646c12c6865d1884ee997
3
  size 110362448
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
  "total_flos": 8.593274471605862e+17,
4
- "train_loss": 0.33146001000002207,
5
- "train_runtime": 922.6735,
6
- "train_samples_per_second": 28.623,
7
- "train_steps_per_second": 0.9
8
  }
 
1
  {
2
  "epoch": 10.0,
3
  "total_flos": 8.593274471605862e+17,
4
+ "train_loss": 0.29410572172288435,
5
+ "train_runtime": 896.4382,
6
+ "train_samples_per_second": 29.461,
7
+ "train_steps_per_second": 0.926
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 747,
3
- "best_metric": 0.9876106194690265,
4
  "best_model_checkpoint": "./logs/checkpoint-747",
5
  "epoch": 10.0,
6
  "eval_steps": 500,
@@ -11,683 +11,683 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.12048192771084337,
14
- "grad_norm": 6.176705837249756,
15
- "learning_rate": 2.168674698795181e-05,
16
- "loss": 1.7943456649780274,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.24096385542168675,
21
- "grad_norm": 8.065713882446289,
22
- "learning_rate": 4.578313253012048e-05,
23
- "loss": 1.5439151763916015,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.3614457831325301,
28
- "grad_norm": 7.123748302459717,
29
- "learning_rate": 6.987951807228917e-05,
30
- "loss": 1.2263535499572753,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.4819277108433735,
35
- "grad_norm": 15.442414283752441,
36
- "learning_rate": 9.397590361445784e-05,
37
- "loss": 0.8671344757080078,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.6024096385542169,
42
- "grad_norm": 13.811105728149414,
43
- "learning_rate": 0.00011807228915662652,
44
- "loss": 0.7558579444885254,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.7228915662650602,
49
- "grad_norm": 11.728008270263672,
50
- "learning_rate": 0.00014216867469879518,
51
- "loss": 0.8202498435974122,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.8433734939759037,
56
- "grad_norm": 9.147120475769043,
57
- "learning_rate": 0.00016626506024096388,
58
- "loss": 0.7484613418579101,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.963855421686747,
63
- "grad_norm": 13.851210594177246,
64
- "learning_rate": 0.00019036144578313252,
65
- "loss": 0.6791098594665528,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 1.0,
70
- "eval_accuracy": 0.8389380530973451,
71
- "eval_loss": 0.40345069766044617,
72
- "eval_runtime": 5.4508,
73
- "eval_samples_per_second": 103.654,
74
- "eval_steps_per_second": 1.651,
75
  "step": 83
76
  },
77
  {
78
  "epoch": 1.0843373493975903,
79
- "grad_norm": 7.1830878257751465,
80
- "learning_rate": 0.00019996816476922677,
81
- "loss": 0.627526044845581,
82
  "step": 90
83
  },
84
  {
85
  "epoch": 1.2048192771084336,
86
- "grad_norm": 9.858720779418945,
87
- "learning_rate": 0.00019977368953632839,
88
- "loss": 0.5963219165802002,
89
  "step": 100
90
  },
91
  {
92
  "epoch": 1.3253012048192772,
93
- "grad_norm": 16.397104263305664,
94
- "learning_rate": 0.0001994027688138541,
95
- "loss": 0.6106359958648682,
96
  "step": 110
97
  },
98
  {
99
  "epoch": 1.4457831325301205,
100
- "grad_norm": 8.843240737915039,
101
- "learning_rate": 0.00019885605855918885,
102
- "loss": 0.6805217742919922,
103
  "step": 120
104
  },
105
  {
106
  "epoch": 1.5662650602409638,
107
- "grad_norm": 14.887824058532715,
108
- "learning_rate": 0.0001981345256059637,
109
- "loss": 0.531688928604126,
110
  "step": 130
111
  },
112
  {
113
  "epoch": 1.6867469879518073,
114
- "grad_norm": 9.859296798706055,
115
- "learning_rate": 0.0001972394459542521,
116
- "loss": 0.5321962833404541,
117
  "step": 140
118
  },
119
  {
120
  "epoch": 1.8072289156626506,
121
- "grad_norm": 22.302900314331055,
122
- "learning_rate": 0.0001961724025140185,
123
- "loss": 0.5432641983032227,
124
  "step": 150
125
  },
126
  {
127
  "epoch": 1.927710843373494,
128
- "grad_norm": 8.619937896728516,
129
- "learning_rate": 0.00019493528230580992,
130
- "loss": 0.5439841747283936,
131
  "step": 160
132
  },
133
  {
134
  "epoch": 2.0,
135
- "eval_accuracy": 0.8300884955752212,
136
- "eval_loss": 0.4195369780063629,
137
- "eval_runtime": 5.4707,
138
- "eval_samples_per_second": 103.278,
139
- "eval_steps_per_second": 1.645,
140
  "step": 166
141
  },
142
  {
143
  "epoch": 2.0481927710843375,
144
- "grad_norm": 24.363290786743164,
145
- "learning_rate": 0.00019353027312364116,
146
- "loss": 0.5910098552703857,
147
  "step": 170
148
  },
149
  {
150
  "epoch": 2.1686746987951806,
151
- "grad_norm": 6.910096168518066,
152
- "learning_rate": 0.00019195985966597494,
153
- "loss": 0.5120758533477783,
154
  "step": 180
155
  },
156
  {
157
  "epoch": 2.289156626506024,
158
- "grad_norm": 17.687562942504883,
159
- "learning_rate": 0.00019022681914163917,
160
- "loss": 0.405228328704834,
161
  "step": 190
162
  },
163
  {
164
  "epoch": 2.4096385542168672,
165
- "grad_norm": 12.02397632598877,
166
- "learning_rate": 0.0001883342163584523,
167
- "loss": 0.48291816711425783,
168
  "step": 200
169
  },
170
  {
171
  "epoch": 2.5301204819277108,
172
- "grad_norm": 18.50483512878418,
173
- "learning_rate": 0.00018628539830324229,
174
- "loss": 0.4583749294281006,
175
  "step": 210
176
  },
177
  {
178
  "epoch": 2.6506024096385543,
179
- "grad_norm": 6.045924186706543,
180
- "learning_rate": 0.00018408398822284392,
181
- "loss": 0.48716106414794924,
182
  "step": 220
183
  },
184
  {
185
  "epoch": 2.7710843373493974,
186
- "grad_norm": 11.271085739135742,
187
- "learning_rate": 0.0001817338792165421,
188
- "loss": 0.3358022212982178,
189
  "step": 230
190
  },
191
  {
192
  "epoch": 2.891566265060241,
193
- "grad_norm": 12.60858154296875,
194
- "learning_rate": 0.00017923922735129302,
195
- "loss": 0.546141242980957,
196
  "step": 240
197
  },
198
  {
199
  "epoch": 3.0,
200
- "eval_accuracy": 0.9061946902654867,
201
- "eval_loss": 0.23178476095199585,
202
- "eval_runtime": 5.4341,
203
- "eval_samples_per_second": 103.972,
204
- "eval_steps_per_second": 1.656,
205
  "step": 249
206
  },
207
  {
208
  "epoch": 3.0120481927710845,
209
- "grad_norm": 8.480234146118164,
210
- "learning_rate": 0.0001766044443118978,
211
- "loss": 0.34228219985961916,
212
  "step": 250
213
  },
214
  {
215
  "epoch": 3.1325301204819276,
216
- "grad_norm": 17.870315551757812,
217
- "learning_rate": 0.00017383418959912746,
218
- "loss": 0.3570461988449097,
219
  "step": 260
220
  },
221
  {
222
  "epoch": 3.253012048192771,
223
- "grad_norm": 3.9850714206695557,
224
- "learning_rate": 0.00017093336228959536,
225
- "loss": 0.30241072177886963,
226
  "step": 270
227
  },
228
  {
229
  "epoch": 3.3734939759036147,
230
- "grad_norm": 12.326411247253418,
231
- "learning_rate": 0.00016790709237195065,
232
- "loss": 0.4090369701385498,
233
  "step": 280
234
  },
235
  {
236
  "epoch": 3.4939759036144578,
237
- "grad_norm": 4.366477966308594,
238
- "learning_rate": 0.00016476073167471345,
239
- "loss": 0.3212424755096436,
240
  "step": 290
241
  },
242
  {
243
  "epoch": 3.6144578313253013,
244
- "grad_norm": 9.694393157958984,
245
- "learning_rate": 0.00016149984440179537,
246
- "loss": 0.2823866128921509,
247
  "step": 300
248
  },
249
  {
250
  "epoch": 3.734939759036145,
251
- "grad_norm": 8.204358100891113,
252
- "learning_rate": 0.00015813019729244405,
253
- "loss": 0.2677892208099365,
254
  "step": 310
255
  },
256
  {
257
  "epoch": 3.855421686746988,
258
- "grad_norm": 7.021804332733154,
259
- "learning_rate": 0.0001546577494230118,
260
- "loss": 0.24797892570495605,
261
  "step": 320
262
  },
263
  {
264
  "epoch": 3.9759036144578315,
265
- "grad_norm": 9.44687557220459,
266
- "learning_rate": 0.00015108864166858506,
267
- "loss": 0.3637809991836548,
268
  "step": 330
269
  },
270
  {
271
  "epoch": 4.0,
272
- "eval_accuracy": 0.9185840707964602,
273
- "eval_loss": 0.1876251995563507,
274
- "eval_runtime": 5.4214,
275
- "eval_samples_per_second": 104.217,
276
- "eval_steps_per_second": 1.66,
277
  "step": 332
278
  },
279
  {
280
  "epoch": 4.096385542168675,
281
- "grad_norm": 7.593841552734375,
282
- "learning_rate": 0.00014742918584311,
283
- "loss": 0.3275733470916748,
284
  "step": 340
285
  },
286
  {
287
  "epoch": 4.216867469879518,
288
- "grad_norm": 9.369234085083008,
289
- "learning_rate": 0.00014368585353722048,
290
- "loss": 0.26893665790557864,
291
  "step": 350
292
  },
293
  {
294
  "epoch": 4.337349397590361,
295
- "grad_norm": 19.293025970458984,
296
- "learning_rate": 0.0001398652646735076,
297
- "loss": 0.27639012336730956,
298
  "step": 360
299
  },
300
  {
301
  "epoch": 4.457831325301205,
302
- "grad_norm": 17.32465171813965,
303
- "learning_rate": 0.00013597417579947054,
304
- "loss": 0.2637490749359131,
305
  "step": 370
306
  },
307
  {
308
  "epoch": 4.578313253012048,
309
- "grad_norm": 6.271369934082031,
310
- "learning_rate": 0.00013201946813885232,
311
- "loss": 0.34631929397583006,
312
  "step": 380
313
  },
314
  {
315
  "epoch": 4.698795180722891,
316
- "grad_norm": 8.38680362701416,
317
- "learning_rate": 0.00012800813542249072,
318
- "loss": 0.22855329513549805,
319
  "step": 390
320
  },
321
  {
322
  "epoch": 4.8192771084337345,
323
- "grad_norm": 9.031497955322266,
324
- "learning_rate": 0.00012394727152020528,
325
- "loss": 0.3539623498916626,
326
  "step": 400
327
  },
328
  {
329
  "epoch": 4.9397590361445785,
330
- "grad_norm": 8.364486694335938,
331
- "learning_rate": 0.00011984405789559298,
332
- "loss": 0.2936580657958984,
333
  "step": 410
334
  },
335
  {
336
  "epoch": 5.0,
337
- "eval_accuracy": 0.9539823008849557,
338
- "eval_loss": 0.12390898168087006,
339
- "eval_runtime": 5.8382,
340
- "eval_samples_per_second": 96.776,
341
- "eval_steps_per_second": 1.542,
342
  "step": 415
343
  },
344
  {
345
  "epoch": 5.0602409638554215,
346
- "grad_norm": 5.641082763671875,
347
- "learning_rate": 0.00011570575090591791,
348
- "loss": 0.20228171348571777,
349
  "step": 420
350
  },
351
  {
352
  "epoch": 5.180722891566265,
353
- "grad_norm": 5.010425567626953,
354
- "learning_rate": 0.00011153966896955468,
355
- "loss": 0.2692150354385376,
356
  "step": 430
357
  },
358
  {
359
  "epoch": 5.301204819277109,
360
- "grad_norm": 5.971624374389648,
361
- "learning_rate": 0.00010735317962367959,
362
- "loss": 0.21376564502716064,
363
  "step": 440
364
  },
365
  {
366
  "epoch": 5.421686746987952,
367
- "grad_norm": 9.657953262329102,
368
- "learning_rate": 0.00010315368649509716,
369
- "loss": 0.15639951229095458,
370
  "step": 450
371
  },
372
  {
373
  "epoch": 5.542168674698795,
374
- "grad_norm": 6.185604095458984,
375
- "learning_rate": 9.894861620724375e-05,
376
- "loss": 0.23455722332000734,
377
  "step": 460
378
  },
379
  {
380
  "epoch": 5.662650602409639,
381
- "grad_norm": 4.242949962615967,
382
- "learning_rate": 9.474540524652267e-05,
383
- "loss": 0.1507526993751526,
384
  "step": 470
385
  },
386
  {
387
  "epoch": 5.783132530120482,
388
- "grad_norm": 9.122421264648438,
389
- "learning_rate": 9.055148681119688e-05,
390
- "loss": 0.21329116821289062,
391
  "step": 480
392
  },
393
  {
394
  "epoch": 5.903614457831325,
395
- "grad_norm": 2.256641149520874,
396
- "learning_rate": 8.637427766609691e-05,
397
- "loss": 0.19799797534942626,
398
  "step": 490
399
  },
400
  {
401
  "epoch": 6.0,
402
- "eval_accuracy": 0.9769911504424779,
403
- "eval_loss": 0.0752442330121994,
404
- "eval_runtime": 5.5001,
405
- "eval_samples_per_second": 102.725,
406
- "eval_steps_per_second": 1.636,
407
  "step": 498
408
  },
409
  {
410
  "epoch": 6.024096385542169,
411
- "grad_norm": 5.028798580169678,
412
- "learning_rate": 8.222116502639032e-05,
413
- "loss": 0.10370807647705078,
414
  "step": 500
415
  },
416
  {
417
  "epoch": 6.144578313253012,
418
- "grad_norm": 7.922329902648926,
419
- "learning_rate": 7.809949349360872e-05,
420
- "loss": 0.1501248598098755,
421
  "step": 510
422
  },
423
  {
424
  "epoch": 6.265060240963855,
425
- "grad_norm": 3.3606350421905518,
426
- "learning_rate": 7.401655206703479e-05,
427
- "loss": 0.1932325005531311,
428
  "step": 520
429
  },
430
  {
431
  "epoch": 6.385542168674699,
432
- "grad_norm": 4.238986492156982,
433
- "learning_rate": 6.99795612534202e-05,
434
- "loss": 0.14431376457214357,
435
  "step": 530
436
  },
437
  {
438
  "epoch": 6.506024096385542,
439
- "grad_norm": 5.669098854064941,
440
- "learning_rate": 6.599566029782863e-05,
441
- "loss": 0.19270881414413452,
442
  "step": 540
443
  },
444
  {
445
  "epoch": 6.626506024096385,
446
- "grad_norm": 1.8570481538772583,
447
- "learning_rate": 6.20718945581877e-05,
448
- "loss": 0.12588899135589598,
449
  "step": 550
450
  },
451
  {
452
  "epoch": 6.746987951807229,
453
- "grad_norm": 8.998724937438965,
454
- "learning_rate": 5.821520304587528e-05,
455
- "loss": 0.1433807611465454,
456
  "step": 560
457
  },
458
  {
459
  "epoch": 6.867469879518072,
460
- "grad_norm": 5.294978141784668,
461
- "learning_rate": 5.443240615437586e-05,
462
- "loss": 0.13480768203735352,
463
  "step": 570
464
  },
465
  {
466
  "epoch": 6.9879518072289155,
467
- "grad_norm": 6.6459832191467285,
468
- "learning_rate": 5.07301935977071e-05,
469
- "loss": 0.17268821001052856,
470
  "step": 580
471
  },
472
  {
473
  "epoch": 7.0,
474
- "eval_accuracy": 0.9823008849557522,
475
- "eval_loss": 0.05996280908584595,
476
- "eval_runtime": 5.4871,
477
- "eval_samples_per_second": 102.969,
478
- "eval_steps_per_second": 1.64,
479
  "step": 581
480
  },
481
  {
482
  "epoch": 7.108433734939759,
483
- "grad_norm": 6.390337944030762,
484
- "learning_rate": 4.7115112579947675e-05,
485
- "loss": 0.11058632135391236,
486
  "step": 590
487
  },
488
  {
489
  "epoch": 7.228915662650603,
490
- "grad_norm": 6.371334552764893,
491
- "learning_rate": 4.359355621678764e-05,
492
- "loss": 0.13580918312072754,
493
  "step": 600
494
  },
495
  {
496
  "epoch": 7.349397590361446,
497
- "grad_norm": 5.915042400360107,
498
- "learning_rate": 4.0171752229577875e-05,
499
- "loss": 0.10019686222076415,
500
  "step": 610
501
  },
502
  {
503
  "epoch": 7.469879518072289,
504
- "grad_norm": 10.278961181640625,
505
- "learning_rate": 3.6855751931871516e-05,
506
- "loss": 0.10151375532150268,
507
  "step": 620
508
  },
509
  {
510
  "epoch": 7.590361445783133,
511
- "grad_norm": 4.616462230682373,
512
- "learning_rate": 3.365141952793622e-05,
513
- "loss": 0.11933131217956543,
514
  "step": 630
515
  },
516
  {
517
  "epoch": 7.710843373493976,
518
- "grad_norm": 14.82908821105957,
519
- "learning_rate": 3.056442174215985e-05,
520
- "loss": 0.11742031574249268,
521
  "step": 640
522
  },
523
  {
524
  "epoch": 7.831325301204819,
525
- "grad_norm": 7.869307518005371,
526
- "learning_rate": 2.7600217797692042e-05,
527
- "loss": 0.13143677711486818,
528
  "step": 650
529
  },
530
  {
531
  "epoch": 7.951807228915663,
532
- "grad_norm": 8.95853328704834,
533
- "learning_rate": 2.4764049762041874e-05,
534
- "loss": 0.09001794457435608,
535
  "step": 660
536
  },
537
  {
538
  "epoch": 8.0,
539
- "eval_accuracy": 0.9858407079646018,
540
- "eval_loss": 0.07218904048204422,
541
- "eval_runtime": 5.8913,
542
- "eval_samples_per_second": 95.904,
543
- "eval_steps_per_second": 1.528,
544
  "step": 664
545
  },
546
  {
547
  "epoch": 8.072289156626505,
548
- "grad_norm": 6.763011455535889,
549
- "learning_rate": 2.2060933276706586e-05,
550
- "loss": 0.1112417459487915,
551
  "step": 670
552
  },
553
  {
554
  "epoch": 8.19277108433735,
555
- "grad_norm": 3.7270097732543945,
556
- "learning_rate": 1.9495648687224676e-05,
557
- "loss": 0.05919206738471985,
558
  "step": 680
559
  },
560
  {
561
  "epoch": 8.313253012048193,
562
- "grad_norm": 3.07499361038208,
563
- "learning_rate": 1.7072732589339955e-05,
564
- "loss": 0.12545101642608641,
565
  "step": 690
566
  },
567
  {
568
  "epoch": 8.433734939759036,
569
- "grad_norm": 5.470985412597656,
570
- "learning_rate": 1.4796469806226532e-05,
571
- "loss": 0.0804430365562439,
572
  "step": 700
573
  },
574
  {
575
  "epoch": 8.55421686746988,
576
- "grad_norm": 6.063421726226807,
577
- "learning_rate": 1.2670885810962884e-05,
578
- "loss": 0.06899781823158264,
579
  "step": 710
580
  },
581
  {
582
  "epoch": 8.674698795180722,
583
- "grad_norm": 1.9861423969268799,
584
- "learning_rate": 1.0699739607655435e-05,
585
- "loss": 0.06391697525978088,
586
  "step": 720
587
  },
588
  {
589
  "epoch": 8.795180722891565,
590
- "grad_norm": 7.721556186676025,
591
- "learning_rate": 8.886517083801015e-06,
592
- "loss": 0.1207929015159607,
593
  "step": 730
594
  },
595
  {
596
  "epoch": 8.91566265060241,
597
- "grad_norm": 0.324483186006546,
598
- "learning_rate": 7.234424845644383e-06,
599
- "loss": 0.04410604834556579,
600
  "step": 740
601
  },
602
  {
603
  "epoch": 9.0,
604
- "eval_accuracy": 0.9876106194690265,
605
- "eval_loss": 0.06401132047176361,
606
- "eval_runtime": 5.8918,
607
- "eval_samples_per_second": 95.896,
608
- "eval_steps_per_second": 1.528,
609
  "step": 747
610
  },
611
  {
612
  "epoch": 9.036144578313253,
613
- "grad_norm": 0.9848870038986206,
614
- "learning_rate": 5.746384547432737e-06,
615
- "loss": 0.028363901376724242,
616
  "step": 750
617
  },
618
  {
619
  "epoch": 9.156626506024097,
620
- "grad_norm": 3.6140482425689697,
621
- "learning_rate": 4.425027724595298e-06,
622
- "loss": 0.10400755405426025,
623
  "step": 760
624
  },
625
  {
626
  "epoch": 9.27710843373494,
627
- "grad_norm": 2.882124900817871,
628
- "learning_rate": 3.2726911399860837e-06,
629
- "loss": 0.11473925113677978,
630
  "step": 770
631
  },
632
  {
633
  "epoch": 9.397590361445783,
634
- "grad_norm": 3.0985116958618164,
635
- "learning_rate": 2.291412651418778e-06,
636
- "loss": 0.11879135370254516,
637
  "step": 780
638
  },
639
  {
640
  "epoch": 9.518072289156626,
641
- "grad_norm": 6.65316104888916,
642
- "learning_rate": 1.482927607802853e-06,
643
- "loss": 0.08090834617614746,
644
  "step": 790
645
  },
646
  {
647
  "epoch": 9.638554216867469,
648
- "grad_norm": 0.7483940124511719,
649
- "learning_rate": 8.486657802532439e-07,
650
- "loss": 0.052888357639312746,
651
  "step": 800
652
  },
653
  {
654
  "epoch": 9.759036144578314,
655
- "grad_norm": 3.122527599334717,
656
- "learning_rate": 3.8974883360169966e-07,
657
- "loss": 0.07597114443778992,
658
  "step": 810
659
  },
660
  {
661
  "epoch": 9.879518072289157,
662
- "grad_norm": 3.4678826332092285,
663
- "learning_rate": 1.0698834278045633e-07,
664
- "loss": 0.0631083905696869,
665
  "step": 820
666
  },
667
  {
668
  "epoch": 10.0,
669
- "grad_norm": 0.2041388303041458,
670
- "learning_rate": 8.843575868833221e-10,
671
- "loss": 0.11945844888687134,
672
  "step": 830
673
  },
674
  {
675
  "epoch": 10.0,
676
- "eval_accuracy": 0.984070796460177,
677
- "eval_loss": 0.06931564211845398,
678
- "eval_runtime": 5.5703,
679
- "eval_samples_per_second": 101.431,
680
- "eval_steps_per_second": 1.616,
681
  "step": 830
682
  },
683
  {
684
  "epoch": 10.0,
685
  "step": 830,
686
  "total_flos": 8.593274471605862e+17,
687
- "train_loss": 0.33146001000002207,
688
- "train_runtime": 922.6735,
689
- "train_samples_per_second": 28.623,
690
- "train_steps_per_second": 0.9
691
  }
692
  ],
693
  "logging_steps": 10,
 
1
  {
2
  "best_global_step": 747,
3
+ "best_metric": 0.9911504424778761,
4
  "best_model_checkpoint": "./logs/checkpoint-747",
5
  "epoch": 10.0,
6
  "eval_steps": 500,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.12048192771084337,
14
+ "grad_norm": 6.236629486083984,
15
+ "learning_rate": 1.0843373493975904e-05,
16
+ "loss": 1.7779657363891601,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.24096385542168675,
21
+ "grad_norm": 6.614838123321533,
22
+ "learning_rate": 2.289156626506024e-05,
23
+ "loss": 1.6157239913940429,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.3614457831325301,
28
+ "grad_norm": 8.87565803527832,
29
+ "learning_rate": 3.4939759036144585e-05,
30
+ "loss": 1.3881938934326172,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.4819277108433735,
35
+ "grad_norm": 8.774048805236816,
36
+ "learning_rate": 4.698795180722892e-05,
37
+ "loss": 1.0548779487609863,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.6024096385542169,
42
+ "grad_norm": 10.919977188110352,
43
+ "learning_rate": 5.903614457831326e-05,
44
+ "loss": 0.860891056060791,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.7228915662650602,
49
+ "grad_norm": 17.067983627319336,
50
+ "learning_rate": 7.108433734939759e-05,
51
+ "loss": 0.7981919765472412,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.8433734939759037,
56
+ "grad_norm": 9.766481399536133,
57
+ "learning_rate": 8.313253012048194e-05,
58
+ "loss": 0.7360480785369873,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.963855421686747,
63
+ "grad_norm": 11.55764102935791,
64
+ "learning_rate": 9.518072289156626e-05,
65
+ "loss": 0.636728572845459,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 1.0,
70
+ "eval_accuracy": 0.8247787610619469,
71
+ "eval_loss": 0.46115025877952576,
72
+ "eval_runtime": 5.444,
73
+ "eval_samples_per_second": 103.785,
74
+ "eval_steps_per_second": 1.653,
75
  "step": 83
76
  },
77
  {
78
  "epoch": 1.0843373493975903,
79
+ "grad_norm": 13.059717178344727,
80
+ "learning_rate": 9.998408238461338e-05,
81
+ "loss": 0.5606242656707764,
82
  "step": 90
83
  },
84
  {
85
  "epoch": 1.2048192771084336,
86
+ "grad_norm": 7.8438615798950195,
87
+ "learning_rate": 9.988684476816419e-05,
88
+ "loss": 0.4747779369354248,
89
  "step": 100
90
  },
91
  {
92
  "epoch": 1.3253012048192772,
93
+ "grad_norm": 17.873554229736328,
94
+ "learning_rate": 9.970138440692705e-05,
95
+ "loss": 0.513523006439209,
96
  "step": 110
97
  },
98
  {
99
  "epoch": 1.4457831325301205,
100
+ "grad_norm": 11.972646713256836,
101
+ "learning_rate": 9.942802927959443e-05,
102
+ "loss": 0.5974394321441651,
103
  "step": 120
104
  },
105
  {
106
  "epoch": 1.5662650602409638,
107
+ "grad_norm": 16.943544387817383,
108
+ "learning_rate": 9.906726280298186e-05,
109
+ "loss": 0.452280855178833,
110
  "step": 130
111
  },
112
  {
113
  "epoch": 1.6867469879518073,
114
+ "grad_norm": 9.1749906539917,
115
+ "learning_rate": 9.861972297712605e-05,
116
+ "loss": 0.4720293045043945,
117
  "step": 140
118
  },
119
  {
120
  "epoch": 1.8072289156626506,
121
+ "grad_norm": 11.010215759277344,
122
+ "learning_rate": 9.808620125700925e-05,
123
+ "loss": 0.4886914253234863,
124
  "step": 150
125
  },
126
  {
127
  "epoch": 1.927710843373494,
128
+ "grad_norm": 8.292040824890137,
129
+ "learning_rate": 9.746764115290496e-05,
130
+ "loss": 0.46558895111083987,
131
  "step": 160
132
  },
133
  {
134
  "epoch": 2.0,
135
+ "eval_accuracy": 0.8495575221238938,
136
+ "eval_loss": 0.3608015775680542,
137
+ "eval_runtime": 5.678,
138
+ "eval_samples_per_second": 99.507,
139
+ "eval_steps_per_second": 1.585,
140
  "step": 166
141
  },
142
  {
143
  "epoch": 2.0481927710843375,
144
+ "grad_norm": 17.205005645751953,
145
+ "learning_rate": 9.676513656182058e-05,
146
+ "loss": 0.36324758529663087,
147
  "step": 170
148
  },
149
  {
150
  "epoch": 2.1686746987951806,
151
+ "grad_norm": 11.862001419067383,
152
+ "learning_rate": 9.597992983298747e-05,
153
+ "loss": 0.4520224094390869,
154
  "step": 180
155
  },
156
  {
157
  "epoch": 2.289156626506024,
158
+ "grad_norm": 15.979503631591797,
159
+ "learning_rate": 9.511340957081958e-05,
160
+ "loss": 0.40279397964477537,
161
  "step": 190
162
  },
163
  {
164
  "epoch": 2.4096385542168672,
165
+ "grad_norm": 14.119962692260742,
166
+ "learning_rate": 9.416710817922615e-05,
167
+ "loss": 0.41336545944213865,
168
  "step": 200
169
  },
170
  {
171
  "epoch": 2.5301204819277108,
172
+ "grad_norm": 17.406816482543945,
173
+ "learning_rate": 9.314269915162114e-05,
174
+ "loss": 0.3019423961639404,
175
  "step": 210
176
  },
177
  {
178
  "epoch": 2.6506024096385543,
179
+ "grad_norm": 14.716191291809082,
180
+ "learning_rate": 9.204199411142196e-05,
181
+ "loss": 0.3748778820037842,
182
  "step": 220
183
  },
184
  {
185
  "epoch": 2.7710843373493974,
186
+ "grad_norm": 21.437185287475586,
187
+ "learning_rate": 9.086693960827105e-05,
188
+ "loss": 0.34201803207397463,
189
  "step": 230
190
  },
191
  {
192
  "epoch": 2.891566265060241,
193
+ "grad_norm": 10.061134338378906,
194
+ "learning_rate": 8.961961367564651e-05,
195
+ "loss": 0.49113874435424804,
196
  "step": 240
197
  },
198
  {
199
  "epoch": 3.0,
200
+ "eval_accuracy": 0.9646017699115044,
201
+ "eval_loss": 0.1344006061553955,
202
+ "eval_runtime": 6.1864,
203
+ "eval_samples_per_second": 91.329,
204
+ "eval_steps_per_second": 1.455,
205
  "step": 249
206
  },
207
  {
208
  "epoch": 3.0120481927710845,
209
+ "grad_norm": 12.825634956359863,
210
+ "learning_rate": 8.83022221559489e-05,
211
+ "loss": 0.27299160957336427,
212
  "step": 250
213
  },
214
  {
215
  "epoch": 3.1325301204819276,
216
+ "grad_norm": 10.527647972106934,
217
+ "learning_rate": 8.691709479956373e-05,
218
+ "loss": 0.24414093494415284,
219
  "step": 260
220
  },
221
  {
222
  "epoch": 3.253012048192771,
223
+ "grad_norm": 4.087771892547607,
224
+ "learning_rate": 8.546668114479768e-05,
225
+ "loss": 0.18104053735733033,
226
  "step": 270
227
  },
228
  {
229
  "epoch": 3.3734939759036147,
230
+ "grad_norm": 9.269667625427246,
231
+ "learning_rate": 8.395354618597533e-05,
232
+ "loss": 0.3339837551116943,
233
  "step": 280
234
  },
235
  {
236
  "epoch": 3.4939759036144578,
237
+ "grad_norm": 5.284663200378418,
238
+ "learning_rate": 8.238036583735673e-05,
239
+ "loss": 0.2400984764099121,
240
  "step": 290
241
  },
242
  {
243
  "epoch": 3.6144578313253013,
244
+ "grad_norm": 8.479917526245117,
245
+ "learning_rate": 8.074992220089769e-05,
246
+ "loss": 0.22019331455230712,
247
  "step": 300
248
  },
249
  {
250
  "epoch": 3.734939759036145,
251
+ "grad_norm": 9.128419876098633,
252
+ "learning_rate": 7.906509864622203e-05,
253
+ "loss": 0.25870823860168457,
254
  "step": 310
255
  },
256
  {
257
  "epoch": 3.855421686746988,
258
+ "grad_norm": 12.799089431762695,
259
+ "learning_rate": 7.73288747115059e-05,
260
+ "loss": 0.20678648948669434,
261
  "step": 320
262
  },
263
  {
264
  "epoch": 3.9759036144578315,
265
+ "grad_norm": 11.251450538635254,
266
+ "learning_rate": 7.554432083429253e-05,
267
+ "loss": 0.1629856824874878,
268
  "step": 330
269
  },
270
  {
271
  "epoch": 4.0,
272
+ "eval_accuracy": 0.9575221238938053,
273
+ "eval_loss": 0.13474561274051666,
274
+ "eval_runtime": 5.6362,
275
+ "eval_samples_per_second": 100.245,
276
+ "eval_steps_per_second": 1.597,
277
  "step": 332
278
  },
279
  {
280
  "epoch": 4.096385542168675,
281
+ "grad_norm": 19.876901626586914,
282
+ "learning_rate": 7.3714592921555e-05,
283
+ "loss": 0.24734578132629395,
284
  "step": 340
285
  },
286
  {
287
  "epoch": 4.216867469879518,
288
+ "grad_norm": 13.15965461730957,
289
+ "learning_rate": 7.184292676861024e-05,
290
+ "loss": 0.22765071392059327,
291
  "step": 350
292
  },
293
  {
294
  "epoch": 4.337349397590361,
295
+ "grad_norm": 12.12977123260498,
296
+ "learning_rate": 6.99326323367538e-05,
297
+ "loss": 0.17916421890258788,
298
  "step": 360
299
  },
300
  {
301
  "epoch": 4.457831325301205,
302
+ "grad_norm": 8.979646682739258,
303
+ "learning_rate": 6.798708789973527e-05,
304
+ "loss": 0.1901506304740906,
305
  "step": 370
306
  },
307
  {
308
  "epoch": 4.578313253012048,
309
+ "grad_norm": 5.592668056488037,
310
+ "learning_rate": 6.600973406942616e-05,
311
+ "loss": 0.22261853218078614,
312
  "step": 380
313
  },
314
  {
315
  "epoch": 4.698795180722891,
316
+ "grad_norm": 12.222548484802246,
317
+ "learning_rate": 6.400406771124536e-05,
318
+ "loss": 0.16046804189682007,
319
  "step": 390
320
  },
321
  {
322
  "epoch": 4.8192771084337345,
323
+ "grad_norm": 9.516422271728516,
324
+ "learning_rate": 6.197363576010264e-05,
325
+ "loss": 0.3090466022491455,
326
  "step": 400
327
  },
328
  {
329
  "epoch": 4.9397590361445785,
330
+ "grad_norm": 8.311286926269531,
331
+ "learning_rate": 5.992202894779649e-05,
332
+ "loss": 0.18722275495529175,
333
  "step": 410
334
  },
335
  {
336
  "epoch": 5.0,
337
+ "eval_accuracy": 0.9628318584070796,
338
+ "eval_loss": 0.11059214919805527,
339
+ "eval_runtime": 5.668,
340
+ "eval_samples_per_second": 99.682,
341
+ "eval_steps_per_second": 1.588,
342
  "step": 415
343
  },
344
  {
345
  "epoch": 5.0602409638554215,
346
+ "grad_norm": 6.544140338897705,
347
+ "learning_rate": 5.7852875452958954e-05,
348
+ "loss": 0.1725080966949463,
349
  "step": 420
350
  },
351
  {
352
  "epoch": 5.180722891566265,
353
+ "grad_norm": 7.241940975189209,
354
+ "learning_rate": 5.576983448477734e-05,
355
+ "loss": 0.2657145023345947,
356
  "step": 430
357
  },
358
  {
359
  "epoch": 5.301204819277109,
360
+ "grad_norm": 2.805722713470459,
361
+ "learning_rate": 5.3676589811839796e-05,
362
+ "loss": 0.16265145540237427,
363
  "step": 440
364
  },
365
  {
366
  "epoch": 5.421686746987952,
367
+ "grad_norm": 7.153483867645264,
368
+ "learning_rate": 5.157684324754858e-05,
369
+ "loss": 0.1511433720588684,
370
  "step": 450
371
  },
372
  {
373
  "epoch": 5.542168674698795,
374
+ "grad_norm": 3.1414175033569336,
375
+ "learning_rate": 4.9474308103621874e-05,
376
+ "loss": 0.15450478792190553,
377
  "step": 460
378
  },
379
  {
380
  "epoch": 5.662650602409639,
381
+ "grad_norm": 3.8960776329040527,
382
+ "learning_rate": 4.737270262326134e-05,
383
+ "loss": 0.13111191987991333,
384
  "step": 470
385
  },
386
  {
387
  "epoch": 5.783132530120482,
388
+ "grad_norm": 7.418442726135254,
389
+ "learning_rate": 4.527574340559844e-05,
390
+ "loss": 0.1539200186729431,
391
  "step": 480
392
  },
393
  {
394
  "epoch": 5.903614457831325,
395
+ "grad_norm": 8.860248565673828,
396
+ "learning_rate": 4.3187138833048456e-05,
397
+ "loss": 0.1801429271697998,
398
  "step": 490
399
  },
400
  {
401
  "epoch": 6.0,
402
+ "eval_accuracy": 0.9823008849557522,
403
+ "eval_loss": 0.09679495543241501,
404
+ "eval_runtime": 6.1076,
405
+ "eval_samples_per_second": 92.508,
406
+ "eval_steps_per_second": 1.474,
407
  "step": 498
408
  },
409
  {
410
  "epoch": 6.024096385542169,
411
+ "grad_norm": 11.467449188232422,
412
+ "learning_rate": 4.111058251319516e-05,
413
+ "loss": 0.11156998872756958,
414
  "step": 500
415
  },
416
  {
417
  "epoch": 6.144578313253012,
418
+ "grad_norm": 13.2984037399292,
419
+ "learning_rate": 3.904974674680436e-05,
420
+ "loss": 0.11730811595916749,
421
  "step": 510
422
  },
423
  {
424
  "epoch": 6.265060240963855,
425
+ "grad_norm": 3.543509006500244,
426
+ "learning_rate": 3.7008276033517396e-05,
427
+ "loss": 0.19998840093612671,
428
  "step": 520
429
  },
430
  {
431
  "epoch": 6.385542168674699,
432
+ "grad_norm": 4.334460735321045,
433
+ "learning_rate": 3.49897806267101e-05,
434
+ "loss": 0.09577634930610657,
435
  "step": 530
436
  },
437
  {
438
  "epoch": 6.506024096385542,
439
+ "grad_norm": 1.5698552131652832,
440
+ "learning_rate": 3.2997830148914314e-05,
441
+ "loss": 0.11064940690994263,
442
  "step": 540
443
  },
444
  {
445
  "epoch": 6.626506024096385,
446
+ "grad_norm": 1.4867029190063477,
447
+ "learning_rate": 3.103594727909385e-05,
448
+ "loss": 0.0978583574295044,
449
  "step": 550
450
  },
451
  {
452
  "epoch": 6.746987951807229,
453
+ "grad_norm": 8.762438774108887,
454
+ "learning_rate": 2.910760152293764e-05,
455
+ "loss": 0.08853105902671814,
456
  "step": 560
457
  },
458
  {
459
  "epoch": 6.867469879518072,
460
+ "grad_norm": 5.0525360107421875,
461
+ "learning_rate": 2.721620307718793e-05,
462
+ "loss": 0.13467444181442262,
463
  "step": 570
464
  },
465
  {
466
  "epoch": 6.9879518072289155,
467
+ "grad_norm": 13.927603721618652,
468
+ "learning_rate": 2.536509679885355e-05,
469
+ "loss": 0.14531443119049073,
470
  "step": 580
471
  },
472
  {
473
  "epoch": 7.0,
474
+ "eval_accuracy": 0.9716814159292035,
475
+ "eval_loss": 0.1196078509092331,
476
+ "eval_runtime": 5.7351,
477
+ "eval_samples_per_second": 98.517,
478
+ "eval_steps_per_second": 1.569,
479
  "step": 581
480
  },
481
  {
482
  "epoch": 7.108433734939759,
483
+ "grad_norm": 1.3813672065734863,
484
+ "learning_rate": 2.3557556289973838e-05,
485
+ "loss": 0.07141577005386353,
486
  "step": 590
487
  },
488
  {
489
  "epoch": 7.228915662650603,
490
+ "grad_norm": 11.30803108215332,
491
+ "learning_rate": 2.179677810839382e-05,
492
+ "loss": 0.08913902044296265,
493
  "step": 600
494
  },
495
  {
496
  "epoch": 7.349397590361446,
497
+ "grad_norm": 15.15943717956543,
498
+ "learning_rate": 2.0085876114788937e-05,
499
+ "loss": 0.1786208987236023,
500
  "step": 610
501
  },
502
  {
503
  "epoch": 7.469879518072289,
504
+ "grad_norm": 6.659374237060547,
505
+ "learning_rate": 1.8427875965935758e-05,
506
+ "loss": 0.05375434160232544,
507
  "step": 620
508
  },
509
  {
510
  "epoch": 7.590361445783133,
511
+ "grad_norm": 3.859622001647949,
512
+ "learning_rate": 1.682570976396811e-05,
513
+ "loss": 0.13732693195343018,
514
  "step": 630
515
  },
516
  {
517
  "epoch": 7.710843373493976,
518
+ "grad_norm": 4.593474388122559,
519
+ "learning_rate": 1.5282210871079926e-05,
520
+ "loss": 0.09100980162620545,
521
  "step": 640
522
  },
523
  {
524
  "epoch": 7.831325301204819,
525
+ "grad_norm": 13.193694114685059,
526
+ "learning_rate": 1.3800108898846021e-05,
527
+ "loss": 0.09656141400337219,
528
  "step": 650
529
  },
530
  {
531
  "epoch": 7.951807228915663,
532
+ "grad_norm": 13.730281829833984,
533
+ "learning_rate": 1.2382024881020937e-05,
534
+ "loss": 0.0786526083946228,
535
  "step": 660
536
  },
537
  {
538
  "epoch": 8.0,
539
+ "eval_accuracy": 0.9893805309734514,
540
+ "eval_loss": 0.08379530161619186,
541
+ "eval_runtime": 5.6656,
542
+ "eval_samples_per_second": 99.724,
543
+ "eval_steps_per_second": 1.589,
544
  "step": 664
545
  },
546
  {
547
  "epoch": 8.072289156626505,
548
+ "grad_norm": 1.4331773519515991,
549
+ "learning_rate": 1.1030466638353293e-05,
550
+ "loss": 0.0922305703163147,
551
  "step": 670
552
  },
553
  {
554
  "epoch": 8.19277108433735,
555
+ "grad_norm": 11.172012329101562,
556
+ "learning_rate": 9.747824343612338e-06,
557
+ "loss": 0.051563167572021486,
558
  "step": 680
559
  },
560
  {
561
  "epoch": 8.313253012048193,
562
+ "grad_norm": 9.389365196228027,
563
+ "learning_rate": 8.536366294669978e-06,
564
+ "loss": 0.0976746916770935,
565
  "step": 690
566
  },
567
  {
568
  "epoch": 8.433734939759036,
569
+ "grad_norm": 0.6641272902488708,
570
+ "learning_rate": 7.398234903113266e-06,
571
+ "loss": 0.07286246418952942,
572
  "step": 700
573
  },
574
  {
575
  "epoch": 8.55421686746988,
576
+ "grad_norm": 10.819772720336914,
577
+ "learning_rate": 6.335442905481442e-06,
578
+ "loss": 0.07259726524353027,
579
  "step": 710
580
  },
581
  {
582
  "epoch": 8.674698795180722,
583
+ "grad_norm": 0.8964389562606812,
584
+ "learning_rate": 5.349869803827717e-06,
585
+ "loss": 0.043448707461357115,
586
  "step": 720
587
  },
588
  {
589
  "epoch": 8.795180722891565,
590
+ "grad_norm": 7.601110935211182,
591
+ "learning_rate": 4.4432585419005076e-06,
592
+ "loss": 0.10902594327926636,
593
  "step": 730
594
  },
595
  {
596
  "epoch": 8.91566265060241,
597
+ "grad_norm": 0.9344149827957153,
598
+ "learning_rate": 3.6172124228221914e-06,
599
+ "loss": 0.03534201383590698,
600
  "step": 740
601
  },
602
  {
603
  "epoch": 9.0,
604
+ "eval_accuracy": 0.9911504424778761,
605
+ "eval_loss": 0.0801326259970665,
606
+ "eval_runtime": 6.1806,
607
+ "eval_samples_per_second": 91.415,
608
+ "eval_steps_per_second": 1.456,
609
  "step": 747
610
  },
611
  {
612
  "epoch": 9.036144578313253,
613
+ "grad_norm": 2.648090124130249,
614
+ "learning_rate": 2.8731922737163685e-06,
615
+ "loss": 0.024153660237789153,
616
  "step": 750
617
  },
618
  {
619
  "epoch": 9.156626506024097,
620
+ "grad_norm": 7.166019439697266,
621
+ "learning_rate": 2.212513862297649e-06,
622
+ "loss": 0.12096415758132935,
623
  "step": 760
624
  },
625
  {
626
  "epoch": 9.27710843373494,
627
+ "grad_norm": 9.207470893859863,
628
+ "learning_rate": 1.6363455699930419e-06,
629
+ "loss": 0.10932642221450806,
630
  "step": 770
631
  },
632
  {
633
  "epoch": 9.397590361445783,
634
+ "grad_norm": 2.5481743812561035,
635
+ "learning_rate": 1.145706325709389e-06,
636
+ "loss": 0.06989773511886596,
637
  "step": 780
638
  },
639
  {
640
  "epoch": 9.518072289156626,
641
+ "grad_norm": 12.317899703979492,
642
+ "learning_rate": 7.414638039014265e-07,
643
+ "loss": 0.08738085627555847,
644
  "step": 790
645
  },
646
  {
647
  "epoch": 9.638554216867469,
648
+ "grad_norm": 2.812633752822876,
649
+ "learning_rate": 4.2433289012662194e-07,
650
+ "loss": 0.04424922168254852,
651
  "step": 800
652
  },
653
  {
654
  "epoch": 9.759036144578314,
655
+ "grad_norm": 1.378448486328125,
656
+ "learning_rate": 1.9487441680084983e-07,
657
+ "loss": 0.07828723788261413,
658
  "step": 810
659
  },
660
  {
661
  "epoch": 9.879518072289157,
662
+ "grad_norm": 9.539847373962402,
663
+ "learning_rate": 5.3494171390228166e-08,
664
+ "loss": 0.06453937888145447,
665
  "step": 820
666
  },
667
  {
668
  "epoch": 10.0,
669
+ "grad_norm": 0.7179245948791504,
670
+ "learning_rate": 4.4217879344166103e-10,
671
+ "loss": 0.08780375719070435,
672
  "step": 830
673
  },
674
  {
675
  "epoch": 10.0,
676
+ "eval_accuracy": 0.9911504424778761,
677
+ "eval_loss": 0.08178059756755829,
678
+ "eval_runtime": 5.6512,
679
+ "eval_samples_per_second": 99.979,
680
+ "eval_steps_per_second": 1.593,
681
  "step": 830
682
  },
683
  {
684
  "epoch": 10.0,
685
  "step": 830,
686
  "total_flos": 8.593274471605862e+17,
687
+ "train_loss": 0.29410572172288435,
688
+ "train_runtime": 896.4382,
689
+ "train_samples_per_second": 29.461,
690
+ "train_steps_per_second": 0.926
691
  }
692
  ],
693
  "logging_steps": 10,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72ac52cd7661209822ca173a4c3a27f6b18940c628449005a218e0107865dfb3
3
  size 5201
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a59329c3833ceef25e02a3057cd6a7cff3d8905c772dabca522835eecbd1b79b
3
  size 5201