jerryteps commited on
Commit
807dfe5
·
1 Parent(s): 835de62

Training in progress, epoch 0

Browse files
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 2.99,
3
  "eval_accuracy": 0.30788520479242126,
4
  "eval_loss": 1.6947710514068604,
5
  "eval_runtime": 17.149,
6
  "eval_samples_per_second": 209.283,
7
  "eval_steps_per_second": 6.589,
8
- "total_flos": 2.054868144086016e+18,
9
- "train_loss": 1.782613229499292,
10
- "train_runtime": 1261.819,
11
- "train_samples_per_second": 76.789,
12
- "train_steps_per_second": 0.599
13
  }
 
1
  {
2
+ "epoch": 4.99,
3
  "eval_accuracy": 0.30788520479242126,
4
  "eval_loss": 1.6947710514068604,
5
  "eval_runtime": 17.149,
6
  "eval_samples_per_second": 209.283,
7
  "eval_steps_per_second": 6.589,
8
+ "total_flos": 3.424468651849728e+18,
9
+ "train_loss": 1.653014714377267,
10
+ "train_runtime": 2002.0643,
11
+ "train_samples_per_second": 80.662,
12
+ "train_steps_per_second": 0.629
13
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b399ad33f7ea46c338c5b5843f2fb6bd9142bdea7ca8f8a97bcae2fa2a57b141
3
  size 94416394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eeca89952dba0bd69eb866dcb80f92e7db5657894e4b35a6b2d1f931a10da0b0
3
  size 94416394
runs/Nov06_05-04-24_76172e0bfbf9/events.out.tfevents.1699247079.76172e0bfbf9.578.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7ddefc09b51c51f0767e8dd4acc0907204119e7574dde24dfd9a4059141b684
3
+ size 8542
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.99,
3
- "total_flos": 2.054868144086016e+18,
4
- "train_loss": 1.782613229499292,
5
- "train_runtime": 1261.819,
6
- "train_samples_per_second": 76.789,
7
- "train_steps_per_second": 0.599
8
  }
 
1
  {
2
+ "epoch": 4.99,
3
+ "total_flos": 3.424468651849728e+18,
4
+ "train_loss": 1.653014714377267,
5
+ "train_runtime": 2002.0643,
6
+ "train_samples_per_second": 80.662,
7
+ "train_steps_per_second": 0.629
8
  }
trainer_state.json CHANGED
@@ -1,502 +1,826 @@
1
  {
2
- "best_metric": 0.30788520479242126,
3
- "best_model_checkpoint": "microsoft/resnet-50/checkpoint-756",
4
- "epoch": 2.994059405940594,
5
- "global_step": 756,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.04,
12
- "learning_rate": 6.578947368421053e-06,
13
- "loss": 1.9535,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 0.08,
18
- "learning_rate": 1.3157894736842106e-05,
19
- "loss": 1.9539,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.12,
24
- "learning_rate": 1.9736842105263158e-05,
25
- "loss": 1.9476,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 0.16,
30
- "learning_rate": 2.6315789473684212e-05,
31
- "loss": 1.9401,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 0.2,
36
- "learning_rate": 3.289473684210527e-05,
37
- "loss": 1.9308,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 0.24,
42
- "learning_rate": 3.9473684210526316e-05,
43
- "loss": 1.9207,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 0.28,
48
- "learning_rate": 4.605263157894737e-05,
49
- "loss": 1.9125,
50
  "step": 70
51
  },
52
  {
53
  "epoch": 0.32,
54
- "learning_rate": 4.970588235294118e-05,
55
- "loss": 1.8956,
56
  "step": 80
57
  },
58
  {
59
  "epoch": 0.36,
60
- "learning_rate": 4.897058823529412e-05,
61
- "loss": 1.8897,
62
  "step": 90
63
  },
64
  {
65
  "epoch": 0.4,
66
- "learning_rate": 4.823529411764706e-05,
67
- "loss": 1.8761,
68
  "step": 100
69
  },
70
  {
71
  "epoch": 0.44,
72
- "learning_rate": 4.75e-05,
73
- "loss": 1.8658,
74
  "step": 110
75
  },
76
  {
77
  "epoch": 0.48,
78
- "learning_rate": 4.6764705882352944e-05,
79
- "loss": 1.8532,
80
  "step": 120
81
  },
82
  {
83
  "epoch": 0.51,
84
- "learning_rate": 4.6029411764705885e-05,
85
- "loss": 1.8561,
86
  "step": 130
87
  },
88
  {
89
  "epoch": 0.55,
90
- "learning_rate": 4.5294117647058826e-05,
91
- "loss": 1.8521,
92
  "step": 140
93
  },
94
  {
95
  "epoch": 0.59,
96
- "learning_rate": 4.455882352941177e-05,
97
- "loss": 1.8386,
98
  "step": 150
99
  },
100
  {
101
  "epoch": 0.63,
102
- "learning_rate": 4.382352941176471e-05,
103
- "loss": 1.8219,
104
  "step": 160
105
  },
106
  {
107
  "epoch": 0.67,
108
- "learning_rate": 4.308823529411765e-05,
109
- "loss": 1.8281,
110
  "step": 170
111
  },
112
  {
113
  "epoch": 0.71,
114
- "learning_rate": 4.235294117647059e-05,
115
- "loss": 1.8081,
116
  "step": 180
117
  },
118
  {
119
  "epoch": 0.75,
120
- "learning_rate": 4.161764705882353e-05,
121
- "loss": 1.8114,
122
  "step": 190
123
  },
124
  {
125
  "epoch": 0.79,
126
- "learning_rate": 4.0882352941176474e-05,
127
- "loss": 1.813,
128
  "step": 200
129
  },
130
  {
131
  "epoch": 0.83,
132
- "learning_rate": 4.0147058823529415e-05,
133
- "loss": 1.7975,
134
  "step": 210
135
  },
136
  {
137
  "epoch": 0.87,
138
- "learning_rate": 3.9411764705882356e-05,
139
- "loss": 1.8039,
140
  "step": 220
141
  },
142
  {
143
  "epoch": 0.91,
144
- "learning_rate": 3.86764705882353e-05,
145
- "loss": 1.7997,
146
  "step": 230
147
  },
148
  {
149
  "epoch": 0.95,
150
- "learning_rate": 3.794117647058824e-05,
151
- "loss": 1.7928,
152
  "step": 240
153
  },
154
  {
155
  "epoch": 0.99,
156
- "learning_rate": 3.720588235294118e-05,
157
- "loss": 1.7973,
158
  "step": 250
159
  },
160
  {
161
  "epoch": 1.0,
162
- "eval_accuracy": 0.24770130955697967,
163
- "eval_loss": 1.7939863204956055,
164
- "eval_runtime": 18.0712,
165
- "eval_samples_per_second": 198.604,
166
- "eval_steps_per_second": 6.253,
167
  "step": 252
168
  },
169
  {
170
  "epoch": 1.03,
171
- "learning_rate": 3.6470588235294114e-05,
172
- "loss": 1.8085,
173
  "step": 260
174
  },
175
  {
176
  "epoch": 1.07,
177
- "learning_rate": 3.573529411764706e-05,
178
- "loss": 1.7814,
179
  "step": 270
180
  },
181
  {
182
  "epoch": 1.11,
183
- "learning_rate": 3.5e-05,
184
- "loss": 1.8047,
185
  "step": 280
186
  },
187
  {
188
  "epoch": 1.15,
189
- "learning_rate": 3.4264705882352945e-05,
190
- "loss": 1.801,
191
  "step": 290
192
  },
193
  {
194
  "epoch": 1.19,
195
- "learning_rate": 3.352941176470588e-05,
196
- "loss": 1.7883,
197
  "step": 300
198
  },
199
  {
200
  "epoch": 1.23,
201
- "learning_rate": 3.279411764705883e-05,
202
- "loss": 1.7824,
203
  "step": 310
204
  },
205
  {
206
  "epoch": 1.27,
207
- "learning_rate": 3.205882352941177e-05,
208
- "loss": 1.774,
209
  "step": 320
210
  },
211
  {
212
  "epoch": 1.31,
213
- "learning_rate": 3.132352941176471e-05,
214
- "loss": 1.7772,
215
  "step": 330
216
  },
217
  {
218
  "epoch": 1.35,
219
- "learning_rate": 3.058823529411765e-05,
220
- "loss": 1.7584,
221
  "step": 340
222
  },
223
  {
224
  "epoch": 1.39,
225
- "learning_rate": 2.985294117647059e-05,
226
- "loss": 1.7609,
227
  "step": 350
228
  },
229
  {
230
  "epoch": 1.43,
231
- "learning_rate": 2.9117647058823534e-05,
232
- "loss": 1.7864,
233
  "step": 360
234
  },
235
  {
236
  "epoch": 1.47,
237
- "learning_rate": 2.838235294117647e-05,
238
- "loss": 1.7783,
239
  "step": 370
240
  },
241
  {
242
  "epoch": 1.5,
243
- "learning_rate": 2.7647058823529416e-05,
244
- "loss": 1.757,
245
  "step": 380
246
  },
247
  {
248
  "epoch": 1.54,
249
- "learning_rate": 2.6911764705882354e-05,
250
- "loss": 1.7539,
251
  "step": 390
252
  },
253
  {
254
  "epoch": 1.58,
255
- "learning_rate": 2.6176470588235295e-05,
256
- "loss": 1.7694,
257
  "step": 400
258
  },
259
  {
260
  "epoch": 1.62,
261
- "learning_rate": 2.5441176470588236e-05,
262
- "loss": 1.7601,
263
  "step": 410
264
  },
265
  {
266
  "epoch": 1.66,
267
- "learning_rate": 2.4705882352941178e-05,
268
- "loss": 1.7417,
269
  "step": 420
270
  },
271
  {
272
  "epoch": 1.7,
273
- "learning_rate": 2.397058823529412e-05,
274
- "loss": 1.744,
275
  "step": 430
276
  },
277
  {
278
  "epoch": 1.74,
279
- "learning_rate": 2.323529411764706e-05,
280
- "loss": 1.7491,
281
  "step": 440
282
  },
283
  {
284
  "epoch": 1.78,
285
- "learning_rate": 2.25e-05,
286
- "loss": 1.7387,
287
  "step": 450
288
  },
289
  {
290
  "epoch": 1.82,
291
- "learning_rate": 2.1764705882352943e-05,
292
- "loss": 1.7526,
293
  "step": 460
294
  },
295
  {
296
  "epoch": 1.86,
297
- "learning_rate": 2.1029411764705884e-05,
298
- "loss": 1.7361,
299
  "step": 470
300
  },
301
  {
302
  "epoch": 1.9,
303
- "learning_rate": 2.0294117647058825e-05,
304
- "loss": 1.7344,
305
  "step": 480
306
  },
307
  {
308
  "epoch": 1.94,
309
- "learning_rate": 1.9558823529411766e-05,
310
- "loss": 1.7211,
311
  "step": 490
312
  },
313
  {
314
  "epoch": 1.98,
315
- "learning_rate": 1.8823529411764708e-05,
316
- "loss": 1.7299,
317
  "step": 500
318
  },
319
  {
320
  "epoch": 2.0,
321
- "eval_accuracy": 0.2803009194761772,
322
- "eval_loss": 1.727730393409729,
323
- "eval_runtime": 17.3747,
324
- "eval_samples_per_second": 206.564,
325
- "eval_steps_per_second": 6.504,
326
  "step": 505
327
  },
328
  {
329
  "epoch": 2.02,
330
- "learning_rate": 1.808823529411765e-05,
331
- "loss": 1.7469,
332
  "step": 510
333
  },
334
  {
335
  "epoch": 2.06,
336
- "learning_rate": 1.735294117647059e-05,
337
- "loss": 1.744,
338
  "step": 520
339
  },
340
  {
341
  "epoch": 2.1,
342
- "learning_rate": 1.6617647058823528e-05,
343
- "loss": 1.7489,
344
  "step": 530
345
  },
346
  {
347
  "epoch": 2.14,
348
- "learning_rate": 1.588235294117647e-05,
349
- "loss": 1.7356,
350
  "step": 540
351
  },
352
  {
353
  "epoch": 2.18,
354
- "learning_rate": 1.5147058823529412e-05,
355
- "loss": 1.7327,
356
  "step": 550
357
  },
358
  {
359
  "epoch": 2.22,
360
- "learning_rate": 1.4411764705882352e-05,
361
- "loss": 1.7045,
362
  "step": 560
363
  },
364
  {
365
  "epoch": 2.26,
366
- "learning_rate": 1.3676470588235296e-05,
367
- "loss": 1.717,
368
  "step": 570
369
  },
370
  {
371
  "epoch": 2.3,
372
- "learning_rate": 1.2941176470588238e-05,
373
- "loss": 1.7335,
374
  "step": 580
375
  },
376
  {
377
  "epoch": 2.34,
378
- "learning_rate": 1.2205882352941177e-05,
379
- "loss": 1.7338,
380
  "step": 590
381
  },
382
  {
383
  "epoch": 2.38,
384
- "learning_rate": 1.1470588235294118e-05,
385
- "loss": 1.7145,
386
  "step": 600
387
  },
388
  {
389
  "epoch": 2.42,
390
- "learning_rate": 1.0735294117647058e-05,
391
- "loss": 1.7349,
392
  "step": 610
393
  },
394
  {
395
  "epoch": 2.46,
396
- "learning_rate": 1e-05,
397
- "loss": 1.7293,
398
  "step": 620
399
  },
400
  {
401
  "epoch": 2.5,
402
- "learning_rate": 9.264705882352942e-06,
403
- "loss": 1.7291,
404
  "step": 630
405
  },
406
  {
407
  "epoch": 2.53,
408
- "learning_rate": 8.529411764705883e-06,
409
- "loss": 1.7192,
410
  "step": 640
411
  },
412
  {
413
  "epoch": 2.57,
414
- "learning_rate": 7.794117647058825e-06,
415
- "loss": 1.7203,
416
  "step": 650
417
  },
418
  {
419
  "epoch": 2.61,
420
- "learning_rate": 7.058823529411765e-06,
421
- "loss": 1.7403,
422
  "step": 660
423
  },
424
  {
425
  "epoch": 2.65,
426
- "learning_rate": 6.323529411764706e-06,
427
- "loss": 1.7077,
428
  "step": 670
429
  },
430
  {
431
  "epoch": 2.69,
432
- "learning_rate": 5.588235294117647e-06,
433
- "loss": 1.712,
434
  "step": 680
435
  },
436
  {
437
  "epoch": 2.73,
438
- "learning_rate": 4.852941176470589e-06,
439
- "loss": 1.7327,
440
  "step": 690
441
  },
442
  {
443
  "epoch": 2.77,
444
- "learning_rate": 4.11764705882353e-06,
445
- "loss": 1.7021,
446
  "step": 700
447
  },
448
  {
449
  "epoch": 2.81,
450
- "learning_rate": 3.3823529411764707e-06,
451
- "loss": 1.7295,
452
  "step": 710
453
  },
454
  {
455
  "epoch": 2.85,
456
- "learning_rate": 2.647058823529412e-06,
457
- "loss": 1.7288,
458
  "step": 720
459
  },
460
  {
461
  "epoch": 2.89,
462
- "learning_rate": 1.911764705882353e-06,
463
- "loss": 1.6857,
464
  "step": 730
465
  },
466
  {
467
  "epoch": 2.93,
468
- "learning_rate": 1.1764705882352942e-06,
469
- "loss": 1.7052,
470
  "step": 740
471
  },
472
  {
473
  "epoch": 2.97,
474
- "learning_rate": 4.411764705882353e-07,
475
- "loss": 1.7096,
476
  "step": 750
477
  },
478
  {
479
- "epoch": 2.99,
480
- "eval_accuracy": 0.30788520479242126,
481
- "eval_loss": 1.6947710514068604,
482
- "eval_runtime": 16.9402,
483
- "eval_samples_per_second": 211.863,
484
- "eval_steps_per_second": 6.671,
485
- "step": 756
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
  },
487
  {
488
- "epoch": 2.99,
489
- "step": 756,
490
- "total_flos": 2.054868144086016e+18,
491
- "train_loss": 1.782613229499292,
492
- "train_runtime": 1261.819,
493
- "train_samples_per_second": 76.789,
494
- "train_steps_per_second": 0.599
495
  }
496
  ],
497
- "max_steps": 756,
498
- "num_train_epochs": 3,
499
- "total_flos": 2.054868144086016e+18,
500
  "trial_name": null,
501
  "trial_params": null
502
  }
 
1
  {
2
+ "best_metric": 0.4446921147952076,
3
+ "best_model_checkpoint": "microsoft/resnet-50/checkpoint-1260",
4
+ "epoch": 4.99009900990099,
5
+ "global_step": 1260,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.04,
12
+ "learning_rate": 3.968253968253968e-06,
13
+ "loss": 1.7784,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 0.08,
18
+ "learning_rate": 7.936507936507936e-06,
19
+ "loss": 1.786,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.12,
24
+ "learning_rate": 1.1904761904761905e-05,
25
+ "loss": 1.8132,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 0.16,
30
+ "learning_rate": 1.5873015873015872e-05,
31
+ "loss": 1.7916,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 0.2,
36
+ "learning_rate": 1.984126984126984e-05,
37
+ "loss": 1.7859,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 0.24,
42
+ "learning_rate": 2.380952380952381e-05,
43
+ "loss": 1.7751,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 0.28,
48
+ "learning_rate": 2.777777777777778e-05,
49
+ "loss": 1.8154,
50
  "step": 70
51
  },
52
  {
53
  "epoch": 0.32,
54
+ "learning_rate": 3.1746031746031745e-05,
55
+ "loss": 1.7868,
56
  "step": 80
57
  },
58
  {
59
  "epoch": 0.36,
60
+ "learning_rate": 3.571428571428572e-05,
61
+ "loss": 1.7624,
62
  "step": 90
63
  },
64
  {
65
  "epoch": 0.4,
66
+ "learning_rate": 3.968253968253968e-05,
67
+ "loss": 1.7823,
68
  "step": 100
69
  },
70
  {
71
  "epoch": 0.44,
72
+ "learning_rate": 4.3650793650793655e-05,
73
+ "loss": 1.7823,
74
  "step": 110
75
  },
76
  {
77
  "epoch": 0.48,
78
+ "learning_rate": 4.761904761904762e-05,
79
+ "loss": 1.7932,
80
  "step": 120
81
  },
82
  {
83
  "epoch": 0.51,
84
+ "learning_rate": 4.982363315696649e-05,
85
+ "loss": 1.7863,
86
  "step": 130
87
  },
88
  {
89
  "epoch": 0.55,
90
+ "learning_rate": 4.938271604938271e-05,
91
+ "loss": 1.7806,
92
  "step": 140
93
  },
94
  {
95
  "epoch": 0.59,
96
+ "learning_rate": 4.894179894179895e-05,
97
+ "loss": 1.7706,
98
  "step": 150
99
  },
100
  {
101
  "epoch": 0.63,
102
+ "learning_rate": 4.850088183421517e-05,
103
+ "loss": 1.7629,
104
  "step": 160
105
  },
106
  {
107
  "epoch": 0.67,
108
+ "learning_rate": 4.8059964726631394e-05,
109
+ "loss": 1.7731,
110
  "step": 170
111
  },
112
  {
113
  "epoch": 0.71,
114
+ "learning_rate": 4.761904761904762e-05,
115
+ "loss": 1.7435,
116
  "step": 180
117
  },
118
  {
119
  "epoch": 0.75,
120
+ "learning_rate": 4.717813051146385e-05,
121
+ "loss": 1.7371,
122
  "step": 190
123
  },
124
  {
125
  "epoch": 0.79,
126
+ "learning_rate": 4.673721340388007e-05,
127
+ "loss": 1.729,
128
  "step": 200
129
  },
130
  {
131
  "epoch": 0.83,
132
+ "learning_rate": 4.62962962962963e-05,
133
+ "loss": 1.7516,
134
  "step": 210
135
  },
136
  {
137
  "epoch": 0.87,
138
+ "learning_rate": 4.585537918871252e-05,
139
+ "loss": 1.7337,
140
  "step": 220
141
  },
142
  {
143
  "epoch": 0.91,
144
+ "learning_rate": 4.541446208112875e-05,
145
+ "loss": 1.7118,
146
  "step": 230
147
  },
148
  {
149
  "epoch": 0.95,
150
+ "learning_rate": 4.4973544973544974e-05,
151
+ "loss": 1.7141,
152
  "step": 240
153
  },
154
  {
155
  "epoch": 0.99,
156
+ "learning_rate": 4.45326278659612e-05,
157
+ "loss": 1.7252,
158
  "step": 250
159
  },
160
  {
161
  "epoch": 1.0,
162
+ "eval_accuracy": 0.27528559487322374,
163
+ "eval_loss": 1.7069034576416016,
164
+ "eval_runtime": 16.7761,
165
+ "eval_samples_per_second": 213.935,
166
+ "eval_steps_per_second": 6.736,
167
  "step": 252
168
  },
169
  {
170
  "epoch": 1.03,
171
+ "learning_rate": 4.409171075837743e-05,
172
+ "loss": 1.7246,
173
  "step": 260
174
  },
175
  {
176
  "epoch": 1.07,
177
+ "learning_rate": 4.3650793650793655e-05,
178
+ "loss": 1.7388,
179
  "step": 270
180
  },
181
  {
182
  "epoch": 1.11,
183
+ "learning_rate": 4.3209876543209875e-05,
184
+ "loss": 1.7169,
185
  "step": 280
186
  },
187
  {
188
  "epoch": 1.15,
189
+ "learning_rate": 4.27689594356261e-05,
190
+ "loss": 1.6987,
191
  "step": 290
192
  },
193
  {
194
  "epoch": 1.19,
195
+ "learning_rate": 4.232804232804233e-05,
196
+ "loss": 1.7117,
197
  "step": 300
198
  },
199
  {
200
  "epoch": 1.23,
201
+ "learning_rate": 4.1887125220458555e-05,
202
+ "loss": 1.7117,
203
  "step": 310
204
  },
205
  {
206
  "epoch": 1.27,
207
+ "learning_rate": 4.144620811287478e-05,
208
+ "loss": 1.7248,
209
  "step": 320
210
  },
211
  {
212
  "epoch": 1.31,
213
+ "learning_rate": 4.100529100529101e-05,
214
+ "loss": 1.7123,
215
  "step": 330
216
  },
217
  {
218
  "epoch": 1.35,
219
+ "learning_rate": 4.056437389770723e-05,
220
+ "loss": 1.7096,
221
  "step": 340
222
  },
223
  {
224
  "epoch": 1.39,
225
+ "learning_rate": 4.012345679012346e-05,
226
+ "loss": 1.6905,
227
  "step": 350
228
  },
229
  {
230
  "epoch": 1.43,
231
+ "learning_rate": 3.968253968253968e-05,
232
+ "loss": 1.7309,
233
  "step": 360
234
  },
235
  {
236
  "epoch": 1.47,
237
+ "learning_rate": 3.924162257495591e-05,
238
+ "loss": 1.7151,
239
  "step": 370
240
  },
241
  {
242
  "epoch": 1.5,
243
+ "learning_rate": 3.8800705467372136e-05,
244
+ "loss": 1.7067,
245
  "step": 380
246
  },
247
  {
248
  "epoch": 1.54,
249
+ "learning_rate": 3.835978835978836e-05,
250
+ "loss": 1.6982,
251
  "step": 390
252
  },
253
  {
254
  "epoch": 1.58,
255
+ "learning_rate": 3.791887125220458e-05,
256
+ "loss": 1.6959,
257
  "step": 400
258
  },
259
  {
260
  "epoch": 1.62,
261
+ "learning_rate": 3.7477954144620817e-05,
262
+ "loss": 1.6794,
263
  "step": 410
264
  },
265
  {
266
  "epoch": 1.66,
267
+ "learning_rate": 3.7037037037037037e-05,
268
+ "loss": 1.6696,
269
  "step": 420
270
  },
271
  {
272
  "epoch": 1.7,
273
+ "learning_rate": 3.659611992945326e-05,
274
+ "loss": 1.6761,
275
  "step": 430
276
  },
277
  {
278
  "epoch": 1.74,
279
+ "learning_rate": 3.615520282186949e-05,
280
+ "loss": 1.6844,
281
  "step": 440
282
  },
283
  {
284
  "epoch": 1.78,
285
+ "learning_rate": 3.571428571428572e-05,
286
+ "loss": 1.6666,
287
  "step": 450
288
  },
289
  {
290
  "epoch": 1.82,
291
+ "learning_rate": 3.527336860670194e-05,
292
+ "loss": 1.6767,
293
  "step": 460
294
  },
295
  {
296
  "epoch": 1.86,
297
+ "learning_rate": 3.483245149911817e-05,
298
+ "loss": 1.6578,
299
  "step": 470
300
  },
301
  {
302
  "epoch": 1.9,
303
+ "learning_rate": 3.439153439153439e-05,
304
+ "loss": 1.6493,
305
  "step": 480
306
  },
307
  {
308
  "epoch": 1.94,
309
+ "learning_rate": 3.395061728395062e-05,
310
+ "loss": 1.6547,
311
  "step": 490
312
  },
313
  {
314
  "epoch": 1.98,
315
+ "learning_rate": 3.3509700176366844e-05,
316
+ "loss": 1.6386,
317
  "step": 500
318
  },
319
  {
320
  "epoch": 2.0,
321
+ "eval_accuracy": 0.3911953190303706,
322
+ "eval_loss": 1.5798698663711548,
323
+ "eval_runtime": 16.5653,
324
+ "eval_samples_per_second": 216.658,
325
+ "eval_steps_per_second": 6.821,
326
  "step": 505
327
  },
328
  {
329
  "epoch": 2.02,
330
+ "learning_rate": 3.306878306878307e-05,
331
+ "loss": 1.675,
332
  "step": 510
333
  },
334
  {
335
  "epoch": 2.06,
336
+ "learning_rate": 3.262786596119929e-05,
337
+ "loss": 1.6512,
338
  "step": 520
339
  },
340
  {
341
  "epoch": 2.1,
342
+ "learning_rate": 3.2186948853615525e-05,
343
+ "loss": 1.6484,
344
  "step": 530
345
  },
346
  {
347
  "epoch": 2.14,
348
+ "learning_rate": 3.1746031746031745e-05,
349
+ "loss": 1.6573,
350
  "step": 540
351
  },
352
  {
353
  "epoch": 2.18,
354
+ "learning_rate": 3.130511463844797e-05,
355
+ "loss": 1.6452,
356
  "step": 550
357
  },
358
  {
359
  "epoch": 2.22,
360
+ "learning_rate": 3.08641975308642e-05,
361
+ "loss": 1.6462,
362
  "step": 560
363
  },
364
  {
365
  "epoch": 2.26,
366
+ "learning_rate": 3.0423280423280425e-05,
367
+ "loss": 1.6164,
368
  "step": 570
369
  },
370
  {
371
  "epoch": 2.3,
372
+ "learning_rate": 2.998236331569665e-05,
373
+ "loss": 1.6062,
374
  "step": 580
375
  },
376
  {
377
  "epoch": 2.34,
378
+ "learning_rate": 2.954144620811288e-05,
379
+ "loss": 1.6517,
380
  "step": 590
381
  },
382
  {
383
  "epoch": 2.38,
384
+ "learning_rate": 2.91005291005291e-05,
385
+ "loss": 1.6156,
386
  "step": 600
387
  },
388
  {
389
  "epoch": 2.42,
390
+ "learning_rate": 2.865961199294533e-05,
391
+ "loss": 1.6497,
392
  "step": 610
393
  },
394
  {
395
  "epoch": 2.46,
396
+ "learning_rate": 2.8218694885361552e-05,
397
+ "loss": 1.6526,
398
  "step": 620
399
  },
400
  {
401
  "epoch": 2.5,
402
+ "learning_rate": 2.777777777777778e-05,
403
+ "loss": 1.6249,
404
  "step": 630
405
  },
406
  {
407
  "epoch": 2.53,
408
+ "learning_rate": 2.7336860670194003e-05,
409
+ "loss": 1.6367,
410
  "step": 640
411
  },
412
  {
413
  "epoch": 2.57,
414
+ "learning_rate": 2.6895943562610233e-05,
415
+ "loss": 1.6321,
416
  "step": 650
417
  },
418
  {
419
  "epoch": 2.61,
420
+ "learning_rate": 2.6455026455026456e-05,
421
+ "loss": 1.6255,
422
  "step": 660
423
  },
424
  {
425
  "epoch": 2.65,
426
+ "learning_rate": 2.6014109347442683e-05,
427
+ "loss": 1.6215,
428
  "step": 670
429
  },
430
  {
431
  "epoch": 2.69,
432
+ "learning_rate": 2.5573192239858906e-05,
433
+ "loss": 1.6425,
434
  "step": 680
435
  },
436
  {
437
  "epoch": 2.73,
438
+ "learning_rate": 2.5132275132275137e-05,
439
+ "loss": 1.6064,
440
  "step": 690
441
  },
442
  {
443
  "epoch": 2.77,
444
+ "learning_rate": 2.4691358024691357e-05,
445
+ "loss": 1.6093,
446
  "step": 700
447
  },
448
  {
449
  "epoch": 2.81,
450
+ "learning_rate": 2.4250440917107583e-05,
451
+ "loss": 1.6022,
452
  "step": 710
453
  },
454
  {
455
  "epoch": 2.85,
456
+ "learning_rate": 2.380952380952381e-05,
457
+ "loss": 1.6309,
458
  "step": 720
459
  },
460
  {
461
  "epoch": 2.89,
462
+ "learning_rate": 2.3368606701940034e-05,
463
+ "loss": 1.6112,
464
  "step": 730
465
  },
466
  {
467
  "epoch": 2.93,
468
+ "learning_rate": 2.292768959435626e-05,
469
+ "loss": 1.6253,
470
  "step": 740
471
  },
472
  {
473
  "epoch": 2.97,
474
+ "learning_rate": 2.2486772486772487e-05,
475
+ "loss": 1.617,
476
  "step": 750
477
  },
478
  {
479
+ "epoch": 3.0,
480
+ "eval_accuracy": 0.42741710782947895,
481
+ "eval_loss": 1.5154473781585693,
482
+ "eval_runtime": 17.1814,
483
+ "eval_samples_per_second": 208.889,
484
+ "eval_steps_per_second": 6.577,
485
+ "step": 757
486
+ },
487
+ {
488
+ "epoch": 3.01,
489
+ "learning_rate": 2.2045855379188714e-05,
490
+ "loss": 1.6162,
491
+ "step": 760
492
+ },
493
+ {
494
+ "epoch": 3.05,
495
+ "learning_rate": 2.1604938271604937e-05,
496
+ "loss": 1.6166,
497
+ "step": 770
498
+ },
499
+ {
500
+ "epoch": 3.09,
501
+ "learning_rate": 2.1164021164021164e-05,
502
+ "loss": 1.5919,
503
+ "step": 780
504
+ },
505
+ {
506
+ "epoch": 3.13,
507
+ "learning_rate": 2.072310405643739e-05,
508
+ "loss": 1.6002,
509
+ "step": 790
510
+ },
511
+ {
512
+ "epoch": 3.17,
513
+ "learning_rate": 2.0282186948853614e-05,
514
+ "loss": 1.6324,
515
+ "step": 800
516
+ },
517
+ {
518
+ "epoch": 3.21,
519
+ "learning_rate": 1.984126984126984e-05,
520
+ "loss": 1.5883,
521
+ "step": 810
522
+ },
523
+ {
524
+ "epoch": 3.25,
525
+ "learning_rate": 1.9400352733686068e-05,
526
+ "loss": 1.587,
527
+ "step": 820
528
+ },
529
+ {
530
+ "epoch": 3.29,
531
+ "learning_rate": 1.895943562610229e-05,
532
+ "loss": 1.6264,
533
+ "step": 830
534
+ },
535
+ {
536
+ "epoch": 3.33,
537
+ "learning_rate": 1.8518518518518518e-05,
538
+ "loss": 1.5798,
539
+ "step": 840
540
+ },
541
+ {
542
+ "epoch": 3.37,
543
+ "learning_rate": 1.8077601410934745e-05,
544
+ "loss": 1.6045,
545
+ "step": 850
546
+ },
547
+ {
548
+ "epoch": 3.41,
549
+ "learning_rate": 1.763668430335097e-05,
550
+ "loss": 1.6234,
551
+ "step": 860
552
+ },
553
+ {
554
+ "epoch": 3.45,
555
+ "learning_rate": 1.7195767195767195e-05,
556
+ "loss": 1.5851,
557
+ "step": 870
558
+ },
559
+ {
560
+ "epoch": 3.49,
561
+ "learning_rate": 1.6754850088183422e-05,
562
+ "loss": 1.6007,
563
+ "step": 880
564
+ },
565
+ {
566
+ "epoch": 3.52,
567
+ "learning_rate": 1.6313932980599646e-05,
568
+ "loss": 1.6145,
569
+ "step": 890
570
+ },
571
+ {
572
+ "epoch": 3.56,
573
+ "learning_rate": 1.5873015873015872e-05,
574
+ "loss": 1.5742,
575
+ "step": 900
576
+ },
577
+ {
578
+ "epoch": 3.6,
579
+ "learning_rate": 1.54320987654321e-05,
580
+ "loss": 1.5846,
581
+ "step": 910
582
+ },
583
+ {
584
+ "epoch": 3.64,
585
+ "learning_rate": 1.4991181657848324e-05,
586
+ "loss": 1.5921,
587
+ "step": 920
588
+ },
589
+ {
590
+ "epoch": 3.68,
591
+ "learning_rate": 1.455026455026455e-05,
592
+ "loss": 1.6146,
593
+ "step": 930
594
+ },
595
+ {
596
+ "epoch": 3.72,
597
+ "learning_rate": 1.4109347442680776e-05,
598
+ "loss": 1.5937,
599
+ "step": 940
600
+ },
601
+ {
602
+ "epoch": 3.76,
603
+ "learning_rate": 1.3668430335097001e-05,
604
+ "loss": 1.5761,
605
+ "step": 950
606
+ },
607
+ {
608
+ "epoch": 3.8,
609
+ "learning_rate": 1.3227513227513228e-05,
610
+ "loss": 1.6056,
611
+ "step": 960
612
+ },
613
+ {
614
+ "epoch": 3.84,
615
+ "learning_rate": 1.2786596119929453e-05,
616
+ "loss": 1.5614,
617
+ "step": 970
618
+ },
619
+ {
620
+ "epoch": 3.88,
621
+ "learning_rate": 1.2345679012345678e-05,
622
+ "loss": 1.578,
623
+ "step": 980
624
+ },
625
+ {
626
+ "epoch": 3.92,
627
+ "learning_rate": 1.1904761904761905e-05,
628
+ "loss": 1.5495,
629
+ "step": 990
630
+ },
631
+ {
632
+ "epoch": 3.96,
633
+ "learning_rate": 1.146384479717813e-05,
634
+ "loss": 1.5949,
635
+ "step": 1000
636
+ },
637
+ {
638
+ "epoch": 4.0,
639
+ "learning_rate": 1.1022927689594357e-05,
640
+ "loss": 1.5795,
641
+ "step": 1010
642
+ },
643
+ {
644
+ "epoch": 4.0,
645
+ "eval_accuracy": 0.43772638617999443,
646
+ "eval_loss": 1.4835728406906128,
647
+ "eval_runtime": 16.6725,
648
+ "eval_samples_per_second": 215.265,
649
+ "eval_steps_per_second": 6.778,
650
+ "step": 1010
651
+ },
652
+ {
653
+ "epoch": 4.04,
654
+ "learning_rate": 1.0582010582010582e-05,
655
+ "loss": 1.5641,
656
+ "step": 1020
657
+ },
658
+ {
659
+ "epoch": 4.08,
660
+ "learning_rate": 1.0141093474426807e-05,
661
+ "loss": 1.5833,
662
+ "step": 1030
663
+ },
664
+ {
665
+ "epoch": 4.12,
666
+ "learning_rate": 9.700176366843034e-06,
667
+ "loss": 1.5865,
668
+ "step": 1040
669
+ },
670
+ {
671
+ "epoch": 4.16,
672
+ "learning_rate": 9.259259259259259e-06,
673
+ "loss": 1.5846,
674
+ "step": 1050
675
+ },
676
+ {
677
+ "epoch": 4.2,
678
+ "learning_rate": 8.818342151675484e-06,
679
+ "loss": 1.5946,
680
+ "step": 1060
681
+ },
682
+ {
683
+ "epoch": 4.24,
684
+ "learning_rate": 8.377425044091711e-06,
685
+ "loss": 1.6151,
686
+ "step": 1070
687
+ },
688
+ {
689
+ "epoch": 4.28,
690
+ "learning_rate": 7.936507936507936e-06,
691
+ "loss": 1.5523,
692
+ "step": 1080
693
+ },
694
+ {
695
+ "epoch": 4.32,
696
+ "learning_rate": 7.495590828924162e-06,
697
+ "loss": 1.5852,
698
+ "step": 1090
699
+ },
700
+ {
701
+ "epoch": 4.36,
702
+ "learning_rate": 7.054673721340388e-06,
703
+ "loss": 1.5802,
704
+ "step": 1100
705
+ },
706
+ {
707
+ "epoch": 4.4,
708
+ "learning_rate": 6.613756613756614e-06,
709
+ "loss": 1.6024,
710
+ "step": 1110
711
+ },
712
+ {
713
+ "epoch": 4.44,
714
+ "learning_rate": 6.172839506172839e-06,
715
+ "loss": 1.5884,
716
+ "step": 1120
717
+ },
718
+ {
719
+ "epoch": 4.48,
720
+ "learning_rate": 5.731922398589065e-06,
721
+ "loss": 1.5436,
722
+ "step": 1130
723
+ },
724
+ {
725
+ "epoch": 4.51,
726
+ "learning_rate": 5.291005291005291e-06,
727
+ "loss": 1.5893,
728
+ "step": 1140
729
+ },
730
+ {
731
+ "epoch": 4.55,
732
+ "learning_rate": 4.850088183421517e-06,
733
+ "loss": 1.5899,
734
+ "step": 1150
735
+ },
736
+ {
737
+ "epoch": 4.59,
738
+ "learning_rate": 4.409171075837742e-06,
739
+ "loss": 1.5784,
740
+ "step": 1160
741
+ },
742
+ {
743
+ "epoch": 4.63,
744
+ "learning_rate": 3.968253968253968e-06,
745
+ "loss": 1.5623,
746
+ "step": 1170
747
+ },
748
+ {
749
+ "epoch": 4.67,
750
+ "learning_rate": 3.527336860670194e-06,
751
+ "loss": 1.586,
752
+ "step": 1180
753
+ },
754
+ {
755
+ "epoch": 4.71,
756
+ "learning_rate": 3.0864197530864196e-06,
757
+ "loss": 1.5778,
758
+ "step": 1190
759
+ },
760
+ {
761
+ "epoch": 4.75,
762
+ "learning_rate": 2.6455026455026455e-06,
763
+ "loss": 1.5501,
764
+ "step": 1200
765
+ },
766
+ {
767
+ "epoch": 4.79,
768
+ "learning_rate": 2.204585537918871e-06,
769
+ "loss": 1.6087,
770
+ "step": 1210
771
+ },
772
+ {
773
+ "epoch": 4.83,
774
+ "learning_rate": 1.763668430335097e-06,
775
+ "loss": 1.5764,
776
+ "step": 1220
777
+ },
778
+ {
779
+ "epoch": 4.87,
780
+ "learning_rate": 1.3227513227513228e-06,
781
+ "loss": 1.5991,
782
+ "step": 1230
783
+ },
784
+ {
785
+ "epoch": 4.91,
786
+ "learning_rate": 8.818342151675485e-07,
787
+ "loss": 1.5643,
788
+ "step": 1240
789
+ },
790
+ {
791
+ "epoch": 4.95,
792
+ "learning_rate": 4.4091710758377425e-07,
793
+ "loss": 1.5688,
794
+ "step": 1250
795
+ },
796
+ {
797
+ "epoch": 4.99,
798
+ "learning_rate": 0.0,
799
+ "loss": 1.5645,
800
+ "step": 1260
801
+ },
802
+ {
803
+ "epoch": 4.99,
804
+ "eval_accuracy": 0.4446921147952076,
805
+ "eval_loss": 1.4667332172393799,
806
+ "eval_runtime": 16.8572,
807
+ "eval_samples_per_second": 212.906,
808
+ "eval_steps_per_second": 6.703,
809
+ "step": 1260
810
  },
811
  {
812
+ "epoch": 4.99,
813
+ "step": 1260,
814
+ "total_flos": 3.424468651849728e+18,
815
+ "train_loss": 1.653014714377267,
816
+ "train_runtime": 2002.0643,
817
+ "train_samples_per_second": 80.662,
818
+ "train_steps_per_second": 0.629
819
  }
820
  ],
821
+ "max_steps": 1260,
822
+ "num_train_epochs": 5,
823
+ "total_flos": 3.424468651849728e+18,
824
  "trial_name": null,
825
  "trial_params": null
826
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:920d9e50045acf1adce074b7f37032f58c4798960340b0330ab02f085fe17dff
3
  size 4408
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3229e35e3e55df4b186a6a6699879b95c04344282f339abe3fbb728404a6edca
3
  size 4408