ealdaz commited on
Commit
ddaccb1
·
verified ·
1 Parent(s): 0c89dc6

End of training

Browse files
README.md CHANGED
@@ -2,6 +2,8 @@
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
 
 
5
  - generated_from_trainer
6
  metrics:
7
  - accuracy
@@ -15,10 +17,10 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # vit-base-beans
17
 
18
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.0797
21
- - Accuracy: 0.9774
22
 
23
  ## Model description
24
 
 
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
5
+ - image-classification
6
+ - vision
7
  - generated_from_trainer
8
  metrics:
9
  - accuracy
 
17
 
18
  # vit-base-beans
19
 
20
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the beans dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.0627
23
+ - Accuracy: 0.9925
24
 
25
  ## Model description
26
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 2.0,
3
- "eval_accuracy": 0.9699248120300752,
4
- "eval_loss": 0.10282660275697708,
5
- "eval_runtime": 9.7767,
6
- "eval_samples_per_second": 13.604,
7
- "eval_steps_per_second": 6.853,
8
- "total_flos": 1.602548708238213e+17,
9
- "train_loss": 0.2494486983959172,
10
- "train_runtime": 527.8899,
11
- "train_samples_per_second": 3.917,
12
- "train_steps_per_second": 1.959
13
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "eval_accuracy": 0.9924812030075187,
4
+ "eval_loss": 0.06271149218082428,
5
+ "eval_runtime": 2.0152,
6
+ "eval_samples_per_second": 66.0,
7
+ "eval_steps_per_second": 8.436,
8
+ "total_flos": 4.006371770595533e+17,
9
+ "train_loss": 0.2178187003502479,
10
+ "train_runtime": 261.0621,
11
+ "train_samples_per_second": 19.804,
12
+ "train_steps_per_second": 2.49
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.0,
3
- "eval_accuracy": 0.9699248120300752,
4
- "eval_loss": 0.10282660275697708,
5
- "eval_runtime": 9.7767,
6
- "eval_samples_per_second": 13.604,
7
- "eval_steps_per_second": 6.853
8
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "eval_accuracy": 0.9924812030075187,
4
+ "eval_loss": 0.06271149218082428,
5
+ "eval_runtime": 2.0152,
6
+ "eval_samples_per_second": 66.0,
7
+ "eval_steps_per_second": 8.436
8
  }
runs/May23_21-28-08_Edus-MacBook-Pro.local/events.out.tfevents.1716496939.Edus-MacBook-Pro.local ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97aa9bed90c56b9f4cce0a647462eb252cfecba65d0c6d9540025841fc42bae4
3
+ size 253
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.0,
3
- "total_flos": 1.602548708238213e+17,
4
- "train_loss": 0.2494486983959172,
5
- "train_runtime": 527.8899,
6
- "train_samples_per_second": 3.917,
7
- "train_steps_per_second": 1.959
8
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "total_flos": 4.006371770595533e+17,
4
+ "train_loss": 0.2178187003502479,
5
+ "train_runtime": 261.0621,
6
+ "train_samples_per_second": 19.804,
7
+ "train_steps_per_second": 2.49
8
  }
trainer_state.json CHANGED
@@ -1,769 +1,542 @@
1
  {
2
- "best_metric": 0.10282659530639648,
3
- "best_model_checkpoint": "./beans_outputs/checkpoint-517",
4
- "epoch": 2.0,
5
  "eval_steps": 500,
6
- "global_step": 1034,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.019342359767891684,
13
- "grad_norm": 3.192776918411255,
14
- "learning_rate": 1.9806576402321086e-05,
15
- "loss": 1.0424,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.03868471953578337,
20
- "grad_norm": 4.558914661407471,
21
- "learning_rate": 1.961315280464217e-05,
22
- "loss": 0.9602,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.058027079303675046,
27
- "grad_norm": 4.579104423522949,
28
- "learning_rate": 1.941972920696325e-05,
29
- "loss": 0.8902,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.07736943907156674,
34
- "grad_norm": 7.477652072906494,
35
- "learning_rate": 1.9226305609284334e-05,
36
- "loss": 0.9063,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.09671179883945841,
41
- "grad_norm": 5.673726558685303,
42
- "learning_rate": 1.9032882011605418e-05,
43
- "loss": 0.8961,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.11605415860735009,
48
- "grad_norm": 4.80955696105957,
49
- "learning_rate": 1.8839458413926502e-05,
50
- "loss": 0.8157,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.13539651837524178,
55
- "grad_norm": 2.8267390727996826,
56
- "learning_rate": 1.8646034816247586e-05,
57
- "loss": 0.7936,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.15473887814313347,
62
- "grad_norm": 3.271488904953003,
63
- "learning_rate": 1.8452611218568667e-05,
64
- "loss": 0.6614,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.17408123791102514,
69
- "grad_norm": 3.2059671878814697,
70
- "learning_rate": 1.825918762088975e-05,
71
- "loss": 0.616,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.19342359767891681,
76
- "grad_norm": 4.123504161834717,
77
- "learning_rate": 1.806576402321083e-05,
78
- "loss": 0.6023,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.2127659574468085,
83
- "grad_norm": 2.275813579559326,
84
- "learning_rate": 1.7872340425531915e-05,
85
- "loss": 0.5268,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.23210831721470018,
90
- "grad_norm": 4.621743679046631,
91
- "learning_rate": 1.7678916827853e-05,
92
- "loss": 0.5247,
93
  "step": 120
94
  },
95
  {
96
- "epoch": 0.2514506769825919,
97
- "grad_norm": 5.684010028839111,
98
- "learning_rate": 1.7485493230174083e-05,
99
- "loss": 0.4011,
100
  "step": 130
101
  },
102
  {
103
- "epoch": 0.27079303675048355,
104
- "grad_norm": 1.714890956878662,
105
- "learning_rate": 1.7292069632495167e-05,
106
- "loss": 0.354,
 
 
 
 
 
 
 
 
 
107
  "step": 140
108
  },
109
  {
110
- "epoch": 0.2901353965183752,
111
- "grad_norm": 1.8230618238449097,
112
- "learning_rate": 1.7098646034816248e-05,
113
- "loss": 0.3116,
114
  "step": 150
115
  },
116
  {
117
- "epoch": 0.30947775628626695,
118
- "grad_norm": 1.1684730052947998,
119
- "learning_rate": 1.690522243713733e-05,
120
- "loss": 0.3653,
121
  "step": 160
122
  },
123
  {
124
- "epoch": 0.3288201160541586,
125
- "grad_norm": 3.569240093231201,
126
- "learning_rate": 1.6711798839458415e-05,
127
- "loss": 0.3585,
128
  "step": 170
129
  },
130
  {
131
- "epoch": 0.3481624758220503,
132
- "grad_norm": 1.3044307231903076,
133
- "learning_rate": 1.65183752417795e-05,
134
- "loss": 0.3891,
135
  "step": 180
136
  },
137
  {
138
- "epoch": 0.36750483558994196,
139
- "grad_norm": 0.8555651903152466,
140
- "learning_rate": 1.6324951644100583e-05,
141
- "loss": 0.368,
142
  "step": 190
143
  },
144
  {
145
- "epoch": 0.38684719535783363,
146
- "grad_norm": 24.579862594604492,
147
- "learning_rate": 1.6131528046421664e-05,
148
- "loss": 0.3181,
149
  "step": 200
150
  },
151
  {
152
- "epoch": 0.40618955512572535,
153
- "grad_norm": 2.9059534072875977,
154
- "learning_rate": 1.5938104448742748e-05,
155
- "loss": 0.3285,
156
  "step": 210
157
  },
158
  {
159
- "epoch": 0.425531914893617,
160
- "grad_norm": 8.594366073608398,
161
- "learning_rate": 1.5744680851063832e-05,
162
- "loss": 0.4461,
163
  "step": 220
164
  },
165
  {
166
- "epoch": 0.4448742746615087,
167
- "grad_norm": 1.1595553159713745,
168
- "learning_rate": 1.5551257253384916e-05,
169
- "loss": 0.1521,
170
  "step": 230
171
  },
172
  {
173
- "epoch": 0.46421663442940037,
174
- "grad_norm": 14.084031105041504,
175
- "learning_rate": 1.5357833655706e-05,
176
- "loss": 0.1935,
177
  "step": 240
178
  },
179
  {
180
- "epoch": 0.4835589941972921,
181
- "grad_norm": 14.068747520446777,
182
- "learning_rate": 1.5164410058027082e-05,
183
- "loss": 0.5199,
184
  "step": 250
185
  },
186
  {
187
- "epoch": 0.5029013539651838,
188
- "grad_norm": 20.944473266601562,
189
- "learning_rate": 1.4970986460348164e-05,
190
- "loss": 0.1387,
191
  "step": 260
192
  },
193
  {
194
- "epoch": 0.5222437137330754,
195
- "grad_norm": 14.65572452545166,
196
- "learning_rate": 1.4777562862669247e-05,
197
- "loss": 0.6002,
 
 
 
 
 
 
 
 
 
198
  "step": 270
199
  },
200
  {
201
- "epoch": 0.5415860735009671,
202
- "grad_norm": 0.8473848104476929,
203
- "learning_rate": 1.4584139264990329e-05,
204
- "loss": 0.1309,
205
  "step": 280
206
  },
207
  {
208
- "epoch": 0.5609284332688588,
209
- "grad_norm": 2.6289236545562744,
210
- "learning_rate": 1.4390715667311413e-05,
211
- "loss": 0.0999,
212
  "step": 290
213
  },
214
  {
215
- "epoch": 0.5802707930367504,
216
- "grad_norm": 0.45386189222335815,
217
- "learning_rate": 1.4197292069632495e-05,
218
- "loss": 0.2121,
219
  "step": 300
220
  },
221
  {
222
- "epoch": 0.5996131528046421,
223
- "grad_norm": 0.400414377450943,
224
- "learning_rate": 1.4003868471953579e-05,
225
- "loss": 0.1389,
226
  "step": 310
227
  },
228
  {
229
- "epoch": 0.6189555125725339,
230
- "grad_norm": 0.4177948534488678,
231
- "learning_rate": 1.3810444874274663e-05,
232
- "loss": 0.0817,
233
  "step": 320
234
  },
235
  {
236
- "epoch": 0.6382978723404256,
237
- "grad_norm": 3.3589959144592285,
238
- "learning_rate": 1.3617021276595745e-05,
239
- "loss": 0.1227,
240
  "step": 330
241
  },
242
  {
243
- "epoch": 0.6576402321083172,
244
- "grad_norm": 0.5038989782333374,
245
- "learning_rate": 1.342359767891683e-05,
246
- "loss": 0.3583,
247
  "step": 340
248
  },
249
  {
250
- "epoch": 0.6769825918762089,
251
- "grad_norm": 15.520181655883789,
252
- "learning_rate": 1.3230174081237912e-05,
253
- "loss": 0.1876,
254
  "step": 350
255
  },
256
  {
257
- "epoch": 0.6963249516441006,
258
- "grad_norm": 0.2799462676048279,
259
- "learning_rate": 1.3036750483558995e-05,
260
- "loss": 0.0677,
261
  "step": 360
262
  },
263
  {
264
- "epoch": 0.7156673114119922,
265
- "grad_norm": 4.5798869132995605,
266
- "learning_rate": 1.2843326885880078e-05,
267
- "loss": 0.0657,
268
  "step": 370
269
  },
270
  {
271
- "epoch": 0.7350096711798839,
272
- "grad_norm": 1.6752430200576782,
273
- "learning_rate": 1.2649903288201162e-05,
274
- "loss": 0.1983,
275
  "step": 380
276
  },
277
  {
278
- "epoch": 0.7543520309477756,
279
- "grad_norm": 0.23511852324008942,
280
- "learning_rate": 1.2456479690522246e-05,
281
- "loss": 0.1278,
 
 
 
 
 
 
 
 
 
282
  "step": 390
283
  },
284
  {
285
- "epoch": 0.7736943907156673,
286
- "grad_norm": 0.3272879719734192,
287
- "learning_rate": 1.2263056092843328e-05,
288
- "loss": 0.138,
289
  "step": 400
290
  },
291
  {
292
- "epoch": 0.793036750483559,
293
- "grad_norm": 0.34640178084373474,
294
- "learning_rate": 1.2069632495164412e-05,
295
- "loss": 0.3669,
296
  "step": 410
297
  },
298
  {
299
- "epoch": 0.8123791102514507,
300
- "grad_norm": 0.3155074715614319,
301
- "learning_rate": 1.1876208897485494e-05,
302
- "loss": 0.1772,
303
  "step": 420
304
  },
305
  {
306
- "epoch": 0.8317214700193424,
307
- "grad_norm": 0.30559900403022766,
308
- "learning_rate": 1.1682785299806578e-05,
309
- "loss": 0.3992,
310
  "step": 430
311
  },
312
  {
313
- "epoch": 0.851063829787234,
314
- "grad_norm": 0.4976309835910797,
315
- "learning_rate": 1.1489361702127662e-05,
316
- "loss": 0.0475,
317
  "step": 440
318
  },
319
  {
320
- "epoch": 0.8704061895551257,
321
- "grad_norm": 0.40445780754089355,
322
- "learning_rate": 1.1295938104448743e-05,
323
- "loss": 0.0504,
324
  "step": 450
325
  },
326
  {
327
- "epoch": 0.8897485493230174,
328
- "grad_norm": 6.139995098114014,
329
- "learning_rate": 1.1102514506769827e-05,
330
- "loss": 0.0773,
331
  "step": 460
332
  },
333
  {
334
- "epoch": 0.9090909090909091,
335
- "grad_norm": 0.5419360995292664,
336
- "learning_rate": 1.0909090909090909e-05,
337
- "loss": 0.046,
338
  "step": 470
339
  },
340
  {
341
- "epoch": 0.9284332688588007,
342
- "grad_norm": 2.6806490421295166,
343
- "learning_rate": 1.0715667311411993e-05,
344
- "loss": 0.2814,
345
  "step": 480
346
  },
347
  {
348
- "epoch": 0.9477756286266924,
349
- "grad_norm": 6.263265132904053,
350
- "learning_rate": 1.0522243713733075e-05,
351
- "loss": 0.2632,
352
  "step": 490
353
  },
354
  {
355
- "epoch": 0.9671179883945842,
356
- "grad_norm": 0.29741278290748596,
357
- "learning_rate": 1.0328820116054159e-05,
358
- "loss": 0.355,
359
  "step": 500
360
  },
361
  {
362
- "epoch": 0.9864603481624759,
363
- "grad_norm": 2.9817466735839844,
364
- "learning_rate": 1.0135396518375243e-05,
365
- "loss": 0.056,
366
  "step": 510
367
  },
368
  {
369
- "epoch": 1.0,
370
- "eval_accuracy": 0.9699248120300752,
371
- "eval_loss": 0.10282659530639648,
372
- "eval_runtime": 9.3712,
373
- "eval_samples_per_second": 14.192,
374
- "eval_steps_per_second": 7.15,
375
- "step": 517
376
  },
377
  {
378
- "epoch": 1.0058027079303675,
379
- "grad_norm": 0.93055659532547,
380
- "learning_rate": 9.941972920696325e-06,
381
- "loss": 0.0659,
 
 
382
  "step": 520
383
  },
384
  {
385
- "epoch": 1.0251450676982592,
386
- "grad_norm": 0.2567221522331238,
387
- "learning_rate": 9.74854932301741e-06,
388
- "loss": 0.1785,
389
  "step": 530
390
  },
391
  {
392
- "epoch": 1.0444874274661509,
393
- "grad_norm": 0.9089804291725159,
394
- "learning_rate": 9.555125725338492e-06,
395
- "loss": 0.0413,
396
  "step": 540
397
  },
398
  {
399
- "epoch": 1.0638297872340425,
400
- "grad_norm": 0.8061901926994324,
401
- "learning_rate": 9.361702127659576e-06,
402
- "loss": 0.0424,
403
  "step": 550
404
  },
405
  {
406
- "epoch": 1.0831721470019342,
407
- "grad_norm": 0.21418491005897522,
408
- "learning_rate": 9.16827852998066e-06,
409
- "loss": 0.0448,
410
  "step": 560
411
  },
412
  {
413
- "epoch": 1.1025145067698259,
414
- "grad_norm": 0.20464298129081726,
415
- "learning_rate": 8.974854932301742e-06,
416
- "loss": 0.0388,
417
  "step": 570
418
  },
419
  {
420
- "epoch": 1.1218568665377175,
421
- "grad_norm": 0.19184868037700653,
422
- "learning_rate": 8.781431334622824e-06,
423
- "loss": 0.4538,
424
  "step": 580
425
  },
426
  {
427
- "epoch": 1.1411992263056092,
428
- "grad_norm": 0.19347575306892395,
429
- "learning_rate": 8.588007736943908e-06,
430
- "loss": 0.0361,
431
  "step": 590
432
  },
433
  {
434
- "epoch": 1.1605415860735009,
435
- "grad_norm": 5.5723772048950195,
436
- "learning_rate": 8.39458413926499e-06,
437
- "loss": 0.126,
438
  "step": 600
439
  },
440
  {
441
- "epoch": 1.1798839458413926,
442
- "grad_norm": 8.067544937133789,
443
- "learning_rate": 8.201160541586074e-06,
444
- "loss": 0.1137,
445
  "step": 610
446
  },
447
  {
448
- "epoch": 1.1992263056092844,
449
- "grad_norm": 0.20713689923286438,
450
- "learning_rate": 8.007736943907156e-06,
451
- "loss": 0.0514,
452
  "step": 620
453
  },
454
  {
455
- "epoch": 1.218568665377176,
456
- "grad_norm": 0.21297387778759003,
457
- "learning_rate": 7.81431334622824e-06,
458
- "loss": 0.0538,
459
  "step": 630
460
  },
461
  {
462
- "epoch": 1.2379110251450678,
463
- "grad_norm": 6.387426376342773,
464
- "learning_rate": 7.6208897485493236e-06,
465
- "loss": 0.0757,
466
  "step": 640
467
  },
468
  {
469
- "epoch": 1.2572533849129595,
470
- "grad_norm": 0.20734897255897522,
471
- "learning_rate": 7.4274661508704075e-06,
472
- "loss": 0.3696,
473
  "step": 650
474
  },
475
  {
476
- "epoch": 1.2765957446808511,
477
- "grad_norm": 0.3389629125595093,
478
- "learning_rate": 7.234042553191491e-06,
479
- "loss": 0.0789,
480
- "step": 660
481
- },
482
- {
483
- "epoch": 1.2959381044487428,
484
- "grad_norm": 0.26222383975982666,
485
- "learning_rate": 7.040618955512573e-06,
486
- "loss": 0.3863,
487
- "step": 670
488
- },
489
- {
490
- "epoch": 1.3152804642166345,
491
- "grad_norm": 0.40525734424591064,
492
- "learning_rate": 6.847195357833656e-06,
493
- "loss": 0.0357,
494
- "step": 680
495
- },
496
- {
497
- "epoch": 1.3346228239845261,
498
- "grad_norm": 0.341835081577301,
499
- "learning_rate": 6.653771760154739e-06,
500
- "loss": 0.1815,
501
- "step": 690
502
- },
503
- {
504
- "epoch": 1.3539651837524178,
505
- "grad_norm": 0.17171211540699005,
506
- "learning_rate": 6.460348162475822e-06,
507
- "loss": 0.0815,
508
- "step": 700
509
- },
510
- {
511
- "epoch": 1.3733075435203095,
512
- "grad_norm": 0.23651224374771118,
513
- "learning_rate": 6.266924564796905e-06,
514
- "loss": 0.1392,
515
- "step": 710
516
- },
517
- {
518
- "epoch": 1.3926499032882012,
519
- "grad_norm": 0.19646570086479187,
520
- "learning_rate": 6.073500967117989e-06,
521
- "loss": 0.0331,
522
- "step": 720
523
- },
524
- {
525
- "epoch": 1.4119922630560928,
526
- "grad_norm": 7.1733574867248535,
527
- "learning_rate": 5.8800773694390724e-06,
528
- "loss": 0.0638,
529
- "step": 730
530
- },
531
- {
532
- "epoch": 1.4313346228239845,
533
- "grad_norm": 0.18651318550109863,
534
- "learning_rate": 5.6866537717601556e-06,
535
- "loss": 0.0352,
536
- "step": 740
537
- },
538
- {
539
- "epoch": 1.4506769825918762,
540
- "grad_norm": 0.1645909547805786,
541
- "learning_rate": 5.493230174081239e-06,
542
- "loss": 0.2837,
543
- "step": 750
544
- },
545
- {
546
- "epoch": 1.4700193423597678,
547
- "grad_norm": 0.16515417397022247,
548
- "learning_rate": 5.299806576402321e-06,
549
- "loss": 0.108,
550
- "step": 760
551
- },
552
- {
553
- "epoch": 1.4893617021276595,
554
- "grad_norm": 0.17802861332893372,
555
- "learning_rate": 5.106382978723404e-06,
556
- "loss": 0.5353,
557
- "step": 770
558
- },
559
- {
560
- "epoch": 1.5087040618955512,
561
- "grad_norm": 0.20135052502155304,
562
- "learning_rate": 4.912959381044487e-06,
563
- "loss": 0.1647,
564
- "step": 780
565
- },
566
- {
567
- "epoch": 1.528046421663443,
568
- "grad_norm": 0.16655394434928894,
569
- "learning_rate": 4.719535783365571e-06,
570
- "loss": 0.2726,
571
- "step": 790
572
- },
573
- {
574
- "epoch": 1.5473887814313345,
575
- "grad_norm": 0.25633805990219116,
576
- "learning_rate": 4.526112185686654e-06,
577
- "loss": 0.029,
578
- "step": 800
579
- },
580
- {
581
- "epoch": 1.5667311411992264,
582
- "grad_norm": 27.006752014160156,
583
- "learning_rate": 4.332688588007737e-06,
584
- "loss": 0.0478,
585
- "step": 810
586
- },
587
- {
588
- "epoch": 1.5860735009671179,
589
- "grad_norm": 60.104461669921875,
590
- "learning_rate": 4.1392649903288205e-06,
591
- "loss": 0.1252,
592
- "step": 820
593
- },
594
- {
595
- "epoch": 1.6054158607350097,
596
- "grad_norm": 0.22381837666034698,
597
- "learning_rate": 3.945841392649904e-06,
598
- "loss": 0.2842,
599
- "step": 830
600
- },
601
- {
602
- "epoch": 1.6247582205029012,
603
- "grad_norm": 0.20553378760814667,
604
- "learning_rate": 3.7524177949709867e-06,
605
- "loss": 0.06,
606
- "step": 840
607
- },
608
- {
609
- "epoch": 1.644100580270793,
610
- "grad_norm": 0.18213896453380585,
611
- "learning_rate": 3.55899419729207e-06,
612
- "loss": 0.031,
613
- "step": 850
614
- },
615
- {
616
- "epoch": 1.6634429400386848,
617
- "grad_norm": 0.18769198656082153,
618
- "learning_rate": 3.3655705996131534e-06,
619
- "loss": 0.4103,
620
- "step": 860
621
- },
622
- {
623
- "epoch": 1.6827852998065764,
624
- "grad_norm": 26.75303077697754,
625
- "learning_rate": 3.172147001934236e-06,
626
- "loss": 0.4095,
627
- "step": 870
628
- },
629
- {
630
- "epoch": 1.702127659574468,
631
- "grad_norm": 1.4275975227355957,
632
- "learning_rate": 2.978723404255319e-06,
633
- "loss": 0.0442,
634
- "step": 880
635
- },
636
- {
637
- "epoch": 1.7214700193423598,
638
- "grad_norm": 0.7911761999130249,
639
- "learning_rate": 2.7852998065764027e-06,
640
- "loss": 0.0295,
641
- "step": 890
642
- },
643
- {
644
- "epoch": 1.7408123791102514,
645
- "grad_norm": 0.1537817418575287,
646
- "learning_rate": 2.591876208897486e-06,
647
- "loss": 0.2648,
648
- "step": 900
649
- },
650
- {
651
- "epoch": 1.760154738878143,
652
- "grad_norm": 0.19490283727645874,
653
- "learning_rate": 2.398452611218569e-06,
654
- "loss": 0.0336,
655
- "step": 910
656
- },
657
- {
658
- "epoch": 1.7794970986460348,
659
- "grad_norm": 67.34917449951172,
660
- "learning_rate": 2.205029013539652e-06,
661
- "loss": 0.1109,
662
- "step": 920
663
- },
664
- {
665
- "epoch": 1.7988394584139265,
666
- "grad_norm": 0.6602293252944946,
667
- "learning_rate": 2.011605415860735e-06,
668
- "loss": 0.0386,
669
- "step": 930
670
- },
671
- {
672
- "epoch": 1.8181818181818183,
673
- "grad_norm": 0.20363681018352509,
674
- "learning_rate": 1.8181818181818183e-06,
675
- "loss": 0.1173,
676
- "step": 940
677
- },
678
- {
679
- "epoch": 1.8375241779497098,
680
- "grad_norm": 0.1697886437177658,
681
- "learning_rate": 1.6247582205029014e-06,
682
- "loss": 0.1885,
683
- "step": 950
684
- },
685
- {
686
- "epoch": 1.8568665377176017,
687
- "grad_norm": 0.1814304143190384,
688
- "learning_rate": 1.4313346228239847e-06,
689
- "loss": 0.2553,
690
- "step": 960
691
- },
692
- {
693
- "epoch": 1.8762088974854931,
694
- "grad_norm": 0.3075575530529022,
695
- "learning_rate": 1.2379110251450678e-06,
696
- "loss": 0.2235,
697
- "step": 970
698
- },
699
- {
700
- "epoch": 1.895551257253385,
701
- "grad_norm": 0.17250248789787292,
702
- "learning_rate": 1.044487427466151e-06,
703
- "loss": 0.0648,
704
- "step": 980
705
- },
706
- {
707
- "epoch": 1.9148936170212765,
708
- "grad_norm": 0.18552158772945404,
709
- "learning_rate": 8.510638297872341e-07,
710
- "loss": 0.0266,
711
- "step": 990
712
- },
713
- {
714
- "epoch": 1.9342359767891684,
715
- "grad_norm": 21.213985443115234,
716
- "learning_rate": 6.576402321083172e-07,
717
- "loss": 0.2269,
718
- "step": 1000
719
- },
720
- {
721
- "epoch": 1.9535783365570598,
722
- "grad_norm": 0.20198415219783783,
723
- "learning_rate": 4.6421663442940047e-07,
724
- "loss": 0.0441,
725
- "step": 1010
726
- },
727
- {
728
- "epoch": 1.9729206963249517,
729
- "grad_norm": 8.638155937194824,
730
- "learning_rate": 2.707930367504836e-07,
731
- "loss": 0.0328,
732
- "step": 1020
733
- },
734
- {
735
- "epoch": 1.9922630560928434,
736
- "grad_norm": 0.18748199939727783,
737
- "learning_rate": 7.736943907156674e-08,
738
- "loss": 0.0627,
739
- "step": 1030
740
- },
741
- {
742
- "epoch": 2.0,
743
- "eval_accuracy": 0.9699248120300752,
744
- "eval_loss": 0.10996536910533905,
745
- "eval_runtime": 9.2747,
746
- "eval_samples_per_second": 14.34,
747
- "eval_steps_per_second": 7.224,
748
- "step": 1034
749
  },
750
  {
751
- "epoch": 2.0,
752
- "step": 1034,
753
- "total_flos": 1.602548708238213e+17,
754
- "train_loss": 0.2494486983959172,
755
- "train_runtime": 527.8899,
756
- "train_samples_per_second": 3.917,
757
- "train_steps_per_second": 1.959
758
  }
759
  ],
760
  "logging_steps": 10,
761
- "max_steps": 1034,
762
  "num_input_tokens_seen": 0,
763
- "num_train_epochs": 2,
764
  "save_steps": 500,
765
- "total_flos": 1.602548708238213e+17,
766
- "train_batch_size": 2,
 
 
 
 
 
 
 
 
 
 
 
 
767
  "trial_name": null,
768
  "trial_params": null
769
  }
 
1
  {
2
+ "best_metric": 0.06271149218082428,
3
+ "best_model_checkpoint": "./beans_outputs/checkpoint-520",
4
+ "epoch": 5.0,
5
  "eval_steps": 500,
6
+ "global_step": 650,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.07692307692307693,
13
+ "grad_norm": 2.1943726539611816,
14
+ "learning_rate": 1.9692307692307696e-05,
15
+ "loss": 1.0249,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.15384615384615385,
20
+ "grad_norm": 1.8665084838867188,
21
+ "learning_rate": 1.9384615384615386e-05,
22
+ "loss": 0.9463,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.23076923076923078,
27
+ "grad_norm": 2.0366761684417725,
28
+ "learning_rate": 1.907692307692308e-05,
29
+ "loss": 0.8401,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.3076923076923077,
34
+ "grad_norm": 2.6267013549804688,
35
+ "learning_rate": 1.876923076923077e-05,
36
+ "loss": 0.6974,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.38461538461538464,
41
+ "grad_norm": 4.034597396850586,
42
+ "learning_rate": 1.8461538461538465e-05,
43
+ "loss": 0.664,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.46153846153846156,
48
+ "grad_norm": 2.0909483432769775,
49
+ "learning_rate": 1.8153846153846155e-05,
50
+ "loss": 0.5928,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.5384615384615384,
55
+ "grad_norm": 1.782705307006836,
56
+ "learning_rate": 1.784615384615385e-05,
57
+ "loss": 0.5262,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.6153846153846154,
62
+ "grad_norm": 1.3501217365264893,
63
+ "learning_rate": 1.753846153846154e-05,
64
+ "loss": 0.4015,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.6923076923076923,
69
+ "grad_norm": 2.443540573120117,
70
+ "learning_rate": 1.7230769230769234e-05,
71
+ "loss": 0.3581,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.7692307692307693,
76
+ "grad_norm": 3.095242738723755,
77
+ "learning_rate": 1.6923076923076924e-05,
78
+ "loss": 0.3728,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.8461538461538461,
83
+ "grad_norm": 2.682217836380005,
84
+ "learning_rate": 1.6615384615384618e-05,
85
+ "loss": 0.3364,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.9230769230769231,
90
+ "grad_norm": 2.5428781509399414,
91
+ "learning_rate": 1.630769230769231e-05,
92
+ "loss": 0.2471,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 1.0,
97
+ "grad_norm": 6.824251651763916,
98
+ "learning_rate": 1.6000000000000003e-05,
99
+ "loss": 0.281,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 1.0,
104
+ "eval_accuracy": 0.9624060150375939,
105
+ "eval_loss": 0.2193511426448822,
106
+ "eval_runtime": 2.0821,
107
+ "eval_samples_per_second": 63.878,
108
+ "eval_steps_per_second": 8.165,
109
+ "step": 130
110
+ },
111
+ {
112
+ "epoch": 1.0769230769230769,
113
+ "grad_norm": 1.4056776762008667,
114
+ "learning_rate": 1.5692307692307693e-05,
115
+ "loss": 0.2017,
116
  "step": 140
117
  },
118
  {
119
+ "epoch": 1.1538461538461537,
120
+ "grad_norm": 2.9976863861083984,
121
+ "learning_rate": 1.5384615384615387e-05,
122
+ "loss": 0.225,
123
  "step": 150
124
  },
125
  {
126
+ "epoch": 1.2307692307692308,
127
+ "grad_norm": 1.4828065633773804,
128
+ "learning_rate": 1.5076923076923078e-05,
129
+ "loss": 0.2011,
130
  "step": 160
131
  },
132
  {
133
+ "epoch": 1.3076923076923077,
134
+ "grad_norm": 0.5160149335861206,
135
+ "learning_rate": 1.4769230769230772e-05,
136
+ "loss": 0.216,
137
  "step": 170
138
  },
139
  {
140
+ "epoch": 1.3846153846153846,
141
+ "grad_norm": 0.606515645980835,
142
+ "learning_rate": 1.4461538461538462e-05,
143
+ "loss": 0.2028,
144
  "step": 180
145
  },
146
  {
147
+ "epoch": 1.4615384615384617,
148
+ "grad_norm": 4.552361011505127,
149
+ "learning_rate": 1.4153846153846156e-05,
150
+ "loss": 0.1627,
151
  "step": 190
152
  },
153
  {
154
+ "epoch": 1.5384615384615383,
155
+ "grad_norm": 0.44607630372047424,
156
+ "learning_rate": 1.3846153846153847e-05,
157
+ "loss": 0.2465,
158
  "step": 200
159
  },
160
  {
161
+ "epoch": 1.6153846153846154,
162
+ "grad_norm": 0.686824381351471,
163
+ "learning_rate": 1.353846153846154e-05,
164
+ "loss": 0.1797,
165
  "step": 210
166
  },
167
  {
168
+ "epoch": 1.6923076923076923,
169
+ "grad_norm": 2.306919574737549,
170
+ "learning_rate": 1.3230769230769231e-05,
171
+ "loss": 0.2161,
172
  "step": 220
173
  },
174
  {
175
+ "epoch": 1.7692307692307692,
176
+ "grad_norm": 2.050942897796631,
177
+ "learning_rate": 1.2923076923076925e-05,
178
+ "loss": 0.1433,
179
  "step": 230
180
  },
181
  {
182
+ "epoch": 1.8461538461538463,
183
+ "grad_norm": 7.248354434967041,
184
+ "learning_rate": 1.2615384615384616e-05,
185
+ "loss": 0.2762,
186
  "step": 240
187
  },
188
  {
189
+ "epoch": 1.9230769230769231,
190
+ "grad_norm": 0.40929391980171204,
191
+ "learning_rate": 1.230769230769231e-05,
192
+ "loss": 0.1286,
193
  "step": 250
194
  },
195
  {
196
+ "epoch": 2.0,
197
+ "grad_norm": 0.3473702669143677,
198
+ "learning_rate": 1.2e-05,
199
+ "loss": 0.1268,
200
  "step": 260
201
  },
202
  {
203
+ "epoch": 2.0,
204
+ "eval_accuracy": 0.9699248120300752,
205
+ "eval_loss": 0.12887412309646606,
206
+ "eval_runtime": 2.0415,
207
+ "eval_samples_per_second": 65.15,
208
+ "eval_steps_per_second": 8.327,
209
+ "step": 260
210
+ },
211
+ {
212
+ "epoch": 2.076923076923077,
213
+ "grad_norm": 0.6053388118743896,
214
+ "learning_rate": 1.1692307692307694e-05,
215
+ "loss": 0.0969,
216
  "step": 270
217
  },
218
  {
219
+ "epoch": 2.1538461538461537,
220
+ "grad_norm": 0.3397085666656494,
221
+ "learning_rate": 1.1384615384615385e-05,
222
+ "loss": 0.1396,
223
  "step": 280
224
  },
225
  {
226
+ "epoch": 2.230769230769231,
227
+ "grad_norm": 7.664570331573486,
228
+ "learning_rate": 1.1076923076923079e-05,
229
+ "loss": 0.1308,
230
  "step": 290
231
  },
232
  {
233
+ "epoch": 2.3076923076923075,
234
+ "grad_norm": 9.331936836242676,
235
+ "learning_rate": 1.076923076923077e-05,
236
+ "loss": 0.1427,
237
  "step": 300
238
  },
239
  {
240
+ "epoch": 2.3846153846153846,
241
+ "grad_norm": 6.082279682159424,
242
+ "learning_rate": 1.0461538461538463e-05,
243
+ "loss": 0.1687,
244
  "step": 310
245
  },
246
  {
247
+ "epoch": 2.4615384615384617,
248
+ "grad_norm": 0.3458414375782013,
249
+ "learning_rate": 1.0153846153846154e-05,
250
+ "loss": 0.1621,
251
  "step": 320
252
  },
253
  {
254
+ "epoch": 2.5384615384615383,
255
+ "grad_norm": 5.815878391265869,
256
+ "learning_rate": 9.846153846153848e-06,
257
+ "loss": 0.1863,
258
  "step": 330
259
  },
260
  {
261
+ "epoch": 2.6153846153846154,
262
+ "grad_norm": 1.0016790628433228,
263
+ "learning_rate": 9.53846153846154e-06,
264
+ "loss": 0.0756,
265
  "step": 340
266
  },
267
  {
268
+ "epoch": 2.6923076923076925,
269
+ "grad_norm": 0.35424068570137024,
270
+ "learning_rate": 9.230769230769232e-06,
271
+ "loss": 0.1243,
272
  "step": 350
273
  },
274
  {
275
+ "epoch": 2.769230769230769,
276
+ "grad_norm": 13.017029762268066,
277
+ "learning_rate": 8.923076923076925e-06,
278
+ "loss": 0.1215,
279
  "step": 360
280
  },
281
  {
282
+ "epoch": 2.8461538461538463,
283
+ "grad_norm": 0.30303019285202026,
284
+ "learning_rate": 8.615384615384617e-06,
285
+ "loss": 0.1552,
286
  "step": 370
287
  },
288
  {
289
+ "epoch": 2.9230769230769234,
290
+ "grad_norm": 1.2382951974868774,
291
+ "learning_rate": 8.307692307692309e-06,
292
+ "loss": 0.1296,
293
  "step": 380
294
  },
295
  {
296
+ "epoch": 3.0,
297
+ "grad_norm": 0.29275208711624146,
298
+ "learning_rate": 8.000000000000001e-06,
299
+ "loss": 0.1385,
300
+ "step": 390
301
+ },
302
+ {
303
+ "epoch": 3.0,
304
+ "eval_accuracy": 0.9699248120300752,
305
+ "eval_loss": 0.09677492827177048,
306
+ "eval_runtime": 2.1789,
307
+ "eval_samples_per_second": 61.041,
308
+ "eval_steps_per_second": 7.802,
309
  "step": 390
310
  },
311
  {
312
+ "epoch": 3.076923076923077,
313
+ "grad_norm": 2.064225673675537,
314
+ "learning_rate": 7.692307692307694e-06,
315
+ "loss": 0.1348,
316
  "step": 400
317
  },
318
  {
319
+ "epoch": 3.1538461538461537,
320
+ "grad_norm": 6.044634819030762,
321
+ "learning_rate": 7.384615384615386e-06,
322
+ "loss": 0.1496,
323
  "step": 410
324
  },
325
  {
326
+ "epoch": 3.230769230769231,
327
+ "grad_norm": 9.788597106933594,
328
+ "learning_rate": 7.076923076923078e-06,
329
+ "loss": 0.142,
330
  "step": 420
331
  },
332
  {
333
+ "epoch": 3.3076923076923075,
334
+ "grad_norm": 0.23322972655296326,
335
+ "learning_rate": 6.76923076923077e-06,
336
+ "loss": 0.0788,
337
  "step": 430
338
  },
339
  {
340
+ "epoch": 3.3846153846153846,
341
+ "grad_norm": 2.807680368423462,
342
+ "learning_rate": 6.461538461538463e-06,
343
+ "loss": 0.1074,
344
  "step": 440
345
  },
346
  {
347
+ "epoch": 3.4615384615384617,
348
+ "grad_norm": 4.750285625457764,
349
+ "learning_rate": 6.153846153846155e-06,
350
+ "loss": 0.0719,
351
  "step": 450
352
  },
353
  {
354
+ "epoch": 3.5384615384615383,
355
+ "grad_norm": 5.354732990264893,
356
+ "learning_rate": 5.846153846153847e-06,
357
+ "loss": 0.0966,
358
  "step": 460
359
  },
360
  {
361
+ "epoch": 3.6153846153846154,
362
+ "grad_norm": 8.170781135559082,
363
+ "learning_rate": 5.538461538461539e-06,
364
+ "loss": 0.0753,
365
  "step": 470
366
  },
367
  {
368
+ "epoch": 3.6923076923076925,
369
+ "grad_norm": 0.22035281360149384,
370
+ "learning_rate": 5.230769230769232e-06,
371
+ "loss": 0.0902,
372
  "step": 480
373
  },
374
  {
375
+ "epoch": 3.769230769230769,
376
+ "grad_norm": 0.2130032181739807,
377
+ "learning_rate": 4.923076923076924e-06,
378
+ "loss": 0.077,
379
  "step": 490
380
  },
381
  {
382
+ "epoch": 3.8461538461538463,
383
+ "grad_norm": 0.9436342716217041,
384
+ "learning_rate": 4.615384615384616e-06,
385
+ "loss": 0.1247,
386
  "step": 500
387
  },
388
  {
389
+ "epoch": 3.9230769230769234,
390
+ "grad_norm": 0.2058902233839035,
391
+ "learning_rate": 4.307692307692308e-06,
392
+ "loss": 0.1309,
393
  "step": 510
394
  },
395
  {
396
+ "epoch": 4.0,
397
+ "grad_norm": 0.30152463912963867,
398
+ "learning_rate": 4.000000000000001e-06,
399
+ "loss": 0.0749,
400
+ "step": 520
 
 
401
  },
402
  {
403
+ "epoch": 4.0,
404
+ "eval_accuracy": 0.9924812030075187,
405
+ "eval_loss": 0.06271149218082428,
406
+ "eval_runtime": 1.9746,
407
+ "eval_samples_per_second": 67.355,
408
+ "eval_steps_per_second": 8.609,
409
  "step": 520
410
  },
411
  {
412
+ "epoch": 4.076923076923077,
413
+ "grad_norm": 2.3252828121185303,
414
+ "learning_rate": 3.692307692307693e-06,
415
+ "loss": 0.0948,
416
  "step": 530
417
  },
418
  {
419
+ "epoch": 4.153846153846154,
420
+ "grad_norm": 2.0056347846984863,
421
+ "learning_rate": 3.384615384615385e-06,
422
+ "loss": 0.1599,
423
  "step": 540
424
  },
425
  {
426
+ "epoch": 4.230769230769231,
427
+ "grad_norm": 0.3621722161769867,
428
+ "learning_rate": 3.0769230769230774e-06,
429
+ "loss": 0.1402,
430
  "step": 550
431
  },
432
  {
433
+ "epoch": 4.3076923076923075,
434
+ "grad_norm": 1.570186734199524,
435
+ "learning_rate": 2.7692307692307697e-06,
436
+ "loss": 0.0493,
437
  "step": 560
438
  },
439
  {
440
+ "epoch": 4.384615384615385,
441
+ "grad_norm": 0.31239092350006104,
442
+ "learning_rate": 2.461538461538462e-06,
443
+ "loss": 0.1283,
444
  "step": 570
445
  },
446
  {
447
+ "epoch": 4.461538461538462,
448
+ "grad_norm": 0.39084771275520325,
449
+ "learning_rate": 2.153846153846154e-06,
450
+ "loss": 0.0804,
451
  "step": 580
452
  },
453
  {
454
+ "epoch": 4.538461538461538,
455
+ "grad_norm": 9.627459526062012,
456
+ "learning_rate": 1.8461538461538465e-06,
457
+ "loss": 0.064,
458
  "step": 590
459
  },
460
  {
461
+ "epoch": 4.615384615384615,
462
+ "grad_norm": 3.2321269512176514,
463
+ "learning_rate": 1.5384615384615387e-06,
464
+ "loss": 0.0828,
465
  "step": 600
466
  },
467
  {
468
+ "epoch": 4.6923076923076925,
469
+ "grad_norm": 5.409714221954346,
470
+ "learning_rate": 1.230769230769231e-06,
471
+ "loss": 0.0899,
472
  "step": 610
473
  },
474
  {
475
+ "epoch": 4.769230769230769,
476
+ "grad_norm": 0.2573850750923157,
477
+ "learning_rate": 9.230769230769232e-07,
478
+ "loss": 0.0794,
479
  "step": 620
480
  },
481
  {
482
+ "epoch": 4.846153846153846,
483
+ "grad_norm": 0.21551428735256195,
484
+ "learning_rate": 6.153846153846155e-07,
485
+ "loss": 0.0616,
486
  "step": 630
487
  },
488
  {
489
+ "epoch": 4.923076923076923,
490
+ "grad_norm": 0.23159781098365784,
491
+ "learning_rate": 3.0769230769230774e-07,
492
+ "loss": 0.0776,
493
  "step": 640
494
  },
495
  {
496
+ "epoch": 5.0,
497
+ "grad_norm": 0.7027397155761719,
498
+ "learning_rate": 0.0,
499
+ "loss": 0.1089,
500
  "step": 650
501
  },
502
  {
503
+ "epoch": 5.0,
504
+ "eval_accuracy": 0.9774436090225563,
505
+ "eval_loss": 0.07966959476470947,
506
+ "eval_runtime": 1.9647,
507
+ "eval_samples_per_second": 67.695,
508
+ "eval_steps_per_second": 8.653,
509
+ "step": 650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
  },
511
  {
512
+ "epoch": 5.0,
513
+ "step": 650,
514
+ "total_flos": 4.006371770595533e+17,
515
+ "train_loss": 0.2178187003502479,
516
+ "train_runtime": 261.0621,
517
+ "train_samples_per_second": 19.804,
518
+ "train_steps_per_second": 2.49
519
  }
520
  ],
521
  "logging_steps": 10,
522
+ "max_steps": 650,
523
  "num_input_tokens_seen": 0,
524
+ "num_train_epochs": 5,
525
  "save_steps": 500,
526
+ "stateful_callbacks": {
527
+ "TrainerControl": {
528
+ "args": {
529
+ "should_epoch_stop": false,
530
+ "should_evaluate": false,
531
+ "should_log": false,
532
+ "should_save": true,
533
+ "should_training_stop": true
534
+ },
535
+ "attributes": {}
536
+ }
537
+ },
538
+ "total_flos": 4.006371770595533e+17,
539
+ "train_batch_size": 8,
540
  "trial_name": null,
541
  "trial_params": null
542
  }