besimray commited on
Commit
99fd48a
·
verified ·
1 Parent(s): 431eb85

Training in progress, step 20, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "gate_proj",
24
- "down_proj",
25
  "k_proj",
26
  "o_proj",
27
- "up_proj",
28
  "v_proj",
29
- "q_proj"
 
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "up_proj",
 
24
  "k_proj",
25
  "o_proj",
 
26
  "v_proj",
27
+ "q_proj",
28
+ "down_proj",
29
+ "gate_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee822693fef528317ba083c5d48f88c30b9c61025611d616bab3d9d798072246
3
  size 45118424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2ea1e6a39b9b019bb68f8bcb9e4113a9450d3a145c326c3d17c60e7914f4870
3
  size 45118424
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:460835f7008ae886b43dbf42bb872deabb83175b9129a32fbc50d714ed9014db
3
  size 23159290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa015a1e42d23eb4a3ea0fa718ceb403d42f019d9d2191fa6d5727830e4b7375
3
  size 23159290
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d11c7a6a55b1fb167809df49e86bfb5922b63262ff5a72c4acf98ce212bd8ed
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e691a6f9c4d5048643b02e3232a9cfa061e68f527f466d35a53bb462800e4b3
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:194456d3c9e165255d5406a0f3f62973b0bede79d91784f72431350783e27ae7
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c297c5cf11a27c75d9f99f1df69752f78c3ad41b0275adf50cdd1b67f9d0bb3
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,1126 +1,168 @@
1
  {
2
- "best_metric": 0.7633899450302124,
3
- "best_model_checkpoint": "miner_id_besimray/checkpoint-140",
4
- "epoch": 1.9292604501607717,
5
  "eval_steps": 20,
6
- "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.012861736334405145,
13
- "grad_norm": 0.6353400945663452,
14
  "learning_rate": 2e-05,
15
- "loss": 0.9595,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.012861736334405145,
20
- "eval_loss": 0.9745924472808838,
21
- "eval_runtime": 5.7213,
22
- "eval_samples_per_second": 28.665,
23
- "eval_steps_per_second": 2.971,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.02572347266881029,
28
- "grad_norm": 0.9050183296203613,
29
  "learning_rate": 4e-05,
30
- "loss": 0.9813,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.03858520900321544,
35
- "grad_norm": 0.8838196992874146,
36
  "learning_rate": 6e-05,
37
- "loss": 0.9972,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.05144694533762058,
42
- "grad_norm": 0.6954429149627686,
43
  "learning_rate": 8e-05,
44
- "loss": 0.8263,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.06430868167202572,
49
- "grad_norm": 0.7475744485855103,
50
  "learning_rate": 0.0001,
51
- "loss": 1.0028,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.07717041800643087,
56
- "grad_norm": 0.48648685216903687,
57
  "learning_rate": 0.00012,
58
- "loss": 0.8235,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.09003215434083602,
63
- "grad_norm": 0.4148712754249573,
64
  "learning_rate": 0.00014,
65
- "loss": 0.9482,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.10289389067524116,
70
- "grad_norm": 0.3340390622615814,
71
  "learning_rate": 0.00016,
72
- "loss": 0.8758,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.1157556270096463,
77
- "grad_norm": 0.26932457089424133,
78
  "learning_rate": 0.00018,
79
- "loss": 0.8252,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.12861736334405144,
84
- "grad_norm": 0.23018883168697357,
85
  "learning_rate": 0.0002,
86
- "loss": 0.8404,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.1414790996784566,
91
- "grad_norm": 0.39730432629585266,
92
  "learning_rate": 0.00019997482349425066,
93
- "loss": 0.8719,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.15434083601286175,
98
- "grad_norm": 0.42967018485069275,
99
  "learning_rate": 0.00019989930665413147,
100
- "loss": 0.8539,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.16720257234726688,
105
- "grad_norm": 0.39713603258132935,
106
  "learning_rate": 0.0001997734875046456,
107
- "loss": 0.7647,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.18006430868167203,
112
- "grad_norm": 0.3015749156475067,
113
  "learning_rate": 0.00019959742939952392,
114
- "loss": 0.806,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.19292604501607716,
119
- "grad_norm": 0.32679829001426697,
120
  "learning_rate": 0.00019937122098932428,
121
- "loss": 0.7549,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.2057877813504823,
126
- "grad_norm": 0.31075620651245117,
127
  "learning_rate": 0.00019909497617679348,
128
- "loss": 0.7734,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.21864951768488747,
133
- "grad_norm": 0.3161105215549469,
134
  "learning_rate": 0.00019876883405951377,
135
- "loss": 0.8734,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.2315112540192926,
140
- "grad_norm": 0.26673561334609985,
141
  "learning_rate": 0.00019839295885986296,
142
- "loss": 0.7541,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.24437299035369775,
147
- "grad_norm": 0.2973664104938507,
148
  "learning_rate": 0.00019796753984232358,
149
- "loss": 0.9559,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.2572347266881029,
154
- "grad_norm": 0.2668478190898895,
155
  "learning_rate": 0.00019749279121818235,
156
- "loss": 0.9389,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.2572347266881029,
161
- "eval_loss": 0.8176891803741455,
162
- "eval_runtime": 5.7831,
163
- "eval_samples_per_second": 28.358,
164
- "eval_steps_per_second": 2.94,
165
  "step": 20
166
- },
167
- {
168
- "epoch": 0.27009646302250806,
169
- "grad_norm": 0.21167340874671936,
170
- "learning_rate": 0.0001969689520376687,
171
- "loss": 0.7046,
172
- "step": 21
173
- },
174
- {
175
- "epoch": 0.2829581993569132,
176
- "grad_norm": 0.23888267576694489,
177
- "learning_rate": 0.00019639628606958533,
178
- "loss": 0.769,
179
- "step": 22
180
- },
181
- {
182
- "epoch": 0.2958199356913183,
183
- "grad_norm": 0.2215345799922943,
184
- "learning_rate": 0.00019577508166849304,
185
- "loss": 0.7524,
186
- "step": 23
187
- },
188
- {
189
- "epoch": 0.3086816720257235,
190
- "grad_norm": 0.2053850293159485,
191
- "learning_rate": 0.00019510565162951537,
192
- "loss": 0.7305,
193
- "step": 24
194
- },
195
- {
196
- "epoch": 0.3215434083601286,
197
- "grad_norm": 0.21348363161087036,
198
- "learning_rate": 0.00019438833303083678,
199
- "loss": 0.8105,
200
- "step": 25
201
- },
202
- {
203
- "epoch": 0.33440514469453375,
204
- "grad_norm": 0.21149109303951263,
205
- "learning_rate": 0.00019362348706397373,
206
- "loss": 0.8249,
207
- "step": 26
208
- },
209
- {
210
- "epoch": 0.34726688102893893,
211
- "grad_norm": 0.22312600910663605,
212
- "learning_rate": 0.0001928114988519039,
213
- "loss": 0.8205,
214
- "step": 27
215
- },
216
- {
217
- "epoch": 0.36012861736334406,
218
- "grad_norm": 0.20689311623573303,
219
- "learning_rate": 0.0001919527772551451,
220
- "loss": 0.7488,
221
- "step": 28
222
- },
223
- {
224
- "epoch": 0.3729903536977492,
225
- "grad_norm": 0.22930146753787994,
226
- "learning_rate": 0.00019104775466588161,
227
- "loss": 0.8267,
228
- "step": 29
229
- },
230
- {
231
- "epoch": 0.3858520900321543,
232
- "grad_norm": 0.24117408692836761,
233
- "learning_rate": 0.0001900968867902419,
234
- "loss": 0.8412,
235
- "step": 30
236
- },
237
- {
238
- "epoch": 0.3987138263665595,
239
- "grad_norm": 0.23889240622520447,
240
- "learning_rate": 0.0001891006524188368,
241
- "loss": 0.8336,
242
- "step": 31
243
- },
244
- {
245
- "epoch": 0.4115755627009646,
246
- "grad_norm": 0.22367344796657562,
247
- "learning_rate": 0.0001880595531856738,
248
- "loss": 0.7905,
249
- "step": 32
250
- },
251
- {
252
- "epoch": 0.42443729903536975,
253
- "grad_norm": 0.22343823313713074,
254
- "learning_rate": 0.00018697411331556956,
255
- "loss": 0.7842,
256
- "step": 33
257
- },
258
- {
259
- "epoch": 0.43729903536977494,
260
- "grad_norm": 0.21874739229679108,
261
- "learning_rate": 0.00018584487936018661,
262
- "loss": 0.7624,
263
- "step": 34
264
- },
265
- {
266
- "epoch": 0.45016077170418006,
267
- "grad_norm": 0.21489335596561432,
268
- "learning_rate": 0.00018467241992282843,
269
- "loss": 0.7956,
270
- "step": 35
271
- },
272
- {
273
- "epoch": 0.4630225080385852,
274
- "grad_norm": 0.2225746512413025,
275
- "learning_rate": 0.00018345732537213027,
276
- "loss": 0.788,
277
- "step": 36
278
- },
279
- {
280
- "epoch": 0.4758842443729904,
281
- "grad_norm": 0.25866109132766724,
282
- "learning_rate": 0.00018220020754479102,
283
- "loss": 0.9176,
284
- "step": 37
285
- },
286
- {
287
- "epoch": 0.4887459807073955,
288
- "grad_norm": 0.2253965586423874,
289
- "learning_rate": 0.00018090169943749476,
290
- "loss": 0.837,
291
- "step": 38
292
- },
293
- {
294
- "epoch": 0.5016077170418006,
295
- "grad_norm": 0.23456409573554993,
296
- "learning_rate": 0.00017956245488817812,
297
- "loss": 0.8215,
298
- "step": 39
299
- },
300
- {
301
- "epoch": 0.5144694533762058,
302
- "grad_norm": 0.22983534634113312,
303
- "learning_rate": 0.000178183148246803,
304
- "loss": 0.7865,
305
- "step": 40
306
- },
307
- {
308
- "epoch": 0.5144694533762058,
309
- "eval_loss": 0.790475070476532,
310
- "eval_runtime": 7.5635,
311
- "eval_samples_per_second": 21.683,
312
- "eval_steps_per_second": 2.248,
313
- "step": 40
314
- },
315
- {
316
- "epoch": 0.5273311897106109,
317
- "grad_norm": 0.21388499438762665,
318
- "learning_rate": 0.0001767644740358011,
319
- "loss": 0.7596,
320
- "step": 41
321
- },
322
- {
323
- "epoch": 0.5401929260450161,
324
- "grad_norm": 0.23246338963508606,
325
- "learning_rate": 0.00017530714660036112,
326
- "loss": 0.8301,
327
- "step": 42
328
- },
329
- {
330
- "epoch": 0.5530546623794212,
331
- "grad_norm": 0.2237369567155838,
332
- "learning_rate": 0.00017381189974873407,
333
- "loss": 0.7615,
334
- "step": 43
335
- },
336
- {
337
- "epoch": 0.5659163987138264,
338
- "grad_norm": 0.2338048666715622,
339
- "learning_rate": 0.00017227948638273916,
340
- "loss": 0.7508,
341
- "step": 44
342
- },
343
- {
344
- "epoch": 0.5787781350482315,
345
- "grad_norm": 0.23250731825828552,
346
- "learning_rate": 0.00017071067811865476,
347
- "loss": 0.7531,
348
- "step": 45
349
- },
350
- {
351
- "epoch": 0.5916398713826366,
352
- "grad_norm": 0.23482945561408997,
353
- "learning_rate": 0.00016910626489868649,
354
- "loss": 0.747,
355
- "step": 46
356
- },
357
- {
358
- "epoch": 0.6045016077170418,
359
- "grad_norm": 0.24034211039543152,
360
- "learning_rate": 0.00016746705459320745,
361
- "loss": 0.8101,
362
- "step": 47
363
- },
364
- {
365
- "epoch": 0.617363344051447,
366
- "grad_norm": 0.22306109964847565,
367
- "learning_rate": 0.00016579387259397127,
368
- "loss": 0.7662,
369
- "step": 48
370
- },
371
- {
372
- "epoch": 0.6302250803858521,
373
- "grad_norm": 0.2227921038866043,
374
- "learning_rate": 0.0001640875613985024,
375
- "loss": 0.7652,
376
- "step": 49
377
- },
378
- {
379
- "epoch": 0.6430868167202572,
380
- "grad_norm": 0.23712162673473358,
381
- "learning_rate": 0.00016234898018587337,
382
- "loss": 0.7383,
383
- "step": 50
384
- },
385
- {
386
- "epoch": 0.6559485530546624,
387
- "grad_norm": 0.22555230557918549,
388
- "learning_rate": 0.000160579004384082,
389
- "loss": 0.7069,
390
- "step": 51
391
- },
392
- {
393
- "epoch": 0.6688102893890675,
394
- "grad_norm": 0.20635604858398438,
395
- "learning_rate": 0.00015877852522924732,
396
- "loss": 0.7082,
397
- "step": 52
398
- },
399
- {
400
- "epoch": 0.6816720257234726,
401
- "grad_norm": 0.24219252169132233,
402
- "learning_rate": 0.0001569484493168452,
403
- "loss": 0.8613,
404
- "step": 53
405
- },
406
- {
407
- "epoch": 0.6945337620578779,
408
- "grad_norm": 0.21117131412029266,
409
- "learning_rate": 0.00015508969814521025,
410
- "loss": 0.7367,
411
- "step": 54
412
- },
413
- {
414
- "epoch": 0.707395498392283,
415
- "grad_norm": 0.19816330075263977,
416
- "learning_rate": 0.00015320320765153367,
417
- "loss": 0.6701,
418
- "step": 55
419
- },
420
- {
421
- "epoch": 0.7202572347266881,
422
- "grad_norm": 0.24127283692359924,
423
- "learning_rate": 0.00015128992774059063,
424
- "loss": 0.7212,
425
- "step": 56
426
- },
427
- {
428
- "epoch": 0.7331189710610932,
429
- "grad_norm": 0.2404824048280716,
430
- "learning_rate": 0.0001493508218064347,
431
- "loss": 0.6806,
432
- "step": 57
433
- },
434
- {
435
- "epoch": 0.7459807073954984,
436
- "grad_norm": 0.2355654090642929,
437
- "learning_rate": 0.00014738686624729986,
438
- "loss": 0.8262,
439
- "step": 58
440
- },
441
- {
442
- "epoch": 0.7588424437299035,
443
- "grad_norm": 0.24600790441036224,
444
- "learning_rate": 0.00014539904997395468,
445
- "loss": 0.8672,
446
- "step": 59
447
- },
448
- {
449
- "epoch": 0.7717041800643086,
450
- "grad_norm": 0.22222688794136047,
451
- "learning_rate": 0.00014338837391175582,
452
- "loss": 0.7565,
453
- "step": 60
454
- },
455
- {
456
- "epoch": 0.7717041800643086,
457
- "eval_loss": 0.7795044779777527,
458
- "eval_runtime": 7.0042,
459
- "eval_samples_per_second": 23.414,
460
- "eval_steps_per_second": 2.427,
461
- "step": 60
462
- },
463
- {
464
- "epoch": 0.7845659163987139,
465
- "grad_norm": 0.23646439611911774,
466
- "learning_rate": 0.00014135585049665207,
467
- "loss": 0.8112,
468
- "step": 61
469
- },
470
- {
471
- "epoch": 0.797427652733119,
472
- "grad_norm": 0.2320852130651474,
473
- "learning_rate": 0.00013930250316539238,
474
- "loss": 0.8133,
475
- "step": 62
476
- },
477
- {
478
- "epoch": 0.8102893890675241,
479
- "grad_norm": 0.22437462210655212,
480
- "learning_rate": 0.00013722936584019453,
481
- "loss": 0.7881,
482
- "step": 63
483
- },
484
- {
485
- "epoch": 0.8231511254019293,
486
- "grad_norm": 0.22626082599163055,
487
- "learning_rate": 0.0001351374824081343,
488
- "loss": 0.7188,
489
- "step": 64
490
- },
491
- {
492
- "epoch": 0.8360128617363344,
493
- "grad_norm": 0.2286273092031479,
494
- "learning_rate": 0.00013302790619551674,
495
- "loss": 0.7627,
496
- "step": 65
497
- },
498
- {
499
- "epoch": 0.8488745980707395,
500
- "grad_norm": 0.20803673565387726,
501
- "learning_rate": 0.00013090169943749476,
502
- "loss": 0.7243,
503
- "step": 66
504
- },
505
- {
506
- "epoch": 0.8617363344051447,
507
- "grad_norm": 0.2412373572587967,
508
- "learning_rate": 0.00012875993274320173,
509
- "loss": 0.7587,
510
- "step": 67
511
- },
512
- {
513
- "epoch": 0.8745980707395499,
514
- "grad_norm": 0.23493675887584686,
515
- "learning_rate": 0.00012660368455666752,
516
- "loss": 0.7248,
517
- "step": 68
518
- },
519
- {
520
- "epoch": 0.887459807073955,
521
- "grad_norm": 0.2631540596485138,
522
- "learning_rate": 0.0001244340406137894,
523
- "loss": 0.8143,
524
- "step": 69
525
- },
526
- {
527
- "epoch": 0.9003215434083601,
528
- "grad_norm": 0.22676923871040344,
529
- "learning_rate": 0.00012225209339563145,
530
- "loss": 0.8182,
531
- "step": 70
532
- },
533
- {
534
- "epoch": 0.9131832797427653,
535
- "grad_norm": 0.2400771528482437,
536
- "learning_rate": 0.00012005894157832729,
537
- "loss": 0.8467,
538
- "step": 71
539
- },
540
- {
541
- "epoch": 0.9260450160771704,
542
- "grad_norm": 0.22387422621250153,
543
- "learning_rate": 0.00011785568947986367,
544
- "loss": 0.7709,
545
- "step": 72
546
- },
547
- {
548
- "epoch": 0.9389067524115756,
549
- "grad_norm": 0.2363765388727188,
550
- "learning_rate": 0.0001156434465040231,
551
- "loss": 0.8006,
552
- "step": 73
553
- },
554
- {
555
- "epoch": 0.9517684887459807,
556
- "grad_norm": 0.23003999888896942,
557
- "learning_rate": 0.00011342332658176555,
558
- "loss": 0.6959,
559
- "step": 74
560
- },
561
- {
562
- "epoch": 0.9646302250803859,
563
- "grad_norm": 0.2013099491596222,
564
- "learning_rate": 0.00011119644761033078,
565
- "loss": 0.73,
566
- "step": 75
567
- },
568
- {
569
- "epoch": 0.977491961414791,
570
- "grad_norm": 0.23860198259353638,
571
- "learning_rate": 0.00010896393089034336,
572
- "loss": 0.8589,
573
- "step": 76
574
- },
575
- {
576
- "epoch": 0.9903536977491961,
577
- "grad_norm": 0.2546059489250183,
578
- "learning_rate": 0.00010672690056120399,
579
- "loss": 0.7953,
580
- "step": 77
581
- },
582
- {
583
- "epoch": 1.0032154340836013,
584
- "grad_norm": 0.3725316822528839,
585
- "learning_rate": 0.00010448648303505151,
586
- "loss": 0.6716,
587
- "step": 78
588
- },
589
- {
590
- "epoch": 1.0160771704180065,
591
- "grad_norm": 0.2085859179496765,
592
- "learning_rate": 0.00010224380642958052,
593
- "loss": 0.6845,
594
- "step": 79
595
- },
596
- {
597
- "epoch": 1.0289389067524115,
598
- "grad_norm": 0.24546077847480774,
599
- "learning_rate": 0.0001,
600
- "loss": 0.8137,
601
- "step": 80
602
- },
603
- {
604
- "epoch": 1.0289389067524115,
605
- "eval_loss": 0.7710850238800049,
606
- "eval_runtime": 6.8626,
607
- "eval_samples_per_second": 23.898,
608
- "eval_steps_per_second": 2.477,
609
- "step": 80
610
- },
611
- {
612
- "epoch": 1.0418006430868167,
613
- "grad_norm": 0.21984128654003143,
614
- "learning_rate": 9.775619357041952e-05,
615
- "loss": 0.6629,
616
- "step": 81
617
- },
618
- {
619
- "epoch": 1.0546623794212218,
620
- "grad_norm": 0.21213646233081818,
621
- "learning_rate": 9.551351696494854e-05,
622
- "loss": 0.6813,
623
- "step": 82
624
- },
625
- {
626
- "epoch": 1.067524115755627,
627
- "grad_norm": 0.22041082382202148,
628
- "learning_rate": 9.327309943879604e-05,
629
- "loss": 0.7533,
630
- "step": 83
631
- },
632
- {
633
- "epoch": 1.0803858520900322,
634
- "grad_norm": 0.23151585459709167,
635
- "learning_rate": 9.103606910965666e-05,
636
- "loss": 0.7094,
637
- "step": 84
638
- },
639
- {
640
- "epoch": 1.0932475884244373,
641
- "grad_norm": 0.24404524266719818,
642
- "learning_rate": 8.880355238966923e-05,
643
- "loss": 0.7239,
644
- "step": 85
645
- },
646
- {
647
- "epoch": 1.1061093247588425,
648
- "grad_norm": 0.21060094237327576,
649
- "learning_rate": 8.657667341823448e-05,
650
- "loss": 0.6813,
651
- "step": 86
652
- },
653
- {
654
- "epoch": 1.1189710610932475,
655
- "grad_norm": 0.21228162944316864,
656
- "learning_rate": 8.435655349597689e-05,
657
- "loss": 0.6777,
658
- "step": 87
659
- },
660
- {
661
- "epoch": 1.1318327974276527,
662
- "grad_norm": 0.2304028421640396,
663
- "learning_rate": 8.214431052013634e-05,
664
- "loss": 0.7201,
665
- "step": 88
666
- },
667
- {
668
- "epoch": 1.144694533762058,
669
- "grad_norm": 0.2450607568025589,
670
- "learning_rate": 7.994105842167273e-05,
671
- "loss": 0.6912,
672
- "step": 89
673
- },
674
- {
675
- "epoch": 1.157556270096463,
676
- "grad_norm": 0.22919628024101257,
677
- "learning_rate": 7.774790660436858e-05,
678
- "loss": 0.6571,
679
- "step": 90
680
- },
681
- {
682
- "epoch": 1.1704180064308682,
683
- "grad_norm": 0.23411712050437927,
684
- "learning_rate": 7.556595938621058e-05,
685
- "loss": 0.7856,
686
- "step": 91
687
- },
688
- {
689
- "epoch": 1.1832797427652733,
690
- "grad_norm": 0.23763220012187958,
691
- "learning_rate": 7.339631544333249e-05,
692
- "loss": 0.6961,
693
- "step": 92
694
- },
695
- {
696
- "epoch": 1.1961414790996785,
697
- "grad_norm": 0.2141312211751938,
698
- "learning_rate": 7.124006725679828e-05,
699
- "loss": 0.6091,
700
- "step": 93
701
- },
702
- {
703
- "epoch": 1.2090032154340835,
704
- "grad_norm": 0.21600951254367828,
705
- "learning_rate": 6.909830056250527e-05,
706
- "loss": 0.6597,
707
- "step": 94
708
- },
709
- {
710
- "epoch": 1.2218649517684887,
711
- "grad_norm": 0.24440795183181763,
712
- "learning_rate": 6.697209380448333e-05,
713
- "loss": 0.7335,
714
- "step": 95
715
- },
716
- {
717
- "epoch": 1.234726688102894,
718
- "grad_norm": 0.23137834668159485,
719
- "learning_rate": 6.486251759186572e-05,
720
- "loss": 0.649,
721
- "step": 96
722
- },
723
- {
724
- "epoch": 1.247588424437299,
725
- "grad_norm": 0.24493689835071564,
726
- "learning_rate": 6.277063415980549e-05,
727
- "loss": 0.7592,
728
- "step": 97
729
- },
730
- {
731
- "epoch": 1.2604501607717042,
732
- "grad_norm": 0.2131170779466629,
733
- "learning_rate": 6.069749683460765e-05,
734
- "loss": 0.6984,
735
- "step": 98
736
- },
737
- {
738
- "epoch": 1.2733118971061093,
739
- "grad_norm": 0.26320311427116394,
740
- "learning_rate": 5.864414950334796e-05,
741
- "loss": 0.7309,
742
- "step": 99
743
- },
744
- {
745
- "epoch": 1.2861736334405145,
746
- "grad_norm": 0.24698734283447266,
747
- "learning_rate": 5.6611626088244194e-05,
748
- "loss": 0.7243,
749
- "step": 100
750
- },
751
- {
752
- "epoch": 1.2861736334405145,
753
- "eval_loss": 0.7675071954727173,
754
- "eval_runtime": 6.8995,
755
- "eval_samples_per_second": 23.77,
756
- "eval_steps_per_second": 2.464,
757
- "step": 100
758
- },
759
- {
760
- "epoch": 1.2990353697749195,
761
- "grad_norm": 0.2751113474369049,
762
- "learning_rate": 5.4600950026045326e-05,
763
- "loss": 0.8179,
764
- "step": 101
765
- },
766
- {
767
- "epoch": 1.3118971061093248,
768
- "grad_norm": 0.24326056241989136,
769
- "learning_rate": 5.261313375270014e-05,
770
- "loss": 0.7323,
771
- "step": 102
772
- },
773
- {
774
- "epoch": 1.32475884244373,
775
- "grad_norm": 0.26097771525382996,
776
- "learning_rate": 5.0649178193565314e-05,
777
- "loss": 0.8192,
778
- "step": 103
779
- },
780
- {
781
- "epoch": 1.337620578778135,
782
- "grad_norm": 0.28192853927612305,
783
- "learning_rate": 4.87100722594094e-05,
784
- "loss": 0.7968,
785
- "step": 104
786
- },
787
- {
788
- "epoch": 1.3504823151125402,
789
- "grad_norm": 0.25281116366386414,
790
- "learning_rate": 4.6796792348466356e-05,
791
- "loss": 0.7235,
792
- "step": 105
793
- },
794
- {
795
- "epoch": 1.3633440514469453,
796
- "grad_norm": 0.24736233055591583,
797
- "learning_rate": 4.491030185478976e-05,
798
- "loss": 0.8161,
799
- "step": 106
800
- },
801
- {
802
- "epoch": 1.3762057877813505,
803
- "grad_norm": 0.22223138809204102,
804
- "learning_rate": 4.305155068315481e-05,
805
- "loss": 0.6488,
806
- "step": 107
807
- },
808
- {
809
- "epoch": 1.3890675241157555,
810
- "grad_norm": 0.2827548682689667,
811
- "learning_rate": 4.12214747707527e-05,
812
- "loss": 0.7366,
813
- "step": 108
814
- },
815
- {
816
- "epoch": 1.4019292604501608,
817
- "grad_norm": 0.2146385759115219,
818
- "learning_rate": 3.942099561591802e-05,
819
- "loss": 0.6998,
820
- "step": 109
821
- },
822
- {
823
- "epoch": 1.414790996784566,
824
- "grad_norm": 0.24802613258361816,
825
- "learning_rate": 3.7651019814126654e-05,
826
- "loss": 0.7389,
827
- "step": 110
828
- },
829
- {
830
- "epoch": 1.427652733118971,
831
- "grad_norm": 0.21664857864379883,
832
- "learning_rate": 3.591243860149759e-05,
833
- "loss": 0.6622,
834
- "step": 111
835
- },
836
- {
837
- "epoch": 1.4405144694533762,
838
- "grad_norm": 0.2532820403575897,
839
- "learning_rate": 3.4206127406028745e-05,
840
- "loss": 0.6946,
841
- "step": 112
842
- },
843
- {
844
- "epoch": 1.4533762057877815,
845
- "grad_norm": 0.2482985109090805,
846
- "learning_rate": 3.253294540679257e-05,
847
- "loss": 0.6731,
848
- "step": 113
849
- },
850
- {
851
- "epoch": 1.4662379421221865,
852
- "grad_norm": 0.2581978440284729,
853
- "learning_rate": 3.089373510131354e-05,
854
- "loss": 0.729,
855
- "step": 114
856
- },
857
- {
858
- "epoch": 1.4790996784565915,
859
- "grad_norm": 0.2761397957801819,
860
- "learning_rate": 2.9289321881345254e-05,
861
- "loss": 0.7888,
862
- "step": 115
863
- },
864
- {
865
- "epoch": 1.4919614147909968,
866
- "grad_norm": 0.2787714898586273,
867
- "learning_rate": 2.7720513617260856e-05,
868
- "loss": 0.7264,
869
- "step": 116
870
- },
871
- {
872
- "epoch": 1.504823151125402,
873
- "grad_norm": 0.24963301420211792,
874
- "learning_rate": 2.6188100251265945e-05,
875
- "loss": 0.6779,
876
- "step": 117
877
- },
878
- {
879
- "epoch": 1.517684887459807,
880
- "grad_norm": 0.27637454867362976,
881
- "learning_rate": 2.4692853399638917e-05,
882
- "loss": 0.7916,
883
- "step": 118
884
- },
885
- {
886
- "epoch": 1.5305466237942122,
887
- "grad_norm": 0.2721528708934784,
888
- "learning_rate": 2.323552596419889e-05,
889
- "loss": 0.8018,
890
- "step": 119
891
- },
892
- {
893
- "epoch": 1.5434083601286175,
894
- "grad_norm": 0.25732940435409546,
895
- "learning_rate": 2.181685175319702e-05,
896
- "loss": 0.8195,
897
- "step": 120
898
- },
899
- {
900
- "epoch": 1.5434083601286175,
901
- "eval_loss": 0.764509916305542,
902
- "eval_runtime": 6.806,
903
- "eval_samples_per_second": 24.097,
904
- "eval_steps_per_second": 2.498,
905
- "step": 120
906
- },
907
- {
908
- "epoch": 1.5562700964630225,
909
- "grad_norm": 0.288256973028183,
910
- "learning_rate": 2.043754511182191e-05,
911
- "loss": 0.8183,
912
- "step": 121
913
- },
914
- {
915
- "epoch": 1.5691318327974275,
916
- "grad_norm": 0.254046231508255,
917
- "learning_rate": 1.9098300562505266e-05,
918
- "loss": 0.6721,
919
- "step": 122
920
- },
921
- {
922
- "epoch": 1.5819935691318328,
923
- "grad_norm": 0.24051472544670105,
924
- "learning_rate": 1.7799792455209018e-05,
925
- "loss": 0.6993,
926
- "step": 123
927
- },
928
- {
929
- "epoch": 1.594855305466238,
930
- "grad_norm": 0.2690548002719879,
931
- "learning_rate": 1.6542674627869737e-05,
932
- "loss": 0.7357,
933
- "step": 124
934
- },
935
- {
936
- "epoch": 1.607717041800643,
937
- "grad_norm": 0.2249222695827484,
938
- "learning_rate": 1.5327580077171587e-05,
939
- "loss": 0.6112,
940
- "step": 125
941
- },
942
- {
943
- "epoch": 1.6205787781350482,
944
- "grad_norm": 0.2525765597820282,
945
- "learning_rate": 1.415512063981339e-05,
946
- "loss": 0.7281,
947
- "step": 126
948
- },
949
- {
950
- "epoch": 1.6334405144694535,
951
- "grad_norm": 0.2448454648256302,
952
- "learning_rate": 1.3025886684430467e-05,
953
- "loss": 0.6699,
954
- "step": 127
955
- },
956
- {
957
- "epoch": 1.6463022508038585,
958
- "grad_norm": 0.27227962017059326,
959
- "learning_rate": 1.19404468143262e-05,
960
- "loss": 0.7431,
961
- "step": 128
962
- },
963
- {
964
- "epoch": 1.6591639871382635,
965
- "grad_norm": 0.26319149136543274,
966
- "learning_rate": 1.0899347581163221e-05,
967
- "loss": 0.6885,
968
- "step": 129
969
- },
970
- {
971
- "epoch": 1.6720257234726688,
972
- "grad_norm": 0.2802058160305023,
973
- "learning_rate": 9.903113209758096e-06,
974
- "loss": 0.7451,
975
- "step": 130
976
- },
977
- {
978
- "epoch": 1.684887459807074,
979
- "grad_norm": 0.23295214772224426,
980
- "learning_rate": 8.952245334118414e-06,
981
- "loss": 0.6393,
982
- "step": 131
983
- },
984
- {
985
- "epoch": 1.697749196141479,
986
- "grad_norm": 0.2382490485906601,
987
- "learning_rate": 8.047222744854943e-06,
988
- "loss": 0.621,
989
- "step": 132
990
- },
991
- {
992
- "epoch": 1.7106109324758842,
993
- "grad_norm": 0.26903268694877625,
994
- "learning_rate": 7.1885011480961164e-06,
995
- "loss": 0.8464,
996
- "step": 133
997
- },
998
- {
999
- "epoch": 1.7234726688102895,
1000
- "grad_norm": 0.22437304258346558,
1001
- "learning_rate": 6.37651293602628e-06,
1002
- "loss": 0.526,
1003
- "step": 134
1004
- },
1005
- {
1006
- "epoch": 1.7363344051446945,
1007
- "grad_norm": 0.2693169414997101,
1008
- "learning_rate": 5.611666969163243e-06,
1009
- "loss": 0.7231,
1010
- "step": 135
1011
- },
1012
- {
1013
- "epoch": 1.7491961414790995,
1014
- "grad_norm": 0.24426168203353882,
1015
- "learning_rate": 4.8943483704846475e-06,
1016
- "loss": 0.6771,
1017
- "step": 136
1018
- },
1019
- {
1020
- "epoch": 1.762057877813505,
1021
- "grad_norm": 0.24735158681869507,
1022
- "learning_rate": 4.224918331506955e-06,
1023
- "loss": 0.7126,
1024
- "step": 137
1025
- },
1026
- {
1027
- "epoch": 1.77491961414791,
1028
- "grad_norm": 0.2898198962211609,
1029
- "learning_rate": 3.6037139304146762e-06,
1030
- "loss": 0.838,
1031
- "step": 138
1032
- },
1033
- {
1034
- "epoch": 1.787781350482315,
1035
- "grad_norm": 0.22501428425312042,
1036
- "learning_rate": 3.0310479623313127e-06,
1037
- "loss": 0.6003,
1038
- "step": 139
1039
- },
1040
- {
1041
- "epoch": 1.8006430868167203,
1042
- "grad_norm": 0.2505525052547455,
1043
- "learning_rate": 2.5072087818176382e-06,
1044
- "loss": 0.6793,
1045
- "step": 140
1046
- },
1047
- {
1048
- "epoch": 1.8006430868167203,
1049
- "eval_loss": 0.7633899450302124,
1050
- "eval_runtime": 6.8979,
1051
- "eval_samples_per_second": 23.775,
1052
- "eval_steps_per_second": 2.465,
1053
- "step": 140
1054
- },
1055
- {
1056
- "epoch": 1.8135048231511255,
1057
- "grad_norm": 0.2789839506149292,
1058
- "learning_rate": 2.032460157676452e-06,
1059
- "loss": 0.6984,
1060
- "step": 141
1061
- },
1062
- {
1063
- "epoch": 1.8263665594855305,
1064
- "grad_norm": 0.2638675272464752,
1065
- "learning_rate": 1.6070411401370334e-06,
1066
- "loss": 0.7894,
1067
- "step": 142
1068
- },
1069
- {
1070
- "epoch": 1.8392282958199357,
1071
- "grad_norm": 0.22497716546058655,
1072
- "learning_rate": 1.231165940486234e-06,
1073
- "loss": 0.6259,
1074
- "step": 143
1075
- },
1076
- {
1077
- "epoch": 1.852090032154341,
1078
- "grad_norm": 0.2598622143268585,
1079
- "learning_rate": 9.0502382320653e-07,
1080
- "loss": 0.7023,
1081
- "step": 144
1082
- },
1083
- {
1084
- "epoch": 1.864951768488746,
1085
- "grad_norm": 0.24743396043777466,
1086
- "learning_rate": 6.287790106757396e-07,
1087
- "loss": 0.7361,
1088
- "step": 145
1089
- },
1090
- {
1091
- "epoch": 1.877813504823151,
1092
- "grad_norm": 0.22825051844120026,
1093
- "learning_rate": 4.025706004760932e-07,
1094
- "loss": 0.6144,
1095
- "step": 146
1096
- },
1097
- {
1098
- "epoch": 1.8906752411575563,
1099
- "grad_norm": 0.24232302606105804,
1100
- "learning_rate": 2.265124953543918e-07,
1101
- "loss": 0.738,
1102
- "step": 147
1103
- },
1104
- {
1105
- "epoch": 1.9035369774919615,
1106
- "grad_norm": 0.23391401767730713,
1107
- "learning_rate": 1.0069334586854107e-07,
1108
- "loss": 0.671,
1109
- "step": 148
1110
- },
1111
- {
1112
- "epoch": 1.9163987138263665,
1113
- "grad_norm": 0.2621341943740845,
1114
- "learning_rate": 2.5176505749346936e-08,
1115
- "loss": 0.7556,
1116
- "step": 149
1117
- },
1118
- {
1119
- "epoch": 1.9292604501607717,
1120
- "grad_norm": 0.30414289236068726,
1121
- "learning_rate": 0.0,
1122
- "loss": 0.8086,
1123
- "step": 150
1124
  }
1125
  ],
1126
  "logging_steps": 1,
@@ -1144,12 +186,12 @@
1144
  "should_evaluate": false,
1145
  "should_log": false,
1146
  "should_save": true,
1147
- "should_training_stop": true
1148
  },
1149
  "attributes": {}
1150
  }
1151
  },
1152
- "total_flos": 3.58265259884544e+16,
1153
  "train_batch_size": 10,
1154
  "trial_name": null,
1155
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.3471006155014038,
3
+ "best_model_checkpoint": "miner_id_besimray/checkpoint-20",
4
+ "epoch": 0.2572347266881029,
5
  "eval_steps": 20,
6
+ "global_step": 20,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.012861736334405145,
13
+ "grad_norm": 14.967682838439941,
14
  "learning_rate": 2e-05,
15
+ "loss": 9.1,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.012861736334405145,
20
+ "eval_loss": 8.996231079101562,
21
+ "eval_runtime": 5.6709,
22
+ "eval_samples_per_second": 28.919,
23
+ "eval_steps_per_second": 2.998,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.02572347266881029,
28
+ "grad_norm": 17.858224868774414,
29
  "learning_rate": 4e-05,
30
+ "loss": 8.9697,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.03858520900321544,
35
+ "grad_norm": 17.052230834960938,
36
  "learning_rate": 6e-05,
37
+ "loss": 9.3384,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.05144694533762058,
42
+ "grad_norm": 15.359251022338867,
43
  "learning_rate": 8e-05,
44
+ "loss": 8.6911,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.06430868167202572,
49
+ "grad_norm": 14.528264045715332,
50
  "learning_rate": 0.0001,
51
+ "loss": 7.9832,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.07717041800643087,
56
+ "grad_norm": 12.933821678161621,
57
  "learning_rate": 0.00012,
58
+ "loss": 6.9516,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.09003215434083602,
63
+ "grad_norm": 11.693599700927734,
64
  "learning_rate": 0.00014,
65
+ "loss": 6.192,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.10289389067524116,
70
+ "grad_norm": 9.695719718933105,
71
  "learning_rate": 0.00016,
72
+ "loss": 5.1497,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.1157556270096463,
77
+ "grad_norm": 11.275097846984863,
78
  "learning_rate": 0.00018,
79
+ "loss": 4.0494,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.12861736334405144,
84
+ "grad_norm": 13.86536693572998,
85
  "learning_rate": 0.0002,
86
+ "loss": 2.8475,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.1414790996784566,
91
+ "grad_norm": 11.746561050415039,
92
  "learning_rate": 0.00019997482349425066,
93
+ "loss": 1.7198,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.15434083601286175,
98
+ "grad_norm": 6.656251907348633,
99
  "learning_rate": 0.00019989930665413147,
100
+ "loss": 0.991,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.16720257234726688,
105
+ "grad_norm": 7.672077655792236,
106
  "learning_rate": 0.0001997734875046456,
107
+ "loss": 0.8849,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.18006430868167203,
112
+ "grad_norm": 5.506864547729492,
113
  "learning_rate": 0.00019959742939952392,
114
+ "loss": 0.5775,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.19292604501607716,
119
+ "grad_norm": 6.015160083770752,
120
  "learning_rate": 0.00019937122098932428,
121
+ "loss": 0.4869,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.2057877813504823,
126
+ "grad_norm": 6.050337314605713,
127
  "learning_rate": 0.00019909497617679348,
128
+ "loss": 0.5208,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.21864951768488747,
133
+ "grad_norm": 5.535327911376953,
134
  "learning_rate": 0.00019876883405951377,
135
+ "loss": 0.5163,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.2315112540192926,
140
+ "grad_norm": 3.005343437194824,
141
  "learning_rate": 0.00019839295885986296,
142
+ "loss": 0.4186,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.24437299035369775,
147
+ "grad_norm": 3.88594388961792,
148
  "learning_rate": 0.00019796753984232358,
149
+ "loss": 0.3806,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.2572347266881029,
154
+ "grad_norm": 3.7315642833709717,
155
  "learning_rate": 0.00019749279121818235,
156
+ "loss": 0.3746,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.2572347266881029,
161
+ "eval_loss": 0.3471006155014038,
162
+ "eval_runtime": 5.7323,
163
+ "eval_samples_per_second": 28.61,
164
+ "eval_steps_per_second": 2.966,
165
  "step": 20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  }
167
  ],
168
  "logging_steps": 1,
 
186
  "should_evaluate": false,
187
  "should_log": false,
188
  "should_save": true,
189
+ "should_training_stop": false
190
  },
191
  "attributes": {}
192
  }
193
  },
194
+ "total_flos": 5141026150809600.0,
195
  "train_batch_size": 10,
196
  "trial_name": null,
197
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36d0f18aea43d7612a7191b41a7eb6bd0de838ab0f1cc6cc2b200f546360c06a
3
  size 6648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a61f177c4f35816461aeee7877425472ed07bd0e989be98a55cef3f50bbeb021
3
  size 6648