besimray commited on
Commit
afc53db
·
verified ·
1 Parent(s): 6e4c05f

Training in progress, step 20, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -21,12 +21,12 @@
21
  "revision": null,
22
  "target_modules": [
23
  "k_proj",
24
- "o_proj",
25
  "gate_proj",
26
  "v_proj",
27
  "down_proj",
28
- "q_proj",
29
- "up_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
21
  "revision": null,
22
  "target_modules": [
23
  "k_proj",
 
24
  "gate_proj",
25
  "v_proj",
26
  "down_proj",
27
+ "o_proj",
28
+ "up_proj",
29
+ "q_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f892944e1d6553c2988900130a3362ea080ce87049af159434348e43983a67f7
3
  size 45118424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c262af3316d92f1e6a59b3f3fce338e187fac9175949a02bc80bdc70674949db
3
  size 45118424
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f294a2b1a687df224adc5ac3e37eb23eab0bfe8458ff9f9b4712852a5997cf77
3
  size 23159290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d7abfd469db57334042c7cf1fda0a239d09983f7d72fbc0bfe26e16575bc765
3
  size 23159290
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fbd0b0d00d8a6ce2af47f7a318c5367a4519b639c67ff4d1f9441e0f3c04db1f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eaefb486716e7cdd53aa7207d6e8d9df693e032d4b9b882b4154f9c46bb10d61
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3002a39ac6502366eefa64e828fe85e0b7d2b42f2ce52a223a7439ad2a05fd9b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c297c5cf11a27c75d9f99f1df69752f78c3ad41b0275adf50cdd1b67f9d0bb3
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 2.8421052631578947,
5
- "eval_steps": 8,
6
- "global_step": 135,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.021052631578947368,
13
- "grad_norm": 0.6863332986831665,
14
  "learning_rate": 2e-05,
15
  "loss": 1.3028,
16
  "step": 1
@@ -18,1084 +18,168 @@
18
  {
19
  "epoch": 0.021052631578947368,
20
  "eval_loss": 1.2579221725463867,
21
- "eval_runtime": 2.0781,
22
- "eval_samples_per_second": 48.122,
23
- "eval_steps_per_second": 4.812,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.042105263157894736,
28
- "grad_norm": 0.6748984456062317,
29
  "learning_rate": 4e-05,
30
  "loss": 1.4572,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.06315789473684211,
35
- "grad_norm": 0.6037639379501343,
36
  "learning_rate": 6e-05,
37
- "loss": 1.2726,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.08421052631578947,
42
- "grad_norm": 0.6334605813026428,
43
  "learning_rate": 8e-05,
44
- "loss": 1.42,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.10526315789473684,
49
- "grad_norm": 0.48335137963294983,
50
  "learning_rate": 0.0001,
51
- "loss": 1.24,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.12631578947368421,
56
- "grad_norm": 0.514263927936554,
57
  "learning_rate": 0.00012,
58
- "loss": 1.3899,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.14736842105263157,
63
- "grad_norm": 0.6221964955329895,
64
  "learning_rate": 0.00014,
65
- "loss": 1.3389,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.16842105263157894,
70
- "grad_norm": 0.4577077031135559,
71
  "learning_rate": 0.00016,
72
- "loss": 1.223,
73
- "step": 8
74
- },
75
- {
76
- "epoch": 0.16842105263157894,
77
- "eval_loss": 1.2106529474258423,
78
- "eval_runtime": 2.1078,
79
- "eval_samples_per_second": 47.443,
80
- "eval_steps_per_second": 4.744,
81
  "step": 8
82
  },
83
  {
84
  "epoch": 0.18947368421052632,
85
- "grad_norm": 0.5334281921386719,
86
  "learning_rate": 0.00018,
87
- "loss": 1.2047,
88
  "step": 9
89
  },
90
  {
91
  "epoch": 0.21052631578947367,
92
- "grad_norm": 0.44930824637413025,
93
  "learning_rate": 0.0002,
94
- "loss": 1.3736,
95
  "step": 10
96
  },
97
  {
98
  "epoch": 0.23157894736842105,
99
- "grad_norm": 0.41816574335098267,
100
  "learning_rate": 0.00019997482349425066,
101
- "loss": 1.1894,
102
  "step": 11
103
  },
104
  {
105
  "epoch": 0.25263157894736843,
106
- "grad_norm": 0.32093414664268494,
107
  "learning_rate": 0.00019989930665413147,
108
- "loss": 1.2164,
109
  "step": 12
110
  },
111
  {
112
  "epoch": 0.2736842105263158,
113
- "grad_norm": 0.31163352727890015,
114
  "learning_rate": 0.0001997734875046456,
115
- "loss": 1.0625,
116
  "step": 13
117
  },
118
  {
119
  "epoch": 0.29473684210526313,
120
- "grad_norm": 0.3636438250541687,
121
  "learning_rate": 0.00019959742939952392,
122
- "loss": 1.0752,
123
  "step": 14
124
  },
125
  {
126
  "epoch": 0.3157894736842105,
127
- "grad_norm": 0.36225762963294983,
128
  "learning_rate": 0.00019937122098932428,
129
- "loss": 1.0046,
130
  "step": 15
131
  },
132
  {
133
  "epoch": 0.3368421052631579,
134
- "grad_norm": 0.30216899514198303,
135
  "learning_rate": 0.00019909497617679348,
136
- "loss": 1.2119,
137
- "step": 16
138
- },
139
- {
140
- "epoch": 0.3368421052631579,
141
- "eval_loss": 1.1756433248519897,
142
- "eval_runtime": 2.0951,
143
- "eval_samples_per_second": 47.731,
144
- "eval_steps_per_second": 4.773,
145
  "step": 16
146
  },
147
  {
148
  "epoch": 0.35789473684210527,
149
- "grad_norm": 0.35788360238075256,
150
  "learning_rate": 0.00019876883405951377,
151
- "loss": 1.2791,
152
  "step": 17
153
  },
154
  {
155
  "epoch": 0.37894736842105264,
156
- "grad_norm": 0.3098278045654297,
157
  "learning_rate": 0.00019839295885986296,
158
- "loss": 1.2221,
159
  "step": 18
160
  },
161
  {
162
  "epoch": 0.4,
163
- "grad_norm": 0.37158146500587463,
164
  "learning_rate": 0.00019796753984232358,
165
- "loss": 1.2627,
166
  "step": 19
167
  },
168
  {
169
  "epoch": 0.42105263157894735,
170
- "grad_norm": 0.3635830581188202,
171
  "learning_rate": 0.00019749279121818235,
172
- "loss": 1.3514,
173
  "step": 20
174
  },
175
  {
176
- "epoch": 0.4421052631578947,
177
- "grad_norm": 0.398816853761673,
178
- "learning_rate": 0.0001969689520376687,
179
- "loss": 1.0967,
180
- "step": 21
181
- },
182
- {
183
- "epoch": 0.4631578947368421,
184
- "grad_norm": 0.30981966853141785,
185
- "learning_rate": 0.00019639628606958533,
186
- "loss": 1.1732,
187
- "step": 22
188
- },
189
- {
190
- "epoch": 0.4842105263157895,
191
- "grad_norm": 0.33735471963882446,
192
- "learning_rate": 0.00019577508166849304,
193
- "loss": 1.2267,
194
- "step": 23
195
- },
196
- {
197
- "epoch": 0.5052631578947369,
198
- "grad_norm": 0.4743439555168152,
199
- "learning_rate": 0.00019510565162951537,
200
- "loss": 1.2174,
201
- "step": 24
202
- },
203
- {
204
- "epoch": 0.5052631578947369,
205
- "eval_loss": 1.1610326766967773,
206
- "eval_runtime": 2.0948,
207
- "eval_samples_per_second": 47.737,
208
- "eval_steps_per_second": 4.774,
209
- "step": 24
210
- },
211
- {
212
- "epoch": 0.5263157894736842,
213
- "grad_norm": 0.3302611708641052,
214
- "learning_rate": 0.00019438833303083678,
215
- "loss": 1.199,
216
- "step": 25
217
- },
218
- {
219
- "epoch": 0.5473684210526316,
220
- "grad_norm": 0.4188154339790344,
221
- "learning_rate": 0.00019362348706397373,
222
- "loss": 1.2427,
223
- "step": 26
224
- },
225
- {
226
- "epoch": 0.5684210526315789,
227
- "grad_norm": 0.40187111496925354,
228
- "learning_rate": 0.0001928114988519039,
229
- "loss": 1.2533,
230
- "step": 27
231
- },
232
- {
233
- "epoch": 0.5894736842105263,
234
- "grad_norm": 0.3812921643257141,
235
- "learning_rate": 0.0001919527772551451,
236
- "loss": 1.2258,
237
- "step": 28
238
- },
239
- {
240
- "epoch": 0.6105263157894737,
241
- "grad_norm": 0.36781439185142517,
242
- "learning_rate": 0.00019104775466588161,
243
- "loss": 1.3239,
244
- "step": 29
245
- },
246
- {
247
- "epoch": 0.631578947368421,
248
- "grad_norm": 0.30295032262802124,
249
- "learning_rate": 0.0001900968867902419,
250
- "loss": 1.1369,
251
- "step": 30
252
- },
253
- {
254
- "epoch": 0.6526315789473685,
255
- "grad_norm": 0.32395803928375244,
256
- "learning_rate": 0.0001891006524188368,
257
- "loss": 1.0771,
258
- "step": 31
259
- },
260
- {
261
- "epoch": 0.6736842105263158,
262
- "grad_norm": 0.30691561102867126,
263
- "learning_rate": 0.0001880595531856738,
264
- "loss": 1.1445,
265
- "step": 32
266
- },
267
- {
268
- "epoch": 0.6736842105263158,
269
- "eval_loss": 1.1577736139297485,
270
- "eval_runtime": 2.099,
271
- "eval_samples_per_second": 47.642,
272
- "eval_steps_per_second": 4.764,
273
- "step": 32
274
- },
275
- {
276
- "epoch": 0.6947368421052632,
277
- "grad_norm": 0.34633180499076843,
278
- "learning_rate": 0.00018697411331556956,
279
- "loss": 1.2347,
280
- "step": 33
281
- },
282
- {
283
- "epoch": 0.7157894736842105,
284
- "grad_norm": 0.41544532775878906,
285
- "learning_rate": 0.00018584487936018661,
286
- "loss": 1.1794,
287
- "step": 34
288
- },
289
- {
290
- "epoch": 0.7368421052631579,
291
- "grad_norm": 0.3669692575931549,
292
- "learning_rate": 0.00018467241992282843,
293
- "loss": 1.1205,
294
- "step": 35
295
- },
296
- {
297
- "epoch": 0.7578947368421053,
298
- "grad_norm": 0.35185304284095764,
299
- "learning_rate": 0.00018345732537213027,
300
- "loss": 1.2297,
301
- "step": 36
302
- },
303
- {
304
- "epoch": 0.7789473684210526,
305
- "grad_norm": 0.3203611671924591,
306
- "learning_rate": 0.00018220020754479102,
307
- "loss": 1.2702,
308
- "step": 37
309
- },
310
- {
311
- "epoch": 0.8,
312
- "grad_norm": 0.35329145193099976,
313
- "learning_rate": 0.00018090169943749476,
314
- "loss": 1.2618,
315
- "step": 38
316
- },
317
- {
318
- "epoch": 0.8210526315789474,
319
- "grad_norm": 0.3242780268192291,
320
- "learning_rate": 0.00017956245488817812,
321
- "loss": 1.1429,
322
- "step": 39
323
- },
324
- {
325
- "epoch": 0.8421052631578947,
326
- "grad_norm": 0.36986637115478516,
327
- "learning_rate": 0.000178183148246803,
328
- "loss": 1.1966,
329
- "step": 40
330
- },
331
- {
332
- "epoch": 0.8421052631578947,
333
- "eval_loss": 1.1517940759658813,
334
- "eval_runtime": 2.1353,
335
- "eval_samples_per_second": 46.832,
336
- "eval_steps_per_second": 4.683,
337
- "step": 40
338
- },
339
- {
340
- "epoch": 0.8631578947368421,
341
- "grad_norm": 0.42866817116737366,
342
- "learning_rate": 0.0001767644740358011,
343
- "loss": 1.2441,
344
- "step": 41
345
- },
346
- {
347
- "epoch": 0.8842105263157894,
348
- "grad_norm": 0.32174167037010193,
349
- "learning_rate": 0.00017530714660036112,
350
- "loss": 1.1713,
351
- "step": 42
352
- },
353
- {
354
- "epoch": 0.9052631578947369,
355
- "grad_norm": 0.3334487974643707,
356
- "learning_rate": 0.00017381189974873407,
357
- "loss": 1.1588,
358
- "step": 43
359
- },
360
- {
361
- "epoch": 0.9263157894736842,
362
- "grad_norm": 0.3190995156764984,
363
- "learning_rate": 0.00017227948638273916,
364
- "loss": 1.1455,
365
- "step": 44
366
- },
367
- {
368
- "epoch": 0.9473684210526315,
369
- "grad_norm": 0.33743467926979065,
370
- "learning_rate": 0.00017071067811865476,
371
- "loss": 1.2266,
372
- "step": 45
373
- },
374
- {
375
- "epoch": 0.968421052631579,
376
- "grad_norm": 0.3349515199661255,
377
- "learning_rate": 0.00016910626489868649,
378
- "loss": 1.1853,
379
- "step": 46
380
- },
381
- {
382
- "epoch": 0.9894736842105263,
383
- "grad_norm": 0.35275840759277344,
384
- "learning_rate": 0.00016746705459320745,
385
- "loss": 1.0376,
386
- "step": 47
387
- },
388
- {
389
- "epoch": 1.0105263157894737,
390
- "grad_norm": 0.3261784315109253,
391
- "learning_rate": 0.00016579387259397127,
392
- "loss": 1.2876,
393
- "step": 48
394
- },
395
- {
396
- "epoch": 1.0105263157894737,
397
- "eval_loss": 1.1508827209472656,
398
- "eval_runtime": 2.1052,
399
- "eval_samples_per_second": 47.501,
400
- "eval_steps_per_second": 4.75,
401
- "step": 48
402
- },
403
- {
404
- "epoch": 1.0315789473684212,
405
- "grad_norm": 0.3690132200717926,
406
- "learning_rate": 0.0001640875613985024,
407
- "loss": 1.079,
408
- "step": 49
409
- },
410
- {
411
- "epoch": 1.0526315789473684,
412
- "grad_norm": 0.42905285954475403,
413
- "learning_rate": 0.00016234898018587337,
414
- "loss": 1.153,
415
- "step": 50
416
- },
417
- {
418
- "epoch": 1.0736842105263158,
419
- "grad_norm": 0.3510225713253021,
420
- "learning_rate": 0.000160579004384082,
421
- "loss": 1.1183,
422
- "step": 51
423
- },
424
- {
425
- "epoch": 1.0947368421052632,
426
- "grad_norm": 0.36225467920303345,
427
- "learning_rate": 0.00015877852522924732,
428
- "loss": 1.0998,
429
- "step": 52
430
- },
431
- {
432
- "epoch": 1.1157894736842104,
433
- "grad_norm": 0.37081998586654663,
434
- "learning_rate": 0.0001569484493168452,
435
- "loss": 1.172,
436
- "step": 53
437
- },
438
- {
439
- "epoch": 1.1368421052631579,
440
- "grad_norm": 0.37817469239234924,
441
- "learning_rate": 0.00015508969814521025,
442
- "loss": 1.1103,
443
- "step": 54
444
- },
445
- {
446
- "epoch": 1.1578947368421053,
447
- "grad_norm": 0.36000335216522217,
448
- "learning_rate": 0.00015320320765153367,
449
- "loss": 1.0199,
450
- "step": 55
451
- },
452
- {
453
- "epoch": 1.1789473684210527,
454
- "grad_norm": 0.34209051728248596,
455
- "learning_rate": 0.00015128992774059063,
456
- "loss": 1.0651,
457
- "step": 56
458
- },
459
- {
460
- "epoch": 1.1789473684210527,
461
- "eval_loss": 1.153894066810608,
462
- "eval_runtime": 2.0943,
463
- "eval_samples_per_second": 47.749,
464
- "eval_steps_per_second": 4.775,
465
- "step": 56
466
- },
467
- {
468
- "epoch": 1.2,
469
- "grad_norm": 0.4330388605594635,
470
- "learning_rate": 0.0001493508218064347,
471
- "loss": 1.0079,
472
- "step": 57
473
- },
474
- {
475
- "epoch": 1.2210526315789474,
476
- "grad_norm": 0.34977588057518005,
477
- "learning_rate": 0.00014738686624729986,
478
- "loss": 1.0271,
479
- "step": 58
480
- },
481
- {
482
- "epoch": 1.2421052631578948,
483
- "grad_norm": 0.4688788652420044,
484
- "learning_rate": 0.00014539904997395468,
485
- "loss": 1.1388,
486
- "step": 59
487
- },
488
- {
489
- "epoch": 1.263157894736842,
490
- "grad_norm": 0.3630085289478302,
491
- "learning_rate": 0.00014338837391175582,
492
- "loss": 1.0998,
493
- "step": 60
494
- },
495
- {
496
- "epoch": 1.2842105263157895,
497
- "grad_norm": 0.4067210853099823,
498
- "learning_rate": 0.00014135585049665207,
499
- "loss": 0.9867,
500
- "step": 61
501
- },
502
- {
503
- "epoch": 1.305263157894737,
504
- "grad_norm": 0.33548006415367126,
505
- "learning_rate": 0.00013930250316539238,
506
- "loss": 0.9863,
507
- "step": 62
508
- },
509
- {
510
- "epoch": 1.3263157894736843,
511
- "grad_norm": 0.4114859402179718,
512
- "learning_rate": 0.00013722936584019453,
513
- "loss": 1.0526,
514
- "step": 63
515
- },
516
- {
517
- "epoch": 1.3473684210526315,
518
- "grad_norm": 0.39736467599868774,
519
- "learning_rate": 0.0001351374824081343,
520
- "loss": 1.1337,
521
- "step": 64
522
- },
523
- {
524
- "epoch": 1.3473684210526315,
525
- "eval_loss": 1.1499197483062744,
526
- "eval_runtime": 2.1122,
527
- "eval_samples_per_second": 47.344,
528
- "eval_steps_per_second": 4.734,
529
- "step": 64
530
- },
531
- {
532
- "epoch": 1.368421052631579,
533
- "grad_norm": 0.33866772055625916,
534
- "learning_rate": 0.00013302790619551674,
535
- "loss": 1.1114,
536
- "step": 65
537
- },
538
- {
539
- "epoch": 1.3894736842105262,
540
- "grad_norm": 0.42472875118255615,
541
- "learning_rate": 0.00013090169943749476,
542
- "loss": 1.0533,
543
- "step": 66
544
- },
545
- {
546
- "epoch": 1.4105263157894736,
547
- "grad_norm": 0.45051443576812744,
548
- "learning_rate": 0.00012875993274320173,
549
- "loss": 1.1449,
550
- "step": 67
551
- },
552
- {
553
- "epoch": 1.431578947368421,
554
- "grad_norm": 0.47155171632766724,
555
- "learning_rate": 0.00012660368455666752,
556
- "loss": 1.1683,
557
- "step": 68
558
- },
559
- {
560
- "epoch": 1.4526315789473685,
561
- "grad_norm": 0.47672173380851746,
562
- "learning_rate": 0.0001244340406137894,
563
- "loss": 1.122,
564
- "step": 69
565
- },
566
- {
567
- "epoch": 1.4736842105263157,
568
- "grad_norm": 0.3632158935070038,
569
- "learning_rate": 0.00012225209339563145,
570
- "loss": 0.9826,
571
- "step": 70
572
- },
573
- {
574
- "epoch": 1.4947368421052631,
575
- "grad_norm": 0.44283154606819153,
576
- "learning_rate": 0.00012005894157832729,
577
- "loss": 1.1671,
578
- "step": 71
579
- },
580
- {
581
- "epoch": 1.5157894736842106,
582
- "grad_norm": 0.45704108476638794,
583
- "learning_rate": 0.00011785568947986367,
584
- "loss": 1.0473,
585
- "step": 72
586
- },
587
- {
588
- "epoch": 1.5157894736842106,
589
- "eval_loss": 1.1517176628112793,
590
- "eval_runtime": 2.1336,
591
- "eval_samples_per_second": 46.869,
592
- "eval_steps_per_second": 4.687,
593
- "step": 72
594
- },
595
- {
596
- "epoch": 1.5368421052631578,
597
- "grad_norm": 0.39218422770500183,
598
- "learning_rate": 0.0001156434465040231,
599
- "loss": 1.1024,
600
- "step": 73
601
- },
602
- {
603
- "epoch": 1.5578947368421052,
604
- "grad_norm": 0.3508377969264984,
605
- "learning_rate": 0.00011342332658176555,
606
- "loss": 0.9808,
607
- "step": 74
608
- },
609
- {
610
- "epoch": 1.5789473684210527,
611
- "grad_norm": 0.3267882466316223,
612
- "learning_rate": 0.00011119644761033078,
613
- "loss": 0.9895,
614
- "step": 75
615
- },
616
- {
617
- "epoch": 1.6,
618
- "grad_norm": 0.41372963786125183,
619
- "learning_rate": 0.00010896393089034336,
620
- "loss": 0.9947,
621
- "step": 76
622
- },
623
- {
624
- "epoch": 1.6210526315789475,
625
- "grad_norm": 0.42969149351119995,
626
- "learning_rate": 0.00010672690056120399,
627
- "loss": 0.9632,
628
- "step": 77
629
- },
630
- {
631
- "epoch": 1.6421052631578947,
632
- "grad_norm": 0.38285690546035767,
633
- "learning_rate": 0.00010448648303505151,
634
- "loss": 1.1273,
635
- "step": 78
636
- },
637
- {
638
- "epoch": 1.663157894736842,
639
- "grad_norm": 0.43110236525535583,
640
- "learning_rate": 0.00010224380642958052,
641
- "loss": 1.1023,
642
- "step": 79
643
- },
644
- {
645
- "epoch": 1.6842105263157894,
646
- "grad_norm": 0.46195274591445923,
647
- "learning_rate": 0.0001,
648
- "loss": 1.0664,
649
- "step": 80
650
- },
651
- {
652
- "epoch": 1.6842105263157894,
653
- "eval_loss": 1.1489697694778442,
654
- "eval_runtime": 2.1184,
655
- "eval_samples_per_second": 47.205,
656
- "eval_steps_per_second": 4.721,
657
- "step": 80
658
- },
659
- {
660
- "epoch": 1.7052631578947368,
661
- "grad_norm": 0.4386035203933716,
662
- "learning_rate": 9.775619357041952e-05,
663
- "loss": 1.1012,
664
- "step": 81
665
- },
666
- {
667
- "epoch": 1.7263157894736842,
668
- "grad_norm": 0.4999752342700958,
669
- "learning_rate": 9.551351696494854e-05,
670
- "loss": 1.1244,
671
- "step": 82
672
- },
673
- {
674
- "epoch": 1.7473684210526317,
675
- "grad_norm": 0.4127891957759857,
676
- "learning_rate": 9.327309943879604e-05,
677
- "loss": 1.187,
678
- "step": 83
679
- },
680
- {
681
- "epoch": 1.768421052631579,
682
- "grad_norm": 0.5349937677383423,
683
- "learning_rate": 9.103606910965666e-05,
684
- "loss": 1.1489,
685
- "step": 84
686
- },
687
- {
688
- "epoch": 1.7894736842105263,
689
- "grad_norm": 0.42807015776634216,
690
- "learning_rate": 8.880355238966923e-05,
691
- "loss": 1.1736,
692
- "step": 85
693
- },
694
- {
695
- "epoch": 1.8105263157894735,
696
- "grad_norm": 0.3887334167957306,
697
- "learning_rate": 8.657667341823448e-05,
698
- "loss": 1.1251,
699
- "step": 86
700
- },
701
- {
702
- "epoch": 1.831578947368421,
703
- "grad_norm": 0.4703119993209839,
704
- "learning_rate": 8.435655349597689e-05,
705
- "loss": 1.369,
706
- "step": 87
707
- },
708
- {
709
- "epoch": 1.8526315789473684,
710
- "grad_norm": 0.5050467252731323,
711
- "learning_rate": 8.214431052013634e-05,
712
- "loss": 0.9705,
713
- "step": 88
714
- },
715
- {
716
- "epoch": 1.8526315789473684,
717
- "eval_loss": 1.1517329216003418,
718
- "eval_runtime": 2.0675,
719
- "eval_samples_per_second": 48.367,
720
- "eval_steps_per_second": 4.837,
721
- "step": 88
722
- },
723
- {
724
- "epoch": 1.8736842105263158,
725
- "grad_norm": 0.48088398575782776,
726
- "learning_rate": 7.994105842167273e-05,
727
- "loss": 1.1485,
728
- "step": 89
729
- },
730
- {
731
- "epoch": 1.8947368421052633,
732
- "grad_norm": 0.5244817137718201,
733
- "learning_rate": 7.774790660436858e-05,
734
- "loss": 1.1301,
735
- "step": 90
736
- },
737
- {
738
- "epoch": 1.9157894736842105,
739
- "grad_norm": 0.5362399220466614,
740
- "learning_rate": 7.556595938621058e-05,
741
- "loss": 1.1488,
742
- "step": 91
743
- },
744
- {
745
- "epoch": 1.936842105263158,
746
- "grad_norm": 0.45146438479423523,
747
- "learning_rate": 7.339631544333249e-05,
748
- "loss": 1.0524,
749
- "step": 92
750
- },
751
- {
752
- "epoch": 1.9578947368421051,
753
- "grad_norm": 0.48216360807418823,
754
- "learning_rate": 7.124006725679828e-05,
755
- "loss": 1.2223,
756
- "step": 93
757
- },
758
- {
759
- "epoch": 1.9789473684210526,
760
- "grad_norm": 0.48500946164131165,
761
- "learning_rate": 6.909830056250527e-05,
762
- "loss": 1.0837,
763
- "step": 94
764
- },
765
- {
766
- "epoch": 2.0,
767
- "grad_norm": 0.46944934129714966,
768
- "learning_rate": 6.697209380448333e-05,
769
- "loss": 1.1183,
770
- "step": 95
771
- },
772
- {
773
- "epoch": 2.0210526315789474,
774
- "grad_norm": 0.4117797017097473,
775
- "learning_rate": 6.486251759186572e-05,
776
- "loss": 1.0669,
777
- "step": 96
778
- },
779
- {
780
- "epoch": 2.0210526315789474,
781
- "eval_loss": 1.1518473625183105,
782
- "eval_runtime": 2.1108,
783
- "eval_samples_per_second": 47.376,
784
- "eval_steps_per_second": 4.738,
785
- "step": 96
786
- },
787
- {
788
- "epoch": 2.042105263157895,
789
- "grad_norm": 0.39091888070106506,
790
- "learning_rate": 6.277063415980549e-05,
791
- "loss": 0.9891,
792
- "step": 97
793
- },
794
- {
795
- "epoch": 2.0631578947368423,
796
- "grad_norm": 0.49795445799827576,
797
- "learning_rate": 6.069749683460765e-05,
798
- "loss": 0.8838,
799
- "step": 98
800
- },
801
- {
802
- "epoch": 2.0842105263157893,
803
- "grad_norm": 0.4604962170124054,
804
- "learning_rate": 5.864414950334796e-05,
805
- "loss": 0.9824,
806
- "step": 99
807
- },
808
- {
809
- "epoch": 2.1052631578947367,
810
- "grad_norm": 0.5574219226837158,
811
- "learning_rate": 5.6611626088244194e-05,
812
- "loss": 1.0056,
813
- "step": 100
814
- },
815
- {
816
- "epoch": 2.126315789473684,
817
- "grad_norm": 0.46602797508239746,
818
- "learning_rate": 5.4600950026045326e-05,
819
- "loss": 0.9943,
820
- "step": 101
821
- },
822
- {
823
- "epoch": 2.1473684210526316,
824
- "grad_norm": 0.464478999376297,
825
- "learning_rate": 5.261313375270014e-05,
826
- "loss": 0.8895,
827
- "step": 102
828
- },
829
- {
830
- "epoch": 2.168421052631579,
831
- "grad_norm": 0.47825688123703003,
832
- "learning_rate": 5.0649178193565314e-05,
833
- "loss": 1.0034,
834
- "step": 103
835
- },
836
- {
837
- "epoch": 2.1894736842105265,
838
- "grad_norm": 0.5426080822944641,
839
- "learning_rate": 4.87100722594094e-05,
840
- "loss": 0.9732,
841
- "step": 104
842
- },
843
- {
844
- "epoch": 2.1894736842105265,
845
- "eval_loss": 1.1610064506530762,
846
- "eval_runtime": 2.0779,
847
- "eval_samples_per_second": 48.125,
848
- "eval_steps_per_second": 4.813,
849
- "step": 104
850
- },
851
- {
852
- "epoch": 2.2105263157894735,
853
- "grad_norm": 0.4391036033630371,
854
- "learning_rate": 4.6796792348466356e-05,
855
- "loss": 0.9018,
856
- "step": 105
857
- },
858
- {
859
- "epoch": 2.231578947368421,
860
- "grad_norm": 0.495150625705719,
861
- "learning_rate": 4.491030185478976e-05,
862
- "loss": 1.0982,
863
- "step": 106
864
- },
865
- {
866
- "epoch": 2.2526315789473683,
867
- "grad_norm": 0.4889540374279022,
868
- "learning_rate": 4.305155068315481e-05,
869
- "loss": 1.1357,
870
- "step": 107
871
- },
872
- {
873
- "epoch": 2.2736842105263158,
874
- "grad_norm": 0.47582054138183594,
875
- "learning_rate": 4.12214747707527e-05,
876
- "loss": 0.8421,
877
- "step": 108
878
- },
879
- {
880
- "epoch": 2.294736842105263,
881
- "grad_norm": 0.43810227513313293,
882
- "learning_rate": 3.942099561591802e-05,
883
- "loss": 1.0096,
884
- "step": 109
885
- },
886
- {
887
- "epoch": 2.3157894736842106,
888
- "grad_norm": 0.5217084884643555,
889
- "learning_rate": 3.7651019814126654e-05,
890
- "loss": 0.9681,
891
- "step": 110
892
- },
893
- {
894
- "epoch": 2.336842105263158,
895
- "grad_norm": 0.5350040793418884,
896
- "learning_rate": 3.591243860149759e-05,
897
- "loss": 0.9163,
898
- "step": 111
899
- },
900
- {
901
- "epoch": 2.3578947368421055,
902
- "grad_norm": 0.4863702654838562,
903
- "learning_rate": 3.4206127406028745e-05,
904
- "loss": 1.1016,
905
- "step": 112
906
- },
907
- {
908
- "epoch": 2.3578947368421055,
909
- "eval_loss": 1.163386583328247,
910
- "eval_runtime": 2.0711,
911
- "eval_samples_per_second": 48.284,
912
- "eval_steps_per_second": 4.828,
913
- "step": 112
914
- },
915
- {
916
- "epoch": 2.3789473684210525,
917
- "grad_norm": 0.4959012269973755,
918
- "learning_rate": 3.253294540679257e-05,
919
- "loss": 1.1242,
920
- "step": 113
921
- },
922
- {
923
- "epoch": 2.4,
924
- "grad_norm": 0.4682742953300476,
925
- "learning_rate": 3.089373510131354e-05,
926
- "loss": 0.8366,
927
- "step": 114
928
- },
929
- {
930
- "epoch": 2.4210526315789473,
931
- "grad_norm": 0.5049096941947937,
932
- "learning_rate": 2.9289321881345254e-05,
933
- "loss": 1.0976,
934
- "step": 115
935
- },
936
- {
937
- "epoch": 2.442105263157895,
938
- "grad_norm": 0.4340517818927765,
939
- "learning_rate": 2.7720513617260856e-05,
940
- "loss": 1.0151,
941
- "step": 116
942
- },
943
- {
944
- "epoch": 2.463157894736842,
945
- "grad_norm": 0.5189387202262878,
946
- "learning_rate": 2.6188100251265945e-05,
947
- "loss": 0.9766,
948
- "step": 117
949
- },
950
- {
951
- "epoch": 2.4842105263157896,
952
- "grad_norm": 0.45461520552635193,
953
- "learning_rate": 2.4692853399638917e-05,
954
- "loss": 1.08,
955
- "step": 118
956
- },
957
- {
958
- "epoch": 2.5052631578947366,
959
- "grad_norm": 0.5745816826820374,
960
- "learning_rate": 2.323552596419889e-05,
961
- "loss": 0.9789,
962
- "step": 119
963
- },
964
- {
965
- "epoch": 2.526315789473684,
966
- "grad_norm": 0.4734479784965515,
967
- "learning_rate": 2.181685175319702e-05,
968
- "loss": 1.108,
969
- "step": 120
970
- },
971
- {
972
- "epoch": 2.526315789473684,
973
- "eval_loss": 1.1624512672424316,
974
- "eval_runtime": 2.0886,
975
- "eval_samples_per_second": 47.879,
976
  "eval_steps_per_second": 4.788,
977
- "step": 120
978
- },
979
- {
980
- "epoch": 2.5473684210526315,
981
- "grad_norm": 0.5734113454818726,
982
- "learning_rate": 2.043754511182191e-05,
983
- "loss": 0.9151,
984
- "step": 121
985
- },
986
- {
987
- "epoch": 2.568421052631579,
988
- "grad_norm": 0.5083211064338684,
989
- "learning_rate": 1.9098300562505266e-05,
990
- "loss": 0.9903,
991
- "step": 122
992
- },
993
- {
994
- "epoch": 2.5894736842105264,
995
- "grad_norm": 0.5377265214920044,
996
- "learning_rate": 1.7799792455209018e-05,
997
- "loss": 1.0774,
998
- "step": 123
999
- },
1000
- {
1001
- "epoch": 2.610526315789474,
1002
- "grad_norm": 0.4219975471496582,
1003
- "learning_rate": 1.6542674627869737e-05,
1004
- "loss": 0.9234,
1005
- "step": 124
1006
- },
1007
- {
1008
- "epoch": 2.6315789473684212,
1009
- "grad_norm": 0.49157968163490295,
1010
- "learning_rate": 1.5327580077171587e-05,
1011
- "loss": 0.9577,
1012
- "step": 125
1013
- },
1014
- {
1015
- "epoch": 2.6526315789473687,
1016
- "grad_norm": 0.4462091326713562,
1017
- "learning_rate": 1.415512063981339e-05,
1018
- "loss": 0.9661,
1019
- "step": 126
1020
- },
1021
- {
1022
- "epoch": 2.6736842105263157,
1023
- "grad_norm": 0.5062934756278992,
1024
- "learning_rate": 1.3025886684430467e-05,
1025
- "loss": 0.9206,
1026
- "step": 127
1027
- },
1028
- {
1029
- "epoch": 2.694736842105263,
1030
- "grad_norm": 0.558468759059906,
1031
- "learning_rate": 1.19404468143262e-05,
1032
- "loss": 1.0424,
1033
- "step": 128
1034
- },
1035
- {
1036
- "epoch": 2.694736842105263,
1037
- "eval_loss": 1.164870023727417,
1038
- "eval_runtime": 2.1075,
1039
- "eval_samples_per_second": 47.449,
1040
- "eval_steps_per_second": 4.745,
1041
- "step": 128
1042
- },
1043
- {
1044
- "epoch": 2.7157894736842105,
1045
- "grad_norm": 0.48067817091941833,
1046
- "learning_rate": 1.0899347581163221e-05,
1047
- "loss": 0.9617,
1048
- "step": 129
1049
- },
1050
- {
1051
- "epoch": 2.736842105263158,
1052
- "grad_norm": 0.6342288255691528,
1053
- "learning_rate": 9.903113209758096e-06,
1054
- "loss": 1.0679,
1055
- "step": 130
1056
- },
1057
- {
1058
- "epoch": 2.7578947368421054,
1059
- "grad_norm": 0.5930253267288208,
1060
- "learning_rate": 8.952245334118414e-06,
1061
- "loss": 0.8819,
1062
- "step": 131
1063
- },
1064
- {
1065
- "epoch": 2.7789473684210524,
1066
- "grad_norm": 0.6247056126594543,
1067
- "learning_rate": 8.047222744854943e-06,
1068
- "loss": 0.991,
1069
- "step": 132
1070
- },
1071
- {
1072
- "epoch": 2.8,
1073
- "grad_norm": 0.5282688736915588,
1074
- "learning_rate": 7.1885011480961164e-06,
1075
- "loss": 0.9508,
1076
- "step": 133
1077
- },
1078
- {
1079
- "epoch": 2.8210526315789473,
1080
- "grad_norm": 0.4279923141002655,
1081
- "learning_rate": 6.37651293602628e-06,
1082
- "loss": 0.9463,
1083
- "step": 134
1084
- },
1085
- {
1086
- "epoch": 2.8421052631578947,
1087
- "grad_norm": 0.4681239426136017,
1088
- "learning_rate": 5.611666969163243e-06,
1089
- "loss": 1.1093,
1090
- "step": 135
1091
  }
1092
  ],
1093
  "logging_steps": 1,
1094
  "max_steps": 150,
1095
  "num_input_tokens_seen": 0,
1096
  "num_train_epochs": 4,
1097
- "save_steps": 5,
1098
  "stateful_callbacks": {
 
 
 
 
 
 
 
 
 
1099
  "TrainerControl": {
1100
  "args": {
1101
  "should_epoch_stop": false,
@@ -1107,7 +191,7 @@
1107
  "attributes": {}
1108
  }
1109
  },
1110
- "total_flos": 1.366908129509376e+16,
1111
  "train_batch_size": 10,
1112
  "trial_name": null,
1113
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.1702154874801636,
3
+ "best_model_checkpoint": "miner_id_besimray/checkpoint-20",
4
+ "epoch": 0.42105263157894735,
5
+ "eval_steps": 20,
6
+ "global_step": 20,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.021052631578947368,
13
+ "grad_norm": 0.7695803046226501,
14
  "learning_rate": 2e-05,
15
  "loss": 1.3028,
16
  "step": 1
 
18
  {
19
  "epoch": 0.021052631578947368,
20
  "eval_loss": 1.2579221725463867,
21
+ "eval_runtime": 2.0651,
22
+ "eval_samples_per_second": 48.423,
23
+ "eval_steps_per_second": 4.842,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.042105263157894736,
28
+ "grad_norm": 0.7731568217277527,
29
  "learning_rate": 4e-05,
30
  "loss": 1.4572,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.06315789473684211,
35
+ "grad_norm": 0.6739473342895508,
36
  "learning_rate": 6e-05,
37
+ "loss": 1.2761,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.08421052631578947,
42
+ "grad_norm": 0.713449239730835,
43
  "learning_rate": 8e-05,
44
+ "loss": 1.4221,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.10526315789473684,
49
+ "grad_norm": 0.5318827629089355,
50
  "learning_rate": 0.0001,
51
+ "loss": 1.2373,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.12631578947368421,
56
+ "grad_norm": 0.5601332783699036,
57
  "learning_rate": 0.00012,
58
+ "loss": 1.3898,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.14736842105263157,
63
+ "grad_norm": 0.6797667741775513,
64
  "learning_rate": 0.00014,
65
+ "loss": 1.3347,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.16842105263157894,
70
+ "grad_norm": 0.5191617012023926,
71
  "learning_rate": 0.00016,
72
+ "loss": 1.2194,
 
 
 
 
 
 
 
 
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.18947368421052632,
77
+ "grad_norm": 0.5978218913078308,
78
  "learning_rate": 0.00018,
79
+ "loss": 1.2025,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.21052631578947367,
84
+ "grad_norm": 0.4920961558818817,
85
  "learning_rate": 0.0002,
86
+ "loss": 1.378,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.23157894736842105,
91
+ "grad_norm": 0.44265127182006836,
92
  "learning_rate": 0.00019997482349425066,
93
+ "loss": 1.1907,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.25263157894736843,
98
+ "grad_norm": 0.3402289152145386,
99
  "learning_rate": 0.00019989930665413147,
100
+ "loss": 1.2153,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.2736842105263158,
105
+ "grad_norm": 0.33481013774871826,
106
  "learning_rate": 0.0001997734875046456,
107
+ "loss": 1.0648,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.29473684210526313,
112
+ "grad_norm": 0.3752918243408203,
113
  "learning_rate": 0.00019959742939952392,
114
+ "loss": 1.0774,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.3157894736842105,
119
+ "grad_norm": 0.37364915013313293,
120
  "learning_rate": 0.00019937122098932428,
121
+ "loss": 1.003,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.3368421052631579,
126
+ "grad_norm": 0.3115549683570862,
127
  "learning_rate": 0.00019909497617679348,
128
+ "loss": 1.2112,
 
 
 
 
 
 
 
 
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.35789473684210527,
133
+ "grad_norm": 0.3663255572319031,
134
  "learning_rate": 0.00019876883405951377,
135
+ "loss": 1.281,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.37894736842105264,
140
+ "grad_norm": 0.325300008058548,
141
  "learning_rate": 0.00019839295885986296,
142
+ "loss": 1.2251,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.4,
147
+ "grad_norm": 0.3866478204727173,
148
  "learning_rate": 0.00019796753984232358,
149
+ "loss": 1.2657,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.42105263157894735,
154
+ "grad_norm": 0.3811936378479004,
155
  "learning_rate": 0.00019749279121818235,
156
+ "loss": 1.3521,
157
  "step": 20
158
  },
159
  {
160
+ "epoch": 0.42105263157894735,
161
+ "eval_loss": 1.1702154874801636,
162
+ "eval_runtime": 2.0888,
163
+ "eval_samples_per_second": 47.875,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  "eval_steps_per_second": 4.788,
165
+ "step": 20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  }
167
  ],
168
  "logging_steps": 1,
169
  "max_steps": 150,
170
  "num_input_tokens_seen": 0,
171
  "num_train_epochs": 4,
172
+ "save_steps": 20,
173
  "stateful_callbacks": {
174
+ "EarlyStoppingCallback": {
175
+ "args": {
176
+ "early_stopping_patience": 3,
177
+ "early_stopping_threshold": 0.0
178
+ },
179
+ "attributes": {
180
+ "early_stopping_patience_counter": 0
181
+ }
182
+ },
183
  "TrainerControl": {
184
  "args": {
185
  "should_epoch_stop": false,
 
191
  "attributes": {}
192
  }
193
  },
194
+ "total_flos": 2033729462599680.0,
195
  "train_batch_size": 10,
196
  "trial_name": null,
197
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ff1d8c6f902797e12d28a21e9a2c84e35270ac0165782c609c1439669b26556
3
  size 6648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4cb7ce651922e8f53dabf2b1364985d613e09d28a1319890e22f5a25dfbce85
3
  size 6648