besimray commited on
Commit
b93abb9
·
verified ·
1 Parent(s): 8e34df6

Training in progress, step 20, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "k_proj",
24
- "gate_proj",
25
  "v_proj",
 
26
  "down_proj",
27
  "o_proj",
28
- "up_proj",
29
- "q_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "q_proj",
24
  "k_proj",
 
25
  "v_proj",
26
+ "gate_proj",
27
  "down_proj",
28
  "o_proj",
29
+ "up_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e36bb4966b3713f17079f0f0073225f3c17789e78598436f125bc5847c546220
3
  size 45118424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bf014d1a50f271c41f7422261b08ca5ec84dc1faabd04c29231ef2836d36445
3
  size 45118424
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31631c7141c9eed8d3d67722b7f007bb55e7b4644efb82e4b7c07b72a46d6b5f
3
  size 23159290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9606721fc8617b61d2e0dc2ec8042ef4e6afb22d62a62595ce9ec2026c0ad30
3
  size 23159290
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:330e765b24011cd6e18b8db74d77f7195e5780a184071a5df72e72c642350c23
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:490faae3574e0545627c6c066345113a5ec4be88337cd4484537a0d75c6be16a
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61c2b4927e3039b26d377375be782c03ce853d193f96b5868ccf559441e84af9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c297c5cf11a27c75d9f99f1df69752f78c3ad41b0275adf50cdd1b67f9d0bb3
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,914 +1,174 @@
1
  {
2
- "best_metric": 1.1519354581832886,
3
- "best_model_checkpoint": "miner_id_besimray/checkpoint-60",
4
- "epoch": 2.526315789473684,
5
  "eval_steps": 20,
6
- "global_step": 120,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.021052631578947368,
13
- "grad_norm": 0.7695803046226501,
14
  "learning_rate": 2e-05,
15
- "loss": 1.3028,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.021052631578947368,
20
- "eval_loss": 1.2579221725463867,
21
- "eval_runtime": 2.0651,
22
- "eval_samples_per_second": 48.423,
23
- "eval_steps_per_second": 4.842,
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.042105263157894736,
28
- "grad_norm": 0.7731568217277527,
29
  "learning_rate": 4e-05,
30
- "loss": 1.4572,
31
  "step": 2
32
  },
33
  {
34
- "epoch": 0.06315789473684211,
35
- "grad_norm": 0.6739473342895508,
36
  "learning_rate": 6e-05,
37
- "loss": 1.2761,
38
  "step": 3
39
  },
40
  {
41
- "epoch": 0.08421052631578947,
42
- "grad_norm": 0.713449239730835,
43
  "learning_rate": 8e-05,
44
- "loss": 1.4221,
45
  "step": 4
46
  },
47
  {
48
- "epoch": 0.10526315789473684,
49
- "grad_norm": 0.5318827629089355,
50
  "learning_rate": 0.0001,
51
- "loss": 1.2373,
52
  "step": 5
53
  },
54
  {
55
- "epoch": 0.12631578947368421,
56
- "grad_norm": 0.5601332783699036,
57
  "learning_rate": 0.00012,
58
- "loss": 1.3898,
59
  "step": 6
60
  },
61
  {
62
- "epoch": 0.14736842105263157,
63
- "grad_norm": 0.6797667741775513,
64
  "learning_rate": 0.00014,
65
- "loss": 1.3347,
66
  "step": 7
67
  },
68
  {
69
- "epoch": 0.16842105263157894,
70
- "grad_norm": 0.5191617012023926,
71
  "learning_rate": 0.00016,
72
- "loss": 1.2194,
73
  "step": 8
74
  },
75
  {
76
- "epoch": 0.18947368421052632,
77
- "grad_norm": 0.5978218913078308,
78
  "learning_rate": 0.00018,
79
- "loss": 1.2025,
80
  "step": 9
81
  },
82
  {
83
- "epoch": 0.21052631578947367,
84
- "grad_norm": 0.4920961558818817,
85
  "learning_rate": 0.0002,
86
- "loss": 1.378,
87
  "step": 10
88
  },
89
  {
90
- "epoch": 0.23157894736842105,
91
- "grad_norm": 0.44265127182006836,
92
  "learning_rate": 0.00019997482349425066,
93
- "loss": 1.1907,
94
  "step": 11
95
  },
96
  {
97
- "epoch": 0.25263157894736843,
98
- "grad_norm": 0.3402289152145386,
99
  "learning_rate": 0.00019989930665413147,
100
- "loss": 1.2153,
101
  "step": 12
102
  },
103
  {
104
- "epoch": 0.2736842105263158,
105
- "grad_norm": 0.33481013774871826,
106
  "learning_rate": 0.0001997734875046456,
107
- "loss": 1.0648,
108
  "step": 13
109
  },
110
  {
111
- "epoch": 0.29473684210526313,
112
- "grad_norm": 0.3752918243408203,
113
  "learning_rate": 0.00019959742939952392,
114
- "loss": 1.0774,
115
  "step": 14
116
  },
117
  {
118
- "epoch": 0.3157894736842105,
119
- "grad_norm": 0.37364915013313293,
120
  "learning_rate": 0.00019937122098932428,
121
- "loss": 1.003,
122
  "step": 15
123
  },
124
  {
125
- "epoch": 0.3368421052631579,
126
- "grad_norm": 0.3115549683570862,
127
  "learning_rate": 0.00019909497617679348,
128
- "loss": 1.2112,
129
  "step": 16
130
  },
131
  {
132
- "epoch": 0.35789473684210527,
133
- "grad_norm": 0.3663255572319031,
134
  "learning_rate": 0.00019876883405951377,
135
- "loss": 1.281,
136
  "step": 17
137
  },
138
  {
139
- "epoch": 0.37894736842105264,
140
- "grad_norm": 0.325300008058548,
141
  "learning_rate": 0.00019839295885986296,
142
- "loss": 1.2251,
143
  "step": 18
144
  },
145
  {
146
- "epoch": 0.4,
147
- "grad_norm": 0.3866478204727173,
148
  "learning_rate": 0.00019796753984232358,
149
- "loss": 1.2657,
150
  "step": 19
151
  },
152
  {
153
- "epoch": 0.42105263157894735,
154
- "grad_norm": 0.3811936378479004,
155
  "learning_rate": 0.00019749279121818235,
156
- "loss": 1.3521,
157
  "step": 20
158
  },
159
  {
160
- "epoch": 0.42105263157894735,
161
- "eval_loss": 1.1702154874801636,
162
- "eval_runtime": 2.0888,
163
- "eval_samples_per_second": 47.875,
164
- "eval_steps_per_second": 4.788,
165
  "step": 20
166
- },
167
- {
168
- "epoch": 0.4421052631578947,
169
- "grad_norm": 0.4274454414844513,
170
- "learning_rate": 0.0001969689520376687,
171
- "loss": 1.0972,
172
- "step": 21
173
- },
174
- {
175
- "epoch": 0.4631578947368421,
176
- "grad_norm": 0.3145941197872162,
177
- "learning_rate": 0.00019639628606958533,
178
- "loss": 1.1746,
179
- "step": 22
180
- },
181
- {
182
- "epoch": 0.4842105263157895,
183
- "grad_norm": 0.34570032358169556,
184
- "learning_rate": 0.00019577508166849304,
185
- "loss": 1.2273,
186
- "step": 23
187
- },
188
- {
189
- "epoch": 0.5052631578947369,
190
- "grad_norm": 0.4847642481327057,
191
- "learning_rate": 0.00019510565162951537,
192
- "loss": 1.2152,
193
- "step": 24
194
- },
195
- {
196
- "epoch": 0.5263157894736842,
197
- "grad_norm": 0.3375917375087738,
198
- "learning_rate": 0.00019438833303083678,
199
- "loss": 1.1977,
200
- "step": 25
201
- },
202
- {
203
- "epoch": 0.5473684210526316,
204
- "grad_norm": 0.42273998260498047,
205
- "learning_rate": 0.00019362348706397373,
206
- "loss": 1.24,
207
- "step": 26
208
- },
209
- {
210
- "epoch": 0.5684210526315789,
211
- "grad_norm": 0.4141988456249237,
212
- "learning_rate": 0.0001928114988519039,
213
- "loss": 1.2562,
214
- "step": 27
215
- },
216
- {
217
- "epoch": 0.5894736842105263,
218
- "grad_norm": 0.382915198802948,
219
- "learning_rate": 0.0001919527772551451,
220
- "loss": 1.226,
221
- "step": 28
222
- },
223
- {
224
- "epoch": 0.6105263157894737,
225
- "grad_norm": 0.37382128834724426,
226
- "learning_rate": 0.00019104775466588161,
227
- "loss": 1.3248,
228
- "step": 29
229
- },
230
- {
231
- "epoch": 0.631578947368421,
232
- "grad_norm": 0.3107808232307434,
233
- "learning_rate": 0.0001900968867902419,
234
- "loss": 1.1354,
235
- "step": 30
236
- },
237
- {
238
- "epoch": 0.6526315789473685,
239
- "grad_norm": 0.34155750274658203,
240
- "learning_rate": 0.0001891006524188368,
241
- "loss": 1.0803,
242
- "step": 31
243
- },
244
- {
245
- "epoch": 0.6736842105263158,
246
- "grad_norm": 0.3141622543334961,
247
- "learning_rate": 0.0001880595531856738,
248
- "loss": 1.1457,
249
- "step": 32
250
- },
251
- {
252
- "epoch": 0.6947368421052632,
253
- "grad_norm": 0.35257869958877563,
254
- "learning_rate": 0.00018697411331556956,
255
- "loss": 1.2322,
256
- "step": 33
257
- },
258
- {
259
- "epoch": 0.7157894736842105,
260
- "grad_norm": 0.42415115237236023,
261
- "learning_rate": 0.00018584487936018661,
262
- "loss": 1.179,
263
- "step": 34
264
- },
265
- {
266
- "epoch": 0.7368421052631579,
267
- "grad_norm": 0.3805026710033417,
268
- "learning_rate": 0.00018467241992282843,
269
- "loss": 1.126,
270
- "step": 35
271
- },
272
- {
273
- "epoch": 0.7578947368421053,
274
- "grad_norm": 0.3650873601436615,
275
- "learning_rate": 0.00018345732537213027,
276
- "loss": 1.2333,
277
- "step": 36
278
- },
279
- {
280
- "epoch": 0.7789473684210526,
281
- "grad_norm": 0.3254134953022003,
282
- "learning_rate": 0.00018220020754479102,
283
- "loss": 1.2721,
284
- "step": 37
285
- },
286
- {
287
- "epoch": 0.8,
288
- "grad_norm": 0.36688175797462463,
289
- "learning_rate": 0.00018090169943749476,
290
- "loss": 1.2615,
291
- "step": 38
292
- },
293
- {
294
- "epoch": 0.8210526315789474,
295
- "grad_norm": 0.3320186734199524,
296
- "learning_rate": 0.00017956245488817812,
297
- "loss": 1.1474,
298
- "step": 39
299
- },
300
- {
301
- "epoch": 0.8421052631578947,
302
- "grad_norm": 0.37685626745224,
303
- "learning_rate": 0.000178183148246803,
304
- "loss": 1.1977,
305
- "step": 40
306
- },
307
- {
308
- "epoch": 0.8421052631578947,
309
- "eval_loss": 1.153311014175415,
310
- "eval_runtime": 2.0851,
311
- "eval_samples_per_second": 47.96,
312
- "eval_steps_per_second": 4.796,
313
- "step": 40
314
- },
315
- {
316
- "epoch": 0.8631578947368421,
317
- "grad_norm": 0.43908432126045227,
318
- "learning_rate": 0.0001767644740358011,
319
- "loss": 1.2448,
320
- "step": 41
321
- },
322
- {
323
- "epoch": 0.8842105263157894,
324
- "grad_norm": 0.3212919235229492,
325
- "learning_rate": 0.00017530714660036112,
326
- "loss": 1.1699,
327
- "step": 42
328
- },
329
- {
330
- "epoch": 0.9052631578947369,
331
- "grad_norm": 0.339679479598999,
332
- "learning_rate": 0.00017381189974873407,
333
- "loss": 1.1572,
334
- "step": 43
335
- },
336
- {
337
- "epoch": 0.9263157894736842,
338
- "grad_norm": 0.3269651234149933,
339
- "learning_rate": 0.00017227948638273916,
340
- "loss": 1.1494,
341
- "step": 44
342
- },
343
- {
344
- "epoch": 0.9473684210526315,
345
- "grad_norm": 0.34232962131500244,
346
- "learning_rate": 0.00017071067811865476,
347
- "loss": 1.2262,
348
- "step": 45
349
- },
350
- {
351
- "epoch": 0.968421052631579,
352
- "grad_norm": 0.34240803122520447,
353
- "learning_rate": 0.00016910626489868649,
354
- "loss": 1.1834,
355
- "step": 46
356
- },
357
- {
358
- "epoch": 0.9894736842105263,
359
- "grad_norm": 0.35933125019073486,
360
- "learning_rate": 0.00016746705459320745,
361
- "loss": 1.0391,
362
- "step": 47
363
- },
364
- {
365
- "epoch": 1.0105263157894737,
366
- "grad_norm": 0.3355937898159027,
367
- "learning_rate": 0.00016579387259397127,
368
- "loss": 1.2872,
369
- "step": 48
370
- },
371
- {
372
- "epoch": 1.0315789473684212,
373
- "grad_norm": 0.3706349730491638,
374
- "learning_rate": 0.0001640875613985024,
375
- "loss": 1.0775,
376
- "step": 49
377
- },
378
- {
379
- "epoch": 1.0526315789473684,
380
- "grad_norm": 0.427852988243103,
381
- "learning_rate": 0.00016234898018587337,
382
- "loss": 1.1524,
383
- "step": 50
384
- },
385
- {
386
- "epoch": 1.0736842105263158,
387
- "grad_norm": 0.3533117473125458,
388
- "learning_rate": 0.000160579004384082,
389
- "loss": 1.1181,
390
- "step": 51
391
- },
392
- {
393
- "epoch": 1.0947368421052632,
394
- "grad_norm": 0.3712696135044098,
395
- "learning_rate": 0.00015877852522924732,
396
- "loss": 1.0986,
397
- "step": 52
398
- },
399
- {
400
- "epoch": 1.1157894736842104,
401
- "grad_norm": 0.3790956437587738,
402
- "learning_rate": 0.0001569484493168452,
403
- "loss": 1.1749,
404
- "step": 53
405
- },
406
- {
407
- "epoch": 1.1368421052631579,
408
- "grad_norm": 0.3779037296772003,
409
- "learning_rate": 0.00015508969814521025,
410
- "loss": 1.1089,
411
- "step": 54
412
- },
413
- {
414
- "epoch": 1.1578947368421053,
415
- "grad_norm": 0.36196696758270264,
416
- "learning_rate": 0.00015320320765153367,
417
- "loss": 1.0186,
418
- "step": 55
419
- },
420
- {
421
- "epoch": 1.1789473684210527,
422
- "grad_norm": 0.3449699282646179,
423
- "learning_rate": 0.00015128992774059063,
424
- "loss": 1.064,
425
- "step": 56
426
- },
427
- {
428
- "epoch": 1.2,
429
- "grad_norm": 0.43372786045074463,
430
- "learning_rate": 0.0001493508218064347,
431
- "loss": 1.007,
432
- "step": 57
433
- },
434
- {
435
- "epoch": 1.2210526315789474,
436
- "grad_norm": 0.35580453276634216,
437
- "learning_rate": 0.00014738686624729986,
438
- "loss": 1.0295,
439
- "step": 58
440
- },
441
- {
442
- "epoch": 1.2421052631578948,
443
- "grad_norm": 0.47308239340782166,
444
- "learning_rate": 0.00014539904997395468,
445
- "loss": 1.1361,
446
- "step": 59
447
- },
448
- {
449
- "epoch": 1.263157894736842,
450
- "grad_norm": 0.3692001402378082,
451
- "learning_rate": 0.00014338837391175582,
452
- "loss": 1.099,
453
- "step": 60
454
- },
455
- {
456
- "epoch": 1.263157894736842,
457
- "eval_loss": 1.1519354581832886,
458
- "eval_runtime": 2.053,
459
- "eval_samples_per_second": 48.71,
460
- "eval_steps_per_second": 4.871,
461
- "step": 60
462
- },
463
- {
464
- "epoch": 1.2842105263157895,
465
- "grad_norm": 0.4118487238883972,
466
- "learning_rate": 0.00014135585049665207,
467
- "loss": 0.9891,
468
- "step": 61
469
- },
470
- {
471
- "epoch": 1.305263157894737,
472
- "grad_norm": 0.32802432775497437,
473
- "learning_rate": 0.00013930250316539238,
474
- "loss": 0.9878,
475
- "step": 62
476
- },
477
- {
478
- "epoch": 1.3263157894736843,
479
- "grad_norm": 0.41467538475990295,
480
- "learning_rate": 0.00013722936584019453,
481
- "loss": 1.0542,
482
- "step": 63
483
- },
484
- {
485
- "epoch": 1.3473684210526315,
486
- "grad_norm": 0.39795804023742676,
487
- "learning_rate": 0.0001351374824081343,
488
- "loss": 1.1358,
489
- "step": 64
490
- },
491
- {
492
- "epoch": 1.368421052631579,
493
- "grad_norm": 0.3385366201400757,
494
- "learning_rate": 0.00013302790619551674,
495
- "loss": 1.1107,
496
- "step": 65
497
- },
498
- {
499
- "epoch": 1.3894736842105262,
500
- "grad_norm": 0.4300186336040497,
501
- "learning_rate": 0.00013090169943749476,
502
- "loss": 1.0554,
503
- "step": 66
504
- },
505
- {
506
- "epoch": 1.4105263157894736,
507
- "grad_norm": 0.4523608982563019,
508
- "learning_rate": 0.00012875993274320173,
509
- "loss": 1.1442,
510
- "step": 67
511
- },
512
- {
513
- "epoch": 1.431578947368421,
514
- "grad_norm": 0.48153308033943176,
515
- "learning_rate": 0.00012660368455666752,
516
- "loss": 1.1677,
517
- "step": 68
518
- },
519
- {
520
- "epoch": 1.4526315789473685,
521
- "grad_norm": 0.46898069977760315,
522
- "learning_rate": 0.0001244340406137894,
523
- "loss": 1.1212,
524
- "step": 69
525
- },
526
- {
527
- "epoch": 1.4736842105263157,
528
- "grad_norm": 0.3733386695384979,
529
- "learning_rate": 0.00012225209339563145,
530
- "loss": 0.9843,
531
- "step": 70
532
- },
533
- {
534
- "epoch": 1.4947368421052631,
535
- "grad_norm": 0.4410829544067383,
536
- "learning_rate": 0.00012005894157832729,
537
- "loss": 1.1679,
538
- "step": 71
539
- },
540
- {
541
- "epoch": 1.5157894736842106,
542
- "grad_norm": 0.46537336707115173,
543
- "learning_rate": 0.00011785568947986367,
544
- "loss": 1.0453,
545
- "step": 72
546
- },
547
- {
548
- "epoch": 1.5368421052631578,
549
- "grad_norm": 0.39270663261413574,
550
- "learning_rate": 0.0001156434465040231,
551
- "loss": 1.1019,
552
- "step": 73
553
- },
554
- {
555
- "epoch": 1.5578947368421052,
556
- "grad_norm": 0.3547813296318054,
557
- "learning_rate": 0.00011342332658176555,
558
- "loss": 0.9807,
559
- "step": 74
560
- },
561
- {
562
- "epoch": 1.5789473684210527,
563
- "grad_norm": 0.33064335584640503,
564
- "learning_rate": 0.00011119644761033078,
565
- "loss": 0.9903,
566
- "step": 75
567
- },
568
- {
569
- "epoch": 1.6,
570
- "grad_norm": 0.41019386053085327,
571
- "learning_rate": 0.00010896393089034336,
572
- "loss": 0.9956,
573
- "step": 76
574
- },
575
- {
576
- "epoch": 1.6210526315789475,
577
- "grad_norm": 0.43731600046157837,
578
- "learning_rate": 0.00010672690056120399,
579
- "loss": 0.9657,
580
- "step": 77
581
- },
582
- {
583
- "epoch": 1.6421052631578947,
584
- "grad_norm": 0.38457056879997253,
585
- "learning_rate": 0.00010448648303505151,
586
- "loss": 1.1255,
587
- "step": 78
588
- },
589
- {
590
- "epoch": 1.663157894736842,
591
- "grad_norm": 0.4372155964374542,
592
- "learning_rate": 0.00010224380642958052,
593
- "loss": 1.105,
594
- "step": 79
595
- },
596
- {
597
- "epoch": 1.6842105263157894,
598
- "grad_norm": 0.4701666533946991,
599
- "learning_rate": 0.0001,
600
- "loss": 1.0658,
601
- "step": 80
602
- },
603
- {
604
- "epoch": 1.6842105263157894,
605
- "eval_loss": 1.152337908744812,
606
- "eval_runtime": 2.0462,
607
- "eval_samples_per_second": 48.871,
608
- "eval_steps_per_second": 4.887,
609
- "step": 80
610
- },
611
- {
612
- "epoch": 1.7052631578947368,
613
- "grad_norm": 0.44070982933044434,
614
- "learning_rate": 9.775619357041952e-05,
615
- "loss": 1.1024,
616
- "step": 81
617
- },
618
- {
619
- "epoch": 1.7263157894736842,
620
- "grad_norm": 0.5059276819229126,
621
- "learning_rate": 9.551351696494854e-05,
622
- "loss": 1.1214,
623
- "step": 82
624
- },
625
- {
626
- "epoch": 1.7473684210526317,
627
- "grad_norm": 0.4155433773994446,
628
- "learning_rate": 9.327309943879604e-05,
629
- "loss": 1.1853,
630
- "step": 83
631
- },
632
- {
633
- "epoch": 1.768421052631579,
634
- "grad_norm": 0.5396384596824646,
635
- "learning_rate": 9.103606910965666e-05,
636
- "loss": 1.1497,
637
- "step": 84
638
- },
639
- {
640
- "epoch": 1.7894736842105263,
641
- "grad_norm": 0.43235623836517334,
642
- "learning_rate": 8.880355238966923e-05,
643
- "loss": 1.1753,
644
- "step": 85
645
- },
646
- {
647
- "epoch": 1.8105263157894735,
648
- "grad_norm": 0.3918503224849701,
649
- "learning_rate": 8.657667341823448e-05,
650
- "loss": 1.1254,
651
- "step": 86
652
- },
653
- {
654
- "epoch": 1.831578947368421,
655
- "grad_norm": 0.4692346155643463,
656
- "learning_rate": 8.435655349597689e-05,
657
- "loss": 1.3666,
658
- "step": 87
659
- },
660
- {
661
- "epoch": 1.8526315789473684,
662
- "grad_norm": 0.4968159794807434,
663
- "learning_rate": 8.214431052013634e-05,
664
- "loss": 0.9668,
665
- "step": 88
666
- },
667
- {
668
- "epoch": 1.8736842105263158,
669
- "grad_norm": 0.4856269061565399,
670
- "learning_rate": 7.994105842167273e-05,
671
- "loss": 1.1482,
672
- "step": 89
673
- },
674
- {
675
- "epoch": 1.8947368421052633,
676
- "grad_norm": 0.5288775563240051,
677
- "learning_rate": 7.774790660436858e-05,
678
- "loss": 1.13,
679
- "step": 90
680
- },
681
- {
682
- "epoch": 1.9157894736842105,
683
- "grad_norm": 0.5403844118118286,
684
- "learning_rate": 7.556595938621058e-05,
685
- "loss": 1.1483,
686
- "step": 91
687
- },
688
- {
689
- "epoch": 1.936842105263158,
690
- "grad_norm": 0.45445382595062256,
691
- "learning_rate": 7.339631544333249e-05,
692
- "loss": 1.0528,
693
- "step": 92
694
- },
695
- {
696
- "epoch": 1.9578947368421051,
697
- "grad_norm": 0.48713403940200806,
698
- "learning_rate": 7.124006725679828e-05,
699
- "loss": 1.2208,
700
- "step": 93
701
- },
702
- {
703
- "epoch": 1.9789473684210526,
704
- "grad_norm": 0.4627130627632141,
705
- "learning_rate": 6.909830056250527e-05,
706
- "loss": 1.0794,
707
- "step": 94
708
- },
709
- {
710
- "epoch": 2.0,
711
- "grad_norm": 0.46807029843330383,
712
- "learning_rate": 6.697209380448333e-05,
713
- "loss": 1.12,
714
- "step": 95
715
- },
716
- {
717
- "epoch": 2.0210526315789474,
718
- "grad_norm": 0.41066575050354004,
719
- "learning_rate": 6.486251759186572e-05,
720
- "loss": 1.0634,
721
- "step": 96
722
- },
723
- {
724
- "epoch": 2.042105263157895,
725
- "grad_norm": 0.3904050886631012,
726
- "learning_rate": 6.277063415980549e-05,
727
- "loss": 0.9888,
728
- "step": 97
729
- },
730
- {
731
- "epoch": 2.0631578947368423,
732
- "grad_norm": 0.49676060676574707,
733
- "learning_rate": 6.069749683460765e-05,
734
- "loss": 0.8783,
735
- "step": 98
736
- },
737
- {
738
- "epoch": 2.0842105263157893,
739
- "grad_norm": 0.46549147367477417,
740
- "learning_rate": 5.864414950334796e-05,
741
- "loss": 0.9815,
742
- "step": 99
743
- },
744
- {
745
- "epoch": 2.1052631578947367,
746
- "grad_norm": 0.5622740387916565,
747
- "learning_rate": 5.6611626088244194e-05,
748
- "loss": 1.0091,
749
- "step": 100
750
- },
751
- {
752
- "epoch": 2.1052631578947367,
753
- "eval_loss": 1.1575236320495605,
754
- "eval_runtime": 2.0589,
755
- "eval_samples_per_second": 48.569,
756
- "eval_steps_per_second": 4.857,
757
- "step": 100
758
- },
759
- {
760
- "epoch": 2.126315789473684,
761
- "grad_norm": 0.47087791562080383,
762
- "learning_rate": 5.4600950026045326e-05,
763
- "loss": 0.994,
764
- "step": 101
765
- },
766
- {
767
- "epoch": 2.1473684210526316,
768
- "grad_norm": 0.46321335434913635,
769
- "learning_rate": 5.261313375270014e-05,
770
- "loss": 0.8965,
771
- "step": 102
772
- },
773
- {
774
- "epoch": 2.168421052631579,
775
- "grad_norm": 0.48722636699676514,
776
- "learning_rate": 5.0649178193565314e-05,
777
- "loss": 1.0028,
778
- "step": 103
779
- },
780
- {
781
- "epoch": 2.1894736842105265,
782
- "grad_norm": 0.5477016568183899,
783
- "learning_rate": 4.87100722594094e-05,
784
- "loss": 0.9755,
785
- "step": 104
786
- },
787
- {
788
- "epoch": 2.2105263157894735,
789
- "grad_norm": 0.43870726227760315,
790
- "learning_rate": 4.6796792348466356e-05,
791
- "loss": 0.9023,
792
- "step": 105
793
- },
794
- {
795
- "epoch": 2.231578947368421,
796
- "grad_norm": 0.4974609911441803,
797
- "learning_rate": 4.491030185478976e-05,
798
- "loss": 1.0978,
799
- "step": 106
800
- },
801
- {
802
- "epoch": 2.2526315789473683,
803
- "grad_norm": 0.48663774132728577,
804
- "learning_rate": 4.305155068315481e-05,
805
- "loss": 1.1326,
806
- "step": 107
807
- },
808
- {
809
- "epoch": 2.2736842105263158,
810
- "grad_norm": 0.47879499197006226,
811
- "learning_rate": 4.12214747707527e-05,
812
- "loss": 0.8403,
813
- "step": 108
814
- },
815
- {
816
- "epoch": 2.294736842105263,
817
- "grad_norm": 0.4391883909702301,
818
- "learning_rate": 3.942099561591802e-05,
819
- "loss": 1.0096,
820
- "step": 109
821
- },
822
- {
823
- "epoch": 2.3157894736842106,
824
- "grad_norm": 0.5225970149040222,
825
- "learning_rate": 3.7651019814126654e-05,
826
- "loss": 0.9684,
827
- "step": 110
828
- },
829
- {
830
- "epoch": 2.336842105263158,
831
- "grad_norm": 0.529344379901886,
832
- "learning_rate": 3.591243860149759e-05,
833
- "loss": 0.9164,
834
- "step": 111
835
- },
836
- {
837
- "epoch": 2.3578947368421055,
838
- "grad_norm": 0.4865782856941223,
839
- "learning_rate": 3.4206127406028745e-05,
840
- "loss": 1.0993,
841
- "step": 112
842
- },
843
- {
844
- "epoch": 2.3789473684210525,
845
- "grad_norm": 0.4908663332462311,
846
- "learning_rate": 3.253294540679257e-05,
847
- "loss": 1.1203,
848
- "step": 113
849
- },
850
- {
851
- "epoch": 2.4,
852
- "grad_norm": 0.4688137471675873,
853
- "learning_rate": 3.089373510131354e-05,
854
- "loss": 0.8358,
855
- "step": 114
856
- },
857
- {
858
- "epoch": 2.4210526315789473,
859
- "grad_norm": 0.5007145404815674,
860
- "learning_rate": 2.9289321881345254e-05,
861
- "loss": 1.0975,
862
- "step": 115
863
- },
864
- {
865
- "epoch": 2.442105263157895,
866
- "grad_norm": 0.4280741214752197,
867
- "learning_rate": 2.7720513617260856e-05,
868
- "loss": 1.0134,
869
- "step": 116
870
- },
871
- {
872
- "epoch": 2.463157894736842,
873
- "grad_norm": 0.5474169850349426,
874
- "learning_rate": 2.6188100251265945e-05,
875
- "loss": 0.9781,
876
- "step": 117
877
- },
878
- {
879
- "epoch": 2.4842105263157896,
880
- "grad_norm": 0.4554167091846466,
881
- "learning_rate": 2.4692853399638917e-05,
882
- "loss": 1.082,
883
- "step": 118
884
- },
885
- {
886
- "epoch": 2.5052631578947366,
887
- "grad_norm": 0.5812304615974426,
888
- "learning_rate": 2.323552596419889e-05,
889
- "loss": 0.9826,
890
- "step": 119
891
- },
892
- {
893
- "epoch": 2.526315789473684,
894
- "grad_norm": 0.4756172001361847,
895
- "learning_rate": 2.181685175319702e-05,
896
- "loss": 1.1045,
897
- "step": 120
898
- },
899
- {
900
- "epoch": 2.526315789473684,
901
- "eval_loss": 1.1679396629333496,
902
- "eval_runtime": 2.0595,
903
- "eval_samples_per_second": 48.555,
904
- "eval_steps_per_second": 4.856,
905
- "step": 120
906
  }
907
  ],
908
  "logging_steps": 1,
909
  "max_steps": 150,
910
  "num_input_tokens_seen": 0,
911
- "num_train_epochs": 4,
912
  "save_steps": 20,
913
  "stateful_callbacks": {
914
  "EarlyStoppingCallback": {
@@ -917,7 +177,7 @@
917
  "early_stopping_threshold": 0.0
918
  },
919
  "attributes": {
920
- "early_stopping_patience_counter": 3
921
  }
922
  },
923
  "TrainerControl": {
@@ -926,12 +186,12 @@
926
  "should_evaluate": false,
927
  "should_log": false,
928
  "should_save": true,
929
- "should_training_stop": true
930
  },
931
  "attributes": {}
932
  }
933
  },
934
- "total_flos": 1.214189411500032e+16,
935
  "train_batch_size": 10,
936
  "trial_name": null,
937
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.8169272541999817,
3
+ "best_model_checkpoint": "miner_id_besimray/checkpoint-20",
4
+ "epoch": 0.2572347266881029,
5
  "eval_steps": 20,
6
+ "global_step": 20,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.012861736334405145,
13
+ "grad_norm": 0.5877827405929565,
14
  "learning_rate": 2e-05,
15
+ "loss": 0.9595,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.012861736334405145,
20
+ "eval_loss": 0.9745924472808838,
21
+ "eval_runtime": 5.6911,
22
+ "eval_samples_per_second": 28.817,
23
+ "eval_steps_per_second": 2.987,
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.02572347266881029,
28
+ "grad_norm": 0.8297654986381531,
29
  "learning_rate": 4e-05,
30
+ "loss": 0.9813,
31
  "step": 2
32
  },
33
  {
34
+ "epoch": 0.03858520900321544,
35
+ "grad_norm": 0.8195751905441284,
36
  "learning_rate": 6e-05,
37
+ "loss": 1.0,
38
  "step": 3
39
  },
40
  {
41
+ "epoch": 0.05144694533762058,
42
+ "grad_norm": 0.6416041851043701,
43
  "learning_rate": 8e-05,
44
+ "loss": 0.8282,
45
  "step": 4
46
  },
47
  {
48
+ "epoch": 0.06430868167202572,
49
+ "grad_norm": 0.7084880471229553,
50
  "learning_rate": 0.0001,
51
+ "loss": 1.0059,
52
  "step": 5
53
  },
54
  {
55
+ "epoch": 0.07717041800643087,
56
+ "grad_norm": 0.4804805517196655,
57
  "learning_rate": 0.00012,
58
+ "loss": 0.8269,
59
  "step": 6
60
  },
61
  {
62
+ "epoch": 0.09003215434083602,
63
+ "grad_norm": 0.4058835804462433,
64
  "learning_rate": 0.00014,
65
+ "loss": 0.9492,
66
  "step": 7
67
  },
68
  {
69
+ "epoch": 0.10289389067524116,
70
+ "grad_norm": 0.3298371434211731,
71
  "learning_rate": 0.00016,
72
+ "loss": 0.8769,
73
  "step": 8
74
  },
75
  {
76
+ "epoch": 0.1157556270096463,
77
+ "grad_norm": 0.2648942470550537,
78
  "learning_rate": 0.00018,
79
+ "loss": 0.8254,
80
  "step": 9
81
  },
82
  {
83
+ "epoch": 0.12861736334405144,
84
+ "grad_norm": 0.22385652363300323,
85
  "learning_rate": 0.0002,
86
+ "loss": 0.8403,
87
  "step": 10
88
  },
89
  {
90
+ "epoch": 0.1414790996784566,
91
+ "grad_norm": 0.38447538018226624,
92
  "learning_rate": 0.00019997482349425066,
93
+ "loss": 0.8706,
94
  "step": 11
95
  },
96
  {
97
+ "epoch": 0.15434083601286175,
98
+ "grad_norm": 0.41497623920440674,
99
  "learning_rate": 0.00019989930665413147,
100
+ "loss": 0.8532,
101
  "step": 12
102
  },
103
  {
104
+ "epoch": 0.16720257234726688,
105
+ "grad_norm": 0.38628965616226196,
106
  "learning_rate": 0.0001997734875046456,
107
+ "loss": 0.7667,
108
  "step": 13
109
  },
110
  {
111
+ "epoch": 0.18006430868167203,
112
+ "grad_norm": 0.29869189858436584,
113
  "learning_rate": 0.00019959742939952392,
114
+ "loss": 0.8069,
115
  "step": 14
116
  },
117
  {
118
+ "epoch": 0.19292604501607716,
119
+ "grad_norm": 0.3177284002304077,
120
  "learning_rate": 0.00019937122098932428,
121
+ "loss": 0.7532,
122
  "step": 15
123
  },
124
  {
125
+ "epoch": 0.2057877813504823,
126
+ "grad_norm": 0.3028796315193176,
127
  "learning_rate": 0.00019909497617679348,
128
+ "loss": 0.7722,
129
  "step": 16
130
  },
131
  {
132
+ "epoch": 0.21864951768488747,
133
+ "grad_norm": 0.31497061252593994,
134
  "learning_rate": 0.00019876883405951377,
135
+ "loss": 0.8742,
136
  "step": 17
137
  },
138
  {
139
+ "epoch": 0.2315112540192926,
140
+ "grad_norm": 0.26743173599243164,
141
  "learning_rate": 0.00019839295885986296,
142
+ "loss": 0.7541,
143
  "step": 18
144
  },
145
  {
146
+ "epoch": 0.24437299035369775,
147
+ "grad_norm": 0.2908126711845398,
148
  "learning_rate": 0.00019796753984232358,
149
+ "loss": 0.9543,
150
  "step": 19
151
  },
152
  {
153
+ "epoch": 0.2572347266881029,
154
+ "grad_norm": 0.26621854305267334,
155
  "learning_rate": 0.00019749279121818235,
156
+ "loss": 0.9359,
157
  "step": 20
158
  },
159
  {
160
+ "epoch": 0.2572347266881029,
161
+ "eval_loss": 0.8169272541999817,
162
+ "eval_runtime": 5.7691,
163
+ "eval_samples_per_second": 28.427,
164
+ "eval_steps_per_second": 2.947,
165
  "step": 20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  }
167
  ],
168
  "logging_steps": 1,
169
  "max_steps": 150,
170
  "num_input_tokens_seen": 0,
171
+ "num_train_epochs": 2,
172
  "save_steps": 20,
173
  "stateful_callbacks": {
174
  "EarlyStoppingCallback": {
 
177
  "early_stopping_threshold": 0.0
178
  },
179
  "attributes": {
180
+ "early_stopping_patience_counter": 0
181
  }
182
  },
183
  "TrainerControl": {
 
186
  "should_evaluate": false,
187
  "should_log": false,
188
  "should_save": true,
189
+ "should_training_stop": false
190
  },
191
  "attributes": {}
192
  }
193
  },
194
+ "total_flos": 5141026150809600.0,
195
  "train_batch_size": 10,
196
  "trial_name": null,
197
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4cb7ce651922e8f53dabf2b1364985d613e09d28a1319890e22f5a25dfbce85
3
  size 6648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccb34079e4accf483c3a38a7fbb5ed53ad4dbca33ee39f87bdede0297b6d0cff
3
  size 6648