baby-dev commited on
Commit
9731301
·
verified ·
1 Parent(s): 05cfb99

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "k_proj",
24
- "up_proj",
25
- "gate_proj",
26
  "down_proj",
27
- "v_proj",
28
  "o_proj",
29
- "q_proj"
 
 
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
 
 
23
  "down_proj",
24
+ "k_proj",
25
  "o_proj",
26
+ "gate_proj",
27
+ "up_proj",
28
+ "q_proj",
29
+ "v_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:468b063969066b67543986fcdfd9612e1cf13bd6f9ed9c8cb089fa8deef7fbbd
3
  size 70430032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c52e8f1a03ce2c954a7d92823200ce003f5d41ca43efc1a7ea8466f10a30f7a
3
  size 70430032
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:802bb23f2e39ef6455fe99fb42b6fc5e3f5b93e4b39a391d6ac0d7800d92e4dd
3
  size 36135892
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb40e53669851a6edcbcb0a9a9ca0685636ac1ee03cf8aacc62c4c68e06b79c3
3
  size 36135892
last-checkpoint/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 2.1631815433502197,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-80",
4
  "epoch": 0.03385168729503861,
5
  "eval_steps": 20,
6
  "global_step": 80,
@@ -10,7 +10,7 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0004231460911879826,
13
- "grad_norm": 1.6116182804107666,
14
  "learning_rate": 2e-05,
15
  "loss": 2.4891,
16
  "step": 1
@@ -18,594 +18,594 @@
18
  {
19
  "epoch": 0.0004231460911879826,
20
  "eval_loss": 2.7103111743927,
21
- "eval_runtime": 76.0744,
22
- "eval_samples_per_second": 26.159,
23
- "eval_steps_per_second": 6.546,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.0008462921823759652,
28
- "grad_norm": 1.6491059064865112,
29
  "learning_rate": 4e-05,
30
  "loss": 2.6718,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.0012694382735639479,
35
- "grad_norm": 1.6981528997421265,
36
  "learning_rate": 6e-05,
37
- "loss": 2.647,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.0016925843647519305,
42
- "grad_norm": 1.639736294746399,
43
  "learning_rate": 8e-05,
44
- "loss": 2.6702,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.002115730455939913,
49
- "grad_norm": 1.5359561443328857,
50
  "learning_rate": 0.0001,
51
- "loss": 2.5932,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.0025388765471278957,
56
- "grad_norm": 1.455960750579834,
57
  "learning_rate": 0.00012,
58
- "loss": 2.6167,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.0029620226383158784,
63
- "grad_norm": 1.323106050491333,
64
  "learning_rate": 0.00014,
65
- "loss": 2.6387,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.003385168729503861,
70
- "grad_norm": 1.3717588186264038,
71
  "learning_rate": 0.00016,
72
- "loss": 2.5911,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.003808314820691844,
77
- "grad_norm": 1.4253047704696655,
78
  "learning_rate": 0.00018,
79
- "loss": 2.55,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.004231460911879826,
84
- "grad_norm": 1.5064291954040527,
85
  "learning_rate": 0.0002,
86
- "loss": 2.5373,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.004654607003067809,
91
- "grad_norm": 1.5042705535888672,
92
  "learning_rate": 0.00019989930665413147,
93
- "loss": 2.5484,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.0050777530942557915,
98
- "grad_norm": 1.4497551918029785,
99
  "learning_rate": 0.00019959742939952392,
100
- "loss": 2.4952,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.005500899185443774,
105
- "grad_norm": 1.5038385391235352,
106
  "learning_rate": 0.00019909497617679348,
107
- "loss": 2.3651,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.005924045276631757,
112
- "grad_norm": 1.4070632457733154,
113
  "learning_rate": 0.00019839295885986296,
114
- "loss": 2.3963,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.006347191367819739,
119
- "grad_norm": 1.349345326423645,
120
  "learning_rate": 0.00019749279121818235,
121
- "loss": 2.4433,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.006770337459007722,
126
- "grad_norm": 1.36220383644104,
127
  "learning_rate": 0.00019639628606958533,
128
- "loss": 2.4385,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.0071934835501957055,
133
- "grad_norm": 1.3120527267456055,
134
  "learning_rate": 0.00019510565162951537,
135
- "loss": 2.3923,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.007616629641383688,
140
- "grad_norm": 1.2771496772766113,
141
  "learning_rate": 0.00019362348706397373,
142
- "loss": 2.3667,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.00803977573257167,
147
- "grad_norm": 1.2408928871154785,
148
  "learning_rate": 0.0001919527772551451,
149
- "loss": 2.2521,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.008462921823759652,
154
- "grad_norm": 1.263331413269043,
155
  "learning_rate": 0.0001900968867902419,
156
- "loss": 2.3208,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.008462921823759652,
161
- "eval_loss": 2.3450167179107666,
162
- "eval_runtime": 59.4058,
163
- "eval_samples_per_second": 33.498,
164
- "eval_steps_per_second": 8.383,
165
  "step": 20
166
  },
167
  {
168
  "epoch": 0.008886067914947636,
169
- "grad_norm": 1.3622525930404663,
170
  "learning_rate": 0.0001880595531856738,
171
- "loss": 2.3444,
172
  "step": 21
173
  },
174
  {
175
  "epoch": 0.009309214006135618,
176
- "grad_norm": 1.3455640077590942,
177
  "learning_rate": 0.00018584487936018661,
178
- "loss": 2.4493,
179
  "step": 22
180
  },
181
  {
182
  "epoch": 0.009732360097323601,
183
- "grad_norm": 1.4010404348373413,
184
  "learning_rate": 0.00018345732537213027,
185
- "loss": 2.4324,
186
  "step": 23
187
  },
188
  {
189
  "epoch": 0.010155506188511583,
190
- "grad_norm": 1.306706428527832,
191
  "learning_rate": 0.00018090169943749476,
192
- "loss": 2.3131,
193
  "step": 24
194
  },
195
  {
196
  "epoch": 0.010578652279699566,
197
- "grad_norm": 1.194472074508667,
198
  "learning_rate": 0.000178183148246803,
199
- "loss": 2.3262,
200
  "step": 25
201
  },
202
  {
203
  "epoch": 0.011001798370887548,
204
- "grad_norm": 1.2795559167861938,
205
  "learning_rate": 0.00017530714660036112,
206
- "loss": 2.2353,
207
  "step": 26
208
  },
209
  {
210
  "epoch": 0.011424944462075532,
211
- "grad_norm": 1.4027005434036255,
212
  "learning_rate": 0.00017227948638273916,
213
- "loss": 2.2406,
214
  "step": 27
215
  },
216
  {
217
  "epoch": 0.011848090553263513,
218
- "grad_norm": 1.3928605318069458,
219
  "learning_rate": 0.00016910626489868649,
220
- "loss": 2.3127,
221
  "step": 28
222
  },
223
  {
224
  "epoch": 0.012271236644451497,
225
- "grad_norm": 1.5246529579162598,
226
  "learning_rate": 0.00016579387259397127,
227
- "loss": 2.3343,
228
  "step": 29
229
  },
230
  {
231
  "epoch": 0.012694382735639479,
232
- "grad_norm": 1.4026535749435425,
233
  "learning_rate": 0.00016234898018587337,
234
- "loss": 2.3008,
235
  "step": 30
236
  },
237
  {
238
  "epoch": 0.013117528826827462,
239
- "grad_norm": 1.375996708869934,
240
  "learning_rate": 0.00015877852522924732,
241
- "loss": 2.26,
242
  "step": 31
243
  },
244
  {
245
  "epoch": 0.013540674918015444,
246
- "grad_norm": 1.3354617357254028,
247
  "learning_rate": 0.00015508969814521025,
248
- "loss": 2.3904,
249
  "step": 32
250
  },
251
  {
252
  "epoch": 0.013963821009203427,
253
- "grad_norm": 1.3204755783081055,
254
  "learning_rate": 0.00015128992774059063,
255
- "loss": 2.2757,
256
  "step": 33
257
  },
258
  {
259
  "epoch": 0.014386967100391411,
260
- "grad_norm": 1.7939963340759277,
261
  "learning_rate": 0.00014738686624729986,
262
- "loss": 2.2381,
263
  "step": 34
264
  },
265
  {
266
  "epoch": 0.014810113191579393,
267
- "grad_norm": 1.3174601793289185,
268
  "learning_rate": 0.00014338837391175582,
269
- "loss": 2.1787,
270
  "step": 35
271
  },
272
  {
273
  "epoch": 0.015233259282767376,
274
- "grad_norm": 1.3664051294326782,
275
  "learning_rate": 0.00013930250316539238,
276
- "loss": 2.1824,
277
  "step": 36
278
  },
279
  {
280
  "epoch": 0.015656405373955358,
281
- "grad_norm": 1.3687667846679688,
282
  "learning_rate": 0.0001351374824081343,
283
- "loss": 2.2669,
284
  "step": 37
285
  },
286
  {
287
  "epoch": 0.01607955146514334,
288
- "grad_norm": 1.2880141735076904,
289
  "learning_rate": 0.00013090169943749476,
290
- "loss": 2.2264,
291
  "step": 38
292
  },
293
  {
294
  "epoch": 0.016502697556331325,
295
- "grad_norm": 1.4410102367401123,
296
  "learning_rate": 0.00012660368455666752,
297
- "loss": 2.217,
298
  "step": 39
299
  },
300
  {
301
  "epoch": 0.016925843647519305,
302
- "grad_norm": 1.3260499238967896,
303
  "learning_rate": 0.00012225209339563145,
304
- "loss": 2.1896,
305
  "step": 40
306
  },
307
  {
308
  "epoch": 0.016925843647519305,
309
- "eval_loss": 2.213444709777832,
310
- "eval_runtime": 76.1355,
311
- "eval_samples_per_second": 26.138,
312
- "eval_steps_per_second": 6.541,
313
  "step": 40
314
  },
315
  {
316
  "epoch": 0.01734898973870729,
317
- "grad_norm": 1.311227798461914,
318
  "learning_rate": 0.00011785568947986367,
319
- "loss": 2.2835,
320
  "step": 41
321
  },
322
  {
323
  "epoch": 0.017772135829895272,
324
- "grad_norm": 5.271040916442871,
325
  "learning_rate": 0.00011342332658176555,
326
- "loss": 2.3215,
327
  "step": 42
328
  },
329
  {
330
  "epoch": 0.018195281921083255,
331
- "grad_norm": 1.3460105657577515,
332
  "learning_rate": 0.00010896393089034336,
333
- "loss": 2.1808,
334
  "step": 43
335
  },
336
  {
337
  "epoch": 0.018618428012271235,
338
- "grad_norm": 1.4446858167648315,
339
  "learning_rate": 0.00010448648303505151,
340
- "loss": 2.2533,
341
  "step": 44
342
  },
343
  {
344
  "epoch": 0.01904157410345922,
345
- "grad_norm": 1.5081716775894165,
346
  "learning_rate": 0.0001,
347
- "loss": 2.1724,
348
  "step": 45
349
  },
350
  {
351
  "epoch": 0.019464720194647202,
352
- "grad_norm": 1.2462027072906494,
353
  "learning_rate": 9.551351696494854e-05,
354
- "loss": 2.1896,
355
  "step": 46
356
  },
357
  {
358
  "epoch": 0.019887866285835186,
359
- "grad_norm": 1.4999518394470215,
360
  "learning_rate": 9.103606910965666e-05,
361
- "loss": 2.3309,
362
  "step": 47
363
  },
364
  {
365
  "epoch": 0.020311012377023166,
366
- "grad_norm": 1.250173807144165,
367
  "learning_rate": 8.657667341823448e-05,
368
- "loss": 2.1242,
369
  "step": 48
370
  },
371
  {
372
  "epoch": 0.02073415846821115,
373
- "grad_norm": 5.076053142547607,
374
  "learning_rate": 8.214431052013634e-05,
375
- "loss": 2.3353,
376
  "step": 49
377
  },
378
  {
379
  "epoch": 0.021157304559399133,
380
- "grad_norm": 1.4977030754089355,
381
  "learning_rate": 7.774790660436858e-05,
382
- "loss": 2.3957,
383
  "step": 50
384
  },
385
  {
386
  "epoch": 0.021580450650587116,
387
- "grad_norm": 1.3641694784164429,
388
  "learning_rate": 7.339631544333249e-05,
389
- "loss": 2.1407,
390
  "step": 51
391
  },
392
  {
393
  "epoch": 0.022003596741775096,
394
- "grad_norm": 1.596494197845459,
395
  "learning_rate": 6.909830056250527e-05,
396
- "loss": 2.1233,
397
  "step": 52
398
  },
399
  {
400
  "epoch": 0.02242674283296308,
401
- "grad_norm": 1.3014681339263916,
402
  "learning_rate": 6.486251759186572e-05,
403
- "loss": 2.0812,
404
  "step": 53
405
  },
406
  {
407
  "epoch": 0.022849888924151063,
408
- "grad_norm": 1.2006770372390747,
409
  "learning_rate": 6.069749683460765e-05,
410
- "loss": 2.171,
411
  "step": 54
412
  },
413
  {
414
  "epoch": 0.023273035015339047,
415
- "grad_norm": 1.1349421739578247,
416
  "learning_rate": 5.6611626088244194e-05,
417
- "loss": 2.0624,
418
  "step": 55
419
  },
420
  {
421
  "epoch": 0.023696181106527027,
422
- "grad_norm": 1.238204002380371,
423
  "learning_rate": 5.261313375270014e-05,
424
- "loss": 2.2335,
425
  "step": 56
426
  },
427
  {
428
  "epoch": 0.02411932719771501,
429
- "grad_norm": 1.2505645751953125,
430
  "learning_rate": 4.87100722594094e-05,
431
- "loss": 2.2071,
432
  "step": 57
433
  },
434
  {
435
  "epoch": 0.024542473288902994,
436
- "grad_norm": 1.2006800174713135,
437
  "learning_rate": 4.491030185478976e-05,
438
- "loss": 2.0714,
439
  "step": 58
440
  },
441
  {
442
  "epoch": 0.024965619380090977,
443
- "grad_norm": 1.3659069538116455,
444
  "learning_rate": 4.12214747707527e-05,
445
- "loss": 2.1511,
446
  "step": 59
447
  },
448
  {
449
  "epoch": 0.025388765471278957,
450
- "grad_norm": 1.116264820098877,
451
  "learning_rate": 3.7651019814126654e-05,
452
- "loss": 2.1554,
453
  "step": 60
454
  },
455
  {
456
  "epoch": 0.025388765471278957,
457
- "eval_loss": 2.1694796085357666,
458
- "eval_runtime": 71.9131,
459
- "eval_samples_per_second": 27.672,
460
- "eval_steps_per_second": 6.925,
461
  "step": 60
462
  },
463
  {
464
  "epoch": 0.02581191156246694,
465
- "grad_norm": 1.1517850160598755,
466
  "learning_rate": 3.4206127406028745e-05,
467
- "loss": 2.2314,
468
  "step": 61
469
  },
470
  {
471
  "epoch": 0.026235057653654924,
472
- "grad_norm": 1.0954445600509644,
473
  "learning_rate": 3.089373510131354e-05,
474
- "loss": 2.1453,
475
  "step": 62
476
  },
477
  {
478
  "epoch": 0.026658203744842908,
479
- "grad_norm": 1.129459261894226,
480
  "learning_rate": 2.7720513617260856e-05,
481
- "loss": 2.1496,
482
  "step": 63
483
  },
484
  {
485
  "epoch": 0.027081349836030888,
486
- "grad_norm": 1.2924052476882935,
487
  "learning_rate": 2.4692853399638917e-05,
488
- "loss": 2.1504,
489
  "step": 64
490
  },
491
  {
492
  "epoch": 0.02750449592721887,
493
- "grad_norm": 1.1182539463043213,
494
  "learning_rate": 2.181685175319702e-05,
495
  "loss": 2.0713,
496
  "step": 65
497
  },
498
  {
499
  "epoch": 0.027927642018406855,
500
- "grad_norm": 1.2294206619262695,
501
  "learning_rate": 1.9098300562505266e-05,
502
- "loss": 2.1622,
503
  "step": 66
504
  },
505
  {
506
  "epoch": 0.02835078810959484,
507
- "grad_norm": 1.191117763519287,
508
  "learning_rate": 1.6542674627869737e-05,
509
- "loss": 2.208,
510
  "step": 67
511
  },
512
  {
513
  "epoch": 0.028773934200782822,
514
- "grad_norm": 1.1100726127624512,
515
  "learning_rate": 1.415512063981339e-05,
516
- "loss": 2.1405,
517
  "step": 68
518
  },
519
  {
520
  "epoch": 0.029197080291970802,
521
- "grad_norm": 1.0611038208007812,
522
  "learning_rate": 1.19404468143262e-05,
523
- "loss": 2.1244,
524
  "step": 69
525
  },
526
  {
527
  "epoch": 0.029620226383158785,
528
- "grad_norm": 1.193991780281067,
529
  "learning_rate": 9.903113209758096e-06,
530
- "loss": 2.2719,
531
  "step": 70
532
  },
533
  {
534
  "epoch": 0.03004337247434677,
535
- "grad_norm": 1.1647319793701172,
536
  "learning_rate": 8.047222744854943e-06,
537
- "loss": 2.1554,
538
  "step": 71
539
  },
540
  {
541
  "epoch": 0.030466518565534752,
542
- "grad_norm": 1.1668955087661743,
543
  "learning_rate": 6.37651293602628e-06,
544
- "loss": 2.1259,
545
  "step": 72
546
  },
547
  {
548
  "epoch": 0.030889664656722732,
549
- "grad_norm": 1.1693888902664185,
550
  "learning_rate": 4.8943483704846475e-06,
551
- "loss": 2.2087,
552
  "step": 73
553
  },
554
  {
555
  "epoch": 0.031312810747910716,
556
- "grad_norm": 1.1683961153030396,
557
  "learning_rate": 3.6037139304146762e-06,
558
- "loss": 2.0883,
559
  "step": 74
560
  },
561
  {
562
  "epoch": 0.031735956839098696,
563
- "grad_norm": 1.1831532716751099,
564
  "learning_rate": 2.5072087818176382e-06,
565
- "loss": 2.1605,
566
  "step": 75
567
  },
568
  {
569
  "epoch": 0.03215910293028668,
570
- "grad_norm": 1.237900972366333,
571
  "learning_rate": 1.6070411401370334e-06,
572
- "loss": 2.1349,
573
  "step": 76
574
  },
575
  {
576
  "epoch": 0.03258224902147466,
577
- "grad_norm": 1.3579156398773193,
578
  "learning_rate": 9.0502382320653e-07,
579
- "loss": 2.166,
580
  "step": 77
581
  },
582
  {
583
  "epoch": 0.03300539511266265,
584
- "grad_norm": 1.1819465160369873,
585
  "learning_rate": 4.025706004760932e-07,
586
- "loss": 2.2146,
587
  "step": 78
588
  },
589
  {
590
  "epoch": 0.03342854120385063,
591
- "grad_norm": 1.3723968267440796,
592
  "learning_rate": 1.0069334586854107e-07,
593
- "loss": 2.2131,
594
  "step": 79
595
  },
596
  {
597
  "epoch": 0.03385168729503861,
598
- "grad_norm": 1.183445930480957,
599
  "learning_rate": 0.0,
600
- "loss": 2.266,
601
  "step": 80
602
  },
603
  {
604
  "epoch": 0.03385168729503861,
605
- "eval_loss": 2.1631815433502197,
606
- "eval_runtime": 63.4411,
607
- "eval_samples_per_second": 31.368,
608
- "eval_steps_per_second": 7.85,
609
  "step": 80
610
  }
611
  ],
@@ -613,17 +613,8 @@
613
  "max_steps": 80,
614
  "num_input_tokens_seen": 0,
615
  "num_train_epochs": 1,
616
- "save_steps": 20,
617
  "stateful_callbacks": {
618
- "EarlyStoppingCallback": {
619
- "args": {
620
- "early_stopping_patience": 5,
621
- "early_stopping_threshold": 0.0
622
- },
623
- "attributes": {
624
- "early_stopping_patience_counter": 0
625
- }
626
- },
627
  "TrainerControl": {
628
  "args": {
629
  "should_epoch_stop": false,
 
1
  {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
  "epoch": 0.03385168729503861,
5
  "eval_steps": 20,
6
  "global_step": 80,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0004231460911879826,
13
+ "grad_norm": 1.6379058361053467,
14
  "learning_rate": 2e-05,
15
  "loss": 2.4891,
16
  "step": 1
 
18
  {
19
  "epoch": 0.0004231460911879826,
20
  "eval_loss": 2.7103111743927,
21
+ "eval_runtime": 63.2579,
22
+ "eval_samples_per_second": 31.459,
23
+ "eval_steps_per_second": 7.873,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.0008462921823759652,
28
+ "grad_norm": 1.6534020900726318,
29
  "learning_rate": 4e-05,
30
  "loss": 2.6718,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.0012694382735639479,
35
+ "grad_norm": 1.7026811838150024,
36
  "learning_rate": 6e-05,
37
+ "loss": 2.6471,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.0016925843647519305,
42
+ "grad_norm": 1.6745214462280273,
43
  "learning_rate": 8e-05,
44
+ "loss": 2.6711,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.002115730455939913,
49
+ "grad_norm": 1.5553100109100342,
50
  "learning_rate": 0.0001,
51
+ "loss": 2.5945,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.0025388765471278957,
56
+ "grad_norm": 1.4465041160583496,
57
  "learning_rate": 0.00012,
58
+ "loss": 2.6169,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.0029620226383158784,
63
+ "grad_norm": 1.3366378545761108,
64
  "learning_rate": 0.00014,
65
+ "loss": 2.6392,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.003385168729503861,
70
+ "grad_norm": 1.3913536071777344,
71
  "learning_rate": 0.00016,
72
+ "loss": 2.5899,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.003808314820691844,
77
+ "grad_norm": 1.4674654006958008,
78
  "learning_rate": 0.00018,
79
+ "loss": 2.5501,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.004231460911879826,
84
+ "grad_norm": 1.5482183694839478,
85
  "learning_rate": 0.0002,
86
+ "loss": 2.5371,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.004654607003067809,
91
+ "grad_norm": 1.5878816843032837,
92
  "learning_rate": 0.00019989930665413147,
93
+ "loss": 2.55,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.0050777530942557915,
98
+ "grad_norm": 1.4205892086029053,
99
  "learning_rate": 0.00019959742939952392,
100
+ "loss": 2.4969,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.005500899185443774,
105
+ "grad_norm": 1.4903907775878906,
106
  "learning_rate": 0.00019909497617679348,
107
+ "loss": 2.3645,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.005924045276631757,
112
+ "grad_norm": 1.3923956155776978,
113
  "learning_rate": 0.00019839295885986296,
114
+ "loss": 2.3951,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.006347191367819739,
119
+ "grad_norm": 1.3330631256103516,
120
  "learning_rate": 0.00019749279121818235,
121
+ "loss": 2.4397,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.006770337459007722,
126
+ "grad_norm": 1.3482651710510254,
127
  "learning_rate": 0.00019639628606958533,
128
+ "loss": 2.4354,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.0071934835501957055,
133
+ "grad_norm": 1.3089728355407715,
134
  "learning_rate": 0.00019510565162951537,
135
+ "loss": 2.3894,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.007616629641383688,
140
+ "grad_norm": 1.2744436264038086,
141
  "learning_rate": 0.00019362348706397373,
142
+ "loss": 2.3651,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.00803977573257167,
147
+ "grad_norm": 1.23836350440979,
148
  "learning_rate": 0.0001919527772551451,
149
+ "loss": 2.2486,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.008462921823759652,
154
+ "grad_norm": 1.2774382829666138,
155
  "learning_rate": 0.0001900968867902419,
156
+ "loss": 2.3211,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.008462921823759652,
161
+ "eval_loss": 2.3435285091400146,
162
+ "eval_runtime": 72.0517,
163
+ "eval_samples_per_second": 27.619,
164
+ "eval_steps_per_second": 6.912,
165
  "step": 20
166
  },
167
  {
168
  "epoch": 0.008886067914947636,
169
+ "grad_norm": 1.351232886314392,
170
  "learning_rate": 0.0001880595531856738,
171
+ "loss": 2.3451,
172
  "step": 21
173
  },
174
  {
175
  "epoch": 0.009309214006135618,
176
+ "grad_norm": 1.3749635219573975,
177
  "learning_rate": 0.00018584487936018661,
178
+ "loss": 2.4502,
179
  "step": 22
180
  },
181
  {
182
  "epoch": 0.009732360097323601,
183
+ "grad_norm": 1.4125924110412598,
184
  "learning_rate": 0.00018345732537213027,
185
+ "loss": 2.4342,
186
  "step": 23
187
  },
188
  {
189
  "epoch": 0.010155506188511583,
190
+ "grad_norm": 1.326550841331482,
191
  "learning_rate": 0.00018090169943749476,
192
+ "loss": 2.3139,
193
  "step": 24
194
  },
195
  {
196
  "epoch": 0.010578652279699566,
197
+ "grad_norm": 1.1940902471542358,
198
  "learning_rate": 0.000178183148246803,
199
+ "loss": 2.3268,
200
  "step": 25
201
  },
202
  {
203
  "epoch": 0.011001798370887548,
204
+ "grad_norm": 1.294498085975647,
205
  "learning_rate": 0.00017530714660036112,
206
+ "loss": 2.2343,
207
  "step": 26
208
  },
209
  {
210
  "epoch": 0.011424944462075532,
211
+ "grad_norm": 1.4341365098953247,
212
  "learning_rate": 0.00017227948638273916,
213
+ "loss": 2.2412,
214
  "step": 27
215
  },
216
  {
217
  "epoch": 0.011848090553263513,
218
+ "grad_norm": 1.4379862546920776,
219
  "learning_rate": 0.00016910626489868649,
220
+ "loss": 2.3088,
221
  "step": 28
222
  },
223
  {
224
  "epoch": 0.012271236644451497,
225
+ "grad_norm": 1.807862401008606,
226
  "learning_rate": 0.00016579387259397127,
227
+ "loss": 2.3386,
228
  "step": 29
229
  },
230
  {
231
  "epoch": 0.012694382735639479,
232
+ "grad_norm": 1.4435659646987915,
233
  "learning_rate": 0.00016234898018587337,
234
+ "loss": 2.2997,
235
  "step": 30
236
  },
237
  {
238
  "epoch": 0.013117528826827462,
239
+ "grad_norm": 1.380933403968811,
240
  "learning_rate": 0.00015877852522924732,
241
+ "loss": 2.2587,
242
  "step": 31
243
  },
244
  {
245
  "epoch": 0.013540674918015444,
246
+ "grad_norm": 1.3532695770263672,
247
  "learning_rate": 0.00015508969814521025,
248
+ "loss": 2.389,
249
  "step": 32
250
  },
251
  {
252
  "epoch": 0.013963821009203427,
253
+ "grad_norm": 1.3885740041732788,
254
  "learning_rate": 0.00015128992774059063,
255
+ "loss": 2.2765,
256
  "step": 33
257
  },
258
  {
259
  "epoch": 0.014386967100391411,
260
+ "grad_norm": 1.6315644979476929,
261
  "learning_rate": 0.00014738686624729986,
262
+ "loss": 2.2364,
263
  "step": 34
264
  },
265
  {
266
  "epoch": 0.014810113191579393,
267
+ "grad_norm": 1.5489228963851929,
268
  "learning_rate": 0.00014338837391175582,
269
+ "loss": 2.1861,
270
  "step": 35
271
  },
272
  {
273
  "epoch": 0.015233259282767376,
274
+ "grad_norm": 1.4338537454605103,
275
  "learning_rate": 0.00013930250316539238,
276
+ "loss": 2.1853,
277
  "step": 36
278
  },
279
  {
280
  "epoch": 0.015656405373955358,
281
+ "grad_norm": 1.3952263593673706,
282
  "learning_rate": 0.0001351374824081343,
283
+ "loss": 2.2732,
284
  "step": 37
285
  },
286
  {
287
  "epoch": 0.01607955146514334,
288
+ "grad_norm": 1.3308148384094238,
289
  "learning_rate": 0.00013090169943749476,
290
+ "loss": 2.2248,
291
  "step": 38
292
  },
293
  {
294
  "epoch": 0.016502697556331325,
295
+ "grad_norm": 1.477274775505066,
296
  "learning_rate": 0.00012660368455666752,
297
+ "loss": 2.2161,
298
  "step": 39
299
  },
300
  {
301
  "epoch": 0.016925843647519305,
302
+ "grad_norm": 1.3079551458358765,
303
  "learning_rate": 0.00012225209339563145,
304
+ "loss": 2.1925,
305
  "step": 40
306
  },
307
  {
308
  "epoch": 0.016925843647519305,
309
+ "eval_loss": 2.211884021759033,
310
+ "eval_runtime": 75.9809,
311
+ "eval_samples_per_second": 26.191,
312
+ "eval_steps_per_second": 6.554,
313
  "step": 40
314
  },
315
  {
316
  "epoch": 0.01734898973870729,
317
+ "grad_norm": 1.243667483329773,
318
  "learning_rate": 0.00011785568947986367,
319
+ "loss": 2.282,
320
  "step": 41
321
  },
322
  {
323
  "epoch": 0.017772135829895272,
324
+ "grad_norm": 5.15204381942749,
325
  "learning_rate": 0.00011342332658176555,
326
+ "loss": 2.3092,
327
  "step": 42
328
  },
329
  {
330
  "epoch": 0.018195281921083255,
331
+ "grad_norm": 1.3127551078796387,
332
  "learning_rate": 0.00010896393089034336,
333
+ "loss": 2.185,
334
  "step": 43
335
  },
336
  {
337
  "epoch": 0.018618428012271235,
338
+ "grad_norm": 1.3681222200393677,
339
  "learning_rate": 0.00010448648303505151,
340
+ "loss": 2.2499,
341
  "step": 44
342
  },
343
  {
344
  "epoch": 0.01904157410345922,
345
+ "grad_norm": 1.3882651329040527,
346
  "learning_rate": 0.0001,
347
+ "loss": 2.168,
348
  "step": 45
349
  },
350
  {
351
  "epoch": 0.019464720194647202,
352
+ "grad_norm": 1.2424044609069824,
353
  "learning_rate": 9.551351696494854e-05,
354
+ "loss": 2.1857,
355
  "step": 46
356
  },
357
  {
358
  "epoch": 0.019887866285835186,
359
+ "grad_norm": 1.449005365371704,
360
  "learning_rate": 9.103606910965666e-05,
361
+ "loss": 2.3204,
362
  "step": 47
363
  },
364
  {
365
  "epoch": 0.020311012377023166,
366
+ "grad_norm": 1.2062491178512573,
367
  "learning_rate": 8.657667341823448e-05,
368
+ "loss": 2.1174,
369
  "step": 48
370
  },
371
  {
372
  "epoch": 0.02073415846821115,
373
+ "grad_norm": 4.882323741912842,
374
  "learning_rate": 8.214431052013634e-05,
375
+ "loss": 2.3162,
376
  "step": 49
377
  },
378
  {
379
  "epoch": 0.021157304559399133,
380
+ "grad_norm": 1.4830926656723022,
381
  "learning_rate": 7.774790660436858e-05,
382
+ "loss": 2.4003,
383
  "step": 50
384
  },
385
  {
386
  "epoch": 0.021580450650587116,
387
+ "grad_norm": 1.2459235191345215,
388
  "learning_rate": 7.339631544333249e-05,
389
+ "loss": 2.1419,
390
  "step": 51
391
  },
392
  {
393
  "epoch": 0.022003596741775096,
394
+ "grad_norm": 1.3951412439346313,
395
  "learning_rate": 6.909830056250527e-05,
396
+ "loss": 2.114,
397
  "step": 52
398
  },
399
  {
400
  "epoch": 0.02242674283296308,
401
+ "grad_norm": 1.2715801000595093,
402
  "learning_rate": 6.486251759186572e-05,
403
+ "loss": 2.0807,
404
  "step": 53
405
  },
406
  {
407
  "epoch": 0.022849888924151063,
408
+ "grad_norm": 1.1952708959579468,
409
  "learning_rate": 6.069749683460765e-05,
410
+ "loss": 2.1682,
411
  "step": 54
412
  },
413
  {
414
  "epoch": 0.023273035015339047,
415
+ "grad_norm": 1.1247831583023071,
416
  "learning_rate": 5.6611626088244194e-05,
417
+ "loss": 2.062,
418
  "step": 55
419
  },
420
  {
421
  "epoch": 0.023696181106527027,
422
+ "grad_norm": 1.2911661863327026,
423
  "learning_rate": 5.261313375270014e-05,
424
+ "loss": 2.234,
425
  "step": 56
426
  },
427
  {
428
  "epoch": 0.02411932719771501,
429
+ "grad_norm": 1.1771955490112305,
430
  "learning_rate": 4.87100722594094e-05,
431
+ "loss": 2.2042,
432
  "step": 57
433
  },
434
  {
435
  "epoch": 0.024542473288902994,
436
+ "grad_norm": 1.1919941902160645,
437
  "learning_rate": 4.491030185478976e-05,
438
+ "loss": 2.0736,
439
  "step": 58
440
  },
441
  {
442
  "epoch": 0.024965619380090977,
443
+ "grad_norm": 1.3543719053268433,
444
  "learning_rate": 4.12214747707527e-05,
445
+ "loss": 2.1515,
446
  "step": 59
447
  },
448
  {
449
  "epoch": 0.025388765471278957,
450
+ "grad_norm": 1.123665452003479,
451
  "learning_rate": 3.7651019814126654e-05,
452
+ "loss": 2.1566,
453
  "step": 60
454
  },
455
  {
456
  "epoch": 0.025388765471278957,
457
+ "eval_loss": 2.16865873336792,
458
+ "eval_runtime": 25.383,
459
+ "eval_samples_per_second": 78.399,
460
+ "eval_steps_per_second": 19.619,
461
  "step": 60
462
  },
463
  {
464
  "epoch": 0.02581191156246694,
465
+ "grad_norm": 1.132775068283081,
466
  "learning_rate": 3.4206127406028745e-05,
467
+ "loss": 2.2303,
468
  "step": 61
469
  },
470
  {
471
  "epoch": 0.026235057653654924,
472
+ "grad_norm": 1.1115837097167969,
473
  "learning_rate": 3.089373510131354e-05,
474
+ "loss": 2.1469,
475
  "step": 62
476
  },
477
  {
478
  "epoch": 0.026658203744842908,
479
+ "grad_norm": 1.1179817914962769,
480
  "learning_rate": 2.7720513617260856e-05,
481
+ "loss": 2.1541,
482
  "step": 63
483
  },
484
  {
485
  "epoch": 0.027081349836030888,
486
+ "grad_norm": 1.3277086019515991,
487
  "learning_rate": 2.4692853399638917e-05,
488
+ "loss": 2.1514,
489
  "step": 64
490
  },
491
  {
492
  "epoch": 0.02750449592721887,
493
+ "grad_norm": 1.1748079061508179,
494
  "learning_rate": 2.181685175319702e-05,
495
  "loss": 2.0713,
496
  "step": 65
497
  },
498
  {
499
  "epoch": 0.027927642018406855,
500
+ "grad_norm": 1.210524559020996,
501
  "learning_rate": 1.9098300562505266e-05,
502
+ "loss": 2.1612,
503
  "step": 66
504
  },
505
  {
506
  "epoch": 0.02835078810959484,
507
+ "grad_norm": 1.1560407876968384,
508
  "learning_rate": 1.6542674627869737e-05,
509
+ "loss": 2.2039,
510
  "step": 67
511
  },
512
  {
513
  "epoch": 0.028773934200782822,
514
+ "grad_norm": 1.1386845111846924,
515
  "learning_rate": 1.415512063981339e-05,
516
+ "loss": 2.1411,
517
  "step": 68
518
  },
519
  {
520
  "epoch": 0.029197080291970802,
521
+ "grad_norm": 1.0700095891952515,
522
  "learning_rate": 1.19404468143262e-05,
523
+ "loss": 2.1263,
524
  "step": 69
525
  },
526
  {
527
  "epoch": 0.029620226383158785,
528
+ "grad_norm": 1.1910362243652344,
529
  "learning_rate": 9.903113209758096e-06,
530
+ "loss": 2.27,
531
  "step": 70
532
  },
533
  {
534
  "epoch": 0.03004337247434677,
535
+ "grad_norm": 1.1575465202331543,
536
  "learning_rate": 8.047222744854943e-06,
537
+ "loss": 2.156,
538
  "step": 71
539
  },
540
  {
541
  "epoch": 0.030466518565534752,
542
+ "grad_norm": 1.1447267532348633,
543
  "learning_rate": 6.37651293602628e-06,
544
+ "loss": 2.1257,
545
  "step": 72
546
  },
547
  {
548
  "epoch": 0.030889664656722732,
549
+ "grad_norm": 1.1838666200637817,
550
  "learning_rate": 4.8943483704846475e-06,
551
+ "loss": 2.2088,
552
  "step": 73
553
  },
554
  {
555
  "epoch": 0.031312810747910716,
556
+ "grad_norm": 1.1437898874282837,
557
  "learning_rate": 3.6037139304146762e-06,
558
+ "loss": 2.0872,
559
  "step": 74
560
  },
561
  {
562
  "epoch": 0.031735956839098696,
563
+ "grad_norm": 1.1745190620422363,
564
  "learning_rate": 2.5072087818176382e-06,
565
+ "loss": 2.1591,
566
  "step": 75
567
  },
568
  {
569
  "epoch": 0.03215910293028668,
570
+ "grad_norm": 1.2256075143814087,
571
  "learning_rate": 1.6070411401370334e-06,
572
+ "loss": 2.1358,
573
  "step": 76
574
  },
575
  {
576
  "epoch": 0.03258224902147466,
577
+ "grad_norm": 1.3967663049697876,
578
  "learning_rate": 9.0502382320653e-07,
579
+ "loss": 2.1687,
580
  "step": 77
581
  },
582
  {
583
  "epoch": 0.03300539511266265,
584
+ "grad_norm": 1.205739140510559,
585
  "learning_rate": 4.025706004760932e-07,
586
+ "loss": 2.2164,
587
  "step": 78
588
  },
589
  {
590
  "epoch": 0.03342854120385063,
591
+ "grad_norm": 1.2898608446121216,
592
  "learning_rate": 1.0069334586854107e-07,
593
+ "loss": 2.2109,
594
  "step": 79
595
  },
596
  {
597
  "epoch": 0.03385168729503861,
598
+ "grad_norm": 1.1767884492874146,
599
  "learning_rate": 0.0,
600
+ "loss": 2.2616,
601
  "step": 80
602
  },
603
  {
604
  "epoch": 0.03385168729503861,
605
+ "eval_loss": 2.162363290786743,
606
+ "eval_runtime": 25.534,
607
+ "eval_samples_per_second": 77.935,
608
+ "eval_steps_per_second": 19.503,
609
  "step": 80
610
  }
611
  ],
 
613
  "max_steps": 80,
614
  "num_input_tokens_seen": 0,
615
  "num_train_epochs": 1,
616
+ "save_steps": 500,
617
  "stateful_callbacks": {
 
 
 
 
 
 
 
 
 
618
  "TrainerControl": {
619
  "args": {
620
  "should_epoch_stop": false,
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36cfa28a6f86c549badab2f7c7fad8fc6532e38818f78a59f1cbd0cf0f8aa692
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ac4240e40fdb363a1e7a56a6e1d5a6e0b2e75612d81fb46e828164eba22a34a
3
  size 6776