error577 commited on
Commit
1fc1d6d
·
verified ·
1 Parent(s): e08c940

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -10,23 +10,23 @@
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
- "lora_alpha": 128,
14
  "lora_dropout": 0.15,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
- "r": 64,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
 
23
  "gate_proj",
24
  "o_proj",
25
- "q_proj",
26
- "k_proj",
27
  "up_proj",
28
  "v_proj",
29
- "down_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
+ "lora_alpha": 16,
14
  "lora_dropout": 0.15,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
+ "r": 8,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "k_proj",
24
+ "down_proj",
25
  "gate_proj",
26
  "o_proj",
 
 
27
  "up_proj",
28
  "v_proj",
29
+ "q_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db5cce6156c4621517be68ed6604412d1e180059ddcba2665cbdb58955f9bb05
3
- size 180385008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7140be330dda6372b6231f1c967402a64cbc852cb4999f9baf8ceb68d4fd23ab
3
+ size 22573704
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bfcc971a71e688b4db954d5f9e261787333eb8279ea692ae9db960cb16db16c5
3
- size 137651322
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c11b5cd2010a67482310a3fced3b7b838bf6893d6d31933481f02786b688254
3
+ size 17437626
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11309a88af1da04c34187de7c9fa4eeb4751eebe97a4effc8b29c06633b89aa3
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e5701b93a28e5cda28f54c982c99c9f1cc13d09b125c8060476f4a1658335c0
3
  size 14244
last-checkpoint/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_metric": 0.7864285707473755,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-200",
4
- "epoch": 0.003217076240685559,
5
  "eval_steps": 200,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
@@ -9,1424 +9,1424 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.6085381203427795e-05,
13
- "grad_norm": 0.9673656821250916,
14
  "learning_rate": 2e-05,
15
- "loss": 0.5297,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 1.6085381203427795e-05,
20
- "eval_loss": 0.46725764870643616,
21
- "eval_runtime": 25.4799,
22
- "eval_samples_per_second": 9.772,
23
- "eval_steps_per_second": 9.772,
24
  "step": 1
25
  },
26
  {
27
- "epoch": 3.217076240685559e-05,
28
- "grad_norm": 0.9954096078872681,
29
  "learning_rate": 4e-05,
30
- "loss": 0.6221,
31
  "step": 2
32
  },
33
  {
34
- "epoch": 4.825614361028339e-05,
35
- "grad_norm": 0.8733547329902649,
36
  "learning_rate": 6e-05,
37
- "loss": 0.4143,
38
  "step": 3
39
  },
40
  {
41
- "epoch": 6.434152481371118e-05,
42
- "grad_norm": 0.8365621566772461,
43
  "learning_rate": 8e-05,
44
- "loss": 0.2579,
45
  "step": 4
46
  },
47
  {
48
- "epoch": 8.042690601713898e-05,
49
- "grad_norm": 1.2265547513961792,
50
  "learning_rate": 0.0001,
51
- "loss": 0.4713,
52
  "step": 5
53
  },
54
  {
55
- "epoch": 9.651228722056678e-05,
56
- "grad_norm": 1.1219959259033203,
57
  "learning_rate": 0.00012,
58
- "loss": 0.4131,
59
  "step": 6
60
  },
61
  {
62
- "epoch": 0.00011259766842399456,
63
- "grad_norm": 1.630370855331421,
64
  "learning_rate": 0.00014,
65
- "loss": 0.5203,
66
  "step": 7
67
  },
68
  {
69
- "epoch": 0.00012868304962742236,
70
- "grad_norm": 1.959912896156311,
71
  "learning_rate": 0.00016,
72
- "loss": 0.6619,
73
  "step": 8
74
  },
75
  {
76
- "epoch": 0.00014476843083085014,
77
- "grad_norm": 2.2232961654663086,
78
  "learning_rate": 0.00018,
79
- "loss": 0.5758,
80
  "step": 9
81
  },
82
  {
83
- "epoch": 0.00016085381203427795,
84
- "grad_norm": 2.4021875858306885,
85
  "learning_rate": 0.0002,
86
- "loss": 0.5578,
87
  "step": 10
88
  },
89
  {
90
- "epoch": 0.00017693919323770574,
91
- "grad_norm": 2.4358997344970703,
92
  "learning_rate": 0.0002,
93
- "loss": 0.5025,
94
  "step": 11
95
  },
96
  {
97
- "epoch": 0.00019302457444113355,
98
- "grad_norm": 2.9442031383514404,
99
  "learning_rate": 0.0002,
100
- "loss": 0.6399,
101
  "step": 12
102
  },
103
  {
104
- "epoch": 0.00020910995564456133,
105
- "grad_norm": 3.2934744358062744,
106
  "learning_rate": 0.0002,
107
- "loss": 0.7238,
108
  "step": 13
109
  },
110
  {
111
- "epoch": 0.00022519533684798912,
112
- "grad_norm": 2.135126829147339,
113
  "learning_rate": 0.0002,
114
- "loss": 0.6701,
115
  "step": 14
116
  },
117
  {
118
- "epoch": 0.00024128071805141693,
119
- "grad_norm": 3.4425387382507324,
120
  "learning_rate": 0.0002,
121
- "loss": 0.7285,
122
  "step": 15
123
  },
124
  {
125
- "epoch": 0.0002573660992548447,
126
- "grad_norm": 4.053037166595459,
127
  "learning_rate": 0.0002,
128
- "loss": 0.6067,
129
  "step": 16
130
  },
131
  {
132
- "epoch": 0.0002734514804582725,
133
- "grad_norm": 1.9749451875686646,
134
  "learning_rate": 0.0002,
135
- "loss": 0.6545,
136
  "step": 17
137
  },
138
  {
139
- "epoch": 0.0002895368616617003,
140
- "grad_norm": 2.6539998054504395,
141
  "learning_rate": 0.0002,
142
- "loss": 0.5582,
143
  "step": 18
144
  },
145
  {
146
- "epoch": 0.0003056222428651281,
147
- "grad_norm": 4.1893205642700195,
148
  "learning_rate": 0.0002,
149
- "loss": 0.6518,
150
  "step": 19
151
  },
152
  {
153
- "epoch": 0.0003217076240685559,
154
- "grad_norm": 2.7660045623779297,
155
  "learning_rate": 0.0002,
156
- "loss": 0.619,
157
  "step": 20
158
  },
159
  {
160
- "epoch": 0.00033779300527198367,
161
- "grad_norm": 3.1297731399536133,
162
  "learning_rate": 0.0002,
163
- "loss": 0.6169,
164
  "step": 21
165
  },
166
  {
167
- "epoch": 0.0003538783864754115,
168
- "grad_norm": 2.4766297340393066,
169
  "learning_rate": 0.0002,
170
- "loss": 0.671,
171
  "step": 22
172
  },
173
  {
174
- "epoch": 0.0003699637676788393,
175
- "grad_norm": 1.840955376625061,
176
  "learning_rate": 0.0002,
177
- "loss": 0.5704,
178
  "step": 23
179
  },
180
  {
181
- "epoch": 0.0003860491488822671,
182
- "grad_norm": 2.017615556716919,
183
  "learning_rate": 0.0002,
184
- "loss": 0.4936,
185
  "step": 24
186
  },
187
  {
188
- "epoch": 0.00040213453008569486,
189
- "grad_norm": 2.527812957763672,
190
  "learning_rate": 0.0002,
191
- "loss": 0.436,
192
  "step": 25
193
  },
194
  {
195
- "epoch": 0.00041821991128912267,
196
- "grad_norm": 2.738335132598877,
197
  "learning_rate": 0.0002,
198
- "loss": 0.6511,
199
  "step": 26
200
  },
201
  {
202
- "epoch": 0.0004343052924925505,
203
- "grad_norm": 2.6857173442840576,
204
  "learning_rate": 0.0002,
205
- "loss": 0.8459,
206
  "step": 27
207
  },
208
  {
209
- "epoch": 0.00045039067369597824,
210
- "grad_norm": 3.223954200744629,
211
  "learning_rate": 0.0002,
212
- "loss": 0.5558,
213
  "step": 28
214
  },
215
  {
216
- "epoch": 0.00046647605489940605,
217
- "grad_norm": 2.828322649002075,
218
  "learning_rate": 0.0002,
219
- "loss": 0.7201,
220
  "step": 29
221
  },
222
  {
223
- "epoch": 0.00048256143610283386,
224
- "grad_norm": 3.2195804119110107,
225
  "learning_rate": 0.0002,
226
- "loss": 0.5933,
227
  "step": 30
228
  },
229
  {
230
- "epoch": 0.0004986468173062617,
231
- "grad_norm": 2.4919071197509766,
232
  "learning_rate": 0.0002,
233
- "loss": 0.5764,
234
  "step": 31
235
  },
236
  {
237
- "epoch": 0.0005147321985096894,
238
- "grad_norm": 4.92438268661499,
239
  "learning_rate": 0.0002,
240
- "loss": 0.9201,
241
  "step": 32
242
  },
243
  {
244
- "epoch": 0.0005308175797131172,
245
- "grad_norm": 2.232290267944336,
246
  "learning_rate": 0.0002,
247
- "loss": 0.5863,
248
  "step": 33
249
  },
250
  {
251
- "epoch": 0.000546902960916545,
252
- "grad_norm": 3.7385706901550293,
253
  "learning_rate": 0.0002,
254
- "loss": 0.9086,
255
  "step": 34
256
  },
257
  {
258
- "epoch": 0.0005629883421199728,
259
- "grad_norm": 3.262006998062134,
260
  "learning_rate": 0.0002,
261
- "loss": 0.6261,
262
  "step": 35
263
  },
264
  {
265
- "epoch": 0.0005790737233234006,
266
- "grad_norm": 2.7973763942718506,
267
  "learning_rate": 0.0002,
268
- "loss": 0.6955,
269
  "step": 36
270
  },
271
  {
272
- "epoch": 0.0005951591045268284,
273
- "grad_norm": 3.127302885055542,
274
  "learning_rate": 0.0002,
275
- "loss": 0.7446,
276
  "step": 37
277
  },
278
  {
279
- "epoch": 0.0006112444857302562,
280
- "grad_norm": 2.1533172130584717,
281
  "learning_rate": 0.0002,
282
- "loss": 0.5484,
283
  "step": 38
284
  },
285
  {
286
- "epoch": 0.000627329866933684,
287
- "grad_norm": 4.116796016693115,
288
  "learning_rate": 0.0002,
289
- "loss": 0.7521,
290
  "step": 39
291
  },
292
  {
293
- "epoch": 0.0006434152481371118,
294
- "grad_norm": 4.400921821594238,
295
  "learning_rate": 0.0002,
296
- "loss": 0.9317,
297
  "step": 40
298
  },
299
  {
300
- "epoch": 0.0006595006293405396,
301
- "grad_norm": 2.6137619018554688,
302
  "learning_rate": 0.0002,
303
- "loss": 0.7086,
304
  "step": 41
305
  },
306
  {
307
- "epoch": 0.0006755860105439673,
308
- "grad_norm": 2.341974973678589,
309
  "learning_rate": 0.0002,
310
- "loss": 0.5551,
311
  "step": 42
312
  },
313
  {
314
- "epoch": 0.0006916713917473952,
315
- "grad_norm": 2.7685954570770264,
316
  "learning_rate": 0.0002,
317
- "loss": 0.7665,
318
  "step": 43
319
  },
320
  {
321
- "epoch": 0.000707756772950823,
322
- "grad_norm": 3.1898794174194336,
323
  "learning_rate": 0.0002,
324
- "loss": 0.8037,
325
  "step": 44
326
  },
327
  {
328
- "epoch": 0.0007238421541542507,
329
- "grad_norm": 3.215623617172241,
330
  "learning_rate": 0.0002,
331
- "loss": 0.9811,
332
  "step": 45
333
  },
334
  {
335
- "epoch": 0.0007399275353576786,
336
- "grad_norm": 3.3365135192871094,
337
  "learning_rate": 0.0002,
338
- "loss": 0.7127,
339
  "step": 46
340
  },
341
  {
342
- "epoch": 0.0007560129165611063,
343
- "grad_norm": 4.518591403961182,
344
  "learning_rate": 0.0002,
345
- "loss": 0.797,
346
  "step": 47
347
  },
348
  {
349
- "epoch": 0.0007720982977645342,
350
- "grad_norm": 2.179842948913574,
351
  "learning_rate": 0.0002,
352
- "loss": 0.7091,
353
  "step": 48
354
  },
355
  {
356
- "epoch": 0.000788183678967962,
357
- "grad_norm": 2.5702974796295166,
358
  "learning_rate": 0.0002,
359
- "loss": 0.8829,
360
  "step": 49
361
  },
362
  {
363
- "epoch": 0.0008042690601713897,
364
- "grad_norm": 2.2742362022399902,
365
  "learning_rate": 0.0002,
366
- "loss": 0.5818,
367
  "step": 50
368
  },
369
  {
370
- "epoch": 0.0008203544413748176,
371
- "grad_norm": 3.2687766551971436,
372
  "learning_rate": 0.0002,
373
- "loss": 0.7228,
374
  "step": 51
375
  },
376
  {
377
- "epoch": 0.0008364398225782453,
378
- "grad_norm": 3.5674126148223877,
379
  "learning_rate": 0.0002,
380
- "loss": 0.8874,
381
  "step": 52
382
  },
383
  {
384
- "epoch": 0.0008525252037816731,
385
- "grad_norm": 2.703923225402832,
386
  "learning_rate": 0.0002,
387
- "loss": 0.6596,
388
  "step": 53
389
  },
390
  {
391
- "epoch": 0.000868610584985101,
392
- "grad_norm": 2.3442795276641846,
393
  "learning_rate": 0.0002,
394
- "loss": 0.8213,
395
  "step": 54
396
  },
397
  {
398
- "epoch": 0.0008846959661885287,
399
- "grad_norm": 3.142275094985962,
400
  "learning_rate": 0.0002,
401
- "loss": 0.8181,
402
  "step": 55
403
  },
404
  {
405
- "epoch": 0.0009007813473919565,
406
- "grad_norm": 4.0531487464904785,
407
  "learning_rate": 0.0002,
408
- "loss": 0.5939,
409
  "step": 56
410
  },
411
  {
412
- "epoch": 0.0009168667285953843,
413
- "grad_norm": 4.309750556945801,
414
  "learning_rate": 0.0002,
415
- "loss": 0.867,
416
  "step": 57
417
  },
418
  {
419
- "epoch": 0.0009329521097988121,
420
- "grad_norm": 3.4528746604919434,
421
  "learning_rate": 0.0002,
422
- "loss": 0.5944,
423
  "step": 58
424
  },
425
  {
426
- "epoch": 0.0009490374910022399,
427
- "grad_norm": 3.531193494796753,
428
  "learning_rate": 0.0002,
429
- "loss": 0.7985,
430
  "step": 59
431
  },
432
  {
433
- "epoch": 0.0009651228722056677,
434
- "grad_norm": 3.000215768814087,
435
  "learning_rate": 0.0002,
436
- "loss": 0.7939,
437
  "step": 60
438
  },
439
  {
440
- "epoch": 0.0009812082534090955,
441
- "grad_norm": 4.317079067230225,
442
  "learning_rate": 0.0002,
443
- "loss": 0.6823,
444
  "step": 61
445
  },
446
  {
447
- "epoch": 0.0009972936346125233,
448
- "grad_norm": 3.4617133140563965,
449
  "learning_rate": 0.0002,
450
- "loss": 0.7672,
451
  "step": 62
452
  },
453
  {
454
- "epoch": 0.001013379015815951,
455
- "grad_norm": 3.625797986984253,
456
  "learning_rate": 0.0002,
457
- "loss": 0.7985,
458
  "step": 63
459
  },
460
  {
461
- "epoch": 0.0010294643970193789,
462
- "grad_norm": 4.261772632598877,
463
  "learning_rate": 0.0002,
464
- "loss": 0.8154,
465
  "step": 64
466
  },
467
  {
468
- "epoch": 0.0010455497782228067,
469
- "grad_norm": 3.3078057765960693,
470
  "learning_rate": 0.0002,
471
- "loss": 0.7663,
472
  "step": 65
473
  },
474
  {
475
- "epoch": 0.0010616351594262344,
476
- "grad_norm": 2.1908516883850098,
477
  "learning_rate": 0.0002,
478
- "loss": 0.6996,
479
  "step": 66
480
  },
481
  {
482
- "epoch": 0.0010777205406296622,
483
- "grad_norm": 2.491776943206787,
484
  "learning_rate": 0.0002,
485
- "loss": 0.659,
486
  "step": 67
487
  },
488
  {
489
- "epoch": 0.00109380592183309,
490
- "grad_norm": 2.7965214252471924,
491
  "learning_rate": 0.0002,
492
- "loss": 0.6798,
493
  "step": 68
494
  },
495
  {
496
- "epoch": 0.0011098913030365178,
497
- "grad_norm": 3.3033552169799805,
498
  "learning_rate": 0.0002,
499
- "loss": 0.9425,
500
  "step": 69
501
  },
502
  {
503
- "epoch": 0.0011259766842399456,
504
- "grad_norm": 2.6152732372283936,
505
  "learning_rate": 0.0002,
506
- "loss": 0.9675,
507
  "step": 70
508
  },
509
  {
510
- "epoch": 0.0011420620654433735,
511
- "grad_norm": 2.942465305328369,
512
  "learning_rate": 0.0002,
513
- "loss": 0.8886,
514
  "step": 71
515
  },
516
  {
517
- "epoch": 0.0011581474466468011,
518
- "grad_norm": 3.2040352821350098,
519
  "learning_rate": 0.0002,
520
- "loss": 0.7208,
521
  "step": 72
522
  },
523
  {
524
- "epoch": 0.001174232827850229,
525
- "grad_norm": 5.6633501052856445,
526
  "learning_rate": 0.0002,
527
- "loss": 0.9701,
528
  "step": 73
529
  },
530
  {
531
- "epoch": 0.0011903182090536569,
532
- "grad_norm": 2.924656867980957,
533
  "learning_rate": 0.0002,
534
- "loss": 0.6366,
535
  "step": 74
536
  },
537
  {
538
- "epoch": 0.0012064035902570845,
539
- "grad_norm": 3.251835584640503,
540
  "learning_rate": 0.0002,
541
- "loss": 0.8638,
542
  "step": 75
543
  },
544
  {
545
- "epoch": 0.0012224889714605124,
546
- "grad_norm": 3.145000696182251,
547
  "learning_rate": 0.0002,
548
- "loss": 0.6692,
549
  "step": 76
550
  },
551
  {
552
- "epoch": 0.0012385743526639402,
553
- "grad_norm": 2.7392325401306152,
554
  "learning_rate": 0.0002,
555
- "loss": 0.7459,
556
  "step": 77
557
  },
558
  {
559
- "epoch": 0.001254659733867368,
560
- "grad_norm": 2.8011040687561035,
561
  "learning_rate": 0.0002,
562
- "loss": 0.7722,
563
  "step": 78
564
  },
565
  {
566
- "epoch": 0.0012707451150707958,
567
- "grad_norm": 3.5295469760894775,
568
  "learning_rate": 0.0002,
569
- "loss": 0.7733,
570
  "step": 79
571
  },
572
  {
573
- "epoch": 0.0012868304962742236,
574
- "grad_norm": 2.9453213214874268,
575
  "learning_rate": 0.0002,
576
- "loss": 0.6945,
577
  "step": 80
578
  },
579
  {
580
- "epoch": 0.0013029158774776513,
581
- "grad_norm": 3.2154369354248047,
582
  "learning_rate": 0.0002,
583
- "loss": 0.8776,
584
  "step": 81
585
  },
586
  {
587
- "epoch": 0.0013190012586810791,
588
- "grad_norm": 3.536776065826416,
589
  "learning_rate": 0.0002,
590
- "loss": 0.8774,
591
  "step": 82
592
  },
593
  {
594
- "epoch": 0.001335086639884507,
595
- "grad_norm": 2.8547418117523193,
596
  "learning_rate": 0.0002,
597
- "loss": 0.7109,
598
  "step": 83
599
  },
600
  {
601
- "epoch": 0.0013511720210879347,
602
- "grad_norm": 3.4063565731048584,
603
  "learning_rate": 0.0002,
604
- "loss": 0.8466,
605
  "step": 84
606
  },
607
  {
608
- "epoch": 0.0013672574022913625,
609
- "grad_norm": 5.920643329620361,
610
  "learning_rate": 0.0002,
611
- "loss": 0.8423,
612
  "step": 85
613
  },
614
  {
615
- "epoch": 0.0013833427834947904,
616
- "grad_norm": 4.299768924713135,
617
  "learning_rate": 0.0002,
618
- "loss": 1.0802,
619
  "step": 86
620
  },
621
  {
622
- "epoch": 0.001399428164698218,
623
- "grad_norm": 3.5304558277130127,
624
  "learning_rate": 0.0002,
625
- "loss": 0.8542,
626
  "step": 87
627
  },
628
  {
629
- "epoch": 0.001415513545901646,
630
- "grad_norm": 3.0248117446899414,
631
  "learning_rate": 0.0002,
632
- "loss": 0.6346,
633
  "step": 88
634
  },
635
  {
636
- "epoch": 0.0014315989271050738,
637
- "grad_norm": 3.5863444805145264,
638
  "learning_rate": 0.0002,
639
- "loss": 0.9679,
640
  "step": 89
641
  },
642
  {
643
- "epoch": 0.0014476843083085014,
644
- "grad_norm": 3.6556644439697266,
645
  "learning_rate": 0.0002,
646
- "loss": 0.7355,
647
  "step": 90
648
  },
649
  {
650
- "epoch": 0.0014637696895119293,
651
- "grad_norm": 3.691444158554077,
652
  "learning_rate": 0.0002,
653
- "loss": 0.8556,
654
  "step": 91
655
  },
656
  {
657
- "epoch": 0.0014798550707153572,
658
- "grad_norm": 3.8535704612731934,
659
  "learning_rate": 0.0002,
660
- "loss": 1.0531,
661
  "step": 92
662
  },
663
  {
664
- "epoch": 0.0014959404519187848,
665
- "grad_norm": 3.402984619140625,
666
  "learning_rate": 0.0002,
667
- "loss": 0.7127,
668
  "step": 93
669
  },
670
  {
671
- "epoch": 0.0015120258331222127,
672
- "grad_norm": 2.967519760131836,
673
  "learning_rate": 0.0002,
674
- "loss": 0.7416,
675
  "step": 94
676
  },
677
  {
678
- "epoch": 0.0015281112143256405,
679
- "grad_norm": 4.5817718505859375,
680
  "learning_rate": 0.0002,
681
- "loss": 0.6667,
682
  "step": 95
683
  },
684
  {
685
- "epoch": 0.0015441965955290684,
686
- "grad_norm": 4.2193379402160645,
687
  "learning_rate": 0.0002,
688
- "loss": 0.6914,
689
  "step": 96
690
  },
691
  {
692
- "epoch": 0.001560281976732496,
693
- "grad_norm": 4.412436485290527,
694
  "learning_rate": 0.0002,
695
- "loss": 0.6476,
696
  "step": 97
697
  },
698
  {
699
- "epoch": 0.001576367357935924,
700
- "grad_norm": 3.960810661315918,
701
  "learning_rate": 0.0002,
702
- "loss": 0.6829,
703
  "step": 98
704
  },
705
  {
706
- "epoch": 0.0015924527391393518,
707
- "grad_norm": 4.494846343994141,
708
  "learning_rate": 0.0002,
709
- "loss": 0.899,
710
  "step": 99
711
  },
712
  {
713
- "epoch": 0.0016085381203427794,
714
- "grad_norm": 5.150880813598633,
715
  "learning_rate": 0.0002,
716
- "loss": 0.8743,
717
  "step": 100
718
  },
719
  {
720
- "epoch": 0.0016246235015462073,
721
- "grad_norm": 3.156965970993042,
722
  "learning_rate": 0.0002,
723
- "loss": 0.754,
724
  "step": 101
725
  },
726
  {
727
- "epoch": 0.0016407088827496352,
728
- "grad_norm": 3.00789213180542,
729
  "learning_rate": 0.0002,
730
- "loss": 0.8606,
731
  "step": 102
732
  },
733
  {
734
- "epoch": 0.0016567942639530628,
735
- "grad_norm": 3.9045052528381348,
736
  "learning_rate": 0.0002,
737
- "loss": 0.833,
738
  "step": 103
739
  },
740
  {
741
- "epoch": 0.0016728796451564907,
742
- "grad_norm": 3.0179498195648193,
743
  "learning_rate": 0.0002,
744
- "loss": 0.6971,
745
  "step": 104
746
  },
747
  {
748
- "epoch": 0.0016889650263599185,
749
- "grad_norm": 3.441555976867676,
750
  "learning_rate": 0.0002,
751
- "loss": 0.9697,
752
  "step": 105
753
  },
754
  {
755
- "epoch": 0.0017050504075633462,
756
- "grad_norm": 3.4271888732910156,
757
  "learning_rate": 0.0002,
758
- "loss": 0.8264,
759
  "step": 106
760
  },
761
  {
762
- "epoch": 0.001721135788766774,
763
- "grad_norm": 3.3394598960876465,
764
  "learning_rate": 0.0002,
765
- "loss": 0.7529,
766
  "step": 107
767
  },
768
  {
769
- "epoch": 0.001737221169970202,
770
- "grad_norm": 4.098421573638916,
771
  "learning_rate": 0.0002,
772
- "loss": 0.7967,
773
  "step": 108
774
  },
775
  {
776
- "epoch": 0.0017533065511736296,
777
- "grad_norm": 5.323544979095459,
778
  "learning_rate": 0.0002,
779
- "loss": 0.9429,
780
  "step": 109
781
  },
782
  {
783
- "epoch": 0.0017693919323770574,
784
- "grad_norm": 3.8546035289764404,
785
  "learning_rate": 0.0002,
786
- "loss": 0.8392,
787
  "step": 110
788
  },
789
  {
790
- "epoch": 0.0017854773135804853,
791
- "grad_norm": 3.514596939086914,
792
  "learning_rate": 0.0002,
793
- "loss": 0.904,
794
  "step": 111
795
  },
796
  {
797
- "epoch": 0.001801562694783913,
798
- "grad_norm": 4.436436653137207,
799
  "learning_rate": 0.0002,
800
- "loss": 0.8841,
801
  "step": 112
802
  },
803
  {
804
- "epoch": 0.0018176480759873408,
805
- "grad_norm": 3.042628049850464,
806
  "learning_rate": 0.0002,
807
- "loss": 0.6856,
808
  "step": 113
809
  },
810
  {
811
- "epoch": 0.0018337334571907687,
812
- "grad_norm": 3.558793306350708,
813
  "learning_rate": 0.0002,
814
- "loss": 0.9463,
815
  "step": 114
816
  },
817
  {
818
- "epoch": 0.0018498188383941963,
819
- "grad_norm": 3.0797207355499268,
820
  "learning_rate": 0.0002,
821
- "loss": 0.7813,
822
  "step": 115
823
  },
824
  {
825
- "epoch": 0.0018659042195976242,
826
- "grad_norm": 3.2403101921081543,
827
  "learning_rate": 0.0002,
828
- "loss": 0.9499,
829
  "step": 116
830
  },
831
  {
832
- "epoch": 0.001881989600801052,
833
- "grad_norm": 3.385939121246338,
834
  "learning_rate": 0.0002,
835
- "loss": 0.6545,
836
  "step": 117
837
  },
838
  {
839
- "epoch": 0.0018980749820044797,
840
- "grad_norm": 3.525153636932373,
841
  "learning_rate": 0.0002,
842
- "loss": 0.9449,
843
  "step": 118
844
  },
845
  {
846
- "epoch": 0.0019141603632079076,
847
- "grad_norm": 2.670220375061035,
848
  "learning_rate": 0.0002,
849
- "loss": 0.6208,
850
  "step": 119
851
  },
852
  {
853
- "epoch": 0.0019302457444113354,
854
- "grad_norm": 3.3499555587768555,
855
  "learning_rate": 0.0002,
856
- "loss": 0.833,
857
  "step": 120
858
  },
859
  {
860
- "epoch": 0.001946331125614763,
861
- "grad_norm": 5.413862705230713,
862
  "learning_rate": 0.0002,
863
- "loss": 1.2186,
864
  "step": 121
865
  },
866
  {
867
- "epoch": 0.001962416506818191,
868
- "grad_norm": 3.637068271636963,
869
  "learning_rate": 0.0002,
870
- "loss": 0.8746,
871
  "step": 122
872
  },
873
  {
874
- "epoch": 0.0019785018880216186,
875
- "grad_norm": 6.209028244018555,
876
  "learning_rate": 0.0002,
877
- "loss": 1.1379,
878
  "step": 123
879
  },
880
  {
881
- "epoch": 0.0019945872692250467,
882
- "grad_norm": 4.2924418449401855,
883
  "learning_rate": 0.0002,
884
- "loss": 1.0075,
885
  "step": 124
886
  },
887
  {
888
- "epoch": 0.0020106726504284743,
889
- "grad_norm": 2.749718427658081,
890
  "learning_rate": 0.0002,
891
- "loss": 0.694,
892
  "step": 125
893
  },
894
  {
895
- "epoch": 0.002026758031631902,
896
- "grad_norm": 4.217276573181152,
897
  "learning_rate": 0.0002,
898
- "loss": 0.778,
899
  "step": 126
900
  },
901
  {
902
- "epoch": 0.00204284341283533,
903
- "grad_norm": 3.031771421432495,
904
  "learning_rate": 0.0002,
905
- "loss": 0.9696,
906
  "step": 127
907
  },
908
  {
909
- "epoch": 0.0020589287940387577,
910
- "grad_norm": 3.4838218688964844,
911
  "learning_rate": 0.0002,
912
- "loss": 0.6629,
913
  "step": 128
914
  },
915
  {
916
- "epoch": 0.0020750141752421854,
917
- "grad_norm": 3.218451738357544,
918
  "learning_rate": 0.0002,
919
- "loss": 0.6899,
920
  "step": 129
921
  },
922
  {
923
- "epoch": 0.0020910995564456135,
924
- "grad_norm": 3.4607691764831543,
925
  "learning_rate": 0.0002,
926
- "loss": 0.6832,
927
  "step": 130
928
  },
929
  {
930
- "epoch": 0.002107184937649041,
931
- "grad_norm": 3.70224666595459,
932
  "learning_rate": 0.0002,
933
- "loss": 0.7241,
934
  "step": 131
935
  },
936
  {
937
- "epoch": 0.0021232703188524688,
938
- "grad_norm": 4.122409820556641,
939
  "learning_rate": 0.0002,
940
- "loss": 0.8109,
941
  "step": 132
942
  },
943
  {
944
- "epoch": 0.002139355700055897,
945
- "grad_norm": 3.3417394161224365,
946
  "learning_rate": 0.0002,
947
- "loss": 0.6684,
948
  "step": 133
949
  },
950
  {
951
- "epoch": 0.0021554410812593245,
952
- "grad_norm": 3.019958972930908,
953
  "learning_rate": 0.0002,
954
- "loss": 0.7826,
955
  "step": 134
956
  },
957
  {
958
- "epoch": 0.002171526462462752,
959
- "grad_norm": 3.201491117477417,
960
  "learning_rate": 0.0002,
961
- "loss": 0.7875,
962
  "step": 135
963
  },
964
  {
965
- "epoch": 0.00218761184366618,
966
- "grad_norm": 5.85605525970459,
967
  "learning_rate": 0.0002,
968
- "loss": 1.1128,
969
  "step": 136
970
  },
971
  {
972
- "epoch": 0.002203697224869608,
973
- "grad_norm": 3.976530075073242,
974
  "learning_rate": 0.0002,
975
- "loss": 0.8679,
976
  "step": 137
977
  },
978
  {
979
- "epoch": 0.0022197826060730355,
980
- "grad_norm": 3.621382713317871,
981
  "learning_rate": 0.0002,
982
- "loss": 0.7601,
983
  "step": 138
984
  },
985
  {
986
- "epoch": 0.0022358679872764636,
987
- "grad_norm": 18.2700252532959,
988
  "learning_rate": 0.0002,
989
- "loss": 0.9312,
990
  "step": 139
991
  },
992
  {
993
- "epoch": 0.0022519533684798912,
994
- "grad_norm": 3.050555467605591,
995
  "learning_rate": 0.0002,
996
- "loss": 0.9431,
997
  "step": 140
998
  },
999
  {
1000
- "epoch": 0.002268038749683319,
1001
- "grad_norm": 4.187278747558594,
1002
  "learning_rate": 0.0002,
1003
- "loss": 1.16,
1004
  "step": 141
1005
  },
1006
  {
1007
- "epoch": 0.002284124130886747,
1008
- "grad_norm": 2.9168365001678467,
1009
  "learning_rate": 0.0002,
1010
- "loss": 0.7853,
1011
  "step": 142
1012
  },
1013
  {
1014
- "epoch": 0.0023002095120901746,
1015
- "grad_norm": 118.312744140625,
1016
  "learning_rate": 0.0002,
1017
- "loss": 1.1003,
1018
  "step": 143
1019
  },
1020
  {
1021
- "epoch": 0.0023162948932936023,
1022
- "grad_norm": 4.7243971824646,
1023
  "learning_rate": 0.0002,
1024
- "loss": 0.694,
1025
  "step": 144
1026
  },
1027
  {
1028
- "epoch": 0.0023323802744970304,
1029
- "grad_norm": 4.773429870605469,
1030
  "learning_rate": 0.0002,
1031
- "loss": 0.7167,
1032
  "step": 145
1033
  },
1034
  {
1035
- "epoch": 0.002348465655700458,
1036
- "grad_norm": 6.2195868492126465,
1037
  "learning_rate": 0.0002,
1038
- "loss": 0.7979,
1039
  "step": 146
1040
  },
1041
  {
1042
- "epoch": 0.0023645510369038857,
1043
- "grad_norm": 12.494455337524414,
1044
  "learning_rate": 0.0002,
1045
- "loss": 1.2257,
1046
  "step": 147
1047
  },
1048
  {
1049
- "epoch": 0.0023806364181073137,
1050
- "grad_norm": 6.841114521026611,
1051
  "learning_rate": 0.0002,
1052
- "loss": 1.28,
1053
  "step": 148
1054
  },
1055
  {
1056
- "epoch": 0.0023967217993107414,
1057
- "grad_norm": 5.901433944702148,
1058
  "learning_rate": 0.0002,
1059
- "loss": 0.826,
1060
  "step": 149
1061
  },
1062
  {
1063
- "epoch": 0.002412807180514169,
1064
- "grad_norm": 7.198768615722656,
1065
  "learning_rate": 0.0002,
1066
- "loss": 0.7969,
1067
  "step": 150
1068
  },
1069
  {
1070
- "epoch": 0.002428892561717597,
1071
- "grad_norm": 9.673176765441895,
1072
  "learning_rate": 0.0002,
1073
- "loss": 0.8828,
1074
  "step": 151
1075
  },
1076
  {
1077
- "epoch": 0.0024449779429210248,
1078
- "grad_norm": 10.305676460266113,
1079
  "learning_rate": 0.0002,
1080
- "loss": 0.8668,
1081
  "step": 152
1082
  },
1083
  {
1084
- "epoch": 0.0024610633241244524,
1085
- "grad_norm": 14.00606632232666,
1086
  "learning_rate": 0.0002,
1087
- "loss": 0.9462,
1088
  "step": 153
1089
  },
1090
  {
1091
- "epoch": 0.0024771487053278805,
1092
- "grad_norm": 6.559825897216797,
1093
  "learning_rate": 0.0002,
1094
- "loss": 0.7042,
1095
  "step": 154
1096
  },
1097
  {
1098
- "epoch": 0.002493234086531308,
1099
- "grad_norm": 3.9966037273406982,
1100
  "learning_rate": 0.0002,
1101
- "loss": 0.8798,
1102
  "step": 155
1103
  },
1104
  {
1105
- "epoch": 0.002509319467734736,
1106
- "grad_norm": 5.800797462463379,
1107
  "learning_rate": 0.0002,
1108
- "loss": 0.7377,
1109
  "step": 156
1110
  },
1111
  {
1112
- "epoch": 0.002525404848938164,
1113
- "grad_norm": 7.694753646850586,
1114
  "learning_rate": 0.0002,
1115
- "loss": 0.9589,
1116
  "step": 157
1117
  },
1118
  {
1119
- "epoch": 0.0025414902301415915,
1120
- "grad_norm": 4.698418617248535,
1121
  "learning_rate": 0.0002,
1122
- "loss": 0.826,
1123
  "step": 158
1124
  },
1125
  {
1126
- "epoch": 0.002557575611345019,
1127
- "grad_norm": 3.7439236640930176,
1128
  "learning_rate": 0.0002,
1129
- "loss": 0.874,
1130
  "step": 159
1131
  },
1132
  {
1133
- "epoch": 0.0025736609925484473,
1134
- "grad_norm": 4.441625118255615,
1135
  "learning_rate": 0.0002,
1136
- "loss": 0.8844,
1137
  "step": 160
1138
  },
1139
  {
1140
- "epoch": 0.002589746373751875,
1141
- "grad_norm": 4.822892665863037,
1142
  "learning_rate": 0.0002,
1143
- "loss": 0.9741,
1144
  "step": 161
1145
  },
1146
  {
1147
- "epoch": 0.0026058317549553026,
1148
- "grad_norm": 5.727447986602783,
1149
  "learning_rate": 0.0002,
1150
- "loss": 1.228,
1151
  "step": 162
1152
  },
1153
  {
1154
- "epoch": 0.0026219171361587306,
1155
- "grad_norm": 4.084842681884766,
1156
  "learning_rate": 0.0002,
1157
- "loss": 0.8113,
1158
  "step": 163
1159
  },
1160
  {
1161
- "epoch": 0.0026380025173621583,
1162
- "grad_norm": 4.884864330291748,
1163
  "learning_rate": 0.0002,
1164
- "loss": 0.9853,
1165
  "step": 164
1166
  },
1167
  {
1168
- "epoch": 0.002654087898565586,
1169
- "grad_norm": 4.315978527069092,
1170
  "learning_rate": 0.0002,
1171
- "loss": 0.7985,
1172
  "step": 165
1173
  },
1174
  {
1175
- "epoch": 0.002670173279769014,
1176
- "grad_norm": 3.958301544189453,
1177
  "learning_rate": 0.0002,
1178
- "loss": 0.8639,
1179
  "step": 166
1180
  },
1181
  {
1182
- "epoch": 0.0026862586609724417,
1183
- "grad_norm": 5.930337905883789,
1184
  "learning_rate": 0.0002,
1185
- "loss": 0.9575,
1186
  "step": 167
1187
  },
1188
  {
1189
- "epoch": 0.0027023440421758693,
1190
- "grad_norm": 3.374218702316284,
1191
  "learning_rate": 0.0002,
1192
- "loss": 0.5752,
1193
  "step": 168
1194
  },
1195
  {
1196
- "epoch": 0.0027184294233792974,
1197
- "grad_norm": 7.738460063934326,
1198
  "learning_rate": 0.0002,
1199
- "loss": 1.1104,
1200
  "step": 169
1201
  },
1202
  {
1203
- "epoch": 0.002734514804582725,
1204
- "grad_norm": 6.493184566497803,
1205
  "learning_rate": 0.0002,
1206
- "loss": 0.9614,
1207
  "step": 170
1208
  },
1209
  {
1210
- "epoch": 0.0027506001857861527,
1211
- "grad_norm": 7.904129981994629,
1212
  "learning_rate": 0.0002,
1213
- "loss": 1.1735,
1214
  "step": 171
1215
  },
1216
  {
1217
- "epoch": 0.002766685566989581,
1218
- "grad_norm": 6.135262489318848,
1219
  "learning_rate": 0.0002,
1220
- "loss": 1.1976,
1221
  "step": 172
1222
  },
1223
  {
1224
- "epoch": 0.0027827709481930084,
1225
- "grad_norm": 6.674580097198486,
1226
  "learning_rate": 0.0002,
1227
- "loss": 0.7546,
1228
  "step": 173
1229
  },
1230
  {
1231
- "epoch": 0.002798856329396436,
1232
- "grad_norm": 3.6253364086151123,
1233
  "learning_rate": 0.0002,
1234
- "loss": 0.8027,
1235
  "step": 174
1236
  },
1237
  {
1238
- "epoch": 0.002814941710599864,
1239
- "grad_norm": 3.2293593883514404,
1240
  "learning_rate": 0.0002,
1241
- "loss": 0.8404,
1242
  "step": 175
1243
  },
1244
  {
1245
- "epoch": 0.002831027091803292,
1246
- "grad_norm": 4.404852867126465,
1247
  "learning_rate": 0.0002,
1248
- "loss": 0.8233,
1249
  "step": 176
1250
  },
1251
  {
1252
- "epoch": 0.0028471124730067195,
1253
- "grad_norm": 9.036417007446289,
1254
  "learning_rate": 0.0002,
1255
- "loss": 1.2197,
1256
  "step": 177
1257
  },
1258
  {
1259
- "epoch": 0.0028631978542101475,
1260
- "grad_norm": 3.6753194332122803,
1261
  "learning_rate": 0.0002,
1262
- "loss": 0.8155,
1263
  "step": 178
1264
  },
1265
  {
1266
- "epoch": 0.002879283235413575,
1267
- "grad_norm": 4.148676872253418,
1268
  "learning_rate": 0.0002,
1269
- "loss": 1.0028,
1270
  "step": 179
1271
  },
1272
  {
1273
- "epoch": 0.002895368616617003,
1274
- "grad_norm": 10.267266273498535,
1275
  "learning_rate": 0.0002,
1276
- "loss": 0.8078,
1277
  "step": 180
1278
  },
1279
  {
1280
- "epoch": 0.002911453997820431,
1281
- "grad_norm": 5.570545673370361,
1282
  "learning_rate": 0.0002,
1283
- "loss": 0.9974,
1284
  "step": 181
1285
  },
1286
  {
1287
- "epoch": 0.0029275393790238586,
1288
- "grad_norm": 6.258678436279297,
1289
  "learning_rate": 0.0002,
1290
- "loss": 1.1986,
1291
  "step": 182
1292
  },
1293
  {
1294
- "epoch": 0.0029436247602272862,
1295
- "grad_norm": 11.766939163208008,
1296
  "learning_rate": 0.0002,
1297
- "loss": 0.8153,
1298
  "step": 183
1299
  },
1300
  {
1301
- "epoch": 0.0029597101414307143,
1302
- "grad_norm": 4.668914318084717,
1303
  "learning_rate": 0.0002,
1304
- "loss": 0.7482,
1305
  "step": 184
1306
  },
1307
  {
1308
- "epoch": 0.002975795522634142,
1309
- "grad_norm": 3.728922128677368,
1310
  "learning_rate": 0.0002,
1311
- "loss": 0.7389,
1312
  "step": 185
1313
  },
1314
  {
1315
- "epoch": 0.0029918809038375696,
1316
- "grad_norm": 3.9253530502319336,
1317
  "learning_rate": 0.0002,
1318
- "loss": 0.8526,
1319
  "step": 186
1320
  },
1321
  {
1322
- "epoch": 0.0030079662850409977,
1323
- "grad_norm": 4.449740409851074,
1324
  "learning_rate": 0.0002,
1325
- "loss": 0.8117,
1326
  "step": 187
1327
  },
1328
  {
1329
- "epoch": 0.0030240516662444253,
1330
- "grad_norm": 3.856152296066284,
1331
  "learning_rate": 0.0002,
1332
- "loss": 0.6481,
1333
  "step": 188
1334
  },
1335
  {
1336
- "epoch": 0.0030401370474478534,
1337
- "grad_norm": 140.99961853027344,
1338
  "learning_rate": 0.0002,
1339
- "loss": 2.8234,
1340
  "step": 189
1341
  },
1342
  {
1343
- "epoch": 0.003056222428651281,
1344
- "grad_norm": 4.190764904022217,
1345
  "learning_rate": 0.0002,
1346
- "loss": 0.7266,
1347
  "step": 190
1348
  },
1349
  {
1350
- "epoch": 0.0030723078098547087,
1351
- "grad_norm": 3.9606616497039795,
1352
  "learning_rate": 0.0002,
1353
- "loss": 0.8465,
1354
  "step": 191
1355
  },
1356
  {
1357
- "epoch": 0.003088393191058137,
1358
- "grad_norm": 4.197356700897217,
1359
  "learning_rate": 0.0002,
1360
- "loss": 0.7764,
1361
  "step": 192
1362
  },
1363
  {
1364
- "epoch": 0.0031044785722615644,
1365
- "grad_norm": 4.308269023895264,
1366
  "learning_rate": 0.0002,
1367
- "loss": 0.6308,
1368
  "step": 193
1369
  },
1370
  {
1371
- "epoch": 0.003120563953464992,
1372
- "grad_norm": 7.85593843460083,
1373
  "learning_rate": 0.0002,
1374
- "loss": 1.2231,
1375
  "step": 194
1376
  },
1377
  {
1378
- "epoch": 0.00313664933466842,
1379
- "grad_norm": 5.271966934204102,
1380
  "learning_rate": 0.0002,
1381
- "loss": 0.6263,
1382
  "step": 195
1383
  },
1384
  {
1385
- "epoch": 0.003152734715871848,
1386
- "grad_norm": 4.99168062210083,
1387
  "learning_rate": 0.0002,
1388
- "loss": 0.8379,
1389
  "step": 196
1390
  },
1391
  {
1392
- "epoch": 0.0031688200970752755,
1393
- "grad_norm": 4.923642635345459,
1394
  "learning_rate": 0.0002,
1395
- "loss": 0.7982,
1396
  "step": 197
1397
  },
1398
  {
1399
- "epoch": 0.0031849054782787036,
1400
- "grad_norm": 8.511445999145508,
1401
  "learning_rate": 0.0002,
1402
- "loss": 0.8379,
1403
  "step": 198
1404
  },
1405
  {
1406
- "epoch": 0.003200990859482131,
1407
- "grad_norm": 6.066445350646973,
1408
  "learning_rate": 0.0002,
1409
- "loss": 0.7347,
1410
  "step": 199
1411
  },
1412
  {
1413
- "epoch": 0.003217076240685559,
1414
- "grad_norm": 6.310784339904785,
1415
  "learning_rate": 0.0002,
1416
- "loss": 0.9526,
1417
  "step": 200
1418
  },
1419
  {
1420
- "epoch": 0.003217076240685559,
1421
- "eval_loss": 0.7864285707473755,
1422
- "eval_runtime": 25.6512,
1423
- "eval_samples_per_second": 9.707,
1424
- "eval_steps_per_second": 9.707,
1425
  "step": 200
1426
  }
1427
  ],
1428
  "logging_steps": 1,
1429
- "max_steps": 186504,
1430
  "num_input_tokens_seen": 0,
1431
  "num_train_epochs": 3,
1432
  "save_steps": 200,
@@ -1451,8 +1451,8 @@
1451
  "attributes": {}
1452
  }
1453
  },
1454
- "total_flos": 5911953172070400.0,
1455
- "train_batch_size": 1,
1456
  "trial_name": null,
1457
  "trial_params": null
1458
  }
 
1
  {
2
+ "best_metric": 0.3310258388519287,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-200",
4
+ "epoch": 0.006434126607526319,
5
  "eval_steps": 200,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 3.21706330376316e-05,
13
+ "grad_norm": 0.2609178423881531,
14
  "learning_rate": 2e-05,
15
+ "loss": 0.5746,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 3.21706330376316e-05,
20
+ "eval_loss": 0.46101704239845276,
21
+ "eval_runtime": 28.8598,
22
+ "eval_samples_per_second": 8.628,
23
+ "eval_steps_per_second": 4.331,
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 6.43412660752632e-05,
28
+ "grad_norm": 0.23752833902835846,
29
  "learning_rate": 4e-05,
30
+ "loss": 0.3828,
31
  "step": 2
32
  },
33
  {
34
+ "epoch": 9.65118991128948e-05,
35
+ "grad_norm": 0.25489795207977295,
36
  "learning_rate": 6e-05,
37
+ "loss": 0.4657,
38
  "step": 3
39
  },
40
  {
41
+ "epoch": 0.0001286825321505264,
42
+ "grad_norm": 0.3867496848106384,
43
  "learning_rate": 8e-05,
44
+ "loss": 0.5709,
45
  "step": 4
46
  },
47
  {
48
+ "epoch": 0.000160853165188158,
49
+ "grad_norm": 0.21718668937683105,
50
  "learning_rate": 0.0001,
51
+ "loss": 0.4053,
52
  "step": 5
53
  },
54
  {
55
+ "epoch": 0.0001930237982257896,
56
+ "grad_norm": 0.22927387058734894,
57
  "learning_rate": 0.00012,
58
+ "loss": 0.3848,
59
  "step": 6
60
  },
61
  {
62
+ "epoch": 0.00022519443126342117,
63
+ "grad_norm": 0.2708449363708496,
64
  "learning_rate": 0.00014,
65
+ "loss": 0.3925,
66
  "step": 7
67
  },
68
  {
69
+ "epoch": 0.0002573650643010528,
70
+ "grad_norm": 0.378393292427063,
71
  "learning_rate": 0.00016,
72
+ "loss": 0.3656,
73
  "step": 8
74
  },
75
  {
76
+ "epoch": 0.0002895356973386844,
77
+ "grad_norm": 0.5767518281936646,
78
  "learning_rate": 0.00018,
79
+ "loss": 0.3179,
80
  "step": 9
81
  },
82
  {
83
+ "epoch": 0.000321706330376316,
84
+ "grad_norm": 0.6105942726135254,
85
  "learning_rate": 0.0002,
86
+ "loss": 0.3719,
87
  "step": 10
88
  },
89
  {
90
+ "epoch": 0.00035387696341394756,
91
+ "grad_norm": 0.4722791016101837,
92
  "learning_rate": 0.0002,
93
+ "loss": 0.3447,
94
  "step": 11
95
  },
96
  {
97
+ "epoch": 0.0003860475964515792,
98
+ "grad_norm": 0.4688263237476349,
99
  "learning_rate": 0.0002,
100
+ "loss": 0.3055,
101
  "step": 12
102
  },
103
  {
104
+ "epoch": 0.00041821822948921077,
105
+ "grad_norm": 0.35639023780822754,
106
  "learning_rate": 0.0002,
107
+ "loss": 0.2548,
108
  "step": 13
109
  },
110
  {
111
+ "epoch": 0.00045038886252684235,
112
+ "grad_norm": 0.4272077679634094,
113
  "learning_rate": 0.0002,
114
+ "loss": 0.3488,
115
  "step": 14
116
  },
117
  {
118
+ "epoch": 0.000482559495564474,
119
+ "grad_norm": 0.3379782736301422,
120
  "learning_rate": 0.0002,
121
+ "loss": 0.3432,
122
  "step": 15
123
  },
124
  {
125
+ "epoch": 0.0005147301286021056,
126
+ "grad_norm": 0.6516053676605225,
127
  "learning_rate": 0.0002,
128
+ "loss": 0.3964,
129
  "step": 16
130
  },
131
  {
132
+ "epoch": 0.0005469007616397372,
133
+ "grad_norm": 0.6698662042617798,
134
  "learning_rate": 0.0002,
135
+ "loss": 0.3674,
136
  "step": 17
137
  },
138
  {
139
+ "epoch": 0.0005790713946773688,
140
+ "grad_norm": 0.9461187124252319,
141
  "learning_rate": 0.0002,
142
+ "loss": 0.2705,
143
  "step": 18
144
  },
145
  {
146
+ "epoch": 0.0006112420277150003,
147
+ "grad_norm": 0.5860435366630554,
148
  "learning_rate": 0.0002,
149
+ "loss": 0.3152,
150
  "step": 19
151
  },
152
  {
153
+ "epoch": 0.000643412660752632,
154
+ "grad_norm": 0.4475187659263611,
155
  "learning_rate": 0.0002,
156
+ "loss": 0.4237,
157
  "step": 20
158
  },
159
  {
160
+ "epoch": 0.0006755832937902636,
161
+ "grad_norm": 0.5749617218971252,
162
  "learning_rate": 0.0002,
163
+ "loss": 0.2669,
164
  "step": 21
165
  },
166
  {
167
+ "epoch": 0.0007077539268278951,
168
+ "grad_norm": 0.6806007623672485,
169
  "learning_rate": 0.0002,
170
+ "loss": 0.3513,
171
  "step": 22
172
  },
173
  {
174
+ "epoch": 0.0007399245598655268,
175
+ "grad_norm": 0.6327475905418396,
176
  "learning_rate": 0.0002,
177
+ "loss": 0.3948,
178
  "step": 23
179
  },
180
  {
181
+ "epoch": 0.0007720951929031584,
182
+ "grad_norm": 0.5336435437202454,
183
  "learning_rate": 0.0002,
184
+ "loss": 0.3854,
185
  "step": 24
186
  },
187
  {
188
+ "epoch": 0.0008042658259407899,
189
+ "grad_norm": 0.5399162173271179,
190
  "learning_rate": 0.0002,
191
+ "loss": 0.3333,
192
  "step": 25
193
  },
194
  {
195
+ "epoch": 0.0008364364589784215,
196
+ "grad_norm": 0.7226356863975525,
197
  "learning_rate": 0.0002,
198
+ "loss": 0.3978,
199
  "step": 26
200
  },
201
  {
202
+ "epoch": 0.0008686070920160532,
203
+ "grad_norm": 0.6512770652770996,
204
  "learning_rate": 0.0002,
205
+ "loss": 0.3304,
206
  "step": 27
207
  },
208
  {
209
+ "epoch": 0.0009007777250536847,
210
+ "grad_norm": 0.7261360287666321,
211
  "learning_rate": 0.0002,
212
+ "loss": 0.3004,
213
  "step": 28
214
  },
215
  {
216
+ "epoch": 0.0009329483580913163,
217
+ "grad_norm": 0.5120699405670166,
218
  "learning_rate": 0.0002,
219
+ "loss": 0.3675,
220
  "step": 29
221
  },
222
  {
223
+ "epoch": 0.000965118991128948,
224
+ "grad_norm": 0.5695130228996277,
225
  "learning_rate": 0.0002,
226
+ "loss": 0.3728,
227
  "step": 30
228
  },
229
  {
230
+ "epoch": 0.0009972896241665795,
231
+ "grad_norm": 0.5845438838005066,
232
  "learning_rate": 0.0002,
233
+ "loss": 0.2799,
234
  "step": 31
235
  },
236
  {
237
+ "epoch": 0.001029460257204211,
238
+ "grad_norm": 0.5468902587890625,
239
  "learning_rate": 0.0002,
240
+ "loss": 0.3652,
241
  "step": 32
242
  },
243
  {
244
+ "epoch": 0.0010616308902418427,
245
+ "grad_norm": 0.5148847699165344,
246
  "learning_rate": 0.0002,
247
+ "loss": 0.3211,
248
  "step": 33
249
  },
250
  {
251
+ "epoch": 0.0010938015232794744,
252
+ "grad_norm": 0.4925091862678528,
253
  "learning_rate": 0.0002,
254
+ "loss": 0.2873,
255
  "step": 34
256
  },
257
  {
258
+ "epoch": 0.001125972156317106,
259
+ "grad_norm": 0.45565104484558105,
260
  "learning_rate": 0.0002,
261
+ "loss": 0.4159,
262
  "step": 35
263
  },
264
  {
265
+ "epoch": 0.0011581427893547376,
266
+ "grad_norm": 0.44539037346839905,
267
  "learning_rate": 0.0002,
268
+ "loss": 0.3451,
269
  "step": 36
270
  },
271
  {
272
+ "epoch": 0.001190313422392369,
273
+ "grad_norm": 0.5995281338691711,
274
  "learning_rate": 0.0002,
275
+ "loss": 0.3294,
276
  "step": 37
277
  },
278
  {
279
+ "epoch": 0.0012224840554300007,
280
+ "grad_norm": 0.5937873125076294,
281
  "learning_rate": 0.0002,
282
+ "loss": 0.3174,
283
  "step": 38
284
  },
285
  {
286
+ "epoch": 0.0012546546884676323,
287
+ "grad_norm": 0.5223010182380676,
288
  "learning_rate": 0.0002,
289
+ "loss": 0.3538,
290
  "step": 39
291
  },
292
  {
293
+ "epoch": 0.001286825321505264,
294
+ "grad_norm": 0.45076924562454224,
295
  "learning_rate": 0.0002,
296
+ "loss": 0.2965,
297
  "step": 40
298
  },
299
  {
300
+ "epoch": 0.0013189959545428956,
301
+ "grad_norm": 0.5829368233680725,
302
  "learning_rate": 0.0002,
303
+ "loss": 0.3505,
304
  "step": 41
305
  },
306
  {
307
+ "epoch": 0.0013511665875805272,
308
+ "grad_norm": 0.5640948414802551,
309
  "learning_rate": 0.0002,
310
+ "loss": 0.3274,
311
  "step": 42
312
  },
313
  {
314
+ "epoch": 0.0013833372206181586,
315
+ "grad_norm": 0.5946338772773743,
316
  "learning_rate": 0.0002,
317
+ "loss": 0.3784,
318
  "step": 43
319
  },
320
  {
321
+ "epoch": 0.0014155078536557902,
322
+ "grad_norm": 0.49370312690734863,
323
  "learning_rate": 0.0002,
324
+ "loss": 0.3396,
325
  "step": 44
326
  },
327
  {
328
+ "epoch": 0.0014476784866934219,
329
+ "grad_norm": 0.552584707736969,
330
  "learning_rate": 0.0002,
331
+ "loss": 0.3028,
332
  "step": 45
333
  },
334
  {
335
+ "epoch": 0.0014798491197310535,
336
+ "grad_norm": 0.6281300187110901,
337
  "learning_rate": 0.0002,
338
+ "loss": 0.4233,
339
  "step": 46
340
  },
341
  {
342
+ "epoch": 0.0015120197527686851,
343
+ "grad_norm": 0.4612821638584137,
344
  "learning_rate": 0.0002,
345
+ "loss": 0.2995,
346
  "step": 47
347
  },
348
  {
349
+ "epoch": 0.0015441903858063168,
350
+ "grad_norm": 0.4370185434818268,
351
  "learning_rate": 0.0002,
352
+ "loss": 0.243,
353
  "step": 48
354
  },
355
  {
356
+ "epoch": 0.0015763610188439484,
357
+ "grad_norm": 0.6153799891471863,
358
  "learning_rate": 0.0002,
359
+ "loss": 0.2805,
360
  "step": 49
361
  },
362
  {
363
+ "epoch": 0.0016085316518815798,
364
+ "grad_norm": 0.6847407817840576,
365
  "learning_rate": 0.0002,
366
+ "loss": 0.3014,
367
  "step": 50
368
  },
369
  {
370
+ "epoch": 0.0016407022849192114,
371
+ "grad_norm": 0.49228036403656006,
372
  "learning_rate": 0.0002,
373
+ "loss": 0.3074,
374
  "step": 51
375
  },
376
  {
377
+ "epoch": 0.001672872917956843,
378
+ "grad_norm": 0.5665944218635559,
379
  "learning_rate": 0.0002,
380
+ "loss": 0.2657,
381
  "step": 52
382
  },
383
  {
384
+ "epoch": 0.0017050435509944747,
385
+ "grad_norm": 0.5495525002479553,
386
  "learning_rate": 0.0002,
387
+ "loss": 0.4076,
388
  "step": 53
389
  },
390
  {
391
+ "epoch": 0.0017372141840321063,
392
+ "grad_norm": 0.7097938656806946,
393
  "learning_rate": 0.0002,
394
+ "loss": 0.3506,
395
  "step": 54
396
  },
397
  {
398
+ "epoch": 0.001769384817069738,
399
+ "grad_norm": 0.6005829572677612,
400
  "learning_rate": 0.0002,
401
+ "loss": 0.3041,
402
  "step": 55
403
  },
404
  {
405
+ "epoch": 0.0018015554501073694,
406
+ "grad_norm": 0.4742415249347687,
407
  "learning_rate": 0.0002,
408
+ "loss": 0.3457,
409
  "step": 56
410
  },
411
  {
412
+ "epoch": 0.001833726083145001,
413
+ "grad_norm": 0.6460862159729004,
414
  "learning_rate": 0.0002,
415
+ "loss": 0.352,
416
  "step": 57
417
  },
418
  {
419
+ "epoch": 0.0018658967161826326,
420
+ "grad_norm": 0.5325047373771667,
421
  "learning_rate": 0.0002,
422
+ "loss": 0.3353,
423
  "step": 58
424
  },
425
  {
426
+ "epoch": 0.0018980673492202643,
427
+ "grad_norm": 0.550370991230011,
428
  "learning_rate": 0.0002,
429
+ "loss": 0.3186,
430
  "step": 59
431
  },
432
  {
433
+ "epoch": 0.001930237982257896,
434
+ "grad_norm": 0.5427353978157043,
435
  "learning_rate": 0.0002,
436
+ "loss": 0.2949,
437
  "step": 60
438
  },
439
  {
440
+ "epoch": 0.0019624086152955273,
441
+ "grad_norm": 0.7852073907852173,
442
  "learning_rate": 0.0002,
443
+ "loss": 0.4574,
444
  "step": 61
445
  },
446
  {
447
+ "epoch": 0.001994579248333159,
448
+ "grad_norm": 0.584457516670227,
449
  "learning_rate": 0.0002,
450
+ "loss": 0.3599,
451
  "step": 62
452
  },
453
  {
454
+ "epoch": 0.0020267498813707906,
455
+ "grad_norm": 0.6022618412971497,
456
  "learning_rate": 0.0002,
457
+ "loss": 0.2653,
458
  "step": 63
459
  },
460
  {
461
+ "epoch": 0.002058920514408422,
462
+ "grad_norm": 0.5993865728378296,
463
  "learning_rate": 0.0002,
464
+ "loss": 0.3466,
465
  "step": 64
466
  },
467
  {
468
+ "epoch": 0.002091091147446054,
469
+ "grad_norm": 0.5613014698028564,
470
  "learning_rate": 0.0002,
471
+ "loss": 0.2818,
472
  "step": 65
473
  },
474
  {
475
+ "epoch": 0.0021232617804836855,
476
+ "grad_norm": 0.5827286243438721,
477
  "learning_rate": 0.0002,
478
+ "loss": 0.2641,
479
  "step": 66
480
  },
481
  {
482
+ "epoch": 0.002155432413521317,
483
+ "grad_norm": 0.6258942484855652,
484
  "learning_rate": 0.0002,
485
+ "loss": 0.3016,
486
  "step": 67
487
  },
488
  {
489
+ "epoch": 0.0021876030465589487,
490
+ "grad_norm": 0.5805741548538208,
491
  "learning_rate": 0.0002,
492
+ "loss": 0.4094,
493
  "step": 68
494
  },
495
  {
496
+ "epoch": 0.0022197736795965804,
497
+ "grad_norm": 0.6247344017028809,
498
  "learning_rate": 0.0002,
499
+ "loss": 0.2884,
500
  "step": 69
501
  },
502
  {
503
+ "epoch": 0.002251944312634212,
504
+ "grad_norm": 0.6786600351333618,
505
  "learning_rate": 0.0002,
506
+ "loss": 0.352,
507
  "step": 70
508
  },
509
  {
510
+ "epoch": 0.0022841149456718436,
511
+ "grad_norm": 0.5860627889633179,
512
  "learning_rate": 0.0002,
513
+ "loss": 0.409,
514
  "step": 71
515
  },
516
  {
517
+ "epoch": 0.0023162855787094753,
518
+ "grad_norm": 0.5486606955528259,
519
  "learning_rate": 0.0002,
520
+ "loss": 0.3088,
521
  "step": 72
522
  },
523
  {
524
+ "epoch": 0.0023484562117471065,
525
+ "grad_norm": 0.7216318249702454,
526
  "learning_rate": 0.0002,
527
+ "loss": 0.3077,
528
  "step": 73
529
  },
530
  {
531
+ "epoch": 0.002380626844784738,
532
+ "grad_norm": 0.7599589228630066,
533
  "learning_rate": 0.0002,
534
+ "loss": 0.4633,
535
  "step": 74
536
  },
537
  {
538
+ "epoch": 0.0024127974778223697,
539
+ "grad_norm": 0.5651103854179382,
540
  "learning_rate": 0.0002,
541
+ "loss": 0.297,
542
  "step": 75
543
  },
544
  {
545
+ "epoch": 0.0024449681108600014,
546
+ "grad_norm": 0.5803356170654297,
547
  "learning_rate": 0.0002,
548
+ "loss": 0.2458,
549
  "step": 76
550
  },
551
  {
552
+ "epoch": 0.002477138743897633,
553
+ "grad_norm": 0.5019489526748657,
554
  "learning_rate": 0.0002,
555
+ "loss": 0.3269,
556
  "step": 77
557
  },
558
  {
559
+ "epoch": 0.0025093093769352646,
560
+ "grad_norm": 0.5693783760070801,
561
  "learning_rate": 0.0002,
562
+ "loss": 0.2905,
563
  "step": 78
564
  },
565
  {
566
+ "epoch": 0.0025414800099728962,
567
+ "grad_norm": 0.7466827034950256,
568
  "learning_rate": 0.0002,
569
+ "loss": 0.3504,
570
  "step": 79
571
  },
572
  {
573
+ "epoch": 0.002573650643010528,
574
+ "grad_norm": 0.6029163599014282,
575
  "learning_rate": 0.0002,
576
+ "loss": 0.2978,
577
  "step": 80
578
  },
579
  {
580
+ "epoch": 0.0026058212760481595,
581
+ "grad_norm": 0.6179245114326477,
582
  "learning_rate": 0.0002,
583
+ "loss": 0.4017,
584
  "step": 81
585
  },
586
  {
587
+ "epoch": 0.002637991909085791,
588
+ "grad_norm": 0.7006585001945496,
589
  "learning_rate": 0.0002,
590
+ "loss": 0.3377,
591
  "step": 82
592
  },
593
  {
594
+ "epoch": 0.0026701625421234228,
595
+ "grad_norm": 0.5809662938117981,
596
  "learning_rate": 0.0002,
597
+ "loss": 0.3141,
598
  "step": 83
599
  },
600
  {
601
+ "epoch": 0.0027023331751610544,
602
+ "grad_norm": 0.58149254322052,
603
  "learning_rate": 0.0002,
604
+ "loss": 0.2874,
605
  "step": 84
606
  },
607
  {
608
+ "epoch": 0.002734503808198686,
609
+ "grad_norm": 0.8158010840415955,
610
  "learning_rate": 0.0002,
611
+ "loss": 0.4517,
612
  "step": 85
613
  },
614
  {
615
+ "epoch": 0.0027666744412363172,
616
+ "grad_norm": 0.6752007603645325,
617
  "learning_rate": 0.0002,
618
+ "loss": 0.4586,
619
  "step": 86
620
  },
621
  {
622
+ "epoch": 0.002798845074273949,
623
+ "grad_norm": 0.6040322780609131,
624
  "learning_rate": 0.0002,
625
+ "loss": 0.2622,
626
  "step": 87
627
  },
628
  {
629
+ "epoch": 0.0028310157073115805,
630
+ "grad_norm": 0.7154407501220703,
631
  "learning_rate": 0.0002,
632
+ "loss": 0.3639,
633
  "step": 88
634
  },
635
  {
636
+ "epoch": 0.002863186340349212,
637
+ "grad_norm": 0.6612291932106018,
638
  "learning_rate": 0.0002,
639
+ "loss": 0.3774,
640
  "step": 89
641
  },
642
  {
643
+ "epoch": 0.0028953569733868438,
644
+ "grad_norm": 0.7525337338447571,
645
  "learning_rate": 0.0002,
646
+ "loss": 0.4023,
647
  "step": 90
648
  },
649
  {
650
+ "epoch": 0.0029275276064244754,
651
+ "grad_norm": 0.6839393377304077,
652
  "learning_rate": 0.0002,
653
+ "loss": 0.456,
654
  "step": 91
655
  },
656
  {
657
+ "epoch": 0.002959698239462107,
658
+ "grad_norm": 0.6505508422851562,
659
  "learning_rate": 0.0002,
660
+ "loss": 0.2841,
661
  "step": 92
662
  },
663
  {
664
+ "epoch": 0.0029918688724997386,
665
+ "grad_norm": 0.541343092918396,
666
  "learning_rate": 0.0002,
667
+ "loss": 0.301,
668
  "step": 93
669
  },
670
  {
671
+ "epoch": 0.0030240395055373703,
672
+ "grad_norm": 0.5484374165534973,
673
  "learning_rate": 0.0002,
674
+ "loss": 0.2425,
675
  "step": 94
676
  },
677
  {
678
+ "epoch": 0.003056210138575002,
679
+ "grad_norm": 0.5703783631324768,
680
  "learning_rate": 0.0002,
681
+ "loss": 0.6352,
682
  "step": 95
683
  },
684
  {
685
+ "epoch": 0.0030883807716126335,
686
+ "grad_norm": 0.5479252934455872,
687
  "learning_rate": 0.0002,
688
+ "loss": 0.3013,
689
  "step": 96
690
  },
691
  {
692
+ "epoch": 0.003120551404650265,
693
+ "grad_norm": 0.7930196523666382,
694
  "learning_rate": 0.0002,
695
+ "loss": 0.3676,
696
  "step": 97
697
  },
698
  {
699
+ "epoch": 0.003152722037687897,
700
+ "grad_norm": 0.6894263029098511,
701
  "learning_rate": 0.0002,
702
+ "loss": 0.26,
703
  "step": 98
704
  },
705
  {
706
+ "epoch": 0.003184892670725528,
707
+ "grad_norm": 0.639010488986969,
708
  "learning_rate": 0.0002,
709
+ "loss": 0.2685,
710
  "step": 99
711
  },
712
  {
713
+ "epoch": 0.0032170633037631596,
714
+ "grad_norm": 0.6424719095230103,
715
  "learning_rate": 0.0002,
716
+ "loss": 0.2628,
717
  "step": 100
718
  },
719
  {
720
+ "epoch": 0.0032492339368007913,
721
+ "grad_norm": 0.7405576109886169,
722
  "learning_rate": 0.0002,
723
+ "loss": 0.3001,
724
  "step": 101
725
  },
726
  {
727
+ "epoch": 0.003281404569838423,
728
+ "grad_norm": 0.6489754915237427,
729
  "learning_rate": 0.0002,
730
+ "loss": 0.3407,
731
  "step": 102
732
  },
733
  {
734
+ "epoch": 0.0033135752028760545,
735
+ "grad_norm": 0.6659820675849915,
736
  "learning_rate": 0.0002,
737
+ "loss": 0.3678,
738
  "step": 103
739
  },
740
  {
741
+ "epoch": 0.003345745835913686,
742
+ "grad_norm": 0.706896960735321,
743
  "learning_rate": 0.0002,
744
+ "loss": 0.3577,
745
  "step": 104
746
  },
747
  {
748
+ "epoch": 0.003377916468951318,
749
+ "grad_norm": 0.6583238840103149,
750
  "learning_rate": 0.0002,
751
+ "loss": 0.3567,
752
  "step": 105
753
  },
754
  {
755
+ "epoch": 0.0034100871019889494,
756
+ "grad_norm": 0.7842928767204285,
757
  "learning_rate": 0.0002,
758
+ "loss": 0.3746,
759
  "step": 106
760
  },
761
  {
762
+ "epoch": 0.003442257735026581,
763
+ "grad_norm": 0.7192911505699158,
764
  "learning_rate": 0.0002,
765
+ "loss": 0.381,
766
  "step": 107
767
  },
768
  {
769
+ "epoch": 0.0034744283680642127,
770
+ "grad_norm": 0.8255159258842468,
771
  "learning_rate": 0.0002,
772
+ "loss": 0.4093,
773
  "step": 108
774
  },
775
  {
776
+ "epoch": 0.0035065990011018443,
777
+ "grad_norm": 0.731376588344574,
778
  "learning_rate": 0.0002,
779
+ "loss": 0.2749,
780
  "step": 109
781
  },
782
  {
783
+ "epoch": 0.003538769634139476,
784
+ "grad_norm": 0.7096914649009705,
785
  "learning_rate": 0.0002,
786
+ "loss": 0.2585,
787
  "step": 110
788
  },
789
  {
790
+ "epoch": 0.0035709402671771076,
791
+ "grad_norm": 0.7141759991645813,
792
  "learning_rate": 0.0002,
793
+ "loss": 0.3652,
794
  "step": 111
795
  },
796
  {
797
+ "epoch": 0.0036031109002147388,
798
+ "grad_norm": 0.8442528247833252,
799
  "learning_rate": 0.0002,
800
+ "loss": 0.301,
801
  "step": 112
802
  },
803
  {
804
+ "epoch": 0.0036352815332523704,
805
+ "grad_norm": 0.8419767618179321,
806
  "learning_rate": 0.0002,
807
+ "loss": 0.4241,
808
  "step": 113
809
  },
810
  {
811
+ "epoch": 0.003667452166290002,
812
+ "grad_norm": 0.7170063257217407,
813
  "learning_rate": 0.0002,
814
+ "loss": 0.4237,
815
  "step": 114
816
  },
817
  {
818
+ "epoch": 0.0036996227993276337,
819
+ "grad_norm": 0.7070204019546509,
820
  "learning_rate": 0.0002,
821
+ "loss": 0.4084,
822
  "step": 115
823
  },
824
  {
825
+ "epoch": 0.0037317934323652653,
826
+ "grad_norm": 0.6054997444152832,
827
  "learning_rate": 0.0002,
828
+ "loss": 0.3128,
829
  "step": 116
830
  },
831
  {
832
+ "epoch": 0.003763964065402897,
833
+ "grad_norm": 0.5738762021064758,
834
  "learning_rate": 0.0002,
835
+ "loss": 0.4329,
836
  "step": 117
837
  },
838
  {
839
+ "epoch": 0.0037961346984405286,
840
+ "grad_norm": 0.6349337100982666,
841
  "learning_rate": 0.0002,
842
+ "loss": 0.3271,
843
  "step": 118
844
  },
845
  {
846
+ "epoch": 0.00382830533147816,
847
+ "grad_norm": 0.6344738006591797,
848
  "learning_rate": 0.0002,
849
+ "loss": 0.3099,
850
  "step": 119
851
  },
852
  {
853
+ "epoch": 0.003860475964515792,
854
+ "grad_norm": 0.7301223874092102,
855
  "learning_rate": 0.0002,
856
+ "loss": 0.4126,
857
  "step": 120
858
  },
859
  {
860
+ "epoch": 0.0038926465975534234,
861
+ "grad_norm": 0.7683565616607666,
862
  "learning_rate": 0.0002,
863
+ "loss": 0.3152,
864
  "step": 121
865
  },
866
  {
867
+ "epoch": 0.003924817230591055,
868
+ "grad_norm": 0.5495012402534485,
869
  "learning_rate": 0.0002,
870
+ "loss": 0.2863,
871
  "step": 122
872
  },
873
  {
874
+ "epoch": 0.003956987863628686,
875
+ "grad_norm": 0.6135990023612976,
876
  "learning_rate": 0.0002,
877
+ "loss": 0.3381,
878
  "step": 123
879
  },
880
  {
881
+ "epoch": 0.003989158496666318,
882
+ "grad_norm": 0.8360633850097656,
883
  "learning_rate": 0.0002,
884
+ "loss": 0.3739,
885
  "step": 124
886
  },
887
  {
888
+ "epoch": 0.0040213291297039495,
889
+ "grad_norm": 0.7187512516975403,
890
  "learning_rate": 0.0002,
891
+ "loss": 0.24,
892
  "step": 125
893
  },
894
  {
895
+ "epoch": 0.004053499762741581,
896
+ "grad_norm": 0.7280769348144531,
897
  "learning_rate": 0.0002,
898
+ "loss": 0.3286,
899
  "step": 126
900
  },
901
  {
902
+ "epoch": 0.004085670395779213,
903
+ "grad_norm": 0.6523069143295288,
904
  "learning_rate": 0.0002,
905
+ "loss": 0.2758,
906
  "step": 127
907
  },
908
  {
909
+ "epoch": 0.004117841028816844,
910
+ "grad_norm": 0.6205531358718872,
911
  "learning_rate": 0.0002,
912
+ "loss": 0.3053,
913
  "step": 128
914
  },
915
  {
916
+ "epoch": 0.004150011661854476,
917
+ "grad_norm": 0.8631265163421631,
918
  "learning_rate": 0.0002,
919
+ "loss": 0.3149,
920
  "step": 129
921
  },
922
  {
923
+ "epoch": 0.004182182294892108,
924
+ "grad_norm": 0.7578058242797852,
925
  "learning_rate": 0.0002,
926
+ "loss": 0.5391,
927
  "step": 130
928
  },
929
  {
930
+ "epoch": 0.004214352927929739,
931
+ "grad_norm": 0.7494041323661804,
932
  "learning_rate": 0.0002,
933
+ "loss": 0.2909,
934
  "step": 131
935
  },
936
  {
937
+ "epoch": 0.004246523560967371,
938
+ "grad_norm": 0.6875420808792114,
939
  "learning_rate": 0.0002,
940
+ "loss": 0.3464,
941
  "step": 132
942
  },
943
  {
944
+ "epoch": 0.004278694194005003,
945
+ "grad_norm": 0.6999627947807312,
946
  "learning_rate": 0.0002,
947
+ "loss": 0.374,
948
  "step": 133
949
  },
950
  {
951
+ "epoch": 0.004310864827042634,
952
+ "grad_norm": 0.7990955114364624,
953
  "learning_rate": 0.0002,
954
+ "loss": 0.3734,
955
  "step": 134
956
  },
957
  {
958
+ "epoch": 0.004343035460080266,
959
+ "grad_norm": 0.8626115918159485,
960
  "learning_rate": 0.0002,
961
+ "loss": 0.4212,
962
  "step": 135
963
  },
964
  {
965
+ "epoch": 0.0043752060931178975,
966
+ "grad_norm": 0.7310900092124939,
967
  "learning_rate": 0.0002,
968
+ "loss": 0.3488,
969
  "step": 136
970
  },
971
  {
972
+ "epoch": 0.004407376726155529,
973
+ "grad_norm": 0.8717100024223328,
974
  "learning_rate": 0.0002,
975
+ "loss": 0.3812,
976
  "step": 137
977
  },
978
  {
979
+ "epoch": 0.004439547359193161,
980
+ "grad_norm": 0.6076570153236389,
981
  "learning_rate": 0.0002,
982
+ "loss": 0.2651,
983
  "step": 138
984
  },
985
  {
986
+ "epoch": 0.004471717992230792,
987
+ "grad_norm": 0.9252959489822388,
988
  "learning_rate": 0.0002,
989
+ "loss": 0.352,
990
  "step": 139
991
  },
992
  {
993
+ "epoch": 0.004503888625268424,
994
+ "grad_norm": 0.9629406929016113,
995
  "learning_rate": 0.0002,
996
+ "loss": 0.3687,
997
  "step": 140
998
  },
999
  {
1000
+ "epoch": 0.004536059258306056,
1001
+ "grad_norm": 0.6811290979385376,
1002
  "learning_rate": 0.0002,
1003
+ "loss": 0.3433,
1004
  "step": 141
1005
  },
1006
  {
1007
+ "epoch": 0.004568229891343687,
1008
+ "grad_norm": 1.0294033288955688,
1009
  "learning_rate": 0.0002,
1010
+ "loss": 0.4794,
1011
  "step": 142
1012
  },
1013
  {
1014
+ "epoch": 0.004600400524381319,
1015
+ "grad_norm": 0.6730893850326538,
1016
  "learning_rate": 0.0002,
1017
+ "loss": 0.3088,
1018
  "step": 143
1019
  },
1020
  {
1021
+ "epoch": 0.0046325711574189505,
1022
+ "grad_norm": 0.8225754499435425,
1023
  "learning_rate": 0.0002,
1024
+ "loss": 0.3625,
1025
  "step": 144
1026
  },
1027
  {
1028
+ "epoch": 0.004664741790456581,
1029
+ "grad_norm": 0.6656695008277893,
1030
  "learning_rate": 0.0002,
1031
+ "loss": 0.3459,
1032
  "step": 145
1033
  },
1034
  {
1035
+ "epoch": 0.004696912423494213,
1036
+ "grad_norm": 0.6955097913742065,
1037
  "learning_rate": 0.0002,
1038
+ "loss": 0.3035,
1039
  "step": 146
1040
  },
1041
  {
1042
+ "epoch": 0.0047290830565318446,
1043
+ "grad_norm": 0.9222290515899658,
1044
  "learning_rate": 0.0002,
1045
+ "loss": 0.3599,
1046
  "step": 147
1047
  },
1048
  {
1049
+ "epoch": 0.004761253689569476,
1050
+ "grad_norm": 0.7328464984893799,
1051
  "learning_rate": 0.0002,
1052
+ "loss": 0.2753,
1053
  "step": 148
1054
  },
1055
  {
1056
+ "epoch": 0.004793424322607108,
1057
+ "grad_norm": 0.6858335137367249,
1058
  "learning_rate": 0.0002,
1059
+ "loss": 0.3916,
1060
  "step": 149
1061
  },
1062
  {
1063
+ "epoch": 0.0048255949556447394,
1064
+ "grad_norm": 0.8160132765769958,
1065
  "learning_rate": 0.0002,
1066
+ "loss": 0.3318,
1067
  "step": 150
1068
  },
1069
  {
1070
+ "epoch": 0.004857765588682371,
1071
+ "grad_norm": 0.7775120735168457,
1072
  "learning_rate": 0.0002,
1073
+ "loss": 0.2875,
1074
  "step": 151
1075
  },
1076
  {
1077
+ "epoch": 0.004889936221720003,
1078
+ "grad_norm": 0.7653348445892334,
1079
  "learning_rate": 0.0002,
1080
+ "loss": 0.2932,
1081
  "step": 152
1082
  },
1083
  {
1084
+ "epoch": 0.004922106854757634,
1085
+ "grad_norm": 0.7305892705917358,
1086
  "learning_rate": 0.0002,
1087
+ "loss": 0.3026,
1088
  "step": 153
1089
  },
1090
  {
1091
+ "epoch": 0.004954277487795266,
1092
+ "grad_norm": 0.7535127997398376,
1093
  "learning_rate": 0.0002,
1094
+ "loss": 0.2856,
1095
  "step": 154
1096
  },
1097
  {
1098
+ "epoch": 0.004986448120832898,
1099
+ "grad_norm": 0.7200407981872559,
1100
  "learning_rate": 0.0002,
1101
+ "loss": 0.3097,
1102
  "step": 155
1103
  },
1104
  {
1105
+ "epoch": 0.005018618753870529,
1106
+ "grad_norm": 0.8162491917610168,
1107
  "learning_rate": 0.0002,
1108
+ "loss": 0.3619,
1109
  "step": 156
1110
  },
1111
  {
1112
+ "epoch": 0.005050789386908161,
1113
+ "grad_norm": 1.155956506729126,
1114
  "learning_rate": 0.0002,
1115
+ "loss": 0.4073,
1116
  "step": 157
1117
  },
1118
  {
1119
+ "epoch": 0.0050829600199457925,
1120
+ "grad_norm": 0.7546277046203613,
1121
  "learning_rate": 0.0002,
1122
+ "loss": 0.349,
1123
  "step": 158
1124
  },
1125
  {
1126
+ "epoch": 0.005115130652983424,
1127
+ "grad_norm": 0.5961102247238159,
1128
  "learning_rate": 0.0002,
1129
+ "loss": 0.2786,
1130
  "step": 159
1131
  },
1132
  {
1133
+ "epoch": 0.005147301286021056,
1134
+ "grad_norm": 0.7311742901802063,
1135
  "learning_rate": 0.0002,
1136
+ "loss": 0.3039,
1137
  "step": 160
1138
  },
1139
  {
1140
+ "epoch": 0.005179471919058687,
1141
+ "grad_norm": 1.035477876663208,
1142
  "learning_rate": 0.0002,
1143
+ "loss": 0.4212,
1144
  "step": 161
1145
  },
1146
  {
1147
+ "epoch": 0.005211642552096319,
1148
+ "grad_norm": 0.8163110017776489,
1149
  "learning_rate": 0.0002,
1150
+ "loss": 0.387,
1151
  "step": 162
1152
  },
1153
  {
1154
+ "epoch": 0.005243813185133951,
1155
+ "grad_norm": 0.6917060017585754,
1156
  "learning_rate": 0.0002,
1157
+ "loss": 0.3123,
1158
  "step": 163
1159
  },
1160
  {
1161
+ "epoch": 0.005275983818171582,
1162
+ "grad_norm": 0.8115301132202148,
1163
  "learning_rate": 0.0002,
1164
+ "loss": 0.3335,
1165
  "step": 164
1166
  },
1167
  {
1168
+ "epoch": 0.005308154451209214,
1169
+ "grad_norm": 0.972899854183197,
1170
  "learning_rate": 0.0002,
1171
+ "loss": 0.3542,
1172
  "step": 165
1173
  },
1174
  {
1175
+ "epoch": 0.0053403250842468455,
1176
+ "grad_norm": 0.8446269035339355,
1177
  "learning_rate": 0.0002,
1178
+ "loss": 0.3183,
1179
  "step": 166
1180
  },
1181
  {
1182
+ "epoch": 0.005372495717284477,
1183
+ "grad_norm": 0.7965036630630493,
1184
  "learning_rate": 0.0002,
1185
+ "loss": 0.352,
1186
  "step": 167
1187
  },
1188
  {
1189
+ "epoch": 0.005404666350322109,
1190
+ "grad_norm": 0.7153119444847107,
1191
  "learning_rate": 0.0002,
1192
+ "loss": 0.3603,
1193
  "step": 168
1194
  },
1195
  {
1196
+ "epoch": 0.0054368369833597404,
1197
+ "grad_norm": 0.7775716185569763,
1198
  "learning_rate": 0.0002,
1199
+ "loss": 0.3528,
1200
  "step": 169
1201
  },
1202
  {
1203
+ "epoch": 0.005469007616397372,
1204
+ "grad_norm": 0.7184603214263916,
1205
  "learning_rate": 0.0002,
1206
+ "loss": 0.365,
1207
  "step": 170
1208
  },
1209
  {
1210
+ "epoch": 0.005501178249435003,
1211
+ "grad_norm": 0.6972705125808716,
1212
  "learning_rate": 0.0002,
1213
+ "loss": 0.2906,
1214
  "step": 171
1215
  },
1216
  {
1217
+ "epoch": 0.0055333488824726345,
1218
+ "grad_norm": 0.9904060363769531,
1219
  "learning_rate": 0.0002,
1220
+ "loss": 0.4492,
1221
  "step": 172
1222
  },
1223
  {
1224
+ "epoch": 0.005565519515510266,
1225
+ "grad_norm": 0.852296769618988,
1226
  "learning_rate": 0.0002,
1227
+ "loss": 0.2981,
1228
  "step": 173
1229
  },
1230
  {
1231
+ "epoch": 0.005597690148547898,
1232
+ "grad_norm": 0.6921360492706299,
1233
  "learning_rate": 0.0002,
1234
+ "loss": 0.2875,
1235
  "step": 174
1236
  },
1237
  {
1238
+ "epoch": 0.005629860781585529,
1239
+ "grad_norm": 0.7813829779624939,
1240
  "learning_rate": 0.0002,
1241
+ "loss": 0.3991,
1242
  "step": 175
1243
  },
1244
  {
1245
+ "epoch": 0.005662031414623161,
1246
+ "grad_norm": 0.8460421562194824,
1247
  "learning_rate": 0.0002,
1248
+ "loss": 0.3269,
1249
  "step": 176
1250
  },
1251
  {
1252
+ "epoch": 0.005694202047660793,
1253
+ "grad_norm": 0.793835461139679,
1254
  "learning_rate": 0.0002,
1255
+ "loss": 0.2896,
1256
  "step": 177
1257
  },
1258
  {
1259
+ "epoch": 0.005726372680698424,
1260
+ "grad_norm": 0.8878104090690613,
1261
  "learning_rate": 0.0002,
1262
+ "loss": 0.402,
1263
  "step": 178
1264
  },
1265
  {
1266
+ "epoch": 0.005758543313736056,
1267
+ "grad_norm": 0.8582636713981628,
1268
  "learning_rate": 0.0002,
1269
+ "loss": 0.4386,
1270
  "step": 179
1271
  },
1272
  {
1273
+ "epoch": 0.0057907139467736875,
1274
+ "grad_norm": 0.8274714350700378,
1275
  "learning_rate": 0.0002,
1276
+ "loss": 0.3228,
1277
  "step": 180
1278
  },
1279
  {
1280
+ "epoch": 0.005822884579811319,
1281
+ "grad_norm": 0.8849393725395203,
1282
  "learning_rate": 0.0002,
1283
+ "loss": 0.3835,
1284
  "step": 181
1285
  },
1286
  {
1287
+ "epoch": 0.005855055212848951,
1288
+ "grad_norm": 1.2293494939804077,
1289
  "learning_rate": 0.0002,
1290
+ "loss": 0.2947,
1291
  "step": 182
1292
  },
1293
  {
1294
+ "epoch": 0.005887225845886582,
1295
+ "grad_norm": 0.766805112361908,
1296
  "learning_rate": 0.0002,
1297
+ "loss": 0.4024,
1298
  "step": 183
1299
  },
1300
  {
1301
+ "epoch": 0.005919396478924214,
1302
+ "grad_norm": 1.023227572441101,
1303
  "learning_rate": 0.0002,
1304
+ "loss": 0.2848,
1305
  "step": 184
1306
  },
1307
  {
1308
+ "epoch": 0.005951567111961846,
1309
+ "grad_norm": 0.8333758115768433,
1310
  "learning_rate": 0.0002,
1311
+ "loss": 0.3701,
1312
  "step": 185
1313
  },
1314
  {
1315
+ "epoch": 0.005983737744999477,
1316
+ "grad_norm": 0.9221575260162354,
1317
  "learning_rate": 0.0002,
1318
+ "loss": 0.2953,
1319
  "step": 186
1320
  },
1321
  {
1322
+ "epoch": 0.006015908378037109,
1323
+ "grad_norm": 0.7268536686897278,
1324
  "learning_rate": 0.0002,
1325
+ "loss": 0.3487,
1326
  "step": 187
1327
  },
1328
  {
1329
+ "epoch": 0.0060480790110747406,
1330
+ "grad_norm": 0.7841563820838928,
1331
  "learning_rate": 0.0002,
1332
+ "loss": 0.3737,
1333
  "step": 188
1334
  },
1335
  {
1336
+ "epoch": 0.006080249644112372,
1337
+ "grad_norm": 0.7304165959358215,
1338
  "learning_rate": 0.0002,
1339
+ "loss": 0.3718,
1340
  "step": 189
1341
  },
1342
  {
1343
+ "epoch": 0.006112420277150004,
1344
+ "grad_norm": 0.9500126838684082,
1345
  "learning_rate": 0.0002,
1346
+ "loss": 0.3661,
1347
  "step": 190
1348
  },
1349
  {
1350
+ "epoch": 0.0061445909101876355,
1351
+ "grad_norm": 1.027346134185791,
1352
  "learning_rate": 0.0002,
1353
+ "loss": 0.3755,
1354
  "step": 191
1355
  },
1356
  {
1357
+ "epoch": 0.006176761543225267,
1358
+ "grad_norm": 0.6862695217132568,
1359
  "learning_rate": 0.0002,
1360
+ "loss": 0.3208,
1361
  "step": 192
1362
  },
1363
  {
1364
+ "epoch": 0.006208932176262899,
1365
+ "grad_norm": 0.7714293003082275,
1366
  "learning_rate": 0.0002,
1367
+ "loss": 0.361,
1368
  "step": 193
1369
  },
1370
  {
1371
+ "epoch": 0.00624110280930053,
1372
+ "grad_norm": 0.8124901056289673,
1373
  "learning_rate": 0.0002,
1374
+ "loss": 0.3565,
1375
  "step": 194
1376
  },
1377
  {
1378
+ "epoch": 0.006273273442338162,
1379
+ "grad_norm": 0.7867235541343689,
1380
  "learning_rate": 0.0002,
1381
+ "loss": 0.3192,
1382
  "step": 195
1383
  },
1384
  {
1385
+ "epoch": 0.006305444075375794,
1386
+ "grad_norm": 0.7322407364845276,
1387
  "learning_rate": 0.0002,
1388
+ "loss": 0.3075,
1389
  "step": 196
1390
  },
1391
  {
1392
+ "epoch": 0.006337614708413425,
1393
+ "grad_norm": 0.9365407228469849,
1394
  "learning_rate": 0.0002,
1395
+ "loss": 0.2552,
1396
  "step": 197
1397
  },
1398
  {
1399
+ "epoch": 0.006369785341451056,
1400
+ "grad_norm": 0.9422205686569214,
1401
  "learning_rate": 0.0002,
1402
+ "loss": 0.336,
1403
  "step": 198
1404
  },
1405
  {
1406
+ "epoch": 0.006401955974488688,
1407
+ "grad_norm": 0.76619952917099,
1408
  "learning_rate": 0.0002,
1409
+ "loss": 0.2907,
1410
  "step": 199
1411
  },
1412
  {
1413
+ "epoch": 0.006434126607526319,
1414
+ "grad_norm": 0.7989760041236877,
1415
  "learning_rate": 0.0002,
1416
+ "loss": 0.3402,
1417
  "step": 200
1418
  },
1419
  {
1420
+ "epoch": 0.006434126607526319,
1421
+ "eval_loss": 0.3310258388519287,
1422
+ "eval_runtime": 28.8107,
1423
+ "eval_samples_per_second": 8.643,
1424
+ "eval_steps_per_second": 4.339,
1425
  "step": 200
1426
  }
1427
  ],
1428
  "logging_steps": 1,
1429
+ "max_steps": 93252,
1430
  "num_input_tokens_seen": 0,
1431
  "num_train_epochs": 3,
1432
  "save_steps": 200,
 
1451
  "attributes": {}
1452
  }
1453
  },
1454
+ "total_flos": 1.298945783365632e+16,
1455
+ "train_batch_size": 2,
1456
  "trial_name": null,
1457
  "trial_params": null
1458
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5bfb8f9ee0d17252ff4577fc9c15127560771b7e188338420238d872618fd3b
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:559463d95a91aa4519eb17ce5aba32cad078cdd9196a6b9560209fa7cf008a3b
3
  size 6776