besimray commited on
Commit
fac25fc
·
verified ·
1 Parent(s): 61cd3a3

Upload task output 1bb7f5eb-6f15-4cc7-904c-cbb98a510983

Browse files
config.json CHANGED
@@ -130,6 +130,6 @@
130
  "tie_word_embeddings": false,
131
  "torch_dtype": "bfloat16",
132
  "transformers_version": "4.51.3",
133
- "use_cache": true,
134
  "vocab_size": 32064
135
  }
 
130
  "tie_word_embeddings": false,
131
  "torch_dtype": "bfloat16",
132
  "transformers_version": "4.51.3",
133
+ "use_cache": false,
134
  "vocab_size": 32064
135
  }
generation_config.json CHANGED
@@ -1,5 +1,7 @@
1
  {
2
- "temperature": null,
3
- "top_p": null,
 
 
4
  "transformers_version": "4.51.3"
5
  }
 
1
  {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 32000,
5
+ "pad_token_id": 32000,
6
  "transformers_version": "4.51.3"
7
  }
loss.txt CHANGED
@@ -1 +1 @@
1
- 419,no_eval
 
1
+ 500,9.067032814025879
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e2c8692400c1dce77feead9d8b9a3df70ca02d680dbcfed4ad9072f35897f10
3
  size 4991370968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c06e4d9ef0079babfe5495ac87b368c1ac69fa74896dfe69d6f5c2dd65edd003
3
  size 4991370968
trainer_state.json CHANGED
@@ -2,597 +2,724 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.17656974294142436,
6
  "eval_steps": 500,
7
- "global_step": 419,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.002107037505267594,
14
  "grad_norm": Infinity,
15
  "learning_rate": 1.0425224359183675e-05,
16
- "loss": 9.2528,
17
  "step": 5
18
  },
19
  {
20
- "epoch": 0.004214075010535188,
21
  "grad_norm": Infinity,
22
  "learning_rate": 2.3456754808163266e-05,
23
- "loss": 9.0876,
24
  "step": 10
25
  },
26
  {
27
- "epoch": 0.006321112515802781,
28
  "grad_norm": Infinity,
29
  "learning_rate": 3.6488285257142865e-05,
30
- "loss": 9.071,
31
  "step": 15
32
  },
33
  {
34
- "epoch": 0.008428150021070375,
35
  "grad_norm": Infinity,
36
  "learning_rate": 4.951981570612245e-05,
37
- "loss": 9.0988,
38
  "step": 20
39
  },
40
  {
41
- "epoch": 0.01053518752633797,
42
  "grad_norm": Infinity,
43
  "learning_rate": 6.255134615510205e-05,
44
- "loss": 9.1175,
45
  "step": 25
46
  },
47
  {
48
- "epoch": 0.012642225031605562,
49
  "grad_norm": Infinity,
50
  "learning_rate": 7.558287660408165e-05,
51
- "loss": 9.1526,
52
  "step": 30
53
  },
54
  {
55
- "epoch": 0.014749262536873156,
56
  "grad_norm": Infinity,
57
  "learning_rate": 8.861440705306124e-05,
58
- "loss": 9.009,
59
  "step": 35
60
  },
61
  {
62
- "epoch": 0.01685630004214075,
63
  "grad_norm": Infinity,
64
- "learning_rate": 9.122065932117784e-05,
65
- "loss": 9.2034,
66
  "step": 40
67
  },
68
  {
69
- "epoch": 0.018963337547408345,
70
  "grad_norm": Infinity,
71
- "learning_rate": 9.122044067089586e-05,
72
- "loss": 9.0025,
73
  "step": 45
74
  },
75
  {
76
- "epoch": 0.02107037505267594,
77
  "grad_norm": Infinity,
78
- "learning_rate": 9.122005382923049e-05,
79
- "loss": 9.0069,
80
  "step": 50
81
  },
82
  {
83
- "epoch": 0.023177412557943533,
84
  "grad_norm": Infinity,
85
- "learning_rate": 9.121949879808376e-05,
86
- "loss": 8.9929,
87
  "step": 55
88
  },
89
  {
90
- "epoch": 0.025284450063211124,
91
  "grad_norm": Infinity,
92
- "learning_rate": 9.121877558018465e-05,
93
- "loss": 8.8893,
94
  "step": 60
95
  },
96
  {
97
- "epoch": 0.027391487568478718,
98
  "grad_norm": Infinity,
99
- "learning_rate": 9.121788417908908e-05,
100
- "loss": 9.0498,
101
  "step": 65
102
  },
103
  {
104
- "epoch": 0.029498525073746312,
105
  "grad_norm": Infinity,
106
- "learning_rate": 9.121682459917987e-05,
107
- "loss": 9.1409,
108
  "step": 70
109
  },
110
  {
111
- "epoch": 0.0316055625790139,
112
  "grad_norm": Infinity,
113
- "learning_rate": 9.121559684566679e-05,
114
- "loss": 9.3385,
115
  "step": 75
116
  },
117
  {
118
- "epoch": 0.0337126000842815,
119
  "grad_norm": Infinity,
120
- "learning_rate": 9.121420092458643e-05,
121
- "loss": 9.0298,
122
  "step": 80
123
  },
124
  {
125
- "epoch": 0.03581963758954909,
126
  "grad_norm": Infinity,
127
- "learning_rate": 9.121263684280228e-05,
128
- "loss": 9.0127,
129
  "step": 85
130
  },
131
  {
132
- "epoch": 0.03792667509481669,
133
  "grad_norm": Infinity,
134
- "learning_rate": 9.121090460800458e-05,
135
- "loss": 9.0721,
136
  "step": 90
137
  },
138
  {
139
- "epoch": 0.04003371260008428,
140
  "grad_norm": Infinity,
141
- "learning_rate": 9.120900422871043e-05,
142
- "loss": 9.1312,
143
  "step": 95
144
  },
145
  {
146
- "epoch": 0.04214075010535188,
147
  "grad_norm": Infinity,
148
- "learning_rate": 9.120693571426357e-05,
149
- "loss": 9.114,
150
  "step": 100
151
  },
152
  {
153
- "epoch": 0.04424778761061947,
154
  "grad_norm": Infinity,
155
- "learning_rate": 9.12046990748345e-05,
156
- "loss": 9.1711,
157
  "step": 105
158
  },
159
  {
160
- "epoch": 0.046354825115887066,
161
  "grad_norm": Infinity,
162
- "learning_rate": 9.120229432142029e-05,
163
- "loss": 9.1641,
164
  "step": 110
165
  },
166
  {
167
- "epoch": 0.04846186262115466,
168
  "grad_norm": Infinity,
169
- "learning_rate": 9.119972146584466e-05,
170
- "loss": 9.2252,
171
  "step": 115
172
  },
173
  {
174
- "epoch": 0.05056890012642225,
175
  "grad_norm": Infinity,
176
- "learning_rate": 9.119698052075782e-05,
177
- "loss": 9.0076,
178
  "step": 120
179
  },
180
  {
181
- "epoch": 0.052675937631689845,
182
  "grad_norm": Infinity,
183
- "learning_rate": 9.119407149963643e-05,
184
- "loss": 9.0771,
185
  "step": 125
186
  },
187
  {
188
- "epoch": 0.054782975136957436,
189
  "grad_norm": Infinity,
190
- "learning_rate": 9.11909944167836e-05,
191
- "loss": 9.262,
192
  "step": 130
193
  },
194
  {
195
- "epoch": 0.056890012642225034,
196
  "grad_norm": Infinity,
197
- "learning_rate": 9.118774928732871e-05,
198
- "loss": 9.0539,
199
  "step": 135
200
  },
201
  {
202
- "epoch": 0.058997050147492625,
203
  "grad_norm": Infinity,
204
- "learning_rate": 9.118433612722738e-05,
205
- "loss": 9.0142,
206
  "step": 140
207
  },
208
  {
209
- "epoch": 0.06110408765276022,
210
  "grad_norm": Infinity,
211
- "learning_rate": 9.118075495326146e-05,
212
- "loss": 9.1751,
213
  "step": 145
214
  },
215
  {
216
- "epoch": 0.0632111251580278,
217
  "grad_norm": Infinity,
218
- "learning_rate": 9.117700578303887e-05,
219
- "loss": 8.805,
220
  "step": 150
221
  },
222
  {
223
- "epoch": 0.0653181626632954,
224
  "grad_norm": Infinity,
225
- "learning_rate": 9.117308863499353e-05,
226
- "loss": 9.0702,
227
  "step": 155
228
  },
229
  {
230
- "epoch": 0.067425200168563,
231
  "grad_norm": Infinity,
232
- "learning_rate": 9.116900352838523e-05,
233
- "loss": 9.073,
234
  "step": 160
235
  },
236
  {
237
- "epoch": 0.0695322376738306,
238
  "grad_norm": Infinity,
239
- "learning_rate": 9.116475048329966e-05,
240
- "loss": 9.2132,
241
  "step": 165
242
  },
243
  {
244
- "epoch": 0.07163927517909818,
245
  "grad_norm": Infinity,
246
- "learning_rate": 9.116032952064818e-05,
247
- "loss": 9.1914,
248
  "step": 170
249
  },
250
  {
251
- "epoch": 0.07374631268436578,
252
  "grad_norm": Infinity,
253
- "learning_rate": 9.115574066216778e-05,
254
- "loss": 9.0857,
255
  "step": 175
256
  },
257
  {
258
- "epoch": 0.07585335018963338,
259
  "grad_norm": Infinity,
260
- "learning_rate": 9.115098393042094e-05,
261
- "loss": 9.3902,
262
  "step": 180
263
  },
264
  {
265
- "epoch": 0.07796038769490098,
266
  "grad_norm": Infinity,
267
- "learning_rate": 9.114605934879554e-05,
268
- "loss": 9.1069,
269
  "step": 185
270
  },
271
  {
272
- "epoch": 0.08006742520016856,
273
  "grad_norm": Infinity,
274
- "learning_rate": 9.11409669415048e-05,
275
- "loss": 9.0304,
276
  "step": 190
277
  },
278
  {
279
- "epoch": 0.08217446270543616,
280
  "grad_norm": Infinity,
281
- "learning_rate": 9.113570673358704e-05,
282
- "loss": 8.8525,
283
  "step": 195
284
  },
285
  {
286
- "epoch": 0.08428150021070376,
287
  "grad_norm": Infinity,
288
- "learning_rate": 9.113027875090565e-05,
289
- "loss": 9.3074,
290
  "step": 200
291
  },
292
  {
293
- "epoch": 0.08638853771597134,
294
  "grad_norm": Infinity,
295
- "learning_rate": 9.112468302014893e-05,
296
- "loss": 9.1235,
297
  "step": 205
298
  },
299
  {
300
- "epoch": 0.08849557522123894,
301
  "grad_norm": Infinity,
302
- "learning_rate": 9.111891956882998e-05,
303
- "loss": 9.0219,
304
  "step": 210
305
  },
306
  {
307
- "epoch": 0.09060261272650653,
308
  "grad_norm": Infinity,
309
- "learning_rate": 9.11129884252865e-05,
310
- "loss": 9.1624,
311
  "step": 215
312
  },
313
  {
314
- "epoch": 0.09270965023177413,
315
  "grad_norm": Infinity,
316
- "learning_rate": 9.110688961868076e-05,
317
- "loss": 9.0277,
318
  "step": 220
319
  },
320
  {
321
- "epoch": 0.09481668773704172,
322
  "grad_norm": Infinity,
323
- "learning_rate": 9.110062317899935e-05,
324
- "loss": 9.0036,
325
  "step": 225
326
  },
327
  {
328
- "epoch": 0.09692372524230931,
329
  "grad_norm": Infinity,
330
- "learning_rate": 9.109418913705311e-05,
331
- "loss": 9.0376,
332
  "step": 230
333
  },
334
  {
335
- "epoch": 0.09903076274757691,
336
  "grad_norm": Infinity,
337
- "learning_rate": 9.108758752447692e-05,
338
- "loss": 9.1026,
339
  "step": 235
340
  },
341
  {
342
- "epoch": 0.1011378002528445,
343
  "grad_norm": Infinity,
344
- "learning_rate": 9.108081837372961e-05,
345
- "loss": 9.2828,
346
  "step": 240
347
  },
348
  {
349
- "epoch": 0.10324483775811209,
350
  "grad_norm": Infinity,
351
- "learning_rate": 9.107388171809369e-05,
352
- "loss": 9.1233,
353
  "step": 245
354
  },
355
  {
356
- "epoch": 0.10535187526337969,
357
  "grad_norm": Infinity,
358
- "learning_rate": 9.106677759167533e-05,
359
- "loss": 9.2087,
360
  "step": 250
361
  },
362
  {
363
- "epoch": 0.10745891276864729,
364
  "grad_norm": Infinity,
365
- "learning_rate": 9.10595060294041e-05,
366
- "loss": 9.0822,
367
  "step": 255
368
  },
369
  {
370
- "epoch": 0.10956595027391487,
371
  "grad_norm": Infinity,
372
- "learning_rate": 9.105206706703279e-05,
373
- "loss": 9.0266,
374
  "step": 260
375
  },
376
  {
377
- "epoch": 0.11167298777918247,
378
  "grad_norm": Infinity,
379
- "learning_rate": 9.104446074113729e-05,
380
- "loss": 9.0158,
381
  "step": 265
382
  },
383
  {
384
- "epoch": 0.11378002528445007,
385
  "grad_norm": Infinity,
386
- "learning_rate": 9.10366870891164e-05,
387
- "loss": 9.1653,
388
  "step": 270
389
  },
390
  {
391
- "epoch": 0.11588706278971765,
392
  "grad_norm": Infinity,
393
- "learning_rate": 9.102874614919157e-05,
394
- "loss": 9.1888,
395
  "step": 275
396
  },
397
  {
398
- "epoch": 0.11799410029498525,
399
  "grad_norm": Infinity,
400
- "learning_rate": 9.102063796040684e-05,
401
- "loss": 9.1944,
402
  "step": 280
403
  },
404
  {
405
- "epoch": 0.12010113780025285,
406
  "grad_norm": Infinity,
407
- "learning_rate": 9.101236256262852e-05,
408
- "loss": 8.8196,
409
  "step": 285
410
  },
411
  {
412
- "epoch": 0.12220817530552044,
413
  "grad_norm": Infinity,
414
- "learning_rate": 9.10039199965451e-05,
415
- "loss": 9.1051,
416
  "step": 290
417
  },
418
  {
419
- "epoch": 0.12431521281078803,
420
  "grad_norm": Infinity,
421
- "learning_rate": 9.099531030366696e-05,
422
- "loss": 9.0194,
423
  "step": 295
424
  },
425
  {
426
- "epoch": 0.1264222503160556,
427
  "grad_norm": Infinity,
428
- "learning_rate": 9.098653352632625e-05,
429
- "loss": 9.0343,
430
  "step": 300
431
  },
432
  {
433
- "epoch": 0.12852928782132322,
434
  "grad_norm": Infinity,
435
- "learning_rate": 9.097758970767663e-05,
436
- "loss": 9.0571,
437
  "step": 305
438
  },
439
  {
440
- "epoch": 0.1306363253265908,
441
  "grad_norm": Infinity,
442
- "learning_rate": 9.096847889169301e-05,
443
- "loss": 9.1159,
444
  "step": 310
445
  },
446
  {
447
- "epoch": 0.13274336283185842,
448
  "grad_norm": Infinity,
449
- "learning_rate": 9.09592011231715e-05,
450
- "loss": 9.3278,
451
  "step": 315
452
  },
453
  {
454
- "epoch": 0.134850400337126,
455
  "grad_norm": Infinity,
456
- "learning_rate": 9.0949756447729e-05,
457
- "loss": 9.049,
458
  "step": 320
459
  },
460
  {
461
- "epoch": 0.1369574378423936,
462
  "grad_norm": Infinity,
463
- "learning_rate": 9.094014491180309e-05,
464
- "loss": 9.053,
465
  "step": 325
466
  },
467
  {
468
- "epoch": 0.1390644753476612,
469
  "grad_norm": Infinity,
470
- "learning_rate": 9.093036656265176e-05,
471
- "loss": 9.1193,
472
  "step": 330
473
  },
474
  {
475
- "epoch": 0.14117151285292878,
476
  "grad_norm": Infinity,
477
- "learning_rate": 9.092042144835316e-05,
478
- "loss": 9.0794,
479
  "step": 335
480
  },
481
  {
482
- "epoch": 0.14327855035819637,
483
  "grad_norm": Infinity,
484
- "learning_rate": 9.091030961780546e-05,
485
- "loss": 9.0032,
486
  "step": 340
487
  },
488
  {
489
- "epoch": 0.14538558786346398,
490
  "grad_norm": Infinity,
491
- "learning_rate": 9.090003112072649e-05,
492
- "loss": 9.0861,
493
  "step": 345
494
  },
495
  {
496
- "epoch": 0.14749262536873156,
497
  "grad_norm": Infinity,
498
- "learning_rate": 9.088958600765355e-05,
499
- "loss": 8.9942,
500
  "step": 350
501
  },
502
  {
503
- "epoch": 0.14959966287399915,
504
  "grad_norm": Infinity,
505
- "learning_rate": 9.087897432994316e-05,
506
- "loss": 8.968,
507
  "step": 355
508
  },
509
  {
510
- "epoch": 0.15170670037926676,
511
  "grad_norm": Infinity,
512
- "learning_rate": 9.086819613977083e-05,
513
- "loss": 9.1462,
514
  "step": 360
515
  },
516
  {
517
- "epoch": 0.15381373788453434,
518
  "grad_norm": Infinity,
519
- "learning_rate": 9.085725149013074e-05,
520
- "loss": 8.8035,
521
  "step": 365
522
  },
523
  {
524
- "epoch": 0.15592077538980195,
525
  "grad_norm": Infinity,
526
- "learning_rate": 9.08461404348355e-05,
527
- "loss": 9.0213,
528
  "step": 370
529
  },
530
  {
531
- "epoch": 0.15802781289506954,
532
  "grad_norm": Infinity,
533
- "learning_rate": 9.083486302851602e-05,
534
- "loss": 9.1359,
535
  "step": 375
536
  },
537
  {
538
- "epoch": 0.16013485040033712,
539
  "grad_norm": Infinity,
540
- "learning_rate": 9.082341932662096e-05,
541
- "loss": 9.1997,
542
  "step": 380
543
  },
544
  {
545
- "epoch": 0.16224188790560473,
546
  "grad_norm": Infinity,
547
- "learning_rate": 9.081180938541676e-05,
548
- "loss": 8.9983,
549
  "step": 385
550
  },
551
  {
552
- "epoch": 0.16434892541087232,
553
  "grad_norm": Infinity,
554
- "learning_rate": 9.080003326198714e-05,
555
- "loss": 9.2442,
556
  "step": 390
557
  },
558
  {
559
- "epoch": 0.1664559629161399,
560
  "grad_norm": Infinity,
561
- "learning_rate": 9.078809101423295e-05,
562
- "loss": 9.283,
563
  "step": 395
564
  },
565
  {
566
- "epoch": 0.1685630004214075,
567
  "grad_norm": Infinity,
568
- "learning_rate": 9.07759827008718e-05,
569
- "loss": 9.0873,
570
  "step": 400
571
  },
572
  {
573
- "epoch": 0.1706700379266751,
574
  "grad_norm": Infinity,
575
- "learning_rate": 9.076370838143787e-05,
576
- "loss": 9.1848,
577
  "step": 405
578
  },
579
  {
580
- "epoch": 0.17277707543194268,
581
  "grad_norm": Infinity,
582
- "learning_rate": 9.07512681162815e-05,
583
- "loss": 9.1581,
584
  "step": 410
585
  },
586
  {
587
- "epoch": 0.1748841129372103,
588
  "grad_norm": Infinity,
589
- "learning_rate": 9.073866196656897e-05,
590
- "loss": 9.3075,
591
  "step": 415
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
592
  }
593
  ],
594
  "logging_steps": 5,
595
- "max_steps": 7119,
596
  "num_input_tokens_seen": 0,
597
  "num_train_epochs": 3,
598
  "save_steps": 500,
@@ -608,7 +735,7 @@
608
  "attributes": {}
609
  }
610
  },
611
- "total_flos": 9.199842228845937e+17,
612
  "train_batch_size": 48,
613
  "trial_name": null,
614
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.42122999157540014,
6
  "eval_steps": 500,
7
+ "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.004212299915754001,
14
  "grad_norm": Infinity,
15
  "learning_rate": 1.0425224359183675e-05,
16
+ "loss": 9.1367,
17
  "step": 5
18
  },
19
  {
20
+ "epoch": 0.008424599831508003,
21
  "grad_norm": Infinity,
22
  "learning_rate": 2.3456754808163266e-05,
23
+ "loss": 9.1854,
24
  "step": 10
25
  },
26
  {
27
+ "epoch": 0.012636899747262006,
28
  "grad_norm": Infinity,
29
  "learning_rate": 3.6488285257142865e-05,
30
+ "loss": 9.2157,
31
  "step": 15
32
  },
33
  {
34
+ "epoch": 0.016849199663016005,
35
  "grad_norm": Infinity,
36
  "learning_rate": 4.951981570612245e-05,
37
+ "loss": 9.1202,
38
  "step": 20
39
  },
40
  {
41
+ "epoch": 0.02106149957877001,
42
  "grad_norm": Infinity,
43
  "learning_rate": 6.255134615510205e-05,
44
+ "loss": 9.0366,
45
  "step": 25
46
  },
47
  {
48
+ "epoch": 0.02527379949452401,
49
  "grad_norm": Infinity,
50
  "learning_rate": 7.558287660408165e-05,
51
+ "loss": 8.9155,
52
  "step": 30
53
  },
54
  {
55
+ "epoch": 0.02948609941027801,
56
  "grad_norm": Infinity,
57
  "learning_rate": 8.861440705306124e-05,
58
+ "loss": 9.0951,
59
  "step": 35
60
  },
61
  {
62
+ "epoch": 0.03369839932603201,
63
  "grad_norm": Infinity,
64
+ "learning_rate": 9.122049589805829e-05,
65
+ "loss": 9.2247,
66
  "step": 40
67
  },
68
  {
69
+ "epoch": 0.037910699241786014,
70
  "grad_norm": Infinity,
71
+ "learning_rate": 9.121961334579207e-05,
72
+ "loss": 9.1174,
73
  "step": 45
74
  },
75
  {
76
+ "epoch": 0.04212299915754002,
77
  "grad_norm": Infinity,
78
+ "learning_rate": 9.121805192576025e-05,
79
+ "loss": 9.1988,
80
  "step": 50
81
  },
82
  {
83
+ "epoch": 0.04633529907329402,
84
  "grad_norm": Infinity,
85
+ "learning_rate": 9.121581166895088e-05,
86
+ "loss": 9.1803,
87
  "step": 55
88
  },
89
  {
90
+ "epoch": 0.05054759898904802,
91
  "grad_norm": Infinity,
92
+ "learning_rate": 9.121289261982425e-05,
93
+ "loss": 9.0852,
94
  "step": 60
95
  },
96
  {
97
+ "epoch": 0.05475989890480202,
98
  "grad_norm": Infinity,
99
+ "learning_rate": 9.120929483631201e-05,
100
+ "loss": 9.161,
101
  "step": 65
102
  },
103
  {
104
+ "epoch": 0.05897219882055602,
105
  "grad_norm": Infinity,
106
+ "learning_rate": 9.120501838981599e-05,
107
+ "loss": 9.0158,
108
  "step": 70
109
  },
110
  {
111
+ "epoch": 0.06318449873631002,
112
  "grad_norm": Infinity,
113
+ "learning_rate": 9.120006336520685e-05,
114
+ "loss": 9.0559,
115
  "step": 75
116
  },
117
  {
118
+ "epoch": 0.06739679865206402,
119
  "grad_norm": Infinity,
120
+ "learning_rate": 9.11944298608223e-05,
121
+ "loss": 9.0891,
122
  "step": 80
123
  },
124
  {
125
+ "epoch": 0.07160909856781802,
126
  "grad_norm": Infinity,
127
+ "learning_rate": 9.118811798846527e-05,
128
+ "loss": 9.1733,
129
  "step": 85
130
  },
131
  {
132
+ "epoch": 0.07582139848357203,
133
  "grad_norm": Infinity,
134
+ "learning_rate": 9.118112787340156e-05,
135
+ "loss": 9.2267,
136
  "step": 90
137
  },
138
  {
139
+ "epoch": 0.08003369839932603,
140
  "grad_norm": Infinity,
141
+ "learning_rate": 9.117345965435748e-05,
142
+ "loss": 9.0607,
143
  "step": 95
144
  },
145
  {
146
+ "epoch": 0.08424599831508003,
147
  "grad_norm": Infinity,
148
+ "learning_rate": 9.116511348351699e-05,
149
+ "loss": 9.0453,
150
  "step": 100
151
  },
152
  {
153
+ "epoch": 0.08845829823083404,
154
  "grad_norm": Infinity,
155
+ "learning_rate": 9.115608952651875e-05,
156
+ "loss": 9.0472,
157
  "step": 105
158
  },
159
  {
160
+ "epoch": 0.09267059814658804,
161
  "grad_norm": Infinity,
162
+ "learning_rate": 9.11463879624528e-05,
163
+ "loss": 9.1655,
164
  "step": 110
165
  },
166
  {
167
+ "epoch": 0.09688289806234204,
168
  "grad_norm": Infinity,
169
+ "learning_rate": 9.113600898385701e-05,
170
+ "loss": 8.9799,
171
  "step": 115
172
  },
173
  {
174
+ "epoch": 0.10109519797809605,
175
  "grad_norm": Infinity,
176
+ "learning_rate": 9.112495279671328e-05,
177
+ "loss": 9.2456,
178
  "step": 120
179
  },
180
  {
181
+ "epoch": 0.10530749789385004,
182
  "grad_norm": Infinity,
183
+ "learning_rate": 9.11132196204434e-05,
184
+ "loss": 9.0901,
185
  "step": 125
186
  },
187
  {
188
+ "epoch": 0.10951979780960404,
189
  "grad_norm": Infinity,
190
+ "learning_rate": 9.11008096879048e-05,
191
+ "loss": 9.0431,
192
  "step": 130
193
  },
194
  {
195
+ "epoch": 0.11373209772535804,
196
  "grad_norm": Infinity,
197
+ "learning_rate": 9.108772324538577e-05,
198
+ "loss": 9.153,
199
  "step": 135
200
  },
201
  {
202
+ "epoch": 0.11794439764111204,
203
  "grad_norm": Infinity,
204
+ "learning_rate": 9.107396055260072e-05,
205
+ "loss": 9.2556,
206
  "step": 140
207
  },
208
  {
209
+ "epoch": 0.12215669755686605,
210
  "grad_norm": Infinity,
211
+ "learning_rate": 9.105952188268497e-05,
212
+ "loss": 8.9465,
213
  "step": 145
214
  },
215
  {
216
+ "epoch": 0.12636899747262004,
217
  "grad_norm": Infinity,
218
+ "learning_rate": 9.104440752218928e-05,
219
+ "loss": 9.0371,
220
  "step": 150
221
  },
222
  {
223
+ "epoch": 0.13058129738837404,
224
  "grad_norm": Infinity,
225
+ "learning_rate": 9.102861777107424e-05,
226
+ "loss": 9.2312,
227
  "step": 155
228
  },
229
  {
230
+ "epoch": 0.13479359730412804,
231
  "grad_norm": Infinity,
232
+ "learning_rate": 9.101215294270431e-05,
233
+ "loss": 9.2269,
234
  "step": 160
235
  },
236
  {
237
+ "epoch": 0.13900589721988205,
238
  "grad_norm": Infinity,
239
+ "learning_rate": 9.099501336384152e-05,
240
+ "loss": 9.0931,
241
  "step": 165
242
  },
243
  {
244
+ "epoch": 0.14321819713563605,
245
  "grad_norm": Infinity,
246
+ "learning_rate": 9.097719937463912e-05,
247
+ "loss": 9.072,
248
  "step": 170
249
  },
250
  {
251
+ "epoch": 0.14743049705139005,
252
  "grad_norm": Infinity,
253
+ "learning_rate": 9.095871132863466e-05,
254
+ "loss": 8.983,
255
  "step": 175
256
  },
257
  {
258
+ "epoch": 0.15164279696714406,
259
  "grad_norm": Infinity,
260
+ "learning_rate": 9.093954959274312e-05,
261
+ "loss": 9.1125,
262
  "step": 180
263
  },
264
  {
265
+ "epoch": 0.15585509688289806,
266
  "grad_norm": Infinity,
267
+ "learning_rate": 9.091971454724959e-05,
268
+ "loss": 8.8615,
269
  "step": 185
270
  },
271
  {
272
+ "epoch": 0.16006739679865206,
273
  "grad_norm": Infinity,
274
+ "learning_rate": 9.089920658580165e-05,
275
+ "loss": 9.1443,
276
  "step": 190
277
  },
278
  {
279
+ "epoch": 0.16427969671440606,
280
  "grad_norm": Infinity,
281
+ "learning_rate": 9.087802611540166e-05,
282
+ "loss": 9.2218,
283
  "step": 195
284
  },
285
  {
286
+ "epoch": 0.16849199663016007,
287
  "grad_norm": Infinity,
288
+ "learning_rate": 9.085617355639865e-05,
289
+ "loss": 9.1446,
290
  "step": 200
291
  },
292
  {
293
+ "epoch": 0.17270429654591407,
294
  "grad_norm": Infinity,
295
+ "learning_rate": 9.083364934247988e-05,
296
+ "loss": 9.2042,
297
  "step": 205
298
  },
299
  {
300
+ "epoch": 0.17691659646166807,
301
  "grad_norm": Infinity,
302
+ "learning_rate": 9.081045392066242e-05,
303
+ "loss": 9.2255,
304
  "step": 210
305
  },
306
  {
307
+ "epoch": 0.18112889637742208,
308
  "grad_norm": Infinity,
309
+ "learning_rate": 9.07865877512841e-05,
310
+ "loss": 8.9658,
311
  "step": 215
312
  },
313
  {
314
+ "epoch": 0.18534119629317608,
315
  "grad_norm": Infinity,
316
+ "learning_rate": 9.076205130799451e-05,
317
+ "loss": 9.0488,
318
  "step": 220
319
  },
320
  {
321
+ "epoch": 0.18955349620893008,
322
  "grad_norm": Infinity,
323
+ "learning_rate": 9.073684507774549e-05,
324
+ "loss": 8.9994,
325
  "step": 225
326
  },
327
  {
328
+ "epoch": 0.1937657961246841,
329
  "grad_norm": Infinity,
330
+ "learning_rate": 9.071096956078153e-05,
331
+ "loss": 9.2227,
332
  "step": 230
333
  },
334
  {
335
+ "epoch": 0.1979780960404381,
336
  "grad_norm": Infinity,
337
+ "learning_rate": 9.068442527062987e-05,
338
+ "loss": 9.0947,
339
  "step": 235
340
  },
341
  {
342
+ "epoch": 0.2021903959561921,
343
  "grad_norm": Infinity,
344
+ "learning_rate": 9.065721273409019e-05,
345
+ "loss": 8.9357,
346
  "step": 240
347
  },
348
  {
349
+ "epoch": 0.2064026958719461,
350
  "grad_norm": Infinity,
351
+ "learning_rate": 9.062933249122428e-05,
352
+ "loss": 9.1496,
353
  "step": 245
354
  },
355
  {
356
+ "epoch": 0.21061499578770007,
357
  "grad_norm": Infinity,
358
+ "learning_rate": 9.060078509534528e-05,
359
+ "loss": 9.198,
360
  "step": 250
361
  },
362
  {
363
+ "epoch": 0.21482729570345407,
364
  "grad_norm": Infinity,
365
+ "learning_rate": 9.057157111300668e-05,
366
+ "loss": 9.2222,
367
  "step": 255
368
  },
369
  {
370
+ "epoch": 0.21903959561920808,
371
  "grad_norm": Infinity,
372
+ "learning_rate": 9.054169112399107e-05,
373
+ "loss": 9.104,
374
  "step": 260
375
  },
376
  {
377
+ "epoch": 0.22325189553496208,
378
  "grad_norm": Infinity,
379
+ "learning_rate": 9.051114572129868e-05,
380
+ "loss": 9.2462,
381
  "step": 265
382
  },
383
  {
384
+ "epoch": 0.22746419545071608,
385
  "grad_norm": Infinity,
386
+ "learning_rate": 9.047993551113556e-05,
387
+ "loss": 9.2007,
388
  "step": 270
389
  },
390
  {
391
+ "epoch": 0.2316764953664701,
392
  "grad_norm": Infinity,
393
+ "learning_rate": 9.044806111290159e-05,
394
+ "loss": 9.0567,
395
  "step": 275
396
  },
397
  {
398
+ "epoch": 0.2358887952822241,
399
  "grad_norm": Infinity,
400
+ "learning_rate": 9.041552315917816e-05,
401
+ "loss": 9.3158,
402
  "step": 280
403
  },
404
  {
405
+ "epoch": 0.2401010951979781,
406
  "grad_norm": Infinity,
407
+ "learning_rate": 9.038232229571564e-05,
408
+ "loss": 9.127,
409
  "step": 285
410
  },
411
  {
412
+ "epoch": 0.2443133951137321,
413
  "grad_norm": Infinity,
414
+ "learning_rate": 9.034845918142056e-05,
415
+ "loss": 9.139,
416
  "step": 290
417
  },
418
  {
419
+ "epoch": 0.2485256950294861,
420
  "grad_norm": Infinity,
421
+ "learning_rate": 9.031393448834246e-05,
422
+ "loss": 8.9646,
423
  "step": 295
424
  },
425
  {
426
+ "epoch": 0.2527379949452401,
427
  "grad_norm": Infinity,
428
+ "learning_rate": 9.027874890166069e-05,
429
+ "loss": 9.1291,
430
  "step": 300
431
  },
432
  {
433
+ "epoch": 0.2569502948609941,
434
  "grad_norm": Infinity,
435
+ "learning_rate": 9.024290311967066e-05,
436
+ "loss": 9.1037,
437
  "step": 305
438
  },
439
  {
440
+ "epoch": 0.2611625947767481,
441
  "grad_norm": Infinity,
442
+ "learning_rate": 9.020639785377019e-05,
443
+ "loss": 9.0707,
444
  "step": 310
445
  },
446
  {
447
+ "epoch": 0.2653748946925021,
448
  "grad_norm": Infinity,
449
+ "learning_rate": 9.01692338284451e-05,
450
+ "loss": 9.2199,
451
  "step": 315
452
  },
453
  {
454
+ "epoch": 0.2695871946082561,
455
  "grad_norm": Infinity,
456
+ "learning_rate": 9.013141178125513e-05,
457
+ "loss": 9.2428,
458
  "step": 320
459
  },
460
  {
461
+ "epoch": 0.2737994945240101,
462
  "grad_norm": Infinity,
463
+ "learning_rate": 9.009293246281905e-05,
464
+ "loss": 9.163,
465
  "step": 325
466
  },
467
  {
468
+ "epoch": 0.2780117944397641,
469
  "grad_norm": Infinity,
470
+ "learning_rate": 9.005379663679996e-05,
471
+ "loss": 9.22,
472
  "step": 330
473
  },
474
  {
475
+ "epoch": 0.2822240943555181,
476
  "grad_norm": Infinity,
477
+ "learning_rate": 9.001400507989004e-05,
478
+ "loss": 9.0213,
479
  "step": 335
480
  },
481
  {
482
+ "epoch": 0.2864363942712721,
483
  "grad_norm": Infinity,
484
+ "learning_rate": 8.99735585817951e-05,
485
+ "loss": 9.1937,
486
  "step": 340
487
  },
488
  {
489
+ "epoch": 0.2906486941870261,
490
  "grad_norm": Infinity,
491
+ "learning_rate": 8.993245794521902e-05,
492
+ "loss": 9.1865,
493
  "step": 345
494
  },
495
  {
496
+ "epoch": 0.2948609941027801,
497
  "grad_norm": Infinity,
498
+ "learning_rate": 8.98907039858477e-05,
499
+ "loss": 9.2424,
500
  "step": 350
501
  },
502
  {
503
+ "epoch": 0.2990732940185341,
504
  "grad_norm": Infinity,
505
+ "learning_rate": 8.984829753233298e-05,
506
+ "loss": 9.1797,
507
  "step": 355
508
  },
509
  {
510
+ "epoch": 0.3032855939342881,
511
  "grad_norm": Infinity,
512
+ "learning_rate": 8.980523942627609e-05,
513
+ "loss": 9.127,
514
  "step": 360
515
  },
516
  {
517
+ "epoch": 0.3074978938500421,
518
  "grad_norm": Infinity,
519
+ "learning_rate": 8.976153052221104e-05,
520
+ "loss": 9.0876,
521
  "step": 365
522
  },
523
  {
524
+ "epoch": 0.3117101937657961,
525
  "grad_norm": Infinity,
526
+ "learning_rate": 8.971717168758756e-05,
527
+ "loss": 9.1596,
528
  "step": 370
529
  },
530
  {
531
+ "epoch": 0.3159224936815501,
532
  "grad_norm": Infinity,
533
+ "learning_rate": 8.967216380275405e-05,
534
+ "loss": 9.048,
535
  "step": 375
536
  },
537
  {
538
+ "epoch": 0.3201347935973041,
539
  "grad_norm": Infinity,
540
+ "learning_rate": 8.962650776093989e-05,
541
+ "loss": 9.0332,
542
  "step": 380
543
  },
544
  {
545
+ "epoch": 0.3243470935130581,
546
  "grad_norm": Infinity,
547
+ "learning_rate": 8.958020446823789e-05,
548
+ "loss": 9.1551,
549
  "step": 385
550
  },
551
  {
552
+ "epoch": 0.32855939342881213,
553
  "grad_norm": Infinity,
554
+ "learning_rate": 8.953325484358625e-05,
555
+ "loss": 8.9415,
556
  "step": 390
557
  },
558
  {
559
+ "epoch": 0.33277169334456613,
560
  "grad_norm": Infinity,
561
+ "learning_rate": 8.948565981875027e-05,
562
+ "loss": 9.0024,
563
  "step": 395
564
  },
565
  {
566
+ "epoch": 0.33698399326032014,
567
  "grad_norm": Infinity,
568
+ "learning_rate": 8.943742033830394e-05,
569
+ "loss": 9.1814,
570
  "step": 400
571
  },
572
  {
573
+ "epoch": 0.34119629317607414,
574
  "grad_norm": Infinity,
575
+ "learning_rate": 8.938853735961113e-05,
576
+ "loss": 9.2051,
577
  "step": 405
578
  },
579
  {
580
+ "epoch": 0.34540859309182814,
581
  "grad_norm": Infinity,
582
+ "learning_rate": 8.933901185280665e-05,
583
+ "loss": 9.1032,
584
  "step": 410
585
  },
586
  {
587
+ "epoch": 0.34962089300758215,
588
  "grad_norm": Infinity,
589
+ "learning_rate": 8.928884480077696e-05,
590
+ "loss": 9.0838,
591
  "step": 415
592
+ },
593
+ {
594
+ "epoch": 0.35383319292333615,
595
+ "grad_norm": Infinity,
596
+ "learning_rate": 8.923803719914063e-05,
597
+ "loss": 9.0075,
598
+ "step": 420
599
+ },
600
+ {
601
+ "epoch": 0.35804549283909015,
602
+ "grad_norm": Infinity,
603
+ "learning_rate": 8.91865900562287e-05,
604
+ "loss": 8.9951,
605
+ "step": 425
606
+ },
607
+ {
608
+ "epoch": 0.36225779275484415,
609
+ "grad_norm": Infinity,
610
+ "learning_rate": 8.913450439306448e-05,
611
+ "loss": 9.1082,
612
+ "step": 430
613
+ },
614
+ {
615
+ "epoch": 0.36647009267059816,
616
+ "grad_norm": Infinity,
617
+ "learning_rate": 8.908178124334348e-05,
618
+ "loss": 9.2004,
619
+ "step": 435
620
+ },
621
+ {
622
+ "epoch": 0.37068239258635216,
623
+ "grad_norm": Infinity,
624
+ "learning_rate": 8.902842165341278e-05,
625
+ "loss": 8.9807,
626
+ "step": 440
627
+ },
628
+ {
629
+ "epoch": 0.37489469250210616,
630
+ "grad_norm": Infinity,
631
+ "learning_rate": 8.89744266822503e-05,
632
+ "loss": 9.043,
633
+ "step": 445
634
+ },
635
+ {
636
+ "epoch": 0.37910699241786017,
637
+ "grad_norm": Infinity,
638
+ "learning_rate": 8.891979740144376e-05,
639
+ "loss": 9.1759,
640
+ "step": 450
641
+ },
642
+ {
643
+ "epoch": 0.38331929233361417,
644
+ "grad_norm": Infinity,
645
+ "learning_rate": 8.886453489516945e-05,
646
+ "loss": 9.0236,
647
+ "step": 455
648
+ },
649
+ {
650
+ "epoch": 0.3875315922493682,
651
+ "grad_norm": Infinity,
652
+ "learning_rate": 8.880864026017068e-05,
653
+ "loss": 9.2918,
654
+ "step": 460
655
+ },
656
+ {
657
+ "epoch": 0.3917438921651222,
658
+ "grad_norm": Infinity,
659
+ "learning_rate": 8.875211460573607e-05,
660
+ "loss": 9.0641,
661
+ "step": 465
662
+ },
663
+ {
664
+ "epoch": 0.3959561920808762,
665
+ "grad_norm": Infinity,
666
+ "learning_rate": 8.869495905367742e-05,
667
+ "loss": 9.048,
668
+ "step": 470
669
+ },
670
+ {
671
+ "epoch": 0.4001684919966302,
672
+ "grad_norm": Infinity,
673
+ "learning_rate": 8.863717473830758e-05,
674
+ "loss": 9.0192,
675
+ "step": 475
676
+ },
677
+ {
678
+ "epoch": 0.4043807919123842,
679
+ "grad_norm": Infinity,
680
+ "learning_rate": 8.857876280641784e-05,
681
+ "loss": 9.0275,
682
+ "step": 480
683
+ },
684
+ {
685
+ "epoch": 0.4085930918281382,
686
+ "grad_norm": Infinity,
687
+ "learning_rate": 8.851972441725522e-05,
688
+ "loss": 8.9821,
689
+ "step": 485
690
+ },
691
+ {
692
+ "epoch": 0.4128053917438922,
693
+ "grad_norm": Infinity,
694
+ "learning_rate": 8.846006074249951e-05,
695
+ "loss": 8.9907,
696
+ "step": 490
697
+ },
698
+ {
699
+ "epoch": 0.41701769165964614,
700
+ "grad_norm": Infinity,
701
+ "learning_rate": 8.839977296623983e-05,
702
+ "loss": 9.0855,
703
+ "step": 495
704
+ },
705
+ {
706
+ "epoch": 0.42122999157540014,
707
+ "grad_norm": Infinity,
708
+ "learning_rate": 8.833886228495139e-05,
709
+ "loss": 9.2186,
710
+ "step": 500
711
+ },
712
+ {
713
+ "epoch": 0.42122999157540014,
714
+ "eval_loss": 9.067032814025879,
715
+ "eval_runtime": 2.0409,
716
+ "eval_samples_per_second": 26.949,
717
+ "eval_steps_per_second": 13.719,
718
+ "step": 500
719
  }
720
  ],
721
  "logging_steps": 5,
722
+ "max_steps": 3561,
723
  "num_input_tokens_seen": 0,
724
  "num_train_epochs": 3,
725
  "save_steps": 500,
 
735
  "attributes": {}
736
  }
737
  },
738
+ "total_flos": 2.1956663091134464e+18,
739
  "train_batch_size": 48,
740
  "trial_name": null,
741
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d61848344d261f7ee72ed40e543705d4c9b47355806f2308c7db674796a9a95
3
  size 5624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d048f01f6853330251af6fe32add0aa6a3f4efcb452d1313fc0b4287ea95d91e
3
  size 5624