finalform commited on
Commit
449eb7b
·
verified ·
1 Parent(s): 74b4226

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -13,7 +13,7 @@
13
  "layers_pattern": null,
14
  "layers_to_transform": null,
15
  "loftq_config": {},
16
- "lora_alpha": 8,
17
  "lora_bias": false,
18
  "lora_dropout": 0.1,
19
  "megatron_config": null,
@@ -26,11 +26,11 @@
26
  "revision": null,
27
  "target_modules": [
28
  "o_proj",
29
- "down_proj",
30
- "v_proj",
31
- "k_proj",
32
  "up_proj",
33
  "q_proj",
 
 
 
34
  "gate_proj"
35
  ],
36
  "target_parameters": null,
 
13
  "layers_pattern": null,
14
  "layers_to_transform": null,
15
  "loftq_config": {},
16
+ "lora_alpha": 16,
17
  "lora_bias": false,
18
  "lora_dropout": 0.1,
19
  "megatron_config": null,
 
26
  "revision": null,
27
  "target_modules": [
28
  "o_proj",
 
 
 
29
  "up_proj",
30
  "q_proj",
31
+ "down_proj",
32
+ "k_proj",
33
+ "v_proj",
34
  "gate_proj"
35
  ],
36
  "target_parameters": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7113572bd44d1b799b937244a2a917333bc610c5e13306503d99b2a5d605d2d
3
  size 645975704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7910da3c4957e71800c849379173ba91d8f5e700436be19029cac09f62f6d4a8
3
  size 645975704
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:715b98a5d8bdd5b8483d08b0795b537485c9e1e9acc5147f00dd8f4fc6958299
3
  size 1292087499
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dd8ce59e89a9efc32937df859f80518d82028400a830a9b29264b30c291ebd1
3
  size 1292087499
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38d7663d615e7d9fcd80d81534d2ffa88a4bd0246f4ca20f26a690fbfdce8036
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5844d24e029460a14b821935a1464ceba31cf265535853016e1f6652d0694907
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42f5f246856bfc4b5fa9a61aafa2feee7015df871d22d57b538c34491b7e33b3
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:954c3b26f534b41d23b4604b2fd25ad07029756854b57c8ddfc09c8a621110f7
3
  size 1465
trainer_state.json CHANGED
@@ -2,1126 +2,483 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 7.0,
6
  "eval_steps": 500,
7
- "global_step": 2905,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.060350030175015085,
14
- "grad_norm": 0.20090460777282715,
15
- "learning_rate": 0.0001636363636363636,
16
- "loss": 1.8921,
17
- "mean_token_accuracy": 0.621737614274025,
18
- "num_tokens": 155587.0,
19
  "step": 25
20
  },
21
  {
22
  "epoch": 0.12070006035003017,
23
- "grad_norm": 0.17085565626621246,
24
- "learning_rate": 0.00033409090909090905,
25
- "loss": 0.9553,
26
- "mean_token_accuracy": 0.7632233685255051,
27
- "num_tokens": 282345.0,
28
  "step": 50
29
  },
30
  {
31
  "epoch": 0.18105009052504525,
32
- "grad_norm": 0.15321753919124603,
33
- "learning_rate": 0.0005045454545454546,
34
- "loss": 0.653,
35
- "mean_token_accuracy": 0.8232995158433914,
36
- "num_tokens": 441398.0,
37
  "step": 75
38
  },
39
  {
40
  "epoch": 0.24140012070006034,
41
- "grad_norm": 0.24833154678344727,
42
- "learning_rate": 0.0005999774265866424,
43
- "loss": 0.5573,
44
- "mean_token_accuracy": 0.8464936971664428,
45
- "num_tokens": 567353.0,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.30175015087507545,
50
- "grad_norm": 0.15748420357704163,
51
- "learning_rate": 0.0005997582513956242,
52
- "loss": 0.4065,
53
- "mean_token_accuracy": 0.8838793677091599,
54
- "num_tokens": 724260.0,
55
  "step": 125
56
  },
57
  {
58
  "epoch": 0.3621001810500905,
59
- "grad_norm": 0.17303556203842163,
60
- "learning_rate": 0.0005993060798733474,
61
- "loss": 0.3538,
62
- "mean_token_accuracy": 0.8992524355649948,
63
- "num_tokens": 849790.0,
64
  "step": 150
65
  },
66
  {
67
  "epoch": 0.4224502112251056,
68
- "grad_norm": 0.1528756469488144,
69
- "learning_rate": 0.0005986212634840513,
70
- "loss": 0.2586,
71
- "mean_token_accuracy": 0.9249306803941727,
72
- "num_tokens": 1006269.0,
73
  "step": 175
74
  },
75
  {
76
  "epoch": 0.4828002414001207,
77
- "grad_norm": 0.16112498939037323,
78
- "learning_rate": 0.0005977043345223621,
79
- "loss": 0.2347,
80
- "mean_token_accuracy": 0.9317040795087814,
81
- "num_tokens": 1132441.0,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.5431502715751357,
86
- "grad_norm": 0.11938610672950745,
87
- "learning_rate": 0.0005965560056995495,
88
- "loss": 0.1813,
89
- "mean_token_accuracy": 0.9481321328878403,
90
- "num_tokens": 1290980.0,
91
  "step": 225
92
  },
93
  {
94
  "epoch": 0.6035003017501509,
95
- "grad_norm": 0.17489156126976013,
96
- "learning_rate": 0.0005951771695895515,
97
- "loss": 0.1551,
98
- "mean_token_accuracy": 0.9559709775447846,
99
- "num_tokens": 1417621.0,
100
  "step": 250
101
  },
102
  {
103
  "epoch": 0.663850331925166,
104
- "grad_norm": 0.12637701630592346,
105
- "learning_rate": 0.0005935688979351926,
106
- "loss": 0.1437,
107
- "mean_token_accuracy": 0.9594241315126419,
108
- "num_tokens": 1576677.0,
109
  "step": 275
110
  },
111
  {
112
  "epoch": 0.724200362100181,
113
- "grad_norm": 0.16465479135513306,
114
- "learning_rate": 0.0005917324408151391,
115
- "loss": 0.1302,
116
- "mean_token_accuracy": 0.9632405745983124,
117
- "num_tokens": 1704788.0,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.7845503922751962,
122
- "grad_norm": 0.08237937092781067,
123
- "learning_rate": 0.0005896692256722372,
124
- "loss": 0.1191,
125
- "mean_token_accuracy": 0.9669104343652726,
126
- "num_tokens": 1862745.0,
127
  "step": 325
128
  },
129
  {
130
  "epoch": 0.8449004224502112,
131
- "grad_norm": 0.20071323215961456,
132
- "learning_rate": 0.0005873808562039883,
133
- "loss": 0.0883,
134
- "mean_token_accuracy": 0.9747090804576873,
135
- "num_tokens": 1989934.0,
136
  "step": 350
137
  },
138
  {
139
  "epoch": 0.9052504526252263,
140
- "grad_norm": 0.08867258578538895,
141
- "learning_rate": 0.000584869111116027,
142
- "loss": 0.1031,
143
- "mean_token_accuracy": 0.9720449894666672,
144
- "num_tokens": 2147872.0,
145
  "step": 375
146
  },
147
  {
148
  "epoch": 0.9656004828002414,
149
- "grad_norm": 0.15626519918441772,
150
- "learning_rate": 0.000582135942739566,
151
- "loss": 0.0846,
152
- "mean_token_accuracy": 0.9759581971168518,
153
- "num_tokens": 2274914.0,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 1.0,
158
- "eval_loss": 0.0935852974653244,
159
- "eval_mean_token_accuracy": 0.973474471955686,
160
  "eval_num_tokens": 2354180.0,
161
- "eval_runtime": 15.7785,
162
- "eval_samples_per_second": 23.386,
163
- "eval_steps_per_second": 11.725,
164
  "step": 415
165
  },
166
  {
167
  "epoch": 1.024140012070006,
168
- "grad_norm": 0.10558875650167465,
169
- "learning_rate": 0.0005791834755138876,
170
- "loss": 0.0891,
171
- "mean_token_accuracy": 0.9749482642744005,
172
- "num_tokens": 2422664.0,
173
  "step": 425
174
  },
175
  {
176
  "epoch": 1.0844900422450212,
177
- "grad_norm": 0.0805174931883812,
178
- "learning_rate": 0.0005760140043350575,
179
- "loss": 0.0642,
180
- "mean_token_accuracy": 0.9817380511760712,
181
- "num_tokens": 2564993.0,
182
  "step": 450
183
  },
184
  {
185
  "epoch": 1.1448400724200363,
186
- "grad_norm": 0.07888537645339966,
187
- "learning_rate": 0.0005726299927721457,
188
- "loss": 0.0801,
189
- "mean_token_accuracy": 0.9775149941444397,
190
- "num_tokens": 2706480.0,
191
  "step": 475
192
  },
193
  {
194
  "epoch": 1.2051901025950513,
195
- "grad_norm": 0.08255880326032639,
196
- "learning_rate": 0.0005690340711523424,
197
- "loss": 0.0574,
198
- "mean_token_accuracy": 0.9840424680709838,
199
- "num_tokens": 2849915.0,
200
  "step": 500
201
  },
202
  {
203
  "epoch": 1.2655401327700664,
204
- "grad_norm": 0.08410744369029999,
205
- "learning_rate": 0.0005652290345164548,
206
- "loss": 0.0843,
207
- "mean_token_accuracy": 0.9765041953325272,
208
- "num_tokens": 2991414.0,
209
  "step": 525
210
  },
211
  {
212
  "epoch": 1.3258901629450814,
213
- "grad_norm": 0.08183339238166809,
214
- "learning_rate": 0.0005612178404463753,
215
- "loss": 0.0552,
216
- "mean_token_accuracy": 0.9844763785600662,
217
- "num_tokens": 3133623.0,
218
  "step": 550
219
  },
220
  {
221
  "epoch": 1.3862401931200965,
222
- "grad_norm": 0.0897441878914833,
223
- "learning_rate": 0.0005570036067662102,
224
- "loss": 0.0664,
225
- "mean_token_accuracy": 0.9818173968791961,
226
- "num_tokens": 3275382.0,
227
  "step": 575
228
  },
229
  {
230
  "epoch": 1.4465902232951118,
231
- "grad_norm": 0.08169595152139664,
232
- "learning_rate": 0.0005525896091188552,
233
- "loss": 0.0506,
234
- "mean_token_accuracy": 0.9850967526435852,
235
- "num_tokens": 3416201.0,
236
  "step": 600
237
  },
238
  {
239
  "epoch": 1.5069402534701268,
240
- "grad_norm": 0.05605079606175423,
241
- "learning_rate": 0.0005479792784199004,
242
- "loss": 0.0775,
243
- "mean_token_accuracy": 0.9789031559228897,
244
- "num_tokens": 3559740.0,
245
  "step": 625
246
  },
247
  {
248
  "epoch": 1.567290283645142,
249
- "grad_norm": 0.07204550504684448,
250
- "learning_rate": 0.0005431761981908461,
251
- "loss": 0.049,
252
- "mean_token_accuracy": 0.9865607488155365,
253
- "num_tokens": 3704393.0,
254
  "step": 650
255
  },
256
  {
257
  "epoch": 1.627640313820157,
258
- "grad_norm": 0.05560595169663429,
259
- "learning_rate": 0.0005381841017737,
260
- "loss": 0.0626,
261
- "mean_token_accuracy": 0.982689215540886,
262
- "num_tokens": 3847664.0,
263
  "step": 675
264
  },
265
  {
266
  "epoch": 1.687990343995172,
267
- "grad_norm": 0.05605713650584221,
268
- "learning_rate": 0.0005330068694291224,
269
- "loss": 0.0485,
270
- "mean_token_accuracy": 0.9857253217697144,
271
- "num_tokens": 3989914.0,
272
  "step": 700
273
  },
274
  {
275
  "epoch": 1.748340374170187,
276
- "grad_norm": 0.053301677107810974,
277
- "learning_rate": 0.000527648525320374,
278
- "loss": 0.0561,
279
- "mean_token_accuracy": 0.9841935896873474,
280
- "num_tokens": 4130965.0,
281
  "step": 725
282
  },
283
  {
284
  "epoch": 1.8086904043452021,
285
- "grad_norm": 0.058443792164325714,
286
- "learning_rate": 0.0005221132343854112,
287
- "loss": 0.0454,
288
- "mean_token_accuracy": 0.9869389832019806,
289
- "num_tokens": 4273126.0,
290
  "step": 750
291
  },
292
  {
293
  "epoch": 1.8690404345202172,
294
- "grad_norm": 0.05557582527399063,
295
- "learning_rate": 0.0005164052990995595,
296
- "loss": 0.0586,
297
- "mean_token_accuracy": 0.9833663034439087,
298
- "num_tokens": 4413184.0,
299
  "step": 775
300
  },
301
  {
302
  "epoch": 1.9293904646952322,
303
- "grad_norm": 0.06535590440034866,
304
- "learning_rate": 0.0005105291561312827,
305
- "loss": 0.0423,
306
- "mean_token_accuracy": 0.9879541498422623,
307
- "num_tokens": 4556318.0,
308
  "step": 800
309
  },
310
  {
311
  "epoch": 1.9897404948702473,
312
- "grad_norm": 0.04030626267194748,
313
- "learning_rate": 0.0005044893728936449,
314
- "loss": 0.0492,
315
- "mean_token_accuracy": 0.9859029227495193,
316
- "num_tokens": 4688481.0,
317
  "step": 825
318
  },
319
  {
320
  "epoch": 2.0,
321
- "eval_loss": 0.0574427954852581,
322
- "eval_mean_token_accuracy": 0.9839089316290778,
323
  "eval_num_tokens": 4708360.0,
324
- "eval_runtime": 15.7562,
325
- "eval_samples_per_second": 23.419,
326
- "eval_steps_per_second": 11.741,
327
  "step": 830
328
  },
329
  {
330
  "epoch": 2.048280024140012,
331
- "grad_norm": 0.0485665388405323,
332
- "learning_rate": 0.0004982906439941489,
333
- "loss": 0.0505,
334
- "mean_token_accuracy": 0.9851801610484565,
335
- "num_tokens": 4837161.0,
336
  "step": 850
337
  },
338
  {
339
  "epoch": 2.1086300543150274,
340
- "grad_norm": 0.05395647510886192,
341
- "learning_rate": 0.0004919377875857071,
342
- "loss": 0.0343,
343
- "mean_token_accuracy": 0.9896635556221008,
344
- "num_tokens": 4970144.0,
345
  "step": 875
346
  },
347
  {
348
  "epoch": 2.1689800844900424,
349
- "grad_norm": 0.0749589204788208,
350
- "learning_rate": 0.00048543574162158455,
351
- "loss": 0.0537,
352
- "mean_token_accuracy": 0.985042062997818,
353
- "num_tokens": 5121673.0,
354
  "step": 900
355
  },
356
  {
357
  "epoch": 2.2293301146650575,
358
- "grad_norm": 0.04066776856780052,
359
- "learning_rate": 0.00047878956001722235,
360
- "loss": 0.0353,
361
- "mean_token_accuracy": 0.9900505661964416,
362
- "num_tokens": 5254931.0,
363
  "step": 925
364
  },
365
  {
366
  "epoch": 2.2896801448400725,
367
- "grad_norm": 0.045161083340644836,
368
- "learning_rate": 0.00047200440872192636,
369
- "loss": 0.0474,
370
- "mean_token_accuracy": 0.9861310094594955,
371
- "num_tokens": 5405358.0,
372
  "step": 950
373
  },
374
  {
375
  "epoch": 2.3500301750150876,
376
- "grad_norm": 0.03835665062069893,
377
- "learning_rate": 0.0004650855617034737,
378
- "loss": 0.034,
379
- "mean_token_accuracy": 0.9899905091524124,
380
- "num_tokens": 5537530.0,
381
  "step": 975
382
  },
383
  {
384
  "epoch": 2.4103802051901027,
385
- "grad_norm": 0.044723257422447205,
386
- "learning_rate": 0.00045803839684875944,
387
- "loss": 0.0431,
388
- "mean_token_accuracy": 0.9870987349748611,
389
- "num_tokens": 5689697.0,
390
  "step": 1000
391
  },
392
  {
393
  "epoch": 2.4707302353651177,
394
- "grad_norm": 0.02631463296711445,
395
- "learning_rate": 0.00045086839178366795,
396
- "loss": 0.0316,
397
- "mean_token_accuracy": 0.9909015417098999,
398
- "num_tokens": 5822211.0,
399
  "step": 1025
400
  },
401
  {
402
  "epoch": 2.5310802655401328,
403
- "grad_norm": 0.053898368030786514,
404
- "learning_rate": 0.00044358111961541986,
405
- "loss": 0.0449,
406
- "mean_token_accuracy": 0.9868414753675461,
407
- "num_tokens": 5974286.0,
408
  "step": 1050
409
  },
410
  {
411
  "epoch": 2.591430295715148,
412
- "grad_norm": 0.050944212824106216,
413
- "learning_rate": 0.0004361822446007026,
414
- "loss": 0.0326,
415
- "mean_token_accuracy": 0.9903159868717194,
416
- "num_tokens": 6107693.0,
417
  "step": 1075
418
  },
419
  {
420
  "epoch": 2.651780325890163,
421
- "grad_norm": 0.07088279724121094,
422
- "learning_rate": 0.00042867751774295254,
423
- "loss": 0.0474,
424
- "mean_token_accuracy": 0.9863699376583099,
425
- "num_tokens": 6258918.0,
426
  "step": 1100
427
  },
428
  {
429
  "epoch": 2.712130356065178,
430
- "grad_norm": 0.045055486261844635,
431
- "learning_rate": 0.0004210727723222105,
432
- "loss": 0.0288,
433
- "mean_token_accuracy": 0.9913076066970825,
434
- "num_tokens": 6391301.0,
435
  "step": 1125
436
  },
437
  {
438
  "epoch": 2.772480386240193,
439
- "grad_norm": 0.04561085253953934,
440
- "learning_rate": 0.0004133739193610255,
441
- "loss": 0.0492,
442
- "mean_token_accuracy": 0.9852611935138702,
443
- "num_tokens": 6544010.0,
444
  "step": 1150
445
  },
446
  {
447
  "epoch": 2.832830416415208,
448
- "grad_norm": 0.04456303268671036,
449
- "learning_rate": 0.00040558694302992963,
450
- "loss": 0.0292,
451
- "mean_token_accuracy": 0.9911560428142547,
452
- "num_tokens": 6677793.0,
453
  "step": 1175
454
  },
455
  {
456
  "epoch": 2.8931804465902236,
457
- "grad_norm": 0.034333277493715286,
458
- "learning_rate": 0.00039771789599605845,
459
- "loss": 0.0387,
460
- "mean_token_accuracy": 0.988199480175972,
461
- "num_tokens": 6826843.0,
462
  "step": 1200
463
  },
464
  {
465
  "epoch": 2.9535304767652386,
466
- "grad_norm": 0.03919946029782295,
467
- "learning_rate": 0.0003897728947185279,
468
- "loss": 0.028,
469
- "mean_token_accuracy": 0.9913868033885955,
470
- "num_tokens": 6958293.0,
471
  "step": 1225
472
  },
473
  {
474
  "epoch": 3.0,
475
- "eval_loss": 0.04582400992512703,
476
- "eval_mean_token_accuracy": 0.9875572819967527,
477
  "eval_num_tokens": 7062540.0,
478
- "eval_runtime": 15.7727,
479
- "eval_samples_per_second": 23.395,
480
- "eval_steps_per_second": 11.729,
481
  "step": 1245
482
- },
483
- {
484
- "epoch": 3.012070006035003,
485
- "grad_norm": 0.04459620267152786,
486
- "learning_rate": 0.00038175811469422905,
487
- "loss": 0.04,
488
- "mean_token_accuracy": 0.9881644027749288,
489
- "num_tokens": 7099773.0,
490
- "step": 1250
491
- },
492
- {
493
- "epoch": 3.0724200362100182,
494
- "grad_norm": 0.025668803602457047,
495
- "learning_rate": 0.00037367978565773226,
496
- "loss": 0.027,
497
- "mean_token_accuracy": 0.9916459822654724,
498
- "num_tokens": 7247178.0,
499
- "step": 1275
500
- },
501
- {
502
- "epoch": 3.1327700663850333,
503
- "grad_norm": 0.048161111772060394,
504
- "learning_rate": 0.0003655441867390346,
505
- "loss": 0.0296,
506
- "mean_token_accuracy": 0.990880873799324,
507
- "num_tokens": 7383662.0,
508
- "step": 1300
509
- },
510
- {
511
- "epoch": 3.1931200965600484,
512
- "grad_norm": 0.04576310142874718,
513
- "learning_rate": 0.00035735764158291254,
514
- "loss": 0.0296,
515
- "mean_token_accuracy": 0.9907885307073593,
516
- "num_tokens": 7533651.0,
517
- "step": 1325
518
- },
519
- {
520
- "epoch": 3.2534701267350634,
521
- "grad_norm": 0.04082406684756279,
522
- "learning_rate": 0.0003491265134336745,
523
- "loss": 0.0327,
524
- "mean_token_accuracy": 0.9898776888847352,
525
- "num_tokens": 7671505.0,
526
- "step": 1350
527
- },
528
- {
529
- "epoch": 3.3138201569100785,
530
- "grad_norm": 0.04111408442258835,
531
- "learning_rate": 0.00034085720018913276,
532
- "loss": 0.0264,
533
- "mean_token_accuracy": 0.9915948182344436,
534
- "num_tokens": 7819430.0,
535
- "step": 1375
536
- },
537
- {
538
- "epoch": 3.3741701870850935,
539
- "grad_norm": 0.05790106952190399,
540
- "learning_rate": 0.0003325561294276413,
541
- "loss": 0.0286,
542
- "mean_token_accuracy": 0.9912853974103928,
543
- "num_tokens": 7954433.0,
544
- "step": 1400
545
- },
546
- {
547
- "epoch": 3.4345202172601086,
548
- "grad_norm": 0.027405843138694763,
549
- "learning_rate": 0.00032422975341206157,
550
- "loss": 0.0281,
551
- "mean_token_accuracy": 0.991134095788002,
552
- "num_tokens": 8102229.0,
553
- "step": 1425
554
- },
555
- {
556
- "epoch": 3.4948702474351236,
557
- "grad_norm": 0.043464187532663345,
558
- "learning_rate": 0.000315884544074543,
559
- "loss": 0.0292,
560
- "mean_token_accuracy": 0.9907919150590897,
561
- "num_tokens": 8237057.0,
562
- "step": 1450
563
- },
564
- {
565
- "epoch": 3.5552202776101387,
566
- "grad_norm": 0.04483381658792496,
567
- "learning_rate": 0.0003075269879860149,
568
- "loss": 0.0255,
569
- "mean_token_accuracy": 0.9919383651018143,
570
- "num_tokens": 8385229.0,
571
- "step": 1475
572
- },
573
- {
574
- "epoch": 3.6155703077851538,
575
- "grad_norm": 0.03719889372587204,
576
- "learning_rate": 0.0002991635813142984,
577
- "loss": 0.0305,
578
- "mean_token_accuracy": 0.9901222789287567,
579
- "num_tokens": 8521392.0,
580
- "step": 1500
581
- },
582
- {
583
- "epoch": 3.675920337960169,
584
- "grad_norm": 0.026905681937932968,
585
- "learning_rate": 0.0002908008247747611,
586
- "loss": 0.025,
587
- "mean_token_accuracy": 0.9921067571640014,
588
- "num_tokens": 8669874.0,
589
- "step": 1525
590
- },
591
- {
592
- "epoch": 3.736270368135184,
593
- "grad_norm": 0.05321066826581955,
594
- "learning_rate": 0.00028244521857743467,
595
- "loss": 0.0309,
596
- "mean_token_accuracy": 0.9903623193502427,
597
- "num_tokens": 8804899.0,
598
- "step": 1550
599
- },
600
- {
601
- "epoch": 3.796620398310199,
602
- "grad_norm": 0.02997380495071411,
603
- "learning_rate": 0.00027410325737452793,
604
- "loss": 0.0247,
605
- "mean_token_accuracy": 0.9921429508924484,
606
- "num_tokens": 8952948.0,
607
- "step": 1575
608
- },
609
- {
610
- "epoch": 3.856970428485214,
611
- "grad_norm": 0.042058832943439484,
612
- "learning_rate": 0.0002657814252122571,
613
- "loss": 0.0276,
614
- "mean_token_accuracy": 0.9911820942163467,
615
- "num_tokens": 9088804.0,
616
- "step": 1600
617
- },
618
- {
619
- "epoch": 3.9173204586602295,
620
- "grad_norm": 0.028029246255755424,
621
- "learning_rate": 0.00025748619049092167,
622
- "loss": 0.0246,
623
- "mean_token_accuracy": 0.9920519244670868,
624
- "num_tokens": 9237788.0,
625
- "step": 1625
626
- },
627
- {
628
- "epoch": 3.9776704888352445,
629
- "grad_norm": 0.02371133305132389,
630
- "learning_rate": 0.0002492240009371417,
631
- "loss": 0.0245,
632
- "mean_token_accuracy": 0.9923285436630249,
633
- "num_tokens": 9369559.0,
634
- "step": 1650
635
- },
636
- {
637
- "epoch": 4.0,
638
- "eval_loss": 0.041972678154706955,
639
- "eval_mean_token_accuracy": 0.989047556632274,
640
- "eval_num_tokens": 9416720.0,
641
- "eval_runtime": 15.7616,
642
- "eval_samples_per_second": 23.411,
643
- "eval_steps_per_second": 11.737,
644
- "step": 1660
645
- },
646
- {
647
- "epoch": 4.036210018105009,
648
- "grad_norm": 0.027629472315311432,
649
- "learning_rate": 0.00024100127859216246,
650
- "loss": 0.0242,
651
- "mean_token_accuracy": 0.9923697267611002,
652
- "num_tokens": 9517456.0,
653
- "step": 1675
654
- },
655
- {
656
- "epoch": 4.096560048280024,
657
- "grad_norm": 0.018458090722560883,
658
- "learning_rate": 0.0002328244148201266,
659
- "loss": 0.0198,
660
- "mean_token_accuracy": 0.9934357500076294,
661
- "num_tokens": 9655983.0,
662
- "step": 1700
663
- },
664
- {
665
- "epoch": 4.15691007845504,
666
- "grad_norm": 0.025151990354061127,
667
- "learning_rate": 0.0002246997653401883,
668
- "loss": 0.0252,
669
- "mean_token_accuracy": 0.9922802877426148,
670
- "num_tokens": 9802927.0,
671
- "step": 1725
672
- },
673
- {
674
- "epoch": 4.217260108630055,
675
- "grad_norm": 0.02359866164624691,
676
- "learning_rate": 0.00021663364528633574,
677
- "loss": 0.0192,
678
- "mean_token_accuracy": 0.9935739403963089,
679
- "num_tokens": 9939784.0,
680
- "step": 1750
681
- },
682
- {
683
- "epoch": 4.27761013880507,
684
- "grad_norm": 0.029893064871430397,
685
- "learning_rate": 0.00020863232429875822,
686
- "loss": 0.0219,
687
- "mean_token_accuracy": 0.9929185563325882,
688
- "num_tokens": 10085314.0,
689
- "step": 1775
690
- },
691
- {
692
- "epoch": 4.337960168980085,
693
- "grad_norm": 0.01731249690055847,
694
- "learning_rate": 0.00020070202165057554,
695
- "loss": 0.0191,
696
- "mean_token_accuracy": 0.9936865222454071,
697
- "num_tokens": 10223073.0,
698
- "step": 1800
699
- },
700
- {
701
- "epoch": 4.3983101991551,
702
- "grad_norm": 0.01831655018031597,
703
- "learning_rate": 0.00019284890141371618,
704
- "loss": 0.0228,
705
- "mean_token_accuracy": 0.992409136891365,
706
- "num_tokens": 10368536.0,
707
- "step": 1825
708
- },
709
- {
710
- "epoch": 4.458660229330115,
711
- "grad_norm": 0.018383309245109558,
712
- "learning_rate": 0.00018507906766770314,
713
- "loss": 0.0186,
714
- "mean_token_accuracy": 0.99377023935318,
715
- "num_tokens": 10507164.0,
716
- "step": 1850
717
- },
718
- {
719
- "epoch": 4.51901025950513,
720
- "grad_norm": 0.022526893764734268,
721
- "learning_rate": 0.00017739855975506917,
722
- "loss": 0.024,
723
- "mean_token_accuracy": 0.9923234033584595,
724
- "num_tokens": 10653645.0,
725
- "step": 1875
726
- },
727
- {
728
- "epoch": 4.579360289680145,
729
- "grad_norm": 0.01562182791531086,
730
- "learning_rate": 0.00016981334758709322,
731
- "loss": 0.0187,
732
- "mean_token_accuracy": 0.9941441065073013,
733
- "num_tokens": 10791134.0,
734
- "step": 1900
735
- },
736
- {
737
- "epoch": 4.63971031985516,
738
- "grad_norm": 0.02683549001812935,
739
- "learning_rate": 0.00016232932700350157,
740
- "loss": 0.0214,
741
- "mean_token_accuracy": 0.9926981467008591,
742
- "num_tokens": 10938557.0,
743
- "step": 1925
744
- },
745
- {
746
- "epoch": 4.700060350030175,
747
- "grad_norm": 0.03423510119318962,
748
- "learning_rate": 0.00015495231518974608,
749
- "loss": 0.0193,
750
- "mean_token_accuracy": 0.9936627286672592,
751
- "num_tokens": 11076634.0,
752
- "step": 1950
753
- },
754
- {
755
- "epoch": 4.76041038020519,
756
- "grad_norm": 0.029518209397792816,
757
- "learning_rate": 0.000147688046155417,
758
- "loss": 0.0226,
759
- "mean_token_accuracy": 0.9926369667053223,
760
- "num_tokens": 11224638.0,
761
- "step": 1975
762
- },
763
- {
764
- "epoch": 4.820760410380205,
765
- "grad_norm": 0.021354857832193375,
766
- "learning_rate": 0.00014054216627730755,
767
- "loss": 0.0188,
768
- "mean_token_accuracy": 0.9936299502849579,
769
- "num_tokens": 11362368.0,
770
- "step": 2000
771
- },
772
- {
773
- "epoch": 4.88111044055522,
774
- "grad_norm": 0.033567801117897034,
775
- "learning_rate": 0.00013352022991059375,
776
- "loss": 0.0236,
777
- "mean_token_accuracy": 0.9924637532234192,
778
- "num_tokens": 11509364.0,
779
- "step": 2025
780
- },
781
- {
782
- "epoch": 4.941460470730235,
783
- "grad_norm": 0.02483433112502098,
784
- "learning_rate": 0.00012662769507154113,
785
- "loss": 0.0183,
786
- "mean_token_accuracy": 0.9939434814453125,
787
- "num_tokens": 11645477.0,
788
- "step": 2050
789
- },
790
- {
791
- "epoch": 5.0,
792
- "grad_norm": 0.04451137036085129,
793
- "learning_rate": 0.00011986991919509261,
794
- "loss": 0.0203,
795
- "mean_token_accuracy": 0.9933847133646306,
796
- "num_tokens": 11770900.0,
797
- "step": 2075
798
- },
799
- {
800
- "epoch": 5.0,
801
- "eval_loss": 0.04097144305706024,
802
- "eval_mean_token_accuracy": 0.9897108348640236,
803
- "eval_num_tokens": 11770900.0,
804
- "eval_runtime": 15.7693,
805
- "eval_samples_per_second": 23.4,
806
- "eval_steps_per_second": 11.732,
807
- "step": 2075
808
- },
809
- {
810
- "epoch": 5.060350030175015,
811
- "grad_norm": 0.01309128850698471,
812
- "learning_rate": 0.00011325215497063858,
813
- "loss": 0.0169,
814
- "mean_token_accuracy": 0.9943383944034576,
815
- "num_tokens": 11929319.0,
816
- "step": 2100
817
- },
818
- {
819
- "epoch": 5.12070006035003,
820
- "grad_norm": 0.018124833703041077,
821
- "learning_rate": 0.00010677954625920108,
822
- "loss": 0.0165,
823
- "mean_token_accuracy": 0.994610543847084,
824
- "num_tokens": 12054482.0,
825
- "step": 2125
826
- },
827
- {
828
- "epoch": 5.181050090525045,
829
- "grad_norm": 0.014541085809469223,
830
- "learning_rate": 0.00010045712409521008,
831
- "loss": 0.017,
832
- "mean_token_accuracy": 0.9943281805515289,
833
- "num_tokens": 12211531.0,
834
- "step": 2150
835
- },
836
- {
837
- "epoch": 5.24140012070006,
838
- "grad_norm": 0.024412041530013084,
839
- "learning_rate": 9.428980277597675e-05,
840
- "loss": 0.0164,
841
- "mean_token_accuracy": 0.9944160294532776,
842
- "num_tokens": 12338886.0,
843
- "step": 2175
844
- },
845
- {
846
- "epoch": 5.301750150875075,
847
- "grad_norm": 0.01901293359696865,
848
- "learning_rate": 8.828237604190513e-05,
849
- "loss": 0.0175,
850
- "mean_token_accuracy": 0.9941371762752533,
851
- "num_tokens": 12495549.0,
852
- "step": 2200
853
- },
854
- {
855
- "epoch": 5.36210018105009,
856
- "grad_norm": 0.017199428752064705,
857
- "learning_rate": 8.243951335040891e-05,
858
- "loss": 0.0166,
859
- "mean_token_accuracy": 0.9944494879245758,
860
- "num_tokens": 12622459.0,
861
- "step": 2225
862
- },
863
- {
864
- "epoch": 5.422450211225105,
865
- "grad_norm": 0.03323078528046608,
866
- "learning_rate": 7.67657562464325e-05,
867
- "loss": 0.0174,
868
- "mean_token_accuracy": 0.9939983171224595,
869
- "num_tokens": 12780223.0,
870
- "step": 2250
871
- },
872
- {
873
- "epoch": 5.4828002414001205,
874
- "grad_norm": 0.02575470507144928,
875
- "learning_rate": 7.126551483239509e-05,
876
- "loss": 0.0165,
877
- "mean_token_accuracy": 0.9944772917032242,
878
- "num_tokens": 12907099.0,
879
- "step": 2275
880
- },
881
- {
882
- "epoch": 5.5431502715751355,
883
- "grad_norm": 0.01993207074701786,
884
- "learning_rate": 6.59430643403031e-05,
885
- "loss": 0.0172,
886
- "mean_token_accuracy": 0.9943294197320938,
887
- "num_tokens": 13065287.0,
888
- "step": 2300
889
- },
890
- {
891
- "epoch": 5.603500301750151,
892
- "grad_norm": 0.021883873268961906,
893
- "learning_rate": 6.0802541808693984e-05,
894
- "loss": 0.0167,
895
- "mean_token_accuracy": 0.9944075202941894,
896
- "num_tokens": 13191570.0,
897
- "step": 2325
898
- },
899
- {
900
- "epoch": 5.663850331925166,
901
- "grad_norm": 0.02263251692056656,
902
- "learning_rate": 5.584794286699583e-05,
903
- "loss": 0.017,
904
- "mean_token_accuracy": 0.9944959133863449,
905
- "num_tokens": 13348858.0,
906
- "step": 2350
907
- },
908
- {
909
- "epoch": 5.724200362100181,
910
- "grad_norm": 0.01838994212448597,
911
- "learning_rate": 5.10831186298017e-05,
912
- "loss": 0.0165,
913
- "mean_token_accuracy": 0.994613738656044,
914
- "num_tokens": 13475838.0,
915
- "step": 2375
916
- },
917
- {
918
- "epoch": 5.784550392275197,
919
- "grad_norm": 0.015676449984312057,
920
- "learning_rate": 4.6511772703471414e-05,
921
- "loss": 0.0169,
922
- "mean_token_accuracy": 0.9943792551755906,
923
- "num_tokens": 13634866.0,
924
- "step": 2400
925
- },
926
- {
927
- "epoch": 5.844900422450211,
928
- "grad_norm": 0.024185990914702415,
929
- "learning_rate": 4.2137458307390404e-05,
930
- "loss": 0.0166,
931
- "mean_token_accuracy": 0.9944125992059708,
932
- "num_tokens": 13761458.0,
933
- "step": 2425
934
- },
935
- {
936
- "epoch": 5.905250452625227,
937
- "grad_norm": 0.017580190673470497,
938
- "learning_rate": 3.796357551211986e-05,
939
- "loss": 0.0163,
940
- "mean_token_accuracy": 0.9945475596189499,
941
- "num_tokens": 13918561.0,
942
- "step": 2450
943
- },
944
- {
945
- "epoch": 5.965600482800241,
946
- "grad_norm": 0.018194038420915604,
947
- "learning_rate": 3.399336859658742e-05,
948
- "loss": 0.016,
949
- "mean_token_accuracy": 0.9947626197338104,
950
- "num_tokens": 14046227.0,
951
- "step": 2475
952
- },
953
- {
954
- "epoch": 6.0,
955
- "eval_loss": 0.04244406521320343,
956
- "eval_mean_token_accuracy": 0.9900913918340528,
957
- "eval_num_tokens": 14125080.0,
958
- "eval_runtime": 15.774,
959
- "eval_samples_per_second": 23.393,
960
- "eval_steps_per_second": 11.728,
961
- "step": 2490
962
- },
963
- {
964
- "epoch": 6.024140012070006,
965
- "grad_norm": 0.019757593050599098,
966
- "learning_rate": 3.0229923526371535e-05,
967
- "loss": 0.0165,
968
- "mean_token_accuracy": 0.9946722658639101,
969
- "num_tokens": 14194026.0,
970
- "step": 2500
971
- },
972
- {
973
- "epoch": 6.084490042245021,
974
- "grad_norm": 0.015921050682663918,
975
- "learning_rate": 2.667616555503964e-05,
976
- "loss": 0.0139,
977
- "mean_token_accuracy": 0.9953104478120803,
978
- "num_tokens": 14337164.0,
979
- "step": 2525
980
- },
981
- {
982
- "epoch": 6.1448400724200365,
983
- "grad_norm": 0.01795029267668724,
984
- "learning_rate": 2.333485695040469e-05,
985
- "loss": 0.0153,
986
- "mean_token_accuracy": 0.9950164467096329,
987
- "num_tokens": 14478393.0,
988
- "step": 2550
989
- },
990
- {
991
- "epoch": 6.2051901025950515,
992
- "grad_norm": 0.01615080237388611,
993
- "learning_rate": 2.0208594847467508e-05,
994
- "loss": 0.0142,
995
- "mean_token_accuracy": 0.9951647013425827,
996
- "num_tokens": 14620740.0,
997
- "step": 2575
998
- },
999
- {
1000
- "epoch": 6.265540132770067,
1001
- "grad_norm": 0.02364126406610012,
1002
- "learning_rate": 1.729980922971349e-05,
1003
- "loss": 0.0154,
1004
- "mean_token_accuracy": 0.9948040336370468,
1005
- "num_tokens": 14762878.0,
1006
- "step": 2600
1007
- },
1008
- {
1009
- "epoch": 6.325890162945082,
1010
- "grad_norm": 0.018266433849930763,
1011
- "learning_rate": 1.4610761040333573e-05,
1012
- "loss": 0.0142,
1013
- "mean_token_accuracy": 0.9954759681224823,
1014
- "num_tokens": 14905328.0,
1015
- "step": 2625
1016
- },
1017
- {
1018
- "epoch": 6.386240193120097,
1019
- "grad_norm": 0.022892849519848824,
1020
- "learning_rate": 1.214354042483573e-05,
1021
- "loss": 0.0148,
1022
- "mean_token_accuracy": 0.9951095223426819,
1023
- "num_tokens": 15047420.0,
1024
- "step": 2650
1025
- },
1026
- {
1027
- "epoch": 6.446590223295112,
1028
- "grad_norm": 0.013281609863042831,
1029
- "learning_rate": 9.900065106415866e-06,
1030
- "loss": 0.0139,
1031
- "mean_token_accuracy": 0.9951714134216308,
1032
- "num_tokens": 15191784.0,
1033
- "step": 2675
1034
- },
1035
- {
1036
- "epoch": 6.506940253470127,
1037
- "grad_norm": 0.015181739814579487,
1038
- "learning_rate": 7.882078895347798e-06,
1039
- "loss": 0.016,
1040
- "mean_token_accuracy": 0.9947543793916702,
1041
- "num_tokens": 15336195.0,
1042
- "step": 2700
1043
- },
1044
- {
1045
- "epoch": 6.567290283645142,
1046
- "grad_norm": 0.013374337926506996,
1047
- "learning_rate": 6.091150333553274e-06,
1048
- "loss": 0.0136,
1049
- "mean_token_accuracy": 0.995315499305725,
1050
- "num_tokens": 15480763.0,
1051
- "step": 2725
1052
- },
1053
- {
1054
- "epoch": 6.627640313820157,
1055
- "grad_norm": 0.014570921659469604,
1056
- "learning_rate": 4.528671475404433e-06,
1057
- "loss": 0.0152,
1058
- "mean_token_accuracy": 0.9951956886053085,
1059
- "num_tokens": 15623624.0,
1060
- "step": 2750
1061
- },
1062
- {
1063
- "epoch": 6.687990343995172,
1064
- "grad_norm": 0.0181302297860384,
1065
- "learning_rate": 3.1958568057067313e-06,
1066
- "loss": 0.0142,
1067
- "mean_token_accuracy": 0.9953425723314285,
1068
- "num_tokens": 15765653.0,
1069
- "step": 2775
1070
- },
1071
- {
1072
- "epoch": 6.748340374170187,
1073
- "grad_norm": 0.017040640115737915,
1074
- "learning_rate": 2.093742295703127e-06,
1075
- "loss": 0.0148,
1076
- "mean_token_accuracy": 0.9950855672359467,
1077
- "num_tokens": 15905979.0,
1078
- "step": 2800
1079
- },
1080
- {
1081
- "epoch": 6.808690404345202,
1082
- "grad_norm": 0.016313739120960236,
1083
- "learning_rate": 1.2231845978335708e-06,
1084
- "loss": 0.0141,
1085
- "mean_token_accuracy": 0.9952906262874603,
1086
- "num_tokens": 16047220.0,
1087
- "step": 2825
1088
- },
1089
- {
1090
- "epoch": 6.869040434520217,
1091
- "grad_norm": 0.0156137989833951,
1092
- "learning_rate": 5.848603798755402e-07,
1093
- "loss": 0.0152,
1094
- "mean_token_accuracy": 0.9951014250516892,
1095
- "num_tokens": 16187543.0,
1096
- "step": 2850
1097
- },
1098
- {
1099
- "epoch": 6.929390464695232,
1100
- "grad_norm": 0.01440385077148676,
1101
- "learning_rate": 1.7926579898319693e-07,
1102
- "loss": 0.014,
1103
- "mean_token_accuracy": 0.9952051192522049,
1104
- "num_tokens": 16328399.0,
1105
- "step": 2875
1106
- },
1107
- {
1108
- "epoch": 6.989740494870247,
1109
- "grad_norm": 0.014613240025937557,
1110
- "learning_rate": 6.716116033844699e-09,
1111
- "loss": 0.0148,
1112
- "mean_token_accuracy": 0.9950145679712296,
1113
- "num_tokens": 16459625.0,
1114
- "step": 2900
1115
- },
1116
- {
1117
- "epoch": 7.0,
1118
- "eval_loss": 0.04441880062222481,
1119
- "eval_mean_token_accuracy": 0.990020250307547,
1120
- "eval_num_tokens": 16479260.0,
1121
- "eval_runtime": 15.7702,
1122
- "eval_samples_per_second": 23.398,
1123
- "eval_steps_per_second": 11.731,
1124
- "step": 2905
1125
  }
1126
  ],
1127
  "logging_steps": 25,
@@ -1136,12 +493,12 @@
1136
  "should_evaluate": false,
1137
  "should_log": false,
1138
  "should_save": true,
1139
- "should_training_stop": true
1140
  },
1141
  "attributes": {}
1142
  }
1143
  },
1144
- "total_flos": 7.159071108309658e+17,
1145
  "train_batch_size": 2,
1146
  "trial_name": null,
1147
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 500,
7
+ "global_step": 1245,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.060350030175015085,
14
+ "grad_norm": 0.3043883144855499,
15
+ "learning_rate": 0.00010909090909090909,
16
+ "loss": 1.8586,
17
+ "mean_token_accuracy": 0.6245462906360626,
18
+ "num_tokens": 157786.0,
19
  "step": 25
20
  },
21
  {
22
  "epoch": 0.12070006035003017,
23
+ "grad_norm": 0.24695894122123718,
24
+ "learning_rate": 0.00022272727272727272,
25
+ "loss": 0.9383,
26
+ "mean_token_accuracy": 0.7651280963420868,
27
+ "num_tokens": 283437.0,
28
  "step": 50
29
  },
30
  {
31
  "epoch": 0.18105009052504525,
32
+ "grad_norm": 0.19023272395133972,
33
+ "learning_rate": 0.0003363636363636364,
34
+ "loss": 0.649,
35
+ "mean_token_accuracy": 0.8234076803922653,
36
+ "num_tokens": 442299.0,
37
  "step": 75
38
  },
39
  {
40
  "epoch": 0.24140012070006034,
41
+ "grad_norm": 0.22401368618011475,
42
+ "learning_rate": 0.0003999849510577617,
43
+ "loss": 0.5125,
44
+ "mean_token_accuracy": 0.8562434083223343,
45
+ "num_tokens": 569621.0,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.30175015087507545,
50
+ "grad_norm": 0.23774276673793793,
51
+ "learning_rate": 0.0003998388342637495,
52
+ "loss": 0.3972,
53
+ "mean_token_accuracy": 0.8866757136583329,
54
+ "num_tokens": 725959.0,
55
  "step": 125
56
  },
57
  {
58
  "epoch": 0.3621001810500905,
59
+ "grad_norm": 0.3460250496864319,
60
+ "learning_rate": 0.00039953738658223166,
61
+ "loss": 0.3554,
62
+ "mean_token_accuracy": 0.9000710541009903,
63
+ "num_tokens": 851583.0,
64
  "step": 150
65
  },
66
  {
67
  "epoch": 0.4224502112251056,
68
+ "grad_norm": 0.2395441234111786,
69
+ "learning_rate": 0.00039908084232270096,
70
+ "loss": 0.2503,
71
+ "mean_token_accuracy": 0.9276190227270127,
72
+ "num_tokens": 1010006.0,
73
  "step": 175
74
  },
75
  {
76
  "epoch": 0.4828002414001207,
77
+ "grad_norm": 0.28712666034698486,
78
+ "learning_rate": 0.00039846955634824144,
79
+ "loss": 0.2602,
80
+ "mean_token_accuracy": 0.9267517280578613,
81
+ "num_tokens": 1137732.0,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.5431502715751357,
86
+ "grad_norm": 0.17056156694889069,
87
+ "learning_rate": 0.00039770400379969973,
88
+ "loss": 0.1851,
89
+ "mean_token_accuracy": 0.9475770330429077,
90
+ "num_tokens": 1294237.0,
91
  "step": 225
92
  },
93
  {
94
  "epoch": 0.6035003017501509,
95
+ "grad_norm": 0.21380072832107544,
96
+ "learning_rate": 0.00039678477972636774,
97
+ "loss": 0.1622,
98
+ "mean_token_accuracy": 0.9546336072683335,
99
+ "num_tokens": 1419338.0,
100
  "step": 250
101
  },
102
  {
103
  "epoch": 0.663850331925166,
104
+ "grad_norm": 0.20465914905071259,
105
+ "learning_rate": 0.00039571259862346183,
106
+ "loss": 0.1329,
107
+ "mean_token_accuracy": 0.961662837266922,
108
+ "num_tokens": 1576946.0,
109
  "step": 275
110
  },
111
  {
112
  "epoch": 0.724200362100181,
113
+ "grad_norm": 0.2113582342863083,
114
+ "learning_rate": 0.00039448829387675954,
115
+ "loss": 0.1287,
116
+ "mean_token_accuracy": 0.9641147536039353,
117
+ "num_tokens": 1703683.0,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.7845503922751962,
122
+ "grad_norm": 0.15074850618839264,
123
+ "learning_rate": 0.0003931128171148249,
124
+ "loss": 0.1232,
125
+ "mean_token_accuracy": 0.9665374368429184,
126
+ "num_tokens": 1860622.0,
127
  "step": 325
128
  },
129
  {
130
  "epoch": 0.8449004224502112,
131
+ "grad_norm": 0.4272288978099823,
132
+ "learning_rate": 0.00039158723746932566,
133
+ "loss": 0.0974,
134
+ "mean_token_accuracy": 0.9736644911766053,
135
+ "num_tokens": 1987884.0,
136
  "step": 350
137
  },
138
  {
139
  "epoch": 0.9052504526252263,
140
+ "grad_norm": 0.13680922985076904,
141
+ "learning_rate": 0.00038991274074401806,
142
+ "loss": 0.1007,
143
+ "mean_token_accuracy": 0.9723848593235016,
144
+ "num_tokens": 2145729.0,
145
  "step": 375
146
  },
147
  {
148
  "epoch": 0.9656004828002414,
149
+ "grad_norm": 0.15923215448856354,
150
+ "learning_rate": 0.00038809062849304407,
151
+ "loss": 0.0747,
152
+ "mean_token_accuracy": 0.9784966939687729,
153
+ "num_tokens": 2273076.0,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 1.0,
158
+ "eval_loss": 0.09010029584169388,
159
+ "eval_mean_token_accuracy": 0.9750096942927386,
160
  "eval_num_tokens": 2354180.0,
161
+ "eval_runtime": 15.8229,
162
+ "eval_samples_per_second": 23.321,
163
+ "eval_steps_per_second": 11.692,
164
  "step": 415
165
  },
166
  {
167
  "epoch": 1.024140012070006,
168
+ "grad_norm": 0.12324528396129608,
169
+ "learning_rate": 0.0003861223170092585,
170
+ "loss": 0.0923,
171
+ "mean_token_accuracy": 0.9740137457847595,
172
+ "num_tokens": 2422803.0,
173
  "step": 425
174
  },
175
  {
176
  "epoch": 1.0844900422450212,
177
+ "grad_norm": 0.21983321011066437,
178
+ "learning_rate": 0.00038400933622337167,
179
+ "loss": 0.0619,
180
+ "mean_token_accuracy": 0.9822656351327896,
181
+ "num_tokens": 2563793.0,
182
  "step": 450
183
  },
184
  {
185
  "epoch": 1.1448400724200363,
186
+ "grad_norm": 0.11064327508211136,
187
+ "learning_rate": 0.00038175332851476387,
188
+ "loss": 0.075,
189
+ "mean_token_accuracy": 0.9794844657182693,
190
+ "num_tokens": 2706197.0,
191
  "step": 475
192
  },
193
  {
194
  "epoch": 1.2051901025950513,
195
+ "grad_norm": 0.1068505346775055,
196
+ "learning_rate": 0.00037935604743489506,
197
+ "loss": 0.0544,
198
+ "mean_token_accuracy": 0.9840904027223587,
199
+ "num_tokens": 2850268.0,
200
  "step": 500
201
  },
202
  {
203
  "epoch": 1.2655401327700664,
204
+ "grad_norm": 0.15822124481201172,
205
+ "learning_rate": 0.00037681935634430327,
206
+ "loss": 0.078,
207
+ "mean_token_accuracy": 0.9782475352287292,
208
+ "num_tokens": 2992275.0,
209
  "step": 525
210
  },
211
  {
212
  "epoch": 1.3258901629450814,
213
+ "grad_norm": 0.10555300116539001,
214
+ "learning_rate": 0.0003741452269642502,
215
+ "loss": 0.0542,
216
+ "mean_token_accuracy": 0.9846927672624588,
217
+ "num_tokens": 3135599.0,
218
  "step": 550
219
  },
220
  {
221
  "epoch": 1.3862401931200965,
222
+ "grad_norm": 0.07533632218837738,
223
+ "learning_rate": 0.0003713357378441402,
224
+ "loss": 0.0641,
225
+ "mean_token_accuracy": 0.9822721928358078,
226
+ "num_tokens": 3278124.0,
227
  "step": 575
228
  },
229
  {
230
  "epoch": 1.4465902232951118,
231
+ "grad_norm": 0.1403859406709671,
232
+ "learning_rate": 0.00036839307274590355,
233
+ "loss": 0.0491,
234
+ "mean_token_accuracy": 0.985781243443489,
235
+ "num_tokens": 3421679.0,
236
  "step": 600
237
  },
238
  {
239
  "epoch": 1.5069402534701268,
240
+ "grad_norm": 0.10379917174577713,
241
+ "learning_rate": 0.00036531951894660034,
242
+ "loss": 0.069,
243
+ "mean_token_accuracy": 0.9809466338157654,
244
+ "num_tokens": 3564486.0,
245
  "step": 625
246
  },
247
  {
248
  "epoch": 1.567290283645142,
249
+ "grad_norm": 0.09850325435400009,
250
+ "learning_rate": 0.00036211746546056415,
251
+ "loss": 0.0516,
252
+ "mean_token_accuracy": 0.9853906160593033,
253
+ "num_tokens": 3707529.0,
254
  "step": 650
255
  },
256
  {
257
  "epoch": 1.627640313820157,
258
+ "grad_norm": 0.11964337527751923,
259
+ "learning_rate": 0.00035878940118246673,
260
+ "loss": 0.0596,
261
+ "mean_token_accuracy": 0.9836755973100663,
262
+ "num_tokens": 3848570.0,
263
  "step": 675
264
  },
265
  {
266
  "epoch": 1.687990343995172,
267
+ "grad_norm": 0.05955597385764122,
268
+ "learning_rate": 0.00035533791295274834,
269
+ "loss": 0.0428,
270
+ "mean_token_accuracy": 0.9875983273983002,
271
+ "num_tokens": 3990912.0,
272
  "step": 700
273
  },
274
  {
275
  "epoch": 1.748340374170187,
276
+ "grad_norm": 0.07114022970199585,
277
+ "learning_rate": 0.0003517656835469161,
278
+ "loss": 0.0627,
279
+ "mean_token_accuracy": 0.9830698877573013,
280
+ "num_tokens": 4131622.0,
281
  "step": 725
282
  },
283
  {
284
  "epoch": 1.8086904043452021,
285
+ "grad_norm": 0.09389431029558182,
286
+ "learning_rate": 0.0003480754895902742,
287
+ "loss": 0.0459,
288
+ "mean_token_accuracy": 0.9864954763650894,
289
+ "num_tokens": 4273640.0,
290
  "step": 750
291
  },
292
  {
293
  "epoch": 1.8690404345202172,
294
+ "grad_norm": 0.06497396528720856,
295
+ "learning_rate": 0.0003442701993997064,
296
+ "loss": 0.0577,
297
+ "mean_token_accuracy": 0.9838357955217362,
298
+ "num_tokens": 4414471.0,
299
  "step": 775
300
  },
301
  {
302
  "epoch": 1.9293904646952322,
303
+ "grad_norm": 0.11333642154932022,
304
+ "learning_rate": 0.00034035277075418854,
305
+ "loss": 0.0431,
306
+ "mean_token_accuracy": 0.9871519947052002,
307
+ "num_tokens": 4555376.0,
308
  "step": 800
309
  },
310
  {
311
  "epoch": 1.9897404948702473,
312
+ "grad_norm": 0.05740824714303017,
313
+ "learning_rate": 0.0003363262485957633,
314
+ "loss": 0.0418,
315
+ "mean_token_accuracy": 0.9883395010232925,
316
+ "num_tokens": 4688406.0,
317
  "step": 825
318
  },
319
  {
320
  "epoch": 2.0,
321
+ "eval_loss": 0.053012676537036896,
322
+ "eval_mean_token_accuracy": 0.9854460068651147,
323
  "eval_num_tokens": 4708360.0,
324
+ "eval_runtime": 15.7874,
325
+ "eval_samples_per_second": 23.373,
326
+ "eval_steps_per_second": 11.718,
327
  "step": 830
328
  },
329
  {
330
  "epoch": 2.048280024140012,
331
+ "grad_norm": 0.09828540682792664,
332
+ "learning_rate": 0.00033219376266276594,
333
+ "loss": 0.0495,
334
+ "mean_token_accuracy": 0.9852321922164602,
335
+ "num_tokens": 4836946.0,
336
  "step": 850
337
  },
338
  {
339
  "epoch": 2.1086300543150274,
340
+ "grad_norm": 0.10078238695859909,
341
+ "learning_rate": 0.00032795852505713806,
342
+ "loss": 0.0313,
343
+ "mean_token_accuracy": 0.9908820760250091,
344
+ "num_tokens": 4968203.0,
345
  "step": 875
346
  },
347
  {
348
  "epoch": 2.1689800844900424,
349
+ "grad_norm": 0.07217393070459366,
350
+ "learning_rate": 0.0003236238277477231,
351
+ "loss": 0.0471,
352
+ "mean_token_accuracy": 0.985995357632637,
353
+ "num_tokens": 5118541.0,
354
  "step": 900
355
  },
356
  {
357
  "epoch": 2.2293301146650575,
358
+ "grad_norm": 0.09504982829093933,
359
+ "learning_rate": 0.0003191930400114816,
360
+ "loss": 0.0322,
361
+ "mean_token_accuracy": 0.9904332131147384,
362
+ "num_tokens": 5251007.0,
363
  "step": 925
364
  },
365
  {
366
  "epoch": 2.2896801448400725,
367
+ "grad_norm": 0.04393278807401657,
368
+ "learning_rate": 0.0003146696058146176,
369
+ "loss": 0.0481,
370
+ "mean_token_accuracy": 0.9860882490873337,
371
+ "num_tokens": 5403517.0,
372
  "step": 950
373
  },
374
  {
375
  "epoch": 2.3500301750150876,
376
+ "grad_norm": 0.12678726017475128,
377
+ "learning_rate": 0.00031005704113564917,
378
+ "loss": 0.0349,
379
+ "mean_token_accuracy": 0.9900296354293823,
380
+ "num_tokens": 5536492.0,
381
  "step": 975
382
  },
383
  {
384
  "epoch": 2.4103802051901027,
385
+ "grad_norm": 0.06465219706296921,
386
+ "learning_rate": 0.00030535893123250635,
387
+ "loss": 0.0484,
388
+ "mean_token_accuracy": 0.9857117992639541,
389
+ "num_tokens": 5688657.0,
390
  "step": 1000
391
  },
392
  {
393
  "epoch": 2.4707302353651177,
394
+ "grad_norm": 0.07911183685064316,
395
+ "learning_rate": 0.00030057892785577867,
396
+ "loss": 0.0313,
397
+ "mean_token_accuracy": 0.9907743036746979,
398
+ "num_tokens": 5821424.0,
399
  "step": 1025
400
  },
401
  {
402
  "epoch": 2.5310802655401328,
403
+ "grad_norm": 0.04166734963655472,
404
+ "learning_rate": 0.00029572074641027996,
405
+ "loss": 0.0448,
406
+ "mean_token_accuracy": 0.9870600712299347,
407
+ "num_tokens": 5970144.0,
408
  "step": 1050
409
  },
410
  {
411
  "epoch": 2.591430295715148,
412
+ "grad_norm": 0.09666065871715546,
413
+ "learning_rate": 0.0002907881630671351,
414
+ "loss": 0.0301,
415
+ "mean_token_accuracy": 0.9908553779125213,
416
+ "num_tokens": 6103689.0,
417
  "step": 1075
418
  },
419
  {
420
  "epoch": 2.651780325890163,
421
+ "grad_norm": 0.07375594228506088,
422
+ "learning_rate": 0.00028578501182863507,
423
+ "loss": 0.0425,
424
+ "mean_token_accuracy": 0.9875227802991867,
425
+ "num_tokens": 6256525.0,
426
  "step": 1100
427
  },
428
  {
429
  "epoch": 2.712130356065178,
430
+ "grad_norm": 0.086298368871212,
431
+ "learning_rate": 0.00028071518154814036,
432
+ "loss": 0.0323,
433
+ "mean_token_accuracy": 0.9902477955818176,
434
+ "num_tokens": 6390348.0,
435
  "step": 1125
436
  },
437
  {
438
  "epoch": 2.772480386240193,
439
+ "grad_norm": 0.03587740287184715,
440
+ "learning_rate": 0.0002755826129073503,
441
+ "loss": 0.0427,
442
+ "mean_token_accuracy": 0.9872915095090866,
443
+ "num_tokens": 6540900.0,
444
  "step": 1150
445
  },
446
  {
447
  "epoch": 2.832830416415208,
448
+ "grad_norm": 0.08814697712659836,
449
+ "learning_rate": 0.00027039129535328646,
450
+ "loss": 0.0328,
451
+ "mean_token_accuracy": 0.990228921175003,
452
+ "num_tokens": 6673531.0,
453
  "step": 1175
454
  },
455
  {
456
  "epoch": 2.8931804465902236,
457
+ "grad_norm": 0.054661527276039124,
458
+ "learning_rate": 0.00026514526399737235,
459
+ "loss": 0.0406,
460
+ "mean_token_accuracy": 0.9875594407320023,
461
+ "num_tokens": 6826045.0,
462
  "step": 1200
463
  },
464
  {
465
  "epoch": 2.9535304767652386,
466
+ "grad_norm": 0.04500816389918327,
467
+ "learning_rate": 0.00025984859647901865,
468
+ "loss": 0.0282,
469
+ "mean_token_accuracy": 0.9915571695566178,
470
+ "num_tokens": 6959176.0,
471
  "step": 1225
472
  },
473
  {
474
  "epoch": 3.0,
475
+ "eval_loss": 0.04558952525258064,
476
+ "eval_mean_token_accuracy": 0.9875019208804982,
477
  "eval_num_tokens": 7062540.0,
478
+ "eval_runtime": 15.8013,
479
+ "eval_samples_per_second": 23.352,
480
+ "eval_steps_per_second": 11.708,
481
  "step": 1245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482
  }
483
  ],
484
  "logging_steps": 25,
 
493
  "should_evaluate": false,
494
  "should_log": false,
495
  "should_save": true,
496
+ "should_training_stop": false
497
  },
498
  "attributes": {}
499
  }
500
  },
501
+ "total_flos": 3.0683713883526144e+17,
502
  "train_batch_size": 2,
503
  "trial_name": null,
504
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd8c143acd31b17c8bb07fdf2f75438bc73146d85eae4fba82837890a5b819c7
3
  size 6097
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb3c001872d0d895453223b44ff6bc35437b36d517310a2af89b31048c8561be
3
  size 6097