Looyyd commited on
Commit
1e98f38
·
verified ·
1 Parent(s): 9df61d4

Upload folder using huggingface_hub

Browse files
latest CHANGED
@@ -1 +1 @@
1
- global_step600
 
1
+ global_step1000
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0cf9bfd707812d72f5a2f40c45ba5c2ade438173337e1c29834fd3e3037011ff
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e879af4cfa1deedcfcbfbc79c36f3b79e64cfeb25e69353f59f22fbbed30b4b2
3
  size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b975c10a07b708e742e7cb498fcced0a973fbd8859b789e4dfe3301444028446
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a6bbeaab087a68ed068c163998873e84898552a6dd7657797aa8e18dd6e37c9
3
  size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5ef815689ef6c0bfa2fae21d058410ddac52f6dfa497f61ce0528c0048e28ca
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:940e1d12d2759eb1985ee34c381129f7669b5e7c9efa0fe113b07ae8e3d703f5
3
  size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:feb613ac98d3bc9000f0580b9a04c39b421463de536c85e884bc163960196f35
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63b91ba3198a47e3c90a19a1e47dd4672c7a645b066c6f451d99e859350ba8db
3
  size 1089994880
rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de9d8745000adb7215ca2521877edd5f094afa53a74774ac8b3b63564f5f3057
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c235c10397ca3fb3b82475883c48d3bb786206feaee53c2199c913179faf1fb
3
  size 15429
rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e2026ab04e5d49d3c8b494900054aef11e6e952a7529a1b927d1dccc8ad5fe6
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:937bfac24cd2fe886a72cb180e9d726f8629acaf1e31b2beab1f7a03381ca0ca
3
  size 15429
rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:acf8ba155a6d9b407b1f2ecfea487928997ad3435515ffd02fa7f8b6b705e24e
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee0687693332dd9f28a675c2a9f27590ae650095d80dac61354fce4437e7f9de
3
  size 15429
rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff2f2069bbf3d6817bcbd84d46ca0b94f0c93abcc2a02a30fca9f617c28b5b80
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffb4dab4ba8c60d5f5c48a1048c1ecc4e949aff462fd8340d7ad1a380fc12fdd
3
  size 15429
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aaf33a5037307809cc6622bc68eb6bedd2e4cb1aa2db28d8b3d6793d8a9e807d
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f0808edf6ea4e8e5c26a90c425f750ca5d24bb19176e28af0a072f41d725a49
3
  size 1465
special_tokens_map.json CHANGED
@@ -21,11 +21,5 @@
21
  "rstrip": false,
22
  "single_word": false
23
  },
24
- "pad_token": {
25
- "content": "<|im_end|>",
26
- "lstrip": false,
27
- "normalized": false,
28
- "rstrip": false,
29
- "single_word": false
30
- }
31
  }
 
21
  "rstrip": false,
22
  "single_word": false
23
  },
24
+ "pad_token": "<|im_end|>"
 
 
 
 
 
 
25
  }
tokenizer_config.json CHANGED
@@ -201,7 +201,7 @@
201
  "extra_special_tokens": {},
202
  "model_max_length": 131072,
203
  "pad_token": "<|im_end|>",
204
- "padding_size": "left",
205
  "split_special_tokens": false,
206
  "tokenizer_class": "Qwen2Tokenizer",
207
  "unk_token": null
 
201
  "extra_special_tokens": {},
202
  "model_max_length": 131072,
203
  "pad_token": "<|im_end|>",
204
+ "padding_size": "right",
205
  "split_special_tokens": false,
206
  "tokenizer_class": "Qwen2Tokenizer",
207
  "unk_token": null
trainer_state.json CHANGED
@@ -1,565 +1,930 @@
1
  {
2
- "best_global_step": 600,
3
- "best_metric": 1.239326000213623,
4
- "best_model_checkpoint": "./chess_format_aligned/checkpoint-600",
5
- "epoch": 60.0,
6
  "eval_steps": 75,
7
- "global_step": 600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 1.0,
14
- "grad_norm": 23.66467355431497,
15
  "learning_rate": 0.0018,
16
- "loss": 3.5194,
17
  "num_tokens": 223219.0,
18
  "step": 10
19
  },
20
  {
21
  "epoch": 2.0,
22
- "grad_norm": 18.97579249164185,
23
  "learning_rate": 0.0038,
24
- "loss": 3.2502,
25
  "num_tokens": 445754.0,
26
  "step": 20
27
  },
28
  {
29
  "epoch": 3.0,
30
- "grad_norm": 12.499377192523346,
31
  "learning_rate": 0.0058,
32
- "loss": 2.7908,
33
  "num_tokens": 668444.0,
34
  "step": 30
35
  },
36
  {
37
  "epoch": 4.0,
38
- "grad_norm": 6.822190158164025,
39
  "learning_rate": 0.0078000000000000005,
40
- "loss": 2.3016,
41
  "num_tokens": 890313.0,
42
  "step": 40
43
  },
44
  {
45
  "epoch": 5.0,
46
- "grad_norm": 4.814810084102039,
47
  "learning_rate": 0.0098,
48
- "loss": 1.9667,
49
  "num_tokens": 1113242.0,
50
  "step": 50
51
  },
52
  {
53
  "epoch": 6.0,
54
- "grad_norm": 3.0120886424271407,
55
  "learning_rate": 0.009905263157894736,
56
- "loss": 1.7411,
57
  "num_tokens": 1335423.0,
58
  "step": 60
59
  },
60
  {
61
  "epoch": 7.0,
62
- "grad_norm": 2.2025365118435607,
63
  "learning_rate": 0.0098,
64
- "loss": 1.5805,
65
  "num_tokens": 1555919.0,
66
  "step": 70
67
  },
68
  {
69
  "epoch": 7.5,
70
- "eval_loss": 1.6486115455627441,
71
  "eval_num_tokens": 1666144.0,
72
- "eval_runtime": 0.3228,
73
- "eval_samples_per_second": 83.636,
74
- "eval_steps_per_second": 3.098,
75
  "step": 75
76
  },
77
  {
78
  "epoch": 8.0,
79
- "grad_norm": 1.8389871017060493,
80
  "learning_rate": 0.009694736842105263,
81
- "loss": 1.4891,
82
  "num_tokens": 1777537.0,
83
  "step": 80
84
  },
85
  {
86
  "epoch": 9.0,
87
- "grad_norm": 1.6813718221774028,
88
  "learning_rate": 0.009589473684210526,
89
- "loss": 1.4132,
90
  "num_tokens": 1998166.0,
91
  "step": 90
92
  },
93
  {
94
  "epoch": 10.0,
95
- "grad_norm": 1.553358734659622,
96
  "learning_rate": 0.00948421052631579,
97
- "loss": 1.3635,
98
  "num_tokens": 2219390.0,
99
  "step": 100
100
  },
101
  {
102
  "epoch": 11.0,
103
- "grad_norm": 1.492953948601938,
104
  "learning_rate": 0.009378947368421053,
105
- "loss": 1.3262,
106
  "num_tokens": 2443706.0,
107
  "step": 110
108
  },
109
  {
110
  "epoch": 12.0,
111
- "grad_norm": 1.4241915097241344,
112
  "learning_rate": 0.009273684210526316,
113
- "loss": 1.2886,
114
  "num_tokens": 2666784.0,
115
  "step": 120
116
  },
117
  {
118
  "epoch": 13.0,
119
- "grad_norm": 1.5227222988900821,
120
  "learning_rate": 0.00916842105263158,
121
- "loss": 1.2587,
122
  "num_tokens": 2888489.0,
123
  "step": 130
124
  },
125
  {
126
  "epoch": 14.0,
127
- "grad_norm": 1.4868248858393083,
128
  "learning_rate": 0.009063157894736842,
129
- "loss": 1.2396,
130
  "num_tokens": 3110642.0,
131
  "step": 140
132
  },
133
  {
134
  "epoch": 15.0,
135
- "grad_norm": 1.7084794653006408,
136
  "learning_rate": 0.008957894736842106,
137
- "loss": 1.2227,
138
  "num_tokens": 3334174.0,
139
  "step": 150
140
  },
141
  {
142
  "epoch": 15.0,
143
- "eval_loss": 1.3719849586486816,
144
  "eval_num_tokens": 3334174.0,
145
- "eval_runtime": 0.3187,
146
- "eval_samples_per_second": 84.722,
147
- "eval_steps_per_second": 3.138,
148
  "step": 150
149
  },
150
  {
151
  "epoch": 16.0,
152
- "grad_norm": 1.5476765935969736,
153
  "learning_rate": 0.008852631578947369,
154
- "loss": 1.2045,
155
  "num_tokens": 3556307.0,
156
  "step": 160
157
  },
158
  {
159
  "epoch": 17.0,
160
- "grad_norm": 1.5643988157955648,
161
  "learning_rate": 0.008747368421052632,
162
- "loss": 1.1964,
163
  "num_tokens": 3777446.0,
164
  "step": 170
165
  },
166
  {
167
  "epoch": 18.0,
168
- "grad_norm": 1.6887729685493036,
169
  "learning_rate": 0.008642105263157894,
170
- "loss": 1.185,
171
  "num_tokens": 3999051.0,
172
  "step": 180
173
  },
174
  {
175
  "epoch": 19.0,
176
- "grad_norm": 1.5181448057389182,
177
  "learning_rate": 0.008536842105263159,
178
- "loss": 1.1795,
179
  "num_tokens": 4221841.0,
180
  "step": 190
181
  },
182
  {
183
  "epoch": 20.0,
184
- "grad_norm": 1.8257922196416516,
185
  "learning_rate": 0.008431578947368422,
186
- "loss": 1.1638,
187
  "num_tokens": 4443930.0,
188
  "step": 200
189
  },
190
  {
191
  "epoch": 21.0,
192
- "grad_norm": 1.776889058864946,
193
  "learning_rate": 0.008326315789473683,
194
- "loss": 1.165,
195
  "num_tokens": 4665914.0,
196
  "step": 210
197
  },
198
  {
199
  "epoch": 22.0,
200
- "grad_norm": 1.728259368892959,
201
  "learning_rate": 0.008221052631578948,
202
- "loss": 1.1465,
203
  "num_tokens": 4887092.0,
204
  "step": 220
205
  },
206
  {
207
  "epoch": 22.5,
208
- "eval_loss": 1.312962293624878,
209
  "eval_num_tokens": 4995523.0,
210
- "eval_runtime": 0.3193,
211
- "eval_samples_per_second": 84.567,
212
- "eval_steps_per_second": 3.132,
213
  "step": 225
214
  },
215
  {
216
  "epoch": 23.0,
217
- "grad_norm": 1.7248452098513343,
218
  "learning_rate": 0.008115789473684212,
219
- "loss": 1.1489,
220
  "num_tokens": 5109590.0,
221
  "step": 230
222
  },
223
  {
224
  "epoch": 24.0,
225
- "grad_norm": 1.7410335554485061,
226
  "learning_rate": 0.008010526315789473,
227
- "loss": 1.1396,
228
  "num_tokens": 5331354.0,
229
  "step": 240
230
  },
231
  {
232
  "epoch": 25.0,
233
- "grad_norm": 2.1297185700103336,
234
  "learning_rate": 0.007905263157894736,
235
- "loss": 1.1435,
236
  "num_tokens": 5555613.0,
237
  "step": 250
238
  },
239
  {
240
  "epoch": 26.0,
241
- "grad_norm": 1.9138585710908027,
242
  "learning_rate": 0.0078000000000000005,
243
- "loss": 1.1313,
244
  "num_tokens": 5777356.0,
245
  "step": 260
246
  },
247
  {
248
  "epoch": 27.0,
249
- "grad_norm": 1.9212944582494524,
250
  "learning_rate": 0.007694736842105263,
251
- "loss": 1.1313,
252
  "num_tokens": 6001492.0,
253
  "step": 270
254
  },
255
  {
256
  "epoch": 28.0,
257
- "grad_norm": 1.5807029569566686,
258
  "learning_rate": 0.007589473684210526,
259
- "loss": 1.1158,
260
  "num_tokens": 6223232.0,
261
  "step": 280
262
  },
263
  {
264
  "epoch": 29.0,
265
- "grad_norm": 1.5146177252013164,
266
  "learning_rate": 0.00748421052631579,
267
- "loss": 1.1208,
268
  "num_tokens": 6447091.0,
269
  "step": 290
270
  },
271
  {
272
  "epoch": 30.0,
273
- "grad_norm": 1.3929929601301219,
274
  "learning_rate": 0.007378947368421053,
275
- "loss": 1.1057,
276
  "num_tokens": 6668779.0,
277
  "step": 300
278
  },
279
  {
280
  "epoch": 30.0,
281
- "eval_loss": 1.2842637300491333,
282
  "eval_num_tokens": 6668779.0,
283
- "eval_runtime": 0.3216,
284
- "eval_samples_per_second": 83.962,
285
- "eval_steps_per_second": 3.11,
286
  "step": 300
287
  },
288
  {
289
  "epoch": 31.0,
290
- "grad_norm": 1.633018099421827,
291
  "learning_rate": 0.007273684210526316,
292
- "loss": 1.1048,
293
  "num_tokens": 6889939.0,
294
  "step": 310
295
  },
296
  {
297
  "epoch": 32.0,
298
- "grad_norm": 1.6493326218561297,
299
  "learning_rate": 0.00716842105263158,
300
- "loss": 1.1072,
301
  "num_tokens": 7113839.0,
302
  "step": 320
303
  },
304
  {
305
  "epoch": 33.0,
306
- "grad_norm": 1.7909614789226167,
307
  "learning_rate": 0.007063157894736842,
308
- "loss": 1.0959,
309
  "num_tokens": 7335019.0,
310
  "step": 330
311
  },
312
  {
313
  "epoch": 34.0,
314
- "grad_norm": 1.5940239518396466,
315
  "learning_rate": 0.006957894736842106,
316
- "loss": 1.0992,
317
  "num_tokens": 7558222.0,
318
  "step": 340
319
  },
320
  {
321
  "epoch": 35.0,
322
- "grad_norm": 1.5769271621614351,
323
  "learning_rate": 0.006852631578947368,
324
- "loss": 1.0877,
325
  "num_tokens": 7780865.0,
326
  "step": 350
327
  },
328
  {
329
  "epoch": 36.0,
330
- "grad_norm": 1.678971550670819,
331
  "learning_rate": 0.006747368421052632,
332
- "loss": 1.0895,
333
  "num_tokens": 8003119.0,
334
  "step": 360
335
  },
336
  {
337
  "epoch": 37.0,
338
- "grad_norm": 1.6441297827560035,
339
  "learning_rate": 0.0066421052631578945,
340
- "loss": 1.0916,
341
  "num_tokens": 8227584.0,
342
  "step": 370
343
  },
344
  {
345
  "epoch": 37.5,
346
- "eval_loss": 1.2667254209518433,
347
  "eval_num_tokens": 8343208.0,
348
- "eval_runtime": 0.317,
349
- "eval_samples_per_second": 85.162,
350
- "eval_steps_per_second": 3.154,
351
  "step": 375
352
  },
353
  {
354
  "epoch": 38.0,
355
- "grad_norm": 1.562161162484757,
356
  "learning_rate": 0.006536842105263158,
357
- "loss": 1.0788,
358
  "num_tokens": 8449768.0,
359
  "step": 380
360
  },
361
  {
362
  "epoch": 39.0,
363
- "grad_norm": 1.5481332327502733,
364
  "learning_rate": 0.006431578947368422,
365
- "loss": 1.0735,
366
  "num_tokens": 8670822.0,
367
  "step": 390
368
  },
369
  {
370
  "epoch": 40.0,
371
- "grad_norm": 1.6710119570197965,
372
  "learning_rate": 0.006326315789473684,
373
- "loss": 1.0691,
374
  "num_tokens": 8892114.0,
375
  "step": 400
376
  },
377
  {
378
  "epoch": 41.0,
379
- "grad_norm": 1.6213897605104313,
380
  "learning_rate": 0.0062210526315789475,
381
- "loss": 1.0743,
382
  "num_tokens": 9115676.0,
383
  "step": 410
384
  },
385
  {
386
  "epoch": 42.0,
387
- "grad_norm": 1.584290915233654,
388
  "learning_rate": 0.00611578947368421,
389
- "loss": 1.0642,
390
  "num_tokens": 9336910.0,
391
  "step": 420
392
  },
393
  {
394
  "epoch": 43.0,
395
- "grad_norm": 1.638009204812852,
396
  "learning_rate": 0.006010526315789474,
397
- "loss": 1.0584,
398
  "num_tokens": 9557990.0,
399
  "step": 430
400
  },
401
  {
402
  "epoch": 44.0,
403
- "grad_norm": 1.5462442741556879,
404
  "learning_rate": 0.005905263157894737,
405
- "loss": 1.0636,
406
  "num_tokens": 9780068.0,
407
  "step": 440
408
  },
409
  {
410
  "epoch": 45.0,
411
- "grad_norm": 2.0666288780886912,
412
  "learning_rate": 0.0058,
413
- "loss": 1.059,
414
  "num_tokens": 10001728.0,
415
  "step": 450
416
  },
417
  {
418
  "epoch": 45.0,
419
- "eval_loss": 1.2540942430496216,
420
  "eval_num_tokens": 10001728.0,
421
- "eval_runtime": 0.3199,
422
- "eval_samples_per_second": 84.39,
423
- "eval_steps_per_second": 3.126,
424
  "step": 450
425
  },
426
  {
427
  "epoch": 46.0,
428
- "grad_norm": 2.0435408178858805,
429
  "learning_rate": 0.005694736842105264,
430
- "loss": 1.0511,
431
  "num_tokens": 10223307.0,
432
  "step": 460
433
  },
434
  {
435
  "epoch": 47.0,
436
- "grad_norm": 1.6699802100100627,
437
  "learning_rate": 0.005589473684210526,
438
- "loss": 1.0636,
439
  "num_tokens": 10447321.0,
440
  "step": 470
441
  },
442
  {
443
  "epoch": 48.0,
444
- "grad_norm": 1.8435894012332885,
445
  "learning_rate": 0.005484210526315789,
446
- "loss": 1.0479,
447
  "num_tokens": 10670049.0,
448
  "step": 480
449
  },
450
  {
451
  "epoch": 49.0,
452
- "grad_norm": 1.7083885546579185,
453
  "learning_rate": 0.0053789473684210535,
454
- "loss": 1.0456,
455
  "num_tokens": 10891067.0,
456
  "step": 490
457
  },
458
  {
459
  "epoch": 50.0,
460
- "grad_norm": 1.6697717392243483,
461
  "learning_rate": 0.005273684210526316,
462
- "loss": 1.0446,
463
  "num_tokens": 11111551.0,
464
  "step": 500
465
  },
466
  {
467
  "epoch": 51.0,
468
- "grad_norm": 1.6862974388818066,
469
  "learning_rate": 0.005168421052631579,
470
- "loss": 1.045,
471
  "num_tokens": 11332291.0,
472
  "step": 510
473
  },
474
  {
475
  "epoch": 52.0,
476
- "grad_norm": 1.6031127839314492,
477
  "learning_rate": 0.0050631578947368415,
478
- "loss": 1.0396,
479
  "num_tokens": 11553533.0,
480
  "step": 520
481
  },
482
  {
483
  "epoch": 52.5,
484
- "eval_loss": 1.2458847761154175,
485
  "eval_num_tokens": 11665840.0,
486
- "eval_runtime": 0.3246,
487
- "eval_samples_per_second": 83.174,
488
- "eval_steps_per_second": 3.081,
489
  "step": 525
490
  },
491
  {
492
  "epoch": 53.0,
493
- "grad_norm": 1.6444188469318939,
494
  "learning_rate": 0.004957894736842105,
495
- "loss": 1.0384,
496
  "num_tokens": 11776548.0,
497
  "step": 530
498
  },
499
  {
500
  "epoch": 54.0,
501
- "grad_norm": 1.904756015833275,
502
  "learning_rate": 0.004852631578947369,
503
- "loss": 1.0331,
504
  "num_tokens": 11997805.0,
505
  "step": 540
506
  },
507
  {
508
  "epoch": 55.0,
509
- "grad_norm": 1.670301671264176,
510
  "learning_rate": 0.004747368421052632,
511
- "loss": 1.0286,
512
  "num_tokens": 12219453.0,
513
  "step": 550
514
  },
515
  {
516
  "epoch": 56.0,
517
- "grad_norm": 1.7460051405724548,
518
  "learning_rate": 0.0046421052631578945,
519
- "loss": 1.0371,
520
  "num_tokens": 12442555.0,
521
  "step": 560
522
  },
523
  {
524
  "epoch": 57.0,
525
- "grad_norm": 1.6995407527097683,
526
  "learning_rate": 0.004536842105263158,
527
- "loss": 1.0284,
528
  "num_tokens": 12664583.0,
529
  "step": 570
530
  },
531
  {
532
  "epoch": 58.0,
533
- "grad_norm": 1.5330059340993094,
534
  "learning_rate": 0.004431578947368421,
535
- "loss": 1.036,
536
  "num_tokens": 12887745.0,
537
  "step": 580
538
  },
539
  {
540
  "epoch": 59.0,
541
- "grad_norm": 1.6962640828440887,
542
  "learning_rate": 0.004326315789473684,
543
- "loss": 1.0303,
544
  "num_tokens": 13112833.0,
545
  "step": 590
546
  },
547
  {
548
  "epoch": 60.0,
549
- "grad_norm": 1.751524405194981,
550
  "learning_rate": 0.0042210526315789474,
551
- "loss": 1.0245,
552
  "num_tokens": 13333925.0,
553
  "step": 600
554
  },
555
  {
556
  "epoch": 60.0,
557
- "eval_loss": 1.239326000213623,
558
  "eval_num_tokens": 13333925.0,
559
- "eval_runtime": 0.3217,
560
- "eval_samples_per_second": 83.927,
561
- "eval_steps_per_second": 3.108,
562
  "step": 600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
  }
564
  ],
565
  "logging_steps": 10,
@@ -574,12 +939,12 @@
574
  "should_evaluate": false,
575
  "should_log": false,
576
  "should_save": true,
577
- "should_training_stop": false
578
  },
579
  "attributes": {}
580
  }
581
  },
582
- "total_flos": 43112136835072.0,
583
  "train_batch_size": 8,
584
  "trial_name": null,
585
  "trial_params": null
 
1
  {
2
+ "best_global_step": 975,
3
+ "best_metric": 1.1944328546524048,
4
+ "best_model_checkpoint": "./chess_format_aligned/checkpoint-900",
5
+ "epoch": 100.0,
6
  "eval_steps": 75,
7
+ "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 1.0,
14
+ "grad_norm": 17.83828250762641,
15
  "learning_rate": 0.0018,
16
+ "loss": 1.8323,
17
  "num_tokens": 223219.0,
18
  "step": 10
19
  },
20
  {
21
  "epoch": 2.0,
22
+ "grad_norm": 9.01867006533579,
23
  "learning_rate": 0.0038,
24
+ "loss": 1.673,
25
  "num_tokens": 445754.0,
26
  "step": 20
27
  },
28
  {
29
  "epoch": 3.0,
30
+ "grad_norm": 6.94008470876228,
31
  "learning_rate": 0.0058,
32
+ "loss": 1.5482,
33
  "num_tokens": 668444.0,
34
  "step": 30
35
  },
36
  {
37
  "epoch": 4.0,
38
+ "grad_norm": 70.78811181835867,
39
  "learning_rate": 0.0078000000000000005,
40
+ "loss": 1.4254,
41
  "num_tokens": 890313.0,
42
  "step": 40
43
  },
44
  {
45
  "epoch": 5.0,
46
+ "grad_norm": 3.1662635788659146,
47
  "learning_rate": 0.0098,
48
+ "loss": 1.3487,
49
  "num_tokens": 1113242.0,
50
  "step": 50
51
  },
52
  {
53
  "epoch": 6.0,
54
+ "grad_norm": 2.556648918505806,
55
  "learning_rate": 0.009905263157894736,
56
+ "loss": 1.2985,
57
  "num_tokens": 1335423.0,
58
  "step": 60
59
  },
60
  {
61
  "epoch": 7.0,
62
+ "grad_norm": 2.4375270330993013,
63
  "learning_rate": 0.0098,
64
+ "loss": 1.2586,
65
  "num_tokens": 1555919.0,
66
  "step": 70
67
  },
68
  {
69
  "epoch": 7.5,
70
+ "eval_loss": 1.381267786026001,
71
  "eval_num_tokens": 1666144.0,
72
+ "eval_runtime": 0.3185,
73
+ "eval_samples_per_second": 84.763,
74
+ "eval_steps_per_second": 3.139,
75
  "step": 75
76
  },
77
  {
78
  "epoch": 8.0,
79
+ "grad_norm": 2.7644229869799037,
80
  "learning_rate": 0.009694736842105263,
81
+ "loss": 1.2423,
82
  "num_tokens": 1777537.0,
83
  "step": 80
84
  },
85
  {
86
  "epoch": 9.0,
87
+ "grad_norm": 3.1156791872493654,
88
  "learning_rate": 0.009589473684210526,
89
+ "loss": 1.2221,
90
  "num_tokens": 1998166.0,
91
  "step": 90
92
  },
93
  {
94
  "epoch": 10.0,
95
+ "grad_norm": 2.4278915778023373,
96
  "learning_rate": 0.00948421052631579,
97
+ "loss": 1.2127,
98
  "num_tokens": 2219390.0,
99
  "step": 100
100
  },
101
  {
102
  "epoch": 11.0,
103
+ "grad_norm": 2.4188045848334006,
104
  "learning_rate": 0.009378947368421053,
105
+ "loss": 1.2062,
106
  "num_tokens": 2443706.0,
107
  "step": 110
108
  },
109
  {
110
  "epoch": 12.0,
111
+ "grad_norm": 3.2611655554691295,
112
  "learning_rate": 0.009273684210526316,
113
+ "loss": 1.1925,
114
  "num_tokens": 2666784.0,
115
  "step": 120
116
  },
117
  {
118
  "epoch": 13.0,
119
+ "grad_norm": 2.7245785280374597,
120
  "learning_rate": 0.00916842105263158,
121
+ "loss": 1.1798,
122
  "num_tokens": 2888489.0,
123
  "step": 130
124
  },
125
  {
126
  "epoch": 14.0,
127
+ "grad_norm": 2.4379435102787324,
128
  "learning_rate": 0.009063157894736842,
129
+ "loss": 1.1723,
130
  "num_tokens": 3110642.0,
131
  "step": 140
132
  },
133
  {
134
  "epoch": 15.0,
135
+ "grad_norm": 2.9693195241863073,
136
  "learning_rate": 0.008957894736842106,
137
+ "loss": 1.1649,
138
  "num_tokens": 3334174.0,
139
  "step": 150
140
  },
141
  {
142
  "epoch": 15.0,
143
+ "eval_loss": 1.3144068717956543,
144
  "eval_num_tokens": 3334174.0,
145
+ "eval_runtime": 0.3186,
146
+ "eval_samples_per_second": 84.741,
147
+ "eval_steps_per_second": 3.139,
148
  "step": 150
149
  },
150
  {
151
  "epoch": 16.0,
152
+ "grad_norm": 4.741318971535345,
153
  "learning_rate": 0.008852631578947369,
154
+ "loss": 1.153,
155
  "num_tokens": 3556307.0,
156
  "step": 160
157
  },
158
  {
159
  "epoch": 17.0,
160
+ "grad_norm": 3.3424312070631905,
161
  "learning_rate": 0.008747368421052632,
162
+ "loss": 1.1505,
163
  "num_tokens": 3777446.0,
164
  "step": 170
165
  },
166
  {
167
  "epoch": 18.0,
168
+ "grad_norm": 3.9043733611223996,
169
  "learning_rate": 0.008642105263157894,
170
+ "loss": 1.1432,
171
  "num_tokens": 3999051.0,
172
  "step": 180
173
  },
174
  {
175
  "epoch": 19.0,
176
+ "grad_norm": 2.508990648808123,
177
  "learning_rate": 0.008536842105263159,
178
+ "loss": 1.1415,
179
  "num_tokens": 4221841.0,
180
  "step": 190
181
  },
182
  {
183
  "epoch": 20.0,
184
+ "grad_norm": 2.7614256983207572,
185
  "learning_rate": 0.008431578947368422,
186
+ "loss": 1.1295,
187
  "num_tokens": 4443930.0,
188
  "step": 200
189
  },
190
  {
191
  "epoch": 21.0,
192
+ "grad_norm": 2.820857768625703,
193
  "learning_rate": 0.008326315789473683,
194
+ "loss": 1.1316,
195
  "num_tokens": 4665914.0,
196
  "step": 210
197
  },
198
  {
199
  "epoch": 22.0,
200
+ "grad_norm": 3.176834461080823,
201
  "learning_rate": 0.008221052631578948,
202
+ "loss": 1.1162,
203
  "num_tokens": 4887092.0,
204
  "step": 220
205
  },
206
  {
207
  "epoch": 22.5,
208
+ "eval_loss": 1.2824864387512207,
209
  "eval_num_tokens": 4995523.0,
210
+ "eval_runtime": 0.3183,
211
+ "eval_samples_per_second": 84.823,
212
+ "eval_steps_per_second": 3.142,
213
  "step": 225
214
  },
215
  {
216
  "epoch": 23.0,
217
+ "grad_norm": 2.989691496498352,
218
  "learning_rate": 0.008115789473684212,
219
+ "loss": 1.1164,
220
  "num_tokens": 5109590.0,
221
  "step": 230
222
  },
223
  {
224
  "epoch": 24.0,
225
+ "grad_norm": 2.959771702308034,
226
  "learning_rate": 0.008010526315789473,
227
+ "loss": 1.1082,
228
  "num_tokens": 5331354.0,
229
  "step": 240
230
  },
231
  {
232
  "epoch": 25.0,
233
+ "grad_norm": 2.9095338362857075,
234
  "learning_rate": 0.007905263157894736,
235
+ "loss": 1.1124,
236
  "num_tokens": 5555613.0,
237
  "step": 250
238
  },
239
  {
240
  "epoch": 26.0,
241
+ "grad_norm": 3.340734142845819,
242
  "learning_rate": 0.0078000000000000005,
243
+ "loss": 1.0985,
244
  "num_tokens": 5777356.0,
245
  "step": 260
246
  },
247
  {
248
  "epoch": 27.0,
249
+ "grad_norm": 2.78338780839128,
250
  "learning_rate": 0.007694736842105263,
251
+ "loss": 1.0985,
252
  "num_tokens": 6001492.0,
253
  "step": 270
254
  },
255
  {
256
  "epoch": 28.0,
257
+ "grad_norm": 3.8541351179946823,
258
  "learning_rate": 0.007589473684210526,
259
+ "loss": 1.0844,
260
  "num_tokens": 6223232.0,
261
  "step": 280
262
  },
263
  {
264
  "epoch": 29.0,
265
+ "grad_norm": 2.757976381041582,
266
  "learning_rate": 0.00748421052631579,
267
+ "loss": 1.0873,
268
  "num_tokens": 6447091.0,
269
  "step": 290
270
  },
271
  {
272
  "epoch": 30.0,
273
+ "grad_norm": 3.292959575183194,
274
  "learning_rate": 0.007378947368421053,
275
+ "loss": 1.074,
276
  "num_tokens": 6668779.0,
277
  "step": 300
278
  },
279
  {
280
  "epoch": 30.0,
281
+ "eval_loss": 1.257373571395874,
282
  "eval_num_tokens": 6668779.0,
283
+ "eval_runtime": 0.3211,
284
+ "eval_samples_per_second": 84.075,
285
+ "eval_steps_per_second": 3.114,
286
  "step": 300
287
  },
288
  {
289
  "epoch": 31.0,
290
+ "grad_norm": 3.618991428419505,
291
  "learning_rate": 0.007273684210526316,
292
+ "loss": 1.0714,
293
  "num_tokens": 6889939.0,
294
  "step": 310
295
  },
296
  {
297
  "epoch": 32.0,
298
+ "grad_norm": 2.834779698695166,
299
  "learning_rate": 0.00716842105263158,
300
+ "loss": 1.0733,
301
  "num_tokens": 7113839.0,
302
  "step": 320
303
  },
304
  {
305
  "epoch": 33.0,
306
+ "grad_norm": 3.069229341212673,
307
  "learning_rate": 0.007063157894736842,
308
+ "loss": 1.0607,
309
  "num_tokens": 7335019.0,
310
  "step": 330
311
  },
312
  {
313
  "epoch": 34.0,
314
+ "grad_norm": 3.435945397864499,
315
  "learning_rate": 0.006957894736842106,
316
+ "loss": 1.0623,
317
  "num_tokens": 7558222.0,
318
  "step": 340
319
  },
320
  {
321
  "epoch": 35.0,
322
+ "grad_norm": 3.9823610333754242,
323
  "learning_rate": 0.006852631578947368,
324
+ "loss": 1.0501,
325
  "num_tokens": 7780865.0,
326
  "step": 350
327
  },
328
  {
329
  "epoch": 36.0,
330
+ "grad_norm": 3.0337688363785995,
331
  "learning_rate": 0.006747368421052632,
332
+ "loss": 1.0507,
333
  "num_tokens": 8003119.0,
334
  "step": 360
335
  },
336
  {
337
  "epoch": 37.0,
338
+ "grad_norm": 3.0380173399070407,
339
  "learning_rate": 0.0066421052631578945,
340
+ "loss": 1.0519,
341
  "num_tokens": 8227584.0,
342
  "step": 370
343
  },
344
  {
345
  "epoch": 37.5,
346
+ "eval_loss": 1.2381330728530884,
347
  "eval_num_tokens": 8343208.0,
348
+ "eval_runtime": 0.3201,
349
+ "eval_samples_per_second": 84.347,
350
+ "eval_steps_per_second": 3.124,
351
  "step": 375
352
  },
353
  {
354
  "epoch": 38.0,
355
+ "grad_norm": 3.790903744111934,
356
  "learning_rate": 0.006536842105263158,
357
+ "loss": 1.0392,
358
  "num_tokens": 8449768.0,
359
  "step": 380
360
  },
361
  {
362
  "epoch": 39.0,
363
+ "grad_norm": 4.32554367560641,
364
  "learning_rate": 0.006431578947368422,
365
+ "loss": 1.0331,
366
  "num_tokens": 8670822.0,
367
  "step": 390
368
  },
369
  {
370
  "epoch": 40.0,
371
+ "grad_norm": 4.251822793877274,
372
  "learning_rate": 0.006326315789473684,
373
+ "loss": 1.0271,
374
  "num_tokens": 8892114.0,
375
  "step": 400
376
  },
377
  {
378
  "epoch": 41.0,
379
+ "grad_norm": 4.560423157551954,
380
  "learning_rate": 0.0062210526315789475,
381
+ "loss": 1.0329,
382
  "num_tokens": 9115676.0,
383
  "step": 410
384
  },
385
  {
386
  "epoch": 42.0,
387
+ "grad_norm": 4.265073330832839,
388
  "learning_rate": 0.00611578947368421,
389
+ "loss": 1.0213,
390
  "num_tokens": 9336910.0,
391
  "step": 420
392
  },
393
  {
394
  "epoch": 43.0,
395
+ "grad_norm": 3.23555980855867,
396
  "learning_rate": 0.006010526315789474,
397
+ "loss": 1.0158,
398
  "num_tokens": 9557990.0,
399
  "step": 430
400
  },
401
  {
402
  "epoch": 44.0,
403
+ "grad_norm": 4.755857183286942,
404
  "learning_rate": 0.005905263157894737,
405
+ "loss": 1.0199,
406
  "num_tokens": 9780068.0,
407
  "step": 440
408
  },
409
  {
410
  "epoch": 45.0,
411
+ "grad_norm": 4.566195384062095,
412
  "learning_rate": 0.0058,
413
+ "loss": 1.0146,
414
  "num_tokens": 10001728.0,
415
  "step": 450
416
  },
417
  {
418
  "epoch": 45.0,
419
+ "eval_loss": 1.2229516506195068,
420
  "eval_num_tokens": 10001728.0,
421
+ "eval_runtime": 0.3204,
422
+ "eval_samples_per_second": 84.271,
423
+ "eval_steps_per_second": 3.121,
424
  "step": 450
425
  },
426
  {
427
  "epoch": 46.0,
428
+ "grad_norm": 5.067438161876945,
429
  "learning_rate": 0.005694736842105264,
430
+ "loss": 1.0058,
431
  "num_tokens": 10223307.0,
432
  "step": 460
433
  },
434
  {
435
  "epoch": 47.0,
436
+ "grad_norm": 3.1567080114120674,
437
  "learning_rate": 0.005589473684210526,
438
+ "loss": 1.0161,
439
  "num_tokens": 10447321.0,
440
  "step": 470
441
  },
442
  {
443
  "epoch": 48.0,
444
+ "grad_norm": 3.5448737550653515,
445
  "learning_rate": 0.005484210526315789,
446
+ "loss": 1.0003,
447
  "num_tokens": 10670049.0,
448
  "step": 480
449
  },
450
  {
451
  "epoch": 49.0,
452
+ "grad_norm": 4.0815007389042455,
453
  "learning_rate": 0.0053789473684210535,
454
+ "loss": 0.9985,
455
  "num_tokens": 10891067.0,
456
  "step": 490
457
  },
458
  {
459
  "epoch": 50.0,
460
+ "grad_norm": 3.54375709976258,
461
  "learning_rate": 0.005273684210526316,
462
+ "loss": 0.9957,
463
  "num_tokens": 11111551.0,
464
  "step": 500
465
  },
466
  {
467
  "epoch": 51.0,
468
+ "grad_norm": 4.260407472183029,
469
  "learning_rate": 0.005168421052631579,
470
+ "loss": 0.9968,
471
  "num_tokens": 11332291.0,
472
  "step": 510
473
  },
474
  {
475
  "epoch": 52.0,
476
+ "grad_norm": 4.6691095377196286,
477
  "learning_rate": 0.0050631578947368415,
478
+ "loss": 0.9897,
479
  "num_tokens": 11553533.0,
480
  "step": 520
481
  },
482
  {
483
  "epoch": 52.5,
484
+ "eval_loss": 1.2112921476364136,
485
  "eval_num_tokens": 11665840.0,
486
+ "eval_runtime": 0.3231,
487
+ "eval_samples_per_second": 83.56,
488
+ "eval_steps_per_second": 3.095,
489
  "step": 525
490
  },
491
  {
492
  "epoch": 53.0,
493
+ "grad_norm": 4.956977136862753,
494
  "learning_rate": 0.004957894736842105,
495
+ "loss": 0.9882,
496
  "num_tokens": 11776548.0,
497
  "step": 530
498
  },
499
  {
500
  "epoch": 54.0,
501
+ "grad_norm": 3.432387228754158,
502
  "learning_rate": 0.004852631578947369,
503
+ "loss": 0.9819,
504
  "num_tokens": 11997805.0,
505
  "step": 540
506
  },
507
  {
508
  "epoch": 55.0,
509
+ "grad_norm": 3.6460618813546453,
510
  "learning_rate": 0.004747368421052632,
511
+ "loss": 0.9781,
512
  "num_tokens": 12219453.0,
513
  "step": 550
514
  },
515
  {
516
  "epoch": 56.0,
517
+ "grad_norm": 3.8194375303151307,
518
  "learning_rate": 0.0046421052631578945,
519
+ "loss": 0.9863,
520
  "num_tokens": 12442555.0,
521
  "step": 560
522
  },
523
  {
524
  "epoch": 57.0,
525
+ "grad_norm": 5.39295222734748,
526
  "learning_rate": 0.004536842105263158,
527
+ "loss": 0.9751,
528
  "num_tokens": 12664583.0,
529
  "step": 570
530
  },
531
  {
532
  "epoch": 58.0,
533
+ "grad_norm": 4.147654144194828,
534
  "learning_rate": 0.004431578947368421,
535
+ "loss": 0.9835,
536
  "num_tokens": 12887745.0,
537
  "step": 580
538
  },
539
  {
540
  "epoch": 59.0,
541
+ "grad_norm": 3.644549538967929,
542
  "learning_rate": 0.004326315789473684,
543
+ "loss": 0.9779,
544
  "num_tokens": 13112833.0,
545
  "step": 590
546
  },
547
  {
548
  "epoch": 60.0,
549
+ "grad_norm": 5.504838132470386,
550
  "learning_rate": 0.0042210526315789474,
551
+ "loss": 0.9717,
552
  "num_tokens": 13333925.0,
553
  "step": 600
554
  },
555
  {
556
  "epoch": 60.0,
557
+ "eval_loss": 1.2046301364898682,
558
  "eval_num_tokens": 13333925.0,
559
+ "eval_runtime": 0.3213,
560
+ "eval_samples_per_second": 84.046,
561
+ "eval_steps_per_second": 3.113,
562
  "step": 600
563
+ },
564
+ {
565
+ "epoch": 61.0,
566
+ "grad_norm": 4.293620012289861,
567
+ "learning_rate": 0.004115789473684211,
568
+ "loss": 0.9714,
569
+ "num_tokens": 13556685.0,
570
+ "step": 610
571
+ },
572
+ {
573
+ "epoch": 62.0,
574
+ "grad_norm": 3.506756001951183,
575
+ "learning_rate": 0.004010526315789474,
576
+ "loss": 0.9723,
577
+ "num_tokens": 13778897.0,
578
+ "step": 620
579
+ },
580
+ {
581
+ "epoch": 63.0,
582
+ "grad_norm": 4.378783254083467,
583
+ "learning_rate": 0.0039052631578947367,
584
+ "loss": 0.967,
585
+ "num_tokens": 14001358.0,
586
+ "step": 630
587
+ },
588
+ {
589
+ "epoch": 64.0,
590
+ "grad_norm": 3.305892841637407,
591
+ "learning_rate": 0.0038,
592
+ "loss": 0.9622,
593
+ "num_tokens": 14222983.0,
594
+ "step": 640
595
+ },
596
+ {
597
+ "epoch": 65.0,
598
+ "grad_norm": 3.8912634544994047,
599
+ "learning_rate": 0.0036947368421052637,
600
+ "loss": 0.9679,
601
+ "num_tokens": 14445938.0,
602
+ "step": 650
603
+ },
604
+ {
605
+ "epoch": 66.0,
606
+ "grad_norm": 4.409907613658091,
607
+ "learning_rate": 0.0035894736842105265,
608
+ "loss": 0.968,
609
+ "num_tokens": 14668829.0,
610
+ "step": 660
611
+ },
612
+ {
613
+ "epoch": 67.0,
614
+ "grad_norm": 3.3890926337654688,
615
+ "learning_rate": 0.0034842105263157897,
616
+ "loss": 0.9633,
617
+ "num_tokens": 14890816.0,
618
+ "step": 670
619
+ },
620
+ {
621
+ "epoch": 67.5,
622
+ "eval_loss": 1.2010632753372192,
623
+ "eval_num_tokens": 15003360.0,
624
+ "eval_runtime": 0.3201,
625
+ "eval_samples_per_second": 84.356,
626
+ "eval_steps_per_second": 3.124,
627
+ "step": 675
628
+ },
629
+ {
630
+ "epoch": 68.0,
631
+ "grad_norm": 3.934065351717797,
632
+ "learning_rate": 0.0033789473684210525,
633
+ "loss": 0.9555,
634
+ "num_tokens": 15111361.0,
635
+ "step": 680
636
+ },
637
+ {
638
+ "epoch": 69.0,
639
+ "grad_norm": 3.5775176080596656,
640
+ "learning_rate": 0.003273684210526316,
641
+ "loss": 0.9591,
642
+ "num_tokens": 15332355.0,
643
+ "step": 690
644
+ },
645
+ {
646
+ "epoch": 70.0,
647
+ "grad_norm": 3.3110436862073906,
648
+ "learning_rate": 0.0031684210526315786,
649
+ "loss": 0.9549,
650
+ "num_tokens": 15554378.0,
651
+ "step": 700
652
+ },
653
+ {
654
+ "epoch": 71.0,
655
+ "grad_norm": 3.880549759257821,
656
+ "learning_rate": 0.0030631578947368423,
657
+ "loss": 0.9472,
658
+ "num_tokens": 15774738.0,
659
+ "step": 710
660
+ },
661
+ {
662
+ "epoch": 72.0,
663
+ "grad_norm": 3.731395551097834,
664
+ "learning_rate": 0.0029578947368421055,
665
+ "loss": 0.9507,
666
+ "num_tokens": 15995583.0,
667
+ "step": 720
668
+ },
669
+ {
670
+ "epoch": 73.0,
671
+ "grad_norm": 3.430579284839533,
672
+ "learning_rate": 0.0028526315789473683,
673
+ "loss": 0.9556,
674
+ "num_tokens": 16219509.0,
675
+ "step": 730
676
+ },
677
+ {
678
+ "epoch": 74.0,
679
+ "grad_norm": 4.143480775832373,
680
+ "learning_rate": 0.0027473684210526316,
681
+ "loss": 0.9579,
682
+ "num_tokens": 16443780.0,
683
+ "step": 740
684
+ },
685
+ {
686
+ "epoch": 75.0,
687
+ "grad_norm": 3.6743106053855814,
688
+ "learning_rate": 0.0026421052631578944,
689
+ "loss": 0.9452,
690
+ "num_tokens": 16665238.0,
691
+ "step": 750
692
+ },
693
+ {
694
+ "epoch": 75.0,
695
+ "eval_loss": 1.1967283487319946,
696
+ "eval_num_tokens": 16665238.0,
697
+ "eval_runtime": 0.3185,
698
+ "eval_samples_per_second": 84.784,
699
+ "eval_steps_per_second": 3.14,
700
+ "step": 750
701
+ },
702
+ {
703
+ "epoch": 76.0,
704
+ "grad_norm": 4.197306578134703,
705
+ "learning_rate": 0.002536842105263158,
706
+ "loss": 0.9493,
707
+ "num_tokens": 16887354.0,
708
+ "step": 760
709
+ },
710
+ {
711
+ "epoch": 77.0,
712
+ "grad_norm": 3.9942870331698868,
713
+ "learning_rate": 0.002431578947368421,
714
+ "loss": 0.9472,
715
+ "num_tokens": 17109856.0,
716
+ "step": 770
717
+ },
718
+ {
719
+ "epoch": 78.0,
720
+ "grad_norm": 4.209686621558697,
721
+ "learning_rate": 0.002326315789473684,
722
+ "loss": 0.9493,
723
+ "num_tokens": 17332931.0,
724
+ "step": 780
725
+ },
726
+ {
727
+ "epoch": 79.0,
728
+ "grad_norm": 5.511737064443916,
729
+ "learning_rate": 0.0022210526315789474,
730
+ "loss": 0.9463,
731
+ "num_tokens": 17553676.0,
732
+ "step": 790
733
+ },
734
+ {
735
+ "epoch": 80.0,
736
+ "grad_norm": 4.390465832236099,
737
+ "learning_rate": 0.0021157894736842106,
738
+ "loss": 0.9425,
739
+ "num_tokens": 17776111.0,
740
+ "step": 800
741
+ },
742
+ {
743
+ "epoch": 81.0,
744
+ "grad_norm": 5.005842824438807,
745
+ "learning_rate": 0.002010526315789474,
746
+ "loss": 0.9448,
747
+ "num_tokens": 17997428.0,
748
+ "step": 810
749
+ },
750
+ {
751
+ "epoch": 82.0,
752
+ "grad_norm": 4.881807931973373,
753
+ "learning_rate": 0.001905263157894737,
754
+ "loss": 0.9422,
755
+ "num_tokens": 18219176.0,
756
+ "step": 820
757
+ },
758
+ {
759
+ "epoch": 82.5,
760
+ "eval_loss": 1.196116328239441,
761
+ "eval_num_tokens": 18330757.0,
762
+ "eval_runtime": 0.3188,
763
+ "eval_samples_per_second": 84.702,
764
+ "eval_steps_per_second": 3.137,
765
+ "step": 825
766
+ },
767
+ {
768
+ "epoch": 83.0,
769
+ "grad_norm": 4.1644693854831365,
770
+ "learning_rate": 0.0018,
771
+ "loss": 0.9387,
772
+ "num_tokens": 18440948.0,
773
+ "step": 830
774
+ },
775
+ {
776
+ "epoch": 84.0,
777
+ "grad_norm": 3.403286465588362,
778
+ "learning_rate": 0.001694736842105263,
779
+ "loss": 0.9404,
780
+ "num_tokens": 18662723.0,
781
+ "step": 840
782
+ },
783
+ {
784
+ "epoch": 85.0,
785
+ "grad_norm": 3.9479844988657042,
786
+ "learning_rate": 0.0015894736842105264,
787
+ "loss": 0.9402,
788
+ "num_tokens": 18884234.0,
789
+ "step": 850
790
+ },
791
+ {
792
+ "epoch": 86.0,
793
+ "grad_norm": 3.293992963000251,
794
+ "learning_rate": 0.0014842105263157895,
795
+ "loss": 0.936,
796
+ "num_tokens": 19106137.0,
797
+ "step": 860
798
+ },
799
+ {
800
+ "epoch": 87.0,
801
+ "grad_norm": 3.0634247251244884,
802
+ "learning_rate": 0.0013789473684210527,
803
+ "loss": 0.9371,
804
+ "num_tokens": 19327897.0,
805
+ "step": 870
806
+ },
807
+ {
808
+ "epoch": 88.0,
809
+ "grad_norm": 3.4274407097867443,
810
+ "learning_rate": 0.0012736842105263158,
811
+ "loss": 0.9344,
812
+ "num_tokens": 19549151.0,
813
+ "step": 880
814
+ },
815
+ {
816
+ "epoch": 89.0,
817
+ "grad_norm": 3.4140297161368003,
818
+ "learning_rate": 0.001168421052631579,
819
+ "loss": 0.9348,
820
+ "num_tokens": 19770623.0,
821
+ "step": 890
822
+ },
823
+ {
824
+ "epoch": 90.0,
825
+ "grad_norm": 3.1168838509225285,
826
+ "learning_rate": 0.001063157894736842,
827
+ "loss": 0.936,
828
+ "num_tokens": 19992317.0,
829
+ "step": 900
830
+ },
831
+ {
832
+ "epoch": 90.0,
833
+ "eval_loss": 1.1946989297866821,
834
+ "eval_num_tokens": 19992317.0,
835
+ "eval_runtime": 0.3217,
836
+ "eval_samples_per_second": 83.935,
837
+ "eval_steps_per_second": 3.109,
838
+ "step": 900
839
+ },
840
+ {
841
+ "epoch": 91.0,
842
+ "grad_norm": 3.079611135216578,
843
+ "learning_rate": 0.0009578947368421053,
844
+ "loss": 0.9372,
845
+ "num_tokens": 20214731.0,
846
+ "step": 910
847
+ },
848
+ {
849
+ "epoch": 92.0,
850
+ "grad_norm": 3.792032956614427,
851
+ "learning_rate": 0.0008526315789473685,
852
+ "loss": 0.932,
853
+ "num_tokens": 20436683.0,
854
+ "step": 920
855
+ },
856
+ {
857
+ "epoch": 93.0,
858
+ "grad_norm": 3.6470806092555166,
859
+ "learning_rate": 0.0007473684210526316,
860
+ "loss": 0.9344,
861
+ "num_tokens": 20659388.0,
862
+ "step": 930
863
+ },
864
+ {
865
+ "epoch": 94.0,
866
+ "grad_norm": 3.370001392101702,
867
+ "learning_rate": 0.0006421052631578947,
868
+ "loss": 0.9348,
869
+ "num_tokens": 20881755.0,
870
+ "step": 940
871
+ },
872
+ {
873
+ "epoch": 95.0,
874
+ "grad_norm": 3.326107539027494,
875
+ "learning_rate": 0.0005368421052631579,
876
+ "loss": 0.934,
877
+ "num_tokens": 21103662.0,
878
+ "step": 950
879
+ },
880
+ {
881
+ "epoch": 96.0,
882
+ "grad_norm": 3.1738358139915475,
883
+ "learning_rate": 0.000431578947368421,
884
+ "loss": 0.9327,
885
+ "num_tokens": 21325035.0,
886
+ "step": 960
887
+ },
888
+ {
889
+ "epoch": 97.0,
890
+ "grad_norm": 3.9648314762416943,
891
+ "learning_rate": 0.0003263157894736842,
892
+ "loss": 0.9287,
893
+ "num_tokens": 21545180.0,
894
+ "step": 970
895
+ },
896
+ {
897
+ "epoch": 97.5,
898
+ "eval_loss": 1.1944328546524048,
899
+ "eval_num_tokens": 21651877.0,
900
+ "eval_runtime": 0.3188,
901
+ "eval_samples_per_second": 84.685,
902
+ "eval_steps_per_second": 3.136,
903
+ "step": 975
904
+ },
905
+ {
906
+ "epoch": 98.0,
907
+ "grad_norm": 3.313089750576275,
908
+ "learning_rate": 0.00022105263157894735,
909
+ "loss": 0.9323,
910
+ "num_tokens": 21767119.0,
911
+ "step": 980
912
+ },
913
+ {
914
+ "epoch": 99.0,
915
+ "grad_norm": 3.179576833461792,
916
+ "learning_rate": 0.00011578947368421053,
917
+ "loss": 0.9301,
918
+ "num_tokens": 21989272.0,
919
+ "step": 990
920
+ },
921
+ {
922
+ "epoch": 100.0,
923
+ "grad_norm": 3.1836571339116047,
924
+ "learning_rate": 1.0526315789473684e-05,
925
+ "loss": 0.9298,
926
+ "num_tokens": 22211321.0,
927
+ "step": 1000
928
  }
929
  ],
930
  "logging_steps": 10,
 
939
  "should_evaluate": false,
940
  "should_log": false,
941
  "should_save": true,
942
+ "should_training_stop": true
943
  },
944
  "attributes": {}
945
  }
946
  },
947
+ "total_flos": 71815179534336.0,
948
  "train_batch_size": 8,
949
  "trial_name": null,
950
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3cdc7f5c1ee11e13f289f3d3a6ae1dd489ba7111db7c7b217a9d756782d57af3
3
  size 7377
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6aaaa9ea2a04a7fc17c0d0b878afd9a08d3174e85e74b74a2eee6684ba9b3065
3
  size 7377