aq1048576 committed on
Commit
8c8a274
·
verified ·
1 Parent(s): 0cc0853

Upload math SFT checkpoint

Browse files
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-3.1-8B
3
+ library_name: transformers
4
+ model_name: math
5
+ tags:
6
+ - generated_from_trainer
7
+ - sft
8
+ - trl
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for math
13
+
14
+ This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="None", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/aq1048576-princeton-university/huggingface/runs/3npxcokv)
31
+
32
+
33
+ This model was trained with SFT.
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.23.0
38
+ - Transformers: 4.57.1
39
+ - Pytorch: 2.8.0
40
+ - Datasets: 3.5.0
41
+ - Tokenizers: 0.22.1
42
+
43
+ ## Citations
44
+
45
+
46
+
47
+ Cite TRL as:
48
+
49
+ ```bibtex
50
+ @misc{vonwerra2022trl,
51
+ title = {{TRL: Transformer Reinforcement Learning}},
52
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
53
+ year = 2020,
54
+ journal = {GitHub repository},
55
+ publisher = {GitHub},
56
+ howpublished = {\url{https://github.com/huggingface/trl}}
57
+ }
58
+ ```
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c9ee29d73bdd07ea72d03279bc2ffbfa3ee77484a01abf00bfc822f308d37ca
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdbdb0b664d4e9c7e513be6668cd1fc051b071e6a652e9948678364600125476
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19fffaf57a12349b94838398d632a9bcf3dfd88656db7a7c245606f89053e977
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6a7804fa585d031b548d1ef958def1c8e0ee5af66c5d7ebb360eaec23815b5a
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57e0ce2e5ee497b4672592217620cffa453c2eb6b2944795170e4f243c3dd88b
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea953a29f478a82af8b6c1c26ec4d58878da0a0bac82741718169aad010a0cea
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b4e8defb49000fb60157a8d2488c46c8a88029f07de7e8d2b6da55342ddf3c8
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c0d3c81356ced85ec9704dc8b1a6aec7026326ee508f15199a0e02e4d096094
3
  size 1168138808
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2f90a0ee1b41702c7b233b02234294a53bc0684a08d3bcd8c8ff702e9a12f64
3
- size 17210019
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
3
+ size 17209920
tokenizer_config.json CHANGED
@@ -2059,5 +2059,6 @@
2059
  ],
2060
  "model_max_length": 4096,
2061
  "pad_token": "<|end_of_text|>",
2062
- "tokenizer_class": "PreTrainedTokenizerFast"
 
2063
  }
 
2059
  ],
2060
  "model_max_length": 4096,
2061
  "pad_token": "<|end_of_text|>",
2062
+ "tokenizer_class": "PreTrainedTokenizerFast",
2063
+ "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + '\n' }}{% elif message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if not loop.last %}{{ '<|assistant|>\n' + message['content'] + eos_token + '\n' }}{% else %}{{ '<|assistant|>\n' + message['content'] + eos_token }}{% endif %}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}{% endfor %}"
2064
  }
trainer_state.json CHANGED
@@ -4,487 +4,86 @@
4
  "best_model_checkpoint": null,
5
  "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 1293,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.015472391451503724,
14
- "grad_norm": 0.86328125,
15
- "learning_rate": 3.653846153846154e-05,
16
- "loss": 0.5808,
17
- "step": 20
18
- },
19
- {
20
- "epoch": 0.030944782903007447,
21
- "grad_norm": 0.609375,
22
- "learning_rate": 4.998701309604454e-05,
23
- "loss": 0.4787,
24
- "step": 40
25
- },
26
- {
27
- "epoch": 0.04641717435451117,
28
- "grad_norm": 0.5625,
29
- "learning_rate": 4.9916354596319675e-05,
30
- "loss": 0.4591,
31
- "step": 60
32
- },
33
- {
34
- "epoch": 0.061889565806014894,
35
- "grad_norm": 0.59765625,
36
- "learning_rate": 4.978443256845644e-05,
37
- "loss": 0.4506,
38
- "step": 80
39
- },
40
- {
41
- "epoch": 0.07736195725751861,
42
- "grad_norm": 0.53125,
43
- "learning_rate": 4.9591571378077203e-05,
44
- "loss": 0.4429,
45
- "step": 100
46
- },
47
- {
48
- "epoch": 0.09283434870902234,
49
- "grad_norm": 0.6015625,
50
- "learning_rate": 4.933824522604945e-05,
51
- "loss": 0.4434,
52
- "step": 120
53
- },
54
- {
55
- "epoch": 0.10830674016052606,
56
- "grad_norm": 0.52734375,
57
- "learning_rate": 4.9025076982535925e-05,
58
- "loss": 0.4411,
59
- "step": 140
60
- },
61
- {
62
- "epoch": 0.12377913161202979,
63
- "grad_norm": 0.50390625,
64
- "learning_rate": 4.865283665550167e-05,
65
- "loss": 0.4358,
66
- "step": 160
67
- },
68
- {
69
- "epoch": 0.1392515230635335,
70
- "grad_norm": 0.54296875,
71
- "learning_rate": 4.8222439497443233e-05,
72
- "loss": 0.4324,
73
- "step": 180
74
- },
75
- {
76
- "epoch": 0.15472391451503723,
77
- "grad_norm": 0.453125,
78
- "learning_rate": 4.773494375499543e-05,
79
- "loss": 0.4336,
80
- "step": 200
81
- },
82
- {
83
- "epoch": 0.17019630596654095,
84
- "grad_norm": 0.515625,
85
- "learning_rate": 4.7191548066948686e-05,
86
- "loss": 0.4289,
87
- "step": 220
88
- },
89
- {
90
- "epoch": 0.18566869741804468,
91
- "grad_norm": 0.4765625,
92
- "learning_rate": 4.659358851707464e-05,
93
- "loss": 0.4242,
94
- "step": 240
95
- },
96
- {
97
- "epoch": 0.2011410888695484,
98
- "grad_norm": 0.45703125,
99
- "learning_rate": 4.5942535349006555e-05,
100
- "loss": 0.42,
101
- "step": 260
102
- },
103
- {
104
- "epoch": 0.21661348032105213,
105
- "grad_norm": 0.439453125,
106
- "learning_rate": 4.523998935125173e-05,
107
- "loss": 0.4255,
108
- "step": 280
109
- },
110
- {
111
- "epoch": 0.23208587177255585,
112
- "grad_norm": 0.46484375,
113
- "learning_rate": 4.44876779212244e-05,
114
- "loss": 0.421,
115
- "step": 300
116
- },
117
- {
118
- "epoch": 0.24755826322405958,
119
- "grad_norm": 0.54296875,
120
- "learning_rate": 4.368745081797678e-05,
121
- "loss": 0.4207,
122
- "step": 320
123
- },
124
- {
125
- "epoch": 0.2630306546755633,
126
- "grad_norm": 0.44140625,
127
- "learning_rate": 4.2841275614071176e-05,
128
- "loss": 0.4133,
129
- "step": 340
130
- },
131
- {
132
- "epoch": 0.278503046127067,
133
- "grad_norm": 0.5,
134
- "learning_rate": 4.1951232857776164e-05,
135
- "loss": 0.4144,
136
- "step": 360
137
- },
138
- {
139
- "epoch": 0.29397543757857075,
140
- "grad_norm": 0.404296875,
141
- "learning_rate": 4.1019510957481656e-05,
142
- "loss": 0.412,
143
- "step": 380
144
- },
145
- {
146
- "epoch": 0.30944782903007445,
147
- "grad_norm": 0.453125,
148
- "learning_rate": 4.004840080091103e-05,
149
- "loss": 0.4109,
150
- "step": 400
151
- },
152
- {
153
- "epoch": 0.3249202204815782,
154
- "grad_norm": 0.439453125,
155
- "learning_rate": 3.904029012236033e-05,
156
- "loss": 0.4078,
157
- "step": 420
158
- },
159
- {
160
- "epoch": 0.3403926119330819,
161
- "grad_norm": 0.447265625,
162
- "learning_rate": 3.7997657631814363e-05,
163
- "loss": 0.4048,
164
- "step": 440
165
- },
166
- {
167
- "epoch": 0.35586500338458565,
168
- "grad_norm": 0.44921875,
169
- "learning_rate": 3.6923066920374494e-05,
170
- "loss": 0.4012,
171
- "step": 460
172
- },
173
- {
174
- "epoch": 0.37133739483608935,
175
- "grad_norm": 0.43359375,
176
- "learning_rate": 3.5819160156983755e-05,
177
- "loss": 0.4033,
178
- "step": 480
179
- },
180
- {
181
- "epoch": 0.38680978628759305,
182
- "grad_norm": 0.427734375,
183
- "learning_rate": 3.4688651591947096e-05,
184
- "loss": 0.4046,
185
- "step": 500
186
- },
187
- {
188
- "epoch": 0.38680978628759305,
189
- "eval_loss": 0.3988688290119171,
190
- "eval_runtime": 35.5904,
191
- "eval_samples_per_second": 93.93,
192
- "eval_steps_per_second": 2.95,
193
  "step": 500
194
  },
195
  {
196
- "epoch": 0.4022821777390968,
197
- "grad_norm": 0.421875,
198
- "learning_rate": 3.3534320883220366e-05,
199
- "loss": 0.4044,
200
- "step": 520
201
- },
202
- {
203
- "epoch": 0.4177545691906005,
204
- "grad_norm": 0.421875,
205
- "learning_rate": 3.235900626187713e-05,
206
- "loss": 0.3979,
207
- "step": 540
208
- },
209
- {
210
- "epoch": 0.43322696064210425,
211
- "grad_norm": 0.458984375,
212
- "learning_rate": 3.116559755355772e-05,
213
- "loss": 0.3964,
214
- "step": 560
215
- },
216
- {
217
- "epoch": 0.44869935209360795,
218
- "grad_norm": 0.416015625,
219
- "learning_rate": 2.9957029073059272e-05,
220
- "loss": 0.3983,
221
- "step": 580
222
- },
223
- {
224
- "epoch": 0.4641717435451117,
225
- "grad_norm": 0.423828125,
226
- "learning_rate": 2.8736272409537257e-05,
227
- "loss": 0.3909,
228
- "step": 600
229
- },
230
- {
231
- "epoch": 0.4796441349966154,
232
- "grad_norm": 0.400390625,
233
- "learning_rate": 2.7506329120058007e-05,
234
- "loss": 0.392,
235
- "step": 620
236
- },
237
- {
238
- "epoch": 0.49511652644811915,
239
- "grad_norm": 0.390625,
240
- "learning_rate": 2.6270223349467123e-05,
241
- "loss": 0.3923,
242
- "step": 640
243
- },
244
- {
245
- "epoch": 0.5105889178996229,
246
- "grad_norm": 0.38671875,
247
- "learning_rate": 2.503099439471977e-05,
248
- "loss": 0.3922,
249
- "step": 660
250
- },
251
- {
252
- "epoch": 0.5260613093511266,
253
- "grad_norm": 0.3984375,
254
- "learning_rate": 2.3791689231955474e-05,
255
- "loss": 0.389,
256
- "step": 680
257
- },
258
- {
259
- "epoch": 0.5415337008026303,
260
- "grad_norm": 0.375,
261
- "learning_rate": 2.2555355024691588e-05,
262
- "loss": 0.3908,
263
- "step": 700
264
- },
265
- {
266
- "epoch": 0.557006092254134,
267
- "grad_norm": 0.42578125,
268
- "learning_rate": 2.1325031631555993e-05,
269
- "loss": 0.3848,
270
- "step": 720
271
- },
272
- {
273
- "epoch": 0.5724784837056377,
274
- "grad_norm": 0.37890625,
275
- "learning_rate": 2.0103744131980902e-05,
276
- "loss": 0.3887,
277
- "step": 740
278
- },
279
- {
280
- "epoch": 0.5879508751571415,
281
- "grad_norm": 0.380859375,
282
- "learning_rate": 1.8894495388235166e-05,
283
- "loss": 0.3905,
284
- "step": 760
285
- },
286
- {
287
- "epoch": 0.6034232666086452,
288
- "grad_norm": 0.37109375,
289
- "learning_rate": 1.7700258662083573e-05,
290
- "loss": 0.3907,
291
- "step": 780
292
- },
293
- {
294
- "epoch": 0.6188956580601489,
295
- "grad_norm": 0.376953125,
296
- "learning_rate": 1.6523970304226778e-05,
297
- "loss": 0.3875,
298
- "step": 800
299
- },
300
- {
301
- "epoch": 0.6343680495116526,
302
- "grad_norm": 0.376953125,
303
- "learning_rate": 1.5368522534496994e-05,
304
- "loss": 0.3843,
305
- "step": 820
306
- },
307
- {
308
- "epoch": 0.6498404409631564,
309
- "grad_norm": 0.373046875,
310
- "learning_rate": 1.4236756330561318e-05,
311
- "loss": 0.3826,
312
- "step": 840
313
- },
314
- {
315
- "epoch": 0.6653128324146601,
316
- "grad_norm": 0.3671875,
317
- "learning_rate": 1.3131454442617521e-05,
318
- "loss": 0.3832,
319
- "step": 860
320
- },
321
- {
322
- "epoch": 0.6807852238661638,
323
- "grad_norm": 0.38671875,
324
- "learning_rate": 1.2055334551257747e-05,
325
- "loss": 0.3804,
326
- "step": 880
327
- },
328
- {
329
- "epoch": 0.6962576153176675,
330
- "grad_norm": 0.357421875,
331
- "learning_rate": 1.1011042585323233e-05,
332
- "loss": 0.3798,
333
- "step": 900
334
- },
335
- {
336
- "epoch": 0.7117300067691713,
337
- "grad_norm": 0.36328125,
338
- "learning_rate": 1.000114621617988e-05,
339
- "loss": 0.3834,
340
- "step": 920
341
- },
342
- {
343
- "epoch": 0.727202398220675,
344
- "grad_norm": 0.357421875,
345
- "learning_rate": 9.028128544410814e-06,
346
- "loss": 0.3764,
347
- "step": 940
348
- },
349
- {
350
- "epoch": 0.7426747896721787,
351
- "grad_norm": 0.39453125,
352
- "learning_rate": 8.094381994448896e-06,
353
- "loss": 0.3772,
354
- "step": 960
355
- },
356
- {
357
- "epoch": 0.7581471811236824,
358
- "grad_norm": 0.400390625,
359
- "learning_rate": 7.202202432160712e-06,
360
- "loss": 0.3857,
361
- "step": 980
362
- },
363
- {
364
- "epoch": 0.7736195725751861,
365
- "grad_norm": 0.380859375,
366
- "learning_rate": 6.3537835198457515e-06,
367
- "loss": 0.3811,
368
- "step": 1000
369
- },
370
- {
371
- "epoch": 0.7736195725751861,
372
- "eval_loss": 0.3767356276512146,
373
- "eval_runtime": 35.55,
374
- "eval_samples_per_second": 94.036,
375
- "eval_steps_per_second": 2.954,
376
  "step": 1000
377
  },
378
  {
379
- "epoch": 0.7890919640266899,
380
- "grad_norm": 0.359375,
381
- "learning_rate": 5.551211322530381e-06,
382
- "loss": 0.3784,
383
- "step": 1020
384
- },
385
- {
386
- "epoch": 0.8045643554781936,
387
- "grad_norm": 0.359375,
388
- "learning_rate": 4.796459178818496e-06,
389
- "loss": 0.3821,
390
- "step": 1040
391
- },
392
- {
393
- "epoch": 0.8200367469296973,
394
- "grad_norm": 0.375,
395
- "learning_rate": 4.09138284891028e-06,
396
- "loss": 0.3756,
397
- "step": 1060
398
- },
399
- {
400
- "epoch": 0.835509138381201,
401
- "grad_norm": 0.36328125,
402
- "learning_rate": 3.4377159517189896e-06,
403
- "loss": 0.3774,
404
- "step": 1080
405
  },
406
  {
407
- "epoch": 0.8509815298327048,
408
- "grad_norm": 0.376953125,
409
- "learning_rate": 2.837065702304667e-06,
410
- "loss": 0.3824,
411
- "step": 1100
 
 
 
412
  },
413
  {
414
- "epoch": 0.8664539212842085,
415
- "grad_norm": 0.36328125,
416
- "learning_rate": 2.2909089601057367e-06,
417
- "loss": 0.3819,
418
- "step": 1120
419
- },
420
- {
421
- "epoch": 0.8819263127357122,
422
- "grad_norm": 0.369140625,
423
- "learning_rate": 1.800588597684652e-06,
424
- "loss": 0.3772,
425
- "step": 1140
426
- },
427
- {
428
- "epoch": 0.8973987041872159,
429
- "grad_norm": 0.376953125,
430
- "learning_rate": 1.3673101989161912e-06,
431
- "loss": 0.3787,
432
- "step": 1160
433
- },
434
- {
435
- "epoch": 0.9128710956387197,
436
- "grad_norm": 0.353515625,
437
- "learning_rate": 9.921390947368076e-07,
438
- "loss": 0.3779,
439
- "step": 1180
440
- },
441
- {
442
- "epoch": 0.9283434870902234,
443
- "grad_norm": 0.359375,
444
- "learning_rate": 6.759977437432647e-07,
445
- "loss": 0.3803,
446
- "step": 1200
447
- },
448
- {
449
- "epoch": 0.9438158785417271,
450
- "grad_norm": 0.35546875,
451
- "learning_rate": 4.196634640812602e-07,
452
- "loss": 0.3804,
453
- "step": 1220
454
- },
455
- {
456
- "epoch": 0.9592882699932308,
457
- "grad_norm": 0.365234375,
458
- "learning_rate": 2.237665222006552e-07,
459
- "loss": 0.38,
460
- "step": 1240
461
- },
462
- {
463
- "epoch": 0.9747606614447345,
464
- "grad_norm": 0.36328125,
465
- "learning_rate": 8.87885831766827e-08,
466
- "loss": 0.3757,
467
- "step": 1260
468
  },
469
  {
470
- "epoch": 0.9902330528962383,
471
- "grad_norm": 0.376953125,
472
- "learning_rate": 1.5061526407406103e-08,
473
- "loss": 0.3798,
474
- "step": 1280
 
 
 
475
  },
476
  {
 
477
  "epoch": 1.0,
478
- "step": 1293,
479
- "total_flos": 2.149210011822614e+19,
480
- "train_loss": 0.4033243281843122,
481
- "train_runtime": 11128.3081,
482
- "train_samples_per_second": 29.736,
483
- "train_steps_per_second": 0.116
 
 
484
  }
485
  ],
486
- "logging_steps": 20,
487
- "max_steps": 1293,
488
  "num_input_tokens_seen": 0,
489
  "num_train_epochs": 1,
490
  "save_steps": 500,
@@ -500,7 +99,7 @@
500
  "attributes": {}
501
  }
502
  },
503
- "total_flos": 2.149210011822614e+19,
504
  "train_batch_size": 4,
505
  "trial_name": null,
506
  "trial_params": null
 
4
  "best_model_checkpoint": null,
5
  "epoch": 1.0,
6
  "eval_steps": 500,
7
+ "global_step": 3447,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 0.5079199960827827,
14
+ "epoch": 0.1450536698578474,
15
+ "grad_norm": 0.67578125,
16
+ "learning_rate": 4.363528715216104e-05,
17
+ "loss": 0.5061,
18
+ "mean_token_accuracy": 0.847917699560523,
19
+ "num_tokens": 34675536.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  "step": 500
21
  },
22
  {
23
+ "entropy": 0.4645708839818835,
24
+ "epoch": 0.2901073397156948,
25
+ "grad_norm": 0.609375,
26
+ "learning_rate": 3.6234458259325044e-05,
27
+ "loss": 0.4625,
28
+ "mean_token_accuracy": 0.8582921151965857,
29
+ "num_tokens": 69136517.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  "step": 1000
31
  },
32
  {
33
+ "entropy": 0.4413825359642506,
34
+ "epoch": 0.4351610095735422,
35
+ "grad_norm": 0.5859375,
36
+ "learning_rate": 2.8833629366489046e-05,
37
+ "loss": 0.4391,
38
+ "mean_token_accuracy": 0.8642738572955132,
39
+ "num_tokens": 103657138.0,
40
+ "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  },
42
  {
43
+ "entropy": 0.426292674459517,
44
+ "epoch": 0.5802146794313896,
45
+ "grad_norm": 0.58984375,
46
+ "learning_rate": 2.143280047365305e-05,
47
+ "loss": 0.4239,
48
+ "mean_token_accuracy": 0.8679788280278444,
49
+ "num_tokens": 138126943.0,
50
+ "step": 2000
51
  },
52
  {
53
+ "entropy": 0.41619670213758947,
54
+ "epoch": 0.725268349289237,
55
+ "grad_norm": 0.6015625,
56
+ "learning_rate": 1.4031971580817053e-05,
57
+ "loss": 0.4135,
58
+ "mean_token_accuracy": 0.8707230059802532,
59
+ "num_tokens": 172549297.0,
60
+ "step": 2500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  },
62
  {
63
+ "entropy": 0.41040751719474794,
64
+ "epoch": 0.8703220191470844,
65
+ "grad_norm": 0.61328125,
66
+ "learning_rate": 6.631142687981054e-06,
67
+ "loss": 0.4075,
68
+ "mean_token_accuracy": 0.8723464601933956,
69
+ "num_tokens": 207014931.0,
70
+ "step": 3000
71
  },
72
  {
73
+ "entropy": 0.40811442769647177,
74
  "epoch": 1.0,
75
+ "mean_token_accuracy": 0.8727223603117386,
76
+ "num_tokens": 237886773.0,
77
+ "step": 3447,
78
+ "total_flos": 1.4839042051252683e+19,
79
+ "train_loss": 0.43732520784887674,
80
+ "train_runtime": 16918.1607,
81
+ "train_samples_per_second": 19.559,
82
+ "train_steps_per_second": 0.204
83
  }
84
  ],
85
+ "logging_steps": 500,
86
+ "max_steps": 3447,
87
  "num_input_tokens_seen": 0,
88
  "num_train_epochs": 1,
89
  "save_steps": 500,
 
99
  "attributes": {}
100
  }
101
  },
102
+ "total_flos": 1.4839042051252683e+19,
103
  "train_batch_size": 4,
104
  "trial_name": null,
105
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e59b1b525c9b30f502e4fa90763c8538b3812972f6f2d3b635427432568b1e5f
3
- size 5841
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81d65e9d414959d083a9cc3fc2b708a400d23698532b047892373466c21f0227
3
+ size 6289