hlillemark committed on
Commit
b90bd04
·
verified ·
1 Parent(s): b4f4db2

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3
4
  base_model: meta-llama/Meta-Llama-3-8B-Instruct
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: sft_mc_filtered
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # sft_mc_filtered
17
 
18
- This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 2.3079
21
 
 
4
  base_model: meta-llama/Meta-Llama-3-8B-Instruct
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: sft_mc_filtered
 
16
 
17
  # sft_mc_filtered
18
 
19
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on the identity and the data_mc_filtered datasets.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 2.3079
22
 
all_results.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_loss": 2.3078930377960205,
4
+ "eval_runtime": 1.0328,
5
+ "eval_samples_per_second": 16.46,
6
+ "eval_steps_per_second": 2.905,
7
+ "total_flos": 37534574837760.0,
8
+ "train_loss": 0.26039194798469545,
9
+ "train_runtime": 1475.1742,
10
+ "train_samples_per_second": 5.403,
11
+ "train_steps_per_second": 0.339
12
+ }
eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_loss": 2.3078930377960205,
4
+ "eval_runtime": 1.0328,
5
+ "eval_samples_per_second": 16.46,
6
+ "eval_steps_per_second": 2.905
7
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 37534574837760.0,
4
+ "train_loss": 0.26039194798469545,
5
+ "train_runtime": 1475.1742,
6
+ "train_samples_per_second": 5.403,
7
+ "train_steps_per_second": 0.339
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,472 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "eval_steps": 50,
6
+ "global_step": 500,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.2,
13
+ "grad_norm": 32.215486418057836,
14
+ "learning_rate": 2.0000000000000003e-06,
15
+ "loss": 1.507,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.4,
20
+ "grad_norm": 17.53584966854256,
21
+ "learning_rate": 4.000000000000001e-06,
22
+ "loss": 1.0158,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.6,
27
+ "grad_norm": 10.21777506024786,
28
+ "learning_rate": 6e-06,
29
+ "loss": 0.9148,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.8,
34
+ "grad_norm": 11.037533581774264,
35
+ "learning_rate": 8.000000000000001e-06,
36
+ "loss": 0.7659,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 1.0,
41
+ "grad_norm": 12.348125604423814,
42
+ "learning_rate": 1e-05,
43
+ "loss": 0.8433,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 1.0,
48
+ "eval_loss": 1.0960203409194946,
49
+ "eval_runtime": 1.0335,
50
+ "eval_samples_per_second": 16.448,
51
+ "eval_steps_per_second": 2.903,
52
+ "step": 50
53
+ },
54
+ {
55
+ "epoch": 1.2,
56
+ "grad_norm": 11.850091156248704,
57
+ "learning_rate": 9.987820251299121e-06,
58
+ "loss": 0.6045,
59
+ "step": 60
60
+ },
61
+ {
62
+ "epoch": 1.4,
63
+ "grad_norm": 8.774567976123997,
64
+ "learning_rate": 9.951340343707852e-06,
65
+ "loss": 0.6436,
66
+ "step": 70
67
+ },
68
+ {
69
+ "epoch": 1.6,
70
+ "grad_norm": 7.56423224860242,
71
+ "learning_rate": 9.890738003669029e-06,
72
+ "loss": 0.6194,
73
+ "step": 80
74
+ },
75
+ {
76
+ "epoch": 1.8,
77
+ "grad_norm": 9.761428972107632,
78
+ "learning_rate": 9.806308479691595e-06,
79
+ "loss": 0.6762,
80
+ "step": 90
81
+ },
82
+ {
83
+ "epoch": 2.0,
84
+ "grad_norm": 8.275718136211415,
85
+ "learning_rate": 9.698463103929542e-06,
86
+ "loss": 0.6237,
87
+ "step": 100
88
+ },
89
+ {
90
+ "epoch": 2.0,
91
+ "eval_loss": 1.131184697151184,
92
+ "eval_runtime": 1.0288,
93
+ "eval_samples_per_second": 16.524,
94
+ "eval_steps_per_second": 2.916,
95
+ "step": 100
96
+ },
97
+ {
98
+ "epoch": 2.2,
99
+ "grad_norm": 10.243082528915453,
100
+ "learning_rate": 9.567727288213005e-06,
101
+ "loss": 0.3926,
102
+ "step": 110
103
+ },
104
+ {
105
+ "epoch": 2.4,
106
+ "grad_norm": 6.692595611616738,
107
+ "learning_rate": 9.414737964294636e-06,
108
+ "loss": 0.386,
109
+ "step": 120
110
+ },
111
+ {
112
+ "epoch": 2.6,
113
+ "grad_norm": 7.767268297417072,
114
+ "learning_rate": 9.24024048078213e-06,
115
+ "loss": 0.4027,
116
+ "step": 130
117
+ },
118
+ {
119
+ "epoch": 2.8,
120
+ "grad_norm": 8.931326769204444,
121
+ "learning_rate": 9.045084971874738e-06,
122
+ "loss": 0.397,
123
+ "step": 140
124
+ },
125
+ {
126
+ "epoch": 3.0,
127
+ "grad_norm": 8.32608967253919,
128
+ "learning_rate": 8.83022221559489e-06,
129
+ "loss": 0.391,
130
+ "step": 150
131
+ },
132
+ {
133
+ "epoch": 3.0,
134
+ "eval_loss": 1.2465544939041138,
135
+ "eval_runtime": 1.0305,
136
+ "eval_samples_per_second": 16.498,
137
+ "eval_steps_per_second": 2.911,
138
+ "step": 150
139
+ },
140
+ {
141
+ "epoch": 3.2,
142
+ "grad_norm": 7.073212488180378,
143
+ "learning_rate": 8.596699001693257e-06,
144
+ "loss": 0.2198,
145
+ "step": 160
146
+ },
147
+ {
148
+ "epoch": 3.4,
149
+ "grad_norm": 7.2442332680693875,
150
+ "learning_rate": 8.345653031794292e-06,
151
+ "loss": 0.2356,
152
+ "step": 170
153
+ },
154
+ {
155
+ "epoch": 3.6,
156
+ "grad_norm": 6.16850982087996,
157
+ "learning_rate": 8.078307376628292e-06,
158
+ "loss": 0.2416,
159
+ "step": 180
160
+ },
161
+ {
162
+ "epoch": 3.8,
163
+ "grad_norm": 5.844877152864581,
164
+ "learning_rate": 7.795964517353734e-06,
165
+ "loss": 0.2504,
166
+ "step": 190
167
+ },
168
+ {
169
+ "epoch": 4.0,
170
+ "grad_norm": 6.614175675387944,
171
+ "learning_rate": 7.500000000000001e-06,
172
+ "loss": 0.2529,
173
+ "step": 200
174
+ },
175
+ {
176
+ "epoch": 4.0,
177
+ "eval_loss": 1.4002876281738281,
178
+ "eval_runtime": 1.0309,
179
+ "eval_samples_per_second": 16.49,
180
+ "eval_steps_per_second": 2.91,
181
+ "step": 200
182
+ },
183
+ {
184
+ "epoch": 4.2,
185
+ "grad_norm": 4.846835703254587,
186
+ "learning_rate": 7.191855733945388e-06,
187
+ "loss": 0.1487,
188
+ "step": 210
189
+ },
190
+ {
191
+ "epoch": 4.4,
192
+ "grad_norm": 5.61752695846374,
193
+ "learning_rate": 6.873032967079562e-06,
194
+ "loss": 0.1411,
195
+ "step": 220
196
+ },
197
+ {
198
+ "epoch": 4.6,
199
+ "grad_norm": 7.574897231650108,
200
+ "learning_rate": 6.545084971874738e-06,
201
+ "loss": 0.1586,
202
+ "step": 230
203
+ },
204
+ {
205
+ "epoch": 4.8,
206
+ "grad_norm": 4.565228907443193,
207
+ "learning_rate": 6.209609477998339e-06,
208
+ "loss": 0.2038,
209
+ "step": 240
210
+ },
211
+ {
212
+ "epoch": 5.0,
213
+ "grad_norm": 4.885717790285905,
214
+ "learning_rate": 5.8682408883346535e-06,
215
+ "loss": 0.148,
216
+ "step": 250
217
+ },
218
+ {
219
+ "epoch": 5.0,
220
+ "eval_loss": 1.46064031124115,
221
+ "eval_runtime": 1.03,
222
+ "eval_samples_per_second": 16.505,
223
+ "eval_steps_per_second": 2.913,
224
+ "step": 250
225
+ },
226
+ {
227
+ "epoch": 5.2,
228
+ "grad_norm": 7.108983510814077,
229
+ "learning_rate": 5.522642316338268e-06,
230
+ "loss": 0.0645,
231
+ "step": 260
232
+ },
233
+ {
234
+ "epoch": 5.4,
235
+ "grad_norm": 5.454792099444044,
236
+ "learning_rate": 5.174497483512506e-06,
237
+ "loss": 0.084,
238
+ "step": 270
239
+ },
240
+ {
241
+ "epoch": 5.6,
242
+ "grad_norm": 3.0763686972511572,
243
+ "learning_rate": 4.825502516487497e-06,
244
+ "loss": 0.0773,
245
+ "step": 280
246
+ },
247
+ {
248
+ "epoch": 5.8,
249
+ "grad_norm": 4.791359217511534,
250
+ "learning_rate": 4.477357683661734e-06,
251
+ "loss": 0.1003,
252
+ "step": 290
253
+ },
254
+ {
255
+ "epoch": 6.0,
256
+ "grad_norm": 3.746744930874563,
257
+ "learning_rate": 4.131759111665349e-06,
258
+ "loss": 0.0818,
259
+ "step": 300
260
+ },
261
+ {
262
+ "epoch": 6.0,
263
+ "eval_loss": 1.5259884595870972,
264
+ "eval_runtime": 1.0326,
265
+ "eval_samples_per_second": 16.463,
266
+ "eval_steps_per_second": 2.905,
267
+ "step": 300
268
+ },
269
+ {
270
+ "epoch": 6.2,
271
+ "grad_norm": 7.111673793819084,
272
+ "learning_rate": 3.790390522001662e-06,
273
+ "loss": 0.049,
274
+ "step": 310
275
+ },
276
+ {
277
+ "epoch": 6.4,
278
+ "grad_norm": 2.936988902922916,
279
+ "learning_rate": 3.4549150281252635e-06,
280
+ "loss": 0.0428,
281
+ "step": 320
282
+ },
283
+ {
284
+ "epoch": 6.6,
285
+ "grad_norm": 1.8852801169974247,
286
+ "learning_rate": 3.12696703292044e-06,
287
+ "loss": 0.0508,
288
+ "step": 330
289
+ },
290
+ {
291
+ "epoch": 6.8,
292
+ "grad_norm": 2.313238846786626,
293
+ "learning_rate": 2.8081442660546126e-06,
294
+ "loss": 0.0394,
295
+ "step": 340
296
+ },
297
+ {
298
+ "epoch": 7.0,
299
+ "grad_norm": 3.084443875902426,
300
+ "learning_rate": 2.5000000000000015e-06,
301
+ "loss": 0.0352,
302
+ "step": 350
303
+ },
304
+ {
305
+ "epoch": 7.0,
306
+ "eval_loss": 2.0276832580566406,
307
+ "eval_runtime": 1.0308,
308
+ "eval_samples_per_second": 16.492,
309
+ "eval_steps_per_second": 2.91,
310
+ "step": 350
311
+ },
312
+ {
313
+ "epoch": 7.2,
314
+ "grad_norm": 3.5661235534617943,
315
+ "learning_rate": 2.204035482646267e-06,
316
+ "loss": 0.0272,
317
+ "step": 360
318
+ },
319
+ {
320
+ "epoch": 7.4,
321
+ "grad_norm": 0.4070183405976748,
322
+ "learning_rate": 1.9216926233717087e-06,
323
+ "loss": 0.0257,
324
+ "step": 370
325
+ },
326
+ {
327
+ "epoch": 7.6,
328
+ "grad_norm": 2.699699532592376,
329
+ "learning_rate": 1.6543469682057105e-06,
330
+ "loss": 0.0259,
331
+ "step": 380
332
+ },
333
+ {
334
+ "epoch": 7.8,
335
+ "grad_norm": 1.0226717900156348,
336
+ "learning_rate": 1.4033009983067454e-06,
337
+ "loss": 0.0179,
338
+ "step": 390
339
+ },
340
+ {
341
+ "epoch": 8.0,
342
+ "grad_norm": 1.2731011142101376,
343
+ "learning_rate": 1.1697777844051105e-06,
344
+ "loss": 0.0197,
345
+ "step": 400
346
+ },
347
+ {
348
+ "epoch": 8.0,
349
+ "eval_loss": 2.1508045196533203,
350
+ "eval_runtime": 1.0303,
351
+ "eval_samples_per_second": 16.5,
352
+ "eval_steps_per_second": 2.912,
353
+ "step": 400
354
+ },
355
+ {
356
+ "epoch": 8.2,
357
+ "grad_norm": 0.9183553687922849,
358
+ "learning_rate": 9.549150281252633e-07,
359
+ "loss": 0.009,
360
+ "step": 410
361
+ },
362
+ {
363
+ "epoch": 8.4,
364
+ "grad_norm": 1.051244265434313,
365
+ "learning_rate": 7.597595192178702e-07,
366
+ "loss": 0.0074,
367
+ "step": 420
368
+ },
369
+ {
370
+ "epoch": 8.6,
371
+ "grad_norm": 0.8106413095614442,
372
+ "learning_rate": 5.852620357053651e-07,
373
+ "loss": 0.0115,
374
+ "step": 430
375
+ },
376
+ {
377
+ "epoch": 8.8,
378
+ "grad_norm": 1.4089090443500107,
379
+ "learning_rate": 4.322727117869951e-07,
380
+ "loss": 0.0142,
381
+ "step": 440
382
+ },
383
+ {
384
+ "epoch": 9.0,
385
+ "grad_norm": 0.2897517823285663,
386
+ "learning_rate": 3.015368960704584e-07,
387
+ "loss": 0.0129,
388
+ "step": 450
389
+ },
390
+ {
391
+ "epoch": 9.0,
392
+ "eval_loss": 2.2827517986297607,
393
+ "eval_runtime": 1.0366,
394
+ "eval_samples_per_second": 16.401,
395
+ "eval_steps_per_second": 2.894,
396
+ "step": 450
397
+ },
398
+ {
399
+ "epoch": 9.2,
400
+ "grad_norm": 0.581294302728187,
401
+ "learning_rate": 1.9369152030840553e-07,
402
+ "loss": 0.009,
403
+ "step": 460
404
+ },
405
+ {
406
+ "epoch": 9.4,
407
+ "grad_norm": 0.07224896290446714,
408
+ "learning_rate": 1.0926199633097156e-07,
409
+ "loss": 0.0071,
410
+ "step": 470
411
+ },
412
+ {
413
+ "epoch": 9.6,
414
+ "grad_norm": 0.9382264066330144,
415
+ "learning_rate": 4.865965629214819e-08,
416
+ "loss": 0.0058,
417
+ "step": 480
418
+ },
419
+ {
420
+ "epoch": 9.8,
421
+ "grad_norm": 0.7841805435055503,
422
+ "learning_rate": 1.2179748700879013e-08,
423
+ "loss": 0.0104,
424
+ "step": 490
425
+ },
426
+ {
427
+ "epoch": 10.0,
428
+ "grad_norm": 0.5841804580391363,
429
+ "learning_rate": 0.0,
430
+ "loss": 0.0066,
431
+ "step": 500
432
+ },
433
+ {
434
+ "epoch": 10.0,
435
+ "eval_loss": 2.3078930377960205,
436
+ "eval_runtime": 1.0306,
437
+ "eval_samples_per_second": 16.495,
438
+ "eval_steps_per_second": 2.911,
439
+ "step": 500
440
+ },
441
+ {
442
+ "epoch": 10.0,
443
+ "step": 500,
444
+ "total_flos": 37534574837760.0,
445
+ "train_loss": 0.26039194798469545,
446
+ "train_runtime": 1475.1742,
447
+ "train_samples_per_second": 5.403,
448
+ "train_steps_per_second": 0.339
449
+ }
450
+ ],
451
+ "logging_steps": 10,
452
+ "max_steps": 500,
453
+ "num_input_tokens_seen": 0,
454
+ "num_train_epochs": 10,
455
+ "save_steps": 500,
456
+ "stateful_callbacks": {
457
+ "TrainerControl": {
458
+ "args": {
459
+ "should_epoch_stop": false,
460
+ "should_evaluate": false,
461
+ "should_log": false,
462
+ "should_save": true,
463
+ "should_training_stop": true
464
+ },
465
+ "attributes": {}
466
+ }
467
+ },
468
+ "total_flos": 37534574837760.0,
469
+ "train_batch_size": 2,
470
+ "trial_name": null,
471
+ "trial_params": null
472
+ }
training_eval_loss.png ADDED
training_loss.png ADDED