sedrickkeh committed on
Commit
6325898
·
verified ·
1 Parent(s): 7d275a8

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3.1
4
  base_model: meta-llama/Meta-Llama-3.1-8B
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: oh_teknium_scaling_down_random_0.4
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # oh_teknium_scaling_down_random_0.4
17
 
18
- This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 0.5314
21
 
 
4
  base_model: meta-llama/Meta-Llama-3.1-8B
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: oh_teknium_scaling_down_random_0.4
 
16
 
17
  # oh_teknium_scaling_down_random_0.4
18
 
19
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/oh_teknium_scaling_down_random_0.4 dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.5314
22
 
all_results.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9973799126637557,
3
+ "eval_loss": 0.5314457416534424,
4
+ "eval_runtime": 99.4888,
5
+ "eval_samples_per_second": 38.768,
6
+ "eval_steps_per_second": 0.613,
7
+ "total_flos": 718381598638080.0,
8
+ "train_loss": 0.5104286659569729,
9
+ "train_runtime": 14367.279,
10
+ "train_samples_per_second": 15.301,
11
+ "train_steps_per_second": 0.03
12
+ }
eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9973799126637557,
3
+ "eval_loss": 0.5314457416534424,
4
+ "eval_runtime": 99.4888,
5
+ "eval_samples_per_second": 38.768,
6
+ "eval_steps_per_second": 0.613
7
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9973799126637557,
3
+ "total_flos": 718381598638080.0,
4
+ "train_loss": 0.5104286659569729,
5
+ "train_runtime": 14367.279,
6
+ "train_samples_per_second": 15.301,
7
+ "train_steps_per_second": 0.03
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.9973799126637557,
5
+ "eval_steps": 500,
6
+ "global_step": 429,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06986899563318777,
13
+ "grad_norm": 5.182594166300789,
14
+ "learning_rate": 5e-06,
15
+ "loss": 0.6932,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.13973799126637554,
20
+ "grad_norm": 0.8105274865614129,
21
+ "learning_rate": 5e-06,
22
+ "loss": 0.6288,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.2096069868995633,
27
+ "grad_norm": 0.8904752466777895,
28
+ "learning_rate": 5e-06,
29
+ "loss": 0.6036,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.2794759825327511,
34
+ "grad_norm": 0.6207860876300546,
35
+ "learning_rate": 5e-06,
36
+ "loss": 0.5902,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.34934497816593885,
41
+ "grad_norm": 0.7152500537832366,
42
+ "learning_rate": 5e-06,
43
+ "loss": 0.5769,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.4192139737991266,
48
+ "grad_norm": 0.7013464450514418,
49
+ "learning_rate": 5e-06,
50
+ "loss": 0.5766,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.4890829694323144,
55
+ "grad_norm": 0.6585965690127796,
56
+ "learning_rate": 5e-06,
57
+ "loss": 0.5608,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.5589519650655022,
62
+ "grad_norm": 0.5056238657103318,
63
+ "learning_rate": 5e-06,
64
+ "loss": 0.5628,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.62882096069869,
69
+ "grad_norm": 0.5015857792164096,
70
+ "learning_rate": 5e-06,
71
+ "loss": 0.5568,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.6986899563318777,
76
+ "grad_norm": 0.5250734927951002,
77
+ "learning_rate": 5e-06,
78
+ "loss": 0.5527,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.7685589519650655,
83
+ "grad_norm": 0.5330395330357691,
84
+ "learning_rate": 5e-06,
85
+ "loss": 0.5538,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.8384279475982532,
90
+ "grad_norm": 0.5383302692811954,
91
+ "learning_rate": 5e-06,
92
+ "loss": 0.5421,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.9082969432314411,
97
+ "grad_norm": 0.4580590442515656,
98
+ "learning_rate": 5e-06,
99
+ "loss": 0.5438,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.9781659388646288,
104
+ "grad_norm": 0.5111086405959315,
105
+ "learning_rate": 5e-06,
106
+ "loss": 0.547,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.9991266375545852,
111
+ "eval_loss": 0.5411319136619568,
112
+ "eval_runtime": 97.15,
113
+ "eval_samples_per_second": 39.702,
114
+ "eval_steps_per_second": 0.628,
115
+ "step": 143
116
+ },
117
+ {
118
+ "epoch": 1.0480349344978166,
119
+ "grad_norm": 0.5986185773031188,
120
+ "learning_rate": 5e-06,
121
+ "loss": 0.561,
122
+ "step": 150
123
+ },
124
+ {
125
+ "epoch": 1.1179039301310043,
126
+ "grad_norm": 0.6935147463024353,
127
+ "learning_rate": 5e-06,
128
+ "loss": 0.4992,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 1.1877729257641922,
133
+ "grad_norm": 0.828701854577101,
134
+ "learning_rate": 5e-06,
135
+ "loss": 0.5037,
136
+ "step": 170
137
+ },
138
+ {
139
+ "epoch": 1.25764192139738,
140
+ "grad_norm": 0.47892479220169987,
141
+ "learning_rate": 5e-06,
142
+ "loss": 0.4977,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 1.3275109170305677,
147
+ "grad_norm": 0.4854767069935743,
148
+ "learning_rate": 5e-06,
149
+ "loss": 0.4972,
150
+ "step": 190
151
+ },
152
+ {
153
+ "epoch": 1.3973799126637554,
154
+ "grad_norm": 0.4636030652817092,
155
+ "learning_rate": 5e-06,
156
+ "loss": 0.4939,
157
+ "step": 200
158
+ },
159
+ {
160
+ "epoch": 1.467248908296943,
161
+ "grad_norm": 0.46460367066805386,
162
+ "learning_rate": 5e-06,
163
+ "loss": 0.497,
164
+ "step": 210
165
+ },
166
+ {
167
+ "epoch": 1.537117903930131,
168
+ "grad_norm": 0.5541857158168692,
169
+ "learning_rate": 5e-06,
170
+ "loss": 0.4967,
171
+ "step": 220
172
+ },
173
+ {
174
+ "epoch": 1.6069868995633187,
175
+ "grad_norm": 0.43929258201041527,
176
+ "learning_rate": 5e-06,
177
+ "loss": 0.4972,
178
+ "step": 230
179
+ },
180
+ {
181
+ "epoch": 1.6768558951965065,
182
+ "grad_norm": 0.5673809631668943,
183
+ "learning_rate": 5e-06,
184
+ "loss": 0.5003,
185
+ "step": 240
186
+ },
187
+ {
188
+ "epoch": 1.7467248908296944,
189
+ "grad_norm": 0.5312435781280381,
190
+ "learning_rate": 5e-06,
191
+ "loss": 0.5029,
192
+ "step": 250
193
+ },
194
+ {
195
+ "epoch": 1.8165938864628821,
196
+ "grad_norm": 0.42426976715635795,
197
+ "learning_rate": 5e-06,
198
+ "loss": 0.4881,
199
+ "step": 260
200
+ },
201
+ {
202
+ "epoch": 1.8864628820960698,
203
+ "grad_norm": 0.4900651685024478,
204
+ "learning_rate": 5e-06,
205
+ "loss": 0.4914,
206
+ "step": 270
207
+ },
208
+ {
209
+ "epoch": 1.9563318777292578,
210
+ "grad_norm": 0.5082119447902608,
211
+ "learning_rate": 5e-06,
212
+ "loss": 0.4989,
213
+ "step": 280
214
+ },
215
+ {
216
+ "epoch": 1.9982532751091702,
217
+ "eval_loss": 0.5304298996925354,
218
+ "eval_runtime": 100.0792,
219
+ "eval_samples_per_second": 38.539,
220
+ "eval_steps_per_second": 0.61,
221
+ "step": 286
222
+ },
223
+ {
224
+ "epoch": 2.0262008733624453,
225
+ "grad_norm": 0.5750485199278518,
226
+ "learning_rate": 5e-06,
227
+ "loss": 0.5172,
228
+ "step": 290
229
+ },
230
+ {
231
+ "epoch": 2.096069868995633,
232
+ "grad_norm": 0.5082128041211101,
233
+ "learning_rate": 5e-06,
234
+ "loss": 0.4502,
235
+ "step": 300
236
+ },
237
+ {
238
+ "epoch": 2.165938864628821,
239
+ "grad_norm": 0.45659427403567093,
240
+ "learning_rate": 5e-06,
241
+ "loss": 0.4449,
242
+ "step": 310
243
+ },
244
+ {
245
+ "epoch": 2.2358078602620086,
246
+ "grad_norm": 0.5382190170872588,
247
+ "learning_rate": 5e-06,
248
+ "loss": 0.4475,
249
+ "step": 320
250
+ },
251
+ {
252
+ "epoch": 2.3056768558951966,
253
+ "grad_norm": 0.5824902306199307,
254
+ "learning_rate": 5e-06,
255
+ "loss": 0.4511,
256
+ "step": 330
257
+ },
258
+ {
259
+ "epoch": 2.3755458515283845,
260
+ "grad_norm": 0.49390187220978227,
261
+ "learning_rate": 5e-06,
262
+ "loss": 0.4527,
263
+ "step": 340
264
+ },
265
+ {
266
+ "epoch": 2.445414847161572,
267
+ "grad_norm": 0.5075624377832567,
268
+ "learning_rate": 5e-06,
269
+ "loss": 0.4544,
270
+ "step": 350
271
+ },
272
+ {
273
+ "epoch": 2.51528384279476,
274
+ "grad_norm": 0.49339275826514356,
275
+ "learning_rate": 5e-06,
276
+ "loss": 0.4483,
277
+ "step": 360
278
+ },
279
+ {
280
+ "epoch": 2.5851528384279474,
281
+ "grad_norm": 0.46662802106266,
282
+ "learning_rate": 5e-06,
283
+ "loss": 0.454,
284
+ "step": 370
285
+ },
286
+ {
287
+ "epoch": 2.6550218340611353,
288
+ "grad_norm": 0.55123614316667,
289
+ "learning_rate": 5e-06,
290
+ "loss": 0.4531,
291
+ "step": 380
292
+ },
293
+ {
294
+ "epoch": 2.7248908296943233,
295
+ "grad_norm": 0.5404203217788516,
296
+ "learning_rate": 5e-06,
297
+ "loss": 0.4479,
298
+ "step": 390
299
+ },
300
+ {
301
+ "epoch": 2.7947598253275108,
302
+ "grad_norm": 0.45881743373968936,
303
+ "learning_rate": 5e-06,
304
+ "loss": 0.4518,
305
+ "step": 400
306
+ },
307
+ {
308
+ "epoch": 2.8646288209606987,
309
+ "grad_norm": 0.44281752763961424,
310
+ "learning_rate": 5e-06,
311
+ "loss": 0.4532,
312
+ "step": 410
313
+ },
314
+ {
315
+ "epoch": 2.934497816593886,
316
+ "grad_norm": 0.48358299433650903,
317
+ "learning_rate": 5e-06,
318
+ "loss": 0.4588,
319
+ "step": 420
320
+ },
321
+ {
322
+ "epoch": 2.9973799126637557,
323
+ "eval_loss": 0.5314457416534424,
324
+ "eval_runtime": 97.6794,
325
+ "eval_samples_per_second": 39.486,
326
+ "eval_steps_per_second": 0.624,
327
+ "step": 429
328
+ },
329
+ {
330
+ "epoch": 2.9973799126637557,
331
+ "step": 429,
332
+ "total_flos": 718381598638080.0,
333
+ "train_loss": 0.5104286659569729,
334
+ "train_runtime": 14367.279,
335
+ "train_samples_per_second": 15.301,
336
+ "train_steps_per_second": 0.03
337
+ }
338
+ ],
339
+ "logging_steps": 10,
340
+ "max_steps": 429,
341
+ "num_input_tokens_seen": 0,
342
+ "num_train_epochs": 3,
343
+ "save_steps": 500,
344
+ "stateful_callbacks": {
345
+ "TrainerControl": {
346
+ "args": {
347
+ "should_epoch_stop": false,
348
+ "should_evaluate": false,
349
+ "should_log": false,
350
+ "should_save": true,
351
+ "should_training_stop": true
352
+ },
353
+ "attributes": {}
354
+ }
355
+ },
356
+ "total_flos": 718381598638080.0,
357
+ "train_batch_size": 8,
358
+ "trial_name": null,
359
+ "trial_params": null
360
+ }
training_eval_loss.png ADDED
training_loss.png ADDED