rbelanec committed on
Commit
e7ecc5e
verified
1 Parent(s): f8f5eb3

End of training

Browse files
README.md CHANGED
@@ -17,10 +17,10 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  # test
19
 
20
- This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on an unknown dataset.
21
  It achieves the following results on the evaluation set:
22
- - Loss: 0.3585
23
- - Num Input Tokens Seen: 46944
24
 
25
  ## Model description
26
 
 
17
 
18
  # test
19
 
20
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on the wsc dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.3497
23
+ - Num Input Tokens Seen: 49376
24
 
25
  ## Model description
26
 
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "eval_loss": 0.3496828079223633,
4
+ "eval_runtime": 1.4986,
5
+ "eval_samples_per_second": 37.369,
6
+ "eval_steps_per_second": 9.342,
7
+ "num_input_tokens_seen": 49376,
8
+ "total_flos": 2223378963628032.0,
9
+ "train_loss": 1.100914571762085,
10
+ "train_runtime": 86.254,
11
+ "train_samples_per_second": 5.774,
12
+ "train_steps_per_second": 1.449
13
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "eval_loss": 0.3496828079223633,
4
+ "eval_runtime": 1.4986,
5
+ "eval_samples_per_second": 37.369,
6
+ "eval_steps_per_second": 9.342,
7
+ "num_input_tokens_seen": 49376
8
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "num_input_tokens_seen": 49376,
4
+ "total_flos": 2223378963628032.0,
5
+ "train_loss": 1.100914571762085,
6
+ "train_runtime": 86.254,
7
+ "train_samples_per_second": 5.774,
8
+ "train_steps_per_second": 1.449
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 98,
3
+ "best_metric": 0.3496828079223633,
4
+ "best_model_checkpoint": "saves/test/checkpoint-98",
5
+ "epoch": 1.0,
6
+ "eval_steps": 7,
7
+ "global_step": 125,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.04,
14
+ "grad_norm": 396.0,
15
+ "learning_rate": 0.009230769230769232,
16
+ "loss": 3.6273,
17
+ "num_input_tokens_seen": 2144,
18
+ "step": 5
19
+ },
20
+ {
21
+ "epoch": 0.056,
22
+ "eval_loss": 6.932018280029297,
23
+ "eval_runtime": 1.4196,
24
+ "eval_samples_per_second": 39.449,
25
+ "eval_steps_per_second": 9.862,
26
+ "num_input_tokens_seen": 2880,
27
+ "step": 7
28
+ },
29
+ {
30
+ "epoch": 0.08,
31
+ "grad_norm": 22.5,
32
+ "learning_rate": 0.02076923076923077,
33
+ "loss": 6.3813,
34
+ "num_input_tokens_seen": 4128,
35
+ "step": 10
36
+ },
37
+ {
38
+ "epoch": 0.112,
39
+ "eval_loss": 1.6227623224258423,
40
+ "eval_runtime": 1.3856,
41
+ "eval_samples_per_second": 40.417,
42
+ "eval_steps_per_second": 10.104,
43
+ "num_input_tokens_seen": 5920,
44
+ "step": 14
45
+ },
46
+ {
47
+ "epoch": 0.12,
48
+ "grad_norm": 9.9375,
49
+ "learning_rate": 0.029994099395745794,
50
+ "loss": 2.5342,
51
+ "num_input_tokens_seen": 6240,
52
+ "step": 15
53
+ },
54
+ {
55
+ "epoch": 0.16,
56
+ "grad_norm": 0.9140625,
57
+ "learning_rate": 0.029788065277773536,
58
+ "loss": 1.4507,
59
+ "num_input_tokens_seen": 8096,
60
+ "step": 20
61
+ },
62
+ {
63
+ "epoch": 0.168,
64
+ "eval_loss": 0.4040253460407257,
65
+ "eval_runtime": 1.3702,
66
+ "eval_samples_per_second": 40.869,
67
+ "eval_steps_per_second": 10.217,
68
+ "num_input_tokens_seen": 8416,
69
+ "step": 21
70
+ },
71
+ {
72
+ "epoch": 0.2,
73
+ "grad_norm": 1.140625,
74
+ "learning_rate": 0.02929162684084344,
75
+ "loss": 1.7771,
76
+ "num_input_tokens_seen": 10112,
77
+ "step": 25
78
+ },
79
+ {
80
+ "epoch": 0.224,
81
+ "eval_loss": 3.618654489517212,
82
+ "eval_runtime": 1.3779,
83
+ "eval_samples_per_second": 40.642,
84
+ "eval_steps_per_second": 10.161,
85
+ "num_input_tokens_seen": 11264,
86
+ "step": 28
87
+ },
88
+ {
89
+ "epoch": 0.24,
90
+ "grad_norm": 103.5,
91
+ "learning_rate": 0.028514533018536285,
92
+ "loss": 3.7935,
93
+ "num_input_tokens_seen": 12032,
94
+ "step": 30
95
+ },
96
+ {
97
+ "epoch": 0.28,
98
+ "grad_norm": 0.12060546875,
99
+ "learning_rate": 0.02747204418453818,
100
+ "loss": 0.7848,
101
+ "num_input_tokens_seen": 13824,
102
+ "step": 35
103
+ },
104
+ {
105
+ "epoch": 0.28,
106
+ "eval_loss": 0.36673951148986816,
107
+ "eval_runtime": 1.3899,
108
+ "eval_samples_per_second": 40.29,
109
+ "eval_steps_per_second": 10.072,
110
+ "num_input_tokens_seen": 13824,
111
+ "step": 35
112
+ },
113
+ {
114
+ "epoch": 0.32,
115
+ "grad_norm": 0.11083984375,
116
+ "learning_rate": 0.026184632473247482,
117
+ "loss": 0.4314,
118
+ "num_input_tokens_seen": 15840,
119
+ "step": 40
120
+ },
121
+ {
122
+ "epoch": 0.336,
123
+ "eval_loss": 0.3661668598651886,
124
+ "eval_runtime": 1.3878,
125
+ "eval_samples_per_second": 40.353,
126
+ "eval_steps_per_second": 10.088,
127
+ "num_input_tokens_seen": 16672,
128
+ "step": 42
129
+ },
130
+ {
131
+ "epoch": 0.36,
132
+ "grad_norm": 0.251953125,
133
+ "learning_rate": 0.024677579753131316,
134
+ "loss": 0.4096,
135
+ "num_input_tokens_seen": 17920,
136
+ "step": 45
137
+ },
138
+ {
139
+ "epoch": 0.392,
140
+ "eval_loss": 0.5265085101127625,
141
+ "eval_runtime": 1.3816,
142
+ "eval_samples_per_second": 40.532,
143
+ "eval_steps_per_second": 10.133,
144
+ "num_input_tokens_seen": 19296,
145
+ "step": 49
146
+ },
147
+ {
148
+ "epoch": 0.4,
149
+ "grad_norm": 0.07373046875,
150
+ "learning_rate": 0.022980481147730047,
151
+ "loss": 0.2423,
152
+ "num_input_tokens_seen": 19712,
153
+ "step": 50
154
+ },
155
+ {
156
+ "epoch": 0.44,
157
+ "grad_norm": 0.150390625,
158
+ "learning_rate": 0.021126663854039943,
159
+ "loss": 0.5554,
160
+ "num_input_tokens_seen": 21952,
161
+ "step": 55
162
+ },
163
+ {
164
+ "epoch": 0.448,
165
+ "eval_loss": 0.39251771569252014,
166
+ "eval_runtime": 1.3876,
167
+ "eval_samples_per_second": 40.359,
168
+ "eval_steps_per_second": 10.09,
169
+ "num_input_tokens_seen": 22432,
170
+ "step": 56
171
+ },
172
+ {
173
+ "epoch": 0.48,
174
+ "grad_norm": 0.1513671875,
175
+ "learning_rate": 0.01915253267137274,
176
+ "loss": 0.4968,
177
+ "num_input_tokens_seen": 24160,
178
+ "step": 60
179
+ },
180
+ {
181
+ "epoch": 0.504,
182
+ "eval_loss": 2.652535915374756,
183
+ "eval_runtime": 1.3806,
184
+ "eval_samples_per_second": 40.562,
185
+ "eval_steps_per_second": 10.141,
186
+ "num_input_tokens_seen": 25504,
187
+ "step": 63
188
+ },
189
+ {
190
+ "epoch": 0.52,
191
+ "grad_norm": 10.0625,
192
+ "learning_rate": 0.017096855093032493,
193
+ "loss": 0.7728,
194
+ "num_input_tokens_seen": 26112,
195
+ "step": 65
196
+ },
197
+ {
198
+ "epoch": 0.56,
199
+ "grad_norm": 0.189453125,
200
+ "learning_rate": 0.015,
201
+ "loss": 0.3298,
202
+ "num_input_tokens_seen": 28064,
203
+ "step": 70
204
+ },
205
+ {
206
+ "epoch": 0.56,
207
+ "eval_loss": 0.37763792276382446,
208
+ "eval_runtime": 1.3741,
209
+ "eval_samples_per_second": 40.753,
210
+ "eval_steps_per_second": 10.188,
211
+ "num_input_tokens_seen": 28064,
212
+ "step": 70
213
+ },
214
+ {
215
+ "epoch": 0.6,
216
+ "grad_norm": 0.12060546875,
217
+ "learning_rate": 0.012903144906967513,
218
+ "loss": 0.3663,
219
+ "num_input_tokens_seen": 29824,
220
+ "step": 75
221
+ },
222
+ {
223
+ "epoch": 0.616,
224
+ "eval_loss": 0.36274200677871704,
225
+ "eval_runtime": 1.4269,
226
+ "eval_samples_per_second": 39.245,
227
+ "eval_steps_per_second": 9.811,
228
+ "num_input_tokens_seen": 30720,
229
+ "step": 77
230
+ },
231
+ {
232
+ "epoch": 0.64,
233
+ "grad_norm": 0.142578125,
234
+ "learning_rate": 0.01084746732862726,
235
+ "loss": 0.3654,
236
+ "num_input_tokens_seen": 31904,
237
+ "step": 80
238
+ },
239
+ {
240
+ "epoch": 0.672,
241
+ "eval_loss": 0.35259029269218445,
242
+ "eval_runtime": 1.4343,
243
+ "eval_samples_per_second": 39.044,
244
+ "eval_steps_per_second": 9.761,
245
+ "num_input_tokens_seen": 33504,
246
+ "step": 84
247
+ },
248
+ {
249
+ "epoch": 0.68,
250
+ "grad_norm": 0.08349609375,
251
+ "learning_rate": 0.008873336145960059,
252
+ "loss": 0.3696,
253
+ "num_input_tokens_seen": 33984,
254
+ "step": 85
255
+ },
256
+ {
257
+ "epoch": 0.72,
258
+ "grad_norm": 0.039306640625,
259
+ "learning_rate": 0.007019518852269953,
260
+ "loss": 0.3495,
261
+ "num_input_tokens_seen": 35776,
262
+ "step": 90
263
+ },
264
+ {
265
+ "epoch": 0.728,
266
+ "eval_loss": 0.3546493351459503,
267
+ "eval_runtime": 1.4075,
268
+ "eval_samples_per_second": 39.786,
269
+ "eval_steps_per_second": 9.946,
270
+ "num_input_tokens_seen": 36128,
271
+ "step": 91
272
+ },
273
+ {
274
+ "epoch": 0.76,
275
+ "grad_norm": 0.017578125,
276
+ "learning_rate": 0.005322420246868689,
277
+ "loss": 0.412,
278
+ "num_input_tokens_seen": 37472,
279
+ "step": 95
280
+ },
281
+ {
282
+ "epoch": 0.784,
283
+ "eval_loss": 0.3496828079223633,
284
+ "eval_runtime": 1.4395,
285
+ "eval_samples_per_second": 38.902,
286
+ "eval_steps_per_second": 9.726,
287
+ "num_input_tokens_seen": 38592,
288
+ "step": 98
289
+ },
290
+ {
291
+ "epoch": 0.8,
292
+ "grad_norm": 0.02734375,
293
+ "learning_rate": 0.003815367526752516,
294
+ "loss": 0.3676,
295
+ "num_input_tokens_seen": 39328,
296
+ "step": 100
297
+ },
298
+ {
299
+ "epoch": 0.84,
300
+ "grad_norm": 0.047607421875,
301
+ "learning_rate": 0.0025279558154618197,
302
+ "loss": 0.349,
303
+ "num_input_tokens_seen": 41280,
304
+ "step": 105
305
+ },
306
+ {
307
+ "epoch": 0.84,
308
+ "eval_loss": 0.35384148359298706,
309
+ "eval_runtime": 1.4061,
310
+ "eval_samples_per_second": 39.827,
311
+ "eval_steps_per_second": 9.957,
312
+ "num_input_tokens_seen": 41280,
313
+ "step": 105
314
+ },
315
+ {
316
+ "epoch": 0.88,
317
+ "grad_norm": 0.037841796875,
318
+ "learning_rate": 0.0014854669814637145,
319
+ "loss": 0.3482,
320
+ "num_input_tokens_seen": 43552,
321
+ "step": 110
322
+ },
323
+ {
324
+ "epoch": 0.896,
325
+ "eval_loss": 0.3565780818462372,
326
+ "eval_runtime": 1.4483,
327
+ "eval_samples_per_second": 38.666,
328
+ "eval_steps_per_second": 9.666,
329
+ "num_input_tokens_seen": 44160,
330
+ "step": 112
331
+ },
332
+ {
333
+ "epoch": 0.92,
334
+ "grad_norm": 0.040283203125,
335
+ "learning_rate": 0.000708373159156555,
336
+ "loss": 0.3258,
337
+ "num_input_tokens_seen": 45216,
338
+ "step": 115
339
+ },
340
+ {
341
+ "epoch": 0.952,
342
+ "eval_loss": 0.35846200585365295,
343
+ "eval_runtime": 1.4597,
344
+ "eval_samples_per_second": 38.364,
345
+ "eval_steps_per_second": 9.591,
346
+ "num_input_tokens_seen": 46944,
347
+ "step": 119
348
+ },
349
+ {
350
+ "epoch": 0.96,
351
+ "grad_norm": 0.0189208984375,
352
+ "learning_rate": 0.0002119347222264617,
353
+ "loss": 0.3217,
354
+ "num_input_tokens_seen": 47360,
355
+ "step": 120
356
+ },
357
+ {
358
+ "epoch": 1.0,
359
+ "grad_norm": 0.0859375,
360
+ "learning_rate": 5.90060425420702e-06,
361
+ "loss": 0.3608,
362
+ "num_input_tokens_seen": 49376,
363
+ "step": 125
364
+ },
365
+ {
366
+ "epoch": 1.0,
367
+ "num_input_tokens_seen": 49376,
368
+ "step": 125,
369
+ "total_flos": 2223378963628032.0,
370
+ "train_loss": 1.100914571762085,
371
+ "train_runtime": 86.254,
372
+ "train_samples_per_second": 5.774,
373
+ "train_steps_per_second": 1.449
374
+ }
375
+ ],
376
+ "logging_steps": 5,
377
+ "max_steps": 125,
378
+ "num_input_tokens_seen": 49376,
379
+ "num_train_epochs": 1,
380
+ "save_steps": 7,
381
+ "stateful_callbacks": {
382
+ "TrainerControl": {
383
+ "args": {
384
+ "should_epoch_stop": false,
385
+ "should_evaluate": false,
386
+ "should_log": false,
387
+ "should_save": true,
388
+ "should_training_stop": true
389
+ },
390
+ "attributes": {}
391
+ }
392
+ },
393
+ "total_flos": 2223378963628032.0,
394
+ "train_batch_size": 4,
395
+ "trial_name": null,
396
+ "trial_params": null
397
+ }
training_eval_loss.png ADDED
training_loss.png ADDED