rbelanec committed on
Commit
4a6ef8b
·
verified ·
1 Parent(s): c45f876

End of training

Browse files
README.md CHANGED
@@ -17,7 +17,7 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  # train_cb_42_1760637133
19
 
20
- This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on an unknown dataset.
21
  It achieves the following results on the evaluation set:
22
  - Loss: 0.2378
23
  - Num Input Tokens Seen: 158656
 
17
 
18
  # train_cb_42_1760637133
19
 
20
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on the cb dataset.
21
  It achieves the following results on the evaluation set:
22
  - Loss: 0.2378
23
  - Num Input Tokens Seen: 158656
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "eval_loss": 0.23779883980751038,
4
+ "eval_runtime": 1.3015,
5
+ "eval_samples_per_second": 38.417,
6
+ "eval_steps_per_second": 9.988,
7
+ "num_input_tokens_seen": 158656,
8
+ "total_flos": 7144207972564992.0,
9
+ "train_loss": 0.6316961252689361,
10
+ "train_runtime": 79.306,
11
+ "train_samples_per_second": 12.609,
12
+ "train_steps_per_second": 3.152
13
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "eval_loss": 0.23779883980751038,
4
+ "eval_runtime": 1.3015,
5
+ "eval_samples_per_second": 38.417,
6
+ "eval_steps_per_second": 9.988,
7
+ "num_input_tokens_seen": 158656
8
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "num_input_tokens_seen": 158656,
4
+ "total_flos": 7144207972564992.0,
5
+ "train_loss": 0.6316961252689361,
6
+ "train_runtime": 79.306,
7
+ "train_samples_per_second": 12.609,
8
+ "train_steps_per_second": 3.152
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 175,
3
+ "best_metric": 0.22990073263645172,
4
+ "best_model_checkpoint": "saves_multiple/prefix-tuning/llama-3-8b-instruct/train_cb_42_1760637133/checkpoint-175",
5
+ "epoch": 5.0,
6
+ "eval_steps": 25,
7
+ "global_step": 250,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.1,
14
+ "grad_norm": 147.09878540039062,
15
+ "learning_rate": 1.6000000000000001e-06,
16
+ "loss": 7.3911,
17
+ "num_input_tokens_seen": 2720,
18
+ "step": 5
19
+ },
20
+ {
21
+ "epoch": 0.2,
22
+ "grad_norm": 201.96444702148438,
23
+ "learning_rate": 3.6000000000000003e-06,
24
+ "loss": 6.0584,
25
+ "num_input_tokens_seen": 5536,
26
+ "step": 10
27
+ },
28
+ {
29
+ "epoch": 0.3,
30
+ "grad_norm": 169.54861450195312,
31
+ "learning_rate": 5.600000000000001e-06,
32
+ "loss": 4.1152,
33
+ "num_input_tokens_seen": 8480,
34
+ "step": 15
35
+ },
36
+ {
37
+ "epoch": 0.4,
38
+ "grad_norm": 46.49274826049805,
39
+ "learning_rate": 7.600000000000001e-06,
40
+ "loss": 2.3628,
41
+ "num_input_tokens_seen": 11776,
42
+ "step": 20
43
+ },
44
+ {
45
+ "epoch": 0.5,
46
+ "grad_norm": 27.129613876342773,
47
+ "learning_rate": 9.600000000000001e-06,
48
+ "loss": 1.1162,
49
+ "num_input_tokens_seen": 14720,
50
+ "step": 25
51
+ },
52
+ {
53
+ "epoch": 0.5,
54
+ "eval_loss": 0.7037240266799927,
55
+ "eval_runtime": 1.2436,
56
+ "eval_samples_per_second": 40.204,
57
+ "eval_steps_per_second": 10.453,
58
+ "num_input_tokens_seen": 14720,
59
+ "step": 25
60
+ },
61
+ {
62
+ "epoch": 0.6,
63
+ "grad_norm": 48.88370895385742,
64
+ "learning_rate": 9.992203820909906e-06,
65
+ "loss": 0.5016,
66
+ "num_input_tokens_seen": 18560,
67
+ "step": 30
68
+ },
69
+ {
70
+ "epoch": 0.7,
71
+ "grad_norm": 26.719514846801758,
72
+ "learning_rate": 9.960573506572391e-06,
73
+ "loss": 0.2915,
74
+ "num_input_tokens_seen": 21824,
75
+ "step": 35
76
+ },
77
+ {
78
+ "epoch": 0.8,
79
+ "grad_norm": 9.22452449798584,
80
+ "learning_rate": 9.904775776745959e-06,
81
+ "loss": 0.2798,
82
+ "num_input_tokens_seen": 24288,
83
+ "step": 40
84
+ },
85
+ {
86
+ "epoch": 0.9,
87
+ "grad_norm": 22.899555206298828,
88
+ "learning_rate": 9.825082472361558e-06,
89
+ "loss": 0.3252,
90
+ "num_input_tokens_seen": 27648,
91
+ "step": 45
92
+ },
93
+ {
94
+ "epoch": 1.0,
95
+ "grad_norm": 11.016155242919922,
96
+ "learning_rate": 9.721881851187406e-06,
97
+ "loss": 0.2568,
98
+ "num_input_tokens_seen": 31456,
99
+ "step": 50
100
+ },
101
+ {
102
+ "epoch": 1.0,
103
+ "eval_loss": 0.28914472460746765,
104
+ "eval_runtime": 1.2319,
105
+ "eval_samples_per_second": 40.588,
106
+ "eval_steps_per_second": 10.553,
107
+ "num_input_tokens_seen": 31456,
108
+ "step": 50
109
+ },
110
+ {
111
+ "epoch": 1.1,
112
+ "grad_norm": 9.274727821350098,
113
+ "learning_rate": 9.595676696276173e-06,
114
+ "loss": 0.24,
115
+ "num_input_tokens_seen": 34976,
116
+ "step": 55
117
+ },
118
+ {
119
+ "epoch": 1.2,
120
+ "grad_norm": 30.56814956665039,
121
+ "learning_rate": 9.44708186645649e-06,
122
+ "loss": 0.2301,
123
+ "num_input_tokens_seen": 37856,
124
+ "step": 60
125
+ },
126
+ {
127
+ "epoch": 1.3,
128
+ "grad_norm": 14.439095497131348,
129
+ "learning_rate": 9.276821300802535e-06,
130
+ "loss": 0.2214,
131
+ "num_input_tokens_seen": 41728,
132
+ "step": 65
133
+ },
134
+ {
135
+ "epoch": 1.4,
136
+ "grad_norm": 105.18769836425781,
137
+ "learning_rate": 9.085724491675642e-06,
138
+ "loss": 0.3427,
139
+ "num_input_tokens_seen": 44704,
140
+ "step": 70
141
+ },
142
+ {
143
+ "epoch": 1.5,
144
+ "grad_norm": 69.08840942382812,
145
+ "learning_rate": 8.874722443520898e-06,
146
+ "loss": 0.254,
147
+ "num_input_tokens_seen": 47168,
148
+ "step": 75
149
+ },
150
+ {
151
+ "epoch": 1.5,
152
+ "eval_loss": 0.2438250482082367,
153
+ "eval_runtime": 1.2262,
154
+ "eval_samples_per_second": 40.778,
155
+ "eval_steps_per_second": 10.602,
156
+ "num_input_tokens_seen": 47168,
157
+ "step": 75
158
+ },
159
+ {
160
+ "epoch": 1.6,
161
+ "grad_norm": 37.581912994384766,
162
+ "learning_rate": 8.644843137107058e-06,
163
+ "loss": 0.2904,
164
+ "num_input_tokens_seen": 51072,
165
+ "step": 80
166
+ },
167
+ {
168
+ "epoch": 1.7,
169
+ "grad_norm": 152.4672088623047,
170
+ "learning_rate": 8.397206521307584e-06,
171
+ "loss": 0.4201,
172
+ "num_input_tokens_seen": 54400,
173
+ "step": 85
174
+ },
175
+ {
176
+ "epoch": 1.8,
177
+ "grad_norm": 128.72598266601562,
178
+ "learning_rate": 8.133019056822303e-06,
179
+ "loss": 0.2121,
180
+ "num_input_tokens_seen": 56736,
181
+ "step": 90
182
+ },
183
+ {
184
+ "epoch": 1.9,
185
+ "grad_norm": 29.933185577392578,
186
+ "learning_rate": 7.85356783842216e-06,
187
+ "loss": 0.2648,
188
+ "num_input_tokens_seen": 60064,
189
+ "step": 95
190
+ },
191
+ {
192
+ "epoch": 2.0,
193
+ "grad_norm": 10.928328514099121,
194
+ "learning_rate": 7.560214324352858e-06,
195
+ "loss": 0.1647,
196
+ "num_input_tokens_seen": 63168,
197
+ "step": 100
198
+ },
199
+ {
200
+ "epoch": 2.0,
201
+ "eval_loss": 0.36001044511795044,
202
+ "eval_runtime": 1.235,
203
+ "eval_samples_per_second": 40.485,
204
+ "eval_steps_per_second": 10.526,
205
+ "num_input_tokens_seen": 63168,
206
+ "step": 100
207
+ },
208
+ {
209
+ "epoch": 2.1,
210
+ "grad_norm": 70.27168273925781,
211
+ "learning_rate": 7.254387703447154e-06,
212
+ "loss": 0.427,
213
+ "num_input_tokens_seen": 66464,
214
+ "step": 105
215
+ },
216
+ {
217
+ "epoch": 2.2,
218
+ "grad_norm": 61.89235305786133,
219
+ "learning_rate": 6.9375779322605154e-06,
220
+ "loss": 0.277,
221
+ "num_input_tokens_seen": 70112,
222
+ "step": 110
223
+ },
224
+ {
225
+ "epoch": 2.3,
226
+ "grad_norm": 8.1712646484375,
227
+ "learning_rate": 6.611328476152557e-06,
228
+ "loss": 0.1532,
229
+ "num_input_tokens_seen": 73376,
230
+ "step": 115
231
+ },
232
+ {
233
+ "epoch": 2.4,
234
+ "grad_norm": 31.11164665222168,
235
+ "learning_rate": 6.277228789678953e-06,
236
+ "loss": 0.174,
237
+ "num_input_tokens_seen": 76288,
238
+ "step": 120
239
+ },
240
+ {
241
+ "epoch": 2.5,
242
+ "grad_norm": 10.602115631103516,
243
+ "learning_rate": 5.936906572928625e-06,
244
+ "loss": 0.3223,
245
+ "num_input_tokens_seen": 79424,
246
+ "step": 125
247
+ },
248
+ {
249
+ "epoch": 2.5,
250
+ "eval_loss": 0.2356501966714859,
251
+ "eval_runtime": 1.2576,
252
+ "eval_samples_per_second": 39.757,
253
+ "eval_steps_per_second": 10.337,
254
+ "num_input_tokens_seen": 79424,
255
+ "step": 125
256
+ },
257
+ {
258
+ "epoch": 2.6,
259
+ "grad_norm": 8.446126937866211,
260
+ "learning_rate": 5.592019841532507e-06,
261
+ "loss": 0.1352,
262
+ "num_input_tokens_seen": 82880,
263
+ "step": 130
264
+ },
265
+ {
266
+ "epoch": 2.7,
267
+ "grad_norm": 55.46540832519531,
268
+ "learning_rate": 5.244248848978067e-06,
269
+ "loss": 0.2971,
270
+ "num_input_tokens_seen": 86560,
271
+ "step": 135
272
+ },
273
+ {
274
+ "epoch": 2.8,
275
+ "grad_norm": 12.031166076660156,
276
+ "learning_rate": 4.895287900583216e-06,
277
+ "loss": 0.1537,
278
+ "num_input_tokens_seen": 89184,
279
+ "step": 140
280
+ },
281
+ {
282
+ "epoch": 2.9,
283
+ "grad_norm": 9.252001762390137,
284
+ "learning_rate": 4.546837099011101e-06,
285
+ "loss": 0.3131,
286
+ "num_input_tokens_seen": 92256,
287
+ "step": 145
288
+ },
289
+ {
290
+ "epoch": 3.0,
291
+ "grad_norm": 35.95872497558594,
292
+ "learning_rate": 4.200594061540827e-06,
293
+ "loss": 0.2999,
294
+ "num_input_tokens_seen": 95168,
295
+ "step": 150
296
+ },
297
+ {
298
+ "epoch": 3.0,
299
+ "eval_loss": 0.2933087944984436,
300
+ "eval_runtime": 1.2189,
301
+ "eval_samples_per_second": 41.021,
302
+ "eval_steps_per_second": 10.666,
303
+ "num_input_tokens_seen": 95168,
304
+ "step": 150
305
+ },
306
+ {
307
+ "epoch": 3.1,
308
+ "grad_norm": 63.808555603027344,
309
+ "learning_rate": 3.8582456494467214e-06,
310
+ "loss": 0.2206,
311
+ "num_input_tokens_seen": 98528,
312
+ "step": 155
313
+ },
314
+ {
315
+ "epoch": 3.2,
316
+ "grad_norm": 38.348934173583984,
317
+ "learning_rate": 3.521459749779769e-06,
318
+ "loss": 0.2139,
319
+ "num_input_tokens_seen": 101408,
320
+ "step": 160
321
+ },
322
+ {
323
+ "epoch": 3.3,
324
+ "grad_norm": 116.32037353515625,
325
+ "learning_rate": 3.1918771495895395e-06,
326
+ "loss": 0.2016,
327
+ "num_input_tokens_seen": 104736,
328
+ "step": 165
329
+ },
330
+ {
331
+ "epoch": 3.4,
332
+ "grad_norm": 56.04789352416992,
333
+ "learning_rate": 2.871103542174637e-06,
334
+ "loss": 0.2427,
335
+ "num_input_tokens_seen": 107840,
336
+ "step": 170
337
+ },
338
+ {
339
+ "epoch": 3.5,
340
+ "grad_norm": 35.397884368896484,
341
+ "learning_rate": 2.560701704306336e-06,
342
+ "loss": 0.244,
343
+ "num_input_tokens_seen": 111392,
344
+ "step": 175
345
+ },
346
+ {
347
+ "epoch": 3.5,
348
+ "eval_loss": 0.22990073263645172,
349
+ "eval_runtime": 1.2427,
350
+ "eval_samples_per_second": 40.235,
351
+ "eval_steps_per_second": 10.461,
352
+ "num_input_tokens_seen": 111392,
353
+ "step": 175
354
+ },
355
+ {
356
+ "epoch": 3.6,
357
+ "grad_norm": 5.372102737426758,
358
+ "learning_rate": 2.2621838825372496e-06,
359
+ "loss": 0.1392,
360
+ "num_input_tokens_seen": 114720,
361
+ "step": 180
362
+ },
363
+ {
364
+ "epoch": 3.7,
365
+ "grad_norm": 44.36689376831055,
366
+ "learning_rate": 1.977004425688126e-06,
367
+ "loss": 0.1194,
368
+ "num_input_tokens_seen": 117632,
369
+ "step": 185
370
+ },
371
+ {
372
+ "epoch": 3.8,
373
+ "grad_norm": 32.87962341308594,
374
+ "learning_rate": 1.7065526994065973e-06,
375
+ "loss": 0.1453,
376
+ "num_input_tokens_seen": 120512,
377
+ "step": 190
378
+ },
379
+ {
380
+ "epoch": 3.9,
381
+ "grad_norm": 15.784488677978516,
382
+ "learning_rate": 1.4521463173173966e-06,
383
+ "loss": 0.2925,
384
+ "num_input_tokens_seen": 124352,
385
+ "step": 195
386
+ },
387
+ {
388
+ "epoch": 4.0,
389
+ "grad_norm": 8.471246719360352,
390
+ "learning_rate": 1.2150247217412186e-06,
391
+ "loss": 0.2501,
392
+ "num_input_tokens_seen": 127136,
393
+ "step": 200
394
+ },
395
+ {
396
+ "epoch": 4.0,
397
+ "eval_loss": 0.23893244564533234,
398
+ "eval_runtime": 1.2441,
399
+ "eval_samples_per_second": 40.189,
400
+ "eval_steps_per_second": 10.449,
401
+ "num_input_tokens_seen": 127136,
402
+ "step": 200
403
+ },
404
+ {
405
+ "epoch": 4.1,
406
+ "grad_norm": 31.993316650390625,
407
+ "learning_rate": 9.963431452563331e-07,
408
+ "loss": 0.1709,
409
+ "num_input_tokens_seen": 129664,
410
+ "step": 205
411
+ },
412
+ {
413
+ "epoch": 4.2,
414
+ "grad_norm": 27.969955444335938,
415
+ "learning_rate": 7.971669825215789e-07,
416
+ "loss": 0.2556,
417
+ "num_input_tokens_seen": 132128,
418
+ "step": 210
419
+ },
420
+ {
421
+ "epoch": 4.3,
422
+ "grad_norm": 22.807920455932617,
423
+ "learning_rate": 6.184665997806832e-07,
424
+ "loss": 0.143,
425
+ "num_input_tokens_seen": 135200,
426
+ "step": 215
427
+ },
428
+ {
429
+ "epoch": 4.4,
430
+ "grad_norm": 14.716069221496582,
431
+ "learning_rate": 4.6111260733545714e-07,
432
+ "loss": 0.2176,
433
+ "num_input_tokens_seen": 138368,
434
+ "step": 220
435
+ },
436
+ {
437
+ "epoch": 4.5,
438
+ "grad_norm": 15.53546142578125,
439
+ "learning_rate": 3.258716180199278e-07,
440
+ "loss": 0.1977,
441
+ "num_input_tokens_seen": 141632,
442
+ "step": 225
443
+ },
444
+ {
445
+ "epoch": 4.5,
446
+ "eval_loss": 0.23097044229507446,
447
+ "eval_runtime": 1.2534,
448
+ "eval_samples_per_second": 39.89,
449
+ "eval_steps_per_second": 10.371,
450
+ "num_input_tokens_seen": 141632,
451
+ "step": 225
452
+ },
453
+ {
454
+ "epoch": 4.6,
455
+ "grad_norm": 18.080829620361328,
456
+ "learning_rate": 2.134025123396638e-07,
457
+ "loss": 0.1109,
458
+ "num_input_tokens_seen": 144960,
459
+ "step": 230
460
+ },
461
+ {
462
+ "epoch": 4.7,
463
+ "grad_norm": 26.747798919677734,
464
+ "learning_rate": 1.2425322847218368e-07,
465
+ "loss": 0.1466,
466
+ "num_input_tokens_seen": 147936,
467
+ "step": 235
468
+ },
469
+ {
470
+ "epoch": 4.8,
471
+ "grad_norm": 7.783914089202881,
472
+ "learning_rate": 5.8858092767236084e-08,
473
+ "loss": 0.1319,
474
+ "num_input_tokens_seen": 151392,
475
+ "step": 240
476
+ },
477
+ {
478
+ "epoch": 4.9,
479
+ "grad_norm": 16.87287712097168,
480
+ "learning_rate": 1.753570375247815e-08,
481
+ "loss": 0.1445,
482
+ "num_input_tokens_seen": 154624,
483
+ "step": 245
484
+ },
485
+ {
486
+ "epoch": 5.0,
487
+ "grad_norm": 15.167189598083496,
488
+ "learning_rate": 4.87379953478806e-10,
489
+ "loss": 0.1052,
490
+ "num_input_tokens_seen": 158656,
491
+ "step": 250
492
+ },
493
+ {
494
+ "epoch": 5.0,
495
+ "eval_loss": 0.23779883980751038,
496
+ "eval_runtime": 1.2348,
497
+ "eval_samples_per_second": 40.491,
498
+ "eval_steps_per_second": 10.528,
499
+ "num_input_tokens_seen": 158656,
500
+ "step": 250
501
+ },
502
+ {
503
+ "epoch": 5.0,
504
+ "num_input_tokens_seen": 158656,
505
+ "step": 250,
506
+ "total_flos": 7144207972564992.0,
507
+ "train_loss": 0.6316961252689361,
508
+ "train_runtime": 79.306,
509
+ "train_samples_per_second": 12.609,
510
+ "train_steps_per_second": 3.152
511
+ }
512
+ ],
513
+ "logging_steps": 5,
514
+ "max_steps": 250,
515
+ "num_input_tokens_seen": 158656,
516
+ "num_train_epochs": 5,
517
+ "save_steps": 25,
518
+ "stateful_callbacks": {
519
+ "TrainerControl": {
520
+ "args": {
521
+ "should_epoch_stop": false,
522
+ "should_evaluate": false,
523
+ "should_log": false,
524
+ "should_save": true,
525
+ "should_training_stop": true
526
+ },
527
+ "attributes": {}
528
+ }
529
+ },
530
+ "total_flos": 7144207972564992.0,
531
+ "train_batch_size": 4,
532
+ "trial_name": null,
533
+ "trial_params": null
534
+ }
training_eval_loss.png ADDED
training_loss.png ADDED