rbelanec committed on
Commit
e677762
·
verified ·
1 Parent(s): 641e72e

End of training

Browse files
README.md CHANGED
@@ -17,10 +17,10 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  # test
19
 
20
- This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on an unknown dataset.
21
  It achieves the following results on the evaluation set:
22
- - Loss: 0.3492
23
- - Num Input Tokens Seen: 46944
24
 
25
  ## Model description
26
 
 
17
 
18
  # test
19
 
20
+ This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the wsc dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.3459
23
+ - Num Input Tokens Seen: 49376
24
 
25
  ## Model description
26
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 0.34708982706069946,
4
- "eval_runtime": 0.6975,
5
- "eval_samples_per_second": 80.282,
6
- "eval_steps_per_second": 20.071,
7
  "num_input_tokens_seen": 49376,
8
- "total_flos": 497055112495104.0,
9
- "train_loss": 0.5283625726699829,
10
- "train_runtime": 476.3079,
11
- "train_samples_per_second": 1.046,
12
- "train_steps_per_second": 0.262
13
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 0.34589245915412903,
4
+ "eval_runtime": 0.8481,
5
+ "eval_samples_per_second": 66.033,
6
+ "eval_steps_per_second": 16.508,
7
  "num_input_tokens_seen": 49376,
8
+ "total_flos": 497127920369664.0,
9
+ "train_loss": 1.1438678817749024,
10
+ "train_runtime": 264.1495,
11
+ "train_samples_per_second": 1.885,
12
+ "train_steps_per_second": 0.473
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 0.34708982706069946,
4
- "eval_runtime": 0.6975,
5
- "eval_samples_per_second": 80.282,
6
- "eval_steps_per_second": 20.071,
7
  "num_input_tokens_seen": 49376
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 0.34589245915412903,
4
+ "eval_runtime": 0.8481,
5
+ "eval_samples_per_second": 66.033,
6
+ "eval_steps_per_second": 16.508,
7
  "num_input_tokens_seen": 49376
8
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "num_input_tokens_seen": 49376,
4
- "total_flos": 497055112495104.0,
5
- "train_loss": 0.5283625726699829,
6
- "train_runtime": 476.3079,
7
- "train_samples_per_second": 1.046,
8
- "train_steps_per_second": 0.262
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "num_input_tokens_seen": 49376,
4
+ "total_flos": 497127920369664.0,
5
+ "train_loss": 1.1438678817749024,
6
+ "train_runtime": 264.1495,
7
+ "train_samples_per_second": 1.885,
8
+ "train_steps_per_second": 0.473
9
  }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_global_step": 84,
3
- "best_metric": 0.34708982706069946,
4
- "best_model_checkpoint": "saves/test/checkpoint-84",
5
  "epoch": 1.0,
6
  "eval_steps": 7,
7
  "global_step": 125,
@@ -11,354 +11,354 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.04,
14
- "grad_norm": 301.9017333984375,
15
  "learning_rate": 1.5384615384615387e-05,
16
- "loss": 1.0409,
17
  "num_input_tokens_seen": 2144,
18
  "step": 5
19
  },
20
  {
21
  "epoch": 0.056,
22
- "eval_loss": 0.3513108789920807,
23
- "eval_runtime": 0.6546,
24
- "eval_samples_per_second": 85.55,
25
- "eval_steps_per_second": 21.387,
26
  "num_input_tokens_seen": 2880,
27
  "step": 7
28
  },
29
  {
30
  "epoch": 0.08,
31
- "grad_norm": 37.0427360534668,
32
  "learning_rate": 3.461538461538462e-05,
33
- "loss": 0.4086,
34
  "num_input_tokens_seen": 4128,
35
  "step": 10
36
  },
37
  {
38
  "epoch": 0.112,
39
- "eval_loss": 1.1120651960372925,
40
- "eval_runtime": 0.6513,
41
- "eval_samples_per_second": 85.976,
42
- "eval_steps_per_second": 21.494,
43
  "num_input_tokens_seen": 5920,
44
  "step": 14
45
  },
46
  {
47
  "epoch": 0.12,
48
- "grad_norm": 14.121573448181152,
49
  "learning_rate": 4.999016565957633e-05,
50
- "loss": 0.8807,
51
  "num_input_tokens_seen": 6240,
52
  "step": 15
53
  },
54
  {
55
  "epoch": 0.16,
56
- "grad_norm": 6.5915303230285645,
57
  "learning_rate": 4.96467754629559e-05,
58
- "loss": 0.9267,
59
  "num_input_tokens_seen": 8096,
60
  "step": 20
61
  },
62
  {
63
  "epoch": 0.168,
64
- "eval_loss": 0.35109928250312805,
65
- "eval_runtime": 0.7163,
66
- "eval_samples_per_second": 78.179,
67
- "eval_steps_per_second": 19.545,
68
  "num_input_tokens_seen": 8416,
69
  "step": 21
70
  },
71
  {
72
  "epoch": 0.2,
73
- "grad_norm": 19.82288360595703,
74
  "learning_rate": 4.881937806807241e-05,
75
- "loss": 0.7142,
76
  "num_input_tokens_seen": 10112,
77
  "step": 25
78
  },
79
  {
80
  "epoch": 0.224,
81
- "eval_loss": 0.38589543104171753,
82
- "eval_runtime": 0.7026,
83
- "eval_samples_per_second": 79.699,
84
- "eval_steps_per_second": 19.925,
85
  "num_input_tokens_seen": 11264,
86
  "step": 28
87
  },
88
  {
89
  "epoch": 0.24,
90
- "grad_norm": 14.33414363861084,
91
  "learning_rate": 4.752422169756048e-05,
92
- "loss": 0.385,
93
  "num_input_tokens_seen": 12032,
94
  "step": 30
95
  },
96
  {
97
  "epoch": 0.28,
98
- "grad_norm": 21.1870174407959,
99
  "learning_rate": 4.5786740307563636e-05,
100
- "loss": 0.5983,
101
  "num_input_tokens_seen": 13824,
102
  "step": 35
103
  },
104
  {
105
  "epoch": 0.28,
106
- "eval_loss": 0.658510684967041,
107
- "eval_runtime": 0.6682,
108
- "eval_samples_per_second": 83.804,
109
- "eval_steps_per_second": 20.951,
110
  "num_input_tokens_seen": 13824,
111
  "step": 35
112
  },
113
  {
114
  "epoch": 0.32,
115
- "grad_norm": 6.050720691680908,
116
  "learning_rate": 4.364105412207914e-05,
117
- "loss": 0.394,
118
  "num_input_tokens_seen": 15840,
119
  "step": 40
120
  },
121
  {
122
  "epoch": 0.336,
123
- "eval_loss": 0.41257768869400024,
124
- "eval_runtime": 0.6969,
125
- "eval_samples_per_second": 80.356,
126
- "eval_steps_per_second": 20.089,
127
  "num_input_tokens_seen": 16672,
128
  "step": 42
129
  },
130
  {
131
  "epoch": 0.36,
132
- "grad_norm": 29.777023315429688,
133
  "learning_rate": 4.1129299588552193e-05,
134
- "loss": 0.4533,
135
  "num_input_tokens_seen": 17920,
136
  "step": 45
137
  },
138
  {
139
  "epoch": 0.392,
140
- "eval_loss": 1.1762187480926514,
141
- "eval_runtime": 0.6684,
142
- "eval_samples_per_second": 83.778,
143
- "eval_steps_per_second": 20.945,
144
  "num_input_tokens_seen": 19296,
145
  "step": 49
146
  },
147
  {
148
  "epoch": 0.4,
149
- "grad_norm": 0.5394620895385742,
150
  "learning_rate": 3.830080191288342e-05,
151
- "loss": 0.1866,
152
  "num_input_tokens_seen": 19712,
153
  "step": 50
154
  },
155
  {
156
  "epoch": 0.44,
157
- "grad_norm": 19.856895446777344,
158
  "learning_rate": 3.521110642339991e-05,
159
- "loss": 1.3512,
160
  "num_input_tokens_seen": 21952,
161
  "step": 55
162
  },
163
  {
164
  "epoch": 0.448,
165
- "eval_loss": 0.806473433971405,
166
- "eval_runtime": 0.6813,
167
- "eval_samples_per_second": 82.196,
168
- "eval_steps_per_second": 20.549,
169
  "num_input_tokens_seen": 22432,
170
  "step": 56
171
  },
172
  {
173
  "epoch": 0.48,
174
- "grad_norm": 3.317122220993042,
175
  "learning_rate": 3.1920887785621235e-05,
176
- "loss": 0.7948,
177
  "num_input_tokens_seen": 24160,
178
  "step": 60
179
  },
180
  {
181
  "epoch": 0.504,
182
- "eval_loss": 1.0268325805664062,
183
- "eval_runtime": 0.679,
184
- "eval_samples_per_second": 82.475,
185
- "eval_steps_per_second": 20.619,
186
  "num_input_tokens_seen": 25504,
187
  "step": 63
188
  },
189
  {
190
  "epoch": 0.52,
191
- "grad_norm": 29.714481353759766,
192
  "learning_rate": 2.849475848838749e-05,
193
- "loss": 0.6941,
194
  "num_input_tokens_seen": 26112,
195
  "step": 65
196
  },
197
  {
198
  "epoch": 0.56,
199
- "grad_norm": 16.07105827331543,
200
  "learning_rate": 2.5e-05,
201
- "loss": 0.3463,
202
  "num_input_tokens_seen": 28064,
203
  "step": 70
204
  },
205
  {
206
  "epoch": 0.56,
207
- "eval_loss": 0.35280704498291016,
208
- "eval_runtime": 0.6586,
209
- "eval_samples_per_second": 85.032,
210
- "eval_steps_per_second": 21.258,
211
  "num_input_tokens_seen": 28064,
212
  "step": 70
213
  },
214
  {
215
  "epoch": 0.6,
216
- "grad_norm": 10.748852729797363,
217
  "learning_rate": 2.1505241511612522e-05,
218
- "loss": 0.3652,
219
  "num_input_tokens_seen": 29824,
220
  "step": 75
221
  },
222
  {
223
  "epoch": 0.616,
224
- "eval_loss": 0.3505268096923828,
225
- "eval_runtime": 0.6898,
226
- "eval_samples_per_second": 81.188,
227
- "eval_steps_per_second": 20.297,
228
  "num_input_tokens_seen": 30720,
229
  "step": 77
230
  },
231
  {
232
  "epoch": 0.64,
233
- "grad_norm": 8.937973976135254,
234
  "learning_rate": 1.8079112214378768e-05,
235
- "loss": 0.3476,
236
  "num_input_tokens_seen": 31904,
237
  "step": 80
238
  },
239
  {
240
  "epoch": 0.672,
241
- "eval_loss": 0.34708982706069946,
242
- "eval_runtime": 0.7099,
243
- "eval_samples_per_second": 78.887,
244
- "eval_steps_per_second": 19.722,
245
  "num_input_tokens_seen": 33504,
246
  "step": 84
247
  },
248
  {
249
  "epoch": 0.68,
250
- "grad_norm": 9.469837188720703,
251
  "learning_rate": 1.4788893576600099e-05,
252
- "loss": 0.3491,
253
  "num_input_tokens_seen": 33984,
254
  "step": 85
255
  },
256
  {
257
  "epoch": 0.72,
258
- "grad_norm": 4.574488639831543,
259
  "learning_rate": 1.1699198087116589e-05,
260
- "loss": 0.3395,
261
  "num_input_tokens_seen": 35776,
262
  "step": 90
263
  },
264
  {
265
  "epoch": 0.728,
266
- "eval_loss": 0.3647787868976593,
267
- "eval_runtime": 0.7422,
268
- "eval_samples_per_second": 75.449,
269
- "eval_steps_per_second": 18.862,
270
  "num_input_tokens_seen": 36128,
271
  "step": 91
272
  },
273
  {
274
  "epoch": 0.76,
275
- "grad_norm": 4.598474502563477,
276
  "learning_rate": 8.870700411447816e-06,
277
- "loss": 0.4569,
278
  "num_input_tokens_seen": 37472,
279
  "step": 95
280
  },
281
  {
282
  "epoch": 0.784,
283
- "eval_loss": 0.3610990643501282,
284
- "eval_runtime": 0.683,
285
- "eval_samples_per_second": 81.991,
286
- "eval_steps_per_second": 20.498,
287
  "num_input_tokens_seen": 38592,
288
  "step": 98
289
  },
290
  {
291
  "epoch": 0.8,
292
- "grad_norm": 3.7893221378326416,
293
  "learning_rate": 6.358945877920861e-06,
294
- "loss": 0.3916,
295
  "num_input_tokens_seen": 39328,
296
  "step": 100
297
  },
298
  {
299
  "epoch": 0.84,
300
- "grad_norm": 1.4534764289855957,
301
  "learning_rate": 4.213259692436367e-06,
302
- "loss": 0.3191,
303
  "num_input_tokens_seen": 41280,
304
  "step": 105
305
  },
306
  {
307
  "epoch": 0.84,
308
- "eval_loss": 0.43045976758003235,
309
- "eval_runtime": 0.7093,
310
- "eval_samples_per_second": 78.948,
311
- "eval_steps_per_second": 19.737,
312
  "num_input_tokens_seen": 41280,
313
  "step": 105
314
  },
315
  {
316
  "epoch": 0.88,
317
- "grad_norm": 9.676743507385254,
318
  "learning_rate": 2.475778302439524e-06,
319
- "loss": 0.3951,
320
  "num_input_tokens_seen": 43552,
321
  "step": 110
322
  },
323
  {
324
  "epoch": 0.896,
325
- "eval_loss": 0.4486384391784668,
326
- "eval_runtime": 0.6661,
327
- "eval_samples_per_second": 84.076,
328
- "eval_steps_per_second": 21.019,
329
  "num_input_tokens_seen": 44160,
330
  "step": 112
331
  },
332
  {
333
  "epoch": 0.92,
334
- "grad_norm": 0.255204439163208,
335
  "learning_rate": 1.180621931927592e-06,
336
- "loss": 0.3107,
337
  "num_input_tokens_seen": 45216,
338
  "step": 115
339
  },
340
  {
341
  "epoch": 0.952,
342
- "eval_loss": 0.44127246737480164,
343
- "eval_runtime": 0.6928,
344
- "eval_samples_per_second": 80.827,
345
- "eval_steps_per_second": 20.207,
346
  "num_input_tokens_seen": 46944,
347
  "step": 119
348
  },
349
  {
350
  "epoch": 0.96,
351
- "grad_norm": 9.6677827835083,
352
  "learning_rate": 3.5322453704410286e-07,
353
- "loss": 0.2809,
354
  "num_input_tokens_seen": 47360,
355
  "step": 120
356
  },
357
  {
358
  "epoch": 1.0,
359
- "grad_norm": 9.322759628295898,
360
  "learning_rate": 9.834340423678368e-09,
361
- "loss": 0.4785,
362
  "num_input_tokens_seen": 49376,
363
  "step": 125
364
  },
@@ -366,11 +366,11 @@
366
  "epoch": 1.0,
367
  "num_input_tokens_seen": 49376,
368
  "step": 125,
369
- "total_flos": 497055112495104.0,
370
- "train_loss": 0.5283625726699829,
371
- "train_runtime": 476.3079,
372
- "train_samples_per_second": 1.046,
373
- "train_steps_per_second": 0.262
374
  }
375
  ],
376
  "logging_steps": 5,
@@ -390,7 +390,7 @@
390
  "attributes": {}
391
  }
392
  },
393
- "total_flos": 497055112495104.0,
394
  "train_batch_size": 4,
395
  "trial_name": null,
396
  "trial_params": null
 
1
  {
2
+ "best_global_step": 112,
3
+ "best_metric": 0.34589245915412903,
4
+ "best_model_checkpoint": "saves/test/checkpoint-112",
5
  "epoch": 1.0,
6
  "eval_steps": 7,
7
  "global_step": 125,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.04,
14
+ "grad_norm": 556.8270263671875,
15
  "learning_rate": 1.5384615384615387e-05,
16
+ "loss": 10.9709,
17
  "num_input_tokens_seen": 2144,
18
  "step": 5
19
  },
20
  {
21
  "epoch": 0.056,
22
+ "eval_loss": 6.5227251052856445,
23
+ "eval_runtime": 0.7016,
24
+ "eval_samples_per_second": 79.812,
25
+ "eval_steps_per_second": 19.953,
26
  "num_input_tokens_seen": 2880,
27
  "step": 7
28
  },
29
  {
30
  "epoch": 0.08,
31
+ "grad_norm": 166.4779815673828,
32
  "learning_rate": 3.461538461538462e-05,
33
+ "loss": 6.4075,
34
  "num_input_tokens_seen": 4128,
35
  "step": 10
36
  },
37
  {
38
  "epoch": 0.112,
39
+ "eval_loss": 1.382468581199646,
40
+ "eval_runtime": 0.7422,
41
+ "eval_samples_per_second": 75.454,
42
+ "eval_steps_per_second": 18.864,
43
  "num_input_tokens_seen": 5920,
44
  "step": 14
45
  },
46
  {
47
  "epoch": 0.12,
48
+ "grad_norm": 137.21327209472656,
49
  "learning_rate": 4.999016565957633e-05,
50
+ "loss": 2.5338,
51
  "num_input_tokens_seen": 6240,
52
  "step": 15
53
  },
54
  {
55
  "epoch": 0.16,
56
+ "grad_norm": 15.133822441101074,
57
  "learning_rate": 4.96467754629559e-05,
58
+ "loss": 0.5326,
59
  "num_input_tokens_seen": 8096,
60
  "step": 20
61
  },
62
  {
63
  "epoch": 0.168,
64
+ "eval_loss": 0.4987373352050781,
65
+ "eval_runtime": 0.7195,
66
+ "eval_samples_per_second": 77.827,
67
+ "eval_steps_per_second": 19.457,
68
  "num_input_tokens_seen": 8416,
69
  "step": 21
70
  },
71
  {
72
  "epoch": 0.2,
73
+ "grad_norm": 18.45602798461914,
74
  "learning_rate": 4.881937806807241e-05,
75
+ "loss": 0.4144,
76
  "num_input_tokens_seen": 10112,
77
  "step": 25
78
  },
79
  {
80
  "epoch": 0.224,
81
+ "eval_loss": 0.4531269073486328,
82
+ "eval_runtime": 0.7843,
83
+ "eval_samples_per_second": 71.406,
84
+ "eval_steps_per_second": 17.851,
85
  "num_input_tokens_seen": 11264,
86
  "step": 28
87
  },
88
  {
89
  "epoch": 0.24,
90
+ "grad_norm": 42.76151657104492,
91
  "learning_rate": 4.752422169756048e-05,
92
+ "loss": 0.4563,
93
  "num_input_tokens_seen": 12032,
94
  "step": 30
95
  },
96
  {
97
  "epoch": 0.28,
98
+ "grad_norm": 4.994872570037842,
99
  "learning_rate": 4.5786740307563636e-05,
100
+ "loss": 0.4802,
101
  "num_input_tokens_seen": 13824,
102
  "step": 35
103
  },
104
  {
105
  "epoch": 0.28,
106
+ "eval_loss": 0.36931881308555603,
107
+ "eval_runtime": 0.8719,
108
+ "eval_samples_per_second": 64.225,
109
+ "eval_steps_per_second": 16.056,
110
  "num_input_tokens_seen": 13824,
111
  "step": 35
112
  },
113
  {
114
  "epoch": 0.32,
115
+ "grad_norm": 8.647699356079102,
116
  "learning_rate": 4.364105412207914e-05,
117
+ "loss": 0.3809,
118
  "num_input_tokens_seen": 15840,
119
  "step": 40
120
  },
121
  {
122
  "epoch": 0.336,
123
+ "eval_loss": 0.3872639238834381,
124
+ "eval_runtime": 0.7387,
125
+ "eval_samples_per_second": 75.812,
126
+ "eval_steps_per_second": 18.953,
127
  "num_input_tokens_seen": 16672,
128
  "step": 42
129
  },
130
  {
131
  "epoch": 0.36,
132
+ "grad_norm": 12.718330383300781,
133
  "learning_rate": 4.1129299588552193e-05,
134
+ "loss": 0.3844,
135
  "num_input_tokens_seen": 17920,
136
  "step": 45
137
  },
138
  {
139
  "epoch": 0.392,
140
+ "eval_loss": 0.3777945637702942,
141
+ "eval_runtime": 0.7665,
142
+ "eval_samples_per_second": 73.06,
143
+ "eval_steps_per_second": 18.265,
144
  "num_input_tokens_seen": 19296,
145
  "step": 49
146
  },
147
  {
148
  "epoch": 0.4,
149
+ "grad_norm": 12.110234260559082,
150
  "learning_rate": 3.830080191288342e-05,
151
+ "loss": 0.2817,
152
  "num_input_tokens_seen": 19712,
153
  "step": 50
154
  },
155
  {
156
  "epoch": 0.44,
157
+ "grad_norm": 10.657136917114258,
158
  "learning_rate": 3.521110642339991e-05,
159
+ "loss": 0.3831,
160
  "num_input_tokens_seen": 21952,
161
  "step": 55
162
  },
163
  {
164
  "epoch": 0.448,
165
+ "eval_loss": 0.4436803460121155,
166
+ "eval_runtime": 0.7702,
167
+ "eval_samples_per_second": 72.71,
168
+ "eval_steps_per_second": 18.178,
169
  "num_input_tokens_seen": 22432,
170
  "step": 56
171
  },
172
  {
173
  "epoch": 0.48,
174
+ "grad_norm": 10.894862174987793,
175
  "learning_rate": 3.1920887785621235e-05,
176
+ "loss": 0.5576,
177
  "num_input_tokens_seen": 24160,
178
  "step": 60
179
  },
180
  {
181
  "epoch": 0.504,
182
+ "eval_loss": 0.35032057762145996,
183
+ "eval_runtime": 0.9256,
184
+ "eval_samples_per_second": 60.502,
185
+ "eval_steps_per_second": 15.125,
186
  "num_input_tokens_seen": 25504,
187
  "step": 63
188
  },
189
  {
190
  "epoch": 0.52,
191
+ "grad_norm": 7.415125370025635,
192
  "learning_rate": 2.849475848838749e-05,
193
+ "loss": 0.4013,
194
  "num_input_tokens_seen": 26112,
195
  "step": 65
196
  },
197
  {
198
  "epoch": 0.56,
199
+ "grad_norm": 9.572220802307129,
200
  "learning_rate": 2.5e-05,
201
+ "loss": 0.3242,
202
  "num_input_tokens_seen": 28064,
203
  "step": 70
204
  },
205
  {
206
  "epoch": 0.56,
207
+ "eval_loss": 0.37164703011512756,
208
+ "eval_runtime": 0.816,
209
+ "eval_samples_per_second": 68.627,
210
+ "eval_steps_per_second": 17.157,
211
  "num_input_tokens_seen": 28064,
212
  "step": 70
213
  },
214
  {
215
  "epoch": 0.6,
216
+ "grad_norm": 11.036535263061523,
217
  "learning_rate": 2.1505241511612522e-05,
218
+ "loss": 0.3963,
219
  "num_input_tokens_seen": 29824,
220
  "step": 75
221
  },
222
  {
223
  "epoch": 0.616,
224
+ "eval_loss": 0.3748786747455597,
225
+ "eval_runtime": 0.7992,
226
+ "eval_samples_per_second": 70.066,
227
+ "eval_steps_per_second": 17.516,
228
  "num_input_tokens_seen": 30720,
229
  "step": 77
230
  },
231
  {
232
  "epoch": 0.64,
233
+ "grad_norm": 2.2476918697357178,
234
  "learning_rate": 1.8079112214378768e-05,
235
+ "loss": 0.3946,
236
  "num_input_tokens_seen": 31904,
237
  "step": 80
238
  },
239
  {
240
  "epoch": 0.672,
241
+ "eval_loss": 0.3603578209877014,
242
+ "eval_runtime": 0.7982,
243
+ "eval_samples_per_second": 70.16,
244
+ "eval_steps_per_second": 17.54,
245
  "num_input_tokens_seen": 33504,
246
  "step": 84
247
  },
248
  {
249
  "epoch": 0.68,
250
+ "grad_norm": 2.605731248855591,
251
  "learning_rate": 1.4788893576600099e-05,
252
+ "loss": 0.3496,
253
  "num_input_tokens_seen": 33984,
254
  "step": 85
255
  },
256
  {
257
  "epoch": 0.72,
258
+ "grad_norm": 2.532665967941284,
259
  "learning_rate": 1.1699198087116589e-05,
260
+ "loss": 0.337,
261
  "num_input_tokens_seen": 35776,
262
  "step": 90
263
  },
264
  {
265
  "epoch": 0.728,
266
+ "eval_loss": 0.35710158944129944,
267
+ "eval_runtime": 0.8602,
268
+ "eval_samples_per_second": 65.102,
269
+ "eval_steps_per_second": 16.276,
270
  "num_input_tokens_seen": 36128,
271
  "step": 91
272
  },
273
  {
274
  "epoch": 0.76,
275
+ "grad_norm": 2.5953240394592285,
276
  "learning_rate": 8.870700411447816e-06,
277
+ "loss": 0.4315,
278
  "num_input_tokens_seen": 37472,
279
  "step": 95
280
  },
281
  {
282
  "epoch": 0.784,
283
+ "eval_loss": 0.3520326614379883,
284
+ "eval_runtime": 0.7911,
285
+ "eval_samples_per_second": 70.792,
286
+ "eval_steps_per_second": 17.698,
287
  "num_input_tokens_seen": 38592,
288
  "step": 98
289
  },
290
  {
291
  "epoch": 0.8,
292
+ "grad_norm": 2.179095506668091,
293
  "learning_rate": 6.358945877920861e-06,
294
+ "loss": 0.38,
295
  "num_input_tokens_seen": 39328,
296
  "step": 100
297
  },
298
  {
299
  "epoch": 0.84,
300
+ "grad_norm": 5.5090203285217285,
301
  "learning_rate": 4.213259692436367e-06,
302
+ "loss": 0.371,
303
  "num_input_tokens_seen": 41280,
304
  "step": 105
305
  },
306
  {
307
  "epoch": 0.84,
308
+ "eval_loss": 0.34758228063583374,
309
+ "eval_runtime": 0.8072,
310
+ "eval_samples_per_second": 69.373,
311
+ "eval_steps_per_second": 17.343,
312
  "num_input_tokens_seen": 41280,
313
  "step": 105
314
  },
315
  {
316
  "epoch": 0.88,
317
+ "grad_norm": 1.8210620880126953,
318
  "learning_rate": 2.475778302439524e-06,
319
+ "loss": 0.364,
320
  "num_input_tokens_seen": 43552,
321
  "step": 110
322
  },
323
  {
324
  "epoch": 0.896,
325
+ "eval_loss": 0.34589245915412903,
326
+ "eval_runtime": 0.822,
327
+ "eval_samples_per_second": 68.123,
328
+ "eval_steps_per_second": 17.031,
329
  "num_input_tokens_seen": 44160,
330
  "step": 112
331
  },
332
  {
333
  "epoch": 0.92,
334
+ "grad_norm": 5.644977569580078,
335
  "learning_rate": 1.180621931927592e-06,
336
+ "loss": 0.3554,
337
  "num_input_tokens_seen": 45216,
338
  "step": 115
339
  },
340
  {
341
  "epoch": 0.952,
342
+ "eval_loss": 0.3492301404476166,
343
+ "eval_runtime": 0.8757,
344
+ "eval_samples_per_second": 63.945,
345
+ "eval_steps_per_second": 15.986,
346
  "num_input_tokens_seen": 46944,
347
  "step": 119
348
  },
349
  {
350
  "epoch": 0.96,
351
+ "grad_norm": 1.6824895143508911,
352
  "learning_rate": 3.5322453704410286e-07,
353
+ "loss": 0.3494,
354
  "num_input_tokens_seen": 47360,
355
  "step": 120
356
  },
357
  {
358
  "epoch": 1.0,
359
+ "grad_norm": 10.473832130432129,
360
  "learning_rate": 9.834340423678368e-09,
361
+ "loss": 0.3588,
362
  "num_input_tokens_seen": 49376,
363
  "step": 125
364
  },
 
366
  "epoch": 1.0,
367
  "num_input_tokens_seen": 49376,
368
  "step": 125,
369
+ "total_flos": 497127920369664.0,
370
+ "train_loss": 1.1438678817749024,
371
+ "train_runtime": 264.1495,
372
+ "train_samples_per_second": 1.885,
373
+ "train_steps_per_second": 0.473
374
  }
375
  ],
376
  "logging_steps": 5,
 
390
  "attributes": {}
391
  }
392
  },
393
+ "total_flos": 497127920369664.0,
394
  "train_batch_size": 4,
395
  "trial_name": null,
396
  "trial_params": null
training_eval_loss.png CHANGED
training_loss.png CHANGED