rbelanec commited on
Commit
86dd13f
verified
1 Parent(s): 1413a12

End of training

Browse files
README.md CHANGED
@@ -17,10 +17,10 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  # test
19
 
20
- This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on an unknown dataset.
21
  It achieves the following results on the evaluation set:
22
- - Loss: 0.3516
23
- - Num Input Tokens Seen: 43600
24
 
25
  ## Model description
26
 
 
17
 
18
  # test
19
 
20
+ This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the wsc dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.3491
23
+ - Num Input Tokens Seen: 43904
24
 
25
  ## Model description
26
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 0.34589245915412903,
4
- "eval_runtime": 0.7925,
5
- "eval_samples_per_second": 70.661,
6
- "eval_steps_per_second": 17.665,
7
- "num_input_tokens_seen": 49376,
8
- "total_flos": 497127920369664.0,
9
- "train_loss": 1.1438678817749024,
10
- "train_runtime": 224.169,
11
- "train_samples_per_second": 2.222,
12
- "train_steps_per_second": 0.558
13
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 0.34907668828964233,
4
+ "eval_runtime": 0.5932,
5
+ "eval_samples_per_second": 94.404,
6
+ "eval_steps_per_second": 47.202,
7
+ "num_input_tokens_seen": 43904,
8
+ "total_flos": 278458437992448.0,
9
+ "train_loss": 0.3984213411568638,
10
+ "train_runtime": 80.7936,
11
+ "train_samples_per_second": 6.164,
12
+ "train_steps_per_second": 3.082
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 0.34589245915412903,
4
- "eval_runtime": 0.7925,
5
- "eval_samples_per_second": 70.661,
6
- "eval_steps_per_second": 17.665,
7
- "num_input_tokens_seen": 49376
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 0.34907668828964233,
4
+ "eval_runtime": 0.5932,
5
+ "eval_samples_per_second": 94.404,
6
+ "eval_steps_per_second": 47.202,
7
+ "num_input_tokens_seen": 43904
8
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "num_input_tokens_seen": 49376,
4
- "total_flos": 497127920369664.0,
5
- "train_loss": 1.1438678817749024,
6
- "train_runtime": 224.169,
7
- "train_samples_per_second": 2.222,
8
- "train_steps_per_second": 0.558
9
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "num_input_tokens_seen": 43904,
4
+ "total_flos": 278458437992448.0,
5
+ "train_loss": 0.3984213411568638,
6
+ "train_runtime": 80.7936,
7
+ "train_samples_per_second": 6.164,
8
+ "train_steps_per_second": 3.082
9
  }
trainer_state.json CHANGED
@@ -1,383 +1,593 @@
1
  {
2
- "best_global_step": 112,
3
- "best_metric": 0.34589245915412903,
4
- "best_model_checkpoint": "saves/test/checkpoint-112",
5
  "epoch": 1.0,
6
- "eval_steps": 7,
7
- "global_step": 125,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.04,
14
- "grad_norm": 556.8270263671875,
15
- "learning_rate": 1.5384615384615387e-05,
16
- "loss": 10.9709,
17
- "num_input_tokens_seen": 2144,
18
  "step": 5
19
  },
20
  {
21
- "epoch": 0.056,
22
- "eval_loss": 6.5227251052856445,
23
- "eval_runtime": 0.703,
24
- "eval_samples_per_second": 79.659,
25
- "eval_steps_per_second": 19.915,
26
- "num_input_tokens_seen": 2880,
27
- "step": 7
28
- },
29
- {
30
- "epoch": 0.08,
31
- "grad_norm": 166.4779815673828,
32
- "learning_rate": 3.461538461538462e-05,
33
- "loss": 6.4075,
34
- "num_input_tokens_seen": 4128,
35
  "step": 10
36
  },
37
  {
38
- "epoch": 0.112,
39
- "eval_loss": 1.382468581199646,
40
- "eval_runtime": 0.7137,
41
- "eval_samples_per_second": 78.462,
42
- "eval_steps_per_second": 19.616,
43
- "num_input_tokens_seen": 5920,
44
- "step": 14
45
  },
46
  {
47
- "epoch": 0.12,
48
- "grad_norm": 137.21327209472656,
49
- "learning_rate": 4.999016565957633e-05,
50
- "loss": 2.5338,
51
- "num_input_tokens_seen": 6240,
52
  "step": 15
53
  },
54
  {
55
- "epoch": 0.16,
56
- "grad_norm": 15.133822441101074,
57
- "learning_rate": 4.96467754629559e-05,
58
- "loss": 0.5326,
59
- "num_input_tokens_seen": 8096,
60
  "step": 20
61
  },
62
  {
63
- "epoch": 0.168,
64
- "eval_loss": 0.4987373352050781,
65
- "eval_runtime": 0.5096,
66
- "eval_samples_per_second": 109.892,
67
- "eval_steps_per_second": 27.473,
68
- "num_input_tokens_seen": 8416,
69
- "step": 21
70
- },
71
- {
72
- "epoch": 0.2,
73
- "grad_norm": 18.45602798461914,
74
- "learning_rate": 4.881937806807241e-05,
75
- "loss": 0.4144,
76
- "num_input_tokens_seen": 10112,
77
  "step": 25
78
  },
79
  {
80
- "epoch": 0.224,
81
- "eval_loss": 0.4531269073486328,
82
- "eval_runtime": 0.5083,
83
- "eval_samples_per_second": 110.162,
84
- "eval_steps_per_second": 27.54,
85
- "num_input_tokens_seen": 11264,
86
- "step": 28
87
  },
88
  {
89
- "epoch": 0.24,
90
- "grad_norm": 42.76151657104492,
91
- "learning_rate": 4.752422169756048e-05,
92
- "loss": 0.4563,
93
- "num_input_tokens_seen": 12032,
94
  "step": 30
95
  },
96
  {
97
- "epoch": 0.28,
98
- "grad_norm": 4.994872570037842,
99
- "learning_rate": 4.5786740307563636e-05,
100
- "loss": 0.4802,
101
- "num_input_tokens_seen": 13824,
102
  "step": 35
103
  },
104
  {
105
- "epoch": 0.28,
106
- "eval_loss": 0.36931881308555603,
107
- "eval_runtime": 0.7118,
108
- "eval_samples_per_second": 78.679,
109
- "eval_steps_per_second": 19.67,
110
- "num_input_tokens_seen": 13824,
111
- "step": 35
112
  },
113
  {
114
- "epoch": 0.32,
115
- "grad_norm": 8.647699356079102,
116
- "learning_rate": 4.364105412207914e-05,
117
- "loss": 0.3809,
118
- "num_input_tokens_seen": 15840,
119
  "step": 40
120
  },
121
  {
122
- "epoch": 0.336,
123
- "eval_loss": 0.3872639238834381,
124
- "eval_runtime": 0.7546,
125
- "eval_samples_per_second": 74.213,
126
- "eval_steps_per_second": 18.553,
127
- "num_input_tokens_seen": 16672,
128
- "step": 42
129
- },
130
- {
131
- "epoch": 0.36,
132
- "grad_norm": 12.718330383300781,
133
- "learning_rate": 4.1129299588552193e-05,
134
- "loss": 0.3844,
135
- "num_input_tokens_seen": 17920,
136
  "step": 45
137
  },
138
  {
139
- "epoch": 0.392,
140
- "eval_loss": 0.3777945637702942,
141
- "eval_runtime": 0.7935,
142
- "eval_samples_per_second": 70.573,
143
- "eval_steps_per_second": 17.643,
144
- "num_input_tokens_seen": 19296,
145
- "step": 49
146
- },
147
- {
148
- "epoch": 0.4,
149
- "grad_norm": 12.110234260559082,
150
- "learning_rate": 3.830080191288342e-05,
151
- "loss": 0.2817,
152
- "num_input_tokens_seen": 19712,
153
  "step": 50
154
  },
155
  {
156
- "epoch": 0.44,
157
- "grad_norm": 10.657136917114258,
158
- "learning_rate": 3.521110642339991e-05,
159
- "loss": 0.3831,
160
- "num_input_tokens_seen": 21952,
161
- "step": 55
 
162
  },
163
  {
164
- "epoch": 0.448,
165
- "eval_loss": 0.4436803460121155,
166
- "eval_runtime": 0.7397,
167
- "eval_samples_per_second": 75.71,
168
- "eval_steps_per_second": 18.927,
169
- "num_input_tokens_seen": 22432,
170
- "step": 56
171
  },
172
  {
173
- "epoch": 0.48,
174
- "grad_norm": 10.894862174987793,
175
- "learning_rate": 3.1920887785621235e-05,
176
- "loss": 0.5576,
177
- "num_input_tokens_seen": 24160,
178
  "step": 60
179
  },
180
  {
181
- "epoch": 0.504,
182
- "eval_loss": 0.35032057762145996,
183
- "eval_runtime": 0.7463,
184
- "eval_samples_per_second": 75.04,
185
- "eval_steps_per_second": 18.76,
186
- "num_input_tokens_seen": 25504,
187
- "step": 63
188
- },
189
- {
190
- "epoch": 0.52,
191
- "grad_norm": 7.415125370025635,
192
- "learning_rate": 2.849475848838749e-05,
193
- "loss": 0.4013,
194
- "num_input_tokens_seen": 26112,
195
  "step": 65
196
  },
197
  {
198
- "epoch": 0.56,
199
- "grad_norm": 9.572220802307129,
200
- "learning_rate": 2.5e-05,
201
- "loss": 0.3242,
202
- "num_input_tokens_seen": 28064,
203
- "step": 70
 
204
  },
205
  {
206
- "epoch": 0.56,
207
- "eval_loss": 0.37164703011512756,
208
- "eval_runtime": 0.7524,
209
- "eval_samples_per_second": 74.427,
210
- "eval_steps_per_second": 18.607,
211
- "num_input_tokens_seen": 28064,
212
  "step": 70
213
  },
214
  {
215
- "epoch": 0.6,
216
- "grad_norm": 11.036535263061523,
217
- "learning_rate": 2.1505241511612522e-05,
218
- "loss": 0.3963,
219
- "num_input_tokens_seen": 29824,
220
  "step": 75
221
  },
222
  {
223
- "epoch": 0.616,
224
- "eval_loss": 0.3748786747455597,
225
- "eval_runtime": 0.6694,
226
- "eval_samples_per_second": 83.657,
227
- "eval_steps_per_second": 20.914,
228
- "num_input_tokens_seen": 30720,
229
- "step": 77
230
  },
231
  {
232
- "epoch": 0.64,
233
- "grad_norm": 2.2476918697357178,
234
- "learning_rate": 1.8079112214378768e-05,
235
- "loss": 0.3946,
236
- "num_input_tokens_seen": 31904,
237
  "step": 80
238
  },
239
  {
240
- "epoch": 0.672,
241
- "eval_loss": 0.3603578209877014,
242
- "eval_runtime": 0.7578,
243
- "eval_samples_per_second": 73.897,
244
- "eval_steps_per_second": 18.474,
245
- "num_input_tokens_seen": 33504,
246
- "step": 84
247
- },
248
- {
249
- "epoch": 0.68,
250
- "grad_norm": 2.605731248855591,
251
- "learning_rate": 1.4788893576600099e-05,
252
- "loss": 0.3496,
253
- "num_input_tokens_seen": 33984,
254
  "step": 85
255
  },
256
  {
257
- "epoch": 0.72,
258
- "grad_norm": 2.532665967941284,
259
- "learning_rate": 1.1699198087116589e-05,
260
- "loss": 0.337,
261
- "num_input_tokens_seen": 35776,
262
  "step": 90
263
  },
264
  {
265
- "epoch": 0.728,
266
- "eval_loss": 0.35710158944129944,
267
- "eval_runtime": 0.5883,
268
- "eval_samples_per_second": 95.185,
269
- "eval_steps_per_second": 23.796,
270
- "num_input_tokens_seen": 36128,
271
  "step": 91
272
  },
273
  {
274
- "epoch": 0.76,
275
- "grad_norm": 2.5953240394592285,
276
- "learning_rate": 8.870700411447816e-06,
277
- "loss": 0.4315,
278
- "num_input_tokens_seen": 37472,
279
  "step": 95
280
  },
281
  {
282
- "epoch": 0.784,
283
- "eval_loss": 0.3520326614379883,
284
- "eval_runtime": 0.8055,
285
- "eval_samples_per_second": 69.518,
286
- "eval_steps_per_second": 17.38,
287
- "num_input_tokens_seen": 38592,
288
- "step": 98
289
- },
290
- {
291
- "epoch": 0.8,
292
- "grad_norm": 2.179095506668091,
293
- "learning_rate": 6.358945877920861e-06,
294
- "loss": 0.38,
295
- "num_input_tokens_seen": 39328,
296
  "step": 100
297
  },
298
  {
299
- "epoch": 0.84,
300
- "grad_norm": 5.5090203285217285,
301
- "learning_rate": 4.213259692436367e-06,
302
- "loss": 0.371,
303
- "num_input_tokens_seen": 41280,
304
- "step": 105
 
305
  },
306
  {
307
- "epoch": 0.84,
308
- "eval_loss": 0.34758228063583374,
309
- "eval_runtime": 0.7519,
310
- "eval_samples_per_second": 74.483,
311
- "eval_steps_per_second": 18.621,
312
- "num_input_tokens_seen": 41280,
313
  "step": 105
314
  },
315
  {
316
- "epoch": 0.88,
317
- "grad_norm": 1.8210620880126953,
318
- "learning_rate": 2.475778302439524e-06,
319
- "loss": 0.364,
320
- "num_input_tokens_seen": 43552,
321
  "step": 110
322
  },
323
  {
324
- "epoch": 0.896,
325
- "eval_loss": 0.34589245915412903,
326
- "eval_runtime": 0.7431,
327
- "eval_samples_per_second": 75.359,
328
- "eval_steps_per_second": 18.84,
329
- "num_input_tokens_seen": 44160,
330
- "step": 112
331
- },
332
- {
333
- "epoch": 0.92,
334
- "grad_norm": 5.644977569580078,
335
- "learning_rate": 1.180621931927592e-06,
336
- "loss": 0.3554,
337
- "num_input_tokens_seen": 45216,
338
  "step": 115
339
  },
340
  {
341
- "epoch": 0.952,
342
- "eval_loss": 0.3492301404476166,
343
- "eval_runtime": 0.7889,
344
- "eval_samples_per_second": 70.986,
345
- "eval_steps_per_second": 17.747,
346
- "num_input_tokens_seen": 46944,
347
- "step": 119
348
  },
349
  {
350
- "epoch": 0.96,
351
- "grad_norm": 1.6824895143508911,
352
- "learning_rate": 3.5322453704410286e-07,
353
- "loss": 0.3494,
354
- "num_input_tokens_seen": 47360,
355
  "step": 120
356
  },
357
  {
358
- "epoch": 1.0,
359
- "grad_norm": 10.473832130432129,
360
- "learning_rate": 9.834340423678368e-09,
361
- "loss": 0.3588,
362
- "num_input_tokens_seen": 49376,
363
  "step": 125
364
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  {
366
  "epoch": 1.0,
367
- "num_input_tokens_seen": 49376,
368
- "step": 125,
369
- "total_flos": 497127920369664.0,
370
- "train_loss": 1.1438678817749024,
371
- "train_runtime": 224.169,
372
- "train_samples_per_second": 2.222,
373
- "train_steps_per_second": 0.558
374
  }
375
  ],
376
  "logging_steps": 5,
377
- "max_steps": 125,
378
- "num_input_tokens_seen": 49376,
379
  "num_train_epochs": 1,
380
- "save_steps": 7,
381
  "stateful_callbacks": {
382
  "TrainerControl": {
383
  "args": {
@@ -390,8 +600,8 @@
390
  "attributes": {}
391
  }
392
  },
393
- "total_flos": 497127920369664.0,
394
- "train_batch_size": 4,
395
  "trial_name": null,
396
  "trial_params": null
397
  }
 
1
  {
2
+ "best_global_step": 182,
3
+ "best_metric": 0.34907668828964233,
4
+ "best_model_checkpoint": "saves/test/checkpoint-182",
5
  "epoch": 1.0,
6
+ "eval_steps": 13,
7
+ "global_step": 249,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.020080321285140562,
14
+ "grad_norm": 29.701719284057617,
15
+ "learning_rate": 8.000000000000001e-06,
16
+ "loss": 0.8323,
17
+ "num_input_tokens_seen": 832,
18
  "step": 5
19
  },
20
  {
21
+ "epoch": 0.040160642570281124,
22
+ "grad_norm": 19.538766860961914,
23
+ "learning_rate": 1.8e-05,
24
+ "loss": 0.7462,
25
+ "num_input_tokens_seen": 1760,
 
 
 
 
 
 
 
 
 
26
  "step": 10
27
  },
28
  {
29
+ "epoch": 0.05220883534136546,
30
+ "eval_loss": 0.6849029660224915,
31
+ "eval_runtime": 0.5644,
32
+ "eval_samples_per_second": 99.229,
33
+ "eval_steps_per_second": 49.614,
34
+ "num_input_tokens_seen": 2288,
35
+ "step": 13
36
  },
37
  {
38
+ "epoch": 0.060240963855421686,
39
+ "grad_norm": 9.36767292022705,
40
+ "learning_rate": 2.8000000000000003e-05,
41
+ "loss": 0.71,
42
+ "num_input_tokens_seen": 2608,
43
  "step": 15
44
  },
45
  {
46
+ "epoch": 0.08032128514056225,
47
+ "grad_norm": 17.907136917114258,
48
+ "learning_rate": 3.8e-05,
49
+ "loss": 0.5466,
50
+ "num_input_tokens_seen": 3536,
51
  "step": 20
52
  },
53
  {
54
+ "epoch": 0.10040160642570281,
55
+ "grad_norm": 11.546435356140137,
56
+ "learning_rate": 4.8e-05,
57
+ "loss": 0.6639,
58
+ "num_input_tokens_seen": 4496,
 
 
 
 
 
 
 
 
 
59
  "step": 25
60
  },
61
  {
62
+ "epoch": 0.10441767068273092,
63
+ "eval_loss": 0.45566946268081665,
64
+ "eval_runtime": 0.5583,
65
+ "eval_samples_per_second": 100.311,
66
+ "eval_steps_per_second": 50.156,
67
+ "num_input_tokens_seen": 4656,
68
+ "step": 26
69
  },
70
  {
71
+ "epoch": 0.12048192771084337,
72
+ "grad_norm": 8.822799682617188,
73
+ "learning_rate": 4.996067037544542e-05,
74
+ "loss": 0.5107,
75
+ "num_input_tokens_seen": 5424,
76
  "step": 30
77
  },
78
  {
79
+ "epoch": 0.14056224899598393,
80
+ "grad_norm": 7.542176246643066,
81
+ "learning_rate": 4.980110583549062e-05,
82
+ "loss": 0.3742,
83
+ "num_input_tokens_seen": 6304,
84
  "step": 35
85
  },
86
  {
87
+ "epoch": 0.1566265060240964,
88
+ "eval_loss": 0.3848888874053955,
89
+ "eval_runtime": 0.5676,
90
+ "eval_samples_per_second": 98.664,
91
+ "eval_steps_per_second": 49.332,
92
+ "num_input_tokens_seen": 6944,
93
+ "step": 39
94
  },
95
  {
96
+ "epoch": 0.1606425702811245,
97
+ "grad_norm": 1.9545893669128418,
98
+ "learning_rate": 4.951963201008076e-05,
99
+ "loss": 0.3994,
100
+ "num_input_tokens_seen": 7072,
101
  "step": 40
102
  },
103
  {
104
+ "epoch": 0.18072289156626506,
105
+ "grad_norm": 6.747119903564453,
106
+ "learning_rate": 4.91176324775594e-05,
107
+ "loss": 0.3929,
108
+ "num_input_tokens_seen": 7856,
 
 
 
 
 
 
 
 
 
109
  "step": 45
110
  },
111
  {
112
+ "epoch": 0.20080321285140562,
113
+ "grad_norm": 9.478217124938965,
114
+ "learning_rate": 4.8597083257709194e-05,
115
+ "loss": 0.3565,
116
+ "num_input_tokens_seen": 8880,
 
 
 
 
 
 
 
 
 
117
  "step": 50
118
  },
119
  {
120
+ "epoch": 0.20883534136546184,
121
+ "eval_loss": 0.3768366277217865,
122
+ "eval_runtime": 0.5874,
123
+ "eval_samples_per_second": 95.338,
124
+ "eval_steps_per_second": 47.669,
125
+ "num_input_tokens_seen": 9232,
126
+ "step": 52
127
  },
128
  {
129
+ "epoch": 0.22088353413654618,
130
+ "grad_norm": 2.1806461811065674,
131
+ "learning_rate": 4.796054309867053e-05,
132
+ "loss": 0.4015,
133
+ "num_input_tokens_seen": 9680,
134
+ "step": 55
 
135
  },
136
  {
137
+ "epoch": 0.24096385542168675,
138
+ "grad_norm": 7.034917831420898,
139
+ "learning_rate": 4.721114089947181e-05,
140
+ "loss": 0.3437,
141
+ "num_input_tokens_seen": 10576,
142
  "step": 60
143
  },
144
  {
145
+ "epoch": 0.26104417670682734,
146
+ "grad_norm": 2.6137804985046387,
147
+ "learning_rate": 4.6352560329995686e-05,
148
+ "loss": 0.3087,
149
+ "num_input_tokens_seen": 11424,
 
 
 
 
 
 
 
 
 
150
  "step": 65
151
  },
152
  {
153
+ "epoch": 0.26104417670682734,
154
+ "eval_loss": 0.3713006377220154,
155
+ "eval_runtime": 0.5783,
156
+ "eval_samples_per_second": 96.834,
157
+ "eval_steps_per_second": 48.417,
158
+ "num_input_tokens_seen": 11424,
159
+ "step": 65
160
  },
161
  {
162
+ "epoch": 0.28112449799196787,
163
+ "grad_norm": 2.673372507095337,
164
+ "learning_rate": 4.538902172398151e-05,
165
+ "loss": 0.3702,
166
+ "num_input_tokens_seen": 12224,
 
167
  "step": 70
168
  },
169
  {
170
+ "epoch": 0.30120481927710846,
171
+ "grad_norm": 1.5868593454360962,
172
+ "learning_rate": 4.4325261334068426e-05,
173
+ "loss": 0.3607,
174
+ "num_input_tokens_seen": 13168,
175
  "step": 75
176
  },
177
  {
178
+ "epoch": 0.3132530120481928,
179
+ "eval_loss": 0.3614208996295929,
180
+ "eval_runtime": 0.6054,
181
+ "eval_samples_per_second": 92.508,
182
+ "eval_steps_per_second": 46.254,
183
+ "num_input_tokens_seen": 13760,
184
+ "step": 78
185
  },
186
  {
187
+ "epoch": 0.321285140562249,
188
+ "grad_norm": 7.262303829193115,
189
+ "learning_rate": 4.316650805085068e-05,
190
+ "loss": 0.3766,
191
+ "num_input_tokens_seen": 14080,
192
  "step": 80
193
  },
194
  {
195
+ "epoch": 0.3413654618473896,
196
+ "grad_norm": 6.2033772468566895,
197
+ "learning_rate": 4.1918457700381855e-05,
198
+ "loss": 0.3639,
199
+ "num_input_tokens_seen": 15056,
 
 
 
 
 
 
 
 
 
200
  "step": 85
201
  },
202
  {
203
+ "epoch": 0.3614457831325301,
204
+ "grad_norm": 6.97931432723999,
205
+ "learning_rate": 4.058724504646834e-05,
206
+ "loss": 0.3589,
207
+ "num_input_tokens_seen": 15904,
208
  "step": 90
209
  },
210
  {
211
+ "epoch": 0.3654618473895582,
212
+ "eval_loss": 0.36092114448547363,
213
+ "eval_runtime": 0.5824,
214
+ "eval_samples_per_second": 96.155,
215
+ "eval_steps_per_second": 48.078,
216
+ "num_input_tokens_seen": 16048,
217
  "step": 91
218
  },
219
  {
220
+ "epoch": 0.3815261044176707,
221
+ "grad_norm": 6.607943058013916,
222
+ "learning_rate": 3.9179413635373897e-05,
223
+ "loss": 0.3395,
224
+ "num_input_tokens_seen": 16688,
225
  "step": 95
226
  },
227
  {
228
+ "epoch": 0.40160642570281124,
229
+ "grad_norm": 6.295155048370361,
230
+ "learning_rate": 3.770188363116324e-05,
231
+ "loss": 0.2898,
232
+ "num_input_tokens_seen": 17552,
 
 
 
 
 
 
 
 
 
233
  "step": 100
234
  },
235
  {
236
+ "epoch": 0.41767068273092367,
237
+ "eval_loss": 0.37226182222366333,
238
+ "eval_runtime": 0.5933,
239
+ "eval_samples_per_second": 94.389,
240
+ "eval_steps_per_second": 47.194,
241
+ "num_input_tokens_seen": 18272,
242
+ "step": 104
243
  },
244
  {
245
+ "epoch": 0.42168674698795183,
246
+ "grad_norm": 1.7356857061386108,
247
+ "learning_rate": 3.616191779978907e-05,
248
+ "loss": 0.3169,
249
+ "num_input_tokens_seen": 18400,
 
250
  "step": 105
251
  },
252
  {
253
+ "epoch": 0.44176706827309237,
254
+ "grad_norm": 5.197076797485352,
255
+ "learning_rate": 3.456708580912725e-05,
256
+ "loss": 0.3631,
257
+ "num_input_tokens_seen": 19456,
258
  "step": 110
259
  },
260
  {
261
+ "epoch": 0.46184738955823296,
262
+ "grad_norm": 8.22790241241455,
263
+ "learning_rate": 3.292522702044221e-05,
264
+ "loss": 0.4246,
265
+ "num_input_tokens_seen": 20288,
 
 
 
 
 
 
 
 
 
266
  "step": 115
267
  },
268
  {
269
+ "epoch": 0.46987951807228917,
270
+ "eval_loss": 0.36986905336380005,
271
+ "eval_runtime": 0.5858,
272
+ "eval_samples_per_second": 95.59,
273
+ "eval_steps_per_second": 47.795,
274
+ "num_input_tokens_seen": 20656,
275
+ "step": 117
276
  },
277
  {
278
+ "epoch": 0.4819277108433735,
279
+ "grad_norm": 2.152425527572632,
280
+ "learning_rate": 3.1244411954180676e-05,
281
+ "loss": 0.3885,
282
+ "num_input_tokens_seen": 21328,
283
  "step": 120
284
  },
285
  {
286
+ "epoch": 0.5020080321285141,
287
+ "grad_norm": 1.8676035404205322,
288
+ "learning_rate": 2.9532902619507462e-05,
289
+ "loss": 0.3539,
290
+ "num_input_tokens_seen": 22304,
291
  "step": 125
292
  },
293
+ {
294
+ "epoch": 0.5220883534136547,
295
+ "grad_norm": 6.877042293548584,
296
+ "learning_rate": 2.7799111902582696e-05,
297
+ "loss": 0.3657,
298
+ "num_input_tokens_seen": 23056,
299
+ "step": 130
300
+ },
301
+ {
302
+ "epoch": 0.5220883534136547,
303
+ "eval_loss": 0.35231098532676697,
304
+ "eval_runtime": 0.579,
305
+ "eval_samples_per_second": 96.724,
306
+ "eval_steps_per_second": 48.362,
307
+ "num_input_tokens_seen": 23056,
308
+ "step": 130
309
+ },
310
+ {
311
+ "epoch": 0.5421686746987951,
312
+ "grad_norm": 8.028996467590332,
313
+ "learning_rate": 2.6051562213206632e-05,
314
+ "loss": 0.3499,
315
+ "num_input_tokens_seen": 23840,
316
+ "step": 135
317
+ },
318
+ {
319
+ "epoch": 0.5622489959839357,
320
+ "grad_norm": 2.0267858505249023,
321
+ "learning_rate": 2.429884359310328e-05,
322
+ "loss": 0.3637,
323
+ "num_input_tokens_seen": 24832,
324
+ "step": 140
325
+ },
326
+ {
327
+ "epoch": 0.5742971887550201,
328
+ "eval_loss": 0.3550644516944885,
329
+ "eval_runtime": 0.5754,
330
+ "eval_samples_per_second": 97.321,
331
+ "eval_steps_per_second": 48.66,
332
+ "num_input_tokens_seen": 25312,
333
+ "step": 143
334
+ },
335
+ {
336
+ "epoch": 0.5823293172690763,
337
+ "grad_norm": 1.7024149894714355,
338
+ "learning_rate": 2.2549571491760986e-05,
339
+ "loss": 0.3785,
340
+ "num_input_tokens_seen": 25648,
341
+ "step": 145
342
+ },
343
+ {
344
+ "epoch": 0.6024096385542169,
345
+ "grad_norm": 7.411402225494385,
346
+ "learning_rate": 2.0812344417381595e-05,
347
+ "loss": 0.3394,
348
+ "num_input_tokens_seen": 26496,
349
+ "step": 150
350
+ },
351
+ {
352
+ "epoch": 0.6224899598393574,
353
+ "grad_norm": 6.996516227722168,
354
+ "learning_rate": 1.909570167110415e-05,
355
+ "loss": 0.3938,
356
+ "num_input_tokens_seen": 27392,
357
+ "step": 155
358
+ },
359
+ {
360
+ "epoch": 0.6265060240963856,
361
+ "eval_loss": 0.3516700565814972,
362
+ "eval_runtime": 0.5784,
363
+ "eval_samples_per_second": 96.823,
364
+ "eval_steps_per_second": 48.411,
365
+ "num_input_tokens_seen": 27552,
366
+ "step": 156
367
+ },
368
+ {
369
+ "epoch": 0.642570281124498,
370
+ "grad_norm": 1.795516848564148,
371
+ "learning_rate": 1.7408081372259632e-05,
372
+ "loss": 0.3667,
373
+ "num_input_tokens_seen": 28272,
374
+ "step": 160
375
+ },
376
+ {
377
+ "epoch": 0.6626506024096386,
378
+ "grad_norm": 5.605747222900391,
379
+ "learning_rate": 1.5757778980982626e-05,
380
+ "loss": 0.3198,
381
+ "num_input_tokens_seen": 29184,
382
+ "step": 165
383
+ },
384
+ {
385
+ "epoch": 0.678714859437751,
386
+ "eval_loss": 0.354565292596817,
387
+ "eval_runtime": 0.578,
388
+ "eval_samples_per_second": 96.89,
389
+ "eval_steps_per_second": 48.445,
390
+ "num_input_tokens_seen": 29984,
391
+ "step": 169
392
+ },
393
+ {
394
+ "epoch": 0.6827309236947792,
395
+ "grad_norm": 2.0162057876586914,
396
+ "learning_rate": 1.4152906522061048e-05,
397
+ "loss": 0.3366,
398
+ "num_input_tokens_seen": 30128,
399
+ "step": 170
400
+ },
401
+ {
402
+ "epoch": 0.7028112449799196,
403
+ "grad_norm": 2.3657188415527344,
404
+ "learning_rate": 1.2601352710458313e-05,
405
+ "loss": 0.3291,
406
+ "num_input_tokens_seen": 30976,
407
+ "step": 175
408
+ },
409
+ {
410
+ "epoch": 0.7228915662650602,
411
+ "grad_norm": 4.72028923034668,
412
+ "learning_rate": 1.1110744174509952e-05,
413
+ "loss": 0.369,
414
+ "num_input_tokens_seen": 31776,
415
+ "step": 180
416
+ },
417
+ {
418
+ "epoch": 0.7309236947791165,
419
+ "eval_loss": 0.34907668828964233,
420
+ "eval_runtime": 0.5768,
421
+ "eval_samples_per_second": 97.087,
422
+ "eval_steps_per_second": 48.544,
423
+ "num_input_tokens_seen": 32080,
424
+ "step": 182
425
+ },
426
+ {
427
+ "epoch": 0.7429718875502008,
428
+ "grad_norm": 7.622625827789307,
429
+ "learning_rate": 9.688407967401248e-06,
430
+ "loss": 0.3852,
431
+ "num_input_tokens_seen": 32608,
432
+ "step": 185
433
+ },
434
+ {
435
+ "epoch": 0.7630522088353414,
436
+ "grad_norm": 6.026548862457275,
437
+ "learning_rate": 8.341335551199902e-06,
438
+ "loss": 0.4115,
439
+ "num_input_tokens_seen": 33360,
440
+ "step": 190
441
+ },
442
+ {
443
+ "epoch": 0.7831325301204819,
444
+ "grad_norm": 7.148702621459961,
445
+ "learning_rate": 7.076148430479321e-06,
446
+ "loss": 0.3673,
447
+ "num_input_tokens_seen": 34176,
448
+ "step": 195
449
+ },
450
+ {
451
+ "epoch": 0.7831325301204819,
452
+ "eval_loss": 0.3541497588157654,
453
+ "eval_runtime": 0.58,
454
+ "eval_samples_per_second": 96.553,
455
+ "eval_steps_per_second": 48.277,
456
+ "num_input_tokens_seen": 34176,
457
+ "step": 195
458
+ },
459
+ {
460
+ "epoch": 0.8032128514056225,
461
+ "grad_norm": 1.6944422721862793,
462
+ "learning_rate": 5.899065604459814e-06,
463
+ "loss": 0.3583,
464
+ "num_input_tokens_seen": 34992,
465
+ "step": 200
466
+ },
467
+ {
468
+ "epoch": 0.8232931726907631,
469
+ "grad_norm": 1.7302725315093994,
470
+ "learning_rate": 4.81587299765594e-06,
471
+ "loss": 0.3675,
472
+ "num_input_tokens_seen": 35888,
473
+ "step": 205
474
+ },
475
+ {
476
+ "epoch": 0.8353413654618473,
477
+ "eval_loss": 0.3513210713863373,
478
+ "eval_runtime": 0.5991,
479
+ "eval_samples_per_second": 93.474,
480
+ "eval_steps_per_second": 46.737,
481
+ "num_input_tokens_seen": 36512,
482
+ "step": 208
483
+ },
484
+ {
485
+ "epoch": 0.8433734939759037,
486
+ "grad_norm": 1.571621060371399,
487
+ "learning_rate": 3.831895019292897e-06,
488
+ "loss": 0.3717,
489
+ "num_input_tokens_seen": 36848,
490
+ "step": 210
491
+ },
492
+ {
493
+ "epoch": 0.8634538152610441,
494
+ "grad_norm": 2.1745762825012207,
495
+ "learning_rate": 2.9519683912911266e-06,
496
+ "loss": 0.3723,
497
+ "num_input_tokens_seen": 37888,
498
+ "step": 215
499
+ },
500
+ {
501
+ "epoch": 0.8835341365461847,
502
+ "grad_norm": 1.900101900100708,
503
+ "learning_rate": 2.1804183734670277e-06,
504
+ "loss": 0.3634,
505
+ "num_input_tokens_seen": 38768,
506
+ "step": 220
507
+ },
508
+ {
509
+ "epoch": 0.8875502008032129,
510
+ "eval_loss": 0.3546585738658905,
511
+ "eval_runtime": 0.6095,
512
+ "eval_samples_per_second": 91.885,
513
+ "eval_steps_per_second": 45.943,
514
+ "num_input_tokens_seen": 38912,
515
+ "step": 221
516
+ },
517
+ {
518
+ "epoch": 0.9036144578313253,
519
+ "grad_norm": 1.1839439868927002,
520
+ "learning_rate": 1.5210375028143097e-06,
521
+ "loss": 0.3656,
522
+ "num_input_tokens_seen": 39488,
523
+ "step": 225
524
+ },
525
+ {
526
+ "epoch": 0.9236947791164659,
527
+ "grad_norm": 1.8801380395889282,
528
+ "learning_rate": 9.770669513725128e-07,
529
+ "loss": 0.3446,
530
+ "num_input_tokens_seen": 40336,
531
+ "step": 230
532
+ },
533
+ {
534
+ "epoch": 0.9397590361445783,
535
+ "eval_loss": 0.35187554359436035,
536
+ "eval_runtime": 0.5983,
537
+ "eval_samples_per_second": 93.594,
538
+ "eval_steps_per_second": 46.797,
539
+ "num_input_tokens_seen": 41120,
540
+ "step": 234
541
+ },
542
+ {
543
+ "epoch": 0.9437751004016064,
544
+ "grad_norm": 1.5287421941757202,
545
+ "learning_rate": 5.5118059431781e-07,
546
+ "loss": 0.3724,
547
+ "num_input_tokens_seen": 41328,
548
+ "step": 235
549
+ },
550
+ {
551
+ "epoch": 0.963855421686747,
552
+ "grad_norm": 1.8900800943374634,
553
+ "learning_rate": 2.454718665888589e-07,
554
+ "loss": 0.3493,
555
+ "num_input_tokens_seen": 42176,
556
+ "step": 240
557
+ },
558
+ {
559
+ "epoch": 0.9839357429718876,
560
+ "grad_norm": 1.2923225164413452,
561
+ "learning_rate": 6.14434726538493e-08,
562
+ "loss": 0.3364,
563
+ "num_input_tokens_seen": 43312,
564
+ "step": 245
565
+ },
566
+ {
567
+ "epoch": 0.9919678714859438,
568
+ "eval_loss": 0.3515866696834564,
569
+ "eval_runtime": 0.62,
570
+ "eval_samples_per_second": 90.326,
571
+ "eval_steps_per_second": 45.163,
572
+ "num_input_tokens_seen": 43600,
573
+ "step": 247
574
+ },
575
  {
576
  "epoch": 1.0,
577
+ "num_input_tokens_seen": 43904,
578
+ "step": 249,
579
+ "total_flos": 278458437992448.0,
580
+ "train_loss": 0.3984213411568638,
581
+ "train_runtime": 80.7936,
582
+ "train_samples_per_second": 6.164,
583
+ "train_steps_per_second": 3.082
584
  }
585
  ],
586
  "logging_steps": 5,
587
+ "max_steps": 249,
588
+ "num_input_tokens_seen": 43904,
589
  "num_train_epochs": 1,
590
+ "save_steps": 13,
591
  "stateful_callbacks": {
592
  "TrainerControl": {
593
  "args": {
 
600
  "attributes": {}
601
  }
602
  },
603
+ "total_flos": 278458437992448.0,
604
+ "train_batch_size": 2,
605
  "trial_name": null,
606
  "trial_params": null
607
  }
training_eval_loss.png CHANGED
training_loss.png CHANGED