gbemilekeonilude commited on
Commit
028cb8c
·
verified ·
1 Parent(s): e720f68

End of training

Browse files
Files changed (5) hide show
  1. README.md +1 -1
  2. all_results.json +9 -8
  3. eval_results.json +5 -5
  4. train_results.json +4 -4
  5. trainer_state.json +119 -119
README.md CHANGED
@@ -20,7 +20,7 @@ This model is a fine-tuned version of [openai-community/gpt2-medium](https://hug
20
  It achieves the following results on the evaluation set:
21
  - Loss: 1.6997
22
  - Accuracy: 0.2113
23
- - Num Input Tokens Seen: 1638400
24
 
25
  ## Model description
26
 
 
20
  It achieves the following results on the evaluation set:
21
  - Loss: 1.6997
22
  - Accuracy: 0.2113
23
+ - Num Input Tokens Seen: 1941504
24
 
25
  ## Model description
26
 
all_results.json CHANGED
@@ -1,16 +1,17 @@
1
  {
2
  "epoch": 3.0,
 
3
  "eval_f1": 0.2112676056338028,
4
- "eval_loss": 1.7046104669570923,
5
- "eval_runtime": 0.9636,
6
  "eval_samples": 71,
7
- "eval_samples_per_second": 73.685,
8
- "eval_steps_per_second": 9.34,
9
  "num_input_tokens_seen": 1941504,
10
  "total_flos": 3521692676653056.0,
11
- "train_loss": 2.4847586265596155,
12
- "train_runtime": 216.4531,
13
  "train_samples": 631,
14
- "train_samples_per_second": 8.746,
15
- "train_steps_per_second": 1.095
16
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_accuracy": 0.2112676056338028,
4
  "eval_f1": 0.2112676056338028,
5
+ "eval_loss": 1.6996692419052124,
6
+ "eval_runtime": 0.8945,
7
  "eval_samples": 71,
8
+ "eval_samples_per_second": 79.375,
9
+ "eval_steps_per_second": 10.062,
10
  "num_input_tokens_seen": 1941504,
11
  "total_flos": 3521692676653056.0,
12
+ "train_loss": 2.3182458032535602,
13
+ "train_runtime": 206.8614,
14
  "train_samples": 631,
15
+ "train_samples_per_second": 9.151,
16
+ "train_steps_per_second": 1.146
17
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_f1": 0.2112676056338028,
4
- "eval_loss": 1.7046104669570923,
5
- "eval_runtime": 0.9636,
6
  "eval_samples": 71,
7
- "eval_samples_per_second": 73.685,
8
- "eval_steps_per_second": 9.34,
9
  "num_input_tokens_seen": 1941504
10
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_accuracy": 0.2112676056338028,
4
+ "eval_loss": 1.6996692419052124,
5
+ "eval_runtime": 0.8945,
6
  "eval_samples": 71,
7
+ "eval_samples_per_second": 79.375,
8
+ "eval_steps_per_second": 10.062,
9
  "num_input_tokens_seen": 1941504
10
  }
train_results.json CHANGED
@@ -2,9 +2,9 @@
2
  "epoch": 3.0,
3
  "num_input_tokens_seen": 1941504,
4
  "total_flos": 3521692676653056.0,
5
- "train_loss": 2.4847586265596155,
6
- "train_runtime": 216.4531,
7
  "train_samples": 631,
8
- "train_samples_per_second": 8.746,
9
- "train_steps_per_second": 1.095
10
  }
 
2
  "epoch": 3.0,
3
  "num_input_tokens_seen": 1941504,
4
  "total_flos": 3521692676653056.0,
5
+ "train_loss": 2.3182458032535602,
6
+ "train_runtime": 206.8614,
7
  "train_samples": 631,
8
+ "train_samples_per_second": 9.151,
9
+ "train_steps_per_second": 1.146
10
  }
trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 1.7046104669570923,
3
  "best_model_checkpoint": "./python_and_text_gpt2/checkpoint-200",
4
  "epoch": 3.0,
5
  "eval_steps": 50,
@@ -10,11 +10,11 @@
10
  "log_history": [
11
  {
12
  "epoch": 0,
13
- "eval_f1": 0.11267605633802817,
14
  "eval_loss": 8.402484893798828,
15
- "eval_runtime": 1.189,
16
- "eval_samples_per_second": 59.712,
17
- "eval_steps_per_second": 7.569,
18
  "num_input_tokens_seen": 0,
19
  "step": 0
20
  },
@@ -44,401 +44,401 @@
44
  },
45
  {
46
  "epoch": 0.189873417721519,
47
- "grad_norm": 219.31243896484375,
48
  "learning_rate": 1.5000000000000002e-05,
49
- "loss": 8.4654,
50
  "num_input_tokens_seen": 122880,
51
  "step": 15
52
  },
53
  {
54
  "epoch": 0.25316455696202533,
55
- "grad_norm": 291.5434265136719,
56
  "learning_rate": 1.999153201672344e-05,
57
- "loss": 5.9405,
58
  "num_input_tokens_seen": 163840,
59
  "step": 20
60
  },
61
  {
62
  "epoch": 0.31645569620253167,
63
- "grad_norm": 60.928070068359375,
64
  "learning_rate": 1.9939835156657616e-05,
65
- "loss": 2.9225,
66
  "num_input_tokens_seen": 204800,
67
  "step": 25
68
  },
69
  {
70
  "epoch": 0.379746835443038,
71
- "grad_norm": 50.70585250854492,
72
  "learning_rate": 1.9841388720031727e-05,
73
- "loss": 2.6155,
74
  "num_input_tokens_seen": 245760,
75
  "step": 30
76
  },
77
  {
78
  "epoch": 0.4430379746835443,
79
- "grad_norm": 80.83404541015625,
80
  "learning_rate": 1.9696655725512933e-05,
81
- "loss": 3.1102,
82
  "num_input_tokens_seen": 286720,
83
  "step": 35
84
  },
85
  {
86
  "epoch": 0.5063291139240507,
87
- "grad_norm": 26.1557674407959,
88
  "learning_rate": 1.9506316889240027e-05,
89
- "loss": 2.0094,
90
  "num_input_tokens_seen": 327680,
91
  "step": 40
92
  },
93
  {
94
  "epoch": 0.569620253164557,
95
- "grad_norm": 40.397979736328125,
96
  "learning_rate": 1.9271267423242028e-05,
97
- "loss": 3.6116,
98
  "num_input_tokens_seen": 368640,
99
  "step": 45
100
  },
101
  {
102
  "epoch": 0.6329113924050633,
103
- "grad_norm": 16.823810577392578,
104
  "learning_rate": 1.8992612825027978e-05,
105
- "loss": 1.9331,
106
  "num_input_tokens_seen": 409600,
107
  "step": 50
108
  },
109
  {
110
  "epoch": 0.6329113924050633,
111
- "eval_f1": 0.18309859154929578,
112
- "eval_loss": 2.710301399230957,
113
- "eval_runtime": 0.9185,
114
- "eval_samples_per_second": 77.303,
115
- "eval_steps_per_second": 9.799,
116
  "num_input_tokens_seen": 409600,
117
  "step": 50
118
  },
119
  {
120
  "epoch": 0.6962025316455697,
121
- "grad_norm": 15.224857330322266,
122
  "learning_rate": 1.8671663678150605e-05,
123
- "loss": 1.8949,
124
  "num_input_tokens_seen": 450560,
125
  "step": 55
126
  },
127
  {
128
  "epoch": 0.759493670886076,
129
- "grad_norm": 83.90426635742188,
130
  "learning_rate": 1.8309929488198012e-05,
131
- "loss": 2.0895,
132
  "num_input_tokens_seen": 491520,
133
  "step": 60
134
  },
135
  {
136
  "epoch": 0.8227848101265823,
137
- "grad_norm": 18.31580924987793,
138
  "learning_rate": 1.790911158320442e-05,
139
- "loss": 2.098,
140
  "num_input_tokens_seen": 532480,
141
  "step": 65
142
  },
143
  {
144
  "epoch": 0.8860759493670886,
145
- "grad_norm": 87.63136291503906,
146
  "learning_rate": 1.7471095111871076e-05,
147
- "loss": 3.3778,
148
  "num_input_tokens_seen": 573440,
149
  "step": 70
150
  },
151
  {
152
  "epoch": 0.9493670886075949,
153
- "grad_norm": 10.84165096282959,
154
  "learning_rate": 1.6997940177231722e-05,
155
- "loss": 2.4451,
156
  "num_input_tokens_seen": 614400,
157
  "step": 75
158
  },
159
  {
160
  "epoch": 1.0126582278481013,
161
- "grad_norm": 16.434215545654297,
162
  "learning_rate": 1.6491872147463307e-05,
163
- "loss": 2.8367,
164
  "num_input_tokens_seen": 655360,
165
  "step": 80
166
  },
167
  {
168
  "epoch": 1.0759493670886076,
169
- "grad_norm": 95.65695190429688,
170
  "learning_rate": 1.5955271189412596e-05,
171
- "loss": 3.7962,
172
  "num_input_tokens_seen": 696320,
173
  "step": 85
174
  },
175
  {
176
  "epoch": 1.139240506329114,
177
- "grad_norm": 14.396378517150879,
178
  "learning_rate": 1.5390661074065257e-05,
179
- "loss": 1.9228,
180
  "num_input_tokens_seen": 737280,
181
  "step": 90
182
  },
183
  {
184
  "epoch": 1.2025316455696202,
185
- "grad_norm": 19.485475540161133,
186
  "learning_rate": 1.4800697306608043e-05,
187
- "loss": 2.3293,
188
  "num_input_tokens_seen": 778240,
189
  "step": 95
190
  },
191
  {
192
  "epoch": 1.2658227848101267,
193
- "grad_norm": 12.279613494873047,
194
  "learning_rate": 1.4188154636911524e-05,
195
- "loss": 1.627,
196
  "num_input_tokens_seen": 819200,
197
  "step": 100
198
  },
199
  {
200
  "epoch": 1.2658227848101267,
201
- "eval_f1": 0.19718309859154928,
202
- "eval_loss": 1.7709410190582275,
203
- "eval_runtime": 0.9341,
204
- "eval_samples_per_second": 76.012,
205
- "eval_steps_per_second": 9.635,
206
  "num_input_tokens_seen": 819200,
207
  "step": 100
208
  },
209
  {
210
  "epoch": 1.3291139240506329,
211
- "grad_norm": 10.757928848266602,
212
  "learning_rate": 1.3555914009174665e-05,
213
- "loss": 1.7464,
214
  "num_input_tokens_seen": 860160,
215
  "step": 105
216
  },
217
  {
218
  "epoch": 1.3924050632911391,
219
- "grad_norm": 19.480527877807617,
220
  "learning_rate": 1.2906949012110456e-05,
221
- "loss": 1.5746,
222
  "num_input_tokens_seen": 901120,
223
  "step": 110
224
  },
225
  {
226
  "epoch": 1.4556962025316456,
227
- "grad_norm": 8.611454963684082,
228
  "learning_rate": 1.2244311893400761e-05,
229
- "loss": 1.6549,
230
  "num_input_tokens_seen": 942080,
231
  "step": 115
232
  },
233
  {
234
  "epoch": 1.518987341772152,
235
- "grad_norm": 9.67941665649414,
236
  "learning_rate": 1.1571119204198038e-05,
237
- "loss": 1.6763,
238
  "num_input_tokens_seen": 983040,
239
  "step": 120
240
  },
241
  {
242
  "epoch": 1.5822784810126582,
243
- "grad_norm": 16.590065002441406,
244
  "learning_rate": 1.0890537141191417e-05,
245
- "loss": 1.7645,
246
  "num_input_tokens_seen": 1024000,
247
  "step": 125
248
  },
249
  {
250
  "epoch": 1.6455696202531644,
251
- "grad_norm": 16.423194885253906,
252
  "learning_rate": 1.0205766655177217e-05,
253
- "loss": 1.5635,
254
  "num_input_tokens_seen": 1064960,
255
  "step": 130
256
  },
257
  {
258
  "epoch": 1.7088607594936709,
259
- "grad_norm": 8.813819885253906,
260
  "learning_rate": 9.520028396172002e-06,
261
- "loss": 1.5885,
262
  "num_input_tokens_seen": 1105920,
263
  "step": 135
264
  },
265
  {
266
  "epoch": 1.7721518987341773,
267
- "grad_norm": 15.314586639404297,
268
  "learning_rate": 8.836547565875227e-06,
269
- "loss": 1.5926,
270
  "num_input_tokens_seen": 1146880,
271
  "step": 140
272
  },
273
  {
274
  "epoch": 1.8354430379746836,
275
- "grad_norm": 12.160784721374512,
276
  "learning_rate": 8.158538748724139e-06,
277
- "loss": 1.5661,
278
  "num_input_tokens_seen": 1187840,
279
  "step": 145
280
  },
281
  {
282
  "epoch": 1.8987341772151898,
283
- "grad_norm": 21.02486801147461,
284
  "learning_rate": 7.489190792884338e-06,
285
- "loss": 1.7811,
286
  "num_input_tokens_seen": 1228800,
287
  "step": 150
288
  },
289
  {
290
  "epoch": 1.8987341772151898,
291
- "eval_f1": 0.14084507042253522,
292
- "eval_loss": 1.7736163139343262,
293
- "eval_runtime": 0.9274,
294
- "eval_samples_per_second": 76.555,
295
- "eval_steps_per_second": 9.704,
296
  "num_input_tokens_seen": 1228800,
297
  "step": 150
298
  },
299
  {
300
  "epoch": 1.9620253164556962,
301
- "grad_norm": 9.002788543701172,
302
  "learning_rate": 6.831651812284652e-06,
303
- "loss": 1.5641,
304
  "num_input_tokens_seen": 1269760,
305
  "step": 155
306
  },
307
  {
308
  "epoch": 2.0253164556962027,
309
- "grad_norm": 17.318862915039062,
310
  "learning_rate": 6.18901438023543e-06,
311
- "loss": 1.6396,
312
  "num_input_tokens_seen": 1310720,
313
  "step": 160
314
  },
315
  {
316
  "epoch": 2.088607594936709,
317
- "grad_norm": 15.061686515808105,
318
  "learning_rate": 5.564300984268556e-06,
319
- "loss": 1.7634,
320
  "num_input_tokens_seen": 1351680,
321
  "step": 165
322
  },
323
  {
324
  "epoch": 2.151898734177215,
325
- "grad_norm": 11.59403133392334,
326
  "learning_rate": 4.960449810608705e-06,
327
- "loss": 1.6339,
328
  "num_input_tokens_seen": 1392640,
329
  "step": 170
330
  },
331
  {
332
  "epoch": 2.2151898734177213,
333
- "grad_norm": 14.045591354370117,
334
  "learning_rate": 4.380300925135138e-06,
335
- "loss": 1.6144,
336
  "num_input_tokens_seen": 1433600,
337
  "step": 175
338
  },
339
  {
340
  "epoch": 2.278481012658228,
341
- "grad_norm": 17.21590232849121,
342
  "learning_rate": 3.826582915828468e-06,
343
- "loss": 1.6045,
344
  "num_input_tokens_seen": 1474560,
345
  "step": 180
346
  },
347
  {
348
  "epoch": 2.3417721518987342,
349
- "grad_norm": 10.03845500946045,
350
  "learning_rate": 3.3019000595263573e-06,
351
- "loss": 1.6607,
352
  "num_input_tokens_seen": 1515520,
353
  "step": 185
354
  },
355
  {
356
  "epoch": 2.4050632911392404,
357
- "grad_norm": 6.748795509338379,
358
  "learning_rate": 2.8087200733462427e-06,
359
- "loss": 1.6225,
360
  "num_input_tokens_seen": 1556480,
361
  "step": 190
362
  },
363
  {
364
  "epoch": 2.4683544303797467,
365
- "grad_norm": 25.36518096923828,
366
  "learning_rate": 2.3493625083831217e-06,
367
- "loss": 1.5717,
368
  "num_input_tokens_seen": 1597440,
369
  "step": 195
370
  },
371
  {
372
  "epoch": 2.5316455696202533,
373
- "grad_norm": 11.94782829284668,
374
  "learning_rate": 1.9259878402699704e-06,
375
- "loss": 1.5969,
376
  "num_input_tokens_seen": 1638400,
377
  "step": 200
378
  },
379
  {
380
  "epoch": 2.5316455696202533,
381
- "eval_f1": 0.2112676056338028,
382
- "eval_loss": 1.7046104669570923,
383
- "eval_runtime": 0.9368,
384
- "eval_samples_per_second": 75.792,
385
- "eval_steps_per_second": 9.607,
386
  "num_input_tokens_seen": 1638400,
387
  "step": 200
388
  },
389
  {
390
  "epoch": 2.5949367088607596,
391
- "grad_norm": 12.540451049804688,
392
  "learning_rate": 1.5405873079105083e-06,
393
- "loss": 1.5583,
394
  "num_input_tokens_seen": 1679360,
395
  "step": 205
396
  },
397
  {
398
  "epoch": 2.6582278481012658,
399
- "grad_norm": 7.964724540710449,
400
  "learning_rate": 1.1949735481754565e-06,
401
- "loss": 1.6119,
402
  "num_input_tokens_seen": 1720320,
403
  "step": 210
404
  },
405
  {
406
  "epoch": 2.721518987341772,
407
- "grad_norm": 12.422355651855469,
408
  "learning_rate": 8.907720706096223e-07,
409
- "loss": 1.5886,
410
  "num_input_tokens_seen": 1761280,
411
  "step": 215
412
  },
413
  {
414
  "epoch": 2.7848101265822782,
415
- "grad_norm": 7.827919960021973,
416
  "learning_rate": 6.294136122464701e-07,
417
- "loss": 1.6044,
418
  "num_input_tokens_seen": 1802240,
419
  "step": 220
420
  },
421
  {
422
  "epoch": 2.848101265822785,
423
- "grad_norm": 14.147858619689941,
424
  "learning_rate": 4.121274084874194e-07,
425
- "loss": 1.6529,
426
  "num_input_tokens_seen": 1843200,
427
  "step": 225
428
  },
429
  {
430
  "epoch": 2.911392405063291,
431
- "grad_norm": 8.577438354492188,
432
  "learning_rate": 2.399354116946584e-07,
433
- "loss": 1.577,
434
  "num_input_tokens_seen": 1884160,
435
  "step": 230
436
  },
437
  {
438
  "epoch": 2.9746835443037973,
439
- "grad_norm": 15.957610130310059,
440
  "learning_rate": 1.1364748468886688e-07,
441
- "loss": 1.6043,
442
  "num_input_tokens_seen": 1925120,
443
  "step": 235
444
  },
@@ -447,10 +447,10 @@
447
  "num_input_tokens_seen": 1941504,
448
  "step": 237,
449
  "total_flos": 3521692676653056.0,
450
- "train_loss": 2.4847586265596155,
451
- "train_runtime": 216.4531,
452
- "train_samples_per_second": 8.746,
453
- "train_steps_per_second": 1.095
454
  }
455
  ],
456
  "logging_steps": 5,
 
1
  {
2
+ "best_metric": 1.6996692419052124,
3
  "best_model_checkpoint": "./python_and_text_gpt2/checkpoint-200",
4
  "epoch": 3.0,
5
  "eval_steps": 50,
 
10
  "log_history": [
11
  {
12
  "epoch": 0,
13
+ "eval_accuracy": 0.11267605633802817,
14
  "eval_loss": 8.402484893798828,
15
+ "eval_runtime": 1.1735,
16
+ "eval_samples_per_second": 60.505,
17
+ "eval_steps_per_second": 7.67,
18
  "num_input_tokens_seen": 0,
19
  "step": 0
20
  },
 
44
  },
45
  {
46
  "epoch": 0.189873417721519,
47
+ "grad_norm": 218.1217803955078,
48
  "learning_rate": 1.5000000000000002e-05,
49
+ "loss": 8.4642,
50
  "num_input_tokens_seen": 122880,
51
  "step": 15
52
  },
53
  {
54
  "epoch": 0.25316455696202533,
55
+ "grad_norm": 296.8869934082031,
56
  "learning_rate": 1.999153201672344e-05,
57
+ "loss": 5.9304,
58
  "num_input_tokens_seen": 163840,
59
  "step": 20
60
  },
61
  {
62
  "epoch": 0.31645569620253167,
63
+ "grad_norm": 61.51658630371094,
64
  "learning_rate": 1.9939835156657616e-05,
65
+ "loss": 2.9233,
66
  "num_input_tokens_seen": 204800,
67
  "step": 25
68
  },
69
  {
70
  "epoch": 0.379746835443038,
71
+ "grad_norm": 40.88030242919922,
72
  "learning_rate": 1.9841388720031727e-05,
73
+ "loss": 2.6465,
74
  "num_input_tokens_seen": 245760,
75
  "step": 30
76
  },
77
  {
78
  "epoch": 0.4430379746835443,
79
+ "grad_norm": 47.827491760253906,
80
  "learning_rate": 1.9696655725512933e-05,
81
+ "loss": 2.7127,
82
  "num_input_tokens_seen": 286720,
83
  "step": 35
84
  },
85
  {
86
  "epoch": 0.5063291139240507,
87
+ "grad_norm": 20.431304931640625,
88
  "learning_rate": 1.9506316889240027e-05,
89
+ "loss": 1.8677,
90
  "num_input_tokens_seen": 327680,
91
  "step": 40
92
  },
93
  {
94
  "epoch": 0.569620253164557,
95
+ "grad_norm": 29.551557540893555,
96
  "learning_rate": 1.9271267423242028e-05,
97
+ "loss": 2.5811,
98
  "num_input_tokens_seen": 368640,
99
  "step": 45
100
  },
101
  {
102
  "epoch": 0.6329113924050633,
103
+ "grad_norm": 16.11282730102539,
104
  "learning_rate": 1.8992612825027978e-05,
105
+ "loss": 2.0715,
106
  "num_input_tokens_seen": 409600,
107
  "step": 50
108
  },
109
  {
110
  "epoch": 0.6329113924050633,
111
+ "eval_accuracy": 0.18309859154929578,
112
+ "eval_loss": 1.8793463706970215,
113
+ "eval_runtime": 0.9197,
114
+ "eval_samples_per_second": 77.198,
115
+ "eval_steps_per_second": 9.786,
116
  "num_input_tokens_seen": 409600,
117
  "step": 50
118
  },
119
  {
120
  "epoch": 0.6962025316455697,
121
+ "grad_norm": 19.049699783325195,
122
  "learning_rate": 1.8671663678150605e-05,
123
+ "loss": 1.8742,
124
  "num_input_tokens_seen": 450560,
125
  "step": 55
126
  },
127
  {
128
  "epoch": 0.759493670886076,
129
+ "grad_norm": 13.142491340637207,
130
  "learning_rate": 1.8309929488198012e-05,
131
+ "loss": 1.5956,
132
  "num_input_tokens_seen": 491520,
133
  "step": 60
134
  },
135
  {
136
  "epoch": 0.8227848101265823,
137
+ "grad_norm": 19.499195098876953,
138
  "learning_rate": 1.790911158320442e-05,
139
+ "loss": 2.0677,
140
  "num_input_tokens_seen": 532480,
141
  "step": 65
142
  },
143
  {
144
  "epoch": 0.8860759493670886,
145
+ "grad_norm": 13.254035949707031,
146
  "learning_rate": 1.7471095111871076e-05,
147
+ "loss": 1.8408,
148
  "num_input_tokens_seen": 573440,
149
  "step": 70
150
  },
151
  {
152
  "epoch": 0.9493670886075949,
153
+ "grad_norm": 13.480071067810059,
154
  "learning_rate": 1.6997940177231722e-05,
155
+ "loss": 1.73,
156
  "num_input_tokens_seen": 614400,
157
  "step": 75
158
  },
159
  {
160
  "epoch": 1.0126582278481013,
161
+ "grad_norm": 16.7222900390625,
162
  "learning_rate": 1.6491872147463307e-05,
163
+ "loss": 2.0164,
164
  "num_input_tokens_seen": 655360,
165
  "step": 80
166
  },
167
  {
168
  "epoch": 1.0759493670886076,
169
+ "grad_norm": 20.90951156616211,
170
  "learning_rate": 1.5955271189412596e-05,
171
+ "loss": 1.838,
172
  "num_input_tokens_seen": 696320,
173
  "step": 85
174
  },
175
  {
176
  "epoch": 1.139240506329114,
177
+ "grad_norm": 20.80415153503418,
178
  "learning_rate": 1.5390661074065257e-05,
179
+ "loss": 1.8573,
180
  "num_input_tokens_seen": 737280,
181
  "step": 90
182
  },
183
  {
184
  "epoch": 1.2025316455696202,
185
+ "grad_norm": 21.243846893310547,
186
  "learning_rate": 1.4800697306608043e-05,
187
+ "loss": 1.7605,
188
  "num_input_tokens_seen": 778240,
189
  "step": 95
190
  },
191
  {
192
  "epoch": 1.2658227848101267,
193
+ "grad_norm": 9.170429229736328,
194
  "learning_rate": 1.4188154636911524e-05,
195
+ "loss": 1.5544,
196
  "num_input_tokens_seen": 819200,
197
  "step": 100
198
  },
199
  {
200
  "epoch": 1.2658227848101267,
201
+ "eval_accuracy": 0.2112676056338028,
202
+ "eval_loss": 1.8253157138824463,
203
+ "eval_runtime": 0.9307,
204
+ "eval_samples_per_second": 76.288,
205
+ "eval_steps_per_second": 9.67,
206
  "num_input_tokens_seen": 819200,
207
  "step": 100
208
  },
209
  {
210
  "epoch": 1.3291139240506329,
211
+ "grad_norm": 11.778152465820312,
212
  "learning_rate": 1.3555914009174665e-05,
213
+ "loss": 1.7069,
214
  "num_input_tokens_seen": 860160,
215
  "step": 105
216
  },
217
  {
218
  "epoch": 1.3924050632911391,
219
+ "grad_norm": 17.081523895263672,
220
  "learning_rate": 1.2906949012110456e-05,
221
+ "loss": 1.5678,
222
  "num_input_tokens_seen": 901120,
223
  "step": 110
224
  },
225
  {
226
  "epoch": 1.4556962025316456,
227
+ "grad_norm": 8.573921203613281,
228
  "learning_rate": 1.2244311893400761e-05,
229
+ "loss": 1.6158,
230
  "num_input_tokens_seen": 942080,
231
  "step": 115
232
  },
233
  {
234
  "epoch": 1.518987341772152,
235
+ "grad_norm": 10.426651000976562,
236
  "learning_rate": 1.1571119204198038e-05,
237
+ "loss": 1.6879,
238
  "num_input_tokens_seen": 983040,
239
  "step": 120
240
  },
241
  {
242
  "epoch": 1.5822784810126582,
243
+ "grad_norm": 16.324264526367188,
244
  "learning_rate": 1.0890537141191417e-05,
245
+ "loss": 1.7929,
246
  "num_input_tokens_seen": 1024000,
247
  "step": 125
248
  },
249
  {
250
  "epoch": 1.6455696202531644,
251
+ "grad_norm": 16.601455688476562,
252
  "learning_rate": 1.0205766655177217e-05,
253
+ "loss": 1.5528,
254
  "num_input_tokens_seen": 1064960,
255
  "step": 130
256
  },
257
  {
258
  "epoch": 1.7088607594936709,
259
+ "grad_norm": 8.302882194519043,
260
  "learning_rate": 9.520028396172002e-06,
261
+ "loss": 1.6046,
262
  "num_input_tokens_seen": 1105920,
263
  "step": 135
264
  },
265
  {
266
  "epoch": 1.7721518987341773,
267
+ "grad_norm": 14.192557334899902,
268
  "learning_rate": 8.836547565875227e-06,
269
+ "loss": 1.5904,
270
  "num_input_tokens_seen": 1146880,
271
  "step": 140
272
  },
273
  {
274
  "epoch": 1.8354430379746836,
275
+ "grad_norm": 11.201437950134277,
276
  "learning_rate": 8.158538748724139e-06,
277
+ "loss": 1.5891,
278
  "num_input_tokens_seen": 1187840,
279
  "step": 145
280
  },
281
  {
282
  "epoch": 1.8987341772151898,
283
+ "grad_norm": 20.305402755737305,
284
  "learning_rate": 7.489190792884338e-06,
285
+ "loss": 1.7587,
286
  "num_input_tokens_seen": 1228800,
287
  "step": 150
288
  },
289
  {
290
  "epoch": 1.8987341772151898,
291
+ "eval_accuracy": 0.14084507042253522,
292
+ "eval_loss": 1.7602745294570923,
293
+ "eval_runtime": 0.938,
294
+ "eval_samples_per_second": 75.69,
295
+ "eval_steps_per_second": 9.594,
296
  "num_input_tokens_seen": 1228800,
297
  "step": 150
298
  },
299
  {
300
  "epoch": 1.9620253164556962,
301
+ "grad_norm": 8.459460258483887,
302
  "learning_rate": 6.831651812284652e-06,
303
+ "loss": 1.5431,
304
  "num_input_tokens_seen": 1269760,
305
  "step": 155
306
  },
307
  {
308
  "epoch": 2.0253164556962027,
309
+ "grad_norm": 15.566550254821777,
310
  "learning_rate": 6.18901438023543e-06,
311
+ "loss": 1.6141,
312
  "num_input_tokens_seen": 1310720,
313
  "step": 160
314
  },
315
  {
316
  "epoch": 2.088607594936709,
317
+ "grad_norm": 15.074378967285156,
318
  "learning_rate": 5.564300984268556e-06,
319
+ "loss": 1.7919,
320
  "num_input_tokens_seen": 1351680,
321
  "step": 165
322
  },
323
  {
324
  "epoch": 2.151898734177215,
325
+ "grad_norm": 12.48886775970459,
326
  "learning_rate": 4.960449810608705e-06,
327
+ "loss": 1.6343,
328
  "num_input_tokens_seen": 1392640,
329
  "step": 170
330
  },
331
  {
332
  "epoch": 2.2151898734177213,
333
+ "grad_norm": 13.26822566986084,
334
  "learning_rate": 4.380300925135138e-06,
335
+ "loss": 1.6183,
336
  "num_input_tokens_seen": 1433600,
337
  "step": 175
338
  },
339
  {
340
  "epoch": 2.278481012658228,
341
+ "grad_norm": 16.358903884887695,
342
  "learning_rate": 3.826582915828468e-06,
343
+ "loss": 1.5828,
344
  "num_input_tokens_seen": 1474560,
345
  "step": 180
346
  },
347
  {
348
  "epoch": 2.3417721518987342,
349
+ "grad_norm": 9.541784286499023,
350
  "learning_rate": 3.3019000595263573e-06,
351
+ "loss": 1.6376,
352
  "num_input_tokens_seen": 1515520,
353
  "step": 185
354
  },
355
  {
356
  "epoch": 2.4050632911392404,
357
+ "grad_norm": 7.097592830657959,
358
  "learning_rate": 2.8087200733462427e-06,
359
+ "loss": 1.6106,
360
  "num_input_tokens_seen": 1556480,
361
  "step": 190
362
  },
363
  {
364
  "epoch": 2.4683544303797467,
365
+ "grad_norm": 21.592899322509766,
366
  "learning_rate": 2.3493625083831217e-06,
367
+ "loss": 1.571,
368
  "num_input_tokens_seen": 1597440,
369
  "step": 195
370
  },
371
  {
372
  "epoch": 2.5316455696202533,
373
+ "grad_norm": 11.701766967773438,
374
  "learning_rate": 1.9259878402699704e-06,
375
+ "loss": 1.5982,
376
  "num_input_tokens_seen": 1638400,
377
  "step": 200
378
  },
379
  {
380
  "epoch": 2.5316455696202533,
381
+ "eval_accuracy": 0.2112676056338028,
382
+ "eval_loss": 1.6996692419052124,
383
+ "eval_runtime": 0.9416,
384
+ "eval_samples_per_second": 75.403,
385
+ "eval_steps_per_second": 9.558,
386
  "num_input_tokens_seen": 1638400,
387
  "step": 200
388
  },
389
  {
390
  "epoch": 2.5949367088607596,
391
+ "grad_norm": 11.591459274291992,
392
  "learning_rate": 1.5405873079105083e-06,
393
+ "loss": 1.5655,
394
  "num_input_tokens_seen": 1679360,
395
  "step": 205
396
  },
397
  {
398
  "epoch": 2.6582278481012658,
399
+ "grad_norm": 7.575652599334717,
400
  "learning_rate": 1.1949735481754565e-06,
401
+ "loss": 1.5902,
402
  "num_input_tokens_seen": 1720320,
403
  "step": 210
404
  },
405
  {
406
  "epoch": 2.721518987341772,
407
+ "grad_norm": 11.8199462890625,
408
  "learning_rate": 8.907720706096223e-07,
409
+ "loss": 1.5801,
410
  "num_input_tokens_seen": 1761280,
411
  "step": 215
412
  },
413
  {
414
  "epoch": 2.7848101265822782,
415
+ "grad_norm": 8.641218185424805,
416
  "learning_rate": 6.294136122464701e-07,
417
+ "loss": 1.5755,
418
  "num_input_tokens_seen": 1802240,
419
  "step": 220
420
  },
421
  {
422
  "epoch": 2.848101265822785,
423
+ "grad_norm": 13.54507064819336,
424
  "learning_rate": 4.121274084874194e-07,
425
+ "loss": 1.6523,
426
  "num_input_tokens_seen": 1843200,
427
  "step": 225
428
  },
429
  {
430
  "epoch": 2.911392405063291,
431
+ "grad_norm": 8.669917106628418,
432
  "learning_rate": 2.399354116946584e-07,
433
+ "loss": 1.5707,
434
  "num_input_tokens_seen": 1884160,
435
  "step": 230
436
  },
437
  {
438
  "epoch": 2.9746835443037973,
439
+ "grad_norm": 15.19518756866455,
440
  "learning_rate": 1.1364748468886688e-07,
441
+ "loss": 1.5997,
442
  "num_input_tokens_seen": 1925120,
443
  "step": 235
444
  },
 
447
  "num_input_tokens_seen": 1941504,
448
  "step": 237,
449
  "total_flos": 3521692676653056.0,
450
+ "train_loss": 2.3182458032535602,
451
+ "train_runtime": 206.8614,
452
+ "train_samples_per_second": 9.151,
453
+ "train_steps_per_second": 1.146
454
  }
455
  ],
456
  "logging_steps": 5,