irodkin commited on
Commit
e595725
·
verified ·
1 Parent(s): 7f77219

Training checkpoint at step 1000

Browse files
Files changed (1) hide show
  1. trainer_state.json +121 -121
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 1000,
3
- "best_metric": 2.491666078567505,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_30/checkpoint-1000",
5
  "epoch": 0.02,
6
  "eval_steps": 100,
@@ -11,362 +11,362 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.0005,
14
- "grad_norm": 2.7020849153690363,
15
  "learning_rate": 4.8e-08,
16
- "loss": 4.4151,
17
  "step": 25
18
  },
19
  {
20
  "epoch": 0.001,
21
- "grad_norm": 1.7714713388908587,
22
  "learning_rate": 9.8e-08,
23
- "loss": 4.2692,
24
  "step": 50
25
  },
26
  {
27
  "epoch": 0.0015,
28
- "grad_norm": 1.0569441206778722,
29
  "learning_rate": 1.4800000000000003e-07,
30
- "loss": 3.9071,
31
  "step": 75
32
  },
33
  {
34
  "epoch": 0.002,
35
- "grad_norm": 0.43396234020605096,
36
  "learning_rate": 1.9800000000000003e-07,
37
- "loss": 3.4257,
38
  "step": 100
39
  },
40
  {
41
  "epoch": 0.002,
42
- "eval_loss": 3.1404547691345215,
43
- "eval_runtime": 33.0346,
44
- "eval_samples_per_second": 3.542,
45
- "eval_steps_per_second": 1.786,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.0025,
50
- "grad_norm": 0.2585925841488232,
51
  "learning_rate": 2.48e-07,
52
- "loss": 3.1579,
53
  "step": 125
54
  },
55
  {
56
  "epoch": 0.003,
57
- "grad_norm": 0.15143157149805395,
58
  "learning_rate": 2.9800000000000005e-07,
59
- "loss": 2.9738,
60
  "step": 150
61
  },
62
  {
63
  "epoch": 0.0035,
64
- "grad_norm": 0.1080278835540699,
65
  "learning_rate": 3.48e-07,
66
- "loss": 2.8727,
67
  "step": 175
68
  },
69
  {
70
  "epoch": 0.004,
71
- "grad_norm": 0.09698869766195446,
72
  "learning_rate": 3.9800000000000004e-07,
73
- "loss": 2.827,
74
  "step": 200
75
  },
76
  {
77
  "epoch": 0.004,
78
- "eval_loss": 2.752889633178711,
79
- "eval_runtime": 33.2978,
80
- "eval_samples_per_second": 3.514,
81
- "eval_steps_per_second": 1.772,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.0045,
86
- "grad_norm": 0.07244257780140247,
87
  "learning_rate": 4.4800000000000004e-07,
88
- "loss": 2.773,
89
  "step": 225
90
  },
91
  {
92
  "epoch": 0.005,
93
- "grad_norm": 0.06884256698341579,
94
  "learning_rate": 4.98e-07,
95
- "loss": 2.7357,
96
  "step": 250
97
  },
98
  {
99
  "epoch": 0.0055,
100
- "grad_norm": 0.097508726424133,
101
  "learning_rate": 5.480000000000001e-07,
102
- "loss": 2.708,
103
  "step": 275
104
  },
105
  {
106
  "epoch": 0.006,
107
- "grad_norm": 0.05004869412393589,
108
  "learning_rate": 5.98e-07,
109
- "loss": 2.6786,
110
  "step": 300
111
  },
112
  {
113
  "epoch": 0.006,
114
- "eval_loss": 2.647900342941284,
115
- "eval_runtime": 33.2991,
116
- "eval_samples_per_second": 3.514,
117
- "eval_steps_per_second": 1.772,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.0065,
122
- "grad_norm": 0.049132610170334615,
123
  "learning_rate": 6.48e-07,
124
- "loss": 2.6474,
125
  "step": 325
126
  },
127
  {
128
  "epoch": 0.007,
129
- "grad_norm": 0.05729778967483004,
130
  "learning_rate": 6.98e-07,
131
- "loss": 2.6357,
132
  "step": 350
133
  },
134
  {
135
  "epoch": 0.0075,
136
- "grad_norm": 0.04063098299936942,
137
  "learning_rate": 7.480000000000001e-07,
138
- "loss": 2.6253,
139
  "step": 375
140
  },
141
  {
142
  "epoch": 0.008,
143
- "grad_norm": 0.034498073897365616,
144
  "learning_rate": 7.98e-07,
145
- "loss": 2.6076,
146
  "step": 400
147
  },
148
  {
149
  "epoch": 0.008,
150
- "eval_loss": 2.594409942626953,
151
- "eval_runtime": 34.2954,
152
- "eval_samples_per_second": 3.412,
153
- "eval_steps_per_second": 1.72,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 0.0085,
158
- "grad_norm": 0.03558666298221422,
159
  "learning_rate": 8.480000000000001e-07,
160
- "loss": 2.595,
161
  "step": 425
162
  },
163
  {
164
  "epoch": 0.009,
165
- "grad_norm": 0.03443154792307346,
166
  "learning_rate": 8.980000000000001e-07,
167
- "loss": 2.5908,
168
  "step": 450
169
  },
170
  {
171
  "epoch": 0.0095,
172
- "grad_norm": 0.035488270944549226,
173
  "learning_rate": 9.480000000000001e-07,
174
- "loss": 2.5809,
175
  "step": 475
176
  },
177
  {
178
  "epoch": 0.01,
179
- "grad_norm": 0.03146469333633836,
180
  "learning_rate": 9.98e-07,
181
- "loss": 2.5736,
182
  "step": 500
183
  },
184
  {
185
  "epoch": 0.01,
186
- "eval_loss": 2.5649118423461914,
187
- "eval_runtime": 36.3209,
188
- "eval_samples_per_second": 3.221,
189
- "eval_steps_per_second": 1.624,
190
  "step": 500
191
  },
192
  {
193
  "epoch": 0.0105,
194
- "grad_norm": 0.03114420601995518,
195
  "learning_rate": 1.0480000000000002e-06,
196
- "loss": 2.5671,
197
  "step": 525
198
  },
199
  {
200
  "epoch": 0.011,
201
- "grad_norm": 0.03096542621853569,
202
  "learning_rate": 1.0980000000000001e-06,
203
- "loss": 2.5558,
204
  "step": 550
205
  },
206
  {
207
  "epoch": 0.0115,
208
- "grad_norm": 0.02905782871061764,
209
  "learning_rate": 1.148e-06,
210
- "loss": 2.5623,
211
  "step": 575
212
  },
213
  {
214
  "epoch": 0.012,
215
- "grad_norm": 0.030562740052257713,
216
  "learning_rate": 1.1980000000000002e-06,
217
- "loss": 2.5622,
218
  "step": 600
219
  },
220
  {
221
  "epoch": 0.012,
222
- "eval_loss": 2.54438853263855,
223
- "eval_runtime": 36.9048,
224
- "eval_samples_per_second": 3.17,
225
- "eval_steps_per_second": 1.599,
226
  "step": 600
227
  },
228
  {
229
  "epoch": 0.0125,
230
- "grad_norm": 0.03153013886632261,
231
  "learning_rate": 1.248e-06,
232
- "loss": 2.5404,
233
  "step": 625
234
  },
235
  {
236
  "epoch": 0.013,
237
- "grad_norm": 0.028752715448972253,
238
  "learning_rate": 1.2980000000000001e-06,
239
- "loss": 2.5396,
240
  "step": 650
241
  },
242
  {
243
  "epoch": 0.0135,
244
- "grad_norm": 0.03011440752674912,
245
  "learning_rate": 1.348e-06,
246
- "loss": 2.5418,
247
  "step": 675
248
  },
249
  {
250
  "epoch": 0.014,
251
- "grad_norm": 0.027934694405631223,
252
  "learning_rate": 1.3980000000000002e-06,
253
- "loss": 2.5371,
254
  "step": 700
255
  },
256
  {
257
  "epoch": 0.014,
258
- "eval_loss": 2.529193878173828,
259
- "eval_runtime": 41.1238,
260
- "eval_samples_per_second": 2.845,
261
- "eval_steps_per_second": 1.435,
262
  "step": 700
263
  },
264
  {
265
  "epoch": 0.0145,
266
- "grad_norm": 0.029595976023437127,
267
  "learning_rate": 1.4480000000000002e-06,
268
- "loss": 2.5285,
269
  "step": 725
270
  },
271
  {
272
  "epoch": 0.015,
273
- "grad_norm": 0.03539675413826323,
274
  "learning_rate": 1.498e-06,
275
- "loss": 2.5286,
276
  "step": 750
277
  },
278
  {
279
  "epoch": 0.0155,
280
- "grad_norm": 0.03363430055312599,
281
  "learning_rate": 1.548e-06,
282
- "loss": 2.5146,
283
  "step": 775
284
  },
285
  {
286
  "epoch": 0.016,
287
- "grad_norm": 0.036143105087014814,
288
  "learning_rate": 1.5980000000000002e-06,
289
- "loss": 2.505,
290
  "step": 800
291
  },
292
  {
293
  "epoch": 0.016,
294
- "eval_loss": 2.5139412879943848,
295
- "eval_runtime": 41.1683,
296
- "eval_samples_per_second": 2.842,
297
- "eval_steps_per_second": 1.433,
298
  "step": 800
299
  },
300
  {
301
  "epoch": 0.0165,
302
- "grad_norm": 0.07023101199739942,
303
  "learning_rate": 1.6480000000000001e-06,
304
- "loss": 2.5168,
305
  "step": 825
306
  },
307
  {
308
  "epoch": 0.017,
309
- "grad_norm": 0.036491334947115234,
310
  "learning_rate": 1.6980000000000003e-06,
311
- "loss": 2.5088,
312
  "step": 850
313
  },
314
  {
315
  "epoch": 0.0175,
316
- "grad_norm": 0.052452197634130976,
317
  "learning_rate": 1.7480000000000002e-06,
318
- "loss": 2.4912,
319
  "step": 875
320
  },
321
  {
322
  "epoch": 0.018,
323
- "grad_norm": 0.04670910122190947,
324
  "learning_rate": 1.798e-06,
325
- "loss": 2.4976,
326
  "step": 900
327
  },
328
  {
329
  "epoch": 0.018,
330
- "eval_loss": 2.5007853507995605,
331
- "eval_runtime": 38.6574,
332
- "eval_samples_per_second": 3.027,
333
- "eval_steps_per_second": 1.526,
334
  "step": 900
335
  },
336
  {
337
  "epoch": 0.0185,
338
- "grad_norm": 0.054668821609945045,
339
  "learning_rate": 1.8480000000000001e-06,
340
- "loss": 2.4915,
341
  "step": 925
342
  },
343
  {
344
  "epoch": 0.019,
345
- "grad_norm": 0.048767134507416036,
346
  "learning_rate": 1.898e-06,
347
- "loss": 2.488,
348
  "step": 950
349
  },
350
  {
351
  "epoch": 0.0195,
352
- "grad_norm": 0.04356515009200933,
353
  "learning_rate": 1.9480000000000002e-06,
354
- "loss": 2.4904,
355
  "step": 975
356
  },
357
  {
358
  "epoch": 0.02,
359
- "grad_norm": 0.0393758269000268,
360
  "learning_rate": 1.998e-06,
361
- "loss": 2.4859,
362
  "step": 1000
363
  },
364
  {
365
  "epoch": 0.02,
366
- "eval_loss": 2.491666078567505,
367
- "eval_runtime": 40.9073,
368
- "eval_samples_per_second": 2.86,
369
- "eval_steps_per_second": 1.442,
370
  "step": 1000
371
  }
372
  ],
 
1
  {
2
  "best_global_step": 1000,
3
+ "best_metric": 2.4966063499450684,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_30/checkpoint-1000",
5
  "epoch": 0.02,
6
  "eval_steps": 100,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.0005,
14
+ "grad_norm": 2.624103276270124,
15
  "learning_rate": 4.8e-08,
16
+ "loss": 4.0893,
17
  "step": 25
18
  },
19
  {
20
  "epoch": 0.001,
21
+ "grad_norm": 1.3629568986234561,
22
  "learning_rate": 9.8e-08,
23
+ "loss": 3.9543,
24
  "step": 50
25
  },
26
  {
27
  "epoch": 0.0015,
28
+ "grad_norm": 0.8050128701430977,
29
  "learning_rate": 1.4800000000000003e-07,
30
+ "loss": 3.6763,
31
  "step": 75
32
  },
33
  {
34
  "epoch": 0.002,
35
+ "grad_norm": 0.3690286383727022,
36
  "learning_rate": 1.9800000000000003e-07,
37
+ "loss": 3.327,
38
  "step": 100
39
  },
40
  {
41
  "epoch": 0.002,
42
+ "eval_loss": 3.100055694580078,
43
+ "eval_runtime": 32.7706,
44
+ "eval_samples_per_second": 3.57,
45
+ "eval_steps_per_second": 1.8,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.0025,
50
+ "grad_norm": 0.24011694167100578,
51
  "learning_rate": 2.48e-07,
52
+ "loss": 3.1322,
53
  "step": 125
54
  },
55
  {
56
  "epoch": 0.003,
57
+ "grad_norm": 0.149511940963387,
58
  "learning_rate": 2.9800000000000005e-07,
59
+ "loss": 2.9672,
60
  "step": 150
61
  },
62
  {
63
  "epoch": 0.0035,
64
+ "grad_norm": 0.10071711520195754,
65
  "learning_rate": 3.48e-07,
66
+ "loss": 2.8684,
67
  "step": 175
68
  },
69
  {
70
  "epoch": 0.004,
71
+ "grad_norm": 0.09695377414070089,
72
  "learning_rate": 3.9800000000000004e-07,
73
+ "loss": 2.8244,
74
  "step": 200
75
  },
76
  {
77
  "epoch": 0.004,
78
+ "eval_loss": 2.7518060207366943,
79
+ "eval_runtime": 32.9203,
80
+ "eval_samples_per_second": 3.554,
81
+ "eval_steps_per_second": 1.792,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.0045,
86
+ "grad_norm": 0.06541174981920718,
87
  "learning_rate": 4.4800000000000004e-07,
88
+ "loss": 2.7736,
89
  "step": 225
90
  },
91
  {
92
  "epoch": 0.005,
93
+ "grad_norm": 0.061297886999798934,
94
  "learning_rate": 4.98e-07,
95
+ "loss": 2.7392,
96
  "step": 250
97
  },
98
  {
99
  "epoch": 0.0055,
100
+ "grad_norm": 0.07881073149840945,
101
  "learning_rate": 5.480000000000001e-07,
102
+ "loss": 2.7194,
103
  "step": 275
104
  },
105
  {
106
  "epoch": 0.006,
107
+ "grad_norm": 0.05125386617161651,
108
  "learning_rate": 5.98e-07,
109
+ "loss": 2.6982,
110
  "step": 300
111
  },
112
  {
113
  "epoch": 0.006,
114
+ "eval_loss": 2.6622018814086914,
115
+ "eval_runtime": 32.9076,
116
+ "eval_samples_per_second": 3.555,
117
+ "eval_steps_per_second": 1.793,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.0065,
122
+ "grad_norm": 0.04659366450077996,
123
  "learning_rate": 6.48e-07,
124
+ "loss": 2.6725,
125
  "step": 325
126
  },
127
  {
128
  "epoch": 0.007,
129
+ "grad_norm": 0.04588097652548341,
130
  "learning_rate": 6.98e-07,
131
+ "loss": 2.6592,
132
  "step": 350
133
  },
134
  {
135
  "epoch": 0.0075,
136
+ "grad_norm": 0.058421958212028904,
137
  "learning_rate": 7.480000000000001e-07,
138
+ "loss": 2.6481,
139
  "step": 375
140
  },
141
  {
142
  "epoch": 0.008,
143
+ "grad_norm": 0.04289575736155661,
144
  "learning_rate": 7.98e-07,
145
+ "loss": 2.6257,
146
  "step": 400
147
  },
148
  {
149
  "epoch": 0.008,
150
+ "eval_loss": 2.6052613258361816,
151
+ "eval_runtime": 32.8227,
152
+ "eval_samples_per_second": 3.565,
153
+ "eval_steps_per_second": 1.798,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 0.0085,
158
+ "grad_norm": 0.041602666338794385,
159
  "learning_rate": 8.480000000000001e-07,
160
+ "loss": 2.6089,
161
  "step": 425
162
  },
163
  {
164
  "epoch": 0.009,
165
+ "grad_norm": 0.040090024026539266,
166
  "learning_rate": 8.980000000000001e-07,
167
+ "loss": 2.5985,
168
  "step": 450
169
  },
170
  {
171
  "epoch": 0.0095,
172
+ "grad_norm": 0.05346463020318845,
173
  "learning_rate": 9.480000000000001e-07,
174
+ "loss": 2.5858,
175
  "step": 475
176
  },
177
  {
178
  "epoch": 0.01,
179
+ "grad_norm": 0.03240197247016216,
180
  "learning_rate": 9.98e-07,
181
+ "loss": 2.5773,
182
  "step": 500
183
  },
184
  {
185
  "epoch": 0.01,
186
+ "eval_loss": 2.5677218437194824,
187
+ "eval_runtime": 32.9146,
188
+ "eval_samples_per_second": 3.555,
189
+ "eval_steps_per_second": 1.793,
190
  "step": 500
191
  },
192
  {
193
  "epoch": 0.0105,
194
+ "grad_norm": 0.030627609315729644,
195
  "learning_rate": 1.0480000000000002e-06,
196
+ "loss": 2.5695,
197
  "step": 525
198
  },
199
  {
200
  "epoch": 0.011,
201
+ "grad_norm": 0.03146801435404312,
202
  "learning_rate": 1.0980000000000001e-06,
203
+ "loss": 2.558,
204
  "step": 550
205
  },
206
  {
207
  "epoch": 0.0115,
208
+ "grad_norm": 0.028453864143727626,
209
  "learning_rate": 1.148e-06,
210
+ "loss": 2.5645,
211
  "step": 575
212
  },
213
  {
214
  "epoch": 0.012,
215
+ "grad_norm": 0.03026805511159676,
216
  "learning_rate": 1.1980000000000002e-06,
217
+ "loss": 2.5645,
218
  "step": 600
219
  },
220
  {
221
  "epoch": 0.012,
222
+ "eval_loss": 2.546586275100708,
223
+ "eval_runtime": 32.8424,
224
+ "eval_samples_per_second": 3.562,
225
+ "eval_steps_per_second": 1.796,
226
  "step": 600
227
  },
228
  {
229
  "epoch": 0.0125,
230
+ "grad_norm": 0.032033771539522,
231
  "learning_rate": 1.248e-06,
232
+ "loss": 2.5424,
233
  "step": 625
234
  },
235
  {
236
  "epoch": 0.013,
237
+ "grad_norm": 0.0281966122475446,
238
  "learning_rate": 1.2980000000000001e-06,
239
+ "loss": 2.5409,
240
  "step": 650
241
  },
242
  {
243
  "epoch": 0.0135,
244
+ "grad_norm": 0.02887428243284281,
245
  "learning_rate": 1.348e-06,
246
+ "loss": 2.543,
247
  "step": 675
248
  },
249
  {
250
  "epoch": 0.014,
251
+ "grad_norm": 0.027672621753278132,
252
  "learning_rate": 1.3980000000000002e-06,
253
+ "loss": 2.5385,
254
  "step": 700
255
  },
256
  {
257
  "epoch": 0.014,
258
+ "eval_loss": 2.530237913131714,
259
+ "eval_runtime": 32.7994,
260
+ "eval_samples_per_second": 3.567,
261
+ "eval_steps_per_second": 1.799,
262
  "step": 700
263
  },
264
  {
265
  "epoch": 0.0145,
266
+ "grad_norm": 0.030815191380069624,
267
  "learning_rate": 1.4480000000000002e-06,
268
+ "loss": 2.5302,
269
  "step": 725
270
  },
271
  {
272
  "epoch": 0.015,
273
+ "grad_norm": 0.0336387385604783,
274
  "learning_rate": 1.498e-06,
275
+ "loss": 2.531,
276
  "step": 750
277
  },
278
  {
279
  "epoch": 0.0155,
280
+ "grad_norm": 0.02858543320323233,
281
  "learning_rate": 1.548e-06,
282
+ "loss": 2.5184,
283
  "step": 775
284
  },
285
  {
286
  "epoch": 0.016,
287
+ "grad_norm": 0.028120393653995705,
288
  "learning_rate": 1.5980000000000002e-06,
289
+ "loss": 2.5101,
290
  "step": 800
291
  },
292
  {
293
  "epoch": 0.016,
294
+ "eval_loss": 2.5182888507843018,
295
+ "eval_runtime": 33.2135,
296
+ "eval_samples_per_second": 3.523,
297
+ "eval_steps_per_second": 1.776,
298
  "step": 800
299
  },
300
  {
301
  "epoch": 0.0165,
302
+ "grad_norm": 0.03014167593156162,
303
  "learning_rate": 1.6480000000000001e-06,
304
+ "loss": 2.5232,
305
  "step": 825
306
  },
307
  {
308
  "epoch": 0.017,
309
+ "grad_norm": 0.028528349033195077,
310
  "learning_rate": 1.6980000000000003e-06,
311
+ "loss": 2.5162,
312
  "step": 850
313
  },
314
  {
315
  "epoch": 0.0175,
316
+ "grad_norm": 0.031230193601244804,
317
  "learning_rate": 1.7480000000000002e-06,
318
+ "loss": 2.4995,
319
  "step": 875
320
  },
321
  {
322
  "epoch": 0.018,
323
+ "grad_norm": 0.03555060954716827,
324
  "learning_rate": 1.798e-06,
325
+ "loss": 2.5064,
326
  "step": 900
327
  },
328
  {
329
  "epoch": 0.018,
330
+ "eval_loss": 2.5070879459381104,
331
+ "eval_runtime": 33.3807,
332
+ "eval_samples_per_second": 3.505,
333
+ "eval_steps_per_second": 1.767,
334
  "step": 900
335
  },
336
  {
337
  "epoch": 0.0185,
338
+ "grad_norm": 0.03561871969060444,
339
  "learning_rate": 1.8480000000000001e-06,
340
+ "loss": 2.5004,
341
  "step": 925
342
  },
343
  {
344
  "epoch": 0.019,
345
+ "grad_norm": 0.03094584673111385,
346
  "learning_rate": 1.898e-06,
347
+ "loss": 2.4959,
348
  "step": 950
349
  },
350
  {
351
  "epoch": 0.0195,
352
+ "grad_norm": 0.035545021685136444,
353
  "learning_rate": 1.9480000000000002e-06,
354
+ "loss": 2.4982,
355
  "step": 975
356
  },
357
  {
358
  "epoch": 0.02,
359
+ "grad_norm": 0.0370422613473599,
360
  "learning_rate": 1.998e-06,
361
+ "loss": 2.4927,
362
  "step": 1000
363
  },
364
  {
365
  "epoch": 0.02,
366
+ "eval_loss": 2.4966063499450684,
367
+ "eval_runtime": 33.3038,
368
+ "eval_samples_per_second": 3.513,
369
+ "eval_steps_per_second": 1.772,
370
  "step": 1000
371
  }
372
  ],