irodkin commited on
Commit
2661c68
·
verified ·
1 Parent(s): 9c9fff8

Training checkpoint at step 1000

Browse files
Files changed (1) hide show
  1. trainer_state.json +121 -121
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 1000,
3
- "best_metric": 2.493894577026367,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_30/checkpoint-1000",
5
  "epoch": 0.02,
6
  "eval_steps": 100,
@@ -11,362 +11,362 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.0005,
14
- "grad_norm": 1.374199618670654,
15
  "learning_rate": 4.8e-08,
16
- "loss": 3.8192,
17
  "step": 25
18
  },
19
  {
20
  "epoch": 0.001,
21
- "grad_norm": 1.0626796430084513,
22
  "learning_rate": 9.8e-08,
23
- "loss": 3.7481,
24
  "step": 50
25
  },
26
  {
27
  "epoch": 0.0015,
28
- "grad_norm": 0.554900729210382,
29
  "learning_rate": 1.4800000000000003e-07,
30
- "loss": 3.5249,
31
  "step": 75
32
  },
33
  {
34
  "epoch": 0.002,
35
- "grad_norm": 0.407340177021375,
36
  "learning_rate": 1.9800000000000003e-07,
37
- "loss": 3.3424,
38
  "step": 100
39
  },
40
  {
41
  "epoch": 0.002,
42
- "eval_loss": 3.072638988494873,
43
- "eval_runtime": 33.0855,
44
- "eval_samples_per_second": 3.536,
45
- "eval_steps_per_second": 1.783,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.0025,
50
- "grad_norm": 0.22554288925827917,
51
  "learning_rate": 2.48e-07,
52
- "loss": 3.1284,
53
  "step": 125
54
  },
55
  {
56
  "epoch": 0.003,
57
- "grad_norm": 0.14377524138618566,
58
  "learning_rate": 2.9800000000000005e-07,
59
- "loss": 2.9699,
60
  "step": 150
61
  },
62
  {
63
  "epoch": 0.0035,
64
- "grad_norm": 0.1273100096023732,
65
  "learning_rate": 3.48e-07,
66
- "loss": 2.8784,
67
  "step": 175
68
  },
69
  {
70
  "epoch": 0.004,
71
- "grad_norm": 0.08052145135761532,
72
  "learning_rate": 3.9800000000000004e-07,
73
- "loss": 2.808,
74
  "step": 200
75
  },
76
  {
77
  "epoch": 0.004,
78
- "eval_loss": 2.7464711666107178,
79
- "eval_runtime": 33.2528,
80
- "eval_samples_per_second": 3.519,
81
- "eval_steps_per_second": 1.774,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.0045,
86
- "grad_norm": 0.09601426553193865,
87
  "learning_rate": 4.4800000000000004e-07,
88
- "loss": 2.7735,
89
  "step": 225
90
  },
91
  {
92
  "epoch": 0.005,
93
- "grad_norm": 0.06772677130883735,
94
  "learning_rate": 4.98e-07,
95
- "loss": 2.7358,
96
  "step": 250
97
  },
98
  {
99
  "epoch": 0.0055,
100
- "grad_norm": 0.0617749171010752,
101
  "learning_rate": 5.480000000000001e-07,
102
- "loss": 2.7143,
103
  "step": 275
104
  },
105
  {
106
  "epoch": 0.006,
107
- "grad_norm": 0.06081364438555446,
108
  "learning_rate": 5.98e-07,
109
- "loss": 2.695,
110
  "step": 300
111
  },
112
  {
113
  "epoch": 0.006,
114
- "eval_loss": 2.658987045288086,
115
- "eval_runtime": 33.098,
116
- "eval_samples_per_second": 3.535,
117
- "eval_steps_per_second": 1.783,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.0065,
122
- "grad_norm": 0.055229056926588416,
123
  "learning_rate": 6.48e-07,
124
- "loss": 2.6775,
125
  "step": 325
126
  },
127
  {
128
  "epoch": 0.007,
129
- "grad_norm": 0.05285547880508365,
130
  "learning_rate": 6.98e-07,
131
- "loss": 2.6498,
132
  "step": 350
133
  },
134
  {
135
  "epoch": 0.0075,
136
- "grad_norm": 0.05321418646538081,
137
  "learning_rate": 7.480000000000001e-07,
138
- "loss": 2.6375,
139
  "step": 375
140
  },
141
  {
142
  "epoch": 0.008,
143
- "grad_norm": 0.046163922554101317,
144
  "learning_rate": 7.98e-07,
145
- "loss": 2.6273,
146
  "step": 400
147
  },
148
  {
149
  "epoch": 0.008,
150
- "eval_loss": 2.5985612869262695,
151
- "eval_runtime": 33.8223,
152
- "eval_samples_per_second": 3.459,
153
- "eval_steps_per_second": 1.744,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 0.0085,
158
- "grad_norm": 0.05184119325112733,
159
  "learning_rate": 8.480000000000001e-07,
160
- "loss": 2.6037,
161
  "step": 425
162
  },
163
  {
164
  "epoch": 0.009,
165
- "grad_norm": 0.036013105129600216,
166
  "learning_rate": 8.980000000000001e-07,
167
- "loss": 2.594,
168
  "step": 450
169
  },
170
  {
171
  "epoch": 0.0095,
172
- "grad_norm": 0.03061363860030697,
173
  "learning_rate": 9.480000000000001e-07,
174
- "loss": 2.5746,
175
  "step": 475
176
  },
177
  {
178
  "epoch": 0.01,
179
- "grad_norm": 0.036970324693471876,
180
  "learning_rate": 9.98e-07,
181
- "loss": 2.5827,
182
  "step": 500
183
  },
184
  {
185
  "epoch": 0.01,
186
- "eval_loss": 2.56477952003479,
187
- "eval_runtime": 33.0671,
188
- "eval_samples_per_second": 3.538,
189
- "eval_steps_per_second": 1.784,
190
  "step": 500
191
  },
192
  {
193
  "epoch": 0.0105,
194
- "grad_norm": 0.03044033343054017,
195
  "learning_rate": 1.0480000000000002e-06,
196
- "loss": 2.5717,
197
  "step": 525
198
  },
199
  {
200
  "epoch": 0.011,
201
- "grad_norm": 0.03340669717167394,
202
  "learning_rate": 1.0980000000000001e-06,
203
- "loss": 2.5613,
204
  "step": 550
205
  },
206
  {
207
  "epoch": 0.0115,
208
- "grad_norm": 0.029909971322257495,
209
  "learning_rate": 1.148e-06,
210
- "loss": 2.5661,
211
  "step": 575
212
  },
213
  {
214
  "epoch": 0.012,
215
- "grad_norm": 0.028901093383770705,
216
  "learning_rate": 1.1980000000000002e-06,
217
- "loss": 2.561,
218
  "step": 600
219
  },
220
  {
221
  "epoch": 0.012,
222
- "eval_loss": 2.5446865558624268,
223
- "eval_runtime": 33.1445,
224
- "eval_samples_per_second": 3.53,
225
- "eval_steps_per_second": 1.78,
226
  "step": 600
227
  },
228
  {
229
  "epoch": 0.0125,
230
- "grad_norm": 0.0338582199400455,
231
  "learning_rate": 1.248e-06,
232
- "loss": 2.5392,
233
  "step": 625
234
  },
235
  {
236
  "epoch": 0.013,
237
- "grad_norm": 0.031107046362937877,
238
  "learning_rate": 1.2980000000000001e-06,
239
- "loss": 2.5378,
240
  "step": 650
241
  },
242
  {
243
  "epoch": 0.0135,
244
- "grad_norm": 0.027777474622611625,
245
  "learning_rate": 1.348e-06,
246
- "loss": 2.5421,
247
  "step": 675
248
  },
249
  {
250
  "epoch": 0.014,
251
- "grad_norm": 0.032848272502404616,
252
  "learning_rate": 1.3980000000000002e-06,
253
- "loss": 2.5345,
254
  "step": 700
255
  },
256
  {
257
  "epoch": 0.014,
258
- "eval_loss": 2.5294137001037598,
259
- "eval_runtime": 33.222,
260
- "eval_samples_per_second": 3.522,
261
- "eval_steps_per_second": 1.776,
262
  "step": 700
263
  },
264
  {
265
  "epoch": 0.0145,
266
- "grad_norm": 0.028260965292318807,
267
  "learning_rate": 1.4480000000000002e-06,
268
- "loss": 2.5342,
269
  "step": 725
270
  },
271
  {
272
  "epoch": 0.015,
273
- "grad_norm": 0.02748431921263886,
274
  "learning_rate": 1.498e-06,
275
- "loss": 2.5188,
276
  "step": 750
277
  },
278
  {
279
  "epoch": 0.0155,
280
- "grad_norm": 0.02859453618814513,
281
  "learning_rate": 1.548e-06,
282
- "loss": 2.5245,
283
  "step": 775
284
  },
285
  {
286
  "epoch": 0.016,
287
- "grad_norm": 0.03906649589898274,
288
  "learning_rate": 1.5980000000000002e-06,
289
- "loss": 2.5142,
290
  "step": 800
291
  },
292
  {
293
  "epoch": 0.016,
294
- "eval_loss": 2.5156726837158203,
295
- "eval_runtime": 33.2465,
296
- "eval_samples_per_second": 3.519,
297
- "eval_steps_per_second": 1.775,
298
  "step": 800
299
  },
300
  {
301
  "epoch": 0.0165,
302
- "grad_norm": 0.03055728445213633,
303
  "learning_rate": 1.6480000000000001e-06,
304
- "loss": 2.504,
305
  "step": 825
306
  },
307
  {
308
  "epoch": 0.017,
309
- "grad_norm": 0.03831919004049627,
310
  "learning_rate": 1.6980000000000003e-06,
311
- "loss": 2.5096,
312
  "step": 850
313
  },
314
  {
315
  "epoch": 0.0175,
316
- "grad_norm": 0.04714764947462498,
317
  "learning_rate": 1.7480000000000002e-06,
318
- "loss": 2.5057,
319
  "step": 875
320
  },
321
  {
322
  "epoch": 0.018,
323
- "grad_norm": 0.04480333925801958,
324
  "learning_rate": 1.798e-06,
325
- "loss": 2.4949,
326
  "step": 900
327
  },
328
  {
329
  "epoch": 0.018,
330
- "eval_loss": 2.503145456314087,
331
- "eval_runtime": 33.3398,
332
- "eval_samples_per_second": 3.509,
333
- "eval_steps_per_second": 1.77,
334
  "step": 900
335
  },
336
  {
337
  "epoch": 0.0185,
338
- "grad_norm": 0.04514734300904146,
339
  "learning_rate": 1.8480000000000001e-06,
340
- "loss": 2.5044,
341
  "step": 925
342
  },
343
  {
344
  "epoch": 0.019,
345
- "grad_norm": 0.03664477032679196,
346
  "learning_rate": 1.898e-06,
347
- "loss": 2.4857,
348
  "step": 950
349
  },
350
  {
351
  "epoch": 0.0195,
352
- "grad_norm": 0.03891788038244039,
353
  "learning_rate": 1.9480000000000002e-06,
354
- "loss": 2.4954,
355
  "step": 975
356
  },
357
  {
358
  "epoch": 0.02,
359
- "grad_norm": 0.041701680819843504,
360
  "learning_rate": 1.998e-06,
361
- "loss": 2.4935,
362
  "step": 1000
363
  },
364
  {
365
  "epoch": 0.02,
366
- "eval_loss": 2.493894577026367,
367
- "eval_runtime": 33.4036,
368
- "eval_samples_per_second": 3.503,
369
- "eval_steps_per_second": 1.766,
370
  "step": 1000
371
  }
372
  ],
 
1
  {
2
  "best_global_step": 1000,
3
+ "best_metric": 2.491666078567505,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_30/checkpoint-1000",
5
  "epoch": 0.02,
6
  "eval_steps": 100,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.0005,
14
+ "grad_norm": 2.7020849153690363,
15
  "learning_rate": 4.8e-08,
16
+ "loss": 4.4151,
17
  "step": 25
18
  },
19
  {
20
  "epoch": 0.001,
21
+ "grad_norm": 1.7714713388908587,
22
  "learning_rate": 9.8e-08,
23
+ "loss": 4.2692,
24
  "step": 50
25
  },
26
  {
27
  "epoch": 0.0015,
28
+ "grad_norm": 1.0569441206778722,
29
  "learning_rate": 1.4800000000000003e-07,
30
+ "loss": 3.9071,
31
  "step": 75
32
  },
33
  {
34
  "epoch": 0.002,
35
+ "grad_norm": 0.43396234020605096,
36
  "learning_rate": 1.9800000000000003e-07,
37
+ "loss": 3.4257,
38
  "step": 100
39
  },
40
  {
41
  "epoch": 0.002,
42
+ "eval_loss": 3.1404547691345215,
43
+ "eval_runtime": 33.0346,
44
+ "eval_samples_per_second": 3.542,
45
+ "eval_steps_per_second": 1.786,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.0025,
50
+ "grad_norm": 0.2585925841488232,
51
  "learning_rate": 2.48e-07,
52
+ "loss": 3.1579,
53
  "step": 125
54
  },
55
  {
56
  "epoch": 0.003,
57
+ "grad_norm": 0.15143157149805395,
58
  "learning_rate": 2.9800000000000005e-07,
59
+ "loss": 2.9738,
60
  "step": 150
61
  },
62
  {
63
  "epoch": 0.0035,
64
+ "grad_norm": 0.1080278835540699,
65
  "learning_rate": 3.48e-07,
66
+ "loss": 2.8727,
67
  "step": 175
68
  },
69
  {
70
  "epoch": 0.004,
71
+ "grad_norm": 0.09698869766195446,
72
  "learning_rate": 3.9800000000000004e-07,
73
+ "loss": 2.827,
74
  "step": 200
75
  },
76
  {
77
  "epoch": 0.004,
78
+ "eval_loss": 2.752889633178711,
79
+ "eval_runtime": 33.2978,
80
+ "eval_samples_per_second": 3.514,
81
+ "eval_steps_per_second": 1.772,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.0045,
86
+ "grad_norm": 0.07244257780140247,
87
  "learning_rate": 4.4800000000000004e-07,
88
+ "loss": 2.773,
89
  "step": 225
90
  },
91
  {
92
  "epoch": 0.005,
93
+ "grad_norm": 0.06884256698341579,
94
  "learning_rate": 4.98e-07,
95
+ "loss": 2.7357,
96
  "step": 250
97
  },
98
  {
99
  "epoch": 0.0055,
100
+ "grad_norm": 0.097508726424133,
101
  "learning_rate": 5.480000000000001e-07,
102
+ "loss": 2.708,
103
  "step": 275
104
  },
105
  {
106
  "epoch": 0.006,
107
+ "grad_norm": 0.05004869412393589,
108
  "learning_rate": 5.98e-07,
109
+ "loss": 2.6786,
110
  "step": 300
111
  },
112
  {
113
  "epoch": 0.006,
114
+ "eval_loss": 2.647900342941284,
115
+ "eval_runtime": 33.2991,
116
+ "eval_samples_per_second": 3.514,
117
+ "eval_steps_per_second": 1.772,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.0065,
122
+ "grad_norm": 0.049132610170334615,
123
  "learning_rate": 6.48e-07,
124
+ "loss": 2.6474,
125
  "step": 325
126
  },
127
  {
128
  "epoch": 0.007,
129
+ "grad_norm": 0.05729778967483004,
130
  "learning_rate": 6.98e-07,
131
+ "loss": 2.6357,
132
  "step": 350
133
  },
134
  {
135
  "epoch": 0.0075,
136
+ "grad_norm": 0.04063098299936942,
137
  "learning_rate": 7.480000000000001e-07,
138
+ "loss": 2.6253,
139
  "step": 375
140
  },
141
  {
142
  "epoch": 0.008,
143
+ "grad_norm": 0.034498073897365616,
144
  "learning_rate": 7.98e-07,
145
+ "loss": 2.6076,
146
  "step": 400
147
  },
148
  {
149
  "epoch": 0.008,
150
+ "eval_loss": 2.594409942626953,
151
+ "eval_runtime": 34.2954,
152
+ "eval_samples_per_second": 3.412,
153
+ "eval_steps_per_second": 1.72,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 0.0085,
158
+ "grad_norm": 0.03558666298221422,
159
  "learning_rate": 8.480000000000001e-07,
160
+ "loss": 2.595,
161
  "step": 425
162
  },
163
  {
164
  "epoch": 0.009,
165
+ "grad_norm": 0.03443154792307346,
166
  "learning_rate": 8.980000000000001e-07,
167
+ "loss": 2.5908,
168
  "step": 450
169
  },
170
  {
171
  "epoch": 0.0095,
172
+ "grad_norm": 0.035488270944549226,
173
  "learning_rate": 9.480000000000001e-07,
174
+ "loss": 2.5809,
175
  "step": 475
176
  },
177
  {
178
  "epoch": 0.01,
179
+ "grad_norm": 0.03146469333633836,
180
  "learning_rate": 9.98e-07,
181
+ "loss": 2.5736,
182
  "step": 500
183
  },
184
  {
185
  "epoch": 0.01,
186
+ "eval_loss": 2.5649118423461914,
187
+ "eval_runtime": 36.3209,
188
+ "eval_samples_per_second": 3.221,
189
+ "eval_steps_per_second": 1.624,
190
  "step": 500
191
  },
192
  {
193
  "epoch": 0.0105,
194
+ "grad_norm": 0.03114420601995518,
195
  "learning_rate": 1.0480000000000002e-06,
196
+ "loss": 2.5671,
197
  "step": 525
198
  },
199
  {
200
  "epoch": 0.011,
201
+ "grad_norm": 0.03096542621853569,
202
  "learning_rate": 1.0980000000000001e-06,
203
+ "loss": 2.5558,
204
  "step": 550
205
  },
206
  {
207
  "epoch": 0.0115,
208
+ "grad_norm": 0.02905782871061764,
209
  "learning_rate": 1.148e-06,
210
+ "loss": 2.5623,
211
  "step": 575
212
  },
213
  {
214
  "epoch": 0.012,
215
+ "grad_norm": 0.030562740052257713,
216
  "learning_rate": 1.1980000000000002e-06,
217
+ "loss": 2.5622,
218
  "step": 600
219
  },
220
  {
221
  "epoch": 0.012,
222
+ "eval_loss": 2.54438853263855,
223
+ "eval_runtime": 36.9048,
224
+ "eval_samples_per_second": 3.17,
225
+ "eval_steps_per_second": 1.599,
226
  "step": 600
227
  },
228
  {
229
  "epoch": 0.0125,
230
+ "grad_norm": 0.03153013886632261,
231
  "learning_rate": 1.248e-06,
232
+ "loss": 2.5404,
233
  "step": 625
234
  },
235
  {
236
  "epoch": 0.013,
237
+ "grad_norm": 0.028752715448972253,
238
  "learning_rate": 1.2980000000000001e-06,
239
+ "loss": 2.5396,
240
  "step": 650
241
  },
242
  {
243
  "epoch": 0.0135,
244
+ "grad_norm": 0.03011440752674912,
245
  "learning_rate": 1.348e-06,
246
+ "loss": 2.5418,
247
  "step": 675
248
  },
249
  {
250
  "epoch": 0.014,
251
+ "grad_norm": 0.027934694405631223,
252
  "learning_rate": 1.3980000000000002e-06,
253
+ "loss": 2.5371,
254
  "step": 700
255
  },
256
  {
257
  "epoch": 0.014,
258
+ "eval_loss": 2.529193878173828,
259
+ "eval_runtime": 41.1238,
260
+ "eval_samples_per_second": 2.845,
261
+ "eval_steps_per_second": 1.435,
262
  "step": 700
263
  },
264
  {
265
  "epoch": 0.0145,
266
+ "grad_norm": 0.029595976023437127,
267
  "learning_rate": 1.4480000000000002e-06,
268
+ "loss": 2.5285,
269
  "step": 725
270
  },
271
  {
272
  "epoch": 0.015,
273
+ "grad_norm": 0.03539675413826323,
274
  "learning_rate": 1.498e-06,
275
+ "loss": 2.5286,
276
  "step": 750
277
  },
278
  {
279
  "epoch": 0.0155,
280
+ "grad_norm": 0.03363430055312599,
281
  "learning_rate": 1.548e-06,
282
+ "loss": 2.5146,
283
  "step": 775
284
  },
285
  {
286
  "epoch": 0.016,
287
+ "grad_norm": 0.036143105087014814,
288
  "learning_rate": 1.5980000000000002e-06,
289
+ "loss": 2.505,
290
  "step": 800
291
  },
292
  {
293
  "epoch": 0.016,
294
+ "eval_loss": 2.5139412879943848,
295
+ "eval_runtime": 41.1683,
296
+ "eval_samples_per_second": 2.842,
297
+ "eval_steps_per_second": 1.433,
298
  "step": 800
299
  },
300
  {
301
  "epoch": 0.0165,
302
+ "grad_norm": 0.07023101199739942,
303
  "learning_rate": 1.6480000000000001e-06,
304
+ "loss": 2.5168,
305
  "step": 825
306
  },
307
  {
308
  "epoch": 0.017,
309
+ "grad_norm": 0.036491334947115234,
310
  "learning_rate": 1.6980000000000003e-06,
311
+ "loss": 2.5088,
312
  "step": 850
313
  },
314
  {
315
  "epoch": 0.0175,
316
+ "grad_norm": 0.052452197634130976,
317
  "learning_rate": 1.7480000000000002e-06,
318
+ "loss": 2.4912,
319
  "step": 875
320
  },
321
  {
322
  "epoch": 0.018,
323
+ "grad_norm": 0.04670910122190947,
324
  "learning_rate": 1.798e-06,
325
+ "loss": 2.4976,
326
  "step": 900
327
  },
328
  {
329
  "epoch": 0.018,
330
+ "eval_loss": 2.5007853507995605,
331
+ "eval_runtime": 38.6574,
332
+ "eval_samples_per_second": 3.027,
333
+ "eval_steps_per_second": 1.526,
334
  "step": 900
335
  },
336
  {
337
  "epoch": 0.0185,
338
+ "grad_norm": 0.054668821609945045,
339
  "learning_rate": 1.8480000000000001e-06,
340
+ "loss": 2.4915,
341
  "step": 925
342
  },
343
  {
344
  "epoch": 0.019,
345
+ "grad_norm": 0.048767134507416036,
346
  "learning_rate": 1.898e-06,
347
+ "loss": 2.488,
348
  "step": 950
349
  },
350
  {
351
  "epoch": 0.0195,
352
+ "grad_norm": 0.04356515009200933,
353
  "learning_rate": 1.9480000000000002e-06,
354
+ "loss": 2.4904,
355
  "step": 975
356
  },
357
  {
358
  "epoch": 0.02,
359
+ "grad_norm": 0.0393758269000268,
360
  "learning_rate": 1.998e-06,
361
+ "loss": 2.4859,
362
  "step": 1000
363
  },
364
  {
365
  "epoch": 0.02,
366
+ "eval_loss": 2.491666078567505,
367
+ "eval_runtime": 40.9073,
368
+ "eval_samples_per_second": 2.86,
369
+ "eval_steps_per_second": 1.442,
370
  "step": 1000
371
  }
372
  ],