irodkin commited on
Commit
17ae09b
·
verified ·
1 Parent(s): 530fb77

Training checkpoint at step 1000

Browse files
Files changed (1) hide show
  1. trainer_state.json +394 -0
trainer_state.json ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1000,
3
+ "best_metric": 2.493894577026367,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_30/checkpoint-1000",
5
+ "epoch": 0.02,
6
+ "eval_steps": 100,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0005,
14
+ "grad_norm": 1.374199618670654,
15
+ "learning_rate": 4.8e-08,
16
+ "loss": 3.8192,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.001,
21
+ "grad_norm": 1.0626796430084513,
22
+ "learning_rate": 9.8e-08,
23
+ "loss": 3.7481,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.0015,
28
+ "grad_norm": 0.554900729210382,
29
+ "learning_rate": 1.4800000000000003e-07,
30
+ "loss": 3.5249,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.002,
35
+ "grad_norm": 0.407340177021375,
36
+ "learning_rate": 1.9800000000000003e-07,
37
+ "loss": 3.3424,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.002,
42
+ "eval_loss": 3.072638988494873,
43
+ "eval_runtime": 33.0855,
44
+ "eval_samples_per_second": 3.536,
45
+ "eval_steps_per_second": 1.783,
46
+ "step": 100
47
+ },
48
+ {
49
+ "epoch": 0.0025,
50
+ "grad_norm": 0.22554288925827917,
51
+ "learning_rate": 2.48e-07,
52
+ "loss": 3.1284,
53
+ "step": 125
54
+ },
55
+ {
56
+ "epoch": 0.003,
57
+ "grad_norm": 0.14377524138618566,
58
+ "learning_rate": 2.9800000000000005e-07,
59
+ "loss": 2.9699,
60
+ "step": 150
61
+ },
62
+ {
63
+ "epoch": 0.0035,
64
+ "grad_norm": 0.1273100096023732,
65
+ "learning_rate": 3.48e-07,
66
+ "loss": 2.8784,
67
+ "step": 175
68
+ },
69
+ {
70
+ "epoch": 0.004,
71
+ "grad_norm": 0.08052145135761532,
72
+ "learning_rate": 3.9800000000000004e-07,
73
+ "loss": 2.808,
74
+ "step": 200
75
+ },
76
+ {
77
+ "epoch": 0.004,
78
+ "eval_loss": 2.7464711666107178,
79
+ "eval_runtime": 33.2528,
80
+ "eval_samples_per_second": 3.519,
81
+ "eval_steps_per_second": 1.774,
82
+ "step": 200
83
+ },
84
+ {
85
+ "epoch": 0.0045,
86
+ "grad_norm": 0.09601426553193865,
87
+ "learning_rate": 4.4800000000000004e-07,
88
+ "loss": 2.7735,
89
+ "step": 225
90
+ },
91
+ {
92
+ "epoch": 0.005,
93
+ "grad_norm": 0.06772677130883735,
94
+ "learning_rate": 4.98e-07,
95
+ "loss": 2.7358,
96
+ "step": 250
97
+ },
98
+ {
99
+ "epoch": 0.0055,
100
+ "grad_norm": 0.0617749171010752,
101
+ "learning_rate": 5.480000000000001e-07,
102
+ "loss": 2.7143,
103
+ "step": 275
104
+ },
105
+ {
106
+ "epoch": 0.006,
107
+ "grad_norm": 0.06081364438555446,
108
+ "learning_rate": 5.98e-07,
109
+ "loss": 2.695,
110
+ "step": 300
111
+ },
112
+ {
113
+ "epoch": 0.006,
114
+ "eval_loss": 2.658987045288086,
115
+ "eval_runtime": 33.098,
116
+ "eval_samples_per_second": 3.535,
117
+ "eval_steps_per_second": 1.783,
118
+ "step": 300
119
+ },
120
+ {
121
+ "epoch": 0.0065,
122
+ "grad_norm": 0.055229056926588416,
123
+ "learning_rate": 6.48e-07,
124
+ "loss": 2.6775,
125
+ "step": 325
126
+ },
127
+ {
128
+ "epoch": 0.007,
129
+ "grad_norm": 0.05285547880508365,
130
+ "learning_rate": 6.98e-07,
131
+ "loss": 2.6498,
132
+ "step": 350
133
+ },
134
+ {
135
+ "epoch": 0.0075,
136
+ "grad_norm": 0.05321418646538081,
137
+ "learning_rate": 7.480000000000001e-07,
138
+ "loss": 2.6375,
139
+ "step": 375
140
+ },
141
+ {
142
+ "epoch": 0.008,
143
+ "grad_norm": 0.046163922554101317,
144
+ "learning_rate": 7.98e-07,
145
+ "loss": 2.6273,
146
+ "step": 400
147
+ },
148
+ {
149
+ "epoch": 0.008,
150
+ "eval_loss": 2.5985612869262695,
151
+ "eval_runtime": 33.8223,
152
+ "eval_samples_per_second": 3.459,
153
+ "eval_steps_per_second": 1.744,
154
+ "step": 400
155
+ },
156
+ {
157
+ "epoch": 0.0085,
158
+ "grad_norm": 0.05184119325112733,
159
+ "learning_rate": 8.480000000000001e-07,
160
+ "loss": 2.6037,
161
+ "step": 425
162
+ },
163
+ {
164
+ "epoch": 0.009,
165
+ "grad_norm": 0.036013105129600216,
166
+ "learning_rate": 8.980000000000001e-07,
167
+ "loss": 2.594,
168
+ "step": 450
169
+ },
170
+ {
171
+ "epoch": 0.0095,
172
+ "grad_norm": 0.03061363860030697,
173
+ "learning_rate": 9.480000000000001e-07,
174
+ "loss": 2.5746,
175
+ "step": 475
176
+ },
177
+ {
178
+ "epoch": 0.01,
179
+ "grad_norm": 0.036970324693471876,
180
+ "learning_rate": 9.98e-07,
181
+ "loss": 2.5827,
182
+ "step": 500
183
+ },
184
+ {
185
+ "epoch": 0.01,
186
+ "eval_loss": 2.56477952003479,
187
+ "eval_runtime": 33.0671,
188
+ "eval_samples_per_second": 3.538,
189
+ "eval_steps_per_second": 1.784,
190
+ "step": 500
191
+ },
192
+ {
193
+ "epoch": 0.0105,
194
+ "grad_norm": 0.03044033343054017,
195
+ "learning_rate": 1.0480000000000002e-06,
196
+ "loss": 2.5717,
197
+ "step": 525
198
+ },
199
+ {
200
+ "epoch": 0.011,
201
+ "grad_norm": 0.03340669717167394,
202
+ "learning_rate": 1.0980000000000001e-06,
203
+ "loss": 2.5613,
204
+ "step": 550
205
+ },
206
+ {
207
+ "epoch": 0.0115,
208
+ "grad_norm": 0.029909971322257495,
209
+ "learning_rate": 1.148e-06,
210
+ "loss": 2.5661,
211
+ "step": 575
212
+ },
213
+ {
214
+ "epoch": 0.012,
215
+ "grad_norm": 0.028901093383770705,
216
+ "learning_rate": 1.1980000000000002e-06,
217
+ "loss": 2.561,
218
+ "step": 600
219
+ },
220
+ {
221
+ "epoch": 0.012,
222
+ "eval_loss": 2.5446865558624268,
223
+ "eval_runtime": 33.1445,
224
+ "eval_samples_per_second": 3.53,
225
+ "eval_steps_per_second": 1.78,
226
+ "step": 600
227
+ },
228
+ {
229
+ "epoch": 0.0125,
230
+ "grad_norm": 0.0338582199400455,
231
+ "learning_rate": 1.248e-06,
232
+ "loss": 2.5392,
233
+ "step": 625
234
+ },
235
+ {
236
+ "epoch": 0.013,
237
+ "grad_norm": 0.031107046362937877,
238
+ "learning_rate": 1.2980000000000001e-06,
239
+ "loss": 2.5378,
240
+ "step": 650
241
+ },
242
+ {
243
+ "epoch": 0.0135,
244
+ "grad_norm": 0.027777474622611625,
245
+ "learning_rate": 1.348e-06,
246
+ "loss": 2.5421,
247
+ "step": 675
248
+ },
249
+ {
250
+ "epoch": 0.014,
251
+ "grad_norm": 0.032848272502404616,
252
+ "learning_rate": 1.3980000000000002e-06,
253
+ "loss": 2.5345,
254
+ "step": 700
255
+ },
256
+ {
257
+ "epoch": 0.014,
258
+ "eval_loss": 2.5294137001037598,
259
+ "eval_runtime": 33.222,
260
+ "eval_samples_per_second": 3.522,
261
+ "eval_steps_per_second": 1.776,
262
+ "step": 700
263
+ },
264
+ {
265
+ "epoch": 0.0145,
266
+ "grad_norm": 0.028260965292318807,
267
+ "learning_rate": 1.4480000000000002e-06,
268
+ "loss": 2.5342,
269
+ "step": 725
270
+ },
271
+ {
272
+ "epoch": 0.015,
273
+ "grad_norm": 0.02748431921263886,
274
+ "learning_rate": 1.498e-06,
275
+ "loss": 2.5188,
276
+ "step": 750
277
+ },
278
+ {
279
+ "epoch": 0.0155,
280
+ "grad_norm": 0.02859453618814513,
281
+ "learning_rate": 1.548e-06,
282
+ "loss": 2.5245,
283
+ "step": 775
284
+ },
285
+ {
286
+ "epoch": 0.016,
287
+ "grad_norm": 0.03906649589898274,
288
+ "learning_rate": 1.5980000000000002e-06,
289
+ "loss": 2.5142,
290
+ "step": 800
291
+ },
292
+ {
293
+ "epoch": 0.016,
294
+ "eval_loss": 2.5156726837158203,
295
+ "eval_runtime": 33.2465,
296
+ "eval_samples_per_second": 3.519,
297
+ "eval_steps_per_second": 1.775,
298
+ "step": 800
299
+ },
300
+ {
301
+ "epoch": 0.0165,
302
+ "grad_norm": 0.03055728445213633,
303
+ "learning_rate": 1.6480000000000001e-06,
304
+ "loss": 2.504,
305
+ "step": 825
306
+ },
307
+ {
308
+ "epoch": 0.017,
309
+ "grad_norm": 0.03831919004049627,
310
+ "learning_rate": 1.6980000000000003e-06,
311
+ "loss": 2.5096,
312
+ "step": 850
313
+ },
314
+ {
315
+ "epoch": 0.0175,
316
+ "grad_norm": 0.04714764947462498,
317
+ "learning_rate": 1.7480000000000002e-06,
318
+ "loss": 2.5057,
319
+ "step": 875
320
+ },
321
+ {
322
+ "epoch": 0.018,
323
+ "grad_norm": 0.04480333925801958,
324
+ "learning_rate": 1.798e-06,
325
+ "loss": 2.4949,
326
+ "step": 900
327
+ },
328
+ {
329
+ "epoch": 0.018,
330
+ "eval_loss": 2.503145456314087,
331
+ "eval_runtime": 33.3398,
332
+ "eval_samples_per_second": 3.509,
333
+ "eval_steps_per_second": 1.77,
334
+ "step": 900
335
+ },
336
+ {
337
+ "epoch": 0.0185,
338
+ "grad_norm": 0.04514734300904146,
339
+ "learning_rate": 1.8480000000000001e-06,
340
+ "loss": 2.5044,
341
+ "step": 925
342
+ },
343
+ {
344
+ "epoch": 0.019,
345
+ "grad_norm": 0.03664477032679196,
346
+ "learning_rate": 1.898e-06,
347
+ "loss": 2.4857,
348
+ "step": 950
349
+ },
350
+ {
351
+ "epoch": 0.0195,
352
+ "grad_norm": 0.03891788038244039,
353
+ "learning_rate": 1.9480000000000002e-06,
354
+ "loss": 2.4954,
355
+ "step": 975
356
+ },
357
+ {
358
+ "epoch": 0.02,
359
+ "grad_norm": 0.041701680819843504,
360
+ "learning_rate": 1.998e-06,
361
+ "loss": 2.4935,
362
+ "step": 1000
363
+ },
364
+ {
365
+ "epoch": 0.02,
366
+ "eval_loss": 2.493894577026367,
367
+ "eval_runtime": 33.4036,
368
+ "eval_samples_per_second": 3.503,
369
+ "eval_steps_per_second": 1.766,
370
+ "step": 1000
371
+ }
372
+ ],
373
+ "logging_steps": 25,
374
+ "max_steps": 50000,
375
+ "num_input_tokens_seen": 0,
376
+ "num_train_epochs": 9223372036854775807,
377
+ "save_steps": 1000,
378
+ "stateful_callbacks": {
379
+ "TrainerControl": {
380
+ "args": {
381
+ "should_epoch_stop": false,
382
+ "should_evaluate": false,
383
+ "should_log": false,
384
+ "should_save": true,
385
+ "should_training_stop": false
386
+ },
387
+ "attributes": {}
388
+ }
389
+ },
390
+ "total_flos": 2.7853019251369574e+18,
391
+ "train_batch_size": 1,
392
+ "trial_name": null,
393
+ "trial_params": null
394
+ }