irodkin commited on
Commit
77505e6
·
verified ·
1 Parent(s): fcfa7e3

Training checkpoint at step 1000

Browse files
Files changed (1) hide show
  1. trainer_state.json +369 -8
trainer_state.json CHANGED
@@ -1,19 +1,380 @@
1
  {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 4e-05,
6
  "eval_steps": 100,
7
- "global_step": 2,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
- "log_history": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  "logging_steps": 25,
13
  "max_steps": 50000,
14
  "num_input_tokens_seen": 0,
15
  "num_train_epochs": 9223372036854775807,
16
- "save_steps": 1,
17
  "stateful_callbacks": {
18
  "TrainerControl": {
19
  "args": {
@@ -26,7 +387,7 @@
26
  "attributes": {}
27
  }
28
  },
29
- "total_flos": 6366403984621568.0,
30
  "train_batch_size": 1,
31
  "trial_name": null,
32
  "trial_params": null
 
1
  {
2
+ "best_global_step": 1000,
3
+ "best_metric": 2.463880777359009,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_20/checkpoint-1000",
5
+ "epoch": 0.02,
6
  "eval_steps": 100,
7
+ "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0005,
14
+ "grad_norm": 39.75564521032967,
15
+ "learning_rate": 4.8e-08,
16
+ "loss": 3.6517,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.001,
21
+ "grad_norm": 28.937531835097435,
22
+ "learning_rate": 9.8e-08,
23
+ "loss": 3.5931,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.0015,
28
+ "grad_norm": 21.922720332659644,
29
+ "learning_rate": 1.4800000000000003e-07,
30
+ "loss": 3.3397,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.002,
35
+ "grad_norm": 8.739610199908325,
36
+ "learning_rate": 1.9800000000000003e-07,
37
+ "loss": 3.1289,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.002,
42
+ "eval_loss": 2.9243295192718506,
43
+ "eval_runtime": 264.3302,
44
+ "eval_samples_per_second": 3.11,
45
+ "eval_steps_per_second": 1.555,
46
+ "step": 100
47
+ },
48
+ {
49
+ "epoch": 0.0025,
50
+ "grad_norm": 4.433912600039677,
51
+ "learning_rate": 2.48e-07,
52
+ "loss": 2.8957,
53
+ "step": 125
54
+ },
55
+ {
56
+ "epoch": 0.003,
57
+ "grad_norm": 3.2874790066620303,
58
+ "learning_rate": 2.9800000000000005e-07,
59
+ "loss": 2.763,
60
+ "step": 150
61
+ },
62
+ {
63
+ "epoch": 0.0035,
64
+ "grad_norm": 1.5203472215469231,
65
+ "learning_rate": 3.48e-07,
66
+ "loss": 2.676,
67
+ "step": 175
68
+ },
69
+ {
70
+ "epoch": 0.004,
71
+ "grad_norm": 1.1945541683905954,
72
+ "learning_rate": 3.9800000000000004e-07,
73
+ "loss": 2.635,
74
+ "step": 200
75
+ },
76
+ {
77
+ "epoch": 0.004,
78
+ "eval_loss": 2.6094932556152344,
79
+ "eval_runtime": 265.7702,
80
+ "eval_samples_per_second": 3.093,
81
+ "eval_steps_per_second": 1.546,
82
+ "step": 200
83
+ },
84
+ {
85
+ "epoch": 0.0045,
86
+ "grad_norm": 1.0852713304633745,
87
+ "learning_rate": 4.4800000000000004e-07,
88
+ "loss": 2.6016,
89
+ "step": 225
90
+ },
91
+ {
92
+ "epoch": 0.005,
93
+ "grad_norm": 1.0733940346699529,
94
+ "learning_rate": 4.98e-07,
95
+ "loss": 2.5797,
96
+ "step": 250
97
+ },
98
+ {
99
+ "epoch": 0.0055,
100
+ "grad_norm": 0.9273949035031271,
101
+ "learning_rate": 5.480000000000001e-07,
102
+ "loss": 2.5607,
103
+ "step": 275
104
+ },
105
+ {
106
+ "epoch": 0.006,
107
+ "grad_norm": 0.9289300678591714,
108
+ "learning_rate": 5.98e-07,
109
+ "loss": 2.552,
110
+ "step": 300
111
+ },
112
+ {
113
+ "epoch": 0.006,
114
+ "eval_loss": 2.541522264480591,
115
+ "eval_runtime": 266.7478,
116
+ "eval_samples_per_second": 3.082,
117
+ "eval_steps_per_second": 1.541,
118
+ "step": 300
119
+ },
120
+ {
121
+ "epoch": 0.0065,
122
+ "grad_norm": 1.1328584507449984,
123
+ "learning_rate": 6.48e-07,
124
+ "loss": 2.5402,
125
+ "step": 325
126
+ },
127
+ {
128
+ "epoch": 0.007,
129
+ "grad_norm": 0.8593307029257858,
130
+ "learning_rate": 6.98e-07,
131
+ "loss": 2.5286,
132
+ "step": 350
133
+ },
134
+ {
135
+ "epoch": 0.0075,
136
+ "grad_norm": 0.895615604067586,
137
+ "learning_rate": 7.480000000000001e-07,
138
+ "loss": 2.5311,
139
+ "step": 375
140
+ },
141
+ {
142
+ "epoch": 0.008,
143
+ "grad_norm": 0.912306580242149,
144
+ "learning_rate": 7.98e-07,
145
+ "loss": 2.5037,
146
+ "step": 400
147
+ },
148
+ {
149
+ "epoch": 0.008,
150
+ "eval_loss": 2.514389991760254,
151
+ "eval_runtime": 266.4899,
152
+ "eval_samples_per_second": 3.085,
153
+ "eval_steps_per_second": 1.542,
154
+ "step": 400
155
+ },
156
+ {
157
+ "epoch": 0.0085,
158
+ "grad_norm": 1.1866535514670034,
159
+ "learning_rate": 8.480000000000001e-07,
160
+ "loss": 2.5011,
161
+ "step": 425
162
+ },
163
+ {
164
+ "epoch": 0.009,
165
+ "grad_norm": 1.211342504193914,
166
+ "learning_rate": 8.980000000000001e-07,
167
+ "loss": 2.503,
168
+ "step": 450
169
+ },
170
+ {
171
+ "epoch": 0.0095,
172
+ "grad_norm": 1.113763817383069,
173
+ "learning_rate": 9.480000000000001e-07,
174
+ "loss": 2.4999,
175
+ "step": 475
176
+ },
177
+ {
178
+ "epoch": 0.01,
179
+ "grad_norm": 1.2585585589647226,
180
+ "learning_rate": 9.98e-07,
181
+ "loss": 2.4872,
182
+ "step": 500
183
+ },
184
+ {
185
+ "epoch": 0.01,
186
+ "eval_loss": 2.497868061065674,
187
+ "eval_runtime": 265.7962,
188
+ "eval_samples_per_second": 3.093,
189
+ "eval_steps_per_second": 1.546,
190
+ "step": 500
191
+ },
192
+ {
193
+ "epoch": 0.0105,
194
+ "grad_norm": 1.2585825718084245,
195
+ "learning_rate": 1.0480000000000002e-06,
196
+ "loss": 2.4852,
197
+ "step": 525
198
+ },
199
+ {
200
+ "epoch": 0.011,
201
+ "grad_norm": 1.4101257437846046,
202
+ "learning_rate": 1.0980000000000001e-06,
203
+ "loss": 2.4892,
204
+ "step": 550
205
+ },
206
+ {
207
+ "epoch": 0.0115,
208
+ "grad_norm": 1.1975234150707363,
209
+ "learning_rate": 1.148e-06,
210
+ "loss": 2.4861,
211
+ "step": 575
212
+ },
213
+ {
214
+ "epoch": 0.012,
215
+ "grad_norm": 1.3662769225582332,
216
+ "learning_rate": 1.1980000000000002e-06,
217
+ "loss": 2.4882,
218
+ "step": 600
219
+ },
220
+ {
221
+ "epoch": 0.012,
222
+ "eval_loss": 2.4879231452941895,
223
+ "eval_runtime": 267.0005,
224
+ "eval_samples_per_second": 3.079,
225
+ "eval_steps_per_second": 1.539,
226
+ "step": 600
227
+ },
228
+ {
229
+ "epoch": 0.0125,
230
+ "grad_norm": 1.3086724275194024,
231
+ "learning_rate": 1.248e-06,
232
+ "loss": 2.4745,
233
+ "step": 625
234
+ },
235
+ {
236
+ "epoch": 0.013,
237
+ "grad_norm": 1.317023206802888,
238
+ "learning_rate": 1.2980000000000001e-06,
239
+ "loss": 2.4727,
240
+ "step": 650
241
+ },
242
+ {
243
+ "epoch": 0.0135,
244
+ "grad_norm": 1.5284967544483212,
245
+ "learning_rate": 1.348e-06,
246
+ "loss": 2.469,
247
+ "step": 675
248
+ },
249
+ {
250
+ "epoch": 0.014,
251
+ "grad_norm": 1.1047595217316941,
252
+ "learning_rate": 1.3980000000000002e-06,
253
+ "loss": 2.4695,
254
+ "step": 700
255
+ },
256
+ {
257
+ "epoch": 0.014,
258
+ "eval_loss": 2.480103015899658,
259
+ "eval_runtime": 263.5022,
260
+ "eval_samples_per_second": 3.12,
261
+ "eval_steps_per_second": 1.56,
262
+ "step": 700
263
+ },
264
+ {
265
+ "epoch": 0.0145,
266
+ "grad_norm": 1.2077328209863791,
267
+ "learning_rate": 1.4480000000000002e-06,
268
+ "loss": 2.4654,
269
+ "step": 725
270
+ },
271
+ {
272
+ "epoch": 0.015,
273
+ "grad_norm": 1.209220841771836,
274
+ "learning_rate": 1.498e-06,
275
+ "loss": 2.4663,
276
+ "step": 750
277
+ },
278
+ {
279
+ "epoch": 0.0155,
280
+ "grad_norm": 1.3063169829879686,
281
+ "learning_rate": 1.548e-06,
282
+ "loss": 2.4704,
283
+ "step": 775
284
+ },
285
+ {
286
+ "epoch": 0.016,
287
+ "grad_norm": 1.3180183352683195,
288
+ "learning_rate": 1.5980000000000002e-06,
289
+ "loss": 2.4583,
290
+ "step": 800
291
+ },
292
+ {
293
+ "epoch": 0.016,
294
+ "eval_loss": 2.473590850830078,
295
+ "eval_runtime": 305.9875,
296
+ "eval_samples_per_second": 2.686,
297
+ "eval_steps_per_second": 1.343,
298
+ "step": 800
299
+ },
300
+ {
301
+ "epoch": 0.0165,
302
+ "grad_norm": 1.1674852380778837,
303
+ "learning_rate": 1.6480000000000001e-06,
304
+ "loss": 2.467,
305
+ "step": 825
306
+ },
307
+ {
308
+ "epoch": 0.017,
309
+ "grad_norm": 1.2497656349941002,
310
+ "learning_rate": 1.6980000000000003e-06,
311
+ "loss": 2.4612,
312
+ "step": 850
313
+ },
314
+ {
315
+ "epoch": 0.0175,
316
+ "grad_norm": 1.3358614980967494,
317
+ "learning_rate": 1.7480000000000002e-06,
318
+ "loss": 2.4636,
319
+ "step": 875
320
+ },
321
+ {
322
+ "epoch": 0.018,
323
+ "grad_norm": 1.252489857653356,
324
+ "learning_rate": 1.798e-06,
325
+ "loss": 2.454,
326
+ "step": 900
327
+ },
328
+ {
329
+ "epoch": 0.018,
330
+ "eval_loss": 2.4681763648986816,
331
+ "eval_runtime": 264.702,
332
+ "eval_samples_per_second": 3.105,
333
+ "eval_steps_per_second": 1.553,
334
+ "step": 900
335
+ },
336
+ {
337
+ "epoch": 0.0185,
338
+ "grad_norm": 1.2815437998994337,
339
+ "learning_rate": 1.8480000000000001e-06,
340
+ "loss": 2.4571,
341
+ "step": 925
342
+ },
343
+ {
344
+ "epoch": 0.019,
345
+ "grad_norm": 1.0902475329451575,
346
+ "learning_rate": 1.898e-06,
347
+ "loss": 2.451,
348
+ "step": 950
349
+ },
350
+ {
351
+ "epoch": 0.0195,
352
+ "grad_norm": 1.1502696024965324,
353
+ "learning_rate": 1.9480000000000002e-06,
354
+ "loss": 2.4527,
355
+ "step": 975
356
+ },
357
+ {
358
+ "epoch": 0.02,
359
+ "grad_norm": 1.2336661855806117,
360
+ "learning_rate": 1.998e-06,
361
+ "loss": 2.4496,
362
+ "step": 1000
363
+ },
364
+ {
365
+ "epoch": 0.02,
366
+ "eval_loss": 2.463880777359009,
367
+ "eval_runtime": 275.7426,
368
+ "eval_samples_per_second": 2.981,
369
+ "eval_steps_per_second": 1.491,
370
+ "step": 1000
371
+ }
372
+ ],
373
  "logging_steps": 25,
374
  "max_steps": 50000,
375
  "num_input_tokens_seen": 0,
376
  "num_train_epochs": 9223372036854775807,
377
+ "save_steps": 1000,
378
  "stateful_callbacks": {
379
  "TrainerControl": {
380
  "args": {
 
387
  "attributes": {}
388
  }
389
  },
390
+ "total_flos": 3.183202298327204e+18,
391
  "train_batch_size": 1,
392
  "trial_name": null,
393
  "trial_params": null