irodkin commited on
Commit
35bf3f2
·
verified ·
1 Parent(s): aa93063

Training checkpoint at step 7000

Browse files
Files changed (1) hide show
  1. trainer_state.json +366 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 6000,
3
- "best_metric": 2.4190170764923096,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-6000",
5
- "epoch": 0.12,
6
  "eval_steps": 100,
7
- "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2168,6 +2168,366 @@
2168
  "eval_samples_per_second": 3.19,
2169
  "eval_steps_per_second": 1.595,
2170
  "step": 6000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2171
  }
2172
  ],
2173
  "logging_steps": 25,
@@ -2187,7 +2547,7 @@
2187
  "attributes": {}
2188
  }
2189
  },
2190
- "total_flos": 1.9099213789963223e+19,
2191
  "train_batch_size": 1,
2192
  "trial_name": null,
2193
  "trial_params": null
 
1
  {
2
+ "best_global_step": 7000,
3
+ "best_metric": 2.415269374847412,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-7000",
5
+ "epoch": 0.14,
6
  "eval_steps": 100,
7
+ "global_step": 7000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2168
  "eval_samples_per_second": 3.19,
2169
  "eval_steps_per_second": 1.595,
2170
  "step": 6000
2171
+ },
2172
+ {
2173
+ "epoch": 0.1205,
2174
+ "grad_norm": 0.5979904516506753,
2175
+ "learning_rate": 9.772444444444445e-06,
2176
+ "loss": 2.4044,
2177
+ "step": 6025
2178
+ },
2179
+ {
2180
+ "epoch": 0.121,
2181
+ "grad_norm": 0.5980588594331456,
2182
+ "learning_rate": 9.76688888888889e-06,
2183
+ "loss": 2.41,
2184
+ "step": 6050
2185
+ },
2186
+ {
2187
+ "epoch": 0.1215,
2188
+ "grad_norm": 0.6344150039672136,
2189
+ "learning_rate": 9.761333333333334e-06,
2190
+ "loss": 2.4,
2191
+ "step": 6075
2192
+ },
2193
+ {
2194
+ "epoch": 0.122,
2195
+ "grad_norm": 0.6035110768502723,
2196
+ "learning_rate": 9.755777777777778e-06,
2197
+ "loss": 2.4148,
2198
+ "step": 6100
2199
+ },
2200
+ {
2201
+ "epoch": 0.122,
2202
+ "eval_loss": 2.418259382247925,
2203
+ "eval_runtime": 31.784,
2204
+ "eval_samples_per_second": 3.209,
2205
+ "eval_steps_per_second": 1.605,
2206
+ "step": 6100
2207
+ },
2208
+ {
2209
+ "epoch": 0.1225,
2210
+ "grad_norm": 0.5792932239951794,
2211
+ "learning_rate": 9.750222222222223e-06,
2212
+ "loss": 2.4061,
2213
+ "step": 6125
2214
+ },
2215
+ {
2216
+ "epoch": 0.123,
2217
+ "grad_norm": 0.6529554995007899,
2218
+ "learning_rate": 9.744666666666668e-06,
2219
+ "loss": 2.4036,
2220
+ "step": 6150
2221
+ },
2222
+ {
2223
+ "epoch": 0.1235,
2224
+ "grad_norm": 0.5946064726146467,
2225
+ "learning_rate": 9.739111111111112e-06,
2226
+ "loss": 2.4014,
2227
+ "step": 6175
2228
+ },
2229
+ {
2230
+ "epoch": 0.124,
2231
+ "grad_norm": 0.5739473618849045,
2232
+ "learning_rate": 9.733555555555555e-06,
2233
+ "loss": 2.4057,
2234
+ "step": 6200
2235
+ },
2236
+ {
2237
+ "epoch": 0.124,
2238
+ "eval_loss": 2.4179208278656006,
2239
+ "eval_runtime": 31.6981,
2240
+ "eval_samples_per_second": 3.218,
2241
+ "eval_steps_per_second": 1.609,
2242
+ "step": 6200
2243
+ },
2244
+ {
2245
+ "epoch": 0.1245,
2246
+ "grad_norm": 0.6907211114020956,
2247
+ "learning_rate": 9.728e-06,
2248
+ "loss": 2.393,
2249
+ "step": 6225
2250
+ },
2251
+ {
2252
+ "epoch": 0.125,
2253
+ "grad_norm": 0.6225931887903327,
2254
+ "learning_rate": 9.722444444444446e-06,
2255
+ "loss": 2.4147,
2256
+ "step": 6250
2257
+ },
2258
+ {
2259
+ "epoch": 0.1255,
2260
+ "grad_norm": 0.568397246680531,
2261
+ "learning_rate": 9.71688888888889e-06,
2262
+ "loss": 2.4024,
2263
+ "step": 6275
2264
+ },
2265
+ {
2266
+ "epoch": 0.126,
2267
+ "grad_norm": 0.5842879344272728,
2268
+ "learning_rate": 9.711333333333333e-06,
2269
+ "loss": 2.404,
2270
+ "step": 6300
2271
+ },
2272
+ {
2273
+ "epoch": 0.126,
2274
+ "eval_loss": 2.4178576469421387,
2275
+ "eval_runtime": 31.7994,
2276
+ "eval_samples_per_second": 3.208,
2277
+ "eval_steps_per_second": 1.604,
2278
+ "step": 6300
2279
+ },
2280
+ {
2281
+ "epoch": 0.1265,
2282
+ "grad_norm": 0.5805192382099048,
2283
+ "learning_rate": 9.705777777777778e-06,
2284
+ "loss": 2.4063,
2285
+ "step": 6325
2286
+ },
2287
+ {
2288
+ "epoch": 0.127,
2289
+ "grad_norm": 0.6600294122711824,
2290
+ "learning_rate": 9.700222222222224e-06,
2291
+ "loss": 2.4078,
2292
+ "step": 6350
2293
+ },
2294
+ {
2295
+ "epoch": 0.1275,
2296
+ "grad_norm": 0.6263098682936462,
2297
+ "learning_rate": 9.694666666666667e-06,
2298
+ "loss": 2.3961,
2299
+ "step": 6375
2300
+ },
2301
+ {
2302
+ "epoch": 0.128,
2303
+ "grad_norm": 0.6961912679129473,
2304
+ "learning_rate": 9.68911111111111e-06,
2305
+ "loss": 2.4127,
2306
+ "step": 6400
2307
+ },
2308
+ {
2309
+ "epoch": 0.128,
2310
+ "eval_loss": 2.417247772216797,
2311
+ "eval_runtime": 31.7325,
2312
+ "eval_samples_per_second": 3.214,
2313
+ "eval_steps_per_second": 1.607,
2314
+ "step": 6400
2315
+ },
2316
+ {
2317
+ "epoch": 0.1285,
2318
+ "grad_norm": 0.6396950069271417,
2319
+ "learning_rate": 9.683555555555556e-06,
2320
+ "loss": 2.4041,
2321
+ "step": 6425
2322
+ },
2323
+ {
2324
+ "epoch": 0.129,
2325
+ "grad_norm": 0.6164180606933177,
2326
+ "learning_rate": 9.678000000000001e-06,
2327
+ "loss": 2.4,
2328
+ "step": 6450
2329
+ },
2330
+ {
2331
+ "epoch": 0.1295,
2332
+ "grad_norm": 0.6120640198257105,
2333
+ "learning_rate": 9.672444444444445e-06,
2334
+ "loss": 2.3966,
2335
+ "step": 6475
2336
+ },
2337
+ {
2338
+ "epoch": 0.13,
2339
+ "grad_norm": 0.6013045247718226,
2340
+ "learning_rate": 9.66688888888889e-06,
2341
+ "loss": 2.3991,
2342
+ "step": 6500
2343
+ },
2344
+ {
2345
+ "epoch": 0.13,
2346
+ "eval_loss": 2.417280673980713,
2347
+ "eval_runtime": 31.8112,
2348
+ "eval_samples_per_second": 3.206,
2349
+ "eval_steps_per_second": 1.603,
2350
+ "step": 6500
2351
+ },
2352
+ {
2353
+ "epoch": 0.1305,
2354
+ "grad_norm": 0.6061836537875764,
2355
+ "learning_rate": 9.661333333333334e-06,
2356
+ "loss": 2.4161,
2357
+ "step": 6525
2358
+ },
2359
+ {
2360
+ "epoch": 0.131,
2361
+ "grad_norm": 0.6100864625060891,
2362
+ "learning_rate": 9.655777777777779e-06,
2363
+ "loss": 2.4052,
2364
+ "step": 6550
2365
+ },
2366
+ {
2367
+ "epoch": 0.1315,
2368
+ "grad_norm": 0.6932893052541476,
2369
+ "learning_rate": 9.650222222222222e-06,
2370
+ "loss": 2.4036,
2371
+ "step": 6575
2372
+ },
2373
+ {
2374
+ "epoch": 0.132,
2375
+ "grad_norm": 0.5859072202807338,
2376
+ "learning_rate": 9.644666666666668e-06,
2377
+ "loss": 2.4045,
2378
+ "step": 6600
2379
+ },
2380
+ {
2381
+ "epoch": 0.132,
2382
+ "eval_loss": 2.416877031326294,
2383
+ "eval_runtime": 31.5203,
2384
+ "eval_samples_per_second": 3.236,
2385
+ "eval_steps_per_second": 1.618,
2386
+ "step": 6600
2387
+ },
2388
+ {
2389
+ "epoch": 0.1325,
2390
+ "grad_norm": 0.579002436095642,
2391
+ "learning_rate": 9.639111111111113e-06,
2392
+ "loss": 2.4015,
2393
+ "step": 6625
2394
+ },
2395
+ {
2396
+ "epoch": 0.133,
2397
+ "grad_norm": 0.5968858601649685,
2398
+ "learning_rate": 9.633555555555556e-06,
2399
+ "loss": 2.3986,
2400
+ "step": 6650
2401
+ },
2402
+ {
2403
+ "epoch": 0.1335,
2404
+ "grad_norm": 0.5964714549861985,
2405
+ "learning_rate": 9.628e-06,
2406
+ "loss": 2.4062,
2407
+ "step": 6675
2408
+ },
2409
+ {
2410
+ "epoch": 0.134,
2411
+ "grad_norm": 0.6126102944808797,
2412
+ "learning_rate": 9.622444444444445e-06,
2413
+ "loss": 2.4033,
2414
+ "step": 6700
2415
+ },
2416
+ {
2417
+ "epoch": 0.134,
2418
+ "eval_loss": 2.4164350032806396,
2419
+ "eval_runtime": 31.4543,
2420
+ "eval_samples_per_second": 3.243,
2421
+ "eval_steps_per_second": 1.621,
2422
+ "step": 6700
2423
+ },
2424
+ {
2425
+ "epoch": 0.1345,
2426
+ "grad_norm": 0.5774452345333466,
2427
+ "learning_rate": 9.61688888888889e-06,
2428
+ "loss": 2.3997,
2429
+ "step": 6725
2430
+ },
2431
+ {
2432
+ "epoch": 0.135,
2433
+ "grad_norm": 0.6227260743975279,
2434
+ "learning_rate": 9.611333333333334e-06,
2435
+ "loss": 2.4018,
2436
+ "step": 6750
2437
+ },
2438
+ {
2439
+ "epoch": 0.1355,
2440
+ "grad_norm": 0.5846707991616706,
2441
+ "learning_rate": 9.605777777777778e-06,
2442
+ "loss": 2.3985,
2443
+ "step": 6775
2444
+ },
2445
+ {
2446
+ "epoch": 0.136,
2447
+ "grad_norm": 0.6172483484063671,
2448
+ "learning_rate": 9.600222222222223e-06,
2449
+ "loss": 2.4213,
2450
+ "step": 6800
2451
+ },
2452
+ {
2453
+ "epoch": 0.136,
2454
+ "eval_loss": 2.41625714302063,
2455
+ "eval_runtime": 31.5517,
2456
+ "eval_samples_per_second": 3.233,
2457
+ "eval_steps_per_second": 1.616,
2458
+ "step": 6800
2459
+ },
2460
+ {
2461
+ "epoch": 0.1365,
2462
+ "grad_norm": 0.5965299711032601,
2463
+ "learning_rate": 9.594666666666668e-06,
2464
+ "loss": 2.3976,
2465
+ "step": 6825
2466
+ },
2467
+ {
2468
+ "epoch": 0.137,
2469
+ "grad_norm": 0.5884739304234496,
2470
+ "learning_rate": 9.589111111111112e-06,
2471
+ "loss": 2.3947,
2472
+ "step": 6850
2473
+ },
2474
+ {
2475
+ "epoch": 0.1375,
2476
+ "grad_norm": 0.5737065693146471,
2477
+ "learning_rate": 9.583555555555555e-06,
2478
+ "loss": 2.3983,
2479
+ "step": 6875
2480
+ },
2481
+ {
2482
+ "epoch": 0.138,
2483
+ "grad_norm": 0.6249698819825935,
2484
+ "learning_rate": 9.578e-06,
2485
+ "loss": 2.4008,
2486
+ "step": 6900
2487
+ },
2488
+ {
2489
+ "epoch": 0.138,
2490
+ "eval_loss": 2.4156551361083984,
2491
+ "eval_runtime": 31.5071,
2492
+ "eval_samples_per_second": 3.237,
2493
+ "eval_steps_per_second": 1.619,
2494
+ "step": 6900
2495
+ },
2496
+ {
2497
+ "epoch": 0.1385,
2498
+ "grad_norm": 0.5930008566650997,
2499
+ "learning_rate": 9.572444444444446e-06,
2500
+ "loss": 2.3951,
2501
+ "step": 6925
2502
+ },
2503
+ {
2504
+ "epoch": 0.139,
2505
+ "grad_norm": 0.6564746022716046,
2506
+ "learning_rate": 9.56688888888889e-06,
2507
+ "loss": 2.4083,
2508
+ "step": 6950
2509
+ },
2510
+ {
2511
+ "epoch": 0.1395,
2512
+ "grad_norm": 0.611311960098376,
2513
+ "learning_rate": 9.561333333333333e-06,
2514
+ "loss": 2.4032,
2515
+ "step": 6975
2516
+ },
2517
+ {
2518
+ "epoch": 0.14,
2519
+ "grad_norm": 0.594692534551516,
2520
+ "learning_rate": 9.555777777777778e-06,
2521
+ "loss": 2.41,
2522
+ "step": 7000
2523
+ },
2524
+ {
2525
+ "epoch": 0.14,
2526
+ "eval_loss": 2.415269374847412,
2527
+ "eval_runtime": 31.7535,
2528
+ "eval_samples_per_second": 3.212,
2529
+ "eval_steps_per_second": 1.606,
2530
+ "step": 7000
2531
  }
2532
  ],
2533
  "logging_steps": 25,
 
2547
  "attributes": {}
2548
  }
2549
  },
2550
+ "total_flos": 2.2282416088290427e+19,
2551
  "train_batch_size": 1,
2552
  "trial_name": null,
2553
  "trial_params": null