irodkin committed on
Commit
77fde04
·
verified ·
1 Parent(s): 89dfc07

Training checkpoint at step 9500

Browse files
Files changed (1) hide show
  1. trainer_state.json +186 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 8900,
3
- "best_metric": 2.559945821762085,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-8500",
5
- "epoch": 0.18,
6
  "eval_steps": 100,
7
- "global_step": 9000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3248,6 +3248,186 @@
3248
  "eval_samples_per_second": 2.461,
3249
  "eval_steps_per_second": 1.23,
3250
  "step": 9000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3251
  }
3252
  ],
3253
  "logging_steps": 25,
@@ -3267,7 +3447,7 @@
3267
  "attributes": {}
3268
  }
3269
  },
3270
- "total_flos": 2.0197974124879938e+19,
3271
  "train_batch_size": 1,
3272
  "trial_name": null,
3273
  "trial_params": null
 
1
  {
2
+ "best_global_step": 9500,
3
+ "best_metric": 2.555588960647583,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-9500",
5
+ "epoch": 0.19,
6
  "eval_steps": 100,
7
+ "global_step": 9500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3248
  "eval_samples_per_second": 2.461,
3249
  "eval_steps_per_second": 1.23,
3250
  "step": 9000
3251
+ },
3252
+ {
3253
+ "epoch": 0.1805,
3254
+ "grad_norm": 2.7335192428299697,
3255
+ "learning_rate": 9.105777777777779e-06,
3256
+ "loss": 2.56,
3257
+ "step": 9025
3258
+ },
3259
+ {
3260
+ "epoch": 0.181,
3261
+ "grad_norm": 2.616833970329197,
3262
+ "learning_rate": 9.100222222222223e-06,
3263
+ "loss": 2.5659,
3264
+ "step": 9050
3265
+ },
3266
+ {
3267
+ "epoch": 0.1815,
3268
+ "grad_norm": 2.636296249975529,
3269
+ "learning_rate": 9.094666666666668e-06,
3270
+ "loss": 2.5605,
3271
+ "step": 9075
3272
+ },
3273
+ {
3274
+ "epoch": 0.182,
3275
+ "grad_norm": 2.1413102875849828,
3276
+ "learning_rate": 9.089111111111111e-06,
3277
+ "loss": 2.5454,
3278
+ "step": 9100
3279
+ },
3280
+ {
3281
+ "epoch": 0.182,
3282
+ "eval_loss": 2.558293342590332,
3283
+ "eval_runtime": 42.2294,
3284
+ "eval_samples_per_second": 2.463,
3285
+ "eval_steps_per_second": 1.231,
3286
+ "step": 9100
3287
+ },
3288
+ {
3289
+ "epoch": 0.1825,
3290
+ "grad_norm": 2.195374313863304,
3291
+ "learning_rate": 9.083555555555557e-06,
3292
+ "loss": 2.5584,
3293
+ "step": 9125
3294
+ },
3295
+ {
3296
+ "epoch": 0.183,
3297
+ "grad_norm": 2.9470418486379546,
3298
+ "learning_rate": 9.078000000000002e-06,
3299
+ "loss": 2.5604,
3300
+ "step": 9150
3301
+ },
3302
+ {
3303
+ "epoch": 0.1835,
3304
+ "grad_norm": 1.9289932950554558,
3305
+ "learning_rate": 9.072444444444445e-06,
3306
+ "loss": 2.5529,
3307
+ "step": 9175
3308
+ },
3309
+ {
3310
+ "epoch": 0.184,
3311
+ "grad_norm": 2.905671046574134,
3312
+ "learning_rate": 9.066888888888889e-06,
3313
+ "loss": 2.5551,
3314
+ "step": 9200
3315
+ },
3316
+ {
3317
+ "epoch": 0.184,
3318
+ "eval_loss": 2.558293342590332,
3319
+ "eval_runtime": 42.216,
3320
+ "eval_samples_per_second": 2.464,
3321
+ "eval_steps_per_second": 1.232,
3322
+ "step": 9200
3323
+ },
3324
+ {
3325
+ "epoch": 0.1845,
3326
+ "grad_norm": 2.8062526156064522,
3327
+ "learning_rate": 9.061333333333334e-06,
3328
+ "loss": 2.5438,
3329
+ "step": 9225
3330
+ },
3331
+ {
3332
+ "epoch": 0.185,
3333
+ "grad_norm": 2.543328123273362,
3334
+ "learning_rate": 9.05577777777778e-06,
3335
+ "loss": 2.5476,
3336
+ "step": 9250
3337
+ },
3338
+ {
3339
+ "epoch": 0.1855,
3340
+ "grad_norm": 2.396296044779414,
3341
+ "learning_rate": 9.050222222222223e-06,
3342
+ "loss": 2.5437,
3343
+ "step": 9275
3344
+ },
3345
+ {
3346
+ "epoch": 0.186,
3347
+ "grad_norm": 1.980055565462775,
3348
+ "learning_rate": 9.044666666666667e-06,
3349
+ "loss": 2.5552,
3350
+ "step": 9300
3351
+ },
3352
+ {
3353
+ "epoch": 0.186,
3354
+ "eval_loss": 2.557692289352417,
3355
+ "eval_runtime": 42.6636,
3356
+ "eval_samples_per_second": 2.438,
3357
+ "eval_steps_per_second": 1.219,
3358
+ "step": 9300
3359
+ },
3360
+ {
3361
+ "epoch": 0.1865,
3362
+ "grad_norm": 2.028891972183573,
3363
+ "learning_rate": 9.039111111111112e-06,
3364
+ "loss": 2.5603,
3365
+ "step": 9325
3366
+ },
3367
+ {
3368
+ "epoch": 0.187,
3369
+ "grad_norm": 2.244801606614392,
3370
+ "learning_rate": 9.033555555555557e-06,
3371
+ "loss": 2.5565,
3372
+ "step": 9350
3373
+ },
3374
+ {
3375
+ "epoch": 0.1875,
3376
+ "grad_norm": 2.6445168963619348,
3377
+ "learning_rate": 9.028e-06,
3378
+ "loss": 2.5453,
3379
+ "step": 9375
3380
+ },
3381
+ {
3382
+ "epoch": 0.188,
3383
+ "grad_norm": 2.2015819629656543,
3384
+ "learning_rate": 9.022444444444444e-06,
3385
+ "loss": 2.5463,
3386
+ "step": 9400
3387
+ },
3388
+ {
3389
+ "epoch": 0.188,
3390
+ "eval_loss": 2.555739164352417,
3391
+ "eval_runtime": 44.4913,
3392
+ "eval_samples_per_second": 2.338,
3393
+ "eval_steps_per_second": 1.169,
3394
+ "step": 9400
3395
+ },
3396
+ {
3397
+ "epoch": 0.1885,
3398
+ "grad_norm": 2.0871782907981076,
3399
+ "learning_rate": 9.01688888888889e-06,
3400
+ "loss": 2.5494,
3401
+ "step": 9425
3402
+ },
3403
+ {
3404
+ "epoch": 0.189,
3405
+ "grad_norm": 2.3339796044543006,
3406
+ "learning_rate": 9.011333333333335e-06,
3407
+ "loss": 2.562,
3408
+ "step": 9450
3409
+ },
3410
+ {
3411
+ "epoch": 0.1895,
3412
+ "grad_norm": 2.5447600145368257,
3413
+ "learning_rate": 9.005777777777778e-06,
3414
+ "loss": 2.5613,
3415
+ "step": 9475
3416
+ },
3417
+ {
3418
+ "epoch": 0.19,
3419
+ "grad_norm": 2.2530767222642805,
3420
+ "learning_rate": 9.000222222222222e-06,
3421
+ "loss": 2.5561,
3422
+ "step": 9500
3423
+ },
3424
+ {
3425
+ "epoch": 0.19,
3426
+ "eval_loss": 2.555588960647583,
3427
+ "eval_runtime": 42.3312,
3428
+ "eval_samples_per_second": 2.457,
3429
+ "eval_steps_per_second": 1.228,
3430
+ "step": 9500
3431
  }
3432
  ],
3433
  "logging_steps": 25,
 
3447
  "attributes": {}
3448
  }
3449
  },
3450
+ "total_flos": 2.1320083808983187e+19,
3451
  "train_batch_size": 1,
3452
  "trial_name": null,
3453
  "trial_params": null