irodkin commited on
Commit
7a5cca5
·
verified ·
1 Parent(s): ac4a648

Training checkpoint at step 13000

Browse files
Files changed (1) hide show
  1. trainer_state.json +366 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 12000,
3
- "best_metric": 2.4031572341918945,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-12000",
5
- "epoch": 0.24,
6
  "eval_steps": 100,
7
- "global_step": 12000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4328,6 +4328,366 @@
4328
  "eval_samples_per_second": 3.216,
4329
  "eval_steps_per_second": 1.608,
4330
  "step": 12000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4331
  }
4332
  ],
4333
  "logging_steps": 25,
@@ -4347,7 +4707,7 @@
4347
  "attributes": {}
4348
  }
4349
  },
4350
- "total_flos": 3.819842757992645e+19,
4351
  "train_batch_size": 1,
4352
  "trial_name": null,
4353
  "trial_params": null
 
1
  {
2
+ "best_global_step": 13000,
3
+ "best_metric": 2.4009385108947754,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-13000",
5
+ "epoch": 0.26,
6
  "eval_steps": 100,
7
+ "global_step": 13000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4328
  "eval_samples_per_second": 3.216,
4329
  "eval_steps_per_second": 1.608,
4330
  "step": 12000
4331
+ },
4332
+ {
4333
+ "epoch": 0.2405,
4334
+ "grad_norm": 0.5687487747515707,
4335
+ "learning_rate": 8.43911111111111e-06,
4336
+ "loss": 2.3859,
4337
+ "step": 12025
4338
+ },
4339
+ {
4340
+ "epoch": 0.241,
4341
+ "grad_norm": 0.6156971193882954,
4342
+ "learning_rate": 8.433555555555556e-06,
4343
+ "loss": 2.3936,
4344
+ "step": 12050
4345
+ },
4346
+ {
4347
+ "epoch": 0.2415,
4348
+ "grad_norm": 0.5735725917481376,
4349
+ "learning_rate": 8.428000000000001e-06,
4350
+ "loss": 2.3867,
4351
+ "step": 12075
4352
+ },
4353
+ {
4354
+ "epoch": 0.242,
4355
+ "grad_norm": 0.5900311312717111,
4356
+ "learning_rate": 8.422444444444445e-06,
4357
+ "loss": 2.381,
4358
+ "step": 12100
4359
+ },
4360
+ {
4361
+ "epoch": 0.242,
4362
+ "eval_loss": 2.402616262435913,
4363
+ "eval_runtime": 31.728,
4364
+ "eval_samples_per_second": 3.215,
4365
+ "eval_steps_per_second": 1.607,
4366
+ "step": 12100
4367
+ },
4368
+ {
4369
+ "epoch": 0.2425,
4370
+ "grad_norm": 0.6210456413331185,
4371
+ "learning_rate": 8.41688888888889e-06,
4372
+ "loss": 2.3897,
4373
+ "step": 12125
4374
+ },
4375
+ {
4376
+ "epoch": 0.243,
4377
+ "grad_norm": 0.564076844370536,
4378
+ "learning_rate": 8.411333333333334e-06,
4379
+ "loss": 2.3789,
4380
+ "step": 12150
4381
+ },
4382
+ {
4383
+ "epoch": 0.2435,
4384
+ "grad_norm": 0.5787670607206897,
4385
+ "learning_rate": 8.405777777777779e-06,
4386
+ "loss": 2.3927,
4387
+ "step": 12175
4388
+ },
4389
+ {
4390
+ "epoch": 0.244,
4391
+ "grad_norm": 0.557686861390105,
4392
+ "learning_rate": 8.400222222222222e-06,
4393
+ "loss": 2.3761,
4394
+ "step": 12200
4395
+ },
4396
+ {
4397
+ "epoch": 0.244,
4398
+ "eval_loss": 2.4025542736053467,
4399
+ "eval_runtime": 31.8116,
4400
+ "eval_samples_per_second": 3.206,
4401
+ "eval_steps_per_second": 1.603,
4402
+ "step": 12200
4403
+ },
4404
+ {
4405
+ "epoch": 0.2445,
4406
+ "grad_norm": 0.5642621664909974,
4407
+ "learning_rate": 8.394666666666668e-06,
4408
+ "loss": 2.3787,
4409
+ "step": 12225
4410
+ },
4411
+ {
4412
+ "epoch": 0.245,
4413
+ "grad_norm": 0.5812642245692796,
4414
+ "learning_rate": 8.389111111111113e-06,
4415
+ "loss": 2.3888,
4416
+ "step": 12250
4417
+ },
4418
+ {
4419
+ "epoch": 0.2455,
4420
+ "grad_norm": 0.5903665572148793,
4421
+ "learning_rate": 8.383555555555557e-06,
4422
+ "loss": 2.3874,
4423
+ "step": 12275
4424
+ },
4425
+ {
4426
+ "epoch": 0.246,
4427
+ "grad_norm": 0.5752826274496151,
4428
+ "learning_rate": 8.378e-06,
4429
+ "loss": 2.3851,
4430
+ "step": 12300
4431
+ },
4432
+ {
4433
+ "epoch": 0.246,
4434
+ "eval_loss": 2.4024178981781006,
4435
+ "eval_runtime": 31.9538,
4436
+ "eval_samples_per_second": 3.192,
4437
+ "eval_steps_per_second": 1.596,
4438
+ "step": 12300
4439
+ },
4440
+ {
4441
+ "epoch": 0.2465,
4442
+ "grad_norm": 0.5625780105871633,
4443
+ "learning_rate": 8.372444444444445e-06,
4444
+ "loss": 2.3857,
4445
+ "step": 12325
4446
+ },
4447
+ {
4448
+ "epoch": 0.247,
4449
+ "grad_norm": 0.5516059110433715,
4450
+ "learning_rate": 8.36688888888889e-06,
4451
+ "loss": 2.387,
4452
+ "step": 12350
4453
+ },
4454
+ {
4455
+ "epoch": 0.2475,
4456
+ "grad_norm": 0.5743651124710031,
4457
+ "learning_rate": 8.361333333333334e-06,
4458
+ "loss": 2.3899,
4459
+ "step": 12375
4460
+ },
4461
+ {
4462
+ "epoch": 0.248,
4463
+ "grad_norm": 0.6065509345211424,
4464
+ "learning_rate": 8.355777777777778e-06,
4465
+ "loss": 2.3811,
4466
+ "step": 12400
4467
+ },
4468
+ {
4469
+ "epoch": 0.248,
4470
+ "eval_loss": 2.402189254760742,
4471
+ "eval_runtime": 31.7357,
4472
+ "eval_samples_per_second": 3.214,
4473
+ "eval_steps_per_second": 1.607,
4474
+ "step": 12400
4475
+ },
4476
+ {
4477
+ "epoch": 0.2485,
4478
+ "grad_norm": 0.569411806780091,
4479
+ "learning_rate": 8.350222222222223e-06,
4480
+ "loss": 2.3891,
4481
+ "step": 12425
4482
+ },
4483
+ {
4484
+ "epoch": 0.249,
4485
+ "grad_norm": 0.5781227404353481,
4486
+ "learning_rate": 8.344666666666668e-06,
4487
+ "loss": 2.3799,
4488
+ "step": 12450
4489
+ },
4490
+ {
4491
+ "epoch": 0.2495,
4492
+ "grad_norm": 0.5882770416548074,
4493
+ "learning_rate": 8.339111111111112e-06,
4494
+ "loss": 2.3921,
4495
+ "step": 12475
4496
+ },
4497
+ {
4498
+ "epoch": 0.25,
4499
+ "grad_norm": 0.6053137792053689,
4500
+ "learning_rate": 8.333555555555555e-06,
4501
+ "loss": 2.3923,
4502
+ "step": 12500
4503
+ },
4504
+ {
4505
+ "epoch": 0.25,
4506
+ "eval_loss": 2.401906967163086,
4507
+ "eval_runtime": 31.7052,
4508
+ "eval_samples_per_second": 3.217,
4509
+ "eval_steps_per_second": 1.609,
4510
+ "step": 12500
4511
+ },
4512
+ {
4513
+ "epoch": 0.2505,
4514
+ "grad_norm": 0.5493940361276148,
4515
+ "learning_rate": 8.328e-06,
4516
+ "loss": 2.3872,
4517
+ "step": 12525
4518
+ },
4519
+ {
4520
+ "epoch": 0.251,
4521
+ "grad_norm": 0.5844453837465953,
4522
+ "learning_rate": 8.322444444444446e-06,
4523
+ "loss": 2.3859,
4524
+ "step": 12550
4525
+ },
4526
+ {
4527
+ "epoch": 0.2515,
4528
+ "grad_norm": 0.589694030674745,
4529
+ "learning_rate": 8.31688888888889e-06,
4530
+ "loss": 2.3852,
4531
+ "step": 12575
4532
+ },
4533
+ {
4534
+ "epoch": 0.252,
4535
+ "grad_norm": 0.5985872367130171,
4536
+ "learning_rate": 8.311333333333333e-06,
4537
+ "loss": 2.378,
4538
+ "step": 12600
4539
+ },
4540
+ {
4541
+ "epoch": 0.252,
4542
+ "eval_loss": 2.4017632007598877,
4543
+ "eval_runtime": 31.8059,
4544
+ "eval_samples_per_second": 3.207,
4545
+ "eval_steps_per_second": 1.603,
4546
+ "step": 12600
4547
+ },
4548
+ {
4549
+ "epoch": 0.2525,
4550
+ "grad_norm": 0.6246560097732429,
4551
+ "learning_rate": 8.305777777777778e-06,
4552
+ "loss": 2.3891,
4553
+ "step": 12625
4554
+ },
4555
+ {
4556
+ "epoch": 0.253,
4557
+ "grad_norm": 0.5977851115835912,
4558
+ "learning_rate": 8.300222222222223e-06,
4559
+ "loss": 2.3884,
4560
+ "step": 12650
4561
+ },
4562
+ {
4563
+ "epoch": 0.2535,
4564
+ "grad_norm": 0.5535634109353079,
4565
+ "learning_rate": 8.294666666666667e-06,
4566
+ "loss": 2.3894,
4567
+ "step": 12675
4568
+ },
4569
+ {
4570
+ "epoch": 0.254,
4571
+ "grad_norm": 0.5647542662126371,
4572
+ "learning_rate": 8.289111111111112e-06,
4573
+ "loss": 2.3889,
4574
+ "step": 12700
4575
+ },
4576
+ {
4577
+ "epoch": 0.254,
4578
+ "eval_loss": 2.4015073776245117,
4579
+ "eval_runtime": 31.6682,
4580
+ "eval_samples_per_second": 3.221,
4581
+ "eval_steps_per_second": 1.61,
4582
+ "step": 12700
4583
+ },
4584
+ {
4585
+ "epoch": 0.2545,
4586
+ "grad_norm": 0.5689860381748764,
4587
+ "learning_rate": 8.283555555555556e-06,
4588
+ "loss": 2.391,
4589
+ "step": 12725
4590
+ },
4591
+ {
4592
+ "epoch": 0.255,
4593
+ "grad_norm": 0.5788815220722723,
4594
+ "learning_rate": 8.278000000000001e-06,
4595
+ "loss": 2.3746,
4596
+ "step": 12750
4597
+ },
4598
+ {
4599
+ "epoch": 0.2555,
4600
+ "grad_norm": 0.5746385277305921,
4601
+ "learning_rate": 8.272444444444445e-06,
4602
+ "loss": 2.3884,
4603
+ "step": 12775
4604
+ },
4605
+ {
4606
+ "epoch": 0.256,
4607
+ "grad_norm": 0.5952261074381101,
4608
+ "learning_rate": 8.26688888888889e-06,
4609
+ "loss": 2.387,
4610
+ "step": 12800
4611
+ },
4612
+ {
4613
+ "epoch": 0.256,
4614
+ "eval_loss": 2.401090383529663,
4615
+ "eval_runtime": 31.7518,
4616
+ "eval_samples_per_second": 3.212,
4617
+ "eval_steps_per_second": 1.606,
4618
+ "step": 12800
4619
+ },
4620
+ {
4621
+ "epoch": 0.2565,
4622
+ "grad_norm": 0.581914246490724,
4623
+ "learning_rate": 8.261333333333335e-06,
4624
+ "loss": 2.3879,
4625
+ "step": 12825
4626
+ },
4627
+ {
4628
+ "epoch": 0.257,
4629
+ "grad_norm": 0.5582195018164189,
4630
+ "learning_rate": 8.255777777777779e-06,
4631
+ "loss": 2.3783,
4632
+ "step": 12850
4633
+ },
4634
+ {
4635
+ "epoch": 0.2575,
4636
+ "grad_norm": 0.5633036552978725,
4637
+ "learning_rate": 8.250222222222222e-06,
4638
+ "loss": 2.3845,
4639
+ "step": 12875
4640
+ },
4641
+ {
4642
+ "epoch": 0.258,
4643
+ "grad_norm": 0.5613155523789654,
4644
+ "learning_rate": 8.244666666666667e-06,
4645
+ "loss": 2.3942,
4646
+ "step": 12900
4647
+ },
4648
+ {
4649
+ "epoch": 0.258,
4650
+ "eval_loss": 2.4014108180999756,
4651
+ "eval_runtime": 31.8052,
4652
+ "eval_samples_per_second": 3.207,
4653
+ "eval_steps_per_second": 1.604,
4654
+ "step": 12900
4655
+ },
4656
+ {
4657
+ "epoch": 0.2585,
4658
+ "grad_norm": 0.5906307979751212,
4659
+ "learning_rate": 8.239111111111113e-06,
4660
+ "loss": 2.3807,
4661
+ "step": 12925
4662
+ },
4663
+ {
4664
+ "epoch": 0.259,
4665
+ "grad_norm": 0.5786593603781868,
4666
+ "learning_rate": 8.233555555555556e-06,
4667
+ "loss": 2.3848,
4668
+ "step": 12950
4669
+ },
4670
+ {
4671
+ "epoch": 0.2595,
4672
+ "grad_norm": 0.5739057988147651,
4673
+ "learning_rate": 8.228e-06,
4674
+ "loss": 2.3841,
4675
+ "step": 12975
4676
+ },
4677
+ {
4678
+ "epoch": 0.26,
4679
+ "grad_norm": 0.5727067411665359,
4680
+ "learning_rate": 8.222444444444445e-06,
4681
+ "loss": 2.3771,
4682
+ "step": 13000
4683
+ },
4684
+ {
4685
+ "epoch": 0.26,
4686
+ "eval_loss": 2.4009385108947754,
4687
+ "eval_runtime": 31.8075,
4688
+ "eval_samples_per_second": 3.207,
4689
+ "eval_steps_per_second": 1.603,
4690
+ "step": 13000
4691
  }
4692
  ],
4693
  "logging_steps": 25,
 
4707
  "attributes": {}
4708
  }
4709
  },
4710
+ "total_flos": 4.138162987825365e+19,
4711
  "train_batch_size": 1,
4712
  "trial_name": null,
4713
  "trial_params": null