irodkin commited on
Commit
cc5b89a
·
verified ·
1 Parent(s): 16889f5

Training checkpoint at step 13000

Browse files
Files changed (1) hide show
  1. trainer_state.json +186 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 12400,
3
- "best_metric": 2.535456657409668,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-12000",
5
- "epoch": 0.25,
6
  "eval_steps": 100,
7
- "global_step": 12500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4508,6 +4508,186 @@
4508
  "eval_samples_per_second": 2.473,
4509
  "eval_steps_per_second": 1.237,
4510
  "step": 12500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4511
  }
4512
  ],
4513
  "logging_steps": 25,
@@ -4527,7 +4707,7 @@
4527
  "attributes": {}
4528
  }
4529
  },
4530
- "total_flos": 2.8052741913602687e+19,
4531
  "train_batch_size": 1,
4532
  "trial_name": null,
4533
  "trial_params": null
 
1
  {
2
+ "best_global_step": 13000,
3
+ "best_metric": 2.532376766204834,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-13000",
5
+ "epoch": 0.26,
6
  "eval_steps": 100,
7
+ "global_step": 13000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4508
  "eval_samples_per_second": 2.473,
4509
  "eval_steps_per_second": 1.237,
4510
  "step": 12500
4511
+ },
4512
+ {
4513
+ "epoch": 0.2505,
4514
+ "grad_norm": 2.019140906115923,
4515
+ "learning_rate": 8.328e-06,
4516
+ "loss": 2.5241,
4517
+ "step": 12525
4518
+ },
4519
+ {
4520
+ "epoch": 0.251,
4521
+ "grad_norm": 1.9012303831260067,
4522
+ "learning_rate": 8.322444444444446e-06,
4523
+ "loss": 2.5354,
4524
+ "step": 12550
4525
+ },
4526
+ {
4527
+ "epoch": 0.2515,
4528
+ "grad_norm": 1.7607101331370496,
4529
+ "learning_rate": 8.31688888888889e-06,
4530
+ "loss": 2.5254,
4531
+ "step": 12575
4532
+ },
4533
+ {
4534
+ "epoch": 0.252,
4535
+ "grad_norm": 2.5505055208286933,
4536
+ "learning_rate": 8.311333333333333e-06,
4537
+ "loss": 2.5294,
4538
+ "step": 12600
4539
+ },
4540
+ {
4541
+ "epoch": 0.252,
4542
+ "eval_loss": 2.535231351852417,
4543
+ "eval_runtime": 41.9731,
4544
+ "eval_samples_per_second": 2.478,
4545
+ "eval_steps_per_second": 1.239,
4546
+ "step": 12600
4547
+ },
4548
+ {
4549
+ "epoch": 0.2525,
4550
+ "grad_norm": 1.6218420390627293,
4551
+ "learning_rate": 8.305777777777778e-06,
4552
+ "loss": 2.5262,
4553
+ "step": 12625
4554
+ },
4555
+ {
4556
+ "epoch": 0.253,
4557
+ "grad_norm": 2.0991897222525115,
4558
+ "learning_rate": 8.300222222222223e-06,
4559
+ "loss": 2.5206,
4560
+ "step": 12650
4561
+ },
4562
+ {
4563
+ "epoch": 0.2535,
4564
+ "grad_norm": 2.478785246720621,
4565
+ "learning_rate": 8.294666666666667e-06,
4566
+ "loss": 2.5275,
4567
+ "step": 12675
4568
+ },
4569
+ {
4570
+ "epoch": 0.254,
4571
+ "grad_norm": 2.141371973093057,
4572
+ "learning_rate": 8.289111111111112e-06,
4573
+ "loss": 2.5323,
4574
+ "step": 12700
4575
+ },
4576
+ {
4577
+ "epoch": 0.254,
4578
+ "eval_loss": 2.5341796875,
4579
+ "eval_runtime": 42.2622,
4580
+ "eval_samples_per_second": 2.461,
4581
+ "eval_steps_per_second": 1.23,
4582
+ "step": 12700
4583
+ },
4584
+ {
4585
+ "epoch": 0.2545,
4586
+ "grad_norm": 2.269733740633448,
4587
+ "learning_rate": 8.283555555555556e-06,
4588
+ "loss": 2.5367,
4589
+ "step": 12725
4590
+ },
4591
+ {
4592
+ "epoch": 0.255,
4593
+ "grad_norm": 1.893617133257015,
4594
+ "learning_rate": 8.278000000000001e-06,
4595
+ "loss": 2.5257,
4596
+ "step": 12750
4597
+ },
4598
+ {
4599
+ "epoch": 0.2555,
4600
+ "grad_norm": 1.751381032940087,
4601
+ "learning_rate": 8.272444444444445e-06,
4602
+ "loss": 2.5276,
4603
+ "step": 12775
4604
+ },
4605
+ {
4606
+ "epoch": 0.256,
4607
+ "grad_norm": 2.6264391487699545,
4608
+ "learning_rate": 8.26688888888889e-06,
4609
+ "loss": 2.5281,
4610
+ "step": 12800
4611
+ },
4612
+ {
4613
+ "epoch": 0.256,
4614
+ "eval_loss": 2.534780740737915,
4615
+ "eval_runtime": 42.0037,
4616
+ "eval_samples_per_second": 2.476,
4617
+ "eval_steps_per_second": 1.238,
4618
+ "step": 12800
4619
+ },
4620
+ {
4621
+ "epoch": 0.2565,
4622
+ "grad_norm": 2.9544216590918766,
4623
+ "learning_rate": 8.261333333333335e-06,
4624
+ "loss": 2.5159,
4625
+ "step": 12825
4626
+ },
4627
+ {
4628
+ "epoch": 0.257,
4629
+ "grad_norm": 1.703574826031134,
4630
+ "learning_rate": 8.255777777777779e-06,
4631
+ "loss": 2.5314,
4632
+ "step": 12850
4633
+ },
4634
+ {
4635
+ "epoch": 0.2575,
4636
+ "grad_norm": 2.23456733038464,
4637
+ "learning_rate": 8.250222222222222e-06,
4638
+ "loss": 2.5301,
4639
+ "step": 12875
4640
+ },
4641
+ {
4642
+ "epoch": 0.258,
4643
+ "grad_norm": 2.0236952351089132,
4644
+ "learning_rate": 8.244666666666667e-06,
4645
+ "loss": 2.5274,
4646
+ "step": 12900
4647
+ },
4648
+ {
4649
+ "epoch": 0.258,
4650
+ "eval_loss": 2.532827615737915,
4651
+ "eval_runtime": 42.2742,
4652
+ "eval_samples_per_second": 2.46,
4653
+ "eval_steps_per_second": 1.23,
4654
+ "step": 12900
4655
+ },
4656
+ {
4657
+ "epoch": 0.2585,
4658
+ "grad_norm": 1.9175658573019432,
4659
+ "learning_rate": 8.239111111111113e-06,
4660
+ "loss": 2.5293,
4661
+ "step": 12925
4662
+ },
4663
+ {
4664
+ "epoch": 0.259,
4665
+ "grad_norm": 2.227745372848629,
4666
+ "learning_rate": 8.233555555555556e-06,
4667
+ "loss": 2.5346,
4668
+ "step": 12950
4669
+ },
4670
+ {
4671
+ "epoch": 0.2595,
4672
+ "grad_norm": 2.0320264112024375,
4673
+ "learning_rate": 8.228e-06,
4674
+ "loss": 2.5133,
4675
+ "step": 12975
4676
+ },
4677
+ {
4678
+ "epoch": 0.26,
4679
+ "grad_norm": 2.3254627331546636,
4680
+ "learning_rate": 8.222444444444445e-06,
4681
+ "loss": 2.5257,
4682
+ "step": 13000
4683
+ },
4684
+ {
4685
+ "epoch": 0.26,
4686
+ "eval_loss": 2.532376766204834,
4687
+ "eval_runtime": 42.0555,
4688
+ "eval_samples_per_second": 2.473,
4689
+ "eval_steps_per_second": 1.236,
4690
+ "step": 13000
4691
  }
4692
  ],
4693
  "logging_steps": 25,
 
4707
  "attributes": {}
4708
  }
4709
  },
4710
+ "total_flos": 2.9174851597705937e+19,
4711
  "train_batch_size": 1,
4712
  "trial_name": null,
4713
  "trial_params": null