irodkin commited on
Commit
ec50f69
·
verified ·
1 Parent(s): 5469f05

Training checkpoint at step 8000

Browse files
Files changed (1) hide show
  1. trainer_state.json +366 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 7000,
3
- "best_metric": 2.415269374847412,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-7000",
5
- "epoch": 0.14,
6
  "eval_steps": 100,
7
- "global_step": 7000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2528,6 +2528,366 @@
2528
  "eval_samples_per_second": 3.212,
2529
  "eval_steps_per_second": 1.606,
2530
  "step": 7000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2531
  }
2532
  ],
2533
  "logging_steps": 25,
@@ -2547,7 +2907,7 @@
2547
  "attributes": {}
2548
  }
2549
  },
2550
- "total_flos": 2.2282416088290427e+19,
2551
  "train_batch_size": 1,
2552
  "trial_name": null,
2553
  "trial_params": null
 
1
  {
2
+ "best_global_step": 8000,
3
+ "best_metric": 2.4125914573669434,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-8000",
5
+ "epoch": 0.16,
6
  "eval_steps": 100,
7
+ "global_step": 8000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2528
  "eval_samples_per_second": 3.212,
2529
  "eval_steps_per_second": 1.606,
2530
  "step": 7000
2531
+ },
2532
+ {
2533
+ "epoch": 0.1405,
2534
+ "grad_norm": 0.5975652527083385,
2535
+ "learning_rate": 9.550222222222223e-06,
2536
+ "loss": 2.398,
2537
+ "step": 7025
2538
+ },
2539
+ {
2540
+ "epoch": 0.141,
2541
+ "grad_norm": 0.5642285559875744,
2542
+ "learning_rate": 9.544666666666667e-06,
2543
+ "loss": 2.3907,
2544
+ "step": 7050
2545
+ },
2546
+ {
2547
+ "epoch": 0.1415,
2548
+ "grad_norm": 0.5977243463765347,
2549
+ "learning_rate": 9.539111111111112e-06,
2550
+ "loss": 2.4063,
2551
+ "step": 7075
2552
+ },
2553
+ {
2554
+ "epoch": 0.142,
2555
+ "grad_norm": 0.5938091922766982,
2556
+ "learning_rate": 9.533555555555556e-06,
2557
+ "loss": 2.4064,
2558
+ "step": 7100
2559
+ },
2560
+ {
2561
+ "epoch": 0.142,
2562
+ "eval_loss": 2.4153244495391846,
2563
+ "eval_runtime": 31.6856,
2564
+ "eval_samples_per_second": 3.219,
2565
+ "eval_steps_per_second": 1.61,
2566
+ "step": 7100
2567
+ },
2568
+ {
2569
+ "epoch": 0.1425,
2570
+ "grad_norm": 0.6203811817044198,
2571
+ "learning_rate": 9.528000000000001e-06,
2572
+ "loss": 2.3995,
2573
+ "step": 7125
2574
+ },
2575
+ {
2576
+ "epoch": 0.143,
2577
+ "grad_norm": 0.5748373728564159,
2578
+ "learning_rate": 9.522444444444444e-06,
2579
+ "loss": 2.4052,
2580
+ "step": 7150
2581
+ },
2582
+ {
2583
+ "epoch": 0.1435,
2584
+ "grad_norm": 0.6318360721408016,
2585
+ "learning_rate": 9.51688888888889e-06,
2586
+ "loss": 2.396,
2587
+ "step": 7175
2588
+ },
2589
+ {
2590
+ "epoch": 0.144,
2591
+ "grad_norm": 0.5777480191110791,
2592
+ "learning_rate": 9.511333333333335e-06,
2593
+ "loss": 2.3966,
2594
+ "step": 7200
2595
+ },
2596
+ {
2597
+ "epoch": 0.144,
2598
+ "eval_loss": 2.414691209793091,
2599
+ "eval_runtime": 31.5495,
2600
+ "eval_samples_per_second": 3.233,
2601
+ "eval_steps_per_second": 1.617,
2602
+ "step": 7200
2603
+ },
2604
+ {
2605
+ "epoch": 0.1445,
2606
+ "grad_norm": 0.5896122820881663,
2607
+ "learning_rate": 9.505777777777779e-06,
2608
+ "loss": 2.4018,
2609
+ "step": 7225
2610
+ },
2611
+ {
2612
+ "epoch": 0.145,
2613
+ "grad_norm": 0.6081675838061575,
2614
+ "learning_rate": 9.500222222222222e-06,
2615
+ "loss": 2.4036,
2616
+ "step": 7250
2617
+ },
2618
+ {
2619
+ "epoch": 0.1455,
2620
+ "grad_norm": 0.6032973832585987,
2621
+ "learning_rate": 9.494666666666667e-06,
2622
+ "loss": 2.4025,
2623
+ "step": 7275
2624
+ },
2625
+ {
2626
+ "epoch": 0.146,
2627
+ "grad_norm": 0.6283775464354142,
2628
+ "learning_rate": 9.489111111111113e-06,
2629
+ "loss": 2.4078,
2630
+ "step": 7300
2631
+ },
2632
+ {
2633
+ "epoch": 0.146,
2634
+ "eval_loss": 2.4143505096435547,
2635
+ "eval_runtime": 31.4643,
2636
+ "eval_samples_per_second": 3.242,
2637
+ "eval_steps_per_second": 1.621,
2638
+ "step": 7300
2639
+ },
2640
+ {
2641
+ "epoch": 0.1465,
2642
+ "grad_norm": 0.5969038728051346,
2643
+ "learning_rate": 9.483555555555556e-06,
2644
+ "loss": 2.4066,
2645
+ "step": 7325
2646
+ },
2647
+ {
2648
+ "epoch": 0.147,
2649
+ "grad_norm": 0.6048317665387537,
2650
+ "learning_rate": 9.478e-06,
2651
+ "loss": 2.4007,
2652
+ "step": 7350
2653
+ },
2654
+ {
2655
+ "epoch": 0.1475,
2656
+ "grad_norm": 0.5721050600021237,
2657
+ "learning_rate": 9.472444444444445e-06,
2658
+ "loss": 2.4146,
2659
+ "step": 7375
2660
+ },
2661
+ {
2662
+ "epoch": 0.148,
2663
+ "grad_norm": 0.6019256818391423,
2664
+ "learning_rate": 9.46688888888889e-06,
2665
+ "loss": 2.399,
2666
+ "step": 7400
2667
+ },
2668
+ {
2669
+ "epoch": 0.148,
2670
+ "eval_loss": 2.414281129837036,
2671
+ "eval_runtime": 31.7034,
2672
+ "eval_samples_per_second": 3.217,
2673
+ "eval_steps_per_second": 1.609,
2674
+ "step": 7400
2675
+ },
2676
+ {
2677
+ "epoch": 0.1485,
2678
+ "grad_norm": 0.6386043502919573,
2679
+ "learning_rate": 9.461333333333334e-06,
2680
+ "loss": 2.3957,
2681
+ "step": 7425
2682
+ },
2683
+ {
2684
+ "epoch": 0.149,
2685
+ "grad_norm": 0.5819226766027404,
2686
+ "learning_rate": 9.455777777777777e-06,
2687
+ "loss": 2.4001,
2688
+ "step": 7450
2689
+ },
2690
+ {
2691
+ "epoch": 0.1495,
2692
+ "grad_norm": 0.6372396676223023,
2693
+ "learning_rate": 9.450222222222223e-06,
2694
+ "loss": 2.3976,
2695
+ "step": 7475
2696
+ },
2697
+ {
2698
+ "epoch": 0.15,
2699
+ "grad_norm": 0.5888017578283452,
2700
+ "learning_rate": 9.444666666666668e-06,
2701
+ "loss": 2.4008,
2702
+ "step": 7500
2703
+ },
2704
+ {
2705
+ "epoch": 0.15,
2706
+ "eval_loss": 2.414154291152954,
2707
+ "eval_runtime": 31.8152,
2708
+ "eval_samples_per_second": 3.206,
2709
+ "eval_steps_per_second": 1.603,
2710
+ "step": 7500
2711
+ },
2712
+ {
2713
+ "epoch": 0.1505,
2714
+ "grad_norm": 0.6132781564549638,
2715
+ "learning_rate": 9.439111111111111e-06,
2716
+ "loss": 2.4077,
2717
+ "step": 7525
2718
+ },
2719
+ {
2720
+ "epoch": 0.151,
2721
+ "grad_norm": 0.6063002641957036,
2722
+ "learning_rate": 9.433555555555557e-06,
2723
+ "loss": 2.3889,
2724
+ "step": 7550
2725
+ },
2726
+ {
2727
+ "epoch": 0.1515,
2728
+ "grad_norm": 0.614169638364484,
2729
+ "learning_rate": 9.428e-06,
2730
+ "loss": 2.4121,
2731
+ "step": 7575
2732
+ },
2733
+ {
2734
+ "epoch": 0.152,
2735
+ "grad_norm": 0.5826866596297434,
2736
+ "learning_rate": 9.422444444444445e-06,
2737
+ "loss": 2.4075,
2738
+ "step": 7600
2739
+ },
2740
+ {
2741
+ "epoch": 0.152,
2742
+ "eval_loss": 2.414039134979248,
2743
+ "eval_runtime": 31.7985,
2744
+ "eval_samples_per_second": 3.208,
2745
+ "eval_steps_per_second": 1.604,
2746
+ "step": 7600
2747
+ },
2748
+ {
2749
+ "epoch": 0.1525,
2750
+ "grad_norm": 0.5964985955677213,
2751
+ "learning_rate": 9.41688888888889e-06,
2752
+ "loss": 2.3976,
2753
+ "step": 7625
2754
+ },
2755
+ {
2756
+ "epoch": 0.153,
2757
+ "grad_norm": 0.5946671745059025,
2758
+ "learning_rate": 9.411333333333334e-06,
2759
+ "loss": 2.3947,
2760
+ "step": 7650
2761
+ },
2762
+ {
2763
+ "epoch": 0.1535,
2764
+ "grad_norm": 0.5894909865358033,
2765
+ "learning_rate": 9.405777777777778e-06,
2766
+ "loss": 2.4079,
2767
+ "step": 7675
2768
+ },
2769
+ {
2770
+ "epoch": 0.154,
2771
+ "grad_norm": 0.6048420481174572,
2772
+ "learning_rate": 9.400222222222223e-06,
2773
+ "loss": 2.4015,
2774
+ "step": 7700
2775
+ },
2776
+ {
2777
+ "epoch": 0.154,
2778
+ "eval_loss": 2.413475275039673,
2779
+ "eval_runtime": 31.9136,
2780
+ "eval_samples_per_second": 3.196,
2781
+ "eval_steps_per_second": 1.598,
2782
+ "step": 7700
2783
+ },
2784
+ {
2785
+ "epoch": 0.1545,
2786
+ "grad_norm": 0.617559481688582,
2787
+ "learning_rate": 9.394666666666668e-06,
2788
+ "loss": 2.4036,
2789
+ "step": 7725
2790
+ },
2791
+ {
2792
+ "epoch": 0.155,
2793
+ "grad_norm": 0.6350332331451685,
2794
+ "learning_rate": 9.389111111111112e-06,
2795
+ "loss": 2.3989,
2796
+ "step": 7750
2797
+ },
2798
+ {
2799
+ "epoch": 0.1555,
2800
+ "grad_norm": 0.6034892604414784,
2801
+ "learning_rate": 9.383555555555557e-06,
2802
+ "loss": 2.398,
2803
+ "step": 7775
2804
+ },
2805
+ {
2806
+ "epoch": 0.156,
2807
+ "grad_norm": 0.5879016941841427,
2808
+ "learning_rate": 9.378e-06,
2809
+ "loss": 2.3989,
2810
+ "step": 7800
2811
+ },
2812
+ {
2813
+ "epoch": 0.156,
2814
+ "eval_loss": 2.4134128093719482,
2815
+ "eval_runtime": 31.7809,
2816
+ "eval_samples_per_second": 3.209,
2817
+ "eval_steps_per_second": 1.605,
2818
+ "step": 7800
2819
+ },
2820
+ {
2821
+ "epoch": 0.1565,
2822
+ "grad_norm": 0.5957060592966067,
2823
+ "learning_rate": 9.372444444444446e-06,
2824
+ "loss": 2.3951,
2825
+ "step": 7825
2826
+ },
2827
+ {
2828
+ "epoch": 0.157,
2829
+ "grad_norm": 0.6127788552445546,
2830
+ "learning_rate": 9.36688888888889e-06,
2831
+ "loss": 2.3966,
2832
+ "step": 7850
2833
+ },
2834
+ {
2835
+ "epoch": 0.1575,
2836
+ "grad_norm": 0.6103495429829666,
2837
+ "learning_rate": 9.361333333333335e-06,
2838
+ "loss": 2.3974,
2839
+ "step": 7875
2840
+ },
2841
+ {
2842
+ "epoch": 0.158,
2843
+ "grad_norm": 0.5940303847498369,
2844
+ "learning_rate": 9.355777777777778e-06,
2845
+ "loss": 2.3982,
2846
+ "step": 7900
2847
+ },
2848
+ {
2849
+ "epoch": 0.158,
2850
+ "eval_loss": 2.4130520820617676,
2851
+ "eval_runtime": 31.8718,
2852
+ "eval_samples_per_second": 3.2,
2853
+ "eval_steps_per_second": 1.6,
2854
+ "step": 7900
2855
+ },
2856
+ {
2857
+ "epoch": 0.1585,
2858
+ "grad_norm": 0.5967208318826438,
2859
+ "learning_rate": 9.350222222222224e-06,
2860
+ "loss": 2.3963,
2861
+ "step": 7925
2862
+ },
2863
+ {
2864
+ "epoch": 0.159,
2865
+ "grad_norm": 0.6074697420049116,
2866
+ "learning_rate": 9.344666666666667e-06,
2867
+ "loss": 2.4004,
2868
+ "step": 7950
2869
+ },
2870
+ {
2871
+ "epoch": 0.1595,
2872
+ "grad_norm": 0.6007548308453654,
2873
+ "learning_rate": 9.339111111111112e-06,
2874
+ "loss": 2.3972,
2875
+ "step": 7975
2876
+ },
2877
+ {
2878
+ "epoch": 0.16,
2879
+ "grad_norm": 0.6058573477149505,
2880
+ "learning_rate": 9.333555555555558e-06,
2881
+ "loss": 2.4,
2882
+ "step": 8000
2883
+ },
2884
+ {
2885
+ "epoch": 0.16,
2886
+ "eval_loss": 2.4125914573669434,
2887
+ "eval_runtime": 31.8819,
2888
+ "eval_samples_per_second": 3.199,
2889
+ "eval_steps_per_second": 1.6,
2890
+ "step": 8000
2891
  }
2892
  ],
2893
  "logging_steps": 25,
 
2907
  "attributes": {}
2908
  }
2909
  },
2910
+ "total_flos": 2.546561838661763e+19,
2911
  "train_batch_size": 1,
2912
  "trial_name": null,
2913
  "trial_params": null