irodkin commited on
Commit
b26eec1
·
verified ·
1 Parent(s): 08eb02b

Training checkpoint at step 3000

Browse files
Files changed (1) hide show
  1. trainer_state.json +366 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 2000,
3
- "best_metric": 2.450512647628784,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_30/checkpoint-2000",
5
- "epoch": 0.04,
6
  "eval_steps": 100,
7
- "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -728,6 +728,366 @@
728
  "eval_samples_per_second": 3.516,
729
  "eval_steps_per_second": 1.773,
730
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
731
  }
732
  ],
733
  "logging_steps": 25,
@@ -747,7 +1107,7 @@
747
  "attributes": {}
748
  }
749
  },
750
- "total_flos": 5.570603850273915e+18,
751
  "train_batch_size": 1,
752
  "trial_name": null,
753
  "trial_params": null
 
1
  {
2
+ "best_global_step": 3000,
3
+ "best_metric": 2.436969041824341,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_30/checkpoint-3000",
5
+ "epoch": 0.06,
6
  "eval_steps": 100,
7
+ "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
728
  "eval_samples_per_second": 3.516,
729
  "eval_steps_per_second": 1.773,
730
  "step": 2000
731
+ },
732
+ {
733
+ "epoch": 0.0405,
734
+ "grad_norm": 0.034945541932647324,
735
+ "learning_rate": 4.048e-06,
736
+ "loss": 2.4357,
737
+ "step": 2025
738
+ },
739
+ {
740
+ "epoch": 0.041,
741
+ "grad_norm": 0.029322959861707003,
742
+ "learning_rate": 4.098e-06,
743
+ "loss": 2.4373,
744
+ "step": 2050
745
+ },
746
+ {
747
+ "epoch": 0.0415,
748
+ "grad_norm": 0.027365033479394632,
749
+ "learning_rate": 4.148000000000001e-06,
750
+ "loss": 2.442,
751
+ "step": 2075
752
+ },
753
+ {
754
+ "epoch": 0.042,
755
+ "grad_norm": 0.042214130565513416,
756
+ "learning_rate": 4.198e-06,
757
+ "loss": 2.4362,
758
+ "step": 2100
759
+ },
760
+ {
761
+ "epoch": 0.042,
762
+ "eval_loss": 2.448322296142578,
763
+ "eval_runtime": 33.466,
764
+ "eval_samples_per_second": 3.496,
765
+ "eval_steps_per_second": 1.763,
766
+ "step": 2100
767
+ },
768
+ {
769
+ "epoch": 0.0425,
770
+ "grad_norm": 0.028874346576168566,
771
+ "learning_rate": 4.248000000000001e-06,
772
+ "loss": 2.4428,
773
+ "step": 2125
774
+ },
775
+ {
776
+ "epoch": 0.043,
777
+ "grad_norm": 0.029771861998040296,
778
+ "learning_rate": 4.298e-06,
779
+ "loss": 2.4298,
780
+ "step": 2150
781
+ },
782
+ {
783
+ "epoch": 0.0435,
784
+ "grad_norm": 0.029668415484575914,
785
+ "learning_rate": 4.3480000000000006e-06,
786
+ "loss": 2.4352,
787
+ "step": 2175
788
+ },
789
+ {
790
+ "epoch": 0.044,
791
+ "grad_norm": 0.02564927582570633,
792
+ "learning_rate": 4.398000000000001e-06,
793
+ "loss": 2.4349,
794
+ "step": 2200
795
+ },
796
+ {
797
+ "epoch": 0.044,
798
+ "eval_loss": 2.4465889930725098,
799
+ "eval_runtime": 33.3555,
800
+ "eval_samples_per_second": 3.508,
801
+ "eval_steps_per_second": 1.769,
802
+ "step": 2200
803
+ },
804
+ {
805
+ "epoch": 0.0445,
806
+ "grad_norm": 0.024797235968250814,
807
+ "learning_rate": 4.4480000000000004e-06,
808
+ "loss": 2.4409,
809
+ "step": 2225
810
+ },
811
+ {
812
+ "epoch": 0.045,
813
+ "grad_norm": 0.02813189377877088,
814
+ "learning_rate": 4.498e-06,
815
+ "loss": 2.4367,
816
+ "step": 2250
817
+ },
818
+ {
819
+ "epoch": 0.0455,
820
+ "grad_norm": 0.02750903211389184,
821
+ "learning_rate": 4.548e-06,
822
+ "loss": 2.4326,
823
+ "step": 2275
824
+ },
825
+ {
826
+ "epoch": 0.046,
827
+ "grad_norm": 0.027737559952553607,
828
+ "learning_rate": 4.598e-06,
829
+ "loss": 2.4375,
830
+ "step": 2300
831
+ },
832
+ {
833
+ "epoch": 0.046,
834
+ "eval_loss": 2.4448626041412354,
835
+ "eval_runtime": 33.2658,
836
+ "eval_samples_per_second": 3.517,
837
+ "eval_steps_per_second": 1.774,
838
+ "step": 2300
839
+ },
840
+ {
841
+ "epoch": 0.0465,
842
+ "grad_norm": 0.02630663299301831,
843
+ "learning_rate": 4.648e-06,
844
+ "loss": 2.4392,
845
+ "step": 2325
846
+ },
847
+ {
848
+ "epoch": 0.047,
849
+ "grad_norm": 0.027929449055597393,
850
+ "learning_rate": 4.698000000000001e-06,
851
+ "loss": 2.4256,
852
+ "step": 2350
853
+ },
854
+ {
855
+ "epoch": 0.0475,
856
+ "grad_norm": 0.0283193243102273,
857
+ "learning_rate": 4.748e-06,
858
+ "loss": 2.429,
859
+ "step": 2375
860
+ },
861
+ {
862
+ "epoch": 0.048,
863
+ "grad_norm": 0.029295313451333963,
864
+ "learning_rate": 4.7980000000000005e-06,
865
+ "loss": 2.4393,
866
+ "step": 2400
867
+ },
868
+ {
869
+ "epoch": 0.048,
870
+ "eval_loss": 2.4432175159454346,
871
+ "eval_runtime": 33.3067,
872
+ "eval_samples_per_second": 3.513,
873
+ "eval_steps_per_second": 1.771,
874
+ "step": 2400
875
+ },
876
+ {
877
+ "epoch": 0.0485,
878
+ "grad_norm": 0.025382897552394503,
879
+ "learning_rate": 4.848000000000001e-06,
880
+ "loss": 2.4322,
881
+ "step": 2425
882
+ },
883
+ {
884
+ "epoch": 0.049,
885
+ "grad_norm": 0.02450548193909556,
886
+ "learning_rate": 4.898e-06,
887
+ "loss": 2.4314,
888
+ "step": 2450
889
+ },
890
+ {
891
+ "epoch": 0.0495,
892
+ "grad_norm": 0.033065483070063684,
893
+ "learning_rate": 4.948000000000001e-06,
894
+ "loss": 2.4338,
895
+ "step": 2475
896
+ },
897
+ {
898
+ "epoch": 0.05,
899
+ "grad_norm": 0.027543894857825314,
900
+ "learning_rate": 4.998e-06,
901
+ "loss": 2.4333,
902
+ "step": 2500
903
+ },
904
+ {
905
+ "epoch": 0.05,
906
+ "eval_loss": 2.441807985305786,
907
+ "eval_runtime": 33.0379,
908
+ "eval_samples_per_second": 3.541,
909
+ "eval_steps_per_second": 1.786,
910
+ "step": 2500
911
+ },
912
+ {
913
+ "epoch": 0.0505,
914
+ "grad_norm": 0.027354239436717945,
915
+ "learning_rate": 5.048000000000001e-06,
916
+ "loss": 2.439,
917
+ "step": 2525
918
+ },
919
+ {
920
+ "epoch": 0.051,
921
+ "grad_norm": 0.022458884368301627,
922
+ "learning_rate": 5.098000000000001e-06,
923
+ "loss": 2.427,
924
+ "step": 2550
925
+ },
926
+ {
927
+ "epoch": 0.0515,
928
+ "grad_norm": 0.033350881745701555,
929
+ "learning_rate": 5.1480000000000005e-06,
930
+ "loss": 2.4275,
931
+ "step": 2575
932
+ },
933
+ {
934
+ "epoch": 0.052,
935
+ "grad_norm": 0.025032545530163004,
936
+ "learning_rate": 5.198000000000001e-06,
937
+ "loss": 2.4275,
938
+ "step": 2600
939
+ },
940
+ {
941
+ "epoch": 0.052,
942
+ "eval_loss": 2.440882444381714,
943
+ "eval_runtime": 33.1835,
944
+ "eval_samples_per_second": 3.526,
945
+ "eval_steps_per_second": 1.778,
946
+ "step": 2600
947
+ },
948
+ {
949
+ "epoch": 0.0525,
950
+ "grad_norm": 0.026294170044068685,
951
+ "learning_rate": 5.248000000000001e-06,
952
+ "loss": 2.4312,
953
+ "step": 2625
954
+ },
955
+ {
956
+ "epoch": 0.053,
957
+ "grad_norm": 0.03301155351988982,
958
+ "learning_rate": 5.298000000000001e-06,
959
+ "loss": 2.4203,
960
+ "step": 2650
961
+ },
962
+ {
963
+ "epoch": 0.0535,
964
+ "grad_norm": 0.02389586194961339,
965
+ "learning_rate": 5.348000000000001e-06,
966
+ "loss": 2.4332,
967
+ "step": 2675
968
+ },
969
+ {
970
+ "epoch": 0.054,
971
+ "grad_norm": 0.056862279743176244,
972
+ "learning_rate": 5.398e-06,
973
+ "loss": 2.4313,
974
+ "step": 2700
975
+ },
976
+ {
977
+ "epoch": 0.054,
978
+ "eval_loss": 2.4402644634246826,
979
+ "eval_runtime": 33.2071,
980
+ "eval_samples_per_second": 3.523,
981
+ "eval_steps_per_second": 1.777,
982
+ "step": 2700
983
+ },
984
+ {
985
+ "epoch": 0.0545,
986
+ "grad_norm": 0.025636671246445756,
987
+ "learning_rate": 5.448e-06,
988
+ "loss": 2.4311,
989
+ "step": 2725
990
+ },
991
+ {
992
+ "epoch": 0.055,
993
+ "grad_norm": 0.022083605910153424,
994
+ "learning_rate": 5.498e-06,
995
+ "loss": 2.4357,
996
+ "step": 2750
997
+ },
998
+ {
999
+ "epoch": 0.0555,
1000
+ "grad_norm": 0.024223735712298522,
1001
+ "learning_rate": 5.548e-06,
1002
+ "loss": 2.4294,
1003
+ "step": 2775
1004
+ },
1005
+ {
1006
+ "epoch": 0.056,
1007
+ "grad_norm": 0.029847698463432104,
1008
+ "learning_rate": 5.5980000000000004e-06,
1009
+ "loss": 2.4344,
1010
+ "step": 2800
1011
+ },
1012
+ {
1013
+ "epoch": 0.056,
1014
+ "eval_loss": 2.4389007091522217,
1015
+ "eval_runtime": 33.2705,
1016
+ "eval_samples_per_second": 3.517,
1017
+ "eval_steps_per_second": 1.773,
1018
+ "step": 2800
1019
+ },
1020
+ {
1021
+ "epoch": 0.0565,
1022
+ "grad_norm": 0.032144633236930065,
1023
+ "learning_rate": 5.648e-06,
1024
+ "loss": 2.4282,
1025
+ "step": 2825
1026
+ },
1027
+ {
1028
+ "epoch": 0.057,
1029
+ "grad_norm": 0.02355863809037046,
1030
+ "learning_rate": 5.698e-06,
1031
+ "loss": 2.4322,
1032
+ "step": 2850
1033
+ },
1034
+ {
1035
+ "epoch": 0.0575,
1036
+ "grad_norm": 0.023728744427970416,
1037
+ "learning_rate": 5.748e-06,
1038
+ "loss": 2.4286,
1039
+ "step": 2875
1040
+ },
1041
+ {
1042
+ "epoch": 0.058,
1043
+ "grad_norm": 0.025539915034515293,
1044
+ "learning_rate": 5.798e-06,
1045
+ "loss": 2.4287,
1046
+ "step": 2900
1047
+ },
1048
+ {
1049
+ "epoch": 0.058,
1050
+ "eval_loss": 2.4376914501190186,
1051
+ "eval_runtime": 33.3179,
1052
+ "eval_samples_per_second": 3.512,
1053
+ "eval_steps_per_second": 1.771,
1054
+ "step": 2900
1055
+ },
1056
+ {
1057
+ "epoch": 0.0585,
1058
+ "grad_norm": 0.023457547558388747,
1059
+ "learning_rate": 5.848000000000001e-06,
1060
+ "loss": 2.4289,
1061
+ "step": 2925
1062
+ },
1063
+ {
1064
+ "epoch": 0.059,
1065
+ "grad_norm": 0.025297710201421797,
1066
+ "learning_rate": 5.898e-06,
1067
+ "loss": 2.4274,
1068
+ "step": 2950
1069
+ },
1070
+ {
1071
+ "epoch": 0.0595,
1072
+ "grad_norm": 0.024155176530161276,
1073
+ "learning_rate": 5.9480000000000005e-06,
1074
+ "loss": 2.4169,
1075
+ "step": 2975
1076
+ },
1077
+ {
1078
+ "epoch": 0.06,
1079
+ "grad_norm": 0.023954841726960448,
1080
+ "learning_rate": 5.998000000000001e-06,
1081
+ "loss": 2.4244,
1082
+ "step": 3000
1083
+ },
1084
+ {
1085
+ "epoch": 0.06,
1086
+ "eval_loss": 2.436969041824341,
1087
+ "eval_runtime": 33.2713,
1088
+ "eval_samples_per_second": 3.517,
1089
+ "eval_steps_per_second": 1.773,
1090
+ "step": 3000
1091
  }
1092
  ],
1093
  "logging_steps": 25,
 
1107
  "attributes": {}
1108
  }
1109
  },
1110
+ "total_flos": 8.355905775410872e+18,
1111
  "train_batch_size": 1,
1112
  "trial_name": null,
1113
  "trial_params": null