aghatage committed on
Commit
db76d1c
·
verified ·
1 Parent(s): ee8ec19

Training in progress, step 2500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b160330a699e6391aabdd6c326d1ca2154af597460c4109b821f3a27a3de51f
3
  size 12017472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e52bad05f2e4c26960ed218d0c8eb65c9304d2d27be834a92d821653ed150b67
3
  size 12017472
last-checkpoint/global_step2500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6003a02a1a66ef1745b42ac42443bddc34528a9fef8726db27fedbc72adb1572
3
+ size 71982309
last-checkpoint/global_step2500/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a5fc9fcbc092435c4e9b986a9ec53b21b22bc833e0c004c3f820523d58970b6
3
+ size 146356645
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step2000
 
1
+ global_step2500
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d92ac44cc5eabc6a5deb9b9de409e8c10d46ff0d44b4e3a5b61bcb9e4a0349fe
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af223d92fe6846f9d1e5ce7aaf1ae97c0e4e19a087e2147be916f38012f3d229
3
  size 14709
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 2000,
3
- "best_metric": 0.6596384644508362,
4
- "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-2000",
5
- "epoch": 1.4537356844210143,
6
  "eval_steps": 250,
7
- "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -817,6 +817,206 @@
817
  "eval_samples_per_second": 43.382,
818
  "eval_steps_per_second": 5.429,
819
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
820
  }
821
  ],
822
  "logging_steps": 25,
@@ -836,7 +1036,7 @@
836
  "attributes": {}
837
  }
838
  },
839
- "total_flos": 1.111102656169902e+17,
840
  "train_batch_size": 4,
841
  "trial_name": null,
842
  "trial_params": null
 
1
  {
2
+ "best_global_step": 2500,
3
+ "best_metric": 0.6409846544265747,
4
+ "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-2500",
5
+ "epoch": 1.8173059443737503,
6
  "eval_steps": 250,
7
+ "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
817
  "eval_samples_per_second": 43.382,
818
  "eval_steps_per_second": 5.429,
819
  "step": 2000
820
+ },
821
+ {
822
+ "epoch": 1.4719141974186511,
823
+ "grad_norm": 1.093906283378601,
824
+ "learning_rate": 7.731497365851944e-05,
825
+ "loss": 0.66,
826
+ "mean_token_accuracy": 0.7957050919532775,
827
+ "num_tokens": 44603467.0,
828
+ "step": 2025
829
+ },
830
+ {
831
+ "epoch": 1.490092710416288,
832
+ "grad_norm": 0.8411886692047119,
833
+ "learning_rate": 7.724559661591966e-05,
834
+ "loss": 0.6492,
835
+ "mean_token_accuracy": 0.799662963449955,
836
+ "num_tokens": 45144337.0,
837
+ "step": 2050
838
+ },
839
+ {
840
+ "epoch": 1.5082712234139248,
841
+ "grad_norm": 0.9079028964042664,
842
+ "learning_rate": 7.717536660902353e-05,
843
+ "loss": 0.6535,
844
+ "mean_token_accuracy": 0.7987899404764175,
845
+ "num_tokens": 45708073.0,
846
+ "step": 2075
847
+ },
848
+ {
849
+ "epoch": 1.5264497364115615,
850
+ "grad_norm": 0.9115111827850342,
851
+ "learning_rate": 7.710428524617389e-05,
852
+ "loss": 0.6516,
853
+ "mean_token_accuracy": 0.7993895325064659,
854
+ "num_tokens": 46249985.0,
855
+ "step": 2100
856
+ },
857
+ {
858
+ "epoch": 1.5446282494091983,
859
+ "grad_norm": 0.8034014105796814,
860
+ "learning_rate": 7.703235415521057e-05,
861
+ "loss": 0.6553,
862
+ "mean_token_accuracy": 0.7976609247922898,
863
+ "num_tokens": 46795146.0,
864
+ "step": 2125
865
+ },
866
+ {
867
+ "epoch": 1.5628067624068351,
868
+ "grad_norm": 1.0506081581115723,
869
+ "learning_rate": 7.695957498343304e-05,
870
+ "loss": 0.6542,
871
+ "mean_token_accuracy": 0.7982049816846848,
872
+ "num_tokens": 47345330.0,
873
+ "step": 2150
874
+ },
875
+ {
876
+ "epoch": 1.5809852754044718,
877
+ "grad_norm": 0.9649513959884644,
878
+ "learning_rate": 7.688594939756276e-05,
879
+ "loss": 0.6548,
880
+ "mean_token_accuracy": 0.7982990917563438,
881
+ "num_tokens": 47896343.0,
882
+ "step": 2175
883
+ },
884
+ {
885
+ "epoch": 1.5991637884021088,
886
+ "grad_norm": 0.8364529609680176,
887
+ "learning_rate": 7.681147908370497e-05,
888
+ "loss": 0.6476,
889
+ "mean_token_accuracy": 0.8009107887744904,
890
+ "num_tokens": 48443987.0,
891
+ "step": 2200
892
+ },
893
+ {
894
+ "epoch": 1.6173423013997454,
895
+ "grad_norm": 0.8900915384292603,
896
+ "learning_rate": 7.673616574731013e-05,
897
+ "loss": 0.6664,
898
+ "mean_token_accuracy": 0.796454921066761,
899
+ "num_tokens": 48993810.0,
900
+ "step": 2225
901
+ },
902
+ {
903
+ "epoch": 1.6355208143973823,
904
+ "grad_norm": 0.8416359424591064,
905
+ "learning_rate": 7.666001111313477e-05,
906
+ "loss": 0.656,
907
+ "mean_token_accuracy": 0.7976474016904831,
908
+ "num_tokens": 49564541.0,
909
+ "step": 2250
910
+ },
911
+ {
912
+ "epoch": 1.6355208143973823,
913
+ "eval_loss": 0.648926854133606,
914
+ "eval_mean_token_accuracy": 0.7985351690474678,
915
+ "eval_num_tokens": 49564541.0,
916
+ "eval_runtime": 111.8774,
917
+ "eval_samples_per_second": 43.709,
918
+ "eval_steps_per_second": 5.47,
919
+ "step": 2250
920
+ },
921
+ {
922
+ "epoch": 1.6536993273950191,
923
+ "grad_norm": 0.836439311504364,
924
+ "learning_rate": 7.658301692520209e-05,
925
+ "loss": 0.642,
926
+ "mean_token_accuracy": 0.8027165573835373,
927
+ "num_tokens": 50098122.0,
928
+ "step": 2275
929
+ },
930
+ {
931
+ "epoch": 1.6718778403926557,
932
+ "grad_norm": 0.8868879079818726,
933
+ "learning_rate": 7.650518494676194e-05,
934
+ "loss": 0.6537,
935
+ "mean_token_accuracy": 0.7993291038274765,
936
+ "num_tokens": 50648590.0,
937
+ "step": 2300
938
+ },
939
+ {
940
+ "epoch": 1.6900563533902928,
941
+ "grad_norm": 0.8488360047340393,
942
+ "learning_rate": 7.642651696025052e-05,
943
+ "loss": 0.6403,
944
+ "mean_token_accuracy": 0.8029101991653442,
945
+ "num_tokens": 51215679.0,
946
+ "step": 2325
947
+ },
948
+ {
949
+ "epoch": 1.7082348663879294,
950
+ "grad_norm": 0.8410452604293823,
951
+ "learning_rate": 7.634701476724948e-05,
952
+ "loss": 0.6528,
953
+ "mean_token_accuracy": 0.798929190337658,
954
+ "num_tokens": 51783858.0,
955
+ "step": 2350
956
+ },
957
+ {
958
+ "epoch": 1.7264133793855663,
959
+ "grad_norm": 0.8173678517341614,
960
+ "learning_rate": 7.626668018844469e-05,
961
+ "loss": 0.6545,
962
+ "mean_token_accuracy": 0.7984850916266442,
963
+ "num_tokens": 52329463.0,
964
+ "step": 2375
965
+ },
966
+ {
967
+ "epoch": 1.7445918923832031,
968
+ "grad_norm": 0.8305994868278503,
969
+ "learning_rate": 7.618551506358459e-05,
970
+ "loss": 0.6444,
971
+ "mean_token_accuracy": 0.8014543145895004,
972
+ "num_tokens": 52868102.0,
973
+ "step": 2400
974
+ },
975
+ {
976
+ "epoch": 1.7627704053808397,
977
+ "grad_norm": 0.8392990231513977,
978
+ "learning_rate": 7.610352125143798e-05,
979
+ "loss": 0.6407,
980
+ "mean_token_accuracy": 0.8039175960421562,
981
+ "num_tokens": 53412329.0,
982
+ "step": 2425
983
+ },
984
+ {
985
+ "epoch": 1.7809489183784768,
986
+ "grad_norm": 0.8528268337249756,
987
+ "learning_rate": 7.602070062975153e-05,
988
+ "loss": 0.6418,
989
+ "mean_token_accuracy": 0.802329548895359,
990
+ "num_tokens": 53960577.0,
991
+ "step": 2450
992
+ },
993
+ {
994
+ "epoch": 1.7991274313761134,
995
+ "grad_norm": 0.8892678022384644,
996
+ "learning_rate": 7.593705509520669e-05,
997
+ "loss": 0.6442,
998
+ "mean_token_accuracy": 0.801820527613163,
999
+ "num_tokens": 54508868.0,
1000
+ "step": 2475
1001
+ },
1002
+ {
1003
+ "epoch": 1.8173059443737503,
1004
+ "grad_norm": 0.858299195766449,
1005
+ "learning_rate": 7.585258656337637e-05,
1006
+ "loss": 0.6464,
1007
+ "mean_token_accuracy": 0.8014724615216255,
1008
+ "num_tokens": 55070505.0,
1009
+ "step": 2500
1010
+ },
1011
+ {
1012
+ "epoch": 1.8173059443737503,
1013
+ "eval_loss": 0.6409846544265747,
1014
+ "eval_mean_token_accuracy": 0.8009895614159652,
1015
+ "eval_num_tokens": 55070505.0,
1016
+ "eval_runtime": 112.4439,
1017
+ "eval_samples_per_second": 43.488,
1018
+ "eval_steps_per_second": 5.443,
1019
+ "step": 2500
1020
  }
1021
  ],
1022
  "logging_steps": 25,
 
1036
  "attributes": {}
1037
  }
1038
  },
1039
+ "total_flos": 1.3887661057612186e+17,
1040
  "train_batch_size": 4,
1041
  "trial_name": null,
1042
  "trial_params": null