Training in progress, step 2500, checkpoint
Browse files- last-checkpoint/adapter_model.safetensors +1 -1
- last-checkpoint/global_step2500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step2500/mp_rank_00_model_states.pt +3 -0
- last-checkpoint/latest +1 -1
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/trainer_state.json +206 -6
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 12017472
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e52bad05f2e4c26960ed218d0c8eb65c9304d2d27be834a92d821653ed150b67
|
| 3 |
size 12017472
|
last-checkpoint/global_step2500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6003a02a1a66ef1745b42ac42443bddc34528a9fef8726db27fedbc72adb1572
|
| 3 |
+
size 71982309
|
last-checkpoint/global_step2500/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2a5fc9fcbc092435c4e9b986a9ec53b21b22bc833e0c004c3f820523d58970b6
|
| 3 |
+
size 146356645
|
last-checkpoint/latest
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
|
|
|
|
| 1 |
+
global_step2500
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14709
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:af223d92fe6846f9d1e5ce7aaf1ae97c0e4e19a087e2147be916f38012f3d229
|
| 3 |
size 14709
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 0.
|
| 4 |
-
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 250,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -817,6 +817,206 @@
|
|
| 817 |
"eval_samples_per_second": 43.382,
|
| 818 |
"eval_steps_per_second": 5.429,
|
| 819 |
"step": 2000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 820 |
}
|
| 821 |
],
|
| 822 |
"logging_steps": 25,
|
|
@@ -836,7 +1036,7 @@
|
|
| 836 |
"attributes": {}
|
| 837 |
}
|
| 838 |
},
|
| 839 |
-
"total_flos": 1.
|
| 840 |
"train_batch_size": 4,
|
| 841 |
"trial_name": null,
|
| 842 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 2500,
|
| 3 |
+
"best_metric": 0.6409846544265747,
|
| 4 |
+
"best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-2500",
|
| 5 |
+
"epoch": 1.8173059443737503,
|
| 6 |
"eval_steps": 250,
|
| 7 |
+
"global_step": 2500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 817 |
"eval_samples_per_second": 43.382,
|
| 818 |
"eval_steps_per_second": 5.429,
|
| 819 |
"step": 2000
|
| 820 |
+
},
|
| 821 |
+
{
|
| 822 |
+
"epoch": 1.4719141974186511,
|
| 823 |
+
"grad_norm": 1.093906283378601,
|
| 824 |
+
"learning_rate": 7.731497365851944e-05,
|
| 825 |
+
"loss": 0.66,
|
| 826 |
+
"mean_token_accuracy": 0.7957050919532775,
|
| 827 |
+
"num_tokens": 44603467.0,
|
| 828 |
+
"step": 2025
|
| 829 |
+
},
|
| 830 |
+
{
|
| 831 |
+
"epoch": 1.490092710416288,
|
| 832 |
+
"grad_norm": 0.8411886692047119,
|
| 833 |
+
"learning_rate": 7.724559661591966e-05,
|
| 834 |
+
"loss": 0.6492,
|
| 835 |
+
"mean_token_accuracy": 0.799662963449955,
|
| 836 |
+
"num_tokens": 45144337.0,
|
| 837 |
+
"step": 2050
|
| 838 |
+
},
|
| 839 |
+
{
|
| 840 |
+
"epoch": 1.5082712234139248,
|
| 841 |
+
"grad_norm": 0.9079028964042664,
|
| 842 |
+
"learning_rate": 7.717536660902353e-05,
|
| 843 |
+
"loss": 0.6535,
|
| 844 |
+
"mean_token_accuracy": 0.7987899404764175,
|
| 845 |
+
"num_tokens": 45708073.0,
|
| 846 |
+
"step": 2075
|
| 847 |
+
},
|
| 848 |
+
{
|
| 849 |
+
"epoch": 1.5264497364115615,
|
| 850 |
+
"grad_norm": 0.9115111827850342,
|
| 851 |
+
"learning_rate": 7.710428524617389e-05,
|
| 852 |
+
"loss": 0.6516,
|
| 853 |
+
"mean_token_accuracy": 0.7993895325064659,
|
| 854 |
+
"num_tokens": 46249985.0,
|
| 855 |
+
"step": 2100
|
| 856 |
+
},
|
| 857 |
+
{
|
| 858 |
+
"epoch": 1.5446282494091983,
|
| 859 |
+
"grad_norm": 0.8034014105796814,
|
| 860 |
+
"learning_rate": 7.703235415521057e-05,
|
| 861 |
+
"loss": 0.6553,
|
| 862 |
+
"mean_token_accuracy": 0.7976609247922898,
|
| 863 |
+
"num_tokens": 46795146.0,
|
| 864 |
+
"step": 2125
|
| 865 |
+
},
|
| 866 |
+
{
|
| 867 |
+
"epoch": 1.5628067624068351,
|
| 868 |
+
"grad_norm": 1.0506081581115723,
|
| 869 |
+
"learning_rate": 7.695957498343304e-05,
|
| 870 |
+
"loss": 0.6542,
|
| 871 |
+
"mean_token_accuracy": 0.7982049816846848,
|
| 872 |
+
"num_tokens": 47345330.0,
|
| 873 |
+
"step": 2150
|
| 874 |
+
},
|
| 875 |
+
{
|
| 876 |
+
"epoch": 1.5809852754044718,
|
| 877 |
+
"grad_norm": 0.9649513959884644,
|
| 878 |
+
"learning_rate": 7.688594939756276e-05,
|
| 879 |
+
"loss": 0.6548,
|
| 880 |
+
"mean_token_accuracy": 0.7982990917563438,
|
| 881 |
+
"num_tokens": 47896343.0,
|
| 882 |
+
"step": 2175
|
| 883 |
+
},
|
| 884 |
+
{
|
| 885 |
+
"epoch": 1.5991637884021088,
|
| 886 |
+
"grad_norm": 0.8364529609680176,
|
| 887 |
+
"learning_rate": 7.681147908370497e-05,
|
| 888 |
+
"loss": 0.6476,
|
| 889 |
+
"mean_token_accuracy": 0.8009107887744904,
|
| 890 |
+
"num_tokens": 48443987.0,
|
| 891 |
+
"step": 2200
|
| 892 |
+
},
|
| 893 |
+
{
|
| 894 |
+
"epoch": 1.6173423013997454,
|
| 895 |
+
"grad_norm": 0.8900915384292603,
|
| 896 |
+
"learning_rate": 7.673616574731013e-05,
|
| 897 |
+
"loss": 0.6664,
|
| 898 |
+
"mean_token_accuracy": 0.796454921066761,
|
| 899 |
+
"num_tokens": 48993810.0,
|
| 900 |
+
"step": 2225
|
| 901 |
+
},
|
| 902 |
+
{
|
| 903 |
+
"epoch": 1.6355208143973823,
|
| 904 |
+
"grad_norm": 0.8416359424591064,
|
| 905 |
+
"learning_rate": 7.666001111313477e-05,
|
| 906 |
+
"loss": 0.656,
|
| 907 |
+
"mean_token_accuracy": 0.7976474016904831,
|
| 908 |
+
"num_tokens": 49564541.0,
|
| 909 |
+
"step": 2250
|
| 910 |
+
},
|
| 911 |
+
{
|
| 912 |
+
"epoch": 1.6355208143973823,
|
| 913 |
+
"eval_loss": 0.648926854133606,
|
| 914 |
+
"eval_mean_token_accuracy": 0.7985351690474678,
|
| 915 |
+
"eval_num_tokens": 49564541.0,
|
| 916 |
+
"eval_runtime": 111.8774,
|
| 917 |
+
"eval_samples_per_second": 43.709,
|
| 918 |
+
"eval_steps_per_second": 5.47,
|
| 919 |
+
"step": 2250
|
| 920 |
+
},
|
| 921 |
+
{
|
| 922 |
+
"epoch": 1.6536993273950191,
|
| 923 |
+
"grad_norm": 0.836439311504364,
|
| 924 |
+
"learning_rate": 7.658301692520209e-05,
|
| 925 |
+
"loss": 0.642,
|
| 926 |
+
"mean_token_accuracy": 0.8027165573835373,
|
| 927 |
+
"num_tokens": 50098122.0,
|
| 928 |
+
"step": 2275
|
| 929 |
+
},
|
| 930 |
+
{
|
| 931 |
+
"epoch": 1.6718778403926557,
|
| 932 |
+
"grad_norm": 0.8868879079818726,
|
| 933 |
+
"learning_rate": 7.650518494676194e-05,
|
| 934 |
+
"loss": 0.6537,
|
| 935 |
+
"mean_token_accuracy": 0.7993291038274765,
|
| 936 |
+
"num_tokens": 50648590.0,
|
| 937 |
+
"step": 2300
|
| 938 |
+
},
|
| 939 |
+
{
|
| 940 |
+
"epoch": 1.6900563533902928,
|
| 941 |
+
"grad_norm": 0.8488360047340393,
|
| 942 |
+
"learning_rate": 7.642651696025052e-05,
|
| 943 |
+
"loss": 0.6403,
|
| 944 |
+
"mean_token_accuracy": 0.8029101991653442,
|
| 945 |
+
"num_tokens": 51215679.0,
|
| 946 |
+
"step": 2325
|
| 947 |
+
},
|
| 948 |
+
{
|
| 949 |
+
"epoch": 1.7082348663879294,
|
| 950 |
+
"grad_norm": 0.8410452604293823,
|
| 951 |
+
"learning_rate": 7.634701476724948e-05,
|
| 952 |
+
"loss": 0.6528,
|
| 953 |
+
"mean_token_accuracy": 0.798929190337658,
|
| 954 |
+
"num_tokens": 51783858.0,
|
| 955 |
+
"step": 2350
|
| 956 |
+
},
|
| 957 |
+
{
|
| 958 |
+
"epoch": 1.7264133793855663,
|
| 959 |
+
"grad_norm": 0.8173678517341614,
|
| 960 |
+
"learning_rate": 7.626668018844469e-05,
|
| 961 |
+
"loss": 0.6545,
|
| 962 |
+
"mean_token_accuracy": 0.7984850916266442,
|
| 963 |
+
"num_tokens": 52329463.0,
|
| 964 |
+
"step": 2375
|
| 965 |
+
},
|
| 966 |
+
{
|
| 967 |
+
"epoch": 1.7445918923832031,
|
| 968 |
+
"grad_norm": 0.8305994868278503,
|
| 969 |
+
"learning_rate": 7.618551506358459e-05,
|
| 970 |
+
"loss": 0.6444,
|
| 971 |
+
"mean_token_accuracy": 0.8014543145895004,
|
| 972 |
+
"num_tokens": 52868102.0,
|
| 973 |
+
"step": 2400
|
| 974 |
+
},
|
| 975 |
+
{
|
| 976 |
+
"epoch": 1.7627704053808397,
|
| 977 |
+
"grad_norm": 0.8392990231513977,
|
| 978 |
+
"learning_rate": 7.610352125143798e-05,
|
| 979 |
+
"loss": 0.6407,
|
| 980 |
+
"mean_token_accuracy": 0.8039175960421562,
|
| 981 |
+
"num_tokens": 53412329.0,
|
| 982 |
+
"step": 2425
|
| 983 |
+
},
|
| 984 |
+
{
|
| 985 |
+
"epoch": 1.7809489183784768,
|
| 986 |
+
"grad_norm": 0.8528268337249756,
|
| 987 |
+
"learning_rate": 7.602070062975153e-05,
|
| 988 |
+
"loss": 0.6418,
|
| 989 |
+
"mean_token_accuracy": 0.802329548895359,
|
| 990 |
+
"num_tokens": 53960577.0,
|
| 991 |
+
"step": 2450
|
| 992 |
+
},
|
| 993 |
+
{
|
| 994 |
+
"epoch": 1.7991274313761134,
|
| 995 |
+
"grad_norm": 0.8892678022384644,
|
| 996 |
+
"learning_rate": 7.593705509520669e-05,
|
| 997 |
+
"loss": 0.6442,
|
| 998 |
+
"mean_token_accuracy": 0.801820527613163,
|
| 999 |
+
"num_tokens": 54508868.0,
|
| 1000 |
+
"step": 2475
|
| 1001 |
+
},
|
| 1002 |
+
{
|
| 1003 |
+
"epoch": 1.8173059443737503,
|
| 1004 |
+
"grad_norm": 0.858299195766449,
|
| 1005 |
+
"learning_rate": 7.585258656337637e-05,
|
| 1006 |
+
"loss": 0.6464,
|
| 1007 |
+
"mean_token_accuracy": 0.8014724615216255,
|
| 1008 |
+
"num_tokens": 55070505.0,
|
| 1009 |
+
"step": 2500
|
| 1010 |
+
},
|
| 1011 |
+
{
|
| 1012 |
+
"epoch": 1.8173059443737503,
|
| 1013 |
+
"eval_loss": 0.6409846544265747,
|
| 1014 |
+
"eval_mean_token_accuracy": 0.8009895614159652,
|
| 1015 |
+
"eval_num_tokens": 55070505.0,
|
| 1016 |
+
"eval_runtime": 112.4439,
|
| 1017 |
+
"eval_samples_per_second": 43.488,
|
| 1018 |
+
"eval_steps_per_second": 5.443,
|
| 1019 |
+
"step": 2500
|
| 1020 |
}
|
| 1021 |
],
|
| 1022 |
"logging_steps": 25,
|
|
|
|
| 1036 |
"attributes": {}
|
| 1037 |
}
|
| 1038 |
},
|
| 1039 |
+
"total_flos": 1.3887661057612186e+17,
|
| 1040 |
"train_batch_size": 4,
|
| 1041 |
"trial_name": null,
|
| 1042 |
"trial_params": null
|