Training in progress, step 7000, checkpoint
last-checkpoint/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:85553c2cc4b71cc764d219a255a3d7c329d548c46a05c1b60f352b7a9a28b2a1
 size 2384234968
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:8cef3423760a08b83e2c1f1529056dce5e88b5150c5b965e4bf1c35daa74b70f
 size 4768663315
last-checkpoint/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2534e434cd5abbb8f7668d3eab0549db0ef95d6a797a3efa86b712e8e32266a7
 size 14645
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:cc1bf8ba09c7a33e82766bf9f5af704c56a2c04ffb9328ada50fa2f824e9badd
 size 1465
last-checkpoint/trainer_state.json
CHANGED
@@ -2,9 +2,9 @@
 "best_global_step": null,
 "best_metric": null,
 "best_model_checkpoint": null,
-"epoch":
+"epoch": 2.107823206083879,
 "eval_steps": 100,
-"global_step":
+"global_step": 7000,
 "is_hyper_param_search": false,
 "is_local_process_zero": true,
 "is_world_process_zero": true,
@@ -5858,6 +5858,456 @@
 "mean_token_accuracy": 0.7979574371129274,
 "num_tokens": 53241856.0,
 "step": 6500
+},
+{
+"epoch": 1.9604698441382427,
+"grad_norm": 1.5861074924468994,
+"learning_rate": 3.852331028329244e-06,
+"loss": 0.1401,
+"mean_token_accuracy": 0.7667318984866143,
+"num_tokens": 53323776.0,
+"step": 6510
+},
+{
+"epoch": 1.9634816655372336,
+"grad_norm": 1.195090889930725,
+"learning_rate": 3.841177782734776e-06,
+"loss": 0.1359,
+"mean_token_accuracy": 0.7738136008381844,
+"num_tokens": 53405696.0,
+"step": 6520
+},
+{
+"epoch": 1.9664934869362247,
+"grad_norm": 1.410537600517273,
+"learning_rate": 3.830024537140309e-06,
+"loss": 0.1116,
+"mean_token_accuracy": 0.7794153623282909,
+"num_tokens": 53487616.0,
+"step": 6530
+},
+{
+"epoch": 1.9695053083352159,
+"grad_norm": 1.2453457117080688,
+"learning_rate": 3.81887129154584e-06,
+"loss": 0.1626,
+"mean_token_accuracy": 0.7636007871478796,
+"num_tokens": 53569536.0,
+"step": 6540
+},
+{
+"epoch": 1.9725171297342068,
+"grad_norm": 1.5458024740219116,
+"learning_rate": 3.8077180459513723e-06,
+"loss": 0.1225,
+"mean_token_accuracy": 0.7851883560419083,
+"num_tokens": 53651456.0,
+"step": 6550
+},
+{
+"epoch": 1.9755289511331977,
+"grad_norm": 1.335051417350769,
+"learning_rate": 3.7965648003569045e-06,
+"loss": 0.1244,
+"mean_token_accuracy": 0.7711105648428201,
+"num_tokens": 53733376.0,
+"step": 6560
+},
+{
+"epoch": 1.9785407725321889,
+"grad_norm": 1.1321961879730225,
+"learning_rate": 3.785411554762436e-06,
+"loss": 0.1145,
+"mean_token_accuracy": 0.7770547956228256,
+"num_tokens": 53815296.0,
+"step": 6570
+},
+{
+"epoch": 1.98155259393118,
+"grad_norm": 1.4666228294372559,
+"learning_rate": 3.7742583091679678e-06,
+"loss": 0.1128,
+"mean_token_accuracy": 0.8008316993713379,
+"num_tokens": 53897216.0,
+"step": 6580
+},
+{
+"epoch": 1.984564415330171,
+"grad_norm": 1.1132220029830933,
+"learning_rate": 3.7631050635735e-06,
+"loss": 0.1223,
+"mean_token_accuracy": 0.7956457916647196,
+"num_tokens": 53979136.0,
+"step": 6590
+},
+{
+"epoch": 1.9875762367291618,
+"grad_norm": 1.015281319618225,
+"learning_rate": 3.751951817979032e-06,
+"loss": 0.1115,
+"mean_token_accuracy": 0.7839285705238581,
+"num_tokens": 54061056.0,
+"step": 6600
+},
+{
+"epoch": 1.990588058128153,
+"grad_norm": 1.3019957542419434,
+"learning_rate": 3.740798572384564e-06,
+"loss": 0.1132,
+"mean_token_accuracy": 0.7918786682188511,
+"num_tokens": 54142976.0,
+"step": 6610
+},
+{
+"epoch": 1.9935998795271441,
+"grad_norm": 1.3737001419067383,
+"learning_rate": 3.729645326790096e-06,
+"loss": 0.12,
+"mean_token_accuracy": 0.7895425636321306,
+"num_tokens": 54224896.0,
+"step": 6620
+},
+{
+"epoch": 1.996611700926135,
+"grad_norm": 1.220357060432434,
+"learning_rate": 3.7184920811956282e-06,
+"loss": 0.1267,
+"mean_token_accuracy": 0.7734589025378227,
+"num_tokens": 54306816.0,
+"step": 6630
+},
+{
+"epoch": 1.999623522325126,
+"grad_norm": 0.9205222725868225,
+"learning_rate": 3.70733883560116e-06,
+"loss": 0.1376,
+"mean_token_accuracy": 0.77977005392313,
+"num_tokens": 54388736.0,
+"step": 6640
+},
+{
+"epoch": 2.002409457119193,
+"grad_norm": 1.058834433555603,
+"learning_rate": 3.6961855900066923e-06,
+"loss": 0.1066,
+"mean_token_accuracy": 0.7967816152282663,
+"num_tokens": 54464512.0,
+"step": 6650
+},
+{
+"epoch": 2.0054212785181837,
+"grad_norm": 1.4777971506118774,
+"learning_rate": 3.685032344412224e-06,
+"loss": 0.1153,
+"mean_token_accuracy": 0.783109100162983,
+"num_tokens": 54546432.0,
+"step": 6660
+},
+{
+"epoch": 2.008433099917175,
+"grad_norm": 1.3833023309707642,
+"learning_rate": 3.6738790988177564e-06,
+"loss": 0.1312,
+"mean_token_accuracy": 0.7731409035623074,
+"num_tokens": 54628352.0,
+"step": 6670
+},
+{
+"epoch": 2.011444921316166,
+"grad_norm": 1.062574028968811,
+"learning_rate": 3.6627258532232887e-06,
+"loss": 0.0978,
+"mean_token_accuracy": 0.7889799430966378,
+"num_tokens": 54710272.0,
+"step": 6680
+},
+{
+"epoch": 2.014456742715157,
+"grad_norm": 1.269668459892273,
+"learning_rate": 3.6515726076288205e-06,
+"loss": 0.1001,
+"mean_token_accuracy": 0.7908879652619362,
+"num_tokens": 54792192.0,
+"step": 6690
+},
+{
+"epoch": 2.017468564114148,
+"grad_norm": 1.7478396892547607,
+"learning_rate": 3.6404193620343527e-06,
+"loss": 0.1288,
+"mean_token_accuracy": 0.7696673195809126,
+"num_tokens": 54874112.0,
+"step": 6700
+},
+{
+"epoch": 2.0204803855131392,
+"grad_norm": 1.484840989112854,
+"learning_rate": 3.6292661164398846e-06,
+"loss": 0.1461,
+"mean_token_accuracy": 0.779011744260788,
+"num_tokens": 54956032.0,
+"step": 6710
+},
+{
+"epoch": 2.02349220691213,
+"grad_norm": 1.2291215658187866,
+"learning_rate": 3.618112870845416e-06,
+"loss": 0.1269,
+"mean_token_accuracy": 0.7728228956460953,
+"num_tokens": 55037952.0,
+"step": 6720
+},
+{
+"epoch": 2.026504028311121,
+"grad_norm": 1.2073824405670166,
+"learning_rate": 3.6069596252509482e-06,
+"loss": 0.1097,
+"mean_token_accuracy": 0.7927470624446868,
+"num_tokens": 55119872.0,
+"step": 6730
+},
+{
+"epoch": 2.029515849710112,
+"grad_norm": 1.3367125988006592,
+"learning_rate": 3.59580637965648e-06,
+"loss": 0.0825,
+"mean_token_accuracy": 0.8145425617694855,
+"num_tokens": 55201792.0,
+"step": 6740
+},
+{
+"epoch": 2.0325276711091034,
+"grad_norm": 0.9058095812797546,
+"learning_rate": 3.5846531340620123e-06,
+"loss": 0.1062,
+"mean_token_accuracy": 0.8042319010943174,
+"num_tokens": 55283712.0,
+"step": 6750
+},
+{
+"epoch": 2.0355394925080943,
+"grad_norm": 1.2049607038497925,
+"learning_rate": 3.573499888467544e-06,
+"loss": 0.1278,
+"mean_token_accuracy": 0.7739603724330664,
+"num_tokens": 55365632.0,
+"step": 6760
+},
+{
+"epoch": 2.038551313907085,
+"grad_norm": 1.4414746761322021,
+"learning_rate": 3.5623466428730764e-06,
+"loss": 0.0992,
+"mean_token_accuracy": 0.8063600823283196,
+"num_tokens": 55447552.0,
+"step": 6770
+},
+{
+"epoch": 2.041563135306076,
+"grad_norm": 1.0376569032669067,
+"learning_rate": 3.5511933972786083e-06,
+"loss": 0.1134,
+"mean_token_accuracy": 0.7815435409545899,
+"num_tokens": 55529472.0,
+"step": 6780
+},
+{
+"epoch": 2.0445749567050675,
+"grad_norm": 1.3576596975326538,
+"learning_rate": 3.5400401516841405e-06,
+"loss": 0.1019,
+"mean_token_accuracy": 0.7937255371361971,
+"num_tokens": 55611392.0,
+"step": 6790
+},
+{
+"epoch": 2.0475867781040584,
+"grad_norm": 0.9655880331993103,
+"learning_rate": 3.5288869060896724e-06,
+"loss": 0.1065,
+"mean_token_accuracy": 0.7986423678696155,
+"num_tokens": 55693312.0,
+"step": 6800
+},
+{
+"epoch": 2.0505985995030493,
+"grad_norm": 1.2648464441299438,
+"learning_rate": 3.5177336604952046e-06,
+"loss": 0.1086,
+"mean_token_accuracy": 0.795303326100111,
+"num_tokens": 55775232.0,
+"step": 6810
+},
+{
+"epoch": 2.0536104209020407,
+"grad_norm": 1.6027874946594238,
+"learning_rate": 3.5065804149007364e-06,
+"loss": 0.0982,
+"mean_token_accuracy": 0.7956213317811489,
+"num_tokens": 55857152.0,
+"step": 6820
+},
+{
+"epoch": 2.0566222423010316,
+"grad_norm": 1.4525415897369385,
+"learning_rate": 3.4954271693062687e-06,
+"loss": 0.1175,
+"mean_token_accuracy": 0.7873654570430517,
+"num_tokens": 55939072.0,
+"step": 6830
+},
+{
+"epoch": 2.0596340637000226,
+"grad_norm": 1.5248804092407227,
+"learning_rate": 3.4842739237118005e-06,
+"loss": 0.0992,
+"mean_token_accuracy": 0.7903008766472339,
+"num_tokens": 56020992.0,
+"step": 6840
+},
+{
+"epoch": 2.0626458850990135,
+"grad_norm": 1.1746339797973633,
+"learning_rate": 3.473120678117333e-06,
+"loss": 0.1205,
+"mean_token_accuracy": 0.7796355158090591,
+"num_tokens": 56102912.0,
+"step": 6850
+},
+{
+"epoch": 2.065657706498005,
+"grad_norm": 1.181340217590332,
+"learning_rate": 3.4619674325228646e-06,
+"loss": 0.1235,
+"mean_token_accuracy": 0.7802837561815977,
+"num_tokens": 56184832.0,
+"step": 6860
+},
+{
+"epoch": 2.0686695278969958,
+"grad_norm": 1.4108185768127441,
+"learning_rate": 3.450814186928396e-06,
+"loss": 0.1011,
+"mean_token_accuracy": 0.8037915851920843,
+"num_tokens": 56266752.0,
+"step": 6870
+},
+{
+"epoch": 2.0716813492959867,
+"grad_norm": 1.146896481513977,
+"learning_rate": 3.4396609413339283e-06,
+"loss": 0.1233,
+"mean_token_accuracy": 0.7929427601397038,
+"num_tokens": 56348672.0,
+"step": 6880
+},
+{
+"epoch": 2.0746931706949776,
+"grad_norm": 1.2894806861877441,
+"learning_rate": 3.42850769573946e-06,
+"loss": 0.1127,
+"mean_token_accuracy": 0.7803816046565771,
+"num_tokens": 56430592.0,
+"step": 6890
+},
+{
+"epoch": 2.077704992093969,
+"grad_norm": 0.9775878190994263,
+"learning_rate": 3.4173544501449924e-06,
+"loss": 0.1012,
+"mean_token_accuracy": 0.7998899217694998,
+"num_tokens": 56512512.0,
+"step": 6900
+},
+{
+"epoch": 2.08071681349296,
+"grad_norm": 1.141923427581787,
+"learning_rate": 3.4062012045505242e-06,
+"loss": 0.1032,
+"mean_token_accuracy": 0.7836839504539966,
+"num_tokens": 56594432.0,
+"step": 6910
+},
+{
+"epoch": 2.083728634891951,
+"grad_norm": 1.037724494934082,
+"learning_rate": 3.3950479589560565e-06,
+"loss": 0.1049,
+"mean_token_accuracy": 0.8032534249126911,
+"num_tokens": 56676352.0,
+"step": 6920
+},
+{
+"epoch": 2.0867404562909417,
+"grad_norm": 1.3930587768554688,
+"learning_rate": 3.3838947133615883e-06,
+"loss": 0.1065,
+"mean_token_accuracy": 0.786497063934803,
+"num_tokens": 56758272.0,
+"step": 6930
+},
+{
+"epoch": 2.089752277689933,
+"grad_norm": 0.9995868802070618,
+"learning_rate": 3.3727414677671206e-06,
+"loss": 0.1105,
+"mean_token_accuracy": 0.7776051837950945,
+"num_tokens": 56840192.0,
+"step": 6940
+},
+{
+"epoch": 2.092764099088924,
+"grad_norm": 1.704577088356018,
+"learning_rate": 3.3615882221726524e-06,
+"loss": 0.1174,
+"mean_token_accuracy": 0.7858610555529595,
+"num_tokens": 56922112.0,
+"step": 6950
+},
+{
+"epoch": 2.095775920487915,
+"grad_norm": 1.1011236906051636,
+"learning_rate": 3.3504349765781847e-06,
+"loss": 0.1084,
+"mean_token_accuracy": 0.7746819939464331,
+"num_tokens": 57004032.0,
+"step": 6960
+},
+{
+"epoch": 2.0987877418869063,
+"grad_norm": 0.932067334651947,
+"learning_rate": 3.3392817309837165e-06,
+"loss": 0.1242,
+"mean_token_accuracy": 0.7733732841908931,
+"num_tokens": 57085952.0,
+"step": 6970
+},
+{
+"epoch": 2.1017995632858972,
+"grad_norm": 0.9481123685836792,
+"learning_rate": 3.3281284853892487e-06,
+"loss": 0.1079,
+"mean_token_accuracy": 0.7982142839580775,
+"num_tokens": 57167872.0,
+"step": 6980
+},
+{
+"epoch": 2.104811384684888,
+"grad_norm": 1.3651145696640015,
+"learning_rate": 3.3169752397947806e-06,
+"loss": 0.1265,
+"mean_token_accuracy": 0.7904231909662485,
+"num_tokens": 57249792.0,
+"step": 6990
+},
+{
+"epoch": 2.107823206083879,
+"grad_norm": 1.0314269065856934,
+"learning_rate": 3.305821994200313e-06,
+"loss": 0.1519,
+"mean_token_accuracy": 0.7659491188824177,
+"num_tokens": 57331712.0,
+"step": 7000
 }
 ],
 "logging_steps": 10,
@@ -5877,7 +6327,7 @@
 "attributes": {}
 }
 },
-"total_flos": 1.
+"total_flos": 1.5151651999658803e+17,
 "train_batch_size": 2,
 "trial_name": null,
 "trial_params": null