Training in progress, step 118000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cb226fadbf28661b9371114993dc12e49ac5975cdb3cc0b050988cda066eda63
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:619f8c200e9aaadfdae5aad82237b7f7ba5a625617b8275ba58c98a0a1cd45f8
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eadabea5b840d3b07e42e9e423397807b167316e75ece7076e65c7e1fda35503
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c77e72696edbb72e0b5c20319181466e8d1ea3a266d160a365b8e9afc9f97b0
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -20834,11 +20834,189 @@
|
|
| 20834 |
"eval_steps_per_second": 15.104,
|
| 20835 |
"num_input_tokens_seen": 61331831488,
|
| 20836 |
"step": 117000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20837 |
}
|
| 20838 |
],
|
| 20839 |
"logging_steps": 50,
|
| 20840 |
"max_steps": 140000,
|
| 20841 |
-
"num_input_tokens_seen":
|
| 20842 |
"num_train_epochs": 2,
|
| 20843 |
"save_steps": 1000,
|
| 20844 |
"stateful_callbacks": {
|
|
@@ -20853,7 +21031,7 @@
|
|
| 20853 |
"attributes": {}
|
| 20854 |
}
|
| 20855 |
},
|
| 20856 |
-
"total_flos": 1.
|
| 20857 |
"train_batch_size": 32,
|
| 20858 |
"trial_name": null,
|
| 20859 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.1257283232169049,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 118000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 20834 |
"eval_steps_per_second": 15.104,
|
| 20835 |
"num_input_tokens_seen": 61331831488,
|
| 20836 |
"step": 117000
|
| 20837 |
+
},
|
| 20838 |
+
{
|
| 20839 |
+
"epoch": 1.1166652754093278,
|
| 20840 |
+
"grad_norm": 0.13141483068466187,
|
| 20841 |
+
"learning_rate": 0.0009218630989585645,
|
| 20842 |
+
"loss": 2.0933,
|
| 20843 |
+
"num_input_tokens_seen": 61358045888,
|
| 20844 |
+
"step": 117050
|
| 20845 |
+
},
|
| 20846 |
+
{
|
| 20847 |
+
"epoch": 1.117142277925516,
|
| 20848 |
+
"grad_norm": 0.14495305716991425,
|
| 20849 |
+
"learning_rate": 0.0009203508214822651,
|
| 20850 |
+
"loss": 2.0864,
|
| 20851 |
+
"num_input_tokens_seen": 61384257568,
|
| 20852 |
+
"step": 117100
|
| 20853 |
+
},
|
| 20854 |
+
{
|
| 20855 |
+
"epoch": 1.1176192804417044,
|
| 20856 |
+
"grad_norm": 0.14642465114593506,
|
| 20857 |
+
"learning_rate": 0.0009188253147794443,
|
| 20858 |
+
"loss": 2.0918,
|
| 20859 |
+
"num_input_tokens_seen": 61410471968,
|
| 20860 |
+
"step": 117150
|
| 20861 |
+
},
|
| 20862 |
+
{
|
| 20863 |
+
"epoch": 1.1180962829578927,
|
| 20864 |
+
"grad_norm": 0.13314634561538696,
|
| 20865 |
+
"learning_rate": 0.0009172866268606513,
|
| 20866 |
+
"loss": 2.0896,
|
| 20867 |
+
"num_input_tokens_seen": 61436668768,
|
| 20868 |
+
"step": 117200
|
| 20869 |
+
},
|
| 20870 |
+
{
|
| 20871 |
+
"epoch": 1.1185732854740809,
|
| 20872 |
+
"grad_norm": 0.15387175977230072,
|
| 20873 |
+
"learning_rate": 0.0009157348061512727,
|
| 20874 |
+
"loss": 2.0771,
|
| 20875 |
+
"num_input_tokens_seen": 61462881056,
|
| 20876 |
+
"step": 117250
|
| 20877 |
+
},
|
| 20878 |
+
{
|
| 20879 |
+
"epoch": 1.119050287990269,
|
| 20880 |
+
"grad_norm": 0.13886821269989014,
|
| 20881 |
+
"learning_rate": 0.0009141699014900082,
|
| 20882 |
+
"loss": 2.0945,
|
| 20883 |
+
"num_input_tokens_seen": 61489085536,
|
| 20884 |
+
"step": 117300
|
| 20885 |
+
},
|
| 20886 |
+
{
|
| 20887 |
+
"epoch": 1.1195272905064575,
|
| 20888 |
+
"grad_norm": 0.13939301669597626,
|
| 20889 |
+
"learning_rate": 0.0009125919621273348,
|
| 20890 |
+
"loss": 2.0918,
|
| 20891 |
+
"num_input_tokens_seen": 61515286016,
|
| 20892 |
+
"step": 117350
|
| 20893 |
+
},
|
| 20894 |
+
{
|
| 20895 |
+
"epoch": 1.1200042930226457,
|
| 20896 |
+
"grad_norm": 0.1996990144252777,
|
| 20897 |
+
"learning_rate": 0.0009110010377239551,
|
| 20898 |
+
"loss": 2.0859,
|
| 20899 |
+
"num_input_tokens_seen": 61541500416,
|
| 20900 |
+
"step": 117400
|
| 20901 |
+
},
|
| 20902 |
+
{
|
| 20903 |
+
"epoch": 1.120481295538834,
|
| 20904 |
+
"grad_norm": 0.135545015335083,
|
| 20905 |
+
"learning_rate": 0.0009093971783492354,
|
| 20906 |
+
"loss": 2.089,
|
| 20907 |
+
"num_input_tokens_seen": 61567714816,
|
| 20908 |
+
"step": 117450
|
| 20909 |
+
},
|
| 20910 |
+
{
|
| 20911 |
+
"epoch": 1.1209582980550223,
|
| 20912 |
+
"grad_norm": 0.1394105702638626,
|
| 20913 |
+
"learning_rate": 0.0009077804344796301,
|
| 20914 |
+
"loss": 2.0759,
|
| 20915 |
+
"num_input_tokens_seen": 61593927520,
|
| 20916 |
+
"step": 117500
|
| 20917 |
+
},
|
| 20918 |
+
{
|
| 20919 |
+
"epoch": 1.1209582980550223,
|
| 20920 |
+
"eval_loss": 2.003880739212036,
|
| 20921 |
+
"eval_runtime": 83.0803,
|
| 20922 |
+
"eval_samples_per_second": 60.183,
|
| 20923 |
+
"eval_steps_per_second": 15.046,
|
| 20924 |
+
"num_input_tokens_seen": 61593927520,
|
| 20925 |
+
"step": 117500
|
| 20926 |
+
},
|
| 20927 |
+
{
|
| 20928 |
+
"epoch": 1.1214353005712105,
|
| 20929 |
+
"grad_norm": 0.1590648591518402,
|
| 20930 |
+
"learning_rate": 0.0009061508569970925,
|
| 20931 |
+
"loss": 2.0825,
|
| 20932 |
+
"num_input_tokens_seen": 61620139072,
|
| 20933 |
+
"step": 117550
|
| 20934 |
+
},
|
| 20935 |
+
{
|
| 20936 |
+
"epoch": 1.1219123030873988,
|
| 20937 |
+
"grad_norm": 0.13328000903129578,
|
| 20938 |
+
"learning_rate": 0.0009045084971874737,
|
| 20939 |
+
"loss": 2.0877,
|
| 20940 |
+
"num_input_tokens_seen": 61646353472,
|
| 20941 |
+
"step": 117600
|
| 20942 |
+
},
|
| 20943 |
+
{
|
| 20944 |
+
"epoch": 1.122389305603587,
|
| 20945 |
+
"grad_norm": 0.13834019005298615,
|
| 20946 |
+
"learning_rate": 0.0009028534067389086,
|
| 20947 |
+
"loss": 2.0871,
|
| 20948 |
+
"num_input_tokens_seen": 61672566336,
|
| 20949 |
+
"step": 117650
|
| 20950 |
+
},
|
| 20951 |
+
{
|
| 20952 |
+
"epoch": 1.1228663081197754,
|
| 20953 |
+
"grad_norm": 0.13156409561634064,
|
| 20954 |
+
"learning_rate": 0.000901185637740189,
|
| 20955 |
+
"loss": 2.0906,
|
| 20956 |
+
"num_input_tokens_seen": 61698777696,
|
| 20957 |
+
"step": 117700
|
| 20958 |
+
},
|
| 20959 |
+
{
|
| 20960 |
+
"epoch": 1.1233433106359636,
|
| 20961 |
+
"grad_norm": 0.1528773009777069,
|
| 20962 |
+
"learning_rate": 0.0008995052426791246,
|
| 20963 |
+
"loss": 2.0731,
|
| 20964 |
+
"num_input_tokens_seen": 61724974336,
|
| 20965 |
+
"step": 117750
|
| 20966 |
+
},
|
| 20967 |
+
{
|
| 20968 |
+
"epoch": 1.1238203131521518,
|
| 20969 |
+
"grad_norm": 0.14865480363368988,
|
| 20970 |
+
"learning_rate": 0.0008978122744408905,
|
| 20971 |
+
"loss": 2.082,
|
| 20972 |
+
"num_input_tokens_seen": 61751177792,
|
| 20973 |
+
"step": 117800
|
| 20974 |
+
},
|
| 20975 |
+
{
|
| 20976 |
+
"epoch": 1.1242973156683402,
|
| 20977 |
+
"grad_norm": 0.14318804442882538,
|
| 20978 |
+
"learning_rate": 0.0008961067863063638,
|
| 20979 |
+
"loss": 2.0891,
|
| 20980 |
+
"num_input_tokens_seen": 61777391648,
|
| 20981 |
+
"step": 117850
|
| 20982 |
+
},
|
| 20983 |
+
{
|
| 20984 |
+
"epoch": 1.1247743181845284,
|
| 20985 |
+
"grad_norm": 0.14581789076328278,
|
| 20986 |
+
"learning_rate": 0.0008943888319504456,
|
| 20987 |
+
"loss": 2.0908,
|
| 20988 |
+
"num_input_tokens_seen": 61803602176,
|
| 20989 |
+
"step": 117900
|
| 20990 |
+
},
|
| 20991 |
+
{
|
| 20992 |
+
"epoch": 1.1252513207007167,
|
| 20993 |
+
"grad_norm": 0.14142882823944092,
|
| 20994 |
+
"learning_rate": 0.0008926584654403724,
|
| 20995 |
+
"loss": 2.0791,
|
| 20996 |
+
"num_input_tokens_seen": 61829816576,
|
| 20997 |
+
"step": 117950
|
| 20998 |
+
},
|
| 20999 |
+
{
|
| 21000 |
+
"epoch": 1.1257283232169049,
|
| 21001 |
+
"grad_norm": 0.15033917129039764,
|
| 21002 |
+
"learning_rate": 0.000890915741234015,
|
| 21003 |
+
"loss": 2.0801,
|
| 21004 |
+
"num_input_tokens_seen": 61856020192,
|
| 21005 |
+
"step": 118000
|
| 21006 |
+
},
|
| 21007 |
+
{
|
| 21008 |
+
"epoch": 1.1257283232169049,
|
| 21009 |
+
"eval_loss": 2.0019845962524414,
|
| 21010 |
+
"eval_runtime": 82.7188,
|
| 21011 |
+
"eval_samples_per_second": 60.446,
|
| 21012 |
+
"eval_steps_per_second": 15.111,
|
| 21013 |
+
"num_input_tokens_seen": 61856020192,
|
| 21014 |
+
"step": 118000
|
| 21015 |
}
|
| 21016 |
],
|
| 21017 |
"logging_steps": 50,
|
| 21018 |
"max_steps": 140000,
|
| 21019 |
+
"num_input_tokens_seen": 61856020192,
|
| 21020 |
"num_train_epochs": 2,
|
| 21021 |
"save_steps": 1000,
|
| 21022 |
"stateful_callbacks": {
|
|
|
|
| 21031 |
"attributes": {}
|
| 21032 |
}
|
| 21033 |
},
|
| 21034 |
+
"total_flos": 1.0947387320175698e+20,
|
| 21035 |
"train_batch_size": 32,
|
| 21036 |
"trial_name": null,
|
| 21037 |
"trial_params": null
|