Training in progress, step 57000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:16b35a6c5a2893347ac39200ce6524a1890f21615a98cf260909a1625f36f1c5
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5b96c4f49154280d995e547e25a75aad825b4ac333aa881c2f7edaa3460a4415
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b80a94302b027aba469e721f259f7cea336e0f08145beaf0eef00eec23f3459c
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d03f04e05cd70ad1a826e9dcf44af396ac68835a057941493a30d6d09cfeca51
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9976,11 +9976,189 @@
|
|
| 9976 |
"eval_steps_per_second": 23.522,
|
| 9977 |
"num_input_tokens_seen": 14680064000,
|
| 9978 |
"step": 56000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9979 |
}
|
| 9980 |
],
|
| 9981 |
"logging_steps": 50,
|
| 9982 |
"max_steps": 60000,
|
| 9983 |
-
"num_input_tokens_seen":
|
| 9984 |
"num_train_epochs": 1,
|
| 9985 |
"save_steps": 1000,
|
| 9986 |
"stateful_callbacks": {
|
|
@@ -9995,7 +10173,7 @@
|
|
| 9995 |
"attributes": {}
|
| 9996 |
}
|
| 9997 |
},
|
| 9998 |
-
"total_flos": 3.
|
| 9999 |
"train_batch_size": 64,
|
| 10000 |
"trial_name": null,
|
| 10001 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.38341043582667833,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 57000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9976 |
"eval_steps_per_second": 23.522,
|
| 9977 |
"num_input_tokens_seen": 14680064000,
|
| 9978 |
"step": 56000
|
| 9979 |
+
},
|
| 9980 |
+
{
|
| 9981 |
+
"epoch": 0.37702026189623367,
|
| 9982 |
+
"grad_norm": 0.2067674696445465,
|
| 9983 |
+
"learning_rate": 0.0007385793801298042,
|
| 9984 |
+
"loss": 3.05,
|
| 9985 |
+
"num_input_tokens_seen": 14693171200,
|
| 9986 |
+
"step": 56050
|
| 9987 |
+
},
|
| 9988 |
+
{
|
| 9989 |
+
"epoch": 0.3773565868399413,
|
| 9990 |
+
"grad_norm": 0.20803235471248627,
|
| 9991 |
+
"learning_rate": 0.0007269952498697733,
|
| 9992 |
+
"loss": 3.0451,
|
| 9993 |
+
"num_input_tokens_seen": 14706278400,
|
| 9994 |
+
"step": 56100
|
| 9995 |
+
},
|
| 9996 |
+
{
|
| 9997 |
+
"epoch": 0.3776929117836489,
|
| 9998 |
+
"grad_norm": 0.2035783976316452,
|
| 9999 |
+
"learning_rate": 0.0007152555484041476,
|
| 10000 |
+
"loss": 3.0281,
|
| 10001 |
+
"num_input_tokens_seen": 14719385600,
|
| 10002 |
+
"step": 56150
|
| 10003 |
+
},
|
| 10004 |
+
{
|
| 10005 |
+
"epoch": 0.3780292367273565,
|
| 10006 |
+
"grad_norm": 0.21911849081516266,
|
| 10007 |
+
"learning_rate": 0.0007033683215379002,
|
| 10008 |
+
"loss": 3.0312,
|
| 10009 |
+
"num_input_tokens_seen": 14732492800,
|
| 10010 |
+
"step": 56200
|
| 10011 |
+
},
|
| 10012 |
+
{
|
| 10013 |
+
"epoch": 0.3783655616710641,
|
| 10014 |
+
"grad_norm": 0.2263978123664856,
|
| 10015 |
+
"learning_rate": 0.000691341716182545,
|
| 10016 |
+
"loss": 3.0237,
|
| 10017 |
+
"num_input_tokens_seen": 14745600000,
|
| 10018 |
+
"step": 56250
|
| 10019 |
+
},
|
| 10020 |
+
{
|
| 10021 |
+
"epoch": 0.3787018866147717,
|
| 10022 |
+
"grad_norm": 0.20394045114517212,
|
| 10023 |
+
"learning_rate": 0.0006791839747726501,
|
| 10024 |
+
"loss": 3.0271,
|
| 10025 |
+
"num_input_tokens_seen": 14758707200,
|
| 10026 |
+
"step": 56300
|
| 10027 |
+
},
|
| 10028 |
+
{
|
| 10029 |
+
"epoch": 0.37903821155847933,
|
| 10030 |
+
"grad_norm": 0.1954122930765152,
|
| 10031 |
+
"learning_rate": 0.0006669034296168854,
|
| 10032 |
+
"loss": 3.0368,
|
| 10033 |
+
"num_input_tokens_seen": 14771814400,
|
| 10034 |
+
"step": 56350
|
| 10035 |
+
},
|
| 10036 |
+
{
|
| 10037 |
+
"epoch": 0.37937453650218694,
|
| 10038 |
+
"grad_norm": 0.2434541881084442,
|
| 10039 |
+
"learning_rate": 0.0006545084971874737,
|
| 10040 |
+
"loss": 3.0268,
|
| 10041 |
+
"num_input_tokens_seen": 14784921600,
|
| 10042 |
+
"step": 56400
|
| 10043 |
+
},
|
| 10044 |
+
{
|
| 10045 |
+
"epoch": 0.37971086144589455,
|
| 10046 |
+
"grad_norm": 0.19820261001586914,
|
| 10047 |
+
"learning_rate": 0.0006420076723519614,
|
| 10048 |
+
"loss": 3.0193,
|
| 10049 |
+
"num_input_tokens_seen": 14798028800,
|
| 10050 |
+
"step": 56450
|
| 10051 |
+
},
|
| 10052 |
+
{
|
| 10053 |
+
"epoch": 0.38004718638960217,
|
| 10054 |
+
"grad_norm": 0.18117697536945343,
|
| 10055 |
+
"learning_rate": 0.0006294095225512603,
|
| 10056 |
+
"loss": 3.0241,
|
| 10057 |
+
"num_input_tokens_seen": 14811136000,
|
| 10058 |
+
"step": 56500
|
| 10059 |
+
},
|
| 10060 |
+
{
|
| 10061 |
+
"epoch": 0.38004718638960217,
|
| 10062 |
+
"eval_loss": 2.920185089111328,
|
| 10063 |
+
"eval_runtime": 53.8805,
|
| 10064 |
+
"eval_samples_per_second": 92.798,
|
| 10065 |
+
"eval_steps_per_second": 23.199,
|
| 10066 |
+
"num_input_tokens_seen": 14811136000,
|
| 10067 |
+
"step": 56500
|
| 10068 |
+
},
|
| 10069 |
+
{
|
| 10070 |
+
"epoch": 0.3803835113333098,
|
| 10071 |
+
"grad_norm": 0.20303522050380707,
|
| 10072 |
+
"learning_rate": 0.0006167226819279528,
|
| 10073 |
+
"loss": 3.0133,
|
| 10074 |
+
"num_input_tokens_seen": 14824243200,
|
| 10075 |
+
"step": 56550
|
| 10076 |
+
},
|
| 10077 |
+
{
|
| 10078 |
+
"epoch": 0.3807198362770174,
|
| 10079 |
+
"grad_norm": 0.19498929381370544,
|
| 10080 |
+
"learning_rate": 0.0006039558454088796,
|
| 10081 |
+
"loss": 3.0241,
|
| 10082 |
+
"num_input_tokens_seen": 14837350400,
|
| 10083 |
+
"step": 56600
|
| 10084 |
+
},
|
| 10085 |
+
{
|
| 10086 |
+
"epoch": 0.381056161220725,
|
| 10087 |
+
"grad_norm": 0.21773076057434082,
|
| 10088 |
+
"learning_rate": 0.0005911177627460738,
|
| 10089 |
+
"loss": 3.0235,
|
| 10090 |
+
"num_input_tokens_seen": 14850457600,
|
| 10091 |
+
"step": 56650
|
| 10092 |
+
},
|
| 10093 |
+
{
|
| 10094 |
+
"epoch": 0.3813924861644326,
|
| 10095 |
+
"grad_norm": 0.19796748459339142,
|
| 10096 |
+
"learning_rate": 0.0005782172325201155,
|
| 10097 |
+
"loss": 3.019,
|
| 10098 |
+
"num_input_tokens_seen": 14863564800,
|
| 10099 |
+
"step": 56700
|
| 10100 |
+
},
|
| 10101 |
+
{
|
| 10102 |
+
"epoch": 0.3817288111081402,
|
| 10103 |
+
"grad_norm": 0.18569409847259521,
|
| 10104 |
+
"learning_rate": 0.000565263096110026,
|
| 10105 |
+
"loss": 3.0189,
|
| 10106 |
+
"num_input_tokens_seen": 14876672000,
|
| 10107 |
+
"step": 56750
|
| 10108 |
+
},
|
| 10109 |
+
{
|
| 10110 |
+
"epoch": 0.38206513605184783,
|
| 10111 |
+
"grad_norm": 0.27358362078666687,
|
| 10112 |
+
"learning_rate": 0.0005522642316338268,
|
| 10113 |
+
"loss": 3.0107,
|
| 10114 |
+
"num_input_tokens_seen": 14889779200,
|
| 10115 |
+
"step": 56800
|
| 10116 |
+
},
|
| 10117 |
+
{
|
| 10118 |
+
"epoch": 0.38240146099555544,
|
| 10119 |
+
"grad_norm": 0.2143600583076477,
|
| 10120 |
+
"learning_rate": 0.0005392295478639225,
|
| 10121 |
+
"loss": 3.0139,
|
| 10122 |
+
"num_input_tokens_seen": 14902886400,
|
| 10123 |
+
"step": 56850
|
| 10124 |
+
},
|
| 10125 |
+
{
|
| 10126 |
+
"epoch": 0.38273778593926305,
|
| 10127 |
+
"grad_norm": 0.18786349892616272,
|
| 10128 |
+
"learning_rate": 0.000526167978121472,
|
| 10129 |
+
"loss": 3.0187,
|
| 10130 |
+
"num_input_tokens_seen": 14915993600,
|
| 10131 |
+
"step": 56900
|
| 10132 |
+
},
|
| 10133 |
+
{
|
| 10134 |
+
"epoch": 0.38307411088297066,
|
| 10135 |
+
"grad_norm": 0.1809261441230774,
|
| 10136 |
+
"learning_rate": 0.0005130884741539367,
|
| 10137 |
+
"loss": 3.0197,
|
| 10138 |
+
"num_input_tokens_seen": 14929100800,
|
| 10139 |
+
"step": 56950
|
| 10140 |
+
},
|
| 10141 |
+
{
|
| 10142 |
+
"epoch": 0.38341043582667833,
|
| 10143 |
+
"grad_norm": 0.1926116794347763,
|
| 10144 |
+
"learning_rate": 0.0005,
|
| 10145 |
+
"loss": 3.0101,
|
| 10146 |
+
"num_input_tokens_seen": 14942208000,
|
| 10147 |
+
"step": 57000
|
| 10148 |
+
},
|
| 10149 |
+
{
|
| 10150 |
+
"epoch": 0.38341043582667833,
|
| 10151 |
+
"eval_loss": 2.912503242492676,
|
| 10152 |
+
"eval_runtime": 52.7455,
|
| 10153 |
+
"eval_samples_per_second": 94.795,
|
| 10154 |
+
"eval_steps_per_second": 23.699,
|
| 10155 |
+
"num_input_tokens_seen": 14942208000,
|
| 10156 |
+
"step": 57000
|
| 10157 |
}
|
| 10158 |
],
|
| 10159 |
"logging_steps": 50,
|
| 10160 |
"max_steps": 60000,
|
| 10161 |
+
"num_input_tokens_seen": 14942208000,
|
| 10162 |
"num_train_epochs": 1,
|
| 10163 |
"save_steps": 1000,
|
| 10164 |
"stateful_callbacks": {
|
|
|
|
| 10173 |
"attributes": {}
|
| 10174 |
}
|
| 10175 |
},
|
| 10176 |
+
"total_flos": 3.99718647595008e+18,
|
| 10177 |
"train_batch_size": 64,
|
| 10178 |
"trial_name": null,
|
| 10179 |
"trial_params": null
|