Training in progress, step 58000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1fbbcf8e4efabf5866400ce20d5f64dfe9bcdba3c76105321e75b94424bbdf9a
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee28446b68e061d51e2acb6d49ad965661e91bf2d3291a5dc5003af4c9992cc6
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ec1bfb0db1c21e8b4cd52af95928aa8366b624cdfe8a7ae4baa053e84325dfb8
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e04abc75ac3354daa3070b9f9eb5e8a95eba4855d092af143aa714bd01a0140a
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -10154,11 +10154,189 @@
|
|
| 10154 |
"eval_steps_per_second": 23.699,
|
| 10155 |
"num_input_tokens_seen": 14942208000,
|
| 10156 |
"step": 57000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10157 |
}
|
| 10158 |
],
|
| 10159 |
"logging_steps": 50,
|
| 10160 |
"max_steps": 60000,
|
| 10161 |
-
"num_input_tokens_seen":
|
| 10162 |
"num_train_epochs": 1,
|
| 10163 |
"save_steps": 1000,
|
| 10164 |
"stateful_callbacks": {
|
|
@@ -10173,7 +10351,7 @@
|
|
| 10173 |
"attributes": {}
|
| 10174 |
}
|
| 10175 |
},
|
| 10176 |
-
"total_flos":
|
| 10177 |
"train_batch_size": 64,
|
| 10178 |
"trial_name": null,
|
| 10179 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.39013693470083055,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 58000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 10154 |
"eval_steps_per_second": 23.699,
|
| 10155 |
"num_input_tokens_seen": 14942208000,
|
| 10156 |
"step": 57000
|
| 10157 |
+
},
|
| 10158 |
+
{
|
| 10159 |
+
"epoch": 0.38374676077038594,
|
| 10160 |
+
"grad_norm": 0.255500853061676,
|
| 10161 |
+
"learning_rate": 0.0004869115258460635,
|
| 10162 |
+
"loss": 3.0102,
|
| 10163 |
+
"num_input_tokens_seen": 14955315200,
|
| 10164 |
+
"step": 57050
|
| 10165 |
+
},
|
| 10166 |
+
{
|
| 10167 |
+
"epoch": 0.38408308571409355,
|
| 10168 |
+
"grad_norm": 0.18287675082683563,
|
| 10169 |
+
"learning_rate": 0.0004738320218785281,
|
| 10170 |
+
"loss": 3.0074,
|
| 10171 |
+
"num_input_tokens_seen": 14968422400,
|
| 10172 |
+
"step": 57100
|
| 10173 |
+
},
|
| 10174 |
+
{
|
| 10175 |
+
"epoch": 0.38441941065780116,
|
| 10176 |
+
"grad_norm": 0.1864452064037323,
|
| 10177 |
+
"learning_rate": 0.0004607704521360776,
|
| 10178 |
+
"loss": 3.0181,
|
| 10179 |
+
"num_input_tokens_seen": 14981529600,
|
| 10180 |
+
"step": 57150
|
| 10181 |
+
},
|
| 10182 |
+
{
|
| 10183 |
+
"epoch": 0.3847557356015088,
|
| 10184 |
+
"grad_norm": 0.17273065447807312,
|
| 10185 |
+
"learning_rate": 0.00044773576836617336,
|
| 10186 |
+
"loss": 3.0077,
|
| 10187 |
+
"num_input_tokens_seen": 14994636800,
|
| 10188 |
+
"step": 57200
|
| 10189 |
+
},
|
| 10190 |
+
{
|
| 10191 |
+
"epoch": 0.3850920605452164,
|
| 10192 |
+
"grad_norm": 0.17590677738189697,
|
| 10193 |
+
"learning_rate": 0.00043473690388997434,
|
| 10194 |
+
"loss": 3.0118,
|
| 10195 |
+
"num_input_tokens_seen": 15007744000,
|
| 10196 |
+
"step": 57250
|
| 10197 |
+
},
|
| 10198 |
+
{
|
| 10199 |
+
"epoch": 0.385428385488924,
|
| 10200 |
+
"grad_norm": 0.16380582749843597,
|
| 10201 |
+
"learning_rate": 0.0004217827674798845,
|
| 10202 |
+
"loss": 3.0074,
|
| 10203 |
+
"num_input_tokens_seen": 15020851200,
|
| 10204 |
+
"step": 57300
|
| 10205 |
+
},
|
| 10206 |
+
{
|
| 10207 |
+
"epoch": 0.3857647104326316,
|
| 10208 |
+
"grad_norm": 0.19464251399040222,
|
| 10209 |
+
"learning_rate": 0.00040888223725392626,
|
| 10210 |
+
"loss": 3.0126,
|
| 10211 |
+
"num_input_tokens_seen": 15033958400,
|
| 10212 |
+
"step": 57350
|
| 10213 |
+
},
|
| 10214 |
+
{
|
| 10215 |
+
"epoch": 0.3861010353763392,
|
| 10216 |
+
"grad_norm": 0.17150136828422546,
|
| 10217 |
+
"learning_rate": 0.0003960441545911204,
|
| 10218 |
+
"loss": 3.0049,
|
| 10219 |
+
"num_input_tokens_seen": 15047065600,
|
| 10220 |
+
"step": 57400
|
| 10221 |
+
},
|
| 10222 |
+
{
|
| 10223 |
+
"epoch": 0.38643736032004683,
|
| 10224 |
+
"grad_norm": 0.1877928376197815,
|
| 10225 |
+
"learning_rate": 0.00038327731807204744,
|
| 10226 |
+
"loss": 3.0089,
|
| 10227 |
+
"num_input_tokens_seen": 15060172800,
|
| 10228 |
+
"step": 57450
|
| 10229 |
+
},
|
| 10230 |
+
{
|
| 10231 |
+
"epoch": 0.38677368526375444,
|
| 10232 |
+
"grad_norm": 0.2605326771736145,
|
| 10233 |
+
"learning_rate": 0.0003705904774487396,
|
| 10234 |
+
"loss": 3.0115,
|
| 10235 |
+
"num_input_tokens_seen": 15073280000,
|
| 10236 |
+
"step": 57500
|
| 10237 |
+
},
|
| 10238 |
+
{
|
| 10239 |
+
"epoch": 0.38677368526375444,
|
| 10240 |
+
"eval_loss": 2.9029135704040527,
|
| 10241 |
+
"eval_runtime": 53.9097,
|
| 10242 |
+
"eval_samples_per_second": 92.748,
|
| 10243 |
+
"eval_steps_per_second": 23.187,
|
| 10244 |
+
"num_input_tokens_seen": 15073280000,
|
| 10245 |
+
"step": 57500
|
| 10246 |
+
},
|
| 10247 |
+
{
|
| 10248 |
+
"epoch": 0.38711001020746205,
|
| 10249 |
+
"grad_norm": 0.21006393432617188,
|
| 10250 |
+
"learning_rate": 0.0003579923276480387,
|
| 10251 |
+
"loss": 3.0044,
|
| 10252 |
+
"num_input_tokens_seen": 15086387200,
|
| 10253 |
+
"step": 57550
|
| 10254 |
+
},
|
| 10255 |
+
{
|
| 10256 |
+
"epoch": 0.38744633515116966,
|
| 10257 |
+
"grad_norm": 0.1743878722190857,
|
| 10258 |
+
"learning_rate": 0.00034549150281252633,
|
| 10259 |
+
"loss": 3.0114,
|
| 10260 |
+
"num_input_tokens_seen": 15099494400,
|
| 10261 |
+
"step": 57600
|
| 10262 |
+
},
|
| 10263 |
+
{
|
| 10264 |
+
"epoch": 0.3877826600948773,
|
| 10265 |
+
"grad_norm": 0.16699257493019104,
|
| 10266 |
+
"learning_rate": 0.00033309657038311456,
|
| 10267 |
+
"loss": 3.0041,
|
| 10268 |
+
"num_input_tokens_seen": 15112601600,
|
| 10269 |
+
"step": 57650
|
| 10270 |
+
},
|
| 10271 |
+
{
|
| 10272 |
+
"epoch": 0.3881189850385849,
|
| 10273 |
+
"grad_norm": 0.17115868628025055,
|
| 10274 |
+
"learning_rate": 0.00032081602522734986,
|
| 10275 |
+
"loss": 3.0051,
|
| 10276 |
+
"num_input_tokens_seen": 15125708800,
|
| 10277 |
+
"step": 57700
|
| 10278 |
+
},
|
| 10279 |
+
{
|
| 10280 |
+
"epoch": 0.3884553099822925,
|
| 10281 |
+
"grad_norm": 0.16885310411453247,
|
| 10282 |
+
"learning_rate": 0.0003086582838174551,
|
| 10283 |
+
"loss": 2.9969,
|
| 10284 |
+
"num_input_tokens_seen": 15138816000,
|
| 10285 |
+
"step": 57750
|
| 10286 |
+
},
|
| 10287 |
+
{
|
| 10288 |
+
"epoch": 0.3887916349260001,
|
| 10289 |
+
"grad_norm": 0.17101123929023743,
|
| 10290 |
+
"learning_rate": 0.0002966316784621,
|
| 10291 |
+
"loss": 2.9947,
|
| 10292 |
+
"num_input_tokens_seen": 15151923200,
|
| 10293 |
+
"step": 57800
|
| 10294 |
+
},
|
| 10295 |
+
{
|
| 10296 |
+
"epoch": 0.3891279598697077,
|
| 10297 |
+
"grad_norm": 0.1529199331998825,
|
| 10298 |
+
"learning_rate": 0.0002847444515958523,
|
| 10299 |
+
"loss": 3.0019,
|
| 10300 |
+
"num_input_tokens_seen": 15165030400,
|
| 10301 |
+
"step": 57850
|
| 10302 |
+
},
|
| 10303 |
+
{
|
| 10304 |
+
"epoch": 0.38946428481341533,
|
| 10305 |
+
"grad_norm": 0.16087768971920013,
|
| 10306 |
+
"learning_rate": 0.00027300475013022663,
|
| 10307 |
+
"loss": 2.9947,
|
| 10308 |
+
"num_input_tokens_seen": 15178137600,
|
| 10309 |
+
"step": 57900
|
| 10310 |
+
},
|
| 10311 |
+
{
|
| 10312 |
+
"epoch": 0.38980060975712294,
|
| 10313 |
+
"grad_norm": 0.16023555397987366,
|
| 10314 |
+
"learning_rate": 0.00026142061987019576,
|
| 10315 |
+
"loss": 3.0022,
|
| 10316 |
+
"num_input_tokens_seen": 15191244800,
|
| 10317 |
+
"step": 57950
|
| 10318 |
+
},
|
| 10319 |
+
{
|
| 10320 |
+
"epoch": 0.39013693470083055,
|
| 10321 |
+
"grad_norm": 0.16161410510540009,
|
| 10322 |
+
"learning_rate": 0.0002500000000000001,
|
| 10323 |
+
"loss": 2.9931,
|
| 10324 |
+
"num_input_tokens_seen": 15204352000,
|
| 10325 |
+
"step": 58000
|
| 10326 |
+
},
|
| 10327 |
+
{
|
| 10328 |
+
"epoch": 0.39013693470083055,
|
| 10329 |
+
"eval_loss": 2.8950610160827637,
|
| 10330 |
+
"eval_runtime": 53.5434,
|
| 10331 |
+
"eval_samples_per_second": 93.382,
|
| 10332 |
+
"eval_steps_per_second": 23.346,
|
| 10333 |
+
"num_input_tokens_seen": 15204352000,
|
| 10334 |
+
"step": 58000
|
| 10335 |
}
|
| 10336 |
],
|
| 10337 |
"logging_steps": 50,
|
| 10338 |
"max_steps": 60000,
|
| 10339 |
+
"num_input_tokens_seen": 15204352000,
|
| 10340 |
"num_train_epochs": 1,
|
| 10341 |
"save_steps": 1000,
|
| 10342 |
"stateful_callbacks": {
|
|
|
|
| 10351 |
"attributes": {}
|
| 10352 |
}
|
| 10353 |
},
|
| 10354 |
+
"total_flos": 4.06731255447552e+18,
|
| 10355 |
"train_batch_size": 64,
|
| 10356 |
"trial_name": null,
|
| 10357 |
"trial_params": null
|