Training in progress, step 120000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e70907b0d675ee2643842e014ed6c972c9663ac94c350f0ab42a0be8632152c
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3a65d04a4bb9bbc428894a0e56fe5a8ff86920144b87270537f75bf5b3558c9
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d8a9a435a8fb7efaea34ed653a04299793c4ab23d440f306a1001d1a5e2fe4d
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f14fb013cc682f88bd394d32631eff6723ea097f4e238bec79824a853a5616c4
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -21190,11 +21190,189 @@
|
|
| 21190 |
"eval_steps_per_second": 15.211,
|
| 21191 |
"num_input_tokens_seen": 62380238112,
|
| 21192 |
"step": 119000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21193 |
}
|
| 21194 |
],
|
| 21195 |
"logging_steps": 50,
|
| 21196 |
"max_steps": 140000,
|
| 21197 |
-
"num_input_tokens_seen":
|
| 21198 |
"num_train_epochs": 2,
|
| 21199 |
"save_steps": 1000,
|
| 21200 |
"stateful_callbacks": {
|
|
@@ -21209,7 +21387,7 @@
|
|
| 21209 |
"attributes": {}
|
| 21210 |
}
|
| 21211 |
},
|
| 21212 |
-
"total_flos": 1.
|
| 21213 |
"train_batch_size": 32,
|
| 21214 |
"trial_name": null,
|
| 21215 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.1448084238644358,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 120000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 21190 |
"eval_steps_per_second": 15.211,
|
| 21191 |
"num_input_tokens_seen": 62380238112,
|
| 21192 |
"step": 119000
|
| 21193 |
+
},
|
| 21194 |
+
{
|
| 21195 |
+
"epoch": 1.1357453760568588,
|
| 21196 |
+
"grad_norm": 0.1409357637166977,
|
| 21197 |
+
"learning_rate": 0.0008515644076206653,
|
| 21198 |
+
"loss": 2.0885,
|
| 21199 |
+
"num_input_tokens_seen": 62406448192,
|
| 21200 |
+
"step": 119050
|
| 21201 |
+
},
|
| 21202 |
+
{
|
| 21203 |
+
"epoch": 1.136222378573047,
|
| 21204 |
+
"grad_norm": 0.15409712493419647,
|
| 21205 |
+
"learning_rate": 0.0008495643602586287,
|
| 21206 |
+
"loss": 2.0778,
|
| 21207 |
+
"num_input_tokens_seen": 62432661632,
|
| 21208 |
+
"step": 119100
|
| 21209 |
+
},
|
| 21210 |
+
{
|
| 21211 |
+
"epoch": 1.1366993810892352,
|
| 21212 |
+
"grad_norm": 0.1327887326478958,
|
| 21213 |
+
"learning_rate": 0.0008475533114523955,
|
| 21214 |
+
"loss": 2.086,
|
| 21215 |
+
"num_input_tokens_seen": 62458870752,
|
| 21216 |
+
"step": 119150
|
| 21217 |
+
},
|
| 21218 |
+
{
|
| 21219 |
+
"epoch": 1.1371763836054236,
|
| 21220 |
+
"grad_norm": 0.14051629602909088,
|
| 21221 |
+
"learning_rate": 0.0008455313244934324,
|
| 21222 |
+
"loss": 2.0765,
|
| 21223 |
+
"num_input_tokens_seen": 62485082688,
|
| 21224 |
+
"step": 119200
|
| 21225 |
+
},
|
| 21226 |
+
{
|
| 21227 |
+
"epoch": 1.1376533861216118,
|
| 21228 |
+
"grad_norm": 0.13998936116695404,
|
| 21229 |
+
"learning_rate": 0.0008434984630174508,
|
| 21230 |
+
"loss": 2.0784,
|
| 21231 |
+
"num_input_tokens_seen": 62511288832,
|
| 21232 |
+
"step": 119250
|
| 21233 |
+
},
|
| 21234 |
+
{
|
| 21235 |
+
"epoch": 1.1381303886378,
|
| 21236 |
+
"grad_norm": 0.1316358745098114,
|
| 21237 |
+
"learning_rate": 0.0008414547910024035,
|
| 21238 |
+
"loss": 2.0839,
|
| 21239 |
+
"num_input_tokens_seen": 62537499648,
|
| 21240 |
+
"step": 119300
|
| 21241 |
+
},
|
| 21242 |
+
{
|
| 21243 |
+
"epoch": 1.1386073911539882,
|
| 21244 |
+
"grad_norm": 0.13315369188785553,
|
| 21245 |
+
"learning_rate": 0.0008394003727664709,
|
| 21246 |
+
"loss": 2.0793,
|
| 21247 |
+
"num_input_tokens_seen": 62563710336,
|
| 21248 |
+
"step": 119350
|
| 21249 |
+
},
|
| 21250 |
+
{
|
| 21251 |
+
"epoch": 1.1390843936701767,
|
| 21252 |
+
"grad_norm": 0.1454961597919464,
|
| 21253 |
+
"learning_rate": 0.0008373352729660373,
|
| 21254 |
+
"loss": 2.0814,
|
| 21255 |
+
"num_input_tokens_seen": 62589918400,
|
| 21256 |
+
"step": 119400
|
| 21257 |
+
},
|
| 21258 |
+
{
|
| 21259 |
+
"epoch": 1.1395613961863649,
|
| 21260 |
+
"grad_norm": 0.14860859513282776,
|
| 21261 |
+
"learning_rate": 0.0008352595565936554,
|
| 21262 |
+
"loss": 2.0885,
|
| 21263 |
+
"num_input_tokens_seen": 62616130880,
|
| 21264 |
+
"step": 119450
|
| 21265 |
+
},
|
| 21266 |
+
{
|
| 21267 |
+
"epoch": 1.140038398702553,
|
| 21268 |
+
"grad_norm": 0.13664905726909637,
|
| 21269 |
+
"learning_rate": 0.000833173288976002,
|
| 21270 |
+
"loss": 2.0836,
|
| 21271 |
+
"num_input_tokens_seen": 62642339520,
|
| 21272 |
+
"step": 119500
|
| 21273 |
+
},
|
| 21274 |
+
{
|
| 21275 |
+
"epoch": 1.140038398702553,
|
| 21276 |
+
"eval_loss": 1.9989631175994873,
|
| 21277 |
+
"eval_runtime": 83.3074,
|
| 21278 |
+
"eval_samples_per_second": 60.019,
|
| 21279 |
+
"eval_steps_per_second": 15.005,
|
| 21280 |
+
"num_input_tokens_seen": 62642339520,
|
| 21281 |
+
"step": 119500
|
| 21282 |
+
},
|
| 21283 |
+
{
|
| 21284 |
+
"epoch": 1.1405154012187415,
|
| 21285 |
+
"grad_norm": 0.1337277889251709,
|
| 21286 |
+
"learning_rate": 0.0008310765357718206,
|
| 21287 |
+
"loss": 2.0745,
|
| 21288 |
+
"num_input_tokens_seen": 62668548896,
|
| 21289 |
+
"step": 119550
|
| 21290 |
+
},
|
| 21291 |
+
{
|
| 21292 |
+
"epoch": 1.1409924037349297,
|
| 21293 |
+
"grad_norm": 0.13231709599494934,
|
| 21294 |
+
"learning_rate": 0.0008289693629698564,
|
| 21295 |
+
"loss": 2.0851,
|
| 21296 |
+
"num_input_tokens_seen": 62694761888,
|
| 21297 |
+
"step": 119600
|
| 21298 |
+
},
|
| 21299 |
+
{
|
| 21300 |
+
"epoch": 1.141469406251118,
|
| 21301 |
+
"grad_norm": 0.13446244597434998,
|
| 21302 |
+
"learning_rate": 0.0008268518368867782,
|
| 21303 |
+
"loss": 2.0737,
|
| 21304 |
+
"num_input_tokens_seen": 62720974368,
|
| 21305 |
+
"step": 119650
|
| 21306 |
+
},
|
| 21307 |
+
{
|
| 21308 |
+
"epoch": 1.1419464087673061,
|
| 21309 |
+
"grad_norm": 0.14359907805919647,
|
| 21310 |
+
"learning_rate": 0.0008247240241650918,
|
| 21311 |
+
"loss": 2.0772,
|
| 21312 |
+
"num_input_tokens_seen": 62747188768,
|
| 21313 |
+
"step": 119700
|
| 21314 |
+
},
|
| 21315 |
+
{
|
| 21316 |
+
"epoch": 1.1424234112834946,
|
| 21317 |
+
"grad_norm": 0.13156485557556152,
|
| 21318 |
+
"learning_rate": 0.0008225859917710439,
|
| 21319 |
+
"loss": 2.0791,
|
| 21320 |
+
"num_input_tokens_seen": 62773395936,
|
| 21321 |
+
"step": 119750
|
| 21322 |
+
},
|
| 21323 |
+
{
|
| 21324 |
+
"epoch": 1.1429004137996828,
|
| 21325 |
+
"grad_norm": 0.14039525389671326,
|
| 21326 |
+
"learning_rate": 0.000820437806992512,
|
| 21327 |
+
"loss": 2.0656,
|
| 21328 |
+
"num_input_tokens_seen": 62799610336,
|
| 21329 |
+
"step": 119800
|
| 21330 |
+
},
|
| 21331 |
+
{
|
| 21332 |
+
"epoch": 1.143377416315871,
|
| 21333 |
+
"grad_norm": 0.14653949439525604,
|
| 21334 |
+
"learning_rate": 0.0008182795374368893,
|
| 21335 |
+
"loss": 2.0741,
|
| 21336 |
+
"num_input_tokens_seen": 62825821984,
|
| 21337 |
+
"step": 119850
|
| 21338 |
+
},
|
| 21339 |
+
{
|
| 21340 |
+
"epoch": 1.1438544188320594,
|
| 21341 |
+
"grad_norm": 0.12294785678386688,
|
| 21342 |
+
"learning_rate": 0.0008161112510289549,
|
| 21343 |
+
"loss": 2.0741,
|
| 21344 |
+
"num_input_tokens_seen": 62852031840,
|
| 21345 |
+
"step": 119900
|
| 21346 |
+
},
|
| 21347 |
+
{
|
| 21348 |
+
"epoch": 1.1443314213482476,
|
| 21349 |
+
"grad_norm": 0.18639816343784332,
|
| 21350 |
+
"learning_rate": 0.0008139330160087374,
|
| 21351 |
+
"loss": 2.1258,
|
| 21352 |
+
"num_input_tokens_seen": 62878240576,
|
| 21353 |
+
"step": 119950
|
| 21354 |
+
},
|
| 21355 |
+
{
|
| 21356 |
+
"epoch": 1.1448084238644358,
|
| 21357 |
+
"grad_norm": 0.1320071518421173,
|
| 21358 |
+
"learning_rate": 0.0008117449009293668,
|
| 21359 |
+
"loss": 2.0956,
|
| 21360 |
+
"num_input_tokens_seen": 62904447680,
|
| 21361 |
+
"step": 120000
|
| 21362 |
+
},
|
| 21363 |
+
{
|
| 21364 |
+
"epoch": 1.1448084238644358,
|
| 21365 |
+
"eval_loss": 2.0032639503479004,
|
| 21366 |
+
"eval_runtime": 82.8531,
|
| 21367 |
+
"eval_samples_per_second": 60.348,
|
| 21368 |
+
"eval_steps_per_second": 15.087,
|
| 21369 |
+
"num_input_tokens_seen": 62904447680,
|
| 21370 |
+
"step": 120000
|
| 21371 |
}
|
| 21372 |
],
|
| 21373 |
"logging_steps": 50,
|
| 21374 |
"max_steps": 140000,
|
| 21375 |
+
"num_input_tokens_seen": 62904447680,
|
| 21376 |
"num_train_epochs": 2,
|
| 21377 |
"save_steps": 1000,
|
| 21378 |
"stateful_callbacks": {
|
|
|
|
| 21387 |
"attributes": {}
|
| 21388 |
}
|
| 21389 |
},
|
| 21390 |
+
"total_flos": 1.1132939862234317e+20,
|
| 21391 |
"train_batch_size": 32,
|
| 21392 |
"trial_name": null,
|
| 21393 |
"trial_params": null
|