Training in progress, step 16500, checkpoint
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +353 -3
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 487156538
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:289545fc1552428b0e12aeeecd55b134e9a52241f44036de8d8f204e35e20afb
|
| 3 |
size 487156538
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1059459406
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:99059f4bf0ce62b572ac5e7aed5f529d09c121705516bcb6c43b763dcdea026d
|
| 3 |
size 1059459406
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:82d0cb560e8cc88d37b6fdb38283c527bf386371741e3fb12423b76b412c4d30
|
| 3 |
size 14960
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:139282925be08220983826aa431aa288b1fc5afb82a768d6f91bc4f11be56858
|
| 3 |
size 14960
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:495800332f29bfc930f2d135466505a7115bb60583302cd925a56ab992bb542c
|
| 3 |
size 14960
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8384800dd0611fe7ff52172a3c0ca3c74f64cf34436f28d688acecfccf0334f6
|
| 3 |
size 14960
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e1fa44d83f1ea27212c9079f128b8147324741571d792587433ce7cd41805e05
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -11208,6 +11208,356 @@
|
|
| 11208 |
"learning_rate": 0.0004949612511467957,
|
| 11209 |
"loss": 16.847,
|
| 11210 |
"step": 16000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11211 |
}
|
| 11212 |
],
|
| 11213 |
"logging_steps": 10,
|
|
@@ -11227,7 +11577,7 @@
|
|
| 11227 |
"attributes": {}
|
| 11228 |
}
|
| 11229 |
},
|
| 11230 |
-
"total_flos": 3.
|
| 11231 |
"train_batch_size": 48,
|
| 11232 |
"trial_name": null,
|
| 11233 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.0321745616215979,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 16500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 11208 |
"learning_rate": 0.0004949612511467957,
|
| 11209 |
"loss": 16.847,
|
| 11210 |
"step": 16000
|
| 11211 |
+
},
|
| 11212 |
+
{
|
| 11213 |
+
"epoch": 0.03121907464010803,
|
| 11214 |
+
"grad_norm": 7.65625,
|
| 11215 |
+
"learning_rate": 0.000494958000131341,
|
| 11216 |
+
"loss": 17.0239,
|
| 11217 |
+
"step": 16010
|
| 11218 |
+
},
|
| 11219 |
+
{
|
| 11220 |
+
"epoch": 0.03123857437442415,
|
| 11221 |
+
"grad_norm": 10.125,
|
| 11222 |
+
"learning_rate": 0.0004949547491158863,
|
| 11223 |
+
"loss": 16.8909,
|
| 11224 |
+
"step": 16020
|
| 11225 |
+
},
|
| 11226 |
+
{
|
| 11227 |
+
"epoch": 0.03125807410874027,
|
| 11228 |
+
"grad_norm": 10.75,
|
| 11229 |
+
"learning_rate": 0.0004949514981004317,
|
| 11230 |
+
"loss": 16.8678,
|
| 11231 |
+
"step": 16030
|
| 11232 |
+
},
|
| 11233 |
+
{
|
| 11234 |
+
"epoch": 0.03127757384305639,
|
| 11235 |
+
"grad_norm": 6.4375,
|
| 11236 |
+
"learning_rate": 0.000494948247084977,
|
| 11237 |
+
"loss": 16.7921,
|
| 11238 |
+
"step": 16040
|
| 11239 |
+
},
|
| 11240 |
+
{
|
| 11241 |
+
"epoch": 0.031297073577372506,
|
| 11242 |
+
"grad_norm": 6.875,
|
| 11243 |
+
"learning_rate": 0.0004949449960695223,
|
| 11244 |
+
"loss": 16.8111,
|
| 11245 |
+
"step": 16050
|
| 11246 |
+
},
|
| 11247 |
+
{
|
| 11248 |
+
"epoch": 0.03131657331168863,
|
| 11249 |
+
"grad_norm": 8.1875,
|
| 11250 |
+
"learning_rate": 0.0004949417450540676,
|
| 11251 |
+
"loss": 16.8964,
|
| 11252 |
+
"step": 16060
|
| 11253 |
+
},
|
| 11254 |
+
{
|
| 11255 |
+
"epoch": 0.03133607304600475,
|
| 11256 |
+
"grad_norm": 7.96875,
|
| 11257 |
+
"learning_rate": 0.000494938494038613,
|
| 11258 |
+
"loss": 16.9569,
|
| 11259 |
+
"step": 16070
|
| 11260 |
+
},
|
| 11261 |
+
{
|
| 11262 |
+
"epoch": 0.03135557278032087,
|
| 11263 |
+
"grad_norm": 8.5625,
|
| 11264 |
+
"learning_rate": 0.0004949352430231583,
|
| 11265 |
+
"loss": 16.9247,
|
| 11266 |
+
"step": 16080
|
| 11267 |
+
},
|
| 11268 |
+
{
|
| 11269 |
+
"epoch": 0.03137507251463699,
|
| 11270 |
+
"grad_norm": 9.6875,
|
| 11271 |
+
"learning_rate": 0.0004949319920077036,
|
| 11272 |
+
"loss": 16.9235,
|
| 11273 |
+
"step": 16090
|
| 11274 |
+
},
|
| 11275 |
+
{
|
| 11276 |
+
"epoch": 0.03139457224895311,
|
| 11277 |
+
"grad_norm": 6.5625,
|
| 11278 |
+
"learning_rate": 0.000494928740992249,
|
| 11279 |
+
"loss": 17.061,
|
| 11280 |
+
"step": 16100
|
| 11281 |
+
},
|
| 11282 |
+
{
|
| 11283 |
+
"epoch": 0.031414071983269225,
|
| 11284 |
+
"grad_norm": 7.1875,
|
| 11285 |
+
"learning_rate": 0.0004949254899767943,
|
| 11286 |
+
"loss": 16.9691,
|
| 11287 |
+
"step": 16110
|
| 11288 |
+
},
|
| 11289 |
+
{
|
| 11290 |
+
"epoch": 0.031433571717585346,
|
| 11291 |
+
"grad_norm": 8.4375,
|
| 11292 |
+
"learning_rate": 0.0004949222389613396,
|
| 11293 |
+
"loss": 16.9493,
|
| 11294 |
+
"step": 16120
|
| 11295 |
+
},
|
| 11296 |
+
{
|
| 11297 |
+
"epoch": 0.03145307145190147,
|
| 11298 |
+
"grad_norm": 7.625,
|
| 11299 |
+
"learning_rate": 0.0004949189879458849,
|
| 11300 |
+
"loss": 16.8725,
|
| 11301 |
+
"step": 16130
|
| 11302 |
+
},
|
| 11303 |
+
{
|
| 11304 |
+
"epoch": 0.03147257118621759,
|
| 11305 |
+
"grad_norm": 8.125,
|
| 11306 |
+
"learning_rate": 0.0004949157369304303,
|
| 11307 |
+
"loss": 16.9221,
|
| 11308 |
+
"step": 16140
|
| 11309 |
+
},
|
| 11310 |
+
{
|
| 11311 |
+
"epoch": 0.03149207092053371,
|
| 11312 |
+
"grad_norm": 8.1875,
|
| 11313 |
+
"learning_rate": 0.0004949124859149756,
|
| 11314 |
+
"loss": 16.964,
|
| 11315 |
+
"step": 16150
|
| 11316 |
+
},
|
| 11317 |
+
{
|
| 11318 |
+
"epoch": 0.03151157065484983,
|
| 11319 |
+
"grad_norm": 7.03125,
|
| 11320 |
+
"learning_rate": 0.0004949092348995209,
|
| 11321 |
+
"loss": 16.9567,
|
| 11322 |
+
"step": 16160
|
| 11323 |
+
},
|
| 11324 |
+
{
|
| 11325 |
+
"epoch": 0.03153107038916595,
|
| 11326 |
+
"grad_norm": 7.40625,
|
| 11327 |
+
"learning_rate": 0.0004949059838840663,
|
| 11328 |
+
"loss": 16.8704,
|
| 11329 |
+
"step": 16170
|
| 11330 |
+
},
|
| 11331 |
+
{
|
| 11332 |
+
"epoch": 0.031550570123482065,
|
| 11333 |
+
"grad_norm": 9.0625,
|
| 11334 |
+
"learning_rate": 0.0004949027328686116,
|
| 11335 |
+
"loss": 16.8933,
|
| 11336 |
+
"step": 16180
|
| 11337 |
+
},
|
| 11338 |
+
{
|
| 11339 |
+
"epoch": 0.031570069857798186,
|
| 11340 |
+
"grad_norm": 7.625,
|
| 11341 |
+
"learning_rate": 0.0004948994818531569,
|
| 11342 |
+
"loss": 17.0769,
|
| 11343 |
+
"step": 16190
|
| 11344 |
+
},
|
| 11345 |
+
{
|
| 11346 |
+
"epoch": 0.03158956959211431,
|
| 11347 |
+
"grad_norm": 7.375,
|
| 11348 |
+
"learning_rate": 0.0004948962308377022,
|
| 11349 |
+
"loss": 16.9815,
|
| 11350 |
+
"step": 16200
|
| 11351 |
+
},
|
| 11352 |
+
{
|
| 11353 |
+
"epoch": 0.03160906932643043,
|
| 11354 |
+
"grad_norm": 143.0,
|
| 11355 |
+
"learning_rate": 0.0004948929798222476,
|
| 11356 |
+
"loss": 16.9903,
|
| 11357 |
+
"step": 16210
|
| 11358 |
+
},
|
| 11359 |
+
{
|
| 11360 |
+
"epoch": 0.03162856906074655,
|
| 11361 |
+
"grad_norm": 7.1875,
|
| 11362 |
+
"learning_rate": 0.0004948897288067928,
|
| 11363 |
+
"loss": 16.9974,
|
| 11364 |
+
"step": 16220
|
| 11365 |
+
},
|
| 11366 |
+
{
|
| 11367 |
+
"epoch": 0.03164806879506267,
|
| 11368 |
+
"grad_norm": 7.21875,
|
| 11369 |
+
"learning_rate": 0.0004948864777913381,
|
| 11370 |
+
"loss": 16.8849,
|
| 11371 |
+
"step": 16230
|
| 11372 |
+
},
|
| 11373 |
+
{
|
| 11374 |
+
"epoch": 0.031667568529378784,
|
| 11375 |
+
"grad_norm": 10.125,
|
| 11376 |
+
"learning_rate": 0.0004948832267758834,
|
| 11377 |
+
"loss": 16.9787,
|
| 11378 |
+
"step": 16240
|
| 11379 |
+
},
|
| 11380 |
+
{
|
| 11381 |
+
"epoch": 0.031687068263694905,
|
| 11382 |
+
"grad_norm": 10.4375,
|
| 11383 |
+
"learning_rate": 0.0004948799757604288,
|
| 11384 |
+
"loss": 16.941,
|
| 11385 |
+
"step": 16250
|
| 11386 |
+
},
|
| 11387 |
+
{
|
| 11388 |
+
"epoch": 0.031706567998011026,
|
| 11389 |
+
"grad_norm": 9.0625,
|
| 11390 |
+
"learning_rate": 0.0004948767247449741,
|
| 11391 |
+
"loss": 16.7498,
|
| 11392 |
+
"step": 16260
|
| 11393 |
+
},
|
| 11394 |
+
{
|
| 11395 |
+
"epoch": 0.03172606773232715,
|
| 11396 |
+
"grad_norm": 9.3125,
|
| 11397 |
+
"learning_rate": 0.0004948734737295194,
|
| 11398 |
+
"loss": 16.8968,
|
| 11399 |
+
"step": 16270
|
| 11400 |
+
},
|
| 11401 |
+
{
|
| 11402 |
+
"epoch": 0.03174556746664327,
|
| 11403 |
+
"grad_norm": 23.25,
|
| 11404 |
+
"learning_rate": 0.0004948702227140648,
|
| 11405 |
+
"loss": 16.9446,
|
| 11406 |
+
"step": 16280
|
| 11407 |
+
},
|
| 11408 |
+
{
|
| 11409 |
+
"epoch": 0.03176506720095939,
|
| 11410 |
+
"grad_norm": 6.4375,
|
| 11411 |
+
"learning_rate": 0.0004948669716986101,
|
| 11412 |
+
"loss": 16.9886,
|
| 11413 |
+
"step": 16290
|
| 11414 |
+
},
|
| 11415 |
+
{
|
| 11416 |
+
"epoch": 0.03178456693527551,
|
| 11417 |
+
"grad_norm": 6.59375,
|
| 11418 |
+
"learning_rate": 0.0004948637206831554,
|
| 11419 |
+
"loss": 16.8989,
|
| 11420 |
+
"step": 16300
|
| 11421 |
+
},
|
| 11422 |
+
{
|
| 11423 |
+
"epoch": 0.031804066669591624,
|
| 11424 |
+
"grad_norm": 7.125,
|
| 11425 |
+
"learning_rate": 0.0004948604696677007,
|
| 11426 |
+
"loss": 16.9622,
|
| 11427 |
+
"step": 16310
|
| 11428 |
+
},
|
| 11429 |
+
{
|
| 11430 |
+
"epoch": 0.031823566403907745,
|
| 11431 |
+
"grad_norm": 10.6875,
|
| 11432 |
+
"learning_rate": 0.0004948572186522461,
|
| 11433 |
+
"loss": 16.9232,
|
| 11434 |
+
"step": 16320
|
| 11435 |
+
},
|
| 11436 |
+
{
|
| 11437 |
+
"epoch": 0.031843066138223866,
|
| 11438 |
+
"grad_norm": 8.4375,
|
| 11439 |
+
"learning_rate": 0.0004948539676367914,
|
| 11440 |
+
"loss": 17.0506,
|
| 11441 |
+
"step": 16330
|
| 11442 |
+
},
|
| 11443 |
+
{
|
| 11444 |
+
"epoch": 0.03186256587253999,
|
| 11445 |
+
"grad_norm": 8.875,
|
| 11446 |
+
"learning_rate": 0.0004948507166213367,
|
| 11447 |
+
"loss": 16.8868,
|
| 11448 |
+
"step": 16340
|
| 11449 |
+
},
|
| 11450 |
+
{
|
| 11451 |
+
"epoch": 0.03188206560685611,
|
| 11452 |
+
"grad_norm": 6.71875,
|
| 11453 |
+
"learning_rate": 0.0004948474656058821,
|
| 11454 |
+
"loss": 16.9945,
|
| 11455 |
+
"step": 16350
|
| 11456 |
+
},
|
| 11457 |
+
{
|
| 11458 |
+
"epoch": 0.03190156534117223,
|
| 11459 |
+
"grad_norm": 8.25,
|
| 11460 |
+
"learning_rate": 0.0004948442145904274,
|
| 11461 |
+
"loss": 16.832,
|
| 11462 |
+
"step": 16360
|
| 11463 |
+
},
|
| 11464 |
+
{
|
| 11465 |
+
"epoch": 0.03192106507548834,
|
| 11466 |
+
"grad_norm": 6.53125,
|
| 11467 |
+
"learning_rate": 0.0004948409635749726,
|
| 11468 |
+
"loss": 17.0248,
|
| 11469 |
+
"step": 16370
|
| 11470 |
+
},
|
| 11471 |
+
{
|
| 11472 |
+
"epoch": 0.031940564809804464,
|
| 11473 |
+
"grad_norm": 7.03125,
|
| 11474 |
+
"learning_rate": 0.0004948377125595179,
|
| 11475 |
+
"loss": 17.0616,
|
| 11476 |
+
"step": 16380
|
| 11477 |
+
},
|
| 11478 |
+
{
|
| 11479 |
+
"epoch": 0.031960064544120585,
|
| 11480 |
+
"grad_norm": 6.34375,
|
| 11481 |
+
"learning_rate": 0.0004948344615440633,
|
| 11482 |
+
"loss": 16.9359,
|
| 11483 |
+
"step": 16390
|
| 11484 |
+
},
|
| 11485 |
+
{
|
| 11486 |
+
"epoch": 0.031979564278436706,
|
| 11487 |
+
"grad_norm": 18.0,
|
| 11488 |
+
"learning_rate": 0.0004948312105286086,
|
| 11489 |
+
"loss": 16.9874,
|
| 11490 |
+
"step": 16400
|
| 11491 |
+
},
|
| 11492 |
+
{
|
| 11493 |
+
"epoch": 0.03199906401275283,
|
| 11494 |
+
"grad_norm": 9.25,
|
| 11495 |
+
"learning_rate": 0.0004948279595131539,
|
| 11496 |
+
"loss": 16.9289,
|
| 11497 |
+
"step": 16410
|
| 11498 |
+
},
|
| 11499 |
+
{
|
| 11500 |
+
"epoch": 0.03201856374706895,
|
| 11501 |
+
"grad_norm": 6.5,
|
| 11502 |
+
"learning_rate": 0.0004948247084976992,
|
| 11503 |
+
"loss": 16.8866,
|
| 11504 |
+
"step": 16420
|
| 11505 |
+
},
|
| 11506 |
+
{
|
| 11507 |
+
"epoch": 0.03203806348138507,
|
| 11508 |
+
"grad_norm": 7.5625,
|
| 11509 |
+
"learning_rate": 0.0004948214574822446,
|
| 11510 |
+
"loss": 16.8968,
|
| 11511 |
+
"step": 16430
|
| 11512 |
+
},
|
| 11513 |
+
{
|
| 11514 |
+
"epoch": 0.03205756321570118,
|
| 11515 |
+
"grad_norm": 14.0,
|
| 11516 |
+
"learning_rate": 0.0004948182064667899,
|
| 11517 |
+
"loss": 16.8328,
|
| 11518 |
+
"step": 16440
|
| 11519 |
+
},
|
| 11520 |
+
{
|
| 11521 |
+
"epoch": 0.032077062950017304,
|
| 11522 |
+
"grad_norm": 8.125,
|
| 11523 |
+
"learning_rate": 0.0004948149554513352,
|
| 11524 |
+
"loss": 16.9513,
|
| 11525 |
+
"step": 16450
|
| 11526 |
+
},
|
| 11527 |
+
{
|
| 11528 |
+
"epoch": 0.032096562684333425,
|
| 11529 |
+
"grad_norm": 6.3125,
|
| 11530 |
+
"learning_rate": 0.0004948117044358806,
|
| 11531 |
+
"loss": 16.9399,
|
| 11532 |
+
"step": 16460
|
| 11533 |
+
},
|
| 11534 |
+
{
|
| 11535 |
+
"epoch": 0.032116062418649546,
|
| 11536 |
+
"grad_norm": 8.4375,
|
| 11537 |
+
"learning_rate": 0.0004948084534204259,
|
| 11538 |
+
"loss": 16.9612,
|
| 11539 |
+
"step": 16470
|
| 11540 |
+
},
|
| 11541 |
+
{
|
| 11542 |
+
"epoch": 0.03213556215296567,
|
| 11543 |
+
"grad_norm": 7.40625,
|
| 11544 |
+
"learning_rate": 0.0004948052024049712,
|
| 11545 |
+
"loss": 16.9981,
|
| 11546 |
+
"step": 16480
|
| 11547 |
+
},
|
| 11548 |
+
{
|
| 11549 |
+
"epoch": 0.03215506188728179,
|
| 11550 |
+
"grad_norm": 8.1875,
|
| 11551 |
+
"learning_rate": 0.0004948019513895165,
|
| 11552 |
+
"loss": 16.9472,
|
| 11553 |
+
"step": 16490
|
| 11554 |
+
},
|
| 11555 |
+
{
|
| 11556 |
+
"epoch": 0.0321745616215979,
|
| 11557 |
+
"grad_norm": 7.3125,
|
| 11558 |
+
"learning_rate": 0.0004947987003740619,
|
| 11559 |
+
"loss": 16.9264,
|
| 11560 |
+
"step": 16500
|
| 11561 |
}
|
| 11562 |
],
|
| 11563 |
"logging_steps": 10,
|
|
|
|
| 11577 |
"attributes": {}
|
| 11578 |
}
|
| 11579 |
},
|
| 11580 |
+
"total_flos": 3.6711547362756e+19,
|
| 11581 |
"train_batch_size": 48,
|
| 11582 |
"trial_name": null,
|
| 11583 |
"trial_params": null
|