Training in progress, step 2400, checkpoint
Browse files
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 228140600
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d4ddfdb9e3869897cc8e2c794340a2005ba76c5f50e34e53325b8ac99f6dc318
|
| 3 |
size 228140600
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 117931203
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e830b2069082bb840c6d5f287e7770c0cf0e2e3f80198ae73bbef00b14811db
|
| 3 |
size 117931203
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e39d866cd1fc861fe2c47687364cde08217b0454e6f5ff3c9a3af4b1571fdbed
|
| 3 |
size 14645
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:137d8a11890df77c4e1b6a4687bee089955dbcdddb421d49b265e762ccebb1d2
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7a3c006c2c7c0bc33914c8e11069f53d495f2eafa42ba0a076cb7cebbe066c7a
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": 750,
|
| 3 |
"best_metric": 0.5089643597602844,
|
| 4 |
"best_model_checkpoint": "./adapter-phase1/checkpoint-750",
|
| 5 |
-
"epoch": 3.
|
| 6 |
"eval_steps": 300,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2252,6 +2252,318 @@
|
|
| 2252 |
"eval_samples_per_second": 2.3,
|
| 2253 |
"eval_steps_per_second": 0.575,
|
| 2254 |
"step": 2100
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2255 |
}
|
| 2256 |
],
|
| 2257 |
"logging_steps": 10,
|
|
@@ -2271,7 +2583,7 @@
|
|
| 2271 |
"attributes": {}
|
| 2272 |
}
|
| 2273 |
},
|
| 2274 |
-
"total_flos":
|
| 2275 |
"train_batch_size": 1,
|
| 2276 |
"trial_name": null,
|
| 2277 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": 750,
|
| 3 |
"best_metric": 0.5089643597602844,
|
| 4 |
"best_model_checkpoint": "./adapter-phase1/checkpoint-750",
|
| 5 |
+
"epoch": 3.84,
|
| 6 |
"eval_steps": 300,
|
| 7 |
+
"global_step": 2400,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2252 |
"eval_samples_per_second": 2.3,
|
| 2253 |
"eval_steps_per_second": 0.575,
|
| 2254 |
"step": 2100
|
| 2255 |
+
},
|
| 2256 |
+
{
|
| 2257 |
+
"entropy": 0.3517730229534209,
|
| 2258 |
+
"epoch": 3.376,
|
| 2259 |
+
"grad_norm": 0.6908054947853088,
|
| 2260 |
+
"learning_rate": 3.2544000000000006e-05,
|
| 2261 |
+
"loss": 0.3057,
|
| 2262 |
+
"mean_token_accuracy": 0.9103573642671108,
|
| 2263 |
+
"num_tokens": 266432.0,
|
| 2264 |
+
"step": 2110
|
| 2265 |
+
},
|
| 2266 |
+
{
|
| 2267 |
+
"entropy": 0.38618900515139104,
|
| 2268 |
+
"epoch": 3.392,
|
| 2269 |
+
"grad_norm": 0.9056383967399597,
|
| 2270 |
+
"learning_rate": 3.2224e-05,
|
| 2271 |
+
"loss": 0.3188,
|
| 2272 |
+
"mean_token_accuracy": 0.9076898027211427,
|
| 2273 |
+
"num_tokens": 282655.0,
|
| 2274 |
+
"step": 2120
|
| 2275 |
+
},
|
| 2276 |
+
{
|
| 2277 |
+
"entropy": 0.3537537831813097,
|
| 2278 |
+
"epoch": 3.408,
|
| 2279 |
+
"grad_norm": 0.48644715547561646,
|
| 2280 |
+
"learning_rate": 3.1904e-05,
|
| 2281 |
+
"loss": 0.2886,
|
| 2282 |
+
"mean_token_accuracy": 0.9162093725055456,
|
| 2283 |
+
"num_tokens": 310801.0,
|
| 2284 |
+
"step": 2130
|
| 2285 |
+
},
|
| 2286 |
+
{
|
| 2287 |
+
"entropy": 0.26729877749457953,
|
| 2288 |
+
"epoch": 3.424,
|
| 2289 |
+
"grad_norm": 0.6074755787849426,
|
| 2290 |
+
"learning_rate": 3.1584e-05,
|
| 2291 |
+
"loss": 0.2371,
|
| 2292 |
+
"mean_token_accuracy": 0.9263024788349867,
|
| 2293 |
+
"num_tokens": 343555.0,
|
| 2294 |
+
"step": 2140
|
| 2295 |
+
},
|
| 2296 |
+
{
|
| 2297 |
+
"entropy": 0.25955253606662154,
|
| 2298 |
+
"epoch": 3.44,
|
| 2299 |
+
"grad_norm": 0.8773949146270752,
|
| 2300 |
+
"learning_rate": 3.1264e-05,
|
| 2301 |
+
"loss": 0.2227,
|
| 2302 |
+
"mean_token_accuracy": 0.9337353933602571,
|
| 2303 |
+
"num_tokens": 369134.0,
|
| 2304 |
+
"step": 2150
|
| 2305 |
+
},
|
| 2306 |
+
{
|
| 2307 |
+
"entropy": 0.27338800597935914,
|
| 2308 |
+
"epoch": 3.456,
|
| 2309 |
+
"grad_norm": 0.7504522204399109,
|
| 2310 |
+
"learning_rate": 3.0975999999999996e-05,
|
| 2311 |
+
"loss": 0.2261,
|
| 2312 |
+
"mean_token_accuracy": 0.9332862004637719,
|
| 2313 |
+
"num_tokens": 390152.0,
|
| 2314 |
+
"step": 2160
|
| 2315 |
+
},
|
| 2316 |
+
{
|
| 2317 |
+
"entropy": 0.30181694105267526,
|
| 2318 |
+
"epoch": 3.472,
|
| 2319 |
+
"grad_norm": 0.8649200201034546,
|
| 2320 |
+
"learning_rate": 3.0656e-05,
|
| 2321 |
+
"loss": 0.2289,
|
| 2322 |
+
"mean_token_accuracy": 0.9334215141832829,
|
| 2323 |
+
"num_tokens": 406222.0,
|
| 2324 |
+
"step": 2170
|
| 2325 |
+
},
|
| 2326 |
+
{
|
| 2327 |
+
"entropy": 0.28406244921498003,
|
| 2328 |
+
"epoch": 3.488,
|
| 2329 |
+
"grad_norm": 1.9269925355911255,
|
| 2330 |
+
"learning_rate": 3.0336000000000002e-05,
|
| 2331 |
+
"loss": 0.2353,
|
| 2332 |
+
"mean_token_accuracy": 0.9303826864808797,
|
| 2333 |
+
"num_tokens": 434767.0,
|
| 2334 |
+
"step": 2180
|
| 2335 |
+
},
|
| 2336 |
+
{
|
| 2337 |
+
"entropy": 0.2358154426328838,
|
| 2338 |
+
"epoch": 3.504,
|
| 2339 |
+
"grad_norm": 0.7775760293006897,
|
| 2340 |
+
"learning_rate": 3.0016e-05,
|
| 2341 |
+
"loss": 0.2277,
|
| 2342 |
+
"mean_token_accuracy": 0.9293628957122564,
|
| 2343 |
+
"num_tokens": 467498.0,
|
| 2344 |
+
"step": 2190
|
| 2345 |
+
},
|
| 2346 |
+
{
|
| 2347 |
+
"entropy": 0.2596265008673072,
|
| 2348 |
+
"epoch": 3.52,
|
| 2349 |
+
"grad_norm": 0.7286163568496704,
|
| 2350 |
+
"learning_rate": 2.9696e-05,
|
| 2351 |
+
"loss": 0.2266,
|
| 2352 |
+
"mean_token_accuracy": 0.9321592267602682,
|
| 2353 |
+
"num_tokens": 493146.0,
|
| 2354 |
+
"step": 2200
|
| 2355 |
+
},
|
| 2356 |
+
{
|
| 2357 |
+
"entropy": 0.28550293026492,
|
| 2358 |
+
"epoch": 3.536,
|
| 2359 |
+
"grad_norm": 0.7693914175033569,
|
| 2360 |
+
"learning_rate": 2.9376000000000005e-05,
|
| 2361 |
+
"loss": 0.2291,
|
| 2362 |
+
"mean_token_accuracy": 0.9351058643311262,
|
| 2363 |
+
"num_tokens": 513926.0,
|
| 2364 |
+
"step": 2210
|
| 2365 |
+
},
|
| 2366 |
+
{
|
| 2367 |
+
"entropy": 0.2885140863247216,
|
| 2368 |
+
"epoch": 3.552,
|
| 2369 |
+
"grad_norm": 1.1927505731582642,
|
| 2370 |
+
"learning_rate": 2.9056e-05,
|
| 2371 |
+
"loss": 0.219,
|
| 2372 |
+
"mean_token_accuracy": 0.9396381825208664,
|
| 2373 |
+
"num_tokens": 530263.0,
|
| 2374 |
+
"step": 2220
|
| 2375 |
+
},
|
| 2376 |
+
{
|
| 2377 |
+
"entropy": 0.283741835039109,
|
| 2378 |
+
"epoch": 3.568,
|
| 2379 |
+
"grad_norm": 0.6537899971008301,
|
| 2380 |
+
"learning_rate": 2.8736e-05,
|
| 2381 |
+
"loss": 0.2324,
|
| 2382 |
+
"mean_token_accuracy": 0.9302929677069187,
|
| 2383 |
+
"num_tokens": 559791.0,
|
| 2384 |
+
"step": 2230
|
| 2385 |
+
},
|
| 2386 |
+
{
|
| 2387 |
+
"entropy": 0.2369093818590045,
|
| 2388 |
+
"epoch": 3.584,
|
| 2389 |
+
"grad_norm": 0.793480396270752,
|
| 2390 |
+
"learning_rate": 2.8416000000000004e-05,
|
| 2391 |
+
"loss": 0.2165,
|
| 2392 |
+
"mean_token_accuracy": 0.9320364937186241,
|
| 2393 |
+
"num_tokens": 592398.0,
|
| 2394 |
+
"step": 2240
|
| 2395 |
+
},
|
| 2396 |
+
{
|
| 2397 |
+
"entropy": 0.264733817987144,
|
| 2398 |
+
"epoch": 3.6,
|
| 2399 |
+
"grad_norm": 0.7945203185081482,
|
| 2400 |
+
"learning_rate": 2.8096e-05,
|
| 2401 |
+
"loss": 0.2337,
|
| 2402 |
+
"mean_token_accuracy": 0.9294226188212633,
|
| 2403 |
+
"num_tokens": 617982.0,
|
| 2404 |
+
"step": 2250
|
| 2405 |
+
},
|
| 2406 |
+
{
|
| 2407 |
+
"entropy": 0.2889886857941747,
|
| 2408 |
+
"epoch": 3.616,
|
| 2409 |
+
"grad_norm": 0.7558261752128601,
|
| 2410 |
+
"learning_rate": 2.7776000000000003e-05,
|
| 2411 |
+
"loss": 0.2305,
|
| 2412 |
+
"mean_token_accuracy": 0.9317790925502777,
|
| 2413 |
+
"num_tokens": 639115.0,
|
| 2414 |
+
"step": 2260
|
| 2415 |
+
},
|
| 2416 |
+
{
|
| 2417 |
+
"entropy": 0.28708559228107333,
|
| 2418 |
+
"epoch": 3.632,
|
| 2419 |
+
"grad_norm": 0.6877163648605347,
|
| 2420 |
+
"learning_rate": 2.7456000000000003e-05,
|
| 2421 |
+
"loss": 0.2215,
|
| 2422 |
+
"mean_token_accuracy": 0.9357377961277962,
|
| 2423 |
+
"num_tokens": 655709.0,
|
| 2424 |
+
"step": 2270
|
| 2425 |
+
},
|
| 2426 |
+
{
|
| 2427 |
+
"entropy": 0.28660596534609795,
|
| 2428 |
+
"epoch": 3.648,
|
| 2429 |
+
"grad_norm": 0.6599491238594055,
|
| 2430 |
+
"learning_rate": 2.7136e-05,
|
| 2431 |
+
"loss": 0.2363,
|
| 2432 |
+
"mean_token_accuracy": 0.928611570596695,
|
| 2433 |
+
"num_tokens": 684500.0,
|
| 2434 |
+
"step": 2280
|
| 2435 |
+
},
|
| 2436 |
+
{
|
| 2437 |
+
"entropy": 0.23836621949449183,
|
| 2438 |
+
"epoch": 3.664,
|
| 2439 |
+
"grad_norm": 0.7436323165893555,
|
| 2440 |
+
"learning_rate": 2.6816000000000002e-05,
|
| 2441 |
+
"loss": 0.2194,
|
| 2442 |
+
"mean_token_accuracy": 0.9314162913709879,
|
| 2443 |
+
"num_tokens": 717271.0,
|
| 2444 |
+
"step": 2290
|
| 2445 |
+
},
|
| 2446 |
+
{
|
| 2447 |
+
"entropy": 0.27099227644503115,
|
| 2448 |
+
"epoch": 3.68,
|
| 2449 |
+
"grad_norm": 0.7519745826721191,
|
| 2450 |
+
"learning_rate": 2.6496e-05,
|
| 2451 |
+
"loss": 0.2369,
|
| 2452 |
+
"mean_token_accuracy": 0.9278060872107744,
|
| 2453 |
+
"num_tokens": 743068.0,
|
| 2454 |
+
"step": 2300
|
| 2455 |
+
},
|
| 2456 |
+
{
|
| 2457 |
+
"entropy": 0.282380092702806,
|
| 2458 |
+
"epoch": 3.6959999999999997,
|
| 2459 |
+
"grad_norm": 0.7645207643508911,
|
| 2460 |
+
"learning_rate": 2.6176e-05,
|
| 2461 |
+
"loss": 0.2175,
|
| 2462 |
+
"mean_token_accuracy": 0.9372334524989128,
|
| 2463 |
+
"num_tokens": 763925.0,
|
| 2464 |
+
"step": 2310
|
| 2465 |
+
},
|
| 2466 |
+
{
|
| 2467 |
+
"entropy": 0.2850790939293802,
|
| 2468 |
+
"epoch": 3.7119999999999997,
|
| 2469 |
+
"grad_norm": 0.9016556143760681,
|
| 2470 |
+
"learning_rate": 2.5856e-05,
|
| 2471 |
+
"loss": 0.217,
|
| 2472 |
+
"mean_token_accuracy": 0.9392455574125051,
|
| 2473 |
+
"num_tokens": 780111.0,
|
| 2474 |
+
"step": 2320
|
| 2475 |
+
},
|
| 2476 |
+
{
|
| 2477 |
+
"entropy": 0.2691464308649302,
|
| 2478 |
+
"epoch": 3.7279999999999998,
|
| 2479 |
+
"grad_norm": 0.77091383934021,
|
| 2480 |
+
"learning_rate": 2.5535999999999997e-05,
|
| 2481 |
+
"loss": 0.2334,
|
| 2482 |
+
"mean_token_accuracy": 0.929338139295578,
|
| 2483 |
+
"num_tokens": 808661.0,
|
| 2484 |
+
"step": 2330
|
| 2485 |
+
},
|
| 2486 |
+
{
|
| 2487 |
+
"entropy": 0.2395469973795116,
|
| 2488 |
+
"epoch": 3.7439999999999998,
|
| 2489 |
+
"grad_norm": 0.7632396221160889,
|
| 2490 |
+
"learning_rate": 2.5216e-05,
|
| 2491 |
+
"loss": 0.2148,
|
| 2492 |
+
"mean_token_accuracy": 0.9322273649275303,
|
| 2493 |
+
"num_tokens": 840932.0,
|
| 2494 |
+
"step": 2340
|
| 2495 |
+
},
|
| 2496 |
+
{
|
| 2497 |
+
"entropy": 0.2645680231973529,
|
| 2498 |
+
"epoch": 3.76,
|
| 2499 |
+
"grad_norm": 0.819273054599762,
|
| 2500 |
+
"learning_rate": 2.4896e-05,
|
| 2501 |
+
"loss": 0.226,
|
| 2502 |
+
"mean_token_accuracy": 0.930556321516633,
|
| 2503 |
+
"num_tokens": 866564.0,
|
| 2504 |
+
"step": 2350
|
| 2505 |
+
},
|
| 2506 |
+
{
|
| 2507 |
+
"entropy": 0.2808503101579845,
|
| 2508 |
+
"epoch": 3.776,
|
| 2509 |
+
"grad_norm": 0.8598120808601379,
|
| 2510 |
+
"learning_rate": 2.4576000000000003e-05,
|
| 2511 |
+
"loss": 0.2215,
|
| 2512 |
+
"mean_token_accuracy": 0.9356644533574581,
|
| 2513 |
+
"num_tokens": 887527.0,
|
| 2514 |
+
"step": 2360
|
| 2515 |
+
},
|
| 2516 |
+
{
|
| 2517 |
+
"entropy": 0.28694011168554423,
|
| 2518 |
+
"epoch": 3.792,
|
| 2519 |
+
"grad_norm": 1.0404748916625977,
|
| 2520 |
+
"learning_rate": 2.4256e-05,
|
| 2521 |
+
"loss": 0.214,
|
| 2522 |
+
"mean_token_accuracy": 0.9388030290603637,
|
| 2523 |
+
"num_tokens": 903688.0,
|
| 2524 |
+
"step": 2370
|
| 2525 |
+
},
|
| 2526 |
+
{
|
| 2527 |
+
"entropy": 0.2774578414391726,
|
| 2528 |
+
"epoch": 3.808,
|
| 2529 |
+
"grad_norm": 1.2308194637298584,
|
| 2530 |
+
"learning_rate": 2.3936e-05,
|
| 2531 |
+
"loss": 0.2328,
|
| 2532 |
+
"mean_token_accuracy": 0.929581755027175,
|
| 2533 |
+
"num_tokens": 932975.0,
|
| 2534 |
+
"step": 2380
|
| 2535 |
+
},
|
| 2536 |
+
{
|
| 2537 |
+
"entropy": 0.2381771973334253,
|
| 2538 |
+
"epoch": 3.824,
|
| 2539 |
+
"grad_norm": 0.7983541488647461,
|
| 2540 |
+
"learning_rate": 2.3616000000000002e-05,
|
| 2541 |
+
"loss": 0.2177,
|
| 2542 |
+
"mean_token_accuracy": 0.9316004611551761,
|
| 2543 |
+
"num_tokens": 965221.0,
|
| 2544 |
+
"step": 2390
|
| 2545 |
+
},
|
| 2546 |
+
{
|
| 2547 |
+
"entropy": 0.2579630766995251,
|
| 2548 |
+
"epoch": 3.84,
|
| 2549 |
+
"grad_norm": 0.8867554068565369,
|
| 2550 |
+
"learning_rate": 2.3296000000000002e-05,
|
| 2551 |
+
"loss": 0.2221,
|
| 2552 |
+
"mean_token_accuracy": 0.9320516049861908,
|
| 2553 |
+
"num_tokens": 990859.0,
|
| 2554 |
+
"step": 2400
|
| 2555 |
+
},
|
| 2556 |
+
{
|
| 2557 |
+
"epoch": 3.84,
|
| 2558 |
+
"eval_accuracy": 0.02676376698545462,
|
| 2559 |
+
"eval_entropy": 0.3534155045747757,
|
| 2560 |
+
"eval_loss": 0.6058897972106934,
|
| 2561 |
+
"eval_mean_token_accuracy": 0.8553497910499572,
|
| 2562 |
+
"eval_num_tokens": 990859.0,
|
| 2563 |
+
"eval_runtime": 869.2088,
|
| 2564 |
+
"eval_samples_per_second": 2.301,
|
| 2565 |
+
"eval_steps_per_second": 0.575,
|
| 2566 |
+
"step": 2400
|
| 2567 |
}
|
| 2568 |
],
|
| 2569 |
"logging_steps": 10,
|
|
|
|
| 2583 |
"attributes": {}
|
| 2584 |
}
|
| 2585 |
},
|
| 2586 |
+
"total_flos": 4.143800723056128e+17,
|
| 2587 |
"train_batch_size": 1,
|
| 2588 |
"trial_name": null,
|
| 2589 |
"trial_params": null
|