{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 785, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032, "grad_norm": 13.448105258759517, "learning_rate": 2.0253164556962026e-06, "loss": 1.2075, "loss_nan_ranks": 0, "loss_rank_avg": 0.31800004839897156, "step": 5, "valid_targets_mean": 748.6, "valid_targets_min": 403 }, { "epoch": 0.064, "grad_norm": 9.982424737572133, "learning_rate": 4.556962025316456e-06, "loss": 1.1769, "loss_nan_ranks": 0, "loss_rank_avg": 0.3006700873374939, "step": 10, "valid_targets_mean": 714.3, "valid_targets_min": 468 }, { "epoch": 0.096, "grad_norm": 3.8464182382921748, "learning_rate": 7.08860759493671e-06, "loss": 1.0706, "loss_nan_ranks": 0, "loss_rank_avg": 0.2761875092983246, "step": 15, "valid_targets_mean": 833.6, "valid_targets_min": 428 }, { "epoch": 0.128, "grad_norm": 2.1620562822697003, "learning_rate": 9.620253164556963e-06, "loss": 1.0013, "loss_nan_ranks": 0, "loss_rank_avg": 0.25137242674827576, "step": 20, "valid_targets_mean": 814.9, "valid_targets_min": 392 }, { "epoch": 0.16, "grad_norm": 1.6337626383709007, "learning_rate": 1.2151898734177216e-05, "loss": 0.9265, "loss_nan_ranks": 0, "loss_rank_avg": 0.2275439351797104, "step": 25, "valid_targets_mean": 728.6, "valid_targets_min": 416 }, { "epoch": 0.192, "grad_norm": 1.9026845295535284, "learning_rate": 1.468354430379747e-05, "loss": 0.8832, "loss_nan_ranks": 0, "loss_rank_avg": 0.20272783935070038, "step": 30, "valid_targets_mean": 751.1, "valid_targets_min": 439 }, { "epoch": 0.224, "grad_norm": 1.073091888557403, "learning_rate": 1.7215189873417723e-05, "loss": 0.8224, "loss_nan_ranks": 0, "loss_rank_avg": 0.21370205283164978, "step": 35, "valid_targets_mean": 786.2, "valid_targets_min": 463 }, { "epoch": 0.256, "grad_norm": 0.8617650331409201, "learning_rate": 1.974683544303798e-05, "loss": 0.8024, "loss_nan_ranks": 0, "loss_rank_avg": 0.1904214322566986, "step": 40, "valid_targets_mean": 843.4, "valid_targets_min": 359 }, { "epoch": 0.288, "grad_norm": 0.759980239581234, "learning_rate": 2.2278481012658228e-05, "loss": 0.7507, "loss_nan_ranks": 0, "loss_rank_avg": 0.15183305740356445, "step": 45, "valid_targets_mean": 724.6, "valid_targets_min": 343 }, { "epoch": 0.32, "grad_norm": 0.7463412043901649, "learning_rate": 2.481012658227848e-05, "loss": 0.736, "loss_nan_ranks": 0, "loss_rank_avg": 0.1823093295097351, "step": 50, "valid_targets_mean": 826.6, "valid_targets_min": 562 }, { "epoch": 0.352, "grad_norm": 0.6886141045868305, "learning_rate": 2.7341772151898737e-05, "loss": 0.7099, "loss_nan_ranks": 0, "loss_rank_avg": 0.17409449815750122, "step": 55, "valid_targets_mean": 738.2, "valid_targets_min": 387 }, { "epoch": 0.384, "grad_norm": 0.6723991947723206, "learning_rate": 2.987341772151899e-05, "loss": 0.6817, "loss_nan_ranks": 0, "loss_rank_avg": 0.1746065616607666, "step": 60, "valid_targets_mean": 817.4, "valid_targets_min": 503 }, { "epoch": 0.416, "grad_norm": 0.7219762987855783, "learning_rate": 3.240506329113924e-05, "loss": 0.7042, "loss_nan_ranks": 0, "loss_rank_avg": 0.15542849898338318, "step": 65, "valid_targets_mean": 631.1, "valid_targets_min": 418 }, { "epoch": 0.448, "grad_norm": 0.711945449946034, "learning_rate": 3.49367088607595e-05, "loss": 0.6683, "loss_nan_ranks": 0, "loss_rank_avg": 0.1588052213191986, "step": 70, "valid_targets_mean": 717.1, "valid_targets_min": 483 }, { "epoch": 0.48, "grad_norm": 0.7723131517405141, "learning_rate": 3.746835443037975e-05, "loss": 0.6602, "loss_nan_ranks": 0, "loss_rank_avg": 0.16386288404464722, "step": 75, "valid_targets_mean": 850.6, "valid_targets_min": 252 }, { "epoch": 0.512, "grad_norm": 0.7455251121043946, "learning_rate": 4e-05, "loss": 0.6492, "loss_nan_ranks": 0, "loss_rank_avg": 0.16289247572422028, "step": 80, "valid_targets_mean": 734.2, "valid_targets_min": 385 }, { "epoch": 0.544, "grad_norm": 0.6806696789899241, "learning_rate": 3.999504991751045e-05, "loss": 0.6491, "loss_nan_ranks": 0, "loss_rank_avg": 0.14095750451087952, "step": 85, "valid_targets_mean": 764.7, "valid_targets_min": 487 }, { "epoch": 0.576, "grad_norm": 0.6997408572409954, "learning_rate": 3.9980202120373464e-05, "loss": 0.6471, "loss_nan_ranks": 0, "loss_rank_avg": 0.15331107378005981, "step": 90, "valid_targets_mean": 785.2, "valid_targets_min": 492 }, { "epoch": 0.608, "grad_norm": 0.7180343376382498, "learning_rate": 3.995546395837111e-05, "loss": 0.6481, "loss_nan_ranks": 0, "loss_rank_avg": 0.17766906321048737, "step": 95, "valid_targets_mean": 781.8, "valid_targets_min": 397 }, { "epoch": 0.64, "grad_norm": 0.7022835185401193, "learning_rate": 3.992084767709763e-05, "loss": 0.6207, "loss_nan_ranks": 0, "loss_rank_avg": 0.17354968190193176, "step": 100, "valid_targets_mean": 836.8, "valid_targets_min": 418 }, { "epoch": 0.672, "grad_norm": 0.7367281503709805, "learning_rate": 3.987637041189781e-05, "loss": 0.6272, "loss_nan_ranks": 0, "loss_rank_avg": 0.162840336561203, "step": 105, "valid_targets_mean": 837.1, "valid_targets_min": 434 }, { "epoch": 0.704, "grad_norm": 0.6596850958225633, "learning_rate": 3.982205417938482e-05, "loss": 0.6176, "loss_nan_ranks": 0, "loss_rank_avg": 0.14747610688209534, "step": 110, "valid_targets_mean": 715.6, "valid_targets_min": 485 }, { "epoch": 0.736, "grad_norm": 0.6740339776230146, "learning_rate": 3.975792586654179e-05, "loss": 0.6183, "loss_nan_ranks": 0, "loss_rank_avg": 0.15198436379432678, "step": 115, "valid_targets_mean": 681.2, "valid_targets_min": 400 }, { "epoch": 0.768, "grad_norm": 0.6279891719181242, "learning_rate": 3.968401721741259e-05, "loss": 0.6138, "loss_nan_ranks": 0, "loss_rank_avg": 0.1494234800338745, "step": 120, "valid_targets_mean": 762.9, "valid_targets_min": 313 }, { "epoch": 0.8, "grad_norm": 0.6762014593224054, "learning_rate": 3.960036481738819e-05, "loss": 0.6152, "loss_nan_ranks": 0, "loss_rank_avg": 0.14409850537776947, "step": 125, "valid_targets_mean": 826.2, "valid_targets_min": 432 }, { "epoch": 0.832, "grad_norm": 0.6951704276499515, "learning_rate": 3.950701007509667e-05, "loss": 0.6211, "loss_nan_ranks": 0, "loss_rank_avg": 0.1555303931236267, "step": 130, "valid_targets_mean": 755.7, "valid_targets_min": 496 }, { "epoch": 0.864, "grad_norm": 0.6296778796453734, "learning_rate": 3.940399920190552e-05, "loss": 0.6129, "loss_nan_ranks": 0, "loss_rank_avg": 0.1367054283618927, "step": 135, "valid_targets_mean": 662.6, "valid_targets_min": 328 }, { "epoch": 0.896, "grad_norm": 0.6783518650883749, "learning_rate": 3.92913831890467e-05, "loss": 0.5886, "loss_nan_ranks": 0, "loss_rank_avg": 0.15633609890937805, "step": 140, "valid_targets_mean": 727.8, "valid_targets_min": 507 }, { "epoch": 0.928, "grad_norm": 0.708146207975277, "learning_rate": 3.916921778237556e-05, "loss": 0.5948, "loss_nan_ranks": 0, "loss_rank_avg": 0.16752688586711884, "step": 145, "valid_targets_mean": 762.7, "valid_targets_min": 541 }, { "epoch": 0.96, "grad_norm": 0.6593931091646447, "learning_rate": 3.903756345477612e-05, "loss": 0.5933, "loss_nan_ranks": 0, "loss_rank_avg": 0.164844810962677, "step": 150, "valid_targets_mean": 851.9, "valid_targets_min": 389 }, { "epoch": 0.992, "grad_norm": 0.6510250678307847, "learning_rate": 3.889648537622657e-05, "loss": 0.5897, "loss_nan_ranks": 0, "loss_rank_avg": 0.1499878615140915, "step": 155, "valid_targets_mean": 824.8, "valid_targets_min": 424 }, { "epoch": 1.0192, "grad_norm": 0.6781043016492474, "learning_rate": 3.874605338153952e-05, "loss": 0.5918, "loss_nan_ranks": 0, "loss_rank_avg": 0.1404564082622528, "step": 160, "valid_targets_mean": 745.8, "valid_targets_min": 381 }, { "epoch": 1.0512, "grad_norm": 0.6508626538105656, "learning_rate": 3.8586341935793265e-05, "loss": 0.5805, "loss_nan_ranks": 0, "loss_rank_avg": 0.14573101699352264, "step": 165, "valid_targets_mean": 690.4, "valid_targets_min": 426 }, { "epoch": 1.0832, "grad_norm": 0.6313686061473457, "learning_rate": 3.841743009747089e-05, "loss": 0.5801, "loss_nan_ranks": 0, "loss_rank_avg": 0.13861840963363647, "step": 170, "valid_targets_mean": 763.2, "valid_targets_min": 457 }, { "epoch": 1.1152, "grad_norm": 0.6718637932023319, "learning_rate": 3.8239401479325714e-05, "loss": 0.5727, "loss_nan_ranks": 0, "loss_rank_avg": 0.13479197025299072, "step": 175, "valid_targets_mean": 740.8, "valid_targets_min": 325 }, { "epoch": 1.1472, "grad_norm": 0.6534534284351228, "learning_rate": 3.8052344206992276e-05, "loss": 0.5688, "loss_nan_ranks": 0, "loss_rank_avg": 0.11501454561948776, "step": 180, "valid_targets_mean": 669.1, "valid_targets_min": 361 }, { "epoch": 1.1792, "grad_norm": 0.6298087553175363, "learning_rate": 3.7856350875363396e-05, "loss": 0.5533, "loss_nan_ranks": 0, "loss_rank_avg": 0.12853585183620453, "step": 185, "valid_targets_mean": 759.6, "valid_targets_min": 433 }, { "epoch": 1.2112, "grad_norm": 0.6629648321914637, "learning_rate": 3.765151850275497e-05, "loss": 0.5718, "loss_nan_ranks": 0, "loss_rank_avg": 0.13259121775627136, "step": 190, "valid_targets_mean": 673.8, "valid_targets_min": 428 }, { "epoch": 1.2432, "grad_norm": 0.6446910197966146, "learning_rate": 3.7437948482881104e-05, "loss": 0.5721, "loss_nan_ranks": 0, "loss_rank_avg": 0.1383952796459198, "step": 195, "valid_targets_mean": 777.4, "valid_targets_min": 559 }, { "epoch": 1.2752, "grad_norm": 0.6715593999996728, "learning_rate": 3.721574653466336e-05, "loss": 0.5767, "loss_nan_ranks": 0, "loss_rank_avg": 0.13751399517059326, "step": 200, "valid_targets_mean": 690.3, "valid_targets_min": 362 }, { "epoch": 1.3072, "grad_norm": 0.6634970200747077, "learning_rate": 3.698502264989903e-05, "loss": 0.5749, "loss_nan_ranks": 0, "loss_rank_avg": 0.14345307648181915, "step": 205, "valid_targets_mean": 795.1, "valid_targets_min": 436 }, { "epoch": 1.3392, "grad_norm": 0.6852785997622871, "learning_rate": 3.674589103881432e-05, "loss": 0.5757, "loss_nan_ranks": 0, "loss_rank_avg": 0.16555346548557281, "step": 210, "valid_targets_mean": 953.1, "valid_targets_min": 495 }, { "epoch": 1.3712, "grad_norm": 0.7182921119480419, "learning_rate": 3.64984700735293e-05, "loss": 0.5662, "loss_nan_ranks": 0, "loss_rank_avg": 0.13560381531715393, "step": 215, "valid_targets_mean": 657.9, "valid_targets_min": 380 }, { "epoch": 1.4032, "grad_norm": 0.6383915774320499, "learning_rate": 3.624288222946273e-05, "loss": 0.5598, "loss_nan_ranks": 0, "loss_rank_avg": 0.1385164111852646, "step": 220, "valid_targets_mean": 883.5, "valid_targets_min": 351 }, { "epoch": 1.4352, "grad_norm": 0.6197700397838696, "learning_rate": 3.597925402470578e-05, "loss": 0.5645, "loss_nan_ranks": 0, "loss_rank_avg": 0.12843455374240875, "step": 225, "valid_targets_mean": 750.9, "valid_targets_min": 434 }, { "epoch": 1.4672, "grad_norm": 0.5711420210151498, "learning_rate": 3.570771595739445e-05, "loss": 0.5598, "loss_nan_ranks": 0, "loss_rank_avg": 0.14530006051063538, "step": 230, "valid_targets_mean": 942.8, "valid_targets_min": 528 }, { "epoch": 1.4992, "grad_norm": 0.6210204975257504, "learning_rate": 3.5428402441111964e-05, "loss": 0.5593, "loss_nan_ranks": 0, "loss_rank_avg": 0.1555243879556656, "step": 235, "valid_targets_mean": 887.2, "valid_targets_min": 537 }, { "epoch": 1.5312000000000001, "grad_norm": 0.6647791326711432, "learning_rate": 3.5141451738352936e-05, "loss": 0.5609, "loss_nan_ranks": 0, "loss_rank_avg": 0.13146094977855682, "step": 240, "valid_targets_mean": 744.5, "valid_targets_min": 461 }, { "epoch": 1.5632000000000001, "grad_norm": 0.6693133474516781, "learning_rate": 3.4847005892082266e-05, "loss": 0.577, "loss_nan_ranks": 0, "loss_rank_avg": 0.1364261358976364, "step": 245, "valid_targets_mean": 682.8, "valid_targets_min": 444 }, { "epoch": 1.5952, "grad_norm": 0.7006371063676372, "learning_rate": 3.454521065542273e-05, "loss": 0.5673, "loss_nan_ranks": 0, "loss_rank_avg": 0.13232733309268951, "step": 250, "valid_targets_mean": 724.2, "valid_targets_min": 468 }, { "epoch": 1.6272, "grad_norm": 0.655773431685838, "learning_rate": 3.423621541950597e-05, "loss": 0.561, "loss_nan_ranks": 0, "loss_rank_avg": 0.13282510638237, "step": 255, "valid_targets_mean": 766.9, "valid_targets_min": 402 }, { "epoch": 1.6592, "grad_norm": 0.6172192241649032, "learning_rate": 3.3920173139522664e-05, "loss": 0.5643, "loss_nan_ranks": 0, "loss_rank_avg": 0.13580721616744995, "step": 260, "valid_targets_mean": 764.2, "valid_targets_min": 400 }, { "epoch": 1.6912, "grad_norm": 0.6628955705415062, "learning_rate": 3.35972402590084e-05, "loss": 0.5581, "loss_nan_ranks": 0, "loss_rank_avg": 0.1439439356327057, "step": 265, "valid_targets_mean": 715.3, "valid_targets_min": 309 }, { "epoch": 1.7231999999999998, "grad_norm": 0.641312579448422, "learning_rate": 3.326757663240291e-05, "loss": 0.5624, "loss_nan_ranks": 0, "loss_rank_avg": 0.1477782130241394, "step": 270, "valid_targets_mean": 739.2, "valid_targets_min": 472 }, { "epoch": 1.7551999999999999, "grad_norm": 0.6833918688567646, "learning_rate": 3.293134544592073e-05, "loss": 0.5578, "loss_nan_ranks": 0, "loss_rank_avg": 0.15322580933570862, "step": 275, "valid_targets_mean": 752.0, "valid_targets_min": 463 }, { "epoch": 1.7872, "grad_norm": 0.640918224676768, "learning_rate": 3.258871313677274e-05, "loss": 0.5599, "loss_nan_ranks": 0, "loss_rank_avg": 0.13823790848255157, "step": 280, "valid_targets_mean": 789.4, "valid_targets_min": 321 }, { "epoch": 1.8192, "grad_norm": 0.6654266115883691, "learning_rate": 3.2239849310778316e-05, "loss": 0.5485, "loss_nan_ranks": 0, "loss_rank_avg": 0.14035166800022125, "step": 285, "valid_targets_mean": 807.4, "valid_targets_min": 448 }, { "epoch": 1.8512, "grad_norm": 0.6123014675225724, "learning_rate": 3.188492665840909e-05, "loss": 0.5557, "loss_nan_ranks": 0, "loss_rank_avg": 0.12820711731910706, "step": 290, "valid_targets_mean": 850.4, "valid_targets_min": 424 }, { "epoch": 1.8832, "grad_norm": 0.6754595695415974, "learning_rate": 3.1524120869305726e-05, "loss": 0.5627, "loss_nan_ranks": 0, "loss_rank_avg": 0.15113355219364166, "step": 295, "valid_targets_mean": 696.0, "valid_targets_min": 472 }, { "epoch": 1.9152, "grad_norm": 0.6578705510880959, "learning_rate": 3.11576105453101e-05, "loss": 0.5519, "loss_nan_ranks": 0, "loss_rank_avg": 0.15135137736797333, "step": 300, "valid_targets_mean": 767.6, "valid_targets_min": 517 }, { "epoch": 1.9472, "grad_norm": 0.6449455299555445, "learning_rate": 3.0785577112055916e-05, "loss": 0.548, "loss_nan_ranks": 0, "loss_rank_avg": 0.14119261503219604, "step": 305, "valid_targets_mean": 762.3, "valid_targets_min": 434 }, { "epoch": 1.9792, "grad_norm": 0.6408490242266223, "learning_rate": 3.040820472916153e-05, "loss": 0.5456, "loss_nan_ranks": 0, "loss_rank_avg": 0.12257601320743561, "step": 310, "valid_targets_mean": 696.9, "valid_targets_min": 300 }, { "epoch": 2.0064, "grad_norm": 0.7541041516650534, "learning_rate": 3.002568019906939e-05, "loss": 0.5427, "loss_nan_ranks": 0, "loss_rank_avg": 0.15626591444015503, "step": 315, "valid_targets_mean": 719.5, "valid_targets_min": 396 }, { "epoch": 2.0384, "grad_norm": 0.665090463756649, "learning_rate": 2.963819287457733e-05, "loss": 0.5352, "loss_nan_ranks": 0, "loss_rank_avg": 0.13162535429000854, "step": 320, "valid_targets_mean": 670.2, "valid_targets_min": 440 }, { "epoch": 2.0704, "grad_norm": 0.6366823920807368, "learning_rate": 2.924593456510733e-05, "loss": 0.5321, "loss_nan_ranks": 0, "loss_rank_avg": 0.13383950293064117, "step": 325, "valid_targets_mean": 810.5, "valid_targets_min": 394 }, { "epoch": 2.1024, "grad_norm": 0.6213548819315227, "learning_rate": 2.8849099441758306e-05, "loss": 0.5265, "loss_nan_ranks": 0, "loss_rank_avg": 0.10977214574813843, "step": 330, "valid_targets_mean": 728.4, "valid_targets_min": 429 }, { "epoch": 2.1344, "grad_norm": 0.6547366858654076, "learning_rate": 2.844788394118979e-05, "loss": 0.5254, "loss_nan_ranks": 0, "loss_rank_avg": 0.1424219012260437, "step": 335, "valid_targets_mean": 800.0, "valid_targets_min": 452 }, { "epoch": 2.1664, "grad_norm": 0.6619785188617626, "learning_rate": 2.8042486668384164e-05, "loss": 0.5306, "loss_nan_ranks": 0, "loss_rank_avg": 0.12859517335891724, "step": 340, "valid_targets_mean": 695.9, "valid_targets_min": 435 }, { "epoch": 2.1984, "grad_norm": 0.687101029751781, "learning_rate": 2.7633108298335582e-05, "loss": 0.5184, "loss_nan_ranks": 0, "loss_rank_avg": 0.12573498487472534, "step": 345, "valid_targets_mean": 777.2, "valid_targets_min": 532 }, { "epoch": 2.2304, "grad_norm": 0.645409223244431, "learning_rate": 2.721995147671416e-05, "loss": 0.5215, "loss_nan_ranks": 0, "loss_rank_avg": 0.118968665599823, "step": 350, "valid_targets_mean": 712.1, "valid_targets_min": 426 }, { "epoch": 2.2624, "grad_norm": 0.671387281197927, "learning_rate": 2.68032207195547e-05, "loss": 0.5203, "loss_nan_ranks": 0, "loss_rank_avg": 0.12927383184432983, "step": 355, "valid_targets_mean": 742.2, "valid_targets_min": 464 }, { "epoch": 2.2944, "grad_norm": 0.7041426331177664, "learning_rate": 2.6383122312019604e-05, "loss": 0.5165, "loss_nan_ranks": 0, "loss_rank_avg": 0.13757221400737762, "step": 360, "valid_targets_mean": 762.4, "valid_targets_min": 397 }, { "epoch": 2.3264, "grad_norm": 0.5959539114433047, "learning_rate": 2.595986420628597e-05, "loss": 0.5197, "loss_nan_ranks": 0, "loss_rank_avg": 0.1234401986002922, "step": 365, "valid_targets_mean": 834.3, "valid_targets_min": 459 }, { "epoch": 2.3584, "grad_norm": 0.6531185712082775, "learning_rate": 2.5533655918607573e-05, "loss": 0.5098, "loss_nan_ranks": 0, "loss_rank_avg": 0.1260676383972168, "step": 370, "valid_targets_mean": 682.6, "valid_targets_min": 395 }, { "epoch": 2.3904, "grad_norm": 0.6634208938956202, "learning_rate": 2.510470842560259e-05, "loss": 0.533, "loss_nan_ranks": 0, "loss_rank_avg": 0.1401265263557434, "step": 375, "valid_targets_mean": 758.4, "valid_targets_min": 325 }, { "epoch": 2.4224, "grad_norm": 0.6727520164532284, "learning_rate": 2.467323405981841e-05, "loss": 0.5271, "loss_nan_ranks": 0, "loss_rank_avg": 0.13311098515987396, "step": 380, "valid_targets_mean": 742.8, "valid_targets_min": 454 }, { "epoch": 2.4544, "grad_norm": 0.6427726949014289, "learning_rate": 2.423944640462533e-05, "loss": 0.5202, "loss_nan_ranks": 0, "loss_rank_avg": 0.13644330203533173, "step": 385, "valid_targets_mean": 789.9, "valid_targets_min": 428 }, { "epoch": 2.4864, "grad_norm": 0.6840033417316511, "learning_rate": 2.3803560188490968e-05, "loss": 0.5308, "loss_nan_ranks": 0, "loss_rank_avg": 0.12673068046569824, "step": 390, "valid_targets_mean": 702.9, "valid_targets_min": 420 }, { "epoch": 2.5183999999999997, "grad_norm": 0.6563061162401826, "learning_rate": 2.336579117868789e-05, "loss": 0.5297, "loss_nan_ranks": 0, "loss_rank_avg": 0.1355002373456955, "step": 395, "valid_targets_mean": 729.9, "valid_targets_min": 526 }, { "epoch": 2.5504, "grad_norm": 0.641354939892573, "learning_rate": 2.292635607448711e-05, "loss": 0.5177, "loss_nan_ranks": 0, "loss_rank_avg": 0.12141138315200806, "step": 400, "valid_targets_mean": 733.4, "valid_targets_min": 316 }, { "epoch": 2.5824, "grad_norm": 0.8906863552430406, "learning_rate": 2.248547239989008e-05, "loss": 0.5165, "loss_nan_ranks": 0, "loss_rank_avg": 0.12363258004188538, "step": 405, "valid_targets_mean": 859.4, "valid_targets_min": 384 }, { "epoch": 2.6144, "grad_norm": 0.676141962708069, "learning_rate": 2.204335839595255e-05, "loss": 0.5358, "loss_nan_ranks": 0, "loss_rank_avg": 0.14006322622299194, "step": 410, "valid_targets_mean": 766.3, "valid_targets_min": 300 }, { "epoch": 2.6464, "grad_norm": 0.615596066669323, "learning_rate": 2.1600232912753452e-05, "loss": 0.5215, "loss_nan_ranks": 0, "loss_rank_avg": 0.12042121589183807, "step": 415, "valid_targets_mean": 843.1, "valid_targets_min": 342 }, { "epoch": 2.6784, "grad_norm": 0.615943796645789, "learning_rate": 2.1156315301062293e-05, "loss": 0.5144, "loss_nan_ranks": 0, "loss_rank_avg": 0.1382143646478653, "step": 420, "valid_targets_mean": 864.3, "valid_targets_min": 464 }, { "epoch": 2.7104, "grad_norm": 1.1772547291108486, "learning_rate": 2.0711825303758712e-05, "loss": 0.535, "loss_nan_ranks": 0, "loss_rank_avg": 0.1458306610584259, "step": 425, "valid_targets_mean": 898.8, "valid_targets_min": 554 }, { "epoch": 2.7424, "grad_norm": 0.6393166323484019, "learning_rate": 2.0266982947057962e-05, "loss": 0.5235, "loss_nan_ranks": 0, "loss_rank_avg": 0.13645440340042114, "step": 430, "valid_targets_mean": 740.2, "valid_targets_min": 391 }, { "epoch": 2.7744, "grad_norm": 0.6786636785060219, "learning_rate": 1.9822008431596083e-05, "loss": 0.5337, "loss_nan_ranks": 0, "loss_rank_avg": 0.14586326479911804, "step": 435, "valid_targets_mean": 734.4, "valid_targets_min": 313 }, { "epoch": 2.8064, "grad_norm": 0.6138448927187343, "learning_rate": 1.937712202342881e-05, "loss": 0.5094, "loss_nan_ranks": 0, "loss_rank_avg": 0.14957866072654724, "step": 440, "valid_targets_mean": 983.4, "valid_targets_min": 476 }, { "epoch": 2.8384, "grad_norm": 0.6115317247558727, "learning_rate": 1.8932543944998037e-05, "loss": 0.5215, "loss_nan_ranks": 0, "loss_rank_avg": 0.13156554102897644, "step": 445, "valid_targets_mean": 756.7, "valid_targets_min": 509 }, { "epoch": 2.8704, "grad_norm": 0.6480026309160363, "learning_rate": 1.8488494266119877e-05, "loss": 0.5164, "loss_nan_ranks": 0, "loss_rank_avg": 0.13125823438167572, "step": 450, "valid_targets_mean": 749.1, "valid_targets_min": 384 }, { "epoch": 2.9024, "grad_norm": 0.6221700405662001, "learning_rate": 1.804519279504834e-05, "loss": 0.5289, "loss_nan_ranks": 0, "loss_rank_avg": 0.1366758495569229, "step": 455, "valid_targets_mean": 716.8, "valid_targets_min": 372 }, { "epoch": 2.9344, "grad_norm": 0.6835135386375543, "learning_rate": 1.7602858969668365e-05, "loss": 0.5423, "loss_nan_ranks": 0, "loss_rank_avg": 0.1429317593574524, "step": 460, "valid_targets_mean": 738.8, "valid_targets_min": 370 }, { "epoch": 2.9664, "grad_norm": 0.6476761584366846, "learning_rate": 1.716171174887231e-05, "loss": 0.5177, "loss_nan_ranks": 0, "loss_rank_avg": 0.12608763575553894, "step": 465, "valid_targets_mean": 771.1, "valid_targets_min": 577 }, { "epoch": 2.9984, "grad_norm": 0.6705239133879689, "learning_rate": 1.6721969504173484e-05, "loss": 0.5304, "loss_nan_ranks": 0, "loss_rank_avg": 0.156068354845047, "step": 470, "valid_targets_mean": 891.4, "valid_targets_min": 436 }, { "epoch": 3.0256, "grad_norm": 0.6254648310969626, "learning_rate": 1.628384991161041e-05, "loss": 0.4947, "loss_nan_ranks": 0, "loss_rank_avg": 0.12908436357975006, "step": 475, "valid_targets_mean": 856.1, "valid_targets_min": 468 }, { "epoch": 3.0576, "grad_norm": 0.6835751653097379, "learning_rate": 1.5847569843995452e-05, "loss": 0.5142, "loss_nan_ranks": 0, "loss_rank_avg": 0.14210891723632812, "step": 480, "valid_targets_mean": 753.6, "valid_targets_min": 416 }, { "epoch": 3.0896, "grad_norm": 0.6619131317365188, "learning_rate": 1.5413345263560922e-05, "loss": 0.5013, "loss_nan_ranks": 0, "loss_rank_avg": 0.12965424358844757, "step": 485, "valid_targets_mean": 726.4, "valid_targets_min": 444 }, { "epoch": 3.1216, "grad_norm": 0.6591313448070079, "learning_rate": 1.4981391115056032e-05, "loss": 0.5063, "loss_nan_ranks": 0, "loss_rank_avg": 0.1259090006351471, "step": 490, "valid_targets_mean": 749.0, "valid_targets_min": 385 }, { "epoch": 3.1536, "grad_norm": 0.6569116597796409, "learning_rate": 1.455192121934748e-05, "loss": 0.5012, "loss_nan_ranks": 0, "loss_rank_avg": 0.1285029649734497, "step": 495, "valid_targets_mean": 837.4, "valid_targets_min": 496 }, { "epoch": 3.1856, "grad_norm": 0.6316910437995757, "learning_rate": 1.4125148167576303e-05, "loss": 0.5031, "loss_nan_ranks": 0, "loss_rank_avg": 0.11511541903018951, "step": 500, "valid_targets_mean": 672.5, "valid_targets_min": 430 }, { "epoch": 3.2176, "grad_norm": 0.6389505729458541, "learning_rate": 1.3701283215923563e-05, "loss": 0.5044, "loss_nan_ranks": 0, "loss_rank_avg": 0.12613186240196228, "step": 505, "valid_targets_mean": 796.8, "valid_targets_min": 310 }, { "epoch": 3.2496, "grad_norm": 0.6629145801717702, "learning_rate": 1.328053618103677e-05, "loss": 0.495, "loss_nan_ranks": 0, "loss_rank_avg": 0.11542559415102005, "step": 510, "valid_targets_mean": 727.7, "valid_targets_min": 439 }, { "epoch": 3.2816, "grad_norm": 0.634372579989713, "learning_rate": 1.2863115336168916e-05, "loss": 0.5014, "loss_nan_ranks": 0, "loss_rank_avg": 0.12187671661376953, "step": 515, "valid_targets_mean": 753.3, "valid_targets_min": 482 }, { "epoch": 3.3136, "grad_norm": 0.6384070283786387, "learning_rate": 1.2449227308081509e-05, "loss": 0.5008, "loss_nan_ranks": 0, "loss_rank_avg": 0.1421659290790558, "step": 520, "valid_targets_mean": 845.1, "valid_targets_min": 573 }, { "epoch": 3.3456, "grad_norm": 0.6332388154728119, "learning_rate": 1.2039076974762587e-05, "loss": 0.4928, "loss_nan_ranks": 0, "loss_rank_avg": 0.11383464932441711, "step": 525, "valid_targets_mean": 718.1, "valid_targets_min": 397 }, { "epoch": 3.3776, "grad_norm": 0.6093799616673348, "learning_rate": 1.163286736401044e-05, "loss": 0.4926, "loss_nan_ranks": 0, "loss_rank_avg": 0.12254788726568222, "step": 530, "valid_targets_mean": 798.4, "valid_targets_min": 413 }, { "epoch": 3.4096, "grad_norm": 0.6455046460858423, "learning_rate": 1.123079955293322e-05, "loss": 0.5112, "loss_nan_ranks": 0, "loss_rank_avg": 0.13581742346286774, "step": 535, "valid_targets_mean": 846.3, "valid_targets_min": 520 }, { "epoch": 3.4416, "grad_norm": 0.631427790063437, "learning_rate": 1.0833072568414037e-05, "loss": 0.5119, "loss_nan_ranks": 0, "loss_rank_avg": 0.1291504204273224, "step": 540, "valid_targets_mean": 806.6, "valid_targets_min": 501 }, { "epoch": 3.4736000000000002, "grad_norm": 0.6322674040782007, "learning_rate": 1.0439883288591057e-05, "loss": 0.4964, "loss_nan_ranks": 0, "loss_rank_avg": 0.11973114311695099, "step": 545, "valid_targets_mean": 736.1, "valid_targets_min": 400 }, { "epoch": 3.5056000000000003, "grad_norm": 0.6441453581221164, "learning_rate": 1.0051426345401202e-05, "loss": 0.5132, "loss_nan_ranks": 0, "loss_rank_avg": 0.13486211001873016, "step": 550, "valid_targets_mean": 804.2, "valid_targets_min": 478 }, { "epoch": 3.5376, "grad_norm": 0.6425950808621582, "learning_rate": 9.667894028235704e-06, "loss": 0.5077, "loss_nan_ranks": 0, "loss_rank_avg": 0.12831318378448486, "step": 555, "valid_targets_mean": 724.8, "valid_targets_min": 512 }, { "epoch": 3.5696, "grad_norm": 0.6124823375637398, "learning_rate": 9.289476188755315e-06, "loss": 0.5038, "loss_nan_ranks": 0, "loss_rank_avg": 0.12071146070957184, "step": 560, "valid_targets_mean": 741.8, "valid_targets_min": 316 }, { "epoch": 3.6016, "grad_norm": 0.6205786442665068, "learning_rate": 8.916360146912122e-06, "loss": 0.5037, "loss_nan_ranks": 0, "loss_rank_avg": 0.10591184347867966, "step": 565, "valid_targets_mean": 709.2, "valid_targets_min": 480 }, { "epoch": 3.6336, "grad_norm": 0.6636478895992294, "learning_rate": 8.548730598224646e-06, "loss": 0.4993, "loss_nan_ranks": 0, "loss_rank_avg": 0.12058466672897339, "step": 570, "valid_targets_mean": 727.6, "valid_targets_min": 417 }, { "epoch": 3.6656, "grad_norm": 0.5971242481772463, "learning_rate": 8.186769522352053e-06, "loss": 0.4952, "loss_nan_ranks": 0, "loss_rank_avg": 0.13874907791614532, "step": 575, "valid_targets_mean": 860.1, "valid_targets_min": 527 }, { "epoch": 3.6976, "grad_norm": 0.615075544398069, "learning_rate": 7.830656093012714e-06, "loss": 0.5037, "loss_nan_ranks": 0, "loss_rank_avg": 0.11050853878259659, "step": 580, "valid_targets_mean": 743.3, "valid_targets_min": 468 }, { "epoch": 3.7296, "grad_norm": 0.6470427339088323, "learning_rate": 7.480566589291696e-06, "loss": 0.5064, "loss_nan_ranks": 0, "loss_rank_avg": 0.13953588902950287, "step": 585, "valid_targets_mean": 822.9, "valid_targets_min": 493 }, { "epoch": 3.7616, "grad_norm": 0.597610350393088, "learning_rate": 7.1366743083812285e-06, "loss": 0.4967, "loss_nan_ranks": 0, "loss_rank_avg": 0.11674021184444427, "step": 590, "valid_targets_mean": 738.9, "valid_targets_min": 415 }, { "epoch": 3.7936, "grad_norm": 0.6338991358688321, "learning_rate": 6.799149479797101e-06, "loss": 0.5006, "loss_nan_ranks": 0, "loss_rank_avg": 0.12019048631191254, "step": 595, "valid_targets_mean": 707.8, "valid_targets_min": 442 }, { "epoch": 3.8256, "grad_norm": 0.6202165939689909, "learning_rate": 6.4681591811137e-06, "loss": 0.4968, "loss_nan_ranks": 0, "loss_rank_avg": 0.12805257737636566, "step": 600, "valid_targets_mean": 873.7, "valid_targets_min": 529 }, { "epoch": 3.8576, "grad_norm": 0.6714289302846277, "learning_rate": 6.143867255259197e-06, "loss": 0.5012, "loss_nan_ranks": 0, "loss_rank_avg": 0.12346205115318298, "step": 605, "valid_targets_mean": 609.8, "valid_targets_min": 416 }, { "epoch": 3.8895999999999997, "grad_norm": 0.6242136384257845, "learning_rate": 5.8264342294119504e-06, "loss": 0.501, "loss_nan_ranks": 0, "loss_rank_avg": 0.11968934535980225, "step": 610, "valid_targets_mean": 689.3, "valid_targets_min": 443 }, { "epoch": 3.9215999999999998, "grad_norm": 0.6463661477202667, "learning_rate": 5.516017235538258e-06, "loss": 0.5053, "loss_nan_ranks": 0, "loss_rank_avg": 0.1359320878982544, "step": 615, "valid_targets_mean": 786.9, "valid_targets_min": 559 }, { "epoch": 3.9536, "grad_norm": 0.6491908587833649, "learning_rate": 5.212769932610695e-06, "loss": 0.4973, "loss_nan_ranks": 0, "loss_rank_avg": 0.14212754368782043, "step": 620, "valid_targets_mean": 861.1, "valid_targets_min": 507 }, { "epoch": 3.9856, "grad_norm": 0.6281434470761624, "learning_rate": 4.916842430545681e-06, "loss": 0.4975, "loss_nan_ranks": 0, "loss_rank_avg": 0.11817573010921478, "step": 625, "valid_targets_mean": 755.6, "valid_targets_min": 445 }, { "epoch": 4.0128, "grad_norm": 0.6490532684460447, "learning_rate": 4.628381215897837e-06, "loss": 0.4819, "loss_nan_ranks": 0, "loss_rank_avg": 0.11554375290870667, "step": 630, "valid_targets_mean": 724.1, "valid_targets_min": 367 }, { "epoch": 4.0448, "grad_norm": 0.6327445790214326, "learning_rate": 4.347529079347914e-06, "loss": 0.4894, "loss_nan_ranks": 0, "loss_rank_avg": 0.12214243412017822, "step": 635, "valid_targets_mean": 676.2, "valid_targets_min": 310 }, { "epoch": 4.0768, "grad_norm": 0.6158265147921352, "learning_rate": 4.074425045020247e-06, "loss": 0.4868, "loss_nan_ranks": 0, "loss_rank_avg": 0.13750189542770386, "step": 640, "valid_targets_mean": 834.1, "valid_targets_min": 555 }, { "epoch": 4.1088, "grad_norm": 0.627070881496122, "learning_rate": 3.8092043016646487e-06, "loss": 0.4929, "loss_nan_ranks": 0, "loss_rank_avg": 0.11655047535896301, "step": 645, "valid_targets_mean": 674.8, "valid_targets_min": 411 }, { "epoch": 4.1408, "grad_norm": 0.6221973742999799, "learning_rate": 3.551998135736867e-06, "loss": 0.489, "loss_nan_ranks": 0, "loss_rank_avg": 0.12250182032585144, "step": 650, "valid_targets_mean": 665.8, "valid_targets_min": 394 }, { "epoch": 4.1728, "grad_norm": 0.6626884060775692, "learning_rate": 3.3029338664107267e-06, "loss": 0.4741, "loss_nan_ranks": 0, "loss_rank_avg": 0.12655746936798096, "step": 655, "valid_targets_mean": 705.9, "valid_targets_min": 548 }, { "epoch": 4.2048, "grad_norm": 0.6224489306449362, "learning_rate": 3.0621347825540625e-06, "loss": 0.4841, "loss_nan_ranks": 0, "loss_rank_avg": 0.12335214763879776, "step": 660, "valid_targets_mean": 784.9, "valid_targets_min": 525 }, { "epoch": 4.2368, "grad_norm": 0.6520842934256829, "learning_rate": 2.8297200816997183e-06, "loss": 0.4911, "loss_nan_ranks": 0, "loss_rank_avg": 0.12841469049453735, "step": 665, "valid_targets_mean": 765.9, "valid_targets_min": 481 }, { "epoch": 4.2688, "grad_norm": 0.6423816689951226, "learning_rate": 2.605804811041803e-06, "loss": 0.4819, "loss_nan_ranks": 0, "loss_rank_avg": 0.10488501191139221, "step": 670, "valid_targets_mean": 732.0, "valid_targets_min": 450 }, { "epoch": 4.3008, "grad_norm": 0.6181615169830669, "learning_rate": 2.390499810486351e-06, "loss": 0.481, "loss_nan_ranks": 0, "loss_rank_avg": 0.12980340421199799, "step": 675, "valid_targets_mean": 780.8, "valid_targets_min": 466 }, { "epoch": 4.3328, "grad_norm": 0.6322677157647695, "learning_rate": 2.183911657784685e-06, "loss": 0.4964, "loss_nan_ranks": 0, "loss_rank_avg": 0.12320327758789062, "step": 680, "valid_targets_mean": 868.0, "valid_targets_min": 429 }, { "epoch": 4.3648, "grad_norm": 0.6043179861396448, "learning_rate": 1.986142615776532e-06, "loss": 0.4919, "loss_nan_ranks": 0, "loss_rank_avg": 0.1195220947265625, "step": 685, "valid_targets_mean": 728.2, "valid_targets_min": 471 }, { "epoch": 4.3968, "grad_norm": 0.640478901066271, "learning_rate": 1.7972905817690644e-06, "loss": 0.4888, "loss_nan_ranks": 0, "loss_rank_avg": 0.12453019618988037, "step": 690, "valid_targets_mean": 716.9, "valid_targets_min": 362 }, { "epoch": 4.4288, "grad_norm": 0.6112657891179784, "learning_rate": 1.617449039076955e-06, "loss": 0.4972, "loss_nan_ranks": 0, "loss_rank_avg": 0.12277944386005402, "step": 695, "valid_targets_mean": 826.2, "valid_targets_min": 418 }, { "epoch": 4.4608, "grad_norm": 0.680661377919841, "learning_rate": 1.4467070107473413e-06, "loss": 0.4926, "loss_nan_ranks": 0, "loss_rank_avg": 0.11709259450435638, "step": 700, "valid_targets_mean": 670.3, "valid_targets_min": 446 }, { "epoch": 4.4928, "grad_norm": 0.6067946877348196, "learning_rate": 1.2851490154926816e-06, "loss": 0.4844, "loss_nan_ranks": 0, "loss_rank_avg": 0.11835283041000366, "step": 705, "valid_targets_mean": 833.8, "valid_targets_min": 535 }, { "epoch": 4.5248, "grad_norm": 0.6214656228794159, "learning_rate": 1.1328550258533211e-06, "loss": 0.4955, "loss_nan_ranks": 0, "loss_rank_avg": 0.1317848563194275, "step": 710, "valid_targets_mean": 780.4, "valid_targets_min": 540 }, { "epoch": 4.5568, "grad_norm": 0.6101425589821468, "learning_rate": 9.899004286103953e-07, "loss": 0.4892, "loss_nan_ranks": 0, "loss_rank_avg": 0.1342540979385376, "step": 715, "valid_targets_mean": 813.8, "valid_targets_min": 592 }, { "epoch": 4.5888, "grad_norm": 0.6431480087423993, "learning_rate": 8.5635598746876e-07, "loss": 0.4892, "loss_nan_ranks": 0, "loss_rank_avg": 0.12438573688268661, "step": 720, "valid_targets_mean": 771.0, "valid_targets_min": 505 }, { "epoch": 4.6208, "grad_norm": 0.6403336841979095, "learning_rate": 7.32287808028389e-07, "loss": 0.502, "loss_nan_ranks": 0, "loss_rank_avg": 0.14776507019996643, "step": 725, "valid_targets_mean": 895.8, "valid_targets_min": 484 }, { "epoch": 4.6528, "grad_norm": 0.6038822017761685, "learning_rate": 6.177573050615327e-07, "loss": 0.4909, "loss_nan_ranks": 0, "loss_rank_avg": 0.11538829654455185, "step": 730, "valid_targets_mean": 747.9, "valid_targets_min": 428 }, { "epoch": 4.6848, "grad_norm": 0.6324119113515146, "learning_rate": 5.128211721119213e-07, "loss": 0.4979, "loss_nan_ranks": 0, "loss_rank_avg": 0.13889926671981812, "step": 735, "valid_targets_mean": 792.2, "valid_targets_min": 385 }, { "epoch": 4.7168, "grad_norm": 0.6324936257203374, "learning_rate": 4.175313534309755e-07, "loss": 0.487, "loss_nan_ranks": 0, "loss_rank_avg": 0.1103043407201767, "step": 740, "valid_targets_mean": 703.1, "valid_targets_min": 435 }, { "epoch": 4.7488, "grad_norm": 0.629729044732728, "learning_rate": 3.319350182649861e-07, "loss": 0.4892, "loss_nan_ranks": 0, "loss_rank_avg": 0.12301481515169144, "step": 745, "valid_targets_mean": 739.1, "valid_targets_min": 491 }, { "epoch": 4.7808, "grad_norm": 0.6274760958790774, "learning_rate": 2.560745375059392e-07, "loss": 0.4877, "loss_nan_ranks": 0, "loss_rank_avg": 0.11202463507652283, "step": 750, "valid_targets_mean": 644.5, "valid_targets_min": 356 }, { "epoch": 4.8128, "grad_norm": 0.6073341940792345, "learning_rate": 1.8998746271758016e-07, "loss": 0.4851, "loss_nan_ranks": 0, "loss_rank_avg": 0.12768879532814026, "step": 755, "valid_targets_mean": 871.2, "valid_targets_min": 439 }, { "epoch": 4.8448, "grad_norm": 0.6300616469590965, "learning_rate": 1.337065075470778e-07, "loss": 0.4962, "loss_nan_ranks": 0, "loss_rank_avg": 0.10540274530649185, "step": 760, "valid_targets_mean": 732.8, "valid_targets_min": 458 }, { "epoch": 4.8768, "grad_norm": 0.607465565387994, "learning_rate": 8.725953153150279e-08, "loss": 0.4911, "loss_nan_ranks": 0, "loss_rank_avg": 0.132490336894989, "step": 765, "valid_targets_mean": 854.8, "valid_targets_min": 435 }, { "epoch": 4.9088, "grad_norm": 0.5917313437808399, "learning_rate": 5.066952630711886e-08, "loss": 0.4864, "loss_nan_ranks": 0, "loss_rank_avg": 0.10872993618249893, "step": 770, "valid_targets_mean": 783.2, "valid_targets_min": 507 }, { "epoch": 4.9408, "grad_norm": 0.7403546016233938, "learning_rate": 2.3954604228342283e-08, "loss": 0.4979, "loss_nan_ranks": 0, "loss_rank_avg": 0.12833866477012634, "step": 775, "valid_targets_mean": 677.8, "valid_targets_min": 428 }, { "epoch": 4.9728, "grad_norm": 0.6407918253430359, "learning_rate": 7.12798940197601e-09, "loss": 0.498, "loss_nan_ranks": 0, "loss_rank_avg": 0.12702913582324982, "step": 780, "valid_targets_mean": 762.4, "valid_targets_min": 411 }, { "epoch": 5.0, "grad_norm": 1.2926809123761072, "learning_rate": 1.9801114115480802e-10, "loss": 0.4833, "loss_nan_ranks": 0, "loss_rank_avg": 0.4620312452316284, "step": 785, "valid_targets_mean": 702.6, "valid_targets_min": 459 }, { "epoch": 5.0, "loss_nan_ranks": 0, "loss_rank_avg": 0.4620312452316284, "step": 785, "total_flos": 2.3852276139845222e+17, "train_loss": 0.5620896111628052, "train_runtime": 6728.981, "train_samples_per_second": 7.428, "train_steps_per_second": 0.117, "valid_targets_mean": 702.6, "valid_targets_min": 459 } ], "logging_steps": 5, "max_steps": 785, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.3852276139845222e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }