{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.5877022653721684,
  "eval_steps": 500,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012944983818770227,
      "grad_norm": 2.9600605964660645,
      "learning_rate": 9e-06,
      "loss": 2.4498,
      "step": 10
    },
    {
      "epoch": 0.025889967637540454,
      "grad_norm": 2.9047722816467285,
      "learning_rate": 1.9e-05,
      "loss": 2.307,
      "step": 20
    },
    {
      "epoch": 0.038834951456310676,
      "grad_norm": 1.4037628173828125,
      "learning_rate": 2.9e-05,
      "loss": 1.9692,
      "step": 30
    },
    {
      "epoch": 0.05177993527508091,
      "grad_norm": 0.9827209711074829,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 1.5169,
      "step": 40
    },
    {
      "epoch": 0.06472491909385113,
      "grad_norm": 1.69009530544281,
      "learning_rate": 4.9e-05,
      "loss": 1.1746,
      "step": 50
    },
    {
      "epoch": 0.07766990291262135,
      "grad_norm": 0.6970093250274658,
      "learning_rate": 5.9e-05,
      "loss": 1.1462,
      "step": 60
    },
    {
      "epoch": 0.09061488673139159,
      "grad_norm": 1.1838749647140503,
      "learning_rate": 6.9e-05,
      "loss": 1.0314,
      "step": 70
    },
    {
      "epoch": 0.10355987055016182,
      "grad_norm": 1.2029207944869995,
      "learning_rate": 7.900000000000001e-05,
      "loss": 1.015,
      "step": 80
    },
    {
      "epoch": 0.11650485436893204,
      "grad_norm": 0.7995015978813171,
      "learning_rate": 8.900000000000001e-05,
      "loss": 0.9881,
      "step": 90
    },
    {
      "epoch": 0.12944983818770225,
      "grad_norm": 0.7544731497764587,
      "learning_rate": 9.900000000000001e-05,
      "loss": 0.9749,
      "step": 100
    },
    {
      "epoch": 0.1423948220064725,
      "grad_norm": 0.9032944440841675,
      "learning_rate": 9.959441189725102e-05,
      "loss": 0.9717,
      "step": 110
    },
    {
      "epoch": 0.1553398058252427,
      "grad_norm": 0.7858815789222717,
      "learning_rate": 9.914375844975215e-05,
      "loss": 0.9365,
      "step": 120
    },
    {
      "epoch": 0.16828478964401294,
      "grad_norm": 0.6408785581588745,
      "learning_rate": 9.869310500225327e-05,
      "loss": 1.0061,
      "step": 130
    },
    {
      "epoch": 0.18122977346278318,
      "grad_norm": 0.7295084595680237,
      "learning_rate": 9.82424515547544e-05,
      "loss": 1.0838,
      "step": 140
    },
    {
      "epoch": 0.1941747572815534,
      "grad_norm": 1.299147129058838,
      "learning_rate": 9.779179810725552e-05,
      "loss": 0.8957,
      "step": 150
    },
    {
      "epoch": 0.20711974110032363,
      "grad_norm": 0.6116259694099426,
      "learning_rate": 9.734114465975666e-05,
      "loss": 0.9007,
      "step": 160
    },
    {
      "epoch": 0.22006472491909385,
      "grad_norm": 1.0018341541290283,
      "learning_rate": 9.689049121225779e-05,
      "loss": 0.9559,
      "step": 170
    },
    {
      "epoch": 0.23300970873786409,
      "grad_norm": 0.6822437047958374,
      "learning_rate": 9.64398377647589e-05,
      "loss": 0.8807,
      "step": 180
    },
    {
      "epoch": 0.2459546925566343,
      "grad_norm": 0.6875489950180054,
      "learning_rate": 9.598918431726003e-05,
      "loss": 0.892,
      "step": 190
    },
    {
      "epoch": 0.2588996763754045,
      "grad_norm": 0.9155406355857849,
      "learning_rate": 9.553853086976116e-05,
      "loss": 0.8846,
      "step": 200
    },
    {
      "epoch": 0.27184466019417475,
      "grad_norm": 0.7715094685554504,
      "learning_rate": 9.508787742226228e-05,
      "loss": 0.9194,
      "step": 210
    },
    {
      "epoch": 0.284789644012945,
      "grad_norm": 0.9338945150375366,
      "learning_rate": 9.463722397476341e-05,
      "loss": 0.843,
      "step": 220
    },
    {
      "epoch": 0.2977346278317152,
      "grad_norm": 0.5477790236473083,
      "learning_rate": 9.418657052726453e-05,
      "loss": 0.9608,
      "step": 230
    },
    {
      "epoch": 0.3106796116504854,
      "grad_norm": 0.7385268211364746,
      "learning_rate": 9.373591707976567e-05,
      "loss": 0.8578,
      "step": 240
    },
    {
      "epoch": 0.32362459546925565,
      "grad_norm": 0.7461796402931213,
      "learning_rate": 9.32852636322668e-05,
      "loss": 0.8357,
      "step": 250
    },
    {
      "epoch": 0.3365695792880259,
      "grad_norm": 0.7043349146842957,
      "learning_rate": 9.283461018476792e-05,
      "loss": 0.9199,
      "step": 260
    },
    {
      "epoch": 0.34951456310679613,
      "grad_norm": 0.6864545941352844,
      "learning_rate": 9.238395673726904e-05,
      "loss": 0.8716,
      "step": 270
    },
    {
      "epoch": 0.36245954692556637,
      "grad_norm": 0.8263736963272095,
      "learning_rate": 9.193330328977017e-05,
      "loss": 0.9072,
      "step": 280
    },
    {
      "epoch": 0.37540453074433655,
      "grad_norm": 0.7197927832603455,
      "learning_rate": 9.148264984227129e-05,
      "loss": 0.9276,
      "step": 290
    },
    {
      "epoch": 0.3883495145631068,
      "grad_norm": 0.769057035446167,
      "learning_rate": 9.103199639477243e-05,
      "loss": 0.9371,
      "step": 300
    },
    {
      "epoch": 0.40129449838187703,
      "grad_norm": 0.6112708449363708,
      "learning_rate": 9.058134294727354e-05,
      "loss": 0.841,
      "step": 310
    },
    {
      "epoch": 0.41423948220064727,
      "grad_norm": 0.6785593628883362,
      "learning_rate": 9.013068949977468e-05,
      "loss": 0.8026,
      "step": 320
    },
    {
      "epoch": 0.42718446601941745,
      "grad_norm": 0.75263911485672,
      "learning_rate": 8.968003605227581e-05,
      "loss": 0.8829,
      "step": 330
    },
    {
      "epoch": 0.4401294498381877,
      "grad_norm": 0.6636873483657837,
      "learning_rate": 8.922938260477693e-05,
      "loss": 0.877,
      "step": 340
    },
    {
      "epoch": 0.45307443365695793,
      "grad_norm": 0.7837623357772827,
      "learning_rate": 8.877872915727806e-05,
      "loss": 0.8355,
      "step": 350
    },
    {
      "epoch": 0.46601941747572817,
      "grad_norm": 0.7330045104026794,
      "learning_rate": 8.832807570977918e-05,
      "loss": 0.8869,
      "step": 360
    },
    {
      "epoch": 0.47896440129449835,
      "grad_norm": 0.7116459012031555,
      "learning_rate": 8.78774222622803e-05,
      "loss": 0.8861,
      "step": 370
    },
    {
      "epoch": 0.4919093851132686,
      "grad_norm": 0.7309095859527588,
      "learning_rate": 8.742676881478144e-05,
      "loss": 0.8943,
      "step": 380
    },
    {
      "epoch": 0.5048543689320388,
      "grad_norm": 0.9951479434967041,
      "learning_rate": 8.697611536728256e-05,
      "loss": 0.8906,
      "step": 390
    },
    {
      "epoch": 0.517799352750809,
      "grad_norm": 0.8258851766586304,
      "learning_rate": 8.652546191978369e-05,
      "loss": 0.7817,
      "step": 400
    },
    {
      "epoch": 0.5307443365695793,
      "grad_norm": 0.8005662560462952,
      "learning_rate": 8.607480847228482e-05,
      "loss": 0.8965,
      "step": 410
    },
    {
      "epoch": 0.5436893203883495,
      "grad_norm": 0.956330418586731,
      "learning_rate": 8.562415502478594e-05,
      "loss": 0.8186,
      "step": 420
    },
    {
      "epoch": 0.5566343042071198,
      "grad_norm": 0.7853320240974426,
      "learning_rate": 8.517350157728708e-05,
      "loss": 0.7989,
      "step": 430
    },
    {
      "epoch": 0.56957928802589,
      "grad_norm": 0.8193638920783997,
      "learning_rate": 8.47228481297882e-05,
      "loss": 0.833,
      "step": 440
    },
    {
      "epoch": 0.5825242718446602,
      "grad_norm": 0.6770658493041992,
      "learning_rate": 8.427219468228931e-05,
      "loss": 0.754,
      "step": 450
    },
    {
      "epoch": 0.5954692556634305,
      "grad_norm": 0.7300212979316711,
      "learning_rate": 8.382154123479045e-05,
      "loss": 0.8801,
      "step": 460
    },
    {
      "epoch": 0.6084142394822006,
      "grad_norm": 0.9311557412147522,
      "learning_rate": 8.337088778729157e-05,
      "loss": 0.9497,
      "step": 470
    },
    {
      "epoch": 0.6213592233009708,
      "grad_norm": 0.8132520914077759,
      "learning_rate": 8.29202343397927e-05,
      "loss": 0.7912,
      "step": 480
    },
    {
      "epoch": 0.6343042071197411,
      "grad_norm": 0.6899361610412598,
      "learning_rate": 8.246958089229383e-05,
      "loss": 0.8118,
      "step": 490
    },
    {
      "epoch": 0.6472491909385113,
      "grad_norm": 0.8400319218635559,
      "learning_rate": 8.201892744479495e-05,
      "loss": 0.8223,
      "step": 500
    },
    {
      "epoch": 0.6601941747572816,
      "grad_norm": 0.6506232023239136,
      "learning_rate": 8.156827399729609e-05,
      "loss": 0.8067,
      "step": 510
    },
    {
      "epoch": 0.6731391585760518,
      "grad_norm": 0.7419421672821045,
      "learning_rate": 8.111762054979722e-05,
      "loss": 0.8338,
      "step": 520
    },
    {
      "epoch": 0.686084142394822,
      "grad_norm": 0.8188750743865967,
      "learning_rate": 8.066696710229834e-05,
      "loss": 0.8502,
      "step": 530
    },
    {
      "epoch": 0.6990291262135923,
      "grad_norm": 0.7666177153587341,
      "learning_rate": 8.021631365479946e-05,
      "loss": 0.9033,
      "step": 540
    },
    {
      "epoch": 0.7119741100323624,
      "grad_norm": 0.7812498211860657,
      "learning_rate": 7.976566020730059e-05,
      "loss": 0.8686,
      "step": 550
    },
    {
      "epoch": 0.7249190938511327,
      "grad_norm": 0.7209528684616089,
      "learning_rate": 7.931500675980171e-05,
      "loss": 0.9243,
      "step": 560
    },
    {
      "epoch": 0.7378640776699029,
      "grad_norm": 0.8110234141349792,
      "learning_rate": 7.886435331230284e-05,
      "loss": 0.8535,
      "step": 570
    },
    {
      "epoch": 0.7508090614886731,
      "grad_norm": 0.9169409871101379,
      "learning_rate": 7.841369986480396e-05,
      "loss": 0.7916,
      "step": 580
    },
    {
      "epoch": 0.7637540453074434,
      "grad_norm": 0.9186325073242188,
      "learning_rate": 7.79630464173051e-05,
      "loss": 0.8712,
      "step": 590
    },
    {
      "epoch": 0.7766990291262136,
      "grad_norm": 0.7394826412200928,
      "learning_rate": 7.751239296980623e-05,
      "loss": 0.8319,
      "step": 600
    },
    {
      "epoch": 0.7896440129449838,
      "grad_norm": 0.9118407964706421,
      "learning_rate": 7.706173952230735e-05,
      "loss": 0.8615,
      "step": 610
    },
    {
      "epoch": 0.8025889967637541,
      "grad_norm": 0.8932146430015564,
      "learning_rate": 7.661108607480848e-05,
      "loss": 0.8729,
      "step": 620
    },
    {
      "epoch": 0.8155339805825242,
      "grad_norm": 0.8682609796524048,
      "learning_rate": 7.61604326273096e-05,
      "loss": 0.8196,
      "step": 630
    },
    {
      "epoch": 0.8284789644012945,
      "grad_norm": 0.7901037931442261,
      "learning_rate": 7.570977917981072e-05,
      "loss": 0.8124,
      "step": 640
    },
    {
      "epoch": 0.8414239482200647,
      "grad_norm": 0.787814199924469,
      "learning_rate": 7.525912573231186e-05,
      "loss": 0.7854,
      "step": 650
    },
    {
      "epoch": 0.8543689320388349,
      "grad_norm": 0.9356242418289185,
      "learning_rate": 7.480847228481298e-05,
      "loss": 0.7527,
      "step": 660
    },
    {
      "epoch": 0.8673139158576052,
      "grad_norm": 0.7236626744270325,
      "learning_rate": 7.435781883731411e-05,
      "loss": 0.7956,
      "step": 670
    },
    {
      "epoch": 0.8802588996763754,
      "grad_norm": 1.0258172750473022,
      "learning_rate": 7.390716538981524e-05,
      "loss": 0.7896,
      "step": 680
    },
    {
      "epoch": 0.8932038834951457,
      "grad_norm": 0.8183866739273071,
      "learning_rate": 7.345651194231636e-05,
      "loss": 0.7084,
      "step": 690
    },
    {
      "epoch": 0.9061488673139159,
      "grad_norm": 1.1878470182418823,
      "learning_rate": 7.30058584948175e-05,
      "loss": 0.7582,
      "step": 700
    },
    {
      "epoch": 0.919093851132686,
      "grad_norm": 0.933775007724762,
      "learning_rate": 7.255520504731861e-05,
      "loss": 0.7988,
      "step": 710
    },
    {
      "epoch": 0.9320388349514563,
      "grad_norm": 0.8619435429573059,
      "learning_rate": 7.210455159981975e-05,
      "loss": 0.8472,
      "step": 720
    },
    {
      "epoch": 0.9449838187702265,
      "grad_norm": 0.8439723253250122,
      "learning_rate": 7.165389815232087e-05,
      "loss": 0.7647,
      "step": 730
    },
    {
      "epoch": 0.9579288025889967,
      "grad_norm": 0.8391640782356262,
      "learning_rate": 7.120324470482199e-05,
      "loss": 0.811,
      "step": 740
    },
    {
      "epoch": 0.970873786407767,
      "grad_norm": 0.8031836748123169,
      "learning_rate": 7.075259125732312e-05,
      "loss": 0.7377,
      "step": 750
    },
    {
      "epoch": 0.9838187702265372,
      "grad_norm": 0.9124151468276978,
      "learning_rate": 7.030193780982425e-05,
      "loss": 0.7091,
      "step": 760
    },
    {
      "epoch": 0.9967637540453075,
      "grad_norm": 1.0887125730514526,
      "learning_rate": 6.985128436232537e-05,
      "loss": 0.8262,
      "step": 770
    },
    {
      "epoch": 1.009061488673139,
      "grad_norm": 0.9555509090423584,
      "learning_rate": 6.94006309148265e-05,
      "loss": 0.7195,
      "step": 780
    },
    {
      "epoch": 1.0220064724919093,
      "grad_norm": 1.1681190729141235,
      "learning_rate": 6.894997746732763e-05,
      "loss": 0.6532,
      "step": 790
    },
    {
      "epoch": 1.0349514563106796,
      "grad_norm": 1.120592713356018,
      "learning_rate": 6.849932401982876e-05,
      "loss": 0.6175,
      "step": 800
    },
    {
      "epoch": 1.04789644012945,
      "grad_norm": 0.9625107645988464,
      "learning_rate": 6.804867057232989e-05,
      "loss": 0.6541,
      "step": 810
    },
    {
      "epoch": 1.06084142394822,
      "grad_norm": 0.9709998965263367,
      "learning_rate": 6.7598017124831e-05,
      "loss": 0.6446,
      "step": 820
    },
    {
      "epoch": 1.0737864077669903,
      "grad_norm": 0.9543795585632324,
      "learning_rate": 6.714736367733213e-05,
      "loss": 0.5685,
      "step": 830
    },
    {
      "epoch": 1.0867313915857606,
      "grad_norm": 1.2367397546768188,
      "learning_rate": 6.669671022983326e-05,
      "loss": 0.6747,
      "step": 840
    },
    {
      "epoch": 1.0996763754045307,
      "grad_norm": 1.0891395807266235,
      "learning_rate": 6.624605678233438e-05,
      "loss": 0.6536,
      "step": 850
    },
    {
      "epoch": 1.112621359223301,
      "grad_norm": 1.1543422937393188,
      "learning_rate": 6.579540333483552e-05,
      "loss": 0.7627,
      "step": 860
    },
    {
      "epoch": 1.1255663430420713,
      "grad_norm": 1.363010048866272,
      "learning_rate": 6.534474988733664e-05,
      "loss": 0.7699,
      "step": 870
    },
    {
      "epoch": 1.1385113268608413,
      "grad_norm": 1.171339511871338,
      "learning_rate": 6.489409643983777e-05,
      "loss": 0.6775,
      "step": 880
    },
    {
      "epoch": 1.1514563106796116,
      "grad_norm": 1.1836340427398682,
      "learning_rate": 6.44434429923389e-05,
      "loss": 0.6744,
      "step": 890
    },
    {
      "epoch": 1.164401294498382,
      "grad_norm": 1.1864937543869019,
      "learning_rate": 6.399278954484002e-05,
      "loss": 0.61,
      "step": 900
    },
    {
      "epoch": 1.177346278317152,
      "grad_norm": 1.3767447471618652,
      "learning_rate": 6.354213609734114e-05,
      "loss": 0.7274,
      "step": 910
    },
    {
      "epoch": 1.1902912621359223,
      "grad_norm": 1.1915229558944702,
      "learning_rate": 6.309148264984228e-05,
      "loss": 0.6787,
      "step": 920
    },
    {
      "epoch": 1.2032362459546926,
      "grad_norm": 1.416157603263855,
      "learning_rate": 6.26408292023434e-05,
      "loss": 0.5878,
      "step": 930
    },
    {
      "epoch": 1.2161812297734629,
      "grad_norm": 1.179671049118042,
      "learning_rate": 6.219017575484453e-05,
      "loss": 0.6416,
      "step": 940
    },
    {
      "epoch": 1.229126213592233,
      "grad_norm": 0.9643092751502991,
      "learning_rate": 6.173952230734565e-05,
      "loss": 0.6718,
      "step": 950
    },
    {
      "epoch": 1.2420711974110032,
      "grad_norm": 1.417594075202942,
      "learning_rate": 6.128886885984678e-05,
      "loss": 0.6965,
      "step": 960
    },
    {
      "epoch": 1.2550161812297735,
      "grad_norm": 1.740006685256958,
      "learning_rate": 6.083821541234791e-05,
      "loss": 0.6674,
      "step": 970
    },
    {
      "epoch": 1.2679611650485436,
      "grad_norm": 1.2844997644424438,
      "learning_rate": 6.0387561964849034e-05,
      "loss": 0.7144,
      "step": 980
    },
    {
      "epoch": 1.280906148867314,
      "grad_norm": 1.012427568435669,
      "learning_rate": 5.993690851735017e-05,
      "loss": 0.6602,
      "step": 990
    },
    {
      "epoch": 1.2938511326860842,
      "grad_norm": 1.6793211698532104,
      "learning_rate": 5.948625506985128e-05,
      "loss": 0.6379,
      "step": 1000
    },
    {
      "epoch": 1.3067961165048545,
      "grad_norm": 1.1831872463226318,
      "learning_rate": 5.903560162235241e-05,
      "loss": 0.6309,
      "step": 1010
    },
    {
      "epoch": 1.3197411003236246,
      "grad_norm": 1.1994572877883911,
      "learning_rate": 5.858494817485354e-05,
      "loss": 0.6099,
      "step": 1020
    },
    {
      "epoch": 1.3326860841423949,
      "grad_norm": 1.3904818296432495,
      "learning_rate": 5.8134294727354666e-05,
      "loss": 0.6408,
      "step": 1030
    },
    {
      "epoch": 1.3456310679611652,
      "grad_norm": 1.3394759893417358,
      "learning_rate": 5.768364127985579e-05,
      "loss": 0.6148,
      "step": 1040
    },
    {
      "epoch": 1.3585760517799352,
      "grad_norm": 1.0991851091384888,
      "learning_rate": 5.723298783235692e-05,
      "loss": 0.6353,
      "step": 1050
    },
    {
      "epoch": 1.3715210355987055,
      "grad_norm": 2.004030227661133,
      "learning_rate": 5.678233438485805e-05,
      "loss": 0.6547,
      "step": 1060
    },
    {
      "epoch": 1.3844660194174758,
      "grad_norm": 1.5414537191390991,
      "learning_rate": 5.633168093735918e-05,
      "loss": 0.6684,
      "step": 1070
    },
    {
      "epoch": 1.397411003236246,
      "grad_norm": 1.4660983085632324,
      "learning_rate": 5.5881027489860305e-05,
      "loss": 0.7235,
      "step": 1080
    },
    {
      "epoch": 1.4103559870550162,
      "grad_norm": 0.6685907244682312,
      "learning_rate": 5.5430374042361425e-05,
      "loss": 0.6797,
      "step": 1090
    },
    {
      "epoch": 1.4233009708737865,
      "grad_norm": 1.2811195850372314,
      "learning_rate": 5.497972059486255e-05,
      "loss": 0.6325,
      "step": 1100
    },
    {
      "epoch": 1.4362459546925566,
      "grad_norm": 1.3929194211959839,
      "learning_rate": 5.452906714736368e-05,
      "loss": 0.6982,
      "step": 1110
    },
    {
      "epoch": 1.4491909385113269,
      "grad_norm": 1.1184651851654053,
      "learning_rate": 5.4078413699864804e-05,
      "loss": 0.561,
      "step": 1120
    },
    {
      "epoch": 1.4621359223300971,
      "grad_norm": 1.3536425828933716,
      "learning_rate": 5.362776025236593e-05,
      "loss": 0.606,
      "step": 1130
    },
    {
      "epoch": 1.4750809061488672,
      "grad_norm": 1.4096359014511108,
      "learning_rate": 5.3177106804867064e-05,
      "loss": 0.7083,
      "step": 1140
    },
    {
      "epoch": 1.4880258899676375,
      "grad_norm": 1.5671049356460571,
      "learning_rate": 5.272645335736819e-05,
      "loss": 0.6337,
      "step": 1150
    },
    {
      "epoch": 1.5009708737864078,
      "grad_norm": 1.3812810182571411,
      "learning_rate": 5.2275799909869316e-05,
      "loss": 0.646,
      "step": 1160
    },
    {
      "epoch": 1.5139158576051779,
      "grad_norm": 1.2574256658554077,
      "learning_rate": 5.182514646237044e-05,
      "loss": 0.5943,
      "step": 1170
    },
    {
      "epoch": 1.5268608414239482,
      "grad_norm": 1.0817134380340576,
      "learning_rate": 5.137449301487156e-05,
      "loss": 0.6729,
      "step": 1180
    },
    {
      "epoch": 1.5398058252427185,
      "grad_norm": 2.198194980621338,
      "learning_rate": 5.092383956737269e-05,
      "loss": 0.6537,
      "step": 1190
    },
    {
      "epoch": 1.5527508090614885,
      "grad_norm": 1.8652335405349731,
      "learning_rate": 5.0473186119873815e-05,
      "loss": 0.6486,
      "step": 1200
    },
    {
      "epoch": 1.565695792880259,
      "grad_norm": 1.6180390119552612,
      "learning_rate": 5.002253267237494e-05,
      "loss": 0.567,
      "step": 1210
    },
    {
      "epoch": 1.5786407766990291,
      "grad_norm": 1.9595000743865967,
      "learning_rate": 4.9571879224876075e-05,
      "loss": 0.6248,
      "step": 1220
    },
    {
      "epoch": 1.5915857605177992,
      "grad_norm": 0.9194151759147644,
      "learning_rate": 4.91212257773772e-05,
      "loss": 0.6251,
      "step": 1230
    },
    {
      "epoch": 1.6045307443365697,
      "grad_norm": 1.026760458946228,
      "learning_rate": 4.867057232987833e-05,
      "loss": 0.7088,
      "step": 1240
    },
    {
      "epoch": 1.6174757281553398,
      "grad_norm": 1.3673216104507446,
      "learning_rate": 4.821991888237945e-05,
      "loss": 0.6626,
      "step": 1250
    },
    {
      "epoch": 1.6304207119741099,
      "grad_norm": 1.1107579469680786,
      "learning_rate": 4.776926543488058e-05,
      "loss": 0.7106,
      "step": 1260
    },
    {
      "epoch": 1.6433656957928804,
      "grad_norm": 0.9585609436035156,
      "learning_rate": 4.731861198738171e-05,
      "loss": 0.7389,
      "step": 1270
    },
    {
      "epoch": 1.6563106796116505,
      "grad_norm": 1.2537648677825928,
      "learning_rate": 4.686795853988283e-05,
      "loss": 0.6415,
      "step": 1280
    },
    {
      "epoch": 1.6692556634304208,
      "grad_norm": 1.3973714113235474,
      "learning_rate": 4.641730509238396e-05,
      "loss": 0.5704,
      "step": 1290
    },
    {
      "epoch": 1.682200647249191,
      "grad_norm": 1.69650399684906,
      "learning_rate": 4.5966651644885086e-05,
      "loss": 0.6029,
      "step": 1300
    },
    {
      "epoch": 1.6951456310679611,
      "grad_norm": 1.6716722249984741,
      "learning_rate": 4.551599819738621e-05,
      "loss": 0.582,
      "step": 1310
    },
    {
      "epoch": 1.7080906148867314,
      "grad_norm": 1.383424997329712,
      "learning_rate": 4.506534474988734e-05,
      "loss": 0.6176,
      "step": 1320
    },
    {
      "epoch": 1.7210355987055017,
      "grad_norm": 1.0892506837844849,
      "learning_rate": 4.4614691302388465e-05,
      "loss": 0.5937,
      "step": 1330
    },
    {
      "epoch": 1.7339805825242718,
      "grad_norm": 1.4017115831375122,
      "learning_rate": 4.416403785488959e-05,
      "loss": 0.5447,
      "step": 1340
    },
    {
      "epoch": 1.746925566343042,
      "grad_norm": 1.332664132118225,
      "learning_rate": 4.371338440739072e-05,
      "loss": 0.65,
      "step": 1350
    },
    {
      "epoch": 1.7598705501618124,
      "grad_norm": 1.4425687789916992,
      "learning_rate": 4.3262730959891845e-05,
      "loss": 0.6095,
      "step": 1360
    },
    {
      "epoch": 1.7728155339805824,
      "grad_norm": 1.7485853433609009,
      "learning_rate": 4.281207751239297e-05,
      "loss": 0.6011,
      "step": 1370
    },
    {
      "epoch": 1.7857605177993527,
      "grad_norm": 1.2251473665237427,
      "learning_rate": 4.23614240648941e-05,
      "loss": 0.5049,
      "step": 1380
    },
    {
      "epoch": 1.798705501618123,
      "grad_norm": 1.543966293334961,
      "learning_rate": 4.1910770617395224e-05,
      "loss": 0.5768,
      "step": 1390
    },
    {
      "epoch": 1.811650485436893,
      "grad_norm": 1.153024435043335,
      "learning_rate": 4.146011716989635e-05,
      "loss": 0.606,
      "step": 1400
    },
    {
      "epoch": 1.8245954692556634,
      "grad_norm": 1.4503074884414673,
      "learning_rate": 4.1009463722397477e-05,
      "loss": 0.6068,
      "step": 1410
    },
    {
      "epoch": 1.8375404530744337,
      "grad_norm": 1.5761051177978516,
      "learning_rate": 4.055881027489861e-05,
      "loss": 0.5736,
      "step": 1420
    },
    {
      "epoch": 1.8504854368932038,
      "grad_norm": 1.4788157939910889,
      "learning_rate": 4.010815682739973e-05,
      "loss": 0.6098,
      "step": 1430
    },
    {
      "epoch": 1.863430420711974,
      "grad_norm": 1.3545244932174683,
      "learning_rate": 3.9657503379900856e-05,
      "loss": 0.5781,
      "step": 1440
    },
    {
      "epoch": 1.8763754045307444,
      "grad_norm": 1.4322385787963867,
      "learning_rate": 3.920684993240198e-05,
      "loss": 0.5779,
      "step": 1450
    },
    {
      "epoch": 1.8893203883495144,
      "grad_norm": 1.4146760702133179,
      "learning_rate": 3.8756196484903115e-05,
      "loss": 0.6097,
      "step": 1460
    },
    {
      "epoch": 1.902265372168285,
      "grad_norm": 1.7581102848052979,
      "learning_rate": 3.830554303740424e-05,
      "loss": 0.5672,
      "step": 1470
    },
    {
      "epoch": 1.915210355987055,
      "grad_norm": 1.5007004737854004,
      "learning_rate": 3.785488958990536e-05,
      "loss": 0.5951,
      "step": 1480
    },
    {
      "epoch": 1.928155339805825,
      "grad_norm": 1.2334699630737305,
      "learning_rate": 3.740423614240649e-05,
      "loss": 0.549,
      "step": 1490
    },
    {
      "epoch": 1.9411003236245956,
      "grad_norm": 0.9463567733764648,
      "learning_rate": 3.695358269490762e-05,
      "loss": 0.6448,
      "step": 1500
    },
    {
      "epoch": 1.9540453074433657,
      "grad_norm": 1.8025217056274414,
      "learning_rate": 3.650292924740875e-05,
      "loss": 0.7363,
      "step": 1510
    },
    {
      "epoch": 1.9669902912621358,
      "grad_norm": 2.9467597007751465,
      "learning_rate": 3.6052275799909874e-05,
      "loss": 0.5586,
      "step": 1520
    },
    {
      "epoch": 1.9799352750809063,
      "grad_norm": 1.8900437355041504,
      "learning_rate": 3.5601622352410993e-05,
      "loss": 0.623,
      "step": 1530
    },
    {
      "epoch": 1.9928802588996763,
      "grad_norm": 1.5157594680786133,
      "learning_rate": 3.515096890491213e-05,
      "loss": 0.606,
      "step": 1540
    },
    {
      "epoch": 2.005177993527508,
      "grad_norm": 1.370686411857605,
      "learning_rate": 3.470031545741325e-05,
      "loss": 0.5081,
      "step": 1550
    },
    {
      "epoch": 2.018122977346278,
      "grad_norm": 1.3025308847427368,
      "learning_rate": 3.424966200991438e-05,
      "loss": 0.4448,
      "step": 1560
    },
    {
      "epoch": 2.0310679611650486,
      "grad_norm": 0.7799197435379028,
      "learning_rate": 3.37990085624155e-05,
      "loss": 0.4341,
      "step": 1570
    },
    {
      "epoch": 2.0440129449838187,
      "grad_norm": 1.375433087348938,
      "learning_rate": 3.334835511491663e-05,
      "loss": 0.5702,
      "step": 1580
    },
    {
      "epoch": 2.056957928802589,
      "grad_norm": 1.4452621936798096,
      "learning_rate": 3.289770166741776e-05,
      "loss": 0.455,
      "step": 1590
    },
    {
      "epoch": 2.0699029126213593,
      "grad_norm": 1.0591763257980347,
      "learning_rate": 3.2447048219918885e-05,
      "loss": 0.5472,
      "step": 1600
    },
    {
      "epoch": 2.0828478964401294,
      "grad_norm": 1.366735816001892,
      "learning_rate": 3.199639477242001e-05,
      "loss": 0.5011,
      "step": 1610
    },
    {
      "epoch": 2.0957928802589,
      "grad_norm": 2.3307456970214844,
      "learning_rate": 3.154574132492114e-05,
      "loss": 0.5293,
      "step": 1620
    },
    {
      "epoch": 2.10873786407767,
      "grad_norm": 1.5701334476470947,
      "learning_rate": 3.1095087877422264e-05,
      "loss": 0.5571,
      "step": 1630
    },
    {
      "epoch": 2.12168284789644,
      "grad_norm": 1.585015892982483,
      "learning_rate": 3.064443442992339e-05,
      "loss": 0.4914,
      "step": 1640
    },
    {
      "epoch": 2.1346278317152105,
      "grad_norm": 1.5167232751846313,
      "learning_rate": 3.0193780982424517e-05,
      "loss": 0.4462,
      "step": 1650
    },
    {
      "epoch": 2.1475728155339806,
      "grad_norm": 1.8519450426101685,
      "learning_rate": 2.974312753492564e-05,
      "loss": 0.5367,
      "step": 1660
    },
    {
      "epoch": 2.1605177993527507,
      "grad_norm": 1.1500009298324585,
      "learning_rate": 2.929247408742677e-05,
      "loss": 0.4091,
      "step": 1670
    },
    {
      "epoch": 2.173462783171521,
      "grad_norm": 1.9004416465759277,
      "learning_rate": 2.8841820639927896e-05,
      "loss": 0.4752,
      "step": 1680
    },
    {
      "epoch": 2.1864077669902913,
      "grad_norm": 1.588977575302124,
      "learning_rate": 2.8391167192429026e-05,
      "loss": 0.4229,
      "step": 1690
    },
    {
      "epoch": 2.1993527508090613,
      "grad_norm": 2.033543825149536,
      "learning_rate": 2.7940513744930153e-05,
      "loss": 0.5575,
      "step": 1700
    },
    {
      "epoch": 2.212297734627832,
      "grad_norm": 1.1972986459732056,
      "learning_rate": 2.7489860297431276e-05,
      "loss": 0.4302,
      "step": 1710
    },
    {
      "epoch": 2.225242718446602,
      "grad_norm": 0.8637037873268127,
      "learning_rate": 2.7039206849932402e-05,
      "loss": 0.4924,
      "step": 1720
    },
    {
      "epoch": 2.238187702265372,
      "grad_norm": 1.6572158336639404,
      "learning_rate": 2.6588553402433532e-05,
      "loss": 0.5,
      "step": 1730
    },
    {
      "epoch": 2.2511326860841425,
      "grad_norm": 1.8110625743865967,
      "learning_rate": 2.6137899954934658e-05,
      "loss": 0.4361,
      "step": 1740
    },
    {
      "epoch": 2.2640776699029126,
      "grad_norm": 1.676248550415039,
      "learning_rate": 2.568724650743578e-05,
      "loss": 0.4407,
      "step": 1750
    },
    {
      "epoch": 2.2770226537216827,
      "grad_norm": 1.149107813835144,
      "learning_rate": 2.5236593059936908e-05,
      "loss": 0.5472,
      "step": 1760
    },
    {
      "epoch": 2.289967637540453,
      "grad_norm": 1.7543455362319946,
      "learning_rate": 2.4785939612438037e-05,
      "loss": 0.5396,
      "step": 1770
    },
    {
      "epoch": 2.3029126213592233,
      "grad_norm": 2.7097597122192383,
      "learning_rate": 2.4335286164939164e-05,
      "loss": 0.5013,
      "step": 1780
    },
    {
      "epoch": 2.3158576051779933,
      "grad_norm": 1.9181076288223267,
      "learning_rate": 2.388463271744029e-05,
      "loss": 0.4476,
      "step": 1790
    },
    {
      "epoch": 2.328802588996764,
      "grad_norm": 1.1099668741226196,
      "learning_rate": 2.3433979269941417e-05,
      "loss": 0.4113,
      "step": 1800
    },
    {
      "epoch": 2.341747572815534,
      "grad_norm": 1.5468546152114868,
      "learning_rate": 2.2983325822442543e-05,
      "loss": 0.4629,
      "step": 1810
    },
    {
      "epoch": 2.354692556634304,
      "grad_norm": 1.149834394454956,
      "learning_rate": 2.253267237494367e-05,
      "loss": 0.434,
      "step": 1820
    },
    {
      "epoch": 2.3676375404530745,
      "grad_norm": 1.4918863773345947,
      "learning_rate": 2.2082018927444796e-05,
      "loss": 0.4554,
      "step": 1830
    },
    {
      "epoch": 2.3805825242718446,
      "grad_norm": 1.610051155090332,
      "learning_rate": 2.1631365479945922e-05,
      "loss": 0.4081,
      "step": 1840
    },
    {
      "epoch": 2.3935275080906147,
      "grad_norm": 1.377886176109314,
      "learning_rate": 2.118071203244705e-05,
      "loss": 0.4531,
      "step": 1850
    },
    {
      "epoch": 2.406472491909385,
      "grad_norm": 1.3633161783218384,
      "learning_rate": 2.0730058584948175e-05,
      "loss": 0.4767,
      "step": 1860
    },
    {
      "epoch": 2.4194174757281552,
      "grad_norm": 2.3759074211120605,
      "learning_rate": 2.0279405137449305e-05,
      "loss": 0.5155,
      "step": 1870
    },
    {
      "epoch": 2.4323624595469258,
      "grad_norm": 1.634912133216858,
      "learning_rate": 1.9828751689950428e-05,
      "loss": 0.4279,
      "step": 1880
    },
    {
      "epoch": 2.445307443365696,
      "grad_norm": 1.3101922273635864,
      "learning_rate": 1.9378098242451558e-05,
      "loss": 0.4284,
      "step": 1890
    },
    {
      "epoch": 2.458252427184466,
      "grad_norm": 1.2841248512268066,
      "learning_rate": 1.892744479495268e-05,
      "loss": 0.5135,
      "step": 1900
    },
    {
      "epoch": 2.4711974110032364,
      "grad_norm": 2.4284558296203613,
      "learning_rate": 1.847679134745381e-05,
      "loss": 0.4455,
      "step": 1910
    },
    {
      "epoch": 2.4841423948220065,
      "grad_norm": 1.589353084564209,
      "learning_rate": 1.8026137899954937e-05,
      "loss": 0.484,
      "step": 1920
    },
    {
      "epoch": 2.4970873786407766,
      "grad_norm": 1.588586688041687,
      "learning_rate": 1.7575484452456063e-05,
      "loss": 0.5047,
      "step": 1930
    },
    {
      "epoch": 2.510032362459547,
      "grad_norm": 1.8486393690109253,
      "learning_rate": 1.712483100495719e-05,
      "loss": 0.4384,
      "step": 1940
    },
    {
      "epoch": 2.522977346278317,
      "grad_norm": 1.9914798736572266,
      "learning_rate": 1.6674177557458316e-05,
      "loss": 0.4324,
      "step": 1950
    },
    {
      "epoch": 2.5359223300970872,
      "grad_norm": 1.3951258659362793,
      "learning_rate": 1.6223524109959443e-05,
      "loss": 0.5316,
      "step": 1960
    },
    {
      "epoch": 2.5488673139158577,
      "grad_norm": 1.471160888671875,
      "learning_rate": 1.577287066246057e-05,
      "loss": 0.4943,
      "step": 1970
    },
    {
      "epoch": 2.561812297734628,
      "grad_norm": 1.5667455196380615,
      "learning_rate": 1.5322217214961695e-05,
      "loss": 0.5354,
      "step": 1980
    },
    {
      "epoch": 2.574757281553398,
      "grad_norm": 1.5954725742340088,
      "learning_rate": 1.487156376746282e-05,
      "loss": 0.4484,
      "step": 1990
    },
    {
      "epoch": 2.5877022653721684,
      "grad_norm": 1.3914281129837036,
      "learning_rate": 1.4420910319963948e-05,
      "loss": 0.4617,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 2319,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.084236146533417e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}