{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.5877022653721684, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012944983818770227, "grad_norm": 2.9600605964660645, "learning_rate": 9e-06, "loss": 2.4498, "step": 10 }, { "epoch": 0.025889967637540454, "grad_norm": 2.9047722816467285, "learning_rate": 1.9e-05, "loss": 2.307, "step": 20 }, { "epoch": 0.038834951456310676, "grad_norm": 1.4037628173828125, "learning_rate": 2.9e-05, "loss": 1.9692, "step": 30 }, { "epoch": 0.05177993527508091, "grad_norm": 0.9827209711074829, "learning_rate": 3.9000000000000006e-05, "loss": 1.5169, "step": 40 }, { "epoch": 0.06472491909385113, "grad_norm": 1.69009530544281, "learning_rate": 4.9e-05, "loss": 1.1746, "step": 50 }, { "epoch": 0.07766990291262135, "grad_norm": 0.6970093250274658, "learning_rate": 5.9e-05, "loss": 1.1462, "step": 60 }, { "epoch": 0.09061488673139159, "grad_norm": 1.1838749647140503, "learning_rate": 6.9e-05, "loss": 1.0314, "step": 70 }, { "epoch": 0.10355987055016182, "grad_norm": 1.2029207944869995, "learning_rate": 7.900000000000001e-05, "loss": 1.015, "step": 80 }, { "epoch": 0.11650485436893204, "grad_norm": 0.7995015978813171, "learning_rate": 8.900000000000001e-05, "loss": 0.9881, "step": 90 }, { "epoch": 0.12944983818770225, "grad_norm": 0.7544731497764587, "learning_rate": 9.900000000000001e-05, "loss": 0.9749, "step": 100 }, { "epoch": 0.1423948220064725, "grad_norm": 0.9032944440841675, "learning_rate": 9.959441189725102e-05, "loss": 0.9717, "step": 110 }, { "epoch": 0.1553398058252427, "grad_norm": 0.7858815789222717, "learning_rate": 9.914375844975215e-05, "loss": 0.9365, "step": 120 }, { "epoch": 0.16828478964401294, "grad_norm": 0.6408785581588745, "learning_rate": 9.869310500225327e-05, "loss": 1.0061, "step": 130 }, { "epoch": 0.18122977346278318, "grad_norm": 0.7295084595680237, "learning_rate": 9.82424515547544e-05, "loss": 1.0838, "step": 140 }, { "epoch": 0.1941747572815534, "grad_norm": 1.299147129058838, "learning_rate": 9.779179810725552e-05, "loss": 0.8957, "step": 150 }, { "epoch": 0.20711974110032363, "grad_norm": 0.6116259694099426, "learning_rate": 9.734114465975666e-05, "loss": 0.9007, "step": 160 }, { "epoch": 0.22006472491909385, "grad_norm": 1.0018341541290283, "learning_rate": 9.689049121225779e-05, "loss": 0.9559, "step": 170 }, { "epoch": 0.23300970873786409, "grad_norm": 0.6822437047958374, "learning_rate": 9.64398377647589e-05, "loss": 0.8807, "step": 180 }, { "epoch": 0.2459546925566343, "grad_norm": 0.6875489950180054, "learning_rate": 9.598918431726003e-05, "loss": 0.892, "step": 190 }, { "epoch": 0.2588996763754045, "grad_norm": 0.9155406355857849, "learning_rate": 9.553853086976116e-05, "loss": 0.8846, "step": 200 }, { "epoch": 0.27184466019417475, "grad_norm": 0.7715094685554504, "learning_rate": 9.508787742226228e-05, "loss": 0.9194, "step": 210 }, { "epoch": 0.284789644012945, "grad_norm": 0.9338945150375366, "learning_rate": 9.463722397476341e-05, "loss": 0.843, "step": 220 }, { "epoch": 0.2977346278317152, "grad_norm": 0.5477790236473083, "learning_rate": 9.418657052726453e-05, "loss": 0.9608, "step": 230 }, { "epoch": 0.3106796116504854, "grad_norm": 0.7385268211364746, "learning_rate": 9.373591707976567e-05, "loss": 0.8578, "step": 240 }, { "epoch": 0.32362459546925565, "grad_norm": 0.7461796402931213, "learning_rate": 9.32852636322668e-05, "loss": 0.8357, "step": 250 }, { "epoch": 0.3365695792880259, "grad_norm": 0.7043349146842957, "learning_rate": 9.283461018476792e-05, "loss": 0.9199, "step": 260 }, { "epoch": 0.34951456310679613, "grad_norm": 0.6864545941352844, "learning_rate": 9.238395673726904e-05, "loss": 0.8716, "step": 270 }, { "epoch": 0.36245954692556637, "grad_norm": 0.8263736963272095, "learning_rate": 9.193330328977017e-05, "loss": 0.9072, "step": 280 }, { "epoch": 0.37540453074433655, "grad_norm": 0.7197927832603455, "learning_rate": 9.148264984227129e-05, "loss": 0.9276, "step": 290 }, { "epoch": 0.3883495145631068, "grad_norm": 0.769057035446167, "learning_rate": 9.103199639477243e-05, "loss": 0.9371, "step": 300 }, { "epoch": 0.40129449838187703, "grad_norm": 0.6112708449363708, "learning_rate": 9.058134294727354e-05, "loss": 0.841, "step": 310 }, { "epoch": 0.41423948220064727, "grad_norm": 0.6785593628883362, "learning_rate": 9.013068949977468e-05, "loss": 0.8026, "step": 320 }, { "epoch": 0.42718446601941745, "grad_norm": 0.75263911485672, "learning_rate": 8.968003605227581e-05, "loss": 0.8829, "step": 330 }, { "epoch": 0.4401294498381877, "grad_norm": 0.6636873483657837, "learning_rate": 8.922938260477693e-05, "loss": 0.877, "step": 340 }, { "epoch": 0.45307443365695793, "grad_norm": 0.7837623357772827, "learning_rate": 8.877872915727806e-05, "loss": 0.8355, "step": 350 }, { "epoch": 0.46601941747572817, "grad_norm": 0.7330045104026794, "learning_rate": 8.832807570977918e-05, "loss": 0.8869, "step": 360 }, { "epoch": 0.47896440129449835, "grad_norm": 0.7116459012031555, "learning_rate": 8.78774222622803e-05, "loss": 0.8861, "step": 370 }, { "epoch": 0.4919093851132686, "grad_norm": 0.7309095859527588, "learning_rate": 8.742676881478144e-05, "loss": 0.8943, "step": 380 }, { "epoch": 0.5048543689320388, "grad_norm": 0.9951479434967041, "learning_rate": 8.697611536728256e-05, "loss": 0.8906, "step": 390 }, { "epoch": 0.517799352750809, "grad_norm": 0.8258851766586304, "learning_rate": 8.652546191978369e-05, "loss": 0.7817, "step": 400 }, { "epoch": 0.5307443365695793, "grad_norm": 0.8005662560462952, "learning_rate": 8.607480847228482e-05, "loss": 0.8965, "step": 410 }, { "epoch": 0.5436893203883495, "grad_norm": 0.956330418586731, "learning_rate": 8.562415502478594e-05, "loss": 0.8186, "step": 420 }, { "epoch": 0.5566343042071198, "grad_norm": 0.7853320240974426, "learning_rate": 8.517350157728708e-05, "loss": 0.7989, "step": 430 }, { "epoch": 0.56957928802589, "grad_norm": 0.8193638920783997, "learning_rate": 8.47228481297882e-05, "loss": 0.833, "step": 440 }, { "epoch": 0.5825242718446602, "grad_norm": 0.6770658493041992, "learning_rate": 8.427219468228931e-05, "loss": 0.754, "step": 450 }, { "epoch": 0.5954692556634305, "grad_norm": 0.7300212979316711, "learning_rate": 8.382154123479045e-05, "loss": 0.8801, "step": 460 }, { "epoch": 0.6084142394822006, "grad_norm": 0.9311557412147522, "learning_rate": 8.337088778729157e-05, "loss": 0.9497, "step": 470 }, { "epoch": 0.6213592233009708, "grad_norm": 0.8132520914077759, "learning_rate": 8.29202343397927e-05, "loss": 0.7912, "step": 480 }, { "epoch": 0.6343042071197411, "grad_norm": 0.6899361610412598, "learning_rate": 8.246958089229383e-05, "loss": 0.8118, "step": 490 }, { "epoch": 0.6472491909385113, "grad_norm": 0.8400319218635559, "learning_rate": 8.201892744479495e-05, "loss": 0.8223, "step": 500 }, { "epoch": 0.6601941747572816, "grad_norm": 0.6506232023239136, "learning_rate": 8.156827399729609e-05, "loss": 0.8067, "step": 510 }, { "epoch": 0.6731391585760518, "grad_norm": 0.7419421672821045, "learning_rate": 8.111762054979722e-05, "loss": 0.8338, "step": 520 }, { "epoch": 0.686084142394822, "grad_norm": 0.8188750743865967, "learning_rate": 8.066696710229834e-05, "loss": 0.8502, "step": 530 }, { "epoch": 0.6990291262135923, "grad_norm": 0.7666177153587341, "learning_rate": 8.021631365479946e-05, "loss": 0.9033, "step": 540 }, { "epoch": 0.7119741100323624, "grad_norm": 0.7812498211860657, "learning_rate": 7.976566020730059e-05, "loss": 0.8686, "step": 550 }, { "epoch": 0.7249190938511327, "grad_norm": 0.7209528684616089, "learning_rate": 7.931500675980171e-05, "loss": 0.9243, "step": 560 }, { "epoch": 0.7378640776699029, "grad_norm": 0.8110234141349792, "learning_rate": 7.886435331230284e-05, "loss": 0.8535, "step": 570 }, { "epoch": 0.7508090614886731, "grad_norm": 0.9169409871101379, "learning_rate": 7.841369986480396e-05, "loss": 0.7916, "step": 580 }, { "epoch": 0.7637540453074434, "grad_norm": 0.9186325073242188, "learning_rate": 7.79630464173051e-05, "loss": 0.8712, "step": 590 }, { "epoch": 0.7766990291262136, "grad_norm": 0.7394826412200928, "learning_rate": 7.751239296980623e-05, "loss": 0.8319, "step": 600 }, { "epoch": 0.7896440129449838, "grad_norm": 0.9118407964706421, "learning_rate": 7.706173952230735e-05, "loss": 0.8615, "step": 610 }, { "epoch": 0.8025889967637541, "grad_norm": 0.8932146430015564, "learning_rate": 7.661108607480848e-05, "loss": 0.8729, "step": 620 }, { "epoch": 0.8155339805825242, "grad_norm": 0.8682609796524048, "learning_rate": 7.61604326273096e-05, "loss": 0.8196, "step": 630 }, { "epoch": 0.8284789644012945, "grad_norm": 0.7901037931442261, "learning_rate": 7.570977917981072e-05, "loss": 0.8124, "step": 640 }, { "epoch": 0.8414239482200647, "grad_norm": 0.787814199924469, "learning_rate": 7.525912573231186e-05, "loss": 0.7854, "step": 650 }, { "epoch": 0.8543689320388349, "grad_norm": 0.9356242418289185, "learning_rate": 7.480847228481298e-05, "loss": 0.7527, "step": 660 }, { "epoch": 0.8673139158576052, "grad_norm": 0.7236626744270325, "learning_rate": 7.435781883731411e-05, "loss": 0.7956, "step": 670 }, { "epoch": 0.8802588996763754, "grad_norm": 1.0258172750473022, "learning_rate": 7.390716538981524e-05, "loss": 0.7896, "step": 680 }, { "epoch": 0.8932038834951457, "grad_norm": 0.8183866739273071, "learning_rate": 7.345651194231636e-05, "loss": 0.7084, "step": 690 }, { "epoch": 0.9061488673139159, "grad_norm": 1.1878470182418823, "learning_rate": 7.30058584948175e-05, "loss": 0.7582, "step": 700 }, { "epoch": 0.919093851132686, "grad_norm": 0.933775007724762, "learning_rate": 7.255520504731861e-05, "loss": 0.7988, "step": 710 }, { "epoch": 0.9320388349514563, "grad_norm": 0.8619435429573059, "learning_rate": 7.210455159981975e-05, "loss": 0.8472, "step": 720 }, { "epoch": 0.9449838187702265, "grad_norm": 0.8439723253250122, "learning_rate": 7.165389815232087e-05, "loss": 0.7647, "step": 730 }, { "epoch": 0.9579288025889967, "grad_norm": 0.8391640782356262, "learning_rate": 7.120324470482199e-05, "loss": 0.811, "step": 740 }, { "epoch": 0.970873786407767, "grad_norm": 0.8031836748123169, "learning_rate": 7.075259125732312e-05, "loss": 0.7377, "step": 750 }, { "epoch": 0.9838187702265372, "grad_norm": 0.9124151468276978, "learning_rate": 7.030193780982425e-05, "loss": 0.7091, "step": 760 }, { "epoch": 0.9967637540453075, "grad_norm": 1.0887125730514526, "learning_rate": 6.985128436232537e-05, "loss": 0.8262, "step": 770 }, { "epoch": 1.009061488673139, "grad_norm": 0.9555509090423584, "learning_rate": 6.94006309148265e-05, "loss": 0.7195, "step": 780 }, { "epoch": 1.0220064724919093, "grad_norm": 1.1681190729141235, "learning_rate": 6.894997746732763e-05, "loss": 0.6532, "step": 790 }, { "epoch": 1.0349514563106796, "grad_norm": 1.120592713356018, "learning_rate": 6.849932401982876e-05, "loss": 0.6175, "step": 800 }, { "epoch": 1.04789644012945, "grad_norm": 0.9625107645988464, "learning_rate": 6.804867057232989e-05, "loss": 0.6541, "step": 810 }, { "epoch": 1.06084142394822, "grad_norm": 0.9709998965263367, "learning_rate": 6.7598017124831e-05, "loss": 0.6446, "step": 820 }, { "epoch": 1.0737864077669903, "grad_norm": 0.9543795585632324, "learning_rate": 6.714736367733213e-05, "loss": 0.5685, "step": 830 }, { "epoch": 1.0867313915857606, "grad_norm": 1.2367397546768188, "learning_rate": 6.669671022983326e-05, "loss": 0.6747, "step": 840 }, { "epoch": 1.0996763754045307, "grad_norm": 1.0891395807266235, "learning_rate": 6.624605678233438e-05, "loss": 0.6536, "step": 850 }, { "epoch": 1.112621359223301, "grad_norm": 1.1543422937393188, "learning_rate": 6.579540333483552e-05, "loss": 0.7627, "step": 860 }, { "epoch": 1.1255663430420713, "grad_norm": 1.363010048866272, "learning_rate": 6.534474988733664e-05, "loss": 0.7699, "step": 870 }, { "epoch": 1.1385113268608413, "grad_norm": 1.171339511871338, "learning_rate": 6.489409643983777e-05, "loss": 0.6775, "step": 880 }, { "epoch": 1.1514563106796116, "grad_norm": 1.1836340427398682, "learning_rate": 6.44434429923389e-05, "loss": 0.6744, "step": 890 }, { "epoch": 1.164401294498382, "grad_norm": 1.1864937543869019, "learning_rate": 6.399278954484002e-05, "loss": 0.61, "step": 900 }, { "epoch": 1.177346278317152, "grad_norm": 1.3767447471618652, "learning_rate": 6.354213609734114e-05, "loss": 0.7274, "step": 910 }, { "epoch": 1.1902912621359223, "grad_norm": 1.1915229558944702, "learning_rate": 6.309148264984228e-05, "loss": 0.6787, "step": 920 }, { "epoch": 1.2032362459546926, "grad_norm": 1.416157603263855, "learning_rate": 6.26408292023434e-05, "loss": 0.5878, "step": 930 }, { "epoch": 1.2161812297734629, "grad_norm": 1.179671049118042, "learning_rate": 6.219017575484453e-05, "loss": 0.6416, "step": 940 }, { "epoch": 1.229126213592233, "grad_norm": 0.9643092751502991, "learning_rate": 6.173952230734565e-05, "loss": 0.6718, "step": 950 }, { "epoch": 1.2420711974110032, "grad_norm": 1.417594075202942, "learning_rate": 6.128886885984678e-05, "loss": 0.6965, "step": 960 }, { "epoch": 1.2550161812297735, "grad_norm": 1.740006685256958, "learning_rate": 6.083821541234791e-05, "loss": 0.6674, "step": 970 }, { "epoch": 1.2679611650485436, "grad_norm": 1.2844997644424438, "learning_rate": 6.0387561964849034e-05, "loss": 0.7144, "step": 980 }, { "epoch": 1.280906148867314, "grad_norm": 1.012427568435669, "learning_rate": 5.993690851735017e-05, "loss": 0.6602, "step": 990 }, { "epoch": 1.2938511326860842, "grad_norm": 1.6793211698532104, "learning_rate": 5.948625506985128e-05, "loss": 0.6379, "step": 1000 }, { "epoch": 1.3067961165048545, "grad_norm": 1.1831872463226318, "learning_rate": 5.903560162235241e-05, "loss": 0.6309, "step": 1010 }, { "epoch": 1.3197411003236246, "grad_norm": 1.1994572877883911, "learning_rate": 5.858494817485354e-05, "loss": 0.6099, "step": 1020 }, { "epoch": 1.3326860841423949, "grad_norm": 1.3904818296432495, "learning_rate": 5.8134294727354666e-05, "loss": 0.6408, "step": 1030 }, { "epoch": 1.3456310679611652, "grad_norm": 1.3394759893417358, "learning_rate": 5.768364127985579e-05, "loss": 0.6148, "step": 1040 }, { "epoch": 1.3585760517799352, "grad_norm": 1.0991851091384888, "learning_rate": 5.723298783235692e-05, "loss": 0.6353, "step": 1050 }, { "epoch": 1.3715210355987055, "grad_norm": 2.004030227661133, "learning_rate": 5.678233438485805e-05, "loss": 0.6547, "step": 1060 }, { "epoch": 1.3844660194174758, "grad_norm": 1.5414537191390991, "learning_rate": 5.633168093735918e-05, "loss": 0.6684, "step": 1070 }, { "epoch": 1.397411003236246, "grad_norm": 1.4660983085632324, "learning_rate": 5.5881027489860305e-05, "loss": 0.7235, "step": 1080 }, { "epoch": 1.4103559870550162, "grad_norm": 0.6685907244682312, "learning_rate": 5.5430374042361425e-05, "loss": 0.6797, "step": 1090 }, { "epoch": 1.4233009708737865, "grad_norm": 1.2811195850372314, "learning_rate": 5.497972059486255e-05, "loss": 0.6325, "step": 1100 }, { "epoch": 1.4362459546925566, "grad_norm": 1.3929194211959839, "learning_rate": 5.452906714736368e-05, "loss": 0.6982, "step": 1110 }, { "epoch": 1.4491909385113269, "grad_norm": 1.1184651851654053, "learning_rate": 5.4078413699864804e-05, "loss": 0.561, "step": 1120 }, { "epoch": 1.4621359223300971, "grad_norm": 1.3536425828933716, "learning_rate": 5.362776025236593e-05, "loss": 0.606, "step": 1130 }, { "epoch": 1.4750809061488672, "grad_norm": 1.4096359014511108, "learning_rate": 5.3177106804867064e-05, "loss": 0.7083, "step": 1140 }, { "epoch": 1.4880258899676375, "grad_norm": 1.5671049356460571, "learning_rate": 5.272645335736819e-05, "loss": 0.6337, "step": 1150 }, { "epoch": 1.5009708737864078, "grad_norm": 1.3812810182571411, "learning_rate": 5.2275799909869316e-05, "loss": 0.646, "step": 1160 }, { "epoch": 1.5139158576051779, "grad_norm": 1.2574256658554077, "learning_rate": 5.182514646237044e-05, "loss": 0.5943, "step": 1170 }, { "epoch": 1.5268608414239482, "grad_norm": 1.0817134380340576, "learning_rate": 5.137449301487156e-05, "loss": 0.6729, "step": 1180 }, { "epoch": 1.5398058252427185, "grad_norm": 2.198194980621338, "learning_rate": 5.092383956737269e-05, "loss": 0.6537, "step": 1190 }, { "epoch": 1.5527508090614885, "grad_norm": 1.8652335405349731, "learning_rate": 5.0473186119873815e-05, "loss": 0.6486, "step": 1200 }, { "epoch": 1.565695792880259, "grad_norm": 1.6180390119552612, "learning_rate": 5.002253267237494e-05, "loss": 0.567, "step": 1210 }, { "epoch": 1.5786407766990291, "grad_norm": 1.9595000743865967, "learning_rate": 4.9571879224876075e-05, "loss": 0.6248, "step": 1220 }, { "epoch": 1.5915857605177992, "grad_norm": 0.9194151759147644, "learning_rate": 4.91212257773772e-05, "loss": 0.6251, "step": 1230 }, { "epoch": 1.6045307443365697, "grad_norm": 1.026760458946228, "learning_rate": 4.867057232987833e-05, "loss": 0.7088, "step": 1240 }, { "epoch": 1.6174757281553398, "grad_norm": 1.3673216104507446, "learning_rate": 4.821991888237945e-05, "loss": 0.6626, "step": 1250 }, { "epoch": 1.6304207119741099, "grad_norm": 1.1107579469680786, "learning_rate": 4.776926543488058e-05, "loss": 0.7106, "step": 1260 }, { "epoch": 1.6433656957928804, "grad_norm": 0.9585609436035156, "learning_rate": 4.731861198738171e-05, "loss": 0.7389, "step": 1270 }, { "epoch": 1.6563106796116505, "grad_norm": 1.2537648677825928, "learning_rate": 4.686795853988283e-05, "loss": 0.6415, "step": 1280 }, { "epoch": 1.6692556634304208, "grad_norm": 1.3973714113235474, "learning_rate": 4.641730509238396e-05, "loss": 0.5704, "step": 1290 }, { "epoch": 1.682200647249191, "grad_norm": 1.69650399684906, "learning_rate": 4.5966651644885086e-05, "loss": 0.6029, "step": 1300 }, { "epoch": 1.6951456310679611, "grad_norm": 1.6716722249984741, "learning_rate": 4.551599819738621e-05, "loss": 0.582, "step": 1310 }, { "epoch": 1.7080906148867314, "grad_norm": 1.383424997329712, "learning_rate": 4.506534474988734e-05, "loss": 0.6176, "step": 1320 }, { "epoch": 1.7210355987055017, "grad_norm": 1.0892506837844849, "learning_rate": 4.4614691302388465e-05, "loss": 0.5937, "step": 1330 }, { "epoch": 1.7339805825242718, "grad_norm": 1.4017115831375122, "learning_rate": 4.416403785488959e-05, "loss": 0.5447, "step": 1340 }, { "epoch": 1.746925566343042, "grad_norm": 1.332664132118225, "learning_rate": 4.371338440739072e-05, "loss": 0.65, "step": 1350 }, { "epoch": 1.7598705501618124, "grad_norm": 1.4425687789916992, "learning_rate": 4.3262730959891845e-05, "loss": 0.6095, "step": 1360 }, { "epoch": 1.7728155339805824, "grad_norm": 1.7485853433609009, "learning_rate": 4.281207751239297e-05, "loss": 0.6011, "step": 1370 }, { "epoch": 1.7857605177993527, "grad_norm": 1.2251473665237427, "learning_rate": 4.23614240648941e-05, "loss": 0.5049, "step": 1380 }, { "epoch": 1.798705501618123, "grad_norm": 1.543966293334961, "learning_rate": 4.1910770617395224e-05, "loss": 0.5768, "step": 1390 }, { "epoch": 1.811650485436893, "grad_norm": 1.153024435043335, "learning_rate": 4.146011716989635e-05, "loss": 0.606, "step": 1400 }, { "epoch": 1.8245954692556634, "grad_norm": 1.4503074884414673, "learning_rate": 4.1009463722397477e-05, "loss": 0.6068, "step": 1410 }, { "epoch": 1.8375404530744337, "grad_norm": 1.5761051177978516, "learning_rate": 4.055881027489861e-05, "loss": 0.5736, "step": 1420 }, { "epoch": 1.8504854368932038, "grad_norm": 1.4788157939910889, "learning_rate": 4.010815682739973e-05, "loss": 0.6098, "step": 1430 }, { "epoch": 1.863430420711974, "grad_norm": 1.3545244932174683, "learning_rate": 3.9657503379900856e-05, "loss": 0.5781, "step": 1440 }, { "epoch": 1.8763754045307444, "grad_norm": 1.4322385787963867, "learning_rate": 3.920684993240198e-05, "loss": 0.5779, "step": 1450 }, { "epoch": 1.8893203883495144, "grad_norm": 1.4146760702133179, "learning_rate": 3.8756196484903115e-05, "loss": 0.6097, "step": 1460 }, { "epoch": 1.902265372168285, "grad_norm": 1.7581102848052979, "learning_rate": 3.830554303740424e-05, "loss": 0.5672, "step": 1470 }, { "epoch": 1.915210355987055, "grad_norm": 1.5007004737854004, "learning_rate": 3.785488958990536e-05, "loss": 0.5951, "step": 1480 }, { "epoch": 1.928155339805825, "grad_norm": 1.2334699630737305, "learning_rate": 3.740423614240649e-05, "loss": 0.549, "step": 1490 }, { "epoch": 1.9411003236245956, "grad_norm": 0.9463567733764648, "learning_rate": 3.695358269490762e-05, "loss": 0.6448, "step": 1500 }, { "epoch": 1.9540453074433657, "grad_norm": 1.8025217056274414, "learning_rate": 3.650292924740875e-05, "loss": 0.7363, "step": 1510 }, { "epoch": 1.9669902912621358, "grad_norm": 2.9467597007751465, "learning_rate": 3.6052275799909874e-05, "loss": 0.5586, "step": 1520 }, { "epoch": 1.9799352750809063, "grad_norm": 1.8900437355041504, "learning_rate": 3.5601622352410993e-05, "loss": 0.623, "step": 1530 }, { "epoch": 1.9928802588996763, "grad_norm": 1.5157594680786133, "learning_rate": 3.515096890491213e-05, "loss": 0.606, "step": 1540 }, { "epoch": 2.005177993527508, "grad_norm": 1.370686411857605, "learning_rate": 3.470031545741325e-05, "loss": 0.5081, "step": 1550 }, { "epoch": 2.018122977346278, "grad_norm": 1.3025308847427368, "learning_rate": 3.424966200991438e-05, "loss": 0.4448, "step": 1560 }, { "epoch": 2.0310679611650486, "grad_norm": 0.7799197435379028, "learning_rate": 3.37990085624155e-05, "loss": 0.4341, "step": 1570 }, { "epoch": 2.0440129449838187, "grad_norm": 1.375433087348938, "learning_rate": 3.334835511491663e-05, "loss": 0.5702, "step": 1580 }, { "epoch": 2.056957928802589, "grad_norm": 1.4452621936798096, "learning_rate": 3.289770166741776e-05, "loss": 0.455, "step": 1590 }, { "epoch": 2.0699029126213593, "grad_norm": 1.0591763257980347, "learning_rate": 3.2447048219918885e-05, "loss": 0.5472, "step": 1600 }, { "epoch": 2.0828478964401294, "grad_norm": 1.366735816001892, "learning_rate": 3.199639477242001e-05, "loss": 0.5011, "step": 1610 }, { "epoch": 2.0957928802589, "grad_norm": 2.3307456970214844, "learning_rate": 3.154574132492114e-05, "loss": 0.5293, "step": 1620 }, { "epoch": 2.10873786407767, "grad_norm": 1.5701334476470947, "learning_rate": 3.1095087877422264e-05, "loss": 0.5571, "step": 1630 }, { "epoch": 2.12168284789644, "grad_norm": 1.585015892982483, "learning_rate": 3.064443442992339e-05, "loss": 0.4914, "step": 1640 }, { "epoch": 2.1346278317152105, "grad_norm": 1.5167232751846313, "learning_rate": 3.0193780982424517e-05, "loss": 0.4462, "step": 1650 }, { "epoch": 2.1475728155339806, "grad_norm": 1.8519450426101685, "learning_rate": 2.974312753492564e-05, "loss": 0.5367, "step": 1660 }, { "epoch": 2.1605177993527507, "grad_norm": 1.1500009298324585, "learning_rate": 2.929247408742677e-05, "loss": 0.4091, "step": 1670 }, { "epoch": 2.173462783171521, "grad_norm": 1.9004416465759277, "learning_rate": 2.8841820639927896e-05, "loss": 0.4752, "step": 1680 }, { "epoch": 2.1864077669902913, "grad_norm": 1.588977575302124, "learning_rate": 2.8391167192429026e-05, "loss": 0.4229, "step": 1690 }, { "epoch": 2.1993527508090613, "grad_norm": 2.033543825149536, "learning_rate": 2.7940513744930153e-05, "loss": 0.5575, "step": 1700 }, { "epoch": 2.212297734627832, "grad_norm": 1.1972986459732056, "learning_rate": 2.7489860297431276e-05, "loss": 0.4302, "step": 1710 }, { "epoch": 2.225242718446602, "grad_norm": 0.8637037873268127, "learning_rate": 2.7039206849932402e-05, "loss": 0.4924, "step": 1720 }, { "epoch": 2.238187702265372, "grad_norm": 1.6572158336639404, "learning_rate": 2.6588553402433532e-05, "loss": 0.5, "step": 1730 }, { "epoch": 2.2511326860841425, "grad_norm": 1.8110625743865967, "learning_rate": 2.6137899954934658e-05, "loss": 0.4361, "step": 1740 }, { "epoch": 2.2640776699029126, "grad_norm": 1.676248550415039, "learning_rate": 2.568724650743578e-05, "loss": 0.4407, "step": 1750 }, { "epoch": 2.2770226537216827, "grad_norm": 1.149107813835144, "learning_rate": 2.5236593059936908e-05, "loss": 0.5472, "step": 1760 }, { "epoch": 2.289967637540453, "grad_norm": 1.7543455362319946, "learning_rate": 2.4785939612438037e-05, "loss": 0.5396, "step": 1770 }, { "epoch": 2.3029126213592233, "grad_norm": 2.7097597122192383, "learning_rate": 2.4335286164939164e-05, "loss": 0.5013, "step": 1780 }, { "epoch": 2.3158576051779933, "grad_norm": 1.9181076288223267, "learning_rate": 2.388463271744029e-05, "loss": 0.4476, "step": 1790 }, { "epoch": 2.328802588996764, "grad_norm": 1.1099668741226196, "learning_rate": 2.3433979269941417e-05, "loss": 0.4113, "step": 1800 }, { "epoch": 2.341747572815534, "grad_norm": 1.5468546152114868, "learning_rate": 2.2983325822442543e-05, "loss": 0.4629, "step": 1810 }, { "epoch": 2.354692556634304, "grad_norm": 1.149834394454956, "learning_rate": 2.253267237494367e-05, "loss": 0.434, "step": 1820 }, { "epoch": 2.3676375404530745, "grad_norm": 1.4918863773345947, "learning_rate": 2.2082018927444796e-05, "loss": 0.4554, "step": 1830 }, { "epoch": 2.3805825242718446, "grad_norm": 1.610051155090332, "learning_rate": 2.1631365479945922e-05, "loss": 0.4081, "step": 1840 }, { "epoch": 2.3935275080906147, "grad_norm": 1.377886176109314, "learning_rate": 2.118071203244705e-05, "loss": 0.4531, "step": 1850 }, { "epoch": 2.406472491909385, "grad_norm": 1.3633161783218384, "learning_rate": 2.0730058584948175e-05, "loss": 0.4767, "step": 1860 }, { "epoch": 2.4194174757281552, "grad_norm": 2.3759074211120605, "learning_rate": 2.0279405137449305e-05, "loss": 0.5155, "step": 1870 }, { "epoch": 2.4323624595469258, "grad_norm": 1.634912133216858, "learning_rate": 1.9828751689950428e-05, "loss": 0.4279, "step": 1880 }, { "epoch": 2.445307443365696, "grad_norm": 1.3101922273635864, "learning_rate": 1.9378098242451558e-05, "loss": 0.4284, "step": 1890 }, { "epoch": 2.458252427184466, "grad_norm": 1.2841248512268066, "learning_rate": 1.892744479495268e-05, "loss": 0.5135, "step": 1900 }, { "epoch": 2.4711974110032364, "grad_norm": 2.4284558296203613, "learning_rate": 1.847679134745381e-05, "loss": 0.4455, "step": 1910 }, { "epoch": 2.4841423948220065, "grad_norm": 1.589353084564209, "learning_rate": 1.8026137899954937e-05, "loss": 0.484, "step": 1920 }, { "epoch": 2.4970873786407766, "grad_norm": 1.588586688041687, "learning_rate": 1.7575484452456063e-05, "loss": 0.5047, "step": 1930 }, { "epoch": 2.510032362459547, "grad_norm": 1.8486393690109253, "learning_rate": 1.712483100495719e-05, "loss": 0.4384, "step": 1940 }, { "epoch": 2.522977346278317, "grad_norm": 1.9914798736572266, "learning_rate": 1.6674177557458316e-05, "loss": 0.4324, "step": 1950 }, { "epoch": 2.5359223300970872, "grad_norm": 1.3951258659362793, "learning_rate": 1.6223524109959443e-05, "loss": 0.5316, "step": 1960 }, { "epoch": 2.5488673139158577, "grad_norm": 1.471160888671875, "learning_rate": 1.577287066246057e-05, "loss": 0.4943, "step": 1970 }, { "epoch": 2.561812297734628, "grad_norm": 1.5667455196380615, "learning_rate": 1.5322217214961695e-05, "loss": 0.5354, "step": 1980 }, { "epoch": 2.574757281553398, "grad_norm": 1.5954725742340088, "learning_rate": 1.487156376746282e-05, "loss": 0.4484, "step": 1990 }, { "epoch": 2.5877022653721684, "grad_norm": 1.3914281129837036, "learning_rate": 1.4420910319963948e-05, "loss": 0.4617, "step": 2000 } ], "logging_steps": 10, "max_steps": 2319, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.084236146533417e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }