{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000.0, "global_step": 1120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008928571428571428, "grad_norm": 7.46875, "learning_rate": 3.5714285714285716e-07, "loss": 1.9798583984375, "step": 1, "token_acc": 0.5198793098029355 }, { "epoch": 0.004464285714285714, "grad_norm": 6.84375, "learning_rate": 1.7857142857142859e-06, "loss": 1.942692756652832, "step": 5, "token_acc": 0.5277546839276701 }, { "epoch": 0.008928571428571428, "grad_norm": 6.0625, "learning_rate": 3.5714285714285718e-06, "loss": 1.9170162200927734, "step": 10, "token_acc": 0.5350005891363262 }, { "epoch": 0.013392857142857142, "grad_norm": 4.46875, "learning_rate": 5.357142857142857e-06, "loss": 1.8599681854248047, "step": 15, "token_acc": 0.5440987793825112 }, { "epoch": 0.017857142857142856, "grad_norm": 3.375, "learning_rate": 7.1428571428571436e-06, "loss": 1.773095703125, "step": 20, "token_acc": 0.558397212543554 }, { "epoch": 0.022321428571428572, "grad_norm": 2.59375, "learning_rate": 8.92857142857143e-06, "loss": 1.7239681243896485, "step": 25, "token_acc": 0.564640616036101 }, { "epoch": 0.026785714285714284, "grad_norm": 2.203125, "learning_rate": 1.0714285714285714e-05, "loss": 1.662449264526367, "step": 30, "token_acc": 0.5744314346237109 }, { "epoch": 0.03125, "grad_norm": 1.8125, "learning_rate": 1.25e-05, "loss": 1.565970230102539, "step": 35, "token_acc": 0.5905128053468978 }, { "epoch": 0.03571428571428571, "grad_norm": 1.5390625, "learning_rate": 1.4285714285714287e-05, "loss": 1.4979233741760254, "step": 40, "token_acc": 0.6035646975846081 }, { "epoch": 0.04017857142857143, "grad_norm": 1.46875, "learning_rate": 1.6071428571428572e-05, "loss": 1.4658918380737305, "step": 45, "token_acc": 0.6079301236643054 }, { "epoch": 0.044642857142857144, "grad_norm": 1.4296875, "learning_rate": 1.785714285714286e-05, "loss": 1.422969436645508, "step": 50, "token_acc": 0.6154094170613891 }, { "epoch": 0.049107142857142856, "grad_norm": 1.4609375, "learning_rate": 1.9642857142857145e-05, "loss": 1.3724126815795898, "step": 55, "token_acc": 0.6238321824406662 }, { "epoch": 0.05357142857142857, "grad_norm": 1.4609375, "learning_rate": 1.9999302568709548e-05, "loss": 1.3787870407104492, "step": 60, "token_acc": 0.6205635902788131 }, { "epoch": 0.05803571428571429, "grad_norm": 1.4296875, "learning_rate": 1.999646942081983e-05, "loss": 1.3338838577270509, "step": 65, "token_acc": 0.628516364022137 }, { "epoch": 0.0625, "grad_norm": 1.390625, "learning_rate": 1.999145758387301e-05, "loss": 1.3212929725646974, "step": 70, "token_acc": 0.6295137500294056 }, { "epoch": 0.06696428571428571, "grad_norm": 1.4140625, "learning_rate": 1.998426815017817e-05, "loss": 1.290837574005127, "step": 75, "token_acc": 0.6369310469506665 }, { "epoch": 0.07142857142857142, "grad_norm": 1.3828125, "learning_rate": 1.997490268664256e-05, "loss": 1.2963342666625977, "step": 80, "token_acc": 0.6334583410737507 }, { "epoch": 0.07589285714285714, "grad_norm": 1.4453125, "learning_rate": 1.996336323443013e-05, "loss": 1.2906134605407715, "step": 85, "token_acc": 0.6352960586401749 }, { "epoch": 0.08035714285714286, "grad_norm": 1.390625, "learning_rate": 1.9949652308516635e-05, "loss": 1.2730415344238282, "step": 90, "token_acc": 0.6381611754746083 }, { "epoch": 0.08482142857142858, "grad_norm": 1.4140625, "learning_rate": 1.9933772897141525e-05, "loss": 1.2450992584228515, "step": 95, "token_acc": 0.6466158412384333 }, { "epoch": 0.08928571428571429, "grad_norm": 1.4140625, "learning_rate": 1.991572846115666e-05, "loss": 1.2471370697021484, "step": 100, "token_acc": 0.6422763263951449 }, { "epoch": 0.09375, "grad_norm": 1.3984375, "learning_rate": 1.9895522933272028e-05, "loss": 1.2441038131713866, "step": 105, "token_acc": 0.6425760561463307 }, { "epoch": 0.09821428571428571, "grad_norm": 1.4140625, "learning_rate": 1.9873160717198655e-05, "loss": 1.230722427368164, "step": 110, "token_acc": 0.6461212036192747 }, { "epoch": 0.10267857142857142, "grad_norm": 1.4375, "learning_rate": 1.9848646686688798e-05, "loss": 1.2274154663085937, "step": 115, "token_acc": 0.645449105054132 }, { "epoch": 0.10714285714285714, "grad_norm": 1.421875, "learning_rate": 1.9821986184473757e-05, "loss": 1.2445575714111328, "step": 120, "token_acc": 0.640380009025793 }, { "epoch": 0.11160714285714286, "grad_norm": 1.375, "learning_rate": 1.9793185021099426e-05, "loss": 1.2152713775634765, "step": 125, "token_acc": 0.650086875941156 }, { "epoch": 0.11607142857142858, "grad_norm": 1.421875, "learning_rate": 1.9762249473659936e-05, "loss": 1.2223292350769044, "step": 130, "token_acc": 0.646190244076421 }, { "epoch": 0.12053571428571429, "grad_norm": 1.34375, "learning_rate": 1.9729186284429567e-05, "loss": 1.20882568359375, "step": 135, "token_acc": 0.6505111612683495 }, { "epoch": 0.125, "grad_norm": 1.4765625, "learning_rate": 1.9694002659393306e-05, "loss": 1.198903465270996, "step": 140, "token_acc": 0.6506264468770722 }, { "epoch": 0.12946428571428573, "grad_norm": 1.453125, "learning_rate": 1.965670626667633e-05, "loss": 1.18528413772583, "step": 145, "token_acc": 0.6555728824688115 }, { "epoch": 0.13392857142857142, "grad_norm": 1.46875, "learning_rate": 1.9617305234872773e-05, "loss": 1.2123258590698243, "step": 150, "token_acc": 0.648010957945968 }, { "epoch": 0.13839285714285715, "grad_norm": 1.3984375, "learning_rate": 1.9575808151274133e-05, "loss": 1.20123291015625, "step": 155, "token_acc": 0.6520161821001254 }, { "epoch": 0.14285714285714285, "grad_norm": 1.453125, "learning_rate": 1.9532224059997693e-05, "loss": 1.2176162719726562, "step": 160, "token_acc": 0.6464380895343614 }, { "epoch": 0.14732142857142858, "grad_norm": 1.375, "learning_rate": 1.948656246001542e-05, "loss": 1.1905139923095702, "step": 165, "token_acc": 0.6515007571439264 }, { "epoch": 0.15178571428571427, "grad_norm": 1.34375, "learning_rate": 1.9438833303083677e-05, "loss": 1.2004039764404297, "step": 170, "token_acc": 0.650028138976926 }, { "epoch": 0.15625, "grad_norm": 1.3515625, "learning_rate": 1.9389046991574298e-05, "loss": 1.1866092681884766, "step": 175, "token_acc": 0.6536794480531047 }, { "epoch": 0.16071428571428573, "grad_norm": 1.359375, "learning_rate": 1.9337214376207417e-05, "loss": 1.1794092178344726, "step": 180, "token_acc": 0.6547250738292786 }, { "epoch": 0.16517857142857142, "grad_norm": 1.3515625, "learning_rate": 1.9283346753686625e-05, "loss": 1.1931296348571778, "step": 185, "token_acc": 0.6512469949273786 }, { "epoch": 0.16964285714285715, "grad_norm": 1.4375, "learning_rate": 1.922745586423687e-05, "loss": 1.175852394104004, "step": 190, "token_acc": 0.6550968319583254 }, { "epoch": 0.17410714285714285, "grad_norm": 1.359375, "learning_rate": 1.9169553889045732e-05, "loss": 1.1729495048522949, "step": 195, "token_acc": 0.6564944949906185 }, { "epoch": 0.17857142857142858, "grad_norm": 1.296875, "learning_rate": 1.9109653447608607e-05, "loss": 1.1875883102416993, "step": 200, "token_acc": 0.6519358095416979 }, { "epoch": 0.18303571428571427, "grad_norm": 1.453125, "learning_rate": 1.9047767594978308e-05, "loss": 1.1628761291503906, "step": 205, "token_acc": 0.6568504306122326 }, { "epoch": 0.1875, "grad_norm": 1.4375, "learning_rate": 1.898390981891979e-05, "loss": 1.1821978569030762, "step": 210, "token_acc": 0.6514381801603373 }, { "epoch": 0.19196428571428573, "grad_norm": 1.3671875, "learning_rate": 1.891809403697054e-05, "loss": 1.1805411338806153, "step": 215, "token_acc": 0.6520954938127481 }, { "epoch": 0.19642857142857142, "grad_norm": 1.3828125, "learning_rate": 1.885033459340731e-05, "loss": 1.1788909912109375, "step": 220, "token_acc": 0.6525619317127482 }, { "epoch": 0.20089285714285715, "grad_norm": 1.3359375, "learning_rate": 1.8780646256119843e-05, "loss": 1.1617810249328613, "step": 225, "token_acc": 0.6574105324410824 }, { "epoch": 0.20535714285714285, "grad_norm": 1.4453125, "learning_rate": 1.8709044213392265e-05, "loss": 1.1675668716430665, "step": 230, "token_acc": 0.6567929066970355 }, { "epoch": 0.20982142857142858, "grad_norm": 1.3671875, "learning_rate": 1.8635544070592876e-05, "loss": 1.1429882049560547, "step": 235, "token_acc": 0.6626304648850445 }, { "epoch": 0.21428571428571427, "grad_norm": 1.46875, "learning_rate": 1.8560161846773002e-05, "loss": 1.1523635864257813, "step": 240, "token_acc": 0.6588007349650185 }, { "epoch": 0.21875, "grad_norm": 1.3828125, "learning_rate": 1.8482913971175737e-05, "loss": 1.1638723373413087, "step": 245, "token_acc": 0.6575461339567173 }, { "epoch": 0.22321428571428573, "grad_norm": 1.3828125, "learning_rate": 1.8403817279655237e-05, "loss": 1.1588207244873048, "step": 250, "token_acc": 0.6566742318540298 }, { "epoch": 0.22767857142857142, "grad_norm": 1.375, "learning_rate": 1.8322889011007424e-05, "loss": 1.1697870254516602, "step": 255, "token_acc": 0.6530151356474628 }, { "epoch": 0.23214285714285715, "grad_norm": 1.375, "learning_rate": 1.8240146803212854e-05, "loss": 1.163081169128418, "step": 260, "token_acc": 0.654347747769385 }, { "epoch": 0.23660714285714285, "grad_norm": 1.3515625, "learning_rate": 1.8155608689592604e-05, "loss": 1.1553804397583007, "step": 265, "token_acc": 0.656771626981819 }, { "epoch": 0.24107142857142858, "grad_norm": 1.3984375, "learning_rate": 1.8069293094877974e-05, "loss": 1.1408929824829102, "step": 270, "token_acc": 0.6625453583525501 }, { "epoch": 0.24553571428571427, "grad_norm": 1.34375, "learning_rate": 1.7981218831194904e-05, "loss": 1.1588726043701172, "step": 275, "token_acc": 0.6563144046706936 }, { "epoch": 0.25, "grad_norm": 1.3671875, "learning_rate": 1.789140509396394e-05, "loss": 1.1532201766967773, "step": 280, "token_acc": 0.6581244540600577 }, { "epoch": 0.2544642857142857, "grad_norm": 1.3515625, "learning_rate": 1.7799871457716665e-05, "loss": 1.1504724502563477, "step": 285, "token_acc": 0.6587934607950444 }, { "epoch": 0.25892857142857145, "grad_norm": 1.359375, "learning_rate": 1.770663787182954e-05, "loss": 1.1456276893615722, "step": 290, "token_acc": 0.6618624162066538 }, { "epoch": 0.26339285714285715, "grad_norm": 1.3125, "learning_rate": 1.7611724656175982e-05, "loss": 1.1442519187927247, "step": 295, "token_acc": 0.6591544798140778 }, { "epoch": 0.26785714285714285, "grad_norm": 1.375, "learning_rate": 1.7515152496697765e-05, "loss": 1.152108383178711, "step": 300, "token_acc": 0.6586486613213298 }, { "epoch": 0.27232142857142855, "grad_norm": 1.3515625, "learning_rate": 1.7416942440896577e-05, "loss": 1.1452009201049804, "step": 305, "token_acc": 0.6591358039294544 }, { "epoch": 0.2767857142857143, "grad_norm": 1.34375, "learning_rate": 1.7317115893246833e-05, "loss": 1.1422765731811524, "step": 310, "token_acc": 0.6589653054958551 }, { "epoch": 0.28125, "grad_norm": 1.3515625, "learning_rate": 1.7215694610530624e-05, "loss": 1.1257204055786132, "step": 315, "token_acc": 0.6644322342183476 }, { "epoch": 0.2857142857142857, "grad_norm": 1.3984375, "learning_rate": 1.7112700697095955e-05, "loss": 1.150202178955078, "step": 320, "token_acc": 0.657143724020753 }, { "epoch": 0.29017857142857145, "grad_norm": 1.265625, "learning_rate": 1.7008156600039157e-05, "loss": 1.1264986038208007, "step": 325, "token_acc": 0.6647021643690532 }, { "epoch": 0.29464285714285715, "grad_norm": 1.390625, "learning_rate": 1.690208510431267e-05, "loss": 1.1261561393737793, "step": 330, "token_acc": 0.6645312870725624 }, { "epoch": 0.29910714285714285, "grad_norm": 1.421875, "learning_rate": 1.6794509327759132e-05, "loss": 1.1390050888061523, "step": 335, "token_acc": 0.6606452523992492 }, { "epoch": 0.30357142857142855, "grad_norm": 1.34375, "learning_rate": 1.6685452716072946e-05, "loss": 1.128352451324463, "step": 340, "token_acc": 0.6637220739566702 }, { "epoch": 0.3080357142857143, "grad_norm": 1.359375, "learning_rate": 1.6574939037690394e-05, "loss": 1.1525999069213868, "step": 345, "token_acc": 0.6566719096261906 }, { "epoch": 0.3125, "grad_norm": 1.3671875, "learning_rate": 1.646299237860941e-05, "loss": 1.1295086860656738, "step": 350, "token_acc": 0.6623984060840936 }, { "epoch": 0.3169642857142857, "grad_norm": 1.375, "learning_rate": 1.634963713714012e-05, "loss": 1.1322021484375, "step": 355, "token_acc": 0.6617089233387465 }, { "epoch": 0.32142857142857145, "grad_norm": 1.390625, "learning_rate": 1.6234898018587336e-05, "loss": 1.1252225875854491, "step": 360, "token_acc": 0.6641228446029669 }, { "epoch": 0.32589285714285715, "grad_norm": 1.328125, "learning_rate": 1.6118800029866157e-05, "loss": 1.1185049057006835, "step": 365, "token_acc": 0.6656541637014795 }, { "epoch": 0.33035714285714285, "grad_norm": 1.3515625, "learning_rate": 1.600136847405179e-05, "loss": 1.1188979148864746, "step": 370, "token_acc": 0.6649322180559561 }, { "epoch": 0.33482142857142855, "grad_norm": 1.7421875, "learning_rate": 1.5882628944864862e-05, "loss": 1.1338905334472655, "step": 375, "token_acc": 0.6627974764645106 }, { "epoch": 0.3392857142857143, "grad_norm": 1.328125, "learning_rate": 1.5762607321093368e-05, "loss": 1.1297473907470703, "step": 380, "token_acc": 0.662811429370499 }, { "epoch": 0.34375, "grad_norm": 1.3359375, "learning_rate": 1.5641329760952514e-05, "loss": 1.1086080551147461, "step": 385, "token_acc": 0.6686927987727868 }, { "epoch": 0.3482142857142857, "grad_norm": 1.3203125, "learning_rate": 1.5518822696383612e-05, "loss": 1.1249174118041991, "step": 390, "token_acc": 0.6625904370106592 }, { "epoch": 0.35267857142857145, "grad_norm": 1.3203125, "learning_rate": 1.539511282729338e-05, "loss": 1.1013822555541992, "step": 395, "token_acc": 0.670699511008058 }, { "epoch": 0.35714285714285715, "grad_norm": 1.3515625, "learning_rate": 1.527022711573479e-05, "loss": 1.1248649597167968, "step": 400, "token_acc": 0.6626294342018012 }, { "epoch": 0.36160714285714285, "grad_norm": 1.28125, "learning_rate": 1.51441927800308e-05, "loss": 1.119422721862793, "step": 405, "token_acc": 0.6646745395834887 }, { "epoch": 0.36607142857142855, "grad_norm": 1.359375, "learning_rate": 1.5017037288842238e-05, "loss": 1.1286213874816895, "step": 410, "token_acc": 0.6624454043387759 }, { "epoch": 0.3705357142857143, "grad_norm": 1.375, "learning_rate": 1.4888788355181128e-05, "loss": 1.1240810394287108, "step": 415, "token_acc": 0.661629880136473 }, { "epoch": 0.375, "grad_norm": 1.296875, "learning_rate": 1.4759473930370738e-05, "loss": 1.122232151031494, "step": 420, "token_acc": 0.6647913929040736 }, { "epoch": 0.3794642857142857, "grad_norm": 1.3359375, "learning_rate": 1.4629122197953716e-05, "loss": 1.127860927581787, "step": 425, "token_acc": 0.6610134748138546 }, { "epoch": 0.38392857142857145, "grad_norm": 1.328125, "learning_rate": 1.4497761567549602e-05, "loss": 1.1123634338378907, "step": 430, "token_acc": 0.6672016743978223 }, { "epoch": 0.38839285714285715, "grad_norm": 1.3046875, "learning_rate": 1.4365420668663075e-05, "loss": 1.1055331230163574, "step": 435, "token_acc": 0.667418617766589 }, { "epoch": 0.39285714285714285, "grad_norm": 1.4140625, "learning_rate": 1.4232128344444251e-05, "loss": 1.1390070915222168, "step": 440, "token_acc": 0.6599930821407843 }, { "epoch": 0.39732142857142855, "grad_norm": 1.3359375, "learning_rate": 1.4097913645402463e-05, "loss": 1.119845199584961, "step": 445, "token_acc": 0.6643133551388953 }, { "epoch": 0.4017857142857143, "grad_norm": 1.3359375, "learning_rate": 1.396280582307481e-05, "loss": 1.1190576553344727, "step": 450, "token_acc": 0.6644705200637404 }, { "epoch": 0.40625, "grad_norm": 1.265625, "learning_rate": 1.3826834323650899e-05, "loss": 1.114608383178711, "step": 455, "token_acc": 0.6656076250992852 }, { "epoch": 0.4107142857142857, "grad_norm": 1.375, "learning_rate": 1.369002878155519e-05, "loss": 1.122308349609375, "step": 460, "token_acc": 0.6635312122906672 }, { "epoch": 0.41517857142857145, "grad_norm": 1.34375, "learning_rate": 1.3552419012988284e-05, "loss": 1.112066650390625, "step": 465, "token_acc": 0.6657017205282056 }, { "epoch": 0.41964285714285715, "grad_norm": 1.296875, "learning_rate": 1.3414035009428598e-05, "loss": 1.1216192245483398, "step": 470, "token_acc": 0.663814267923857 }, { "epoch": 0.42410714285714285, "grad_norm": 1.34375, "learning_rate": 1.3274906931095863e-05, "loss": 1.1180400848388672, "step": 475, "token_acc": 0.6634265241395513 }, { "epoch": 0.42857142857142855, "grad_norm": 1.2890625, "learning_rate": 1.3135065100377816e-05, "loss": 1.1156521797180177, "step": 480, "token_acc": 0.664764806569064 }, { "epoch": 0.4330357142857143, "grad_norm": 1.3125, "learning_rate": 1.2994539995221564e-05, "loss": 1.115281867980957, "step": 485, "token_acc": 0.6655095086932463 }, { "epoch": 0.4375, "grad_norm": 1.3515625, "learning_rate": 1.2853362242491054e-05, "loss": 1.1114022254943847, "step": 490, "token_acc": 0.6649909487491468 }, { "epoch": 0.4419642857142857, "grad_norm": 1.2421875, "learning_rate": 1.2711562611292063e-05, "loss": 1.1045937538146973, "step": 495, "token_acc": 0.6681451286030272 }, { "epoch": 0.44642857142857145, "grad_norm": 1.3828125, "learning_rate": 1.2569172006266192e-05, "loss": 1.1057794570922852, "step": 500, "token_acc": 0.6668334494856537 }, { "epoch": 0.45089285714285715, "grad_norm": 1.3359375, "learning_rate": 1.2426221460855352e-05, "loss": 1.1071063041687013, "step": 505, "token_acc": 0.6664125673607487 }, { "epoch": 0.45535714285714285, "grad_norm": 1.265625, "learning_rate": 1.2282742130538121e-05, "loss": 1.0995834350585938, "step": 510, "token_acc": 0.6681870274068059 }, { "epoch": 0.45982142857142855, "grad_norm": 1.34375, "learning_rate": 1.2138765286039573e-05, "loss": 1.1039738655090332, "step": 515, "token_acc": 0.668202407915127 }, { "epoch": 0.4642857142857143, "grad_norm": 1.2890625, "learning_rate": 1.1994322306515926e-05, "loss": 1.11993465423584, "step": 520, "token_acc": 0.6619411576211444 }, { "epoch": 0.46875, "grad_norm": 1.3046875, "learning_rate": 1.1849444672715587e-05, "loss": 1.0845392227172852, "step": 525, "token_acc": 0.6731254973876336 }, { "epoch": 0.4732142857142857, "grad_norm": 1.2109375, "learning_rate": 1.1704163960118069e-05, "loss": 1.0899698257446289, "step": 530, "token_acc": 0.6714283226626576 }, { "epoch": 0.47767857142857145, "grad_norm": 1.3828125, "learning_rate": 1.155851183205224e-05, "loss": 1.1180584907531739, "step": 535, "token_acc": 0.6640847703163699 }, { "epoch": 0.48214285714285715, "grad_norm": 1.265625, "learning_rate": 1.141252003279542e-05, "loss": 1.0958803176879883, "step": 540, "token_acc": 0.6683001406764094 }, { "epoch": 0.48660714285714285, "grad_norm": 1.25, "learning_rate": 1.1266220380654862e-05, "loss": 1.0876192092895507, "step": 545, "token_acc": 0.674294498100293 }, { "epoch": 0.49107142857142855, "grad_norm": 1.3203125, "learning_rate": 1.1119644761033079e-05, "loss": 1.1089275360107422, "step": 550, "token_acc": 0.6656800260519644 }, { "epoch": 0.4955357142857143, "grad_norm": 1.3203125, "learning_rate": 1.097282511947855e-05, "loss": 1.105890655517578, "step": 555, "token_acc": 0.6669049856720307 }, { "epoch": 0.5, "grad_norm": 1.234375, "learning_rate": 1.0825793454723325e-05, "loss": 1.0942912101745605, "step": 560, "token_acc": 0.6705247046279258 }, { "epoch": 0.5044642857142857, "grad_norm": 1.3046875, "learning_rate": 1.0678581811709025e-05, "loss": 1.0961302757263183, "step": 565, "token_acc": 0.6700240310483472 }, { "epoch": 0.5089285714285714, "grad_norm": 1.3359375, "learning_rate": 1.0531222274602795e-05, "loss": 1.0944637298583983, "step": 570, "token_acc": 0.6694562628384824 }, { "epoch": 0.5133928571428571, "grad_norm": 1.2578125, "learning_rate": 1.0383746959804672e-05, "loss": 1.0805794715881347, "step": 575, "token_acc": 0.6751718465425063 }, { "epoch": 0.5178571428571429, "grad_norm": 1.2578125, "learning_rate": 1.023618800894798e-05, "loss": 1.0854421615600587, "step": 580, "token_acc": 0.6732954215748854 }, { "epoch": 0.5223214285714286, "grad_norm": 1.2890625, "learning_rate": 1.0088577581894154e-05, "loss": 1.0917674064636231, "step": 585, "token_acc": 0.670189745515675 }, { "epoch": 0.5267857142857143, "grad_norm": 1.3203125, "learning_rate": 9.94094784972367e-06, "loss": 1.0841856002807617, "step": 590, "token_acc": 0.6734028220945898 }, { "epoch": 0.53125, "grad_norm": 1.296875, "learning_rate": 9.79333098772446e-06, "loss": 1.1098053932189942, "step": 595, "token_acc": 0.6651632518802563 }, { "epoch": 0.5357142857142857, "grad_norm": 1.2421875, "learning_rate": 9.645759168379463e-06, "loss": 1.1012333869934081, "step": 600, "token_acc": 0.667337838941999 }, { "epoch": 0.5401785714285714, "grad_norm": 1.3125, "learning_rate": 9.498264554354761e-06, "loss": 1.0885583877563476, "step": 605, "token_acc": 0.6714689082367089 }, { "epoch": 0.5446428571428571, "grad_norm": 1.3046875, "learning_rate": 9.350879291489848e-06, "loss": 1.104905128479004, "step": 610, "token_acc": 0.6663167322052396 }, { "epoch": 0.5491071428571429, "grad_norm": 1.2734375, "learning_rate": 9.203635501791595e-06, "loss": 1.1066022872924806, "step": 615, "token_acc": 0.6652294646322893 }, { "epoch": 0.5535714285714286, "grad_norm": 1.2890625, "learning_rate": 9.056565276433378e-06, "loss": 1.1055733680725097, "step": 620, "token_acc": 0.6665237252904985 }, { "epoch": 0.5580357142857143, "grad_norm": 1.3515625, "learning_rate": 8.909700668760945e-06, "loss": 1.1153239250183105, "step": 625, "token_acc": 0.6633878551716786 }, { "epoch": 0.5625, "grad_norm": 1.265625, "learning_rate": 8.763073687306523e-06, "loss": 1.1114237785339356, "step": 630, "token_acc": 0.665496981051093 }, { "epoch": 0.5669642857142857, "grad_norm": 1.3203125, "learning_rate": 8.616716288812694e-06, "loss": 1.0952648162841796, "step": 635, "token_acc": 0.6696672683549216 }, { "epoch": 0.5714285714285714, "grad_norm": 1.28125, "learning_rate": 8.47066037126754e-06, "loss": 1.1008512496948242, "step": 640, "token_acc": 0.6676704190666269 }, { "epoch": 0.5758928571428571, "grad_norm": 1.3515625, "learning_rate": 8.324937766952638e-06, "loss": 1.1116990089416503, "step": 645, "token_acc": 0.6651052619143255 }, { "epoch": 0.5803571428571429, "grad_norm": 1.3046875, "learning_rate": 8.17958023550531e-06, "loss": 1.1096595764160155, "step": 650, "token_acc": 0.6650905800358365 }, { "epoch": 0.5848214285714286, "grad_norm": 1.2890625, "learning_rate": 8.03461945699677e-06, "loss": 1.0997350692749024, "step": 655, "token_acc": 0.66805490553854 }, { "epoch": 0.5892857142857143, "grad_norm": 1.2265625, "learning_rate": 7.89008702502758e-06, "loss": 1.1013197898864746, "step": 660, "token_acc": 0.6695434588695944 }, { "epoch": 0.59375, "grad_norm": 1.3359375, "learning_rate": 7.746014439841941e-06, "loss": 1.1132354736328125, "step": 665, "token_acc": 0.6643929538342533 }, { "epoch": 0.5982142857142857, "grad_norm": 1.265625, "learning_rate": 7.602433101462351e-06, "loss": 1.1085187911987304, "step": 670, "token_acc": 0.6662411498410222 }, { "epoch": 0.6026785714285714, "grad_norm": 1.328125, "learning_rate": 7.459374302846114e-06, "loss": 1.0951228141784668, "step": 675, "token_acc": 0.6677065047207857 }, { "epoch": 0.6071428571428571, "grad_norm": 1.375, "learning_rate": 7.316869223065156e-06, "loss": 1.114619827270508, "step": 680, "token_acc": 0.6641058671975641 }, { "epoch": 0.6116071428571429, "grad_norm": 1.296875, "learning_rate": 7.174948920510675e-06, "loss": 1.0971644401550293, "step": 685, "token_acc": 0.6688319741107969 }, { "epoch": 0.6160714285714286, "grad_norm": 1.296875, "learning_rate": 7.033644326124104e-06, "loss": 1.104668140411377, "step": 690, "token_acc": 0.6667567008413406 }, { "epoch": 0.6205357142857143, "grad_norm": 1.3359375, "learning_rate": 6.892986236655827e-06, "loss": 1.114396095275879, "step": 695, "token_acc": 0.6627671848794471 }, { "epoch": 0.625, "grad_norm": 1.3046875, "learning_rate": 6.7530053079531664e-06, "loss": 1.098177146911621, "step": 700, "token_acc": 0.668855782018009 }, { "epoch": 0.6294642857142857, "grad_norm": 1.265625, "learning_rate": 6.613732048279064e-06, "loss": 1.0986656188964843, "step": 705, "token_acc": 0.6675273764392436 }, { "epoch": 0.6339285714285714, "grad_norm": 1.2890625, "learning_rate": 6.475196811662929e-06, "loss": 1.0969505310058594, "step": 710, "token_acc": 0.6685544895216992 }, { "epoch": 0.6383928571428571, "grad_norm": 1.265625, "learning_rate": 6.337429791285107e-06, "loss": 1.0815807342529298, "step": 715, "token_acc": 0.6731986752641865 }, { "epoch": 0.6428571428571429, "grad_norm": 1.265625, "learning_rate": 6.200461012896401e-06, "loss": 1.0996244430541993, "step": 720, "token_acc": 0.66839132010241 }, { "epoch": 0.6473214285714286, "grad_norm": 1.3125, "learning_rate": 6.064320328274079e-06, "loss": 1.0926881790161134, "step": 725, "token_acc": 0.6687555529230587 }, { "epoch": 0.6517857142857143, "grad_norm": 1.3359375, "learning_rate": 5.929037408715812e-06, "loss": 1.1201751708984375, "step": 730, "token_acc": 0.6626515495086923 }, { "epoch": 0.65625, "grad_norm": 1.328125, "learning_rate": 5.794641738572925e-06, "loss": 1.1207469940185546, "step": 735, "token_acc": 0.6625378393718664 }, { "epoch": 0.6607142857142857, "grad_norm": 1.2578125, "learning_rate": 5.66116260882442e-06, "loss": 1.0935090065002442, "step": 740, "token_acc": 0.6697370980003889 }, { "epoch": 0.6651785714285714, "grad_norm": 1.3046875, "learning_rate": 5.528629110693111e-06, "loss": 1.1128035545349122, "step": 745, "token_acc": 0.6645328465003976 }, { "epoch": 0.6696428571428571, "grad_norm": 1.265625, "learning_rate": 5.397070129305343e-06, "loss": 1.0865594863891601, "step": 750, "token_acc": 0.6718154777435683 }, { "epoch": 0.6741071428571429, "grad_norm": 1.2734375, "learning_rate": 5.2665143373955476e-06, "loss": 1.105082130432129, "step": 755, "token_acc": 0.6654881601202781 }, { "epoch": 0.6785714285714286, "grad_norm": 1.265625, "learning_rate": 5.136990189057187e-06, "loss": 1.0965201377868652, "step": 760, "token_acc": 0.6685612497216721 }, { "epoch": 0.6830357142857143, "grad_norm": 1.234375, "learning_rate": 5.008525913541292e-06, "loss": 1.0776897430419923, "step": 765, "token_acc": 0.673377759708695 }, { "epoch": 0.6875, "grad_norm": 1.3125, "learning_rate": 4.881149509103993e-06, "loss": 1.1034416198730468, "step": 770, "token_acc": 0.6676547557024715 }, { "epoch": 0.6919642857142857, "grad_norm": 1.3046875, "learning_rate": 4.754888736904432e-06, "loss": 1.111644172668457, "step": 775, "token_acc": 0.6635881969829432 }, { "epoch": 0.6964285714285714, "grad_norm": 1.28125, "learning_rate": 4.629771114954341e-06, "loss": 1.1045246124267578, "step": 780, "token_acc": 0.6661986169129779 }, { "epoch": 0.7008928571428571, "grad_norm": 1.203125, "learning_rate": 4.505823912120586e-06, "loss": 1.083775806427002, "step": 785, "token_acc": 0.6717203862042463 }, { "epoch": 0.7053571428571429, "grad_norm": 1.265625, "learning_rate": 4.3830741421820376e-06, "loss": 1.0916669845581055, "step": 790, "token_acc": 0.670933072789448 }, { "epoch": 0.7098214285714286, "grad_norm": 1.3359375, "learning_rate": 4.261548557942047e-06, "loss": 1.11307373046875, "step": 795, "token_acc": 0.6643882351532854 }, { "epoch": 0.7142857142857143, "grad_norm": 1.25, "learning_rate": 4.1412736453977545e-06, "loss": 1.0729114532470703, "step": 800, "token_acc": 0.6777025043600384 }, { "epoch": 0.71875, "grad_norm": 1.2421875, "learning_rate": 4.0222756179675915e-06, "loss": 1.0922969818115233, "step": 805, "token_acc": 0.669288262867004 }, { "epoch": 0.7232142857142857, "grad_norm": 1.2734375, "learning_rate": 3.904580410778185e-06, "loss": 1.1067237854003906, "step": 810, "token_acc": 0.6661032228983548 }, { "epoch": 0.7276785714285714, "grad_norm": 1.3046875, "learning_rate": 3.7882136750118823e-06, "loss": 1.0945161819458007, "step": 815, "token_acc": 0.6704976778178561 }, { "epoch": 0.7321428571428571, "grad_norm": 1.2578125, "learning_rate": 3.6732007723161933e-06, "loss": 1.0965933799743652, "step": 820, "token_acc": 0.6691131570294518 }, { "epoch": 0.7366071428571429, "grad_norm": 1.2734375, "learning_rate": 3.5595667692763346e-06, "loss": 1.0896465301513671, "step": 825, "token_acc": 0.6698150243632081 }, { "epoch": 0.7410714285714286, "grad_norm": 1.2421875, "learning_rate": 3.447336431952052e-06, "loss": 1.103147315979004, "step": 830, "token_acc": 0.6670983919710921 }, { "epoch": 0.7455357142857143, "grad_norm": 1.3359375, "learning_rate": 3.3365342204799613e-06, "loss": 1.0856735229492187, "step": 835, "token_acc": 0.6713515977465578 }, { "epoch": 0.75, "grad_norm": 1.25, "learning_rate": 3.2271842837425917e-06, "loss": 1.1038305282592773, "step": 840, "token_acc": 0.6657289592038155 }, { "epoch": 0.7544642857142857, "grad_norm": 1.328125, "learning_rate": 3.119310454105199e-06, "loss": 1.0918630599975585, "step": 845, "token_acc": 0.6703148288973384 }, { "epoch": 0.7589285714285714, "grad_norm": 1.2421875, "learning_rate": 3.0129362422216223e-06, "loss": 1.0974313735961914, "step": 850, "token_acc": 0.6704314984982306 }, { "epoch": 0.7633928571428571, "grad_norm": 1.328125, "learning_rate": 2.908084831910237e-06, "loss": 1.1198549270629883, "step": 855, "token_acc": 0.6628890624347961 }, { "epoch": 0.7678571428571429, "grad_norm": 1.2734375, "learning_rate": 2.8047790751011216e-06, "loss": 1.0908279418945312, "step": 860, "token_acc": 0.6704230184504901 }, { "epoch": 0.7723214285714286, "grad_norm": 1.234375, "learning_rate": 2.703041486855583e-06, "loss": 1.108245849609375, "step": 865, "token_acc": 0.6653421633554084 }, { "epoch": 0.7767857142857143, "grad_norm": 1.25, "learning_rate": 2.602894240459103e-06, "loss": 1.116114044189453, "step": 870, "token_acc": 0.6625173300470429 }, { "epoch": 0.78125, "grad_norm": 1.1796875, "learning_rate": 2.504359162588741e-06, "loss": 1.08361759185791, "step": 875, "token_acc": 0.673106125478107 }, { "epoch": 0.7857142857142857, "grad_norm": 1.296875, "learning_rate": 2.407457728556115e-06, "loss": 1.102341079711914, "step": 880, "token_acc": 0.6663129222823587 }, { "epoch": 0.7901785714285714, "grad_norm": 1.2734375, "learning_rate": 2.312211057626942e-06, "loss": 1.100531005859375, "step": 885, "token_acc": 0.6663185254637597 }, { "epoch": 0.7946428571428571, "grad_norm": 1.3203125, "learning_rate": 2.218639908418189e-06, "loss": 1.1055935859680175, "step": 890, "token_acc": 0.666173639479739 }, { "epoch": 0.7991071428571429, "grad_norm": 1.2734375, "learning_rate": 2.1267646743738034e-06, "loss": 1.0964255332946777, "step": 895, "token_acc": 0.6691968672248048 }, { "epoch": 0.8035714285714286, "grad_norm": 1.2109375, "learning_rate": 2.0366053793200567e-06, "loss": 1.0952293395996093, "step": 900, "token_acc": 0.6708661325086683 }, { "epoch": 0.8080357142857143, "grad_norm": 1.3203125, "learning_rate": 1.9481816731014412e-06, "loss": 1.080392837524414, "step": 905, "token_acc": 0.6736406863758528 }, { "epoch": 0.8125, "grad_norm": 1.3046875, "learning_rate": 1.861512827298051e-06, "loss": 1.110099983215332, "step": 910, "token_acc": 0.6655230956311279 }, { "epoch": 0.8169642857142857, "grad_norm": 1.2265625, "learning_rate": 1.7766177310254306e-06, "loss": 1.090493392944336, "step": 915, "token_acc": 0.6710389858691432 }, { "epoch": 0.8214285714285714, "grad_norm": 1.359375, "learning_rate": 1.693514886817772e-06, "loss": 1.0946979522705078, "step": 920, "token_acc": 0.6691246847215617 }, { "epoch": 0.8258928571428571, "grad_norm": 1.265625, "learning_rate": 1.6122224065953618e-06, "loss": 1.0981364250183105, "step": 925, "token_acc": 0.6693018885434551 }, { "epoch": 0.8303571428571429, "grad_norm": 1.3203125, "learning_rate": 1.5327580077171589e-06, "loss": 1.084502601623535, "step": 930, "token_acc": 0.6716733571432733 }, { "epoch": 0.8348214285714286, "grad_norm": 1.2890625, "learning_rate": 1.455139009119383e-06, "loss": 1.1079372406005858, "step": 935, "token_acc": 0.6665884987659243 }, { "epoch": 0.8392857142857143, "grad_norm": 1.484375, "learning_rate": 1.3793823275409068e-06, "loss": 1.1032150268554688, "step": 940, "token_acc": 0.6673829265444353 }, { "epoch": 0.84375, "grad_norm": 1.234375, "learning_rate": 1.305504473836331e-06, "loss": 1.0951696395874024, "step": 945, "token_acc": 0.6693173328082196 }, { "epoch": 0.8482142857142857, "grad_norm": 1.2890625, "learning_rate": 1.233521549377522e-06, "loss": 1.091273307800293, "step": 950, "token_acc": 0.6727149994807473 }, { "epoch": 0.8526785714285714, "grad_norm": 1.2265625, "learning_rate": 1.1634492425443634e-06, "loss": 1.0925727844238282, "step": 955, "token_acc": 0.6704545454545454 }, { "epoch": 0.8571428571428571, "grad_norm": 1.3046875, "learning_rate": 1.0953028253055541e-06, "loss": 1.1154996871948242, "step": 960, "token_acc": 0.6647821768795434 }, { "epoch": 0.8616071428571429, "grad_norm": 1.296875, "learning_rate": 1.0290971498901481e-06, "loss": 1.105567741394043, "step": 965, "token_acc": 0.6661696526337861 }, { "epoch": 0.8660714285714286, "grad_norm": 1.28125, "learning_rate": 9.648466455505578e-07, "loss": 1.0953655242919922, "step": 970, "token_acc": 0.6667339188226916 }, { "epoch": 0.8705357142857143, "grad_norm": 1.296875, "learning_rate": 9.025653154177705e-07, "loss": 1.1006592750549316, "step": 975, "token_acc": 0.666848379986253 }, { "epoch": 0.875, "grad_norm": 1.2890625, "learning_rate": 8.42266733449425e-07, "loss": 1.103367233276367, "step": 980, "token_acc": 0.6671485771741006 }, { "epoch": 0.8794642857142857, "grad_norm": 1.328125, "learning_rate": 7.839640414714156e-07, "loss": 1.1081018447875977, "step": 985, "token_acc": 0.664527035156532 }, { "epoch": 0.8839285714285714, "grad_norm": 1.296875, "learning_rate": 7.276699463136872e-07, "loss": 1.091609001159668, "step": 990, "token_acc": 0.6708852005532503 }, { "epoch": 0.8883928571428571, "grad_norm": 1.2734375, "learning_rate": 6.733967170408451e-07, "loss": 1.1118325233459472, "step": 995, "token_acc": 0.6650144617932624 }, { "epoch": 0.8928571428571429, "grad_norm": 1.25, "learning_rate": 6.211561822781476e-07, "loss": 1.0831655502319335, "step": 1000, "token_acc": 0.6735403414496951 }, { "epoch": 0.8973214285714286, "grad_norm": 1.34375, "learning_rate": 5.709597276335144e-07, "loss": 1.0991512298583985, "step": 1005, "token_acc": 0.6691225126435695 }, { "epoch": 0.9017857142857143, "grad_norm": 1.3046875, "learning_rate": 5.228182932160841e-07, "loss": 1.108486270904541, "step": 1010, "token_acc": 0.6649077455770052 }, { "epoch": 0.90625, "grad_norm": 1.2578125, "learning_rate": 4.7674237125185597e-07, "loss": 1.1087259292602538, "step": 1015, "token_acc": 0.6655792360501174 }, { "epoch": 0.9107142857142857, "grad_norm": 1.2734375, "learning_rate": 4.327420037969532e-07, "loss": 1.1094024658203125, "step": 1020, "token_acc": 0.663856754611703 }, { "epoch": 0.9151785714285714, "grad_norm": 1.2109375, "learning_rate": 3.908267805490051e-07, "loss": 1.0980701446533203, "step": 1025, "token_acc": 0.6679018819553691 }, { "epoch": 0.9196428571428571, "grad_norm": 1.265625, "learning_rate": 3.510058367571045e-07, "loss": 1.098175048828125, "step": 1030, "token_acc": 0.668898687688601 }, { "epoch": 0.9241071428571429, "grad_norm": 1.2890625, "learning_rate": 3.132878512308213e-07, "loss": 1.1017606735229493, "step": 1035, "token_acc": 0.6671746861165542 }, { "epoch": 0.9285714285714286, "grad_norm": 1.234375, "learning_rate": 2.776810444486944e-07, "loss": 1.1003715515136718, "step": 1040, "token_acc": 0.6674644336916524 }, { "epoch": 0.9330357142857143, "grad_norm": 1.2265625, "learning_rate": 2.441931767666084e-07, "loss": 1.0855772972106934, "step": 1045, "token_acc": 0.6727498238524203 }, { "epoch": 0.9375, "grad_norm": 1.28125, "learning_rate": 2.1283154672645522e-07, "loss": 1.0910385131835938, "step": 1050, "token_acc": 0.6683551673944687 }, { "epoch": 0.9419642857142857, "grad_norm": 1.25, "learning_rate": 1.8360298946545452e-07, "loss": 1.0924718856811524, "step": 1055, "token_acc": 0.6707896340201002 }, { "epoch": 0.9464285714285714, "grad_norm": 1.2734375, "learning_rate": 1.5651387522645721e-07, "loss": 1.0851358413696288, "step": 1060, "token_acc": 0.6709870034475813 }, { "epoch": 0.9508928571428571, "grad_norm": 1.234375, "learning_rate": 1.315701079695775e-07, "loss": 1.0930654525756835, "step": 1065, "token_acc": 0.669103371531395 }, { "epoch": 0.9553571428571429, "grad_norm": 1.2421875, "learning_rate": 1.0877712408545294e-07, "loss": 1.1010807037353516, "step": 1070, "token_acc": 0.6683365733045945 }, { "epoch": 0.9598214285714286, "grad_norm": 1.2578125, "learning_rate": 8.813989121040478e-08, "loss": 1.1053053855895996, "step": 1075, "token_acc": 0.6653947493353589 }, { "epoch": 0.9642857142857143, "grad_norm": 1.2890625, "learning_rate": 6.966290714375934e-08, "loss": 1.0881473541259765, "step": 1080, "token_acc": 0.671083660214032 }, { "epoch": 0.96875, "grad_norm": 1.265625, "learning_rate": 5.3350198867574424e-08, "loss": 1.0896425247192383, "step": 1085, "token_acc": 0.6703079702533506 }, { "epoch": 0.9732142857142857, "grad_norm": 1.2578125, "learning_rate": 3.9205321668972506e-08, "loss": 1.1037522315979005, "step": 1090, "token_acc": 0.6670471076628307 }, { "epoch": 0.9776785714285714, "grad_norm": 1.25, "learning_rate": 2.723135836528501e-08, "loss": 1.108121109008789, "step": 1095, "token_acc": 0.6640236439084097 }, { "epoch": 0.9821428571428571, "grad_norm": 1.296875, "learning_rate": 1.7430918632157513e-08, "loss": 1.0776978492736817, "step": 1100, "token_acc": 0.6742684264534163 }, { "epoch": 0.9866071428571429, "grad_norm": 1.3125, "learning_rate": 9.80613843479361e-09, "loss": 1.1027990341186524, "step": 1105, "token_acc": 0.6674602942466875 }, { "epoch": 0.9910714285714286, "grad_norm": 1.3203125, "learning_rate": 4.358679562416202e-09, "loss": 1.0937559127807617, "step": 1110, "token_acc": 0.670395055464588 }, { "epoch": 0.9955357142857143, "grad_norm": 1.2578125, "learning_rate": 1.0897292660971836e-09, "loss": 1.0890558242797852, "step": 1115, "token_acc": 0.6709881013358767 }, { "epoch": 1.0, "grad_norm": 1.328125, "learning_rate": 0.0, "loss": 1.0961315155029296, "step": 1120, "token_acc": 0.6685871908005899 } ], "logging_steps": 5, "max_steps": 1120, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8781764718816133e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }