| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500.0, | |
| "global_step": 1500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0013333333333333333, | |
| "grad_norm": 4.90625, | |
| "learning_rate": 4.444444444444445e-07, | |
| "loss": 0.027944788336753845, | |
| "step": 1, | |
| "token_acc": 0.9901477832512315 | |
| }, | |
| { | |
| "epoch": 0.013333333333333334, | |
| "grad_norm": 4.90625, | |
| "learning_rate": 4.444444444444444e-06, | |
| "loss": 0.05075684520933363, | |
| "step": 10, | |
| "token_acc": 0.9901595744680851 | |
| }, | |
| { | |
| "epoch": 0.02666666666666667, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 8.888888888888888e-06, | |
| "loss": 0.04696210622787476, | |
| "step": 20, | |
| "token_acc": 0.9893488259501332 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 0.05452235341072083, | |
| "step": 30, | |
| "token_acc": 0.9853012048192771 | |
| }, | |
| { | |
| "epoch": 0.05333333333333334, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.7777777777777777e-05, | |
| "loss": 0.03315775990486145, | |
| "step": 40, | |
| "token_acc": 0.9846264712947393 | |
| }, | |
| { | |
| "epoch": 0.06666666666666667, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.9999417253661235e-05, | |
| "loss": 0.028552538156509398, | |
| "step": 50, | |
| "token_acc": 0.9908411665461557 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.9994755690455154e-05, | |
| "loss": 0.04210628271102905, | |
| "step": 60, | |
| "token_acc": 0.9860944617597699 | |
| }, | |
| { | |
| "epoch": 0.09333333333333334, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 1.998543473718677e-05, | |
| "loss": 0.03467268347740173, | |
| "step": 70, | |
| "token_acc": 0.9840695148443157 | |
| }, | |
| { | |
| "epoch": 0.10666666666666667, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 1.9971458739130598e-05, | |
| "loss": 0.03366573452949524, | |
| "step": 80, | |
| "token_acc": 0.986271676300578 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.995283421166614e-05, | |
| "loss": 0.03945094347000122, | |
| "step": 90, | |
| "token_acc": 0.9827420901246404 | |
| }, | |
| { | |
| "epoch": 0.13333333333333333, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.9929569837240567e-05, | |
| "loss": 0.03638350963592529, | |
| "step": 100, | |
| "token_acc": 0.9838981014179283 | |
| }, | |
| { | |
| "epoch": 0.14666666666666667, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.990167646132107e-05, | |
| "loss": 0.027288395166397094, | |
| "step": 110, | |
| "token_acc": 0.9886555636012552 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.375, | |
| "learning_rate": 1.9869167087338908e-05, | |
| "loss": 0.03376817405223846, | |
| "step": 120, | |
| "token_acc": 0.987075155576831 | |
| }, | |
| { | |
| "epoch": 0.17333333333333334, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.983205687062742e-05, | |
| "loss": 0.03553054332733154, | |
| "step": 130, | |
| "token_acc": 0.9824687800192123 | |
| }, | |
| { | |
| "epoch": 0.18666666666666668, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 1.9790363111356838e-05, | |
| "loss": 0.033854860067367556, | |
| "step": 140, | |
| "token_acc": 0.9848812095032398 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 1.9744105246469264e-05, | |
| "loss": 0.02748125195503235, | |
| "step": 150, | |
| "token_acc": 0.9879459980713597 | |
| }, | |
| { | |
| "epoch": 0.21333333333333335, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 1.9693304840617456e-05, | |
| "loss": 0.03520364165306091, | |
| "step": 160, | |
| "token_acc": 0.9846042819340871 | |
| }, | |
| { | |
| "epoch": 0.22666666666666666, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 1.963798557611178e-05, | |
| "loss": 0.03284199237823486, | |
| "step": 170, | |
| "token_acc": 0.9877902801053388 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 1.957817324187987e-05, | |
| "loss": 0.03556506037712097, | |
| "step": 180, | |
| "token_acc": 0.98562874251497 | |
| }, | |
| { | |
| "epoch": 0.25333333333333335, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 1.9513895721444286e-05, | |
| "loss": 0.030231645703315733, | |
| "step": 190, | |
| "token_acc": 0.9858275282248379 | |
| }, | |
| { | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.9445182979923657e-05, | |
| "loss": 0.03149127662181854, | |
| "step": 200, | |
| "token_acc": 0.9862085652068715 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.937206705006344e-05, | |
| "loss": 0.03637541532516479, | |
| "step": 210, | |
| "token_acc": 0.9851211903047756 | |
| }, | |
| { | |
| "epoch": 0.29333333333333333, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 1.9294582017302797e-05, | |
| "loss": 0.03209535479545593, | |
| "step": 220, | |
| "token_acc": 0.9861542134160898 | |
| }, | |
| { | |
| "epoch": 0.30666666666666664, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 1.921276400388451e-05, | |
| "loss": 0.031918269395828244, | |
| "step": 230, | |
| "token_acc": 0.986810551558753 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.9126651152015404e-05, | |
| "loss": 0.03563873469829559, | |
| "step": 240, | |
| "token_acc": 0.983626294245124 | |
| }, | |
| { | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 1.9036283606085057e-05, | |
| "loss": 0.03571958541870117, | |
| "step": 250, | |
| "token_acc": 0.9839019702066314 | |
| }, | |
| { | |
| "epoch": 0.3466666666666667, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.8941703493951163e-05, | |
| "loss": 0.030891618132591246, | |
| "step": 260, | |
| "token_acc": 0.9862881885975463 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.8842954907300236e-05, | |
| "loss": 0.03673713207244873, | |
| "step": 270, | |
| "token_acc": 0.9827006246996636 | |
| }, | |
| { | |
| "epoch": 0.37333333333333335, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.874008388109276e-05, | |
| "loss": 0.033187645673751834, | |
| "step": 280, | |
| "token_acc": 0.9853752097818269 | |
| }, | |
| { | |
| "epoch": 0.38666666666666666, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 1.863313837210247e-05, | |
| "loss": 0.03002692461013794, | |
| "step": 290, | |
| "token_acc": 0.9861011262880421 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.8522168236559693e-05, | |
| "loss": 0.03252564668655396, | |
| "step": 300, | |
| "token_acc": 0.9860409145607701 | |
| }, | |
| { | |
| "epoch": 0.41333333333333333, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 1.840722520690921e-05, | |
| "loss": 0.034389343857765195, | |
| "step": 310, | |
| "token_acc": 0.9862683690676946 | |
| }, | |
| { | |
| "epoch": 0.4266666666666667, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.8288362867693414e-05, | |
| "loss": 0.030599406361579894, | |
| "step": 320, | |
| "token_acc": 0.9861011262880421 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 1.816563663057211e-05, | |
| "loss": 0.029587957262992858, | |
| "step": 330, | |
| "token_acc": 0.9865513928914506 | |
| }, | |
| { | |
| "epoch": 0.4533333333333333, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 1.8039103708490503e-05, | |
| "loss": 0.03238507807254791, | |
| "step": 340, | |
| "token_acc": 0.985875029925784 | |
| }, | |
| { | |
| "epoch": 0.4666666666666667, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 1.790882308900746e-05, | |
| "loss": 0.03101579546928406, | |
| "step": 350, | |
| "token_acc": 0.9872962607861937 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 1.7774855506796497e-05, | |
| "loss": 0.031994479894638064, | |
| "step": 360, | |
| "token_acc": 0.9857350096711799 | |
| }, | |
| { | |
| "epoch": 0.49333333333333335, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 1.7637263415332272e-05, | |
| "loss": 0.028815871477127074, | |
| "step": 370, | |
| "token_acc": 0.9896110171539019 | |
| }, | |
| { | |
| "epoch": 0.5066666666666667, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 1.749611095777581e-05, | |
| "loss": 0.03330559730529785, | |
| "step": 380, | |
| "token_acc": 0.9864734299516909 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 1.7351463937072008e-05, | |
| "loss": 0.024368155002593993, | |
| "step": 390, | |
| "token_acc": 0.9897202964379632 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.7203389785273402e-05, | |
| "loss": 0.0315426915884018, | |
| "step": 400, | |
| "token_acc": 0.9867501806793544 | |
| }, | |
| { | |
| "epoch": 0.5466666666666666, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 1.705195753210446e-05, | |
| "loss": 0.027628937363624574, | |
| "step": 410, | |
| "token_acc": 0.9856699307379986 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 1.6897237772781046e-05, | |
| "loss": 0.031161597371101378, | |
| "step": 420, | |
| "token_acc": 0.9867756672276989 | |
| }, | |
| { | |
| "epoch": 0.5733333333333334, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.673930263510011e-05, | |
| "loss": 0.030777221918106078, | |
| "step": 430, | |
| "token_acc": 0.9867851994233542 | |
| }, | |
| { | |
| "epoch": 0.5866666666666667, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.6578225745814907e-05, | |
| "loss": 0.030515575408935548, | |
| "step": 440, | |
| "token_acc": 0.9867947178871549 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 1.6414082196311402e-05, | |
| "loss": 0.031556323170661926, | |
| "step": 450, | |
| "token_acc": 0.9853471054527985 | |
| }, | |
| { | |
| "epoch": 0.6133333333333333, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 1.6246948507601915e-05, | |
| "loss": 0.028740781545639037, | |
| "step": 460, | |
| "token_acc": 0.9858173076923077 | |
| }, | |
| { | |
| "epoch": 0.6266666666666667, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 1.607690259465229e-05, | |
| "loss": 0.031500387191772464, | |
| "step": 470, | |
| "token_acc": 0.9882522176935986 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.5904023730059227e-05, | |
| "loss": 0.03357301354408264, | |
| "step": 480, | |
| "token_acc": 0.9862914862914863 | |
| }, | |
| { | |
| "epoch": 0.6533333333333333, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.57283925070947e-05, | |
| "loss": 0.03677979111671448, | |
| "step": 490, | |
| "token_acc": 0.982924482924483 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 1.55500908021347e-05, | |
| "loss": 0.03298424780368805, | |
| "step": 500, | |
| "token_acc": 0.9850169163847269 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.536920173648984e-05, | |
| "loss": 0.035187387466430665, | |
| "step": 510, | |
| "token_acc": 0.9867597496389022 | |
| }, | |
| { | |
| "epoch": 0.6933333333333334, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 1.5185809637655548e-05, | |
| "loss": 0.02551887333393097, | |
| "step": 520, | |
| "token_acc": 0.990148966842864 | |
| }, | |
| { | |
| "epoch": 0.7066666666666667, | |
| "grad_norm": 1.375, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 0.029168868064880372, | |
| "step": 530, | |
| "token_acc": 0.9879489033502049 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.4811859444908053e-05, | |
| "loss": 0.02838689088821411, | |
| "step": 540, | |
| "token_acc": 0.9872442839951865 | |
| }, | |
| { | |
| "epoch": 0.7333333333333333, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.4621475680399771e-05, | |
| "loss": 0.027503234148025513, | |
| "step": 550, | |
| "token_acc": 0.9870285851549363 | |
| }, | |
| { | |
| "epoch": 0.7466666666666667, | |
| "grad_norm": 0.9296875, | |
| "learning_rate": 1.4428937460242417e-05, | |
| "loss": 0.03381537199020386, | |
| "step": 560, | |
| "token_acc": 0.9860744297719087 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 1.4234334542574906e-05, | |
| "loss": 0.028558316826820373, | |
| "step": 570, | |
| "token_acc": 0.9874909790714458 | |
| }, | |
| { | |
| "epoch": 0.7733333333333333, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 1.4037757648064019e-05, | |
| "loss": 0.029447346925735474, | |
| "step": 580, | |
| "token_acc": 0.9868609651218346 | |
| }, | |
| { | |
| "epoch": 0.7866666666666666, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.3839298417611964e-05, | |
| "loss": 0.02759793698787689, | |
| "step": 590, | |
| "token_acc": 0.9877285851780558 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.3639049369634878e-05, | |
| "loss": 0.030391490459442137, | |
| "step": 600, | |
| "token_acc": 0.9863505747126436 | |
| }, | |
| { | |
| "epoch": 0.8133333333333334, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 1.3437103856932266e-05, | |
| "loss": 0.028662437200546266, | |
| "step": 610, | |
| "token_acc": 0.9881470730527334 | |
| }, | |
| { | |
| "epoch": 0.8266666666666667, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 1.3233556023167487e-05, | |
| "loss": 0.02797028422355652, | |
| "step": 620, | |
| "token_acc": 0.9882211538461538 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 1.3028500758979507e-05, | |
| "loss": 0.02865118682384491, | |
| "step": 630, | |
| "token_acc": 0.9860911270983214 | |
| }, | |
| { | |
| "epoch": 0.8533333333333334, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 1.2822033657746478e-05, | |
| "loss": 0.026763680577278137, | |
| "step": 640, | |
| "token_acc": 0.9887127761767531 | |
| }, | |
| { | |
| "epoch": 0.8666666666666667, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.2614250971021658e-05, | |
| "loss": 0.03335306942462921, | |
| "step": 650, | |
| "token_acc": 0.9831568816169394 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.2405249563662539e-05, | |
| "loss": 0.02619180679321289, | |
| "step": 660, | |
| "token_acc": 0.9882465819141281 | |
| }, | |
| { | |
| "epoch": 0.8933333333333333, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 1.2195126868674052e-05, | |
| "loss": 0.028585124015808105, | |
| "step": 670, | |
| "token_acc": 0.9881984585741811 | |
| }, | |
| { | |
| "epoch": 0.9066666666666666, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 1.1983980841786899e-05, | |
| "loss": 0.02787652611732483, | |
| "step": 680, | |
| "token_acc": 0.9889290012033695 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.177190991579223e-05, | |
| "loss": 0.0291363924741745, | |
| "step": 690, | |
| "token_acc": 0.9864897466827504 | |
| }, | |
| { | |
| "epoch": 0.9333333333333333, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 1.1559012954653865e-05, | |
| "loss": 0.03034358024597168, | |
| "step": 700, | |
| "token_acc": 0.9869407496977025 | |
| }, | |
| { | |
| "epoch": 0.9466666666666667, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 1.1345389207419588e-05, | |
| "loss": 0.027029412984848022, | |
| "step": 710, | |
| "token_acc": 0.9873084291187739 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.1131138261952845e-05, | |
| "loss": 0.029054158926010133, | |
| "step": 720, | |
| "token_acc": 0.9872657376261412 | |
| }, | |
| { | |
| "epoch": 0.9733333333333334, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 1.0916359998506549e-05, | |
| "loss": 0.030344563722610473, | |
| "step": 730, | |
| "token_acc": 0.987710843373494 | |
| }, | |
| { | |
| "epoch": 0.9866666666666667, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 1.070115454316054e-05, | |
| "loss": 0.028968954086303712, | |
| "step": 740, | |
| "token_acc": 0.986003861003861 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 1.0485622221144485e-05, | |
| "loss": 0.027395570278167726, | |
| "step": 750, | |
| "token_acc": 0.9886005335920446 | |
| }, | |
| { | |
| "epoch": 1.0133333333333334, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 1.0269863510067872e-05, | |
| "loss": 0.013629350066184997, | |
| "step": 760, | |
| "token_acc": 0.9968712394705175 | |
| }, | |
| { | |
| "epoch": 1.0266666666666666, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 1.0053978993079046e-05, | |
| "loss": 0.012067935615777969, | |
| "step": 770, | |
| "token_acc": 0.9961749940234281 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 9.838069311974986e-06, | |
| "loss": 0.011612998694181443, | |
| "step": 780, | |
| "token_acc": 0.9954348870735223 | |
| }, | |
| { | |
| "epoch": 1.0533333333333332, | |
| "grad_norm": 0.5, | |
| "learning_rate": 9.622235120283769e-06, | |
| "loss": 0.0077203229069709774, | |
| "step": 790, | |
| "token_acc": 0.9971271247306679 | |
| }, | |
| { | |
| "epoch": 1.0666666666666667, | |
| "grad_norm": 0.45703125, | |
| "learning_rate": 9.406577036341548e-06, | |
| "loss": 0.011040687561035156, | |
| "step": 800, | |
| "token_acc": 0.9956772334293948 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 9.19119559638596e-06, | |
| "loss": 0.008952221274375916, | |
| "step": 810, | |
| "token_acc": 0.9966450994488377 | |
| }, | |
| { | |
| "epoch": 1.0933333333333333, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 8.976191207687775e-06, | |
| "loss": 0.010079852491617202, | |
| "step": 820, | |
| "token_acc": 0.9959310674964098 | |
| }, | |
| { | |
| "epoch": 1.1066666666666667, | |
| "grad_norm": 1.625, | |
| "learning_rate": 8.7616641017427e-06, | |
| "loss": 0.01139761358499527, | |
| "step": 830, | |
| "token_acc": 0.9963916285783017 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 8.5477142875451e-06, | |
| "loss": 0.010524387657642364, | |
| "step": 840, | |
| "token_acc": 0.9961361989857522 | |
| }, | |
| { | |
| "epoch": 1.1333333333333333, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 8.334441504965456e-06, | |
| "loss": 0.011410476267337799, | |
| "step": 850, | |
| "token_acc": 0.9956490210297317 | |
| }, | |
| { | |
| "epoch": 1.1466666666666667, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 8.1219451782533e-06, | |
| "loss": 0.009808246791362763, | |
| "step": 860, | |
| "token_acc": 0.9964020148716719 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 7.91032436968725e-06, | |
| "loss": 0.008106120675802232, | |
| "step": 870, | |
| "token_acc": 0.9964054636951833 | |
| }, | |
| { | |
| "epoch": 1.1733333333333333, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 7.699677733393827e-06, | |
| "loss": 0.005388218909502029, | |
| "step": 880, | |
| "token_acc": 0.9978515158749105 | |
| }, | |
| { | |
| "epoch": 1.1866666666666668, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 7.490103469356513e-06, | |
| "loss": 0.016499459743499756, | |
| "step": 890, | |
| "token_acc": 0.9947000722717417 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.75, | |
| "learning_rate": 7.2816992776365714e-06, | |
| "loss": 0.008301901072263718, | |
| "step": 900, | |
| "token_acc": 0.9966418805468937 | |
| }, | |
| { | |
| "epoch": 1.2133333333333334, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 7.0745623128268605e-06, | |
| "loss": 0.010182877629995346, | |
| "step": 910, | |
| "token_acc": 0.9963968292097045 | |
| }, | |
| { | |
| "epoch": 1.2266666666666666, | |
| "grad_norm": 0.44921875, | |
| "learning_rate": 6.868789138759977e-06, | |
| "loss": 0.007760406285524368, | |
| "step": 920, | |
| "token_acc": 0.9971278123504069 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 6.664475683491797e-06, | |
| "loss": 0.007416041195392608, | |
| "step": 930, | |
| "token_acc": 0.997346840328027 | |
| }, | |
| { | |
| "epoch": 1.2533333333333334, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 6.461717194581394e-06, | |
| "loss": 0.012545964121818543, | |
| "step": 940, | |
| "token_acc": 0.9954260953298026 | |
| }, | |
| { | |
| "epoch": 1.2666666666666666, | |
| "grad_norm": 2.625, | |
| "learning_rate": 6.260608194688207e-06, | |
| "loss": 0.009528040885925293, | |
| "step": 950, | |
| "token_acc": 0.9964011516314779 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 6.061242437507131e-06, | |
| "loss": 0.010063067823648453, | |
| "step": 960, | |
| "token_acc": 0.9958907420836355 | |
| }, | |
| { | |
| "epoch": 1.2933333333333334, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 5.863712864062089e-06, | |
| "loss": 0.016041702032089232, | |
| "step": 970, | |
| "token_acc": 0.9954326923076923 | |
| }, | |
| { | |
| "epoch": 1.3066666666666666, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 5.6681115593784705e-06, | |
| "loss": 0.010919718444347382, | |
| "step": 980, | |
| "token_acc": 0.9959124789612888 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 5.4745297095546125e-06, | |
| "loss": 0.011771070957183837, | |
| "step": 990, | |
| "token_acc": 0.9963933637893725 | |
| }, | |
| { | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 5.2830575592523415e-06, | |
| "loss": 0.011084750294685364, | |
| "step": 1000, | |
| "token_acc": 0.996135265700483 | |
| }, | |
| { | |
| "epoch": 1.3466666666666667, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 5.093784369626397e-06, | |
| "loss": 0.007512730360031128, | |
| "step": 1010, | |
| "token_acc": 0.9968772519817439 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 4.9067983767123736e-06, | |
| "loss": 0.010888008773326874, | |
| "step": 1020, | |
| "token_acc": 0.9963985594237695 | |
| }, | |
| { | |
| "epoch": 1.3733333333333333, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 4.722186750292511e-06, | |
| "loss": 0.01149669587612152, | |
| "step": 1030, | |
| "token_acc": 0.9964054636951833 | |
| }, | |
| { | |
| "epoch": 1.3866666666666667, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 4.54003555325862e-06, | |
| "loss": 0.009606964886188507, | |
| "step": 1040, | |
| "token_acc": 0.9970937272947444 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 4.360429701490935e-06, | |
| "loss": 0.011592777073383331, | |
| "step": 1050, | |
| "token_acc": 0.9959222835212281 | |
| }, | |
| { | |
| "epoch": 1.4133333333333333, | |
| "grad_norm": 0.625, | |
| "learning_rate": 4.183452924271776e-06, | |
| "loss": 0.008268815279006959, | |
| "step": 1060, | |
| "token_acc": 0.9963924963924964 | |
| }, | |
| { | |
| "epoch": 1.4266666666666667, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 4.009187725252309e-06, | |
| "loss": 0.008235112577676774, | |
| "step": 1070, | |
| "token_acc": 0.996875 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 3.837715343990727e-06, | |
| "loss": 0.010764123499393463, | |
| "step": 1080, | |
| "token_acc": 0.9963645176926805 | |
| }, | |
| { | |
| "epoch": 1.4533333333333334, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 3.669115718079702e-06, | |
| "loss": 0.01102955937385559, | |
| "step": 1090, | |
| "token_acc": 0.9954425521707844 | |
| }, | |
| { | |
| "epoch": 1.4666666666666668, | |
| "grad_norm": 0.453125, | |
| "learning_rate": 3.5034674458807893e-06, | |
| "loss": 0.01023436188697815, | |
| "step": 1100, | |
| "token_acc": 0.9963750604156597 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 3.3408477498831917e-06, | |
| "loss": 0.008587966859340667, | |
| "step": 1110, | |
| "token_acc": 0.9961676646706586 | |
| }, | |
| { | |
| "epoch": 1.4933333333333334, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 3.1813324407038826e-06, | |
| "loss": 0.007490788400173187, | |
| "step": 1120, | |
| "token_acc": 0.9971056439942113 | |
| }, | |
| { | |
| "epoch": 1.5066666666666668, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 3.024995881745972e-06, | |
| "loss": 0.00892709642648697, | |
| "step": 1130, | |
| "token_acc": 0.9952049868137137 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 0.625, | |
| "learning_rate": 2.8719109545317102e-06, | |
| "loss": 0.010660454630851746, | |
| "step": 1140, | |
| "token_acc": 0.9963715529753265 | |
| }, | |
| { | |
| "epoch": 1.5333333333333332, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 2.722149024726307e-06, | |
| "loss": 0.008663681149482728, | |
| "step": 1150, | |
| "token_acc": 0.9968892079444843 | |
| }, | |
| { | |
| "epoch": 1.5466666666666666, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 2.5757799088684654e-06, | |
| "loss": 0.011409056186676026, | |
| "step": 1160, | |
| "token_acc": 0.9959193470955353 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 2.432871841823047e-06, | |
| "loss": 0.008858834207057954, | |
| "step": 1170, | |
| "token_acc": 0.9973513123043583 | |
| }, | |
| { | |
| "epoch": 1.5733333333333333, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 2.293491444971109e-06, | |
| "loss": 0.005919945612549782, | |
| "step": 1180, | |
| "token_acc": 0.9971133028626413 | |
| }, | |
| { | |
| "epoch": 1.5866666666666667, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 2.157703695152109e-06, | |
| "loss": 0.011429443210363387, | |
| "step": 1190, | |
| "token_acc": 0.9944724825763037 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 2.025571894372794e-06, | |
| "loss": 0.008558385819196702, | |
| "step": 1200, | |
| "token_acc": 0.9966159052453468 | |
| }, | |
| { | |
| "epoch": 1.6133333333333333, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.897157640296825e-06, | |
| "loss": 0.012137772142887115, | |
| "step": 1210, | |
| "token_acc": 0.9954282964388835 | |
| }, | |
| { | |
| "epoch": 1.6266666666666667, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.7725207975289883e-06, | |
| "loss": 0.008094522356986999, | |
| "step": 1220, | |
| "token_acc": 0.9976019184652278 | |
| }, | |
| { | |
| "epoch": 1.6400000000000001, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 1.6517194697072903e-06, | |
| "loss": 0.01032220721244812, | |
| "step": 1230, | |
| "token_acc": 0.9975984630163305 | |
| }, | |
| { | |
| "epoch": 1.6533333333333333, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 1.534809972415998e-06, | |
| "loss": 0.006939056515693665, | |
| "step": 1240, | |
| "token_acc": 0.996868978805395 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 1.4218468069322576e-06, | |
| "loss": 0.015096238255500794, | |
| "step": 1250, | |
| "token_acc": 0.9951853635050554 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.3128826348184886e-06, | |
| "loss": 0.010143952071666717, | |
| "step": 1260, | |
| "token_acc": 0.9961482908040443 | |
| }, | |
| { | |
| "epoch": 1.6933333333333334, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 1.207968253372438e-06, | |
| "loss": 0.009067404270172118, | |
| "step": 1270, | |
| "token_acc": 0.995906573561281 | |
| }, | |
| { | |
| "epoch": 1.7066666666666666, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 1.1071525719463094e-06, | |
| "loss": 0.011023186147212982, | |
| "step": 1280, | |
| "token_acc": 0.9954447374730281 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 1.010482589146048e-06, | |
| "loss": 0.009403180330991745, | |
| "step": 1290, | |
| "token_acc": 0.9964251668255482 | |
| }, | |
| { | |
| "epoch": 1.7333333333333334, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 9.180033709213454e-07, | |
| "loss": 0.010034725069999695, | |
| "step": 1300, | |
| "token_acc": 0.997114691031498 | |
| }, | |
| { | |
| "epoch": 1.7466666666666666, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 8.297580295566576e-07, | |
| "loss": 0.009973371028900146, | |
| "step": 1310, | |
| "token_acc": 0.9968772519817439 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 7.457877035729588e-07, | |
| "loss": 0.0062577612698078156, | |
| "step": 1320, | |
| "token_acc": 0.9978406909788867 | |
| }, | |
| { | |
| "epoch": 1.7733333333333334, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 6.661315385496426e-07, | |
| "loss": 0.010465647280216216, | |
| "step": 1330, | |
| "token_acc": 0.9971133028626413 | |
| }, | |
| { | |
| "epoch": 1.7866666666666666, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 5.908266688755049e-07, | |
| "loss": 0.012356024980545045, | |
| "step": 1340, | |
| "token_acc": 0.9951737451737451 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 5.199082004372958e-07, | |
| "loss": 0.009671059250831605, | |
| "step": 1350, | |
| "token_acc": 0.9966378482228626 | |
| }, | |
| { | |
| "epoch": 1.8133333333333335, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 4.534091942539476e-07, | |
| "loss": 0.008851684629917145, | |
| "step": 1360, | |
| "token_acc": 0.9968802495800336 | |
| }, | |
| { | |
| "epoch": 1.8266666666666667, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 3.913606510640644e-07, | |
| "loss": 0.008533693850040436, | |
| "step": 1370, | |
| "token_acc": 0.9966232513265798 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 3.3379149687388866e-07, | |
| "loss": 0.010838476568460464, | |
| "step": 1380, | |
| "token_acc": 0.9959193470955353 | |
| }, | |
| { | |
| "epoch": 1.8533333333333335, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 2.807285694724804e-07, | |
| "loss": 0.0096530020236969, | |
| "step": 1390, | |
| "token_acc": 0.9966442953020134 | |
| }, | |
| { | |
| "epoch": 1.8666666666666667, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 2.3219660592038285e-07, | |
| "loss": 0.01174573004245758, | |
| "step": 1400, | |
| "token_acc": 0.9966346153846154 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.8821823101760949e-07, | |
| "loss": 0.012217908352613448, | |
| "step": 1410, | |
| "token_acc": 0.9949555608935864 | |
| }, | |
| { | |
| "epoch": 1.8933333333333333, | |
| "grad_norm": 0.4375, | |
| "learning_rate": 1.4881394675633543e-07, | |
| "loss": 0.007538451254367829, | |
| "step": 1420, | |
| "token_acc": 0.9978354978354979 | |
| }, | |
| { | |
| "epoch": 1.9066666666666667, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 1.1400212276321377e-07, | |
| "loss": 0.005265282094478607, | |
| "step": 1430, | |
| "token_acc": 0.9976065102920058 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 8.379898773574924e-08, | |
| "loss": 0.009292224794626236, | |
| "step": 1440, | |
| "token_acc": 0.9964046021093 | |
| }, | |
| { | |
| "epoch": 1.9333333333333333, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 5.821862187675775e-08, | |
| "loss": 0.009811708331108093, | |
| "step": 1450, | |
| "token_acc": 0.9966450994488377 | |
| }, | |
| { | |
| "epoch": 1.9466666666666668, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 3.727295033040035e-08, | |
| "loss": 0.00994875207543373, | |
| "step": 1460, | |
| "token_acc": 0.9966329966329966 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 2.0971737622883515e-08, | |
| "loss": 0.009690174460411071, | |
| "step": 1470, | |
| "token_acc": 0.9963776865491427 | |
| }, | |
| { | |
| "epoch": 1.9733333333333334, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 9.322583110392692e-09, | |
| "loss": 0.00724238008260727, | |
| "step": 1480, | |
| "token_acc": 0.9978380975258228 | |
| }, | |
| { | |
| "epoch": 1.9866666666666668, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 2.330917436402791e-09, | |
| "loss": 0.008754293620586395, | |
| "step": 1490, | |
| "token_acc": 0.9961492178098676 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 0.0, | |
| "loss": 0.007110082358121872, | |
| "step": 1500, | |
| "token_acc": 0.997303260603089 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.111061190492815e+17, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |