{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 532, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015037593984962405, "grad_norm": 0.8192565441131592, "learning_rate": 1.111111111111111e-06, "loss": 1.936692237854004, "step": 2 }, { "epoch": 0.03007518796992481, "grad_norm": 0.7943888306617737, "learning_rate": 3.3333333333333333e-06, "loss": 2.246655225753784, "step": 4 }, { "epoch": 0.045112781954887216, "grad_norm": 0.5120864510536194, "learning_rate": 5.555555555555555e-06, "loss": 2.0245468616485596, "step": 6 }, { "epoch": 0.06015037593984962, "grad_norm": 0.3112846314907074, "learning_rate": 7.777777777777777e-06, "loss": 1.894791603088379, "step": 8 }, { "epoch": 0.07518796992481203, "grad_norm": 0.7659847140312195, "learning_rate": 9.999999999999999e-06, "loss": 1.8956142663955688, "step": 10 }, { "epoch": 0.09022556390977443, "grad_norm": 1.0534412860870361, "learning_rate": 1.2222222222222222e-05, "loss": 2.20853590965271, "step": 12 }, { "epoch": 0.10526315789473684, "grad_norm": 0.2340962290763855, "learning_rate": 1.4444444444444444e-05, "loss": 1.858487606048584, "step": 14 }, { "epoch": 0.12030075187969924, "grad_norm": 1.1463453769683838, "learning_rate": 1.6666666666666667e-05, "loss": 2.3683369159698486, "step": 16 }, { "epoch": 0.13533834586466165, "grad_norm": 1.6465355157852173, "learning_rate": 1.888888888888889e-05, "loss": 2.907562017440796, "step": 18 }, { "epoch": 0.15037593984962405, "grad_norm": 0.883372962474823, "learning_rate": 2.111111111111111e-05, "loss": 1.6304150819778442, "step": 20 }, { "epoch": 0.16541353383458646, "grad_norm": 0.7589054703712463, "learning_rate": 2.3333333333333336e-05, "loss": 1.670052170753479, "step": 22 }, { "epoch": 0.18045112781954886, "grad_norm": 0.5909481048583984, "learning_rate": 2.5555555555555557e-05, "loss": 1.6498050689697266, "step": 24 }, { "epoch": 0.19548872180451127, "grad_norm": 1.89938223361969, "learning_rate": 2.777777777777778e-05, "loss": 1.5983651876449585, "step": 26 }, { "epoch": 0.21052631578947367, "grad_norm": 0.8610926270484924, "learning_rate": 3e-05, "loss": 1.25473952293396, "step": 28 }, { "epoch": 0.22556390977443608, "grad_norm": 1.2555012702941895, "learning_rate": 2.999111925794138e-05, "loss": 1.228848934173584, "step": 30 }, { "epoch": 0.24060150375939848, "grad_norm": 1.1694159507751465, "learning_rate": 2.996448940315055e-05, "loss": 1.39642333984375, "step": 32 }, { "epoch": 0.2556390977443609, "grad_norm": 0.687818169593811, "learning_rate": 2.9920147532548513e-05, "loss": 1.2702100276947021, "step": 34 }, { "epoch": 0.2706766917293233, "grad_norm": 0.33861589431762695, "learning_rate": 2.9858155416914135e-05, "loss": 1.326142430305481, "step": 36 }, { "epoch": 0.2857142857142857, "grad_norm": 2.371178388595581, "learning_rate": 2.9778599414833865e-05, "loss": 1.4221186637878418, "step": 38 }, { "epoch": 0.3007518796992481, "grad_norm": 0.7431687712669373, "learning_rate": 2.9681590352399252e-05, "loss": 1.0404337644577026, "step": 40 }, { "epoch": 0.3157894736842105, "grad_norm": 0.42008474469184875, "learning_rate": 2.956726336881985e-05, "loss": 1.2850358486175537, "step": 42 }, { "epoch": 0.3308270676691729, "grad_norm": 0.24258685111999512, "learning_rate": 2.9435777728166477e-05, "loss": 0.9888750314712524, "step": 44 }, { "epoch": 0.3458646616541353, "grad_norm": 0.19806115329265594, "learning_rate": 2.928731659750722e-05, "loss": 1.389718770980835, "step": 46 }, { "epoch": 0.3609022556390977, "grad_norm": 0.5233106017112732, "learning_rate": 2.912208679174516e-05, "loss": 1.0381746292114258, "step": 48 }, { "epoch": 0.37593984962406013, "grad_norm": 0.3707257807254791, "learning_rate": 2.8940318485513296e-05, "loss": 1.0249922275543213, "step": 50 }, { "epoch": 0.39097744360902253, "grad_norm": 0.29312264919281006, "learning_rate": 2.8742264892528024e-05, "loss": 0.9782328009605408, "step": 52 }, { "epoch": 0.40601503759398494, "grad_norm": 0.34875017404556274, "learning_rate": 2.8528201912847877e-05, "loss": 1.0573807954788208, "step": 54 }, { "epoch": 0.42105263157894735, "grad_norm": 0.14175978302955627, "learning_rate": 2.829842774852883e-05, "loss": 1.005712628364563, "step": 56 }, { "epoch": 0.43609022556390975, "grad_norm": 0.3452085852622986, "learning_rate": 2.805326248821166e-05, "loss": 0.9127753376960754, "step": 58 }, { "epoch": 0.45112781954887216, "grad_norm": 0.27766284346580505, "learning_rate": 2.7793047661220094e-05, "loss": 1.0905134677886963, "step": 60 }, { "epoch": 0.46616541353383456, "grad_norm": 0.4942661225795746, "learning_rate": 2.751814576179072e-05, "loss": 0.8560956120491028, "step": 62 }, { "epoch": 0.48120300751879697, "grad_norm": 0.225164532661438, "learning_rate": 2.722893974409769e-05, "loss": 1.1211824417114258, "step": 64 }, { "epoch": 0.49624060150375937, "grad_norm": 0.5361406803131104, "learning_rate": 2.6925832488775517e-05, "loss": 1.101810336112976, "step": 66 }, { "epoch": 0.5112781954887218, "grad_norm": 0.1772085279226303, "learning_rate": 2.660924624168312e-05, "loss": 1.2826346158981323, "step": 68 }, { "epoch": 0.5263157894736842, "grad_norm": 0.7415258288383484, "learning_rate": 2.627962202569103e-05, "loss": 1.0522770881652832, "step": 70 }, { "epoch": 0.5413533834586466, "grad_norm": 0.14701074361801147, "learning_rate": 2.593741902631119e-05, "loss": 0.7640881538391113, "step": 72 }, { "epoch": 0.556390977443609, "grad_norm": 0.2193162888288498, "learning_rate": 2.558311395202502e-05, "loss": 0.8525770306587219, "step": 74 }, { "epoch": 0.5714285714285714, "grad_norm": 0.38675349950790405, "learning_rate": 2.5217200370201126e-05, "loss": 1.0316098928451538, "step": 76 }, { "epoch": 0.5864661654135338, "grad_norm": 0.15427573025226593, "learning_rate": 2.4840188019527494e-05, "loss": 1.2194627523422241, "step": 78 }, { "epoch": 0.6015037593984962, "grad_norm": 1.0054173469543457, "learning_rate": 2.445260209991616e-05, "loss": 0.6321249008178711, "step": 80 }, { "epoch": 0.6165413533834586, "grad_norm": 0.3735978603363037, "learning_rate": 2.4054982540869497e-05, "loss": 1.1763536930084229, "step": 82 }, { "epoch": 0.631578947368421, "grad_norm": 0.1597137153148651, "learning_rate": 2.3647883249327334e-05, "loss": 1.271316409111023, "step": 84 }, { "epoch": 0.6466165413533834, "grad_norm": 0.20840084552764893, "learning_rate": 2.3231871338042668e-05, "loss": 0.9115048050880432, "step": 86 }, { "epoch": 0.6616541353383458, "grad_norm": 0.11203482747077942, "learning_rate": 2.280752633556098e-05, "loss": 1.0730034112930298, "step": 88 }, { "epoch": 0.6766917293233082, "grad_norm": 0.2053004801273346, "learning_rate": 2.2375439378903597e-05, "loss": 1.1552447080612183, "step": 90 }, { "epoch": 0.6917293233082706, "grad_norm": 0.1656394600868225, "learning_rate": 2.1936212390079758e-05, "loss": 1.035262107849121, "step": 92 }, { "epoch": 0.706766917293233, "grad_norm": 0.18118023872375488, "learning_rate": 2.1490457237574638e-05, "loss": 0.961626410484314, "step": 94 }, { "epoch": 0.7218045112781954, "grad_norm": 0.3013463020324707, "learning_rate": 2.103879488398128e-05, "loss": 1.3001712560653687, "step": 96 }, { "epoch": 0.7368421052631579, "grad_norm": 0.21438409388065338, "learning_rate": 2.058185452096397e-05, "loss": 1.1097919940948486, "step": 98 }, { "epoch": 0.7518796992481203, "grad_norm": 0.4920536279678345, "learning_rate": 2.0120272692758044e-05, "loss": 0.6178168058395386, "step": 100 }, { "epoch": 0.7669172932330827, "grad_norm": 0.2382662147283554, "learning_rate": 1.965469240942704e-05, "loss": 1.3048324584960938, "step": 102 }, { "epoch": 0.7819548872180451, "grad_norm": 0.18292629718780518, "learning_rate": 1.918576225111276e-05, "loss": 0.9727452397346497, "step": 104 }, { "epoch": 0.7969924812030075, "grad_norm": 0.35222965478897095, "learning_rate": 1.8714135464525706e-05, "loss": 0.6771279573440552, "step": 106 }, { "epoch": 0.8120300751879699, "grad_norm": 0.32725006341934204, "learning_rate": 1.824046905293483e-05, "loss": 0.9753497242927551, "step": 108 }, { "epoch": 0.8270676691729323, "grad_norm": 0.40115198493003845, "learning_rate": 1.7765422860924167e-05, "loss": 0.8510618209838867, "step": 110 }, { "epoch": 0.8421052631578947, "grad_norm": 0.26472556591033936, "learning_rate": 1.7289658655191308e-05, "loss": 0.7452026605606079, "step": 112 }, { "epoch": 0.8571428571428571, "grad_norm": 0.6424808502197266, "learning_rate": 1.6813839202668314e-05, "loss": 0.9618666768074036, "step": 114 }, { "epoch": 0.8721804511278195, "grad_norm": 0.2608170509338379, "learning_rate": 1.6338627347249194e-05, "loss": 1.3540914058685303, "step": 116 }, { "epoch": 0.8872180451127819, "grad_norm": 0.16167515516281128, "learning_rate": 1.5864685086410205e-05, "loss": 1.023390293121338, "step": 118 }, { "epoch": 0.9022556390977443, "grad_norm": 0.36847585439682007, "learning_rate": 1.539267264900926e-05, "loss": 0.8938322067260742, "step": 120 }, { "epoch": 0.9172932330827067, "grad_norm": 0.42921480536460876, "learning_rate": 1.4923247575549108e-05, "loss": 0.8056025505065918, "step": 122 }, { "epoch": 0.9323308270676691, "grad_norm": 0.6019521355628967, "learning_rate": 1.4457063802185558e-05, "loss": 0.8153986930847168, "step": 124 }, { "epoch": 0.9473684210526315, "grad_norm": 0.17627808451652527, "learning_rate": 1.3994770749756746e-05, "loss": 0.7411991953849792, "step": 126 }, { "epoch": 0.9624060150375939, "grad_norm": 0.27609097957611084, "learning_rate": 1.3537012419102535e-05, "loss": 1.0465192794799805, "step": 128 }, { "epoch": 0.9774436090225563, "grad_norm": 0.7103683948516846, "learning_rate": 1.3084426493934257e-05, "loss": 0.8794819712638855, "step": 130 }, { "epoch": 0.9924812030075187, "grad_norm": 0.13337653875350952, "learning_rate": 1.2637643452504579e-05, "loss": 1.019758701324463, "step": 132 }, { "epoch": 1.0075187969924813, "grad_norm": 0.15754307806491852, "learning_rate": 1.2197285689315004e-05, "loss": 0.7352499961853027, "step": 134 }, { "epoch": 1.0225563909774436, "grad_norm": 0.1713666021823883, "learning_rate": 1.1763966648084505e-05, "loss": 0.8829557299613953, "step": 136 }, { "epoch": 1.037593984962406, "grad_norm": 0.3357762098312378, "learning_rate": 1.1338289967187079e-05, "loss": 0.6713441610336304, "step": 138 }, { "epoch": 1.0526315789473684, "grad_norm": 0.17261233925819397, "learning_rate": 1.0920848638748748e-05, "loss": 0.7187601327896118, "step": 140 }, { "epoch": 1.0676691729323309, "grad_norm": 0.13400448858737946, "learning_rate": 1.0512224182575395e-05, "loss": 0.7740556597709656, "step": 142 }, { "epoch": 1.0827067669172932, "grad_norm": 0.3450721502304077, "learning_rate": 1.0112985836062175e-05, "loss": 0.969446063041687, "step": 144 }, { "epoch": 1.0977443609022557, "grad_norm": 0.2375430464744568, "learning_rate": 9.723689761213051e-06, "loss": 0.9913895130157471, "step": 146 }, { "epoch": 1.112781954887218, "grad_norm": 0.15823714435100555, "learning_rate": 9.34487826987512e-06, "loss": 0.925875186920166, "step": 148 }, { "epoch": 1.1278195488721805, "grad_norm": 0.23397411406040192, "learning_rate": 8.97707906826694e-06, "loss": 0.8095348477363586, "step": 150 }, { "epoch": 1.1428571428571428, "grad_norm": 0.23221950232982635, "learning_rate": 8.620804521853441e-06, "loss": 0.9493626952171326, "step": 152 }, { "epoch": 1.1578947368421053, "grad_norm": 0.1775089055299759, "learning_rate": 8.27655094159128e-06, "loss": 0.9873220920562744, "step": 154 }, { "epoch": 1.1729323308270676, "grad_norm": 0.22347131371498108, "learning_rate": 7.944797892539146e-06, "loss": 0.9379909634590149, "step": 156 }, { "epoch": 1.1879699248120301, "grad_norm": 0.17521372437477112, "learning_rate": 7.626007525795976e-06, "loss": 0.9363319277763367, "step": 158 }, { "epoch": 1.2030075187969924, "grad_norm": 0.5944448113441467, "learning_rate": 7.320623934697899e-06, "loss": 0.5006011724472046, "step": 160 }, { "epoch": 1.218045112781955, "grad_norm": 0.21691644191741943, "learning_rate": 7.029072536170642e-06, "loss": 0.877805233001709, "step": 162 }, { "epoch": 1.2330827067669172, "grad_norm": 0.19128523766994476, "learning_rate": 6.751759478099246e-06, "loss": 1.0667612552642822, "step": 164 }, { "epoch": 1.2481203007518797, "grad_norm": 0.32106316089630127, "learning_rate": 6.489071073540686e-06, "loss": 0.8215808868408203, "step": 166 }, { "epoch": 1.263157894736842, "grad_norm": 0.1843944638967514, "learning_rate": 6.241373262567537e-06, "loss": 0.6570966243743896, "step": 168 }, { "epoch": 1.2781954887218046, "grad_norm": 0.24442797899246216, "learning_rate": 6.009011102492393e-06, "loss": 0.7164343595504761, "step": 170 }, { "epoch": 1.2932330827067668, "grad_norm": 0.17113906145095825, "learning_rate": 5.7923082871831375e-06, "loss": 0.8579428791999817, "step": 172 }, { "epoch": 1.3082706766917294, "grad_norm": 0.24567635357379913, "learning_rate": 5.591566696138772e-06, "loss": 0.8993586301803589, "step": 174 }, { "epoch": 1.3233082706766917, "grad_norm": 0.19532179832458496, "learning_rate": 5.407065973953888e-06, "loss": 0.6733898520469666, "step": 176 }, { "epoch": 1.3383458646616542, "grad_norm": 0.3204295337200165, "learning_rate": 5.239063140757639e-06, "loss": 0.6425676345825195, "step": 178 }, { "epoch": 1.3533834586466165, "grad_norm": 0.22146816551685333, "learning_rate": 5.0877922341699066e-06, "loss": 1.1642075777053833, "step": 180 }, { "epoch": 1.368421052631579, "grad_norm": 0.2698806822299957, "learning_rate": 4.953463983273412e-06, "loss": 0.9253040552139282, "step": 182 }, { "epoch": 1.3834586466165413, "grad_norm": 0.2668271064758301, "learning_rate": 4.836265515055985e-06, "loss": 0.7467199563980103, "step": 184 }, { "epoch": 1.3984962406015038, "grad_norm": 0.17395268380641937, "learning_rate": 4.736360093731884e-06, "loss": 1.0783255100250244, "step": 186 }, { "epoch": 1.413533834586466, "grad_norm": 0.20097728073596954, "learning_rate": 4.653886893305353e-06, "loss": 0.5329846143722534, "step": 188 }, { "epoch": 1.4285714285714286, "grad_norm": 0.2769680917263031, "learning_rate": 4.588960803693209e-06, "loss": 0.8872597813606262, "step": 190 }, { "epoch": 1.443609022556391, "grad_norm": 0.1900765597820282, "learning_rate": 4.5416722706765875e-06, "loss": 0.653458297252655, "step": 192 }, { "epoch": 1.4586466165413534, "grad_norm": 0.299067884683609, "learning_rate": 4.512087169904754e-06, "loss": 0.7420106530189514, "step": 194 }, { "epoch": 1.4736842105263157, "grad_norm": 0.19370242953300476, "learning_rate": 4.500246715126523e-06, "loss": 0.8862230181694031, "step": 196 }, { "epoch": 1.4887218045112782, "grad_norm": 0.17850318551063538, "learning_rate": 4.506167400777152e-06, "loss": 0.8613809943199158, "step": 198 }, { "epoch": 1.5037593984962405, "grad_norm": 0.17290696501731873, "learning_rate": 4.52984097900063e-06, "loss": 0.8784961104393005, "step": 200 }, { "epoch": 1.518796992481203, "grad_norm": 0.3493019938468933, "learning_rate": 4.5712344711394154e-06, "loss": 1.2700152397155762, "step": 202 }, { "epoch": 1.5338345864661656, "grad_norm": 0.42357122898101807, "learning_rate": 4.630290213675614e-06, "loss": 0.9580332636833191, "step": 204 }, { "epoch": 1.5488721804511278, "grad_norm": 1.1557518243789673, "learning_rate": 4.706925938559573e-06, "loss": 0.7860268354415894, "step": 206 }, { "epoch": 1.5639097744360901, "grad_norm": 0.28890499472618103, "learning_rate": 4.801034887814009e-06, "loss": 0.9093602895736694, "step": 208 }, { "epoch": 1.5789473684210527, "grad_norm": 0.5986707210540771, "learning_rate": 4.912485962254024e-06, "loss": 0.8598864674568176, "step": 210 }, { "epoch": 1.5939849624060152, "grad_norm": 0.17273662984371185, "learning_rate": 5.04112390411581e-06, "loss": 0.5818964242935181, "step": 212 }, { "epoch": 1.6090225563909775, "grad_norm": 0.21565358340740204, "learning_rate": 5.186769513339663e-06, "loss": 0.884915292263031, "step": 214 }, { "epoch": 1.6240601503759398, "grad_norm": 0.26930728554725647, "learning_rate": 5.349219897205977e-06, "loss": 0.9705126881599426, "step": 216 }, { "epoch": 1.6390977443609023, "grad_norm": 0.1285410076379776, "learning_rate": 5.5282487529764855e-06, "loss": 0.7298458218574524, "step": 218 }, { "epoch": 1.6541353383458648, "grad_norm": 0.21168453991413116, "learning_rate": 5.7236066831470105e-06, "loss": 0.7564178109169006, "step": 220 }, { "epoch": 1.669172932330827, "grad_norm": 0.5188248753547668, "learning_rate": 5.935021542872539e-06, "loss": 0.8646745681762695, "step": 222 }, { "epoch": 1.6842105263157894, "grad_norm": 0.24589960277080536, "learning_rate": 6.162198819080668e-06, "loss": 0.699385941028595, "step": 224 }, { "epoch": 1.699248120300752, "grad_norm": 0.3245724141597748, "learning_rate": 6.404822040745263e-06, "loss": 1.0145379304885864, "step": 226 }, { "epoch": 1.7142857142857144, "grad_norm": 0.24885950982570648, "learning_rate": 6.662553219748833e-06, "loss": 0.7830167412757874, "step": 228 }, { "epoch": 1.7293233082706767, "grad_norm": 0.38476553559303284, "learning_rate": 6.935033321719419e-06, "loss": 0.9040583372116089, "step": 230 }, { "epoch": 1.744360902255639, "grad_norm": 0.3446056544780731, "learning_rate": 7.2218827661861725e-06, "loss": 1.0128272771835327, "step": 232 }, { "epoch": 1.7593984962406015, "grad_norm": 0.15947787463665009, "learning_rate": 7.522701955356779e-06, "loss": 0.9765535593032837, "step": 234 }, { "epoch": 1.774436090225564, "grad_norm": 0.2729548513889313, "learning_rate": 7.837071830780217e-06, "loss": 1.0480151176452637, "step": 236 }, { "epoch": 1.7894736842105263, "grad_norm": 0.3705751597881317, "learning_rate": 8.164554457119286e-06, "loss": 0.5091387033462524, "step": 238 }, { "epoch": 1.8045112781954886, "grad_norm": 0.3041347563266754, "learning_rate": 8.504693632219755e-06, "loss": 0.9318640232086182, "step": 240 }, { "epoch": 1.8195488721804511, "grad_norm": 0.3558288812637329, "learning_rate": 8.857015522626238e-06, "loss": 0.6968544721603394, "step": 242 }, { "epoch": 1.8345864661654137, "grad_norm": 0.2972772717475891, "learning_rate": 9.221029323659478e-06, "loss": 1.047217607498169, "step": 244 }, { "epoch": 1.849624060150376, "grad_norm": 0.1722293198108673, "learning_rate": 9.596227943135503e-06, "loss": 1.0219006538391113, "step": 246 }, { "epoch": 1.8646616541353382, "grad_norm": 1.3821203708648682, "learning_rate": 9.982088707774262e-06, "loss": 0.9025890827178955, "step": 248 }, { "epoch": 1.8796992481203008, "grad_norm": 0.15428081154823303, "learning_rate": 1.0378074091313615e-05, "loss": 1.1540802717208862, "step": 250 }, { "epoch": 1.8947368421052633, "grad_norm": 0.16087524592876434, "learning_rate": 1.0783632463314283e-05, "loss": 0.9519784450531006, "step": 252 }, { "epoch": 1.9097744360902256, "grad_norm": 0.1820048987865448, "learning_rate": 1.1198198857612926e-05, "loss": 1.1188257932662964, "step": 254 }, { "epoch": 1.9248120300751879, "grad_norm": 0.8227013349533081, "learning_rate": 1.1621195759352438e-05, "loss": 0.8220981955528259, "step": 256 }, { "epoch": 1.9398496240601504, "grad_norm": 0.39509618282318115, "learning_rate": 1.2052033909493471e-05, "loss": 0.5426740646362305, "step": 258 }, { "epoch": 1.954887218045113, "grad_norm": 0.2340533286333084, "learning_rate": 1.2490113125686138e-05, "loss": 0.8964567184448242, "step": 260 }, { "epoch": 1.9699248120300752, "grad_norm": 0.1862659901380539, "learning_rate": 1.2934823138358649e-05, "loss": 0.9239405393600464, "step": 262 }, { "epoch": 1.9849624060150375, "grad_norm": 0.21919912099838257, "learning_rate": 1.338554444085792e-05, "loss": 0.8809694051742554, "step": 264 }, { "epoch": 2.0, "grad_norm": 0.2186099886894226, "learning_rate": 1.3841649152458003e-05, "loss": 0.7980599403381348, "step": 266 }, { "epoch": 2.0150375939849625, "grad_norm": 0.38081610202789307, "learning_rate": 1.430250189303413e-05, "loss": 0.463468998670578, "step": 268 }, { "epoch": 2.030075187969925, "grad_norm": 0.2002028077840805, "learning_rate": 1.4767460668183795e-05, "loss": 0.6159178614616394, "step": 270 }, { "epoch": 2.045112781954887, "grad_norm": 0.21730108559131622, "learning_rate": 1.523587776356188e-05, "loss": 0.46370548009872437, "step": 272 }, { "epoch": 2.0601503759398496, "grad_norm": 1.0244005918502808, "learning_rate": 1.5707100647184093e-05, "loss": 0.897263765335083, "step": 274 }, { "epoch": 2.075187969924812, "grad_norm": 0.1658545732498169, "learning_rate": 1.6180472878441575e-05, "loss": 0.7874804735183716, "step": 276 }, { "epoch": 2.090225563909774, "grad_norm": 0.47336888313293457, "learning_rate": 1.6655335022560423e-05, "loss": 0.7593191266059875, "step": 278 }, { "epoch": 2.1052631578947367, "grad_norm": 0.3983185589313507, "learning_rate": 1.7131025569232362e-05, "loss": 0.8093394637107849, "step": 280 }, { "epoch": 2.1203007518796992, "grad_norm": 0.5871224999427795, "learning_rate": 1.7606881854136644e-05, "loss": 0.8642159104347229, "step": 282 }, { "epoch": 2.1353383458646618, "grad_norm": 0.16395071148872375, "learning_rate": 1.8082240982069634e-05, "loss": 0.5777812004089355, "step": 284 }, { "epoch": 2.1503759398496243, "grad_norm": 0.266190767288208, "learning_rate": 1.8556440750395985e-05, "loss": 0.8966842889785767, "step": 286 }, { "epoch": 2.1654135338345863, "grad_norm": 0.26495200395584106, "learning_rate": 1.9028820571535015e-05, "loss": 1.0453461408615112, "step": 288 }, { "epoch": 2.180451127819549, "grad_norm": 0.19011439383029938, "learning_rate": 1.949872239319729e-05, "loss": 1.0706809759140015, "step": 290 }, { "epoch": 2.1954887218045114, "grad_norm": 0.25460541248321533, "learning_rate": 1.996549161508929e-05, "loss": 0.6951987743377686, "step": 292 }, { "epoch": 2.2105263157894735, "grad_norm": 0.2636259198188782, "learning_rate": 2.042847800080939e-05, "loss": 0.8458771705627441, "step": 294 }, { "epoch": 2.225563909774436, "grad_norm": 0.3756290674209595, "learning_rate": 2.0887036583664505e-05, "loss": 0.3105054199695587, "step": 296 }, { "epoch": 2.2406015037593985, "grad_norm": 0.23921579122543335, "learning_rate": 2.1340528565145932e-05, "loss": 1.1170181035995483, "step": 298 }, { "epoch": 2.255639097744361, "grad_norm": 0.2471323013305664, "learning_rate": 2.1788322204812397e-05, "loss": 0.9951118230819702, "step": 300 }, { "epoch": 2.2706766917293235, "grad_norm": 0.2469598799943924, "learning_rate": 2.2229793700340833e-05, "loss": 1.0403016805648804, "step": 302 }, { "epoch": 2.2857142857142856, "grad_norm": 0.34822267293930054, "learning_rate": 2.2664328056519028e-05, "loss": 0.7423543334007263, "step": 304 }, { "epoch": 2.300751879699248, "grad_norm": 0.5992878079414368, "learning_rate": 2.3091319941969266e-05, "loss": 0.7819874286651611, "step": 306 }, { "epoch": 2.3157894736842106, "grad_norm": 0.31358832120895386, "learning_rate": 2.3510174532409867e-05, "loss": 1.109780192375183, "step": 308 }, { "epoch": 2.3308270676691727, "grad_norm": 0.656645655632019, "learning_rate": 2.392030833927959e-05, "loss": 0.4651540219783783, "step": 310 }, { "epoch": 2.345864661654135, "grad_norm": 0.20808559656143188, "learning_rate": 2.4321150022570873e-05, "loss": 0.8532482385635376, "step": 312 }, { "epoch": 2.3609022556390977, "grad_norm": 0.20093803107738495, "learning_rate": 2.471214118673929e-05, "loss": 0.568276584148407, "step": 314 }, { "epoch": 2.3759398496240602, "grad_norm": 0.2839231491088867, "learning_rate": 2.509273715858074e-05, "loss": 0.9199910163879395, "step": 316 }, { "epoch": 2.3909774436090228, "grad_norm": 0.19820740818977356, "learning_rate": 2.546240774599257e-05, "loss": 0.8895071744918823, "step": 318 }, { "epoch": 2.406015037593985, "grad_norm": 0.26256436109542847, "learning_rate": 2.582063797656167e-05, "loss": 1.0534682273864746, "step": 320 }, { "epoch": 2.4210526315789473, "grad_norm": 0.18550805747509003, "learning_rate": 2.6166928814950743e-05, "loss": 1.1147539615631104, "step": 322 }, { "epoch": 2.43609022556391, "grad_norm": 0.509353518486023, "learning_rate": 2.6500797858083262e-05, "loss": 0.9222637414932251, "step": 324 }, { "epoch": 2.451127819548872, "grad_norm": 0.18731118738651276, "learning_rate": 2.682178000715866e-05, "loss": 1.0500245094299316, "step": 326 }, { "epoch": 2.4661654135338344, "grad_norm": 0.29536113142967224, "learning_rate": 2.712942811556184e-05, "loss": 0.8539433479309082, "step": 328 }, { "epoch": 2.481203007518797, "grad_norm": 0.3995274007320404, "learning_rate": 2.7423313611764086e-05, "loss": 0.6855474710464478, "step": 330 }, { "epoch": 2.4962406015037595, "grad_norm": 0.23789720237255096, "learning_rate": 2.77030270963479e-05, "loss": 1.0410560369491577, "step": 332 }, { "epoch": 2.511278195488722, "grad_norm": 0.21356765925884247, "learning_rate": 2.796817891232397e-05, "loss": 0.6004407405853271, "step": 334 }, { "epoch": 2.526315789473684, "grad_norm": 1.3617724180221558, "learning_rate": 2.8218399687945758e-05, "loss": 0.7526741027832031, "step": 336 }, { "epoch": 2.5413533834586466, "grad_norm": 0.6006386876106262, "learning_rate": 2.8453340851265676e-05, "loss": 0.6869713664054871, "step": 338 }, { "epoch": 2.556390977443609, "grad_norm": 0.3373733460903168, "learning_rate": 2.8672675115715806e-05, "loss": 1.0165461301803589, "step": 340 }, { "epoch": 2.571428571428571, "grad_norm": 0.5051329731941223, "learning_rate": 2.887609693603699e-05, "loss": 0.9631428718566895, "step": 342 }, { "epoch": 2.5864661654135337, "grad_norm": 0.3386491537094116, "learning_rate": 2.906332293392093e-05, "loss": 0.8245996236801147, "step": 344 }, { "epoch": 2.601503759398496, "grad_norm": 0.21825748682022095, "learning_rate": 2.92340922927725e-05, "loss": 0.5915822386741638, "step": 346 }, { "epoch": 2.6165413533834587, "grad_norm": 0.24130862951278687, "learning_rate": 2.9388167121042307e-05, "loss": 0.7320323586463928, "step": 348 }, { "epoch": 2.6315789473684212, "grad_norm": 0.5413809418678284, "learning_rate": 2.952533278362327e-05, "loss": 0.8300567269325256, "step": 350 }, { "epoch": 2.6466165413533833, "grad_norm": 0.25651729106903076, "learning_rate": 2.9645398200849713e-05, "loss": 0.7235583066940308, "step": 352 }, { "epoch": 2.661654135338346, "grad_norm": 0.15655282139778137, "learning_rate": 2.9748196114682335e-05, "loss": 1.0085736513137817, "step": 354 }, { "epoch": 2.6766917293233083, "grad_norm": 0.7222322225570679, "learning_rate": 2.983358332170829e-05, "loss": 0.7790261507034302, "step": 356 }, { "epoch": 2.6917293233082704, "grad_norm": 0.21224772930145264, "learning_rate": 2.9901440872631778e-05, "loss": 0.42803671956062317, "step": 358 }, { "epoch": 2.706766917293233, "grad_norm": 0.22365406155586243, "learning_rate": 2.9951674237977273e-05, "loss": 1.0629819631576538, "step": 360 }, { "epoch": 2.7218045112781954, "grad_norm": 0.2804076373577118, "learning_rate": 2.998421343977452e-05, "loss": 0.550415575504303, "step": 362 }, { "epoch": 2.736842105263158, "grad_norm": 0.2370826154947281, "learning_rate": 2.9999013149041885e-05, "loss": 0.721561074256897, "step": 364 }, { "epoch": 2.7518796992481205, "grad_norm": 0.835011899471283, "learning_rate": 2.999605274893222e-05, "loss": 0.8219574689865112, "step": 366 }, { "epoch": 2.7669172932330826, "grad_norm": 0.14573420584201813, "learning_rate": 2.9975336363453326e-05, "loss": 0.7218166589736938, "step": 368 }, { "epoch": 2.781954887218045, "grad_norm": 3.228212356567383, "learning_rate": 2.993689285172299e-05, "loss": 0.8398270010948181, "step": 370 }, { "epoch": 2.7969924812030076, "grad_norm": 0.5311354994773865, "learning_rate": 2.9880775767766535e-05, "loss": 0.8649424314498901, "step": 372 }, { "epoch": 2.8120300751879697, "grad_norm": 0.44514158368110657, "learning_rate": 2.980706328591302e-05, "loss": 0.7094336152076721, "step": 374 }, { "epoch": 2.827067669172932, "grad_norm": 0.41514015197753906, "learning_rate": 2.971585809189387e-05, "loss": 0.9906347393989563, "step": 376 }, { "epoch": 2.8421052631578947, "grad_norm": 0.14243760704994202, "learning_rate": 2.9607287239795747e-05, "loss": 1.0890015363693237, "step": 378 }, { "epoch": 2.857142857142857, "grad_norm": 0.4031289517879486, "learning_rate": 2.94815019750669e-05, "loss": 0.7638394832611084, "step": 380 }, { "epoch": 2.8721804511278197, "grad_norm": 0.2597931921482086, "learning_rate": 2.933867752382353e-05, "loss": 0.9143038392066956, "step": 382 }, { "epoch": 2.887218045112782, "grad_norm": 0.3925493061542511, "learning_rate": 2.917901284874975e-05, "loss": 0.9326249957084656, "step": 384 }, { "epoch": 2.9022556390977443, "grad_norm": 0.31625744700431824, "learning_rate": 2.9002730371931074e-05, "loss": 0.6936108469963074, "step": 386 }, { "epoch": 2.917293233082707, "grad_norm": 0.2891203463077545, "learning_rate": 2.881007566500768e-05, "loss": 0.9043726921081543, "step": 388 }, { "epoch": 2.932330827067669, "grad_norm": 0.30182725191116333, "learning_rate": 2.8601317107078944e-05, "loss": 0.8188687562942505, "step": 390 }, { "epoch": 2.9473684210526314, "grad_norm": 0.43725159764289856, "learning_rate": 2.8376745510835926e-05, "loss": 0.9015698432922363, "step": 392 }, { "epoch": 2.962406015037594, "grad_norm": 0.39825642108917236, "learning_rate": 2.813667371744254e-05, "loss": 0.7247455716133118, "step": 394 }, { "epoch": 2.9774436090225564, "grad_norm": 0.15052802860736847, "learning_rate": 2.7881436160729783e-05, "loss": 0.9713034629821777, "step": 396 }, { "epoch": 2.992481203007519, "grad_norm": 0.4360320270061493, "learning_rate": 2.7611388401310196e-05, "loss": 0.7928329706192017, "step": 398 }, { "epoch": 3.007518796992481, "grad_norm": 0.20822873711585999, "learning_rate": 2.7326906631261394e-05, "loss": 0.7827808856964111, "step": 400 }, { "epoch": 3.0225563909774436, "grad_norm": 0.09618931263685226, "learning_rate": 2.7028387150068913e-05, "loss": 0.6030799150466919, "step": 402 }, { "epoch": 3.037593984962406, "grad_norm": 0.3093872666358948, "learning_rate": 2.6716245812558134e-05, "loss": 0.7962419390678406, "step": 404 }, { "epoch": 3.0526315789473686, "grad_norm": 0.3837755024433136, "learning_rate": 2.6390917449584653e-05, "loss": 0.6203740239143372, "step": 406 }, { "epoch": 3.0676691729323307, "grad_norm": 0.14874151349067688, "learning_rate": 2.605285526228978e-05, "loss": 0.51124107837677, "step": 408 }, { "epoch": 3.082706766917293, "grad_norm": 0.21628743410110474, "learning_rate": 2.570253019076529e-05, "loss": 0.7276190519332886, "step": 410 }, { "epoch": 3.0977443609022557, "grad_norm": 0.284242182970047, "learning_rate": 2.5340430258006786e-05, "loss": 0.5125940442085266, "step": 412 }, { "epoch": 3.112781954887218, "grad_norm": 0.25154373049736023, "learning_rate": 2.496705989006952e-05, "loss": 0.8815844058990479, "step": 414 }, { "epoch": 3.1278195488721803, "grad_norm": 0.14138440787792206, "learning_rate": 2.4582939213373886e-05, "loss": 0.37600424885749817, "step": 416 }, { "epoch": 3.142857142857143, "grad_norm": 0.3758879005908966, "learning_rate": 2.4188603330139344e-05, "loss": 0.6696433424949646, "step": 418 }, { "epoch": 3.1578947368421053, "grad_norm": 0.16141177713871002, "learning_rate": 2.378460157295626e-05, "loss": 0.6787968277931213, "step": 420 }, { "epoch": 3.172932330827068, "grad_norm": 0.29796668887138367, "learning_rate": 2.3371496739533913e-05, "loss": 0.5915691256523132, "step": 422 }, { "epoch": 3.18796992481203, "grad_norm": 0.4774704575538635, "learning_rate": 2.294986430869094e-05, "loss": 0.733458399772644, "step": 424 }, { "epoch": 3.2030075187969924, "grad_norm": 0.36931291222572327, "learning_rate": 2.252029163868019e-05, "loss": 0.6868959069252014, "step": 426 }, { "epoch": 3.218045112781955, "grad_norm": 0.45511841773986816, "learning_rate": 2.208337714896483e-05, "loss": 0.569706380367279, "step": 428 }, { "epoch": 3.2330827067669174, "grad_norm": 1.0468262434005737, "learning_rate": 2.1639729486585647e-05, "loss": 0.4343474209308624, "step": 430 }, { "epoch": 3.2481203007518795, "grad_norm": 0.131326824426651, "learning_rate": 2.1189966678280585e-05, "loss": 0.4525618553161621, "step": 432 }, { "epoch": 3.263157894736842, "grad_norm": 0.1383962780237198, "learning_rate": 2.0734715269537963e-05, "loss": 0.44801121950149536, "step": 434 }, { "epoch": 3.2781954887218046, "grad_norm": 0.34697940945625305, "learning_rate": 2.0274609451782568e-05, "loss": 0.42984333634376526, "step": 436 }, { "epoch": 3.293233082706767, "grad_norm": 0.10286783427000046, "learning_rate": 1.9810290178910406e-05, "loss": 0.4518528878688812, "step": 438 }, { "epoch": 3.308270676691729, "grad_norm": 0.18340881168842316, "learning_rate": 1.934240427440311e-05, "loss": 0.9285587072372437, "step": 440 }, { "epoch": 3.3233082706766917, "grad_norm": 0.18978752195835114, "learning_rate": 1.8871603530265477e-05, "loss": 0.39083921909332275, "step": 442 }, { "epoch": 3.338345864661654, "grad_norm": 0.21235691010951996, "learning_rate": 1.8398543799041773e-05, "loss": 0.6497979760169983, "step": 444 }, { "epoch": 3.3533834586466167, "grad_norm": 0.9397839903831482, "learning_rate": 1.792388408017536e-05, "loss": 0.5017030239105225, "step": 446 }, { "epoch": 3.3684210526315788, "grad_norm": 0.23365262150764465, "learning_rate": 1.744828560198448e-05, "loss": 0.7379826903343201, "step": 448 }, { "epoch": 3.3834586466165413, "grad_norm": 0.42739665508270264, "learning_rate": 1.697241090053319e-05, "loss": 0.7720116972923279, "step": 450 }, { "epoch": 3.398496240601504, "grad_norm": 0.2359744757413864, "learning_rate": 1.6496922896680423e-05, "loss": 0.7877475619316101, "step": 452 }, { "epoch": 3.4135338345864663, "grad_norm": 0.4221789240837097, "learning_rate": 1.6022483972593128e-05, "loss": 0.7371859550476074, "step": 454 }, { "epoch": 3.4285714285714284, "grad_norm": 0.46123459935188293, "learning_rate": 1.5549755049009714e-05, "loss": 0.731837809085846, "step": 456 }, { "epoch": 3.443609022556391, "grad_norm": 0.20335260033607483, "learning_rate": 1.5079394664539421e-05, "loss": 0.48273712396621704, "step": 458 }, { "epoch": 3.4586466165413534, "grad_norm": 0.2316899299621582, "learning_rate": 1.4612058058280153e-05, "loss": 0.7381947636604309, "step": 460 }, { "epoch": 3.473684210526316, "grad_norm": 0.3751467168331146, "learning_rate": 1.4148396257032674e-05, "loss": 0.769965648651123, "step": 462 }, { "epoch": 3.488721804511278, "grad_norm": 0.5033459663391113, "learning_rate": 1.3689055168382717e-05, "loss": 0.6628371477127075, "step": 464 }, { "epoch": 3.5037593984962405, "grad_norm": 0.30648085474967957, "learning_rate": 1.3234674680914651e-05, "loss": 0.7021836638450623, "step": 466 }, { "epoch": 3.518796992481203, "grad_norm": 0.19549153745174408, "learning_rate": 1.2785887772809783e-05, "loss": 0.5976605415344238, "step": 468 }, { "epoch": 3.5338345864661656, "grad_norm": 0.47317057847976685, "learning_rate": 1.2343319630071227e-05, "loss": 0.678418755531311, "step": 470 }, { "epoch": 3.548872180451128, "grad_norm": 0.3564242720603943, "learning_rate": 1.1907586775603957e-05, "loss": 0.6626768708229065, "step": 472 }, { "epoch": 3.56390977443609, "grad_norm": 0.33226093649864197, "learning_rate": 1.147929621036279e-05, "loss": 0.7116915583610535, "step": 474 }, { "epoch": 3.5789473684210527, "grad_norm": 0.3433665931224823, "learning_rate": 1.1059044567765164e-05, "loss": 0.36730286478996277, "step": 476 }, { "epoch": 3.593984962406015, "grad_norm": 0.17942893505096436, "learning_rate": 1.0647417282546353e-05, "loss": 0.3575655221939087, "step": 478 }, { "epoch": 3.6090225563909772, "grad_norm": 0.14913895726203918, "learning_rate": 1.024498777521529e-05, "loss": 0.751462996006012, "step": 480 }, { "epoch": 3.6240601503759398, "grad_norm": 0.6876167058944702, "learning_rate": 9.852316653246724e-06, "loss": 0.7515479922294617, "step": 482 }, { "epoch": 3.6390977443609023, "grad_norm": 0.30825933814048767, "learning_rate": 9.469950930122665e-06, "loss": 0.6766018867492676, "step": 484 }, { "epoch": 3.654135338345865, "grad_norm": 0.3747425675392151, "learning_rate": 9.098423263311226e-06, "loss": 0.3269270956516266, "step": 486 }, { "epoch": 3.6691729323308273, "grad_norm": 0.19600830972194672, "learning_rate": 8.738251212244036e-06, "loss": 0.6345582008361816, "step": 488 }, { "epoch": 3.6842105263157894, "grad_norm": 0.21161755919456482, "learning_rate": 8.389936517326165e-06, "loss": 0.8583235144615173, "step": 490 }, { "epoch": 3.699248120300752, "grad_norm": 2.3246822357177734, "learning_rate": 8.053964400982803e-06, "loss": 0.7647910714149475, "step": 492 }, { "epoch": 3.7142857142857144, "grad_norm": 0.3167845606803894, "learning_rate": 7.730802891716579e-06, "loss": 0.3876282870769501, "step": 494 }, { "epoch": 3.7293233082706765, "grad_norm": 0.16611672937870026, "learning_rate": 7.420902172116848e-06, "loss": 0.8268077969551086, "step": 496 }, { "epoch": 3.744360902255639, "grad_norm": 0.27863407135009766, "learning_rate": 7.124693951729393e-06, "loss": 0.9286668300628662, "step": 498 }, { "epoch": 3.7593984962406015, "grad_norm": 0.37909525632858276, "learning_rate": 6.842590865660255e-06, "loss": 0.6480289101600647, "step": 500 }, { "epoch": 3.774436090225564, "grad_norm": 0.2283431440591812, "learning_rate": 6.574985899751219e-06, "loss": 0.576987624168396, "step": 502 }, { "epoch": 3.7894736842105265, "grad_norm": 0.6403146982192993, "learning_rate": 6.322251843127883e-06, "loss": 0.5665578842163086, "step": 504 }, { "epoch": 3.8045112781954886, "grad_norm": 0.24255450069904327, "learning_rate": 6.0847407688830226e-06, "loss": 0.44220831990242004, "step": 506 }, { "epoch": 3.819548872180451, "grad_norm": 0.17681249976158142, "learning_rate": 5.862783543618414e-06, "loss": 0.6706622242927551, "step": 508 }, { "epoch": 3.8345864661654137, "grad_norm": 0.6901529431343079, "learning_rate": 5.65668936652867e-06, "loss": 0.45284244418144226, "step": 510 }, { "epoch": 3.8496240601503757, "grad_norm": 0.2059166431427002, "learning_rate": 5.466745338668931e-06, "loss": 0.6849936246871948, "step": 512 }, { "epoch": 3.8646616541353382, "grad_norm": 0.17384979128837585, "learning_rate": 5.293216063006581e-06, "loss": 0.6412226557731628, "step": 514 }, { "epoch": 3.8796992481203008, "grad_norm": 1.0503121614456177, "learning_rate": 5.136343275814039e-06, "loss": 0.8608755469322205, "step": 516 }, { "epoch": 3.8947368421052633, "grad_norm": 0.19333027303218842, "learning_rate": 4.9963455099162615e-06, "loss": 0.5098147392272949, "step": 518 }, { "epoch": 3.909774436090226, "grad_norm": 0.14984972774982452, "learning_rate": 4.8734177902619205e-06, "loss": 0.7260234951972961, "step": 520 }, { "epoch": 3.924812030075188, "grad_norm": 0.14300547540187836, "learning_rate": 4.7677313622423905e-06, "loss": 0.8742654919624329, "step": 522 }, { "epoch": 3.9398496240601504, "grad_norm": 0.1549258679151535, "learning_rate": 4.6794334531371056e-06, "loss": 0.9179413318634033, "step": 524 }, { "epoch": 3.954887218045113, "grad_norm": 3.7465577125549316, "learning_rate": 4.608647067017448e-06, "loss": 0.8616862297058105, "step": 526 }, { "epoch": 3.969924812030075, "grad_norm": 0.1470378190279007, "learning_rate": 4.555470813395014e-06, "loss": 0.5497387647628784, "step": 528 }, { "epoch": 3.9849624060150375, "grad_norm": 0.2949855327606201, "learning_rate": 4.519978769852865e-06, "loss": 0.42557334899902344, "step": 530 }, { "epoch": 4.0, "grad_norm": 0.2642301321029663, "learning_rate": 4.502220378851213e-06, "loss": 0.6198008060455322, "step": 532 }, { "epoch": 4.0, "step": 532, "total_flos": 3.873354436822696e+18, "train_loss": 0.8859393315431767, "train_runtime": 11385.0199, "train_samples_per_second": 5.607, "train_steps_per_second": 0.047 } ], "logging_steps": 2, "max_steps": 532, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.873354436822696e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }