{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 100,
  "global_step": 478,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0041841004184100415,
      "grad_norm": 41.53185103984857,
      "learning_rate": 4.1666666666666667e-07,
      "loss": 0.4749,
      "step": 1
    },
    {
      "epoch": 0.02092050209205021,
      "grad_norm": 38.30127983273649,
      "learning_rate": 2.0833333333333334e-06,
      "loss": 0.45,
      "step": 5
    },
    {
      "epoch": 0.04184100418410042,
      "grad_norm": 18.219981518260038,
      "learning_rate": 4.166666666666667e-06,
      "loss": 0.4009,
      "step": 10
    },
    {
      "epoch": 0.06276150627615062,
      "grad_norm": 19.993422812493172,
      "learning_rate": 6.25e-06,
      "loss": 0.2968,
      "step": 15
    },
    {
      "epoch": 0.08368200836820083,
      "grad_norm": 2.728691896454265,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.2389,
      "step": 20
    },
    {
      "epoch": 0.10460251046025104,
      "grad_norm": 1.885623294377404,
      "learning_rate": 1.0416666666666668e-05,
      "loss": 0.2226,
      "step": 25
    },
    {
      "epoch": 0.12552301255230125,
      "grad_norm": 1.884968683506505,
      "learning_rate": 1.25e-05,
      "loss": 0.2151,
      "step": 30
    },
    {
      "epoch": 0.14644351464435146,
      "grad_norm": 1.7305199472110575,
      "learning_rate": 1.4583333333333333e-05,
      "loss": 0.2033,
      "step": 35
    },
    {
      "epoch": 0.16736401673640167,
      "grad_norm": 1.624704502269397,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 0.1962,
      "step": 40
    },
    {
      "epoch": 0.18828451882845187,
      "grad_norm": 1.0713867108877762,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 0.1981,
      "step": 45
    },
    {
      "epoch": 0.20920502092050208,
      "grad_norm": 0.9779064413853102,
      "learning_rate": 1.9998932457674904e-05,
      "loss": 0.1765,
      "step": 50
    },
    {
      "epoch": 0.2301255230125523,
      "grad_norm": 0.9775409995700168,
      "learning_rate": 1.9986925223989665e-05,
      "loss": 0.2064,
      "step": 55
    },
    {
      "epoch": 0.2510460251046025,
      "grad_norm": 0.9714894285452347,
      "learning_rate": 1.996159240342547e-05,
      "loss": 0.2019,
      "step": 60
    },
    {
      "epoch": 0.2719665271966527,
      "grad_norm": 1.0717005150854204,
      "learning_rate": 1.9922967797647357e-05,
      "loss": 0.2068,
      "step": 65
    },
    {
      "epoch": 0.2928870292887029,
      "grad_norm": 0.9560722147267008,
      "learning_rate": 1.9871102943592717e-05,
      "loss": 0.2068,
      "step": 70
    },
    {
      "epoch": 0.3138075313807531,
      "grad_norm": 1.0251690501367547,
      "learning_rate": 1.9806067044705375e-05,
      "loss": 0.206,
      "step": 75
    },
    {
      "epoch": 0.33472803347280333,
      "grad_norm": 0.8930867203257072,
      "learning_rate": 1.9727946878597193e-05,
      "loss": 0.2133,
      "step": 80
    },
    {
      "epoch": 0.35564853556485354,
      "grad_norm": 0.978887003337326,
      "learning_rate": 1.963684668126046e-05,
      "loss": 0.2138,
      "step": 85
    },
    {
      "epoch": 0.37656903765690375,
      "grad_norm": 0.9045993049904391,
      "learning_rate": 1.9532888007985408e-05,
      "loss": 0.2101,
      "step": 90
    },
    {
      "epoch": 0.39748953974895396,
      "grad_norm": 0.8416364765226602,
      "learning_rate": 1.9416209571168648e-05,
      "loss": 0.1986,
      "step": 95
    },
    {
      "epoch": 0.41841004184100417,
      "grad_norm": 0.889013264056005,
      "learning_rate": 1.9286967055228744e-05,
      "loss": 0.2025,
      "step": 100
    },
    {
      "epoch": 0.41841004184100417,
      "eval_loss": 0.2196013182401657,
      "eval_runtime": 4.704,
      "eval_samples_per_second": 63.776,
      "eval_steps_per_second": 2.126,
      "step": 100
    },
    {
      "epoch": 0.4393305439330544,
      "grad_norm": 0.8700387414372234,
      "learning_rate": 1.9145332908875984e-05,
      "loss": 0.2135,
      "step": 105
    },
    {
      "epoch": 0.4602510460251046,
      "grad_norm": 0.9841056505818219,
      "learning_rate": 1.89914961150135e-05,
      "loss": 0.2236,
      "step": 110
    },
    {
      "epoch": 0.4811715481171548,
      "grad_norm": 0.9183128471339689,
      "learning_rate": 1.8825661938576784e-05,
      "loss": 0.1981,
      "step": 115
    },
    {
      "epoch": 0.502092050209205,
      "grad_norm": 0.9064275588483083,
      "learning_rate": 1.864805165264799e-05,
      "loss": 0.2112,
      "step": 120
    },
    {
      "epoch": 0.5230125523012552,
      "grad_norm": 0.9277328098243097,
      "learning_rate": 1.8458902243210558e-05,
      "loss": 0.2108,
      "step": 125
    },
    {
      "epoch": 0.5439330543933054,
      "grad_norm": 0.8390131309861991,
      "learning_rate": 1.8258466092938042e-05,
      "loss": 0.2036,
      "step": 130
    },
    {
      "epoch": 0.5648535564853556,
      "grad_norm": 0.8475693099997708,
      "learning_rate": 1.8047010644439074e-05,
      "loss": 0.2055,
      "step": 135
    },
    {
      "epoch": 0.5857740585774058,
      "grad_norm": 0.8174296090345491,
      "learning_rate": 1.7824818043407828e-05,
      "loss": 0.1965,
      "step": 140
    },
    {
      "epoch": 0.606694560669456,
      "grad_norm": 0.9073247456446722,
      "learning_rate": 1.75921847621561e-05,
      "loss": 0.2189,
      "step": 145
    },
    {
      "epoch": 0.6276150627615062,
      "grad_norm": 0.8921810755026803,
      "learning_rate": 1.7349421204029343e-05,
      "loss": 0.2083,
      "step": 150
    },
    {
      "epoch": 0.6485355648535565,
      "grad_norm": 0.8941222203338465,
      "learning_rate": 1.7096851289234448e-05,
      "loss": 0.1969,
      "step": 155
    },
    {
      "epoch": 0.6694560669456067,
      "grad_norm": 0.804546952924198,
      "learning_rate": 1.6834812022632e-05,
      "loss": 0.1967,
      "step": 160
    },
    {
      "epoch": 0.6903765690376569,
      "grad_norm": 0.9037696185427722,
      "learning_rate": 1.656365304406953e-05,
      "loss": 0.2203,
      "step": 165
    },
    {
      "epoch": 0.7112970711297071,
      "grad_norm": 0.8167977368613387,
      "learning_rate": 1.6283736161855995e-05,
      "loss": 0.2086,
      "step": 170
    },
    {
      "epoch": 0.7322175732217573,
      "grad_norm": 0.9715422112753858,
      "learning_rate": 1.5995434869999723e-05,
      "loss": 0.2079,
      "step": 175
    },
    {
      "epoch": 0.7531380753138075,
      "grad_norm": 0.8466883455587352,
      "learning_rate": 1.5699133849854164e-05,
      "loss": 0.2023,
      "step": 180
    },
    {
      "epoch": 0.7740585774058577,
      "grad_norm": 0.8429311671633376,
      "learning_rate": 1.5395228456836298e-05,
      "loss": 0.2133,
      "step": 185
    },
    {
      "epoch": 0.7949790794979079,
      "grad_norm": 0.7670187426056939,
      "learning_rate": 1.5084124192902612e-05,
      "loss": 0.2111,
      "step": 190
    },
    {
      "epoch": 0.8158995815899581,
      "grad_norm": 0.8839232649853831,
      "learning_rate": 1.4766236165486526e-05,
      "loss": 0.2066,
      "step": 195
    },
    {
      "epoch": 0.8368200836820083,
      "grad_norm": 0.8327080522078059,
      "learning_rate": 1.4441988533619182e-05,
      "loss": 0.2033,
      "step": 200
    },
    {
      "epoch": 0.8368200836820083,
      "eval_loss": 0.21050043404102325,
      "eval_runtime": 4.6998,
      "eval_samples_per_second": 63.833,
      "eval_steps_per_second": 2.128,
      "step": 200
    },
    {
      "epoch": 0.8577405857740585,
      "grad_norm": 0.882685513711128,
      "learning_rate": 1.4111813941972672e-05,
      "loss": 0.2115,
      "step": 205
    },
    {
      "epoch": 0.8786610878661087,
      "grad_norm": 0.7869283802680849,
      "learning_rate": 1.3776152943580846e-05,
      "loss": 0.2032,
      "step": 210
    },
    {
      "epoch": 0.899581589958159,
      "grad_norm": 0.9323518972909932,
      "learning_rate": 1.3435453412007949e-05,
      "loss": 0.2065,
      "step": 215
    },
    {
      "epoch": 0.9205020920502092,
      "grad_norm": 0.8687693359849569,
      "learning_rate": 1.3090169943749475e-05,
      "loss": 0.1992,
      "step": 220
    },
    {
      "epoch": 0.9414225941422594,
      "grad_norm": 0.788589875497053,
      "learning_rate": 1.2740763251662585e-05,
      "loss": 0.2088,
      "step": 225
    },
    {
      "epoch": 0.9623430962343096,
      "grad_norm": 0.8338643090434609,
      "learning_rate": 1.2387699550235419e-05,
      "loss": 0.2134,
      "step": 230
    },
    {
      "epoch": 0.9832635983263598,
      "grad_norm": 0.8945112364204731,
      "learning_rate": 1.2031449933515625e-05,
      "loss": 0.2143,
      "step": 235
    },
    {
      "epoch": 1.00418410041841,
      "grad_norm": 0.7637171354512544,
      "learning_rate": 1.1672489746527979e-05,
      "loss": 0.1895,
      "step": 240
    },
    {
      "epoch": 1.0251046025104602,
      "grad_norm": 0.7512141249987404,
      "learning_rate": 1.1311297951020028e-05,
      "loss": 0.11,
      "step": 245
    },
    {
      "epoch": 1.0460251046025104,
      "grad_norm": 0.831833076591956,
      "learning_rate": 1.0948356486381829e-05,
      "loss": 0.1,
      "step": 250
    },
    {
      "epoch": 1.0669456066945606,
      "grad_norm": 0.7206258555841986,
      "learning_rate": 1.0584149626592662e-05,
      "loss": 0.1037,
      "step": 255
    },
    {
      "epoch": 1.0878661087866108,
      "grad_norm": 0.7085622045220055,
      "learning_rate": 1.0219163334052682e-05,
      "loss": 0.0973,
      "step": 260
    },
    {
      "epoch": 1.108786610878661,
      "grad_norm": 0.7831952386854,
      "learning_rate": 9.853884611161709e-06,
      "loss": 0.0973,
      "step": 265
    },
    {
      "epoch": 1.1297071129707112,
      "grad_norm": 0.8240407306584351,
      "learning_rate": 9.48880085051033e-06,
      "loss": 0.1013,
      "step": 270
    },
    {
      "epoch": 1.1506276150627615,
      "grad_norm": 0.6482126525278735,
      "learning_rate": 9.124399184550377e-06,
      "loss": 0.0918,
      "step": 275
    },
    {
      "epoch": 1.1715481171548117,
      "grad_norm": 0.7359956824319477,
      "learning_rate": 8.76116583561252e-06,
      "loss": 0.1009,
      "step": 280
    },
    {
      "epoch": 1.1924686192468619,
      "grad_norm": 0.7488157028333433,
      "learning_rate": 8.399585467138215e-06,
      "loss": 0.104,
      "step": 285
    },
    {
      "epoch": 1.213389121338912,
      "grad_norm": 0.9216117075330189,
      "learning_rate": 8.040140536991688e-06,
      "loss": 0.1088,
      "step": 290
    },
    {
      "epoch": 1.2343096234309623,
      "grad_norm": 0.7078309437741438,
      "learning_rate": 7.683310653714857e-06,
      "loss": 0.0961,
      "step": 295
    },
    {
      "epoch": 1.2552301255230125,
      "grad_norm": 0.6400397127293158,
      "learning_rate": 7.329571936584072e-06,
      "loss": 0.0953,
      "step": 300
    },
    {
      "epoch": 1.2552301255230125,
      "eval_loss": 0.21791227161884308,
      "eval_runtime": 4.7021,
      "eval_samples_per_second": 63.801,
      "eval_steps_per_second": 2.127,
      "step": 300
    },
    {
      "epoch": 1.2761506276150627,
      "grad_norm": 0.7946545562826351,
      "learning_rate": 6.979396380322621e-06,
      "loss": 0.1012,
      "step": 305
    },
    {
      "epoch": 1.297071129707113,
      "grad_norm": 0.8682581025303368,
      "learning_rate": 6.63325122531663e-06,
      "loss": 0.0969,
      "step": 310
    },
    {
      "epoch": 1.3179916317991631,
      "grad_norm": 0.6258337390507666,
      "learning_rate": 6.291598334174685e-06,
      "loss": 0.0882,
      "step": 315
    },
    {
      "epoch": 1.3389121338912133,
      "grad_norm": 0.6131224054121029,
      "learning_rate": 5.954893575463064e-06,
      "loss": 0.0944,
      "step": 320
    },
    {
      "epoch": 1.3598326359832635,
      "grad_norm": 0.7854353505666302,
      "learning_rate": 5.623586215438813e-06,
      "loss": 0.0958,
      "step": 325
    },
    {
      "epoch": 1.3807531380753137,
      "grad_norm": 0.6627539948719601,
      "learning_rate": 5.298118318592316e-06,
      "loss": 0.0986,
      "step": 330
    },
    {
      "epoch": 1.401673640167364,
      "grad_norm": 0.6727466516033451,
      "learning_rate": 4.978924157799208e-06,
      "loss": 0.0871,
      "step": 335
    },
    {
      "epoch": 1.4225941422594142,
      "grad_norm": 0.837001485916376,
      "learning_rate": 4.666429634868651e-06,
      "loss": 0.0919,
      "step": 340
    },
    {
      "epoch": 1.4435146443514644,
      "grad_norm": 0.6312831436608192,
      "learning_rate": 4.361051712261173e-06,
      "loss": 0.0956,
      "step": 345
    },
    {
      "epoch": 1.4644351464435146,
      "grad_norm": 0.7493453791258211,
      "learning_rate": 4.063197856734295e-06,
      "loss": 0.0973,
      "step": 350
    },
    {
      "epoch": 1.4853556485355648,
      "grad_norm": 0.6718965888684427,
      "learning_rate": 3.773265495658309e-06,
      "loss": 0.0935,
      "step": 355
    },
    {
      "epoch": 1.506276150627615,
      "grad_norm": 0.669199054440991,
      "learning_rate": 3.491641486727645e-06,
      "loss": 0.0934,
      "step": 360
    },
    {
      "epoch": 1.5271966527196654,
      "grad_norm": 0.6071433879794146,
      "learning_rate": 3.2187016017753714e-06,
      "loss": 0.0856,
      "step": 365
    },
    {
      "epoch": 1.5481171548117154,
      "grad_norm": 0.657064753562051,
      "learning_rate": 2.954810025379633e-06,
      "loss": 0.0946,
      "step": 370
    },
    {
      "epoch": 1.5690376569037658,
      "grad_norm": 0.7108276426518368,
      "learning_rate": 2.700318868930977e-06,
      "loss": 0.0908,
      "step": 375
    },
    {
      "epoch": 1.5899581589958158,
      "grad_norm": 0.6801812109704454,
      "learning_rate": 2.455567700808974e-06,
      "loss": 0.0867,
      "step": 380
    },
    {
      "epoch": 1.6108786610878663,
      "grad_norm": 0.7675050726089606,
      "learning_rate": 2.2208830932950175e-06,
      "loss": 0.0947,
      "step": 385
    },
    {
      "epoch": 1.6317991631799162,
      "grad_norm": 0.6895822123846358,
      "learning_rate": 1.996578186825876e-06,
      "loss": 0.0897,
      "step": 390
    },
    {
      "epoch": 1.6527196652719667,
      "grad_norm": 0.7665469889085529,
      "learning_rate": 1.7829522721693738e-06,
      "loss": 0.0942,
      "step": 395
    },
    {
      "epoch": 1.6736401673640167,
      "grad_norm": 0.7844644313627527,
      "learning_rate": 1.5802903910797584e-06,
      "loss": 0.0963,
      "step": 400
    },
    {
      "epoch": 1.6736401673640167,
      "eval_loss": 0.21011365950107574,
      "eval_runtime": 4.7026,
      "eval_samples_per_second": 63.794,
      "eval_steps_per_second": 2.126,
      "step": 400
    },
    {
      "epoch": 1.694560669456067,
      "grad_norm": 0.8226521847258956,
      "learning_rate": 1.3888629559655497e-06,
      "loss": 0.0933,
      "step": 405
    },
    {
      "epoch": 1.715481171548117,
      "grad_norm": 0.7519758533069896,
      "learning_rate": 1.2089253890773789e-06,
      "loss": 0.0931,
      "step": 410
    },
    {
      "epoch": 1.7364016736401675,
      "grad_norm": 0.6928321056405649,
      "learning_rate": 1.0407177816972558e-06,
      "loss": 0.0921,
      "step": 415
    },
    {
      "epoch": 1.7573221757322175,
      "grad_norm": 0.7598493011363774,
      "learning_rate": 8.844645737839874e-07,
      "loss": 0.0836,
      "step": 420
    },
    {
      "epoch": 1.778242677824268,
      "grad_norm": 0.7883660509749922,
      "learning_rate": 7.403742545021986e-07,
      "loss": 0.0957,
      "step": 425
    },
    {
      "epoch": 1.799163179916318,
      "grad_norm": 0.6379425807939199,
      "learning_rate": 6.086390840345758e-07,
      "loss": 0.0855,
      "step": 430
    },
    {
      "epoch": 1.8200836820083683,
      "grad_norm": 0.7094514927837703,
      "learning_rate": 4.894348370484648e-07,
      "loss": 0.0961,
      "step": 435
    },
    {
      "epoch": 1.8410041841004183,
      "grad_norm": 0.7307395509986969,
      "learning_rate": 3.8292056815916965e-07,
      "loss": 0.0906,
      "step": 440
    },
    {
      "epoch": 1.8619246861924688,
      "grad_norm": 0.698574360214073,
      "learning_rate": 2.8923839970285473e-07,
      "loss": 0.0935,
      "step": 445
    },
    {
      "epoch": 1.8828451882845187,
      "grad_norm": 0.6707455680501093,
      "learning_rate": 2.0851333210225032e-07,
      "loss": 0.0883,
      "step": 450
    },
    {
      "epoch": 1.9037656903765692,
      "grad_norm": 0.7392245964188288,
      "learning_rate": 1.408530770781813e-07,
      "loss": 0.093,
      "step": 455
    },
    {
      "epoch": 1.9246861924686192,
      "grad_norm": 0.8076174792599771,
      "learning_rate": 8.634791392946429e-08,
      "loss": 0.0883,
      "step": 460
    },
    {
      "epoch": 1.9456066945606696,
      "grad_norm": 0.7903152829386576,
      "learning_rate": 4.5070569072952485e-08,
      "loss": 0.0896,
      "step": 465
    },
    {
      "epoch": 1.9665271966527196,
      "grad_norm": 0.7075539268549017,
      "learning_rate": 1.7076119004429958e-08,
      "loss": 0.0899,
      "step": 470
    },
    {
      "epoch": 1.98744769874477,
      "grad_norm": 0.6839920154541665,
      "learning_rate": 2.401916809872118e-09,
      "loss": 0.086,
      "step": 475
    },
    {
      "epoch": 2.0,
      "step": 478,
      "total_flos": 29254447685632.0,
      "train_loss": 0.15641464851017278,
      "train_runtime": 1194.3029,
      "train_samples_per_second": 12.777,
      "train_steps_per_second": 0.4
    }
  ],
  "logging_steps": 5,
  "max_steps": 478,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 29254447685632.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
|
|