{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 500,
  "global_step": 908,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04405286343612335,
      "grad_norm": 10.519313963525699,
      "learning_rate": 3.296703296703297e-07,
      "loss": 1.4512,
      "step": 10
    },
    {
      "epoch": 0.0881057268722467,
      "grad_norm": 5.275494720043015,
      "learning_rate": 6.593406593406594e-07,
      "loss": 1.4723,
      "step": 20
    },
    {
      "epoch": 0.13215859030837004,
      "grad_norm": 1.8226315526905816,
      "learning_rate": 9.890109890109891e-07,
      "loss": 1.3932,
      "step": 30
    },
    {
      "epoch": 0.1762114537444934,
      "grad_norm": 2.044759232085493,
      "learning_rate": 1.3186813186813187e-06,
      "loss": 1.3093,
      "step": 40
    },
    {
      "epoch": 0.22026431718061673,
      "grad_norm": 1.7807644006153884,
      "learning_rate": 1.6483516483516486e-06,
      "loss": 1.2687,
      "step": 50
    },
    {
      "epoch": 0.2643171806167401,
      "grad_norm": 1.5054649369651574,
      "learning_rate": 1.9780219780219782e-06,
      "loss": 1.2427,
      "step": 60
    },
    {
      "epoch": 0.30837004405286345,
      "grad_norm": 1.645138466222819,
      "learning_rate": 2.307692307692308e-06,
      "loss": 1.2286,
      "step": 70
    },
    {
      "epoch": 0.3524229074889868,
      "grad_norm": 1.4999909747158735,
      "learning_rate": 2.6373626373626375e-06,
      "loss": 1.1827,
      "step": 80
    },
    {
      "epoch": 0.3964757709251101,
      "grad_norm": 1.7582872706972468,
      "learning_rate": 2.9670329670329673e-06,
      "loss": 1.1941,
      "step": 90
    },
    {
      "epoch": 0.44052863436123346,
      "grad_norm": 1.693711129252621,
      "learning_rate": 2.999101829950985e-06,
      "loss": 1.1731,
      "step": 100
    },
    {
      "epoch": 0.4845814977973568,
      "grad_norm": 1.4467996095205224,
      "learning_rate": 2.9959984254953274e-06,
      "loss": 1.1148,
      "step": 110
    },
    {
      "epoch": 0.5286343612334802,
      "grad_norm": 1.5298410629642052,
      "learning_rate": 2.9906832852013294e-06,
      "loss": 1.191,
      "step": 120
    },
    {
      "epoch": 0.5726872246696035,
      "grad_norm": 1.4536712154523523,
      "learning_rate": 2.9831642671563205e-06,
      "loss": 1.1818,
      "step": 130
    },
    {
      "epoch": 0.6167400881057269,
      "grad_norm": 1.5305738885906355,
      "learning_rate": 2.9734524877367604e-06,
      "loss": 1.1969,
      "step": 140
    },
    {
      "epoch": 0.6607929515418502,
      "grad_norm": 1.4595832724906421,
      "learning_rate": 2.961562305173399e-06,
      "loss": 1.1203,
      "step": 150
    },
    {
      "epoch": 0.7048458149779736,
      "grad_norm": 1.7304480079500448,
      "learning_rate": 2.9475112983235753e-06,
      "loss": 1.2301,
      "step": 160
    },
    {
      "epoch": 0.748898678414097,
      "grad_norm": 1.5486760284827386,
      "learning_rate": 2.9313202406820232e-06,
      "loss": 1.183,
      "step": 170
    },
    {
      "epoch": 0.7929515418502202,
      "grad_norm": 1.700519163465852,
      "learning_rate": 2.9130130696686196e-06,
      "loss": 1.1304,
      "step": 180
    },
    {
      "epoch": 0.8370044052863436,
      "grad_norm": 1.4763145358774594,
      "learning_rate": 2.8926168512384744e-06,
      "loss": 1.1618,
      "step": 190
    },
    {
      "epoch": 0.8810572687224669,
      "grad_norm": 1.5267413192202046,
      "learning_rate": 2.870161739866686e-06,
      "loss": 1.1491,
      "step": 200
    },
    {
      "epoch": 0.9251101321585903,
      "grad_norm": 1.733908977152635,
      "learning_rate": 2.845680933966922e-06,
      "loss": 1.145,
      "step": 210
    },
    {
      "epoch": 0.9691629955947136,
      "grad_norm": 1.498098134685152,
      "learning_rate": 2.8192106268097337e-06,
      "loss": 1.2206,
      "step": 220
    },
    {
      "epoch": 1.013215859030837,
      "grad_norm": 4.7125491501567724,
      "learning_rate": 2.7907899530131763e-06,
      "loss": 1.133,
      "step": 230
    },
    {
      "epoch": 1.0572687224669604,
      "grad_norm": 2.038277716111568,
      "learning_rate": 2.7604609306848312e-06,
      "loss": 1.0399,
      "step": 240
    },
    {
      "epoch": 1.1013215859030836,
      "grad_norm": 1.8451563548941678,
      "learning_rate": 2.7282683993007865e-06,
      "loss": 1.007,
      "step": 250
    },
    {
      "epoch": 1.145374449339207,
      "grad_norm": 1.5438408367179703,
      "learning_rate": 2.6942599534133987e-06,
      "loss": 1.0139,
      "step": 260
    },
    {
      "epoch": 1.1894273127753303,
      "grad_norm": 1.6126849019096945,
      "learning_rate": 2.658485872285863e-06,
      "loss": 1.0493,
      "step": 270
    },
    {
      "epoch": 1.2334801762114538,
      "grad_norm": 1.9217991771681158,
      "learning_rate": 2.6209990455576105e-06,
      "loss": 0.9478,
      "step": 280
    },
    {
      "epoch": 1.277533039647577,
      "grad_norm": 1.981595215749417,
      "learning_rate": 2.5818548950504342e-06,
      "loss": 1.0108,
      "step": 290
    },
    {
      "epoch": 1.3215859030837005,
      "grad_norm": 1.6455591880816465,
      "learning_rate": 2.541111292830951e-06,
      "loss": 1.0294,
      "step": 300
    },
    {
      "epoch": 1.3656387665198237,
      "grad_norm": 1.5108115682140577,
      "learning_rate": 2.4988284756505333e-06,
      "loss": 0.9748,
      "step": 310
    },
    {
      "epoch": 1.4096916299559472,
      "grad_norm": 1.5787304886557472,
      "learning_rate": 2.455068955889216e-06,
      "loss": 1.0054,
      "step": 320
    },
    {
      "epoch": 1.4537444933920705,
      "grad_norm": 1.6633458413004865,
      "learning_rate": 2.4098974291352257e-06,
      "loss": 1.0155,
      "step": 330
    },
    {
      "epoch": 1.497797356828194,
      "grad_norm": 1.7167391655172333,
      "learning_rate": 2.3633806785367873e-06,
      "loss": 1.0677,
      "step": 340
    },
    {
      "epoch": 1.5418502202643172,
      "grad_norm": 1.519612290214899,
      "learning_rate": 2.315587476067607e-06,
      "loss": 0.987,
      "step": 350
    },
    {
      "epoch": 1.5859030837004404,
      "grad_norm": 1.6948233763887988,
      "learning_rate": 2.2665884808520048e-06,
      "loss": 1.0526,
      "step": 360
    },
    {
      "epoch": 1.6299559471365639,
      "grad_norm": 1.6931931897778194,
      "learning_rate": 2.2164561347000213e-06,
      "loss": 1.0304,
      "step": 370
    },
    {
      "epoch": 1.6740088105726874,
      "grad_norm": 1.5818868292083217,
      "learning_rate": 2.1652645550069394e-06,
      "loss": 0.9921,
      "step": 380
    },
    {
      "epoch": 1.7180616740088106,
      "grad_norm": 1.670222373489549,
      "learning_rate": 2.113089425175561e-06,
      "loss": 1.0062,
      "step": 390
    },
    {
      "epoch": 1.7621145374449338,
      "grad_norm": 1.5676805638818068,
      "learning_rate": 2.0600078827232473e-06,
      "loss": 0.993,
      "step": 400
    },
    {
      "epoch": 1.8061674008810573,
      "grad_norm": 1.6518866030477966,
      "learning_rate": 2.006098405239142e-06,
      "loss": 0.989,
      "step": 410
    },
    {
      "epoch": 1.8502202643171806,
      "grad_norm": 1.7025140153769915,
      "learning_rate": 1.95144069436019e-06,
      "loss": 1.0184,
      "step": 420
    },
    {
      "epoch": 1.894273127753304,
      "grad_norm": 1.6535956905677116,
      "learning_rate": 1.896115557937476e-06,
      "loss": 1.0655,
      "step": 430
    },
    {
      "epoch": 1.9383259911894273,
      "grad_norm": 1.6062899976218377,
      "learning_rate": 1.8402047905671065e-06,
      "loss": 1.0139,
      "step": 440
    },
    {
      "epoch": 1.9823788546255505,
      "grad_norm": 1.6037621791723693,
      "learning_rate": 1.7837910526622437e-06,
      "loss": 1.0103,
      "step": 450
    },
    {
      "epoch": 2.026431718061674,
      "grad_norm": 2.347205562826239,
      "learning_rate": 1.726957748245093e-06,
      "loss": 0.8977,
      "step": 460
    },
    {
      "epoch": 2.0704845814977975,
      "grad_norm": 1.9061910666122825,
      "learning_rate": 1.6697889016395088e-06,
      "loss": 0.8438,
      "step": 470
    },
    {
      "epoch": 2.1145374449339207,
      "grad_norm": 1.6451786150601784,
      "learning_rate": 1.6123690332465296e-06,
      "loss": 0.9008,
      "step": 480
    },
    {
      "epoch": 2.158590308370044,
      "grad_norm": 1.8874201469690022,
      "learning_rate": 1.5547830345864886e-06,
      "loss": 0.8176,
      "step": 490
    },
    {
      "epoch": 2.202643171806167,
      "grad_norm": 1.9254603832738357,
      "learning_rate": 1.4971160427924554e-06,
      "loss": 0.8901,
      "step": 500
    },
    {
      "epoch": 2.246696035242291,
      "grad_norm": 1.835023323363367,
      "learning_rate": 1.439453314740552e-06,
      "loss": 0.8247,
      "step": 510
    },
    {
      "epoch": 2.290748898678414,
      "grad_norm": 1.93123359826113,
      "learning_rate": 1.381880101003235e-06,
      "loss": 0.8294,
      "step": 520
    },
    {
      "epoch": 2.3348017621145374,
      "grad_norm": 1.866122498094617,
      "learning_rate": 1.3244815198119024e-06,
      "loss": 0.9032,
      "step": 530
    },
    {
      "epoch": 2.3788546255506606,
      "grad_norm": 1.7177415721533669,
      "learning_rate": 1.2673424312151519e-06,
      "loss": 0.8862,
      "step": 540
    },
    {
      "epoch": 2.4229074889867843,
      "grad_norm": 2.1997603304322237,
      "learning_rate": 1.2105473116187517e-06,
      "loss": 0.894,
      "step": 550
    },
    {
      "epoch": 2.4669603524229076,
      "grad_norm": 1.8715395183959846,
      "learning_rate": 1.154180128892796e-06,
      "loss": 0.8126,
      "step": 560
    },
    {
      "epoch": 2.511013215859031,
      "grad_norm": 1.86690865903126,
      "learning_rate": 1.0983242182307033e-06,
      "loss": 0.8059,
      "step": 570
    },
    {
      "epoch": 2.555066079295154,
      "grad_norm": 2.0917187570122517,
      "learning_rate": 1.0430621589435803e-06,
      "loss": 0.8959,
      "step": 580
    },
    {
      "epoch": 2.5991189427312777,
      "grad_norm": 2.558938317516458,
      "learning_rate": 9.884756523721117e-07,
      "loss": 0.8393,
      "step": 590
    },
    {
      "epoch": 2.643171806167401,
      "grad_norm": 1.5987062295416765,
      "learning_rate": 9.346454010964724e-07,
      "loss": 0.862,
      "step": 600
    },
    {
      "epoch": 2.6872246696035242,
      "grad_norm": 1.6649451786987866,
      "learning_rate": 8.816509896228376e-07,
      "loss": 0.859,
      "step": 610
    },
    {
      "epoch": 2.7312775330396475,
      "grad_norm": 1.6834312437789734,
      "learning_rate": 8.295707667228987e-07,
      "loss": 0.8348,
      "step": 620
    },
    {
      "epoch": 2.7753303964757707,
      "grad_norm": 1.7102647759571665,
      "learning_rate": 7.784817296003237e-07,
      "loss": 0.8668,
      "step": 630
    },
    {
      "epoch": 2.8193832599118944,
      "grad_norm": 1.7566168287161772,
      "learning_rate": 7.284594100554311e-07,
      "loss": 0.8627,
      "step": 640
    },
    {
      "epoch": 2.8634361233480177,
      "grad_norm": 1.7731380983410612,
      "learning_rate": 6.7957776281636e-07,
      "loss": 0.855,
      "step": 650
    },
    {
      "epoch": 2.907488986784141,
      "grad_norm": 3.413162667971741,
      "learning_rate": 6.31909056201842e-07,
      "loss": 0.8849,
      "step": 660
    },
    {
      "epoch": 2.951541850220264,
      "grad_norm": 1.793806206724264,
      "learning_rate": 5.855237652772183e-07,
      "loss": 0.8629,
      "step": 670
    },
    {
      "epoch": 2.995594713656388,
      "grad_norm": 1.7832177184392761,
      "learning_rate": 5.404904676616635e-07,
      "loss": 0.8796,
      "step": 680
    },
    {
      "epoch": 3.039647577092511,
      "grad_norm": 13.295009142965908,
      "learning_rate": 4.968757421406609e-07,
      "loss": 0.779,
      "step": 690
    },
    {
      "epoch": 3.0837004405286343,
      "grad_norm": 1.8559346739765339,
      "learning_rate": 4.547440702336238e-07,
      "loss": 0.7582,
      "step": 700
    },
    {
      "epoch": 3.1277533039647576,
      "grad_norm": 1.7949928357857732,
      "learning_rate": 4.1415774086218277e-07,
      "loss": 0.7063,
      "step": 710
    },
    {
      "epoch": 3.171806167400881,
      "grad_norm": 1.778323655929332,
      "learning_rate": 3.7517675826009083e-07,
      "loss": 0.752,
      "step": 720
    },
    {
      "epoch": 3.2158590308370045,
      "grad_norm": 6.004368017727455,
      "learning_rate": 3.378587532608872e-07,
      "loss": 0.7829,
      "step": 730
    },
    {
      "epoch": 3.2599118942731278,
      "grad_norm": 1.9250826926237672,
      "learning_rate": 3.0225889809447925e-07,
      "loss": 0.72,
      "step": 740
    },
    {
      "epoch": 3.303964757709251,
      "grad_norm": 1.8286462728037371,
      "learning_rate": 2.684298248186077e-07,
      "loss": 0.7852,
      "step": 750
    },
    {
      "epoch": 3.3480176211453743,
      "grad_norm": 1.8502492223085096,
      "learning_rate": 2.3642154750579275e-07,
      "loss": 0.7912,
      "step": 760
    },
    {
      "epoch": 3.392070484581498,
      "grad_norm": 1.8690732598717457,
      "learning_rate": 2.0628138830079696e-07,
      "loss": 0.7456,
      "step": 770
    },
    {
      "epoch": 3.436123348017621,
      "grad_norm": 2.841225832952903,
      "learning_rate": 1.7805390745792993e-07,
      "loss": 0.7654,
      "step": 780
    },
    {
      "epoch": 3.4801762114537445,
      "grad_norm": 2.2582022273376845,
      "learning_rate": 1.5178083746162667e-07,
      "loss": 0.7437,
      "step": 790
    },
    {
      "epoch": 3.5242290748898677,
      "grad_norm": 1.874522268468866,
      "learning_rate": 1.2750102132769738e-07,
      "loss": 0.7834,
      "step": 800
    },
    {
      "epoch": 3.568281938325991,
      "grad_norm": 2.037336725579371,
      "learning_rate": 1.0525035517647014e-07,
      "loss": 0.7943,
      "step": 810
    },
    {
      "epoch": 3.6123348017621146,
      "grad_norm": 1.7256797980612035,
      "learning_rate": 8.50617351627232e-08,
      "loss": 0.7826,
      "step": 820
    },
    {
      "epoch": 3.656387665198238,
      "grad_norm": 1.8660504133848668,
      "learning_rate": 6.696500884087259e-08,
      "loss": 0.8379,
      "step": 830
    },
    {
      "epoch": 3.700440528634361,
      "grad_norm": 1.6608110580956394,
      "learning_rate": 5.0986931037314666e-08,
      "loss": 0.7614,
      "step": 840
    },
    {
      "epoch": 3.744493392070485,
      "grad_norm": 2.132591912569078,
      "learning_rate": 3.7151124295163374e-08,
      "loss": 0.8152,
      "step": 850
    },
    {
      "epoch": 3.788546255506608,
      "grad_norm": 2.142924158927503,
      "learning_rate": 2.5478043949868192e-08,
      "loss": 0.7517,
      "step": 860
    },
    {
      "epoch": 3.8325991189427313,
      "grad_norm": 2.1270227065723284,
      "learning_rate": 1.5984947887334623e-08,
      "loss": 0.7697,
      "step": 870
    },
    {
      "epoch": 3.8766519823788546,
      "grad_norm": 1.8195791930606864,
      "learning_rate": 8.685871029272318e-09,
      "loss": 0.7506,
      "step": 880
    },
    {
      "epoch": 3.920704845814978,
      "grad_norm": 2.2849986922595553,
      "learning_rate": 3.5916045834781253e-09,
      "loss": 0.7273,
      "step": 890
    },
    {
      "epoch": 3.964757709251101,
      "grad_norm": 1.8303283767313243,
      "learning_rate": 7.096800897425371e-10,
      "loss": 0.8123,
      "step": 900
    },
    {
      "epoch": 4.0,
      "step": 908,
      "total_flos": 648773332107264.0,
      "train_loss": 0.9660957064397535,
      "train_runtime": 3882.5924,
      "train_samples_per_second": 5.59,
      "train_steps_per_second": 0.234
    }
  ],
  "logging_steps": 10,
  "max_steps": 908,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 648773332107264.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}