{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.990047413608987,
"eval_steps": 1000,
"global_step": 70000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004271496305155696,
"grad_norm": 5.181503772735596,
"learning_rate": 7.118451025056948e-07,
"loss": 5.7231,
"step": 100
},
{
"epoch": 0.008542992610311393,
"grad_norm": 4.810484886169434,
"learning_rate": 1.4236902050113896e-06,
"loss": 5.663,
"step": 200
},
{
"epoch": 0.012814488915467088,
"grad_norm": 6.623730182647705,
"learning_rate": 2.1355353075170844e-06,
"loss": 5.8302,
"step": 300
},
{
"epoch": 0.017085985220622785,
"grad_norm": 4.488267421722412,
"learning_rate": 2.847380410022779e-06,
"loss": 5.5719,
"step": 400
},
{
"epoch": 0.02135748152577848,
"grad_norm": 5.217711448669434,
"learning_rate": 3.559225512528474e-06,
"loss": 5.572,
"step": 500
},
{
"epoch": 0.025628977830934176,
"grad_norm": 6.749180316925049,
"learning_rate": 4.271070615034169e-06,
"loss": 5.6101,
"step": 600
},
{
"epoch": 0.029900474136089872,
"grad_norm": 7.648950099945068,
"learning_rate": 4.9829157175398636e-06,
"loss": 5.6377,
"step": 700
},
{
"epoch": 0.03417197044124557,
"grad_norm": 8.535877227783203,
"learning_rate": 5.694760820045558e-06,
"loss": 5.3197,
"step": 800
},
{
"epoch": 0.03844346674640126,
"grad_norm": 10.138134002685547,
"learning_rate": 6.406605922551254e-06,
"loss": 5.2204,
"step": 900
},
{
"epoch": 0.04271496305155696,
"grad_norm": 9.492013931274414,
"learning_rate": 7.118451025056948e-06,
"loss": 5.1271,
"step": 1000
},
{
"epoch": 0.04271496305155696,
"eval_runtime": 404.87,
"eval_samples_per_second": 115.647,
"eval_steps_per_second": 14.456,
"step": 1000
},
{
"epoch": 0.046986459356712654,
"grad_norm": 9.8007230758667,
"learning_rate": 7.830296127562643e-06,
"loss": 4.8906,
"step": 1100
},
{
"epoch": 0.05125795566186835,
"grad_norm": 15.544700622558594,
"learning_rate": 8.542141230068338e-06,
"loss": 4.7123,
"step": 1200
},
{
"epoch": 0.05552945196702405,
"grad_norm": 11.552872657775879,
"learning_rate": 9.253986332574032e-06,
"loss": 4.542,
"step": 1300
},
{
"epoch": 0.059800948272179744,
"grad_norm": 8.390003204345703,
"learning_rate": 9.965831435079727e-06,
"loss": 4.1772,
"step": 1400
},
{
"epoch": 0.06407244457733544,
"grad_norm": 6.352422714233398,
"learning_rate": 1.0677676537585422e-05,
"loss": 4.1877,
"step": 1500
},
{
"epoch": 0.06834394088249114,
"grad_norm": 11.394415855407715,
"learning_rate": 1.1389521640091117e-05,
"loss": 4.1625,
"step": 1600
},
{
"epoch": 0.07261543718764683,
"grad_norm": 7.095893859863281,
"learning_rate": 1.2101366742596812e-05,
"loss": 4.0512,
"step": 1700
},
{
"epoch": 0.07688693349280253,
"grad_norm": 11.011418342590332,
"learning_rate": 1.2813211845102508e-05,
"loss": 4.0444,
"step": 1800
},
{
"epoch": 0.08115842979795823,
"grad_norm": 7.294766902923584,
"learning_rate": 1.35250569476082e-05,
"loss": 4.1648,
"step": 1900
},
{
"epoch": 0.08542992610311392,
"grad_norm": 9.359979629516602,
"learning_rate": 1.4236902050113896e-05,
"loss": 4.1958,
"step": 2000
},
{
"epoch": 0.08542992610311392,
"eval_runtime": 404.9689,
"eval_samples_per_second": 115.619,
"eval_steps_per_second": 14.453,
"step": 2000
},
{
"epoch": 0.08970142240826962,
"grad_norm": 9.699823379516602,
"learning_rate": 1.494874715261959e-05,
"loss": 4.0555,
"step": 2100
},
{
"epoch": 0.09397291871342531,
"grad_norm": 9.576581954956055,
"learning_rate": 1.5660592255125285e-05,
"loss": 4.1073,
"step": 2200
},
{
"epoch": 0.09824441501858101,
"grad_norm": 9.165473937988281,
"learning_rate": 1.637243735763098e-05,
"loss": 4.0373,
"step": 2300
},
{
"epoch": 0.1025159113237367,
"grad_norm": 9.258238792419434,
"learning_rate": 1.7084282460136675e-05,
"loss": 3.8695,
"step": 2400
},
{
"epoch": 0.1067874076288924,
"grad_norm": 10.60352897644043,
"learning_rate": 1.779612756264237e-05,
"loss": 3.8457,
"step": 2500
},
{
"epoch": 0.1110589039340481,
"grad_norm": 8.64367961883545,
"learning_rate": 1.8507972665148065e-05,
"loss": 3.8887,
"step": 2600
},
{
"epoch": 0.1153304002392038,
"grad_norm": 11.192293167114258,
"learning_rate": 1.9219817767653758e-05,
"loss": 4.0215,
"step": 2700
},
{
"epoch": 0.11960189654435949,
"grad_norm": 11.153294563293457,
"learning_rate": 1.9931662870159454e-05,
"loss": 3.9655,
"step": 2800
},
{
"epoch": 0.12387339284951518,
"grad_norm": 9.78614616394043,
"learning_rate": 2.064350797266515e-05,
"loss": 4.0018,
"step": 2900
},
{
"epoch": 0.12814488915467087,
"grad_norm": 11.2694730758667,
"learning_rate": 2.1355353075170844e-05,
"loss": 3.9469,
"step": 3000
},
{
"epoch": 0.12814488915467087,
"eval_runtime": 404.9852,
"eval_samples_per_second": 115.614,
"eval_steps_per_second": 14.452,
"step": 3000
},
{
"epoch": 0.1324163854598266,
"grad_norm": 9.227404594421387,
"learning_rate": 2.2067198177676537e-05,
"loss": 3.9662,
"step": 3100
},
{
"epoch": 0.13668788176498228,
"grad_norm": 13.020267486572266,
"learning_rate": 2.2779043280182233e-05,
"loss": 3.9011,
"step": 3200
},
{
"epoch": 0.14095937807013798,
"grad_norm": 8.806400299072266,
"learning_rate": 2.349088838268793e-05,
"loss": 3.9507,
"step": 3300
},
{
"epoch": 0.14523087437529367,
"grad_norm": 7.716139793395996,
"learning_rate": 2.4202733485193623e-05,
"loss": 3.9536,
"step": 3400
},
{
"epoch": 0.14950237068044936,
"grad_norm": 12.768115997314453,
"learning_rate": 2.4914578587699316e-05,
"loss": 3.9268,
"step": 3500
},
{
"epoch": 0.15377386698560505,
"grad_norm": 10.236725807189941,
"learning_rate": 2.5626423690205016e-05,
"loss": 3.8231,
"step": 3600
},
{
"epoch": 0.15804536329076074,
"grad_norm": 13.891462326049805,
"learning_rate": 2.633826879271071e-05,
"loss": 3.8843,
"step": 3700
},
{
"epoch": 0.16231685959591646,
"grad_norm": 12.619128227233887,
"learning_rate": 2.70501138952164e-05,
"loss": 3.8942,
"step": 3800
},
{
"epoch": 0.16658835590107215,
"grad_norm": 8.733612060546875,
"learning_rate": 2.77619589977221e-05,
"loss": 3.7588,
"step": 3900
},
{
"epoch": 0.17085985220622785,
"grad_norm": 10.724875450134277,
"learning_rate": 2.8473804100227792e-05,
"loss": 3.8525,
"step": 4000
},
{
"epoch": 0.17085985220622785,
"eval_runtime": 404.7816,
"eval_samples_per_second": 115.672,
"eval_steps_per_second": 14.46,
"step": 4000
},
{
"epoch": 0.17513134851138354,
"grad_norm": 9.703784942626953,
"learning_rate": 2.9185649202733488e-05,
"loss": 3.7397,
"step": 4100
},
{
"epoch": 0.17940284481653923,
"grad_norm": 10.537924766540527,
"learning_rate": 2.989749430523918e-05,
"loss": 3.8457,
"step": 4200
},
{
"epoch": 0.18367434112169492,
"grad_norm": 13.7029447555542,
"learning_rate": 3.0609339407744874e-05,
"loss": 3.8889,
"step": 4300
},
{
"epoch": 0.18794583742685061,
"grad_norm": 11.692300796508789,
"learning_rate": 3.132118451025057e-05,
"loss": 3.9057,
"step": 4400
},
{
"epoch": 0.19221733373200633,
"grad_norm": 11.873428344726562,
"learning_rate": 3.203302961275627e-05,
"loss": 3.8959,
"step": 4500
},
{
"epoch": 0.19648883003716203,
"grad_norm": 10.291272163391113,
"learning_rate": 3.274487471526196e-05,
"loss": 3.8212,
"step": 4600
},
{
"epoch": 0.20076032634231772,
"grad_norm": 10.874945640563965,
"learning_rate": 3.3456719817767654e-05,
"loss": 3.8884,
"step": 4700
},
{
"epoch": 0.2050318226474734,
"grad_norm": 15.713820457458496,
"learning_rate": 3.416856492027335e-05,
"loss": 3.9066,
"step": 4800
},
{
"epoch": 0.2093033189526291,
"grad_norm": 11.526785850524902,
"learning_rate": 3.488041002277905e-05,
"loss": 3.7686,
"step": 4900
},
{
"epoch": 0.2135748152577848,
"grad_norm": 10.42326545715332,
"learning_rate": 3.559225512528474e-05,
"loss": 3.784,
"step": 5000
},
{
"epoch": 0.2135748152577848,
"eval_runtime": 404.8034,
"eval_samples_per_second": 115.666,
"eval_steps_per_second": 14.459,
"step": 5000
},
{
"epoch": 0.2178463115629405,
"grad_norm": 14.717082023620605,
"learning_rate": 3.630410022779043e-05,
"loss": 3.8462,
"step": 5100
},
{
"epoch": 0.2221178078680962,
"grad_norm": 11.965718269348145,
"learning_rate": 3.701594533029613e-05,
"loss": 3.9435,
"step": 5200
},
{
"epoch": 0.2263893041732519,
"grad_norm": 10.752185821533203,
"learning_rate": 3.7727790432801826e-05,
"loss": 3.8663,
"step": 5300
},
{
"epoch": 0.2306608004784076,
"grad_norm": 12.059910774230957,
"learning_rate": 3.8439635535307516e-05,
"loss": 3.7925,
"step": 5400
},
{
"epoch": 0.23493229678356328,
"grad_norm": 9.081160545349121,
"learning_rate": 3.915148063781321e-05,
"loss": 3.8639,
"step": 5500
},
{
"epoch": 0.23920379308871897,
"grad_norm": 10.45064926147461,
"learning_rate": 3.986332574031891e-05,
"loss": 3.8497,
"step": 5600
},
{
"epoch": 0.24347528939387467,
"grad_norm": 15.188603401184082,
"learning_rate": 4.0575170842824605e-05,
"loss": 3.7753,
"step": 5700
},
{
"epoch": 0.24774678569903036,
"grad_norm": 9.032523155212402,
"learning_rate": 4.12870159453303e-05,
"loss": 3.8309,
"step": 5800
},
{
"epoch": 0.2520182820041861,
"grad_norm": 9.886519432067871,
"learning_rate": 4.199886104783599e-05,
"loss": 3.8508,
"step": 5900
},
{
"epoch": 0.25628977830934174,
"grad_norm": 11.432881355285645,
"learning_rate": 4.271070615034169e-05,
"loss": 3.8327,
"step": 6000
},
{
"epoch": 0.25628977830934174,
"eval_runtime": 404.7732,
"eval_samples_per_second": 115.675,
"eval_steps_per_second": 14.46,
"step": 6000
},
{
"epoch": 0.26056127461449746,
"grad_norm": 10.134676933288574,
"learning_rate": 4.3422551252847384e-05,
"loss": 3.8419,
"step": 6100
},
{
"epoch": 0.2648327709196532,
"grad_norm": 12.583077430725098,
"learning_rate": 4.4134396355353074e-05,
"loss": 3.884,
"step": 6200
},
{
"epoch": 0.26910426722480885,
"grad_norm": 9.845976829528809,
"learning_rate": 4.484624145785877e-05,
"loss": 3.7787,
"step": 6300
},
{
"epoch": 0.27337576352996457,
"grad_norm": 21.58133888244629,
"learning_rate": 4.555808656036447e-05,
"loss": 3.8962,
"step": 6400
},
{
"epoch": 0.27764725983512023,
"grad_norm": 12.139480590820312,
"learning_rate": 4.626993166287016e-05,
"loss": 3.722,
"step": 6500
},
{
"epoch": 0.28191875614027595,
"grad_norm": 8.343817710876465,
"learning_rate": 4.698177676537586e-05,
"loss": 3.8009,
"step": 6600
},
{
"epoch": 0.2861902524454316,
"grad_norm": 17.52387809753418,
"learning_rate": 4.769362186788155e-05,
"loss": 3.8687,
"step": 6700
},
{
"epoch": 0.29046174875058733,
"grad_norm": 7.540428638458252,
"learning_rate": 4.8405466970387246e-05,
"loss": 3.727,
"step": 6800
},
{
"epoch": 0.29473324505574305,
"grad_norm": 11.794758796691895,
"learning_rate": 4.911731207289294e-05,
"loss": 3.7971,
"step": 6900
},
{
"epoch": 0.2990047413608987,
"grad_norm": 10.799798011779785,
"learning_rate": 4.982915717539863e-05,
"loss": 3.8319,
"step": 7000
},
{
"epoch": 0.2990047413608987,
"eval_runtime": 404.96,
"eval_samples_per_second": 115.621,
"eval_steps_per_second": 14.453,
"step": 7000
},
{
"epoch": 0.30327623766605444,
"grad_norm": 10.189812660217285,
"learning_rate": 4.993988197883213e-05,
"loss": 3.8696,
"step": 7100
},
{
"epoch": 0.3075477339712101,
"grad_norm": 9.960589408874512,
"learning_rate": 4.986077931940072e-05,
"loss": 3.7356,
"step": 7200
},
{
"epoch": 0.3118192302763658,
"grad_norm": 19.682788848876953,
"learning_rate": 4.978167665996931e-05,
"loss": 3.8524,
"step": 7300
},
{
"epoch": 0.3160907265815215,
"grad_norm": 10.302993774414062,
"learning_rate": 4.9702574000537896e-05,
"loss": 3.6824,
"step": 7400
},
{
"epoch": 0.3203622228866772,
"grad_norm": 11.691702842712402,
"learning_rate": 4.9623471341106494e-05,
"loss": 3.8522,
"step": 7500
},
{
"epoch": 0.3246337191918329,
"grad_norm": 8.01658821105957,
"learning_rate": 4.954436868167508e-05,
"loss": 3.8386,
"step": 7600
},
{
"epoch": 0.3289052154969886,
"grad_norm": 8.131669998168945,
"learning_rate": 4.946526602224367e-05,
"loss": 3.7587,
"step": 7700
},
{
"epoch": 0.3331767118021443,
"grad_norm": 63.0767822265625,
"learning_rate": 4.938616336281226e-05,
"loss": 3.9019,
"step": 7800
},
{
"epoch": 0.3374482081073,
"grad_norm": 10.04031753540039,
"learning_rate": 4.930706070338085e-05,
"loss": 3.8661,
"step": 7900
},
{
"epoch": 0.3417197044124557,
"grad_norm": 8.829032897949219,
"learning_rate": 4.922795804394944e-05,
"loss": 3.8105,
"step": 8000
},
{
"epoch": 0.3417197044124557,
"eval_runtime": 405.1664,
"eval_samples_per_second": 115.562,
"eval_steps_per_second": 14.446,
"step": 8000
},
{
"epoch": 0.34599120071761136,
"grad_norm": 15.314312934875488,
"learning_rate": 4.914885538451803e-05,
"loss": 3.6279,
"step": 8100
},
{
"epoch": 0.3502626970227671,
"grad_norm": 8.690494537353516,
"learning_rate": 4.906975272508662e-05,
"loss": 3.8336,
"step": 8200
},
{
"epoch": 0.3545341933279228,
"grad_norm": 10.526517868041992,
"learning_rate": 4.8990650065655206e-05,
"loss": 3.8046,
"step": 8300
},
{
"epoch": 0.35880568963307846,
"grad_norm": 7.85405969619751,
"learning_rate": 4.8911547406223804e-05,
"loss": 3.7721,
"step": 8400
},
{
"epoch": 0.3630771859382342,
"grad_norm": 11.473519325256348,
"learning_rate": 4.883244474679239e-05,
"loss": 3.7854,
"step": 8500
},
{
"epoch": 0.36734868224338985,
"grad_norm": 9.746980667114258,
"learning_rate": 4.875334208736098e-05,
"loss": 3.7573,
"step": 8600
},
{
"epoch": 0.37162017854854557,
"grad_norm": 9.924898147583008,
"learning_rate": 4.867423942792957e-05,
"loss": 3.8054,
"step": 8700
},
{
"epoch": 0.37589167485370123,
"grad_norm": 8.137608528137207,
"learning_rate": 4.859513676849816e-05,
"loss": 3.8245,
"step": 8800
},
{
"epoch": 0.38016317115885695,
"grad_norm": 8.987218856811523,
"learning_rate": 4.851603410906675e-05,
"loss": 3.742,
"step": 8900
},
{
"epoch": 0.38443466746401267,
"grad_norm": 9.04791259765625,
"learning_rate": 4.843693144963534e-05,
"loss": 3.7748,
"step": 9000
},
{
"epoch": 0.38443466746401267,
"eval_runtime": 404.3649,
"eval_samples_per_second": 115.791,
"eval_steps_per_second": 14.475,
"step": 9000
},
{
"epoch": 0.38870616376916833,
"grad_norm": 10.426161766052246,
"learning_rate": 4.835782879020393e-05,
"loss": 3.823,
"step": 9100
},
{
"epoch": 0.39297766007432405,
"grad_norm": 8.951033592224121,
"learning_rate": 4.8278726130772516e-05,
"loss": 3.5732,
"step": 9200
},
{
"epoch": 0.3972491563794797,
"grad_norm": 10.766894340515137,
"learning_rate": 4.819962347134111e-05,
"loss": 3.7286,
"step": 9300
},
{
"epoch": 0.40152065268463544,
"grad_norm": 9.429593086242676,
"learning_rate": 4.81205208119097e-05,
"loss": 3.7493,
"step": 9400
},
{
"epoch": 0.4057921489897911,
"grad_norm": 14.518060684204102,
"learning_rate": 4.804141815247829e-05,
"loss": 3.81,
"step": 9500
},
{
"epoch": 0.4100636452949468,
"grad_norm": 20.795759201049805,
"learning_rate": 4.7962315493046875e-05,
"loss": 3.7045,
"step": 9600
},
{
"epoch": 0.41433514160010254,
"grad_norm": 10.095163345336914,
"learning_rate": 4.788321283361547e-05,
"loss": 3.7767,
"step": 9700
},
{
"epoch": 0.4186066379052582,
"grad_norm": 14.195402145385742,
"learning_rate": 4.780411017418406e-05,
"loss": 3.6874,
"step": 9800
},
{
"epoch": 0.4228781342104139,
"grad_norm": 8.357501029968262,
"learning_rate": 4.772500751475265e-05,
"loss": 3.6675,
"step": 9900
},
{
"epoch": 0.4271496305155696,
"grad_norm": 8.715255737304688,
"learning_rate": 4.764590485532124e-05,
"loss": 3.7657,
"step": 10000
},
{
"epoch": 0.4271496305155696,
"eval_runtime": 405.002,
"eval_samples_per_second": 115.609,
"eval_steps_per_second": 14.452,
"step": 10000
},
{
"epoch": 0.4314211268207253,
"grad_norm": 11.821653366088867,
"learning_rate": 4.7566802195889826e-05,
"loss": 3.6386,
"step": 10100
},
{
"epoch": 0.435692623125881,
"grad_norm": 10.1320161819458,
"learning_rate": 4.748769953645842e-05,
"loss": 3.808,
"step": 10200
},
{
"epoch": 0.4399641194310367,
"grad_norm": 9.089192390441895,
"learning_rate": 4.740859687702701e-05,
"loss": 3.7376,
"step": 10300
},
{
"epoch": 0.4442356157361924,
"grad_norm": 9.963017463684082,
"learning_rate": 4.73294942175956e-05,
"loss": 3.8257,
"step": 10400
},
{
"epoch": 0.4485071120413481,
"grad_norm": 9.806687355041504,
"learning_rate": 4.7250391558164185e-05,
"loss": 3.7903,
"step": 10500
},
{
"epoch": 0.4527786083465038,
"grad_norm": 10.770492553710938,
"learning_rate": 4.7171288898732776e-05,
"loss": 3.7205,
"step": 10600
},
{
"epoch": 0.45705010465165946,
"grad_norm": 11.635176658630371,
"learning_rate": 4.709218623930137e-05,
"loss": 3.6938,
"step": 10700
},
{
"epoch": 0.4613216009568152,
"grad_norm": 8.8875732421875,
"learning_rate": 4.701308357986996e-05,
"loss": 3.7123,
"step": 10800
},
{
"epoch": 0.46559309726197085,
"grad_norm": 7.193441867828369,
"learning_rate": 4.6933980920438544e-05,
"loss": 3.6563,
"step": 10900
},
{
"epoch": 0.46986459356712656,
"grad_norm": 9.051284790039062,
"learning_rate": 4.6854878261007135e-05,
"loss": 3.7284,
"step": 11000
},
{
"epoch": 0.46986459356712656,
"eval_runtime": 405.9538,
"eval_samples_per_second": 115.338,
"eval_steps_per_second": 14.418,
"step": 11000
},
{
"epoch": 0.4741360898722823,
"grad_norm": 12.352762222290039,
"learning_rate": 4.677577560157573e-05,
"loss": 3.789,
"step": 11100
},
{
"epoch": 0.47840758617743795,
"grad_norm": 9.574907302856445,
"learning_rate": 4.669667294214432e-05,
"loss": 3.7442,
"step": 11200
},
{
"epoch": 0.48267908248259367,
"grad_norm": 9.639921188354492,
"learning_rate": 4.661757028271291e-05,
"loss": 3.7511,
"step": 11300
},
{
"epoch": 0.48695057878774933,
"grad_norm": 10.295048713684082,
"learning_rate": 4.6538467623281494e-05,
"loss": 3.7032,
"step": 11400
},
{
"epoch": 0.49122207509290505,
"grad_norm": 8.52436351776123,
"learning_rate": 4.6459364963850086e-05,
"loss": 3.7266,
"step": 11500
},
{
"epoch": 0.4954935713980607,
"grad_norm": 12.574061393737793,
"learning_rate": 4.638026230441868e-05,
"loss": 3.7196,
"step": 11600
},
{
"epoch": 0.49976506770321644,
"grad_norm": 12.421334266662598,
"learning_rate": 4.630115964498727e-05,
"loss": 3.6734,
"step": 11700
},
{
"epoch": 0.5040365640083722,
"grad_norm": 14.321782112121582,
"learning_rate": 4.6222056985555854e-05,
"loss": 3.8048,
"step": 11800
},
{
"epoch": 0.5083080603135278,
"grad_norm": 12.409293174743652,
"learning_rate": 4.614295432612445e-05,
"loss": 3.7366,
"step": 11900
},
{
"epoch": 0.5125795566186835,
"grad_norm": 9.002853393554688,
"learning_rate": 4.6063851666693037e-05,
"loss": 3.7532,
"step": 12000
},
{
"epoch": 0.5125795566186835,
"eval_runtime": 404.8624,
"eval_samples_per_second": 115.649,
"eval_steps_per_second": 14.457,
"step": 12000
},
{
"epoch": 0.5168510529238393,
"grad_norm": 8.214455604553223,
"learning_rate": 4.598474900726162e-05,
"loss": 3.7221,
"step": 12100
},
{
"epoch": 0.5211225492289949,
"grad_norm": 7.018616199493408,
"learning_rate": 4.590564634783022e-05,
"loss": 3.6718,
"step": 12200
},
{
"epoch": 0.5253940455341506,
"grad_norm": 12.610575675964355,
"learning_rate": 4.5826543688398804e-05,
"loss": 3.7282,
"step": 12300
},
{
"epoch": 0.5296655418393064,
"grad_norm": 10.168871879577637,
"learning_rate": 4.5747441028967396e-05,
"loss": 3.7455,
"step": 12400
},
{
"epoch": 0.533937038144462,
"grad_norm": 9.421287536621094,
"learning_rate": 4.566833836953599e-05,
"loss": 3.8177,
"step": 12500
},
{
"epoch": 0.5382085344496177,
"grad_norm": 11.314359664916992,
"learning_rate": 4.558923571010458e-05,
"loss": 3.7797,
"step": 12600
},
{
"epoch": 0.5424800307547734,
"grad_norm": 10.510274887084961,
"learning_rate": 4.5510133050673163e-05,
"loss": 3.7639,
"step": 12700
},
{
"epoch": 0.5467515270599291,
"grad_norm": 14.740921020507812,
"learning_rate": 4.5431030391241755e-05,
"loss": 3.8299,
"step": 12800
},
{
"epoch": 0.5510230233650848,
"grad_norm": 7.322781562805176,
"learning_rate": 4.5351927731810346e-05,
"loss": 3.8357,
"step": 12900
},
{
"epoch": 0.5552945196702405,
"grad_norm": 10.399321556091309,
"learning_rate": 4.527282507237894e-05,
"loss": 3.6613,
"step": 13000
},
{
"epoch": 0.5552945196702405,
"eval_runtime": 404.2488,
"eval_samples_per_second": 115.825,
"eval_steps_per_second": 14.479,
"step": 13000
},
{
"epoch": 0.5595660159753962,
"grad_norm": 8.805388450622559,
"learning_rate": 4.519372241294752e-05,
"loss": 3.7299,
"step": 13100
},
{
"epoch": 0.5638375122805519,
"grad_norm": 7.358932018280029,
"learning_rate": 4.5114619753516114e-05,
"loss": 3.7956,
"step": 13200
},
{
"epoch": 0.5681090085857076,
"grad_norm": 8.338736534118652,
"learning_rate": 4.5035517094084706e-05,
"loss": 3.8274,
"step": 13300
},
{
"epoch": 0.5723805048908632,
"grad_norm": 9.217434883117676,
"learning_rate": 4.495641443465329e-05,
"loss": 3.8071,
"step": 13400
},
{
"epoch": 0.576652001196019,
"grad_norm": 8.726869583129883,
"learning_rate": 4.487731177522189e-05,
"loss": 3.7296,
"step": 13500
},
{
"epoch": 0.5809234975011747,
"grad_norm": 11.813636779785156,
"learning_rate": 4.479820911579047e-05,
"loss": 3.8608,
"step": 13600
},
{
"epoch": 0.5851949938063303,
"grad_norm": 11.595725059509277,
"learning_rate": 4.4719106456359065e-05,
"loss": 3.7096,
"step": 13700
},
{
"epoch": 0.5894664901114861,
"grad_norm": 9.259355545043945,
"learning_rate": 4.4640003796927656e-05,
"loss": 3.6732,
"step": 13800
},
{
"epoch": 0.5937379864166418,
"grad_norm": 13.34984016418457,
"learning_rate": 4.456090113749625e-05,
"loss": 3.8131,
"step": 13900
},
{
"epoch": 0.5980094827217974,
"grad_norm": 10.516456604003906,
"learning_rate": 4.448179847806483e-05,
"loss": 3.7439,
"step": 14000
},
{
"epoch": 0.5980094827217974,
"eval_runtime": 404.1711,
"eval_samples_per_second": 115.847,
"eval_steps_per_second": 14.481,
"step": 14000
},
{
"epoch": 0.6022809790269531,
"grad_norm": 12.842930793762207,
"learning_rate": 4.4402695818633424e-05,
"loss": 3.7682,
"step": 14100
},
{
"epoch": 0.6065524753321089,
"grad_norm": 19.421875,
"learning_rate": 4.4323593159202015e-05,
"loss": 3.663,
"step": 14200
},
{
"epoch": 0.6108239716372645,
"grad_norm": 7.454352855682373,
"learning_rate": 4.42444904997706e-05,
"loss": 3.7463,
"step": 14300
},
{
"epoch": 0.6150954679424202,
"grad_norm": 13.48552131652832,
"learning_rate": 4.41653878403392e-05,
"loss": 3.649,
"step": 14400
},
{
"epoch": 0.619366964247576,
"grad_norm": 11.968147277832031,
"learning_rate": 4.408628518090778e-05,
"loss": 3.7516,
"step": 14500
},
{
"epoch": 0.6236384605527316,
"grad_norm": 12.150687217712402,
"learning_rate": 4.4007182521476375e-05,
"loss": 3.7322,
"step": 14600
},
{
"epoch": 0.6279099568578873,
"grad_norm": 8.789100646972656,
"learning_rate": 4.3928079862044966e-05,
"loss": 3.6886,
"step": 14700
},
{
"epoch": 0.632181453163043,
"grad_norm": 10.21249008178711,
"learning_rate": 4.384897720261356e-05,
"loss": 3.6862,
"step": 14800
},
{
"epoch": 0.6364529494681987,
"grad_norm": 9.880081176757812,
"learning_rate": 4.376987454318214e-05,
"loss": 3.6766,
"step": 14900
},
{
"epoch": 0.6407244457733544,
"grad_norm": 11.831520080566406,
"learning_rate": 4.3690771883750734e-05,
"loss": 3.645,
"step": 15000
},
{
"epoch": 0.6407244457733544,
"eval_runtime": 404.5696,
"eval_samples_per_second": 115.733,
"eval_steps_per_second": 14.467,
"step": 15000
},
{
"epoch": 0.6449959420785101,
"grad_norm": 8.91051959991455,
"learning_rate": 4.3611669224319325e-05,
"loss": 3.6847,
"step": 15100
},
{
"epoch": 0.6492674383836659,
"grad_norm": 9.260310173034668,
"learning_rate": 4.353256656488791e-05,
"loss": 3.7197,
"step": 15200
},
{
"epoch": 0.6535389346888215,
"grad_norm": 10.138089179992676,
"learning_rate": 4.34534639054565e-05,
"loss": 3.6529,
"step": 15300
},
{
"epoch": 0.6578104309939772,
"grad_norm": 8.813399314880371,
"learning_rate": 4.337436124602509e-05,
"loss": 3.6541,
"step": 15400
},
{
"epoch": 0.6620819272991328,
"grad_norm": 9.144048690795898,
"learning_rate": 4.3295258586593684e-05,
"loss": 3.5101,
"step": 15500
},
{
"epoch": 0.6663534236042886,
"grad_norm": 8.948995590209961,
"learning_rate": 4.321615592716227e-05,
"loss": 3.7848,
"step": 15600
},
{
"epoch": 0.6706249199094443,
"grad_norm": 9.42180061340332,
"learning_rate": 4.313705326773087e-05,
"loss": 3.5926,
"step": 15700
},
{
"epoch": 0.6748964162146,
"grad_norm": 8.974250793457031,
"learning_rate": 4.305795060829945e-05,
"loss": 3.6967,
"step": 15800
},
{
"epoch": 0.6791679125197557,
"grad_norm": 12.110358238220215,
"learning_rate": 4.2978847948868044e-05,
"loss": 3.6521,
"step": 15900
},
{
"epoch": 0.6834394088249114,
"grad_norm": 9.907513618469238,
"learning_rate": 4.2899745289436635e-05,
"loss": 3.703,
"step": 16000
},
{
"epoch": 0.6834394088249114,
"eval_runtime": 404.4023,
"eval_samples_per_second": 115.781,
"eval_steps_per_second": 14.473,
"step": 16000
},
{
"epoch": 0.687710905130067,
"grad_norm": 8.421221733093262,
"learning_rate": 4.2820642630005226e-05,
"loss": 3.5446,
"step": 16100
},
{
"epoch": 0.6919824014352227,
"grad_norm": 8.890350341796875,
"learning_rate": 4.274153997057381e-05,
"loss": 3.7051,
"step": 16200
},
{
"epoch": 0.6962538977403785,
"grad_norm": 7.985939979553223,
"learning_rate": 4.26624373111424e-05,
"loss": 3.7894,
"step": 16300
},
{
"epoch": 0.7005253940455342,
"grad_norm": 9.559375762939453,
"learning_rate": 4.2583334651710994e-05,
"loss": 3.7158,
"step": 16400
},
{
"epoch": 0.7047968903506898,
"grad_norm": 9.920624732971191,
"learning_rate": 4.250423199227958e-05,
"loss": 3.7286,
"step": 16500
},
{
"epoch": 0.7090683866558456,
"grad_norm": 13.251816749572754,
"learning_rate": 4.242512933284817e-05,
"loss": 3.7271,
"step": 16600
},
{
"epoch": 0.7133398829610013,
"grad_norm": 11.248520851135254,
"learning_rate": 4.234602667341676e-05,
"loss": 3.6633,
"step": 16700
},
{
"epoch": 0.7176113792661569,
"grad_norm": 7.556328296661377,
"learning_rate": 4.226692401398535e-05,
"loss": 3.6706,
"step": 16800
},
{
"epoch": 0.7218828755713126,
"grad_norm": 6.298122406005859,
"learning_rate": 4.218782135455394e-05,
"loss": 3.7307,
"step": 16900
},
{
"epoch": 0.7261543718764684,
"grad_norm": 10.17672061920166,
"learning_rate": 4.2108718695122536e-05,
"loss": 3.7274,
"step": 17000
},
{
"epoch": 0.7261543718764684,
"eval_runtime": 403.9594,
"eval_samples_per_second": 115.908,
"eval_steps_per_second": 14.489,
"step": 17000
},
{
"epoch": 0.730425868181624,
"grad_norm": 8.036096572875977,
"learning_rate": 4.202961603569112e-05,
"loss": 3.641,
"step": 17100
},
{
"epoch": 0.7346973644867797,
"grad_norm": 8.982136726379395,
"learning_rate": 4.195051337625971e-05,
"loss": 3.7079,
"step": 17200
},
{
"epoch": 0.7389688607919355,
"grad_norm": 11.923723220825195,
"learning_rate": 4.1871410716828304e-05,
"loss": 3.7252,
"step": 17300
},
{
"epoch": 0.7432403570970911,
"grad_norm": 10.336372375488281,
"learning_rate": 4.179230805739689e-05,
"loss": 3.6793,
"step": 17400
},
{
"epoch": 0.7475118534022468,
"grad_norm": 10.367877960205078,
"learning_rate": 4.171320539796548e-05,
"loss": 3.6833,
"step": 17500
},
{
"epoch": 0.7517833497074025,
"grad_norm": 8.473801612854004,
"learning_rate": 4.163410273853407e-05,
"loss": 3.6678,
"step": 17600
},
{
"epoch": 0.7560548460125582,
"grad_norm": 7.864530563354492,
"learning_rate": 4.155500007910266e-05,
"loss": 3.6848,
"step": 17700
},
{
"epoch": 0.7603263423177139,
"grad_norm": 10.16886043548584,
"learning_rate": 4.147589741967125e-05,
"loss": 3.8073,
"step": 17800
},
{
"epoch": 0.7645978386228696,
"grad_norm": 10.161076545715332,
"learning_rate": 4.1396794760239846e-05,
"loss": 3.7171,
"step": 17900
},
{
"epoch": 0.7688693349280253,
"grad_norm": 9.742669105529785,
"learning_rate": 4.131769210080843e-05,
"loss": 3.6089,
"step": 18000
},
{
"epoch": 0.7688693349280253,
"eval_runtime": 403.6413,
"eval_samples_per_second": 115.999,
"eval_steps_per_second": 14.5,
"step": 18000
},
{
"epoch": 0.773140831233181,
"grad_norm": 7.834203243255615,
"learning_rate": 4.123858944137702e-05,
"loss": 3.7618,
"step": 18100
},
{
"epoch": 0.7774123275383367,
"grad_norm": 8.670865058898926,
"learning_rate": 4.1159486781945614e-05,
"loss": 3.5937,
"step": 18200
},
{
"epoch": 0.7816838238434923,
"grad_norm": 10.17273998260498,
"learning_rate": 4.10803841225142e-05,
"loss": 3.6682,
"step": 18300
},
{
"epoch": 0.7859553201486481,
"grad_norm": 7.384734630584717,
"learning_rate": 4.100128146308279e-05,
"loss": 3.7461,
"step": 18400
},
{
"epoch": 0.7902268164538038,
"grad_norm": 9.957521438598633,
"learning_rate": 4.092217880365138e-05,
"loss": 3.646,
"step": 18500
},
{
"epoch": 0.7944983127589594,
"grad_norm": 9.32741928100586,
"learning_rate": 4.084307614421997e-05,
"loss": 3.7085,
"step": 18600
},
{
"epoch": 0.7987698090641152,
"grad_norm": 8.64340591430664,
"learning_rate": 4.076397348478856e-05,
"loss": 3.7954,
"step": 18700
},
{
"epoch": 0.8030413053692709,
"grad_norm": 8.776473999023438,
"learning_rate": 4.068487082535715e-05,
"loss": 3.7171,
"step": 18800
},
{
"epoch": 0.8073128016744265,
"grad_norm": 13.726973533630371,
"learning_rate": 4.060576816592574e-05,
"loss": 3.7593,
"step": 18900
},
{
"epoch": 0.8115842979795822,
"grad_norm": 8.291767120361328,
"learning_rate": 4.052666550649433e-05,
"loss": 3.6206,
"step": 19000
},
{
"epoch": 0.8115842979795822,
"eval_runtime": 404.755,
"eval_samples_per_second": 115.68,
"eval_steps_per_second": 14.461,
"step": 19000
},
{
"epoch": 0.815855794284738,
"grad_norm": 9.087343215942383,
"learning_rate": 4.044756284706292e-05,
"loss": 3.6748,
"step": 19100
},
{
"epoch": 0.8201272905898936,
"grad_norm": 7.206308364868164,
"learning_rate": 4.0368460187631515e-05,
"loss": 3.6976,
"step": 19200
},
{
"epoch": 0.8243987868950493,
"grad_norm": 13.21072006225586,
"learning_rate": 4.02893575282001e-05,
"loss": 3.7547,
"step": 19300
},
{
"epoch": 0.8286702832002051,
"grad_norm": 11.711308479309082,
"learning_rate": 4.0210254868768685e-05,
"loss": 3.7046,
"step": 19400
},
{
"epoch": 0.8329417795053607,
"grad_norm": 7.493105411529541,
"learning_rate": 4.013115220933728e-05,
"loss": 3.6605,
"step": 19500
},
{
"epoch": 0.8372132758105164,
"grad_norm": 10.920802116394043,
"learning_rate": 4.005204954990587e-05,
"loss": 3.701,
"step": 19600
},
{
"epoch": 0.8414847721156721,
"grad_norm": 8.588319778442383,
"learning_rate": 3.997294689047446e-05,
"loss": 3.7119,
"step": 19700
},
{
"epoch": 0.8457562684208279,
"grad_norm": 9.688274383544922,
"learning_rate": 3.989384423104305e-05,
"loss": 3.6968,
"step": 19800
},
{
"epoch": 0.8500277647259835,
"grad_norm": 13.46649169921875,
"learning_rate": 3.981474157161164e-05,
"loss": 3.6608,
"step": 19900
},
{
"epoch": 0.8542992610311392,
"grad_norm": 9.020798683166504,
"learning_rate": 3.973563891218023e-05,
"loss": 3.7014,
"step": 20000
},
{
"epoch": 0.8542992610311392,
"eval_runtime": 404.5707,
"eval_samples_per_second": 115.733,
"eval_steps_per_second": 14.467,
"step": 20000
},
{
"epoch": 0.858570757336295,
"grad_norm": 7.667457103729248,
"learning_rate": 3.9656536252748825e-05,
"loss": 3.6261,
"step": 20100
},
{
"epoch": 0.8628422536414506,
"grad_norm": 10.752103805541992,
"learning_rate": 3.957743359331741e-05,
"loss": 3.7678,
"step": 20200
},
{
"epoch": 0.8671137499466063,
"grad_norm": 8.758957862854004,
"learning_rate": 3.9498330933885994e-05,
"loss": 3.8176,
"step": 20300
},
{
"epoch": 0.871385246251762,
"grad_norm": 9.372211456298828,
"learning_rate": 3.941922827445459e-05,
"loss": 3.6383,
"step": 20400
},
{
"epoch": 0.8756567425569177,
"grad_norm": 10.67364501953125,
"learning_rate": 3.934012561502318e-05,
"loss": 3.7067,
"step": 20500
},
{
"epoch": 0.8799282388620734,
"grad_norm": 12.151751518249512,
"learning_rate": 3.926102295559177e-05,
"loss": 3.7826,
"step": 20600
},
{
"epoch": 0.884199735167229,
"grad_norm": 7.820495128631592,
"learning_rate": 3.918192029616036e-05,
"loss": 3.6867,
"step": 20700
},
{
"epoch": 0.8884712314723848,
"grad_norm": 9.453180313110352,
"learning_rate": 3.910281763672895e-05,
"loss": 3.7301,
"step": 20800
},
{
"epoch": 0.8927427277775405,
"grad_norm": 11.202925682067871,
"learning_rate": 3.9023714977297536e-05,
"loss": 3.6845,
"step": 20900
},
{
"epoch": 0.8970142240826962,
"grad_norm": 13.49270248413086,
"learning_rate": 3.894461231786613e-05,
"loss": 3.7193,
"step": 21000
},
{
"epoch": 0.8970142240826962,
"eval_runtime": 403.5135,
"eval_samples_per_second": 116.036,
"eval_steps_per_second": 14.505,
"step": 21000
},
{
"epoch": 0.9012857203878518,
"grad_norm": 8.086437225341797,
"learning_rate": 3.886550965843472e-05,
"loss": 3.6406,
"step": 21100
},
{
"epoch": 0.9055572166930076,
"grad_norm": 10.620895385742188,
"learning_rate": 3.878640699900331e-05,
"loss": 3.762,
"step": 21200
},
{
"epoch": 0.9098287129981633,
"grad_norm": 6.320925712585449,
"learning_rate": 3.8707304339571896e-05,
"loss": 3.7283,
"step": 21300
},
{
"epoch": 0.9141002093033189,
"grad_norm": 8.072772026062012,
"learning_rate": 3.862820168014049e-05,
"loss": 3.6657,
"step": 21400
},
{
"epoch": 0.9183717056084747,
"grad_norm": 8.310846328735352,
"learning_rate": 3.854909902070908e-05,
"loss": 3.7271,
"step": 21500
},
{
"epoch": 0.9226432019136304,
"grad_norm": 6.958920478820801,
"learning_rate": 3.846999636127766e-05,
"loss": 3.8027,
"step": 21600
},
{
"epoch": 0.926914698218786,
"grad_norm": 10.530051231384277,
"learning_rate": 3.839089370184626e-05,
"loss": 3.6227,
"step": 21700
},
{
"epoch": 0.9311861945239417,
"grad_norm": 9.503134727478027,
"learning_rate": 3.8311791042414846e-05,
"loss": 3.7262,
"step": 21800
},
{
"epoch": 0.9354576908290975,
"grad_norm": 8.891386985778809,
"learning_rate": 3.823268838298344e-05,
"loss": 3.6882,
"step": 21900
},
{
"epoch": 0.9397291871342531,
"grad_norm": 9.793424606323242,
"learning_rate": 3.815358572355203e-05,
"loss": 3.7304,
"step": 22000
},
{
"epoch": 0.9397291871342531,
"eval_runtime": 404.4356,
"eval_samples_per_second": 115.771,
"eval_steps_per_second": 14.472,
"step": 22000
},
{
"epoch": 0.9440006834394088,
"grad_norm": 8.443710327148438,
"learning_rate": 3.807448306412062e-05,
"loss": 3.6348,
"step": 22100
},
{
"epoch": 0.9482721797445646,
"grad_norm": 8.634336471557617,
"learning_rate": 3.7995380404689205e-05,
"loss": 3.6732,
"step": 22200
},
{
"epoch": 0.9525436760497202,
"grad_norm": 9.720598220825195,
"learning_rate": 3.79162777452578e-05,
"loss": 3.7,
"step": 22300
},
{
"epoch": 0.9568151723548759,
"grad_norm": 9.388401985168457,
"learning_rate": 3.783717508582639e-05,
"loss": 3.646,
"step": 22400
},
{
"epoch": 0.9610866686600316,
"grad_norm": 8.947492599487305,
"learning_rate": 3.775807242639497e-05,
"loss": 3.5409,
"step": 22500
},
{
"epoch": 0.9653581649651873,
"grad_norm": 9.596850395202637,
"learning_rate": 3.7678969766963565e-05,
"loss": 3.6636,
"step": 22600
},
{
"epoch": 0.969629661270343,
"grad_norm": 12.37696361541748,
"learning_rate": 3.7599867107532156e-05,
"loss": 3.7021,
"step": 22700
},
{
"epoch": 0.9739011575754987,
"grad_norm": 8.767573356628418,
"learning_rate": 3.752076444810075e-05,
"loss": 3.6152,
"step": 22800
},
{
"epoch": 0.9781726538806544,
"grad_norm": 8.559804916381836,
"learning_rate": 3.744166178866933e-05,
"loss": 3.6866,
"step": 22900
},
{
"epoch": 0.9824441501858101,
"grad_norm": 7.025428771972656,
"learning_rate": 3.736255912923793e-05,
"loss": 3.7933,
"step": 23000
},
{
"epoch": 0.9824441501858101,
"eval_runtime": 403.8391,
"eval_samples_per_second": 115.942,
"eval_steps_per_second": 14.493,
"step": 23000
},
{
"epoch": 0.9867156464909658,
"grad_norm": 8.558939933776855,
"learning_rate": 3.7283456469806515e-05,
"loss": 3.6242,
"step": 23100
},
{
"epoch": 0.9909871427961214,
"grad_norm": 7.838054656982422,
"learning_rate": 3.720435381037511e-05,
"loss": 3.7873,
"step": 23200
},
{
"epoch": 0.9952586391012772,
"grad_norm": 9.238251686096191,
"learning_rate": 3.71252511509437e-05,
"loss": 3.7437,
"step": 23300
},
{
"epoch": 0.9995301354064329,
"grad_norm": 8.105572700500488,
"learning_rate": 3.704614849151229e-05,
"loss": 3.5954,
"step": 23400
},
{
"epoch": 1.0038016317115885,
"grad_norm": 8.4044771194458,
"learning_rate": 3.6967045832080874e-05,
"loss": 3.6959,
"step": 23500
},
{
"epoch": 1.0080731280167443,
"grad_norm": 7.410630702972412,
"learning_rate": 3.6887943172649466e-05,
"loss": 3.5738,
"step": 23600
},
{
"epoch": 1.0123446243218999,
"grad_norm": 13.264152526855469,
"learning_rate": 3.680884051321806e-05,
"loss": 3.7171,
"step": 23700
},
{
"epoch": 1.0166161206270556,
"grad_norm": 10.43344783782959,
"learning_rate": 3.672973785378664e-05,
"loss": 3.7067,
"step": 23800
},
{
"epoch": 1.0208876169322114,
"grad_norm": 10.395238876342773,
"learning_rate": 3.665063519435524e-05,
"loss": 3.7069,
"step": 23900
},
{
"epoch": 1.025159113237367,
"grad_norm": 9.611321449279785,
"learning_rate": 3.6571532534923825e-05,
"loss": 3.6583,
"step": 24000
},
{
"epoch": 1.025159113237367,
"eval_runtime": 403.723,
"eval_samples_per_second": 115.976,
"eval_steps_per_second": 14.498,
"step": 24000
},
{
"epoch": 1.0294306095425227,
"grad_norm": 7.200385570526123,
"learning_rate": 3.6492429875492417e-05,
"loss": 3.6306,
"step": 24100
},
{
"epoch": 1.0337021058476785,
"grad_norm": 9.20836067199707,
"learning_rate": 3.641332721606101e-05,
"loss": 3.6762,
"step": 24200
},
{
"epoch": 1.037973602152834,
"grad_norm": 7.563958644866943,
"learning_rate": 3.63342245566296e-05,
"loss": 3.7167,
"step": 24300
},
{
"epoch": 1.0422450984579898,
"grad_norm": 13.854744911193848,
"learning_rate": 3.6255121897198184e-05,
"loss": 3.618,
"step": 24400
},
{
"epoch": 1.0465165947631456,
"grad_norm": 7.969038963317871,
"learning_rate": 3.6176019237766776e-05,
"loss": 3.722,
"step": 24500
},
{
"epoch": 1.0507880910683012,
"grad_norm": 9.738038063049316,
"learning_rate": 3.609691657833537e-05,
"loss": 3.7552,
"step": 24600
},
{
"epoch": 1.055059587373457,
"grad_norm": 10.093921661376953,
"learning_rate": 3.601781391890395e-05,
"loss": 3.7502,
"step": 24700
},
{
"epoch": 1.0593310836786127,
"grad_norm": 6.876298427581787,
"learning_rate": 3.593871125947254e-05,
"loss": 3.6343,
"step": 24800
},
{
"epoch": 1.0636025799837683,
"grad_norm": 7.968320846557617,
"learning_rate": 3.5859608600041135e-05,
"loss": 3.6845,
"step": 24900
},
{
"epoch": 1.067874076288924,
"grad_norm": 9.148797988891602,
"learning_rate": 3.5780505940609726e-05,
"loss": 3.6106,
"step": 25000
},
{
"epoch": 1.067874076288924,
"eval_runtime": 403.8237,
"eval_samples_per_second": 115.947,
"eval_steps_per_second": 14.494,
"step": 25000
},
{
"epoch": 1.0721455725940796,
"grad_norm": 9.72523307800293,
"learning_rate": 3.570140328117831e-05,
"loss": 3.6919,
"step": 25100
},
{
"epoch": 1.0764170688992354,
"grad_norm": 9.23186206817627,
"learning_rate": 3.562230062174691e-05,
"loss": 3.6378,
"step": 25200
},
{
"epoch": 1.0806885652043912,
"grad_norm": 8.581460952758789,
"learning_rate": 3.5543197962315494e-05,
"loss": 3.7005,
"step": 25300
},
{
"epoch": 1.0849600615095467,
"grad_norm": 9.83565902709961,
"learning_rate": 3.5464095302884085e-05,
"loss": 3.6315,
"step": 25400
},
{
"epoch": 1.0892315578147025,
"grad_norm": 7.4981770515441895,
"learning_rate": 3.538499264345268e-05,
"loss": 3.703,
"step": 25500
},
{
"epoch": 1.0935030541198583,
"grad_norm": 9.84447193145752,
"learning_rate": 3.530588998402126e-05,
"loss": 3.703,
"step": 25600
},
{
"epoch": 1.0977745504250138,
"grad_norm": 11.77198600769043,
"learning_rate": 3.522678732458985e-05,
"loss": 3.74,
"step": 25700
},
{
"epoch": 1.1020460467301696,
"grad_norm": 9.35525131225586,
"learning_rate": 3.5147684665158445e-05,
"loss": 3.6574,
"step": 25800
},
{
"epoch": 1.1063175430353254,
"grad_norm": 11.326153755187988,
"learning_rate": 3.5068582005727036e-05,
"loss": 3.6052,
"step": 25900
},
{
"epoch": 1.110589039340481,
"grad_norm": 8.957196235656738,
"learning_rate": 3.498947934629562e-05,
"loss": 3.6924,
"step": 26000
},
{
"epoch": 1.110589039340481,
"eval_runtime": 404.7232,
"eval_samples_per_second": 115.689,
"eval_steps_per_second": 14.462,
"step": 26000
},
{
"epoch": 1.1148605356456367,
"grad_norm": 8.46112060546875,
"learning_rate": 3.491037668686421e-05,
"loss": 3.5975,
"step": 26100
},
{
"epoch": 1.1191320319507922,
"grad_norm": 10.088958740234375,
"learning_rate": 3.4831274027432804e-05,
"loss": 3.6885,
"step": 26200
},
{
"epoch": 1.123403528255948,
"grad_norm": 8.147522926330566,
"learning_rate": 3.4752171368001395e-05,
"loss": 3.587,
"step": 26300
},
{
"epoch": 1.1276750245611038,
"grad_norm": 9.306445121765137,
"learning_rate": 3.467306870856999e-05,
"loss": 3.6943,
"step": 26400
},
{
"epoch": 1.1319465208662594,
"grad_norm": 7.206762790679932,
"learning_rate": 3.459396604913858e-05,
"loss": 3.8088,
"step": 26500
},
{
"epoch": 1.1362180171714151,
"grad_norm": 7.733761787414551,
"learning_rate": 3.451486338970716e-05,
"loss": 3.6604,
"step": 26600
},
{
"epoch": 1.140489513476571,
"grad_norm": 12.075209617614746,
"learning_rate": 3.4435760730275754e-05,
"loss": 3.6716,
"step": 26700
},
{
"epoch": 1.1447610097817265,
"grad_norm": 13.957979202270508,
"learning_rate": 3.4356658070844346e-05,
"loss": 3.6605,
"step": 26800
},
{
"epoch": 1.1490325060868822,
"grad_norm": 6.747539520263672,
"learning_rate": 3.427755541141293e-05,
"loss": 3.6524,
"step": 26900
},
{
"epoch": 1.153304002392038,
"grad_norm": 7.960626602172852,
"learning_rate": 3.419845275198152e-05,
"loss": 3.6574,
"step": 27000
},
{
"epoch": 1.153304002392038,
"eval_runtime": 403.5226,
"eval_samples_per_second": 116.033,
"eval_steps_per_second": 14.505,
"step": 27000
},
{
"epoch": 1.1575754986971936,
"grad_norm": 6.446537971496582,
"learning_rate": 3.4119350092550114e-05,
"loss": 3.6968,
"step": 27100
},
{
"epoch": 1.1618469950023493,
"grad_norm": 8.360943794250488,
"learning_rate": 3.4040247433118705e-05,
"loss": 3.7481,
"step": 27200
},
{
"epoch": 1.1661184913075051,
"grad_norm": 9.145593643188477,
"learning_rate": 3.396114477368729e-05,
"loss": 3.7588,
"step": 27300
},
{
"epoch": 1.1703899876126607,
"grad_norm": 11.084358215332031,
"learning_rate": 3.388204211425589e-05,
"loss": 3.6393,
"step": 27400
},
{
"epoch": 1.1746614839178164,
"grad_norm": 11.370959281921387,
"learning_rate": 3.380293945482447e-05,
"loss": 3.6566,
"step": 27500
},
{
"epoch": 1.1789329802229722,
"grad_norm": 9.31324291229248,
"learning_rate": 3.372383679539306e-05,
"loss": 3.5833,
"step": 27600
},
{
"epoch": 1.1832044765281278,
"grad_norm": 10.302188873291016,
"learning_rate": 3.3644734135961656e-05,
"loss": 3.6591,
"step": 27700
},
{
"epoch": 1.1874759728332835,
"grad_norm": 9.487174034118652,
"learning_rate": 3.356563147653024e-05,
"loss": 3.6245,
"step": 27800
},
{
"epoch": 1.191747469138439,
"grad_norm": 8.596895217895508,
"learning_rate": 3.348652881709883e-05,
"loss": 3.7252,
"step": 27900
},
{
"epoch": 1.1960189654435949,
"grad_norm": 8.368951797485352,
"learning_rate": 3.3407426157667423e-05,
"loss": 3.7371,
"step": 28000
},
{
"epoch": 1.1960189654435949,
"eval_runtime": 404.648,
"eval_samples_per_second": 115.71,
"eval_steps_per_second": 14.464,
"step": 28000
},
{
"epoch": 1.2002904617487506,
"grad_norm": 15.659690856933594,
"learning_rate": 3.3328323498236015e-05,
"loss": 3.5971,
"step": 28100
},
{
"epoch": 1.2045619580539062,
"grad_norm": 7.678028106689453,
"learning_rate": 3.32492208388046e-05,
"loss": 3.722,
"step": 28200
},
{
"epoch": 1.208833454359062,
"grad_norm": 7.527515888214111,
"learning_rate": 3.317011817937319e-05,
"loss": 3.6987,
"step": 28300
},
{
"epoch": 1.2131049506642178,
"grad_norm": 7.842383861541748,
"learning_rate": 3.309101551994178e-05,
"loss": 3.6394,
"step": 28400
},
{
"epoch": 1.2173764469693733,
"grad_norm": 7.929213523864746,
"learning_rate": 3.3011912860510374e-05,
"loss": 3.6538,
"step": 28500
},
{
"epoch": 1.221647943274529,
"grad_norm": 10.908970832824707,
"learning_rate": 3.293281020107896e-05,
"loss": 3.8107,
"step": 28600
},
{
"epoch": 1.2259194395796849,
"grad_norm": 13.46042251586914,
"learning_rate": 3.285370754164755e-05,
"loss": 3.7662,
"step": 28700
},
{
"epoch": 1.2301909358848404,
"grad_norm": 9.317851066589355,
"learning_rate": 3.277460488221614e-05,
"loss": 3.7211,
"step": 28800
},
{
"epoch": 1.2344624321899962,
"grad_norm": 8.503124237060547,
"learning_rate": 3.2695502222784726e-05,
"loss": 3.5346,
"step": 28900
},
{
"epoch": 1.2387339284951517,
"grad_norm": 7.7580718994140625,
"learning_rate": 3.2616399563353325e-05,
"loss": 3.7839,
"step": 29000
},
{
"epoch": 1.2387339284951517,
"eval_runtime": 404.9737,
"eval_samples_per_second": 115.617,
"eval_steps_per_second": 14.453,
"step": 29000
},
{
"epoch": 1.2430054248003075,
"grad_norm": 6.63585090637207,
"learning_rate": 3.253729690392191e-05,
"loss": 3.7525,
"step": 29100
},
{
"epoch": 1.2472769211054633,
"grad_norm": 8.438802719116211,
"learning_rate": 3.24581942444905e-05,
"loss": 3.6626,
"step": 29200
},
{
"epoch": 1.251548417410619,
"grad_norm": 10.101741790771484,
"learning_rate": 3.237909158505909e-05,
"loss": 3.6074,
"step": 29300
},
{
"epoch": 1.2558199137157746,
"grad_norm": 7.5071797370910645,
"learning_rate": 3.2299988925627684e-05,
"loss": 3.6081,
"step": 29400
},
{
"epoch": 1.2600914100209304,
"grad_norm": 8.632312774658203,
"learning_rate": 3.222088626619627e-05,
"loss": 3.5992,
"step": 29500
},
{
"epoch": 1.264362906326086,
"grad_norm": 8.449037551879883,
"learning_rate": 3.214178360676487e-05,
"loss": 3.5541,
"step": 29600
},
{
"epoch": 1.2686344026312417,
"grad_norm": 7.799576759338379,
"learning_rate": 3.206268094733345e-05,
"loss": 3.6541,
"step": 29700
},
{
"epoch": 1.2729058989363975,
"grad_norm": 11.673965454101562,
"learning_rate": 3.1983578287902036e-05,
"loss": 3.6498,
"step": 29800
},
{
"epoch": 1.277177395241553,
"grad_norm": 8.359036445617676,
"learning_rate": 3.1904475628470635e-05,
"loss": 3.6494,
"step": 29900
},
{
"epoch": 1.2814488915467088,
"grad_norm": 13.046087265014648,
"learning_rate": 3.182537296903922e-05,
"loss": 3.8596,
"step": 30000
},
{
"epoch": 1.2814488915467088,
"eval_runtime": 404.5555,
"eval_samples_per_second": 115.737,
"eval_steps_per_second": 14.468,
"step": 30000
},
{
"epoch": 1.2857203878518644,
"grad_norm": 10.439358711242676,
"learning_rate": 3.174627030960781e-05,
"loss": 3.7013,
"step": 30100
},
{
"epoch": 1.2899918841570202,
"grad_norm": 7.784947395324707,
"learning_rate": 3.16671676501764e-05,
"loss": 3.7573,
"step": 30200
},
{
"epoch": 1.294263380462176,
"grad_norm": 11.142706871032715,
"learning_rate": 3.1588064990744994e-05,
"loss": 3.6061,
"step": 30300
},
{
"epoch": 1.2985348767673317,
"grad_norm": 8.045978546142578,
"learning_rate": 3.150896233131358e-05,
"loss": 3.6615,
"step": 30400
},
{
"epoch": 1.3028063730724873,
"grad_norm": 8.409544944763184,
"learning_rate": 3.142985967188217e-05,
"loss": 3.6075,
"step": 30500
},
{
"epoch": 1.307077869377643,
"grad_norm": 10.13918685913086,
"learning_rate": 3.135075701245076e-05,
"loss": 3.6318,
"step": 30600
},
{
"epoch": 1.3113493656827986,
"grad_norm": 10.452644348144531,
"learning_rate": 3.1271654353019346e-05,
"loss": 3.6653,
"step": 30700
},
{
"epoch": 1.3156208619879544,
"grad_norm": 8.69783878326416,
"learning_rate": 3.119255169358794e-05,
"loss": 3.5687,
"step": 30800
},
{
"epoch": 1.3198923582931101,
"grad_norm": 9.234668731689453,
"learning_rate": 3.111344903415653e-05,
"loss": 3.6629,
"step": 30900
},
{
"epoch": 1.324163854598266,
"grad_norm": 7.855345249176025,
"learning_rate": 3.103434637472512e-05,
"loss": 3.7406,
"step": 31000
},
{
"epoch": 1.324163854598266,
"eval_runtime": 404.076,
"eval_samples_per_second": 115.874,
"eval_steps_per_second": 14.485,
"step": 31000
},
{
"epoch": 1.3284353509034215,
"grad_norm": 13.292342185974121,
"learning_rate": 3.0955243715293705e-05,
"loss": 3.7277,
"step": 31100
},
{
"epoch": 1.3327068472085772,
"grad_norm": 14.126769065856934,
"learning_rate": 3.0876141055862304e-05,
"loss": 3.8065,
"step": 31200
},
{
"epoch": 1.3369783435137328,
"grad_norm": 7.635355472564697,
"learning_rate": 3.079703839643089e-05,
"loss": 3.6653,
"step": 31300
},
{
"epoch": 1.3412498398188886,
"grad_norm": 9.28641128540039,
"learning_rate": 3.071793573699948e-05,
"loss": 3.7125,
"step": 31400
},
{
"epoch": 1.3455213361240443,
"grad_norm": 8.960599899291992,
"learning_rate": 3.063883307756807e-05,
"loss": 3.639,
"step": 31500
},
{
"epoch": 1.3497928324292,
"grad_norm": 10.085050582885742,
"learning_rate": 3.055973041813666e-05,
"loss": 3.7317,
"step": 31600
},
{
"epoch": 1.3540643287343557,
"grad_norm": 8.527816772460938,
"learning_rate": 3.048062775870525e-05,
"loss": 3.6044,
"step": 31700
},
{
"epoch": 1.3583358250395112,
"grad_norm": 11.678420066833496,
"learning_rate": 3.0401525099273835e-05,
"loss": 3.572,
"step": 31800
},
{
"epoch": 1.362607321344667,
"grad_norm": 5.9545207023620605,
"learning_rate": 3.032242243984243e-05,
"loss": 3.7374,
"step": 31900
},
{
"epoch": 1.3668788176498228,
"grad_norm": 8.175214767456055,
"learning_rate": 3.024331978041102e-05,
"loss": 3.624,
"step": 32000
},
{
"epoch": 1.3668788176498228,
"eval_runtime": 403.7487,
"eval_samples_per_second": 115.968,
"eval_steps_per_second": 14.497,
"step": 32000
},
{
"epoch": 1.3711503139549786,
"grad_norm": 7.345489978790283,
"learning_rate": 3.016421712097961e-05,
"loss": 3.6508,
"step": 32100
},
{
"epoch": 1.375421810260134,
"grad_norm": 10.301737785339355,
"learning_rate": 3.0085114461548198e-05,
"loss": 3.5836,
"step": 32200
},
{
"epoch": 1.3796933065652899,
"grad_norm": 8.771992683410645,
"learning_rate": 3.000601180211679e-05,
"loss": 3.7915,
"step": 32300
},
{
"epoch": 1.3839648028704454,
"grad_norm": 9.168205261230469,
"learning_rate": 2.9926909142685378e-05,
"loss": 3.6517,
"step": 32400
},
{
"epoch": 1.3882362991756012,
"grad_norm": 7.1654744148254395,
"learning_rate": 2.984780648325397e-05,
"loss": 3.503,
"step": 32500
},
{
"epoch": 1.392507795480757,
"grad_norm": 8.390599250793457,
"learning_rate": 2.9768703823822557e-05,
"loss": 3.7276,
"step": 32600
},
{
"epoch": 1.3967792917859125,
"grad_norm": 12.229002952575684,
"learning_rate": 2.9689601164391152e-05,
"loss": 3.7753,
"step": 32700
},
{
"epoch": 1.4010507880910683,
"grad_norm": 11.649025917053223,
"learning_rate": 2.9610498504959737e-05,
"loss": 3.6765,
"step": 32800
},
{
"epoch": 1.4053222843962239,
"grad_norm": 8.619730949401855,
"learning_rate": 2.9531395845528325e-05,
"loss": 3.6508,
"step": 32900
},
{
"epoch": 1.4095937807013796,
"grad_norm": 9.323366165161133,
"learning_rate": 2.945229318609692e-05,
"loss": 3.7256,
"step": 33000
},
{
"epoch": 1.4095937807013796,
"eval_runtime": 404.2192,
"eval_samples_per_second": 115.833,
"eval_steps_per_second": 14.48,
"step": 33000
},
{
"epoch": 1.4138652770065354,
"grad_norm": 13.431550979614258,
"learning_rate": 2.9373190526665504e-05,
"loss": 3.7312,
"step": 33100
},
{
"epoch": 1.4181367733116912,
"grad_norm": 7.221197128295898,
"learning_rate": 2.92940878672341e-05,
"loss": 3.5233,
"step": 33200
},
{
"epoch": 1.4224082696168467,
"grad_norm": 8.221494674682617,
"learning_rate": 2.9214985207802687e-05,
"loss": 3.6815,
"step": 33300
},
{
"epoch": 1.4266797659220025,
"grad_norm": 8.996779441833496,
"learning_rate": 2.913588254837128e-05,
"loss": 3.5838,
"step": 33400
},
{
"epoch": 1.430951262227158,
"grad_norm": 7.899658679962158,
"learning_rate": 2.9056779888939867e-05,
"loss": 3.5632,
"step": 33500
},
{
"epoch": 1.4352227585323138,
"grad_norm": 7.839208602905273,
"learning_rate": 2.897767722950846e-05,
"loss": 3.6154,
"step": 33600
},
{
"epoch": 1.4394942548374696,
"grad_norm": 7.053780555725098,
"learning_rate": 2.8898574570077047e-05,
"loss": 3.6218,
"step": 33700
},
{
"epoch": 1.4437657511426254,
"grad_norm": 11.56430721282959,
"learning_rate": 2.8819471910645635e-05,
"loss": 3.6301,
"step": 33800
},
{
"epoch": 1.448037247447781,
"grad_norm": 12.948223114013672,
"learning_rate": 2.8740369251214226e-05,
"loss": 3.6278,
"step": 33900
},
{
"epoch": 1.4523087437529367,
"grad_norm": 10.741262435913086,
"learning_rate": 2.8661266591782814e-05,
"loss": 3.7338,
"step": 34000
},
{
"epoch": 1.4523087437529367,
"eval_runtime": 404.4425,
"eval_samples_per_second": 115.769,
"eval_steps_per_second": 14.472,
"step": 34000
},
{
"epoch": 1.4565802400580923,
"grad_norm": 8.808819770812988,
"learning_rate": 2.858216393235141e-05,
"loss": 3.5755,
"step": 34100
},
{
"epoch": 1.460851736363248,
"grad_norm": 7.9008097648620605,
"learning_rate": 2.8503061272919994e-05,
"loss": 3.5744,
"step": 34200
},
{
"epoch": 1.4651232326684038,
"grad_norm": 10.011557579040527,
"learning_rate": 2.842395861348859e-05,
"loss": 3.7014,
"step": 34300
},
{
"epoch": 1.4693947289735594,
"grad_norm": 8.058487892150879,
"learning_rate": 2.8344855954057177e-05,
"loss": 3.6554,
"step": 34400
},
{
"epoch": 1.4736662252787152,
"grad_norm": 7.3602824211120605,
"learning_rate": 2.8265753294625768e-05,
"loss": 3.7166,
"step": 34500
},
{
"epoch": 1.4779377215838707,
"grad_norm": 7.900210857391357,
"learning_rate": 2.8186650635194356e-05,
"loss": 3.6276,
"step": 34600
},
{
"epoch": 1.4822092178890265,
"grad_norm": 7.839376926422119,
"learning_rate": 2.8107547975762948e-05,
"loss": 3.6036,
"step": 34700
},
{
"epoch": 1.4864807141941823,
"grad_norm": 8.925679206848145,
"learning_rate": 2.8028445316331536e-05,
"loss": 3.6868,
"step": 34800
},
{
"epoch": 1.490752210499338,
"grad_norm": 8.532880783081055,
"learning_rate": 2.7949342656900124e-05,
"loss": 3.7388,
"step": 34900
},
{
"epoch": 1.4950237068044936,
"grad_norm": 9.397866249084473,
"learning_rate": 2.7870239997468716e-05,
"loss": 3.7206,
"step": 35000
},
{
"epoch": 1.4950237068044936,
"eval_runtime": 404.715,
"eval_samples_per_second": 115.691,
"eval_steps_per_second": 14.462,
"step": 35000
},
{
"epoch": 1.4992952031096494,
"grad_norm": 9.152342796325684,
"learning_rate": 2.7791137338037304e-05,
"loss": 3.6816,
"step": 35100
},
{
"epoch": 1.503566699414805,
"grad_norm": 7.594329357147217,
"learning_rate": 2.77120346786059e-05,
"loss": 3.6713,
"step": 35200
},
{
"epoch": 1.5078381957199607,
"grad_norm": 9.826537132263184,
"learning_rate": 2.7632932019174483e-05,
"loss": 3.6467,
"step": 35300
},
{
"epoch": 1.5121096920251165,
"grad_norm": 9.374577522277832,
"learning_rate": 2.7553829359743078e-05,
"loss": 3.6588,
"step": 35400
},
{
"epoch": 1.5163811883302722,
"grad_norm": 10.790063858032227,
"learning_rate": 2.7474726700311666e-05,
"loss": 3.5689,
"step": 35500
},
{
"epoch": 1.5206526846354278,
"grad_norm": 10.145702362060547,
"learning_rate": 2.7395624040880258e-05,
"loss": 3.7457,
"step": 35600
},
{
"epoch": 1.5249241809405834,
"grad_norm": 11.168187141418457,
"learning_rate": 2.7316521381448846e-05,
"loss": 3.5706,
"step": 35700
},
{
"epoch": 1.5291956772457391,
"grad_norm": 9.234560012817383,
"learning_rate": 2.7237418722017437e-05,
"loss": 3.6744,
"step": 35800
},
{
"epoch": 1.533467173550895,
"grad_norm": 10.015559196472168,
"learning_rate": 2.7158316062586025e-05,
"loss": 3.6374,
"step": 35900
},
{
"epoch": 1.5377386698560507,
"grad_norm": 8.472687721252441,
"learning_rate": 2.7079213403154613e-05,
"loss": 3.6958,
"step": 36000
},
{
"epoch": 1.5377386698560507,
"eval_runtime": 404.1589,
"eval_samples_per_second": 115.85,
"eval_steps_per_second": 14.482,
"step": 36000
},
{
"epoch": 1.5420101661612062,
"grad_norm": 7.5909199714660645,
"learning_rate": 2.7000110743723205e-05,
"loss": 3.6096,
"step": 36100
},
{
"epoch": 1.546281662466362,
"grad_norm": 18.598318099975586,
"learning_rate": 2.6921008084291793e-05,
"loss": 3.6368,
"step": 36200
},
{
"epoch": 1.5505531587715176,
"grad_norm": 10.265989303588867,
"learning_rate": 2.6841905424860388e-05,
"loss": 3.6886,
"step": 36300
},
{
"epoch": 1.5548246550766733,
"grad_norm": 16.7838077545166,
"learning_rate": 2.6762802765428973e-05,
"loss": 3.4999,
"step": 36400
},
{
"epoch": 1.559096151381829,
"grad_norm": 9.542481422424316,
"learning_rate": 2.6683700105997567e-05,
"loss": 3.5173,
"step": 36500
},
{
"epoch": 1.5633676476869849,
"grad_norm": 7.0144758224487305,
"learning_rate": 2.6604597446566156e-05,
"loss": 3.6004,
"step": 36600
},
{
"epoch": 1.5676391439921404,
"grad_norm": 7.273271560668945,
"learning_rate": 2.6525494787134747e-05,
"loss": 3.5539,
"step": 36700
},
{
"epoch": 1.571910640297296,
"grad_norm": 9.942744255065918,
"learning_rate": 2.6446392127703335e-05,
"loss": 3.6823,
"step": 36800
},
{
"epoch": 1.5761821366024518,
"grad_norm": 8.686135292053223,
"learning_rate": 2.6367289468271923e-05,
"loss": 3.6757,
"step": 36900
},
{
"epoch": 1.5804536329076075,
"grad_norm": 6.468233108520508,
"learning_rate": 2.6288186808840515e-05,
"loss": 3.6318,
"step": 37000
},
{
"epoch": 1.5804536329076075,
"eval_runtime": 403.714,
"eval_samples_per_second": 115.978,
"eval_steps_per_second": 14.498,
"step": 37000
},
{
"epoch": 1.5847251292127633,
"grad_norm": 8.390809059143066,
"learning_rate": 2.6209084149409103e-05,
"loss": 3.6048,
"step": 37100
},
{
"epoch": 1.588996625517919,
"grad_norm": 11.824224472045898,
"learning_rate": 2.6129981489977694e-05,
"loss": 3.6128,
"step": 37200
},
{
"epoch": 1.5932681218230746,
"grad_norm": 9.557259559631348,
"learning_rate": 2.6050878830546282e-05,
"loss": 3.5252,
"step": 37300
},
{
"epoch": 1.5975396181282302,
"grad_norm": 10.761728286743164,
"learning_rate": 2.5971776171114874e-05,
"loss": 3.6447,
"step": 37400
},
{
"epoch": 1.601811114433386,
"grad_norm": 7.978828430175781,
"learning_rate": 2.5892673511683462e-05,
"loss": 3.6211,
"step": 37500
},
{
"epoch": 1.6060826107385417,
"grad_norm": 8.314446449279785,
"learning_rate": 2.5813570852252057e-05,
"loss": 3.6197,
"step": 37600
},
{
"epoch": 1.6103541070436975,
"grad_norm": 7.391338348388672,
"learning_rate": 2.573446819282064e-05,
"loss": 3.6123,
"step": 37700
},
{
"epoch": 1.614625603348853,
"grad_norm": 9.402429580688477,
"learning_rate": 2.5655365533389236e-05,
"loss": 3.7008,
"step": 37800
},
{
"epoch": 1.6188970996540089,
"grad_norm": 9.703052520751953,
"learning_rate": 2.5576262873957825e-05,
"loss": 3.6748,
"step": 37900
},
{
"epoch": 1.6231685959591644,
"grad_norm": 7.890733242034912,
"learning_rate": 2.5497160214526413e-05,
"loss": 3.6766,
"step": 38000
},
{
"epoch": 1.6231685959591644,
"eval_runtime": 403.3234,
"eval_samples_per_second": 116.09,
"eval_steps_per_second": 14.512,
"step": 38000
},
{
"epoch": 1.6274400922643202,
"grad_norm": 7.096985816955566,
"learning_rate": 2.5418057555095004e-05,
"loss": 3.6308,
"step": 38100
},
{
"epoch": 1.631711588569476,
"grad_norm": 10.07547378540039,
"learning_rate": 2.5338954895663592e-05,
"loss": 3.5905,
"step": 38200
},
{
"epoch": 1.6359830848746317,
"grad_norm": 8.68416690826416,
"learning_rate": 2.5259852236232184e-05,
"loss": 3.5352,
"step": 38300
},
{
"epoch": 1.6402545811797873,
"grad_norm": 10.171316146850586,
"learning_rate": 2.5180749576800772e-05,
"loss": 3.6892,
"step": 38400
},
{
"epoch": 1.6445260774849428,
"grad_norm": 12.549208641052246,
"learning_rate": 2.5101646917369363e-05,
"loss": 3.6036,
"step": 38500
},
{
"epoch": 1.6487975737900986,
"grad_norm": 9.339801788330078,
"learning_rate": 2.502254425793795e-05,
"loss": 3.5937,
"step": 38600
},
{
"epoch": 1.6530690700952544,
"grad_norm": 7.933904647827148,
"learning_rate": 2.4943441598506543e-05,
"loss": 3.588,
"step": 38700
},
{
"epoch": 1.6573405664004102,
"grad_norm": 11.310770988464355,
"learning_rate": 2.486433893907513e-05,
"loss": 3.7504,
"step": 38800
},
{
"epoch": 1.6616120627055657,
"grad_norm": 9.128674507141113,
"learning_rate": 2.4785236279643722e-05,
"loss": 3.65,
"step": 38900
},
{
"epoch": 1.6658835590107215,
"grad_norm": 10.278397560119629,
"learning_rate": 2.4706133620212314e-05,
"loss": 3.7471,
"step": 39000
},
{
"epoch": 1.6658835590107215,
"eval_runtime": 403.4984,
"eval_samples_per_second": 116.04,
"eval_steps_per_second": 14.506,
"step": 39000
},
{
"epoch": 1.670155055315877,
"grad_norm": 9.706366539001465,
"learning_rate": 2.4627030960780902e-05,
"loss": 3.6192,
"step": 39100
},
{
"epoch": 1.6744265516210328,
"grad_norm": 8.632245063781738,
"learning_rate": 2.4547928301349494e-05,
"loss": 3.5752,
"step": 39200
},
{
"epoch": 1.6786980479261886,
"grad_norm": 8.402227401733398,
"learning_rate": 2.4468825641918085e-05,
"loss": 3.6774,
"step": 39300
},
{
"epoch": 1.6829695442313444,
"grad_norm": 8.275464057922363,
"learning_rate": 2.4389722982486673e-05,
"loss": 3.6148,
"step": 39400
},
{
"epoch": 1.6872410405365,
"grad_norm": 9.17482852935791,
"learning_rate": 2.431062032305526e-05,
"loss": 3.6413,
"step": 39500
},
{
"epoch": 1.6915125368416555,
"grad_norm": 8.914527893066406,
"learning_rate": 2.4231517663623853e-05,
"loss": 3.7191,
"step": 39600
},
{
"epoch": 1.6957840331468113,
"grad_norm": 8.066243171691895,
"learning_rate": 2.415241500419244e-05,
"loss": 3.5606,
"step": 39700
},
{
"epoch": 1.700055529451967,
"grad_norm": 9.488569259643555,
"learning_rate": 2.4073312344761032e-05,
"loss": 3.6873,
"step": 39800
},
{
"epoch": 1.7043270257571228,
"grad_norm": 9.717203140258789,
"learning_rate": 2.399420968532962e-05,
"loss": 3.688,
"step": 39900
},
{
"epoch": 1.7085985220622786,
"grad_norm": 8.048073768615723,
"learning_rate": 2.3915107025898212e-05,
"loss": 3.515,
"step": 40000
},
{
"epoch": 1.7085985220622786,
"eval_runtime": 404.1323,
"eval_samples_per_second": 115.858,
"eval_steps_per_second": 14.483,
"step": 40000
},
{
"epoch": 1.7128700183674341,
"grad_norm": 9.101920127868652,
"learning_rate": 2.3836004366466803e-05,
"loss": 3.6674,
"step": 40100
},
{
"epoch": 1.7171415146725897,
"grad_norm": 6.701783180236816,
"learning_rate": 2.375690170703539e-05,
"loss": 3.6339,
"step": 40200
},
{
"epoch": 1.7214130109777455,
"grad_norm": 9.65266227722168,
"learning_rate": 2.3677799047603983e-05,
"loss": 3.5557,
"step": 40300
},
{
"epoch": 1.7256845072829012,
"grad_norm": 9.488314628601074,
"learning_rate": 2.359869638817257e-05,
"loss": 3.7312,
"step": 40400
},
{
"epoch": 1.729956003588057,
"grad_norm": 8.73523235321045,
"learning_rate": 2.3519593728741163e-05,
"loss": 3.6714,
"step": 40500
},
{
"epoch": 1.7342274998932126,
"grad_norm": 9.438526153564453,
"learning_rate": 2.344049106930975e-05,
"loss": 3.6664,
"step": 40600
},
{
"epoch": 1.7384989961983683,
"grad_norm": 9.409259796142578,
"learning_rate": 2.336138840987834e-05,
"loss": 3.5598,
"step": 40700
},
{
"epoch": 1.742770492503524,
"grad_norm": 6.831430435180664,
"learning_rate": 2.328228575044693e-05,
"loss": 3.7215,
"step": 40800
},
{
"epoch": 1.7470419888086797,
"grad_norm": 8.387484550476074,
"learning_rate": 2.320318309101552e-05,
"loss": 3.578,
"step": 40900
},
{
"epoch": 1.7513134851138354,
"grad_norm": 9.247336387634277,
"learning_rate": 2.312408043158411e-05,
"loss": 3.615,
"step": 41000
},
{
"epoch": 1.7513134851138354,
"eval_runtime": 404.0825,
"eval_samples_per_second": 115.872,
"eval_steps_per_second": 14.485,
"step": 41000
},
{
"epoch": 1.7555849814189912,
"grad_norm": 11.280122756958008,
"learning_rate": 2.30449777721527e-05,
"loss": 3.5713,
"step": 41100
},
{
"epoch": 1.7598564777241468,
"grad_norm": 8.902118682861328,
"learning_rate": 2.2965875112721293e-05,
"loss": 3.6999,
"step": 41200
},
{
"epoch": 1.7641279740293023,
"grad_norm": 11.0384521484375,
"learning_rate": 2.288677245328988e-05,
"loss": 3.7134,
"step": 41300
},
{
"epoch": 1.768399470334458,
"grad_norm": 8.986517906188965,
"learning_rate": 2.2807669793858472e-05,
"loss": 3.5391,
"step": 41400
},
{
"epoch": 1.7726709666396139,
"grad_norm": 9.237929344177246,
"learning_rate": 2.272856713442706e-05,
"loss": 3.781,
"step": 41500
},
{
"epoch": 1.7769424629447697,
"grad_norm": 12.143738746643066,
"learning_rate": 2.264946447499565e-05,
"loss": 3.5613,
"step": 41600
},
{
"epoch": 1.7812139592499252,
"grad_norm": 9.296298027038574,
"learning_rate": 2.257036181556424e-05,
"loss": 3.6645,
"step": 41700
},
{
"epoch": 1.785485455555081,
"grad_norm": 9.721207618713379,
"learning_rate": 2.2491259156132828e-05,
"loss": 3.5764,
"step": 41800
},
{
"epoch": 1.7897569518602365,
"grad_norm": 11.145936012268066,
"learning_rate": 2.241215649670142e-05,
"loss": 3.5321,
"step": 41900
},
{
"epoch": 1.7940284481653923,
"grad_norm": 10.27043628692627,
"learning_rate": 2.233305383727001e-05,
"loss": 3.6625,
"step": 42000
},
{
"epoch": 1.7940284481653923,
"eval_runtime": 405.1907,
"eval_samples_per_second": 115.555,
"eval_steps_per_second": 14.445,
"step": 42000
},
{
"epoch": 1.798299944470548,
"grad_norm": 10.281463623046875,
"learning_rate": 2.22539511778386e-05,
"loss": 3.5566,
"step": 42100
},
{
"epoch": 1.8025714407757039,
"grad_norm": 6.728999614715576,
"learning_rate": 2.217484851840719e-05,
"loss": 3.5608,
"step": 42200
},
{
"epoch": 1.8068429370808594,
"grad_norm": 6.053191184997559,
"learning_rate": 2.209574585897578e-05,
"loss": 3.685,
"step": 42300
},
{
"epoch": 1.811114433386015,
"grad_norm": 8.071969032287598,
"learning_rate": 2.201664319954437e-05,
"loss": 3.6003,
"step": 42400
},
{
"epoch": 1.8153859296911707,
"grad_norm": 29.326370239257812,
"learning_rate": 2.1937540540112962e-05,
"loss": 3.6615,
"step": 42500
},
{
"epoch": 1.8196574259963265,
"grad_norm": 8.652432441711426,
"learning_rate": 2.185843788068155e-05,
"loss": 3.5731,
"step": 42600
},
{
"epoch": 1.8239289223014823,
"grad_norm": 11.717292785644531,
"learning_rate": 2.1779335221250138e-05,
"loss": 3.6371,
"step": 42700
},
{
"epoch": 1.828200418606638,
"grad_norm": 10.365557670593262,
"learning_rate": 2.170023256181873e-05,
"loss": 3.6857,
"step": 42800
},
{
"epoch": 1.8324719149117936,
"grad_norm": 12.400829315185547,
"learning_rate": 2.1621129902387317e-05,
"loss": 3.6896,
"step": 42900
},
{
"epoch": 1.8367434112169492,
"grad_norm": 8.40799331665039,
"learning_rate": 2.154202724295591e-05,
"loss": 3.6611,
"step": 43000
},
{
"epoch": 1.8367434112169492,
"eval_runtime": 403.8313,
"eval_samples_per_second": 115.944,
"eval_steps_per_second": 14.494,
"step": 43000
},
{
"epoch": 1.841014907522105,
"grad_norm": 10.518604278564453,
"learning_rate": 2.14629245835245e-05,
"loss": 3.6118,
"step": 43100
},
{
"epoch": 1.8452864038272607,
"grad_norm": 7.877737998962402,
"learning_rate": 2.138382192409309e-05,
"loss": 3.5943,
"step": 43200
},
{
"epoch": 1.8495579001324165,
"grad_norm": 12.722783088684082,
"learning_rate": 2.130471926466168e-05,
"loss": 3.6583,
"step": 43300
},
{
"epoch": 1.853829396437572,
"grad_norm": 8.382994651794434,
"learning_rate": 2.1225616605230268e-05,
"loss": 3.5931,
"step": 43400
},
{
"epoch": 1.8581008927427278,
"grad_norm": 10.603730201721191,
"learning_rate": 2.114651394579886e-05,
"loss": 3.6558,
"step": 43500
},
{
"epoch": 1.8623723890478834,
"grad_norm": 6.978638172149658,
"learning_rate": 2.106741128636745e-05,
"loss": 3.504,
"step": 43600
},
{
"epoch": 1.8666438853530392,
"grad_norm": 7.777115345001221,
"learning_rate": 2.0988308626936036e-05,
"loss": 3.7573,
"step": 43700
},
{
"epoch": 1.870915381658195,
"grad_norm": 8.054482460021973,
"learning_rate": 2.0909205967504627e-05,
"loss": 3.5624,
"step": 43800
},
{
"epoch": 1.8751868779633507,
"grad_norm": 8.191532135009766,
"learning_rate": 2.083010330807322e-05,
"loss": 3.6115,
"step": 43900
},
{
"epoch": 1.8794583742685063,
"grad_norm": 9.908390998840332,
"learning_rate": 2.0751000648641807e-05,
"loss": 3.6564,
"step": 44000
},
{
"epoch": 1.8794583742685063,
"eval_runtime": 403.8168,
"eval_samples_per_second": 115.949,
"eval_steps_per_second": 14.494,
"step": 44000
},
{
"epoch": 1.8837298705736618,
"grad_norm": 10.703449249267578,
"learning_rate": 2.06718979892104e-05,
"loss": 3.6265,
"step": 44100
},
{
"epoch": 1.8880013668788176,
"grad_norm": 8.703311920166016,
"learning_rate": 2.059279532977899e-05,
"loss": 3.601,
"step": 44200
},
{
"epoch": 1.8922728631839734,
"grad_norm": 16.844961166381836,
"learning_rate": 2.0513692670347578e-05,
"loss": 3.5735,
"step": 44300
},
{
"epoch": 1.8965443594891291,
"grad_norm": 7.944665908813477,
"learning_rate": 2.043459001091617e-05,
"loss": 3.6514,
"step": 44400
},
{
"epoch": 1.9008158557942847,
"grad_norm": 10.938014030456543,
"learning_rate": 2.0355487351484758e-05,
"loss": 3.6739,
"step": 44500
},
{
"epoch": 1.9050873520994405,
"grad_norm": 7.884680271148682,
"learning_rate": 2.027638469205335e-05,
"loss": 3.6705,
"step": 44600
},
{
"epoch": 1.909358848404596,
"grad_norm": 10.993422508239746,
"learning_rate": 2.0197282032621937e-05,
"loss": 3.5416,
"step": 44700
},
{
"epoch": 1.9136303447097518,
"grad_norm": 9.719098091125488,
"learning_rate": 2.0118179373190525e-05,
"loss": 3.6548,
"step": 44800
},
{
"epoch": 1.9179018410149076,
"grad_norm": 9.458189964294434,
"learning_rate": 2.0039076713759117e-05,
"loss": 3.7504,
"step": 44900
},
{
"epoch": 1.9221733373200633,
"grad_norm": 10.599435806274414,
"learning_rate": 1.9959974054327708e-05,
"loss": 3.5734,
"step": 45000
},
{
"epoch": 1.9221733373200633,
"eval_runtime": 404.0106,
"eval_samples_per_second": 115.893,
"eval_steps_per_second": 14.487,
"step": 45000
},
{
"epoch": 1.926444833625219,
"grad_norm": 9.23690128326416,
"learning_rate": 1.9880871394896296e-05,
"loss": 3.7208,
"step": 45100
},
{
"epoch": 1.9307163299303745,
"grad_norm": 7.124606609344482,
"learning_rate": 1.9801768735464888e-05,
"loss": 3.6351,
"step": 45200
},
{
"epoch": 1.9349878262355302,
"grad_norm": 8.71446704864502,
"learning_rate": 1.9722666076033476e-05,
"loss": 3.6835,
"step": 45300
},
{
"epoch": 1.939259322540686,
"grad_norm": 9.558823585510254,
"learning_rate": 1.9643563416602067e-05,
"loss": 3.5569,
"step": 45400
},
{
"epoch": 1.9435308188458418,
"grad_norm": 9.622088432312012,
"learning_rate": 1.956446075717066e-05,
"loss": 3.6797,
"step": 45500
},
{
"epoch": 1.9478023151509976,
"grad_norm": 8.641619682312012,
"learning_rate": 1.9485358097739247e-05,
"loss": 3.6377,
"step": 45600
},
{
"epoch": 1.952073811456153,
"grad_norm": 12.308704376220703,
"learning_rate": 1.940625543830784e-05,
"loss": 3.5211,
"step": 45700
},
{
"epoch": 1.9563453077613087,
"grad_norm": 8.850275993347168,
"learning_rate": 1.9327152778876426e-05,
"loss": 3.5652,
"step": 45800
},
{
"epoch": 1.9606168040664644,
"grad_norm": 8.595603942871094,
"learning_rate": 1.9248050119445015e-05,
"loss": 3.6181,
"step": 45900
},
{
"epoch": 1.9648883003716202,
"grad_norm": 8.737709999084473,
"learning_rate": 1.9168947460013606e-05,
"loss": 3.6392,
"step": 46000
},
{
"epoch": 1.9648883003716202,
"eval_runtime": 403.5624,
"eval_samples_per_second": 116.022,
"eval_steps_per_second": 14.503,
"step": 46000
},
{
"epoch": 1.969159796676776,
"grad_norm": 10.178166389465332,
"learning_rate": 1.9089844800582198e-05,
"loss": 3.5406,
"step": 46100
},
{
"epoch": 1.9734312929819315,
"grad_norm": 8.49496841430664,
"learning_rate": 1.9010742141150786e-05,
"loss": 3.5631,
"step": 46200
},
{
"epoch": 1.9777027892870873,
"grad_norm": 12.1917724609375,
"learning_rate": 1.8931639481719377e-05,
"loss": 3.6878,
"step": 46300
},
{
"epoch": 1.9819742855922429,
"grad_norm": 7.169999599456787,
"learning_rate": 1.8852536822287965e-05,
"loss": 3.5653,
"step": 46400
},
{
"epoch": 1.9862457818973986,
"grad_norm": 9.828686714172363,
"learning_rate": 1.8773434162856557e-05,
"loss": 3.5959,
"step": 46500
},
{
"epoch": 1.9905172782025544,
"grad_norm": 11.669685363769531,
"learning_rate": 1.8694331503425148e-05,
"loss": 3.7558,
"step": 46600
},
{
"epoch": 1.9947887745077102,
"grad_norm": 9.722572326660156,
"learning_rate": 1.8615228843993736e-05,
"loss": 3.5793,
"step": 46700
},
{
"epoch": 1.9990602708128657,
"grad_norm": 7.060891151428223,
"learning_rate": 1.8536126184562324e-05,
"loss": 3.5613,
"step": 46800
},
{
"epoch": 2.0033317671180213,
"grad_norm": 7.597713470458984,
"learning_rate": 1.8457023525130916e-05,
"loss": 3.527,
"step": 46900
},
{
"epoch": 2.007603263423177,
"grad_norm": 8.622049331665039,
"learning_rate": 1.8377920865699504e-05,
"loss": 3.576,
"step": 47000
},
{
"epoch": 2.007603263423177,
"eval_runtime": 404.752,
"eval_samples_per_second": 115.681,
"eval_steps_per_second": 14.461,
"step": 47000
},
{
"epoch": 2.011874759728333,
"grad_norm": 7.117955207824707,
"learning_rate": 1.8298818206268095e-05,
"loss": 3.6644,
"step": 47100
},
{
"epoch": 2.0161462560334886,
"grad_norm": 7.748778820037842,
"learning_rate": 1.8219715546836687e-05,
"loss": 3.5904,
"step": 47200
},
{
"epoch": 2.0204177523386444,
"grad_norm": 7.402785301208496,
"learning_rate": 1.8140612887405275e-05,
"loss": 3.6069,
"step": 47300
},
{
"epoch": 2.0246892486437997,
"grad_norm": 7.453569412231445,
"learning_rate": 1.8061510227973867e-05,
"loss": 3.6795,
"step": 47400
},
{
"epoch": 2.0289607449489555,
"grad_norm": 8.299507141113281,
"learning_rate": 1.7982407568542455e-05,
"loss": 3.6194,
"step": 47500
},
{
"epoch": 2.0332322412541113,
"grad_norm": 10.050152778625488,
"learning_rate": 1.7903304909111046e-05,
"loss": 3.5512,
"step": 47600
},
{
"epoch": 2.037503737559267,
"grad_norm": 8.691873550415039,
"learning_rate": 1.7824202249679638e-05,
"loss": 3.6216,
"step": 47700
},
{
"epoch": 2.041775233864423,
"grad_norm": 7.912090301513672,
"learning_rate": 1.7745099590248222e-05,
"loss": 3.5601,
"step": 47800
},
{
"epoch": 2.0460467301695786,
"grad_norm": 9.80728530883789,
"learning_rate": 1.7665996930816814e-05,
"loss": 3.6074,
"step": 47900
},
{
"epoch": 2.050318226474734,
"grad_norm": 11.86419677734375,
"learning_rate": 1.7586894271385405e-05,
"loss": 3.5964,
"step": 48000
},
{
"epoch": 2.050318226474734,
"eval_runtime": 403.5223,
"eval_samples_per_second": 116.033,
"eval_steps_per_second": 14.505,
"step": 48000
},
{
"epoch": 2.0545897227798897,
"grad_norm": 8.644769668579102,
"learning_rate": 1.7507791611953993e-05,
"loss": 3.53,
"step": 48100
},
{
"epoch": 2.0588612190850455,
"grad_norm": 8.596597671508789,
"learning_rate": 1.7428688952522585e-05,
"loss": 3.6423,
"step": 48200
},
{
"epoch": 2.0631327153902013,
"grad_norm": 8.68507194519043,
"learning_rate": 1.7349586293091173e-05,
"loss": 3.5187,
"step": 48300
},
{
"epoch": 2.067404211695357,
"grad_norm": 12.417092323303223,
"learning_rate": 1.7270483633659764e-05,
"loss": 3.6873,
"step": 48400
},
{
"epoch": 2.0716757080005124,
"grad_norm": 7.873465061187744,
"learning_rate": 1.7191380974228356e-05,
"loss": 3.556,
"step": 48500
},
{
"epoch": 2.075947204305668,
"grad_norm": 9.485852241516113,
"learning_rate": 1.7112278314796944e-05,
"loss": 3.5671,
"step": 48600
},
{
"epoch": 2.080218700610824,
"grad_norm": 9.282876968383789,
"learning_rate": 1.7033175655365536e-05,
"loss": 3.5649,
"step": 48700
},
{
"epoch": 2.0844901969159797,
"grad_norm": 9.663043022155762,
"learning_rate": 1.6954072995934127e-05,
"loss": 3.5675,
"step": 48800
},
{
"epoch": 2.0887616932211355,
"grad_norm": 9.47641372680664,
"learning_rate": 1.6874970336502712e-05,
"loss": 3.5404,
"step": 48900
},
{
"epoch": 2.0930331895262912,
"grad_norm": 9.768278121948242,
"learning_rate": 1.6795867677071303e-05,
"loss": 3.6144,
"step": 49000
},
{
"epoch": 2.0930331895262912,
"eval_runtime": 403.9425,
"eval_samples_per_second": 115.913,
"eval_steps_per_second": 14.49,
"step": 49000
},
{
"epoch": 2.0973046858314466,
"grad_norm": 9.314282417297363,
"learning_rate": 1.6716765017639895e-05,
"loss": 3.6568,
"step": 49100
},
{
"epoch": 2.1015761821366024,
"grad_norm": 8.707430839538574,
"learning_rate": 1.6637662358208483e-05,
"loss": 3.5775,
"step": 49200
},
{
"epoch": 2.105847678441758,
"grad_norm": 11.704259872436523,
"learning_rate": 1.6558559698777074e-05,
"loss": 3.5568,
"step": 49300
},
{
"epoch": 2.110119174746914,
"grad_norm": 8.504453659057617,
"learning_rate": 1.6479457039345662e-05,
"loss": 3.6528,
"step": 49400
},
{
"epoch": 2.1143906710520697,
"grad_norm": 8.935593605041504,
"learning_rate": 1.6400354379914254e-05,
"loss": 3.7016,
"step": 49500
},
{
"epoch": 2.1186621673572255,
"grad_norm": 8.349204063415527,
"learning_rate": 1.6321251720482845e-05,
"loss": 3.5431,
"step": 49600
},
{
"epoch": 2.122933663662381,
"grad_norm": 11.8608980178833,
"learning_rate": 1.6242149061051433e-05,
"loss": 3.5844,
"step": 49700
},
{
"epoch": 2.1272051599675366,
"grad_norm": 7.555705547332764,
"learning_rate": 1.6163046401620025e-05,
"loss": 3.5815,
"step": 49800
},
{
"epoch": 2.1314766562726923,
"grad_norm": 9.529816627502441,
"learning_rate": 1.6083943742188613e-05,
"loss": 3.5485,
"step": 49900
},
{
"epoch": 2.135748152577848,
"grad_norm": 9.32353401184082,
"learning_rate": 1.60048410827572e-05,
"loss": 3.589,
"step": 50000
},
{
"epoch": 2.135748152577848,
"eval_runtime": 404.1831,
"eval_samples_per_second": 115.844,
"eval_steps_per_second": 14.481,
"step": 50000
},
{
"epoch": 2.140019648883004,
"grad_norm": 8.285137176513672,
"learning_rate": 1.5925738423325793e-05,
"loss": 3.6874,
"step": 50100
},
{
"epoch": 2.144291145188159,
"grad_norm": 6.94751501083374,
"learning_rate": 1.5846635763894384e-05,
"loss": 3.6489,
"step": 50200
},
{
"epoch": 2.148562641493315,
"grad_norm": 11.093490600585938,
"learning_rate": 1.5767533104462972e-05,
"loss": 3.6675,
"step": 50300
},
{
"epoch": 2.1528341377984708,
"grad_norm": 8.154306411743164,
"learning_rate": 1.5688430445031564e-05,
"loss": 3.5934,
"step": 50400
},
{
"epoch": 2.1571056341036265,
"grad_norm": 8.806336402893066,
"learning_rate": 1.5609327785600152e-05,
"loss": 3.5804,
"step": 50500
},
{
"epoch": 2.1613771304087823,
"grad_norm": 10.496975898742676,
"learning_rate": 1.5530225126168743e-05,
"loss": 3.6913,
"step": 50600
},
{
"epoch": 2.165648626713938,
"grad_norm": 9.081565856933594,
"learning_rate": 1.5451122466737335e-05,
"loss": 3.7297,
"step": 50700
},
{
"epoch": 2.1699201230190934,
"grad_norm": 7.850902557373047,
"learning_rate": 1.5372019807305923e-05,
"loss": 3.7112,
"step": 50800
},
{
"epoch": 2.174191619324249,
"grad_norm": 8.145720481872559,
"learning_rate": 1.529291714787451e-05,
"loss": 3.6324,
"step": 50900
},
{
"epoch": 2.178463115629405,
"grad_norm": 8.924689292907715,
"learning_rate": 1.52138144884431e-05,
"loss": 3.6598,
"step": 51000
},
{
"epoch": 2.178463115629405,
"eval_runtime": 404.5353,
"eval_samples_per_second": 115.743,
"eval_steps_per_second": 14.468,
"step": 51000
},
{
"epoch": 2.1827346119345608,
"grad_norm": 13.303974151611328,
"learning_rate": 1.513471182901169e-05,
"loss": 3.5284,
"step": 51100
},
{
"epoch": 2.1870061082397165,
"grad_norm": 8.976105690002441,
"learning_rate": 1.5055609169580282e-05,
"loss": 3.6514,
"step": 51200
},
{
"epoch": 2.1912776045448723,
"grad_norm": 7.439825057983398,
"learning_rate": 1.4976506510148872e-05,
"loss": 3.5687,
"step": 51300
},
{
"epoch": 2.1955491008500276,
"grad_norm": 8.54857349395752,
"learning_rate": 1.4897403850717462e-05,
"loss": 3.7166,
"step": 51400
},
{
"epoch": 2.1998205971551834,
"grad_norm": 11.23593521118164,
"learning_rate": 1.4818301191286051e-05,
"loss": 3.5591,
"step": 51500
},
{
"epoch": 2.204092093460339,
"grad_norm": 13.313474655151367,
"learning_rate": 1.4739198531854643e-05,
"loss": 3.7213,
"step": 51600
},
{
"epoch": 2.208363589765495,
"grad_norm": 9.998103141784668,
"learning_rate": 1.4660095872423233e-05,
"loss": 3.5843,
"step": 51700
},
{
"epoch": 2.2126350860706507,
"grad_norm": 8.799863815307617,
"learning_rate": 1.4580993212991822e-05,
"loss": 3.7109,
"step": 51800
},
{
"epoch": 2.216906582375806,
"grad_norm": 7.352701187133789,
"learning_rate": 1.4501890553560412e-05,
"loss": 3.6722,
"step": 51900
},
{
"epoch": 2.221178078680962,
"grad_norm": 12.166138648986816,
"learning_rate": 1.4422787894129e-05,
"loss": 3.582,
"step": 52000
},
{
"epoch": 2.221178078680962,
"eval_runtime": 403.5081,
"eval_samples_per_second": 116.037,
"eval_steps_per_second": 14.505,
"step": 52000
},
{
"epoch": 2.2254495749861176,
"grad_norm": 10.341227531433105,
"learning_rate": 1.434368523469759e-05,
"loss": 3.7105,
"step": 52100
},
{
"epoch": 2.2297210712912734,
"grad_norm": 7.697736740112305,
"learning_rate": 1.426458257526618e-05,
"loss": 3.5896,
"step": 52200
},
{
"epoch": 2.233992567596429,
"grad_norm": 7.957235336303711,
"learning_rate": 1.4185479915834771e-05,
"loss": 3.5472,
"step": 52300
},
{
"epoch": 2.2382640639015845,
"grad_norm": 7.778316020965576,
"learning_rate": 1.4106377256403361e-05,
"loss": 3.5998,
"step": 52400
},
{
"epoch": 2.2425355602067403,
"grad_norm": 8.099467277526855,
"learning_rate": 1.4027274596971951e-05,
"loss": 3.7143,
"step": 52500
},
{
"epoch": 2.246807056511896,
"grad_norm": 8.077199935913086,
"learning_rate": 1.394817193754054e-05,
"loss": 3.6727,
"step": 52600
},
{
"epoch": 2.251078552817052,
"grad_norm": 10.278371810913086,
"learning_rate": 1.3869069278109132e-05,
"loss": 3.638,
"step": 52700
},
{
"epoch": 2.2553500491222076,
"grad_norm": 10.49933910369873,
"learning_rate": 1.3789966618677722e-05,
"loss": 3.5718,
"step": 52800
},
{
"epoch": 2.2596215454273634,
"grad_norm": 10.37414264678955,
"learning_rate": 1.3710863959246312e-05,
"loss": 3.551,
"step": 52900
},
{
"epoch": 2.2638930417325187,
"grad_norm": 6.969189643859863,
"learning_rate": 1.36317612998149e-05,
"loss": 3.593,
"step": 53000
},
{
"epoch": 2.2638930417325187,
"eval_runtime": 404.0848,
"eval_samples_per_second": 115.872,
"eval_steps_per_second": 14.485,
"step": 53000
},
{
"epoch": 2.2681645380376745,
"grad_norm": 7.354485511779785,
"learning_rate": 1.355265864038349e-05,
"loss": 3.5927,
"step": 53100
},
{
"epoch": 2.2724360343428303,
"grad_norm": 10.107403755187988,
"learning_rate": 1.347355598095208e-05,
"loss": 3.6626,
"step": 53200
},
{
"epoch": 2.276707530647986,
"grad_norm": 9.613969802856445,
"learning_rate": 1.339445332152067e-05,
"loss": 3.6073,
"step": 53300
},
{
"epoch": 2.280979026953142,
"grad_norm": 7.995043754577637,
"learning_rate": 1.3315350662089259e-05,
"loss": 3.5675,
"step": 53400
},
{
"epoch": 2.2852505232582976,
"grad_norm": 7.049370765686035,
"learning_rate": 1.323624800265785e-05,
"loss": 3.6559,
"step": 53500
},
{
"epoch": 2.289522019563453,
"grad_norm": 10.962531089782715,
"learning_rate": 1.315714534322644e-05,
"loss": 3.667,
"step": 53600
},
{
"epoch": 2.2937935158686087,
"grad_norm": 8.100302696228027,
"learning_rate": 1.307804268379503e-05,
"loss": 3.6044,
"step": 53700
},
{
"epoch": 2.2980650121737645,
"grad_norm": 8.079455375671387,
"learning_rate": 1.299894002436362e-05,
"loss": 3.7206,
"step": 53800
},
{
"epoch": 2.3023365084789202,
"grad_norm": 8.500101089477539,
"learning_rate": 1.2919837364932211e-05,
"loss": 3.6075,
"step": 53900
},
{
"epoch": 2.306608004784076,
"grad_norm": 12.927189826965332,
"learning_rate": 1.2840734705500801e-05,
"loss": 3.654,
"step": 54000
},
{
"epoch": 2.306608004784076,
"eval_runtime": 403.7677,
"eval_samples_per_second": 115.963,
"eval_steps_per_second": 14.496,
"step": 54000
},
{
"epoch": 2.3108795010892313,
"grad_norm": 7.132537841796875,
"learning_rate": 1.2761632046069388e-05,
"loss": 3.6083,
"step": 54100
},
{
"epoch": 2.315150997394387,
"grad_norm": 8.612030982971191,
"learning_rate": 1.2682529386637979e-05,
"loss": 3.5848,
"step": 54200
},
{
"epoch": 2.319422493699543,
"grad_norm": 8.210352897644043,
"learning_rate": 1.2603426727206569e-05,
"loss": 3.7611,
"step": 54300
},
{
"epoch": 2.3236939900046987,
"grad_norm": 9.12792682647705,
"learning_rate": 1.2524324067775159e-05,
"loss": 3.518,
"step": 54400
},
{
"epoch": 2.3279654863098544,
"grad_norm": 9.999760627746582,
"learning_rate": 1.2445221408343748e-05,
"loss": 3.5513,
"step": 54500
},
{
"epoch": 2.3322369826150102,
"grad_norm": 12.578611373901367,
"learning_rate": 1.236611874891234e-05,
"loss": 3.5614,
"step": 54600
},
{
"epoch": 2.3365084789201656,
"grad_norm": 12.089159965515137,
"learning_rate": 1.228701608948093e-05,
"loss": 3.5944,
"step": 54700
},
{
"epoch": 2.3407799752253213,
"grad_norm": 7.965277671813965,
"learning_rate": 1.2207913430049518e-05,
"loss": 3.4852,
"step": 54800
},
{
"epoch": 2.345051471530477,
"grad_norm": 10.866728782653809,
"learning_rate": 1.2128810770618108e-05,
"loss": 3.5431,
"step": 54900
},
{
"epoch": 2.349322967835633,
"grad_norm": 10.489164352416992,
"learning_rate": 1.2049708111186699e-05,
"loss": 3.4309,
"step": 55000
},
{
"epoch": 2.349322967835633,
"eval_runtime": 404.0413,
"eval_samples_per_second": 115.884,
"eval_steps_per_second": 14.486,
"step": 55000
},
{
"epoch": 2.3535944641407887,
"grad_norm": 8.494600296020508,
"learning_rate": 1.1970605451755289e-05,
"loss": 3.4994,
"step": 55100
},
{
"epoch": 2.3578659604459444,
"grad_norm": 9.755086898803711,
"learning_rate": 1.1891502792323879e-05,
"loss": 3.5679,
"step": 55200
},
{
"epoch": 2.3621374567510998,
"grad_norm": 9.621931076049805,
"learning_rate": 1.1812400132892468e-05,
"loss": 3.6264,
"step": 55300
},
{
"epoch": 2.3664089530562555,
"grad_norm": 12.946763038635254,
"learning_rate": 1.1733297473461058e-05,
"loss": 3.5741,
"step": 55400
},
{
"epoch": 2.3706804493614113,
"grad_norm": 8.984336853027344,
"learning_rate": 1.1654194814029648e-05,
"loss": 3.5317,
"step": 55500
},
{
"epoch": 2.374951945666567,
"grad_norm": 9.304176330566406,
"learning_rate": 1.1575092154598238e-05,
"loss": 3.6013,
"step": 55600
},
{
"epoch": 2.379223441971723,
"grad_norm": 8.324792861938477,
"learning_rate": 1.1495989495166828e-05,
"loss": 3.5887,
"step": 55700
},
{
"epoch": 2.383494938276878,
"grad_norm": 11.814824104309082,
"learning_rate": 1.1416886835735419e-05,
"loss": 3.4865,
"step": 55800
},
{
"epoch": 2.387766434582034,
"grad_norm": 9.219450950622559,
"learning_rate": 1.1337784176304007e-05,
"loss": 3.5598,
"step": 55900
},
{
"epoch": 2.3920379308871897,
"grad_norm": 10.202199935913086,
"learning_rate": 1.1258681516872597e-05,
"loss": 3.6973,
"step": 56000
},
{
"epoch": 2.3920379308871897,
"eval_runtime": 405.1455,
"eval_samples_per_second": 115.568,
"eval_steps_per_second": 14.447,
"step": 56000
},
{
"epoch": 2.3963094271923455,
"grad_norm": 10.352853775024414,
"learning_rate": 1.1179578857441188e-05,
"loss": 3.5715,
"step": 56100
},
{
"epoch": 2.4005809234975013,
"grad_norm": 9.794927597045898,
"learning_rate": 1.1100476198009778e-05,
"loss": 3.5751,
"step": 56200
},
{
"epoch": 2.4048524198026566,
"grad_norm": 9.24485969543457,
"learning_rate": 1.1021373538578368e-05,
"loss": 3.6939,
"step": 56300
},
{
"epoch": 2.4091239161078124,
"grad_norm": 6.9035162925720215,
"learning_rate": 1.0942270879146956e-05,
"loss": 3.6613,
"step": 56400
},
{
"epoch": 2.413395412412968,
"grad_norm": 9.021778106689453,
"learning_rate": 1.0863168219715548e-05,
"loss": 3.5978,
"step": 56500
},
{
"epoch": 2.417666908718124,
"grad_norm": 7.050608158111572,
"learning_rate": 1.0784065560284137e-05,
"loss": 3.7212,
"step": 56600
},
{
"epoch": 2.4219384050232797,
"grad_norm": 8.771140098571777,
"learning_rate": 1.0704962900852727e-05,
"loss": 3.6447,
"step": 56700
},
{
"epoch": 2.4262099013284355,
"grad_norm": 14.564820289611816,
"learning_rate": 1.0625860241421317e-05,
"loss": 3.6091,
"step": 56800
},
{
"epoch": 2.4304813976335913,
"grad_norm": 10.664299011230469,
"learning_rate": 1.0546757581989907e-05,
"loss": 3.6506,
"step": 56900
},
{
"epoch": 2.4347528939387466,
"grad_norm": 14.445178985595703,
"learning_rate": 1.0467654922558497e-05,
"loss": 3.6226,
"step": 57000
},
{
"epoch": 2.4347528939387466,
"eval_runtime": 404.9602,
"eval_samples_per_second": 115.621,
"eval_steps_per_second": 14.453,
"step": 57000
},
{
"epoch": 2.4390243902439024,
"grad_norm": 19.93160057067871,
"learning_rate": 1.0388552263127086e-05,
"loss": 3.6144,
"step": 57100
},
{
"epoch": 2.443295886549058,
"grad_norm": 7.793177604675293,
"learning_rate": 1.0309449603695676e-05,
"loss": 3.6577,
"step": 57200
},
{
"epoch": 2.447567382854214,
"grad_norm": 7.95759391784668,
"learning_rate": 1.0230346944264268e-05,
"loss": 3.6079,
"step": 57300
},
{
"epoch": 2.4518388791593697,
"grad_norm": 10.07507610321045,
"learning_rate": 1.0151244284832856e-05,
"loss": 3.5974,
"step": 57400
},
{
"epoch": 2.456110375464525,
"grad_norm": 9.73681926727295,
"learning_rate": 1.0072141625401446e-05,
"loss": 3.5602,
"step": 57500
},
{
"epoch": 2.460381871769681,
"grad_norm": 18.652366638183594,
"learning_rate": 9.993038965970037e-06,
"loss": 3.5649,
"step": 57600
},
{
"epoch": 2.4646533680748366,
"grad_norm": 10.758431434631348,
"learning_rate": 9.913936306538627e-06,
"loss": 3.6587,
"step": 57700
},
{
"epoch": 2.4689248643799924,
"grad_norm": 8.963933944702148,
"learning_rate": 9.834833647107217e-06,
"loss": 3.5872,
"step": 57800
},
{
"epoch": 2.473196360685148,
"grad_norm": 12.521937370300293,
"learning_rate": 9.755730987675805e-06,
"loss": 3.6379,
"step": 57900
},
{
"epoch": 2.4774678569903035,
"grad_norm": 8.87618350982666,
"learning_rate": 9.676628328244396e-06,
"loss": 3.6867,
"step": 58000
},
{
"epoch": 2.4774678569903035,
"eval_runtime": 403.9214,
"eval_samples_per_second": 115.919,
"eval_steps_per_second": 14.49,
"step": 58000
},
{
"epoch": 2.4817393532954592,
"grad_norm": 8.210921287536621,
"learning_rate": 9.597525668812986e-06,
"loss": 3.5951,
"step": 58100
},
{
"epoch": 2.486010849600615,
"grad_norm": 11.452202796936035,
"learning_rate": 9.518423009381576e-06,
"loss": 3.573,
"step": 58200
},
{
"epoch": 2.490282345905771,
"grad_norm": 6.497128486633301,
"learning_rate": 9.439320349950166e-06,
"loss": 3.5676,
"step": 58300
},
{
"epoch": 2.4945538422109266,
"grad_norm": 10.434738159179688,
"learning_rate": 9.360217690518757e-06,
"loss": 3.6214,
"step": 58400
},
{
"epoch": 2.4988253385160824,
"grad_norm": 10.927915573120117,
"learning_rate": 9.281115031087345e-06,
"loss": 3.6166,
"step": 58500
},
{
"epoch": 2.503096834821238,
"grad_norm": 9.382610321044922,
"learning_rate": 9.202012371655935e-06,
"loss": 3.6148,
"step": 58600
},
{
"epoch": 2.5073683311263935,
"grad_norm": 10.243247032165527,
"learning_rate": 9.122909712224525e-06,
"loss": 3.5586,
"step": 58700
},
{
"epoch": 2.5116398274315492,
"grad_norm": 9.074312210083008,
"learning_rate": 9.043807052793116e-06,
"loss": 3.5202,
"step": 58800
},
{
"epoch": 2.515911323736705,
"grad_norm": 8.498826026916504,
"learning_rate": 8.964704393361706e-06,
"loss": 3.6136,
"step": 58900
},
{
"epoch": 2.520182820041861,
"grad_norm": 9.749957084655762,
"learning_rate": 8.885601733930294e-06,
"loss": 3.6116,
"step": 59000
},
{
"epoch": 2.520182820041861,
"eval_runtime": 403.7151,
"eval_samples_per_second": 115.978,
"eval_steps_per_second": 14.498,
"step": 59000
},
{
"epoch": 2.5244543163470166,
"grad_norm": 12.452668190002441,
"learning_rate": 8.806499074498886e-06,
"loss": 3.6322,
"step": 59100
},
{
"epoch": 2.528725812652172,
"grad_norm": 10.466354370117188,
"learning_rate": 8.727396415067475e-06,
"loss": 3.623,
"step": 59200
},
{
"epoch": 2.5329973089573277,
"grad_norm": 11.655385971069336,
"learning_rate": 8.648293755636065e-06,
"loss": 3.6222,
"step": 59300
},
{
"epoch": 2.5372688052624834,
"grad_norm": 46.87141799926758,
"learning_rate": 8.569191096204655e-06,
"loss": 3.5937,
"step": 59400
},
{
"epoch": 2.541540301567639,
"grad_norm": 7.815254211425781,
"learning_rate": 8.490088436773245e-06,
"loss": 3.6052,
"step": 59500
},
{
"epoch": 2.545811797872795,
"grad_norm": 8.2904052734375,
"learning_rate": 8.410985777341835e-06,
"loss": 3.668,
"step": 59600
},
{
"epoch": 2.5500832941779503,
"grad_norm": 7.5048017501831055,
"learning_rate": 8.331883117910424e-06,
"loss": 3.5193,
"step": 59700
},
{
"epoch": 2.554354790483106,
"grad_norm": 8.502148628234863,
"learning_rate": 8.252780458479014e-06,
"loss": 3.5909,
"step": 59800
},
{
"epoch": 2.558626286788262,
"grad_norm": 7.68582820892334,
"learning_rate": 8.173677799047606e-06,
"loss": 3.5942,
"step": 59900
},
{
"epoch": 2.5628977830934176,
"grad_norm": 8.871585845947266,
"learning_rate": 8.094575139616194e-06,
"loss": 3.609,
"step": 60000
},
{
"epoch": 2.5628977830934176,
"eval_runtime": 403.125,
"eval_samples_per_second": 116.148,
"eval_steps_per_second": 14.519,
"step": 60000
},
{
"epoch": 2.5671692793985734,
"grad_norm": 11.707693099975586,
"learning_rate": 8.015472480184783e-06,
"loss": 3.559,
"step": 60100
},
{
"epoch": 2.5714407757037288,
"grad_norm": 11.136000633239746,
"learning_rate": 7.936369820753373e-06,
"loss": 3.6089,
"step": 60200
},
{
"epoch": 2.575712272008885,
"grad_norm": 8.095897674560547,
"learning_rate": 7.857267161321965e-06,
"loss": 3.5446,
"step": 60300
},
{
"epoch": 2.5799837683140403,
"grad_norm": 9.27779769897461,
"learning_rate": 7.778164501890555e-06,
"loss": 3.6471,
"step": 60400
},
{
"epoch": 2.584255264619196,
"grad_norm": 10.214181900024414,
"learning_rate": 7.699061842459143e-06,
"loss": 3.568,
"step": 60500
},
{
"epoch": 2.588526760924352,
"grad_norm": 7.064481258392334,
"learning_rate": 7.619959183027733e-06,
"loss": 3.6581,
"step": 60600
},
{
"epoch": 2.5927982572295076,
"grad_norm": 10.396333694458008,
"learning_rate": 7.540856523596324e-06,
"loss": 3.5857,
"step": 60700
},
{
"epoch": 2.5970697535346634,
"grad_norm": 9.091317176818848,
"learning_rate": 7.461753864164914e-06,
"loss": 3.5121,
"step": 60800
},
{
"epoch": 2.6013412498398187,
"grad_norm": 8.94211483001709,
"learning_rate": 7.3826512047335035e-06,
"loss": 3.5963,
"step": 60900
},
{
"epoch": 2.6056127461449745,
"grad_norm": 9.317548751831055,
"learning_rate": 7.303548545302094e-06,
"loss": 3.6764,
"step": 61000
},
{
"epoch": 2.6056127461449745,
"eval_runtime": 403.6369,
"eval_samples_per_second": 116.0,
"eval_steps_per_second": 14.501,
"step": 61000
},
{
"epoch": 2.6098842424501303,
"grad_norm": 9.0656156539917,
"learning_rate": 7.224445885870683e-06,
"loss": 3.5781,
"step": 61100
},
{
"epoch": 2.614155738755286,
"grad_norm": 12.859307289123535,
"learning_rate": 7.145343226439273e-06,
"loss": 3.553,
"step": 61200
},
{
"epoch": 2.618427235060442,
"grad_norm": 10.962692260742188,
"learning_rate": 7.0662405670078635e-06,
"loss": 3.6146,
"step": 61300
},
{
"epoch": 2.622698731365597,
"grad_norm": 11.84343147277832,
"learning_rate": 6.987137907576453e-06,
"loss": 3.6113,
"step": 61400
},
{
"epoch": 2.626970227670753,
"grad_norm": 8.770605087280273,
"learning_rate": 6.908035248145044e-06,
"loss": 3.6307,
"step": 61500
},
{
"epoch": 2.6312417239759087,
"grad_norm": 11.979937553405762,
"learning_rate": 6.828932588713632e-06,
"loss": 3.5754,
"step": 61600
},
{
"epoch": 2.6355132202810645,
"grad_norm": 8.271350860595703,
"learning_rate": 6.749829929282223e-06,
"loss": 3.6004,
"step": 61700
},
{
"epoch": 2.6397847165862203,
"grad_norm": 9.494888305664062,
"learning_rate": 6.6707272698508125e-06,
"loss": 3.5873,
"step": 61800
},
{
"epoch": 2.6440562128913756,
"grad_norm": 8.384838104248047,
"learning_rate": 6.591624610419403e-06,
"loss": 3.7076,
"step": 61900
},
{
"epoch": 2.648327709196532,
"grad_norm": 11.468506813049316,
"learning_rate": 6.512521950987993e-06,
"loss": 3.6043,
"step": 62000
},
{
"epoch": 2.648327709196532,
"eval_runtime": 403.9819,
"eval_samples_per_second": 115.901,
"eval_steps_per_second": 14.488,
"step": 62000
},
{
"epoch": 2.652599205501687,
"grad_norm": 9.485078811645508,
"learning_rate": 6.433419291556582e-06,
"loss": 3.6547,
"step": 62100
},
{
"epoch": 2.656870701806843,
"grad_norm": 6.771136283874512,
"learning_rate": 6.3543166321251725e-06,
"loss": 3.6359,
"step": 62200
},
{
"epoch": 2.6611421981119987,
"grad_norm": 7.749585151672363,
"learning_rate": 6.275213972693762e-06,
"loss": 3.6473,
"step": 62300
},
{
"epoch": 2.6654136944171545,
"grad_norm": 9.156508445739746,
"learning_rate": 6.196111313262352e-06,
"loss": 3.5664,
"step": 62400
},
{
"epoch": 2.6696851907223103,
"grad_norm": 7.322949409484863,
"learning_rate": 6.117008653830942e-06,
"loss": 3.6327,
"step": 62500
},
{
"epoch": 2.6739566870274656,
"grad_norm": 9.038566589355469,
"learning_rate": 6.0379059943995325e-06,
"loss": 3.588,
"step": 62600
},
{
"epoch": 2.6782281833326214,
"grad_norm": 9.974699020385742,
"learning_rate": 5.9588033349681214e-06,
"loss": 3.5183,
"step": 62700
},
{
"epoch": 2.682499679637777,
"grad_norm": 15.095208168029785,
"learning_rate": 5.879700675536712e-06,
"loss": 3.6799,
"step": 62800
},
{
"epoch": 2.686771175942933,
"grad_norm": 6.498071670532227,
"learning_rate": 5.800598016105302e-06,
"loss": 3.665,
"step": 62900
},
{
"epoch": 2.6910426722480887,
"grad_norm": 10.172649383544922,
"learning_rate": 5.721495356673892e-06,
"loss": 3.5558,
"step": 63000
},
{
"epoch": 2.6910426722480887,
"eval_runtime": 404.4896,
"eval_samples_per_second": 115.756,
"eval_steps_per_second": 14.47,
"step": 63000
},
{
"epoch": 2.695314168553244,
"grad_norm": 9.67616081237793,
"learning_rate": 5.6423926972424814e-06,
"loss": 3.6391,
"step": 63100
},
{
"epoch": 2.6995856648584,
"grad_norm": 8.725837707519531,
"learning_rate": 5.563290037811072e-06,
"loss": 3.5525,
"step": 63200
},
{
"epoch": 2.7038571611635556,
"grad_norm": 7.677910327911377,
"learning_rate": 5.484187378379661e-06,
"loss": 3.5785,
"step": 63300
},
{
"epoch": 2.7081286574687113,
"grad_norm": 9.097688674926758,
"learning_rate": 5.405084718948252e-06,
"loss": 3.6759,
"step": 63400
},
{
"epoch": 2.712400153773867,
"grad_norm": 9.70285415649414,
"learning_rate": 5.325982059516841e-06,
"loss": 3.641,
"step": 63500
},
{
"epoch": 2.7166716500790224,
"grad_norm": 8.540017127990723,
"learning_rate": 5.246879400085431e-06,
"loss": 3.6462,
"step": 63600
},
{
"epoch": 2.7209431463841782,
"grad_norm": 9.38048267364502,
"learning_rate": 5.167776740654021e-06,
"loss": 3.6205,
"step": 63700
},
{
"epoch": 2.725214642689334,
"grad_norm": 7.8036417961120605,
"learning_rate": 5.088674081222611e-06,
"loss": 3.5581,
"step": 63800
},
{
"epoch": 2.7294861389944898,
"grad_norm": 8.558833122253418,
"learning_rate": 5.009571421791201e-06,
"loss": 3.6245,
"step": 63900
},
{
"epoch": 2.7337576352996455,
"grad_norm": 10.551793098449707,
"learning_rate": 4.93046876235979e-06,
"loss": 3.6178,
"step": 64000
},
{
"epoch": 2.7337576352996455,
"eval_runtime": 403.4664,
"eval_samples_per_second": 116.049,
"eval_steps_per_second": 14.507,
"step": 64000
},
{
"epoch": 2.7380291316048013,
"grad_norm": 7.700866222381592,
"learning_rate": 4.851366102928381e-06,
"loss": 3.5907,
"step": 64100
},
{
"epoch": 2.742300627909957,
"grad_norm": 10.438343048095703,
"learning_rate": 4.772263443496971e-06,
"loss": 3.6566,
"step": 64200
},
{
"epoch": 2.7465721242151124,
"grad_norm": 10.483076095581055,
"learning_rate": 4.693160784065561e-06,
"loss": 3.5357,
"step": 64300
},
{
"epoch": 2.750843620520268,
"grad_norm": 10.251741409301758,
"learning_rate": 4.61405812463415e-06,
"loss": 3.5436,
"step": 64400
},
{
"epoch": 2.755115116825424,
"grad_norm": 7.065600872039795,
"learning_rate": 4.534955465202741e-06,
"loss": 3.588,
"step": 64500
},
{
"epoch": 2.7593866131305798,
"grad_norm": 6.57476282119751,
"learning_rate": 4.45585280577133e-06,
"loss": 3.5464,
"step": 64600
},
{
"epoch": 2.7636581094357355,
"grad_norm": 10.847752571105957,
"learning_rate": 4.376750146339921e-06,
"loss": 3.6241,
"step": 64700
},
{
"epoch": 2.767929605740891,
"grad_norm": 9.701374053955078,
"learning_rate": 4.29764748690851e-06,
"loss": 3.6638,
"step": 64800
},
{
"epoch": 2.7722011020460466,
"grad_norm": 8.982709884643555,
"learning_rate": 4.2185448274771e-06,
"loss": 3.5764,
"step": 64900
},
{
"epoch": 2.7764725983512024,
"grad_norm": 11.895380973815918,
"learning_rate": 4.13944216804569e-06,
"loss": 3.6685,
"step": 65000
},
{
"epoch": 2.7764725983512024,
"eval_runtime": 404.1638,
"eval_samples_per_second": 115.849,
"eval_steps_per_second": 14.482,
"step": 65000
},
{
"epoch": 2.780744094656358,
"grad_norm": 7.826374053955078,
"learning_rate": 4.06033950861428e-06,
"loss": 3.6005,
"step": 65100
},
{
"epoch": 2.785015590961514,
"grad_norm": 9.21237564086914,
"learning_rate": 3.98123684918287e-06,
"loss": 3.5123,
"step": 65200
},
{
"epoch": 2.7892870872666693,
"grad_norm": 9.817154884338379,
"learning_rate": 3.902134189751459e-06,
"loss": 3.6442,
"step": 65300
},
{
"epoch": 2.793558583571825,
"grad_norm": 9.74837875366211,
"learning_rate": 3.823031530320049e-06,
"loss": 3.5619,
"step": 65400
},
{
"epoch": 2.797830079876981,
"grad_norm": 8.357489585876465,
"learning_rate": 3.74392887088864e-06,
"loss": 3.613,
"step": 65500
},
{
"epoch": 2.8021015761821366,
"grad_norm": 10.162979125976562,
"learning_rate": 3.664826211457229e-06,
"loss": 3.636,
"step": 65600
},
{
"epoch": 2.8063730724872924,
"grad_norm": 9.95310115814209,
"learning_rate": 3.5857235520258194e-06,
"loss": 3.7321,
"step": 65700
},
{
"epoch": 2.8106445687924477,
"grad_norm": 8.15718936920166,
"learning_rate": 3.5066208925944088e-06,
"loss": 3.6125,
"step": 65800
},
{
"epoch": 2.814916065097604,
"grad_norm": 11.377549171447754,
"learning_rate": 3.427518233162999e-06,
"loss": 3.6859,
"step": 65900
},
{
"epoch": 2.8191875614027593,
"grad_norm": 7.227240562438965,
"learning_rate": 3.348415573731589e-06,
"loss": 3.6915,
"step": 66000
},
{
"epoch": 2.8191875614027593,
"eval_runtime": 403.9368,
"eval_samples_per_second": 115.914,
"eval_steps_per_second": 14.49,
"step": 66000
},
{
"epoch": 2.823459057707915,
"grad_norm": 9.386198043823242,
"learning_rate": 3.2693129143001786e-06,
"loss": 3.5728,
"step": 66100
},
{
"epoch": 2.827730554013071,
"grad_norm": 9.223987579345703,
"learning_rate": 3.1902102548687688e-06,
"loss": 3.6028,
"step": 66200
},
{
"epoch": 2.8320020503182266,
"grad_norm": 7.933398246765137,
"learning_rate": 3.1111075954373586e-06,
"loss": 3.6051,
"step": 66300
},
{
"epoch": 2.8362735466233824,
"grad_norm": 8.923637390136719,
"learning_rate": 3.0320049360059483e-06,
"loss": 3.6427,
"step": 66400
},
{
"epoch": 2.8405450429285377,
"grad_norm": 9.507107734680176,
"learning_rate": 2.9529022765745386e-06,
"loss": 3.6082,
"step": 66500
},
{
"epoch": 2.8448165392336935,
"grad_norm": 8.927079200744629,
"learning_rate": 2.8737996171431284e-06,
"loss": 3.6545,
"step": 66600
},
{
"epoch": 2.8490880355388493,
"grad_norm": 11.978940963745117,
"learning_rate": 2.7946969577117186e-06,
"loss": 3.6035,
"step": 66700
},
{
"epoch": 2.853359531844005,
"grad_norm": 8.081381797790527,
"learning_rate": 2.7155942982803084e-06,
"loss": 3.5326,
"step": 66800
},
{
"epoch": 2.857631028149161,
"grad_norm": 7.906234264373779,
"learning_rate": 2.6364916388488986e-06,
"loss": 3.5,
"step": 66900
},
{
"epoch": 2.861902524454316,
"grad_norm": 7.646207809448242,
"learning_rate": 2.5573889794174884e-06,
"loss": 3.6673,
"step": 67000
},
{
"epoch": 2.861902524454316,
"eval_runtime": 403.9147,
"eval_samples_per_second": 115.921,
"eval_steps_per_second": 14.491,
"step": 67000
},
{
"epoch": 2.866174020759472,
"grad_norm": 7.473504543304443,
"learning_rate": 2.478286319986078e-06,
"loss": 3.5667,
"step": 67100
},
{
"epoch": 2.8704455170646277,
"grad_norm": 7.414896011352539,
"learning_rate": 2.399183660554668e-06,
"loss": 3.703,
"step": 67200
},
{
"epoch": 2.8747170133697835,
"grad_norm": 10.893074035644531,
"learning_rate": 2.3200810011232577e-06,
"loss": 3.5693,
"step": 67300
},
{
"epoch": 2.8789885096749392,
"grad_norm": 9.107102394104004,
"learning_rate": 2.240978341691848e-06,
"loss": 3.5448,
"step": 67400
},
{
"epoch": 2.8832600059800946,
"grad_norm": 12.374472618103027,
"learning_rate": 2.1618756822604377e-06,
"loss": 3.5897,
"step": 67500
},
{
"epoch": 2.887531502285251,
"grad_norm": 10.249347686767578,
"learning_rate": 2.0827730228290275e-06,
"loss": 3.6645,
"step": 67600
},
{
"epoch": 2.891802998590406,
"grad_norm": 10.349568367004395,
"learning_rate": 2.0036703633976173e-06,
"loss": 3.5886,
"step": 67700
},
{
"epoch": 2.896074494895562,
"grad_norm": 9.08791732788086,
"learning_rate": 1.9245677039662075e-06,
"loss": 3.6344,
"step": 67800
},
{
"epoch": 2.9003459912007177,
"grad_norm": 23.89297866821289,
"learning_rate": 1.8454650445347973e-06,
"loss": 3.5352,
"step": 67900
},
{
"epoch": 2.9046174875058735,
"grad_norm": 7.383826732635498,
"learning_rate": 1.7663623851033873e-06,
"loss": 3.5756,
"step": 68000
},
{
"epoch": 2.9046174875058735,
"eval_runtime": 403.9584,
"eval_samples_per_second": 115.908,
"eval_steps_per_second": 14.489,
"step": 68000
},
{
"epoch": 2.9088889838110292,
"grad_norm": 10.697755813598633,
"learning_rate": 1.6872597256719771e-06,
"loss": 3.7499,
"step": 68100
},
{
"epoch": 2.9131604801161846,
"grad_norm": 8.796770095825195,
"learning_rate": 1.608157066240567e-06,
"loss": 3.6521,
"step": 68200
},
{
"epoch": 2.9174319764213403,
"grad_norm": 9.28873348236084,
"learning_rate": 1.529054406809157e-06,
"loss": 3.6073,
"step": 68300
},
{
"epoch": 2.921703472726496,
"grad_norm": 9.964879035949707,
"learning_rate": 1.449951747377747e-06,
"loss": 3.5816,
"step": 68400
},
{
"epoch": 2.925974969031652,
"grad_norm": 8.402993202209473,
"learning_rate": 1.3708490879463367e-06,
"loss": 3.5752,
"step": 68500
},
{
"epoch": 2.9302464653368077,
"grad_norm": 11.395208358764648,
"learning_rate": 1.2917464285149267e-06,
"loss": 3.5633,
"step": 68600
},
{
"epoch": 2.934517961641963,
"grad_norm": 8.817408561706543,
"learning_rate": 1.2126437690835167e-06,
"loss": 3.6463,
"step": 68700
},
{
"epoch": 2.9387894579471188,
"grad_norm": 12.337442398071289,
"learning_rate": 1.1335411096521067e-06,
"loss": 3.6794,
"step": 68800
},
{
"epoch": 2.9430609542522745,
"grad_norm": 8.89869213104248,
"learning_rate": 1.0544384502206965e-06,
"loss": 3.4842,
"step": 68900
},
{
"epoch": 2.9473324505574303,
"grad_norm": 8.63214111328125,
"learning_rate": 9.753357907892865e-07,
"loss": 3.5863,
"step": 69000
},
{
"epoch": 2.9473324505574303,
"eval_runtime": 404.2191,
"eval_samples_per_second": 115.833,
"eval_steps_per_second": 14.48,
"step": 69000
},
{
"epoch": 2.951603946862586,
"grad_norm": 8.96723747253418,
"learning_rate": 8.962331313578763e-07,
"loss": 3.6939,
"step": 69100
},
{
"epoch": 2.9558754431677414,
"grad_norm": 9.602298736572266,
"learning_rate": 8.171304719264663e-07,
"loss": 3.7252,
"step": 69200
},
{
"epoch": 2.960146939472897,
"grad_norm": 12.317747116088867,
"learning_rate": 7.380278124950561e-07,
"loss": 3.6703,
"step": 69300
},
{
"epoch": 2.964418435778053,
"grad_norm": 7.297631740570068,
"learning_rate": 6.589251530636461e-07,
"loss": 3.6542,
"step": 69400
},
{
"epoch": 2.9686899320832087,
"grad_norm": 8.46646499633789,
"learning_rate": 5.79822493632236e-07,
"loss": 3.6304,
"step": 69500
},
{
"epoch": 2.9729614283883645,
"grad_norm": 9.025291442871094,
"learning_rate": 5.007198342008259e-07,
"loss": 3.5323,
"step": 69600
},
{
"epoch": 2.9772329246935203,
"grad_norm": 8.952362060546875,
"learning_rate": 4.216171747694158e-07,
"loss": 3.5854,
"step": 69700
},
{
"epoch": 2.981504420998676,
"grad_norm": 6.965066432952881,
"learning_rate": 3.4251451533800567e-07,
"loss": 3.4377,
"step": 69800
},
{
"epoch": 2.9857759173038314,
"grad_norm": 10.185367584228516,
"learning_rate": 2.6341185590659557e-07,
"loss": 3.5891,
"step": 69900
},
{
"epoch": 2.990047413608987,
"grad_norm": 11.02718734741211,
"learning_rate": 1.8430919647518552e-07,
"loss": 3.5265,
"step": 70000
},
{
"epoch": 2.990047413608987,
"eval_runtime": 403.8967,
"eval_samples_per_second": 115.926,
"eval_steps_per_second": 14.491,
"step": 70000
}
],
"logging_steps": 100,
"max_steps": 70233,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.3668072761491424e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}