{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 500,
"global_step": 50420,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.039666798889329634,
"grad_norm": 46.122135162353516,
"learning_rate": 4.990182467274891e-05,
"loss": 11.8425,
"step": 100
},
{
"epoch": 0.07933359777865927,
"grad_norm": 23.861114501953125,
"learning_rate": 4.9802657675525586e-05,
"loss": 9.0419,
"step": 200
},
{
"epoch": 0.1190003966679889,
"grad_norm": 11.361534118652344,
"learning_rate": 4.970349067830226e-05,
"loss": 7.4942,
"step": 300
},
{
"epoch": 0.15866719555731854,
"grad_norm": 9.868680000305176,
"learning_rate": 4.960432368107894e-05,
"loss": 6.5606,
"step": 400
},
{
"epoch": 0.19833399444664815,
"grad_norm": 19.773534774780273,
"learning_rate": 4.9505156683855616e-05,
"loss": 6.0234,
"step": 500
},
{
"epoch": 0.2380007933359778,
"grad_norm": 6.297336101531982,
"learning_rate": 4.9405989686632294e-05,
"loss": 5.7502,
"step": 600
},
{
"epoch": 0.2776675922253074,
"grad_norm": 6.993984222412109,
"learning_rate": 4.9306822689408966e-05,
"loss": 5.4746,
"step": 700
},
{
"epoch": 0.31733439111463707,
"grad_norm": 5.947575569152832,
"learning_rate": 4.9207655692185645e-05,
"loss": 5.2263,
"step": 800
},
{
"epoch": 0.3570011900039667,
"grad_norm": 6.459949493408203,
"learning_rate": 4.9108488694962317e-05,
"loss": 5.071,
"step": 900
},
{
"epoch": 0.3966679888932963,
"grad_norm": 6.12597131729126,
"learning_rate": 4.9009321697738995e-05,
"loss": 4.9277,
"step": 1000
},
{
"epoch": 0.43633478778262597,
"grad_norm": 7.8361406326293945,
"learning_rate": 4.891015470051567e-05,
"loss": 4.7178,
"step": 1100
},
{
"epoch": 0.4760015866719556,
"grad_norm": 8.883167266845703,
"learning_rate": 4.881098770329235e-05,
"loss": 4.5832,
"step": 1200
},
{
"epoch": 0.5156683855612852,
"grad_norm": 5.529339790344238,
"learning_rate": 4.8711820706069024e-05,
"loss": 4.5019,
"step": 1300
},
{
"epoch": 0.5553351844506148,
"grad_norm": 5.6431121826171875,
"learning_rate": 4.86126537088457e-05,
"loss": 4.4477,
"step": 1400
},
{
"epoch": 0.5950019833399445,
"grad_norm": 5.4094367027282715,
"learning_rate": 4.8513486711622375e-05,
"loss": 4.317,
"step": 1500
},
{
"epoch": 0.6346687822292741,
"grad_norm": 4.631023406982422,
"learning_rate": 4.841431971439905e-05,
"loss": 4.2006,
"step": 1600
},
{
"epoch": 0.6743355811186037,
"grad_norm": 5.530189037322998,
"learning_rate": 4.8315152717175725e-05,
"loss": 4.0726,
"step": 1700
},
{
"epoch": 0.7140023800079334,
"grad_norm": 6.281075477600098,
"learning_rate": 4.82159857199524e-05,
"loss": 4.005,
"step": 1800
},
{
"epoch": 0.753669178897263,
"grad_norm": 8.573356628417969,
"learning_rate": 4.8116818722729076e-05,
"loss": 3.8903,
"step": 1900
},
{
"epoch": 0.7933359777865926,
"grad_norm": 7.195920467376709,
"learning_rate": 4.8017651725505754e-05,
"loss": 3.8038,
"step": 2000
},
{
"epoch": 0.8330027766759223,
"grad_norm": 6.207021236419678,
"learning_rate": 4.791848472828243e-05,
"loss": 3.775,
"step": 2100
},
{
"epoch": 0.8726695755652519,
"grad_norm": 6.628279685974121,
"learning_rate": 4.7819317731059105e-05,
"loss": 3.711,
"step": 2200
},
{
"epoch": 0.9123363744545815,
"grad_norm": 5.220765590667725,
"learning_rate": 4.7720150733835784e-05,
"loss": 3.6694,
"step": 2300
},
{
"epoch": 0.9520031733439112,
"grad_norm": 4.995284557342529,
"learning_rate": 4.7620983736612455e-05,
"loss": 3.5359,
"step": 2400
},
{
"epoch": 0.9916699722332408,
"grad_norm": 5.370284557342529,
"learning_rate": 4.7521816739389134e-05,
"loss": 3.5537,
"step": 2500
},
{
"epoch": 1.0,
"eval_loss": 2.6742756366729736,
"eval_runtime": 33.2175,
"eval_samples_per_second": 45.699,
"eval_steps_per_second": 5.72,
"step": 2521
},
{
"epoch": 1.0313367711225705,
"grad_norm": 6.217808246612549,
"learning_rate": 4.7422649742165806e-05,
"loss": 3.4208,
"step": 2600
},
{
"epoch": 1.0710035700119,
"grad_norm": 5.972238540649414,
"learning_rate": 4.7323482744942484e-05,
"loss": 3.3766,
"step": 2700
},
{
"epoch": 1.1106703689012296,
"grad_norm": 6.439736366271973,
"learning_rate": 4.722431574771916e-05,
"loss": 3.2706,
"step": 2800
},
{
"epoch": 1.1503371677905594,
"grad_norm": 4.722689151763916,
"learning_rate": 4.712514875049584e-05,
"loss": 3.1785,
"step": 2900
},
{
"epoch": 1.190003966679889,
"grad_norm": 4.344363689422607,
"learning_rate": 4.7025981753272514e-05,
"loss": 3.2959,
"step": 3000
},
{
"epoch": 1.2296707655692185,
"grad_norm": 5.724086284637451,
"learning_rate": 4.6926814756049185e-05,
"loss": 3.1135,
"step": 3100
},
{
"epoch": 1.269337564458548,
"grad_norm": 5.9762163162231445,
"learning_rate": 4.6827647758825864e-05,
"loss": 3.2194,
"step": 3200
},
{
"epoch": 1.3090043633478778,
"grad_norm": 5.490255355834961,
"learning_rate": 4.6728480761602536e-05,
"loss": 3.1226,
"step": 3300
},
{
"epoch": 1.3486711622372074,
"grad_norm": 5.34173583984375,
"learning_rate": 4.6629313764379215e-05,
"loss": 3.0641,
"step": 3400
},
{
"epoch": 1.388337961126537,
"grad_norm": 6.500023365020752,
"learning_rate": 4.653014676715589e-05,
"loss": 3.0342,
"step": 3500
},
{
"epoch": 1.4280047600158667,
"grad_norm": 4.705812931060791,
"learning_rate": 4.643097976993257e-05,
"loss": 2.9638,
"step": 3600
},
{
"epoch": 1.4676715589051963,
"grad_norm": 5.796449661254883,
"learning_rate": 4.6331812772709244e-05,
"loss": 2.9635,
"step": 3700
},
{
"epoch": 1.5073383577945259,
"grad_norm": 5.73616886138916,
"learning_rate": 4.623264577548592e-05,
"loss": 2.9933,
"step": 3800
},
{
"epoch": 1.5470051566838556,
"grad_norm": 5.073670864105225,
"learning_rate": 4.6133478778262594e-05,
"loss": 2.9872,
"step": 3900
},
{
"epoch": 1.5866719555731852,
"grad_norm": 5.04343318939209,
"learning_rate": 4.603431178103927e-05,
"loss": 2.9624,
"step": 4000
},
{
"epoch": 1.6263387544625147,
"grad_norm": 4.266116619110107,
"learning_rate": 4.5935144783815945e-05,
"loss": 2.9711,
"step": 4100
},
{
"epoch": 1.6660055533518445,
"grad_norm": 4.732306957244873,
"learning_rate": 4.583597778659262e-05,
"loss": 2.8238,
"step": 4200
},
{
"epoch": 1.705672352241174,
"grad_norm": 5.156635284423828,
"learning_rate": 4.57368107893693e-05,
"loss": 2.839,
"step": 4300
},
{
"epoch": 1.7453391511305036,
"grad_norm": 6.178804874420166,
"learning_rate": 4.563764379214598e-05,
"loss": 2.7441,
"step": 4400
},
{
"epoch": 1.7850059500198334,
"grad_norm": 6.307518482208252,
"learning_rate": 4.553847679492265e-05,
"loss": 2.7271,
"step": 4500
},
{
"epoch": 1.824672748909163,
"grad_norm": 4.5322136878967285,
"learning_rate": 4.5439309797699324e-05,
"loss": 2.6839,
"step": 4600
},
{
"epoch": 1.8643395477984925,
"grad_norm": 4.728321552276611,
"learning_rate": 4.5340142800476e-05,
"loss": 2.815,
"step": 4700
},
{
"epoch": 1.9040063466878223,
"grad_norm": 5.051918029785156,
"learning_rate": 4.5240975803252675e-05,
"loss": 2.6735,
"step": 4800
},
{
"epoch": 1.9436731455771519,
"grad_norm": 4.968688011169434,
"learning_rate": 4.5141808806029353e-05,
"loss": 2.6604,
"step": 4900
},
{
"epoch": 1.9833399444664814,
"grad_norm": 4.792623996734619,
"learning_rate": 4.504264180880603e-05,
"loss": 2.6375,
"step": 5000
},
{
"epoch": 2.0,
"eval_loss": 2.0515847206115723,
"eval_runtime": 33.0931,
"eval_samples_per_second": 45.871,
"eval_steps_per_second": 5.741,
"step": 5042
},
{
"epoch": 2.023006743355811,
"grad_norm": 7.228871822357178,
"learning_rate": 4.494347481158271e-05,
"loss": 2.5813,
"step": 5100
},
{
"epoch": 2.062673542245141,
"grad_norm": 4.44078254699707,
"learning_rate": 4.484430781435938e-05,
"loss": 2.5798,
"step": 5200
},
{
"epoch": 2.1023403411344703,
"grad_norm": 5.475325107574463,
"learning_rate": 4.474514081713606e-05,
"loss": 2.5297,
"step": 5300
},
{
"epoch": 2.1420071400238,
"grad_norm": 4.271339416503906,
"learning_rate": 4.464597381991273e-05,
"loss": 2.5595,
"step": 5400
},
{
"epoch": 2.18167393891313,
"grad_norm": 3.9716315269470215,
"learning_rate": 4.454680682268941e-05,
"loss": 2.5629,
"step": 5500
},
{
"epoch": 2.221340737802459,
"grad_norm": 5.6469807624816895,
"learning_rate": 4.4447639825466084e-05,
"loss": 2.4691,
"step": 5600
},
{
"epoch": 2.261007536691789,
"grad_norm": 4.760526657104492,
"learning_rate": 4.434847282824276e-05,
"loss": 2.606,
"step": 5700
},
{
"epoch": 2.300674335581119,
"grad_norm": 5.259726047515869,
"learning_rate": 4.424930583101944e-05,
"loss": 2.4984,
"step": 5800
},
{
"epoch": 2.340341134470448,
"grad_norm": 4.372512340545654,
"learning_rate": 4.415013883379612e-05,
"loss": 2.4104,
"step": 5900
},
{
"epoch": 2.380007933359778,
"grad_norm": 5.21671724319458,
"learning_rate": 4.405097183657279e-05,
"loss": 2.4612,
"step": 6000
},
{
"epoch": 2.4196747322491077,
"grad_norm": 4.706778049468994,
"learning_rate": 4.395180483934946e-05,
"loss": 2.3845,
"step": 6100
},
{
"epoch": 2.459341531138437,
"grad_norm": 4.4265217781066895,
"learning_rate": 4.385263784212614e-05,
"loss": 2.4508,
"step": 6200
},
{
"epoch": 2.499008330027767,
"grad_norm": 112.53572082519531,
"learning_rate": 4.3753470844902814e-05,
"loss": 2.3788,
"step": 6300
},
{
"epoch": 2.538675128917096,
"grad_norm": 5.193419933319092,
"learning_rate": 4.365430384767949e-05,
"loss": 2.3999,
"step": 6400
},
{
"epoch": 2.578341927806426,
"grad_norm": 4.786646842956543,
"learning_rate": 4.355513685045617e-05,
"loss": 2.3964,
"step": 6500
},
{
"epoch": 2.6180087266957557,
"grad_norm": 4.764982223510742,
"learning_rate": 4.345596985323285e-05,
"loss": 2.2939,
"step": 6600
},
{
"epoch": 2.657675525585085,
"grad_norm": 8.752727508544922,
"learning_rate": 4.335680285600952e-05,
"loss": 2.2859,
"step": 6700
},
{
"epoch": 2.697342324474415,
"grad_norm": 5.419288158416748,
"learning_rate": 4.32576358587862e-05,
"loss": 2.3073,
"step": 6800
},
{
"epoch": 2.7370091233637446,
"grad_norm": 3.573631763458252,
"learning_rate": 4.315846886156287e-05,
"loss": 2.1833,
"step": 6900
},
{
"epoch": 2.776675922253074,
"grad_norm": 5.297525882720947,
"learning_rate": 4.305930186433955e-05,
"loss": 2.4245,
"step": 7000
},
{
"epoch": 2.8163427211424037,
"grad_norm": 4.3615827560424805,
"learning_rate": 4.296013486711622e-05,
"loss": 2.2811,
"step": 7100
},
{
"epoch": 2.8560095200317335,
"grad_norm": 6.935328960418701,
"learning_rate": 4.28609678698929e-05,
"loss": 2.2544,
"step": 7200
},
{
"epoch": 2.895676318921063,
"grad_norm": 3.9425063133239746,
"learning_rate": 4.276180087266958e-05,
"loss": 2.2934,
"step": 7300
},
{
"epoch": 2.9353431178103926,
"grad_norm": 6.062328815460205,
"learning_rate": 4.266263387544626e-05,
"loss": 2.3048,
"step": 7400
},
{
"epoch": 2.9750099166997224,
"grad_norm": 4.808726787567139,
"learning_rate": 4.256346687822293e-05,
"loss": 2.2118,
"step": 7500
},
{
"epoch": 3.0,
"eval_loss": 1.770484209060669,
"eval_runtime": 33.1033,
"eval_samples_per_second": 45.856,
"eval_steps_per_second": 5.74,
"step": 7563
},
{
"epoch": 3.014676715589052,
"grad_norm": 4.881776809692383,
"learning_rate": 4.24642998809996e-05,
"loss": 2.2472,
"step": 7600
},
{
"epoch": 3.0543435144783815,
"grad_norm": 6.6921706199646,
"learning_rate": 4.236513288377628e-05,
"loss": 2.1728,
"step": 7700
},
{
"epoch": 3.0940103133677113,
"grad_norm": 3.29506254196167,
"learning_rate": 4.226596588655295e-05,
"loss": 2.0689,
"step": 7800
},
{
"epoch": 3.133677112257041,
"grad_norm": 4.864801406860352,
"learning_rate": 4.216679888932963e-05,
"loss": 2.2328,
"step": 7900
},
{
"epoch": 3.1733439111463704,
"grad_norm": 3.8594539165496826,
"learning_rate": 4.206763189210631e-05,
"loss": 2.0911,
"step": 8000
},
{
"epoch": 3.2130107100357,
"grad_norm": 5.1737380027771,
"learning_rate": 4.196846489488299e-05,
"loss": 2.0999,
"step": 8100
},
{
"epoch": 3.25267750892503,
"grad_norm": 4.454146385192871,
"learning_rate": 4.186929789765966e-05,
"loss": 2.0902,
"step": 8200
},
{
"epoch": 3.2923443078143593,
"grad_norm": 5.417801380157471,
"learning_rate": 4.177013090043634e-05,
"loss": 2.0971,
"step": 8300
},
{
"epoch": 3.332011106703689,
"grad_norm": 2.7768959999084473,
"learning_rate": 4.167096390321301e-05,
"loss": 2.1635,
"step": 8400
},
{
"epoch": 3.371677905593019,
"grad_norm": 4.387384414672852,
"learning_rate": 4.157179690598969e-05,
"loss": 2.0166,
"step": 8500
},
{
"epoch": 3.411344704482348,
"grad_norm": 4.593613624572754,
"learning_rate": 4.147262990876636e-05,
"loss": 2.0944,
"step": 8600
},
{
"epoch": 3.451011503371678,
"grad_norm": 5.243652820587158,
"learning_rate": 4.137346291154304e-05,
"loss": 2.0518,
"step": 8700
},
{
"epoch": 3.4906783022610077,
"grad_norm": 5.076266765594482,
"learning_rate": 4.127429591431972e-05,
"loss": 2.0412,
"step": 8800
},
{
"epoch": 3.530345101150337,
"grad_norm": 5.36345911026001,
"learning_rate": 4.11751289170964e-05,
"loss": 2.0586,
"step": 8900
},
{
"epoch": 3.570011900039667,
"grad_norm": 6.591952800750732,
"learning_rate": 4.107596191987307e-05,
"loss": 2.059,
"step": 9000
},
{
"epoch": 3.609678698928996,
"grad_norm": 5.091315746307373,
"learning_rate": 4.097679492264974e-05,
"loss": 2.0451,
"step": 9100
},
{
"epoch": 3.649345497818326,
"grad_norm": 4.647657871246338,
"learning_rate": 4.087762792542642e-05,
"loss": 2.0488,
"step": 9200
},
{
"epoch": 3.6890122967076557,
"grad_norm": 5.167809963226318,
"learning_rate": 4.077846092820309e-05,
"loss": 2.0688,
"step": 9300
},
{
"epoch": 3.728679095596985,
"grad_norm": 67.48959350585938,
"learning_rate": 4.067929393097977e-05,
"loss": 2.0513,
"step": 9400
},
{
"epoch": 3.768345894486315,
"grad_norm": 3.942390203475952,
"learning_rate": 4.058012693375645e-05,
"loss": 1.9697,
"step": 9500
},
{
"epoch": 3.8080126933756446,
"grad_norm": 5.491151332855225,
"learning_rate": 4.048095993653313e-05,
"loss": 2.0849,
"step": 9600
},
{
"epoch": 3.847679492264974,
"grad_norm": 4.637006759643555,
"learning_rate": 4.03817929393098e-05,
"loss": 1.9753,
"step": 9700
},
{
"epoch": 3.8873462911543037,
"grad_norm": 4.818416595458984,
"learning_rate": 4.028262594208648e-05,
"loss": 2.0526,
"step": 9800
},
{
"epoch": 3.9270130900436335,
"grad_norm": 4.810122013092041,
"learning_rate": 4.018345894486315e-05,
"loss": 2.0148,
"step": 9900
},
{
"epoch": 3.966679888932963,
"grad_norm": 4.372331142425537,
"learning_rate": 4.008429194763983e-05,
"loss": 2.0324,
"step": 10000
},
{
"epoch": 4.0,
"eval_loss": 1.5883285999298096,
"eval_runtime": 33.141,
"eval_samples_per_second": 45.804,
"eval_steps_per_second": 5.733,
"step": 10084
},
{
"epoch": 4.006346687822293,
"grad_norm": 4.643691539764404,
"learning_rate": 3.99851249504165e-05,
"loss": 2.0171,
"step": 10100
},
{
"epoch": 4.046013486711622,
"grad_norm": 5.210694789886475,
"learning_rate": 3.988595795319318e-05,
"loss": 1.9401,
"step": 10200
},
{
"epoch": 4.085680285600952,
"grad_norm": 5.724204063415527,
"learning_rate": 3.978679095596986e-05,
"loss": 1.9012,
"step": 10300
},
{
"epoch": 4.125347084490282,
"grad_norm": 3.6750075817108154,
"learning_rate": 3.9687623958746536e-05,
"loss": 1.7982,
"step": 10400
},
{
"epoch": 4.165013883379611,
"grad_norm": 4.948938369750977,
"learning_rate": 3.958845696152321e-05,
"loss": 1.9426,
"step": 10500
},
{
"epoch": 4.204680682268941,
"grad_norm": 5.098011016845703,
"learning_rate": 3.948928996429988e-05,
"loss": 1.9476,
"step": 10600
},
{
"epoch": 4.244347481158271,
"grad_norm": 3.605708599090576,
"learning_rate": 3.939012296707656e-05,
"loss": 1.91,
"step": 10700
},
{
"epoch": 4.2840142800476,
"grad_norm": 4.1741766929626465,
"learning_rate": 3.929095596985323e-05,
"loss": 1.9512,
"step": 10800
},
{
"epoch": 4.3236810789369295,
"grad_norm": 4.427469730377197,
"learning_rate": 3.919178897262991e-05,
"loss": 1.859,
"step": 10900
},
{
"epoch": 4.36334787782626,
"grad_norm": 4.128306865692139,
"learning_rate": 3.909262197540659e-05,
"loss": 1.8465,
"step": 11000
},
{
"epoch": 4.403014676715589,
"grad_norm": 3.959047317504883,
"learning_rate": 3.8993454978183266e-05,
"loss": 1.9168,
"step": 11100
},
{
"epoch": 4.442681475604918,
"grad_norm": 5.283690452575684,
"learning_rate": 3.889428798095994e-05,
"loss": 1.8212,
"step": 11200
},
{
"epoch": 4.482348274494249,
"grad_norm": 4.190108299255371,
"learning_rate": 3.8795120983736616e-05,
"loss": 1.7881,
"step": 11300
},
{
"epoch": 4.522015073383578,
"grad_norm": 5.957630157470703,
"learning_rate": 3.869595398651329e-05,
"loss": 1.8606,
"step": 11400
},
{
"epoch": 4.561681872272907,
"grad_norm": 4.41494607925415,
"learning_rate": 3.859678698928997e-05,
"loss": 1.8808,
"step": 11500
},
{
"epoch": 4.601348671162238,
"grad_norm": 4.355372428894043,
"learning_rate": 3.849761999206664e-05,
"loss": 1.8034,
"step": 11600
},
{
"epoch": 4.641015470051567,
"grad_norm": 4.594727993011475,
"learning_rate": 3.839845299484332e-05,
"loss": 1.8631,
"step": 11700
},
{
"epoch": 4.680682268940896,
"grad_norm": 3.8081648349761963,
"learning_rate": 3.8299285997619996e-05,
"loss": 1.7664,
"step": 11800
},
{
"epoch": 4.7203490678302265,
"grad_norm": 5.383887767791748,
"learning_rate": 3.8200119000396675e-05,
"loss": 1.8918,
"step": 11900
},
{
"epoch": 4.760015866719556,
"grad_norm": 4.703048229217529,
"learning_rate": 3.8100952003173347e-05,
"loss": 1.7821,
"step": 12000
},
{
"epoch": 4.799682665608885,
"grad_norm": 5.115866661071777,
"learning_rate": 3.800178500595002e-05,
"loss": 1.8276,
"step": 12100
},
{
"epoch": 4.839349464498215,
"grad_norm": 4.647130012512207,
"learning_rate": 3.79026180087267e-05,
"loss": 1.7743,
"step": 12200
},
{
"epoch": 4.879016263387545,
"grad_norm": 4.2948994636535645,
"learning_rate": 3.780345101150337e-05,
"loss": 1.7567,
"step": 12300
},
{
"epoch": 4.918683062276874,
"grad_norm": 4.055002212524414,
"learning_rate": 3.770428401428005e-05,
"loss": 1.832,
"step": 12400
},
{
"epoch": 4.958349861166204,
"grad_norm": 4.373877048492432,
"learning_rate": 3.7605117017056726e-05,
"loss": 1.7995,
"step": 12500
},
{
"epoch": 4.998016660055534,
"grad_norm": 5.246423721313477,
"learning_rate": 3.7505950019833405e-05,
"loss": 1.7464,
"step": 12600
},
{
"epoch": 5.0,
"eval_loss": 1.4670053720474243,
"eval_runtime": 33.135,
"eval_samples_per_second": 45.813,
"eval_steps_per_second": 5.734,
"step": 12605
},
{
"epoch": 5.037683458944863,
"grad_norm": 5.669796943664551,
"learning_rate": 3.740678302261008e-05,
"loss": 1.6803,
"step": 12700
},
{
"epoch": 5.077350257834193,
"grad_norm": 4.203566074371338,
"learning_rate": 3.7307616025386755e-05,
"loss": 1.665,
"step": 12800
},
{
"epoch": 5.1170170567235225,
"grad_norm": 3.6892035007476807,
"learning_rate": 3.720844902816343e-05,
"loss": 1.7269,
"step": 12900
},
{
"epoch": 5.156683855612852,
"grad_norm": 4.452983379364014,
"learning_rate": 3.7109282030940106e-05,
"loss": 1.7276,
"step": 13000
},
{
"epoch": 5.196350654502182,
"grad_norm": 3.7172744274139404,
"learning_rate": 3.701011503371678e-05,
"loss": 1.6838,
"step": 13100
},
{
"epoch": 5.236017453391511,
"grad_norm": 4.209805488586426,
"learning_rate": 3.6910948036493456e-05,
"loss": 1.6982,
"step": 13200
},
{
"epoch": 5.275684252280841,
"grad_norm": 4.2851362228393555,
"learning_rate": 3.6811781039270135e-05,
"loss": 1.6656,
"step": 13300
},
{
"epoch": 5.315351051170171,
"grad_norm": 3.345033884048462,
"learning_rate": 3.6712614042046814e-05,
"loss": 1.7176,
"step": 13400
},
{
"epoch": 5.3550178500595,
"grad_norm": 7.854482173919678,
"learning_rate": 3.6613447044823485e-05,
"loss": 1.7395,
"step": 13500
},
{
"epoch": 5.39468464894883,
"grad_norm": 5.201159477233887,
"learning_rate": 3.651428004760016e-05,
"loss": 1.7159,
"step": 13600
},
{
"epoch": 5.43435144783816,
"grad_norm": 5.032053470611572,
"learning_rate": 3.6415113050376836e-05,
"loss": 1.7993,
"step": 13700
},
{
"epoch": 5.474018246727489,
"grad_norm": 4.350612640380859,
"learning_rate": 3.631594605315351e-05,
"loss": 1.722,
"step": 13800
},
{
"epoch": 5.5136850456168185,
"grad_norm": 5.67685604095459,
"learning_rate": 3.6216779055930186e-05,
"loss": 1.5882,
"step": 13900
},
{
"epoch": 5.553351844506149,
"grad_norm": 3.8744733333587646,
"learning_rate": 3.6117612058706865e-05,
"loss": 1.6864,
"step": 14000
},
{
"epoch": 5.593018643395478,
"grad_norm": 3.0556750297546387,
"learning_rate": 3.6018445061483544e-05,
"loss": 1.6494,
"step": 14100
},
{
"epoch": 5.632685442284807,
"grad_norm": 5.0797343254089355,
"learning_rate": 3.5919278064260215e-05,
"loss": 1.747,
"step": 14200
},
{
"epoch": 5.672352241174138,
"grad_norm": 4.625453948974609,
"learning_rate": 3.5820111067036894e-05,
"loss": 1.689,
"step": 14300
},
{
"epoch": 5.712019040063467,
"grad_norm": 5.2560133934021,
"learning_rate": 3.5720944069813566e-05,
"loss": 1.6218,
"step": 14400
},
{
"epoch": 5.751685838952796,
"grad_norm": 4.45328950881958,
"learning_rate": 3.5621777072590245e-05,
"loss": 1.7095,
"step": 14500
},
{
"epoch": 5.7913526378421265,
"grad_norm": 3.0788328647613525,
"learning_rate": 3.5522610075366916e-05,
"loss": 1.7223,
"step": 14600
},
{
"epoch": 5.831019436731456,
"grad_norm": 6.247290134429932,
"learning_rate": 3.5423443078143595e-05,
"loss": 1.6158,
"step": 14700
},
{
"epoch": 5.870686235620785,
"grad_norm": 4.095520973205566,
"learning_rate": 3.5324276080920274e-05,
"loss": 1.6313,
"step": 14800
},
{
"epoch": 5.910353034510115,
"grad_norm": 4.251845836639404,
"learning_rate": 3.522510908369695e-05,
"loss": 1.6458,
"step": 14900
},
{
"epoch": 5.950019833399445,
"grad_norm": 3.833702802658081,
"learning_rate": 3.5125942086473624e-05,
"loss": 1.6425,
"step": 15000
},
{
"epoch": 5.989686632288774,
"grad_norm": 4.577655792236328,
"learning_rate": 3.5026775089250296e-05,
"loss": 1.6747,
"step": 15100
},
{
"epoch": 6.0,
"eval_loss": 1.3812555074691772,
"eval_runtime": 33.4674,
"eval_samples_per_second": 45.358,
"eval_steps_per_second": 5.677,
"step": 15126
},
{
"epoch": 6.029353431178104,
"grad_norm": 3.609616279602051,
"learning_rate": 3.4927608092026975e-05,
"loss": 1.6579,
"step": 15200
},
{
"epoch": 6.069020230067434,
"grad_norm": 3.5658130645751953,
"learning_rate": 3.4828441094803647e-05,
"loss": 1.6842,
"step": 15300
},
{
"epoch": 6.108687028956763,
"grad_norm": 4.586058139801025,
"learning_rate": 3.4729274097580325e-05,
"loss": 1.5563,
"step": 15400
},
{
"epoch": 6.148353827846093,
"grad_norm": 5.103824615478516,
"learning_rate": 3.4630107100357004e-05,
"loss": 1.5386,
"step": 15500
},
{
"epoch": 6.1880206267354225,
"grad_norm": 5.14306116104126,
"learning_rate": 3.453094010313368e-05,
"loss": 1.6081,
"step": 15600
},
{
"epoch": 6.227687425624752,
"grad_norm": 4.270661354064941,
"learning_rate": 3.4431773105910354e-05,
"loss": 1.5569,
"step": 15700
},
{
"epoch": 6.267354224514082,
"grad_norm": 13.869562149047852,
"learning_rate": 3.433260610868703e-05,
"loss": 1.5484,
"step": 15800
},
{
"epoch": 6.307021023403411,
"grad_norm": 3.9378180503845215,
"learning_rate": 3.4233439111463705e-05,
"loss": 1.5441,
"step": 15900
},
{
"epoch": 6.346687822292741,
"grad_norm": 4.3542656898498535,
"learning_rate": 3.4134272114240383e-05,
"loss": 1.58,
"step": 16000
},
{
"epoch": 6.386354621182071,
"grad_norm": 3.8545126914978027,
"learning_rate": 3.4035105117017055e-05,
"loss": 1.4445,
"step": 16100
},
{
"epoch": 6.4260214200714,
"grad_norm": 3.9810452461242676,
"learning_rate": 3.3935938119793734e-05,
"loss": 1.6052,
"step": 16200
},
{
"epoch": 6.46568821896073,
"grad_norm": 7.306039333343506,
"learning_rate": 3.383677112257041e-05,
"loss": 1.608,
"step": 16300
},
{
"epoch": 6.50535501785006,
"grad_norm": 4.018649578094482,
"learning_rate": 3.373760412534709e-05,
"loss": 1.581,
"step": 16400
},
{
"epoch": 6.545021816739389,
"grad_norm": 5.1577019691467285,
"learning_rate": 3.363843712812376e-05,
"loss": 1.548,
"step": 16500
},
{
"epoch": 6.5846886156287185,
"grad_norm": 6.858482837677002,
"learning_rate": 3.3539270130900435e-05,
"loss": 1.5823,
"step": 16600
},
{
"epoch": 6.624355414518049,
"grad_norm": 4.213831901550293,
"learning_rate": 3.3440103133677114e-05,
"loss": 1.5199,
"step": 16700
},
{
"epoch": 6.664022213407378,
"grad_norm": 3.531313180923462,
"learning_rate": 3.3340936136453785e-05,
"loss": 1.5993,
"step": 16800
},
{
"epoch": 6.703689012296707,
"grad_norm": 4.222484588623047,
"learning_rate": 3.3241769139230464e-05,
"loss": 1.5839,
"step": 16900
},
{
"epoch": 6.743355811186038,
"grad_norm": 3.11354660987854,
"learning_rate": 3.314260214200714e-05,
"loss": 1.5302,
"step": 17000
},
{
"epoch": 6.783022610075367,
"grad_norm": 3.699721574783325,
"learning_rate": 3.304343514478382e-05,
"loss": 1.5662,
"step": 17100
},
{
"epoch": 6.822689408964696,
"grad_norm": 6.095912456512451,
"learning_rate": 3.294426814756049e-05,
"loss": 1.6208,
"step": 17200
},
{
"epoch": 6.862356207854027,
"grad_norm": 3.0489301681518555,
"learning_rate": 3.284510115033717e-05,
"loss": 1.4306,
"step": 17300
},
{
"epoch": 6.902023006743356,
"grad_norm": 4.094913005828857,
"learning_rate": 3.2745934153113844e-05,
"loss": 1.438,
"step": 17400
},
{
"epoch": 6.941689805632685,
"grad_norm": 3.900447130203247,
"learning_rate": 3.264676715589052e-05,
"loss": 1.4798,
"step": 17500
},
{
"epoch": 6.9813566045220155,
"grad_norm": 4.244141578674316,
"learning_rate": 3.2547600158667194e-05,
"loss": 1.5627,
"step": 17600
},
{
"epoch": 7.0,
"eval_loss": 1.3121882677078247,
"eval_runtime": 33.1418,
"eval_samples_per_second": 45.803,
"eval_steps_per_second": 5.733,
"step": 17647
},
{
"epoch": 7.021023403411345,
"grad_norm": 5.134289264678955,
"learning_rate": 3.244843316144387e-05,
"loss": 1.4704,
"step": 17700
},
{
"epoch": 7.060690202300674,
"grad_norm": 4.705554008483887,
"learning_rate": 3.234926616422055e-05,
"loss": 1.4257,
"step": 17800
},
{
"epoch": 7.100357001190004,
"grad_norm": 5.20936918258667,
"learning_rate": 3.225009916699723e-05,
"loss": 1.4684,
"step": 17900
},
{
"epoch": 7.140023800079334,
"grad_norm": 5.669763565063477,
"learning_rate": 3.21509321697739e-05,
"loss": 1.4626,
"step": 18000
},
{
"epoch": 7.179690598968663,
"grad_norm": 4.726533889770508,
"learning_rate": 3.2051765172550574e-05,
"loss": 1.4362,
"step": 18100
},
{
"epoch": 7.219357397857993,
"grad_norm": 3.413167715072632,
"learning_rate": 3.195259817532725e-05,
"loss": 1.4591,
"step": 18200
},
{
"epoch": 7.259024196747323,
"grad_norm": 3.3368911743164062,
"learning_rate": 3.1853431178103924e-05,
"loss": 1.5077,
"step": 18300
},
{
"epoch": 7.298690995636652,
"grad_norm": 3.5089704990386963,
"learning_rate": 3.17542641808806e-05,
"loss": 1.509,
"step": 18400
},
{
"epoch": 7.338357794525982,
"grad_norm": 4.13053035736084,
"learning_rate": 3.165509718365728e-05,
"loss": 1.5048,
"step": 18500
},
{
"epoch": 7.3780245934153115,
"grad_norm": 4.646170139312744,
"learning_rate": 3.155593018643396e-05,
"loss": 1.474,
"step": 18600
},
{
"epoch": 7.417691392304641,
"grad_norm": 4.4724812507629395,
"learning_rate": 3.145676318921063e-05,
"loss": 1.5448,
"step": 18700
},
{
"epoch": 7.457358191193971,
"grad_norm": 3.79464054107666,
"learning_rate": 3.135759619198731e-05,
"loss": 1.4379,
"step": 18800
},
{
"epoch": 7.4970249900833,
"grad_norm": 3.2396161556243896,
"learning_rate": 3.125842919476398e-05,
"loss": 1.4857,
"step": 18900
},
{
"epoch": 7.53669178897263,
"grad_norm": 3.6047024726867676,
"learning_rate": 3.115926219754066e-05,
"loss": 1.453,
"step": 19000
},
{
"epoch": 7.57635858786196,
"grad_norm": 4.998748779296875,
"learning_rate": 3.106009520031733e-05,
"loss": 1.4062,
"step": 19100
},
{
"epoch": 7.616025386751289,
"grad_norm": 4.068435192108154,
"learning_rate": 3.096092820309401e-05,
"loss": 1.3582,
"step": 19200
},
{
"epoch": 7.655692185640619,
"grad_norm": 5.680367469787598,
"learning_rate": 3.086176120587069e-05,
"loss": 1.4897,
"step": 19300
},
{
"epoch": 7.695358984529949,
"grad_norm": 3.917802333831787,
"learning_rate": 3.076259420864737e-05,
"loss": 1.4195,
"step": 19400
},
{
"epoch": 7.735025783419278,
"grad_norm": 3.1522891521453857,
"learning_rate": 3.066342721142404e-05,
"loss": 1.4599,
"step": 19500
},
{
"epoch": 7.7746925823086075,
"grad_norm": 4.597601890563965,
"learning_rate": 3.056426021420071e-05,
"loss": 1.4701,
"step": 19600
},
{
"epoch": 7.814359381197937,
"grad_norm": 4.217317581176758,
"learning_rate": 3.046509321697739e-05,
"loss": 1.4263,
"step": 19700
},
{
"epoch": 7.854026180087267,
"grad_norm": 4.17954158782959,
"learning_rate": 3.0365926219754066e-05,
"loss": 1.4155,
"step": 19800
},
{
"epoch": 7.893692978976596,
"grad_norm": 4.049231052398682,
"learning_rate": 3.0266759222530745e-05,
"loss": 1.4343,
"step": 19900
},
{
"epoch": 7.933359777865926,
"grad_norm": 3.9351389408111572,
"learning_rate": 3.0167592225307417e-05,
"loss": 1.4247,
"step": 20000
},
{
"epoch": 7.973026576755256,
"grad_norm": 6.478794097900391,
"learning_rate": 3.0068425228084096e-05,
"loss": 1.4336,
"step": 20100
},
{
"epoch": 8.0,
"eval_loss": 1.2493535280227661,
"eval_runtime": 33.1532,
"eval_samples_per_second": 45.787,
"eval_steps_per_second": 5.731,
"step": 20168
},
{
"epoch": 8.012693375644586,
"grad_norm": 7.988471508026123,
"learning_rate": 2.996925823086077e-05,
"loss": 1.4408,
"step": 20200
},
{
"epoch": 8.052360174533915,
"grad_norm": 3.978797674179077,
"learning_rate": 2.987009123363745e-05,
"loss": 1.4227,
"step": 20300
},
{
"epoch": 8.092026973423245,
"grad_norm": 2.8589699268341064,
"learning_rate": 2.977092423641412e-05,
"loss": 1.3348,
"step": 20400
},
{
"epoch": 8.131693772312575,
"grad_norm": 4.3820061683654785,
"learning_rate": 2.96717572391908e-05,
"loss": 1.3374,
"step": 20500
},
{
"epoch": 8.171360571201904,
"grad_norm": 4.421834468841553,
"learning_rate": 2.9572590241967475e-05,
"loss": 1.379,
"step": 20600
},
{
"epoch": 8.211027370091234,
"grad_norm": 3.6717193126678467,
"learning_rate": 2.9473423244744154e-05,
"loss": 1.3878,
"step": 20700
},
{
"epoch": 8.250694168980564,
"grad_norm": 5.8960466384887695,
"learning_rate": 2.9374256247520826e-05,
"loss": 1.418,
"step": 20800
},
{
"epoch": 8.290360967869892,
"grad_norm": 4.1541428565979,
"learning_rate": 2.9275089250297504e-05,
"loss": 1.3427,
"step": 20900
},
{
"epoch": 8.330027766759223,
"grad_norm": 4.0375566482543945,
"learning_rate": 2.917592225307418e-05,
"loss": 1.3659,
"step": 21000
},
{
"epoch": 8.369694565648553,
"grad_norm": 2.6886465549468994,
"learning_rate": 2.907675525585085e-05,
"loss": 1.3568,
"step": 21100
},
{
"epoch": 8.409361364537881,
"grad_norm": 4.069731712341309,
"learning_rate": 2.897758825862753e-05,
"loss": 1.4326,
"step": 21200
},
{
"epoch": 8.449028163427212,
"grad_norm": 4.844085693359375,
"learning_rate": 2.8878421261404205e-05,
"loss": 1.4363,
"step": 21300
},
{
"epoch": 8.488694962316542,
"grad_norm": 2.894545316696167,
"learning_rate": 2.8779254264180884e-05,
"loss": 1.362,
"step": 21400
},
{
"epoch": 8.52836176120587,
"grad_norm": 3.8921375274658203,
"learning_rate": 2.8680087266957556e-05,
"loss": 1.3303,
"step": 21500
},
{
"epoch": 8.5680285600952,
"grad_norm": 3.6468684673309326,
"learning_rate": 2.8580920269734234e-05,
"loss": 1.387,
"step": 21600
},
{
"epoch": 8.60769535898453,
"grad_norm": 4.2180938720703125,
"learning_rate": 2.848175327251091e-05,
"loss": 1.366,
"step": 21700
},
{
"epoch": 8.647362157873859,
"grad_norm": 4.113888263702393,
"learning_rate": 2.8382586275287588e-05,
"loss": 1.4047,
"step": 21800
},
{
"epoch": 8.68702895676319,
"grad_norm": 4.009461402893066,
"learning_rate": 2.828341927806426e-05,
"loss": 1.3446,
"step": 21900
},
{
"epoch": 8.72669575565252,
"grad_norm": 3.8195252418518066,
"learning_rate": 2.818425228084094e-05,
"loss": 1.3304,
"step": 22000
},
{
"epoch": 8.766362554541848,
"grad_norm": 4.5541300773620605,
"learning_rate": 2.8085085283617614e-05,
"loss": 1.4156,
"step": 22100
},
{
"epoch": 8.806029353431178,
"grad_norm": 4.221588611602783,
"learning_rate": 2.7985918286394293e-05,
"loss": 1.3258,
"step": 22200
},
{
"epoch": 8.845696152320508,
"grad_norm": 3.7638354301452637,
"learning_rate": 2.7886751289170964e-05,
"loss": 1.2697,
"step": 22300
},
{
"epoch": 8.885362951209837,
"grad_norm": 3.7174267768859863,
"learning_rate": 2.7787584291947643e-05,
"loss": 1.3468,
"step": 22400
},
{
"epoch": 8.925029750099167,
"grad_norm": 4.4955153465271,
"learning_rate": 2.768841729472432e-05,
"loss": 1.3074,
"step": 22500
},
{
"epoch": 8.964696548988497,
"grad_norm": 4.170012950897217,
"learning_rate": 2.758925029750099e-05,
"loss": 1.3324,
"step": 22600
},
{
"epoch": 9.0,
"eval_loss": 1.204575538635254,
"eval_runtime": 33.1963,
"eval_samples_per_second": 45.728,
"eval_steps_per_second": 5.724,
"step": 22689
},
{
"epoch": 9.004363347877826,
"grad_norm": 3.331163167953491,
"learning_rate": 2.749008330027767e-05,
"loss": 1.3285,
"step": 22700
},
{
"epoch": 9.044030146767156,
"grad_norm": 3.822847843170166,
"learning_rate": 2.7390916303054344e-05,
"loss": 1.3589,
"step": 22800
},
{
"epoch": 9.083696945656486,
"grad_norm": 3.4321391582489014,
"learning_rate": 2.7291749305831023e-05,
"loss": 1.2863,
"step": 22900
},
{
"epoch": 9.123363744545815,
"grad_norm": 4.23520040512085,
"learning_rate": 2.7192582308607695e-05,
"loss": 1.297,
"step": 23000
},
{
"epoch": 9.163030543435145,
"grad_norm": 3.0839881896972656,
"learning_rate": 2.7093415311384373e-05,
"loss": 1.272,
"step": 23100
},
{
"epoch": 9.202697342324475,
"grad_norm": 5.115342617034912,
"learning_rate": 2.699424831416105e-05,
"loss": 1.2667,
"step": 23200
},
{
"epoch": 9.242364141213804,
"grad_norm": 3.8965401649475098,
"learning_rate": 2.6895081316937727e-05,
"loss": 1.2995,
"step": 23300
},
{
"epoch": 9.282030940103134,
"grad_norm": 3.395707368850708,
"learning_rate": 2.67959143197144e-05,
"loss": 1.2064,
"step": 23400
},
{
"epoch": 9.321697738992464,
"grad_norm": 3.7783238887786865,
"learning_rate": 2.6696747322491078e-05,
"loss": 1.354,
"step": 23500
},
{
"epoch": 9.361364537881792,
"grad_norm": 3.6201136112213135,
"learning_rate": 2.6597580325267753e-05,
"loss": 1.318,
"step": 23600
},
{
"epoch": 9.401031336771123,
"grad_norm": 7.127315044403076,
"learning_rate": 2.649841332804443e-05,
"loss": 1.2809,
"step": 23700
},
{
"epoch": 9.440698135660453,
"grad_norm": 3.341298818588257,
"learning_rate": 2.6399246330821103e-05,
"loss": 1.3285,
"step": 23800
},
{
"epoch": 9.480364934549781,
"grad_norm": 3.38814377784729,
"learning_rate": 2.6300079333597782e-05,
"loss": 1.3326,
"step": 23900
},
{
"epoch": 9.520031733439112,
"grad_norm": 2.880125045776367,
"learning_rate": 2.6200912336374457e-05,
"loss": 1.3142,
"step": 24000
},
{
"epoch": 9.559698532328442,
"grad_norm": 3.778383731842041,
"learning_rate": 2.610174533915113e-05,
"loss": 1.3217,
"step": 24100
},
{
"epoch": 9.59936533121777,
"grad_norm": 5.5109734535217285,
"learning_rate": 2.6002578341927808e-05,
"loss": 1.2715,
"step": 24200
},
{
"epoch": 9.6390321301071,
"grad_norm": 3.931368112564087,
"learning_rate": 2.5903411344704483e-05,
"loss": 1.318,
"step": 24300
},
{
"epoch": 9.67869892899643,
"grad_norm": 3.6587719917297363,
"learning_rate": 2.580424434748116e-05,
"loss": 1.2384,
"step": 24400
},
{
"epoch": 9.71836572788576,
"grad_norm": 3.4478108882904053,
"learning_rate": 2.5705077350257833e-05,
"loss": 1.2682,
"step": 24500
},
{
"epoch": 9.75803252677509,
"grad_norm": 3.9226527214050293,
"learning_rate": 2.5605910353034512e-05,
"loss": 1.2966,
"step": 24600
},
{
"epoch": 9.79769932566442,
"grad_norm": 4.621306419372559,
"learning_rate": 2.5506743355811187e-05,
"loss": 1.2788,
"step": 24700
},
{
"epoch": 9.837366124553748,
"grad_norm": 3.4298593997955322,
"learning_rate": 2.5407576358587866e-05,
"loss": 1.3299,
"step": 24800
},
{
"epoch": 9.877032923443078,
"grad_norm": 3.7832400798797607,
"learning_rate": 2.5308409361364538e-05,
"loss": 1.2634,
"step": 24900
},
{
"epoch": 9.916699722332408,
"grad_norm": 5.351818561553955,
"learning_rate": 2.5209242364141216e-05,
"loss": 1.3117,
"step": 25000
},
{
"epoch": 9.956366521221737,
"grad_norm": 4.65415096282959,
"learning_rate": 2.511007536691789e-05,
"loss": 1.2613,
"step": 25100
},
{
"epoch": 9.996033320111067,
"grad_norm": 3.2736618518829346,
"learning_rate": 2.501090836969457e-05,
"loss": 1.3156,
"step": 25200
},
{
"epoch": 10.0,
"eval_loss": 1.164570689201355,
"eval_runtime": 33.0846,
"eval_samples_per_second": 45.882,
"eval_steps_per_second": 5.743,
"step": 25210
},
{
"epoch": 10.035700119000397,
"grad_norm": 3.6819069385528564,
"learning_rate": 2.4911741372471242e-05,
"loss": 1.2604,
"step": 25300
},
{
"epoch": 10.075366917889726,
"grad_norm": 3.9212143421173096,
"learning_rate": 2.4812574375247917e-05,
"loss": 1.2308,
"step": 25400
},
{
"epoch": 10.115033716779056,
"grad_norm": 3.3087549209594727,
"learning_rate": 2.4713407378024596e-05,
"loss": 1.1652,
"step": 25500
},
{
"epoch": 10.154700515668386,
"grad_norm": 3.8680827617645264,
"learning_rate": 2.461424038080127e-05,
"loss": 1.2311,
"step": 25600
},
{
"epoch": 10.194367314557715,
"grad_norm": 5.244319438934326,
"learning_rate": 2.4515073383577946e-05,
"loss": 1.1819,
"step": 25700
},
{
"epoch": 10.234034113447045,
"grad_norm": 3.2293717861175537,
"learning_rate": 2.4415906386354622e-05,
"loss": 1.249,
"step": 25800
},
{
"epoch": 10.273700912336375,
"grad_norm": 4.391103744506836,
"learning_rate": 2.43167393891313e-05,
"loss": 1.2283,
"step": 25900
},
{
"epoch": 10.313367711225704,
"grad_norm": 4.615547180175781,
"learning_rate": 2.4217572391907976e-05,
"loss": 1.2915,
"step": 26000
},
{
"epoch": 10.353034510115034,
"grad_norm": 3.3367502689361572,
"learning_rate": 2.411840539468465e-05,
"loss": 1.2221,
"step": 26100
},
{
"epoch": 10.392701309004364,
"grad_norm": 5.194177150726318,
"learning_rate": 2.4019238397461326e-05,
"loss": 1.2611,
"step": 26200
},
{
"epoch": 10.432368107893693,
"grad_norm": 5.576562404632568,
"learning_rate": 2.3920071400238005e-05,
"loss": 1.2764,
"step": 26300
},
{
"epoch": 10.472034906783023,
"grad_norm": 4.902477264404297,
"learning_rate": 2.3820904403014677e-05,
"loss": 1.2066,
"step": 26400
},
{
"epoch": 10.511701705672353,
"grad_norm": 4.312764644622803,
"learning_rate": 2.3721737405791352e-05,
"loss": 1.219,
"step": 26500
},
{
"epoch": 10.551368504561681,
"grad_norm": 4.345120429992676,
"learning_rate": 2.362257040856803e-05,
"loss": 1.2679,
"step": 26600
},
{
"epoch": 10.591035303451012,
"grad_norm": 3.9365150928497314,
"learning_rate": 2.3523403411344706e-05,
"loss": 1.1752,
"step": 26700
},
{
"epoch": 10.630702102340342,
"grad_norm": 3.843207597732544,
"learning_rate": 2.342423641412138e-05,
"loss": 1.2706,
"step": 26800
},
{
"epoch": 10.67036890122967,
"grad_norm": 4.076716423034668,
"learning_rate": 2.3325069416898056e-05,
"loss": 1.1561,
"step": 26900
},
{
"epoch": 10.710035700119,
"grad_norm": 4.182331562042236,
"learning_rate": 2.3225902419674735e-05,
"loss": 1.2027,
"step": 27000
},
{
"epoch": 10.74970249900833,
"grad_norm": 5.730105876922607,
"learning_rate": 2.312673542245141e-05,
"loss": 1.2703,
"step": 27100
},
{
"epoch": 10.78936929789766,
"grad_norm": 5.552068710327148,
"learning_rate": 2.3027568425228085e-05,
"loss": 1.252,
"step": 27200
},
{
"epoch": 10.82903609678699,
"grad_norm": 4.406209945678711,
"learning_rate": 2.292840142800476e-05,
"loss": 1.183,
"step": 27300
},
{
"epoch": 10.86870289567632,
"grad_norm": 3.434688091278076,
"learning_rate": 2.282923443078144e-05,
"loss": 1.3214,
"step": 27400
},
{
"epoch": 10.908369694565648,
"grad_norm": 5.0344085693359375,
"learning_rate": 2.2730067433558114e-05,
"loss": 1.3043,
"step": 27500
},
{
"epoch": 10.948036493454978,
"grad_norm": 3.3030033111572266,
"learning_rate": 2.263090043633479e-05,
"loss": 1.1764,
"step": 27600
},
{
"epoch": 10.987703292344309,
"grad_norm": 5.79923152923584,
"learning_rate": 2.2531733439111465e-05,
"loss": 1.2218,
"step": 27700
},
{
"epoch": 11.0,
"eval_loss": 1.1352205276489258,
"eval_runtime": 31.6991,
"eval_samples_per_second": 47.888,
"eval_steps_per_second": 5.994,
"step": 27731
},
{
"epoch": 11.027370091233637,
"grad_norm": 4.073122501373291,
"learning_rate": 2.2432566441888144e-05,
"loss": 1.1861,
"step": 27800
},
{
"epoch": 11.067036890122967,
"grad_norm": 2.8648433685302734,
"learning_rate": 2.2333399444664815e-05,
"loss": 1.1659,
"step": 27900
},
{
"epoch": 11.106703689012297,
"grad_norm": 3.6053709983825684,
"learning_rate": 2.223423244744149e-05,
"loss": 1.2087,
"step": 28000
},
{
"epoch": 11.146370487901626,
"grad_norm": 3.5773251056671143,
"learning_rate": 2.2135065450218166e-05,
"loss": 1.1787,
"step": 28100
},
{
"epoch": 11.186037286790956,
"grad_norm": 5.5593485832214355,
"learning_rate": 2.2035898452994845e-05,
"loss": 1.1941,
"step": 28200
},
{
"epoch": 11.225704085680286,
"grad_norm": 3.9467504024505615,
"learning_rate": 2.193673145577152e-05,
"loss": 1.2505,
"step": 28300
},
{
"epoch": 11.265370884569615,
"grad_norm": 4.707422733306885,
"learning_rate": 2.1837564458548195e-05,
"loss": 1.1165,
"step": 28400
},
{
"epoch": 11.305037683458945,
"grad_norm": 4.517952919006348,
"learning_rate": 2.1738397461324874e-05,
"loss": 1.2379,
"step": 28500
},
{
"epoch": 11.344704482348275,
"grad_norm": 2.318586587905884,
"learning_rate": 2.163923046410155e-05,
"loss": 1.2098,
"step": 28600
},
{
"epoch": 11.384371281237604,
"grad_norm": 3.655980110168457,
"learning_rate": 2.1540063466878224e-05,
"loss": 1.2044,
"step": 28700
},
{
"epoch": 11.424038080126934,
"grad_norm": 4.038224697113037,
"learning_rate": 2.14408964696549e-05,
"loss": 1.1651,
"step": 28800
},
{
"epoch": 11.463704879016264,
"grad_norm": 3.9811367988586426,
"learning_rate": 2.1341729472431578e-05,
"loss": 1.199,
"step": 28900
},
{
"epoch": 11.503371677905593,
"grad_norm": 6.200103759765625,
"learning_rate": 2.1242562475208253e-05,
"loss": 1.1094,
"step": 29000
},
{
"epoch": 11.543038476794923,
"grad_norm": 3.919187545776367,
"learning_rate": 2.114339547798493e-05,
"loss": 1.1522,
"step": 29100
},
{
"epoch": 11.582705275684253,
"grad_norm": 3.701822519302368,
"learning_rate": 2.1044228480761604e-05,
"loss": 1.1556,
"step": 29200
},
{
"epoch": 11.622372074573581,
"grad_norm": 4.491922855377197,
"learning_rate": 2.0945061483538282e-05,
"loss": 1.1779,
"step": 29300
},
{
"epoch": 11.662038873462912,
"grad_norm": 4.367665767669678,
"learning_rate": 2.0845894486314954e-05,
"loss": 1.1392,
"step": 29400
},
{
"epoch": 11.701705672352242,
"grad_norm": 4.0435028076171875,
"learning_rate": 2.074672748909163e-05,
"loss": 1.1621,
"step": 29500
},
{
"epoch": 11.74137247124157,
"grad_norm": 4.151968955993652,
"learning_rate": 2.0647560491868305e-05,
"loss": 1.1983,
"step": 29600
},
{
"epoch": 11.7810392701309,
"grad_norm": 4.687623500823975,
"learning_rate": 2.0548393494644983e-05,
"loss": 1.1563,
"step": 29700
},
{
"epoch": 11.82070606902023,
"grad_norm": 4.415579795837402,
"learning_rate": 2.044922649742166e-05,
"loss": 1.1497,
"step": 29800
},
{
"epoch": 11.86037286790956,
"grad_norm": 4.241002082824707,
"learning_rate": 2.0350059500198334e-05,
"loss": 1.2298,
"step": 29900
},
{
"epoch": 11.90003966679889,
"grad_norm": 5.38535213470459,
"learning_rate": 2.025089250297501e-05,
"loss": 1.1403,
"step": 30000
},
{
"epoch": 11.93970646568822,
"grad_norm": 3.886983633041382,
"learning_rate": 2.0151725505751688e-05,
"loss": 1.237,
"step": 30100
},
{
"epoch": 11.979373264577548,
"grad_norm": 4.2845048904418945,
"learning_rate": 2.0052558508528363e-05,
"loss": 1.2216,
"step": 30200
},
{
"epoch": 12.0,
"eval_loss": 1.097899317741394,
"eval_runtime": 31.7141,
"eval_samples_per_second": 47.865,
"eval_steps_per_second": 5.991,
"step": 30252
},
{
"epoch": 12.019040063466878,
"grad_norm": 4.043181896209717,
"learning_rate": 1.9953391511305038e-05,
"loss": 1.1738,
"step": 30300
},
{
"epoch": 12.058706862356209,
"grad_norm": 3.213641405105591,
"learning_rate": 1.9854224514081713e-05,
"loss": 1.1143,
"step": 30400
},
{
"epoch": 12.098373661245537,
"grad_norm": 4.7294511795043945,
"learning_rate": 1.9755057516858392e-05,
"loss": 1.142,
"step": 30500
},
{
"epoch": 12.138040460134867,
"grad_norm": 4.42033052444458,
"learning_rate": 1.9655890519635067e-05,
"loss": 1.1422,
"step": 30600
},
{
"epoch": 12.177707259024197,
"grad_norm": 4.57334041595459,
"learning_rate": 1.9556723522411743e-05,
"loss": 1.1148,
"step": 30700
},
{
"epoch": 12.217374057913526,
"grad_norm": 4.560477256774902,
"learning_rate": 1.945755652518842e-05,
"loss": 1.1742,
"step": 30800
},
{
"epoch": 12.257040856802856,
"grad_norm": 3.4284374713897705,
"learning_rate": 1.9358389527965093e-05,
"loss": 1.1115,
"step": 30900
},
{
"epoch": 12.296707655692186,
"grad_norm": 3.185410499572754,
"learning_rate": 1.925922253074177e-05,
"loss": 1.1542,
"step": 31000
},
{
"epoch": 12.336374454581515,
"grad_norm": 3.674408435821533,
"learning_rate": 1.9160055533518444e-05,
"loss": 1.1787,
"step": 31100
},
{
"epoch": 12.376041253470845,
"grad_norm": 3.7118613719940186,
"learning_rate": 1.9060888536295122e-05,
"loss": 1.1716,
"step": 31200
},
{
"epoch": 12.415708052360175,
"grad_norm": 4.5831756591796875,
"learning_rate": 1.8961721539071797e-05,
"loss": 1.1372,
"step": 31300
},
{
"epoch": 12.455374851249504,
"grad_norm": 7.098066806793213,
"learning_rate": 1.8862554541848473e-05,
"loss": 1.1361,
"step": 31400
},
{
"epoch": 12.495041650138834,
"grad_norm": 3.451817512512207,
"learning_rate": 1.8763387544625148e-05,
"loss": 1.1458,
"step": 31500
},
{
"epoch": 12.534708449028164,
"grad_norm": 2.6188955307006836,
"learning_rate": 1.8664220547401827e-05,
"loss": 1.0782,
"step": 31600
},
{
"epoch": 12.574375247917493,
"grad_norm": 3.3588056564331055,
"learning_rate": 1.8565053550178502e-05,
"loss": 1.1593,
"step": 31700
},
{
"epoch": 12.614042046806823,
"grad_norm": 5.186858654022217,
"learning_rate": 1.8465886552955177e-05,
"loss": 1.137,
"step": 31800
},
{
"epoch": 12.653708845696153,
"grad_norm": 4.593524932861328,
"learning_rate": 1.8366719555731852e-05,
"loss": 1.1715,
"step": 31900
},
{
"epoch": 12.693375644585482,
"grad_norm": 4.951717853546143,
"learning_rate": 1.826755255850853e-05,
"loss": 1.0765,
"step": 32000
},
{
"epoch": 12.733042443474812,
"grad_norm": 6.989925384521484,
"learning_rate": 1.8168385561285206e-05,
"loss": 1.1062,
"step": 32100
},
{
"epoch": 12.772709242364142,
"grad_norm": 3.6436753273010254,
"learning_rate": 1.806921856406188e-05,
"loss": 1.1574,
"step": 32200
},
{
"epoch": 12.81237604125347,
"grad_norm": 4.659509181976318,
"learning_rate": 1.7970051566838557e-05,
"loss": 1.1257,
"step": 32300
},
{
"epoch": 12.8520428401428,
"grad_norm": 2.914414882659912,
"learning_rate": 1.7870884569615232e-05,
"loss": 1.1131,
"step": 32400
},
{
"epoch": 12.89170963903213,
"grad_norm": 3.9510741233825684,
"learning_rate": 1.7771717572391907e-05,
"loss": 1.1144,
"step": 32500
},
{
"epoch": 12.93137643792146,
"grad_norm": 4.820216178894043,
"learning_rate": 1.7672550575168582e-05,
"loss": 1.1628,
"step": 32600
},
{
"epoch": 12.97104323681079,
"grad_norm": 4.699492931365967,
"learning_rate": 1.757338357794526e-05,
"loss": 1.1587,
"step": 32700
},
{
"epoch": 13.0,
"eval_loss": 1.081364631652832,
"eval_runtime": 31.6894,
"eval_samples_per_second": 47.902,
"eval_steps_per_second": 5.996,
"step": 32773
},
{
"epoch": 13.01071003570012,
"grad_norm": 3.7646989822387695,
"learning_rate": 1.7474216580721936e-05,
"loss": 1.1084,
"step": 32800
},
{
"epoch": 13.050376834589448,
"grad_norm": 4.074378967285156,
"learning_rate": 1.737504958349861e-05,
"loss": 1.1007,
"step": 32900
},
{
"epoch": 13.090043633478778,
"grad_norm": 4.0714521408081055,
"learning_rate": 1.7275882586275287e-05,
"loss": 1.1298,
"step": 33000
},
{
"epoch": 13.129710432368109,
"grad_norm": 3.7556121349334717,
"learning_rate": 1.7176715589051965e-05,
"loss": 1.1407,
"step": 33100
},
{
"epoch": 13.169377231257437,
"grad_norm": 3.3032736778259277,
"learning_rate": 1.707754859182864e-05,
"loss": 1.1437,
"step": 33200
},
{
"epoch": 13.209044030146767,
"grad_norm": 4.428369522094727,
"learning_rate": 1.6978381594605316e-05,
"loss": 1.0659,
"step": 33300
},
{
"epoch": 13.248710829036098,
"grad_norm": 3.486649990081787,
"learning_rate": 1.687921459738199e-05,
"loss": 1.0744,
"step": 33400
},
{
"epoch": 13.288377627925426,
"grad_norm": 4.116626262664795,
"learning_rate": 1.678004760015867e-05,
"loss": 1.0933,
"step": 33500
},
{
"epoch": 13.328044426814756,
"grad_norm": 5.455049991607666,
"learning_rate": 1.6680880602935345e-05,
"loss": 1.0387,
"step": 33600
},
{
"epoch": 13.367711225704086,
"grad_norm": 4.454029083251953,
"learning_rate": 1.658171360571202e-05,
"loss": 1.0488,
"step": 33700
},
{
"epoch": 13.407378024593415,
"grad_norm": 3.605964422225952,
"learning_rate": 1.6482546608488695e-05,
"loss": 1.1565,
"step": 33800
},
{
"epoch": 13.447044823482745,
"grad_norm": 3.3428781032562256,
"learning_rate": 1.638337961126537e-05,
"loss": 1.1255,
"step": 33900
},
{
"epoch": 13.486711622372075,
"grad_norm": 5.9332709312438965,
"learning_rate": 1.6284212614042046e-05,
"loss": 1.0814,
"step": 34000
},
{
"epoch": 13.526378421261404,
"grad_norm": 3.3487417697906494,
"learning_rate": 1.618504561681872e-05,
"loss": 1.1105,
"step": 34100
},
{
"epoch": 13.566045220150734,
"grad_norm": 3.4275264739990234,
"learning_rate": 1.60858786195954e-05,
"loss": 1.0292,
"step": 34200
},
{
"epoch": 13.605712019040064,
"grad_norm": 5.602040767669678,
"learning_rate": 1.5986711622372075e-05,
"loss": 1.0629,
"step": 34300
},
{
"epoch": 13.645378817929393,
"grad_norm": 2.6752493381500244,
"learning_rate": 1.588754462514875e-05,
"loss": 1.0761,
"step": 34400
},
{
"epoch": 13.685045616818723,
"grad_norm": 3.2931220531463623,
"learning_rate": 1.5788377627925426e-05,
"loss": 0.9885,
"step": 34500
},
{
"epoch": 13.724712415708053,
"grad_norm": 8.223132133483887,
"learning_rate": 1.5689210630702104e-05,
"loss": 1.1423,
"step": 34600
},
{
"epoch": 13.764379214597382,
"grad_norm": 4.580158233642578,
"learning_rate": 1.559004363347878e-05,
"loss": 1.0879,
"step": 34700
},
{
"epoch": 13.804046013486712,
"grad_norm": 3.891131639480591,
"learning_rate": 1.5490876636255455e-05,
"loss": 1.0819,
"step": 34800
},
{
"epoch": 13.843712812376042,
"grad_norm": 5.4781084060668945,
"learning_rate": 1.539170963903213e-05,
"loss": 1.1007,
"step": 34900
},
{
"epoch": 13.88337961126537,
"grad_norm": 5.0408220291137695,
"learning_rate": 1.529254264180881e-05,
"loss": 1.1124,
"step": 35000
},
{
"epoch": 13.9230464101547,
"grad_norm": 4.6583452224731445,
"learning_rate": 1.5193375644585484e-05,
"loss": 1.1607,
"step": 35100
},
{
"epoch": 13.962713209044031,
"grad_norm": 5.026098251342773,
"learning_rate": 1.5094208647362159e-05,
"loss": 1.0744,
"step": 35200
},
{
"epoch": 14.0,
"eval_loss": 1.068395733833313,
"eval_runtime": 31.6512,
"eval_samples_per_second": 47.96,
"eval_steps_per_second": 6.003,
"step": 35294
},
{
"epoch": 14.00238000793336,
"grad_norm": 2.9335262775421143,
"learning_rate": 1.4995041650138836e-05,
"loss": 1.0841,
"step": 35300
},
{
"epoch": 14.04204680682269,
"grad_norm": 4.208588123321533,
"learning_rate": 1.489587465291551e-05,
"loss": 1.0901,
"step": 35400
},
{
"epoch": 14.08171360571202,
"grad_norm": 5.132387638092041,
"learning_rate": 1.4796707655692185e-05,
"loss": 1.1201,
"step": 35500
},
{
"epoch": 14.121380404601348,
"grad_norm": 3.9229278564453125,
"learning_rate": 1.4697540658468862e-05,
"loss": 1.0782,
"step": 35600
},
{
"epoch": 14.161047203490678,
"grad_norm": 6.1097259521484375,
"learning_rate": 1.4598373661245537e-05,
"loss": 1.1051,
"step": 35700
},
{
"epoch": 14.200714002380009,
"grad_norm": 4.1445417404174805,
"learning_rate": 1.4499206664022214e-05,
"loss": 1.1283,
"step": 35800
},
{
"epoch": 14.240380801269337,
"grad_norm": 3.5986008644104004,
"learning_rate": 1.440003966679889e-05,
"loss": 1.0453,
"step": 35900
},
{
"epoch": 14.280047600158667,
"grad_norm": 3.8175106048583984,
"learning_rate": 1.4300872669575566e-05,
"loss": 1.0585,
"step": 36000
},
{
"epoch": 14.319714399047998,
"grad_norm": 2.821758985519409,
"learning_rate": 1.4201705672352241e-05,
"loss": 1.06,
"step": 36100
},
{
"epoch": 14.359381197937326,
"grad_norm": 3.65065860748291,
"learning_rate": 1.4102538675128918e-05,
"loss": 1.1064,
"step": 36200
},
{
"epoch": 14.399047996826656,
"grad_norm": 5.7176713943481445,
"learning_rate": 1.4003371677905594e-05,
"loss": 1.008,
"step": 36300
},
{
"epoch": 14.438714795715986,
"grad_norm": 5.075132846832275,
"learning_rate": 1.390420468068227e-05,
"loss": 1.114,
"step": 36400
},
{
"epoch": 14.478381594605315,
"grad_norm": 5.210816860198975,
"learning_rate": 1.3805037683458946e-05,
"loss": 1.0944,
"step": 36500
},
{
"epoch": 14.518048393494645,
"grad_norm": 4.964089870452881,
"learning_rate": 1.3705870686235623e-05,
"loss": 1.0904,
"step": 36600
},
{
"epoch": 14.557715192383975,
"grad_norm": 3.131520986557007,
"learning_rate": 1.3606703689012298e-05,
"loss": 1.063,
"step": 36700
},
{
"epoch": 14.597381991273304,
"grad_norm": 6.203433036804199,
"learning_rate": 1.3507536691788975e-05,
"loss": 1.0885,
"step": 36800
},
{
"epoch": 14.637048790162634,
"grad_norm": 2.8487484455108643,
"learning_rate": 1.3408369694565648e-05,
"loss": 1.0785,
"step": 36900
},
{
"epoch": 14.676715589051964,
"grad_norm": 3.4533579349517822,
"learning_rate": 1.3309202697342324e-05,
"loss": 1.0956,
"step": 37000
},
{
"epoch": 14.716382387941293,
"grad_norm": 5.409042835235596,
"learning_rate": 1.3210035700119e-05,
"loss": 1.0635,
"step": 37100
},
{
"epoch": 14.756049186830623,
"grad_norm": 4.514674186706543,
"learning_rate": 1.3110868702895676e-05,
"loss": 1.0829,
"step": 37200
},
{
"epoch": 14.795715985719953,
"grad_norm": 4.7005791664123535,
"learning_rate": 1.3011701705672353e-05,
"loss": 1.0003,
"step": 37300
},
{
"epoch": 14.835382784609282,
"grad_norm": 4.253646373748779,
"learning_rate": 1.2912534708449028e-05,
"loss": 1.0562,
"step": 37400
},
{
"epoch": 14.875049583498612,
"grad_norm": 4.305023193359375,
"learning_rate": 1.2813367711225705e-05,
"loss": 1.0712,
"step": 37500
},
{
"epoch": 14.914716382387942,
"grad_norm": 4.189399719238281,
"learning_rate": 1.271420071400238e-05,
"loss": 1.0761,
"step": 37600
},
{
"epoch": 14.95438318127727,
"grad_norm": 3.2512216567993164,
"learning_rate": 1.2615033716779057e-05,
"loss": 1.0336,
"step": 37700
},
{
"epoch": 14.9940499801666,
"grad_norm": 3.3554651737213135,
"learning_rate": 1.2515866719555732e-05,
"loss": 1.0636,
"step": 37800
},
{
"epoch": 15.0,
"eval_loss": 1.051405906677246,
"eval_runtime": 31.6428,
"eval_samples_per_second": 47.973,
"eval_steps_per_second": 6.005,
"step": 37815
},
{
"epoch": 15.033716779055931,
"grad_norm": 3.6472902297973633,
"learning_rate": 1.241669972233241e-05,
"loss": 1.0596,
"step": 37900
},
{
"epoch": 15.07338357794526,
"grad_norm": 5.338723659515381,
"learning_rate": 1.2317532725109085e-05,
"loss": 1.0462,
"step": 38000
},
{
"epoch": 15.11305037683459,
"grad_norm": 4.401419639587402,
"learning_rate": 1.221836572788576e-05,
"loss": 1.0869,
"step": 38100
},
{
"epoch": 15.15271717572392,
"grad_norm": 9.426093101501465,
"learning_rate": 1.2119198730662435e-05,
"loss": 1.0198,
"step": 38200
},
{
"epoch": 15.192383974613248,
"grad_norm": 3.7169394493103027,
"learning_rate": 1.2020031733439112e-05,
"loss": 1.1285,
"step": 38300
},
{
"epoch": 15.232050773502579,
"grad_norm": 3.466498851776123,
"learning_rate": 1.1920864736215787e-05,
"loss": 1.0125,
"step": 38400
},
{
"epoch": 15.271717572391909,
"grad_norm": 2.7933382987976074,
"learning_rate": 1.1821697738992464e-05,
"loss": 1.0545,
"step": 38500
},
{
"epoch": 15.311384371281237,
"grad_norm": 2.926934003829956,
"learning_rate": 1.172253074176914e-05,
"loss": 1.1035,
"step": 38600
},
{
"epoch": 15.351051170170567,
"grad_norm": 3.2757022380828857,
"learning_rate": 1.1623363744545816e-05,
"loss": 1.0479,
"step": 38700
},
{
"epoch": 15.390717969059898,
"grad_norm": 4.160761833190918,
"learning_rate": 1.1524196747322492e-05,
"loss": 1.064,
"step": 38800
},
{
"epoch": 15.430384767949226,
"grad_norm": 3.412480592727661,
"learning_rate": 1.1425029750099167e-05,
"loss": 0.9485,
"step": 38900
},
{
"epoch": 15.470051566838556,
"grad_norm": 3.1907808780670166,
"learning_rate": 1.1325862752875844e-05,
"loss": 1.0605,
"step": 39000
},
{
"epoch": 15.509718365727887,
"grad_norm": 4.184901714324951,
"learning_rate": 1.1226695755652519e-05,
"loss": 1.0551,
"step": 39100
},
{
"epoch": 15.549385164617215,
"grad_norm": 4.784205436706543,
"learning_rate": 1.1127528758429196e-05,
"loss": 0.9941,
"step": 39200
},
{
"epoch": 15.589051963506545,
"grad_norm": 4.00923490524292,
"learning_rate": 1.1028361761205871e-05,
"loss": 1.076,
"step": 39300
},
{
"epoch": 15.628718762395875,
"grad_norm": 4.559725284576416,
"learning_rate": 1.0929194763982548e-05,
"loss": 0.9979,
"step": 39400
},
{
"epoch": 15.668385561285204,
"grad_norm": 3.8985109329223633,
"learning_rate": 1.0830027766759223e-05,
"loss": 1.0397,
"step": 39500
},
{
"epoch": 15.708052360174534,
"grad_norm": 3.3521323204040527,
"learning_rate": 1.0730860769535899e-05,
"loss": 1.006,
"step": 39600
},
{
"epoch": 15.747719159063864,
"grad_norm": 3.2745351791381836,
"learning_rate": 1.0631693772312574e-05,
"loss": 1.0642,
"step": 39700
},
{
"epoch": 15.787385957953193,
"grad_norm": 3.955242156982422,
"learning_rate": 1.053252677508925e-05,
"loss": 1.0455,
"step": 39800
},
{
"epoch": 15.827052756842523,
"grad_norm": 3.2223598957061768,
"learning_rate": 1.0433359777865926e-05,
"loss": 1.0675,
"step": 39900
},
{
"epoch": 15.866719555731853,
"grad_norm": 4.809605121612549,
"learning_rate": 1.0334192780642603e-05,
"loss": 1.0992,
"step": 40000
},
{
"epoch": 15.906386354621182,
"grad_norm": 2.6435019969940186,
"learning_rate": 1.0235025783419278e-05,
"loss": 0.9905,
"step": 40100
},
{
"epoch": 15.946053153510512,
"grad_norm": 6.68290376663208,
"learning_rate": 1.0135858786195955e-05,
"loss": 1.0951,
"step": 40200
},
{
"epoch": 15.985719952399842,
"grad_norm": 2.6426591873168945,
"learning_rate": 1.003669178897263e-05,
"loss": 1.073,
"step": 40300
},
{
"epoch": 16.0,
"eval_loss": 1.039953351020813,
"eval_runtime": 31.6995,
"eval_samples_per_second": 47.887,
"eval_steps_per_second": 5.994,
"step": 40336
},
{
"epoch": 16.025386751289172,
"grad_norm": 3.456146001815796,
"learning_rate": 9.937524791749306e-06,
"loss": 1.0191,
"step": 40400
},
{
"epoch": 16.0650535501785,
"grad_norm": 5.939918518066406,
"learning_rate": 9.838357794525983e-06,
"loss": 1.0433,
"step": 40500
},
{
"epoch": 16.10472034906783,
"grad_norm": 3.538282871246338,
"learning_rate": 9.739190797302658e-06,
"loss": 1.0295,
"step": 40600
},
{
"epoch": 16.14438714795716,
"grad_norm": 4.2307844161987305,
"learning_rate": 9.640023800079335e-06,
"loss": 1.0371,
"step": 40700
},
{
"epoch": 16.18405394684649,
"grad_norm": 4.40711784362793,
"learning_rate": 9.54085680285601e-06,
"loss": 1.0236,
"step": 40800
},
{
"epoch": 16.223720745735818,
"grad_norm": 3.8492507934570312,
"learning_rate": 9.441689805632687e-06,
"loss": 1.0628,
"step": 40900
},
{
"epoch": 16.26338754462515,
"grad_norm": 4.397724628448486,
"learning_rate": 9.342522808409362e-06,
"loss": 1.0072,
"step": 41000
},
{
"epoch": 16.30305434351448,
"grad_norm": 3.3145904541015625,
"learning_rate": 9.243355811186037e-06,
"loss": 1.045,
"step": 41100
},
{
"epoch": 16.342721142403807,
"grad_norm": 5.359413146972656,
"learning_rate": 9.144188813962713e-06,
"loss": 1.0299,
"step": 41200
},
{
"epoch": 16.38238794129314,
"grad_norm": 3.4849679470062256,
"learning_rate": 9.04502181673939e-06,
"loss": 1.012,
"step": 41300
},
{
"epoch": 16.422054740182467,
"grad_norm": 2.9378600120544434,
"learning_rate": 8.945854819516065e-06,
"loss": 1.0269,
"step": 41400
},
{
"epoch": 16.461721539071796,
"grad_norm": 3.024475574493408,
"learning_rate": 8.846687822292742e-06,
"loss": 1.0373,
"step": 41500
},
{
"epoch": 16.501388337961128,
"grad_norm": 3.2381701469421387,
"learning_rate": 8.747520825069417e-06,
"loss": 0.9888,
"step": 41600
},
{
"epoch": 16.541055136850456,
"grad_norm": 3.816202163696289,
"learning_rate": 8.648353827846094e-06,
"loss": 0.9384,
"step": 41700
},
{
"epoch": 16.580721935739785,
"grad_norm": 4.290541648864746,
"learning_rate": 8.54918683062277e-06,
"loss": 1.0653,
"step": 41800
},
{
"epoch": 16.620388734629117,
"grad_norm": 4.712522029876709,
"learning_rate": 8.450019833399444e-06,
"loss": 0.9951,
"step": 41900
},
{
"epoch": 16.660055533518445,
"grad_norm": 3.3500611782073975,
"learning_rate": 8.350852836176121e-06,
"loss": 1.0356,
"step": 42000
},
{
"epoch": 16.699722332407774,
"grad_norm": 3.6570308208465576,
"learning_rate": 8.251685838952797e-06,
"loss": 1.0205,
"step": 42100
},
{
"epoch": 16.739389131297106,
"grad_norm": 3.4734184741973877,
"learning_rate": 8.152518841729474e-06,
"loss": 1.037,
"step": 42200
},
{
"epoch": 16.779055930186434,
"grad_norm": 3.528817653656006,
"learning_rate": 8.053351844506149e-06,
"loss": 0.9402,
"step": 42300
},
{
"epoch": 16.818722729075763,
"grad_norm": 4.3084025382995605,
"learning_rate": 7.954184847282826e-06,
"loss": 1.0702,
"step": 42400
},
{
"epoch": 16.858389527965095,
"grad_norm": 3.520242214202881,
"learning_rate": 7.855017850059501e-06,
"loss": 1.0474,
"step": 42500
},
{
"epoch": 16.898056326854423,
"grad_norm": 4.44198751449585,
"learning_rate": 7.755850852836176e-06,
"loss": 1.0506,
"step": 42600
},
{
"epoch": 16.93772312574375,
"grad_norm": 2.8113813400268555,
"learning_rate": 7.656683855612852e-06,
"loss": 1.0167,
"step": 42700
},
{
"epoch": 16.977389924633083,
"grad_norm": 3.3131535053253174,
"learning_rate": 7.5575168583895284e-06,
"loss": 1.0077,
"step": 42800
},
{
"epoch": 17.0,
"eval_loss": 1.0288244485855103,
"eval_runtime": 31.6731,
"eval_samples_per_second": 47.927,
"eval_steps_per_second": 5.999,
"step": 42857
},
{
"epoch": 17.017056723522412,
"grad_norm": 5.444199562072754,
"learning_rate": 7.4583498611662045e-06,
"loss": 0.9996,
"step": 42900
},
{
"epoch": 17.05672352241174,
"grad_norm": 4.1272783279418945,
"learning_rate": 7.359182863942881e-06,
"loss": 1.0256,
"step": 43000
},
{
"epoch": 17.096390321301072,
"grad_norm": 4.819570064544678,
"learning_rate": 7.260015866719557e-06,
"loss": 1.0325,
"step": 43100
},
{
"epoch": 17.1360571201904,
"grad_norm": 4.795453071594238,
"learning_rate": 7.160848869496233e-06,
"loss": 0.9845,
"step": 43200
},
{
"epoch": 17.17572391907973,
"grad_norm": 5.2741827964782715,
"learning_rate": 7.061681872272907e-06,
"loss": 1.0406,
"step": 43300
},
{
"epoch": 17.21539071796906,
"grad_norm": 5.457202911376953,
"learning_rate": 6.962514875049583e-06,
"loss": 1.0704,
"step": 43400
},
{
"epoch": 17.25505751685839,
"grad_norm": 6.256078243255615,
"learning_rate": 6.863347877826259e-06,
"loss": 1.0182,
"step": 43500
},
{
"epoch": 17.294724315747718,
"grad_norm": 3.9407060146331787,
"learning_rate": 6.7641808806029355e-06,
"loss": 0.9889,
"step": 43600
},
{
"epoch": 17.33439111463705,
"grad_norm": 3.250436782836914,
"learning_rate": 6.6650138833796116e-06,
"loss": 1.0079,
"step": 43700
},
{
"epoch": 17.37405791352638,
"grad_norm": 2.7779972553253174,
"learning_rate": 6.565846886156288e-06,
"loss": 1.0134,
"step": 43800
},
{
"epoch": 17.413724712415707,
"grad_norm": 4.296668529510498,
"learning_rate": 6.466679888932964e-06,
"loss": 0.9585,
"step": 43900
},
{
"epoch": 17.45339151130504,
"grad_norm": 3.737541437149048,
"learning_rate": 6.36751289170964e-06,
"loss": 1.0307,
"step": 44000
},
{
"epoch": 17.493058310194368,
"grad_norm": 5.0776848793029785,
"learning_rate": 6.268345894486314e-06,
"loss": 1.0395,
"step": 44100
},
{
"epoch": 17.532725109083696,
"grad_norm": 6.334095001220703,
"learning_rate": 6.169178897262991e-06,
"loss": 0.9772,
"step": 44200
},
{
"epoch": 17.572391907973028,
"grad_norm": 5.443525314331055,
"learning_rate": 6.070011900039667e-06,
"loss": 0.9264,
"step": 44300
},
{
"epoch": 17.612058706862356,
"grad_norm": 4.61970853805542,
"learning_rate": 5.970844902816343e-06,
"loss": 1.0307,
"step": 44400
},
{
"epoch": 17.651725505751685,
"grad_norm": 3.089509963989258,
"learning_rate": 5.8716779055930195e-06,
"loss": 0.9633,
"step": 44500
},
{
"epoch": 17.691392304641017,
"grad_norm": 4.635293006896973,
"learning_rate": 5.7725109083696955e-06,
"loss": 1.0537,
"step": 44600
},
{
"epoch": 17.731059103530345,
"grad_norm": 3.052475929260254,
"learning_rate": 5.673343911146371e-06,
"loss": 0.9983,
"step": 44700
},
{
"epoch": 17.770725902419674,
"grad_norm": 3.9765052795410156,
"learning_rate": 5.574176913923047e-06,
"loss": 1.0687,
"step": 44800
},
{
"epoch": 17.810392701309006,
"grad_norm": 4.3488030433654785,
"learning_rate": 5.475009916699723e-06,
"loss": 1.012,
"step": 44900
},
{
"epoch": 17.850059500198334,
"grad_norm": 3.6032917499542236,
"learning_rate": 5.375842919476398e-06,
"loss": 0.9933,
"step": 45000
},
{
"epoch": 17.889726299087663,
"grad_norm": 3.2621772289276123,
"learning_rate": 5.276675922253074e-06,
"loss": 0.9657,
"step": 45100
},
{
"epoch": 17.929393097976995,
"grad_norm": 3.9976959228515625,
"learning_rate": 5.17750892502975e-06,
"loss": 0.9799,
"step": 45200
},
{
"epoch": 17.969059896866323,
"grad_norm": 4.725791931152344,
"learning_rate": 5.0783419278064265e-06,
"loss": 1.043,
"step": 45300
},
{
"epoch": 18.0,
"eval_loss": 1.0184741020202637,
"eval_runtime": 31.6571,
"eval_samples_per_second": 47.951,
"eval_steps_per_second": 6.002,
"step": 45378
},
{
"epoch": 18.00872669575565,
"grad_norm": 3.418588876724243,
"learning_rate": 4.979174930583102e-06,
"loss": 0.9965,
"step": 45400
},
{
"epoch": 18.048393494644984,
"grad_norm": 4.835160255432129,
"learning_rate": 4.880007933359778e-06,
"loss": 0.9789,
"step": 45500
},
{
"epoch": 18.088060293534312,
"grad_norm": 4.275815486907959,
"learning_rate": 4.780840936136454e-06,
"loss": 1.0233,
"step": 45600
},
{
"epoch": 18.12772709242364,
"grad_norm": 4.429009914398193,
"learning_rate": 4.68167393891313e-06,
"loss": 1.0487,
"step": 45700
},
{
"epoch": 18.167393891312972,
"grad_norm": 4.390066146850586,
"learning_rate": 4.582506941689805e-06,
"loss": 0.968,
"step": 45800
},
{
"epoch": 18.2070606902023,
"grad_norm": 3.265092372894287,
"learning_rate": 4.483339944466481e-06,
"loss": 1.0171,
"step": 45900
},
{
"epoch": 18.24672748909163,
"grad_norm": 4.843317031860352,
"learning_rate": 4.3841729472431574e-06,
"loss": 1.0204,
"step": 46000
},
{
"epoch": 18.28639428798096,
"grad_norm": 4.457988262176514,
"learning_rate": 4.2850059500198335e-06,
"loss": 1.0246,
"step": 46100
},
{
"epoch": 18.32606108687029,
"grad_norm": 3.9527127742767334,
"learning_rate": 4.18583895279651e-06,
"loss": 0.9241,
"step": 46200
},
{
"epoch": 18.36572788575962,
"grad_norm": 3.7694692611694336,
"learning_rate": 4.086671955573186e-06,
"loss": 1.0318,
"step": 46300
},
{
"epoch": 18.40539468464895,
"grad_norm": 5.390737533569336,
"learning_rate": 3.987504958349862e-06,
"loss": 0.9938,
"step": 46400
},
{
"epoch": 18.44506148353828,
"grad_norm": 3.8084776401519775,
"learning_rate": 3.888337961126538e-06,
"loss": 0.9652,
"step": 46500
},
{
"epoch": 18.484728282427607,
"grad_norm": 3.5767834186553955,
"learning_rate": 3.789170963903213e-06,
"loss": 0.9582,
"step": 46600
},
{
"epoch": 18.52439508131694,
"grad_norm": 3.4777605533599854,
"learning_rate": 3.6900039666798892e-06,
"loss": 0.9981,
"step": 46700
},
{
"epoch": 18.564061880206268,
"grad_norm": 4.1490092277526855,
"learning_rate": 3.5908369694565653e-06,
"loss": 0.9607,
"step": 46800
},
{
"epoch": 18.603728679095596,
"grad_norm": 4.089176654815674,
"learning_rate": 3.4916699722332406e-06,
"loss": 1.0168,
"step": 46900
},
{
"epoch": 18.643395477984928,
"grad_norm": 3.9602725505828857,
"learning_rate": 3.3925029750099167e-06,
"loss": 0.9785,
"step": 47000
},
{
"epoch": 18.683062276874256,
"grad_norm": 4.800217628479004,
"learning_rate": 3.2933359777865927e-06,
"loss": 1.0514,
"step": 47100
},
{
"epoch": 18.722729075763585,
"grad_norm": 4.848387718200684,
"learning_rate": 3.194168980563269e-06,
"loss": 0.9798,
"step": 47200
},
{
"epoch": 18.762395874652917,
"grad_norm": 3.5444610118865967,
"learning_rate": 3.0950019833399445e-06,
"loss": 1.0602,
"step": 47300
},
{
"epoch": 18.802062673542245,
"grad_norm": 3.4162533283233643,
"learning_rate": 2.9958349861166206e-06,
"loss": 0.9881,
"step": 47400
},
{
"epoch": 18.841729472431574,
"grad_norm": 4.719314098358154,
"learning_rate": 2.8966679888932967e-06,
"loss": 0.9503,
"step": 47500
},
{
"epoch": 18.881396271320906,
"grad_norm": 5.332608222961426,
"learning_rate": 2.7975009916699724e-06,
"loss": 1.0245,
"step": 47600
},
{
"epoch": 18.921063070210234,
"grad_norm": 5.230047702789307,
"learning_rate": 2.6983339944466484e-06,
"loss": 0.9947,
"step": 47700
},
{
"epoch": 18.960729869099563,
"grad_norm": 3.1582813262939453,
"learning_rate": 2.599166997223324e-06,
"loss": 1.0198,
"step": 47800
},
{
"epoch": 19.0,
"eval_loss": 1.017343521118164,
"eval_runtime": 31.7039,
"eval_samples_per_second": 47.881,
"eval_steps_per_second": 5.993,
"step": 47899
},
{
"epoch": 19.000396667988895,
"grad_norm": 5.45066499710083,
"learning_rate": 2.5e-06,
"loss": 0.9753,
"step": 47900
},
{
"epoch": 19.040063466878223,
"grad_norm": 3.2004072666168213,
"learning_rate": 2.400833002776676e-06,
"loss": 0.946,
"step": 48000
},
{
"epoch": 19.07973026576755,
"grad_norm": 3.971540689468384,
"learning_rate": 2.301666005553352e-06,
"loss": 0.9783,
"step": 48100
},
{
"epoch": 19.119397064656884,
"grad_norm": 4.348784923553467,
"learning_rate": 2.202499008330028e-06,
"loss": 0.952,
"step": 48200
},
{
"epoch": 19.159063863546212,
"grad_norm": 3.7044036388397217,
"learning_rate": 2.1033320111067037e-06,
"loss": 1.0662,
"step": 48300
},
{
"epoch": 19.19873066243554,
"grad_norm": 2.662105083465576,
"learning_rate": 2.00416501388338e-06,
"loss": 0.9647,
"step": 48400
},
{
"epoch": 19.238397461324872,
"grad_norm": 4.103559494018555,
"learning_rate": 1.9049980166600555e-06,
"loss": 0.9142,
"step": 48500
},
{
"epoch": 19.2780642602142,
"grad_norm": 2.8791961669921875,
"learning_rate": 1.8058310194367316e-06,
"loss": 1.0167,
"step": 48600
},
{
"epoch": 19.31773105910353,
"grad_norm": 2.689680576324463,
"learning_rate": 1.7066640222134072e-06,
"loss": 1.0012,
"step": 48700
},
{
"epoch": 19.35739785799286,
"grad_norm": 3.3067831993103027,
"learning_rate": 1.6074970249900833e-06,
"loss": 0.967,
"step": 48800
},
{
"epoch": 19.39706465688219,
"grad_norm": 3.9777708053588867,
"learning_rate": 1.5083300277667594e-06,
"loss": 0.9081,
"step": 48900
},
{
"epoch": 19.43673145577152,
"grad_norm": 3.582973003387451,
"learning_rate": 1.4091630305434353e-06,
"loss": 1.0104,
"step": 49000
},
{
"epoch": 19.47639825466085,
"grad_norm": 5.202731132507324,
"learning_rate": 1.309996033320111e-06,
"loss": 0.9648,
"step": 49100
},
{
"epoch": 19.51606505355018,
"grad_norm": 3.264211893081665,
"learning_rate": 1.210829036096787e-06,
"loss": 0.9599,
"step": 49200
},
{
"epoch": 19.555731852439507,
"grad_norm": 4.432053565979004,
"learning_rate": 1.111662038873463e-06,
"loss": 0.9935,
"step": 49300
},
{
"epoch": 19.59539865132884,
"grad_norm": 3.386671781539917,
"learning_rate": 1.0124950416501388e-06,
"loss": 1.002,
"step": 49400
},
{
"epoch": 19.635065450218168,
"grad_norm": 4.273075103759766,
"learning_rate": 9.133280444268148e-07,
"loss": 1.0225,
"step": 49500
},
{
"epoch": 19.674732249107496,
"grad_norm": 3.5673136711120605,
"learning_rate": 8.141610472034907e-07,
"loss": 1.0149,
"step": 49600
},
{
"epoch": 19.714399047996828,
"grad_norm": 3.68278431892395,
"learning_rate": 7.149940499801666e-07,
"loss": 0.996,
"step": 49700
},
{
"epoch": 19.754065846886157,
"grad_norm": 4.8836870193481445,
"learning_rate": 6.158270527568425e-07,
"loss": 1.0097,
"step": 49800
},
{
"epoch": 19.793732645775485,
"grad_norm": 3.579880475997925,
"learning_rate": 5.166600555335184e-07,
"loss": 0.9482,
"step": 49900
},
{
"epoch": 19.833399444664817,
"grad_norm": 2.7329444885253906,
"learning_rate": 4.174930583101944e-07,
"loss": 1.0365,
"step": 50000
},
{
"epoch": 19.873066243554145,
"grad_norm": 5.478430271148682,
"learning_rate": 3.1832606108687035e-07,
"loss": 1.0543,
"step": 50100
},
{
"epoch": 19.912733042443474,
"grad_norm": 3.1377158164978027,
"learning_rate": 2.191590638635462e-07,
"loss": 0.9637,
"step": 50200
},
{
"epoch": 19.952399841332806,
"grad_norm": 3.789954662322998,
"learning_rate": 1.1999206664022213e-07,
"loss": 0.9847,
"step": 50300
},
{
"epoch": 19.992066640222134,
"grad_norm": 4.29661226272583,
"learning_rate": 2.0825069416898058e-08,
"loss": 1.0306,
"step": 50400
},
{
"epoch": 20.0,
"eval_loss": 1.0158944129943848,
"eval_runtime": 31.6451,
"eval_samples_per_second": 47.97,
"eval_steps_per_second": 6.004,
"step": 50420
}
],
"logging_steps": 100,
"max_steps": 50420,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.902420484390912e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}