{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9999432860086583,
"eval_steps": 500,
"global_step": 52896,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00945233189027733,
"grad_norm": 29.5,
"learning_rate": 9.960000000000001e-06,
"loss": 4.1004,
"step": 250
},
{
"epoch": 0.01890466378055466,
"grad_norm": 20.125,
"learning_rate": 9.999448050049255e-06,
"loss": 4.0386,
"step": 500
},
{
"epoch": 0.028356995670831994,
"grad_norm": 16.625,
"learning_rate": 9.997783447634044e-06,
"loss": 4.1263,
"step": 750
},
{
"epoch": 0.03780932756110932,
"grad_norm": 18.5,
"learning_rate": 9.995006554320588e-06,
"loss": 3.9826,
"step": 1000
},
{
"epoch": 0.047261659451386655,
"grad_norm": 18.625,
"learning_rate": 9.991117988125487e-06,
"loss": 4.0093,
"step": 1250
},
{
"epoch": 0.05671399134166399,
"grad_norm": 12.375,
"learning_rate": 9.986118614475757e-06,
"loss": 3.9503,
"step": 1500
},
{
"epoch": 0.06616632323194133,
"grad_norm": 15.8125,
"learning_rate": 9.980009546016204e-06,
"loss": 4.0245,
"step": 1750
},
{
"epoch": 0.07561865512221864,
"grad_norm": 17.5,
"learning_rate": 9.972792142361807e-06,
"loss": 3.9901,
"step": 2000
},
{
"epoch": 0.08507098701249598,
"grad_norm": 22.75,
"learning_rate": 9.964468009795128e-06,
"loss": 3.9098,
"step": 2250
},
{
"epoch": 0.09452331890277331,
"grad_norm": 19.5,
"learning_rate": 9.95503900090882e-06,
"loss": 3.9788,
"step": 2500
},
{
"epoch": 0.10397565079305064,
"grad_norm": 17.375,
"learning_rate": 9.944507214193314e-06,
"loss": 4.0492,
"step": 2750
},
{
"epoch": 0.11342798268332797,
"grad_norm": 16.75,
"learning_rate": 9.932874993569803e-06,
"loss": 3.9152,
"step": 3000
},
{
"epoch": 0.1228803145736053,
"grad_norm": 19.625,
"learning_rate": 9.92014492786856e-06,
"loss": 4.0289,
"step": 3250
},
{
"epoch": 0.13233264646388265,
"grad_norm": 26.75,
"learning_rate": 9.906319850252806e-06,
"loss": 3.9419,
"step": 3500
},
{
"epoch": 0.14178497835415999,
"grad_norm": 17.875,
"learning_rate": 9.891402837588142e-06,
"loss": 3.9255,
"step": 3750
},
{
"epoch": 0.1512373102444373,
"grad_norm": 17.0,
"learning_rate": 9.875397209757793e-06,
"loss": 4.068,
"step": 4000
},
{
"epoch": 0.16068964213471462,
"grad_norm": 22.375,
"learning_rate": 9.858306528923734e-06,
"loss": 3.9229,
"step": 4250
},
{
"epoch": 0.17014197402499195,
"grad_norm": 15.8125,
"learning_rate": 9.840134598733906e-06,
"loss": 3.8975,
"step": 4500
},
{
"epoch": 0.1795943059152693,
"grad_norm": 17.875,
"learning_rate": 9.8208854634757e-06,
"loss": 3.9159,
"step": 4750
},
{
"epoch": 0.18904663780554662,
"grad_norm": 19.0,
"learning_rate": 9.800563407175856e-06,
"loss": 3.9892,
"step": 5000
},
{
"epoch": 0.19849896969582395,
"grad_norm": 24.0,
"learning_rate": 9.779172952647035e-06,
"loss": 3.9846,
"step": 5250
},
{
"epoch": 0.20795130158610128,
"grad_norm": 15.9375,
"learning_rate": 9.756718860481235e-06,
"loss": 4.0746,
"step": 5500
},
{
"epoch": 0.21740363347637862,
"grad_norm": 19.25,
"learning_rate": 9.733206127990285e-06,
"loss": 3.9736,
"step": 5750
},
{
"epoch": 0.22685596536665595,
"grad_norm": 17.25,
"learning_rate": 9.708639988093663e-06,
"loss": 3.9673,
"step": 6000
},
{
"epoch": 0.23630829725693328,
"grad_norm": 17.375,
"learning_rate": 9.683025908153868e-06,
"loss": 3.9672,
"step": 6250
},
{
"epoch": 0.2457606291472106,
"grad_norm": 17.375,
"learning_rate": 9.656369588759628e-06,
"loss": 3.9812,
"step": 6500
},
{
"epoch": 0.255212961037488,
"grad_norm": 17.625,
"learning_rate": 9.628676962457194e-06,
"loss": 3.9659,
"step": 6750
},
{
"epoch": 0.2646652929277653,
"grad_norm": 18.0,
"learning_rate": 9.599954192430004e-06,
"loss": 3.9614,
"step": 7000
},
{
"epoch": 0.27411762481804264,
"grad_norm": 17.125,
"learning_rate": 9.570207671127034e-06,
"loss": 3.9424,
"step": 7250
},
{
"epoch": 0.28356995670831997,
"grad_norm": 24.125,
"learning_rate": 9.539444018840107e-06,
"loss": 3.9533,
"step": 7500
},
{
"epoch": 0.29302228859859725,
"grad_norm": 20.375,
"learning_rate": 9.507670082230507e-06,
"loss": 4.0344,
"step": 7750
},
{
"epoch": 0.3024746204888746,
"grad_norm": 18.375,
"learning_rate": 9.474892932805209e-06,
"loss": 3.8986,
"step": 8000
},
{
"epoch": 0.3119269523791519,
"grad_norm": 19.5,
"learning_rate": 9.441119865343054e-06,
"loss": 3.9415,
"step": 8250
},
{
"epoch": 0.32137928426942924,
"grad_norm": 20.125,
"learning_rate": 9.406358396271266e-06,
"loss": 3.9542,
"step": 8500
},
{
"epoch": 0.3308316161597066,
"grad_norm": 25.375,
"learning_rate": 9.370616261992605e-06,
"loss": 4.0098,
"step": 8750
},
{
"epoch": 0.3402839480499839,
"grad_norm": 19.625,
"learning_rate": 9.33390141716358e-06,
"loss": 3.9839,
"step": 9000
},
{
"epoch": 0.34973627994026124,
"grad_norm": 19.5,
"learning_rate": 9.296222032924092e-06,
"loss": 3.9886,
"step": 9250
},
{
"epoch": 0.3591886118305386,
"grad_norm": 16.75,
"learning_rate": 9.257586495078882e-06,
"loss": 3.8992,
"step": 9500
},
{
"epoch": 0.3686409437208159,
"grad_norm": 17.5,
"learning_rate": 9.21800340223122e-06,
"loss": 4.0108,
"step": 9750
},
{
"epoch": 0.37809327561109324,
"grad_norm": 17.375,
"learning_rate": 9.177481563869226e-06,
"loss": 3.9957,
"step": 10000
},
{
"epoch": 0.38754560750137057,
"grad_norm": 19.625,
"learning_rate": 9.136029998405253e-06,
"loss": 3.966,
"step": 10250
},
{
"epoch": 0.3969979393916479,
"grad_norm": 20.125,
"learning_rate": 9.093657931168782e-06,
"loss": 4.0057,
"step": 10500
},
{
"epoch": 0.40645027128192524,
"grad_norm": 25.5,
"learning_rate": 9.050374792353265e-06,
"loss": 4.0049,
"step": 10750
},
{
"epoch": 0.41590260317220257,
"grad_norm": 17.5,
"learning_rate": 9.006190214917363e-06,
"loss": 4.046,
"step": 11000
},
{
"epoch": 0.4253549350624799,
"grad_norm": 15.25,
"learning_rate": 8.961114032441067e-06,
"loss": 4.0138,
"step": 11250
},
{
"epoch": 0.43480726695275723,
"grad_norm": 58.5,
"learning_rate": 8.915156276937175e-06,
"loss": 4.0145,
"step": 11500
},
{
"epoch": 0.44425959884303456,
"grad_norm": 18.25,
"learning_rate": 8.868327176618592e-06,
"loss": 3.9748,
"step": 11750
},
{
"epoch": 0.4537119307333119,
"grad_norm": 17.0,
"learning_rate": 8.82063715362197e-06,
"loss": 4.0039,
"step": 12000
},
{
"epoch": 0.46316426262358923,
"grad_norm": 17.875,
"learning_rate": 8.772096821688194e-06,
"loss": 4.1231,
"step": 12250
},
{
"epoch": 0.47261659451386656,
"grad_norm": 19.25,
"learning_rate": 8.722716983800226e-06,
"loss": 4.0778,
"step": 12500
},
{
"epoch": 0.4820689264041439,
"grad_norm": 14.0625,
"learning_rate": 8.672508629778809e-06,
"loss": 3.9998,
"step": 12750
},
{
"epoch": 0.4915212582944212,
"grad_norm": 20.0,
"learning_rate": 8.621482933836634e-06,
"loss": 4.0298,
"step": 13000
},
{
"epoch": 0.5009735901846986,
"grad_norm": 21.5,
"learning_rate": 8.569651252091418e-06,
"loss": 3.9807,
"step": 13250
},
{
"epoch": 0.510425922074976,
"grad_norm": 17.75,
"learning_rate": 8.517025120038536e-06,
"loss": 4.084,
"step": 13500
},
{
"epoch": 0.5198782539652532,
"grad_norm": 19.0,
"learning_rate": 8.463616249983718e-06,
"loss": 4.1373,
"step": 13750
},
{
"epoch": 0.5293305858555306,
"grad_norm": 19.75,
"learning_rate": 8.409436528436381e-06,
"loss": 4.0691,
"step": 14000
},
{
"epoch": 0.5387829177458079,
"grad_norm": 17.25,
"learning_rate": 8.354498013464228e-06,
"loss": 4.0686,
"step": 14250
},
{
"epoch": 0.5482352496360853,
"grad_norm": 16.875,
"learning_rate": 8.298812932009622e-06,
"loss": 4.0066,
"step": 14500
},
{
"epoch": 0.5576875815263626,
"grad_norm": 19.625,
"learning_rate": 8.242393677168406e-06,
"loss": 4.0525,
"step": 14750
},
{
"epoch": 0.5671399134166399,
"grad_norm": 20.5,
"learning_rate": 8.185252805431732e-06,
"loss": 4.0993,
"step": 15000
},
{
"epoch": 0.5765922453069172,
"grad_norm": 20.375,
"learning_rate": 8.127403033891532e-06,
"loss": 4.0902,
"step": 15250
},
{
"epoch": 0.5860445771971945,
"grad_norm": 21.125,
"learning_rate": 8.068857237410237e-06,
"loss": 4.0273,
"step": 15500
},
{
"epoch": 0.5954969090874719,
"grad_norm": 19.875,
"learning_rate": 8.00962844575539e-06,
"loss": 4.0831,
"step": 15750
},
{
"epoch": 0.6049492409777492,
"grad_norm": 243.0,
"learning_rate": 7.949729840699784e-06,
"loss": 4.0758,
"step": 16000
},
{
"epoch": 0.6144015728680265,
"grad_norm": 17.875,
"learning_rate": 7.889174753087767e-06,
"loss": 4.0918,
"step": 16250
},
{
"epoch": 0.6238539047583038,
"grad_norm": 19.125,
"learning_rate": 7.827976659868368e-06,
"loss": 4.0538,
"step": 16500
},
{
"epoch": 0.6333062366485812,
"grad_norm": 19.625,
"learning_rate": 7.766149181095916e-06,
"loss": 4.1164,
"step": 16750
},
{
"epoch": 0.6427585685388585,
"grad_norm": 16.5,
"learning_rate": 7.703706076898803e-06,
"loss": 4.0626,
"step": 17000
},
{
"epoch": 0.6522109004291359,
"grad_norm": 14.375,
"learning_rate": 7.640661244417064e-06,
"loss": 4.0444,
"step": 17250
},
{
"epoch": 0.6616632323194132,
"grad_norm": 19.0,
"learning_rate": 7.577028714709484e-06,
"loss": 4.0429,
"step": 17500
},
{
"epoch": 0.6711155642096905,
"grad_norm": 17.875,
"learning_rate": 7.512822649630893e-06,
"loss": 4.0362,
"step": 17750
},
{
"epoch": 0.6805678960999678,
"grad_norm": 15.625,
"learning_rate": 7.44805733868033e-06,
"loss": 4.0806,
"step": 18000
},
{
"epoch": 0.6900202279902452,
"grad_norm": 17.625,
"learning_rate": 7.382747195820834e-06,
"loss": 4.0933,
"step": 18250
},
{
"epoch": 0.6994725598805225,
"grad_norm": 16.875,
"learning_rate": 7.316906756271515e-06,
"loss": 4.0495,
"step": 18500
},
{
"epoch": 0.7089248917707999,
"grad_norm": 20.375,
"learning_rate": 7.250550673272639e-06,
"loss": 4.0599,
"step": 18750
},
{
"epoch": 0.7183772236610771,
"grad_norm": 19.375,
"learning_rate": 7.1836937148244445e-06,
"loss": 4.0653,
"step": 19000
},
{
"epoch": 0.7278295555513545,
"grad_norm": 16.5,
"learning_rate": 7.1163507604004326e-06,
"loss": 4.0266,
"step": 19250
},
{
"epoch": 0.7372818874416318,
"grad_norm": 20.75,
"learning_rate": 7.048536797635832e-06,
"loss": 4.0484,
"step": 19500
},
{
"epoch": 0.7467342193319092,
"grad_norm": 18.25,
"learning_rate": 6.9802669189920005e-06,
"loss": 4.043,
"step": 19750
},
{
"epoch": 0.7561865512221865,
"grad_norm": 47.0,
"learning_rate": 6.911556318397493e-06,
"loss": 4.0716,
"step": 20000
},
{
"epoch": 0.7656388831124639,
"grad_norm": 15.625,
"learning_rate": 6.8424202878665515e-06,
"loss": 4.059,
"step": 20250
},
{
"epoch": 0.7750912150027411,
"grad_norm": 18.5,
"learning_rate": 6.772874214095761e-06,
"loss": 3.9974,
"step": 20500
},
{
"epoch": 0.7845435468930185,
"grad_norm": 16.0,
"learning_rate": 6.702933575039631e-06,
"loss": 4.0551,
"step": 20750
},
{
"epoch": 0.7939958787832958,
"grad_norm": 15.5,
"learning_rate": 6.6326139364658795e-06,
"loss": 4.1337,
"step": 21000
},
{
"epoch": 0.8034482106735732,
"grad_norm": 19.125,
"learning_rate": 6.561930948491155e-06,
"loss": 4.0849,
"step": 21250
},
{
"epoch": 0.8129005425638505,
"grad_norm": 19.125,
"learning_rate": 6.4909003420980065e-06,
"loss": 4.069,
"step": 21500
},
{
"epoch": 0.8223528744541279,
"grad_norm": 18.875,
"learning_rate": 6.419537925633836e-06,
"loss": 4.0218,
"step": 21750
},
{
"epoch": 0.8318052063444051,
"grad_norm": 17.125,
"learning_rate": 6.34785958129265e-06,
"loss": 3.9901,
"step": 22000
},
{
"epoch": 0.8412575382346825,
"grad_norm": 17.625,
"learning_rate": 6.275881261580363e-06,
"loss": 4.0088,
"step": 22250
},
{
"epoch": 0.8507098701249598,
"grad_norm": 17.0,
"learning_rate": 6.2036189857644616e-06,
"loss": 4.0448,
"step": 22500
},
{
"epoch": 0.8601622020152372,
"grad_norm": 15.8125,
"learning_rate": 6.131088836308805e-06,
"loss": 4.0443,
"step": 22750
},
{
"epoch": 0.8696145339055145,
"grad_norm": 22.875,
"learning_rate": 6.058306955294365e-06,
"loss": 4.0573,
"step": 23000
},
{
"epoch": 0.8790668657957919,
"grad_norm": 54.0,
"learning_rate": 5.9852895408266955e-06,
"loss": 4.0054,
"step": 23250
},
{
"epoch": 0.8885191976860691,
"grad_norm": 21.0,
"learning_rate": 5.9120528434309245e-06,
"loss": 4.0112,
"step": 23500
},
{
"epoch": 0.8979715295763465,
"grad_norm": 19.5,
"learning_rate": 5.838613162435106e-06,
"loss": 4.0095,
"step": 23750
},
{
"epoch": 0.9074238614666238,
"grad_norm": 22.375,
"learning_rate": 5.764986842342675e-06,
"loss": 3.9941,
"step": 24000
},
{
"epoch": 0.9168761933569012,
"grad_norm": 15.875,
"learning_rate": 5.6911902691948786e-06,
"loss": 3.9703,
"step": 24250
},
{
"epoch": 0.9263285252471785,
"grad_norm": 18.0,
"learning_rate": 5.617239866923945e-06,
"loss": 3.9949,
"step": 24500
},
{
"epoch": 0.9357808571374558,
"grad_norm": 18.25,
"learning_rate": 5.543152093697826e-06,
"loss": 4.0225,
"step": 24750
},
{
"epoch": 0.9452331890277331,
"grad_norm": 17.625,
"learning_rate": 5.4689434382573156e-06,
"loss": 3.998,
"step": 25000
},
{
"epoch": 0.9546855209180105,
"grad_norm": 16.5,
"learning_rate": 5.39463041624638e-06,
"loss": 3.9813,
"step": 25250
},
{
"epoch": 0.9641378528082878,
"grad_norm": 17.125,
"learning_rate": 5.320229566536474e-06,
"loss": 3.9089,
"step": 25500
},
{
"epoch": 0.9735901846985652,
"grad_norm": 17.875,
"learning_rate": 5.245757447545706e-06,
"loss": 4.0302,
"step": 25750
},
{
"epoch": 0.9830425165888425,
"grad_norm": 17.25,
"learning_rate": 5.171230633553656e-06,
"loss": 3.9841,
"step": 26000
},
{
"epoch": 0.9924948484791198,
"grad_norm": 16.125,
"learning_rate": 5.096665711012646e-06,
"loss": 3.9648,
"step": 26250
},
{
"epoch": 1.0019282757056165,
"grad_norm": 21.125,
"learning_rate": 5.0220792748563195e-06,
"loss": 3.8978,
"step": 26500
},
{
"epoch": 1.011380607595894,
"grad_norm": 25.75,
"learning_rate": 4.94748792480632e-06,
"loss": 3.4963,
"step": 26750
},
{
"epoch": 1.0208329394861713,
"grad_norm": 17.375,
"learning_rate": 4.872908261677911e-06,
"loss": 3.6178,
"step": 27000
},
{
"epoch": 1.0302852713764485,
"grad_norm": 25.375,
"learning_rate": 4.7983568836853564e-06,
"loss": 3.5309,
"step": 27250
},
{
"epoch": 1.0397376032667258,
"grad_norm": 26.625,
"learning_rate": 4.723850382747863e-06,
"loss": 3.4875,
"step": 27500
},
{
"epoch": 1.0491899351570033,
"grad_norm": 23.25,
"learning_rate": 4.649405340796947e-06,
"loss": 3.5433,
"step": 27750
},
{
"epoch": 1.0586422670472806,
"grad_norm": 17.125,
"learning_rate": 4.575038326086007e-06,
"loss": 3.5867,
"step": 28000
},
{
"epoch": 1.0680945989375579,
"grad_norm": 17.875,
"learning_rate": 4.500765889502937e-06,
"loss": 3.5986,
"step": 28250
},
{
"epoch": 1.0775469308278351,
"grad_norm": 18.0,
"learning_rate": 4.426604560886636e-06,
"loss": 3.5402,
"step": 28500
},
{
"epoch": 1.0869992627181126,
"grad_norm": 17.75,
"learning_rate": 4.3525708453481505e-06,
"loss": 3.5184,
"step": 28750
},
{
"epoch": 1.09645159460839,
"grad_norm": 19.875,
"learning_rate": 4.278681219597375e-06,
"loss": 3.5137,
"step": 29000
},
{
"epoch": 1.1059039264986672,
"grad_norm": 22.375,
"learning_rate": 4.204952128276027e-06,
"loss": 3.6344,
"step": 29250
},
{
"epoch": 1.1153562583889445,
"grad_norm": 19.25,
"learning_rate": 4.131399980297796e-06,
"loss": 3.5423,
"step": 29500
},
{
"epoch": 1.124808590279222,
"grad_norm": 24.25,
"learning_rate": 4.058041145196414e-06,
"loss": 3.5973,
"step": 29750
},
{
"epoch": 1.1342609221694993,
"grad_norm": 20.875,
"learning_rate": 3.98489194948251e-06,
"loss": 3.5454,
"step": 30000
},
{
"epoch": 1.1437132540597765,
"grad_norm": 19.25,
"learning_rate": 3.911968673010038e-06,
"loss": 3.5508,
"step": 30250
},
{
"epoch": 1.1531655859500538,
"grad_norm": 20.0,
"learning_rate": 3.839287545353076e-06,
"loss": 3.4747,
"step": 30500
},
{
"epoch": 1.1626179178403313,
"grad_norm": 21.25,
"learning_rate": 3.7668647421938275e-06,
"loss": 3.538,
"step": 30750
},
{
"epoch": 1.1720702497306086,
"grad_norm": 26.125,
"learning_rate": 3.694716381722609e-06,
"loss": 3.5677,
"step": 31000
},
{
"epoch": 1.1815225816208859,
"grad_norm": 25.75,
"learning_rate": 3.6228585210506427e-06,
"loss": 3.5912,
"step": 31250
},
{
"epoch": 1.1909749135111631,
"grad_norm": 17.5,
"learning_rate": 3.551307152636431e-06,
"loss": 3.5178,
"step": 31500
},
{
"epoch": 1.2004272454014404,
"grad_norm": 21.875,
"learning_rate": 3.4800782007265265e-06,
"loss": 3.5475,
"step": 31750
},
{
"epoch": 1.209879577291718,
"grad_norm": 28.0,
"learning_rate": 3.409187517811486e-06,
"loss": 3.5383,
"step": 32000
},
{
"epoch": 1.2193319091819952,
"grad_norm": 29.625,
"learning_rate": 3.3386508810977856e-06,
"loss": 3.5525,
"step": 32250
},
{
"epoch": 1.2287842410722725,
"grad_norm": 18.75,
"learning_rate": 3.2684839889964988e-06,
"loss": 3.5202,
"step": 32500
},
{
"epoch": 1.23823657296255,
"grad_norm": 24.0,
"learning_rate": 3.1987024576295012e-06,
"loss": 3.5926,
"step": 32750
},
{
"epoch": 1.2476889048528272,
"grad_norm": 21.0,
"learning_rate": 3.1293218173540074e-06,
"loss": 3.4852,
"step": 33000
},
{
"epoch": 1.2571412367431045,
"grad_norm": 20.125,
"learning_rate": 3.060357509306171e-06,
"loss": 3.4994,
"step": 33250
},
{
"epoch": 1.2665935686333818,
"grad_norm": 20.875,
"learning_rate": 2.9918248819645624e-06,
"loss": 3.566,
"step": 33500
},
{
"epoch": 1.276045900523659,
"grad_norm": 39.75,
"learning_rate": 2.923739187734258e-06,
"loss": 3.517,
"step": 33750
},
{
"epoch": 1.2854982324139366,
"grad_norm": 18.875,
"learning_rate": 2.8561155795523133e-06,
"loss": 3.4938,
"step": 34000
},
{
"epoch": 1.2949505643042138,
"grad_norm": 20.625,
"learning_rate": 2.788969107515369e-06,
"loss": 3.501,
"step": 34250
},
{
"epoch": 1.3044028961944911,
"grad_norm": 22.5,
"learning_rate": 2.722314715530156e-06,
"loss": 3.574,
"step": 34500
},
{
"epoch": 1.3138552280847686,
"grad_norm": 22.375,
"learning_rate": 2.6561672379876236e-06,
"loss": 3.4953,
"step": 34750
},
{
"epoch": 1.323307559975046,
"grad_norm": 20.375,
"learning_rate": 2.590541396461438e-06,
"loss": 3.4766,
"step": 35000
},
{
"epoch": 1.3327598918653232,
"grad_norm": 23.0,
"learning_rate": 2.5254517964316084e-06,
"loss": 3.4905,
"step": 35250
},
{
"epoch": 1.3422122237556005,
"grad_norm": 24.0,
"learning_rate": 2.4609129240339253e-06,
"loss": 3.5543,
"step": 35500
},
{
"epoch": 1.3516645556458777,
"grad_norm": 23.125,
"learning_rate": 2.39693914283598e-06,
"loss": 3.5577,
"step": 35750
},
{
"epoch": 1.3611168875361552,
"grad_norm": 19.125,
"learning_rate": 2.333544690640451e-06,
"loss": 3.515,
"step": 36000
},
{
"epoch": 1.3705692194264325,
"grad_norm": 22.625,
"learning_rate": 2.270743676316383e-06,
"loss": 3.506,
"step": 36250
},
{
"epoch": 1.3800215513167098,
"grad_norm": 18.625,
"learning_rate": 2.20855007665916e-06,
"loss": 3.5909,
"step": 36500
},
{
"epoch": 1.3894738832069873,
"grad_norm": 20.875,
"learning_rate": 2.1469777332798804e-06,
"loss": 3.5444,
"step": 36750
},
{
"epoch": 1.3989262150972646,
"grad_norm": 19.375,
"learning_rate": 2.086040349524807e-06,
"loss": 3.5885,
"step": 37000
},
{
"epoch": 1.4083785469875418,
"grad_norm": 20.5,
"learning_rate": 2.025751487425591e-06,
"loss": 3.5437,
"step": 37250
},
{
"epoch": 1.4178308788778191,
"grad_norm": 22.75,
"learning_rate": 1.9661245646809546e-06,
"loss": 3.5815,
"step": 37500
},
{
"epoch": 1.4272832107680964,
"grad_norm": 24.375,
"learning_rate": 1.9071728516704897e-06,
"loss": 3.5147,
"step": 37750
},
{
"epoch": 1.4367355426583739,
"grad_norm": 25.75,
"learning_rate": 1.8489094685012394e-06,
"loss": 3.5288,
"step": 38000
},
{
"epoch": 1.4461878745486512,
"grad_norm": 16.25,
"learning_rate": 1.7913473820877353e-06,
"loss": 3.5381,
"step": 38250
},
{
"epoch": 1.4556402064389284,
"grad_norm": 22.375,
"learning_rate": 1.7344994032661116e-06,
"loss": 3.5954,
"step": 38500
},
{
"epoch": 1.465092538329206,
"grad_norm": 19.125,
"learning_rate": 1.6783781839429785e-06,
"loss": 3.5212,
"step": 38750
},
{
"epoch": 1.4745448702194832,
"grad_norm": 23.25,
"learning_rate": 1.6229962142796469e-06,
"loss": 3.4585,
"step": 39000
},
{
"epoch": 1.4839972021097605,
"grad_norm": 18.375,
"learning_rate": 1.5683658199123524e-06,
"loss": 3.5231,
"step": 39250
},
{
"epoch": 1.4934495340000378,
"grad_norm": 25.0,
"learning_rate": 1.5144991592091162e-06,
"loss": 3.5881,
"step": 39500
},
{
"epoch": 1.502901865890315,
"grad_norm": 22.25,
"learning_rate": 1.461408220563803e-06,
"loss": 3.5278,
"step": 39750
},
{
"epoch": 1.5123541977805925,
"grad_norm": 18.625,
"learning_rate": 1.4091048197280227e-06,
"loss": 3.5224,
"step": 40000
},
{
"epoch": 1.5218065296708698,
"grad_norm": 21.5,
"learning_rate": 1.3576005971814627e-06,
"loss": 3.5465,
"step": 40250
},
{
"epoch": 1.531258861561147,
"grad_norm": 19.375,
"learning_rate": 1.3069070155412145e-06,
"loss": 3.5318,
"step": 40500
},
{
"epoch": 1.5407111934514246,
"grad_norm": 19.5,
"learning_rate": 1.2570353570106864e-06,
"loss": 3.5316,
"step": 40750
},
{
"epoch": 1.5501635253417017,
"grad_norm": 23.5,
"learning_rate": 1.2079967208686787e-06,
"loss": 3.5112,
"step": 41000
},
{
"epoch": 1.5596158572319792,
"grad_norm": 21.25,
"learning_rate": 1.159802020999159e-06,
"loss": 3.5891,
"step": 41250
},
{
"epoch": 1.5690681891222564,
"grad_norm": 22.375,
"learning_rate": 1.112461983462304e-06,
"loss": 3.5365,
"step": 41500
},
{
"epoch": 1.5785205210125337,
"grad_norm": 18.25,
"learning_rate": 1.0659871441073422e-06,
"loss": 3.5665,
"step": 41750
},
{
"epoch": 1.5879728529028112,
"grad_norm": 37.75,
"learning_rate": 1.020387846227724e-06,
"loss": 3.5764,
"step": 42000
},
{
"epoch": 1.5974251847930885,
"grad_norm": 21.75,
"learning_rate": 9.756742382591577e-07,
"loss": 3.6041,
"step": 42250
},
{
"epoch": 1.6068775166833658,
"grad_norm": 24.375,
"learning_rate": 9.318562715210039e-07,
"loss": 3.6046,
"step": 42500
},
{
"epoch": 1.6163298485736433,
"grad_norm": 18.375,
"learning_rate": 8.889436980015336e-07,
"loss": 3.5789,
"step": 42750
},
{
"epoch": 1.6257821804639203,
"grad_norm": 19.875,
"learning_rate": 8.469460681875674e-07,
"loss": 3.588,
"step": 43000
},
{
"epoch": 1.6352345123541978,
"grad_norm": 20.375,
"learning_rate": 8.058727289389485e-07,
"loss": 3.571,
"step": 43250
},
{
"epoch": 1.644686844244475,
"grad_norm": 33.25,
"learning_rate": 7.657328214083226e-07,
"loss": 3.5252,
"step": 43500
},
{
"epoch": 1.6541391761347524,
"grad_norm": 20.625,
"learning_rate": 7.26535279006727e-07,
"loss": 3.5418,
"step": 43750
},
{
"epoch": 1.6635915080250299,
"grad_norm": 20.25,
"learning_rate": 6.882888254153902e-07,
"loss": 3.475,
"step": 44000
},
{
"epoch": 1.6730438399153071,
"grad_norm": 22.25,
"learning_rate": 6.51001972644218e-07,
"loss": 3.6097,
"step": 44250
},
{
"epoch": 1.6824961718055844,
"grad_norm": 20.125,
"learning_rate": 6.146830191373909e-07,
"loss": 3.5361,
"step": 44500
},
{
"epoch": 1.691948503695862,
"grad_norm": 19.375,
"learning_rate": 5.793400479264849e-07,
"loss": 3.5127,
"step": 44750
},
{
"epoch": 1.701400835586139,
"grad_norm": 22.5,
"learning_rate": 5.449809248315402e-07,
"loss": 3.5631,
"step": 45000
},
{
"epoch": 1.7108531674764165,
"grad_norm": 20.125,
"learning_rate": 5.11613296710467e-07,
"loss": 3.4704,
"step": 45250
},
{
"epoch": 1.7203054993666937,
"grad_norm": 25.75,
"learning_rate": 4.792445897571845e-07,
"loss": 3.5528,
"step": 45500
},
{
"epoch": 1.729757831256971,
"grad_norm": 24.0,
"learning_rate": 4.478820078488749e-07,
"loss": 3.515,
"step": 45750
},
{
"epoch": 1.7392101631472485,
"grad_norm": 17.375,
"learning_rate": 4.175325309427064e-07,
"loss": 3.5246,
"step": 46000
},
{
"epoch": 1.7486624950375258,
"grad_norm": 21.875,
"learning_rate": 3.882029135223975e-07,
"loss": 3.504,
"step": 46250
},
{
"epoch": 1.758114826927803,
"grad_norm": 23.0,
"learning_rate": 3.598996830949619e-07,
"loss": 3.5885,
"step": 46500
},
{
"epoch": 1.7675671588180806,
"grad_norm": 29.0,
"learning_rate": 3.326291387379654e-07,
"loss": 3.5235,
"step": 46750
},
{
"epoch": 1.7770194907083576,
"grad_norm": 20.0,
"learning_rate": 3.0639734969762524e-07,
"loss": 3.5873,
"step": 47000
},
{
"epoch": 1.7864718225986351,
"grad_norm": 20.75,
"learning_rate": 2.8121015403805406e-07,
"loss": 3.501,
"step": 47250
},
{
"epoch": 1.7959241544889124,
"grad_norm": 19.375,
"learning_rate": 2.570731573419638e-07,
"loss": 3.4923,
"step": 47500
},
{
"epoch": 1.8053764863791897,
"grad_norm": 23.625,
"learning_rate": 2.3399173146309906e-07,
"loss": 3.4967,
"step": 47750
},
{
"epoch": 1.8148288182694672,
"grad_norm": 16.875,
"learning_rate": 2.119710133306996e-07,
"loss": 3.5254,
"step": 48000
},
{
"epoch": 1.8242811501597445,
"grad_norm": 19.625,
"learning_rate": 1.9101590380623925e-07,
"loss": 3.4922,
"step": 48250
},
{
"epoch": 1.8337334820500217,
"grad_norm": 23.125,
"learning_rate": 1.711310665927046e-07,
"loss": 3.5446,
"step": 48500
},
{
"epoch": 1.8431858139402992,
"grad_norm": 25.375,
"learning_rate": 1.5232092719666025e-07,
"loss": 3.5395,
"step": 48750
},
{
"epoch": 1.8526381458305763,
"grad_norm": 20.0,
"learning_rate": 1.3458967194331485e-07,
"loss": 3.5714,
"step": 49000
},
{
"epoch": 1.8620904777208538,
"grad_norm": 23.0,
"learning_rate": 1.1794124704483324e-07,
"loss": 3.5383,
"step": 49250
},
{
"epoch": 1.871542809611131,
"grad_norm": 16.875,
"learning_rate": 1.0237935772207608e-07,
"loss": 3.5393,
"step": 49500
},
{
"epoch": 1.8809951415014083,
"grad_norm": 18.25,
"learning_rate": 8.790746737997569e-08,
"loss": 3.5028,
"step": 49750
},
{
"epoch": 1.8904474733916858,
"grad_norm": 19.5,
"learning_rate": 7.452879683673809e-08,
"loss": 3.5613,
"step": 50000
},
{
"epoch": 1.899899805281963,
"grad_norm": 21.25,
"learning_rate": 6.224632360702143e-08,
"loss": 3.5304,
"step": 50250
},
{
"epoch": 1.9093521371722404,
"grad_norm": 21.75,
"learning_rate": 5.1062781239271665e-08,
"loss": 3.5906,
"step": 50500
},
{
"epoch": 1.9188044690625177,
"grad_norm": 21.75,
"learning_rate": 4.0980658707355234e-08,
"loss": 3.5713,
"step": 50750
},
{
"epoch": 1.928256800952795,
"grad_norm": 25.125,
"learning_rate": 3.2002199856617236e-08,
"loss": 3.5918,
"step": 51000
},
{
"epoch": 1.9377091328430724,
"grad_norm": 19.625,
"learning_rate": 2.412940290450083e-08,
"loss": 3.4949,
"step": 51250
},
{
"epoch": 1.9471614647333497,
"grad_norm": 21.5,
"learning_rate": 1.736401999582804e-08,
"loss": 3.5472,
"step": 51500
},
{
"epoch": 1.956613796623627,
"grad_norm": 23.375,
"learning_rate": 1.1707556812851074e-08,
"loss": 3.5542,
"step": 51750
},
{
"epoch": 1.9660661285139045,
"grad_norm": 24.25,
"learning_rate": 7.161272240148731e-09,
"loss": 3.5946,
"step": 52000
},
{
"epoch": 1.9755184604041816,
"grad_norm": 20.875,
"learning_rate": 3.726178084456078e-09,
"loss": 3.5614,
"step": 52250
},
{
"epoch": 1.984970792294459,
"grad_norm": 19.625,
"learning_rate": 1.4030388494790104e-09,
"loss": 3.5535,
"step": 52500
},
{
"epoch": 1.9944231241847363,
"grad_norm": 24.25,
"learning_rate": 1.923715657464742e-10,
"loss": 3.5299,
"step": 52750
}
],
"logging_steps": 250,
"max_steps": 52896,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.8374993889748353e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}