QuangDuy's picture
Upload checkpoint-21140
46359fc verified
{
"best_global_step": 18000,
"best_metric": 3.9484923051611727,
"best_model_checkpoint": "outputs/bert-tiny-stage2-sbert/checkpoints/checkpoint-18000",
"epoch": 5.0,
"eval_steps": 2000,
"global_step": 21140,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011825922421948912,
"grad_norm": 37.789451599121094,
"learning_rate": 2.3173327027666118e-07,
"loss": 18.0314,
"step": 50
},
{
"epoch": 0.023651844843897825,
"grad_norm": 43.297508239746094,
"learning_rate": 4.6819579096713174e-07,
"loss": 17.7147,
"step": 100
},
{
"epoch": 0.035477767265846734,
"grad_norm": 36.56124496459961,
"learning_rate": 7.046583116576024e-07,
"loss": 17.0007,
"step": 150
},
{
"epoch": 0.04730368968779565,
"grad_norm": 34.428916931152344,
"learning_rate": 9.411208323480729e-07,
"loss": 16.0667,
"step": 200
},
{
"epoch": 0.05912961210974456,
"grad_norm": 33.16645812988281,
"learning_rate": 1.1775833530385434e-06,
"loss": 14.9131,
"step": 250
},
{
"epoch": 0.07095553453169347,
"grad_norm": 29.064250946044922,
"learning_rate": 1.4140458737290142e-06,
"loss": 13.9449,
"step": 300
},
{
"epoch": 0.08278145695364239,
"grad_norm": 31.42921257019043,
"learning_rate": 1.6505083944194847e-06,
"loss": 12.7957,
"step": 350
},
{
"epoch": 0.0946073793755913,
"grad_norm": 33.341365814208984,
"learning_rate": 1.8869709151099552e-06,
"loss": 11.7288,
"step": 400
},
{
"epoch": 0.10643330179754021,
"grad_norm": 36.34325408935547,
"learning_rate": 2.123433435800426e-06,
"loss": 10.6945,
"step": 450
},
{
"epoch": 0.11825922421948912,
"grad_norm": 39.06941604614258,
"learning_rate": 2.3598959564908965e-06,
"loss": 9.4743,
"step": 500
},
{
"epoch": 0.13008514664143803,
"grad_norm": 32.969947814941406,
"learning_rate": 2.596358477181367e-06,
"loss": 8.6215,
"step": 550
},
{
"epoch": 0.14191106906338694,
"grad_norm": 33.9212760925293,
"learning_rate": 2.8328209978718375e-06,
"loss": 7.7279,
"step": 600
},
{
"epoch": 0.15373699148533584,
"grad_norm": 32.65876007080078,
"learning_rate": 3.069283518562308e-06,
"loss": 7.1892,
"step": 650
},
{
"epoch": 0.16556291390728478,
"grad_norm": 29.210859298706055,
"learning_rate": 3.3057460392527786e-06,
"loss": 6.9682,
"step": 700
},
{
"epoch": 0.1773888363292337,
"grad_norm": 29.231189727783203,
"learning_rate": 3.5422085599432495e-06,
"loss": 6.4781,
"step": 750
},
{
"epoch": 0.1892147587511826,
"grad_norm": 28.949066162109375,
"learning_rate": 3.77867108063372e-06,
"loss": 6.1271,
"step": 800
},
{
"epoch": 0.2010406811731315,
"grad_norm": 29.826133728027344,
"learning_rate": 4.01513360132419e-06,
"loss": 6.1199,
"step": 850
},
{
"epoch": 0.21286660359508042,
"grad_norm": 27.585041046142578,
"learning_rate": 4.2515961220146615e-06,
"loss": 5.9544,
"step": 900
},
{
"epoch": 0.22469252601702933,
"grad_norm": 28.10279655456543,
"learning_rate": 4.488058642705131e-06,
"loss": 5.8145,
"step": 950
},
{
"epoch": 0.23651844843897823,
"grad_norm": 26.567943572998047,
"learning_rate": 4.7245211633956025e-06,
"loss": 5.5599,
"step": 1000
},
{
"epoch": 0.24834437086092714,
"grad_norm": 24.42616081237793,
"learning_rate": 4.960983684086072e-06,
"loss": 5.2344,
"step": 1050
},
{
"epoch": 0.26017029328287605,
"grad_norm": 25.857810974121094,
"learning_rate": 5.197446204776543e-06,
"loss": 5.3013,
"step": 1100
},
{
"epoch": 0.27199621570482496,
"grad_norm": 26.047733306884766,
"learning_rate": 5.433908725467014e-06,
"loss": 5.0562,
"step": 1150
},
{
"epoch": 0.28382213812677387,
"grad_norm": 26.875659942626953,
"learning_rate": 5.670371246157485e-06,
"loss": 4.8728,
"step": 1200
},
{
"epoch": 0.2956480605487228,
"grad_norm": 21.9539737701416,
"learning_rate": 5.906833766847954e-06,
"loss": 4.7826,
"step": 1250
},
{
"epoch": 0.3074739829706717,
"grad_norm": 23.06488609313965,
"learning_rate": 6.143296287538426e-06,
"loss": 4.8806,
"step": 1300
},
{
"epoch": 0.3192999053926206,
"grad_norm": 24.24974250793457,
"learning_rate": 6.379758808228896e-06,
"loss": 4.6464,
"step": 1350
},
{
"epoch": 0.33112582781456956,
"grad_norm": 22.658571243286133,
"learning_rate": 6.616221328919367e-06,
"loss": 4.7046,
"step": 1400
},
{
"epoch": 0.34295175023651847,
"grad_norm": 21.927656173706055,
"learning_rate": 6.852683849609837e-06,
"loss": 4.5188,
"step": 1450
},
{
"epoch": 0.3547776726584674,
"grad_norm": 24.39653778076172,
"learning_rate": 7.089146370300309e-06,
"loss": 4.4968,
"step": 1500
},
{
"epoch": 0.3666035950804163,
"grad_norm": 23.591333389282227,
"learning_rate": 7.325608890990778e-06,
"loss": 4.4387,
"step": 1550
},
{
"epoch": 0.3784295175023652,
"grad_norm": 24.572961807250977,
"learning_rate": 7.562071411681249e-06,
"loss": 4.1702,
"step": 1600
},
{
"epoch": 0.3902554399243141,
"grad_norm": 22.61821174621582,
"learning_rate": 7.79853393237172e-06,
"loss": 4.2147,
"step": 1650
},
{
"epoch": 0.402081362346263,
"grad_norm": 22.490327835083008,
"learning_rate": 8.03499645306219e-06,
"loss": 3.9972,
"step": 1700
},
{
"epoch": 0.4139072847682119,
"grad_norm": 23.695873260498047,
"learning_rate": 8.271458973752661e-06,
"loss": 4.1279,
"step": 1750
},
{
"epoch": 0.42573320719016083,
"grad_norm": 24.085838317871094,
"learning_rate": 8.507921494443131e-06,
"loss": 4.0214,
"step": 1800
},
{
"epoch": 0.43755912961210974,
"grad_norm": 20.78253173828125,
"learning_rate": 8.744384015133602e-06,
"loss": 3.9161,
"step": 1850
},
{
"epoch": 0.44938505203405865,
"grad_norm": 19.800090789794922,
"learning_rate": 8.980846535824072e-06,
"loss": 3.7544,
"step": 1900
},
{
"epoch": 0.46121097445600756,
"grad_norm": 22.900514602661133,
"learning_rate": 9.217309056514543e-06,
"loss": 3.8246,
"step": 1950
},
{
"epoch": 0.47303689687795647,
"grad_norm": 22.419363021850586,
"learning_rate": 9.453771577205015e-06,
"loss": 3.7991,
"step": 2000
},
{
"epoch": 0.47303689687795647,
"eval_runtime": 46.7005,
"eval_samples_per_second": 0.0,
"eval_steps_per_second": 0.0,
"eval_validation_loss": 5.98806651504585,
"step": 2000
},
{
"epoch": 0.4848628192999054,
"grad_norm": 22.308876037597656,
"learning_rate": 9.690234097895484e-06,
"loss": 3.8554,
"step": 2050
},
{
"epoch": 0.4966887417218543,
"grad_norm": 23.8614501953125,
"learning_rate": 9.926696618585954e-06,
"loss": 3.8123,
"step": 2100
},
{
"epoch": 0.5085146641438032,
"grad_norm": 21.00491714477539,
"learning_rate": 1.0163159139276425e-05,
"loss": 3.5525,
"step": 2150
},
{
"epoch": 0.5203405865657521,
"grad_norm": 25.555097579956055,
"learning_rate": 1.0399621659966897e-05,
"loss": 3.5591,
"step": 2200
},
{
"epoch": 0.532166508987701,
"grad_norm": 25.4840087890625,
"learning_rate": 1.0636084180657367e-05,
"loss": 3.6293,
"step": 2250
},
{
"epoch": 0.5439924314096499,
"grad_norm": 21.117971420288086,
"learning_rate": 1.0872546701347836e-05,
"loss": 3.5831,
"step": 2300
},
{
"epoch": 0.5558183538315988,
"grad_norm": 23.38995361328125,
"learning_rate": 1.1109009222038308e-05,
"loss": 3.6007,
"step": 2350
},
{
"epoch": 0.5676442762535477,
"grad_norm": 22.385738372802734,
"learning_rate": 1.1345471742728777e-05,
"loss": 3.4225,
"step": 2400
},
{
"epoch": 0.5794701986754967,
"grad_norm": 21.53306007385254,
"learning_rate": 1.158193426341925e-05,
"loss": 3.4405,
"step": 2450
},
{
"epoch": 0.5912961210974456,
"grad_norm": 22.93678092956543,
"learning_rate": 1.181839678410972e-05,
"loss": 3.4002,
"step": 2500
},
{
"epoch": 0.6031220435193945,
"grad_norm": 20.330045700073242,
"learning_rate": 1.2054859304800191e-05,
"loss": 3.3653,
"step": 2550
},
{
"epoch": 0.6149479659413434,
"grad_norm": 21.98198699951172,
"learning_rate": 1.2291321825490661e-05,
"loss": 3.321,
"step": 2600
},
{
"epoch": 0.6267738883632923,
"grad_norm": 18.49015998840332,
"learning_rate": 1.252778434618113e-05,
"loss": 3.3042,
"step": 2650
},
{
"epoch": 0.6385998107852412,
"grad_norm": 22.69803237915039,
"learning_rate": 1.2764246866871602e-05,
"loss": 3.2117,
"step": 2700
},
{
"epoch": 0.6504257332071902,
"grad_norm": 19.658132553100586,
"learning_rate": 1.3000709387562072e-05,
"loss": 3.3423,
"step": 2750
},
{
"epoch": 0.6622516556291391,
"grad_norm": 20.783931732177734,
"learning_rate": 1.3237171908252545e-05,
"loss": 3.2494,
"step": 2800
},
{
"epoch": 0.674077578051088,
"grad_norm": 17.039609909057617,
"learning_rate": 1.3473634428943014e-05,
"loss": 3.1364,
"step": 2850
},
{
"epoch": 0.6859035004730369,
"grad_norm": 21.787738800048828,
"learning_rate": 1.3710096949633484e-05,
"loss": 3.1836,
"step": 2900
},
{
"epoch": 0.6977294228949859,
"grad_norm": 20.883773803710938,
"learning_rate": 1.3946559470323956e-05,
"loss": 3.1268,
"step": 2950
},
{
"epoch": 0.7095553453169348,
"grad_norm": 17.700597763061523,
"learning_rate": 1.4183021991014425e-05,
"loss": 3.072,
"step": 3000
},
{
"epoch": 0.7213812677388837,
"grad_norm": 20.23262596130371,
"learning_rate": 1.4419484511704895e-05,
"loss": 3.0135,
"step": 3050
},
{
"epoch": 0.7332071901608326,
"grad_norm": 19.417842864990234,
"learning_rate": 1.4655947032395366e-05,
"loss": 3.0607,
"step": 3100
},
{
"epoch": 0.7450331125827815,
"grad_norm": 19.843341827392578,
"learning_rate": 1.4892409553085838e-05,
"loss": 3.0963,
"step": 3150
},
{
"epoch": 0.7568590350047304,
"grad_norm": 20.248523712158203,
"learning_rate": 1.5128872073776309e-05,
"loss": 3.0419,
"step": 3200
},
{
"epoch": 0.7686849574266793,
"grad_norm": 24.61260986328125,
"learning_rate": 1.5365334594466777e-05,
"loss": 2.9891,
"step": 3250
},
{
"epoch": 0.7805108798486282,
"grad_norm": 16.637826919555664,
"learning_rate": 1.560179711515725e-05,
"loss": 2.9384,
"step": 3300
},
{
"epoch": 0.7923368022705771,
"grad_norm": 24.341026306152344,
"learning_rate": 1.583825963584772e-05,
"loss": 2.8918,
"step": 3350
},
{
"epoch": 0.804162724692526,
"grad_norm": 18.246440887451172,
"learning_rate": 1.607472215653819e-05,
"loss": 2.9816,
"step": 3400
},
{
"epoch": 0.8159886471144749,
"grad_norm": 19.296022415161133,
"learning_rate": 1.631118467722866e-05,
"loss": 2.9664,
"step": 3450
},
{
"epoch": 0.8278145695364238,
"grad_norm": 19.331918716430664,
"learning_rate": 1.6547647197919134e-05,
"loss": 2.8969,
"step": 3500
},
{
"epoch": 0.8396404919583728,
"grad_norm": 25.586254119873047,
"learning_rate": 1.6784109718609602e-05,
"loss": 2.9368,
"step": 3550
},
{
"epoch": 0.8514664143803217,
"grad_norm": 19.701223373413086,
"learning_rate": 1.7020572239300073e-05,
"loss": 2.8513,
"step": 3600
},
{
"epoch": 0.8632923368022706,
"grad_norm": 16.68182945251465,
"learning_rate": 1.7257034759990545e-05,
"loss": 2.9808,
"step": 3650
},
{
"epoch": 0.8751182592242195,
"grad_norm": 19.592416763305664,
"learning_rate": 1.7493497280681013e-05,
"loss": 2.8428,
"step": 3700
},
{
"epoch": 0.8869441816461684,
"grad_norm": 20.324504852294922,
"learning_rate": 1.7729959801371484e-05,
"loss": 2.8775,
"step": 3750
},
{
"epoch": 0.8987701040681173,
"grad_norm": 19.49851417541504,
"learning_rate": 1.7966422322061955e-05,
"loss": 2.739,
"step": 3800
},
{
"epoch": 0.9105960264900662,
"grad_norm": 19.18546485900879,
"learning_rate": 1.8202884842752427e-05,
"loss": 2.8277,
"step": 3850
},
{
"epoch": 0.9224219489120151,
"grad_norm": 23.6113338470459,
"learning_rate": 1.8439347363442898e-05,
"loss": 2.767,
"step": 3900
},
{
"epoch": 0.934247871333964,
"grad_norm": 19.779712677001953,
"learning_rate": 1.8675809884133366e-05,
"loss": 2.794,
"step": 3950
},
{
"epoch": 0.9460737937559129,
"grad_norm": 23.361425399780273,
"learning_rate": 1.8912272404823837e-05,
"loss": 2.7738,
"step": 4000
},
{
"epoch": 0.9460737937559129,
"eval_runtime": 47.0317,
"eval_samples_per_second": 0.0,
"eval_steps_per_second": 0.0,
"eval_validation_loss": 4.773771009103065,
"step": 4000
},
{
"epoch": 0.9578997161778618,
"grad_norm": 18.137535095214844,
"learning_rate": 1.914873492551431e-05,
"loss": 2.8568,
"step": 4050
},
{
"epoch": 0.9697256385998108,
"grad_norm": 18.014116287231445,
"learning_rate": 1.9385197446204777e-05,
"loss": 2.7938,
"step": 4100
},
{
"epoch": 0.9815515610217597,
"grad_norm": 17.168569564819336,
"learning_rate": 1.9621659966895248e-05,
"loss": 2.7272,
"step": 4150
},
{
"epoch": 0.9933774834437086,
"grad_norm": 17.75269889831543,
"learning_rate": 1.985812248758572e-05,
"loss": 2.7079,
"step": 4200
},
{
"epoch": 1.0052034058656576,
"grad_norm": 19.342844009399414,
"learning_rate": 1.9976346756548995e-05,
"loss": 2.6383,
"step": 4250
},
{
"epoch": 1.0170293282876064,
"grad_norm": 17.54117774963379,
"learning_rate": 1.9917213647921473e-05,
"loss": 2.6855,
"step": 4300
},
{
"epoch": 1.0288552507095554,
"grad_norm": 18.412206649780273,
"learning_rate": 1.9858080539293952e-05,
"loss": 2.6568,
"step": 4350
},
{
"epoch": 1.0406811731315042,
"grad_norm": 18.794939041137695,
"learning_rate": 1.979894743066643e-05,
"loss": 2.5981,
"step": 4400
},
{
"epoch": 1.0525070955534532,
"grad_norm": 17.26803970336914,
"learning_rate": 1.973981432203891e-05,
"loss": 2.6987,
"step": 4450
},
{
"epoch": 1.064333017975402,
"grad_norm": 15.831737518310547,
"learning_rate": 1.968068121341139e-05,
"loss": 2.6992,
"step": 4500
},
{
"epoch": 1.076158940397351,
"grad_norm": 16.746700286865234,
"learning_rate": 1.962154810478387e-05,
"loss": 2.5434,
"step": 4550
},
{
"epoch": 1.0879848628192998,
"grad_norm": 18.824857711791992,
"learning_rate": 1.956241499615635e-05,
"loss": 2.5553,
"step": 4600
},
{
"epoch": 1.0998107852412489,
"grad_norm": 16.81246566772461,
"learning_rate": 1.9503281887528828e-05,
"loss": 2.4978,
"step": 4650
},
{
"epoch": 1.1116367076631977,
"grad_norm": 18.369991302490234,
"learning_rate": 1.9444148778901307e-05,
"loss": 2.5679,
"step": 4700
},
{
"epoch": 1.1234626300851467,
"grad_norm": 19.55158805847168,
"learning_rate": 1.938501567027379e-05,
"loss": 2.4768,
"step": 4750
},
{
"epoch": 1.1352885525070955,
"grad_norm": 20.673002243041992,
"learning_rate": 1.9325882561646268e-05,
"loss": 2.5578,
"step": 4800
},
{
"epoch": 1.1471144749290445,
"grad_norm": 17.067432403564453,
"learning_rate": 1.9266749453018747e-05,
"loss": 2.4758,
"step": 4850
},
{
"epoch": 1.1589403973509933,
"grad_norm": 22.328304290771484,
"learning_rate": 1.9207616344391226e-05,
"loss": 2.5352,
"step": 4900
},
{
"epoch": 1.1707663197729423,
"grad_norm": 15.121694564819336,
"learning_rate": 1.9148483235763708e-05,
"loss": 2.5023,
"step": 4950
},
{
"epoch": 1.1825922421948911,
"grad_norm": 15.201376914978027,
"learning_rate": 1.9089350127136187e-05,
"loss": 2.4713,
"step": 5000
},
{
"epoch": 1.1944181646168401,
"grad_norm": 20.54207992553711,
"learning_rate": 1.9030217018508665e-05,
"loss": 2.486,
"step": 5050
},
{
"epoch": 1.206244087038789,
"grad_norm": 16.934635162353516,
"learning_rate": 1.8971083909881144e-05,
"loss": 2.483,
"step": 5100
},
{
"epoch": 1.218070009460738,
"grad_norm": 16.963790893554688,
"learning_rate": 1.8911950801253623e-05,
"loss": 2.4098,
"step": 5150
},
{
"epoch": 1.2298959318826868,
"grad_norm": 16.505352020263672,
"learning_rate": 1.8852817692626102e-05,
"loss": 2.5061,
"step": 5200
},
{
"epoch": 1.2417218543046358,
"grad_norm": 16.634069442749023,
"learning_rate": 1.879368458399858e-05,
"loss": 2.4597,
"step": 5250
},
{
"epoch": 1.2535477767265846,
"grad_norm": 16.373046875,
"learning_rate": 1.8734551475371063e-05,
"loss": 2.4591,
"step": 5300
},
{
"epoch": 1.2653736991485336,
"grad_norm": 21.308876037597656,
"learning_rate": 1.867541836674354e-05,
"loss": 2.3879,
"step": 5350
},
{
"epoch": 1.2771996215704826,
"grad_norm": 20.565275192260742,
"learning_rate": 1.861628525811602e-05,
"loss": 2.4146,
"step": 5400
},
{
"epoch": 1.2890255439924314,
"grad_norm": 15.853353500366211,
"learning_rate": 1.85571521494885e-05,
"loss": 2.3418,
"step": 5450
},
{
"epoch": 1.3008514664143802,
"grad_norm": 13.12362003326416,
"learning_rate": 1.8498019040860978e-05,
"loss": 2.4307,
"step": 5500
},
{
"epoch": 1.3126773888363292,
"grad_norm": 19.059667587280273,
"learning_rate": 1.843888593223346e-05,
"loss": 2.3653,
"step": 5550
},
{
"epoch": 1.3245033112582782,
"grad_norm": 17.448827743530273,
"learning_rate": 1.837975282360594e-05,
"loss": 2.3995,
"step": 5600
},
{
"epoch": 1.336329233680227,
"grad_norm": 18.326887130737305,
"learning_rate": 1.8320619714978418e-05,
"loss": 2.4527,
"step": 5650
},
{
"epoch": 1.3481551561021758,
"grad_norm": 18.03122901916504,
"learning_rate": 1.8261486606350896e-05,
"loss": 2.4547,
"step": 5700
},
{
"epoch": 1.3599810785241249,
"grad_norm": 18.269872665405273,
"learning_rate": 1.820235349772338e-05,
"loss": 2.3695,
"step": 5750
},
{
"epoch": 1.3718070009460739,
"grad_norm": 16.90838623046875,
"learning_rate": 1.8143220389095857e-05,
"loss": 2.3341,
"step": 5800
},
{
"epoch": 1.3836329233680227,
"grad_norm": 18.816362380981445,
"learning_rate": 1.8084087280468336e-05,
"loss": 2.2412,
"step": 5850
},
{
"epoch": 1.3954588457899715,
"grad_norm": 17.30527687072754,
"learning_rate": 1.8024954171840815e-05,
"loss": 2.2695,
"step": 5900
},
{
"epoch": 1.4072847682119205,
"grad_norm": 18.299711227416992,
"learning_rate": 1.7965821063213297e-05,
"loss": 2.2922,
"step": 5950
},
{
"epoch": 1.4191106906338695,
"grad_norm": 18.047449111938477,
"learning_rate": 1.7906687954585773e-05,
"loss": 2.3176,
"step": 6000
},
{
"epoch": 1.4191106906338695,
"eval_runtime": 46.9839,
"eval_samples_per_second": 0.0,
"eval_steps_per_second": 0.0,
"eval_validation_loss": 4.305679076455633,
"step": 6000
},
{
"epoch": 1.4309366130558183,
"grad_norm": 20.608333587646484,
"learning_rate": 1.784755484595825e-05,
"loss": 2.281,
"step": 6050
},
{
"epoch": 1.4427625354777673,
"grad_norm": 16.299001693725586,
"learning_rate": 1.7788421737330734e-05,
"loss": 2.2155,
"step": 6100
},
{
"epoch": 1.4545884578997161,
"grad_norm": 17.70014762878418,
"learning_rate": 1.7729288628703212e-05,
"loss": 2.1908,
"step": 6150
},
{
"epoch": 1.4664143803216652,
"grad_norm": 13.944992065429688,
"learning_rate": 1.767015552007569e-05,
"loss": 2.2071,
"step": 6200
},
{
"epoch": 1.478240302743614,
"grad_norm": 18.37308692932129,
"learning_rate": 1.761102241144817e-05,
"loss": 2.2617,
"step": 6250
},
{
"epoch": 1.490066225165563,
"grad_norm": 16.624183654785156,
"learning_rate": 1.7551889302820652e-05,
"loss": 2.2864,
"step": 6300
},
{
"epoch": 1.5018921475875118,
"grad_norm": 15.490421295166016,
"learning_rate": 1.749275619419313e-05,
"loss": 2.2509,
"step": 6350
},
{
"epoch": 1.5137180700094608,
"grad_norm": 15.517704010009766,
"learning_rate": 1.743362308556561e-05,
"loss": 2.1227,
"step": 6400
},
{
"epoch": 1.5255439924314098,
"grad_norm": 14.78442096710205,
"learning_rate": 1.737448997693809e-05,
"loss": 2.1919,
"step": 6450
},
{
"epoch": 1.5373699148533586,
"grad_norm": 19.766271591186523,
"learning_rate": 1.7315356868310567e-05,
"loss": 2.2072,
"step": 6500
},
{
"epoch": 1.5491958372753074,
"grad_norm": 17.84695053100586,
"learning_rate": 1.725622375968305e-05,
"loss": 2.1652,
"step": 6550
},
{
"epoch": 1.5610217596972564,
"grad_norm": 17.325145721435547,
"learning_rate": 1.7197090651055528e-05,
"loss": 2.2224,
"step": 6600
},
{
"epoch": 1.5728476821192054,
"grad_norm": 19.243274688720703,
"learning_rate": 1.7137957542428007e-05,
"loss": 2.0715,
"step": 6650
},
{
"epoch": 1.5846736045411542,
"grad_norm": 17.589859008789062,
"learning_rate": 1.7078824433800486e-05,
"loss": 2.1693,
"step": 6700
},
{
"epoch": 1.596499526963103,
"grad_norm": 14.71687126159668,
"learning_rate": 1.7019691325172968e-05,
"loss": 2.1141,
"step": 6750
},
{
"epoch": 1.608325449385052,
"grad_norm": 14.723918914794922,
"learning_rate": 1.6960558216545443e-05,
"loss": 2.1129,
"step": 6800
},
{
"epoch": 1.620151371807001,
"grad_norm": 16.5570011138916,
"learning_rate": 1.6901425107917922e-05,
"loss": 2.1001,
"step": 6850
},
{
"epoch": 1.6319772942289499,
"grad_norm": 17.945083618164062,
"learning_rate": 1.6842291999290404e-05,
"loss": 2.094,
"step": 6900
},
{
"epoch": 1.6438032166508987,
"grad_norm": 18.704225540161133,
"learning_rate": 1.6783158890662883e-05,
"loss": 2.2176,
"step": 6950
},
{
"epoch": 1.6556291390728477,
"grad_norm": 15.701910018920898,
"learning_rate": 1.6724025782035362e-05,
"loss": 2.109,
"step": 7000
},
{
"epoch": 1.6674550614947967,
"grad_norm": 16.768260955810547,
"learning_rate": 1.666489267340784e-05,
"loss": 2.0537,
"step": 7050
},
{
"epoch": 1.6792809839167455,
"grad_norm": 17.835603713989258,
"learning_rate": 1.6605759564780323e-05,
"loss": 2.0328,
"step": 7100
},
{
"epoch": 1.6911069063386943,
"grad_norm": 18.1043701171875,
"learning_rate": 1.6546626456152802e-05,
"loss": 2.1541,
"step": 7150
},
{
"epoch": 1.7029328287606433,
"grad_norm": 14.032896995544434,
"learning_rate": 1.648749334752528e-05,
"loss": 2.0164,
"step": 7200
},
{
"epoch": 1.7147587511825924,
"grad_norm": 15.934415817260742,
"learning_rate": 1.642836023889776e-05,
"loss": 2.0225,
"step": 7250
},
{
"epoch": 1.7265846736045412,
"grad_norm": 15.602225303649902,
"learning_rate": 1.636922713027024e-05,
"loss": 2.0243,
"step": 7300
},
{
"epoch": 1.73841059602649,
"grad_norm": 15.584887504577637,
"learning_rate": 1.631009402164272e-05,
"loss": 2.0152,
"step": 7350
},
{
"epoch": 1.750236518448439,
"grad_norm": 17.52799415588379,
"learning_rate": 1.62509609130152e-05,
"loss": 2.0455,
"step": 7400
},
{
"epoch": 1.762062440870388,
"grad_norm": 15.92798900604248,
"learning_rate": 1.6191827804387678e-05,
"loss": 2.0026,
"step": 7450
},
{
"epoch": 1.7738883632923368,
"grad_norm": 14.851804733276367,
"learning_rate": 1.6132694695760157e-05,
"loss": 1.9846,
"step": 7500
},
{
"epoch": 1.7857142857142856,
"grad_norm": 15.551090240478516,
"learning_rate": 1.607356158713264e-05,
"loss": 1.9594,
"step": 7550
},
{
"epoch": 1.7975402081362346,
"grad_norm": 14.651620864868164,
"learning_rate": 1.6014428478505118e-05,
"loss": 2.0523,
"step": 7600
},
{
"epoch": 1.8093661305581836,
"grad_norm": 19.447086334228516,
"learning_rate": 1.5955295369877596e-05,
"loss": 1.9751,
"step": 7650
},
{
"epoch": 1.8211920529801324,
"grad_norm": 14.130012512207031,
"learning_rate": 1.5896162261250075e-05,
"loss": 1.9898,
"step": 7700
},
{
"epoch": 1.8330179754020812,
"grad_norm": 18.4505615234375,
"learning_rate": 1.5837029152622554e-05,
"loss": 1.9658,
"step": 7750
},
{
"epoch": 1.8448438978240302,
"grad_norm": 12.992496490478516,
"learning_rate": 1.5777896043995033e-05,
"loss": 1.9976,
"step": 7800
},
{
"epoch": 1.8566698202459793,
"grad_norm": 17.20708656311035,
"learning_rate": 1.571876293536751e-05,
"loss": 1.9939,
"step": 7850
},
{
"epoch": 1.868495742667928,
"grad_norm": 14.438339233398438,
"learning_rate": 1.5659629826739994e-05,
"loss": 1.9666,
"step": 7900
},
{
"epoch": 1.8803216650898769,
"grad_norm": 16.87125015258789,
"learning_rate": 1.5600496718112473e-05,
"loss": 1.9704,
"step": 7950
},
{
"epoch": 1.8921475875118259,
"grad_norm": 17.480026245117188,
"learning_rate": 1.554136360948495e-05,
"loss": 1.9822,
"step": 8000
},
{
"epoch": 1.8921475875118259,
"eval_runtime": 47.026,
"eval_samples_per_second": 0.0,
"eval_steps_per_second": 0.0,
"eval_validation_loss": 4.1330844767959665,
"step": 8000
},
{
"epoch": 1.903973509933775,
"grad_norm": 15.649256706237793,
"learning_rate": 1.548223050085743e-05,
"loss": 1.8534,
"step": 8050
},
{
"epoch": 1.9157994323557237,
"grad_norm": 17.02906608581543,
"learning_rate": 1.5423097392229912e-05,
"loss": 1.856,
"step": 8100
},
{
"epoch": 1.9276253547776725,
"grad_norm": 16.321977615356445,
"learning_rate": 1.536396428360239e-05,
"loss": 1.9817,
"step": 8150
},
{
"epoch": 1.9394512771996215,
"grad_norm": 21.492490768432617,
"learning_rate": 1.530483117497487e-05,
"loss": 1.9095,
"step": 8200
},
{
"epoch": 1.9512771996215705,
"grad_norm": 18.752315521240234,
"learning_rate": 1.524569806634735e-05,
"loss": 1.9343,
"step": 8250
},
{
"epoch": 1.9631031220435196,
"grad_norm": 17.007205963134766,
"learning_rate": 1.518656495771983e-05,
"loss": 1.95,
"step": 8300
},
{
"epoch": 1.9749290444654684,
"grad_norm": 16.75872230529785,
"learning_rate": 1.512743184909231e-05,
"loss": 1.9981,
"step": 8350
},
{
"epoch": 1.9867549668874172,
"grad_norm": 18.816049575805664,
"learning_rate": 1.5068298740464788e-05,
"loss": 1.8872,
"step": 8400
},
{
"epoch": 1.9985808893093662,
"grad_norm": 16.992637634277344,
"learning_rate": 1.5009165631837266e-05,
"loss": 1.8112,
"step": 8450
},
{
"epoch": 2.010406811731315,
"grad_norm": 16.72859001159668,
"learning_rate": 1.4950032523209746e-05,
"loss": 1.8451,
"step": 8500
},
{
"epoch": 2.0222327341532638,
"grad_norm": 15.676278114318848,
"learning_rate": 1.4890899414582225e-05,
"loss": 1.8918,
"step": 8550
},
{
"epoch": 2.034058656575213,
"grad_norm": 15.531780242919922,
"learning_rate": 1.4831766305954705e-05,
"loss": 1.7837,
"step": 8600
},
{
"epoch": 2.045884578997162,
"grad_norm": 17.246252059936523,
"learning_rate": 1.4772633197327184e-05,
"loss": 1.8692,
"step": 8650
},
{
"epoch": 2.057710501419111,
"grad_norm": 13.021443367004395,
"learning_rate": 1.4713500088699663e-05,
"loss": 1.8614,
"step": 8700
},
{
"epoch": 2.0695364238410594,
"grad_norm": 15.586688041687012,
"learning_rate": 1.4654366980072143e-05,
"loss": 1.8677,
"step": 8750
},
{
"epoch": 2.0813623462630084,
"grad_norm": 19.62430191040039,
"learning_rate": 1.4595233871444622e-05,
"loss": 1.8005,
"step": 8800
},
{
"epoch": 2.0931882686849574,
"grad_norm": 15.454833984375,
"learning_rate": 1.4536100762817103e-05,
"loss": 1.8008,
"step": 8850
},
{
"epoch": 2.1050141911069065,
"grad_norm": 16.70480728149414,
"learning_rate": 1.4476967654189581e-05,
"loss": 1.8207,
"step": 8900
},
{
"epoch": 2.116840113528855,
"grad_norm": 17.584407806396484,
"learning_rate": 1.4417834545562062e-05,
"loss": 1.7491,
"step": 8950
},
{
"epoch": 2.128666035950804,
"grad_norm": 17.367647171020508,
"learning_rate": 1.435870143693454e-05,
"loss": 1.8351,
"step": 9000
},
{
"epoch": 2.140491958372753,
"grad_norm": 15.521934509277344,
"learning_rate": 1.4299568328307021e-05,
"loss": 1.7934,
"step": 9050
},
{
"epoch": 2.152317880794702,
"grad_norm": 18.928241729736328,
"learning_rate": 1.42404352196795e-05,
"loss": 1.8162,
"step": 9100
},
{
"epoch": 2.1641438032166507,
"grad_norm": 16.490169525146484,
"learning_rate": 1.418130211105198e-05,
"loss": 1.8496,
"step": 9150
},
{
"epoch": 2.1759697256385997,
"grad_norm": 16.48432731628418,
"learning_rate": 1.412216900242446e-05,
"loss": 1.7747,
"step": 9200
},
{
"epoch": 2.1877956480605487,
"grad_norm": 11.924939155578613,
"learning_rate": 1.406303589379694e-05,
"loss": 1.7665,
"step": 9250
},
{
"epoch": 2.1996215704824977,
"grad_norm": 17.498945236206055,
"learning_rate": 1.4003902785169417e-05,
"loss": 1.789,
"step": 9300
},
{
"epoch": 2.2114474929044468,
"grad_norm": 15.384320259094238,
"learning_rate": 1.3944769676541896e-05,
"loss": 1.8264,
"step": 9350
},
{
"epoch": 2.2232734153263953,
"grad_norm": 13.456559181213379,
"learning_rate": 1.3885636567914376e-05,
"loss": 1.788,
"step": 9400
},
{
"epoch": 2.2350993377483444,
"grad_norm": 24.769336700439453,
"learning_rate": 1.3826503459286855e-05,
"loss": 1.7902,
"step": 9450
},
{
"epoch": 2.2469252601702934,
"grad_norm": 18.193721771240234,
"learning_rate": 1.3767370350659335e-05,
"loss": 1.8175,
"step": 9500
},
{
"epoch": 2.258751182592242,
"grad_norm": 16.10167121887207,
"learning_rate": 1.3708237242031814e-05,
"loss": 1.8042,
"step": 9550
},
{
"epoch": 2.270577105014191,
"grad_norm": 15.939582824707031,
"learning_rate": 1.3649104133404295e-05,
"loss": 1.7767,
"step": 9600
},
{
"epoch": 2.28240302743614,
"grad_norm": 17.35470199584961,
"learning_rate": 1.3589971024776774e-05,
"loss": 1.7099,
"step": 9650
},
{
"epoch": 2.294228949858089,
"grad_norm": 16.262712478637695,
"learning_rate": 1.3530837916149252e-05,
"loss": 1.7841,
"step": 9700
},
{
"epoch": 2.306054872280038,
"grad_norm": 13.716343879699707,
"learning_rate": 1.3471704807521733e-05,
"loss": 1.87,
"step": 9750
},
{
"epoch": 2.3178807947019866,
"grad_norm": 13.402505874633789,
"learning_rate": 1.3412571698894212e-05,
"loss": 1.7485,
"step": 9800
},
{
"epoch": 2.3297067171239356,
"grad_norm": 14.37375259399414,
"learning_rate": 1.3353438590266692e-05,
"loss": 1.8367,
"step": 9850
},
{
"epoch": 2.3415326395458846,
"grad_norm": 14.258302688598633,
"learning_rate": 1.3294305481639171e-05,
"loss": 1.7925,
"step": 9900
},
{
"epoch": 2.3533585619678337,
"grad_norm": 18.176448822021484,
"learning_rate": 1.3235172373011651e-05,
"loss": 1.9135,
"step": 9950
},
{
"epoch": 2.3651844843897822,
"grad_norm": 16.076860427856445,
"learning_rate": 1.317603926438413e-05,
"loss": 1.7746,
"step": 10000
},
{
"epoch": 2.3651844843897822,
"eval_runtime": 46.8576,
"eval_samples_per_second": 0.0,
"eval_steps_per_second": 0.0,
"eval_validation_loss": 4.062871016729038,
"step": 10000
},
{
"epoch": 2.3770104068117313,
"grad_norm": 14.89098072052002,
"learning_rate": 1.311690615575661e-05,
"loss": 1.672,
"step": 10050
},
{
"epoch": 2.3888363292336803,
"grad_norm": 21.15306854248047,
"learning_rate": 1.3057773047129088e-05,
"loss": 1.7265,
"step": 10100
},
{
"epoch": 2.4006622516556293,
"grad_norm": 13.14006519317627,
"learning_rate": 1.2998639938501567e-05,
"loss": 1.6875,
"step": 10150
},
{
"epoch": 2.412488174077578,
"grad_norm": 16.71653175354004,
"learning_rate": 1.2939506829874047e-05,
"loss": 1.7421,
"step": 10200
},
{
"epoch": 2.424314096499527,
"grad_norm": 19.673765182495117,
"learning_rate": 1.2880373721246526e-05,
"loss": 1.7447,
"step": 10250
},
{
"epoch": 2.436140018921476,
"grad_norm": 13.806225776672363,
"learning_rate": 1.2821240612619006e-05,
"loss": 1.7335,
"step": 10300
},
{
"epoch": 2.447965941343425,
"grad_norm": 17.10091209411621,
"learning_rate": 1.2762107503991485e-05,
"loss": 1.6583,
"step": 10350
},
{
"epoch": 2.4597918637653735,
"grad_norm": 13.57816219329834,
"learning_rate": 1.2702974395363966e-05,
"loss": 1.6937,
"step": 10400
},
{
"epoch": 2.4716177861873225,
"grad_norm": 15.529336929321289,
"learning_rate": 1.2643841286736444e-05,
"loss": 1.6425,
"step": 10450
},
{
"epoch": 2.4834437086092715,
"grad_norm": 15.039297103881836,
"learning_rate": 1.2584708178108925e-05,
"loss": 1.7837,
"step": 10500
},
{
"epoch": 2.4952696310312206,
"grad_norm": 18.062923431396484,
"learning_rate": 1.2525575069481404e-05,
"loss": 1.7589,
"step": 10550
},
{
"epoch": 2.507095553453169,
"grad_norm": 14.291655540466309,
"learning_rate": 1.2466441960853884e-05,
"loss": 1.6618,
"step": 10600
},
{
"epoch": 2.518921475875118,
"grad_norm": 15.268333435058594,
"learning_rate": 1.2407308852226363e-05,
"loss": 1.6107,
"step": 10650
},
{
"epoch": 2.530747398297067,
"grad_norm": 15.746752738952637,
"learning_rate": 1.2348175743598842e-05,
"loss": 1.706,
"step": 10700
},
{
"epoch": 2.542573320719016,
"grad_norm": 14.740198135375977,
"learning_rate": 1.2289042634971322e-05,
"loss": 1.6662,
"step": 10750
},
{
"epoch": 2.5543992431409652,
"grad_norm": 18.715717315673828,
"learning_rate": 1.2229909526343801e-05,
"loss": 1.7491,
"step": 10800
},
{
"epoch": 2.566225165562914,
"grad_norm": 13.341856956481934,
"learning_rate": 1.2170776417716281e-05,
"loss": 1.615,
"step": 10850
},
{
"epoch": 2.578051087984863,
"grad_norm": 15.429610252380371,
"learning_rate": 1.211164330908876e-05,
"loss": 1.6314,
"step": 10900
},
{
"epoch": 2.589877010406812,
"grad_norm": 16.15951919555664,
"learning_rate": 1.2052510200461239e-05,
"loss": 1.6564,
"step": 10950
},
{
"epoch": 2.6017029328287604,
"grad_norm": 16.425504684448242,
"learning_rate": 1.1993377091833718e-05,
"loss": 1.6085,
"step": 11000
},
{
"epoch": 2.6135288552507094,
"grad_norm": 19.02115249633789,
"learning_rate": 1.1934243983206197e-05,
"loss": 1.6969,
"step": 11050
},
{
"epoch": 2.6253547776726585,
"grad_norm": 16.245838165283203,
"learning_rate": 1.1875110874578677e-05,
"loss": 1.5963,
"step": 11100
},
{
"epoch": 2.6371807000946075,
"grad_norm": 14.986413955688477,
"learning_rate": 1.1815977765951156e-05,
"loss": 1.6626,
"step": 11150
},
{
"epoch": 2.6490066225165565,
"grad_norm": 18.501134872436523,
"learning_rate": 1.1756844657323636e-05,
"loss": 1.715,
"step": 11200
},
{
"epoch": 2.660832544938505,
"grad_norm": 19.390989303588867,
"learning_rate": 1.1697711548696115e-05,
"loss": 1.6182,
"step": 11250
},
{
"epoch": 2.672658467360454,
"grad_norm": 16.83384132385254,
"learning_rate": 1.1638578440068596e-05,
"loss": 1.5667,
"step": 11300
},
{
"epoch": 2.684484389782403,
"grad_norm": 17.595382690429688,
"learning_rate": 1.1579445331441074e-05,
"loss": 1.6255,
"step": 11350
},
{
"epoch": 2.6963103122043517,
"grad_norm": 18.588014602661133,
"learning_rate": 1.1520312222813555e-05,
"loss": 1.6146,
"step": 11400
},
{
"epoch": 2.7081362346263007,
"grad_norm": 18.090600967407227,
"learning_rate": 1.1461179114186034e-05,
"loss": 1.5807,
"step": 11450
},
{
"epoch": 2.7199621570482497,
"grad_norm": 16.144756317138672,
"learning_rate": 1.1402046005558514e-05,
"loss": 1.571,
"step": 11500
},
{
"epoch": 2.7317880794701987,
"grad_norm": 19.271270751953125,
"learning_rate": 1.1342912896930993e-05,
"loss": 1.611,
"step": 11550
},
{
"epoch": 2.7436140018921478,
"grad_norm": 15.365574836730957,
"learning_rate": 1.1283779788303474e-05,
"loss": 1.5757,
"step": 11600
},
{
"epoch": 2.7554399243140963,
"grad_norm": 18.699979782104492,
"learning_rate": 1.1224646679675952e-05,
"loss": 1.6048,
"step": 11650
},
{
"epoch": 2.7672658467360454,
"grad_norm": 15.537507057189941,
"learning_rate": 1.1165513571048431e-05,
"loss": 1.5559,
"step": 11700
},
{
"epoch": 2.7790917691579944,
"grad_norm": 15.148637771606445,
"learning_rate": 1.110638046242091e-05,
"loss": 1.5634,
"step": 11750
},
{
"epoch": 2.790917691579943,
"grad_norm": 17.472370147705078,
"learning_rate": 1.1047247353793389e-05,
"loss": 1.5663,
"step": 11800
},
{
"epoch": 2.8027436140018924,
"grad_norm": 16.284570693969727,
"learning_rate": 1.098811424516587e-05,
"loss": 1.6274,
"step": 11850
},
{
"epoch": 2.814569536423841,
"grad_norm": 17.758365631103516,
"learning_rate": 1.0928981136538348e-05,
"loss": 1.5478,
"step": 11900
},
{
"epoch": 2.82639545884579,
"grad_norm": 14.631210327148438,
"learning_rate": 1.0869848027910828e-05,
"loss": 1.5728,
"step": 11950
},
{
"epoch": 2.838221381267739,
"grad_norm": 13.960256576538086,
"learning_rate": 1.0810714919283307e-05,
"loss": 1.5694,
"step": 12000
},
{
"epoch": 2.838221381267739,
"eval_runtime": 47.1893,
"eval_samples_per_second": 0.0,
"eval_steps_per_second": 0.0,
"eval_validation_loss": 3.9845195254937416,
"step": 12000
},
{
"epoch": 2.8500473036896876,
"grad_norm": 13.964838981628418,
"learning_rate": 1.0751581810655786e-05,
"loss": 1.5544,
"step": 12050
},
{
"epoch": 2.8618732261116366,
"grad_norm": 20.20077133178711,
"learning_rate": 1.0692448702028267e-05,
"loss": 1.6172,
"step": 12100
},
{
"epoch": 2.8736991485335857,
"grad_norm": 13.513507843017578,
"learning_rate": 1.0633315593400745e-05,
"loss": 1.5358,
"step": 12150
},
{
"epoch": 2.8855250709555347,
"grad_norm": 18.936565399169922,
"learning_rate": 1.0574182484773226e-05,
"loss": 1.5656,
"step": 12200
},
{
"epoch": 2.8973509933774837,
"grad_norm": 17.975814819335938,
"learning_rate": 1.0515049376145705e-05,
"loss": 1.5273,
"step": 12250
},
{
"epoch": 2.9091769157994323,
"grad_norm": 18.273731231689453,
"learning_rate": 1.0455916267518185e-05,
"loss": 1.4981,
"step": 12300
},
{
"epoch": 2.9210028382213813,
"grad_norm": 16.280357360839844,
"learning_rate": 1.0396783158890664e-05,
"loss": 1.5256,
"step": 12350
},
{
"epoch": 2.9328287606433303,
"grad_norm": 13.220331192016602,
"learning_rate": 1.0337650050263144e-05,
"loss": 1.522,
"step": 12400
},
{
"epoch": 2.944654683065279,
"grad_norm": 16.336288452148438,
"learning_rate": 1.0278516941635623e-05,
"loss": 1.465,
"step": 12450
},
{
"epoch": 2.956480605487228,
"grad_norm": 20.016876220703125,
"learning_rate": 1.0219383833008104e-05,
"loss": 1.6151,
"step": 12500
},
{
"epoch": 2.968306527909177,
"grad_norm": 17.370023727416992,
"learning_rate": 1.0160250724380582e-05,
"loss": 1.5521,
"step": 12550
},
{
"epoch": 2.980132450331126,
"grad_norm": 18.69423484802246,
"learning_rate": 1.010111761575306e-05,
"loss": 1.5657,
"step": 12600
},
{
"epoch": 2.991958372753075,
"grad_norm": 18.094669342041016,
"learning_rate": 1.004198450712554e-05,
"loss": 1.4935,
"step": 12650
},
{
"epoch": 3.0037842951750235,
"grad_norm": 15.485885620117188,
"learning_rate": 9.98285139849802e-06,
"loss": 1.4081,
"step": 12700
},
{
"epoch": 3.0156102175969726,
"grad_norm": 15.505888938903809,
"learning_rate": 9.9237182898705e-06,
"loss": 1.5243,
"step": 12750
},
{
"epoch": 3.0274361400189216,
"grad_norm": 16.799917221069336,
"learning_rate": 9.864585181242978e-06,
"loss": 1.4999,
"step": 12800
},
{
"epoch": 3.0392620624408706,
"grad_norm": 15.498218536376953,
"learning_rate": 9.805452072615459e-06,
"loss": 1.4139,
"step": 12850
},
{
"epoch": 3.051087984862819,
"grad_norm": 19.318891525268555,
"learning_rate": 9.746318963987937e-06,
"loss": 1.5522,
"step": 12900
},
{
"epoch": 3.062913907284768,
"grad_norm": 14.893320083618164,
"learning_rate": 9.687185855360418e-06,
"loss": 1.4865,
"step": 12950
},
{
"epoch": 3.074739829706717,
"grad_norm": 18.767566680908203,
"learning_rate": 9.628052746732897e-06,
"loss": 1.4755,
"step": 13000
},
{
"epoch": 3.0865657521286662,
"grad_norm": 14.623005867004395,
"learning_rate": 9.568919638105375e-06,
"loss": 1.4582,
"step": 13050
},
{
"epoch": 3.098391674550615,
"grad_norm": 14.217521667480469,
"learning_rate": 9.509786529477856e-06,
"loss": 1.4112,
"step": 13100
},
{
"epoch": 3.110217596972564,
"grad_norm": 13.287856101989746,
"learning_rate": 9.450653420850335e-06,
"loss": 1.4758,
"step": 13150
},
{
"epoch": 3.122043519394513,
"grad_norm": 14.488649368286133,
"learning_rate": 9.391520312222813e-06,
"loss": 1.4388,
"step": 13200
},
{
"epoch": 3.133869441816462,
"grad_norm": 15.88402271270752,
"learning_rate": 9.332387203595294e-06,
"loss": 1.4819,
"step": 13250
},
{
"epoch": 3.1456953642384105,
"grad_norm": 13.743453025817871,
"learning_rate": 9.273254094967773e-06,
"loss": 1.4525,
"step": 13300
},
{
"epoch": 3.1575212866603595,
"grad_norm": 16.949493408203125,
"learning_rate": 9.214120986340253e-06,
"loss": 1.4583,
"step": 13350
},
{
"epoch": 3.1693472090823085,
"grad_norm": 15.139965057373047,
"learning_rate": 9.154987877712732e-06,
"loss": 1.4714,
"step": 13400
},
{
"epoch": 3.1811731315042575,
"grad_norm": 18.97600746154785,
"learning_rate": 9.095854769085213e-06,
"loss": 1.4265,
"step": 13450
},
{
"epoch": 3.192999053926206,
"grad_norm": 16.3485107421875,
"learning_rate": 9.036721660457691e-06,
"loss": 1.454,
"step": 13500
},
{
"epoch": 3.204824976348155,
"grad_norm": 17.43102264404297,
"learning_rate": 8.97758855183017e-06,
"loss": 1.4506,
"step": 13550
},
{
"epoch": 3.216650898770104,
"grad_norm": 15.63143253326416,
"learning_rate": 8.91845544320265e-06,
"loss": 1.4055,
"step": 13600
},
{
"epoch": 3.228476821192053,
"grad_norm": 12.323601722717285,
"learning_rate": 8.85932233457513e-06,
"loss": 1.4729,
"step": 13650
},
{
"epoch": 3.2403027436140017,
"grad_norm": 17.078189849853516,
"learning_rate": 8.800189225947608e-06,
"loss": 1.4791,
"step": 13700
},
{
"epoch": 3.2521286660359507,
"grad_norm": 14.752788543701172,
"learning_rate": 8.741056117320089e-06,
"loss": 1.4962,
"step": 13750
},
{
"epoch": 3.2639545884578998,
"grad_norm": 12.897354125976562,
"learning_rate": 8.681923008692567e-06,
"loss": 1.4101,
"step": 13800
},
{
"epoch": 3.275780510879849,
"grad_norm": 12.985773086547852,
"learning_rate": 8.622789900065048e-06,
"loss": 1.4596,
"step": 13850
},
{
"epoch": 3.2876064333017974,
"grad_norm": 16.538026809692383,
"learning_rate": 8.563656791437527e-06,
"loss": 1.4207,
"step": 13900
},
{
"epoch": 3.2994323557237464,
"grad_norm": 20.395875930786133,
"learning_rate": 8.504523682810007e-06,
"loss": 1.4972,
"step": 13950
},
{
"epoch": 3.3112582781456954,
"grad_norm": 11.993943214416504,
"learning_rate": 8.445390574182486e-06,
"loss": 1.4473,
"step": 14000
},
{
"epoch": 3.3112582781456954,
"eval_runtime": 47.0766,
"eval_samples_per_second": 0.0,
"eval_steps_per_second": 0.0,
"eval_validation_loss": 3.9688115547590725,
"step": 14000
},
{
"epoch": 3.3230842005676444,
"grad_norm": 12.915935516357422,
"learning_rate": 8.386257465554965e-06,
"loss": 1.46,
"step": 14050
},
{
"epoch": 3.334910122989593,
"grad_norm": 19.571788787841797,
"learning_rate": 8.327124356927444e-06,
"loss": 1.5012,
"step": 14100
},
{
"epoch": 3.346736045411542,
"grad_norm": 17.244380950927734,
"learning_rate": 8.267991248299924e-06,
"loss": 1.5466,
"step": 14150
},
{
"epoch": 3.358561967833491,
"grad_norm": 18.117067337036133,
"learning_rate": 8.208858139672403e-06,
"loss": 1.5067,
"step": 14200
},
{
"epoch": 3.37038789025544,
"grad_norm": 14.23071575164795,
"learning_rate": 8.149725031044883e-06,
"loss": 1.4413,
"step": 14250
},
{
"epoch": 3.3822138126773886,
"grad_norm": 16.817371368408203,
"learning_rate": 8.090591922417362e-06,
"loss": 1.3448,
"step": 14300
},
{
"epoch": 3.3940397350993377,
"grad_norm": 21.39740753173828,
"learning_rate": 8.031458813789843e-06,
"loss": 1.3792,
"step": 14350
},
{
"epoch": 3.4058656575212867,
"grad_norm": 13.991111755371094,
"learning_rate": 7.972325705162321e-06,
"loss": 1.4019,
"step": 14400
},
{
"epoch": 3.4176915799432357,
"grad_norm": 14.572546005249023,
"learning_rate": 7.9131925965348e-06,
"loss": 1.4771,
"step": 14450
},
{
"epoch": 3.4295175023651847,
"grad_norm": 15.65616226196289,
"learning_rate": 7.854059487907279e-06,
"loss": 1.4383,
"step": 14500
},
{
"epoch": 3.4413434247871333,
"grad_norm": 16.871171951293945,
"learning_rate": 7.79492637927976e-06,
"loss": 1.4172,
"step": 14550
},
{
"epoch": 3.4531693472090823,
"grad_norm": 16.653839111328125,
"learning_rate": 7.735793270652238e-06,
"loss": 1.3284,
"step": 14600
},
{
"epoch": 3.4649952696310313,
"grad_norm": 18.008516311645508,
"learning_rate": 7.676660162024719e-06,
"loss": 1.3867,
"step": 14650
},
{
"epoch": 3.47682119205298,
"grad_norm": 21.629899978637695,
"learning_rate": 7.6175270533971976e-06,
"loss": 1.3931,
"step": 14700
},
{
"epoch": 3.488647114474929,
"grad_norm": 15.525995254516602,
"learning_rate": 7.558393944769677e-06,
"loss": 1.4429,
"step": 14750
},
{
"epoch": 3.500473036896878,
"grad_norm": 15.045352935791016,
"learning_rate": 7.499260836142157e-06,
"loss": 1.4665,
"step": 14800
},
{
"epoch": 3.512298959318827,
"grad_norm": 16.258941650390625,
"learning_rate": 7.440127727514636e-06,
"loss": 1.3232,
"step": 14850
},
{
"epoch": 3.524124881740776,
"grad_norm": 14.834844589233398,
"learning_rate": 7.380994618887115e-06,
"loss": 1.4112,
"step": 14900
},
{
"epoch": 3.5359508041627246,
"grad_norm": 18.840707778930664,
"learning_rate": 7.321861510259595e-06,
"loss": 1.3916,
"step": 14950
},
{
"epoch": 3.5477767265846736,
"grad_norm": 17.09494972229004,
"learning_rate": 7.2627284016320746e-06,
"loss": 1.3572,
"step": 15000
},
{
"epoch": 3.5596026490066226,
"grad_norm": 17.76523780822754,
"learning_rate": 7.203595293004554e-06,
"loss": 1.4414,
"step": 15050
},
{
"epoch": 3.571428571428571,
"grad_norm": 19.53270149230957,
"learning_rate": 7.144462184377034e-06,
"loss": 1.2716,
"step": 15100
},
{
"epoch": 3.58325449385052,
"grad_norm": 18.649320602416992,
"learning_rate": 7.085329075749513e-06,
"loss": 1.4043,
"step": 15150
},
{
"epoch": 3.595080416272469,
"grad_norm": 13.581181526184082,
"learning_rate": 7.026195967121992e-06,
"loss": 1.3686,
"step": 15200
},
{
"epoch": 3.6069063386944182,
"grad_norm": 21.46381950378418,
"learning_rate": 6.967062858494472e-06,
"loss": 1.3687,
"step": 15250
},
{
"epoch": 3.6187322611163673,
"grad_norm": 10.937467575073242,
"learning_rate": 6.907929749866951e-06,
"loss": 1.3183,
"step": 15300
},
{
"epoch": 3.630558183538316,
"grad_norm": 18.974475860595703,
"learning_rate": 6.84879664123943e-06,
"loss": 1.3712,
"step": 15350
},
{
"epoch": 3.642384105960265,
"grad_norm": 17.913204193115234,
"learning_rate": 6.78966353261191e-06,
"loss": 1.4006,
"step": 15400
},
{
"epoch": 3.654210028382214,
"grad_norm": 14.945576667785645,
"learning_rate": 6.73053042398439e-06,
"loss": 1.4326,
"step": 15450
},
{
"epoch": 3.666035950804163,
"grad_norm": 15.58818531036377,
"learning_rate": 6.671397315356869e-06,
"loss": 1.3116,
"step": 15500
},
{
"epoch": 3.677861873226112,
"grad_norm": 16.57988739013672,
"learning_rate": 6.612264206729349e-06,
"loss": 1.2975,
"step": 15550
},
{
"epoch": 3.6896877956480605,
"grad_norm": 13.658615112304688,
"learning_rate": 6.5531310981018285e-06,
"loss": 1.3709,
"step": 15600
},
{
"epoch": 3.7015137180700095,
"grad_norm": 16.559919357299805,
"learning_rate": 6.493997989474307e-06,
"loss": 1.3267,
"step": 15650
},
{
"epoch": 3.7133396404919585,
"grad_norm": 16.319732666015625,
"learning_rate": 6.434864880846786e-06,
"loss": 1.2947,
"step": 15700
},
{
"epoch": 3.725165562913907,
"grad_norm": 15.4765043258667,
"learning_rate": 6.375731772219266e-06,
"loss": 1.3524,
"step": 15750
},
{
"epoch": 3.736991485335856,
"grad_norm": 14.876737594604492,
"learning_rate": 6.316598663591745e-06,
"loss": 1.3092,
"step": 15800
},
{
"epoch": 3.748817407757805,
"grad_norm": 13.654143333435059,
"learning_rate": 6.257465554964225e-06,
"loss": 1.3635,
"step": 15850
},
{
"epoch": 3.760643330179754,
"grad_norm": 16.795425415039062,
"learning_rate": 6.198332446336705e-06,
"loss": 1.282,
"step": 15900
},
{
"epoch": 3.772469252601703,
"grad_norm": 12.707657814025879,
"learning_rate": 6.139199337709184e-06,
"loss": 1.3122,
"step": 15950
},
{
"epoch": 3.7842951750236518,
"grad_norm": 15.771327018737793,
"learning_rate": 6.080066229081664e-06,
"loss": 1.2944,
"step": 16000
},
{
"epoch": 3.7842951750236518,
"eval_runtime": 46.7062,
"eval_samples_per_second": 0.0,
"eval_steps_per_second": 0.0,
"eval_validation_loss": 3.97229260179494,
"step": 16000
},
{
"epoch": 3.796121097445601,
"grad_norm": 15.549239158630371,
"learning_rate": 6.020933120454144e-06,
"loss": 1.3878,
"step": 16050
},
{
"epoch": 3.80794701986755,
"grad_norm": 11.558497428894043,
"learning_rate": 5.9618000118266215e-06,
"loss": 1.2978,
"step": 16100
},
{
"epoch": 3.8197729422894984,
"grad_norm": 17.571189880371094,
"learning_rate": 5.902666903199101e-06,
"loss": 1.3128,
"step": 16150
},
{
"epoch": 3.8315988647114474,
"grad_norm": 16.608991622924805,
"learning_rate": 5.843533794571581e-06,
"loss": 1.317,
"step": 16200
},
{
"epoch": 3.8434247871333964,
"grad_norm": 17.64645004272461,
"learning_rate": 5.7844006859440605e-06,
"loss": 1.3225,
"step": 16250
},
{
"epoch": 3.8552507095553454,
"grad_norm": 16.340919494628906,
"learning_rate": 5.72526757731654e-06,
"loss": 1.3339,
"step": 16300
},
{
"epoch": 3.8670766319772945,
"grad_norm": 17.24504280090332,
"learning_rate": 5.66613446868902e-06,
"loss": 1.3137,
"step": 16350
},
{
"epoch": 3.878902554399243,
"grad_norm": 16.168750762939453,
"learning_rate": 5.607001360061499e-06,
"loss": 1.3128,
"step": 16400
},
{
"epoch": 3.890728476821192,
"grad_norm": 20.071321487426758,
"learning_rate": 5.547868251433979e-06,
"loss": 1.3262,
"step": 16450
},
{
"epoch": 3.902554399243141,
"grad_norm": 19.031503677368164,
"learning_rate": 5.488735142806457e-06,
"loss": 1.2235,
"step": 16500
},
{
"epoch": 3.9143803216650896,
"grad_norm": 18.36022186279297,
"learning_rate": 5.429602034178937e-06,
"loss": 1.2619,
"step": 16550
},
{
"epoch": 3.9262062440870387,
"grad_norm": 17.923831939697266,
"learning_rate": 5.370468925551416e-06,
"loss": 1.3289,
"step": 16600
},
{
"epoch": 3.9380321665089877,
"grad_norm": 13.550859451293945,
"learning_rate": 5.311335816923896e-06,
"loss": 1.2437,
"step": 16650
},
{
"epoch": 3.9498580889309367,
"grad_norm": 12.4674654006958,
"learning_rate": 5.2522027082963755e-06,
"loss": 1.2886,
"step": 16700
},
{
"epoch": 3.9616840113528857,
"grad_norm": 18.66042709350586,
"learning_rate": 5.193069599668855e-06,
"loss": 1.3309,
"step": 16750
},
{
"epoch": 3.9735099337748343,
"grad_norm": 15.646864891052246,
"learning_rate": 5.133936491041335e-06,
"loss": 1.3457,
"step": 16800
},
{
"epoch": 3.9853358561967833,
"grad_norm": 17.00884246826172,
"learning_rate": 5.0748033824138144e-06,
"loss": 1.3184,
"step": 16850
},
{
"epoch": 3.9971617786187323,
"grad_norm": 13.399270057678223,
"learning_rate": 5.015670273786294e-06,
"loss": 1.2087,
"step": 16900
},
{
"epoch": 4.008987701040681,
"grad_norm": 13.706061363220215,
"learning_rate": 4.956537165158773e-06,
"loss": 1.229,
"step": 16950
},
{
"epoch": 4.02081362346263,
"grad_norm": 22.725217819213867,
"learning_rate": 4.8974040565312525e-06,
"loss": 1.3235,
"step": 17000
},
{
"epoch": 4.032639545884579,
"grad_norm": 16.072246551513672,
"learning_rate": 4.838270947903731e-06,
"loss": 1.195,
"step": 17050
},
{
"epoch": 4.0444654683065275,
"grad_norm": 17.015745162963867,
"learning_rate": 4.779137839276211e-06,
"loss": 1.2793,
"step": 17100
},
{
"epoch": 4.056291390728477,
"grad_norm": 15.687799453735352,
"learning_rate": 4.7200047306486906e-06,
"loss": 1.2719,
"step": 17150
},
{
"epoch": 4.068117313150426,
"grad_norm": 11.79020881652832,
"learning_rate": 4.66087162202117e-06,
"loss": 1.2701,
"step": 17200
},
{
"epoch": 4.079943235572375,
"grad_norm": 14.385472297668457,
"learning_rate": 4.601738513393649e-06,
"loss": 1.2593,
"step": 17250
},
{
"epoch": 4.091769157994324,
"grad_norm": 18.47262954711914,
"learning_rate": 4.542605404766129e-06,
"loss": 1.2324,
"step": 17300
},
{
"epoch": 4.103595080416272,
"grad_norm": 22.801834106445312,
"learning_rate": 4.483472296138608e-06,
"loss": 1.2338,
"step": 17350
},
{
"epoch": 4.115421002838222,
"grad_norm": 16.11665916442871,
"learning_rate": 4.424339187511088e-06,
"loss": 1.2338,
"step": 17400
},
{
"epoch": 4.12724692526017,
"grad_norm": 18.113365173339844,
"learning_rate": 4.365206078883567e-06,
"loss": 1.2595,
"step": 17450
},
{
"epoch": 4.139072847682119,
"grad_norm": 15.56670093536377,
"learning_rate": 4.306072970256046e-06,
"loss": 1.2434,
"step": 17500
},
{
"epoch": 4.150898770104068,
"grad_norm": 18.501914978027344,
"learning_rate": 4.246939861628526e-06,
"loss": 1.268,
"step": 17550
},
{
"epoch": 4.162724692526017,
"grad_norm": 16.622150421142578,
"learning_rate": 4.187806753001006e-06,
"loss": 1.2345,
"step": 17600
},
{
"epoch": 4.174550614947966,
"grad_norm": 19.019207000732422,
"learning_rate": 4.128673644373484e-06,
"loss": 1.228,
"step": 17650
},
{
"epoch": 4.186376537369915,
"grad_norm": 11.55809211730957,
"learning_rate": 4.069540535745964e-06,
"loss": 1.235,
"step": 17700
},
{
"epoch": 4.1982024597918635,
"grad_norm": 14.763603210449219,
"learning_rate": 4.010407427118444e-06,
"loss": 1.2198,
"step": 17750
},
{
"epoch": 4.210028382213813,
"grad_norm": 17.480113983154297,
"learning_rate": 3.951274318490923e-06,
"loss": 1.263,
"step": 17800
},
{
"epoch": 4.2218543046357615,
"grad_norm": 17.487497329711914,
"learning_rate": 3.892141209863403e-06,
"loss": 1.2288,
"step": 17850
},
{
"epoch": 4.23368022705771,
"grad_norm": 14.157654762268066,
"learning_rate": 3.833008101235882e-06,
"loss": 1.2251,
"step": 17900
},
{
"epoch": 4.2455061494796595,
"grad_norm": 21.731857299804688,
"learning_rate": 3.773874992608362e-06,
"loss": 1.2796,
"step": 17950
},
{
"epoch": 4.257332071901608,
"grad_norm": 17.268417358398438,
"learning_rate": 3.7147418839808415e-06,
"loss": 1.2934,
"step": 18000
},
{
"epoch": 4.257332071901608,
"eval_runtime": 47.1593,
"eval_samples_per_second": 0.0,
"eval_steps_per_second": 0.0,
"eval_validation_loss": 3.9484923051611727,
"step": 18000
},
{
"epoch": 4.269157994323558,
"grad_norm": 12.740385055541992,
"learning_rate": 3.6556087753533203e-06,
"loss": 1.2197,
"step": 18050
},
{
"epoch": 4.280983916745506,
"grad_norm": 17.239517211914062,
"learning_rate": 3.5964756667258e-06,
"loss": 1.1908,
"step": 18100
},
{
"epoch": 4.292809839167455,
"grad_norm": 16.485107421875,
"learning_rate": 3.5373425580982795e-06,
"loss": 1.2549,
"step": 18150
},
{
"epoch": 4.304635761589404,
"grad_norm": 17.04962921142578,
"learning_rate": 3.478209449470759e-06,
"loss": 1.3468,
"step": 18200
},
{
"epoch": 4.316461684011353,
"grad_norm": 14.987895965576172,
"learning_rate": 3.419076340843238e-06,
"loss": 1.2323,
"step": 18250
},
{
"epoch": 4.328287606433301,
"grad_norm": 14.840313911437988,
"learning_rate": 3.3599432322157176e-06,
"loss": 1.2897,
"step": 18300
},
{
"epoch": 4.340113528855251,
"grad_norm": 17.09177589416504,
"learning_rate": 3.3008101235881973e-06,
"loss": 1.3231,
"step": 18350
},
{
"epoch": 4.351939451277199,
"grad_norm": 16.76932716369629,
"learning_rate": 3.241677014960677e-06,
"loss": 1.3587,
"step": 18400
},
{
"epoch": 4.363765373699149,
"grad_norm": 17.611955642700195,
"learning_rate": 3.1825439063331565e-06,
"loss": 1.2639,
"step": 18450
},
{
"epoch": 4.375591296121097,
"grad_norm": 11.352503776550293,
"learning_rate": 3.1234107977056353e-06,
"loss": 1.2244,
"step": 18500
},
{
"epoch": 4.387417218543046,
"grad_norm": 17.059810638427734,
"learning_rate": 3.064277689078115e-06,
"loss": 1.1932,
"step": 18550
},
{
"epoch": 4.3992431409649955,
"grad_norm": 15.7676420211792,
"learning_rate": 3.0051445804505946e-06,
"loss": 1.2072,
"step": 18600
},
{
"epoch": 4.411069063386944,
"grad_norm": 20.51708984375,
"learning_rate": 2.9460114718230742e-06,
"loss": 1.2257,
"step": 18650
},
{
"epoch": 4.4228949858088935,
"grad_norm": 9.713994979858398,
"learning_rate": 2.8868783631955535e-06,
"loss": 1.2368,
"step": 18700
},
{
"epoch": 4.434720908230842,
"grad_norm": 17.381057739257812,
"learning_rate": 2.8277452545680327e-06,
"loss": 1.2796,
"step": 18750
},
{
"epoch": 4.446546830652791,
"grad_norm": 17.901290893554688,
"learning_rate": 2.7686121459405123e-06,
"loss": 1.1617,
"step": 18800
},
{
"epoch": 4.45837275307474,
"grad_norm": 14.669180870056152,
"learning_rate": 2.709479037312992e-06,
"loss": 1.238,
"step": 18850
},
{
"epoch": 4.470198675496689,
"grad_norm": 19.362512588500977,
"learning_rate": 2.650345928685471e-06,
"loss": 1.1765,
"step": 18900
},
{
"epoch": 4.482024597918637,
"grad_norm": 16.083276748657227,
"learning_rate": 2.591212820057951e-06,
"loss": 1.2626,
"step": 18950
},
{
"epoch": 4.493850520340587,
"grad_norm": 12.657955169677734,
"learning_rate": 2.53207971143043e-06,
"loss": 1.2582,
"step": 19000
},
{
"epoch": 4.505676442762535,
"grad_norm": 19.395004272460938,
"learning_rate": 2.4729466028029097e-06,
"loss": 1.2478,
"step": 19050
},
{
"epoch": 4.517502365184484,
"grad_norm": 19.803897857666016,
"learning_rate": 2.413813494175389e-06,
"loss": 1.1628,
"step": 19100
},
{
"epoch": 4.529328287606433,
"grad_norm": 18.098979949951172,
"learning_rate": 2.3546803855478685e-06,
"loss": 1.251,
"step": 19150
},
{
"epoch": 4.541154210028382,
"grad_norm": 20.26512908935547,
"learning_rate": 2.295547276920348e-06,
"loss": 1.208,
"step": 19200
},
{
"epoch": 4.552980132450331,
"grad_norm": 11.94166088104248,
"learning_rate": 2.2364141682928274e-06,
"loss": 1.2535,
"step": 19250
},
{
"epoch": 4.56480605487228,
"grad_norm": 15.473821640014648,
"learning_rate": 2.177281059665307e-06,
"loss": 1.1903,
"step": 19300
},
{
"epoch": 4.5766319772942285,
"grad_norm": 14.091665267944336,
"learning_rate": 2.1181479510377862e-06,
"loss": 1.1725,
"step": 19350
},
{
"epoch": 4.588457899716178,
"grad_norm": 15.09231948852539,
"learning_rate": 2.059014842410266e-06,
"loss": 1.2023,
"step": 19400
},
{
"epoch": 4.600283822138127,
"grad_norm": 19.047542572021484,
"learning_rate": 1.999881733782745e-06,
"loss": 1.1607,
"step": 19450
},
{
"epoch": 4.612109744560076,
"grad_norm": 15.40837574005127,
"learning_rate": 1.9407486251552247e-06,
"loss": 1.2483,
"step": 19500
},
{
"epoch": 4.623935666982025,
"grad_norm": 16.487464904785156,
"learning_rate": 1.881615516527704e-06,
"loss": 1.1202,
"step": 19550
},
{
"epoch": 4.635761589403973,
"grad_norm": 18.49724006652832,
"learning_rate": 1.8224824079001836e-06,
"loss": 1.2428,
"step": 19600
},
{
"epoch": 4.647587511825923,
"grad_norm": 13.098505973815918,
"learning_rate": 1.7633492992726628e-06,
"loss": 1.2413,
"step": 19650
},
{
"epoch": 4.659413434247871,
"grad_norm": 14.599630355834961,
"learning_rate": 1.7042161906451424e-06,
"loss": 1.1916,
"step": 19700
},
{
"epoch": 4.671239356669821,
"grad_norm": 13.156811714172363,
"learning_rate": 1.6450830820176216e-06,
"loss": 1.1738,
"step": 19750
},
{
"epoch": 4.683065279091769,
"grad_norm": 12.79720687866211,
"learning_rate": 1.5859499733901013e-06,
"loss": 1.1718,
"step": 19800
},
{
"epoch": 4.694891201513718,
"grad_norm": 20.443012237548828,
"learning_rate": 1.5268168647625805e-06,
"loss": 1.2093,
"step": 19850
},
{
"epoch": 4.706717123935667,
"grad_norm": 14.799368858337402,
"learning_rate": 1.4676837561350601e-06,
"loss": 1.1457,
"step": 19900
},
{
"epoch": 4.718543046357616,
"grad_norm": 12.656880378723145,
"learning_rate": 1.4085506475075394e-06,
"loss": 1.1704,
"step": 19950
},
{
"epoch": 4.7303689687795645,
"grad_norm": 17.24571418762207,
"learning_rate": 1.349417538880019e-06,
"loss": 1.1767,
"step": 20000
},
{
"epoch": 4.7303689687795645,
"eval_runtime": 47.055,
"eval_samples_per_second": 0.0,
"eval_steps_per_second": 0.0,
"eval_validation_loss": 3.9691287893885456,
"step": 20000
},
{
"epoch": 4.742194891201514,
"grad_norm": 15.378254890441895,
"learning_rate": 1.2902844302524986e-06,
"loss": 1.1956,
"step": 20050
},
{
"epoch": 4.7540208136234625,
"grad_norm": 17.969493865966797,
"learning_rate": 1.2311513216249778e-06,
"loss": 1.1815,
"step": 20100
},
{
"epoch": 4.765846736045411,
"grad_norm": 18.55719757080078,
"learning_rate": 1.1720182129974573e-06,
"loss": 1.1376,
"step": 20150
},
{
"epoch": 4.7776726584673606,
"grad_norm": 17.548654556274414,
"learning_rate": 1.1128851043699367e-06,
"loss": 1.1839,
"step": 20200
},
{
"epoch": 4.789498580889309,
"grad_norm": 21.243549346923828,
"learning_rate": 1.0537519957424163e-06,
"loss": 1.1722,
"step": 20250
},
{
"epoch": 4.801324503311259,
"grad_norm": 12.436286926269531,
"learning_rate": 9.946188871148958e-07,
"loss": 1.2525,
"step": 20300
},
{
"epoch": 4.813150425733207,
"grad_norm": 12.564568519592285,
"learning_rate": 9.354857784873751e-07,
"loss": 1.1445,
"step": 20350
},
{
"epoch": 4.824976348155156,
"grad_norm": 19.906652450561523,
"learning_rate": 8.763526698598545e-07,
"loss": 1.1819,
"step": 20400
},
{
"epoch": 4.836802270577105,
"grad_norm": 17.435781478881836,
"learning_rate": 8.17219561232334e-07,
"loss": 1.213,
"step": 20450
},
{
"epoch": 4.848628192999054,
"grad_norm": 14.816115379333496,
"learning_rate": 7.580864526048135e-07,
"loss": 1.1366,
"step": 20500
},
{
"epoch": 4.860454115421003,
"grad_norm": 14.993414878845215,
"learning_rate": 6.989533439772929e-07,
"loss": 1.2219,
"step": 20550
},
{
"epoch": 4.872280037842952,
"grad_norm": 17.53949737548828,
"learning_rate": 6.398202353497723e-07,
"loss": 1.1501,
"step": 20600
},
{
"epoch": 4.8841059602649,
"grad_norm": 14.761266708374023,
"learning_rate": 5.806871267222519e-07,
"loss": 1.1949,
"step": 20650
},
{
"epoch": 4.89593188268685,
"grad_norm": 15.1113920211792,
"learning_rate": 5.215540180947313e-07,
"loss": 1.1757,
"step": 20700
},
{
"epoch": 4.907757805108798,
"grad_norm": 17.890682220458984,
"learning_rate": 4.624209094672107e-07,
"loss": 1.1029,
"step": 20750
},
{
"epoch": 4.919583727530747,
"grad_norm": 16.85039710998535,
"learning_rate": 4.032878008396902e-07,
"loss": 1.1426,
"step": 20800
},
{
"epoch": 4.9314096499526965,
"grad_norm": 22.656349182128906,
"learning_rate": 3.441546922121696e-07,
"loss": 1.2021,
"step": 20850
},
{
"epoch": 4.943235572374645,
"grad_norm": 19.370864868164062,
"learning_rate": 2.8502158358464905e-07,
"loss": 1.1034,
"step": 20900
},
{
"epoch": 4.955061494796594,
"grad_norm": 13.943963050842285,
"learning_rate": 2.258884749571285e-07,
"loss": 1.2271,
"step": 20950
},
{
"epoch": 4.966887417218543,
"grad_norm": 14.200597763061523,
"learning_rate": 1.6675536632960799e-07,
"loss": 1.2032,
"step": 21000
},
{
"epoch": 4.978713339640492,
"grad_norm": 12.472103118896484,
"learning_rate": 1.076222577020874e-07,
"loss": 1.2036,
"step": 21050
},
{
"epoch": 4.990539262062441,
"grad_norm": 15.718477249145508,
"learning_rate": 4.848914907456685e-08,
"loss": 1.1324,
"step": 21100
}
],
"logging_steps": 50,
"max_steps": 21140,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}