{
"best_global_step": 26000,
"best_metric": 1.9807677268981934,
"best_model_checkpoint": "./medical_qwen_finetuned_improved/checkpoint-26000",
"epoch": 7.9997372273734,
"eval_steps": 100,
"global_step": 26632,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00750778933143136,
"grad_norm": 5.094513416290283,
"learning_rate": 1.6893477240732053e-07,
"loss": 3.2412,
"step": 25
},
{
"epoch": 0.01501557866286272,
"grad_norm": 3.9722039699554443,
"learning_rate": 3.4490849366494603e-07,
"loss": 3.2095,
"step": 50
},
{
"epoch": 0.02252336799429408,
"grad_norm": 3.527215003967285,
"learning_rate": 5.208822149225716e-07,
"loss": 3.1277,
"step": 75
},
{
"epoch": 0.03003115732572544,
"grad_norm": 2.680919647216797,
"learning_rate": 6.968559361801971e-07,
"loss": 3.0231,
"step": 100
},
{
"epoch": 0.03003115732572544,
"eval_loss": 2.9903318881988525,
"eval_runtime": 319.3002,
"eval_samples_per_second": 17.567,
"eval_steps_per_second": 4.394,
"step": 100
},
{
"epoch": 0.0375389466571568,
"grad_norm": 2.085681438446045,
"learning_rate": 8.728296574378227e-07,
"loss": 2.9442,
"step": 125
},
{
"epoch": 0.04504673598858816,
"grad_norm": 1.8346056938171387,
"learning_rate": 1.0488033786954481e-06,
"loss": 2.8133,
"step": 150
},
{
"epoch": 0.05255452532001952,
"grad_norm": 1.7170641422271729,
"learning_rate": 1.2247770999530738e-06,
"loss": 2.741,
"step": 175
},
{
"epoch": 0.06006231465145088,
"grad_norm": 1.7041053771972656,
"learning_rate": 1.4007508212106992e-06,
"loss": 2.6097,
"step": 200
},
{
"epoch": 0.06006231465145088,
"eval_loss": 2.5728588104248047,
"eval_runtime": 244.8867,
"eval_samples_per_second": 22.904,
"eval_steps_per_second": 5.729,
"step": 200
},
{
"epoch": 0.06757010398288224,
"grad_norm": 1.1266319751739502,
"learning_rate": 1.5767245424683247e-06,
"loss": 2.5265,
"step": 225
},
{
"epoch": 0.0750778933143136,
"grad_norm": 0.9362080097198486,
"learning_rate": 1.7526982637259503e-06,
"loss": 2.4491,
"step": 250
},
{
"epoch": 0.08258568264574496,
"grad_norm": 0.5819249153137207,
"learning_rate": 1.928671984983576e-06,
"loss": 2.3677,
"step": 275
},
{
"epoch": 0.09009347197717632,
"grad_norm": 0.5669568777084351,
"learning_rate": 2.1046457062412012e-06,
"loss": 2.3315,
"step": 300
},
{
"epoch": 0.09009347197717632,
"eval_loss": 2.288224458694458,
"eval_runtime": 244.739,
"eval_samples_per_second": 22.918,
"eval_steps_per_second": 5.733,
"step": 300
},
{
"epoch": 0.09760126130860768,
"grad_norm": 0.43035316467285156,
"learning_rate": 2.280619427498827e-06,
"loss": 2.2718,
"step": 325
},
{
"epoch": 0.10510905064003905,
"grad_norm": 0.41122695803642273,
"learning_rate": 2.4565931487564526e-06,
"loss": 2.2705,
"step": 350
},
{
"epoch": 0.1126168399714704,
"grad_norm": 0.38599300384521484,
"learning_rate": 2.632566870014078e-06,
"loss": 2.2149,
"step": 375
},
{
"epoch": 0.12012462930290176,
"grad_norm": 0.36087512969970703,
"learning_rate": 2.8085405912717034e-06,
"loss": 2.242,
"step": 400
},
{
"epoch": 0.12012462930290176,
"eval_loss": 2.2342677116394043,
"eval_runtime": 244.7563,
"eval_samples_per_second": 22.917,
"eval_steps_per_second": 5.732,
"step": 400
},
{
"epoch": 0.1276324186343331,
"grad_norm": 0.39531558752059937,
"learning_rate": 2.984514312529329e-06,
"loss": 2.2117,
"step": 425
},
{
"epoch": 0.13514020796576448,
"grad_norm": 0.4547671675682068,
"learning_rate": 3.1604880337869548e-06,
"loss": 2.2321,
"step": 450
},
{
"epoch": 0.14264799729719585,
"grad_norm": 0.37058719992637634,
"learning_rate": 3.33646175504458e-06,
"loss": 2.2581,
"step": 475
},
{
"epoch": 0.1501557866286272,
"grad_norm": 0.3959207534790039,
"learning_rate": 3.5124354763022057e-06,
"loss": 2.2329,
"step": 500
},
{
"epoch": 0.1501557866286272,
"eval_loss": 2.2151083946228027,
"eval_runtime": 244.7784,
"eval_samples_per_second": 22.915,
"eval_steps_per_second": 5.732,
"step": 500
},
{
"epoch": 0.15766357596005856,
"grad_norm": 0.4138086438179016,
"learning_rate": 3.688409197559831e-06,
"loss": 2.235,
"step": 525
},
{
"epoch": 0.16517136529148993,
"grad_norm": 0.4153759777545929,
"learning_rate": 3.864382918817457e-06,
"loss": 2.2237,
"step": 550
},
{
"epoch": 0.17267915462292127,
"grad_norm": 0.4081685245037079,
"learning_rate": 4.0403566400750816e-06,
"loss": 2.2002,
"step": 575
},
{
"epoch": 0.18018694395435264,
"grad_norm": 0.38760289549827576,
"learning_rate": 4.216330361332708e-06,
"loss": 2.2159,
"step": 600
},
{
"epoch": 0.18018694395435264,
"eval_loss": 2.204134464263916,
"eval_runtime": 244.6583,
"eval_samples_per_second": 22.926,
"eval_steps_per_second": 5.735,
"step": 600
},
{
"epoch": 0.187694733285784,
"grad_norm": 0.38073575496673584,
"learning_rate": 4.392304082590333e-06,
"loss": 2.1967,
"step": 625
},
{
"epoch": 0.19520252261721535,
"grad_norm": 0.4018952250480652,
"learning_rate": 4.568277803847959e-06,
"loss": 2.1968,
"step": 650
},
{
"epoch": 0.20271031194864672,
"grad_norm": 0.4137013256549835,
"learning_rate": 4.744251525105584e-06,
"loss": 2.1997,
"step": 675
},
{
"epoch": 0.2102181012800781,
"grad_norm": 0.411466509103775,
"learning_rate": 4.92022524636321e-06,
"loss": 2.2099,
"step": 700
},
{
"epoch": 0.2102181012800781,
"eval_loss": 2.195915699005127,
"eval_runtime": 244.7304,
"eval_samples_per_second": 22.919,
"eval_steps_per_second": 5.733,
"step": 700
},
{
"epoch": 0.21772589061150943,
"grad_norm": 0.41950109601020813,
"learning_rate": 5.096198967620835e-06,
"loss": 2.1777,
"step": 725
},
{
"epoch": 0.2252336799429408,
"grad_norm": 0.41122791171073914,
"learning_rate": 5.272172688878461e-06,
"loss": 2.2063,
"step": 750
},
{
"epoch": 0.23274146927437217,
"grad_norm": 0.44570910930633545,
"learning_rate": 5.448146410136086e-06,
"loss": 2.1962,
"step": 775
},
{
"epoch": 0.2402492586058035,
"grad_norm": 0.40760159492492676,
"learning_rate": 5.624120131393712e-06,
"loss": 2.2007,
"step": 800
},
{
"epoch": 0.2402492586058035,
"eval_loss": 2.1890077590942383,
"eval_runtime": 244.6573,
"eval_samples_per_second": 22.926,
"eval_steps_per_second": 5.735,
"step": 800
},
{
"epoch": 0.24775704793723488,
"grad_norm": 0.4488222897052765,
"learning_rate": 5.800093852651337e-06,
"loss": 2.2008,
"step": 825
},
{
"epoch": 0.2552648372686662,
"grad_norm": 0.4745488166809082,
"learning_rate": 5.976067573908963e-06,
"loss": 2.2013,
"step": 850
},
{
"epoch": 0.2627726266000976,
"grad_norm": 0.45855531096458435,
"learning_rate": 6.152041295166589e-06,
"loss": 2.1824,
"step": 875
},
{
"epoch": 0.27028041593152896,
"grad_norm": 0.4843423366546631,
"learning_rate": 6.328015016424214e-06,
"loss": 2.1872,
"step": 900
},
{
"epoch": 0.27028041593152896,
"eval_loss": 2.182678699493408,
"eval_runtime": 244.7929,
"eval_samples_per_second": 22.913,
"eval_steps_per_second": 5.731,
"step": 900
},
{
"epoch": 0.27778820526296033,
"grad_norm": 0.4567316770553589,
"learning_rate": 6.5039887376818395e-06,
"loss": 2.184,
"step": 925
},
{
"epoch": 0.2852959945943917,
"grad_norm": 0.46967923641204834,
"learning_rate": 6.679962458939465e-06,
"loss": 2.1739,
"step": 950
},
{
"epoch": 0.29280378392582307,
"grad_norm": 0.4461369216442108,
"learning_rate": 6.85593618019709e-06,
"loss": 2.1818,
"step": 975
},
{
"epoch": 0.3003115732572544,
"grad_norm": 0.4638686776161194,
"learning_rate": 7.031909901454717e-06,
"loss": 2.194,
"step": 1000
},
{
"epoch": 0.3003115732572544,
"eval_loss": 2.1770713329315186,
"eval_runtime": 244.571,
"eval_samples_per_second": 22.934,
"eval_steps_per_second": 5.737,
"step": 1000
},
{
"epoch": 0.30781936258868575,
"grad_norm": 0.4287603199481964,
"learning_rate": 7.207883622712341e-06,
"loss": 2.1563,
"step": 1025
},
{
"epoch": 0.3153271519201171,
"grad_norm": 0.4473567605018616,
"learning_rate": 7.383857343969968e-06,
"loss": 2.1661,
"step": 1050
},
{
"epoch": 0.3228349412515485,
"grad_norm": 0.5221546292304993,
"learning_rate": 7.559831065227592e-06,
"loss": 2.1744,
"step": 1075
},
{
"epoch": 0.33034273058297986,
"grad_norm": 0.4909228980541229,
"learning_rate": 7.735804786485218e-06,
"loss": 2.1729,
"step": 1100
},
{
"epoch": 0.33034273058297986,
"eval_loss": 2.1715242862701416,
"eval_runtime": 245.0613,
"eval_samples_per_second": 22.888,
"eval_steps_per_second": 5.725,
"step": 1100
},
{
"epoch": 0.33785051991441123,
"grad_norm": 0.5596965551376343,
"learning_rate": 7.911778507742844e-06,
"loss": 2.1615,
"step": 1125
},
{
"epoch": 0.34535830924584254,
"grad_norm": 0.4983489215373993,
"learning_rate": 8.08775222900047e-06,
"loss": 2.1717,
"step": 1150
},
{
"epoch": 0.3528660985772739,
"grad_norm": 0.485856831073761,
"learning_rate": 8.263725950258095e-06,
"loss": 2.1507,
"step": 1175
},
{
"epoch": 0.3603738879087053,
"grad_norm": 0.5247727632522583,
"learning_rate": 8.43969967151572e-06,
"loss": 2.1939,
"step": 1200
},
{
"epoch": 0.3603738879087053,
"eval_loss": 2.1655497550964355,
"eval_runtime": 244.6519,
"eval_samples_per_second": 22.926,
"eval_steps_per_second": 5.735,
"step": 1200
},
{
"epoch": 0.36788167724013665,
"grad_norm": 0.5695153474807739,
"learning_rate": 8.615673392773347e-06,
"loss": 2.1827,
"step": 1225
},
{
"epoch": 0.375389466571568,
"grad_norm": 0.5112013816833496,
"learning_rate": 8.791647114030971e-06,
"loss": 2.1748,
"step": 1250
},
{
"epoch": 0.3828972559029994,
"grad_norm": 0.46719494462013245,
"learning_rate": 8.967620835288597e-06,
"loss": 2.1856,
"step": 1275
},
{
"epoch": 0.3904050452344307,
"grad_norm": 0.48362448811531067,
"learning_rate": 9.143594556546222e-06,
"loss": 2.1766,
"step": 1300
},
{
"epoch": 0.3904050452344307,
"eval_loss": 2.160381317138672,
"eval_runtime": 245.4635,
"eval_samples_per_second": 22.851,
"eval_steps_per_second": 5.716,
"step": 1300
},
{
"epoch": 0.3979128345658621,
"grad_norm": 0.5096102356910706,
"learning_rate": 9.31956827780385e-06,
"loss": 2.1664,
"step": 1325
},
{
"epoch": 0.40542062389729344,
"grad_norm": 0.6038557887077332,
"learning_rate": 9.495541999061475e-06,
"loss": 2.1617,
"step": 1350
},
{
"epoch": 0.4129284132287248,
"grad_norm": 0.5893401503562927,
"learning_rate": 9.671515720319098e-06,
"loss": 2.1473,
"step": 1375
},
{
"epoch": 0.4204362025601562,
"grad_norm": 0.5666929483413696,
"learning_rate": 9.847489441576724e-06,
"loss": 2.195,
"step": 1400
},
{
"epoch": 0.4204362025601562,
"eval_loss": 2.154971122741699,
"eval_runtime": 244.6458,
"eval_samples_per_second": 22.927,
"eval_steps_per_second": 5.735,
"step": 1400
},
{
"epoch": 0.42794399189158755,
"grad_norm": 0.6115418672561646,
"learning_rate": 1.0023463162834351e-05,
"loss": 2.1356,
"step": 1425
},
{
"epoch": 0.43545178122301886,
"grad_norm": 0.6469879150390625,
"learning_rate": 1.0199436884091976e-05,
"loss": 2.1755,
"step": 1450
},
{
"epoch": 0.44295957055445023,
"grad_norm": 0.5257688760757446,
"learning_rate": 1.0375410605349602e-05,
"loss": 2.1731,
"step": 1475
},
{
"epoch": 0.4504673598858816,
"grad_norm": 0.5619986653327942,
"learning_rate": 1.0551384326607226e-05,
"loss": 2.172,
"step": 1500
},
{
"epoch": 0.4504673598858816,
"eval_loss": 2.15023136138916,
"eval_runtime": 244.8519,
"eval_samples_per_second": 22.908,
"eval_steps_per_second": 5.73,
"step": 1500
},
{
"epoch": 0.45797514921731297,
"grad_norm": 0.5681572556495667,
"learning_rate": 1.0727358047864853e-05,
"loss": 2.1556,
"step": 1525
},
{
"epoch": 0.46548293854874434,
"grad_norm": 0.6319741010665894,
"learning_rate": 1.0903331769122478e-05,
"loss": 2.1369,
"step": 1550
},
{
"epoch": 0.4729907278801757,
"grad_norm": 0.5815430283546448,
"learning_rate": 1.1079305490380104e-05,
"loss": 2.1589,
"step": 1575
},
{
"epoch": 0.480498517211607,
"grad_norm": 0.5797183513641357,
"learning_rate": 1.1255279211637729e-05,
"loss": 2.1676,
"step": 1600
},
{
"epoch": 0.480498517211607,
"eval_loss": 2.1454966068267822,
"eval_runtime": 244.6098,
"eval_samples_per_second": 22.93,
"eval_steps_per_second": 5.736,
"step": 1600
},
{
"epoch": 0.4880063065430384,
"grad_norm": 0.6238908171653748,
"learning_rate": 1.1431252932895355e-05,
"loss": 2.1451,
"step": 1625
},
{
"epoch": 0.49551409587446976,
"grad_norm": 0.6378119587898254,
"learning_rate": 1.160722665415298e-05,
"loss": 2.1393,
"step": 1650
},
{
"epoch": 0.5030218852059011,
"grad_norm": 0.5630180239677429,
"learning_rate": 1.1783200375410605e-05,
"loss": 2.1532,
"step": 1675
},
{
"epoch": 0.5105296745373324,
"grad_norm": 0.5868392586708069,
"learning_rate": 1.1959174096668231e-05,
"loss": 2.1473,
"step": 1700
},
{
"epoch": 0.5105296745373324,
"eval_loss": 2.141220808029175,
"eval_runtime": 244.9172,
"eval_samples_per_second": 22.902,
"eval_steps_per_second": 5.728,
"step": 1700
},
{
"epoch": 0.5180374638687638,
"grad_norm": 0.6577850580215454,
"learning_rate": 1.2135147817925858e-05,
"loss": 2.1379,
"step": 1725
},
{
"epoch": 0.5255452532001952,
"grad_norm": 0.6026327013969421,
"learning_rate": 1.2311121539183482e-05,
"loss": 2.1464,
"step": 1750
},
{
"epoch": 0.5330530425316266,
"grad_norm": 0.60736483335495,
"learning_rate": 1.2487095260441107e-05,
"loss": 2.1588,
"step": 1775
},
{
"epoch": 0.5405608318630579,
"grad_norm": 0.6438941359519958,
"learning_rate": 1.2663068981698733e-05,
"loss": 2.1421,
"step": 1800
},
{
"epoch": 0.5405608318630579,
"eval_loss": 2.1365151405334473,
"eval_runtime": 244.577,
"eval_samples_per_second": 22.933,
"eval_steps_per_second": 5.736,
"step": 1800
},
{
"epoch": 0.5480686211944893,
"grad_norm": 0.6403496861457825,
"learning_rate": 1.283904270295636e-05,
"loss": 2.1428,
"step": 1825
},
{
"epoch": 0.5555764105259207,
"grad_norm": 0.645140528678894,
"learning_rate": 1.3015016424213985e-05,
"loss": 2.1603,
"step": 1850
},
{
"epoch": 0.563084199857352,
"grad_norm": 0.6453937292098999,
"learning_rate": 1.3190990145471609e-05,
"loss": 2.156,
"step": 1875
},
{
"epoch": 0.5705919891887834,
"grad_norm": 0.7146685123443604,
"learning_rate": 1.3366963866729234e-05,
"loss": 2.1016,
"step": 1900
},
{
"epoch": 0.5705919891887834,
"eval_loss": 2.1333518028259277,
"eval_runtime": 245.1598,
"eval_samples_per_second": 22.879,
"eval_steps_per_second": 5.723,
"step": 1900
},
{
"epoch": 0.5780997785202148,
"grad_norm": 0.6153611540794373,
"learning_rate": 1.3542937587986862e-05,
"loss": 2.1577,
"step": 1925
},
{
"epoch": 0.5856075678516461,
"grad_norm": 0.7233150601387024,
"learning_rate": 1.3718911309244487e-05,
"loss": 2.1348,
"step": 1950
},
{
"epoch": 0.5931153571830774,
"grad_norm": 0.7316763401031494,
"learning_rate": 1.3894885030502113e-05,
"loss": 2.1316,
"step": 1975
},
{
"epoch": 0.6006231465145088,
"grad_norm": 0.6433097124099731,
"learning_rate": 1.4070858751759736e-05,
"loss": 2.1445,
"step": 2000
},
{
"epoch": 0.6006231465145088,
"eval_loss": 2.129106044769287,
"eval_runtime": 244.5211,
"eval_samples_per_second": 22.939,
"eval_steps_per_second": 5.738,
"step": 2000
},
{
"epoch": 0.6081309358459401,
"grad_norm": 0.6830511689186096,
"learning_rate": 1.4246832473017363e-05,
"loss": 2.1139,
"step": 2025
},
{
"epoch": 0.6156387251773715,
"grad_norm": 0.6850073337554932,
"learning_rate": 1.4422806194274989e-05,
"loss": 2.1218,
"step": 2050
},
{
"epoch": 0.6231465145088029,
"grad_norm": 0.6426066160202026,
"learning_rate": 1.4598779915532614e-05,
"loss": 2.1275,
"step": 2075
},
{
"epoch": 0.6306543038402342,
"grad_norm": 0.6646946668624878,
"learning_rate": 1.477475363679024e-05,
"loss": 2.126,
"step": 2100
},
{
"epoch": 0.6306543038402342,
"eval_loss": 2.1254663467407227,
"eval_runtime": 244.8863,
"eval_samples_per_second": 22.905,
"eval_steps_per_second": 5.729,
"step": 2100
},
{
"epoch": 0.6381620931716656,
"grad_norm": 0.7284884452819824,
"learning_rate": 1.4950727358047865e-05,
"loss": 2.116,
"step": 2125
},
{
"epoch": 0.645669882503097,
"grad_norm": 0.8441785573959351,
"learning_rate": 1.4999980024014693e-05,
"loss": 2.1195,
"step": 2150
},
{
"epoch": 0.6531776718345284,
"grad_norm": 0.7109578847885132,
"learning_rate": 1.4999886001482528e-05,
"loss": 2.122,
"step": 2175
},
{
"epoch": 0.6606854611659597,
"grad_norm": 0.7228453755378723,
"learning_rate": 1.4999714912309012e-05,
"loss": 2.1058,
"step": 2200
},
{
"epoch": 0.6606854611659597,
"eval_loss": 2.1222198009490967,
"eval_runtime": 245.2072,
"eval_samples_per_second": 22.875,
"eval_steps_per_second": 5.722,
"step": 2200
},
{
"epoch": 0.6681932504973911,
"grad_norm": 0.7744355201721191,
"learning_rate": 1.4999466758252207e-05,
"loss": 2.1252,
"step": 2225
},
{
"epoch": 0.6757010398288225,
"grad_norm": 0.7705317735671997,
"learning_rate": 1.4999141541862068e-05,
"loss": 2.0941,
"step": 2250
},
{
"epoch": 0.6832088291602537,
"grad_norm": 0.7709174156188965,
"learning_rate": 1.4998739266480427e-05,
"loss": 2.1044,
"step": 2275
},
{
"epoch": 0.6907166184916851,
"grad_norm": 0.6840139627456665,
"learning_rate": 1.4998259936240949e-05,
"loss": 2.1146,
"step": 2300
},
{
"epoch": 0.6907166184916851,
"eval_loss": 2.1187844276428223,
"eval_runtime": 244.5599,
"eval_samples_per_second": 22.935,
"eval_steps_per_second": 5.737,
"step": 2300
},
{
"epoch": 0.6982244078231165,
"grad_norm": 0.8008989095687866,
"learning_rate": 1.4997703556069088e-05,
"loss": 2.1483,
"step": 2325
},
{
"epoch": 0.7057321971545478,
"grad_norm": 0.7936817407608032,
"learning_rate": 1.499707013168205e-05,
"loss": 2.1354,
"step": 2350
},
{
"epoch": 0.7132399864859792,
"grad_norm": 0.7062814831733704,
"learning_rate": 1.4996359669588714e-05,
"loss": 2.1378,
"step": 2375
},
{
"epoch": 0.7207477758174106,
"grad_norm": 0.8156118392944336,
"learning_rate": 1.4995572177089582e-05,
"loss": 2.0949,
"step": 2400
},
{
"epoch": 0.7207477758174106,
"eval_loss": 2.1153197288513184,
"eval_runtime": 244.9003,
"eval_samples_per_second": 22.903,
"eval_steps_per_second": 5.729,
"step": 2400
},
{
"epoch": 0.7282555651488419,
"grad_norm": 0.7018394470214844,
"learning_rate": 1.4994707662276703e-05,
"loss": 2.1084,
"step": 2425
},
{
"epoch": 0.7357633544802733,
"grad_norm": 0.7865644097328186,
"learning_rate": 1.4993766134033573e-05,
"loss": 2.1087,
"step": 2450
},
{
"epoch": 0.7432711438117047,
"grad_norm": 0.7718919515609741,
"learning_rate": 1.4992747602035062e-05,
"loss": 2.1248,
"step": 2475
},
{
"epoch": 0.750778933143136,
"grad_norm": 0.8038984537124634,
"learning_rate": 1.499165207674731e-05,
"loss": 2.124,
"step": 2500
},
{
"epoch": 0.750778933143136,
"eval_loss": 2.112464666366577,
"eval_runtime": 244.648,
"eval_samples_per_second": 22.927,
"eval_steps_per_second": 5.735,
"step": 2500
},
{
"epoch": 0.7582867224745674,
"grad_norm": 0.8126859664916992,
"learning_rate": 1.4990479569427615e-05,
"loss": 2.0879,
"step": 2525
},
{
"epoch": 0.7657945118059988,
"grad_norm": 0.7394261360168457,
"learning_rate": 1.4989230092124322e-05,
"loss": 2.1167,
"step": 2550
},
{
"epoch": 0.77330230113743,
"grad_norm": 0.8700124621391296,
"learning_rate": 1.498790365767669e-05,
"loss": 2.0892,
"step": 2575
},
{
"epoch": 0.7808100904688614,
"grad_norm": 0.7596783638000488,
"learning_rate": 1.4986500279714777e-05,
"loss": 2.112,
"step": 2600
},
{
"epoch": 0.7808100904688614,
"eval_loss": 2.1093900203704834,
"eval_runtime": 244.8809,
"eval_samples_per_second": 22.905,
"eval_steps_per_second": 5.729,
"step": 2600
},
{
"epoch": 0.7883178798002928,
"grad_norm": 0.7278156876564026,
"learning_rate": 1.4985019972659285e-05,
"loss": 2.1186,
"step": 2625
},
{
"epoch": 0.7958256691317241,
"grad_norm": 0.8945568203926086,
"learning_rate": 1.4983462751721418e-05,
"loss": 2.0986,
"step": 2650
},
{
"epoch": 0.8033334584631555,
"grad_norm": 0.8277415037155151,
"learning_rate": 1.498182863290272e-05,
"loss": 2.1247,
"step": 2675
},
{
"epoch": 0.8108412477945869,
"grad_norm": 0.7230107188224792,
"learning_rate": 1.4980117632994925e-05,
"loss": 2.1107,
"step": 2700
},
{
"epoch": 0.8108412477945869,
"eval_loss": 2.106996774673462,
"eval_runtime": 244.8592,
"eval_samples_per_second": 22.907,
"eval_steps_per_second": 5.73,
"step": 2700
},
{
"epoch": 0.8183490371260183,
"grad_norm": 0.8236918449401855,
"learning_rate": 1.4978329769579768e-05,
"loss": 2.1138,
"step": 2725
},
{
"epoch": 0.8258568264574496,
"grad_norm": 0.7915171384811401,
"learning_rate": 1.4976465061028811e-05,
"loss": 2.1113,
"step": 2750
},
{
"epoch": 0.833364615788881,
"grad_norm": 0.8001993894577026,
"learning_rate": 1.4974523526503252e-05,
"loss": 2.122,
"step": 2775
},
{
"epoch": 0.8408724051203124,
"grad_norm": 0.915046751499176,
"learning_rate": 1.4972505185953739e-05,
"loss": 2.1145,
"step": 2800
},
{
"epoch": 0.8408724051203124,
"eval_loss": 2.1040894985198975,
"eval_runtime": 244.924,
"eval_samples_per_second": 22.901,
"eval_steps_per_second": 5.728,
"step": 2800
},
{
"epoch": 0.8483801944517437,
"grad_norm": 0.7762336134910583,
"learning_rate": 1.4970410060120146e-05,
"loss": 2.0905,
"step": 2825
},
{
"epoch": 0.8558879837831751,
"grad_norm": 0.8220327496528625,
"learning_rate": 1.496823817053138e-05,
"loss": 2.1149,
"step": 2850
},
{
"epoch": 0.8633957731146064,
"grad_norm": 0.8111168146133423,
"learning_rate": 1.4965989539505144e-05,
"loss": 2.1035,
"step": 2875
},
{
"epoch": 0.8709035624460377,
"grad_norm": 0.7875452637672424,
"learning_rate": 1.4963664190147713e-05,
"loss": 2.1091,
"step": 2900
},
{
"epoch": 0.8709035624460377,
"eval_loss": 2.101161241531372,
"eval_runtime": 247.6368,
"eval_samples_per_second": 22.65,
"eval_steps_per_second": 5.666,
"step": 2900
},
{
"epoch": 0.8784113517774691,
"grad_norm": 0.8538459539413452,
"learning_rate": 1.4961262146353696e-05,
"loss": 2.0994,
"step": 2925
},
{
"epoch": 0.8859191411089005,
"grad_norm": 0.7686406373977661,
"learning_rate": 1.4958783432805801e-05,
"loss": 2.0858,
"step": 2950
},
{
"epoch": 0.8934269304403318,
"grad_norm": 0.792827844619751,
"learning_rate": 1.4956228074974561e-05,
"loss": 2.1001,
"step": 2975
},
{
"epoch": 0.9009347197717632,
"grad_norm": 0.9214953780174255,
"learning_rate": 1.4953596099118089e-05,
"loss": 2.0844,
"step": 3000
},
{
"epoch": 0.9009347197717632,
"eval_loss": 2.100034713745117,
"eval_runtime": 247.8843,
"eval_samples_per_second": 22.627,
"eval_steps_per_second": 5.66,
"step": 3000
},
{
"epoch": 0.9084425091031946,
"grad_norm": 0.8309657573699951,
"learning_rate": 1.49508875322818e-05,
"loss": 2.0882,
"step": 3025
},
{
"epoch": 0.9159502984346259,
"grad_norm": 0.8833063244819641,
"learning_rate": 1.4948102402298141e-05,
"loss": 2.1063,
"step": 3050
},
{
"epoch": 0.9234580877660573,
"grad_norm": 0.7956681847572327,
"learning_rate": 1.4945240737786292e-05,
"loss": 2.0885,
"step": 3075
},
{
"epoch": 0.9309658770974887,
"grad_norm": 0.8342053890228271,
"learning_rate": 1.4942302568151882e-05,
"loss": 2.1001,
"step": 3100
},
{
"epoch": 0.9309658770974887,
"eval_loss": 2.0970711708068848,
"eval_runtime": 245.0795,
"eval_samples_per_second": 22.886,
"eval_steps_per_second": 5.725,
"step": 3100
},
{
"epoch": 0.93847366642892,
"grad_norm": 0.9061738848686218,
"learning_rate": 1.493928792358669e-05,
"loss": 2.1135,
"step": 3125
},
{
"epoch": 0.9459814557603514,
"grad_norm": 0.9443092346191406,
"learning_rate": 1.4936196835068322e-05,
"loss": 2.0909,
"step": 3150
},
{
"epoch": 0.9534892450917827,
"grad_norm": 0.7598241567611694,
"learning_rate": 1.4933029334359898e-05,
"loss": 2.1215,
"step": 3175
},
{
"epoch": 0.960997034423214,
"grad_norm": 1.001592993736267,
"learning_rate": 1.4929785454009737e-05,
"loss": 2.0884,
"step": 3200
},
{
"epoch": 0.960997034423214,
"eval_loss": 2.09686541557312,
"eval_runtime": 244.53,
"eval_samples_per_second": 22.938,
"eval_steps_per_second": 5.738,
"step": 3200
},
{
"epoch": 0.9685048237546454,
"grad_norm": 0.9168058633804321,
"learning_rate": 1.4926465227351008e-05,
"loss": 2.0785,
"step": 3225
},
{
"epoch": 0.9760126130860768,
"grad_norm": 0.8249208331108093,
"learning_rate": 1.4923068688501385e-05,
"loss": 2.0841,
"step": 3250
},
{
"epoch": 0.9835204024175082,
"grad_norm": 0.8430188298225403,
"learning_rate": 1.4919595872362719e-05,
"loss": 2.0969,
"step": 3275
},
{
"epoch": 0.9910281917489395,
"grad_norm": 0.9370065927505493,
"learning_rate": 1.491604681462065e-05,
"loss": 2.1052,
"step": 3300
},
{
"epoch": 0.9910281917489395,
"eval_loss": 2.0929176807403564,
"eval_runtime": 244.8371,
"eval_samples_per_second": 22.909,
"eval_steps_per_second": 5.73,
"step": 3300
},
{
"epoch": 0.9985359810803709,
"grad_norm": 0.7515010237693787,
"learning_rate": 1.4912421551744264e-05,
"loss": 2.0882,
"step": 3325
},
{
"epoch": 1.0063065430384024,
"grad_norm": 0.8594741821289062,
"learning_rate": 1.4908720120985703e-05,
"loss": 2.2045,
"step": 3350
},
{
"epoch": 1.0138143323698336,
"grad_norm": 0.852730929851532,
"learning_rate": 1.4904942560379791e-05,
"loss": 2.0833,
"step": 3375
},
{
"epoch": 1.0213221217012651,
"grad_norm": 0.8965045809745789,
"learning_rate": 1.4901088908743635e-05,
"loss": 2.1122,
"step": 3400
},
{
"epoch": 1.0213221217012651,
"eval_loss": 2.0909690856933594,
"eval_runtime": 245.1692,
"eval_samples_per_second": 22.878,
"eval_steps_per_second": 5.723,
"step": 3400
},
{
"epoch": 1.0288299110326964,
"grad_norm": 0.8129332065582275,
"learning_rate": 1.4897159205676244e-05,
"loss": 2.062,
"step": 3425
},
{
"epoch": 1.0363377003641279,
"grad_norm": 0.7968320846557617,
"learning_rate": 1.4893153491558093e-05,
"loss": 2.1195,
"step": 3450
},
{
"epoch": 1.0438454896955591,
"grad_norm": 0.8569227457046509,
"learning_rate": 1.4889071807550734e-05,
"loss": 2.0819,
"step": 3475
},
{
"epoch": 1.0513532790269906,
"grad_norm": 0.790208101272583,
"learning_rate": 1.4884914195596364e-05,
"loss": 2.0831,
"step": 3500
},
{
"epoch": 1.0513532790269906,
"eval_loss": 2.0892488956451416,
"eval_runtime": 244.1949,
"eval_samples_per_second": 22.969,
"eval_steps_per_second": 5.745,
"step": 3500
},
{
"epoch": 1.0588610683584219,
"grad_norm": 0.7736139893531799,
"learning_rate": 1.488068069841739e-05,
"loss": 2.0969,
"step": 3525
},
{
"epoch": 1.0663688576898531,
"grad_norm": 0.9392566084861755,
"learning_rate": 1.4876371359515992e-05,
"loss": 2.0835,
"step": 3550
},
{
"epoch": 1.0738766470212846,
"grad_norm": 0.9095376133918762,
"learning_rate": 1.4871986223173682e-05,
"loss": 2.0882,
"step": 3575
},
{
"epoch": 1.0813844363527159,
"grad_norm": 0.999569833278656,
"learning_rate": 1.4867525334450842e-05,
"loss": 2.0789,
"step": 3600
},
{
"epoch": 1.0813844363527159,
"eval_loss": 2.0872867107391357,
"eval_runtime": 245.2287,
"eval_samples_per_second": 22.873,
"eval_steps_per_second": 5.721,
"step": 3600
},
{
"epoch": 1.0888922256841473,
"grad_norm": 0.8475573658943176,
"learning_rate": 1.4862988739186265e-05,
"loss": 2.0472,
"step": 3625
},
{
"epoch": 1.0964000150155786,
"grad_norm": 0.8783066868782043,
"learning_rate": 1.4858376483996675e-05,
"loss": 2.1,
"step": 3650
},
{
"epoch": 1.10390780434701,
"grad_norm": 0.8863905072212219,
"learning_rate": 1.4853688616276268e-05,
"loss": 2.112,
"step": 3675
},
{
"epoch": 1.1114155936784413,
"grad_norm": 1.0993289947509766,
"learning_rate": 1.4848925184196203e-05,
"loss": 2.0788,
"step": 3700
},
{
"epoch": 1.1114155936784413,
"eval_loss": 2.0860979557037354,
"eval_runtime": 245.145,
"eval_samples_per_second": 22.88,
"eval_steps_per_second": 5.723,
"step": 3700
},
{
"epoch": 1.1189233830098728,
"grad_norm": 0.7591436505317688,
"learning_rate": 1.4844086236704119e-05,
"loss": 2.0705,
"step": 3725
},
{
"epoch": 1.126431172341304,
"grad_norm": 0.9064419269561768,
"learning_rate": 1.4839171823523628e-05,
"loss": 2.0421,
"step": 3750
},
{
"epoch": 1.1339389616727356,
"grad_norm": 0.8282918930053711,
"learning_rate": 1.483418199515381e-05,
"loss": 2.0621,
"step": 3775
},
{
"epoch": 1.1414467510041668,
"grad_norm": 0.9208828806877136,
"learning_rate": 1.4829116802868684e-05,
"loss": 2.08,
"step": 3800
},
{
"epoch": 1.1414467510041668,
"eval_loss": 2.0833563804626465,
"eval_runtime": 245.2937,
"eval_samples_per_second": 22.866,
"eval_steps_per_second": 5.72,
"step": 3800
},
{
"epoch": 1.1489545403355983,
"grad_norm": 0.8673622608184814,
"learning_rate": 1.4823976298716686e-05,
"loss": 2.0879,
"step": 3825
},
{
"epoch": 1.1564623296670296,
"grad_norm": 0.9238690137863159,
"learning_rate": 1.4818760535520142e-05,
"loss": 2.083,
"step": 3850
},
{
"epoch": 1.1639701189984608,
"grad_norm": 0.855536937713623,
"learning_rate": 1.4813469566874711e-05,
"loss": 2.0705,
"step": 3875
},
{
"epoch": 1.1714779083298923,
"grad_norm": 0.8495576977729797,
"learning_rate": 1.4808103447148845e-05,
"loss": 2.092,
"step": 3900
},
{
"epoch": 1.1714779083298923,
"eval_loss": 2.081465721130371,
"eval_runtime": 244.6166,
"eval_samples_per_second": 22.93,
"eval_steps_per_second": 5.736,
"step": 3900
},
{
"epoch": 1.1789856976613236,
"grad_norm": 0.9213201403617859,
"learning_rate": 1.4802662231483224e-05,
"loss": 2.0695,
"step": 3925
},
{
"epoch": 1.186493486992755,
"grad_norm": 0.9453656673431396,
"learning_rate": 1.4797145975790194e-05,
"loss": 2.0856,
"step": 3950
},
{
"epoch": 1.1940012763241863,
"grad_norm": 0.894378662109375,
"learning_rate": 1.4791554736753193e-05,
"loss": 2.0705,
"step": 3975
},
{
"epoch": 1.2015090656556178,
"grad_norm": 0.9393320083618164,
"learning_rate": 1.4785888571826158e-05,
"loss": 2.0693,
"step": 4000
},
{
"epoch": 1.2015090656556178,
"eval_loss": 2.079852819442749,
"eval_runtime": 245.0032,
"eval_samples_per_second": 22.894,
"eval_steps_per_second": 5.726,
"step": 4000
},
{
"epoch": 1.209016854987049,
"grad_norm": 0.8150069117546082,
"learning_rate": 1.478014753923295e-05,
"loss": 2.0721,
"step": 4025
},
{
"epoch": 1.2165246443184805,
"grad_norm": 0.867784321308136,
"learning_rate": 1.4774331697966743e-05,
"loss": 2.1046,
"step": 4050
},
{
"epoch": 1.2240324336499118,
"grad_norm": 0.8931713700294495,
"learning_rate": 1.476844110778943e-05,
"loss": 2.0718,
"step": 4075
},
{
"epoch": 1.231540222981343,
"grad_norm": 0.9451190829277039,
"learning_rate": 1.4762475829230994e-05,
"loss": 2.0826,
"step": 4100
},
{
"epoch": 1.231540222981343,
"eval_loss": 2.078012466430664,
"eval_runtime": 244.9722,
"eval_samples_per_second": 22.896,
"eval_steps_per_second": 5.727,
"step": 4100
},
{
"epoch": 1.2390480123127745,
"grad_norm": 0.9044253826141357,
"learning_rate": 1.4756435923588899e-05,
"loss": 2.0853,
"step": 4125
},
{
"epoch": 1.246555801644206,
"grad_norm": 0.9442611336708069,
"learning_rate": 1.4750321452927454e-05,
"loss": 2.039,
"step": 4150
},
{
"epoch": 1.2540635909756372,
"grad_norm": 0.8297872543334961,
"learning_rate": 1.4744132480077177e-05,
"loss": 2.0371,
"step": 4175
},
{
"epoch": 1.2615713803070685,
"grad_norm": 0.783397912979126,
"learning_rate": 1.4737869068634148e-05,
"loss": 2.0508,
"step": 4200
},
{
"epoch": 1.2615713803070685,
"eval_loss": 2.076925754547119,
"eval_runtime": 244.7969,
"eval_samples_per_second": 22.913,
"eval_steps_per_second": 5.731,
"step": 4200
},
{
"epoch": 1.2690791696385,
"grad_norm": 0.9161412119865417,
"learning_rate": 1.4731531282959356e-05,
"loss": 2.0785,
"step": 4225
},
{
"epoch": 1.2765869589699312,
"grad_norm": 0.8472649455070496,
"learning_rate": 1.4725119188178038e-05,
"loss": 2.057,
"step": 4250
},
{
"epoch": 1.2840947483013627,
"grad_norm": 0.777370035648346,
"learning_rate": 1.4718632850179013e-05,
"loss": 2.0842,
"step": 4275
},
{
"epoch": 1.291602537632794,
"grad_norm": 0.9465096592903137,
"learning_rate": 1.471207233561399e-05,
"loss": 2.0788,
"step": 4300
},
{
"epoch": 1.291602537632794,
"eval_loss": 2.0751006603240967,
"eval_runtime": 244.7621,
"eval_samples_per_second": 22.916,
"eval_steps_per_second": 5.732,
"step": 4300
},
{
"epoch": 1.2991103269642255,
"grad_norm": 0.9006996750831604,
"learning_rate": 1.4705437711896914e-05,
"loss": 2.0689,
"step": 4325
},
{
"epoch": 1.3066181162956567,
"grad_norm": 0.8863036632537842,
"learning_rate": 1.469872904720324e-05,
"loss": 2.0536,
"step": 4350
},
{
"epoch": 1.3141259056270882,
"grad_norm": 0.8076067566871643,
"learning_rate": 1.4691946410469244e-05,
"loss": 2.0704,
"step": 4375
},
{
"epoch": 1.3216336949585195,
"grad_norm": 0.8585737943649292,
"learning_rate": 1.4685089871391332e-05,
"loss": 2.0566,
"step": 4400
},
{
"epoch": 1.3216336949585195,
"eval_loss": 2.0732879638671875,
"eval_runtime": 245.4201,
"eval_samples_per_second": 22.855,
"eval_steps_per_second": 5.717,
"step": 4400
},
{
"epoch": 1.3291414842899507,
"grad_norm": 0.8773880004882812,
"learning_rate": 1.4678159500425296e-05,
"loss": 2.0661,
"step": 4425
},
{
"epoch": 1.3366492736213822,
"grad_norm": 0.9763519763946533,
"learning_rate": 1.4671155368785604e-05,
"loss": 2.0684,
"step": 4450
},
{
"epoch": 1.3441570629528137,
"grad_norm": 0.8556541204452515,
"learning_rate": 1.4664077548444675e-05,
"loss": 2.0788,
"step": 4475
},
{
"epoch": 1.351664852284245,
"grad_norm": 0.8426047563552856,
"learning_rate": 1.4656926112132124e-05,
"loss": 2.0645,
"step": 4500
},
{
"epoch": 1.351664852284245,
"eval_loss": 2.0714945793151855,
"eval_runtime": 271.6463,
"eval_samples_per_second": 20.648,
"eval_steps_per_second": 5.165,
"step": 4500
},
{
"epoch": 1.3591726416156762,
"grad_norm": 0.8249872326850891,
"learning_rate": 1.4649701133334025e-05,
"loss": 2.0679,
"step": 4525
},
{
"epoch": 1.3666804309471077,
"grad_norm": 0.8870148658752441,
"learning_rate": 1.4642402686292155e-05,
"loss": 2.0873,
"step": 4550
},
{
"epoch": 1.374188220278539,
"grad_norm": 0.8625667095184326,
"learning_rate": 1.4635030846003225e-05,
"loss": 2.0655,
"step": 4575
},
{
"epoch": 1.3816960096099704,
"grad_norm": 1.0245722532272339,
"learning_rate": 1.4627585688218116e-05,
"loss": 2.0939,
"step": 4600
},
{
"epoch": 1.3816960096099704,
"eval_loss": 2.0702602863311768,
"eval_runtime": 244.5585,
"eval_samples_per_second": 22.935,
"eval_steps_per_second": 5.737,
"step": 4600
},
{
"epoch": 1.3892037989414017,
"grad_norm": 0.9307467937469482,
"learning_rate": 1.4620067289441101e-05,
"loss": 2.0582,
"step": 4625
},
{
"epoch": 1.396711588272833,
"grad_norm": 0.8650360703468323,
"learning_rate": 1.461247572692905e-05,
"loss": 2.0486,
"step": 4650
},
{
"epoch": 1.4042193776042644,
"grad_norm": 0.8464282155036926,
"learning_rate": 1.4604811078690648e-05,
"loss": 2.0513,
"step": 4675
},
{
"epoch": 1.4117271669356959,
"grad_norm": 0.9079179167747498,
"learning_rate": 1.4597073423485583e-05,
"loss": 2.0642,
"step": 4700
},
{
"epoch": 1.4117271669356959,
"eval_loss": 2.068575143814087,
"eval_runtime": 244.9525,
"eval_samples_per_second": 22.898,
"eval_steps_per_second": 5.728,
"step": 4700
},
{
"epoch": 1.4192349562671271,
"grad_norm": 0.8237431049346924,
"learning_rate": 1.4589262840823746e-05,
"loss": 2.0619,
"step": 4725
},
{
"epoch": 1.4267427455985584,
"grad_norm": 0.8957166075706482,
"learning_rate": 1.4581379410964402e-05,
"loss": 2.0896,
"step": 4750
},
{
"epoch": 1.4342505349299899,
"grad_norm": 0.7650532722473145,
"learning_rate": 1.4573423214915382e-05,
"loss": 2.0554,
"step": 4775
},
{
"epoch": 1.4417583242614211,
"grad_norm": 0.9083628058433533,
"learning_rate": 1.4565394334432233e-05,
"loss": 2.0811,
"step": 4800
},
{
"epoch": 1.4417583242614211,
"eval_loss": 2.066969394683838,
"eval_runtime": 244.7686,
"eval_samples_per_second": 22.916,
"eval_steps_per_second": 5.732,
"step": 4800
},
{
"epoch": 1.4492661135928526,
"grad_norm": 0.963108479976654,
"learning_rate": 1.4557292852017392e-05,
"loss": 2.0727,
"step": 4825
},
{
"epoch": 1.4567739029242839,
"grad_norm": 0.8735617399215698,
"learning_rate": 1.454911885091933e-05,
"loss": 2.0681,
"step": 4850
},
{
"epoch": 1.4642816922557154,
"grad_norm": 1.0220097303390503,
"learning_rate": 1.4540872415131695e-05,
"loss": 2.0602,
"step": 4875
},
{
"epoch": 1.4717894815871466,
"grad_norm": 0.9304827451705933,
"learning_rate": 1.4532553629392455e-05,
"loss": 2.0539,
"step": 4900
},
{
"epoch": 1.4717894815871466,
"eval_loss": 2.0658257007598877,
"eval_runtime": 244.4897,
"eval_samples_per_second": 22.942,
"eval_steps_per_second": 5.738,
"step": 4900
},
{
"epoch": 1.479297270918578,
"grad_norm": 0.9377899765968323,
"learning_rate": 1.4524162579183032e-05,
"loss": 2.0552,
"step": 4925
},
{
"epoch": 1.4868050602500094,
"grad_norm": 0.9211867451667786,
"learning_rate": 1.451569935072741e-05,
"loss": 2.0622,
"step": 4950
},
{
"epoch": 1.4943128495814406,
"grad_norm": 1.0366291999816895,
"learning_rate": 1.4507164030991254e-05,
"loss": 2.0673,
"step": 4975
},
{
"epoch": 1.501820638912872,
"grad_norm": 0.9624854326248169,
"learning_rate": 1.449855670768102e-05,
"loss": 2.0748,
"step": 5000
},
{
"epoch": 1.501820638912872,
"eval_loss": 2.0644030570983887,
"eval_runtime": 245.047,
"eval_samples_per_second": 22.889,
"eval_steps_per_second": 5.725,
"step": 5000
},
{
"epoch": 1.5093284282443036,
"grad_norm": 0.8962668180465698,
"learning_rate": 1.4489877469243053e-05,
"loss": 2.0701,
"step": 5025
},
{
"epoch": 1.5168362175757348,
"grad_norm": 0.8921008110046387,
"learning_rate": 1.4481126404862677e-05,
"loss": 2.0669,
"step": 5050
},
{
"epoch": 1.524344006907166,
"grad_norm": 0.9402926564216614,
"learning_rate": 1.4472303604463279e-05,
"loss": 2.0576,
"step": 5075
},
{
"epoch": 1.5318517962385976,
"grad_norm": 0.8990075588226318,
"learning_rate": 1.4463409158705376e-05,
"loss": 2.0517,
"step": 5100
},
{
"epoch": 1.5318517962385976,
"eval_loss": 2.0629703998565674,
"eval_runtime": 244.3655,
"eval_samples_per_second": 22.953,
"eval_steps_per_second": 5.741,
"step": 5100
},
{
"epoch": 1.539359585570029,
"grad_norm": 1.0020679235458374,
"learning_rate": 1.4454443158985708e-05,
"loss": 2.0582,
"step": 5125
},
{
"epoch": 1.5468673749014603,
"grad_norm": 0.9144858121871948,
"learning_rate": 1.4445405697436267e-05,
"loss": 2.0518,
"step": 5150
},
{
"epoch": 1.5543751642328916,
"grad_norm": 0.9205281138420105,
"learning_rate": 1.4436296866923373e-05,
"loss": 2.0553,
"step": 5175
},
{
"epoch": 1.5618829535643228,
"grad_norm": 1.0122096538543701,
"learning_rate": 1.4427116761046714e-05,
"loss": 2.0333,
"step": 5200
},
{
"epoch": 1.5618829535643228,
"eval_loss": 2.061532735824585,
"eval_runtime": 244.549,
"eval_samples_per_second": 22.936,
"eval_steps_per_second": 5.737,
"step": 5200
},
{
"epoch": 1.5693907428957543,
"grad_norm": 0.9542369842529297,
"learning_rate": 1.441786547413838e-05,
"loss": 2.0722,
"step": 5225
},
{
"epoch": 1.5768985322271858,
"grad_norm": 0.9306456446647644,
"learning_rate": 1.4408543101261898e-05,
"loss": 2.0731,
"step": 5250
},
{
"epoch": 1.584406321558617,
"grad_norm": 0.8262733221054077,
"learning_rate": 1.4399149738211251e-05,
"loss": 2.0629,
"step": 5275
},
{
"epoch": 1.5919141108900483,
"grad_norm": 0.9227537512779236,
"learning_rate": 1.43896854815099e-05,
"loss": 2.0832,
"step": 5300
},
{
"epoch": 1.5919141108900483,
"eval_loss": 2.0603787899017334,
"eval_runtime": 244.6958,
"eval_samples_per_second": 22.922,
"eval_steps_per_second": 5.734,
"step": 5300
},
{
"epoch": 1.5994219002214798,
"grad_norm": 0.9182181358337402,
"learning_rate": 1.4380150428409788e-05,
"loss": 2.0516,
"step": 5325
},
{
"epoch": 1.6069296895529113,
"grad_norm": 0.8036996126174927,
"learning_rate": 1.4370544676890333e-05,
"loss": 2.0531,
"step": 5350
},
{
"epoch": 1.6144374788843425,
"grad_norm": 0.9126760363578796,
"learning_rate": 1.4360868325657447e-05,
"loss": 2.0665,
"step": 5375
},
{
"epoch": 1.6219452682157738,
"grad_norm": 1.0143436193466187,
"learning_rate": 1.4351121474142484e-05,
"loss": 2.029,
"step": 5400
},
{
"epoch": 1.6219452682157738,
"eval_loss": 2.0587964057922363,
"eval_runtime": 244.7582,
"eval_samples_per_second": 22.916,
"eval_steps_per_second": 5.732,
"step": 5400
},
{
"epoch": 1.6294530575472053,
"grad_norm": 0.9128186702728271,
"learning_rate": 1.4341304222501254e-05,
"loss": 2.0253,
"step": 5425
},
{
"epoch": 1.6369608468786367,
"grad_norm": 0.915397584438324,
"learning_rate": 1.4331416671612966e-05,
"loss": 2.0771,
"step": 5450
},
{
"epoch": 1.644468636210068,
"grad_norm": 0.8913278579711914,
"learning_rate": 1.4321458923079216e-05,
"loss": 2.0781,
"step": 5475
},
{
"epoch": 1.6519764255414993,
"grad_norm": 1.062047004699707,
"learning_rate": 1.431143107922292e-05,
"loss": 2.0567,
"step": 5500
},
{
"epoch": 1.6519764255414993,
"eval_loss": 2.057093858718872,
"eval_runtime": 245.0447,
"eval_samples_per_second": 22.89,
"eval_steps_per_second": 5.725,
"step": 5500
},
{
"epoch": 1.6594842148729305,
"grad_norm": 0.8677504658699036,
"learning_rate": 1.4301333243087277e-05,
"loss": 2.0696,
"step": 5525
},
{
"epoch": 1.666992004204362,
"grad_norm": 0.9853184223175049,
"learning_rate": 1.4291165518434707e-05,
"loss": 2.0113,
"step": 5550
},
{
"epoch": 1.6744997935357935,
"grad_norm": 0.8988690972328186,
"learning_rate": 1.4280928009745786e-05,
"loss": 2.0278,
"step": 5575
},
{
"epoch": 1.6820075828672247,
"grad_norm": 0.877238929271698,
"learning_rate": 1.4270620822218162e-05,
"loss": 2.0231,
"step": 5600
},
{
"epoch": 1.6820075828672247,
"eval_loss": 2.0566163063049316,
"eval_runtime": 244.8536,
"eval_samples_per_second": 22.908,
"eval_steps_per_second": 5.73,
"step": 5600
},
{
"epoch": 1.689515372198656,
"grad_norm": 0.8475340008735657,
"learning_rate": 1.4260244061765492e-05,
"loss": 2.0667,
"step": 5625
},
{
"epoch": 1.6970231615300875,
"grad_norm": 1.0350947380065918,
"learning_rate": 1.4249797835016339e-05,
"loss": 2.0482,
"step": 5650
},
{
"epoch": 1.704530950861519,
"grad_norm": 0.9984613656997681,
"learning_rate": 1.4239282249313083e-05,
"loss": 2.0553,
"step": 5675
},
{
"epoch": 1.7120387401929502,
"grad_norm": 0.8884134888648987,
"learning_rate": 1.4228697412710817e-05,
"loss": 2.063,
"step": 5700
},
{
"epoch": 1.7120387401929502,
"eval_loss": 2.0545597076416016,
"eval_runtime": 244.9412,
"eval_samples_per_second": 22.899,
"eval_steps_per_second": 5.728,
"step": 5700
},
{
"epoch": 1.7195465295243815,
"grad_norm": 0.8889881372451782,
"learning_rate": 1.4218043433976232e-05,
"loss": 2.0594,
"step": 5725
},
{
"epoch": 1.727054318855813,
"grad_norm": 0.9351671934127808,
"learning_rate": 1.4207320422586511e-05,
"loss": 2.0317,
"step": 5750
},
{
"epoch": 1.7345621081872442,
"grad_norm": 0.9845299124717712,
"learning_rate": 1.4196528488728189e-05,
"loss": 2.0613,
"step": 5775
},
{
"epoch": 1.7420698975186757,
"grad_norm": 1.0036661624908447,
"learning_rate": 1.418566774329603e-05,
"loss": 2.0203,
"step": 5800
},
{
"epoch": 1.7420698975186757,
"eval_loss": 2.052852153778076,
"eval_runtime": 244.7583,
"eval_samples_per_second": 22.916,
"eval_steps_per_second": 5.732,
"step": 5800
},
{
"epoch": 1.749577686850107,
"grad_norm": 1.1337708234786987,
"learning_rate": 1.4174738297891891e-05,
"loss": 2.035,
"step": 5825
},
{
"epoch": 1.7570854761815382,
"grad_norm": 0.9224268198013306,
"learning_rate": 1.416374026482356e-05,
"loss": 2.068,
"step": 5850
},
{
"epoch": 1.7645932655129697,
"grad_norm": 0.8932907581329346,
"learning_rate": 1.4152673757103622e-05,
"loss": 2.0668,
"step": 5875
},
{
"epoch": 1.7721010548444012,
"grad_norm": 0.9014378786087036,
"learning_rate": 1.414153888844828e-05,
"loss": 2.0585,
"step": 5900
},
{
"epoch": 1.7721010548444012,
"eval_loss": 2.0522830486297607,
"eval_runtime": 244.4651,
"eval_samples_per_second": 22.944,
"eval_steps_per_second": 5.739,
"step": 5900
},
{
"epoch": 1.7796088441758324,
"grad_norm": 0.9573795795440674,
"learning_rate": 1.41303357732762e-05,
"loss": 2.0726,
"step": 5925
},
{
"epoch": 1.7871166335072637,
"grad_norm": 1.0068199634552002,
"learning_rate": 1.4119064526707325e-05,
"loss": 2.0117,
"step": 5950
},
{
"epoch": 1.7946244228386952,
"grad_norm": 0.8137004971504211,
"learning_rate": 1.4107725264561694e-05,
"loss": 2.0531,
"step": 5975
},
{
"epoch": 1.8021322121701266,
"grad_norm": 0.9432706832885742,
"learning_rate": 1.4096318103358264e-05,
"loss": 2.0528,
"step": 6000
},
{
"epoch": 1.8021322121701266,
"eval_loss": 2.0512585639953613,
"eval_runtime": 244.6438,
"eval_samples_per_second": 22.927,
"eval_steps_per_second": 5.735,
"step": 6000
},
{
"epoch": 1.809640001501558,
"grad_norm": 0.8738940954208374,
"learning_rate": 1.4084843160313693e-05,
"loss": 2.0486,
"step": 6025
},
{
"epoch": 1.8171477908329892,
"grad_norm": 0.9203903079032898,
"learning_rate": 1.407330055334115e-05,
"loss": 2.0431,
"step": 6050
},
{
"epoch": 1.8246555801644204,
"grad_norm": 0.8773927688598633,
"learning_rate": 1.4061690401049101e-05,
"loss": 2.0336,
"step": 6075
},
{
"epoch": 1.832163369495852,
"grad_norm": 1.0781759023666382,
"learning_rate": 1.4050012822740082e-05,
"loss": 2.0839,
"step": 6100
},
{
"epoch": 1.832163369495852,
"eval_loss": 2.0504093170166016,
"eval_runtime": 244.864,
"eval_samples_per_second": 22.907,
"eval_steps_per_second": 5.73,
"step": 6100
},
{
"epoch": 1.8396711588272834,
"grad_norm": 0.8537021279335022,
"learning_rate": 1.4038267938409481e-05,
"loss": 2.0394,
"step": 6125
},
{
"epoch": 1.8471789481587146,
"grad_norm": 0.9055094122886658,
"learning_rate": 1.4026455868744306e-05,
"loss": 2.0267,
"step": 6150
},
{
"epoch": 1.854686737490146,
"grad_norm": 0.8958349227905273,
"learning_rate": 1.401457673512194e-05,
"loss": 2.0427,
"step": 6175
},
{
"epoch": 1.8621945268215774,
"grad_norm": 0.8849508166313171,
"learning_rate": 1.4002630659608895e-05,
"loss": 2.0492,
"step": 6200
},
{
"epoch": 1.8621945268215774,
"eval_loss": 2.0487124919891357,
"eval_runtime": 244.4909,
"eval_samples_per_second": 22.942,
"eval_steps_per_second": 5.738,
"step": 6200
},
{
"epoch": 1.8697023161530089,
"grad_norm": 0.9771384000778198,
"learning_rate": 1.3990617764959564e-05,
"loss": 2.0473,
"step": 6225
},
{
"epoch": 1.8772101054844401,
"grad_norm": 0.9234246611595154,
"learning_rate": 1.3978538174614942e-05,
"loss": 2.0408,
"step": 6250
},
{
"epoch": 1.8847178948158714,
"grad_norm": 1.0580551624298096,
"learning_rate": 1.3966392012701381e-05,
"loss": 2.0299,
"step": 6275
},
{
"epoch": 1.8922256841473029,
"grad_norm": 0.8676178455352783,
"learning_rate": 1.3954179404029295e-05,
"loss": 2.0513,
"step": 6300
},
{
"epoch": 1.8922256841473029,
"eval_loss": 2.0470457077026367,
"eval_runtime": 244.6825,
"eval_samples_per_second": 22.924,
"eval_steps_per_second": 5.734,
"step": 6300
},
{
"epoch": 1.8997334734787343,
"grad_norm": 1.0486456155776978,
"learning_rate": 1.3941900474091892e-05,
"loss": 2.0646,
"step": 6325
},
{
"epoch": 1.9072412628101656,
"grad_norm": 0.963049054145813,
"learning_rate": 1.3929555349063875e-05,
"loss": 2.0421,
"step": 6350
},
{
"epoch": 1.9147490521415969,
"grad_norm": 0.9626838564872742,
"learning_rate": 1.391714415580015e-05,
"loss": 2.0369,
"step": 6375
},
{
"epoch": 1.922256841473028,
"grad_norm": 0.9801763296127319,
"learning_rate": 1.3904667021834514e-05,
"loss": 2.0114,
"step": 6400
},
{
"epoch": 1.922256841473028,
"eval_loss": 2.046201467514038,
"eval_runtime": 244.6721,
"eval_samples_per_second": 22.925,
"eval_steps_per_second": 5.734,
"step": 6400
},
{
"epoch": 1.9297646308044596,
"grad_norm": 1.0865575075149536,
"learning_rate": 1.3892124075378364e-05,
"loss": 2.0132,
"step": 6425
},
{
"epoch": 1.937272420135891,
"grad_norm": 0.899895429611206,
"learning_rate": 1.3879515445319353e-05,
"loss": 2.0412,
"step": 6450
},
{
"epoch": 1.9447802094673223,
"grad_norm": 0.9657663106918335,
"learning_rate": 1.3866841261220093e-05,
"loss": 2.0367,
"step": 6475
},
{
"epoch": 1.9522879987987536,
"grad_norm": 0.8613144159317017,
"learning_rate": 1.3854101653316798e-05,
"loss": 2.0456,
"step": 6500
},
{
"epoch": 1.9522879987987536,
"eval_loss": 2.0444774627685547,
"eval_runtime": 244.5805,
"eval_samples_per_second": 22.933,
"eval_steps_per_second": 5.736,
"step": 6500
},
{
"epoch": 1.959795788130185,
"grad_norm": 0.8493949174880981,
"learning_rate": 1.3841296752517967e-05,
"loss": 2.0617,
"step": 6525
},
{
"epoch": 1.9673035774616165,
"grad_norm": 0.9268197417259216,
"learning_rate": 1.3828426690403026e-05,
"loss": 2.0502,
"step": 6550
},
{
"epoch": 1.9748113667930478,
"grad_norm": 0.9686461091041565,
"learning_rate": 1.3815491599220977e-05,
"loss": 2.057,
"step": 6575
},
{
"epoch": 1.982319156124479,
"grad_norm": 0.9616640210151672,
"learning_rate": 1.3802491611889048e-05,
"loss": 2.0442,
"step": 6600
},
{
"epoch": 1.982319156124479,
"eval_loss": 2.043835401535034,
"eval_runtime": 244.9743,
"eval_samples_per_second": 22.896,
"eval_steps_per_second": 5.727,
"step": 6600
},
{
"epoch": 1.9898269454559105,
"grad_norm": 0.8984593152999878,
"learning_rate": 1.3789426861991317e-05,
"loss": 2.0366,
"step": 6625
},
{
"epoch": 1.997334734787342,
"grad_norm": 0.8971940875053406,
"learning_rate": 1.3776297483777344e-05,
"loss": 2.0255,
"step": 6650
},
{
"epoch": 2.0051052967453735,
"grad_norm": 0.9031795859336853,
"learning_rate": 1.3763103612160788e-05,
"loss": 2.0926,
"step": 6675
},
{
"epoch": 2.012613086076805,
"grad_norm": 0.8842533230781555,
"learning_rate": 1.374984538271803e-05,
"loss": 2.0172,
"step": 6700
},
{
"epoch": 2.012613086076805,
"eval_loss": 2.0426952838897705,
"eval_runtime": 244.4788,
"eval_samples_per_second": 22.943,
"eval_steps_per_second": 5.739,
"step": 6700
},
{
"epoch": 2.020120875408236,
"grad_norm": 1.008647084236145,
"learning_rate": 1.3736522931686765e-05,
"loss": 2.0135,
"step": 6725
},
{
"epoch": 2.0276286647396673,
"grad_norm": 1.0014972686767578,
"learning_rate": 1.372313639596462e-05,
"loss": 2.0175,
"step": 6750
},
{
"epoch": 2.0351364540710986,
"grad_norm": 0.9429395198822021,
"learning_rate": 1.3709685913107728e-05,
"loss": 2.0228,
"step": 6775
},
{
"epoch": 2.0426442434025303,
"grad_norm": 1.057131052017212,
"learning_rate": 1.369617162132933e-05,
"loss": 2.0281,
"step": 6800
},
{
"epoch": 2.0426442434025303,
"eval_loss": 2.0424487590789795,
"eval_runtime": 244.6503,
"eval_samples_per_second": 22.927,
"eval_steps_per_second": 5.735,
"step": 6800
},
{
"epoch": 2.0501520327339615,
"grad_norm": 0.990040123462677,
"learning_rate": 1.3682593659498343e-05,
"loss": 2.0111,
"step": 6825
},
{
"epoch": 2.0576598220653928,
"grad_norm": 0.9503148794174194,
"learning_rate": 1.3668952167137948e-05,
"loss": 2.0273,
"step": 6850
},
{
"epoch": 2.065167611396824,
"grad_norm": 0.9117149710655212,
"learning_rate": 1.3655247284424141e-05,
"loss": 2.0239,
"step": 6875
},
{
"epoch": 2.0726754007282557,
"grad_norm": 1.0101039409637451,
"learning_rate": 1.36414791521843e-05,
"loss": 2.0336,
"step": 6900
},
{
"epoch": 2.0726754007282557,
"eval_loss": 2.0416696071624756,
"eval_runtime": 245.4111,
"eval_samples_per_second": 22.856,
"eval_steps_per_second": 5.717,
"step": 6900
},
{
"epoch": 2.080183190059687,
"grad_norm": 0.8587022423744202,
"learning_rate": 1.3627647911895737e-05,
"loss": 2.0239,
"step": 6925
},
{
"epoch": 2.0876909793911183,
"grad_norm": 0.8640381693840027,
"learning_rate": 1.3613753705684241e-05,
"loss": 2.0079,
"step": 6950
},
{
"epoch": 2.0951987687225495,
"grad_norm": 0.8698000907897949,
"learning_rate": 1.3599796676322627e-05,
"loss": 2.0181,
"step": 6975
},
{
"epoch": 2.102706558053981,
"grad_norm": 0.9826030731201172,
"learning_rate": 1.3585776967229254e-05,
"loss": 2.0165,
"step": 7000
},
{
"epoch": 2.102706558053981,
"eval_loss": 2.0403730869293213,
"eval_runtime": 244.4187,
"eval_samples_per_second": 22.948,
"eval_steps_per_second": 5.74,
"step": 7000
},
{
"epoch": 2.1102143473854125,
"grad_norm": 0.9374090433120728,
"learning_rate": 1.3571694722466567e-05,
"loss": 2.0125,
"step": 7025
},
{
"epoch": 2.1177221367168437,
"grad_norm": 0.9569231271743774,
"learning_rate": 1.3557550086739605e-05,
"loss": 2.0426,
"step": 7050
},
{
"epoch": 2.125229926048275,
"grad_norm": 1.0747652053833008,
"learning_rate": 1.3543343205394521e-05,
"loss": 2.0391,
"step": 7075
},
{
"epoch": 2.1327377153797062,
"grad_norm": 0.9164227247238159,
"learning_rate": 1.3529074224417086e-05,
"loss": 2.0171,
"step": 7100
},
{
"epoch": 2.1327377153797062,
"eval_loss": 2.0392725467681885,
"eval_runtime": 244.3097,
"eval_samples_per_second": 22.959,
"eval_steps_per_second": 5.743,
"step": 7100
},
{
"epoch": 2.140245504711138,
"grad_norm": 1.2145719528198242,
"learning_rate": 1.3514743290431186e-05,
"loss": 1.9985,
"step": 7125
},
{
"epoch": 2.147753294042569,
"grad_norm": 1.0173206329345703,
"learning_rate": 1.3500350550697316e-05,
"loss": 2.0221,
"step": 7150
},
{
"epoch": 2.1552610833740005,
"grad_norm": 1.0180777311325073,
"learning_rate": 1.3485896153111076e-05,
"loss": 2.0316,
"step": 7175
},
{
"epoch": 2.1627688727054317,
"grad_norm": 0.9768148064613342,
"learning_rate": 1.3471380246201637e-05,
"loss": 2.0115,
"step": 7200
},
{
"epoch": 2.1627688727054317,
"eval_loss": 2.038167953491211,
"eval_runtime": 244.3446,
"eval_samples_per_second": 22.955,
"eval_steps_per_second": 5.742,
"step": 7200
},
{
"epoch": 2.1702766620368634,
"grad_norm": 1.1061457395553589,
"learning_rate": 1.3456802979130227e-05,
"loss": 2.0091,
"step": 7225
},
{
"epoch": 2.1777844513682947,
"grad_norm": 1.1214226484298706,
"learning_rate": 1.3442164501688593e-05,
"loss": 2.0287,
"step": 7250
},
{
"epoch": 2.185292240699726,
"grad_norm": 0.9686478972434998,
"learning_rate": 1.342746496429746e-05,
"loss": 2.0485,
"step": 7275
},
{
"epoch": 2.192800030031157,
"grad_norm": 0.971811056137085,
"learning_rate": 1.3412704518004983e-05,
"loss": 2.0011,
"step": 7300
},
{
"epoch": 2.192800030031157,
"eval_loss": 2.0375237464904785,
"eval_runtime": 244.4348,
"eval_samples_per_second": 22.947,
"eval_steps_per_second": 5.74,
"step": 7300
},
{
"epoch": 2.200307819362589,
"grad_norm": 0.9958051443099976,
"learning_rate": 1.3397883314485206e-05,
"loss": 2.0151,
"step": 7325
},
{
"epoch": 2.20781560869402,
"grad_norm": 0.9805117249488831,
"learning_rate": 1.3383001506036497e-05,
"loss": 2.012,
"step": 7350
},
{
"epoch": 2.2153233980254514,
"grad_norm": 0.9299209117889404,
"learning_rate": 1.3368059245579976e-05,
"loss": 2.0226,
"step": 7375
},
{
"epoch": 2.2228311873568827,
"grad_norm": 0.9592748880386353,
"learning_rate": 1.3353056686657956e-05,
"loss": 2.0256,
"step": 7400
},
{
"epoch": 2.2228311873568827,
"eval_loss": 2.0365006923675537,
"eval_runtime": 243.9271,
"eval_samples_per_second": 22.995,
"eval_steps_per_second": 5.752,
"step": 7400
},
{
"epoch": 2.230338976688314,
"grad_norm": 0.9213986396789551,
"learning_rate": 1.3337993983432353e-05,
"loss": 2.0179,
"step": 7425
},
{
"epoch": 2.2378467660197456,
"grad_norm": 0.9306337237358093,
"learning_rate": 1.3322871290683117e-05,
"loss": 2.0189,
"step": 7450
},
{
"epoch": 2.245354555351177,
"grad_norm": 0.9785804152488708,
"learning_rate": 1.3307688763806629e-05,
"loss": 2.0228,
"step": 7475
},
{
"epoch": 2.252862344682608,
"grad_norm": 0.9108986258506775,
"learning_rate": 1.3292446558814106e-05,
"loss": 2.0357,
"step": 7500
},
{
"epoch": 2.252862344682608,
"eval_loss": 2.035933494567871,
"eval_runtime": 244.2267,
"eval_samples_per_second": 22.966,
"eval_steps_per_second": 5.745,
"step": 7500
},
{
"epoch": 2.2603701340140394,
"grad_norm": 0.9188127517700195,
"learning_rate": 1.3277144832329998e-05,
"loss": 2.0241,
"step": 7525
},
{
"epoch": 2.267877923345471,
"grad_norm": 0.9804355502128601,
"learning_rate": 1.3261783741590389e-05,
"loss": 2.0234,
"step": 7550
},
{
"epoch": 2.2753857126769024,
"grad_norm": 0.9870203137397766,
"learning_rate": 1.3246363444441365e-05,
"loss": 2.0078,
"step": 7575
},
{
"epoch": 2.2828935020083336,
"grad_norm": 1.1177314519882202,
"learning_rate": 1.3230884099337404e-05,
"loss": 2.0186,
"step": 7600
},
{
"epoch": 2.2828935020083336,
"eval_loss": 2.035186290740967,
"eval_runtime": 244.2073,
"eval_samples_per_second": 22.968,
"eval_steps_per_second": 5.745,
"step": 7600
},
{
"epoch": 2.290401291339765,
"grad_norm": 0.9781551957130432,
"learning_rate": 1.3215345865339738e-05,
"loss": 1.9881,
"step": 7625
},
{
"epoch": 2.2979090806711966,
"grad_norm": 1.1340678930282593,
"learning_rate": 1.3199748902114734e-05,
"loss": 2.0113,
"step": 7650
},
{
"epoch": 2.305416870002628,
"grad_norm": 0.8932919502258301,
"learning_rate": 1.3184093369932237e-05,
"loss": 2.0349,
"step": 7675
},
{
"epoch": 2.312924659334059,
"grad_norm": 0.9024244546890259,
"learning_rate": 1.3168379429663924e-05,
"loss": 2.0241,
"step": 7700
},
{
"epoch": 2.312924659334059,
"eval_loss": 2.0337536334991455,
"eval_runtime": 243.8773,
"eval_samples_per_second": 22.999,
"eval_steps_per_second": 5.753,
"step": 7700
},
{
"epoch": 2.3204324486654904,
"grad_norm": 0.9510346055030823,
"learning_rate": 1.3152607242781668e-05,
"loss": 2.0297,
"step": 7725
},
{
"epoch": 2.3279402379969216,
"grad_norm": 1.004501461982727,
"learning_rate": 1.313677697135586e-05,
"loss": 2.0276,
"step": 7750
},
{
"epoch": 2.3354480273283533,
"grad_norm": 1.0247652530670166,
"learning_rate": 1.312088877805375e-05,
"loss": 2.0152,
"step": 7775
},
{
"epoch": 2.3429558166597846,
"grad_norm": 0.9948970675468445,
"learning_rate": 1.3104942826137785e-05,
"loss": 2.0104,
"step": 7800
},
{
"epoch": 2.3429558166597846,
"eval_loss": 2.032724618911743,
"eval_runtime": 244.6368,
"eval_samples_per_second": 22.928,
"eval_steps_per_second": 5.735,
"step": 7800
},
{
"epoch": 2.350463605991216,
"grad_norm": 1.062002182006836,
"learning_rate": 1.3088939279463914e-05,
"loss": 2.0329,
"step": 7825
},
{
"epoch": 2.357971395322647,
"grad_norm": 0.9641005396842957,
"learning_rate": 1.3072878302479912e-05,
"loss": 2.0121,
"step": 7850
},
{
"epoch": 2.3654791846540784,
"grad_norm": 0.9504510164260864,
"learning_rate": 1.30567600602237e-05,
"loss": 2.0203,
"step": 7875
},
{
"epoch": 2.37298697398551,
"grad_norm": 0.970635712146759,
"learning_rate": 1.3040584718321629e-05,
"loss": 2.0101,
"step": 7900
},
{
"epoch": 2.37298697398551,
"eval_loss": 2.032496452331543,
"eval_runtime": 243.9409,
"eval_samples_per_second": 22.993,
"eval_steps_per_second": 5.751,
"step": 7900
},
{
"epoch": 2.3804947633169413,
"grad_norm": 0.9251878261566162,
"learning_rate": 1.30243524429868e-05,
"loss": 2.0166,
"step": 7925
},
{
"epoch": 2.3880025526483726,
"grad_norm": 0.8651822805404663,
"learning_rate": 1.300806340101734e-05,
"loss": 2.0213,
"step": 7950
},
{
"epoch": 2.3955103419798043,
"grad_norm": 1.0655325651168823,
"learning_rate": 1.2991717759794689e-05,
"loss": 1.9892,
"step": 7975
},
{
"epoch": 2.4030181313112355,
"grad_norm": 0.8861711621284485,
"learning_rate": 1.2975315687281895e-05,
"loss": 2.0632,
"step": 8000
},
{
"epoch": 2.4030181313112355,
"eval_loss": 2.031506299972534,
"eval_runtime": 244.4184,
"eval_samples_per_second": 22.948,
"eval_steps_per_second": 5.74,
"step": 8000
},
{
"epoch": 2.410525920642667,
"grad_norm": 1.0595537424087524,
"learning_rate": 1.2958857352021873e-05,
"loss": 2.0257,
"step": 8025
},
{
"epoch": 2.418033709974098,
"grad_norm": 1.1569972038269043,
"learning_rate": 1.2942342923135669e-05,
"loss": 2.0165,
"step": 8050
},
{
"epoch": 2.4255414993055293,
"grad_norm": 0.9342359900474548,
"learning_rate": 1.2925772570320744e-05,
"loss": 2.0085,
"step": 8075
},
{
"epoch": 2.433049288636961,
"grad_norm": 0.9486634731292725,
"learning_rate": 1.2909146463849207e-05,
"loss": 1.9926,
"step": 8100
},
{
"epoch": 2.433049288636961,
"eval_loss": 2.0305228233337402,
"eval_runtime": 244.4927,
"eval_samples_per_second": 22.941,
"eval_steps_per_second": 5.738,
"step": 8100
},
{
"epoch": 2.4405570779683923,
"grad_norm": 1.04513418674469,
"learning_rate": 1.2892464774566082e-05,
"loss": 2.0207,
"step": 8125
},
{
"epoch": 2.4480648672998235,
"grad_norm": 1.0375896692276,
"learning_rate": 1.2875727673887548e-05,
"loss": 2.0299,
"step": 8150
},
{
"epoch": 2.455572656631255,
"grad_norm": 0.8860157132148743,
"learning_rate": 1.2858935333799161e-05,
"loss": 2.0164,
"step": 8175
},
{
"epoch": 2.463080445962686,
"grad_norm": 0.9642972350120544,
"learning_rate": 1.2842087926854117e-05,
"loss": 1.9905,
"step": 8200
},
{
"epoch": 2.463080445962686,
"eval_loss": 2.029367208480835,
"eval_runtime": 244.4104,
"eval_samples_per_second": 22.949,
"eval_steps_per_second": 5.74,
"step": 8200
},
{
"epoch": 2.4705882352941178,
"grad_norm": 0.9699326753616333,
"learning_rate": 1.282518562617145e-05,
"loss": 2.05,
"step": 8225
},
{
"epoch": 2.478096024625549,
"grad_norm": 1.12892746925354,
"learning_rate": 1.2808228605434282e-05,
"loss": 1.984,
"step": 8250
},
{
"epoch": 2.4856038139569803,
"grad_norm": 0.9147679209709167,
"learning_rate": 1.2791217038888008e-05,
"loss": 2.0349,
"step": 8275
},
{
"epoch": 2.493111603288412,
"grad_norm": 0.9576278328895569,
"learning_rate": 1.2774151101338523e-05,
"loss": 2.0547,
"step": 8300
},
{
"epoch": 2.493111603288412,
"eval_loss": 2.0288000106811523,
"eval_runtime": 244.145,
"eval_samples_per_second": 22.974,
"eval_steps_per_second": 5.747,
"step": 8300
},
{
"epoch": 2.5006193926198432,
"grad_norm": 1.0111256837844849,
"learning_rate": 1.2757030968150426e-05,
"loss": 2.0108,
"step": 8325
},
{
"epoch": 2.5081271819512745,
"grad_norm": 0.8969287276268005,
"learning_rate": 1.2739856815245213e-05,
"loss": 1.9897,
"step": 8350
},
{
"epoch": 2.5156349712827057,
"grad_norm": 1.02077054977417,
"learning_rate": 1.2722628819099472e-05,
"loss": 2.0071,
"step": 8375
},
{
"epoch": 2.523142760614137,
"grad_norm": 0.9784366488456726,
"learning_rate": 1.2705347156743066e-05,
"loss": 2.0018,
"step": 8400
},
{
"epoch": 2.523142760614137,
"eval_loss": 2.027707099914551,
"eval_runtime": 244.2262,
"eval_samples_per_second": 22.966,
"eval_steps_per_second": 5.745,
"step": 8400
},
{
"epoch": 2.5306505499455687,
"grad_norm": 0.9159882664680481,
"learning_rate": 1.2688012005757317e-05,
"loss": 2.0298,
"step": 8425
},
{
"epoch": 2.538158339277,
"grad_norm": 1.080963373184204,
"learning_rate": 1.2670623544273182e-05,
"loss": 2.015,
"step": 8450
},
{
"epoch": 2.5456661286084312,
"grad_norm": 0.9042007923126221,
"learning_rate": 1.2653181950969418e-05,
"loss": 1.9907,
"step": 8475
},
{
"epoch": 2.5531739179398625,
"grad_norm": 0.9830322861671448,
"learning_rate": 1.2635687405070755e-05,
"loss": 2.015,
"step": 8500
},
{
"epoch": 2.5531739179398625,
"eval_loss": 2.0268571376800537,
"eval_runtime": 244.5259,
"eval_samples_per_second": 22.938,
"eval_steps_per_second": 5.738,
"step": 8500
},
{
"epoch": 2.5606817072712937,
"grad_norm": 0.8969373106956482,
"learning_rate": 1.2618842990073232e-05,
"loss": 1.985,
"step": 8525
},
{
"epoch": 2.5681894966027254,
"grad_norm": 1.0655286312103271,
"learning_rate": 1.2601245179065439e-05,
"loss": 2.0409,
"step": 8550
},
{
"epoch": 2.5756972859341567,
"grad_norm": 1.0102958679199219,
"learning_rate": 1.2583594949149863e-05,
"loss": 2.0358,
"step": 8575
},
{
"epoch": 2.583205075265588,
"grad_norm": 0.9221513271331787,
"learning_rate": 1.2565892481695126e-05,
"loss": 2.0241,
"step": 8600
},
{
"epoch": 2.583205075265588,
"eval_loss": 2.025696039199829,
"eval_runtime": 244.8481,
"eval_samples_per_second": 22.908,
"eval_steps_per_second": 5.73,
"step": 8600
},
{
"epoch": 2.5907128645970197,
"grad_norm": 1.0198999643325806,
"learning_rate": 1.2548137958606616e-05,
"loss": 2.0061,
"step": 8625
},
{
"epoch": 2.598220653928451,
"grad_norm": 1.0228906869888306,
"learning_rate": 1.2530331562324637e-05,
"loss": 2.0183,
"step": 8650
},
{
"epoch": 2.605728443259882,
"grad_norm": 0.9328727126121521,
"learning_rate": 1.2512473475822524e-05,
"loss": 2.0111,
"step": 8675
},
{
"epoch": 2.6132362325913134,
"grad_norm": 1.0237301588058472,
"learning_rate": 1.2494563882604764e-05,
"loss": 2.0461,
"step": 8700
},
{
"epoch": 2.6132362325913134,
"eval_loss": 2.025115489959717,
"eval_runtime": 244.776,
"eval_samples_per_second": 22.915,
"eval_steps_per_second": 5.732,
"step": 8700
},
{
"epoch": 2.6207440219227447,
"grad_norm": 1.0419483184814453,
"learning_rate": 1.2476602966705117e-05,
"loss": 2.0226,
"step": 8725
},
{
"epoch": 2.6282518112541764,
"grad_norm": 1.0212359428405762,
"learning_rate": 1.2458590912684718e-05,
"loss": 2.0294,
"step": 8750
},
{
"epoch": 2.6357596005856077,
"grad_norm": 0.9352961778640747,
"learning_rate": 1.2440527905630174e-05,
"loss": 2.0287,
"step": 8775
},
{
"epoch": 2.643267389917039,
"grad_norm": 0.9289619326591492,
"learning_rate": 1.2422414131151686e-05,
"loss": 1.9629,
"step": 8800
},
{
"epoch": 2.643267389917039,
"eval_loss": 2.023833751678467,
"eval_runtime": 244.5795,
"eval_samples_per_second": 22.933,
"eval_steps_per_second": 5.736,
"step": 8800
},
{
"epoch": 2.65077517924847,
"grad_norm": 1.081150770187378,
"learning_rate": 1.2404249775381112e-05,
"loss": 2.0166,
"step": 8825
},
{
"epoch": 2.6582829685799014,
"grad_norm": 0.9818612933158875,
"learning_rate": 1.2386035024970076e-05,
"loss": 2.0314,
"step": 8850
},
{
"epoch": 2.665790757911333,
"grad_norm": 0.9447384476661682,
"learning_rate": 1.2367770067088045e-05,
"loss": 2.0172,
"step": 8875
},
{
"epoch": 2.6732985472427644,
"grad_norm": 0.9655535817146301,
"learning_rate": 1.2349455089420397e-05,
"loss": 2.0163,
"step": 8900
},
{
"epoch": 2.6732985472427644,
"eval_loss": 2.0230913162231445,
"eval_runtime": 244.504,
"eval_samples_per_second": 22.94,
"eval_steps_per_second": 5.738,
"step": 8900
},
{
"epoch": 2.6808063365741956,
"grad_norm": 1.010567307472229,
"learning_rate": 1.2331090280166499e-05,
"loss": 2.0132,
"step": 8925
},
{
"epoch": 2.6883141259056273,
"grad_norm": 1.014929175376892,
"learning_rate": 1.2312675828037778e-05,
"loss": 2.0155,
"step": 8950
},
{
"epoch": 2.6958219152370586,
"grad_norm": 0.9091641902923584,
"learning_rate": 1.2294211922255775e-05,
"loss": 2.0069,
"step": 8975
},
{
"epoch": 2.70332970456849,
"grad_norm": 1.0267935991287231,
"learning_rate": 1.2275698752550196e-05,
"loss": 2.0101,
"step": 9000
},
{
"epoch": 2.70332970456849,
"eval_loss": 2.0226101875305176,
"eval_runtime": 244.615,
"eval_samples_per_second": 22.93,
"eval_steps_per_second": 5.736,
"step": 9000
},
{
"epoch": 2.710837493899921,
"grad_norm": 1.147930383682251,
"learning_rate": 1.2257136509156978e-05,
"loss": 1.9859,
"step": 9025
},
{
"epoch": 2.7183452832313524,
"grad_norm": 1.0729800462722778,
"learning_rate": 1.2238525382816322e-05,
"loss": 2.0083,
"step": 9050
},
{
"epoch": 2.725853072562784,
"grad_norm": 1.0532081127166748,
"learning_rate": 1.2219865564770731e-05,
"loss": 2.0317,
"step": 9075
},
{
"epoch": 2.7333608618942153,
"grad_norm": 1.0475471019744873,
"learning_rate": 1.2201157246763056e-05,
"loss": 2.0117,
"step": 9100
},
{
"epoch": 2.7333608618942153,
"eval_loss": 2.0220327377319336,
"eval_runtime": 244.6775,
"eval_samples_per_second": 22.924,
"eval_steps_per_second": 5.734,
"step": 9100
},
{
"epoch": 2.7408686512256466,
"grad_norm": 0.9435563683509827,
"learning_rate": 1.2182400621034513e-05,
"loss": 2.0271,
"step": 9125
},
{
"epoch": 2.748376440557078,
"grad_norm": 0.9693319201469421,
"learning_rate": 1.2163595880322726e-05,
"loss": 2.0162,
"step": 9150
},
{
"epoch": 2.755884229888509,
"grad_norm": 1.0163437128067017,
"learning_rate": 1.2144743217859717e-05,
"loss": 2.0039,
"step": 9175
},
{
"epoch": 2.763392019219941,
"grad_norm": 0.8770220279693604,
"learning_rate": 1.2125842827369955e-05,
"loss": 2.0098,
"step": 9200
},
{
"epoch": 2.763392019219941,
"eval_loss": 2.021249771118164,
"eval_runtime": 244.5171,
"eval_samples_per_second": 22.939,
"eval_steps_per_second": 5.738,
"step": 9200
},
{
"epoch": 2.770899808551372,
"grad_norm": 0.9660369753837585,
"learning_rate": 1.2106894903068337e-05,
"loss": 2.0,
"step": 9225
},
{
"epoch": 2.7784075978828033,
"grad_norm": 1.1277518272399902,
"learning_rate": 1.2087899639658208e-05,
"loss": 2.0048,
"step": 9250
},
{
"epoch": 2.785915387214235,
"grad_norm": 0.9551436305046082,
"learning_rate": 1.2068857232329355e-05,
"loss": 1.9856,
"step": 9275
},
{
"epoch": 2.793423176545666,
"grad_norm": 0.9860432744026184,
"learning_rate": 1.2049767876756002e-05,
"loss": 2.0292,
"step": 9300
},
{
"epoch": 2.793423176545666,
"eval_loss": 2.0205230712890625,
"eval_runtime": 244.4184,
"eval_samples_per_second": 22.948,
"eval_steps_per_second": 5.74,
"step": 9300
},
{
"epoch": 2.8009309658770976,
"grad_norm": 1.023398756980896,
"learning_rate": 1.2030631769094799e-05,
"loss": 2.0173,
"step": 9325
},
{
"epoch": 2.808438755208529,
"grad_norm": 0.9791613817214966,
"learning_rate": 1.2011449105982813e-05,
"loss": 2.0237,
"step": 9350
},
{
"epoch": 2.81594654453996,
"grad_norm": 0.9436085224151611,
"learning_rate": 1.1992220084535487e-05,
"loss": 1.99,
"step": 9375
},
{
"epoch": 2.8234543338713918,
"grad_norm": 0.9325253367424011,
"learning_rate": 1.1972944902344646e-05,
"loss": 2.0368,
"step": 9400
},
{
"epoch": 2.8234543338713918,
"eval_loss": 2.019615650177002,
"eval_runtime": 244.3993,
"eval_samples_per_second": 22.95,
"eval_steps_per_second": 5.741,
"step": 9400
},
{
"epoch": 2.830962123202823,
"grad_norm": 0.9791749119758606,
"learning_rate": 1.1953623757476436e-05,
"loss": 2.0055,
"step": 9425
},
{
"epoch": 2.8384699125342543,
"grad_norm": 0.9658190608024597,
"learning_rate": 1.1934256848469312e-05,
"loss": 2.0166,
"step": 9450
},
{
"epoch": 2.8459777018656855,
"grad_norm": 1.026522159576416,
"learning_rate": 1.1914844374331974e-05,
"loss": 1.9916,
"step": 9475
},
{
"epoch": 2.853485491197117,
"grad_norm": 1.1535567045211792,
"learning_rate": 1.1895386534541354e-05,
"loss": 1.9948,
"step": 9500
},
{
"epoch": 2.853485491197117,
"eval_loss": 2.0190258026123047,
"eval_runtime": 244.5245,
"eval_samples_per_second": 22.938,
"eval_steps_per_second": 5.738,
"step": 9500
},
{
"epoch": 2.8609932805285485,
"grad_norm": 0.8700292110443115,
"learning_rate": 1.1875883529040534e-05,
"loss": 1.9998,
"step": 9525
},
{
"epoch": 2.8685010698599798,
"grad_norm": 1.00760018825531,
"learning_rate": 1.1856335558236714e-05,
"loss": 2.0286,
"step": 9550
},
{
"epoch": 2.876008859191411,
"grad_norm": 1.0481544733047485,
"learning_rate": 1.1836742822999139e-05,
"loss": 2.0145,
"step": 9575
},
{
"epoch": 2.8835166485228423,
"grad_norm": 0.9422263503074646,
"learning_rate": 1.1817105524657043e-05,
"loss": 2.0123,
"step": 9600
},
{
"epoch": 2.8835166485228423,
"eval_loss": 2.018214702606201,
"eval_runtime": 244.6614,
"eval_samples_per_second": 22.926,
"eval_steps_per_second": 5.734,
"step": 9600
},
{
"epoch": 2.8910244378542735,
"grad_norm": 1.012352466583252,
"learning_rate": 1.1797423864997577e-05,
"loss": 2.0425,
"step": 9625
},
{
"epoch": 2.8985322271857052,
"grad_norm": 1.0469133853912354,
"learning_rate": 1.1777698046263735e-05,
"loss": 2.0266,
"step": 9650
},
{
"epoch": 2.9060400165171365,
"grad_norm": 1.0227727890014648,
"learning_rate": 1.175792827115228e-05,
"loss": 2.0272,
"step": 9675
},
{
"epoch": 2.9135478058485678,
"grad_norm": 1.1656129360198975,
"learning_rate": 1.1738114742811654e-05,
"loss": 1.9813,
"step": 9700
},
{
"epoch": 2.9135478058485678,
"eval_loss": 2.017220973968506,
"eval_runtime": 244.7357,
"eval_samples_per_second": 22.919,
"eval_steps_per_second": 5.733,
"step": 9700
},
{
"epoch": 2.9210555951799995,
"grad_norm": 0.9345014095306396,
"learning_rate": 1.1718257664839896e-05,
"loss": 1.9932,
"step": 9725
},
{
"epoch": 2.9285633845114307,
"grad_norm": 1.0153813362121582,
"learning_rate": 1.1698357241282546e-05,
"loss": 2.0216,
"step": 9750
},
{
"epoch": 2.936071173842862,
"grad_norm": 1.0141171216964722,
"learning_rate": 1.167841367663056e-05,
"loss": 2.0118,
"step": 9775
},
{
"epoch": 2.9435789631742932,
"grad_norm": 1.0706440210342407,
"learning_rate": 1.1658427175818184e-05,
"loss": 1.9952,
"step": 9800
},
{
"epoch": 2.9435789631742932,
"eval_loss": 2.016911029815674,
"eval_runtime": 244.4656,
"eval_samples_per_second": 22.944,
"eval_steps_per_second": 5.739,
"step": 9800
},
{
"epoch": 2.9510867525057245,
"grad_norm": 0.9770407676696777,
"learning_rate": 1.1638397944220876e-05,
"loss": 2.0154,
"step": 9825
},
{
"epoch": 2.958594541837156,
"grad_norm": 0.9835750460624695,
"learning_rate": 1.1618326187653178e-05,
"loss": 2.0186,
"step": 9850
},
{
"epoch": 2.9661023311685875,
"grad_norm": 1.0434762239456177,
"learning_rate": 1.1598212112366606e-05,
"loss": 1.9859,
"step": 9875
},
{
"epoch": 2.9736101205000187,
"grad_norm": 1.0988759994506836,
"learning_rate": 1.1578055925047533e-05,
"loss": 2.0024,
"step": 9900
},
{
"epoch": 2.9736101205000187,
"eval_loss": 2.0162084102630615,
"eval_runtime": 244.4388,
"eval_samples_per_second": 22.946,
"eval_steps_per_second": 5.74,
"step": 9900
},
{
"epoch": 2.98111790983145,
"grad_norm": 0.9690369367599487,
"learning_rate": 1.1557857832815063e-05,
"loss": 2.0261,
"step": 9925
},
{
"epoch": 2.9886256991628812,
"grad_norm": 0.932151198387146,
"learning_rate": 1.1537618043218898e-05,
"loss": 2.0233,
"step": 9950
},
{
"epoch": 2.996133488494313,
"grad_norm": 1.0118919610977173,
"learning_rate": 1.1517336764237217e-05,
"loss": 1.981,
"step": 9975
},
{
"epoch": 3.0039040504523444,
"grad_norm": 1.0406084060668945,
"learning_rate": 1.1497014204274526e-05,
"loss": 2.0523,
"step": 10000
},
{
"epoch": 3.0039040504523444,
"eval_loss": 2.0155766010284424,
"eval_runtime": 243.5325,
"eval_samples_per_second": 23.032,
"eval_steps_per_second": 5.761,
"step": 10000
},
{
"epoch": 3.0114118397837757,
"grad_norm": 1.0300322771072388,
"learning_rate": 1.1476650572159522e-05,
"loss": 1.9657,
"step": 10025
},
{
"epoch": 3.018919629115207,
"grad_norm": 1.0281704664230347,
"learning_rate": 1.1456246077142954e-05,
"loss": 1.9883,
"step": 10050
},
{
"epoch": 3.026427418446638,
"grad_norm": 1.0092098712921143,
"learning_rate": 1.1435800928895464e-05,
"loss": 2.003,
"step": 10075
},
{
"epoch": 3.03393520777807,
"grad_norm": 1.0722483396530151,
"learning_rate": 1.1415315337505426e-05,
"loss": 1.9913,
"step": 10100
},
{
"epoch": 3.03393520777807,
"eval_loss": 2.0157699584960938,
"eval_runtime": 244.4253,
"eval_samples_per_second": 22.948,
"eval_steps_per_second": 5.74,
"step": 10100
},
{
"epoch": 3.041442997109501,
"grad_norm": 0.9789544939994812,
"learning_rate": 1.1394789513476809e-05,
"loss": 1.9866,
"step": 10125
},
{
"epoch": 3.0489507864409324,
"grad_norm": 1.0212770700454712,
"learning_rate": 1.137422366772699e-05,
"loss": 1.976,
"step": 10150
},
{
"epoch": 3.0564585757723637,
"grad_norm": 1.1227072477340698,
"learning_rate": 1.1353618011584607e-05,
"loss": 1.9816,
"step": 10175
},
{
"epoch": 3.0639663651037954,
"grad_norm": 1.0329065322875977,
"learning_rate": 1.1332972756787368e-05,
"loss": 1.9773,
"step": 10200
},
{
"epoch": 3.0639663651037954,
"eval_loss": 2.01505708694458,
"eval_runtime": 244.0878,
"eval_samples_per_second": 22.979,
"eval_steps_per_second": 5.748,
"step": 10200
},
{
"epoch": 3.0714741544352266,
"grad_norm": 1.0419589281082153,
"learning_rate": 1.1312288115479897e-05,
"loss": 1.9966,
"step": 10225
},
{
"epoch": 3.078981943766658,
"grad_norm": 1.0318610668182373,
"learning_rate": 1.1291564300211533e-05,
"loss": 1.9615,
"step": 10250
},
{
"epoch": 3.086489733098089,
"grad_norm": 1.0802398920059204,
"learning_rate": 1.1270801523934156e-05,
"loss": 1.9815,
"step": 10275
},
{
"epoch": 3.0939975224295204,
"grad_norm": 1.0594321489334106,
"learning_rate": 1.125e-05,
"loss": 2.0002,
"step": 10300
},
{
"epoch": 3.0939975224295204,
"eval_loss": 2.0144717693328857,
"eval_runtime": 244.0019,
"eval_samples_per_second": 22.988,
"eval_steps_per_second": 5.75,
"step": 10300
},
{
"epoch": 3.101505311760952,
"grad_norm": 0.8644378781318665,
"learning_rate": 1.122915994215946e-05,
"loss": 1.9563,
"step": 10325
},
{
"epoch": 3.1090131010923834,
"grad_norm": 1.0262008905410767,
"learning_rate": 1.1208281564558895e-05,
"loss": 1.9977,
"step": 10350
},
{
"epoch": 3.1165208904238146,
"grad_norm": 1.1098688840866089,
"learning_rate": 1.1187365081738422e-05,
"loss": 1.9673,
"step": 10375
},
{
"epoch": 3.124028679755246,
"grad_norm": 1.0585020780563354,
"learning_rate": 1.1166410708629716e-05,
"loss": 1.9967,
"step": 10400
},
{
"epoch": 3.124028679755246,
"eval_loss": 2.014115571975708,
"eval_runtime": 244.2712,
"eval_samples_per_second": 22.962,
"eval_steps_per_second": 5.744,
"step": 10400
},
{
"epoch": 3.1315364690866776,
"grad_norm": 0.9442121386528015,
"learning_rate": 1.1145418660553808e-05,
"loss": 2.0003,
"step": 10425
},
{
"epoch": 3.139044258418109,
"grad_norm": 1.0891814231872559,
"learning_rate": 1.1124389153218861e-05,
"loss": 2.0022,
"step": 10450
},
{
"epoch": 3.14655204774954,
"grad_norm": 1.0310977697372437,
"learning_rate": 1.1103322402717958e-05,
"loss": 1.9881,
"step": 10475
},
{
"epoch": 3.1540598370809714,
"grad_norm": 1.2457115650177002,
"learning_rate": 1.1082218625526887e-05,
"loss": 1.9545,
"step": 10500
},
{
"epoch": 3.1540598370809714,
"eval_loss": 2.0137479305267334,
"eval_runtime": 244.4917,
"eval_samples_per_second": 22.941,
"eval_steps_per_second": 5.738,
"step": 10500
},
{
"epoch": 3.161567626412403,
"grad_norm": 1.0390257835388184,
"learning_rate": 1.1061078038501906e-05,
"loss": 1.9965,
"step": 10525
},
{
"epoch": 3.1690754157438343,
"grad_norm": 0.9900075793266296,
"learning_rate": 1.1039900858877521e-05,
"loss": 2.0066,
"step": 10550
},
{
"epoch": 3.1765832050752656,
"grad_norm": 1.074483871459961,
"learning_rate": 1.1018687304264256e-05,
"loss": 1.9794,
"step": 10575
},
{
"epoch": 3.184090994406697,
"grad_norm": 0.9264243245124817,
"learning_rate": 1.099743759264641e-05,
"loss": 1.9793,
"step": 10600
},
{
"epoch": 3.184090994406697,
"eval_loss": 2.013479709625244,
"eval_runtime": 244.7217,
"eval_samples_per_second": 22.92,
"eval_steps_per_second": 5.733,
"step": 10600
},
{
"epoch": 3.191598783738128,
"grad_norm": 1.0158064365386963,
"learning_rate": 1.097615194237982e-05,
"loss": 1.992,
"step": 10625
},
{
"epoch": 3.19910657306956,
"grad_norm": 1.084500789642334,
"learning_rate": 1.0954830572189625e-05,
"loss": 1.981,
"step": 10650
},
{
"epoch": 3.206614362400991,
"grad_norm": 1.1871960163116455,
"learning_rate": 1.0933473701168006e-05,
"loss": 2.0098,
"step": 10675
},
{
"epoch": 3.2141221517324223,
"grad_norm": 1.0174176692962646,
"learning_rate": 1.0912081548771941e-05,
"loss": 1.9898,
"step": 10700
},
{
"epoch": 3.2141221517324223,
"eval_loss": 2.012505054473877,
"eval_runtime": 244.4334,
"eval_samples_per_second": 22.947,
"eval_steps_per_second": 5.74,
"step": 10700
},
{
"epoch": 3.2216299410638536,
"grad_norm": 1.1954680681228638,
"learning_rate": 1.089065433482095e-05,
"loss": 1.9965,
"step": 10725
},
{
"epoch": 3.2291377303952853,
"grad_norm": 1.0380609035491943,
"learning_rate": 1.0869192279494832e-05,
"loss": 2.0142,
"step": 10750
},
{
"epoch": 3.2366455197267165,
"grad_norm": 1.1713154315948486,
"learning_rate": 1.0847695603331412e-05,
"loss": 2.0032,
"step": 10775
},
{
"epoch": 3.244153309058148,
"grad_norm": 0.9350267648696899,
"learning_rate": 1.0826164527224262e-05,
"loss": 1.9926,
"step": 10800
},
{
"epoch": 3.244153309058148,
"eval_loss": 2.0120630264282227,
"eval_runtime": 244.3746,
"eval_samples_per_second": 22.952,
"eval_steps_per_second": 5.741,
"step": 10800
},
{
"epoch": 3.251661098389579,
"grad_norm": 1.1291122436523438,
"learning_rate": 1.0804599272420443e-05,
"loss": 1.9854,
"step": 10825
},
{
"epoch": 3.2591688877210103,
"grad_norm": 0.9929710030555725,
"learning_rate": 1.0783000060518225e-05,
"loss": 1.9712,
"step": 10850
},
{
"epoch": 3.266676677052442,
"grad_norm": 0.9652737379074097,
"learning_rate": 1.076136711346481e-05,
"loss": 1.9767,
"step": 10875
},
{
"epoch": 3.2741844663838733,
"grad_norm": 0.9600501656532288,
"learning_rate": 1.0739700653554052e-05,
"loss": 1.9792,
"step": 10900
},
{
"epoch": 3.2741844663838733,
"eval_loss": 2.0115151405334473,
"eval_runtime": 244.8887,
"eval_samples_per_second": 22.904,
"eval_steps_per_second": 5.729,
"step": 10900
},
{
"epoch": 3.2816922557153045,
"grad_norm": 1.0329478979110718,
"learning_rate": 1.0718000903424174e-05,
"loss": 1.9961,
"step": 10925
},
{
"epoch": 3.289200045046736,
"grad_norm": 1.1442408561706543,
"learning_rate": 1.0696268086055482e-05,
"loss": 1.9898,
"step": 10950
},
{
"epoch": 3.2967078343781675,
"grad_norm": 1.0361113548278809,
"learning_rate": 1.0674502424768066e-05,
"loss": 1.9861,
"step": 10975
},
{
"epoch": 3.3042156237095988,
"grad_norm": 0.997988760471344,
"learning_rate": 1.0652704143219519e-05,
"loss": 1.99,
"step": 11000
},
{
"epoch": 3.3042156237095988,
"eval_loss": 2.0116584300994873,
"eval_runtime": 243.9919,
"eval_samples_per_second": 22.988,
"eval_steps_per_second": 5.75,
"step": 11000
},
{
"epoch": 3.31172341304103,
"grad_norm": 0.9052268266677856,
"learning_rate": 1.0630873465402622e-05,
"loss": 1.9942,
"step": 11025
},
{
"epoch": 3.3192312023724613,
"grad_norm": 0.9491928815841675,
"learning_rate": 1.0609010615643052e-05,
"loss": 2.0145,
"step": 11050
},
{
"epoch": 3.326738991703893,
"grad_norm": 1.0330880880355835,
"learning_rate": 1.058711581859708e-05,
"loss": 1.992,
"step": 11075
},
{
"epoch": 3.3342467810353242,
"grad_norm": 1.0044811964035034,
"learning_rate": 1.0565189299249254e-05,
"loss": 2.0099,
"step": 11100
},
{
"epoch": 3.3342467810353242,
"eval_loss": 2.0105700492858887,
"eval_runtime": 244.4106,
"eval_samples_per_second": 22.949,
"eval_steps_per_second": 5.74,
"step": 11100
},
{
"epoch": 3.3417545703667555,
"grad_norm": 1.0180730819702148,
"learning_rate": 1.0543231282910093e-05,
"loss": 1.9847,
"step": 11125
},
{
"epoch": 3.3492623596981868,
"grad_norm": 1.0637898445129395,
"learning_rate": 1.0521241995213771e-05,
"loss": 1.9725,
"step": 11150
},
{
"epoch": 3.356770149029618,
"grad_norm": 1.1966840028762817,
"learning_rate": 1.049922166211579e-05,
"loss": 1.9909,
"step": 11175
},
{
"epoch": 3.3642779383610497,
"grad_norm": 1.0537995100021362,
"learning_rate": 1.0477170509890681e-05,
"loss": 2.0051,
"step": 11200
},
{
"epoch": 3.3642779383610497,
"eval_loss": 2.0095300674438477,
"eval_runtime": 244.5586,
"eval_samples_per_second": 22.935,
"eval_steps_per_second": 5.737,
"step": 11200
},
{
"epoch": 3.371785727692481,
"grad_norm": 0.9709149599075317,
"learning_rate": 1.0455088765129643e-05,
"loss": 1.9907,
"step": 11225
},
{
"epoch": 3.3792935170239122,
"grad_norm": 1.1112037897109985,
"learning_rate": 1.043297665473825e-05,
"loss": 1.9855,
"step": 11250
},
{
"epoch": 3.3868013063553435,
"grad_norm": 0.9346416592597961,
"learning_rate": 1.0410834405934099e-05,
"loss": 2.0005,
"step": 11275
},
{
"epoch": 3.394309095686775,
"grad_norm": 1.053544044494629,
"learning_rate": 1.0388662246244482e-05,
"loss": 1.9858,
"step": 11300
},
{
"epoch": 3.394309095686775,
"eval_loss": 2.0087532997131348,
"eval_runtime": 244.6298,
"eval_samples_per_second": 22.929,
"eval_steps_per_second": 5.735,
"step": 11300
},
{
"epoch": 3.4018168850182064,
"grad_norm": 1.0392097234725952,
"learning_rate": 1.0366460403504045e-05,
"loss": 1.9907,
"step": 11325
},
{
"epoch": 3.4093246743496377,
"grad_norm": 0.9744161367416382,
"learning_rate": 1.0344229105852453e-05,
"loss": 1.9888,
"step": 11350
},
{
"epoch": 3.416832463681069,
"grad_norm": 1.0045557022094727,
"learning_rate": 1.0321968581732035e-05,
"loss": 2.0007,
"step": 11375
},
{
"epoch": 3.4243402530125007,
"grad_norm": 1.0795562267303467,
"learning_rate": 1.0299679059885441e-05,
"loss": 1.9836,
"step": 11400
},
{
"epoch": 3.4243402530125007,
"eval_loss": 2.008427381515503,
"eval_runtime": 243.7629,
"eval_samples_per_second": 23.01,
"eval_steps_per_second": 5.756,
"step": 11400
},
{
"epoch": 3.431848042343932,
"grad_norm": 1.0574262142181396,
"learning_rate": 1.0277360769353302e-05,
"loss": 1.9968,
"step": 11425
},
{
"epoch": 3.439355831675363,
"grad_norm": 1.0723813772201538,
"learning_rate": 1.0255013939471862e-05,
"loss": 1.9778,
"step": 11450
},
{
"epoch": 3.4468636210067944,
"grad_norm": 1.0221625566482544,
"learning_rate": 1.0232638799870627e-05,
"loss": 1.9795,
"step": 11475
},
{
"epoch": 3.4543714103382257,
"grad_norm": 1.0293052196502686,
"learning_rate": 1.0210235580470003e-05,
"loss": 2.0101,
"step": 11500
},
{
"epoch": 3.4543714103382257,
"eval_loss": 2.008002996444702,
"eval_runtime": 244.5192,
"eval_samples_per_second": 22.939,
"eval_steps_per_second": 5.738,
"step": 11500
},
{
"epoch": 3.4618791996696574,
"grad_norm": 0.9779027700424194,
"learning_rate": 1.0187804511478948e-05,
"loss": 2.0353,
"step": 11525
},
{
"epoch": 3.4693869890010887,
"grad_norm": 1.3106768131256104,
"learning_rate": 1.0165345823392577e-05,
"loss": 1.9887,
"step": 11550
},
{
"epoch": 3.47689477833252,
"grad_norm": 1.0175050497055054,
"learning_rate": 1.0142859746989822e-05,
"loss": 1.9838,
"step": 11575
},
{
"epoch": 3.484402567663951,
"grad_norm": 1.142027735710144,
"learning_rate": 1.0120346513331048e-05,
"loss": 1.9585,
"step": 11600
},
{
"epoch": 3.484402567663951,
"eval_loss": 2.0071005821228027,
"eval_runtime": 244.0492,
"eval_samples_per_second": 22.983,
"eval_steps_per_second": 5.749,
"step": 11600
},
{
"epoch": 3.491910356995383,
"grad_norm": 1.0209110975265503,
"learning_rate": 1.0097806353755675e-05,
"loss": 1.9731,
"step": 11625
},
{
"epoch": 3.499418146326814,
"grad_norm": 1.046372413635254,
"learning_rate": 1.0075239499879812e-05,
"loss": 1.9688,
"step": 11650
},
{
"epoch": 3.5069259356582454,
"grad_norm": 1.227776050567627,
"learning_rate": 1.0052646183593868e-05,
"loss": 1.9843,
"step": 11675
},
{
"epoch": 3.5144337249896767,
"grad_norm": 1.0463147163391113,
"learning_rate": 1.0030026637060175e-05,
"loss": 2.0024,
"step": 11700
},
{
"epoch": 3.5144337249896767,
"eval_loss": 2.0066797733306885,
"eval_runtime": 243.8922,
"eval_samples_per_second": 22.998,
"eval_steps_per_second": 5.753,
"step": 11700
},
{
"epoch": 3.5219415143211084,
"grad_norm": 1.0555408000946045,
"learning_rate": 1.0007381092710587e-05,
"loss": 1.9974,
"step": 11725
},
{
"epoch": 3.5294493036525396,
"grad_norm": 1.007045865058899,
"learning_rate": 9.984709783244125e-06,
"loss": 2.004,
"step": 11750
},
{
"epoch": 3.536957092983971,
"grad_norm": 1.170345425605774,
"learning_rate": 9.962012941624547e-06,
"loss": 1.9492,
"step": 11775
},
{
"epoch": 3.544464882315402,
"grad_norm": 1.1506013870239258,
"learning_rate": 9.939290801077979e-06,
"loss": 1.9908,
"step": 11800
},
{
"epoch": 3.544464882315402,
"eval_loss": 2.0061874389648438,
"eval_runtime": 244.205,
"eval_samples_per_second": 22.968,
"eval_steps_per_second": 5.745,
"step": 11800
},
{
"epoch": 3.5519726716468334,
"grad_norm": 0.9976746439933777,
"learning_rate": 9.916543595090514e-06,
"loss": 1.995,
"step": 11825
},
{
"epoch": 3.559480460978265,
"grad_norm": 1.0817415714263916,
"learning_rate": 9.893771557405803e-06,
"loss": 1.9989,
"step": 11850
},
{
"epoch": 3.5669882503096964,
"grad_norm": 0.9880387187004089,
"learning_rate": 9.870974922022668e-06,
"loss": 1.9706,
"step": 11875
},
{
"epoch": 3.5744960396411276,
"grad_norm": 1.629197120666504,
"learning_rate": 9.848153923192681e-06,
"loss": 1.9957,
"step": 11900
},
{
"epoch": 3.5744960396411276,
"eval_loss": 2.0057406425476074,
"eval_runtime": 244.5085,
"eval_samples_per_second": 22.94,
"eval_steps_per_second": 5.738,
"step": 11900
},
{
"epoch": 3.582003828972559,
"grad_norm": 1.1123307943344116,
"learning_rate": 9.825308795417776e-06,
"loss": 1.9746,
"step": 11925
},
{
"epoch": 3.58951161830399,
"grad_norm": 1.107917308807373,
"learning_rate": 9.802439773447818e-06,
"loss": 1.983,
"step": 11950
},
{
"epoch": 3.597019407635422,
"grad_norm": 1.0012487173080444,
"learning_rate": 9.779547092278212e-06,
"loss": 1.9592,
"step": 11975
},
{
"epoch": 3.604527196966853,
"grad_norm": 0.9805944561958313,
"learning_rate": 9.756630987147473e-06,
"loss": 1.974,
"step": 12000
},
{
"epoch": 3.604527196966853,
"eval_loss": 2.0051681995391846,
"eval_runtime": 244.093,
"eval_samples_per_second": 22.979,
"eval_steps_per_second": 5.748,
"step": 12000
},
{
"epoch": 3.6120349862982843,
"grad_norm": 0.9973050355911255,
"learning_rate": 9.733691693534814e-06,
"loss": 2.018,
"step": 12025
},
{
"epoch": 3.619542775629716,
"grad_norm": 1.0701146125793457,
"learning_rate": 9.710729447157725e-06,
"loss": 1.9395,
"step": 12050
},
{
"epoch": 3.6270505649611473,
"grad_norm": 0.9309558868408203,
"learning_rate": 9.687744483969555e-06,
"loss": 1.9866,
"step": 12075
},
{
"epoch": 3.6345583542925786,
"grad_norm": 1.1145427227020264,
"learning_rate": 9.66473704015708e-06,
"loss": 1.9669,
"step": 12100
},
{
"epoch": 3.6345583542925786,
"eval_loss": 2.004288911819458,
"eval_runtime": 244.31,
"eval_samples_per_second": 22.959,
"eval_steps_per_second": 5.743,
"step": 12100
},
{
"epoch": 3.64206614362401,
"grad_norm": 1.0386533737182617,
"learning_rate": 9.641707352138083e-06,
"loss": 1.9833,
"step": 12125
},
{
"epoch": 3.649573932955441,
"grad_norm": 1.0102437734603882,
"learning_rate": 9.618655656558927e-06,
"loss": 2.0004,
"step": 12150
},
{
"epoch": 3.657081722286873,
"grad_norm": 1.063219666481018,
"learning_rate": 9.595582190292109e-06,
"loss": 1.9995,
"step": 12175
},
{
"epoch": 3.664589511618304,
"grad_norm": 1.0717073678970337,
"learning_rate": 9.57248719043384e-06,
"loss": 1.9995,
"step": 12200
},
{
"epoch": 3.664589511618304,
"eval_loss": 2.0040318965911865,
"eval_runtime": 244.4579,
"eval_samples_per_second": 22.945,
"eval_steps_per_second": 5.739,
"step": 12200
},
{
"epoch": 3.6720973009497353,
"grad_norm": 1.0240517854690552,
"learning_rate": 9.549370894301602e-06,
"loss": 2.0077,
"step": 12225
},
{
"epoch": 3.6796050902811666,
"grad_norm": 1.0465691089630127,
"learning_rate": 9.526233539431713e-06,
"loss": 2.0077,
"step": 12250
},
{
"epoch": 3.687112879612598,
"grad_norm": 1.101195216178894,
"learning_rate": 9.503075363576889e-06,
"loss": 1.99,
"step": 12275
},
{
"epoch": 3.6946206689440295,
"grad_norm": 1.0206913948059082,
"learning_rate": 9.479896604703785e-06,
"loss": 1.9897,
"step": 12300
},
{
"epoch": 3.6946206689440295,
"eval_loss": 2.003530740737915,
"eval_runtime": 244.8327,
"eval_samples_per_second": 22.91,
"eval_steps_per_second": 5.73,
"step": 12300
},
{
"epoch": 3.7021284582754608,
"grad_norm": 0.9398745894432068,
"learning_rate": 9.456697500990571e-06,
"loss": 1.9811,
"step": 12325
},
{
"epoch": 3.709636247606892,
"grad_norm": 1.0570793151855469,
"learning_rate": 9.433478290824472e-06,
"loss": 1.9719,
"step": 12350
},
{
"epoch": 3.7171440369383237,
"grad_norm": 1.0618635416030884,
"learning_rate": 9.410239212799315e-06,
"loss": 1.9744,
"step": 12375
},
{
"epoch": 3.724651826269755,
"grad_norm": 1.0616377592086792,
"learning_rate": 9.387911227877156e-06,
"loss": 1.9889,
"step": 12400
},
{
"epoch": 3.724651826269755,
"eval_loss": 2.003262996673584,
"eval_runtime": 244.6377,
"eval_samples_per_second": 22.928,
"eval_steps_per_second": 5.735,
"step": 12400
},
{
"epoch": 3.7321596156011863,
"grad_norm": 1.0657788515090942,
"learning_rate": 9.364633901740714e-06,
"loss": 1.9712,
"step": 12425
},
{
"epoch": 3.7396674049326175,
"grad_norm": 1.0607733726501465,
"learning_rate": 9.341337415170081e-06,
"loss": 1.9622,
"step": 12450
},
{
"epoch": 3.7471751942640488,
"grad_norm": 1.1743979454040527,
"learning_rate": 9.318022007553162e-06,
"loss": 1.9693,
"step": 12475
},
{
"epoch": 3.7546829835954805,
"grad_norm": 1.0691910982131958,
"learning_rate": 9.294687918472286e-06,
"loss": 1.9865,
"step": 12500
},
{
"epoch": 3.7546829835954805,
"eval_loss": 2.0024280548095703,
"eval_runtime": 244.387,
"eval_samples_per_second": 22.951,
"eval_steps_per_second": 5.741,
"step": 12500
},
{
"epoch": 3.7621907729269117,
"grad_norm": 1.0780701637268066,
"learning_rate": 9.271335387701745e-06,
"loss": 1.9788,
"step": 12525
},
{
"epoch": 3.769698562258343,
"grad_norm": 1.0889036655426025,
"learning_rate": 9.247964655205333e-06,
"loss": 2.0001,
"step": 12550
},
{
"epoch": 3.7772063515897742,
"grad_norm": 1.0859447717666626,
"learning_rate": 9.224575961133889e-06,
"loss": 1.9875,
"step": 12575
},
{
"epoch": 3.7847141409212055,
"grad_norm": 1.1142594814300537,
"learning_rate": 9.201169545822806e-06,
"loss": 1.9703,
"step": 12600
},
{
"epoch": 3.7847141409212055,
"eval_loss": 2.0022220611572266,
"eval_runtime": 244.6481,
"eval_samples_per_second": 22.927,
"eval_steps_per_second": 5.735,
"step": 12600
},
{
"epoch": 3.792221930252637,
"grad_norm": 0.9859952926635742,
"learning_rate": 9.177745649789582e-06,
"loss": 1.9795,
"step": 12625
},
{
"epoch": 3.7997297195840685,
"grad_norm": 1.0307040214538574,
"learning_rate": 9.154304513731345e-06,
"loss": 1.9635,
"step": 12650
},
{
"epoch": 3.8072375089154997,
"grad_norm": 1.1140483617782593,
"learning_rate": 9.130846378522373e-06,
"loss": 1.9709,
"step": 12675
},
{
"epoch": 3.8147452982469314,
"grad_norm": 1.2594614028930664,
"learning_rate": 9.107371485211619e-06,
"loss": 1.998,
"step": 12700
},
{
"epoch": 3.8147452982469314,
"eval_loss": 2.0013692378997803,
"eval_runtime": 244.2752,
"eval_samples_per_second": 22.962,
"eval_steps_per_second": 5.744,
"step": 12700
},
{
"epoch": 3.8222530875783627,
"grad_norm": 1.0169751644134521,
"learning_rate": 9.083880075020243e-06,
"loss": 1.9712,
"step": 12725
},
{
"epoch": 3.829760876909794,
"grad_norm": 0.9640651345252991,
"learning_rate": 9.060372389339123e-06,
"loss": 1.9748,
"step": 12750
},
{
"epoch": 3.837268666241225,
"grad_norm": 1.0947884321212769,
"learning_rate": 9.036848669726382e-06,
"loss": 1.9854,
"step": 12775
},
{
"epoch": 3.8447764555726565,
"grad_norm": 1.1233420372009277,
"learning_rate": 9.013309157904907e-06,
"loss": 1.9968,
"step": 12800
},
{
"epoch": 3.8447764555726565,
"eval_loss": 2.001154661178589,
"eval_runtime": 244.9198,
"eval_samples_per_second": 22.901,
"eval_steps_per_second": 5.728,
"step": 12800
},
{
"epoch": 3.852284244904088,
"grad_norm": 0.9935488700866699,
"learning_rate": 8.98975409575985e-06,
"loss": 1.9756,
"step": 12825
},
{
"epoch": 3.8597920342355194,
"grad_norm": 0.9727908372879028,
"learning_rate": 8.966183725336167e-06,
"loss": 1.9942,
"step": 12850
},
{
"epoch": 3.8672998235669507,
"grad_norm": 1.1200799942016602,
"learning_rate": 8.942598288836103e-06,
"loss": 1.9982,
"step": 12875
},
{
"epoch": 3.874807612898382,
"grad_norm": 1.172968864440918,
"learning_rate": 8.91899802861673e-06,
"loss": 1.9842,
"step": 12900
},
{
"epoch": 3.874807612898382,
"eval_loss": 2.000430107116699,
"eval_runtime": 244.7767,
"eval_samples_per_second": 22.915,
"eval_steps_per_second": 5.732,
"step": 12900
},
{
"epoch": 3.882315402229813,
"grad_norm": 1.1125150918960571,
"learning_rate": 8.89538318718744e-06,
"loss": 1.9832,
"step": 12925
},
{
"epoch": 3.889823191561245,
"grad_norm": 1.1382113695144653,
"learning_rate": 8.871754007207454e-06,
"loss": 1.9774,
"step": 12950
},
{
"epoch": 3.897330980892676,
"grad_norm": 1.090171217918396,
"learning_rate": 8.848110731483337e-06,
"loss": 1.9914,
"step": 12975
},
{
"epoch": 3.9048387702241074,
"grad_norm": 0.9999351501464844,
"learning_rate": 8.824453602966493e-06,
"loss": 1.9787,
"step": 13000
},
{
"epoch": 3.9048387702241074,
"eval_loss": 2.0002853870391846,
"eval_runtime": 244.3984,
"eval_samples_per_second": 22.95,
"eval_steps_per_second": 5.741,
"step": 13000
},
{
"epoch": 3.912346559555539,
"grad_norm": 1.0934284925460815,
"learning_rate": 8.800782864750677e-06,
"loss": 1.9817,
"step": 13025
},
{
"epoch": 3.9198543488869704,
"grad_norm": 1.0394964218139648,
"learning_rate": 8.777098760069491e-06,
"loss": 1.968,
"step": 13050
},
{
"epoch": 3.9273621382184016,
"grad_norm": 1.1079460382461548,
"learning_rate": 8.753401532293889e-06,
"loss": 1.9757,
"step": 13075
},
{
"epoch": 3.934869927549833,
"grad_norm": 0.9885277152061462,
"learning_rate": 8.729691424929671e-06,
"loss": 1.9789,
"step": 13100
},
{
"epoch": 3.934869927549833,
"eval_loss": 1.9996843338012695,
"eval_runtime": 245.1096,
"eval_samples_per_second": 22.884,
"eval_steps_per_second": 5.724,
"step": 13100
},
{
"epoch": 3.942377716881264,
"grad_norm": 1.005743145942688,
"learning_rate": 8.705968681614985e-06,
"loss": 1.9701,
"step": 13125
},
{
"epoch": 3.949885506212696,
"grad_norm": 1.0854625701904297,
"learning_rate": 8.682233546117827e-06,
"loss": 2.0009,
"step": 13150
},
{
"epoch": 3.957393295544127,
"grad_norm": 0.9378837943077087,
"learning_rate": 8.658486262333524e-06,
"loss": 1.9618,
"step": 13175
},
{
"epoch": 3.9649010848755584,
"grad_norm": 1.0081528425216675,
"learning_rate": 8.63472707428224e-06,
"loss": 1.9598,
"step": 13200
},
{
"epoch": 3.9649010848755584,
"eval_loss": 1.9990559816360474,
"eval_runtime": 244.3863,
"eval_samples_per_second": 22.951,
"eval_steps_per_second": 5.741,
"step": 13200
},
{
"epoch": 3.9724088742069896,
"grad_norm": 1.0947321653366089,
"learning_rate": 8.61095622610646e-06,
"loss": 1.9754,
"step": 13225
},
{
"epoch": 3.979916663538421,
"grad_norm": 1.01126229763031,
"learning_rate": 8.587173962068493e-06,
"loss": 2.0003,
"step": 13250
},
{
"epoch": 3.9874244528698526,
"grad_norm": 1.0570297241210938,
"learning_rate": 8.563380526547944e-06,
"loss": 1.9662,
"step": 13275
},
{
"epoch": 3.994932242201284,
"grad_norm": 1.103887677192688,
"learning_rate": 8.539576164039218e-06,
"loss": 1.9603,
"step": 13300
},
{
"epoch": 3.994932242201284,
"eval_loss": 1.9989780187606812,
"eval_runtime": 244.1926,
"eval_samples_per_second": 22.97,
"eval_steps_per_second": 5.745,
"step": 13300
},
{
"epoch": 4.002702804159315,
"grad_norm": 0.9994622468948364,
"learning_rate": 8.515761119149003e-06,
"loss": 2.0651,
"step": 13325
},
{
"epoch": 4.010210593490747,
"grad_norm": 1.1002482175827026,
"learning_rate": 8.491935636593756e-06,
"loss": 1.9639,
"step": 13350
},
{
"epoch": 4.017718382822178,
"grad_norm": 1.1589230298995972,
"learning_rate": 8.468099961197186e-06,
"loss": 1.9654,
"step": 13375
},
{
"epoch": 4.02522617215361,
"grad_norm": 1.0557494163513184,
"learning_rate": 8.444254337887742e-06,
"loss": 1.9567,
"step": 13400
},
{
"epoch": 4.02522617215361,
"eval_loss": 1.9992824792861938,
"eval_runtime": 244.4365,
"eval_samples_per_second": 22.947,
"eval_steps_per_second": 5.74,
"step": 13400
},
{
"epoch": 4.03273396148504,
"grad_norm": 1.0956406593322754,
"learning_rate": 8.420399011696096e-06,
"loss": 1.9574,
"step": 13425
},
{
"epoch": 4.040241750816472,
"grad_norm": 1.314028024673462,
"learning_rate": 8.396534227752622e-06,
"loss": 1.9599,
"step": 13450
},
{
"epoch": 4.047749540147904,
"grad_norm": 1.048609972000122,
"learning_rate": 8.372660231284883e-06,
"loss": 1.9483,
"step": 13475
},
{
"epoch": 4.055257329479335,
"grad_norm": 1.119491696357727,
"learning_rate": 8.348777267615099e-06,
"loss": 1.9838,
"step": 13500
},
{
"epoch": 4.055257329479335,
"eval_loss": 1.998762607574463,
"eval_runtime": 244.4149,
"eval_samples_per_second": 22.949,
"eval_steps_per_second": 5.74,
"step": 13500
},
{
"epoch": 4.062765118810766,
"grad_norm": 1.0003256797790527,
"learning_rate": 8.324885582157645e-06,
"loss": 1.9629,
"step": 13525
},
{
"epoch": 4.070272908142197,
"grad_norm": 1.059667706489563,
"learning_rate": 8.300985420416509e-06,
"loss": 1.9866,
"step": 13550
},
{
"epoch": 4.077780697473629,
"grad_norm": 1.1236132383346558,
"learning_rate": 8.277077027982787e-06,
"loss": 1.9787,
"step": 13575
},
{
"epoch": 4.0852884868050605,
"grad_norm": 1.0514492988586426,
"learning_rate": 8.253160650532144e-06,
"loss": 1.9829,
"step": 13600
},
{
"epoch": 4.0852884868050605,
"eval_loss": 1.9986952543258667,
"eval_runtime": 245.1032,
"eval_samples_per_second": 22.884,
"eval_steps_per_second": 5.724,
"step": 13600
},
{
"epoch": 4.092796276136491,
"grad_norm": 1.0734481811523438,
"learning_rate": 8.2292365338223e-06,
"loss": 1.9832,
"step": 13625
},
{
"epoch": 4.100304065467923,
"grad_norm": 1.0448415279388428,
"learning_rate": 8.205304923690505e-06,
"loss": 1.9827,
"step": 13650
},
{
"epoch": 4.107811854799355,
"grad_norm": 1.1534922122955322,
"learning_rate": 8.181366066051e-06,
"loss": 1.9398,
"step": 13675
},
{
"epoch": 4.1153196441307855,
"grad_norm": 1.0893254280090332,
"learning_rate": 8.157420206892509e-06,
"loss": 1.9696,
"step": 13700
},
{
"epoch": 4.1153196441307855,
"eval_loss": 1.9981467723846436,
"eval_runtime": 244.0215,
"eval_samples_per_second": 22.986,
"eval_steps_per_second": 5.749,
"step": 13700
},
{
"epoch": 4.122827433462217,
"grad_norm": 1.1225614547729492,
"learning_rate": 8.133467592275697e-06,
"loss": 1.9785,
"step": 13725
},
{
"epoch": 4.130335222793648,
"grad_norm": 1.1276017427444458,
"learning_rate": 8.109508468330643e-06,
"loss": 1.9679,
"step": 13750
},
{
"epoch": 4.13784301212508,
"grad_norm": 1.0437787771224976,
"learning_rate": 8.08554308125432e-06,
"loss": 1.9794,
"step": 13775
},
{
"epoch": 4.1453508014565115,
"grad_norm": 1.1491374969482422,
"learning_rate": 8.061571677308061e-06,
"loss": 1.9575,
"step": 13800
},
{
"epoch": 4.1453508014565115,
"eval_loss": 1.9976245164871216,
"eval_runtime": 244.1266,
"eval_samples_per_second": 22.976,
"eval_steps_per_second": 5.747,
"step": 13800
},
{
"epoch": 4.152858590787942,
"grad_norm": 1.140905499458313,
"learning_rate": 8.037594502815015e-06,
"loss": 1.9591,
"step": 13825
},
{
"epoch": 4.160366380119374,
"grad_norm": 0.9632274508476257,
"learning_rate": 8.013611804157636e-06,
"loss": 1.9593,
"step": 13850
},
{
"epoch": 4.167874169450805,
"grad_norm": 1.1178561449050903,
"learning_rate": 7.989623827775142e-06,
"loss": 1.9729,
"step": 13875
},
{
"epoch": 4.1753819587822365,
"grad_norm": 1.068928837776184,
"learning_rate": 7.965630820160984e-06,
"loss": 1.9359,
"step": 13900
},
{
"epoch": 4.1753819587822365,
"eval_loss": 1.9976770877838135,
"eval_runtime": 244.3884,
"eval_samples_per_second": 22.951,
"eval_steps_per_second": 5.741,
"step": 13900
},
{
"epoch": 4.182889748113668,
"grad_norm": 1.0295666456222534,
"learning_rate": 7.941633027860312e-06,
"loss": 1.9739,
"step": 13925
},
{
"epoch": 4.190397537445099,
"grad_norm": 1.0357112884521484,
"learning_rate": 7.917630697467438e-06,
"loss": 1.9554,
"step": 13950
},
{
"epoch": 4.197905326776531,
"grad_norm": 1.0465984344482422,
"learning_rate": 7.893624075623312e-06,
"loss": 1.9688,
"step": 13975
},
{
"epoch": 4.205413116107962,
"grad_norm": 1.0274240970611572,
"learning_rate": 7.869613409012976e-06,
"loss": 1.9705,
"step": 14000
},
{
"epoch": 4.205413116107962,
"eval_loss": 1.9968942403793335,
"eval_runtime": 244.9157,
"eval_samples_per_second": 22.902,
"eval_steps_per_second": 5.729,
"step": 14000
},
{
"epoch": 4.212920905439393,
"grad_norm": 0.9973297119140625,
"learning_rate": 7.845598944363041e-06,
"loss": 1.9775,
"step": 14025
},
{
"epoch": 4.220428694770825,
"grad_norm": 1.0587254762649536,
"learning_rate": 7.821580928439141e-06,
"loss": 1.9808,
"step": 14050
},
{
"epoch": 4.227936484102256,
"grad_norm": 1.1307932138442993,
"learning_rate": 7.797559608043403e-06,
"loss": 1.9646,
"step": 14075
},
{
"epoch": 4.2354442734336875,
"grad_norm": 1.0376613140106201,
"learning_rate": 7.773535230011909e-06,
"loss": 1.961,
"step": 14100
},
{
"epoch": 4.2354442734336875,
"eval_loss": 1.9972692728042603,
"eval_runtime": 244.4264,
"eval_samples_per_second": 22.948,
"eval_steps_per_second": 5.74,
"step": 14100
},
{
"epoch": 4.242952062765119,
"grad_norm": 1.0353500843048096,
"learning_rate": 7.749508041212167e-06,
"loss": 1.9881,
"step": 14125
},
{
"epoch": 4.25045985209655,
"grad_norm": 1.191989541053772,
"learning_rate": 7.725478288540554e-06,
"loss": 1.9307,
"step": 14150
},
{
"epoch": 4.257967641427982,
"grad_norm": 1.1267927885055542,
"learning_rate": 7.701446218919805e-06,
"loss": 1.9837,
"step": 14175
},
{
"epoch": 4.2654754307594125,
"grad_norm": 1.103934407234192,
"learning_rate": 7.677412079296458e-06,
"loss": 1.9557,
"step": 14200
},
{
"epoch": 4.2654754307594125,
"eval_loss": 1.9968904256820679,
"eval_runtime": 244.788,
"eval_samples_per_second": 22.914,
"eval_steps_per_second": 5.731,
"step": 14200
},
{
"epoch": 4.272983220090844,
"grad_norm": 1.1149851083755493,
"learning_rate": 7.653376116638324e-06,
"loss": 1.9573,
"step": 14225
},
{
"epoch": 4.280491009422276,
"grad_norm": 1.2663904428482056,
"learning_rate": 7.629338577931943e-06,
"loss": 1.9652,
"step": 14250
},
{
"epoch": 4.287998798753707,
"grad_norm": 1.1402429342269897,
"learning_rate": 7.605299710180056e-06,
"loss": 1.9834,
"step": 14275
},
{
"epoch": 4.295506588085138,
"grad_norm": 1.1735416650772095,
"learning_rate": 7.581259760399059e-06,
"loss": 1.9743,
"step": 14300
},
{
"epoch": 4.295506588085138,
"eval_loss": 1.9964790344238281,
"eval_runtime": 244.5619,
"eval_samples_per_second": 22.935,
"eval_steps_per_second": 5.737,
"step": 14300
},
{
"epoch": 4.30301437741657,
"grad_norm": 1.0733146667480469,
"learning_rate": 7.557218975616456e-06,
"loss": 1.9297,
"step": 14325
},
{
"epoch": 4.310522166748001,
"grad_norm": 1.0636229515075684,
"learning_rate": 7.5331776028683485e-06,
"loss": 2.0013,
"step": 14350
},
{
"epoch": 4.318029956079433,
"grad_norm": 1.0287854671478271,
"learning_rate": 7.509135889196871e-06,
"loss": 1.9394,
"step": 14375
},
{
"epoch": 4.325537745410863,
"grad_norm": 1.2089693546295166,
"learning_rate": 7.485094081647659e-06,
"loss": 1.9651,
"step": 14400
},
{
"epoch": 4.325537745410863,
"eval_loss": 1.9961069822311401,
"eval_runtime": 244.3299,
"eval_samples_per_second": 22.957,
"eval_steps_per_second": 5.742,
"step": 14400
},
{
"epoch": 4.333045534742295,
"grad_norm": 1.0768557786941528,
"learning_rate": 7.461052427267318e-06,
"loss": 1.9671,
"step": 14425
},
{
"epoch": 4.340553324073727,
"grad_norm": 1.1563024520874023,
"learning_rate": 7.437011173100874e-06,
"loss": 1.9492,
"step": 14450
},
{
"epoch": 4.348061113405158,
"grad_norm": 1.1290167570114136,
"learning_rate": 7.412970566189248e-06,
"loss": 1.9858,
"step": 14475
},
{
"epoch": 4.355568902736589,
"grad_norm": 1.0945930480957031,
"learning_rate": 7.388930853566703e-06,
"loss": 1.9662,
"step": 14500
},
{
"epoch": 4.355568902736589,
"eval_loss": 1.9953595399856567,
"eval_runtime": 244.4122,
"eval_samples_per_second": 22.949,
"eval_steps_per_second": 5.74,
"step": 14500
},
{
"epoch": 4.36307669206802,
"grad_norm": 1.0695611238479614,
"learning_rate": 7.364892282258315e-06,
"loss": 1.947,
"step": 14525
},
{
"epoch": 4.370584481399452,
"grad_norm": 1.0597783327102661,
"learning_rate": 7.340855099277433e-06,
"loss": 1.9644,
"step": 14550
},
{
"epoch": 4.378092270730884,
"grad_norm": 1.0378893613815308,
"learning_rate": 7.3168195516231395e-06,
"loss": 1.9737,
"step": 14575
},
{
"epoch": 4.385600060062314,
"grad_norm": 1.2585569620132446,
"learning_rate": 7.2937471936532264e-06,
"loss": 1.9779,
"step": 14600
},
{
"epoch": 4.385600060062314,
"eval_loss": 1.9953750371932983,
"eval_runtime": 244.2809,
"eval_samples_per_second": 22.961,
"eval_steps_per_second": 5.743,
"step": 14600
},
{
"epoch": 4.393107849393746,
"grad_norm": 1.064031958580017,
"learning_rate": 7.269715567667308e-06,
"loss": 1.9663,
"step": 14625
},
{
"epoch": 4.400615638725178,
"grad_norm": 1.1410213708877563,
"learning_rate": 7.245686308017058e-06,
"loss": 1.9573,
"step": 14650
},
{
"epoch": 4.408123428056609,
"grad_norm": 1.088382601737976,
"learning_rate": 7.221659661620141e-06,
"loss": 1.9772,
"step": 14675
},
{
"epoch": 4.41563121738804,
"grad_norm": 0.994836151599884,
"learning_rate": 7.197635875367368e-06,
"loss": 1.9703,
"step": 14700
},
{
"epoch": 4.41563121738804,
"eval_loss": 1.9953012466430664,
"eval_runtime": 244.5139,
"eval_samples_per_second": 22.939,
"eval_steps_per_second": 5.738,
"step": 14700
},
{
"epoch": 4.423139006719471,
"grad_norm": 1.0412800312042236,
"learning_rate": 7.173615196120162e-06,
"loss": 1.9413,
"step": 14725
},
{
"epoch": 4.430646796050903,
"grad_norm": 1.159559726715088,
"learning_rate": 7.149597870708011e-06,
"loss": 2.0046,
"step": 14750
},
{
"epoch": 4.4381545853823345,
"grad_norm": 1.045264482498169,
"learning_rate": 7.12558414592596e-06,
"loss": 1.9684,
"step": 14775
},
{
"epoch": 4.445662374713765,
"grad_norm": 1.1119288206100464,
"learning_rate": 7.1015742685320326e-06,
"loss": 1.9649,
"step": 14800
},
{
"epoch": 4.445662374713765,
"eval_loss": 1.9939073324203491,
"eval_runtime": 244.3543,
"eval_samples_per_second": 22.954,
"eval_steps_per_second": 5.742,
"step": 14800
},
{
"epoch": 4.453170164045197,
"grad_norm": 1.2049273252487183,
"learning_rate": 7.077568485244728e-06,
"loss": 1.9586,
"step": 14825
},
{
"epoch": 4.460677953376628,
"grad_norm": 1.0386916399002075,
"learning_rate": 7.053567042740475e-06,
"loss": 1.9811,
"step": 14850
},
{
"epoch": 4.46818574270806,
"grad_norm": 1.0895438194274902,
"learning_rate": 7.029570187651096e-06,
"loss": 1.9829,
"step": 14875
},
{
"epoch": 4.475693532039491,
"grad_norm": 1.1542959213256836,
"learning_rate": 7.005578166561275e-06,
"loss": 1.9678,
"step": 14900
},
{
"epoch": 4.475693532039491,
"eval_loss": 1.9941613674163818,
"eval_runtime": 244.7416,
"eval_samples_per_second": 22.918,
"eval_steps_per_second": 5.733,
"step": 14900
},
{
"epoch": 4.483201321370922,
"grad_norm": 1.0670243501663208,
"learning_rate": 6.9815912260060295e-06,
"loss": 1.9542,
"step": 14925
},
{
"epoch": 4.490709110702354,
"grad_norm": 1.1406601667404175,
"learning_rate": 6.95760961246816e-06,
"loss": 1.9947,
"step": 14950
},
{
"epoch": 4.4982169000337855,
"grad_norm": 1.1366952657699585,
"learning_rate": 6.933633572375736e-06,
"loss": 1.9659,
"step": 14975
},
{
"epoch": 4.505724689365216,
"grad_norm": 1.0811400413513184,
"learning_rate": 6.909663352099552e-06,
"loss": 1.9442,
"step": 15000
},
{
"epoch": 4.505724689365216,
"eval_loss": 1.993889331817627,
"eval_runtime": 244.6905,
"eval_samples_per_second": 22.923,
"eval_steps_per_second": 5.734,
"step": 15000
},
{
"epoch": 4.513232478696648,
"grad_norm": 1.013418436050415,
"learning_rate": 6.885699197950602e-06,
"loss": 1.9702,
"step": 15025
},
{
"epoch": 4.520740268028079,
"grad_norm": 1.097463846206665,
"learning_rate": 6.86174135617754e-06,
"loss": 1.9547,
"step": 15050
},
{
"epoch": 4.5282480573595105,
"grad_norm": 1.1067347526550293,
"learning_rate": 6.83779007296417e-06,
"loss": 1.9772,
"step": 15075
},
{
"epoch": 4.535755846690942,
"grad_norm": 1.0753045082092285,
"learning_rate": 6.813845594426891e-06,
"loss": 1.9522,
"step": 15100
},
{
"epoch": 4.535755846690942,
"eval_loss": 1.9931405782699585,
"eval_runtime": 244.8392,
"eval_samples_per_second": 22.909,
"eval_steps_per_second": 5.73,
"step": 15100
},
{
"epoch": 4.543263636022373,
"grad_norm": 1.0194171667099,
"learning_rate": 6.789908166612178e-06,
"loss": 1.9643,
"step": 15125
},
{
"epoch": 4.550771425353805,
"grad_norm": 1.123500108718872,
"learning_rate": 6.76597803549406e-06,
"loss": 1.954,
"step": 15150
},
{
"epoch": 4.558279214685236,
"grad_norm": 1.1514633893966675,
"learning_rate": 6.742055446971586e-06,
"loss": 1.954,
"step": 15175
},
{
"epoch": 4.565787004016667,
"grad_norm": 1.1776665449142456,
"learning_rate": 6.718140646866296e-06,
"loss": 1.9539,
"step": 15200
},
{
"epoch": 4.565787004016667,
"eval_loss": 1.9931755065917969,
"eval_runtime": 243.9594,
"eval_samples_per_second": 22.992,
"eval_steps_per_second": 5.751,
"step": 15200
},
{
"epoch": 4.573294793348099,
"grad_norm": 1.1815805435180664,
"learning_rate": 6.694233880919708e-06,
"loss": 1.9478,
"step": 15225
},
{
"epoch": 4.58080258267953,
"grad_norm": 1.0977429151535034,
"learning_rate": 6.670335394790772e-06,
"loss": 1.947,
"step": 15250
},
{
"epoch": 4.5883103720109615,
"grad_norm": 1.1538454294204712,
"learning_rate": 6.6464454340533655e-06,
"loss": 1.9462,
"step": 15275
},
{
"epoch": 4.595818161342393,
"grad_norm": 1.1371299028396606,
"learning_rate": 6.622564244193754e-06,
"loss": 1.9586,
"step": 15300
},
{
"epoch": 4.595818161342393,
"eval_loss": 1.9928078651428223,
"eval_runtime": 244.396,
"eval_samples_per_second": 22.95,
"eval_steps_per_second": 5.741,
"step": 15300
},
{
"epoch": 4.603325950673824,
"grad_norm": 1.1348552703857422,
"learning_rate": 6.598692070608083e-06,
"loss": 1.9509,
"step": 15325
},
{
"epoch": 4.610833740005256,
"grad_norm": 0.9622187614440918,
"learning_rate": 6.5748291585998436e-06,
"loss": 1.9359,
"step": 15350
},
{
"epoch": 4.6183415293366865,
"grad_norm": 0.9866182208061218,
"learning_rate": 6.55097575337736e-06,
"loss": 1.9664,
"step": 15375
},
{
"epoch": 4.625849318668118,
"grad_norm": 1.1888655424118042,
"learning_rate": 6.5271321000512715e-06,
"loss": 1.9483,
"step": 15400
},
{
"epoch": 4.625849318668118,
"eval_loss": 1.9925552606582642,
"eval_runtime": 244.5029,
"eval_samples_per_second": 22.94,
"eval_steps_per_second": 5.738,
"step": 15400
},
{
"epoch": 4.63335710799955,
"grad_norm": 1.1576796770095825,
"learning_rate": 6.503298443632006e-06,
"loss": 1.9494,
"step": 15425
},
{
"epoch": 4.640864897330981,
"grad_norm": 1.1134017705917358,
"learning_rate": 6.479475029027266e-06,
"loss": 1.9282,
"step": 15450
},
{
"epoch": 4.648372686662412,
"grad_norm": 1.2257720232009888,
"learning_rate": 6.45566210103951e-06,
"loss": 1.9648,
"step": 15475
},
{
"epoch": 4.655880475993843,
"grad_norm": 1.1228723526000977,
"learning_rate": 6.431859904363441e-06,
"loss": 1.9436,
"step": 15500
},
{
"epoch": 4.655880475993843,
"eval_loss": 1.9922431707382202,
"eval_runtime": 244.1596,
"eval_samples_per_second": 22.973,
"eval_steps_per_second": 5.746,
"step": 15500
},
{
"epoch": 4.663388265325275,
"grad_norm": 1.1629343032836914,
"learning_rate": 6.40806868358349e-06,
"loss": 1.9939,
"step": 15525
},
{
"epoch": 4.670896054656707,
"grad_norm": 1.229765772819519,
"learning_rate": 6.38428868317131e-06,
"loss": 1.9469,
"step": 15550
},
{
"epoch": 4.6784038439881375,
"grad_norm": 1.10246741771698,
"learning_rate": 6.360520147483243e-06,
"loss": 1.97,
"step": 15575
},
{
"epoch": 4.685911633319569,
"grad_norm": 1.0727444887161255,
"learning_rate": 6.336763320757837e-06,
"loss": 1.9598,
"step": 15600
},
{
"epoch": 4.685911633319569,
"eval_loss": 1.9915326833724976,
"eval_runtime": 244.8129,
"eval_samples_per_second": 22.911,
"eval_steps_per_second": 5.731,
"step": 15600
},
{
"epoch": 4.693419422651001,
"grad_norm": 1.1504484415054321,
"learning_rate": 6.313018447113308e-06,
"loss": 2.0044,
"step": 15625
},
{
"epoch": 4.700927211982432,
"grad_norm": 1.2329007387161255,
"learning_rate": 6.289285770545056e-06,
"loss": 1.9718,
"step": 15650
},
{
"epoch": 4.708435001313863,
"grad_norm": 1.12412691116333,
"learning_rate": 6.265565534923142e-06,
"loss": 1.9716,
"step": 15675
},
{
"epoch": 4.715942790645294,
"grad_norm": 1.1599684953689575,
"learning_rate": 6.241857983989794e-06,
"loss": 1.9562,
"step": 15700
},
{
"epoch": 4.715942790645294,
"eval_loss": 1.9914188385009766,
"eval_runtime": 244.1223,
"eval_samples_per_second": 22.976,
"eval_steps_per_second": 5.747,
"step": 15700
},
{
"epoch": 4.723450579976726,
"grad_norm": 1.0815508365631104,
"learning_rate": 6.21816336135689e-06,
"loss": 1.9537,
"step": 15725
},
{
"epoch": 4.730958369308157,
"grad_norm": 1.1671316623687744,
"learning_rate": 6.1944819105034615e-06,
"loss": 1.94,
"step": 15750
},
{
"epoch": 4.738466158639588,
"grad_norm": 1.0050816535949707,
"learning_rate": 6.170813874773193e-06,
"loss": 1.9701,
"step": 15775
},
{
"epoch": 4.74597394797102,
"grad_norm": 1.134464979171753,
"learning_rate": 6.1471594973719145e-06,
"loss": 1.9671,
"step": 15800
},
{
"epoch": 4.74597394797102,
"eval_loss": 1.9911348819732666,
"eval_runtime": 244.1165,
"eval_samples_per_second": 22.977,
"eval_steps_per_second": 5.747,
"step": 15800
},
{
"epoch": 4.753481737302451,
"grad_norm": 1.177161455154419,
"learning_rate": 6.123519021365107e-06,
"loss": 1.9476,
"step": 15825
},
{
"epoch": 4.760989526633883,
"grad_norm": 0.998101532459259,
"learning_rate": 6.099892689675414e-06,
"loss": 1.9599,
"step": 15850
},
{
"epoch": 4.768497315965314,
"grad_norm": 1.0538263320922852,
"learning_rate": 6.076280745080128e-06,
"loss": 2.0034,
"step": 15875
},
{
"epoch": 4.776005105296745,
"grad_norm": 1.2193068265914917,
"learning_rate": 6.0526834302087054e-06,
"loss": 1.9526,
"step": 15900
},
{
"epoch": 4.776005105296745,
"eval_loss": 1.9908709526062012,
"eval_runtime": 244.0685,
"eval_samples_per_second": 22.981,
"eval_steps_per_second": 5.748,
"step": 15900
},
{
"epoch": 4.783512894628177,
"grad_norm": 1.0334097146987915,
"learning_rate": 6.0291009875402705e-06,
"loss": 1.9999,
"step": 15925
},
{
"epoch": 4.791020683959609,
"grad_norm": 1.173577904701233,
"learning_rate": 6.005533659401131e-06,
"loss": 1.9886,
"step": 15950
},
{
"epoch": 4.798528473291039,
"grad_norm": 1.270085334777832,
"learning_rate": 5.98198168796227e-06,
"loss": 1.9726,
"step": 15975
},
{
"epoch": 4.806036262622471,
"grad_norm": 1.2580983638763428,
"learning_rate": 5.958445315236885e-06,
"loss": 1.9382,
"step": 16000
},
{
"epoch": 4.806036262622471,
"eval_loss": 1.9913830757141113,
"eval_runtime": 244.081,
"eval_samples_per_second": 22.98,
"eval_steps_per_second": 5.748,
"step": 16000
},
{
"epoch": 4.813544051953902,
"grad_norm": 1.080772042274475,
"learning_rate": 5.934924783077876e-06,
"loss": 1.9402,
"step": 16025
},
{
"epoch": 4.821051841285334,
"grad_norm": 1.1412278413772583,
"learning_rate": 5.911420333175371e-06,
"loss": 1.9609,
"step": 16050
},
{
"epoch": 4.828559630616764,
"grad_norm": 1.0110164880752563,
"learning_rate": 5.887932207054245e-06,
"loss": 1.9922,
"step": 16075
},
{
"epoch": 4.836067419948196,
"grad_norm": 1.1360834836959839,
"learning_rate": 5.864460646071631e-06,
"loss": 2.0002,
"step": 16100
},
{
"epoch": 4.836067419948196,
"eval_loss": 1.9903969764709473,
"eval_runtime": 244.6106,
"eval_samples_per_second": 22.93,
"eval_steps_per_second": 5.736,
"step": 16100
},
{
"epoch": 4.843575209279628,
"grad_norm": 1.163109302520752,
"learning_rate": 5.841005891414443e-06,
"loss": 1.9692,
"step": 16125
},
{
"epoch": 4.851082998611059,
"grad_norm": 1.1313296556472778,
"learning_rate": 5.817568184096897e-06,
"loss": 1.9648,
"step": 16150
},
{
"epoch": 4.85859078794249,
"grad_norm": 1.1544945240020752,
"learning_rate": 5.794147764958046e-06,
"loss": 1.9696,
"step": 16175
},
{
"epoch": 4.866098577273922,
"grad_norm": 1.0399378538131714,
"learning_rate": 5.770744874659283e-06,
"loss": 1.9396,
"step": 16200
},
{
"epoch": 4.866098577273922,
"eval_loss": 1.9903674125671387,
"eval_runtime": 244.9102,
"eval_samples_per_second": 22.902,
"eval_steps_per_second": 5.729,
"step": 16200
},
{
"epoch": 4.873606366605353,
"grad_norm": 1.189995288848877,
"learning_rate": 5.747359753681883e-06,
"loss": 1.9542,
"step": 16225
},
{
"epoch": 4.8811141559367845,
"grad_norm": 1.0425307750701904,
"learning_rate": 5.7239926423245305e-06,
"loss": 1.9764,
"step": 16250
},
{
"epoch": 4.888621945268216,
"grad_norm": 1.1978663206100464,
"learning_rate": 5.700643780700849e-06,
"loss": 1.9624,
"step": 16275
},
{
"epoch": 4.896129734599647,
"grad_norm": 1.045249104499817,
"learning_rate": 5.677313408736924e-06,
"loss": 1.9709,
"step": 16300
},
{
"epoch": 4.896129734599647,
"eval_loss": 1.9895341396331787,
"eval_runtime": 244.445,
"eval_samples_per_second": 22.946,
"eval_steps_per_second": 5.74,
"step": 16300
},
{
"epoch": 4.903637523931079,
"grad_norm": 1.119350790977478,
"learning_rate": 5.654001766168861e-06,
"loss": 1.9712,
"step": 16325
},
{
"epoch": 4.91114531326251,
"grad_norm": 1.090303897857666,
"learning_rate": 5.630709092540301e-06,
"loss": 1.9269,
"step": 16350
},
{
"epoch": 4.918653102593941,
"grad_norm": 1.2654612064361572,
"learning_rate": 5.607435627199961e-06,
"loss": 1.9468,
"step": 16375
},
{
"epoch": 4.926160891925372,
"grad_norm": 1.0900917053222656,
"learning_rate": 5.584181609299187e-06,
"loss": 1.9574,
"step": 16400
},
{
"epoch": 4.926160891925372,
"eval_loss": 1.989732265472412,
"eval_runtime": 244.1216,
"eval_samples_per_second": 22.976,
"eval_steps_per_second": 5.747,
"step": 16400
},
{
"epoch": 4.933668681256804,
"grad_norm": 1.192901372909546,
"learning_rate": 5.560947277789483e-06,
"loss": 1.928,
"step": 16425
},
{
"epoch": 4.9411764705882355,
"grad_norm": 1.0490167140960693,
"learning_rate": 5.537732871420064e-06,
"loss": 1.9452,
"step": 16450
},
{
"epoch": 4.948684259919666,
"grad_norm": 1.0243791341781616,
"learning_rate": 5.514538628735402e-06,
"loss": 1.9646,
"step": 16475
},
{
"epoch": 4.956192049251098,
"grad_norm": 1.091910481452942,
"learning_rate": 5.491364788072769e-06,
"loss": 1.982,
"step": 16500
},
{
"epoch": 4.956192049251098,
"eval_loss": 1.9894477128982544,
"eval_runtime": 244.6192,
"eval_samples_per_second": 22.93,
"eval_steps_per_second": 5.735,
"step": 16500
},
{
"epoch": 4.96369983858253,
"grad_norm": 1.1949421167373657,
"learning_rate": 5.468211587559794e-06,
"loss": 1.9528,
"step": 16525
},
{
"epoch": 4.9712076279139605,
"grad_norm": 1.0508161783218384,
"learning_rate": 5.445079265112013e-06,
"loss": 1.9485,
"step": 16550
},
{
"epoch": 4.978715417245392,
"grad_norm": 1.2194594144821167,
"learning_rate": 5.421968058430424e-06,
"loss": 1.9324,
"step": 16575
},
{
"epoch": 4.986223206576824,
"grad_norm": 1.1732969284057617,
"learning_rate": 5.398878204999047e-06,
"loss": 1.9588,
"step": 16600
},
{
"epoch": 4.986223206576824,
"eval_loss": 1.9885412454605103,
"eval_runtime": 244.0337,
"eval_samples_per_second": 22.985,
"eval_steps_per_second": 5.749,
"step": 16600
},
{
"epoch": 4.993730995908255,
"grad_norm": 1.142801284790039,
"learning_rate": 5.375809942082486e-06,
"loss": 1.969,
"step": 16625
},
{
"epoch": 5.001501557866287,
"grad_norm": 1.1804615259170532,
"learning_rate": 5.35276350672348e-06,
"loss": 2.0292,
"step": 16650
},
{
"epoch": 5.0090093471977175,
"grad_norm": 1.184505581855774,
"learning_rate": 5.329739135740479e-06,
"loss": 1.9356,
"step": 16675
},
{
"epoch": 5.016517136529149,
"grad_norm": 1.23818039894104,
"learning_rate": 5.306737065725203e-06,
"loss": 1.9537,
"step": 16700
},
{
"epoch": 5.016517136529149,
"eval_loss": 1.9894059896469116,
"eval_runtime": 244.4081,
"eval_samples_per_second": 22.949,
"eval_steps_per_second": 5.74,
"step": 16700
},
{
"epoch": 5.02402492586058,
"grad_norm": 1.1360877752304077,
"learning_rate": 5.283757533040218e-06,
"loss": 1.9584,
"step": 16725
},
{
"epoch": 5.031532715192012,
"grad_norm": 0.9773384928703308,
"learning_rate": 5.260800773816495e-06,
"loss": 1.9773,
"step": 16750
},
{
"epoch": 5.039040504523443,
"grad_norm": 1.052291750907898,
"learning_rate": 5.237867023951004e-06,
"loss": 1.9516,
"step": 16775
},
{
"epoch": 5.046548293854874,
"grad_norm": 1.086792230606079,
"learning_rate": 5.214956519104266e-06,
"loss": 1.9529,
"step": 16800
},
{
"epoch": 5.046548293854874,
"eval_loss": 1.9890544414520264,
"eval_runtime": 244.3105,
"eval_samples_per_second": 22.958,
"eval_steps_per_second": 5.743,
"step": 16800
},
{
"epoch": 5.054056083186306,
"grad_norm": 1.0600550174713135,
"learning_rate": 5.192069494697948e-06,
"loss": 1.9553,
"step": 16825
},
{
"epoch": 5.061563872517737,
"grad_norm": 1.2051581144332886,
"learning_rate": 5.169206185912439e-06,
"loss": 1.9469,
"step": 16850
},
{
"epoch": 5.0690716618491685,
"grad_norm": 1.0981636047363281,
"learning_rate": 5.146366827684433e-06,
"loss": 1.9817,
"step": 16875
},
{
"epoch": 5.0765794511806,
"grad_norm": 1.2117871046066284,
"learning_rate": 5.123551654704513e-06,
"loss": 1.9476,
"step": 16900
},
{
"epoch": 5.0765794511806,
"eval_loss": 1.9889459609985352,
"eval_runtime": 244.8126,
"eval_samples_per_second": 22.911,
"eval_steps_per_second": 5.731,
"step": 16900
},
{
"epoch": 5.084087240512031,
"grad_norm": 1.1481753587722778,
"learning_rate": 5.101672059749764e-06,
"loss": 1.9257,
"step": 16925
},
{
"epoch": 5.091595029843463,
"grad_norm": 1.0862995386123657,
"learning_rate": 5.0789049696927284e-06,
"loss": 1.9393,
"step": 16950
},
{
"epoch": 5.099102819174894,
"grad_norm": 1.1692668199539185,
"learning_rate": 5.056162758102157e-06,
"loss": 1.9525,
"step": 16975
},
{
"epoch": 5.106610608506325,
"grad_norm": 1.2036652565002441,
"learning_rate": 5.033445658670386e-06,
"loss": 1.9622,
"step": 17000
},
{
"epoch": 5.106610608506325,
"eval_loss": 1.988864541053772,
"eval_runtime": 244.6644,
"eval_samples_per_second": 22.925,
"eval_steps_per_second": 5.734,
"step": 17000
},
{
"epoch": 5.114118397837757,
"grad_norm": 1.2949445247650146,
"learning_rate": 5.0107539048317025e-06,
"loss": 1.9454,
"step": 17025
},
{
"epoch": 5.121626187169188,
"grad_norm": 1.3341749906539917,
"learning_rate": 4.98808772975995e-06,
"loss": 1.9501,
"step": 17050
},
{
"epoch": 5.129133976500619,
"grad_norm": 1.147869348526001,
"learning_rate": 4.965447366366137e-06,
"loss": 1.9392,
"step": 17075
},
{
"epoch": 5.136641765832051,
"grad_norm": 1.2364166975021362,
"learning_rate": 4.9428330472960326e-06,
"loss": 1.957,
"step": 17100
},
{
"epoch": 5.136641765832051,
"eval_loss": 1.9885538816452026,
"eval_runtime": 244.7413,
"eval_samples_per_second": 22.918,
"eval_steps_per_second": 5.733,
"step": 17100
},
{
"epoch": 5.144149555163482,
"grad_norm": 1.0443626642227173,
"learning_rate": 4.920245004927787e-06,
"loss": 1.9461,
"step": 17125
},
{
"epoch": 5.151657344494914,
"grad_norm": 1.1504952907562256,
"learning_rate": 4.897683471369532e-06,
"loss": 1.9492,
"step": 17150
},
{
"epoch": 5.1591651338263445,
"grad_norm": 1.1174638271331787,
"learning_rate": 4.875148678457012e-06,
"loss": 1.9496,
"step": 17175
},
{
"epoch": 5.166672923157776,
"grad_norm": 1.2215421199798584,
"learning_rate": 4.852640857751181e-06,
"loss": 1.9272,
"step": 17200
},
{
"epoch": 5.166672923157776,
"eval_loss": 1.9891639947891235,
"eval_runtime": 244.7795,
"eval_samples_per_second": 22.915,
"eval_steps_per_second": 5.732,
"step": 17200
},
{
"epoch": 5.174180712489208,
"grad_norm": 1.179458498954773,
"learning_rate": 4.830160240535846e-06,
"loss": 1.965,
"step": 17225
},
{
"epoch": 5.181688501820639,
"grad_norm": 1.1385215520858765,
"learning_rate": 4.807707057815272e-06,
"loss": 1.9466,
"step": 17250
},
{
"epoch": 5.18919629115207,
"grad_norm": 1.4543343782424927,
"learning_rate": 4.785281540311815e-06,
"loss": 1.9864,
"step": 17275
},
{
"epoch": 5.196704080483501,
"grad_norm": 1.1437246799468994,
"learning_rate": 4.762883918463555e-06,
"loss": 1.9545,
"step": 17300
},
{
"epoch": 5.196704080483501,
"eval_loss": 1.988171935081482,
"eval_runtime": 244.7849,
"eval_samples_per_second": 22.914,
"eval_steps_per_second": 5.732,
"step": 17300
},
{
"epoch": 5.204211869814933,
"grad_norm": 1.154579520225525,
"learning_rate": 4.740514422421921e-06,
"loss": 1.9295,
"step": 17325
},
{
"epoch": 5.211719659146365,
"grad_norm": 1.21454656124115,
"learning_rate": 4.71817328204933e-06,
"loss": 1.9554,
"step": 17350
},
{
"epoch": 5.219227448477795,
"grad_norm": 1.1201882362365723,
"learning_rate": 4.695860726916826e-06,
"loss": 1.9313,
"step": 17375
},
{
"epoch": 5.226735237809227,
"grad_norm": 1.1768020391464233,
"learning_rate": 4.673576986301719e-06,
"loss": 1.9316,
"step": 17400
},
{
"epoch": 5.226735237809227,
"eval_loss": 1.9883191585540771,
"eval_runtime": 245.1496,
"eval_samples_per_second": 22.88,
"eval_steps_per_second": 5.723,
"step": 17400
},
{
"epoch": 5.234243027140659,
"grad_norm": 1.14753258228302,
"learning_rate": 4.651322289185229e-06,
"loss": 1.9224,
"step": 17425
},
{
"epoch": 5.24175081647209,
"grad_norm": 1.2445884943008423,
"learning_rate": 4.629096864250132e-06,
"loss": 1.9336,
"step": 17450
},
{
"epoch": 5.249258605803521,
"grad_norm": 1.0989011526107788,
"learning_rate": 4.606900939878415e-06,
"loss": 1.9434,
"step": 17475
},
{
"epoch": 5.256766395134952,
"grad_norm": 1.2167655229568481,
"learning_rate": 4.584734744148922e-06,
"loss": 1.9219,
"step": 17500
},
{
"epoch": 5.256766395134952,
"eval_loss": 1.9880566596984863,
"eval_runtime": 245.0782,
"eval_samples_per_second": 22.887,
"eval_steps_per_second": 5.725,
"step": 17500
},
{
"epoch": 5.264274184466384,
"grad_norm": 1.070075511932373,
"learning_rate": 4.562598504835015e-06,
"loss": 1.9723,
"step": 17525
},
{
"epoch": 5.2717819737978155,
"grad_norm": 1.1149256229400635,
"learning_rate": 4.540492449402237e-06,
"loss": 1.9661,
"step": 17550
},
{
"epoch": 5.279289763129246,
"grad_norm": 1.1833670139312744,
"learning_rate": 4.5184168050059645e-06,
"loss": 1.9208,
"step": 17575
},
{
"epoch": 5.286797552460678,
"grad_norm": 1.1111880540847778,
"learning_rate": 4.496371798489084e-06,
"loss": 1.9621,
"step": 17600
},
{
"epoch": 5.286797552460678,
"eval_loss": 1.9878884553909302,
"eval_runtime": 245.0298,
"eval_samples_per_second": 22.891,
"eval_steps_per_second": 5.726,
"step": 17600
},
{
"epoch": 5.294305341792109,
"grad_norm": 1.1070398092269897,
"learning_rate": 4.47435765637965e-06,
"loss": 1.9578,
"step": 17625
},
{
"epoch": 5.301813131123541,
"grad_norm": 1.127094030380249,
"learning_rate": 4.452374604888568e-06,
"loss": 1.9291,
"step": 17650
},
{
"epoch": 5.309320920454972,
"grad_norm": 1.1716080904006958,
"learning_rate": 4.430422869907261e-06,
"loss": 1.9694,
"step": 17675
},
{
"epoch": 5.316828709786403,
"grad_norm": 1.0636411905288696,
"learning_rate": 4.408502677005365e-06,
"loss": 1.9692,
"step": 17700
},
{
"epoch": 5.316828709786403,
"eval_loss": 1.9873278141021729,
"eval_runtime": 245.0048,
"eval_samples_per_second": 22.893,
"eval_steps_per_second": 5.726,
"step": 17700
},
{
"epoch": 5.324336499117835,
"grad_norm": 1.0959444046020508,
"learning_rate": 4.386614251428382e-06,
"loss": 1.9467,
"step": 17725
},
{
"epoch": 5.3318442884492665,
"grad_norm": 1.3291634321212769,
"learning_rate": 4.3647578180953905e-06,
"loss": 1.9335,
"step": 17750
},
{
"epoch": 5.339352077780697,
"grad_norm": 1.1393731832504272,
"learning_rate": 4.342933601596728e-06,
"loss": 1.9253,
"step": 17775
},
{
"epoch": 5.346859867112129,
"grad_norm": 1.0294339656829834,
"learning_rate": 4.321141826191677e-06,
"loss": 1.9358,
"step": 17800
},
{
"epoch": 5.346859867112129,
"eval_loss": 1.9870134592056274,
"eval_runtime": 244.8052,
"eval_samples_per_second": 22.912,
"eval_steps_per_second": 5.731,
"step": 17800
},
{
"epoch": 5.35436765644356,
"grad_norm": 1.1546337604522705,
"learning_rate": 4.299382715806166e-06,
"loss": 1.9828,
"step": 17825
},
{
"epoch": 5.3618754457749915,
"grad_norm": 1.1130156517028809,
"learning_rate": 4.27765649403047e-06,
"loss": 1.9328,
"step": 17850
},
{
"epoch": 5.369383235106423,
"grad_norm": 1.1644601821899414,
"learning_rate": 4.2559633841169055e-06,
"loss": 1.9425,
"step": 17875
},
{
"epoch": 5.376891024437854,
"grad_norm": 1.1819103956222534,
"learning_rate": 4.2343036089775444e-06,
"loss": 1.9346,
"step": 17900
},
{
"epoch": 5.376891024437854,
"eval_loss": 1.9867066144943237,
"eval_runtime": 244.6669,
"eval_samples_per_second": 22.925,
"eval_steps_per_second": 5.734,
"step": 17900
},
{
"epoch": 5.384398813769286,
"grad_norm": 1.126689076423645,
"learning_rate": 4.212677391181919e-06,
"loss": 1.9554,
"step": 17925
},
{
"epoch": 5.391906603100717,
"grad_norm": 1.2009223699569702,
"learning_rate": 4.191084952954739e-06,
"loss": 1.9597,
"step": 17950
},
{
"epoch": 5.399414392432148,
"grad_norm": 1.1235226392745972,
"learning_rate": 4.169526516173596e-06,
"loss": 1.9362,
"step": 17975
},
{
"epoch": 5.40692218176358,
"grad_norm": 1.2246404886245728,
"learning_rate": 4.148002302366707e-06,
"loss": 1.9621,
"step": 18000
},
{
"epoch": 5.40692218176358,
"eval_loss": 1.9868113994598389,
"eval_runtime": 244.649,
"eval_samples_per_second": 22.927,
"eval_steps_per_second": 5.735,
"step": 18000
},
{
"epoch": 5.414429971095011,
"grad_norm": 1.140601634979248,
"learning_rate": 4.126512532710613e-06,
"loss": 1.9313,
"step": 18025
},
{
"epoch": 5.4219377604264425,
"grad_norm": 1.1360208988189697,
"learning_rate": 4.105057428027919e-06,
"loss": 1.9462,
"step": 18050
},
{
"epoch": 5.429445549757874,
"grad_norm": 1.172402024269104,
"learning_rate": 4.0836372087850255e-06,
"loss": 1.9577,
"step": 18075
},
{
"epoch": 5.436953339089305,
"grad_norm": 1.1650437116622925,
"learning_rate": 4.062252095089857e-06,
"loss": 1.9299,
"step": 18100
},
{
"epoch": 5.436953339089305,
"eval_loss": 1.9864240884780884,
"eval_runtime": 244.7091,
"eval_samples_per_second": 22.921,
"eval_steps_per_second": 5.733,
"step": 18100
},
{
"epoch": 5.444461128420737,
"grad_norm": 1.1697797775268555,
"learning_rate": 4.040902306689605e-06,
"loss": 1.9483,
"step": 18125
},
{
"epoch": 5.4519689177521675,
"grad_norm": 1.194689393043518,
"learning_rate": 4.019588062968471e-06,
"loss": 1.9468,
"step": 18150
},
{
"epoch": 5.459476707083599,
"grad_norm": 1.0986504554748535,
"learning_rate": 3.998309582945405e-06,
"loss": 1.9472,
"step": 18175
},
{
"epoch": 5.466984496415031,
"grad_norm": 1.2854065895080566,
"learning_rate": 3.977067085271864e-06,
"loss": 1.9455,
"step": 18200
},
{
"epoch": 5.466984496415031,
"eval_loss": 1.9863779544830322,
"eval_runtime": 244.6161,
"eval_samples_per_second": 22.93,
"eval_steps_per_second": 5.736,
"step": 18200
},
{
"epoch": 5.474492285746462,
"grad_norm": 1.167863368988037,
"learning_rate": 3.95586078822956e-06,
"loss": 1.9287,
"step": 18225
},
{
"epoch": 5.482000075077893,
"grad_norm": 1.190122365951538,
"learning_rate": 3.934690909728214e-06,
"loss": 1.9581,
"step": 18250
},
{
"epoch": 5.489507864409324,
"grad_norm": 1.0951225757598877,
"learning_rate": 3.913557667303326e-06,
"loss": 1.93,
"step": 18275
},
{
"epoch": 5.497015653740756,
"grad_norm": 1.052368402481079,
"learning_rate": 3.8924612781139276e-06,
"loss": 1.9753,
"step": 18300
},
{
"epoch": 5.497015653740756,
"eval_loss": 1.9860328435897827,
"eval_runtime": 245.3363,
"eval_samples_per_second": 22.862,
"eval_steps_per_second": 5.719,
"step": 18300
},
{
"epoch": 5.504523443072188,
"grad_norm": 1.1306352615356445,
"learning_rate": 3.87140195894037e-06,
"loss": 1.9711,
"step": 18325
},
{
"epoch": 5.5120312324036185,
"grad_norm": 1.174249291419983,
"learning_rate": 3.850379926182069e-06,
"loss": 1.9391,
"step": 18350
},
{
"epoch": 5.51953902173505,
"grad_norm": 1.0850168466567993,
"learning_rate": 3.8293953958553055e-06,
"loss": 1.9709,
"step": 18375
},
{
"epoch": 5.527046811066482,
"grad_norm": 1.1175942420959473,
"learning_rate": 3.8084485835909922e-06,
"loss": 1.9369,
"step": 18400
},
{
"epoch": 5.527046811066482,
"eval_loss": 1.9858981370925903,
"eval_runtime": 244.8528,
"eval_samples_per_second": 22.908,
"eval_steps_per_second": 5.73,
"step": 18400
},
{
"epoch": 5.534554600397913,
"grad_norm": 1.1870449781417847,
"learning_rate": 3.7875397046324636e-06,
"loss": 1.9603,
"step": 18425
},
{
"epoch": 5.542062389729344,
"grad_norm": 1.1581183671951294,
"learning_rate": 3.766668973833262e-06,
"loss": 1.9415,
"step": 18450
},
{
"epoch": 5.549570179060775,
"grad_norm": 1.0704885721206665,
"learning_rate": 3.7458366056549304e-06,
"loss": 1.945,
"step": 18475
},
{
"epoch": 5.557077968392207,
"grad_norm": 1.209778904914856,
"learning_rate": 3.7250428141648097e-06,
"loss": 1.9571,
"step": 18500
},
{
"epoch": 5.557077968392207,
"eval_loss": 1.9858996868133545,
"eval_runtime": 244.4136,
"eval_samples_per_second": 22.949,
"eval_steps_per_second": 5.74,
"step": 18500
},
{
"epoch": 5.564585757723639,
"grad_norm": 1.1481252908706665,
"learning_rate": 3.704287813033836e-06,
"loss": 1.9445,
"step": 18525
},
{
"epoch": 5.572093547055069,
"grad_norm": 1.1967343091964722,
"learning_rate": 3.6835718155343483e-06,
"loss": 1.9457,
"step": 18550
},
{
"epoch": 5.579601336386501,
"grad_norm": 1.246741771697998,
"learning_rate": 3.6628950345378965e-06,
"loss": 1.951,
"step": 18575
},
{
"epoch": 5.587109125717932,
"grad_norm": 1.1486597061157227,
"learning_rate": 3.6422576825130477e-06,
"loss": 1.9534,
"step": 18600
},
{
"epoch": 5.587109125717932,
"eval_loss": 1.9850828647613525,
"eval_runtime": 244.5498,
"eval_samples_per_second": 22.936,
"eval_steps_per_second": 5.737,
"step": 18600
},
{
"epoch": 5.594616915049364,
"grad_norm": 1.0895804166793823,
"learning_rate": 3.62165997152322e-06,
"loss": 1.9507,
"step": 18625
},
{
"epoch": 5.602124704380795,
"grad_norm": 1.150476098060608,
"learning_rate": 3.6011021132244807e-06,
"loss": 1.9709,
"step": 18650
},
{
"epoch": 5.609632493712226,
"grad_norm": 1.2993876934051514,
"learning_rate": 3.5805843188633868e-06,
"loss": 1.9095,
"step": 18675
},
{
"epoch": 5.617140283043658,
"grad_norm": 1.1421048641204834,
"learning_rate": 3.56010679927481e-06,
"loss": 1.9381,
"step": 18700
},
{
"epoch": 5.617140283043658,
"eval_loss": 1.9856911897659302,
"eval_runtime": 244.4243,
"eval_samples_per_second": 22.948,
"eval_steps_per_second": 5.74,
"step": 18700
},
{
"epoch": 5.62464807237509,
"grad_norm": 1.2726351022720337,
"learning_rate": 3.539669764879769e-06,
"loss": 1.9533,
"step": 18725
},
{
"epoch": 5.63215586170652,
"grad_norm": 1.3039084672927856,
"learning_rate": 3.519273425683269e-06,
"loss": 1.9381,
"step": 18750
},
{
"epoch": 5.639663651037952,
"grad_norm": 1.2816251516342163,
"learning_rate": 3.4989179912721443e-06,
"loss": 1.9566,
"step": 18775
},
{
"epoch": 5.647171440369383,
"grad_norm": 1.1940944194793701,
"learning_rate": 3.4786036708129018e-06,
"loss": 1.9684,
"step": 18800
},
{
"epoch": 5.647171440369383,
"eval_loss": 1.985024094581604,
"eval_runtime": 245.079,
"eval_samples_per_second": 22.887,
"eval_steps_per_second": 5.725,
"step": 18800
},
{
"epoch": 5.654679229700815,
"grad_norm": 1.1532857418060303,
"learning_rate": 3.4583306730495745e-06,
"loss": 1.9131,
"step": 18825
},
{
"epoch": 5.662187019032246,
"grad_norm": 1.1996498107910156,
"learning_rate": 3.4380992063015747e-06,
"loss": 1.9262,
"step": 18850
},
{
"epoch": 5.669694808363677,
"grad_norm": 1.1328129768371582,
"learning_rate": 3.4179094784615565e-06,
"loss": 1.9509,
"step": 18875
},
{
"epoch": 5.677202597695109,
"grad_norm": 1.124004602432251,
"learning_rate": 3.3977616969932705e-06,
"loss": 1.9334,
"step": 18900
},
{
"epoch": 5.677202597695109,
"eval_loss": 1.9849857091903687,
"eval_runtime": 244.5129,
"eval_samples_per_second": 22.939,
"eval_steps_per_second": 5.738,
"step": 18900
},
{
"epoch": 5.68471038702654,
"grad_norm": 1.187667727470398,
"learning_rate": 3.3776560689294486e-06,
"loss": 1.9702,
"step": 18925
},
{
"epoch": 5.692218176357971,
"grad_norm": 1.1103003025054932,
"learning_rate": 3.3575928008696606e-06,
"loss": 1.9825,
"step": 18950
},
{
"epoch": 5.699725965689403,
"grad_norm": 1.1390193700790405,
"learning_rate": 3.3375720989781967e-06,
"loss": 1.9481,
"step": 18975
},
{
"epoch": 5.707233755020834,
"grad_norm": 1.0689352750778198,
"learning_rate": 3.3175941689819507e-06,
"loss": 1.9633,
"step": 19000
},
{
"epoch": 5.707233755020834,
"eval_loss": 1.9846566915512085,
"eval_runtime": 244.7924,
"eval_samples_per_second": 22.913,
"eval_steps_per_second": 5.731,
"step": 19000
},
{
"epoch": 5.714741544352266,
"grad_norm": 1.3017778396606445,
"learning_rate": 3.297659216168305e-06,
"loss": 1.9521,
"step": 19025
},
{
"epoch": 5.722249333683697,
"grad_norm": 1.0697276592254639,
"learning_rate": 3.277767445383023e-06,
"loss": 1.926,
"step": 19050
},
{
"epoch": 5.729757123015128,
"grad_norm": 1.2455774545669556,
"learning_rate": 3.2579190610281378e-06,
"loss": 1.9708,
"step": 19075
},
{
"epoch": 5.73726491234656,
"grad_norm": 1.2440054416656494,
"learning_rate": 3.238114267059859e-06,
"loss": 1.9728,
"step": 19100
},
{
"epoch": 5.73726491234656,
"eval_loss": 1.9845046997070312,
"eval_runtime": 245.1879,
"eval_samples_per_second": 22.876,
"eval_steps_per_second": 5.722,
"step": 19100
},
{
"epoch": 5.744772701677991,
"grad_norm": 1.1576147079467773,
"learning_rate": 3.218353266986476e-06,
"loss": 1.9956,
"step": 19125
},
{
"epoch": 5.752280491009422,
"grad_norm": 1.4614973068237305,
"learning_rate": 3.198636263866259e-06,
"loss": 1.9471,
"step": 19150
},
{
"epoch": 5.759788280340854,
"grad_norm": 1.2813773155212402,
"learning_rate": 3.1789634603053846e-06,
"loss": 1.9516,
"step": 19175
},
{
"epoch": 5.767296069672285,
"grad_norm": 1.212929368019104,
"learning_rate": 3.1593350584558446e-06,
"loss": 1.9446,
"step": 19200
},
{
"epoch": 5.767296069672285,
"eval_loss": 1.9842097759246826,
"eval_runtime": 244.6725,
"eval_samples_per_second": 22.925,
"eval_steps_per_second": 5.734,
"step": 19200
},
{
"epoch": 5.7748038590037165,
"grad_norm": 1.0693471431732178,
"learning_rate": 3.1397512600133694e-06,
"loss": 1.9767,
"step": 19225
},
{
"epoch": 5.782311648335147,
"grad_norm": 1.2919217348098755,
"learning_rate": 3.120212266215365e-06,
"loss": 1.9476,
"step": 19250
},
{
"epoch": 5.789819437666579,
"grad_norm": 1.1402935981750488,
"learning_rate": 3.1007182778388315e-06,
"loss": 1.9495,
"step": 19275
},
{
"epoch": 5.797327226998011,
"grad_norm": 1.2192392349243164,
"learning_rate": 3.0812694951983087e-06,
"loss": 1.9633,
"step": 19300
},
{
"epoch": 5.797327226998011,
"eval_loss": 1.9841300249099731,
"eval_runtime": 245.1028,
"eval_samples_per_second": 22.884,
"eval_steps_per_second": 5.724,
"step": 19300
},
{
"epoch": 5.8048350163294415,
"grad_norm": 1.232828140258789,
"learning_rate": 3.0618661181438147e-06,
"loss": 1.9147,
"step": 19325
},
{
"epoch": 5.812342805660873,
"grad_norm": 1.1075702905654907,
"learning_rate": 3.042508346058794e-06,
"loss": 1.9493,
"step": 19350
},
{
"epoch": 5.819850594992305,
"grad_norm": 1.3566619157791138,
"learning_rate": 3.0231963778580643e-06,
"loss": 1.9314,
"step": 19375
},
{
"epoch": 5.827358384323736,
"grad_norm": 0.9761985540390015,
"learning_rate": 3.0039304119857863e-06,
"loss": 1.9674,
"step": 19400
},
{
"epoch": 5.827358384323736,
"eval_loss": 1.9838725328445435,
"eval_runtime": 244.9514,
"eval_samples_per_second": 22.898,
"eval_steps_per_second": 5.728,
"step": 19400
},
{
"epoch": 5.8348661736551675,
"grad_norm": 1.236413836479187,
"learning_rate": 2.984710646413399e-06,
"loss": 1.9401,
"step": 19425
},
{
"epoch": 5.842373962986598,
"grad_norm": 1.1869447231292725,
"learning_rate": 2.965537278637612e-06,
"loss": 1.9927,
"step": 19450
},
{
"epoch": 5.84988175231803,
"grad_norm": 1.1049928665161133,
"learning_rate": 2.946410505678359e-06,
"loss": 1.9789,
"step": 19475
},
{
"epoch": 5.857389541649462,
"grad_norm": 1.0848510265350342,
"learning_rate": 2.927330524076784e-06,
"loss": 1.9329,
"step": 19500
},
{
"epoch": 5.857389541649462,
"eval_loss": 1.9838305711746216,
"eval_runtime": 244.8394,
"eval_samples_per_second": 22.909,
"eval_steps_per_second": 5.73,
"step": 19500
},
{
"epoch": 5.8648973309808925,
"grad_norm": 1.257003664970398,
"learning_rate": 2.9082975298932073e-06,
"loss": 1.9271,
"step": 19525
},
{
"epoch": 5.872405120312324,
"grad_norm": 1.1217468976974487,
"learning_rate": 2.889311718705135e-06,
"loss": 1.9593,
"step": 19550
},
{
"epoch": 5.879912909643755,
"grad_norm": 1.0415600538253784,
"learning_rate": 2.8703732856052216e-06,
"loss": 1.9436,
"step": 19575
},
{
"epoch": 5.887420698975187,
"grad_norm": 1.1773449182510376,
"learning_rate": 2.8514824251992834e-06,
"loss": 1.9604,
"step": 19600
},
{
"epoch": 5.887420698975187,
"eval_loss": 1.983793020248413,
"eval_runtime": 244.4635,
"eval_samples_per_second": 22.944,
"eval_steps_per_second": 5.739,
"step": 19600
},
{
"epoch": 5.894928488306618,
"grad_norm": 1.1694916486740112,
"learning_rate": 2.832639331604292e-06,
"loss": 1.9281,
"step": 19625
},
{
"epoch": 5.902436277638049,
"grad_norm": 1.1439831256866455,
"learning_rate": 2.813844198446383e-06,
"loss": 1.9469,
"step": 19650
},
{
"epoch": 5.909944066969481,
"grad_norm": 1.2244899272918701,
"learning_rate": 2.7950972188588596e-06,
"loss": 1.9203,
"step": 19675
},
{
"epoch": 5.917451856300913,
"grad_norm": 1.0796282291412354,
"learning_rate": 2.776398585480223e-06,
"loss": 1.9569,
"step": 19700
},
{
"epoch": 5.917451856300913,
"eval_loss": 1.983589768409729,
"eval_runtime": 244.6683,
"eval_samples_per_second": 22.925,
"eval_steps_per_second": 5.734,
"step": 19700
},
{
"epoch": 5.9249596456323435,
"grad_norm": 1.1817554235458374,
"learning_rate": 2.757748490452177e-06,
"loss": 1.967,
"step": 19725
},
{
"epoch": 5.932467434963775,
"grad_norm": 1.1933224201202393,
"learning_rate": 2.739147125417653e-06,
"loss": 1.9553,
"step": 19750
},
{
"epoch": 5.939975224295206,
"grad_norm": 1.0195425748825073,
"learning_rate": 2.7205946815188563e-06,
"loss": 1.9477,
"step": 19775
},
{
"epoch": 5.947483013626638,
"grad_norm": 1.1039797067642212,
"learning_rate": 2.7020913493952893e-06,
"loss": 1.9508,
"step": 19800
},
{
"epoch": 5.947483013626638,
"eval_loss": 1.9835751056671143,
"eval_runtime": 244.4808,
"eval_samples_per_second": 22.942,
"eval_steps_per_second": 5.739,
"step": 19800
},
{
"epoch": 5.954990802958069,
"grad_norm": 1.1363548040390015,
"learning_rate": 2.6836373191817982e-06,
"loss": 1.9466,
"step": 19825
},
{
"epoch": 5.9624985922895,
"grad_norm": 1.182576298713684,
"learning_rate": 2.6652327805066128e-06,
"loss": 1.9549,
"step": 19850
},
{
"epoch": 5.970006381620932,
"grad_norm": 1.083834171295166,
"learning_rate": 2.6468779224894086e-06,
"loss": 1.9421,
"step": 19875
},
{
"epoch": 5.977514170952363,
"grad_norm": 1.1508004665374756,
"learning_rate": 2.628572933739354e-06,
"loss": 1.9237,
"step": 19900
},
{
"epoch": 5.977514170952363,
"eval_loss": 1.9832342863082886,
"eval_runtime": 276.0383,
"eval_samples_per_second": 20.32,
"eval_steps_per_second": 5.083,
"step": 19900
},
{
"epoch": 5.985021960283794,
"grad_norm": 1.134469985961914,
"learning_rate": 2.6103180023531726e-06,
"loss": 1.9175,
"step": 19925
},
{
"epoch": 5.992529749615226,
"grad_norm": 1.1148380041122437,
"learning_rate": 2.592113315913217e-06,
"loss": 1.96,
"step": 19950
},
{
"epoch": 6.000300311573257,
"grad_norm": 2.1076784133911133,
"learning_rate": 2.5739590614855353e-06,
"loss": 2.0546,
"step": 19975
},
{
"epoch": 6.007808100904689,
"grad_norm": 1.1790810823440552,
"learning_rate": 2.5558554256179507e-06,
"loss": 1.9568,
"step": 20000
},
{
"epoch": 6.007808100904689,
"eval_loss": 1.983675241470337,
"eval_runtime": 277.6925,
"eval_samples_per_second": 20.199,
"eval_steps_per_second": 5.052,
"step": 20000
},
{
"epoch": 6.01531589023612,
"grad_norm": 1.3004977703094482,
"learning_rate": 2.5378025943381482e-06,
"loss": 1.9195,
"step": 20025
},
{
"epoch": 6.022823679567551,
"grad_norm": 1.2546344995498657,
"learning_rate": 2.519800753151757e-06,
"loss": 1.9527,
"step": 20050
},
{
"epoch": 6.030331468898983,
"grad_norm": 1.0866812467575073,
"learning_rate": 2.501850087040448e-06,
"loss": 1.937,
"step": 20075
},
{
"epoch": 6.037839258230414,
"grad_norm": 1.1754050254821777,
"learning_rate": 2.4839507804600274e-06,
"loss": 1.8801,
"step": 20100
},
{
"epoch": 6.037839258230414,
"eval_loss": 1.983474850654602,
"eval_runtime": 244.4023,
"eval_samples_per_second": 22.95,
"eval_steps_per_second": 5.741,
"step": 20100
},
{
"epoch": 6.045347047561846,
"grad_norm": 1.3076649904251099,
"learning_rate": 2.466103017338552e-06,
"loss": 1.9264,
"step": 20125
},
{
"epoch": 6.052854836893276,
"grad_norm": 1.3242402076721191,
"learning_rate": 2.448306981074428e-06,
"loss": 1.9262,
"step": 20150
},
{
"epoch": 6.060362626224708,
"grad_norm": 1.0890467166900635,
"learning_rate": 2.4305628545345394e-06,
"loss": 1.9743,
"step": 20175
},
{
"epoch": 6.06787041555614,
"grad_norm": 1.1457139253616333,
"learning_rate": 2.412870820052353e-06,
"loss": 1.9558,
"step": 20200
},
{
"epoch": 6.06787041555614,
"eval_loss": 1.983147144317627,
"eval_runtime": 244.7147,
"eval_samples_per_second": 22.921,
"eval_steps_per_second": 5.733,
"step": 20200
},
{
"epoch": 6.075378204887571,
"grad_norm": 1.1762498617172241,
"learning_rate": 2.395231059426055e-06,
"loss": 1.9198,
"step": 20225
},
{
"epoch": 6.082885994219002,
"grad_norm": 1.1638132333755493,
"learning_rate": 2.3776437539166825e-06,
"loss": 1.9397,
"step": 20250
},
{
"epoch": 6.090393783550433,
"grad_norm": 1.2441715002059937,
"learning_rate": 2.3601090842462575e-06,
"loss": 1.9676,
"step": 20275
},
{
"epoch": 6.097901572881865,
"grad_norm": 1.1457374095916748,
"learning_rate": 2.342627230595929e-06,
"loss": 1.9574,
"step": 20300
},
{
"epoch": 6.097901572881865,
"eval_loss": 1.9833580255508423,
"eval_runtime": 322.7704,
"eval_samples_per_second": 17.378,
"eval_steps_per_second": 4.347,
"step": 20300
},
{
"epoch": 6.1054093622132966,
"grad_norm": 1.2807676792144775,
"learning_rate": 2.325198372604132e-06,
"loss": 1.91,
"step": 20325
},
{
"epoch": 6.112917151544727,
"grad_norm": 1.1415411233901978,
"learning_rate": 2.3078226893647254e-06,
"loss": 1.9255,
"step": 20350
},
{
"epoch": 6.120424940876159,
"grad_norm": 1.1930123567581177,
"learning_rate": 2.290500359425165e-06,
"loss": 1.898,
"step": 20375
},
{
"epoch": 6.127932730207591,
"grad_norm": 1.1319756507873535,
"learning_rate": 2.2732315607846606e-06,
"loss": 1.9043,
"step": 20400
},
{
"epoch": 6.127932730207591,
"eval_loss": 1.9833526611328125,
"eval_runtime": 244.7015,
"eval_samples_per_second": 22.922,
"eval_steps_per_second": 5.734,
"step": 20400
},
{
"epoch": 6.135440519539022,
"grad_norm": 1.197733759880066,
"learning_rate": 2.25601647089235e-06,
"loss": 1.9325,
"step": 20425
},
{
"epoch": 6.142948308870453,
"grad_norm": 1.1803226470947266,
"learning_rate": 2.238855266645473e-06,
"loss": 1.9357,
"step": 20450
},
{
"epoch": 6.150456098201884,
"grad_norm": 1.2374463081359863,
"learning_rate": 2.2217481243875666e-06,
"loss": 1.9071,
"step": 20475
},
{
"epoch": 6.157963887533316,
"grad_norm": 1.178080439567566,
"learning_rate": 2.2046952199066323e-06,
"loss": 1.936,
"step": 20500
},
{
"epoch": 6.157963887533316,
"eval_loss": 1.9832499027252197,
"eval_runtime": 244.9473,
"eval_samples_per_second": 22.899,
"eval_steps_per_second": 5.728,
"step": 20500
},
{
"epoch": 6.1654716768647475,
"grad_norm": 1.1624387502670288,
"learning_rate": 2.1876967284333436e-06,
"loss": 1.9722,
"step": 20525
},
{
"epoch": 6.172979466196178,
"grad_norm": 1.2391788959503174,
"learning_rate": 2.170752824639242e-06,
"loss": 1.971,
"step": 20550
},
{
"epoch": 6.18048725552761,
"grad_norm": 1.183759331703186,
"learning_rate": 2.153863682634941e-06,
"loss": 1.9717,
"step": 20575
},
{
"epoch": 6.187995044859041,
"grad_norm": 1.164625644683838,
"learning_rate": 2.137029475968338e-06,
"loss": 1.9668,
"step": 20600
},
{
"epoch": 6.187995044859041,
"eval_loss": 1.982852578163147,
"eval_runtime": 244.5369,
"eval_samples_per_second": 22.937,
"eval_steps_per_second": 5.737,
"step": 20600
},
{
"epoch": 6.1955028341904725,
"grad_norm": 1.2210159301757812,
"learning_rate": 2.1209204813122366e-06,
"loss": 1.9451,
"step": 20625
},
{
"epoch": 6.203010623521904,
"grad_norm": 1.2201151847839355,
"learning_rate": 2.104194449172132e-06,
"loss": 1.926,
"step": 20650
},
{
"epoch": 6.210518412853335,
"grad_norm": 1.2492685317993164,
"learning_rate": 2.0875238627562834e-06,
"loss": 1.928,
"step": 20675
},
{
"epoch": 6.218026202184767,
"grad_norm": 0.9965053796768188,
"learning_rate": 2.0709088933667766e-06,
"loss": 1.9374,
"step": 20700
},
{
"epoch": 6.218026202184767,
"eval_loss": 1.9826812744140625,
"eval_runtime": 244.5669,
"eval_samples_per_second": 22.934,
"eval_steps_per_second": 5.737,
"step": 20700
},
{
"epoch": 6.2255339915161985,
"grad_norm": 1.0911799669265747,
"learning_rate": 2.0543497117341904e-06,
"loss": 1.9361,
"step": 20725
},
{
"epoch": 6.233041780847629,
"grad_norm": 1.277250051498413,
"learning_rate": 2.0378464880158453e-06,
"loss": 1.9285,
"step": 20750
},
{
"epoch": 6.240549570179061,
"grad_norm": 1.1859968900680542,
"learning_rate": 2.0213993917940577e-06,
"loss": 1.9531,
"step": 20775
},
{
"epoch": 6.248057359510492,
"grad_norm": 1.2009955644607544,
"learning_rate": 2.0050085920743904e-06,
"loss": 1.9415,
"step": 20800
},
{
"epoch": 6.248057359510492,
"eval_loss": 1.9828299283981323,
"eval_runtime": 244.6515,
"eval_samples_per_second": 22.926,
"eval_steps_per_second": 5.735,
"step": 20800
},
{
"epoch": 6.2555651488419235,
"grad_norm": 1.23262357711792,
"learning_rate": 1.9886742572839227e-06,
"loss": 1.9466,
"step": 20825
},
{
"epoch": 6.263072938173355,
"grad_norm": 1.1354538202285767,
"learning_rate": 1.9723965552695134e-06,
"loss": 1.9538,
"step": 20850
},
{
"epoch": 6.270580727504786,
"grad_norm": 1.2842826843261719,
"learning_rate": 1.956175653296082e-06,
"loss": 1.9547,
"step": 20875
},
{
"epoch": 6.278088516836218,
"grad_norm": 1.2268083095550537,
"learning_rate": 1.9400117180448872e-06,
"loss": 1.9535,
"step": 20900
},
{
"epoch": 6.278088516836218,
"eval_loss": 1.9827969074249268,
"eval_runtime": 244.6424,
"eval_samples_per_second": 22.927,
"eval_steps_per_second": 5.735,
"step": 20900
},
{
"epoch": 6.2855963061676485,
"grad_norm": 1.1584311723709106,
"learning_rate": 1.923904915611814e-06,
"loss": 1.9903,
"step": 20925
},
{
"epoch": 6.29310409549908,
"grad_norm": 1.1765952110290527,
"learning_rate": 1.9078554115056657e-06,
"loss": 1.9313,
"step": 20950
},
{
"epoch": 6.300611884830512,
"grad_norm": 1.0743718147277832,
"learning_rate": 1.8918633706464663e-06,
"loss": 1.937,
"step": 20975
},
{
"epoch": 6.308119674161943,
"grad_norm": 1.1020997762680054,
"learning_rate": 1.8759289573637645e-06,
"loss": 1.9505,
"step": 21000
},
{
"epoch": 6.308119674161943,
"eval_loss": 1.9827996492385864,
"eval_runtime": 244.7905,
"eval_samples_per_second": 22.913,
"eval_steps_per_second": 5.731,
"step": 21000
},
{
"epoch": 6.3156274634933744,
"grad_norm": 1.2609679698944092,
"learning_rate": 1.8600523353949437e-06,
"loss": 1.9424,
"step": 21025
},
{
"epoch": 6.323135252824806,
"grad_norm": 1.2550972700119019,
"learning_rate": 1.8442336678835417e-06,
"loss": 1.9284,
"step": 21050
},
{
"epoch": 6.330643042156237,
"grad_norm": 1.21172297000885,
"learning_rate": 1.8284731173775695e-06,
"loss": 1.9422,
"step": 21075
},
{
"epoch": 6.338150831487669,
"grad_norm": 1.2744083404541016,
"learning_rate": 1.8127708458278532e-06,
"loss": 1.9512,
"step": 21100
},
{
"epoch": 6.338150831487669,
"eval_loss": 1.9828649759292603,
"eval_runtime": 244.7295,
"eval_samples_per_second": 22.919,
"eval_steps_per_second": 5.733,
"step": 21100
},
{
"epoch": 6.3456586208190995,
"grad_norm": 1.006986141204834,
"learning_rate": 1.7971270145863531e-06,
"loss": 1.9737,
"step": 21125
},
{
"epoch": 6.353166410150531,
"grad_norm": 1.1543078422546387,
"learning_rate": 1.7815417844045175e-06,
"loss": 1.9688,
"step": 21150
},
{
"epoch": 6.360674199481963,
"grad_norm": 1.2171674966812134,
"learning_rate": 1.7660153154316258e-06,
"loss": 1.9549,
"step": 21175
},
{
"epoch": 6.368181988813394,
"grad_norm": 1.1868822574615479,
"learning_rate": 1.7505477672131454e-06,
"loss": 1.9467,
"step": 21200
},
{
"epoch": 6.368181988813394,
"eval_loss": 1.9822328090667725,
"eval_runtime": 244.9166,
"eval_samples_per_second": 22.902,
"eval_steps_per_second": 5.728,
"step": 21200
},
{
"epoch": 6.375689778144825,
"grad_norm": 1.2831307649612427,
"learning_rate": 1.7351392986890915e-06,
"loss": 1.9572,
"step": 21225
},
{
"epoch": 6.383197567476256,
"grad_norm": 1.2353671789169312,
"learning_rate": 1.7197900681923927e-06,
"loss": 1.9286,
"step": 21250
},
{
"epoch": 6.390705356807688,
"grad_norm": 1.2204623222351074,
"learning_rate": 1.7045002334472654e-06,
"loss": 1.959,
"step": 21275
},
{
"epoch": 6.39821314613912,
"grad_norm": 1.3212610483169556,
"learning_rate": 1.689269951567592e-06,
"loss": 1.9591,
"step": 21300
},
{
"epoch": 6.39821314613912,
"eval_loss": 1.9822728633880615,
"eval_runtime": 244.7907,
"eval_samples_per_second": 22.913,
"eval_steps_per_second": 5.731,
"step": 21300
},
{
"epoch": 6.40572093547055,
"grad_norm": 1.1899783611297607,
"learning_rate": 1.674099379055308e-06,
"loss": 1.9496,
"step": 21325
},
{
"epoch": 6.413228724801982,
"grad_norm": 1.1367979049682617,
"learning_rate": 1.6589886717987917e-06,
"loss": 1.9283,
"step": 21350
},
{
"epoch": 6.420736514133413,
"grad_norm": 1.343583106994629,
"learning_rate": 1.6439379850712633e-06,
"loss": 1.9282,
"step": 21375
},
{
"epoch": 6.428244303464845,
"grad_norm": 1.0660362243652344,
"learning_rate": 1.6289474735291935e-06,
"loss": 1.9577,
"step": 21400
},
{
"epoch": 6.428244303464845,
"eval_loss": 1.9821466207504272,
"eval_runtime": 245.3705,
"eval_samples_per_second": 22.859,
"eval_steps_per_second": 5.718,
"step": 21400
},
{
"epoch": 6.435752092796276,
"grad_norm": 1.1740394830703735,
"learning_rate": 1.6140172912107054e-06,
"loss": 1.9397,
"step": 21425
},
{
"epoch": 6.443259882127707,
"grad_norm": 1.2688024044036865,
"learning_rate": 1.5991475915339973e-06,
"loss": 1.9066,
"step": 21450
},
{
"epoch": 6.450767671459139,
"grad_norm": 1.2636834383010864,
"learning_rate": 1.5843385272957686e-06,
"loss": 1.9337,
"step": 21475
},
{
"epoch": 6.458275460790571,
"grad_norm": 1.207352638244629,
"learning_rate": 1.5695902506696439e-06,
"loss": 1.9523,
"step": 21500
},
{
"epoch": 6.458275460790571,
"eval_loss": 1.982376217842102,
"eval_runtime": 245.0395,
"eval_samples_per_second": 22.89,
"eval_steps_per_second": 5.726,
"step": 21500
},
{
"epoch": 6.465783250122001,
"grad_norm": 1.1533434391021729,
"learning_rate": 1.5549029132046123e-06,
"loss": 1.9335,
"step": 21525
},
{
"epoch": 6.473291039453433,
"grad_norm": 1.138137936592102,
"learning_rate": 1.5402766658234704e-06,
"loss": 1.9457,
"step": 21550
},
{
"epoch": 6.480798828784864,
"grad_norm": 1.1742804050445557,
"learning_rate": 1.5257116588212709e-06,
"loss": 1.9303,
"step": 21575
},
{
"epoch": 6.488306618116296,
"grad_norm": 1.1066967248916626,
"learning_rate": 1.511208041863778e-06,
"loss": 1.9251,
"step": 21600
},
{
"epoch": 6.488306618116296,
"eval_loss": 1.9820733070373535,
"eval_runtime": 244.8749,
"eval_samples_per_second": 22.906,
"eval_steps_per_second": 5.729,
"step": 21600
},
{
"epoch": 6.495814407447727,
"grad_norm": 1.1347085237503052,
"learning_rate": 1.4967659639859308e-06,
"loss": 1.9311,
"step": 21625
},
{
"epoch": 6.503322196779158,
"grad_norm": 1.2577033042907715,
"learning_rate": 1.4823855735903083e-06,
"loss": 1.9354,
"step": 21650
},
{
"epoch": 6.51082998611059,
"grad_norm": 1.1945990324020386,
"learning_rate": 1.468067018445608e-06,
"loss": 1.9046,
"step": 21675
},
{
"epoch": 6.518337775442021,
"grad_norm": 1.195004940032959,
"learning_rate": 1.4538104456851294e-06,
"loss": 1.9374,
"step": 21700
},
{
"epoch": 6.518337775442021,
"eval_loss": 1.9817756414413452,
"eval_runtime": 244.4709,
"eval_samples_per_second": 22.943,
"eval_steps_per_second": 5.739,
"step": 21700
},
{
"epoch": 6.525845564773452,
"grad_norm": 1.289342999458313,
"learning_rate": 1.4396160018052555e-06,
"loss": 1.9201,
"step": 21725
},
{
"epoch": 6.533353354104884,
"grad_norm": 1.231141209602356,
"learning_rate": 1.4254838326639514e-06,
"loss": 1.9527,
"step": 21750
},
{
"epoch": 6.540861143436315,
"grad_norm": 1.3896465301513672,
"learning_rate": 1.4114140834792666e-06,
"loss": 1.9347,
"step": 21775
},
{
"epoch": 6.548368932767747,
"grad_norm": 1.2049739360809326,
"learning_rate": 1.3974068988278402e-06,
"loss": 1.969,
"step": 21800
},
{
"epoch": 6.548368932767747,
"eval_loss": 1.9819016456604004,
"eval_runtime": 244.8747,
"eval_samples_per_second": 22.906,
"eval_steps_per_second": 5.729,
"step": 21800
},
{
"epoch": 6.555876722099178,
"grad_norm": 1.2919905185699463,
"learning_rate": 1.3834624226434162e-06,
"loss": 1.9555,
"step": 21825
},
{
"epoch": 6.563384511430609,
"grad_norm": 1.2100296020507812,
"learning_rate": 1.3695807982153666e-06,
"loss": 1.9239,
"step": 21850
},
{
"epoch": 6.570892300762041,
"grad_norm": 1.1903220415115356,
"learning_rate": 1.3557621681872142e-06,
"loss": 1.9201,
"step": 21875
},
{
"epoch": 6.578400090093472,
"grad_norm": 1.1797667741775513,
"learning_rate": 1.3420066745551715e-06,
"loss": 1.9418,
"step": 21900
},
{
"epoch": 6.578400090093472,
"eval_loss": 1.9816147089004517,
"eval_runtime": 244.5673,
"eval_samples_per_second": 22.934,
"eval_steps_per_second": 5.737,
"step": 21900
},
{
"epoch": 6.585907879424903,
"grad_norm": 1.155019760131836,
"learning_rate": 1.3283144586666803e-06,
"loss": 1.9466,
"step": 21925
},
{
"epoch": 6.593415668756335,
"grad_norm": 1.2090644836425781,
"learning_rate": 1.314685661218958e-06,
"loss": 1.9444,
"step": 21950
},
{
"epoch": 6.600923458087766,
"grad_norm": 1.1589152812957764,
"learning_rate": 1.3011204222575515e-06,
"loss": 1.9282,
"step": 21975
},
{
"epoch": 6.6084312474191975,
"grad_norm": 1.3078739643096924,
"learning_rate": 1.287618881174899e-06,
"loss": 1.9273,
"step": 22000
},
{
"epoch": 6.6084312474191975,
"eval_loss": 1.9818423986434937,
"eval_runtime": 244.6939,
"eval_samples_per_second": 22.923,
"eval_steps_per_second": 5.734,
"step": 22000
},
{
"epoch": 6.615939036750628,
"grad_norm": 1.070833683013916,
"learning_rate": 1.2741811767089034e-06,
"loss": 1.9397,
"step": 22025
},
{
"epoch": 6.62344682608206,
"grad_norm": 1.1993708610534668,
"learning_rate": 1.2608074469414949e-06,
"loss": 1.959,
"step": 22050
},
{
"epoch": 6.630954615413492,
"grad_norm": 1.1407665014266968,
"learning_rate": 1.2474978292972209e-06,
"loss": 1.9474,
"step": 22075
},
{
"epoch": 6.6384624047449226,
"grad_norm": 1.2709163427352905,
"learning_rate": 1.2342524605418293e-06,
"loss": 1.9464,
"step": 22100
},
{
"epoch": 6.6384624047449226,
"eval_loss": 1.9815821647644043,
"eval_runtime": 244.5724,
"eval_samples_per_second": 22.934,
"eval_steps_per_second": 5.737,
"step": 22100
},
{
"epoch": 6.645970194076354,
"grad_norm": 1.1849255561828613,
"learning_rate": 1.221071476780867e-06,
"loss": 1.9201,
"step": 22125
},
{
"epoch": 6.653477983407786,
"grad_norm": 1.2153717279434204,
"learning_rate": 1.207955013458281e-06,
"loss": 1.9624,
"step": 22150
},
{
"epoch": 6.660985772739217,
"grad_norm": 1.1668004989624023,
"learning_rate": 1.1949032053550208e-06,
"loss": 1.9304,
"step": 22175
},
{
"epoch": 6.6684935620706485,
"grad_norm": 1.1738938093185425,
"learning_rate": 1.1819161865876618e-06,
"loss": 1.9117,
"step": 22200
},
{
"epoch": 6.6684935620706485,
"eval_loss": 1.981676697731018,
"eval_runtime": 244.4832,
"eval_samples_per_second": 22.942,
"eval_steps_per_second": 5.739,
"step": 22200
},
{
"epoch": 6.676001351402079,
"grad_norm": 1.191311001777649,
"learning_rate": 1.1689940906070203e-06,
"loss": 1.9211,
"step": 22225
},
{
"epoch": 6.683509140733511,
"grad_norm": 1.1772605180740356,
"learning_rate": 1.1561370501967871e-06,
"loss": 1.933,
"step": 22250
},
{
"epoch": 6.691016930064943,
"grad_norm": 1.3537640571594238,
"learning_rate": 1.1433451974721602e-06,
"loss": 1.9239,
"step": 22275
},
{
"epoch": 6.6985247193963735,
"grad_norm": 1.1915578842163086,
"learning_rate": 1.1306186638784846e-06,
"loss": 1.9429,
"step": 22300
},
{
"epoch": 6.6985247193963735,
"eval_loss": 1.981661081314087,
"eval_runtime": 244.4697,
"eval_samples_per_second": 22.944,
"eval_steps_per_second": 5.739,
"step": 22300
},
{
"epoch": 6.706032508727805,
"grad_norm": 1.0470706224441528,
"learning_rate": 1.1179575801899122e-06,
"loss": 1.9428,
"step": 22325
},
{
"epoch": 6.713540298059236,
"grad_norm": 1.2210986614227295,
"learning_rate": 1.1053620765080458e-06,
"loss": 1.9551,
"step": 22350
},
{
"epoch": 6.721048087390668,
"grad_norm": 1.2881091833114624,
"learning_rate": 1.0928322822606064e-06,
"loss": 1.9365,
"step": 22375
},
{
"epoch": 6.728555876722099,
"grad_norm": 1.2427425384521484,
"learning_rate": 1.0803683262001066e-06,
"loss": 1.9491,
"step": 22400
},
{
"epoch": 6.728555876722099,
"eval_loss": 1.9814238548278809,
"eval_runtime": 244.3102,
"eval_samples_per_second": 22.959,
"eval_steps_per_second": 5.743,
"step": 22400
},
{
"epoch": 6.73606366605353,
"grad_norm": 1.2582703828811646,
"learning_rate": 1.067970336402524e-06,
"loss": 1.9398,
"step": 22425
},
{
"epoch": 6.743571455384962,
"grad_norm": 1.2919580936431885,
"learning_rate": 1.055638440265983e-06,
"loss": 1.9626,
"step": 22450
},
{
"epoch": 6.751079244716394,
"grad_norm": 1.2642123699188232,
"learning_rate": 1.0433727645094574e-06,
"loss": 1.9278,
"step": 22475
},
{
"epoch": 6.7585870340478245,
"grad_norm": 1.1762003898620605,
"learning_rate": 1.0311734351714533e-06,
"loss": 1.9289,
"step": 22500
},
{
"epoch": 6.7585870340478245,
"eval_loss": 1.981581687927246,
"eval_runtime": 245.2807,
"eval_samples_per_second": 22.868,
"eval_steps_per_second": 5.72,
"step": 22500
},
{
"epoch": 6.766094823379256,
"grad_norm": 1.1311445236206055,
"learning_rate": 1.0190405776087183e-06,
"loss": 1.9347,
"step": 22525
},
{
"epoch": 6.773602612710687,
"grad_norm": 1.2351762056350708,
"learning_rate": 1.0069743164949595e-06,
"loss": 1.9398,
"step": 22550
},
{
"epoch": 6.781110402042119,
"grad_norm": 1.097221851348877,
"learning_rate": 9.949747758195568e-07,
"loss": 1.9527,
"step": 22575
},
{
"epoch": 6.78861819137355,
"grad_norm": 1.2283989191055298,
"learning_rate": 9.830420788862903e-07,
"loss": 1.9374,
"step": 22600
},
{
"epoch": 6.78861819137355,
"eval_loss": 1.9813563823699951,
"eval_runtime": 244.58,
"eval_samples_per_second": 22.933,
"eval_steps_per_second": 5.736,
"step": 22600
},
{
"epoch": 6.796125980704981,
"grad_norm": 1.28317391872406,
"learning_rate": 9.71176348312076e-07,
"loss": 1.9048,
"step": 22625
},
{
"epoch": 6.803633770036413,
"grad_norm": 1.3111134767532349,
"learning_rate": 9.593777060257004e-07,
"loss": 1.9211,
"step": 22650
},
{
"epoch": 6.811141559367844,
"grad_norm": 1.1793931722640991,
"learning_rate": 9.476462732665697e-07,
"loss": 1.928,
"step": 22675
},
{
"epoch": 6.818649348699275,
"grad_norm": 1.111651062965393,
"learning_rate": 9.359821705834662e-07,
"loss": 1.9336,
"step": 22700
},
{
"epoch": 6.818649348699275,
"eval_loss": 1.9812109470367432,
"eval_runtime": 244.4408,
"eval_samples_per_second": 22.946,
"eval_steps_per_second": 5.74,
"step": 22700
},
{
"epoch": 6.826157138030707,
"grad_norm": 1.0364270210266113,
"learning_rate": 9.243855178333066e-07,
"loss": 1.9512,
"step": 22725
},
{
"epoch": 6.833664927362138,
"grad_norm": 1.0628246068954468,
"learning_rate": 9.128564341799139e-07,
"loss": 1.9368,
"step": 22750
},
{
"epoch": 6.84117271669357,
"grad_norm": 1.250428557395935,
"learning_rate": 9.013950380927874e-07,
"loss": 1.9603,
"step": 22775
},
{
"epoch": 6.848680506025001,
"grad_norm": 1.2401816844940186,
"learning_rate": 8.900014473458943e-07,
"loss": 1.9414,
"step": 22800
},
{
"epoch": 6.848680506025001,
"eval_loss": 1.981279969215393,
"eval_runtime": 244.3713,
"eval_samples_per_second": 22.953,
"eval_steps_per_second": 5.741,
"step": 22800
},
{
"epoch": 6.856188295356432,
"grad_norm": 1.2459220886230469,
"learning_rate": 8.78675779016449e-07,
"loss": 1.9228,
"step": 22825
},
{
"epoch": 6.863696084687864,
"grad_norm": 1.0594732761383057,
"learning_rate": 8.674181494837147e-07,
"loss": 1.9627,
"step": 22850
},
{
"epoch": 6.871203874019295,
"grad_norm": 1.1160441637039185,
"learning_rate": 8.5622867442781e-07,
"loss": 1.9599,
"step": 22875
},
{
"epoch": 6.878711663350726,
"grad_norm": 1.4957025051116943,
"learning_rate": 8.451074688285182e-07,
"loss": 1.9485,
"step": 22900
},
{
"epoch": 6.878711663350726,
"eval_loss": 1.9810361862182617,
"eval_runtime": 244.8768,
"eval_samples_per_second": 22.905,
"eval_steps_per_second": 5.729,
"step": 22900
},
{
"epoch": 6.886219452682158,
"grad_norm": 1.3058786392211914,
"learning_rate": 8.340546469641027e-07,
"loss": 1.9092,
"step": 22925
},
{
"epoch": 6.893727242013589,
"grad_norm": 1.2844972610473633,
"learning_rate": 8.23070322410141e-07,
"loss": 1.9442,
"step": 22950
},
{
"epoch": 6.901235031345021,
"grad_norm": 1.291462779045105,
"learning_rate": 8.121546080383474e-07,
"loss": 1.9241,
"step": 22975
},
{
"epoch": 6.908742820676451,
"grad_norm": 1.2580238580703735,
"learning_rate": 8.013076160154187e-07,
"loss": 1.9412,
"step": 23000
},
{
"epoch": 6.908742820676451,
"eval_loss": 1.9810972213745117,
"eval_runtime": 244.8376,
"eval_samples_per_second": 22.909,
"eval_steps_per_second": 5.73,
"step": 23000
},
{
"epoch": 6.916250610007883,
"grad_norm": 1.2016263008117676,
"learning_rate": 7.905294578018824e-07,
"loss": 1.932,
"step": 23025
},
{
"epoch": 6.923758399339315,
"grad_norm": 1.2784329652786255,
"learning_rate": 7.798202441509484e-07,
"loss": 1.9505,
"step": 23050
},
{
"epoch": 6.931266188670746,
"grad_norm": 1.3308900594711304,
"learning_rate": 7.691800851073724e-07,
"loss": 1.9416,
"step": 23075
},
{
"epoch": 6.938773978002177,
"grad_norm": 1.1549803018569946,
"learning_rate": 7.58609090006328e-07,
"loss": 1.9469,
"step": 23100
},
{
"epoch": 6.938773978002177,
"eval_loss": 1.9810361862182617,
"eval_runtime": 244.9865,
"eval_samples_per_second": 22.895,
"eval_steps_per_second": 5.727,
"step": 23100
},
{
"epoch": 6.946281767333609,
"grad_norm": 1.0882563591003418,
"learning_rate": 7.481073674722763e-07,
"loss": 1.9424,
"step": 23125
},
{
"epoch": 6.95378955666504,
"grad_norm": 1.1521430015563965,
"learning_rate": 7.37675025417856e-07,
"loss": 1.9525,
"step": 23150
},
{
"epoch": 6.9612973459964715,
"grad_norm": 1.1167752742767334,
"learning_rate": 7.273121710427738e-07,
"loss": 1.9644,
"step": 23175
},
{
"epoch": 6.968805135327902,
"grad_norm": 1.0596712827682495,
"learning_rate": 7.170189108326941e-07,
"loss": 1.921,
"step": 23200
},
{
"epoch": 6.968805135327902,
"eval_loss": 1.9809205532073975,
"eval_runtime": 244.6054,
"eval_samples_per_second": 22.931,
"eval_steps_per_second": 5.736,
"step": 23200
},
{
"epoch": 6.976312924659334,
"grad_norm": 1.2589685916900635,
"learning_rate": 7.067953505581593e-07,
"loss": 1.948,
"step": 23225
},
{
"epoch": 6.983820713990766,
"grad_norm": 1.0607045888900757,
"learning_rate": 6.966415952734953e-07,
"loss": 1.9632,
"step": 23250
},
{
"epoch": 6.991328503322197,
"grad_norm": 1.2630726099014282,
"learning_rate": 6.86557749315728e-07,
"loss": 1.9264,
"step": 23275
},
{
"epoch": 6.998836292653628,
"grad_norm": 1.1883573532104492,
"learning_rate": 6.765439163035183e-07,
"loss": 1.9428,
"step": 23300
},
{
"epoch": 6.998836292653628,
"eval_loss": 1.9808813333511353,
"eval_runtime": 244.9512,
"eval_samples_per_second": 22.898,
"eval_steps_per_second": 5.728,
"step": 23300
},
{
"epoch": 7.006606854611659,
"grad_norm": 1.3225353956222534,
"learning_rate": 6.666001991360948e-07,
"loss": 2.0098,
"step": 23325
},
{
"epoch": 7.014114643943091,
"grad_norm": 1.1181532144546509,
"learning_rate": 6.567266999921936e-07,
"loss": 1.9435,
"step": 23350
},
{
"epoch": 7.021622433274523,
"grad_norm": 1.167235016822815,
"learning_rate": 6.469235203290125e-07,
"loss": 1.9534,
"step": 23375
},
{
"epoch": 7.0291302226059535,
"grad_norm": 1.1674944162368774,
"learning_rate": 6.371907608811686e-07,
"loss": 1.9374,
"step": 23400
},
{
"epoch": 7.0291302226059535,
"eval_loss": 1.9810993671417236,
"eval_runtime": 244.971,
"eval_samples_per_second": 22.897,
"eval_steps_per_second": 5.727,
"step": 23400
},
{
"epoch": 7.036638011937385,
"grad_norm": 1.2668019533157349,
"learning_rate": 6.275285216596583e-07,
"loss": 1.9401,
"step": 23425
},
{
"epoch": 7.044145801268816,
"grad_norm": 1.2170027494430542,
"learning_rate": 6.179369019508346e-07,
"loss": 1.9334,
"step": 23450
},
{
"epoch": 7.051653590600248,
"grad_norm": 1.1965893507003784,
"learning_rate": 6.084160003153849e-07,
"loss": 1.9103,
"step": 23475
},
{
"epoch": 7.0591613799316795,
"grad_norm": 1.1913440227508545,
"learning_rate": 5.989659145873175e-07,
"loss": 1.9268,
"step": 23500
},
{
"epoch": 7.0591613799316795,
"eval_loss": 1.9811537265777588,
"eval_runtime": 244.4854,
"eval_samples_per_second": 22.942,
"eval_steps_per_second": 5.739,
"step": 23500
},
{
"epoch": 7.06666916926311,
"grad_norm": 1.3365952968597412,
"learning_rate": 5.895867418729561e-07,
"loss": 1.9736,
"step": 23525
},
{
"epoch": 7.074176958594542,
"grad_norm": 1.1637241840362549,
"learning_rate": 5.802785785499434e-07,
"loss": 1.9338,
"step": 23550
},
{
"epoch": 7.081684747925973,
"grad_norm": 1.279487133026123,
"learning_rate": 5.710415202662539e-07,
"loss": 1.9281,
"step": 23575
},
{
"epoch": 7.0891925372574045,
"grad_norm": 1.1029129028320312,
"learning_rate": 5.618756619392048e-07,
"loss": 1.9513,
"step": 23600
},
{
"epoch": 7.0891925372574045,
"eval_loss": 1.9810665845870972,
"eval_runtime": 244.7676,
"eval_samples_per_second": 22.916,
"eval_steps_per_second": 5.732,
"step": 23600
},
{
"epoch": 7.096700326588836,
"grad_norm": 1.3634998798370361,
"learning_rate": 5.527810977544814e-07,
"loss": 1.972,
"step": 23625
},
{
"epoch": 7.104208115920267,
"grad_norm": 1.1559998989105225,
"learning_rate": 5.437579211651739e-07,
"loss": 1.9436,
"step": 23650
},
{
"epoch": 7.111715905251699,
"grad_norm": 1.1781272888183594,
"learning_rate": 5.348062248908126e-07,
"loss": 1.9489,
"step": 23675
},
{
"epoch": 7.11922369458313,
"grad_norm": 1.2078481912612915,
"learning_rate": 5.259261009164179e-07,
"loss": 1.973,
"step": 23700
},
{
"epoch": 7.11922369458313,
"eval_loss": 1.9810374975204468,
"eval_runtime": 244.6697,
"eval_samples_per_second": 22.925,
"eval_steps_per_second": 5.734,
"step": 23700
},
{
"epoch": 7.126731483914561,
"grad_norm": 1.2811923027038574,
"learning_rate": 5.171176404915562e-07,
"loss": 1.9334,
"step": 23725
},
{
"epoch": 7.134239273245993,
"grad_norm": 1.2578486204147339,
"learning_rate": 5.08380934129396e-07,
"loss": 1.9083,
"step": 23750
},
{
"epoch": 7.141747062577424,
"grad_norm": 1.2639051675796509,
"learning_rate": 4.99716071605785e-07,
"loss": 1.9363,
"step": 23775
},
{
"epoch": 7.1492548519088555,
"grad_norm": 1.4398607015609741,
"learning_rate": 4.911231419583228e-07,
"loss": 1.9547,
"step": 23800
},
{
"epoch": 7.1492548519088555,
"eval_loss": 1.9809601306915283,
"eval_runtime": 244.5082,
"eval_samples_per_second": 22.94,
"eval_steps_per_second": 5.738,
"step": 23800
},
{
"epoch": 7.156762641240287,
"grad_norm": 1.1956835985183716,
"learning_rate": 4.826022334854482e-07,
"loss": 1.9501,
"step": 23825
},
{
"epoch": 7.164270430571718,
"grad_norm": 1.192142367362976,
"learning_rate": 4.741534337455333e-07,
"loss": 1.9336,
"step": 23850
},
{
"epoch": 7.17177821990315,
"grad_norm": 1.204443335533142,
"learning_rate": 4.6577682955597804e-07,
"loss": 1.9482,
"step": 23875
},
{
"epoch": 7.1792860092345805,
"grad_norm": 1.205980896949768,
"learning_rate": 4.5747250699232664e-07,
"loss": 1.9229,
"step": 23900
},
{
"epoch": 7.1792860092345805,
"eval_loss": 1.9809165000915527,
"eval_runtime": 244.9164,
"eval_samples_per_second": 22.902,
"eval_steps_per_second": 5.728,
"step": 23900
},
{
"epoch": 7.186793798566012,
"grad_norm": 1.1392741203308105,
"learning_rate": 4.492405513873732e-07,
"loss": 1.9091,
"step": 23925
},
{
"epoch": 7.194301587897444,
"grad_norm": 1.1868606805801392,
"learning_rate": 4.4108104733029506e-07,
"loss": 1.9538,
"step": 23950
},
{
"epoch": 7.201809377228875,
"grad_norm": 1.2095065116882324,
"learning_rate": 4.32994078665776e-07,
"loss": 1.9259,
"step": 23975
},
{
"epoch": 7.209317166560306,
"grad_norm": 1.2137978076934814,
"learning_rate": 4.2497972849314587e-07,
"loss": 1.9086,
"step": 24000
},
{
"epoch": 7.209317166560306,
"eval_loss": 1.9811201095581055,
"eval_runtime": 244.5079,
"eval_samples_per_second": 22.94,
"eval_steps_per_second": 5.738,
"step": 24000
},
{
"epoch": 7.216824955891738,
"grad_norm": 1.3852958679199219,
"learning_rate": 4.170380791655323e-07,
"loss": 1.9304,
"step": 24025
},
{
"epoch": 7.224332745223169,
"grad_norm": 1.1705379486083984,
"learning_rate": 4.0916921228900643e-07,
"loss": 1.9256,
"step": 24050
},
{
"epoch": 7.231840534554601,
"grad_norm": 1.2448861598968506,
"learning_rate": 4.013732087217492e-07,
"loss": 1.9281,
"step": 24075
},
{
"epoch": 7.2393483238860314,
"grad_norm": 1.1513874530792236,
"learning_rate": 3.9365014857322133e-07,
"loss": 1.9121,
"step": 24100
},
{
"epoch": 7.2393483238860314,
"eval_loss": 1.9809224605560303,
"eval_runtime": 244.5719,
"eval_samples_per_second": 22.934,
"eval_steps_per_second": 5.737,
"step": 24100
},
{
"epoch": 7.246856113217463,
"grad_norm": 1.2126648426055908,
"learning_rate": 3.8600011120333483e-07,
"loss": 1.9301,
"step": 24125
},
{
"epoch": 7.254363902548895,
"grad_norm": 1.213840126991272,
"learning_rate": 3.7842317522164274e-07,
"loss": 1.9395,
"step": 24150
},
{
"epoch": 7.261871691880326,
"grad_norm": 1.1836591958999634,
"learning_rate": 3.709194184865314e-07,
"loss": 1.9326,
"step": 24175
},
{
"epoch": 7.269379481211757,
"grad_norm": 1.1955537796020508,
"learning_rate": 3.6348891810441457e-07,
"loss": 1.9385,
"step": 24200
},
{
"epoch": 7.269379481211757,
"eval_loss": 1.9811633825302124,
"eval_runtime": 244.8759,
"eval_samples_per_second": 22.905,
"eval_steps_per_second": 5.729,
"step": 24200
},
{
"epoch": 7.276887270543188,
"grad_norm": 1.0566610097885132,
"learning_rate": 3.5613175042894823e-07,
"loss": 1.9263,
"step": 24225
},
{
"epoch": 7.28439505987462,
"grad_norm": 1.196273922920227,
"learning_rate": 3.4884799106024185e-07,
"loss": 1.905,
"step": 24250
},
{
"epoch": 7.291902849206052,
"grad_norm": 1.1962950229644775,
"learning_rate": 3.4163771484408247e-07,
"loss": 1.9178,
"step": 24275
},
{
"epoch": 7.299410638537482,
"grad_norm": 1.1637682914733887,
"learning_rate": 3.3450099587116533e-07,
"loss": 1.9427,
"step": 24300
},
{
"epoch": 7.299410638537482,
"eval_loss": 1.9810516834259033,
"eval_runtime": 244.4733,
"eval_samples_per_second": 22.943,
"eval_steps_per_second": 5.739,
"step": 24300
},
{
"epoch": 7.306918427868914,
"grad_norm": 1.2521005868911743,
"learning_rate": 3.2743790747633285e-07,
"loss": 1.9469,
"step": 24325
},
{
"epoch": 7.314426217200346,
"grad_norm": 1.241564154624939,
"learning_rate": 3.2044852223782337e-07,
"loss": 1.9265,
"step": 24350
},
{
"epoch": 7.321934006531777,
"grad_norm": 1.2285447120666504,
"learning_rate": 3.135329119765204e-07,
"loss": 1.9296,
"step": 24375
},
{
"epoch": 7.329441795863208,
"grad_norm": 1.1928914785385132,
"learning_rate": 3.0669114775521784e-07,
"loss": 1.9409,
"step": 24400
},
{
"epoch": 7.329441795863208,
"eval_loss": 1.9809722900390625,
"eval_runtime": 244.5739,
"eval_samples_per_second": 22.934,
"eval_steps_per_second": 5.737,
"step": 24400
},
{
"epoch": 7.336949585194639,
"grad_norm": 1.1230217218399048,
"learning_rate": 2.9992329987789004e-07,
"loss": 1.9087,
"step": 24425
},
{
"epoch": 7.344457374526071,
"grad_norm": 1.2940632104873657,
"learning_rate": 2.932294378889672e-07,
"loss": 1.9574,
"step": 24450
},
{
"epoch": 7.3519651638575025,
"grad_norm": 1.3414610624313354,
"learning_rate": 2.8660963057262427e-07,
"loss": 1.945,
"step": 24475
},
{
"epoch": 7.359472953188933,
"grad_norm": 1.2023084163665771,
"learning_rate": 2.800639459520693e-07,
"loss": 1.9368,
"step": 24500
},
{
"epoch": 7.359472953188933,
"eval_loss": 1.9809428453445435,
"eval_runtime": 244.6252,
"eval_samples_per_second": 22.929,
"eval_steps_per_second": 5.735,
"step": 24500
},
{
"epoch": 7.366980742520365,
"grad_norm": 1.3090476989746094,
"learning_rate": 2.7359245128884935e-07,
"loss": 1.9401,
"step": 24525
},
{
"epoch": 7.374488531851796,
"grad_norm": 1.1625995635986328,
"learning_rate": 2.6719521308215644e-07,
"loss": 1.9421,
"step": 24550
},
{
"epoch": 7.381996321183228,
"grad_norm": 1.1445448398590088,
"learning_rate": 2.608722970681446e-07,
"loss": 1.9201,
"step": 24575
},
{
"epoch": 7.389504110514659,
"grad_norm": 1.2008908987045288,
"learning_rate": 2.5462376821925453e-07,
"loss": 1.9368,
"step": 24600
},
{
"epoch": 7.389504110514659,
"eval_loss": 1.9809165000915527,
"eval_runtime": 245.2653,
"eval_samples_per_second": 22.869,
"eval_steps_per_second": 5.72,
"step": 24600
},
{
"epoch": 7.39701189984609,
"grad_norm": 1.1442121267318726,
"learning_rate": 2.484496907435452e-07,
"loss": 1.9356,
"step": 24625
},
{
"epoch": 7.404519689177522,
"grad_norm": 1.194258689880371,
"learning_rate": 2.42350128084039e-07,
"loss": 1.957,
"step": 24650
},
{
"epoch": 7.4120274785089535,
"grad_norm": 1.2677561044692993,
"learning_rate": 2.3632514291806185e-07,
"loss": 1.9405,
"step": 24675
},
{
"epoch": 7.419535267840384,
"grad_norm": 1.1491544246673584,
"learning_rate": 2.3037479715660337e-07,
"loss": 1.921,
"step": 24700
},
{
"epoch": 7.419535267840384,
"eval_loss": 1.9808403253555298,
"eval_runtime": 245.1407,
"eval_samples_per_second": 22.881,
"eval_steps_per_second": 5.723,
"step": 24700
},
{
"epoch": 7.427043057171816,
"grad_norm": 1.2121251821517944,
"learning_rate": 2.2449915194368258e-07,
"loss": 1.9255,
"step": 24725
},
{
"epoch": 7.434550846503247,
"grad_norm": 1.2214640378952026,
"learning_rate": 2.1869826765571505e-07,
"loss": 1.9289,
"step": 24750
},
{
"epoch": 7.4420586358346785,
"grad_norm": 1.3879413604736328,
"learning_rate": 2.1297220390089662e-07,
"loss": 1.9396,
"step": 24775
},
{
"epoch": 7.44956642516611,
"grad_norm": 1.1950923204421997,
"learning_rate": 2.0732101951858816e-07,
"loss": 1.926,
"step": 24800
},
{
"epoch": 7.44956642516611,
"eval_loss": 1.9808101654052734,
"eval_runtime": 244.5077,
"eval_samples_per_second": 22.94,
"eval_steps_per_second": 5.738,
"step": 24800
},
{
"epoch": 7.457074214497541,
"grad_norm": 1.1813709735870361,
"learning_rate": 2.0174477257871277e-07,
"loss": 1.9482,
"step": 24825
},
{
"epoch": 7.464582003828973,
"grad_norm": 1.2802395820617676,
"learning_rate": 1.9624352038115773e-07,
"loss": 1.9456,
"step": 24850
},
{
"epoch": 7.472089793160404,
"grad_norm": 1.1578987836837769,
"learning_rate": 1.9103292580586406e-07,
"loss": 1.9607,
"step": 24875
},
{
"epoch": 7.479597582491835,
"grad_norm": 1.2001193761825562,
"learning_rate": 1.856788265674736e-07,
"loss": 1.8946,
"step": 24900
},
{
"epoch": 7.479597582491835,
"eval_loss": 1.9809141159057617,
"eval_runtime": 245.3503,
"eval_samples_per_second": 22.861,
"eval_steps_per_second": 5.718,
"step": 24900
},
{
"epoch": 7.487105371823267,
"grad_norm": 1.0772101879119873,
"learning_rate": 1.8039988716037763e-07,
"loss": 1.9356,
"step": 24925
},
{
"epoch": 7.494613161154698,
"grad_norm": 1.2263306379318237,
"learning_rate": 1.7519616182942067e-07,
"loss": 1.9384,
"step": 24950
},
{
"epoch": 7.5021209504861295,
"grad_norm": 1.1009598970413208,
"learning_rate": 1.7006770404656534e-07,
"loss": 1.9524,
"step": 24975
},
{
"epoch": 7.509628739817561,
"grad_norm": 1.192656397819519,
"learning_rate": 1.6501456651034808e-07,
"loss": 1.9367,
"step": 25000
},
{
"epoch": 7.509628739817561,
"eval_loss": 1.9808765649795532,
"eval_runtime": 244.5611,
"eval_samples_per_second": 22.935,
"eval_steps_per_second": 5.737,
"step": 25000
},
{
"epoch": 7.517136529148992,
"grad_norm": 1.0909212827682495,
"learning_rate": 1.6003680114533763e-07,
"loss": 1.906,
"step": 25025
},
{
"epoch": 7.524644318480424,
"grad_norm": 1.217670202255249,
"learning_rate": 1.5513445910159823e-07,
"loss": 1.9117,
"step": 25050
},
{
"epoch": 7.5321521078118545,
"grad_norm": 1.1752556562423706,
"learning_rate": 1.503075907541665e-07,
"loss": 1.9262,
"step": 25075
},
{
"epoch": 7.539659897143286,
"grad_norm": 1.1952420473098755,
"learning_rate": 1.455562457025353e-07,
"loss": 1.9484,
"step": 25100
},
{
"epoch": 7.539659897143286,
"eval_loss": 1.9809269905090332,
"eval_runtime": 244.6411,
"eval_samples_per_second": 22.927,
"eval_steps_per_second": 5.735,
"step": 25100
},
{
"epoch": 7.547167686474717,
"grad_norm": 1.1993337869644165,
"learning_rate": 1.4088047277013987e-07,
"loss": 1.9473,
"step": 25125
},
{
"epoch": 7.554675475806149,
"grad_norm": 1.2464344501495361,
"learning_rate": 1.3628032000386008e-07,
"loss": 1.9417,
"step": 25150
},
{
"epoch": 7.56218326513758,
"grad_norm": 1.3131685256958008,
"learning_rate": 1.3175583467352316e-07,
"loss": 1.9431,
"step": 25175
},
{
"epoch": 7.569691054469011,
"grad_norm": 1.255356788635254,
"learning_rate": 1.2730706327142155e-07,
"loss": 1.9323,
"step": 25200
},
{
"epoch": 7.569691054469011,
"eval_loss": 1.9808063507080078,
"eval_runtime": 244.7486,
"eval_samples_per_second": 22.917,
"eval_steps_per_second": 5.732,
"step": 25200
},
{
"epoch": 7.577198843800443,
"grad_norm": 1.2939985990524292,
"learning_rate": 1.2293405151183184e-07,
"loss": 1.9484,
"step": 25225
},
{
"epoch": 7.584706633131875,
"grad_norm": 1.2347224950790405,
"learning_rate": 1.1863684433054994e-07,
"loss": 1.9408,
"step": 25250
},
{
"epoch": 7.5922144224633055,
"grad_norm": 1.142849087715149,
"learning_rate": 1.1441548588442152e-07,
"loss": 1.9449,
"step": 25275
},
{
"epoch": 7.599722211794737,
"grad_norm": 1.2810004949569702,
"learning_rate": 1.1027001955089572e-07,
"loss": 1.9499,
"step": 25300
},
{
"epoch": 7.599722211794737,
"eval_loss": 1.9808244705200195,
"eval_runtime": 244.5894,
"eval_samples_per_second": 22.932,
"eval_steps_per_second": 5.736,
"step": 25300
},
{
"epoch": 7.607230001126169,
"grad_norm": 1.2134476900100708,
"learning_rate": 1.0620048792757464e-07,
"loss": 1.9384,
"step": 25325
},
{
"epoch": 7.6147377904576,
"grad_norm": 1.1853543519973755,
"learning_rate": 1.0220693283177957e-07,
"loss": 1.945,
"step": 25350
},
{
"epoch": 7.622245579789031,
"grad_norm": 1.2892330884933472,
"learning_rate": 9.82893953001171e-08,
"loss": 1.9541,
"step": 25375
},
{
"epoch": 7.629753369120462,
"grad_norm": 1.139492392539978,
"learning_rate": 9.444791558806121e-08,
"loss": 1.9462,
"step": 25400
},
{
"epoch": 7.629753369120462,
"eval_loss": 1.9807994365692139,
"eval_runtime": 244.5984,
"eval_samples_per_second": 22.931,
"eval_steps_per_second": 5.736,
"step": 25400
},
{
"epoch": 7.637261158451894,
"grad_norm": 1.1218518018722534,
"learning_rate": 9.068253316953684e-08,
"loss": 1.926,
"step": 25425
},
{
"epoch": 7.644768947783325,
"grad_norm": 1.1967252492904663,
"learning_rate": 8.699328673651613e-08,
"loss": 1.921,
"step": 25450
},
{
"epoch": 7.652276737114756,
"grad_norm": 1.1184207201004028,
"learning_rate": 8.338021419861868e-08,
"loss": 1.9127,
"step": 25475
},
{
"epoch": 7.659784526446188,
"grad_norm": 1.2853116989135742,
"learning_rate": 7.984335268272441e-08,
"loss": 1.9373,
"step": 25500
},
{
"epoch": 7.659784526446188,
"eval_loss": 1.9808040857315063,
"eval_runtime": 244.5622,
"eval_samples_per_second": 22.935,
"eval_steps_per_second": 5.737,
"step": 25500
},
{
"epoch": 7.667292315777619,
"grad_norm": 1.1911348104476929,
"learning_rate": 7.638273853259131e-08,
"loss": 1.9449,
"step": 25525
},
{
"epoch": 7.674800105109051,
"grad_norm": 1.2387864589691162,
"learning_rate": 7.299840730847995e-08,
"loss": 1.9314,
"step": 25550
},
{
"epoch": 7.682307894440482,
"grad_norm": 1.2212029695510864,
"learning_rate": 6.969039378679292e-08,
"loss": 1.9205,
"step": 25575
},
{
"epoch": 7.689815683771913,
"grad_norm": 1.2776437997817993,
"learning_rate": 6.645873195971098e-08,
"loss": 1.984,
"step": 25600
},
{
"epoch": 7.689815683771913,
"eval_loss": 1.9808552265167236,
"eval_runtime": 244.7299,
"eval_samples_per_second": 22.919,
"eval_steps_per_second": 5.733,
"step": 25600
},
{
"epoch": 7.697323473103345,
"grad_norm": 1.2346168756484985,
"learning_rate": 6.330345503484908e-08,
"loss": 1.9367,
"step": 25625
},
{
"epoch": 7.704831262434777,
"grad_norm": 1.2425425052642822,
"learning_rate": 6.02245954349126e-08,
"loss": 1.9449,
"step": 25650
},
{
"epoch": 7.712339051766207,
"grad_norm": 1.1887537240982056,
"learning_rate": 5.722218479736502e-08,
"loss": 1.9207,
"step": 25675
},
{
"epoch": 7.719846841097639,
"grad_norm": 1.1335868835449219,
"learning_rate": 5.429625397410237e-08,
"loss": 1.9374,
"step": 25700
},
{
"epoch": 7.719846841097639,
"eval_loss": 1.9808275699615479,
"eval_runtime": 244.9051,
"eval_samples_per_second": 22.903,
"eval_steps_per_second": 5.729,
"step": 25700
},
{
"epoch": 7.72735463042907,
"grad_norm": 1.2654999494552612,
"learning_rate": 5.144683303113684e-08,
"loss": 1.9645,
"step": 25725
},
{
"epoch": 7.734862419760502,
"grad_norm": 1.3081024885177612,
"learning_rate": 4.8673951248286166e-08,
"loss": 1.9205,
"step": 25750
},
{
"epoch": 7.742370209091932,
"grad_norm": 1.2318024635314941,
"learning_rate": 4.597763711887637e-08,
"loss": 1.9425,
"step": 25775
},
{
"epoch": 7.749877998423364,
"grad_norm": 1.2977827787399292,
"learning_rate": 4.335791834944369e-08,
"loss": 1.9496,
"step": 25800
},
{
"epoch": 7.749877998423364,
"eval_loss": 1.9807723760604858,
"eval_runtime": 244.5126,
"eval_samples_per_second": 22.94,
"eval_steps_per_second": 5.738,
"step": 25800
},
{
"epoch": 7.757385787754796,
"grad_norm": 1.0694193840026855,
"learning_rate": 4.081482185945479e-08,
"loss": 1.9416,
"step": 25825
},
{
"epoch": 7.764893577086227,
"grad_norm": 1.2503465414047241,
"learning_rate": 3.8348373781026955e-08,
"loss": 1.9512,
"step": 25850
},
{
"epoch": 7.772401366417658,
"grad_norm": 1.2763334512710571,
"learning_rate": 3.5958599458662537e-08,
"loss": 1.9338,
"step": 25875
},
{
"epoch": 7.77990915574909,
"grad_norm": 1.1357148885726929,
"learning_rate": 3.3645523448984914e-08,
"loss": 1.9579,
"step": 25900
},
{
"epoch": 7.77990915574909,
"eval_loss": 1.9807769060134888,
"eval_runtime": 244.8991,
"eval_samples_per_second": 22.903,
"eval_steps_per_second": 5.729,
"step": 25900
},
{
"epoch": 7.787416945080521,
"grad_norm": 1.1911311149597168,
"learning_rate": 3.149715032283562e-08,
"loss": 1.9241,
"step": 25925
},
{
"epoch": 7.7949247344119525,
"grad_norm": 1.2044905424118042,
"learning_rate": 2.933447122186239e-08,
"loss": 1.9325,
"step": 25950
},
{
"epoch": 7.802432523743384,
"grad_norm": 1.355704426765442,
"learning_rate": 2.724855850118585e-08,
"loss": 1.9369,
"step": 25975
},
{
"epoch": 7.809940313074815,
"grad_norm": 1.1674330234527588,
"learning_rate": 2.5239433595037053e-08,
"loss": 1.9114,
"step": 26000
},
{
"epoch": 7.809940313074815,
"eval_loss": 1.9807677268981934,
"eval_runtime": 244.6737,
"eval_samples_per_second": 22.924,
"eval_steps_per_second": 5.734,
"step": 26000
},
{
"epoch": 7.817448102406247,
"grad_norm": 1.242080807685852,
"learning_rate": 2.33071171485974e-08,
"loss": 1.9247,
"step": 26025
},
{
"epoch": 7.824955891737678,
"grad_norm": 1.2494958639144897,
"learning_rate": 2.1451629017787133e-08,
"loss": 1.9284,
"step": 26050
},
{
"epoch": 7.832463681069109,
"grad_norm": 1.2800395488739014,
"learning_rate": 1.9672988269061332e-08,
"loss": 1.9365,
"step": 26075
},
{
"epoch": 7.83997147040054,
"grad_norm": 1.2159162759780884,
"learning_rate": 1.797121317921341e-08,
"loss": 1.9213,
"step": 26100
},
{
"epoch": 7.83997147040054,
"eval_loss": 1.9807840585708618,
"eval_runtime": 245.0653,
"eval_samples_per_second": 22.888,
"eval_steps_per_second": 5.725,
"step": 26100
},
{
"epoch": 7.847479259731972,
"grad_norm": 1.2391120195388794,
"learning_rate": 1.6346321235187756e-08,
"loss": 1.9321,
"step": 26125
},
{
"epoch": 7.8549870490634035,
"grad_norm": 1.1283122301101685,
"learning_rate": 1.4798329133900724e-08,
"loss": 1.9741,
"step": 26150
},
{
"epoch": 7.862494838394834,
"grad_norm": 1.1563724279403687,
"learning_rate": 1.3327252782067423e-08,
"loss": 1.9312,
"step": 26175
},
{
"epoch": 7.870002627726266,
"grad_norm": 1.2058521509170532,
"learning_rate": 1.1933107296039358e-08,
"loss": 1.9255,
"step": 26200
},
{
"epoch": 7.870002627726266,
"eval_loss": 1.980788230895996,
"eval_runtime": 244.7317,
"eval_samples_per_second": 22.919,
"eval_steps_per_second": 5.733,
"step": 26200
},
{
"epoch": 7.877510417057698,
"grad_norm": 1.1759904623031616,
"learning_rate": 1.0615907001648717e-08,
"loss": 1.9553,
"step": 26225
},
{
"epoch": 7.8850182063891285,
"grad_norm": 1.2614035606384277,
"learning_rate": 9.37566543406182e-09,
"loss": 1.9662,
"step": 26250
},
{
"epoch": 7.89252599572056,
"grad_norm": 1.3371256589889526,
"learning_rate": 8.212395337640066e-09,
"loss": 1.9287,
"step": 26275
},
{
"epoch": 7.900033785051991,
"grad_norm": 1.3627214431762695,
"learning_rate": 7.126108665805875e-09,
"loss": 1.9213,
"step": 26300
},
{
"epoch": 7.900033785051991,
"eval_loss": 1.980796217918396,
"eval_runtime": 244.3911,
"eval_samples_per_second": 22.951,
"eval_steps_per_second": 5.741,
"step": 26300
},
{
"epoch": 7.907541574383423,
"grad_norm": 1.236215353012085,
"learning_rate": 6.11681658092611e-09,
"loss": 1.9431,
"step": 26325
},
{
"epoch": 7.9150493637148545,
"grad_norm": 1.1915570497512817,
"learning_rate": 5.184529454191344e-09,
"loss": 1.9467,
"step": 26350
},
{
"epoch": 7.922557153046285,
"grad_norm": 1.1169012784957886,
"learning_rate": 4.329256865511777e-09,
"loss": 1.9403,
"step": 26375
},
{
"epoch": 7.930064942377717,
"grad_norm": 1.4462562799453735,
"learning_rate": 3.5510076034198093e-09,
"loss": 1.9356,
"step": 26400
},
{
"epoch": 7.930064942377717,
"eval_loss": 1.980790138244629,
"eval_runtime": 244.7112,
"eval_samples_per_second": 22.921,
"eval_steps_per_second": 5.733,
"step": 26400
},
{
"epoch": 7.937572731709148,
"grad_norm": 1.2058677673339844,
"learning_rate": 2.8497896649767872e-09,
"loss": 1.9233,
"step": 26425
},
{
"epoch": 7.9450805210405795,
"grad_norm": 1.246536135673523,
"learning_rate": 2.225610255694732e-09,
"loss": 1.9439,
"step": 26450
},
{
"epoch": 7.952588310372011,
"grad_norm": 1.4007676839828491,
"learning_rate": 1.6784757894588998e-09,
"loss": 1.947,
"step": 26475
},
{
"epoch": 7.960096099703442,
"grad_norm": 1.2076547145843506,
"learning_rate": 1.2083918884636668e-09,
"loss": 1.9181,
"step": 26500
},
{
"epoch": 7.960096099703442,
"eval_loss": 1.9807934761047363,
"eval_runtime": 244.5038,
"eval_samples_per_second": 22.94,
"eval_steps_per_second": 5.738,
"step": 26500
},
{
"epoch": 7.967603889034874,
"grad_norm": 1.1972334384918213,
"learning_rate": 8.15363383154244e-10,
"loss": 1.928,
"step": 26525
},
{
"epoch": 7.975111678366305,
"grad_norm": 1.246110439300537,
"learning_rate": 4.993943121767153e-10,
"loss": 1.9361,
"step": 26550
},
{
"epoch": 7.982619467697736,
"grad_norm": 1.2921074628829956,
"learning_rate": 2.604879223364054e-10,
"loss": 1.9193,
"step": 26575
},
{
"epoch": 7.990127257029168,
"grad_norm": 1.2516605854034424,
"learning_rate": 9.864666856707061e-11,
"loss": 1.9276,
"step": 26600
},
{
"epoch": 7.990127257029168,
"eval_loss": 1.9807960987091064,
"eval_runtime": 244.5065,
"eval_samples_per_second": 22.94,
"eval_steps_per_second": 5.738,
"step": 26600
},
{
"epoch": 7.997635046360599,
"grad_norm": 1.2723952531814575,
"learning_rate": 1.3872213900922859e-11,
"loss": 1.9274,
"step": 26625
}
],
"logging_steps": 25,
"max_steps": 26632,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.055320138484023e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}