flan-t5laa-small / trainer_state.json
hrezaei's picture
End of training
7f9b99c verified
{
"best_global_step": null,
"best_metric": 0.20709213614463806,
"best_model_checkpoint": null,
"epoch": 8.04632568359375,
"eval_steps": 5000,
"global_step": 524288,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"base_loss": 0.5978933601975441,
"epoch": 0.00095367431640625,
"grad_norm": 0.0011252601398155093,
"learning_rate": 4.995241165161133e-05,
"lookahead_loss": 10.307127059936523,
"loss": 0.6104,
"step": 500
},
{
"base_loss": 0.5862715001702309,
"epoch": 0.0019073486328125,
"grad_norm": 0.0011970596387982368,
"learning_rate": 4.990472793579102e-05,
"lookahead_loss": 10.149111717224121,
"loss": 0.6054,
"step": 1000
},
{
"base_loss": 0.6046989848017692,
"epoch": 0.00286102294921875,
"grad_norm": 0.0011270787799730897,
"learning_rate": 4.98570442199707e-05,
"lookahead_loss": 10.002682935714722,
"loss": 0.6216,
"step": 1500
},
{
"base_loss": 0.6158502401709557,
"epoch": 0.003814697265625,
"grad_norm": 0.001161540043540299,
"learning_rate": 4.9809360504150393e-05,
"lookahead_loss": 9.859745756149293,
"loss": 0.6304,
"step": 2000
},
{
"base_loss": 0.6010496825575828,
"epoch": 0.00476837158203125,
"grad_norm": 0.0011645682388916612,
"learning_rate": 4.9761676788330084e-05,
"lookahead_loss": 9.7204390335083,
"loss": 0.6158,
"step": 2500
},
{
"base_loss": 0.5909238924980164,
"epoch": 0.0057220458984375,
"grad_norm": 0.0010902719805017114,
"learning_rate": 4.971399307250977e-05,
"lookahead_loss": 9.615325008392334,
"loss": 0.6064,
"step": 3000
},
{
"base_loss": 0.5803323667645455,
"epoch": 0.00667572021484375,
"grad_norm": 0.0011976484674960375,
"learning_rate": 4.966630935668946e-05,
"lookahead_loss": 9.45008639717102,
"loss": 0.6025,
"step": 3500
},
{
"base_loss": 0.6060425414443016,
"epoch": 0.00762939453125,
"grad_norm": 0.0011318833567202091,
"learning_rate": 4.961862564086914e-05,
"lookahead_loss": 9.358827793121337,
"loss": 0.6227,
"step": 4000
},
{
"base_loss": 0.6035141298770904,
"epoch": 0.00858306884765625,
"grad_norm": 0.0010984783293679357,
"learning_rate": 4.957094192504883e-05,
"lookahead_loss": 9.244875957489013,
"loss": 0.6141,
"step": 4500
},
{
"base_loss": 0.5932371410131454,
"epoch": 0.0095367431640625,
"grad_norm": 0.0012236966285854578,
"learning_rate": 4.952325820922852e-05,
"lookahead_loss": 9.141341512680054,
"loss": 0.6126,
"step": 5000
},
{
"epoch": 0.0095367431640625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 8.99573203321463,
"eval_lookahead_perplexity": 8068.573930658563,
"eval_loss": 0.2124410718679428,
"eval_perplexity": 1.236693235395729,
"eval_runtime": 92.5309,
"eval_samples_per_second": 54.036,
"eval_steps_per_second": 1.697,
"step": 5000
},
{
"base_loss": 0.5934136973619462,
"epoch": 0.01049041748046875,
"grad_norm": 0.0011562301078811288,
"learning_rate": 4.9475574493408205e-05,
"lookahead_loss": 9.01434019088745,
"loss": 0.6097,
"step": 5500
},
{
"base_loss": 0.5931573793888092,
"epoch": 0.011444091796875,
"grad_norm": 0.0011571954237297177,
"learning_rate": 4.9427890777587895e-05,
"lookahead_loss": 8.929434476852418,
"loss": 0.6112,
"step": 6000
},
{
"base_loss": 0.6124616218209267,
"epoch": 0.01239776611328125,
"grad_norm": 0.0011109471088275313,
"learning_rate": 4.938020706176758e-05,
"lookahead_loss": 8.831825304031373,
"loss": 0.6266,
"step": 6500
},
{
"base_loss": 0.5971367362737655,
"epoch": 0.0133514404296875,
"grad_norm": 0.0011375549947842956,
"learning_rate": 4.933252334594727e-05,
"lookahead_loss": 8.758240091323852,
"loss": 0.6147,
"step": 7000
},
{
"base_loss": 0.5954981714487075,
"epoch": 0.01430511474609375,
"grad_norm": 0.0011499100364744663,
"learning_rate": 4.928483963012696e-05,
"lookahead_loss": 8.65474760055542,
"loss": 0.6113,
"step": 7500
},
{
"base_loss": 0.5817196745276451,
"epoch": 0.0152587890625,
"grad_norm": 0.0011594763491302729,
"learning_rate": 4.923715591430664e-05,
"lookahead_loss": 8.523625542640685,
"loss": 0.5931,
"step": 8000
},
{
"base_loss": 0.612317619562149,
"epoch": 0.01621246337890625,
"grad_norm": 0.001187981222756207,
"learning_rate": 4.918947219848633e-05,
"lookahead_loss": 8.48364700603485,
"loss": 0.6226,
"step": 8500
},
{
"base_loss": 0.6063876725435257,
"epoch": 0.0171661376953125,
"grad_norm": 0.0011250991374254227,
"learning_rate": 4.9141788482666016e-05,
"lookahead_loss": 8.421111564636231,
"loss": 0.6171,
"step": 9000
},
{
"base_loss": 0.5936728613972664,
"epoch": 0.01811981201171875,
"grad_norm": 0.0010821197647601366,
"learning_rate": 4.9094104766845706e-05,
"lookahead_loss": 8.355785271644592,
"loss": 0.606,
"step": 9500
},
{
"base_loss": 0.5859414834976197,
"epoch": 0.019073486328125,
"grad_norm": 0.0011178837157785892,
"learning_rate": 4.9046421051025396e-05,
"lookahead_loss": 8.281417448997498,
"loss": 0.6026,
"step": 10000
},
{
"epoch": 0.019073486328125,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 8.09148007688431,
"eval_lookahead_perplexity": 3266.518692937833,
"eval_loss": 0.21068428456783295,
"eval_perplexity": 1.2345225357127578,
"eval_runtime": 95.6991,
"eval_samples_per_second": 52.247,
"eval_steps_per_second": 1.641,
"step": 10000
},
{
"base_loss": 0.5858585347533226,
"epoch": 0.02002716064453125,
"grad_norm": 0.0011825824622064829,
"learning_rate": 4.899873733520508e-05,
"lookahead_loss": 8.141557210922242,
"loss": 0.6052,
"step": 10500
},
{
"base_loss": 0.6189192984104156,
"epoch": 0.0209808349609375,
"grad_norm": 0.001107234158553183,
"learning_rate": 4.895105361938477e-05,
"lookahead_loss": 8.11882752418518,
"loss": 0.6289,
"step": 11000
},
{
"base_loss": 0.5979826554656029,
"epoch": 0.02193450927734375,
"grad_norm": 0.001138643710874021,
"learning_rate": 4.890336990356445e-05,
"lookahead_loss": 8.047190843582154,
"loss": 0.6135,
"step": 11500
},
{
"base_loss": 0.5944889052510262,
"epoch": 0.02288818359375,
"grad_norm": 0.0011185838375240564,
"learning_rate": 4.8855686187744143e-05,
"lookahead_loss": 7.9819194717407225,
"loss": 0.6082,
"step": 12000
},
{
"base_loss": 0.5957045419812202,
"epoch": 0.02384185791015625,
"grad_norm": 0.0011360319331288338,
"learning_rate": 4.8808002471923834e-05,
"lookahead_loss": 7.931338422775268,
"loss": 0.607,
"step": 12500
},
{
"base_loss": 0.61453830909729,
"epoch": 0.0247955322265625,
"grad_norm": 0.0010940487263724208,
"learning_rate": 4.876031875610352e-05,
"lookahead_loss": 7.886316944122314,
"loss": 0.6274,
"step": 13000
},
{
"base_loss": 0.6059305937290191,
"epoch": 0.02574920654296875,
"grad_norm": 0.0010977721540257335,
"learning_rate": 4.871263504028321e-05,
"lookahead_loss": 7.826973139762878,
"loss": 0.6186,
"step": 13500
},
{
"base_loss": 0.5937826926708222,
"epoch": 0.026702880859375,
"grad_norm": 0.001152095035649836,
"learning_rate": 4.866495132446289e-05,
"lookahead_loss": 7.781199982643128,
"loss": 0.6065,
"step": 14000
},
{
"base_loss": 0.5939369524717331,
"epoch": 0.02765655517578125,
"grad_norm": 0.001108541153371334,
"learning_rate": 4.861726760864258e-05,
"lookahead_loss": 7.837098469734192,
"loss": 0.6077,
"step": 14500
},
{
"base_loss": 0.616533571600914,
"epoch": 0.0286102294921875,
"grad_norm": 0.0011754513252526522,
"learning_rate": 4.856958389282227e-05,
"lookahead_loss": 7.8037817134857175,
"loss": 0.6282,
"step": 15000
},
{
"epoch": 0.0286102294921875,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 7.644905143652481,
"eval_lookahead_perplexity": 2089.9703194804256,
"eval_loss": 0.20981459319591522,
"eval_perplexity": 1.2334493488533298,
"eval_runtime": 112.883,
"eval_samples_per_second": 44.294,
"eval_steps_per_second": 1.391,
"step": 15000
},
{
"base_loss": 0.6012749794125557,
"epoch": 0.02956390380859375,
"grad_norm": 0.0011089193867519498,
"learning_rate": 4.8521900177001955e-05,
"lookahead_loss": 7.749636699676514,
"loss": 0.6145,
"step": 15500
},
{
"base_loss": 0.5948225556612015,
"epoch": 0.030517578125,
"grad_norm": 0.0011371374130249023,
"learning_rate": 4.8474216461181645e-05,
"lookahead_loss": 7.701480973243713,
"loss": 0.61,
"step": 16000
},
{
"base_loss": 0.5902180044651032,
"epoch": 0.03147125244140625,
"grad_norm": 0.0011123986914753914,
"learning_rate": 4.842653274536133e-05,
"lookahead_loss": 7.686638072967529,
"loss": 0.606,
"step": 16500
},
{
"base_loss": 0.6148928787708282,
"epoch": 0.0324249267578125,
"grad_norm": 0.0011524204164743423,
"learning_rate": 4.837884902954102e-05,
"lookahead_loss": 7.60680880355835,
"loss": 0.6307,
"step": 17000
},
{
"base_loss": 0.5949572869539261,
"epoch": 0.03337860107421875,
"grad_norm": 0.0011757535394281149,
"learning_rate": 4.833116531372071e-05,
"lookahead_loss": 7.631597835540772,
"loss": 0.6092,
"step": 17500
},
{
"base_loss": 0.59463729596138,
"epoch": 0.034332275390625,
"grad_norm": 0.001099089509807527,
"learning_rate": 4.828348159790039e-05,
"lookahead_loss": 7.6473195314407345,
"loss": 0.6108,
"step": 18000
},
{
"base_loss": 0.5924516545534134,
"epoch": 0.03528594970703125,
"grad_norm": 0.0011326675303280354,
"learning_rate": 4.823579788208008e-05,
"lookahead_loss": 7.507555624961853,
"loss": 0.6057,
"step": 18500
},
{
"base_loss": 0.6097258816361427,
"epoch": 0.0362396240234375,
"grad_norm": 0.0011656777933239937,
"learning_rate": 4.8188114166259766e-05,
"lookahead_loss": 7.5816989393234255,
"loss": 0.6232,
"step": 19000
},
{
"base_loss": 0.6038113740086556,
"epoch": 0.03719329833984375,
"grad_norm": 0.0011302254861220717,
"learning_rate": 4.8140430450439456e-05,
"lookahead_loss": 7.5106663646698,
"loss": 0.6146,
"step": 19500
},
{
"base_loss": 0.5906151984333992,
"epoch": 0.03814697265625,
"grad_norm": 0.0010981757659465075,
"learning_rate": 4.8092746734619146e-05,
"lookahead_loss": 7.503454574584961,
"loss": 0.6053,
"step": 20000
},
{
"epoch": 0.03814697265625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 7.413040787267228,
"eval_lookahead_perplexity": 1657.4586710962171,
"eval_loss": 0.2093675285577774,
"eval_perplexity": 1.232898040510938,
"eval_runtime": 116.6122,
"eval_samples_per_second": 42.877,
"eval_steps_per_second": 1.346,
"step": 20000
},
{
"base_loss": 0.5973337704539299,
"epoch": 0.03910064697265625,
"grad_norm": 0.0011260570026934147,
"learning_rate": 4.804506301879883e-05,
"lookahead_loss": 7.45892139339447,
"loss": 0.6126,
"step": 20500
},
{
"base_loss": 0.6136817481517792,
"epoch": 0.0400543212890625,
"grad_norm": 0.0011610394576564431,
"learning_rate": 4.799737930297852e-05,
"lookahead_loss": 7.463105855941772,
"loss": 0.6255,
"step": 21000
},
{
"base_loss": 0.6021154451966285,
"epoch": 0.04100799560546875,
"grad_norm": 0.0011342237703502178,
"learning_rate": 4.79496955871582e-05,
"lookahead_loss": 7.411617256164551,
"loss": 0.6135,
"step": 21500
},
{
"base_loss": 0.5793227363824844,
"epoch": 0.041961669921875,
"grad_norm": 0.001150211552157998,
"learning_rate": 4.7902011871337893e-05,
"lookahead_loss": 7.446121297836304,
"loss": 0.5974,
"step": 22000
},
{
"base_loss": 0.6037838690280914,
"epoch": 0.04291534423828125,
"grad_norm": 0.0010809092782437801,
"learning_rate": 4.7854328155517584e-05,
"lookahead_loss": 7.466872262954712,
"loss": 0.6172,
"step": 22500
},
{
"base_loss": 0.6098511652350426,
"epoch": 0.0438690185546875,
"grad_norm": 0.0011468523880466819,
"learning_rate": 4.780664443969727e-05,
"lookahead_loss": 7.43432590007782,
"loss": 0.6256,
"step": 23000
},
{
"base_loss": 0.5932865824103355,
"epoch": 0.04482269287109375,
"grad_norm": 0.0011411454761400819,
"learning_rate": 4.775896072387696e-05,
"lookahead_loss": 7.379173007011413,
"loss": 0.6064,
"step": 23500
},
{
"base_loss": 0.5919549334645271,
"epoch": 0.0457763671875,
"grad_norm": 0.0010719113051891327,
"learning_rate": 4.771127700805664e-05,
"lookahead_loss": 7.352979991912842,
"loss": 0.6043,
"step": 24000
},
{
"base_loss": 0.6157130757570267,
"epoch": 0.04673004150390625,
"grad_norm": 0.0011310658883303404,
"learning_rate": 4.766359329223633e-05,
"lookahead_loss": 7.356749028205871,
"loss": 0.6284,
"step": 24500
},
{
"base_loss": 0.6059511578679084,
"epoch": 0.0476837158203125,
"grad_norm": 0.0011380530195310712,
"learning_rate": 4.761590957641602e-05,
"lookahead_loss": 7.340578727722168,
"loss": 0.6244,
"step": 25000
},
{
"epoch": 0.0476837158203125,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 7.268213134985,
"eval_lookahead_perplexity": 1433.9858232957502,
"eval_loss": 0.2090858817100525,
"eval_perplexity": 1.232550847559457,
"eval_runtime": 96.5112,
"eval_samples_per_second": 51.807,
"eval_steps_per_second": 1.627,
"step": 25000
},
{
"base_loss": 0.5887659189105033,
"epoch": 0.04863739013671875,
"grad_norm": 0.001135756610892713,
"learning_rate": 4.7568225860595705e-05,
"lookahead_loss": 7.312827653884888,
"loss": 0.6021,
"step": 25500
},
{
"base_loss": 0.5974695605635643,
"epoch": 0.049591064453125,
"grad_norm": 0.0011581169674172997,
"learning_rate": 4.7520542144775395e-05,
"lookahead_loss": 7.2902094869613645,
"loss": 0.6095,
"step": 26000
},
{
"base_loss": 0.6176432440280915,
"epoch": 0.05054473876953125,
"grad_norm": 0.0010899268090724945,
"learning_rate": 4.747285842895508e-05,
"lookahead_loss": 7.362010593414307,
"loss": 0.6288,
"step": 26500
},
{
"base_loss": 0.596397516131401,
"epoch": 0.0514984130859375,
"grad_norm": 0.0011435514315962791,
"learning_rate": 4.742517471313477e-05,
"lookahead_loss": 7.30141323184967,
"loss": 0.6118,
"step": 27000
},
{
"base_loss": 0.5952906757593155,
"epoch": 0.05245208740234375,
"grad_norm": 0.0011486115399748087,
"learning_rate": 4.737749099731446e-05,
"lookahead_loss": 7.251459365844727,
"loss": 0.6066,
"step": 27500
},
{
"base_loss": 0.6116316332221031,
"epoch": 0.05340576171875,
"grad_norm": 0.001126940012909472,
"learning_rate": 4.732980728149414e-05,
"lookahead_loss": 7.226618359565735,
"loss": 0.6223,
"step": 28000
},
{
"base_loss": 0.6090298828482628,
"epoch": 0.05435943603515625,
"grad_norm": 0.0011391318403184414,
"learning_rate": 4.728212356567383e-05,
"lookahead_loss": 7.265408018112183,
"loss": 0.626,
"step": 28500
},
{
"base_loss": 0.5974834812283516,
"epoch": 0.0553131103515625,
"grad_norm": 0.0011334357550367713,
"learning_rate": 4.7234439849853516e-05,
"lookahead_loss": 7.277325885772705,
"loss": 0.6081,
"step": 29000
},
{
"base_loss": 0.5880563573837281,
"epoch": 0.05626678466796875,
"grad_norm": 0.0010896348394453526,
"learning_rate": 4.7186756134033206e-05,
"lookahead_loss": 7.28397903251648,
"loss": 0.6031,
"step": 29500
},
{
"base_loss": 0.6152015085816384,
"epoch": 0.057220458984375,
"grad_norm": 0.0011281173210591078,
"learning_rate": 4.7139072418212896e-05,
"lookahead_loss": 7.264281406402588,
"loss": 0.6301,
"step": 30000
},
{
"epoch": 0.057220458984375,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 7.167584419250488,
"eval_lookahead_perplexity": 1296.7085124687899,
"eval_loss": 0.2088911086320877,
"eval_perplexity": 1.2323108032149765,
"eval_runtime": 93.7201,
"eval_samples_per_second": 53.35,
"eval_steps_per_second": 1.675,
"step": 30000
},
{
"base_loss": 0.6071268789768219,
"epoch": 0.05817413330078125,
"grad_norm": 0.0011435674969106913,
"learning_rate": 4.709138870239258e-05,
"lookahead_loss": 7.241272003173828,
"loss": 0.6175,
"step": 30500
},
{
"base_loss": 0.5850496413707733,
"epoch": 0.0591278076171875,
"grad_norm": 0.001114132348448038,
"learning_rate": 4.704370498657227e-05,
"lookahead_loss": 7.161423627853393,
"loss": 0.602,
"step": 31000
},
{
"base_loss": 0.5950538128614425,
"epoch": 0.06008148193359375,
"grad_norm": 0.0011068363673985004,
"learning_rate": 4.699602127075195e-05,
"lookahead_loss": 7.204737923622131,
"loss": 0.6088,
"step": 31500
},
{
"base_loss": 0.6059240178465843,
"epoch": 0.06103515625,
"grad_norm": 0.0011420327937230468,
"learning_rate": 4.6948337554931643e-05,
"lookahead_loss": 7.163901489257812,
"loss": 0.6186,
"step": 32000
},
{
"base_loss": 0.6009617374539376,
"epoch": 0.06198883056640625,
"grad_norm": 0.0011484776623547077,
"learning_rate": 4.6900653839111334e-05,
"lookahead_loss": 7.169231108665466,
"loss": 0.6148,
"step": 32500
},
{
"base_loss": 0.5932911797165871,
"epoch": 0.0629425048828125,
"grad_norm": 0.001157185179181397,
"learning_rate": 4.685297012329102e-05,
"lookahead_loss": 7.14691205406189,
"loss": 0.6086,
"step": 33000
},
{
"base_loss": 0.6118422101140022,
"epoch": 0.06389617919921875,
"grad_norm": 0.0010650681797415018,
"learning_rate": 4.680528640747071e-05,
"lookahead_loss": 7.168767412185669,
"loss": 0.625,
"step": 33500
},
{
"base_loss": 0.6075323719978333,
"epoch": 0.064849853515625,
"grad_norm": 0.001123252441175282,
"learning_rate": 4.675760269165039e-05,
"lookahead_loss": 7.191308692932129,
"loss": 0.618,
"step": 34000
},
{
"base_loss": 0.6046282976865769,
"epoch": 0.06580352783203125,
"grad_norm": 0.0011436466593295336,
"learning_rate": 4.670991897583008e-05,
"lookahead_loss": 7.114853853225708,
"loss": 0.6155,
"step": 34500
},
{
"base_loss": 0.6035915340185165,
"epoch": 0.0667572021484375,
"grad_norm": 0.0011282000923529267,
"learning_rate": 4.666223526000977e-05,
"lookahead_loss": 7.079581315994263,
"loss": 0.6135,
"step": 35000
},
{
"epoch": 0.0667572021484375,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 7.093881194203044,
"eval_lookahead_perplexity": 1204.573925020854,
"eval_loss": 0.20874807238578796,
"eval_perplexity": 1.232134550708959,
"eval_runtime": 95.3048,
"eval_samples_per_second": 52.463,
"eval_steps_per_second": 1.647,
"step": 35000
},
{
"base_loss": 0.6189325547814369,
"epoch": 0.06771087646484375,
"grad_norm": 0.0011684788623824716,
"learning_rate": 4.6614551544189455e-05,
"lookahead_loss": 7.151174359321594,
"loss": 0.6291,
"step": 35500
},
{
"base_loss": 0.5999714830517768,
"epoch": 0.06866455078125,
"grad_norm": 0.0010892520658671856,
"learning_rate": 4.6566867828369145e-05,
"lookahead_loss": 7.138783624649048,
"loss": 0.6115,
"step": 36000
},
{
"base_loss": 0.5872128927707672,
"epoch": 0.06961822509765625,
"grad_norm": 0.0011145739117637277,
"learning_rate": 4.651918411254883e-05,
"lookahead_loss": 7.1531580286026,
"loss": 0.6008,
"step": 36500
},
{
"base_loss": 0.6123319318294526,
"epoch": 0.0705718994140625,
"grad_norm": 0.001145465881563723,
"learning_rate": 4.647150039672852e-05,
"lookahead_loss": 7.051203452110291,
"loss": 0.6281,
"step": 37000
},
{
"base_loss": 0.5984540300965309,
"epoch": 0.07152557373046875,
"grad_norm": 0.0010991750750690699,
"learning_rate": 4.642381668090821e-05,
"lookahead_loss": 7.079177887916565,
"loss": 0.6127,
"step": 37500
},
{
"base_loss": 0.5946964643001557,
"epoch": 0.072479248046875,
"grad_norm": 0.0011612207163125277,
"learning_rate": 4.637613296508789e-05,
"lookahead_loss": 7.13145064163208,
"loss": 0.6076,
"step": 38000
},
{
"base_loss": 0.5916517315506935,
"epoch": 0.07343292236328125,
"grad_norm": 0.0010734308743849397,
"learning_rate": 4.632844924926758e-05,
"lookahead_loss": 7.0843782205581665,
"loss": 0.6049,
"step": 38500
},
{
"base_loss": 0.6112363495230675,
"epoch": 0.0743865966796875,
"grad_norm": 0.0011188907083123922,
"learning_rate": 4.6280765533447266e-05,
"lookahead_loss": 7.086141440391541,
"loss": 0.6247,
"step": 39000
},
{
"base_loss": 0.5962583271861076,
"epoch": 0.07534027099609375,
"grad_norm": 0.0011356917675584555,
"learning_rate": 4.6233081817626956e-05,
"lookahead_loss": 7.140744082450866,
"loss": 0.6075,
"step": 39500
},
{
"base_loss": 0.5866445366144181,
"epoch": 0.0762939453125,
"grad_norm": 0.0011127168545499444,
"learning_rate": 4.6185398101806646e-05,
"lookahead_loss": 7.094807801246643,
"loss": 0.5998,
"step": 40000
},
{
"epoch": 0.0762939453125,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 7.033647046683314,
"eval_lookahead_perplexity": 1134.1594089598198,
"eval_loss": 0.2086315155029297,
"eval_perplexity": 1.2319909453157274,
"eval_runtime": 108.2669,
"eval_samples_per_second": 46.182,
"eval_steps_per_second": 1.45,
"step": 40000
},
{
"base_loss": 0.6231044068932533,
"epoch": 0.07724761962890625,
"grad_norm": 0.0011179151479154825,
"learning_rate": 4.613771438598633e-05,
"lookahead_loss": 7.0746554174423215,
"loss": 0.6366,
"step": 40500
},
{
"base_loss": 0.5951727138757705,
"epoch": 0.0782012939453125,
"grad_norm": 0.0011324447114020586,
"learning_rate": 4.609003067016602e-05,
"lookahead_loss": 7.070094844818115,
"loss": 0.6099,
"step": 41000
},
{
"base_loss": 0.6008335100412369,
"epoch": 0.07915496826171875,
"grad_norm": 0.0010868014069274068,
"learning_rate": 4.60423469543457e-05,
"lookahead_loss": 7.104050822257996,
"loss": 0.6097,
"step": 41500
},
{
"base_loss": 0.6084940298199654,
"epoch": 0.080108642578125,
"grad_norm": 0.0010936354519799352,
"learning_rate": 4.5994663238525393e-05,
"lookahead_loss": 7.052688834190369,
"loss": 0.6203,
"step": 42000
},
{
"base_loss": 0.6062530100941658,
"epoch": 0.08106231689453125,
"grad_norm": 0.0011594812385737896,
"learning_rate": 4.5946979522705084e-05,
"lookahead_loss": 7.072220482826233,
"loss": 0.6196,
"step": 42500
},
{
"base_loss": 0.592376666367054,
"epoch": 0.0820159912109375,
"grad_norm": 0.0010804173070937395,
"learning_rate": 4.589929580688477e-05,
"lookahead_loss": 7.0764106168746945,
"loss": 0.6073,
"step": 43000
},
{
"base_loss": 0.5900094144940377,
"epoch": 0.08296966552734375,
"grad_norm": 0.001111154560931027,
"learning_rate": 4.585161209106446e-05,
"lookahead_loss": 7.086921313285828,
"loss": 0.6041,
"step": 43500
},
{
"base_loss": 0.6183186983466148,
"epoch": 0.08392333984375,
"grad_norm": 0.0010992800816893578,
"learning_rate": 4.580392837524414e-05,
"lookahead_loss": 7.0898256769180295,
"loss": 0.6294,
"step": 44000
},
{
"base_loss": 0.5945970554947853,
"epoch": 0.08487701416015625,
"grad_norm": 0.0011169550707563758,
"learning_rate": 4.575624465942383e-05,
"lookahead_loss": 7.033306765556335,
"loss": 0.6053,
"step": 44500
},
{
"base_loss": 0.5856418209671974,
"epoch": 0.0858306884765625,
"grad_norm": 0.001114897895604372,
"learning_rate": 4.570856094360352e-05,
"lookahead_loss": 7.065869425773621,
"loss": 0.5972,
"step": 45000
},
{
"epoch": 0.0858306884765625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.983915470659542,
"eval_lookahead_perplexity": 1079.1354288264654,
"eval_loss": 0.20853494107723236,
"eval_perplexity": 1.2318719722426898,
"eval_runtime": 99.1115,
"eval_samples_per_second": 50.448,
"eval_steps_per_second": 1.584,
"step": 45000
},
{
"base_loss": 0.591981584072113,
"epoch": 0.08678436279296875,
"grad_norm": 0.0012656449107453227,
"learning_rate": 4.5660877227783205e-05,
"lookahead_loss": 7.022496276855469,
"loss": 0.6054,
"step": 45500
},
{
"base_loss": 0.624447933793068,
"epoch": 0.087738037109375,
"grad_norm": 0.0010355014819651842,
"learning_rate": 4.5613193511962895e-05,
"lookahead_loss": 7.026841708183289,
"loss": 0.6307,
"step": 46000
},
{
"base_loss": 0.5977871975898743,
"epoch": 0.08869171142578125,
"grad_norm": 0.0011489527532830834,
"learning_rate": 4.556550979614258e-05,
"lookahead_loss": 7.015450751304626,
"loss": 0.607,
"step": 46500
},
{
"base_loss": 0.6007027108073234,
"epoch": 0.0896453857421875,
"grad_norm": 0.0011345903621986508,
"learning_rate": 4.551782608032227e-05,
"lookahead_loss": 7.049301884651184,
"loss": 0.6102,
"step": 47000
},
{
"base_loss": 0.5879440263509751,
"epoch": 0.09059906005859375,
"grad_norm": 0.0010685587767511606,
"learning_rate": 4.547014236450196e-05,
"lookahead_loss": 7.006776951789856,
"loss": 0.5999,
"step": 47500
},
{
"base_loss": 0.5816959359049797,
"epoch": 0.091552734375,
"grad_norm": 0.0010566096752882004,
"learning_rate": 4.542245864868164e-05,
"lookahead_loss": 6.994293849945068,
"loss": 0.5928,
"step": 48000
},
{
"base_loss": 0.615579255104065,
"epoch": 0.09250640869140625,
"grad_norm": 0.0011476678773760796,
"learning_rate": 4.537477493286133e-05,
"lookahead_loss": 7.024335027694702,
"loss": 0.6273,
"step": 48500
},
{
"base_loss": 0.5977260445952416,
"epoch": 0.0934600830078125,
"grad_norm": 0.0010787018109112978,
"learning_rate": 4.5327091217041016e-05,
"lookahead_loss": 7.00899642086029,
"loss": 0.6124,
"step": 49000
},
{
"base_loss": 0.5828603687882423,
"epoch": 0.09441375732421875,
"grad_norm": 0.0011202479945495725,
"learning_rate": 4.5279407501220706e-05,
"lookahead_loss": 7.005703037261963,
"loss": 0.5966,
"step": 49500
},
{
"base_loss": 0.5780045939087868,
"epoch": 0.095367431640625,
"grad_norm": 0.001087460434064269,
"learning_rate": 4.523172378540039e-05,
"lookahead_loss": 6.952547832489014,
"loss": 0.5947,
"step": 50000
},
{
"epoch": 0.095367431640625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.945180719272017,
"eval_lookahead_perplexity": 1038.1345911819958,
"eval_loss": 0.2084558606147766,
"eval_perplexity": 1.231774559089228,
"eval_runtime": 115.6293,
"eval_samples_per_second": 43.242,
"eval_steps_per_second": 1.358,
"step": 50000
},
{
"base_loss": 0.5960951861143112,
"epoch": 0.09632110595703125,
"grad_norm": 0.0011174663668498397,
"learning_rate": 4.518404006958008e-05,
"lookahead_loss": 6.991843806266784,
"loss": 0.6115,
"step": 50500
},
{
"base_loss": 0.6136610082983971,
"epoch": 0.0972747802734375,
"grad_norm": 0.0011398132191970944,
"learning_rate": 4.513635635375977e-05,
"lookahead_loss": 6.998520258903503,
"loss": 0.626,
"step": 51000
},
{
"base_loss": 0.5944582785964012,
"epoch": 0.09822845458984375,
"grad_norm": 0.0011265052016824484,
"learning_rate": 4.508867263793945e-05,
"lookahead_loss": 6.961953915596008,
"loss": 0.6069,
"step": 51500
},
{
"base_loss": 0.5841029364466667,
"epoch": 0.09918212890625,
"grad_norm": 0.0011030936148017645,
"learning_rate": 4.5040988922119143e-05,
"lookahead_loss": 7.00307084941864,
"loss": 0.5984,
"step": 52000
},
{
"base_loss": 0.5904176152348518,
"epoch": 0.10013580322265625,
"grad_norm": 0.001120659988373518,
"learning_rate": 4.499330520629883e-05,
"lookahead_loss": 6.987798627853394,
"loss": 0.6045,
"step": 52500
},
{
"base_loss": 0.6092280206680298,
"epoch": 0.1010894775390625,
"grad_norm": 0.0011076764203608036,
"learning_rate": 4.494562149047852e-05,
"lookahead_loss": 7.027708046913147,
"loss": 0.6204,
"step": 53000
},
{
"base_loss": 0.6015866943001748,
"epoch": 0.10204315185546875,
"grad_norm": 0.0011410149745643139,
"learning_rate": 4.489793777465821e-05,
"lookahead_loss": 7.007394369125366,
"loss": 0.6094,
"step": 53500
},
{
"base_loss": 0.5893580458164215,
"epoch": 0.102996826171875,
"grad_norm": 0.0011117961257696152,
"learning_rate": 4.485025405883789e-05,
"lookahead_loss": 6.979210342407226,
"loss": 0.6037,
"step": 54000
},
{
"base_loss": 0.5914280138015747,
"epoch": 0.10395050048828125,
"grad_norm": 0.0011249147355556488,
"learning_rate": 4.480257034301758e-05,
"lookahead_loss": 6.984224304199219,
"loss": 0.6027,
"step": 54500
},
{
"base_loss": 0.6135287986993789,
"epoch": 0.1049041748046875,
"grad_norm": 0.0010942122898995876,
"learning_rate": 4.4754886627197264e-05,
"lookahead_loss": 6.940293532371521,
"loss": 0.6257,
"step": 55000
},
{
"epoch": 0.1049041748046875,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.90965398660483,
"eval_lookahead_perplexity": 1001.9005113093888,
"eval_loss": 0.20838512480258942,
"eval_perplexity": 1.2316874315969126,
"eval_runtime": 105.743,
"eval_samples_per_second": 47.284,
"eval_steps_per_second": 1.485,
"step": 55000
},
{
"base_loss": 0.6006079289913178,
"epoch": 0.10585784912109375,
"grad_norm": 0.0010583704570308328,
"learning_rate": 4.4707202911376955e-05,
"lookahead_loss": 6.928493264198303,
"loss": 0.6133,
"step": 55500
},
{
"base_loss": 0.5945557134747506,
"epoch": 0.1068115234375,
"grad_norm": 0.0010779986623674631,
"learning_rate": 4.4659519195556645e-05,
"lookahead_loss": 6.998174697875976,
"loss": 0.6067,
"step": 56000
},
{
"base_loss": 0.5838606398105621,
"epoch": 0.10776519775390625,
"grad_norm": 0.001134494668804109,
"learning_rate": 4.461183547973633e-05,
"lookahead_loss": 6.914501080513,
"loss": 0.5962,
"step": 56500
},
{
"base_loss": 0.6029577027559281,
"epoch": 0.1087188720703125,
"grad_norm": 0.0011605332838371396,
"learning_rate": 4.456415176391602e-05,
"lookahead_loss": 6.893777732849121,
"loss": 0.6179,
"step": 57000
},
{
"base_loss": 0.6101588426232338,
"epoch": 0.10967254638671875,
"grad_norm": 0.0011116194073110819,
"learning_rate": 4.45164680480957e-05,
"lookahead_loss": 6.8865425481796265,
"loss": 0.6193,
"step": 57500
},
{
"base_loss": 0.5955164663791657,
"epoch": 0.110626220703125,
"grad_norm": 0.0011056356597691774,
"learning_rate": 4.446878433227539e-05,
"lookahead_loss": 6.867807936668396,
"loss": 0.6074,
"step": 58000
},
{
"base_loss": 0.5822924041152,
"epoch": 0.11157989501953125,
"grad_norm": 0.0010786657221615314,
"learning_rate": 4.442110061645508e-05,
"lookahead_loss": 6.94662325668335,
"loss": 0.5984,
"step": 58500
},
{
"base_loss": 0.6039745928645134,
"epoch": 0.1125335693359375,
"grad_norm": 0.0010965235996991396,
"learning_rate": 4.4373416900634766e-05,
"lookahead_loss": 6.950021827697754,
"loss": 0.6148,
"step": 59000
},
{
"base_loss": 0.6145888038873673,
"epoch": 0.11348724365234375,
"grad_norm": 0.001114803715609014,
"learning_rate": 4.4325733184814456e-05,
"lookahead_loss": 6.944812598228455,
"loss": 0.6277,
"step": 59500
},
{
"base_loss": 0.5980466955900192,
"epoch": 0.11444091796875,
"grad_norm": 0.001079982495866716,
"learning_rate": 4.427804946899414e-05,
"lookahead_loss": 6.852143743515015,
"loss": 0.6075,
"step": 60000
},
{
"epoch": 0.11444091796875,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.878637344311602,
"eval_lookahead_perplexity": 971.3019075411648,
"eval_loss": 0.20832312107086182,
"eval_perplexity": 1.2316110647473708,
"eval_runtime": 91.1615,
"eval_samples_per_second": 54.848,
"eval_steps_per_second": 1.722,
"step": 60000
},
{
"base_loss": 0.583279720902443,
"epoch": 0.11539459228515625,
"grad_norm": 0.001118990359827876,
"learning_rate": 4.423036575317383e-05,
"lookahead_loss": 6.890160356521607,
"loss": 0.5958,
"step": 60500
},
{
"base_loss": 0.6041163793802261,
"epoch": 0.1163482666015625,
"grad_norm": 0.0011760067427530885,
"learning_rate": 4.418268203735352e-05,
"lookahead_loss": 6.909441259384155,
"loss": 0.6188,
"step": 61000
},
{
"base_loss": 0.6094158036708832,
"epoch": 0.11730194091796875,
"grad_norm": 0.0011371213477104902,
"learning_rate": 4.41349983215332e-05,
"lookahead_loss": 6.888408424377442,
"loss": 0.6244,
"step": 61500
},
{
"base_loss": 0.6007498300671578,
"epoch": 0.118255615234375,
"grad_norm": 0.001127147930674255,
"learning_rate": 4.4087314605712893e-05,
"lookahead_loss": 6.916694809913635,
"loss": 0.6107,
"step": 62000
},
{
"base_loss": 0.5840388324260711,
"epoch": 0.11920928955078125,
"grad_norm": 0.0011241508182138205,
"learning_rate": 4.403963088989258e-05,
"lookahead_loss": 6.910943949699402,
"loss": 0.5963,
"step": 62500
},
{
"base_loss": 0.5946898341774941,
"epoch": 1.0009536743164062,
"grad_norm": 0.0011088403407484293,
"learning_rate": 4.399194717407227e-05,
"lookahead_loss": 6.968314840316772,
"loss": 0.6022,
"step": 63000
},
{
"base_loss": 0.5878614686727524,
"epoch": 1.0019073486328125,
"grad_norm": 0.0011633536778390408,
"learning_rate": 4.394426345825196e-05,
"lookahead_loss": 6.820799809455871,
"loss": 0.5996,
"step": 63500
},
{
"base_loss": 0.6057377905845642,
"epoch": 1.0028610229492188,
"grad_norm": 0.001116783358156681,
"learning_rate": 4.389657974243164e-05,
"lookahead_loss": 6.83216045665741,
"loss": 0.6149,
"step": 64000
},
{
"base_loss": 0.6151153823137283,
"epoch": 1.003814697265625,
"grad_norm": 0.0011050739558413625,
"learning_rate": 4.384889602661133e-05,
"lookahead_loss": 6.836852411270142,
"loss": 0.6237,
"step": 64500
},
{
"base_loss": 0.5990258244276047,
"epoch": 1.0047683715820312,
"grad_norm": 0.0011015802156180143,
"learning_rate": 4.3801212310791014e-05,
"lookahead_loss": 6.826987672805786,
"loss": 0.6094,
"step": 65000
},
{
"epoch": 1.0047683715820312,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.850588269888783,
"eval_lookahead_perplexity": 944.4363267400045,
"eval_loss": 0.2082662284374237,
"eval_perplexity": 1.2315409971437101,
"eval_runtime": 100.2414,
"eval_samples_per_second": 49.88,
"eval_steps_per_second": 1.566,
"step": 65000
},
{
"base_loss": 0.5899734389781952,
"epoch": 1.0057220458984375,
"grad_norm": 0.0010623879497870803,
"learning_rate": 4.3753528594970705e-05,
"lookahead_loss": 6.945034725189209,
"loss": 0.6,
"step": 65500
},
{
"base_loss": 0.5821795570850372,
"epoch": 1.0066757202148438,
"grad_norm": 0.001107605523429811,
"learning_rate": 4.3705844879150395e-05,
"lookahead_loss": 6.779200302124023,
"loss": 0.5982,
"step": 66000
},
{
"base_loss": 0.6049286904931068,
"epoch": 1.00762939453125,
"grad_norm": 0.0010999958030879498,
"learning_rate": 4.365816116333008e-05,
"lookahead_loss": 6.845924237251282,
"loss": 0.6171,
"step": 66500
},
{
"base_loss": 0.6021122798919678,
"epoch": 1.0085830688476562,
"grad_norm": 0.0010436498560011387,
"learning_rate": 4.361047744750977e-05,
"lookahead_loss": 6.854257493972779,
"loss": 0.6084,
"step": 67000
},
{
"base_loss": 0.5890477049946785,
"epoch": 1.0095367431640625,
"grad_norm": 0.001144828856922686,
"learning_rate": 4.356279373168945e-05,
"lookahead_loss": 6.862697887420654,
"loss": 0.6064,
"step": 67500
},
{
"base_loss": 0.5928993408083916,
"epoch": 1.0104904174804688,
"grad_norm": 0.0011384448735043406,
"learning_rate": 4.351511001586914e-05,
"lookahead_loss": 6.802295356750489,
"loss": 0.6045,
"step": 68000
},
{
"base_loss": 0.5893295911550522,
"epoch": 1.011444091796875,
"grad_norm": 0.0011518702376633883,
"learning_rate": 4.346742630004883e-05,
"lookahead_loss": 6.8328519544601445,
"loss": 0.6061,
"step": 68500
},
{
"base_loss": 0.6088726551532745,
"epoch": 1.0123977661132812,
"grad_norm": 0.0010904420632869005,
"learning_rate": 4.3419742584228516e-05,
"lookahead_loss": 6.814763288497925,
"loss": 0.6216,
"step": 69000
},
{
"base_loss": 0.5975775923132897,
"epoch": 1.0133514404296875,
"grad_norm": 0.0011171442456543446,
"learning_rate": 4.3372058868408206e-05,
"lookahead_loss": 6.899648434638977,
"loss": 0.6108,
"step": 69500
},
{
"base_loss": 0.595933021903038,
"epoch": 1.0143051147460938,
"grad_norm": 0.0010974474716931581,
"learning_rate": 4.332437515258789e-05,
"lookahead_loss": 6.864955189704895,
"loss": 0.6082,
"step": 70000
},
{
"epoch": 1.0143051147460938,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.82464911914862,
"eval_lookahead_perplexity": 920.2534480815124,
"eval_loss": 0.20821362733840942,
"eval_perplexity": 1.2314762184375097,
"eval_runtime": 96.8797,
"eval_samples_per_second": 51.61,
"eval_steps_per_second": 1.621,
"step": 70000
},
{
"base_loss": 0.5827738286256791,
"epoch": 1.0152587890625,
"grad_norm": 0.0011384559329599142,
"learning_rate": 4.327669143676758e-05,
"lookahead_loss": 6.803851629257202,
"loss": 0.5907,
"step": 70500
},
{
"base_loss": 0.6107082022428513,
"epoch": 1.0162124633789062,
"grad_norm": 0.0011207167990505695,
"learning_rate": 4.322900772094727e-05,
"lookahead_loss": 6.853131731986999,
"loss": 0.6182,
"step": 71000
},
{
"base_loss": 0.6065192295908928,
"epoch": 1.0171661376953125,
"grad_norm": 0.0011044219136238098,
"learning_rate": 4.318132400512695e-05,
"lookahead_loss": 6.888585398674011,
"loss": 0.6136,
"step": 71500
},
{
"base_loss": 0.5948996670246124,
"epoch": 1.0181198120117188,
"grad_norm": 0.0010809814557433128,
"learning_rate": 4.3133640289306643e-05,
"lookahead_loss": 6.898890166282654,
"loss": 0.6055,
"step": 72000
},
{
"base_loss": 0.5865646304488182,
"epoch": 1.019073486328125,
"grad_norm": 0.0010813730768859386,
"learning_rate": 4.308595657348633e-05,
"lookahead_loss": 6.91842933177948,
"loss": 0.5989,
"step": 72500
},
{
"base_loss": 0.5887798971533775,
"epoch": 1.0200271606445312,
"grad_norm": 0.0011451997561380267,
"learning_rate": 4.303827285766602e-05,
"lookahead_loss": 6.75454776763916,
"loss": 0.6052,
"step": 73000
},
{
"base_loss": 0.6147837865948677,
"epoch": 1.0209808349609375,
"grad_norm": 0.0010907722171396017,
"learning_rate": 4.299058914184571e-05,
"lookahead_loss": 6.805963615417481,
"loss": 0.6247,
"step": 73500
},
{
"base_loss": 0.5963076213002205,
"epoch": 1.0219345092773438,
"grad_norm": 0.0011470620520412922,
"learning_rate": 4.294290542602539e-05,
"lookahead_loss": 6.7863368434906,
"loss": 0.6097,
"step": 74000
},
{
"base_loss": 0.5970929145216942,
"epoch": 1.02288818359375,
"grad_norm": 0.0011032413458451629,
"learning_rate": 4.289522171020508e-05,
"lookahead_loss": 6.821732865333557,
"loss": 0.6062,
"step": 74500
},
{
"base_loss": 0.5953870372772216,
"epoch": 1.0238418579101562,
"grad_norm": 0.0011107485042884946,
"learning_rate": 4.2847537994384764e-05,
"lookahead_loss": 6.788943789482117,
"loss": 0.6047,
"step": 75000
},
{
"epoch": 1.0238418579101562,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.798898217015373,
"eval_lookahead_perplexity": 896.8586035415678,
"eval_loss": 0.20816253125667572,
"eval_perplexity": 1.2314132964355469,
"eval_runtime": 91.6696,
"eval_samples_per_second": 54.544,
"eval_steps_per_second": 1.713,
"step": 75000
},
{
"base_loss": 0.6108319897651673,
"epoch": 1.0247955322265625,
"grad_norm": 0.0010732373921200633,
"learning_rate": 4.2799854278564455e-05,
"lookahead_loss": 6.784094378471375,
"loss": 0.6248,
"step": 75500
},
{
"base_loss": 0.6046044981479645,
"epoch": 1.0257492065429688,
"grad_norm": 0.001091954531148076,
"learning_rate": 4.2752170562744145e-05,
"lookahead_loss": 6.748379487991333,
"loss": 0.615,
"step": 76000
},
{
"base_loss": 0.5950362936258317,
"epoch": 1.026702880859375,
"grad_norm": 0.0011224595364183187,
"learning_rate": 4.270448684692383e-05,
"lookahead_loss": 6.760618274688721,
"loss": 0.6056,
"step": 76500
},
{
"base_loss": 0.5943114874362946,
"epoch": 1.0276565551757812,
"grad_norm": 0.0011067682644352317,
"learning_rate": 4.265680313110352e-05,
"lookahead_loss": 6.873536200523376,
"loss": 0.6063,
"step": 77000
},
{
"base_loss": 0.6171733926534653,
"epoch": 1.0286102294921875,
"grad_norm": 0.001133575802668929,
"learning_rate": 4.26091194152832e-05,
"lookahead_loss": 6.882242550849915,
"loss": 0.6266,
"step": 77500
},
{
"base_loss": 0.6017211389541626,
"epoch": 1.0295639038085938,
"grad_norm": 0.001115851104259491,
"learning_rate": 4.256143569946289e-05,
"lookahead_loss": 6.8330844841003415,
"loss": 0.6123,
"step": 78000
},
{
"base_loss": 0.5937362365722656,
"epoch": 1.030517578125,
"grad_norm": 0.0011238973820582032,
"learning_rate": 4.251375198364258e-05,
"lookahead_loss": 6.8106466889381405,
"loss": 0.6071,
"step": 78500
},
{
"base_loss": 0.5903627701997757,
"epoch": 1.0314712524414062,
"grad_norm": 0.0010951296426355839,
"learning_rate": 4.2466068267822266e-05,
"lookahead_loss": 6.845707628250122,
"loss": 0.6039,
"step": 79000
},
{
"base_loss": 0.6103688189387322,
"epoch": 1.0324249267578125,
"grad_norm": 0.001133197103627026,
"learning_rate": 4.2418384552001956e-05,
"lookahead_loss": 6.768160277366638,
"loss": 0.6277,
"step": 79500
},
{
"base_loss": 0.5958747680187225,
"epoch": 1.0333786010742188,
"grad_norm": 0.001138185732997954,
"learning_rate": 4.237070083618164e-05,
"lookahead_loss": 6.823498950004578,
"loss": 0.6081,
"step": 80000
},
{
"epoch": 1.0333786010742188,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.777561352275812,
"eval_lookahead_perplexity": 877.9251612762528,
"eval_loss": 0.20812006294727325,
"eval_perplexity": 1.231361001505118,
"eval_runtime": 106.9612,
"eval_samples_per_second": 46.746,
"eval_steps_per_second": 1.468,
"step": 80000
},
{
"base_loss": 0.5922364763021469,
"epoch": 1.034332275390625,
"grad_norm": 0.001084625837393105,
"learning_rate": 4.232301712036133e-05,
"lookahead_loss": 6.893566377639771,
"loss": 0.6076,
"step": 80500
},
{
"base_loss": 0.5894868444204331,
"epoch": 1.0352859497070312,
"grad_norm": 0.0011169870849698782,
"learning_rate": 4.227533340454102e-05,
"lookahead_loss": 6.730395976066589,
"loss": 0.6035,
"step": 81000
},
{
"base_loss": 0.6110191858410835,
"epoch": 1.0362396240234375,
"grad_norm": 0.0011154419044032693,
"learning_rate": 4.22276496887207e-05,
"lookahead_loss": 6.848829930305481,
"loss": 0.6219,
"step": 81500
},
{
"base_loss": 0.6014680997133255,
"epoch": 1.0371932983398438,
"grad_norm": 0.0011206173803657293,
"learning_rate": 4.2179965972900393e-05,
"lookahead_loss": 6.780662053108215,
"loss": 0.611,
"step": 82000
},
{
"base_loss": 0.5883948777914048,
"epoch": 1.03814697265625,
"grad_norm": 0.0010986519046127796,
"learning_rate": 4.213228225708008e-05,
"lookahead_loss": 6.795178040504456,
"loss": 0.6017,
"step": 82500
},
{
"base_loss": 0.6005192502140999,
"epoch": 1.0391006469726562,
"grad_norm": 0.0011019845260307193,
"learning_rate": 4.208459854125977e-05,
"lookahead_loss": 6.752805541038513,
"loss": 0.6141,
"step": 83000
},
{
"base_loss": 0.6166067426204681,
"epoch": 1.0400543212890625,
"grad_norm": 0.0011135574895888567,
"learning_rate": 4.203691482543946e-05,
"lookahead_loss": 6.774759250640869,
"loss": 0.6253,
"step": 83500
},
{
"base_loss": 0.6028286694288254,
"epoch": 1.0410079956054688,
"grad_norm": 0.0011277147568762302,
"learning_rate": 4.198923110961914e-05,
"lookahead_loss": 6.730431209564209,
"loss": 0.6126,
"step": 84000
},
{
"base_loss": 0.5807398597002029,
"epoch": 1.041961669921875,
"grad_norm": 0.0011186593910679221,
"learning_rate": 4.194154739379883e-05,
"lookahead_loss": 6.8060649909973145,
"loss": 0.5979,
"step": 84500
},
{
"base_loss": 0.605569171845913,
"epoch": 1.0429153442382812,
"grad_norm": 0.0010821224423125386,
"learning_rate": 4.1893863677978514e-05,
"lookahead_loss": 6.820508779525757,
"loss": 0.6175,
"step": 85000
},
{
"epoch": 1.0429153442382812,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.75723210538919,
"eval_lookahead_perplexity": 860.2577945124659,
"eval_loss": 0.20807963609695435,
"eval_perplexity": 1.2313112224644334,
"eval_runtime": 93.1866,
"eval_samples_per_second": 53.656,
"eval_steps_per_second": 1.685,
"step": 85000
},
{
"base_loss": 0.6117001739740372,
"epoch": 1.0438690185546875,
"grad_norm": 0.001131902332417667,
"learning_rate": 4.1846179962158205e-05,
"lookahead_loss": 6.828276200294495,
"loss": 0.6253,
"step": 85500
},
{
"base_loss": 0.5927367950081825,
"epoch": 1.0448226928710938,
"grad_norm": 0.0011309866094961762,
"learning_rate": 4.1798496246337895e-05,
"lookahead_loss": 6.777775590896606,
"loss": 0.6043,
"step": 86000
},
{
"base_loss": 0.5924578613042831,
"epoch": 1.0457763671875,
"grad_norm": 0.001088097458705306,
"learning_rate": 4.175081253051758e-05,
"lookahead_loss": 6.762310373783111,
"loss": 0.6038,
"step": 86500
},
{
"base_loss": 0.6208472669720649,
"epoch": 1.0467300415039062,
"grad_norm": 0.001117968698963523,
"learning_rate": 4.170312881469727e-05,
"lookahead_loss": 6.744260499000549,
"loss": 0.629,
"step": 87000
},
{
"base_loss": 0.6022886065840721,
"epoch": 1.0476837158203125,
"grad_norm": 0.001148878363892436,
"learning_rate": 4.165544509887695e-05,
"lookahead_loss": 6.774776460647583,
"loss": 0.6215,
"step": 87500
},
{
"base_loss": 0.588396491408348,
"epoch": 1.0486373901367188,
"grad_norm": 0.0010915439343079925,
"learning_rate": 4.160776138305664e-05,
"lookahead_loss": 6.75317128944397,
"loss": 0.6012,
"step": 88000
},
{
"base_loss": 0.5997280370593071,
"epoch": 1.049591064453125,
"grad_norm": 0.0011360279750078917,
"learning_rate": 4.156007766723633e-05,
"lookahead_loss": 6.72175373840332,
"loss": 0.6095,
"step": 88500
},
{
"base_loss": 0.6170587275028229,
"epoch": 1.0505447387695312,
"grad_norm": 0.0010587567230686545,
"learning_rate": 4.1512393951416016e-05,
"lookahead_loss": 6.816565957069397,
"loss": 0.6288,
"step": 89000
},
{
"base_loss": 0.5966709926724434,
"epoch": 1.0514984130859375,
"grad_norm": 0.0011491916375234723,
"learning_rate": 4.1464710235595706e-05,
"lookahead_loss": 6.767929425239563,
"loss": 0.6097,
"step": 89500
},
{
"base_loss": 0.592747309923172,
"epoch": 1.0524520874023438,
"grad_norm": 0.0011455725179985166,
"learning_rate": 4.141702651977539e-05,
"lookahead_loss": 6.712255210876465,
"loss": 0.606,
"step": 90000
},
{
"epoch": 1.0524520874023438,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.738918094208446,
"eval_lookahead_perplexity": 844.6464134883248,
"eval_loss": 0.2080424576997757,
"eval_perplexity": 1.2312654451377232,
"eval_runtime": 93.0744,
"eval_samples_per_second": 53.72,
"eval_steps_per_second": 1.687,
"step": 90000
},
{
"base_loss": 0.6108891260027886,
"epoch": 1.05340576171875,
"grad_norm": 0.0010910800192505121,
"learning_rate": 4.136934280395508e-05,
"lookahead_loss": 6.70730511379242,
"loss": 0.6205,
"step": 90500
},
{
"base_loss": 0.6110858068466186,
"epoch": 1.0543594360351562,
"grad_norm": 0.001111338846385479,
"learning_rate": 4.132165908813477e-05,
"lookahead_loss": 6.761465684890747,
"loss": 0.6261,
"step": 91000
},
{
"base_loss": 0.5958408567905426,
"epoch": 1.0553131103515625,
"grad_norm": 0.0011252695694565773,
"learning_rate": 4.127397537231445e-05,
"lookahead_loss": 6.779712849617004,
"loss": 0.6066,
"step": 91500
},
{
"base_loss": 0.5900192571878433,
"epoch": 1.0562667846679688,
"grad_norm": 0.0010718937264755368,
"learning_rate": 4.1226291656494143e-05,
"lookahead_loss": 6.779324295997619,
"loss": 0.6041,
"step": 92000
},
{
"base_loss": 0.6141881394386292,
"epoch": 1.057220458984375,
"grad_norm": 0.0011070650070905685,
"learning_rate": 4.117860794067383e-05,
"lookahead_loss": 6.778371848106384,
"loss": 0.6279,
"step": 92500
},
{
"base_loss": 0.6065550698041916,
"epoch": 1.0581741333007812,
"grad_norm": 0.0011338784825056791,
"learning_rate": 4.113092422485352e-05,
"lookahead_loss": 6.767937861442566,
"loss": 0.6173,
"step": 93000
},
{
"base_loss": 0.5854485256075859,
"epoch": 1.0591278076171875,
"grad_norm": 0.0011064207646995783,
"learning_rate": 4.108324050903321e-05,
"lookahead_loss": 6.672726812362671,
"loss": 0.6006,
"step": 93500
},
{
"base_loss": 0.5929028804898262,
"epoch": 1.0600814819335938,
"grad_norm": 0.0011152655351907015,
"learning_rate": 4.103555679321289e-05,
"lookahead_loss": 6.745905955314636,
"loss": 0.6068,
"step": 94000
},
{
"base_loss": 0.6067430639863014,
"epoch": 1.06103515625,
"grad_norm": 0.001127126393839717,
"learning_rate": 4.098787307739258e-05,
"lookahead_loss": 6.6982609925270085,
"loss": 0.617,
"step": 94500
},
{
"base_loss": 0.6029584980010987,
"epoch": 1.0619888305664062,
"grad_norm": 0.0011526525486260653,
"learning_rate": 4.0940189361572264e-05,
"lookahead_loss": 6.715296206474304,
"loss": 0.6149,
"step": 95000
},
{
"epoch": 1.0619888305664062,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.7193962986857745,
"eval_lookahead_perplexity": 828.3173042636047,
"eval_loss": 0.20800456404685974,
"eval_perplexity": 1.2312187888762913,
"eval_runtime": 90.4107,
"eval_samples_per_second": 55.303,
"eval_steps_per_second": 1.737,
"step": 95000
},
{
"base_loss": 0.5950580761432648,
"epoch": 1.0629425048828125,
"grad_norm": 0.0011395127512514591,
"learning_rate": 4.0892505645751955e-05,
"lookahead_loss": 6.710915733337402,
"loss": 0.6065,
"step": 95500
},
{
"base_loss": 0.6125950453877449,
"epoch": 1.0638961791992188,
"grad_norm": 0.0010708384215831757,
"learning_rate": 4.0844821929931645e-05,
"lookahead_loss": 6.735831215858459,
"loss": 0.6241,
"step": 96000
},
{
"base_loss": 0.605140404343605,
"epoch": 1.064849853515625,
"grad_norm": 0.0011066367151215672,
"learning_rate": 4.079713821411133e-05,
"lookahead_loss": 6.77455454158783,
"loss": 0.616,
"step": 96500
},
{
"base_loss": 0.6022319710254669,
"epoch": 1.0658035278320312,
"grad_norm": 0.0011385679244995117,
"learning_rate": 4.074945449829102e-05,
"lookahead_loss": 6.687186507225037,
"loss": 0.6135,
"step": 97000
},
{
"base_loss": 0.6027357627749443,
"epoch": 1.0667572021484375,
"grad_norm": 0.0011094497749581933,
"learning_rate": 4.07017707824707e-05,
"lookahead_loss": 6.668306805610657,
"loss": 0.6111,
"step": 97500
},
{
"base_loss": 0.6161690940260887,
"epoch": 1.0677108764648438,
"grad_norm": 0.0011532640783116221,
"learning_rate": 4.065408706665039e-05,
"lookahead_loss": 6.747381279945373,
"loss": 0.6264,
"step": 98000
},
{
"base_loss": 0.5982645556926728,
"epoch": 1.06866455078125,
"grad_norm": 0.0010863294592127204,
"learning_rate": 4.060640335083008e-05,
"lookahead_loss": 6.722812586784363,
"loss": 0.6099,
"step": 98500
},
{
"base_loss": 0.5865320681333542,
"epoch": 1.0696182250976562,
"grad_norm": 0.0011132799554616213,
"learning_rate": 4.0558719635009766e-05,
"lookahead_loss": 6.748574607849121,
"loss": 0.6003,
"step": 99000
},
{
"base_loss": 0.6179066747426987,
"epoch": 1.0705718994140625,
"grad_norm": 0.0011427431600168347,
"learning_rate": 4.0511035919189456e-05,
"lookahead_loss": 6.636237494945526,
"loss": 0.6297,
"step": 99500
},
{
"base_loss": 0.5993645028471947,
"epoch": 1.0715255737304688,
"grad_norm": 0.0011170258512720466,
"learning_rate": 4.046335220336914e-05,
"lookahead_loss": 6.691766156196595,
"loss": 0.6122,
"step": 100000
},
{
"epoch": 1.0715255737304688,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.701932117961847,
"eval_lookahead_perplexity": 813.9770064196388,
"eval_loss": 0.2079702615737915,
"eval_perplexity": 1.2311765557512993,
"eval_runtime": 93.3483,
"eval_samples_per_second": 53.563,
"eval_steps_per_second": 1.682,
"step": 100000
},
{
"base_loss": 0.5923813906908035,
"epoch": 1.072479248046875,
"grad_norm": 0.0011480902321636677,
"learning_rate": 4.041566848754883e-05,
"lookahead_loss": 6.744957288742065,
"loss": 0.6046,
"step": 100500
},
{
"base_loss": 0.5912299656271934,
"epoch": 1.0734329223632812,
"grad_norm": 0.0010534238535910845,
"learning_rate": 4.036798477172852e-05,
"lookahead_loss": 6.693481325149536,
"loss": 0.6037,
"step": 101000
},
{
"base_loss": 0.6128248473405838,
"epoch": 1.0743865966796875,
"grad_norm": 0.001108926022425294,
"learning_rate": 4.03203010559082e-05,
"lookahead_loss": 6.697817398071289,
"loss": 0.6247,
"step": 101500
},
{
"base_loss": 0.5946393350362777,
"epoch": 1.0753402709960938,
"grad_norm": 0.001114795682951808,
"learning_rate": 4.0272617340087893e-05,
"lookahead_loss": 6.764534550666809,
"loss": 0.6062,
"step": 102000
},
{
"base_loss": 0.5858605382442474,
"epoch": 1.0762939453125,
"grad_norm": 0.001073968131095171,
"learning_rate": 4.022493362426758e-05,
"lookahead_loss": 6.726330715179444,
"loss": 0.5967,
"step": 102500
},
{
"base_loss": 0.6261867806911469,
"epoch": 1.0772476196289062,
"grad_norm": 0.001089173019863665,
"learning_rate": 4.017724990844727e-05,
"lookahead_loss": 6.691923519134521,
"loss": 0.6366,
"step": 103000
},
{
"base_loss": 0.5926554707288743,
"epoch": 1.0782012939453125,
"grad_norm": 0.0011284619104117155,
"learning_rate": 4.012956619262696e-05,
"lookahead_loss": 6.703181129455566,
"loss": 0.6086,
"step": 103500
},
{
"base_loss": 0.5974663733839989,
"epoch": 1.0791549682617188,
"grad_norm": 0.0010861388873308897,
"learning_rate": 4.008188247680664e-05,
"lookahead_loss": 6.74266376209259,
"loss": 0.6072,
"step": 104000
},
{
"base_loss": 0.6090346719622612,
"epoch": 1.080108642578125,
"grad_norm": 0.0010843543568626046,
"learning_rate": 4.003419876098633e-05,
"lookahead_loss": 6.690739940643311,
"loss": 0.6216,
"step": 104500
},
{
"base_loss": 0.6080623995661736,
"epoch": 1.0810623168945312,
"grad_norm": 0.00115968135651201,
"learning_rate": 3.9986515045166014e-05,
"lookahead_loss": 6.721202290534973,
"loss": 0.6199,
"step": 105000
},
{
"epoch": 1.0810623168945312,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.683899407188733,
"eval_lookahead_perplexity": 799.4303465286481,
"eval_loss": 0.20793530344963074,
"eval_perplexity": 1.231133516880683,
"eval_runtime": 92.6887,
"eval_samples_per_second": 53.944,
"eval_steps_per_second": 1.694,
"step": 105000
},
{
"base_loss": 0.5944187866449356,
"epoch": 1.0820159912109375,
"grad_norm": 0.0010683005675673485,
"learning_rate": 3.9938831329345705e-05,
"lookahead_loss": 6.727095356464386,
"loss": 0.6054,
"step": 105500
},
{
"base_loss": 0.5926255500912666,
"epoch": 1.0829696655273438,
"grad_norm": 0.001137380488216877,
"learning_rate": 3.9891147613525395e-05,
"lookahead_loss": 6.750259411811829,
"loss": 0.6044,
"step": 106000
},
{
"base_loss": 0.6189518148899078,
"epoch": 1.08392333984375,
"grad_norm": 0.0010804138146340847,
"learning_rate": 3.984346389770508e-05,
"lookahead_loss": 6.7454378662109375,
"loss": 0.629,
"step": 106500
},
{
"base_loss": 0.5973528184294701,
"epoch": 1.0848770141601562,
"grad_norm": 0.0011135113891214132,
"learning_rate": 3.979578018188477e-05,
"lookahead_loss": 6.6877857160568235,
"loss": 0.6073,
"step": 107000
},
{
"base_loss": 0.5883546487689019,
"epoch": 1.0858306884765625,
"grad_norm": 0.0011390469735488296,
"learning_rate": 3.974809646606445e-05,
"lookahead_loss": 6.7175205068588255,
"loss": 0.5988,
"step": 107500
},
{
"base_loss": 0.5933809608817101,
"epoch": 1.0867843627929688,
"grad_norm": 0.0012121612671762705,
"learning_rate": 3.970041275024414e-05,
"lookahead_loss": 6.67557014465332,
"loss": 0.6058,
"step": 108000
},
{
"base_loss": 0.6234395582079887,
"epoch": 1.087738037109375,
"grad_norm": 0.0010294954990968108,
"learning_rate": 3.965272903442383e-05,
"lookahead_loss": 6.695505553245544,
"loss": 0.6294,
"step": 108500
},
{
"base_loss": 0.5963783563375473,
"epoch": 1.0886917114257812,
"grad_norm": 0.0011363314697518945,
"learning_rate": 3.9605045318603516e-05,
"lookahead_loss": 6.69392915725708,
"loss": 0.6053,
"step": 109000
},
{
"base_loss": 0.5991918464303017,
"epoch": 1.0896453857421875,
"grad_norm": 0.00112410937435925,
"learning_rate": 3.9557361602783206e-05,
"lookahead_loss": 6.717226490974427,
"loss": 0.6083,
"step": 109500
},
{
"base_loss": 0.5880093929767609,
"epoch": 1.0905990600585938,
"grad_norm": 0.0010848381789401174,
"learning_rate": 3.950967788696289e-05,
"lookahead_loss": 6.691950410842895,
"loss": 0.5994,
"step": 110000
},
{
"epoch": 1.0905990600585938,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.667511131055058,
"eval_lookahead_perplexity": 786.4358309477227,
"eval_loss": 0.20790287852287292,
"eval_perplexity": 1.231093598113754,
"eval_runtime": 90.3464,
"eval_samples_per_second": 55.343,
"eval_steps_per_second": 1.738,
"step": 110000
},
{
"base_loss": 0.5782296355366707,
"epoch": 1.091552734375,
"grad_norm": 0.0010654990328475833,
"learning_rate": 3.946199417114258e-05,
"lookahead_loss": 6.665598778724671,
"loss": 0.5917,
"step": 110500
},
{
"base_loss": 0.617558573782444,
"epoch": 1.0925064086914062,
"grad_norm": 0.001140857464633882,
"learning_rate": 3.941431045532227e-05,
"lookahead_loss": 6.707964798927307,
"loss": 0.6283,
"step": 111000
},
{
"base_loss": 0.5982673740983009,
"epoch": 1.0934600830078125,
"grad_norm": 0.001054911408573389,
"learning_rate": 3.936662673950195e-05,
"lookahead_loss": 6.694295763015747,
"loss": 0.6126,
"step": 111500
},
{
"base_loss": 0.5841596345305443,
"epoch": 1.0944137573242188,
"grad_norm": 0.0011086445301771164,
"learning_rate": 3.9318943023681643e-05,
"lookahead_loss": 6.687951999664307,
"loss": 0.597,
"step": 112000
},
{
"base_loss": 0.5796255503892899,
"epoch": 1.095367431640625,
"grad_norm": 0.0010867841774597764,
"learning_rate": 3.927125930786133e-05,
"lookahead_loss": 6.6346531310081485,
"loss": 0.5945,
"step": 112500
},
{
"base_loss": 0.5978616480827331,
"epoch": 1.0963211059570312,
"grad_norm": 0.001112610101699829,
"learning_rate": 3.922357559204102e-05,
"lookahead_loss": 6.685487722396851,
"loss": 0.6119,
"step": 113000
},
{
"base_loss": 0.6154543727636337,
"epoch": 1.0972747802734375,
"grad_norm": 0.0011152740335091949,
"learning_rate": 3.917589187622071e-05,
"lookahead_loss": 6.701974026679992,
"loss": 0.6251,
"step": 113500
},
{
"base_loss": 0.596605758190155,
"epoch": 1.0982284545898438,
"grad_norm": 0.0011151140788570046,
"learning_rate": 3.912820816040039e-05,
"lookahead_loss": 6.645841445922851,
"loss": 0.6064,
"step": 114000
},
{
"base_loss": 0.5833840205669403,
"epoch": 1.09918212890625,
"grad_norm": 0.0011117896065115929,
"learning_rate": 3.908052444458008e-05,
"lookahead_loss": 6.711025802612305,
"loss": 0.5967,
"step": 114500
},
{
"base_loss": 0.586369781255722,
"epoch": 1.1001358032226562,
"grad_norm": 0.0011113210348412395,
"learning_rate": 3.9032840728759764e-05,
"lookahead_loss": 6.68584174823761,
"loss": 0.6021,
"step": 115000
},
{
"epoch": 1.1001358032226562,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.653558318226482,
"eval_lookahead_perplexity": 775.5390362479169,
"eval_loss": 0.20787346363067627,
"eval_perplexity": 1.2310573861608696,
"eval_runtime": 93.7117,
"eval_samples_per_second": 53.355,
"eval_steps_per_second": 1.675,
"step": 115000
},
{
"base_loss": 0.6109204713702202,
"epoch": 1.1010894775390625,
"grad_norm": 0.0011247434886172414,
"learning_rate": 3.8985157012939455e-05,
"lookahead_loss": 6.729825895309448,
"loss": 0.6217,
"step": 115500
},
{
"base_loss": 0.6036947486400605,
"epoch": 1.1020431518554688,
"grad_norm": 0.001134915859438479,
"learning_rate": 3.8937473297119145e-05,
"lookahead_loss": 6.710568108081818,
"loss": 0.609,
"step": 116000
},
{
"base_loss": 0.5892721264362335,
"epoch": 1.102996826171875,
"grad_norm": 0.001085427007637918,
"learning_rate": 3.888978958129883e-05,
"lookahead_loss": 6.689410036087036,
"loss": 0.6033,
"step": 116500
},
{
"base_loss": 0.5901298764944076,
"epoch": 1.1039505004882812,
"grad_norm": 0.0011025239946320653,
"learning_rate": 3.884210586547852e-05,
"lookahead_loss": 6.702957942962646,
"loss": 0.6024,
"step": 117000
},
{
"base_loss": 0.6100293419957161,
"epoch": 1.1049041748046875,
"grad_norm": 0.0010662467684596777,
"learning_rate": 3.87944221496582e-05,
"lookahead_loss": 6.645748790740967,
"loss": 0.6218,
"step": 117500
},
{
"base_loss": 0.602484605550766,
"epoch": 1.1058578491210938,
"grad_norm": 0.001067397533915937,
"learning_rate": 3.874673843383789e-05,
"lookahead_loss": 6.646303733825683,
"loss": 0.6147,
"step": 118000
},
{
"base_loss": 0.5962947644591331,
"epoch": 1.1068115234375,
"grad_norm": 0.0010932920267805457,
"learning_rate": 3.869905471801758e-05,
"lookahead_loss": 6.713128123283386,
"loss": 0.6065,
"step": 118500
},
{
"base_loss": 0.5839503274559975,
"epoch": 1.1077651977539062,
"grad_norm": 0.0011347734834998846,
"learning_rate": 3.8651371002197266e-05,
"lookahead_loss": 6.62405997467041,
"loss": 0.5955,
"step": 119000
},
{
"base_loss": 0.6058702719211578,
"epoch": 1.1087188720703125,
"grad_norm": 0.0011517951497808099,
"learning_rate": 3.8603687286376956e-05,
"lookahead_loss": 6.613666387557983,
"loss": 0.6193,
"step": 119500
},
{
"base_loss": 0.6073802384138107,
"epoch": 1.1096725463867188,
"grad_norm": 0.0010947365080937743,
"learning_rate": 3.855600357055664e-05,
"lookahead_loss": 6.604139113426209,
"loss": 0.6186,
"step": 120000
},
{
"epoch": 1.1096725463867188,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.638637524443313,
"eval_lookahead_perplexity": 764.0532796091819,
"eval_loss": 0.20784424245357513,
"eval_perplexity": 1.2310214137405475,
"eval_runtime": 92.4064,
"eval_samples_per_second": 54.109,
"eval_steps_per_second": 1.699,
"step": 120000
},
{
"base_loss": 0.5944828372001648,
"epoch": 1.110626220703125,
"grad_norm": 0.0010918062180280685,
"learning_rate": 3.850831985473633e-05,
"lookahead_loss": 6.5845939102172855,
"loss": 0.6062,
"step": 120500
},
{
"base_loss": 0.5857415299415588,
"epoch": 1.1115798950195312,
"grad_norm": 0.0010996379423886538,
"learning_rate": 3.846063613891602e-05,
"lookahead_loss": 6.665441793441772,
"loss": 0.598,
"step": 121000
},
{
"base_loss": 0.6081982196569443,
"epoch": 1.1125335693359375,
"grad_norm": 0.001094916253350675,
"learning_rate": 3.84129524230957e-05,
"lookahead_loss": 6.685654604434967,
"loss": 0.6153,
"step": 121500
},
{
"base_loss": 0.6173637208938598,
"epoch": 1.1134872436523438,
"grad_norm": 0.0011174079263582826,
"learning_rate": 3.8365268707275393e-05,
"lookahead_loss": 6.6888926963806155,
"loss": 0.6297,
"step": 122000
},
{
"base_loss": 0.595946085691452,
"epoch": 1.11444091796875,
"grad_norm": 0.001059638219885528,
"learning_rate": 3.831758499145508e-05,
"lookahead_loss": 6.576348360061646,
"loss": 0.6061,
"step": 122500
},
{
"base_loss": 0.5870628617405892,
"epoch": 1.1153945922851562,
"grad_norm": 0.0011298077879473567,
"learning_rate": 3.826990127563477e-05,
"lookahead_loss": 6.627452656745911,
"loss": 0.5984,
"step": 123000
},
{
"base_loss": 0.6015327024459839,
"epoch": 1.1163482666015625,
"grad_norm": 0.001181896193884313,
"learning_rate": 3.822221755981446e-05,
"lookahead_loss": 6.645281805038453,
"loss": 0.6155,
"step": 123500
},
{
"base_loss": 0.6103048238754273,
"epoch": 1.1173019409179688,
"grad_norm": 0.0011344418162479997,
"learning_rate": 3.817453384399414e-05,
"lookahead_loss": 6.633471826553345,
"loss": 0.6218,
"step": 124000
},
{
"base_loss": 0.6046528750061989,
"epoch": 1.118255615234375,
"grad_norm": 0.0011290363036096096,
"learning_rate": 3.812685012817383e-05,
"lookahead_loss": 6.66287439250946,
"loss": 0.6122,
"step": 124500
},
{
"base_loss": 0.5855171493887902,
"epoch": 1.1192092895507812,
"grad_norm": 0.001125651178881526,
"learning_rate": 3.8079166412353514e-05,
"lookahead_loss": 6.652395925998688,
"loss": 0.596,
"step": 125000
},
{
"epoch": 1.1192092895507812,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.624730955678434,
"eval_lookahead_perplexity": 753.5014599919688,
"eval_loss": 0.20781628787517548,
"eval_perplexity": 1.230987001536917,
"eval_runtime": 90.2053,
"eval_samples_per_second": 55.429,
"eval_steps_per_second": 1.74,
"step": 125000
},
{
"base_loss": 0.5973180630803108,
"epoch": 2.0009536743164062,
"grad_norm": 0.0011061604600399733,
"learning_rate": 3.8031482696533205e-05,
"lookahead_loss": 6.729214018821716,
"loss": 0.6027,
"step": 125500
},
{
"base_loss": 0.5872496640682221,
"epoch": 2.0019073486328125,
"grad_norm": 0.0011476138606667519,
"learning_rate": 3.7983798980712895e-05,
"lookahead_loss": 6.56571629524231,
"loss": 0.5971,
"step": 126000
},
{
"base_loss": 0.6028999392390251,
"epoch": 2.0028610229492188,
"grad_norm": 0.0011293648276478052,
"learning_rate": 3.793611526489258e-05,
"lookahead_loss": 6.571031596183777,
"loss": 0.6128,
"step": 126500
},
{
"base_loss": 0.6122851598262787,
"epoch": 2.003814697265625,
"grad_norm": 0.0010998975485563278,
"learning_rate": 3.788843154907227e-05,
"lookahead_loss": 6.580973098754883,
"loss": 0.622,
"step": 127000
},
{
"base_loss": 0.5994558810591698,
"epoch": 2.0047683715820312,
"grad_norm": 0.001085072522982955,
"learning_rate": 3.784074783325195e-05,
"lookahead_loss": 6.575087475776672,
"loss": 0.608,
"step": 127500
},
{
"base_loss": 0.5888918130993843,
"epoch": 2.0057220458984375,
"grad_norm": 0.0010583444964140654,
"learning_rate": 3.779306411743164e-05,
"lookahead_loss": 6.699651515007019,
"loss": 0.6002,
"step": 128000
},
{
"base_loss": 0.5821106826066971,
"epoch": 2.0066757202148438,
"grad_norm": 0.0010807913495227695,
"learning_rate": 3.774538040161133e-05,
"lookahead_loss": 6.536865455627441,
"loss": 0.599,
"step": 128500
},
{
"base_loss": 0.6039754400849342,
"epoch": 2.00762939453125,
"grad_norm": 0.0010995008051395416,
"learning_rate": 3.7697696685791016e-05,
"lookahead_loss": 6.604396286010743,
"loss": 0.6165,
"step": 129000
},
{
"base_loss": 0.6021587365865707,
"epoch": 2.0085830688476562,
"grad_norm": 0.0010323027381673455,
"learning_rate": 3.7650012969970706e-05,
"lookahead_loss": 6.605464251518249,
"loss": 0.6082,
"step": 129500
},
{
"base_loss": 0.5906935078501702,
"epoch": 2.0095367431640625,
"grad_norm": 0.0011525024892762303,
"learning_rate": 3.760232925415039e-05,
"lookahead_loss": 6.627658121109008,
"loss": 0.6063,
"step": 130000
},
{
"epoch": 2.0095367431640625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.611681720319266,
"eval_lookahead_perplexity": 743.7327180237362,
"eval_loss": 0.2077895551919937,
"eval_perplexity": 1.2309540943912542,
"eval_runtime": 91.3828,
"eval_samples_per_second": 54.715,
"eval_steps_per_second": 1.718,
"step": 130000
},
{
"base_loss": 0.597510245859623,
"epoch": 2.0104904174804688,
"grad_norm": 0.001104092923924327,
"learning_rate": 3.755464553833008e-05,
"lookahead_loss": 6.562650055885315,
"loss": 0.6071,
"step": 130500
},
{
"base_loss": 0.589246776163578,
"epoch": 2.011444091796875,
"grad_norm": 0.0011504783760756254,
"learning_rate": 3.750696182250977e-05,
"lookahead_loss": 6.588836015701294,
"loss": 0.6033,
"step": 131000
},
{
"base_loss": 0.610647314965725,
"epoch": 2.0123977661132812,
"grad_norm": 0.0011107546743005514,
"learning_rate": 3.745927810668945e-05,
"lookahead_loss": 6.588105909347534,
"loss": 0.6243,
"step": 131500
},
{
"base_loss": 0.5952036259770394,
"epoch": 2.0133514404296875,
"grad_norm": 0.0011159584391862154,
"learning_rate": 3.7411594390869143e-05,
"lookahead_loss": 6.66543052482605,
"loss": 0.6094,
"step": 132000
},
{
"base_loss": 0.5953016864061356,
"epoch": 2.0143051147460938,
"grad_norm": 0.0010857629822567105,
"learning_rate": 3.736391067504883e-05,
"lookahead_loss": 6.636122459411621,
"loss": 0.6068,
"step": 132500
},
{
"base_loss": 0.58618260627985,
"epoch": 2.0152587890625,
"grad_norm": 0.0011120929848402739,
"learning_rate": 3.731622695922852e-05,
"lookahead_loss": 6.585938324451447,
"loss": 0.5924,
"step": 133000
},
{
"base_loss": 0.6084697796702385,
"epoch": 2.0162124633789062,
"grad_norm": 0.0011215230915695429,
"learning_rate": 3.726854324340821e-05,
"lookahead_loss": 6.614871105194092,
"loss": 0.6169,
"step": 133500
},
{
"base_loss": 0.6040195283293724,
"epoch": 2.0171661376953125,
"grad_norm": 0.0011012755567207932,
"learning_rate": 3.722085952758789e-05,
"lookahead_loss": 6.664707413673401,
"loss": 0.612,
"step": 134000
},
{
"base_loss": 0.5937762448191642,
"epoch": 2.0181198120117188,
"grad_norm": 0.0010495241731405258,
"learning_rate": 3.717317581176758e-05,
"lookahead_loss": 6.665002863883972,
"loss": 0.6034,
"step": 134500
},
{
"base_loss": 0.5864896615743637,
"epoch": 2.019073486328125,
"grad_norm": 0.0010809170780703425,
"learning_rate": 3.7125492095947264e-05,
"lookahead_loss": 6.6852047996521,
"loss": 0.5996,
"step": 135000
},
{
"epoch": 2.019073486328125,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.59784925783785,
"eval_lookahead_perplexity": 733.5158879689737,
"eval_loss": 0.20776157081127167,
"eval_perplexity": 1.2309196473852166,
"eval_runtime": 92.8062,
"eval_samples_per_second": 53.876,
"eval_steps_per_second": 1.692,
"step": 135000
},
{
"base_loss": 0.5884653336405754,
"epoch": 2.0200271606445312,
"grad_norm": 0.001135366503149271,
"learning_rate": 3.7077808380126955e-05,
"lookahead_loss": 6.525245847702027,
"loss": 0.6027,
"step": 135500
},
{
"base_loss": 0.6167576683163642,
"epoch": 2.0209808349609375,
"grad_norm": 0.0010758079588413239,
"learning_rate": 3.7030124664306645e-05,
"lookahead_loss": 6.587370067596436,
"loss": 0.6253,
"step": 136000
},
{
"base_loss": 0.5946086082458496,
"epoch": 2.0219345092773438,
"grad_norm": 0.0011470479657873511,
"learning_rate": 3.698244094848633e-05,
"lookahead_loss": 6.577798476696015,
"loss": 0.6091,
"step": 136500
},
{
"base_loss": 0.5980083233714104,
"epoch": 2.02288818359375,
"grad_norm": 0.0011121248826384544,
"learning_rate": 3.693475723266602e-05,
"lookahead_loss": 6.599166748046875,
"loss": 0.6066,
"step": 137000
},
{
"base_loss": 0.5981668121218682,
"epoch": 2.0238418579101562,
"grad_norm": 0.001129783340729773,
"learning_rate": 3.68870735168457e-05,
"lookahead_loss": 6.572176639556885,
"loss": 0.6053,
"step": 137500
},
{
"base_loss": 0.6096467951536179,
"epoch": 2.0247955322265625,
"grad_norm": 0.0010902190115302801,
"learning_rate": 3.683938980102539e-05,
"lookahead_loss": 6.575278332710266,
"loss": 0.6221,
"step": 138000
},
{
"base_loss": 0.6067250183224678,
"epoch": 2.0257492065429688,
"grad_norm": 0.0010856341104954481,
"learning_rate": 3.679170608520508e-05,
"lookahead_loss": 6.537411547660827,
"loss": 0.6159,
"step": 138500
},
{
"base_loss": 0.5957215885519982,
"epoch": 2.026702880859375,
"grad_norm": 0.0011199660366401076,
"learning_rate": 3.6744022369384766e-05,
"lookahead_loss": 6.5439116020202635,
"loss": 0.605,
"step": 139000
},
{
"base_loss": 0.5930430209040641,
"epoch": 2.0276565551757812,
"grad_norm": 0.0011078852694481611,
"learning_rate": 3.6696338653564456e-05,
"lookahead_loss": 6.6607541685104374,
"loss": 0.6048,
"step": 139500
},
{
"base_loss": 0.6162356662750245,
"epoch": 2.0286102294921875,
"grad_norm": 0.001123247086070478,
"learning_rate": 3.664865493774414e-05,
"lookahead_loss": 6.67528129196167,
"loss": 0.6263,
"step": 140000
},
{
"epoch": 2.0286102294921875,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.585361625439823,
"eval_lookahead_perplexity": 724.412966463547,
"eval_loss": 0.20773683488368988,
"eval_perplexity": 1.230889199822536,
"eval_runtime": 92.7637,
"eval_samples_per_second": 53.9,
"eval_steps_per_second": 1.692,
"step": 140000
},
{
"base_loss": 0.6023641695976257,
"epoch": 2.0295639038085938,
"grad_norm": 0.0010968261631205678,
"learning_rate": 3.660097122192383e-05,
"lookahead_loss": 6.617411545753479,
"loss": 0.6128,
"step": 140500
},
{
"base_loss": 0.5918351314663887,
"epoch": 2.030517578125,
"grad_norm": 0.0011174526298418641,
"learning_rate": 3.655328750610352e-05,
"lookahead_loss": 6.603970588207245,
"loss": 0.6048,
"step": 141000
},
{
"base_loss": 0.5900247128009796,
"epoch": 2.0314712524414062,
"grad_norm": 0.001116051571443677,
"learning_rate": 3.65056037902832e-05,
"lookahead_loss": 6.635660397529602,
"loss": 0.6032,
"step": 141500
},
{
"base_loss": 0.6122562985420227,
"epoch": 2.0324249267578125,
"grad_norm": 0.0011088504688814282,
"learning_rate": 3.6457920074462893e-05,
"lookahead_loss": 6.562885791778564,
"loss": 0.6278,
"step": 142000
},
{
"base_loss": 0.5956820755600929,
"epoch": 2.0333786010742188,
"grad_norm": 0.0011337499599903822,
"learning_rate": 3.641023635864258e-05,
"lookahead_loss": 6.617730149269104,
"loss": 0.6081,
"step": 142500
},
{
"base_loss": 0.5925231646895409,
"epoch": 2.034332275390625,
"grad_norm": 0.0010689555201679468,
"learning_rate": 3.636255264282227e-05,
"lookahead_loss": 6.687095086097718,
"loss": 0.6073,
"step": 143000
},
{
"base_loss": 0.5926423314213752,
"epoch": 2.0352859497070312,
"grad_norm": 0.0011333973379805684,
"learning_rate": 3.631486892700196e-05,
"lookahead_loss": 6.5130055770874025,
"loss": 0.6038,
"step": 143500
},
{
"base_loss": 0.6081820316910743,
"epoch": 2.0362396240234375,
"grad_norm": 0.0011208722135052085,
"learning_rate": 3.626718521118164e-05,
"lookahead_loss": 6.640297612190246,
"loss": 0.62,
"step": 144000
},
{
"base_loss": 0.6008555814623833,
"epoch": 2.0371932983398438,
"grad_norm": 0.0011375031899660826,
"learning_rate": 3.621950149536133e-05,
"lookahead_loss": 6.57792680644989,
"loss": 0.6119,
"step": 144500
},
{
"base_loss": 0.5884444781541824,
"epoch": 2.03814697265625,
"grad_norm": 0.0010999179212376475,
"learning_rate": 3.6171817779541014e-05,
"lookahead_loss": 6.599160625457763,
"loss": 0.6015,
"step": 145000
},
{
"epoch": 2.03814697265625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.573444949933134,
"eval_lookahead_perplexity": 715.8316044138329,
"eval_loss": 0.2077145278453827,
"eval_perplexity": 1.230861742636249,
"eval_runtime": 92.2965,
"eval_samples_per_second": 54.173,
"eval_steps_per_second": 1.701,
"step": 145000
},
{
"base_loss": 0.5984013820290566,
"epoch": 2.0391006469726562,
"grad_norm": 0.0011051023611798882,
"learning_rate": 3.6124134063720705e-05,
"lookahead_loss": 6.544455199241638,
"loss": 0.6123,
"step": 145500
},
{
"base_loss": 0.6142821377515792,
"epoch": 2.0400543212890625,
"grad_norm": 0.0011354960734024644,
"learning_rate": 3.6076450347900395e-05,
"lookahead_loss": 6.579610840797424,
"loss": 0.624,
"step": 146000
},
{
"base_loss": 0.5992258986830712,
"epoch": 2.0410079956054688,
"grad_norm": 0.0011294566793367267,
"learning_rate": 3.602876663208008e-05,
"lookahead_loss": 6.541078899383545,
"loss": 0.6105,
"step": 146500
},
{
"base_loss": 0.578558257818222,
"epoch": 2.041961669921875,
"grad_norm": 0.0011305843945592642,
"learning_rate": 3.598108291625977e-05,
"lookahead_loss": 6.605445454597473,
"loss": 0.5955,
"step": 147000
},
{
"base_loss": 0.6040933942198753,
"epoch": 2.0429153442382812,
"grad_norm": 0.0010733373928815126,
"learning_rate": 3.593339920043945e-05,
"lookahead_loss": 6.627297685623169,
"loss": 0.6147,
"step": 147500
},
{
"base_loss": 0.6104523810148239,
"epoch": 2.0438690185546875,
"grad_norm": 0.001119652995839715,
"learning_rate": 3.588571548461914e-05,
"lookahead_loss": 6.631721858024597,
"loss": 0.6237,
"step": 148000
},
{
"base_loss": 0.5943758766055107,
"epoch": 2.0448226928710938,
"grad_norm": 0.00112288782838732,
"learning_rate": 3.583803176879883e-05,
"lookahead_loss": 6.579286907196045,
"loss": 0.6059,
"step": 148500
},
{
"base_loss": 0.5905072175264359,
"epoch": 2.0457763671875,
"grad_norm": 0.001094466308131814,
"learning_rate": 3.5790348052978516e-05,
"lookahead_loss": 6.574234865665436,
"loss": 0.6021,
"step": 149000
},
{
"base_loss": 0.6159886345267296,
"epoch": 2.0467300415039062,
"grad_norm": 0.0011022677645087242,
"learning_rate": 3.5742664337158206e-05,
"lookahead_loss": 6.549607624053955,
"loss": 0.6273,
"step": 149500
},
{
"base_loss": 0.6079316187500954,
"epoch": 2.0476837158203125,
"grad_norm": 0.0011532205389812589,
"learning_rate": 3.569498062133789e-05,
"lookahead_loss": 6.573279874801636,
"loss": 0.6236,
"step": 150000
},
{
"epoch": 2.0476837158203125,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.563056849823973,
"eval_lookahead_perplexity": 708.4339642837788,
"eval_loss": 0.20769280195236206,
"eval_perplexity": 1.2308350013561955,
"eval_runtime": 91.5391,
"eval_samples_per_second": 54.621,
"eval_steps_per_second": 1.715,
"step": 150000
},
{
"base_loss": 0.5885444439649582,
"epoch": 2.0486373901367188,
"grad_norm": 0.0010961332591250539,
"learning_rate": 3.564729690551758e-05,
"lookahead_loss": 6.5612752790451045,
"loss": 0.6013,
"step": 150500
},
{
"base_loss": 0.6004898179769516,
"epoch": 2.049591064453125,
"grad_norm": 0.0011418386129662395,
"learning_rate": 3.559961318969727e-05,
"lookahead_loss": 6.530380144119262,
"loss": 0.6083,
"step": 151000
},
{
"base_loss": 0.6162783756256104,
"epoch": 2.0505447387695312,
"grad_norm": 0.0010346118360757828,
"learning_rate": 3.555192947387695e-05,
"lookahead_loss": 6.624601441383362,
"loss": 0.6277,
"step": 151500
},
{
"base_loss": 0.5984932317137718,
"epoch": 2.0514984130859375,
"grad_norm": 0.001150093856267631,
"learning_rate": 3.5504245758056643e-05,
"lookahead_loss": 6.58684754562378,
"loss": 0.6103,
"step": 152000
},
{
"base_loss": 0.5949168145656586,
"epoch": 2.0524520874023438,
"grad_norm": 0.00113403657451272,
"learning_rate": 3.545656204223633e-05,
"lookahead_loss": 6.523898173332214,
"loss": 0.6055,
"step": 152500
},
{
"base_loss": 0.6100201278328895,
"epoch": 2.05340576171875,
"grad_norm": 0.001108511001802981,
"learning_rate": 3.540887832641602e-05,
"lookahead_loss": 6.522478567123413,
"loss": 0.6204,
"step": 153000
},
{
"base_loss": 0.6109586038589477,
"epoch": 2.0543594360351562,
"grad_norm": 0.0011183718452230096,
"learning_rate": 3.536119461059571e-05,
"lookahead_loss": 6.566904292106629,
"loss": 0.6263,
"step": 153500
},
{
"base_loss": 0.5970847414731979,
"epoch": 2.0553131103515625,
"grad_norm": 0.001120952656492591,
"learning_rate": 3.531351089477539e-05,
"lookahead_loss": 6.588804016113281,
"loss": 0.6066,
"step": 154000
},
{
"base_loss": 0.5898715674877166,
"epoch": 2.0562667846679688,
"grad_norm": 0.0010837721638381481,
"learning_rate": 3.526582717895508e-05,
"lookahead_loss": 6.603852970123291,
"loss": 0.6029,
"step": 154500
},
{
"base_loss": 0.6163467369079589,
"epoch": 2.057220458984375,
"grad_norm": 0.0010966199915856123,
"learning_rate": 3.5218143463134764e-05,
"lookahead_loss": 6.5893798031806945,
"loss": 0.6288,
"step": 155000
},
{
"epoch": 2.057220458984375,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.552286794010443,
"eval_lookahead_perplexity": 700.8450309219951,
"eval_loss": 0.20767158269882202,
"eval_perplexity": 1.2308088842333296,
"eval_runtime": 91.2351,
"eval_samples_per_second": 54.803,
"eval_steps_per_second": 1.721,
"step": 155000
},
{
"base_loss": 0.6090138986110687,
"epoch": 2.0581741333007812,
"grad_norm": 0.0011216332204639912,
"learning_rate": 3.5170459747314455e-05,
"lookahead_loss": 6.590124855995178,
"loss": 0.618,
"step": 155500
},
{
"base_loss": 0.583700324177742,
"epoch": 2.0591278076171875,
"grad_norm": 0.0010965469991788268,
"learning_rate": 3.5122776031494145e-05,
"lookahead_loss": 6.496915921211243,
"loss": 0.5999,
"step": 156000
},
{
"base_loss": 0.5938667116761207,
"epoch": 2.0600814819335938,
"grad_norm": 0.0010983194224536419,
"learning_rate": 3.507509231567383e-05,
"lookahead_loss": 6.5626952772140505,
"loss": 0.6085,
"step": 156500
},
{
"base_loss": 0.6066018126606941,
"epoch": 2.06103515625,
"grad_norm": 0.0011154355015605688,
"learning_rate": 3.502740859985352e-05,
"lookahead_loss": 6.521708921432495,
"loss": 0.617,
"step": 157000
},
{
"base_loss": 0.6058481879830361,
"epoch": 2.0619888305664062,
"grad_norm": 0.001157720573246479,
"learning_rate": 3.49797248840332e-05,
"lookahead_loss": 6.530971702575684,
"loss": 0.6158,
"step": 157500
},
{
"base_loss": 0.5946802944540978,
"epoch": 2.0629425048828125,
"grad_norm": 0.0011384790996089578,
"learning_rate": 3.493204116821289e-05,
"lookahead_loss": 6.535911062240601,
"loss": 0.6069,
"step": 158000
},
{
"base_loss": 0.6126915777921677,
"epoch": 2.0638961791992188,
"grad_norm": 0.0010698529658839107,
"learning_rate": 3.488435745239258e-05,
"lookahead_loss": 6.552743993759155,
"loss": 0.6234,
"step": 158500
},
{
"base_loss": 0.6043052950501442,
"epoch": 2.064849853515625,
"grad_norm": 0.0011142947478219867,
"learning_rate": 3.4836673736572266e-05,
"lookahead_loss": 6.595561448097229,
"loss": 0.6137,
"step": 159000
},
{
"base_loss": 0.6039554010033608,
"epoch": 2.0658035278320312,
"grad_norm": 0.0011351387947797775,
"learning_rate": 3.4788990020751956e-05,
"lookahead_loss": 6.505132764816284,
"loss": 0.6139,
"step": 159500
},
{
"base_loss": 0.605331601202488,
"epoch": 2.0667572021484375,
"grad_norm": 0.001111305202357471,
"learning_rate": 3.474130630493164e-05,
"lookahead_loss": 6.489367088317871,
"loss": 0.6123,
"step": 160000
},
{
"epoch": 2.0667572021484375,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.541540684410558,
"eval_lookahead_perplexity": 693.3539952402889,
"eval_loss": 0.20765088498592377,
"eval_perplexity": 1.2307834095680457,
"eval_runtime": 93.5455,
"eval_samples_per_second": 53.45,
"eval_steps_per_second": 1.678,
"step": 160000
},
{
"base_loss": 0.6198571705818177,
"epoch": 2.0677108764648438,
"grad_norm": 0.0011394508183002472,
"learning_rate": 3.469362258911133e-05,
"lookahead_loss": 6.569809526443481,
"loss": 0.6283,
"step": 160500
},
{
"base_loss": 0.5996336659789085,
"epoch": 2.06866455078125,
"grad_norm": 0.001077422290109098,
"learning_rate": 3.464593887329102e-05,
"lookahead_loss": 6.538529127120972,
"loss": 0.6077,
"step": 161000
},
{
"base_loss": 0.5853740153312683,
"epoch": 2.0696182250976562,
"grad_norm": 0.0011083297431468964,
"learning_rate": 3.45982551574707e-05,
"lookahead_loss": 6.577185619831085,
"loss": 0.6,
"step": 161500
},
{
"base_loss": 0.6150279142260552,
"epoch": 2.0705718994140625,
"grad_norm": 0.0011394877219572663,
"learning_rate": 3.4550571441650393e-05,
"lookahead_loss": 6.460757801055908,
"loss": 0.6275,
"step": 162000
},
{
"base_loss": 0.5995807001590728,
"epoch": 2.0715255737304688,
"grad_norm": 0.0011034323833882809,
"learning_rate": 3.450288772583008e-05,
"lookahead_loss": 6.503600845336914,
"loss": 0.6119,
"step": 162500
},
{
"base_loss": 0.5951229523420334,
"epoch": 2.072479248046875,
"grad_norm": 0.0011338687036186457,
"learning_rate": 3.445520401000977e-05,
"lookahead_loss": 6.56779256439209,
"loss": 0.6063,
"step": 163000
},
{
"base_loss": 0.5946593886613846,
"epoch": 2.0734329223632812,
"grad_norm": 0.0010535767069086432,
"learning_rate": 3.440752029418946e-05,
"lookahead_loss": 6.522209219932556,
"loss": 0.6063,
"step": 163500
},
{
"base_loss": 0.6102511178851128,
"epoch": 2.0743865966796875,
"grad_norm": 0.0011173501843586564,
"learning_rate": 3.435983657836914e-05,
"lookahead_loss": 6.522058952331543,
"loss": 0.6216,
"step": 164000
},
{
"base_loss": 0.5937603359222412,
"epoch": 2.0753402709960938,
"grad_norm": 0.0011274093994870782,
"learning_rate": 3.431215286254883e-05,
"lookahead_loss": 6.592309030056,
"loss": 0.6057,
"step": 164500
},
{
"base_loss": 0.5869635686874389,
"epoch": 2.0762939453125,
"grad_norm": 0.001087229116819799,
"learning_rate": 3.4264469146728514e-05,
"lookahead_loss": 6.557568222045899,
"loss": 0.5964,
"step": 165000
},
{
"epoch": 2.0762939453125,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.530356111617896,
"eval_lookahead_perplexity": 685.6423332225185,
"eval_loss": 0.20762944221496582,
"eval_perplexity": 1.2307570184442458,
"eval_runtime": 90.9123,
"eval_samples_per_second": 54.998,
"eval_steps_per_second": 1.727,
"step": 165000
},
{
"base_loss": 0.6242118158340454,
"epoch": 2.0772476196289062,
"grad_norm": 0.001096642459742725,
"learning_rate": 3.4216785430908205e-05,
"lookahead_loss": 6.5268142805099485,
"loss": 0.6344,
"step": 165500
},
{
"base_loss": 0.5928893273472786,
"epoch": 2.0782012939453125,
"grad_norm": 0.001135199679993093,
"learning_rate": 3.4169101715087895e-05,
"lookahead_loss": 6.549333306312561,
"loss": 0.6079,
"step": 166000
},
{
"base_loss": 0.5996605790853501,
"epoch": 2.0791549682617188,
"grad_norm": 0.0010736893163993955,
"learning_rate": 3.412141799926758e-05,
"lookahead_loss": 6.570660936355591,
"loss": 0.6084,
"step": 166500
},
{
"base_loss": 0.6092261442542076,
"epoch": 2.080108642578125,
"grad_norm": 0.0010752358939498663,
"learning_rate": 3.407373428344727e-05,
"lookahead_loss": 6.512598112106323,
"loss": 0.6199,
"step": 167000
},
{
"base_loss": 0.6069996964335441,
"epoch": 2.0810623168945312,
"grad_norm": 0.001151898643001914,
"learning_rate": 3.402605056762695e-05,
"lookahead_loss": 6.553294209480286,
"loss": 0.6194,
"step": 167500
},
{
"base_loss": 0.5921739342212677,
"epoch": 2.0820159912109375,
"grad_norm": 0.0010723688174039125,
"learning_rate": 3.397836685180664e-05,
"lookahead_loss": 6.563275011539459,
"loss": 0.6057,
"step": 168000
},
{
"base_loss": 0.5929293268918991,
"epoch": 2.0829696655273438,
"grad_norm": 0.0011118296533823013,
"learning_rate": 3.393068313598633e-05,
"lookahead_loss": 6.586033729553223,
"loss": 0.6052,
"step": 168500
},
{
"base_loss": 0.6163858331441879,
"epoch": 2.08392333984375,
"grad_norm": 0.0010826945072039962,
"learning_rate": 3.3882999420166016e-05,
"lookahead_loss": 6.580185745239258,
"loss": 0.6286,
"step": 169000
},
{
"base_loss": 0.5959904823899269,
"epoch": 2.0848770141601562,
"grad_norm": 0.0011054837377741933,
"learning_rate": 3.3835315704345706e-05,
"lookahead_loss": 6.532082775115967,
"loss": 0.6064,
"step": 169500
},
{
"base_loss": 0.5873328469395638,
"epoch": 2.0858306884765625,
"grad_norm": 0.0011319448240101337,
"learning_rate": 3.378763198852539e-05,
"lookahead_loss": 6.541397624015808,
"loss": 0.5974,
"step": 170000
},
{
"epoch": 2.0858306884765625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.519407711089991,
"eval_lookahead_perplexity": 678.1765898884748,
"eval_loss": 0.20760849118232727,
"eval_perplexity": 1.2307312330838982,
"eval_runtime": 92.2316,
"eval_samples_per_second": 54.211,
"eval_steps_per_second": 1.702,
"step": 170000
},
{
"base_loss": 0.5929982444047928,
"epoch": 2.0867843627929688,
"grad_norm": 0.001194111187942326,
"learning_rate": 3.373994827270508e-05,
"lookahead_loss": 6.505416949272155,
"loss": 0.6043,
"step": 170500
},
{
"base_loss": 0.6234143348932266,
"epoch": 2.087738037109375,
"grad_norm": 0.0010146180866286159,
"learning_rate": 3.369226455688477e-05,
"lookahead_loss": 6.542620226383209,
"loss": 0.6284,
"step": 171000
},
{
"base_loss": 0.5942349677085876,
"epoch": 2.0886917114257812,
"grad_norm": 0.0011270438553765416,
"learning_rate": 3.364458084106445e-05,
"lookahead_loss": 6.5318672647476195,
"loss": 0.6037,
"step": 171500
},
{
"base_loss": 0.5997879543304443,
"epoch": 2.0896453857421875,
"grad_norm": 0.0011306487722322345,
"learning_rate": 3.3596897125244143e-05,
"lookahead_loss": 6.558032165527344,
"loss": 0.6084,
"step": 172000
},
{
"base_loss": 0.5867617139816285,
"epoch": 2.0905990600585938,
"grad_norm": 0.0010641829576343298,
"learning_rate": 3.354921340942383e-05,
"lookahead_loss": 6.522708724975586,
"loss": 0.5974,
"step": 172500
},
{
"base_loss": 0.581401211798191,
"epoch": 2.091552734375,
"grad_norm": 0.0010614799102768302,
"learning_rate": 3.350152969360352e-05,
"lookahead_loss": 6.5099988975524905,
"loss": 0.5912,
"step": 173000
},
{
"base_loss": 0.6186662130355834,
"epoch": 2.0925064086914062,
"grad_norm": 0.0011495859362185001,
"learning_rate": 3.345384597778321e-05,
"lookahead_loss": 6.54921883392334,
"loss": 0.6288,
"step": 173500
},
{
"base_loss": 0.600584501862526,
"epoch": 2.0934600830078125,
"grad_norm": 0.0010833586566150188,
"learning_rate": 3.340616226196289e-05,
"lookahead_loss": 6.5347207136154175,
"loss": 0.6132,
"step": 174000
},
{
"base_loss": 0.5855657352209092,
"epoch": 2.0944137573242188,
"grad_norm": 0.0010808638762682676,
"learning_rate": 3.335847854614258e-05,
"lookahead_loss": 6.533743205070496,
"loss": 0.5979,
"step": 174500
},
{
"base_loss": 0.5778659620285034,
"epoch": 2.095367431640625,
"grad_norm": 0.0010777119314298034,
"learning_rate": 3.3310794830322264e-05,
"lookahead_loss": 6.469995648384094,
"loss": 0.5933,
"step": 175000
},
{
"epoch": 2.095367431640625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.510934283558172,
"eval_lookahead_perplexity": 672.4543872721904,
"eval_loss": 0.20759032666683197,
"eval_perplexity": 1.2307088776503827,
"eval_runtime": 93.5217,
"eval_samples_per_second": 53.464,
"eval_steps_per_second": 1.679,
"step": 175000
},
{
"base_loss": 0.59952286028862,
"epoch": 2.0963211059570312,
"grad_norm": 0.0011107597965747118,
"learning_rate": 3.3263111114501955e-05,
"lookahead_loss": 6.528745898723602,
"loss": 0.6123,
"step": 175500
},
{
"base_loss": 0.6133572874069214,
"epoch": 2.0972747802734375,
"grad_norm": 0.0011266444344073534,
"learning_rate": 3.3215427398681645e-05,
"lookahead_loss": 6.552535983085632,
"loss": 0.6237,
"step": 176000
},
{
"base_loss": 0.5973192919492721,
"epoch": 2.0982284545898438,
"grad_norm": 0.001105320523492992,
"learning_rate": 3.316774368286133e-05,
"lookahead_loss": 6.48703881072998,
"loss": 0.6069,
"step": 176500
},
{
"base_loss": 0.5845331786870956,
"epoch": 2.09918212890625,
"grad_norm": 0.001108050113543868,
"learning_rate": 3.312005996704102e-05,
"lookahead_loss": 6.557789768218994,
"loss": 0.5956,
"step": 177000
},
{
"base_loss": 0.5865774551033973,
"epoch": 2.1001358032226562,
"grad_norm": 0.001104854280129075,
"learning_rate": 3.30723762512207e-05,
"lookahead_loss": 6.52143590259552,
"loss": 0.6035,
"step": 177500
},
{
"base_loss": 0.6098086424469947,
"epoch": 2.1010894775390625,
"grad_norm": 0.0011074242647737265,
"learning_rate": 3.302469253540039e-05,
"lookahead_loss": 6.581818056106568,
"loss": 0.6195,
"step": 178000
},
{
"base_loss": 0.6012407766580582,
"epoch": 2.1020431518554688,
"grad_norm": 0.0011334229493513703,
"learning_rate": 3.297700881958008e-05,
"lookahead_loss": 6.556834000587464,
"loss": 0.6076,
"step": 178500
},
{
"base_loss": 0.5883222328424453,
"epoch": 2.102996826171875,
"grad_norm": 0.0010879429755732417,
"learning_rate": 3.2929325103759766e-05,
"lookahead_loss": 6.528950669288635,
"loss": 0.6017,
"step": 179000
},
{
"base_loss": 0.5900123327970505,
"epoch": 2.1039505004882812,
"grad_norm": 0.0010885036317631602,
"learning_rate": 3.2881641387939456e-05,
"lookahead_loss": 6.548188806533814,
"loss": 0.6016,
"step": 179500
},
{
"base_loss": 0.6137035485506058,
"epoch": 2.1049041748046875,
"grad_norm": 0.0010836453875526786,
"learning_rate": 3.283395767211914e-05,
"lookahead_loss": 6.490313670158386,
"loss": 0.6239,
"step": 180000
},
{
"epoch": 2.1049041748046875,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.501633135274576,
"eval_lookahead_perplexity": 666.2287868008185,
"eval_loss": 0.2075720876455307,
"eval_perplexity": 1.2306864309296515,
"eval_runtime": 91.2824,
"eval_samples_per_second": 54.775,
"eval_steps_per_second": 1.72,
"step": 180000
},
{
"base_loss": 0.6017958376407623,
"epoch": 2.1058578491210938,
"grad_norm": 0.0010430403053760529,
"learning_rate": 3.278627395629883e-05,
"lookahead_loss": 6.493426759719848,
"loss": 0.6127,
"step": 180500
},
{
"base_loss": 0.5970892771482468,
"epoch": 2.1068115234375,
"grad_norm": 0.001087990473024547,
"learning_rate": 3.273859024047852e-05,
"lookahead_loss": 6.560095170974732,
"loss": 0.6064,
"step": 181000
},
{
"base_loss": 0.5833802008032799,
"epoch": 2.1077651977539062,
"grad_norm": 0.0011280244216322899,
"learning_rate": 3.26909065246582e-05,
"lookahead_loss": 6.470833657264709,
"loss": 0.5948,
"step": 181500
},
{
"base_loss": 0.604837516605854,
"epoch": 2.1087188720703125,
"grad_norm": 0.0011518291430547833,
"learning_rate": 3.2643222808837893e-05,
"lookahead_loss": 6.458127084732055,
"loss": 0.6189,
"step": 182000
},
{
"base_loss": 0.6057112255096435,
"epoch": 2.1096725463867188,
"grad_norm": 0.001071825623512268,
"learning_rate": 3.259553909301758e-05,
"lookahead_loss": 6.447037249565125,
"loss": 0.6163,
"step": 182500
},
{
"base_loss": 0.5945826203823089,
"epoch": 2.110626220703125,
"grad_norm": 0.0010940604843199253,
"learning_rate": 3.254785537719727e-05,
"lookahead_loss": 6.43380268573761,
"loss": 0.6068,
"step": 183000
},
{
"base_loss": 0.5839763838648796,
"epoch": 2.1115798950195312,
"grad_norm": 0.0010885735973715782,
"learning_rate": 3.250017166137696e-05,
"lookahead_loss": 6.519184366226196,
"loss": 0.5969,
"step": 183500
},
{
"base_loss": 0.6021909977793694,
"epoch": 2.1125335693359375,
"grad_norm": 0.0010922467336058617,
"learning_rate": 3.245248794555664e-05,
"lookahead_loss": 6.546927687168122,
"loss": 0.612,
"step": 184000
},
{
"base_loss": 0.6174382773041726,
"epoch": 2.1134872436523438,
"grad_norm": 0.0011140938149765134,
"learning_rate": 3.240480422973633e-05,
"lookahead_loss": 6.5441567344665525,
"loss": 0.6269,
"step": 184500
},
{
"base_loss": 0.5974359802007675,
"epoch": 2.11444091796875,
"grad_norm": 0.0010466972598806024,
"learning_rate": 3.2357120513916014e-05,
"lookahead_loss": 6.435290018081665,
"loss": 0.6071,
"step": 185000
},
{
"epoch": 2.11444091796875,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.492953685906748,
"eval_lookahead_perplexity": 660.4713097873832,
"eval_loss": 0.20755501091480255,
"eval_perplexity": 1.2306654150083016,
"eval_runtime": 93.1261,
"eval_samples_per_second": 53.691,
"eval_steps_per_second": 1.686,
"step": 185000
},
{
"base_loss": 0.5844211729764939,
"epoch": 2.1153945922851562,
"grad_norm": 0.0011425914708524942,
"learning_rate": 3.2309436798095705e-05,
"lookahead_loss": 6.488190547943115,
"loss": 0.5962,
"step": 185500
},
{
"base_loss": 0.6047764738798141,
"epoch": 2.1163482666015625,
"grad_norm": 0.0011844782857224345,
"learning_rate": 3.2261753082275395e-05,
"lookahead_loss": 6.4878438234329225,
"loss": 0.6158,
"step": 186000
},
{
"base_loss": 0.6098092859387397,
"epoch": 2.1173019409179688,
"grad_norm": 0.0011336279567331076,
"learning_rate": 3.221406936645508e-05,
"lookahead_loss": 6.487655387878418,
"loss": 0.6218,
"step": 186500
},
{
"base_loss": 0.6043260169625282,
"epoch": 2.118255615234375,
"grad_norm": 0.0011267218505963683,
"learning_rate": 3.216638565063477e-05,
"lookahead_loss": 6.516085889816284,
"loss": 0.6128,
"step": 187000
},
{
"base_loss": 0.5858061800599098,
"epoch": 2.1192092895507812,
"grad_norm": 0.001102335867471993,
"learning_rate": 3.211870193481445e-05,
"lookahead_loss": 6.516676008224487,
"loss": 0.5958,
"step": 187500
},
{
"base_loss": 0.5981094686985016,
"epoch": 3.0009536743164062,
"grad_norm": 0.0011026667198166251,
"learning_rate": 3.207101821899414e-05,
"lookahead_loss": 6.580178588867187,
"loss": 0.6029,
"step": 188000
},
{
"base_loss": 0.587501579284668,
"epoch": 3.0019073486328125,
"grad_norm": 0.0011541040148586035,
"learning_rate": 3.202333450317383e-05,
"lookahead_loss": 6.419054620742798,
"loss": 0.5974,
"step": 188500
},
{
"base_loss": 0.6046080349087715,
"epoch": 3.0028610229492188,
"grad_norm": 0.001120623666793108,
"learning_rate": 3.1975650787353516e-05,
"lookahead_loss": 6.434816195487976,
"loss": 0.613,
"step": 189000
},
{
"base_loss": 0.6173882039189339,
"epoch": 3.003814697265625,
"grad_norm": 0.0010917120380327106,
"learning_rate": 3.1927967071533206e-05,
"lookahead_loss": 6.458130680561066,
"loss": 0.6242,
"step": 189500
},
{
"base_loss": 0.6012161781191826,
"epoch": 3.0047683715820312,
"grad_norm": 0.001108814962208271,
"learning_rate": 3.188028335571289e-05,
"lookahead_loss": 6.437314165115357,
"loss": 0.6087,
"step": 190000
},
{
"epoch": 3.0047683715820312,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.484307500120169,
"eval_lookahead_perplexity": 654.7853684107388,
"eval_loss": 0.20753738284111023,
"eval_perplexity": 1.2306437209388883,
"eval_runtime": 91.1617,
"eval_samples_per_second": 54.848,
"eval_steps_per_second": 1.722,
"step": 190000
},
{
"base_loss": 0.5926664335727692,
"epoch": 3.0057220458984375,
"grad_norm": 0.0010479306802153587,
"learning_rate": 3.183259963989258e-05,
"lookahead_loss": 6.560607043266296,
"loss": 0.6018,
"step": 190500
},
{
"base_loss": 0.5822248963713645,
"epoch": 3.0066757202148438,
"grad_norm": 0.0010960784275084734,
"learning_rate": 3.178491592407227e-05,
"lookahead_loss": 6.403440669059753,
"loss": 0.5981,
"step": 191000
},
{
"base_loss": 0.6042868258953095,
"epoch": 3.00762939453125,
"grad_norm": 0.001109161414206028,
"learning_rate": 3.173723220825195e-05,
"lookahead_loss": 6.468971241950989,
"loss": 0.6169,
"step": 191500
},
{
"base_loss": 0.6021608446240425,
"epoch": 3.0085830688476562,
"grad_norm": 0.0010364059126004577,
"learning_rate": 3.1689548492431643e-05,
"lookahead_loss": 6.479389870643616,
"loss": 0.608,
"step": 192000
},
{
"base_loss": 0.5907191566824913,
"epoch": 3.0095367431640625,
"grad_norm": 0.0011385597754269838,
"learning_rate": 3.164186477661133e-05,
"lookahead_loss": 6.483307428359986,
"loss": 0.6059,
"step": 192500
},
{
"base_loss": 0.5937975888252258,
"epoch": 3.0104904174804688,
"grad_norm": 0.0011329938424751163,
"learning_rate": 3.159418106079102e-05,
"lookahead_loss": 6.426459211349488,
"loss": 0.6052,
"step": 193000
},
{
"base_loss": 0.5900298383831978,
"epoch": 3.011444091796875,
"grad_norm": 0.0011370591819286346,
"learning_rate": 3.154649734497071e-05,
"lookahead_loss": 6.45755704498291,
"loss": 0.6034,
"step": 193500
},
{
"base_loss": 0.6114438434243202,
"epoch": 3.0123977661132812,
"grad_norm": 0.0010784439509734511,
"learning_rate": 3.149881362915039e-05,
"lookahead_loss": 6.444551125526428,
"loss": 0.6234,
"step": 194000
},
{
"base_loss": 0.5947433623075485,
"epoch": 3.0133514404296875,
"grad_norm": 0.0011095026275143027,
"learning_rate": 3.145112991333008e-05,
"lookahead_loss": 6.529066830635071,
"loss": 0.6095,
"step": 194500
},
{
"base_loss": 0.5968700026273728,
"epoch": 3.0143051147460938,
"grad_norm": 0.0011093729408457875,
"learning_rate": 3.1403446197509764e-05,
"lookahead_loss": 6.499406108856201,
"loss": 0.6074,
"step": 195000
},
{
"epoch": 3.0143051147460938,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.476042339976984,
"eval_lookahead_perplexity": 649.3957661259718,
"eval_loss": 0.20752041041851044,
"eval_perplexity": 1.2306228341108372,
"eval_runtime": 93.185,
"eval_samples_per_second": 53.657,
"eval_steps_per_second": 1.685,
"step": 195000
},
{
"base_loss": 0.5854599202871322,
"epoch": 3.0152587890625,
"grad_norm": 0.001107083517126739,
"learning_rate": 3.1355762481689455e-05,
"lookahead_loss": 6.448059041976928,
"loss": 0.5918,
"step": 195500
},
{
"base_loss": 0.6102398954629898,
"epoch": 3.0162124633789062,
"grad_norm": 0.0011182829039171338,
"learning_rate": 3.1308078765869145e-05,
"lookahead_loss": 6.49233545923233,
"loss": 0.6183,
"step": 196000
},
{
"base_loss": 0.604021582365036,
"epoch": 3.0171661376953125,
"grad_norm": 0.0011117098620161414,
"learning_rate": 3.126039505004883e-05,
"lookahead_loss": 6.529406607627869,
"loss": 0.6108,
"step": 196500
},
{
"base_loss": 0.5954824941754341,
"epoch": 3.0181198120117188,
"grad_norm": 0.0010434985160827637,
"learning_rate": 3.121271133422852e-05,
"lookahead_loss": 6.530284749984741,
"loss": 0.6029,
"step": 197000
},
{
"base_loss": 0.58826500248909,
"epoch": 3.019073486328125,
"grad_norm": 0.0010525453835725784,
"learning_rate": 3.11650276184082e-05,
"lookahead_loss": 6.558384260654449,
"loss": 0.6004,
"step": 197500
},
{
"base_loss": 0.5917664663791656,
"epoch": 3.0200271606445312,
"grad_norm": 0.0011327258544042706,
"learning_rate": 3.111734390258789e-05,
"lookahead_loss": 6.401400192260742,
"loss": 0.6047,
"step": 198000
},
{
"base_loss": 0.6186916393637657,
"epoch": 3.0209808349609375,
"grad_norm": 0.0010724315652623773,
"learning_rate": 3.106966018676758e-05,
"lookahead_loss": 6.458036962509155,
"loss": 0.6258,
"step": 198500
},
{
"base_loss": 0.5938585975766182,
"epoch": 3.0219345092773438,
"grad_norm": 0.0011486399453133345,
"learning_rate": 3.1021976470947266e-05,
"lookahead_loss": 6.43785121679306,
"loss": 0.6075,
"step": 199000
},
{
"base_loss": 0.595318921983242,
"epoch": 3.02288818359375,
"grad_norm": 0.001104371971450746,
"learning_rate": 3.0974292755126956e-05,
"lookahead_loss": 6.464245168209076,
"loss": 0.6043,
"step": 199500
},
{
"base_loss": 0.5955991841554642,
"epoch": 3.0238418579101562,
"grad_norm": 0.0011078201932832599,
"learning_rate": 3.092660903930664e-05,
"lookahead_loss": 6.4285355896949765,
"loss": 0.6046,
"step": 200000
},
{
"epoch": 3.0238418579101562,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.4665484976844665,
"eval_lookahead_perplexity": 643.2596787484863,
"eval_loss": 0.20750202238559723,
"eval_perplexity": 1.2306002055857075,
"eval_runtime": 95.5351,
"eval_samples_per_second": 52.337,
"eval_steps_per_second": 1.643,
"step": 200000
},
{
"base_loss": 0.6106253827810287,
"epoch": 3.0247955322265625,
"grad_norm": 0.0010915478924289346,
"learning_rate": 3.087892532348633e-05,
"lookahead_loss": 6.434819388389587,
"loss": 0.6219,
"step": 200500
},
{
"base_loss": 0.6082036694884301,
"epoch": 3.0257492065429688,
"grad_norm": 0.0010945522226393223,
"learning_rate": 3.083124160766602e-05,
"lookahead_loss": 6.398662006378173,
"loss": 0.6163,
"step": 201000
},
{
"base_loss": 0.5935834443569183,
"epoch": 3.026702880859375,
"grad_norm": 0.0011064212303608656,
"learning_rate": 3.07835578918457e-05,
"lookahead_loss": 6.422159289360047,
"loss": 0.6039,
"step": 201500
},
{
"base_loss": 0.5928121148943901,
"epoch": 3.0276565551757812,
"grad_norm": 0.0010989385191351175,
"learning_rate": 3.0735874176025393e-05,
"lookahead_loss": 6.5297203512191775,
"loss": 0.6048,
"step": 202000
},
{
"base_loss": 0.6184476745128632,
"epoch": 3.0286102294921875,
"grad_norm": 0.0010950877331197262,
"learning_rate": 3.068819046020508e-05,
"lookahead_loss": 6.549453610897064,
"loss": 0.6282,
"step": 202500
},
{
"base_loss": 0.6005746681690216,
"epoch": 3.0295639038085938,
"grad_norm": 0.0010932876029983163,
"learning_rate": 3.064050674438477e-05,
"lookahead_loss": 6.481932869434357,
"loss": 0.611,
"step": 203000
},
{
"base_loss": 0.5931842148900032,
"epoch": 3.030517578125,
"grad_norm": 0.0011122091673314571,
"learning_rate": 3.059282302856446e-05,
"lookahead_loss": 6.4869947714805605,
"loss": 0.6067,
"step": 203500
},
{
"base_loss": 0.5912481832504273,
"epoch": 3.0314712524414062,
"grad_norm": 0.0010941592045128345,
"learning_rate": 3.054513931274414e-05,
"lookahead_loss": 6.504393637657166,
"loss": 0.6046,
"step": 204000
},
{
"base_loss": 0.6138262154459954,
"epoch": 3.0324249267578125,
"grad_norm": 0.0011110466439276934,
"learning_rate": 3.049745559692383e-05,
"lookahead_loss": 6.4517954845428465,
"loss": 0.628,
"step": 204500
},
{
"base_loss": 0.5952054759263993,
"epoch": 3.0333786010742188,
"grad_norm": 0.0011273113777861,
"learning_rate": 3.0449771881103518e-05,
"lookahead_loss": 6.492183149337769,
"loss": 0.6084,
"step": 205000
},
{
"epoch": 3.0333786010742188,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.459207607915226,
"eval_lookahead_perplexity": 638.5548702221528,
"eval_loss": 0.20748774707317352,
"eval_perplexity": 1.2305826385086922,
"eval_runtime": 91.6822,
"eval_samples_per_second": 54.536,
"eval_steps_per_second": 1.712,
"step": 205000
},
{
"base_loss": 0.5937945809960365,
"epoch": 3.034332275390625,
"grad_norm": 0.0010771463857963681,
"learning_rate": 3.0402088165283205e-05,
"lookahead_loss": 6.558969589233398,
"loss": 0.6073,
"step": 205500
},
{
"base_loss": 0.5899240897297859,
"epoch": 3.0352859497070312,
"grad_norm": 0.0011380125069990754,
"learning_rate": 3.035440444946289e-05,
"lookahead_loss": 6.393504500389099,
"loss": 0.6019,
"step": 206000
},
{
"base_loss": 0.6100166696310043,
"epoch": 3.0362396240234375,
"grad_norm": 0.0011085773585364223,
"learning_rate": 3.0306720733642578e-05,
"lookahead_loss": 6.515884090900421,
"loss": 0.622,
"step": 206500
},
{
"base_loss": 0.600666867017746,
"epoch": 3.0371932983398438,
"grad_norm": 0.0011293049901723862,
"learning_rate": 3.025903701782227e-05,
"lookahead_loss": 6.463486310005188,
"loss": 0.6106,
"step": 207000
},
{
"base_loss": 0.5914758368730545,
"epoch": 3.03814697265625,
"grad_norm": 0.0010940312640741467,
"learning_rate": 3.0211353302001955e-05,
"lookahead_loss": 6.46989311504364,
"loss": 0.6023,
"step": 207500
},
{
"base_loss": 0.600289347231388,
"epoch": 3.0391006469726562,
"grad_norm": 0.0010963748209178448,
"learning_rate": 3.0163669586181642e-05,
"lookahead_loss": 6.431695939540863,
"loss": 0.6123,
"step": 208000
},
{
"base_loss": 0.6154078626036644,
"epoch": 3.0400543212890625,
"grad_norm": 0.001127979252487421,
"learning_rate": 3.011598587036133e-05,
"lookahead_loss": 6.457052739143371,
"loss": 0.6236,
"step": 208500
},
{
"base_loss": 0.6015743594169617,
"epoch": 3.0410079956054688,
"grad_norm": 0.0011190706863999367,
"learning_rate": 3.0068302154541016e-05,
"lookahead_loss": 6.41373338508606,
"loss": 0.6116,
"step": 209000
},
{
"base_loss": 0.5813222458958626,
"epoch": 3.041961669921875,
"grad_norm": 0.0011437357170507312,
"learning_rate": 3.0020618438720706e-05,
"lookahead_loss": 6.483449687004089,
"loss": 0.5969,
"step": 209500
},
{
"base_loss": 0.6031124092936516,
"epoch": 3.0429153442382812,
"grad_norm": 0.0010715459939092398,
"learning_rate": 2.9972934722900393e-05,
"lookahead_loss": 6.503455612182617,
"loss": 0.614,
"step": 210000
},
{
"epoch": 3.0429153442382812,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.451748331514791,
"eval_lookahead_perplexity": 633.8094337129085,
"eval_loss": 0.20747321844100952,
"eval_perplexity": 1.2305647599560656,
"eval_runtime": 94.4941,
"eval_samples_per_second": 52.913,
"eval_steps_per_second": 1.661,
"step": 210000
},
{
"base_loss": 0.6137282114624977,
"epoch": 3.0438690185546875,
"grad_norm": 0.0011294978903606534,
"learning_rate": 2.992525100708008e-05,
"lookahead_loss": 6.515301226615906,
"loss": 0.6256,
"step": 210500
},
{
"base_loss": 0.5906867882609368,
"epoch": 3.0448226928710938,
"grad_norm": 0.0011206147028133273,
"learning_rate": 2.9877567291259766e-05,
"lookahead_loss": 6.449306659698486,
"loss": 0.6033,
"step": 211000
},
{
"base_loss": 0.5901689050197602,
"epoch": 3.0457763671875,
"grad_norm": 0.0010848396923393011,
"learning_rate": 2.9829883575439453e-05,
"lookahead_loss": 6.44843610572815,
"loss": 0.6011,
"step": 211500
},
{
"base_loss": 0.6179713225364685,
"epoch": 3.0467300415039062,
"grad_norm": 0.0011150614591315389,
"learning_rate": 2.9782199859619143e-05,
"lookahead_loss": 6.423956147193909,
"loss": 0.6278,
"step": 212000
},
{
"base_loss": 0.6056273721456528,
"epoch": 3.0476837158203125,
"grad_norm": 0.0011628296924754977,
"learning_rate": 2.973451614379883e-05,
"lookahead_loss": 6.465731230258942,
"loss": 0.6222,
"step": 212500
},
{
"base_loss": 0.5891297512054443,
"epoch": 3.0486373901367188,
"grad_norm": 0.0010860287584364414,
"learning_rate": 2.9686832427978517e-05,
"lookahead_loss": 6.444560400009156,
"loss": 0.6002,
"step": 213000
},
{
"base_loss": 0.5967240616083145,
"epoch": 3.049591064453125,
"grad_norm": 0.0011478269007056952,
"learning_rate": 2.9639148712158204e-05,
"lookahead_loss": 6.408600190162659,
"loss": 0.607,
"step": 213500
},
{
"base_loss": 0.6166991795897484,
"epoch": 3.0505447387695312,
"grad_norm": 0.001043649623170495,
"learning_rate": 2.959146499633789e-05,
"lookahead_loss": 6.510616254806519,
"loss": 0.6283,
"step": 214000
},
{
"base_loss": 0.5971619437336921,
"epoch": 3.0514984130859375,
"grad_norm": 0.0011554835364222527,
"learning_rate": 2.954378128051758e-05,
"lookahead_loss": 6.4740786409378055,
"loss": 0.6093,
"step": 214500
},
{
"base_loss": 0.594504154086113,
"epoch": 3.0524520874023438,
"grad_norm": 0.0011469792807474732,
"learning_rate": 2.9496097564697268e-05,
"lookahead_loss": 6.407137874603271,
"loss": 0.605,
"step": 215000
},
{
"epoch": 3.0524520874023438,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.445492273702408,
"eval_lookahead_perplexity": 629.8566625281327,
"eval_loss": 0.2074602097272873,
"eval_perplexity": 1.2305487519955085,
"eval_runtime": 90.8147,
"eval_samples_per_second": 55.057,
"eval_steps_per_second": 1.729,
"step": 215000
},
{
"base_loss": 0.6092002688646316,
"epoch": 3.05340576171875,
"grad_norm": 0.0011114060180261731,
"learning_rate": 2.9448413848876955e-05,
"lookahead_loss": 6.412362268447876,
"loss": 0.6198,
"step": 215500
},
{
"base_loss": 0.6074704146981239,
"epoch": 3.0543594360351562,
"grad_norm": 0.0011088059982284904,
"learning_rate": 2.940073013305664e-05,
"lookahead_loss": 6.450668965816498,
"loss": 0.6219,
"step": 216000
},
{
"base_loss": 0.5954701615571976,
"epoch": 3.0553131103515625,
"grad_norm": 0.0011205608025193214,
"learning_rate": 2.9353046417236328e-05,
"lookahead_loss": 6.46648416519165,
"loss": 0.6048,
"step": 216500
},
{
"base_loss": 0.5905682035684585,
"epoch": 3.0562667846679688,
"grad_norm": 0.0010696501703932881,
"learning_rate": 2.930536270141602e-05,
"lookahead_loss": 6.470299499511719,
"loss": 0.6042,
"step": 217000
},
{
"base_loss": 0.6139637017846108,
"epoch": 3.057220458984375,
"grad_norm": 0.0011050283210352063,
"learning_rate": 2.9257678985595705e-05,
"lookahead_loss": 6.474473210811615,
"loss": 0.6279,
"step": 217500
},
{
"base_loss": 0.606031008541584,
"epoch": 3.0581741333007812,
"grad_norm": 0.0011311868438497186,
"learning_rate": 2.9209995269775392e-05,
"lookahead_loss": 6.475669991016388,
"loss": 0.6157,
"step": 218000
},
{
"base_loss": 0.5839636498689651,
"epoch": 3.0591278076171875,
"grad_norm": 0.0010969273280352354,
"learning_rate": 2.916231155395508e-05,
"lookahead_loss": 6.375632545471191,
"loss": 0.5996,
"step": 218500
},
{
"base_loss": 0.5957706315517426,
"epoch": 3.0600814819335938,
"grad_norm": 0.0011069747852161527,
"learning_rate": 2.9114627838134766e-05,
"lookahead_loss": 6.45486762714386,
"loss": 0.6077,
"step": 219000
},
{
"base_loss": 0.6068459544181823,
"epoch": 3.06103515625,
"grad_norm": 0.0011218679137527943,
"learning_rate": 2.9066944122314456e-05,
"lookahead_loss": 6.403849498271942,
"loss": 0.6192,
"step": 219500
},
{
"base_loss": 0.6023038199543953,
"epoch": 3.0619888305664062,
"grad_norm": 0.0011592921800911427,
"learning_rate": 2.9019260406494143e-05,
"lookahead_loss": 6.420722769737243,
"loss": 0.6158,
"step": 220000
},
{
"epoch": 3.0619888305664062,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.437712517790139,
"eval_lookahead_perplexity": 624.9755430121139,
"eval_loss": 0.2074456363916397,
"eval_perplexity": 1.2305308189261874,
"eval_runtime": 93.2872,
"eval_samples_per_second": 53.598,
"eval_steps_per_second": 1.683,
"step": 220000
},
{
"base_loss": 0.5944037571549415,
"epoch": 3.0629425048828125,
"grad_norm": 0.0011297245509922504,
"learning_rate": 2.897157669067383e-05,
"lookahead_loss": 6.439490074634552,
"loss": 0.6075,
"step": 220500
},
{
"base_loss": 0.6128740097880363,
"epoch": 3.0638961791992188,
"grad_norm": 0.0010691159404814243,
"learning_rate": 2.8923892974853516e-05,
"lookahead_loss": 6.441310132026673,
"loss": 0.6236,
"step": 221000
},
{
"base_loss": 0.6068454984426498,
"epoch": 3.064849853515625,
"grad_norm": 0.0011188320349901915,
"learning_rate": 2.8876209259033203e-05,
"lookahead_loss": 6.489636187553406,
"loss": 0.6166,
"step": 221500
},
{
"base_loss": 0.6046090689897538,
"epoch": 3.0658035278320312,
"grad_norm": 0.0011356692994013429,
"learning_rate": 2.8828525543212893e-05,
"lookahead_loss": 6.39787455034256,
"loss": 0.6135,
"step": 222000
},
{
"base_loss": 0.6052177213430404,
"epoch": 3.0667572021484375,
"grad_norm": 0.001095956307835877,
"learning_rate": 2.878084182739258e-05,
"lookahead_loss": 6.380840573787689,
"loss": 0.6114,
"step": 222500
},
{
"base_loss": 0.6186637369394302,
"epoch": 3.0677108764648438,
"grad_norm": 0.001142095890827477,
"learning_rate": 2.8733158111572267e-05,
"lookahead_loss": 6.458719520568848,
"loss": 0.6293,
"step": 223000
},
{
"base_loss": 0.5988062580823899,
"epoch": 3.06866455078125,
"grad_norm": 0.0010754456743597984,
"learning_rate": 2.8685474395751954e-05,
"lookahead_loss": 6.417570797920227,
"loss": 0.6071,
"step": 223500
},
{
"base_loss": 0.5868127301335335,
"epoch": 3.0696182250976562,
"grad_norm": 0.0011147081386297941,
"learning_rate": 2.863779067993164e-05,
"lookahead_loss": 6.465461602210999,
"loss": 0.6002,
"step": 224000
},
{
"base_loss": 0.6132235081791878,
"epoch": 3.0705718994140625,
"grad_norm": 0.0011301016202196479,
"learning_rate": 2.859010696411133e-05,
"lookahead_loss": 6.349413995742798,
"loss": 0.628,
"step": 224500
},
{
"base_loss": 0.5981150686740875,
"epoch": 3.0715255737304688,
"grad_norm": 0.001094558509066701,
"learning_rate": 2.8542423248291018e-05,
"lookahead_loss": 6.409218377113342,
"loss": 0.6103,
"step": 225000
},
{
"epoch": 3.0715255737304688,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.430860436381623,
"eval_lookahead_perplexity": 620.7077978795056,
"eval_loss": 0.20743218064308167,
"eval_perplexity": 1.2305142613242928,
"eval_runtime": 91.988,
"eval_samples_per_second": 54.355,
"eval_steps_per_second": 1.707,
"step": 225000
},
{
"base_loss": 0.5917516273856163,
"epoch": 3.072479248046875,
"grad_norm": 0.001141304150223732,
"learning_rate": 2.8494739532470705e-05,
"lookahead_loss": 6.462259509086609,
"loss": 0.6047,
"step": 225500
},
{
"base_loss": 0.5921737739443779,
"epoch": 3.0734329223632812,
"grad_norm": 0.0010474661830812693,
"learning_rate": 2.844705581665039e-05,
"lookahead_loss": 6.417533561706543,
"loss": 0.6032,
"step": 226000
},
{
"base_loss": 0.6123625862002373,
"epoch": 3.0743865966796875,
"grad_norm": 0.0010933991288766265,
"learning_rate": 2.8399372100830078e-05,
"lookahead_loss": 6.422773428916932,
"loss": 0.6223,
"step": 226500
},
{
"base_loss": 0.593733324766159,
"epoch": 3.0753402709960938,
"grad_norm": 0.0010934488382190466,
"learning_rate": 2.835168838500977e-05,
"lookahead_loss": 6.487575828552246,
"loss": 0.6048,
"step": 227000
},
{
"base_loss": 0.5864555166959763,
"epoch": 3.0762939453125,
"grad_norm": 0.0010649971663951874,
"learning_rate": 2.8304004669189455e-05,
"lookahead_loss": 6.451822665691376,
"loss": 0.5975,
"step": 227500
},
{
"base_loss": 0.6271779141426086,
"epoch": 3.0772476196289062,
"grad_norm": 0.0011156428372487426,
"learning_rate": 2.8256320953369142e-05,
"lookahead_loss": 6.417567262649536,
"loss": 0.638,
"step": 228000
},
{
"base_loss": 0.5956557096838951,
"epoch": 3.0782012939453125,
"grad_norm": 0.001118983025662601,
"learning_rate": 2.820863723754883e-05,
"lookahead_loss": 6.439277349472046,
"loss": 0.6088,
"step": 228500
},
{
"base_loss": 0.599572938144207,
"epoch": 3.0791549682617188,
"grad_norm": 0.001086446107365191,
"learning_rate": 2.8160953521728516e-05,
"lookahead_loss": 6.460713619232178,
"loss": 0.6093,
"step": 229000
},
{
"base_loss": 0.6085604978203774,
"epoch": 3.080108642578125,
"grad_norm": 0.0010718839475885034,
"learning_rate": 2.8113269805908206e-05,
"lookahead_loss": 6.416140838623047,
"loss": 0.6184,
"step": 229500
},
{
"base_loss": 0.6050790804624557,
"epoch": 3.0810623168945312,
"grad_norm": 0.0011485074646770954,
"learning_rate": 2.8065586090087893e-05,
"lookahead_loss": 6.4402008790969845,
"loss": 0.6174,
"step": 230000
},
{
"epoch": 3.0810623168945312,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.423224188649235,
"eval_lookahead_perplexity": 615.9859708310377,
"eval_loss": 0.20741787552833557,
"eval_perplexity": 1.2304966588024913,
"eval_runtime": 90.357,
"eval_samples_per_second": 55.336,
"eval_steps_per_second": 1.738,
"step": 230000
},
{
"base_loss": 0.5915502496957779,
"epoch": 3.0820159912109375,
"grad_norm": 0.0010833791457116604,
"learning_rate": 2.801790237426758e-05,
"lookahead_loss": 6.446974056243897,
"loss": 0.6047,
"step": 230500
},
{
"base_loss": 0.5885776147842408,
"epoch": 3.0829696655273438,
"grad_norm": 0.0011150679783895612,
"learning_rate": 2.7970218658447266e-05,
"lookahead_loss": 6.480014348506928,
"loss": 0.6035,
"step": 231000
},
{
"base_loss": 0.6157435640096665,
"epoch": 3.08392333984375,
"grad_norm": 0.0010689526097849011,
"learning_rate": 2.7922534942626953e-05,
"lookahead_loss": 6.488008930206298,
"loss": 0.6266,
"step": 231500
},
{
"base_loss": 0.5945523136258125,
"epoch": 3.0848770141601562,
"grad_norm": 0.001094442093744874,
"learning_rate": 2.7874851226806643e-05,
"lookahead_loss": 6.41443329334259,
"loss": 0.6037,
"step": 232000
},
{
"base_loss": 0.5877983981966972,
"epoch": 3.0858306884765625,
"grad_norm": 0.0011258223094046116,
"learning_rate": 2.782716751098633e-05,
"lookahead_loss": 6.43271403503418,
"loss": 0.5988,
"step": 232500
},
{
"base_loss": 0.5929736877083779,
"epoch": 3.0867843627929688,
"grad_norm": 0.0012092749821022153,
"learning_rate": 2.7779483795166017e-05,
"lookahead_loss": 6.397968234062195,
"loss": 0.6049,
"step": 233000
},
{
"base_loss": 0.6253425707817077,
"epoch": 3.087738037109375,
"grad_norm": 0.001021925127133727,
"learning_rate": 2.7731800079345704e-05,
"lookahead_loss": 6.433693765163421,
"loss": 0.6294,
"step": 233500
},
{
"base_loss": 0.5948817446827889,
"epoch": 3.0886917114257812,
"grad_norm": 0.0011388851562514901,
"learning_rate": 2.768411636352539e-05,
"lookahead_loss": 6.4267685527801515,
"loss": 0.6046,
"step": 234000
},
{
"base_loss": 0.6024612309336662,
"epoch": 3.0896453857421875,
"grad_norm": 0.0011255706194788218,
"learning_rate": 2.763643264770508e-05,
"lookahead_loss": 6.45055183506012,
"loss": 0.61,
"step": 234500
},
{
"base_loss": 0.5884939526319504,
"epoch": 3.0905990600585938,
"grad_norm": 0.0010525273391976953,
"learning_rate": 2.7588748931884768e-05,
"lookahead_loss": 6.4216416721344,
"loss": 0.5991,
"step": 235000
},
{
"epoch": 3.0905990600585938,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.416253169885459,
"eval_lookahead_perplexity": 611.7068533031069,
"eval_loss": 0.20740444958209991,
"eval_perplexity": 1.2304801383314088,
"eval_runtime": 91.5784,
"eval_samples_per_second": 54.598,
"eval_steps_per_second": 1.714,
"step": 235000
},
{
"base_loss": 0.5811684273481369,
"epoch": 3.091552734375,
"grad_norm": 0.0010410203831270337,
"learning_rate": 2.7541065216064455e-05,
"lookahead_loss": 6.4036853876113895,
"loss": 0.5934,
"step": 235500
},
{
"base_loss": 0.6168777622580528,
"epoch": 3.0925064086914062,
"grad_norm": 0.001115046557970345,
"learning_rate": 2.749338150024414e-05,
"lookahead_loss": 6.453854230880737,
"loss": 0.6274,
"step": 236000
},
{
"base_loss": 0.59999961155653,
"epoch": 3.0934600830078125,
"grad_norm": 0.0010715369135141373,
"learning_rate": 2.7445697784423828e-05,
"lookahead_loss": 6.4379607105255126,
"loss": 0.6127,
"step": 236500
},
{
"base_loss": 0.5828448947072029,
"epoch": 3.0944137573242188,
"grad_norm": 0.0011117984540760517,
"learning_rate": 2.739801406860352e-05,
"lookahead_loss": 6.436477872371674,
"loss": 0.5952,
"step": 237000
},
{
"base_loss": 0.5784061435461044,
"epoch": 3.095367431640625,
"grad_norm": 0.001090219127945602,
"learning_rate": 2.7350330352783205e-05,
"lookahead_loss": 6.364374763965607,
"loss": 0.5937,
"step": 237500
},
{
"base_loss": 0.5969022975564003,
"epoch": 3.0963211059570312,
"grad_norm": 0.0011031440226361156,
"learning_rate": 2.7302646636962892e-05,
"lookahead_loss": 6.427553595066071,
"loss": 0.6116,
"step": 238000
},
{
"base_loss": 0.6168330173492431,
"epoch": 3.0972747802734375,
"grad_norm": 0.0011244708439335227,
"learning_rate": 2.725496292114258e-05,
"lookahead_loss": 6.443069730758667,
"loss": 0.6256,
"step": 238500
},
{
"base_loss": 0.5952534638047219,
"epoch": 3.0982284545898438,
"grad_norm": 0.0011027234140783548,
"learning_rate": 2.7207279205322266e-05,
"lookahead_loss": 6.3885435886383055,
"loss": 0.6053,
"step": 239000
},
{
"base_loss": 0.5822330508232116,
"epoch": 3.09918212890625,
"grad_norm": 0.0011086640879511833,
"learning_rate": 2.7159595489501956e-05,
"lookahead_loss": 6.448413589000702,
"loss": 0.5947,
"step": 239500
},
{
"base_loss": 0.5872656118273735,
"epoch": 3.1001358032226562,
"grad_norm": 0.001106367213651538,
"learning_rate": 2.7111911773681643e-05,
"lookahead_loss": 6.428266541957855,
"loss": 0.6033,
"step": 240000
},
{
"epoch": 3.1001358032226562,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.410780125151808,
"eval_lookahead_perplexity": 608.3680992404113,
"eval_loss": 0.20739257335662842,
"eval_perplexity": 1.2304655249586238,
"eval_runtime": 93.0639,
"eval_samples_per_second": 53.727,
"eval_steps_per_second": 1.687,
"step": 240000
},
{
"base_loss": 0.6096063278317452,
"epoch": 3.1010894775390625,
"grad_norm": 0.0010960623621940613,
"learning_rate": 2.706422805786133e-05,
"lookahead_loss": 6.487349942684173,
"loss": 0.6199,
"step": 240500
},
{
"base_loss": 0.6008942748308181,
"epoch": 3.1020431518554688,
"grad_norm": 0.001130930962972343,
"learning_rate": 2.7016544342041016e-05,
"lookahead_loss": 6.453820777893067,
"loss": 0.6088,
"step": 241000
},
{
"base_loss": 0.5871310735344887,
"epoch": 3.102996826171875,
"grad_norm": 0.0010819945018738508,
"learning_rate": 2.6968860626220703e-05,
"lookahead_loss": 6.439614234924316,
"loss": 0.6019,
"step": 241500
},
{
"base_loss": 0.5920609677433968,
"epoch": 3.1039505004882812,
"grad_norm": 0.0011099249823018909,
"learning_rate": 2.6921176910400393e-05,
"lookahead_loss": 6.449179169654847,
"loss": 0.6023,
"step": 242000
},
{
"base_loss": 0.6111314262747765,
"epoch": 3.1049041748046875,
"grad_norm": 0.0010791884269565344,
"learning_rate": 2.687349319458008e-05,
"lookahead_loss": 6.400581046104431,
"loss": 0.6206,
"step": 242500
},
{
"base_loss": 0.6005220351815224,
"epoch": 3.1058578491210938,
"grad_norm": 0.0010320625733584166,
"learning_rate": 2.6825809478759767e-05,
"lookahead_loss": 6.395295563697815,
"loss": 0.612,
"step": 243000
},
{
"base_loss": 0.5973421422243118,
"epoch": 3.1068115234375,
"grad_norm": 0.0010832108091562986,
"learning_rate": 2.6778125762939454e-05,
"lookahead_loss": 6.463225367546081,
"loss": 0.6071,
"step": 243500
},
{
"base_loss": 0.5840073474049569,
"epoch": 3.1077651977539062,
"grad_norm": 0.0011341134086251259,
"learning_rate": 2.673044204711914e-05,
"lookahead_loss": 6.36874011850357,
"loss": 0.5946,
"step": 244000
},
{
"base_loss": 0.6044590792655945,
"epoch": 3.1087188720703125,
"grad_norm": 0.0011408327845856547,
"learning_rate": 2.668275833129883e-05,
"lookahead_loss": 6.373018598556518,
"loss": 0.6181,
"step": 244500
},
{
"base_loss": 0.6069816564917564,
"epoch": 3.1096725463867188,
"grad_norm": 0.00108093093149364,
"learning_rate": 2.6635074615478518e-05,
"lookahead_loss": 6.359239377021789,
"loss": 0.6166,
"step": 245000
},
{
"epoch": 3.1096725463867188,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.4042751575811225,
"eval_lookahead_perplexity": 604.4235280470633,
"eval_loss": 0.20738010108470917,
"eval_perplexity": 1.230450178353713,
"eval_runtime": 89.9445,
"eval_samples_per_second": 55.59,
"eval_steps_per_second": 1.746,
"step": 245000
},
{
"base_loss": 0.5957676445245743,
"epoch": 3.110626220703125,
"grad_norm": 0.0010888735996559262,
"learning_rate": 2.6587390899658205e-05,
"lookahead_loss": 6.333310267925262,
"loss": 0.607,
"step": 245500
},
{
"base_loss": 0.5857621270418167,
"epoch": 3.1115798950195312,
"grad_norm": 0.0010927121620625257,
"learning_rate": 2.653970718383789e-05,
"lookahead_loss": 6.424707530021667,
"loss": 0.5985,
"step": 246000
},
{
"base_loss": 0.603391693353653,
"epoch": 3.1125335693359375,
"grad_norm": 0.001093365834094584,
"learning_rate": 2.6492023468017578e-05,
"lookahead_loss": 6.4441242928504945,
"loss": 0.6132,
"step": 246500
},
{
"base_loss": 0.6127855234742164,
"epoch": 3.1134872436523438,
"grad_norm": 0.0011091139167547226,
"learning_rate": 2.644433975219727e-05,
"lookahead_loss": 6.444578236103058,
"loss": 0.6263,
"step": 247000
},
{
"base_loss": 0.5977847113609314,
"epoch": 3.11444091796875,
"grad_norm": 0.0010543358512222767,
"learning_rate": 2.6396656036376955e-05,
"lookahead_loss": 6.342389549255371,
"loss": 0.6063,
"step": 247500
},
{
"base_loss": 0.585680897474289,
"epoch": 3.1153945922851562,
"grad_norm": 0.001126592163927853,
"learning_rate": 2.6348972320556642e-05,
"lookahead_loss": 6.385599094390869,
"loss": 0.5957,
"step": 248000
},
{
"base_loss": 0.6022236620783806,
"epoch": 3.1163482666015625,
"grad_norm": 0.0011800749925896525,
"learning_rate": 2.630128860473633e-05,
"lookahead_loss": 6.397586730957031,
"loss": 0.6145,
"step": 248500
},
{
"base_loss": 0.6104110144376754,
"epoch": 3.1173019409179688,
"grad_norm": 0.0011340758064761758,
"learning_rate": 2.6253604888916016e-05,
"lookahead_loss": 6.392207997322083,
"loss": 0.6223,
"step": 249000
},
{
"base_loss": 0.603924877524376,
"epoch": 3.118255615234375,
"grad_norm": 0.0011272222036495805,
"learning_rate": 2.6205921173095706e-05,
"lookahead_loss": 6.430540772438049,
"loss": 0.6105,
"step": 249500
},
{
"base_loss": 0.5842441658377647,
"epoch": 3.1192092895507812,
"grad_norm": 0.001132176723331213,
"learning_rate": 2.6158237457275393e-05,
"lookahead_loss": 6.420798850536347,
"loss": 0.5941,
"step": 250000
},
{
"epoch": 3.1192092895507812,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.398296785811647,
"eval_lookahead_perplexity": 600.8208393267304,
"eval_loss": 0.20736823976039886,
"eval_perplexity": 1.230435583671656,
"eval_runtime": 93.3617,
"eval_samples_per_second": 53.555,
"eval_steps_per_second": 1.682,
"step": 250000
},
{
"base_loss": 0.5957836083769799,
"epoch": 4.000953674316406,
"grad_norm": 0.00111961062066257,
"learning_rate": 2.611055374145508e-05,
"lookahead_loss": 6.485253520011902,
"loss": 0.6023,
"step": 250500
},
{
"base_loss": 0.586733922958374,
"epoch": 4.0019073486328125,
"grad_norm": 0.0011267756344750524,
"learning_rate": 2.6062870025634766e-05,
"lookahead_loss": 6.335224370956421,
"loss": 0.5985,
"step": 251000
},
{
"base_loss": 0.6027773340344429,
"epoch": 4.002861022949219,
"grad_norm": 0.0011099674738943577,
"learning_rate": 2.6015186309814453e-05,
"lookahead_loss": 6.3454978213310245,
"loss": 0.6125,
"step": 251500
},
{
"base_loss": 0.6127874755859375,
"epoch": 4.003814697265625,
"grad_norm": 0.001098731765523553,
"learning_rate": 2.5967502593994143e-05,
"lookahead_loss": 6.355840661048889,
"loss": 0.6212,
"step": 252000
},
{
"base_loss": 0.5991843653917313,
"epoch": 4.004768371582031,
"grad_norm": 0.0010964460670948029,
"learning_rate": 2.591981887817383e-05,
"lookahead_loss": 6.346891592025757,
"loss": 0.6078,
"step": 252500
},
{
"base_loss": 0.5906335787773133,
"epoch": 4.0057220458984375,
"grad_norm": 0.0010437110904604197,
"learning_rate": 2.5872135162353517e-05,
"lookahead_loss": 6.4677832736969,
"loss": 0.6006,
"step": 253000
},
{
"base_loss": 0.5800002152323723,
"epoch": 4.006675720214844,
"grad_norm": 0.0010704013984650373,
"learning_rate": 2.5824451446533204e-05,
"lookahead_loss": 6.314324316501618,
"loss": 0.5963,
"step": 253500
},
{
"base_loss": 0.6064526105523109,
"epoch": 4.00762939453125,
"grad_norm": 0.0011120210401713848,
"learning_rate": 2.577676773071289e-05,
"lookahead_loss": 6.374777404785156,
"loss": 0.6184,
"step": 254000
},
{
"base_loss": 0.6036154347658157,
"epoch": 4.008583068847656,
"grad_norm": 0.0010486901737749577,
"learning_rate": 2.572908401489258e-05,
"lookahead_loss": 6.386540162086487,
"loss": 0.6094,
"step": 254500
},
{
"base_loss": 0.5886935539245606,
"epoch": 4.0095367431640625,
"grad_norm": 0.0011316712480038404,
"learning_rate": 2.5681400299072268e-05,
"lookahead_loss": 6.388794689178467,
"loss": 0.6054,
"step": 255000
},
{
"epoch": 4.0095367431640625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.392628372667697,
"eval_lookahead_perplexity": 597.4247728311323,
"eval_loss": 0.20735666155815125,
"eval_perplexity": 1.2304213375220883,
"eval_runtime": 90.6862,
"eval_samples_per_second": 55.135,
"eval_steps_per_second": 1.731,
"step": 255000
},
{
"base_loss": 0.5923344283699989,
"epoch": 4.010490417480469,
"grad_norm": 0.0011050583561882377,
"learning_rate": 2.5633716583251955e-05,
"lookahead_loss": 6.32834493637085,
"loss": 0.6037,
"step": 255500
},
{
"base_loss": 0.5904504474997521,
"epoch": 4.011444091796875,
"grad_norm": 0.001151700154878199,
"learning_rate": 2.558603286743164e-05,
"lookahead_loss": 6.364507519721985,
"loss": 0.605,
"step": 256000
},
{
"base_loss": 0.6088732249736786,
"epoch": 4.012397766113281,
"grad_norm": 0.0010878178291022778,
"learning_rate": 2.5538349151611328e-05,
"lookahead_loss": 6.36634335231781,
"loss": 0.6219,
"step": 256500
},
{
"base_loss": 0.59388463139534,
"epoch": 4.0133514404296875,
"grad_norm": 0.0011076608207076788,
"learning_rate": 2.549066543579102e-05,
"lookahead_loss": 6.438094673156738,
"loss": 0.607,
"step": 257000
},
{
"base_loss": 0.5942253875732422,
"epoch": 4.014305114746094,
"grad_norm": 0.0010897432221099734,
"learning_rate": 2.5442981719970705e-05,
"lookahead_loss": 6.4205363702774045,
"loss": 0.6056,
"step": 257500
},
{
"base_loss": 0.5844743223190307,
"epoch": 4.0152587890625,
"grad_norm": 0.001093736500479281,
"learning_rate": 2.5395298004150392e-05,
"lookahead_loss": 6.345088118553162,
"loss": 0.592,
"step": 258000
},
{
"base_loss": 0.6102200556993485,
"epoch": 4.016212463378906,
"grad_norm": 0.001128020347096026,
"learning_rate": 2.534761428833008e-05,
"lookahead_loss": 6.393371778488159,
"loss": 0.6185,
"step": 258500
},
{
"base_loss": 0.6051568930149078,
"epoch": 4.0171661376953125,
"grad_norm": 0.001108690514229238,
"learning_rate": 2.5299930572509766e-05,
"lookahead_loss": 6.449675846099853,
"loss": 0.6106,
"step": 259000
},
{
"base_loss": 0.5939539469480515,
"epoch": 4.018119812011719,
"grad_norm": 0.001039958675391972,
"learning_rate": 2.5252246856689456e-05,
"lookahead_loss": 6.4385554246902466,
"loss": 0.6013,
"step": 259500
},
{
"base_loss": 0.5881457543373108,
"epoch": 4.019073486328125,
"grad_norm": 0.001058859284967184,
"learning_rate": 2.5204563140869143e-05,
"lookahead_loss": 6.462026128768921,
"loss": 0.5991,
"step": 260000
},
{
"epoch": 4.019073486328125,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.386161040955077,
"eval_lookahead_perplexity": 593.5734958186138,
"eval_loss": 0.2073436826467514,
"eval_perplexity": 1.230405368096197,
"eval_runtime": 92.3148,
"eval_samples_per_second": 54.162,
"eval_steps_per_second": 1.701,
"step": 260000
},
{
"base_loss": 0.5868560363054276,
"epoch": 4.020027160644531,
"grad_norm": 0.0011423506075516343,
"learning_rate": 2.515687942504883e-05,
"lookahead_loss": 6.313244658946991,
"loss": 0.6021,
"step": 260500
},
{
"base_loss": 0.6168393405079842,
"epoch": 4.0209808349609375,
"grad_norm": 0.001083866460248828,
"learning_rate": 2.5109195709228516e-05,
"lookahead_loss": 6.362197784423828,
"loss": 0.6256,
"step": 261000
},
{
"base_loss": 0.5967223855853081,
"epoch": 4.021934509277344,
"grad_norm": 0.0011323863873258233,
"learning_rate": 2.5061511993408203e-05,
"lookahead_loss": 6.346835678100586,
"loss": 0.6105,
"step": 261500
},
{
"base_loss": 0.5978920136094094,
"epoch": 4.02288818359375,
"grad_norm": 0.00111202837433666,
"learning_rate": 2.5013828277587893e-05,
"lookahead_loss": 6.382146213531494,
"loss": 0.6063,
"step": 262000
},
{
"base_loss": 0.5982745458483696,
"epoch": 4.023841857910156,
"grad_norm": 0.001102231559343636,
"learning_rate": 2.496614456176758e-05,
"lookahead_loss": 6.34771177482605,
"loss": 0.6052,
"step": 262500
},
{
"base_loss": 0.6113470050692559,
"epoch": 4.0247955322265625,
"grad_norm": 0.0010907762916758657,
"learning_rate": 2.4918460845947267e-05,
"lookahead_loss": 6.360005939006806,
"loss": 0.621,
"step": 263000
},
{
"base_loss": 0.6052272637486458,
"epoch": 4.025749206542969,
"grad_norm": 0.001096719759516418,
"learning_rate": 2.4870777130126954e-05,
"lookahead_loss": 6.313052820205688,
"loss": 0.6153,
"step": 263500
},
{
"base_loss": 0.5957095698714256,
"epoch": 4.026702880859375,
"grad_norm": 0.0011290950933471322,
"learning_rate": 2.482309341430664e-05,
"lookahead_loss": 6.339996559143066,
"loss": 0.6055,
"step": 264000
},
{
"base_loss": 0.5928845180273056,
"epoch": 4.027656555175781,
"grad_norm": 0.0011050624307245016,
"learning_rate": 2.477540969848633e-05,
"lookahead_loss": 6.442285621643067,
"loss": 0.6044,
"step": 264500
},
{
"base_loss": 0.6156682388782502,
"epoch": 4.0286102294921875,
"grad_norm": 0.0010948260314762592,
"learning_rate": 2.4727725982666018e-05,
"lookahead_loss": 6.466365109920502,
"loss": 0.6243,
"step": 265000
},
{
"epoch": 4.0286102294921875,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.380477937265707,
"eval_lookahead_perplexity": 590.2097234815991,
"eval_loss": 0.2073325216770172,
"eval_perplexity": 1.230391635655757,
"eval_runtime": 91.8069,
"eval_samples_per_second": 54.462,
"eval_steps_per_second": 1.71,
"step": 265000
},
{
"base_loss": 0.6024927053451538,
"epoch": 4.029563903808594,
"grad_norm": 0.0011052724439650774,
"learning_rate": 2.4680042266845705e-05,
"lookahead_loss": 6.400326736450196,
"loss": 0.6118,
"step": 265500
},
{
"base_loss": 0.5958824927210807,
"epoch": 4.030517578125,
"grad_norm": 0.0010953382588922977,
"learning_rate": 2.463235855102539e-05,
"lookahead_loss": 6.396139843940735,
"loss": 0.6077,
"step": 266000
},
{
"base_loss": 0.5906458727121353,
"epoch": 4.031471252441406,
"grad_norm": 0.0011001034872606397,
"learning_rate": 2.4584674835205078e-05,
"lookahead_loss": 6.419850166797638,
"loss": 0.6039,
"step": 266500
},
{
"base_loss": 0.6140482442975044,
"epoch": 4.0324249267578125,
"grad_norm": 0.001121096545830369,
"learning_rate": 2.453699111938477e-05,
"lookahead_loss": 6.364969326019287,
"loss": 0.6282,
"step": 267000
},
{
"base_loss": 0.5971780525445938,
"epoch": 4.033378601074219,
"grad_norm": 0.0011231973767280579,
"learning_rate": 2.4489307403564455e-05,
"lookahead_loss": 6.409055516242981,
"loss": 0.6072,
"step": 267500
},
{
"base_loss": 0.5922137571573257,
"epoch": 4.034332275390625,
"grad_norm": 0.0010805290658026934,
"learning_rate": 2.4441623687744142e-05,
"lookahead_loss": 6.48427177810669,
"loss": 0.6062,
"step": 268000
},
{
"base_loss": 0.591649469256401,
"epoch": 4.035285949707031,
"grad_norm": 0.0011319770710542798,
"learning_rate": 2.439393997192383e-05,
"lookahead_loss": 6.313907026767731,
"loss": 0.6035,
"step": 268500
},
{
"base_loss": 0.609982794225216,
"epoch": 4.0362396240234375,
"grad_norm": 0.0011232325341552496,
"learning_rate": 2.4346256256103516e-05,
"lookahead_loss": 6.435509991645813,
"loss": 0.6217,
"step": 269000
},
{
"base_loss": 0.5994021319746972,
"epoch": 4.037193298339844,
"grad_norm": 0.0011218226281926036,
"learning_rate": 2.4298572540283206e-05,
"lookahead_loss": 6.378507499694824,
"loss": 0.6093,
"step": 269500
},
{
"base_loss": 0.5912262842059135,
"epoch": 4.03814697265625,
"grad_norm": 0.0010908119147643447,
"learning_rate": 2.4250888824462893e-05,
"lookahead_loss": 6.3788512840271,
"loss": 0.6023,
"step": 270000
},
{
"epoch": 4.03814697265625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.375024708696067,
"eval_lookahead_perplexity": 586.9999347655023,
"eval_loss": 0.207322895526886,
"eval_perplexity": 1.2303797917781574,
"eval_runtime": 89.9406,
"eval_samples_per_second": 55.592,
"eval_steps_per_second": 1.746,
"step": 270000
},
{
"base_loss": 0.5976571834087372,
"epoch": 4.039100646972656,
"grad_norm": 0.001099589979276061,
"learning_rate": 2.420320510864258e-05,
"lookahead_loss": 6.3452381052970885,
"loss": 0.6117,
"step": 270500
},
{
"base_loss": 0.6155648761987687,
"epoch": 4.0400543212890625,
"grad_norm": 0.0011181931477040052,
"learning_rate": 2.4155521392822266e-05,
"lookahead_loss": 6.385155973434448,
"loss": 0.6245,
"step": 271000
},
{
"base_loss": 0.603646912753582,
"epoch": 4.041007995605469,
"grad_norm": 0.0011384629178792238,
"learning_rate": 2.4107837677001953e-05,
"lookahead_loss": 6.339157883644104,
"loss": 0.6137,
"step": 271500
},
{
"base_loss": 0.5785835943818093,
"epoch": 4.041961669921875,
"grad_norm": 0.0011360092321410775,
"learning_rate": 2.406015396118164e-05,
"lookahead_loss": 6.399327411651611,
"loss": 0.5954,
"step": 272000
},
{
"base_loss": 0.6030765172839164,
"epoch": 4.042915344238281,
"grad_norm": 0.0010638670064508915,
"learning_rate": 2.401247024536133e-05,
"lookahead_loss": 6.423156121253967,
"loss": 0.6143,
"step": 272500
},
{
"base_loss": 0.6119026395678521,
"epoch": 4.0438690185546875,
"grad_norm": 0.0011214343830943108,
"learning_rate": 2.3964786529541017e-05,
"lookahead_loss": 6.441630228996277,
"loss": 0.6252,
"step": 273000
},
{
"base_loss": 0.5916597181558609,
"epoch": 4.044822692871094,
"grad_norm": 0.0011162528535351157,
"learning_rate": 2.3917102813720704e-05,
"lookahead_loss": 6.372374580383301,
"loss": 0.6053,
"step": 273500
},
{
"base_loss": 0.5914614844322205,
"epoch": 4.0457763671875,
"grad_norm": 0.0010877457680180669,
"learning_rate": 2.386941909790039e-05,
"lookahead_loss": 6.3761955223083495,
"loss": 0.6021,
"step": 274000
},
{
"base_loss": 0.621587080359459,
"epoch": 4.046730041503906,
"grad_norm": 0.001101334230042994,
"learning_rate": 2.3821735382080078e-05,
"lookahead_loss": 6.33512171459198,
"loss": 0.6318,
"step": 274500
},
{
"base_loss": 0.6045556816458703,
"epoch": 4.0476837158203125,
"grad_norm": 0.001156001933850348,
"learning_rate": 2.3774051666259768e-05,
"lookahead_loss": 6.388499364376068,
"loss": 0.6215,
"step": 275000
},
{
"epoch": 4.0476837158203125,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.370460871309518,
"eval_lookahead_perplexity": 584.3270664248921,
"eval_loss": 0.20731313526630402,
"eval_perplexity": 1.2303677830093793,
"eval_runtime": 91.8401,
"eval_samples_per_second": 54.442,
"eval_steps_per_second": 1.709,
"step": 275000
},
{
"base_loss": 0.5886928399801254,
"epoch": 4.048637390136719,
"grad_norm": 0.0010876537999138236,
"learning_rate": 2.3726367950439455e-05,
"lookahead_loss": 6.358909488677979,
"loss": 0.6008,
"step": 275500
},
{
"base_loss": 0.5989522265791893,
"epoch": 4.049591064453125,
"grad_norm": 0.0011483179405331612,
"learning_rate": 2.367868423461914e-05,
"lookahead_loss": 6.335620626926422,
"loss": 0.6074,
"step": 276000
},
{
"base_loss": 0.615303504705429,
"epoch": 4.050544738769531,
"grad_norm": 0.0010323745664209127,
"learning_rate": 2.3631000518798828e-05,
"lookahead_loss": 6.436678630828857,
"loss": 0.6272,
"step": 276500
},
{
"base_loss": 0.5973832362890243,
"epoch": 4.0514984130859375,
"grad_norm": 0.0011526040034368634,
"learning_rate": 2.3583316802978515e-05,
"lookahead_loss": 6.392355844974518,
"loss": 0.6105,
"step": 277000
},
{
"base_loss": 0.5953535653948784,
"epoch": 4.052452087402344,
"grad_norm": 0.0011593152303248644,
"learning_rate": 2.3535633087158205e-05,
"lookahead_loss": 6.3196696195602415,
"loss": 0.6054,
"step": 277500
},
{
"base_loss": 0.609023510336876,
"epoch": 4.05340576171875,
"grad_norm": 0.0010948091512545943,
"learning_rate": 2.3487949371337892e-05,
"lookahead_loss": 6.335317673683167,
"loss": 0.6192,
"step": 278000
},
{
"base_loss": 0.6116033662557602,
"epoch": 4.054359436035156,
"grad_norm": 0.0011012164177373052,
"learning_rate": 2.344026565551758e-05,
"lookahead_loss": 6.380707992076874,
"loss": 0.6258,
"step": 278500
},
{
"base_loss": 0.5943940283060074,
"epoch": 4.0553131103515625,
"grad_norm": 0.0010989225702360272,
"learning_rate": 2.3392581939697266e-05,
"lookahead_loss": 6.384932087421417,
"loss": 0.6041,
"step": 279000
},
{
"base_loss": 0.5885190908908844,
"epoch": 4.056266784667969,
"grad_norm": 0.001057869172655046,
"learning_rate": 2.3344898223876953e-05,
"lookahead_loss": 6.398144548416138,
"loss": 0.6033,
"step": 279500
},
{
"base_loss": 0.6137237245440483,
"epoch": 4.057220458984375,
"grad_norm": 0.001099518733099103,
"learning_rate": 2.3297214508056643e-05,
"lookahead_loss": 6.392653202056885,
"loss": 0.6266,
"step": 280000
},
{
"epoch": 4.057220458984375,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.365785460121716,
"eval_lookahead_perplexity": 581.6014737195064,
"eval_loss": 0.20730404555797577,
"eval_perplexity": 1.2303565993759233,
"eval_runtime": 91.3171,
"eval_samples_per_second": 54.754,
"eval_steps_per_second": 1.719,
"step": 280000
},
{
"base_loss": 0.6062676934599877,
"epoch": 4.058174133300781,
"grad_norm": 0.0011220412561669946,
"learning_rate": 2.324953079223633e-05,
"lookahead_loss": 6.392493741989136,
"loss": 0.6151,
"step": 280500
},
{
"base_loss": 0.5848039170503616,
"epoch": 4.0591278076171875,
"grad_norm": 0.0011025768471881747,
"learning_rate": 2.3201847076416016e-05,
"lookahead_loss": 6.284692679405213,
"loss": 0.6003,
"step": 281000
},
{
"base_loss": 0.5932843062877655,
"epoch": 4.060081481933594,
"grad_norm": 0.0011020904639735818,
"learning_rate": 2.3154163360595703e-05,
"lookahead_loss": 6.377818461418152,
"loss": 0.6053,
"step": 281500
},
{
"base_loss": 0.6079036598205566,
"epoch": 4.06103515625,
"grad_norm": 0.0011223220499232411,
"learning_rate": 2.310647964477539e-05,
"lookahead_loss": 6.317885259151459,
"loss": 0.6188,
"step": 282000
},
{
"base_loss": 0.6019631532430649,
"epoch": 4.061988830566406,
"grad_norm": 0.0011525802547112107,
"learning_rate": 2.305879592895508e-05,
"lookahead_loss": 6.3444995069503785,
"loss": 0.6138,
"step": 282500
},
{
"base_loss": 0.5940396988987923,
"epoch": 4.0629425048828125,
"grad_norm": 0.001133267069235444,
"learning_rate": 2.3011112213134767e-05,
"lookahead_loss": 6.347807200431824,
"loss": 0.6066,
"step": 283000
},
{
"base_loss": 0.612035707950592,
"epoch": 4.063896179199219,
"grad_norm": 0.0010704045416787267,
"learning_rate": 2.2963428497314454e-05,
"lookahead_loss": 6.361331562995911,
"loss": 0.6231,
"step": 283500
},
{
"base_loss": 0.6061015563607216,
"epoch": 4.064849853515625,
"grad_norm": 0.0011012317845597863,
"learning_rate": 2.291574478149414e-05,
"lookahead_loss": 6.41039630651474,
"loss": 0.6162,
"step": 284000
},
{
"base_loss": 0.6026831232309341,
"epoch": 4.065803527832031,
"grad_norm": 0.0011320897610858083,
"learning_rate": 2.2868061065673828e-05,
"lookahead_loss": 6.329383935928345,
"loss": 0.6138,
"step": 284500
},
{
"base_loss": 0.6037889505624772,
"epoch": 4.0667572021484375,
"grad_norm": 0.0011039386736229062,
"learning_rate": 2.2820377349853518e-05,
"lookahead_loss": 6.318099370002747,
"loss": 0.6115,
"step": 285000
},
{
"epoch": 4.0667572021484375,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.360919896025246,
"eval_lookahead_perplexity": 578.7785276521714,
"eval_loss": 0.2072950154542923,
"eval_perplexity": 1.2303454891784265,
"eval_runtime": 89.1026,
"eval_samples_per_second": 56.115,
"eval_steps_per_second": 1.762,
"step": 285000
},
{
"base_loss": 0.6186125885248184,
"epoch": 4.067710876464844,
"grad_norm": 0.0011432436294853687,
"learning_rate": 2.2772693634033205e-05,
"lookahead_loss": 6.3883231830596925,
"loss": 0.6282,
"step": 285500
},
{
"base_loss": 0.5985263943076133,
"epoch": 4.06866455078125,
"grad_norm": 0.001086168922483921,
"learning_rate": 2.272500991821289e-05,
"lookahead_loss": 6.344208096981048,
"loss": 0.609,
"step": 286000
},
{
"base_loss": 0.5856241980195045,
"epoch": 4.069618225097656,
"grad_norm": 0.001086947857402265,
"learning_rate": 2.2677326202392578e-05,
"lookahead_loss": 6.384473464012146,
"loss": 0.5994,
"step": 286500
},
{
"base_loss": 0.615737675666809,
"epoch": 4.0705718994140625,
"grad_norm": 0.0011107822647318244,
"learning_rate": 2.2629642486572265e-05,
"lookahead_loss": 6.289638694763184,
"loss": 0.628,
"step": 287000
},
{
"base_loss": 0.5986395262479782,
"epoch": 4.071525573730469,
"grad_norm": 0.0010868743993341923,
"learning_rate": 2.2581958770751955e-05,
"lookahead_loss": 6.324786671638488,
"loss": 0.6105,
"step": 287500
},
{
"base_loss": 0.5934888973236084,
"epoch": 4.072479248046875,
"grad_norm": 0.0011527807218953967,
"learning_rate": 2.2534275054931642e-05,
"lookahead_loss": 6.377863648414611,
"loss": 0.6042,
"step": 288000
},
{
"base_loss": 0.5941547375321389,
"epoch": 4.073432922363281,
"grad_norm": 0.0010451872367411852,
"learning_rate": 2.248659133911133e-05,
"lookahead_loss": 6.336130671501159,
"loss": 0.6062,
"step": 288500
},
{
"base_loss": 0.6138048614859581,
"epoch": 4.0743865966796875,
"grad_norm": 0.00108580372761935,
"learning_rate": 2.2438907623291016e-05,
"lookahead_loss": 6.343031913757324,
"loss": 0.6243,
"step": 289000
},
{
"base_loss": 0.5931758234500885,
"epoch": 4.075340270996094,
"grad_norm": 0.001116293016821146,
"learning_rate": 2.2391223907470703e-05,
"lookahead_loss": 6.409209857940674,
"loss": 0.6072,
"step": 289500
},
{
"base_loss": 0.587039683163166,
"epoch": 4.0762939453125,
"grad_norm": 0.0010742460144683719,
"learning_rate": 2.2343540191650393e-05,
"lookahead_loss": 6.3832610340118405,
"loss": 0.5971,
"step": 290000
},
{
"epoch": 4.0762939453125,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.3556131493931,
"eval_lookahead_perplexity": 575.7152319059157,
"eval_loss": 0.20728513598442078,
"eval_perplexity": 1.2303333340772777,
"eval_runtime": 92.8001,
"eval_samples_per_second": 53.879,
"eval_steps_per_second": 1.692,
"step": 290000
},
{
"base_loss": 0.6259823521375656,
"epoch": 4.077247619628906,
"grad_norm": 0.001103928778320551,
"learning_rate": 2.229585647583008e-05,
"lookahead_loss": 6.341942510604858,
"loss": 0.6351,
"step": 290500
},
{
"base_loss": 0.5940629866719246,
"epoch": 4.0782012939453125,
"grad_norm": 0.001126352814026177,
"learning_rate": 2.2248172760009766e-05,
"lookahead_loss": 6.3599296531677245,
"loss": 0.6092,
"step": 291000
},
{
"base_loss": 0.5972123501300812,
"epoch": 4.079154968261719,
"grad_norm": 0.001072005252353847,
"learning_rate": 2.2200489044189453e-05,
"lookahead_loss": 6.396831311225891,
"loss": 0.6064,
"step": 291500
},
{
"base_loss": 0.6067690544128418,
"epoch": 4.080108642578125,
"grad_norm": 0.0010752794332802296,
"learning_rate": 2.215280532836914e-05,
"lookahead_loss": 6.333174806594848,
"loss": 0.6189,
"step": 292000
},
{
"base_loss": 0.6066553748250008,
"epoch": 4.081062316894531,
"grad_norm": 0.0011793439043685794,
"learning_rate": 2.210512161254883e-05,
"lookahead_loss": 6.3659382867813115,
"loss": 0.618,
"step": 292500
},
{
"base_loss": 0.5932499123811722,
"epoch": 4.0820159912109375,
"grad_norm": 0.001059235306456685,
"learning_rate": 2.2057437896728517e-05,
"lookahead_loss": 6.378923125267029,
"loss": 0.6045,
"step": 293000
},
{
"base_loss": 0.5890687267780303,
"epoch": 4.082969665527344,
"grad_norm": 0.0011305406223982573,
"learning_rate": 2.2009754180908204e-05,
"lookahead_loss": 6.4099526739120485,
"loss": 0.6029,
"step": 293500
},
{
"base_loss": 0.6172499601840973,
"epoch": 4.08392333984375,
"grad_norm": 0.0010661403648555279,
"learning_rate": 2.196207046508789e-05,
"lookahead_loss": 6.412414843082428,
"loss": 0.6288,
"step": 294000
},
{
"base_loss": 0.5928957391381263,
"epoch": 4.084877014160156,
"grad_norm": 0.0011031859321519732,
"learning_rate": 2.1914386749267578e-05,
"lookahead_loss": 6.342724691390991,
"loss": 0.6048,
"step": 294500
},
{
"base_loss": 0.5880893425941467,
"epoch": 4.0858306884765625,
"grad_norm": 0.0011241311440244317,
"learning_rate": 2.1866703033447268e-05,
"lookahead_loss": 6.35858272600174,
"loss": 0.5976,
"step": 295000
},
{
"epoch": 4.0858306884765625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.350282313343816,
"eval_lookahead_perplexity": 572.6543541608802,
"eval_loss": 0.20727534592151642,
"eval_perplexity": 1.2303212890955044,
"eval_runtime": 88.9078,
"eval_samples_per_second": 56.238,
"eval_steps_per_second": 1.766,
"step": 295000
},
{
"base_loss": 0.5925973926782608,
"epoch": 4.086784362792969,
"grad_norm": 0.0011744624935090542,
"learning_rate": 2.1819019317626955e-05,
"lookahead_loss": 6.3310195207595825,
"loss": 0.6042,
"step": 295500
},
{
"base_loss": 0.6219116472601891,
"epoch": 4.087738037109375,
"grad_norm": 0.0010088298004120588,
"learning_rate": 2.177133560180664e-05,
"lookahead_loss": 6.351883282661438,
"loss": 0.6272,
"step": 296000
},
{
"base_loss": 0.5964856284856797,
"epoch": 4.088691711425781,
"grad_norm": 0.0011463849805295467,
"learning_rate": 2.1723651885986328e-05,
"lookahead_loss": 6.367171252727508,
"loss": 0.605,
"step": 296500
},
{
"base_loss": 0.599702876329422,
"epoch": 4.0896453857421875,
"grad_norm": 0.001124189468100667,
"learning_rate": 2.1675968170166015e-05,
"lookahead_loss": 6.378195665359497,
"loss": 0.6085,
"step": 297000
},
{
"base_loss": 0.5873163719773292,
"epoch": 4.090599060058594,
"grad_norm": 0.0010804467601701617,
"learning_rate": 2.1628284454345705e-05,
"lookahead_loss": 6.3505833082199095,
"loss": 0.5978,
"step": 297500
},
{
"base_loss": 0.5817358963489533,
"epoch": 4.091552734375,
"grad_norm": 0.0010557883651927114,
"learning_rate": 2.1580600738525392e-05,
"lookahead_loss": 6.3347275447845455,
"loss": 0.5921,
"step": 298000
},
{
"base_loss": 0.6144214420318603,
"epoch": 4.092506408691406,
"grad_norm": 0.0011353583540767431,
"learning_rate": 2.153291702270508e-05,
"lookahead_loss": 6.380691371917725,
"loss": 0.626,
"step": 298500
},
{
"base_loss": 0.5997614379525185,
"epoch": 4.0934600830078125,
"grad_norm": 0.001061895745806396,
"learning_rate": 2.1485233306884766e-05,
"lookahead_loss": 6.359518153190613,
"loss": 0.6125,
"step": 299000
},
{
"base_loss": 0.5837140116095543,
"epoch": 4.094413757324219,
"grad_norm": 0.0010897148167714477,
"learning_rate": 2.1437549591064453e-05,
"lookahead_loss": 6.365825653076172,
"loss": 0.5966,
"step": 299500
},
{
"base_loss": 0.5794652185440063,
"epoch": 4.095367431640625,
"grad_norm": 0.0010967223206534982,
"learning_rate": 2.1389865875244143e-05,
"lookahead_loss": 6.290959995746612,
"loss": 0.594,
"step": 300000
},
{
"epoch": 4.095367431640625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.346539011397682,
"eval_lookahead_perplexity": 570.5147431060077,
"eval_loss": 0.20726701617240906,
"eval_perplexity": 1.2303110408705273,
"eval_runtime": 92.9044,
"eval_samples_per_second": 53.819,
"eval_steps_per_second": 1.69,
"step": 300000
},
{
"base_loss": 0.5975050512552261,
"epoch": 4.096321105957031,
"grad_norm": 0.0011109969345852733,
"learning_rate": 2.134218215942383e-05,
"lookahead_loss": 6.353515555381775,
"loss": 0.6116,
"step": 300500
},
{
"base_loss": 0.610376767218113,
"epoch": 4.0972747802734375,
"grad_norm": 0.0011318782344460487,
"learning_rate": 2.1294498443603516e-05,
"lookahead_loss": 6.376906949996949,
"loss": 0.6227,
"step": 301000
},
{
"base_loss": 0.5961799000501633,
"epoch": 4.098228454589844,
"grad_norm": 0.0010932480217888951,
"learning_rate": 2.1246814727783203e-05,
"lookahead_loss": 6.313533523082733,
"loss": 0.6066,
"step": 301500
},
{
"base_loss": 0.5839848874211311,
"epoch": 4.09918212890625,
"grad_norm": 0.0011126038152724504,
"learning_rate": 2.119913101196289e-05,
"lookahead_loss": 6.389643818378448,
"loss": 0.5955,
"step": 302000
},
{
"base_loss": 0.5879470457434655,
"epoch": 4.100135803222656,
"grad_norm": 0.0011223671026527882,
"learning_rate": 2.115144729614258e-05,
"lookahead_loss": 6.359994841575623,
"loss": 0.6028,
"step": 302500
},
{
"base_loss": 0.6098613065481185,
"epoch": 4.1010894775390625,
"grad_norm": 0.0010963305830955505,
"learning_rate": 2.1103763580322267e-05,
"lookahead_loss": 6.410213003635406,
"loss": 0.6181,
"step": 303000
},
{
"base_loss": 0.6013668415546417,
"epoch": 4.102043151855469,
"grad_norm": 0.0011391708394512534,
"learning_rate": 2.1056079864501954e-05,
"lookahead_loss": 6.382720433235169,
"loss": 0.6076,
"step": 303500
},
{
"base_loss": 0.5899085831642151,
"epoch": 4.102996826171875,
"grad_norm": 0.0010788282379508018,
"learning_rate": 2.100839614868164e-05,
"lookahead_loss": 6.362212418556213,
"loss": 0.6021,
"step": 304000
},
{
"base_loss": 0.5915732194781304,
"epoch": 4.103950500488281,
"grad_norm": 0.0011102594435214996,
"learning_rate": 2.0960712432861328e-05,
"lookahead_loss": 6.379088652610779,
"loss": 0.6022,
"step": 304500
},
{
"base_loss": 0.6102671644687653,
"epoch": 4.1049041748046875,
"grad_norm": 0.0010900754714384675,
"learning_rate": 2.0913028717041018e-05,
"lookahead_loss": 6.325960657596588,
"loss": 0.6216,
"step": 305000
},
{
"epoch": 4.1049041748046875,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.342223271775169,
"eval_lookahead_perplexity": 568.0578554810108,
"eval_loss": 0.20725864171981812,
"eval_perplexity": 1.2303007377321853,
"eval_runtime": 91.3026,
"eval_samples_per_second": 54.763,
"eval_steps_per_second": 1.72,
"step": 305000
},
{
"base_loss": 0.6025243458151818,
"epoch": 4.105857849121094,
"grad_norm": 0.001037590904161334,
"learning_rate": 2.0865345001220705e-05,
"lookahead_loss": 6.338426442146301,
"loss": 0.6134,
"step": 305500
},
{
"base_loss": 0.5969890383481979,
"epoch": 4.1068115234375,
"grad_norm": 0.0010905301896855235,
"learning_rate": 2.081766128540039e-05,
"lookahead_loss": 6.4113108348846435,
"loss": 0.6073,
"step": 306000
},
{
"base_loss": 0.5838305065631867,
"epoch": 4.107765197753906,
"grad_norm": 0.0011273369891569018,
"learning_rate": 2.0769977569580078e-05,
"lookahead_loss": 6.2966307439804075,
"loss": 0.5932,
"step": 306500
},
{
"base_loss": 0.6047073290944099,
"epoch": 4.1087188720703125,
"grad_norm": 0.0011418815702199936,
"learning_rate": 2.0722293853759765e-05,
"lookahead_loss": 6.3043163280487065,
"loss": 0.617,
"step": 307000
},
{
"base_loss": 0.6081058176159859,
"epoch": 4.109672546386719,
"grad_norm": 0.0010792854009196162,
"learning_rate": 2.0674610137939455e-05,
"lookahead_loss": 6.284334290027618,
"loss": 0.6181,
"step": 307500
},
{
"base_loss": 0.5982678539156914,
"epoch": 4.110626220703125,
"grad_norm": 0.00108222512062639,
"learning_rate": 2.0626926422119142e-05,
"lookahead_loss": 6.264766163825989,
"loss": 0.6081,
"step": 308000
},
{
"base_loss": 0.5849089304804802,
"epoch": 4.111579895019531,
"grad_norm": 0.001074333442375064,
"learning_rate": 2.057924270629883e-05,
"lookahead_loss": 6.356988987922668,
"loss": 0.5967,
"step": 308500
},
{
"base_loss": 0.60502344673872,
"epoch": 4.1125335693359375,
"grad_norm": 0.0010747781489044428,
"learning_rate": 2.0531558990478516e-05,
"lookahead_loss": 6.378328808307648,
"loss": 0.6128,
"step": 309000
},
{
"base_loss": 0.6168978958129883,
"epoch": 4.113487243652344,
"grad_norm": 0.001114184153266251,
"learning_rate": 2.0483875274658203e-05,
"lookahead_loss": 6.3930044984817505,
"loss": 0.6281,
"step": 309500
},
{
"base_loss": 0.5967838813066483,
"epoch": 4.11444091796875,
"grad_norm": 0.001040627365000546,
"learning_rate": 2.0436191558837893e-05,
"lookahead_loss": 6.270927430152893,
"loss": 0.6069,
"step": 310000
},
{
"epoch": 4.11444091796875,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.338280543732567,
"eval_lookahead_perplexity": 565.8225673067175,
"eval_loss": 0.2072509527206421,
"eval_perplexity": 1.2302912779871946,
"eval_runtime": 90.8255,
"eval_samples_per_second": 55.051,
"eval_steps_per_second": 1.729,
"step": 310000
},
{
"base_loss": 0.5838415359258652,
"epoch": 4.115394592285156,
"grad_norm": 0.0011250053066760302,
"learning_rate": 2.038850784301758e-05,
"lookahead_loss": 6.321314858436584,
"loss": 0.5963,
"step": 310500
},
{
"base_loss": 0.6032737566232681,
"epoch": 4.1163482666015625,
"grad_norm": 0.0011572489747777581,
"learning_rate": 2.0340824127197266e-05,
"lookahead_loss": 6.336915095329284,
"loss": 0.6151,
"step": 311000
},
{
"base_loss": 0.6101697637438774,
"epoch": 4.117301940917969,
"grad_norm": 0.0011303132632747293,
"learning_rate": 2.0293140411376953e-05,
"lookahead_loss": 6.329978828430176,
"loss": 0.6221,
"step": 311500
},
{
"base_loss": 0.5998096758127213,
"epoch": 4.118255615234375,
"grad_norm": 0.0011345641687512398,
"learning_rate": 2.024545669555664e-05,
"lookahead_loss": 6.358539440155029,
"loss": 0.6102,
"step": 312000
},
{
"base_loss": 0.5844805639982223,
"epoch": 4.119209289550781,
"grad_norm": 0.001088910736143589,
"learning_rate": 2.019777297973633e-05,
"lookahead_loss": 6.361526685237885,
"loss": 0.5943,
"step": 312500
},
{
"base_loss": 0.5970172066688537,
"epoch": 5.000953674316406,
"grad_norm": 0.0011056496296077967,
"learning_rate": 2.0150089263916017e-05,
"lookahead_loss": 6.426861086845398,
"loss": 0.602,
"step": 313000
},
{
"base_loss": 0.5878908542394639,
"epoch": 5.0019073486328125,
"grad_norm": 0.0011464518029242754,
"learning_rate": 2.0102405548095704e-05,
"lookahead_loss": 6.26604482460022,
"loss": 0.5986,
"step": 313500
},
{
"base_loss": 0.6054873216748238,
"epoch": 5.002861022949219,
"grad_norm": 0.0011213821126148105,
"learning_rate": 2.005472183227539e-05,
"lookahead_loss": 6.284090840816498,
"loss": 0.6149,
"step": 314000
},
{
"base_loss": 0.6155969781279564,
"epoch": 5.003814697265625,
"grad_norm": 0.001117490348406136,
"learning_rate": 2.0007038116455078e-05,
"lookahead_loss": 6.302778920173645,
"loss": 0.6229,
"step": 314500
},
{
"base_loss": 0.6011522135734558,
"epoch": 5.004768371582031,
"grad_norm": 0.0011072148336097598,
"learning_rate": 1.9959354400634768e-05,
"lookahead_loss": 6.271711923599243,
"loss": 0.6083,
"step": 315000
},
{
"epoch": 5.004768371582031,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.3342473286028484,
"eval_lookahead_perplexity": 563.5450790548485,
"eval_loss": 0.20724281668663025,
"eval_perplexity": 1.2302812683362319,
"eval_runtime": 91.4874,
"eval_samples_per_second": 54.652,
"eval_steps_per_second": 1.716,
"step": 315000
},
{
"base_loss": 0.5900973889827729,
"epoch": 5.0057220458984375,
"grad_norm": 0.0010573536856099963,
"learning_rate": 1.9911670684814455e-05,
"lookahead_loss": 6.4039622111320496,
"loss": 0.5992,
"step": 315500
},
{
"base_loss": 0.5815674068927765,
"epoch": 5.006675720214844,
"grad_norm": 0.0011015620548278093,
"learning_rate": 1.986398696899414e-05,
"lookahead_loss": 6.252389284610748,
"loss": 0.5969,
"step": 316000
},
{
"base_loss": 0.6040794110894203,
"epoch": 5.00762939453125,
"grad_norm": 0.0011115833185613155,
"learning_rate": 1.9816303253173828e-05,
"lookahead_loss": 6.311683702468872,
"loss": 0.6157,
"step": 316500
},
{
"base_loss": 0.6009674743413925,
"epoch": 5.008583068847656,
"grad_norm": 0.0010299838613718748,
"learning_rate": 1.9768619537353515e-05,
"lookahead_loss": 6.318372031211853,
"loss": 0.6064,
"step": 317000
},
{
"base_loss": 0.5907457684278488,
"epoch": 5.0095367431640625,
"grad_norm": 0.0011498293606564403,
"learning_rate": 1.9720935821533205e-05,
"lookahead_loss": 6.334591036319733,
"loss": 0.6071,
"step": 317500
},
{
"base_loss": 0.5929673384428025,
"epoch": 5.010490417480469,
"grad_norm": 0.0011196956038475037,
"learning_rate": 1.9673252105712892e-05,
"lookahead_loss": 6.270245428085327,
"loss": 0.6046,
"step": 318000
},
{
"base_loss": 0.5895965065360069,
"epoch": 5.011444091796875,
"grad_norm": 0.001152197364717722,
"learning_rate": 1.962556838989258e-05,
"lookahead_loss": 6.298059554100036,
"loss": 0.6054,
"step": 318500
},
{
"base_loss": 0.6096455634236336,
"epoch": 5.012397766113281,
"grad_norm": 0.0010954708559438586,
"learning_rate": 1.9577884674072266e-05,
"lookahead_loss": 6.312456937789917,
"loss": 0.6216,
"step": 319000
},
{
"base_loss": 0.5980722559094429,
"epoch": 5.0133514404296875,
"grad_norm": 0.0011161410948261619,
"learning_rate": 1.9530200958251953e-05,
"lookahead_loss": 6.37580472278595,
"loss": 0.6097,
"step": 319500
},
{
"base_loss": 0.5950591211915016,
"epoch": 5.014305114746094,
"grad_norm": 0.0011130195343866944,
"learning_rate": 1.9482517242431643e-05,
"lookahead_loss": 6.353333437919617,
"loss": 0.6077,
"step": 320000
},
{
"epoch": 5.014305114746094,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.33025686657086,
"eval_lookahead_perplexity": 561.3007547372324,
"eval_loss": 0.20723474025726318,
"eval_perplexity": 1.2302713320965912,
"eval_runtime": 92.6732,
"eval_samples_per_second": 53.953,
"eval_steps_per_second": 1.694,
"step": 320000
},
{
"base_loss": 0.5842609284520149,
"epoch": 5.0152587890625,
"grad_norm": 0.0011031727772206068,
"learning_rate": 1.943483352661133e-05,
"lookahead_loss": 6.298804305553436,
"loss": 0.5893,
"step": 320500
},
{
"base_loss": 0.6108002681136131,
"epoch": 5.016212463378906,
"grad_norm": 0.0011222651228308678,
"learning_rate": 1.9387149810791016e-05,
"lookahead_loss": 6.338112614154816,
"loss": 0.6178,
"step": 321000
},
{
"base_loss": 0.6050938671827316,
"epoch": 5.0171661376953125,
"grad_norm": 0.0011011798633262515,
"learning_rate": 1.9339466094970703e-05,
"lookahead_loss": 6.3767892370223995,
"loss": 0.6101,
"step": 321500
},
{
"base_loss": 0.59214752471447,
"epoch": 5.018119812011719,
"grad_norm": 0.0010596738429740071,
"learning_rate": 1.929178237915039e-05,
"lookahead_loss": 6.373836220741272,
"loss": 0.6019,
"step": 322000
},
{
"base_loss": 0.5841910520792007,
"epoch": 5.019073486328125,
"grad_norm": 0.0010723528685048223,
"learning_rate": 1.924409866333008e-05,
"lookahead_loss": 6.397507895469666,
"loss": 0.5966,
"step": 322500
},
{
"base_loss": 0.5900825787782669,
"epoch": 5.020027160644531,
"grad_norm": 0.0011345201637595892,
"learning_rate": 1.9196414947509767e-05,
"lookahead_loss": 6.252168926239014,
"loss": 0.6035,
"step": 323000
},
{
"base_loss": 0.6143665207028389,
"epoch": 5.0209808349609375,
"grad_norm": 0.001077457214705646,
"learning_rate": 1.9148731231689454e-05,
"lookahead_loss": 6.304201687812805,
"loss": 0.6244,
"step": 323500
},
{
"base_loss": 0.5979527611136436,
"epoch": 5.021934509277344,
"grad_norm": 0.0011455104686319828,
"learning_rate": 1.910104751586914e-05,
"lookahead_loss": 6.293455883979798,
"loss": 0.609,
"step": 324000
},
{
"base_loss": 0.5961930236816406,
"epoch": 5.02288818359375,
"grad_norm": 0.001122178859077394,
"learning_rate": 1.9053363800048828e-05,
"lookahead_loss": 6.3198678956031795,
"loss": 0.6044,
"step": 324500
},
{
"base_loss": 0.5978932146430016,
"epoch": 5.023841857910156,
"grad_norm": 0.0011242764303460717,
"learning_rate": 1.9005680084228518e-05,
"lookahead_loss": 6.288296319961548,
"loss": 0.6048,
"step": 325000
},
{
"epoch": 5.023841857910156,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.3256560042262455,
"eval_lookahead_perplexity": 558.7242189190645,
"eval_loss": 0.20722579956054688,
"eval_perplexity": 1.2302603326629036,
"eval_runtime": 89.5384,
"eval_samples_per_second": 55.842,
"eval_steps_per_second": 1.753,
"step": 325000
},
{
"base_loss": 0.6122328286170959,
"epoch": 5.0247955322265625,
"grad_norm": 0.0010839574970304966,
"learning_rate": 1.8957996368408205e-05,
"lookahead_loss": 6.295799336433411,
"loss": 0.6222,
"step": 325500
},
{
"base_loss": 0.6058134199380875,
"epoch": 5.025749206542969,
"grad_norm": 0.0010937975021079183,
"learning_rate": 1.891031265258789e-05,
"lookahead_loss": 6.256947278022766,
"loss": 0.6154,
"step": 326000
},
{
"base_loss": 0.5969057772159576,
"epoch": 5.026702880859375,
"grad_norm": 0.0011171259684488177,
"learning_rate": 1.8862628936767578e-05,
"lookahead_loss": 6.2739819231033325,
"loss": 0.6044,
"step": 326500
},
{
"base_loss": 0.5948947068452836,
"epoch": 5.027656555175781,
"grad_norm": 0.0010930420830845833,
"learning_rate": 1.8814945220947265e-05,
"lookahead_loss": 6.3927841548919675,
"loss": 0.6051,
"step": 327000
},
{
"base_loss": 0.6182447550296784,
"epoch": 5.0286102294921875,
"grad_norm": 0.001109077362343669,
"learning_rate": 1.8767261505126955e-05,
"lookahead_loss": 6.409919787406921,
"loss": 0.6284,
"step": 327500
},
{
"base_loss": 0.5978566119670868,
"epoch": 5.029563903808594,
"grad_norm": 0.0010880293557420373,
"learning_rate": 1.8719577789306642e-05,
"lookahead_loss": 6.346259602069855,
"loss": 0.6094,
"step": 328000
},
{
"base_loss": 0.5951992619633675,
"epoch": 5.030517578125,
"grad_norm": 0.0010963050881400704,
"learning_rate": 1.867189407348633e-05,
"lookahead_loss": 6.3438457818031315,
"loss": 0.6075,
"step": 328500
},
{
"base_loss": 0.5904952138662338,
"epoch": 5.031471252441406,
"grad_norm": 0.0010976595804095268,
"learning_rate": 1.8624210357666016e-05,
"lookahead_loss": 6.359939827442169,
"loss": 0.6039,
"step": 329000
},
{
"base_loss": 0.6122912662625313,
"epoch": 5.0324249267578125,
"grad_norm": 0.0011258937884122133,
"learning_rate": 1.8576526641845703e-05,
"lookahead_loss": 6.307325168609619,
"loss": 0.6268,
"step": 329500
},
{
"base_loss": 0.5966172614097596,
"epoch": 5.033378601074219,
"grad_norm": 0.0011488485615700483,
"learning_rate": 1.8528842926025393e-05,
"lookahead_loss": 6.344880403518677,
"loss": 0.6075,
"step": 330000
},
{
"epoch": 5.033378601074219,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.322288970216014,
"eval_lookahead_perplexity": 556.8461390253646,
"eval_loss": 0.20721931755542755,
"eval_perplexity": 1.2302523581349747,
"eval_runtime": 91.3128,
"eval_samples_per_second": 54.757,
"eval_steps_per_second": 1.719,
"step": 330000
},
{
"base_loss": 0.5913965013027191,
"epoch": 5.034332275390625,
"grad_norm": 0.0010760932927951217,
"learning_rate": 1.848115921020508e-05,
"lookahead_loss": 6.418337035179138,
"loss": 0.6054,
"step": 330500
},
{
"base_loss": 0.5880823624134064,
"epoch": 5.035285949707031,
"grad_norm": 0.0011206314666196704,
"learning_rate": 1.8433475494384766e-05,
"lookahead_loss": 6.257230742454529,
"loss": 0.6021,
"step": 331000
},
{
"base_loss": 0.6115914248228073,
"epoch": 5.0362396240234375,
"grad_norm": 0.0011233886471018195,
"learning_rate": 1.8385791778564453e-05,
"lookahead_loss": 6.371477304935455,
"loss": 0.6215,
"step": 331500
},
{
"base_loss": 0.6013447298407555,
"epoch": 5.037193298339844,
"grad_norm": 0.001142382388934493,
"learning_rate": 1.833810806274414e-05,
"lookahead_loss": 6.323221492767334,
"loss": 0.6115,
"step": 332000
},
{
"base_loss": 0.5888082583546639,
"epoch": 5.03814697265625,
"grad_norm": 0.0010905246017500758,
"learning_rate": 1.829042434692383e-05,
"lookahead_loss": 6.323706493854523,
"loss": 0.6011,
"step": 332500
},
{
"base_loss": 0.5994124051332473,
"epoch": 5.039100646972656,
"grad_norm": 0.0011082935379818082,
"learning_rate": 1.8242740631103517e-05,
"lookahead_loss": 6.297938917160034,
"loss": 0.6109,
"step": 333000
},
{
"base_loss": 0.6140200459361076,
"epoch": 5.0400543212890625,
"grad_norm": 0.0011177349369972944,
"learning_rate": 1.8195056915283204e-05,
"lookahead_loss": 6.32886435174942,
"loss": 0.6233,
"step": 333500
},
{
"base_loss": 0.6008699499964714,
"epoch": 5.041007995605469,
"grad_norm": 0.0011120132403448224,
"learning_rate": 1.814737319946289e-05,
"lookahead_loss": 6.274889605998993,
"loss": 0.6121,
"step": 334000
},
{
"base_loss": 0.576877062678337,
"epoch": 5.041961669921875,
"grad_norm": 0.001127906609326601,
"learning_rate": 1.8099689483642578e-05,
"lookahead_loss": 6.331388182640076,
"loss": 0.5947,
"step": 334500
},
{
"base_loss": 0.6040148069858551,
"epoch": 5.042915344238281,
"grad_norm": 0.0010664901928976178,
"learning_rate": 1.8052005767822268e-05,
"lookahead_loss": 6.35973362827301,
"loss": 0.6143,
"step": 335000
},
{
"epoch": 5.042915344238281,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.3188232392929615,
"eval_lookahead_perplexity": 554.9196005023798,
"eval_loss": 0.20721259713172913,
"eval_perplexity": 1.2302440903456535,
"eval_runtime": 91.4885,
"eval_samples_per_second": 54.652,
"eval_steps_per_second": 1.716,
"step": 335000
},
{
"base_loss": 0.6138452437520027,
"epoch": 5.0438690185546875,
"grad_norm": 0.0011214982951059937,
"learning_rate": 1.8004322052001955e-05,
"lookahead_loss": 6.382272615909576,
"loss": 0.6257,
"step": 335500
},
{
"base_loss": 0.5906766360402107,
"epoch": 5.044822692871094,
"grad_norm": 0.00112924596760422,
"learning_rate": 1.795663833618164e-05,
"lookahead_loss": 6.318922083854675,
"loss": 0.6028,
"step": 336000
},
{
"base_loss": 0.5900976023674012,
"epoch": 5.0457763671875,
"grad_norm": 0.0010961294174194336,
"learning_rate": 1.7908954620361328e-05,
"lookahead_loss": 6.324168976783753,
"loss": 0.6009,
"step": 336500
},
{
"base_loss": 0.6200715956091881,
"epoch": 5.046730041503906,
"grad_norm": 0.0010944355744868517,
"learning_rate": 1.7861270904541015e-05,
"lookahead_loss": 6.278039996147156,
"loss": 0.6312,
"step": 337000
},
{
"base_loss": 0.6041342921853066,
"epoch": 5.0476837158203125,
"grad_norm": 0.0011535694357007742,
"learning_rate": 1.7813587188720705e-05,
"lookahead_loss": 6.33254317522049,
"loss": 0.6221,
"step": 337500
},
{
"base_loss": 0.5885805570483208,
"epoch": 5.048637390136719,
"grad_norm": 0.0010927009861916304,
"learning_rate": 1.7765903472900392e-05,
"lookahead_loss": 6.311563837051391,
"loss": 0.6008,
"step": 338000
},
{
"base_loss": 0.5990964626669883,
"epoch": 5.049591064453125,
"grad_norm": 0.001139697851613164,
"learning_rate": 1.771821975708008e-05,
"lookahead_loss": 6.277810357570648,
"loss": 0.6072,
"step": 338500
},
{
"base_loss": 0.6159806163311005,
"epoch": 5.050544738769531,
"grad_norm": 0.0010485260281711817,
"learning_rate": 1.7670536041259766e-05,
"lookahead_loss": 6.383065185070038,
"loss": 0.6273,
"step": 339000
},
{
"base_loss": 0.6002469740509987,
"epoch": 5.0514984130859375,
"grad_norm": 0.0011513761710375547,
"learning_rate": 1.7622852325439453e-05,
"lookahead_loss": 6.340414535522461,
"loss": 0.6113,
"step": 339500
},
{
"base_loss": 0.5939902824163437,
"epoch": 5.052452087402344,
"grad_norm": 0.0011352116707712412,
"learning_rate": 1.7575168609619143e-05,
"lookahead_loss": 6.261112779617309,
"loss": 0.6055,
"step": 340000
},
{
"epoch": 5.052452087402344,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.316136239054866,
"eval_lookahead_perplexity": 553.4305328621764,
"eval_loss": 0.20720693469047546,
"eval_perplexity": 1.230237124180487,
"eval_runtime": 90.8802,
"eval_samples_per_second": 55.017,
"eval_steps_per_second": 1.728,
"step": 340000
},
{
"base_loss": 0.6115701840519905,
"epoch": 5.05340576171875,
"grad_norm": 0.0011188328498974442,
"learning_rate": 1.752748489379883e-05,
"lookahead_loss": 6.279942854881287,
"loss": 0.6192,
"step": 340500
},
{
"base_loss": 0.6067719753980637,
"epoch": 5.054359436035156,
"grad_norm": 0.0010936488397419453,
"learning_rate": 1.7479801177978516e-05,
"lookahead_loss": 6.320557628631592,
"loss": 0.6227,
"step": 341000
},
{
"base_loss": 0.594871651172638,
"epoch": 5.0553131103515625,
"grad_norm": 0.0011088603641837835,
"learning_rate": 1.7432117462158203e-05,
"lookahead_loss": 6.32833381986618,
"loss": 0.6045,
"step": 341500
},
{
"base_loss": 0.5912774945497513,
"epoch": 5.056266784667969,
"grad_norm": 0.0010413776617497206,
"learning_rate": 1.738443374633789e-05,
"lookahead_loss": 6.348275181770325,
"loss": 0.6037,
"step": 342000
},
{
"base_loss": 0.6153999509811401,
"epoch": 5.057220458984375,
"grad_norm": 0.0010915326420217752,
"learning_rate": 1.733675003051758e-05,
"lookahead_loss": 6.3401120119094845,
"loss": 0.6279,
"step": 342500
},
{
"base_loss": 0.6071836153268814,
"epoch": 5.058174133300781,
"grad_norm": 0.001087325974367559,
"learning_rate": 1.7289066314697267e-05,
"lookahead_loss": 6.34826420211792,
"loss": 0.6148,
"step": 343000
},
{
"base_loss": 0.5827829704880715,
"epoch": 5.0591278076171875,
"grad_norm": 0.001092458376660943,
"learning_rate": 1.7241382598876954e-05,
"lookahead_loss": 6.233663388252259,
"loss": 0.5998,
"step": 343500
},
{
"base_loss": 0.5933170615434646,
"epoch": 5.060081481933594,
"grad_norm": 0.0011025206185877323,
"learning_rate": 1.719369888305664e-05,
"lookahead_loss": 6.321156049728393,
"loss": 0.6051,
"step": 344000
},
{
"base_loss": 0.6054021108746529,
"epoch": 5.06103515625,
"grad_norm": 0.0011285829823464155,
"learning_rate": 1.7146015167236328e-05,
"lookahead_loss": 6.259889480590821,
"loss": 0.6174,
"step": 344500
},
{
"base_loss": 0.6035048805475235,
"epoch": 5.061988830566406,
"grad_norm": 0.0011320598423480988,
"learning_rate": 1.7098331451416018e-05,
"lookahead_loss": 6.293309184074402,
"loss": 0.6154,
"step": 345000
},
{
"epoch": 5.061988830566406,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.312484579726149,
"eval_lookahead_perplexity": 551.4132784985202,
"eval_loss": 0.20720021426677704,
"eval_perplexity": 1.2302288564935442,
"eval_runtime": 92.0757,
"eval_samples_per_second": 54.303,
"eval_steps_per_second": 1.705,
"step": 345000
},
{
"base_loss": 0.594410364151001,
"epoch": 5.0629425048828125,
"grad_norm": 0.0011401425581425428,
"learning_rate": 1.7050647735595705e-05,
"lookahead_loss": 6.305532325267792,
"loss": 0.6062,
"step": 345500
},
{
"base_loss": 0.6103508388400077,
"epoch": 5.063896179199219,
"grad_norm": 0.0010705353925004601,
"learning_rate": 1.700296401977539e-05,
"lookahead_loss": 6.314088912010193,
"loss": 0.6222,
"step": 346000
},
{
"base_loss": 0.606827544927597,
"epoch": 5.064849853515625,
"grad_norm": 0.0010943631641566753,
"learning_rate": 1.6955280303955078e-05,
"lookahead_loss": 6.355545690059662,
"loss": 0.6154,
"step": 346500
},
{
"base_loss": 0.6025100236535073,
"epoch": 5.065803527832031,
"grad_norm": 0.0011316005839034915,
"learning_rate": 1.6907596588134765e-05,
"lookahead_loss": 6.270945949554443,
"loss": 0.6118,
"step": 347000
},
{
"base_loss": 0.6017893969416618,
"epoch": 5.0667572021484375,
"grad_norm": 0.0011118698166683316,
"learning_rate": 1.6859912872314455e-05,
"lookahead_loss": 6.260629506111145,
"loss": 0.61,
"step": 347500
},
{
"base_loss": 0.6166568307876586,
"epoch": 5.067710876464844,
"grad_norm": 0.0011616123374551535,
"learning_rate": 1.6812229156494142e-05,
"lookahead_loss": 6.3263319902420045,
"loss": 0.6271,
"step": 348000
},
{
"base_loss": 0.5990218588709831,
"epoch": 5.06866455078125,
"grad_norm": 0.0010879425099119544,
"learning_rate": 1.676454544067383e-05,
"lookahead_loss": 6.2897031512260435,
"loss": 0.6077,
"step": 348500
},
{
"base_loss": 0.5863162516951561,
"epoch": 5.069618225097656,
"grad_norm": 0.0011173348175361753,
"learning_rate": 1.6716861724853516e-05,
"lookahead_loss": 6.335295492649078,
"loss": 0.6,
"step": 349000
},
{
"base_loss": 0.6140699430108071,
"epoch": 5.0705718994140625,
"grad_norm": 0.0011209336807951331,
"learning_rate": 1.6669178009033203e-05,
"lookahead_loss": 6.232721103191376,
"loss": 0.6271,
"step": 349500
},
{
"base_loss": 0.6004664080142975,
"epoch": 5.071525573730469,
"grad_norm": 0.0011067437008023262,
"learning_rate": 1.6621494293212893e-05,
"lookahead_loss": 6.271895239830017,
"loss": 0.6104,
"step": 350000
},
{
"epoch": 5.071525573730469,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.309436458368271,
"eval_lookahead_perplexity": 549.7350629090662,
"eval_loss": 0.20719435811042786,
"eval_perplexity": 1.2302216521021103,
"eval_runtime": 90.8003,
"eval_samples_per_second": 55.066,
"eval_steps_per_second": 1.729,
"step": 350000
},
{
"base_loss": 0.5908818225860596,
"epoch": 5.072479248046875,
"grad_norm": 0.0011330280685797334,
"learning_rate": 1.657381057739258e-05,
"lookahead_loss": 6.328438294410706,
"loss": 0.6029,
"step": 350500
},
{
"base_loss": 0.5954542902112007,
"epoch": 5.073432922363281,
"grad_norm": 0.0010515834437683225,
"learning_rate": 1.6526126861572266e-05,
"lookahead_loss": 6.284423396110535,
"loss": 0.6062,
"step": 351000
},
{
"base_loss": 0.6109695326685906,
"epoch": 5.0743865966796875,
"grad_norm": 0.0011074411449953914,
"learning_rate": 1.6478443145751953e-05,
"lookahead_loss": 6.297104268074036,
"loss": 0.6225,
"step": 351500
},
{
"base_loss": 0.5943767395019531,
"epoch": 5.075340270996094,
"grad_norm": 0.0011059824610128999,
"learning_rate": 1.643075942993164e-05,
"lookahead_loss": 6.358781456947327,
"loss": 0.6067,
"step": 352000
},
{
"base_loss": 0.5842889928817749,
"epoch": 5.0762939453125,
"grad_norm": 0.0010608715238049626,
"learning_rate": 1.638307571411133e-05,
"lookahead_loss": 6.328452944278717,
"loss": 0.5965,
"step": 352500
},
{
"base_loss": 0.6252594041824341,
"epoch": 5.077247619628906,
"grad_norm": 0.0010822336189448833,
"learning_rate": 1.6335391998291017e-05,
"lookahead_loss": 6.299285487651825,
"loss": 0.6364,
"step": 353000
},
{
"base_loss": 0.5977307210564613,
"epoch": 5.0782012939453125,
"grad_norm": 0.0011410461738705635,
"learning_rate": 1.6287708282470704e-05,
"lookahead_loss": 6.318628603935242,
"loss": 0.6125,
"step": 353500
},
{
"base_loss": 0.5993528968691826,
"epoch": 5.079154968261719,
"grad_norm": 0.0010777156567201018,
"learning_rate": 1.624002456665039e-05,
"lookahead_loss": 6.336204090118408,
"loss": 0.6074,
"step": 354000
},
{
"base_loss": 0.610366469681263,
"epoch": 5.080108642578125,
"grad_norm": 0.0010876890737563372,
"learning_rate": 1.6192340850830078e-05,
"lookahead_loss": 6.294617042541504,
"loss": 0.6205,
"step": 354500
},
{
"base_loss": 0.6119058151245117,
"epoch": 5.081062316894531,
"grad_norm": 0.0011599212884902954,
"learning_rate": 1.6144657135009768e-05,
"lookahead_loss": 6.320651604652404,
"loss": 0.6214,
"step": 355000
},
{
"epoch": 5.081062316894531,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.305697089947832,
"eval_lookahead_perplexity": 547.6832396276725,
"eval_loss": 0.20718762278556824,
"eval_perplexity": 1.2302133661875383,
"eval_runtime": 92.0429,
"eval_samples_per_second": 54.323,
"eval_steps_per_second": 1.706,
"step": 355000
},
{
"base_loss": 0.5920207235217094,
"epoch": 5.0820159912109375,
"grad_norm": 0.0010734680108726025,
"learning_rate": 1.6096973419189455e-05,
"lookahead_loss": 6.332855414390564,
"loss": 0.6048,
"step": 355500
},
{
"base_loss": 0.5911735916733741,
"epoch": 5.082969665527344,
"grad_norm": 0.0011140975402668118,
"learning_rate": 1.604928970336914e-05,
"lookahead_loss": 6.363209072589874,
"loss": 0.6043,
"step": 356000
},
{
"base_loss": 0.615800891816616,
"epoch": 5.08392333984375,
"grad_norm": 0.0010806769132614136,
"learning_rate": 1.6001605987548828e-05,
"lookahead_loss": 6.358828454494477,
"loss": 0.6262,
"step": 356500
},
{
"base_loss": 0.594821957230568,
"epoch": 5.084877014160156,
"grad_norm": 0.001102195936255157,
"learning_rate": 1.5953922271728515e-05,
"lookahead_loss": 6.303458571910858,
"loss": 0.6046,
"step": 357000
},
{
"base_loss": 0.5869647584557534,
"epoch": 5.0858306884765625,
"grad_norm": 0.0011355791939422488,
"learning_rate": 1.5906238555908205e-05,
"lookahead_loss": 6.312077233791351,
"loss": 0.5982,
"step": 357500
},
{
"base_loss": 0.5933388795256614,
"epoch": 5.086784362792969,
"grad_norm": 0.0011941350530833006,
"learning_rate": 1.5858554840087892e-05,
"lookahead_loss": 6.28103159236908,
"loss": 0.6038,
"step": 358000
},
{
"base_loss": 0.6257667993307113,
"epoch": 5.087738037109375,
"grad_norm": 0.0010375409619882703,
"learning_rate": 1.581087112426758e-05,
"lookahead_loss": 6.311352049827575,
"loss": 0.6301,
"step": 358500
},
{
"base_loss": 0.5958490890860557,
"epoch": 5.088691711425781,
"grad_norm": 0.0011225225171074271,
"learning_rate": 1.5763187408447266e-05,
"lookahead_loss": 6.311702779769897,
"loss": 0.6044,
"step": 359000
},
{
"base_loss": 0.5977150818109512,
"epoch": 5.0896453857421875,
"grad_norm": 0.0011090969201177359,
"learning_rate": 1.5715503692626953e-05,
"lookahead_loss": 6.333823210716248,
"loss": 0.6073,
"step": 359500
},
{
"base_loss": 0.5886843089461327,
"epoch": 5.090599060058594,
"grad_norm": 0.0010649607283994555,
"learning_rate": 1.5667819976806643e-05,
"lookahead_loss": 6.307887722015381,
"loss": 0.6011,
"step": 360000
},
{
"epoch": 5.090599060058594,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.30245965738266,
"eval_lookahead_perplexity": 545.9130191032252,
"eval_loss": 0.2071814388036728,
"eval_perplexity": 1.2302057585938768,
"eval_runtime": 91.4549,
"eval_samples_per_second": 54.672,
"eval_steps_per_second": 1.717,
"step": 360000
},
{
"base_loss": 0.5824657415151596,
"epoch": 5.091552734375,
"grad_norm": 0.0010494901798665524,
"learning_rate": 1.562013626098633e-05,
"lookahead_loss": 6.287062782287598,
"loss": 0.5936,
"step": 360500
},
{
"base_loss": 0.6174823232293128,
"epoch": 5.092506408691406,
"grad_norm": 0.0011219014413654804,
"learning_rate": 1.5572452545166016e-05,
"lookahead_loss": 6.3347588586807255,
"loss": 0.6246,
"step": 361000
},
{
"base_loss": 0.5997858379483223,
"epoch": 5.0934600830078125,
"grad_norm": 0.0010824851924553514,
"learning_rate": 1.5524768829345703e-05,
"lookahead_loss": 6.319412599563599,
"loss": 0.6122,
"step": 361500
},
{
"base_loss": 0.5813568589091301,
"epoch": 5.094413757324219,
"grad_norm": 0.001104526687413454,
"learning_rate": 1.547708511352539e-05,
"lookahead_loss": 6.322312892436981,
"loss": 0.5951,
"step": 362000
},
{
"base_loss": 0.5759395672678947,
"epoch": 5.095367431640625,
"grad_norm": 0.001081343274563551,
"learning_rate": 1.542940139770508e-05,
"lookahead_loss": 6.258695254802704,
"loss": 0.5915,
"step": 362500
},
{
"base_loss": 0.5969497102499008,
"epoch": 5.096321105957031,
"grad_norm": 0.0011133088264614344,
"learning_rate": 1.5381717681884767e-05,
"lookahead_loss": 6.303308537006378,
"loss": 0.61,
"step": 363000
},
{
"base_loss": 0.6147504753470421,
"epoch": 5.0972747802734375,
"grad_norm": 0.001123305642977357,
"learning_rate": 1.5334033966064454e-05,
"lookahead_loss": 6.34167172908783,
"loss": 0.6241,
"step": 363500
},
{
"base_loss": 0.5965510091781616,
"epoch": 5.098228454589844,
"grad_norm": 0.0011167360935360193,
"learning_rate": 1.528635025024414e-05,
"lookahead_loss": 6.270585143089295,
"loss": 0.6065,
"step": 364000
},
{
"base_loss": 0.5827678750753402,
"epoch": 5.09918212890625,
"grad_norm": 0.001104156021028757,
"learning_rate": 1.523866653442383e-05,
"lookahead_loss": 6.335385172843933,
"loss": 0.595,
"step": 364500
},
{
"base_loss": 0.5870707350373268,
"epoch": 5.100135803222656,
"grad_norm": 0.0010957367485389113,
"learning_rate": 1.5190982818603516e-05,
"lookahead_loss": 6.311425636291504,
"loss": 0.6025,
"step": 365000
},
{
"epoch": 5.100135803222656,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.3001686483145525,
"eval_lookahead_perplexity": 544.663759005586,
"eval_loss": 0.20717626810073853,
"eval_perplexity": 1.2301993975817966,
"eval_runtime": 90.4079,
"eval_samples_per_second": 55.305,
"eval_steps_per_second": 1.737,
"step": 365000
},
{
"base_loss": 0.6111934832334518,
"epoch": 5.1010894775390625,
"grad_norm": 0.0010889478726312518,
"learning_rate": 1.5143299102783205e-05,
"lookahead_loss": 6.365397205829621,
"loss": 0.6214,
"step": 365500
},
{
"base_loss": 0.6032542023062706,
"epoch": 5.102043151855469,
"grad_norm": 0.001146532827988267,
"learning_rate": 1.5095615386962891e-05,
"lookahead_loss": 6.334895478248596,
"loss": 0.6076,
"step": 366000
},
{
"base_loss": 0.5893560016155243,
"epoch": 5.102996826171875,
"grad_norm": 0.0010869104880839586,
"learning_rate": 1.5047931671142578e-05,
"lookahead_loss": 6.309252546310425,
"loss": 0.6029,
"step": 366500
},
{
"base_loss": 0.5904992881417275,
"epoch": 5.103950500488281,
"grad_norm": 0.001103839953429997,
"learning_rate": 1.5000247955322267e-05,
"lookahead_loss": 6.336761679649353,
"loss": 0.6019,
"step": 367000
},
{
"base_loss": 0.6113997294902801,
"epoch": 5.1049041748046875,
"grad_norm": 0.0010869849938899279,
"learning_rate": 1.4952564239501954e-05,
"lookahead_loss": 6.279005553245544,
"loss": 0.6242,
"step": 367500
},
{
"base_loss": 0.6017834762334824,
"epoch": 5.105857849121094,
"grad_norm": 0.001036735251545906,
"learning_rate": 1.4904880523681642e-05,
"lookahead_loss": 6.2851017117500305,
"loss": 0.6133,
"step": 368000
},
{
"base_loss": 0.5975290570855141,
"epoch": 5.1068115234375,
"grad_norm": 0.0010710596106946468,
"learning_rate": 1.4857196807861329e-05,
"lookahead_loss": 6.347316368103027,
"loss": 0.6067,
"step": 368500
},
{
"base_loss": 0.5801187572479248,
"epoch": 5.107765197753906,
"grad_norm": 0.001139249769039452,
"learning_rate": 1.4809513092041016e-05,
"lookahead_loss": 6.257316456317902,
"loss": 0.5924,
"step": 369000
},
{
"base_loss": 0.603221082687378,
"epoch": 5.1087188720703125,
"grad_norm": 0.0011551164789125323,
"learning_rate": 1.4761829376220704e-05,
"lookahead_loss": 6.2666192288398745,
"loss": 0.6172,
"step": 369500
},
{
"base_loss": 0.606291466653347,
"epoch": 5.109672546386719,
"grad_norm": 0.001088725752197206,
"learning_rate": 1.4714145660400391e-05,
"lookahead_loss": 6.244538120269775,
"loss": 0.616,
"step": 370000
},
{
"epoch": 5.109672546386719,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.297252341200369,
"eval_lookahead_perplexity": 543.0776661014929,
"eval_loss": 0.20717072486877441,
"eval_perplexity": 1.2301925783200742,
"eval_runtime": 92.6089,
"eval_samples_per_second": 53.99,
"eval_steps_per_second": 1.695,
"step": 370000
},
{
"base_loss": 0.5948919664621353,
"epoch": 5.110626220703125,
"grad_norm": 0.0010841034818440676,
"learning_rate": 1.466646194458008e-05,
"lookahead_loss": 6.219464591503144,
"loss": 0.606,
"step": 370500
},
{
"base_loss": 0.5846513741016388,
"epoch": 5.111579895019531,
"grad_norm": 0.0010996430646628141,
"learning_rate": 1.4618778228759766e-05,
"lookahead_loss": 6.3166262807846065,
"loss": 0.597,
"step": 371000
},
{
"base_loss": 0.6055964279770851,
"epoch": 5.1125335693359375,
"grad_norm": 0.0010927943512797356,
"learning_rate": 1.4571094512939453e-05,
"lookahead_loss": 6.335015828132629,
"loss": 0.6131,
"step": 371500
},
{
"base_loss": 0.6173858237266541,
"epoch": 5.113487243652344,
"grad_norm": 0.0011214661644771695,
"learning_rate": 1.4523410797119142e-05,
"lookahead_loss": 6.343456150531769,
"loss": 0.627,
"step": 372000
},
{
"base_loss": 0.5966166469454766,
"epoch": 5.11444091796875,
"grad_norm": 0.0010272579966112971,
"learning_rate": 1.4475727081298829e-05,
"lookahead_loss": 6.2267381420135495,
"loss": 0.606,
"step": 372500
},
{
"base_loss": 0.5850279053449631,
"epoch": 5.115394592285156,
"grad_norm": 0.001128312898799777,
"learning_rate": 1.4428043365478517e-05,
"lookahead_loss": 6.2816207437515255,
"loss": 0.5965,
"step": 373000
},
{
"base_loss": 0.6050868096351624,
"epoch": 5.1163482666015625,
"grad_norm": 0.0011569701600819826,
"learning_rate": 1.4380359649658204e-05,
"lookahead_loss": 6.286871615409851,
"loss": 0.6157,
"step": 373500
},
{
"base_loss": 0.6095665054321289,
"epoch": 5.117301940917969,
"grad_norm": 0.0011440969537943602,
"learning_rate": 1.433267593383789e-05,
"lookahead_loss": 6.284195227622986,
"loss": 0.6217,
"step": 374000
},
{
"base_loss": 0.6041606207489967,
"epoch": 5.118255615234375,
"grad_norm": 0.0011145136086270213,
"learning_rate": 1.428499221801758e-05,
"lookahead_loss": 6.316567860603333,
"loss": 0.6116,
"step": 374500
},
{
"base_loss": 0.5852125850319863,
"epoch": 5.119209289550781,
"grad_norm": 0.001106358366087079,
"learning_rate": 1.4237308502197266e-05,
"lookahead_loss": 6.311560387611389,
"loss": 0.5951,
"step": 375000
},
{
"epoch": 5.119209289550781,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.294692088239871,
"eval_lookahead_perplexity": 541.689028289642,
"eval_loss": 0.20716556906700134,
"eval_perplexity": 1.2301862357073483,
"eval_runtime": 90.9096,
"eval_samples_per_second": 55.0,
"eval_steps_per_second": 1.727,
"step": 375000
},
{
"base_loss": 0.5954181181788445,
"epoch": 6.000953674316406,
"grad_norm": 0.0011187719646841288,
"learning_rate": 1.4189624786376955e-05,
"lookahead_loss": 6.385282723426819,
"loss": 0.6012,
"step": 375500
},
{
"base_loss": 0.5865643661618233,
"epoch": 6.0019073486328125,
"grad_norm": 0.0011442394461482763,
"learning_rate": 1.4141941070556641e-05,
"lookahead_loss": 6.220819156169892,
"loss": 0.5983,
"step": 376000
},
{
"base_loss": 0.6043179650306701,
"epoch": 6.002861022949219,
"grad_norm": 0.0011097525712102652,
"learning_rate": 1.4094257354736328e-05,
"lookahead_loss": 6.230756608486176,
"loss": 0.613,
"step": 376500
},
{
"base_loss": 0.6149949839115143,
"epoch": 6.003814697265625,
"grad_norm": 0.0010844056960195303,
"learning_rate": 1.4046573638916017e-05,
"lookahead_loss": 6.254981481552124,
"loss": 0.6229,
"step": 377000
},
{
"base_loss": 0.603040216743946,
"epoch": 6.004768371582031,
"grad_norm": 0.001105528324842453,
"learning_rate": 1.3998889923095704e-05,
"lookahead_loss": 6.231416835784912,
"loss": 0.6098,
"step": 377500
},
{
"base_loss": 0.590958813726902,
"epoch": 6.0057220458984375,
"grad_norm": 0.0010404631029814482,
"learning_rate": 1.3951206207275392e-05,
"lookahead_loss": 6.355237211227417,
"loss": 0.5985,
"step": 378000
},
{
"base_loss": 0.5807944664359093,
"epoch": 6.006675720214844,
"grad_norm": 0.0010901595233008265,
"learning_rate": 1.3903522491455079e-05,
"lookahead_loss": 6.2100413432121275,
"loss": 0.5965,
"step": 378500
},
{
"base_loss": 0.6047119013071061,
"epoch": 6.00762939453125,
"grad_norm": 0.0011048950254917145,
"learning_rate": 1.3855838775634766e-05,
"lookahead_loss": 6.267502905368805,
"loss": 0.6165,
"step": 379000
},
{
"base_loss": 0.6036030429005623,
"epoch": 6.008583068847656,
"grad_norm": 0.0010420246981084347,
"learning_rate": 1.3808155059814454e-05,
"lookahead_loss": 6.271774408817291,
"loss": 0.6077,
"step": 379500
},
{
"base_loss": 0.5903495861887932,
"epoch": 6.0095367431640625,
"grad_norm": 0.001150952186435461,
"learning_rate": 1.3760471343994141e-05,
"lookahead_loss": 6.283748052120209,
"loss": 0.6068,
"step": 380000
},
{
"epoch": 6.0095367431640625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.292171517880961,
"eval_lookahead_perplexity": 540.3253822860049,
"eval_loss": 0.20716047286987305,
"eval_perplexity": 1.2301799664517612,
"eval_runtime": 91.422,
"eval_samples_per_second": 54.691,
"eval_steps_per_second": 1.717,
"step": 380000
},
{
"base_loss": 0.592279341518879,
"epoch": 6.010490417480469,
"grad_norm": 0.0010976491030305624,
"learning_rate": 1.371278762817383e-05,
"lookahead_loss": 6.236666479110718,
"loss": 0.6037,
"step": 380500
},
{
"base_loss": 0.5882293724417686,
"epoch": 6.011444091796875,
"grad_norm": 0.0011407588608562946,
"learning_rate": 1.3665103912353516e-05,
"lookahead_loss": 6.257264825820923,
"loss": 0.6041,
"step": 381000
},
{
"base_loss": 0.6102479678988457,
"epoch": 6.012397766113281,
"grad_norm": 0.0010918615153059363,
"learning_rate": 1.3617420196533203e-05,
"lookahead_loss": 6.255623918533325,
"loss": 0.6214,
"step": 381500
},
{
"base_loss": 0.5958946568965912,
"epoch": 6.0133514404296875,
"grad_norm": 0.001107473624870181,
"learning_rate": 1.3569736480712892e-05,
"lookahead_loss": 6.330819105148316,
"loss": 0.6088,
"step": 382000
},
{
"base_loss": 0.5949127861857414,
"epoch": 6.014305114746094,
"grad_norm": 0.0011172577505931258,
"learning_rate": 1.3522052764892579e-05,
"lookahead_loss": 6.318566505432129,
"loss": 0.607,
"step": 382500
},
{
"base_loss": 0.5792632920742035,
"epoch": 6.0152587890625,
"grad_norm": 0.0011169774224981666,
"learning_rate": 1.3474369049072265e-05,
"lookahead_loss": 6.247372644424439,
"loss": 0.5868,
"step": 383000
},
{
"base_loss": 0.6084413604736328,
"epoch": 6.016212463378906,
"grad_norm": 0.0011083075078204274,
"learning_rate": 1.3426685333251954e-05,
"lookahead_loss": 6.301636509895324,
"loss": 0.6167,
"step": 383500
},
{
"base_loss": 0.6049468902349472,
"epoch": 6.0171661376953125,
"grad_norm": 0.001102580688893795,
"learning_rate": 1.337900161743164e-05,
"lookahead_loss": 6.3453411102294925,
"loss": 0.6096,
"step": 384000
},
{
"base_loss": 0.5921321566104889,
"epoch": 6.018119812011719,
"grad_norm": 0.0010313682723790407,
"learning_rate": 1.333131790161133e-05,
"lookahead_loss": 6.338734080314636,
"loss": 0.6005,
"step": 384500
},
{
"base_loss": 0.5876540603637695,
"epoch": 6.019073486328125,
"grad_norm": 0.001066872850060463,
"learning_rate": 1.3283634185791016e-05,
"lookahead_loss": 6.364590894699097,
"loss": 0.5974,
"step": 385000
},
{
"epoch": 6.019073486328125,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.2893890938438926,
"eval_lookahead_perplexity": 538.8240575840532,
"eval_loss": 0.20715488493442535,
"eval_perplexity": 1.2301730923047258,
"eval_runtime": 92.0267,
"eval_samples_per_second": 54.332,
"eval_steps_per_second": 1.706,
"step": 385000
},
{
"base_loss": 0.5890076372027397,
"epoch": 6.020027160644531,
"grad_norm": 0.0011358478805050254,
"learning_rate": 1.3235950469970703e-05,
"lookahead_loss": 6.209294054985047,
"loss": 0.603,
"step": 385500
},
{
"base_loss": 0.6188538994193077,
"epoch": 6.0209808349609375,
"grad_norm": 0.0010753298411145806,
"learning_rate": 1.3188266754150391e-05,
"lookahead_loss": 6.266224889755249,
"loss": 0.6263,
"step": 386000
},
{
"base_loss": 0.596300050675869,
"epoch": 6.021934509277344,
"grad_norm": 0.0011278757592663169,
"learning_rate": 1.3140583038330078e-05,
"lookahead_loss": 6.252052826881409,
"loss": 0.6084,
"step": 386500
},
{
"base_loss": 0.5962796038985252,
"epoch": 6.02288818359375,
"grad_norm": 0.0011039053788408637,
"learning_rate": 1.3092899322509767e-05,
"lookahead_loss": 6.271143918991089,
"loss": 0.6041,
"step": 387000
},
{
"base_loss": 0.5983630774617195,
"epoch": 6.023841857910156,
"grad_norm": 0.0011156456312164664,
"learning_rate": 1.3045215606689454e-05,
"lookahead_loss": 6.247088316917419,
"loss": 0.6052,
"step": 387500
},
{
"base_loss": 0.6114985771775245,
"epoch": 6.0247955322265625,
"grad_norm": 0.0010853740386664867,
"learning_rate": 1.299753189086914e-05,
"lookahead_loss": 6.26153412771225,
"loss": 0.6224,
"step": 388000
},
{
"base_loss": 0.6061187900304794,
"epoch": 6.025749206542969,
"grad_norm": 0.001081890077330172,
"learning_rate": 1.2949848175048829e-05,
"lookahead_loss": 6.220615880966187,
"loss": 0.6158,
"step": 388500
},
{
"base_loss": 0.5953421378135682,
"epoch": 6.026702880859375,
"grad_norm": 0.0011105469893664122,
"learning_rate": 1.2902164459228516e-05,
"lookahead_loss": 6.231943651199341,
"loss": 0.605,
"step": 389000
},
{
"base_loss": 0.5930399923324585,
"epoch": 6.027656555175781,
"grad_norm": 0.0010852537816390395,
"learning_rate": 1.2854480743408204e-05,
"lookahead_loss": 6.355642718315124,
"loss": 0.6037,
"step": 389500
},
{
"base_loss": 0.6187563810944557,
"epoch": 6.0286102294921875,
"grad_norm": 0.0010983969550579786,
"learning_rate": 1.2806797027587891e-05,
"lookahead_loss": 6.366688157558441,
"loss": 0.6267,
"step": 390000
},
{
"epoch": 6.0286102294921875,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.286968604444315,
"eval_lookahead_perplexity": 537.5214168145098,
"eval_loss": 0.2071501761674881,
"eval_perplexity": 1.2301672997199797,
"eval_runtime": 91.251,
"eval_samples_per_second": 54.794,
"eval_steps_per_second": 1.721,
"step": 390000
},
{
"base_loss": 0.599323958337307,
"epoch": 6.029563903808594,
"grad_norm": 0.001101642264984548,
"learning_rate": 1.2759113311767578e-05,
"lookahead_loss": 6.299301620960236,
"loss": 0.6094,
"step": 390500
},
{
"base_loss": 0.593437955737114,
"epoch": 6.030517578125,
"grad_norm": 0.0010927050607278943,
"learning_rate": 1.2711429595947266e-05,
"lookahead_loss": 6.304755561351776,
"loss": 0.6065,
"step": 391000
},
{
"base_loss": 0.5899879291653634,
"epoch": 6.031471252441406,
"grad_norm": 0.0011103905271738768,
"learning_rate": 1.2663745880126953e-05,
"lookahead_loss": 6.316039553165436,
"loss": 0.6029,
"step": 391500
},
{
"base_loss": 0.609737318277359,
"epoch": 6.0324249267578125,
"grad_norm": 0.0011069606989622116,
"learning_rate": 1.2616062164306642e-05,
"lookahead_loss": 6.261803119182587,
"loss": 0.626,
"step": 392000
},
{
"base_loss": 0.5963039031624794,
"epoch": 6.033378601074219,
"grad_norm": 0.0011312129208818078,
"learning_rate": 1.2568378448486329e-05,
"lookahead_loss": 6.314544658184052,
"loss": 0.6073,
"step": 392500
},
{
"base_loss": 0.5939543527364731,
"epoch": 6.034332275390625,
"grad_norm": 0.001062846858985722,
"learning_rate": 1.2520694732666015e-05,
"lookahead_loss": 6.379460736274719,
"loss": 0.607,
"step": 393000
},
{
"base_loss": 0.5901711618304253,
"epoch": 6.035285949707031,
"grad_norm": 0.0011185839539393783,
"learning_rate": 1.2473011016845704e-05,
"lookahead_loss": 6.221417744636535,
"loss": 0.6018,
"step": 393500
},
{
"base_loss": 0.6095206972360611,
"epoch": 6.0362396240234375,
"grad_norm": 0.0011291452683508396,
"learning_rate": 1.242532730102539e-05,
"lookahead_loss": 6.328900773048401,
"loss": 0.6202,
"step": 394000
},
{
"base_loss": 0.6031560533642769,
"epoch": 6.037193298339844,
"grad_norm": 0.0011474916245788336,
"learning_rate": 1.237764358520508e-05,
"lookahead_loss": 6.284544788837433,
"loss": 0.6122,
"step": 394500
},
{
"base_loss": 0.589473883986473,
"epoch": 6.03814697265625,
"grad_norm": 0.0010779218282550573,
"learning_rate": 1.2329959869384766e-05,
"lookahead_loss": 6.292745516777039,
"loss": 0.5994,
"step": 395000
},
{
"epoch": 6.03814697265625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.28468038479741,
"eval_lookahead_perplexity": 536.2928558925989,
"eval_loss": 0.20714624226093292,
"eval_perplexity": 1.2301624603662942,
"eval_runtime": 92.8052,
"eval_samples_per_second": 53.876,
"eval_steps_per_second": 1.692,
"step": 395000
},
{
"base_loss": 0.6025843568444252,
"epoch": 6.039100646972656,
"grad_norm": 0.001086984178982675,
"learning_rate": 1.2282276153564453e-05,
"lookahead_loss": 6.262492390632629,
"loss": 0.6127,
"step": 395500
},
{
"base_loss": 0.6174288346767426,
"epoch": 6.0400543212890625,
"grad_norm": 0.0010939656058326364,
"learning_rate": 1.2234592437744141e-05,
"lookahead_loss": 6.2870379695892336,
"loss": 0.6249,
"step": 396000
},
{
"base_loss": 0.6014795810580253,
"epoch": 6.041007995605469,
"grad_norm": 0.0011165516916662455,
"learning_rate": 1.2186908721923828e-05,
"lookahead_loss": 6.232427957057953,
"loss": 0.6128,
"step": 396500
},
{
"base_loss": 0.5805132260918617,
"epoch": 6.041961669921875,
"grad_norm": 0.001125194481573999,
"learning_rate": 1.2139225006103517e-05,
"lookahead_loss": 6.307816028594971,
"loss": 0.5953,
"step": 397000
},
{
"base_loss": 0.6034971331357956,
"epoch": 6.042915344238281,
"grad_norm": 0.0010759946890175343,
"learning_rate": 1.2091541290283204e-05,
"lookahead_loss": 6.327818301200867,
"loss": 0.6143,
"step": 397500
},
{
"base_loss": 0.6110702828168869,
"epoch": 6.0438690185546875,
"grad_norm": 0.0011335865128785372,
"learning_rate": 1.204385757446289e-05,
"lookahead_loss": 6.344313054084778,
"loss": 0.6233,
"step": 398000
},
{
"base_loss": 0.5920091760754586,
"epoch": 6.044822692871094,
"grad_norm": 0.0011095399968326092,
"learning_rate": 1.1996173858642579e-05,
"lookahead_loss": 6.279779013633728,
"loss": 0.6051,
"step": 398500
},
{
"base_loss": 0.5922821204066276,
"epoch": 6.0457763671875,
"grad_norm": 0.001092213555239141,
"learning_rate": 1.1948490142822266e-05,
"lookahead_loss": 6.276170256614685,
"loss": 0.6023,
"step": 399000
},
{
"base_loss": 0.6183968783020973,
"epoch": 6.046730041503906,
"grad_norm": 0.0011093751527369022,
"learning_rate": 1.1900806427001954e-05,
"lookahead_loss": 6.241053509235382,
"loss": 0.6271,
"step": 399500
},
{
"base_loss": 0.6054804750084877,
"epoch": 6.0476837158203125,
"grad_norm": 0.001156223937869072,
"learning_rate": 1.1853122711181641e-05,
"lookahead_loss": 6.292171361923217,
"loss": 0.6225,
"step": 400000
},
{
"epoch": 6.0476837158203125,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.282726495791548,
"eval_lookahead_perplexity": 535.2460222090004,
"eval_loss": 0.20714208483695984,
"eval_perplexity": 1.2301573460700217,
"eval_runtime": 91.3116,
"eval_samples_per_second": 54.758,
"eval_steps_per_second": 1.719,
"step": 400000
},
{
"base_loss": 0.5904572269320488,
"epoch": 6.048637390136719,
"grad_norm": 0.0010891122510656714,
"learning_rate": 1.1805438995361328e-05,
"lookahead_loss": 6.270732181072235,
"loss": 0.6001,
"step": 400500
},
{
"base_loss": 0.5977823759317398,
"epoch": 6.049591064453125,
"grad_norm": 0.0011509527685120702,
"learning_rate": 1.1757755279541016e-05,
"lookahead_loss": 6.241637663841248,
"loss": 0.6059,
"step": 401000
},
{
"base_loss": 0.6174590476155281,
"epoch": 6.050544738769531,
"grad_norm": 0.0010234012734144926,
"learning_rate": 1.1710071563720703e-05,
"lookahead_loss": 6.341926455974579,
"loss": 0.6282,
"step": 401500
},
{
"base_loss": 0.5955156463384629,
"epoch": 6.0514984130859375,
"grad_norm": 0.001154066063463688,
"learning_rate": 1.1662387847900392e-05,
"lookahead_loss": 6.2955641860961915,
"loss": 0.608,
"step": 402000
},
{
"base_loss": 0.5936241209506988,
"epoch": 6.052452087402344,
"grad_norm": 0.0011350855929777026,
"learning_rate": 1.1614704132080079e-05,
"lookahead_loss": 6.227645565032959,
"loss": 0.6042,
"step": 402500
},
{
"base_loss": 0.6117795875668526,
"epoch": 6.05340576171875,
"grad_norm": 0.001109893317334354,
"learning_rate": 1.1567020416259765e-05,
"lookahead_loss": 6.238243167877197,
"loss": 0.6205,
"step": 403000
},
{
"base_loss": 0.6093643299937248,
"epoch": 6.054359436035156,
"grad_norm": 0.0011104453587904572,
"learning_rate": 1.1519336700439454e-05,
"lookahead_loss": 6.285483991622924,
"loss": 0.6236,
"step": 403500
},
{
"base_loss": 0.5967902150750161,
"epoch": 6.0553131103515625,
"grad_norm": 0.0011117176618427038,
"learning_rate": 1.147165298461914e-05,
"lookahead_loss": 6.290799595832825,
"loss": 0.6048,
"step": 404000
},
{
"base_loss": 0.5867149458527565,
"epoch": 6.056266784667969,
"grad_norm": 0.001066114753484726,
"learning_rate": 1.142396926879883e-05,
"lookahead_loss": 6.314110722541809,
"loss": 0.6003,
"step": 404500
},
{
"base_loss": 0.6157781246900559,
"epoch": 6.057220458984375,
"grad_norm": 0.001078008092008531,
"learning_rate": 1.1376285552978516e-05,
"lookahead_loss": 6.294362932682037,
"loss": 0.629,
"step": 405000
},
{
"epoch": 6.057220458984375,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.280795160573892,
"eval_lookahead_perplexity": 534.2132803226017,
"eval_loss": 0.20713835954666138,
"eval_perplexity": 1.2301527633853309,
"eval_runtime": 91.5706,
"eval_samples_per_second": 54.603,
"eval_steps_per_second": 1.715,
"step": 405000
},
{
"base_loss": 0.6060819316506386,
"epoch": 6.058174133300781,
"grad_norm": 0.0011201421730220318,
"learning_rate": 1.1328601837158203e-05,
"lookahead_loss": 6.31790843296051,
"loss": 0.6154,
"step": 405500
},
{
"base_loss": 0.5837663099765777,
"epoch": 6.0591278076171875,
"grad_norm": 0.0011035632342100143,
"learning_rate": 1.1280918121337891e-05,
"lookahead_loss": 6.2060595207214355,
"loss": 0.599,
"step": 406000
},
{
"base_loss": 0.5909119842648506,
"epoch": 6.060081481933594,
"grad_norm": 0.0010986344423145056,
"learning_rate": 1.1233234405517578e-05,
"lookahead_loss": 6.279904296875,
"loss": 0.6045,
"step": 406500
},
{
"base_loss": 0.609372730076313,
"epoch": 6.06103515625,
"grad_norm": 0.001120261033065617,
"learning_rate": 1.1185550689697267e-05,
"lookahead_loss": 6.231126732826233,
"loss": 0.6181,
"step": 407000
},
{
"base_loss": 0.6013825216889381,
"epoch": 6.061988830566406,
"grad_norm": 0.0011455032508820295,
"learning_rate": 1.1137866973876954e-05,
"lookahead_loss": 6.261857038497925,
"loss": 0.6139,
"step": 407500
},
{
"base_loss": 0.59294895529747,
"epoch": 6.0629425048828125,
"grad_norm": 0.0011340271448716521,
"learning_rate": 1.109018325805664e-05,
"lookahead_loss": 6.269025160789489,
"loss": 0.6062,
"step": 408000
},
{
"base_loss": 0.6099907007813453,
"epoch": 6.063896179199219,
"grad_norm": 0.0010581036331132054,
"learning_rate": 1.1042499542236329e-05,
"lookahead_loss": 6.264707807064056,
"loss": 0.6212,
"step": 408500
},
{
"base_loss": 0.6085181525945663,
"epoch": 6.064849853515625,
"grad_norm": 0.0011085295118391514,
"learning_rate": 1.0994815826416016e-05,
"lookahead_loss": 6.328743109703064,
"loss": 0.6151,
"step": 409000
},
{
"base_loss": 0.6055323982238769,
"epoch": 6.065803527832031,
"grad_norm": 0.0011180423898622394,
"learning_rate": 1.0947132110595704e-05,
"lookahead_loss": 6.239625741958618,
"loss": 0.6142,
"step": 409500
},
{
"base_loss": 0.6037949919104576,
"epoch": 6.0667572021484375,
"grad_norm": 0.0011036383220925927,
"learning_rate": 1.0899448394775391e-05,
"lookahead_loss": 6.2200113606452945,
"loss": 0.6107,
"step": 410000
},
{
"epoch": 6.0667572021484375,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.27883860554558,
"eval_lookahead_perplexity": 533.1690844892074,
"eval_loss": 0.20713473856449127,
"eval_perplexity": 1.2301483090321728,
"eval_runtime": 92.0938,
"eval_samples_per_second": 54.292,
"eval_steps_per_second": 1.705,
"step": 410000
},
{
"base_loss": 0.6215471062660217,
"epoch": 6.067710876464844,
"grad_norm": 0.0011463616974651814,
"learning_rate": 1.0851764678955078e-05,
"lookahead_loss": 6.295929139137268,
"loss": 0.6294,
"step": 410500
},
{
"base_loss": 0.6006580455303192,
"epoch": 6.06866455078125,
"grad_norm": 0.0010866274824365973,
"learning_rate": 1.0804080963134766e-05,
"lookahead_loss": 6.255359673500061,
"loss": 0.6088,
"step": 411000
},
{
"base_loss": 0.5863450763821602,
"epoch": 6.069618225097656,
"grad_norm": 0.0010793661931529641,
"learning_rate": 1.0756397247314453e-05,
"lookahead_loss": 6.290106748104096,
"loss": 0.5996,
"step": 411500
},
{
"base_loss": 0.6130303152799607,
"epoch": 6.0705718994140625,
"grad_norm": 0.0011292450362816453,
"learning_rate": 1.0708713531494142e-05,
"lookahead_loss": 6.193022495269775,
"loss": 0.6263,
"step": 412000
},
{
"base_loss": 0.5976909038424492,
"epoch": 6.071525573730469,
"grad_norm": 0.0011028555454686284,
"learning_rate": 1.0661029815673829e-05,
"lookahead_loss": 6.238008366107941,
"loss": 0.6101,
"step": 412500
},
{
"base_loss": 0.5941678040623665,
"epoch": 6.072479248046875,
"grad_norm": 0.0011350243585184216,
"learning_rate": 1.0613346099853515e-05,
"lookahead_loss": 6.284453297615051,
"loss": 0.6041,
"step": 413000
},
{
"base_loss": 0.59308890157938,
"epoch": 6.073432922363281,
"grad_norm": 0.0010425560176372528,
"learning_rate": 1.0565662384033204e-05,
"lookahead_loss": 6.2601313934326175,
"loss": 0.6043,
"step": 413500
},
{
"base_loss": 0.6148577529788017,
"epoch": 6.0743865966796875,
"grad_norm": 0.0010980789083987474,
"learning_rate": 1.051797866821289e-05,
"lookahead_loss": 6.254723328113556,
"loss": 0.6223,
"step": 414000
},
{
"base_loss": 0.5937941102385521,
"epoch": 6.075340270996094,
"grad_norm": 0.0011371398577466607,
"learning_rate": 1.047029495239258e-05,
"lookahead_loss": 6.319699608802796,
"loss": 0.6057,
"step": 414500
},
{
"base_loss": 0.5855166874527932,
"epoch": 6.0762939453125,
"grad_norm": 0.0010461227502673864,
"learning_rate": 1.0422611236572266e-05,
"lookahead_loss": 6.3037777528762815,
"loss": 0.5961,
"step": 415000
},
{
"epoch": 6.0762939453125,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.2767525427638535,
"eval_lookahead_perplexity": 532.0580195841749,
"eval_loss": 0.20713090896606445,
"eval_perplexity": 1.2301435980671642,
"eval_runtime": 90.955,
"eval_samples_per_second": 54.972,
"eval_steps_per_second": 1.726,
"step": 415000
},
{
"base_loss": 0.6238721495270729,
"epoch": 6.077247619628906,
"grad_norm": 0.0010918197222054005,
"learning_rate": 1.0374927520751953e-05,
"lookahead_loss": 6.2675533857345584,
"loss": 0.6338,
"step": 415500
},
{
"base_loss": 0.5962715279459954,
"epoch": 6.0782012939453125,
"grad_norm": 0.0011410253355279565,
"learning_rate": 1.0327243804931641e-05,
"lookahead_loss": 6.27958661365509,
"loss": 0.6104,
"step": 416000
},
{
"base_loss": 0.5984803900718689,
"epoch": 6.079154968261719,
"grad_norm": 0.0010592457838356495,
"learning_rate": 1.0279560089111328e-05,
"lookahead_loss": 6.307873971939087,
"loss": 0.6081,
"step": 416500
},
{
"base_loss": 0.6096297157406807,
"epoch": 6.080108642578125,
"grad_norm": 0.0010972806485369802,
"learning_rate": 1.0231876373291017e-05,
"lookahead_loss": 6.251428637504578,
"loss": 0.62,
"step": 417000
},
{
"base_loss": 0.6096931555867195,
"epoch": 6.081062316894531,
"grad_norm": 0.0011736972955986857,
"learning_rate": 1.0184192657470704e-05,
"lookahead_loss": 6.28662513923645,
"loss": 0.6192,
"step": 417500
},
{
"base_loss": 0.5950792465209961,
"epoch": 6.0820159912109375,
"grad_norm": 0.0010587719734758139,
"learning_rate": 1.013650894165039e-05,
"lookahead_loss": 6.306061523914337,
"loss": 0.6073,
"step": 418000
},
{
"base_loss": 0.5926352781057358,
"epoch": 6.082969665527344,
"grad_norm": 0.0011153679806739092,
"learning_rate": 1.0088825225830079e-05,
"lookahead_loss": 6.331948488235474,
"loss": 0.6044,
"step": 418500
},
{
"base_loss": 0.618686983525753,
"epoch": 6.08392333984375,
"grad_norm": 0.0010920428903773427,
"learning_rate": 1.0041141510009766e-05,
"lookahead_loss": 6.321983070850372,
"loss": 0.6292,
"step": 419000
},
{
"base_loss": 0.5942374100089073,
"epoch": 6.084877014160156,
"grad_norm": 0.001100387773476541,
"learning_rate": 9.993457794189454e-06,
"lookahead_loss": 6.271707439422608,
"loss": 0.6034,
"step": 419500
},
{
"base_loss": 0.5854629936814308,
"epoch": 6.0858306884765625,
"grad_norm": 0.0011402657255530357,
"learning_rate": 9.945774078369141e-06,
"lookahead_loss": 6.2728576302528385,
"loss": 0.5962,
"step": 420000
},
{
"epoch": 6.0858306884765625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.27458217349677,
"eval_lookahead_perplexity": 530.9045094344544,
"eval_loss": 0.20712696015834808,
"eval_perplexity": 1.2301387404762227,
"eval_runtime": 90.6741,
"eval_samples_per_second": 55.143,
"eval_steps_per_second": 1.731,
"step": 420000
},
{
"base_loss": 0.5934821357131004,
"epoch": 6.086784362792969,
"grad_norm": 0.0011842896929010749,
"learning_rate": 9.898090362548828e-06,
"lookahead_loss": 6.252230400085449,
"loss": 0.6039,
"step": 420500
},
{
"base_loss": 0.6230015980005265,
"epoch": 6.087738037109375,
"grad_norm": 0.001042664865963161,
"learning_rate": 9.850406646728516e-06,
"lookahead_loss": 6.282988451957703,
"loss": 0.6287,
"step": 421000
},
{
"base_loss": 0.5975295419692993,
"epoch": 6.088691711425781,
"grad_norm": 0.0011371064465492964,
"learning_rate": 9.802722930908203e-06,
"lookahead_loss": 6.272774960517883,
"loss": 0.6057,
"step": 421500
},
{
"base_loss": 0.5983778918385506,
"epoch": 6.0896453857421875,
"grad_norm": 0.0011219978332519531,
"learning_rate": 9.755039215087892e-06,
"lookahead_loss": 6.301621433734894,
"loss": 0.6089,
"step": 422000
},
{
"base_loss": 0.5868593170046806,
"epoch": 6.090599060058594,
"grad_norm": 0.0010826161596924067,
"learning_rate": 9.707355499267579e-06,
"lookahead_loss": 6.280946511268616,
"loss": 0.5982,
"step": 422500
},
{
"base_loss": 0.5818374307155609,
"epoch": 6.091552734375,
"grad_norm": 0.0010603091213852167,
"learning_rate": 9.659671783447265e-06,
"lookahead_loss": 6.2535722813606265,
"loss": 0.5917,
"step": 423000
},
{
"base_loss": 0.616866985142231,
"epoch": 6.092506408691406,
"grad_norm": 0.0011452294420450926,
"learning_rate": 9.611988067626954e-06,
"lookahead_loss": 6.298460816383362,
"loss": 0.6261,
"step": 423500
},
{
"base_loss": 0.5975575439333916,
"epoch": 6.0934600830078125,
"grad_norm": 0.0010709463385865092,
"learning_rate": 9.56430435180664e-06,
"lookahead_loss": 6.2952479648590085,
"loss": 0.6119,
"step": 424000
},
{
"base_loss": 0.584998004078865,
"epoch": 6.094413757324219,
"grad_norm": 0.001079953508451581,
"learning_rate": 9.51662063598633e-06,
"lookahead_loss": 6.284721467971802,
"loss": 0.5966,
"step": 424500
},
{
"base_loss": 0.5778142619729042,
"epoch": 6.095367431640625,
"grad_norm": 0.001095326617360115,
"learning_rate": 9.468936920166016e-06,
"lookahead_loss": 6.222258620738983,
"loss": 0.5928,
"step": 425000
},
{
"epoch": 6.095367431640625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.273199597867533,
"eval_lookahead_perplexity": 530.171000980557,
"eval_loss": 0.20712387561798096,
"eval_perplexity": 1.2301349460694726,
"eval_runtime": 91.6966,
"eval_samples_per_second": 54.528,
"eval_steps_per_second": 1.712,
"step": 425000
},
{
"base_loss": 0.5984759790301323,
"epoch": 6.096321105957031,
"grad_norm": 0.001100808964110911,
"learning_rate": 9.421253204345703e-06,
"lookahead_loss": 6.273516724586487,
"loss": 0.6115,
"step": 425500
},
{
"base_loss": 0.6139922738075256,
"epoch": 6.0972747802734375,
"grad_norm": 0.0011272229021415114,
"learning_rate": 9.373569488525391e-06,
"lookahead_loss": 6.297957478523254,
"loss": 0.6236,
"step": 426000
},
{
"base_loss": 0.5972231289744377,
"epoch": 6.098228454589844,
"grad_norm": 0.0010969050927087665,
"learning_rate": 9.325885772705078e-06,
"lookahead_loss": 6.234860178947449,
"loss": 0.6072,
"step": 426500
},
{
"base_loss": 0.5825409645438194,
"epoch": 6.09918212890625,
"grad_norm": 0.0011044559068977833,
"learning_rate": 9.278202056884767e-06,
"lookahead_loss": 6.312386203289032,
"loss": 0.5948,
"step": 427000
},
{
"base_loss": 0.5877818803787231,
"epoch": 6.100135803222656,
"grad_norm": 0.0011069976026192307,
"learning_rate": 9.230518341064454e-06,
"lookahead_loss": 6.295519777297974,
"loss": 0.6034,
"step": 427500
},
{
"base_loss": 0.610653886437416,
"epoch": 6.1010894775390625,
"grad_norm": 0.0010821627220138907,
"learning_rate": 9.18283462524414e-06,
"lookahead_loss": 6.3413438749313356,
"loss": 0.62,
"step": 428000
},
{
"base_loss": 0.6026809126138687,
"epoch": 6.102043151855469,
"grad_norm": 0.0011416695779189467,
"learning_rate": 9.135150909423829e-06,
"lookahead_loss": 6.302958374023437,
"loss": 0.6081,
"step": 428500
},
{
"base_loss": 0.5897697188258171,
"epoch": 6.102996826171875,
"grad_norm": 0.0010855476139113307,
"learning_rate": 9.087467193603516e-06,
"lookahead_loss": 6.285157390594483,
"loss": 0.6021,
"step": 429000
},
{
"base_loss": 0.590453925728798,
"epoch": 6.103950500488281,
"grad_norm": 0.0011181783629581332,
"learning_rate": 9.039783477783204e-06,
"lookahead_loss": 6.307080610275269,
"loss": 0.6019,
"step": 429500
},
{
"base_loss": 0.6123360496759415,
"epoch": 6.1049041748046875,
"grad_norm": 0.0010880377376452088,
"learning_rate": 8.992099761962891e-06,
"lookahead_loss": 6.254195990562439,
"loss": 0.6228,
"step": 430000
},
{
"epoch": 6.1049041748046875,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.271564553720882,
"eval_lookahead_perplexity": 529.3048562740563,
"eval_loss": 0.20712073147296906,
"eval_perplexity": 1.2301310783528983,
"eval_runtime": 92.2103,
"eval_samples_per_second": 54.224,
"eval_steps_per_second": 1.703,
"step": 430000
},
{
"base_loss": 0.6012233046293258,
"epoch": 6.105857849121094,
"grad_norm": 0.0010412432020530105,
"learning_rate": 8.944416046142578e-06,
"lookahead_loss": 6.260348888397217,
"loss": 0.6131,
"step": 430500
},
{
"base_loss": 0.5989438276290894,
"epoch": 6.1068115234375,
"grad_norm": 0.0010780078591778874,
"learning_rate": 8.896732330322266e-06,
"lookahead_loss": 6.3281553306579585,
"loss": 0.6071,
"step": 431000
},
{
"base_loss": 0.5841665432453156,
"epoch": 6.107765197753906,
"grad_norm": 0.0011316650779917836,
"learning_rate": 8.849048614501953e-06,
"lookahead_loss": 6.234313168525696,
"loss": 0.5947,
"step": 431500
},
{
"base_loss": 0.6041393259763718,
"epoch": 6.1087188720703125,
"grad_norm": 0.0011483209673315287,
"learning_rate": 8.801364898681642e-06,
"lookahead_loss": 6.231174618244171,
"loss": 0.6173,
"step": 432000
},
{
"base_loss": 0.6072890778183937,
"epoch": 6.109672546386719,
"grad_norm": 0.0010701629798859358,
"learning_rate": 8.753681182861329e-06,
"lookahead_loss": 6.205759740829468,
"loss": 0.617,
"step": 432500
},
{
"base_loss": 0.5975223676562309,
"epoch": 6.110626220703125,
"grad_norm": 0.0010894860606640577,
"learning_rate": 8.705997467041015e-06,
"lookahead_loss": 6.183219263553619,
"loss": 0.6065,
"step": 433000
},
{
"base_loss": 0.5839032330513001,
"epoch": 6.111579895019531,
"grad_norm": 0.0010885415831580758,
"learning_rate": 8.658313751220704e-06,
"lookahead_loss": 6.2854044160842895,
"loss": 0.5963,
"step": 433500
},
{
"base_loss": 0.6033086371421814,
"epoch": 6.1125335693359375,
"grad_norm": 0.0010762620950117707,
"learning_rate": 8.61063003540039e-06,
"lookahead_loss": 6.313331780433654,
"loss": 0.613,
"step": 434000
},
{
"base_loss": 0.6134573189020157,
"epoch": 6.113487243652344,
"grad_norm": 0.0010942388325929642,
"learning_rate": 8.56294631958008e-06,
"lookahead_loss": 6.318107944011688,
"loss": 0.6263,
"step": 434500
},
{
"base_loss": 0.5973056275844574,
"epoch": 6.11444091796875,
"grad_norm": 0.001045436249114573,
"learning_rate": 8.515262603759766e-06,
"lookahead_loss": 6.20644309425354,
"loss": 0.6064,
"step": 435000
},
{
"epoch": 6.11444091796875,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.270106329323766,
"eval_lookahead_perplexity": 528.5335735074714,
"eval_loss": 0.20711791515350342,
"eval_perplexity": 1.2301276139156756,
"eval_runtime": 91.8053,
"eval_samples_per_second": 54.463,
"eval_steps_per_second": 1.71,
"step": 435000
},
{
"base_loss": 0.5865064730644226,
"epoch": 6.115394592285156,
"grad_norm": 0.0011204156326130033,
"learning_rate": 8.467578887939453e-06,
"lookahead_loss": 6.250656041145325,
"loss": 0.5967,
"step": 435500
},
{
"base_loss": 0.6034195944666862,
"epoch": 6.1163482666015625,
"grad_norm": 0.0011835613986477256,
"learning_rate": 8.419895172119141e-06,
"lookahead_loss": 6.255055441856384,
"loss": 0.6142,
"step": 436000
},
{
"base_loss": 0.6118170965909958,
"epoch": 6.117301940917969,
"grad_norm": 0.0011420606169849634,
"learning_rate": 8.372211456298828e-06,
"lookahead_loss": 6.259607755661011,
"loss": 0.6237,
"step": 436500
},
{
"base_loss": 0.6024285949468613,
"epoch": 6.118255615234375,
"grad_norm": 0.0011443018447607756,
"learning_rate": 8.324527740478517e-06,
"lookahead_loss": 6.2809450225830075,
"loss": 0.6109,
"step": 437000
},
{
"base_loss": 0.5845644298195839,
"epoch": 6.119209289550781,
"grad_norm": 0.0011274107964709401,
"learning_rate": 8.276844024658204e-06,
"lookahead_loss": 6.284584188461304,
"loss": 0.5948,
"step": 437500
},
{
"base_loss": 0.5961613509654998,
"epoch": 7.000953674316406,
"grad_norm": 0.0011183428578078747,
"learning_rate": 8.22916030883789e-06,
"lookahead_loss": 6.349602223396301,
"loss": 0.6029,
"step": 438000
},
{
"base_loss": 0.5877805910706521,
"epoch": 7.0019073486328125,
"grad_norm": 0.001147622475400567,
"learning_rate": 8.181476593017579e-06,
"lookahead_loss": 6.187368534088135,
"loss": 0.5973,
"step": 438500
},
{
"base_loss": 0.6050056391954421,
"epoch": 7.002861022949219,
"grad_norm": 0.0011125532910227776,
"learning_rate": 8.133792877197266e-06,
"lookahead_loss": 6.200378650665283,
"loss": 0.6131,
"step": 439000
},
{
"base_loss": 0.6121045120954514,
"epoch": 7.003814697265625,
"grad_norm": 0.0010899071348831058,
"learning_rate": 8.086109161376954e-06,
"lookahead_loss": 6.22591156578064,
"loss": 0.6207,
"step": 439500
},
{
"base_loss": 0.6019601293206215,
"epoch": 7.004768371582031,
"grad_norm": 0.0011169801000505686,
"learning_rate": 8.038425445556641e-06,
"lookahead_loss": 6.211684448242187,
"loss": 0.608,
"step": 440000
},
{
"epoch": 7.004768371582031,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.268705815933764,
"eval_lookahead_perplexity": 527.7938732616643,
"eval_loss": 0.207115039229393,
"eval_perplexity": 1.2301240761670988,
"eval_runtime": 91.6482,
"eval_samples_per_second": 54.556,
"eval_steps_per_second": 1.713,
"step": 440000
},
{
"base_loss": 0.5894829227328301,
"epoch": 7.0057220458984375,
"grad_norm": 0.0010402505286037922,
"learning_rate": 7.990741729736328e-06,
"lookahead_loss": 6.329894530773163,
"loss": 0.5999,
"step": 440500
},
{
"base_loss": 0.5802138668894767,
"epoch": 7.006675720214844,
"grad_norm": 0.0010724926833063364,
"learning_rate": 7.943058013916016e-06,
"lookahead_loss": 6.185600045204162,
"loss": 0.5955,
"step": 441000
},
{
"base_loss": 0.6045641638636589,
"epoch": 7.00762939453125,
"grad_norm": 0.001113320467993617,
"learning_rate": 7.895374298095703e-06,
"lookahead_loss": 6.25577535200119,
"loss": 0.6166,
"step": 441500
},
{
"base_loss": 0.6033532832860946,
"epoch": 7.008583068847656,
"grad_norm": 0.001021145610138774,
"learning_rate": 7.847690582275392e-06,
"lookahead_loss": 6.258115340709686,
"loss": 0.6071,
"step": 442000
},
{
"base_loss": 0.5892392939925194,
"epoch": 7.0095367431640625,
"grad_norm": 0.0011431181337684393,
"learning_rate": 7.800006866455079e-06,
"lookahead_loss": 6.264180626392364,
"loss": 0.6052,
"step": 442500
},
{
"base_loss": 0.5968242118954659,
"epoch": 7.010490417480469,
"grad_norm": 0.0011058381060138345,
"learning_rate": 7.752323150634765e-06,
"lookahead_loss": 6.2021826705932614,
"loss": 0.6043,
"step": 443000
},
{
"base_loss": 0.590798145532608,
"epoch": 7.011444091796875,
"grad_norm": 0.001150521100498736,
"learning_rate": 7.704639434814454e-06,
"lookahead_loss": 6.237220158100128,
"loss": 0.6048,
"step": 443500
},
{
"base_loss": 0.6112803395986557,
"epoch": 7.012397766113281,
"grad_norm": 0.0011072250781580806,
"learning_rate": 7.65695571899414e-06,
"lookahead_loss": 6.236739232063293,
"loss": 0.6218,
"step": 444000
},
{
"base_loss": 0.5957660912275314,
"epoch": 7.0133514404296875,
"grad_norm": 0.0011012445902451873,
"learning_rate": 7.6092720031738284e-06,
"lookahead_loss": 6.307263396263123,
"loss": 0.6076,
"step": 444500
},
{
"base_loss": 0.5957861280441284,
"epoch": 7.014305114746094,
"grad_norm": 0.0011027586879208684,
"learning_rate": 7.561588287353516e-06,
"lookahead_loss": 6.285342982292176,
"loss": 0.6074,
"step": 445000
},
{
"epoch": 7.014305114746094,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.267345444462932,
"eval_lookahead_perplexity": 527.0763656830716,
"eval_loss": 0.20711229741573334,
"eval_perplexity": 1.2301207034007275,
"eval_runtime": 90.8882,
"eval_samples_per_second": 55.013,
"eval_steps_per_second": 1.727,
"step": 445000
},
{
"base_loss": 0.5817868651151658,
"epoch": 7.0152587890625,
"grad_norm": 0.0011148322373628616,
"learning_rate": 7.513904571533204e-06,
"lookahead_loss": 6.224323974609375,
"loss": 0.5886,
"step": 445500
},
{
"base_loss": 0.6079041356444359,
"epoch": 7.016212463378906,
"grad_norm": 0.001130820601247251,
"learning_rate": 7.466220855712891e-06,
"lookahead_loss": 6.284985821247101,
"loss": 0.6164,
"step": 446000
},
{
"base_loss": 0.6032249782681465,
"epoch": 7.0171661376953125,
"grad_norm": 0.0010859910398721695,
"learning_rate": 7.418537139892578e-06,
"lookahead_loss": 6.324811381340027,
"loss": 0.6089,
"step": 446500
},
{
"base_loss": 0.5946653738021851,
"epoch": 7.018119812011719,
"grad_norm": 0.0010381847387179732,
"learning_rate": 7.370853424072266e-06,
"lookahead_loss": 6.318586661338806,
"loss": 0.6029,
"step": 447000
},
{
"base_loss": 0.5883135892748833,
"epoch": 7.019073486328125,
"grad_norm": 0.0010651465272530913,
"learning_rate": 7.323169708251954e-06,
"lookahead_loss": 6.3373530521392825,
"loss": 0.5982,
"step": 447500
},
{
"base_loss": 0.5866989207267761,
"epoch": 7.020027160644531,
"grad_norm": 0.0011416026391088963,
"learning_rate": 7.275485992431641e-06,
"lookahead_loss": 6.17793447971344,
"loss": 0.6012,
"step": 448000
},
{
"base_loss": 0.6181823741197586,
"epoch": 7.0209808349609375,
"grad_norm": 0.0010932744480669498,
"learning_rate": 7.227802276611328e-06,
"lookahead_loss": 6.233171957969666,
"loss": 0.6252,
"step": 448500
},
{
"base_loss": 0.5960042692422867,
"epoch": 7.021934509277344,
"grad_norm": 0.0011309271212667227,
"learning_rate": 7.180118560791016e-06,
"lookahead_loss": 6.224097855091095,
"loss": 0.6085,
"step": 449000
},
{
"base_loss": 0.5992451857328415,
"epoch": 7.02288818359375,
"grad_norm": 0.0011079860851168633,
"learning_rate": 7.1324348449707034e-06,
"lookahead_loss": 6.251955393314361,
"loss": 0.6062,
"step": 449500
},
{
"base_loss": 0.5992803901433945,
"epoch": 7.023841857910156,
"grad_norm": 0.001132696750573814,
"learning_rate": 7.084751129150391e-06,
"lookahead_loss": 6.2213596034049985,
"loss": 0.6062,
"step": 450000
},
{
"epoch": 7.023841857910156,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.265722547476284,
"eval_lookahead_perplexity": 526.221668767587,
"eval_loss": 0.2071092277765274,
"eval_perplexity": 1.2301169273797838,
"eval_runtime": 91.2802,
"eval_samples_per_second": 54.776,
"eval_steps_per_second": 1.72,
"step": 450000
},
{
"base_loss": 0.6113395862579346,
"epoch": 7.0247955322265625,
"grad_norm": 0.0010815602727234364,
"learning_rate": 7.037067413330079e-06,
"lookahead_loss": 6.2310498752594,
"loss": 0.6221,
"step": 450500
},
{
"base_loss": 0.6042459404468536,
"epoch": 7.025749206542969,
"grad_norm": 0.0010827596997842193,
"learning_rate": 6.989383697509766e-06,
"lookahead_loss": 6.193919623374939,
"loss": 0.6151,
"step": 451000
},
{
"base_loss": 0.5966729502677918,
"epoch": 7.026702880859375,
"grad_norm": 0.0011122706346213818,
"learning_rate": 6.941699981689453e-06,
"lookahead_loss": 6.21730579662323,
"loss": 0.6058,
"step": 451500
},
{
"base_loss": 0.5946527794599533,
"epoch": 7.027656555175781,
"grad_norm": 0.0010856961598619819,
"learning_rate": 6.894016265869141e-06,
"lookahead_loss": 6.3204999446868895,
"loss": 0.6041,
"step": 452000
},
{
"base_loss": 0.6178021001815795,
"epoch": 7.0286102294921875,
"grad_norm": 0.0010914442827925086,
"learning_rate": 6.846332550048829e-06,
"lookahead_loss": 6.350885845184326,
"loss": 0.6271,
"step": 452500
},
{
"base_loss": 0.5997619133591652,
"epoch": 7.029563903808594,
"grad_norm": 0.0010976734338328242,
"learning_rate": 6.798648834228516e-06,
"lookahead_loss": 6.279042994499206,
"loss": 0.6093,
"step": 453000
},
{
"base_loss": 0.5922289202213288,
"epoch": 7.030517578125,
"grad_norm": 0.0010889185359701514,
"learning_rate": 6.750965118408203e-06,
"lookahead_loss": 6.284307628631592,
"loss": 0.6053,
"step": 453500
},
{
"base_loss": 0.5905209797620773,
"epoch": 7.031471252441406,
"grad_norm": 0.0011072177439928055,
"learning_rate": 6.703281402587891e-06,
"lookahead_loss": 6.292623271942139,
"loss": 0.6026,
"step": 454000
},
{
"base_loss": 0.6133147512674332,
"epoch": 7.0324249267578125,
"grad_norm": 0.0011112524662166834,
"learning_rate": 6.6555976867675784e-06,
"lookahead_loss": 6.244432949066162,
"loss": 0.6276,
"step": 454500
},
{
"base_loss": 0.5965310020446777,
"epoch": 7.033378601074219,
"grad_norm": 0.001126542454585433,
"learning_rate": 6.607913970947266e-06,
"lookahead_loss": 6.288736204147339,
"loss": 0.6076,
"step": 455000
},
{
"epoch": 7.033378601074219,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.264621932666523,
"eval_lookahead_perplexity": 525.6428200089147,
"eval_loss": 0.20710715651512146,
"eval_perplexity": 1.2301143794887062,
"eval_runtime": 90.1897,
"eval_samples_per_second": 55.439,
"eval_steps_per_second": 1.741,
"step": 455000
},
{
"base_loss": 0.5923399785757065,
"epoch": 7.034332275390625,
"grad_norm": 0.0010525787947699428,
"learning_rate": 6.560230255126954e-06,
"lookahead_loss": 6.364464108467102,
"loss": 0.6063,
"step": 455500
},
{
"base_loss": 0.5895907972455025,
"epoch": 7.035285949707031,
"grad_norm": 0.0011163650779053569,
"learning_rate": 6.512546539306641e-06,
"lookahead_loss": 6.196651001453399,
"loss": 0.6011,
"step": 456000
},
{
"base_loss": 0.6101583961248398,
"epoch": 7.0362396240234375,
"grad_norm": 0.0011191520607098937,
"learning_rate": 6.464862823486328e-06,
"lookahead_loss": 6.324227838516236,
"loss": 0.6218,
"step": 456500
},
{
"base_loss": 0.5974237969517708,
"epoch": 7.037193298339844,
"grad_norm": 0.0011339603224769235,
"learning_rate": 6.417179107666016e-06,
"lookahead_loss": 6.257159686088562,
"loss": 0.6098,
"step": 457000
},
{
"base_loss": 0.5909910210371018,
"epoch": 7.03814697265625,
"grad_norm": 0.001096972613595426,
"learning_rate": 6.369495391845704e-06,
"lookahead_loss": 6.265636465072632,
"loss": 0.6028,
"step": 457500
},
{
"base_loss": 0.5998276071548462,
"epoch": 7.039100646972656,
"grad_norm": 0.0010910106357187033,
"learning_rate": 6.321811676025391e-06,
"lookahead_loss": 6.244960191726684,
"loss": 0.6124,
"step": 458000
},
{
"base_loss": 0.6157890763282776,
"epoch": 7.0400543212890625,
"grad_norm": 0.001121240551583469,
"learning_rate": 6.274127960205078e-06,
"lookahead_loss": 6.279780546188355,
"loss": 0.6236,
"step": 458500
},
{
"base_loss": 0.600258769273758,
"epoch": 7.041007995605469,
"grad_norm": 0.0011285766959190369,
"learning_rate": 6.226444244384766e-06,
"lookahead_loss": 6.209328644752502,
"loss": 0.6114,
"step": 459000
},
{
"base_loss": 0.5789770235419274,
"epoch": 7.041961669921875,
"grad_norm": 0.0011222581379115582,
"learning_rate": 6.1787605285644534e-06,
"lookahead_loss": 6.2839519019126895,
"loss": 0.5952,
"step": 459500
},
{
"base_loss": 0.603790655374527,
"epoch": 7.042915344238281,
"grad_norm": 0.0010850606486201286,
"learning_rate": 6.131076812744141e-06,
"lookahead_loss": 6.308252753257752,
"loss": 0.6147,
"step": 460000
},
{
"epoch": 7.042915344238281,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.263513925357367,
"eval_lookahead_perplexity": 525.060726463843,
"eval_loss": 0.2071051001548767,
"eval_perplexity": 1.2301118499330004,
"eval_runtime": 92.9681,
"eval_samples_per_second": 53.782,
"eval_steps_per_second": 1.689,
"step": 460000
},
{
"base_loss": 0.6122875562906265,
"epoch": 7.0438690185546875,
"grad_norm": 0.001144933863542974,
"learning_rate": 6.083393096923829e-06,
"lookahead_loss": 6.326288844108581,
"loss": 0.6259,
"step": 460500
},
{
"base_loss": 0.5919451169967651,
"epoch": 7.044822692871094,
"grad_norm": 0.0011071843327954412,
"learning_rate": 6.035709381103516e-06,
"lookahead_loss": 6.259445383071899,
"loss": 0.6046,
"step": 461000
},
{
"base_loss": 0.5933005015850067,
"epoch": 7.0457763671875,
"grad_norm": 0.0010790773667395115,
"learning_rate": 5.988025665283203e-06,
"lookahead_loss": 6.257792898178101,
"loss": 0.6032,
"step": 461500
},
{
"base_loss": 0.619339332818985,
"epoch": 7.046730041503906,
"grad_norm": 0.0011052032932639122,
"learning_rate": 5.940341949462891e-06,
"lookahead_loss": 6.215020411014557,
"loss": 0.6282,
"step": 462000
},
{
"base_loss": 0.6016078860163688,
"epoch": 7.0476837158203125,
"grad_norm": 0.0011391551233828068,
"learning_rate": 5.892658233642579e-06,
"lookahead_loss": 6.270429663658142,
"loss": 0.6197,
"step": 462500
},
{
"base_loss": 0.5871764430999756,
"epoch": 7.048637390136719,
"grad_norm": 0.0010885728988796473,
"learning_rate": 5.844974517822266e-06,
"lookahead_loss": 6.2495416173934935,
"loss": 0.6004,
"step": 463000
},
{
"base_loss": 0.6003029895424843,
"epoch": 7.049591064453125,
"grad_norm": 0.0011504755821079016,
"learning_rate": 5.797290802001953e-06,
"lookahead_loss": 6.2231017370224,
"loss": 0.6079,
"step": 463500
},
{
"base_loss": 0.616134802877903,
"epoch": 7.050544738769531,
"grad_norm": 0.001045083161443472,
"learning_rate": 5.749607086181641e-06,
"lookahead_loss": 6.316908567428589,
"loss": 0.6265,
"step": 464000
},
{
"base_loss": 0.5965465674996376,
"epoch": 7.0514984130859375,
"grad_norm": 0.0011614857940003276,
"learning_rate": 5.7019233703613284e-06,
"lookahead_loss": 6.286425273895263,
"loss": 0.6091,
"step": 464500
},
{
"base_loss": 0.5948073084950447,
"epoch": 7.052452087402344,
"grad_norm": 0.001139484578743577,
"learning_rate": 5.654239654541016e-06,
"lookahead_loss": 6.1960596828460694,
"loss": 0.6053,
"step": 465000
},
{
"epoch": 7.052452087402344,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.262733459472656,
"eval_lookahead_perplexity": 524.6510943521738,
"eval_loss": 0.2071034163236618,
"eval_perplexity": 1.2301097786340136,
"eval_runtime": 93.0888,
"eval_samples_per_second": 53.712,
"eval_steps_per_second": 1.687,
"step": 465000
},
{
"base_loss": 0.6123905621767044,
"epoch": 7.05340576171875,
"grad_norm": 0.00109586538746953,
"learning_rate": 5.606555938720704e-06,
"lookahead_loss": 6.227264610290527,
"loss": 0.6202,
"step": 465500
},
{
"base_loss": 0.6105590149760246,
"epoch": 7.054359436035156,
"grad_norm": 0.0011044503189623356,
"learning_rate": 5.558872222900391e-06,
"lookahead_loss": 6.263851017951965,
"loss": 0.6254,
"step": 466000
},
{
"base_loss": 0.5943532618284225,
"epoch": 7.0553131103515625,
"grad_norm": 0.00111213861964643,
"learning_rate": 5.511188507080078e-06,
"lookahead_loss": 6.27136710357666,
"loss": 0.6029,
"step": 466500
},
{
"base_loss": 0.5899229286909103,
"epoch": 7.056266784667969,
"grad_norm": 0.0010533079039305449,
"learning_rate": 5.463504791259766e-06,
"lookahead_loss": 6.291095352172851,
"loss": 0.6048,
"step": 467000
},
{
"base_loss": 0.6136555113196372,
"epoch": 7.057220458984375,
"grad_norm": 0.001086205942556262,
"learning_rate": 5.415821075439454e-06,
"lookahead_loss": 6.285642471790314,
"loss": 0.6274,
"step": 467500
},
{
"base_loss": 0.60705413210392,
"epoch": 7.058174133300781,
"grad_norm": 0.0011131491046398878,
"learning_rate": 5.368137359619141e-06,
"lookahead_loss": 6.296099196434021,
"loss": 0.6149,
"step": 468000
},
{
"base_loss": 0.5850182236433029,
"epoch": 7.0591278076171875,
"grad_norm": 0.0010873244609683752,
"learning_rate": 5.320453643798828e-06,
"lookahead_loss": 6.182619555473328,
"loss": 0.6004,
"step": 468500
},
{
"base_loss": 0.5940621579289437,
"epoch": 7.060081481933594,
"grad_norm": 0.0011030533351004124,
"learning_rate": 5.272769927978516e-06,
"lookahead_loss": 6.263250873088837,
"loss": 0.6064,
"step": 469000
},
{
"base_loss": 0.6068593204021454,
"epoch": 7.06103515625,
"grad_norm": 0.0011013118783012033,
"learning_rate": 5.2250862121582034e-06,
"lookahead_loss": 6.212888800621033,
"loss": 0.6168,
"step": 469500
},
{
"base_loss": 0.6027862961888313,
"epoch": 7.061988830566406,
"grad_norm": 0.001138397492468357,
"learning_rate": 5.177402496337891e-06,
"lookahead_loss": 6.228745173454285,
"loss": 0.6144,
"step": 470000
},
{
"epoch": 7.061988830566406,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.261695091716779,
"eval_lookahead_perplexity": 524.1065963162379,
"eval_loss": 0.207101508975029,
"eval_perplexity": 1.2301074323880465,
"eval_runtime": 90.65,
"eval_samples_per_second": 55.157,
"eval_steps_per_second": 1.732,
"step": 470000
},
{
"base_loss": 0.5955105296373367,
"epoch": 7.0629425048828125,
"grad_norm": 0.001156990067102015,
"learning_rate": 5.129718780517579e-06,
"lookahead_loss": 6.243498850822449,
"loss": 0.6064,
"step": 470500
},
{
"base_loss": 0.6115499113202095,
"epoch": 7.063896179199219,
"grad_norm": 0.001073277904652059,
"learning_rate": 5.082035064697266e-06,
"lookahead_loss": 6.25689557170868,
"loss": 0.6233,
"step": 471000
},
{
"base_loss": 0.6069118053913116,
"epoch": 7.064849853515625,
"grad_norm": 0.0010952987940981984,
"learning_rate": 5.034351348876953e-06,
"lookahead_loss": 6.301583794593811,
"loss": 0.6156,
"step": 471500
},
{
"base_loss": 0.6021770805120468,
"epoch": 7.065803527832031,
"grad_norm": 0.0011316589079797268,
"learning_rate": 4.986667633056641e-06,
"lookahead_loss": 6.225786661148072,
"loss": 0.6123,
"step": 472000
},
{
"base_loss": 0.6015241233706474,
"epoch": 7.0667572021484375,
"grad_norm": 0.0011141011491417885,
"learning_rate": 4.938983917236329e-06,
"lookahead_loss": 6.202967374324799,
"loss": 0.6095,
"step": 472500
},
{
"base_loss": 0.6184404605031013,
"epoch": 7.067710876464844,
"grad_norm": 0.001143975299783051,
"learning_rate": 4.891300201416016e-06,
"lookahead_loss": 6.271997055530548,
"loss": 0.6279,
"step": 473000
},
{
"base_loss": 0.5974515009522438,
"epoch": 7.06866455078125,
"grad_norm": 0.0010884921066462994,
"learning_rate": 4.843616485595703e-06,
"lookahead_loss": 6.238460997581482,
"loss": 0.6056,
"step": 473500
},
{
"base_loss": 0.5842619987726212,
"epoch": 7.069618225097656,
"grad_norm": 0.0011051521869376302,
"learning_rate": 4.795932769775391e-06,
"lookahead_loss": 6.2725230369567875,
"loss": 0.5982,
"step": 474000
},
{
"base_loss": 0.614363546192646,
"epoch": 7.0705718994140625,
"grad_norm": 0.0011240324238315225,
"learning_rate": 4.7482490539550784e-06,
"lookahead_loss": 6.1859285850524905,
"loss": 0.6267,
"step": 474500
},
{
"base_loss": 0.6006701437830925,
"epoch": 7.071525573730469,
"grad_norm": 0.0011095181107521057,
"learning_rate": 4.700565338134766e-06,
"lookahead_loss": 6.229223669528961,
"loss": 0.6112,
"step": 475000
},
{
"epoch": 7.071525573730469,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.260892772065184,
"eval_lookahead_perplexity": 523.6862639374949,
"eval_loss": 0.20709997415542603,
"eval_perplexity": 1.2301055443964946,
"eval_runtime": 91.3111,
"eval_samples_per_second": 54.758,
"eval_steps_per_second": 1.719,
"step": 475000
},
{
"base_loss": 0.5963374812602997,
"epoch": 7.072479248046875,
"grad_norm": 0.0011368774576112628,
"learning_rate": 4.652881622314453e-06,
"lookahead_loss": 6.267845227241516,
"loss": 0.6058,
"step": 475500
},
{
"base_loss": 0.5906391541361808,
"epoch": 7.073432922363281,
"grad_norm": 0.0010553834727033973,
"learning_rate": 4.605197906494141e-06,
"lookahead_loss": 6.2372974953651426,
"loss": 0.6034,
"step": 476000
},
{
"base_loss": 0.6126260715126991,
"epoch": 7.0743865966796875,
"grad_norm": 0.0010900960769504309,
"learning_rate": 4.557514190673828e-06,
"lookahead_loss": 6.238807513236999,
"loss": 0.6218,
"step": 476500
},
{
"base_loss": 0.5941922485232353,
"epoch": 7.075340270996094,
"grad_norm": 0.0010984891559928656,
"learning_rate": 4.509830474853516e-06,
"lookahead_loss": 6.299616351604461,
"loss": 0.6054,
"step": 477000
},
{
"base_loss": 0.5850859879851341,
"epoch": 7.0762939453125,
"grad_norm": 0.0010677935788407922,
"learning_rate": 4.462146759033204e-06,
"lookahead_loss": 6.270443281173706,
"loss": 0.5961,
"step": 477500
},
{
"base_loss": 0.6249460031986237,
"epoch": 7.077247619628906,
"grad_norm": 0.0010833673877641559,
"learning_rate": 4.4144630432128904e-06,
"lookahead_loss": 6.2501684432029725,
"loss": 0.6356,
"step": 478000
},
{
"base_loss": 0.5957035277485847,
"epoch": 7.0782012939453125,
"grad_norm": 0.001110053970478475,
"learning_rate": 4.366779327392578e-06,
"lookahead_loss": 6.2742936916351315,
"loss": 0.6089,
"step": 478500
},
{
"base_loss": 0.5999418792724609,
"epoch": 7.079154968261719,
"grad_norm": 0.0010629543103277683,
"learning_rate": 4.319095611572266e-06,
"lookahead_loss": 6.2945415420532225,
"loss": 0.6071,
"step": 479000
},
{
"base_loss": 0.6110728977918625,
"epoch": 7.080108642578125,
"grad_norm": 0.0010882082860916853,
"learning_rate": 4.2714118957519534e-06,
"lookahead_loss": 6.2418430824279785,
"loss": 0.6197,
"step": 479500
},
{
"base_loss": 0.6104652171134949,
"epoch": 7.081062316894531,
"grad_norm": 0.0011661059688776731,
"learning_rate": 4.223728179931641e-06,
"lookahead_loss": 6.270271697044373,
"loss": 0.6218,
"step": 480000
},
{
"epoch": 7.081062316894531,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.260015819781124,
"eval_lookahead_perplexity": 523.2272173825853,
"eval_loss": 0.20709839463233948,
"eval_perplexity": 1.2301036014179227,
"eval_runtime": 92.6569,
"eval_samples_per_second": 53.963,
"eval_steps_per_second": 1.694,
"step": 480000
},
{
"base_loss": 0.5925396049022674,
"epoch": 7.0820159912109375,
"grad_norm": 0.0010843832278624177,
"learning_rate": 4.176044464111328e-06,
"lookahead_loss": 6.290804083824158,
"loss": 0.6057,
"step": 480500
},
{
"base_loss": 0.5912622154951096,
"epoch": 7.082969665527344,
"grad_norm": 0.0011063800193369389,
"learning_rate": 4.128360748291016e-06,
"lookahead_loss": 6.319176843643189,
"loss": 0.6034,
"step": 481000
},
{
"base_loss": 0.6196577532887458,
"epoch": 7.08392333984375,
"grad_norm": 0.0010821149917319417,
"learning_rate": 4.080677032470703e-06,
"lookahead_loss": 6.315744980335236,
"loss": 0.6285,
"step": 481500
},
{
"base_loss": 0.5937823454141616,
"epoch": 7.084877014160156,
"grad_norm": 0.0010981445666402578,
"learning_rate": 4.032993316650391e-06,
"lookahead_loss": 6.247111065387726,
"loss": 0.603,
"step": 482000
},
{
"base_loss": 0.5897138588428498,
"epoch": 7.0858306884765625,
"grad_norm": 0.0011262124171480536,
"learning_rate": 3.985309600830079e-06,
"lookahead_loss": 6.25641952419281,
"loss": 0.5988,
"step": 482500
},
{
"base_loss": 0.5952436604499817,
"epoch": 7.086784362792969,
"grad_norm": 0.0011732213897630572,
"learning_rate": 3.9376258850097654e-06,
"lookahead_loss": 6.227437392234802,
"loss": 0.606,
"step": 483000
},
{
"base_loss": 0.6236630493402481,
"epoch": 7.087738037109375,
"grad_norm": 0.0010312370723113418,
"learning_rate": 3.889942169189453e-06,
"lookahead_loss": 6.267721421718598,
"loss": 0.6287,
"step": 483500
},
{
"base_loss": 0.5960297654867173,
"epoch": 7.088691711425781,
"grad_norm": 0.0011183172464370728,
"learning_rate": 3.842258453369141e-06,
"lookahead_loss": 6.262453424453735,
"loss": 0.6056,
"step": 484000
},
{
"base_loss": 0.5999740233421326,
"epoch": 7.0896453857421875,
"grad_norm": 0.0011287264060229063,
"learning_rate": 3.7945747375488284e-06,
"lookahead_loss": 6.287140043258667,
"loss": 0.6091,
"step": 484500
},
{
"base_loss": 0.5876833364963532,
"epoch": 7.090599060058594,
"grad_norm": 0.0010590523015707731,
"learning_rate": 3.7468910217285157e-06,
"lookahead_loss": 6.26017622089386,
"loss": 0.5981,
"step": 485000
},
{
"epoch": 7.090599060058594,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.259280581062975,
"eval_lookahead_perplexity": 522.842661861354,
"eval_loss": 0.20709699392318726,
"eval_perplexity": 1.2301018784017568,
"eval_runtime": 89.5464,
"eval_samples_per_second": 55.837,
"eval_steps_per_second": 1.753,
"step": 485000
},
{
"base_loss": 0.5815649164319039,
"epoch": 7.091552734375,
"grad_norm": 0.001043815049342811,
"learning_rate": 3.6992073059082034e-06,
"lookahead_loss": 6.240018193244934,
"loss": 0.5916,
"step": 485500
},
{
"base_loss": 0.616571400463581,
"epoch": 7.092506408691406,
"grad_norm": 0.0011240957537665963,
"learning_rate": 3.6515235900878906e-06,
"lookahead_loss": 6.282034734249115,
"loss": 0.6274,
"step": 486000
},
{
"base_loss": 0.5999781568050384,
"epoch": 7.0934600830078125,
"grad_norm": 0.0010617803782224655,
"learning_rate": 3.6038398742675783e-06,
"lookahead_loss": 6.2716433649063115,
"loss": 0.6123,
"step": 486500
},
{
"base_loss": 0.5853218165636063,
"epoch": 7.094413757324219,
"grad_norm": 0.0010862386552616954,
"learning_rate": 3.556156158447266e-06,
"lookahead_loss": 6.264246835708618,
"loss": 0.5973,
"step": 487000
},
{
"base_loss": 0.5790126396417618,
"epoch": 7.095367431640625,
"grad_norm": 0.0010896100429818034,
"learning_rate": 3.508472442626953e-06,
"lookahead_loss": 6.2038795657157895,
"loss": 0.5941,
"step": 487500
},
{
"base_loss": 0.5979302336573601,
"epoch": 7.096321105957031,
"grad_norm": 0.001098281005397439,
"learning_rate": 3.460788726806641e-06,
"lookahead_loss": 6.263394642829895,
"loss": 0.6109,
"step": 488000
},
{
"base_loss": 0.6135610321164131,
"epoch": 7.0972747802734375,
"grad_norm": 0.0011270649265497923,
"learning_rate": 3.413105010986328e-06,
"lookahead_loss": 6.286933131217957,
"loss": 0.6234,
"step": 488500
},
{
"base_loss": 0.5961914101839065,
"epoch": 7.098228454589844,
"grad_norm": 0.0011138232657685876,
"learning_rate": 3.3654212951660158e-06,
"lookahead_loss": 6.229583405971527,
"loss": 0.6052,
"step": 489000
},
{
"base_loss": 0.5830495541095734,
"epoch": 7.09918212890625,
"grad_norm": 0.0010958199854940176,
"learning_rate": 3.3177375793457034e-06,
"lookahead_loss": 6.296017809391022,
"loss": 0.5957,
"step": 489500
},
{
"base_loss": 0.587930432677269,
"epoch": 7.100135803222656,
"grad_norm": 0.0011185267940163612,
"learning_rate": 3.2700538635253907e-06,
"lookahead_loss": 6.264325401306152,
"loss": 0.6033,
"step": 490000
},
{
"epoch": 7.100135803222656,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.25881780069857,
"eval_lookahead_perplexity": 522.6007565226091,
"eval_loss": 0.2070958912372589,
"eval_perplexity": 1.230100521986473,
"eval_runtime": 91.4681,
"eval_samples_per_second": 54.664,
"eval_steps_per_second": 1.716,
"step": 490000
},
{
"base_loss": 0.6116759033799172,
"epoch": 7.1010894775390625,
"grad_norm": 0.0010914826998487115,
"learning_rate": 3.2223701477050784e-06,
"lookahead_loss": 6.335911834716797,
"loss": 0.6208,
"step": 490500
},
{
"base_loss": 0.5991794173121452,
"epoch": 7.102043151855469,
"grad_norm": 0.0011365750106051564,
"learning_rate": 3.1746864318847656e-06,
"lookahead_loss": 6.299183880805969,
"loss": 0.6062,
"step": 491000
},
{
"base_loss": 0.5893770458698273,
"epoch": 7.102996826171875,
"grad_norm": 0.0010888108517974615,
"learning_rate": 3.1270027160644533e-06,
"lookahead_loss": 6.271809469223022,
"loss": 0.6033,
"step": 491500
},
{
"base_loss": 0.5916792218089104,
"epoch": 7.103950500488281,
"grad_norm": 0.0011168160708621144,
"learning_rate": 3.079319000244141e-06,
"lookahead_loss": 6.289948089599609,
"loss": 0.6027,
"step": 492000
},
{
"base_loss": 0.6132831824421883,
"epoch": 7.1049041748046875,
"grad_norm": 0.0010816961294040084,
"learning_rate": 3.031635284423828e-06,
"lookahead_loss": 6.232145028591156,
"loss": 0.6235,
"step": 492500
},
{
"base_loss": 0.6014310421943665,
"epoch": 7.105857849121094,
"grad_norm": 0.001060318318195641,
"learning_rate": 2.983951568603516e-06,
"lookahead_loss": 6.2375462627410885,
"loss": 0.6124,
"step": 493000
},
{
"base_loss": 0.5968408140540123,
"epoch": 7.1068115234375,
"grad_norm": 0.0010752023663371801,
"learning_rate": 2.936267852783203e-06,
"lookahead_loss": 6.3170910439491275,
"loss": 0.6051,
"step": 493500
},
{
"base_loss": 0.5824449016451836,
"epoch": 7.107765197753906,
"grad_norm": 0.0011570702772587538,
"learning_rate": 2.8885841369628908e-06,
"lookahead_loss": 6.208592594146729,
"loss": 0.5949,
"step": 494000
},
{
"base_loss": 0.6062813322544098,
"epoch": 7.1087188720703125,
"grad_norm": 0.0011445485288277268,
"learning_rate": 2.8409004211425784e-06,
"lookahead_loss": 6.210321761131286,
"loss": 0.619,
"step": 494500
},
{
"base_loss": 0.6082264738082885,
"epoch": 7.109672546386719,
"grad_norm": 0.0010743378661572933,
"learning_rate": 2.7932167053222657e-06,
"lookahead_loss": 6.190786142349243,
"loss": 0.6172,
"step": 495000
},
{
"epoch": 7.109672546386719,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.258252050929938,
"eval_lookahead_perplexity": 522.3051788848935,
"eval_loss": 0.20709487795829773,
"eval_perplexity": 1.2300992755521254,
"eval_runtime": 90.5758,
"eval_samples_per_second": 55.202,
"eval_steps_per_second": 1.733,
"step": 495000
},
{
"base_loss": 0.5961957424879074,
"epoch": 7.110626220703125,
"grad_norm": 0.0011050713947042823,
"learning_rate": 2.7455329895019534e-06,
"lookahead_loss": 6.164184574127197,
"loss": 0.6072,
"step": 495500
},
{
"base_loss": 0.5848405210375786,
"epoch": 7.111579895019531,
"grad_norm": 0.0010936327744275331,
"learning_rate": 2.6978492736816406e-06,
"lookahead_loss": 6.275397553443908,
"loss": 0.5972,
"step": 496000
},
{
"base_loss": 0.6031162394881249,
"epoch": 7.1125335693359375,
"grad_norm": 0.001085134455934167,
"learning_rate": 2.6501655578613283e-06,
"lookahead_loss": 6.301357226371765,
"loss": 0.6124,
"step": 496500
},
{
"base_loss": 0.6153186203241349,
"epoch": 7.113487243652344,
"grad_norm": 0.0011182770831510425,
"learning_rate": 2.602481842041016e-06,
"lookahead_loss": 6.307302838802338,
"loss": 0.6267,
"step": 497000
},
{
"base_loss": 0.5994082721471786,
"epoch": 7.11444091796875,
"grad_norm": 0.0010612837504595518,
"learning_rate": 2.554798126220703e-06,
"lookahead_loss": 6.18713902759552,
"loss": 0.6084,
"step": 497500
},
{
"base_loss": 0.5841842757463456,
"epoch": 7.115394592285156,
"grad_norm": 0.0011268676025792956,
"learning_rate": 2.507114410400391e-06,
"lookahead_loss": 6.234672443389893,
"loss": 0.5956,
"step": 498000
},
{
"base_loss": 0.6037292023897171,
"epoch": 7.1163482666015625,
"grad_norm": 0.001171495416201651,
"learning_rate": 2.459430694580078e-06,
"lookahead_loss": 6.24148467540741,
"loss": 0.6165,
"step": 498500
},
{
"base_loss": 0.607548170864582,
"epoch": 7.117301940917969,
"grad_norm": 0.0011324421502649784,
"learning_rate": 2.4117469787597658e-06,
"lookahead_loss": 6.238188538551331,
"loss": 0.6209,
"step": 499000
},
{
"base_loss": 0.6047358834147454,
"epoch": 7.118255615234375,
"grad_norm": 0.0011255667777732015,
"learning_rate": 2.3640632629394534e-06,
"lookahead_loss": 6.277647980690002,
"loss": 0.6121,
"step": 499500
},
{
"base_loss": 0.5826379895210266,
"epoch": 7.119209289550781,
"grad_norm": 0.0011003295658156276,
"learning_rate": 2.3163795471191407e-06,
"lookahead_loss": 6.27165866279602,
"loss": 0.5926,
"step": 500000
},
{
"epoch": 7.119209289550781,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.257852252679892,
"eval_lookahead_perplexity": 522.0964039250978,
"eval_loss": 0.20709407329559326,
"eval_perplexity": 1.2300982857375138,
"eval_runtime": 92.2348,
"eval_samples_per_second": 54.209,
"eval_steps_per_second": 1.702,
"step": 500000
},
{
"base_loss": 0.5965691907405853,
"epoch": 8.000953674316406,
"grad_norm": 0.0011052964255213737,
"learning_rate": 2.2686958312988284e-06,
"lookahead_loss": 6.344482093811036,
"loss": 0.6026,
"step": 500500
},
{
"base_loss": 0.5847149593234062,
"epoch": 8.001907348632812,
"grad_norm": 0.0011188529897481203,
"learning_rate": 2.2210121154785156e-06,
"lookahead_loss": 6.181741203308105,
"loss": 0.5971,
"step": 501000
},
{
"base_loss": 0.6025487731099128,
"epoch": 8.002861022949219,
"grad_norm": 0.0011255404679104686,
"learning_rate": 2.1733283996582033e-06,
"lookahead_loss": 6.201256879806518,
"loss": 0.6123,
"step": 501500
},
{
"base_loss": 0.6159883877038955,
"epoch": 8.003814697265625,
"grad_norm": 0.0010819864692166448,
"learning_rate": 2.125644683837891e-06,
"lookahead_loss": 6.209218832969666,
"loss": 0.6228,
"step": 502000
},
{
"base_loss": 0.5985354263782501,
"epoch": 8.004768371582031,
"grad_norm": 0.0011081405682489276,
"learning_rate": 2.077960968017578e-06,
"lookahead_loss": 6.19532088470459,
"loss": 0.6057,
"step": 502500
},
{
"base_loss": 0.59137887185812,
"epoch": 8.005722045898438,
"grad_norm": 0.0010434648720547557,
"learning_rate": 2.030277252197266e-06,
"lookahead_loss": 6.318549188613892,
"loss": 0.6001,
"step": 503000
},
{
"base_loss": 0.5807731298208236,
"epoch": 8.006675720214844,
"grad_norm": 0.0010843180352821946,
"learning_rate": 1.982593536376953e-06,
"lookahead_loss": 6.1740428781509396,
"loss": 0.5959,
"step": 503500
},
{
"base_loss": 0.6059517723321914,
"epoch": 8.00762939453125,
"grad_norm": 0.0011015934869647026,
"learning_rate": 1.9349098205566408e-06,
"lookahead_loss": 6.244119252204895,
"loss": 0.6161,
"step": 504000
},
{
"base_loss": 0.6048649581670761,
"epoch": 8.008583068847656,
"grad_norm": 0.0010392587864771485,
"learning_rate": 1.8872261047363282e-06,
"lookahead_loss": 6.230837811946869,
"loss": 0.6093,
"step": 504500
},
{
"base_loss": 0.593385848402977,
"epoch": 8.009536743164062,
"grad_norm": 0.001120842294767499,
"learning_rate": 1.8395423889160157e-06,
"lookahead_loss": 6.241628100395203,
"loss": 0.608,
"step": 505000
},
{
"epoch": 8.009536743164062,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.257503026590561,
"eval_lookahead_perplexity": 521.9141060731368,
"eval_loss": 0.20709335803985596,
"eval_perplexity": 1.2300974059029721,
"eval_runtime": 91.6562,
"eval_samples_per_second": 54.552,
"eval_steps_per_second": 1.713,
"step": 505000
},
{
"base_loss": 0.5941373434662819,
"epoch": 8.010490417480469,
"grad_norm": 0.0010929540731012821,
"learning_rate": 1.7918586730957031e-06,
"lookahead_loss": 6.197162329673767,
"loss": 0.6035,
"step": 505500
},
{
"base_loss": 0.5908739617466927,
"epoch": 8.011444091796875,
"grad_norm": 0.0011226508067920804,
"learning_rate": 1.7441749572753908e-06,
"lookahead_loss": 6.22726021194458,
"loss": 0.6052,
"step": 506000
},
{
"base_loss": 0.6081678086519241,
"epoch": 8.012397766113281,
"grad_norm": 0.0010954260360449553,
"learning_rate": 1.6964912414550783e-06,
"lookahead_loss": 6.215122665405273,
"loss": 0.6213,
"step": 506500
},
{
"base_loss": 0.5956925541162491,
"epoch": 8.013351440429688,
"grad_norm": 0.0011058412492275238,
"learning_rate": 1.6488075256347657e-06,
"lookahead_loss": 6.291760811805725,
"loss": 0.6083,
"step": 507000
},
{
"base_loss": 0.5932368034124375,
"epoch": 8.014305114746094,
"grad_norm": 0.0011063116835430264,
"learning_rate": 1.6011238098144532e-06,
"lookahead_loss": 6.2699484491348265,
"loss": 0.606,
"step": 507500
},
{
"base_loss": 0.5816438822746277,
"epoch": 8.0152587890625,
"grad_norm": 0.0011064352001994848,
"learning_rate": 1.5534400939941406e-06,
"lookahead_loss": 6.210830857753754,
"loss": 0.5884,
"step": 508000
},
{
"base_loss": 0.6093874707818031,
"epoch": 8.016212463378906,
"grad_norm": 0.0011251309188082814,
"learning_rate": 1.505756378173828e-06,
"lookahead_loss": 6.271678217887878,
"loss": 0.6167,
"step": 508500
},
{
"base_loss": 0.6042429065108299,
"epoch": 8.017166137695312,
"grad_norm": 0.001074893050827086,
"learning_rate": 1.4580726623535158e-06,
"lookahead_loss": 6.3098267641067505,
"loss": 0.6111,
"step": 509000
},
{
"base_loss": 0.5948937609791756,
"epoch": 8.018119812011719,
"grad_norm": 0.0010518047492951155,
"learning_rate": 1.4103889465332032e-06,
"lookahead_loss": 6.3096597938537595,
"loss": 0.6029,
"step": 509500
},
{
"base_loss": 0.5853534046411514,
"epoch": 8.019073486328125,
"grad_norm": 0.0010772800305858254,
"learning_rate": 1.3627052307128907e-06,
"lookahead_loss": 6.328934656620025,
"loss": 0.5975,
"step": 510000
},
{
"epoch": 8.019073486328125,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.257185920739707,
"eval_lookahead_perplexity": 521.7486302945136,
"eval_loss": 0.20709271728992462,
"eval_perplexity": 1.2300966177183963,
"eval_runtime": 89.5208,
"eval_samples_per_second": 55.853,
"eval_steps_per_second": 1.754,
"step": 510000
},
{
"base_loss": 0.5883547278642655,
"epoch": 8.020027160644531,
"grad_norm": 0.0011325281811878085,
"learning_rate": 1.3150215148925781e-06,
"lookahead_loss": 6.171751028060913,
"loss": 0.6033,
"step": 510500
},
{
"base_loss": 0.6178413493037224,
"epoch": 8.020980834960938,
"grad_norm": 0.001067935605533421,
"learning_rate": 1.2673377990722656e-06,
"lookahead_loss": 6.234110792160034,
"loss": 0.627,
"step": 511000
},
{
"base_loss": 0.599259612083435,
"epoch": 8.021934509277344,
"grad_norm": 0.0011157679837197065,
"learning_rate": 1.2196540832519533e-06,
"lookahead_loss": 6.210755274772644,
"loss": 0.611,
"step": 511500
},
{
"base_loss": 0.5985046907067298,
"epoch": 8.02288818359375,
"grad_norm": 0.0010958234779536724,
"learning_rate": 1.1719703674316407e-06,
"lookahead_loss": 6.240888547897339,
"loss": 0.6048,
"step": 512000
},
{
"base_loss": 0.5953476763367653,
"epoch": 8.023841857910156,
"grad_norm": 0.001115177758038044,
"learning_rate": 1.1242866516113282e-06,
"lookahead_loss": 6.217287230491638,
"loss": 0.6031,
"step": 512500
},
{
"base_loss": 0.612789287507534,
"epoch": 8.024795532226562,
"grad_norm": 0.0010878838365897536,
"learning_rate": 1.0766029357910156e-06,
"lookahead_loss": 6.225279389858246,
"loss": 0.6225,
"step": 513000
},
{
"base_loss": 0.6079529778957367,
"epoch": 8.025749206542969,
"grad_norm": 0.001078968751244247,
"learning_rate": 1.028919219970703e-06,
"lookahead_loss": 6.175548429965973,
"loss": 0.6162,
"step": 513500
},
{
"base_loss": 0.5974433195590972,
"epoch": 8.026702880859375,
"grad_norm": 0.0011195708066225052,
"learning_rate": 9.812355041503908e-07,
"lookahead_loss": 6.199098266601562,
"loss": 0.6053,
"step": 514000
},
{
"base_loss": 0.5935445895195007,
"epoch": 8.027656555175781,
"grad_norm": 0.0010943651432171464,
"learning_rate": 9.335517883300781e-07,
"lookahead_loss": 6.309714347839355,
"loss": 0.6063,
"step": 514500
},
{
"base_loss": 0.6162749938368798,
"epoch": 8.028610229492188,
"grad_norm": 0.001078986912034452,
"learning_rate": 8.858680725097657e-07,
"lookahead_loss": 6.3432404346466065,
"loss": 0.6248,
"step": 515000
},
{
"epoch": 8.028610229492188,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.2569849018852555,
"eval_lookahead_perplexity": 521.6437595233942,
"eval_loss": 0.20709232985973358,
"eval_perplexity": 1.230096141141921,
"eval_runtime": 91.7882,
"eval_samples_per_second": 54.473,
"eval_steps_per_second": 1.71,
"step": 515000
},
{
"base_loss": 0.6038788543343544,
"epoch": 8.029563903808594,
"grad_norm": 0.001102432026527822,
"learning_rate": 8.381843566894531e-07,
"lookahead_loss": 6.264419286251068,
"loss": 0.6123,
"step": 515500
},
{
"base_loss": 0.5950830878019333,
"epoch": 8.030517578125,
"grad_norm": 0.0011076893424615264,
"learning_rate": 7.905006408691407e-07,
"lookahead_loss": 6.274682451725006,
"loss": 0.6066,
"step": 516000
},
{
"base_loss": 0.5894574123620987,
"epoch": 8.031471252441406,
"grad_norm": 0.0011027451837435365,
"learning_rate": 7.428169250488282e-07,
"lookahead_loss": 6.2933365597724915,
"loss": 0.6024,
"step": 516500
},
{
"base_loss": 0.6123639032840729,
"epoch": 8.032424926757812,
"grad_norm": 0.001126546529121697,
"learning_rate": 6.951332092285156e-07,
"lookahead_loss": 6.2396877632141114,
"loss": 0.6264,
"step": 517000
},
{
"base_loss": 0.5966498643159867,
"epoch": 8.033378601074219,
"grad_norm": 0.0011282344348728657,
"learning_rate": 6.474494934082032e-07,
"lookahead_loss": 6.2721795229911805,
"loss": 0.6068,
"step": 517500
},
{
"base_loss": 0.5952624140977859,
"epoch": 8.034332275390625,
"grad_norm": 0.0010613331105560064,
"learning_rate": 5.997657775878906e-07,
"lookahead_loss": 6.34448275566101,
"loss": 0.6088,
"step": 518000
},
{
"base_loss": 0.5912229750752449,
"epoch": 8.035285949707031,
"grad_norm": 0.0011148882331326604,
"learning_rate": 5.520820617675782e-07,
"lookahead_loss": 6.200133923053741,
"loss": 0.6025,
"step": 518500
},
{
"base_loss": 0.6111020909547806,
"epoch": 8.036239624023438,
"grad_norm": 0.0011170883662998676,
"learning_rate": 5.043983459472657e-07,
"lookahead_loss": 6.301377963066101,
"loss": 0.6212,
"step": 519000
},
{
"base_loss": 0.6032237566113472,
"epoch": 8.037193298339844,
"grad_norm": 0.0011249141534790397,
"learning_rate": 4.5671463012695317e-07,
"lookahead_loss": 6.248824975967407,
"loss": 0.6122,
"step": 519500
},
{
"base_loss": 0.5889247298240662,
"epoch": 8.03814697265625,
"grad_norm": 0.0010875992011278868,
"learning_rate": 4.0903091430664063e-07,
"lookahead_loss": 6.256915027618408,
"loss": 0.6013,
"step": 520000
},
{
"epoch": 8.03814697265625,
"eval_accuracy": 0.0032320939334637964,
"eval_base_loss": 0.19372630566834642,
"eval_base_perplexity": 1.2137640371544767,
"eval_lookahead_loss": 6.256861397252677,
"eval_lookahead_perplexity": 521.5793380807926,
"eval_loss": 0.20709213614463806,
"eval_perplexity": 1.2300959028537526,
"eval_runtime": 91.5112,
"eval_samples_per_second": 54.638,
"eval_steps_per_second": 1.716,
"step": 520000
},
{
"base_loss": 0.5958697483539581,
"epoch": 8.039100646972656,
"grad_norm": 0.0011051874607801437,
"learning_rate": 3.6134719848632814e-07,
"lookahead_loss": 6.230529176712036,
"loss": 0.6107,
"step": 520500
},
{
"base_loss": 0.6128707799315453,
"epoch": 8.040054321289062,
"grad_norm": 0.001110102515667677,
"learning_rate": 3.1366348266601565e-07,
"lookahead_loss": 6.245839037895203,
"loss": 0.6219,
"step": 521000
},
{
"base_loss": 0.6014232878088951,
"epoch": 8.041007995605469,
"grad_norm": 0.0011205815244466066,
"learning_rate": 2.6597976684570316e-07,
"lookahead_loss": 6.203103644371033,
"loss": 0.6107,
"step": 521500
},
{
"base_loss": 0.579496483206749,
"epoch": 8.041961669921875,
"grad_norm": 0.0011324465740472078,
"learning_rate": 2.1829605102539064e-07,
"lookahead_loss": 6.285241819381714,
"loss": 0.5951,
"step": 522000
},
{
"base_loss": 0.6032205757498741,
"epoch": 8.042915344238281,
"grad_norm": 0.0010729862842708826,
"learning_rate": 1.7061233520507813e-07,
"lookahead_loss": 6.298031174659729,
"loss": 0.6152,
"step": 522500
},
{
"base_loss": 0.6130943556427956,
"epoch": 8.043869018554688,
"grad_norm": 0.0011428052093833685,
"learning_rate": 1.2292861938476564e-07,
"lookahead_loss": 6.320867140769958,
"loss": 0.6249,
"step": 523000
},
{
"base_loss": 0.592569636464119,
"epoch": 8.044822692871094,
"grad_norm": 0.0011110154446214437,
"learning_rate": 7.524490356445312e-08,
"lookahead_loss": 6.253005561828613,
"loss": 0.6046,
"step": 523500
},
{
"base_loss": 0.5920621357560157,
"epoch": 8.0457763671875,
"grad_norm": 0.001082174712792039,
"learning_rate": 2.7561187744140627e-08,
"lookahead_loss": 6.253642764091492,
"loss": 0.603,
"step": 524000
},
{
"epoch": 8.04632568359375,
"step": 524288,
"total_flos": 3.966527920280699e+18,
"train_loss": 0.6106607067631558,
"train_runtime": 117545.2213,
"train_samples_per_second": 142.73,
"train_steps_per_second": 4.46
}
],
"logging_steps": 500,
"max_steps": 524288,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.966527920280699e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}