flan-t5la-large / trainer_state.json
hrezaei's picture
End of training
c09fbce verified
{
"best_global_step": null,
"best_metric": 1.6212482452392578,
"best_model_checkpoint": null,
"epoch": 1.0749359130859375,
"eval_steps": 5000,
"global_step": 524288,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"base_loss": 0.30045702931284907,
"epoch": 0.00095367431640625,
"grad_norm": 0.2715776264667511,
"learning_rate": 4.995241165161133e-05,
"lookahead_loss": 11.636415706634521,
"loss": 5.9684,
"step": 500
},
{
"base_loss": 0.30115938156843186,
"epoch": 0.0019073486328125,
"grad_norm": 0.23746199905872345,
"learning_rate": 4.990472793579102e-05,
"lookahead_loss": 10.842903160095215,
"loss": 5.572,
"step": 1000
},
{
"base_loss": 0.30052139541506767,
"epoch": 0.00286102294921875,
"grad_norm": 0.2835578918457031,
"learning_rate": 4.98570442199707e-05,
"lookahead_loss": 10.13417279434204,
"loss": 5.2173,
"step": 1500
},
{
"base_loss": 0.30269933369755747,
"epoch": 0.003814697265625,
"grad_norm": 0.24823836982250214,
"learning_rate": 4.9809360504150393e-05,
"lookahead_loss": 9.486023275375366,
"loss": 4.8944,
"step": 2000
},
{
"base_loss": 0.3041781492829323,
"epoch": 0.00476837158203125,
"grad_norm": 0.20649601519107819,
"learning_rate": 4.9761676788330084e-05,
"lookahead_loss": 8.79746763420105,
"loss": 4.5508,
"step": 2500
},
{
"base_loss": 0.31303099401295187,
"epoch": 0.0057220458984375,
"grad_norm": 0.1906881481409073,
"learning_rate": 4.971399307250977e-05,
"lookahead_loss": 8.270847107887269,
"loss": 4.2919,
"step": 3000
},
{
"base_loss": 0.33166604954004286,
"epoch": 0.00667572021484375,
"grad_norm": 0.1698817014694214,
"learning_rate": 4.966630935668946e-05,
"lookahead_loss": 7.874463705062866,
"loss": 4.1031,
"step": 3500
},
{
"base_loss": 0.31333299943804743,
"epoch": 0.00762939453125,
"grad_norm": 0.174786776304245,
"learning_rate": 4.961862564086914e-05,
"lookahead_loss": 7.546367056846619,
"loss": 3.9299,
"step": 4000
},
{
"base_loss": 0.31245614659786225,
"epoch": 0.00858306884765625,
"grad_norm": 0.14247459173202515,
"learning_rate": 4.957094192504883e-05,
"lookahead_loss": 7.2693215188980105,
"loss": 3.7909,
"step": 4500
},
{
"base_loss": 0.292913170427084,
"epoch": 0.0095367431640625,
"grad_norm": 0.1335776448249817,
"learning_rate": 4.952325820922852e-05,
"lookahead_loss": 7.0413890533447265,
"loss": 3.6672,
"step": 5000
},
{
"epoch": 0.0095367431640625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 7.667783941650391,
"eval_lookahead_perplexity": 2138.337510077118,
"eval_loss": 3.8992421627044678,
"eval_perplexity": 49.36502426986664,
"eval_runtime": 486.7779,
"eval_samples_per_second": 20.543,
"eval_steps_per_second": 1.284,
"step": 5000
},
{
"base_loss": 0.30041549998521805,
"epoch": 0.01049041748046875,
"grad_norm": 0.14120376110076904,
"learning_rate": 4.9475574493408205e-05,
"lookahead_loss": 6.863395688056946,
"loss": 3.5819,
"step": 5500
},
{
"base_loss": 0.29533531844615935,
"epoch": 0.011444091796875,
"grad_norm": 0.14177614450454712,
"learning_rate": 4.9427890777587895e-05,
"lookahead_loss": 6.756114417076111,
"loss": 3.5257,
"step": 6000
},
{
"base_loss": 0.29999259182810784,
"epoch": 0.01239776611328125,
"grad_norm": 0.15401475131511688,
"learning_rate": 4.938020706176758e-05,
"lookahead_loss": 6.607378579139709,
"loss": 3.4537,
"step": 6500
},
{
"base_loss": 0.29886582669615747,
"epoch": 0.0133514404296875,
"grad_norm": 0.17274411022663116,
"learning_rate": 4.933252334594727e-05,
"lookahead_loss": 6.502672654151916,
"loss": 3.4008,
"step": 7000
},
{
"base_loss": 0.3034250964820385,
"epoch": 0.01430511474609375,
"grad_norm": 0.16889062523841858,
"learning_rate": 4.928483963012696e-05,
"lookahead_loss": 6.298652252197265,
"loss": 3.301,
"step": 7500
},
{
"base_loss": 0.3204497399777174,
"epoch": 0.0152587890625,
"grad_norm": 0.1580396145582199,
"learning_rate": 4.923715591430664e-05,
"lookahead_loss": 6.183374300003051,
"loss": 3.2519,
"step": 8000
},
{
"base_loss": 0.31960979211330415,
"epoch": 0.01621246337890625,
"grad_norm": 0.19017435610294342,
"learning_rate": 4.918947219848633e-05,
"lookahead_loss": 6.093465467453003,
"loss": 3.2065,
"step": 8500
},
{
"base_loss": 0.3001296965777874,
"epoch": 0.0171661376953125,
"grad_norm": 0.1364353597164154,
"learning_rate": 4.9141788482666016e-05,
"lookahead_loss": 6.014976463317871,
"loss": 3.1576,
"step": 9000
},
{
"base_loss": 0.32021681547164915,
"epoch": 0.01811981201171875,
"grad_norm": 0.24822595715522766,
"learning_rate": 4.9094104766845706e-05,
"lookahead_loss": 5.915456521987915,
"loss": 3.1178,
"step": 9500
},
{
"base_loss": 0.29040670284628867,
"epoch": 0.019073486328125,
"grad_norm": 0.19323401153087616,
"learning_rate": 4.9046421051025396e-05,
"lookahead_loss": 5.8278450555801395,
"loss": 3.0591,
"step": 10000
},
{
"epoch": 0.019073486328125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 6.077848429870605,
"eval_lookahead_perplexity": 436.08990659185014,
"eval_loss": 3.1042745113372803,
"eval_perplexity": 22.293039759463593,
"eval_runtime": 483.2744,
"eval_samples_per_second": 20.692,
"eval_steps_per_second": 1.293,
"step": 10000
},
{
"base_loss": 0.3004113952517509,
"epoch": 0.02002716064453125,
"grad_norm": 0.16262881457805634,
"learning_rate": 4.899873733520508e-05,
"lookahead_loss": 5.826037519454956,
"loss": 3.0632,
"step": 10500
},
{
"base_loss": 0.29663331305980684,
"epoch": 0.0209808349609375,
"grad_norm": 0.15377278625965118,
"learning_rate": 4.895105361938477e-05,
"lookahead_loss": 5.7655581150054935,
"loss": 3.0311,
"step": 11000
},
{
"base_loss": 0.30206509011983873,
"epoch": 0.02193450927734375,
"grad_norm": 0.6924819350242615,
"learning_rate": 4.890336990356445e-05,
"lookahead_loss": 5.726267605781556,
"loss": 3.0142,
"step": 11500
},
{
"base_loss": 0.30140692415833475,
"epoch": 0.02288818359375,
"grad_norm": 0.17533016204833984,
"learning_rate": 4.8855686187744143e-05,
"lookahead_loss": 5.6108699903488155,
"loss": 2.9561,
"step": 12000
},
{
"base_loss": 0.31570343241095544,
"epoch": 0.02384185791015625,
"grad_norm": 0.14791598916053772,
"learning_rate": 4.8808002471923834e-05,
"lookahead_loss": 5.538150405883789,
"loss": 2.9269,
"step": 12500
},
{
"base_loss": 0.33028968888521193,
"epoch": 0.0247955322265625,
"grad_norm": 0.12097828835248947,
"learning_rate": 4.876031875610352e-05,
"lookahead_loss": 5.539091997146606,
"loss": 2.9347,
"step": 13000
},
{
"base_loss": 0.3067899467945099,
"epoch": 0.02574920654296875,
"grad_norm": 0.1595277637243271,
"learning_rate": 4.871263504028321e-05,
"lookahead_loss": 5.434582984924316,
"loss": 2.8707,
"step": 13500
},
{
"base_loss": 0.3104507875740528,
"epoch": 0.026702880859375,
"grad_norm": 0.16813045740127563,
"learning_rate": 4.866495132446289e-05,
"lookahead_loss": 5.4183082113265995,
"loss": 2.8644,
"step": 14000
},
{
"base_loss": 0.295670255869627,
"epoch": 0.02765655517578125,
"grad_norm": 0.23366433382034302,
"learning_rate": 4.861726760864258e-05,
"lookahead_loss": 5.329585377693176,
"loss": 2.8126,
"step": 14500
},
{
"base_loss": 0.3073807775378227,
"epoch": 0.0286102294921875,
"grad_norm": 0.15339840948581696,
"learning_rate": 4.856958389282227e-05,
"lookahead_loss": 5.379592286109924,
"loss": 2.8435,
"step": 15000
},
{
"epoch": 0.0286102294921875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 5.3830560668945315,
"eval_lookahead_perplexity": 217.68652449406255,
"eval_loss": 2.75687837600708,
"eval_perplexity": 15.750598680461218,
"eval_runtime": 488.4895,
"eval_samples_per_second": 20.471,
"eval_steps_per_second": 1.279,
"step": 15000
},
{
"base_loss": 0.296496417850256,
"epoch": 0.02956390380859375,
"grad_norm": 0.16044539213180542,
"learning_rate": 4.8521900177001955e-05,
"lookahead_loss": 5.344062633514405,
"loss": 2.8203,
"step": 15500
},
{
"base_loss": 0.29590288090705874,
"epoch": 0.030517578125,
"grad_norm": 0.16459447145462036,
"learning_rate": 4.8474216461181645e-05,
"lookahead_loss": 5.331672690868378,
"loss": 2.8138,
"step": 16000
},
{
"base_loss": 0.3003334278166294,
"epoch": 0.03147125244140625,
"grad_norm": 0.14433036744594574,
"learning_rate": 4.842653274536133e-05,
"lookahead_loss": 5.245349229812622,
"loss": 2.7728,
"step": 16500
},
{
"base_loss": 0.3256162821352482,
"epoch": 0.0324249267578125,
"grad_norm": 0.17356151342391968,
"learning_rate": 4.837884902954102e-05,
"lookahead_loss": 5.219405631065369,
"loss": 2.7725,
"step": 17000
},
{
"base_loss": 0.3199668276309967,
"epoch": 0.03337860107421875,
"grad_norm": 0.15259094536304474,
"learning_rate": 4.833116531372071e-05,
"lookahead_loss": 5.178223248481751,
"loss": 2.7491,
"step": 17500
},
{
"base_loss": 0.29680381083488466,
"epoch": 0.034332275390625,
"grad_norm": 0.20254507660865784,
"learning_rate": 4.828348159790039e-05,
"lookahead_loss": 5.133180852890015,
"loss": 2.715,
"step": 18000
},
{
"base_loss": 0.30402446049451826,
"epoch": 0.03528594970703125,
"grad_norm": 0.14859794080257416,
"learning_rate": 4.823579788208008e-05,
"lookahead_loss": 5.102789646148682,
"loss": 2.7034,
"step": 18500
},
{
"base_loss": 0.2954226844608784,
"epoch": 0.0362396240234375,
"grad_norm": 0.1865054816007614,
"learning_rate": 4.8188114166259766e-05,
"lookahead_loss": 5.056313884735108,
"loss": 2.6759,
"step": 19000
},
{
"base_loss": 0.30284518826007845,
"epoch": 0.03719329833984375,
"grad_norm": 0.1533517986536026,
"learning_rate": 4.8140430450439456e-05,
"lookahead_loss": 5.113425381660462,
"loss": 2.7081,
"step": 19500
},
{
"base_loss": 0.293648807734251,
"epoch": 0.03814697265625,
"grad_norm": 0.12334468960762024,
"learning_rate": 4.8092746734619146e-05,
"lookahead_loss": 5.080516023635864,
"loss": 2.6871,
"step": 20000
},
{
"epoch": 0.03814697265625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 4.964036189270019,
"eval_lookahead_perplexity": 143.1704944953996,
"eval_loss": 2.547368288040161,
"eval_perplexity": 12.773443485997294,
"eval_runtime": 505.4082,
"eval_samples_per_second": 19.786,
"eval_steps_per_second": 1.237,
"step": 20000
},
{
"base_loss": 0.30936932846903803,
"epoch": 0.03910064697265625,
"grad_norm": 0.14457817375659943,
"learning_rate": 4.804506301879883e-05,
"lookahead_loss": 5.047291966438293,
"loss": 2.6783,
"step": 20500
},
{
"base_loss": 0.29837565070390704,
"epoch": 0.0400543212890625,
"grad_norm": 0.21649453043937683,
"learning_rate": 4.799737930297852e-05,
"lookahead_loss": 4.948660229682923,
"loss": 2.6235,
"step": 21000
},
{
"base_loss": 0.327464056879282,
"epoch": 0.04100799560546875,
"grad_norm": 0.1511124223470688,
"learning_rate": 4.79496955871582e-05,
"lookahead_loss": 5.014459494590759,
"loss": 2.671,
"step": 21500
},
{
"base_loss": 0.3259224636852741,
"epoch": 0.041961669921875,
"grad_norm": 0.18786948919296265,
"learning_rate": 4.7902011871337893e-05,
"lookahead_loss": 4.958053824424744,
"loss": 2.642,
"step": 22000
},
{
"base_loss": 0.30848885998129844,
"epoch": 0.04291534423828125,
"grad_norm": 0.18608908355236053,
"learning_rate": 4.7854328155517584e-05,
"lookahead_loss": 4.910871742725372,
"loss": 2.6097,
"step": 22500
},
{
"base_loss": 0.2953577929735184,
"epoch": 0.0438690185546875,
"grad_norm": 0.13473840057849884,
"learning_rate": 4.780664443969727e-05,
"lookahead_loss": 4.861847942352295,
"loss": 2.5786,
"step": 23000
},
{
"base_loss": 0.3016613866984844,
"epoch": 0.04482269287109375,
"grad_norm": 0.12197423726320267,
"learning_rate": 4.775896072387696e-05,
"lookahead_loss": 4.872081851005555,
"loss": 2.5869,
"step": 23500
},
{
"base_loss": 0.2971103771924973,
"epoch": 0.0457763671875,
"grad_norm": 0.16922320425510406,
"learning_rate": 4.771127700805664e-05,
"lookahead_loss": 4.889053022384643,
"loss": 2.5931,
"step": 24000
},
{
"base_loss": 0.300103415876627,
"epoch": 0.04673004150390625,
"grad_norm": 0.16374553740024567,
"learning_rate": 4.766359329223633e-05,
"lookahead_loss": 4.88103800201416,
"loss": 2.5906,
"step": 24500
},
{
"base_loss": 0.3028304523229599,
"epoch": 0.0476837158203125,
"grad_norm": 0.1691102385520935,
"learning_rate": 4.761590957641602e-05,
"lookahead_loss": 4.8170537824630735,
"loss": 2.5599,
"step": 25000
},
{
"epoch": 0.0476837158203125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 4.681251558685303,
"eval_lookahead_perplexity": 107.90503758244142,
"eval_loss": 2.4059760570526123,
"eval_perplexity": 11.08924874105288,
"eval_runtime": 483.5965,
"eval_samples_per_second": 20.678,
"eval_steps_per_second": 1.292,
"step": 25000
},
{
"base_loss": 0.3226933609247208,
"epoch": 0.04863739013671875,
"grad_norm": 0.3987274169921875,
"learning_rate": 4.7568225860595705e-05,
"lookahead_loss": 4.831066390037536,
"loss": 2.5769,
"step": 25500
},
{
"base_loss": 0.3246669633388519,
"epoch": 0.049591064453125,
"grad_norm": 0.1506359577178955,
"learning_rate": 4.7520542144775395e-05,
"lookahead_loss": 4.826577740669251,
"loss": 2.5756,
"step": 26000
},
{
"base_loss": 0.31835618990659714,
"epoch": 0.05054473876953125,
"grad_norm": 0.2562532126903534,
"learning_rate": 4.747285842895508e-05,
"lookahead_loss": 4.776721940994262,
"loss": 2.5475,
"step": 26500
},
{
"base_loss": 0.3007115146815777,
"epoch": 0.0514984130859375,
"grad_norm": 0.18583890795707703,
"learning_rate": 4.742517471313477e-05,
"lookahead_loss": 4.746668409347534,
"loss": 2.5237,
"step": 27000
},
{
"base_loss": 0.30024259850382806,
"epoch": 0.05245208740234375,
"grad_norm": 0.1737774759531021,
"learning_rate": 4.737749099731446e-05,
"lookahead_loss": 4.724320489406586,
"loss": 2.5123,
"step": 27500
},
{
"base_loss": 0.30464168420433996,
"epoch": 0.05340576171875,
"grad_norm": 0.18554258346557617,
"learning_rate": 4.732980728149414e-05,
"lookahead_loss": 4.769838083267212,
"loss": 2.5372,
"step": 28000
},
{
"base_loss": 0.2989484859406948,
"epoch": 0.05435943603515625,
"grad_norm": 0.24365681409835815,
"learning_rate": 4.728212356567383e-05,
"lookahead_loss": 4.73219411945343,
"loss": 2.5156,
"step": 28500
},
{
"base_loss": 0.315606110394001,
"epoch": 0.0553131103515625,
"grad_norm": 0.16112400591373444,
"learning_rate": 4.7234439849853516e-05,
"lookahead_loss": 4.720495784759522,
"loss": 2.5181,
"step": 29000
},
{
"base_loss": 0.323923219949007,
"epoch": 0.05626678466796875,
"grad_norm": 0.14975038170814514,
"learning_rate": 4.7186756134033206e-05,
"lookahead_loss": 4.705821178436279,
"loss": 2.5149,
"step": 29500
},
{
"base_loss": 0.3346382395327091,
"epoch": 0.057220458984375,
"grad_norm": 0.12785978615283966,
"learning_rate": 4.7139072418212896e-05,
"lookahead_loss": 4.724789978027344,
"loss": 2.5297,
"step": 30000
},
{
"epoch": 0.057220458984375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 4.474773878097534,
"eval_lookahead_perplexity": 87.77475037203264,
"eval_loss": 2.30273699760437,
"eval_perplexity": 10.00151916148414,
"eval_runtime": 489.7644,
"eval_samples_per_second": 20.418,
"eval_steps_per_second": 1.276,
"step": 30000
},
{
"base_loss": 0.30464139559865,
"epoch": 0.05817413330078125,
"grad_norm": 0.16110007464885712,
"learning_rate": 4.709138870239258e-05,
"lookahead_loss": 4.650503679275513,
"loss": 2.4776,
"step": 30500
},
{
"base_loss": 0.29905567806959155,
"epoch": 0.0591278076171875,
"grad_norm": 0.15517863631248474,
"learning_rate": 4.704370498657227e-05,
"lookahead_loss": 4.638624626159668,
"loss": 2.4688,
"step": 31000
},
{
"base_loss": 0.2991310947537422,
"epoch": 0.06008148193359375,
"grad_norm": 0.23121874034404755,
"learning_rate": 4.699602127075195e-05,
"lookahead_loss": 4.616633594036102,
"loss": 2.4579,
"step": 31500
},
{
"base_loss": 0.3083145318031311,
"epoch": 0.06103515625,
"grad_norm": 0.2543278932571411,
"learning_rate": 4.6948337554931643e-05,
"lookahead_loss": 4.696196820259094,
"loss": 2.5023,
"step": 32000
},
{
"base_loss": 0.30308397909998896,
"epoch": 0.06198883056640625,
"grad_norm": 0.13134761154651642,
"learning_rate": 4.6900653839111334e-05,
"lookahead_loss": 4.6643982214927675,
"loss": 2.4837,
"step": 32500
},
{
"base_loss": 0.3018778342306614,
"epoch": 0.0629425048828125,
"grad_norm": 0.1542576104402542,
"learning_rate": 4.685297012329102e-05,
"lookahead_loss": 4.626801939964294,
"loss": 2.4643,
"step": 33000
},
{
"base_loss": 0.3218779897689819,
"epoch": 0.06389617919921875,
"grad_norm": 0.13860082626342773,
"learning_rate": 4.680528640747071e-05,
"lookahead_loss": 4.630942379951477,
"loss": 2.4764,
"step": 33500
},
{
"base_loss": 0.32255707490444185,
"epoch": 0.064849853515625,
"grad_norm": 0.21500709652900696,
"learning_rate": 4.675760269165039e-05,
"lookahead_loss": 4.614894771099091,
"loss": 2.4687,
"step": 34000
},
{
"base_loss": 0.3057953714132309,
"epoch": 0.06580352783203125,
"grad_norm": 0.15437884628772736,
"learning_rate": 4.670991897583008e-05,
"lookahead_loss": 4.562792086601258,
"loss": 2.4343,
"step": 34500
},
{
"base_loss": 0.2983690336048603,
"epoch": 0.0667572021484375,
"grad_norm": 0.17949432134628296,
"learning_rate": 4.666223526000977e-05,
"lookahead_loss": 4.535246185302734,
"loss": 2.4168,
"step": 35000
},
{
"epoch": 0.0667572021484375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 4.3161673141479495,
"eval_lookahead_perplexity": 74.90100543624399,
"eval_loss": 2.2234339714050293,
"eval_perplexity": 9.23900292614348,
"eval_runtime": 498.5159,
"eval_samples_per_second": 20.06,
"eval_steps_per_second": 1.254,
"step": 35000
},
{
"base_loss": 0.3004172631800175,
"epoch": 0.06771087646484375,
"grad_norm": 0.14633019268512726,
"learning_rate": 4.6614551544189455e-05,
"lookahead_loss": 4.555401055812836,
"loss": 2.4279,
"step": 35500
},
{
"base_loss": 0.3054021218121052,
"epoch": 0.06866455078125,
"grad_norm": 0.1640414297580719,
"learning_rate": 4.6566867828369145e-05,
"lookahead_loss": 4.591926307678222,
"loss": 2.4487,
"step": 36000
},
{
"base_loss": 0.30075133538246157,
"epoch": 0.06961822509765625,
"grad_norm": 0.14715056121349335,
"learning_rate": 4.651918411254883e-05,
"lookahead_loss": 4.545757569789886,
"loss": 2.4233,
"step": 36500
},
{
"base_loss": 0.3224307889938354,
"epoch": 0.0705718994140625,
"grad_norm": 0.1614302396774292,
"learning_rate": 4.647150039672852e-05,
"lookahead_loss": 4.540217909812927,
"loss": 2.4313,
"step": 37000
},
{
"base_loss": 0.3294345450103283,
"epoch": 0.07152557373046875,
"grad_norm": 0.15803970396518707,
"learning_rate": 4.642381668090821e-05,
"lookahead_loss": 4.554252586364746,
"loss": 2.4418,
"step": 37500
},
{
"base_loss": 0.3225139188170433,
"epoch": 0.072479248046875,
"grad_norm": 0.1647147685289383,
"learning_rate": 4.637613296508789e-05,
"lookahead_loss": 4.5207296891212465,
"loss": 2.4216,
"step": 38000
},
{
"base_loss": 0.312881602704525,
"epoch": 0.07343292236328125,
"grad_norm": 0.1871267408132553,
"learning_rate": 4.632844924926758e-05,
"lookahead_loss": 4.507406691551209,
"loss": 2.4101,
"step": 38500
},
{
"base_loss": 0.3021739726960659,
"epoch": 0.0743865966796875,
"grad_norm": 0.1738116592168808,
"learning_rate": 4.6280765533447266e-05,
"lookahead_loss": 4.469052557945251,
"loss": 2.3856,
"step": 39000
},
{
"base_loss": 0.30172099885344505,
"epoch": 0.07534027099609375,
"grad_norm": 0.16887560486793518,
"learning_rate": 4.6233081817626956e-05,
"lookahead_loss": 4.467407505512238,
"loss": 2.3846,
"step": 39500
},
{
"base_loss": 0.30044360157847405,
"epoch": 0.0762939453125,
"grad_norm": 0.26040539145469666,
"learning_rate": 4.6185398101806646e-05,
"lookahead_loss": 4.4797073764801025,
"loss": 2.3901,
"step": 40000
},
{
"epoch": 0.0762939453125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 4.1904504711151125,
"eval_lookahead_perplexity": 66.05253902049455,
"eval_loss": 2.1605753898620605,
"eval_perplexity": 8.676128378831928,
"eval_runtime": 487.0689,
"eval_samples_per_second": 20.531,
"eval_steps_per_second": 1.283,
"step": 40000
},
{
"base_loss": 0.3015633761882782,
"epoch": 0.07724761962890625,
"grad_norm": 0.21266193687915802,
"learning_rate": 4.613771438598633e-05,
"lookahead_loss": 4.494357872486114,
"loss": 2.398,
"step": 40500
},
{
"base_loss": 0.31515628564357756,
"epoch": 0.0782012939453125,
"grad_norm": 0.16350935399532318,
"learning_rate": 4.609003067016602e-05,
"lookahead_loss": 4.463918738365173,
"loss": 2.3895,
"step": 41000
},
{
"base_loss": 0.3230645119249821,
"epoch": 0.07915496826171875,
"grad_norm": 0.14223527908325195,
"learning_rate": 4.60423469543457e-05,
"lookahead_loss": 4.479482789039611,
"loss": 2.4013,
"step": 41500
},
{
"base_loss": 0.31478354924917223,
"epoch": 0.080108642578125,
"grad_norm": 0.21286998689174652,
"learning_rate": 4.5994663238525393e-05,
"lookahead_loss": 4.4453483581542965,
"loss": 2.3801,
"step": 42000
},
{
"base_loss": 0.3150367656648159,
"epoch": 0.08106231689453125,
"grad_norm": 0.17431187629699707,
"learning_rate": 4.5946979522705084e-05,
"lookahead_loss": 4.43265785074234,
"loss": 2.3738,
"step": 42500
},
{
"base_loss": 0.2954510691165924,
"epoch": 0.0820159912109375,
"grad_norm": 0.1371452659368515,
"learning_rate": 4.589929580688477e-05,
"lookahead_loss": 4.397426519393921,
"loss": 2.3464,
"step": 43000
},
{
"base_loss": 0.29434001427888873,
"epoch": 0.08296966552734375,
"grad_norm": 0.13817064464092255,
"learning_rate": 4.585161209106446e-05,
"lookahead_loss": 4.442314762115479,
"loss": 2.3683,
"step": 43500
},
{
"base_loss": 0.2983709729015827,
"epoch": 0.08392333984375,
"grad_norm": 0.17395979166030884,
"learning_rate": 4.580392837524414e-05,
"lookahead_loss": 4.447078158378601,
"loss": 2.3727,
"step": 44000
},
{
"base_loss": 0.29992626640200615,
"epoch": 0.08487701416015625,
"grad_norm": 0.17240917682647705,
"learning_rate": 4.575624465942383e-05,
"lookahead_loss": 4.4108530750274655,
"loss": 2.3554,
"step": 44500
},
{
"base_loss": 0.32140542250871657,
"epoch": 0.0858306884765625,
"grad_norm": 0.2007725089788437,
"learning_rate": 4.570856094360352e-05,
"lookahead_loss": 4.3953737797737125,
"loss": 2.3584,
"step": 45000
},
{
"epoch": 0.0858306884765625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 4.088313439559936,
"eval_lookahead_perplexity": 59.639221683003754,
"eval_loss": 2.109506607055664,
"eval_perplexity": 8.244172664370447,
"eval_runtime": 486.2971,
"eval_samples_per_second": 20.564,
"eval_steps_per_second": 1.285,
"step": 45000
},
{
"base_loss": 0.32445472630858424,
"epoch": 0.08678436279296875,
"grad_norm": 0.22776088118553162,
"learning_rate": 4.5660877227783205e-05,
"lookahead_loss": 4.4375409073829655,
"loss": 2.381,
"step": 45500
},
{
"base_loss": 0.33727448362112045,
"epoch": 0.087738037109375,
"grad_norm": 0.20219068229198456,
"learning_rate": 4.5613193511962895e-05,
"lookahead_loss": 4.402894349098205,
"loss": 2.3701,
"step": 46000
},
{
"base_loss": 0.2958546592593193,
"epoch": 0.08869171142578125,
"grad_norm": 0.13857534527778625,
"learning_rate": 4.556550979614258e-05,
"lookahead_loss": 4.363266070842743,
"loss": 2.3296,
"step": 46500
},
{
"base_loss": 0.2990704481303692,
"epoch": 0.0896453857421875,
"grad_norm": 0.17887870967388153,
"learning_rate": 4.551782608032227e-05,
"lookahead_loss": 4.336292638778686,
"loss": 2.3177,
"step": 47000
},
{
"base_loss": 0.3050165086686611,
"epoch": 0.09059906005859375,
"grad_norm": 0.14284111559391022,
"learning_rate": 4.547014236450196e-05,
"lookahead_loss": 4.395085669517517,
"loss": 2.3501,
"step": 47500
},
{
"base_loss": 0.307517321318388,
"epoch": 0.091552734375,
"grad_norm": 0.14320409297943115,
"learning_rate": 4.542245864868164e-05,
"lookahead_loss": 4.417602932453155,
"loss": 2.3626,
"step": 48000
},
{
"base_loss": 0.3067179475426674,
"epoch": 0.09250640869140625,
"grad_norm": 0.14618393778800964,
"learning_rate": 4.537477493286133e-05,
"lookahead_loss": 4.324424335956573,
"loss": 2.3156,
"step": 48500
},
{
"base_loss": 0.34884196099638937,
"epoch": 0.0934600830078125,
"grad_norm": 0.14412052929401398,
"learning_rate": 4.5327091217041016e-05,
"lookahead_loss": 4.406433558940887,
"loss": 2.3776,
"step": 49000
},
{
"base_loss": 0.3168534035682678,
"epoch": 0.09441375732421875,
"grad_norm": 0.15117081999778748,
"learning_rate": 4.5279407501220706e-05,
"lookahead_loss": 4.37341592168808,
"loss": 2.3451,
"step": 49500
},
{
"base_loss": 0.3365150539577007,
"epoch": 0.095367431640625,
"grad_norm": 0.29998552799224854,
"learning_rate": 4.523172378540039e-05,
"lookahead_loss": 4.387516963481903,
"loss": 2.362,
"step": 50000
},
{
"epoch": 0.095367431640625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 4.00546947631836,
"eval_lookahead_perplexity": 54.897591469211974,
"eval_loss": 2.068084955215454,
"eval_perplexity": 7.909661249131722,
"eval_runtime": 486.3798,
"eval_samples_per_second": 20.56,
"eval_steps_per_second": 1.285,
"step": 50000
},
{
"base_loss": 0.29778670057654383,
"epoch": 0.09632110595703125,
"grad_norm": 0.15356530249118805,
"learning_rate": 4.518404006958008e-05,
"lookahead_loss": 4.299853836059571,
"loss": 2.2988,
"step": 50500
},
{
"base_loss": 0.2963479610979557,
"epoch": 0.0972747802734375,
"grad_norm": 0.17061887681484222,
"learning_rate": 4.513635635375977e-05,
"lookahead_loss": 4.333408058643341,
"loss": 2.3149,
"step": 51000
},
{
"base_loss": 0.3014124562442303,
"epoch": 0.09822845458984375,
"grad_norm": 0.19273534417152405,
"learning_rate": 4.508867263793945e-05,
"lookahead_loss": 4.332042829036713,
"loss": 2.3167,
"step": 51500
},
{
"base_loss": 0.3079428587257862,
"epoch": 0.09918212890625,
"grad_norm": 0.17310389876365662,
"learning_rate": 4.5040988922119143e-05,
"lookahead_loss": 4.3280877280235295,
"loss": 2.318,
"step": 52000
},
{
"base_loss": 0.3165371402204037,
"epoch": 0.10013580322265625,
"grad_norm": 0.2102889120578766,
"learning_rate": 4.499330520629883e-05,
"lookahead_loss": 4.3219221534729,
"loss": 2.3192,
"step": 52500
},
{
"base_loss": 0.3282755868136883,
"epoch": 0.1010894775390625,
"grad_norm": 0.12816853821277618,
"learning_rate": 4.494562149047852e-05,
"lookahead_loss": 4.345856199264526,
"loss": 2.3371,
"step": 53000
},
{
"base_loss": 0.3201599704921246,
"epoch": 0.10204315185546875,
"grad_norm": 0.18837909400463104,
"learning_rate": 4.489793777465821e-05,
"lookahead_loss": 4.310313300609589,
"loss": 2.3152,
"step": 53500
},
{
"base_loss": 0.29424001121521,
"epoch": 0.102996826171875,
"grad_norm": 0.20326119661331177,
"learning_rate": 4.485025405883789e-05,
"lookahead_loss": 4.260730008602143,
"loss": 2.2775,
"step": 54000
},
{
"base_loss": 0.3039598934650421,
"epoch": 0.10395050048828125,
"grad_norm": 0.19273315370082855,
"learning_rate": 4.480257034301758e-05,
"lookahead_loss": 4.278165885925293,
"loss": 2.2911,
"step": 54500
},
{
"base_loss": 0.3027501743733883,
"epoch": 0.1049041748046875,
"grad_norm": 0.31297221779823303,
"learning_rate": 4.4754886627197264e-05,
"lookahead_loss": 4.322804376602173,
"loss": 2.3128,
"step": 55000
},
{
"epoch": 0.1049041748046875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.9343780502319334,
"eval_lookahead_perplexity": 51.130339562617046,
"eval_loss": 2.0325393676757812,
"eval_perplexity": 7.633445893645447,
"eval_runtime": 484.3719,
"eval_samples_per_second": 20.645,
"eval_steps_per_second": 1.29,
"step": 55000
},
{
"base_loss": 0.30902848035097125,
"epoch": 0.10585784912109375,
"grad_norm": 0.14435261487960815,
"learning_rate": 4.4707202911376955e-05,
"lookahead_loss": 4.311323729991913,
"loss": 2.3102,
"step": 55500
},
{
"base_loss": 0.32790091571211816,
"epoch": 0.1068115234375,
"grad_norm": 0.14303159713745117,
"learning_rate": 4.4659519195556645e-05,
"lookahead_loss": 4.295704743385315,
"loss": 2.3118,
"step": 56000
},
{
"base_loss": 0.34225816893577576,
"epoch": 0.10776519775390625,
"grad_norm": 0.16590921580791473,
"learning_rate": 4.461183547973633e-05,
"lookahead_loss": 4.334332738399506,
"loss": 2.3383,
"step": 56500
},
{
"base_loss": 0.378170046120882,
"epoch": 0.1087188720703125,
"grad_norm": 0.13906623423099518,
"learning_rate": 4.456415176391602e-05,
"lookahead_loss": 4.338398130893707,
"loss": 2.3583,
"step": 57000
},
{
"base_loss": 0.29169481843709943,
"epoch": 0.10967254638671875,
"grad_norm": 0.13996054232120514,
"learning_rate": 4.45164680480957e-05,
"lookahead_loss": 4.234890432357788,
"loss": 2.2633,
"step": 57500
},
{
"base_loss": 0.29581671801209447,
"epoch": 0.110626220703125,
"grad_norm": 0.20492452383041382,
"learning_rate": 4.446878433227539e-05,
"lookahead_loss": 4.2333492503166195,
"loss": 2.2646,
"step": 58000
},
{
"base_loss": 0.30925117334723473,
"epoch": 0.11157989501953125,
"grad_norm": 0.15514181554317474,
"learning_rate": 4.442110061645508e-05,
"lookahead_loss": 4.287873956203461,
"loss": 2.2986,
"step": 58500
},
{
"base_loss": 0.3024054784178734,
"epoch": 0.1125335693359375,
"grad_norm": 0.13332504034042358,
"learning_rate": 4.4373416900634766e-05,
"lookahead_loss": 4.307773890495301,
"loss": 2.3051,
"step": 59000
},
{
"base_loss": 0.3064781714081764,
"epoch": 0.11348724365234375,
"grad_norm": 0.15052156150341034,
"learning_rate": 4.4325733184814456e-05,
"lookahead_loss": 4.252724995613098,
"loss": 2.2796,
"step": 59500
},
{
"base_loss": 0.33560348653793337,
"epoch": 0.11444091796875,
"grad_norm": 0.1650613248348236,
"learning_rate": 4.427804946899414e-05,
"lookahead_loss": 4.293933411121368,
"loss": 2.3148,
"step": 60000
},
{
"epoch": 0.11444091796875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.873882612991333,
"eval_lookahead_perplexity": 48.12888963812929,
"eval_loss": 2.002291679382324,
"eval_perplexity": 7.406008864179334,
"eval_runtime": 485.5371,
"eval_samples_per_second": 20.596,
"eval_steps_per_second": 1.287,
"step": 60000
},
{
"base_loss": 0.32748540678620336,
"epoch": 0.11539459228515625,
"grad_norm": 0.17119185626506805,
"learning_rate": 4.423036575317383e-05,
"lookahead_loss": 4.253311916828156,
"loss": 2.2904,
"step": 60500
},
{
"base_loss": 0.31137031635642054,
"epoch": 0.1163482666015625,
"grad_norm": 0.13608764111995697,
"learning_rate": 4.418268203735352e-05,
"lookahead_loss": 4.232373682975769,
"loss": 2.2719,
"step": 61000
},
{
"base_loss": 0.29493060091137885,
"epoch": 0.11730194091796875,
"grad_norm": 0.18083657324314117,
"learning_rate": 4.41349983215332e-05,
"lookahead_loss": 4.186727853775024,
"loss": 2.2408,
"step": 61500
},
{
"base_loss": 0.29388627085089686,
"epoch": 0.118255615234375,
"grad_norm": 0.1371856927871704,
"learning_rate": 4.4087314605712893e-05,
"lookahead_loss": 4.225665160179139,
"loss": 2.2598,
"step": 62000
},
{
"base_loss": 0.3018366146683693,
"epoch": 0.11920928955078125,
"grad_norm": 0.20487329363822937,
"learning_rate": 4.403963088989258e-05,
"lookahead_loss": 4.264792753696442,
"loss": 2.2833,
"step": 62500
},
{
"base_loss": 0.3014587008357048,
"epoch": 0.1201629638671875,
"grad_norm": 0.150614932179451,
"learning_rate": 4.399194717407227e-05,
"lookahead_loss": 4.226875496387482,
"loss": 2.2642,
"step": 63000
},
{
"base_loss": 0.3339847734570503,
"epoch": 0.12111663818359375,
"grad_norm": 0.14908407628536224,
"learning_rate": 4.394426345825196e-05,
"lookahead_loss": 4.266850714683533,
"loss": 2.3004,
"step": 63500
},
{
"base_loss": 0.3062296485900879,
"epoch": 0.1220703125,
"grad_norm": 0.14350071549415588,
"learning_rate": 4.389657974243164e-05,
"lookahead_loss": 4.188070932388306,
"loss": 2.2471,
"step": 64000
},
{
"base_loss": 0.30985459744930266,
"epoch": 0.12302398681640625,
"grad_norm": 0.1504562944173813,
"learning_rate": 4.384889602661133e-05,
"lookahead_loss": 4.190308025836945,
"loss": 2.2501,
"step": 64500
},
{
"base_loss": 0.2970647314786911,
"epoch": 0.1239776611328125,
"grad_norm": 0.1937413513660431,
"learning_rate": 4.3801212310791014e-05,
"lookahead_loss": 4.1838707237243655,
"loss": 2.2405,
"step": 65000
},
{
"epoch": 0.1239776611328125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.8223541236877443,
"eval_lookahead_perplexity": 45.71169273408033,
"eval_loss": 1.9765273332595825,
"eval_perplexity": 7.2176349735977094,
"eval_runtime": 484.8269,
"eval_samples_per_second": 20.626,
"eval_steps_per_second": 1.289,
"step": 65000
},
{
"base_loss": 0.3036232475936413,
"epoch": 0.12493133544921875,
"grad_norm": 0.1427551954984665,
"learning_rate": 4.3753528594970705e-05,
"lookahead_loss": 4.240097631454468,
"loss": 2.2719,
"step": 65500
},
{
"base_loss": 0.3062092212736607,
"epoch": 0.125885009765625,
"grad_norm": 0.1701672226190567,
"learning_rate": 4.3705844879150395e-05,
"lookahead_loss": 4.233000110626221,
"loss": 2.2696,
"step": 66000
},
{
"base_loss": 0.3083333975672722,
"epoch": 0.12683868408203125,
"grad_norm": 0.1478368043899536,
"learning_rate": 4.365816116333008e-05,
"lookahead_loss": 4.2112864146232605,
"loss": 2.2598,
"step": 66500
},
{
"base_loss": 0.32698157826066016,
"epoch": 0.1277923583984375,
"grad_norm": 0.1420971006155014,
"learning_rate": 4.361047744750977e-05,
"lookahead_loss": 4.215134252548218,
"loss": 2.2711,
"step": 67000
},
{
"base_loss": 0.3078202583193779,
"epoch": 0.12874603271484375,
"grad_norm": 0.23012402653694153,
"learning_rate": 4.356279373168945e-05,
"lookahead_loss": 4.165988350391388,
"loss": 2.2369,
"step": 67500
},
{
"base_loss": 0.3053109573423862,
"epoch": 0.12969970703125,
"grad_norm": 0.18060369789600372,
"learning_rate": 4.351511001586914e-05,
"lookahead_loss": 4.1732166509628295,
"loss": 2.2393,
"step": 68000
},
{
"base_loss": 0.3097400109171867,
"epoch": 0.13065338134765625,
"grad_norm": 0.15356962382793427,
"learning_rate": 4.346742630004883e-05,
"lookahead_loss": 4.150314831733704,
"loss": 2.23,
"step": 68500
},
{
"base_loss": 0.3077418188452721,
"epoch": 0.1316070556640625,
"grad_norm": 0.16534677147865295,
"learning_rate": 4.3419742584228516e-05,
"lookahead_loss": 4.2109439077377315,
"loss": 2.2593,
"step": 69000
},
{
"base_loss": 0.3032139558494091,
"epoch": 0.13256072998046875,
"grad_norm": 0.1445273905992508,
"learning_rate": 4.3372058868408206e-05,
"lookahead_loss": 4.206652547359466,
"loss": 2.2549,
"step": 69500
},
{
"base_loss": 0.3113168263733387,
"epoch": 0.133514404296875,
"grad_norm": 0.14419260621070862,
"learning_rate": 4.332437515258789e-05,
"lookahead_loss": 4.168332862854004,
"loss": 2.2398,
"step": 70000
},
{
"epoch": 0.133514404296875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.775430511856079,
"eval_lookahead_perplexity": 43.616281600719844,
"eval_loss": 1.9530656337738037,
"eval_perplexity": 7.050268024589573,
"eval_runtime": 481.81,
"eval_samples_per_second": 20.755,
"eval_steps_per_second": 1.297,
"step": 70000
},
{
"base_loss": 0.33833707132935525,
"epoch": 0.13446807861328125,
"grad_norm": 0.19232244789600372,
"learning_rate": 4.327669143676758e-05,
"lookahead_loss": 4.234199364185333,
"loss": 2.2863,
"step": 70500
},
{
"base_loss": 0.31973540037870407,
"epoch": 0.1354217529296875,
"grad_norm": 0.2945224344730377,
"learning_rate": 4.322900772094727e-05,
"lookahead_loss": 4.172246160030365,
"loss": 2.246,
"step": 71000
},
{
"base_loss": 0.3021405778825283,
"epoch": 0.13637542724609375,
"grad_norm": 0.21431593596935272,
"learning_rate": 4.318132400512695e-05,
"lookahead_loss": 4.1314473094940185,
"loss": 2.2168,
"step": 71500
},
{
"base_loss": 0.2982295399904251,
"epoch": 0.1373291015625,
"grad_norm": 0.17282553017139435,
"learning_rate": 4.3133640289306643e-05,
"lookahead_loss": 4.108474971294403,
"loss": 2.2034,
"step": 72000
},
{
"base_loss": 0.3037794386148453,
"epoch": 0.13828277587890625,
"grad_norm": 0.18910439312458038,
"learning_rate": 4.308595657348633e-05,
"lookahead_loss": 4.186692704200745,
"loss": 2.2452,
"step": 72500
},
{
"base_loss": 0.30183823220431805,
"epoch": 0.1392364501953125,
"grad_norm": 0.1664671003818512,
"learning_rate": 4.303827285766602e-05,
"lookahead_loss": 4.19723641204834,
"loss": 2.2495,
"step": 73000
},
{
"base_loss": 0.30403331050276755,
"epoch": 0.14019012451171875,
"grad_norm": 0.1586393415927887,
"learning_rate": 4.299058914184571e-05,
"lookahead_loss": 4.1403014822006226,
"loss": 2.2222,
"step": 73500
},
{
"base_loss": 0.3824539307653904,
"epoch": 0.141143798828125,
"grad_norm": 0.17738763988018036,
"learning_rate": 4.294290542602539e-05,
"lookahead_loss": 4.264650414466858,
"loss": 2.3236,
"step": 74000
},
{
"base_loss": 0.3024712265729904,
"epoch": 0.14209747314453125,
"grad_norm": 0.15253642201423645,
"learning_rate": 4.289522171020508e-05,
"lookahead_loss": 4.125599290370941,
"loss": 2.214,
"step": 74500
},
{
"base_loss": 0.3230967881381512,
"epoch": 0.1430511474609375,
"grad_norm": 0.19057177007198334,
"learning_rate": 4.2847537994384764e-05,
"lookahead_loss": 4.12642933511734,
"loss": 2.2248,
"step": 75000
},
{
"epoch": 0.1430511474609375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.7355900829315187,
"eval_lookahead_perplexity": 41.912750266117726,
"eval_loss": 1.93314528465271,
"eval_perplexity": 6.911213826492632,
"eval_runtime": 481.4425,
"eval_samples_per_second": 20.771,
"eval_steps_per_second": 1.298,
"step": 75000
},
{
"base_loss": 0.30004050022363665,
"epoch": 0.14400482177734375,
"grad_norm": 0.17912127077579498,
"learning_rate": 4.2799854278564455e-05,
"lookahead_loss": 4.098903262138367,
"loss": 2.1995,
"step": 75500
},
{
"base_loss": 0.31687936970591546,
"epoch": 0.14495849609375,
"grad_norm": 0.15588033199310303,
"learning_rate": 4.2752170562744145e-05,
"lookahead_loss": 4.163789962291718,
"loss": 2.2403,
"step": 76000
},
{
"base_loss": 0.3090082891881466,
"epoch": 0.14591217041015625,
"grad_norm": 0.15440773963928223,
"learning_rate": 4.270448684692383e-05,
"lookahead_loss": 4.168988561153411,
"loss": 2.239,
"step": 76500
},
{
"base_loss": 0.30597079479694367,
"epoch": 0.1468658447265625,
"grad_norm": 0.13688406348228455,
"learning_rate": 4.265680313110352e-05,
"lookahead_loss": 4.118725947856903,
"loss": 2.2123,
"step": 77000
},
{
"base_loss": 0.34478209909796714,
"epoch": 0.14781951904296875,
"grad_norm": 0.14708669483661652,
"learning_rate": 4.26091194152832e-05,
"lookahead_loss": 4.186521942615509,
"loss": 2.2657,
"step": 77500
},
{
"base_loss": 0.3085326923131943,
"epoch": 0.148773193359375,
"grad_norm": 0.1381761133670807,
"learning_rate": 4.256143569946289e-05,
"lookahead_loss": 4.102399334430695,
"loss": 2.2055,
"step": 78000
},
{
"base_loss": 0.31045393279194833,
"epoch": 0.14972686767578125,
"grad_norm": 0.18813666701316833,
"learning_rate": 4.251375198364258e-05,
"lookahead_loss": 4.1153163766860965,
"loss": 2.2129,
"step": 78500
},
{
"base_loss": 0.2935263271927834,
"epoch": 0.1506805419921875,
"grad_norm": 0.21148359775543213,
"learning_rate": 4.2466068267822266e-05,
"lookahead_loss": 4.083489650249481,
"loss": 2.1885,
"step": 79000
},
{
"base_loss": 0.3048303987979889,
"epoch": 0.15163421630859375,
"grad_norm": 0.16598311066627502,
"learning_rate": 4.2418384552001956e-05,
"lookahead_loss": 4.14640766620636,
"loss": 2.2256,
"step": 79500
},
{
"base_loss": 0.3079901858270168,
"epoch": 0.152587890625,
"grad_norm": 0.2019839733839035,
"learning_rate": 4.237070083618164e-05,
"lookahead_loss": 4.159954071044922,
"loss": 2.234,
"step": 80000
},
{
"epoch": 0.152587890625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.698641411590576,
"eval_lookahead_perplexity": 40.392390432330735,
"eval_loss": 1.9146709442138672,
"eval_perplexity": 6.784705882914828,
"eval_runtime": 485.524,
"eval_samples_per_second": 20.596,
"eval_steps_per_second": 1.287,
"step": 80000
},
{
"base_loss": 0.32198003405332565,
"epoch": 0.15354156494140625,
"grad_norm": 0.14169633388519287,
"learning_rate": 4.232301712036133e-05,
"lookahead_loss": 4.123985225200653,
"loss": 2.223,
"step": 80500
},
{
"base_loss": 0.3435799330174923,
"epoch": 0.1544952392578125,
"grad_norm": 0.2051265984773636,
"learning_rate": 4.227533340454102e-05,
"lookahead_loss": 4.174958533763886,
"loss": 2.2593,
"step": 81000
},
{
"base_loss": 0.3113198747932911,
"epoch": 0.15544891357421875,
"grad_norm": 0.22451823949813843,
"learning_rate": 4.22276496887207e-05,
"lookahead_loss": 4.103942976951599,
"loss": 2.2076,
"step": 81500
},
{
"base_loss": 0.2957187399119139,
"epoch": 0.156402587890625,
"grad_norm": 0.19755777716636658,
"learning_rate": 4.2179965972900393e-05,
"lookahead_loss": 4.071250169277191,
"loss": 2.1835,
"step": 82000
},
{
"base_loss": 0.29787460842728614,
"epoch": 0.15735626220703125,
"grad_norm": 0.14888489246368408,
"learning_rate": 4.213228225708008e-05,
"lookahead_loss": 4.07864601278305,
"loss": 2.1883,
"step": 82500
},
{
"base_loss": 0.3036723498404026,
"epoch": 0.1583099365234375,
"grad_norm": 0.14837269484996796,
"learning_rate": 4.208459854125977e-05,
"lookahead_loss": 4.1651582074165345,
"loss": 2.2344,
"step": 83000
},
{
"base_loss": 0.3110756404399872,
"epoch": 0.15926361083984375,
"grad_norm": 0.1456403285264969,
"learning_rate": 4.203691482543946e-05,
"lookahead_loss": 4.12848590517044,
"loss": 2.2198,
"step": 83500
},
{
"base_loss": 0.3243219917714596,
"epoch": 0.16021728515625,
"grad_norm": 0.1554984450340271,
"learning_rate": 4.198923110961914e-05,
"lookahead_loss": 4.138046524524689,
"loss": 2.2312,
"step": 84000
},
{
"base_loss": 0.3183397548496723,
"epoch": 0.16117095947265625,
"grad_norm": 0.15073776245117188,
"learning_rate": 4.194154739379883e-05,
"lookahead_loss": 4.095347220897675,
"loss": 2.2068,
"step": 84500
},
{
"base_loss": 0.330243824750185,
"epoch": 0.1621246337890625,
"grad_norm": 0.20515553653240204,
"learning_rate": 4.1893863677978514e-05,
"lookahead_loss": 4.1033834571838375,
"loss": 2.2168,
"step": 85000
},
{
"epoch": 0.1621246337890625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.6668980613708495,
"eval_lookahead_perplexity": 39.130337503517424,
"eval_loss": 1.898799180984497,
"eval_perplexity": 6.677870711585694,
"eval_runtime": 491.2668,
"eval_samples_per_second": 20.356,
"eval_steps_per_second": 1.272,
"step": 85000
},
{
"base_loss": 0.28920619750022886,
"epoch": 0.16307830810546875,
"grad_norm": 0.13687625527381897,
"learning_rate": 4.1846179962158205e-05,
"lookahead_loss": 4.051657072067261,
"loss": 2.1704,
"step": 85500
},
{
"base_loss": 0.3127485100328922,
"epoch": 0.164031982421875,
"grad_norm": 0.1961473971605301,
"learning_rate": 4.1798496246337895e-05,
"lookahead_loss": 4.111975946426392,
"loss": 2.2124,
"step": 86000
},
{
"base_loss": 0.3023957554399967,
"epoch": 0.16498565673828125,
"grad_norm": 0.21571995317935944,
"learning_rate": 4.175081253051758e-05,
"lookahead_loss": 4.115322601795197,
"loss": 2.2089,
"step": 86500
},
{
"base_loss": 0.3064508207142353,
"epoch": 0.1659393310546875,
"grad_norm": 0.145101398229599,
"learning_rate": 4.170312881469727e-05,
"lookahead_loss": 4.093011445045471,
"loss": 2.1997,
"step": 87000
},
{
"base_loss": 0.33141738665103915,
"epoch": 0.16689300537109375,
"grad_norm": 0.13913673162460327,
"learning_rate": 4.165544509887695e-05,
"lookahead_loss": 4.137466729640961,
"loss": 2.2344,
"step": 87500
},
{
"base_loss": 0.3255680377185345,
"epoch": 0.1678466796875,
"grad_norm": 0.1342954784631729,
"learning_rate": 4.160776138305664e-05,
"lookahead_loss": 4.086215874195099,
"loss": 2.2059,
"step": 88000
},
{
"base_loss": 0.3133760218322277,
"epoch": 0.16880035400390625,
"grad_norm": 0.15926498174667358,
"learning_rate": 4.156007766723633e-05,
"lookahead_loss": 4.077684763908386,
"loss": 2.1955,
"step": 88500
},
{
"base_loss": 0.29871078038215637,
"epoch": 0.1697540283203125,
"grad_norm": 0.25558069348335266,
"learning_rate": 4.1512393951416016e-05,
"lookahead_loss": 4.051617414474487,
"loss": 2.1752,
"step": 89000
},
{
"base_loss": 0.29546582013368605,
"epoch": 0.17070770263671875,
"grad_norm": 0.1507255733013153,
"learning_rate": 4.1464710235595706e-05,
"lookahead_loss": 4.0750104126930236,
"loss": 2.1852,
"step": 89500
},
{
"base_loss": 0.2991917096078396,
"epoch": 0.171661376953125,
"grad_norm": 0.13587379455566406,
"learning_rate": 4.141702651977539e-05,
"lookahead_loss": 4.071310368061066,
"loss": 2.1853,
"step": 90000
},
{
"epoch": 0.171661376953125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.63601548538208,
"eval_lookahead_perplexity": 37.94036122372511,
"eval_loss": 1.8833580017089844,
"eval_perplexity": 6.575548533346909,
"eval_runtime": 499.4539,
"eval_samples_per_second": 20.022,
"eval_steps_per_second": 1.251,
"step": 90000
},
{
"base_loss": 0.297812943726778,
"epoch": 0.17261505126953125,
"grad_norm": 0.16026511788368225,
"learning_rate": 4.136934280395508e-05,
"lookahead_loss": 4.0754151496887205,
"loss": 2.1866,
"step": 90500
},
{
"base_loss": 0.30764649564027785,
"epoch": 0.1735687255859375,
"grad_norm": 0.48659810423851013,
"learning_rate": 4.132165908813477e-05,
"lookahead_loss": 4.057919836521148,
"loss": 2.1828,
"step": 91000
},
{
"base_loss": 0.3142555268108845,
"epoch": 0.17452239990234375,
"grad_norm": 0.18999813497066498,
"learning_rate": 4.127397537231445e-05,
"lookahead_loss": 4.059666626930237,
"loss": 2.187,
"step": 91500
},
{
"base_loss": 0.3496557460427284,
"epoch": 0.17547607421875,
"grad_norm": 0.17107349634170532,
"learning_rate": 4.1226291656494143e-05,
"lookahead_loss": 4.136337936401367,
"loss": 2.243,
"step": 92000
},
{
"base_loss": 0.3016935878098011,
"epoch": 0.17642974853515625,
"grad_norm": 0.23676873743534088,
"learning_rate": 4.117860794067383e-05,
"lookahead_loss": 4.028763621330262,
"loss": 2.1652,
"step": 92500
},
{
"base_loss": 0.300347177952528,
"epoch": 0.1773834228515625,
"grad_norm": 0.1585322767496109,
"learning_rate": 4.113092422485352e-05,
"lookahead_loss": 4.0394937310218815,
"loss": 2.1699,
"step": 93000
},
{
"base_loss": 0.3073237894177437,
"epoch": 0.17833709716796875,
"grad_norm": 0.23585672676563263,
"learning_rate": 4.108324050903321e-05,
"lookahead_loss": 4.032268433570862,
"loss": 2.1698,
"step": 93500
},
{
"base_loss": 0.3024279504716396,
"epoch": 0.179290771484375,
"grad_norm": 0.17836162447929382,
"learning_rate": 4.103555679321289e-05,
"lookahead_loss": 4.024306795597076,
"loss": 2.1634,
"step": 94000
},
{
"base_loss": 0.3055584655106068,
"epoch": 0.18024444580078125,
"grad_norm": 0.14800947904586792,
"learning_rate": 4.098787307739258e-05,
"lookahead_loss": 4.064500618457794,
"loss": 2.185,
"step": 94500
},
{
"base_loss": 0.29581878417730334,
"epoch": 0.1811981201171875,
"grad_norm": 0.16330750286579132,
"learning_rate": 4.0940189361572264e-05,
"lookahead_loss": 4.053201458930969,
"loss": 2.1745,
"step": 95000
},
{
"epoch": 0.1811981201171875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.608971565246582,
"eval_lookahead_perplexity": 36.9280551838023,
"eval_loss": 1.8698359727859497,
"eval_perplexity": 6.487232229383202,
"eval_runtime": 547.0525,
"eval_samples_per_second": 18.28,
"eval_steps_per_second": 1.142,
"step": 95000
},
{
"base_loss": 0.2980945122539997,
"epoch": 0.18215179443359375,
"grad_norm": 0.14234571158885956,
"learning_rate": 4.0892505645751955e-05,
"lookahead_loss": 4.067723794937134,
"loss": 2.1829,
"step": 95500
},
{
"base_loss": 0.2986116451621056,
"epoch": 0.18310546875,
"grad_norm": 0.15030185878276825,
"learning_rate": 4.0844821929931645e-05,
"lookahead_loss": 4.031040393829346,
"loss": 2.1648,
"step": 96000
},
{
"base_loss": 0.31211488363146783,
"epoch": 0.18405914306640625,
"grad_norm": 0.2024800330400467,
"learning_rate": 4.079713821411133e-05,
"lookahead_loss": 4.039662356376648,
"loss": 2.1759,
"step": 96500
},
{
"base_loss": 0.3342977456152439,
"epoch": 0.1850128173828125,
"grad_norm": 0.18318872153759003,
"learning_rate": 4.074945449829102e-05,
"lookahead_loss": 4.070985550403595,
"loss": 2.2026,
"step": 97000
},
{
"base_loss": 0.31514875215291976,
"epoch": 0.18596649169921875,
"grad_norm": 0.14978346228599548,
"learning_rate": 4.07017707824707e-05,
"lookahead_loss": 4.028551621437073,
"loss": 2.1718,
"step": 97500
},
{
"base_loss": 0.3053509466052055,
"epoch": 0.186920166015625,
"grad_norm": 0.2080519199371338,
"learning_rate": 4.065408706665039e-05,
"lookahead_loss": 4.030301760673523,
"loss": 2.1678,
"step": 98000
},
{
"base_loss": 0.29078236150741577,
"epoch": 0.18787384033203125,
"grad_norm": 0.16793227195739746,
"learning_rate": 4.060640335083008e-05,
"lookahead_loss": 3.9797529973983763,
"loss": 2.1353,
"step": 98500
},
{
"base_loss": 0.29214190459251405,
"epoch": 0.1888275146484375,
"grad_norm": 0.19143177568912506,
"learning_rate": 4.0558719635009766e-05,
"lookahead_loss": 4.013256893157959,
"loss": 2.1527,
"step": 99000
},
{
"base_loss": 0.2928229000866413,
"epoch": 0.18978118896484375,
"grad_norm": 0.2626541554927826,
"learning_rate": 4.0511035919189456e-05,
"lookahead_loss": 4.013317709445953,
"loss": 2.1531,
"step": 99500
},
{
"base_loss": 0.29795382434129714,
"epoch": 0.19073486328125,
"grad_norm": 0.1662345677614212,
"learning_rate": 4.046335220336914e-05,
"lookahead_loss": 4.026696702957153,
"loss": 2.1623,
"step": 100000
},
{
"epoch": 0.19073486328125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.5836494529724123,
"eval_lookahead_perplexity": 36.00469882921293,
"eval_loss": 1.8571749925613403,
"eval_perplexity": 6.405615275997352,
"eval_runtime": 518.6821,
"eval_samples_per_second": 19.28,
"eval_steps_per_second": 1.205,
"step": 100000
},
{
"base_loss": 0.29810027703642844,
"epoch": 0.19168853759765625,
"grad_norm": 0.1798204481601715,
"learning_rate": 4.041566848754883e-05,
"lookahead_loss": 4.021183431148529,
"loss": 2.1596,
"step": 100500
},
{
"base_loss": 0.30463395109772684,
"epoch": 0.1926422119140625,
"grad_norm": 0.15981730818748474,
"learning_rate": 4.036798477172852e-05,
"lookahead_loss": 4.006066414356232,
"loss": 2.1553,
"step": 101000
},
{
"base_loss": 0.33558367761969565,
"epoch": 0.19359588623046875,
"grad_norm": 0.21296393871307373,
"learning_rate": 4.03203010559082e-05,
"lookahead_loss": 4.060854420661927,
"loss": 2.1982,
"step": 101500
},
{
"base_loss": 0.3199073303639889,
"epoch": 0.194549560546875,
"grad_norm": 0.1356714516878128,
"learning_rate": 4.0272617340087893e-05,
"lookahead_loss": 4.027761749267578,
"loss": 2.1738,
"step": 102000
},
{
"base_loss": 0.30011415255069734,
"epoch": 0.19550323486328125,
"grad_norm": 0.15449336171150208,
"learning_rate": 4.022493362426758e-05,
"lookahead_loss": 3.9922111649513243,
"loss": 2.1462,
"step": 102500
},
{
"base_loss": 0.2923199172616005,
"epoch": 0.1964569091796875,
"grad_norm": 0.18760551512241364,
"learning_rate": 4.017724990844727e-05,
"lookahead_loss": 3.9661373071670534,
"loss": 2.1292,
"step": 103000
},
{
"base_loss": 0.29327082937955856,
"epoch": 0.19741058349609375,
"grad_norm": 0.2747306823730469,
"learning_rate": 4.012956619262696e-05,
"lookahead_loss": 4.003834127426147,
"loss": 2.1486,
"step": 103500
},
{
"base_loss": 0.2999972744882107,
"epoch": 0.1983642578125,
"grad_norm": 0.14901545643806458,
"learning_rate": 4.008188247680664e-05,
"lookahead_loss": 4.033395376682281,
"loss": 2.1667,
"step": 104000
},
{
"base_loss": 0.2973039819002152,
"epoch": 0.19931793212890625,
"grad_norm": 0.23263269662857056,
"learning_rate": 4.003419876098633e-05,
"lookahead_loss": 4.03858584690094,
"loss": 2.1679,
"step": 104500
},
{
"base_loss": 0.3063103293478489,
"epoch": 0.2002716064453125,
"grad_norm": 0.17267969250679016,
"learning_rate": 3.9986515045166014e-05,
"lookahead_loss": 4.011103184700012,
"loss": 2.1587,
"step": 105000
},
{
"epoch": 0.2002716064453125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.5608823120117186,
"eval_lookahead_perplexity": 35.19423574717032,
"eval_loss": 1.8457914590835571,
"eval_perplexity": 6.333110204970354,
"eval_runtime": 490.4884,
"eval_samples_per_second": 20.388,
"eval_steps_per_second": 1.274,
"step": 105000
},
{
"base_loss": 0.3124685201644897,
"epoch": 0.20122528076171875,
"grad_norm": 0.15897978842258453,
"learning_rate": 3.9938831329345705e-05,
"lookahead_loss": 4.016992502689361,
"loss": 2.1647,
"step": 105500
},
{
"base_loss": 0.3321477827131748,
"epoch": 0.202178955078125,
"grad_norm": 0.14868295192718506,
"learning_rate": 3.9891147613525395e-05,
"lookahead_loss": 4.048239949703216,
"loss": 2.1902,
"step": 106000
},
{
"base_loss": 0.2998480386734009,
"epoch": 0.20313262939453125,
"grad_norm": 0.1452517807483673,
"learning_rate": 3.984346389770508e-05,
"lookahead_loss": 3.990150417327881,
"loss": 2.145,
"step": 106500
},
{
"base_loss": 0.30396230933070184,
"epoch": 0.2040863037109375,
"grad_norm": 0.15313097834587097,
"learning_rate": 3.979578018188477e-05,
"lookahead_loss": 3.9821047258377074,
"loss": 2.143,
"step": 107000
},
{
"base_loss": 0.30401156124472617,
"epoch": 0.20503997802734375,
"grad_norm": 0.19967354834079742,
"learning_rate": 3.974809646606445e-05,
"lookahead_loss": 3.970831639289856,
"loss": 2.1374,
"step": 107500
},
{
"base_loss": 0.29991251334547997,
"epoch": 0.20599365234375,
"grad_norm": 0.15913745760917664,
"learning_rate": 3.970041275024414e-05,
"lookahead_loss": 4.019209212779999,
"loss": 2.1596,
"step": 108000
},
{
"base_loss": 0.30137626150250435,
"epoch": 0.20694732666015625,
"grad_norm": 0.34907975792884827,
"learning_rate": 3.965272903442383e-05,
"lookahead_loss": 4.026716927051544,
"loss": 2.164,
"step": 108500
},
{
"base_loss": 0.2976263118684292,
"epoch": 0.2079010009765625,
"grad_norm": 0.1488516479730606,
"learning_rate": 3.9605045318603516e-05,
"lookahead_loss": 4.010719275474548,
"loss": 2.1542,
"step": 109000
},
{
"base_loss": 0.3038526868522167,
"epoch": 0.20885467529296875,
"grad_norm": 0.18493635952472687,
"learning_rate": 3.9557361602783206e-05,
"lookahead_loss": 3.985397423744202,
"loss": 2.1446,
"step": 109500
},
{
"base_loss": 0.3238096301853657,
"epoch": 0.209808349609375,
"grad_norm": 0.1789693385362625,
"learning_rate": 3.950967788696289e-05,
"lookahead_loss": 4.032813804626465,
"loss": 2.1783,
"step": 110000
},
{
"epoch": 0.209808349609375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.5394810447692873,
"eval_lookahead_perplexity": 34.44903704327508,
"eval_loss": 1.8350908756256104,
"eval_perplexity": 6.265703519291424,
"eval_runtime": 500.8146,
"eval_samples_per_second": 19.967,
"eval_steps_per_second": 1.248,
"step": 110000
},
{
"base_loss": 0.31262540474534034,
"epoch": 0.21076202392578125,
"grad_norm": 0.12846623361110687,
"learning_rate": 3.946199417114258e-05,
"lookahead_loss": 3.9918526377677916,
"loss": 2.1522,
"step": 110500
},
{
"base_loss": 0.30065977996587756,
"epoch": 0.2117156982421875,
"grad_norm": 0.18878379464149475,
"learning_rate": 3.941431045532227e-05,
"lookahead_loss": 3.9505230865478516,
"loss": 2.1256,
"step": 111000
},
{
"base_loss": 0.29354970484972,
"epoch": 0.21266937255859375,
"grad_norm": 0.15901413559913635,
"learning_rate": 3.936662673950195e-05,
"lookahead_loss": 3.9420759234428404,
"loss": 2.1178,
"step": 111500
},
{
"base_loss": 0.29862283357977865,
"epoch": 0.213623046875,
"grad_norm": 0.15839649736881256,
"learning_rate": 3.9318943023681643e-05,
"lookahead_loss": 3.988606767177582,
"loss": 2.1436,
"step": 112000
},
{
"base_loss": 0.29591704466938973,
"epoch": 0.21457672119140625,
"grad_norm": 0.13983677327632904,
"learning_rate": 3.927125930786133e-05,
"lookahead_loss": 3.9879449706077574,
"loss": 2.1419,
"step": 112500
},
{
"base_loss": 0.2976507830321789,
"epoch": 0.2155303955078125,
"grad_norm": 0.15823175013065338,
"learning_rate": 3.922357559204102e-05,
"lookahead_loss": 3.9824121689796446,
"loss": 2.14,
"step": 113000
},
{
"base_loss": 0.3091459658145905,
"epoch": 0.21648406982421875,
"grad_norm": 0.12450090050697327,
"learning_rate": 3.917589187622071e-05,
"lookahead_loss": 3.9995364723205564,
"loss": 2.1543,
"step": 113500
},
{
"base_loss": 0.33537453308701515,
"epoch": 0.217437744140625,
"grad_norm": 0.18551339209079742,
"learning_rate": 3.912820816040039e-05,
"lookahead_loss": 4.021560028076172,
"loss": 2.1785,
"step": 114000
},
{
"base_loss": 0.32016737046837807,
"epoch": 0.21839141845703125,
"grad_norm": 0.17938633263111115,
"learning_rate": 3.908052444458008e-05,
"lookahead_loss": 3.9909664607048034,
"loss": 2.1556,
"step": 114500
},
{
"base_loss": 0.30930135017633437,
"epoch": 0.2193450927734375,
"grad_norm": 0.15878070890903473,
"learning_rate": 3.9032840728759764e-05,
"lookahead_loss": 3.963454393863678,
"loss": 2.1364,
"step": 115000
},
{
"epoch": 0.2193450927734375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.520967939376831,
"eval_lookahead_perplexity": 33.81714557401866,
"eval_loss": 1.8258343935012817,
"eval_perplexity": 6.207972750791024,
"eval_runtime": 509.2129,
"eval_samples_per_second": 19.638,
"eval_steps_per_second": 1.227,
"step": 115000
},
{
"base_loss": 0.3047959138453007,
"epoch": 0.22029876708984375,
"grad_norm": 0.20373134315013885,
"learning_rate": 3.8985157012939455e-05,
"lookahead_loss": 3.9657868213653567,
"loss": 2.1353,
"step": 115500
},
{
"base_loss": 0.29702088606357574,
"epoch": 0.22125244140625,
"grad_norm": 0.15585345029830933,
"learning_rate": 3.8937473297119145e-05,
"lookahead_loss": 3.9414098200798033,
"loss": 2.1192,
"step": 116000
},
{
"base_loss": 0.2947344943881035,
"epoch": 0.22220611572265625,
"grad_norm": 0.1686229705810547,
"learning_rate": 3.888978958129883e-05,
"lookahead_loss": 3.985533133983612,
"loss": 2.1401,
"step": 116500
},
{
"base_loss": 0.29663796299695966,
"epoch": 0.2231597900390625,
"grad_norm": 0.5541319251060486,
"learning_rate": 3.884210586547852e-05,
"lookahead_loss": 3.9794252281188967,
"loss": 2.138,
"step": 117000
},
{
"base_loss": 0.3019692142158747,
"epoch": 0.22411346435546875,
"grad_norm": 0.1443110853433609,
"learning_rate": 3.87944221496582e-05,
"lookahead_loss": 3.977426125049591,
"loss": 2.1397,
"step": 117500
},
{
"base_loss": 0.3167287348806858,
"epoch": 0.225067138671875,
"grad_norm": 0.1557740718126297,
"learning_rate": 3.874673843383789e-05,
"lookahead_loss": 3.9934216737747192,
"loss": 2.1551,
"step": 118000
},
{
"base_loss": 0.3616081200838089,
"epoch": 0.22602081298828125,
"grad_norm": 0.1348077803850174,
"learning_rate": 3.869905471801758e-05,
"lookahead_loss": 4.030314054965973,
"loss": 2.196,
"step": 118500
},
{
"base_loss": 0.32006074047088623,
"epoch": 0.2269744873046875,
"grad_norm": 0.16196569800376892,
"learning_rate": 3.8651371002197266e-05,
"lookahead_loss": 3.9777443284988405,
"loss": 2.1489,
"step": 119000
},
{
"base_loss": 0.30098330533504486,
"epoch": 0.22792816162109375,
"grad_norm": 0.17852458357810974,
"learning_rate": 3.8603687286376956e-05,
"lookahead_loss": 3.941022171020508,
"loss": 2.121,
"step": 119500
},
{
"base_loss": 0.29374924197793006,
"epoch": 0.2288818359375,
"grad_norm": 0.15789750218391418,
"learning_rate": 3.855600357055664e-05,
"lookahead_loss": 3.9294084067344666,
"loss": 2.1116,
"step": 120000
},
{
"epoch": 0.2288818359375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.503778175354004,
"eval_lookahead_perplexity": 33.24080459612871,
"eval_loss": 1.8172394037246704,
"eval_perplexity": 6.154843936341777,
"eval_runtime": 486.0945,
"eval_samples_per_second": 20.572,
"eval_steps_per_second": 1.286,
"step": 120000
},
{
"base_loss": 0.30340095722675325,
"epoch": 0.22983551025390625,
"grad_norm": 0.1268227994441986,
"learning_rate": 3.850831985473633e-05,
"lookahead_loss": 3.997514929294586,
"loss": 2.1505,
"step": 120500
},
{
"base_loss": 0.2969953828752041,
"epoch": 0.2307891845703125,
"grad_norm": 0.38102588057518005,
"learning_rate": 3.846063613891602e-05,
"lookahead_loss": 3.981098453044891,
"loss": 2.139,
"step": 121000
},
{
"base_loss": 0.3105453898310661,
"epoch": 0.23174285888671875,
"grad_norm": 0.2956802248954773,
"learning_rate": 3.84129524230957e-05,
"lookahead_loss": 3.9859898743629456,
"loss": 2.1483,
"step": 121500
},
{
"base_loss": 0.312623037725687,
"epoch": 0.232696533203125,
"grad_norm": 0.20321176946163177,
"learning_rate": 3.8365268707275393e-05,
"lookahead_loss": 3.9476396222114563,
"loss": 2.1301,
"step": 122000
},
{
"base_loss": 0.3441284774243832,
"epoch": 0.23365020751953125,
"grad_norm": 0.1659804880619049,
"learning_rate": 3.831758499145508e-05,
"lookahead_loss": 4.019089301586151,
"loss": 2.1816,
"step": 122500
},
{
"base_loss": 0.3158799746334553,
"epoch": 0.2346038818359375,
"grad_norm": 0.1387973129749298,
"learning_rate": 3.826990127563477e-05,
"lookahead_loss": 3.9791625356674194,
"loss": 2.1475,
"step": 123000
},
{
"base_loss": 0.3052534331381321,
"epoch": 0.23555755615234375,
"grad_norm": 0.18591973185539246,
"learning_rate": 3.822221755981446e-05,
"lookahead_loss": 3.9717899789810183,
"loss": 2.1385,
"step": 123500
},
{
"base_loss": 0.30015348917245865,
"epoch": 0.23651123046875,
"grad_norm": 0.14250557124614716,
"learning_rate": 3.817453384399414e-05,
"lookahead_loss": 3.9358688526153562,
"loss": 2.118,
"step": 124000
},
{
"base_loss": 0.30166161328554153,
"epoch": 0.23746490478515625,
"grad_norm": 0.15538230538368225,
"learning_rate": 3.812685012817383e-05,
"lookahead_loss": 3.9643000559806825,
"loss": 2.133,
"step": 124500
},
{
"base_loss": 0.2973581215441227,
"epoch": 0.2384185791015625,
"grad_norm": 0.1487993448972702,
"learning_rate": 3.8079166412353514e-05,
"lookahead_loss": 3.9540540752410887,
"loss": 2.1257,
"step": 125000
},
{
"epoch": 0.2384185791015625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.4853565464019773,
"eval_lookahead_perplexity": 32.63406059268419,
"eval_loss": 1.8080284595489502,
"eval_perplexity": 6.098412305711686,
"eval_runtime": 483.8181,
"eval_samples_per_second": 20.669,
"eval_steps_per_second": 1.292,
"step": 125000
},
{
"base_loss": 0.3022316889762878,
"epoch": 1.0009536743164062,
"grad_norm": 0.160108283162117,
"learning_rate": 3.8031482696533205e-05,
"lookahead_loss": 3.9718240842819212,
"loss": 2.137,
"step": 125500
},
{
"base_loss": 0.2998015423119068,
"epoch": 1.0019073486328125,
"grad_norm": 0.1442178189754486,
"learning_rate": 3.7983798980712895e-05,
"lookahead_loss": 3.9606360173225403,
"loss": 2.1302,
"step": 126000
},
{
"base_loss": 0.299979487746954,
"epoch": 1.0028610229492188,
"grad_norm": 0.21493926644325256,
"learning_rate": 3.793611526489258e-05,
"lookahead_loss": 3.956595335960388,
"loss": 2.1283,
"step": 126500
},
{
"base_loss": 0.30338721710443495,
"epoch": 1.003814697265625,
"grad_norm": 0.24776244163513184,
"learning_rate": 3.788843154907227e-05,
"lookahead_loss": 3.9652762699127195,
"loss": 2.1343,
"step": 127000
},
{
"base_loss": 0.30518372932076454,
"epoch": 1.0047683715820312,
"grad_norm": 0.28755414485931396,
"learning_rate": 3.784074783325195e-05,
"lookahead_loss": 3.941191442966461,
"loss": 2.1232,
"step": 127500
},
{
"base_loss": 0.3134980680346489,
"epoch": 1.0057220458984375,
"grad_norm": 0.16672217845916748,
"learning_rate": 3.779306411743164e-05,
"lookahead_loss": 3.9538578872680663,
"loss": 2.1337,
"step": 128000
},
{
"base_loss": 0.3313628733754158,
"epoch": 1.0066757202148438,
"grad_norm": 0.18939979374408722,
"learning_rate": 3.774538040161133e-05,
"lookahead_loss": 3.9658765501976014,
"loss": 2.1486,
"step": 128500
},
{
"base_loss": 0.31349258169531824,
"epoch": 1.00762939453125,
"grad_norm": 0.1514354944229126,
"learning_rate": 3.7697696685791016e-05,
"lookahead_loss": 3.946809937477112,
"loss": 2.1302,
"step": 129000
},
{
"base_loss": 0.31480157864093783,
"epoch": 1.0085830688476562,
"grad_norm": 0.14762338995933533,
"learning_rate": 3.7650012969970706e-05,
"lookahead_loss": 3.94188436794281,
"loss": 2.1283,
"step": 129500
},
{
"base_loss": 0.29147451075911524,
"epoch": 1.0095367431640625,
"grad_norm": 0.13176566362380981,
"learning_rate": 3.760232925415039e-05,
"lookahead_loss": 3.9021912565231323,
"loss": 2.0968,
"step": 130000
},
{
"epoch": 1.0095367431640625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.4698605045318605,
"eval_lookahead_perplexity": 32.13225982748101,
"eval_loss": 1.8002804517745972,
"eval_perplexity": 6.05134433671433,
"eval_runtime": 488.4824,
"eval_samples_per_second": 20.472,
"eval_steps_per_second": 1.279,
"step": 130000
},
{
"base_loss": 0.30006001514196395,
"epoch": 1.0104904174804688,
"grad_norm": 0.14655235409736633,
"learning_rate": 3.755464553833008e-05,
"lookahead_loss": 3.9301828422546388,
"loss": 2.1151,
"step": 130500
},
{
"base_loss": 0.2960101006031036,
"epoch": 1.011444091796875,
"grad_norm": 0.1633438616991043,
"learning_rate": 3.750696182250977e-05,
"lookahead_loss": 3.952839255809784,
"loss": 2.1244,
"step": 131000
},
{
"base_loss": 0.29892793264985085,
"epoch": 1.0123977661132812,
"grad_norm": 0.18194669485092163,
"learning_rate": 3.745927810668945e-05,
"lookahead_loss": 3.958092218399048,
"loss": 2.1285,
"step": 131500
},
{
"base_loss": 0.29978542965650556,
"epoch": 1.0133514404296875,
"grad_norm": 0.17729413509368896,
"learning_rate": 3.7411594390869143e-05,
"lookahead_loss": 3.952873631000519,
"loss": 2.1263,
"step": 132000
},
{
"base_loss": 0.3069946175217628,
"epoch": 1.0143051147460938,
"grad_norm": 0.17295287549495697,
"learning_rate": 3.736391067504883e-05,
"lookahead_loss": 3.938451060295105,
"loss": 2.1227,
"step": 132500
},
{
"base_loss": 0.3185223871767521,
"epoch": 1.0152587890625,
"grad_norm": 0.17466206848621368,
"learning_rate": 3.731622695922852e-05,
"lookahead_loss": 3.9561206374168396,
"loss": 2.1373,
"step": 133000
},
{
"base_loss": 0.321269671857357,
"epoch": 1.0162124633789062,
"grad_norm": 0.17413805425167084,
"learning_rate": 3.726854324340821e-05,
"lookahead_loss": 3.9549911060333254,
"loss": 2.1381,
"step": 133500
},
{
"base_loss": 0.29983696776628493,
"epoch": 1.0171661376953125,
"grad_norm": 0.1588663011789322,
"learning_rate": 3.722085952758789e-05,
"lookahead_loss": 3.919067234992981,
"loss": 2.1095,
"step": 134000
},
{
"base_loss": 0.31978207612037657,
"epoch": 1.0181198120117188,
"grad_norm": 0.28145942091941833,
"learning_rate": 3.717317581176758e-05,
"lookahead_loss": 3.9233868684768676,
"loss": 2.1216,
"step": 134500
},
{
"base_loss": 0.29129536652565,
"epoch": 1.019073486328125,
"grad_norm": 0.1842055320739746,
"learning_rate": 3.7125492095947264e-05,
"lookahead_loss": 3.8867431244850157,
"loss": 2.089,
"step": 135000
},
{
"epoch": 1.019073486328125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.455427379989624,
"eval_lookahead_perplexity": 31.671821693026402,
"eval_loss": 1.7930638790130615,
"eval_perplexity": 6.007831565384858,
"eval_runtime": 491.2385,
"eval_samples_per_second": 20.357,
"eval_steps_per_second": 1.272,
"step": 135000
},
{
"base_loss": 0.30145278573036194,
"epoch": 1.0200271606445312,
"grad_norm": 0.1709691286087036,
"learning_rate": 3.7077808380126955e-05,
"lookahead_loss": 3.9409879336357116,
"loss": 2.1212,
"step": 135500
},
{
"base_loss": 0.2936854332089424,
"epoch": 1.0209808349609375,
"grad_norm": 0.1593596488237381,
"learning_rate": 3.7030124664306645e-05,
"lookahead_loss": 3.926469912528992,
"loss": 2.1101,
"step": 136000
},
{
"base_loss": 0.3014064610004425,
"epoch": 1.0219345092773438,
"grad_norm": 0.6354989409446716,
"learning_rate": 3.698244094848633e-05,
"lookahead_loss": 3.9481157250404357,
"loss": 2.1248,
"step": 136500
},
{
"base_loss": 0.30362616485357286,
"epoch": 1.02288818359375,
"grad_norm": 0.16273941099643707,
"learning_rate": 3.693475723266602e-05,
"lookahead_loss": 3.9114366030693053,
"loss": 2.1075,
"step": 137000
},
{
"base_loss": 0.3144021729230881,
"epoch": 1.0238418579101562,
"grad_norm": 0.149958074092865,
"learning_rate": 3.68870735168457e-05,
"lookahead_loss": 3.929403066635132,
"loss": 2.1219,
"step": 137500
},
{
"base_loss": 0.330579254090786,
"epoch": 1.0247955322265625,
"grad_norm": 0.12735570967197418,
"learning_rate": 3.683938980102539e-05,
"lookahead_loss": 3.9694002509117126,
"loss": 2.15,
"step": 138000
},
{
"base_loss": 0.30834819096326826,
"epoch": 1.0257492065429688,
"grad_norm": 0.15742145478725433,
"learning_rate": 3.679170608520508e-05,
"lookahead_loss": 3.903434811115265,
"loss": 2.1059,
"step": 138500
},
{
"base_loss": 0.30765692061185834,
"epoch": 1.026702880859375,
"grad_norm": 0.1584819257259369,
"learning_rate": 3.6744022369384766e-05,
"lookahead_loss": 3.905990194797516,
"loss": 2.1068,
"step": 139000
},
{
"base_loss": 0.29641868540644645,
"epoch": 1.0276565551757812,
"grad_norm": 0.2251375913619995,
"learning_rate": 3.6696338653564456e-05,
"lookahead_loss": 3.871658296585083,
"loss": 2.084,
"step": 139500
},
{
"base_loss": 0.30686542350053786,
"epoch": 1.0286102294921875,
"grad_norm": 0.15121251344680786,
"learning_rate": 3.664865493774414e-05,
"lookahead_loss": 3.9316897687911987,
"loss": 2.1193,
"step": 140000
},
{
"epoch": 1.0286102294921875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.4411208850860597,
"eval_lookahead_perplexity": 31.221934763309736,
"eval_loss": 1.785910725593567,
"eval_perplexity": 5.965009961818914,
"eval_runtime": 495.6044,
"eval_samples_per_second": 20.177,
"eval_steps_per_second": 1.261,
"step": 140000
},
{
"base_loss": 0.29295339247584345,
"epoch": 1.0295639038085938,
"grad_norm": 0.17040514945983887,
"learning_rate": 3.660097122192383e-05,
"lookahead_loss": 3.922786801815033,
"loss": 2.1079,
"step": 140500
},
{
"base_loss": 0.29418606969714167,
"epoch": 1.030517578125,
"grad_norm": 0.16101863980293274,
"learning_rate": 3.655328750610352e-05,
"lookahead_loss": 3.9305174765586854,
"loss": 2.1124,
"step": 141000
},
{
"base_loss": 0.30073931351304056,
"epoch": 1.0314712524414062,
"grad_norm": 0.15563958883285522,
"learning_rate": 3.65056037902832e-05,
"lookahead_loss": 3.9169350867271424,
"loss": 2.1088,
"step": 141500
},
{
"base_loss": 0.32550489193201065,
"epoch": 1.0324249267578125,
"grad_norm": 0.18198108673095703,
"learning_rate": 3.6457920074462893e-05,
"lookahead_loss": 3.949872624874115,
"loss": 2.1377,
"step": 142000
},
{
"base_loss": 0.3217253153324127,
"epoch": 1.0333786010742188,
"grad_norm": 0.14466656744480133,
"learning_rate": 3.641023635864258e-05,
"lookahead_loss": 3.943008924484253,
"loss": 2.1324,
"step": 142500
},
{
"base_loss": 0.29591422697901726,
"epoch": 1.034332275390625,
"grad_norm": 0.22209474444389343,
"learning_rate": 3.636255264282227e-05,
"lookahead_loss": 3.89274987077713,
"loss": 2.0943,
"step": 143000
},
{
"base_loss": 0.3019189378321171,
"epoch": 1.0352859497070312,
"grad_norm": 0.15056173503398895,
"learning_rate": 3.631486892700196e-05,
"lookahead_loss": 3.8982916412353514,
"loss": 2.1001,
"step": 143500
},
{
"base_loss": 0.29187374815344813,
"epoch": 1.0362396240234375,
"grad_norm": 0.1609506458044052,
"learning_rate": 3.626718521118164e-05,
"lookahead_loss": 3.876904351711273,
"loss": 2.0844,
"step": 144000
},
{
"base_loss": 0.30099624979496004,
"epoch": 1.0371932983398438,
"grad_norm": 0.15686625242233276,
"learning_rate": 3.621950149536133e-05,
"lookahead_loss": 3.9365439949035643,
"loss": 2.1188,
"step": 144500
},
{
"base_loss": 0.2944833936691284,
"epoch": 1.03814697265625,
"grad_norm": 0.12234937399625778,
"learning_rate": 3.6171817779541014e-05,
"lookahead_loss": 3.925371481895447,
"loss": 2.1099,
"step": 145000
},
{
"epoch": 1.03814697265625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.427335873413086,
"eval_lookahead_perplexity": 30.794492942144828,
"eval_loss": 1.7790181636810303,
"eval_perplexity": 5.924037127456275,
"eval_runtime": 486.0413,
"eval_samples_per_second": 20.574,
"eval_steps_per_second": 1.286,
"step": 145000
},
{
"base_loss": 0.3096110028922558,
"epoch": 1.0391006469726562,
"grad_norm": 0.14352020621299744,
"learning_rate": 3.6124134063720705e-05,
"lookahead_loss": 3.9274420647621153,
"loss": 2.1185,
"step": 145500
},
{
"base_loss": 0.29815685757994653,
"epoch": 1.0400543212890625,
"grad_norm": 0.19689391553401947,
"learning_rate": 3.6076450347900395e-05,
"lookahead_loss": 3.8799168334007264,
"loss": 2.089,
"step": 146000
},
{
"base_loss": 0.33026822620630264,
"epoch": 1.0410079956054688,
"grad_norm": 0.15674136579036713,
"learning_rate": 3.602876663208008e-05,
"lookahead_loss": 3.9665490646362302,
"loss": 2.1484,
"step": 146500
},
{
"base_loss": 0.32754166290163994,
"epoch": 1.041961669921875,
"grad_norm": 0.2157350480556488,
"learning_rate": 3.598108291625977e-05,
"lookahead_loss": 3.9263464074134826,
"loss": 2.1269,
"step": 147000
},
{
"base_loss": 0.308051556378603,
"epoch": 1.0429153442382812,
"grad_norm": 0.17233142256736755,
"learning_rate": 3.593339920043945e-05,
"lookahead_loss": 3.892621549129486,
"loss": 2.1003,
"step": 147500
},
{
"base_loss": 0.2963902007639408,
"epoch": 1.0438690185546875,
"grad_norm": 0.13420014083385468,
"learning_rate": 3.588571548461914e-05,
"lookahead_loss": 3.8680308771133425,
"loss": 2.0822,
"step": 148000
},
{
"base_loss": 0.303954668790102,
"epoch": 1.0448226928710938,
"grad_norm": 0.13218580186367035,
"learning_rate": 3.583803176879883e-05,
"lookahead_loss": 3.886783453464508,
"loss": 2.0954,
"step": 148500
},
{
"base_loss": 0.29838690185546873,
"epoch": 1.0457763671875,
"grad_norm": 0.16112856566905975,
"learning_rate": 3.5790348052978516e-05,
"lookahead_loss": 3.9105755825042725,
"loss": 2.1045,
"step": 149000
},
{
"base_loss": 0.2983519469201565,
"epoch": 1.0467300415039062,
"grad_norm": 0.1744556576013565,
"learning_rate": 3.5742664337158206e-05,
"lookahead_loss": 3.9085661787986754,
"loss": 2.1035,
"step": 149500
},
{
"base_loss": 0.30389092776179316,
"epoch": 1.0476837158203125,
"grad_norm": 0.15804961323738098,
"learning_rate": 3.569498062133789e-05,
"lookahead_loss": 3.8888008046150206,
"loss": 2.0963,
"step": 150000
},
{
"epoch": 1.0476837158203125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.4153005847930906,
"eval_lookahead_perplexity": 30.426093674257224,
"eval_loss": 1.7730005979537964,
"eval_perplexity": 5.88849588779296,
"eval_runtime": 485.4933,
"eval_samples_per_second": 20.598,
"eval_steps_per_second": 1.287,
"step": 150000
},
{
"base_loss": 0.32404559567570684,
"epoch": 1.0486373901367188,
"grad_norm": 0.3761279881000519,
"learning_rate": 3.564729690551758e-05,
"lookahead_loss": 3.9307862186431883,
"loss": 2.1274,
"step": 150500
},
{
"base_loss": 0.32713927403092385,
"epoch": 1.049591064453125,
"grad_norm": 0.16427947580814362,
"learning_rate": 3.559961318969727e-05,
"lookahead_loss": 3.9374624252319337,
"loss": 2.1323,
"step": 151000
},
{
"base_loss": 0.3163092802464962,
"epoch": 1.0505447387695312,
"grad_norm": 0.23663388192653656,
"learning_rate": 3.555192947387695e-05,
"lookahead_loss": 3.8996983790397644,
"loss": 2.108,
"step": 151500
},
{
"base_loss": 0.30190867054462434,
"epoch": 1.0514984130859375,
"grad_norm": 0.19574706256389618,
"learning_rate": 3.5504245758056643e-05,
"lookahead_loss": 3.879706892490387,
"loss": 2.0908,
"step": 152000
},
{
"base_loss": 0.2993427519798279,
"epoch": 1.0524520874023438,
"grad_norm": 0.18463152647018433,
"learning_rate": 3.545656204223633e-05,
"lookahead_loss": 3.8694649033546447,
"loss": 2.0844,
"step": 152500
},
{
"base_loss": 0.3071295386552811,
"epoch": 1.05340576171875,
"grad_norm": 0.17730920016765594,
"learning_rate": 3.540887832641602e-05,
"lookahead_loss": 3.9193247327804563,
"loss": 2.1132,
"step": 153000
},
{
"base_loss": 0.2997964630126953,
"epoch": 1.0543594360351562,
"grad_norm": 0.262504905462265,
"learning_rate": 3.536119461059571e-05,
"lookahead_loss": 3.8937467522621154,
"loss": 2.0968,
"step": 153500
},
{
"base_loss": 0.31396923500299456,
"epoch": 1.0553131103515625,
"grad_norm": 0.1696975976228714,
"learning_rate": 3.531351089477539e-05,
"lookahead_loss": 3.9104519414901735,
"loss": 2.1122,
"step": 154000
},
{
"base_loss": 0.32232173484563825,
"epoch": 1.0562667846679688,
"grad_norm": 0.14248883724212646,
"learning_rate": 3.526582717895508e-05,
"lookahead_loss": 3.9199597582817076,
"loss": 2.1211,
"step": 154500
},
{
"base_loss": 0.33595413306355476,
"epoch": 1.057220458984375,
"grad_norm": 0.13409440219402313,
"learning_rate": 3.5218143463134764e-05,
"lookahead_loss": 3.951625358104706,
"loss": 2.1438,
"step": 155000
},
{
"epoch": 1.057220458984375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.4037932720184325,
"eval_lookahead_perplexity": 30.077977877910172,
"eval_loss": 1.7672468423843384,
"eval_perplexity": 5.854712206507675,
"eval_runtime": 489.1245,
"eval_samples_per_second": 20.445,
"eval_steps_per_second": 1.278,
"step": 155000
},
{
"base_loss": 0.30358468553423884,
"epoch": 1.0581741333007812,
"grad_norm": 0.14868056774139404,
"learning_rate": 3.5170459747314455e-05,
"lookahead_loss": 3.875640995025635,
"loss": 2.0896,
"step": 155500
},
{
"base_loss": 0.29896630710363387,
"epoch": 1.0591278076171875,
"grad_norm": 0.15365566313266754,
"learning_rate": 3.5122776031494145e-05,
"lookahead_loss": 3.8703284606933592,
"loss": 2.0846,
"step": 156000
},
{
"base_loss": 0.3018466059863567,
"epoch": 1.0600814819335938,
"grad_norm": 0.23302938044071198,
"learning_rate": 3.507509231567383e-05,
"lookahead_loss": 3.8674917163848876,
"loss": 2.0847,
"step": 156500
},
{
"base_loss": 0.30544874557852747,
"epoch": 1.06103515625,
"grad_norm": 0.2524946928024292,
"learning_rate": 3.502740859985352e-05,
"lookahead_loss": 3.937124222278595,
"loss": 2.1213,
"step": 157000
},
{
"base_loss": 0.30323341020941735,
"epoch": 1.0619888305664062,
"grad_norm": 0.1309209018945694,
"learning_rate": 3.49797248840332e-05,
"lookahead_loss": 3.9118480677604675,
"loss": 2.1075,
"step": 157500
},
{
"base_loss": 0.30131002590060235,
"epoch": 1.0629425048828125,
"grad_norm": 0.1601804494857788,
"learning_rate": 3.493204116821289e-05,
"lookahead_loss": 3.897795463562012,
"loss": 2.0996,
"step": 158000
},
{
"base_loss": 0.31901666805148127,
"epoch": 1.0638961791992188,
"grad_norm": 0.14104294776916504,
"learning_rate": 3.488435745239258e-05,
"lookahead_loss": 3.9272995166778566,
"loss": 2.1232,
"step": 158500
},
{
"base_loss": 0.32057506546378134,
"epoch": 1.064849853515625,
"grad_norm": 0.19912464916706085,
"learning_rate": 3.4836673736572266e-05,
"lookahead_loss": 3.919455467700958,
"loss": 2.12,
"step": 159000
},
{
"base_loss": 0.3094568813741207,
"epoch": 1.0658035278320312,
"grad_norm": 0.13988551497459412,
"learning_rate": 3.4788990020751956e-05,
"lookahead_loss": 3.8748404712677003,
"loss": 2.0921,
"step": 159500
},
{
"base_loss": 0.296497131973505,
"epoch": 1.0667572021484375,
"grad_norm": 0.17116788029670715,
"learning_rate": 3.474130630493164e-05,
"lookahead_loss": 3.8537814893722535,
"loss": 2.0751,
"step": 160000
},
{
"epoch": 1.0667572021484375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.393118642807007,
"eval_lookahead_perplexity": 29.758614193648153,
"eval_loss": 1.7619096040725708,
"eval_perplexity": 5.823547453052975,
"eval_runtime": 489.6754,
"eval_samples_per_second": 20.422,
"eval_steps_per_second": 1.276,
"step": 160000
},
{
"base_loss": 0.2961631261408329,
"epoch": 1.0677108764648438,
"grad_norm": 0.14856025576591492,
"learning_rate": 3.469362258911133e-05,
"lookahead_loss": 3.8731155366897583,
"loss": 2.0846,
"step": 160500
},
{
"base_loss": 0.30877826517820356,
"epoch": 1.06866455078125,
"grad_norm": 0.15069471299648285,
"learning_rate": 3.464593887329102e-05,
"lookahead_loss": 3.9271927394866943,
"loss": 2.118,
"step": 161000
},
{
"base_loss": 0.3005406486093998,
"epoch": 1.0696182250976562,
"grad_norm": 0.15068742632865906,
"learning_rate": 3.45982551574707e-05,
"lookahead_loss": 3.88241503572464,
"loss": 2.0915,
"step": 161500
},
{
"base_loss": 0.32187015274167063,
"epoch": 1.0705718994140625,
"grad_norm": 0.15749786794185638,
"learning_rate": 3.4550571441650393e-05,
"lookahead_loss": 3.896246009349823,
"loss": 2.1091,
"step": 162000
},
{
"base_loss": 0.33053677862882613,
"epoch": 1.0715255737304688,
"grad_norm": 0.15866954624652863,
"learning_rate": 3.450288772583008e-05,
"lookahead_loss": 3.9282021570205687,
"loss": 2.1294,
"step": 162500
},
{
"base_loss": 0.32069788879156114,
"epoch": 1.072479248046875,
"grad_norm": 0.1594492495059967,
"learning_rate": 3.445520401000977e-05,
"lookahead_loss": 3.8940247020721435,
"loss": 2.1074,
"step": 163000
},
{
"base_loss": 0.31262470212578775,
"epoch": 1.0734329223632812,
"grad_norm": 0.17305102944374084,
"learning_rate": 3.440752029418946e-05,
"lookahead_loss": 3.8849755458831785,
"loss": 2.0988,
"step": 163500
},
{
"base_loss": 0.2981148832142353,
"epoch": 1.0743865966796875,
"grad_norm": 0.16819824278354645,
"learning_rate": 3.435983657836914e-05,
"lookahead_loss": 3.8529111161231993,
"loss": 2.0755,
"step": 164000
},
{
"base_loss": 0.30023786443471906,
"epoch": 1.0753402709960938,
"grad_norm": 0.17850428819656372,
"learning_rate": 3.431215286254883e-05,
"lookahead_loss": 3.861279788017273,
"loss": 2.0808,
"step": 164500
},
{
"base_loss": 0.3020662295222282,
"epoch": 1.0762939453125,
"grad_norm": 0.2424204796552658,
"learning_rate": 3.4264469146728514e-05,
"lookahead_loss": 3.8807444310188295,
"loss": 2.0914,
"step": 165000
},
{
"epoch": 1.0762939453125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.3821912258148195,
"eval_lookahead_perplexity": 29.435199668131006,
"eval_loss": 1.7564458847045898,
"eval_perplexity": 5.79181598888891,
"eval_runtime": 489.9893,
"eval_samples_per_second": 20.409,
"eval_steps_per_second": 1.276,
"step": 165000
},
{
"base_loss": 0.3027431888282299,
"epoch": 1.0772476196289062,
"grad_norm": 0.20542722940444946,
"learning_rate": 3.4216785430908205e-05,
"lookahead_loss": 3.8899433569908144,
"loss": 2.0963,
"step": 165500
},
{
"base_loss": 0.3171701425611973,
"epoch": 1.0782012939453125,
"grad_norm": 0.16201598942279816,
"learning_rate": 3.4169101715087895e-05,
"lookahead_loss": 3.8952909932136537,
"loss": 2.1062,
"step": 166000
},
{
"base_loss": 0.3263061309456825,
"epoch": 1.0791549682617188,
"grad_norm": 0.14267025887966156,
"learning_rate": 3.412141799926758e-05,
"lookahead_loss": 3.915688913345337,
"loss": 2.121,
"step": 166500
},
{
"base_loss": 0.3166075404882431,
"epoch": 1.080108642578125,
"grad_norm": 0.20565369725227356,
"learning_rate": 3.407373428344727e-05,
"lookahead_loss": 3.884951609611511,
"loss": 2.1008,
"step": 167000
},
{
"base_loss": 0.3153759800195694,
"epoch": 1.0810623168945312,
"grad_norm": 0.17952857911586761,
"learning_rate": 3.402605056762695e-05,
"lookahead_loss": 3.874412197113037,
"loss": 2.0949,
"step": 167500
},
{
"base_loss": 0.29424766221642495,
"epoch": 1.0820159912109375,
"grad_norm": 0.13297192752361298,
"learning_rate": 3.397836685180664e-05,
"lookahead_loss": 3.8378429608345033,
"loss": 2.066,
"step": 168000
},
{
"base_loss": 0.2951381744146347,
"epoch": 1.0829696655273438,
"grad_norm": 0.14463625848293304,
"learning_rate": 3.393068313598633e-05,
"lookahead_loss": 3.878976944446564,
"loss": 2.0871,
"step": 168500
},
{
"base_loss": 0.29598655554652215,
"epoch": 1.08392333984375,
"grad_norm": 0.1720210164785385,
"learning_rate": 3.3882999420166016e-05,
"lookahead_loss": 3.883972409248352,
"loss": 2.09,
"step": 169000
},
{
"base_loss": 0.3003324483036995,
"epoch": 1.0848770141601562,
"grad_norm": 0.16413024067878723,
"learning_rate": 3.3835315704345706e-05,
"lookahead_loss": 3.866434679508209,
"loss": 2.0834,
"step": 169500
},
{
"base_loss": 0.324860055655241,
"epoch": 1.0858306884765625,
"grad_norm": 0.21429774165153503,
"learning_rate": 3.378763198852539e-05,
"lookahead_loss": 3.8790123524665834,
"loss": 2.1019,
"step": 170000
},
{
"epoch": 1.0858306884765625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.3718783203125,
"eval_lookahead_perplexity": 29.13319717374854,
"eval_loss": 1.7512894868850708,
"eval_perplexity": 5.762027947051168,
"eval_runtime": 481.9044,
"eval_samples_per_second": 20.751,
"eval_steps_per_second": 1.297,
"step": 170000
},
{
"base_loss": 0.32848941776156426,
"epoch": 1.0867843627929688,
"grad_norm": 0.2181519716978073,
"learning_rate": 3.373994827270508e-05,
"lookahead_loss": 3.916994530200958,
"loss": 2.1227,
"step": 170500
},
{
"base_loss": 0.3349419977068901,
"epoch": 1.087738037109375,
"grad_norm": 0.21681463718414307,
"learning_rate": 3.369226455688477e-05,
"lookahead_loss": 3.889006247520447,
"loss": 2.112,
"step": 171000
},
{
"base_loss": 0.29626981797814367,
"epoch": 1.0886917114257812,
"grad_norm": 0.13979732990264893,
"learning_rate": 3.364458084106445e-05,
"lookahead_loss": 3.845555930137634,
"loss": 2.0709,
"step": 171500
},
{
"base_loss": 0.29588885527849196,
"epoch": 1.0896453857421875,
"grad_norm": 0.17025181651115417,
"learning_rate": 3.3596897125244143e-05,
"lookahead_loss": 3.825295521736145,
"loss": 2.0606,
"step": 172000
},
{
"base_loss": 0.30355440092086794,
"epoch": 1.0905990600585938,
"grad_norm": 0.13651303946971893,
"learning_rate": 3.354921340942383e-05,
"lookahead_loss": 3.8832921752929686,
"loss": 2.0934,
"step": 172500
},
{
"base_loss": 0.3085533272922039,
"epoch": 1.091552734375,
"grad_norm": 0.1449888050556183,
"learning_rate": 3.350152969360352e-05,
"lookahead_loss": 3.900917944908142,
"loss": 2.1047,
"step": 173000
},
{
"base_loss": 0.30814607721567155,
"epoch": 1.0925064086914062,
"grad_norm": 0.14506219327449799,
"learning_rate": 3.345384597778321e-05,
"lookahead_loss": 3.8408993144035337,
"loss": 2.0745,
"step": 173500
},
{
"base_loss": 0.3475791245102882,
"epoch": 1.0934600830078125,
"grad_norm": 0.14961472153663635,
"learning_rate": 3.340616226196289e-05,
"lookahead_loss": 3.9275504984855654,
"loss": 2.1376,
"step": 174000
},
{
"base_loss": 0.3188588669300079,
"epoch": 1.0944137573242188,
"grad_norm": 0.14812441170215607,
"learning_rate": 3.335847854614258e-05,
"lookahead_loss": 3.894184876918793,
"loss": 2.1065,
"step": 174500
},
{
"base_loss": 0.33637256652116776,
"epoch": 1.095367431640625,
"grad_norm": 0.31748202443122864,
"learning_rate": 3.3310794830322264e-05,
"lookahead_loss": 3.91321995306015,
"loss": 2.1248,
"step": 175000
},
{
"epoch": 1.095367431640625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.362889016342163,
"eval_lookahead_perplexity": 28.872483581226124,
"eval_loss": 1.7467947006225586,
"eval_perplexity": 5.736186981305101,
"eval_runtime": 492.3817,
"eval_samples_per_second": 20.309,
"eval_steps_per_second": 1.269,
"step": 175000
},
{
"base_loss": 0.29970453238487244,
"epoch": 1.0963211059570312,
"grad_norm": 0.14985507726669312,
"learning_rate": 3.3263111114501955e-05,
"lookahead_loss": 3.8269423189163208,
"loss": 2.0633,
"step": 175500
},
{
"base_loss": 0.29638376808166506,
"epoch": 1.0972747802734375,
"grad_norm": 0.15558375418186188,
"learning_rate": 3.3215427398681645e-05,
"lookahead_loss": 3.8555296115875244,
"loss": 2.076,
"step": 176000
},
{
"base_loss": 0.3054467994570732,
"epoch": 1.0982284545898438,
"grad_norm": 0.1894250363111496,
"learning_rate": 3.316774368286133e-05,
"lookahead_loss": 3.8709189410209657,
"loss": 2.0882,
"step": 176500
},
{
"base_loss": 0.3071099489927292,
"epoch": 1.09918212890625,
"grad_norm": 0.16460929811000824,
"learning_rate": 3.312005996704102e-05,
"lookahead_loss": 3.862484317779541,
"loss": 2.0848,
"step": 177000
},
{
"base_loss": 0.31426447916030886,
"epoch": 1.1001358032226562,
"grad_norm": 0.21863171458244324,
"learning_rate": 3.30723762512207e-05,
"lookahead_loss": 3.8745448336601256,
"loss": 2.0944,
"step": 177500
},
{
"base_loss": 0.32723835909366605,
"epoch": 1.1010894775390625,
"grad_norm": 0.1326635330915451,
"learning_rate": 3.302469253540039e-05,
"lookahead_loss": 3.9010628695487974,
"loss": 2.1142,
"step": 178000
},
{
"base_loss": 0.31847833314538004,
"epoch": 1.1020431518554688,
"grad_norm": 0.18970361351966858,
"learning_rate": 3.297700881958008e-05,
"lookahead_loss": 3.864170029640198,
"loss": 2.0913,
"step": 178500
},
{
"base_loss": 0.29289821565151214,
"epoch": 1.102996826171875,
"grad_norm": 0.20295552909374237,
"learning_rate": 3.2929325103759766e-05,
"lookahead_loss": 3.814200548171997,
"loss": 2.0535,
"step": 179000
},
{
"base_loss": 0.30456195056438445,
"epoch": 1.1039505004882812,
"grad_norm": 0.20044149458408356,
"learning_rate": 3.2881641387939456e-05,
"lookahead_loss": 3.842505895137787,
"loss": 2.0735,
"step": 179500
},
{
"base_loss": 0.3080640316605568,
"epoch": 1.1049041748046875,
"grad_norm": 0.2961556911468506,
"learning_rate": 3.283395767211914e-05,
"lookahead_loss": 3.883462808609009,
"loss": 2.0958,
"step": 180000
},
{
"epoch": 1.1049041748046875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.353512685394287,
"eval_lookahead_perplexity": 28.603030833040698,
"eval_loss": 1.7421066761016846,
"eval_perplexity": 5.709358531431193,
"eval_runtime": 509.3192,
"eval_samples_per_second": 19.634,
"eval_steps_per_second": 1.227,
"step": 180000
},
{
"base_loss": 0.31145399552583697,
"epoch": 1.1058578491210938,
"grad_norm": 0.14174073934555054,
"learning_rate": 3.278627395629883e-05,
"lookahead_loss": 3.878586507797241,
"loss": 2.095,
"step": 180500
},
{
"base_loss": 0.3290363503992558,
"epoch": 1.1068115234375,
"grad_norm": 0.14076529443264008,
"learning_rate": 3.273859024047852e-05,
"lookahead_loss": 3.87795290517807,
"loss": 2.1035,
"step": 181000
},
{
"base_loss": 0.34425261700153353,
"epoch": 1.1077651977539062,
"grad_norm": 0.16690996289253235,
"learning_rate": 3.26909065246582e-05,
"lookahead_loss": 3.9159616875648497,
"loss": 2.1301,
"step": 181500
},
{
"base_loss": 0.37842708241939543,
"epoch": 1.1087188720703125,
"grad_norm": 0.13356585800647736,
"learning_rate": 3.2643222808837893e-05,
"lookahead_loss": 3.9294180998802184,
"loss": 2.1539,
"step": 182000
},
{
"base_loss": 0.29286388018727305,
"epoch": 1.1096725463867188,
"grad_norm": 0.1421193927526474,
"learning_rate": 3.259553909301758e-05,
"lookahead_loss": 3.819779777050018,
"loss": 2.0563,
"step": 182500
},
{
"base_loss": 0.2963064706027508,
"epoch": 1.110626220703125,
"grad_norm": 0.19730889797210693,
"learning_rate": 3.254785537719727e-05,
"lookahead_loss": 3.8228342752456665,
"loss": 2.0596,
"step": 183000
},
{
"base_loss": 0.310688713490963,
"epoch": 1.1115798950195312,
"grad_norm": 0.15284715592861176,
"learning_rate": 3.250017166137696e-05,
"lookahead_loss": 3.8795133504867554,
"loss": 2.0951,
"step": 183500
},
{
"base_loss": 0.304560353577137,
"epoch": 1.1125335693359375,
"grad_norm": 0.1316368132829666,
"learning_rate": 3.245248794555664e-05,
"lookahead_loss": 3.8906916728019714,
"loss": 2.0976,
"step": 184000
},
{
"base_loss": 0.3029217945933342,
"epoch": 1.1134872436523438,
"grad_norm": 0.14683161675930023,
"learning_rate": 3.240480422973633e-05,
"lookahead_loss": 3.8557743334770205,
"loss": 2.0793,
"step": 184500
},
{
"base_loss": 0.33166853222250936,
"epoch": 1.11444091796875,
"grad_norm": 0.17274287343025208,
"learning_rate": 3.2357120513916014e-05,
"lookahead_loss": 3.903276960849762,
"loss": 2.1175,
"step": 185000
},
{
"epoch": 1.11444091796875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.3450291194915773,
"eval_lookahead_perplexity": 28.361401524222025,
"eval_loss": 1.7378648519515991,
"eval_perplexity": 5.685191728431611,
"eval_runtime": 481.6228,
"eval_samples_per_second": 20.763,
"eval_steps_per_second": 1.298,
"step": 185000
},
{
"base_loss": 0.3226691042780876,
"epoch": 1.1153945922851562,
"grad_norm": 0.16963951289653778,
"learning_rate": 3.2309436798095705e-05,
"lookahead_loss": 3.864322167873383,
"loss": 2.0935,
"step": 185500
},
{
"base_loss": 0.31310846722126007,
"epoch": 1.1163482666015625,
"grad_norm": 0.12234390527009964,
"learning_rate": 3.2261753082275395e-05,
"lookahead_loss": 3.8433740344047544,
"loss": 2.0782,
"step": 186000
},
{
"base_loss": 0.29350434136390685,
"epoch": 1.1173019409179688,
"grad_norm": 0.16640856862068176,
"learning_rate": 3.221406936645508e-05,
"lookahead_loss": 3.7999322633743287,
"loss": 2.0467,
"step": 186500
},
{
"base_loss": 0.2932390958070755,
"epoch": 1.118255615234375,
"grad_norm": 0.13954715430736542,
"learning_rate": 3.216638565063477e-05,
"lookahead_loss": 3.8404451036453247,
"loss": 2.0668,
"step": 187000
},
{
"base_loss": 0.30293849104642867,
"epoch": 1.1192092895507812,
"grad_norm": 0.2102259248495102,
"learning_rate": 3.211870193481445e-05,
"lookahead_loss": 3.876214940547943,
"loss": 2.0896,
"step": 187500
},
{
"base_loss": 0.3002570872604847,
"epoch": 1.1201629638671875,
"grad_norm": 0.1469457745552063,
"learning_rate": 3.207101821899414e-05,
"lookahead_loss": 3.8522539978027344,
"loss": 2.0763,
"step": 188000
},
{
"base_loss": 0.33394081115722657,
"epoch": 1.1211166381835938,
"grad_norm": 0.1487540900707245,
"learning_rate": 3.202333450317383e-05,
"lookahead_loss": 3.899791095733643,
"loss": 2.1169,
"step": 188500
},
{
"base_loss": 0.305567186832428,
"epoch": 1.1220703125,
"grad_norm": 0.13522186875343323,
"learning_rate": 3.1975650787353516e-05,
"lookahead_loss": 3.8232165246009826,
"loss": 2.0644,
"step": 189000
},
{
"base_loss": 0.3088062160909176,
"epoch": 1.1230239868164062,
"grad_norm": 0.14893706142902374,
"learning_rate": 3.1927967071533206e-05,
"lookahead_loss": 3.825707914352417,
"loss": 2.0673,
"step": 189500
},
{
"base_loss": 0.2980837540626526,
"epoch": 1.1239776611328125,
"grad_norm": 0.1944948434829712,
"learning_rate": 3.188028335571289e-05,
"lookahead_loss": 3.8185536642074585,
"loss": 2.0583,
"step": 190000
},
{
"epoch": 1.1239776611328125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.337442953109741,
"eval_lookahead_perplexity": 28.147061251859267,
"eval_loss": 1.7340717315673828,
"eval_perplexity": 5.663667958729687,
"eval_runtime": 490.8569,
"eval_samples_per_second": 20.373,
"eval_steps_per_second": 1.273,
"step": 190000
},
{
"base_loss": 0.3022361363172531,
"epoch": 1.1249313354492188,
"grad_norm": 0.13972483575344086,
"learning_rate": 3.183259963989258e-05,
"lookahead_loss": 3.871019568443298,
"loss": 2.0866,
"step": 190500
},
{
"base_loss": 0.3076981520354748,
"epoch": 1.125885009765625,
"grad_norm": 0.1670301854610443,
"learning_rate": 3.178491592407227e-05,
"lookahead_loss": 3.870435709476471,
"loss": 2.0891,
"step": 191000
},
{
"base_loss": 0.3076484650671482,
"epoch": 1.1268386840820312,
"grad_norm": 0.15010391175746918,
"learning_rate": 3.173723220825195e-05,
"lookahead_loss": 3.858703122615814,
"loss": 2.0832,
"step": 191500
},
{
"base_loss": 0.32482430759072306,
"epoch": 1.1277923583984375,
"grad_norm": 0.1445780098438263,
"learning_rate": 3.1689548492431643e-05,
"lookahead_loss": 3.868783023357391,
"loss": 2.0968,
"step": 192000
},
{
"base_loss": 0.3072025769650936,
"epoch": 1.1287460327148438,
"grad_norm": 0.21696054935455322,
"learning_rate": 3.164186477661133e-05,
"lookahead_loss": 3.820800142288208,
"loss": 2.064,
"step": 192500
},
{
"base_loss": 0.3038946217596531,
"epoch": 1.12969970703125,
"grad_norm": 0.18033991754055023,
"learning_rate": 3.159418106079102e-05,
"lookahead_loss": 3.8287801537513735,
"loss": 2.0663,
"step": 193000
},
{
"base_loss": 0.309920187741518,
"epoch": 1.1306533813476562,
"grad_norm": 0.14635299146175385,
"learning_rate": 3.154649734497071e-05,
"lookahead_loss": 3.809571493625641,
"loss": 2.0597,
"step": 193500
},
{
"base_loss": 0.3105273153483868,
"epoch": 1.1316070556640625,
"grad_norm": 0.16371551156044006,
"learning_rate": 3.149881362915039e-05,
"lookahead_loss": 3.8675531783103945,
"loss": 2.089,
"step": 194000
},
{
"base_loss": 0.3032006587386131,
"epoch": 1.1325607299804688,
"grad_norm": 0.1437891125679016,
"learning_rate": 3.145112991333008e-05,
"lookahead_loss": 3.859119012832642,
"loss": 2.0812,
"step": 194500
},
{
"base_loss": 0.3097851026952267,
"epoch": 1.133514404296875,
"grad_norm": 0.1416500210762024,
"learning_rate": 3.1403446197509764e-05,
"lookahead_loss": 3.838276375770569,
"loss": 2.074,
"step": 195000
},
{
"epoch": 1.133514404296875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.1307004702925682,
"eval_base_perplexity": 1.1396263782287075,
"eval_lookahead_loss": 3.3290446380615233,
"eval_lookahead_perplexity": 27.91166322078159,
"eval_loss": 1.729872703552246,
"eval_perplexity": 5.639935918922495,
"eval_runtime": 483.0611,
"eval_samples_per_second": 20.701,
"eval_steps_per_second": 1.294,
"step": 195000
},
{
"base_loss": 0.3019270299375057,
"epoch": 1.0009536743164062,
"grad_norm": 0.11448461562395096,
"learning_rate": 3.1355762481689455e-05,
"lookahead_loss": 3.852882830142975,
"loss": 2.073,
"step": 195500
},
{
"base_loss": 0.3022316916286945,
"epoch": 1.0019073486328125,
"grad_norm": 0.17280858755111694,
"learning_rate": 3.1308078765869145e-05,
"lookahead_loss": 3.83609538269043,
"loss": 2.0718,
"step": 196000
},
{
"base_loss": 0.3106894801259041,
"epoch": 1.0028610229492188,
"grad_norm": 0.1027756780385971,
"learning_rate": 3.126039505004883e-05,
"lookahead_loss": 3.846992799282074,
"loss": 2.0705,
"step": 196500
},
{
"base_loss": 0.3196644955575466,
"epoch": 1.003814697265625,
"grad_norm": 0.10951551049947739,
"learning_rate": 3.121271133422852e-05,
"lookahead_loss": 3.8443777050971986,
"loss": 2.0839,
"step": 197000
},
{
"base_loss": 0.30172179120779036,
"epoch": 1.0047683715820312,
"grad_norm": 0.0957549512386322,
"learning_rate": 3.11650276184082e-05,
"lookahead_loss": 3.806076729774475,
"loss": 2.0577,
"step": 197500
},
{
"base_loss": 0.2981105833351612,
"epoch": 1.0057220458984375,
"grad_norm": 0.1120913177728653,
"learning_rate": 3.111734390258789e-05,
"lookahead_loss": 3.831531247615814,
"loss": 2.064,
"step": 198000
},
{
"base_loss": 0.29635272261500356,
"epoch": 1.0066757202148438,
"grad_norm": 0.11260683089494705,
"learning_rate": 3.106966018676758e-05,
"lookahead_loss": 3.8422910799980166,
"loss": 2.0727,
"step": 198500
},
{
"base_loss": 0.313492115303874,
"epoch": 1.00762939453125,
"grad_norm": 0.12370772659778595,
"learning_rate": 3.1021976470947266e-05,
"lookahead_loss": 3.8489174375534057,
"loss": 2.0755,
"step": 199000
},
{
"base_loss": 0.31625834056735036,
"epoch": 1.0085830688476562,
"grad_norm": 0.12102854251861572,
"learning_rate": 3.0974292755126956e-05,
"lookahead_loss": 3.839761669635773,
"loss": 2.0706,
"step": 199500
},
{
"base_loss": 0.3014394761025906,
"epoch": 1.0095367431640625,
"grad_norm": 0.11906581372022629,
"learning_rate": 3.092660903930664e-05,
"lookahead_loss": 3.7962422103881837,
"loss": 2.0537,
"step": 200000
},
{
"epoch": 1.0095367431640625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.3221299815863468,
"eval_lookahead_perplexity": 27.71932938548797,
"eval_loss": 1.724821925163269,
"eval_perplexity": 5.611521669910201,
"eval_runtime": 491.8555,
"eval_samples_per_second": 10.166,
"eval_steps_per_second": 0.319,
"step": 200000
},
{
"base_loss": 0.29815256175398824,
"epoch": 1.0104904174804688,
"grad_norm": 0.1050342544913292,
"learning_rate": 3.087892532348633e-05,
"lookahead_loss": 3.8349307403564454,
"loss": 2.0643,
"step": 200500
},
{
"base_loss": 0.300417543143034,
"epoch": 1.011444091796875,
"grad_norm": 0.10707239806652069,
"learning_rate": 3.083124160766602e-05,
"lookahead_loss": 3.8183065605163575,
"loss": 2.0646,
"step": 201000
},
{
"base_loss": 0.3254467163980007,
"epoch": 1.0123977661132812,
"grad_norm": 0.09747885912656784,
"learning_rate": 3.07835578918457e-05,
"lookahead_loss": 3.8502072682380675,
"loss": 2.0876,
"step": 201500
},
{
"base_loss": 0.3060005504488945,
"epoch": 1.0133514404296875,
"grad_norm": 0.12023238092660904,
"learning_rate": 3.0735874176025393e-05,
"lookahead_loss": 3.8015957136154173,
"loss": 2.0575,
"step": 202000
},
{
"base_loss": 0.29856650426983833,
"epoch": 1.0143051147460938,
"grad_norm": 0.12048441171646118,
"learning_rate": 3.068819046020508e-05,
"lookahead_loss": 3.8015453901290894,
"loss": 2.0505,
"step": 202500
},
{
"base_loss": 0.2938228516280651,
"epoch": 1.0152587890625,
"grad_norm": 0.11696764826774597,
"learning_rate": 3.064050674438477e-05,
"lookahead_loss": 3.8269337430000303,
"loss": 2.0616,
"step": 203000
},
{
"base_loss": 0.31177652820944785,
"epoch": 1.0162124633789062,
"grad_norm": 0.11982905864715576,
"learning_rate": 3.059282302856446e-05,
"lookahead_loss": 3.8382421617507934,
"loss": 2.0762,
"step": 203500
},
{
"base_loss": 0.3144752032160759,
"epoch": 1.0171661376953125,
"grad_norm": 0.13648830354213715,
"learning_rate": 3.054513931274414e-05,
"lookahead_loss": 3.83622642326355,
"loss": 2.0702,
"step": 204000
},
{
"base_loss": 0.3018057085573673,
"epoch": 1.0181198120117188,
"grad_norm": 0.13782760500907898,
"learning_rate": 3.049745559692383e-05,
"lookahead_loss": 3.8047565789222717,
"loss": 2.0474,
"step": 204500
},
{
"base_loss": 0.29815602460503576,
"epoch": 1.019073486328125,
"grad_norm": 0.1043187752366066,
"learning_rate": 3.0449771881103518e-05,
"lookahead_loss": 3.838974130153656,
"loss": 2.0681,
"step": 205000
},
{
"epoch": 1.019073486328125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.311551880912659,
"eval_lookahead_perplexity": 27.42765691874828,
"eval_loss": 1.7195392847061157,
"eval_perplexity": 5.581956179194468,
"eval_runtime": 511.3254,
"eval_samples_per_second": 9.779,
"eval_steps_per_second": 0.307,
"step": 205000
},
{
"base_loss": 0.30170127359032634,
"epoch": 1.0200271606445312,
"grad_norm": 0.13211366534233093,
"learning_rate": 3.0402088165283205e-05,
"lookahead_loss": 3.8131870975494384,
"loss": 2.0591,
"step": 205500
},
{
"base_loss": 0.32987111946940423,
"epoch": 1.0209808349609375,
"grad_norm": 0.15636946260929108,
"learning_rate": 3.035440444946289e-05,
"lookahead_loss": 3.8654905581474304,
"loss": 2.0925,
"step": 206000
},
{
"base_loss": 0.3035789504647255,
"epoch": 1.0219345092773438,
"grad_norm": 0.10754602402448654,
"learning_rate": 3.0306720733642578e-05,
"lookahead_loss": 3.7953832607269287,
"loss": 2.0467,
"step": 206500
},
{
"base_loss": 0.2993673265874386,
"epoch": 1.02288818359375,
"grad_norm": 0.1499767005443573,
"learning_rate": 3.025903701782227e-05,
"lookahead_loss": 3.802317718505859,
"loss": 2.0559,
"step": 207000
},
{
"base_loss": 0.30179986253380775,
"epoch": 1.0238418579101562,
"grad_norm": 0.10085665434598923,
"learning_rate": 3.0211353302001955e-05,
"lookahead_loss": 3.8097751059532166,
"loss": 2.0572,
"step": 207500
},
{
"base_loss": 0.3247649165391922,
"epoch": 1.0247955322265625,
"grad_norm": 0.10755620151758194,
"learning_rate": 3.0163669586181642e-05,
"lookahead_loss": 3.8524597969055177,
"loss": 2.0871,
"step": 208000
},
{
"base_loss": 0.30702361911535264,
"epoch": 1.0257492065429688,
"grad_norm": 0.12487711757421494,
"learning_rate": 3.011598587036133e-05,
"lookahead_loss": 3.7942523493766784,
"loss": 2.0572,
"step": 208500
},
{
"base_loss": 0.30614723709225655,
"epoch": 1.026702880859375,
"grad_norm": 0.12863144278526306,
"learning_rate": 3.0068302154541016e-05,
"lookahead_loss": 3.821696516036987,
"loss": 2.0563,
"step": 209000
},
{
"base_loss": 0.30975785833597186,
"epoch": 1.0276565551757812,
"grad_norm": 0.10876427590847015,
"learning_rate": 3.0020618438720706e-05,
"lookahead_loss": 3.827836051464081,
"loss": 2.0651,
"step": 209500
},
{
"base_loss": 0.33282243901491165,
"epoch": 1.0286102294921875,
"grad_norm": 0.1228252649307251,
"learning_rate": 2.9972934722900393e-05,
"lookahead_loss": 3.8672441935539243,
"loss": 2.0942,
"step": 210000
},
{
"epoch": 1.0286102294921875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.302361313908245,
"eval_lookahead_perplexity": 27.176736018932548,
"eval_loss": 1.7149664163589478,
"eval_perplexity": 5.556488902099112,
"eval_runtime": 493.9456,
"eval_samples_per_second": 10.123,
"eval_steps_per_second": 0.318,
"step": 210000
},
{
"base_loss": 0.303214881747961,
"epoch": 1.0295639038085938,
"grad_norm": 0.11192873120307922,
"learning_rate": 2.992525100708008e-05,
"lookahead_loss": 3.7915321893692018,
"loss": 2.0473,
"step": 210500
},
{
"base_loss": 0.3033870562314987,
"epoch": 1.030517578125,
"grad_norm": 0.14577656984329224,
"learning_rate": 2.9877567291259766e-05,
"lookahead_loss": 3.8221058592796324,
"loss": 2.0646,
"step": 211000
},
{
"base_loss": 0.30246006432175637,
"epoch": 1.0314712524414062,
"grad_norm": 0.12820690870285034,
"learning_rate": 2.9829883575439453e-05,
"lookahead_loss": 3.830271818637848,
"loss": 2.0663,
"step": 211500
},
{
"base_loss": 0.31892248579859733,
"epoch": 1.0324249267578125,
"grad_norm": 0.13428226113319397,
"learning_rate": 2.9782199859619143e-05,
"lookahead_loss": 3.847850811481476,
"loss": 2.0872,
"step": 212000
},
{
"base_loss": 0.3053825112581253,
"epoch": 1.0333786010742188,
"grad_norm": 0.16832296550273895,
"learning_rate": 2.973451614379883e-05,
"lookahead_loss": 3.7897876076698305,
"loss": 2.0465,
"step": 212500
},
{
"base_loss": 0.3021465467214584,
"epoch": 1.034332275390625,
"grad_norm": 0.12703529000282288,
"learning_rate": 2.9686832427978517e-05,
"lookahead_loss": 3.8201312785148622,
"loss": 2.0634,
"step": 213000
},
{
"base_loss": 0.30765057054162026,
"epoch": 1.0352859497070312,
"grad_norm": 0.12233400344848633,
"learning_rate": 2.9639148712158204e-05,
"lookahead_loss": 3.8115249166488647,
"loss": 2.0635,
"step": 213500
},
{
"base_loss": 0.3246081721484661,
"epoch": 1.0362396240234375,
"grad_norm": 0.11228856444358826,
"learning_rate": 2.959146499633789e-05,
"lookahead_loss": 3.8441762342453,
"loss": 2.0814,
"step": 214000
},
{
"base_loss": 0.3049994637668133,
"epoch": 1.0371932983398438,
"grad_norm": 0.1296556442975998,
"learning_rate": 2.954378128051758e-05,
"lookahead_loss": 3.7977512683868406,
"loss": 2.0533,
"step": 214500
},
{
"base_loss": 0.30023205706477163,
"epoch": 1.03814697265625,
"grad_norm": 0.13668769598007202,
"learning_rate": 2.9496097564697268e-05,
"lookahead_loss": 3.7937655339241028,
"loss": 2.051,
"step": 215000
},
{
"epoch": 1.03814697265625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.293529276649792,
"eval_lookahead_perplexity": 26.93776691924059,
"eval_loss": 1.7105357646942139,
"eval_perplexity": 5.531924493648194,
"eval_runtime": 525.3589,
"eval_samples_per_second": 9.517,
"eval_steps_per_second": 0.299,
"step": 215000
},
{
"base_loss": 0.3105517603158951,
"epoch": 1.0391006469726562,
"grad_norm": 0.12106386572122574,
"learning_rate": 2.9448413848876955e-05,
"lookahead_loss": 3.8179289078712464,
"loss": 2.0674,
"step": 215500
},
{
"base_loss": 0.3201953979730606,
"epoch": 1.0400543212890625,
"grad_norm": 0.16168224811553955,
"learning_rate": 2.940073013305664e-05,
"lookahead_loss": 3.835119602203369,
"loss": 2.0769,
"step": 216000
},
{
"base_loss": 0.31106969705224036,
"epoch": 1.0410079956054688,
"grad_norm": 0.10200289636850357,
"learning_rate": 2.9353046417236328e-05,
"lookahead_loss": 3.7980401215553283,
"loss": 2.0487,
"step": 216500
},
{
"base_loss": 0.29374569734930994,
"epoch": 1.041961669921875,
"grad_norm": 0.13975954055786133,
"learning_rate": 2.930536270141602e-05,
"lookahead_loss": 3.8102884998321533,
"loss": 2.0536,
"step": 217000
},
{
"base_loss": 0.30810881498456,
"epoch": 1.0429153442382812,
"grad_norm": 0.2878585755825043,
"learning_rate": 2.9257678985595705e-05,
"lookahead_loss": 3.80524068403244,
"loss": 2.0591,
"step": 217500
},
{
"base_loss": 0.32673985859751703,
"epoch": 1.0438690185546875,
"grad_norm": 0.1362403929233551,
"learning_rate": 2.9209995269775392e-05,
"lookahead_loss": 3.832825825691223,
"loss": 2.0847,
"step": 218000
},
{
"base_loss": 0.2944556847214699,
"epoch": 1.0448226928710938,
"grad_norm": 0.18242229521274567,
"learning_rate": 2.916231155395508e-05,
"lookahead_loss": 3.7715793895721434,
"loss": 2.0345,
"step": 218500
},
{
"base_loss": 0.3020890684425831,
"epoch": 1.0457763671875,
"grad_norm": 0.09675723314285278,
"learning_rate": 2.9114627838134766e-05,
"lookahead_loss": 3.8328170766830443,
"loss": 2.0663,
"step": 219000
},
{
"base_loss": 0.32630685463547704,
"epoch": 1.0467300415039062,
"grad_norm": 0.10813385993242264,
"learning_rate": 2.9066944122314456e-05,
"lookahead_loss": 3.826720841407776,
"loss": 2.0744,
"step": 219500
},
{
"base_loss": 0.3254209460914135,
"epoch": 1.0476837158203125,
"grad_norm": 0.18731825053691864,
"learning_rate": 2.9019260406494143e-05,
"lookahead_loss": 3.837134199142456,
"loss": 2.0853,
"step": 220000
},
{
"epoch": 1.0476837158203125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.2851505911769197,
"eval_lookahead_perplexity": 26.71300675513349,
"eval_loss": 1.7063519954681396,
"eval_perplexity": 5.508828545937477,
"eval_runtime": 501.181,
"eval_samples_per_second": 9.976,
"eval_steps_per_second": 0.313,
"step": 220000
},
{
"base_loss": 0.29404987835884094,
"epoch": 1.0486373901367188,
"grad_norm": 0.11766321212053299,
"learning_rate": 2.897157669067383e-05,
"lookahead_loss": 3.7706230659484863,
"loss": 2.0379,
"step": 220500
},
{
"base_loss": 0.30766302010416985,
"epoch": 1.049591064453125,
"grad_norm": 0.10604788362979889,
"learning_rate": 2.8923892974853516e-05,
"lookahead_loss": 3.80825559425354,
"loss": 2.0572,
"step": 221000
},
{
"base_loss": 0.3222936154305935,
"epoch": 1.0505447387695312,
"grad_norm": 0.10031577199697495,
"learning_rate": 2.8876209259033203e-05,
"lookahead_loss": 3.831725327968597,
"loss": 2.0767,
"step": 221500
},
{
"base_loss": 0.3045852819383144,
"epoch": 1.0514984130859375,
"grad_norm": 0.1485632359981537,
"learning_rate": 2.8828525543212893e-05,
"lookahead_loss": 3.774880935192108,
"loss": 2.0437,
"step": 222000
},
{
"base_loss": 0.30816273841261865,
"epoch": 1.0524520874023438,
"grad_norm": 0.18679194152355194,
"learning_rate": 2.878084182739258e-05,
"lookahead_loss": 3.8043067450523376,
"loss": 2.0553,
"step": 222500
},
{
"base_loss": 0.32319829949736595,
"epoch": 1.05340576171875,
"grad_norm": 0.11528552323579788,
"learning_rate": 2.8733158111572267e-05,
"lookahead_loss": 3.8278827791213987,
"loss": 2.0701,
"step": 223000
},
{
"base_loss": 0.3588077034056187,
"epoch": 1.0543594360351562,
"grad_norm": 0.10808339715003967,
"learning_rate": 2.8685474395751954e-05,
"lookahead_loss": 3.8612174353599547,
"loss": 2.1125,
"step": 223500
},
{
"base_loss": 0.29601221990585325,
"epoch": 1.0553131103515625,
"grad_norm": 0.13398034870624542,
"learning_rate": 2.863779067993164e-05,
"lookahead_loss": 3.758062246322632,
"loss": 2.0311,
"step": 224000
},
{
"base_loss": 0.3043306003510952,
"epoch": 1.0562667846679688,
"grad_norm": 0.10396202653646469,
"learning_rate": 2.859010696411133e-05,
"lookahead_loss": 3.826102759838104,
"loss": 2.0674,
"step": 224500
},
{
"base_loss": 0.31856663155555726,
"epoch": 1.057220458984375,
"grad_norm": 0.12544843554496765,
"learning_rate": 2.8542423248291018e-05,
"lookahead_loss": 3.824730550289154,
"loss": 2.0742,
"step": 225000
},
{
"epoch": 1.057220458984375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.2769267330535303,
"eval_lookahead_perplexity": 26.494223631272767,
"eval_loss": 1.7022335529327393,
"eval_perplexity": 5.486187407250397,
"eval_runtime": 485.513,
"eval_samples_per_second": 10.298,
"eval_steps_per_second": 0.323,
"step": 225000
},
{
"base_loss": 0.32003091636300085,
"epoch": 1.0581741333007812,
"grad_norm": 0.10620646923780441,
"learning_rate": 2.8494739532470705e-05,
"lookahead_loss": 3.801748302936554,
"loss": 2.0581,
"step": 225500
},
{
"base_loss": 0.29179560819268224,
"epoch": 1.0591278076171875,
"grad_norm": 0.09537842869758606,
"learning_rate": 2.844705581665039e-05,
"lookahead_loss": 3.76057571554184,
"loss": 2.0317,
"step": 226000
},
{
"base_loss": 0.30011604171991346,
"epoch": 1.0600814819335938,
"grad_norm": 0.09911732375621796,
"learning_rate": 2.8399372100830078e-05,
"lookahead_loss": 3.809082925796509,
"loss": 2.0564,
"step": 226500
},
{
"base_loss": 0.3230703995227814,
"epoch": 1.06103515625,
"grad_norm": 0.09991439431905746,
"learning_rate": 2.835168838500977e-05,
"lookahead_loss": 3.8153861479759215,
"loss": 2.0645,
"step": 227000
},
{
"base_loss": 0.30637279444932936,
"epoch": 1.0619888305664062,
"grad_norm": 0.13179980218410492,
"learning_rate": 2.8304004669189455e-05,
"lookahead_loss": 3.786368088722229,
"loss": 2.0365,
"step": 227500
},
{
"base_loss": 0.3066646957695484,
"epoch": 1.0629425048828125,
"grad_norm": 0.1195763424038887,
"learning_rate": 2.8256320953369142e-05,
"lookahead_loss": 3.8267566895484926,
"loss": 2.0608,
"step": 228000
},
{
"base_loss": 0.31583699241280555,
"epoch": 1.0638961791992188,
"grad_norm": 0.1081557646393776,
"learning_rate": 2.820863723754883e-05,
"lookahead_loss": 3.819593190193176,
"loss": 2.0659,
"step": 228500
},
{
"base_loss": 0.30502623090147973,
"epoch": 1.064849853515625,
"grad_norm": 0.12640877068042755,
"learning_rate": 2.8160953521728516e-05,
"lookahead_loss": 3.7740664672851563,
"loss": 2.0424,
"step": 229000
},
{
"base_loss": 0.30984748020768166,
"epoch": 1.0658035278320312,
"grad_norm": 0.13270865380764008,
"learning_rate": 2.8113269805908206e-05,
"lookahead_loss": 3.7949006910324097,
"loss": 2.0514,
"step": 229500
},
{
"base_loss": 0.3079377235472202,
"epoch": 1.0667572021484375,
"grad_norm": 0.10393428802490234,
"learning_rate": 2.8065586090087893e-05,
"lookahead_loss": 3.801414387702942,
"loss": 2.0527,
"step": 230000
},
{
"epoch": 1.0667572021484375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.269460886050337,
"eval_lookahead_perplexity": 26.297158356119088,
"eval_loss": 1.6984984874725342,
"eval_perplexity": 5.4657343586728935,
"eval_runtime": 501.4886,
"eval_samples_per_second": 9.97,
"eval_steps_per_second": 0.313,
"step": 230000
},
{
"base_loss": 0.33075104707479475,
"epoch": 1.0677108764648438,
"grad_norm": 0.15981747210025787,
"learning_rate": 2.801790237426758e-05,
"lookahead_loss": 3.83703492307663,
"loss": 2.081,
"step": 230500
},
{
"base_loss": 0.3024148660302162,
"epoch": 1.06866455078125,
"grad_norm": 0.13739366829395294,
"learning_rate": 2.7970218658447266e-05,
"lookahead_loss": 3.754325032234192,
"loss": 2.0253,
"step": 231000
},
{
"base_loss": 0.3048691195845604,
"epoch": 1.0696182250976562,
"grad_norm": 0.12613807618618011,
"learning_rate": 2.7922534942626953e-05,
"lookahead_loss": 3.824920612812042,
"loss": 2.0616,
"step": 231500
},
{
"base_loss": 0.3433072043955326,
"epoch": 1.0705718994140625,
"grad_norm": 0.10592526942491531,
"learning_rate": 2.7874851226806643e-05,
"lookahead_loss": 3.845939799785614,
"loss": 2.0977,
"step": 232000
},
{
"base_loss": 0.3126540828049183,
"epoch": 1.0715255737304688,
"grad_norm": 0.14438621699810028,
"learning_rate": 2.782716751098633e-05,
"lookahead_loss": 3.7780807838439943,
"loss": 2.045,
"step": 232500
},
{
"base_loss": 0.30936010053753854,
"epoch": 1.072479248046875,
"grad_norm": 0.12224919348955154,
"learning_rate": 2.7779483795166017e-05,
"lookahead_loss": 3.790074597835541,
"loss": 2.0452,
"step": 233000
},
{
"base_loss": 0.3016264271736145,
"epoch": 1.0734329223632812,
"grad_norm": 0.09938216209411621,
"learning_rate": 2.7731800079345704e-05,
"lookahead_loss": 3.7917777862548827,
"loss": 2.0494,
"step": 233500
},
{
"base_loss": 0.3292314064204693,
"epoch": 1.0743865966796875,
"grad_norm": 0.10616600513458252,
"learning_rate": 2.768411636352539e-05,
"lookahead_loss": 3.8125745573043823,
"loss": 2.0687,
"step": 234000
},
{
"base_loss": 0.304562608808279,
"epoch": 1.0753402709960938,
"grad_norm": 0.12816548347473145,
"learning_rate": 2.763643264770508e-05,
"lookahead_loss": 3.7680485906600953,
"loss": 2.0349,
"step": 234500
},
{
"base_loss": 0.3058357034623623,
"epoch": 1.0762939453125,
"grad_norm": 0.16024808585643768,
"learning_rate": 2.7588748931884768e-05,
"lookahead_loss": 3.8191744446754456,
"loss": 2.0615,
"step": 235000
},
{
"epoch": 1.0762939453125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.262039216562582,
"eval_lookahead_perplexity": 26.102711989194106,
"eval_loss": 1.6948232650756836,
"eval_perplexity": 5.445683437708976,
"eval_runtime": 506.7872,
"eval_samples_per_second": 9.866,
"eval_steps_per_second": 0.31,
"step": 235000
},
{
"base_loss": 0.32823599469661713,
"epoch": 1.0772476196289062,
"grad_norm": 0.1520950198173523,
"learning_rate": 2.7541065216064455e-05,
"lookahead_loss": 3.822832639217377,
"loss": 2.0821,
"step": 235500
},
{
"base_loss": 0.3016549552977085,
"epoch": 1.0782012939453125,
"grad_norm": 0.12211694568395615,
"learning_rate": 2.749338150024414e-05,
"lookahead_loss": 3.756803053855896,
"loss": 2.0364,
"step": 236000
},
{
"base_loss": 0.29828149917721747,
"epoch": 1.0791549682617188,
"grad_norm": 0.12640158832073212,
"learning_rate": 2.7445697784423828e-05,
"lookahead_loss": 3.789377547264099,
"loss": 2.0469,
"step": 236500
},
{
"base_loss": 0.31368752831220625,
"epoch": 1.080108642578125,
"grad_norm": 0.11661666631698608,
"learning_rate": 2.739801406860352e-05,
"lookahead_loss": 3.815073594093323,
"loss": 2.0704,
"step": 237000
},
{
"base_loss": 0.3168468432724476,
"epoch": 1.0810623168945312,
"grad_norm": 0.15805600583553314,
"learning_rate": 2.7350330352783205e-05,
"lookahead_loss": 3.788653570652008,
"loss": 2.0595,
"step": 237500
},
{
"base_loss": 0.29921497783064843,
"epoch": 1.0820159912109375,
"grad_norm": 0.13112910091876984,
"learning_rate": 2.7302646636962892e-05,
"lookahead_loss": 3.765235338687897,
"loss": 2.0388,
"step": 238000
},
{
"base_loss": 0.3023185026049614,
"epoch": 1.0829696655273438,
"grad_norm": 0.09834863990545273,
"learning_rate": 2.725496292114258e-05,
"lookahead_loss": 3.7986365513801577,
"loss": 2.0509,
"step": 238500
},
{
"base_loss": 0.33467673206329346,
"epoch": 1.08392333984375,
"grad_norm": 0.10940661281347275,
"learning_rate": 2.7207279205322266e-05,
"lookahead_loss": 3.8246895036697386,
"loss": 2.0764,
"step": 239000
},
{
"base_loss": 0.30601534658670426,
"epoch": 1.0848770141601562,
"grad_norm": 0.1469735950231552,
"learning_rate": 2.7159595489501956e-05,
"lookahead_loss": 3.773017762184143,
"loss": 2.0409,
"step": 239500
},
{
"base_loss": 0.2985472394824028,
"epoch": 1.0858306884765625,
"grad_norm": 0.10987813770771027,
"learning_rate": 2.7111911773681643e-05,
"lookahead_loss": 3.779964041233063,
"loss": 2.038,
"step": 240000
},
{
"epoch": 1.0858306884765625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.2550360676579584,
"eval_lookahead_perplexity": 25.920549410649766,
"eval_loss": 1.6912927627563477,
"eval_perplexity": 5.426491338512043,
"eval_runtime": 484.3801,
"eval_samples_per_second": 10.322,
"eval_steps_per_second": 0.324,
"step": 240000
},
{
"base_loss": 0.3033053425848484,
"epoch": 1.0867843627929688,
"grad_norm": 0.4711226522922516,
"learning_rate": 2.706422805786133e-05,
"lookahead_loss": 3.782335328578949,
"loss": 2.0383,
"step": 240500
},
{
"base_loss": 0.33623868149518965,
"epoch": 1.087738037109375,
"grad_norm": 0.10267792642116547,
"learning_rate": 2.7016544342041016e-05,
"lookahead_loss": 3.8300727925300597,
"loss": 2.0763,
"step": 241000
},
{
"base_loss": 0.3016283850669861,
"epoch": 1.0886917114257812,
"grad_norm": 0.12222345918416977,
"learning_rate": 2.6968860626220703e-05,
"lookahead_loss": 3.7537764272689818,
"loss": 2.0287,
"step": 241500
},
{
"base_loss": 0.3083756065964699,
"epoch": 1.0896453857421875,
"grad_norm": 0.11414934694766998,
"learning_rate": 2.6921176910400393e-05,
"lookahead_loss": 3.752362766265869,
"loss": 2.0298,
"step": 242000
},
{
"base_loss": 0.2994114246070385,
"epoch": 1.0905990600585938,
"grad_norm": 0.10676714032888412,
"learning_rate": 2.687349319458008e-05,
"lookahead_loss": 3.7789285941123962,
"loss": 2.0418,
"step": 242500
},
{
"base_loss": 0.29944976773858073,
"epoch": 1.091552734375,
"grad_norm": 0.10604005306959152,
"learning_rate": 2.6825809478759767e-05,
"lookahead_loss": 3.779764890193939,
"loss": 2.0375,
"step": 243000
},
{
"base_loss": 0.32598793333768844,
"epoch": 1.0925064086914062,
"grad_norm": 0.16571524739265442,
"learning_rate": 2.6778125762939454e-05,
"lookahead_loss": 3.800921561717987,
"loss": 2.0587,
"step": 243500
},
{
"base_loss": 0.30856517258286476,
"epoch": 1.0934600830078125,
"grad_norm": 0.12707704305648804,
"learning_rate": 2.673044204711914e-05,
"lookahead_loss": 3.7637797536849975,
"loss": 2.0395,
"step": 244000
},
{
"base_loss": 0.2889265112578869,
"epoch": 1.0944137573242188,
"grad_norm": 0.16245336830615997,
"learning_rate": 2.668275833129883e-05,
"lookahead_loss": 3.7417610387802123,
"loss": 2.0145,
"step": 244500
},
{
"base_loss": 0.2941120155751705,
"epoch": 1.095367431640625,
"grad_norm": 0.11395586282014847,
"learning_rate": 2.6635074615478518e-05,
"lookahead_loss": 3.7530963735580443,
"loss": 2.0266,
"step": 245000
},
{
"epoch": 1.095367431640625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.2479548934168707,
"eval_lookahead_perplexity": 25.737649820044062,
"eval_loss": 1.687726616859436,
"eval_perplexity": 5.4071741431311375,
"eval_runtime": 512.0102,
"eval_samples_per_second": 9.765,
"eval_steps_per_second": 0.307,
"step": 245000
},
{
"base_loss": 0.3011808663010597,
"epoch": 1.0963211059570312,
"grad_norm": 0.2450723797082901,
"learning_rate": 2.6587390899658205e-05,
"lookahead_loss": 3.7636666469573976,
"loss": 2.0324,
"step": 245500
},
{
"base_loss": 0.33242677092552186,
"epoch": 1.0972747802734375,
"grad_norm": 0.11119936406612396,
"learning_rate": 2.653970718383789e-05,
"lookahead_loss": 3.8032662024497985,
"loss": 2.0627,
"step": 246000
},
{
"base_loss": 0.29240253108739855,
"epoch": 1.0982284545898438,
"grad_norm": 0.12222541123628616,
"learning_rate": 2.6492023468017578e-05,
"lookahead_loss": 3.7323589258193968,
"loss": 2.0156,
"step": 246500
},
{
"base_loss": 0.29588570061326025,
"epoch": 1.09918212890625,
"grad_norm": 0.09957096725702286,
"learning_rate": 2.644433975219727e-05,
"lookahead_loss": 3.7686120963096617,
"loss": 2.035,
"step": 247000
},
{
"base_loss": 0.2992869386672974,
"epoch": 1.1001358032226562,
"grad_norm": 0.1540381759405136,
"learning_rate": 2.6396656036376955e-05,
"lookahead_loss": 3.775865716457367,
"loss": 2.0415,
"step": 247500
},
{
"base_loss": 0.3209733834564686,
"epoch": 1.1010894775390625,
"grad_norm": 0.09748150408267975,
"learning_rate": 2.6348972320556642e-05,
"lookahead_loss": 3.805611068725586,
"loss": 2.059,
"step": 248000
},
{
"base_loss": 0.301429179161787,
"epoch": 1.1020431518554688,
"grad_norm": 0.14414113759994507,
"learning_rate": 2.630128860473633e-05,
"lookahead_loss": 3.752038255691528,
"loss": 2.0265,
"step": 248500
},
{
"base_loss": 0.2956730664372444,
"epoch": 1.102996826171875,
"grad_norm": 0.117804616689682,
"learning_rate": 2.6253604888916016e-05,
"lookahead_loss": 3.7491831588745117,
"loss": 2.031,
"step": 249000
},
{
"base_loss": 0.29523133793473244,
"epoch": 1.1039505004882812,
"grad_norm": 0.09736798703670502,
"learning_rate": 2.6205921173095706e-05,
"lookahead_loss": 3.778213514328003,
"loss": 2.0405,
"step": 249500
},
{
"base_loss": 0.31446042719483375,
"epoch": 1.1049041748046875,
"grad_norm": 0.11215147376060486,
"learning_rate": 2.6158237457275393e-05,
"lookahead_loss": 3.776704475879669,
"loss": 2.0502,
"step": 250000
},
{
"epoch": 1.1049041748046875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.241219103145904,
"eval_lookahead_perplexity": 25.5648689698412,
"eval_loss": 1.6843818426132202,
"eval_perplexity": 5.389118579038349,
"eval_runtime": 490.435,
"eval_samples_per_second": 10.195,
"eval_steps_per_second": 0.32,
"step": 250000
},
{
"base_loss": 0.30885917544364927,
"epoch": 1.1058578491210938,
"grad_norm": 0.14926961064338684,
"learning_rate": 2.611055374145508e-05,
"lookahead_loss": 3.7549472694396973,
"loss": 2.0303,
"step": 250500
},
{
"base_loss": 0.29513884752988817,
"epoch": 1.1068115234375,
"grad_norm": 0.11320330202579498,
"learning_rate": 2.6062870025634766e-05,
"lookahead_loss": 3.7395705704689024,
"loss": 2.02,
"step": 251000
},
{
"base_loss": 0.2974797194004059,
"epoch": 1.1077651977539062,
"grad_norm": 0.1485828310251236,
"learning_rate": 2.6015186309814453e-05,
"lookahead_loss": 3.767058692932129,
"loss": 2.0293,
"step": 251500
},
{
"base_loss": 0.3221335953772068,
"epoch": 1.1087188720703125,
"grad_norm": 0.12389354407787323,
"learning_rate": 2.5967502593994143e-05,
"lookahead_loss": 3.7966585497856142,
"loss": 2.0583,
"step": 252000
},
{
"base_loss": 0.31462463283538816,
"epoch": 1.1096725463867188,
"grad_norm": 0.12376336008310318,
"learning_rate": 2.591981887817383e-05,
"lookahead_loss": 3.75765408372879,
"loss": 2.0376,
"step": 252500
},
{
"base_loss": 0.3006181915104389,
"epoch": 1.110626220703125,
"grad_norm": 0.10401830077171326,
"learning_rate": 2.5872135162353517e-05,
"lookahead_loss": 3.7382473673820495,
"loss": 2.0207,
"step": 253000
},
{
"base_loss": 0.2966276684105396,
"epoch": 1.1115798950195312,
"grad_norm": 0.2943388521671295,
"learning_rate": 2.5824451446533204e-05,
"lookahead_loss": 3.77330969953537,
"loss": 2.0325,
"step": 253500
},
{
"base_loss": 0.30631836572289467,
"epoch": 1.1125335693359375,
"grad_norm": 0.14963500201702118,
"learning_rate": 2.577676773071289e-05,
"lookahead_loss": 3.777679774284363,
"loss": 2.0433,
"step": 254000
},
{
"base_loss": 0.3482712984383106,
"epoch": 1.1134872436523438,
"grad_norm": 0.11030125617980957,
"learning_rate": 2.572908401489258e-05,
"lookahead_loss": 3.8148307304382323,
"loss": 2.0723,
"step": 254500
},
{
"base_loss": 0.29542491587996483,
"epoch": 1.11444091796875,
"grad_norm": 0.11233729124069214,
"learning_rate": 2.5681400299072268e-05,
"lookahead_loss": 3.7286681451797485,
"loss": 2.014,
"step": 255000
},
{
"epoch": 1.11444091796875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.2360856357854777,
"eval_lookahead_perplexity": 25.433968822689298,
"eval_loss": 1.6818461418151855,
"eval_perplexity": 5.375470697541516,
"eval_runtime": 525.262,
"eval_samples_per_second": 9.519,
"eval_steps_per_second": 0.299,
"step": 255000
},
{
"base_loss": 0.2994180084168911,
"epoch": 1.1153945922851562,
"grad_norm": 0.3878551721572876,
"learning_rate": 2.5633716583251955e-05,
"lookahead_loss": 3.7766121606826784,
"loss": 2.0408,
"step": 255500
},
{
"base_loss": 0.3126500599086285,
"epoch": 1.1163482666015625,
"grad_norm": 0.1614830195903778,
"learning_rate": 2.558603286743164e-05,
"lookahead_loss": 3.7671957154273987,
"loss": 2.0408,
"step": 256000
},
{
"base_loss": 0.3251005619764328,
"epoch": 1.1173019409179688,
"grad_norm": 0.10149198770523071,
"learning_rate": 2.5538349151611328e-05,
"lookahead_loss": 3.8031828441619875,
"loss": 2.0681,
"step": 256500
},
{
"base_loss": 0.3045504302084446,
"epoch": 1.118255615234375,
"grad_norm": 0.10480870306491852,
"learning_rate": 2.549066543579102e-05,
"lookahead_loss": 3.7594980635643007,
"loss": 2.0284,
"step": 257000
},
{
"base_loss": 0.3033926927447319,
"epoch": 1.1192092895507812,
"grad_norm": 0.1331453174352646,
"learning_rate": 2.5442981719970705e-05,
"lookahead_loss": 3.7688667068481445,
"loss": 2.0333,
"step": 257500
},
{
"base_loss": 0.3065674279928207,
"epoch": 2.0009536743164062,
"grad_norm": 0.11469951272010803,
"learning_rate": 2.5395298004150392e-05,
"lookahead_loss": 3.7806892952919005,
"loss": 2.0367,
"step": 258000
},
{
"base_loss": 0.30196980077028274,
"epoch": 2.0019073486328125,
"grad_norm": 0.17589828372001648,
"learning_rate": 2.534761428833008e-05,
"lookahead_loss": 3.761930028438568,
"loss": 2.0342,
"step": 258500
},
{
"base_loss": 0.3121912784278393,
"epoch": 2.0028610229492188,
"grad_norm": 0.10175404697656631,
"learning_rate": 2.5299930572509766e-05,
"lookahead_loss": 3.7761128277778626,
"loss": 2.0353,
"step": 259000
},
{
"base_loss": 0.320074492007494,
"epoch": 2.003814697265625,
"grad_norm": 0.11250407993793488,
"learning_rate": 2.5252246856689456e-05,
"lookahead_loss": 3.772739490509033,
"loss": 2.0486,
"step": 259500
},
{
"base_loss": 0.3025432696044445,
"epoch": 2.0047683715820312,
"grad_norm": 0.10035926848649979,
"learning_rate": 2.5204563140869143e-05,
"lookahead_loss": 3.7355602521896363,
"loss": 2.0217,
"step": 260000
},
{
"epoch": 2.0047683715820312,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.229613203591051,
"eval_lookahead_perplexity": 25.269880781246492,
"eval_loss": 1.678591012954712,
"eval_perplexity": 5.358001295737592,
"eval_runtime": 520.5248,
"eval_samples_per_second": 9.606,
"eval_steps_per_second": 0.302,
"step": 260000
},
{
"base_loss": 0.29722241711616515,
"epoch": 2.0057220458984375,
"grad_norm": 0.11531388014554977,
"learning_rate": 2.515687942504883e-05,
"lookahead_loss": 3.757506604194641,
"loss": 2.0282,
"step": 260500
},
{
"base_loss": 0.29807624077796935,
"epoch": 2.0066757202148438,
"grad_norm": 0.103513203561306,
"learning_rate": 2.5109195709228516e-05,
"lookahead_loss": 3.769570269584656,
"loss": 2.0366,
"step": 261000
},
{
"base_loss": 0.3125916388332844,
"epoch": 2.00762939453125,
"grad_norm": 0.12189048528671265,
"learning_rate": 2.5061511993408203e-05,
"lookahead_loss": 3.777708933353424,
"loss": 2.0416,
"step": 261500
},
{
"base_loss": 0.31632441571354863,
"epoch": 2.0085830688476562,
"grad_norm": 0.11706750094890594,
"learning_rate": 2.5013828277587893e-05,
"lookahead_loss": 3.768247010707855,
"loss": 2.0345,
"step": 262000
},
{
"base_loss": 0.2996086142659187,
"epoch": 2.0095367431640625,
"grad_norm": 0.12055575102567673,
"learning_rate": 2.496614456176758e-05,
"lookahead_loss": 3.724477370262146,
"loss": 2.0186,
"step": 262500
},
{
"base_loss": 0.2994557471871376,
"epoch": 2.0104904174804688,
"grad_norm": 0.09918702393770218,
"learning_rate": 2.4918460845947267e-05,
"lookahead_loss": 3.7654948663711547,
"loss": 2.0302,
"step": 263000
},
{
"base_loss": 0.30171854814887045,
"epoch": 2.011444091796875,
"grad_norm": 0.10484851896762848,
"learning_rate": 2.4870777130126954e-05,
"lookahead_loss": 3.748970988750458,
"loss": 2.0291,
"step": 263500
},
{
"base_loss": 0.3268180110156536,
"epoch": 2.0123977661132812,
"grad_norm": 0.09725604206323624,
"learning_rate": 2.482309341430664e-05,
"lookahead_loss": 3.7823777060508728,
"loss": 2.0534,
"step": 264000
},
{
"base_loss": 0.30524489533901217,
"epoch": 2.0133514404296875,
"grad_norm": 0.11222957819700241,
"learning_rate": 2.477540969848633e-05,
"lookahead_loss": 3.732293013095856,
"loss": 2.0222,
"step": 264500
},
{
"base_loss": 0.29953240939974785,
"epoch": 2.0143051147460938,
"grad_norm": 0.12111784517765045,
"learning_rate": 2.4727725982666018e-05,
"lookahead_loss": 3.73520312833786,
"loss": 2.0174,
"step": 265000
},
{
"epoch": 2.0143051147460938,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.2239884416135354,
"eval_lookahead_perplexity": 25.128142711795284,
"eval_loss": 1.6757808923721313,
"eval_perplexity": 5.342965801684929,
"eval_runtime": 518.5198,
"eval_samples_per_second": 9.643,
"eval_steps_per_second": 0.303,
"step": 265000
},
{
"base_loss": 0.2956546121239662,
"epoch": 2.0152587890625,
"grad_norm": 0.11768271774053574,
"learning_rate": 2.4680042266845705e-05,
"lookahead_loss": 3.7577808418273926,
"loss": 2.0272,
"step": 265500
},
{
"base_loss": 0.31376709473133085,
"epoch": 2.0162124633789062,
"grad_norm": 0.1221555769443512,
"learning_rate": 2.463235855102539e-05,
"lookahead_loss": 3.771606719017029,
"loss": 2.0436,
"step": 266000
},
{
"base_loss": 0.31018616977334024,
"epoch": 2.0171661376953125,
"grad_norm": 0.13717730343341827,
"learning_rate": 2.4584674835205078e-05,
"lookahead_loss": 3.764633895397186,
"loss": 2.0331,
"step": 266500
},
{
"base_loss": 0.30160776057839395,
"epoch": 2.0181198120117188,
"grad_norm": 0.13953110575675964,
"learning_rate": 2.453699111938477e-05,
"lookahead_loss": 3.7366544070243837,
"loss": 2.0145,
"step": 267000
},
{
"base_loss": 0.3013426844775677,
"epoch": 2.019073486328125,
"grad_norm": 0.10401706397533417,
"learning_rate": 2.4489307403564455e-05,
"lookahead_loss": 3.773498547077179,
"loss": 2.036,
"step": 267500
},
{
"base_loss": 0.30037295311689377,
"epoch": 2.0200271606445312,
"grad_norm": 0.1209816038608551,
"learning_rate": 2.4441623687744142e-05,
"lookahead_loss": 3.7449615778923033,
"loss": 2.0239,
"step": 268000
},
{
"base_loss": 0.33094308829307556,
"epoch": 2.0209808349609375,
"grad_norm": 0.14925232529640198,
"learning_rate": 2.439393997192383e-05,
"lookahead_loss": 3.802969255924225,
"loss": 2.0617,
"step": 268500
},
{
"base_loss": 0.3023415932953358,
"epoch": 2.0219345092773438,
"grad_norm": 0.1070331484079361,
"learning_rate": 2.4346256256103516e-05,
"lookahead_loss": 3.728753103733063,
"loss": 2.0133,
"step": 269000
},
{
"base_loss": 0.30017873507738113,
"epoch": 2.02288818359375,
"grad_norm": 0.15573182702064514,
"learning_rate": 2.4298572540283206e-05,
"lookahead_loss": 3.736812686443329,
"loss": 2.0228,
"step": 269500
},
{
"base_loss": 0.30104174053668975,
"epoch": 2.0238418579101562,
"grad_norm": 0.12105967849493027,
"learning_rate": 2.4250888824462893e-05,
"lookahead_loss": 3.7435108699798585,
"loss": 2.0246,
"step": 270000
},
{
"epoch": 2.0238418579101562,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.2185772630734184,
"eval_lookahead_perplexity": 24.992537069258876,
"eval_loss": 1.6730648279190063,
"eval_perplexity": 5.328473651912189,
"eval_runtime": 520.3975,
"eval_samples_per_second": 9.608,
"eval_steps_per_second": 0.302,
"step": 270000
},
{
"base_loss": 0.32663769656419755,
"epoch": 2.0247955322265625,
"grad_norm": 0.10453636199235916,
"learning_rate": 2.420320510864258e-05,
"lookahead_loss": 3.78777219581604,
"loss": 2.0562,
"step": 270500
},
{
"base_loss": 0.3077376018166542,
"epoch": 2.0257492065429688,
"grad_norm": 0.12286706268787384,
"learning_rate": 2.4155521392822266e-05,
"lookahead_loss": 3.732728928089142,
"loss": 2.0261,
"step": 271000
},
{
"base_loss": 0.304711830675602,
"epoch": 2.026702880859375,
"grad_norm": 0.12905757129192352,
"learning_rate": 2.4107837677001953e-05,
"lookahead_loss": 3.755615324497223,
"loss": 2.0239,
"step": 271500
},
{
"base_loss": 0.30911297634243967,
"epoch": 2.0276565551757812,
"grad_norm": 0.1109541729092598,
"learning_rate": 2.406015396118164e-05,
"lookahead_loss": 3.763146149635315,
"loss": 2.0324,
"step": 272000
},
{
"base_loss": 0.3343582956790924,
"epoch": 2.0286102294921875,
"grad_norm": 0.13435722887516022,
"learning_rate": 2.401247024536133e-05,
"lookahead_loss": 3.80504735994339,
"loss": 2.0631,
"step": 272500
},
{
"base_loss": 0.30309502825140955,
"epoch": 2.0295639038085938,
"grad_norm": 0.10892323404550552,
"learning_rate": 2.3964786529541017e-05,
"lookahead_loss": 3.728642023563385,
"loss": 2.0167,
"step": 273000
},
{
"base_loss": 0.30229189068078993,
"epoch": 2.030517578125,
"grad_norm": 0.15014854073524475,
"learning_rate": 2.3917102813720704e-05,
"lookahead_loss": 3.759530487060547,
"loss": 2.0321,
"step": 273500
},
{
"base_loss": 0.3015878119468689,
"epoch": 2.0314712524414062,
"grad_norm": 0.12702982127666473,
"learning_rate": 2.386941909790039e-05,
"lookahead_loss": 3.7643159022331236,
"loss": 2.0323,
"step": 274000
},
{
"base_loss": 0.31617325788736345,
"epoch": 2.0324249267578125,
"grad_norm": 0.1333305388689041,
"learning_rate": 2.3821735382080078e-05,
"lookahead_loss": 3.7832584075927733,
"loss": 2.0536,
"step": 274500
},
{
"base_loss": 0.3030018242299557,
"epoch": 2.0333786010742188,
"grad_norm": 0.16368670761585236,
"learning_rate": 2.3774051666259768e-05,
"lookahead_loss": 3.7274523305892946,
"loss": 2.0161,
"step": 275000
},
{
"epoch": 2.0333786010742188,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.213857348353718,
"eval_lookahead_perplexity": 24.874852375000955,
"eval_loss": 1.6707115173339844,
"eval_perplexity": 5.315948841629713,
"eval_runtime": 567.9157,
"eval_samples_per_second": 8.804,
"eval_steps_per_second": 0.276,
"step": 275000
},
{
"base_loss": 0.3039812208414078,
"epoch": 2.034332275390625,
"grad_norm": 0.13603663444519043,
"learning_rate": 2.3726367950439455e-05,
"lookahead_loss": 3.7608649916648864,
"loss": 2.0336,
"step": 275500
},
{
"base_loss": 0.30951715883612635,
"epoch": 2.0352859497070312,
"grad_norm": 0.13090349733829498,
"learning_rate": 2.367868423461914e-05,
"lookahead_loss": 3.751265535354614,
"loss": 2.0318,
"step": 276000
},
{
"base_loss": 0.3263999198377132,
"epoch": 2.0362396240234375,
"grad_norm": 0.10014423727989197,
"learning_rate": 2.3631000518798828e-05,
"lookahead_loss": 3.783777579784393,
"loss": 2.0521,
"step": 276500
},
{
"base_loss": 0.3047963642179966,
"epoch": 2.0371932983398438,
"grad_norm": 0.12176624685525894,
"learning_rate": 2.3583316802978515e-05,
"lookahead_loss": 3.737890814781189,
"loss": 2.0226,
"step": 277000
},
{
"base_loss": 0.3006651694476604,
"epoch": 2.03814697265625,
"grad_norm": 0.13232555985450745,
"learning_rate": 2.3535633087158205e-05,
"lookahead_loss": 3.7341004371643067,
"loss": 2.0212,
"step": 277500
},
{
"base_loss": 0.30875834566354754,
"epoch": 2.0391006469726562,
"grad_norm": 0.12054823338985443,
"learning_rate": 2.3487949371337892e-05,
"lookahead_loss": 3.754430528640747,
"loss": 2.0355,
"step": 278000
},
{
"base_loss": 0.32003281235694886,
"epoch": 2.0400543212890625,
"grad_norm": 0.13997812569141388,
"learning_rate": 2.344026565551758e-05,
"lookahead_loss": 3.7748019156455994,
"loss": 2.0462,
"step": 278500
},
{
"base_loss": 0.3103601124882698,
"epoch": 2.0410079956054688,
"grad_norm": 0.1007557213306427,
"learning_rate": 2.3392581939697266e-05,
"lookahead_loss": 3.7388244090080263,
"loss": 2.0189,
"step": 279000
},
{
"base_loss": 0.2956464845538139,
"epoch": 2.041961669921875,
"grad_norm": 0.13650935888290405,
"learning_rate": 2.3344898223876953e-05,
"lookahead_loss": 3.751103096961975,
"loss": 2.0244,
"step": 279500
},
{
"base_loss": 0.30817501452565194,
"epoch": 2.0429153442382812,
"grad_norm": 0.28911060094833374,
"learning_rate": 2.3297214508056643e-05,
"lookahead_loss": 3.7451971626281737,
"loss": 2.0305,
"step": 280000
},
{
"epoch": 2.0429153442382812,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.208491450300613,
"eval_lookahead_perplexity": 24.74173392249817,
"eval_loss": 1.668028473854065,
"eval_perplexity": 5.3017050366693725,
"eval_runtime": 503.074,
"eval_samples_per_second": 9.939,
"eval_steps_per_second": 0.312,
"step": 280000
},
{
"base_loss": 0.326471285879612,
"epoch": 2.0438690185546875,
"grad_norm": 0.12640319764614105,
"learning_rate": 2.324953079223633e-05,
"lookahead_loss": 3.774631119728088,
"loss": 2.0557,
"step": 280500
},
{
"base_loss": 0.2915420908033848,
"epoch": 2.0448226928710938,
"grad_norm": 0.1708357036113739,
"learning_rate": 2.3201847076416016e-05,
"lookahead_loss": 3.7091518301963804,
"loss": 2.0031,
"step": 281000
},
{
"base_loss": 0.3044695939719677,
"epoch": 2.0457763671875,
"grad_norm": 0.10427648574113846,
"learning_rate": 2.3154163360595703e-05,
"lookahead_loss": 3.771734592437744,
"loss": 2.0368,
"step": 281500
},
{
"base_loss": 0.33020448702573774,
"epoch": 2.0467300415039062,
"grad_norm": 0.10459216684103012,
"learning_rate": 2.310647964477539e-05,
"lookahead_loss": 3.7720982518196107,
"loss": 2.0471,
"step": 282000
},
{
"base_loss": 0.3262847933769226,
"epoch": 2.0476837158203125,
"grad_norm": 0.19303348660469055,
"learning_rate": 2.305879592895508e-05,
"lookahead_loss": 3.781227571964264,
"loss": 2.0568,
"step": 282500
},
{
"base_loss": 0.29507314643263816,
"epoch": 2.0486373901367188,
"grad_norm": 0.10965248942375183,
"learning_rate": 2.3011112213134767e-05,
"lookahead_loss": 3.712780210018158,
"loss": 2.0076,
"step": 283000
},
{
"base_loss": 0.3051584759950638,
"epoch": 2.049591064453125,
"grad_norm": 0.10179181396961212,
"learning_rate": 2.2963428497314454e-05,
"lookahead_loss": 3.746872082710266,
"loss": 2.025,
"step": 283500
},
{
"base_loss": 0.31943696123361587,
"epoch": 2.0505447387695312,
"grad_norm": 0.1000838577747345,
"learning_rate": 2.291574478149414e-05,
"lookahead_loss": 3.772828236103058,
"loss": 2.0468,
"step": 284000
},
{
"base_loss": 0.30451616686582567,
"epoch": 2.0514984130859375,
"grad_norm": 0.14745627343654633,
"learning_rate": 2.2868061065673828e-05,
"lookahead_loss": 3.71900003194809,
"loss": 2.0156,
"step": 284500
},
{
"base_loss": 0.3078126339912415,
"epoch": 2.0524520874023438,
"grad_norm": 0.18430376052856445,
"learning_rate": 2.2820377349853518e-05,
"lookahead_loss": 3.748859833717346,
"loss": 2.0266,
"step": 285000
},
{
"epoch": 2.0524520874023438,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.2039166052882284,
"eval_lookahead_perplexity": 24.628802842522166,
"eval_loss": 1.6657613515853882,
"eval_perplexity": 5.289699037794899,
"eval_runtime": 518.0968,
"eval_samples_per_second": 9.651,
"eval_steps_per_second": 0.303,
"step": 285000
},
{
"base_loss": 0.32404885134100914,
"epoch": 2.05340576171875,
"grad_norm": 0.11742905527353287,
"learning_rate": 2.2772693634033205e-05,
"lookahead_loss": 3.7723129653930663,
"loss": 2.0421,
"step": 285500
},
{
"base_loss": 0.3565616801381111,
"epoch": 2.0543594360351562,
"grad_norm": 0.1019706130027771,
"learning_rate": 2.272500991821289e-05,
"lookahead_loss": 3.8037442412376405,
"loss": 2.0832,
"step": 286000
},
{
"base_loss": 0.29305290046334265,
"epoch": 2.0553131103515625,
"grad_norm": 0.13263586163520813,
"learning_rate": 2.2677326202392578e-05,
"lookahead_loss": 3.6994579930305482,
"loss": 2.0018,
"step": 286500
},
{
"base_loss": 0.3075584282577038,
"epoch": 2.0562667846679688,
"grad_norm": 0.10312948375940323,
"learning_rate": 2.2629642486572265e-05,
"lookahead_loss": 3.770684916973114,
"loss": 2.0401,
"step": 287000
},
{
"base_loss": 0.3192341819703579,
"epoch": 2.057220458984375,
"grad_norm": 0.12222248315811157,
"learning_rate": 2.2581958770751955e-05,
"lookahead_loss": 3.7722288217544557,
"loss": 2.0463,
"step": 287500
},
{
"base_loss": 0.32363886943459513,
"epoch": 2.0581741333007812,
"grad_norm": 0.10712938755750656,
"learning_rate": 2.2534275054931642e-05,
"lookahead_loss": 3.747902335643768,
"loss": 2.0318,
"step": 288000
},
{
"base_loss": 0.2921108921468258,
"epoch": 2.0591278076171875,
"grad_norm": 0.0956321582198143,
"learning_rate": 2.248659133911133e-05,
"lookahead_loss": 3.7060699305534364,
"loss": 2.0051,
"step": 288500
},
{
"base_loss": 0.30325917214155196,
"epoch": 2.0600814819335938,
"grad_norm": 0.09532496333122253,
"learning_rate": 2.2438907623291016e-05,
"lookahead_loss": 3.756898371219635,
"loss": 2.0312,
"step": 289000
},
{
"base_loss": 0.3216978460550308,
"epoch": 2.06103515625,
"grad_norm": 0.09469062834978104,
"learning_rate": 2.2391223907470703e-05,
"lookahead_loss": 3.760906336784363,
"loss": 2.0383,
"step": 289500
},
{
"base_loss": 0.3074358084797859,
"epoch": 2.0619888305664062,
"grad_norm": 0.12240971624851227,
"learning_rate": 2.2343540191650393e-05,
"lookahead_loss": 3.732211685180664,
"loss": 2.0106,
"step": 290000
},
{
"epoch": 2.0619888305664062,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111277975594,
"eval_base_perplexity": 1.1401970664566357,
"eval_lookahead_loss": 3.19971718346349,
"eval_lookahead_perplexity": 24.52559297291427,
"eval_loss": 1.6636607646942139,
"eval_perplexity": 5.278599227477334,
"eval_runtime": 496.2507,
"eval_samples_per_second": 10.076,
"eval_steps_per_second": 0.316,
"step": 290000
},
{
"base_loss": 0.30197526678442954,
"epoch": 1.0009536743164062,
"grad_norm": 0.11595375835895538,
"learning_rate": 2.229585647583008e-05,
"lookahead_loss": 3.7548108019828796,
"loss": 2.0242,
"step": 290500
},
{
"base_loss": 0.303896483540535,
"epoch": 1.0019073486328125,
"grad_norm": 0.1678297072649002,
"learning_rate": 2.2248172760009766e-05,
"lookahead_loss": 3.738233793735504,
"loss": 2.0227,
"step": 291000
},
{
"base_loss": 0.3094813532233238,
"epoch": 1.0028610229492188,
"grad_norm": 0.10476084798574448,
"learning_rate": 2.2200489044189453e-05,
"lookahead_loss": 3.749328505039215,
"loss": 2.0219,
"step": 291500
},
{
"base_loss": 0.3199170651733875,
"epoch": 1.003814697265625,
"grad_norm": 0.11634726822376251,
"learning_rate": 2.215280532836914e-05,
"lookahead_loss": 3.7493094959259032,
"loss": 2.0361,
"step": 292000
},
{
"base_loss": 0.30184895062446593,
"epoch": 1.0047683715820312,
"grad_norm": 0.09533069282770157,
"learning_rate": 2.210512161254883e-05,
"lookahead_loss": 3.7101937403678895,
"loss": 2.0101,
"step": 292500
},
{
"base_loss": 0.2977984355092049,
"epoch": 1.0057220458984375,
"grad_norm": 0.11591842770576477,
"learning_rate": 2.2057437896728517e-05,
"lookahead_loss": 3.733954050540924,
"loss": 2.0147,
"step": 293000
},
{
"base_loss": 0.2989386010617018,
"epoch": 1.0066757202148438,
"grad_norm": 0.1040799543261528,
"learning_rate": 2.2009754180908204e-05,
"lookahead_loss": 3.745620455265045,
"loss": 2.0247,
"step": 293500
},
{
"base_loss": 0.3137947543263435,
"epoch": 1.00762939453125,
"grad_norm": 0.12795260548591614,
"learning_rate": 2.196207046508789e-05,
"lookahead_loss": 3.755497174739838,
"loss": 2.0303,
"step": 294000
},
{
"base_loss": 0.31258952274918556,
"epoch": 1.0085830688476562,
"grad_norm": 0.12033551186323166,
"learning_rate": 2.1914386749267578e-05,
"lookahead_loss": 3.7433795986175538,
"loss": 2.0219,
"step": 294500
},
{
"base_loss": 0.3022870315015316,
"epoch": 1.0095367431640625,
"grad_norm": 0.12253336608409882,
"learning_rate": 2.1866703033447268e-05,
"lookahead_loss": 3.7020899171829225,
"loss": 2.006,
"step": 295000
},
{
"epoch": 1.0095367431640625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.195268607368104,
"eval_lookahead_perplexity": 24.416731325480427,
"eval_loss": 1.6614117622375488,
"eval_perplexity": 5.266740984454094,
"eval_runtime": 262.7535,
"eval_samples_per_second": 19.029,
"eval_steps_per_second": 0.598,
"step": 295000
},
{
"base_loss": 0.29763062146306035,
"epoch": 1.0104904174804688,
"grad_norm": 0.10351324081420898,
"learning_rate": 2.1819019317626955e-05,
"lookahead_loss": 3.7392460746765135,
"loss": 2.0173,
"step": 295500
},
{
"base_loss": 0.3011737278997898,
"epoch": 1.011444091796875,
"grad_norm": 0.10836062580347061,
"learning_rate": 2.177133560180664e-05,
"lookahead_loss": 3.725526228427887,
"loss": 2.018,
"step": 296000
},
{
"base_loss": 0.32275081843137743,
"epoch": 1.0123977661132812,
"grad_norm": 0.09027555584907532,
"learning_rate": 2.1723651885986328e-05,
"lookahead_loss": 3.756115399837494,
"loss": 2.0399,
"step": 296500
},
{
"base_loss": 0.30656733042001727,
"epoch": 1.0133514404296875,
"grad_norm": 0.11751335114240646,
"learning_rate": 2.1675968170166015e-05,
"lookahead_loss": 3.708749872684479,
"loss": 2.0119,
"step": 297000
},
{
"base_loss": 0.29944423550367355,
"epoch": 1.0143051147460938,
"grad_norm": 0.11894430220127106,
"learning_rate": 2.1628284454345705e-05,
"lookahead_loss": 3.710795109272003,
"loss": 2.0053,
"step": 297500
},
{
"base_loss": 0.29441548812389373,
"epoch": 1.0152587890625,
"grad_norm": 0.12353851646184921,
"learning_rate": 2.1580600738525392e-05,
"lookahead_loss": 3.733433099746704,
"loss": 2.0149,
"step": 298000
},
{
"base_loss": 0.31012057706713675,
"epoch": 1.0162124633789062,
"grad_norm": 0.11940598487854004,
"learning_rate": 2.153291702270508e-05,
"lookahead_loss": 3.7454442892074584,
"loss": 2.0313,
"step": 298500
},
{
"base_loss": 0.3121089872717857,
"epoch": 1.0171661376953125,
"grad_norm": 0.1333819031715393,
"learning_rate": 2.1485233306884766e-05,
"lookahead_loss": 3.7422564029693604,
"loss": 2.0218,
"step": 299000
},
{
"base_loss": 0.30401164934039115,
"epoch": 1.0181198120117188,
"grad_norm": 0.13284090161323547,
"learning_rate": 2.1437549591064453e-05,
"lookahead_loss": 3.715003073692322,
"loss": 2.0031,
"step": 299500
},
{
"base_loss": 0.29751659095287325,
"epoch": 1.019073486328125,
"grad_norm": 0.10176288336515427,
"learning_rate": 2.1389865875244143e-05,
"lookahead_loss": 3.743918424129486,
"loss": 2.0219,
"step": 300000
},
{
"epoch": 1.019073486328125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1906816144339953,
"eval_lookahead_perplexity": 24.304988429281245,
"eval_loss": 1.659116268157959,
"eval_perplexity": 5.254665077095094,
"eval_runtime": 276.7109,
"eval_samples_per_second": 18.069,
"eval_steps_per_second": 0.567,
"step": 300000
},
{
"base_loss": 0.30135778871178626,
"epoch": 1.0200271606445312,
"grad_norm": 0.12351817637681961,
"learning_rate": 2.134218215942383e-05,
"lookahead_loss": 3.722639967441559,
"loss": 2.0144,
"step": 300500
},
{
"base_loss": 0.33046225929260253,
"epoch": 1.0209808349609375,
"grad_norm": 0.14412406086921692,
"learning_rate": 2.1294498443603516e-05,
"lookahead_loss": 3.7789668612480165,
"loss": 2.0485,
"step": 301000
},
{
"base_loss": 0.30414657789468763,
"epoch": 1.0219345092773438,
"grad_norm": 0.10363082587718964,
"learning_rate": 2.1246814727783203e-05,
"lookahead_loss": 3.7076259078979494,
"loss": 2.0031,
"step": 301500
},
{
"base_loss": 0.30107878148555756,
"epoch": 1.02288818359375,
"grad_norm": 0.1513744294643402,
"learning_rate": 2.119913101196289e-05,
"lookahead_loss": 3.7153729853630066,
"loss": 2.0111,
"step": 302000
},
{
"base_loss": 0.3019954281449318,
"epoch": 1.0238418579101562,
"grad_norm": 0.10837174206972122,
"learning_rate": 2.115144729614258e-05,
"lookahead_loss": 3.721953295707703,
"loss": 2.014,
"step": 302500
},
{
"base_loss": 0.3265440165698528,
"epoch": 1.0247955322265625,
"grad_norm": 0.11248663067817688,
"learning_rate": 2.1103763580322267e-05,
"lookahead_loss": 3.766532400608063,
"loss": 2.0446,
"step": 303000
},
{
"base_loss": 0.3089427370727062,
"epoch": 1.0257492065429688,
"grad_norm": 0.12824861705303192,
"learning_rate": 2.1056079864501954e-05,
"lookahead_loss": 3.712160005092621,
"loss": 2.0148,
"step": 303500
},
{
"base_loss": 0.306296229749918,
"epoch": 1.026702880859375,
"grad_norm": 0.13572391867637634,
"learning_rate": 2.100839614868164e-05,
"lookahead_loss": 3.7354738450050355,
"loss": 2.0129,
"step": 304000
},
{
"base_loss": 0.30920383241772653,
"epoch": 1.0276565551757812,
"grad_norm": 0.11274685710668564,
"learning_rate": 2.0960712432861328e-05,
"lookahead_loss": 3.7408050112724305,
"loss": 2.0211,
"step": 304500
},
{
"base_loss": 0.33220478031039236,
"epoch": 1.0286102294921875,
"grad_norm": 0.12302955985069275,
"learning_rate": 2.0913028717041018e-05,
"lookahead_loss": 3.7831042833328246,
"loss": 2.0522,
"step": 305000
},
{
"epoch": 1.0286102294921875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.186847817021818,
"eval_lookahead_perplexity": 24.211986416887246,
"eval_loss": 1.657226800918579,
"eval_perplexity": 5.24474593347748,
"eval_runtime": 260.7532,
"eval_samples_per_second": 19.175,
"eval_steps_per_second": 0.602,
"step": 305000
},
{
"base_loss": 0.30326411041617396,
"epoch": 1.0295639038085938,
"grad_norm": 0.11241597682237625,
"learning_rate": 2.0865345001220705e-05,
"lookahead_loss": 3.7075537395477296,
"loss": 2.0049,
"step": 305500
},
{
"base_loss": 0.3031138954460621,
"epoch": 1.030517578125,
"grad_norm": 0.14674971997737885,
"learning_rate": 2.081766128540039e-05,
"lookahead_loss": 3.736467706680298,
"loss": 2.0225,
"step": 306000
},
{
"base_loss": 0.30234866255521775,
"epoch": 1.0314712524414062,
"grad_norm": 0.12188810110092163,
"learning_rate": 2.0769977569580078e-05,
"lookahead_loss": 3.742179157733917,
"loss": 2.0214,
"step": 306500
},
{
"base_loss": 0.3155796425938606,
"epoch": 1.0324249267578125,
"grad_norm": 0.13792704045772552,
"learning_rate": 2.0722293853759765e-05,
"lookahead_loss": 3.759408875465393,
"loss": 2.0432,
"step": 307000
},
{
"base_loss": 0.3022744803726673,
"epoch": 1.0333786010742188,
"grad_norm": 0.15973329544067383,
"learning_rate": 2.0674610137939455e-05,
"lookahead_loss": 3.7038854308128357,
"loss": 2.0042,
"step": 307500
},
{
"base_loss": 0.30410280799865724,
"epoch": 1.034332275390625,
"grad_norm": 0.1354731023311615,
"learning_rate": 2.0626926422119142e-05,
"lookahead_loss": 3.73946187210083,
"loss": 2.0229,
"step": 308000
},
{
"base_loss": 0.3077150760293007,
"epoch": 1.0352859497070312,
"grad_norm": 0.12362109869718552,
"learning_rate": 2.057924270629883e-05,
"lookahead_loss": 3.7301273741722105,
"loss": 2.0218,
"step": 308500
},
{
"base_loss": 0.3269314341843128,
"epoch": 1.0362396240234375,
"grad_norm": 0.10148072987794876,
"learning_rate": 2.0531558990478516e-05,
"lookahead_loss": 3.7653005418777465,
"loss": 2.0419,
"step": 309000
},
{
"base_loss": 0.30525318866968154,
"epoch": 1.0371932983398438,
"grad_norm": 0.10899261385202408,
"learning_rate": 2.0483875274658203e-05,
"lookahead_loss": 3.7169791021347045,
"loss": 2.0118,
"step": 309500
},
{
"base_loss": 0.3003401378691196,
"epoch": 1.03814697265625,
"grad_norm": 0.13924409449100494,
"learning_rate": 2.0436191558837893e-05,
"lookahead_loss": 3.7134488053321837,
"loss": 2.0096,
"step": 310000
},
{
"epoch": 1.03814697265625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1832272579875616,
"eval_lookahead_perplexity": 24.124483990164148,
"eval_loss": 1.6554052829742432,
"eval_perplexity": 5.235201230209359,
"eval_runtime": 270.7386,
"eval_samples_per_second": 18.468,
"eval_steps_per_second": 0.58,
"step": 310000
},
{
"base_loss": 0.3116057882905006,
"epoch": 1.0391006469726562,
"grad_norm": 0.12299172580242157,
"learning_rate": 2.038850784301758e-05,
"lookahead_loss": 3.7361023359298704,
"loss": 2.0261,
"step": 310500
},
{
"base_loss": 0.31716569018363955,
"epoch": 1.0400543212890625,
"grad_norm": 0.15561062097549438,
"learning_rate": 2.0340824127197266e-05,
"lookahead_loss": 3.752716485500336,
"loss": 2.0352,
"step": 311000
},
{
"base_loss": 0.31002197673916815,
"epoch": 1.0410079956054688,
"grad_norm": 0.1071806252002716,
"learning_rate": 2.0293140411376953e-05,
"lookahead_loss": 3.717531894683838,
"loss": 2.0087,
"step": 311500
},
{
"base_loss": 0.29521755149960516,
"epoch": 1.041961669921875,
"grad_norm": 0.1366235613822937,
"learning_rate": 2.024545669555664e-05,
"lookahead_loss": 3.729984639644623,
"loss": 2.014,
"step": 312000
},
{
"base_loss": 0.30736519694328307,
"epoch": 1.0429153442382812,
"grad_norm": 0.2894051969051361,
"learning_rate": 2.019777297973633e-05,
"lookahead_loss": 3.724388958930969,
"loss": 2.0196,
"step": 312500
},
{
"base_loss": 0.3271687869429588,
"epoch": 1.0438690185546875,
"grad_norm": 0.13099634647369385,
"learning_rate": 2.0150089263916017e-05,
"lookahead_loss": 3.7537019534111025,
"loss": 2.0458,
"step": 313000
},
{
"base_loss": 0.2943850245475769,
"epoch": 1.0448226928710938,
"grad_norm": 0.17744140326976776,
"learning_rate": 2.0102405548095704e-05,
"lookahead_loss": 3.6922522506713866,
"loss": 1.9939,
"step": 313500
},
{
"base_loss": 0.30418619123101237,
"epoch": 1.0457763671875,
"grad_norm": 0.10149285197257996,
"learning_rate": 2.005472183227539e-05,
"lookahead_loss": 3.7536733145713805,
"loss": 2.0266,
"step": 314000
},
{
"base_loss": 0.32734892451763153,
"epoch": 1.0467300415039062,
"grad_norm": 0.10566597431898117,
"learning_rate": 2.0007038116455078e-05,
"lookahead_loss": 3.7515933270454407,
"loss": 2.037,
"step": 314500
},
{
"base_loss": 0.32642096510529517,
"epoch": 1.0476837158203125,
"grad_norm": 0.18560998141765594,
"learning_rate": 1.9959354400634768e-05,
"lookahead_loss": 3.7600359020233154,
"loss": 2.0466,
"step": 315000
},
{
"epoch": 1.0476837158203125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1796208700052087,
"eval_lookahead_perplexity": 24.037638434531967,
"eval_loss": 1.6536099910736084,
"eval_perplexity": 5.225810947513935,
"eval_runtime": 273.2772,
"eval_samples_per_second": 18.296,
"eval_steps_per_second": 0.575,
"step": 315000
},
{
"base_loss": 0.29649946457147597,
"epoch": 1.0486373901367188,
"grad_norm": 0.10538148880004883,
"learning_rate": 1.9911670684814455e-05,
"lookahead_loss": 3.694501425266266,
"loss": 1.9992,
"step": 315500
},
{
"base_loss": 0.3057677939236164,
"epoch": 1.049591064453125,
"grad_norm": 0.10804512351751328,
"learning_rate": 1.986398696899414e-05,
"lookahead_loss": 3.7278350348472595,
"loss": 2.0164,
"step": 316000
},
{
"base_loss": 0.3218669015169144,
"epoch": 1.0505447387695312,
"grad_norm": 0.0984341949224472,
"learning_rate": 1.9816303253173828e-05,
"lookahead_loss": 3.7570176639556885,
"loss": 2.0388,
"step": 316500
},
{
"base_loss": 0.308034790366888,
"epoch": 1.0514984130859375,
"grad_norm": 0.14431750774383545,
"learning_rate": 1.9768619537353515e-05,
"lookahead_loss": 3.701095435142517,
"loss": 2.0069,
"step": 317000
},
{
"base_loss": 0.30695659655332563,
"epoch": 1.0524520874023438,
"grad_norm": 0.18022307753562927,
"learning_rate": 1.9720935821533205e-05,
"lookahead_loss": 3.726999051570892,
"loss": 2.0167,
"step": 317500
},
{
"base_loss": 0.3215196977555752,
"epoch": 1.05340576171875,
"grad_norm": 0.12188173830509186,
"learning_rate": 1.9673252105712892e-05,
"lookahead_loss": 3.7509958362579345,
"loss": 2.0307,
"step": 318000
},
{
"base_loss": 0.35528673872351646,
"epoch": 1.0543594360351562,
"grad_norm": 0.10480683296918869,
"learning_rate": 1.962556838989258e-05,
"lookahead_loss": 3.7855782594680787,
"loss": 2.0744,
"step": 318500
},
{
"base_loss": 0.2939756731390953,
"epoch": 1.0553131103515625,
"grad_norm": 0.13534528017044067,
"learning_rate": 1.9577884674072266e-05,
"lookahead_loss": 3.680368016242981,
"loss": 1.9924,
"step": 319000
},
{
"base_loss": 0.30533788445591925,
"epoch": 1.0562667846679688,
"grad_norm": 0.10960806906223297,
"learning_rate": 1.9530200958251953e-05,
"lookahead_loss": 3.7495523767471313,
"loss": 2.0292,
"step": 319500
},
{
"base_loss": 0.3139654756486416,
"epoch": 1.057220458984375,
"grad_norm": 0.11507224291563034,
"learning_rate": 1.9482517242431643e-05,
"lookahead_loss": 3.747137411594391,
"loss": 2.0341,
"step": 320000
},
{
"epoch": 1.057220458984375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1759323974767812,
"eval_lookahead_perplexity": 23.94913957865158,
"eval_loss": 1.6517695188522339,
"eval_perplexity": 5.216201832997494,
"eval_runtime": 259.5136,
"eval_samples_per_second": 19.267,
"eval_steps_per_second": 0.605,
"step": 320000
},
{
"base_loss": 0.32300865882635116,
"epoch": 1.0581741333007812,
"grad_norm": 0.10415331274271011,
"learning_rate": 1.943483352661133e-05,
"lookahead_loss": 3.728764622211456,
"loss": 2.0226,
"step": 320500
},
{
"base_loss": 0.28907309558987615,
"epoch": 1.0591278076171875,
"grad_norm": 0.0982637032866478,
"learning_rate": 1.9387149810791016e-05,
"lookahead_loss": 3.6841273069381715,
"loss": 1.9931,
"step": 321000
},
{
"base_loss": 0.30057010012865065,
"epoch": 1.0600814819335938,
"grad_norm": 0.09693081676959991,
"learning_rate": 1.9339466094970703e-05,
"lookahead_loss": 3.735932330608368,
"loss": 2.0199,
"step": 321500
},
{
"base_loss": 0.32112616834044455,
"epoch": 1.06103515625,
"grad_norm": 0.09655743092298508,
"learning_rate": 1.929178237915039e-05,
"lookahead_loss": 3.740573130607605,
"loss": 2.0288,
"step": 322000
},
{
"base_loss": 0.3060283879637718,
"epoch": 1.0619888305664062,
"grad_norm": 0.1286919116973877,
"learning_rate": 1.924409866333008e-05,
"lookahead_loss": 3.712129928588867,
"loss": 1.9997,
"step": 322500
},
{
"base_loss": 0.31152518782019617,
"epoch": 1.0629425048828125,
"grad_norm": 0.11635252833366394,
"learning_rate": 1.9196414947509767e-05,
"lookahead_loss": 3.7569445767402647,
"loss": 2.0279,
"step": 323000
},
{
"base_loss": 0.3149063532948494,
"epoch": 1.0638961791992188,
"grad_norm": 0.10551901161670685,
"learning_rate": 1.9148731231689454e-05,
"lookahead_loss": 3.7492296714782714,
"loss": 2.03,
"step": 323500
},
{
"base_loss": 0.30411062452197074,
"epoch": 1.064849853515625,
"grad_norm": 0.12699651718139648,
"learning_rate": 1.910104751586914e-05,
"lookahead_loss": 3.7036221413612367,
"loss": 2.0074,
"step": 324000
},
{
"base_loss": 0.30933507332205773,
"epoch": 1.0658035278320312,
"grad_norm": 0.1357489973306656,
"learning_rate": 1.9053363800048828e-05,
"lookahead_loss": 3.7238325271606447,
"loss": 2.0153,
"step": 324500
},
{
"base_loss": 0.30638799047470094,
"epoch": 1.0667572021484375,
"grad_norm": 0.104975126683712,
"learning_rate": 1.9005680084228518e-05,
"lookahead_loss": 3.7292118496894835,
"loss": 2.017,
"step": 325000
},
{
"epoch": 1.0667572021484375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1727946909090963,
"eval_lookahead_perplexity": 23.87411197496282,
"eval_loss": 1.6501920223236084,
"eval_perplexity": 5.207979779548762,
"eval_runtime": 269.7309,
"eval_samples_per_second": 18.537,
"eval_steps_per_second": 0.582,
"step": 325000
},
{
"base_loss": 0.32938760298490527,
"epoch": 1.0677108764648438,
"grad_norm": 0.15414147078990936,
"learning_rate": 1.8957996368408205e-05,
"lookahead_loss": 3.7661532316207884,
"loss": 2.0456,
"step": 325500
},
{
"base_loss": 0.29950347980856895,
"epoch": 1.06866455078125,
"grad_norm": 0.13640785217285156,
"learning_rate": 1.891031265258789e-05,
"lookahead_loss": 3.6806137619018555,
"loss": 1.9879,
"step": 326000
},
{
"base_loss": 0.30374919882416723,
"epoch": 1.0696182250976562,
"grad_norm": 0.1291172206401825,
"learning_rate": 1.8862628936767578e-05,
"lookahead_loss": 3.753649913311005,
"loss": 2.0277,
"step": 326500
},
{
"base_loss": 0.34455711591243743,
"epoch": 1.0705718994140625,
"grad_norm": 0.11020272970199585,
"learning_rate": 1.8814945220947265e-05,
"lookahead_loss": 3.7804429450035095,
"loss": 2.066,
"step": 327000
},
{
"base_loss": 0.31508783569931986,
"epoch": 1.0715255737304688,
"grad_norm": 0.14544732868671417,
"learning_rate": 1.8767261505126955e-05,
"lookahead_loss": 3.710645009994507,
"loss": 2.0101,
"step": 327500
},
{
"base_loss": 0.3064769520163536,
"epoch": 1.072479248046875,
"grad_norm": 0.12241372466087341,
"learning_rate": 1.8719577789306642e-05,
"lookahead_loss": 3.7174899125099183,
"loss": 2.0103,
"step": 328000
},
{
"base_loss": 0.3032188524603844,
"epoch": 1.0734329223632812,
"grad_norm": 0.09545071423053741,
"learning_rate": 1.867189407348633e-05,
"lookahead_loss": 3.723200294494629,
"loss": 2.017,
"step": 328500
},
{
"base_loss": 0.3287756524384022,
"epoch": 1.0743865966796875,
"grad_norm": 0.111233189702034,
"learning_rate": 1.8624210357666016e-05,
"lookahead_loss": 3.746195571899414,
"loss": 2.0363,
"step": 329000
},
{
"base_loss": 0.30459495696425437,
"epoch": 1.0753402709960938,
"grad_norm": 0.12619462609291077,
"learning_rate": 1.8576526641845703e-05,
"lookahead_loss": 3.699687201976776,
"loss": 1.9999,
"step": 329500
},
{
"base_loss": 0.3038223915994167,
"epoch": 1.0762939453125,
"grad_norm": 0.15558308362960815,
"learning_rate": 1.8528842926025393e-05,
"lookahead_loss": 3.7490292925834656,
"loss": 2.0269,
"step": 330000
},
{
"epoch": 1.0762939453125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1694573937132717,
"eval_lookahead_perplexity": 23.79456976983049,
"eval_loss": 1.6485565900802612,
"eval_perplexity": 5.1994694424312895,
"eval_runtime": 269.5983,
"eval_samples_per_second": 18.546,
"eval_steps_per_second": 0.582,
"step": 330000
},
{
"base_loss": 0.33005355006456377,
"epoch": 1.0772476196289062,
"grad_norm": 0.14846207201480865,
"learning_rate": 1.848115921020508e-05,
"lookahead_loss": 3.757576567649841,
"loss": 2.0498,
"step": 330500
},
{
"base_loss": 0.3011254695951939,
"epoch": 1.0782012939453125,
"grad_norm": 0.12544012069702148,
"learning_rate": 1.8433475494384766e-05,
"lookahead_loss": 3.6908311409950256,
"loss": 2.0027,
"step": 331000
},
{
"base_loss": 0.30074691036343576,
"epoch": 1.0791549682617188,
"grad_norm": 0.12219205498695374,
"learning_rate": 1.8385791778564453e-05,
"lookahead_loss": 3.723267038345337,
"loss": 2.0147,
"step": 331500
},
{
"base_loss": 0.31164744511246684,
"epoch": 1.080108642578125,
"grad_norm": 0.11832752823829651,
"learning_rate": 1.833810806274414e-05,
"lookahead_loss": 3.748631308555603,
"loss": 2.0351,
"step": 332000
},
{
"base_loss": 0.3218230297267437,
"epoch": 1.0810623168945312,
"grad_norm": 0.15903718769550323,
"learning_rate": 1.829042434692383e-05,
"lookahead_loss": 3.7279847102165222,
"loss": 2.0282,
"step": 332500
},
{
"base_loss": 0.30160315957665446,
"epoch": 1.0820159912109375,
"grad_norm": 0.13598766922950745,
"learning_rate": 1.8242740631103517e-05,
"lookahead_loss": 3.701110266685486,
"loss": 2.0043,
"step": 333000
},
{
"base_loss": 0.3046890263557434,
"epoch": 1.0829696655273438,
"grad_norm": 0.10043879598379135,
"learning_rate": 1.8195056915283204e-05,
"lookahead_loss": 3.7326065835952758,
"loss": 2.0182,
"step": 333500
},
{
"base_loss": 0.3337951873242855,
"epoch": 1.08392333984375,
"grad_norm": 0.11642859876155853,
"learning_rate": 1.814737319946289e-05,
"lookahead_loss": 3.7617497820854187,
"loss": 2.0431,
"step": 334000
},
{
"base_loss": 0.30679659196734427,
"epoch": 1.0848770141601562,
"grad_norm": 0.14971092343330383,
"learning_rate": 1.8099689483642578e-05,
"lookahead_loss": 3.7082361845970153,
"loss": 2.0092,
"step": 334500
},
{
"base_loss": 0.29602449855208396,
"epoch": 1.0858306884765625,
"grad_norm": 0.10793782025575638,
"learning_rate": 1.8052005767822268e-05,
"lookahead_loss": 3.712603385448456,
"loss": 2.003,
"step": 335000
},
{
"epoch": 1.0858306884765625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.16634484906547,
"eval_lookahead_perplexity": 23.72062324966851,
"eval_loss": 1.646978497505188,
"eval_perplexity": 5.191270669222933,
"eval_runtime": 258.8839,
"eval_samples_per_second": 19.314,
"eval_steps_per_second": 0.606,
"step": 335000
},
{
"base_loss": 0.30177975767850873,
"epoch": 1.0867843627929688,
"grad_norm": 0.4744811952114105,
"learning_rate": 1.8004322052001955e-05,
"lookahead_loss": 3.7170921821594236,
"loss": 2.0061,
"step": 335500
},
{
"base_loss": 0.3368028699159622,
"epoch": 1.087738037109375,
"grad_norm": 0.10788233578205109,
"learning_rate": 1.795663833618164e-05,
"lookahead_loss": 3.7672131505012514,
"loss": 2.0461,
"step": 336000
},
{
"base_loss": 0.30262881484627724,
"epoch": 1.0886917114257812,
"grad_norm": 0.12896187603473663,
"learning_rate": 1.7908954620361328e-05,
"lookahead_loss": 3.6916392154693605,
"loss": 1.9969,
"step": 336500
},
{
"base_loss": 0.3077995398044586,
"epoch": 1.0896453857421875,
"grad_norm": 0.10718485713005066,
"learning_rate": 1.7861270904541015e-05,
"lookahead_loss": 3.688416923999786,
"loss": 1.9978,
"step": 337000
},
{
"base_loss": 0.2990732188224792,
"epoch": 1.0905990600585938,
"grad_norm": 0.10033638030290604,
"learning_rate": 1.7813587188720705e-05,
"lookahead_loss": 3.7155744066238405,
"loss": 2.0097,
"step": 337500
},
{
"base_loss": 0.2992991936802864,
"epoch": 1.091552734375,
"grad_norm": 0.10070586949586868,
"learning_rate": 1.7765903472900392e-05,
"lookahead_loss": 3.716888844013214,
"loss": 2.0061,
"step": 338000
},
{
"base_loss": 0.32812165850400926,
"epoch": 1.0925064086914062,
"grad_norm": 0.1552288979291916,
"learning_rate": 1.771821975708008e-05,
"lookahead_loss": 3.7439418969154357,
"loss": 2.0298,
"step": 338500
},
{
"base_loss": 0.30864692279696465,
"epoch": 1.0934600830078125,
"grad_norm": 0.1215750053524971,
"learning_rate": 1.7670536041259766e-05,
"lookahead_loss": 3.700509956359863,
"loss": 2.008,
"step": 339000
},
{
"base_loss": 0.2887690741121769,
"epoch": 1.0944137573242188,
"grad_norm": 0.16854335367679596,
"learning_rate": 1.7622852325439453e-05,
"lookahead_loss": 3.6794085698127748,
"loss": 1.9825,
"step": 339500
},
{
"base_loss": 0.2968901333212852,
"epoch": 1.095367431640625,
"grad_norm": 0.10205203294754028,
"learning_rate": 1.7575168609619143e-05,
"lookahead_loss": 3.6928190813064576,
"loss": 1.9965,
"step": 340000
},
{
"epoch": 1.095367431640625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1630953965476527,
"eval_lookahead_perplexity": 23.64366930752887,
"eval_loss": 1.6453267335891724,
"eval_perplexity": 5.1827029934899995,
"eval_runtime": 270.7892,
"eval_samples_per_second": 18.465,
"eval_steps_per_second": 0.58,
"step": 340000
},
{
"base_loss": 0.2998797046840191,
"epoch": 1.0963211059570312,
"grad_norm": 0.2383318990468979,
"learning_rate": 1.752748489379883e-05,
"lookahead_loss": 3.701192867279053,
"loss": 2.0021,
"step": 340500
},
{
"base_loss": 0.3316424978226423,
"epoch": 1.0972747802734375,
"grad_norm": 0.10822620987892151,
"learning_rate": 1.7479801177978516e-05,
"lookahead_loss": 3.744793387889862,
"loss": 2.033,
"step": 341000
},
{
"base_loss": 0.2931605673134327,
"epoch": 1.0982284545898438,
"grad_norm": 0.1306353211402893,
"learning_rate": 1.7432117462158203e-05,
"lookahead_loss": 3.670068524837494,
"loss": 1.9853,
"step": 341500
},
{
"base_loss": 0.2951606792807579,
"epoch": 1.09918212890625,
"grad_norm": 0.09544999897480011,
"learning_rate": 1.738443374633789e-05,
"lookahead_loss": 3.7054534935951233,
"loss": 2.0026,
"step": 342000
},
{
"base_loss": 0.3005833325088024,
"epoch": 1.1001358032226562,
"grad_norm": 0.1566513329744339,
"learning_rate": 1.733675003051758e-05,
"lookahead_loss": 3.712607653141022,
"loss": 2.0106,
"step": 342500
},
{
"base_loss": 0.320584302932024,
"epoch": 1.1010894775390625,
"grad_norm": 0.10093113034963608,
"learning_rate": 1.7289066314697267e-05,
"lookahead_loss": 3.7455484852790835,
"loss": 2.0294,
"step": 343000
},
{
"base_loss": 0.3009133404493332,
"epoch": 1.1020431518554688,
"grad_norm": 0.14453580975532532,
"learning_rate": 1.7241382598876954e-05,
"lookahead_loss": 3.6938126912117006,
"loss": 1.9963,
"step": 343500
},
{
"base_loss": 0.2956097418367863,
"epoch": 1.102996826171875,
"grad_norm": 0.11491036415100098,
"learning_rate": 1.719369888305664e-05,
"lookahead_loss": 3.6890286202430724,
"loss": 1.9998,
"step": 344000
},
{
"base_loss": 0.297827641248703,
"epoch": 1.1039505004882812,
"grad_norm": 0.096994549036026,
"learning_rate": 1.7146015167236328e-05,
"lookahead_loss": 3.7211020727157593,
"loss": 2.012,
"step": 344500
},
{
"base_loss": 0.31351589208841324,
"epoch": 1.1049041748046875,
"grad_norm": 0.11693672835826874,
"learning_rate": 1.7098331451416018e-05,
"lookahead_loss": 3.7185617332458496,
"loss": 2.0197,
"step": 345000
},
{
"epoch": 1.1049041748046875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1599354583996173,
"eval_lookahead_perplexity": 23.569074694176884,
"eval_loss": 1.6437718868255615,
"eval_perplexity": 5.174650945986001,
"eval_runtime": 258.3426,
"eval_samples_per_second": 19.354,
"eval_steps_per_second": 0.608,
"step": 345000
},
{
"base_loss": 0.30757557436823846,
"epoch": 1.1058578491210938,
"grad_norm": 0.15182138979434967,
"learning_rate": 1.7050647735595705e-05,
"lookahead_loss": 3.69487420463562,
"loss": 1.9979,
"step": 345500
},
{
"base_loss": 0.2956553302705288,
"epoch": 1.1068115234375,
"grad_norm": 0.11485382914543152,
"learning_rate": 1.700296401977539e-05,
"lookahead_loss": 3.681716691493988,
"loss": 1.99,
"step": 346000
},
{
"base_loss": 0.29734967839717863,
"epoch": 1.1077651977539062,
"grad_norm": 0.15719352662563324,
"learning_rate": 1.6955280303955078e-05,
"lookahead_loss": 3.707757304191589,
"loss": 2.0001,
"step": 346500
},
{
"base_loss": 0.32284524619579313,
"epoch": 1.1087188720703125,
"grad_norm": 0.12326517701148987,
"learning_rate": 1.6907596588134765e-05,
"lookahead_loss": 3.7393799867630007,
"loss": 2.0291,
"step": 347000
},
{
"base_loss": 0.3172047883272171,
"epoch": 1.1096725463867188,
"grad_norm": 0.11677565425634384,
"learning_rate": 1.6859912872314455e-05,
"lookahead_loss": 3.702916480064392,
"loss": 2.0123,
"step": 347500
},
{
"base_loss": 0.29914562621712687,
"epoch": 1.110626220703125,
"grad_norm": 0.10666316747665405,
"learning_rate": 1.6812229156494142e-05,
"lookahead_loss": 3.679292615890503,
"loss": 1.9916,
"step": 348000
},
{
"base_loss": 0.2961756982207298,
"epoch": 1.1115798950195312,
"grad_norm": 0.2921413481235504,
"learning_rate": 1.676454544067383e-05,
"lookahead_loss": 3.7147518510818482,
"loss": 2.0041,
"step": 348500
},
{
"base_loss": 0.3077882871925831,
"epoch": 1.1125335693359375,
"grad_norm": 0.14457540214061737,
"learning_rate": 1.6716861724853516e-05,
"lookahead_loss": 3.7223079199790954,
"loss": 2.0166,
"step": 349000
},
{
"base_loss": 0.34582618343830107,
"epoch": 1.1134872436523438,
"grad_norm": 0.11859697103500366,
"learning_rate": 1.6669178009033203e-05,
"lookahead_loss": 3.7550181512832643,
"loss": 2.0428,
"step": 349500
},
{
"base_loss": 0.2972444402873516,
"epoch": 1.11444091796875,
"grad_norm": 0.1123783141374588,
"learning_rate": 1.6621494293212893e-05,
"lookahead_loss": 3.675015371799469,
"loss": 1.9874,
"step": 350000
},
{
"epoch": 1.11444091796875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.15782101618977,
"eval_lookahead_perplexity": 23.519291897767186,
"eval_loss": 1.6427289247512817,
"eval_perplexity": 5.1692567947382555,
"eval_runtime": 269.8055,
"eval_samples_per_second": 18.532,
"eval_steps_per_second": 0.582,
"step": 350000
},
{
"base_loss": 0.29772910493612287,
"epoch": 1.1153945922851562,
"grad_norm": 0.3871125280857086,
"learning_rate": 1.657381057739258e-05,
"lookahead_loss": 3.7167241282463075,
"loss": 2.0107,
"step": 350500
},
{
"base_loss": 0.31204655358195305,
"epoch": 1.1163482666015625,
"grad_norm": 0.166167750954628,
"learning_rate": 1.6526126861572266e-05,
"lookahead_loss": 3.7098571991920473,
"loss": 2.0117,
"step": 351000
},
{
"base_loss": 0.3257904815077782,
"epoch": 1.1173019409179688,
"grad_norm": 0.09978712350130081,
"learning_rate": 1.6478443145751953e-05,
"lookahead_loss": 3.7479370784759523,
"loss": 2.0397,
"step": 351500
},
{
"base_loss": 0.30611594703793527,
"epoch": 1.118255615234375,
"grad_norm": 0.11194101721048355,
"learning_rate": 1.643075942993164e-05,
"lookahead_loss": 3.7034087748527527,
"loss": 2.0015,
"step": 352000
},
{
"base_loss": 0.29987680965662,
"epoch": 1.1192092895507812,
"grad_norm": 0.13494464755058289,
"learning_rate": 1.638307571411133e-05,
"lookahead_loss": 3.7097480974197388,
"loss": 2.0036,
"step": 352500
},
{
"base_loss": 0.3036254093050957,
"epoch": 2.0009536743164062,
"grad_norm": 0.11621030420064926,
"learning_rate": 1.6335391998291017e-05,
"lookahead_loss": 3.721737900733948,
"loss": 2.0079,
"step": 353000
},
{
"base_loss": 0.3016641443669796,
"epoch": 2.0019073486328125,
"grad_norm": 0.17441171407699585,
"learning_rate": 1.6287708282470704e-05,
"lookahead_loss": 3.704204406738281,
"loss": 2.0054,
"step": 353500
},
{
"base_loss": 0.31078750917315484,
"epoch": 2.0028610229492188,
"grad_norm": 0.10131888836622238,
"learning_rate": 1.624002456665039e-05,
"lookahead_loss": 3.7189337663650512,
"loss": 2.0074,
"step": 354000
},
{
"base_loss": 0.3186642001867294,
"epoch": 2.003814697265625,
"grad_norm": 0.11608010530471802,
"learning_rate": 1.6192340850830078e-05,
"lookahead_loss": 3.717122416496277,
"loss": 2.021,
"step": 354500
},
{
"base_loss": 0.29983767235279085,
"epoch": 2.0047683715820312,
"grad_norm": 0.10390781611204147,
"learning_rate": 1.6144657135009768e-05,
"lookahead_loss": 3.6769887518882753,
"loss": 1.9942,
"step": 355000
},
{
"epoch": 2.0047683715820312,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1547467457219813,
"eval_lookahead_perplexity": 23.447098261503868,
"eval_loss": 1.6411741971969604,
"eval_perplexity": 5.1612262530339486,
"eval_runtime": 273.268,
"eval_samples_per_second": 18.297,
"eval_steps_per_second": 0.575,
"step": 355000
},
{
"base_loss": 0.29767213702201845,
"epoch": 2.0057220458984375,
"grad_norm": 0.1111893281340599,
"learning_rate": 1.6096973419189455e-05,
"lookahead_loss": 3.703052345275879,
"loss": 2.0003,
"step": 355500
},
{
"base_loss": 0.2978040435314179,
"epoch": 2.0066757202148438,
"grad_norm": 0.1064620316028595,
"learning_rate": 1.604928970336914e-05,
"lookahead_loss": 3.7156985325813294,
"loss": 2.0087,
"step": 356000
},
{
"base_loss": 0.3102393752634525,
"epoch": 2.00762939453125,
"grad_norm": 0.11691620200872421,
"learning_rate": 1.6001605987548828e-05,
"lookahead_loss": 3.7233915762901306,
"loss": 2.015,
"step": 356500
},
{
"base_loss": 0.3170112347304821,
"epoch": 2.0085830688476562,
"grad_norm": 0.11584022641181946,
"learning_rate": 1.5953922271728515e-05,
"lookahead_loss": 3.716030221939087,
"loss": 2.0091,
"step": 357000
},
{
"base_loss": 0.3013848161697388,
"epoch": 2.0095367431640625,
"grad_norm": 0.1189672127366066,
"learning_rate": 1.5906238555908205e-05,
"lookahead_loss": 3.673254894256592,
"loss": 1.9913,
"step": 357500
},
{
"base_loss": 0.3008648828268051,
"epoch": 2.0104904174804688,
"grad_norm": 0.10629413276910782,
"learning_rate": 1.5858554840087892e-05,
"lookahead_loss": 3.7115158891677855,
"loss": 2.003,
"step": 358000
},
{
"base_loss": 0.3007318134009838,
"epoch": 2.011444091796875,
"grad_norm": 0.11223334819078445,
"learning_rate": 1.581087112426758e-05,
"lookahead_loss": 3.6943654375076296,
"loss": 2.0027,
"step": 358500
},
{
"base_loss": 0.3253892393708229,
"epoch": 2.0123977661132812,
"grad_norm": 0.09513326734304428,
"learning_rate": 1.5763187408447266e-05,
"lookahead_loss": 3.7291233649253845,
"loss": 2.0275,
"step": 359000
},
{
"base_loss": 0.3064581930339336,
"epoch": 2.0133514404296875,
"grad_norm": 0.12533146142959595,
"learning_rate": 1.5715503692626953e-05,
"lookahead_loss": 3.68066414642334,
"loss": 1.998,
"step": 359500
},
{
"base_loss": 0.30020537215471266,
"epoch": 2.0143051147460938,
"grad_norm": 0.1161990836262703,
"learning_rate": 1.5667819976806643e-05,
"lookahead_loss": 3.682435676574707,
"loss": 1.9911,
"step": 360000
},
{
"epoch": 2.0143051147460938,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.152189729312738,
"eval_lookahead_perplexity": 23.387220233675123,
"eval_loss": 1.6399027109146118,
"eval_perplexity": 5.154667994904291,
"eval_runtime": 259.7548,
"eval_samples_per_second": 19.249,
"eval_steps_per_second": 0.604,
"step": 360000
},
{
"base_loss": 0.2963980228602886,
"epoch": 2.0152587890625,
"grad_norm": 0.12211249768733978,
"learning_rate": 1.562013626098633e-05,
"lookahead_loss": 3.7067537345886232,
"loss": 2.0011,
"step": 360500
},
{
"base_loss": 0.3123014765381813,
"epoch": 2.0162124633789062,
"grad_norm": 0.12035073339939117,
"learning_rate": 1.5572452545166016e-05,
"lookahead_loss": 3.7187122321128845,
"loss": 2.0164,
"step": 361000
},
{
"base_loss": 0.311051389247179,
"epoch": 2.0171661376953125,
"grad_norm": 0.12999185919761658,
"learning_rate": 1.5524768829345703e-05,
"lookahead_loss": 3.7131579689979555,
"loss": 2.0084,
"step": 361500
},
{
"base_loss": 0.30117547073960305,
"epoch": 2.0181198120117188,
"grad_norm": 0.12995636463165283,
"learning_rate": 1.547708511352539e-05,
"lookahead_loss": 3.684767934322357,
"loss": 1.9879,
"step": 362000
},
{
"base_loss": 0.2991112365424633,
"epoch": 2.019073486328125,
"grad_norm": 0.10578301548957825,
"learning_rate": 1.542940139770508e-05,
"lookahead_loss": 3.7182036762237547,
"loss": 2.0095,
"step": 362500
},
{
"base_loss": 0.2995945112109184,
"epoch": 2.0200271606445312,
"grad_norm": 0.12268956005573273,
"learning_rate": 1.5381717681884767e-05,
"lookahead_loss": 3.693427396774292,
"loss": 1.9995,
"step": 363000
},
{
"base_loss": 0.32935504597425463,
"epoch": 2.0209808349609375,
"grad_norm": 0.15605683624744415,
"learning_rate": 1.5334033966064454e-05,
"lookahead_loss": 3.7494262118339536,
"loss": 2.0335,
"step": 363500
},
{
"base_loss": 0.3044169374704361,
"epoch": 2.0219345092773438,
"grad_norm": 0.10574875771999359,
"learning_rate": 1.528635025024414e-05,
"lookahead_loss": 3.6809173183441164,
"loss": 1.9887,
"step": 364000
},
{
"base_loss": 0.2996359769701958,
"epoch": 2.02288818359375,
"grad_norm": 0.15029314160346985,
"learning_rate": 1.523866653442383e-05,
"lookahead_loss": 3.686254850387573,
"loss": 1.9976,
"step": 364500
},
{
"base_loss": 0.30142849957942963,
"epoch": 2.0238418579101562,
"grad_norm": 0.10843583941459656,
"learning_rate": 1.5190982818603516e-05,
"lookahead_loss": 3.6926897540092467,
"loss": 1.9987,
"step": 365000
},
{
"epoch": 2.0238418579101562,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1496820206078477,
"eval_lookahead_perplexity": 23.328645372952177,
"eval_loss": 1.6386431455612183,
"eval_perplexity": 5.148179440926399,
"eval_runtime": 271.1091,
"eval_samples_per_second": 18.443,
"eval_steps_per_second": 0.579,
"step": 365000
},
{
"base_loss": 0.3248125557899475,
"epoch": 2.0247955322265625,
"grad_norm": 0.09933142364025116,
"learning_rate": 1.5143299102783205e-05,
"lookahead_loss": 3.7378739171028137,
"loss": 2.0301,
"step": 365500
},
{
"base_loss": 0.31028636208176613,
"epoch": 2.0257492065429688,
"grad_norm": 0.12542510032653809,
"learning_rate": 1.5095615386962891e-05,
"lookahead_loss": 3.6860274262428283,
"loss": 2.0024,
"step": 366000
},
{
"base_loss": 0.3049684434235096,
"epoch": 2.026702880859375,
"grad_norm": 0.1288381665945053,
"learning_rate": 1.5047931671142578e-05,
"lookahead_loss": 3.7073655371665954,
"loss": 1.9991,
"step": 366500
},
{
"base_loss": 0.30682381707429884,
"epoch": 2.0276565551757812,
"grad_norm": 0.10210111737251282,
"learning_rate": 1.5000247955322267e-05,
"lookahead_loss": 3.7111876306533813,
"loss": 2.0056,
"step": 367000
},
{
"base_loss": 0.33313145861029625,
"epoch": 2.0286102294921875,
"grad_norm": 0.11859393119812012,
"learning_rate": 1.4952564239501954e-05,
"lookahead_loss": 3.7556820430755615,
"loss": 2.0382,
"step": 367500
},
{
"base_loss": 0.3025540582239628,
"epoch": 2.0295639038085938,
"grad_norm": 0.11141599714756012,
"learning_rate": 1.4904880523681642e-05,
"lookahead_loss": 3.679368188858032,
"loss": 1.9926,
"step": 368000
},
{
"base_loss": 0.3034520089030266,
"epoch": 2.030517578125,
"grad_norm": 0.15309970080852509,
"learning_rate": 1.4857196807861329e-05,
"lookahead_loss": 3.7101809630393983,
"loss": 2.0078,
"step": 368500
},
{
"base_loss": 0.30229984161257745,
"epoch": 2.0314712524414062,
"grad_norm": 0.12841260433197021,
"learning_rate": 1.4809513092041016e-05,
"lookahead_loss": 3.716352026462555,
"loss": 2.0084,
"step": 369000
},
{
"base_loss": 0.3168328501284122,
"epoch": 2.0324249267578125,
"grad_norm": 0.14004720747470856,
"learning_rate": 1.4761829376220704e-05,
"lookahead_loss": 3.734860891342163,
"loss": 2.0319,
"step": 369500
},
{
"base_loss": 0.30422543051838874,
"epoch": 2.0333786010742188,
"grad_norm": 0.1576036512851715,
"learning_rate": 1.4714145660400391e-05,
"lookahead_loss": 3.6790615549087526,
"loss": 1.9909,
"step": 370000
},
{
"epoch": 2.0333786010742188,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1475962008150242,
"eval_lookahead_perplexity": 23.280036734746588,
"eval_loss": 1.637601375579834,
"eval_perplexity": 5.142819014776387,
"eval_runtime": 269.2705,
"eval_samples_per_second": 18.569,
"eval_steps_per_second": 0.583,
"step": 370000
},
{
"base_loss": 0.3037178117632866,
"epoch": 2.034332275390625,
"grad_norm": 0.132655069231987,
"learning_rate": 1.466646194458008e-05,
"lookahead_loss": 3.709216817855835,
"loss": 2.0074,
"step": 370500
},
{
"base_loss": 0.3084926683306694,
"epoch": 2.0352859497070312,
"grad_norm": 0.1192920133471489,
"learning_rate": 1.4618778228759766e-05,
"lookahead_loss": 3.703082191944122,
"loss": 2.0072,
"step": 371000
},
{
"base_loss": 0.32473134699463846,
"epoch": 2.0362396240234375,
"grad_norm": 0.1038188636302948,
"learning_rate": 1.4571094512939453e-05,
"lookahead_loss": 3.736210472106934,
"loss": 2.0279,
"step": 371500
},
{
"base_loss": 0.3050074822306633,
"epoch": 2.0371932983398438,
"grad_norm": 0.11769280582666397,
"learning_rate": 1.4523410797119142e-05,
"lookahead_loss": 3.6904703121185305,
"loss": 1.9993,
"step": 372000
},
{
"base_loss": 0.3013649364411831,
"epoch": 2.03814697265625,
"grad_norm": 0.13569827377796173,
"learning_rate": 1.4475727081298829e-05,
"lookahead_loss": 3.687872163295746,
"loss": 1.9978,
"step": 372500
},
{
"base_loss": 0.3096700141429901,
"epoch": 2.0391006469726562,
"grad_norm": 0.11427426338195801,
"learning_rate": 1.4428043365478517e-05,
"lookahead_loss": 3.708254415988922,
"loss": 2.0132,
"step": 373000
},
{
"base_loss": 0.31990464240312577,
"epoch": 2.0400543212890625,
"grad_norm": 0.15470543503761292,
"learning_rate": 1.4380359649658204e-05,
"lookahead_loss": 3.7311536393165587,
"loss": 2.0233,
"step": 373500
},
{
"base_loss": 0.3096840573251247,
"epoch": 2.0410079956054688,
"grad_norm": 0.10737185180187225,
"learning_rate": 1.433267593383789e-05,
"lookahead_loss": 3.690148108959198,
"loss": 1.9951,
"step": 374000
},
{
"base_loss": 0.29508850196003916,
"epoch": 2.041961669921875,
"grad_norm": 0.14022259414196014,
"learning_rate": 1.428499221801758e-05,
"lookahead_loss": 3.702949764251709,
"loss": 2.0013,
"step": 374500
},
{
"base_loss": 0.3078301128745079,
"epoch": 2.0429153442382812,
"grad_norm": 0.28813207149505615,
"learning_rate": 1.4237308502197266e-05,
"lookahead_loss": 3.698307290554047,
"loss": 2.0061,
"step": 375000
},
{
"epoch": 2.0429153442382812,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.145147456909521,
"eval_lookahead_perplexity": 23.223099627321105,
"eval_loss": 1.636375069618225,
"eval_perplexity": 5.136516210532161,
"eval_runtime": 261.4483,
"eval_samples_per_second": 19.124,
"eval_steps_per_second": 0.601,
"step": 375000
},
{
"base_loss": 0.3264890166521072,
"epoch": 2.0438690185546875,
"grad_norm": 0.13248620927333832,
"learning_rate": 1.4189624786376955e-05,
"lookahead_loss": 3.7284442324638367,
"loss": 2.0322,
"step": 375500
},
{
"base_loss": 0.29649411234259604,
"epoch": 2.0448226928710938,
"grad_norm": 0.1766345202922821,
"learning_rate": 1.4141941070556641e-05,
"lookahead_loss": 3.6673648881912233,
"loss": 1.9829,
"step": 376000
},
{
"base_loss": 0.30447618263959886,
"epoch": 2.0457763671875,
"grad_norm": 0.09906455129384995,
"learning_rate": 1.4094257354736328e-05,
"lookahead_loss": 3.7270817494392396,
"loss": 2.0136,
"step": 376500
},
{
"base_loss": 0.3287204530388117,
"epoch": 2.0467300415039062,
"grad_norm": 0.10636741667985916,
"learning_rate": 1.4046573638916017e-05,
"lookahead_loss": 3.7259369196891785,
"loss": 2.025,
"step": 377000
},
{
"base_loss": 0.32552153533697126,
"epoch": 2.0476837158203125,
"grad_norm": 0.19604343175888062,
"learning_rate": 1.3998889923095704e-05,
"lookahead_loss": 3.7353513431549072,
"loss": 2.034,
"step": 377500
},
{
"base_loss": 0.2965818170309067,
"epoch": 2.0486373901367188,
"grad_norm": 0.11304906010627747,
"learning_rate": 1.3951206207275392e-05,
"lookahead_loss": 3.6689750633239746,
"loss": 1.9855,
"step": 378000
},
{
"base_loss": 0.30438165706396103,
"epoch": 2.049591064453125,
"grad_norm": 0.10858786851167679,
"learning_rate": 1.3903522491455079e-05,
"lookahead_loss": 3.7019722032547,
"loss": 2.0043,
"step": 378500
},
{
"base_loss": 0.3185528250038624,
"epoch": 2.0505447387695312,
"grad_norm": 0.09491516649723053,
"learning_rate": 1.3855838775634766e-05,
"lookahead_loss": 3.7292102155685423,
"loss": 2.0251,
"step": 379000
},
{
"base_loss": 0.3064282323718071,
"epoch": 2.0514984130859375,
"grad_norm": 0.14370054006576538,
"learning_rate": 1.3808155059814454e-05,
"lookahead_loss": 3.675420670509338,
"loss": 1.9933,
"step": 379500
},
{
"base_loss": 0.30485667461156846,
"epoch": 2.0524520874023438,
"grad_norm": 0.1851443499326706,
"learning_rate": 1.3760471343994141e-05,
"lookahead_loss": 3.701467691421509,
"loss": 2.0043,
"step": 380000
},
{
"epoch": 2.0524520874023438,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1431306185432897,
"eval_lookahead_perplexity": 23.1763095888427,
"eval_loss": 1.6353859901428223,
"eval_perplexity": 5.131438299416049,
"eval_runtime": 274.3785,
"eval_samples_per_second": 18.223,
"eval_steps_per_second": 0.572,
"step": 380000
},
{
"base_loss": 0.3230416283607483,
"epoch": 2.05340576171875,
"grad_norm": 0.1157195121049881,
"learning_rate": 1.371278762817383e-05,
"lookahead_loss": 3.7277134714126587,
"loss": 2.0199,
"step": 380500
},
{
"base_loss": 0.3555378588140011,
"epoch": 2.0543594360351562,
"grad_norm": 0.10356119275093079,
"learning_rate": 1.3665103912353516e-05,
"lookahead_loss": 3.7598505668640136,
"loss": 2.0624,
"step": 381000
},
{
"base_loss": 0.29287488567829134,
"epoch": 2.0553131103515625,
"grad_norm": 0.13426493108272552,
"learning_rate": 1.3617420196533203e-05,
"lookahead_loss": 3.6564659061431883,
"loss": 1.9806,
"step": 381500
},
{
"base_loss": 0.3070660081803799,
"epoch": 2.0562667846679688,
"grad_norm": 0.10810237377882004,
"learning_rate": 1.3569736480712892e-05,
"lookahead_loss": 3.7246059970855714,
"loss": 2.0172,
"step": 382000
},
{
"base_loss": 0.31805591636896136,
"epoch": 2.057220458984375,
"grad_norm": 0.1190585345029831,
"learning_rate": 1.3522052764892579e-05,
"lookahead_loss": 3.7293725261688233,
"loss": 2.0252,
"step": 382500
},
{
"base_loss": 0.32177385982871054,
"epoch": 2.0581741333007812,
"grad_norm": 0.10589335113763809,
"learning_rate": 1.3474369049072265e-05,
"lookahead_loss": 3.7027944622039795,
"loss": 2.0089,
"step": 383000
},
{
"base_loss": 0.2912293503880501,
"epoch": 2.0591278076171875,
"grad_norm": 0.08998730033636093,
"learning_rate": 1.3426685333251954e-05,
"lookahead_loss": 3.6608450388908387,
"loss": 1.9813,
"step": 383500
},
{
"base_loss": 0.3013720656633377,
"epoch": 2.0600814819335938,
"grad_norm": 0.09680195152759552,
"learning_rate": 1.337900161743164e-05,
"lookahead_loss": 3.7127736706733705,
"loss": 2.008,
"step": 384000
},
{
"base_loss": 0.320624990940094,
"epoch": 2.06103515625,
"grad_norm": 0.09335515648126602,
"learning_rate": 1.333131790161133e-05,
"lookahead_loss": 3.7168069033622744,
"loss": 2.0159,
"step": 384500
},
{
"base_loss": 0.30687601006031034,
"epoch": 2.0619888305664062,
"grad_norm": 0.1287391632795334,
"learning_rate": 1.3283634185791016e-05,
"lookahead_loss": 3.6904591851234434,
"loss": 1.9892,
"step": 385000
},
{
"epoch": 2.0619888305664062,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1412412084329624,
"eval_lookahead_perplexity": 23.13256137735663,
"eval_loss": 1.6344444751739502,
"eval_perplexity": 5.126609247114544,
"eval_runtime": 262.0521,
"eval_samples_per_second": 19.08,
"eval_steps_per_second": 0.599,
"step": 385000
},
{
"base_loss": 0.3092333972156048,
"epoch": 2.0629425048828125,
"grad_norm": 0.11992325633764267,
"learning_rate": 1.3235950469970703e-05,
"lookahead_loss": 3.731930965423584,
"loss": 2.0138,
"step": 385500
},
{
"base_loss": 0.3187819467484951,
"epoch": 2.0638961791992188,
"grad_norm": 0.10708379745483398,
"learning_rate": 1.3188266754150391e-05,
"lookahead_loss": 3.728700873374939,
"loss": 2.0192,
"step": 386000
},
{
"base_loss": 0.30155983543396,
"epoch": 2.064849853515625,
"grad_norm": 0.1287672370672226,
"learning_rate": 1.3140583038330078e-05,
"lookahead_loss": 3.677223875999451,
"loss": 1.9938,
"step": 386500
},
{
"base_loss": 0.30980769458413127,
"epoch": 2.0658035278320312,
"grad_norm": 0.14807488024234772,
"learning_rate": 1.3092899322509767e-05,
"lookahead_loss": 3.6996435074806215,
"loss": 2.003,
"step": 387000
},
{
"base_loss": 0.309319562882185,
"epoch": 2.0667572021484375,
"grad_norm": 0.10004570335149765,
"learning_rate": 1.3045215606689454e-05,
"lookahead_loss": 3.7083262605667113,
"loss": 2.007,
"step": 387500
},
{
"base_loss": 0.33088878998160365,
"epoch": 2.0677108764648438,
"grad_norm": 0.1570482850074768,
"learning_rate": 1.299753189086914e-05,
"lookahead_loss": 3.744370493888855,
"loss": 2.0344,
"step": 388000
},
{
"base_loss": 0.30170208609104154,
"epoch": 2.06866455078125,
"grad_norm": 0.13802482187747955,
"learning_rate": 1.2949848175048829e-05,
"lookahead_loss": 3.660481810569763,
"loss": 1.9787,
"step": 388500
},
{
"base_loss": 0.3034212864339352,
"epoch": 2.0696182250976562,
"grad_norm": 0.13130271434783936,
"learning_rate": 1.2902164459228516e-05,
"lookahead_loss": 3.7303440852165224,
"loss": 2.015,
"step": 389000
},
{
"base_loss": 0.3412878410220146,
"epoch": 2.0705718994140625,
"grad_norm": 0.10371687263250351,
"learning_rate": 1.2854480743408204e-05,
"lookahead_loss": 3.7537382555007937,
"loss": 2.0526,
"step": 389500
},
{
"base_loss": 0.31413120782375337,
"epoch": 2.0715255737304688,
"grad_norm": 0.14051884412765503,
"learning_rate": 1.2806797027587891e-05,
"lookahead_loss": 3.687371994972229,
"loss": 1.9978,
"step": 390000
},
{
"epoch": 2.0715255737304688,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1392902360556607,
"eval_lookahead_perplexity": 23.087474385132566,
"eval_loss": 1.6334805488586426,
"eval_perplexity": 5.121669954492467,
"eval_runtime": 296.484,
"eval_samples_per_second": 16.864,
"eval_steps_per_second": 0.53,
"step": 390000
},
{
"base_loss": 0.30943771398067477,
"epoch": 2.072479248046875,
"grad_norm": 0.1194412112236023,
"learning_rate": 1.2759113311767578e-05,
"lookahead_loss": 3.6957876262664793,
"loss": 1.9982,
"step": 390500
},
{
"base_loss": 0.3021232470273972,
"epoch": 2.0734329223632812,
"grad_norm": 0.09396813064813614,
"learning_rate": 1.2711429595947266e-05,
"lookahead_loss": 3.70153946685791,
"loss": 2.0048,
"step": 391000
},
{
"base_loss": 0.3273992139399052,
"epoch": 2.0743865966796875,
"grad_norm": 0.104710154235363,
"learning_rate": 1.2663745880126953e-05,
"lookahead_loss": 3.7224628949165344,
"loss": 2.0227,
"step": 391500
},
{
"base_loss": 0.3019404113292694,
"epoch": 2.0753402709960938,
"grad_norm": 0.1233215481042862,
"learning_rate": 1.2616062164306642e-05,
"lookahead_loss": 3.6753981237411497,
"loss": 1.9883,
"step": 392000
},
{
"base_loss": 0.30488721799850466,
"epoch": 2.0762939453125,
"grad_norm": 0.1560201346874237,
"learning_rate": 1.2568378448486329e-05,
"lookahead_loss": 3.72650838804245,
"loss": 2.0156,
"step": 392500
},
{
"base_loss": 0.32930157482624056,
"epoch": 2.0772476196289062,
"grad_norm": 0.14494048058986664,
"learning_rate": 1.2520694732666015e-05,
"lookahead_loss": 3.735769229888916,
"loss": 2.0379,
"step": 393000
},
{
"base_loss": 0.3017339085638523,
"epoch": 2.0782012939453125,
"grad_norm": 0.11965566128492355,
"learning_rate": 1.2473011016845704e-05,
"lookahead_loss": 3.667924000263214,
"loss": 1.991,
"step": 393500
},
{
"base_loss": 0.299552004635334,
"epoch": 2.0791549682617188,
"grad_norm": 0.1270737498998642,
"learning_rate": 1.242532730102539e-05,
"lookahead_loss": 3.699381884098053,
"loss": 2.0014,
"step": 394000
},
{
"base_loss": 0.31343785190582274,
"epoch": 2.080108642578125,
"grad_norm": 0.1207314133644104,
"learning_rate": 1.237764358520508e-05,
"lookahead_loss": 3.725409944534302,
"loss": 2.0234,
"step": 394500
},
{
"base_loss": 0.3193435942828655,
"epoch": 2.0810623168945312,
"grad_norm": 0.1668887585401535,
"learning_rate": 1.2329959869384766e-05,
"lookahead_loss": 3.705724129199982,
"loss": 2.018,
"step": 395000
},
{
"epoch": 2.0810623168945312,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1376450054180887,
"eval_lookahead_perplexity": 23.049521394202596,
"eval_loss": 1.6326426267623901,
"eval_perplexity": 5.117380191562451,
"eval_runtime": 275.2681,
"eval_samples_per_second": 18.164,
"eval_steps_per_second": 0.57,
"step": 395000
},
{
"base_loss": 0.29771795102953913,
"epoch": 2.0820159912109375,
"grad_norm": 0.13261182606220245,
"learning_rate": 1.2282276153564453e-05,
"lookahead_loss": 3.674468102455139,
"loss": 1.9931,
"step": 395500
},
{
"base_loss": 0.30634405037760737,
"epoch": 2.0829696655273438,
"grad_norm": 0.09685485810041428,
"learning_rate": 1.2234592437744141e-05,
"lookahead_loss": 3.7131715035438537,
"loss": 2.0079,
"step": 396000
},
{
"base_loss": 0.3325966064631939,
"epoch": 2.08392333984375,
"grad_norm": 0.11063214391469955,
"learning_rate": 1.2186908721923828e-05,
"lookahead_loss": 3.7380114979743957,
"loss": 2.032,
"step": 396500
},
{
"base_loss": 0.3046146906912327,
"epoch": 2.0848770141601562,
"grad_norm": 0.14153137803077698,
"learning_rate": 1.2139225006103517e-05,
"lookahead_loss": 3.6873513193130494,
"loss": 1.9979,
"step": 397000
},
{
"base_loss": 0.2984280304312706,
"epoch": 2.0858306884765625,
"grad_norm": 0.1063678041100502,
"learning_rate": 1.2091541290283204e-05,
"lookahead_loss": 3.6935010514259337,
"loss": 1.9924,
"step": 397500
},
{
"base_loss": 0.3047517819106579,
"epoch": 2.0867843627929688,
"grad_norm": 0.4544115960597992,
"learning_rate": 1.204385757446289e-05,
"lookahead_loss": 3.697446392059326,
"loss": 1.9966,
"step": 398000
},
{
"base_loss": 0.33619433450698855,
"epoch": 2.087738037109375,
"grad_norm": 0.10550739616155624,
"learning_rate": 1.1996173858642579e-05,
"lookahead_loss": 3.7438949031829836,
"loss": 2.0327,
"step": 398500
},
{
"base_loss": 0.30201966351270676,
"epoch": 2.0886917114257812,
"grad_norm": 0.12323067337274551,
"learning_rate": 1.1948490142822266e-05,
"lookahead_loss": 3.6697822360992434,
"loss": 1.9862,
"step": 399000
},
{
"base_loss": 0.308837145447731,
"epoch": 2.0896453857421875,
"grad_norm": 0.11336886882781982,
"learning_rate": 1.1900806427001954e-05,
"lookahead_loss": 3.667280921936035,
"loss": 1.9871,
"step": 399500
},
{
"base_loss": 0.2996679684817791,
"epoch": 2.0905990600585938,
"grad_norm": 0.10357397049665451,
"learning_rate": 1.1853122711181641e-05,
"lookahead_loss": 3.6947066493034364,
"loss": 1.9992,
"step": 400000
},
{
"epoch": 2.0905990600585938,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.135611410719899,
"eval_lookahead_perplexity": 23.00269563814152,
"eval_loss": 1.6316173076629639,
"eval_perplexity": 5.112135932891737,
"eval_runtime": 263.2362,
"eval_samples_per_second": 18.994,
"eval_steps_per_second": 0.596,
"step": 400000
},
{
"base_loss": 0.3001748549044132,
"epoch": 2.091552734375,
"grad_norm": 0.10686542093753815,
"learning_rate": 1.1805438995361328e-05,
"lookahead_loss": 3.696131613254547,
"loss": 1.9946,
"step": 400500
},
{
"base_loss": 0.32454153364896776,
"epoch": 2.0925064086914062,
"grad_norm": 0.16283544898033142,
"learning_rate": 1.1757755279541016e-05,
"lookahead_loss": 3.719967551231384,
"loss": 2.0177,
"step": 401000
},
{
"base_loss": 0.3073597291409969,
"epoch": 2.0934600830078125,
"grad_norm": 0.13384558260440826,
"learning_rate": 1.1710071563720703e-05,
"lookahead_loss": 3.679731466293335,
"loss": 1.9974,
"step": 401500
},
{
"base_loss": 0.2906791627705097,
"epoch": 2.0944137573242188,
"grad_norm": 0.16118714213371277,
"learning_rate": 1.1662387847900392e-05,
"lookahead_loss": 3.6603454394340513,
"loss": 1.9723,
"step": 402000
},
{
"base_loss": 0.2977730810046196,
"epoch": 2.095367431640625,
"grad_norm": 0.1117246001958847,
"learning_rate": 1.1614704132080079e-05,
"lookahead_loss": 3.671906901359558,
"loss": 1.9866,
"step": 402500
},
{
"base_loss": 0.3014255873262882,
"epoch": 2.0963211059570312,
"grad_norm": 0.22929483652114868,
"learning_rate": 1.1567020416259765e-05,
"lookahead_loss": 3.6827497115135195,
"loss": 1.9928,
"step": 403000
},
{
"base_loss": 0.3342979139983654,
"epoch": 2.0972747802734375,
"grad_norm": 0.10726416856050491,
"learning_rate": 1.1519336700439454e-05,
"lookahead_loss": 3.723692095756531,
"loss": 2.0222,
"step": 403500
},
{
"base_loss": 0.29461920487880705,
"epoch": 2.0982284545898438,
"grad_norm": 0.11667662858963013,
"learning_rate": 1.147165298461914e-05,
"lookahead_loss": 3.6522948231697083,
"loss": 1.9764,
"step": 404000
},
{
"base_loss": 0.29449185797572136,
"epoch": 2.09918212890625,
"grad_norm": 0.09684642404317856,
"learning_rate": 1.142396926879883e-05,
"lookahead_loss": 3.6839715528488157,
"loss": 1.9921,
"step": 404500
},
{
"base_loss": 0.3004076217412949,
"epoch": 2.1001358032226562,
"grad_norm": 0.1516093909740448,
"learning_rate": 1.1376285552978516e-05,
"lookahead_loss": 3.6948669986724854,
"loss": 2.0012,
"step": 405000
},
{
"epoch": 2.1001358032226562,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.133693779619357,
"eval_lookahead_perplexity": 22.95862722057799,
"eval_loss": 1.6306617259979248,
"eval_perplexity": 5.107253202820293,
"eval_runtime": 275.9898,
"eval_samples_per_second": 18.117,
"eval_steps_per_second": 0.569,
"step": 405000
},
{
"base_loss": 0.3221150857806206,
"epoch": 2.1010894775390625,
"grad_norm": 0.09898355603218079,
"learning_rate": 1.1328601837158203e-05,
"lookahead_loss": 3.7262251405715943,
"loss": 2.0191,
"step": 405500
},
{
"base_loss": 0.3021469285786152,
"epoch": 2.1020431518554688,
"grad_norm": 0.14063310623168945,
"learning_rate": 1.1280918121337891e-05,
"lookahead_loss": 3.6706616439819335,
"loss": 1.9854,
"step": 406000
},
{
"base_loss": 0.29727980035543444,
"epoch": 2.102996826171875,
"grad_norm": 0.12236055731773376,
"learning_rate": 1.1233234405517578e-05,
"lookahead_loss": 3.6699202404022215,
"loss": 1.9916,
"step": 406500
},
{
"base_loss": 0.2964036027789116,
"epoch": 2.1039505004882812,
"grad_norm": 0.10163906216621399,
"learning_rate": 1.1185550689697267e-05,
"lookahead_loss": 3.6987908611297606,
"loss": 2.0006,
"step": 407000
},
{
"base_loss": 0.3138076714575291,
"epoch": 2.1049041748046875,
"grad_norm": 0.10581351071596146,
"learning_rate": 1.1137866973876954e-05,
"lookahead_loss": 3.700042960166931,
"loss": 2.0104,
"step": 407500
},
{
"base_loss": 0.31011119556427,
"epoch": 2.1058578491210938,
"grad_norm": 0.14030158519744873,
"learning_rate": 1.109018325805664e-05,
"lookahead_loss": 3.677447699546814,
"loss": 1.9897,
"step": 408000
},
{
"base_loss": 0.2961321137845516,
"epoch": 2.1068115234375,
"grad_norm": 0.10945964604616165,
"learning_rate": 1.1042499542236329e-05,
"lookahead_loss": 3.6609533157348633,
"loss": 1.9796,
"step": 408500
},
{
"base_loss": 0.2940867764055729,
"epoch": 2.1077651977539062,
"grad_norm": 0.14712247252464294,
"learning_rate": 1.0994815826416016e-05,
"lookahead_loss": 3.684739191532135,
"loss": 1.9891,
"step": 409000
},
{
"base_loss": 0.32057751885056496,
"epoch": 2.1087188720703125,
"grad_norm": 0.12016324698925018,
"learning_rate": 1.0947132110595704e-05,
"lookahead_loss": 3.7167062678337097,
"loss": 2.0188,
"step": 409500
},
{
"base_loss": 0.3168235483467579,
"epoch": 2.1096725463867188,
"grad_norm": 0.11053480952978134,
"learning_rate": 1.0899448394775391e-05,
"lookahead_loss": 3.685606789112091,
"loss": 2.0024,
"step": 410000
},
{
"epoch": 2.1096725463867188,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1321658196921547,
"eval_lookahead_perplexity": 22.923574144868628,
"eval_loss": 1.6299090385437012,
"eval_perplexity": 5.103410483773616,
"eval_runtime": 277.1361,
"eval_samples_per_second": 18.042,
"eval_steps_per_second": 0.567,
"step": 410000
},
{
"base_loss": 0.30013936150074005,
"epoch": 2.110626220703125,
"grad_norm": 0.10985679924488068,
"learning_rate": 1.0851764678955078e-05,
"lookahead_loss": 3.661238037109375,
"loss": 1.9809,
"step": 410500
},
{
"base_loss": 0.29884218820929526,
"epoch": 2.1115798950195312,
"grad_norm": 0.2866235077381134,
"learning_rate": 1.0804080963134766e-05,
"lookahead_loss": 3.697595094203949,
"loss": 1.9953,
"step": 411000
},
{
"base_loss": 0.30719696512818334,
"epoch": 2.1125335693359375,
"grad_norm": 0.12627729773521423,
"learning_rate": 1.0756397247314453e-05,
"lookahead_loss": 3.7029927005767824,
"loss": 2.0056,
"step": 411500
},
{
"base_loss": 0.3448580102622509,
"epoch": 2.1134872436523438,
"grad_norm": 0.11490114778280258,
"learning_rate": 1.0708713531494142e-05,
"lookahead_loss": 3.7366786670684813,
"loss": 2.0333,
"step": 412000
},
{
"base_loss": 0.29588871854543686,
"epoch": 2.11444091796875,
"grad_norm": 0.11319919675588608,
"learning_rate": 1.0661029815673829e-05,
"lookahead_loss": 3.652657012939453,
"loss": 1.9766,
"step": 412500
},
{
"base_loss": 0.3001244888305664,
"epoch": 2.1153945922851562,
"grad_norm": 0.383401095867157,
"learning_rate": 1.0613346099853515e-05,
"lookahead_loss": 3.699717406749725,
"loss": 2.0031,
"step": 413000
},
{
"base_loss": 0.3110040880739689,
"epoch": 2.1163482666015625,
"grad_norm": 0.16206490993499756,
"learning_rate": 1.0565662384033204e-05,
"lookahead_loss": 3.6906900959014894,
"loss": 2.0034,
"step": 413500
},
{
"base_loss": 0.32364195665717127,
"epoch": 2.1173019409179688,
"grad_norm": 0.10381924360990524,
"learning_rate": 1.051797866821289e-05,
"lookahead_loss": 3.726861256599426,
"loss": 2.0293,
"step": 414000
},
{
"base_loss": 0.30665904381871223,
"epoch": 2.118255615234375,
"grad_norm": 0.10144428163766861,
"learning_rate": 1.047029495239258e-05,
"lookahead_loss": 3.686223955631256,
"loss": 1.9927,
"step": 414500
},
{
"base_loss": 0.30039039224386216,
"epoch": 2.1192092895507812,
"grad_norm": 0.1295783519744873,
"learning_rate": 1.0422611236572266e-05,
"lookahead_loss": 3.6910682001113893,
"loss": 1.9947,
"step": 415000
},
{
"epoch": 2.1192092895507812,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.130569809161055,
"eval_lookahead_perplexity": 22.88701705962949,
"eval_loss": 1.629108190536499,
"eval_perplexity": 5.099325063776334,
"eval_runtime": 260.4647,
"eval_samples_per_second": 19.196,
"eval_steps_per_second": 0.603,
"step": 415000
},
{
"base_loss": 0.30430060923099517,
"epoch": 3.0009536743164062,
"grad_norm": 0.11666166037321091,
"learning_rate": 1.0374927520751953e-05,
"lookahead_loss": 3.7038714718818664,
"loss": 1.9999,
"step": 415500
},
{
"base_loss": 0.3014007512629032,
"epoch": 3.0019073486328125,
"grad_norm": 0.17477287352085114,
"learning_rate": 1.0327243804931641e-05,
"lookahead_loss": 3.6869843678474425,
"loss": 1.9959,
"step": 416000
},
{
"base_loss": 0.30987949097156525,
"epoch": 3.0028610229492188,
"grad_norm": 0.1018444299697876,
"learning_rate": 1.0279560089111328e-05,
"lookahead_loss": 3.7006162996292113,
"loss": 1.9985,
"step": 416500
},
{
"base_loss": 0.31901577454805374,
"epoch": 3.003814697265625,
"grad_norm": 0.11105263233184814,
"learning_rate": 1.0231876373291017e-05,
"lookahead_loss": 3.7003021211624145,
"loss": 2.0117,
"step": 417000
},
{
"base_loss": 0.30133016020059583,
"epoch": 3.0047683715820312,
"grad_norm": 0.09324323385953903,
"learning_rate": 1.0184192657470704e-05,
"lookahead_loss": 3.6617124271392822,
"loss": 1.9858,
"step": 417500
},
{
"base_loss": 0.29890421107411386,
"epoch": 3.0057220458984375,
"grad_norm": 0.10885365307331085,
"learning_rate": 1.013650894165039e-05,
"lookahead_loss": 3.6865826263427732,
"loss": 1.9927,
"step": 418000
},
{
"base_loss": 0.2965673512518406,
"epoch": 3.0066757202148438,
"grad_norm": 0.10601469874382019,
"learning_rate": 1.0088825225830079e-05,
"lookahead_loss": 3.6957881078720094,
"loss": 2.0004,
"step": 418500
},
{
"base_loss": 0.309929815903306,
"epoch": 3.00762939453125,
"grad_norm": 0.11855798214673996,
"learning_rate": 1.0041141510009766e-05,
"lookahead_loss": 3.705542845726013,
"loss": 2.0052,
"step": 419000
},
{
"base_loss": 0.3150366614460945,
"epoch": 3.0085830688476562,
"grad_norm": 0.11749642342329025,
"learning_rate": 9.993457794189454e-06,
"lookahead_loss": 3.6966953639984133,
"loss": 1.9993,
"step": 419500
},
{
"base_loss": 0.2985053049325943,
"epoch": 3.0095367431640625,
"grad_norm": 0.11532973498106003,
"learning_rate": 9.945774078369141e-06,
"lookahead_loss": 3.6538707246780397,
"loss": 1.9834,
"step": 420000
},
{
"epoch": 3.0095367431640625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.129125172337785,
"eval_lookahead_perplexity": 22.85397750283659,
"eval_loss": 1.6283732652664185,
"eval_perplexity": 5.095578817700681,
"eval_runtime": 275.3938,
"eval_samples_per_second": 18.156,
"eval_steps_per_second": 0.57,
"step": 420000
},
{
"base_loss": 0.299905475795269,
"epoch": 3.0104904174804688,
"grad_norm": 0.10150377452373505,
"learning_rate": 9.898090362548828e-06,
"lookahead_loss": 3.694345196247101,
"loss": 1.9941,
"step": 420500
},
{
"base_loss": 0.3031460309624672,
"epoch": 3.011444091796875,
"grad_norm": 0.1087944358587265,
"learning_rate": 9.850406646728516e-06,
"lookahead_loss": 3.6788615469932555,
"loss": 1.9953,
"step": 421000
},
{
"base_loss": 0.32608956068754197,
"epoch": 3.0123977661132812,
"grad_norm": 0.09304623305797577,
"learning_rate": 9.802722930908203e-06,
"lookahead_loss": 3.7108820514678955,
"loss": 2.0181,
"step": 421500
},
{
"base_loss": 0.3085035228431225,
"epoch": 3.0133514404296875,
"grad_norm": 0.11764844506978989,
"learning_rate": 9.755039215087892e-06,
"lookahead_loss": 3.6660648827552795,
"loss": 1.99,
"step": 422000
},
{
"base_loss": 0.2973440226018429,
"epoch": 3.0143051147460938,
"grad_norm": 0.11751745641231537,
"learning_rate": 9.707355499267579e-06,
"lookahead_loss": 3.6628987169265748,
"loss": 1.981,
"step": 422500
},
{
"base_loss": 0.2965872138440609,
"epoch": 3.0152587890625,
"grad_norm": 0.11600304394960403,
"learning_rate": 9.659671783447265e-06,
"lookahead_loss": 3.6893741731643677,
"loss": 1.9923,
"step": 423000
},
{
"base_loss": 0.31338943153619764,
"epoch": 3.0162124633789062,
"grad_norm": 0.1297282874584198,
"learning_rate": 9.611988067626954e-06,
"lookahead_loss": 3.703455045223236,
"loss": 2.0093,
"step": 423500
},
{
"base_loss": 0.3127202790379524,
"epoch": 3.0171661376953125,
"grad_norm": 0.1299683153629303,
"learning_rate": 9.56430435180664e-06,
"lookahead_loss": 3.697984820842743,
"loss": 2.0007,
"step": 424000
},
{
"base_loss": 0.30206070256233214,
"epoch": 3.0181198120117188,
"grad_norm": 0.1379324048757553,
"learning_rate": 9.51662063598633e-06,
"lookahead_loss": 3.667839276313782,
"loss": 1.9805,
"step": 424500
},
{
"base_loss": 0.2996631888449192,
"epoch": 3.019073486328125,
"grad_norm": 0.10621386021375656,
"learning_rate": 9.468936920166016e-06,
"lookahead_loss": 3.700213254451752,
"loss": 2.0011,
"step": 425000
},
{
"epoch": 3.019073486328125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1276195155926785,
"eval_lookahead_perplexity": 22.819593149469497,
"eval_loss": 1.6276158094406128,
"eval_perplexity": 5.0917206032373405,
"eval_runtime": 264.8437,
"eval_samples_per_second": 18.879,
"eval_steps_per_second": 0.593,
"step": 425000
},
{
"base_loss": 0.3018102090358734,
"epoch": 3.0200271606445312,
"grad_norm": 0.121736079454422,
"learning_rate": 9.421253204345703e-06,
"lookahead_loss": 3.6784856877326964,
"loss": 1.9918,
"step": 425500
},
{
"base_loss": 0.32929644933342933,
"epoch": 3.0209808349609375,
"grad_norm": 0.14993664622306824,
"learning_rate": 9.373569488525391e-06,
"lookahead_loss": 3.7336047492027284,
"loss": 2.0255,
"step": 426000
},
{
"base_loss": 0.3050749698281288,
"epoch": 3.0219345092773438,
"grad_norm": 0.10891906917095184,
"learning_rate": 9.325885772705078e-06,
"lookahead_loss": 3.6635623302459717,
"loss": 1.98,
"step": 426500
},
{
"base_loss": 0.29926853865385056,
"epoch": 3.02288818359375,
"grad_norm": 0.1561029702425003,
"learning_rate": 9.278202056884767e-06,
"lookahead_loss": 3.6687038259506224,
"loss": 1.9894,
"step": 427000
},
{
"base_loss": 0.30300188970565795,
"epoch": 3.0238418579101562,
"grad_norm": 0.11594616621732712,
"learning_rate": 9.230518341064454e-06,
"lookahead_loss": 3.677744508266449,
"loss": 1.9921,
"step": 427500
},
{
"base_loss": 0.3265539970099926,
"epoch": 3.0247955322265625,
"grad_norm": 0.10191618651151657,
"learning_rate": 9.18283462524414e-06,
"lookahead_loss": 3.7241694231033327,
"loss": 2.0228,
"step": 428000
},
{
"base_loss": 0.3085910253226757,
"epoch": 3.0257492065429688,
"grad_norm": 0.12174220383167267,
"learning_rate": 9.135150909423829e-06,
"lookahead_loss": 3.667691098690033,
"loss": 1.9951,
"step": 428500
},
{
"base_loss": 0.3028672685772181,
"epoch": 3.026702880859375,
"grad_norm": 0.12968415021896362,
"learning_rate": 9.087467193603516e-06,
"lookahead_loss": 3.688646504402161,
"loss": 1.9897,
"step": 429000
},
{
"base_loss": 0.3085927827656269,
"epoch": 3.0276565551757812,
"grad_norm": 0.11275982856750488,
"learning_rate": 9.039783477783204e-06,
"lookahead_loss": 3.697152105331421,
"loss": 1.9988,
"step": 429500
},
{
"base_loss": 0.3330037909448147,
"epoch": 3.0286102294921875,
"grad_norm": 0.13144977390766144,
"learning_rate": 8.992099761962891e-06,
"lookahead_loss": 3.739767518520355,
"loss": 2.0306,
"step": 430000
},
{
"epoch": 3.0286102294921875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.126402835876416,
"eval_lookahead_perplexity": 22.79184589653397,
"eval_loss": 1.6270209550857544,
"eval_perplexity": 5.088692671741188,
"eval_runtime": 277.6859,
"eval_samples_per_second": 18.006,
"eval_steps_per_second": 0.565,
"step": 430000
},
{
"base_loss": 0.3037443687617779,
"epoch": 3.0295639038085938,
"grad_norm": 0.11082852631807327,
"learning_rate": 8.944416046142578e-06,
"lookahead_loss": 3.663953601360321,
"loss": 1.9849,
"step": 430500
},
{
"base_loss": 0.3000984548330307,
"epoch": 3.030517578125,
"grad_norm": 0.15227945148944855,
"learning_rate": 8.896732330322266e-06,
"lookahead_loss": 3.69063617515564,
"loss": 1.9985,
"step": 431000
},
{
"base_loss": 0.30023023423552514,
"epoch": 3.0314712524414062,
"grad_norm": 0.11759933084249496,
"learning_rate": 8.849048614501953e-06,
"lookahead_loss": 3.698117901802063,
"loss": 1.9996,
"step": 431500
},
{
"base_loss": 0.3138752512037754,
"epoch": 3.0324249267578125,
"grad_norm": 0.13768881559371948,
"learning_rate": 8.801364898681642e-06,
"lookahead_loss": 3.718590494632721,
"loss": 2.0225,
"step": 432000
},
{
"base_loss": 0.3038401392996311,
"epoch": 3.0333786010742188,
"grad_norm": 0.16651467978954315,
"learning_rate": 8.753681182861329e-06,
"lookahead_loss": 3.6639317264556883,
"loss": 1.9836,
"step": 432500
},
{
"base_loss": 0.302562724173069,
"epoch": 3.034332275390625,
"grad_norm": 0.1357959806919098,
"learning_rate": 8.705997467041015e-06,
"lookahead_loss": 3.694962691307068,
"loss": 2.0013,
"step": 433000
},
{
"base_loss": 0.30816466361284256,
"epoch": 3.0352859497070312,
"grad_norm": 0.11627933382987976,
"learning_rate": 8.658313751220704e-06,
"lookahead_loss": 3.68630348110199,
"loss": 2.0,
"step": 433500
},
{
"base_loss": 0.32414969062805177,
"epoch": 3.0362396240234375,
"grad_norm": 0.10137467831373215,
"learning_rate": 8.61063003540039e-06,
"lookahead_loss": 3.7200622777938843,
"loss": 2.0191,
"step": 434000
},
{
"base_loss": 0.3057764404714107,
"epoch": 3.0371932983398438,
"grad_norm": 0.11666380614042282,
"learning_rate": 8.56294631958008e-06,
"lookahead_loss": 3.675651388168335,
"loss": 1.992,
"step": 434500
},
{
"base_loss": 0.2984813822805881,
"epoch": 3.03814697265625,
"grad_norm": 0.13813932240009308,
"learning_rate": 8.515262603759766e-06,
"lookahead_loss": 3.667681806087494,
"loss": 1.9876,
"step": 435000
},
{
"epoch": 3.03814697265625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.125287823783704,
"eval_lookahead_perplexity": 22.76644687548336,
"eval_loss": 1.6264574527740479,
"eval_perplexity": 5.08582598942401,
"eval_runtime": 276.6213,
"eval_samples_per_second": 18.075,
"eval_steps_per_second": 0.568,
"step": 435000
},
{
"base_loss": 0.308715908408165,
"epoch": 3.0391006469726562,
"grad_norm": 0.12543562054634094,
"learning_rate": 8.467578887939453e-06,
"lookahead_loss": 3.693666660785675,
"loss": 2.006,
"step": 435500
},
{
"base_loss": 0.3176246392726898,
"epoch": 3.0400543212890625,
"grad_norm": 0.15535596013069153,
"learning_rate": 8.419895172119141e-06,
"lookahead_loss": 3.715300820350647,
"loss": 2.0146,
"step": 436000
},
{
"base_loss": 0.30790746420621873,
"epoch": 3.0410079956054688,
"grad_norm": 0.11093998700380325,
"learning_rate": 8.372211456298828e-06,
"lookahead_loss": 3.674683918952942,
"loss": 1.987,
"step": 436500
},
{
"base_loss": 0.29577805346250535,
"epoch": 3.041961669921875,
"grad_norm": 0.14224250614643097,
"learning_rate": 8.324527740478517e-06,
"lookahead_loss": 3.6888953552246093,
"loss": 1.9934,
"step": 437000
},
{
"base_loss": 0.30930229860544206,
"epoch": 3.0429153442382812,
"grad_norm": 0.29056692123413086,
"learning_rate": 8.276844024658204e-06,
"lookahead_loss": 3.685419809818268,
"loss": 2.0001,
"step": 437500
},
{
"base_loss": 0.3300181960165501,
"epoch": 3.0438690185546875,
"grad_norm": 0.13487912714481354,
"learning_rate": 8.22916030883789e-06,
"lookahead_loss": 3.7153477659225462,
"loss": 2.0264,
"step": 438000
},
{
"base_loss": 0.2941302236020565,
"epoch": 3.0448226928710938,
"grad_norm": 0.18283872306346893,
"learning_rate": 8.181476593017579e-06,
"lookahead_loss": 3.651518307685852,
"loss": 1.9746,
"step": 438500
},
{
"base_loss": 0.3060891110301018,
"epoch": 3.0457763671875,
"grad_norm": 0.10058881342411041,
"learning_rate": 8.133792877197266e-06,
"lookahead_loss": 3.7120121273994444,
"loss": 2.0059,
"step": 439000
},
{
"base_loss": 0.3289530730843544,
"epoch": 3.0467300415039062,
"grad_norm": 0.11518778651952744,
"learning_rate": 8.086109161376954e-06,
"lookahead_loss": 3.713506714820862,
"loss": 2.019,
"step": 439500
},
{
"base_loss": 0.32627532437443735,
"epoch": 3.0476837158203125,
"grad_norm": 0.18812230229377747,
"learning_rate": 8.038425445556641e-06,
"lookahead_loss": 3.7203544387817384,
"loss": 2.0281,
"step": 440000
},
{
"epoch": 3.0476837158203125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1241645150291273,
"eval_lookahead_perplexity": 22.740887484628896,
"eval_loss": 1.625902533531189,
"eval_perplexity": 5.0830045496246665,
"eval_runtime": 268.2319,
"eval_samples_per_second": 18.641,
"eval_steps_per_second": 0.585,
"step": 440000
},
{
"base_loss": 0.29505294340848925,
"epoch": 3.0486373901367188,
"grad_norm": 0.10775741934776306,
"learning_rate": 7.990741729736328e-06,
"lookahead_loss": 3.655467821598053,
"loss": 1.9785,
"step": 440500
},
{
"base_loss": 0.3058525973558426,
"epoch": 3.049591064453125,
"grad_norm": 0.100101538002491,
"learning_rate": 7.943058013916016e-06,
"lookahead_loss": 3.6875068039894106,
"loss": 1.9976,
"step": 441000
},
{
"base_loss": 0.3209580435454845,
"epoch": 3.0505447387695312,
"grad_norm": 0.0992191731929779,
"learning_rate": 7.895374298095703e-06,
"lookahead_loss": 3.716779721736908,
"loss": 2.0189,
"step": 441500
},
{
"base_loss": 0.30481894659996034,
"epoch": 3.0514984130859375,
"grad_norm": 0.14181774854660034,
"learning_rate": 7.847690582275392e-06,
"lookahead_loss": 3.6599699621200563,
"loss": 1.9858,
"step": 442000
},
{
"base_loss": 0.30778305551409724,
"epoch": 3.0524520874023438,
"grad_norm": 0.1846870332956314,
"learning_rate": 7.800006866455079e-06,
"lookahead_loss": 3.68911500453949,
"loss": 1.9979,
"step": 442500
},
{
"base_loss": 0.3220736192762852,
"epoch": 3.05340576171875,
"grad_norm": 0.11497963219881058,
"learning_rate": 7.752323150634765e-06,
"lookahead_loss": 3.7116645102500914,
"loss": 2.0122,
"step": 443000
},
{
"base_loss": 0.3551108354330063,
"epoch": 3.0543594360351562,
"grad_norm": 0.10149979591369629,
"learning_rate": 7.704639434814454e-06,
"lookahead_loss": 3.746900239944458,
"loss": 2.0558,
"step": 443500
},
{
"base_loss": 0.2914521896839142,
"epoch": 3.0553131103515625,
"grad_norm": 0.13717815279960632,
"learning_rate": 7.65695571899414e-06,
"lookahead_loss": 3.639768280506134,
"loss": 1.9715,
"step": 444000
},
{
"base_loss": 0.3070342823863029,
"epoch": 3.0562667846679688,
"grad_norm": 0.10753431171178818,
"learning_rate": 7.6092720031738284e-06,
"lookahead_loss": 3.7122844524383547,
"loss": 2.0111,
"step": 444500
},
{
"base_loss": 0.31701629158854483,
"epoch": 3.057220458984375,
"grad_norm": 0.12341847270727158,
"learning_rate": 7.561588287353516e-06,
"lookahead_loss": 3.713340766429901,
"loss": 2.0165,
"step": 445000
},
{
"epoch": 3.057220458984375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1230207685464486,
"eval_lookahead_perplexity": 22.71489254320056,
"eval_loss": 1.6253362894058228,
"eval_perplexity": 5.080127142893444,
"eval_runtime": 274.3775,
"eval_samples_per_second": 18.223,
"eval_steps_per_second": 0.572,
"step": 445000
},
{
"base_loss": 0.3187244310975075,
"epoch": 3.0581741333007812,
"grad_norm": 0.10979755967855453,
"learning_rate": 7.513904571533204e-06,
"lookahead_loss": 3.6891490058898926,
"loss": 2.0024,
"step": 445500
},
{
"base_loss": 0.2901874388754368,
"epoch": 3.0591278076171875,
"grad_norm": 0.09405338019132614,
"learning_rate": 7.466220855712891e-06,
"lookahead_loss": 3.646950806617737,
"loss": 1.9735,
"step": 446000
},
{
"base_loss": 0.30347892227768897,
"epoch": 3.0600814819335938,
"grad_norm": 0.09631290286779404,
"learning_rate": 7.418537139892578e-06,
"lookahead_loss": 3.7013241720199583,
"loss": 2.0025,
"step": 446500
},
{
"base_loss": 0.32231138944625853,
"epoch": 3.06103515625,
"grad_norm": 0.09189051389694214,
"learning_rate": 7.370853424072266e-06,
"lookahead_loss": 3.7054029207229613,
"loss": 2.0104,
"step": 447000
},
{
"base_loss": 0.3056237104833126,
"epoch": 3.0619888305664062,
"grad_norm": 0.1302718222141266,
"learning_rate": 7.323169708251954e-06,
"lookahead_loss": 3.675487512588501,
"loss": 1.9827,
"step": 447500
},
{
"base_loss": 0.3069985309243202,
"epoch": 3.0629425048828125,
"grad_norm": 0.11389657109975815,
"learning_rate": 7.275485992431641e-06,
"lookahead_loss": 3.7175057182312012,
"loss": 2.007,
"step": 448000
},
{
"base_loss": 0.31453078415989877,
"epoch": 3.0638961791992188,
"grad_norm": 0.1031729131937027,
"learning_rate": 7.227802276611328e-06,
"lookahead_loss": 3.7119864888191225,
"loss": 2.0122,
"step": 448500
},
{
"base_loss": 0.3025907655358315,
"epoch": 3.064849853515625,
"grad_norm": 0.1239413321018219,
"learning_rate": 7.180118560791016e-06,
"lookahead_loss": 3.665122102737427,
"loss": 1.9872,
"step": 449000
},
{
"base_loss": 0.3104621644318104,
"epoch": 3.0658035278320312,
"grad_norm": 0.13091668486595154,
"learning_rate": 7.1324348449707034e-06,
"lookahead_loss": 3.685753818035126,
"loss": 1.9961,
"step": 449500
},
{
"base_loss": 0.3051215724647045,
"epoch": 3.0667572021484375,
"grad_norm": 0.1033690795302391,
"learning_rate": 7.084751129150391e-06,
"lookahead_loss": 3.6920931324958803,
"loss": 1.9991,
"step": 450000
},
{
"epoch": 3.0667572021484375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1221639020755267,
"eval_lookahead_perplexity": 22.695437249874335,
"eval_loss": 1.624902367591858,
"eval_perplexity": 5.077923243103105,
"eval_runtime": 274.3111,
"eval_samples_per_second": 18.227,
"eval_steps_per_second": 0.572,
"step": 450000
},
{
"base_loss": 0.3333324840962887,
"epoch": 3.0677108764648438,
"grad_norm": 0.16154102981090546,
"learning_rate": 7.037067413330079e-06,
"lookahead_loss": 3.734944314479828,
"loss": 2.0298,
"step": 450500
},
{
"base_loss": 0.3019326252639294,
"epoch": 3.06866455078125,
"grad_norm": 0.13059785962104797,
"learning_rate": 6.989383697509766e-06,
"lookahead_loss": 3.6473833932876585,
"loss": 1.9719,
"step": 451000
},
{
"base_loss": 0.29965000972151756,
"epoch": 3.0696182250976562,
"grad_norm": 0.1245245486497879,
"learning_rate": 6.941699981689453e-06,
"lookahead_loss": 3.7130830340385437,
"loss": 2.0066,
"step": 451500
},
{
"base_loss": 0.34273114350438116,
"epoch": 3.0705718994140625,
"grad_norm": 0.1069691926240921,
"learning_rate": 6.894016265869141e-06,
"lookahead_loss": 3.7431788458824156,
"loss": 2.0475,
"step": 452000
},
{
"base_loss": 0.3139573369324207,
"epoch": 3.0715255737304688,
"grad_norm": 0.14187411963939667,
"learning_rate": 6.846332550048829e-06,
"lookahead_loss": 3.6735409541130064,
"loss": 1.9913,
"step": 452500
},
{
"base_loss": 0.306064426034689,
"epoch": 3.072479248046875,
"grad_norm": 0.12259159982204437,
"learning_rate": 6.798648834228516e-06,
"lookahead_loss": 3.681407932758331,
"loss": 1.9905,
"step": 453000
},
{
"base_loss": 0.30416936001181605,
"epoch": 3.0734329223632812,
"grad_norm": 0.09869462251663208,
"learning_rate": 6.750965118408203e-06,
"lookahead_loss": 3.6899491953849792,
"loss": 1.9993,
"step": 453500
},
{
"base_loss": 0.32993814861774445,
"epoch": 3.0743865966796875,
"grad_norm": 0.11042125523090363,
"learning_rate": 6.703281402587891e-06,
"lookahead_loss": 3.7116984300613405,
"loss": 2.0173,
"step": 454000
},
{
"base_loss": 0.3029070964753628,
"epoch": 3.0753402709960938,
"grad_norm": 0.12257838994264603,
"learning_rate": 6.6555976867675784e-06,
"lookahead_loss": 3.6630940194129944,
"loss": 1.9823,
"step": 454500
},
{
"base_loss": 0.3045917192697525,
"epoch": 3.0762939453125,
"grad_norm": 0.1443857103586197,
"learning_rate": 6.607913970947266e-06,
"lookahead_loss": 3.714505085945129,
"loss": 2.01,
"step": 455000
},
{
"epoch": 3.0762939453125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.121192988496238,
"eval_lookahead_perplexity": 22.673412635389344,
"eval_loss": 1.6244314908981323,
"eval_perplexity": 5.0755327302579625,
"eval_runtime": 260.1225,
"eval_samples_per_second": 19.222,
"eval_steps_per_second": 0.604,
"step": 455000
},
{
"base_loss": 0.3286729282438755,
"epoch": 3.0772476196289062,
"grad_norm": 0.1463230848312378,
"learning_rate": 6.560230255126954e-06,
"lookahead_loss": 3.7237854881286623,
"loss": 2.0319,
"step": 455500
},
{
"base_loss": 0.30402689361572266,
"epoch": 3.0782012939453125,
"grad_norm": 0.11617710441350937,
"learning_rate": 6.512546539306641e-06,
"lookahead_loss": 3.656920817375183,
"loss": 1.985,
"step": 456000
},
{
"base_loss": 0.2991543865799904,
"epoch": 3.0791549682617188,
"grad_norm": 0.11465635150671005,
"learning_rate": 6.464862823486328e-06,
"lookahead_loss": 3.686256776332855,
"loss": 1.9948,
"step": 456500
},
{
"base_loss": 0.3117095545232296,
"epoch": 3.080108642578125,
"grad_norm": 0.12070228904485703,
"learning_rate": 6.417179107666016e-06,
"lookahead_loss": 3.7134584021568298,
"loss": 2.0171,
"step": 457000
},
{
"base_loss": 0.32279202672839163,
"epoch": 3.0810623168945312,
"grad_norm": 0.16106776893138885,
"learning_rate": 6.369495391845704e-06,
"lookahead_loss": 3.6951747126579284,
"loss": 2.0141,
"step": 457500
},
{
"base_loss": 0.2984404028356075,
"epoch": 3.0820159912109375,
"grad_norm": 0.12972743809223175,
"learning_rate": 6.321811676025391e-06,
"lookahead_loss": 3.663430832386017,
"loss": 1.9873,
"step": 458000
},
{
"base_loss": 0.30507346931099893,
"epoch": 3.0829696655273438,
"grad_norm": 0.09754678606987,
"learning_rate": 6.274127960205078e-06,
"lookahead_loss": 3.6983825812339783,
"loss": 2.0024,
"step": 458500
},
{
"base_loss": 0.3304409826993942,
"epoch": 3.08392333984375,
"grad_norm": 0.10381273180246353,
"learning_rate": 6.226444244384766e-06,
"lookahead_loss": 3.7253785543441773,
"loss": 2.0254,
"step": 459000
},
{
"base_loss": 0.3062775060236454,
"epoch": 3.0848770141601562,
"grad_norm": 0.14119604229927063,
"learning_rate": 6.1787605285644534e-06,
"lookahead_loss": 3.675768536090851,
"loss": 1.9917,
"step": 459500
},
{
"base_loss": 0.2975120039880276,
"epoch": 3.0858306884765625,
"grad_norm": 0.09802526980638504,
"learning_rate": 6.131076812744141e-06,
"lookahead_loss": 3.6808967127799987,
"loss": 1.9873,
"step": 460000
},
{
"epoch": 3.0858306884765625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1203504332338277,
"eval_lookahead_perplexity": 22.654317077917476,
"eval_loss": 1.624000906944275,
"eval_perplexity": 5.073347757747845,
"eval_runtime": 271.0959,
"eval_samples_per_second": 18.444,
"eval_steps_per_second": 0.579,
"step": 460000
},
{
"base_loss": 0.3025768627524376,
"epoch": 3.0867843627929688,
"grad_norm": 0.4642274081707001,
"learning_rate": 6.083393096923829e-06,
"lookahead_loss": 3.6851150598526,
"loss": 1.9892,
"step": 460500
},
{
"base_loss": 0.3363469123840332,
"epoch": 3.087738037109375,
"grad_norm": 0.11057446897029877,
"learning_rate": 6.035709381103516e-06,
"lookahead_loss": 3.736498592376709,
"loss": 2.029,
"step": 461000
},
{
"base_loss": 0.3032494475245476,
"epoch": 3.0886917114257812,
"grad_norm": 0.12081364542245865,
"learning_rate": 5.988025665283203e-06,
"lookahead_loss": 3.658247405529022,
"loss": 1.9801,
"step": 461500
},
{
"base_loss": 0.3093935915529728,
"epoch": 3.0896453857421875,
"grad_norm": 0.11053162068128586,
"learning_rate": 5.940341949462891e-06,
"lookahead_loss": 3.657750358581543,
"loss": 1.9829,
"step": 462000
},
{
"base_loss": 0.29919813787937166,
"epoch": 3.0905990600585938,
"grad_norm": 0.1109168529510498,
"learning_rate": 5.892658233642579e-06,
"lookahead_loss": 3.680602566242218,
"loss": 1.9922,
"step": 462500
},
{
"base_loss": 0.2975557982325554,
"epoch": 3.091552734375,
"grad_norm": 0.10788305848836899,
"learning_rate": 5.844974517822266e-06,
"lookahead_loss": 3.683750663757324,
"loss": 1.9885,
"step": 463000
},
{
"base_loss": 0.3243062843978405,
"epoch": 3.0925064086914062,
"grad_norm": 0.15955285727977753,
"learning_rate": 5.797290802001953e-06,
"lookahead_loss": 3.709198553085327,
"loss": 2.0119,
"step": 463500
},
{
"base_loss": 0.308385471701622,
"epoch": 3.0934600830078125,
"grad_norm": 0.1460312008857727,
"learning_rate": 5.749607086181641e-06,
"lookahead_loss": 3.66904278755188,
"loss": 1.9921,
"step": 464000
},
{
"base_loss": 0.29028289583325384,
"epoch": 3.0944137573242188,
"grad_norm": 0.16574956476688385,
"learning_rate": 5.7019233703613284e-06,
"lookahead_loss": 3.648255637168884,
"loss": 1.9668,
"step": 464500
},
{
"base_loss": 0.29396757900714876,
"epoch": 3.095367431640625,
"grad_norm": 0.10332682728767395,
"learning_rate": 5.654239654541016e-06,
"lookahead_loss": 3.657000524997711,
"loss": 1.9795,
"step": 465000
},
{
"epoch": 3.095367431640625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1194351359297294,
"eval_lookahead_perplexity": 22.63359112921932,
"eval_loss": 1.6235333681106567,
"eval_perplexity": 5.070976325066281,
"eval_runtime": 259.9944,
"eval_samples_per_second": 19.231,
"eval_steps_per_second": 0.604,
"step": 465000
},
{
"base_loss": 0.3007206397950649,
"epoch": 3.0963211059570312,
"grad_norm": 0.22960178554058075,
"learning_rate": 5.606555938720704e-06,
"lookahead_loss": 3.670574773788452,
"loss": 1.9856,
"step": 465500
},
{
"base_loss": 0.33368014737963675,
"epoch": 3.0972747802734375,
"grad_norm": 0.10277973115444183,
"learning_rate": 5.558872222900391e-06,
"lookahead_loss": 3.712209891319275,
"loss": 2.0179,
"step": 466000
},
{
"base_loss": 0.2953821074962616,
"epoch": 3.0982284545898438,
"grad_norm": 0.12367592006921768,
"learning_rate": 5.511188507080078e-06,
"lookahead_loss": 3.64039670085907,
"loss": 1.9701,
"step": 466500
},
{
"base_loss": 0.29632621896266936,
"epoch": 3.09918212890625,
"grad_norm": 0.10013365000486374,
"learning_rate": 5.463504791259766e-06,
"lookahead_loss": 3.676180508136749,
"loss": 1.9884,
"step": 467000
},
{
"base_loss": 0.3015917186141014,
"epoch": 3.1001358032226562,
"grad_norm": 0.1582956314086914,
"learning_rate": 5.415821075439454e-06,
"lookahead_loss": 3.6843478126525877,
"loss": 1.9969,
"step": 467500
},
{
"base_loss": 0.3215226559937,
"epoch": 3.1010894775390625,
"grad_norm": 0.10262365639209747,
"learning_rate": 5.368137359619141e-06,
"lookahead_loss": 3.7145949635505677,
"loss": 2.0143,
"step": 468000
},
{
"base_loss": 0.3008797511458397,
"epoch": 3.1020431518554688,
"grad_norm": 0.1437883973121643,
"learning_rate": 5.320453643798828e-06,
"lookahead_loss": 3.6612454042434694,
"loss": 1.9818,
"step": 468500
},
{
"base_loss": 0.29713244566321373,
"epoch": 3.102996826171875,
"grad_norm": 0.11875070631504059,
"learning_rate": 5.272769927978516e-06,
"lookahead_loss": 3.658306348800659,
"loss": 1.9854,
"step": 469000
},
{
"base_loss": 0.29707186728715895,
"epoch": 3.1039505004882812,
"grad_norm": 0.10225356370210648,
"learning_rate": 5.2250862121582034e-06,
"lookahead_loss": 3.6872463150024415,
"loss": 1.9941,
"step": 469500
},
{
"base_loss": 0.31371429899334907,
"epoch": 3.1049041748046875,
"grad_norm": 0.11156973242759705,
"learning_rate": 5.177402496337891e-06,
"lookahead_loss": 3.6897249317169187,
"loss": 2.005,
"step": 470000
},
{
"epoch": 3.1049041748046875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1185580457742224,
"eval_lookahead_perplexity": 22.613748132576696,
"eval_loss": 1.6231098175048828,
"eval_perplexity": 5.068828964761916,
"eval_runtime": 273.7848,
"eval_samples_per_second": 18.263,
"eval_steps_per_second": 0.573,
"step": 470000
},
{
"base_loss": 0.30964688101410864,
"epoch": 3.1058578491210938,
"grad_norm": 0.14706304669380188,
"learning_rate": 5.129718780517579e-06,
"lookahead_loss": 3.6665027503967287,
"loss": 1.9853,
"step": 470500
},
{
"base_loss": 0.2950109542310238,
"epoch": 3.1068115234375,
"grad_norm": 0.10974892228841782,
"learning_rate": 5.082035064697266e-06,
"lookahead_loss": 3.649388165473938,
"loss": 1.9749,
"step": 471000
},
{
"base_loss": 0.29701292705535887,
"epoch": 3.1077651977539062,
"grad_norm": 0.14115644991397858,
"learning_rate": 5.034351348876953e-06,
"lookahead_loss": 3.6766736326217653,
"loss": 1.9849,
"step": 471500
},
{
"base_loss": 0.3228313593864441,
"epoch": 3.1087188720703125,
"grad_norm": 0.12014192342758179,
"learning_rate": 4.986667633056641e-06,
"lookahead_loss": 3.7084100289344786,
"loss": 2.0137,
"step": 472000
},
{
"base_loss": 0.31703535151481627,
"epoch": 3.1096725463867188,
"grad_norm": 0.11289548873901367,
"learning_rate": 4.938983917236329e-06,
"lookahead_loss": 3.674653216838837,
"loss": 1.9978,
"step": 472500
},
{
"base_loss": 0.2994038117825985,
"epoch": 3.110626220703125,
"grad_norm": 0.10673966258764267,
"learning_rate": 4.891300201416016e-06,
"lookahead_loss": 3.6496841259002686,
"loss": 1.9763,
"step": 473000
},
{
"base_loss": 0.2989161580502987,
"epoch": 3.1115798950195312,
"grad_norm": 0.28921955823898315,
"learning_rate": 4.843616485595703e-06,
"lookahead_loss": 3.6857114777565,
"loss": 1.9901,
"step": 473500
},
{
"base_loss": 0.30562417407333853,
"epoch": 3.1125335693359375,
"grad_norm": 0.1318550556898117,
"learning_rate": 4.795932769775391e-06,
"lookahead_loss": 3.693369821548462,
"loss": 2.0018,
"step": 474000
},
{
"base_loss": 0.3457943990826607,
"epoch": 3.1134872436523438,
"grad_norm": 0.11760886013507843,
"learning_rate": 4.7482490539550784e-06,
"lookahead_loss": 3.7278725819587706,
"loss": 2.0282,
"step": 474500
},
{
"base_loss": 0.2958811685740948,
"epoch": 3.11444091796875,
"grad_norm": 0.11645074933767319,
"learning_rate": 4.700565338134766e-06,
"lookahead_loss": 3.6444467964172365,
"loss": 1.9711,
"step": 475000
},
{
"epoch": 3.11444091796875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.11814274498449,
"eval_lookahead_perplexity": 22.604358574998717,
"eval_loss": 1.6229058504104614,
"eval_perplexity": 5.0677951958768555,
"eval_runtime": 273.0672,
"eval_samples_per_second": 18.311,
"eval_steps_per_second": 0.575,
"step": 475000
},
{
"base_loss": 0.2985372878909111,
"epoch": 3.1153945922851562,
"grad_norm": 0.3836623430252075,
"learning_rate": 4.652881622314453e-06,
"lookahead_loss": 3.688412217617035,
"loss": 1.9976,
"step": 475500
},
{
"base_loss": 0.31154814088344573,
"epoch": 3.1163482666015625,
"grad_norm": 0.161661297082901,
"learning_rate": 4.605197906494141e-06,
"lookahead_loss": 3.6807560696601866,
"loss": 1.9983,
"step": 476000
},
{
"base_loss": 0.3238145258128643,
"epoch": 3.1173019409179688,
"grad_norm": 0.10365596413612366,
"learning_rate": 4.557514190673828e-06,
"lookahead_loss": 3.7186170196533204,
"loss": 2.0244,
"step": 476500
},
{
"base_loss": 0.30326704213023187,
"epoch": 3.118255615234375,
"grad_norm": 0.10426798462867737,
"learning_rate": 4.509830474853516e-06,
"lookahead_loss": 3.6723845586776735,
"loss": 1.9861,
"step": 477000
},
{
"base_loss": 0.30094251811504363,
"epoch": 3.1192092895507812,
"grad_norm": 0.1405145525932312,
"learning_rate": 4.462146759033204e-06,
"lookahead_loss": 3.6822138290405273,
"loss": 1.9895,
"step": 477500
},
{
"base_loss": 0.3028905067443848,
"epoch": 4.000953674316406,
"grad_norm": 0.11994955688714981,
"learning_rate": 4.4144630432128904e-06,
"lookahead_loss": 3.694705171585083,
"loss": 1.9938,
"step": 478000
},
{
"base_loss": 0.30170239555835726,
"epoch": 4.0019073486328125,
"grad_norm": 0.17390646040439606,
"learning_rate": 4.366779327392578e-06,
"lookahead_loss": 3.677241044521332,
"loss": 1.9921,
"step": 478500
},
{
"base_loss": 0.3116890364587307,
"epoch": 4.002861022949219,
"grad_norm": 0.10386445373296738,
"learning_rate": 4.319095611572266e-06,
"lookahead_loss": 3.693949136734009,
"loss": 1.994,
"step": 479000
},
{
"base_loss": 0.32097554665803907,
"epoch": 4.003814697265625,
"grad_norm": 0.11139928549528122,
"learning_rate": 4.2714118957519534e-06,
"lookahead_loss": 3.6914780583381654,
"loss": 2.0077,
"step": 479500
},
{
"base_loss": 0.3001700294613838,
"epoch": 4.004768371582031,
"grad_norm": 0.09557037055492401,
"learning_rate": 4.223728179931641e-06,
"lookahead_loss": 3.650711070537567,
"loss": 1.9799,
"step": 480000
},
{
"epoch": 4.004768371582031,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.117379737738222,
"eval_lookahead_perplexity": 22.587117863838966,
"eval_loss": 1.622514009475708,
"eval_perplexity": 5.065809815272263,
"eval_runtime": 262.804,
"eval_samples_per_second": 19.026,
"eval_steps_per_second": 0.597,
"step": 480000
},
{
"base_loss": 0.29891238936781883,
"epoch": 4.0057220458984375,
"grad_norm": 0.12393570691347122,
"learning_rate": 4.176044464111328e-06,
"lookahead_loss": 3.6757360472679137,
"loss": 1.9858,
"step": 480500
},
{
"base_loss": 0.297944345831871,
"epoch": 4.006675720214844,
"grad_norm": 0.10606178641319275,
"learning_rate": 4.128360748291016e-06,
"lookahead_loss": 3.688351684093475,
"loss": 1.9954,
"step": 481000
},
{
"base_loss": 0.3113737390637398,
"epoch": 4.00762939453125,
"grad_norm": 0.12270639836788177,
"learning_rate": 4.080677032470703e-06,
"lookahead_loss": 3.696225019454956,
"loss": 2.0012,
"step": 481500
},
{
"base_loss": 0.3164871991574764,
"epoch": 4.008583068847656,
"grad_norm": 0.11441905051469803,
"learning_rate": 4.032993316650391e-06,
"lookahead_loss": 3.6902763628959656,
"loss": 1.9955,
"step": 482000
},
{
"base_loss": 0.30087858831882475,
"epoch": 4.0095367431640625,
"grad_norm": 0.113109290599823,
"learning_rate": 3.985309600830079e-06,
"lookahead_loss": 3.645549575805664,
"loss": 1.9779,
"step": 482500
},
{
"base_loss": 0.29908930853009225,
"epoch": 4.010490417480469,
"grad_norm": 0.10333634167909622,
"learning_rate": 3.9376258850097654e-06,
"lookahead_loss": 3.685707633972168,
"loss": 1.991,
"step": 483000
},
{
"base_loss": 0.30296807369589807,
"epoch": 4.011444091796875,
"grad_norm": 0.09995999187231064,
"learning_rate": 3.889942169189453e-06,
"lookahead_loss": 3.6709425745010376,
"loss": 1.9912,
"step": 483500
},
{
"base_loss": 0.3258657184243202,
"epoch": 4.012397766113281,
"grad_norm": 0.0958399698138237,
"learning_rate": 3.842258453369141e-06,
"lookahead_loss": 3.7037739310264586,
"loss": 2.0136,
"step": 484000
},
{
"base_loss": 0.30683475187420844,
"epoch": 4.0133514404296875,
"grad_norm": 0.12681221961975098,
"learning_rate": 3.7945747375488284e-06,
"lookahead_loss": 3.656396279811859,
"loss": 1.9858,
"step": 484500
},
{
"base_loss": 0.2975224345624447,
"epoch": 4.014305114746094,
"grad_norm": 0.12238068878650665,
"learning_rate": 3.7468910217285157e-06,
"lookahead_loss": 3.6545630931854247,
"loss": 1.9783,
"step": 485000
},
{
"epoch": 4.014305114746094,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.11680754533591,
"eval_lookahead_perplexity": 22.574197383460017,
"eval_loss": 1.622229814529419,
"eval_perplexity": 5.064370342279065,
"eval_runtime": 274.0996,
"eval_samples_per_second": 18.242,
"eval_steps_per_second": 0.573,
"step": 485000
},
{
"base_loss": 0.30197526678442954,
"epoch": 1.0009536743164062,
"grad_norm": 0.11313185840845108,
"learning_rate": 3.6992073059082034e-06,
"lookahead_loss": 3.693221253395081,
"loss": 1.9935,
"step": 485500
},
{
"base_loss": 0.303896483540535,
"epoch": 1.0019073486328125,
"grad_norm": 0.167490616440773,
"learning_rate": 3.6515235900878906e-06,
"lookahead_loss": 3.677141076564789,
"loss": 1.9921,
"step": 486000
},
{
"base_loss": 0.3094813532233238,
"epoch": 1.0028610229492188,
"grad_norm": 0.10422538220882416,
"learning_rate": 3.6038398742675783e-06,
"lookahead_loss": 3.6902101335525512,
"loss": 1.9925,
"step": 486500
},
{
"base_loss": 0.3199170651733875,
"epoch": 1.003814697265625,
"grad_norm": 0.11597315967082977,
"learning_rate": 3.556156158447266e-06,
"lookahead_loss": 3.691033944129944,
"loss": 2.007,
"step": 487000
},
{
"base_loss": 0.30184895062446593,
"epoch": 1.0047683715820312,
"grad_norm": 0.09484552592039108,
"learning_rate": 3.508472442626953e-06,
"lookahead_loss": 3.651390314102173,
"loss": 1.9807,
"step": 487500
},
{
"base_loss": 0.2977984355092049,
"epoch": 1.0057220458984375,
"grad_norm": 0.11500236392021179,
"learning_rate": 3.460788726806641e-06,
"lookahead_loss": 3.674714815616608,
"loss": 1.9851,
"step": 488000
},
{
"base_loss": 0.2989386010617018,
"epoch": 1.0066757202148438,
"grad_norm": 0.10374879837036133,
"learning_rate": 3.413105010986328e-06,
"lookahead_loss": 3.6858183379173277,
"loss": 1.9948,
"step": 488500
},
{
"base_loss": 0.3137947543263435,
"epoch": 1.00762939453125,
"grad_norm": 0.12499138712882996,
"learning_rate": 3.3654212951660158e-06,
"lookahead_loss": 3.698023428440094,
"loss": 2.0016,
"step": 489000
},
{
"base_loss": 0.31258952274918556,
"epoch": 1.0085830688476562,
"grad_norm": 0.11980710178613663,
"learning_rate": 3.3177375793457034e-06,
"lookahead_loss": 3.6856638407707214,
"loss": 1.993,
"step": 489500
},
{
"base_loss": 0.3022870315015316,
"epoch": 1.0095367431640625,
"grad_norm": 0.11947501450777054,
"learning_rate": 3.2700538635253907e-06,
"lookahead_loss": 3.644882921695709,
"loss": 1.9775,
"step": 490000
},
{
"epoch": 1.0095367431640625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1163531157155386,
"eval_lookahead_perplexity": 22.563941330016597,
"eval_loss": 1.6219943761825562,
"eval_perplexity": 5.063178135648863,
"eval_runtime": 270.228,
"eval_samples_per_second": 18.503,
"eval_steps_per_second": 0.581,
"step": 490000
},
{
"base_loss": 0.29763062146306035,
"epoch": 1.0104904174804688,
"grad_norm": 0.1063813641667366,
"learning_rate": 3.2223701477050784e-06,
"lookahead_loss": 3.681282151699066,
"loss": 1.9884,
"step": 490500
},
{
"base_loss": 0.3011737278997898,
"epoch": 1.011444091796875,
"grad_norm": 0.10796815901994705,
"learning_rate": 3.1746864318847656e-06,
"lookahead_loss": 3.6680510277748106,
"loss": 1.9893,
"step": 491000
},
{
"base_loss": 0.32275081843137743,
"epoch": 1.0123977661132812,
"grad_norm": 0.08862055093050003,
"learning_rate": 3.1270027160644533e-06,
"lookahead_loss": 3.700777189731598,
"loss": 2.0121,
"step": 491500
},
{
"base_loss": 0.30656733042001727,
"epoch": 1.0133514404296875,
"grad_norm": 0.11730780452489853,
"learning_rate": 3.079319000244141e-06,
"lookahead_loss": 3.6528478350639344,
"loss": 1.984,
"step": 492000
},
{
"base_loss": 0.29944423550367355,
"epoch": 1.0143051147460938,
"grad_norm": 0.11960422992706299,
"learning_rate": 3.031635284423828e-06,
"lookahead_loss": 3.654515419960022,
"loss": 1.9772,
"step": 492500
},
{
"base_loss": 0.29441548812389373,
"epoch": 1.0152587890625,
"grad_norm": 0.12259198725223541,
"learning_rate": 2.983951568603516e-06,
"lookahead_loss": 3.6766817421913145,
"loss": 1.9865,
"step": 493000
},
{
"base_loss": 0.31012057706713675,
"epoch": 1.0162124633789062,
"grad_norm": 0.12037604302167892,
"learning_rate": 2.936267852783203e-06,
"lookahead_loss": 3.6906212878227236,
"loss": 2.0039,
"step": 493500
},
{
"base_loss": 0.3121089872717857,
"epoch": 1.0171661376953125,
"grad_norm": 0.13175299763679504,
"learning_rate": 2.8885841369628908e-06,
"lookahead_loss": 3.687456472873688,
"loss": 1.9945,
"step": 494000
},
{
"base_loss": 0.30401164934039115,
"epoch": 1.0181198120117188,
"grad_norm": 0.13393071293830872,
"learning_rate": 2.8409004211425784e-06,
"lookahead_loss": 3.6606993680000306,
"loss": 1.9758,
"step": 494500
},
{
"base_loss": 0.29751659095287325,
"epoch": 1.019073486328125,
"grad_norm": 0.1026306077837944,
"learning_rate": 2.7932167053222657e-06,
"lookahead_loss": 3.6879693541526795,
"loss": 1.994,
"step": 495000
},
{
"epoch": 1.019073486328125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1158480590905624,
"eval_lookahead_perplexity": 22.552548139307703,
"eval_loss": 1.6217412948608398,
"eval_perplexity": 5.061896901969204,
"eval_runtime": 275.022,
"eval_samples_per_second": 18.18,
"eval_steps_per_second": 0.571,
"step": 495000
},
{
"base_loss": 0.30135778871178626,
"epoch": 1.0200271606445312,
"grad_norm": 0.12621034681797028,
"learning_rate": 2.7455329895019534e-06,
"lookahead_loss": 3.668645941734314,
"loss": 1.9875,
"step": 495500
},
{
"base_loss": 0.33046225929260253,
"epoch": 1.0209808349609375,
"grad_norm": 0.14475296437740326,
"learning_rate": 2.6978492736816406e-06,
"lookahead_loss": 3.725844889640808,
"loss": 2.0219,
"step": 496000
},
{
"base_loss": 0.30414657789468763,
"epoch": 1.0219345092773438,
"grad_norm": 0.10405510663986206,
"learning_rate": 2.6501655578613283e-06,
"lookahead_loss": 3.65455238866806,
"loss": 1.9765,
"step": 496500
},
{
"base_loss": 0.30107878148555756,
"epoch": 1.02288818359375,
"grad_norm": 0.15244410932064056,
"learning_rate": 2.602481842041016e-06,
"lookahead_loss": 3.661600576877594,
"loss": 1.9842,
"step": 497000
},
{
"base_loss": 0.3019954281449318,
"epoch": 1.0238418579101562,
"grad_norm": 0.10435742884874344,
"learning_rate": 2.554798126220703e-06,
"lookahead_loss": 3.6684547848701476,
"loss": 1.9873,
"step": 497500
},
{
"base_loss": 0.3265440165698528,
"epoch": 1.0247955322265625,
"grad_norm": 0.11240936815738678,
"learning_rate": 2.507114410400391e-06,
"lookahead_loss": 3.7140092349052427,
"loss": 2.0185,
"step": 498000
},
{
"base_loss": 0.3089427370727062,
"epoch": 1.0257492065429688,
"grad_norm": 0.12863673269748688,
"learning_rate": 2.459430694580078e-06,
"lookahead_loss": 3.660076278209686,
"loss": 1.9887,
"step": 498500
},
{
"base_loss": 0.306296229749918,
"epoch": 1.026702880859375,
"grad_norm": 0.13621826469898224,
"learning_rate": 2.4117469787597658e-06,
"lookahead_loss": 3.6829268450737,
"loss": 1.9866,
"step": 499000
},
{
"base_loss": 0.30920383241772653,
"epoch": 1.0276565551757812,
"grad_norm": 0.1093730702996254,
"learning_rate": 2.3640632629394534e-06,
"lookahead_loss": 3.6884715824127197,
"loss": 1.995,
"step": 499500
},
{
"base_loss": 0.33220478031039236,
"epoch": 1.0286102294921875,
"grad_norm": 0.12156689912080765,
"learning_rate": 2.3163795471191407e-06,
"lookahead_loss": 3.7322032294273377,
"loss": 2.0268,
"step": 500000
},
{
"epoch": 1.0286102294921875,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1155673802470245,
"eval_lookahead_perplexity": 22.54621900444581,
"eval_loss": 1.621601939201355,
"eval_perplexity": 5.061191547136921,
"eval_runtime": 263.6593,
"eval_samples_per_second": 18.964,
"eval_steps_per_second": 0.595,
"step": 500000
},
{
"base_loss": 0.30326411041617396,
"epoch": 1.0295639038085938,
"grad_norm": 0.11229516565799713,
"learning_rate": 2.2686958312988284e-06,
"lookahead_loss": 3.656010775089264,
"loss": 1.9792,
"step": 500500
},
{
"base_loss": 0.3031138954460621,
"epoch": 1.030517578125,
"grad_norm": 0.14705514907836914,
"learning_rate": 2.2210121154785156e-06,
"lookahead_loss": 3.6846701459884645,
"loss": 1.9967,
"step": 501000
},
{
"base_loss": 0.30234866255521775,
"epoch": 1.0314712524414062,
"grad_norm": 0.12098229676485062,
"learning_rate": 2.1733283996582033e-06,
"lookahead_loss": 3.690236645698547,
"loss": 1.9955,
"step": 501500
},
{
"base_loss": 0.3155796425938606,
"epoch": 1.0324249267578125,
"grad_norm": 0.13642369210720062,
"learning_rate": 2.125644683837891e-06,
"lookahead_loss": 3.709054000377655,
"loss": 2.0181,
"step": 502000
},
{
"base_loss": 0.3022744803726673,
"epoch": 1.0333786010742188,
"grad_norm": 0.15626195073127747,
"learning_rate": 2.077960968017578e-06,
"lookahead_loss": 3.653508902549744,
"loss": 1.9791,
"step": 502500
},
{
"base_loss": 0.30410280799865724,
"epoch": 1.034332275390625,
"grad_norm": 0.12734845280647278,
"learning_rate": 2.030277252197266e-06,
"lookahead_loss": 3.688762324333191,
"loss": 1.9975,
"step": 503000
},
{
"base_loss": 0.3077150760293007,
"epoch": 1.0352859497070312,
"grad_norm": 0.12563078105449677,
"learning_rate": 1.982593536376953e-06,
"lookahead_loss": 3.680134729385376,
"loss": 1.9968,
"step": 503500
},
{
"base_loss": 0.3269314341843128,
"epoch": 1.0362396240234375,
"grad_norm": 0.10034479200839996,
"learning_rate": 1.9349098205566408e-06,
"lookahead_loss": 3.7161739377975462,
"loss": 2.0173,
"step": 504000
},
{
"base_loss": 0.30525318866968154,
"epoch": 1.0371932983398438,
"grad_norm": 0.10954893380403519,
"learning_rate": 1.8872261047363282e-06,
"lookahead_loss": 3.667464601516724,
"loss": 1.9871,
"step": 504500
},
{
"base_loss": 0.3003401378691196,
"epoch": 1.03814697265625,
"grad_norm": 0.14078158140182495,
"learning_rate": 1.8395423889160157e-06,
"lookahead_loss": 3.664040919303894,
"loss": 1.9849,
"step": 505000
},
{
"epoch": 1.03814697265625,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1153540405602502,
"eval_lookahead_perplexity": 22.54140951419127,
"eval_loss": 1.6214919090270996,
"eval_perplexity": 5.0606346939849365,
"eval_runtime": 273.2183,
"eval_samples_per_second": 18.3,
"eval_steps_per_second": 0.575,
"step": 505000
},
{
"base_loss": 0.3116057882905006,
"epoch": 1.0391006469726562,
"grad_norm": 0.12243175506591797,
"learning_rate": 1.7918586730957031e-06,
"lookahead_loss": 3.6875705614089966,
"loss": 2.0017,
"step": 505500
},
{
"base_loss": 0.31716569018363955,
"epoch": 1.0400543212890625,
"grad_norm": 0.1560487598180771,
"learning_rate": 1.7441749572753908e-06,
"lookahead_loss": 3.7047328453063963,
"loss": 2.0112,
"step": 506000
},
{
"base_loss": 0.31002197673916815,
"epoch": 1.0410079956054688,
"grad_norm": 0.11027877777814865,
"learning_rate": 1.6964912414550783e-06,
"lookahead_loss": 3.6693738231658934,
"loss": 1.9847,
"step": 506500
},
{
"base_loss": 0.29521755149960516,
"epoch": 1.041961669921875,
"grad_norm": 0.1359536051750183,
"learning_rate": 1.6488075256347657e-06,
"lookahead_loss": 3.6803672742843627,
"loss": 1.9893,
"step": 507000
},
{
"base_loss": 0.30736519694328307,
"epoch": 1.0429153442382812,
"grad_norm": 0.290884405374527,
"learning_rate": 1.6011238098144532e-06,
"lookahead_loss": 3.676666582584381,
"loss": 1.9959,
"step": 507500
},
{
"base_loss": 0.3271687869429588,
"epoch": 1.0438690185546875,
"grad_norm": 0.13232041895389557,
"learning_rate": 1.5534400939941406e-06,
"lookahead_loss": 3.7064253492355346,
"loss": 2.0222,
"step": 508000
},
{
"base_loss": 0.2943850245475769,
"epoch": 1.0448226928710938,
"grad_norm": 0.17723123729228973,
"learning_rate": 1.505756378173828e-06,
"lookahead_loss": 3.644582795619965,
"loss": 1.9702,
"step": 508500
},
{
"base_loss": 0.30418619123101237,
"epoch": 1.0457763671875,
"grad_norm": 0.10071691125631332,
"learning_rate": 1.4580726623535158e-06,
"lookahead_loss": 3.705183662891388,
"loss": 2.0024,
"step": 509000
},
{
"base_loss": 0.32734892451763153,
"epoch": 1.0467300415039062,
"grad_norm": 0.10738521814346313,
"learning_rate": 1.4103889465332032e-06,
"lookahead_loss": 3.7058504252433777,
"loss": 2.0141,
"step": 509500
},
{
"base_loss": 0.32642096510529517,
"epoch": 1.0476837158203125,
"grad_norm": 0.18155638873577118,
"learning_rate": 1.3627052307128907e-06,
"lookahead_loss": 3.713751731872559,
"loss": 2.0234,
"step": 510000
},
{
"epoch": 1.0476837158203125,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.115112134823784,
"eval_lookahead_perplexity": 22.535957277412017,
"eval_loss": 1.6213737726211548,
"eval_perplexity": 5.06003688410264,
"eval_runtime": 270.6265,
"eval_samples_per_second": 18.476,
"eval_steps_per_second": 0.58,
"step": 510000
},
{
"base_loss": 0.29649946457147597,
"epoch": 1.0486373901367188,
"grad_norm": 0.10251113772392273,
"learning_rate": 1.3150215148925781e-06,
"lookahead_loss": 3.6475554642677306,
"loss": 1.9757,
"step": 510500
},
{
"base_loss": 0.3057677939236164,
"epoch": 1.049591064453125,
"grad_norm": 0.10838726907968521,
"learning_rate": 1.2673377990722656e-06,
"lookahead_loss": 3.681268889427185,
"loss": 1.993,
"step": 511000
},
{
"base_loss": 0.3218669015169144,
"epoch": 1.0505447387695312,
"grad_norm": 0.09824109077453613,
"learning_rate": 1.2196540832519533e-06,
"lookahead_loss": 3.7119429187774657,
"loss": 2.0162,
"step": 511500
},
{
"base_loss": 0.308034790366888,
"epoch": 1.0514984130859375,
"grad_norm": 0.14391624927520752,
"learning_rate": 1.1719703674316407e-06,
"lookahead_loss": 3.6557554478645327,
"loss": 1.9841,
"step": 512000
},
{
"base_loss": 0.30695659655332563,
"epoch": 1.0524520874023438,
"grad_norm": 0.18154321610927582,
"learning_rate": 1.1242866516113282e-06,
"lookahead_loss": 3.6813112597465514,
"loss": 1.9938,
"step": 512500
},
{
"base_loss": 0.3215196977555752,
"epoch": 1.05340576171875,
"grad_norm": 0.12306945025920868,
"learning_rate": 1.0766029357910156e-06,
"lookahead_loss": 3.7058457975387573,
"loss": 2.0082,
"step": 513000
},
{
"base_loss": 0.35528673872351646,
"epoch": 1.0543594360351562,
"grad_norm": 0.10241065919399261,
"learning_rate": 1.028919219970703e-06,
"lookahead_loss": 3.7413923802375795,
"loss": 2.0523,
"step": 513500
},
{
"base_loss": 0.2939756731390953,
"epoch": 1.0553131103515625,
"grad_norm": 0.13681042194366455,
"learning_rate": 9.812355041503908e-07,
"lookahead_loss": 3.63578445148468,
"loss": 1.97,
"step": 514000
},
{
"base_loss": 0.30533788445591925,
"epoch": 1.0562667846679688,
"grad_norm": 0.1053692176938057,
"learning_rate": 9.335517883300781e-07,
"lookahead_loss": 3.703752159118652,
"loss": 2.0064,
"step": 514500
},
{
"base_loss": 0.3139654756486416,
"epoch": 1.057220458984375,
"grad_norm": 0.11466662585735321,
"learning_rate": 8.858680725097657e-07,
"lookahead_loss": 3.703360953807831,
"loss": 2.0123,
"step": 515000
},
{
"epoch": 1.057220458984375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1149530372680565,
"eval_lookahead_perplexity": 22.532372146893515,
"eval_loss": 1.6212981939315796,
"eval_perplexity": 5.059654467597189,
"eval_runtime": 262.3851,
"eval_samples_per_second": 19.056,
"eval_steps_per_second": 0.598,
"step": 515000
},
{
"base_loss": 0.32300865882635116,
"epoch": 1.0581741333007812,
"grad_norm": 0.10684940963983536,
"learning_rate": 8.381843566894531e-07,
"lookahead_loss": 3.684891324043274,
"loss": 2.0007,
"step": 515500
},
{
"base_loss": 0.28907309558987615,
"epoch": 1.0591278076171875,
"grad_norm": 0.09958865493535995,
"learning_rate": 7.905006408691407e-07,
"lookahead_loss": 3.6399435262680053,
"loss": 1.9711,
"step": 516000
},
{
"base_loss": 0.30057010012865065,
"epoch": 1.0600814819335938,
"grad_norm": 0.097833551466465,
"learning_rate": 7.428169250488282e-07,
"lookahead_loss": 3.6915335173606874,
"loss": 1.9978,
"step": 516500
},
{
"base_loss": 0.32112616834044455,
"epoch": 1.06103515625,
"grad_norm": 0.09406998753547668,
"learning_rate": 6.951332092285156e-07,
"lookahead_loss": 3.697935612201691,
"loss": 2.0075,
"step": 517000
},
{
"base_loss": 0.3060283879637718,
"epoch": 1.0619888305664062,
"grad_norm": 0.13066363334655762,
"learning_rate": 6.474494934082032e-07,
"lookahead_loss": 3.6692494864463807,
"loss": 1.9784,
"step": 517500
},
{
"base_loss": 0.31152518782019617,
"epoch": 1.0629425048828125,
"grad_norm": 0.1175675168633461,
"learning_rate": 5.997657775878906e-07,
"lookahead_loss": 3.712639572620392,
"loss": 2.0057,
"step": 518000
},
{
"base_loss": 0.3149063532948494,
"epoch": 1.0638961791992188,
"grad_norm": 0.1051829382777214,
"learning_rate": 5.520820617675782e-07,
"lookahead_loss": 3.7067093458175657,
"loss": 2.0088,
"step": 518500
},
{
"base_loss": 0.30411062452197074,
"epoch": 1.064849853515625,
"grad_norm": 0.12518513202667236,
"learning_rate": 5.043983459472657e-07,
"lookahead_loss": 3.6613830890655517,
"loss": 1.9863,
"step": 519000
},
{
"base_loss": 0.30933507332205773,
"epoch": 1.0658035278320312,
"grad_norm": 0.13780134916305542,
"learning_rate": 4.5671463012695317e-07,
"lookahead_loss": 3.6811904344558717,
"loss": 1.994,
"step": 519500
},
{
"base_loss": 0.30638799047470094,
"epoch": 1.0667572021484375,
"grad_norm": 0.10666853189468384,
"learning_rate": 4.0903091430664063e-07,
"lookahead_loss": 3.6868834929466248,
"loss": 1.9957,
"step": 520000
},
{
"epoch": 1.0667572021484375,
"eval_accuracy": 0.00254853228962818,
"eval_base_loss": 0.13120111280355973,
"eval_base_perplexity": 1.1401970664837768,
"eval_lookahead_loss": 3.1148510923781716,
"eval_lookahead_perplexity": 22.53007520377895,
"eval_loss": 1.6212482452392578,
"eval_perplexity": 5.059401750784422,
"eval_runtime": 271.4662,
"eval_samples_per_second": 18.418,
"eval_steps_per_second": 0.578,
"step": 520000
},
{
"base_loss": 0.32938760298490527,
"epoch": 1.0677108764648438,
"grad_norm": 0.1531253606081009,
"learning_rate": 3.6134719848632814e-07,
"lookahead_loss": 3.7249799466133116,
"loss": 2.0249,
"step": 520500
},
{
"base_loss": 0.29950347980856895,
"epoch": 1.06866455078125,
"grad_norm": 0.1346207857131958,
"learning_rate": 3.1366348266601565e-07,
"lookahead_loss": 3.639112250804901,
"loss": 1.9671,
"step": 521000
},
{
"base_loss": 0.30374919882416723,
"epoch": 1.0696182250976562,
"grad_norm": 0.13088198006153107,
"learning_rate": 2.6597976684570316e-07,
"lookahead_loss": 3.7110060076713562,
"loss": 2.0063,
"step": 521500
},
{
"base_loss": 0.34455711591243743,
"epoch": 1.0705718994140625,
"grad_norm": 0.10850070416927338,
"learning_rate": 2.1829605102539064e-07,
"lookahead_loss": 3.7400474700927733,
"loss": 2.0458,
"step": 522000
},
{
"base_loss": 0.31508783569931986,
"epoch": 1.0715255737304688,
"grad_norm": 0.14581139385700226,
"learning_rate": 1.7061233520507813e-07,
"lookahead_loss": 3.6696447038650515,
"loss": 1.9897,
"step": 522500
},
{
"base_loss": 0.3064769520163536,
"epoch": 1.072479248046875,
"grad_norm": 0.12252921611070633,
"learning_rate": 1.2292861938476564e-07,
"lookahead_loss": 3.6762770075798032,
"loss": 1.9898,
"step": 523000
},
{
"base_loss": 0.3032188524603844,
"epoch": 1.0734329223632812,
"grad_norm": 0.09782757610082626,
"learning_rate": 7.524490356445312e-08,
"lookahead_loss": 3.6822463884353636,
"loss": 1.9965,
"step": 523500
},
{
"base_loss": 0.3287756524384022,
"epoch": 1.0743865966796875,
"grad_norm": 0.11079169809818268,
"learning_rate": 2.7561187744140627e-08,
"lookahead_loss": 3.706464078426361,
"loss": 2.0165,
"step": 524000
},
{
"epoch": 1.0749359130859375,
"step": 524288,
"total_flos": 3.285699411601208e+19,
"train_loss": 0.14963702380191535,
"train_runtime": 35362.9264,
"train_samples_per_second": 474.43,
"train_steps_per_second": 14.826
}
],
"logging_steps": 500,
"max_steps": 524288,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.285699411601208e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}