flan-t5laa2-large / trainer_state.json
hrezaei's picture
End of training
ab25e73 verified
{
"best_global_step": null,
"best_metric": 0.1416620910167694,
"best_model_checkpoint": null,
"epoch": 2.1083145141601562,
"eval_steps": 5000,
"global_step": 524288,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"base_loss": 0.3044133240580559,
"epoch": 0.00095367431640625,
"grad_norm": 0.0010083107044920325,
"learning_rate": 4.995241165161133e-05,
"lookahead_loss": 10.315906455993652,
"loss": 0.3208,
"step": 500
},
{
"base_loss": 0.30059696701169014,
"epoch": 0.0019073486328125,
"grad_norm": 0.0010226964950561523,
"learning_rate": 4.990472793579102e-05,
"lookahead_loss": 10.178516641616822,
"loss": 0.3205,
"step": 1000
},
{
"base_loss": 0.31169990518689156,
"epoch": 0.00286102294921875,
"grad_norm": 0.001013169065117836,
"learning_rate": 4.98570442199707e-05,
"lookahead_loss": 10.051177593231202,
"loss": 0.3281,
"step": 1500
},
{
"base_loss": 0.3227726019620895,
"epoch": 0.003814697265625,
"grad_norm": 0.0010217369999736547,
"learning_rate": 4.9809360504150393e-05,
"lookahead_loss": 9.926475008010865,
"loss": 0.3417,
"step": 2000
},
{
"base_loss": 0.3022470915019512,
"epoch": 0.00476837158203125,
"grad_norm": 0.0010057457257062197,
"learning_rate": 4.9761676788330084e-05,
"lookahead_loss": 9.79694257736206,
"loss": 0.3232,
"step": 2500
},
{
"base_loss": 0.30552061820030213,
"epoch": 0.0057220458984375,
"grad_norm": 0.0008910459582693875,
"learning_rate": 4.971399307250977e-05,
"lookahead_loss": 9.700162549972534,
"loss": 0.3197,
"step": 3000
},
{
"base_loss": 0.2953472335338593,
"epoch": 0.00667572021484375,
"grad_norm": 0.001025513163767755,
"learning_rate": 4.966630935668946e-05,
"lookahead_loss": 9.54606210899353,
"loss": 0.3201,
"step": 3500
},
{
"base_loss": 0.312746944963932,
"epoch": 0.00762939453125,
"grad_norm": 0.0010036778403446078,
"learning_rate": 4.961862564086914e-05,
"lookahead_loss": 9.464179010391236,
"loss": 0.3296,
"step": 4000
},
{
"base_loss": 0.3169711889922619,
"epoch": 0.00858306884765625,
"grad_norm": 0.0009707122226245701,
"learning_rate": 4.957094192504883e-05,
"lookahead_loss": 9.352066455841065,
"loss": 0.3273,
"step": 4500
},
{
"base_loss": 0.306710629016161,
"epoch": 0.0095367431640625,
"grad_norm": 0.001031655934639275,
"learning_rate": 4.952325820922852e-05,
"lookahead_loss": 9.252665700912475,
"loss": 0.3249,
"step": 5000
},
{
"epoch": 0.0095367431640625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 9.162839600072502,
"eval_lookahead_perplexity": 9536.09737037476,
"eval_loss": 0.147327721118927,
"eval_perplexity": 1.158733642297792,
"eval_runtime": 480.4293,
"eval_samples_per_second": 10.407,
"eval_steps_per_second": 0.327,
"step": 5000
},
{
"base_loss": 0.30083237382769584,
"epoch": 0.01049041748046875,
"grad_norm": 0.0010012760758399963,
"learning_rate": 4.9475574493408205e-05,
"lookahead_loss": 9.12458013534546,
"loss": 0.316,
"step": 5500
},
{
"base_loss": 0.2993237827420235,
"epoch": 0.011444091796875,
"grad_norm": 0.001035403460264206,
"learning_rate": 4.9427890777587895e-05,
"lookahead_loss": 9.04572417831421,
"loss": 0.3185,
"step": 6000
},
{
"base_loss": 0.3238567093908787,
"epoch": 0.01239776611328125,
"grad_norm": 0.0008969915215857327,
"learning_rate": 4.938020706176758e-05,
"lookahead_loss": 8.952016605377198,
"loss": 0.3386,
"step": 6500
},
{
"base_loss": 0.3051931007504463,
"epoch": 0.0133514404296875,
"grad_norm": 0.000971041910815984,
"learning_rate": 4.933252334594727e-05,
"lookahead_loss": 8.886044243812561,
"loss": 0.3244,
"step": 7000
},
{
"base_loss": 0.29808008483052256,
"epoch": 0.01430511474609375,
"grad_norm": 0.0009857703698799014,
"learning_rate": 4.928483963012696e-05,
"lookahead_loss": 8.771625858306885,
"loss": 0.3177,
"step": 7500
},
{
"base_loss": 0.29345863962173463,
"epoch": 0.0152587890625,
"grad_norm": 0.0009497535647824407,
"learning_rate": 4.923715591430664e-05,
"lookahead_loss": 8.633492926597595,
"loss": 0.3098,
"step": 8000
},
{
"base_loss": 0.3092884007692337,
"epoch": 0.01621246337890625,
"grad_norm": 0.0010520165087655187,
"learning_rate": 4.918947219848633e-05,
"lookahead_loss": 8.621233073234558,
"loss": 0.3252,
"step": 8500
},
{
"base_loss": 0.31143338218331335,
"epoch": 0.0171661376953125,
"grad_norm": 0.0009231261792592704,
"learning_rate": 4.9141788482666016e-05,
"lookahead_loss": 8.558024926185608,
"loss": 0.3269,
"step": 9000
},
{
"base_loss": 0.3001442384421825,
"epoch": 0.01811981201171875,
"grad_norm": 0.0009771535405889153,
"learning_rate": 4.9094104766845706e-05,
"lookahead_loss": 8.481963250160216,
"loss": 0.3153,
"step": 9500
},
{
"base_loss": 0.2986592257618904,
"epoch": 0.019073486328125,
"grad_norm": 0.000987049425020814,
"learning_rate": 4.9046421051025396e-05,
"lookahead_loss": 8.409450398445129,
"loss": 0.3149,
"step": 10000
},
{
"epoch": 0.019073486328125,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 8.298678596179705,
"eval_lookahead_perplexity": 4018.5587449798586,
"eval_loss": 0.14565235376358032,
"eval_perplexity": 1.1567939630712722,
"eval_runtime": 491.9,
"eval_samples_per_second": 10.165,
"eval_steps_per_second": 0.319,
"step": 10000
},
{
"base_loss": 0.30347599306702616,
"epoch": 0.02002716064453125,
"grad_norm": 0.0010771030792966485,
"learning_rate": 4.899873733520508e-05,
"lookahead_loss": 8.271695964813233,
"loss": 0.3189,
"step": 10500
},
{
"base_loss": 0.3299741225540638,
"epoch": 0.0209808349609375,
"grad_norm": 0.0009491143864579499,
"learning_rate": 4.895105361938477e-05,
"lookahead_loss": 8.27685186958313,
"loss": 0.344,
"step": 11000
},
{
"base_loss": 0.3070560489296913,
"epoch": 0.02193450927734375,
"grad_norm": 0.0009909559739753604,
"learning_rate": 4.890336990356445e-05,
"lookahead_loss": 8.1793439950943,
"loss": 0.3199,
"step": 11500
},
{
"base_loss": 0.301061170309782,
"epoch": 0.02288818359375,
"grad_norm": 0.001020422438159585,
"learning_rate": 4.8855686187744143e-05,
"lookahead_loss": 8.126035836219788,
"loss": 0.3167,
"step": 12000
},
{
"base_loss": 0.30337609922885894,
"epoch": 0.02384185791015625,
"grad_norm": 0.0009905572514981031,
"learning_rate": 4.8808002471923834e-05,
"lookahead_loss": 8.095245086669921,
"loss": 0.3178,
"step": 12500
},
{
"base_loss": 0.3241444931924343,
"epoch": 0.0247955322265625,
"grad_norm": 0.0009353117784485221,
"learning_rate": 4.876031875610352e-05,
"lookahead_loss": 8.033969619750977,
"loss": 0.3394,
"step": 13000
},
{
"base_loss": 0.3070600248277187,
"epoch": 0.02574920654296875,
"grad_norm": 0.000984247657470405,
"learning_rate": 4.871263504028321e-05,
"lookahead_loss": 7.95574036693573,
"loss": 0.3244,
"step": 13500
},
{
"base_loss": 0.3022406686246395,
"epoch": 0.026702880859375,
"grad_norm": 0.001025758683681488,
"learning_rate": 4.866495132446289e-05,
"lookahead_loss": 7.927939188957215,
"loss": 0.3141,
"step": 14000
},
{
"base_loss": 0.30680677881836893,
"epoch": 0.02765655517578125,
"grad_norm": 0.0009658489725552499,
"learning_rate": 4.861726760864258e-05,
"lookahead_loss": 7.988002327919006,
"loss": 0.3219,
"step": 14500
},
{
"base_loss": 0.33426042160391806,
"epoch": 0.0286102294921875,
"grad_norm": 0.0010099124629050493,
"learning_rate": 4.856958389282227e-05,
"lookahead_loss": 7.973947680473327,
"loss": 0.3455,
"step": 15000
},
{
"epoch": 0.0286102294921875,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 7.854276204642396,
"eval_lookahead_perplexity": 2576.729413719151,
"eval_loss": 0.1447858065366745,
"eval_perplexity": 1.1557919806657084,
"eval_runtime": 483.93,
"eval_samples_per_second": 10.332,
"eval_steps_per_second": 0.324,
"step": 15000
},
{
"base_loss": 0.3049518305659294,
"epoch": 0.02956390380859375,
"grad_norm": 0.0009620142518542707,
"learning_rate": 4.8521900177001955e-05,
"lookahead_loss": 7.91404645729065,
"loss": 0.3161,
"step": 15500
},
{
"base_loss": 0.3062360401749611,
"epoch": 0.030517578125,
"grad_norm": 0.0009641471551731229,
"learning_rate": 4.8474216461181645e-05,
"lookahead_loss": 7.844175812721253,
"loss": 0.3196,
"step": 16000
},
{
"base_loss": 0.30225355681777,
"epoch": 0.03147125244140625,
"grad_norm": 0.0009732363396324217,
"learning_rate": 4.842653274536133e-05,
"lookahead_loss": 7.831875602722168,
"loss": 0.3166,
"step": 16500
},
{
"base_loss": 0.3184074863195419,
"epoch": 0.0324249267578125,
"grad_norm": 0.0010106490226462483,
"learning_rate": 4.837884902954102e-05,
"lookahead_loss": 7.771908633232116,
"loss": 0.3381,
"step": 17000
},
{
"base_loss": 0.30629492220282556,
"epoch": 0.03337860107421875,
"grad_norm": 0.0010188270825892687,
"learning_rate": 4.833116531372071e-05,
"lookahead_loss": 7.789399157524109,
"loss": 0.3185,
"step": 17500
},
{
"base_loss": 0.3031555346250534,
"epoch": 0.034332275390625,
"grad_norm": 0.0009390591876581311,
"learning_rate": 4.828348159790039e-05,
"lookahead_loss": 7.772115784645081,
"loss": 0.3169,
"step": 18000
},
{
"base_loss": 0.31164542263746264,
"epoch": 0.03528594970703125,
"grad_norm": 0.0010221318807452917,
"learning_rate": 4.823579788208008e-05,
"lookahead_loss": 7.639335807800293,
"loss": 0.3253,
"step": 18500
},
{
"base_loss": 0.324304408878088,
"epoch": 0.0362396240234375,
"grad_norm": 0.00101387407630682,
"learning_rate": 4.8188114166259766e-05,
"lookahead_loss": 7.712016674995422,
"loss": 0.3383,
"step": 19000
},
{
"base_loss": 0.30813179594278334,
"epoch": 0.03719329833984375,
"grad_norm": 0.0009941563475877047,
"learning_rate": 4.8140430450439456e-05,
"lookahead_loss": 7.633579847335816,
"loss": 0.3224,
"step": 19500
},
{
"base_loss": 0.30138176554441454,
"epoch": 0.03814697265625,
"grad_norm": 0.0009536141296848655,
"learning_rate": 4.8092746734619146e-05,
"lookahead_loss": 7.657862429618835,
"loss": 0.3164,
"step": 20000
},
{
"epoch": 0.03814697265625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 7.604301906622256,
"eval_lookahead_perplexity": 2006.810463506877,
"eval_loss": 0.14430111646652222,
"eval_perplexity": 1.1552319155094923,
"eval_runtime": 498.3437,
"eval_samples_per_second": 10.033,
"eval_steps_per_second": 0.315,
"step": 20000
},
{
"base_loss": 0.30871694785356524,
"epoch": 0.03910064697265625,
"grad_norm": 0.0009617543546482921,
"learning_rate": 4.804506301879883e-05,
"lookahead_loss": 7.594292169570923,
"loss": 0.3229,
"step": 20500
},
{
"base_loss": 0.32506244936585427,
"epoch": 0.0400543212890625,
"grad_norm": 0.0009832490468397737,
"learning_rate": 4.799737930297852e-05,
"lookahead_loss": 7.60844051361084,
"loss": 0.3364,
"step": 21000
},
{
"base_loss": 0.30769926142692566,
"epoch": 0.04100799560546875,
"grad_norm": 0.0009847276378422976,
"learning_rate": 4.79496955871582e-05,
"lookahead_loss": 7.544025864601135,
"loss": 0.3191,
"step": 21500
},
{
"base_loss": 0.29858891409635546,
"epoch": 0.041961669921875,
"grad_norm": 0.0010060840286314487,
"learning_rate": 4.7902011871337893e-05,
"lookahead_loss": 7.582731894493103,
"loss": 0.3122,
"step": 22000
},
{
"base_loss": 0.3094627737402916,
"epoch": 0.04291534423828125,
"grad_norm": 0.0009809609036892653,
"learning_rate": 4.7854328155517584e-05,
"lookahead_loss": 7.609361615180969,
"loss": 0.3268,
"step": 22500
},
{
"base_loss": 0.32764697542786597,
"epoch": 0.0438690185546875,
"grad_norm": 0.0009822545107454062,
"learning_rate": 4.780664443969727e-05,
"lookahead_loss": 7.582602263450623,
"loss": 0.343,
"step": 23000
},
{
"base_loss": 0.29553532418608663,
"epoch": 0.04482269287109375,
"grad_norm": 0.0010076743783429265,
"learning_rate": 4.775896072387696e-05,
"lookahead_loss": 7.513092971801758,
"loss": 0.3112,
"step": 23500
},
{
"base_loss": 0.3041268612146378,
"epoch": 0.0457763671875,
"grad_norm": 0.0009422925650142133,
"learning_rate": 4.771127700805664e-05,
"lookahead_loss": 7.501417625427246,
"loss": 0.3176,
"step": 24000
},
{
"base_loss": 0.3303914776444435,
"epoch": 0.04673004150390625,
"grad_norm": 0.000979002215899527,
"learning_rate": 4.766359329223633e-05,
"lookahead_loss": 7.502476096153259,
"loss": 0.3419,
"step": 24500
},
{
"base_loss": 0.3248122656941414,
"epoch": 0.0476837158203125,
"grad_norm": 0.0010156352072954178,
"learning_rate": 4.761590957641602e-05,
"lookahead_loss": 7.487288349151611,
"loss": 0.3412,
"step": 25000
},
{
"epoch": 0.0476837158203125,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 7.440544437676573,
"eval_lookahead_perplexity": 1703.677514928871,
"eval_loss": 0.14398084580898285,
"eval_perplexity": 1.1548619878659485,
"eval_runtime": 492.0521,
"eval_samples_per_second": 10.162,
"eval_steps_per_second": 0.319,
"step": 25000
},
{
"base_loss": 0.2969100174307823,
"epoch": 0.04863739013671875,
"grad_norm": 0.0009701626840978861,
"learning_rate": 4.7568225860595705e-05,
"lookahead_loss": 7.416944421768188,
"loss": 0.3108,
"step": 25500
},
{
"base_loss": 0.302005185931921,
"epoch": 0.049591064453125,
"grad_norm": 0.0009941664757207036,
"learning_rate": 4.7520542144775395e-05,
"lookahead_loss": 7.415986575126648,
"loss": 0.3185,
"step": 26000
},
{
"base_loss": 0.31874441370368006,
"epoch": 0.05054473876953125,
"grad_norm": 0.0009409674676135182,
"learning_rate": 4.747285842895508e-05,
"lookahead_loss": 7.50030288696289,
"loss": 0.3333,
"step": 26500
},
{
"base_loss": 0.30408672893047334,
"epoch": 0.0514984130859375,
"grad_norm": 0.0009882714366540313,
"learning_rate": 4.742517471313477e-05,
"lookahead_loss": 7.433645064353943,
"loss": 0.3203,
"step": 27000
},
{
"base_loss": 0.3058005510568619,
"epoch": 0.05245208740234375,
"grad_norm": 0.0010352524695917964,
"learning_rate": 4.737749099731446e-05,
"lookahead_loss": 7.385928537368774,
"loss": 0.3201,
"step": 27500
},
{
"base_loss": 0.32026463899016383,
"epoch": 0.05340576171875,
"grad_norm": 0.0009495351114310324,
"learning_rate": 4.732980728149414e-05,
"lookahead_loss": 7.358126895904541,
"loss": 0.3313,
"step": 28000
},
{
"base_loss": 0.35889338579773905,
"epoch": 0.05435943603515625,
"grad_norm": 0.0009934919653460383,
"learning_rate": 4.728212356567383e-05,
"lookahead_loss": 7.398619123458863,
"loss": 0.3729,
"step": 28500
},
{
"base_loss": 0.29546374672651293,
"epoch": 0.0553131103515625,
"grad_norm": 0.0009958905866369605,
"learning_rate": 4.7234439849853516e-05,
"lookahead_loss": 7.388701396942139,
"loss": 0.3082,
"step": 29000
},
{
"base_loss": 0.3063408683240414,
"epoch": 0.05626678466796875,
"grad_norm": 0.0009431101498194039,
"learning_rate": 4.7186756134033206e-05,
"lookahead_loss": 7.369903712272644,
"loss": 0.3195,
"step": 29500
},
{
"base_loss": 0.3186078954935074,
"epoch": 0.057220458984375,
"grad_norm": 0.0009628318366594613,
"learning_rate": 4.7139072418212896e-05,
"lookahead_loss": 7.403464751243591,
"loss": 0.3334,
"step": 30000
},
{
"epoch": 0.057220458984375,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 7.322395800020748,
"eval_lookahead_perplexity": 1513.8264541066753,
"eval_loss": 0.1437508761882782,
"eval_perplexity": 1.154596435228323,
"eval_runtime": 481.4129,
"eval_samples_per_second": 10.386,
"eval_steps_per_second": 0.326,
"step": 30000
},
{
"base_loss": 0.31768571627140046,
"epoch": 0.05817413330078125,
"grad_norm": 0.0009793334174901247,
"learning_rate": 4.709138870239258e-05,
"lookahead_loss": 7.361583526611328,
"loss": 0.3287,
"step": 30500
},
{
"base_loss": 0.2922684009075165,
"epoch": 0.0591278076171875,
"grad_norm": 0.0009462712332606316,
"learning_rate": 4.704370498657227e-05,
"lookahead_loss": 7.2946535530090335,
"loss": 0.3098,
"step": 31000
},
{
"base_loss": 0.30112267237901685,
"epoch": 0.06008148193359375,
"grad_norm": 0.0009671795414760709,
"learning_rate": 4.699602127075195e-05,
"lookahead_loss": 7.319288095474243,
"loss": 0.3168,
"step": 31500
},
{
"base_loss": 0.32029621145129206,
"epoch": 0.06103515625,
"grad_norm": 0.0009950937237590551,
"learning_rate": 4.6948337554931643e-05,
"lookahead_loss": 7.297159067153931,
"loss": 0.333,
"step": 32000
},
{
"base_loss": 0.30533574494719506,
"epoch": 0.06198883056640625,
"grad_norm": 0.0010346778435632586,
"learning_rate": 4.6900653839111334e-05,
"lookahead_loss": 7.286148567199707,
"loss": 0.3169,
"step": 32500
},
{
"base_loss": 0.30571810373663905,
"epoch": 0.0629425048828125,
"grad_norm": 0.0010247246827930212,
"learning_rate": 4.685297012329102e-05,
"lookahead_loss": 7.249496428489685,
"loss": 0.3185,
"step": 33000
},
{
"base_loss": 0.31451627737283705,
"epoch": 0.06389617919921875,
"grad_norm": 0.0009608972468413413,
"learning_rate": 4.680528640747071e-05,
"lookahead_loss": 7.3038947277069095,
"loss": 0.3298,
"step": 33500
},
{
"base_loss": 0.30425655883550645,
"epoch": 0.064849853515625,
"grad_norm": 0.0009828249458223581,
"learning_rate": 4.675760269165039e-05,
"lookahead_loss": 7.304937886238098,
"loss": 0.3192,
"step": 34000
},
{
"base_loss": 0.31105126801133154,
"epoch": 0.06580352783203125,
"grad_norm": 0.0009732933831401169,
"learning_rate": 4.670991897583008e-05,
"lookahead_loss": 7.2146655473709105,
"loss": 0.3228,
"step": 34500
},
{
"base_loss": 0.3071163959801197,
"epoch": 0.0667572021484375,
"grad_norm": 0.0009960451861843467,
"learning_rate": 4.666223526000977e-05,
"lookahead_loss": 7.182215183258057,
"loss": 0.3182,
"step": 35000
},
{
"epoch": 0.0667572021484375,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 7.2324602169731556,
"eval_lookahead_perplexity": 1383.6223313598848,
"eval_loss": 0.14357592165470123,
"eval_perplexity": 1.1543944510170698,
"eval_runtime": 497.8756,
"eval_samples_per_second": 10.043,
"eval_steps_per_second": 0.315,
"step": 35000
},
{
"base_loss": 0.3321034919023514,
"epoch": 0.06771087646484375,
"grad_norm": 0.0010179802775382996,
"learning_rate": 4.6614551544189455e-05,
"lookahead_loss": 7.266714824676514,
"loss": 0.3445,
"step": 35500
},
{
"base_loss": 0.3017843673825264,
"epoch": 0.06866455078125,
"grad_norm": 0.0009934077970683575,
"learning_rate": 4.6566867828369145e-05,
"lookahead_loss": 7.266165484428406,
"loss": 0.312,
"step": 36000
},
{
"base_loss": 0.302195555627346,
"epoch": 0.06961822509765625,
"grad_norm": 0.0009844391606748104,
"learning_rate": 4.651918411254883e-05,
"lookahead_loss": 7.245750316619873,
"loss": 0.318,
"step": 36500
},
{
"base_loss": 0.3459869565963745,
"epoch": 0.0705718994140625,
"grad_norm": 0.0009586875676177442,
"learning_rate": 4.647150039672852e-05,
"lookahead_loss": 7.152301582336426,
"loss": 0.3617,
"step": 37000
},
{
"base_loss": 0.3151495299339294,
"epoch": 0.07152557373046875,
"grad_norm": 0.0009651901782490313,
"learning_rate": 4.642381668090821e-05,
"lookahead_loss": 7.200114166259765,
"loss": 0.3277,
"step": 37500
},
{
"base_loss": 0.30790447345376015,
"epoch": 0.072479248046875,
"grad_norm": 0.001032789470627904,
"learning_rate": 4.637613296508789e-05,
"lookahead_loss": 7.234860193252564,
"loss": 0.3219,
"step": 38000
},
{
"base_loss": 0.30545566940307617,
"epoch": 0.07343292236328125,
"grad_norm": 0.0009383106953464448,
"learning_rate": 4.632844924926758e-05,
"lookahead_loss": 7.182545600891113,
"loss": 0.32,
"step": 38500
},
{
"base_loss": 0.32841417971253395,
"epoch": 0.0743865966796875,
"grad_norm": 0.000990406610071659,
"learning_rate": 4.6280765533447266e-05,
"lookahead_loss": 7.212004456520081,
"loss": 0.3421,
"step": 39000
},
{
"base_loss": 0.30363579127192497,
"epoch": 0.07534027099609375,
"grad_norm": 0.0010100390063598752,
"learning_rate": 4.6233081817626956e-05,
"lookahead_loss": 7.244741122245789,
"loss": 0.3174,
"step": 39500
},
{
"base_loss": 0.30504586565494535,
"epoch": 0.0762939453125,
"grad_norm": 0.0009652067092247307,
"learning_rate": 4.6185398101806646e-05,
"lookahead_loss": 7.1760479412078855,
"loss": 0.3193,
"step": 40000
},
{
"epoch": 0.0762939453125,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 7.159913369261038,
"eval_lookahead_perplexity": 1286.7994516897043,
"eval_loss": 0.14343445003032684,
"eval_perplexity": 1.1542311485105234,
"eval_runtime": 492.5085,
"eval_samples_per_second": 10.152,
"eval_steps_per_second": 0.319,
"step": 40000
},
{
"base_loss": 0.33029799509048463,
"epoch": 0.07724761962890625,
"grad_norm": 0.0010073404991999269,
"learning_rate": 4.613771438598633e-05,
"lookahead_loss": 7.19347186088562,
"loss": 0.3467,
"step": 40500
},
{
"base_loss": 0.3037954642176628,
"epoch": 0.0782012939453125,
"grad_norm": 0.0009825917659327388,
"learning_rate": 4.609003067016602e-05,
"lookahead_loss": 7.184024220466614,
"loss": 0.3181,
"step": 41000
},
{
"base_loss": 0.29821320512890814,
"epoch": 0.07915496826171875,
"grad_norm": 0.0009405228192918003,
"learning_rate": 4.60423469543457e-05,
"lookahead_loss": 7.188343933105469,
"loss": 0.3143,
"step": 41500
},
{
"base_loss": 0.3142137563228607,
"epoch": 0.080108642578125,
"grad_norm": 0.0009637173498049378,
"learning_rate": 4.5994663238525393e-05,
"lookahead_loss": 7.147234386444092,
"loss": 0.3317,
"step": 42000
},
{
"base_loss": 0.3222310249209404,
"epoch": 0.08106231689453125,
"grad_norm": 0.0009848393965512514,
"learning_rate": 4.5946979522705084e-05,
"lookahead_loss": 7.199878736495972,
"loss": 0.3406,
"step": 42500
},
{
"base_loss": 0.3002626436650753,
"epoch": 0.0820159912109375,
"grad_norm": 0.000929056026507169,
"learning_rate": 4.589929580688477e-05,
"lookahead_loss": 7.160097922325134,
"loss": 0.3137,
"step": 43000
},
{
"base_loss": 0.3045452245473862,
"epoch": 0.08296966552734375,
"grad_norm": 0.0009913038229569793,
"learning_rate": 4.585161209106446e-05,
"lookahead_loss": 7.1922206773757935,
"loss": 0.3198,
"step": 43500
},
{
"base_loss": 0.33469617655873296,
"epoch": 0.08392333984375,
"grad_norm": 0.0009477322455495596,
"learning_rate": 4.580392837524414e-05,
"lookahead_loss": 7.188063373565674,
"loss": 0.347,
"step": 44000
},
{
"base_loss": 0.30740025800466536,
"epoch": 0.08487701416015625,
"grad_norm": 0.0009691762970760465,
"learning_rate": 4.575624465942383e-05,
"lookahead_loss": 7.11799400806427,
"loss": 0.3204,
"step": 44500
},
{
"base_loss": 0.300477741509676,
"epoch": 0.0858306884765625,
"grad_norm": 0.000998710049316287,
"learning_rate": 4.570856094360352e-05,
"lookahead_loss": 7.151413684844971,
"loss": 0.3112,
"step": 45000
},
{
"epoch": 0.0858306884765625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 7.100333274743808,
"eval_lookahead_perplexity": 1212.3710598241469,
"eval_loss": 0.14331814646720886,
"eval_perplexity": 1.154096915121352,
"eval_runtime": 480.4538,
"eval_samples_per_second": 10.407,
"eval_steps_per_second": 0.327,
"step": 45000
},
{
"base_loss": 0.301901563256979,
"epoch": 0.08678436279296875,
"grad_norm": 0.0011987128527835011,
"learning_rate": 4.5660877227783205e-05,
"lookahead_loss": 7.107098340034485,
"loss": 0.3142,
"step": 45500
},
{
"base_loss": 0.338142231285572,
"epoch": 0.087738037109375,
"grad_norm": 0.0009075519046746194,
"learning_rate": 4.5613193511962895e-05,
"lookahead_loss": 7.136958950996399,
"loss": 0.3464,
"step": 46000
},
{
"base_loss": 0.3009798979461193,
"epoch": 0.08869171142578125,
"grad_norm": 0.0009948944207280874,
"learning_rate": 4.556550979614258e-05,
"lookahead_loss": 7.107201243400573,
"loss": 0.3132,
"step": 46500
},
{
"base_loss": 0.3090392453968525,
"epoch": 0.0896453857421875,
"grad_norm": 0.0010095473844558,
"learning_rate": 4.551782608032227e-05,
"lookahead_loss": 7.135781683921814,
"loss": 0.3192,
"step": 47000
},
{
"base_loss": 0.30036539113521576,
"epoch": 0.09059906005859375,
"grad_norm": 0.0009663203964009881,
"learning_rate": 4.547014236450196e-05,
"lookahead_loss": 7.099122268676758,
"loss": 0.3132,
"step": 47500
},
{
"base_loss": 0.3006012495756149,
"epoch": 0.091552734375,
"grad_norm": 0.0009152375860139728,
"learning_rate": 4.542245864868164e-05,
"lookahead_loss": 7.080106061935425,
"loss": 0.3121,
"step": 48000
},
{
"base_loss": 0.31875682109594344,
"epoch": 0.09250640869140625,
"grad_norm": 0.0009438424604013562,
"learning_rate": 4.537477493286133e-05,
"lookahead_loss": 7.104524848937988,
"loss": 0.3365,
"step": 48500
},
{
"base_loss": 0.3104289738535881,
"epoch": 0.0934600830078125,
"grad_norm": 0.000929334491956979,
"learning_rate": 4.5327091217041016e-05,
"lookahead_loss": 7.102272230148316,
"loss": 0.3228,
"step": 49000
},
{
"base_loss": 0.2877590928971767,
"epoch": 0.09441375732421875,
"grad_norm": 0.000983032863587141,
"learning_rate": 4.5279407501220706e-05,
"lookahead_loss": 7.092049780845642,
"loss": 0.3028,
"step": 49500
},
{
"base_loss": 0.2935507807135582,
"epoch": 0.095367431640625,
"grad_norm": 0.0009688133141025901,
"learning_rate": 4.523172378540039e-05,
"lookahead_loss": 7.024641987800599,
"loss": 0.3084,
"step": 50000
},
{
"epoch": 0.095367431640625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 7.052693091261501,
"eval_lookahead_perplexity": 1155.9676810375186,
"eval_loss": 0.1432213932275772,
"eval_perplexity": 1.1539852579076508,
"eval_runtime": 492.5723,
"eval_samples_per_second": 10.151,
"eval_steps_per_second": 0.319,
"step": 50000
},
{
"base_loss": 0.2986202912926674,
"epoch": 0.09632110595703125,
"grad_norm": 0.0009543218766339123,
"learning_rate": 4.518404006958008e-05,
"lookahead_loss": 7.0854365358352664,
"loss": 0.3132,
"step": 50500
},
{
"base_loss": 0.3307524161040783,
"epoch": 0.0972747802734375,
"grad_norm": 0.0010054496815428138,
"learning_rate": 4.513635635375977e-05,
"lookahead_loss": 7.08996038722992,
"loss": 0.3427,
"step": 51000
},
{
"base_loss": 0.29244673988223074,
"epoch": 0.09822845458984375,
"grad_norm": 0.0010033181170001626,
"learning_rate": 4.508867263793945e-05,
"lookahead_loss": 7.052651536941529,
"loss": 0.3083,
"step": 51500
},
{
"base_loss": 0.295786843508482,
"epoch": 0.09918212890625,
"grad_norm": 0.0009846463799476624,
"learning_rate": 4.5040988922119143e-05,
"lookahead_loss": 7.07691504573822,
"loss": 0.3121,
"step": 52000
},
{
"base_loss": 0.30293611577153207,
"epoch": 0.10013580322265625,
"grad_norm": 0.0009364968864247203,
"learning_rate": 4.499330520629883e-05,
"lookahead_loss": 7.069658821105957,
"loss": 0.3161,
"step": 52500
},
{
"base_loss": 0.3240869597494602,
"epoch": 0.1010894775390625,
"grad_norm": 0.0009558585588820279,
"learning_rate": 4.494562149047852e-05,
"lookahead_loss": 7.12118856048584,
"loss": 0.335,
"step": 53000
},
{
"base_loss": 0.30599541807174685,
"epoch": 0.10204315185546875,
"grad_norm": 0.000964898441452533,
"learning_rate": 4.489793777465821e-05,
"lookahead_loss": 7.0878299045562745,
"loss": 0.3163,
"step": 53500
},
{
"base_loss": 0.2991089904308319,
"epoch": 0.102996826171875,
"grad_norm": 0.0009853472001850605,
"learning_rate": 4.485025405883789e-05,
"lookahead_loss": 7.047714894294739,
"loss": 0.3149,
"step": 54000
},
{
"base_loss": 0.30219315418601034,
"epoch": 0.10395050048828125,
"grad_norm": 0.0010090046562254429,
"learning_rate": 4.480257034301758e-05,
"lookahead_loss": 7.079160309791565,
"loss": 0.3135,
"step": 54500
},
{
"base_loss": 0.3133500624895096,
"epoch": 0.1049041748046875,
"grad_norm": 0.0009890320943668485,
"learning_rate": 4.4754886627197264e-05,
"lookahead_loss": 7.02062137889862,
"loss": 0.3281,
"step": 55000
},
{
"epoch": 0.1049041748046875,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 7.00973541820392,
"eval_lookahead_perplexity": 1107.3614784488032,
"eval_loss": 0.14313539862632751,
"eval_perplexity": 1.1538860256723285,
"eval_runtime": 480.347,
"eval_samples_per_second": 10.409,
"eval_steps_per_second": 0.327,
"step": 55000
},
{
"base_loss": 0.3070584389269352,
"epoch": 0.10585784912109375,
"grad_norm": 0.0009455215185880661,
"learning_rate": 4.4707202911376955e-05,
"lookahead_loss": 6.998170353889465,
"loss": 0.3221,
"step": 55500
},
{
"base_loss": 0.29948241996765135,
"epoch": 0.1068115234375,
"grad_norm": 0.0009629906271584332,
"learning_rate": 4.4659519195556645e-05,
"lookahead_loss": 7.0833413105010985,
"loss": 0.3107,
"step": 56000
},
{
"base_loss": 0.29492466670274736,
"epoch": 0.10776519775390625,
"grad_norm": 0.0009873651433736086,
"learning_rate": 4.461183547973633e-05,
"lookahead_loss": 6.994237482070923,
"loss": 0.3092,
"step": 56500
},
{
"base_loss": 0.3188383647501469,
"epoch": 0.1087188720703125,
"grad_norm": 0.0010177677031606436,
"learning_rate": 4.456415176391602e-05,
"lookahead_loss": 6.982017017364502,
"loss": 0.3324,
"step": 57000
},
{
"base_loss": 0.31659464621543887,
"epoch": 0.10967254638671875,
"grad_norm": 0.0009399647242389619,
"learning_rate": 4.45164680480957e-05,
"lookahead_loss": 6.973290238380432,
"loss": 0.3271,
"step": 57500
},
{
"base_loss": 0.3013280538916588,
"epoch": 0.110626220703125,
"grad_norm": 0.0008893092744983733,
"learning_rate": 4.446878433227539e-05,
"lookahead_loss": 6.9135963726043705,
"loss": 0.3151,
"step": 58000
},
{
"base_loss": 0.29822684854269027,
"epoch": 0.11157989501953125,
"grad_norm": 0.0010066829854622483,
"learning_rate": 4.442110061645508e-05,
"lookahead_loss": 7.023115937232971,
"loss": 0.3097,
"step": 58500
},
{
"base_loss": 0.3082665235698223,
"epoch": 0.1125335693359375,
"grad_norm": 0.0009333739290013909,
"learning_rate": 4.4373416900634766e-05,
"lookahead_loss": 7.03084280014038,
"loss": 0.322,
"step": 59000
},
{
"base_loss": 0.34342152199149134,
"epoch": 0.11348724365234375,
"grad_norm": 0.001008225604891777,
"learning_rate": 4.4325733184814456e-05,
"lookahead_loss": 7.03166408252716,
"loss": 0.3512,
"step": 59500
},
{
"base_loss": 0.29527223294973376,
"epoch": 0.11444091796875,
"grad_norm": 0.0009543896303512156,
"learning_rate": 4.427804946899414e-05,
"lookahead_loss": 6.924260063171387,
"loss": 0.3096,
"step": 60000
},
{
"epoch": 0.11444091796875,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.971603098007056,
"eval_lookahead_perplexity": 1065.930172124952,
"eval_loss": 0.14305971562862396,
"eval_perplexity": 1.153798699423495,
"eval_runtime": 498.2888,
"eval_samples_per_second": 10.034,
"eval_steps_per_second": 0.315,
"step": 60000
},
{
"base_loss": 0.2983711498081684,
"epoch": 0.11539459228515625,
"grad_norm": 0.0009778671665117145,
"learning_rate": 4.423036575317383e-05,
"lookahead_loss": 6.959155892372132,
"loss": 0.3119,
"step": 60500
},
{
"base_loss": 0.3164993856549263,
"epoch": 0.1163482666015625,
"grad_norm": 0.0010577912908047438,
"learning_rate": 4.418268203735352e-05,
"lookahead_loss": 6.982672909736634,
"loss": 0.326,
"step": 61000
},
{
"base_loss": 0.3281388694047928,
"epoch": 0.11730194091796875,
"grad_norm": 0.0010003127390518785,
"learning_rate": 4.41349983215332e-05,
"lookahead_loss": 6.978295309066772,
"loss": 0.3447,
"step": 61500
},
{
"base_loss": 0.3066762860417366,
"epoch": 0.118255615234375,
"grad_norm": 0.0010272158542647958,
"learning_rate": 4.4087314605712893e-05,
"lookahead_loss": 6.9806537971496585,
"loss": 0.317,
"step": 62000
},
{
"base_loss": 0.3002779276072979,
"epoch": 0.11920928955078125,
"grad_norm": 0.0009698990033939481,
"learning_rate": 4.403963088989258e-05,
"lookahead_loss": 7.002115357398987,
"loss": 0.3116,
"step": 62500
},
{
"base_loss": 0.3048044160306454,
"epoch": 1.0009536743164062,
"grad_norm": 0.0009617453324608505,
"learning_rate": 4.399194717407227e-05,
"lookahead_loss": 7.047370400428772,
"loss": 0.3145,
"step": 63000
},
{
"base_loss": 0.2995053820014,
"epoch": 1.0019073486328125,
"grad_norm": 0.0010174677008762956,
"learning_rate": 4.394426345825196e-05,
"lookahead_loss": 6.895120985031128,
"loss": 0.3142,
"step": 63500
},
{
"base_loss": 0.31198617857694627,
"epoch": 1.0028610229492188,
"grad_norm": 0.0010111057199537754,
"learning_rate": 4.389657974243164e-05,
"lookahead_loss": 6.888555366516114,
"loss": 0.3226,
"step": 64000
},
{
"base_loss": 0.32396442687511445,
"epoch": 1.003814697265625,
"grad_norm": 0.0009548735106363893,
"learning_rate": 4.384889602661133e-05,
"lookahead_loss": 6.908667636871338,
"loss": 0.336,
"step": 64500
},
{
"base_loss": 0.3013957371413708,
"epoch": 1.0047683715820312,
"grad_norm": 0.000966136809438467,
"learning_rate": 4.3801212310791014e-05,
"lookahead_loss": 6.905056614875793,
"loss": 0.3168,
"step": 65000
},
{
"epoch": 1.0047683715820312,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.937254576637341,
"eval_lookahead_perplexity": 1029.9387120995427,
"eval_loss": 0.14299124479293823,
"eval_perplexity": 1.1537197005669222,
"eval_runtime": 494.5805,
"eval_samples_per_second": 10.11,
"eval_steps_per_second": 0.317,
"step": 65000
},
{
"base_loss": 0.3039788320362568,
"epoch": 1.0057220458984375,
"grad_norm": 0.0008604921749792993,
"learning_rate": 4.3753528594970705e-05,
"lookahead_loss": 7.014158073425293,
"loss": 0.3139,
"step": 65500
},
{
"base_loss": 0.29717833909392355,
"epoch": 1.0066757202148438,
"grad_norm": 0.0009630241547711194,
"learning_rate": 4.3705844879150395e-05,
"lookahead_loss": 6.847448231697083,
"loss": 0.3148,
"step": 66000
},
{
"base_loss": 0.31199148765206336,
"epoch": 1.00762939453125,
"grad_norm": 0.0010012584971264005,
"learning_rate": 4.365816116333008e-05,
"lookahead_loss": 6.918432865142822,
"loss": 0.3246,
"step": 66500
},
{
"base_loss": 0.3148621036410332,
"epoch": 1.0085830688476562,
"grad_norm": 0.0009159519104287028,
"learning_rate": 4.361047744750977e-05,
"lookahead_loss": 6.913657369136811,
"loss": 0.3229,
"step": 67000
},
{
"base_loss": 0.30580521461367605,
"epoch": 1.0095367431640625,
"grad_norm": 0.0009974334388971329,
"learning_rate": 4.356279373168945e-05,
"lookahead_loss": 6.923233027458191,
"loss": 0.3191,
"step": 67500
},
{
"base_loss": 0.3015244754254818,
"epoch": 1.0104904174804688,
"grad_norm": 0.0009639709023758769,
"learning_rate": 4.351511001586914e-05,
"lookahead_loss": 6.88383205986023,
"loss": 0.3128,
"step": 68000
},
{
"base_loss": 0.30137019059062004,
"epoch": 1.011444091796875,
"grad_norm": 0.0010148598812520504,
"learning_rate": 4.346742630004883e-05,
"lookahead_loss": 6.898301356315613,
"loss": 0.3139,
"step": 68500
},
{
"base_loss": 0.3252628707587719,
"epoch": 1.0123977661132812,
"grad_norm": 0.000888565497007221,
"learning_rate": 4.3419742584228516e-05,
"lookahead_loss": 6.891662693023681,
"loss": 0.3359,
"step": 69000
},
{
"base_loss": 0.30557073107361793,
"epoch": 1.0133514404296875,
"grad_norm": 0.0009476915001869202,
"learning_rate": 4.3372058868408206e-05,
"lookahead_loss": 6.974801006317139,
"loss": 0.3211,
"step": 69500
},
{
"base_loss": 0.30054079556465146,
"epoch": 1.0143051147460938,
"grad_norm": 0.0009728021686896682,
"learning_rate": 4.332437515258789e-05,
"lookahead_loss": 6.902195900917053,
"loss": 0.3158,
"step": 70000
},
{
"epoch": 1.0143051147460938,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.905760923513589,
"eval_lookahead_perplexity": 998.0076319369023,
"eval_loss": 0.14292870461940765,
"eval_perplexity": 1.1536475489928526,
"eval_runtime": 489.0741,
"eval_samples_per_second": 10.223,
"eval_steps_per_second": 0.321,
"step": 70000
},
{
"base_loss": 0.29648803743720054,
"epoch": 1.0152587890625,
"grad_norm": 0.0009132726700045168,
"learning_rate": 4.327669143676758e-05,
"lookahead_loss": 6.884706204414368,
"loss": 0.3079,
"step": 70500
},
{
"base_loss": 0.31412097451090815,
"epoch": 1.0162124633789062,
"grad_norm": 0.0009932307293638587,
"learning_rate": 4.322900772094727e-05,
"lookahead_loss": 6.908680680274963,
"loss": 0.326,
"step": 71000
},
{
"base_loss": 0.3125672063827515,
"epoch": 1.0171661376953125,
"grad_norm": 0.0009134129504673183,
"learning_rate": 4.318132400512695e-05,
"lookahead_loss": 6.958608627319336,
"loss": 0.3241,
"step": 71500
},
{
"base_loss": 0.3002317441105843,
"epoch": 1.0181198120117188,
"grad_norm": 0.0009274011244997382,
"learning_rate": 4.3133640289306643e-05,
"lookahead_loss": 6.960107209205628,
"loss": 0.3111,
"step": 72000
},
{
"base_loss": 0.29831535935401915,
"epoch": 1.019073486328125,
"grad_norm": 0.0009689630824141204,
"learning_rate": 4.308595657348633e-05,
"lookahead_loss": 6.973208980560303,
"loss": 0.3109,
"step": 72500
},
{
"base_loss": 0.3020369653701782,
"epoch": 1.0200271606445312,
"grad_norm": 0.001046851510182023,
"learning_rate": 4.303827285766602e-05,
"lookahead_loss": 6.801126411437989,
"loss": 0.3157,
"step": 73000
},
{
"base_loss": 0.32652922403812407,
"epoch": 1.0209808349609375,
"grad_norm": 0.0009485671180300415,
"learning_rate": 4.299058914184571e-05,
"lookahead_loss": 6.887979488372803,
"loss": 0.3396,
"step": 73500
},
{
"base_loss": 0.30453234216570857,
"epoch": 1.0219345092773438,
"grad_norm": 0.0009610042907297611,
"learning_rate": 4.294290542602539e-05,
"lookahead_loss": 6.849929617881775,
"loss": 0.3146,
"step": 74000
},
{
"base_loss": 0.2977458454966545,
"epoch": 1.02288818359375,
"grad_norm": 0.0010150724556297064,
"learning_rate": 4.289522171020508e-05,
"lookahead_loss": 6.878925356388092,
"loss": 0.3125,
"step": 74500
},
{
"base_loss": 0.30405546057224275,
"epoch": 1.0238418579101562,
"grad_norm": 0.0009638189221732318,
"learning_rate": 4.2847537994384764e-05,
"lookahead_loss": 6.84999968624115,
"loss": 0.3149,
"step": 75000
},
{
"epoch": 1.0238418579101562,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.875502051636815,
"eval_lookahead_perplexity": 968.2613607781705,
"eval_loss": 0.142868772149086,
"eval_perplexity": 1.1535784101172133,
"eval_runtime": 499.6825,
"eval_samples_per_second": 10.006,
"eval_steps_per_second": 0.314,
"step": 75000
},
{
"base_loss": 0.32463854083418847,
"epoch": 1.0247955322265625,
"grad_norm": 0.0009209099807776511,
"learning_rate": 4.2799854278564455e-05,
"lookahead_loss": 6.842094340324402,
"loss": 0.3362,
"step": 75500
},
{
"base_loss": 0.3075324648320675,
"epoch": 1.0257492065429688,
"grad_norm": 0.0009934107074514031,
"learning_rate": 4.2752170562744145e-05,
"lookahead_loss": 6.811694778442383,
"loss": 0.3238,
"step": 76000
},
{
"base_loss": 0.30398501074314116,
"epoch": 1.026702880859375,
"grad_norm": 0.0009826256427913904,
"learning_rate": 4.270448684692383e-05,
"lookahead_loss": 6.813813063621521,
"loss": 0.314,
"step": 76500
},
{
"base_loss": 0.3081837382018566,
"epoch": 1.0276565551757812,
"grad_norm": 0.0009539109887555242,
"learning_rate": 4.265680313110352e-05,
"lookahead_loss": 6.9388167886734005,
"loss": 0.319,
"step": 77000
},
{
"base_loss": 0.32895678067207335,
"epoch": 1.0286102294921875,
"grad_norm": 0.0009696350898593664,
"learning_rate": 4.26091194152832e-05,
"lookahead_loss": 6.9466577863693235,
"loss": 0.341,
"step": 77500
},
{
"base_loss": 0.30588172587752344,
"epoch": 1.0295639038085938,
"grad_norm": 0.0009499301086179912,
"learning_rate": 4.256143569946289e-05,
"lookahead_loss": 6.890705446243286,
"loss": 0.3154,
"step": 78000
},
{
"base_loss": 0.3051903445720673,
"epoch": 1.030517578125,
"grad_norm": 0.0009480732842348516,
"learning_rate": 4.251375198364258e-05,
"lookahead_loss": 6.864274346351624,
"loss": 0.3174,
"step": 78500
},
{
"base_loss": 0.30346439191699026,
"epoch": 1.0314712524414062,
"grad_norm": 0.0009676506742835045,
"learning_rate": 4.2466068267822266e-05,
"lookahead_loss": 6.894395670890808,
"loss": 0.3162,
"step": 79000
},
{
"base_loss": 0.31795056411623956,
"epoch": 1.0324249267578125,
"grad_norm": 0.0009829605696722865,
"learning_rate": 4.2418384552001956e-05,
"lookahead_loss": 6.821612464904785,
"loss": 0.3355,
"step": 79500
},
{
"base_loss": 0.30795893451571466,
"epoch": 1.0333786010742188,
"grad_norm": 0.0009851903887465596,
"learning_rate": 4.237070083618164e-05,
"lookahead_loss": 6.886163942337036,
"loss": 0.3185,
"step": 80000
},
{
"epoch": 1.0333786010742188,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.84953888384298,
"eval_lookahead_perplexity": 943.4457682675941,
"eval_loss": 0.14281712472438812,
"eval_perplexity": 1.1535188323016774,
"eval_runtime": 496.4245,
"eval_samples_per_second": 10.072,
"eval_steps_per_second": 0.316,
"step": 80000
},
{
"base_loss": 0.3031257001161575,
"epoch": 1.034332275390625,
"grad_norm": 0.0009114276035688818,
"learning_rate": 4.232301712036133e-05,
"lookahead_loss": 6.9387643995285035,
"loss": 0.3159,
"step": 80500
},
{
"base_loss": 0.31068781118094924,
"epoch": 1.0352859497070312,
"grad_norm": 0.0010243533179163933,
"learning_rate": 4.227533340454102e-05,
"lookahead_loss": 6.779029149055481,
"loss": 0.3235,
"step": 81000
},
{
"base_loss": 0.32500979214906695,
"epoch": 1.0362396240234375,
"grad_norm": 0.0009815491503104568,
"learning_rate": 4.22276496887207e-05,
"lookahead_loss": 6.89651736831665,
"loss": 0.3377,
"step": 81500
},
{
"base_loss": 0.3069631262719631,
"epoch": 1.0371932983398438,
"grad_norm": 0.0010114209726452827,
"learning_rate": 4.2179965972900393e-05,
"lookahead_loss": 6.8101696758270265,
"loss": 0.3181,
"step": 82000
},
{
"base_loss": 0.3025422422587872,
"epoch": 1.03814697265625,
"grad_norm": 0.0009560140897519886,
"learning_rate": 4.213228225708008e-05,
"lookahead_loss": 6.864963472366333,
"loss": 0.314,
"step": 82500
},
{
"base_loss": 0.3076345331072807,
"epoch": 1.0391006469726562,
"grad_norm": 0.000972763926256448,
"learning_rate": 4.208459854125977e-05,
"lookahead_loss": 6.800820850372315,
"loss": 0.3196,
"step": 83000
},
{
"base_loss": 0.3235399980545044,
"epoch": 1.0400543212890625,
"grad_norm": 0.0009581708000041544,
"learning_rate": 4.203691482543946e-05,
"lookahead_loss": 6.839151841163635,
"loss": 0.3342,
"step": 83500
},
{
"base_loss": 0.30506757298111914,
"epoch": 1.0410079956054688,
"grad_norm": 0.0009835829259827733,
"learning_rate": 4.198923110961914e-05,
"lookahead_loss": 6.788027523994446,
"loss": 0.3148,
"step": 84000
},
{
"base_loss": 0.29668092691898346,
"epoch": 1.041961669921875,
"grad_norm": 0.0009812023490667343,
"learning_rate": 4.194154739379883e-05,
"lookahead_loss": 6.869966278076172,
"loss": 0.3089,
"step": 84500
},
{
"base_loss": 0.30789948108792303,
"epoch": 1.0429153442382812,
"grad_norm": 0.0009604549850337207,
"learning_rate": 4.1893863677978514e-05,
"lookahead_loss": 6.8703847560882565,
"loss": 0.3247,
"step": 85000
},
{
"epoch": 1.0429153442382812,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.825141870175687,
"eval_lookahead_perplexity": 920.7070156519827,
"eval_loss": 0.14276856184005737,
"eval_perplexity": 1.1534628154602318,
"eval_runtime": 493.0912,
"eval_samples_per_second": 10.14,
"eval_steps_per_second": 0.318,
"step": 85000
},
{
"base_loss": 0.3281280441880226,
"epoch": 1.0438690185546875,
"grad_norm": 0.0009952600812539458,
"learning_rate": 4.1846179962158205e-05,
"lookahead_loss": 6.8946290845870974,
"loss": 0.3444,
"step": 85500
},
{
"base_loss": 0.2978555924296379,
"epoch": 1.0448226928710938,
"grad_norm": 0.0010316900443285704,
"learning_rate": 4.1798496246337895e-05,
"lookahead_loss": 6.817944658279419,
"loss": 0.3111,
"step": 86000
},
{
"base_loss": 0.3044668311774731,
"epoch": 1.0457763671875,
"grad_norm": 0.0009631580905988812,
"learning_rate": 4.175081253051758e-05,
"lookahead_loss": 6.833521637916565,
"loss": 0.318,
"step": 86500
},
{
"base_loss": 0.3298782432973385,
"epoch": 1.0467300415039062,
"grad_norm": 0.0009412643266841769,
"learning_rate": 4.170312881469727e-05,
"lookahead_loss": 6.806390251159668,
"loss": 0.3407,
"step": 87000
},
{
"base_loss": 0.32442897310853,
"epoch": 1.0476837158203125,
"grad_norm": 0.0009984897915273905,
"learning_rate": 4.165544509887695e-05,
"lookahead_loss": 6.830627080917359,
"loss": 0.3392,
"step": 87500
},
{
"base_loss": 0.2941350122392178,
"epoch": 1.0486373901367188,
"grad_norm": 0.0009231239673681557,
"learning_rate": 4.160776138305664e-05,
"lookahead_loss": 6.789681484222412,
"loss": 0.3084,
"step": 88000
},
{
"base_loss": 0.301623804807663,
"epoch": 1.049591064453125,
"grad_norm": 0.000988572952337563,
"learning_rate": 4.156007766723633e-05,
"lookahead_loss": 6.77013840007782,
"loss": 0.3152,
"step": 88500
},
{
"base_loss": 0.31965578559041025,
"epoch": 1.0505447387695312,
"grad_norm": 0.0009195742895826697,
"learning_rate": 4.1512393951416016e-05,
"lookahead_loss": 6.868552158355713,
"loss": 0.3327,
"step": 89000
},
{
"base_loss": 0.30511142282187936,
"epoch": 1.0514984130859375,
"grad_norm": 0.000990525702945888,
"learning_rate": 4.1464710235595706e-05,
"lookahead_loss": 6.824063732147216,
"loss": 0.3195,
"step": 89500
},
{
"base_loss": 0.3033564644157887,
"epoch": 1.0524520874023438,
"grad_norm": 0.0010395031422376633,
"learning_rate": 4.141702651977539e-05,
"lookahead_loss": 6.759691466331482,
"loss": 0.3166,
"step": 90000
},
{
"epoch": 1.0524520874023438,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.803260954043355,
"eval_lookahead_perplexity": 900.7799093735076,
"eval_loss": 0.14272409677505493,
"eval_perplexity": 1.1534115278014274,
"eval_runtime": 492.5913,
"eval_samples_per_second": 10.15,
"eval_steps_per_second": 0.319,
"step": 90000
},
{
"base_loss": 0.32089028322696683,
"epoch": 1.05340576171875,
"grad_norm": 0.0009381326381117105,
"learning_rate": 4.136934280395508e-05,
"lookahead_loss": 6.765647802352905,
"loss": 0.3307,
"step": 90500
},
{
"base_loss": 0.35406574749946595,
"epoch": 1.0543594360351562,
"grad_norm": 0.0009708745637908578,
"learning_rate": 4.132165908813477e-05,
"lookahead_loss": 6.8132513179779055,
"loss": 0.3699,
"step": 91000
},
{
"base_loss": 0.2938829956352711,
"epoch": 1.0553131103515625,
"grad_norm": 0.0009931994136422873,
"learning_rate": 4.127397537231445e-05,
"lookahead_loss": 6.817016356945038,
"loss": 0.3071,
"step": 91500
},
{
"base_loss": 0.30498689064383505,
"epoch": 1.0562667846679688,
"grad_norm": 0.0009295056224800646,
"learning_rate": 4.1226291656494143e-05,
"lookahead_loss": 6.81505980014801,
"loss": 0.3181,
"step": 92000
},
{
"base_loss": 0.317481600522995,
"epoch": 1.057220458984375,
"grad_norm": 0.0009703385876491666,
"learning_rate": 4.117860794067383e-05,
"lookahead_loss": 6.835084310531617,
"loss": 0.3317,
"step": 92500
},
{
"base_loss": 0.3179551683664322,
"epoch": 1.0581741333007812,
"grad_norm": 0.0009712363826110959,
"learning_rate": 4.113092422485352e-05,
"lookahead_loss": 6.822910179138184,
"loss": 0.3299,
"step": 93000
},
{
"base_loss": 0.29271650505065916,
"epoch": 1.0591278076171875,
"grad_norm": 0.000948713393881917,
"learning_rate": 4.108324050903321e-05,
"lookahead_loss": 6.735391735076904,
"loss": 0.3073,
"step": 93500
},
{
"base_loss": 0.3039356949329376,
"epoch": 1.0600814819335938,
"grad_norm": 0.0009828072506934404,
"learning_rate": 4.103555679321289e-05,
"lookahead_loss": 6.790757938861847,
"loss": 0.3189,
"step": 94000
},
{
"base_loss": 0.32165152502059935,
"epoch": 1.06103515625,
"grad_norm": 0.000970664550550282,
"learning_rate": 4.098787307739258e-05,
"lookahead_loss": 6.76230890083313,
"loss": 0.3332,
"step": 94500
},
{
"base_loss": 0.3061283130943775,
"epoch": 1.0619888305664062,
"grad_norm": 0.0010395641438663006,
"learning_rate": 4.0940189361572264e-05,
"lookahead_loss": 6.777449913024903,
"loss": 0.3171,
"step": 95000
},
{
"epoch": 1.0619888305664062,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.780221919282176,
"eval_lookahead_perplexity": 880.2640499998114,
"eval_loss": 0.14268015325069427,
"eval_perplexity": 1.1533608439474792,
"eval_runtime": 484.5014,
"eval_samples_per_second": 10.32,
"eval_steps_per_second": 0.324,
"step": 95000
},
{
"base_loss": 0.30640321379899976,
"epoch": 1.0629425048828125,
"grad_norm": 0.0009915264090523124,
"learning_rate": 4.0892505645751955e-05,
"lookahead_loss": 6.743420834541321,
"loss": 0.3177,
"step": 95500
},
{
"base_loss": 0.31782649287581444,
"epoch": 1.0638961791992188,
"grad_norm": 0.0009494387777522206,
"learning_rate": 4.0844821929931645e-05,
"lookahead_loss": 6.79113444519043,
"loss": 0.3309,
"step": 96000
},
{
"base_loss": 0.30349985790252687,
"epoch": 1.064849853515625,
"grad_norm": 0.0009761872352100909,
"learning_rate": 4.079713821411133e-05,
"lookahead_loss": 6.813262487411499,
"loss": 0.3191,
"step": 96500
},
{
"base_loss": 0.3075440634191036,
"epoch": 1.0658035278320312,
"grad_norm": 0.0009754234342835844,
"learning_rate": 4.074945449829102e-05,
"lookahead_loss": 6.742620223045349,
"loss": 0.321,
"step": 97000
},
{
"base_loss": 0.3064390652179718,
"epoch": 1.0667572021484375,
"grad_norm": 0.0009838842088356614,
"learning_rate": 4.07017707824707e-05,
"lookahead_loss": 6.700049202919006,
"loss": 0.317,
"step": 97500
},
{
"base_loss": 0.3303199237883091,
"epoch": 1.0677108764648438,
"grad_norm": 0.0010200405959039927,
"learning_rate": 4.065408706665039e-05,
"lookahead_loss": 6.784039269447327,
"loss": 0.3412,
"step": 98000
},
{
"base_loss": 0.2994670196175575,
"epoch": 1.06866455078125,
"grad_norm": 0.0010121484519913793,
"learning_rate": 4.060640335083008e-05,
"lookahead_loss": 6.775819786071778,
"loss": 0.3108,
"step": 98500
},
{
"base_loss": 0.3000359579175711,
"epoch": 1.0696182250976562,
"grad_norm": 0.0009712814935483038,
"learning_rate": 4.0558719635009766e-05,
"lookahead_loss": 6.7902104940414425,
"loss": 0.3161,
"step": 99000
},
{
"base_loss": 0.34639680609107015,
"epoch": 1.0705718994140625,
"grad_norm": 0.0009614603477530181,
"learning_rate": 4.0511035919189456e-05,
"lookahead_loss": 6.673412177562714,
"loss": 0.3593,
"step": 99500
},
{
"base_loss": 0.3132462115287781,
"epoch": 1.0715255737304688,
"grad_norm": 0.0009738055523484945,
"learning_rate": 4.046335220336914e-05,
"lookahead_loss": 6.736346269607544,
"loss": 0.3247,
"step": 100000
},
{
"epoch": 1.0715255737304688,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.758892397530163,
"eval_lookahead_perplexity": 861.6872601047816,
"eval_loss": 0.14263826608657837,
"eval_perplexity": 1.1533125339443155,
"eval_runtime": 492.9295,
"eval_samples_per_second": 10.143,
"eval_steps_per_second": 0.319,
"step": 100000
},
{
"base_loss": 0.3044133240580559,
"epoch": 1.0009536743164062,
"grad_norm": 0.000981901423074305,
"learning_rate": 4.041566848754883e-05,
"lookahead_loss": 6.821595732688904,
"loss": 0.3138,
"step": 100500
},
{
"base_loss": 0.30059696701169014,
"epoch": 1.0019073486328125,
"grad_norm": 0.000999079318717122,
"learning_rate": 4.036798477172852e-05,
"lookahead_loss": 6.66421698474884,
"loss": 0.3137,
"step": 101000
},
{
"base_loss": 0.31169990518689156,
"epoch": 1.0028610229492188,
"grad_norm": 0.0009957862785086036,
"learning_rate": 4.03203010559082e-05,
"lookahead_loss": 6.655234758377075,
"loss": 0.3215,
"step": 101500
},
{
"base_loss": 0.3227726019620895,
"epoch": 1.003814697265625,
"grad_norm": 0.0009710168233141303,
"learning_rate": 4.0272617340087893e-05,
"lookahead_loss": 6.6804737997055055,
"loss": 0.3354,
"step": 102000
},
{
"base_loss": 0.3022470915019512,
"epoch": 1.0047683715820312,
"grad_norm": 0.000950310961343348,
"learning_rate": 4.022493362426758e-05,
"lookahead_loss": 6.665619974136352,
"loss": 0.3172,
"step": 102500
},
{
"base_loss": 0.30552061820030213,
"epoch": 1.0057220458984375,
"grad_norm": 0.0008522234857082367,
"learning_rate": 4.017724990844727e-05,
"lookahead_loss": 6.790848443984985,
"loss": 0.314,
"step": 103000
},
{
"base_loss": 0.2953472335338593,
"epoch": 1.0066757202148438,
"grad_norm": 0.0009317957446910441,
"learning_rate": 4.012956619262696e-05,
"lookahead_loss": 6.637859883308411,
"loss": 0.3144,
"step": 103500
},
{
"base_loss": 0.312746944963932,
"epoch": 1.00762939453125,
"grad_norm": 0.0009721561800688505,
"learning_rate": 4.008188247680664e-05,
"lookahead_loss": 6.700247512817382,
"loss": 0.3242,
"step": 104000
},
{
"base_loss": 0.3169711889922619,
"epoch": 1.0085830688476562,
"grad_norm": 0.0009455936960875988,
"learning_rate": 4.003419876098633e-05,
"lookahead_loss": 6.695308849334717,
"loss": 0.3221,
"step": 104500
},
{
"base_loss": 0.306710629016161,
"epoch": 1.0095367431640625,
"grad_norm": 0.0009776534279808402,
"learning_rate": 3.9986515045166014e-05,
"lookahead_loss": 6.697743677139282,
"loss": 0.3199,
"step": 105000
},
{
"epoch": 1.0095367431640625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.739253997802734,
"eval_lookahead_perplexity": 844.9301809110984,
"eval_loss": 0.14259953796863556,
"eval_perplexity": 1.1532678691853726,
"eval_runtime": 478.9091,
"eval_samples_per_second": 10.44,
"eval_steps_per_second": 0.328,
"step": 105000
},
{
"base_loss": 0.30083237382769584,
"epoch": 1.0104904174804688,
"grad_norm": 0.0009771056938916445,
"learning_rate": 3.9938831329345705e-05,
"lookahead_loss": 6.666155261039734,
"loss": 0.3113,
"step": 105500
},
{
"base_loss": 0.2993237827420235,
"epoch": 1.011444091796875,
"grad_norm": 0.0010173760820180178,
"learning_rate": 3.9891147613525395e-05,
"lookahead_loss": 6.6820623445510865,
"loss": 0.3139,
"step": 106000
},
{
"base_loss": 0.3238567093908787,
"epoch": 1.0123977661132812,
"grad_norm": 0.000880017876625061,
"learning_rate": 3.984346389770508e-05,
"lookahead_loss": 6.659082005500793,
"loss": 0.3342,
"step": 106500
},
{
"base_loss": 0.3051931007504463,
"epoch": 1.0133514404296875,
"grad_norm": 0.0009482282912358642,
"learning_rate": 3.979578018188477e-05,
"lookahead_loss": 6.76999457359314,
"loss": 0.3202,
"step": 107000
},
{
"base_loss": 0.29808008483052256,
"epoch": 1.0143051147460938,
"grad_norm": 0.0009459082502871752,
"learning_rate": 3.974809646606445e-05,
"lookahead_loss": 6.709939098358154,
"loss": 0.3136,
"step": 107500
},
{
"base_loss": 0.29345863962173463,
"epoch": 1.0152587890625,
"grad_norm": 0.0009026661282405257,
"learning_rate": 3.970041275024414e-05,
"lookahead_loss": 6.641336709499359,
"loss": 0.3059,
"step": 108000
},
{
"base_loss": 0.3092884007692337,
"epoch": 1.0162124633789062,
"grad_norm": 0.0009934415575116873,
"learning_rate": 3.965272903442383e-05,
"lookahead_loss": 6.710262378692627,
"loss": 0.3215,
"step": 108500
},
{
"base_loss": 0.31143338218331335,
"epoch": 1.0171661376953125,
"grad_norm": 0.0009016263647936285,
"learning_rate": 3.9605045318603516e-05,
"lookahead_loss": 6.74980753993988,
"loss": 0.3234,
"step": 109000
},
{
"base_loss": 0.3001442384421825,
"epoch": 1.0181198120117188,
"grad_norm": 0.0009415132808499038,
"learning_rate": 3.9557361602783206e-05,
"lookahead_loss": 6.753072287559509,
"loss": 0.3119,
"step": 109500
},
{
"base_loss": 0.2986592257618904,
"epoch": 1.019073486328125,
"grad_norm": 0.0009482503519393504,
"learning_rate": 3.950967788696289e-05,
"lookahead_loss": 6.771802840709686,
"loss": 0.3116,
"step": 110000
},
{
"epoch": 1.019073486328125,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.720215821799378,
"eval_lookahead_perplexity": 828.9964076723086,
"eval_loss": 0.1425611525774002,
"eval_perplexity": 1.1532236013966384,
"eval_runtime": 486.3363,
"eval_samples_per_second": 10.281,
"eval_steps_per_second": 0.323,
"step": 110000
},
{
"base_loss": 0.30347599306702616,
"epoch": 1.0200271606445312,
"grad_norm": 0.0010480897035449743,
"learning_rate": 3.946199417114258e-05,
"lookahead_loss": 6.608225521087647,
"loss": 0.3157,
"step": 110500
},
{
"base_loss": 0.3299741225540638,
"epoch": 1.0209808349609375,
"grad_norm": 0.0009382431744597852,
"learning_rate": 3.941431045532227e-05,
"lookahead_loss": 6.704700765609741,
"loss": 0.341,
"step": 111000
},
{
"base_loss": 0.3070560489296913,
"epoch": 1.0219345092773438,
"grad_norm": 0.0009868369670584798,
"learning_rate": 3.936662673950195e-05,
"lookahead_loss": 6.654251655578613,
"loss": 0.3169,
"step": 111500
},
{
"base_loss": 0.301061170309782,
"epoch": 1.02288818359375,
"grad_norm": 0.0010104605462402105,
"learning_rate": 3.9318943023681643e-05,
"lookahead_loss": 6.692898473739624,
"loss": 0.3139,
"step": 112000
},
{
"base_loss": 0.30337609922885894,
"epoch": 1.0238418579101562,
"grad_norm": 0.0009765701834112406,
"learning_rate": 3.927125930786133e-05,
"lookahead_loss": 6.65063930606842,
"loss": 0.315,
"step": 112500
},
{
"base_loss": 0.3241444931924343,
"epoch": 1.0247955322265625,
"grad_norm": 0.0009068374638445675,
"learning_rate": 3.922357559204102e-05,
"lookahead_loss": 6.651956144332885,
"loss": 0.3367,
"step": 113000
},
{
"base_loss": 0.3070600248277187,
"epoch": 1.0257492065429688,
"grad_norm": 0.0009709529695101082,
"learning_rate": 3.917589187622071e-05,
"lookahead_loss": 6.608614470481872,
"loss": 0.3218,
"step": 113500
},
{
"base_loss": 0.3022406686246395,
"epoch": 1.026702880859375,
"grad_norm": 0.0010030195116996765,
"learning_rate": 3.912820816040039e-05,
"lookahead_loss": 6.62807030582428,
"loss": 0.3116,
"step": 114000
},
{
"base_loss": 0.30680677881836893,
"epoch": 1.0276565551757812,
"grad_norm": 0.0009639645577408373,
"learning_rate": 3.908052444458008e-05,
"lookahead_loss": 6.749866914749146,
"loss": 0.3195,
"step": 114500
},
{
"base_loss": 0.33426042160391806,
"epoch": 1.0286102294921875,
"grad_norm": 0.00096013059373945,
"learning_rate": 3.9032840728759764e-05,
"lookahead_loss": 6.75391247177124,
"loss": 0.3431,
"step": 115000
},
{
"epoch": 1.0286102294921875,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.702820940901296,
"eval_lookahead_perplexity": 814.7008094738575,
"eval_loss": 0.14252659678459167,
"eval_perplexity": 1.1531837515293326,
"eval_runtime": 472.8492,
"eval_samples_per_second": 10.574,
"eval_steps_per_second": 0.332,
"step": 115000
},
{
"base_loss": 0.3049518305659294,
"epoch": 1.0295639038085938,
"grad_norm": 0.0009578875033184886,
"learning_rate": 3.8985157012939455e-05,
"lookahead_loss": 6.715790456771851,
"loss": 0.3138,
"step": 115500
},
{
"base_loss": 0.3062360401749611,
"epoch": 1.030517578125,
"grad_norm": 0.000939805235248059,
"learning_rate": 3.8937473297119145e-05,
"lookahead_loss": 6.674023857593537,
"loss": 0.3173,
"step": 116000
},
{
"base_loss": 0.30225355681777,
"epoch": 1.0314712524414062,
"grad_norm": 0.0009627947001717985,
"learning_rate": 3.888978958129883e-05,
"lookahead_loss": 6.712348463058472,
"loss": 0.3144,
"step": 116500
},
{
"base_loss": 0.3184074863195419,
"epoch": 1.0324249267578125,
"grad_norm": 0.0009747587610036135,
"learning_rate": 3.884210586547852e-05,
"lookahead_loss": 6.652136072158814,
"loss": 0.3359,
"step": 117000
},
{
"base_loss": 0.30629492220282556,
"epoch": 1.0333786010742188,
"grad_norm": 0.0010025979718193412,
"learning_rate": 3.87944221496582e-05,
"lookahead_loss": 6.700876714706421,
"loss": 0.3163,
"step": 117500
},
{
"base_loss": 0.3031555346250534,
"epoch": 1.034332275390625,
"grad_norm": 0.0009006695472635329,
"learning_rate": 3.874673843383789e-05,
"lookahead_loss": 6.757840476036072,
"loss": 0.3149,
"step": 118000
},
{
"base_loss": 0.31164542263746264,
"epoch": 1.0352859497070312,
"grad_norm": 0.0010140526574105024,
"learning_rate": 3.869905471801758e-05,
"lookahead_loss": 6.589431819915771,
"loss": 0.3232,
"step": 118500
},
{
"base_loss": 0.324304408878088,
"epoch": 1.0362396240234375,
"grad_norm": 0.0009850772330537438,
"learning_rate": 3.8651371002197266e-05,
"lookahead_loss": 6.707897541999817,
"loss": 0.3363,
"step": 119000
},
{
"base_loss": 0.30813179594278334,
"epoch": 1.0371932983398438,
"grad_norm": 0.0009913926478475332,
"learning_rate": 3.8603687286376956e-05,
"lookahead_loss": 6.640646786689758,
"loss": 0.3205,
"step": 119500
},
{
"base_loss": 0.30138176554441454,
"epoch": 1.03814697265625,
"grad_norm": 0.0009480128646828234,
"learning_rate": 3.855600357055664e-05,
"lookahead_loss": 6.679037447929383,
"loss": 0.3145,
"step": 120000
},
{
"epoch": 1.03814697265625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.686355599960961,
"eval_lookahead_perplexity": 801.3963149780681,
"eval_loss": 0.1424950510263443,
"eval_perplexity": 1.1531473740472726,
"eval_runtime": 483.3795,
"eval_samples_per_second": 10.344,
"eval_steps_per_second": 0.325,
"step": 120000
},
{
"base_loss": 0.30871694785356524,
"epoch": 1.0391006469726562,
"grad_norm": 0.0009410271886736155,
"learning_rate": 3.850831985473633e-05,
"lookahead_loss": 6.625640468597412,
"loss": 0.3211,
"step": 120500
},
{
"base_loss": 0.32506244936585427,
"epoch": 1.0400543212890625,
"grad_norm": 0.0009350285981781781,
"learning_rate": 3.846063613891602e-05,
"lookahead_loss": 6.6512613153457645,
"loss": 0.3345,
"step": 121000
},
{
"base_loss": 0.30769926142692566,
"epoch": 1.0410079956054688,
"grad_norm": 0.000978046446107328,
"learning_rate": 3.84129524230957e-05,
"lookahead_loss": 6.6048227882385255,
"loss": 0.3173,
"step": 121500
},
{
"base_loss": 0.29858891409635546,
"epoch": 1.041961669921875,
"grad_norm": 0.000979132833890617,
"learning_rate": 3.8365268707275393e-05,
"lookahead_loss": 6.689519411087036,
"loss": 0.3105,
"step": 122000
},
{
"base_loss": 0.3094627737402916,
"epoch": 1.0429153442382812,
"grad_norm": 0.0009663606178946793,
"learning_rate": 3.831758499145508e-05,
"lookahead_loss": 6.7044447908401485,
"loss": 0.3251,
"step": 122500
},
{
"base_loss": 0.32764697542786597,
"epoch": 1.0438690185546875,
"grad_norm": 0.0009794370271265507,
"learning_rate": 3.826990127563477e-05,
"lookahead_loss": 6.723638868331909,
"loss": 0.3413,
"step": 123000
},
{
"base_loss": 0.29553532418608663,
"epoch": 1.0448226928710938,
"grad_norm": 0.0009821865241974592,
"learning_rate": 3.822221755981446e-05,
"lookahead_loss": 6.640208226203918,
"loss": 0.3095,
"step": 123500
},
{
"base_loss": 0.3041268612146378,
"epoch": 1.0457763671875,
"grad_norm": 0.0009438347187824547,
"learning_rate": 3.817453384399414e-05,
"lookahead_loss": 6.654849781990051,
"loss": 0.3159,
"step": 124000
},
{
"base_loss": 0.3303914776444435,
"epoch": 1.0467300415039062,
"grad_norm": 0.0009457177948206663,
"learning_rate": 3.812685012817383e-05,
"lookahead_loss": 6.630368858814239,
"loss": 0.3403,
"step": 124500
},
{
"base_loss": 0.3248122656941414,
"epoch": 1.0476837158203125,
"grad_norm": 0.0010212536435574293,
"learning_rate": 3.8079166412353514e-05,
"lookahead_loss": 6.664453419685364,
"loss": 0.3396,
"step": 125000
},
{
"epoch": 1.0476837158203125,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.671399636009631,
"eval_lookahead_perplexity": 789.4998439268663,
"eval_loss": 0.14246411621570587,
"eval_perplexity": 1.153111702203372,
"eval_runtime": 483.8837,
"eval_samples_per_second": 10.333,
"eval_steps_per_second": 0.324,
"step": 125000
},
{
"base_loss": 0.2969100174307823,
"epoch": 1.0486373901367188,
"grad_norm": 0.0009448639466427267,
"learning_rate": 3.8031482696533205e-05,
"lookahead_loss": 6.613508511543274,
"loss": 0.3092,
"step": 125500
},
{
"base_loss": 0.302005185931921,
"epoch": 1.049591064453125,
"grad_norm": 0.0009936641436070204,
"learning_rate": 3.7983798980712895e-05,
"lookahead_loss": 6.604424912452698,
"loss": 0.3169,
"step": 126000
},
{
"base_loss": 0.31874441370368006,
"epoch": 1.0505447387695312,
"grad_norm": 0.0009042201563715935,
"learning_rate": 3.793611526489258e-05,
"lookahead_loss": 6.707013080596924,
"loss": 0.3317,
"step": 126500
},
{
"base_loss": 0.30408672893047334,
"epoch": 1.0514984130859375,
"grad_norm": 0.0009868575725704432,
"learning_rate": 3.788843154907227e-05,
"lookahead_loss": 6.650782800674438,
"loss": 0.3188,
"step": 127000
},
{
"base_loss": 0.3058005510568619,
"epoch": 1.0524520874023438,
"grad_norm": 0.00102641258854419,
"learning_rate": 3.784074783325195e-05,
"lookahead_loss": 6.603402623176574,
"loss": 0.3186,
"step": 127500
},
{
"base_loss": 0.32026463899016383,
"epoch": 1.05340576171875,
"grad_norm": 0.0009292624308727682,
"learning_rate": 3.779306411743164e-05,
"lookahead_loss": 6.604880172729493,
"loss": 0.3298,
"step": 128000
},
{
"base_loss": 0.35889338579773905,
"epoch": 1.0543594360351562,
"grad_norm": 0.0009620094788260758,
"learning_rate": 3.774538040161133e-05,
"lookahead_loss": 6.646626858711243,
"loss": 0.3714,
"step": 128500
},
{
"base_loss": 0.29546374672651293,
"epoch": 1.0553131103515625,
"grad_norm": 0.0009715965134091675,
"learning_rate": 3.7697696685791016e-05,
"lookahead_loss": 6.637260946273804,
"loss": 0.3067,
"step": 129000
},
{
"base_loss": 0.3063408683240414,
"epoch": 1.0562667846679688,
"grad_norm": 0.0009361687116324902,
"learning_rate": 3.7650012969970706e-05,
"lookahead_loss": 6.649524963378906,
"loss": 0.3181,
"step": 129500
},
{
"base_loss": 0.3186078954935074,
"epoch": 1.057220458984375,
"grad_norm": 0.0009357648668810725,
"learning_rate": 3.760232925415039e-05,
"lookahead_loss": 6.6754186210632325,
"loss": 0.332,
"step": 130000
},
{
"epoch": 1.057220458984375,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.6561929562602185,
"eval_lookahead_perplexity": 777.5849948862144,
"eval_loss": 0.14243432879447937,
"eval_perplexity": 1.1530773544909447,
"eval_runtime": 471.0856,
"eval_samples_per_second": 10.614,
"eval_steps_per_second": 0.333,
"step": 130000
},
{
"base_loss": 0.31768571627140046,
"epoch": 1.0581741333007812,
"grad_norm": 0.0009750055032782257,
"learning_rate": 3.755464553833008e-05,
"lookahead_loss": 6.655493871688843,
"loss": 0.3273,
"step": 130500
},
{
"base_loss": 0.2922684009075165,
"epoch": 1.0591278076171875,
"grad_norm": 0.0009410986676812172,
"learning_rate": 3.750696182250977e-05,
"lookahead_loss": 6.57894612789154,
"loss": 0.3084,
"step": 131000
},
{
"base_loss": 0.30112267237901685,
"epoch": 1.0600814819335938,
"grad_norm": 0.0009611063869670033,
"learning_rate": 3.745927810668945e-05,
"lookahead_loss": 6.6305308623313906,
"loss": 0.3154,
"step": 131500
},
{
"base_loss": 0.32029621145129206,
"epoch": 1.06103515625,
"grad_norm": 0.0009844622109085321,
"learning_rate": 3.7411594390869143e-05,
"lookahead_loss": 6.5994256973266605,
"loss": 0.3317,
"step": 132000
},
{
"base_loss": 0.30533574494719506,
"epoch": 1.0619888305664062,
"grad_norm": 0.0010215704096481204,
"learning_rate": 3.736391067504883e-05,
"lookahead_loss": 6.603619037628174,
"loss": 0.3156,
"step": 132500
},
{
"base_loss": 0.30571810373663905,
"epoch": 1.0629425048828125,
"grad_norm": 0.0010208436287939548,
"learning_rate": 3.731622695922852e-05,
"lookahead_loss": 6.59216494178772,
"loss": 0.3172,
"step": 133000
},
{
"base_loss": 0.31451627737283705,
"epoch": 1.0638961791992188,
"grad_norm": 0.0009539647144265473,
"learning_rate": 3.726854324340821e-05,
"lookahead_loss": 6.634691061019898,
"loss": 0.3285,
"step": 133500
},
{
"base_loss": 0.30425655883550645,
"epoch": 1.064849853515625,
"grad_norm": 0.0009696083143353462,
"learning_rate": 3.722085952758789e-05,
"lookahead_loss": 6.656025648117065,
"loss": 0.3179,
"step": 134000
},
{
"base_loss": 0.31105126801133154,
"epoch": 1.0658035278320312,
"grad_norm": 0.0009692656458355486,
"learning_rate": 3.717317581176758e-05,
"lookahead_loss": 6.560485363006592,
"loss": 0.3215,
"step": 134500
},
{
"base_loss": 0.3071163959801197,
"epoch": 1.0667572021484375,
"grad_norm": 0.0009831758216023445,
"learning_rate": 3.7125492095947264e-05,
"lookahead_loss": 6.5449725456237795,
"loss": 0.3169,
"step": 135000
},
{
"epoch": 1.0667572021484375,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.641093528308807,
"eval_lookahead_perplexity": 765.9321036725115,
"eval_loss": 0.14240561425685883,
"eval_perplexity": 1.1530442448832356,
"eval_runtime": 485.8723,
"eval_samples_per_second": 10.291,
"eval_steps_per_second": 0.323,
"step": 135000
},
{
"base_loss": 0.3321034919023514,
"epoch": 1.0677108764648438,
"grad_norm": 0.0010069276904687285,
"learning_rate": 3.7077808380126955e-05,
"lookahead_loss": 6.633243779182434,
"loss": 0.3433,
"step": 135500
},
{
"base_loss": 0.3017843673825264,
"epoch": 1.06866455078125,
"grad_norm": 0.0009863151935860515,
"learning_rate": 3.7030124664306645e-05,
"lookahead_loss": 6.624729963302612,
"loss": 0.3108,
"step": 136000
},
{
"base_loss": 0.302195555627346,
"epoch": 1.0696182250976562,
"grad_norm": 0.0009720239322632551,
"learning_rate": 3.698244094848633e-05,
"lookahead_loss": 6.641478686332703,
"loss": 0.3168,
"step": 136500
},
{
"base_loss": 0.3459869565963745,
"epoch": 1.0705718994140625,
"grad_norm": 0.0009440227877348661,
"learning_rate": 3.693475723266602e-05,
"lookahead_loss": 6.523862397193908,
"loss": 0.3605,
"step": 137000
},
{
"base_loss": 0.3151495299339294,
"epoch": 1.0715255737304688,
"grad_norm": 0.0009616228053346276,
"learning_rate": 3.68870735168457e-05,
"lookahead_loss": 6.586525348186493,
"loss": 0.3265,
"step": 137500
},
{
"base_loss": 0.30790447345376015,
"epoch": 1.072479248046875,
"grad_norm": 0.001022504991851747,
"learning_rate": 3.683938980102539e-05,
"lookahead_loss": 6.648889023780823,
"loss": 0.3207,
"step": 138000
},
{
"base_loss": 0.30545566940307617,
"epoch": 1.0734329223632812,
"grad_norm": 0.0009266745182685554,
"learning_rate": 3.679170608520508e-05,
"lookahead_loss": 6.604302444458008,
"loss": 0.3189,
"step": 138500
},
{
"base_loss": 0.32841417971253395,
"epoch": 1.0743865966796875,
"grad_norm": 0.0009727867436595261,
"learning_rate": 3.6744022369384766e-05,
"lookahead_loss": 6.628902969360351,
"loss": 0.3409,
"step": 139000
},
{
"base_loss": 0.30363579127192497,
"epoch": 1.0753402709960938,
"grad_norm": 0.0009738055523484945,
"learning_rate": 3.6696338653564456e-05,
"lookahead_loss": 6.6886735420227055,
"loss": 0.3163,
"step": 139500
},
{
"base_loss": 0.30504586565494535,
"epoch": 1.0762939453125,
"grad_norm": 0.0009336514631286263,
"learning_rate": 3.664865493774414e-05,
"lookahead_loss": 6.637202547073365,
"loss": 0.3183,
"step": 140000
},
{
"epoch": 1.0762939453125,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.625944952614391,
"eval_lookahead_perplexity": 754.4167639313139,
"eval_loss": 0.14237651228904724,
"eval_perplexity": 1.1530106895150016,
"eval_runtime": 484.1692,
"eval_samples_per_second": 10.327,
"eval_steps_per_second": 0.324,
"step": 140000
},
{
"base_loss": 0.33029799509048463,
"epoch": 1.0772476196289062,
"grad_norm": 0.0009831954957917333,
"learning_rate": 3.660097122192383e-05,
"lookahead_loss": 6.633178756713868,
"loss": 0.3456,
"step": 140500
},
{
"base_loss": 0.3037954642176628,
"epoch": 1.0782012939453125,
"grad_norm": 0.00096993736224249,
"learning_rate": 3.655328750610352e-05,
"lookahead_loss": 6.641198945999146,
"loss": 0.317,
"step": 141000
},
{
"base_loss": 0.29821320512890814,
"epoch": 1.0791549682617188,
"grad_norm": 0.0009353780187666416,
"learning_rate": 3.65056037902832e-05,
"lookahead_loss": 6.639979603767395,
"loss": 0.3132,
"step": 141500
},
{
"base_loss": 0.3142137563228607,
"epoch": 1.080108642578125,
"grad_norm": 0.0009523274493403733,
"learning_rate": 3.6457920074462893e-05,
"lookahead_loss": 6.602743772506714,
"loss": 0.3306,
"step": 142000
},
{
"base_loss": 0.3222310249209404,
"epoch": 1.0810623168945312,
"grad_norm": 0.0009943461045622826,
"learning_rate": 3.641023635864258e-05,
"lookahead_loss": 6.649980679988861,
"loss": 0.3395,
"step": 142500
},
{
"base_loss": 0.3002626436650753,
"epoch": 1.0820159912109375,
"grad_norm": 0.0009161168127320707,
"learning_rate": 3.636255264282227e-05,
"lookahead_loss": 6.636230380058288,
"loss": 0.3127,
"step": 143000
},
{
"base_loss": 0.3045452245473862,
"epoch": 1.0829696655273438,
"grad_norm": 0.0009895727271214128,
"learning_rate": 3.631486892700196e-05,
"lookahead_loss": 6.670627347946167,
"loss": 0.3188,
"step": 143500
},
{
"base_loss": 0.33469617655873296,
"epoch": 1.08392333984375,
"grad_norm": 0.0009402433061040938,
"learning_rate": 3.626718521118164e-05,
"lookahead_loss": 6.6782106046676635,
"loss": 0.346,
"step": 144000
},
{
"base_loss": 0.30740025800466536,
"epoch": 1.0848770141601562,
"grad_norm": 0.0009543623309582472,
"learning_rate": 3.621950149536133e-05,
"lookahead_loss": 6.598346343994141,
"loss": 0.3193,
"step": 144500
},
{
"base_loss": 0.300477741509676,
"epoch": 1.0858306884765625,
"grad_norm": 0.001002481789328158,
"learning_rate": 3.6171817779541014e-05,
"lookahead_loss": 6.61094771194458,
"loss": 0.3102,
"step": 145000
},
{
"epoch": 1.0858306884765625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.611703176467945,
"eval_lookahead_perplexity": 743.7486758347069,
"eval_loss": 0.14234933257102966,
"eval_perplexity": 1.1529793514354714,
"eval_runtime": 475.9695,
"eval_samples_per_second": 10.505,
"eval_steps_per_second": 0.33,
"step": 145000
},
{
"base_loss": 0.301901563256979,
"epoch": 1.0867843627929688,
"grad_norm": 0.0011312811402603984,
"learning_rate": 3.6124134063720705e-05,
"lookahead_loss": 6.581235520362854,
"loss": 0.3132,
"step": 145500
},
{
"base_loss": 0.338142231285572,
"epoch": 1.087738037109375,
"grad_norm": 0.0008956918027251959,
"learning_rate": 3.6076450347900395e-05,
"lookahead_loss": 6.629713255405426,
"loss": 0.3454,
"step": 146000
},
{
"base_loss": 0.3009798979461193,
"epoch": 1.0886917114257812,
"grad_norm": 0.0009857059922069311,
"learning_rate": 3.602876663208008e-05,
"lookahead_loss": 6.607778671741485,
"loss": 0.3122,
"step": 146500
},
{
"base_loss": 0.3090392453968525,
"epoch": 1.0896453857421875,
"grad_norm": 0.0010041121859103441,
"learning_rate": 3.598108291625977e-05,
"lookahead_loss": 6.641937935829163,
"loss": 0.3183,
"step": 147000
},
{
"base_loss": 0.30036539113521576,
"epoch": 1.0905990600585938,
"grad_norm": 0.0009667676058597863,
"learning_rate": 3.593339920043945e-05,
"lookahead_loss": 6.610139918327332,
"loss": 0.3123,
"step": 147500
},
{
"base_loss": 0.3006012495756149,
"epoch": 1.091552734375,
"grad_norm": 0.0008970113703981042,
"learning_rate": 3.588571548461914e-05,
"lookahead_loss": 6.584732450485229,
"loss": 0.3112,
"step": 148000
},
{
"base_loss": 0.31875682109594344,
"epoch": 1.0925064086914062,
"grad_norm": 0.0009264895925298333,
"learning_rate": 3.583803176879883e-05,
"lookahead_loss": 6.613501731872558,
"loss": 0.3355,
"step": 148500
},
{
"base_loss": 0.3104289738535881,
"epoch": 1.0934600830078125,
"grad_norm": 0.0009199742926284671,
"learning_rate": 3.5790348052978516e-05,
"lookahead_loss": 6.620612399101257,
"loss": 0.3219,
"step": 149000
},
{
"base_loss": 0.2877590928971767,
"epoch": 1.0944137573242188,
"grad_norm": 0.0009749355376698077,
"learning_rate": 3.5742664337158206e-05,
"lookahead_loss": 6.6178521070480345,
"loss": 0.3019,
"step": 149500
},
{
"base_loss": 0.2935507807135582,
"epoch": 1.095367431640625,
"grad_norm": 0.0009596819872967899,
"learning_rate": 3.569498062133789e-05,
"lookahead_loss": 6.54475514793396,
"loss": 0.3075,
"step": 150000
},
{
"epoch": 1.095367431640625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.600208011298134,
"eval_lookahead_perplexity": 735.2481132509488,
"eval_loss": 0.14232492446899414,
"eval_perplexity": 1.1529512097412606,
"eval_runtime": 491.9361,
"eval_samples_per_second": 10.164,
"eval_steps_per_second": 0.319,
"step": 150000
},
{
"base_loss": 0.2986202912926674,
"epoch": 1.0963211059570312,
"grad_norm": 0.0009500982123427093,
"learning_rate": 3.564729690551758e-05,
"lookahead_loss": 6.612848388671875,
"loss": 0.3123,
"step": 150500
},
{
"base_loss": 0.3307524161040783,
"epoch": 1.0972747802734375,
"grad_norm": 0.0009901663288474083,
"learning_rate": 3.559961318969727e-05,
"lookahead_loss": 6.628784805297852,
"loss": 0.3418,
"step": 151000
},
{
"base_loss": 0.29244673988223074,
"epoch": 1.0982284545898438,
"grad_norm": 0.0009901755256578326,
"learning_rate": 3.555192947387695e-05,
"lookahead_loss": 6.573961565494537,
"loss": 0.3074,
"step": 151500
},
{
"base_loss": 0.295786843508482,
"epoch": 1.09918212890625,
"grad_norm": 0.0009776337537914515,
"learning_rate": 3.5504245758056643e-05,
"lookahead_loss": 6.619675145149231,
"loss": 0.3112,
"step": 152000
},
{
"base_loss": 0.30293611577153207,
"epoch": 1.1001358032226562,
"grad_norm": 0.0009244863176718354,
"learning_rate": 3.545656204223633e-05,
"lookahead_loss": 6.614836613655091,
"loss": 0.3152,
"step": 152500
},
{
"base_loss": 0.3240869597494602,
"epoch": 1.1010894775390625,
"grad_norm": 0.0009519928717054427,
"learning_rate": 3.540887832641602e-05,
"lookahead_loss": 6.675669587135315,
"loss": 0.3341,
"step": 153000
},
{
"base_loss": 0.30599541807174685,
"epoch": 1.1020431518554688,
"grad_norm": 0.0009593720897100866,
"learning_rate": 3.536119461059571e-05,
"lookahead_loss": 6.632304777145386,
"loss": 0.3154,
"step": 153500
},
{
"base_loss": 0.2991089904308319,
"epoch": 1.102996826171875,
"grad_norm": 0.0009689299622550607,
"learning_rate": 3.531351089477539e-05,
"lookahead_loss": 6.595957444190979,
"loss": 0.314,
"step": 154000
},
{
"base_loss": 0.30219315418601034,
"epoch": 1.1039505004882812,
"grad_norm": 0.0010047038085758686,
"learning_rate": 3.526582717895508e-05,
"lookahead_loss": 6.639606526374817,
"loss": 0.3127,
"step": 154500
},
{
"base_loss": 0.3133500624895096,
"epoch": 1.1049041748046875,
"grad_norm": 0.0009745966526679695,
"learning_rate": 3.5218143463134764e-05,
"lookahead_loss": 6.568594479560852,
"loss": 0.3272,
"step": 155000
},
{
"epoch": 1.1049041748046875,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.58809915061195,
"eval_lookahead_perplexity": 726.3987820642042,
"eval_loss": 0.142300546169281,
"eval_perplexity": 1.1529231030937126,
"eval_runtime": 475.0132,
"eval_samples_per_second": 10.526,
"eval_steps_per_second": 0.331,
"step": 155000
},
{
"base_loss": 0.3070584389269352,
"epoch": 1.1058578491210938,
"grad_norm": 0.0009309753077104688,
"learning_rate": 3.5170459747314455e-05,
"lookahead_loss": 6.5637693691253665,
"loss": 0.3213,
"step": 155500
},
{
"base_loss": 0.29948241996765135,
"epoch": 1.1068115234375,
"grad_norm": 0.0009559483733028173,
"learning_rate": 3.5122776031494145e-05,
"lookahead_loss": 6.645429815769195,
"loss": 0.3098,
"step": 156000
},
{
"base_loss": 0.29492466670274736,
"epoch": 1.1077651977539062,
"grad_norm": 0.0009711109451018274,
"learning_rate": 3.507509231567383e-05,
"lookahead_loss": 6.56166731595993,
"loss": 0.3083,
"step": 156500
},
{
"base_loss": 0.3188383647501469,
"epoch": 1.1087188720703125,
"grad_norm": 0.0010144627885892987,
"learning_rate": 3.502740859985352e-05,
"lookahead_loss": 6.550499409675598,
"loss": 0.3316,
"step": 157000
},
{
"base_loss": 0.31659464621543887,
"epoch": 1.1096725463867188,
"grad_norm": 0.0009141165646724403,
"learning_rate": 3.49797248840332e-05,
"lookahead_loss": 6.527835812091827,
"loss": 0.3262,
"step": 157500
},
{
"base_loss": 0.3013280538916588,
"epoch": 1.110626220703125,
"grad_norm": 0.0008812876185402274,
"learning_rate": 3.493204116821289e-05,
"lookahead_loss": 6.478862133026123,
"loss": 0.3142,
"step": 158000
},
{
"base_loss": 0.29822684854269027,
"epoch": 1.1115798950195312,
"grad_norm": 0.0010021587368100882,
"learning_rate": 3.488435745239258e-05,
"lookahead_loss": 6.607266705513,
"loss": 0.3089,
"step": 158500
},
{
"base_loss": 0.3082665235698223,
"epoch": 1.1125335693359375,
"grad_norm": 0.0009319260716438293,
"learning_rate": 3.4836673736572266e-05,
"lookahead_loss": 6.618119819164276,
"loss": 0.3212,
"step": 159000
},
{
"base_loss": 0.34342152199149134,
"epoch": 1.1134872436523438,
"grad_norm": 0.0010022877249866724,
"learning_rate": 3.4788990020751956e-05,
"lookahead_loss": 6.628146926879883,
"loss": 0.3504,
"step": 159500
},
{
"base_loss": 0.29527223294973376,
"epoch": 1.11444091796875,
"grad_norm": 0.0009499759180471301,
"learning_rate": 3.474130630493164e-05,
"lookahead_loss": 6.5076857767105105,
"loss": 0.3088,
"step": 160000
},
{
"epoch": 1.11444091796875,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.576469425957043,
"eval_lookahead_perplexity": 717.9998972605395,
"eval_loss": 0.14227746427059174,
"eval_perplexity": 1.1528964917465718,
"eval_runtime": 486.8871,
"eval_samples_per_second": 10.269,
"eval_steps_per_second": 0.322,
"step": 160000
},
{
"base_loss": 0.2983711498081684,
"epoch": 1.1153945922851562,
"grad_norm": 0.0009712922619655728,
"learning_rate": 3.469362258911133e-05,
"lookahead_loss": 6.557691449165344,
"loss": 0.3111,
"step": 160500
},
{
"base_loss": 0.3164993856549263,
"epoch": 1.1163482666015625,
"grad_norm": 0.0010468108812347054,
"learning_rate": 3.464593887329102e-05,
"lookahead_loss": 6.562255940437317,
"loss": 0.3252,
"step": 161000
},
{
"base_loss": 0.3281388694047928,
"epoch": 1.1173019409179688,
"grad_norm": 0.0009933033725246787,
"learning_rate": 3.45982551574707e-05,
"lookahead_loss": 6.5705895509719845,
"loss": 0.3439,
"step": 161500
},
{
"base_loss": 0.3066762860417366,
"epoch": 1.118255615234375,
"grad_norm": 0.0010285564931109548,
"learning_rate": 3.4550571441650393e-05,
"lookahead_loss": 6.584883491516114,
"loss": 0.3162,
"step": 162000
},
{
"base_loss": 0.3002779276072979,
"epoch": 1.1192092895507812,
"grad_norm": 0.0009605666273273528,
"learning_rate": 3.450288772583008e-05,
"lookahead_loss": 6.601356457710266,
"loss": 0.3108,
"step": 162500
},
{
"base_loss": 0.3048044160306454,
"epoch": 2.0009536743164062,
"grad_norm": 0.000960490433499217,
"learning_rate": 3.445520401000977e-05,
"lookahead_loss": 6.638746548652649,
"loss": 0.3137,
"step": 163000
},
{
"base_loss": 0.2995053820014,
"epoch": 2.0019073486328125,
"grad_norm": 0.001005924423225224,
"learning_rate": 3.440752029418946e-05,
"lookahead_loss": 6.48337349319458,
"loss": 0.3134,
"step": 163500
},
{
"base_loss": 0.31198617857694627,
"epoch": 2.0028610229492188,
"grad_norm": 0.0010051662102341652,
"learning_rate": 3.435983657836914e-05,
"lookahead_loss": 6.466943081855774,
"loss": 0.3218,
"step": 164000
},
{
"base_loss": 0.32396442687511445,
"epoch": 2.003814697265625,
"grad_norm": 0.0009522914770059288,
"learning_rate": 3.431215286254883e-05,
"lookahead_loss": 6.503096837997436,
"loss": 0.3352,
"step": 164500
},
{
"base_loss": 0.3013957371413708,
"epoch": 2.0047683715820312,
"grad_norm": 0.0009518108563497663,
"learning_rate": 3.4264469146728514e-05,
"lookahead_loss": 6.490046030044556,
"loss": 0.316,
"step": 165000
},
{
"epoch": 2.0047683715820312,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.564821576158079,
"eval_lookahead_perplexity": 709.6852601291519,
"eval_loss": 0.14225433766841888,
"eval_perplexity": 1.1528698294763657,
"eval_runtime": 483.7285,
"eval_samples_per_second": 10.336,
"eval_steps_per_second": 0.325,
"step": 165000
},
{
"base_loss": 0.3039788320362568,
"epoch": 2.0057220458984375,
"grad_norm": 0.0008521187701262534,
"learning_rate": 3.4216785430908205e-05,
"lookahead_loss": 6.6104288854599,
"loss": 0.3131,
"step": 165500
},
{
"base_loss": 0.29717833909392355,
"epoch": 2.0066757202148438,
"grad_norm": 0.0009511762764304876,
"learning_rate": 3.4169101715087895e-05,
"lookahead_loss": 6.45364487361908,
"loss": 0.314,
"step": 166000
},
{
"base_loss": 0.31199148765206336,
"epoch": 2.00762939453125,
"grad_norm": 0.0009973255218937993,
"learning_rate": 3.412141799926758e-05,
"lookahead_loss": 6.517452167510986,
"loss": 0.3238,
"step": 166500
},
{
"base_loss": 0.3148621036410332,
"epoch": 2.0085830688476562,
"grad_norm": 0.0009120389586314559,
"learning_rate": 3.407373428344727e-05,
"lookahead_loss": 6.512342976093292,
"loss": 0.3221,
"step": 167000
},
{
"base_loss": 0.30580521461367605,
"epoch": 2.0095367431640625,
"grad_norm": 0.0009860595455393195,
"learning_rate": 3.402605056762695e-05,
"lookahead_loss": 6.524440247535706,
"loss": 0.3184,
"step": 167500
},
{
"base_loss": 0.3015244754254818,
"epoch": 2.0104904174804688,
"grad_norm": 0.000945191946811974,
"learning_rate": 3.397836685180664e-05,
"lookahead_loss": 6.494577717781067,
"loss": 0.312,
"step": 168000
},
{
"base_loss": 0.30137019059062004,
"epoch": 2.011444091796875,
"grad_norm": 0.0010133878095075488,
"learning_rate": 3.393068313598633e-05,
"lookahead_loss": 6.505839110851288,
"loss": 0.3131,
"step": 168500
},
{
"base_loss": 0.3252628707587719,
"epoch": 2.0123977661132812,
"grad_norm": 0.000883644272107631,
"learning_rate": 3.3882999420166016e-05,
"lookahead_loss": 6.501287104606629,
"loss": 0.3352,
"step": 169000
},
{
"base_loss": 0.30557073107361793,
"epoch": 2.0133514404296875,
"grad_norm": 0.0009423987939953804,
"learning_rate": 3.3835315704345706e-05,
"lookahead_loss": 6.586906661987305,
"loss": 0.3203,
"step": 169500
},
{
"base_loss": 0.30054079556465146,
"epoch": 2.0143051147460938,
"grad_norm": 0.0009685211116448045,
"learning_rate": 3.378763198852539e-05,
"lookahead_loss": 6.517388526916504,
"loss": 0.315,
"step": 170000
},
{
"epoch": 2.0143051147460938,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.553637696531253,
"eval_lookahead_perplexity": 701.792444028261,
"eval_loss": 0.1422322541475296,
"eval_perplexity": 1.1528443703325186,
"eval_runtime": 473.8031,
"eval_samples_per_second": 10.553,
"eval_steps_per_second": 0.331,
"step": 170000
},
{
"base_loss": 0.29648803743720054,
"epoch": 2.0152587890625,
"grad_norm": 0.0009045371552929282,
"learning_rate": 3.373994827270508e-05,
"lookahead_loss": 6.50550382566452,
"loss": 0.3072,
"step": 170500
},
{
"base_loss": 0.31412097451090815,
"epoch": 2.0162124633789062,
"grad_norm": 0.0009835059754550457,
"learning_rate": 3.369226455688477e-05,
"lookahead_loss": 6.529936217784882,
"loss": 0.3253,
"step": 171000
},
{
"base_loss": 0.3125672063827515,
"epoch": 2.0171661376953125,
"grad_norm": 0.00090819998877123,
"learning_rate": 3.364458084106445e-05,
"lookahead_loss": 6.578486999034881,
"loss": 0.3233,
"step": 171500
},
{
"base_loss": 0.3002317441105843,
"epoch": 2.0181198120117188,
"grad_norm": 0.0009094449342228472,
"learning_rate": 3.3596897125244143e-05,
"lookahead_loss": 6.584220232963562,
"loss": 0.3104,
"step": 172000
},
{
"base_loss": 0.29831535935401915,
"epoch": 2.019073486328125,
"grad_norm": 0.000955162278842181,
"learning_rate": 3.354921340942383e-05,
"lookahead_loss": 6.600418879508972,
"loss": 0.3102,
"step": 172500
},
{
"base_loss": 0.3020369653701782,
"epoch": 2.0200271606445312,
"grad_norm": 0.0010421768529340625,
"learning_rate": 3.350152969360352e-05,
"lookahead_loss": 6.4263093366622925,
"loss": 0.3149,
"step": 173000
},
{
"base_loss": 0.32652922403812407,
"epoch": 2.0209808349609375,
"grad_norm": 0.0009437088738195598,
"learning_rate": 3.345384597778321e-05,
"lookahead_loss": 6.5167139654159545,
"loss": 0.3389,
"step": 173500
},
{
"base_loss": 0.30453234216570857,
"epoch": 2.0219345092773438,
"grad_norm": 0.0009512313990853727,
"learning_rate": 3.340616226196289e-05,
"lookahead_loss": 6.482069372653961,
"loss": 0.3138,
"step": 174000
},
{
"base_loss": 0.2977458454966545,
"epoch": 2.02288818359375,
"grad_norm": 0.0010046373354271054,
"learning_rate": 3.335847854614258e-05,
"lookahead_loss": 6.520741944313049,
"loss": 0.3118,
"step": 174500
},
{
"base_loss": 0.30405546057224275,
"epoch": 2.0238418579101562,
"grad_norm": 0.0009604114457033575,
"learning_rate": 3.3310794830322264e-05,
"lookahead_loss": 6.482968965530396,
"loss": 0.3142,
"step": 175000
},
{
"epoch": 2.0238418579101562,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.541717086737148,
"eval_lookahead_perplexity": 693.4763152866533,
"eval_loss": 0.1422090083360672,
"eval_perplexity": 1.152817571841118,
"eval_runtime": 483.1484,
"eval_samples_per_second": 10.349,
"eval_steps_per_second": 0.325,
"step": 175000
},
{
"base_loss": 0.32463854083418847,
"epoch": 2.0247955322265625,
"grad_norm": 0.0009208981646224856,
"learning_rate": 3.3263111114501955e-05,
"lookahead_loss": 6.484801607131958,
"loss": 0.3355,
"step": 175500
},
{
"base_loss": 0.3075324648320675,
"epoch": 2.0257492065429688,
"grad_norm": 0.0009852510411292315,
"learning_rate": 3.3215427398681645e-05,
"lookahead_loss": 6.444905442237854,
"loss": 0.323,
"step": 176000
},
{
"base_loss": 0.30398501074314116,
"epoch": 2.026702880859375,
"grad_norm": 0.000977648189291358,
"learning_rate": 3.316774368286133e-05,
"lookahead_loss": 6.460987593650818,
"loss": 0.3133,
"step": 176500
},
{
"base_loss": 0.3081837382018566,
"epoch": 2.0276565551757812,
"grad_norm": 0.000948708038777113,
"learning_rate": 3.312005996704102e-05,
"lookahead_loss": 6.576977911949157,
"loss": 0.3183,
"step": 177000
},
{
"base_loss": 0.32895678067207335,
"epoch": 2.0286102294921875,
"grad_norm": 0.0009632044821046293,
"learning_rate": 3.30723762512207e-05,
"lookahead_loss": 6.5979501276016235,
"loss": 0.3403,
"step": 177500
},
{
"base_loss": 0.30588172587752344,
"epoch": 2.0295639038085938,
"grad_norm": 0.0009475542465224862,
"learning_rate": 3.302469253540039e-05,
"lookahead_loss": 6.530917593955993,
"loss": 0.3147,
"step": 178000
},
{
"base_loss": 0.3051903445720673,
"epoch": 2.030517578125,
"grad_norm": 0.0009406576864421368,
"learning_rate": 3.297700881958008e-05,
"lookahead_loss": 6.51764566040039,
"loss": 0.3167,
"step": 178500
},
{
"base_loss": 0.30346439191699026,
"epoch": 2.0314712524414062,
"grad_norm": 0.0009703211835585535,
"learning_rate": 3.2929325103759766e-05,
"lookahead_loss": 6.543263789176941,
"loss": 0.3155,
"step": 179000
},
{
"base_loss": 0.31795056411623956,
"epoch": 2.0324249267578125,
"grad_norm": 0.0009707180433906615,
"learning_rate": 3.2881641387939456e-05,
"lookahead_loss": 6.475300903320313,
"loss": 0.3348,
"step": 179500
},
{
"base_loss": 0.30795893451571466,
"epoch": 2.0333786010742188,
"grad_norm": 0.00098248606082052,
"learning_rate": 3.283395767211914e-05,
"lookahead_loss": 6.536730496883393,
"loss": 0.3178,
"step": 180000
},
{
"epoch": 2.0333786010742188,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.531855725632689,
"eval_lookahead_perplexity": 686.6713034107797,
"eval_loss": 0.1421896070241928,
"eval_perplexity": 1.1527952058848376,
"eval_runtime": 483.5703,
"eval_samples_per_second": 10.34,
"eval_steps_per_second": 0.325,
"step": 180000
},
{
"base_loss": 0.3031257001161575,
"epoch": 2.034332275390625,
"grad_norm": 0.0009016969706863165,
"learning_rate": 3.278627395629883e-05,
"lookahead_loss": 6.603059131622315,
"loss": 0.3153,
"step": 180500
},
{
"base_loss": 0.31068781118094924,
"epoch": 2.0352859497070312,
"grad_norm": 0.0010222060373052955,
"learning_rate": 3.273859024047852e-05,
"lookahead_loss": 6.439461089611053,
"loss": 0.3228,
"step": 181000
},
{
"base_loss": 0.32500979214906695,
"epoch": 2.0362396240234375,
"grad_norm": 0.0009705196134746075,
"learning_rate": 3.26909065246582e-05,
"lookahead_loss": 6.5541965799331665,
"loss": 0.337,
"step": 181500
},
{
"base_loss": 0.3069631262719631,
"epoch": 2.0371932983398438,
"grad_norm": 0.0010073435259982944,
"learning_rate": 3.2643222808837893e-05,
"lookahead_loss": 6.471248873710632,
"loss": 0.3175,
"step": 182000
},
{
"base_loss": 0.3025422422587872,
"epoch": 2.03814697265625,
"grad_norm": 0.0009568389505147934,
"learning_rate": 3.259553909301758e-05,
"lookahead_loss": 6.524754017829895,
"loss": 0.3133,
"step": 182500
},
{
"base_loss": 0.3076345331072807,
"epoch": 2.0391006469726562,
"grad_norm": 0.0009688584832474589,
"learning_rate": 3.254785537719727e-05,
"lookahead_loss": 6.467104331970215,
"loss": 0.3189,
"step": 183000
},
{
"base_loss": 0.3235399980545044,
"epoch": 2.0400543212890625,
"grad_norm": 0.0009437952539883554,
"learning_rate": 3.250017166137696e-05,
"lookahead_loss": 6.506529898166656,
"loss": 0.3336,
"step": 183500
},
{
"base_loss": 0.30506757298111914,
"epoch": 2.0410079956054688,
"grad_norm": 0.0009767162846401334,
"learning_rate": 3.245248794555664e-05,
"lookahead_loss": 6.448292857646942,
"loss": 0.3142,
"step": 184000
},
{
"base_loss": 0.29668092691898346,
"epoch": 2.041961669921875,
"grad_norm": 0.0009727113647386432,
"learning_rate": 3.240480422973633e-05,
"lookahead_loss": 6.536737553119659,
"loss": 0.3083,
"step": 184500
},
{
"base_loss": 0.30789948108792303,
"epoch": 2.0429153442382812,
"grad_norm": 0.000962116289883852,
"learning_rate": 3.2357120513916014e-05,
"lookahead_loss": 6.536309094429016,
"loss": 0.324,
"step": 185000
},
{
"epoch": 2.0429153442382812,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.522111146213909,
"eval_lookahead_perplexity": 680.0124767842166,
"eval_loss": 0.14217031002044678,
"eval_perplexity": 1.1527729606060655,
"eval_runtime": 471.9889,
"eval_samples_per_second": 10.593,
"eval_steps_per_second": 0.333,
"step": 185000
},
{
"base_loss": 0.3281280441880226,
"epoch": 2.0438690185546875,
"grad_norm": 0.0009924211772158742,
"learning_rate": 3.2309436798095705e-05,
"lookahead_loss": 6.569694968223572,
"loss": 0.3437,
"step": 185500
},
{
"base_loss": 0.2978555924296379,
"epoch": 2.0448226928710938,
"grad_norm": 0.0010296371765434742,
"learning_rate": 3.2261753082275395e-05,
"lookahead_loss": 6.4846109199523925,
"loss": 0.3105,
"step": 186000
},
{
"base_loss": 0.3044668311774731,
"epoch": 2.0457763671875,
"grad_norm": 0.0009649458806961775,
"learning_rate": 3.221406936645508e-05,
"lookahead_loss": 6.508920118331909,
"loss": 0.3174,
"step": 186500
},
{
"base_loss": 0.3298782432973385,
"epoch": 2.0467300415039062,
"grad_norm": 0.0009266676497645676,
"learning_rate": 3.216638565063477e-05,
"lookahead_loss": 6.470085307598114,
"loss": 0.3401,
"step": 187000
},
{
"base_loss": 0.32442897310853,
"epoch": 2.0476837158203125,
"grad_norm": 0.0009991949191316962,
"learning_rate": 3.211870193481445e-05,
"lookahead_loss": 6.5127511582374575,
"loss": 0.3385,
"step": 187500
},
{
"base_loss": 0.2941350122392178,
"epoch": 2.0486373901367188,
"grad_norm": 0.0009228453855030239,
"learning_rate": 3.207101821899414e-05,
"lookahead_loss": 6.472175216674804,
"loss": 0.3078,
"step": 188000
},
{
"base_loss": 0.301623804807663,
"epoch": 2.049591064453125,
"grad_norm": 0.0009880108991637826,
"learning_rate": 3.202333450317383e-05,
"lookahead_loss": 6.448336009979248,
"loss": 0.3146,
"step": 188500
},
{
"base_loss": 0.31965578559041025,
"epoch": 2.0505447387695312,
"grad_norm": 0.0009035322000272572,
"learning_rate": 3.1975650787353516e-05,
"lookahead_loss": 6.551394259452819,
"loss": 0.3321,
"step": 189000
},
{
"base_loss": 0.30511142282187936,
"epoch": 2.0514984130859375,
"grad_norm": 0.0009879703866317868,
"learning_rate": 3.1927967071533206e-05,
"lookahead_loss": 6.507521020889282,
"loss": 0.3188,
"step": 189500
},
{
"base_loss": 0.3033564644157887,
"epoch": 2.0524520874023438,
"grad_norm": 0.0010369070805609226,
"learning_rate": 3.188028335571289e-05,
"lookahead_loss": 6.442676889419555,
"loss": 0.316,
"step": 190000
},
{
"epoch": 2.0524520874023438,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.5135073006724395,
"eval_lookahead_perplexity": 674.1868517938349,
"eval_loss": 0.14215251803398132,
"eval_perplexity": 1.1527524506676095,
"eval_runtime": 483.1418,
"eval_samples_per_second": 10.349,
"eval_steps_per_second": 0.325,
"step": 190000
},
{
"base_loss": 0.32089028322696683,
"epoch": 2.05340576171875,
"grad_norm": 0.0009397159446962178,
"learning_rate": 3.183259963989258e-05,
"lookahead_loss": 6.455264377593994,
"loss": 0.3301,
"step": 190500
},
{
"base_loss": 0.35406574749946595,
"epoch": 2.0543594360351562,
"grad_norm": 0.0009735460043884814,
"learning_rate": 3.178491592407227e-05,
"lookahead_loss": 6.496513916969299,
"loss": 0.3693,
"step": 191000
},
{
"base_loss": 0.2938829956352711,
"epoch": 2.0553131103515625,
"grad_norm": 0.0009796855738386512,
"learning_rate": 3.173723220825195e-05,
"lookahead_loss": 6.495951771259308,
"loss": 0.3064,
"step": 191500
},
{
"base_loss": 0.30498689064383505,
"epoch": 2.0562667846679688,
"grad_norm": 0.000926612876355648,
"learning_rate": 3.1689548492431643e-05,
"lookahead_loss": 6.501594274520874,
"loss": 0.3175,
"step": 192000
},
{
"base_loss": 0.317481600522995,
"epoch": 2.057220458984375,
"grad_norm": 0.0009604953811503947,
"learning_rate": 3.164186477661133e-05,
"lookahead_loss": 6.519902579307556,
"loss": 0.3311,
"step": 192500
},
{
"base_loss": 0.3179551683664322,
"epoch": 2.0581741333007812,
"grad_norm": 0.0009685103432275355,
"learning_rate": 3.159418106079102e-05,
"lookahead_loss": 6.514713489532471,
"loss": 0.3293,
"step": 193000
},
{
"base_loss": 0.29271650505065916,
"epoch": 2.0591278076171875,
"grad_norm": 0.0009422221919521689,
"learning_rate": 3.154649734497071e-05,
"lookahead_loss": 6.422006649971008,
"loss": 0.3067,
"step": 193500
},
{
"base_loss": 0.3039356949329376,
"epoch": 2.0600814819335938,
"grad_norm": 0.0009756973595358431,
"learning_rate": 3.149881362915039e-05,
"lookahead_loss": 6.483645843505859,
"loss": 0.3183,
"step": 194000
},
{
"base_loss": 0.32165152502059935,
"epoch": 2.06103515625,
"grad_norm": 0.0009674925822764635,
"learning_rate": 3.145112991333008e-05,
"lookahead_loss": 6.4522641057968135,
"loss": 0.3326,
"step": 194500
},
{
"base_loss": 0.3061283130943775,
"epoch": 2.0619888305664062,
"grad_norm": 0.0010340444277971983,
"learning_rate": 3.1403446197509764e-05,
"lookahead_loss": 6.467900278091431,
"loss": 0.3165,
"step": 195000
},
{
"epoch": 2.0619888305664062,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.5032055431280655,
"eval_lookahead_perplexity": 667.2771942234244,
"eval_loss": 0.1421336829662323,
"eval_perplexity": 1.152730738701577,
"eval_runtime": 473.7725,
"eval_samples_per_second": 10.554,
"eval_steps_per_second": 0.331,
"step": 195000
},
{
"base_loss": 0.30640321379899976,
"epoch": 2.0629425048828125,
"grad_norm": 0.0009878401178866625,
"learning_rate": 3.1355762481689455e-05,
"lookahead_loss": 6.44850652551651,
"loss": 0.3171,
"step": 195500
},
{
"base_loss": 0.31782649287581444,
"epoch": 2.0638961791992188,
"grad_norm": 0.0009471693192608654,
"learning_rate": 3.1308078765869145e-05,
"lookahead_loss": 6.486370619773865,
"loss": 0.3303,
"step": 196000
},
{
"base_loss": 0.30349985790252687,
"epoch": 2.064849853515625,
"grad_norm": 0.0009686322882771492,
"learning_rate": 3.126039505004883e-05,
"lookahead_loss": 6.50835819530487,
"loss": 0.3185,
"step": 196500
},
{
"base_loss": 0.3075440634191036,
"epoch": 2.0658035278320312,
"grad_norm": 0.0009692518506199121,
"learning_rate": 3.121271133422852e-05,
"lookahead_loss": 6.4442818622589115,
"loss": 0.3204,
"step": 197000
},
{
"base_loss": 0.3064390652179718,
"epoch": 2.0667572021484375,
"grad_norm": 0.0009859678102657199,
"learning_rate": 3.11650276184082e-05,
"lookahead_loss": 6.408861030578613,
"loss": 0.3165,
"step": 197500
},
{
"base_loss": 0.3303199237883091,
"epoch": 2.0677108764648438,
"grad_norm": 0.0010187255684286356,
"learning_rate": 3.111734390258789e-05,
"lookahead_loss": 6.487036975860596,
"loss": 0.3406,
"step": 198000
},
{
"base_loss": 0.2994670196175575,
"epoch": 2.06866455078125,
"grad_norm": 0.0010079372441396117,
"learning_rate": 3.106966018676758e-05,
"lookahead_loss": 6.467383036613464,
"loss": 0.3102,
"step": 198500
},
{
"base_loss": 0.3000359579175711,
"epoch": 2.0696182250976562,
"grad_norm": 0.0009658647468313575,
"learning_rate": 3.1021976470947266e-05,
"lookahead_loss": 6.49391339302063,
"loss": 0.3156,
"step": 199000
},
{
"base_loss": 0.34639680609107015,
"epoch": 2.0705718994140625,
"grad_norm": 0.0009569233516231179,
"learning_rate": 3.0974292755126956e-05,
"lookahead_loss": 6.382178443908692,
"loss": 0.3587,
"step": 199500
},
{
"base_loss": 0.3132462115287781,
"epoch": 2.0715255737304688,
"grad_norm": 0.0009783732239156961,
"learning_rate": 3.092660903930664e-05,
"lookahead_loss": 6.4411235184669495,
"loss": 0.3241,
"step": 200000
},
{
"epoch": 2.0715255737304688,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.12980225879829912,
"eval_base_perplexity": 1.1386032122951009,
"eval_lookahead_loss": 6.493639263482139,
"eval_lookahead_perplexity": 660.9242693582831,
"eval_loss": 0.1421152651309967,
"eval_perplexity": 1.1527095080922722,
"eval_runtime": 497.9192,
"eval_samples_per_second": 10.042,
"eval_steps_per_second": 0.315,
"step": 200000
},
{
"base_loss": 0.30655504322052,
"epoch": 1.0009536743164062,
"grad_norm": 0.0009628058760426939,
"learning_rate": 3.087892532348633e-05,
"lookahead_loss": 6.568298415660858,
"loss": 0.3144,
"step": 200500
},
{
"base_loss": 0.3002312153875828,
"epoch": 1.0019073486328125,
"grad_norm": 0.0010066054528579116,
"learning_rate": 3.083124160766602e-05,
"lookahead_loss": 6.406751696586609,
"loss": 0.3132,
"step": 201000
},
{
"base_loss": 0.312505132496357,
"epoch": 1.0028610229492188,
"grad_norm": 0.0009853820083662868,
"learning_rate": 3.07835578918457e-05,
"lookahead_loss": 6.39734969997406,
"loss": 0.3223,
"step": 201500
},
{
"base_loss": 0.3240452491641045,
"epoch": 1.003814697265625,
"grad_norm": 0.000949465436860919,
"learning_rate": 3.0735874176025393e-05,
"lookahead_loss": 6.42671659564972,
"loss": 0.3363,
"step": 202000
},
{
"base_loss": 0.29858038023114203,
"epoch": 1.0047683715820312,
"grad_norm": 0.0009409029153175652,
"learning_rate": 3.068819046020508e-05,
"lookahead_loss": 6.4190877294540405,
"loss": 0.3137,
"step": 202500
},
{
"base_loss": 0.3042404046058655,
"epoch": 1.0057220458984375,
"grad_norm": 0.0008439480443485081,
"learning_rate": 3.064050674438477e-05,
"lookahead_loss": 6.542135063171386,
"loss": 0.3132,
"step": 203000
},
{
"base_loss": 0.29714440524578095,
"epoch": 1.0066757202148438,
"grad_norm": 0.0009408199694007635,
"learning_rate": 3.059282302856446e-05,
"lookahead_loss": 6.393728638648986,
"loss": 0.3127,
"step": 203500
},
{
"base_loss": 0.31379624953866003,
"epoch": 1.00762939453125,
"grad_norm": 0.0009831907227635384,
"learning_rate": 3.054513931274414e-05,
"lookahead_loss": 6.455501090049744,
"loss": 0.3247,
"step": 204000
},
{
"base_loss": 0.31622857597470283,
"epoch": 1.0085830688476562,
"grad_norm": 0.0009250900475308299,
"learning_rate": 3.049745559692383e-05,
"lookahead_loss": 6.439110320091247,
"loss": 0.3233,
"step": 204500
},
{
"base_loss": 0.3033116071224213,
"epoch": 1.0095367431640625,
"grad_norm": 0.0009871090296655893,
"learning_rate": 3.0449771881103518e-05,
"lookahead_loss": 6.460744188308716,
"loss": 0.3166,
"step": 205000
},
{
"epoch": 1.0095367431640625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.484790183484745,
"eval_lookahead_perplexity": 655.1014987047179,
"eval_loss": 0.1420980542898178,
"eval_perplexity": 1.152689669162726,
"eval_runtime": 259.918,
"eval_samples_per_second": 19.237,
"eval_steps_per_second": 0.604,
"step": 205000
},
{
"base_loss": 0.302242584168911,
"epoch": 1.0104904174804688,
"grad_norm": 0.0009698246722109616,
"learning_rate": 3.0402088165283205e-05,
"lookahead_loss": 6.423384314537048,
"loss": 0.3121,
"step": 205500
},
{
"base_loss": 0.3031807193160057,
"epoch": 1.011444091796875,
"grad_norm": 0.0009783682180568576,
"learning_rate": 3.035440444946289e-05,
"lookahead_loss": 6.434307872772217,
"loss": 0.3166,
"step": 206000
},
{
"base_loss": 0.324542246311903,
"epoch": 1.0123977661132812,
"grad_norm": 0.0008743834332562983,
"learning_rate": 3.0306720733642578e-05,
"lookahead_loss": 6.431088864326477,
"loss": 0.3347,
"step": 206500
},
{
"base_loss": 0.3043093577325344,
"epoch": 1.0133514404296875,
"grad_norm": 0.0009393716463819146,
"learning_rate": 3.025903701782227e-05,
"lookahead_loss": 6.5347442026138305,
"loss": 0.3199,
"step": 207000
},
{
"base_loss": 0.29890961676836014,
"epoch": 1.0143051147460938,
"grad_norm": 0.000908377580344677,
"learning_rate": 3.0211353302001955e-05,
"lookahead_loss": 6.474946077346802,
"loss": 0.3143,
"step": 207500
},
{
"base_loss": 0.2968312213420868,
"epoch": 1.0152587890625,
"grad_norm": 0.0009118029265664518,
"learning_rate": 3.0163669586181642e-05,
"lookahead_loss": 6.427557950973511,
"loss": 0.3081,
"step": 208000
},
{
"base_loss": 0.309987826526165,
"epoch": 1.0162124633789062,
"grad_norm": 0.0009858175180852413,
"learning_rate": 3.011598587036133e-05,
"lookahead_loss": 6.471011517524719,
"loss": 0.3222,
"step": 208500
},
{
"base_loss": 0.3124798896312714,
"epoch": 1.0171661376953125,
"grad_norm": 0.0009066257625818253,
"learning_rate": 3.0068302154541016e-05,
"lookahead_loss": 6.498817705154419,
"loss": 0.3235,
"step": 209000
},
{
"base_loss": 0.30385399025678633,
"epoch": 1.0181198120117188,
"grad_norm": 0.0009525167988613248,
"learning_rate": 3.0020618438720706e-05,
"lookahead_loss": 6.509060912132263,
"loss": 0.313,
"step": 209500
},
{
"base_loss": 0.2997076933085918,
"epoch": 1.019073486328125,
"grad_norm": 0.0009451503865420818,
"learning_rate": 2.9972934722900393e-05,
"lookahead_loss": 6.532156089782715,
"loss": 0.3104,
"step": 210000
},
{
"epoch": 1.019073486328125,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.475708332305518,
"eval_lookahead_perplexity": 649.1788991778967,
"eval_loss": 0.14207972586154938,
"eval_perplexity": 1.15266854236642,
"eval_runtime": 295.0132,
"eval_samples_per_second": 16.948,
"eval_steps_per_second": 0.532,
"step": 210000
},
{
"base_loss": 0.30210260692238805,
"epoch": 1.0200271606445312,
"grad_norm": 0.0010357605060562491,
"learning_rate": 2.992525100708008e-05,
"lookahead_loss": 6.357669311523438,
"loss": 0.315,
"step": 210500
},
{
"base_loss": 0.3285051781535149,
"epoch": 1.0209808349609375,
"grad_norm": 0.0009752861224114895,
"learning_rate": 2.9877567291259766e-05,
"lookahead_loss": 6.451837629318237,
"loss": 0.3386,
"step": 211000
},
{
"base_loss": 0.30326452678442,
"epoch": 1.0219345092773438,
"grad_norm": 0.0009549338137730956,
"learning_rate": 2.9829883575439453e-05,
"lookahead_loss": 6.413721528530121,
"loss": 0.3141,
"step": 211500
},
{
"base_loss": 0.29889601907134056,
"epoch": 1.02288818359375,
"grad_norm": 0.0009833979420363903,
"learning_rate": 2.9782199859619143e-05,
"lookahead_loss": 6.465893486022949,
"loss": 0.3118,
"step": 212000
},
{
"base_loss": 0.3006108500063419,
"epoch": 1.0238418579101562,
"grad_norm": 0.0009762793779373169,
"learning_rate": 2.973451614379883e-05,
"lookahead_loss": 6.41252710723877,
"loss": 0.3126,
"step": 212500
},
{
"base_loss": 0.3237688979506493,
"epoch": 1.0247955322265625,
"grad_norm": 0.000890803465154022,
"learning_rate": 2.9686832427978517e-05,
"lookahead_loss": 6.42967294883728,
"loss": 0.3356,
"step": 213000
},
{
"base_loss": 0.3078545735180378,
"epoch": 1.0257492065429688,
"grad_norm": 0.0009638071060180664,
"learning_rate": 2.9639148712158204e-05,
"lookahead_loss": 6.387160004615784,
"loss": 0.3221,
"step": 213500
},
{
"base_loss": 0.3022345977425575,
"epoch": 1.026702880859375,
"grad_norm": 0.0010054127778857946,
"learning_rate": 2.959146499633789e-05,
"lookahead_loss": 6.398777591705322,
"loss": 0.3111,
"step": 214000
},
{
"base_loss": 0.3071480156183243,
"epoch": 1.0276565551757812,
"grad_norm": 0.000952261732891202,
"learning_rate": 2.954378128051758e-05,
"lookahead_loss": 6.514560857772827,
"loss": 0.3187,
"step": 214500
},
{
"base_loss": 0.3302598208785057,
"epoch": 1.0286102294921875,
"grad_norm": 0.0009398755501024425,
"learning_rate": 2.9496097564697268e-05,
"lookahead_loss": 6.534868264198304,
"loss": 0.3416,
"step": 215000
},
{
"epoch": 1.0286102294921875,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.467427203449578,
"eval_lookahead_perplexity": 643.8251631474817,
"eval_loss": 0.14206360280513763,
"eval_perplexity": 1.1526499579763063,
"eval_runtime": 274.565,
"eval_samples_per_second": 18.211,
"eval_steps_per_second": 0.572,
"step": 215000
},
{
"base_loss": 0.3027013133764267,
"epoch": 1.0295639038085938,
"grad_norm": 0.0009348155581392348,
"learning_rate": 2.9448413848876955e-05,
"lookahead_loss": 6.474138929367065,
"loss": 0.312,
"step": 215500
},
{
"base_loss": 0.3046494301855564,
"epoch": 1.030517578125,
"grad_norm": 0.0009527892689220607,
"learning_rate": 2.940073013305664e-05,
"lookahead_loss": 6.449120025157929,
"loss": 0.316,
"step": 216000
},
{
"base_loss": 0.3023626366853714,
"epoch": 1.0314712524414062,
"grad_norm": 0.0009392331703566015,
"learning_rate": 2.9353046417236328e-05,
"lookahead_loss": 6.4767416534423825,
"loss": 0.3147,
"step": 216500
},
{
"base_loss": 0.3171934984624386,
"epoch": 1.0324249267578125,
"grad_norm": 0.000982875470072031,
"learning_rate": 2.930536270141602e-05,
"lookahead_loss": 6.436639490127564,
"loss": 0.3346,
"step": 217000
},
{
"base_loss": 0.305971223294735,
"epoch": 1.0333786010742188,
"grad_norm": 0.0009957151487469673,
"learning_rate": 2.9257678985595705e-05,
"lookahead_loss": 6.465258483886719,
"loss": 0.3153,
"step": 217500
},
{
"base_loss": 0.3008191674053669,
"epoch": 1.034332275390625,
"grad_norm": 0.000894519907888025,
"learning_rate": 2.9209995269775392e-05,
"lookahead_loss": 6.542671841621399,
"loss": 0.3137,
"step": 218000
},
{
"base_loss": 0.3125488177835941,
"epoch": 1.0352859497070312,
"grad_norm": 0.001007439219392836,
"learning_rate": 2.916231155395508e-05,
"lookahead_loss": 6.36938267993927,
"loss": 0.3225,
"step": 218500
},
{
"base_loss": 0.32382212686538697,
"epoch": 1.0362396240234375,
"grad_norm": 0.000986156752333045,
"learning_rate": 2.9114627838134766e-05,
"lookahead_loss": 6.487519608497619,
"loss": 0.3375,
"step": 219000
},
{
"base_loss": 0.30577521124482154,
"epoch": 1.0371932983398438,
"grad_norm": 0.001012337044812739,
"learning_rate": 2.9066944122314456e-05,
"lookahead_loss": 6.423724625587464,
"loss": 0.3175,
"step": 219500
},
{
"base_loss": 0.3027714610397816,
"epoch": 1.03814697265625,
"grad_norm": 0.0009804905857890844,
"learning_rate": 2.9019260406494143e-05,
"lookahead_loss": 6.465024924278259,
"loss": 0.3152,
"step": 220000
},
{
"epoch": 1.03814697265625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.459714018117887,
"eval_lookahead_perplexity": 638.8783228163264,
"eval_loss": 0.14204907417297363,
"eval_perplexity": 1.1526332116707039,
"eval_runtime": 342.2757,
"eval_samples_per_second": 14.608,
"eval_steps_per_second": 0.459,
"step": 220000
},
{
"base_loss": 0.3064252578020096,
"epoch": 1.0391006469726562,
"grad_norm": 0.000922691891901195,
"learning_rate": 2.897157669067383e-05,
"lookahead_loss": 6.412614955902099,
"loss": 0.3197,
"step": 220500
},
{
"base_loss": 0.3251348150372505,
"epoch": 1.0400543212890625,
"grad_norm": 0.0009598666802048683,
"learning_rate": 2.8923892974853516e-05,
"lookahead_loss": 6.448003714561462,
"loss": 0.3346,
"step": 221000
},
{
"base_loss": 0.3045478595495224,
"epoch": 1.0410079956054688,
"grad_norm": 0.0009646528051234782,
"learning_rate": 2.8876209259033203e-05,
"lookahead_loss": 6.387988406181336,
"loss": 0.3154,
"step": 221500
},
{
"base_loss": 0.2982518375813961,
"epoch": 1.041961669921875,
"grad_norm": 0.0009869185741990805,
"learning_rate": 2.8828525543212893e-05,
"lookahead_loss": 6.465492009162903,
"loss": 0.31,
"step": 222000
},
{
"base_loss": 0.3089935587644577,
"epoch": 1.0429153442382812,
"grad_norm": 0.0009668731945566833,
"learning_rate": 2.878084182739258e-05,
"lookahead_loss": 6.474124125003815,
"loss": 0.3243,
"step": 222500
},
{
"base_loss": 0.3268603746891022,
"epoch": 1.0438690185546875,
"grad_norm": 0.0009782308479771018,
"learning_rate": 2.8733158111572267e-05,
"lookahead_loss": 6.512031971931457,
"loss": 0.341,
"step": 223000
},
{
"base_loss": 0.29676153120398524,
"epoch": 1.0448226928710938,
"grad_norm": 0.0009767008014023304,
"learning_rate": 2.8685474395751954e-05,
"lookahead_loss": 6.4184129590988155,
"loss": 0.3094,
"step": 223500
},
{
"base_loss": 0.3044439141750336,
"epoch": 1.0457763671875,
"grad_norm": 0.0009669915889389813,
"learning_rate": 2.863779067993164e-05,
"lookahead_loss": 6.440961833953858,
"loss": 0.3164,
"step": 224000
},
{
"base_loss": 0.3313070158064365,
"epoch": 1.0467300415039062,
"grad_norm": 0.0009441322763450444,
"learning_rate": 2.859010696411133e-05,
"lookahead_loss": 6.40649371099472,
"loss": 0.3396,
"step": 224500
},
{
"base_loss": 0.32587327966094015,
"epoch": 1.0476837158203125,
"grad_norm": 0.0010059355990961194,
"learning_rate": 2.8542423248291018e-05,
"lookahead_loss": 6.4436911821365355,
"loss": 0.3391,
"step": 225000
},
{
"epoch": 1.0476837158203125,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.452761023189313,
"eval_lookahead_perplexity": 634.4516123592657,
"eval_loss": 0.14203442633152008,
"eval_perplexity": 1.1526163282058186,
"eval_runtime": 279.8888,
"eval_samples_per_second": 17.864,
"eval_steps_per_second": 0.561,
"step": 225000
},
{
"base_loss": 0.29468956208229063,
"epoch": 1.0486373901367188,
"grad_norm": 0.0009452200611121953,
"learning_rate": 2.8494739532470705e-05,
"lookahead_loss": 6.411119204521179,
"loss": 0.308,
"step": 225500
},
{
"base_loss": 0.3027429393827915,
"epoch": 1.049591064453125,
"grad_norm": 0.000997032504528761,
"learning_rate": 2.844705581665039e-05,
"lookahead_loss": 6.39424205160141,
"loss": 0.3165,
"step": 226000
},
{
"base_loss": 0.3190444597601891,
"epoch": 1.0505447387695312,
"grad_norm": 0.000901962979696691,
"learning_rate": 2.8399372100830078e-05,
"lookahead_loss": 6.48411437702179,
"loss": 0.3334,
"step": 226500
},
{
"base_loss": 0.3043918348252773,
"epoch": 1.0514984130859375,
"grad_norm": 0.0009930033702403307,
"learning_rate": 2.835168838500977e-05,
"lookahead_loss": 6.4403407697677615,
"loss": 0.3181,
"step": 227000
},
{
"base_loss": 0.3046840020418167,
"epoch": 1.0524520874023438,
"grad_norm": 0.0010451226262375712,
"learning_rate": 2.8304004669189455e-05,
"lookahead_loss": 6.379611058235168,
"loss": 0.3174,
"step": 227500
},
{
"base_loss": 0.3202188531160355,
"epoch": 1.05340576171875,
"grad_norm": 0.0009132448467426002,
"learning_rate": 2.8256320953369142e-05,
"lookahead_loss": 6.3884695830345155,
"loss": 0.3302,
"step": 228000
},
{
"base_loss": 0.3542410895228386,
"epoch": 1.0543594360351562,
"grad_norm": 0.0009372152271680534,
"learning_rate": 2.820863723754883e-05,
"lookahead_loss": 6.432561398506165,
"loss": 0.369,
"step": 228500
},
{
"base_loss": 0.2943912135362625,
"epoch": 1.0553131103515625,
"grad_norm": 0.0009705543052405119,
"learning_rate": 2.8160953521728516e-05,
"lookahead_loss": 6.44163135099411,
"loss": 0.3085,
"step": 229000
},
{
"base_loss": 0.30392896428704264,
"epoch": 1.0562667846679688,
"grad_norm": 0.0009304340346716344,
"learning_rate": 2.8113269805908206e-05,
"lookahead_loss": 6.4533995332717895,
"loss": 0.317,
"step": 229500
},
{
"base_loss": 0.3181495431959629,
"epoch": 1.057220458984375,
"grad_norm": 0.0009568824316374958,
"learning_rate": 2.8065586090087893e-05,
"lookahead_loss": 6.471045615673066,
"loss": 0.3319,
"step": 230000
},
{
"epoch": 1.057220458984375,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.445454614231,
"eval_lookahead_perplexity": 629.832942905796,
"eval_loss": 0.14202013611793518,
"eval_perplexity": 1.1525998571899945,
"eval_runtime": 305.3757,
"eval_samples_per_second": 16.373,
"eval_steps_per_second": 0.514,
"step": 230000
},
{
"base_loss": 0.3180287193655968,
"epoch": 1.0581741333007812,
"grad_norm": 0.000988287152722478,
"learning_rate": 2.801790237426758e-05,
"lookahead_loss": 6.466564978599548,
"loss": 0.3287,
"step": 230500
},
{
"base_loss": 0.292520221978426,
"epoch": 1.0591278076171875,
"grad_norm": 0.000923556333873421,
"learning_rate": 2.7970218658447266e-05,
"lookahead_loss": 6.3680587558746335,
"loss": 0.3084,
"step": 231000
},
{
"base_loss": 0.3019208701252937,
"epoch": 1.0600814819335938,
"grad_norm": 0.0009795061778277159,
"learning_rate": 2.7922534942626953e-05,
"lookahead_loss": 6.414703974723816,
"loss": 0.315,
"step": 231500
},
{
"base_loss": 0.32141088619828223,
"epoch": 1.06103515625,
"grad_norm": 0.0009851646609604359,
"learning_rate": 2.7874851226806643e-05,
"lookahead_loss": 6.393617419719696,
"loss": 0.3324,
"step": 232000
},
{
"base_loss": 0.30723505771160126,
"epoch": 1.0619888305664062,
"grad_norm": 0.0010298019042238593,
"learning_rate": 2.782716751098633e-05,
"lookahead_loss": 6.404361547470093,
"loss": 0.3166,
"step": 232500
},
{
"base_loss": 0.308370777964592,
"epoch": 1.0629425048828125,
"grad_norm": 0.0009864643216133118,
"learning_rate": 2.7779483795166017e-05,
"lookahead_loss": 6.393691156387329,
"loss": 0.3175,
"step": 233000
},
{
"base_loss": 0.3175841515958309,
"epoch": 1.0638961791992188,
"grad_norm": 0.0009484239271841943,
"learning_rate": 2.7731800079345704e-05,
"lookahead_loss": 6.43188937664032,
"loss": 0.33,
"step": 233500
},
{
"base_loss": 0.3023634272813797,
"epoch": 1.064849853515625,
"grad_norm": 0.0009487507632002234,
"learning_rate": 2.768411636352539e-05,
"lookahead_loss": 6.451686841011047,
"loss": 0.3175,
"step": 234000
},
{
"base_loss": 0.31000158992409704,
"epoch": 1.0658035278320312,
"grad_norm": 0.0009820089908316731,
"learning_rate": 2.763643264770508e-05,
"lookahead_loss": 6.376307249069214,
"loss": 0.3214,
"step": 234500
},
{
"base_loss": 0.3074465197324753,
"epoch": 1.0667572021484375,
"grad_norm": 0.0009712969767861068,
"learning_rate": 2.7588748931884768e-05,
"lookahead_loss": 6.353045886516571,
"loss": 0.3171,
"step": 235000
},
{
"epoch": 1.0667572021484375,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.437607750725061,
"eval_lookahead_perplexity": 624.9100695885159,
"eval_loss": 0.1420057862997055,
"eval_perplexity": 1.1525833177102218,
"eval_runtime": 291.1708,
"eval_samples_per_second": 17.172,
"eval_steps_per_second": 0.539,
"step": 235000
},
{
"base_loss": 0.3308669750988483,
"epoch": 1.0677108764648438,
"grad_norm": 0.0010304059833288193,
"learning_rate": 2.7541065216064455e-05,
"lookahead_loss": 6.44032656955719,
"loss": 0.3439,
"step": 235500
},
{
"base_loss": 0.300963022172451,
"epoch": 1.06866455078125,
"grad_norm": 0.0009902446763589978,
"learning_rate": 2.749338150024414e-05,
"lookahead_loss": 6.4132391576766965,
"loss": 0.3104,
"step": 236000
},
{
"base_loss": 0.3016065271794796,
"epoch": 1.0696182250976562,
"grad_norm": 0.0009580631158314645,
"learning_rate": 2.7445697784423828e-05,
"lookahead_loss": 6.435465224266053,
"loss": 0.3147,
"step": 236500
},
{
"base_loss": 0.3469915909469128,
"epoch": 1.0705718994140625,
"grad_norm": 0.0009712293976917863,
"learning_rate": 2.739801406860352e-05,
"lookahead_loss": 6.335322134017944,
"loss": 0.3591,
"step": 237000
},
{
"base_loss": 0.31762470316886904,
"epoch": 1.0715255737304688,
"grad_norm": 0.00096644286531955,
"learning_rate": 2.7350330352783205e-05,
"lookahead_loss": 6.388244523525238,
"loss": 0.3257,
"step": 237500
},
{
"base_loss": 0.3090612238943577,
"epoch": 1.072479248046875,
"grad_norm": 0.0009957854636013508,
"learning_rate": 2.7302646636962892e-05,
"lookahead_loss": 6.472851838111877,
"loss": 0.319,
"step": 238000
},
{
"base_loss": 0.3051177371442318,
"epoch": 1.0734329223632812,
"grad_norm": 0.0009499763837084174,
"learning_rate": 2.725496292114258e-05,
"lookahead_loss": 6.422829883098602,
"loss": 0.3177,
"step": 238500
},
{
"base_loss": 0.32735036182403565,
"epoch": 1.0743865966796875,
"grad_norm": 0.0009518949664197862,
"learning_rate": 2.7207279205322266e-05,
"lookahead_loss": 6.442498418331146,
"loss": 0.34,
"step": 239000
},
{
"base_loss": 0.3037717220187187,
"epoch": 1.0753402709960938,
"grad_norm": 0.0009656847105361521,
"learning_rate": 2.7159595489501956e-05,
"lookahead_loss": 6.504564664363861,
"loss": 0.316,
"step": 239500
},
{
"base_loss": 0.3043428426384926,
"epoch": 1.0762939453125,
"grad_norm": 0.0009005047613754869,
"learning_rate": 2.7111911773681643e-05,
"lookahead_loss": 6.462020317077637,
"loss": 0.3176,
"step": 240000
},
{
"epoch": 1.0762939453125,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.43002515707534,
"eval_lookahead_perplexity": 620.189549971696,
"eval_loss": 0.14199133217334747,
"eval_perplexity": 1.1525666582457088,
"eval_runtime": 297.2728,
"eval_samples_per_second": 16.82,
"eval_steps_per_second": 0.528,
"step": 240000
},
{
"base_loss": 0.3304513318836689,
"epoch": 1.0772476196289062,
"grad_norm": 0.0009675352484919131,
"learning_rate": 2.706422805786133e-05,
"lookahead_loss": 6.441016254425048,
"loss": 0.344,
"step": 240500
},
{
"base_loss": 0.3036956556737423,
"epoch": 1.0782012939453125,
"grad_norm": 0.000994171597994864,
"learning_rate": 2.7016544342041016e-05,
"lookahead_loss": 6.463219980716706,
"loss": 0.318,
"step": 241000
},
{
"base_loss": 0.29657268461585046,
"epoch": 1.0791549682617188,
"grad_norm": 0.000914372387342155,
"learning_rate": 2.6968860626220703e-05,
"lookahead_loss": 6.459996415138245,
"loss": 0.3112,
"step": 241500
},
{
"base_loss": 0.3123248810470104,
"epoch": 1.080108642578125,
"grad_norm": 0.0009580728365108371,
"learning_rate": 2.6921176910400393e-05,
"lookahead_loss": 6.445644073486328,
"loss": 0.3292,
"step": 242000
},
{
"base_loss": 0.32140666726231576,
"epoch": 1.0810623168945312,
"grad_norm": 0.000988593208603561,
"learning_rate": 2.687349319458008e-05,
"lookahead_loss": 6.463636265277863,
"loss": 0.3373,
"step": 242500
},
{
"base_loss": 0.29961148300766943,
"epoch": 1.0820159912109375,
"grad_norm": 0.0009065298363566399,
"learning_rate": 2.6825809478759767e-05,
"lookahead_loss": 6.459467195987702,
"loss": 0.3123,
"step": 243000
},
{
"base_loss": 0.3036400380730629,
"epoch": 1.0829696655273438,
"grad_norm": 0.0010064428206533194,
"learning_rate": 2.6778125762939454e-05,
"lookahead_loss": 6.485273163795471,
"loss": 0.3175,
"step": 243500
},
{
"base_loss": 0.3315876969695091,
"epoch": 1.08392333984375,
"grad_norm": 0.0009258039062842727,
"learning_rate": 2.673044204711914e-05,
"lookahead_loss": 6.489104858398438,
"loss": 0.345,
"step": 244000
},
{
"base_loss": 0.30729381024837493,
"epoch": 1.0848770141601562,
"grad_norm": 0.000969972345046699,
"learning_rate": 2.668275833129883e-05,
"lookahead_loss": 6.436074539661408,
"loss": 0.3198,
"step": 244500
},
{
"base_loss": 0.3001744159460068,
"epoch": 1.0858306884765625,
"grad_norm": 0.0010077956831082702,
"learning_rate": 2.6635074615478518e-05,
"lookahead_loss": 6.4327317771911625,
"loss": 0.3101,
"step": 245000
},
{
"epoch": 1.0858306884765625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.422323587222602,
"eval_lookahead_perplexity": 615.4314627197983,
"eval_loss": 0.14197710156440735,
"eval_perplexity": 1.1525502566370207,
"eval_runtime": 260.851,
"eval_samples_per_second": 19.168,
"eval_steps_per_second": 0.602,
"step": 245000
},
{
"base_loss": 0.3043146550655365,
"epoch": 1.0867843627929688,
"grad_norm": 0.001128224772401154,
"learning_rate": 2.6587390899658205e-05,
"lookahead_loss": 6.390339149475098,
"loss": 0.3138,
"step": 245500
},
{
"base_loss": 0.33685871040821075,
"epoch": 1.087738037109375,
"grad_norm": 0.0008826267439872026,
"learning_rate": 2.653970718383789e-05,
"lookahead_loss": 6.4577340927124025,
"loss": 0.3444,
"step": 246000
},
{
"base_loss": 0.30271838963031766,
"epoch": 1.0886917114257812,
"grad_norm": 0.0009827081812545657,
"learning_rate": 2.6492023468017578e-05,
"lookahead_loss": 6.433503454208374,
"loss": 0.3134,
"step": 246500
},
{
"base_loss": 0.31106107553839685,
"epoch": 1.0896453857421875,
"grad_norm": 0.000977760530076921,
"learning_rate": 2.644433975219727e-05,
"lookahead_loss": 6.467117793083191,
"loss": 0.3206,
"step": 247000
},
{
"base_loss": 0.29801268032193184,
"epoch": 1.0905990600585938,
"grad_norm": 0.0009753919439390302,
"learning_rate": 2.6396656036376955e-05,
"lookahead_loss": 6.457008891105652,
"loss": 0.3107,
"step": 247500
},
{
"base_loss": 0.29706856977939605,
"epoch": 1.091552734375,
"grad_norm": 0.0008775305468589067,
"learning_rate": 2.6348972320556642e-05,
"lookahead_loss": 6.417390830039978,
"loss": 0.3085,
"step": 248000
},
{
"base_loss": 0.3189851225912571,
"epoch": 1.0925064086914062,
"grad_norm": 0.0009562866762280464,
"learning_rate": 2.630128860473633e-05,
"lookahead_loss": 6.447861388206482,
"loss": 0.3347,
"step": 248500
},
{
"base_loss": 0.307105902582407,
"epoch": 1.0934600830078125,
"grad_norm": 0.0009072708780877292,
"learning_rate": 2.6253604888916016e-05,
"lookahead_loss": 6.451038600921631,
"loss": 0.3198,
"step": 249000
},
{
"base_loss": 0.2863923677802086,
"epoch": 1.0944137573242188,
"grad_norm": 0.0009528218070045114,
"learning_rate": 2.6205921173095706e-05,
"lookahead_loss": 6.449621738433838,
"loss": 0.3025,
"step": 249500
},
{
"base_loss": 0.2923275768607855,
"epoch": 1.095367431640625,
"grad_norm": 0.0009796309750527143,
"learning_rate": 2.6158237457275393e-05,
"lookahead_loss": 6.37427237701416,
"loss": 0.3081,
"step": 250000
},
{
"epoch": 1.095367431640625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.416619156877073,
"eval_lookahead_perplexity": 611.9307710270847,
"eval_loss": 0.1419648975133896,
"eval_perplexity": 1.1525361909407175,
"eval_runtime": 271.5852,
"eval_samples_per_second": 18.41,
"eval_steps_per_second": 0.578,
"step": 250000
},
{
"base_loss": 0.2988976333141327,
"epoch": 1.0963211059570312,
"grad_norm": 0.0009741184767335653,
"learning_rate": 2.611055374145508e-05,
"lookahead_loss": 6.453314149856568,
"loss": 0.312,
"step": 250500
},
{
"base_loss": 0.3292928241491318,
"epoch": 1.0972747802734375,
"grad_norm": 0.0009737982181832194,
"learning_rate": 2.6062870025634766e-05,
"lookahead_loss": 6.470526203155518,
"loss": 0.3397,
"step": 251000
},
{
"base_loss": 0.2914348037838936,
"epoch": 1.0982284545898438,
"grad_norm": 0.0009696637280285358,
"learning_rate": 2.6015186309814453e-05,
"lookahead_loss": 6.402810046672821,
"loss": 0.3081,
"step": 251500
},
{
"base_loss": 0.2972012578845024,
"epoch": 1.09918212890625,
"grad_norm": 0.0009921115124598145,
"learning_rate": 2.5967502593994143e-05,
"lookahead_loss": 6.451505978584289,
"loss": 0.3099,
"step": 252000
},
{
"base_loss": 0.3006402098238468,
"epoch": 1.1001358032226562,
"grad_norm": 0.0009354639914818108,
"learning_rate": 2.591981887817383e-05,
"lookahead_loss": 6.447086319446564,
"loss": 0.3148,
"step": 252500
},
{
"base_loss": 0.3227167456150055,
"epoch": 1.1010894775390625,
"grad_norm": 0.0009446279727853835,
"learning_rate": 2.5872135162353517e-05,
"lookahead_loss": 6.516663771629333,
"loss": 0.3322,
"step": 253000
},
{
"base_loss": 0.30574207335710524,
"epoch": 1.1020431518554688,
"grad_norm": 0.0009245507535524666,
"learning_rate": 2.5824451446533204e-05,
"lookahead_loss": 6.473391896247864,
"loss": 0.315,
"step": 253500
},
{
"base_loss": 0.29960223579406736,
"epoch": 1.102996826171875,
"grad_norm": 0.0009597218013368547,
"learning_rate": 2.577676773071289e-05,
"lookahead_loss": 6.437963982582092,
"loss": 0.3144,
"step": 254000
},
{
"base_loss": 0.2996614835858345,
"epoch": 1.1039505004882812,
"grad_norm": 0.0009940110612660646,
"learning_rate": 2.572908401489258e-05,
"lookahead_loss": 6.468970078468323,
"loss": 0.3118,
"step": 254500
},
{
"base_loss": 0.3155037875175476,
"epoch": 1.1049041748046875,
"grad_norm": 0.0009892784291878343,
"learning_rate": 2.5681400299072268e-05,
"lookahead_loss": 6.404183995246887,
"loss": 0.3277,
"step": 255000
},
{
"epoch": 1.1049041748046875,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.4104706182266575,
"eval_lookahead_perplexity": 608.1798342368199,
"eval_loss": 0.1419525295495987,
"eval_perplexity": 1.1525219365029897,
"eval_runtime": 267.8175,
"eval_samples_per_second": 18.669,
"eval_steps_per_second": 0.586,
"step": 255000
},
{
"base_loss": 0.30881242457032204,
"epoch": 1.1058578491210938,
"grad_norm": 0.0009296280913986266,
"learning_rate": 2.5633716583251955e-05,
"lookahead_loss": 6.387983600616455,
"loss": 0.3208,
"step": 255500
},
{
"base_loss": 0.29839677426218986,
"epoch": 1.1068115234375,
"grad_norm": 0.0009309362503699958,
"learning_rate": 2.558603286743164e-05,
"lookahead_loss": 6.475096418380737,
"loss": 0.3094,
"step": 256000
},
{
"base_loss": 0.294783333927393,
"epoch": 1.1077651977539062,
"grad_norm": 0.0009678134229034185,
"learning_rate": 2.5538349151611328e-05,
"lookahead_loss": 6.3918491244316105,
"loss": 0.3082,
"step": 256500
},
{
"base_loss": 0.32150769320130346,
"epoch": 1.1087188720703125,
"grad_norm": 0.0010087802074849606,
"learning_rate": 2.549066543579102e-05,
"lookahead_loss": 6.3981378741264345,
"loss": 0.3336,
"step": 257000
},
{
"base_loss": 0.3191940434873104,
"epoch": 1.1096725463867188,
"grad_norm": 0.0009189763222821057,
"learning_rate": 2.5442981719970705e-05,
"lookahead_loss": 6.356856064796448,
"loss": 0.3283,
"step": 257500
},
{
"base_loss": 0.30270202097296717,
"epoch": 1.110626220703125,
"grad_norm": 0.0008876527426764369,
"learning_rate": 2.5395298004150392e-05,
"lookahead_loss": 6.327256792068481,
"loss": 0.3141,
"step": 258000
},
{
"base_loss": 0.2974509707689285,
"epoch": 1.1115798950195312,
"grad_norm": 0.0009743132395669818,
"learning_rate": 2.534761428833008e-05,
"lookahead_loss": 6.453075678348541,
"loss": 0.3085,
"step": 258500
},
{
"base_loss": 0.3114223616421223,
"epoch": 1.1125335693359375,
"grad_norm": 0.0009253611788153648,
"learning_rate": 2.5299930572509766e-05,
"lookahead_loss": 6.464822199821472,
"loss": 0.3229,
"step": 259000
},
{
"base_loss": 0.3443338246643543,
"epoch": 1.1134872436523438,
"grad_norm": 0.0009910385124385357,
"learning_rate": 2.5252246856689456e-05,
"lookahead_loss": 6.472144736289978,
"loss": 0.3517,
"step": 259500
},
{
"base_loss": 0.2939972540736198,
"epoch": 1.11444091796875,
"grad_norm": 0.000937771808821708,
"learning_rate": 2.5204563140869143e-05,
"lookahead_loss": 6.35105470085144,
"loss": 0.3083,
"step": 260000
},
{
"epoch": 1.11444091796875,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.404723172751479,
"eval_lookahead_perplexity": 604.6943796252261,
"eval_loss": 0.14194095134735107,
"eval_perplexity": 1.1525085924481642,
"eval_runtime": 271.0139,
"eval_samples_per_second": 18.449,
"eval_steps_per_second": 0.579,
"step": 260000
},
{
"base_loss": 0.29841734063625336,
"epoch": 1.1153945922851562,
"grad_norm": 0.0009852543007582426,
"learning_rate": 2.515687942504883e-05,
"lookahead_loss": 6.423218637466431,
"loss": 0.3114,
"step": 260500
},
{
"base_loss": 0.3147252712547779,
"epoch": 1.1163482666015625,
"grad_norm": 0.0010154576739296317,
"learning_rate": 2.5109195709228516e-05,
"lookahead_loss": 6.40000822353363,
"loss": 0.325,
"step": 261000
},
{
"base_loss": 0.32950386153161526,
"epoch": 1.1173019409179688,
"grad_norm": 0.000979804084636271,
"learning_rate": 2.5061511993408203e-05,
"lookahead_loss": 6.405930280685425,
"loss": 0.3432,
"step": 261500
},
{
"base_loss": 0.30734304267168044,
"epoch": 1.118255615234375,
"grad_norm": 0.0010216154623776674,
"learning_rate": 2.5013828277587893e-05,
"lookahead_loss": 6.439217642784119,
"loss": 0.3167,
"step": 262000
},
{
"base_loss": 0.3014386140704155,
"epoch": 1.1192092895507812,
"grad_norm": 0.0009699428919702768,
"learning_rate": 2.496614456176758e-05,
"lookahead_loss": 6.446448052883148,
"loss": 0.3129,
"step": 262500
},
{
"base_loss": 0.30611268219351767,
"epoch": 2.0009536743164062,
"grad_norm": 0.0009632021537981927,
"learning_rate": 2.4918460845947267e-05,
"lookahead_loss": 6.482765606403351,
"loss": 0.3149,
"step": 263000
},
{
"base_loss": 0.301539769411087,
"epoch": 2.0019073486328125,
"grad_norm": 0.0010064532980322838,
"learning_rate": 2.4870777130126954e-05,
"lookahead_loss": 6.310386034011841,
"loss": 0.314,
"step": 263500
},
{
"base_loss": 0.31222748425602914,
"epoch": 2.0028610229492188,
"grad_norm": 0.0009766683215275407,
"learning_rate": 2.482309341430664e-05,
"lookahead_loss": 6.307436645507813,
"loss": 0.3223,
"step": 264000
},
{
"base_loss": 0.32267384630441664,
"epoch": 2.003814697265625,
"grad_norm": 0.0009503905894234776,
"learning_rate": 2.477540969848633e-05,
"lookahead_loss": 6.3480785894393925,
"loss": 0.3351,
"step": 264500
},
{
"base_loss": 0.30016050645709036,
"epoch": 2.0047683715820312,
"grad_norm": 0.000949801120441407,
"learning_rate": 2.4727725982666018e-05,
"lookahead_loss": 6.324021258354187,
"loss": 0.3162,
"step": 265000
},
{
"epoch": 2.0047683715820312,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.398448333191795,
"eval_lookahead_perplexity": 600.9118990506257,
"eval_loss": 0.14192864298820496,
"eval_perplexity": 1.152494407045789,
"eval_runtime": 267.8201,
"eval_samples_per_second": 18.669,
"eval_steps_per_second": 0.586,
"step": 265000
},
{
"base_loss": 0.3024714471399784,
"epoch": 2.0057220458984375,
"grad_norm": 0.000842128531076014,
"learning_rate": 2.4680042266845705e-05,
"lookahead_loss": 6.430944422245026,
"loss": 0.3115,
"step": 265500
},
{
"base_loss": 0.2964489733278751,
"epoch": 2.0066757202148438,
"grad_norm": 0.0009076377027668059,
"learning_rate": 2.463235855102539e-05,
"lookahead_loss": 6.303486613273621,
"loss": 0.3129,
"step": 266000
},
{
"base_loss": 0.31337857532501223,
"epoch": 2.00762939453125,
"grad_norm": 0.0009625607635825872,
"learning_rate": 2.4584674835205078e-05,
"lookahead_loss": 6.363961039543152,
"loss": 0.3236,
"step": 266500
},
{
"base_loss": 0.3180972839295864,
"epoch": 2.0085830688476562,
"grad_norm": 0.0009394744993187487,
"learning_rate": 2.453699111938477e-05,
"lookahead_loss": 6.347307140827179,
"loss": 0.3229,
"step": 267000
},
{
"base_loss": 0.30493127757310867,
"epoch": 2.0095367431640625,
"grad_norm": 0.0009756337967701256,
"learning_rate": 2.4489307403564455e-05,
"lookahead_loss": 6.364429833412171,
"loss": 0.3183,
"step": 267500
},
{
"base_loss": 0.30099570405483245,
"epoch": 2.0104904174804688,
"grad_norm": 0.0009337849332951009,
"learning_rate": 2.4441623687744142e-05,
"lookahead_loss": 6.338661858081817,
"loss": 0.3112,
"step": 268000
},
{
"base_loss": 0.30160990768671037,
"epoch": 2.011444091796875,
"grad_norm": 0.0010120351798832417,
"learning_rate": 2.439393997192383e-05,
"lookahead_loss": 6.341832407951355,
"loss": 0.3145,
"step": 268500
},
{
"base_loss": 0.32538792353868484,
"epoch": 2.0123977661132812,
"grad_norm": 0.0008798608323559165,
"learning_rate": 2.4346256256103516e-05,
"lookahead_loss": 6.331272545814514,
"loss": 0.3356,
"step": 269000
},
{
"base_loss": 0.3040602553486824,
"epoch": 2.0133514404296875,
"grad_norm": 0.0009250569855794311,
"learning_rate": 2.4298572540283206e-05,
"lookahead_loss": 6.42705198764801,
"loss": 0.3192,
"step": 269500
},
{
"base_loss": 0.29813345649838446,
"epoch": 2.0143051147460938,
"grad_norm": 0.000940575497224927,
"learning_rate": 2.4250888824462893e-05,
"lookahead_loss": 6.370742217063904,
"loss": 0.3118,
"step": 270000
},
{
"epoch": 2.0143051147460938,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.3924329357025345,
"eval_lookahead_perplexity": 597.308025355375,
"eval_loss": 0.14191682636737823,
"eval_perplexity": 1.1524807885368387,
"eval_runtime": 257.7499,
"eval_samples_per_second": 19.399,
"eval_steps_per_second": 0.609,
"step": 270000
},
{
"base_loss": 0.2960759684741497,
"epoch": 2.0152587890625,
"grad_norm": 0.0009009315399453044,
"learning_rate": 2.420320510864258e-05,
"lookahead_loss": 6.327431865215302,
"loss": 0.3069,
"step": 270500
},
{
"base_loss": 0.31211792075634004,
"epoch": 2.0162124633789062,
"grad_norm": 0.0009672954329289496,
"learning_rate": 2.4155521392822266e-05,
"lookahead_loss": 6.376048874855042,
"loss": 0.3227,
"step": 271000
},
{
"base_loss": 0.31110167542099953,
"epoch": 2.0171661376953125,
"grad_norm": 0.0009206890244968235,
"learning_rate": 2.4107837677001953e-05,
"lookahead_loss": 6.41415591430664,
"loss": 0.3223,
"step": 271500
},
{
"base_loss": 0.2990322083234787,
"epoch": 2.0181198120117188,
"grad_norm": 0.0009310735040344298,
"learning_rate": 2.406015396118164e-05,
"lookahead_loss": 6.427069549560547,
"loss": 0.3117,
"step": 272000
},
{
"base_loss": 0.29806812533736227,
"epoch": 2.019073486328125,
"grad_norm": 0.0009696860215626657,
"learning_rate": 2.401247024536133e-05,
"lookahead_loss": 6.449417492866516,
"loss": 0.3097,
"step": 272500
},
{
"base_loss": 0.30187543269991873,
"epoch": 2.0200271606445312,
"grad_norm": 0.001025256235152483,
"learning_rate": 2.3964786529541017e-05,
"lookahead_loss": 6.279822265148163,
"loss": 0.3149,
"step": 273000
},
{
"base_loss": 0.32729279178380966,
"epoch": 2.0209808349609375,
"grad_norm": 0.0009685347322374582,
"learning_rate": 2.3917102813720704e-05,
"lookahead_loss": 6.365867915153504,
"loss": 0.3375,
"step": 273500
},
{
"base_loss": 0.3057846530973911,
"epoch": 2.0219345092773438,
"grad_norm": 0.000963672180660069,
"learning_rate": 2.386941909790039e-05,
"lookahead_loss": 6.3289254207611085,
"loss": 0.3143,
"step": 274000
},
{
"base_loss": 0.2997340569794178,
"epoch": 2.02288818359375,
"grad_norm": 0.0009888113709166646,
"learning_rate": 2.3821735382080078e-05,
"lookahead_loss": 6.368264214992523,
"loss": 0.3116,
"step": 274500
},
{
"base_loss": 0.30260268279910085,
"epoch": 2.0238418579101562,
"grad_norm": 0.0009482959285378456,
"learning_rate": 2.3774051666259768e-05,
"lookahead_loss": 6.32055482673645,
"loss": 0.314,
"step": 275000
},
{
"epoch": 2.0238418579101562,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.385799606006366,
"eval_lookahead_perplexity": 593.3589963787288,
"eval_loss": 0.1419040411710739,
"eval_perplexity": 1.1524660539379128,
"eval_runtime": 265.9647,
"eval_samples_per_second": 18.799,
"eval_steps_per_second": 0.59,
"step": 275000
},
{
"base_loss": 0.3236656226217747,
"epoch": 2.0247955322265625,
"grad_norm": 0.0009006787440739572,
"learning_rate": 2.3726367950439455e-05,
"lookahead_loss": 6.34121999502182,
"loss": 0.3346,
"step": 275500
},
{
"base_loss": 0.30869458481669426,
"epoch": 2.0257492065429688,
"grad_norm": 0.0009739008382894099,
"learning_rate": 2.367868423461914e-05,
"lookahead_loss": 6.298044787406921,
"loss": 0.3227,
"step": 276000
},
{
"base_loss": 0.3019005296528339,
"epoch": 2.026702880859375,
"grad_norm": 0.0010005339281633496,
"learning_rate": 2.3631000518798828e-05,
"lookahead_loss": 6.320043759346008,
"loss": 0.3112,
"step": 276500
},
{
"base_loss": 0.3077106066644192,
"epoch": 2.0276565551757812,
"grad_norm": 0.0009583939099684358,
"learning_rate": 2.3583316802978515e-05,
"lookahead_loss": 6.426606664657593,
"loss": 0.3183,
"step": 277000
},
{
"base_loss": 0.3280421564877033,
"epoch": 2.0286102294921875,
"grad_norm": 0.0009699960355646908,
"learning_rate": 2.3535633087158205e-05,
"lookahead_loss": 6.442135247707367,
"loss": 0.3392,
"step": 277500
},
{
"base_loss": 0.30581475085020066,
"epoch": 2.0295639038085938,
"grad_norm": 0.0009488245123066008,
"learning_rate": 2.3487949371337892e-05,
"lookahead_loss": 6.394023329734802,
"loss": 0.3138,
"step": 278000
},
{
"base_loss": 0.3068877322375774,
"epoch": 2.030517578125,
"grad_norm": 0.0009734364575706422,
"learning_rate": 2.344026565551758e-05,
"lookahead_loss": 6.383229479789734,
"loss": 0.3165,
"step": 278500
},
{
"base_loss": 0.3014947620034218,
"epoch": 2.0314712524414062,
"grad_norm": 0.0009622674551792443,
"learning_rate": 2.3392581939697266e-05,
"lookahead_loss": 6.387813640594483,
"loss": 0.3141,
"step": 279000
},
{
"base_loss": 0.3173881909847259,
"epoch": 2.0324249267578125,
"grad_norm": 0.0009690853185020387,
"learning_rate": 2.3344898223876953e-05,
"lookahead_loss": 6.346703090667725,
"loss": 0.3335,
"step": 279500
},
{
"base_loss": 0.3059709269702435,
"epoch": 2.0333786010742188,
"grad_norm": 0.0009841558057814837,
"learning_rate": 2.3297214508056643e-05,
"lookahead_loss": 6.396211630821228,
"loss": 0.3149,
"step": 280000
},
{
"epoch": 2.0333786010742188,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.38051580849547,
"eval_lookahead_perplexity": 590.2320758728989,
"eval_loss": 0.14189383387565613,
"eval_perplexity": 1.152454290436478,
"eval_runtime": 270.4203,
"eval_samples_per_second": 18.49,
"eval_steps_per_second": 0.581,
"step": 280000
},
{
"base_loss": 0.3017133396565914,
"epoch": 2.034332275390625,
"grad_norm": 0.0009047857020050287,
"learning_rate": 2.324953079223633e-05,
"lookahead_loss": 6.462435024261475,
"loss": 0.3144,
"step": 280500
},
{
"base_loss": 0.31134800574183463,
"epoch": 2.0352859497070312,
"grad_norm": 0.0010061671491712332,
"learning_rate": 2.3201847076416016e-05,
"lookahead_loss": 6.291885371685028,
"loss": 0.3227,
"step": 281000
},
{
"base_loss": 0.32387468561530114,
"epoch": 2.0362396240234375,
"grad_norm": 0.0009494886617176235,
"learning_rate": 2.3154163360595703e-05,
"lookahead_loss": 6.4035931491851805,
"loss": 0.3364,
"step": 281500
},
{
"base_loss": 0.3080780008882284,
"epoch": 2.0371932983398438,
"grad_norm": 0.0009919317672029138,
"learning_rate": 2.310647964477539e-05,
"lookahead_loss": 6.333096095561981,
"loss": 0.3194,
"step": 282000
},
{
"base_loss": 0.30180328992009164,
"epoch": 2.03814697265625,
"grad_norm": 0.0009797826642170548,
"learning_rate": 2.305879592895508e-05,
"lookahead_loss": 6.382054131031037,
"loss": 0.3139,
"step": 282500
},
{
"base_loss": 0.30689890575408935,
"epoch": 2.0391006469726562,
"grad_norm": 0.0009295629570260644,
"learning_rate": 2.3011112213134767e-05,
"lookahead_loss": 6.332678085803986,
"loss": 0.319,
"step": 283000
},
{
"base_loss": 0.32427770999073985,
"epoch": 2.0400543212890625,
"grad_norm": 0.0009341423865407705,
"learning_rate": 2.2963428497314454e-05,
"lookahead_loss": 6.368278294086457,
"loss": 0.3333,
"step": 283500
},
{
"base_loss": 0.30682690465450285,
"epoch": 2.0410079956054688,
"grad_norm": 0.000977862160652876,
"learning_rate": 2.291574478149414e-05,
"lookahead_loss": 6.298216439723968,
"loss": 0.3158,
"step": 284000
},
{
"base_loss": 0.29654143354296686,
"epoch": 2.041961669921875,
"grad_norm": 0.0009500051965005696,
"learning_rate": 2.2868061065673828e-05,
"lookahead_loss": 6.386434417724609,
"loss": 0.3081,
"step": 284500
},
{
"base_loss": 0.30721216344833374,
"epoch": 2.0429153442382812,
"grad_norm": 0.0009771424811333418,
"learning_rate": 2.2820377349853518e-05,
"lookahead_loss": 6.396834279060363,
"loss": 0.3232,
"step": 285000
},
{
"epoch": 2.0429153442382812,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.375226985151395,
"eval_lookahead_perplexity": 587.1186830411696,
"eval_loss": 0.14188361167907715,
"eval_perplexity": 1.1524425098823847,
"eval_runtime": 255.0149,
"eval_samples_per_second": 19.607,
"eval_steps_per_second": 0.616,
"step": 285000
},
{
"base_loss": 0.32630143281817436,
"epoch": 2.0438690185546875,
"grad_norm": 0.000979576027020812,
"learning_rate": 2.2772693634033205e-05,
"lookahead_loss": 6.421607649326324,
"loss": 0.3418,
"step": 285500
},
{
"base_loss": 0.296696748316288,
"epoch": 2.0448226928710938,
"grad_norm": 0.0009980611503124237,
"learning_rate": 2.272500991821289e-05,
"lookahead_loss": 6.342905656814575,
"loss": 0.3099,
"step": 286000
},
{
"base_loss": 0.30323311913013457,
"epoch": 2.0457763671875,
"grad_norm": 0.0009551486582495272,
"learning_rate": 2.2677326202392578e-05,
"lookahead_loss": 6.355796694755554,
"loss": 0.3161,
"step": 286500
},
{
"base_loss": 0.32944888742268086,
"epoch": 2.0467300415039062,
"grad_norm": 0.0009698076173663139,
"learning_rate": 2.2629642486572265e-05,
"lookahead_loss": 6.332300736427307,
"loss": 0.3395,
"step": 287000
},
{
"base_loss": 0.32393511798977853,
"epoch": 2.0476837158203125,
"grad_norm": 0.0009880122961476445,
"learning_rate": 2.2581958770751955e-05,
"lookahead_loss": 6.367317282676697,
"loss": 0.3395,
"step": 287500
},
{
"base_loss": 0.293301939278841,
"epoch": 2.0486373901367188,
"grad_norm": 0.0009537344449199736,
"learning_rate": 2.2534275054931642e-05,
"lookahead_loss": 6.327198909759521,
"loss": 0.3059,
"step": 288000
},
{
"base_loss": 0.3036652799248695,
"epoch": 2.049591064453125,
"grad_norm": 0.001002758159302175,
"learning_rate": 2.248659133911133e-05,
"lookahead_loss": 6.3047745990753175,
"loss": 0.3174,
"step": 288500
},
{
"base_loss": 0.3174412237107754,
"epoch": 2.0505447387695312,
"grad_norm": 0.0009080055169761181,
"learning_rate": 2.2438907623291016e-05,
"lookahead_loss": 6.417123206138611,
"loss": 0.3322,
"step": 289000
},
{
"base_loss": 0.30474287942051886,
"epoch": 2.0514984130859375,
"grad_norm": 0.0009786185109987855,
"learning_rate": 2.2391223907470703e-05,
"lookahead_loss": 6.35580781173706,
"loss": 0.318,
"step": 289500
},
{
"base_loss": 0.30692395463585853,
"epoch": 2.0524520874023438,
"grad_norm": 0.001052466919645667,
"learning_rate": 2.2343540191650393e-05,
"lookahead_loss": 6.302160401821136,
"loss": 0.3179,
"step": 290000
},
{
"epoch": 2.0524520874023438,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.370734815780347,
"eval_lookahead_perplexity": 584.4871615214768,
"eval_loss": 0.14187423884868622,
"eval_perplexity": 1.1524317082848252,
"eval_runtime": 269.1546,
"eval_samples_per_second": 18.577,
"eval_steps_per_second": 0.583,
"step": 290000
},
{
"base_loss": 0.32256214889883994,
"epoch": 2.05340576171875,
"grad_norm": 0.0009556798031553626,
"learning_rate": 2.229585647583008e-05,
"lookahead_loss": 6.321493628501892,
"loss": 0.3314,
"step": 290500
},
{
"base_loss": 0.3550116382241249,
"epoch": 2.0543594360351562,
"grad_norm": 0.000961259298492223,
"learning_rate": 2.2248172760009766e-05,
"lookahead_loss": 6.349826979160309,
"loss": 0.3695,
"step": 291000
},
{
"base_loss": 0.2970747436285019,
"epoch": 2.0553131103515625,
"grad_norm": 0.0009557474404573441,
"learning_rate": 2.2200489044189453e-05,
"lookahead_loss": 6.357933250904083,
"loss": 0.3083,
"step": 291500
},
{
"base_loss": 0.30645539990067483,
"epoch": 2.0562667846679688,
"grad_norm": 0.0009564717183820903,
"learning_rate": 2.215280532836914e-05,
"lookahead_loss": 6.364681176662445,
"loss": 0.3169,
"step": 292000
},
{
"base_loss": 0.31723022189736366,
"epoch": 2.057220458984375,
"grad_norm": 0.0009774576174095273,
"learning_rate": 2.210512161254883e-05,
"lookahead_loss": 6.384846560955047,
"loss": 0.331,
"step": 292500
},
{
"base_loss": 0.3193240025639534,
"epoch": 2.0581741333007812,
"grad_norm": 0.0009728356963023543,
"learning_rate": 2.2057437896728517e-05,
"lookahead_loss": 6.384281438827514,
"loss": 0.3274,
"step": 293000
},
{
"base_loss": 0.2937832759618759,
"epoch": 2.0591278076171875,
"grad_norm": 0.0009236375335603952,
"learning_rate": 2.2009754180908204e-05,
"lookahead_loss": 6.286119668006897,
"loss": 0.3073,
"step": 293500
},
{
"base_loss": 0.30271227744221685,
"epoch": 2.0600814819335938,
"grad_norm": 0.000955918338149786,
"learning_rate": 2.196207046508789e-05,
"lookahead_loss": 6.3528727483749385,
"loss": 0.3166,
"step": 294000
},
{
"base_loss": 0.3198817696869373,
"epoch": 2.06103515625,
"grad_norm": 0.0009816517122089863,
"learning_rate": 2.1914386749267578e-05,
"lookahead_loss": 6.311840002059936,
"loss": 0.3308,
"step": 294500
},
{
"base_loss": 0.3065698970258236,
"epoch": 2.0619888305664062,
"grad_norm": 0.0010125135304406285,
"learning_rate": 2.1866703033447268e-05,
"lookahead_loss": 6.322453142166138,
"loss": 0.3149,
"step": 295000
},
{
"epoch": 2.0619888305664062,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.365234135057979,
"eval_lookahead_perplexity": 581.2809106252928,
"eval_loss": 0.14186429977416992,
"eval_perplexity": 1.152420254237123,
"eval_runtime": 252.3744,
"eval_samples_per_second": 19.812,
"eval_steps_per_second": 0.622,
"step": 295000
},
{
"base_loss": 0.30793745544552803,
"epoch": 2.0629425048828125,
"grad_norm": 0.0010041906498372555,
"learning_rate": 2.1819019317626955e-05,
"lookahead_loss": 6.316522599697113,
"loss": 0.3179,
"step": 295500
},
{
"base_loss": 0.3166033121049404,
"epoch": 2.0638961791992188,
"grad_norm": 0.0009562079794704914,
"learning_rate": 2.177133560180664e-05,
"lookahead_loss": 6.345142545700074,
"loss": 0.3285,
"step": 296000
},
{
"base_loss": 0.30278992640972135,
"epoch": 2.064849853515625,
"grad_norm": 0.0009520898456685245,
"learning_rate": 2.1723651885986328e-05,
"lookahead_loss": 6.375065122127533,
"loss": 0.3182,
"step": 296500
},
{
"base_loss": 0.30789859166741373,
"epoch": 2.0658035278320312,
"grad_norm": 0.0009764463757164776,
"learning_rate": 2.1675968170166015e-05,
"lookahead_loss": 6.292466301918029,
"loss": 0.3196,
"step": 297000
},
{
"base_loss": 0.307515013217926,
"epoch": 2.0667572021484375,
"grad_norm": 0.0009969203965738416,
"learning_rate": 2.1628284454345705e-05,
"lookahead_loss": 6.283729823112488,
"loss": 0.3164,
"step": 297500
},
{
"base_loss": 0.3304452752768993,
"epoch": 2.0677108764648438,
"grad_norm": 0.0010050119599327445,
"learning_rate": 2.1580600738525392e-05,
"lookahead_loss": 6.349568615913391,
"loss": 0.3424,
"step": 298000
},
{
"base_loss": 0.3000219973921776,
"epoch": 2.06866455078125,
"grad_norm": 0.0010230648331344128,
"learning_rate": 2.153291702270508e-05,
"lookahead_loss": 6.3259260330200195,
"loss": 0.3094,
"step": 298500
},
{
"base_loss": 0.3050138043165207,
"epoch": 2.0696182250976562,
"grad_norm": 0.000958802062086761,
"learning_rate": 2.1485233306884766e-05,
"lookahead_loss": 6.357368535041809,
"loss": 0.3167,
"step": 299000
},
{
"base_loss": 0.34709723374247553,
"epoch": 2.0705718994140625,
"grad_norm": 0.0009933494729921222,
"learning_rate": 2.1437549591064453e-05,
"lookahead_loss": 6.26349934053421,
"loss": 0.3598,
"step": 299500
},
{
"base_loss": 0.31454886627197265,
"epoch": 2.0715255737304688,
"grad_norm": 0.0009534511482343078,
"learning_rate": 2.1389865875244143e-05,
"lookahead_loss": 6.306615966796875,
"loss": 0.3259,
"step": 300000
},
{
"epoch": 2.0715255737304688,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.36017004521891,
"eval_lookahead_perplexity": 578.3446927825626,
"eval_loss": 0.14185450971126556,
"eval_perplexity": 1.1524089720255686,
"eval_runtime": 269.0073,
"eval_samples_per_second": 18.587,
"eval_steps_per_second": 0.584,
"step": 300000
},
{
"base_loss": 0.30621468406915664,
"epoch": 2.072479248046875,
"grad_norm": 0.0009834032971411943,
"learning_rate": 2.134218215942383e-05,
"lookahead_loss": 6.379785425662995,
"loss": 0.3181,
"step": 300500
},
{
"base_loss": 0.3062588813006878,
"epoch": 2.0734329223632812,
"grad_norm": 0.0009368330356664956,
"learning_rate": 2.1294498443603516e-05,
"lookahead_loss": 6.34466918849945,
"loss": 0.3187,
"step": 301000
},
{
"base_loss": 0.3277868445813656,
"epoch": 2.0743865966796875,
"grad_norm": 0.0009595782612450421,
"learning_rate": 2.1246814727783203e-05,
"lookahead_loss": 6.362534090518952,
"loss": 0.3403,
"step": 301500
},
{
"base_loss": 0.30303199696540833,
"epoch": 2.0753402709960938,
"grad_norm": 0.0009713785257190466,
"learning_rate": 2.119913101196289e-05,
"lookahead_loss": 6.440483233451843,
"loss": 0.3151,
"step": 302000
},
{
"base_loss": 0.30761926966905595,
"epoch": 2.0762939453125,
"grad_norm": 0.0009204771486110985,
"learning_rate": 2.115144729614258e-05,
"lookahead_loss": 6.384215325355529,
"loss": 0.3185,
"step": 302500
},
{
"base_loss": 0.33150802648067473,
"epoch": 2.0772476196289062,
"grad_norm": 0.0009674776811152697,
"learning_rate": 2.1103763580322267e-05,
"lookahead_loss": 6.376626619338989,
"loss": 0.3442,
"step": 303000
},
{
"base_loss": 0.30574921500682833,
"epoch": 2.0782012939453125,
"grad_norm": 0.0009765610448084772,
"learning_rate": 2.1056079864501954e-05,
"lookahead_loss": 6.3769332141876225,
"loss": 0.3173,
"step": 303500
},
{
"base_loss": 0.2994054418802261,
"epoch": 2.0791549682617188,
"grad_norm": 0.0009325052960775793,
"learning_rate": 2.100839614868164e-05,
"lookahead_loss": 6.375643718719482,
"loss": 0.3132,
"step": 304000
},
{
"base_loss": 0.31194803246855735,
"epoch": 2.080108642578125,
"grad_norm": 0.0009499716688878834,
"learning_rate": 2.0960712432861328e-05,
"lookahead_loss": 6.357075540542603,
"loss": 0.3293,
"step": 304500
},
{
"base_loss": 0.3244352611005306,
"epoch": 2.0810623168945312,
"grad_norm": 0.0010088100098073483,
"learning_rate": 2.0913028717041018e-05,
"lookahead_loss": 6.381894103050232,
"loss": 0.3389,
"step": 305000
},
{
"epoch": 2.0810623168945312,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.354905619788855,
"eval_lookahead_perplexity": 575.3080404027362,
"eval_loss": 0.14184485375881195,
"eval_perplexity": 1.1523978444730512,
"eval_runtime": 264.9253,
"eval_samples_per_second": 18.873,
"eval_steps_per_second": 0.593,
"step": 305000
},
{
"base_loss": 0.30112930870056154,
"epoch": 2.0820159912109375,
"grad_norm": 0.0009222808876074851,
"learning_rate": 2.0865345001220705e-05,
"lookahead_loss": 6.385272013187408,
"loss": 0.313,
"step": 305500
},
{
"base_loss": 0.30448419651389125,
"epoch": 2.0829696655273438,
"grad_norm": 0.0009914437541738153,
"learning_rate": 2.081766128540039e-05,
"lookahead_loss": 6.414517087459564,
"loss": 0.3179,
"step": 306000
},
{
"base_loss": 0.33415990057587625,
"epoch": 2.08392333984375,
"grad_norm": 0.0009415415697731078,
"learning_rate": 2.0769977569580078e-05,
"lookahead_loss": 6.42780497264862,
"loss": 0.3454,
"step": 306500
},
{
"base_loss": 0.31056174263358116,
"epoch": 2.0848770141601562,
"grad_norm": 0.000990306492894888,
"learning_rate": 2.0722293853759765e-05,
"lookahead_loss": 6.353310499668122,
"loss": 0.3208,
"step": 307000
},
{
"base_loss": 0.2973758824914694,
"epoch": 2.0858306884765625,
"grad_norm": 0.0010177840013056993,
"learning_rate": 2.0674610137939455e-05,
"lookahead_loss": 6.358139885902405,
"loss": 0.3087,
"step": 307500
},
{
"base_loss": 0.3042152850329876,
"epoch": 2.0867843627929688,
"grad_norm": 0.0011030308669432998,
"learning_rate": 2.0626926422119142e-05,
"lookahead_loss": 6.3280936369895935,
"loss": 0.3138,
"step": 308000
},
{
"base_loss": 0.337257578343153,
"epoch": 2.087738037109375,
"grad_norm": 0.000879972823895514,
"learning_rate": 2.057924270629883e-05,
"lookahead_loss": 6.373201508045197,
"loss": 0.344,
"step": 308500
},
{
"base_loss": 0.3000968562066555,
"epoch": 2.0886917114257812,
"grad_norm": 0.0010026989039033651,
"learning_rate": 2.0531558990478516e-05,
"lookahead_loss": 6.366145831108093,
"loss": 0.3114,
"step": 309000
},
{
"base_loss": 0.3103375973403454,
"epoch": 2.0896453857421875,
"grad_norm": 0.0009907458443194628,
"learning_rate": 2.0483875274658203e-05,
"lookahead_loss": 6.402014326095581,
"loss": 0.3184,
"step": 309500
},
{
"base_loss": 0.29991284269094465,
"epoch": 2.0905990600585938,
"grad_norm": 0.0009730191086418927,
"learning_rate": 2.0436191558837893e-05,
"lookahead_loss": 6.375398355484009,
"loss": 0.3119,
"step": 310000
},
{
"epoch": 2.0905990600585938,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.350209241477065,
"eval_lookahead_perplexity": 572.6125107670246,
"eval_loss": 0.14183588325977325,
"eval_perplexity": 1.1523875069356617,
"eval_runtime": 261.1778,
"eval_samples_per_second": 19.144,
"eval_steps_per_second": 0.601,
"step": 310000
},
{
"base_loss": 0.3000768061578274,
"epoch": 2.091552734375,
"grad_norm": 0.0008830556180328131,
"learning_rate": 2.038850784301758e-05,
"lookahead_loss": 6.33889298915863,
"loss": 0.3108,
"step": 310500
},
{
"base_loss": 0.31945454320311545,
"epoch": 2.0925064086914062,
"grad_norm": 0.0009592261631041765,
"learning_rate": 2.0340824127197266e-05,
"lookahead_loss": 6.384022113323212,
"loss": 0.3332,
"step": 311000
},
{
"base_loss": 0.309114942163229,
"epoch": 2.0934600830078125,
"grad_norm": 0.0008802940137684345,
"learning_rate": 2.0293140411376953e-05,
"lookahead_loss": 6.382145908355713,
"loss": 0.3196,
"step": 311500
},
{
"base_loss": 0.28753899577260017,
"epoch": 2.0944137573242188,
"grad_norm": 0.0009626175160519779,
"learning_rate": 2.024545669555664e-05,
"lookahead_loss": 6.367324014663696,
"loss": 0.3034,
"step": 312000
},
{
"base_loss": 0.29245217123627665,
"epoch": 2.095367431640625,
"grad_norm": 0.0009789286414161325,
"learning_rate": 2.019777297973633e-05,
"lookahead_loss": 6.304876070022583,
"loss": 0.3079,
"step": 312500
},
{
"base_loss": 0.30112256136536597,
"epoch": 2.0963211059570312,
"grad_norm": 0.0009424127638339996,
"learning_rate": 2.0150089263916017e-05,
"lookahead_loss": 6.374583042144775,
"loss": 0.3135,
"step": 313000
},
{
"base_loss": 0.3297825155258179,
"epoch": 2.0972747802734375,
"grad_norm": 0.0009768138406798244,
"learning_rate": 2.0102405548095704e-05,
"lookahead_loss": 6.39582384967804,
"loss": 0.341,
"step": 313500
},
{
"base_loss": 0.2911633634865284,
"epoch": 2.0982284545898438,
"grad_norm": 0.0009722594986669719,
"learning_rate": 2.005472183227539e-05,
"lookahead_loss": 6.326456391811371,
"loss": 0.3062,
"step": 314000
},
{
"base_loss": 0.2934698580801487,
"epoch": 2.09918212890625,
"grad_norm": 0.0009798408718779683,
"learning_rate": 2.0007038116455078e-05,
"lookahead_loss": 6.375765996932984,
"loss": 0.3097,
"step": 314500
},
{
"base_loss": 0.3032768616080284,
"epoch": 2.1001358032226562,
"grad_norm": 0.0009298865916207433,
"learning_rate": 1.9959354400634768e-05,
"lookahead_loss": 6.382933850288391,
"loss": 0.3149,
"step": 315000
},
{
"epoch": 2.1001358032226562,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.346465186189158,
"eval_lookahead_perplexity": 570.4726262907923,
"eval_loss": 0.14182765781879425,
"eval_perplexity": 1.1523780280792224,
"eval_runtime": 278.3923,
"eval_samples_per_second": 17.96,
"eval_steps_per_second": 0.564,
"step": 315000
},
{
"base_loss": 0.3233104472160339,
"epoch": 2.1010894775390625,
"grad_norm": 0.0009602688369341195,
"learning_rate": 1.9911670684814455e-05,
"lookahead_loss": 6.44511248588562,
"loss": 0.3332,
"step": 315500
},
{
"base_loss": 0.30158160945773127,
"epoch": 2.1020431518554688,
"grad_norm": 0.0009496202110312879,
"learning_rate": 1.986398696899414e-05,
"lookahead_loss": 6.3878337059021,
"loss": 0.3128,
"step": 316000
},
{
"base_loss": 0.29719595339894295,
"epoch": 2.102996826171875,
"grad_norm": 0.0009702611714601517,
"learning_rate": 1.9816303253173828e-05,
"lookahead_loss": 6.36890030002594,
"loss": 0.3127,
"step": 316500
},
{
"base_loss": 0.3008777514696121,
"epoch": 2.1039505004882812,
"grad_norm": 0.0009990332182496786,
"learning_rate": 1.9768619537353515e-05,
"lookahead_loss": 6.40038135433197,
"loss": 0.3116,
"step": 317000
},
{
"base_loss": 0.31517351168394087,
"epoch": 2.1049041748046875,
"grad_norm": 0.0009908730862662196,
"learning_rate": 1.9720935821533205e-05,
"lookahead_loss": 6.332096821784973,
"loss": 0.3277,
"step": 317500
},
{
"base_loss": 0.3079349631667137,
"epoch": 2.1058578491210938,
"grad_norm": 0.0009350811596959829,
"learning_rate": 1.9673252105712892e-05,
"lookahead_loss": 6.3365942516326905,
"loss": 0.3206,
"step": 318000
},
{
"base_loss": 0.29828172570466993,
"epoch": 2.1068115234375,
"grad_norm": 0.0009335639770142734,
"learning_rate": 1.962556838989258e-05,
"lookahead_loss": 6.408078939437866,
"loss": 0.3084,
"step": 318500
},
{
"base_loss": 0.29354105108976364,
"epoch": 2.1077651977539062,
"grad_norm": 0.0009805822046473622,
"learning_rate": 1.9577884674072266e-05,
"lookahead_loss": 6.328373380184174,
"loss": 0.306,
"step": 319000
},
{
"base_loss": 0.3199558552503586,
"epoch": 2.1087188720703125,
"grad_norm": 0.0010078581981360912,
"learning_rate": 1.9530200958251953e-05,
"lookahead_loss": 6.335598618030548,
"loss": 0.333,
"step": 319500
},
{
"base_loss": 0.31955078572034834,
"epoch": 2.1096725463867188,
"grad_norm": 0.0009100844035856426,
"learning_rate": 1.9482517242431643e-05,
"lookahead_loss": 6.297343758106232,
"loss": 0.3289,
"step": 320000
},
{
"epoch": 2.1096725463867188,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.342238569411988,
"eval_lookahead_perplexity": 568.066545490244,
"eval_loss": 0.14181943237781525,
"eval_perplexity": 1.1523685493007505,
"eval_runtime": 268.0733,
"eval_samples_per_second": 18.652,
"eval_steps_per_second": 0.586,
"step": 320000
},
{
"base_loss": 0.30168747982382776,
"epoch": 2.110626220703125,
"grad_norm": 0.0009170817211270332,
"learning_rate": 1.943483352661133e-05,
"lookahead_loss": 6.255009590625763,
"loss": 0.315,
"step": 320500
},
{
"base_loss": 0.29647291442751883,
"epoch": 2.1115798950195312,
"grad_norm": 0.0010039744665846229,
"learning_rate": 1.9387149810791016e-05,
"lookahead_loss": 6.370616749763489,
"loss": 0.3074,
"step": 321000
},
{
"base_loss": 0.3107553372234106,
"epoch": 2.1125335693359375,
"grad_norm": 0.0009533903794363141,
"learning_rate": 1.9339466094970703e-05,
"lookahead_loss": 6.388658571243286,
"loss": 0.3221,
"step": 321500
},
{
"base_loss": 0.34275346267223356,
"epoch": 2.1134872436523438,
"grad_norm": 0.0010111962910741568,
"learning_rate": 1.929178237915039e-05,
"lookahead_loss": 6.398144736289978,
"loss": 0.3506,
"step": 322000
},
{
"base_loss": 0.29299941608309743,
"epoch": 2.11444091796875,
"grad_norm": 0.0009360475232824683,
"learning_rate": 1.924409866333008e-05,
"lookahead_loss": 6.286139685630799,
"loss": 0.3075,
"step": 322500
},
{
"base_loss": 0.29959108304977417,
"epoch": 2.1153945922851562,
"grad_norm": 0.0009626513347029686,
"learning_rate": 1.9196414947509767e-05,
"lookahead_loss": 6.344089280605316,
"loss": 0.3104,
"step": 323000
},
{
"base_loss": 0.3146350940167904,
"epoch": 2.1163482666015625,
"grad_norm": 0.0010095579782500863,
"learning_rate": 1.9148731231689454e-05,
"lookahead_loss": 6.34140051651001,
"loss": 0.3239,
"step": 323500
},
{
"base_loss": 0.32594672916829587,
"epoch": 2.1173019409179688,
"grad_norm": 0.0009781294502317905,
"learning_rate": 1.910104751586914e-05,
"lookahead_loss": 6.33776809501648,
"loss": 0.3422,
"step": 324000
},
{
"base_loss": 0.3083632712960243,
"epoch": 2.118255615234375,
"grad_norm": 0.0010100657818838954,
"learning_rate": 1.9053363800048828e-05,
"lookahead_loss": 6.3736692142486575,
"loss": 0.3159,
"step": 324500
},
{
"base_loss": 0.30149356806278227,
"epoch": 2.1192092895507812,
"grad_norm": 0.000946766056586057,
"learning_rate": 1.9005680084228518e-05,
"lookahead_loss": 6.3695377283096315,
"loss": 0.3111,
"step": 325000
},
{
"epoch": 2.1192092895507812,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.338089135508187,
"eval_lookahead_perplexity": 565.7142745781854,
"eval_loss": 0.14181137084960938,
"eval_perplexity": 1.1523592594866319,
"eval_runtime": 257.0144,
"eval_samples_per_second": 19.454,
"eval_steps_per_second": 0.611,
"step": 325000
},
{
"base_loss": 0.308347962975502,
"epoch": 3.0009536743164062,
"grad_norm": 0.0009298936929553747,
"learning_rate": 1.8957996368408205e-05,
"lookahead_loss": 6.413288908958435,
"loss": 0.3164,
"step": 325500
},
{
"base_loss": 0.3005163077712059,
"epoch": 3.0019073486328125,
"grad_norm": 0.0010061148786917329,
"learning_rate": 1.891031265258789e-05,
"lookahead_loss": 6.252055767536163,
"loss": 0.3131,
"step": 326000
},
{
"base_loss": 0.3118715011179447,
"epoch": 3.0028610229492188,
"grad_norm": 0.001017007976770401,
"learning_rate": 1.8862628936767578e-05,
"lookahead_loss": 6.243941001415252,
"loss": 0.3221,
"step": 326500
},
{
"base_loss": 0.3241955025494099,
"epoch": 3.003814697265625,
"grad_norm": 0.0009476043051108718,
"learning_rate": 1.8814945220947265e-05,
"lookahead_loss": 6.282772970199585,
"loss": 0.3347,
"step": 327000
},
{
"base_loss": 0.3020790828168392,
"epoch": 3.0047683715820312,
"grad_norm": 0.0009582182974554598,
"learning_rate": 1.8767261505126955e-05,
"lookahead_loss": 6.2533429822921756,
"loss": 0.3166,
"step": 327500
},
{
"base_loss": 0.3016543593108654,
"epoch": 3.0057220458984375,
"grad_norm": 0.0008436063071712852,
"learning_rate": 1.8719577789306642e-05,
"lookahead_loss": 6.375606914520263,
"loss": 0.3119,
"step": 328000
},
{
"base_loss": 0.29849619832634927,
"epoch": 3.0066757202148438,
"grad_norm": 0.0009330453467555344,
"learning_rate": 1.867189407348633e-05,
"lookahead_loss": 6.2344167790412905,
"loss": 0.3128,
"step": 328500
},
{
"base_loss": 0.31336222241818906,
"epoch": 3.00762939453125,
"grad_norm": 0.0009829691844061017,
"learning_rate": 1.8624210357666016e-05,
"lookahead_loss": 6.291501372814179,
"loss": 0.3234,
"step": 329000
},
{
"base_loss": 0.316298152923584,
"epoch": 3.0085830688476562,
"grad_norm": 0.0009287581779062748,
"learning_rate": 1.8576526641845703e-05,
"lookahead_loss": 6.2824432291984555,
"loss": 0.3224,
"step": 329500
},
{
"base_loss": 0.3036236027777195,
"epoch": 3.0095367431640625,
"grad_norm": 0.000979596166871488,
"learning_rate": 1.8528842926025393e-05,
"lookahead_loss": 6.3004652094841,
"loss": 0.3179,
"step": 330000
},
{
"epoch": 3.0095367431640625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.334081890484015,
"eval_lookahead_perplexity": 563.4518549287164,
"eval_loss": 0.1418035328388214,
"eval_perplexity": 1.1523502273177215,
"eval_runtime": 268.9824,
"eval_samples_per_second": 18.589,
"eval_steps_per_second": 0.584,
"step": 330000
},
{
"base_loss": 0.30253981775045397,
"epoch": 3.0104904174804688,
"grad_norm": 0.0009402994182892144,
"learning_rate": 1.848115921020508e-05,
"lookahead_loss": 6.262249879837036,
"loss": 0.3119,
"step": 330500
},
{
"base_loss": 0.30147307565808296,
"epoch": 3.011444091796875,
"grad_norm": 0.0010017170570790768,
"learning_rate": 1.8433475494384766e-05,
"lookahead_loss": 6.274536405563355,
"loss": 0.3139,
"step": 331000
},
{
"base_loss": 0.32504893574118615,
"epoch": 3.0123977661132812,
"grad_norm": 0.0008727677050046623,
"learning_rate": 1.8385791778564453e-05,
"lookahead_loss": 6.272889457702637,
"loss": 0.335,
"step": 331500
},
{
"base_loss": 0.3044933348596096,
"epoch": 3.0133514404296875,
"grad_norm": 0.0009206495014950633,
"learning_rate": 1.833810806274414e-05,
"lookahead_loss": 6.381618105888367,
"loss": 0.3184,
"step": 332000
},
{
"base_loss": 0.29874410527944567,
"epoch": 3.0143051147460938,
"grad_norm": 0.0009790639160200953,
"learning_rate": 1.829042434692383e-05,
"lookahead_loss": 6.30400173664093,
"loss": 0.3137,
"step": 332500
},
{
"base_loss": 0.2969954553842545,
"epoch": 3.0152587890625,
"grad_norm": 0.0009097548900172114,
"learning_rate": 1.8242740631103517e-05,
"lookahead_loss": 6.282666071414948,
"loss": 0.3061,
"step": 333000
},
{
"base_loss": 0.30927150130271913,
"epoch": 3.0162124633789062,
"grad_norm": 0.0009808189934119582,
"learning_rate": 1.8195056915283204e-05,
"lookahead_loss": 6.3133458938598634,
"loss": 0.322,
"step": 333500
},
{
"base_loss": 0.3112277380824089,
"epoch": 3.0171661376953125,
"grad_norm": 0.0009308747248724103,
"learning_rate": 1.814737319946289e-05,
"lookahead_loss": 6.369205039024353,
"loss": 0.3221,
"step": 334000
},
{
"base_loss": 0.300263544857502,
"epoch": 3.0181198120117188,
"grad_norm": 0.0009227790287695825,
"learning_rate": 1.8099689483642578e-05,
"lookahead_loss": 6.362029777526855,
"loss": 0.312,
"step": 334500
},
{
"base_loss": 0.29675460466742515,
"epoch": 3.019073486328125,
"grad_norm": 0.0009541187318973243,
"learning_rate": 1.8052005767822268e-05,
"lookahead_loss": 6.378857530593872,
"loss": 0.3094,
"step": 335000
},
{
"epoch": 3.019073486328125,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.329803198671189,
"eval_lookahead_perplexity": 561.046168355948,
"eval_loss": 0.14179490506649017,
"eval_perplexity": 1.1523402851452038,
"eval_runtime": 295.7066,
"eval_samples_per_second": 16.909,
"eval_steps_per_second": 0.531,
"step": 335000
},
{
"base_loss": 0.30056491792201995,
"epoch": 3.0200271606445312,
"grad_norm": 0.0010459490586072206,
"learning_rate": 1.8004322052001955e-05,
"lookahead_loss": 6.219315876483917,
"loss": 0.3149,
"step": 335500
},
{
"base_loss": 0.3270763371884823,
"epoch": 3.0209808349609375,
"grad_norm": 0.0009397394605912268,
"learning_rate": 1.795663833618164e-05,
"lookahead_loss": 6.310785099029541,
"loss": 0.3388,
"step": 336000
},
{
"base_loss": 0.3038404151797295,
"epoch": 3.0219345092773438,
"grad_norm": 0.0009903458412736654,
"learning_rate": 1.7908954620361328e-05,
"lookahead_loss": 6.267493858814239,
"loss": 0.3142,
"step": 336500
},
{
"base_loss": 0.29968900653719904,
"epoch": 3.02288818359375,
"grad_norm": 0.0009887177729979157,
"learning_rate": 1.7861270904541015e-05,
"lookahead_loss": 6.31684240436554,
"loss": 0.3122,
"step": 337000
},
{
"base_loss": 0.3010028195679188,
"epoch": 3.0238418579101562,
"grad_norm": 0.0009739255765452981,
"learning_rate": 1.7813587188720705e-05,
"lookahead_loss": 6.265827451705933,
"loss": 0.3125,
"step": 337500
},
{
"base_loss": 0.32444008192420004,
"epoch": 3.0247955322265625,
"grad_norm": 0.0009061110904440284,
"learning_rate": 1.7765903472900392e-05,
"lookahead_loss": 6.279670271396637,
"loss": 0.3341,
"step": 338000
},
{
"base_loss": 0.30922784996032715,
"epoch": 3.0257492065429688,
"grad_norm": 0.0009870253270491958,
"learning_rate": 1.771821975708008e-05,
"lookahead_loss": 6.245859955787659,
"loss": 0.3225,
"step": 338500
},
{
"base_loss": 0.30224511262774467,
"epoch": 3.026702880859375,
"grad_norm": 0.0009940272429957986,
"learning_rate": 1.7670536041259766e-05,
"lookahead_loss": 6.2585461874008175,
"loss": 0.3115,
"step": 339000
},
{
"base_loss": 0.3063249331712723,
"epoch": 3.0276565551757812,
"grad_norm": 0.0009431723156012595,
"learning_rate": 1.7622852325439453e-05,
"lookahead_loss": 6.363879596710205,
"loss": 0.3174,
"step": 339500
},
{
"base_loss": 0.32986500787734985,
"epoch": 3.0286102294921875,
"grad_norm": 0.0009309691959060729,
"learning_rate": 1.7575168609619143e-05,
"lookahead_loss": 6.401720482826233,
"loss": 0.3396,
"step": 340000
},
{
"epoch": 3.0286102294921875,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.325850799060858,
"eval_lookahead_perplexity": 558.8330661119743,
"eval_loss": 0.1417873352766037,
"eval_perplexity": 1.152331562204383,
"eval_runtime": 270.5286,
"eval_samples_per_second": 18.482,
"eval_steps_per_second": 0.58,
"step": 340000
},
{
"base_loss": 0.3042540407180786,
"epoch": 3.0295639038085938,
"grad_norm": 0.0009302702965214849,
"learning_rate": 1.752748489379883e-05,
"lookahead_loss": 6.327819796562195,
"loss": 0.3128,
"step": 340500
},
{
"base_loss": 0.3056684481501579,
"epoch": 3.030517578125,
"grad_norm": 0.0009667924023233354,
"learning_rate": 1.7479801177978516e-05,
"lookahead_loss": 6.319185664653778,
"loss": 0.3153,
"step": 341000
},
{
"base_loss": 0.30326661148667333,
"epoch": 3.0314712524414062,
"grad_norm": 0.000947739346884191,
"learning_rate": 1.7432117462158203e-05,
"lookahead_loss": 6.31995133113861,
"loss": 0.3148,
"step": 341500
},
{
"base_loss": 0.31433943542838094,
"epoch": 3.0324249267578125,
"grad_norm": 0.0009427520562894642,
"learning_rate": 1.738443374633789e-05,
"lookahead_loss": 6.287163452148437,
"loss": 0.3335,
"step": 342000
},
{
"base_loss": 0.3052255228161812,
"epoch": 3.0333786010742188,
"grad_norm": 0.0010005695512518287,
"learning_rate": 1.733675003051758e-05,
"lookahead_loss": 6.3287235207557675,
"loss": 0.3154,
"step": 342500
},
{
"base_loss": 0.304299351811409,
"epoch": 3.034332275390625,
"grad_norm": 0.0008999764686450362,
"learning_rate": 1.7289066314697267e-05,
"lookahead_loss": 6.394580018997193,
"loss": 0.316,
"step": 343000
},
{
"base_loss": 0.3093027866780758,
"epoch": 3.0352859497070312,
"grad_norm": 0.001007239567115903,
"learning_rate": 1.7241382598876954e-05,
"lookahead_loss": 6.223367876529694,
"loss": 0.3212,
"step": 343500
},
{
"base_loss": 0.3244023490846157,
"epoch": 3.0362396240234375,
"grad_norm": 0.0009588600951246917,
"learning_rate": 1.719369888305664e-05,
"lookahead_loss": 6.34389029598236,
"loss": 0.3373,
"step": 344000
},
{
"base_loss": 0.30684786412119863,
"epoch": 3.0371932983398438,
"grad_norm": 0.0009947418002411723,
"learning_rate": 1.7146015167236328e-05,
"lookahead_loss": 6.284828133106232,
"loss": 0.3188,
"step": 344500
},
{
"base_loss": 0.30203218227624895,
"epoch": 3.03814697265625,
"grad_norm": 0.0009736179490573704,
"learning_rate": 1.7098331451416018e-05,
"lookahead_loss": 6.32041801738739,
"loss": 0.314,
"step": 345000
},
{
"epoch": 3.03814697265625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.322265743828429,
"eval_lookahead_perplexity": 556.8332056513128,
"eval_loss": 0.1417807936668396,
"eval_perplexity": 1.1523240241256398,
"eval_runtime": 265.1579,
"eval_samples_per_second": 18.857,
"eval_steps_per_second": 0.592,
"step": 345000
},
{
"base_loss": 0.30594835013151167,
"epoch": 3.0391006469726562,
"grad_norm": 0.0009133536368608475,
"learning_rate": 1.7050647735595705e-05,
"lookahead_loss": 6.269966838359832,
"loss": 0.319,
"step": 345500
},
{
"base_loss": 0.3232807823717594,
"epoch": 3.0400543212890625,
"grad_norm": 0.0009371156920678914,
"learning_rate": 1.700296401977539e-05,
"lookahead_loss": 6.320635152816773,
"loss": 0.3325,
"step": 346000
},
{
"base_loss": 0.30651690036058427,
"epoch": 3.0410079956054688,
"grad_norm": 0.000969213608186692,
"learning_rate": 1.6955280303955078e-05,
"lookahead_loss": 6.247298479557037,
"loss": 0.3148,
"step": 346500
},
{
"base_loss": 0.2957771936655045,
"epoch": 3.041961669921875,
"grad_norm": 0.0009626311366446316,
"learning_rate": 1.6907596588134765e-05,
"lookahead_loss": 6.325569787979126,
"loss": 0.3075,
"step": 347000
},
{
"base_loss": 0.310648419380188,
"epoch": 3.0429153442382812,
"grad_norm": 0.00098798715043813,
"learning_rate": 1.6859912872314455e-05,
"lookahead_loss": 6.343762699604034,
"loss": 0.3249,
"step": 347500
},
{
"base_loss": 0.3259735953062773,
"epoch": 3.0438690185546875,
"grad_norm": 0.0009963024640455842,
"learning_rate": 1.6812229156494142e-05,
"lookahead_loss": 6.38560670042038,
"loss": 0.3417,
"step": 348000
},
{
"base_loss": 0.295075288772583,
"epoch": 3.0448226928710938,
"grad_norm": 0.0009655926842242479,
"learning_rate": 1.676454544067383e-05,
"lookahead_loss": 6.27928000164032,
"loss": 0.31,
"step": 348500
},
{
"base_loss": 0.3027168534696102,
"epoch": 3.0457763671875,
"grad_norm": 0.0009574743453413248,
"learning_rate": 1.6716861724853516e-05,
"lookahead_loss": 6.29035155916214,
"loss": 0.3156,
"step": 349000
},
{
"base_loss": 0.33181600126624106,
"epoch": 3.0467300415039062,
"grad_norm": 0.0009480383596383035,
"learning_rate": 1.6669178009033203e-05,
"lookahead_loss": 6.259274421691894,
"loss": 0.3397,
"step": 349500
},
{
"base_loss": 0.3247646952867508,
"epoch": 3.0476837158203125,
"grad_norm": 0.001015504589304328,
"learning_rate": 1.6621494293212893e-05,
"lookahead_loss": 6.312367619514466,
"loss": 0.3392,
"step": 350000
},
{
"epoch": 3.0476837158203125,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.319038366738219,
"eval_lookahead_perplexity": 555.0389917800691,
"eval_loss": 0.14177383482456207,
"eval_perplexity": 1.1523160053124042,
"eval_runtime": 258.8888,
"eval_samples_per_second": 19.313,
"eval_steps_per_second": 0.606,
"step": 350000
},
{
"base_loss": 0.29319588682055475,
"epoch": 3.0486373901367188,
"grad_norm": 0.0009484434267506003,
"learning_rate": 1.657381057739258e-05,
"lookahead_loss": 6.280442509651184,
"loss": 0.3075,
"step": 350500
},
{
"base_loss": 0.3029366814792156,
"epoch": 3.049591064453125,
"grad_norm": 0.0009550508693791926,
"learning_rate": 1.6526126861572266e-05,
"lookahead_loss": 6.254170268058777,
"loss": 0.316,
"step": 351000
},
{
"base_loss": 0.32044321012496946,
"epoch": 3.0505447387695312,
"grad_norm": 0.0009002664592117071,
"learning_rate": 1.6478443145751953e-05,
"lookahead_loss": 6.356799177169799,
"loss": 0.3322,
"step": 351500
},
{
"base_loss": 0.3040877487659454,
"epoch": 3.0514984130859375,
"grad_norm": 0.0009919478325173259,
"learning_rate": 1.643075942993164e-05,
"lookahead_loss": 6.317004768371582,
"loss": 0.3172,
"step": 352000
},
{
"base_loss": 0.30488765078783037,
"epoch": 3.0524520874023438,
"grad_norm": 0.0010407265508547425,
"learning_rate": 1.638307571411133e-05,
"lookahead_loss": 6.25145507478714,
"loss": 0.3177,
"step": 352500
},
{
"base_loss": 0.32170946165919306,
"epoch": 3.05340576171875,
"grad_norm": 0.0009659235365688801,
"learning_rate": 1.6335391998291017e-05,
"lookahead_loss": 6.258784490585327,
"loss": 0.3297,
"step": 353000
},
{
"base_loss": 0.3580245627462864,
"epoch": 3.0543594360351562,
"grad_norm": 0.0009487209608778358,
"learning_rate": 1.6287708282470704e-05,
"lookahead_loss": 6.3121585865020755,
"loss": 0.3715,
"step": 353500
},
{
"base_loss": 0.29618824023008344,
"epoch": 3.0553131103515625,
"grad_norm": 0.0009817008394747972,
"learning_rate": 1.624002456665039e-05,
"lookahead_loss": 6.299326305389404,
"loss": 0.3075,
"step": 354000
},
{
"base_loss": 0.30493036335706714,
"epoch": 3.0562667846679688,
"grad_norm": 0.000914248637855053,
"learning_rate": 1.6192340850830078e-05,
"lookahead_loss": 6.316375274181366,
"loss": 0.3167,
"step": 354500
},
{
"base_loss": 0.318002614736557,
"epoch": 3.057220458984375,
"grad_norm": 0.000944992178119719,
"learning_rate": 1.6144657135009768e-05,
"lookahead_loss": 6.3369423885345455,
"loss": 0.3319,
"step": 355000
},
{
"epoch": 3.057220458984375,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.315732623822392,
"eval_lookahead_perplexity": 553.207204941304,
"eval_loss": 0.14176736772060394,
"eval_perplexity": 1.1523085531891022,
"eval_runtime": 271.1413,
"eval_samples_per_second": 18.441,
"eval_steps_per_second": 0.579,
"step": 355000
},
{
"base_loss": 0.3197241204380989,
"epoch": 3.0581741333007812,
"grad_norm": 0.0009679922368377447,
"learning_rate": 1.6096973419189455e-05,
"lookahead_loss": 6.325226506233215,
"loss": 0.3298,
"step": 355500
},
{
"base_loss": 0.2939224536716938,
"epoch": 3.0591278076171875,
"grad_norm": 0.0009332736954092979,
"learning_rate": 1.604928970336914e-05,
"lookahead_loss": 6.236564951896668,
"loss": 0.3079,
"step": 356000
},
{
"base_loss": 0.30190801098942754,
"epoch": 3.0600814819335938,
"grad_norm": 0.0009871459333226085,
"learning_rate": 1.6001605987548828e-05,
"lookahead_loss": 6.2921178107261655,
"loss": 0.3148,
"step": 356500
},
{
"base_loss": 0.3212427071630955,
"epoch": 3.06103515625,
"grad_norm": 0.0009790301555767655,
"learning_rate": 1.5953922271728515e-05,
"lookahead_loss": 6.265547143936157,
"loss": 0.3313,
"step": 357000
},
{
"base_loss": 0.3071871542930603,
"epoch": 3.0619888305664062,
"grad_norm": 0.001000607735477388,
"learning_rate": 1.5906238555908205e-05,
"lookahead_loss": 6.265981761932373,
"loss": 0.3169,
"step": 357500
},
{
"base_loss": 0.3058412022292614,
"epoch": 3.0629425048828125,
"grad_norm": 0.0009778927778825164,
"learning_rate": 1.5858554840087892e-05,
"lookahead_loss": 6.28272382068634,
"loss": 0.3154,
"step": 358000
},
{
"base_loss": 0.31580124926567077,
"epoch": 3.0638961791992188,
"grad_norm": 0.0009540447499603033,
"learning_rate": 1.581087112426758e-05,
"lookahead_loss": 6.302886658191681,
"loss": 0.329,
"step": 358500
},
{
"base_loss": 0.30067266592383385,
"epoch": 3.064849853515625,
"grad_norm": 0.000962612044531852,
"learning_rate": 1.5763187408447266e-05,
"lookahead_loss": 6.3211470890045165,
"loss": 0.3147,
"step": 359000
},
{
"base_loss": 0.3075073702633381,
"epoch": 3.0658035278320312,
"grad_norm": 0.0009687106939963996,
"learning_rate": 1.5715503692626953e-05,
"lookahead_loss": 6.239695079803467,
"loss": 0.3197,
"step": 359500
},
{
"base_loss": 0.30778184497356414,
"epoch": 3.0667572021484375,
"grad_norm": 0.0009693547617644072,
"learning_rate": 1.5667819976806643e-05,
"lookahead_loss": 6.236202743053436,
"loss": 0.3169,
"step": 360000
},
{
"epoch": 3.0667572021484375,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.312184997259999,
"eval_lookahead_perplexity": 551.2481094907844,
"eval_loss": 0.14176103472709656,
"eval_perplexity": 1.152301255649624,
"eval_runtime": 269.9401,
"eval_samples_per_second": 18.523,
"eval_steps_per_second": 0.582,
"step": 360000
},
{
"base_loss": 0.32953174033761023,
"epoch": 3.0677108764648438,
"grad_norm": 0.0010309175122529268,
"learning_rate": 1.562013626098633e-05,
"lookahead_loss": 6.302441148757935,
"loss": 0.3414,
"step": 360500
},
{
"base_loss": 0.3003321154117584,
"epoch": 3.06866455078125,
"grad_norm": 0.0010074395686388016,
"learning_rate": 1.5572452545166016e-05,
"lookahead_loss": 6.279360107421875,
"loss": 0.3096,
"step": 361000
},
{
"base_loss": 0.3017352370470762,
"epoch": 3.0696182250976562,
"grad_norm": 0.0009648427367210388,
"learning_rate": 1.5524768829345703e-05,
"lookahead_loss": 6.320928843975067,
"loss": 0.3148,
"step": 361500
},
{
"base_loss": 0.3461217338144779,
"epoch": 3.0705718994140625,
"grad_norm": 0.0009705660049803555,
"learning_rate": 1.547708511352539e-05,
"lookahead_loss": 6.2096345415115355,
"loss": 0.3589,
"step": 362000
},
{
"base_loss": 0.3151253694295883,
"epoch": 3.0715255737304688,
"grad_norm": 0.0009764889837242663,
"learning_rate": 1.542940139770508e-05,
"lookahead_loss": 6.254361331939697,
"loss": 0.3252,
"step": 362500
},
{
"base_loss": 0.3069308316111565,
"epoch": 3.072479248046875,
"grad_norm": 0.0009667645208537579,
"learning_rate": 1.5381717681884767e-05,
"lookahead_loss": 6.327083124160767,
"loss": 0.318,
"step": 363000
},
{
"base_loss": 0.30406438249349593,
"epoch": 3.0734329223632812,
"grad_norm": 0.0009305006824433804,
"learning_rate": 1.5334033966064454e-05,
"lookahead_loss": 6.285885016918183,
"loss": 0.317,
"step": 363500
},
{
"base_loss": 0.32663529852032663,
"epoch": 3.0743865966796875,
"grad_norm": 0.0009700483060441911,
"learning_rate": 1.528635025024414e-05,
"lookahead_loss": 6.314631988525391,
"loss": 0.3395,
"step": 364000
},
{
"base_loss": 0.30363860473036763,
"epoch": 3.0753402709960938,
"grad_norm": 0.0009474587277509272,
"learning_rate": 1.523866653442383e-05,
"lookahead_loss": 6.379505019664764,
"loss": 0.3149,
"step": 364500
},
{
"base_loss": 0.30343541222810744,
"epoch": 3.0762939453125,
"grad_norm": 0.0009159389301203191,
"learning_rate": 1.5190982818603516e-05,
"lookahead_loss": 6.336886539459228,
"loss": 0.3166,
"step": 365000
},
{
"epoch": 3.0762939453125,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.308753397518073,
"eval_lookahead_perplexity": 549.3596886259452,
"eval_loss": 0.14175455272197723,
"eval_perplexity": 1.1522937864511935,
"eval_runtime": 258.1196,
"eval_samples_per_second": 19.371,
"eval_steps_per_second": 0.608,
"step": 365000
},
{
"base_loss": 0.32582479974627493,
"epoch": 3.0772476196289062,
"grad_norm": 0.0009780022082850337,
"learning_rate": 1.5143299102783205e-05,
"lookahead_loss": 6.3267164087295535,
"loss": 0.3404,
"step": 365500
},
{
"base_loss": 0.30336130890250207,
"epoch": 3.0782012939453125,
"grad_norm": 0.000988679938018322,
"learning_rate": 1.5095615386962891e-05,
"lookahead_loss": 6.332203316688537,
"loss": 0.3175,
"step": 366000
},
{
"base_loss": 0.29930157062411306,
"epoch": 3.0791549682617188,
"grad_norm": 0.0009294356568716466,
"learning_rate": 1.5047931671142578e-05,
"lookahead_loss": 6.334248623847961,
"loss": 0.3128,
"step": 366500
},
{
"base_loss": 0.30971532610058783,
"epoch": 3.080108642578125,
"grad_norm": 0.0009678815258666873,
"learning_rate": 1.5000247955322267e-05,
"lookahead_loss": 6.314932657241822,
"loss": 0.3278,
"step": 367000
},
{
"base_loss": 0.3241646741628647,
"epoch": 3.0810623168945312,
"grad_norm": 0.0009766423609107733,
"learning_rate": 1.4952564239501954e-05,
"lookahead_loss": 6.3299585676193235,
"loss": 0.3401,
"step": 367500
},
{
"base_loss": 0.3009178417623043,
"epoch": 3.0820159912109375,
"grad_norm": 0.0008978794794529676,
"learning_rate": 1.4904880523681642e-05,
"lookahead_loss": 6.32042356300354,
"loss": 0.313,
"step": 368000
},
{
"base_loss": 0.3062888396978378,
"epoch": 3.0829696655273438,
"grad_norm": 0.0009827233152464032,
"learning_rate": 1.4857196807861329e-05,
"lookahead_loss": 6.378310326099395,
"loss": 0.3195,
"step": 368500
},
{
"base_loss": 0.33434281674027444,
"epoch": 3.08392333984375,
"grad_norm": 0.000925905245821923,
"learning_rate": 1.4809513092041016e-05,
"lookahead_loss": 6.374326509475708,
"loss": 0.345,
"step": 369000
},
{
"base_loss": 0.3093685868382454,
"epoch": 3.0848770141601562,
"grad_norm": 0.0009787451708689332,
"learning_rate": 1.4761829376220704e-05,
"lookahead_loss": 6.290310836315155,
"loss": 0.3189,
"step": 369500
},
{
"base_loss": 0.2982211470901966,
"epoch": 3.0858306884765625,
"grad_norm": 0.000987353385426104,
"learning_rate": 1.4714145660400391e-05,
"lookahead_loss": 6.305355979442597,
"loss": 0.3087,
"step": 370000
},
{
"epoch": 3.0858306884765625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.305268980062808,
"eval_lookahead_perplexity": 547.448821200929,
"eval_loss": 0.14174824953079224,
"eval_perplexity": 1.1522865233460466,
"eval_runtime": 275.1775,
"eval_samples_per_second": 18.17,
"eval_steps_per_second": 0.571,
"step": 370000
},
{
"base_loss": 0.30163490331172943,
"epoch": 3.0867843627929688,
"grad_norm": 0.001101371948607266,
"learning_rate": 1.466646194458008e-05,
"lookahead_loss": 6.271534814834594,
"loss": 0.313,
"step": 370500
},
{
"base_loss": 0.33820600137114526,
"epoch": 3.087738037109375,
"grad_norm": 0.0009093422559089959,
"learning_rate": 1.4618778228759766e-05,
"lookahead_loss": 6.3157040328979495,
"loss": 0.344,
"step": 371000
},
{
"base_loss": 0.30079344487190246,
"epoch": 3.0886917114257812,
"grad_norm": 0.0010120077058672905,
"learning_rate": 1.4571094512939453e-05,
"lookahead_loss": 6.31459610080719,
"loss": 0.3135,
"step": 371500
},
{
"base_loss": 0.3107509427666664,
"epoch": 3.0896453857421875,
"grad_norm": 0.0009789596078917384,
"learning_rate": 1.4523410797119142e-05,
"lookahead_loss": 6.341436826229096,
"loss": 0.3187,
"step": 372000
},
{
"base_loss": 0.30222347214818,
"epoch": 3.0905990600585938,
"grad_norm": 0.0009555344004184008,
"learning_rate": 1.4475727081298829e-05,
"lookahead_loss": 6.318659209251404,
"loss": 0.3128,
"step": 372500
},
{
"base_loss": 0.3001328880786896,
"epoch": 3.091552734375,
"grad_norm": 0.0009132505510933697,
"learning_rate": 1.4428043365478517e-05,
"lookahead_loss": 6.288368509769439,
"loss": 0.3099,
"step": 373000
},
{
"base_loss": 0.31811392498016355,
"epoch": 3.0925064086914062,
"grad_norm": 0.0009298596996814013,
"learning_rate": 1.4380359649658204e-05,
"lookahead_loss": 6.329743075847626,
"loss": 0.3326,
"step": 373500
},
{
"base_loss": 0.31058550345897673,
"epoch": 3.0934600830078125,
"grad_norm": 0.0008941558189690113,
"learning_rate": 1.433267593383789e-05,
"lookahead_loss": 6.32411856842041,
"loss": 0.3212,
"step": 374000
},
{
"base_loss": 0.28807635736465453,
"epoch": 3.0944137573242188,
"grad_norm": 0.0009412313811480999,
"learning_rate": 1.428499221801758e-05,
"lookahead_loss": 6.325871033668518,
"loss": 0.3033,
"step": 374500
},
{
"base_loss": 0.2929033098220825,
"epoch": 3.095367431640625,
"grad_norm": 0.0009629988926462829,
"learning_rate": 1.4237308502197266e-05,
"lookahead_loss": 6.251724208831787,
"loss": 0.3074,
"step": 375000
},
{
"epoch": 3.095367431640625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.302847357222828,
"eval_lookahead_perplexity": 546.1247105271912,
"eval_loss": 0.14174294471740723,
"eval_perplexity": 1.1522804106972875,
"eval_runtime": 253.3905,
"eval_samples_per_second": 19.732,
"eval_steps_per_second": 0.62,
"step": 375000
},
{
"base_loss": 0.3016510356664658,
"epoch": 3.0963211059570312,
"grad_norm": 0.0009708671714179218,
"learning_rate": 1.4189624786376955e-05,
"lookahead_loss": 6.314412651538849,
"loss": 0.3143,
"step": 375500
},
{
"base_loss": 0.32936670687794684,
"epoch": 3.0972747802734375,
"grad_norm": 0.0009516220889054239,
"learning_rate": 1.4141941070556641e-05,
"lookahead_loss": 6.348289193153382,
"loss": 0.3409,
"step": 376000
},
{
"base_loss": 0.29046559768915176,
"epoch": 3.0982284545898438,
"grad_norm": 0.0009895728435367346,
"learning_rate": 1.4094257354736328e-05,
"lookahead_loss": 6.274867289066314,
"loss": 0.3055,
"step": 376500
},
{
"base_loss": 0.29554841595888137,
"epoch": 3.09918212890625,
"grad_norm": 0.0009511645184829831,
"learning_rate": 1.4046573638916017e-05,
"lookahead_loss": 6.323284810066223,
"loss": 0.31,
"step": 377000
},
{
"base_loss": 0.3033588379621506,
"epoch": 3.1001358032226562,
"grad_norm": 0.0009181915083900094,
"learning_rate": 1.3998889923095704e-05,
"lookahead_loss": 6.335620421409607,
"loss": 0.3154,
"step": 377500
},
{
"base_loss": 0.32281789609789846,
"epoch": 3.1010894775390625,
"grad_norm": 0.0009614496375434101,
"learning_rate": 1.3951206207275392e-05,
"lookahead_loss": 6.400040787696838,
"loss": 0.3334,
"step": 378000
},
{
"base_loss": 0.30253357443213463,
"epoch": 3.1020431518554688,
"grad_norm": 0.0009595219744369388,
"learning_rate": 1.3903522491455079e-05,
"lookahead_loss": 6.356647694587708,
"loss": 0.3143,
"step": 378500
},
{
"base_loss": 0.2986242602169514,
"epoch": 3.102996826171875,
"grad_norm": 0.0009835211094468832,
"learning_rate": 1.3855838775634766e-05,
"lookahead_loss": 6.328301607131958,
"loss": 0.3132,
"step": 379000
},
{
"base_loss": 0.2968667206466198,
"epoch": 3.1039505004882812,
"grad_norm": 0.0009939175797626376,
"learning_rate": 1.3808155059814454e-05,
"lookahead_loss": 6.348633293628692,
"loss": 0.3109,
"step": 379500
},
{
"base_loss": 0.3167901526391506,
"epoch": 3.1049041748046875,
"grad_norm": 0.0009683977696113288,
"learning_rate": 1.3760471343994141e-05,
"lookahead_loss": 6.29404914855957,
"loss": 0.3288,
"step": 380000
},
{
"epoch": 3.1049041748046875,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.3002430775675915,
"eval_lookahead_perplexity": 544.7042994310043,
"eval_loss": 0.14173762500286102,
"eval_perplexity": 1.1522742809107298,
"eval_runtime": 271.874,
"eval_samples_per_second": 18.391,
"eval_steps_per_second": 0.577,
"step": 380000
},
{
"base_loss": 0.3047486243546009,
"epoch": 3.1058578491210938,
"grad_norm": 0.0009209378040395677,
"learning_rate": 1.371278762817383e-05,
"lookahead_loss": 6.285359220504761,
"loss": 0.3195,
"step": 380500
},
{
"base_loss": 0.2987241801023483,
"epoch": 3.1068115234375,
"grad_norm": 0.0009467555209994316,
"learning_rate": 1.3665103912353516e-05,
"lookahead_loss": 6.353755208015442,
"loss": 0.3091,
"step": 381000
},
{
"base_loss": 0.29589339858293534,
"epoch": 3.1077651977539062,
"grad_norm": 0.000994194415397942,
"learning_rate": 1.3617420196533203e-05,
"lookahead_loss": 6.2666127576828,
"loss": 0.3074,
"step": 381500
},
{
"base_loss": 0.32084586623311045,
"epoch": 3.1087188720703125,
"grad_norm": 0.0009913091780617833,
"learning_rate": 1.3569736480712892e-05,
"lookahead_loss": 6.280218794345855,
"loss": 0.333,
"step": 382000
},
{
"base_loss": 0.31951685512065886,
"epoch": 3.1096725463867188,
"grad_norm": 0.0009130350081250072,
"learning_rate": 1.3522052764892579e-05,
"lookahead_loss": 6.237833600521087,
"loss": 0.3277,
"step": 382500
},
{
"base_loss": 0.30553389444947243,
"epoch": 3.110626220703125,
"grad_norm": 0.0009011180372908711,
"learning_rate": 1.3474369049072265e-05,
"lookahead_loss": 6.207389236927033,
"loss": 0.3161,
"step": 383000
},
{
"base_loss": 0.29624236226081846,
"epoch": 3.1115798950195312,
"grad_norm": 0.0009934761328622699,
"learning_rate": 1.3426685333251954e-05,
"lookahead_loss": 6.32315664100647,
"loss": 0.3073,
"step": 383500
},
{
"base_loss": 0.3096672693490982,
"epoch": 3.1125335693359375,
"grad_norm": 0.0009398029651492834,
"learning_rate": 1.337900161743164e-05,
"lookahead_loss": 6.333669634819031,
"loss": 0.3207,
"step": 384000
},
{
"base_loss": 0.34586220744252205,
"epoch": 3.1134872436523438,
"grad_norm": 0.000988540006801486,
"learning_rate": 1.333131790161133e-05,
"lookahead_loss": 6.357729611396789,
"loss": 0.3524,
"step": 384500
},
{
"base_loss": 0.2944044529795647,
"epoch": 3.11444091796875,
"grad_norm": 0.0009240133222192526,
"learning_rate": 1.3283634185791016e-05,
"lookahead_loss": 6.241785104751587,
"loss": 0.3082,
"step": 385000
},
{
"epoch": 3.11444091796875,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.29768767372107,
"eval_lookahead_perplexity": 543.3141369388018,
"eval_loss": 0.14173269271850586,
"eval_perplexity": 1.152268597580337,
"eval_runtime": 266.7684,
"eval_samples_per_second": 18.743,
"eval_steps_per_second": 0.589,
"step": 385000
},
{
"base_loss": 0.29711622232198714,
"epoch": 3.1153945922851562,
"grad_norm": 0.0009787186281755567,
"learning_rate": 1.3235950469970703e-05,
"lookahead_loss": 6.293258435726166,
"loss": 0.3096,
"step": 385500
},
{
"base_loss": 0.3151656058430672,
"epoch": 3.1163482666015625,
"grad_norm": 0.001029650797136128,
"learning_rate": 1.3188266754150391e-05,
"lookahead_loss": 6.278011352539062,
"loss": 0.3247,
"step": 386000
},
{
"base_loss": 0.32477895976603033,
"epoch": 3.1173019409179688,
"grad_norm": 0.0009709696751087904,
"learning_rate": 1.3140583038330078e-05,
"lookahead_loss": 6.306861896514892,
"loss": 0.3409,
"step": 386500
},
{
"base_loss": 0.3063432638645172,
"epoch": 3.118255615234375,
"grad_norm": 0.0010217542294412851,
"learning_rate": 1.3092899322509767e-05,
"lookahead_loss": 6.320661119937896,
"loss": 0.3155,
"step": 387000
},
{
"base_loss": 0.30022109842300415,
"epoch": 3.1192092895507812,
"grad_norm": 0.0009462428861297667,
"learning_rate": 1.3045215606689454e-05,
"lookahead_loss": 6.326066428661346,
"loss": 0.3108,
"step": 387500
},
{
"base_loss": 0.30673753410577775,
"epoch": 4.000953674316406,
"grad_norm": 0.0009463900933042169,
"learning_rate": 1.299753189086914e-05,
"lookahead_loss": 6.373164211273194,
"loss": 0.3143,
"step": 388000
},
{
"base_loss": 0.29716887477040294,
"epoch": 4.0019073486328125,
"grad_norm": 0.0010187909938395023,
"learning_rate": 1.2949848175048829e-05,
"lookahead_loss": 6.219348033428192,
"loss": 0.3109,
"step": 388500
},
{
"base_loss": 0.3124042835831642,
"epoch": 4.002861022949219,
"grad_norm": 0.0009658048511482775,
"learning_rate": 1.2902164459228516e-05,
"lookahead_loss": 6.20941606426239,
"loss": 0.3217,
"step": 389000
},
{
"base_loss": 0.32272179606556894,
"epoch": 4.003814697265625,
"grad_norm": 0.0009669990977272391,
"learning_rate": 1.2854480743408204e-05,
"lookahead_loss": 6.2338160667419436,
"loss": 0.3353,
"step": 389500
},
{
"base_loss": 0.3010484471619129,
"epoch": 4.004768371582031,
"grad_norm": 0.0009306691936217248,
"learning_rate": 1.2806797027587891e-05,
"lookahead_loss": 6.211200427055359,
"loss": 0.3171,
"step": 390000
},
{
"epoch": 4.004768371582031,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.2949571426684106,
"eval_lookahead_perplexity": 541.8326243950327,
"eval_loss": 0.14172740280628204,
"eval_perplexity": 1.1522625021967197,
"eval_runtime": 259.6181,
"eval_samples_per_second": 19.259,
"eval_steps_per_second": 0.605,
"step": 390000
},
{
"base_loss": 0.304677970200777,
"epoch": 4.0057220458984375,
"grad_norm": 0.0008401920204050839,
"learning_rate": 1.2759113311767578e-05,
"lookahead_loss": 6.327916157245636,
"loss": 0.3134,
"step": 390500
},
{
"base_loss": 0.2963202897310257,
"epoch": 4.006675720214844,
"grad_norm": 0.0009407810866832733,
"learning_rate": 1.2711429595947266e-05,
"lookahead_loss": 6.200181405544281,
"loss": 0.3115,
"step": 391000
},
{
"base_loss": 0.3113421536386013,
"epoch": 4.00762939453125,
"grad_norm": 0.0009980009635910392,
"learning_rate": 1.2663745880126953e-05,
"lookahead_loss": 6.251565413475037,
"loss": 0.3226,
"step": 391500
},
{
"base_loss": 0.31501844617724417,
"epoch": 4.008583068847656,
"grad_norm": 0.000927921908441931,
"learning_rate": 1.2616062164306642e-05,
"lookahead_loss": 6.240460067272187,
"loss": 0.321,
"step": 392000
},
{
"base_loss": 0.30433804252743724,
"epoch": 4.0095367431640625,
"grad_norm": 0.0009920436423271894,
"learning_rate": 1.2568378448486329e-05,
"lookahead_loss": 6.250067127704621,
"loss": 0.3173,
"step": 392500
},
{
"base_loss": 0.300560106664896,
"epoch": 4.010490417480469,
"grad_norm": 0.0009590853005647659,
"learning_rate": 1.2520694732666015e-05,
"lookahead_loss": 6.2296731300354,
"loss": 0.3111,
"step": 393000
},
{
"base_loss": 0.3013677542209625,
"epoch": 4.011444091796875,
"grad_norm": 0.0010011434787884355,
"learning_rate": 1.2473011016845704e-05,
"lookahead_loss": 6.2316191611289975,
"loss": 0.3144,
"step": 393500
},
{
"base_loss": 0.3262205650210381,
"epoch": 4.012397766113281,
"grad_norm": 0.0008804492536000907,
"learning_rate": 1.242532730102539e-05,
"lookahead_loss": 6.226336854934693,
"loss": 0.3349,
"step": 394000
},
{
"base_loss": 0.30570278534293177,
"epoch": 4.0133514404296875,
"grad_norm": 0.0009209011332131922,
"learning_rate": 1.237764358520508e-05,
"lookahead_loss": 6.313926457881927,
"loss": 0.319,
"step": 394500
},
{
"base_loss": 0.2983907374441624,
"epoch": 4.014305114746094,
"grad_norm": 0.0009333580383099616,
"learning_rate": 1.2329959869384766e-05,
"lookahead_loss": 6.263853558540344,
"loss": 0.3132,
"step": 395000
},
{
"epoch": 4.014305114746094,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.2924738585377655,
"eval_lookahead_perplexity": 540.4887693149997,
"eval_loss": 0.14172253012657166,
"eval_perplexity": 1.1522568876042834,
"eval_runtime": 266.6176,
"eval_samples_per_second": 18.753,
"eval_steps_per_second": 0.589,
"step": 395000
},
{
"base_loss": 0.2952827827334404,
"epoch": 4.0152587890625,
"grad_norm": 0.0009197366307489574,
"learning_rate": 1.2282276153564453e-05,
"lookahead_loss": 6.225392914295196,
"loss": 0.3071,
"step": 395500
},
{
"base_loss": 0.3103551665246487,
"epoch": 4.016212463378906,
"grad_norm": 0.0009927282808348536,
"learning_rate": 1.2234592437744141e-05,
"lookahead_loss": 6.285288751602173,
"loss": 0.3233,
"step": 396000
},
{
"base_loss": 0.31515152567625043,
"epoch": 4.0171661376953125,
"grad_norm": 0.0008934010402299464,
"learning_rate": 1.2186908721923828e-05,
"lookahead_loss": 6.318837342262268,
"loss": 0.3244,
"step": 396500
},
{
"base_loss": 0.2997098692059517,
"epoch": 4.018119812011719,
"grad_norm": 0.0009356258087791502,
"learning_rate": 1.2139225006103517e-05,
"lookahead_loss": 6.3236184453964235,
"loss": 0.3105,
"step": 397000
},
{
"base_loss": 0.2971528458297253,
"epoch": 4.019073486328125,
"grad_norm": 0.0009439104469493032,
"learning_rate": 1.2091541290283204e-05,
"lookahead_loss": 6.3374961051940915,
"loss": 0.3097,
"step": 397500
},
{
"base_loss": 0.30046086144447326,
"epoch": 4.020027160644531,
"grad_norm": 0.001036732573993504,
"learning_rate": 1.204385757446289e-05,
"lookahead_loss": 6.184789316654205,
"loss": 0.3137,
"step": 398000
},
{
"base_loss": 0.32556178280711173,
"epoch": 4.0209808349609375,
"grad_norm": 0.0009528475347906351,
"learning_rate": 1.1996173858642579e-05,
"lookahead_loss": 6.28066121339798,
"loss": 0.3384,
"step": 398500
},
{
"base_loss": 0.30630752837657926,
"epoch": 4.021934509277344,
"grad_norm": 0.0009664383833296597,
"learning_rate": 1.1948490142822266e-05,
"lookahead_loss": 6.229596662521362,
"loss": 0.3151,
"step": 399000
},
{
"base_loss": 0.2993029763698578,
"epoch": 4.02288818359375,
"grad_norm": 0.0009780285181477666,
"learning_rate": 1.1900806427001954e-05,
"lookahead_loss": 6.271652706146241,
"loss": 0.3126,
"step": 399500
},
{
"base_loss": 0.3033270851969719,
"epoch": 4.023841857910156,
"grad_norm": 0.0009725343552418053,
"learning_rate": 1.1853122711181641e-05,
"lookahead_loss": 6.217432358264923,
"loss": 0.3153,
"step": 400000
},
{
"epoch": 4.023841857910156,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.289627787404167,
"eval_lookahead_perplexity": 538.9526867675877,
"eval_loss": 0.14171701669692993,
"eval_perplexity": 1.1522505347345173,
"eval_runtime": 269.7244,
"eval_samples_per_second": 18.537,
"eval_steps_per_second": 0.582,
"step": 400000
},
{
"base_loss": 0.32289262464642526,
"epoch": 4.0247955322265625,
"grad_norm": 0.0009039760916493833,
"learning_rate": 1.1805438995361328e-05,
"lookahead_loss": 6.247900634765625,
"loss": 0.333,
"step": 400500
},
{
"base_loss": 0.3097568289935589,
"epoch": 4.025749206542969,
"grad_norm": 0.0009861037833616138,
"learning_rate": 1.1757755279541016e-05,
"lookahead_loss": 6.196441818714142,
"loss": 0.323,
"step": 401000
},
{
"base_loss": 0.30243406727910044,
"epoch": 4.026702880859375,
"grad_norm": 0.0010050033451989293,
"learning_rate": 1.1710071563720703e-05,
"lookahead_loss": 6.22297222328186,
"loss": 0.3127,
"step": 401500
},
{
"base_loss": 0.3076402995288372,
"epoch": 4.027656555175781,
"grad_norm": 0.0009313607588410378,
"learning_rate": 1.1662387847900392e-05,
"lookahead_loss": 6.321592648506164,
"loss": 0.3182,
"step": 402000
},
{
"base_loss": 0.3292808674275875,
"epoch": 4.0286102294921875,
"grad_norm": 0.0009357398957945406,
"learning_rate": 1.1614704132080079e-05,
"lookahead_loss": 6.344618997097015,
"loss": 0.3395,
"step": 402500
},
{
"base_loss": 0.30565082639455793,
"epoch": 4.029563903808594,
"grad_norm": 0.0009264600230380893,
"learning_rate": 1.1567020416259765e-05,
"lookahead_loss": 6.279807175636291,
"loss": 0.3144,
"step": 403000
},
{
"base_loss": 0.3062080657184124,
"epoch": 4.030517578125,
"grad_norm": 0.0009344466379843652,
"learning_rate": 1.1519336700439454e-05,
"lookahead_loss": 6.271709934711456,
"loss": 0.3164,
"step": 403500
},
{
"base_loss": 0.3015955919623375,
"epoch": 4.031471252441406,
"grad_norm": 0.000959421566221863,
"learning_rate": 1.147165298461914e-05,
"lookahead_loss": 6.298808106422424,
"loss": 0.314,
"step": 404000
},
{
"base_loss": 0.3150785211026669,
"epoch": 4.0324249267578125,
"grad_norm": 0.0009721990791149437,
"learning_rate": 1.142396926879883e-05,
"lookahead_loss": 6.2469435338974,
"loss": 0.3334,
"step": 404500
},
{
"base_loss": 0.30686600294709204,
"epoch": 4.033378601074219,
"grad_norm": 0.0010035648010671139,
"learning_rate": 1.1376285552978516e-05,
"lookahead_loss": 6.3015628657341,
"loss": 0.317,
"step": 405000
},
{
"epoch": 4.033378601074219,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.287427164114321,
"eval_lookahead_perplexity": 537.7679589808528,
"eval_loss": 0.1417127251625061,
"eval_perplexity": 1.1522455898222932,
"eval_runtime": 259.3547,
"eval_samples_per_second": 19.279,
"eval_steps_per_second": 0.605,
"step": 405000
},
{
"base_loss": 0.30655504322052,
"epoch": 1.0009536743164062,
"grad_norm": 0.0009588917600922287,
"learning_rate": 1.1328601837158203e-05,
"lookahead_loss": 6.336568244457245,
"loss": 0.3139,
"step": 405500
},
{
"base_loss": 0.3002312153875828,
"epoch": 1.0019073486328125,
"grad_norm": 0.0009991881670430303,
"learning_rate": 1.1280918121337891e-05,
"lookahead_loss": 6.180880591869355,
"loss": 0.3128,
"step": 406000
},
{
"base_loss": 0.312505132496357,
"epoch": 1.0028610229492188,
"grad_norm": 0.0009822511347010732,
"learning_rate": 1.1233234405517578e-05,
"lookahead_loss": 6.171671084403991,
"loss": 0.3219,
"step": 406500
},
{
"base_loss": 0.3240452491641045,
"epoch": 1.003814697265625,
"grad_norm": 0.0009494374971836805,
"learning_rate": 1.1185550689697267e-05,
"lookahead_loss": 6.2043786425590515,
"loss": 0.3358,
"step": 407000
},
{
"base_loss": 0.29858038023114203,
"epoch": 1.0047683715820312,
"grad_norm": 0.0009324284037575126,
"learning_rate": 1.1137866973876954e-05,
"lookahead_loss": 6.192115921020508,
"loss": 0.3133,
"step": 407500
},
{
"base_loss": 0.3042404046058655,
"epoch": 1.0057220458984375,
"grad_norm": 0.0008426170097663999,
"learning_rate": 1.109018325805664e-05,
"lookahead_loss": 6.314742174148559,
"loss": 0.3128,
"step": 408000
},
{
"base_loss": 0.29714440524578095,
"epoch": 1.0066757202148438,
"grad_norm": 0.0009390473132953048,
"learning_rate": 1.1042499542236329e-05,
"lookahead_loss": 6.173552748680115,
"loss": 0.3123,
"step": 408500
},
{
"base_loss": 0.31379624953866003,
"epoch": 1.00762939453125,
"grad_norm": 0.0009832140058279037,
"learning_rate": 1.0994815826416016e-05,
"lookahead_loss": 6.232072317123413,
"loss": 0.3242,
"step": 409000
},
{
"base_loss": 0.31622857597470283,
"epoch": 1.0085830688476562,
"grad_norm": 0.0009228273993358016,
"learning_rate": 1.0947132110595704e-05,
"lookahead_loss": 6.214964485168457,
"loss": 0.3228,
"step": 409500
},
{
"base_loss": 0.3033116071224213,
"epoch": 1.0095367431640625,
"grad_norm": 0.000982031342573464,
"learning_rate": 1.0899448394775391e-05,
"lookahead_loss": 6.235745645046234,
"loss": 0.3161,
"step": 410000
},
{
"epoch": 1.0095367431640625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.285334576433078,
"eval_lookahead_perplexity": 536.6438089759619,
"eval_loss": 0.14170871675014496,
"eval_perplexity": 1.1522409711560848,
"eval_runtime": 271.4698,
"eval_samples_per_second": 18.418,
"eval_steps_per_second": 0.578,
"step": 410000
},
{
"base_loss": 0.302242584168911,
"epoch": 1.0104904174804688,
"grad_norm": 0.0009649608400650322,
"learning_rate": 1.0851764678955078e-05,
"lookahead_loss": 6.201606332778931,
"loss": 0.3116,
"step": 410500
},
{
"base_loss": 0.3031807193160057,
"epoch": 1.011444091796875,
"grad_norm": 0.000979897566139698,
"learning_rate": 1.0804080963134766e-05,
"lookahead_loss": 6.214184216022492,
"loss": 0.3162,
"step": 411000
},
{
"base_loss": 0.324542246311903,
"epoch": 1.0123977661132812,
"grad_norm": 0.0008748429245315492,
"learning_rate": 1.0756397247314453e-05,
"lookahead_loss": 6.212394516944885,
"loss": 0.3343,
"step": 411500
},
{
"base_loss": 0.3043093577325344,
"epoch": 1.0133514404296875,
"grad_norm": 0.0009360151016153395,
"learning_rate": 1.0708713531494142e-05,
"lookahead_loss": 6.3113635568618776,
"loss": 0.3195,
"step": 412000
},
{
"base_loss": 0.29890961676836014,
"epoch": 1.0143051147460938,
"grad_norm": 0.0009034210816025734,
"learning_rate": 1.0661029815673829e-05,
"lookahead_loss": 6.2563081007003785,
"loss": 0.3139,
"step": 412500
},
{
"base_loss": 0.2968312213420868,
"epoch": 1.0152587890625,
"grad_norm": 0.0009093395201489329,
"learning_rate": 1.0613346099853515e-05,
"lookahead_loss": 6.21161489534378,
"loss": 0.3077,
"step": 413000
},
{
"base_loss": 0.309987826526165,
"epoch": 1.0162124633789062,
"grad_norm": 0.0009834787342697382,
"learning_rate": 1.0565662384033204e-05,
"lookahead_loss": 6.257289779186249,
"loss": 0.3218,
"step": 413500
},
{
"base_loss": 0.3124798896312714,
"epoch": 1.0171661376953125,
"grad_norm": 0.0009035744587890804,
"learning_rate": 1.051797866821289e-05,
"lookahead_loss": 6.281959458351135,
"loss": 0.323,
"step": 414000
},
{
"base_loss": 0.30385399025678633,
"epoch": 1.0181198120117188,
"grad_norm": 0.0009486905764788389,
"learning_rate": 1.047029495239258e-05,
"lookahead_loss": 6.2897271089553834,
"loss": 0.3125,
"step": 414500
},
{
"base_loss": 0.2997076933085918,
"epoch": 1.019073486328125,
"grad_norm": 0.0009449619683437049,
"learning_rate": 1.0422611236572266e-05,
"lookahead_loss": 6.319130481719971,
"loss": 0.31,
"step": 415000
},
{
"epoch": 1.019073486328125,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.283114870516256,
"eval_lookahead_perplexity": 535.4539386076507,
"eval_loss": 0.14170418679714203,
"eval_perplexity": 1.1522357515704595,
"eval_runtime": 302.1089,
"eval_samples_per_second": 16.55,
"eval_steps_per_second": 0.52,
"step": 415000
},
{
"base_loss": 0.30210260692238805,
"epoch": 1.0200271606445312,
"grad_norm": 0.0010286852484568954,
"learning_rate": 1.0374927520751953e-05,
"lookahead_loss": 6.147320285797119,
"loss": 0.3146,
"step": 415500
},
{
"base_loss": 0.3285051781535149,
"epoch": 1.0209808349609375,
"grad_norm": 0.0009719706140458584,
"learning_rate": 1.0327243804931641e-05,
"lookahead_loss": 6.239666066169739,
"loss": 0.3382,
"step": 416000
},
{
"base_loss": 0.30326452678442,
"epoch": 1.0219345092773438,
"grad_norm": 0.0009495667181909084,
"learning_rate": 1.0279560089111328e-05,
"lookahead_loss": 6.200324444770813,
"loss": 0.3137,
"step": 416500
},
{
"base_loss": 0.29889601907134056,
"epoch": 1.02288818359375,
"grad_norm": 0.0009736404754221439,
"learning_rate": 1.0231876373291017e-05,
"lookahead_loss": 6.251074130535126,
"loss": 0.3114,
"step": 417000
},
{
"base_loss": 0.3006108500063419,
"epoch": 1.0238418579101562,
"grad_norm": 0.0009736517095007002,
"learning_rate": 1.0184192657470704e-05,
"lookahead_loss": 6.2008349332809445,
"loss": 0.3122,
"step": 417500
},
{
"base_loss": 0.3237688979506493,
"epoch": 1.0247955322265625,
"grad_norm": 0.0008906475268304348,
"learning_rate": 1.013650894165039e-05,
"lookahead_loss": 6.22595210647583,
"loss": 0.3352,
"step": 418000
},
{
"base_loss": 0.3078545735180378,
"epoch": 1.0257492065429688,
"grad_norm": 0.0009600927005521953,
"learning_rate": 1.0088825225830079e-05,
"lookahead_loss": 6.177468316078186,
"loss": 0.3217,
"step": 418500
},
{
"base_loss": 0.3022345977425575,
"epoch": 1.026702880859375,
"grad_norm": 0.001004669931717217,
"learning_rate": 1.0041141510009766e-05,
"lookahead_loss": 6.194265043735504,
"loss": 0.3106,
"step": 419000
},
{
"base_loss": 0.3071480156183243,
"epoch": 1.0276565551757812,
"grad_norm": 0.0009501728927716613,
"learning_rate": 9.993457794189454e-06,
"lookahead_loss": 6.303304790496826,
"loss": 0.3183,
"step": 419500
},
{
"base_loss": 0.3302598208785057,
"epoch": 1.0286102294921875,
"grad_norm": 0.0009357588132843375,
"learning_rate": 9.945774078369141e-06,
"lookahead_loss": 6.331861734390259,
"loss": 0.3412,
"step": 420000
},
{
"epoch": 1.0286102294921875,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.281080325190633,
"eval_lookahead_perplexity": 534.3656407708145,
"eval_loss": 0.14170029759407043,
"eval_perplexity": 1.1522312703003497,
"eval_runtime": 283.7378,
"eval_samples_per_second": 17.622,
"eval_steps_per_second": 0.553,
"step": 420000
},
{
"base_loss": 0.3027013133764267,
"epoch": 1.0295639038085938,
"grad_norm": 0.0009330170578323305,
"learning_rate": 9.898090362548828e-06,
"lookahead_loss": 6.26250266456604,
"loss": 0.3116,
"step": 420500
},
{
"base_loss": 0.3046494301855564,
"epoch": 1.030517578125,
"grad_norm": 0.0009495181730017066,
"learning_rate": 9.850406646728516e-06,
"lookahead_loss": 6.2420420794487,
"loss": 0.3156,
"step": 421000
},
{
"base_loss": 0.3023626366853714,
"epoch": 1.0314712524414062,
"grad_norm": 0.0009388374746777117,
"learning_rate": 9.802722930908203e-06,
"lookahead_loss": 6.270756626129151,
"loss": 0.3143,
"step": 421500
},
{
"base_loss": 0.3171934984624386,
"epoch": 1.0324249267578125,
"grad_norm": 0.0009781798580661416,
"learning_rate": 9.755039215087892e-06,
"lookahead_loss": 6.233568662643433,
"loss": 0.3343,
"step": 422000
},
{
"base_loss": 0.305971223294735,
"epoch": 1.0333786010742188,
"grad_norm": 0.0009960117749869823,
"learning_rate": 9.707355499267579e-06,
"lookahead_loss": 6.261573143005371,
"loss": 0.3149,
"step": 422500
},
{
"base_loss": 0.3008191674053669,
"epoch": 1.034332275390625,
"grad_norm": 0.000891255447641015,
"learning_rate": 9.659671783447265e-06,
"lookahead_loss": 6.343621186256408,
"loss": 0.3133,
"step": 423000
},
{
"base_loss": 0.3125488177835941,
"epoch": 1.0352859497070312,
"grad_norm": 0.0010064059169963002,
"learning_rate": 9.611988067626954e-06,
"lookahead_loss": 6.176242787361145,
"loss": 0.3221,
"step": 423500
},
{
"base_loss": 0.32382212686538697,
"epoch": 1.0362396240234375,
"grad_norm": 0.0009808322647586465,
"learning_rate": 9.56430435180664e-06,
"lookahead_loss": 6.291307106018066,
"loss": 0.3371,
"step": 424000
},
{
"base_loss": 0.30577521124482154,
"epoch": 1.0371932983398438,
"grad_norm": 0.0010097597260028124,
"learning_rate": 9.51662063598633e-06,
"lookahead_loss": 6.22961159658432,
"loss": 0.3171,
"step": 424500
},
{
"base_loss": 0.3027714610397816,
"epoch": 1.03814697265625,
"grad_norm": 0.0009810663759708405,
"learning_rate": 9.468936920166016e-06,
"lookahead_loss": 6.266712354660034,
"loss": 0.3148,
"step": 425000
},
{
"epoch": 1.03814697265625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.279258819433828,
"eval_lookahead_perplexity": 533.3931766233101,
"eval_loss": 0.14169692993164062,
"eval_perplexity": 1.152227389980924,
"eval_runtime": 396.6631,
"eval_samples_per_second": 12.605,
"eval_steps_per_second": 0.396,
"step": 425000
},
{
"base_loss": 0.3064252578020096,
"epoch": 1.0391006469726562,
"grad_norm": 0.0009220910142175853,
"learning_rate": 9.421253204345703e-06,
"lookahead_loss": 6.2247487473487855,
"loss": 0.3193,
"step": 425500
},
{
"base_loss": 0.3251348150372505,
"epoch": 1.0400543212890625,
"grad_norm": 0.0009618565090931952,
"learning_rate": 9.373569488525391e-06,
"lookahead_loss": 6.2589435048103335,
"loss": 0.3343,
"step": 426000
},
{
"base_loss": 0.3045478595495224,
"epoch": 1.0410079956054688,
"grad_norm": 0.0009619101765565574,
"learning_rate": 9.325885772705078e-06,
"lookahead_loss": 6.189810398101806,
"loss": 0.315,
"step": 426500
},
{
"base_loss": 0.2982518375813961,
"epoch": 1.041961669921875,
"grad_norm": 0.0009836278622969985,
"learning_rate": 9.278202056884767e-06,
"lookahead_loss": 6.272165112972259,
"loss": 0.3097,
"step": 427000
},
{
"base_loss": 0.3089935587644577,
"epoch": 1.0429153442382812,
"grad_norm": 0.0009688155842013657,
"learning_rate": 9.230518341064454e-06,
"lookahead_loss": 6.282459297657013,
"loss": 0.324,
"step": 427500
},
{
"base_loss": 0.3268603746891022,
"epoch": 1.0438690185546875,
"grad_norm": 0.0009795301593840122,
"learning_rate": 9.18283462524414e-06,
"lookahead_loss": 6.323117096424102,
"loss": 0.3406,
"step": 428000
},
{
"base_loss": 0.29676153120398524,
"epoch": 1.0448226928710938,
"grad_norm": 0.0009742308175191283,
"learning_rate": 9.135150909423829e-06,
"lookahead_loss": 6.223826610565186,
"loss": 0.3091,
"step": 428500
},
{
"base_loss": 0.3044439141750336,
"epoch": 1.0457763671875,
"grad_norm": 0.000964790116995573,
"learning_rate": 9.087467193603516e-06,
"lookahead_loss": 6.254047955989837,
"loss": 0.3161,
"step": 429000
},
{
"base_loss": 0.3313070158064365,
"epoch": 1.0467300415039062,
"grad_norm": 0.000941166770644486,
"learning_rate": 9.039783477783204e-06,
"lookahead_loss": 6.2130351490974425,
"loss": 0.3393,
"step": 429500
},
{
"base_loss": 0.32587327966094015,
"epoch": 1.0476837158203125,
"grad_norm": 0.0010063599329441786,
"learning_rate": 8.992099761962891e-06,
"lookahead_loss": 6.257496692657471,
"loss": 0.3387,
"step": 430000
},
{
"epoch": 1.0476837158203125,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.2776359635800985,
"eval_lookahead_perplexity": 532.5282583929193,
"eval_loss": 0.14169345796108246,
"eval_perplexity": 1.1522233894882945,
"eval_runtime": 288.3523,
"eval_samples_per_second": 17.34,
"eval_steps_per_second": 0.544,
"step": 430000
},
{
"base_loss": 0.29468956208229063,
"epoch": 1.0486373901367188,
"grad_norm": 0.0009450262296013534,
"learning_rate": 8.944416046142578e-06,
"lookahead_loss": 6.2217209124565125,
"loss": 0.3076,
"step": 430500
},
{
"base_loss": 0.3027429393827915,
"epoch": 1.049591064453125,
"grad_norm": 0.0009969213278964162,
"learning_rate": 8.896732330322266e-06,
"lookahead_loss": 6.207447394371033,
"loss": 0.3161,
"step": 431000
},
{
"base_loss": 0.3190444597601891,
"epoch": 1.0505447387695312,
"grad_norm": 0.0008964469889178872,
"learning_rate": 8.849048614501953e-06,
"lookahead_loss": 6.302306387901306,
"loss": 0.3331,
"step": 431500
},
{
"base_loss": 0.3043918348252773,
"epoch": 1.0514984130859375,
"grad_norm": 0.0009915096452459693,
"learning_rate": 8.801364898681642e-06,
"lookahead_loss": 6.257247267246246,
"loss": 0.3177,
"step": 432000
},
{
"base_loss": 0.3046840020418167,
"epoch": 1.0524520874023438,
"grad_norm": 0.0010428299428895116,
"learning_rate": 8.753681182861329e-06,
"lookahead_loss": 6.1936208391189576,
"loss": 0.317,
"step": 432500
},
{
"base_loss": 0.3202188531160355,
"epoch": 1.05340576171875,
"grad_norm": 0.0009129694662988186,
"learning_rate": 8.705997467041015e-06,
"lookahead_loss": 6.2067328634262084,
"loss": 0.3299,
"step": 433000
},
{
"base_loss": 0.3542410895228386,
"epoch": 1.0543594360351562,
"grad_norm": 0.0009339757962152362,
"learning_rate": 8.658313751220704e-06,
"lookahead_loss": 6.249197509765625,
"loss": 0.3686,
"step": 433500
},
{
"base_loss": 0.2943912135362625,
"epoch": 1.0553131103515625,
"grad_norm": 0.0009666451369412243,
"learning_rate": 8.61063003540039e-06,
"lookahead_loss": 6.255749958992005,
"loss": 0.3081,
"step": 434000
},
{
"base_loss": 0.30392896428704264,
"epoch": 1.0562667846679688,
"grad_norm": 0.0009277292410843074,
"learning_rate": 8.56294631958008e-06,
"lookahead_loss": 6.270913313865662,
"loss": 0.3167,
"step": 434500
},
{
"base_loss": 0.3181495431959629,
"epoch": 1.057220458984375,
"grad_norm": 0.0009528077207505703,
"learning_rate": 8.515262603759766e-06,
"lookahead_loss": 6.285120022773743,
"loss": 0.3316,
"step": 435000
},
{
"epoch": 1.057220458984375,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.276001627071977,
"eval_lookahead_perplexity": 531.658638837682,
"eval_loss": 0.1416902393102646,
"eval_perplexity": 1.152219680889508,
"eval_runtime": 272.5458,
"eval_samples_per_second": 18.346,
"eval_steps_per_second": 0.576,
"step": 435000
},
{
"base_loss": 0.3180287193655968,
"epoch": 1.0581741333007812,
"grad_norm": 0.000986663973890245,
"learning_rate": 8.467578887939453e-06,
"lookahead_loss": 6.288068524360657,
"loss": 0.3283,
"step": 435500
},
{
"base_loss": 0.292520221978426,
"epoch": 1.0591278076171875,
"grad_norm": 0.0009171205456368625,
"learning_rate": 8.419895172119141e-06,
"lookahead_loss": 6.184898139953614,
"loss": 0.308,
"step": 436000
},
{
"base_loss": 0.3019208701252937,
"epoch": 1.0600814819335938,
"grad_norm": 0.0009770637843757868,
"learning_rate": 8.372211456298828e-06,
"lookahead_loss": 6.231124799251557,
"loss": 0.3146,
"step": 436500
},
{
"base_loss": 0.32141088619828223,
"epoch": 1.06103515625,
"grad_norm": 0.0009801468113437295,
"learning_rate": 8.324527740478517e-06,
"lookahead_loss": 6.210278995513916,
"loss": 0.332,
"step": 437000
},
{
"base_loss": 0.30723505771160126,
"epoch": 1.0619888305664062,
"grad_norm": 0.0010288211051374674,
"learning_rate": 8.276844024658204e-06,
"lookahead_loss": 6.222631175994873,
"loss": 0.3162,
"step": 437500
},
{
"base_loss": 0.308370777964592,
"epoch": 1.0629425048828125,
"grad_norm": 0.000984584796242416,
"learning_rate": 8.22916030883789e-06,
"lookahead_loss": 6.219526912689209,
"loss": 0.3171,
"step": 438000
},
{
"base_loss": 0.3175841515958309,
"epoch": 1.0638961791992188,
"grad_norm": 0.0009486065828241408,
"learning_rate": 8.181476593017579e-06,
"lookahead_loss": 6.251926373958588,
"loss": 0.3296,
"step": 438500
},
{
"base_loss": 0.3023634272813797,
"epoch": 1.064849853515625,
"grad_norm": 0.0009462623856961727,
"learning_rate": 8.133792877197266e-06,
"lookahead_loss": 6.273111929893494,
"loss": 0.3172,
"step": 439000
},
{
"base_loss": 0.31000158992409704,
"epoch": 1.0658035278320312,
"grad_norm": 0.000977607793174684,
"learning_rate": 8.086109161376954e-06,
"lookahead_loss": 6.2029001622200015,
"loss": 0.321,
"step": 439500
},
{
"base_loss": 0.3074465197324753,
"epoch": 1.0667572021484375,
"grad_norm": 0.0009711860329844058,
"learning_rate": 8.038425445556641e-06,
"lookahead_loss": 6.179575590610504,
"loss": 0.3168,
"step": 440000
},
{
"epoch": 1.0667572021484375,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.274223205761407,
"eval_lookahead_perplexity": 530.7139660465189,
"eval_loss": 0.1416870653629303,
"eval_perplexity": 1.1522160238107269,
"eval_runtime": 271.6398,
"eval_samples_per_second": 18.407,
"eval_steps_per_second": 0.578,
"step": 440000
},
{
"base_loss": 0.3308669750988483,
"epoch": 1.0677108764648438,
"grad_norm": 0.001032104599289596,
"learning_rate": 7.990741729736328e-06,
"lookahead_loss": 6.263833108901977,
"loss": 0.3436,
"step": 440500
},
{
"base_loss": 0.300963022172451,
"epoch": 1.06866455078125,
"grad_norm": 0.0009854926029220223,
"learning_rate": 7.943058013916016e-06,
"lookahead_loss": 6.227484060764313,
"loss": 0.31,
"step": 441000
},
{
"base_loss": 0.3016065271794796,
"epoch": 1.0696182250976562,
"grad_norm": 0.0009571650298312306,
"learning_rate": 7.895374298095703e-06,
"lookahead_loss": 6.25590787267685,
"loss": 0.3144,
"step": 441500
},
{
"base_loss": 0.3469915909469128,
"epoch": 1.0705718994140625,
"grad_norm": 0.0009693103493191302,
"learning_rate": 7.847690582275392e-06,
"lookahead_loss": 6.164663645267487,
"loss": 0.3587,
"step": 442000
},
{
"base_loss": 0.31762470316886904,
"epoch": 1.0715255737304688,
"grad_norm": 0.0009685850236564875,
"learning_rate": 7.800006866455079e-06,
"lookahead_loss": 6.211194790840149,
"loss": 0.3254,
"step": 442500
},
{
"base_loss": 0.3090612238943577,
"epoch": 1.072479248046875,
"grad_norm": 0.0009897744748741388,
"learning_rate": 7.752323150634765e-06,
"lookahead_loss": 6.288339027404785,
"loss": 0.3187,
"step": 443000
},
{
"base_loss": 0.3051177371442318,
"epoch": 1.0734329223632812,
"grad_norm": 0.0009485665941610932,
"learning_rate": 7.704639434814454e-06,
"lookahead_loss": 6.247628257751465,
"loss": 0.3173,
"step": 443500
},
{
"base_loss": 0.32735036182403565,
"epoch": 1.0743865966796875,
"grad_norm": 0.000949955778196454,
"learning_rate": 7.65695571899414e-06,
"lookahead_loss": 6.261586086750031,
"loss": 0.3397,
"step": 444000
},
{
"base_loss": 0.3037717220187187,
"epoch": 1.0753402709960938,
"grad_norm": 0.0009570113033987582,
"learning_rate": 7.6092720031738284e-06,
"lookahead_loss": 6.32311720943451,
"loss": 0.3156,
"step": 444500
},
{
"base_loss": 0.3043428426384926,
"epoch": 1.0762939453125,
"grad_norm": 0.0008967678295448422,
"learning_rate": 7.561588287353516e-06,
"lookahead_loss": 6.288136465072632,
"loss": 0.3173,
"step": 445000
},
{
"epoch": 1.0762939453125,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.272627116773076,
"eval_lookahead_perplexity": 529.8675749667811,
"eval_loss": 0.14168404042720795,
"eval_perplexity": 1.1522125384365882,
"eval_runtime": 265.0143,
"eval_samples_per_second": 18.867,
"eval_steps_per_second": 0.592,
"step": 445000
},
{
"base_loss": 0.3304513318836689,
"epoch": 1.0772476196289062,
"grad_norm": 0.0009672945598140359,
"learning_rate": 7.513904571533204e-06,
"lookahead_loss": 6.262294914245605,
"loss": 0.3437,
"step": 445500
},
{
"base_loss": 0.3036956556737423,
"epoch": 1.0782012939453125,
"grad_norm": 0.0009918182622641325,
"learning_rate": 7.466220855712891e-06,
"lookahead_loss": 6.286622961521148,
"loss": 0.3177,
"step": 446000
},
{
"base_loss": 0.29657268461585046,
"epoch": 1.0791549682617188,
"grad_norm": 0.0009162202477455139,
"learning_rate": 7.418537139892578e-06,
"lookahead_loss": 6.281464424133301,
"loss": 0.3109,
"step": 446500
},
{
"base_loss": 0.3123248810470104,
"epoch": 1.080108642578125,
"grad_norm": 0.0009557082084938884,
"learning_rate": 7.370853424072266e-06,
"lookahead_loss": 6.271296732425689,
"loss": 0.3289,
"step": 447000
},
{
"base_loss": 0.32140666726231576,
"epoch": 1.0810623168945312,
"grad_norm": 0.0009874977404251695,
"learning_rate": 7.323169708251954e-06,
"lookahead_loss": 6.28459517621994,
"loss": 0.3369,
"step": 447500
},
{
"base_loss": 0.29961148300766943,
"epoch": 1.0820159912109375,
"grad_norm": 0.0008999704150483012,
"learning_rate": 7.275485992431641e-06,
"lookahead_loss": 6.285267066001892,
"loss": 0.312,
"step": 448000
},
{
"base_loss": 0.3036400380730629,
"epoch": 1.0829696655273438,
"grad_norm": 0.0010060252388939261,
"learning_rate": 7.227802276611328e-06,
"lookahead_loss": 6.313744902610779,
"loss": 0.3171,
"step": 448500
},
{
"base_loss": 0.3315876969695091,
"epoch": 1.08392333984375,
"grad_norm": 0.0009252046584151685,
"learning_rate": 7.180118560791016e-06,
"lookahead_loss": 6.31829022693634,
"loss": 0.3446,
"step": 449000
},
{
"base_loss": 0.30729381024837493,
"epoch": 1.0848770141601562,
"grad_norm": 0.0009679401991888881,
"learning_rate": 7.1324348449707034e-06,
"lookahead_loss": 6.265555395126343,
"loss": 0.3195,
"step": 449500
},
{
"base_loss": 0.3001744159460068,
"epoch": 1.0858306884765625,
"grad_norm": 0.0010059743653982878,
"learning_rate": 7.084751129150391e-06,
"lookahead_loss": 6.251848965168,
"loss": 0.3098,
"step": 450000
},
{
"epoch": 1.0858306884765625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.271020517562525,
"eval_lookahead_perplexity": 529.0169736098392,
"eval_loss": 0.14168114960193634,
"eval_perplexity": 1.1522092075962782,
"eval_runtime": 256.5611,
"eval_samples_per_second": 19.489,
"eval_steps_per_second": 0.612,
"step": 450000
},
{
"base_loss": 0.3043146550655365,
"epoch": 1.0867843627929688,
"grad_norm": 0.0011264593340456486,
"learning_rate": 7.037067413330079e-06,
"lookahead_loss": 6.214835606098175,
"loss": 0.3135,
"step": 450500
},
{
"base_loss": 0.33685871040821075,
"epoch": 1.087738037109375,
"grad_norm": 0.000883612665347755,
"learning_rate": 6.989383697509766e-06,
"lookahead_loss": 6.2883378591537475,
"loss": 0.344,
"step": 451000
},
{
"base_loss": 0.30271838963031766,
"epoch": 1.0886917114257812,
"grad_norm": 0.0009825986344367266,
"learning_rate": 6.941699981689453e-06,
"lookahead_loss": 6.2627932052612305,
"loss": 0.313,
"step": 451500
},
{
"base_loss": 0.31106107553839685,
"epoch": 1.0896453857421875,
"grad_norm": 0.0009782552951946855,
"learning_rate": 6.894016265869141e-06,
"lookahead_loss": 6.294947155952453,
"loss": 0.3202,
"step": 452000
},
{
"base_loss": 0.29801268032193184,
"epoch": 1.0905990600585938,
"grad_norm": 0.0009775352664291859,
"learning_rate": 6.846332550048829e-06,
"lookahead_loss": 6.287975093841553,
"loss": 0.3103,
"step": 452500
},
{
"base_loss": 0.29706856977939605,
"epoch": 1.091552734375,
"grad_norm": 0.0008749934495426714,
"learning_rate": 6.798648834228516e-06,
"lookahead_loss": 6.24645920419693,
"loss": 0.3081,
"step": 453000
},
{
"base_loss": 0.3189851225912571,
"epoch": 1.0925064086914062,
"grad_norm": 0.0009568808600306511,
"learning_rate": 6.750965118408203e-06,
"lookahead_loss": 6.2773444094657895,
"loss": 0.3344,
"step": 453500
},
{
"base_loss": 0.307105902582407,
"epoch": 1.0934600830078125,
"grad_norm": 0.0009044524631462991,
"learning_rate": 6.703281402587891e-06,
"lookahead_loss": 6.281952003479004,
"loss": 0.3195,
"step": 454000
},
{
"base_loss": 0.2863923677802086,
"epoch": 1.0944137573242188,
"grad_norm": 0.0009490604279562831,
"learning_rate": 6.6555976867675784e-06,
"lookahead_loss": 6.28246708202362,
"loss": 0.3022,
"step": 454500
},
{
"base_loss": 0.2923275768607855,
"epoch": 1.095367431640625,
"grad_norm": 0.0009766619186848402,
"learning_rate": 6.607913970947266e-06,
"lookahead_loss": 6.205554433822632,
"loss": 0.3078,
"step": 455000
},
{
"epoch": 1.095367431640625,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.269953422272168,
"eval_lookahead_perplexity": 528.4527631754679,
"eval_loss": 0.1416788250207901,
"eval_perplexity": 1.1522065291955907,
"eval_runtime": 265.9126,
"eval_samples_per_second": 18.803,
"eval_steps_per_second": 0.59,
"step": 455000
},
{
"base_loss": 0.2988976333141327,
"epoch": 1.0963211059570312,
"grad_norm": 0.0009743266855366528,
"learning_rate": 6.560230255126954e-06,
"lookahead_loss": 6.28825941324234,
"loss": 0.3117,
"step": 455500
},
{
"base_loss": 0.3292928241491318,
"epoch": 1.0972747802734375,
"grad_norm": 0.0009721256792545319,
"learning_rate": 6.512546539306641e-06,
"lookahead_loss": 6.306233211517334,
"loss": 0.3394,
"step": 456000
},
{
"base_loss": 0.2914348037838936,
"epoch": 1.0982284545898438,
"grad_norm": 0.0009659443167038262,
"learning_rate": 6.464862823486328e-06,
"lookahead_loss": 6.237008224010467,
"loss": 0.3078,
"step": 456500
},
{
"base_loss": 0.2972012578845024,
"epoch": 1.09918212890625,
"grad_norm": 0.0009896591072902083,
"learning_rate": 6.417179107666016e-06,
"lookahead_loss": 6.287574047088623,
"loss": 0.3096,
"step": 457000
},
{
"base_loss": 0.3006402098238468,
"epoch": 1.1001358032226562,
"grad_norm": 0.0009321753168478608,
"learning_rate": 6.369495391845704e-06,
"lookahead_loss": 6.283053328514099,
"loss": 0.3145,
"step": 457500
},
{
"base_loss": 0.3227167456150055,
"epoch": 1.1010894775390625,
"grad_norm": 0.0009433454251848161,
"learning_rate": 6.321811676025391e-06,
"lookahead_loss": 6.353063113212586,
"loss": 0.3319,
"step": 458000
},
{
"base_loss": 0.30574207335710524,
"epoch": 1.1020431518554688,
"grad_norm": 0.0009229978313669562,
"learning_rate": 6.274127960205078e-06,
"lookahead_loss": 6.309114946365357,
"loss": 0.3146,
"step": 458500
},
{
"base_loss": 0.29960223579406736,
"epoch": 1.102996826171875,
"grad_norm": 0.0009563881903886795,
"learning_rate": 6.226444244384766e-06,
"lookahead_loss": 6.274078320503235,
"loss": 0.314,
"step": 459000
},
{
"base_loss": 0.2996614835858345,
"epoch": 1.1039505004882812,
"grad_norm": 0.0009942464530467987,
"learning_rate": 6.1787605285644534e-06,
"lookahead_loss": 6.303729599952698,
"loss": 0.3115,
"step": 459500
},
{
"base_loss": 0.3155037875175476,
"epoch": 1.1049041748046875,
"grad_norm": 0.000987946754321456,
"learning_rate": 6.131076812744141e-06,
"lookahead_loss": 6.240426075935364,
"loss": 0.3274,
"step": 460000
},
{
"epoch": 1.1049041748046875,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.268837010898529,
"eval_lookahead_perplexity": 527.8631217026941,
"eval_loss": 0.14167657494544983,
"eval_perplexity": 1.1522039366470092,
"eval_runtime": 255.3675,
"eval_samples_per_second": 19.58,
"eval_steps_per_second": 0.615,
"step": 460000
},
{
"base_loss": 0.30881242457032204,
"epoch": 1.1058578491210938,
"grad_norm": 0.0009278358775191009,
"learning_rate": 6.083393096923829e-06,
"lookahead_loss": 6.227671199798584,
"loss": 0.3205,
"step": 460500
},
{
"base_loss": 0.29839677426218986,
"epoch": 1.1068115234375,
"grad_norm": 0.0009282033424824476,
"learning_rate": 6.035709381103516e-06,
"lookahead_loss": 6.310325454711914,
"loss": 0.3091,
"step": 461000
},
{
"base_loss": 0.294783333927393,
"epoch": 1.1077651977539062,
"grad_norm": 0.0009650102001614869,
"learning_rate": 5.988025665283203e-06,
"lookahead_loss": 6.230243679523468,
"loss": 0.3079,
"step": 461500
},
{
"base_loss": 0.32150769320130346,
"epoch": 1.1087188720703125,
"grad_norm": 0.0010077395709231496,
"learning_rate": 5.940341949462891e-06,
"lookahead_loss": 6.241594205856323,
"loss": 0.3333,
"step": 462000
},
{
"base_loss": 0.3191940434873104,
"epoch": 1.1096725463867188,
"grad_norm": 0.0009163669892586768,
"learning_rate": 5.892658233642579e-06,
"lookahead_loss": 6.1973172135353085,
"loss": 0.328,
"step": 462500
},
{
"base_loss": 0.30270202097296717,
"epoch": 1.110626220703125,
"grad_norm": 0.0008867474389262497,
"learning_rate": 5.844974517822266e-06,
"lookahead_loss": 6.168700245380402,
"loss": 0.3138,
"step": 463000
},
{
"base_loss": 0.2974509707689285,
"epoch": 1.1115798950195312,
"grad_norm": 0.0009725645068101585,
"learning_rate": 5.797290802001953e-06,
"lookahead_loss": 6.2942254137992855,
"loss": 0.3082,
"step": 463500
},
{
"base_loss": 0.3114223616421223,
"epoch": 1.1125335693359375,
"grad_norm": 0.0009279170189984143,
"learning_rate": 5.749607086181641e-06,
"lookahead_loss": 6.310624211788178,
"loss": 0.3226,
"step": 464000
},
{
"base_loss": 0.3443338246643543,
"epoch": 1.1134872436523438,
"grad_norm": 0.0009897082345560193,
"learning_rate": 5.7019233703613284e-06,
"lookahead_loss": 6.318526268482208,
"loss": 0.3514,
"step": 464500
},
{
"base_loss": 0.2939972540736198,
"epoch": 1.11444091796875,
"grad_norm": 0.000933043600525707,
"learning_rate": 5.654239654541016e-06,
"lookahead_loss": 6.195446736812592,
"loss": 0.308,
"step": 465000
},
{
"epoch": 1.11444091796875,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.2678496045426435,
"eval_lookahead_perplexity": 527.3421635423134,
"eval_loss": 0.14167460799217224,
"eval_perplexity": 1.1522016703179285,
"eval_runtime": 265.7854,
"eval_samples_per_second": 18.812,
"eval_steps_per_second": 0.591,
"step": 465000
},
{
"base_loss": 0.29841734063625336,
"epoch": 1.1153945922851562,
"grad_norm": 0.0009832673240453005,
"learning_rate": 5.606555938720704e-06,
"lookahead_loss": 6.269070412158966,
"loss": 0.311,
"step": 465500
},
{
"base_loss": 0.3147252712547779,
"epoch": 1.1163482666015625,
"grad_norm": 0.0010125736007466912,
"learning_rate": 5.558872222900391e-06,
"lookahead_loss": 6.2435841588973995,
"loss": 0.3247,
"step": 466000
},
{
"base_loss": 0.32950386153161526,
"epoch": 1.1173019409179688,
"grad_norm": 0.0009781451663002372,
"learning_rate": 5.511188507080078e-06,
"lookahead_loss": 6.253851639270782,
"loss": 0.3429,
"step": 466500
},
{
"base_loss": 0.30734304267168044,
"epoch": 1.118255615234375,
"grad_norm": 0.0010217922972515225,
"learning_rate": 5.463504791259766e-06,
"lookahead_loss": 6.285841710567475,
"loss": 0.3164,
"step": 467000
},
{
"base_loss": 0.3014386140704155,
"epoch": 1.1192092895507812,
"grad_norm": 0.0009706662967801094,
"learning_rate": 5.415821075439454e-06,
"lookahead_loss": 6.292059094905853,
"loss": 0.3126,
"step": 467500
},
{
"base_loss": 0.30611268219351767,
"epoch": 2.0009536743164062,
"grad_norm": 0.0009617306059226394,
"learning_rate": 5.368137359619141e-06,
"lookahead_loss": 6.326679523468018,
"loss": 0.3146,
"step": 468000
},
{
"base_loss": 0.301539769411087,
"epoch": 2.0019073486328125,
"grad_norm": 0.001001556869596243,
"learning_rate": 5.320453643798828e-06,
"lookahead_loss": 6.157724303245544,
"loss": 0.3137,
"step": 468500
},
{
"base_loss": 0.31222748425602914,
"epoch": 2.0028610229492188,
"grad_norm": 0.0009728239965625107,
"learning_rate": 5.272769927978516e-06,
"lookahead_loss": 6.156917175769806,
"loss": 0.3221,
"step": 469000
},
{
"base_loss": 0.32267384630441664,
"epoch": 2.003814697265625,
"grad_norm": 0.0009497611317783594,
"learning_rate": 5.2250862121582034e-06,
"lookahead_loss": 6.197290048122406,
"loss": 0.3348,
"step": 469500
},
{
"base_loss": 0.30016050645709036,
"epoch": 2.0047683715820312,
"grad_norm": 0.0009481213637627661,
"learning_rate": 5.177402496337891e-06,
"lookahead_loss": 6.171369277000427,
"loss": 0.3159,
"step": 470000
},
{
"epoch": 2.0047683715820312,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.266808138106959,
"eval_lookahead_perplexity": 526.7932402710601,
"eval_loss": 0.14167256653308868,
"eval_perplexity": 1.1521993181477634,
"eval_runtime": 264.661,
"eval_samples_per_second": 18.892,
"eval_steps_per_second": 0.593,
"step": 470000
},
{
"base_loss": 0.3024714471399784,
"epoch": 2.0057220458984375,
"grad_norm": 0.0008405263070017099,
"learning_rate": 5.129718780517579e-06,
"lookahead_loss": 6.278398807525635,
"loss": 0.3112,
"step": 470500
},
{
"base_loss": 0.2964489733278751,
"epoch": 2.0066757202148438,
"grad_norm": 0.0009033022215589881,
"learning_rate": 5.082035064697266e-06,
"lookahead_loss": 6.156778864383697,
"loss": 0.3126,
"step": 471000
},
{
"base_loss": 0.31337857532501223,
"epoch": 2.00762939453125,
"grad_norm": 0.0009623009245842695,
"learning_rate": 5.034351348876953e-06,
"lookahead_loss": 6.213428562164307,
"loss": 0.3233,
"step": 471500
},
{
"base_loss": 0.3180972839295864,
"epoch": 2.0085830688476562,
"grad_norm": 0.0009386781021021307,
"learning_rate": 4.986667633056641e-06,
"lookahead_loss": 6.1949804525375365,
"loss": 0.3226,
"step": 472000
},
{
"base_loss": 0.30493127757310867,
"epoch": 2.0095367431640625,
"grad_norm": 0.0009717259090393782,
"learning_rate": 4.938983917236329e-06,
"lookahead_loss": 6.213238111972808,
"loss": 0.318,
"step": 472500
},
{
"base_loss": 0.30099570405483245,
"epoch": 2.0104904174804688,
"grad_norm": 0.0009312546462751925,
"learning_rate": 4.891300201416016e-06,
"lookahead_loss": 6.1899485034942625,
"loss": 0.3109,
"step": 473000
},
{
"base_loss": 0.30160990768671037,
"epoch": 2.011444091796875,
"grad_norm": 0.0010111306328326464,
"learning_rate": 4.843616485595703e-06,
"lookahead_loss": 6.192778927326202,
"loss": 0.3142,
"step": 473500
},
{
"base_loss": 0.32538792353868484,
"epoch": 2.0123977661132812,
"grad_norm": 0.000879990984685719,
"learning_rate": 4.795932769775391e-06,
"lookahead_loss": 6.183738801956177,
"loss": 0.3353,
"step": 474000
},
{
"base_loss": 0.3040602553486824,
"epoch": 2.0133514404296875,
"grad_norm": 0.0009244357934221625,
"learning_rate": 4.7482490539550784e-06,
"lookahead_loss": 6.276916295051575,
"loss": 0.3189,
"step": 474500
},
{
"base_loss": 0.29813345649838446,
"epoch": 2.0143051147460938,
"grad_norm": 0.0009381592972204089,
"learning_rate": 4.700565338134766e-06,
"lookahead_loss": 6.224869798183441,
"loss": 0.3115,
"step": 475000
},
{
"epoch": 2.0143051147460938,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.26585840036313,
"eval_lookahead_perplexity": 526.2931623566051,
"eval_loss": 0.14167073369026184,
"eval_perplexity": 1.1521972063494434,
"eval_runtime": 254.0716,
"eval_samples_per_second": 19.679,
"eval_steps_per_second": 0.618,
"step": 475000
},
{
"base_loss": 0.2960759684741497,
"epoch": 2.0152587890625,
"grad_norm": 0.0008980022976174951,
"learning_rate": 4.652881622314453e-06,
"lookahead_loss": 6.182446990966797,
"loss": 0.3066,
"step": 475500
},
{
"base_loss": 0.31211792075634004,
"epoch": 2.0162124633789062,
"grad_norm": 0.0009660014766268432,
"learning_rate": 4.605197906494141e-06,
"lookahead_loss": 6.232044881820679,
"loss": 0.3224,
"step": 476000
},
{
"base_loss": 0.31110167542099953,
"epoch": 2.0171661376953125,
"grad_norm": 0.0009189226548187435,
"learning_rate": 4.557514190673828e-06,
"lookahead_loss": 6.267304803848266,
"loss": 0.3221,
"step": 476500
},
{
"base_loss": 0.2990322083234787,
"epoch": 2.0181198120117188,
"grad_norm": 0.0009295094641856849,
"learning_rate": 4.509830474853516e-06,
"lookahead_loss": 6.280393055915832,
"loss": 0.3114,
"step": 477000
},
{
"base_loss": 0.29806812533736227,
"epoch": 2.019073486328125,
"grad_norm": 0.00097031140467152,
"learning_rate": 4.462146759033204e-06,
"lookahead_loss": 6.305790534496308,
"loss": 0.3094,
"step": 477500
},
{
"base_loss": 0.30187543269991873,
"epoch": 2.0200271606445312,
"grad_norm": 0.001022745855152607,
"learning_rate": 4.4144630432128904e-06,
"lookahead_loss": 6.14053142118454,
"loss": 0.3146,
"step": 478000
},
{
"base_loss": 0.32729279178380966,
"epoch": 2.0209808349609375,
"grad_norm": 0.0009685283876024187,
"learning_rate": 4.366779327392578e-06,
"lookahead_loss": 6.224147350311279,
"loss": 0.3372,
"step": 478500
},
{
"base_loss": 0.3057846530973911,
"epoch": 2.0219345092773438,
"grad_norm": 0.0009614306618459523,
"learning_rate": 4.319095611572266e-06,
"lookahead_loss": 6.187606465339661,
"loss": 0.314,
"step": 479000
},
{
"base_loss": 0.2997340569794178,
"epoch": 2.02288818359375,
"grad_norm": 0.000983657082542777,
"learning_rate": 4.2714118957519534e-06,
"lookahead_loss": 6.223736906528473,
"loss": 0.3113,
"step": 479500
},
{
"base_loss": 0.30260268279910085,
"epoch": 2.0238418579101562,
"grad_norm": 0.0009483291069045663,
"learning_rate": 4.223728179931641e-06,
"lookahead_loss": 6.179685834407806,
"loss": 0.3137,
"step": 480000
},
{
"epoch": 2.0238418579101562,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.264854770879776,
"eval_lookahead_perplexity": 525.7652239935444,
"eval_loss": 0.14166878163814545,
"eval_perplexity": 1.1521949572026435,
"eval_runtime": 266.1285,
"eval_samples_per_second": 18.788,
"eval_steps_per_second": 0.59,
"step": 480000
},
{
"base_loss": 0.3236656226217747,
"epoch": 2.0247955322265625,
"grad_norm": 0.0009004331659525633,
"learning_rate": 4.176044464111328e-06,
"lookahead_loss": 6.2046719169616695,
"loss": 0.3344,
"step": 480500
},
{
"base_loss": 0.30869458481669426,
"epoch": 2.0257492065429688,
"grad_norm": 0.0009720096713863313,
"learning_rate": 4.128360748291016e-06,
"lookahead_loss": 6.157517350673675,
"loss": 0.3224,
"step": 481000
},
{
"base_loss": 0.3019005296528339,
"epoch": 2.026702880859375,
"grad_norm": 0.0010002320632338524,
"learning_rate": 4.080677032470703e-06,
"lookahead_loss": 6.184016052246093,
"loss": 0.311,
"step": 481500
},
{
"base_loss": 0.3077106066644192,
"epoch": 2.0276565551757812,
"grad_norm": 0.0009557516314089298,
"learning_rate": 4.032993316650391e-06,
"lookahead_loss": 6.2857253398895265,
"loss": 0.318,
"step": 482000
},
{
"base_loss": 0.3280421564877033,
"epoch": 2.0286102294921875,
"grad_norm": 0.0009686889825388789,
"learning_rate": 3.985309600830079e-06,
"lookahead_loss": 6.307454082489014,
"loss": 0.3389,
"step": 482500
},
{
"base_loss": 0.30581475085020066,
"epoch": 2.0295639038085938,
"grad_norm": 0.0009459998109377921,
"learning_rate": 3.9376258850097654e-06,
"lookahead_loss": 6.254247172832489,
"loss": 0.3136,
"step": 483000
},
{
"base_loss": 0.3068877322375774,
"epoch": 2.030517578125,
"grad_norm": 0.0009705954580567777,
"learning_rate": 3.889942169189453e-06,
"lookahead_loss": 6.246685606956482,
"loss": 0.3162,
"step": 483500
},
{
"base_loss": 0.3014947620034218,
"epoch": 2.0314712524414062,
"grad_norm": 0.000960386183578521,
"learning_rate": 3.842258453369141e-06,
"lookahead_loss": 6.249851522445678,
"loss": 0.3138,
"step": 484000
},
{
"base_loss": 0.3173881909847259,
"epoch": 2.0324249267578125,
"grad_norm": 0.0009665894904173911,
"learning_rate": 3.7945747375488284e-06,
"lookahead_loss": 6.212476090431213,
"loss": 0.3333,
"step": 484500
},
{
"base_loss": 0.3059709269702435,
"epoch": 2.0333786010742188,
"grad_norm": 0.0009842559229582548,
"learning_rate": 3.7468910217285157e-06,
"lookahead_loss": 6.260224392414093,
"loss": 0.3146,
"step": 485000
},
{
"epoch": 2.0333786010742188,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.2641355762847315,
"eval_lookahead_perplexity": 525.3872324272244,
"eval_loss": 0.1416674256324768,
"eval_perplexity": 1.1521933948208094,
"eval_runtime": 264.2356,
"eval_samples_per_second": 18.923,
"eval_steps_per_second": 0.594,
"step": 485000
},
{
"base_loss": 0.3017133396565914,
"epoch": 2.034332275390625,
"grad_norm": 0.0009045092738233507,
"learning_rate": 3.6992073059082034e-06,
"lookahead_loss": 6.33234530544281,
"loss": 0.3141,
"step": 485500
},
{
"base_loss": 0.31134800574183463,
"epoch": 2.0352859497070312,
"grad_norm": 0.0010052913567051291,
"learning_rate": 3.6515235900878906e-06,
"lookahead_loss": 6.164672554492951,
"loss": 0.3224,
"step": 486000
},
{
"base_loss": 0.32387468561530114,
"epoch": 2.0362396240234375,
"grad_norm": 0.0009474267717450857,
"learning_rate": 3.6038398742675783e-06,
"lookahead_loss": 6.275379824161529,
"loss": 0.3362,
"step": 486500
},
{
"base_loss": 0.3080780008882284,
"epoch": 2.0371932983398438,
"grad_norm": 0.0009899679571390152,
"learning_rate": 3.556156158447266e-06,
"lookahead_loss": 6.207229743003845,
"loss": 0.3192,
"step": 487000
},
{
"base_loss": 0.30180328992009164,
"epoch": 2.03814697265625,
"grad_norm": 0.0009810588089749217,
"learning_rate": 3.508472442626953e-06,
"lookahead_loss": 6.2532482995986935,
"loss": 0.3137,
"step": 487500
},
{
"base_loss": 0.30689890575408935,
"epoch": 2.0391006469726562,
"grad_norm": 0.0009289110312238336,
"learning_rate": 3.460788726806641e-06,
"lookahead_loss": 6.209832691192627,
"loss": 0.3188,
"step": 488000
},
{
"base_loss": 0.32427770999073985,
"epoch": 2.0400543212890625,
"grad_norm": 0.0009330078610219061,
"learning_rate": 3.413105010986328e-06,
"lookahead_loss": 6.245529312610627,
"loss": 0.3331,
"step": 488500
},
{
"base_loss": 0.30682690465450285,
"epoch": 2.0410079956054688,
"grad_norm": 0.0009762721601873636,
"learning_rate": 3.3654212951660158e-06,
"lookahead_loss": 6.171029562950134,
"loss": 0.3156,
"step": 489000
},
{
"base_loss": 0.29654143354296686,
"epoch": 2.041961669921875,
"grad_norm": 0.0009487427887506783,
"learning_rate": 3.3177375793457034e-06,
"lookahead_loss": 6.259974523544312,
"loss": 0.3079,
"step": 489500
},
{
"base_loss": 0.30721216344833374,
"epoch": 2.0429153442382812,
"grad_norm": 0.0009788471506908536,
"learning_rate": 3.2700538635253907e-06,
"lookahead_loss": 6.27340632724762,
"loss": 0.323,
"step": 490000
},
{
"epoch": 2.0429153442382812,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.263471987300788,
"eval_lookahead_perplexity": 525.038706899121,
"eval_loss": 0.14166615903377533,
"eval_perplexity": 1.1521919354550758,
"eval_runtime": 253.9561,
"eval_samples_per_second": 19.688,
"eval_steps_per_second": 0.618,
"step": 490000
},
{
"base_loss": 0.32630143281817436,
"epoch": 2.0438690185546875,
"grad_norm": 0.0009800581028684974,
"learning_rate": 3.2223701477050784e-06,
"lookahead_loss": 6.298602212905884,
"loss": 0.3415,
"step": 490500
},
{
"base_loss": 0.296696748316288,
"epoch": 2.0448226928710938,
"grad_norm": 0.0009970470564439893,
"learning_rate": 3.1746864318847656e-06,
"lookahead_loss": 6.2160419683456425,
"loss": 0.3097,
"step": 491000
},
{
"base_loss": 0.30323311913013457,
"epoch": 2.0457763671875,
"grad_norm": 0.0009552966221235693,
"learning_rate": 3.1270027160644533e-06,
"lookahead_loss": 6.2349854435920715,
"loss": 0.3159,
"step": 491500
},
{
"base_loss": 0.32944888742268086,
"epoch": 2.0467300415039062,
"grad_norm": 0.0009680980583652854,
"learning_rate": 3.079319000244141e-06,
"lookahead_loss": 6.207026290416717,
"loss": 0.3393,
"step": 492000
},
{
"base_loss": 0.32393511798977853,
"epoch": 2.0476837158203125,
"grad_norm": 0.0009857366094365716,
"learning_rate": 3.031635284423828e-06,
"lookahead_loss": 6.246865995407105,
"loss": 0.3393,
"step": 492500
},
{
"base_loss": 0.293301939278841,
"epoch": 2.0486373901367188,
"grad_norm": 0.0009528042282909155,
"learning_rate": 2.983951568603516e-06,
"lookahead_loss": 6.2056601891517635,
"loss": 0.3056,
"step": 493000
},
{
"base_loss": 0.3036652799248695,
"epoch": 2.049591064453125,
"grad_norm": 0.001003090525045991,
"learning_rate": 2.936267852783203e-06,
"lookahead_loss": 6.183267019271851,
"loss": 0.3171,
"step": 493500
},
{
"base_loss": 0.3174412237107754,
"epoch": 2.0505447387695312,
"grad_norm": 0.0009027134510688484,
"learning_rate": 2.8885841369628908e-06,
"lookahead_loss": 6.298980396270752,
"loss": 0.332,
"step": 494000
},
{
"base_loss": 0.30474287942051886,
"epoch": 2.0514984130859375,
"grad_norm": 0.0009782238630577922,
"learning_rate": 2.8409004211425784e-06,
"lookahead_loss": 6.23814437866211,
"loss": 0.3178,
"step": 494500
},
{
"base_loss": 0.30692395463585853,
"epoch": 2.0524520874023438,
"grad_norm": 0.0010520197683945298,
"learning_rate": 2.7932167053222657e-06,
"lookahead_loss": 6.181493873119354,
"loss": 0.3177,
"step": 495000
},
{
"epoch": 2.0524520874023438,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.262974844192164,
"eval_lookahead_perplexity": 524.7777523954666,
"eval_loss": 0.14166510105133057,
"eval_perplexity": 1.15219071645688,
"eval_runtime": 270.937,
"eval_samples_per_second": 18.454,
"eval_steps_per_second": 0.579,
"step": 495000
},
{
"base_loss": 0.32256214889883994,
"epoch": 2.05340576171875,
"grad_norm": 0.0009561408660374582,
"learning_rate": 2.7455329895019534e-06,
"lookahead_loss": 6.20479358291626,
"loss": 0.3312,
"step": 495500
},
{
"base_loss": 0.3550116382241249,
"epoch": 2.0543594360351562,
"grad_norm": 0.0009577757446095347,
"learning_rate": 2.6978492736816406e-06,
"lookahead_loss": 6.231877175807953,
"loss": 0.3693,
"step": 496000
},
{
"base_loss": 0.2970747436285019,
"epoch": 2.0553131103515625,
"grad_norm": 0.0009534953278489411,
"learning_rate": 2.6501655578613283e-06,
"lookahead_loss": 6.23866082906723,
"loss": 0.308,
"step": 496500
},
{
"base_loss": 0.30645539990067483,
"epoch": 2.0562667846679688,
"grad_norm": 0.0009555872529745102,
"learning_rate": 2.602481842041016e-06,
"lookahead_loss": 6.246189908981323,
"loss": 0.3167,
"step": 497000
},
{
"base_loss": 0.31723022189736366,
"epoch": 2.057220458984375,
"grad_norm": 0.0009773544734343886,
"learning_rate": 2.554798126220703e-06,
"lookahead_loss": 6.266136578559875,
"loss": 0.3307,
"step": 497500
},
{
"base_loss": 0.3193240025639534,
"epoch": 2.0581741333007812,
"grad_norm": 0.0009717575740069151,
"learning_rate": 2.507114410400391e-06,
"lookahead_loss": 6.268920562744141,
"loss": 0.3271,
"step": 498000
},
{
"base_loss": 0.2937832759618759,
"epoch": 2.0591278076171875,
"grad_norm": 0.0009175707236863673,
"learning_rate": 2.459430694580078e-06,
"lookahead_loss": 6.168703609466553,
"loss": 0.3071,
"step": 498500
},
{
"base_loss": 0.30271227744221685,
"epoch": 2.0600814819335938,
"grad_norm": 0.0009538477752357721,
"learning_rate": 2.4117469787597658e-06,
"lookahead_loss": 6.238021997451782,
"loss": 0.3164,
"step": 499000
},
{
"base_loss": 0.3198817696869373,
"epoch": 2.06103515625,
"grad_norm": 0.0009791208431124687,
"learning_rate": 2.3640632629394534e-06,
"lookahead_loss": 6.194750837326049,
"loss": 0.3306,
"step": 499500
},
{
"base_loss": 0.3065698970258236,
"epoch": 2.0619888305664062,
"grad_norm": 0.0010103002423420548,
"learning_rate": 2.3163795471191407e-06,
"lookahead_loss": 6.2074076795578,
"loss": 0.3147,
"step": 500000
},
{
"epoch": 2.0619888305664062,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.26243155147321,
"eval_lookahead_perplexity": 524.4927218980288,
"eval_loss": 0.14166411757469177,
"eval_perplexity": 1.1521895833047842,
"eval_runtime": 292.5742,
"eval_samples_per_second": 17.09,
"eval_steps_per_second": 0.537,
"step": 500000
},
{
"base_loss": 0.30793745544552803,
"epoch": 2.0629425048828125,
"grad_norm": 0.0010036919265985489,
"learning_rate": 2.2686958312988284e-06,
"lookahead_loss": 6.206808748722076,
"loss": 0.3177,
"step": 500500
},
{
"base_loss": 0.3166033121049404,
"epoch": 2.0638961791992188,
"grad_norm": 0.0009552605915814638,
"learning_rate": 2.2210121154785156e-06,
"lookahead_loss": 6.2300395545959475,
"loss": 0.3283,
"step": 501000
},
{
"base_loss": 0.30278992640972135,
"epoch": 2.064849853515625,
"grad_norm": 0.0009494012338109314,
"learning_rate": 2.1733283996582033e-06,
"lookahead_loss": 6.2603726406097415,
"loss": 0.318,
"step": 501500
},
{
"base_loss": 0.30789859166741373,
"epoch": 2.0658035278320312,
"grad_norm": 0.000976825482212007,
"learning_rate": 2.125644683837891e-06,
"lookahead_loss": 6.1817445759773255,
"loss": 0.3194,
"step": 502000
},
{
"base_loss": 0.307515013217926,
"epoch": 2.0667572021484375,
"grad_norm": 0.000997724011540413,
"learning_rate": 2.077960968017578e-06,
"lookahead_loss": 6.173175088882446,
"loss": 0.3162,
"step": 502500
},
{
"base_loss": 0.3304452752768993,
"epoch": 2.0677108764648438,
"grad_norm": 0.0010043421061709523,
"learning_rate": 2.030277252197266e-06,
"lookahead_loss": 6.237773827552795,
"loss": 0.3422,
"step": 503000
},
{
"base_loss": 0.3000219973921776,
"epoch": 2.06866455078125,
"grad_norm": 0.0010214378125965595,
"learning_rate": 1.982593536376953e-06,
"lookahead_loss": 6.208347861766815,
"loss": 0.3092,
"step": 503500
},
{
"base_loss": 0.3050138043165207,
"epoch": 2.0696182250976562,
"grad_norm": 0.0009577918681316078,
"learning_rate": 1.9349098205566408e-06,
"lookahead_loss": 6.242258395195007,
"loss": 0.3164,
"step": 504000
},
{
"base_loss": 0.34709723374247553,
"epoch": 2.0705718994140625,
"grad_norm": 0.0009933137334883213,
"learning_rate": 1.8872261047363282e-06,
"lookahead_loss": 6.156369118213654,
"loss": 0.3596,
"step": 504500
},
{
"base_loss": 0.31454886627197265,
"epoch": 2.0715255737304688,
"grad_norm": 0.0009540682658553123,
"learning_rate": 1.8395423889160157e-06,
"lookahead_loss": 6.19591244506836,
"loss": 0.3257,
"step": 505000
},
{
"epoch": 2.0715255737304688,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.262022242378503,
"eval_lookahead_perplexity": 524.2780861860196,
"eval_loss": 0.1416633278131485,
"eval_perplexity": 1.15218867335012,
"eval_runtime": 273.5457,
"eval_samples_per_second": 18.278,
"eval_steps_per_second": 0.574,
"step": 505000
},
{
"base_loss": 0.30621468406915664,
"epoch": 2.072479248046875,
"grad_norm": 0.0009789029136300087,
"learning_rate": 1.7918586730957031e-06,
"lookahead_loss": 6.262781209945679,
"loss": 0.3179,
"step": 505500
},
{
"base_loss": 0.3062588813006878,
"epoch": 2.0734329223632812,
"grad_norm": 0.0009360404801554978,
"learning_rate": 1.7441749572753908e-06,
"lookahead_loss": 6.235173214435577,
"loss": 0.3185,
"step": 506000
},
{
"base_loss": 0.3277868445813656,
"epoch": 2.0743865966796875,
"grad_norm": 0.0009577349992468953,
"learning_rate": 1.6964912414550783e-06,
"lookahead_loss": 6.249401865959167,
"loss": 0.3401,
"step": 506500
},
{
"base_loss": 0.30303199696540833,
"epoch": 2.0753402709960938,
"grad_norm": 0.0009694884065538645,
"learning_rate": 1.6488075256347657e-06,
"lookahead_loss": 6.3263852491378785,
"loss": 0.3149,
"step": 507000
},
{
"base_loss": 0.30761926966905595,
"epoch": 2.0762939453125,
"grad_norm": 0.0009187961695715785,
"learning_rate": 1.6011238098144532e-06,
"lookahead_loss": 6.275495934486389,
"loss": 0.3183,
"step": 507500
},
{
"base_loss": 0.33150802648067473,
"epoch": 2.0772476196289062,
"grad_norm": 0.0009673857130110264,
"learning_rate": 1.5534400939941406e-06,
"lookahead_loss": 6.266810004234314,
"loss": 0.344,
"step": 508000
},
{
"base_loss": 0.30574921500682833,
"epoch": 2.0782012939453125,
"grad_norm": 0.0009754388011060655,
"learning_rate": 1.505756378173828e-06,
"lookahead_loss": 6.266697756290435,
"loss": 0.3171,
"step": 508500
},
{
"base_loss": 0.2994054418802261,
"epoch": 2.0791549682617188,
"grad_norm": 0.0009331432520411909,
"learning_rate": 1.4580726623535158e-06,
"lookahead_loss": 6.26380909538269,
"loss": 0.313,
"step": 509000
},
{
"base_loss": 0.31194803246855735,
"epoch": 2.080108642578125,
"grad_norm": 0.0009484157781116664,
"learning_rate": 1.4103889465332032e-06,
"lookahead_loss": 6.247877294540405,
"loss": 0.3291,
"step": 509500
},
{
"base_loss": 0.3244352611005306,
"epoch": 2.0810623168945312,
"grad_norm": 0.001009124331176281,
"learning_rate": 1.3627052307128907e-06,
"lookahead_loss": 6.271973398685455,
"loss": 0.3387,
"step": 510000
},
{
"epoch": 2.0810623168945312,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.261683738269745,
"eval_lookahead_perplexity": 524.1006459335351,
"eval_loss": 0.14166270196437836,
"eval_perplexity": 1.1521879522544816,
"eval_runtime": 285.7481,
"eval_samples_per_second": 17.498,
"eval_steps_per_second": 0.549,
"step": 510000
},
{
"base_loss": 0.30112930870056154,
"epoch": 2.0820159912109375,
"grad_norm": 0.0009189407574012876,
"learning_rate": 1.3150215148925781e-06,
"lookahead_loss": 6.277680154800415,
"loss": 0.3128,
"step": 510500
},
{
"base_loss": 0.30448419651389125,
"epoch": 2.0829696655273438,
"grad_norm": 0.000990754459053278,
"learning_rate": 1.2673377990722656e-06,
"lookahead_loss": 6.309187083244324,
"loss": 0.3177,
"step": 511000
},
{
"base_loss": 0.33415990057587625,
"epoch": 2.08392333984375,
"grad_norm": 0.0009419569978490472,
"learning_rate": 1.2196540832519533e-06,
"lookahead_loss": 6.3219421377182,
"loss": 0.3452,
"step": 511500
},
{
"base_loss": 0.31056174263358116,
"epoch": 2.0848770141601562,
"grad_norm": 0.0009886783082038164,
"learning_rate": 1.1719703674316407e-06,
"lookahead_loss": 6.2479847407341005,
"loss": 0.3206,
"step": 512000
},
{
"base_loss": 0.2973758824914694,
"epoch": 2.0858306884765625,
"grad_norm": 0.0010188610758632421,
"learning_rate": 1.1242866516113282e-06,
"lookahead_loss": 6.2457886896133425,
"loss": 0.3085,
"step": 512500
},
{
"base_loss": 0.3042152850329876,
"epoch": 2.0867843627929688,
"grad_norm": 0.0010978945065289736,
"learning_rate": 1.0766029357910156e-06,
"lookahead_loss": 6.219885497093201,
"loss": 0.3136,
"step": 513000
},
{
"base_loss": 0.337257578343153,
"epoch": 2.087738037109375,
"grad_norm": 0.0008794477325864136,
"learning_rate": 1.028919219970703e-06,
"lookahead_loss": 6.267465775966644,
"loss": 0.3438,
"step": 513500
},
{
"base_loss": 0.3000968562066555,
"epoch": 2.0886917114257812,
"grad_norm": 0.001002022996544838,
"learning_rate": 9.812355041503908e-07,
"lookahead_loss": 6.260487885951996,
"loss": 0.3112,
"step": 514000
},
{
"base_loss": 0.3103375973403454,
"epoch": 2.0896453857421875,
"grad_norm": 0.0009891856461763382,
"learning_rate": 9.335517883300781e-07,
"lookahead_loss": 6.296863189697266,
"loss": 0.3182,
"step": 514500
},
{
"base_loss": 0.29991284269094465,
"epoch": 2.0905990600585938,
"grad_norm": 0.000973585934843868,
"learning_rate": 8.858680725097657e-07,
"lookahead_loss": 6.271089879989624,
"loss": 0.3117,
"step": 515000
},
{
"epoch": 2.0905990600585938,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.261462169714248,
"eval_lookahead_perplexity": 523.9845345742689,
"eval_loss": 0.14166229963302612,
"eval_perplexity": 1.152187488693238,
"eval_runtime": 284.7414,
"eval_samples_per_second": 17.56,
"eval_steps_per_second": 0.551,
"step": 515000
},
{
"base_loss": 0.3000768061578274,
"epoch": 2.091552734375,
"grad_norm": 0.0008802941883914173,
"learning_rate": 8.381843566894531e-07,
"lookahead_loss": 6.234204215049743,
"loss": 0.3106,
"step": 515500
},
{
"base_loss": 0.31945454320311545,
"epoch": 2.0925064086914062,
"grad_norm": 0.000959145778324455,
"learning_rate": 7.905006408691407e-07,
"lookahead_loss": 6.28009091091156,
"loss": 0.333,
"step": 516000
},
{
"base_loss": 0.309114942163229,
"epoch": 2.0934600830078125,
"grad_norm": 0.0008794525056146085,
"learning_rate": 7.428169250488282e-07,
"lookahead_loss": 6.279258125305176,
"loss": 0.3194,
"step": 516500
},
{
"base_loss": 0.28753899577260017,
"epoch": 2.0944137573242188,
"grad_norm": 0.0009603716316632926,
"learning_rate": 6.951332092285156e-07,
"lookahead_loss": 6.265149411201477,
"loss": 0.3032,
"step": 517000
},
{
"base_loss": 0.29245217123627665,
"epoch": 2.095367431640625,
"grad_norm": 0.0009777392260730267,
"learning_rate": 6.474494934082032e-07,
"lookahead_loss": 6.201946850299835,
"loss": 0.3077,
"step": 517500
},
{
"base_loss": 0.30112256136536597,
"epoch": 2.0963211059570312,
"grad_norm": 0.0009416543180122972,
"learning_rate": 5.997657775878906e-07,
"lookahead_loss": 6.2732035398483275,
"loss": 0.3133,
"step": 518000
},
{
"base_loss": 0.3297825155258179,
"epoch": 2.0972747802734375,
"grad_norm": 0.0009741596295498312,
"learning_rate": 5.520820617675782e-07,
"lookahead_loss": 6.294767707824707,
"loss": 0.3408,
"step": 518500
},
{
"base_loss": 0.2911633634865284,
"epoch": 2.0982284545898438,
"grad_norm": 0.0009707529679872096,
"learning_rate": 5.043983459472657e-07,
"lookahead_loss": 6.226530519485474,
"loss": 0.306,
"step": 519000
},
{
"base_loss": 0.2934698580801487,
"epoch": 2.09918212890625,
"grad_norm": 0.0009795463411137462,
"learning_rate": 4.5671463012695317e-07,
"lookahead_loss": 6.274832231521606,
"loss": 0.3095,
"step": 519500
},
{
"base_loss": 0.3032768616080284,
"epoch": 2.1001358032226562,
"grad_norm": 0.0009279102087020874,
"learning_rate": 4.0903091430664063e-07,
"lookahead_loss": 6.28268619632721,
"loss": 0.3147,
"step": 520000
},
{
"epoch": 2.1001358032226562,
"eval_accuracy": 0.002520743639921722,
"eval_base_loss": 0.1298022617856725,
"eval_base_perplexity": 1.1386032156965338,
"eval_lookahead_loss": 6.261365200383976,
"eval_lookahead_perplexity": 523.9337266083253,
"eval_loss": 0.1416620910167694,
"eval_perplexity": 1.152187248328222,
"eval_runtime": 316.6586,
"eval_samples_per_second": 15.79,
"eval_steps_per_second": 0.496,
"step": 520000
},
{
"base_loss": 0.3233104472160339,
"epoch": 2.1010894775390625,
"grad_norm": 0.000958006305154413,
"learning_rate": 3.6134719848632814e-07,
"lookahead_loss": 6.3454251170158384,
"loss": 0.333,
"step": 520500
},
{
"base_loss": 0.30158160945773127,
"epoch": 2.1020431518554688,
"grad_norm": 0.0009491976234130561,
"learning_rate": 3.1366348266601565e-07,
"lookahead_loss": 6.288103145599365,
"loss": 0.3126,
"step": 521000
},
{
"base_loss": 0.29719595339894295,
"epoch": 2.102996826171875,
"grad_norm": 0.0009692716994322836,
"learning_rate": 2.6597976684570316e-07,
"lookahead_loss": 6.270690215110779,
"loss": 0.3125,
"step": 521500
},
{
"base_loss": 0.3008777514696121,
"epoch": 2.1039505004882812,
"grad_norm": 0.0009985940996557474,
"learning_rate": 2.1829605102539064e-07,
"lookahead_loss": 6.300080471038818,
"loss": 0.3114,
"step": 522000
},
{
"base_loss": 0.31517351168394087,
"epoch": 2.1049041748046875,
"grad_norm": 0.0009895325638353825,
"learning_rate": 1.7061233520507813e-07,
"lookahead_loss": 6.232696861267089,
"loss": 0.3275,
"step": 522500
},
{
"base_loss": 0.3079349631667137,
"epoch": 2.1058578491210938,
"grad_norm": 0.0009340654360130429,
"learning_rate": 1.2292861938476564e-07,
"lookahead_loss": 6.239760284423828,
"loss": 0.3204,
"step": 523000
},
{
"base_loss": 0.29828172570466993,
"epoch": 2.1068115234375,
"grad_norm": 0.0009318156517110765,
"learning_rate": 7.524490356445312e-08,
"lookahead_loss": 6.308520089626312,
"loss": 0.3082,
"step": 523500
},
{
"base_loss": 0.29354105108976364,
"epoch": 2.1077651977539062,
"grad_norm": 0.0009783627465367317,
"learning_rate": 2.7561187744140627e-08,
"lookahead_loss": 6.2301955571174625,
"loss": 0.3059,
"step": 524000
},
{
"epoch": 2.1083145141601562,
"step": 524288,
"total_flos": 4.036319640756106e+19,
"train_loss": 0.07301426184130833,
"train_runtime": 100630.1012,
"train_samples_per_second": 166.722,
"train_steps_per_second": 5.21
}
],
"logging_steps": 500,
"max_steps": 524288,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.036319640756106e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}