{ "best_global_step": null, "best_metric": 4.036635398864746, "best_model_checkpoint": null, "epoch": 1.04632568359375, "eval_steps": 5000, "global_step": 524288, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00095367431640625, "grad_norm": 1.3474202156066895, "learning_rate": 4.995241165161133e-05, "lookahead_loss": 8.677686486244202, "loss": 8.0948, "step": 500 }, { "epoch": 0.0019073486328125, "grad_norm": 1.3769419193267822, "learning_rate": 4.990472793579102e-05, "lookahead_loss": 8.285415952682495, "loss": 7.4958, "step": 1000 }, { "epoch": 0.00286102294921875, "grad_norm": 1.0920134782791138, "learning_rate": 4.98570442199707e-05, "lookahead_loss": 8.032022035598755, "loss": 7.1909, "step": 1500 }, { "epoch": 0.003814697265625, "grad_norm": 1.1071102619171143, "learning_rate": 4.9809360504150393e-05, "lookahead_loss": 7.860327247619629, "loss": 7.0234, "step": 2000 }, { "epoch": 0.00476837158203125, "grad_norm": 0.9932270646095276, "learning_rate": 4.9761676788330084e-05, "lookahead_loss": 7.688726548194885, "loss": 6.8573, "step": 2500 }, { "epoch": 0.0057220458984375, "grad_norm": 0.8994491696357727, "learning_rate": 4.971399307250977e-05, "lookahead_loss": 7.590894039154053, "loss": 6.7701, "step": 3000 }, { "epoch": 0.00667572021484375, "grad_norm": 1.1505061388015747, "learning_rate": 4.966630935668946e-05, "lookahead_loss": 7.4404816112518315, "loss": 6.6554, "step": 3500 }, { "epoch": 0.00762939453125, "grad_norm": 0.9925533533096313, "learning_rate": 4.961862564086914e-05, "lookahead_loss": 7.556472114562988, "loss": 6.5799, "step": 4000 }, { "epoch": 0.00858306884765625, "grad_norm": 1.0438563823699951, "learning_rate": 4.957094192504883e-05, "lookahead_loss": 7.340022535324096, "loss": 6.4449, "step": 4500 }, { "epoch": 0.0095367431640625, "grad_norm": 0.9325507879257202, "learning_rate": 4.952325820922852e-05, "lookahead_loss": 7.342128164291382, "loss": 6.4709, "step": 5000 }, { "epoch": 0.0095367431640625, "eval_accuracy": 0.029982778864970645, "eval_lookahead_loss": 7.365461821937561, "eval_lookahead_perplexity": 1580.4451422425795, "eval_loss": 6.411215782165527, "eval_perplexity": 608.6331968113838, "eval_runtime": 513.5386, "eval_samples_per_second": 19.473, "eval_steps_per_second": 4.868, "step": 5000 }, { "epoch": 0.01049041748046875, "grad_norm": 1.1312073469161987, "learning_rate": 4.9475574493408205e-05, "lookahead_loss": 7.404867521286011, "loss": 6.4155, "step": 5500 }, { "epoch": 0.011444091796875, "grad_norm": 1.0023726224899292, "learning_rate": 4.9427890777587895e-05, "lookahead_loss": 7.3527754259109495, "loss": 6.4398, "step": 6000 }, { "epoch": 0.01239776611328125, "grad_norm": 1.0266180038452148, "learning_rate": 4.938020706176758e-05, "lookahead_loss": 7.35479855632782, "loss": 6.379, "step": 6500 }, { "epoch": 0.0133514404296875, "grad_norm": 1.016222357749939, "learning_rate": 4.933252334594727e-05, "lookahead_loss": 7.346290413856506, "loss": 6.3383, "step": 7000 }, { "epoch": 0.01430511474609375, "grad_norm": 0.9968783855438232, "learning_rate": 4.928483963012696e-05, "lookahead_loss": 7.309118425369262, "loss": 6.3058, "step": 7500 }, { "epoch": 0.0152587890625, "grad_norm": 0.9278364181518555, "learning_rate": 4.923715591430664e-05, "lookahead_loss": 7.305126391410828, "loss": 6.3001, "step": 8000 }, { "epoch": 0.01621246337890625, "grad_norm": 0.9722256660461426, "learning_rate": 4.918947219848633e-05, "lookahead_loss": 7.258668989181518, "loss": 6.2214, "step": 8500 }, { "epoch": 0.0171661376953125, "grad_norm": 1.1593012809753418, "learning_rate": 4.9141788482666016e-05, "lookahead_loss": 7.345213917732239, "loss": 6.1842, "step": 9000 }, { "epoch": 0.01811981201171875, "grad_norm": 1.016472578048706, "learning_rate": 4.9094104766845706e-05, "lookahead_loss": 7.310981082439422, "loss": 6.1755, "step": 9500 }, { "epoch": 0.019073486328125, "grad_norm": 1.072575330734253, "learning_rate": 4.9046421051025396e-05, "lookahead_loss": 7.207895273208618, "loss": 6.1634, "step": 10000 }, { "epoch": 0.019073486328125, "eval_accuracy": 0.028848923679060664, "eval_lookahead_loss": 7.294508046817779, "eval_lookahead_perplexity": 1472.1924729450327, "eval_loss": 6.138713359832764, "eval_perplexity": 463.45688483767555, "eval_runtime": 513.9526, "eval_samples_per_second": 19.457, "eval_steps_per_second": 4.864, "step": 10000 }, { "epoch": 0.02002716064453125, "grad_norm": 1.065335750579834, "learning_rate": 4.899873733520508e-05, "lookahead_loss": 7.29176104259491, "loss": 6.1072, "step": 10500 }, { "epoch": 0.0209808349609375, "grad_norm": 1.2301826477050781, "learning_rate": 4.895105361938477e-05, "lookahead_loss": 7.339878624916077, "loss": 6.1121, "step": 11000 }, { "epoch": 0.02193450927734375, "grad_norm": 1.306551456451416, "learning_rate": 4.890336990356445e-05, "lookahead_loss": 7.223239213943481, "loss": 6.1278, "step": 11500 }, { "epoch": 0.02288818359375, "grad_norm": 1.064714789390564, "learning_rate": 4.8855686187744143e-05, "lookahead_loss": 7.357453184127808, "loss": 6.1691, "step": 12000 }, { "epoch": 0.02384185791015625, "grad_norm": 1.4712648391723633, "learning_rate": 4.8808002471923834e-05, "lookahead_loss": 7.199277226448059, "loss": 6.0988, "step": 12500 }, { "epoch": 0.0247955322265625, "grad_norm": 1.37797212600708, "learning_rate": 4.876031875610352e-05, "lookahead_loss": 7.359523962020874, "loss": 6.1183, "step": 13000 }, { "epoch": 0.02574920654296875, "grad_norm": 1.4248344898223877, "learning_rate": 4.871263504028321e-05, "lookahead_loss": 7.248492773532868, "loss": 6.0322, "step": 13500 }, { "epoch": 0.026702880859375, "grad_norm": 1.6531237363815308, "learning_rate": 4.866495132446289e-05, "lookahead_loss": 7.317364872932434, "loss": 6.0274, "step": 14000 }, { "epoch": 0.02765655517578125, "grad_norm": 1.8214269876480103, "learning_rate": 4.861726760864258e-05, "lookahead_loss": 7.257651334762573, "loss": 6.0376, "step": 14500 }, { "epoch": 0.0286102294921875, "grad_norm": 1.9904179573059082, "learning_rate": 4.856958389282227e-05, "lookahead_loss": 7.31811006641388, "loss": 6.0273, "step": 15000 }, { "epoch": 0.0286102294921875, "eval_accuracy": 0.031200391389432484, "eval_lookahead_loss": 7.244623989105224, "eval_lookahead_perplexity": 1400.5551729510337, "eval_loss": 5.947416305541992, "eval_perplexity": 382.76311745163, "eval_runtime": 507.5748, "eval_samples_per_second": 19.702, "eval_steps_per_second": 4.925, "step": 15000 }, { "epoch": 0.02956390380859375, "grad_norm": 1.88814377784729, "learning_rate": 4.8521900177001955e-05, "lookahead_loss": 7.309946389198303, "loss": 6.038, "step": 15500 }, { "epoch": 0.030517578125, "grad_norm": 2.1016085147857666, "learning_rate": 4.8474216461181645e-05, "lookahead_loss": 7.251779208183288, "loss": 5.9717, "step": 16000 }, { "epoch": 0.03147125244140625, "grad_norm": 1.8789067268371582, "learning_rate": 4.842653274536133e-05, "lookahead_loss": 7.239742446899414, "loss": 5.9589, "step": 16500 }, { "epoch": 0.0324249267578125, "grad_norm": 1.9547315835952759, "learning_rate": 4.837884902954102e-05, "lookahead_loss": 7.262725432395935, "loss": 5.9105, "step": 17000 }, { "epoch": 0.03337860107421875, "grad_norm": 2.905271291732788, "learning_rate": 4.833116531372071e-05, "lookahead_loss": 7.222391340255737, "loss": 5.9442, "step": 17500 }, { "epoch": 0.034332275390625, "grad_norm": 2.4046437740325928, "learning_rate": 4.828348159790039e-05, "lookahead_loss": 7.311060576438904, "loss": 5.9834, "step": 18000 }, { "epoch": 0.03528594970703125, "grad_norm": 2.8731801509857178, "learning_rate": 4.823579788208008e-05, "lookahead_loss": 7.260329246520996, "loss": 5.9227, "step": 18500 }, { "epoch": 0.0362396240234375, "grad_norm": 3.013892412185669, "learning_rate": 4.8188114166259766e-05, "lookahead_loss": 7.312660473823548, "loss": 5.9048, "step": 19000 }, { "epoch": 0.03719329833984375, "grad_norm": 3.6560518741607666, "learning_rate": 4.8140430450439456e-05, "lookahead_loss": 7.268177618980408, "loss": 5.8917, "step": 19500 }, { "epoch": 0.03814697265625, "grad_norm": 3.1264445781707764, "learning_rate": 4.8092746734619146e-05, "lookahead_loss": 7.278836151123047, "loss": 5.89, "step": 20000 }, { "epoch": 0.03814697265625, "eval_accuracy": 0.03142152641878669, "eval_lookahead_loss": 7.210129296016693, "eval_lookahead_perplexity": 1353.0672022643917, "eval_loss": 5.8083038330078125, "eval_perplexity": 333.0537315626037, "eval_runtime": 503.1825, "eval_samples_per_second": 19.874, "eval_steps_per_second": 4.968, "step": 20000 }, { "epoch": 0.03910064697265625, "grad_norm": 3.2956721782684326, "learning_rate": 4.804506301879883e-05, "lookahead_loss": 7.161835201263428, "loss": 5.8973, "step": 20500 }, { "epoch": 0.0400543212890625, "grad_norm": 3.2284531593322754, "learning_rate": 4.799737930297852e-05, "lookahead_loss": 7.283536592483521, "loss": 5.8916, "step": 21000 }, { "epoch": 0.04100799560546875, "grad_norm": 4.024008750915527, "learning_rate": 4.79496955871582e-05, "lookahead_loss": 7.232271059989929, "loss": 5.8636, "step": 21500 }, { "epoch": 0.041961669921875, "grad_norm": 4.139217853546143, "learning_rate": 4.7902011871337893e-05, "lookahead_loss": 7.191415932178497, "loss": 5.8788, "step": 22000 }, { "epoch": 0.04291534423828125, "grad_norm": 5.551108360290527, "learning_rate": 4.7854328155517584e-05, "lookahead_loss": 7.315459197998047, "loss": 5.843, "step": 22500 }, { "epoch": 0.0438690185546875, "grad_norm": 4.512049198150635, "learning_rate": 4.780664443969727e-05, "lookahead_loss": 7.284794059753418, "loss": 5.8276, "step": 23000 }, { "epoch": 0.04482269287109375, "grad_norm": 4.516061782836914, "learning_rate": 4.775896072387696e-05, "lookahead_loss": 7.23037956237793, "loss": 5.8067, "step": 23500 }, { "epoch": 0.0457763671875, "grad_norm": 4.564533233642578, "learning_rate": 4.771127700805664e-05, "lookahead_loss": 7.195322487831116, "loss": 5.8543, "step": 24000 }, { "epoch": 0.04673004150390625, "grad_norm": 5.672128677368164, "learning_rate": 4.766359329223633e-05, "lookahead_loss": 7.32241688156128, "loss": 5.8528, "step": 24500 }, { "epoch": 0.0476837158203125, "grad_norm": 7.20906400680542, "learning_rate": 4.761590957641602e-05, "lookahead_loss": 7.2459747619628905, "loss": 5.7665, "step": 25000 }, { "epoch": 0.0476837158203125, "eval_accuracy": 0.03299902152641879, "eval_lookahead_loss": 7.188009054756164, "eval_lookahead_perplexity": 1323.4656332395036, "eval_loss": 5.719667911529541, "eval_perplexity": 304.8036843584245, "eval_runtime": 509.7231, "eval_samples_per_second": 19.618, "eval_steps_per_second": 4.905, "step": 25000 }, { "epoch": 0.04863739013671875, "grad_norm": 6.934261798858643, "learning_rate": 4.7568225860595705e-05, "lookahead_loss": 7.234013737201691, "loss": 5.8174, "step": 25500 }, { "epoch": 0.049591064453125, "grad_norm": 5.836729526519775, "learning_rate": 4.7520542144775395e-05, "lookahead_loss": 7.156494901657105, "loss": 5.7817, "step": 26000 }, { "epoch": 0.05054473876953125, "grad_norm": 4.637109279632568, "learning_rate": 4.747285842895508e-05, "lookahead_loss": 7.121358276367188, "loss": 5.7763, "step": 26500 }, { "epoch": 0.0514984130859375, "grad_norm": 6.888278484344482, "learning_rate": 4.742517471313477e-05, "lookahead_loss": 7.189897603034973, "loss": 5.788, "step": 27000 }, { "epoch": 0.05245208740234375, "grad_norm": 5.705153465270996, "learning_rate": 4.737749099731446e-05, "lookahead_loss": 7.287825613975525, "loss": 5.768, "step": 27500 }, { "epoch": 0.05340576171875, "grad_norm": 6.031825065612793, "learning_rate": 4.732980728149414e-05, "lookahead_loss": 7.160476461410522, "loss": 5.7623, "step": 28000 }, { "epoch": 0.05435943603515625, "grad_norm": 6.522262096405029, "learning_rate": 4.728212356567383e-05, "lookahead_loss": 7.253725218772888, "loss": 5.7446, "step": 28500 }, { "epoch": 0.0553131103515625, "grad_norm": 10.468609809875488, "learning_rate": 4.7234439849853516e-05, "lookahead_loss": 7.088342420578003, "loss": 5.6968, "step": 29000 }, { "epoch": 0.05626678466796875, "grad_norm": 9.363543510437012, "learning_rate": 4.7186756134033206e-05, "lookahead_loss": 7.1719634160995485, "loss": 5.7355, "step": 29500 }, { "epoch": 0.057220458984375, "grad_norm": 7.383028030395508, "learning_rate": 4.7139072418212896e-05, "lookahead_loss": 7.133569308280945, "loss": 5.6891, "step": 30000 }, { "epoch": 0.057220458984375, "eval_accuracy": 0.03113013698630137, "eval_lookahead_loss": 7.202668938064575, "eval_lookahead_perplexity": 1343.0103969302609, "eval_loss": 5.690493583679199, "eval_perplexity": 296.0397049518614, "eval_runtime": 504.2306, "eval_samples_per_second": 19.832, "eval_steps_per_second": 4.958, "step": 30000 }, { "epoch": 0.05817413330078125, "grad_norm": 6.521446704864502, "learning_rate": 4.709138870239258e-05, "lookahead_loss": 7.164209529876709, "loss": 5.6804, "step": 30500 }, { "epoch": 0.0591278076171875, "grad_norm": 5.763637065887451, "learning_rate": 4.704370498657227e-05, "lookahead_loss": 7.144323136329651, "loss": 5.6598, "step": 31000 }, { "epoch": 0.06008148193359375, "grad_norm": 6.887425899505615, "learning_rate": 4.699602127075195e-05, "lookahead_loss": 7.292964460372925, "loss": 5.7812, "step": 31500 }, { "epoch": 0.06103515625, "grad_norm": 9.434282302856445, "learning_rate": 4.6948337554931643e-05, "lookahead_loss": 7.219927433013916, "loss": 5.7674, "step": 32000 }, { "epoch": 0.06198883056640625, "grad_norm": 10.704363822937012, "learning_rate": 4.6900653839111334e-05, "lookahead_loss": 7.155441507339478, "loss": 5.7215, "step": 32500 }, { "epoch": 0.0629425048828125, "grad_norm": 8.569531440734863, "learning_rate": 4.685297012329102e-05, "lookahead_loss": 7.112296872615814, "loss": 5.648, "step": 33000 }, { "epoch": 0.06389617919921875, "grad_norm": 10.21805477142334, "learning_rate": 4.680528640747071e-05, "lookahead_loss": 7.270873064994812, "loss": 5.763, "step": 33500 }, { "epoch": 0.064849853515625, "grad_norm": 8.476314544677734, "learning_rate": 4.675760269165039e-05, "lookahead_loss": 7.2414698429107665, "loss": 5.6687, "step": 34000 }, { "epoch": 0.06580352783203125, "grad_norm": 11.159482955932617, "learning_rate": 4.670991897583008e-05, "lookahead_loss": 7.25000864315033, "loss": 5.7207, "step": 34500 }, { "epoch": 0.0667572021484375, "grad_norm": 16.375673294067383, "learning_rate": 4.666223526000977e-05, "lookahead_loss": 7.232049855232239, "loss": 5.6926, "step": 35000 }, { "epoch": 0.0667572021484375, "eval_accuracy": 0.03229138943248532, "eval_lookahead_loss": 7.135774825954437, "eval_lookahead_perplexity": 1256.1098783373734, "eval_loss": 5.58113956451416, "eval_perplexity": 265.37384415924726, "eval_runtime": 509.9319, "eval_samples_per_second": 19.61, "eval_steps_per_second": 4.903, "step": 35000 }, { "epoch": 0.06771087646484375, "grad_norm": 11.827472686767578, "learning_rate": 4.6614551544189455e-05, "lookahead_loss": 7.138879651069641, "loss": 5.6873, "step": 35500 }, { "epoch": 0.06866455078125, "grad_norm": 26.275428771972656, "learning_rate": 4.6566867828369145e-05, "lookahead_loss": 7.2058644151687625, "loss": 5.6855, "step": 36000 }, { "epoch": 0.06961822509765625, "grad_norm": 10.221647262573242, "learning_rate": 4.651918411254883e-05, "lookahead_loss": 7.209253810882569, "loss": 5.7246, "step": 36500 }, { "epoch": 0.0705718994140625, "grad_norm": 8.884531021118164, "learning_rate": 4.647150039672852e-05, "lookahead_loss": 7.1476358222961425, "loss": 5.638, "step": 37000 }, { "epoch": 0.07152557373046875, "grad_norm": 10.094484329223633, "learning_rate": 4.642381668090821e-05, "lookahead_loss": 7.243965757846833, "loss": 5.6566, "step": 37500 }, { "epoch": 0.072479248046875, "grad_norm": 11.710677146911621, "learning_rate": 4.637613296508789e-05, "lookahead_loss": 7.229899419784546, "loss": 5.6524, "step": 38000 }, { "epoch": 0.07343292236328125, "grad_norm": 11.815423011779785, "learning_rate": 4.632844924926758e-05, "lookahead_loss": 7.103484579086304, "loss": 5.6128, "step": 38500 }, { "epoch": 0.0743865966796875, "grad_norm": 8.337890625, "learning_rate": 4.6280765533447266e-05, "lookahead_loss": 7.231636068344116, "loss": 5.6565, "step": 39000 }, { "epoch": 0.07534027099609375, "grad_norm": 6.702563285827637, "learning_rate": 4.6233081817626956e-05, "lookahead_loss": 7.159618255615235, "loss": 5.6163, "step": 39500 }, { "epoch": 0.0762939453125, "grad_norm": 7.328082084655762, "learning_rate": 4.6185398101806646e-05, "lookahead_loss": 7.151073882102966, "loss": 5.6076, "step": 40000 }, { "epoch": 0.0762939453125, "eval_accuracy": 0.03238884540117417, "eval_lookahead_loss": 7.116009107303619, "eval_lookahead_perplexity": 1231.5257259872412, "eval_loss": 5.517475605010986, "eval_perplexity": 249.0056544922258, "eval_runtime": 507.0936, "eval_samples_per_second": 19.72, "eval_steps_per_second": 4.93, "step": 40000 }, { "epoch": 0.07724761962890625, "grad_norm": 11.30929946899414, "learning_rate": 4.613771438598633e-05, "lookahead_loss": 7.0856515436172485, "loss": 5.6168, "step": 40500 }, { "epoch": 0.0782012939453125, "grad_norm": 12.018611907958984, "learning_rate": 4.609003067016602e-05, "lookahead_loss": 7.199606026649475, "loss": 5.6234, "step": 41000 }, { "epoch": 0.07915496826171875, "grad_norm": 21.610713958740234, "learning_rate": 4.60423469543457e-05, "lookahead_loss": 7.089690165519714, "loss": 5.6095, "step": 41500 }, { "epoch": 0.080108642578125, "grad_norm": 9.74264907836914, "learning_rate": 4.5994663238525393e-05, "lookahead_loss": 7.10763978767395, "loss": 5.6204, "step": 42000 }, { "epoch": 0.08106231689453125, "grad_norm": 11.829455375671387, "learning_rate": 4.5946979522705084e-05, "lookahead_loss": 7.117177209854126, "loss": 5.5599, "step": 42500 }, { "epoch": 0.0820159912109375, "grad_norm": 8.984912872314453, "learning_rate": 4.589929580688477e-05, "lookahead_loss": 7.175943549156189, "loss": 5.5917, "step": 43000 }, { "epoch": 0.08296966552734375, "grad_norm": 11.498998641967773, "learning_rate": 4.585161209106446e-05, "lookahead_loss": 7.0405034160614015, "loss": 5.562, "step": 43500 }, { "epoch": 0.08392333984375, "grad_norm": 10.168523788452148, "learning_rate": 4.580392837524414e-05, "lookahead_loss": 7.139830492973328, "loss": 5.5785, "step": 44000 }, { "epoch": 0.08487701416015625, "grad_norm": 12.835909843444824, "learning_rate": 4.575624465942383e-05, "lookahead_loss": 7.207131398677826, "loss": 5.5692, "step": 44500 }, { "epoch": 0.0858306884765625, "grad_norm": 14.897501945495605, "learning_rate": 4.570856094360352e-05, "lookahead_loss": 7.1923885850906375, "loss": 5.5375, "step": 45000 }, { "epoch": 0.0858306884765625, "eval_accuracy": 0.03146966731898239, "eval_lookahead_loss": 7.118142500019074, "eval_lookahead_perplexity": 1234.1558585552302, "eval_loss": 5.470874786376953, "eval_perplexity": 237.66801058771193, "eval_runtime": 505.5165, "eval_samples_per_second": 19.782, "eval_steps_per_second": 4.945, "step": 45000 }, { "epoch": 0.08678436279296875, "grad_norm": 10.744502067565918, "learning_rate": 4.5660877227783205e-05, "lookahead_loss": 7.1796361961364745, "loss": 5.5512, "step": 45500 }, { "epoch": 0.087738037109375, "grad_norm": 11.42258358001709, "learning_rate": 4.5613193511962895e-05, "lookahead_loss": 7.051038390159607, "loss": 5.5718, "step": 46000 }, { "epoch": 0.08869171142578125, "grad_norm": 12.666010856628418, "learning_rate": 4.556550979614258e-05, "lookahead_loss": 7.118438813209534, "loss": 5.5118, "step": 46500 }, { "epoch": 0.0896453857421875, "grad_norm": 10.014766693115234, "learning_rate": 4.551782608032227e-05, "lookahead_loss": 7.125885098457337, "loss": 5.5524, "step": 47000 }, { "epoch": 0.09059906005859375, "grad_norm": 9.59919548034668, "learning_rate": 4.547014236450196e-05, "lookahead_loss": 7.1186350011825565, "loss": 5.4701, "step": 47500 }, { "epoch": 0.091552734375, "grad_norm": 16.168424606323242, "learning_rate": 4.542245864868164e-05, "lookahead_loss": 7.167111631393433, "loss": 5.5113, "step": 48000 }, { "epoch": 0.09250640869140625, "grad_norm": 12.184856414794922, "learning_rate": 4.537477493286133e-05, "lookahead_loss": 7.134662317276001, "loss": 5.4575, "step": 48500 }, { "epoch": 0.0934600830078125, "grad_norm": 44.705780029296875, "learning_rate": 4.5327091217041016e-05, "lookahead_loss": 7.177341228961945, "loss": 5.5078, "step": 49000 }, { "epoch": 0.09441375732421875, "grad_norm": 8.28176498413086, "learning_rate": 4.5279407501220706e-05, "lookahead_loss": 7.187461320877075, "loss": 5.512, "step": 49500 }, { "epoch": 0.095367431640625, "grad_norm": 12.484027862548828, "learning_rate": 4.523172378540039e-05, "lookahead_loss": 7.266454363822937, "loss": 5.5712, "step": 50000 }, { "epoch": 0.095367431640625, "eval_accuracy": 0.03226536203522505, "eval_lookahead_loss": 7.08289865732193, "eval_lookahead_perplexity": 1191.417027762198, "eval_loss": 5.416250228881836, "eval_perplexity": 225.03371352794844, "eval_runtime": 504.4571, "eval_samples_per_second": 19.823, "eval_steps_per_second": 4.956, "step": 50000 }, { "epoch": 0.09632110595703125, "grad_norm": 24.848735809326172, "learning_rate": 4.518404006958008e-05, "lookahead_loss": 7.178735790252685, "loss": 5.5959, "step": 50500 }, { "epoch": 0.0972747802734375, "grad_norm": 17.089916229248047, "learning_rate": 4.513635635375977e-05, "lookahead_loss": 7.208560256958008, "loss": 5.5382, "step": 51000 }, { "epoch": 0.09822845458984375, "grad_norm": 13.217761039733887, "learning_rate": 4.508867263793945e-05, "lookahead_loss": 7.181252511024475, "loss": 5.5787, "step": 51500 }, { "epoch": 0.09918212890625, "grad_norm": 15.83373737335205, "learning_rate": 4.5040988922119143e-05, "lookahead_loss": 7.183449371337891, "loss": 5.5108, "step": 52000 }, { "epoch": 0.10013580322265625, "grad_norm": 14.428374290466309, "learning_rate": 4.499330520629883e-05, "lookahead_loss": 7.160496368408203, "loss": 5.5412, "step": 52500 }, { "epoch": 0.1010894775390625, "grad_norm": 12.822569847106934, "learning_rate": 4.494562149047852e-05, "lookahead_loss": 7.1611150960922245, "loss": 5.533, "step": 53000 }, { "epoch": 0.10204315185546875, "grad_norm": 20.030109405517578, "learning_rate": 4.489793777465821e-05, "lookahead_loss": 7.083055003166199, "loss": 5.4679, "step": 53500 }, { "epoch": 0.102996826171875, "grad_norm": 13.136330604553223, "learning_rate": 4.485025405883789e-05, "lookahead_loss": 7.208622979164123, "loss": 5.5208, "step": 54000 }, { "epoch": 0.10395050048828125, "grad_norm": 12.386436462402344, "learning_rate": 4.480257034301758e-05, "lookahead_loss": 7.102167963981628, "loss": 5.426, "step": 54500 }, { "epoch": 0.1049041748046875, "grad_norm": 23.099088668823242, "learning_rate": 4.4754886627197264e-05, "lookahead_loss": 7.126259490966797, "loss": 5.5664, "step": 55000 }, { "epoch": 0.1049041748046875, "eval_accuracy": 0.03050880626223092, "eval_lookahead_loss": 7.05611174621582, "eval_lookahead_perplexity": 1159.9262983953224, "eval_loss": 5.387856960296631, "eval_perplexity": 218.7341269950547, "eval_runtime": 500.6811, "eval_samples_per_second": 19.973, "eval_steps_per_second": 4.993, "step": 55000 }, { "epoch": 0.10585784912109375, "grad_norm": 27.903282165527344, "learning_rate": 4.4707202911376955e-05, "lookahead_loss": 7.120815940856934, "loss": 5.5236, "step": 55500 }, { "epoch": 0.1068115234375, "grad_norm": 16.345800399780273, "learning_rate": 4.4659519195556645e-05, "lookahead_loss": 7.168801429748535, "loss": 5.4683, "step": 56000 }, { "epoch": 0.10776519775390625, "grad_norm": 17.027198791503906, "learning_rate": 4.461183547973633e-05, "lookahead_loss": 7.223023843765259, "loss": 5.5291, "step": 56500 }, { "epoch": 0.1087188720703125, "grad_norm": 7.567070007324219, "learning_rate": 4.456415176391602e-05, "lookahead_loss": 7.094460361480713, "loss": 5.4936, "step": 57000 }, { "epoch": 0.10967254638671875, "grad_norm": 15.553812980651855, "learning_rate": 4.45164680480957e-05, "lookahead_loss": 7.117517959594727, "loss": 5.4386, "step": 57500 }, { "epoch": 0.110626220703125, "grad_norm": 53.542049407958984, "learning_rate": 4.446878433227539e-05, "lookahead_loss": 6.969122802734375, "loss": 5.2634, "step": 58000 }, { "epoch": 0.11157989501953125, "grad_norm": 16.33045196533203, "learning_rate": 4.442110061645508e-05, "lookahead_loss": 7.076863143920899, "loss": 5.4985, "step": 58500 }, { "epoch": 0.1125335693359375, "grad_norm": 20.980016708374023, "learning_rate": 4.4373416900634766e-05, "lookahead_loss": 7.099330670356751, "loss": 5.5065, "step": 59000 }, { "epoch": 0.11348724365234375, "grad_norm": 17.214855194091797, "learning_rate": 4.4325733184814456e-05, "lookahead_loss": 7.040463229179382, "loss": 5.4538, "step": 59500 }, { "epoch": 0.11444091796875, "grad_norm": 13.367892265319824, "learning_rate": 4.427804946899414e-05, "lookahead_loss": 7.0823855390548704, "loss": 5.4285, "step": 60000 }, { "epoch": 0.11444091796875, "eval_accuracy": 0.031926614481409, "eval_lookahead_loss": 7.042757276535034, "eval_lookahead_perplexity": 1144.5390706263672, "eval_loss": 5.354024410247803, "eval_perplexity": 211.4575798421855, "eval_runtime": 501.5271, "eval_samples_per_second": 19.939, "eval_steps_per_second": 4.985, "step": 60000 }, { "epoch": 0.11539459228515625, "grad_norm": 10.866477966308594, "learning_rate": 4.423036575317383e-05, "lookahead_loss": 7.069445035934448, "loss": 5.4456, "step": 60500 }, { "epoch": 0.1163482666015625, "grad_norm": 18.253215789794922, "learning_rate": 4.418268203735352e-05, "lookahead_loss": 7.1344916515350345, "loss": 5.4407, "step": 61000 }, { "epoch": 0.11730194091796875, "grad_norm": 10.5404052734375, "learning_rate": 4.41349983215332e-05, "lookahead_loss": 7.04019455909729, "loss": 5.4007, "step": 61500 }, { "epoch": 0.118255615234375, "grad_norm": 11.506131172180176, "learning_rate": 4.4087314605712893e-05, "lookahead_loss": 7.095839465141297, "loss": 5.4524, "step": 62000 }, { "epoch": 0.11920928955078125, "grad_norm": 14.382362365722656, "learning_rate": 4.403963088989258e-05, "lookahead_loss": 7.0705813159942625, "loss": 5.438, "step": 62500 }, { "epoch": 0.1201629638671875, "grad_norm": 22.689937591552734, "learning_rate": 4.399194717407227e-05, "lookahead_loss": 7.129315278053284, "loss": 5.47, "step": 63000 }, { "epoch": 0.12111663818359375, "grad_norm": 13.494345664978027, "learning_rate": 4.394426345825196e-05, "lookahead_loss": 7.077808388710022, "loss": 5.4315, "step": 63500 }, { "epoch": 0.1220703125, "grad_norm": 10.685275077819824, "learning_rate": 4.389657974243164e-05, "lookahead_loss": 7.085301048755646, "loss": 5.3617, "step": 64000 }, { "epoch": 0.12302398681640625, "grad_norm": 11.87861442565918, "learning_rate": 4.384889602661133e-05, "lookahead_loss": 7.16679208278656, "loss": 5.4257, "step": 64500 }, { "epoch": 0.1239776611328125, "grad_norm": 16.03499984741211, "learning_rate": 4.3801212310791014e-05, "lookahead_loss": 7.031316479682922, "loss": 5.3761, "step": 65000 }, { "epoch": 0.1239776611328125, "eval_accuracy": 0.03238180039138943, "eval_lookahead_loss": 7.022714523029327, "eval_lookahead_perplexity": 1121.8277154827879, "eval_loss": 5.285512924194336, "eval_perplexity": 197.4554371492178, "eval_runtime": 503.607, "eval_samples_per_second": 19.857, "eval_steps_per_second": 4.964, "step": 65000 }, { "epoch": 0.12493133544921875, "grad_norm": 8.821488380432129, "learning_rate": 4.3753528594970705e-05, "lookahead_loss": 7.021174932479858, "loss": 5.3571, "step": 65500 }, { "epoch": 0.125885009765625, "grad_norm": 21.386552810668945, "learning_rate": 4.3705844879150395e-05, "lookahead_loss": 7.015764747619629, "loss": 5.371, "step": 66000 }, { "epoch": 0.12683868408203125, "grad_norm": 25.545211791992188, "learning_rate": 4.365816116333008e-05, "lookahead_loss": 7.094599558830261, "loss": 5.3509, "step": 66500 }, { "epoch": 0.1277923583984375, "grad_norm": 13.755099296569824, "learning_rate": 4.361047744750977e-05, "lookahead_loss": 7.000915126800537, "loss": 5.3614, "step": 67000 }, { "epoch": 0.12874603271484375, "grad_norm": 34.34740447998047, "learning_rate": 4.356279373168945e-05, "lookahead_loss": 7.141147650718689, "loss": 5.4116, "step": 67500 }, { "epoch": 0.12969970703125, "grad_norm": 16.219282150268555, "learning_rate": 4.351511001586914e-05, "lookahead_loss": 7.131195318698883, "loss": 5.5064, "step": 68000 }, { "epoch": 0.13065338134765625, "grad_norm": 21.588878631591797, "learning_rate": 4.346742630004883e-05, "lookahead_loss": 7.021893961906433, "loss": 5.4284, "step": 68500 }, { "epoch": 0.1316070556640625, "grad_norm": 69.18465423583984, "learning_rate": 4.3419742584228516e-05, "lookahead_loss": 7.060449453353882, "loss": 5.4347, "step": 69000 }, { "epoch": 0.13256072998046875, "grad_norm": 17.7547550201416, "learning_rate": 4.3372058868408206e-05, "lookahead_loss": 7.124897184371949, "loss": 5.4015, "step": 69500 }, { "epoch": 0.133514404296875, "grad_norm": 17.275911331176758, "learning_rate": 4.332437515258789e-05, "lookahead_loss": 7.0240634059906, "loss": 5.3878, "step": 70000 }, { "epoch": 0.133514404296875, "eval_accuracy": 0.03167221135029354, "eval_lookahead_loss": 7.012924317264557, "eval_lookahead_perplexity": 1110.898378839129, "eval_loss": 5.255021572113037, "eval_perplexity": 191.52561742208215, "eval_runtime": 507.5694, "eval_samples_per_second": 19.702, "eval_steps_per_second": 4.925, "step": 70000 }, { "epoch": 0.13446807861328125, "grad_norm": 16.54254913330078, "learning_rate": 4.327669143676758e-05, "lookahead_loss": 7.107575142383576, "loss": 5.4072, "step": 70500 }, { "epoch": 0.1354217529296875, "grad_norm": 17.765472412109375, "learning_rate": 4.322900772094727e-05, "lookahead_loss": 7.040564844131469, "loss": 5.4069, "step": 71000 }, { "epoch": 0.13637542724609375, "grad_norm": 11.293356895446777, "learning_rate": 4.318132400512695e-05, "lookahead_loss": 7.088335141658783, "loss": 5.368, "step": 71500 }, { "epoch": 0.1373291015625, "grad_norm": 9.786500930786133, "learning_rate": 4.3133640289306643e-05, "lookahead_loss": 7.091951345443726, "loss": 5.3819, "step": 72000 }, { "epoch": 0.13828277587890625, "grad_norm": 10.401778221130371, "learning_rate": 4.308595657348633e-05, "lookahead_loss": 7.143118433952331, "loss": 5.3827, "step": 72500 }, { "epoch": 0.1392364501953125, "grad_norm": 28.415803909301758, "learning_rate": 4.303827285766602e-05, "lookahead_loss": 7.089268786430359, "loss": 5.3647, "step": 73000 }, { "epoch": 0.14019012451171875, "grad_norm": 13.403910636901855, "learning_rate": 4.299058914184571e-05, "lookahead_loss": 7.096157086372376, "loss": 5.3713, "step": 73500 }, { "epoch": 0.141143798828125, "grad_norm": 28.016172409057617, "learning_rate": 4.294290542602539e-05, "lookahead_loss": 7.157042339324951, "loss": 5.3408, "step": 74000 }, { "epoch": 0.14209747314453125, "grad_norm": 16.44430160522461, "learning_rate": 4.289522171020508e-05, "lookahead_loss": 7.151892686843872, "loss": 5.3204, "step": 74500 }, { "epoch": 0.1430511474609375, "grad_norm": 11.597758293151855, "learning_rate": 4.2847537994384764e-05, "lookahead_loss": 7.094533257484436, "loss": 5.3311, "step": 75000 }, { "epoch": 0.1430511474609375, "eval_accuracy": 0.031123091976516633, "eval_lookahead_loss": 6.991511741924286, "eval_lookahead_perplexity": 1087.3640481191728, "eval_loss": 5.2316412925720215, "eval_perplexity": 187.09963689388135, "eval_runtime": 497.9004, "eval_samples_per_second": 20.084, "eval_steps_per_second": 5.021, "step": 75000 }, { "epoch": 0.14400482177734375, "grad_norm": 14.603766441345215, "learning_rate": 4.2799854278564455e-05, "lookahead_loss": 7.027197382926941, "loss": 5.3552, "step": 75500 }, { "epoch": 0.14495849609375, "grad_norm": 16.020191192626953, "learning_rate": 4.2752170562744145e-05, "lookahead_loss": 7.048828477859497, "loss": 5.3477, "step": 76000 }, { "epoch": 0.14591217041015625, "grad_norm": 18.4349308013916, "learning_rate": 4.270448684692383e-05, "lookahead_loss": 7.1001180319786075, "loss": 5.3829, "step": 76500 }, { "epoch": 0.1468658447265625, "grad_norm": 13.536137580871582, "learning_rate": 4.265680313110352e-05, "lookahead_loss": 6.992280686378479, "loss": 5.3369, "step": 77000 }, { "epoch": 0.14781951904296875, "grad_norm": 25.51408576965332, "learning_rate": 4.26091194152832e-05, "lookahead_loss": 7.062615676403046, "loss": 5.3185, "step": 77500 }, { "epoch": 0.148773193359375, "grad_norm": 19.138198852539062, "learning_rate": 4.256143569946289e-05, "lookahead_loss": 7.062578455924988, "loss": 5.3111, "step": 78000 }, { "epoch": 0.14972686767578125, "grad_norm": 10.740262985229492, "learning_rate": 4.251375198364258e-05, "lookahead_loss": 7.058840200901032, "loss": 5.3182, "step": 78500 }, { "epoch": 0.1506805419921875, "grad_norm": 11.959470748901367, "learning_rate": 4.2466068267822266e-05, "lookahead_loss": 7.089667859077454, "loss": 5.3119, "step": 79000 }, { "epoch": 0.15163421630859375, "grad_norm": 20.028032302856445, "learning_rate": 4.2418384552001956e-05, "lookahead_loss": 7.040053847312927, "loss": 5.2756, "step": 79500 }, { "epoch": 0.152587890625, "grad_norm": 14.42139720916748, "learning_rate": 4.237070083618164e-05, "lookahead_loss": 6.960290806770325, "loss": 5.192, "step": 80000 }, { "epoch": 0.152587890625, "eval_accuracy": 0.030379843444227006, "eval_lookahead_loss": 6.978407783412933, "eval_lookahead_perplexity": 1073.208225960849, "eval_loss": 5.176183700561523, "eval_perplexity": 177.0060124359365, "eval_runtime": 510.9461, "eval_samples_per_second": 19.572, "eval_steps_per_second": 4.893, "step": 80000 }, { "epoch": 0.15354156494140625, "grad_norm": 26.987417221069336, "learning_rate": 4.232301712036133e-05, "lookahead_loss": 7.073492568016052, "loss": 5.3042, "step": 80500 }, { "epoch": 0.1544952392578125, "grad_norm": 15.75604248046875, "learning_rate": 4.227533340454102e-05, "lookahead_loss": 7.047906914710999, "loss": 5.2509, "step": 81000 }, { "epoch": 0.15544891357421875, "grad_norm": 26.492666244506836, "learning_rate": 4.22276496887207e-05, "lookahead_loss": 7.101487815856934, "loss": 5.2952, "step": 81500 }, { "epoch": 0.156402587890625, "grad_norm": 11.874725341796875, "learning_rate": 4.2179965972900393e-05, "lookahead_loss": 7.032579688072205, "loss": 5.2934, "step": 82000 }, { "epoch": 0.15735626220703125, "grad_norm": 13.683402061462402, "learning_rate": 4.213228225708008e-05, "lookahead_loss": 6.8860186071395875, "loss": 5.2394, "step": 82500 }, { "epoch": 0.1583099365234375, "grad_norm": 19.220481872558594, "learning_rate": 4.208459854125977e-05, "lookahead_loss": 7.025292691230774, "loss": 5.2301, "step": 83000 }, { "epoch": 0.15926361083984375, "grad_norm": 11.837571144104004, "learning_rate": 4.203691482543946e-05, "lookahead_loss": 6.9911140441894535, "loss": 5.2179, "step": 83500 }, { "epoch": 0.16021728515625, "grad_norm": 14.403276443481445, "learning_rate": 4.198923110961914e-05, "lookahead_loss": 6.9934060640335085, "loss": 5.2191, "step": 84000 }, { "epoch": 0.16117095947265625, "grad_norm": 14.482048034667969, "learning_rate": 4.194154739379883e-05, "lookahead_loss": 7.053557103157043, "loss": 5.2539, "step": 84500 }, { "epoch": 0.1621246337890625, "grad_norm": 15.262174606323242, "learning_rate": 4.1893863677978514e-05, "lookahead_loss": 7.025487617492676, "loss": 5.3575, "step": 85000 }, { "epoch": 0.1621246337890625, "eval_accuracy": 0.03066849315068493, "eval_lookahead_loss": 6.960791391468048, "eval_lookahead_perplexity": 1054.4677238826143, "eval_loss": 5.133950710296631, "eval_perplexity": 169.686176471233, "eval_runtime": 515.4392, "eval_samples_per_second": 19.401, "eval_steps_per_second": 4.85, "step": 85000 }, { "epoch": 0.16307830810546875, "grad_norm": 13.788129806518555, "learning_rate": 4.1846179962158205e-05, "lookahead_loss": 7.180290727615357, "loss": 5.3688, "step": 85500 }, { "epoch": 0.164031982421875, "grad_norm": 29.578916549682617, "learning_rate": 4.1798496246337895e-05, "lookahead_loss": 7.06610741853714, "loss": 5.3275, "step": 86000 }, { "epoch": 0.16498565673828125, "grad_norm": 32.00334930419922, "learning_rate": 4.175081253051758e-05, "lookahead_loss": 7.127366899490356, "loss": 5.3083, "step": 86500 }, { "epoch": 0.1659393310546875, "grad_norm": 18.280853271484375, "learning_rate": 4.170312881469727e-05, "lookahead_loss": 6.94872332572937, "loss": 5.2966, "step": 87000 }, { "epoch": 0.16689300537109375, "grad_norm": 30.88824462890625, "learning_rate": 4.165544509887695e-05, "lookahead_loss": 7.004853561401367, "loss": 5.2633, "step": 87500 }, { "epoch": 0.1678466796875, "grad_norm": 14.328486442565918, "learning_rate": 4.160776138305664e-05, "lookahead_loss": 7.062825730323792, "loss": 5.2541, "step": 88000 }, { "epoch": 0.16880035400390625, "grad_norm": 17.42473602294922, "learning_rate": 4.156007766723633e-05, "lookahead_loss": 7.042470587730408, "loss": 5.2599, "step": 88500 }, { "epoch": 0.1697540283203125, "grad_norm": 17.593931198120117, "learning_rate": 4.1512393951416016e-05, "lookahead_loss": 7.022479557037354, "loss": 5.2539, "step": 89000 }, { "epoch": 0.17070770263671875, "grad_norm": 12.986210823059082, "learning_rate": 4.1464710235595706e-05, "lookahead_loss": 7.018770641326904, "loss": 5.2892, "step": 89500 }, { "epoch": 0.171661376953125, "grad_norm": 14.76010799407959, "learning_rate": 4.141702651977539e-05, "lookahead_loss": 7.068565278053284, "loss": 5.2409, "step": 90000 }, { "epoch": 0.171661376953125, "eval_accuracy": 0.029615068493150685, "eval_lookahead_loss": 6.95743630695343, "eval_lookahead_perplexity": 1050.9358237758254, "eval_loss": 5.104590892791748, "eval_perplexity": 164.7766454271377, "eval_runtime": 523.2798, "eval_samples_per_second": 19.11, "eval_steps_per_second": 4.778, "step": 90000 }, { "epoch": 0.17261505126953125, "grad_norm": 36.2658805847168, "learning_rate": 4.136934280395508e-05, "lookahead_loss": 7.053651679992676, "loss": 5.2371, "step": 90500 }, { "epoch": 0.1735687255859375, "grad_norm": 12.038991928100586, "learning_rate": 4.132165908813477e-05, "lookahead_loss": 7.076903000831604, "loss": 5.211, "step": 91000 }, { "epoch": 0.17452239990234375, "grad_norm": 14.45565128326416, "learning_rate": 4.127397537231445e-05, "lookahead_loss": 7.085323967933655, "loss": 5.243, "step": 91500 }, { "epoch": 0.17547607421875, "grad_norm": 15.149271965026855, "learning_rate": 4.1226291656494143e-05, "lookahead_loss": 7.07952986240387, "loss": 5.2173, "step": 92000 }, { "epoch": 0.17642974853515625, "grad_norm": 45.502471923828125, "learning_rate": 4.117860794067383e-05, "lookahead_loss": 7.074552497386932, "loss": 5.1976, "step": 92500 }, { "epoch": 0.1773834228515625, "grad_norm": 25.228681564331055, "learning_rate": 4.113092422485352e-05, "lookahead_loss": 6.950310461044311, "loss": 5.2045, "step": 93000 }, { "epoch": 0.17833709716796875, "grad_norm": 59.99542236328125, "learning_rate": 4.108324050903321e-05, "lookahead_loss": 7.038198033332825, "loss": 5.2208, "step": 93500 }, { "epoch": 0.179290771484375, "grad_norm": 14.549333572387695, "learning_rate": 4.103555679321289e-05, "lookahead_loss": 7.078160994529724, "loss": 5.2405, "step": 94000 }, { "epoch": 0.18024444580078125, "grad_norm": 15.092066764831543, "learning_rate": 4.098787307739258e-05, "lookahead_loss": 7.125468634605408, "loss": 5.196, "step": 94500 }, { "epoch": 0.1811981201171875, "grad_norm": 20.77546501159668, "learning_rate": 4.0940189361572264e-05, "lookahead_loss": 6.931323900222778, "loss": 5.2178, "step": 95000 }, { "epoch": 0.1811981201171875, "eval_accuracy": 0.02985675146771037, "eval_lookahead_loss": 6.941384142398834, "eval_lookahead_perplexity": 1034.2007057762896, "eval_loss": 5.075103282928467, "eval_perplexity": 159.9887152291175, "eval_runtime": 508.4394, "eval_samples_per_second": 19.668, "eval_steps_per_second": 4.917, "step": 95000 }, { "epoch": 0.18215179443359375, "grad_norm": 21.640905380249023, "learning_rate": 4.0892505645751955e-05, "lookahead_loss": 7.017150844097137, "loss": 5.1729, "step": 95500 }, { "epoch": 0.18310546875, "grad_norm": 18.768932342529297, "learning_rate": 4.0844821929931645e-05, "lookahead_loss": 7.0667437629699705, "loss": 5.1855, "step": 96000 }, { "epoch": 0.18405914306640625, "grad_norm": 15.86687183380127, "learning_rate": 4.079713821411133e-05, "lookahead_loss": 7.019403304576874, "loss": 5.2012, "step": 96500 }, { "epoch": 0.1850128173828125, "grad_norm": 48.56646728515625, "learning_rate": 4.074945449829102e-05, "lookahead_loss": 7.042156358718872, "loss": 5.195, "step": 97000 }, { "epoch": 0.18596649169921875, "grad_norm": 21.649391174316406, "learning_rate": 4.07017707824707e-05, "lookahead_loss": 7.129418392181396, "loss": 5.1632, "step": 97500 }, { "epoch": 0.186920166015625, "grad_norm": 13.83983039855957, "learning_rate": 4.065408706665039e-05, "lookahead_loss": 7.031696859359741, "loss": 5.1688, "step": 98000 }, { "epoch": 0.18787384033203125, "grad_norm": 13.785502433776855, "learning_rate": 4.060640335083008e-05, "lookahead_loss": 7.016117392539978, "loss": 5.1533, "step": 98500 }, { "epoch": 0.1888275146484375, "grad_norm": 12.750699043273926, "learning_rate": 4.0558719635009766e-05, "lookahead_loss": 7.125010883331298, "loss": 5.1617, "step": 99000 }, { "epoch": 0.18978118896484375, "grad_norm": 24.571895599365234, "learning_rate": 4.0511035919189456e-05, "lookahead_loss": 7.061727128982544, "loss": 5.1206, "step": 99500 }, { "epoch": 0.19073486328125, "grad_norm": 31.058263778686523, "learning_rate": 4.046335220336914e-05, "lookahead_loss": 7.114327770233154, "loss": 5.1463, "step": 100000 }, { "epoch": 0.19073486328125, "eval_accuracy": 0.029686301369863013, "eval_lookahead_loss": 6.933541359710693, "eval_lookahead_perplexity": 1026.1214178472446, "eval_loss": 5.043911457061768, "eval_perplexity": 155.0754010558847, "eval_runtime": 517.801, "eval_samples_per_second": 19.312, "eval_steps_per_second": 4.828, "step": 100000 }, { "epoch": 0.19168853759765625, "grad_norm": 28.94295883178711, "learning_rate": 4.041566848754883e-05, "lookahead_loss": 6.927390743255615, "loss": 5.1315, "step": 100500 }, { "epoch": 0.1926422119140625, "grad_norm": 19.058692932128906, "learning_rate": 4.036798477172852e-05, "lookahead_loss": 7.01663103055954, "loss": 5.1958, "step": 101000 }, { "epoch": 0.19359588623046875, "grad_norm": 27.587623596191406, "learning_rate": 4.03203010559082e-05, "lookahead_loss": 7.0556095666885374, "loss": 5.244, "step": 101500 }, { "epoch": 0.194549560546875, "grad_norm": 36.67210006713867, "learning_rate": 4.0272617340087893e-05, "lookahead_loss": 7.060176265716553, "loss": 5.2029, "step": 102000 }, { "epoch": 0.19550323486328125, "grad_norm": 32.97154235839844, "learning_rate": 4.022493362426758e-05, "lookahead_loss": 7.059000421524048, "loss": 5.2418, "step": 102500 }, { "epoch": 0.1964569091796875, "grad_norm": 31.018024444580078, "learning_rate": 4.017724990844727e-05, "lookahead_loss": 7.016113545894623, "loss": 5.2204, "step": 103000 }, { "epoch": 0.19741058349609375, "grad_norm": 19.926023483276367, "learning_rate": 4.012956619262696e-05, "lookahead_loss": 7.0085662698745725, "loss": 5.1895, "step": 103500 }, { "epoch": 0.1983642578125, "grad_norm": 16.08983039855957, "learning_rate": 4.008188247680664e-05, "lookahead_loss": 7.061134398460388, "loss": 5.1842, "step": 104000 }, { "epoch": 0.19931793212890625, "grad_norm": 29.40497589111328, "learning_rate": 4.003419876098633e-05, "lookahead_loss": 7.114178021430969, "loss": 5.1852, "step": 104500 }, { "epoch": 0.2002716064453125, "grad_norm": 51.267250061035156, "learning_rate": 3.9986515045166014e-05, "lookahead_loss": 7.031693346977234, "loss": 5.1763, "step": 105000 }, { "epoch": 0.2002716064453125, "eval_accuracy": 0.029473385518590998, "eval_lookahead_loss": 6.923986781597137, "eval_lookahead_perplexity": 1016.363949084971, "eval_loss": 5.019506454467773, "eval_perplexity": 151.33659385943037, "eval_runtime": 515.4264, "eval_samples_per_second": 19.401, "eval_steps_per_second": 4.85, "step": 105000 }, { "epoch": 0.20122528076171875, "grad_norm": 19.078763961791992, "learning_rate": 3.9938831329345705e-05, "lookahead_loss": 7.085572448730469, "loss": 5.1722, "step": 105500 }, { "epoch": 0.202178955078125, "grad_norm": 22.66206169128418, "learning_rate": 3.9891147613525395e-05, "lookahead_loss": 7.0965241508483885, "loss": 5.1532, "step": 106000 }, { "epoch": 0.20313262939453125, "grad_norm": 21.129127502441406, "learning_rate": 3.984346389770508e-05, "lookahead_loss": 6.970416614055633, "loss": 5.1443, "step": 106500 }, { "epoch": 0.2040863037109375, "grad_norm": 20.843502044677734, "learning_rate": 3.979578018188477e-05, "lookahead_loss": 6.973331341266632, "loss": 5.1777, "step": 107000 }, { "epoch": 0.20503997802734375, "grad_norm": 13.69922161102295, "learning_rate": 3.974809646606445e-05, "lookahead_loss": 6.962644424438476, "loss": 5.1476, "step": 107500 }, { "epoch": 0.20599365234375, "grad_norm": 16.637916564941406, "learning_rate": 3.970041275024414e-05, "lookahead_loss": 7.012517877578736, "loss": 5.1246, "step": 108000 }, { "epoch": 0.20694732666015625, "grad_norm": 31.868671417236328, "learning_rate": 3.965272903442383e-05, "lookahead_loss": 7.033748887062073, "loss": 5.123, "step": 108500 }, { "epoch": 0.2079010009765625, "grad_norm": 23.37651824951172, "learning_rate": 3.9605045318603516e-05, "lookahead_loss": 7.113513669967651, "loss": 5.1741, "step": 109000 }, { "epoch": 0.20885467529296875, "grad_norm": 29.04323387145996, "learning_rate": 3.9557361602783206e-05, "lookahead_loss": 7.051657503128052, "loss": 5.1511, "step": 109500 }, { "epoch": 0.209808349609375, "grad_norm": 25.81477928161621, "learning_rate": 3.950967788696289e-05, "lookahead_loss": 7.03148574256897, "loss": 5.1507, "step": 110000 }, { "epoch": 0.209808349609375, "eval_accuracy": 0.028944618395303327, "eval_lookahead_loss": 6.90715215549469, "eval_lookahead_perplexity": 999.3970583549639, "eval_loss": 4.986264228820801, "eval_perplexity": 146.38852668538476, "eval_runtime": 509.551, "eval_samples_per_second": 19.625, "eval_steps_per_second": 4.906, "step": 110000 }, { "epoch": 0.21076202392578125, "grad_norm": 22.366060256958008, "learning_rate": 3.946199417114258e-05, "lookahead_loss": 7.0615292873382565, "loss": 5.1226, "step": 110500 }, { "epoch": 0.2117156982421875, "grad_norm": 20.431612014770508, "learning_rate": 3.941431045532227e-05, "lookahead_loss": 7.10613902759552, "loss": 5.1687, "step": 111000 }, { "epoch": 0.21266937255859375, "grad_norm": 31.437076568603516, "learning_rate": 3.936662673950195e-05, "lookahead_loss": 7.048504544258118, "loss": 5.1298, "step": 111500 }, { "epoch": 0.213623046875, "grad_norm": 18.371660232543945, "learning_rate": 3.9318943023681643e-05, "lookahead_loss": 7.021038059234619, "loss": 5.0787, "step": 112000 }, { "epoch": 0.21457672119140625, "grad_norm": 28.444997787475586, "learning_rate": 3.927125930786133e-05, "lookahead_loss": 7.00390574836731, "loss": 5.0984, "step": 112500 }, { "epoch": 0.2155303955078125, "grad_norm": 18.85318374633789, "learning_rate": 3.922357559204102e-05, "lookahead_loss": 6.935596935749054, "loss": 5.1306, "step": 113000 }, { "epoch": 0.21648406982421875, "grad_norm": 27.469560623168945, "learning_rate": 3.917589187622071e-05, "lookahead_loss": 7.058727834701538, "loss": 5.0832, "step": 113500 }, { "epoch": 0.217437744140625, "grad_norm": 33.21272659301758, "learning_rate": 3.912820816040039e-05, "lookahead_loss": 6.966387234687805, "loss": 5.0936, "step": 114000 }, { "epoch": 0.21839141845703125, "grad_norm": 26.946094512939453, "learning_rate": 3.908052444458008e-05, "lookahead_loss": 7.011251744270325, "loss": 5.1222, "step": 114500 }, { "epoch": 0.2193450927734375, "grad_norm": 42.00823211669922, "learning_rate": 3.9032840728759764e-05, "lookahead_loss": 6.907915513992309, "loss": 5.0905, "step": 115000 }, { "epoch": 0.2193450927734375, "eval_accuracy": 0.029135029354207437, "eval_lookahead_loss": 6.90610673532486, "eval_lookahead_perplexity": 998.3528144444188, "eval_loss": 4.965140342712402, "eval_perplexity": 143.32866399537386, "eval_runtime": 512.1332, "eval_samples_per_second": 19.526, "eval_steps_per_second": 4.882, "step": 115000 }, { "epoch": 0.22029876708984375, "grad_norm": 20.44475555419922, "learning_rate": 3.8985157012939455e-05, "lookahead_loss": 6.975942084312439, "loss": 5.065, "step": 115500 }, { "epoch": 0.22125244140625, "grad_norm": 20.73438835144043, "learning_rate": 3.8937473297119145e-05, "lookahead_loss": 7.020067551612854, "loss": 5.0824, "step": 116000 }, { "epoch": 0.22220611572265625, "grad_norm": 16.505712509155273, "learning_rate": 3.888978958129883e-05, "lookahead_loss": 6.953783864021301, "loss": 5.0785, "step": 116500 }, { "epoch": 0.2231597900390625, "grad_norm": 25.2286319732666, "learning_rate": 3.884210586547852e-05, "lookahead_loss": 6.972704875946045, "loss": 5.0707, "step": 117000 }, { "epoch": 0.22411346435546875, "grad_norm": 40.70512390136719, "learning_rate": 3.87944221496582e-05, "lookahead_loss": 6.99136109828949, "loss": 5.1466, "step": 117500 }, { "epoch": 0.225067138671875, "grad_norm": 16.29987907409668, "learning_rate": 3.874673843383789e-05, "lookahead_loss": 7.047970659255982, "loss": 5.1634, "step": 118000 }, { "epoch": 0.22602081298828125, "grad_norm": 73.62030792236328, "learning_rate": 3.869905471801758e-05, "lookahead_loss": 7.07725322341919, "loss": 5.1877, "step": 118500 }, { "epoch": 0.2269744873046875, "grad_norm": 31.63263702392578, "learning_rate": 3.8651371002197266e-05, "lookahead_loss": 7.019968224525452, "loss": 5.1196, "step": 119000 }, { "epoch": 0.22792816162109375, "grad_norm": 39.314659118652344, "learning_rate": 3.8603687286376956e-05, "lookahead_loss": 7.01877578830719, "loss": 5.1149, "step": 119500 }, { "epoch": 0.2288818359375, "grad_norm": 28.35359764099121, "learning_rate": 3.855600357055664e-05, "lookahead_loss": 7.0780117893218994, "loss": 5.133, "step": 120000 }, { "epoch": 0.2288818359375, "eval_accuracy": 0.028930528375733854, "eval_lookahead_loss": 6.905415377998352, "eval_lookahead_perplexity": 997.6628344505577, "eval_loss": 4.934220314025879, "eval_perplexity": 138.96475141447732, "eval_runtime": 559.2842, "eval_samples_per_second": 17.88, "eval_steps_per_second": 4.47, "step": 120000 }, { "epoch": 0.22983551025390625, "grad_norm": 63.927207946777344, "learning_rate": 3.850831985473633e-05, "lookahead_loss": 6.970953395843506, "loss": 5.107, "step": 120500 }, { "epoch": 0.2307891845703125, "grad_norm": 22.31732940673828, "learning_rate": 3.846063613891602e-05, "lookahead_loss": 7.011101596832275, "loss": 5.1139, "step": 121000 }, { "epoch": 0.23174285888671875, "grad_norm": 18.158554077148438, "learning_rate": 3.84129524230957e-05, "lookahead_loss": 7.119197889328003, "loss": 5.0866, "step": 121500 }, { "epoch": 0.232696533203125, "grad_norm": 32.63042068481445, "learning_rate": 3.8365268707275393e-05, "lookahead_loss": 7.009960453987121, "loss": 5.0891, "step": 122000 }, { "epoch": 0.23365020751953125, "grad_norm": 25.261150360107422, "learning_rate": 3.831758499145508e-05, "lookahead_loss": 7.033861748695373, "loss": 5.1206, "step": 122500 }, { "epoch": 0.2346038818359375, "grad_norm": 24.71761703491211, "learning_rate": 3.826990127563477e-05, "lookahead_loss": 6.969883305549621, "loss": 5.0712, "step": 123000 }, { "epoch": 0.23555755615234375, "grad_norm": 17.960708618164062, "learning_rate": 3.822221755981446e-05, "lookahead_loss": 7.124355870723725, "loss": 5.0969, "step": 123500 }, { "epoch": 0.23651123046875, "grad_norm": 17.788679122924805, "learning_rate": 3.817453384399414e-05, "lookahead_loss": 7.063056550979614, "loss": 5.0818, "step": 124000 }, { "epoch": 0.23746490478515625, "grad_norm": 19.183542251586914, "learning_rate": 3.812685012817383e-05, "lookahead_loss": 7.007234457492828, "loss": 5.0485, "step": 124500 }, { "epoch": 0.2384185791015625, "grad_norm": 17.047277450561523, "learning_rate": 3.8079166412353514e-05, "lookahead_loss": 7.000014476776123, "loss": 5.0768, "step": 125000 }, { "epoch": 0.2384185791015625, "eval_accuracy": 0.029198238747553815, "eval_lookahead_loss": 6.894271610927582, "eval_lookahead_perplexity": 986.6068293950667, "eval_loss": 4.9210615158081055, "eval_perplexity": 137.148120841793, "eval_runtime": 563.1045, "eval_samples_per_second": 17.759, "eval_steps_per_second": 4.44, "step": 125000 }, { "epoch": 0.23937225341796875, "grad_norm": 45.86539077758789, "learning_rate": 3.8031482696533205e-05, "lookahead_loss": 6.966305281639099, "loss": 5.0899, "step": 125500 }, { "epoch": 0.240325927734375, "grad_norm": 15.76852035522461, "learning_rate": 3.7983798980712895e-05, "lookahead_loss": 6.952229702472687, "loss": 5.0593, "step": 126000 }, { "epoch": 0.24127960205078125, "grad_norm": 45.952022552490234, "learning_rate": 3.793611526489258e-05, "lookahead_loss": 6.912394628524781, "loss": 5.0772, "step": 126500 }, { "epoch": 0.2422332763671875, "grad_norm": 16.728940963745117, "learning_rate": 3.788843154907227e-05, "lookahead_loss": 6.971283762931824, "loss": 5.0821, "step": 127000 }, { "epoch": 0.24318695068359375, "grad_norm": 32.42121887207031, "learning_rate": 3.784074783325195e-05, "lookahead_loss": 6.921290741443634, "loss": 5.0717, "step": 127500 }, { "epoch": 0.244140625, "grad_norm": 28.443878173828125, "learning_rate": 3.779306411743164e-05, "lookahead_loss": 6.876033420085907, "loss": 5.0541, "step": 128000 }, { "epoch": 0.24509429931640625, "grad_norm": 14.713363647460938, "learning_rate": 3.774538040161133e-05, "lookahead_loss": 6.946796874523163, "loss": 5.0573, "step": 128500 }, { "epoch": 0.2460479736328125, "grad_norm": 27.072772979736328, "learning_rate": 3.7697696685791016e-05, "lookahead_loss": 6.997370836734771, "loss": 5.0516, "step": 129000 }, { "epoch": 0.24700164794921875, "grad_norm": 18.42095947265625, "learning_rate": 3.7650012969970706e-05, "lookahead_loss": 6.996538161277771, "loss": 5.0421, "step": 129500 }, { "epoch": 0.247955322265625, "grad_norm": 17.15911293029785, "learning_rate": 3.760232925415039e-05, "lookahead_loss": 7.0170216455459595, "loss": 5.0645, "step": 130000 }, { "epoch": 0.247955322265625, "eval_accuracy": 0.028754207436399216, "eval_lookahead_loss": 6.8848255153656, "eval_lookahead_perplexity": 977.3311255663789, "eval_loss": 4.891185760498047, "eval_perplexity": 133.1113186526427, "eval_runtime": 515.5123, "eval_samples_per_second": 19.398, "eval_steps_per_second": 4.85, "step": 130000 }, { "epoch": 0.24890899658203125, "grad_norm": 31.166728973388672, "learning_rate": 3.755464553833008e-05, "lookahead_loss": 7.000779727935791, "loss": 5.0087, "step": 130500 }, { "epoch": 0.2498626708984375, "grad_norm": 37.516387939453125, "learning_rate": 3.750696182250977e-05, "lookahead_loss": 6.925521522521973, "loss": 5.0449, "step": 131000 }, { "epoch": 0.25081634521484375, "grad_norm": 28.797426223754883, "learning_rate": 3.745927810668945e-05, "lookahead_loss": 7.010660227775574, "loss": 5.0492, "step": 131500 }, { "epoch": 0.25177001953125, "grad_norm": 25.733989715576172, "learning_rate": 3.7411594390869143e-05, "lookahead_loss": 6.976770524978638, "loss": 5.0289, "step": 132000 }, { "epoch": 0.25272369384765625, "grad_norm": 67.13007354736328, "learning_rate": 3.736391067504883e-05, "lookahead_loss": 6.950276339530945, "loss": 5.036, "step": 132500 }, { "epoch": 0.2536773681640625, "grad_norm": 47.058589935302734, "learning_rate": 3.731622695922852e-05, "lookahead_loss": 7.000660407066345, "loss": 5.0666, "step": 133000 }, { "epoch": 0.25463104248046875, "grad_norm": 50.83979415893555, "learning_rate": 3.726854324340821e-05, "lookahead_loss": 7.068637451171875, "loss": 5.0929, "step": 133500 }, { "epoch": 0.255584716796875, "grad_norm": 19.997501373291016, "learning_rate": 3.722085952758789e-05, "lookahead_loss": 7.018365789413452, "loss": 5.1061, "step": 134000 }, { "epoch": 0.25653839111328125, "grad_norm": 30.75432777404785, "learning_rate": 3.717317581176758e-05, "lookahead_loss": 6.9933175230026245, "loss": 5.0919, "step": 134500 }, { "epoch": 0.2574920654296875, "grad_norm": 21.000967025756836, "learning_rate": 3.7125492095947264e-05, "lookahead_loss": 7.019055374145508, "loss": 5.0714, "step": 135000 }, { "epoch": 0.2574920654296875, "eval_accuracy": 0.029134637964774952, "eval_lookahead_loss": 6.877201622962952, "eval_lookahead_perplexity": 969.9083892478764, "eval_loss": 4.87701416015625, "eval_perplexity": 131.2382219808382, "eval_runtime": 540.0497, "eval_samples_per_second": 18.517, "eval_steps_per_second": 4.629, "step": 135000 }, { "epoch": 0.25844573974609375, "grad_norm": 16.54420280456543, "learning_rate": 3.7077808380126955e-05, "lookahead_loss": 7.0055183448791505, "loss": 5.0604, "step": 135500 }, { "epoch": 0.2593994140625, "grad_norm": 15.701042175292969, "learning_rate": 3.7030124664306645e-05, "lookahead_loss": 6.950681262969971, "loss": 5.0336, "step": 136000 }, { "epoch": 0.26035308837890625, "grad_norm": 19.6634464263916, "learning_rate": 3.698244094848633e-05, "lookahead_loss": 6.960614207744598, "loss": 5.0033, "step": 136500 }, { "epoch": 0.2613067626953125, "grad_norm": 28.120410919189453, "learning_rate": 3.693475723266602e-05, "lookahead_loss": 7.026803030967712, "loss": 5.0473, "step": 137000 }, { "epoch": 0.26226043701171875, "grad_norm": 29.416000366210938, "learning_rate": 3.68870735168457e-05, "lookahead_loss": 7.047402129173279, "loss": 5.0532, "step": 137500 }, { "epoch": 0.263214111328125, "grad_norm": 22.85749053955078, "learning_rate": 3.683938980102539e-05, "lookahead_loss": 6.984240023612976, "loss": 5.0378, "step": 138000 }, { "epoch": 0.26416778564453125, "grad_norm": 21.862411499023438, "learning_rate": 3.679170608520508e-05, "lookahead_loss": 6.969232307434082, "loss": 5.0351, "step": 138500 }, { "epoch": 0.2651214599609375, "grad_norm": 29.69403076171875, "learning_rate": 3.6744022369384766e-05, "lookahead_loss": 6.9022474222183225, "loss": 4.9496, "step": 139000 }, { "epoch": 0.26607513427734375, "grad_norm": 31.769420623779297, "learning_rate": 3.6696338653564456e-05, "lookahead_loss": 7.064392502784729, "loss": 5.0412, "step": 139500 }, { "epoch": 0.26702880859375, "grad_norm": 20.508264541625977, "learning_rate": 3.664865493774414e-05, "lookahead_loss": 6.978036500453949, "loss": 5.0038, "step": 140000 }, { "epoch": 0.26702880859375, "eval_accuracy": 0.02828375733855186, "eval_lookahead_loss": 6.876235432910919, "eval_lookahead_perplexity": 968.9717259810335, "eval_loss": 4.85726261138916, "eval_perplexity": 128.67149569351136, "eval_runtime": 636.4763, "eval_samples_per_second": 15.712, "eval_steps_per_second": 3.928, "step": 140000 }, { "epoch": 0.26798248291015625, "grad_norm": 22.219287872314453, "learning_rate": 3.660097122192383e-05, "lookahead_loss": 7.067828114509583, "loss": 5.018, "step": 140500 }, { "epoch": 0.2689361572265625, "grad_norm": 18.10211944580078, "learning_rate": 3.655328750610352e-05, "lookahead_loss": 7.056972642898559, "loss": 4.9996, "step": 141000 }, { "epoch": 0.26988983154296875, "grad_norm": 22.048385620117188, "learning_rate": 3.65056037902832e-05, "lookahead_loss": 7.070527895927429, "loss": 5.0441, "step": 141500 }, { "epoch": 0.270843505859375, "grad_norm": 36.969696044921875, "learning_rate": 3.6457920074462893e-05, "lookahead_loss": 6.96172491979599, "loss": 5.0055, "step": 142000 }, { "epoch": 0.27179718017578125, "grad_norm": 16.63195037841797, "learning_rate": 3.641023635864258e-05, "lookahead_loss": 6.93306167793274, "loss": 5.0021, "step": 142500 }, { "epoch": 0.2727508544921875, "grad_norm": 28.463359832763672, "learning_rate": 3.636255264282227e-05, "lookahead_loss": 6.8807322473526, "loss": 4.9616, "step": 143000 }, { "epoch": 0.27370452880859375, "grad_norm": 16.56485366821289, "learning_rate": 3.631486892700196e-05, "lookahead_loss": 7.014453740596771, "loss": 5.0339, "step": 143500 }, { "epoch": 0.274658203125, "grad_norm": 17.137786865234375, "learning_rate": 3.626718521118164e-05, "lookahead_loss": 7.067553649902344, "loss": 4.9761, "step": 144000 }, { "epoch": 0.27561187744140625, "grad_norm": 75.30558013916016, "learning_rate": 3.621950149536133e-05, "lookahead_loss": 6.907124230384826, "loss": 4.9351, "step": 144500 }, { "epoch": 0.2765655517578125, "grad_norm": 28.882795333862305, "learning_rate": 3.6171817779541014e-05, "lookahead_loss": 7.042889600753784, "loss": 4.9626, "step": 145000 }, { "epoch": 0.2765655517578125, "eval_accuracy": 0.02830645792563601, "eval_lookahead_loss": 6.865430118370056, "eval_lookahead_perplexity": 958.5580445726084, "eval_loss": 4.8280487060546875, "eval_perplexity": 124.96687548375812, "eval_runtime": 729.4838, "eval_samples_per_second": 13.708, "eval_steps_per_second": 3.427, "step": 145000 }, { "epoch": 0.27751922607421875, "grad_norm": 24.788028717041016, "learning_rate": 3.6124134063720705e-05, "lookahead_loss": 7.012193975448608, "loss": 4.9478, "step": 145500 }, { "epoch": 0.278472900390625, "grad_norm": 23.758310317993164, "learning_rate": 3.6076450347900395e-05, "lookahead_loss": 6.9251371879577635, "loss": 4.946, "step": 146000 }, { "epoch": 0.27942657470703125, "grad_norm": 10.454421997070312, "learning_rate": 3.602876663208008e-05, "lookahead_loss": 6.90738028717041, "loss": 4.9523, "step": 146500 }, { "epoch": 0.2803802490234375, "grad_norm": 22.896276473999023, "learning_rate": 3.598108291625977e-05, "lookahead_loss": 6.941471890449524, "loss": 4.9968, "step": 147000 }, { "epoch": 0.28133392333984375, "grad_norm": 22.6302490234375, "learning_rate": 3.593339920043945e-05, "lookahead_loss": 6.877468722343445, "loss": 4.9595, "step": 147500 }, { "epoch": 0.28228759765625, "grad_norm": 16.85541534423828, "learning_rate": 3.588571548461914e-05, "lookahead_loss": 6.963884599685669, "loss": 4.9213, "step": 148000 }, { "epoch": 0.28324127197265625, "grad_norm": 17.745227813720703, "learning_rate": 3.583803176879883e-05, "lookahead_loss": 6.958312917709351, "loss": 4.9456, "step": 148500 }, { "epoch": 0.2841949462890625, "grad_norm": 10.442530632019043, "learning_rate": 3.5790348052978516e-05, "lookahead_loss": 7.003836160182953, "loss": 5.0476, "step": 149000 }, { "epoch": 0.28514862060546875, "grad_norm": 15.9586820602417, "learning_rate": 3.5742664337158206e-05, "lookahead_loss": 6.973358942985534, "loss": 5.0162, "step": 149500 }, { "epoch": 0.286102294921875, "grad_norm": 13.914422035217285, "learning_rate": 3.569498062133789e-05, "lookahead_loss": 6.895875663280487, "loss": 5.0443, "step": 150000 }, { "epoch": 0.286102294921875, "eval_accuracy": 0.02852152641878669, "eval_lookahead_loss": 6.856860765457153, "eval_lookahead_perplexity": 950.3789173669138, "eval_loss": 4.799951553344727, "eval_perplexity": 121.50453088802016, "eval_runtime": 591.4297, "eval_samples_per_second": 16.908, "eval_steps_per_second": 4.227, "step": 150000 }, { "epoch": 0.28705596923828125, "grad_norm": 37.02862548828125, "learning_rate": 3.564729690551758e-05, "lookahead_loss": 6.895696063995361, "loss": 4.9991, "step": 150500 }, { "epoch": 0.2880096435546875, "grad_norm": 25.055213928222656, "learning_rate": 3.559961318969727e-05, "lookahead_loss": 7.023568695068359, "loss": 4.9761, "step": 151000 }, { "epoch": 0.28896331787109375, "grad_norm": 14.961044311523438, "learning_rate": 3.555192947387695e-05, "lookahead_loss": 6.959557954788208, "loss": 4.9503, "step": 151500 }, { "epoch": 0.2899169921875, "grad_norm": 24.926082611083984, "learning_rate": 3.5504245758056643e-05, "lookahead_loss": 7.003702548980713, "loss": 5.0071, "step": 152000 }, { "epoch": 0.29087066650390625, "grad_norm": 15.324337005615234, "learning_rate": 3.545656204223633e-05, "lookahead_loss": 6.888869009971619, "loss": 4.9375, "step": 152500 }, { "epoch": 0.2918243408203125, "grad_norm": 27.02728271484375, "learning_rate": 3.540887832641602e-05, "lookahead_loss": 7.006887480735779, "loss": 5.0264, "step": 153000 }, { "epoch": 0.29277801513671875, "grad_norm": 35.46083450317383, "learning_rate": 3.536119461059571e-05, "lookahead_loss": 6.947562145233154, "loss": 4.9618, "step": 153500 }, { "epoch": 0.293731689453125, "grad_norm": 23.45201301574707, "learning_rate": 3.531351089477539e-05, "lookahead_loss": 6.992656003475189, "loss": 4.9953, "step": 154000 }, { "epoch": 0.29468536376953125, "grad_norm": 14.677413940429688, "learning_rate": 3.526582717895508e-05, "lookahead_loss": 7.05440563583374, "loss": 4.9749, "step": 154500 }, { "epoch": 0.2956390380859375, "grad_norm": 12.241249084472656, "learning_rate": 3.5218143463134764e-05, "lookahead_loss": 6.948158584594727, "loss": 4.9348, "step": 155000 }, { "epoch": 0.2956390380859375, "eval_accuracy": 0.02838688845401174, "eval_lookahead_loss": 6.848816467475891, "eval_lookahead_perplexity": 942.7644537291543, "eval_loss": 4.782739639282227, "eval_perplexity": 119.43110038554309, "eval_runtime": 827.1636, "eval_samples_per_second": 12.09, "eval_steps_per_second": 3.022, "step": 155000 }, { "epoch": 0.29659271240234375, "grad_norm": 14.035004615783691, "learning_rate": 3.5170459747314455e-05, "lookahead_loss": 6.96835812664032, "loss": 4.9154, "step": 155500 }, { "epoch": 0.29754638671875, "grad_norm": 24.650785446166992, "learning_rate": 3.5122776031494145e-05, "lookahead_loss": 6.985821210861206, "loss": 4.9596, "step": 156000 }, { "epoch": 0.29850006103515625, "grad_norm": 11.745288848876953, "learning_rate": 3.507509231567383e-05, "lookahead_loss": 7.052103019714355, "loss": 4.9454, "step": 156500 }, { "epoch": 0.2994537353515625, "grad_norm": 14.651015281677246, "learning_rate": 3.502740859985352e-05, "lookahead_loss": 6.927494093894959, "loss": 4.9102, "step": 157000 }, { "epoch": 0.30040740966796875, "grad_norm": 14.775087356567383, "learning_rate": 3.49797248840332e-05, "lookahead_loss": 6.915873066902161, "loss": 4.9721, "step": 157500 }, { "epoch": 0.301361083984375, "grad_norm": 81.306884765625, "learning_rate": 3.493204116821289e-05, "lookahead_loss": 6.911425126075745, "loss": 4.8887, "step": 158000 }, { "epoch": 0.30231475830078125, "grad_norm": 11.422953605651855, "learning_rate": 3.488435745239258e-05, "lookahead_loss": 7.04006861114502, "loss": 4.9212, "step": 158500 }, { "epoch": 0.3032684326171875, "grad_norm": 31.249181747436523, "learning_rate": 3.4836673736572266e-05, "lookahead_loss": 6.927506227493286, "loss": 4.905, "step": 159000 }, { "epoch": 0.30422210693359375, "grad_norm": 17.233642578125, "learning_rate": 3.4788990020751956e-05, "lookahead_loss": 6.887065649032593, "loss": 4.8913, "step": 159500 }, { "epoch": 0.30517578125, "grad_norm": 16.048831939697266, "learning_rate": 3.474130630493164e-05, "lookahead_loss": 6.945653329849243, "loss": 4.8871, "step": 160000 }, { "epoch": 0.30517578125, "eval_accuracy": 0.02788512720156556, "eval_lookahead_loss": 6.837414638900757, "eval_lookahead_perplexity": 932.076263288584, "eval_loss": 4.758224964141846, "eval_perplexity": 116.5388815042004, "eval_runtime": 800.0646, "eval_samples_per_second": 12.499, "eval_steps_per_second": 3.125, "step": 160000 }, { "epoch": 0.30612945556640625, "grad_norm": 25.438861846923828, "learning_rate": 3.469362258911133e-05, "lookahead_loss": 6.864038036823272, "loss": 4.8676, "step": 160500 }, { "epoch": 0.3070831298828125, "grad_norm": 38.583343505859375, "learning_rate": 3.464593887329102e-05, "lookahead_loss": 6.884070171356202, "loss": 4.9214, "step": 161000 }, { "epoch": 0.30803680419921875, "grad_norm": 16.328372955322266, "learning_rate": 3.45982551574707e-05, "lookahead_loss": 6.990485664844513, "loss": 4.8918, "step": 161500 }, { "epoch": 0.308990478515625, "grad_norm": 10.450627326965332, "learning_rate": 3.4550571441650393e-05, "lookahead_loss": 6.781832706928253, "loss": 4.9248, "step": 162000 }, { "epoch": 0.30994415283203125, "grad_norm": 19.444429397583008, "learning_rate": 3.450288772583008e-05, "lookahead_loss": 6.964550238609314, "loss": 4.8952, "step": 162500 }, { "epoch": 0.3108978271484375, "grad_norm": 25.03470802307129, "learning_rate": 3.445520401000977e-05, "lookahead_loss": 6.9275254011154175, "loss": 4.9165, "step": 163000 }, { "epoch": 0.31185150146484375, "grad_norm": 104.5985107421875, "learning_rate": 3.440752029418946e-05, "lookahead_loss": 6.923734365463257, "loss": 4.8837, "step": 163500 }, { "epoch": 0.31280517578125, "grad_norm": 13.91694450378418, "learning_rate": 3.435983657836914e-05, "lookahead_loss": 6.8402302165031434, "loss": 4.95, "step": 164000 }, { "epoch": 0.31375885009765625, "grad_norm": 17.412071228027344, "learning_rate": 3.431215286254883e-05, "lookahead_loss": 6.899979671478271, "loss": 4.9692, "step": 164500 }, { "epoch": 0.3147125244140625, "grad_norm": 14.99606704711914, "learning_rate": 3.4264469146728514e-05, "lookahead_loss": 6.966157273292541, "loss": 4.9312, "step": 165000 }, { "epoch": 0.3147125244140625, "eval_accuracy": 0.029249706457925635, "eval_lookahead_loss": 6.821188983345031, "eval_lookahead_perplexity": 917.0747487145987, "eval_loss": 4.73521089553833, "eval_perplexity": 113.88747454595905, "eval_runtime": 674.5318, "eval_samples_per_second": 14.825, "eval_steps_per_second": 3.706, "step": 165000 }, { "epoch": 0.31566619873046875, "grad_norm": 15.974380493164062, "learning_rate": 3.4216785430908205e-05, "lookahead_loss": 6.864722494125366, "loss": 4.9622, "step": 165500 }, { "epoch": 0.316619873046875, "grad_norm": 13.009764671325684, "learning_rate": 3.4169101715087895e-05, "lookahead_loss": 6.925512114048004, "loss": 4.9181, "step": 166000 }, { "epoch": 0.31757354736328125, "grad_norm": 13.776761054992676, "learning_rate": 3.412141799926758e-05, "lookahead_loss": 7.006348931312561, "loss": 4.9291, "step": 166500 }, { "epoch": 0.3185272216796875, "grad_norm": 12.035865783691406, "learning_rate": 3.407373428344727e-05, "lookahead_loss": 6.9127522535324095, "loss": 4.9283, "step": 167000 }, { "epoch": 0.31948089599609375, "grad_norm": 12.134992599487305, "learning_rate": 3.402605056762695e-05, "lookahead_loss": 6.957017666816712, "loss": 4.9417, "step": 167500 }, { "epoch": 0.3204345703125, "grad_norm": 20.790393829345703, "learning_rate": 3.397836685180664e-05, "lookahead_loss": 6.93363643360138, "loss": 4.9347, "step": 168000 }, { "epoch": 0.32138824462890625, "grad_norm": 17.53589630126953, "learning_rate": 3.393068313598633e-05, "lookahead_loss": 6.921780423164368, "loss": 4.9108, "step": 168500 }, { "epoch": 0.3223419189453125, "grad_norm": 14.406173706054688, "learning_rate": 3.3882999420166016e-05, "lookahead_loss": 6.90714052772522, "loss": 4.9228, "step": 169000 }, { "epoch": 0.32329559326171875, "grad_norm": 35.16606521606445, "learning_rate": 3.3835315704345706e-05, "lookahead_loss": 6.890581090450286, "loss": 4.9153, "step": 169500 }, { "epoch": 0.324249267578125, "grad_norm": 28.15321159362793, "learning_rate": 3.378763198852539e-05, "lookahead_loss": 7.026367143630981, "loss": 4.9013, "step": 170000 }, { "epoch": 0.324249267578125, "eval_accuracy": 0.02853522504892368, "eval_lookahead_loss": 6.818971997547149, "eval_lookahead_perplexity": 915.0438590796638, "eval_loss": 4.7225213050842285, "eval_perplexity": 112.45141987900689, "eval_runtime": 721.3921, "eval_samples_per_second": 13.862, "eval_steps_per_second": 3.466, "step": 170000 }, { "epoch": 0.32520294189453125, "grad_norm": 19.237560272216797, "learning_rate": 3.373994827270508e-05, "lookahead_loss": 6.9315129642486575, "loss": 4.9018, "step": 170500 }, { "epoch": 0.3261566162109375, "grad_norm": 16.063291549682617, "learning_rate": 3.369226455688477e-05, "lookahead_loss": 6.959984789848328, "loss": 4.9086, "step": 171000 }, { "epoch": 0.32711029052734375, "grad_norm": 15.176820755004883, "learning_rate": 3.364458084106445e-05, "lookahead_loss": 6.935260605812073, "loss": 4.8272, "step": 171500 }, { "epoch": 0.32806396484375, "grad_norm": 13.98118782043457, "learning_rate": 3.3596897125244143e-05, "lookahead_loss": 6.964900031089782, "loss": 4.8834, "step": 172000 }, { "epoch": 0.32901763916015625, "grad_norm": 21.28561782836914, "learning_rate": 3.354921340942383e-05, "lookahead_loss": 6.877140071868896, "loss": 4.8746, "step": 172500 }, { "epoch": 0.3299713134765625, "grad_norm": 16.979141235351562, "learning_rate": 3.350152969360352e-05, "lookahead_loss": 6.919057213783264, "loss": 4.8568, "step": 173000 }, { "epoch": 0.33092498779296875, "grad_norm": 22.39498519897461, "learning_rate": 3.345384597778321e-05, "lookahead_loss": 6.946840383529663, "loss": 4.8852, "step": 173500 }, { "epoch": 0.331878662109375, "grad_norm": 17.254030227661133, "learning_rate": 3.340616226196289e-05, "lookahead_loss": 6.901164591789246, "loss": 4.8818, "step": 174000 }, { "epoch": 0.33283233642578125, "grad_norm": 87.46729278564453, "learning_rate": 3.335847854614258e-05, "lookahead_loss": 6.92927253818512, "loss": 4.8521, "step": 174500 }, { "epoch": 0.3337860107421875, "grad_norm": 16.898258209228516, "learning_rate": 3.3310794830322264e-05, "lookahead_loss": 6.841258005142212, "loss": 4.8217, "step": 175000 }, { "epoch": 0.3337860107421875, "eval_accuracy": 0.027938551859099804, "eval_lookahead_loss": 6.812032573032379, "eval_lookahead_perplexity": 908.7159626652528, "eval_loss": 4.70542049407959, "eval_perplexity": 110.54475859235868, "eval_runtime": 596.3122, "eval_samples_per_second": 16.77, "eval_steps_per_second": 4.192, "step": 175000 }, { "epoch": 0.33473968505859375, "grad_norm": 12.669487953186035, "learning_rate": 3.3263111114501955e-05, "lookahead_loss": 6.962246194839477, "loss": 4.862, "step": 175500 }, { "epoch": 0.335693359375, "grad_norm": 15.61766529083252, "learning_rate": 3.3215427398681645e-05, "lookahead_loss": 7.043109121322632, "loss": 4.8569, "step": 176000 }, { "epoch": 0.33664703369140625, "grad_norm": 15.210989952087402, "learning_rate": 3.316774368286133e-05, "lookahead_loss": 6.905501788139343, "loss": 4.803, "step": 176500 }, { "epoch": 0.3376007080078125, "grad_norm": 35.87611389160156, "learning_rate": 3.312005996704102e-05, "lookahead_loss": 6.895154564857483, "loss": 4.8518, "step": 177000 }, { "epoch": 0.33855438232421875, "grad_norm": 23.919984817504883, "learning_rate": 3.30723762512207e-05, "lookahead_loss": 6.867300504684448, "loss": 4.855, "step": 177500 }, { "epoch": 0.339508056640625, "grad_norm": 15.072147369384766, "learning_rate": 3.302469253540039e-05, "lookahead_loss": 6.801936350345612, "loss": 4.8197, "step": 178000 }, { "epoch": 0.34046173095703125, "grad_norm": 17.006420135498047, "learning_rate": 3.297700881958008e-05, "lookahead_loss": 6.821063242912293, "loss": 4.8042, "step": 178500 }, { "epoch": 0.3414154052734375, "grad_norm": 13.767796516418457, "learning_rate": 3.2929325103759766e-05, "lookahead_loss": 6.799063691139221, "loss": 4.8376, "step": 179000 }, { "epoch": 0.34236907958984375, "grad_norm": 18.627914428710938, "learning_rate": 3.2881641387939456e-05, "lookahead_loss": 6.899736148357391, "loss": 4.9154, "step": 179500 }, { "epoch": 0.34332275390625, "grad_norm": 20.227718353271484, "learning_rate": 3.283395767211914e-05, "lookahead_loss": 7.000618993282318, "loss": 4.9256, "step": 180000 }, { "epoch": 0.34332275390625, "eval_accuracy": 0.028952054794520548, "eval_lookahead_loss": 6.817646003627777, "eval_lookahead_perplexity": 913.8313205735715, "eval_loss": 4.684326648712158, "eval_perplexity": 108.23736599537305, "eval_runtime": 569.0632, "eval_samples_per_second": 17.573, "eval_steps_per_second": 4.393, "step": 180000 }, { "epoch": 0.34427642822265625, "grad_norm": 24.821849822998047, "learning_rate": 3.278627395629883e-05, "lookahead_loss": 6.85215523481369, "loss": 4.9209, "step": 180500 }, { "epoch": 0.3452301025390625, "grad_norm": 10.48924446105957, "learning_rate": 3.273859024047852e-05, "lookahead_loss": 6.982842287063598, "loss": 4.9222, "step": 181000 }, { "epoch": 0.34618377685546875, "grad_norm": 9.095426559448242, "learning_rate": 3.26909065246582e-05, "lookahead_loss": 6.894338526725769, "loss": 4.8524, "step": 181500 }, { "epoch": 0.347137451171875, "grad_norm": 14.645662307739258, "learning_rate": 3.2643222808837893e-05, "lookahead_loss": 6.844138590335846, "loss": 4.8523, "step": 182000 }, { "epoch": 0.34809112548828125, "grad_norm": 15.900192260742188, "learning_rate": 3.259553909301758e-05, "lookahead_loss": 6.958358055114746, "loss": 4.8739, "step": 182500 }, { "epoch": 0.3490447998046875, "grad_norm": 14.77332878112793, "learning_rate": 3.254785537719727e-05, "lookahead_loss": 6.9371869502067565, "loss": 4.8757, "step": 183000 }, { "epoch": 0.34999847412109375, "grad_norm": 18.724903106689453, "learning_rate": 3.250017166137696e-05, "lookahead_loss": 6.797015940189362, "loss": 4.8463, "step": 183500 }, { "epoch": 0.3509521484375, "grad_norm": 15.143988609313965, "learning_rate": 3.245248794555664e-05, "lookahead_loss": 6.950707427978515, "loss": 4.86, "step": 184000 }, { "epoch": 0.35190582275390625, "grad_norm": 29.541372299194336, "learning_rate": 3.240480422973633e-05, "lookahead_loss": 6.839635824203492, "loss": 4.8265, "step": 184500 }, { "epoch": 0.3528594970703125, "grad_norm": 14.572477340698242, "learning_rate": 3.2357120513916014e-05, "lookahead_loss": 6.948925925731659, "loss": 4.8326, "step": 185000 }, { "epoch": 0.3528594970703125, "eval_accuracy": 0.028384148727984344, "eval_lookahead_loss": 6.771338399124145, "eval_lookahead_perplexity": 872.4788378106439, "eval_loss": 4.652266502380371, "eval_perplexity": 104.82229652720292, "eval_runtime": 556.6257, "eval_samples_per_second": 17.965, "eval_steps_per_second": 4.491, "step": 185000 }, { "epoch": 0.35381317138671875, "grad_norm": 15.093419075012207, "learning_rate": 3.2309436798095705e-05, "lookahead_loss": 6.882199538230896, "loss": 4.8529, "step": 185500 }, { "epoch": 0.354766845703125, "grad_norm": 11.8356294631958, "learning_rate": 3.2261753082275395e-05, "lookahead_loss": 6.904272257328033, "loss": 4.7858, "step": 186000 }, { "epoch": 0.35572052001953125, "grad_norm": 18.80912971496582, "learning_rate": 3.221406936645508e-05, "lookahead_loss": 6.835084801673889, "loss": 4.7972, "step": 186500 }, { "epoch": 0.3566741943359375, "grad_norm": 10.571149826049805, "learning_rate": 3.216638565063477e-05, "lookahead_loss": 6.960137063026428, "loss": 4.7953, "step": 187000 }, { "epoch": 0.35762786865234375, "grad_norm": 10.40713882446289, "learning_rate": 3.211870193481445e-05, "lookahead_loss": 6.864735876560212, "loss": 4.8127, "step": 187500 }, { "epoch": 0.35858154296875, "grad_norm": 9.305915832519531, "learning_rate": 3.207101821899414e-05, "lookahead_loss": 6.925107287883758, "loss": 4.8344, "step": 188000 }, { "epoch": 0.35953521728515625, "grad_norm": 7.813634872436523, "learning_rate": 3.202333450317383e-05, "lookahead_loss": 6.864461313247681, "loss": 4.8005, "step": 188500 }, { "epoch": 0.3604888916015625, "grad_norm": 9.31898021697998, "learning_rate": 3.1975650787353516e-05, "lookahead_loss": 6.879134814739228, "loss": 4.8336, "step": 189000 }, { "epoch": 0.36144256591796875, "grad_norm": 28.4632511138916, "learning_rate": 3.1927967071533206e-05, "lookahead_loss": 6.907152587413788, "loss": 4.7526, "step": 189500 }, { "epoch": 0.362396240234375, "grad_norm": 12.17216682434082, "learning_rate": 3.188028335571289e-05, "lookahead_loss": 6.942371348381043, "loss": 4.7726, "step": 190000 }, { "epoch": 0.362396240234375, "eval_accuracy": 0.028716046966731898, "eval_lookahead_loss": 6.7743327404022216, "eval_lookahead_perplexity": 875.0952524732456, "eval_loss": 4.636414051055908, "eval_perplexity": 103.17370778426833, "eval_runtime": 518.4006, "eval_samples_per_second": 19.29, "eval_steps_per_second": 4.823, "step": 190000 }, { "epoch": 0.36334991455078125, "grad_norm": 13.718379020690918, "learning_rate": 3.183259963989258e-05, "lookahead_loss": 6.930777439117431, "loss": 4.7883, "step": 190500 }, { "epoch": 0.3643035888671875, "grad_norm": 9.086777687072754, "learning_rate": 3.178491592407227e-05, "lookahead_loss": 6.904231739521027, "loss": 4.7973, "step": 191000 }, { "epoch": 0.36525726318359375, "grad_norm": 11.362726211547852, "learning_rate": 3.173723220825195e-05, "lookahead_loss": 6.915815782546997, "loss": 4.7731, "step": 191500 }, { "epoch": 0.3662109375, "grad_norm": 14.917767524719238, "learning_rate": 3.1689548492431643e-05, "lookahead_loss": 6.881822595596313, "loss": 4.7518, "step": 192000 }, { "epoch": 0.36716461181640625, "grad_norm": 8.246079444885254, "learning_rate": 3.164186477661133e-05, "lookahead_loss": 6.855777984619141, "loss": 4.7333, "step": 192500 }, { "epoch": 0.3681182861328125, "grad_norm": 32.00188064575195, "learning_rate": 3.159418106079102e-05, "lookahead_loss": 6.805473225593567, "loss": 4.7385, "step": 193000 }, { "epoch": 0.36907196044921875, "grad_norm": 13.972314834594727, "learning_rate": 3.154649734497071e-05, "lookahead_loss": 6.770559470176697, "loss": 4.7426, "step": 193500 }, { "epoch": 0.370025634765625, "grad_norm": 11.774514198303223, "learning_rate": 3.149881362915039e-05, "lookahead_loss": 6.8027354102134705, "loss": 4.7689, "step": 194000 }, { "epoch": 0.37097930908203125, "grad_norm": 9.327366828918457, "learning_rate": 3.145112991333008e-05, "lookahead_loss": 6.881221285820008, "loss": 4.8635, "step": 194500 }, { "epoch": 0.3719329833984375, "grad_norm": 25.055700302124023, "learning_rate": 3.1403446197509764e-05, "lookahead_loss": 6.857808141708374, "loss": 4.8655, "step": 195000 }, { "epoch": 0.3719329833984375, "eval_accuracy": 0.02812876712328767, "eval_lookahead_loss": 6.772071803951263, "eval_lookahead_perplexity": 873.1189527048368, "eval_loss": 4.617378234863281, "eval_perplexity": 101.2282871272344, "eval_runtime": 570.1467, "eval_samples_per_second": 17.539, "eval_steps_per_second": 4.385, "step": 195000 }, { "epoch": 0.37288665771484375, "grad_norm": 35.89665603637695, "learning_rate": 3.1355762481689455e-05, "lookahead_loss": 6.834752644538879, "loss": 4.8032, "step": 195500 }, { "epoch": 0.37384033203125, "grad_norm": 14.396994590759277, "learning_rate": 3.1308078765869145e-05, "lookahead_loss": 6.955719030380249, "loss": 4.847, "step": 196000 }, { "epoch": 0.37479400634765625, "grad_norm": 17.672876358032227, "learning_rate": 3.126039505004883e-05, "lookahead_loss": 6.86171492767334, "loss": 4.8153, "step": 196500 }, { "epoch": 0.3757476806640625, "grad_norm": 13.194722175598145, "learning_rate": 3.121271133422852e-05, "lookahead_loss": 6.831992061138153, "loss": 4.7965, "step": 197000 }, { "epoch": 0.37670135498046875, "grad_norm": 11.360389709472656, "learning_rate": 3.11650276184082e-05, "lookahead_loss": 6.886775142669678, "loss": 4.8341, "step": 197500 }, { "epoch": 0.377655029296875, "grad_norm": 11.576452255249023, "learning_rate": 3.111734390258789e-05, "lookahead_loss": 6.8808323831558225, "loss": 4.8111, "step": 198000 }, { "epoch": 0.37860870361328125, "grad_norm": 41.32535171508789, "learning_rate": 3.106966018676758e-05, "lookahead_loss": 6.86693590259552, "loss": 4.8501, "step": 198500 }, { "epoch": 0.3795623779296875, "grad_norm": 8.69752311706543, "learning_rate": 3.1021976470947266e-05, "lookahead_loss": 6.857236812114715, "loss": 4.8269, "step": 199000 }, { "epoch": 0.38051605224609375, "grad_norm": 9.38365650177002, "learning_rate": 3.0974292755126956e-05, "lookahead_loss": 6.845773404598236, "loss": 4.8239, "step": 199500 }, { "epoch": 0.3814697265625, "grad_norm": 7.945014953613281, "learning_rate": 3.092660903930664e-05, "lookahead_loss": 6.877838421344757, "loss": 4.7666, "step": 200000 }, { "epoch": 0.3814697265625, "eval_accuracy": 0.027060861056751467, "eval_lookahead_loss": 6.762299109077453, "eval_lookahead_perplexity": 864.627785961729, "eval_loss": 4.598567962646484, "eval_perplexity": 99.34195234495421, "eval_runtime": 533.6711, "eval_samples_per_second": 18.738, "eval_steps_per_second": 4.685, "step": 200000 }, { "epoch": 0.38242340087890625, "grad_norm": 7.462662696838379, "learning_rate": 3.087892532348633e-05, "lookahead_loss": 6.881955535888672, "loss": 4.724, "step": 200500 }, { "epoch": 0.3833770751953125, "grad_norm": 15.07455825805664, "learning_rate": 3.083124160766602e-05, "lookahead_loss": 6.830864996433258, "loss": 4.7542, "step": 201000 }, { "epoch": 0.38433074951171875, "grad_norm": 13.703947067260742, "learning_rate": 3.07835578918457e-05, "lookahead_loss": 6.87936337184906, "loss": 4.8067, "step": 201500 }, { "epoch": 0.385284423828125, "grad_norm": 6.508729934692383, "learning_rate": 3.0735874176025393e-05, "lookahead_loss": 6.8694367532730105, "loss": 4.7618, "step": 202000 }, { "epoch": 0.38623809814453125, "grad_norm": 53.115013122558594, "learning_rate": 3.068819046020508e-05, "lookahead_loss": 6.879793552398682, "loss": 4.7583, "step": 202500 }, { "epoch": 0.3871917724609375, "grad_norm": 13.237932205200195, "learning_rate": 3.064050674438477e-05, "lookahead_loss": 6.821065585136414, "loss": 4.7736, "step": 203000 }, { "epoch": 0.38814544677734375, "grad_norm": 21.41532325744629, "learning_rate": 3.059282302856446e-05, "lookahead_loss": 6.857066061019897, "loss": 4.7332, "step": 203500 }, { "epoch": 0.38909912109375, "grad_norm": 14.233695030212402, "learning_rate": 3.054513931274414e-05, "lookahead_loss": 6.779116709709167, "loss": 4.753, "step": 204000 }, { "epoch": 0.39005279541015625, "grad_norm": 18.57429313659668, "learning_rate": 3.049745559692383e-05, "lookahead_loss": 6.92658317899704, "loss": 4.7224, "step": 204500 }, { "epoch": 0.3910064697265625, "grad_norm": 9.019280433654785, "learning_rate": 3.0449771881103518e-05, "lookahead_loss": 6.800223940849304, "loss": 4.7235, "step": 205000 }, { "epoch": 0.3910064697265625, "eval_accuracy": 0.028047553816046965, "eval_lookahead_loss": 6.753289661502838, "eval_lookahead_perplexity": 856.8729530876706, "eval_loss": 4.580152988433838, "eval_perplexity": 97.52931392274326, "eval_runtime": 508.0658, "eval_samples_per_second": 19.682, "eval_steps_per_second": 4.921, "step": 205000 }, { "epoch": 0.39196014404296875, "grad_norm": 10.35768985748291, "learning_rate": 3.0402088165283205e-05, "lookahead_loss": 6.860224767684937, "loss": 4.7285, "step": 205500 }, { "epoch": 0.392913818359375, "grad_norm": 12.51717472076416, "learning_rate": 3.035440444946289e-05, "lookahead_loss": 6.851239624023438, "loss": 4.717, "step": 206000 }, { "epoch": 0.39386749267578125, "grad_norm": 14.596587181091309, "learning_rate": 3.0306720733642578e-05, "lookahead_loss": 6.924259947776794, "loss": 4.7235, "step": 206500 }, { "epoch": 0.3948211669921875, "grad_norm": 10.076025009155273, "learning_rate": 3.025903701782227e-05, "lookahead_loss": 6.871153787612915, "loss": 4.7163, "step": 207000 }, { "epoch": 0.39577484130859375, "grad_norm": 6.595065116882324, "learning_rate": 3.0211353302001955e-05, "lookahead_loss": 6.7870999517440795, "loss": 4.7171, "step": 207500 }, { "epoch": 0.396728515625, "grad_norm": 8.028550148010254, "learning_rate": 3.0163669586181642e-05, "lookahead_loss": 6.8409305191040035, "loss": 4.7119, "step": 208000 }, { "epoch": 0.39768218994140625, "grad_norm": 9.688517570495605, "learning_rate": 3.011598587036133e-05, "lookahead_loss": 6.771541133880615, "loss": 4.7234, "step": 208500 }, { "epoch": 0.3986358642578125, "grad_norm": 25.116613388061523, "learning_rate": 3.0068302154541016e-05, "lookahead_loss": 6.841614699363708, "loss": 4.7592, "step": 209000 }, { "epoch": 0.39958953857421875, "grad_norm": 7.961500644683838, "learning_rate": 3.0020618438720706e-05, "lookahead_loss": 6.946041712760925, "loss": 4.7865, "step": 209500 }, { "epoch": 0.400543212890625, "grad_norm": 8.984140396118164, "learning_rate": 2.9972934722900393e-05, "lookahead_loss": 6.846708699703217, "loss": 4.8219, "step": 210000 }, { "epoch": 0.400543212890625, "eval_accuracy": 0.027758317025440314, "eval_lookahead_loss": 6.747739342975616, "eval_lookahead_perplexity": 852.1302093054367, "eval_loss": 4.562594413757324, "eval_perplexity": 95.83178488886253, "eval_runtime": 515.9616, "eval_samples_per_second": 19.381, "eval_steps_per_second": 4.845, "step": 210000 }, { "epoch": 0.40149688720703125, "grad_norm": 10.774547576904297, "learning_rate": 2.992525100708008e-05, "lookahead_loss": 6.8916302680969235, "loss": 4.7886, "step": 210500 }, { "epoch": 0.4024505615234375, "grad_norm": 12.251407623291016, "learning_rate": 2.9877567291259766e-05, "lookahead_loss": 6.847809802055359, "loss": 4.749, "step": 211000 }, { "epoch": 0.40340423583984375, "grad_norm": 9.838257789611816, "learning_rate": 2.9829883575439453e-05, "lookahead_loss": 6.877293023109436, "loss": 4.7845, "step": 211500 }, { "epoch": 0.40435791015625, "grad_norm": 10.019533157348633, "learning_rate": 2.9782199859619143e-05, "lookahead_loss": 6.922103682518006, "loss": 4.7371, "step": 212000 }, { "epoch": 0.40531158447265625, "grad_norm": 9.81891918182373, "learning_rate": 2.973451614379883e-05, "lookahead_loss": 6.806386597633362, "loss": 4.7041, "step": 212500 }, { "epoch": 0.4062652587890625, "grad_norm": 16.70152473449707, "learning_rate": 2.9686832427978517e-05, "lookahead_loss": 6.842723188400268, "loss": 4.7369, "step": 213000 }, { "epoch": 0.40721893310546875, "grad_norm": 8.388794898986816, "learning_rate": 2.9639148712158204e-05, "lookahead_loss": 6.855916251182556, "loss": 4.785, "step": 213500 }, { "epoch": 0.408172607421875, "grad_norm": 13.115460395812988, "learning_rate": 2.959146499633789e-05, "lookahead_loss": 6.865746474742889, "loss": 4.7637, "step": 214000 }, { "epoch": 0.40912628173828125, "grad_norm": 8.38688850402832, "learning_rate": 2.954378128051758e-05, "lookahead_loss": 6.826547119617462, "loss": 4.7006, "step": 214500 }, { "epoch": 0.4100799560546875, "grad_norm": 7.511728286743164, "learning_rate": 2.9496097564697268e-05, "lookahead_loss": 6.845867900848389, "loss": 4.7113, "step": 215000 }, { "epoch": 0.4100799560546875, "eval_accuracy": 0.027773385518590998, "eval_lookahead_loss": 6.733608465385437, "eval_lookahead_perplexity": 840.1735397041244, "eval_loss": 4.545471668243408, "eval_perplexity": 94.20485016833317, "eval_runtime": 508.2601, "eval_samples_per_second": 19.675, "eval_steps_per_second": 4.919, "step": 215000 }, { "epoch": 0.41103363037109375, "grad_norm": 13.766517639160156, "learning_rate": 2.9448413848876955e-05, "lookahead_loss": 6.744051348686218, "loss": 4.6922, "step": 215500 }, { "epoch": 0.4119873046875, "grad_norm": 8.105950355529785, "learning_rate": 2.940073013305664e-05, "lookahead_loss": 6.893662909507752, "loss": 4.7057, "step": 216000 }, { "epoch": 0.41294097900390625, "grad_norm": 10.36955451965332, "learning_rate": 2.9353046417236328e-05, "lookahead_loss": 6.8541857380867, "loss": 4.7249, "step": 216500 }, { "epoch": 0.4138946533203125, "grad_norm": 6.440521240234375, "learning_rate": 2.930536270141602e-05, "lookahead_loss": 6.765749216556549, "loss": 4.7234, "step": 217000 }, { "epoch": 0.41484832763671875, "grad_norm": 7.272236347198486, "learning_rate": 2.9257678985595705e-05, "lookahead_loss": 6.853433388710022, "loss": 4.665, "step": 217500 }, { "epoch": 0.415802001953125, "grad_norm": 6.276742935180664, "learning_rate": 2.9209995269775392e-05, "lookahead_loss": 6.764645181655884, "loss": 4.7042, "step": 218000 }, { "epoch": 0.41675567626953125, "grad_norm": 11.672040939331055, "learning_rate": 2.916231155395508e-05, "lookahead_loss": 6.796969823360443, "loss": 4.7007, "step": 218500 }, { "epoch": 0.4177093505859375, "grad_norm": 8.154991149902344, "learning_rate": 2.9114627838134766e-05, "lookahead_loss": 6.782781042098999, "loss": 4.6779, "step": 219000 }, { "epoch": 0.41866302490234375, "grad_norm": 17.222139358520508, "learning_rate": 2.9066944122314456e-05, "lookahead_loss": 6.827827154636383, "loss": 4.6954, "step": 219500 }, { "epoch": 0.41961669921875, "grad_norm": 7.38473653793335, "learning_rate": 2.9019260406494143e-05, "lookahead_loss": 6.778075915336609, "loss": 4.7011, "step": 220000 }, { "epoch": 0.41961669921875, "eval_accuracy": 0.027556947162426616, "eval_lookahead_loss": 6.736468231105804, "eval_lookahead_perplexity": 842.5796780482721, "eval_loss": 4.5303053855896, "eval_perplexity": 92.78689253576687, "eval_runtime": 532.0361, "eval_samples_per_second": 18.796, "eval_steps_per_second": 4.699, "step": 220000 }, { "epoch": 0.42057037353515625, "grad_norm": 6.253684043884277, "learning_rate": 2.897157669067383e-05, "lookahead_loss": 6.806883024215698, "loss": 4.6706, "step": 220500 }, { "epoch": 0.4215240478515625, "grad_norm": 14.447110176086426, "learning_rate": 2.8923892974853516e-05, "lookahead_loss": 6.82536906003952, "loss": 4.6856, "step": 221000 }, { "epoch": 0.42247772216796875, "grad_norm": 8.760591506958008, "learning_rate": 2.8876209259033203e-05, "lookahead_loss": 6.84308206653595, "loss": 4.7051, "step": 221500 }, { "epoch": 0.423431396484375, "grad_norm": 6.616645812988281, "learning_rate": 2.8828525543212893e-05, "lookahead_loss": 6.770108239173889, "loss": 4.6677, "step": 222000 }, { "epoch": 0.42438507080078125, "grad_norm": 8.348169326782227, "learning_rate": 2.878084182739258e-05, "lookahead_loss": 6.71211741733551, "loss": 4.6662, "step": 222500 }, { "epoch": 0.4253387451171875, "grad_norm": 15.148275375366211, "learning_rate": 2.8733158111572267e-05, "lookahead_loss": 6.849678965568542, "loss": 4.6864, "step": 223000 }, { "epoch": 0.42629241943359375, "grad_norm": 15.359856605529785, "learning_rate": 2.8685474395751954e-05, "lookahead_loss": 6.844837466239929, "loss": 4.7504, "step": 223500 }, { "epoch": 0.42724609375, "grad_norm": 7.492391586303711, "learning_rate": 2.863779067993164e-05, "lookahead_loss": 6.823279851913452, "loss": 4.7607, "step": 224000 }, { "epoch": 0.42819976806640625, "grad_norm": 11.877659797668457, "learning_rate": 2.859010696411133e-05, "lookahead_loss": 6.86239346408844, "loss": 4.764, "step": 224500 }, { "epoch": 0.4291534423828125, "grad_norm": 7.433264255523682, "learning_rate": 2.8542423248291018e-05, "lookahead_loss": 6.921005269050598, "loss": 4.7134, "step": 225000 }, { "epoch": 0.4291534423828125, "eval_accuracy": 0.02751976516634051, "eval_lookahead_loss": 6.735574113845825, "eval_lookahead_perplexity": 841.8266497132043, "eval_loss": 4.522216320037842, "eval_perplexity": 92.03936077251684, "eval_runtime": 511.6323, "eval_samples_per_second": 19.545, "eval_steps_per_second": 4.886, "step": 225000 }, { "epoch": 0.43010711669921875, "grad_norm": 14.627372741699219, "learning_rate": 2.8494739532470705e-05, "lookahead_loss": 6.786255206108093, "loss": 4.7154, "step": 225500 }, { "epoch": 0.431060791015625, "grad_norm": 12.684699058532715, "learning_rate": 2.844705581665039e-05, "lookahead_loss": 6.8134490013122555, "loss": 4.7332, "step": 226000 }, { "epoch": 0.43201446533203125, "grad_norm": 10.127054214477539, "learning_rate": 2.8399372100830078e-05, "lookahead_loss": 6.928757458209992, "loss": 4.7959, "step": 226500 }, { "epoch": 0.4329681396484375, "grad_norm": 13.97661018371582, "learning_rate": 2.835168838500977e-05, "lookahead_loss": 6.831813469409942, "loss": 4.6792, "step": 227000 }, { "epoch": 0.43392181396484375, "grad_norm": 7.198910713195801, "learning_rate": 2.8304004669189455e-05, "lookahead_loss": 6.798321119308472, "loss": 4.7346, "step": 227500 }, { "epoch": 0.43487548828125, "grad_norm": 9.290630340576172, "learning_rate": 2.8256320953369142e-05, "lookahead_loss": 6.801683102607727, "loss": 4.6582, "step": 228000 }, { "epoch": 0.43582916259765625, "grad_norm": 5.7725749015808105, "learning_rate": 2.820863723754883e-05, "lookahead_loss": 6.765568204402924, "loss": 4.7241, "step": 228500 }, { "epoch": 0.4367828369140625, "grad_norm": 6.510128021240234, "learning_rate": 2.8160953521728516e-05, "lookahead_loss": 6.887916783809662, "loss": 4.6899, "step": 229000 }, { "epoch": 0.43773651123046875, "grad_norm": 17.19536590576172, "learning_rate": 2.8113269805908206e-05, "lookahead_loss": 6.800729690551758, "loss": 4.658, "step": 229500 }, { "epoch": 0.438690185546875, "grad_norm": 8.316597938537598, "learning_rate": 2.8065586090087893e-05, "lookahead_loss": 6.918262977123261, "loss": 4.6916, "step": 230000 }, { "epoch": 0.438690185546875, "eval_accuracy": 0.0276720156555773, "eval_lookahead_loss": 6.719068545341492, "eval_lookahead_perplexity": 828.0458649820013, "eval_loss": 4.497356414794922, "eval_perplexity": 89.77947761130496, "eval_runtime": 506.6438, "eval_samples_per_second": 19.738, "eval_steps_per_second": 4.934, "step": 230000 }, { "epoch": 0.43964385986328125, "grad_norm": 9.408232688903809, "learning_rate": 2.801790237426758e-05, "lookahead_loss": 6.879965525150299, "loss": 4.6795, "step": 230500 }, { "epoch": 0.4405975341796875, "grad_norm": 7.654597759246826, "learning_rate": 2.7970218658447266e-05, "lookahead_loss": 6.800792472839356, "loss": 4.6324, "step": 231000 }, { "epoch": 0.44155120849609375, "grad_norm": 5.555298328399658, "learning_rate": 2.7922534942626953e-05, "lookahead_loss": 6.890419705867767, "loss": 4.6934, "step": 231500 }, { "epoch": 0.4425048828125, "grad_norm": 9.836344718933105, "learning_rate": 2.7874851226806643e-05, "lookahead_loss": 6.8155165619850155, "loss": 4.6854, "step": 232000 }, { "epoch": 0.44345855712890625, "grad_norm": 9.457110404968262, "learning_rate": 2.782716751098633e-05, "lookahead_loss": 6.830329776763916, "loss": 4.6537, "step": 232500 }, { "epoch": 0.4444122314453125, "grad_norm": 4.722607612609863, "learning_rate": 2.7779483795166017e-05, "lookahead_loss": 6.735857604980469, "loss": 4.6566, "step": 233000 }, { "epoch": 0.44536590576171875, "grad_norm": 6.821660041809082, "learning_rate": 2.7731800079345704e-05, "lookahead_loss": 6.793193730354309, "loss": 4.6176, "step": 233500 }, { "epoch": 0.446319580078125, "grad_norm": 8.533875465393066, "learning_rate": 2.768411636352539e-05, "lookahead_loss": 6.74989365196228, "loss": 4.6359, "step": 234000 }, { "epoch": 0.44727325439453125, "grad_norm": 10.35650634765625, "learning_rate": 2.763643264770508e-05, "lookahead_loss": 6.877397830963135, "loss": 4.6427, "step": 234500 }, { "epoch": 0.4482269287109375, "grad_norm": 17.310665130615234, "learning_rate": 2.7588748931884768e-05, "lookahead_loss": 6.8418282942771915, "loss": 4.6694, "step": 235000 }, { "epoch": 0.4482269287109375, "eval_accuracy": 0.026724657534246576, "eval_lookahead_loss": 6.7319054651260375, "eval_lookahead_perplexity": 838.7439415965529, "eval_loss": 4.509628772735596, "eval_perplexity": 90.8880721167475, "eval_runtime": 502.5732, "eval_samples_per_second": 19.898, "eval_steps_per_second": 4.974, "step": 235000 }, { "epoch": 0.44918060302734375, "grad_norm": 7.831390857696533, "learning_rate": 2.7541065216064455e-05, "lookahead_loss": 6.828319634437561, "loss": 4.6669, "step": 235500 }, { "epoch": 0.45013427734375, "grad_norm": 5.560863971710205, "learning_rate": 2.749338150024414e-05, "lookahead_loss": 6.776805781364441, "loss": 4.6587, "step": 236000 }, { "epoch": 0.45108795166015625, "grad_norm": 6.234622955322266, "learning_rate": 2.7445697784423828e-05, "lookahead_loss": 6.797786170959473, "loss": 4.6617, "step": 236500 }, { "epoch": 0.4520416259765625, "grad_norm": 6.328197479248047, "learning_rate": 2.739801406860352e-05, "lookahead_loss": 6.730026741504669, "loss": 4.6415, "step": 237000 }, { "epoch": 0.45299530029296875, "grad_norm": 5.458266735076904, "learning_rate": 2.7350330352783205e-05, "lookahead_loss": 6.792340087890625, "loss": 4.6356, "step": 237500 }, { "epoch": 0.453948974609375, "grad_norm": 7.036812782287598, "learning_rate": 2.7302646636962892e-05, "lookahead_loss": 6.8629042949676515, "loss": 4.7164, "step": 238000 }, { "epoch": 0.45490264892578125, "grad_norm": 7.387545108795166, "learning_rate": 2.725496292114258e-05, "lookahead_loss": 6.868465321540833, "loss": 4.7001, "step": 238500 }, { "epoch": 0.4558563232421875, "grad_norm": 10.85196590423584, "learning_rate": 2.7207279205322266e-05, "lookahead_loss": 6.839413352489472, "loss": 4.7126, "step": 239000 }, { "epoch": 0.45680999755859375, "grad_norm": 10.480144500732422, "learning_rate": 2.7159595489501956e-05, "lookahead_loss": 6.776459849834442, "loss": 4.6775, "step": 239500 }, { "epoch": 0.457763671875, "grad_norm": 8.411324501037598, "learning_rate": 2.7111911773681643e-05, "lookahead_loss": 6.812754894256591, "loss": 4.6188, "step": 240000 }, { "epoch": 0.457763671875, "eval_accuracy": 0.027436399217221134, "eval_lookahead_loss": 6.703162211608887, "eval_lookahead_perplexity": 814.9788904433267, "eval_loss": 4.4625091552734375, "eval_perplexity": 86.70479217035178, "eval_runtime": 508.787, "eval_samples_per_second": 19.655, "eval_steps_per_second": 4.914, "step": 240000 }, { "epoch": 0.45871734619140625, "grad_norm": 4.500423431396484, "learning_rate": 2.706422805786133e-05, "lookahead_loss": 6.886532336235047, "loss": 4.6683, "step": 240500 }, { "epoch": 0.4596710205078125, "grad_norm": 5.8886003494262695, "learning_rate": 2.7016544342041016e-05, "lookahead_loss": 6.797688834190368, "loss": 4.6639, "step": 241000 }, { "epoch": 0.46062469482421875, "grad_norm": 7.274393558502197, "learning_rate": 2.6968860626220703e-05, "lookahead_loss": 6.8456897892951964, "loss": 4.7251, "step": 241500 }, { "epoch": 0.461578369140625, "grad_norm": 6.767756462097168, "learning_rate": 2.6921176910400393e-05, "lookahead_loss": 6.805787023067475, "loss": 4.6677, "step": 242000 }, { "epoch": 0.46253204345703125, "grad_norm": 10.675180435180664, "learning_rate": 2.687349319458008e-05, "lookahead_loss": 6.761420122623443, "loss": 4.6245, "step": 242500 }, { "epoch": 0.4634857177734375, "grad_norm": 6.167633056640625, "learning_rate": 2.6825809478759767e-05, "lookahead_loss": 6.803386947154999, "loss": 4.6402, "step": 243000 }, { "epoch": 0.46443939208984375, "grad_norm": 9.084933280944824, "learning_rate": 2.6778125762939454e-05, "lookahead_loss": 6.734957904338836, "loss": 4.6469, "step": 243500 }, { "epoch": 0.46539306640625, "grad_norm": 7.723199367523193, "learning_rate": 2.673044204711914e-05, "lookahead_loss": 6.825903774261475, "loss": 4.6121, "step": 244000 }, { "epoch": 0.46634674072265625, "grad_norm": 7.428271293640137, "learning_rate": 2.668275833129883e-05, "lookahead_loss": 6.8290307970047, "loss": 4.5797, "step": 244500 }, { "epoch": 0.4673004150390625, "grad_norm": 7.509993553161621, "learning_rate": 2.6635074615478518e-05, "lookahead_loss": 6.791725723743439, "loss": 4.6234, "step": 245000 }, { "epoch": 0.4673004150390625, "eval_accuracy": 0.027202544031311154, "eval_lookahead_loss": 6.704393914222718, "eval_lookahead_perplexity": 815.9833205254972, "eval_loss": 4.45693826675415, "eval_perplexity": 86.22311237703373, "eval_runtime": 506.3726, "eval_samples_per_second": 19.748, "eval_steps_per_second": 4.937, "step": 245000 }, { "epoch": 0.46825408935546875, "grad_norm": 6.642219543457031, "learning_rate": 2.6587390899658205e-05, "lookahead_loss": 6.893568713188172, "loss": 4.6311, "step": 245500 }, { "epoch": 0.469207763671875, "grad_norm": 7.002037048339844, "learning_rate": 2.653970718383789e-05, "lookahead_loss": 6.845495506286621, "loss": 4.575, "step": 246000 }, { "epoch": 0.47016143798828125, "grad_norm": 6.98114538192749, "learning_rate": 2.6492023468017578e-05, "lookahead_loss": 6.740485010147094, "loss": 4.6177, "step": 246500 }, { "epoch": 0.4711151123046875, "grad_norm": 14.055593490600586, "learning_rate": 2.644433975219727e-05, "lookahead_loss": 6.796768043518067, "loss": 4.6415, "step": 247000 }, { "epoch": 0.47206878662109375, "grad_norm": 7.6654157638549805, "learning_rate": 2.6396656036376955e-05, "lookahead_loss": 6.87089737033844, "loss": 4.5815, "step": 247500 }, { "epoch": 0.4730224609375, "grad_norm": 5.913922309875488, "learning_rate": 2.6348972320556642e-05, "lookahead_loss": 6.75460085439682, "loss": 4.6264, "step": 248000 }, { "epoch": 0.47397613525390625, "grad_norm": 8.11966609954834, "learning_rate": 2.630128860473633e-05, "lookahead_loss": 6.835740717887878, "loss": 4.5871, "step": 248500 }, { "epoch": 0.4749298095703125, "grad_norm": 17.4832820892334, "learning_rate": 2.6253604888916016e-05, "lookahead_loss": 6.71933558177948, "loss": 4.6032, "step": 249000 }, { "epoch": 0.47588348388671875, "grad_norm": 11.189537048339844, "learning_rate": 2.6205921173095706e-05, "lookahead_loss": 6.790458413600922, "loss": 4.6322, "step": 249500 }, { "epoch": 0.476837158203125, "grad_norm": 8.002442359924316, "learning_rate": 2.6158237457275393e-05, "lookahead_loss": 6.804021176338196, "loss": 4.5967, "step": 250000 }, { "epoch": 0.476837158203125, "eval_accuracy": 0.02773620352250489, "eval_lookahead_loss": 6.698928699874878, "eval_lookahead_perplexity": 811.5359607315246, "eval_loss": 4.437278747558594, "eval_perplexity": 84.54456126062698, "eval_runtime": 509.8518, "eval_samples_per_second": 19.614, "eval_steps_per_second": 4.903, "step": 250000 }, { "epoch": 0.47779083251953125, "grad_norm": 5.189752578735352, "learning_rate": 2.611055374145508e-05, "lookahead_loss": 6.621621014595032, "loss": 4.5711, "step": 250500 }, { "epoch": 0.4787445068359375, "grad_norm": 4.661776065826416, "learning_rate": 2.6062870025634766e-05, "lookahead_loss": 6.768929479598999, "loss": 4.6091, "step": 251000 }, { "epoch": 0.47969818115234375, "grad_norm": 7.735442161560059, "learning_rate": 2.6015186309814453e-05, "lookahead_loss": 6.784061515331269, "loss": 4.6394, "step": 251500 }, { "epoch": 0.48065185546875, "grad_norm": 4.69485330581665, "learning_rate": 2.5967502593994143e-05, "lookahead_loss": 6.812984355926513, "loss": 4.6192, "step": 252000 }, { "epoch": 0.48160552978515625, "grad_norm": 5.021263122558594, "learning_rate": 2.591981887817383e-05, "lookahead_loss": 6.7972890887260435, "loss": 4.6851, "step": 252500 }, { "epoch": 0.4825592041015625, "grad_norm": 6.682577610015869, "learning_rate": 2.5872135162353517e-05, "lookahead_loss": 6.807465260505676, "loss": 4.6461, "step": 253000 }, { "epoch": 0.48351287841796875, "grad_norm": 5.426817417144775, "learning_rate": 2.5824451446533204e-05, "lookahead_loss": 6.765803963184356, "loss": 4.6627, "step": 253500 }, { "epoch": 0.484466552734375, "grad_norm": 5.963171005249023, "learning_rate": 2.577676773071289e-05, "lookahead_loss": 6.865730625152588, "loss": 4.609, "step": 254000 }, { "epoch": 0.48542022705078125, "grad_norm": 23.531435012817383, "learning_rate": 2.572908401489258e-05, "lookahead_loss": 6.708600515842438, "loss": 4.5349, "step": 254500 }, { "epoch": 0.4863739013671875, "grad_norm": 11.49215316772461, "learning_rate": 2.5681400299072268e-05, "lookahead_loss": 6.789046202659607, "loss": 4.5814, "step": 255000 }, { "epoch": 0.4863739013671875, "eval_accuracy": 0.027103326810176125, "eval_lookahead_loss": 6.685731836509705, "eval_lookahead_perplexity": 800.8965891183798, "eval_loss": 4.4201154708862305, "eval_perplexity": 83.1058811140611, "eval_runtime": 508.5142, "eval_samples_per_second": 19.665, "eval_steps_per_second": 4.916, "step": 255000 }, { "epoch": 0.48732757568359375, "grad_norm": 16.101449966430664, "learning_rate": 2.5633716583251955e-05, "lookahead_loss": 6.790324227333069, "loss": 4.6232, "step": 255500 }, { "epoch": 0.48828125, "grad_norm": 23.115615844726562, "learning_rate": 2.558603286743164e-05, "lookahead_loss": 6.785535855293274, "loss": 4.5966, "step": 256000 }, { "epoch": 0.48923492431640625, "grad_norm": 11.256710052490234, "learning_rate": 2.5538349151611328e-05, "lookahead_loss": 6.759244240760803, "loss": 4.6368, "step": 256500 }, { "epoch": 0.4901885986328125, "grad_norm": 8.227165222167969, "learning_rate": 2.549066543579102e-05, "lookahead_loss": 6.828501435279846, "loss": 4.6004, "step": 257000 }, { "epoch": 0.49114227294921875, "grad_norm": 6.936631679534912, "learning_rate": 2.5442981719970705e-05, "lookahead_loss": 6.745646829605103, "loss": 4.5773, "step": 257500 }, { "epoch": 0.492095947265625, "grad_norm": 4.894238471984863, "learning_rate": 2.5395298004150392e-05, "lookahead_loss": 6.856553471565246, "loss": 4.5901, "step": 258000 }, { "epoch": 0.49304962158203125, "grad_norm": 5.337064266204834, "learning_rate": 2.534761428833008e-05, "lookahead_loss": 6.7623658618927, "loss": 4.5887, "step": 258500 }, { "epoch": 0.4940032958984375, "grad_norm": 6.3007707595825195, "learning_rate": 2.5299930572509766e-05, "lookahead_loss": 6.740935876846313, "loss": 4.5898, "step": 259000 }, { "epoch": 0.49495697021484375, "grad_norm": 7.725447654724121, "learning_rate": 2.5252246856689456e-05, "lookahead_loss": 6.743220863342285, "loss": 4.6145, "step": 259500 }, { "epoch": 0.49591064453125, "grad_norm": 4.923539638519287, "learning_rate": 2.5204563140869143e-05, "lookahead_loss": 6.868897308349609, "loss": 4.5838, "step": 260000 }, { "epoch": 0.49591064453125, "eval_accuracy": 0.027579060665362035, "eval_lookahead_loss": 6.681761246967316, "eval_lookahead_perplexity": 797.7228624496879, "eval_loss": 4.413488864898682, "eval_perplexity": 82.55699182974226, "eval_runtime": 509.737, "eval_samples_per_second": 19.618, "eval_steps_per_second": 4.904, "step": 260000 }, { "epoch": 0.49686431884765625, "grad_norm": 11.082085609436035, "learning_rate": 2.515687942504883e-05, "lookahead_loss": 6.820860422611236, "loss": 4.5974, "step": 260500 }, { "epoch": 0.4978179931640625, "grad_norm": 6.4547810554504395, "learning_rate": 2.5109195709228516e-05, "lookahead_loss": 6.837406048774719, "loss": 4.543, "step": 261000 }, { "epoch": 0.49877166748046875, "grad_norm": 4.4494781494140625, "learning_rate": 2.5061511993408203e-05, "lookahead_loss": 6.75308655166626, "loss": 4.5597, "step": 261500 }, { "epoch": 0.499725341796875, "grad_norm": 9.954333305358887, "learning_rate": 2.5013828277587893e-05, "lookahead_loss": 6.834174869537353, "loss": 4.5766, "step": 262000 }, { "epoch": 0.5006790161132812, "grad_norm": 3.8551645278930664, "learning_rate": 2.496614456176758e-05, "lookahead_loss": 6.795374658584595, "loss": 4.5775, "step": 262500 }, { "epoch": 0.5016326904296875, "grad_norm": 4.753420829772949, "learning_rate": 2.4918460845947267e-05, "lookahead_loss": 6.749995267868042, "loss": 4.5786, "step": 263000 }, { "epoch": 0.5025863647460938, "grad_norm": 5.3529229164123535, "learning_rate": 2.4870777130126954e-05, "lookahead_loss": 6.815603377342224, "loss": 4.5607, "step": 263500 }, { "epoch": 0.5035400390625, "grad_norm": 8.77649974822998, "learning_rate": 2.482309341430664e-05, "lookahead_loss": 6.859789810180664, "loss": 4.5829, "step": 264000 }, { "epoch": 0.5044937133789062, "grad_norm": 6.863483905792236, "learning_rate": 2.477540969848633e-05, "lookahead_loss": 6.88712083530426, "loss": 4.5733, "step": 264500 }, { "epoch": 0.5054473876953125, "grad_norm": 7.630585193634033, "learning_rate": 2.4727725982666018e-05, "lookahead_loss": 6.735982489585877, "loss": 4.575, "step": 265000 }, { "epoch": 0.5054473876953125, "eval_accuracy": 0.026781409001956948, "eval_lookahead_loss": 6.677350661945343, "eval_lookahead_perplexity": 794.2121857011977, "eval_loss": 4.393498420715332, "eval_perplexity": 80.92302714562551, "eval_runtime": 503.9923, "eval_samples_per_second": 19.842, "eval_steps_per_second": 4.96, "step": 265000 }, { "epoch": 0.5064010620117188, "grad_norm": 4.656352519989014, "learning_rate": 2.4680042266845705e-05, "lookahead_loss": 6.784516426086426, "loss": 4.5723, "step": 265500 }, { "epoch": 0.507354736328125, "grad_norm": 5.293808460235596, "learning_rate": 2.463235855102539e-05, "lookahead_loss": 6.683867498397827, "loss": 4.511, "step": 266000 }, { "epoch": 0.5083084106445312, "grad_norm": 6.616509914398193, "learning_rate": 2.4584674835205078e-05, "lookahead_loss": 6.839096932888031, "loss": 4.6155, "step": 266500 }, { "epoch": 0.5092620849609375, "grad_norm": 5.832101345062256, "learning_rate": 2.453699111938477e-05, "lookahead_loss": 6.819410691261291, "loss": 4.6213, "step": 267000 }, { "epoch": 0.5102157592773438, "grad_norm": 9.951017379760742, "learning_rate": 2.4489307403564455e-05, "lookahead_loss": 6.782696785449982, "loss": 4.6269, "step": 267500 }, { "epoch": 0.51116943359375, "grad_norm": 4.919025421142578, "learning_rate": 2.4441623687744142e-05, "lookahead_loss": 6.733903615951538, "loss": 4.5626, "step": 268000 }, { "epoch": 0.5121231079101562, "grad_norm": 5.371066570281982, "learning_rate": 2.439393997192383e-05, "lookahead_loss": 6.750059889793396, "loss": 4.5591, "step": 268500 }, { "epoch": 0.5130767822265625, "grad_norm": 8.982616424560547, "learning_rate": 2.4346256256103516e-05, "lookahead_loss": 6.739513368606567, "loss": 4.5586, "step": 269000 }, { "epoch": 0.5140304565429688, "grad_norm": 5.379887104034424, "learning_rate": 2.4298572540283206e-05, "lookahead_loss": 6.765355040550232, "loss": 4.5543, "step": 269500 }, { "epoch": 0.514984130859375, "grad_norm": 6.494828701019287, "learning_rate": 2.4250888824462893e-05, "lookahead_loss": 6.7474156808853145, "loss": 4.5786, "step": 270000 }, { "epoch": 0.514984130859375, "eval_accuracy": 0.027225636007827788, "eval_lookahead_loss": 6.671760565280914, "eval_lookahead_perplexity": 789.7848489604489, "eval_loss": 4.382969856262207, "eval_perplexity": 80.07549332493596, "eval_runtime": 499.533, "eval_samples_per_second": 20.019, "eval_steps_per_second": 5.005, "step": 270000 }, { "epoch": 0.5159378051757812, "grad_norm": 6.766885757446289, "learning_rate": 2.420320510864258e-05, "lookahead_loss": 6.767586376190185, "loss": 4.5664, "step": 270500 }, { "epoch": 0.5168914794921875, "grad_norm": 5.5666890144348145, "learning_rate": 2.4155521392822266e-05, "lookahead_loss": 6.79396518945694, "loss": 4.5821, "step": 271000 }, { "epoch": 0.5178451538085938, "grad_norm": 7.23394775390625, "learning_rate": 2.4107837677001953e-05, "lookahead_loss": 6.813719945907593, "loss": 4.5696, "step": 271500 }, { "epoch": 0.518798828125, "grad_norm": 7.673539638519287, "learning_rate": 2.406015396118164e-05, "lookahead_loss": 6.821444494247436, "loss": 4.5708, "step": 272000 }, { "epoch": 0.5197525024414062, "grad_norm": 5.60297155380249, "learning_rate": 2.401247024536133e-05, "lookahead_loss": 6.737942953109741, "loss": 4.526, "step": 272500 }, { "epoch": 0.5207061767578125, "grad_norm": 7.381375789642334, "learning_rate": 2.3964786529541017e-05, "lookahead_loss": 6.7791331381797795, "loss": 4.5355, "step": 273000 }, { "epoch": 0.5216598510742188, "grad_norm": 5.634623050689697, "learning_rate": 2.3917102813720704e-05, "lookahead_loss": 6.818725703239441, "loss": 4.5341, "step": 273500 }, { "epoch": 0.522613525390625, "grad_norm": 5.267809867858887, "learning_rate": 2.386941909790039e-05, "lookahead_loss": 6.692176138877868, "loss": 4.5455, "step": 274000 }, { "epoch": 0.5235671997070312, "grad_norm": 4.404324054718018, "learning_rate": 2.3821735382080078e-05, "lookahead_loss": 6.767769090175628, "loss": 4.5295, "step": 274500 }, { "epoch": 0.5245208740234375, "grad_norm": 4.600510120391846, "learning_rate": 2.3774051666259768e-05, "lookahead_loss": 6.826436989307403, "loss": 4.5582, "step": 275000 }, { "epoch": 0.5245208740234375, "eval_accuracy": 0.02703894324853229, "eval_lookahead_loss": 6.666026669979096, "eval_lookahead_perplexity": 785.2692636440395, "eval_loss": 4.372351169586182, "eval_perplexity": 79.22969533025932, "eval_runtime": 503.6141, "eval_samples_per_second": 19.856, "eval_steps_per_second": 4.964, "step": 275000 }, { "epoch": 0.5254745483398438, "grad_norm": 4.266603946685791, "learning_rate": 2.3726367950439455e-05, "lookahead_loss": 6.839318128585815, "loss": 4.5546, "step": 275500 }, { "epoch": 0.52642822265625, "grad_norm": 10.231660842895508, "learning_rate": 2.367868423461914e-05, "lookahead_loss": 6.711370833396912, "loss": 4.5411, "step": 276000 }, { "epoch": 0.5273818969726562, "grad_norm": 8.98074722290039, "learning_rate": 2.3631000518798828e-05, "lookahead_loss": 6.714813334941864, "loss": 4.5203, "step": 276500 }, { "epoch": 0.5283355712890625, "grad_norm": 4.800537586212158, "learning_rate": 2.3583316802978515e-05, "lookahead_loss": 6.810443245410919, "loss": 4.4806, "step": 277000 }, { "epoch": 0.5292892456054688, "grad_norm": 4.586134433746338, "learning_rate": 2.3535633087158205e-05, "lookahead_loss": 6.8357763314247135, "loss": 4.5464, "step": 277500 }, { "epoch": 0.530242919921875, "grad_norm": 4.799231052398682, "learning_rate": 2.3487949371337892e-05, "lookahead_loss": 6.840376096725464, "loss": 4.5893, "step": 278000 }, { "epoch": 0.5311965942382812, "grad_norm": 6.292491436004639, "learning_rate": 2.344026565551758e-05, "lookahead_loss": 6.664181458473205, "loss": 4.5283, "step": 278500 }, { "epoch": 0.5321502685546875, "grad_norm": 4.13116455078125, "learning_rate": 2.3392581939697266e-05, "lookahead_loss": 6.718390095233917, "loss": 4.4968, "step": 279000 }, { "epoch": 0.5331039428710938, "grad_norm": 6.146130561828613, "learning_rate": 2.3344898223876953e-05, "lookahead_loss": 6.691791282653808, "loss": 4.5054, "step": 279500 }, { "epoch": 0.5340576171875, "grad_norm": 4.449344635009766, "learning_rate": 2.3297214508056643e-05, "lookahead_loss": 6.688320465087891, "loss": 4.5571, "step": 280000 }, { "epoch": 0.5340576171875, "eval_accuracy": 0.027528180039138945, "eval_lookahead_loss": 6.655807732009888, "eval_lookahead_perplexity": 777.2855079780043, "eval_loss": 4.354944705963135, "eval_perplexity": 77.86251988475132, "eval_runtime": 497.6209, "eval_samples_per_second": 20.096, "eval_steps_per_second": 5.024, "step": 280000 }, { "epoch": 0.5350112915039062, "grad_norm": 3.424004554748535, "learning_rate": 2.324953079223633e-05, "lookahead_loss": 6.779023581981659, "loss": 4.599, "step": 280500 }, { "epoch": 0.5359649658203125, "grad_norm": 7.118185043334961, "learning_rate": 2.3201847076416016e-05, "lookahead_loss": 6.750466910362244, "loss": 4.5971, "step": 281000 }, { "epoch": 0.5369186401367188, "grad_norm": 5.894007682800293, "learning_rate": 2.3154163360595703e-05, "lookahead_loss": 6.778980483531952, "loss": 4.5992, "step": 281500 }, { "epoch": 0.537872314453125, "grad_norm": 3.7739498615264893, "learning_rate": 2.310647964477539e-05, "lookahead_loss": 6.780877924919128, "loss": 4.545, "step": 282000 }, { "epoch": 0.5388259887695312, "grad_norm": 5.057179927825928, "learning_rate": 2.305879592895508e-05, "lookahead_loss": 6.754450489997864, "loss": 4.55, "step": 282500 }, { "epoch": 0.5397796630859375, "grad_norm": 3.9184961318969727, "learning_rate": 2.3011112213134767e-05, "lookahead_loss": 6.785111947059631, "loss": 4.5517, "step": 283000 }, { "epoch": 0.5407333374023438, "grad_norm": 8.290541648864746, "learning_rate": 2.2963428497314454e-05, "lookahead_loss": 6.741956337928772, "loss": 4.5272, "step": 283500 }, { "epoch": 0.54168701171875, "grad_norm": 5.1730475425720215, "learning_rate": 2.291574478149414e-05, "lookahead_loss": 6.689818567276001, "loss": 4.4988, "step": 284000 }, { "epoch": 0.5426406860351562, "grad_norm": 4.866126537322998, "learning_rate": 2.2868061065673828e-05, "lookahead_loss": 6.760932396888733, "loss": 4.5109, "step": 284500 }, { "epoch": 0.5435943603515625, "grad_norm": 6.932046413421631, "learning_rate": 2.2820377349853518e-05, "lookahead_loss": 6.702657186508179, "loss": 4.5238, "step": 285000 }, { "epoch": 0.5435943603515625, "eval_accuracy": 0.027582974559686887, "eval_lookahead_loss": 6.649052880573272, "eval_lookahead_perplexity": 772.0527529857387, "eval_loss": 4.342409610748291, "eval_perplexity": 76.89259751911013, "eval_runtime": 508.7374, "eval_samples_per_second": 19.657, "eval_steps_per_second": 4.914, "step": 285000 }, { "epoch": 0.5445480346679688, "grad_norm": 5.049684047698975, "learning_rate": 2.2772693634033205e-05, "lookahead_loss": 6.813228886127472, "loss": 4.5147, "step": 285500 }, { "epoch": 0.545501708984375, "grad_norm": 6.871647834777832, "learning_rate": 2.272500991821289e-05, "lookahead_loss": 6.791557188510895, "loss": 4.5271, "step": 286000 }, { "epoch": 0.5464553833007812, "grad_norm": 6.409543991088867, "learning_rate": 2.2677326202392578e-05, "lookahead_loss": 6.775540713310241, "loss": 4.5189, "step": 286500 }, { "epoch": 0.5474090576171875, "grad_norm": 4.853549957275391, "learning_rate": 2.2629642486572265e-05, "lookahead_loss": 6.7449184398651125, "loss": 4.4838, "step": 287000 }, { "epoch": 0.5483627319335938, "grad_norm": 7.923564434051514, "learning_rate": 2.2581958770751955e-05, "lookahead_loss": 6.875919403076172, "loss": 4.5313, "step": 287500 }, { "epoch": 0.54931640625, "grad_norm": 3.2618143558502197, "learning_rate": 2.2534275054931642e-05, "lookahead_loss": 6.760967472076416, "loss": 4.5094, "step": 288000 }, { "epoch": 0.5502700805664062, "grad_norm": 6.317617416381836, "learning_rate": 2.248659133911133e-05, "lookahead_loss": 6.687269252777099, "loss": 4.579, "step": 288500 }, { "epoch": 0.5512237548828125, "grad_norm": 6.988278388977051, "learning_rate": 2.2438907623291016e-05, "lookahead_loss": 6.822534495353699, "loss": 4.5136, "step": 289000 }, { "epoch": 0.5521774291992188, "grad_norm": 7.275880813598633, "learning_rate": 2.2391223907470703e-05, "lookahead_loss": 6.691516678810119, "loss": 4.4996, "step": 289500 }, { "epoch": 0.553131103515625, "grad_norm": 4.041243553161621, "learning_rate": 2.2343540191650393e-05, "lookahead_loss": 6.731153163433075, "loss": 4.4993, "step": 290000 }, { "epoch": 0.553131103515625, "eval_accuracy": 0.027246966731898237, "eval_lookahead_loss": 6.650654366016388, "eval_lookahead_perplexity": 773.2901748230189, "eval_loss": 4.335085868835449, "eval_perplexity": 76.33151310650074, "eval_runtime": 523.0715, "eval_samples_per_second": 19.118, "eval_steps_per_second": 4.779, "step": 290000 }, { "epoch": 0.5540847778320312, "grad_norm": 7.781128883361816, "learning_rate": 2.229585647583008e-05, "lookahead_loss": 6.744361032009125, "loss": 4.5428, "step": 290500 }, { "epoch": 0.5550384521484375, "grad_norm": 4.385952472686768, "learning_rate": 2.2248172760009766e-05, "lookahead_loss": 6.695714050292969, "loss": 4.5155, "step": 291000 }, { "epoch": 0.5559921264648438, "grad_norm": 3.813856363296509, "learning_rate": 2.2200489044189453e-05, "lookahead_loss": 6.758798654556275, "loss": 4.5014, "step": 291500 }, { "epoch": 0.55694580078125, "grad_norm": 6.079179286956787, "learning_rate": 2.215280532836914e-05, "lookahead_loss": 6.610006287097931, "loss": 4.4967, "step": 292000 }, { "epoch": 0.5578994750976562, "grad_norm": 9.989141464233398, "learning_rate": 2.210512161254883e-05, "lookahead_loss": 6.688715100288391, "loss": 4.5114, "step": 292500 }, { "epoch": 0.5588531494140625, "grad_norm": 7.684407711029053, "learning_rate": 2.2057437896728517e-05, "lookahead_loss": 6.740292546749115, "loss": 4.4896, "step": 293000 }, { "epoch": 0.5598068237304688, "grad_norm": 5.720460891723633, "learning_rate": 2.2009754180908204e-05, "lookahead_loss": 6.612023066043854, "loss": 4.4804, "step": 293500 }, { "epoch": 0.560760498046875, "grad_norm": 6.498396396636963, "learning_rate": 2.196207046508789e-05, "lookahead_loss": 6.8224652404785155, "loss": 4.5905, "step": 294000 }, { "epoch": 0.5617141723632812, "grad_norm": 4.9039692878723145, "learning_rate": 2.1914386749267578e-05, "lookahead_loss": 6.65625856590271, "loss": 4.6225, "step": 294500 }, { "epoch": 0.5626678466796875, "grad_norm": 6.133942127227783, "learning_rate": 2.1866703033447268e-05, "lookahead_loss": 6.756600568771362, "loss": 4.6375, "step": 295000 }, { "epoch": 0.5626678466796875, "eval_accuracy": 0.02726692759295499, "eval_lookahead_loss": 6.644798863124848, "eval_lookahead_perplexity": 768.7754029978735, "eval_loss": 4.318997859954834, "eval_perplexity": 75.11331649945502, "eval_runtime": 510.7955, "eval_samples_per_second": 19.577, "eval_steps_per_second": 4.894, "step": 295000 }, { "epoch": 0.5636215209960938, "grad_norm": 7.958284378051758, "learning_rate": 2.1819019317626955e-05, "lookahead_loss": 6.818441821575165, "loss": 4.5695, "step": 295500 }, { "epoch": 0.5645751953125, "grad_norm": 4.738055229187012, "learning_rate": 2.177133560180664e-05, "lookahead_loss": 6.709409970760346, "loss": 4.5164, "step": 296000 }, { "epoch": 0.5655288696289062, "grad_norm": 5.528202533721924, "learning_rate": 2.1723651885986328e-05, "lookahead_loss": 6.733810359954834, "loss": 4.5022, "step": 296500 }, { "epoch": 0.5664825439453125, "grad_norm": 5.297482013702393, "learning_rate": 2.1675968170166015e-05, "lookahead_loss": 6.78105641412735, "loss": 4.496, "step": 297000 }, { "epoch": 0.5674362182617188, "grad_norm": 7.160508632659912, "learning_rate": 2.1628284454345705e-05, "lookahead_loss": 6.750348650932312, "loss": 4.5192, "step": 297500 }, { "epoch": 0.568389892578125, "grad_norm": 4.161978244781494, "learning_rate": 2.1580600738525392e-05, "lookahead_loss": 6.805831137657165, "loss": 4.5478, "step": 298000 }, { "epoch": 0.5693435668945312, "grad_norm": 3.987971305847168, "learning_rate": 2.153291702270508e-05, "lookahead_loss": 6.6814481291770935, "loss": 4.4832, "step": 298500 }, { "epoch": 0.5702972412109375, "grad_norm": 4.921310901641846, "learning_rate": 2.1485233306884766e-05, "lookahead_loss": 6.79853134727478, "loss": 4.5105, "step": 299000 }, { "epoch": 0.5712509155273438, "grad_norm": 4.662833213806152, "learning_rate": 2.1437549591064453e-05, "lookahead_loss": 6.78131288433075, "loss": 4.4904, "step": 299500 }, { "epoch": 0.57220458984375, "grad_norm": 3.9053597450256348, "learning_rate": 2.1389865875244143e-05, "lookahead_loss": 6.785785898208618, "loss": 4.4756, "step": 300000 }, { "epoch": 0.57220458984375, "eval_accuracy": 0.02725577299412916, "eval_lookahead_loss": 6.644458235645295, "eval_lookahead_perplexity": 768.5135815643268, "eval_loss": 4.3120293617248535, "eval_perplexity": 74.59170900759104, "eval_runtime": 508.4906, "eval_samples_per_second": 19.666, "eval_steps_per_second": 4.917, "step": 300000 }, { "epoch": 0.5731582641601562, "grad_norm": 3.6465182304382324, "learning_rate": 2.134218215942383e-05, "lookahead_loss": 6.74456575012207, "loss": 4.5127, "step": 300500 }, { "epoch": 0.5741119384765625, "grad_norm": 18.76051139831543, "learning_rate": 2.1294498443603516e-05, "lookahead_loss": 6.745822964668274, "loss": 4.4651, "step": 301000 }, { "epoch": 0.5750656127929688, "grad_norm": 4.096388339996338, "learning_rate": 2.1246814727783203e-05, "lookahead_loss": 6.698595564842224, "loss": 4.4756, "step": 301500 }, { "epoch": 0.576019287109375, "grad_norm": 5.200983047485352, "learning_rate": 2.119913101196289e-05, "lookahead_loss": 6.747269897937775, "loss": 4.4526, "step": 302000 }, { "epoch": 0.5769729614257812, "grad_norm": 5.346353530883789, "learning_rate": 2.115144729614258e-05, "lookahead_loss": 6.7391540222167965, "loss": 4.4766, "step": 302500 }, { "epoch": 0.5779266357421875, "grad_norm": 5.053552627563477, "learning_rate": 2.1103763580322267e-05, "lookahead_loss": 6.758529759407043, "loss": 4.5088, "step": 303000 }, { "epoch": 0.5788803100585938, "grad_norm": 3.00019907951355, "learning_rate": 2.1056079864501954e-05, "lookahead_loss": 6.678250250816345, "loss": 4.4521, "step": 303500 }, { "epoch": 0.579833984375, "grad_norm": 4.9447526931762695, "learning_rate": 2.100839614868164e-05, "lookahead_loss": 6.75969517660141, "loss": 4.4477, "step": 304000 }, { "epoch": 0.5807876586914062, "grad_norm": 4.026423454284668, "learning_rate": 2.0960712432861328e-05, "lookahead_loss": 6.67162743139267, "loss": 4.4908, "step": 304500 }, { "epoch": 0.5817413330078125, "grad_norm": 4.985555171966553, "learning_rate": 2.0913028717041018e-05, "lookahead_loss": 6.748422413349152, "loss": 4.4725, "step": 305000 }, { "epoch": 0.5817413330078125, "eval_accuracy": 0.026545401174168298, "eval_lookahead_loss": 6.633568870925903, "eval_lookahead_perplexity": 760.1903564458153, "eval_loss": 4.293924331665039, "eval_perplexity": 73.25337571377813, "eval_runtime": 527.3315, "eval_samples_per_second": 18.963, "eval_steps_per_second": 4.741, "step": 305000 }, { "epoch": 0.5826950073242188, "grad_norm": 4.148164749145508, "learning_rate": 2.0865345001220705e-05, "lookahead_loss": 6.765863850593567, "loss": 4.4768, "step": 305500 }, { "epoch": 0.583648681640625, "grad_norm": 7.339029312133789, "learning_rate": 2.081766128540039e-05, "lookahead_loss": 6.6869233646392825, "loss": 4.4503, "step": 306000 }, { "epoch": 0.5846023559570312, "grad_norm": 4.4355292320251465, "learning_rate": 2.0769977569580078e-05, "lookahead_loss": 6.704280662536621, "loss": 4.4565, "step": 306500 }, { "epoch": 0.5855560302734375, "grad_norm": 4.743040084838867, "learning_rate": 2.0722293853759765e-05, "lookahead_loss": 6.724354888439178, "loss": 4.4496, "step": 307000 }, { "epoch": 0.5865097045898438, "grad_norm": 6.024969577789307, "learning_rate": 2.0674610137939455e-05, "lookahead_loss": 6.630684651851654, "loss": 4.4293, "step": 307500 }, { "epoch": 0.58746337890625, "grad_norm": 4.130810260772705, "learning_rate": 2.0626926422119142e-05, "lookahead_loss": 6.716033184051514, "loss": 4.556, "step": 308000 }, { "epoch": 0.5884170532226562, "grad_norm": 5.667199611663818, "learning_rate": 2.057924270629883e-05, "lookahead_loss": 6.743627453804016, "loss": 4.5538, "step": 308500 }, { "epoch": 0.5893707275390625, "grad_norm": 3.6583170890808105, "learning_rate": 2.0531558990478516e-05, "lookahead_loss": 6.721438986778259, "loss": 4.5067, "step": 309000 }, { "epoch": 0.5903244018554688, "grad_norm": 8.450644493103027, "learning_rate": 2.0483875274658203e-05, "lookahead_loss": 6.735768364906311, "loss": 4.4822, "step": 309500 }, { "epoch": 0.591278076171875, "grad_norm": 4.496861457824707, "learning_rate": 2.0436191558837893e-05, "lookahead_loss": 6.77764000415802, "loss": 4.4974, "step": 310000 }, { "epoch": 0.591278076171875, "eval_accuracy": 0.027552641878669275, "eval_lookahead_loss": 6.635923807048798, "eval_lookahead_perplexity": 761.9826657351255, "eval_loss": 4.301084995269775, "eval_perplexity": 73.77980102307822, "eval_runtime": 498.8741, "eval_samples_per_second": 20.045, "eval_steps_per_second": 5.011, "step": 310000 }, { "epoch": 0.5922317504882812, "grad_norm": 8.68830680847168, "learning_rate": 2.038850784301758e-05, "lookahead_loss": 6.667295397281647, "loss": 4.4677, "step": 310500 }, { "epoch": 0.5931854248046875, "grad_norm": 3.770911693572998, "learning_rate": 2.0340824127197266e-05, "lookahead_loss": 6.712395018577576, "loss": 4.4485, "step": 311000 }, { "epoch": 0.5941390991210938, "grad_norm": 7.058656215667725, "learning_rate": 2.0293140411376953e-05, "lookahead_loss": 6.659264601707458, "loss": 4.4485, "step": 311500 }, { "epoch": 0.5950927734375, "grad_norm": 3.824577808380127, "learning_rate": 2.024545669555664e-05, "lookahead_loss": 6.869963115692139, "loss": 4.5116, "step": 312000 }, { "epoch": 0.5960464477539062, "grad_norm": 4.830245018005371, "learning_rate": 2.019777297973633e-05, "lookahead_loss": 6.731892123699188, "loss": 4.491, "step": 312500 }, { "epoch": 0.5970001220703125, "grad_norm": 4.910013675689697, "learning_rate": 2.0150089263916017e-05, "lookahead_loss": 6.696903713226319, "loss": 4.5012, "step": 313000 }, { "epoch": 0.5979537963867188, "grad_norm": 3.4201393127441406, "learning_rate": 2.0102405548095704e-05, "lookahead_loss": 6.730336936473846, "loss": 4.4162, "step": 313500 }, { "epoch": 0.598907470703125, "grad_norm": 4.29105806350708, "learning_rate": 2.005472183227539e-05, "lookahead_loss": 6.690161351203918, "loss": 4.4286, "step": 314000 }, { "epoch": 0.5998611450195312, "grad_norm": 59.915977478027344, "learning_rate": 2.0007038116455078e-05, "lookahead_loss": 6.769863495826721, "loss": 4.4732, "step": 314500 }, { "epoch": 0.6008148193359375, "grad_norm": 38.13645935058594, "learning_rate": 1.9959354400634768e-05, "lookahead_loss": 6.742902619361877, "loss": 4.4651, "step": 315000 }, { "epoch": 0.6008148193359375, "eval_accuracy": 0.026280821917808218, "eval_lookahead_loss": 6.624830114364624, "eval_lookahead_perplexity": 753.5761799112871, "eval_loss": 4.276357173919678, "eval_perplexity": 71.9777593888016, "eval_runtime": 499.9278, "eval_samples_per_second": 20.003, "eval_steps_per_second": 5.001, "step": 315000 }, { "epoch": 0.6017684936523438, "grad_norm": 3.725734233856201, "learning_rate": 1.9911670684814455e-05, "lookahead_loss": 6.710557502746582, "loss": 4.4646, "step": 315500 }, { "epoch": 0.60272216796875, "grad_norm": 71.05250549316406, "learning_rate": 1.986398696899414e-05, "lookahead_loss": 6.677198050498962, "loss": 4.4593, "step": 316000 }, { "epoch": 0.6036758422851562, "grad_norm": 6.435787200927734, "learning_rate": 1.9816303253173828e-05, "lookahead_loss": 6.728137380599976, "loss": 4.4457, "step": 316500 }, { "epoch": 0.6046295166015625, "grad_norm": 4.1589274406433105, "learning_rate": 1.9768619537353515e-05, "lookahead_loss": 6.731306467056275, "loss": 4.4696, "step": 317000 }, { "epoch": 0.6055831909179688, "grad_norm": 36.343536376953125, "learning_rate": 1.9720935821533205e-05, "lookahead_loss": 6.664629363059998, "loss": 4.4777, "step": 317500 }, { "epoch": 0.606536865234375, "grad_norm": 8.774985313415527, "learning_rate": 1.9673252105712892e-05, "lookahead_loss": 6.733167035102844, "loss": 4.4446, "step": 318000 }, { "epoch": 0.6074905395507812, "grad_norm": 5.3302130699157715, "learning_rate": 1.962556838989258e-05, "lookahead_loss": 6.742209082126617, "loss": 4.4548, "step": 318500 }, { "epoch": 0.6084442138671875, "grad_norm": 3.7066664695739746, "learning_rate": 1.9577884674072266e-05, "lookahead_loss": 6.672843801021576, "loss": 4.4631, "step": 319000 }, { "epoch": 0.6093978881835938, "grad_norm": 6.667570114135742, "learning_rate": 1.9530200958251953e-05, "lookahead_loss": 6.763614084243774, "loss": 4.4477, "step": 319500 }, { "epoch": 0.6103515625, "grad_norm": 7.620219707489014, "learning_rate": 1.9482517242431643e-05, "lookahead_loss": 6.654103709220887, "loss": 4.4339, "step": 320000 }, { "epoch": 0.6103515625, "eval_accuracy": 0.027004109589041096, "eval_lookahead_loss": 6.618405271911621, "eval_lookahead_perplexity": 748.7500917083748, "eval_loss": 4.26544189453125, "eval_perplexity": 71.19637431158624, "eval_runtime": 500.2419, "eval_samples_per_second": 19.99, "eval_steps_per_second": 4.998, "step": 320000 }, { "epoch": 0.6113052368164062, "grad_norm": 4.199902057647705, "learning_rate": 1.943483352661133e-05, "lookahead_loss": 6.673474744319916, "loss": 4.4231, "step": 320500 }, { "epoch": 0.6122589111328125, "grad_norm": 4.8833417892456055, "learning_rate": 1.9387149810791016e-05, "lookahead_loss": 6.691699499130249, "loss": 4.4641, "step": 321000 }, { "epoch": 0.6132125854492188, "grad_norm": 4.0071635246276855, "learning_rate": 1.9339466094970703e-05, "lookahead_loss": 6.780665341377258, "loss": 4.5396, "step": 321500 }, { "epoch": 0.614166259765625, "grad_norm": 5.448105335235596, "learning_rate": 1.929178237915039e-05, "lookahead_loss": 6.719988704681397, "loss": 4.5687, "step": 322000 }, { "epoch": 0.6151199340820312, "grad_norm": 4.13397741317749, "learning_rate": 1.924409866333008e-05, "lookahead_loss": 6.709903753757477, "loss": 4.5204, "step": 322500 }, { "epoch": 0.6160736083984375, "grad_norm": 5.216847896575928, "learning_rate": 1.9196414947509767e-05, "lookahead_loss": 6.752863204956054, "loss": 4.5073, "step": 323000 }, { "epoch": 0.6170272827148438, "grad_norm": 5.6081624031066895, "learning_rate": 1.9148731231689454e-05, "lookahead_loss": 6.669276219367981, "loss": 4.4398, "step": 323500 }, { "epoch": 0.61798095703125, "grad_norm": 9.173544883728027, "learning_rate": 1.910104751586914e-05, "lookahead_loss": 6.7369874835014345, "loss": 4.4545, "step": 324000 }, { "epoch": 0.6189346313476562, "grad_norm": 5.9940056800842285, "learning_rate": 1.9053363800048828e-05, "lookahead_loss": 6.666944143772125, "loss": 4.4685, "step": 324500 }, { "epoch": 0.6198883056640625, "grad_norm": 3.899653673171997, "learning_rate": 1.9005680084228518e-05, "lookahead_loss": 6.705458169460297, "loss": 4.456, "step": 325000 }, { "epoch": 0.6198883056640625, "eval_accuracy": 0.02666555772994129, "eval_lookahead_loss": 6.618089934062958, "eval_lookahead_perplexity": 748.5140196884466, "eval_loss": 4.255804061889648, "eval_perplexity": 70.51349161181827, "eval_runtime": 504.4921, "eval_samples_per_second": 19.822, "eval_steps_per_second": 4.955, "step": 325000 }, { "epoch": 0.6208419799804688, "grad_norm": 4.745683670043945, "learning_rate": 1.8957996368408205e-05, "lookahead_loss": 6.768761556625366, "loss": 4.4799, "step": 325500 }, { "epoch": 0.621795654296875, "grad_norm": 3.7533304691314697, "learning_rate": 1.891031265258789e-05, "lookahead_loss": 6.672192378044128, "loss": 4.4239, "step": 326000 }, { "epoch": 0.6227493286132812, "grad_norm": 3.6618618965148926, "learning_rate": 1.8862628936767578e-05, "lookahead_loss": 6.761971126079559, "loss": 4.4462, "step": 326500 }, { "epoch": 0.6237030029296875, "grad_norm": 3.799405097961426, "learning_rate": 1.8814945220947265e-05, "lookahead_loss": 6.684108627796173, "loss": 4.385, "step": 327000 }, { "epoch": 0.6246566772460938, "grad_norm": 4.166128158569336, "learning_rate": 1.8767261505126955e-05, "lookahead_loss": 6.800023325920105, "loss": 4.4262, "step": 327500 }, { "epoch": 0.6256103515625, "grad_norm": 4.966941833496094, "learning_rate": 1.8719577789306642e-05, "lookahead_loss": 6.7040563592910765, "loss": 4.3992, "step": 328000 }, { "epoch": 0.6265640258789062, "grad_norm": 3.750234365463257, "learning_rate": 1.867189407348633e-05, "lookahead_loss": 6.732434203147888, "loss": 4.4586, "step": 328500 }, { "epoch": 0.6275177001953125, "grad_norm": 6.0385026931762695, "learning_rate": 1.8624210357666016e-05, "lookahead_loss": 6.59243482542038, "loss": 4.4125, "step": 329000 }, { "epoch": 0.6284713745117188, "grad_norm": 3.62520170211792, "learning_rate": 1.8576526641845703e-05, "lookahead_loss": 6.668375226974487, "loss": 4.4401, "step": 329500 }, { "epoch": 0.629425048828125, "grad_norm": 6.538697719573975, "learning_rate": 1.8528842926025393e-05, "lookahead_loss": 6.7852325806617735, "loss": 4.4337, "step": 330000 }, { "epoch": 0.629425048828125, "eval_accuracy": 0.026335420743639922, "eval_lookahead_loss": 6.609780311775207, "eval_lookahead_perplexity": 742.319921856188, "eval_loss": 4.247890949249268, "eval_perplexity": 69.9577122823621, "eval_runtime": 502.9697, "eval_samples_per_second": 19.882, "eval_steps_per_second": 4.97, "step": 330000 }, { "epoch": 0.6303787231445312, "grad_norm": 3.790666103363037, "learning_rate": 1.848115921020508e-05, "lookahead_loss": 6.740117348670959, "loss": 4.3919, "step": 330500 }, { "epoch": 0.6313323974609375, "grad_norm": 8.604785919189453, "learning_rate": 1.8433475494384766e-05, "lookahead_loss": 6.699921989440918, "loss": 4.4456, "step": 331000 }, { "epoch": 0.6322860717773438, "grad_norm": 3.542102098464966, "learning_rate": 1.8385791778564453e-05, "lookahead_loss": 6.72690689277649, "loss": 4.3837, "step": 331500 }, { "epoch": 0.63323974609375, "grad_norm": 3.785600423812866, "learning_rate": 1.833810806274414e-05, "lookahead_loss": 6.717962538719177, "loss": 4.4464, "step": 332000 }, { "epoch": 0.6341934204101562, "grad_norm": 5.2448906898498535, "learning_rate": 1.829042434692383e-05, "lookahead_loss": 6.6456244473457335, "loss": 4.4229, "step": 332500 }, { "epoch": 0.6351470947265625, "grad_norm": 4.809235572814941, "learning_rate": 1.8242740631103517e-05, "lookahead_loss": 6.646697951316834, "loss": 4.3894, "step": 333000 }, { "epoch": 0.6361007690429688, "grad_norm": 3.483379364013672, "learning_rate": 1.8195056915283204e-05, "lookahead_loss": 6.7344948945045475, "loss": 4.4183, "step": 333500 }, { "epoch": 0.637054443359375, "grad_norm": 3.302672863006592, "learning_rate": 1.814737319946289e-05, "lookahead_loss": 6.688945171356202, "loss": 4.4072, "step": 334000 }, { "epoch": 0.6380081176757812, "grad_norm": 3.551886558532715, "learning_rate": 1.8099689483642578e-05, "lookahead_loss": 6.687325328826904, "loss": 4.4265, "step": 334500 }, { "epoch": 0.6389617919921875, "grad_norm": 4.9312825202941895, "learning_rate": 1.8052005767822268e-05, "lookahead_loss": 6.708302593231201, "loss": 4.4737, "step": 335000 }, { "epoch": 0.6389617919921875, "eval_accuracy": 0.026374755381604695, "eval_lookahead_loss": 6.607738901901245, "eval_lookahead_perplexity": 740.8060883411376, "eval_loss": 4.234662055969238, "eval_perplexity": 69.03834369462643, "eval_runtime": 500.6484, "eval_samples_per_second": 19.974, "eval_steps_per_second": 4.994, "step": 335000 }, { "epoch": 0.6399154663085938, "grad_norm": 4.597929954528809, "learning_rate": 1.8004322052001955e-05, "lookahead_loss": 6.702645635604858, "loss": 4.5206, "step": 335500 }, { "epoch": 0.640869140625, "grad_norm": 4.154162406921387, "learning_rate": 1.795663833618164e-05, "lookahead_loss": 6.818061703681946, "loss": 4.4647, "step": 336000 }, { "epoch": 0.6418228149414062, "grad_norm": 4.32538366317749, "learning_rate": 1.7908954620361328e-05, "lookahead_loss": 6.757953865528107, "loss": 4.443, "step": 336500 }, { "epoch": 0.6427764892578125, "grad_norm": 3.7801594734191895, "learning_rate": 1.7861270904541015e-05, "lookahead_loss": 6.648034989833832, "loss": 4.3799, "step": 337000 }, { "epoch": 0.6437301635742188, "grad_norm": 3.996729850769043, "learning_rate": 1.7813587188720705e-05, "lookahead_loss": 6.666618365287781, "loss": 4.4278, "step": 337500 }, { "epoch": 0.644683837890625, "grad_norm": 5.2831220626831055, "learning_rate": 1.7765903472900392e-05, "lookahead_loss": 6.689608473777771, "loss": 4.3983, "step": 338000 }, { "epoch": 0.6456375122070312, "grad_norm": 5.825692176818848, "learning_rate": 1.771821975708008e-05, "lookahead_loss": 6.73386011838913, "loss": 4.4124, "step": 338500 }, { "epoch": 0.6465911865234375, "grad_norm": 3.993104934692383, "learning_rate": 1.7670536041259766e-05, "lookahead_loss": 6.678568332195282, "loss": 4.5016, "step": 339000 }, { "epoch": 0.6475448608398438, "grad_norm": 8.556177139282227, "learning_rate": 1.7622852325439453e-05, "lookahead_loss": 6.670063781738281, "loss": 4.4177, "step": 339500 }, { "epoch": 0.64849853515625, "grad_norm": 4.3883185386657715, "learning_rate": 1.7575168609619143e-05, "lookahead_loss": 6.704745220184326, "loss": 4.4369, "step": 340000 }, { "epoch": 0.64849853515625, "eval_accuracy": 0.027093346379647748, "eval_lookahead_loss": 6.611294908046722, "eval_lookahead_perplexity": 743.4450887137718, "eval_loss": 4.230683326721191, "eval_perplexity": 68.76420454170646, "eval_runtime": 499.9516, "eval_samples_per_second": 20.002, "eval_steps_per_second": 5.0, "step": 340000 }, { "epoch": 0.6494522094726562, "grad_norm": 3.3336105346679688, "learning_rate": 1.752748489379883e-05, "lookahead_loss": 6.710097378253937, "loss": 4.3829, "step": 340500 }, { "epoch": 0.6504058837890625, "grad_norm": 5.135087490081787, "learning_rate": 1.7479801177978516e-05, "lookahead_loss": 6.581288105964661, "loss": 4.372, "step": 341000 }, { "epoch": 0.6513595581054688, "grad_norm": 4.113499641418457, "learning_rate": 1.7432117462158203e-05, "lookahead_loss": 6.627819978237152, "loss": 4.435, "step": 341500 }, { "epoch": 0.652313232421875, "grad_norm": 5.181084156036377, "learning_rate": 1.738443374633789e-05, "lookahead_loss": 6.731591155529022, "loss": 4.4081, "step": 342000 }, { "epoch": 0.6532669067382812, "grad_norm": 3.794375419616699, "learning_rate": 1.733675003051758e-05, "lookahead_loss": 6.627827451705933, "loss": 4.4187, "step": 342500 }, { "epoch": 0.6542205810546875, "grad_norm": 2.9668197631835938, "learning_rate": 1.7289066314697267e-05, "lookahead_loss": 6.621901811122894, "loss": 4.4203, "step": 343000 }, { "epoch": 0.6551742553710938, "grad_norm": 7.81248140335083, "learning_rate": 1.7241382598876954e-05, "lookahead_loss": 6.673139771461487, "loss": 4.4047, "step": 343500 }, { "epoch": 0.6561279296875, "grad_norm": 2.9868528842926025, "learning_rate": 1.719369888305664e-05, "lookahead_loss": 6.685246260643005, "loss": 4.4204, "step": 344000 }, { "epoch": 0.6570816040039062, "grad_norm": 2.974440336227417, "learning_rate": 1.7146015167236328e-05, "lookahead_loss": 6.627199524402618, "loss": 4.3511, "step": 344500 }, { "epoch": 0.6580352783203125, "grad_norm": 4.069870471954346, "learning_rate": 1.7098331451416018e-05, "lookahead_loss": 6.710576326847076, "loss": 4.4086, "step": 345000 }, { "epoch": 0.6580352783203125, "eval_accuracy": 0.026534246575342466, "eval_lookahead_loss": 6.599379666423798, "eval_lookahead_perplexity": 734.6393264225064, "eval_loss": 4.221333026885986, "eval_perplexity": 68.12423522640586, "eval_runtime": 508.9992, "eval_samples_per_second": 19.646, "eval_steps_per_second": 4.912, "step": 345000 }, { "epoch": 0.6589889526367188, "grad_norm": 3.6605544090270996, "learning_rate": 1.7050647735595705e-05, "lookahead_loss": 6.7103914537429805, "loss": 4.3938, "step": 345500 }, { "epoch": 0.659942626953125, "grad_norm": 4.5186052322387695, "learning_rate": 1.700296401977539e-05, "lookahead_loss": 6.731236486911774, "loss": 4.4062, "step": 346000 }, { "epoch": 0.6608963012695312, "grad_norm": 4.3831257820129395, "learning_rate": 1.6955280303955078e-05, "lookahead_loss": 6.716773512363434, "loss": 4.4084, "step": 346500 }, { "epoch": 0.6618499755859375, "grad_norm": 2.7324509620666504, "learning_rate": 1.6907596588134765e-05, "lookahead_loss": 6.699392409324646, "loss": 4.3772, "step": 347000 }, { "epoch": 0.6628036499023438, "grad_norm": 5.073256015777588, "learning_rate": 1.6859912872314455e-05, "lookahead_loss": 6.681104771137237, "loss": 4.3916, "step": 347500 }, { "epoch": 0.66375732421875, "grad_norm": 2.4443671703338623, "learning_rate": 1.6812229156494142e-05, "lookahead_loss": 6.606231934070587, "loss": 4.3799, "step": 348000 }, { "epoch": 0.6647109985351562, "grad_norm": 2.773428201675415, "learning_rate": 1.676454544067383e-05, "lookahead_loss": 6.838286927223206, "loss": 4.5062, "step": 348500 }, { "epoch": 0.6656646728515625, "grad_norm": 3.659235954284668, "learning_rate": 1.6716861724853516e-05, "lookahead_loss": 6.803696868419647, "loss": 4.4511, "step": 349000 }, { "epoch": 0.6666183471679688, "grad_norm": 5.034940242767334, "learning_rate": 1.6669178009033203e-05, "lookahead_loss": 6.782551177501679, "loss": 4.4366, "step": 349500 }, { "epoch": 0.667572021484375, "grad_norm": 5.6991658210754395, "learning_rate": 1.6621494293212893e-05, "lookahead_loss": 6.605106722831726, "loss": 4.3934, "step": 350000 }, { "epoch": 0.667572021484375, "eval_accuracy": 0.026918786692759294, "eval_lookahead_loss": 6.59083257226944, "eval_lookahead_perplexity": 728.3870523796809, "eval_loss": 4.2088847160339355, "eval_perplexity": 67.28146001668406, "eval_runtime": 498.4366, "eval_samples_per_second": 20.063, "eval_steps_per_second": 5.016, "step": 350000 }, { "epoch": 0.6685256958007812, "grad_norm": 3.159066915512085, "learning_rate": 1.657381057739258e-05, "lookahead_loss": 6.713028662681579, "loss": 4.4444, "step": 350500 }, { "epoch": 0.6694793701171875, "grad_norm": 3.6661810874938965, "learning_rate": 1.6526126861572266e-05, "lookahead_loss": 6.6944408664703365, "loss": 4.3875, "step": 351000 }, { "epoch": 0.6704330444335938, "grad_norm": 3.913452386856079, "learning_rate": 1.6478443145751953e-05, "lookahead_loss": 6.670931756019592, "loss": 4.402, "step": 351500 }, { "epoch": 0.67138671875, "grad_norm": 3.7112531661987305, "learning_rate": 1.643075942993164e-05, "lookahead_loss": 6.672056921958923, "loss": 4.409, "step": 352000 }, { "epoch": 0.6723403930664062, "grad_norm": 3.797732353210449, "learning_rate": 1.638307571411133e-05, "lookahead_loss": 6.729757042884827, "loss": 4.411, "step": 352500 }, { "epoch": 0.6732940673828125, "grad_norm": 2.853748321533203, "learning_rate": 1.6335391998291017e-05, "lookahead_loss": 6.699663223743439, "loss": 4.3897, "step": 353000 }, { "epoch": 0.6742477416992188, "grad_norm": 4.977001667022705, "learning_rate": 1.6287708282470704e-05, "lookahead_loss": 6.738108113765716, "loss": 4.3838, "step": 353500 }, { "epoch": 0.675201416015625, "grad_norm": 5.227272987365723, "learning_rate": 1.624002456665039e-05, "lookahead_loss": 6.616458326339722, "loss": 4.3441, "step": 354000 }, { "epoch": 0.6761550903320312, "grad_norm": 3.8353447914123535, "learning_rate": 1.6192340850830078e-05, "lookahead_loss": 6.697678503513337, "loss": 4.4052, "step": 354500 }, { "epoch": 0.6771087646484375, "grad_norm": 2.472179651260376, "learning_rate": 1.6144657135009768e-05, "lookahead_loss": 6.711700092792511, "loss": 4.361, "step": 355000 }, { "epoch": 0.6771087646484375, "eval_accuracy": 0.026397260273972604, "eval_lookahead_loss": 6.585781492137909, "eval_lookahead_perplexity": 724.7171872054365, "eval_loss": 4.200438976287842, "eval_perplexity": 66.7156111851496, "eval_runtime": 497.3644, "eval_samples_per_second": 20.106, "eval_steps_per_second": 5.026, "step": 355000 }, { "epoch": 0.6780624389648438, "grad_norm": 3.0541954040527344, "learning_rate": 1.6096973419189455e-05, "lookahead_loss": 6.69687911272049, "loss": 4.3483, "step": 355500 }, { "epoch": 0.67901611328125, "grad_norm": 6.426888465881348, "learning_rate": 1.604928970336914e-05, "lookahead_loss": 6.61621085691452, "loss": 4.3385, "step": 356000 }, { "epoch": 0.6799697875976562, "grad_norm": 3.1123814582824707, "learning_rate": 1.6001605987548828e-05, "lookahead_loss": 6.712069251537323, "loss": 4.3595, "step": 356500 }, { "epoch": 0.6809234619140625, "grad_norm": 3.876549243927002, "learning_rate": 1.5953922271728515e-05, "lookahead_loss": 6.656116401672363, "loss": 4.3449, "step": 357000 }, { "epoch": 0.6818771362304688, "grad_norm": 3.040940046310425, "learning_rate": 1.5906238555908205e-05, "lookahead_loss": 6.674755868911743, "loss": 4.3633, "step": 357500 }, { "epoch": 0.682830810546875, "grad_norm": 5.0258097648620605, "learning_rate": 1.5858554840087892e-05, "lookahead_loss": 6.547597886562348, "loss": 4.3308, "step": 358000 }, { "epoch": 0.6837844848632812, "grad_norm": 3.3490772247314453, "learning_rate": 1.581087112426758e-05, "lookahead_loss": 6.680437192440033, "loss": 4.3356, "step": 358500 }, { "epoch": 0.6847381591796875, "grad_norm": 7.182896137237549, "learning_rate": 1.5763187408447266e-05, "lookahead_loss": 6.620883371353149, "loss": 4.3528, "step": 359000 }, { "epoch": 0.6856918334960938, "grad_norm": 3.2694413661956787, "learning_rate": 1.5715503692626953e-05, "lookahead_loss": 6.693105098724366, "loss": 4.3197, "step": 359500 }, { "epoch": 0.6866455078125, "grad_norm": 3.234042167663574, "learning_rate": 1.5667819976806643e-05, "lookahead_loss": 6.702560704231262, "loss": 4.3219, "step": 360000 }, { "epoch": 0.6866455078125, "eval_accuracy": 0.02609784735812133, "eval_lookahead_loss": 6.58364914188385, "eval_lookahead_perplexity": 723.1734827713739, "eval_loss": 4.189453125, "eval_perplexity": 65.9866946175968, "eval_runtime": 498.3851, "eval_samples_per_second": 20.065, "eval_steps_per_second": 5.016, "step": 360000 }, { "epoch": 0.6875991821289062, "grad_norm": 3.4763636589050293, "learning_rate": 1.562013626098633e-05, "lookahead_loss": 6.752572128295898, "loss": 4.314, "step": 360500 }, { "epoch": 0.6885528564453125, "grad_norm": 4.534358978271484, "learning_rate": 1.5572452545166016e-05, "lookahead_loss": 6.6798251738548275, "loss": 4.332, "step": 361000 }, { "epoch": 0.6895065307617188, "grad_norm": 6.692281723022461, "learning_rate": 1.5524768829345703e-05, "lookahead_loss": 6.717973682880402, "loss": 4.3424, "step": 361500 }, { "epoch": 0.690460205078125, "grad_norm": 3.632629632949829, "learning_rate": 1.547708511352539e-05, "lookahead_loss": 6.65506831073761, "loss": 4.3551, "step": 362000 }, { "epoch": 0.6914138793945312, "grad_norm": 2.6637303829193115, "learning_rate": 1.542940139770508e-05, "lookahead_loss": 6.667487190723419, "loss": 4.3589, "step": 362500 }, { "epoch": 0.6923675537109375, "grad_norm": 2.8579957485198975, "learning_rate": 1.5381717681884767e-05, "lookahead_loss": 6.638519877433777, "loss": 4.3446, "step": 363000 }, { "epoch": 0.6933212280273438, "grad_norm": 5.04373025894165, "learning_rate": 1.5334033966064454e-05, "lookahead_loss": 6.574143090724945, "loss": 4.3272, "step": 363500 }, { "epoch": 0.69427490234375, "grad_norm": 4.437886714935303, "learning_rate": 1.528635025024414e-05, "lookahead_loss": 6.660460298538208, "loss": 4.348, "step": 364000 }, { "epoch": 0.6952285766601562, "grad_norm": 3.6791868209838867, "learning_rate": 1.523866653442383e-05, "lookahead_loss": 6.550147813796997, "loss": 4.3254, "step": 364500 }, { "epoch": 0.6961822509765625, "grad_norm": 3.4471399784088135, "learning_rate": 1.5190982818603516e-05, "lookahead_loss": 6.709778325080872, "loss": 4.3989, "step": 365000 }, { "epoch": 0.6961822509765625, "eval_accuracy": 0.026415655577299413, "eval_lookahead_loss": 6.590795400524139, "eval_lookahead_perplexity": 728.3599774649036, "eval_loss": 4.1812310218811035, "eval_perplexity": 65.44636955811609, "eval_runtime": 497.4508, "eval_samples_per_second": 20.102, "eval_steps_per_second": 5.026, "step": 365000 }, { "epoch": 0.6971359252929688, "grad_norm": 3.0071182250976562, "learning_rate": 1.5143299102783205e-05, "lookahead_loss": 6.705849833965302, "loss": 4.4612, "step": 365500 }, { "epoch": 0.698089599609375, "grad_norm": 3.617403507232666, "learning_rate": 1.5095615386962891e-05, "lookahead_loss": 6.610382905006409, "loss": 4.424, "step": 366000 }, { "epoch": 0.6990432739257812, "grad_norm": 4.012004375457764, "learning_rate": 1.5047931671142578e-05, "lookahead_loss": 6.839118438243866, "loss": 4.4722, "step": 366500 }, { "epoch": 0.6999969482421875, "grad_norm": 3.511273145675659, "learning_rate": 1.5000247955322267e-05, "lookahead_loss": 6.863223690986633, "loss": 4.441, "step": 367000 }, { "epoch": 0.7009506225585938, "grad_norm": 3.0213024616241455, "learning_rate": 1.4952564239501954e-05, "lookahead_loss": 6.642057280540467, "loss": 4.3795, "step": 367500 }, { "epoch": 0.701904296875, "grad_norm": 33.06582260131836, "learning_rate": 1.4904880523681642e-05, "lookahead_loss": 6.737332377910614, "loss": 4.3787, "step": 368000 }, { "epoch": 0.7028579711914062, "grad_norm": 9.458013534545898, "learning_rate": 1.4857196807861329e-05, "lookahead_loss": 6.628967182159424, "loss": 4.3616, "step": 368500 }, { "epoch": 0.7038116455078125, "grad_norm": 2.2891244888305664, "learning_rate": 1.4809513092041016e-05, "lookahead_loss": 6.6429635591506955, "loss": 4.3509, "step": 369000 }, { "epoch": 0.7047653198242188, "grad_norm": 4.122226238250732, "learning_rate": 1.4761829376220704e-05, "lookahead_loss": 6.660888340473175, "loss": 4.3298, "step": 369500 }, { "epoch": 0.705718994140625, "grad_norm": 3.343393564224243, "learning_rate": 1.4714145660400391e-05, "lookahead_loss": 6.710143873214721, "loss": 4.3539, "step": 370000 }, { "epoch": 0.705718994140625, "eval_accuracy": 0.02616379647749511, "eval_lookahead_loss": 6.57881973619461, "eval_lookahead_perplexity": 719.689404424905, "eval_loss": 4.175107002258301, "eval_perplexity": 65.04679944308978, "eval_runtime": 497.1409, "eval_samples_per_second": 20.115, "eval_steps_per_second": 5.029, "step": 370000 }, { "epoch": 0.7066726684570312, "grad_norm": 3.0434231758117676, "learning_rate": 1.466646194458008e-05, "lookahead_loss": 6.706907410621643, "loss": 4.3215, "step": 370500 }, { "epoch": 0.7076263427734375, "grad_norm": 7.531590461730957, "learning_rate": 1.4618778228759766e-05, "lookahead_loss": 6.748720278263092, "loss": 4.3742, "step": 371000 }, { "epoch": 0.7085800170898438, "grad_norm": 9.674352645874023, "learning_rate": 1.4571094512939453e-05, "lookahead_loss": 6.629317391872406, "loss": 4.3348, "step": 371500 }, { "epoch": 0.70953369140625, "grad_norm": 3.699843168258667, "learning_rate": 1.4523410797119142e-05, "lookahead_loss": 6.70166833114624, "loss": 4.3935, "step": 372000 }, { "epoch": 0.7104873657226562, "grad_norm": 3.5233702659606934, "learning_rate": 1.4475727081298829e-05, "lookahead_loss": 6.746693240642547, "loss": 4.3458, "step": 372500 }, { "epoch": 0.7114410400390625, "grad_norm": 3.045717716217041, "learning_rate": 1.4428043365478517e-05, "lookahead_loss": 6.684703667640686, "loss": 4.3366, "step": 373000 }, { "epoch": 0.7123947143554688, "grad_norm": 3.8223719596862793, "learning_rate": 1.4380359649658204e-05, "lookahead_loss": 6.722576892375946, "loss": 4.3654, "step": 373500 }, { "epoch": 0.713348388671875, "grad_norm": 4.2622199058532715, "learning_rate": 1.433267593383789e-05, "lookahead_loss": 6.675731827259064, "loss": 4.333, "step": 374000 }, { "epoch": 0.7143020629882812, "grad_norm": 4.83391809463501, "learning_rate": 1.428499221801758e-05, "lookahead_loss": 6.670724513053894, "loss": 4.3717, "step": 374500 }, { "epoch": 0.7152557373046875, "grad_norm": 5.849557399749756, "learning_rate": 1.4237308502197266e-05, "lookahead_loss": 6.620873598575592, "loss": 4.3217, "step": 375000 }, { "epoch": 0.7152557373046875, "eval_accuracy": 0.026307436399217222, "eval_lookahead_loss": 6.577335855007171, "eval_lookahead_perplexity": 718.6222628083761, "eval_loss": 4.167860984802246, "eval_perplexity": 64.57717271539977, "eval_runtime": 501.4187, "eval_samples_per_second": 19.943, "eval_steps_per_second": 4.986, "step": 375000 }, { "epoch": 0.7162094116210938, "grad_norm": 3.256359577178955, "learning_rate": 1.4189624786376955e-05, "lookahead_loss": 6.6249938111305235, "loss": 4.3196, "step": 375500 }, { "epoch": 0.7171630859375, "grad_norm": 5.568325996398926, "learning_rate": 1.4141941070556641e-05, "lookahead_loss": 6.663873047351837, "loss": 4.3293, "step": 376000 }, { "epoch": 0.7181167602539062, "grad_norm": 2.890977144241333, "learning_rate": 1.4094257354736328e-05, "lookahead_loss": 6.691572694301605, "loss": 4.3338, "step": 376500 }, { "epoch": 0.7190704345703125, "grad_norm": 2.485827922821045, "learning_rate": 1.4046573638916017e-05, "lookahead_loss": 6.601945824623108, "loss": 4.3552, "step": 377000 }, { "epoch": 0.7200241088867188, "grad_norm": 5.2378315925598145, "learning_rate": 1.3998889923095704e-05, "lookahead_loss": 6.699498305797577, "loss": 4.3479, "step": 377500 }, { "epoch": 0.720977783203125, "grad_norm": 2.7909178733825684, "learning_rate": 1.3951206207275392e-05, "lookahead_loss": 6.626140068054199, "loss": 4.3397, "step": 378000 }, { "epoch": 0.7219314575195312, "grad_norm": 4.9583258628845215, "learning_rate": 1.3903522491455079e-05, "lookahead_loss": 6.629690541267395, "loss": 4.3077, "step": 378500 }, { "epoch": 0.7228851318359375, "grad_norm": 2.4516940116882324, "learning_rate": 1.3855838775634766e-05, "lookahead_loss": 6.721600716590881, "loss": 4.2871, "step": 379000 }, { "epoch": 0.7238388061523438, "grad_norm": 3.321565628051758, "learning_rate": 1.3808155059814454e-05, "lookahead_loss": 6.69266849899292, "loss": 4.3207, "step": 379500 }, { "epoch": 0.72479248046875, "grad_norm": 3.0966012477874756, "learning_rate": 1.3760471343994141e-05, "lookahead_loss": 6.69942758512497, "loss": 4.3246, "step": 380000 }, { "epoch": 0.72479248046875, "eval_accuracy": 0.026269471624266145, "eval_lookahead_loss": 6.566206645679474, "eval_lookahead_perplexity": 710.6689046034771, "eval_loss": 4.157970905303955, "eval_perplexity": 63.941647222541796, "eval_runtime": 498.2129, "eval_samples_per_second": 20.072, "eval_steps_per_second": 5.018, "step": 380000 }, { "epoch": 0.7257461547851562, "grad_norm": 2.734866142272949, "learning_rate": 1.371278762817383e-05, "lookahead_loss": 6.639896987915039, "loss": 4.3616, "step": 380500 }, { "epoch": 0.7266998291015625, "grad_norm": 3.517700672149658, "learning_rate": 1.3665103912353516e-05, "lookahead_loss": 6.724002861976624, "loss": 4.314, "step": 381000 }, { "epoch": 0.7276535034179688, "grad_norm": 3.2689876556396484, "learning_rate": 1.3617420196533203e-05, "lookahead_loss": 6.687919172286987, "loss": 4.3509, "step": 381500 }, { "epoch": 0.728607177734375, "grad_norm": 3.0330846309661865, "learning_rate": 1.3569736480712892e-05, "lookahead_loss": 6.607539977073669, "loss": 4.3121, "step": 382000 }, { "epoch": 0.7295608520507812, "grad_norm": 2.222898006439209, "learning_rate": 1.3522052764892579e-05, "lookahead_loss": 6.655561319351197, "loss": 4.3004, "step": 382500 }, { "epoch": 0.7305145263671875, "grad_norm": 15.09243392944336, "learning_rate": 1.3474369049072265e-05, "lookahead_loss": 6.649073299884797, "loss": 4.3366, "step": 383000 }, { "epoch": 0.7314682006835938, "grad_norm": 3.0642571449279785, "learning_rate": 1.3426685333251954e-05, "lookahead_loss": 6.632102132320404, "loss": 4.3221, "step": 383500 }, { "epoch": 0.732421875, "grad_norm": 2.829737901687622, "learning_rate": 1.337900161743164e-05, "lookahead_loss": 6.681195634841919, "loss": 4.3214, "step": 384000 }, { "epoch": 0.7333755493164062, "grad_norm": 2.6743147373199463, "learning_rate": 1.333131790161133e-05, "lookahead_loss": 6.624256817817688, "loss": 4.311, "step": 384500 }, { "epoch": 0.7343292236328125, "grad_norm": 2.8244335651397705, "learning_rate": 1.3283634185791016e-05, "lookahead_loss": 6.636578221797943, "loss": 4.3571, "step": 385000 }, { "epoch": 0.7343292236328125, "eval_accuracy": 0.026513111545988257, "eval_lookahead_loss": 6.571847222995758, "eval_lookahead_perplexity": 714.6888141533301, "eval_loss": 4.150728702545166, "eval_perplexity": 63.480241661644065, "eval_runtime": 499.9997, "eval_samples_per_second": 20.0, "eval_steps_per_second": 5.0, "step": 385000 }, { "epoch": 0.7352828979492188, "grad_norm": 3.2999961376190186, "learning_rate": 1.3235950469970703e-05, "lookahead_loss": 6.722831572055816, "loss": 4.4139, "step": 385500 }, { "epoch": 0.736236572265625, "grad_norm": 2.806603193283081, "learning_rate": 1.3188266754150391e-05, "lookahead_loss": 6.745130841255188, "loss": 4.3955, "step": 386000 }, { "epoch": 0.7371902465820312, "grad_norm": 2.9243004322052, "learning_rate": 1.3140583038330078e-05, "lookahead_loss": 6.68635154914856, "loss": 4.3833, "step": 386500 }, { "epoch": 0.7381439208984375, "grad_norm": 2.4423747062683105, "learning_rate": 1.3092899322509767e-05, "lookahead_loss": 6.737236121654511, "loss": 4.3337, "step": 387000 }, { "epoch": 0.7390975952148438, "grad_norm": 5.724939823150635, "learning_rate": 1.3045215606689454e-05, "lookahead_loss": 6.667905026435852, "loss": 4.2755, "step": 387500 }, { "epoch": 0.74005126953125, "grad_norm": 3.302123785018921, "learning_rate": 1.299753189086914e-05, "lookahead_loss": 6.705378426551819, "loss": 4.3849, "step": 388000 }, { "epoch": 0.7410049438476562, "grad_norm": 2.5157999992370605, "learning_rate": 1.2949848175048829e-05, "lookahead_loss": 6.650665447235108, "loss": 4.3602, "step": 388500 }, { "epoch": 0.7419586181640625, "grad_norm": 2.457387685775757, "learning_rate": 1.2902164459228516e-05, "lookahead_loss": 6.557595596909523, "loss": 4.2947, "step": 389000 }, { "epoch": 0.7429122924804688, "grad_norm": 4.504166603088379, "learning_rate": 1.2854480743408204e-05, "lookahead_loss": 6.700963752269745, "loss": 4.3505, "step": 389500 }, { "epoch": 0.743865966796875, "grad_norm": 3.3341872692108154, "learning_rate": 1.2806797027587891e-05, "lookahead_loss": 6.665305290222168, "loss": 4.3611, "step": 390000 }, { "epoch": 0.743865966796875, "eval_accuracy": 0.02615146771037182, "eval_lookahead_loss": 6.564685486412048, "eval_lookahead_perplexity": 709.5886858138754, "eval_loss": 4.142886161804199, "eval_perplexity": 62.9843423770439, "eval_runtime": 497.3312, "eval_samples_per_second": 20.107, "eval_steps_per_second": 5.027, "step": 390000 }, { "epoch": 0.7448196411132812, "grad_norm": 2.7418582439422607, "learning_rate": 1.2759113311767578e-05, "lookahead_loss": 6.693970648288727, "loss": 4.3005, "step": 390500 }, { "epoch": 0.7457733154296875, "grad_norm": 3.876659631729126, "learning_rate": 1.2711429595947266e-05, "lookahead_loss": 6.685900143623352, "loss": 4.3388, "step": 391000 }, { "epoch": 0.7467269897460938, "grad_norm": 3.8911428451538086, "learning_rate": 1.2663745880126953e-05, "lookahead_loss": 6.513127979278565, "loss": 4.2964, "step": 391500 }, { "epoch": 0.7476806640625, "grad_norm": 2.520048141479492, "learning_rate": 1.2616062164306642e-05, "lookahead_loss": 6.618854129314423, "loss": 4.2959, "step": 392000 }, { "epoch": 0.7486343383789062, "grad_norm": 6.426451206207275, "learning_rate": 1.2568378448486329e-05, "lookahead_loss": 6.695264280796051, "loss": 4.3111, "step": 392500 }, { "epoch": 0.7495880126953125, "grad_norm": 3.5387940406799316, "learning_rate": 1.2520694732666015e-05, "lookahead_loss": 6.714041899204254, "loss": 4.2673, "step": 393000 }, { "epoch": 0.7505416870117188, "grad_norm": 2.792405843734741, "learning_rate": 1.2473011016845704e-05, "lookahead_loss": 6.693569235324859, "loss": 4.3108, "step": 393500 }, { "epoch": 0.751495361328125, "grad_norm": 3.3299753665924072, "learning_rate": 1.242532730102539e-05, "lookahead_loss": 6.6249069724082945, "loss": 4.3064, "step": 394000 }, { "epoch": 0.7524490356445312, "grad_norm": 2.1633620262145996, "learning_rate": 1.237764358520508e-05, "lookahead_loss": 6.661659617900848, "loss": 4.2984, "step": 394500 }, { "epoch": 0.7534027099609375, "grad_norm": 2.7956740856170654, "learning_rate": 1.2329959869384766e-05, "lookahead_loss": 6.622993128299713, "loss": 4.297, "step": 395000 }, { "epoch": 0.7534027099609375, "eval_accuracy": 0.02587866927592955, "eval_lookahead_loss": 6.560332302093506, "eval_lookahead_perplexity": 706.5064291572969, "eval_loss": 4.134155750274658, "eval_perplexity": 62.43685651406719, "eval_runtime": 498.2139, "eval_samples_per_second": 20.072, "eval_steps_per_second": 5.018, "step": 395000 }, { "epoch": 0.7543563842773438, "grad_norm": 2.308744192123413, "learning_rate": 1.2282276153564453e-05, "lookahead_loss": 6.707507292747498, "loss": 4.2888, "step": 395500 }, { "epoch": 0.75531005859375, "grad_norm": 2.2134950160980225, "learning_rate": 1.2234592437744141e-05, "lookahead_loss": 6.605789232730865, "loss": 4.2301, "step": 396000 }, { "epoch": 0.7562637329101562, "grad_norm": 3.837510347366333, "learning_rate": 1.2186908721923828e-05, "lookahead_loss": 6.692598711967468, "loss": 4.2857, "step": 396500 }, { "epoch": 0.7572174072265625, "grad_norm": 3.351163864135742, "learning_rate": 1.2139225006103517e-05, "lookahead_loss": 6.6370115914344785, "loss": 4.2847, "step": 397000 }, { "epoch": 0.7581710815429688, "grad_norm": 3.335264205932617, "learning_rate": 1.2091541290283204e-05, "lookahead_loss": 6.69923542881012, "loss": 4.2798, "step": 397500 }, { "epoch": 0.759124755859375, "grad_norm": 3.785309314727783, "learning_rate": 1.204385757446289e-05, "lookahead_loss": 6.617724842071533, "loss": 4.233, "step": 398000 }, { "epoch": 0.7600784301757812, "grad_norm": 6.288375377655029, "learning_rate": 1.1996173858642579e-05, "lookahead_loss": 6.654966024875641, "loss": 4.2747, "step": 398500 }, { "epoch": 0.7610321044921875, "grad_norm": 2.4916412830352783, "learning_rate": 1.1948490142822266e-05, "lookahead_loss": 6.6277365736961364, "loss": 4.2402, "step": 399000 }, { "epoch": 0.7619857788085938, "grad_norm": 5.595306396484375, "learning_rate": 1.1900806427001954e-05, "lookahead_loss": 6.634374638080597, "loss": 4.2618, "step": 399500 }, { "epoch": 0.762939453125, "grad_norm": 2.6695070266723633, "learning_rate": 1.1853122711181641e-05, "lookahead_loss": 6.644364473342896, "loss": 4.2687, "step": 400000 }, { "epoch": 0.762939453125, "eval_accuracy": 0.025910567514677105, "eval_lookahead_loss": 6.561528756618499, "eval_lookahead_perplexity": 707.3522378563239, "eval_loss": 4.129403114318848, "eval_perplexity": 62.14082089681638, "eval_runtime": 497.1165, "eval_samples_per_second": 20.116, "eval_steps_per_second": 5.029, "step": 400000 }, { "epoch": 0.7638931274414062, "grad_norm": 4.8858137130737305, "learning_rate": 1.1805438995361328e-05, "lookahead_loss": 6.589639246463776, "loss": 4.2742, "step": 400500 }, { "epoch": 0.7648468017578125, "grad_norm": 3.5187971591949463, "learning_rate": 1.1757755279541016e-05, "lookahead_loss": 6.58173553943634, "loss": 4.2891, "step": 401000 }, { "epoch": 0.7658004760742188, "grad_norm": 3.030245780944824, "learning_rate": 1.1710071563720703e-05, "lookahead_loss": 6.587851373195648, "loss": 4.2726, "step": 401500 }, { "epoch": 0.766754150390625, "grad_norm": 5.722937107086182, "learning_rate": 1.1662387847900392e-05, "lookahead_loss": 6.604130040645599, "loss": 4.2981, "step": 402000 }, { "epoch": 0.7677078247070312, "grad_norm": 2.6497461795806885, "learning_rate": 1.1614704132080079e-05, "lookahead_loss": 6.530738988876343, "loss": 4.2548, "step": 402500 }, { "epoch": 0.7686614990234375, "grad_norm": 2.821290969848633, "learning_rate": 1.1567020416259765e-05, "lookahead_loss": 6.71344186925888, "loss": 4.3425, "step": 403000 }, { "epoch": 0.7696151733398438, "grad_norm": 2.568646192550659, "learning_rate": 1.1519336700439454e-05, "lookahead_loss": 6.699754923343659, "loss": 4.3171, "step": 403500 }, { "epoch": 0.77056884765625, "grad_norm": 2.4559967517852783, "learning_rate": 1.147165298461914e-05, "lookahead_loss": 6.661921547412872, "loss": 4.3559, "step": 404000 }, { "epoch": 0.7715225219726562, "grad_norm": 3.402430534362793, "learning_rate": 1.142396926879883e-05, "lookahead_loss": 6.647783831119537, "loss": 4.396, "step": 404500 }, { "epoch": 0.7724761962890625, "grad_norm": 3.8895821571350098, "learning_rate": 1.1376285552978516e-05, "lookahead_loss": 6.7456273860931395, "loss": 4.3698, "step": 405000 }, { "epoch": 0.7724761962890625, "eval_accuracy": 0.026411154598825832, "eval_lookahead_loss": 6.555500870037079, "eval_lookahead_perplexity": 703.1012239800189, "eval_loss": 4.123408794403076, "eval_perplexity": 61.769443127136924, "eval_runtime": 498.5639, "eval_samples_per_second": 20.058, "eval_steps_per_second": 5.014, "step": 405000 }, { "epoch": 0.7734298706054688, "grad_norm": 2.766287088394165, "learning_rate": 1.1328601837158203e-05, "lookahead_loss": 6.580435157775879, "loss": 4.344, "step": 405500 }, { "epoch": 0.774383544921875, "grad_norm": 2.441617012023926, "learning_rate": 1.1280918121337891e-05, "lookahead_loss": 6.661370910644531, "loss": 4.3517, "step": 406000 }, { "epoch": 0.7753372192382812, "grad_norm": 3.585153818130493, "learning_rate": 1.1233234405517578e-05, "lookahead_loss": 6.606682596683502, "loss": 4.2932, "step": 406500 }, { "epoch": 0.7762908935546875, "grad_norm": 1.7890125513076782, "learning_rate": 1.1185550689697267e-05, "lookahead_loss": 6.6908057458400725, "loss": 4.3179, "step": 407000 }, { "epoch": 0.7772445678710938, "grad_norm": 2.8319530487060547, "learning_rate": 1.1137866973876954e-05, "lookahead_loss": 6.644765535831452, "loss": 4.288, "step": 407500 }, { "epoch": 0.7781982421875, "grad_norm": 2.4868204593658447, "learning_rate": 1.109018325805664e-05, "lookahead_loss": 6.6125330600738526, "loss": 4.2813, "step": 408000 }, { "epoch": 0.7791519165039062, "grad_norm": 2.699803113937378, "learning_rate": 1.1042499542236329e-05, "lookahead_loss": 6.636028523921967, "loss": 4.3081, "step": 408500 }, { "epoch": 0.7801055908203125, "grad_norm": 2.597249984741211, "learning_rate": 1.0994815826416016e-05, "lookahead_loss": 6.6129836401939395, "loss": 4.2781, "step": 409000 }, { "epoch": 0.7810592651367188, "grad_norm": 2.8653981685638428, "learning_rate": 1.0947132110595704e-05, "lookahead_loss": 6.6597934679985045, "loss": 4.3139, "step": 409500 }, { "epoch": 0.782012939453125, "grad_norm": 3.093632221221924, "learning_rate": 1.0899448394775391e-05, "lookahead_loss": 6.615334903717041, "loss": 4.2747, "step": 410000 }, { "epoch": 0.782012939453125, "eval_accuracy": 0.02629041095890411, "eval_lookahead_loss": 6.554099105453491, "eval_lookahead_perplexity": 702.1163320401578, "eval_loss": 4.116486072540283, "eval_perplexity": 61.343307164965466, "eval_runtime": 500.2535, "eval_samples_per_second": 19.99, "eval_steps_per_second": 4.997, "step": 410000 }, { "epoch": 0.7829666137695312, "grad_norm": 4.335233688354492, "learning_rate": 1.0851764678955078e-05, "lookahead_loss": 6.563867825508118, "loss": 4.2375, "step": 410500 }, { "epoch": 0.7839202880859375, "grad_norm": 3.7158541679382324, "learning_rate": 1.0804080963134766e-05, "lookahead_loss": 6.702996132850647, "loss": 4.3025, "step": 411000 }, { "epoch": 0.7848739624023438, "grad_norm": 3.1089818477630615, "learning_rate": 1.0756397247314453e-05, "lookahead_loss": 6.676822054862976, "loss": 4.3123, "step": 411500 }, { "epoch": 0.78582763671875, "grad_norm": 5.723045825958252, "learning_rate": 1.0708713531494142e-05, "lookahead_loss": 6.659311408519745, "loss": 4.2933, "step": 412000 }, { "epoch": 0.7867813110351562, "grad_norm": 2.5103535652160645, "learning_rate": 1.0661029815673829e-05, "lookahead_loss": 6.621230489253998, "loss": 4.2926, "step": 412500 }, { "epoch": 0.7877349853515625, "grad_norm": 3.147996425628662, "learning_rate": 1.0613346099853515e-05, "lookahead_loss": 6.78729913520813, "loss": 4.2789, "step": 413000 }, { "epoch": 0.7886886596679688, "grad_norm": 3.002119541168213, "learning_rate": 1.0565662384033204e-05, "lookahead_loss": 6.612146024703979, "loss": 4.2987, "step": 413500 }, { "epoch": 0.789642333984375, "grad_norm": 2.4499685764312744, "learning_rate": 1.051797866821289e-05, "lookahead_loss": 6.680230628013611, "loss": 4.2722, "step": 414000 }, { "epoch": 0.7905960083007812, "grad_norm": 5.132491588592529, "learning_rate": 1.047029495239258e-05, "lookahead_loss": 6.743901293754577, "loss": 4.2675, "step": 414500 }, { "epoch": 0.7915496826171875, "grad_norm": 2.842432737350464, "learning_rate": 1.0422611236572266e-05, "lookahead_loss": 6.654487781047821, "loss": 4.2801, "step": 415000 }, { "epoch": 0.7915496826171875, "eval_accuracy": 0.026136399217221135, "eval_lookahead_loss": 6.54736415977478, "eval_lookahead_perplexity": 697.4035048156809, "eval_loss": 4.110983371734619, "eval_perplexity": 61.006680327006265, "eval_runtime": 517.1643, "eval_samples_per_second": 19.336, "eval_steps_per_second": 4.834, "step": 415000 }, { "epoch": 0.7925033569335938, "grad_norm": 3.0221896171569824, "learning_rate": 1.0374927520751953e-05, "lookahead_loss": 6.6506709976196285, "loss": 4.2623, "step": 415500 }, { "epoch": 0.79345703125, "grad_norm": 2.9380404949188232, "learning_rate": 1.0327243804931641e-05, "lookahead_loss": 6.701758219718933, "loss": 4.2743, "step": 416000 }, { "epoch": 0.7944107055664062, "grad_norm": 3.1665589809417725, "learning_rate": 1.0279560089111328e-05, "lookahead_loss": 6.594400270462036, "loss": 4.2377, "step": 416500 }, { "epoch": 0.7953643798828125, "grad_norm": 10.091591835021973, "learning_rate": 1.0231876373291017e-05, "lookahead_loss": 6.658792613983154, "loss": 4.3318, "step": 417000 }, { "epoch": 0.7963180541992188, "grad_norm": 2.720912456512451, "learning_rate": 1.0184192657470704e-05, "lookahead_loss": 6.620512091636658, "loss": 4.2475, "step": 417500 }, { "epoch": 0.797271728515625, "grad_norm": 2.677490711212158, "learning_rate": 1.013650894165039e-05, "lookahead_loss": 6.652620564937592, "loss": 4.2669, "step": 418000 }, { "epoch": 0.7982254028320312, "grad_norm": 3.1806468963623047, "learning_rate": 1.0088825225830079e-05, "lookahead_loss": 6.6070528383255, "loss": 4.2828, "step": 418500 }, { "epoch": 0.7991790771484375, "grad_norm": 4.604299068450928, "learning_rate": 1.0041141510009766e-05, "lookahead_loss": 6.5740251870155335, "loss": 4.2718, "step": 419000 }, { "epoch": 0.8001327514648438, "grad_norm": 3.0425753593444824, "learning_rate": 9.993457794189454e-06, "lookahead_loss": 6.609935761451721, "loss": 4.2624, "step": 419500 }, { "epoch": 0.80108642578125, "grad_norm": 6.122983455657959, "learning_rate": 9.945774078369141e-06, "lookahead_loss": 6.606327618598938, "loss": 4.2964, "step": 420000 }, { "epoch": 0.80108642578125, "eval_accuracy": 0.02623894324853229, "eval_lookahead_loss": 6.54739080991745, "eval_lookahead_perplexity": 697.4220909662431, "eval_loss": 4.104123592376709, "eval_perplexity": 60.58962006673757, "eval_runtime": 506.8903, "eval_samples_per_second": 19.728, "eval_steps_per_second": 4.932, "step": 420000 }, { "epoch": 0.8020401000976562, "grad_norm": 4.119128704071045, "learning_rate": 9.898090362548828e-06, "lookahead_loss": 6.594478064537048, "loss": 4.3137, "step": 420500 }, { "epoch": 0.8029937744140625, "grad_norm": 2.293022394180298, "learning_rate": 9.850406646728516e-06, "lookahead_loss": 6.668021618366241, "loss": 4.3428, "step": 421000 }, { "epoch": 0.8039474487304688, "grad_norm": 3.1304075717926025, "learning_rate": 9.802722930908203e-06, "lookahead_loss": 6.671570154666901, "loss": 4.3319, "step": 421500 }, { "epoch": 0.804901123046875, "grad_norm": 2.5874991416931152, "learning_rate": 9.755039215087892e-06, "lookahead_loss": 6.676826702594757, "loss": 4.3197, "step": 422000 }, { "epoch": 0.8058547973632812, "grad_norm": 3.4005072116851807, "learning_rate": 9.707355499267579e-06, "lookahead_loss": 6.630913782596588, "loss": 4.3603, "step": 422500 }, { "epoch": 0.8068084716796875, "grad_norm": 4.515960693359375, "learning_rate": 9.659671783447265e-06, "lookahead_loss": 6.675499610900879, "loss": 4.2991, "step": 423000 }, { "epoch": 0.8077621459960938, "grad_norm": 2.485811471939087, "learning_rate": 9.611988067626954e-06, "lookahead_loss": 6.69144215965271, "loss": 4.3012, "step": 423500 }, { "epoch": 0.8087158203125, "grad_norm": 50.45549011230469, "learning_rate": 9.56430435180664e-06, "lookahead_loss": 6.633441325187683, "loss": 4.2539, "step": 424000 }, { "epoch": 0.8096694946289062, "grad_norm": 7.830841064453125, "learning_rate": 9.51662063598633e-06, "lookahead_loss": 6.674381168365478, "loss": 4.3194, "step": 424500 }, { "epoch": 0.8106231689453125, "grad_norm": 3.276059865951538, "learning_rate": 9.468936920166016e-06, "lookahead_loss": 6.652407437324524, "loss": 4.2828, "step": 425000 }, { "epoch": 0.8106231689453125, "eval_accuracy": 0.026173972602739726, "eval_lookahead_loss": 6.5472285559654235, "eval_lookahead_perplexity": 697.3089406555443, "eval_loss": 4.098784446716309, "eval_perplexity": 60.26698532306999, "eval_runtime": 517.6691, "eval_samples_per_second": 19.317, "eval_steps_per_second": 4.829, "step": 425000 }, { "epoch": 0.8115768432617188, "grad_norm": 2.8764538764953613, "learning_rate": 9.421253204345703e-06, "lookahead_loss": 6.653769429683686, "loss": 4.285, "step": 425500 }, { "epoch": 0.812530517578125, "grad_norm": 2.9866414070129395, "learning_rate": 9.373569488525391e-06, "lookahead_loss": 6.658207138061523, "loss": 4.2781, "step": 426000 }, { "epoch": 0.8134841918945312, "grad_norm": 12.330890655517578, "learning_rate": 9.325885772705078e-06, "lookahead_loss": 6.664547824382782, "loss": 4.2856, "step": 426500 }, { "epoch": 0.8144378662109375, "grad_norm": 2.519469738006592, "learning_rate": 9.278202056884767e-06, "lookahead_loss": 6.707953702926636, "loss": 4.2857, "step": 427000 }, { "epoch": 0.8153915405273438, "grad_norm": 2.4571871757507324, "learning_rate": 9.230518341064454e-06, "lookahead_loss": 6.6104178705215455, "loss": 4.2645, "step": 427500 }, { "epoch": 0.81634521484375, "grad_norm": 2.3348891735076904, "learning_rate": 9.18283462524414e-06, "lookahead_loss": 6.678764070510864, "loss": 4.2738, "step": 428000 }, { "epoch": 0.8172988891601562, "grad_norm": 14.179648399353027, "learning_rate": 9.135150909423829e-06, "lookahead_loss": 6.676016499042511, "loss": 4.3071, "step": 428500 }, { "epoch": 0.8182525634765625, "grad_norm": 2.5469255447387695, "learning_rate": 9.087467193603516e-06, "lookahead_loss": 6.6185896263122554, "loss": 4.2438, "step": 429000 }, { "epoch": 0.8192062377929688, "grad_norm": 2.6413934230804443, "learning_rate": 9.039783477783204e-06, "lookahead_loss": 6.717736600875854, "loss": 4.2664, "step": 429500 }, { "epoch": 0.820159912109375, "grad_norm": 3.067246198654175, "learning_rate": 8.992099761962891e-06, "lookahead_loss": 6.628569370746613, "loss": 4.2847, "step": 430000 }, { "epoch": 0.820159912109375, "eval_accuracy": 0.025995694716242662, "eval_lookahead_loss": 6.539514183139801, "eval_lookahead_perplexity": 691.950335227618, "eval_loss": 4.092729091644287, "eval_perplexity": 59.90315001554418, "eval_runtime": 510.1128, "eval_samples_per_second": 19.604, "eval_steps_per_second": 4.901, "step": 430000 }, { "epoch": 0.8211135864257812, "grad_norm": 5.744166374206543, "learning_rate": 8.944416046142578e-06, "lookahead_loss": 6.658939532279968, "loss": 4.2638, "step": 430500 }, { "epoch": 0.8220672607421875, "grad_norm": 2.5084736347198486, "learning_rate": 8.896732330322266e-06, "lookahead_loss": 6.609486294269562, "loss": 4.291, "step": 431000 }, { "epoch": 0.8230209350585938, "grad_norm": 3.661972761154175, "learning_rate": 8.849048614501953e-06, "lookahead_loss": 6.5864327449798585, "loss": 4.2804, "step": 431500 }, { "epoch": 0.823974609375, "grad_norm": 2.7139265537261963, "learning_rate": 8.801364898681642e-06, "lookahead_loss": 6.586454041004181, "loss": 4.2313, "step": 432000 }, { "epoch": 0.8249282836914062, "grad_norm": 7.337777137756348, "learning_rate": 8.753681182861329e-06, "lookahead_loss": 6.613495799064636, "loss": 4.2614, "step": 432500 }, { "epoch": 0.8258819580078125, "grad_norm": 5.210099697113037, "learning_rate": 8.705997467041015e-06, "lookahead_loss": 6.696278253555298, "loss": 4.2438, "step": 433000 }, { "epoch": 0.8268356323242188, "grad_norm": 14.80707836151123, "learning_rate": 8.658313751220704e-06, "lookahead_loss": 6.633884486198426, "loss": 4.2889, "step": 433500 }, { "epoch": 0.827789306640625, "grad_norm": 3.6266610622406006, "learning_rate": 8.61063003540039e-06, "lookahead_loss": 6.626838976860046, "loss": 4.2391, "step": 434000 }, { "epoch": 0.8287429809570312, "grad_norm": 3.8215153217315674, "learning_rate": 8.56294631958008e-06, "lookahead_loss": 6.685880282402039, "loss": 4.2508, "step": 434500 }, { "epoch": 0.8296966552734375, "grad_norm": 7.333991050720215, "learning_rate": 8.515262603759766e-06, "lookahead_loss": 6.599733174800873, "loss": 4.2484, "step": 435000 }, { "epoch": 0.8296966552734375, "eval_accuracy": 0.026116438356164385, "eval_lookahead_loss": 6.538583665084839, "eval_lookahead_perplexity": 691.3067624220465, "eval_loss": 4.087435722351074, "eval_perplexity": 59.58689827774516, "eval_runtime": 522.8808, "eval_samples_per_second": 19.125, "eval_steps_per_second": 4.781, "step": 435000 }, { "epoch": 0.8306503295898438, "grad_norm": 4.17072868347168, "learning_rate": 8.467578887939453e-06, "lookahead_loss": 6.4633186254501345, "loss": 4.254, "step": 435500 }, { "epoch": 0.83160400390625, "grad_norm": 4.413368225097656, "learning_rate": 8.419895172119141e-06, "lookahead_loss": 6.60126486825943, "loss": 4.2859, "step": 436000 }, { "epoch": 0.8325576782226562, "grad_norm": 2.8531882762908936, "learning_rate": 8.372211456298828e-06, "lookahead_loss": 6.679268563270569, "loss": 4.2519, "step": 436500 }, { "epoch": 0.8335113525390625, "grad_norm": 3.2767746448516846, "learning_rate": 8.324527740478517e-06, "lookahead_loss": 6.669997920036316, "loss": 4.2468, "step": 437000 }, { "epoch": 0.8344650268554688, "grad_norm": 9.254007339477539, "learning_rate": 8.276844024658204e-06, "lookahead_loss": 6.655861560821533, "loss": 4.2617, "step": 437500 }, { "epoch": 0.835418701171875, "grad_norm": 4.701876640319824, "learning_rate": 8.22916030883789e-06, "lookahead_loss": 6.7079678115844725, "loss": 4.3019, "step": 438000 }, { "epoch": 0.8363723754882812, "grad_norm": 3.497493028640747, "learning_rate": 8.181476593017579e-06, "lookahead_loss": 6.65270103263855, "loss": 4.3247, "step": 438500 }, { "epoch": 0.8373260498046875, "grad_norm": 2.954586982727051, "learning_rate": 8.133792877197266e-06, "lookahead_loss": 6.69448539018631, "loss": 4.3275, "step": 439000 }, { "epoch": 0.8382797241210938, "grad_norm": 2.6196553707122803, "learning_rate": 8.086109161376954e-06, "lookahead_loss": 6.6499974822998045, "loss": 4.3345, "step": 439500 }, { "epoch": 0.8392333984375, "grad_norm": 2.873448371887207, "learning_rate": 8.038425445556641e-06, "lookahead_loss": 6.652175800800324, "loss": 4.2779, "step": 440000 }, { "epoch": 0.8392333984375, "eval_accuracy": 0.026026614481409003, "eval_lookahead_loss": 6.535137751293182, "eval_lookahead_perplexity": 688.9286786042409, "eval_loss": 4.083395004272461, "eval_perplexity": 59.346610215723835, "eval_runtime": 526.1937, "eval_samples_per_second": 19.004, "eval_steps_per_second": 4.751, "step": 440000 }, { "epoch": 0.8401870727539062, "grad_norm": 4.472801685333252, "learning_rate": 7.990741729736328e-06, "lookahead_loss": 6.594625084400177, "loss": 4.279, "step": 440500 }, { "epoch": 0.8411407470703125, "grad_norm": 2.95894193649292, "learning_rate": 7.943058013916016e-06, "lookahead_loss": 6.615108338356018, "loss": 4.2672, "step": 441000 }, { "epoch": 0.8420944213867188, "grad_norm": 5.113205432891846, "learning_rate": 7.895374298095703e-06, "lookahead_loss": 6.679742815494538, "loss": 4.256, "step": 441500 }, { "epoch": 0.843048095703125, "grad_norm": 2.5649607181549072, "learning_rate": 7.847690582275392e-06, "lookahead_loss": 6.653530582427979, "loss": 4.226, "step": 442000 }, { "epoch": 0.8440017700195312, "grad_norm": 3.272820472717285, "learning_rate": 7.800006866455079e-06, "lookahead_loss": 6.607401587486267, "loss": 4.2678, "step": 442500 }, { "epoch": 0.8449554443359375, "grad_norm": 3.735708713531494, "learning_rate": 7.752323150634765e-06, "lookahead_loss": 6.670604539394379, "loss": 4.2601, "step": 443000 }, { "epoch": 0.8459091186523438, "grad_norm": 3.8364078998565674, "learning_rate": 7.704639434814454e-06, "lookahead_loss": 6.611488430976868, "loss": 4.256, "step": 443500 }, { "epoch": 0.84686279296875, "grad_norm": 3.199310779571533, "learning_rate": 7.65695571899414e-06, "lookahead_loss": 6.594077569007873, "loss": 4.2686, "step": 444000 }, { "epoch": 0.8478164672851562, "grad_norm": 5.144148826599121, "learning_rate": 7.6092720031738284e-06, "lookahead_loss": 6.68904780960083, "loss": 4.2425, "step": 444500 }, { "epoch": 0.8487701416015625, "grad_norm": 2.212801694869995, "learning_rate": 7.561588287353516e-06, "lookahead_loss": 6.583337436676025, "loss": 4.2502, "step": 445000 }, { "epoch": 0.8487701416015625, "eval_accuracy": 0.02626673189823875, "eval_lookahead_loss": 6.532193170928955, "eval_lookahead_perplexity": 686.9030565119941, "eval_loss": 4.076977729797363, "eval_perplexity": 58.96698610763953, "eval_runtime": 521.7152, "eval_samples_per_second": 19.168, "eval_steps_per_second": 4.792, "step": 445000 }, { "epoch": 0.8497238159179688, "grad_norm": 2.848716974258423, "learning_rate": 7.513904571533204e-06, "lookahead_loss": 6.6040154967308045, "loss": 4.2633, "step": 445500 }, { "epoch": 0.850677490234375, "grad_norm": 8.098231315612793, "learning_rate": 7.466220855712891e-06, "lookahead_loss": 6.605663443565368, "loss": 4.2507, "step": 446000 }, { "epoch": 0.8516311645507812, "grad_norm": 3.05631947517395, "learning_rate": 7.418537139892578e-06, "lookahead_loss": 6.61659708738327, "loss": 4.2852, "step": 446500 }, { "epoch": 0.8525848388671875, "grad_norm": 2.881080389022827, "learning_rate": 7.370853424072266e-06, "lookahead_loss": 6.720996547698975, "loss": 4.2493, "step": 447000 }, { "epoch": 0.8535385131835938, "grad_norm": 2.530477523803711, "learning_rate": 7.323169708251954e-06, "lookahead_loss": 6.60098335981369, "loss": 4.2731, "step": 447500 }, { "epoch": 0.8544921875, "grad_norm": 2.4100759029388428, "learning_rate": 7.275485992431641e-06, "lookahead_loss": 6.625934128284454, "loss": 4.2704, "step": 448000 }, { "epoch": 0.8554458618164062, "grad_norm": 2.6307332515716553, "learning_rate": 7.227802276611328e-06, "lookahead_loss": 6.629112528800964, "loss": 4.2404, "step": 448500 }, { "epoch": 0.8563995361328125, "grad_norm": 3.420232057571411, "learning_rate": 7.180118560791016e-06, "lookahead_loss": 6.582442262172699, "loss": 4.1817, "step": 449000 }, { "epoch": 0.8573532104492188, "grad_norm": 2.4548017978668213, "learning_rate": 7.1324348449707034e-06, "lookahead_loss": 6.648948418140411, "loss": 4.2439, "step": 449500 }, { "epoch": 0.858306884765625, "grad_norm": 2.444723129272461, "learning_rate": 7.084751129150391e-06, "lookahead_loss": 6.569156549930573, "loss": 4.2147, "step": 450000 }, { "epoch": 0.858306884765625, "eval_accuracy": 0.02606086105675147, "eval_lookahead_loss": 6.531340920543671, "eval_lookahead_perplexity": 686.3178925059723, "eval_loss": 4.07657527923584, "eval_perplexity": 58.94325958566545, "eval_runtime": 555.0671, "eval_samples_per_second": 18.016, "eval_steps_per_second": 4.504, "step": 450000 }, { "epoch": 0.8592605590820312, "grad_norm": 3.7206337451934814, "learning_rate": 7.037067413330079e-06, "lookahead_loss": 6.673008211612702, "loss": 4.2324, "step": 450500 }, { "epoch": 0.8602142333984375, "grad_norm": 3.1438088417053223, "learning_rate": 6.989383697509766e-06, "lookahead_loss": 6.551126944065094, "loss": 4.1859, "step": 451000 }, { "epoch": 0.8611679077148438, "grad_norm": 5.4472222328186035, "learning_rate": 6.941699981689453e-06, "lookahead_loss": 6.690309646606445, "loss": 4.2427, "step": 451500 }, { "epoch": 0.86212158203125, "grad_norm": 2.896361827850342, "learning_rate": 6.894016265869141e-06, "lookahead_loss": 6.622882782459259, "loss": 4.2305, "step": 452000 }, { "epoch": 0.8630752563476562, "grad_norm": 4.0095672607421875, "learning_rate": 6.846332550048829e-06, "lookahead_loss": 6.543432626724243, "loss": 4.2504, "step": 452500 }, { "epoch": 0.8640289306640625, "grad_norm": 2.3271827697753906, "learning_rate": 6.798648834228516e-06, "lookahead_loss": 6.637137311458588, "loss": 4.2367, "step": 453000 }, { "epoch": 0.8649826049804688, "grad_norm": 3.728686809539795, "learning_rate": 6.750965118408203e-06, "lookahead_loss": 6.569926063537598, "loss": 4.2498, "step": 453500 }, { "epoch": 0.865936279296875, "grad_norm": 3.34240460395813, "learning_rate": 6.703281402587891e-06, "lookahead_loss": 6.6253420701026915, "loss": 4.2448, "step": 454000 }, { "epoch": 0.8668899536132812, "grad_norm": 2.570436716079712, "learning_rate": 6.6555976867675784e-06, "lookahead_loss": 6.60813649559021, "loss": 4.3222, "step": 454500 }, { "epoch": 0.8678436279296875, "grad_norm": 2.9127368927001953, "learning_rate": 6.607913970947266e-06, "lookahead_loss": 6.625271406650543, "loss": 4.3302, "step": 455000 }, { "epoch": 0.8678436279296875, "eval_accuracy": 0.02626320939334638, "eval_lookahead_loss": 6.529072472381592, "eval_lookahead_perplexity": 684.7627804565157, "eval_loss": 4.069154262542725, "eval_perplexity": 58.50745971141493, "eval_runtime": 531.486, "eval_samples_per_second": 18.815, "eval_steps_per_second": 4.704, "step": 455000 }, { "epoch": 0.8687973022460938, "grad_norm": 2.9328160285949707, "learning_rate": 6.560230255126954e-06, "lookahead_loss": 6.617561619281769, "loss": 4.2758, "step": 455500 }, { "epoch": 0.8697509765625, "grad_norm": 3.0083749294281006, "learning_rate": 6.512546539306641e-06, "lookahead_loss": 6.725751633644104, "loss": 4.32, "step": 456000 }, { "epoch": 0.8707046508789062, "grad_norm": 2.4908618927001953, "learning_rate": 6.464862823486328e-06, "lookahead_loss": 6.599325193405152, "loss": 4.257, "step": 456500 }, { "epoch": 0.8716583251953125, "grad_norm": 2.5758073329925537, "learning_rate": 6.417179107666016e-06, "lookahead_loss": 6.65661363363266, "loss": 4.2526, "step": 457000 }, { "epoch": 0.8726119995117188, "grad_norm": 4.87410306930542, "learning_rate": 6.369495391845704e-06, "lookahead_loss": 6.610780673980713, "loss": 4.247, "step": 457500 }, { "epoch": 0.873565673828125, "grad_norm": 4.387118816375732, "learning_rate": 6.321811676025391e-06, "lookahead_loss": 6.641941963672638, "loss": 4.2742, "step": 458000 }, { "epoch": 0.8745193481445312, "grad_norm": 6.7322821617126465, "learning_rate": 6.274127960205078e-06, "lookahead_loss": 6.681360451221466, "loss": 4.2435, "step": 458500 }, { "epoch": 0.8754730224609375, "grad_norm": 2.5205633640289307, "learning_rate": 6.226444244384766e-06, "lookahead_loss": 6.725174157619477, "loss": 4.3076, "step": 459000 }, { "epoch": 0.8764266967773438, "grad_norm": 2.705512285232544, "learning_rate": 6.1787605285644534e-06, "lookahead_loss": 6.653602551937103, "loss": 4.2507, "step": 459500 }, { "epoch": 0.87738037109375, "grad_norm": 2.6948580741882324, "learning_rate": 6.131076812744141e-06, "lookahead_loss": 6.559046368122101, "loss": 4.2672, "step": 460000 }, { "epoch": 0.87738037109375, "eval_accuracy": 0.026212720156555774, "eval_lookahead_loss": 6.527213355064392, "eval_lookahead_perplexity": 683.4909087588768, "eval_loss": 4.06601619720459, "eval_perplexity": 58.324147253737024, "eval_runtime": 541.6339, "eval_samples_per_second": 18.463, "eval_steps_per_second": 4.616, "step": 460000 }, { "epoch": 0.8783340454101562, "grad_norm": 2.859353542327881, "learning_rate": 6.083393096923829e-06, "lookahead_loss": 6.5805863113403325, "loss": 4.2418, "step": 460500 }, { "epoch": 0.8792877197265625, "grad_norm": 6.312974452972412, "learning_rate": 6.035709381103516e-06, "lookahead_loss": 6.628599304676056, "loss": 4.2687, "step": 461000 }, { "epoch": 0.8802413940429688, "grad_norm": 3.572995662689209, "learning_rate": 5.988025665283203e-06, "lookahead_loss": 6.570033200263977, "loss": 4.2236, "step": 461500 }, { "epoch": 0.881195068359375, "grad_norm": 4.1703104972839355, "learning_rate": 5.940341949462891e-06, "lookahead_loss": 6.681577812194824, "loss": 4.2637, "step": 462000 }, { "epoch": 0.8821487426757812, "grad_norm": 4.4267354011535645, "learning_rate": 5.892658233642579e-06, "lookahead_loss": 6.601446920871735, "loss": 4.2247, "step": 462500 }, { "epoch": 0.8831024169921875, "grad_norm": 20.743913650512695, "learning_rate": 5.844974517822266e-06, "lookahead_loss": 6.63533249092102, "loss": 4.2644, "step": 463000 }, { "epoch": 0.8840560913085938, "grad_norm": 3.5155651569366455, "learning_rate": 5.797290802001953e-06, "lookahead_loss": 6.560450331687927, "loss": 4.2503, "step": 463500 }, { "epoch": 0.885009765625, "grad_norm": 3.853111743927002, "learning_rate": 5.749607086181641e-06, "lookahead_loss": 6.673612300395965, "loss": 4.252, "step": 464000 }, { "epoch": 0.8859634399414062, "grad_norm": 2.14844012260437, "learning_rate": 5.7019233703613284e-06, "lookahead_loss": 6.64728904914856, "loss": 4.2353, "step": 464500 }, { "epoch": 0.8869171142578125, "grad_norm": 3.5505640506744385, "learning_rate": 5.654239654541016e-06, "lookahead_loss": 6.582558938503265, "loss": 4.2339, "step": 465000 }, { "epoch": 0.8869171142578125, "eval_accuracy": 0.026174559686888452, "eval_lookahead_loss": 6.526713365459442, "eval_lookahead_perplexity": 683.1492558279942, "eval_loss": 4.063615798950195, "eval_perplexity": 58.18431396741055, "eval_runtime": 550.3305, "eval_samples_per_second": 18.171, "eval_steps_per_second": 4.543, "step": 465000 }, { "epoch": 0.8878707885742188, "grad_norm": 5.016045093536377, "learning_rate": 5.606555938720704e-06, "lookahead_loss": 6.554610238075257, "loss": 4.2233, "step": 465500 }, { "epoch": 0.888824462890625, "grad_norm": 3.4702141284942627, "learning_rate": 5.558872222900391e-06, "lookahead_loss": 6.65569415473938, "loss": 4.1922, "step": 466000 }, { "epoch": 0.8897781372070312, "grad_norm": 3.0336902141571045, "learning_rate": 5.511188507080078e-06, "lookahead_loss": 6.671941471099854, "loss": 4.1937, "step": 466500 }, { "epoch": 0.8907318115234375, "grad_norm": 2.479767084121704, "learning_rate": 5.463504791259766e-06, "lookahead_loss": 6.6847689499855045, "loss": 4.2253, "step": 467000 }, { "epoch": 0.8916854858398438, "grad_norm": 2.5143187046051025, "learning_rate": 5.415821075439454e-06, "lookahead_loss": 6.607502378463745, "loss": 4.216, "step": 467500 }, { "epoch": 0.89263916015625, "grad_norm": 3.500342845916748, "learning_rate": 5.368137359619141e-06, "lookahead_loss": 6.631697944641113, "loss": 4.2291, "step": 468000 }, { "epoch": 0.8935928344726562, "grad_norm": 5.390781402587891, "learning_rate": 5.320453643798828e-06, "lookahead_loss": 6.668407920360565, "loss": 4.2324, "step": 468500 }, { "epoch": 0.8945465087890625, "grad_norm": 14.906558990478516, "learning_rate": 5.272769927978516e-06, "lookahead_loss": 6.6372659602165225, "loss": 4.266, "step": 469000 }, { "epoch": 0.8955001831054688, "grad_norm": 76.5853271484375, "learning_rate": 5.2250862121582034e-06, "lookahead_loss": 6.577320454597473, "loss": 4.2453, "step": 469500 }, { "epoch": 0.896453857421875, "grad_norm": 2.6320252418518066, "learning_rate": 5.177402496337891e-06, "lookahead_loss": 6.691053541660309, "loss": 4.2653, "step": 470000 }, { "epoch": 0.896453857421875, "eval_accuracy": 0.026293542074363994, "eval_lookahead_loss": 6.522984474754334, "eval_lookahead_perplexity": 680.6066104872345, "eval_loss": 4.057998180389404, "eval_perplexity": 57.85837304835865, "eval_runtime": 514.4224, "eval_samples_per_second": 19.439, "eval_steps_per_second": 4.86, "step": 470000 }, { "epoch": 0.8974075317382812, "grad_norm": 4.103435039520264, "learning_rate": 5.129718780517579e-06, "lookahead_loss": 6.602483690261841, "loss": 4.2418, "step": 470500 }, { "epoch": 0.8983612060546875, "grad_norm": 2.9843530654907227, "learning_rate": 5.082035064697266e-06, "lookahead_loss": 6.683919417858124, "loss": 4.3126, "step": 471000 }, { "epoch": 0.8993148803710938, "grad_norm": 2.5269765853881836, "learning_rate": 5.034351348876953e-06, "lookahead_loss": 6.5691752800941465, "loss": 4.3809, "step": 471500 }, { "epoch": 0.9002685546875, "grad_norm": 3.603677272796631, "learning_rate": 4.986667633056641e-06, "lookahead_loss": 6.618152508258819, "loss": 4.3154, "step": 472000 }, { "epoch": 0.9012222290039062, "grad_norm": 3.0613391399383545, "learning_rate": 4.938983917236329e-06, "lookahead_loss": 6.602779621601105, "loss": 4.2913, "step": 472500 }, { "epoch": 0.9021759033203125, "grad_norm": 2.7142789363861084, "learning_rate": 4.891300201416016e-06, "lookahead_loss": 6.614610056400299, "loss": 4.2457, "step": 473000 }, { "epoch": 0.9031295776367188, "grad_norm": 2.192488193511963, "learning_rate": 4.843616485595703e-06, "lookahead_loss": 6.607316645145416, "loss": 4.2673, "step": 473500 }, { "epoch": 0.904083251953125, "grad_norm": 2.7316648960113525, "learning_rate": 4.795932769775391e-06, "lookahead_loss": 6.647573236942291, "loss": 4.2697, "step": 474000 }, { "epoch": 0.9050369262695312, "grad_norm": 3.821866512298584, "learning_rate": 4.7482490539550784e-06, "lookahead_loss": 6.716279386520386, "loss": 4.2278, "step": 474500 }, { "epoch": 0.9059906005859375, "grad_norm": 4.446009159088135, "learning_rate": 4.700565338134766e-06, "lookahead_loss": 6.697609417915344, "loss": 4.2606, "step": 475000 }, { "epoch": 0.9059906005859375, "eval_accuracy": 0.026237769080234834, "eval_lookahead_loss": 6.519714889526367, "eval_lookahead_perplexity": 678.3849431121141, "eval_loss": 4.055566787719727, "eval_perplexity": 57.71786750554135, "eval_runtime": 536.6797, "eval_samples_per_second": 18.633, "eval_steps_per_second": 4.658, "step": 475000 }, { "epoch": 0.9069442749023438, "grad_norm": 3.599128484725952, "learning_rate": 4.652881622314453e-06, "lookahead_loss": 6.564849036216736, "loss": 4.2507, "step": 475500 }, { "epoch": 0.90789794921875, "grad_norm": 3.8391809463500977, "learning_rate": 4.605197906494141e-06, "lookahead_loss": 6.689366463661194, "loss": 4.2329, "step": 476000 }, { "epoch": 0.9088516235351562, "grad_norm": 2.8635313510894775, "learning_rate": 4.557514190673828e-06, "lookahead_loss": 6.6049631452560424, "loss": 4.2442, "step": 476500 }, { "epoch": 0.9098052978515625, "grad_norm": 2.454864978790283, "learning_rate": 4.509830474853516e-06, "lookahead_loss": 6.508340880393982, "loss": 4.2214, "step": 477000 }, { "epoch": 0.9107589721679688, "grad_norm": 2.5846598148345947, "learning_rate": 4.462146759033204e-06, "lookahead_loss": 6.5625168390274045, "loss": 4.2097, "step": 477500 }, { "epoch": 0.911712646484375, "grad_norm": 2.7328832149505615, "learning_rate": 4.4144630432128904e-06, "lookahead_loss": 6.667540644645691, "loss": 4.2724, "step": 478000 }, { "epoch": 0.9126663208007812, "grad_norm": 2.67948579788208, "learning_rate": 4.366779327392578e-06, "lookahead_loss": 6.675197106361389, "loss": 4.229, "step": 478500 }, { "epoch": 0.9136199951171875, "grad_norm": 3.4494199752807617, "learning_rate": 4.319095611572266e-06, "lookahead_loss": 6.652788396835327, "loss": 4.2316, "step": 479000 }, { "epoch": 0.9145736694335938, "grad_norm": 3.322443962097168, "learning_rate": 4.2714118957519534e-06, "lookahead_loss": 6.696266431808471, "loss": 4.2284, "step": 479500 }, { "epoch": 0.91552734375, "grad_norm": 2.631840229034424, "learning_rate": 4.223728179931641e-06, "lookahead_loss": 6.5073800740242005, "loss": 4.1935, "step": 480000 }, { "epoch": 0.91552734375, "eval_accuracy": 0.026120156555772996, "eval_lookahead_loss": 6.520347544574737, "eval_lookahead_perplexity": 678.8142625623517, "eval_loss": 4.053394317626953, "eval_perplexity": 57.5926132693707, "eval_runtime": 546.9441, "eval_samples_per_second": 18.283, "eval_steps_per_second": 4.571, "step": 480000 }, { "epoch": 0.9164810180664062, "grad_norm": 3.543037176132202, "learning_rate": 4.176044464111328e-06, "lookahead_loss": 6.579752499580383, "loss": 4.2244, "step": 480500 }, { "epoch": 0.9174346923828125, "grad_norm": 2.5560812950134277, "learning_rate": 4.128360748291016e-06, "lookahead_loss": 6.580712119102478, "loss": 4.2226, "step": 481000 }, { "epoch": 0.9183883666992188, "grad_norm": 3.2254488468170166, "learning_rate": 4.080677032470703e-06, "lookahead_loss": 6.5551387901306155, "loss": 4.202, "step": 481500 }, { "epoch": 0.919342041015625, "grad_norm": 3.4346179962158203, "learning_rate": 4.032993316650391e-06, "lookahead_loss": 6.567693585395813, "loss": 4.2292, "step": 482000 }, { "epoch": 0.9202957153320312, "grad_norm": 2.466153383255005, "learning_rate": 3.985309600830079e-06, "lookahead_loss": 6.658562662124634, "loss": 4.2124, "step": 482500 }, { "epoch": 0.9212493896484375, "grad_norm": 6.278874397277832, "learning_rate": 3.9376258850097654e-06, "lookahead_loss": 6.650185940742492, "loss": 4.2197, "step": 483000 }, { "epoch": 0.9222030639648438, "grad_norm": 3.4738569259643555, "learning_rate": 3.889942169189453e-06, "lookahead_loss": 6.570983042240143, "loss": 4.1923, "step": 483500 }, { "epoch": 0.92315673828125, "grad_norm": 3.485471248626709, "learning_rate": 3.842258453369141e-06, "lookahead_loss": 6.593604946136475, "loss": 4.2085, "step": 484000 }, { "epoch": 0.9241104125976562, "grad_norm": 1.9643586874008179, "learning_rate": 3.7945747375488284e-06, "lookahead_loss": 6.5568357219696045, "loss": 4.2082, "step": 484500 }, { "epoch": 0.9250640869140625, "grad_norm": 2.870086669921875, "learning_rate": 3.7468910217285157e-06, "lookahead_loss": 6.577616129398346, "loss": 4.2571, "step": 485000 }, { "epoch": 0.9250640869140625, "eval_accuracy": 0.026050097847358122, "eval_lookahead_loss": 6.518237040424347, "eval_lookahead_perplexity": 677.3831329775484, "eval_loss": 4.048369884490967, "eval_perplexity": 57.30396877957541, "eval_runtime": 548.6592, "eval_samples_per_second": 18.226, "eval_steps_per_second": 4.557, "step": 485000 }, { "epoch": 0.9260177612304688, "grad_norm": 3.7176942825317383, "learning_rate": 3.6992073059082034e-06, "lookahead_loss": 6.552393431663513, "loss": 4.1942, "step": 485500 }, { "epoch": 0.926971435546875, "grad_norm": 3.3978350162506104, "learning_rate": 3.6515235900878906e-06, "lookahead_loss": 6.5988184909820555, "loss": 4.2097, "step": 486000 }, { "epoch": 0.9279251098632812, "grad_norm": 5.2661542892456055, "learning_rate": 3.6038398742675783e-06, "lookahead_loss": 6.531791122436523, "loss": 4.2173, "step": 486500 }, { "epoch": 0.9288787841796875, "grad_norm": 2.9231197834014893, "learning_rate": 3.556156158447266e-06, "lookahead_loss": 6.569038414001465, "loss": 4.2464, "step": 487000 }, { "epoch": 0.9298324584960938, "grad_norm": 2.9972376823425293, "learning_rate": 3.508472442626953e-06, "lookahead_loss": 6.612027626991272, "loss": 4.2706, "step": 487500 }, { "epoch": 0.9307861328125, "grad_norm": 4.5635833740234375, "learning_rate": 3.460788726806641e-06, "lookahead_loss": 6.609994557380676, "loss": 4.3291, "step": 488000 }, { "epoch": 0.9317398071289062, "grad_norm": 4.255969047546387, "learning_rate": 3.413105010986328e-06, "lookahead_loss": 6.574904029846191, "loss": 4.3095, "step": 488500 }, { "epoch": 0.9326934814453125, "grad_norm": 3.864790916442871, "learning_rate": 3.3654212951660158e-06, "lookahead_loss": 6.567343755722046, "loss": 4.2391, "step": 489000 }, { "epoch": 0.9336471557617188, "grad_norm": 2.6948344707489014, "learning_rate": 3.3177375793457034e-06, "lookahead_loss": 6.545618652820587, "loss": 4.2719, "step": 489500 }, { "epoch": 0.934600830078125, "grad_norm": 2.924546003341675, "learning_rate": 3.2700538635253907e-06, "lookahead_loss": 6.608199277877808, "loss": 4.2549, "step": 490000 }, { "epoch": 0.934600830078125, "eval_accuracy": 0.025788258317025442, "eval_lookahead_loss": 6.518117751216889, "eval_lookahead_perplexity": 677.3023332998308, "eval_loss": 4.047163486480713, "eval_perplexity": 57.23487906888425, "eval_runtime": 528.8104, "eval_samples_per_second": 18.91, "eval_steps_per_second": 4.728, "step": 490000 }, { "epoch": 0.9355545043945312, "grad_norm": 3.337984800338745, "learning_rate": 3.2223701477050784e-06, "lookahead_loss": 6.663796932697296, "loss": 4.2254, "step": 490500 }, { "epoch": 0.9365081787109375, "grad_norm": 2.9918324947357178, "learning_rate": 3.1746864318847656e-06, "lookahead_loss": 6.630043797492981, "loss": 4.2846, "step": 491000 }, { "epoch": 0.9374618530273438, "grad_norm": 2.610915422439575, "learning_rate": 3.1270027160644533e-06, "lookahead_loss": 6.747565757751465, "loss": 4.2305, "step": 491500 }, { "epoch": 0.93841552734375, "grad_norm": 2.86892032623291, "learning_rate": 3.079319000244141e-06, "lookahead_loss": 6.622491578102112, "loss": 4.2428, "step": 492000 }, { "epoch": 0.9393692016601562, "grad_norm": 3.4845874309539795, "learning_rate": 3.031635284423828e-06, "lookahead_loss": 6.789701117515564, "loss": 4.3313, "step": 492500 }, { "epoch": 0.9403228759765625, "grad_norm": 6.423483848571777, "learning_rate": 2.983951568603516e-06, "lookahead_loss": 6.612638689517975, "loss": 4.218, "step": 493000 }, { "epoch": 0.9412765502929688, "grad_norm": 3.1299057006835938, "learning_rate": 2.936267852783203e-06, "lookahead_loss": 6.633204594612121, "loss": 4.2363, "step": 493500 }, { "epoch": 0.942230224609375, "grad_norm": 2.7796661853790283, "learning_rate": 2.8885841369628908e-06, "lookahead_loss": 6.674515191078186, "loss": 4.2526, "step": 494000 }, { "epoch": 0.9431838989257812, "grad_norm": 3.028944253921509, "learning_rate": 2.8409004211425784e-06, "lookahead_loss": 6.6662033867836, "loss": 4.2195, "step": 494500 }, { "epoch": 0.9441375732421875, "grad_norm": 2.6399247646331787, "learning_rate": 2.7932167053222657e-06, "lookahead_loss": 6.593350584030151, "loss": 4.2247, "step": 495000 }, { "epoch": 0.9441375732421875, "eval_accuracy": 0.025922309197651663, "eval_lookahead_loss": 6.517293460464478, "eval_lookahead_perplexity": 676.7442692850069, "eval_loss": 4.044368267059326, "eval_perplexity": 57.07511841039871, "eval_runtime": 509.7374, "eval_samples_per_second": 19.618, "eval_steps_per_second": 4.904, "step": 495000 }, { "epoch": 0.9450912475585938, "grad_norm": 4.027184963226318, "learning_rate": 2.7455329895019534e-06, "lookahead_loss": 6.531048232078552, "loss": 4.2328, "step": 495500 }, { "epoch": 0.946044921875, "grad_norm": 2.9770889282226562, "learning_rate": 2.6978492736816406e-06, "lookahead_loss": 6.547837210655213, "loss": 4.2307, "step": 496000 }, { "epoch": 0.9469985961914062, "grad_norm": 4.159328937530518, "learning_rate": 2.6501655578613283e-06, "lookahead_loss": 6.530126694202423, "loss": 4.1808, "step": 496500 }, { "epoch": 0.9479522705078125, "grad_norm": 3.2738466262817383, "learning_rate": 2.602481842041016e-06, "lookahead_loss": 6.598403035640716, "loss": 4.1957, "step": 497000 }, { "epoch": 0.9489059448242188, "grad_norm": 2.519172430038452, "learning_rate": 2.554798126220703e-06, "lookahead_loss": 6.614674340724945, "loss": 4.156, "step": 497500 }, { "epoch": 0.949859619140625, "grad_norm": 3.1957650184631348, "learning_rate": 2.507114410400391e-06, "lookahead_loss": 6.680439057826995, "loss": 4.186, "step": 498000 }, { "epoch": 0.9508132934570312, "grad_norm": 2.729926347732544, "learning_rate": 2.459430694580078e-06, "lookahead_loss": 6.658263957500457, "loss": 4.2441, "step": 498500 }, { "epoch": 0.9517669677734375, "grad_norm": 3.5596132278442383, "learning_rate": 2.4117469787597658e-06, "lookahead_loss": 6.634872227191925, "loss": 4.2225, "step": 499000 }, { "epoch": 0.9527206420898438, "grad_norm": 20.391313552856445, "learning_rate": 2.3640632629394534e-06, "lookahead_loss": 6.558115773677826, "loss": 4.227, "step": 499500 }, { "epoch": 0.95367431640625, "grad_norm": 2.6300275325775146, "learning_rate": 2.3163795471191407e-06, "lookahead_loss": 6.572777153491974, "loss": 4.206, "step": 500000 }, { "epoch": 0.95367431640625, "eval_accuracy": 0.025671037181996086, "eval_lookahead_loss": 6.515631353664398, "eval_lookahead_perplexity": 675.6203823019493, "eval_loss": 4.0427937507629395, "eval_perplexity": 56.985323416734666, "eval_runtime": 501.0944, "eval_samples_per_second": 19.956, "eval_steps_per_second": 4.989, "step": 500000 }, { "epoch": 1.0009536743164062, "grad_norm": 2.707035541534424, "learning_rate": 2.2686958312988284e-06, "lookahead_loss": 6.620791524887085, "loss": 4.1758, "step": 500500 }, { "epoch": 1.0019073486328125, "grad_norm": 3.0893070697784424, "learning_rate": 2.2210121154785156e-06, "lookahead_loss": 6.671734881401062, "loss": 4.1899, "step": 501000 }, { "epoch": 1.0028610229492188, "grad_norm": 64.37688446044922, "learning_rate": 2.1733283996582033e-06, "lookahead_loss": 6.637036371707916, "loss": 4.2051, "step": 501500 }, { "epoch": 1.003814697265625, "grad_norm": 2.6765806674957275, "learning_rate": 2.125644683837891e-06, "lookahead_loss": 6.684634018421173, "loss": 4.2469, "step": 502000 }, { "epoch": 1.0047683715820312, "grad_norm": 7.367605209350586, "learning_rate": 2.077960968017578e-06, "lookahead_loss": 6.6256292009353634, "loss": 4.1924, "step": 502500 }, { "epoch": 1.0057220458984375, "grad_norm": 5.829056262969971, "learning_rate": 2.030277252197266e-06, "lookahead_loss": 6.6596670169830325, "loss": 4.2121, "step": 503000 }, { "epoch": 1.0066757202148438, "grad_norm": 4.116297245025635, "learning_rate": 1.982593536376953e-06, "lookahead_loss": 6.522253867149353, "loss": 4.1805, "step": 503500 }, { "epoch": 1.00762939453125, "grad_norm": 5.136316299438477, "learning_rate": 1.9349098205566408e-06, "lookahead_loss": 6.688910636901856, "loss": 4.1885, "step": 504000 }, { "epoch": 1.0085830688476562, "grad_norm": 2.7861788272857666, "learning_rate": 1.8872261047363282e-06, "lookahead_loss": 6.537882747650147, "loss": 4.1539, "step": 504500 }, { "epoch": 1.0095367431640625, "grad_norm": 3.776442050933838, "learning_rate": 1.8395423889160157e-06, "lookahead_loss": 6.611950333595276, "loss": 4.1828, "step": 505000 }, { "epoch": 1.0095367431640625, "eval_accuracy": 0.02588825831702544, "eval_lookahead_loss": 6.512920007514953, "eval_lookahead_perplexity": 673.7910227141415, "eval_loss": 4.0404534339904785, "eval_perplexity": 56.852115643550434, "eval_runtime": 575.5183, "eval_samples_per_second": 17.376, "eval_steps_per_second": 4.344, "step": 505000 }, { "epoch": 1.0104904174804688, "grad_norm": 4.793929576873779, "learning_rate": 1.7918586730957031e-06, "lookahead_loss": 6.6328760848045345, "loss": 4.1859, "step": 505500 }, { "epoch": 1.011444091796875, "grad_norm": 3.718944787979126, "learning_rate": 1.7441749572753908e-06, "lookahead_loss": 6.564227255821228, "loss": 4.1922, "step": 506000 }, { "epoch": 1.0123977661132812, "grad_norm": 3.5973196029663086, "learning_rate": 1.6964912414550783e-06, "lookahead_loss": 6.628193510055542, "loss": 4.1763, "step": 506500 }, { "epoch": 1.0133514404296875, "grad_norm": 2.9633047580718994, "learning_rate": 1.6488075256347657e-06, "lookahead_loss": 6.652925055980682, "loss": 4.1712, "step": 507000 }, { "epoch": 1.0143051147460938, "grad_norm": 2.9668164253234863, "learning_rate": 1.6011238098144532e-06, "lookahead_loss": 6.612866282939911, "loss": 4.1882, "step": 507500 }, { "epoch": 1.0152587890625, "grad_norm": 2.6643197536468506, "learning_rate": 1.5534400939941406e-06, "lookahead_loss": 6.612572358608245, "loss": 4.1924, "step": 508000 }, { "epoch": 1.0162124633789062, "grad_norm": 2.606398820877075, "learning_rate": 1.505756378173828e-06, "lookahead_loss": 6.552173396587372, "loss": 4.1802, "step": 508500 }, { "epoch": 1.0171661376953125, "grad_norm": 3.484496593475342, "learning_rate": 1.4580726623535158e-06, "lookahead_loss": 6.68125634765625, "loss": 4.2084, "step": 509000 }, { "epoch": 1.0181198120117188, "grad_norm": 3.119004011154175, "learning_rate": 1.4103889465332032e-06, "lookahead_loss": 6.627913850784302, "loss": 4.1967, "step": 509500 }, { "epoch": 1.019073486328125, "grad_norm": 3.5603859424591064, "learning_rate": 1.3627052307128907e-06, "lookahead_loss": 6.569750081062317, "loss": 4.2161, "step": 510000 }, { "epoch": 1.019073486328125, "eval_accuracy": 0.026020547945205478, "eval_lookahead_loss": 6.51159861907959, "eval_lookahead_perplexity": 672.9012710321304, "eval_loss": 4.038088321685791, "eval_perplexity": 56.717812888448414, "eval_runtime": 502.3215, "eval_samples_per_second": 19.908, "eval_steps_per_second": 4.977, "step": 510000 }, { "epoch": 1.0200271606445312, "grad_norm": 2.334524393081665, "learning_rate": 1.3150215148925781e-06, "lookahead_loss": 6.610077663421631, "loss": 4.2081, "step": 510500 }, { "epoch": 1.0209808349609375, "grad_norm": 2.619579315185547, "learning_rate": 1.2673377990722656e-06, "lookahead_loss": 6.646046039581299, "loss": 4.216, "step": 511000 }, { "epoch": 1.0219345092773438, "grad_norm": 3.795804023742676, "learning_rate": 1.2196540832519533e-06, "lookahead_loss": 6.561614444732666, "loss": 4.2501, "step": 511500 }, { "epoch": 1.02288818359375, "grad_norm": 3.044727087020874, "learning_rate": 1.1719703674316407e-06, "lookahead_loss": 6.7033868436813355, "loss": 4.2763, "step": 512000 }, { "epoch": 1.0238418579101562, "grad_norm": 1.9431852102279663, "learning_rate": 1.1242866516113282e-06, "lookahead_loss": 6.537177822113037, "loss": 4.2366, "step": 512500 }, { "epoch": 1.0247955322265625, "grad_norm": 6.121826171875, "learning_rate": 1.0766029357910156e-06, "lookahead_loss": 6.662478150844574, "loss": 4.2634, "step": 513000 }, { "epoch": 1.0257492065429688, "grad_norm": 1.8391380310058594, "learning_rate": 1.028919219970703e-06, "lookahead_loss": 6.587497451305389, "loss": 4.2565, "step": 513500 }, { "epoch": 1.026702880859375, "grad_norm": 3.1350104808807373, "learning_rate": 9.812355041503908e-07, "lookahead_loss": 6.624988639831543, "loss": 4.2088, "step": 514000 }, { "epoch": 1.0276565551757812, "grad_norm": 2.6663527488708496, "learning_rate": 9.335517883300781e-07, "lookahead_loss": 6.619484189987182, "loss": 4.2247, "step": 514500 }, { "epoch": 1.0286102294921875, "grad_norm": 5.63853645324707, "learning_rate": 8.858680725097657e-07, "lookahead_loss": 6.629350611209869, "loss": 4.244, "step": 515000 }, { "epoch": 1.0286102294921875, "eval_accuracy": 0.026124461839530332, "eval_lookahead_loss": 6.512042020320893, "eval_lookahead_perplexity": 673.1997024485213, "eval_loss": 4.036985874176025, "eval_perplexity": 56.65531893135343, "eval_runtime": 500.4634, "eval_samples_per_second": 19.981, "eval_steps_per_second": 4.995, "step": 515000 }, { "epoch": 1.0295639038085938, "grad_norm": 2.5392985343933105, "learning_rate": 8.381843566894531e-07, "lookahead_loss": 6.665633599281311, "loss": 4.2586, "step": 515500 }, { "epoch": 1.030517578125, "grad_norm": 7.482141494750977, "learning_rate": 7.905006408691407e-07, "lookahead_loss": 6.599335281848908, "loss": 4.2079, "step": 516000 }, { "epoch": 1.0314712524414062, "grad_norm": 2.6055543422698975, "learning_rate": 7.428169250488282e-07, "lookahead_loss": 6.594608627319336, "loss": 4.2429, "step": 516500 }, { "epoch": 1.0324249267578125, "grad_norm": 2.821611166000366, "learning_rate": 6.951332092285156e-07, "lookahead_loss": 6.643828285217285, "loss": 4.2272, "step": 517000 }, { "epoch": 1.0333786010742188, "grad_norm": 3.752774953842163, "learning_rate": 6.474494934082032e-07, "lookahead_loss": 6.598296153068542, "loss": 4.2151, "step": 517500 }, { "epoch": 1.034332275390625, "grad_norm": 2.4729936122894287, "learning_rate": 5.997657775878906e-07, "lookahead_loss": 6.661199746131897, "loss": 4.251, "step": 518000 }, { "epoch": 1.0352859497070312, "grad_norm": 2.958141803741455, "learning_rate": 5.520820617675782e-07, "lookahead_loss": 6.653834413051605, "loss": 4.2081, "step": 518500 }, { "epoch": 1.0362396240234375, "grad_norm": 8.099872589111328, "learning_rate": 5.043983459472657e-07, "lookahead_loss": 6.6293657789230345, "loss": 4.2136, "step": 519000 }, { "epoch": 1.0371932983398438, "grad_norm": 2.9280049800872803, "learning_rate": 4.5671463012695317e-07, "lookahead_loss": 6.565846514701843, "loss": 4.1941, "step": 519500 }, { "epoch": 1.03814697265625, "grad_norm": 4.969724178314209, "learning_rate": 4.0903091430664063e-07, "lookahead_loss": 6.625099413871765, "loss": 4.2112, "step": 520000 }, { "epoch": 1.03814697265625, "eval_accuracy": 0.02610743639921722, "eval_lookahead_loss": 6.511468852710724, "eval_lookahead_perplexity": 672.8139567429353, "eval_loss": 4.036635398864746, "eval_perplexity": 56.635466119978666, "eval_runtime": 508.6173, "eval_samples_per_second": 19.661, "eval_steps_per_second": 4.915, "step": 520000 }, { "epoch": 1.0391006469726562, "grad_norm": 4.252646446228027, "learning_rate": 3.6134719848632814e-07, "lookahead_loss": 6.544254674911499, "loss": 4.2331, "step": 520500 }, { "epoch": 1.0400543212890625, "grad_norm": 2.6441867351531982, "learning_rate": 3.1366348266601565e-07, "lookahead_loss": 6.643053800582885, "loss": 4.2228, "step": 521000 }, { "epoch": 1.0410079956054688, "grad_norm": 3.4916834831237793, "learning_rate": 2.6597976684570316e-07, "lookahead_loss": 6.593183302879334, "loss": 4.2361, "step": 521500 }, { "epoch": 1.041961669921875, "grad_norm": 2.5787572860717773, "learning_rate": 2.1829605102539064e-07, "lookahead_loss": 6.539567858695984, "loss": 4.2099, "step": 522000 }, { "epoch": 1.0429153442382812, "grad_norm": 3.3570804595947266, "learning_rate": 1.7061233520507813e-07, "lookahead_loss": 6.699640050411224, "loss": 4.2079, "step": 522500 }, { "epoch": 1.0438690185546875, "grad_norm": 3.1439714431762695, "learning_rate": 1.2292861938476564e-07, "lookahead_loss": 6.658572237491608, "loss": 4.1607, "step": 523000 }, { "epoch": 1.0448226928710938, "grad_norm": 2.995192050933838, "learning_rate": 7.524490356445312e-08, "lookahead_loss": 6.641252551555634, "loss": 4.209, "step": 523500 }, { "epoch": 1.0457763671875, "grad_norm": 3.3803346157073975, "learning_rate": 2.7561187744140627e-08, "lookahead_loss": 6.607998614788055, "loss": 4.245, "step": 524000 }, { "epoch": 1.04632568359375, "step": 524288, "total_flos": 4.621498060193661e+18, "train_loss": 0.2757485848851502, "train_runtime": 24028.7714, "train_samples_per_second": 87.277, "train_steps_per_second": 21.819 } ], "logging_steps": 500, "max_steps": 524288, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.621498060193661e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }