diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9675 @@ +{ + "best_global_step": null, + "best_metric": 5.227207660675049, + "best_model_checkpoint": null, + "epoch": 1.04632568359375, + "eval_steps": 5000, + "global_step": 524288, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00095367431640625, + "grad_norm": 1.4116827249526978, + "learning_rate": 4.995241165161133e-05, + "lookahead_loss": 82.81035764312745, + "loss": 8.3013, + "step": 500 + }, + { + "epoch": 0.0019073486328125, + "grad_norm": 1.2876181602478027, + "learning_rate": 4.990472793579102e-05, + "lookahead_loss": 68.31142778015136, + "loss": 7.6757, + "step": 1000 + }, + { + "epoch": 0.00286102294921875, + "grad_norm": 1.0290312767028809, + "learning_rate": 4.98570442199707e-05, + "lookahead_loss": 64.17177369689941, + "loss": 7.3512, + "step": 1500 + }, + { + "epoch": 0.003814697265625, + "grad_norm": 1.0730235576629639, + "learning_rate": 4.9809360504150393e-05, + "lookahead_loss": 61.10301218414307, + "loss": 7.1983, + "step": 2000 + }, + { + "epoch": 0.00476837158203125, + "grad_norm": 0.9289631247520447, + "learning_rate": 4.9761676788330084e-05, + "lookahead_loss": 60.02642610168457, + "loss": 7.0463, + "step": 2500 + }, + { + "epoch": 0.0057220458984375, + "grad_norm": 0.9375585317611694, + "learning_rate": 4.971399307250977e-05, + "lookahead_loss": 58.240289306640626, + "loss": 6.9615, + "step": 3000 + }, + { + "epoch": 0.00667572021484375, + "grad_norm": 0.9769118428230286, + "learning_rate": 4.966630935668946e-05, + "lookahead_loss": 55.355101417541505, + "loss": 6.8558, + "step": 3500 + }, + { + "epoch": 0.00762939453125, + "grad_norm": 0.9649014472961426, + "learning_rate": 4.961862564086914e-05, + "lookahead_loss": 55.96681627655029, + "loss": 6.7903, + "step": 4000 + }, + { + "epoch": 0.00858306884765625, + "grad_norm": 1.0386439561843872, + "learning_rate": 4.957094192504883e-05, + "lookahead_loss": 54.144768939971925, + "loss": 6.6642, + "step": 4500 + }, + { + "epoch": 0.0095367431640625, + "grad_norm": 0.9463391900062561, + "learning_rate": 4.952325820922852e-05, + "lookahead_loss": 53.60892930603028, + "loss": 6.6913, + "step": 5000 + }, + { + "epoch": 0.0095367431640625, + "eval_accuracy": 0.028216438356164383, + "eval_lookahead_loss": 39.30436083450317, + "eval_lookahead_perplexity": 1.1739971028046022e+17, + "eval_loss": 6.57836389541626, + "eval_perplexity": 719.3614154076851, + "eval_runtime": 487.6738, + "eval_samples_per_second": 20.506, + "eval_steps_per_second": 5.126, + "step": 5000 + }, + { + "epoch": 0.01049041748046875, + "grad_norm": 1.0929338932037354, + "learning_rate": 4.9475574493408205e-05, + "lookahead_loss": 53.01767923355103, + "loss": 6.64, + "step": 5500 + }, + { + "epoch": 0.011444091796875, + "grad_norm": 1.0882351398468018, + "learning_rate": 4.9427890777587895e-05, + "lookahead_loss": 51.597215480804444, + "loss": 6.664, + "step": 6000 + }, + { + "epoch": 0.01239776611328125, + "grad_norm": 1.0210477113723755, + "learning_rate": 4.938020706176758e-05, + "lookahead_loss": 51.8494676399231, + "loss": 6.6109, + "step": 6500 + }, + { + "epoch": 0.0133514404296875, + "grad_norm": 1.0666073560714722, + "learning_rate": 4.933252334594727e-05, + "lookahead_loss": 50.77286548614502, + "loss": 6.563, + "step": 7000 + }, + { + "epoch": 0.01430511474609375, + "grad_norm": 1.1284505128860474, + "learning_rate": 4.928483963012696e-05, + "lookahead_loss": 49.70681496810913, + "loss": 6.5286, + "step": 7500 + }, + { + "epoch": 0.0152587890625, + "grad_norm": 1.0259513854980469, + "learning_rate": 4.923715591430664e-05, + "lookahead_loss": 48.283052101135254, + "loss": 6.5192, + "step": 8000 + }, + { + "epoch": 0.01621246337890625, + "grad_norm": 1.13589346408844, + "learning_rate": 4.918947219848633e-05, + "lookahead_loss": 48.01224088668823, + "loss": 6.4496, + "step": 8500 + }, + { + "epoch": 0.0171661376953125, + "grad_norm": 1.4678149223327637, + "learning_rate": 4.9141788482666016e-05, + "lookahead_loss": 47.74653625869751, + "loss": 6.4044, + "step": 9000 + }, + { + "epoch": 0.01811981201171875, + "grad_norm": 1.2191615104675293, + "learning_rate": 4.9094104766845706e-05, + "lookahead_loss": 46.88958682250976, + "loss": 6.3996, + "step": 9500 + }, + { + "epoch": 0.019073486328125, + "grad_norm": 1.2123606204986572, + "learning_rate": 4.9046421051025396e-05, + "lookahead_loss": 45.81951342010498, + "loss": 6.3853, + "step": 10000 + }, + { + "epoch": 0.019073486328125, + "eval_accuracy": 0.02893287671232877, + "eval_lookahead_loss": 30.3654480884552, + "eval_lookahead_perplexity": 15400915759075.514, + "eval_loss": 6.288400650024414, + "eval_perplexity": 538.2917234091022, + "eval_runtime": 482.4341, + "eval_samples_per_second": 20.728, + "eval_steps_per_second": 5.182, + "step": 10000 + }, + { + "epoch": 0.02002716064453125, + "grad_norm": 1.4043011665344238, + "learning_rate": 4.899873733520508e-05, + "lookahead_loss": 46.1119024848938, + "loss": 6.328, + "step": 10500 + }, + { + "epoch": 0.0209808349609375, + "grad_norm": 1.4673211574554443, + "learning_rate": 4.895105361938477e-05, + "lookahead_loss": 45.123251277923586, + "loss": 6.3419, + "step": 11000 + }, + { + "epoch": 0.02193450927734375, + "grad_norm": 1.6025383472442627, + "learning_rate": 4.890336990356445e-05, + "lookahead_loss": 43.82665083312988, + "loss": 6.3494, + "step": 11500 + }, + { + "epoch": 0.02288818359375, + "grad_norm": 1.453022837638855, + "learning_rate": 4.8855686187744143e-05, + "lookahead_loss": 44.45840075302124, + "loss": 6.3942, + "step": 12000 + }, + { + "epoch": 0.02384185791015625, + "grad_norm": 2.0204365253448486, + "learning_rate": 4.8808002471923834e-05, + "lookahead_loss": 42.38063139724731, + "loss": 6.3276, + "step": 12500 + }, + { + "epoch": 0.0247955322265625, + "grad_norm": 1.7605488300323486, + "learning_rate": 4.876031875610352e-05, + "lookahead_loss": 43.250685325622555, + "loss": 6.3492, + "step": 13000 + }, + { + "epoch": 0.02574920654296875, + "grad_norm": 1.6546428203582764, + "learning_rate": 4.871263504028321e-05, + "lookahead_loss": 42.73258158874512, + "loss": 6.274, + "step": 13500 + }, + { + "epoch": 0.026702880859375, + "grad_norm": 2.7032573223114014, + "learning_rate": 4.866495132446289e-05, + "lookahead_loss": 41.79551118850708, + "loss": 6.2586, + "step": 14000 + }, + { + "epoch": 0.02765655517578125, + "grad_norm": 2.255749464035034, + "learning_rate": 4.861726760864258e-05, + "lookahead_loss": 40.578983207702635, + "loss": 6.2715, + "step": 14500 + }, + { + "epoch": 0.0286102294921875, + "grad_norm": 2.4507906436920166, + "learning_rate": 4.856958389282227e-05, + "lookahead_loss": 39.66463244628906, + "loss": 6.2629, + "step": 15000 + }, + { + "epoch": 0.0286102294921875, + "eval_accuracy": 0.030517808219178082, + "eval_lookahead_loss": 26.564950465011595, + "eval_lookahead_perplexity": 344358969082.55023, + "eval_loss": 6.131886959075928, + "eval_perplexity": 460.3039163653932, + "eval_runtime": 484.8825, + "eval_samples_per_second": 20.624, + "eval_steps_per_second": 5.156, + "step": 15000 + }, + { + "epoch": 0.02956390380859375, + "grad_norm": 2.1231741905212402, + "learning_rate": 4.8521900177001955e-05, + "lookahead_loss": 40.00202400588989, + "loss": 6.2789, + "step": 15500 + }, + { + "epoch": 0.030517578125, + "grad_norm": 2.702622652053833, + "learning_rate": 4.8474216461181645e-05, + "lookahead_loss": 38.220937404632565, + "loss": 6.2119, + "step": 16000 + }, + { + "epoch": 0.03147125244140625, + "grad_norm": 2.533889055252075, + "learning_rate": 4.842653274536133e-05, + "lookahead_loss": 37.03616960906982, + "loss": 6.1943, + "step": 16500 + }, + { + "epoch": 0.0324249267578125, + "grad_norm": 2.6587507724761963, + "learning_rate": 4.837884902954102e-05, + "lookahead_loss": 36.206924369812015, + "loss": 6.1443, + "step": 17000 + }, + { + "epoch": 0.03337860107421875, + "grad_norm": 3.694762706756592, + "learning_rate": 4.833116531372071e-05, + "lookahead_loss": 34.99742555618286, + "loss": 6.1853, + "step": 17500 + }, + { + "epoch": 0.034332275390625, + "grad_norm": 3.002371072769165, + "learning_rate": 4.828348159790039e-05, + "lookahead_loss": 34.22952080154419, + "loss": 6.2214, + "step": 18000 + }, + { + "epoch": 0.03528594970703125, + "grad_norm": 3.8106696605682373, + "learning_rate": 4.823579788208008e-05, + "lookahead_loss": 32.58342349052429, + "loss": 6.1587, + "step": 18500 + }, + { + "epoch": 0.0362396240234375, + "grad_norm": 4.531000137329102, + "learning_rate": 4.8188114166259766e-05, + "lookahead_loss": 31.428825771331788, + "loss": 6.1431, + "step": 19000 + }, + { + "epoch": 0.03719329833984375, + "grad_norm": 4.729257106781006, + "learning_rate": 4.8140430450439456e-05, + "lookahead_loss": 29.815019004821778, + "loss": 6.1204, + "step": 19500 + }, + { + "epoch": 0.03814697265625, + "grad_norm": 4.6506500244140625, + "learning_rate": 4.8092746734619146e-05, + "lookahead_loss": 28.273008794784545, + "loss": 6.1226, + "step": 20000 + }, + { + "epoch": 0.03814697265625, + "eval_accuracy": 0.030854011741682976, + "eval_lookahead_loss": 20.776716146469116, + "eval_lookahead_perplexity": 1054904571.7535434, + "eval_loss": 6.015466213226318, + "eval_perplexity": 409.7168097923695, + "eval_runtime": 482.4324, + "eval_samples_per_second": 20.728, + "eval_steps_per_second": 5.182, + "step": 20000 + }, + { + "epoch": 0.03910064697265625, + "grad_norm": 4.7437052726745605, + "learning_rate": 4.804506301879883e-05, + "lookahead_loss": 26.21044944190979, + "loss": 6.1201, + "step": 20500 + }, + { + "epoch": 0.0400543212890625, + "grad_norm": 5.321864128112793, + "learning_rate": 4.799737930297852e-05, + "lookahead_loss": 25.452267398834227, + "loss": 6.1125, + "step": 21000 + }, + { + "epoch": 0.04100799560546875, + "grad_norm": 5.394619941711426, + "learning_rate": 4.79496955871582e-05, + "lookahead_loss": 23.53945638847351, + "loss": 6.076, + "step": 21500 + }, + { + "epoch": 0.041961669921875, + "grad_norm": 6.435363292694092, + "learning_rate": 4.7902011871337893e-05, + "lookahead_loss": 22.008990169525145, + "loss": 6.0921, + "step": 22000 + }, + { + "epoch": 0.04291534423828125, + "grad_norm": 7.579057216644287, + "learning_rate": 4.7854328155517584e-05, + "lookahead_loss": 21.273363492965697, + "loss": 6.0541, + "step": 22500 + }, + { + "epoch": 0.0438690185546875, + "grad_norm": 10.112333297729492, + "learning_rate": 4.780664443969727e-05, + "lookahead_loss": 19.90719951057434, + "loss": 6.0458, + "step": 23000 + }, + { + "epoch": 0.04482269287109375, + "grad_norm": 4.892643451690674, + "learning_rate": 4.775896072387696e-05, + "lookahead_loss": 18.471449033737183, + "loss": 6.014, + "step": 23500 + }, + { + "epoch": 0.0457763671875, + "grad_norm": 9.228553771972656, + "learning_rate": 4.771127700805664e-05, + "lookahead_loss": 17.00941694831848, + "loss": 6.0572, + "step": 24000 + }, + { + "epoch": 0.04673004150390625, + "grad_norm": 6.995558738708496, + "learning_rate": 4.766359329223633e-05, + "lookahead_loss": 16.560600999832154, + "loss": 6.0575, + "step": 24500 + }, + { + "epoch": 0.0476837158203125, + "grad_norm": 13.87187385559082, + "learning_rate": 4.761590957641602e-05, + "lookahead_loss": 15.213032215118409, + "loss": 5.9646, + "step": 25000 + }, + { + "epoch": 0.0476837158203125, + "eval_accuracy": 0.03215538160469667, + "eval_lookahead_loss": 12.763101608467101, + "eval_lookahead_perplexity": 349095.7812825452, + "eval_loss": 5.912298202514648, + "eval_perplexity": 369.55449127330263, + "eval_runtime": 493.6869, + "eval_samples_per_second": 20.256, + "eval_steps_per_second": 5.064, + "step": 25000 + }, + { + "epoch": 0.04863739013671875, + "grad_norm": 7.327078819274902, + "learning_rate": 4.7568225860595705e-05, + "lookahead_loss": 14.443667469978333, + "loss": 6.0208, + "step": 25500 + }, + { + "epoch": 0.049591064453125, + "grad_norm": 8.537580490112305, + "learning_rate": 4.7520542144775395e-05, + "lookahead_loss": 13.591373314857483, + "loss": 5.9831, + "step": 26000 + }, + { + "epoch": 0.05054473876953125, + "grad_norm": 9.2998685836792, + "learning_rate": 4.747285842895508e-05, + "lookahead_loss": 12.583511290550232, + "loss": 5.9738, + "step": 26500 + }, + { + "epoch": 0.0514984130859375, + "grad_norm": 11.028800964355469, + "learning_rate": 4.742517471313477e-05, + "lookahead_loss": 12.073891931533813, + "loss": 5.9818, + "step": 27000 + }, + { + "epoch": 0.05245208740234375, + "grad_norm": 13.256938934326172, + "learning_rate": 4.737749099731446e-05, + "lookahead_loss": 11.761963409423828, + "loss": 5.9722, + "step": 27500 + }, + { + "epoch": 0.05340576171875, + "grad_norm": 11.946344375610352, + "learning_rate": 4.732980728149414e-05, + "lookahead_loss": 11.185643421173095, + "loss": 5.9695, + "step": 28000 + }, + { + "epoch": 0.05435943603515625, + "grad_norm": 11.591967582702637, + "learning_rate": 4.728212356567383e-05, + "lookahead_loss": 10.651180962562561, + "loss": 5.9535, + "step": 28500 + }, + { + "epoch": 0.0553131103515625, + "grad_norm": 9.303091049194336, + "learning_rate": 4.7234439849853516e-05, + "lookahead_loss": 10.13999552822113, + "loss": 5.902, + "step": 29000 + }, + { + "epoch": 0.05626678466796875, + "grad_norm": 12.996500968933105, + "learning_rate": 4.7186756134033206e-05, + "lookahead_loss": 9.940039359092712, + "loss": 5.9458, + "step": 29500 + }, + { + "epoch": 0.057220458984375, + "grad_norm": 9.543191909790039, + "learning_rate": 4.7139072418212896e-05, + "lookahead_loss": 9.64600991344452, + "loss": 5.9014, + "step": 30000 + }, + { + "epoch": 0.057220458984375, + "eval_accuracy": 0.030647553816046967, + "eval_lookahead_loss": 9.189016084861755, + "eval_lookahead_perplexity": 9789.01467959859, + "eval_loss": 5.922324180603027, + "eval_perplexity": 373.2782725873048, + "eval_runtime": 492.5463, + "eval_samples_per_second": 20.303, + "eval_steps_per_second": 5.076, + "step": 30000 + }, + { + "epoch": 0.05817413330078125, + "grad_norm": 11.830968856811523, + "learning_rate": 4.709138870239258e-05, + "lookahead_loss": 9.448507949829102, + "loss": 5.8861, + "step": 30500 + }, + { + "epoch": 0.0591278076171875, + "grad_norm": 13.017300605773926, + "learning_rate": 4.704370498657227e-05, + "lookahead_loss": 9.281733512878418, + "loss": 5.867, + "step": 31000 + }, + { + "epoch": 0.06008148193359375, + "grad_norm": 15.906122207641602, + "learning_rate": 4.699602127075195e-05, + "lookahead_loss": 9.350438363075256, + "loss": 5.9964, + "step": 31500 + }, + { + "epoch": 0.06103515625, + "grad_norm": 13.458169937133789, + "learning_rate": 4.6948337554931643e-05, + "lookahead_loss": 9.306318830490111, + "loss": 5.9802, + "step": 32000 + }, + { + "epoch": 0.06198883056640625, + "grad_norm": 12.834677696228027, + "learning_rate": 4.6900653839111334e-05, + "lookahead_loss": 9.121820962905884, + "loss": 5.9456, + "step": 32500 + }, + { + "epoch": 0.0629425048828125, + "grad_norm": 22.344593048095703, + "learning_rate": 4.685297012329102e-05, + "lookahead_loss": 9.114763659477234, + "loss": 5.877, + "step": 33000 + }, + { + "epoch": 0.06389617919921875, + "grad_norm": 24.45722198486328, + "learning_rate": 4.680528640747071e-05, + "lookahead_loss": 9.237862285614014, + "loss": 5.9893, + "step": 33500 + }, + { + "epoch": 0.064849853515625, + "grad_norm": 34.96940994262695, + "learning_rate": 4.675760269165039e-05, + "lookahead_loss": 9.279925588607789, + "loss": 5.9082, + "step": 34000 + }, + { + "epoch": 0.06580352783203125, + "grad_norm": 18.211742401123047, + "learning_rate": 4.670991897583008e-05, + "lookahead_loss": 9.224804194450378, + "loss": 5.9579, + "step": 34500 + }, + { + "epoch": 0.0667572021484375, + "grad_norm": 23.321969985961914, + "learning_rate": 4.666223526000977e-05, + "lookahead_loss": 9.255750324249268, + "loss": 5.9364, + "step": 35000 + }, + { + "epoch": 0.0667572021484375, + "eval_accuracy": 0.03256634050880626, + "eval_lookahead_loss": 8.846821884918214, + "eval_lookahead_perplexity": 6952.258744163133, + "eval_loss": 5.836491584777832, + "eval_perplexity": 342.5753333069899, + "eval_runtime": 505.5101, + "eval_samples_per_second": 19.782, + "eval_steps_per_second": 4.945, + "step": 35000 + }, + { + "epoch": 0.06771087646484375, + "grad_norm": 19.004629135131836, + "learning_rate": 4.6614551544189455e-05, + "lookahead_loss": 9.160115962028504, + "loss": 5.9392, + "step": 35500 + }, + { + "epoch": 0.06866455078125, + "grad_norm": 30.239704132080078, + "learning_rate": 4.6566867828369145e-05, + "lookahead_loss": 9.138649526596069, + "loss": 5.9315, + "step": 36000 + }, + { + "epoch": 0.06961822509765625, + "grad_norm": 30.81173324584961, + "learning_rate": 4.651918411254883e-05, + "lookahead_loss": 9.10984047794342, + "loss": 5.9633, + "step": 36500 + }, + { + "epoch": 0.0705718994140625, + "grad_norm": 43.37763977050781, + "learning_rate": 4.647150039672852e-05, + "lookahead_loss": 9.122330055236816, + "loss": 5.8919, + "step": 37000 + }, + { + "epoch": 0.07152557373046875, + "grad_norm": 41.92087936401367, + "learning_rate": 4.642381668090821e-05, + "lookahead_loss": 9.035297988891601, + "loss": 5.9283, + "step": 37500 + }, + { + "epoch": 0.072479248046875, + "grad_norm": 36.329612731933594, + "learning_rate": 4.637613296508789e-05, + "lookahead_loss": 9.111817368507385, + "loss": 5.9298, + "step": 38000 + }, + { + "epoch": 0.07343292236328125, + "grad_norm": 28.321809768676758, + "learning_rate": 4.632844924926758e-05, + "lookahead_loss": 8.965152152061462, + "loss": 5.8935, + "step": 38500 + }, + { + "epoch": 0.0743865966796875, + "grad_norm": 32.70054244995117, + "learning_rate": 4.6280765533447266e-05, + "lookahead_loss": 9.196262484550475, + "loss": 5.9474, + "step": 39000 + }, + { + "epoch": 0.07534027099609375, + "grad_norm": 33.936851501464844, + "learning_rate": 4.6233081817626956e-05, + "lookahead_loss": 8.78305698108673, + "loss": 5.9013, + "step": 39500 + }, + { + "epoch": 0.0762939453125, + "grad_norm": 25.9757080078125, + "learning_rate": 4.6185398101806646e-05, + "lookahead_loss": 8.965730987548827, + "loss": 5.9101, + "step": 40000 + }, + { + "epoch": 0.0762939453125, + "eval_accuracy": 0.03441643835616438, + "eval_lookahead_loss": 8.572307312774658, + "eval_lookahead_perplexity": 5283.305977029776, + "eval_loss": 5.800355434417725, + "eval_perplexity": 330.4169806078424, + "eval_runtime": 504.4248, + "eval_samples_per_second": 19.825, + "eval_steps_per_second": 4.956, + "step": 40000 + }, + { + "epoch": 0.07724761962890625, + "grad_norm": 50.818946838378906, + "learning_rate": 4.613771438598633e-05, + "lookahead_loss": 8.835840244293212, + "loss": 5.9191, + "step": 40500 + }, + { + "epoch": 0.0782012939453125, + "grad_norm": 18.649166107177734, + "learning_rate": 4.609003067016602e-05, + "lookahead_loss": 8.888431717872619, + "loss": 5.9168, + "step": 41000 + }, + { + "epoch": 0.07915496826171875, + "grad_norm": 38.28921127319336, + "learning_rate": 4.60423469543457e-05, + "lookahead_loss": 8.905545147895813, + "loss": 5.9119, + "step": 41500 + }, + { + "epoch": 0.080108642578125, + "grad_norm": 23.086090087890625, + "learning_rate": 4.5994663238525393e-05, + "lookahead_loss": 8.75807586479187, + "loss": 5.9238, + "step": 42000 + }, + { + "epoch": 0.08106231689453125, + "grad_norm": 31.405933380126953, + "learning_rate": 4.5946979522705084e-05, + "lookahead_loss": 8.705488247871399, + "loss": 5.8646, + "step": 42500 + }, + { + "epoch": 0.0820159912109375, + "grad_norm": 34.23134994506836, + "learning_rate": 4.589929580688477e-05, + "lookahead_loss": 9.027087542533874, + "loss": 5.8998, + "step": 43000 + }, + { + "epoch": 0.08296966552734375, + "grad_norm": 26.862382888793945, + "learning_rate": 4.585161209106446e-05, + "lookahead_loss": 8.785642810821534, + "loss": 5.8714, + "step": 43500 + }, + { + "epoch": 0.08392333984375, + "grad_norm": 33.30260467529297, + "learning_rate": 4.580392837524414e-05, + "lookahead_loss": 9.144563898086547, + "loss": 5.8988, + "step": 44000 + }, + { + "epoch": 0.08487701416015625, + "grad_norm": 31.97353172302246, + "learning_rate": 4.575624465942383e-05, + "lookahead_loss": 9.052102692604064, + "loss": 5.8992, + "step": 44500 + }, + { + "epoch": 0.0858306884765625, + "grad_norm": 31.309261322021484, + "learning_rate": 4.570856094360352e-05, + "lookahead_loss": 8.816824792861938, + "loss": 5.8641, + "step": 45000 + }, + { + "epoch": 0.0858306884765625, + "eval_accuracy": 0.034044618395303324, + "eval_lookahead_loss": 8.859428939819336, + "eval_lookahead_perplexity": 7040.4610693951345, + "eval_loss": 5.783834934234619, + "eval_perplexity": 325.0031693845403, + "eval_runtime": 492.3333, + "eval_samples_per_second": 20.311, + "eval_steps_per_second": 5.078, + "step": 45000 + }, + { + "epoch": 0.08678436279296875, + "grad_norm": 29.05169677734375, + "learning_rate": 4.5660877227783205e-05, + "lookahead_loss": 8.970363302230835, + "loss": 5.8839, + "step": 45500 + }, + { + "epoch": 0.087738037109375, + "grad_norm": 24.335325241088867, + "learning_rate": 4.5613193511962895e-05, + "lookahead_loss": 8.678368333816529, + "loss": 5.8838, + "step": 46000 + }, + { + "epoch": 0.08869171142578125, + "grad_norm": 31.98122215270996, + "learning_rate": 4.556550979614258e-05, + "lookahead_loss": 8.860163199424743, + "loss": 5.8418, + "step": 46500 + }, + { + "epoch": 0.0896453857421875, + "grad_norm": 42.879005432128906, + "learning_rate": 4.551782608032227e-05, + "lookahead_loss": 8.727448391914368, + "loss": 5.8758, + "step": 47000 + }, + { + "epoch": 0.09059906005859375, + "grad_norm": 39.7878532409668, + "learning_rate": 4.547014236450196e-05, + "lookahead_loss": 8.852768278121948, + "loss": 5.7925, + "step": 47500 + }, + { + "epoch": 0.091552734375, + "grad_norm": 76.18293762207031, + "learning_rate": 4.542245864868164e-05, + "lookahead_loss": 8.892733434677124, + "loss": 5.8307, + "step": 48000 + }, + { + "epoch": 0.09250640869140625, + "grad_norm": 35.2605094909668, + "learning_rate": 4.537477493286133e-05, + "lookahead_loss": 8.871909951210021, + "loss": 5.7826, + "step": 48500 + }, + { + "epoch": 0.0934600830078125, + "grad_norm": 34.7687873840332, + "learning_rate": 4.5327091217041016e-05, + "lookahead_loss": 8.86015606212616, + "loss": 5.8403, + "step": 49000 + }, + { + "epoch": 0.09441375732421875, + "grad_norm": 21.99600601196289, + "learning_rate": 4.5279407501220706e-05, + "lookahead_loss": 8.911193285942078, + "loss": 5.8598, + "step": 49500 + }, + { + "epoch": 0.095367431640625, + "grad_norm": 117.01227569580078, + "learning_rate": 4.523172378540039e-05, + "lookahead_loss": 9.122151438713074, + "loss": 5.9081, + "step": 50000 + }, + { + "epoch": 0.095367431640625, + "eval_accuracy": 0.03578101761252446, + "eval_lookahead_loss": 8.898765789604187, + "eval_lookahead_perplexity": 7322.92992320255, + "eval_loss": 5.762501239776611, + "eval_perplexity": 318.1430867067312, + "eval_runtime": 487.3894, + "eval_samples_per_second": 20.517, + "eval_steps_per_second": 5.129, + "step": 50000 + }, + { + "epoch": 0.09632110595703125, + "grad_norm": 43.02692413330078, + "learning_rate": 4.518404006958008e-05, + "lookahead_loss": 8.807118844985961, + "loss": 5.944, + "step": 50500 + }, + { + "epoch": 0.0972747802734375, + "grad_norm": 64.49649047851562, + "learning_rate": 4.513635635375977e-05, + "lookahead_loss": 9.025630756378174, + "loss": 5.8835, + "step": 51000 + }, + { + "epoch": 0.09822845458984375, + "grad_norm": 46.78428649902344, + "learning_rate": 4.508867263793945e-05, + "lookahead_loss": 9.233048422813415, + "loss": 5.9366, + "step": 51500 + }, + { + "epoch": 0.09918212890625, + "grad_norm": 45.5450439453125, + "learning_rate": 4.5040988922119143e-05, + "lookahead_loss": 9.022085003852844, + "loss": 5.8719, + "step": 52000 + }, + { + "epoch": 0.10013580322265625, + "grad_norm": 104.69728088378906, + "learning_rate": 4.499330520629883e-05, + "lookahead_loss": 8.788162070274353, + "loss": 5.9298, + "step": 52500 + }, + { + "epoch": 0.1010894775390625, + "grad_norm": 64.52140808105469, + "learning_rate": 4.494562149047852e-05, + "lookahead_loss": 9.41310000896454, + "loss": 5.9314, + "step": 53000 + }, + { + "epoch": 0.10204315185546875, + "grad_norm": 91.63325500488281, + "learning_rate": 4.489793777465821e-05, + "lookahead_loss": 8.9484829454422, + "loss": 5.846, + "step": 53500 + }, + { + "epoch": 0.102996826171875, + "grad_norm": 100.22386932373047, + "learning_rate": 4.485025405883789e-05, + "lookahead_loss": 9.143427966117859, + "loss": 5.9138, + "step": 54000 + }, + { + "epoch": 0.10395050048828125, + "grad_norm": 154.89105224609375, + "learning_rate": 4.480257034301758e-05, + "lookahead_loss": 8.977231894493103, + "loss": 5.835, + "step": 54500 + }, + { + "epoch": 0.1049041748046875, + "grad_norm": 66.59415435791016, + "learning_rate": 4.4754886627197264e-05, + "lookahead_loss": 8.884087010383606, + "loss": 5.9661, + "step": 55000 + }, + { + "epoch": 0.1049041748046875, + "eval_accuracy": 0.035612720156555776, + "eval_lookahead_loss": 8.63708973712921, + "eval_lookahead_perplexity": 5636.901066149716, + "eval_loss": 5.780404567718506, + "eval_perplexity": 323.8901994334777, + "eval_runtime": 501.3627, + "eval_samples_per_second": 19.946, + "eval_steps_per_second": 4.986, + "step": 55000 + }, + { + "epoch": 0.10585784912109375, + "grad_norm": 47.204345703125, + "learning_rate": 4.4707202911376955e-05, + "lookahead_loss": 8.955890634536743, + "loss": 5.9263, + "step": 55500 + }, + { + "epoch": 0.1068115234375, + "grad_norm": 125.44088745117188, + "learning_rate": 4.4659519195556645e-05, + "lookahead_loss": 9.201303124427795, + "loss": 5.8797, + "step": 56000 + }, + { + "epoch": 0.10776519775390625, + "grad_norm": 93.1299819946289, + "learning_rate": 4.461183547973633e-05, + "lookahead_loss": 9.191447956085206, + "loss": 5.9494, + "step": 56500 + }, + { + "epoch": 0.1087188720703125, + "grad_norm": 136.57327270507812, + "learning_rate": 4.456415176391602e-05, + "lookahead_loss": 9.322702138900757, + "loss": 5.9525, + "step": 57000 + }, + { + "epoch": 0.10967254638671875, + "grad_norm": 84.23783111572266, + "learning_rate": 4.45164680480957e-05, + "lookahead_loss": 9.275324690818787, + "loss": 5.9063, + "step": 57500 + }, + { + "epoch": 0.110626220703125, + "grad_norm": 125.72661590576172, + "learning_rate": 4.446878433227539e-05, + "lookahead_loss": 9.426334665298462, + "loss": 5.7226, + "step": 58000 + }, + { + "epoch": 0.11157989501953125, + "grad_norm": 69.1849136352539, + "learning_rate": 4.442110061645508e-05, + "lookahead_loss": 9.873576466560364, + "loss": 6.0143, + "step": 58500 + }, + { + "epoch": 0.1125335693359375, + "grad_norm": 111.868408203125, + "learning_rate": 4.4373416900634766e-05, + "lookahead_loss": 8.999039393424988, + "loss": 6.0086, + "step": 59000 + }, + { + "epoch": 0.11348724365234375, + "grad_norm": 202.96800231933594, + "learning_rate": 4.4325733184814456e-05, + "lookahead_loss": 8.854657793045044, + "loss": 5.9327, + "step": 59500 + }, + { + "epoch": 0.11444091796875, + "grad_norm": 179.3572540283203, + "learning_rate": 4.427804946899414e-05, + "lookahead_loss": 9.268710127830506, + "loss": 5.9456, + "step": 60000 + }, + { + "epoch": 0.11444091796875, + "eval_accuracy": 0.03692035225048924, + "eval_lookahead_loss": 8.799783656311035, + "eval_lookahead_perplexity": 6632.808884701421, + "eval_loss": 5.810815334320068, + "eval_perplexity": 333.8912477177071, + "eval_runtime": 485.4069, + "eval_samples_per_second": 20.601, + "eval_steps_per_second": 5.15, + "step": 60000 + }, + { + "epoch": 0.11539459228515625, + "grad_norm": 52.02401351928711, + "learning_rate": 4.423036575317383e-05, + "lookahead_loss": 9.230049908638, + "loss": 5.9305, + "step": 60500 + }, + { + "epoch": 0.1163482666015625, + "grad_norm": 91.58668518066406, + "learning_rate": 4.418268203735352e-05, + "lookahead_loss": 9.1432270154953, + "loss": 5.9319, + "step": 61000 + }, + { + "epoch": 0.11730194091796875, + "grad_norm": 71.80653381347656, + "learning_rate": 4.41349983215332e-05, + "lookahead_loss": 8.751217803001405, + "loss": 5.8942, + "step": 61500 + }, + { + "epoch": 0.118255615234375, + "grad_norm": 172.49195861816406, + "learning_rate": 4.4087314605712893e-05, + "lookahead_loss": 9.048080438613892, + "loss": 5.9637, + "step": 62000 + }, + { + "epoch": 0.11920928955078125, + "grad_norm": 168.48675537109375, + "learning_rate": 4.403963088989258e-05, + "lookahead_loss": 8.998857446670533, + "loss": 5.9397, + "step": 62500 + }, + { + "epoch": 0.1201629638671875, + "grad_norm": 138.5640106201172, + "learning_rate": 4.399194717407227e-05, + "lookahead_loss": 8.808494299888611, + "loss": 5.9974, + "step": 63000 + }, + { + "epoch": 0.12111663818359375, + "grad_norm": 141.08485412597656, + "learning_rate": 4.394426345825196e-05, + "lookahead_loss": 9.133205205917358, + "loss": 5.9483, + "step": 63500 + }, + { + "epoch": 0.1220703125, + "grad_norm": 134.65687561035156, + "learning_rate": 4.389657974243164e-05, + "lookahead_loss": 9.107861867904663, + "loss": 5.8929, + "step": 64000 + }, + { + "epoch": 0.12302398681640625, + "grad_norm": 64.50765991210938, + "learning_rate": 4.384889602661133e-05, + "lookahead_loss": 8.778854193687438, + "loss": 5.9598, + "step": 64500 + }, + { + "epoch": 0.1239776611328125, + "grad_norm": 55.1839485168457, + "learning_rate": 4.3801212310791014e-05, + "lookahead_loss": 9.052451465606689, + "loss": 5.9131, + "step": 65000 + }, + { + "epoch": 0.1239776611328125, + "eval_accuracy": 0.038159491193737766, + "eval_lookahead_loss": 8.971788196372986, + "eval_lookahead_perplexity": 7877.675849359319, + "eval_loss": 5.82460355758667, + "eval_perplexity": 338.5269000506194, + "eval_runtime": 488.0048, + "eval_samples_per_second": 20.492, + "eval_steps_per_second": 5.123, + "step": 65000 + }, + { + "epoch": 0.12493133544921875, + "grad_norm": 100.2865982055664, + "learning_rate": 4.3753528594970705e-05, + "lookahead_loss": 8.985530485153198, + "loss": 5.9007, + "step": 65500 + }, + { + "epoch": 0.125885009765625, + "grad_norm": 124.38475799560547, + "learning_rate": 4.3705844879150395e-05, + "lookahead_loss": 8.922111594200134, + "loss": 5.8973, + "step": 66000 + }, + { + "epoch": 0.12683868408203125, + "grad_norm": 126.8738784790039, + "learning_rate": 4.365816116333008e-05, + "lookahead_loss": 9.228840293884277, + "loss": 5.8598, + "step": 66500 + }, + { + "epoch": 0.1277923583984375, + "grad_norm": 75.08271789550781, + "learning_rate": 4.361047744750977e-05, + "lookahead_loss": 8.87219711780548, + "loss": 5.8894, + "step": 67000 + }, + { + "epoch": 0.12874603271484375, + "grad_norm": 90.62391662597656, + "learning_rate": 4.356279373168945e-05, + "lookahead_loss": 9.068375196456909, + "loss": 5.938, + "step": 67500 + }, + { + "epoch": 0.12969970703125, + "grad_norm": 370.620849609375, + "learning_rate": 4.351511001586914e-05, + "lookahead_loss": 8.772779585838318, + "loss": 6.0352, + "step": 68000 + }, + { + "epoch": 0.13065338134765625, + "grad_norm": 79.76567840576172, + "learning_rate": 4.346742630004883e-05, + "lookahead_loss": 9.151159704208373, + "loss": 6.0171, + "step": 68500 + }, + { + "epoch": 0.1316070556640625, + "grad_norm": 31.362693786621094, + "learning_rate": 4.3419742584228516e-05, + "lookahead_loss": 8.853398272514344, + "loss": 5.9833, + "step": 69000 + }, + { + "epoch": 0.13256072998046875, + "grad_norm": 101.24190521240234, + "learning_rate": 4.3372058868408206e-05, + "lookahead_loss": 8.865188754081727, + "loss": 5.9787, + "step": 69500 + }, + { + "epoch": 0.133514404296875, + "grad_norm": 51.592620849609375, + "learning_rate": 4.332437515258789e-05, + "lookahead_loss": 8.797926445007324, + "loss": 5.9408, + "step": 70000 + }, + { + "epoch": 0.133514404296875, + "eval_accuracy": 0.03817651663405088, + "eval_lookahead_loss": 8.970869445228576, + "eval_lookahead_perplexity": 7870.441549426981, + "eval_loss": 5.799240589141846, + "eval_perplexity": 330.04882205598415, + "eval_runtime": 486.343, + "eval_samples_per_second": 20.562, + "eval_steps_per_second": 5.14, + "step": 70000 + }, + { + "epoch": 0.13446807861328125, + "grad_norm": 202.7108154296875, + "learning_rate": 4.327669143676758e-05, + "lookahead_loss": 9.086999006271363, + "loss": 5.9703, + "step": 70500 + }, + { + "epoch": 0.1354217529296875, + "grad_norm": 226.24972534179688, + "learning_rate": 4.322900772094727e-05, + "lookahead_loss": 9.262575436592103, + "loss": 5.9864, + "step": 71000 + }, + { + "epoch": 0.13637542724609375, + "grad_norm": 191.69708251953125, + "learning_rate": 4.318132400512695e-05, + "lookahead_loss": 9.651975973129272, + "loss": 5.9795, + "step": 71500 + }, + { + "epoch": 0.1373291015625, + "grad_norm": 121.97740936279297, + "learning_rate": 4.3133640289306643e-05, + "lookahead_loss": 9.289710388183593, + "loss": 5.9802, + "step": 72000 + }, + { + "epoch": 0.13828277587890625, + "grad_norm": 73.50595092773438, + "learning_rate": 4.308595657348633e-05, + "lookahead_loss": 9.07081819820404, + "loss": 5.9802, + "step": 72500 + }, + { + "epoch": 0.1392364501953125, + "grad_norm": 116.1031723022461, + "learning_rate": 4.303827285766602e-05, + "lookahead_loss": 8.929019109725953, + "loss": 5.9811, + "step": 73000 + }, + { + "epoch": 0.14019012451171875, + "grad_norm": 73.75403594970703, + "learning_rate": 4.299058914184571e-05, + "lookahead_loss": 8.633091394424438, + "loss": 5.9914, + "step": 73500 + }, + { + "epoch": 0.141143798828125, + "grad_norm": 2218.982177734375, + "learning_rate": 4.294290542602539e-05, + "lookahead_loss": 8.708157258033753, + "loss": 5.9672, + "step": 74000 + }, + { + "epoch": 0.14209747314453125, + "grad_norm": 232.07986450195312, + "learning_rate": 4.289522171020508e-05, + "lookahead_loss": 9.003678139686585, + "loss": 5.9636, + "step": 74500 + }, + { + "epoch": 0.1430511474609375, + "grad_norm": 69.3753662109375, + "learning_rate": 4.2847537994384764e-05, + "lookahead_loss": 8.961207758903504, + "loss": 6.0065, + "step": 75000 + }, + { + "epoch": 0.1430511474609375, + "eval_accuracy": 0.039377690802348334, + "eval_lookahead_loss": 8.827856708145141, + "eval_lookahead_perplexity": 6821.650348486948, + "eval_loss": 5.847181797027588, + "eval_perplexity": 346.25718113535765, + "eval_runtime": 485.8898, + "eval_samples_per_second": 20.581, + "eval_steps_per_second": 5.145, + "step": 75000 + }, + { + "epoch": 0.14400482177734375, + "grad_norm": 257.98968505859375, + "learning_rate": 4.2799854278564455e-05, + "lookahead_loss": 8.789914158821105, + "loss": 5.9937, + "step": 75500 + }, + { + "epoch": 0.14495849609375, + "grad_norm": 133.71444702148438, + "learning_rate": 4.2752170562744145e-05, + "lookahead_loss": 8.673704077720641, + "loss": 5.9949, + "step": 76000 + }, + { + "epoch": 0.14591217041015625, + "grad_norm": 147.74267578125, + "learning_rate": 4.270448684692383e-05, + "lookahead_loss": 8.891170823097228, + "loss": 6.0562, + "step": 76500 + }, + { + "epoch": 0.1468658447265625, + "grad_norm": 119.70494842529297, + "learning_rate": 4.265680313110352e-05, + "lookahead_loss": 8.86989661026001, + "loss": 6.016, + "step": 77000 + }, + { + "epoch": 0.14781951904296875, + "grad_norm": 228.5037841796875, + "learning_rate": 4.26091194152832e-05, + "lookahead_loss": 8.860928062438965, + "loss": 6.048, + "step": 77500 + }, + { + "epoch": 0.148773193359375, + "grad_norm": 824.8228149414062, + "learning_rate": 4.256143569946289e-05, + "lookahead_loss": 8.985385056495666, + "loss": 5.9966, + "step": 78000 + }, + { + "epoch": 0.14972686767578125, + "grad_norm": 325.0461730957031, + "learning_rate": 4.251375198364258e-05, + "lookahead_loss": 9.052589923858642, + "loss": 6.0044, + "step": 78500 + }, + { + "epoch": 0.1506805419921875, + "grad_norm": 190.2242431640625, + "learning_rate": 4.2466068267822266e-05, + "lookahead_loss": 9.491239009857178, + "loss": 6.0059, + "step": 79000 + }, + { + "epoch": 0.15163421630859375, + "grad_norm": 196.2430877685547, + "learning_rate": 4.2418384552001956e-05, + "lookahead_loss": 8.930598598480225, + "loss": 6.0001, + "step": 79500 + }, + { + "epoch": 0.152587890625, + "grad_norm": 128.90660095214844, + "learning_rate": 4.237070083618164e-05, + "lookahead_loss": 8.819123309135437, + "loss": 5.8967, + "step": 80000 + }, + { + "epoch": 0.152587890625, + "eval_accuracy": 0.039476125244618396, + "eval_lookahead_loss": 9.530179664611817, + "eval_lookahead_perplexity": 13769.064675450436, + "eval_loss": 5.8740715980529785, + "eval_perplexity": 355.6942800693519, + "eval_runtime": 483.2747, + "eval_samples_per_second": 20.692, + "eval_steps_per_second": 5.173, + "step": 80000 + }, + { + "epoch": 0.15354156494140625, + "grad_norm": 188.47970581054688, + "learning_rate": 4.232301712036133e-05, + "lookahead_loss": 9.142940559387206, + "loss": 6.0571, + "step": 80500 + }, + { + "epoch": 0.1544952392578125, + "grad_norm": 121.5519790649414, + "learning_rate": 4.227533340454102e-05, + "lookahead_loss": 8.96001132583618, + "loss": 5.9923, + "step": 81000 + }, + { + "epoch": 0.15544891357421875, + "grad_norm": 293.0586242675781, + "learning_rate": 4.22276496887207e-05, + "lookahead_loss": 8.956564957618713, + "loss": 6.0911, + "step": 81500 + }, + { + "epoch": 0.156402587890625, + "grad_norm": 74.56285095214844, + "learning_rate": 4.2179965972900393e-05, + "lookahead_loss": 8.582456354141236, + "loss": 6.0862, + "step": 82000 + }, + { + "epoch": 0.15735626220703125, + "grad_norm": 511.8816833496094, + "learning_rate": 4.213228225708008e-05, + "lookahead_loss": 9.042255405426026, + "loss": 5.9629, + "step": 82500 + }, + { + "epoch": 0.1583099365234375, + "grad_norm": 183.74826049804688, + "learning_rate": 4.208459854125977e-05, + "lookahead_loss": 9.405522727012634, + "loss": 5.9366, + "step": 83000 + }, + { + "epoch": 0.15926361083984375, + "grad_norm": 279.3388671875, + "learning_rate": 4.203691482543946e-05, + "lookahead_loss": 8.901457698822021, + "loss": 5.9244, + "step": 83500 + }, + { + "epoch": 0.16021728515625, + "grad_norm": 357.5735168457031, + "learning_rate": 4.198923110961914e-05, + "lookahead_loss": 9.167743388175964, + "loss": 5.9162, + "step": 84000 + }, + { + "epoch": 0.16117095947265625, + "grad_norm": 159.27464294433594, + "learning_rate": 4.194154739379883e-05, + "lookahead_loss": 9.373907676696778, + "loss": 6.014, + "step": 84500 + }, + { + "epoch": 0.1621246337890625, + "grad_norm": 93.59793090820312, + "learning_rate": 4.1893863677978514e-05, + "lookahead_loss": 9.038835690498352, + "loss": 6.1743, + "step": 85000 + }, + { + "epoch": 0.1621246337890625, + "eval_accuracy": 0.04017279843444227, + "eval_lookahead_loss": 8.818123157119752, + "eval_lookahead_perplexity": 6755.573569280033, + "eval_loss": 5.957902431488037, + "eval_perplexity": 386.79793755266024, + "eval_runtime": 489.9065, + "eval_samples_per_second": 20.412, + "eval_steps_per_second": 5.103, + "step": 85000 + }, + { + "epoch": 0.16307830810546875, + "grad_norm": 168.8986053466797, + "learning_rate": 4.1846179962158205e-05, + "lookahead_loss": 8.783296154022217, + "loss": 6.1266, + "step": 85500 + }, + { + "epoch": 0.164031982421875, + "grad_norm": 239.19187927246094, + "learning_rate": 4.1798496246337895e-05, + "lookahead_loss": 8.701505654335023, + "loss": 6.0763, + "step": 86000 + }, + { + "epoch": 0.16498565673828125, + "grad_norm": 125.89714813232422, + "learning_rate": 4.175081253051758e-05, + "lookahead_loss": 9.14300745677948, + "loss": 6.0504, + "step": 86500 + }, + { + "epoch": 0.1659393310546875, + "grad_norm": 248.45547485351562, + "learning_rate": 4.170312881469727e-05, + "lookahead_loss": 8.9906970911026, + "loss": 6.0455, + "step": 87000 + }, + { + "epoch": 0.16689300537109375, + "grad_norm": 256.23480224609375, + "learning_rate": 4.165544509887695e-05, + "lookahead_loss": 8.891051920890808, + "loss": 6.0046, + "step": 87500 + }, + { + "epoch": 0.1678466796875, + "grad_norm": 284.01104736328125, + "learning_rate": 4.160776138305664e-05, + "lookahead_loss": 9.329686467170715, + "loss": 6.0906, + "step": 88000 + }, + { + "epoch": 0.16880035400390625, + "grad_norm": 132.79754638671875, + "learning_rate": 4.156007766723633e-05, + "lookahead_loss": 8.698058214187622, + "loss": 6.0459, + "step": 88500 + }, + { + "epoch": 0.1697540283203125, + "grad_norm": 133.9832000732422, + "learning_rate": 4.1512393951416016e-05, + "lookahead_loss": 8.509726748466491, + "loss": 6.0236, + "step": 89000 + }, + { + "epoch": 0.17070770263671875, + "grad_norm": 277.36358642578125, + "learning_rate": 4.1464710235595706e-05, + "lookahead_loss": 8.506252351760864, + "loss": 6.0646, + "step": 89500 + }, + { + "epoch": 0.171661376953125, + "grad_norm": 342.52911376953125, + "learning_rate": 4.141702651977539e-05, + "lookahead_loss": 9.103038928985596, + "loss": 6.0193, + "step": 90000 + }, + { + "epoch": 0.171661376953125, + "eval_accuracy": 0.03989745596868884, + "eval_lookahead_loss": 8.554608922958375, + "eval_lookahead_perplexity": 5190.622561220272, + "eval_loss": 5.8674235343933105, + "eval_perplexity": 353.33744472812856, + "eval_runtime": 492.4204, + "eval_samples_per_second": 20.308, + "eval_steps_per_second": 5.077, + "step": 90000 + }, + { + "epoch": 0.17261505126953125, + "grad_norm": 165.12269592285156, + "learning_rate": 4.136934280395508e-05, + "lookahead_loss": 8.572826384544372, + "loss": 6.0221, + "step": 90500 + }, + { + "epoch": 0.1735687255859375, + "grad_norm": 107.81327819824219, + "learning_rate": 4.132165908813477e-05, + "lookahead_loss": 8.481350008964538, + "loss": 6.0222, + "step": 91000 + }, + { + "epoch": 0.17452239990234375, + "grad_norm": 556.1472778320312, + "learning_rate": 4.127397537231445e-05, + "lookahead_loss": 8.599616055488587, + "loss": 6.0664, + "step": 91500 + }, + { + "epoch": 0.17547607421875, + "grad_norm": 1288.407470703125, + "learning_rate": 4.1226291656494143e-05, + "lookahead_loss": 8.63302899837494, + "loss": 6.0456, + "step": 92000 + }, + { + "epoch": 0.17642974853515625, + "grad_norm": 143.24591064453125, + "learning_rate": 4.117860794067383e-05, + "lookahead_loss": 8.452617700576782, + "loss": 6.0033, + "step": 92500 + }, + { + "epoch": 0.1773834228515625, + "grad_norm": 119.54076385498047, + "learning_rate": 4.113092422485352e-05, + "lookahead_loss": 8.49664342021942, + "loss": 6.0329, + "step": 93000 + }, + { + "epoch": 0.17833709716796875, + "grad_norm": 390.5928649902344, + "learning_rate": 4.108324050903321e-05, + "lookahead_loss": 8.825624912261963, + "loss": 6.0757, + "step": 93500 + }, + { + "epoch": 0.179290771484375, + "grad_norm": 313.72137451171875, + "learning_rate": 4.103555679321289e-05, + "lookahead_loss": 8.703830801963806, + "loss": 6.0974, + "step": 94000 + }, + { + "epoch": 0.18024444580078125, + "grad_norm": 250.77395629882812, + "learning_rate": 4.098787307739258e-05, + "lookahead_loss": 8.433175395965575, + "loss": 5.9947, + "step": 94500 + }, + { + "epoch": 0.1811981201171875, + "grad_norm": 82.42889404296875, + "learning_rate": 4.0940189361572264e-05, + "lookahead_loss": 8.40940456390381, + "loss": 6.0285, + "step": 95000 + }, + { + "epoch": 0.1811981201171875, + "eval_accuracy": 0.040384735812133074, + "eval_lookahead_loss": 8.451369491767883, + "eval_lookahead_perplexity": 4681.479595206787, + "eval_loss": 5.867776393890381, + "eval_perplexity": 353.4621452007506, + "eval_runtime": 486.3864, + "eval_samples_per_second": 20.56, + "eval_steps_per_second": 5.14, + "step": 95000 + }, + { + "epoch": 0.18215179443359375, + "grad_norm": 206.6583709716797, + "learning_rate": 4.0892505645751955e-05, + "lookahead_loss": 8.643177847862244, + "loss": 5.9837, + "step": 95500 + }, + { + "epoch": 0.18310546875, + "grad_norm": 276.0804443359375, + "learning_rate": 4.0844821929931645e-05, + "lookahead_loss": 8.666072872161866, + "loss": 6.0129, + "step": 96000 + }, + { + "epoch": 0.18405914306640625, + "grad_norm": 57.96406936645508, + "learning_rate": 4.079713821411133e-05, + "lookahead_loss": 8.65278869152069, + "loss": 6.0148, + "step": 96500 + }, + { + "epoch": 0.1850128173828125, + "grad_norm": 197.7506866455078, + "learning_rate": 4.074945449829102e-05, + "lookahead_loss": 8.779692636489868, + "loss": 6.0355, + "step": 97000 + }, + { + "epoch": 0.18596649169921875, + "grad_norm": 134.04188537597656, + "learning_rate": 4.07017707824707e-05, + "lookahead_loss": 8.712127835273742, + "loss": 5.9997, + "step": 97500 + }, + { + "epoch": 0.186920166015625, + "grad_norm": 132.23191833496094, + "learning_rate": 4.065408706665039e-05, + "lookahead_loss": 8.590782883644104, + "loss": 5.9988, + "step": 98000 + }, + { + "epoch": 0.18787384033203125, + "grad_norm": 124.85617065429688, + "learning_rate": 4.060640335083008e-05, + "lookahead_loss": 8.741560804367065, + "loss": 5.993, + "step": 98500 + }, + { + "epoch": 0.1888275146484375, + "grad_norm": 127.54620361328125, + "learning_rate": 4.0558719635009766e-05, + "lookahead_loss": 8.607123659133912, + "loss": 5.9677, + "step": 99000 + }, + { + "epoch": 0.18978118896484375, + "grad_norm": 382.4927062988281, + "learning_rate": 4.0511035919189456e-05, + "lookahead_loss": 8.836537936210632, + "loss": 5.9167, + "step": 99500 + }, + { + "epoch": 0.19073486328125, + "grad_norm": 207.11195373535156, + "learning_rate": 4.046335220336914e-05, + "lookahead_loss": 9.184633234024048, + "loss": 5.9659, + "step": 100000 + }, + { + "epoch": 0.19073486328125, + "eval_accuracy": 0.04148003913894325, + "eval_lookahead_loss": 8.726900485610962, + "eval_lookahead_perplexity": 6166.585040351611, + "eval_loss": 5.867016315460205, + "eval_perplexity": 353.19358832337394, + "eval_runtime": 481.8479, + "eval_samples_per_second": 20.753, + "eval_steps_per_second": 5.188, + "step": 100000 + }, + { + "epoch": 0.19168853759765625, + "grad_norm": 315.92205810546875, + "learning_rate": 4.041566848754883e-05, + "lookahead_loss": 8.797781614303588, + "loss": 5.9489, + "step": 100500 + }, + { + "epoch": 0.1926422119140625, + "grad_norm": 135.5908660888672, + "learning_rate": 4.036798477172852e-05, + "lookahead_loss": 8.524951152801513, + "loss": 6.0196, + "step": 101000 + }, + { + "epoch": 0.19359588623046875, + "grad_norm": 162.22238159179688, + "learning_rate": 4.03203010559082e-05, + "lookahead_loss": 8.704341668128967, + "loss": 6.1409, + "step": 101500 + }, + { + "epoch": 0.194549560546875, + "grad_norm": 568.7293701171875, + "learning_rate": 4.0272617340087893e-05, + "lookahead_loss": 8.494575491905213, + "loss": 6.0518, + "step": 102000 + }, + { + "epoch": 0.19550323486328125, + "grad_norm": 185.62965393066406, + "learning_rate": 4.022493362426758e-05, + "lookahead_loss": 8.428331973075867, + "loss": 6.1032, + "step": 102500 + }, + { + "epoch": 0.1964569091796875, + "grad_norm": 255.8610076904297, + "learning_rate": 4.017724990844727e-05, + "lookahead_loss": 8.571762264251708, + "loss": 6.0853, + "step": 103000 + }, + { + "epoch": 0.19741058349609375, + "grad_norm": 200.58724975585938, + "learning_rate": 4.012956619262696e-05, + "lookahead_loss": 8.9400325050354, + "loss": 6.0429, + "step": 103500 + }, + { + "epoch": 0.1983642578125, + "grad_norm": 111.87779998779297, + "learning_rate": 4.008188247680664e-05, + "lookahead_loss": 8.725747778892517, + "loss": 6.0599, + "step": 104000 + }, + { + "epoch": 0.19931793212890625, + "grad_norm": 206.3902130126953, + "learning_rate": 4.003419876098633e-05, + "lookahead_loss": 8.484298157691956, + "loss": 6.0462, + "step": 104500 + }, + { + "epoch": 0.2002716064453125, + "grad_norm": 548.9729614257812, + "learning_rate": 3.9986515045166014e-05, + "lookahead_loss": 8.522790581703186, + "loss": 6.0304, + "step": 105000 + }, + { + "epoch": 0.2002716064453125, + "eval_accuracy": 0.04124970645792564, + "eval_lookahead_loss": 8.426673457717895, + "eval_lookahead_perplexity": 4567.281539259324, + "eval_loss": 5.910349369049072, + "eval_perplexity": 368.8349924327831, + "eval_runtime": 481.8181, + "eval_samples_per_second": 20.755, + "eval_steps_per_second": 5.189, + "step": 105000 + }, + { + "epoch": 0.20122528076171875, + "grad_norm": 218.3773956298828, + "learning_rate": 3.9938831329345705e-05, + "lookahead_loss": 8.927167050361634, + "loss": 6.0284, + "step": 105500 + }, + { + "epoch": 0.202178955078125, + "grad_norm": 165.89390563964844, + "learning_rate": 3.9891147613525395e-05, + "lookahead_loss": 8.973238533973694, + "loss": 6.0122, + "step": 106000 + }, + { + "epoch": 0.20313262939453125, + "grad_norm": 328.2847595214844, + "learning_rate": 3.984346389770508e-05, + "lookahead_loss": 8.65789045715332, + "loss": 5.993, + "step": 106500 + }, + { + "epoch": 0.2040863037109375, + "grad_norm": 307.3328857421875, + "learning_rate": 3.979578018188477e-05, + "lookahead_loss": 8.545142482757567, + "loss": 6.045, + "step": 107000 + }, + { + "epoch": 0.20503997802734375, + "grad_norm": 464.73260498046875, + "learning_rate": 3.974809646606445e-05, + "lookahead_loss": 8.489681368827819, + "loss": 6.0011, + "step": 107500 + }, + { + "epoch": 0.20599365234375, + "grad_norm": 148.97218322753906, + "learning_rate": 3.970041275024414e-05, + "lookahead_loss": 8.710652714729308, + "loss": 5.9797, + "step": 108000 + }, + { + "epoch": 0.20694732666015625, + "grad_norm": 492.6673278808594, + "learning_rate": 3.965272903442383e-05, + "lookahead_loss": 8.614580635070801, + "loss": 5.9838, + "step": 108500 + }, + { + "epoch": 0.2079010009765625, + "grad_norm": 200.87330627441406, + "learning_rate": 3.9605045318603516e-05, + "lookahead_loss": 8.646617564201355, + "loss": 6.0425, + "step": 109000 + }, + { + "epoch": 0.20885467529296875, + "grad_norm": 203.0087432861328, + "learning_rate": 3.9557361602783206e-05, + "lookahead_loss": 8.7244285364151, + "loss": 6.0639, + "step": 109500 + }, + { + "epoch": 0.209808349609375, + "grad_norm": 3111.361083984375, + "learning_rate": 3.950967788696289e-05, + "lookahead_loss": 8.815971665382385, + "loss": 6.0583, + "step": 110000 + }, + { + "epoch": 0.209808349609375, + "eval_accuracy": 0.04112211350293542, + "eval_lookahead_loss": 8.694844723510743, + "eval_lookahead_perplexity": 5972.045177465004, + "eval_loss": 5.8797383308410645, + "eval_perplexity": 357.7156263141802, + "eval_runtime": 487.0279, + "eval_samples_per_second": 20.533, + "eval_steps_per_second": 5.133, + "step": 110000 + }, + { + "epoch": 0.21076202392578125, + "grad_norm": 348.0232238769531, + "learning_rate": 3.946199417114258e-05, + "lookahead_loss": 9.101626420974732, + "loss": 6.0401, + "step": 110500 + }, + { + "epoch": 0.2117156982421875, + "grad_norm": 285.0534362792969, + "learning_rate": 3.941431045532227e-05, + "lookahead_loss": 8.784951137542725, + "loss": 6.0651, + "step": 111000 + }, + { + "epoch": 0.21266937255859375, + "grad_norm": 207.54171752929688, + "learning_rate": 3.936662673950195e-05, + "lookahead_loss": 8.772650203704835, + "loss": 6.0417, + "step": 111500 + }, + { + "epoch": 0.213623046875, + "grad_norm": 524.7745971679688, + "learning_rate": 3.9318943023681643e-05, + "lookahead_loss": 8.633025979995727, + "loss": 6.0194, + "step": 112000 + }, + { + "epoch": 0.21457672119140625, + "grad_norm": 354.3783264160156, + "learning_rate": 3.927125930786133e-05, + "lookahead_loss": 8.687143105506896, + "loss": 6.024, + "step": 112500 + }, + { + "epoch": 0.2155303955078125, + "grad_norm": 174.89064025878906, + "learning_rate": 3.922357559204102e-05, + "lookahead_loss": 8.538061501502991, + "loss": 6.0763, + "step": 113000 + }, + { + "epoch": 0.21648406982421875, + "grad_norm": 859.1728515625, + "learning_rate": 3.917589187622071e-05, + "lookahead_loss": 8.956743593215942, + "loss": 6.03, + "step": 113500 + }, + { + "epoch": 0.217437744140625, + "grad_norm": 660.7149047851562, + "learning_rate": 3.912820816040039e-05, + "lookahead_loss": 8.850226809501647, + "loss": 6.0412, + "step": 114000 + }, + { + "epoch": 0.21839141845703125, + "grad_norm": 388.4820861816406, + "learning_rate": 3.908052444458008e-05, + "lookahead_loss": 9.070973587036132, + "loss": 6.1043, + "step": 114500 + }, + { + "epoch": 0.2193450927734375, + "grad_norm": 675.2697143554688, + "learning_rate": 3.9032840728759764e-05, + "lookahead_loss": 8.866824633598327, + "loss": 6.053, + "step": 115000 + }, + { + "epoch": 0.2193450927734375, + "eval_accuracy": 0.0417399217221135, + "eval_lookahead_loss": 8.55319642009735, + "eval_lookahead_perplexity": 5183.295967637397, + "eval_loss": 5.915393352508545, + "eval_perplexity": 370.7000898398448, + "eval_runtime": 482.5393, + "eval_samples_per_second": 20.724, + "eval_steps_per_second": 5.181, + "step": 115000 + }, + { + "epoch": 0.22029876708984375, + "grad_norm": 317.06585693359375, + "learning_rate": 3.8985157012939455e-05, + "lookahead_loss": 8.611581056594849, + "loss": 5.9742, + "step": 115500 + }, + { + "epoch": 0.22125244140625, + "grad_norm": 504.3750915527344, + "learning_rate": 3.8937473297119145e-05, + "lookahead_loss": 8.556282791137695, + "loss": 6.001, + "step": 116000 + }, + { + "epoch": 0.22220611572265625, + "grad_norm": 1370.23046875, + "learning_rate": 3.888978958129883e-05, + "lookahead_loss": 8.621963116645812, + "loss": 6.0212, + "step": 116500 + }, + { + "epoch": 0.2231597900390625, + "grad_norm": 618.9483642578125, + "learning_rate": 3.884210586547852e-05, + "lookahead_loss": 8.624157251358032, + "loss": 6.0102, + "step": 117000 + }, + { + "epoch": 0.22411346435546875, + "grad_norm": 264.4212646484375, + "learning_rate": 3.87944221496582e-05, + "lookahead_loss": 8.587020833015442, + "loss": 6.0883, + "step": 117500 + }, + { + "epoch": 0.225067138671875, + "grad_norm": 249.97801208496094, + "learning_rate": 3.874673843383789e-05, + "lookahead_loss": 8.859909684181213, + "loss": 6.0885, + "step": 118000 + }, + { + "epoch": 0.22602081298828125, + "grad_norm": 225.84487915039062, + "learning_rate": 3.869905471801758e-05, + "lookahead_loss": 8.527730641365052, + "loss": 6.118, + "step": 118500 + }, + { + "epoch": 0.2269744873046875, + "grad_norm": 290.6041259765625, + "learning_rate": 3.8651371002197266e-05, + "lookahead_loss": 8.562794244766236, + "loss": 6.0514, + "step": 119000 + }, + { + "epoch": 0.22792816162109375, + "grad_norm": 353.9345397949219, + "learning_rate": 3.8603687286376956e-05, + "lookahead_loss": 8.618139860153198, + "loss": 6.0647, + "step": 119500 + }, + { + "epoch": 0.2288818359375, + "grad_norm": 538.1376953125, + "learning_rate": 3.855600357055664e-05, + "lookahead_loss": 8.646830515861511, + "loss": 6.1051, + "step": 120000 + }, + { + "epoch": 0.2288818359375, + "eval_accuracy": 0.042556360078277886, + "eval_lookahead_loss": 8.577955355072021, + "eval_lookahead_perplexity": 5313.2307412741375, + "eval_loss": 5.927862644195557, + "eval_perplexity": 375.35139637086365, + "eval_runtime": 485.122, + "eval_samples_per_second": 20.613, + "eval_steps_per_second": 5.153, + "step": 120000 + }, + { + "epoch": 0.22983551025390625, + "grad_norm": 176.3824005126953, + "learning_rate": 3.850831985473633e-05, + "lookahead_loss": 8.50881103515625, + "loss": 6.1076, + "step": 120500 + }, + { + "epoch": 0.2307891845703125, + "grad_norm": 547.4995727539062, + "learning_rate": 3.846063613891602e-05, + "lookahead_loss": 8.608163069725036, + "loss": 6.0977, + "step": 121000 + }, + { + "epoch": 0.23174285888671875, + "grad_norm": 175.63818359375, + "learning_rate": 3.84129524230957e-05, + "lookahead_loss": 8.461777080535889, + "loss": 6.0685, + "step": 121500 + }, + { + "epoch": 0.232696533203125, + "grad_norm": 258.8597717285156, + "learning_rate": 3.8365268707275393e-05, + "lookahead_loss": 8.328949046134948, + "loss": 6.0712, + "step": 122000 + }, + { + "epoch": 0.23365020751953125, + "grad_norm": 326.1037292480469, + "learning_rate": 3.831758499145508e-05, + "lookahead_loss": 8.370767808914184, + "loss": 6.1333, + "step": 122500 + }, + { + "epoch": 0.2346038818359375, + "grad_norm": 251.73008728027344, + "learning_rate": 3.826990127563477e-05, + "lookahead_loss": 8.214890414237976, + "loss": 6.0586, + "step": 123000 + }, + { + "epoch": 0.23555755615234375, + "grad_norm": 203.9977569580078, + "learning_rate": 3.822221755981446e-05, + "lookahead_loss": 8.31967354297638, + "loss": 6.0823, + "step": 123500 + }, + { + "epoch": 0.23651123046875, + "grad_norm": 211.93418884277344, + "learning_rate": 3.817453384399414e-05, + "lookahead_loss": 8.320230326652528, + "loss": 6.0815, + "step": 124000 + }, + { + "epoch": 0.23746490478515625, + "grad_norm": 221.87466430664062, + "learning_rate": 3.812685012817383e-05, + "lookahead_loss": 8.655318147659301, + "loss": 6.0497, + "step": 124500 + }, + { + "epoch": 0.2384185791015625, + "grad_norm": 758.3634033203125, + "learning_rate": 3.8079166412353514e-05, + "lookahead_loss": 8.6256755361557, + "loss": 6.0722, + "step": 125000 + }, + { + "epoch": 0.2384185791015625, + "eval_accuracy": 0.04157318982387476, + "eval_lookahead_loss": 8.524691902542115, + "eval_lookahead_perplexity": 5037.634484341758, + "eval_loss": 5.916193962097168, + "eval_perplexity": 370.996994722863, + "eval_runtime": 482.1675, + "eval_samples_per_second": 20.74, + "eval_steps_per_second": 5.185, + "step": 125000 + }, + { + "epoch": 0.23937225341796875, + "grad_norm": 3325.891845703125, + "learning_rate": 3.8031482696533205e-05, + "lookahead_loss": 8.594130160331726, + "loss": 6.1017, + "step": 125500 + }, + { + "epoch": 0.240325927734375, + "grad_norm": 348.7906188964844, + "learning_rate": 3.7983798980712895e-05, + "lookahead_loss": 8.558137132644653, + "loss": 6.0877, + "step": 126000 + }, + { + "epoch": 0.24127960205078125, + "grad_norm": 140.01161193847656, + "learning_rate": 3.793611526489258e-05, + "lookahead_loss": 8.316791947364807, + "loss": 6.1159, + "step": 126500 + }, + { + "epoch": 0.2422332763671875, + "grad_norm": 239.45155334472656, + "learning_rate": 3.788843154907227e-05, + "lookahead_loss": 8.520363026618957, + "loss": 6.1319, + "step": 127000 + }, + { + "epoch": 0.24318695068359375, + "grad_norm": 138.10704040527344, + "learning_rate": 3.784074783325195e-05, + "lookahead_loss": 8.526314659118652, + "loss": 6.103, + "step": 127500 + }, + { + "epoch": 0.244140625, + "grad_norm": 634.4774169921875, + "learning_rate": 3.779306411743164e-05, + "lookahead_loss": 8.20733311843872, + "loss": 6.074, + "step": 128000 + }, + { + "epoch": 0.24509429931640625, + "grad_norm": 101.46903228759766, + "learning_rate": 3.774538040161133e-05, + "lookahead_loss": 8.063571801185608, + "loss": 6.035, + "step": 128500 + }, + { + "epoch": 0.2460479736328125, + "grad_norm": 343.1429748535156, + "learning_rate": 3.7697696685791016e-05, + "lookahead_loss": 8.163540954589843, + "loss": 6.0444, + "step": 129000 + }, + { + "epoch": 0.24700164794921875, + "grad_norm": 855.9058837890625, + "learning_rate": 3.7650012969970706e-05, + "lookahead_loss": 8.606195244789124, + "loss": 6.0571, + "step": 129500 + }, + { + "epoch": 0.247955322265625, + "grad_norm": 170.9237518310547, + "learning_rate": 3.760232925415039e-05, + "lookahead_loss": 8.55739320755005, + "loss": 6.0857, + "step": 130000 + }, + { + "epoch": 0.247955322265625, + "eval_accuracy": 0.042729158512720154, + "eval_lookahead_loss": 8.339963854598999, + "eval_lookahead_perplexity": 4187.938364018258, + "eval_loss": 5.888275623321533, + "eval_perplexity": 360.78262253450475, + "eval_runtime": 483.4097, + "eval_samples_per_second": 20.686, + "eval_steps_per_second": 5.172, + "step": 130000 + }, + { + "epoch": 0.24890899658203125, + "grad_norm": 194.24160766601562, + "learning_rate": 3.755464553833008e-05, + "lookahead_loss": 8.30693965435028, + "loss": 6.0272, + "step": 130500 + }, + { + "epoch": 0.2498626708984375, + "grad_norm": 194.9763641357422, + "learning_rate": 3.750696182250977e-05, + "lookahead_loss": 8.209974727630616, + "loss": 6.0171, + "step": 131000 + }, + { + "epoch": 0.25081634521484375, + "grad_norm": 236.00146484375, + "learning_rate": 3.745927810668945e-05, + "lookahead_loss": 8.265376645088196, + "loss": 6.0051, + "step": 131500 + }, + { + "epoch": 0.25177001953125, + "grad_norm": 90.95122528076172, + "learning_rate": 3.7411594390869143e-05, + "lookahead_loss": 8.200556273460387, + "loss": 6.0179, + "step": 132000 + }, + { + "epoch": 0.25272369384765625, + "grad_norm": 697.4423828125, + "learning_rate": 3.736391067504883e-05, + "lookahead_loss": 8.357412700653077, + "loss": 6.0185, + "step": 132500 + }, + { + "epoch": 0.2536773681640625, + "grad_norm": 136.90536499023438, + "learning_rate": 3.731622695922852e-05, + "lookahead_loss": 8.263107558250427, + "loss": 6.0519, + "step": 133000 + }, + { + "epoch": 0.25463104248046875, + "grad_norm": 485.62091064453125, + "learning_rate": 3.726854324340821e-05, + "lookahead_loss": 8.184577953338623, + "loss": 6.057, + "step": 133500 + }, + { + "epoch": 0.255584716796875, + "grad_norm": 417.3059387207031, + "learning_rate": 3.722085952758789e-05, + "lookahead_loss": 8.309420327186585, + "loss": 6.0665, + "step": 134000 + }, + { + "epoch": 0.25653839111328125, + "grad_norm": 286.899658203125, + "learning_rate": 3.717317581176758e-05, + "lookahead_loss": 8.06452819442749, + "loss": 6.0802, + "step": 134500 + }, + { + "epoch": 0.2574920654296875, + "grad_norm": 197.1073455810547, + "learning_rate": 3.7125492095947264e-05, + "lookahead_loss": 8.495831546783448, + "loss": 6.0764, + "step": 135000 + }, + { + "epoch": 0.2574920654296875, + "eval_accuracy": 0.042579647749510764, + "eval_lookahead_loss": 8.606221893501282, + "eval_lookahead_perplexity": 5465.560152352688, + "eval_loss": 5.932655334472656, + "eval_perplexity": 377.15465714220664, + "eval_runtime": 484.4209, + "eval_samples_per_second": 20.643, + "eval_steps_per_second": 5.161, + "step": 135000 + }, + { + "epoch": 0.25844573974609375, + "grad_norm": 409.67510986328125, + "learning_rate": 3.7077808380126955e-05, + "lookahead_loss": 8.839627535820007, + "loss": 6.1139, + "step": 135500 + }, + { + "epoch": 0.2593994140625, + "grad_norm": 265.7265319824219, + "learning_rate": 3.7030124664306645e-05, + "lookahead_loss": 8.331418164730072, + "loss": 6.0667, + "step": 136000 + }, + { + "epoch": 0.26035308837890625, + "grad_norm": 346.7359924316406, + "learning_rate": 3.698244094848633e-05, + "lookahead_loss": 8.268414088249207, + "loss": 6.0353, + "step": 136500 + }, + { + "epoch": 0.2613067626953125, + "grad_norm": 172.1764678955078, + "learning_rate": 3.693475723266602e-05, + "lookahead_loss": 8.29066693019867, + "loss": 6.0852, + "step": 137000 + }, + { + "epoch": 0.26226043701171875, + "grad_norm": 689.0460815429688, + "learning_rate": 3.68870735168457e-05, + "lookahead_loss": 8.343387287139892, + "loss": 6.1056, + "step": 137500 + }, + { + "epoch": 0.263214111328125, + "grad_norm": 271.1156921386719, + "learning_rate": 3.683938980102539e-05, + "lookahead_loss": 8.267973919868469, + "loss": 6.0524, + "step": 138000 + }, + { + "epoch": 0.26416778564453125, + "grad_norm": 154.06008911132812, + "learning_rate": 3.679170608520508e-05, + "lookahead_loss": 8.29391701221466, + "loss": 6.0598, + "step": 138500 + }, + { + "epoch": 0.2651214599609375, + "grad_norm": 424.4548645019531, + "learning_rate": 3.6744022369384766e-05, + "lookahead_loss": 8.114374744415283, + "loss": 5.9613, + "step": 139000 + }, + { + "epoch": 0.26607513427734375, + "grad_norm": 102.16728210449219, + "learning_rate": 3.6696338653564456e-05, + "lookahead_loss": 8.602961087226868, + "loss": 6.0919, + "step": 139500 + }, + { + "epoch": 0.26702880859375, + "grad_norm": 365.97674560546875, + "learning_rate": 3.664865493774414e-05, + "lookahead_loss": 8.558653169631958, + "loss": 6.0819, + "step": 140000 + }, + { + "epoch": 0.26702880859375, + "eval_accuracy": 0.04174148727984344, + "eval_lookahead_loss": 8.704425875282288, + "eval_lookahead_perplexity": 6029.539238537812, + "eval_loss": 5.948251724243164, + "eval_perplexity": 383.08301852513205, + "eval_runtime": 482.2045, + "eval_samples_per_second": 20.738, + "eval_steps_per_second": 5.185, + "step": 140000 + }, + { + "epoch": 0.26798248291015625, + "grad_norm": 92.32951354980469, + "learning_rate": 3.660097122192383e-05, + "lookahead_loss": 8.542299242019654, + "loss": 6.095, + "step": 140500 + }, + { + "epoch": 0.2689361572265625, + "grad_norm": 234.51275634765625, + "learning_rate": 3.655328750610352e-05, + "lookahead_loss": 8.21882178401947, + "loss": 6.0662, + "step": 141000 + }, + { + "epoch": 0.26988983154296875, + "grad_norm": 438.8663635253906, + "learning_rate": 3.65056037902832e-05, + "lookahead_loss": 8.30684892463684, + "loss": 6.0685, + "step": 141500 + }, + { + "epoch": 0.270843505859375, + "grad_norm": 430.9894104003906, + "learning_rate": 3.6457920074462893e-05, + "lookahead_loss": 8.326179340362549, + "loss": 6.0402, + "step": 142000 + }, + { + "epoch": 0.27179718017578125, + "grad_norm": 378.9204406738281, + "learning_rate": 3.641023635864258e-05, + "lookahead_loss": 8.494240595817566, + "loss": 6.0504, + "step": 142500 + }, + { + "epoch": 0.2727508544921875, + "grad_norm": 184.7960968017578, + "learning_rate": 3.636255264282227e-05, + "lookahead_loss": 8.822472394943237, + "loss": 6.016, + "step": 143000 + }, + { + "epoch": 0.27370452880859375, + "grad_norm": 153.46844482421875, + "learning_rate": 3.631486892700196e-05, + "lookahead_loss": 9.66422557258606, + "loss": 6.1146, + "step": 143500 + }, + { + "epoch": 0.274658203125, + "grad_norm": 204.32020568847656, + "learning_rate": 3.626718521118164e-05, + "lookahead_loss": 9.674363851547241, + "loss": 6.13, + "step": 144000 + }, + { + "epoch": 0.27561187744140625, + "grad_norm": 141.95143127441406, + "learning_rate": 3.621950149536133e-05, + "lookahead_loss": 8.643821138381957, + "loss": 6.009, + "step": 144500 + }, + { + "epoch": 0.2765655517578125, + "grad_norm": 1009.5608520507812, + "learning_rate": 3.6171817779541014e-05, + "lookahead_loss": 8.646462057113647, + "loss": 6.031, + "step": 145000 + }, + { + "epoch": 0.2765655517578125, + "eval_accuracy": 0.04204618395303327, + "eval_lookahead_loss": 8.506373066520691, + "eval_lookahead_perplexity": 4946.191010489164, + "eval_loss": 5.879608631134033, + "eval_perplexity": 357.66923371086534, + "eval_runtime": 487.2542, + "eval_samples_per_second": 20.523, + "eval_steps_per_second": 5.131, + "step": 145000 + }, + { + "epoch": 0.27751922607421875, + "grad_norm": 448.50677490234375, + "learning_rate": 3.6124134063720705e-05, + "lookahead_loss": 8.727618228912354, + "loss": 5.9974, + "step": 145500 + }, + { + "epoch": 0.278472900390625, + "grad_norm": 278.30078125, + "learning_rate": 3.6076450347900395e-05, + "lookahead_loss": 8.77801029777527, + "loss": 6.0099, + "step": 146000 + }, + { + "epoch": 0.27942657470703125, + "grad_norm": 298.3955383300781, + "learning_rate": 3.602876663208008e-05, + "lookahead_loss": 8.4466147108078, + "loss": 5.9951, + "step": 146500 + }, + { + "epoch": 0.2803802490234375, + "grad_norm": 79.32508850097656, + "learning_rate": 3.598108291625977e-05, + "lookahead_loss": 8.457780947685242, + "loss": 6.0683, + "step": 147000 + }, + { + "epoch": 0.28133392333984375, + "grad_norm": 332.9968566894531, + "learning_rate": 3.593339920043945e-05, + "lookahead_loss": 8.183274803161622, + "loss": 6.003, + "step": 147500 + }, + { + "epoch": 0.28228759765625, + "grad_norm": 469.8811340332031, + "learning_rate": 3.588571548461914e-05, + "lookahead_loss": 8.731921584129333, + "loss": 5.9496, + "step": 148000 + }, + { + "epoch": 0.28324127197265625, + "grad_norm": 208.2938995361328, + "learning_rate": 3.583803176879883e-05, + "lookahead_loss": 9.07046215248108, + "loss": 5.9886, + "step": 148500 + }, + { + "epoch": 0.2841949462890625, + "grad_norm": 551.42529296875, + "learning_rate": 3.5790348052978516e-05, + "lookahead_loss": 9.085750420570374, + "loss": 6.1014, + "step": 149000 + }, + { + "epoch": 0.28514862060546875, + "grad_norm": 590.7175903320312, + "learning_rate": 3.5742664337158206e-05, + "lookahead_loss": 8.801296343803406, + "loss": 6.0781, + "step": 149500 + }, + { + "epoch": 0.286102294921875, + "grad_norm": 218.66790771484375, + "learning_rate": 3.569498062133789e-05, + "lookahead_loss": 8.484878833770752, + "loss": 6.0952, + "step": 150000 + }, + { + "epoch": 0.286102294921875, + "eval_accuracy": 0.0419880626223092, + "eval_lookahead_loss": 8.423770334815979, + "eval_lookahead_perplexity": 4554.041387815776, + "eval_loss": 5.884705543518066, + "eval_perplexity": 359.49689621872744, + "eval_runtime": 487.333, + "eval_samples_per_second": 20.52, + "eval_steps_per_second": 5.13, + "step": 150000 + }, + { + "epoch": 0.28705596923828125, + "grad_norm": 255.74014282226562, + "learning_rate": 3.564729690551758e-05, + "lookahead_loss": 8.367752463340759, + "loss": 6.0526, + "step": 150500 + }, + { + "epoch": 0.2880096435546875, + "grad_norm": 615.177001953125, + "learning_rate": 3.559961318969727e-05, + "lookahead_loss": 8.368024963378906, + "loss": 6.0529, + "step": 151000 + }, + { + "epoch": 0.28896331787109375, + "grad_norm": 178.51412963867188, + "learning_rate": 3.555192947387695e-05, + "lookahead_loss": 8.28337444972992, + "loss": 6.0304, + "step": 151500 + }, + { + "epoch": 0.2899169921875, + "grad_norm": 694.1113891601562, + "learning_rate": 3.5504245758056643e-05, + "lookahead_loss": 8.402232641220094, + "loss": 6.0759, + "step": 152000 + }, + { + "epoch": 0.29087066650390625, + "grad_norm": 295.4070129394531, + "learning_rate": 3.545656204223633e-05, + "lookahead_loss": 8.434672183036804, + "loss": 6.0091, + "step": 152500 + }, + { + "epoch": 0.2918243408203125, + "grad_norm": 226.90664672851562, + "learning_rate": 3.540887832641602e-05, + "lookahead_loss": 8.42317797613144, + "loss": 6.0848, + "step": 153000 + }, + { + "epoch": 0.29277801513671875, + "grad_norm": 296.4658203125, + "learning_rate": 3.536119461059571e-05, + "lookahead_loss": 8.319315007209777, + "loss": 6.0539, + "step": 153500 + }, + { + "epoch": 0.293731689453125, + "grad_norm": 318.38720703125, + "learning_rate": 3.531351089477539e-05, + "lookahead_loss": 8.457439171791076, + "loss": 6.0757, + "step": 154000 + }, + { + "epoch": 0.29468536376953125, + "grad_norm": 203.85891723632812, + "learning_rate": 3.526582717895508e-05, + "lookahead_loss": 8.515343271255492, + "loss": 6.0841, + "step": 154500 + }, + { + "epoch": 0.2956390380859375, + "grad_norm": 1237.6728515625, + "learning_rate": 3.5218143463134764e-05, + "lookahead_loss": 8.46052855682373, + "loss": 6.0402, + "step": 155000 + }, + { + "epoch": 0.2956390380859375, + "eval_accuracy": 0.04173307240704501, + "eval_lookahead_loss": 8.27357072544098, + "eval_lookahead_perplexity": 3918.9174167289752, + "eval_loss": 5.8894853591918945, + "eval_perplexity": 361.21933831650136, + "eval_runtime": 482.2599, + "eval_samples_per_second": 20.736, + "eval_steps_per_second": 5.184, + "step": 155000 + }, + { + "epoch": 0.29659271240234375, + "grad_norm": 272.65460205078125, + "learning_rate": 3.5170459747314455e-05, + "lookahead_loss": 8.357754618644714, + "loss": 6.0337, + "step": 155500 + }, + { + "epoch": 0.29754638671875, + "grad_norm": 275.7253723144531, + "learning_rate": 3.5122776031494145e-05, + "lookahead_loss": 8.365152515411378, + "loss": 6.0782, + "step": 156000 + }, + { + "epoch": 0.29850006103515625, + "grad_norm": 206.92604064941406, + "learning_rate": 3.507509231567383e-05, + "lookahead_loss": 8.19471140575409, + "loss": 6.0354, + "step": 156500 + }, + { + "epoch": 0.2994537353515625, + "grad_norm": 205.04920959472656, + "learning_rate": 3.502740859985352e-05, + "lookahead_loss": 8.187815518379212, + "loss": 6.015, + "step": 157000 + }, + { + "epoch": 0.30040740966796875, + "grad_norm": 223.8436279296875, + "learning_rate": 3.49797248840332e-05, + "lookahead_loss": 8.507842065811158, + "loss": 6.0638, + "step": 157500 + }, + { + "epoch": 0.301361083984375, + "grad_norm": 133.26565551757812, + "learning_rate": 3.493204116821289e-05, + "lookahead_loss": 8.44181195640564, + "loss": 5.9914, + "step": 158000 + }, + { + "epoch": 0.30231475830078125, + "grad_norm": 229.11009216308594, + "learning_rate": 3.488435745239258e-05, + "lookahead_loss": 8.546023080825806, + "loss": 6.0154, + "step": 158500 + }, + { + "epoch": 0.3032684326171875, + "grad_norm": 197.01304626464844, + "learning_rate": 3.4836673736572266e-05, + "lookahead_loss": 8.925120499610902, + "loss": 6.0115, + "step": 159000 + }, + { + "epoch": 0.30422210693359375, + "grad_norm": 224.20155334472656, + "learning_rate": 3.4788990020751956e-05, + "lookahead_loss": 8.621713082313537, + "loss": 5.9737, + "step": 159500 + }, + { + "epoch": 0.30517578125, + "grad_norm": 149.9480438232422, + "learning_rate": 3.474130630493164e-05, + "lookahead_loss": 8.449884405136109, + "loss": 6.0109, + "step": 160000 + }, + { + "epoch": 0.30517578125, + "eval_accuracy": 0.0411692759295499, + "eval_lookahead_loss": 8.293304634666443, + "eval_lookahead_perplexity": 3997.0210880482423, + "eval_loss": 5.859129905700684, + "eval_perplexity": 350.4191136634002, + "eval_runtime": 490.2813, + "eval_samples_per_second": 20.396, + "eval_steps_per_second": 5.099, + "step": 160000 + }, + { + "epoch": 0.30612945556640625, + "grad_norm": 310.0232238769531, + "learning_rate": 3.469362258911133e-05, + "lookahead_loss": 8.309981032371521, + "loss": 5.9942, + "step": 160500 + }, + { + "epoch": 0.3070831298828125, + "grad_norm": 212.95753479003906, + "learning_rate": 3.464593887329102e-05, + "lookahead_loss": 8.202514970779418, + "loss": 6.0143, + "step": 161000 + }, + { + "epoch": 0.30803680419921875, + "grad_norm": 126.3419418334961, + "learning_rate": 3.45982551574707e-05, + "lookahead_loss": 8.178078886985778, + "loss": 6.0052, + "step": 161500 + }, + { + "epoch": 0.308990478515625, + "grad_norm": 111.19036102294922, + "learning_rate": 3.4550571441650393e-05, + "lookahead_loss": 7.996254001617432, + "loss": 5.9953, + "step": 162000 + }, + { + "epoch": 0.30994415283203125, + "grad_norm": 251.0377655029297, + "learning_rate": 3.450288772583008e-05, + "lookahead_loss": 8.294375630378724, + "loss": 5.9681, + "step": 162500 + }, + { + "epoch": 0.3108978271484375, + "grad_norm": 207.9862823486328, + "learning_rate": 3.445520401000977e-05, + "lookahead_loss": 8.374870569229126, + "loss": 5.9896, + "step": 163000 + }, + { + "epoch": 0.31185150146484375, + "grad_norm": 164.48751831054688, + "learning_rate": 3.440752029418946e-05, + "lookahead_loss": 8.114132668495179, + "loss": 5.9544, + "step": 163500 + }, + { + "epoch": 0.31280517578125, + "grad_norm": 134.6900177001953, + "learning_rate": 3.435983657836914e-05, + "lookahead_loss": 8.208403561592101, + "loss": 5.9887, + "step": 164000 + }, + { + "epoch": 0.31375885009765625, + "grad_norm": 167.5714569091797, + "learning_rate": 3.431215286254883e-05, + "lookahead_loss": 8.213879681587219, + "loss": 6.0222, + "step": 164500 + }, + { + "epoch": 0.3147125244140625, + "grad_norm": 691.8146362304688, + "learning_rate": 3.4264469146728514e-05, + "lookahead_loss": 8.230835564613342, + "loss": 6.0041, + "step": 165000 + }, + { + "epoch": 0.3147125244140625, + "eval_accuracy": 0.041931702544031314, + "eval_lookahead_loss": 7.881814068222046, + "eval_lookahead_perplexity": 2648.6730778134424, + "eval_loss": 5.83050012588501, + "eval_perplexity": 340.52894383069787, + "eval_runtime": 498.8452, + "eval_samples_per_second": 20.046, + "eval_steps_per_second": 5.012, + "step": 165000 + }, + { + "epoch": 0.31566619873046875, + "grad_norm": 158.4942626953125, + "learning_rate": 3.4216785430908205e-05, + "lookahead_loss": 7.898229797363281, + "loss": 6.0148, + "step": 165500 + }, + { + "epoch": 0.316619873046875, + "grad_norm": 54.70542526245117, + "learning_rate": 3.4169101715087895e-05, + "lookahead_loss": 7.989220093727112, + "loss": 5.9794, + "step": 166000 + }, + { + "epoch": 0.31757354736328125, + "grad_norm": 206.5970458984375, + "learning_rate": 3.412141799926758e-05, + "lookahead_loss": 7.9783684978485105, + "loss": 6.0004, + "step": 166500 + }, + { + "epoch": 0.3185272216796875, + "grad_norm": 89.85675811767578, + "learning_rate": 3.407373428344727e-05, + "lookahead_loss": 7.880394848823547, + "loss": 5.9838, + "step": 167000 + }, + { + "epoch": 0.31948089599609375, + "grad_norm": 204.40150451660156, + "learning_rate": 3.402605056762695e-05, + "lookahead_loss": 8.025307200431824, + "loss": 5.9986, + "step": 167500 + }, + { + "epoch": 0.3204345703125, + "grad_norm": 140.3711395263672, + "learning_rate": 3.397836685180664e-05, + "lookahead_loss": 7.988316534996033, + "loss": 6.002, + "step": 168000 + }, + { + "epoch": 0.32138824462890625, + "grad_norm": 149.5004119873047, + "learning_rate": 3.393068313598633e-05, + "lookahead_loss": 7.984311297416687, + "loss": 5.9663, + "step": 168500 + }, + { + "epoch": 0.3223419189453125, + "grad_norm": 174.3770294189453, + "learning_rate": 3.3882999420166016e-05, + "lookahead_loss": 7.965680088043213, + "loss": 5.9898, + "step": 169000 + }, + { + "epoch": 0.32329559326171875, + "grad_norm": 175.9324493408203, + "learning_rate": 3.3835315704345706e-05, + "lookahead_loss": 7.951382723808289, + "loss": 5.9598, + "step": 169500 + }, + { + "epoch": 0.324249267578125, + "grad_norm": 96.46771240234375, + "learning_rate": 3.378763198852539e-05, + "lookahead_loss": 8.140294232368468, + "loss": 5.9871, + "step": 170000 + }, + { + "epoch": 0.324249267578125, + "eval_accuracy": 0.041981800391389434, + "eval_lookahead_loss": 7.984321828460693, + "eval_lookahead_perplexity": 2934.58647647679, + "eval_loss": 5.820465087890625, + "eval_perplexity": 337.12881170200853, + "eval_runtime": 507.6051, + "eval_samples_per_second": 19.7, + "eval_steps_per_second": 4.925, + "step": 170000 + }, + { + "epoch": 0.32520294189453125, + "grad_norm": 190.13755798339844, + "learning_rate": 3.373994827270508e-05, + "lookahead_loss": 8.227613647460938, + "loss": 5.9968, + "step": 170500 + }, + { + "epoch": 0.3261566162109375, + "grad_norm": 114.09439849853516, + "learning_rate": 3.369226455688477e-05, + "lookahead_loss": 8.31316443824768, + "loss": 6.0104, + "step": 171000 + }, + { + "epoch": 0.32711029052734375, + "grad_norm": 150.9316864013672, + "learning_rate": 3.364458084106445e-05, + "lookahead_loss": 8.22113003730774, + "loss": 5.9334, + "step": 171500 + }, + { + "epoch": 0.32806396484375, + "grad_norm": 123.98262023925781, + "learning_rate": 3.3596897125244143e-05, + "lookahead_loss": 8.117613220214844, + "loss": 5.9601, + "step": 172000 + }, + { + "epoch": 0.32901763916015625, + "grad_norm": 396.377197265625, + "learning_rate": 3.354921340942383e-05, + "lookahead_loss": 8.024533190727233, + "loss": 5.9602, + "step": 172500 + }, + { + "epoch": 0.3299713134765625, + "grad_norm": 227.30914306640625, + "learning_rate": 3.350152969360352e-05, + "lookahead_loss": 8.140616178512573, + "loss": 5.9825, + "step": 173000 + }, + { + "epoch": 0.33092498779296875, + "grad_norm": 203.30186462402344, + "learning_rate": 3.345384597778321e-05, + "lookahead_loss": 8.304286516189576, + "loss": 5.9909, + "step": 173500 + }, + { + "epoch": 0.331878662109375, + "grad_norm": 117.4228286743164, + "learning_rate": 3.340616226196289e-05, + "lookahead_loss": 8.634052651405334, + "loss": 6.0052, + "step": 174000 + }, + { + "epoch": 0.33283233642578125, + "grad_norm": 97.48628997802734, + "learning_rate": 3.335847854614258e-05, + "lookahead_loss": 8.569329356193542, + "loss": 6.0127, + "step": 174500 + }, + { + "epoch": 0.3337860107421875, + "grad_norm": 201.49436950683594, + "learning_rate": 3.3310794830322264e-05, + "lookahead_loss": 8.132785663604736, + "loss": 5.9548, + "step": 175000 + }, + { + "epoch": 0.3337860107421875, + "eval_accuracy": 0.04122328767123288, + "eval_lookahead_loss": 8.164220848846435, + "eval_lookahead_perplexity": 3512.9831256282037, + "eval_loss": 5.835893154144287, + "eval_perplexity": 342.3703870624139, + "eval_runtime": 495.1123, + "eval_samples_per_second": 20.197, + "eval_steps_per_second": 5.049, + "step": 175000 + }, + { + "epoch": 0.33473968505859375, + "grad_norm": 173.42501831054688, + "learning_rate": 3.3263111114501955e-05, + "lookahead_loss": 8.182872535705567, + "loss": 6.0117, + "step": 175500 + }, + { + "epoch": 0.335693359375, + "grad_norm": 245.8326416015625, + "learning_rate": 3.3215427398681645e-05, + "lookahead_loss": 8.141062163352967, + "loss": 5.9791, + "step": 176000 + }, + { + "epoch": 0.33664703369140625, + "grad_norm": 186.81982421875, + "learning_rate": 3.316774368286133e-05, + "lookahead_loss": 8.138561144828797, + "loss": 5.9359, + "step": 176500 + }, + { + "epoch": 0.3376007080078125, + "grad_norm": 132.71649169921875, + "learning_rate": 3.312005996704102e-05, + "lookahead_loss": 8.40067853164673, + "loss": 5.9446, + "step": 177000 + }, + { + "epoch": 0.33855438232421875, + "grad_norm": 191.45095825195312, + "learning_rate": 3.30723762512207e-05, + "lookahead_loss": 8.193184445381165, + "loss": 5.9047, + "step": 177500 + }, + { + "epoch": 0.339508056640625, + "grad_norm": 81.00851440429688, + "learning_rate": 3.302469253540039e-05, + "lookahead_loss": 7.954344306945801, + "loss": 5.8553, + "step": 178000 + }, + { + "epoch": 0.34046173095703125, + "grad_norm": 158.39599609375, + "learning_rate": 3.297700881958008e-05, + "lookahead_loss": 7.938361072063446, + "loss": 5.849, + "step": 178500 + }, + { + "epoch": 0.3414154052734375, + "grad_norm": 110.21147155761719, + "learning_rate": 3.2929325103759766e-05, + "lookahead_loss": 7.959272700309754, + "loss": 5.8956, + "step": 179000 + }, + { + "epoch": 0.34236907958984375, + "grad_norm": 106.14759826660156, + "learning_rate": 3.2881641387939456e-05, + "lookahead_loss": 8.296606355667114, + "loss": 5.9792, + "step": 179500 + }, + { + "epoch": 0.34332275390625, + "grad_norm": 142.65284729003906, + "learning_rate": 3.283395767211914e-05, + "lookahead_loss": 8.36677256679535, + "loss": 6.0324, + "step": 180000 + }, + { + "epoch": 0.34332275390625, + "eval_accuracy": 0.04264794520547945, + "eval_lookahead_loss": 8.120613956260682, + "eval_lookahead_perplexity": 3363.0848983937512, + "eval_loss": 5.795071125030518, + "eval_perplexity": 328.6755602098744, + "eval_runtime": 504.9787, + "eval_samples_per_second": 19.803, + "eval_steps_per_second": 4.951, + "step": 180000 + }, + { + "epoch": 0.34427642822265625, + "grad_norm": 156.7557373046875, + "learning_rate": 3.278627395629883e-05, + "lookahead_loss": 8.017733713150024, + "loss": 5.9868, + "step": 180500 + }, + { + "epoch": 0.3452301025390625, + "grad_norm": 386.0038757324219, + "learning_rate": 3.273859024047852e-05, + "lookahead_loss": 8.118571619987488, + "loss": 6.0164, + "step": 181000 + }, + { + "epoch": 0.34618377685546875, + "grad_norm": 71.98054504394531, + "learning_rate": 3.26909065246582e-05, + "lookahead_loss": 8.162414593696594, + "loss": 5.9599, + "step": 181500 + }, + { + "epoch": 0.347137451171875, + "grad_norm": 233.21678161621094, + "learning_rate": 3.2643222808837893e-05, + "lookahead_loss": 7.9632285213470455, + "loss": 5.9319, + "step": 182000 + }, + { + "epoch": 0.34809112548828125, + "grad_norm": 131.25079345703125, + "learning_rate": 3.259553909301758e-05, + "lookahead_loss": 7.9603827228546145, + "loss": 5.9593, + "step": 182500 + }, + { + "epoch": 0.3490447998046875, + "grad_norm": 142.6866912841797, + "learning_rate": 3.254785537719727e-05, + "lookahead_loss": 7.958676391601562, + "loss": 5.9428, + "step": 183000 + }, + { + "epoch": 0.34999847412109375, + "grad_norm": 117.02886962890625, + "learning_rate": 3.250017166137696e-05, + "lookahead_loss": 7.801533300399781, + "loss": 5.9232, + "step": 183500 + }, + { + "epoch": 0.3509521484375, + "grad_norm": 102.8035888671875, + "learning_rate": 3.245248794555664e-05, + "lookahead_loss": 7.992493821144104, + "loss": 5.9632, + "step": 184000 + }, + { + "epoch": 0.35190582275390625, + "grad_norm": 240.892333984375, + "learning_rate": 3.240480422973633e-05, + "lookahead_loss": 7.938229041099548, + "loss": 5.9282, + "step": 184500 + }, + { + "epoch": 0.3528594970703125, + "grad_norm": 365.2929992675781, + "learning_rate": 3.2357120513916014e-05, + "lookahead_loss": 7.982856956481934, + "loss": 5.9459, + "step": 185000 + }, + { + "epoch": 0.3528594970703125, + "eval_accuracy": 0.04211565557729941, + "eval_lookahead_loss": 7.819290181541443, + "eval_lookahead_perplexity": 2488.13865433715, + "eval_loss": 5.774978160858154, + "eval_perplexity": 322.13739943729433, + "eval_runtime": 491.9997, + "eval_samples_per_second": 20.325, + "eval_steps_per_second": 5.081, + "step": 185000 + }, + { + "epoch": 0.35381317138671875, + "grad_norm": 263.4913635253906, + "learning_rate": 3.2309436798095705e-05, + "lookahead_loss": 7.849170780181884, + "loss": 5.9645, + "step": 185500 + }, + { + "epoch": 0.354766845703125, + "grad_norm": 95.41636657714844, + "learning_rate": 3.2261753082275395e-05, + "lookahead_loss": 7.830999266624451, + "loss": 5.8938, + "step": 186000 + }, + { + "epoch": 0.35572052001953125, + "grad_norm": 310.38421630859375, + "learning_rate": 3.221406936645508e-05, + "lookahead_loss": 7.809786183357239, + "loss": 5.9187, + "step": 186500 + }, + { + "epoch": 0.3566741943359375, + "grad_norm": 148.20550537109375, + "learning_rate": 3.216638565063477e-05, + "lookahead_loss": 7.883802563667297, + "loss": 5.896, + "step": 187000 + }, + { + "epoch": 0.35762786865234375, + "grad_norm": 153.30296325683594, + "learning_rate": 3.211870193481445e-05, + "lookahead_loss": 7.84271118927002, + "loss": 5.915, + "step": 187500 + }, + { + "epoch": 0.35858154296875, + "grad_norm": 156.14598083496094, + "learning_rate": 3.207101821899414e-05, + "lookahead_loss": 7.892906747817993, + "loss": 5.9474, + "step": 188000 + }, + { + "epoch": 0.35953521728515625, + "grad_norm": 499.22296142578125, + "learning_rate": 3.202333450317383e-05, + "lookahead_loss": 7.846433094978332, + "loss": 5.8989, + "step": 188500 + }, + { + "epoch": 0.3604888916015625, + "grad_norm": 398.5104064941406, + "learning_rate": 3.1975650787353516e-05, + "lookahead_loss": 7.953180156707764, + "loss": 5.9663, + "step": 189000 + }, + { + "epoch": 0.36144256591796875, + "grad_norm": 159.6160888671875, + "learning_rate": 3.1927967071533206e-05, + "lookahead_loss": 8.042246521949767, + "loss": 5.8625, + "step": 189500 + }, + { + "epoch": 0.362396240234375, + "grad_norm": 278.585693359375, + "learning_rate": 3.188028335571289e-05, + "lookahead_loss": 7.98437483215332, + "loss": 5.9077, + "step": 190000 + }, + { + "epoch": 0.362396240234375, + "eval_accuracy": 0.04195636007827789, + "eval_lookahead_loss": 7.856213401412964, + "eval_lookahead_perplexity": 2581.7258836281867, + "eval_loss": 5.772352695465088, + "eval_perplexity": 321.29274812983437, + "eval_runtime": 506.1007, + "eval_samples_per_second": 19.759, + "eval_steps_per_second": 4.94, + "step": 190000 + }, + { + "epoch": 0.36334991455078125, + "grad_norm": 778.6317138671875, + "learning_rate": 3.183259963989258e-05, + "lookahead_loss": 7.908500896453857, + "loss": 5.9352, + "step": 190500 + }, + { + "epoch": 0.3643035888671875, + "grad_norm": 257.8266296386719, + "learning_rate": 3.178491592407227e-05, + "lookahead_loss": 7.925882682800293, + "loss": 5.9671, + "step": 191000 + }, + { + "epoch": 0.36525726318359375, + "grad_norm": 97.79209899902344, + "learning_rate": 3.173723220825195e-05, + "lookahead_loss": 7.9774689893722535, + "loss": 5.9397, + "step": 191500 + }, + { + "epoch": 0.3662109375, + "grad_norm": 152.8977508544922, + "learning_rate": 3.1689548492431643e-05, + "lookahead_loss": 7.901604917526245, + "loss": 5.8894, + "step": 192000 + }, + { + "epoch": 0.36716461181640625, + "grad_norm": 108.49420166015625, + "learning_rate": 3.164186477661133e-05, + "lookahead_loss": 7.9163757438659665, + "loss": 5.8183, + "step": 192500 + }, + { + "epoch": 0.3681182861328125, + "grad_norm": 143.95188903808594, + "learning_rate": 3.159418106079102e-05, + "lookahead_loss": 7.77419694519043, + "loss": 5.8051, + "step": 193000 + }, + { + "epoch": 0.36907196044921875, + "grad_norm": 150.1779327392578, + "learning_rate": 3.154649734497071e-05, + "lookahead_loss": 7.756695732116699, + "loss": 5.87, + "step": 193500 + }, + { + "epoch": 0.370025634765625, + "grad_norm": 63.75507354736328, + "learning_rate": 3.149881362915039e-05, + "lookahead_loss": 7.828515367031097, + "loss": 5.8915, + "step": 194000 + }, + { + "epoch": 0.37097930908203125, + "grad_norm": 64.07188415527344, + "learning_rate": 3.145112991333008e-05, + "lookahead_loss": 7.92230148601532, + "loss": 5.9638, + "step": 194500 + }, + { + "epoch": 0.3719329833984375, + "grad_norm": 142.73704528808594, + "learning_rate": 3.1403446197509764e-05, + "lookahead_loss": 7.858141320228577, + "loss": 5.983, + "step": 195000 + }, + { + "epoch": 0.3719329833984375, + "eval_accuracy": 0.04148277886497065, + "eval_lookahead_loss": 7.871766919898987, + "eval_lookahead_perplexity": 2622.1947053474364, + "eval_loss": 5.762899875640869, + "eval_perplexity": 318.26993523256425, + "eval_runtime": 508.3909, + "eval_samples_per_second": 19.67, + "eval_steps_per_second": 4.917, + "step": 195000 + }, + { + "epoch": 0.37288665771484375, + "grad_norm": 103.4684066772461, + "learning_rate": 3.1355762481689455e-05, + "lookahead_loss": 7.987927829742431, + "loss": 5.92, + "step": 195500 + }, + { + "epoch": 0.37384033203125, + "grad_norm": 80.4505386352539, + "learning_rate": 3.1308078765869145e-05, + "lookahead_loss": 8.31801579284668, + "loss": 5.9799, + "step": 196000 + }, + { + "epoch": 0.37479400634765625, + "grad_norm": 352.4549865722656, + "learning_rate": 3.126039505004883e-05, + "lookahead_loss": 7.914292486190796, + "loss": 5.9375, + "step": 196500 + }, + { + "epoch": 0.3757476806640625, + "grad_norm": 156.81256103515625, + "learning_rate": 3.121271133422852e-05, + "lookahead_loss": 7.876584443092346, + "loss": 5.9211, + "step": 197000 + }, + { + "epoch": 0.37670135498046875, + "grad_norm": 215.19418334960938, + "learning_rate": 3.11650276184082e-05, + "lookahead_loss": 8.027199938774109, + "loss": 5.9769, + "step": 197500 + }, + { + "epoch": 0.377655029296875, + "grad_norm": 134.86676025390625, + "learning_rate": 3.111734390258789e-05, + "lookahead_loss": 8.137993032455444, + "loss": 5.9332, + "step": 198000 + }, + { + "epoch": 0.37860870361328125, + "grad_norm": 137.59007263183594, + "learning_rate": 3.106966018676758e-05, + "lookahead_loss": 7.937407537460327, + "loss": 5.9746, + "step": 198500 + }, + { + "epoch": 0.3795623779296875, + "grad_norm": 128.06895446777344, + "learning_rate": 3.1021976470947266e-05, + "lookahead_loss": 7.8242775764465335, + "loss": 5.9498, + "step": 199000 + }, + { + "epoch": 0.38051605224609375, + "grad_norm": 219.2415008544922, + "learning_rate": 3.0974292755126956e-05, + "lookahead_loss": 7.920266454696655, + "loss": 5.9506, + "step": 199500 + }, + { + "epoch": 0.3814697265625, + "grad_norm": 233.41378784179688, + "learning_rate": 3.092660903930664e-05, + "lookahead_loss": 8.106398473739624, + "loss": 5.9019, + "step": 200000 + }, + { + "epoch": 0.3814697265625, + "eval_accuracy": 0.040383561643835615, + "eval_lookahead_loss": 8.069347219944, + "eval_lookahead_perplexity": 3195.01550579553, + "eval_loss": 5.754023551940918, + "eval_perplexity": 315.457369352675, + "eval_runtime": 511.3541, + "eval_samples_per_second": 19.556, + "eval_steps_per_second": 4.889, + "step": 200000 + }, + { + "epoch": 0.38242340087890625, + "grad_norm": 158.30667114257812, + "learning_rate": 3.087892532348633e-05, + "lookahead_loss": 8.150268076896667, + "loss": 5.8639, + "step": 200500 + }, + { + "epoch": 0.3833770751953125, + "grad_norm": 122.84638214111328, + "learning_rate": 3.083124160766602e-05, + "lookahead_loss": 8.00338013458252, + "loss": 5.9156, + "step": 201000 + }, + { + "epoch": 0.38433074951171875, + "grad_norm": 192.3455810546875, + "learning_rate": 3.07835578918457e-05, + "lookahead_loss": 7.905283679008484, + "loss": 5.9312, + "step": 201500 + }, + { + "epoch": 0.385284423828125, + "grad_norm": 115.64942169189453, + "learning_rate": 3.0735874176025393e-05, + "lookahead_loss": 7.997805705070496, + "loss": 5.9367, + "step": 202000 + }, + { + "epoch": 0.38623809814453125, + "grad_norm": 168.84153747558594, + "learning_rate": 3.068819046020508e-05, + "lookahead_loss": 7.976878279685974, + "loss": 5.8912, + "step": 202500 + }, + { + "epoch": 0.3871917724609375, + "grad_norm": 77.6719741821289, + "learning_rate": 3.064050674438477e-05, + "lookahead_loss": 7.974493411064148, + "loss": 5.9187, + "step": 203000 + }, + { + "epoch": 0.38814544677734375, + "grad_norm": 121.06427764892578, + "learning_rate": 3.059282302856446e-05, + "lookahead_loss": 7.975774022102356, + "loss": 5.9053, + "step": 203500 + }, + { + "epoch": 0.38909912109375, + "grad_norm": 70.0561752319336, + "learning_rate": 3.054513931274414e-05, + "lookahead_loss": 7.915553599357605, + "loss": 5.9096, + "step": 204000 + }, + { + "epoch": 0.39005279541015625, + "grad_norm": 131.3404541015625, + "learning_rate": 3.049745559692383e-05, + "lookahead_loss": 8.067669066429138, + "loss": 5.8714, + "step": 204500 + }, + { + "epoch": 0.3910064697265625, + "grad_norm": 161.99171447753906, + "learning_rate": 3.0449771881103518e-05, + "lookahead_loss": 7.802521809577942, + "loss": 5.8664, + "step": 205000 + }, + { + "epoch": 0.3910064697265625, + "eval_accuracy": 0.041352446183953034, + "eval_lookahead_loss": 7.861547491264343, + "eval_lookahead_perplexity": 2595.533835150904, + "eval_loss": 5.740807056427002, + "eval_perplexity": 311.31555884510385, + "eval_runtime": 507.9446, + "eval_samples_per_second": 19.687, + "eval_steps_per_second": 4.922, + "step": 205000 + }, + { + "epoch": 0.39196014404296875, + "grad_norm": 197.28778076171875, + "learning_rate": 3.0402088165283205e-05, + "lookahead_loss": 7.90296000957489, + "loss": 5.8657, + "step": 205500 + }, + { + "epoch": 0.392913818359375, + "grad_norm": 170.33383178710938, + "learning_rate": 3.035440444946289e-05, + "lookahead_loss": 7.931079836845398, + "loss": 5.8714, + "step": 206000 + }, + { + "epoch": 0.39386749267578125, + "grad_norm": 86.03404235839844, + "learning_rate": 3.0306720733642578e-05, + "lookahead_loss": 7.8958673601150515, + "loss": 5.868, + "step": 206500 + }, + { + "epoch": 0.3948211669921875, + "grad_norm": 93.6749267578125, + "learning_rate": 3.025903701782227e-05, + "lookahead_loss": 7.910998147964477, + "loss": 5.8729, + "step": 207000 + }, + { + "epoch": 0.39577484130859375, + "grad_norm": 216.99295043945312, + "learning_rate": 3.0211353302001955e-05, + "lookahead_loss": 7.858925893783569, + "loss": 5.828, + "step": 207500 + }, + { + "epoch": 0.396728515625, + "grad_norm": 103.09634399414062, + "learning_rate": 3.0163669586181642e-05, + "lookahead_loss": 7.82588318157196, + "loss": 5.8121, + "step": 208000 + }, + { + "epoch": 0.39768218994140625, + "grad_norm": 191.628173828125, + "learning_rate": 3.011598587036133e-05, + "lookahead_loss": 7.758401550292969, + "loss": 5.8055, + "step": 208500 + }, + { + "epoch": 0.3986358642578125, + "grad_norm": 135.73814392089844, + "learning_rate": 3.0068302154541016e-05, + "lookahead_loss": 7.925666232109069, + "loss": 5.8651, + "step": 209000 + }, + { + "epoch": 0.39958953857421875, + "grad_norm": 499.8132629394531, + "learning_rate": 3.0020618438720706e-05, + "lookahead_loss": 8.071907279014587, + "loss": 5.9279, + "step": 209500 + }, + { + "epoch": 0.400543212890625, + "grad_norm": 267.57904052734375, + "learning_rate": 2.9972934722900393e-05, + "lookahead_loss": 7.945118926048279, + "loss": 5.9601, + "step": 210000 + }, + { + "epoch": 0.400543212890625, + "eval_accuracy": 0.04195088062622309, + "eval_lookahead_loss": 7.91679793586731, + "eval_lookahead_perplexity": 2742.9737905674015, + "eval_loss": 5.723841190338135, + "eval_perplexity": 306.07837308083026, + "eval_runtime": 537.9695, + "eval_samples_per_second": 18.588, + "eval_steps_per_second": 4.647, + "step": 210000 + }, + { + "epoch": 0.40149688720703125, + "grad_norm": 159.069580078125, + "learning_rate": 2.992525100708008e-05, + "lookahead_loss": 7.96452858543396, + "loss": 5.9025, + "step": 210500 + }, + { + "epoch": 0.4024505615234375, + "grad_norm": 301.83551025390625, + "learning_rate": 2.9877567291259766e-05, + "lookahead_loss": 7.934555889129639, + "loss": 5.871, + "step": 211000 + }, + { + "epoch": 0.40340423583984375, + "grad_norm": 177.71194458007812, + "learning_rate": 2.9829883575439453e-05, + "lookahead_loss": 8.067665080070496, + "loss": 5.9339, + "step": 211500 + }, + { + "epoch": 0.40435791015625, + "grad_norm": 158.5361328125, + "learning_rate": 2.9782199859619143e-05, + "lookahead_loss": 8.103134846687317, + "loss": 5.8619, + "step": 212000 + }, + { + "epoch": 0.40531158447265625, + "grad_norm": 169.25062561035156, + "learning_rate": 2.973451614379883e-05, + "lookahead_loss": 7.851924914360047, + "loss": 5.8463, + "step": 212500 + }, + { + "epoch": 0.4062652587890625, + "grad_norm": 275.23846435546875, + "learning_rate": 2.9686832427978517e-05, + "lookahead_loss": 7.910287003517151, + "loss": 5.8877, + "step": 213000 + }, + { + "epoch": 0.40721893310546875, + "grad_norm": 214.32382202148438, + "learning_rate": 2.9639148712158204e-05, + "lookahead_loss": 7.96638767528534, + "loss": 5.9324, + "step": 213500 + }, + { + "epoch": 0.408172607421875, + "grad_norm": 203.58135986328125, + "learning_rate": 2.959146499633789e-05, + "lookahead_loss": 8.02168957901001, + "loss": 5.9319, + "step": 214000 + }, + { + "epoch": 0.40912628173828125, + "grad_norm": 308.3120422363281, + "learning_rate": 2.954378128051758e-05, + "lookahead_loss": 7.987469184875488, + "loss": 5.8745, + "step": 214500 + }, + { + "epoch": 0.4100799560546875, + "grad_norm": 96.87218475341797, + "learning_rate": 2.9496097564697268e-05, + "lookahead_loss": 8.049342062950135, + "loss": 5.8749, + "step": 215000 + }, + { + "epoch": 0.4100799560546875, + "eval_accuracy": 0.04139334637964775, + "eval_lookahead_loss": 7.984254623508454, + "eval_lookahead_perplexity": 2934.3892643596846, + "eval_loss": 5.724365234375, + "eval_perplexity": 306.23881366235, + "eval_runtime": 516.2209, + "eval_samples_per_second": 19.372, + "eval_steps_per_second": 4.843, + "step": 215000 + }, + { + "epoch": 0.41103363037109375, + "grad_norm": 56.527061462402344, + "learning_rate": 2.9448413848876955e-05, + "lookahead_loss": 7.985482745170593, + "loss": 5.8625, + "step": 215500 + }, + { + "epoch": 0.4119873046875, + "grad_norm": 124.53937530517578, + "learning_rate": 2.940073013305664e-05, + "lookahead_loss": 8.11141568851471, + "loss": 5.878, + "step": 216000 + }, + { + "epoch": 0.41294097900390625, + "grad_norm": 67.76485443115234, + "learning_rate": 2.9353046417236328e-05, + "lookahead_loss": 7.951837394714356, + "loss": 5.8926, + "step": 216500 + }, + { + "epoch": 0.4138946533203125, + "grad_norm": 107.74806213378906, + "learning_rate": 2.930536270141602e-05, + "lookahead_loss": 7.8407617206573486, + "loss": 5.8511, + "step": 217000 + }, + { + "epoch": 0.41484832763671875, + "grad_norm": 155.94854736328125, + "learning_rate": 2.9257678985595705e-05, + "lookahead_loss": 7.946266674041748, + "loss": 5.7969, + "step": 217500 + }, + { + "epoch": 0.415802001953125, + "grad_norm": 80.70875549316406, + "learning_rate": 2.9209995269775392e-05, + "lookahead_loss": 7.866696907043457, + "loss": 5.8935, + "step": 218000 + }, + { + "epoch": 0.41675567626953125, + "grad_norm": 92.08131408691406, + "learning_rate": 2.916231155395508e-05, + "lookahead_loss": 7.830988502502441, + "loss": 5.8587, + "step": 218500 + }, + { + "epoch": 0.4177093505859375, + "grad_norm": 471.6579284667969, + "learning_rate": 2.9114627838134766e-05, + "lookahead_loss": 7.781314100265503, + "loss": 5.8768, + "step": 219000 + }, + { + "epoch": 0.41866302490234375, + "grad_norm": 446.3116149902344, + "learning_rate": 2.9066944122314456e-05, + "lookahead_loss": 7.896127394676208, + "loss": 5.8934, + "step": 219500 + }, + { + "epoch": 0.41961669921875, + "grad_norm": 123.6279067993164, + "learning_rate": 2.9019260406494143e-05, + "lookahead_loss": 7.748962701797486, + "loss": 5.8563, + "step": 220000 + }, + { + "epoch": 0.41961669921875, + "eval_accuracy": 0.04153972602739726, + "eval_lookahead_loss": 7.8041798466682435, + "eval_lookahead_perplexity": 2450.824669427835, + "eval_loss": 5.705533027648926, + "eval_perplexity": 300.52562583152474, + "eval_runtime": 505.0304, + "eval_samples_per_second": 19.801, + "eval_steps_per_second": 4.95, + "step": 220000 + }, + { + "epoch": 0.42057037353515625, + "grad_norm": 67.4626235961914, + "learning_rate": 2.897157669067383e-05, + "lookahead_loss": 7.983177913665772, + "loss": 5.8489, + "step": 220500 + }, + { + "epoch": 0.4215240478515625, + "grad_norm": 135.04417419433594, + "learning_rate": 2.8923892974853516e-05, + "lookahead_loss": 7.96869972038269, + "loss": 5.8434, + "step": 221000 + }, + { + "epoch": 0.42247772216796875, + "grad_norm": 98.8983383178711, + "learning_rate": 2.8876209259033203e-05, + "lookahead_loss": 7.941053133010865, + "loss": 5.8245, + "step": 221500 + }, + { + "epoch": 0.423431396484375, + "grad_norm": 499.2038269042969, + "learning_rate": 2.8828525543212893e-05, + "lookahead_loss": 7.911164036750794, + "loss": 5.7912, + "step": 222000 + }, + { + "epoch": 0.42438507080078125, + "grad_norm": 152.81951904296875, + "learning_rate": 2.878084182739258e-05, + "lookahead_loss": 7.7165368480682375, + "loss": 5.795, + "step": 222500 + }, + { + "epoch": 0.4253387451171875, + "grad_norm": 74.57410430908203, + "learning_rate": 2.8733158111572267e-05, + "lookahead_loss": 7.8263488864898685, + "loss": 5.8204, + "step": 223000 + }, + { + "epoch": 0.42629241943359375, + "grad_norm": 143.47433471679688, + "learning_rate": 2.8685474395751954e-05, + "lookahead_loss": 7.833469186782837, + "loss": 5.8978, + "step": 223500 + }, + { + "epoch": 0.42724609375, + "grad_norm": 54.15015411376953, + "learning_rate": 2.863779067993164e-05, + "lookahead_loss": 7.8595253925323485, + "loss": 5.8883, + "step": 224000 + }, + { + "epoch": 0.42819976806640625, + "grad_norm": 88.5059585571289, + "learning_rate": 2.859010696411133e-05, + "lookahead_loss": 7.931483741760254, + "loss": 5.913, + "step": 224500 + }, + { + "epoch": 0.4291534423828125, + "grad_norm": 149.3416748046875, + "learning_rate": 2.8542423248291018e-05, + "lookahead_loss": 7.9268950681686405, + "loss": 5.9083, + "step": 225000 + }, + { + "epoch": 0.4291534423828125, + "eval_accuracy": 0.041299608610567515, + "eval_lookahead_loss": 7.753063847732544, + "eval_lookahead_perplexity": 2328.6962666156755, + "eval_loss": 5.6959075927734375, + "eval_perplexity": 297.6468131303132, + "eval_runtime": 543.4848, + "eval_samples_per_second": 18.4, + "eval_steps_per_second": 4.6, + "step": 225000 + }, + { + "epoch": 0.43010711669921875, + "grad_norm": 177.7567901611328, + "learning_rate": 2.8494739532470705e-05, + "lookahead_loss": 7.793448933601379, + "loss": 5.8512, + "step": 225500 + }, + { + "epoch": 0.431060791015625, + "grad_norm": 117.91618347167969, + "learning_rate": 2.844705581665039e-05, + "lookahead_loss": 7.797062184333801, + "loss": 5.8499, + "step": 226000 + }, + { + "epoch": 0.43201446533203125, + "grad_norm": 166.80540466308594, + "learning_rate": 2.8399372100830078e-05, + "lookahead_loss": 8.055712942123414, + "loss": 5.9394, + "step": 226500 + }, + { + "epoch": 0.4329681396484375, + "grad_norm": 71.64677429199219, + "learning_rate": 2.835168838500977e-05, + "lookahead_loss": 8.059963097095489, + "loss": 5.8342, + "step": 227000 + }, + { + "epoch": 0.43392181396484375, + "grad_norm": 73.85379028320312, + "learning_rate": 2.8304004669189455e-05, + "lookahead_loss": 7.801866370677948, + "loss": 5.8752, + "step": 227500 + }, + { + "epoch": 0.43487548828125, + "grad_norm": 56.55813980102539, + "learning_rate": 2.8256320953369142e-05, + "lookahead_loss": 7.839802268028259, + "loss": 5.8201, + "step": 228000 + }, + { + "epoch": 0.43582916259765625, + "grad_norm": 81.2064208984375, + "learning_rate": 2.820863723754883e-05, + "lookahead_loss": 7.833906353950501, + "loss": 5.8929, + "step": 228500 + }, + { + "epoch": 0.4367828369140625, + "grad_norm": 53.52895736694336, + "learning_rate": 2.8160953521728516e-05, + "lookahead_loss": 7.848618391990661, + "loss": 5.844, + "step": 229000 + }, + { + "epoch": 0.43773651123046875, + "grad_norm": 114.41093444824219, + "learning_rate": 2.8113269805908206e-05, + "lookahead_loss": 7.861288282394409, + "loss": 5.825, + "step": 229500 + }, + { + "epoch": 0.438690185546875, + "grad_norm": 143.2441864013672, + "learning_rate": 2.8065586090087893e-05, + "lookahead_loss": 7.9216024770736695, + "loss": 5.8473, + "step": 230000 + }, + { + "epoch": 0.438690185546875, + "eval_accuracy": 0.04154383561643835, + "eval_lookahead_loss": 7.717986519050598, + "eval_lookahead_perplexity": 2248.42785317787, + "eval_loss": 5.677084445953369, + "eval_perplexity": 292.0965639192908, + "eval_runtime": 529.6013, + "eval_samples_per_second": 18.882, + "eval_steps_per_second": 4.721, + "step": 230000 + }, + { + "epoch": 0.43964385986328125, + "grad_norm": 155.46763610839844, + "learning_rate": 2.801790237426758e-05, + "lookahead_loss": 7.823653196334839, + "loss": 5.8482, + "step": 230500 + }, + { + "epoch": 0.4405975341796875, + "grad_norm": 130.3101348876953, + "learning_rate": 2.7970218658447266e-05, + "lookahead_loss": 7.919434549331665, + "loss": 5.7929, + "step": 231000 + }, + { + "epoch": 0.44155120849609375, + "grad_norm": 108.69844055175781, + "learning_rate": 2.7922534942626953e-05, + "lookahead_loss": 8.0193756980896, + "loss": 5.8608, + "step": 231500 + }, + { + "epoch": 0.4425048828125, + "grad_norm": 83.94735717773438, + "learning_rate": 2.7874851226806643e-05, + "lookahead_loss": 8.104357471466065, + "loss": 5.8443, + "step": 232000 + }, + { + "epoch": 0.44345855712890625, + "grad_norm": 77.8570785522461, + "learning_rate": 2.782716751098633e-05, + "lookahead_loss": 8.424842554092407, + "loss": 5.8548, + "step": 232500 + }, + { + "epoch": 0.4444122314453125, + "grad_norm": 151.28237915039062, + "learning_rate": 2.7779483795166017e-05, + "lookahead_loss": 7.99598745584488, + "loss": 5.8499, + "step": 233000 + }, + { + "epoch": 0.44536590576171875, + "grad_norm": 109.88167572021484, + "learning_rate": 2.7731800079345704e-05, + "lookahead_loss": 7.959760961532592, + "loss": 5.8062, + "step": 233500 + }, + { + "epoch": 0.446319580078125, + "grad_norm": 171.0165557861328, + "learning_rate": 2.768411636352539e-05, + "lookahead_loss": 7.911510745048523, + "loss": 5.8566, + "step": 234000 + }, + { + "epoch": 0.44727325439453125, + "grad_norm": 81.17371368408203, + "learning_rate": 2.763643264770508e-05, + "lookahead_loss": 7.937608414649963, + "loss": 5.8541, + "step": 234500 + }, + { + "epoch": 0.4482269287109375, + "grad_norm": 60.57700729370117, + "learning_rate": 2.7588748931884768e-05, + "lookahead_loss": 8.030696634292603, + "loss": 5.9021, + "step": 235000 + }, + { + "epoch": 0.4482269287109375, + "eval_accuracy": 0.04052563600782779, + "eval_lookahead_loss": 7.768605569458008, + "eval_lookahead_perplexity": 2365.1709212541596, + "eval_loss": 5.701207637786865, + "eval_perplexity": 299.2285425539532, + "eval_runtime": 505.6508, + "eval_samples_per_second": 19.776, + "eval_steps_per_second": 4.944, + "step": 235000 + }, + { + "epoch": 0.44918060302734375, + "grad_norm": 118.71009063720703, + "learning_rate": 2.7541065216064455e-05, + "lookahead_loss": 7.860527423858643, + "loss": 5.8575, + "step": 235500 + }, + { + "epoch": 0.45013427734375, + "grad_norm": 68.0650634765625, + "learning_rate": 2.749338150024414e-05, + "lookahead_loss": 7.873574114799499, + "loss": 5.8003, + "step": 236000 + }, + { + "epoch": 0.45108795166015625, + "grad_norm": 107.36117553710938, + "learning_rate": 2.7445697784423828e-05, + "lookahead_loss": 7.808487013816833, + "loss": 5.8215, + "step": 236500 + }, + { + "epoch": 0.4520416259765625, + "grad_norm": 133.5001220703125, + "learning_rate": 2.739801406860352e-05, + "lookahead_loss": 7.7671851272583, + "loss": 5.7891, + "step": 237000 + }, + { + "epoch": 0.45299530029296875, + "grad_norm": 66.27991485595703, + "learning_rate": 2.7350330352783205e-05, + "lookahead_loss": 7.9055367374420165, + "loss": 5.7832, + "step": 237500 + }, + { + "epoch": 0.453948974609375, + "grad_norm": 72.4911880493164, + "learning_rate": 2.7302646636962892e-05, + "lookahead_loss": 8.096063158988953, + "loss": 5.8939, + "step": 238000 + }, + { + "epoch": 0.45490264892578125, + "grad_norm": 57.94518280029297, + "learning_rate": 2.725496292114258e-05, + "lookahead_loss": 7.995163286209107, + "loss": 5.8627, + "step": 238500 + }, + { + "epoch": 0.4558563232421875, + "grad_norm": 74.75439453125, + "learning_rate": 2.7207279205322266e-05, + "lookahead_loss": 7.93011399936676, + "loss": 5.8687, + "step": 239000 + }, + { + "epoch": 0.45680999755859375, + "grad_norm": 227.26290893554688, + "learning_rate": 2.7159595489501956e-05, + "lookahead_loss": 7.961362115383148, + "loss": 5.8347, + "step": 239500 + }, + { + "epoch": 0.457763671875, + "grad_norm": 240.6214599609375, + "learning_rate": 2.7111911773681643e-05, + "lookahead_loss": 7.884561901092529, + "loss": 5.7837, + "step": 240000 + }, + { + "epoch": 0.457763671875, + "eval_accuracy": 0.04099373776908023, + "eval_lookahead_loss": 7.785055087089539, + "eval_lookahead_perplexity": 2404.398595649355, + "eval_loss": 5.656895160675049, + "eval_perplexity": 286.25847478758993, + "eval_runtime": 555.044, + "eval_samples_per_second": 18.017, + "eval_steps_per_second": 4.504, + "step": 240000 + }, + { + "epoch": 0.45871734619140625, + "grad_norm": 115.07328033447266, + "learning_rate": 2.706422805786133e-05, + "lookahead_loss": 7.9939801845550535, + "loss": 5.8357, + "step": 240500 + }, + { + "epoch": 0.4596710205078125, + "grad_norm": 96.98252868652344, + "learning_rate": 2.7016544342041016e-05, + "lookahead_loss": 7.892537168502807, + "loss": 5.8506, + "step": 241000 + }, + { + "epoch": 0.46062469482421875, + "grad_norm": 83.13056182861328, + "learning_rate": 2.6968860626220703e-05, + "lookahead_loss": 7.9084257860183715, + "loss": 5.8982, + "step": 241500 + }, + { + "epoch": 0.461578369140625, + "grad_norm": 196.73995971679688, + "learning_rate": 2.6921176910400393e-05, + "lookahead_loss": 7.848887843132019, + "loss": 5.8385, + "step": 242000 + }, + { + "epoch": 0.46253204345703125, + "grad_norm": 69.3080062866211, + "learning_rate": 2.687349319458008e-05, + "lookahead_loss": 7.837017350196838, + "loss": 5.8259, + "step": 242500 + }, + { + "epoch": 0.4634857177734375, + "grad_norm": 119.86585998535156, + "learning_rate": 2.6825809478759767e-05, + "lookahead_loss": 7.855517855644226, + "loss": 5.8397, + "step": 243000 + }, + { + "epoch": 0.46443939208984375, + "grad_norm": 125.81604766845703, + "learning_rate": 2.6778125762939454e-05, + "lookahead_loss": 7.8049629106521605, + "loss": 5.842, + "step": 243500 + }, + { + "epoch": 0.46539306640625, + "grad_norm": 108.61225128173828, + "learning_rate": 2.673044204711914e-05, + "lookahead_loss": 7.883237821578979, + "loss": 5.7949, + "step": 244000 + }, + { + "epoch": 0.46634674072265625, + "grad_norm": 64.97001647949219, + "learning_rate": 2.668275833129883e-05, + "lookahead_loss": 7.843094090461731, + "loss": 5.764, + "step": 244500 + }, + { + "epoch": 0.4673004150390625, + "grad_norm": 47.53362274169922, + "learning_rate": 2.6635074615478518e-05, + "lookahead_loss": 7.7077200756073, + "loss": 5.8015, + "step": 245000 + }, + { + "epoch": 0.4673004150390625, + "eval_accuracy": 0.0404146771037182, + "eval_lookahead_loss": 7.701358358764648, + "eval_lookahead_perplexity": 2211.3497590141683, + "eval_loss": 5.656283378601074, + "eval_perplexity": 286.0834005432942, + "eval_runtime": 498.6256, + "eval_samples_per_second": 20.055, + "eval_steps_per_second": 5.014, + "step": 245000 + }, + { + "epoch": 0.46825408935546875, + "grad_norm": 114.15084075927734, + "learning_rate": 2.6587390899658205e-05, + "lookahead_loss": 7.874519399642944, + "loss": 5.8027, + "step": 245500 + }, + { + "epoch": 0.469207763671875, + "grad_norm": 119.80757904052734, + "learning_rate": 2.653970718383789e-05, + "lookahead_loss": 7.989437255859375, + "loss": 5.7454, + "step": 246000 + }, + { + "epoch": 0.47016143798828125, + "grad_norm": 62.146060943603516, + "learning_rate": 2.6492023468017578e-05, + "lookahead_loss": 7.912638045310974, + "loss": 5.7694, + "step": 246500 + }, + { + "epoch": 0.4711151123046875, + "grad_norm": 101.16262817382812, + "learning_rate": 2.644433975219727e-05, + "lookahead_loss": 7.873991565704346, + "loss": 5.8134, + "step": 247000 + }, + { + "epoch": 0.47206878662109375, + "grad_norm": 80.39459228515625, + "learning_rate": 2.6396656036376955e-05, + "lookahead_loss": 7.925439959526062, + "loss": 5.7773, + "step": 247500 + }, + { + "epoch": 0.4730224609375, + "grad_norm": 411.4380798339844, + "learning_rate": 2.6348972320556642e-05, + "lookahead_loss": 7.806881208419799, + "loss": 5.8242, + "step": 248000 + }, + { + "epoch": 0.47397613525390625, + "grad_norm": 143.83450317382812, + "learning_rate": 2.630128860473633e-05, + "lookahead_loss": 7.881622109413147, + "loss": 5.7905, + "step": 248500 + }, + { + "epoch": 0.4749298095703125, + "grad_norm": 93.68058776855469, + "learning_rate": 2.6253604888916016e-05, + "lookahead_loss": 7.877734619140625, + "loss": 5.8267, + "step": 249000 + }, + { + "epoch": 0.47588348388671875, + "grad_norm": 76.90664672851562, + "learning_rate": 2.6205921173095706e-05, + "lookahead_loss": 7.865545108795166, + "loss": 5.8338, + "step": 249500 + }, + { + "epoch": 0.476837158203125, + "grad_norm": 119.10668182373047, + "learning_rate": 2.6158237457275393e-05, + "lookahead_loss": 7.802703098297119, + "loss": 5.7699, + "step": 250000 + }, + { + "epoch": 0.476837158203125, + "eval_accuracy": 0.04194637964774951, + "eval_lookahead_loss": 7.693028386116028, + "eval_lookahead_perplexity": 2193.0057844796493, + "eval_loss": 5.6396589279174805, + "eval_perplexity": 281.36673576917934, + "eval_runtime": 494.1958, + "eval_samples_per_second": 20.235, + "eval_steps_per_second": 5.059, + "step": 250000 + }, + { + "epoch": 0.47779083251953125, + "grad_norm": 72.06342315673828, + "learning_rate": 2.611055374145508e-05, + "lookahead_loss": 7.628686081886292, + "loss": 5.7147, + "step": 250500 + }, + { + "epoch": 0.4787445068359375, + "grad_norm": 118.65154266357422, + "learning_rate": 2.6062870025634766e-05, + "lookahead_loss": 7.7573640880584716, + "loss": 5.7378, + "step": 251000 + }, + { + "epoch": 0.47969818115234375, + "grad_norm": 64.01678466796875, + "learning_rate": 2.6015186309814453e-05, + "lookahead_loss": 7.851820120811462, + "loss": 5.8004, + "step": 251500 + }, + { + "epoch": 0.48065185546875, + "grad_norm": 63.454105377197266, + "learning_rate": 2.5967502593994143e-05, + "lookahead_loss": 7.90937801361084, + "loss": 5.7689, + "step": 252000 + }, + { + "epoch": 0.48160552978515625, + "grad_norm": 91.39474487304688, + "learning_rate": 2.591981887817383e-05, + "lookahead_loss": 7.8844979343414305, + "loss": 5.8592, + "step": 252500 + }, + { + "epoch": 0.4825592041015625, + "grad_norm": 227.0882110595703, + "learning_rate": 2.5872135162353517e-05, + "lookahead_loss": 7.9112224798202515, + "loss": 5.8099, + "step": 253000 + }, + { + "epoch": 0.48351287841796875, + "grad_norm": 114.92095947265625, + "learning_rate": 2.5824451446533204e-05, + "lookahead_loss": 7.757169961929321, + "loss": 5.8211, + "step": 253500 + }, + { + "epoch": 0.484466552734375, + "grad_norm": 260.2445373535156, + "learning_rate": 2.577676773071289e-05, + "lookahead_loss": 7.853659277915955, + "loss": 5.7908, + "step": 254000 + }, + { + "epoch": 0.48542022705078125, + "grad_norm": 163.41412353515625, + "learning_rate": 2.572908401489258e-05, + "lookahead_loss": 7.8117708969116215, + "loss": 5.6839, + "step": 254500 + }, + { + "epoch": 0.4863739013671875, + "grad_norm": 55.95166015625, + "learning_rate": 2.5681400299072268e-05, + "lookahead_loss": 7.848519646644593, + "loss": 5.7673, + "step": 255000 + }, + { + "epoch": 0.4863739013671875, + "eval_accuracy": 0.04090626223091977, + "eval_lookahead_loss": 7.794912069320679, + "eval_lookahead_perplexity": 2428.215900417534, + "eval_loss": 5.625692844390869, + "eval_perplexity": 277.46445763555334, + "eval_runtime": 494.9555, + "eval_samples_per_second": 20.204, + "eval_steps_per_second": 5.051, + "step": 255000 + }, + { + "epoch": 0.48732757568359375, + "grad_norm": 90.4194564819336, + "learning_rate": 2.5633716583251955e-05, + "lookahead_loss": 7.8131186962127686, + "loss": 5.7934, + "step": 255500 + }, + { + "epoch": 0.48828125, + "grad_norm": 120.99221801757812, + "learning_rate": 2.558603286743164e-05, + "lookahead_loss": 7.8157203016281125, + "loss": 5.7739, + "step": 256000 + }, + { + "epoch": 0.48923492431640625, + "grad_norm": 137.9215850830078, + "learning_rate": 2.5538349151611328e-05, + "lookahead_loss": 7.758997138977051, + "loss": 5.7952, + "step": 256500 + }, + { + "epoch": 0.4901885986328125, + "grad_norm": 67.85360717773438, + "learning_rate": 2.549066543579102e-05, + "lookahead_loss": 7.810414762496948, + "loss": 5.7824, + "step": 257000 + }, + { + "epoch": 0.49114227294921875, + "grad_norm": 157.27708435058594, + "learning_rate": 2.5442981719970705e-05, + "lookahead_loss": 7.715693735122681, + "loss": 5.7723, + "step": 257500 + }, + { + "epoch": 0.492095947265625, + "grad_norm": 255.9970703125, + "learning_rate": 2.5395298004150392e-05, + "lookahead_loss": 7.890613670349121, + "loss": 5.7698, + "step": 258000 + }, + { + "epoch": 0.49304962158203125, + "grad_norm": 122.6175308227539, + "learning_rate": 2.534761428833008e-05, + "lookahead_loss": 7.8175912246704105, + "loss": 5.7728, + "step": 258500 + }, + { + "epoch": 0.4940032958984375, + "grad_norm": 73.59149169921875, + "learning_rate": 2.5299930572509766e-05, + "lookahead_loss": 7.953368329048157, + "loss": 5.7749, + "step": 259000 + }, + { + "epoch": 0.49495697021484375, + "grad_norm": 113.43846893310547, + "learning_rate": 2.5252246856689456e-05, + "lookahead_loss": 7.870643635749817, + "loss": 5.8221, + "step": 259500 + }, + { + "epoch": 0.49591064453125, + "grad_norm": 139.81082153320312, + "learning_rate": 2.5204563140869143e-05, + "lookahead_loss": 7.966851595878601, + "loss": 5.7342, + "step": 260000 + }, + { + "epoch": 0.49591064453125, + "eval_accuracy": 0.04089080234833659, + "eval_lookahead_loss": 7.775774960517883, + "eval_lookahead_perplexity": 2382.188687128395, + "eval_loss": 5.613644123077393, + "eval_perplexity": 274.14142505858143, + "eval_runtime": 496.176, + "eval_samples_per_second": 20.154, + "eval_steps_per_second": 5.039, + "step": 260000 + }, + { + "epoch": 0.49686431884765625, + "grad_norm": 101.70256805419922, + "learning_rate": 2.515687942504883e-05, + "lookahead_loss": 7.908554591178894, + "loss": 5.7856, + "step": 260500 + }, + { + "epoch": 0.4978179931640625, + "grad_norm": 154.6017303466797, + "learning_rate": 2.5109195709228516e-05, + "lookahead_loss": 7.912033013343811, + "loss": 5.7798, + "step": 261000 + }, + { + "epoch": 0.49877166748046875, + "grad_norm": 70.03165435791016, + "learning_rate": 2.5061511993408203e-05, + "lookahead_loss": 7.7956780214309696, + "loss": 5.7771, + "step": 261500 + }, + { + "epoch": 0.499725341796875, + "grad_norm": 109.95203399658203, + "learning_rate": 2.5013828277587893e-05, + "lookahead_loss": 7.90615640258789, + "loss": 5.8069, + "step": 262000 + }, + { + "epoch": 0.5006790161132812, + "grad_norm": 72.66572570800781, + "learning_rate": 2.496614456176758e-05, + "lookahead_loss": 7.77998378276825, + "loss": 5.7768, + "step": 262500 + }, + { + "epoch": 0.5016326904296875, + "grad_norm": 119.75885772705078, + "learning_rate": 2.4918460845947267e-05, + "lookahead_loss": 7.814490133285522, + "loss": 5.7904, + "step": 263000 + }, + { + "epoch": 0.5025863647460938, + "grad_norm": 122.98607635498047, + "learning_rate": 2.4870777130126954e-05, + "lookahead_loss": 7.870713673591614, + "loss": 5.7542, + "step": 263500 + }, + { + "epoch": 0.5035400390625, + "grad_norm": 78.88066101074219, + "learning_rate": 2.482309341430664e-05, + "lookahead_loss": 7.856002924919128, + "loss": 5.8143, + "step": 264000 + }, + { + "epoch": 0.5044937133789062, + "grad_norm": 126.39031982421875, + "learning_rate": 2.477540969848633e-05, + "lookahead_loss": 7.8984311170578, + "loss": 5.7465, + "step": 264500 + }, + { + "epoch": 0.5054473876953125, + "grad_norm": 91.17089080810547, + "learning_rate": 2.4727725982666018e-05, + "lookahead_loss": 7.7288994808197025, + "loss": 5.7296, + "step": 265000 + }, + { + "epoch": 0.5054473876953125, + "eval_accuracy": 0.04080078277886497, + "eval_lookahead_loss": 7.769478130531311, + "eval_lookahead_perplexity": 2367.2355779702993, + "eval_loss": 5.609341621398926, + "eval_perplexity": 272.96446486933417, + "eval_runtime": 491.0419, + "eval_samples_per_second": 20.365, + "eval_steps_per_second": 5.091, + "step": 265000 + }, + { + "epoch": 0.5064010620117188, + "grad_norm": 62.866817474365234, + "learning_rate": 2.4680042266845705e-05, + "lookahead_loss": 7.825910142898559, + "loss": 5.7537, + "step": 265500 + }, + { + "epoch": 0.507354736328125, + "grad_norm": 91.29137420654297, + "learning_rate": 2.463235855102539e-05, + "lookahead_loss": 7.665714569091797, + "loss": 5.6859, + "step": 266000 + }, + { + "epoch": 0.5083084106445312, + "grad_norm": 83.57354736328125, + "learning_rate": 2.4584674835205078e-05, + "lookahead_loss": 7.752137540817261, + "loss": 5.7765, + "step": 266500 + }, + { + "epoch": 0.5092620849609375, + "grad_norm": 148.39450073242188, + "learning_rate": 2.453699111938477e-05, + "lookahead_loss": 7.823356912612915, + "loss": 5.8024, + "step": 267000 + }, + { + "epoch": 0.5102157592773438, + "grad_norm": 68.0697021484375, + "learning_rate": 2.4489307403564455e-05, + "lookahead_loss": 7.873646305084229, + "loss": 5.8081, + "step": 267500 + }, + { + "epoch": 0.51116943359375, + "grad_norm": 66.21329498291016, + "learning_rate": 2.4441623687744142e-05, + "lookahead_loss": 7.760735192298889, + "loss": 5.7251, + "step": 268000 + }, + { + "epoch": 0.5121231079101562, + "grad_norm": 70.84642028808594, + "learning_rate": 2.439393997192383e-05, + "lookahead_loss": 7.741357414245606, + "loss": 5.7393, + "step": 268500 + }, + { + "epoch": 0.5130767822265625, + "grad_norm": 76.88446807861328, + "learning_rate": 2.4346256256103516e-05, + "lookahead_loss": 7.6635161380767824, + "loss": 5.7403, + "step": 269000 + }, + { + "epoch": 0.5140304565429688, + "grad_norm": 93.43556213378906, + "learning_rate": 2.4298572540283206e-05, + "lookahead_loss": 7.789415211677551, + "loss": 5.7616, + "step": 269500 + }, + { + "epoch": 0.514984130859375, + "grad_norm": 114.78719329833984, + "learning_rate": 2.4250888824462893e-05, + "lookahead_loss": 7.793281699180603, + "loss": 5.7854, + "step": 270000 + }, + { + "epoch": 0.514984130859375, + "eval_accuracy": 0.041761448140900194, + "eval_lookahead_loss": 7.750978371047974, + "eval_lookahead_perplexity": 2323.8448853257946, + "eval_loss": 5.600083351135254, + "eval_perplexity": 270.44894871362214, + "eval_runtime": 491.5123, + "eval_samples_per_second": 20.345, + "eval_steps_per_second": 5.086, + "step": 270000 + }, + { + "epoch": 0.5159378051757812, + "grad_norm": 49.624996185302734, + "learning_rate": 2.420320510864258e-05, + "lookahead_loss": 7.783914581298828, + "loss": 5.7636, + "step": 270500 + }, + { + "epoch": 0.5168914794921875, + "grad_norm": 46.21735382080078, + "learning_rate": 2.4155521392822266e-05, + "lookahead_loss": 7.864769186019897, + "loss": 5.76, + "step": 271000 + }, + { + "epoch": 0.5178451538085938, + "grad_norm": 62.844337463378906, + "learning_rate": 2.4107837677001953e-05, + "lookahead_loss": 7.745238103866577, + "loss": 5.7718, + "step": 271500 + }, + { + "epoch": 0.518798828125, + "grad_norm": 202.1984100341797, + "learning_rate": 2.406015396118164e-05, + "lookahead_loss": 7.85829208278656, + "loss": 5.7752, + "step": 272000 + }, + { + "epoch": 0.5197525024414062, + "grad_norm": 56.38215637207031, + "learning_rate": 2.401247024536133e-05, + "lookahead_loss": 7.737642753601074, + "loss": 5.7255, + "step": 272500 + }, + { + "epoch": 0.5207061767578125, + "grad_norm": 114.9666976928711, + "learning_rate": 2.3964786529541017e-05, + "lookahead_loss": 7.7771752233505245, + "loss": 5.726, + "step": 273000 + }, + { + "epoch": 0.5216598510742188, + "grad_norm": 86.54193878173828, + "learning_rate": 2.3917102813720704e-05, + "lookahead_loss": 7.855183423995972, + "loss": 5.7242, + "step": 273500 + }, + { + "epoch": 0.522613525390625, + "grad_norm": 102.10504913330078, + "learning_rate": 2.386941909790039e-05, + "lookahead_loss": 7.743604795455933, + "loss": 5.7404, + "step": 274000 + }, + { + "epoch": 0.5235671997070312, + "grad_norm": 250.58990478515625, + "learning_rate": 2.3821735382080078e-05, + "lookahead_loss": 7.783545227050781, + "loss": 5.7318, + "step": 274500 + }, + { + "epoch": 0.5245208740234375, + "grad_norm": 190.8095245361328, + "learning_rate": 2.3774051666259768e-05, + "lookahead_loss": 7.851285044670105, + "loss": 5.7778, + "step": 275000 + }, + { + "epoch": 0.5245208740234375, + "eval_accuracy": 0.04155420743639922, + "eval_lookahead_loss": 7.729284999275207, + "eval_lookahead_perplexity": 2273.975725099355, + "eval_loss": 5.583587646484375, + "eval_perplexity": 266.0242969385082, + "eval_runtime": 527.2408, + "eval_samples_per_second": 18.967, + "eval_steps_per_second": 4.742, + "step": 275000 + }, + { + "epoch": 0.5254745483398438, + "grad_norm": 41.188209533691406, + "learning_rate": 2.3726367950439455e-05, + "lookahead_loss": 7.804956748008728, + "loss": 5.7542, + "step": 275500 + }, + { + "epoch": 0.52642822265625, + "grad_norm": 241.96835327148438, + "learning_rate": 2.367868423461914e-05, + "lookahead_loss": 7.728679870605469, + "loss": 5.7678, + "step": 276000 + }, + { + "epoch": 0.5273818969726562, + "grad_norm": 50.04011917114258, + "learning_rate": 2.3631000518798828e-05, + "lookahead_loss": 7.74855604171753, + "loss": 5.7384, + "step": 276500 + }, + { + "epoch": 0.5283355712890625, + "grad_norm": 36.478614807128906, + "learning_rate": 2.3583316802978515e-05, + "lookahead_loss": 7.890018739700317, + "loss": 5.7037, + "step": 277000 + }, + { + "epoch": 0.5292892456054688, + "grad_norm": 132.9232940673828, + "learning_rate": 2.3535633087158205e-05, + "lookahead_loss": 7.808481062889099, + "loss": 5.7531, + "step": 277500 + }, + { + "epoch": 0.530242919921875, + "grad_norm": 56.3750114440918, + "learning_rate": 2.3487949371337892e-05, + "lookahead_loss": 7.910913692474365, + "loss": 5.766, + "step": 278000 + }, + { + "epoch": 0.5311965942382812, + "grad_norm": 165.48663330078125, + "learning_rate": 2.344026565551758e-05, + "lookahead_loss": 7.636764324188232, + "loss": 5.6725, + "step": 278500 + }, + { + "epoch": 0.5321502685546875, + "grad_norm": 79.51268768310547, + "learning_rate": 2.3392581939697266e-05, + "lookahead_loss": 7.800500823020935, + "loss": 5.6625, + "step": 279000 + }, + { + "epoch": 0.5331039428710938, + "grad_norm": 80.42979431152344, + "learning_rate": 2.3344898223876953e-05, + "lookahead_loss": 7.803842956542969, + "loss": 5.6534, + "step": 279500 + }, + { + "epoch": 0.5340576171875, + "grad_norm": 187.73081970214844, + "learning_rate": 2.3297214508056643e-05, + "lookahead_loss": 7.863658168792725, + "loss": 5.7347, + "step": 280000 + }, + { + "epoch": 0.5340576171875, + "eval_accuracy": 0.04046771037181996, + "eval_lookahead_loss": 7.745952005004883, + "eval_lookahead_perplexity": 2312.1936964049305, + "eval_loss": 5.575263500213623, + "eval_perplexity": 263.81906285837397, + "eval_runtime": 502.734, + "eval_samples_per_second": 19.891, + "eval_steps_per_second": 4.973, + "step": 280000 + }, + { + "epoch": 0.5350112915039062, + "grad_norm": 108.30266571044922, + "learning_rate": 2.324953079223633e-05, + "lookahead_loss": 7.85016524028778, + "loss": 5.8183, + "step": 280500 + }, + { + "epoch": 0.5359649658203125, + "grad_norm": 47.64173889160156, + "learning_rate": 2.3201847076416016e-05, + "lookahead_loss": 7.779328009605408, + "loss": 5.7811, + "step": 281000 + }, + { + "epoch": 0.5369186401367188, + "grad_norm": 107.64262390136719, + "learning_rate": 2.3154163360595703e-05, + "lookahead_loss": 7.718870476722717, + "loss": 5.7721, + "step": 281500 + }, + { + "epoch": 0.537872314453125, + "grad_norm": 95.41127014160156, + "learning_rate": 2.310647964477539e-05, + "lookahead_loss": 7.72656600856781, + "loss": 5.7391, + "step": 282000 + }, + { + "epoch": 0.5388259887695312, + "grad_norm": 140.5384979248047, + "learning_rate": 2.305879592895508e-05, + "lookahead_loss": 7.807889041900634, + "loss": 5.717, + "step": 282500 + }, + { + "epoch": 0.5397796630859375, + "grad_norm": 42.966575622558594, + "learning_rate": 2.3011112213134767e-05, + "lookahead_loss": 7.781537691116333, + "loss": 5.7511, + "step": 283000 + }, + { + "epoch": 0.5407333374023438, + "grad_norm": 62.068511962890625, + "learning_rate": 2.2963428497314454e-05, + "lookahead_loss": 7.691704772949219, + "loss": 5.7061, + "step": 283500 + }, + { + "epoch": 0.54168701171875, + "grad_norm": 191.8215789794922, + "learning_rate": 2.291574478149414e-05, + "lookahead_loss": 7.758805885314941, + "loss": 5.7142, + "step": 284000 + }, + { + "epoch": 0.5426406860351562, + "grad_norm": 318.36279296875, + "learning_rate": 2.2868061065673828e-05, + "lookahead_loss": 7.865957304000855, + "loss": 5.7129, + "step": 284500 + }, + { + "epoch": 0.5435943603515625, + "grad_norm": 79.11617279052734, + "learning_rate": 2.2820377349853518e-05, + "lookahead_loss": 7.776995993614197, + "loss": 5.7254, + "step": 285000 + }, + { + "epoch": 0.5435943603515625, + "eval_accuracy": 0.04055127201565558, + "eval_lookahead_loss": 7.73265717010498, + "eval_lookahead_perplexity": 2281.6569035512052, + "eval_loss": 5.565149784088135, + "eval_perplexity": 261.16431903964894, + "eval_runtime": 494.8817, + "eval_samples_per_second": 20.207, + "eval_steps_per_second": 5.052, + "step": 285000 + }, + { + "epoch": 0.5445480346679688, + "grad_norm": 144.20730590820312, + "learning_rate": 2.2772693634033205e-05, + "lookahead_loss": 7.84266770362854, + "loss": 5.7365, + "step": 285500 + }, + { + "epoch": 0.545501708984375, + "grad_norm": 84.2968978881836, + "learning_rate": 2.272500991821289e-05, + "lookahead_loss": 7.9139693117141725, + "loss": 5.7307, + "step": 286000 + }, + { + "epoch": 0.5464553833007812, + "grad_norm": 81.49491119384766, + "learning_rate": 2.2677326202392578e-05, + "lookahead_loss": 7.7588143033981325, + "loss": 5.7223, + "step": 286500 + }, + { + "epoch": 0.5474090576171875, + "grad_norm": 74.05553436279297, + "learning_rate": 2.2629642486572265e-05, + "lookahead_loss": 7.779314447402954, + "loss": 5.6776, + "step": 287000 + }, + { + "epoch": 0.5483627319335938, + "grad_norm": 122.48589324951172, + "learning_rate": 2.2581958770751955e-05, + "lookahead_loss": 7.82890523147583, + "loss": 5.7335, + "step": 287500 + }, + { + "epoch": 0.54931640625, + "grad_norm": 83.34748840332031, + "learning_rate": 2.2534275054931642e-05, + "lookahead_loss": 7.733275609970093, + "loss": 5.7338, + "step": 288000 + }, + { + "epoch": 0.5502700805664062, + "grad_norm": 128.59695434570312, + "learning_rate": 2.248659133911133e-05, + "lookahead_loss": 7.648697338104248, + "loss": 5.7123, + "step": 288500 + }, + { + "epoch": 0.5512237548828125, + "grad_norm": 152.4603729248047, + "learning_rate": 2.2438907623291016e-05, + "lookahead_loss": 7.719506496429443, + "loss": 5.7183, + "step": 289000 + }, + { + "epoch": 0.5521774291992188, + "grad_norm": 140.11187744140625, + "learning_rate": 2.2391223907470703e-05, + "lookahead_loss": 7.7255661506652835, + "loss": 5.7197, + "step": 289500 + }, + { + "epoch": 0.553131103515625, + "grad_norm": 89.04906463623047, + "learning_rate": 2.2343540191650393e-05, + "lookahead_loss": 7.628684705734253, + "loss": 5.7287, + "step": 290000 + }, + { + "epoch": 0.553131103515625, + "eval_accuracy": 0.040459295499021525, + "eval_lookahead_loss": 7.653055633544922, + "eval_lookahead_perplexity": 2107.074209300532, + "eval_loss": 5.560282230377197, + "eval_perplexity": 259.89617656898656, + "eval_runtime": 488.7672, + "eval_samples_per_second": 20.46, + "eval_steps_per_second": 5.115, + "step": 290000 + }, + { + "epoch": 0.5540847778320312, + "grad_norm": 316.8251953125, + "learning_rate": 2.229585647583008e-05, + "lookahead_loss": 7.681621799468994, + "loss": 5.7289, + "step": 290500 + }, + { + "epoch": 0.5550384521484375, + "grad_norm": 112.64208221435547, + "learning_rate": 2.2248172760009766e-05, + "lookahead_loss": 7.6821005277633665, + "loss": 5.7381, + "step": 291000 + }, + { + "epoch": 0.5559921264648438, + "grad_norm": 81.51651000976562, + "learning_rate": 2.2200489044189453e-05, + "lookahead_loss": 7.794990436553955, + "loss": 5.7395, + "step": 291500 + }, + { + "epoch": 0.55694580078125, + "grad_norm": 70.82510375976562, + "learning_rate": 2.215280532836914e-05, + "lookahead_loss": 7.6794465637207034, + "loss": 5.6484, + "step": 292000 + }, + { + "epoch": 0.5578994750976562, + "grad_norm": 96.98380279541016, + "learning_rate": 2.210512161254883e-05, + "lookahead_loss": 7.709634718894958, + "loss": 5.6884, + "step": 292500 + }, + { + "epoch": 0.5588531494140625, + "grad_norm": 146.06198120117188, + "learning_rate": 2.2057437896728517e-05, + "lookahead_loss": 7.745594568252564, + "loss": 5.6675, + "step": 293000 + }, + { + "epoch": 0.5598068237304688, + "grad_norm": 153.40602111816406, + "learning_rate": 2.2009754180908204e-05, + "lookahead_loss": 7.588545488357544, + "loss": 5.6354, + "step": 293500 + }, + { + "epoch": 0.560760498046875, + "grad_norm": 104.3370361328125, + "learning_rate": 2.196207046508789e-05, + "lookahead_loss": 7.845240453720093, + "loss": 5.7679, + "step": 294000 + }, + { + "epoch": 0.5617141723632812, + "grad_norm": 672.0825805664062, + "learning_rate": 2.1914386749267578e-05, + "lookahead_loss": 7.632508312225342, + "loss": 5.7961, + "step": 294500 + }, + { + "epoch": 0.5626678466796875, + "grad_norm": 74.8082046508789, + "learning_rate": 2.1866703033447268e-05, + "lookahead_loss": 7.6974793691635135, + "loss": 5.8322, + "step": 295000 + }, + { + "epoch": 0.5626678466796875, + "eval_accuracy": 0.04100802348336595, + "eval_lookahead_loss": 7.662689605331421, + "eval_lookahead_perplexity": 2127.4717999273776, + "eval_loss": 5.550065994262695, + "eval_perplexity": 257.25453266879265, + "eval_runtime": 495.4073, + "eval_samples_per_second": 20.185, + "eval_steps_per_second": 5.046, + "step": 295000 + }, + { + "epoch": 0.5636215209960938, + "grad_norm": 200.99916076660156, + "learning_rate": 2.1819019317626955e-05, + "lookahead_loss": 7.8231658973693845, + "loss": 5.7546, + "step": 295500 + }, + { + "epoch": 0.5645751953125, + "grad_norm": 110.86414337158203, + "learning_rate": 2.177133560180664e-05, + "lookahead_loss": 7.715544991493225, + "loss": 5.7198, + "step": 296000 + }, + { + "epoch": 0.5655288696289062, + "grad_norm": 59.469722747802734, + "learning_rate": 2.1723651885986328e-05, + "lookahead_loss": 7.767370808601379, + "loss": 5.7023, + "step": 296500 + }, + { + "epoch": 0.5664825439453125, + "grad_norm": 83.87696075439453, + "learning_rate": 2.1675968170166015e-05, + "lookahead_loss": 7.763500256538391, + "loss": 5.7103, + "step": 297000 + }, + { + "epoch": 0.5674362182617188, + "grad_norm": 92.14472198486328, + "learning_rate": 2.1628284454345705e-05, + "lookahead_loss": 7.795149648666382, + "loss": 5.7405, + "step": 297500 + }, + { + "epoch": 0.568389892578125, + "grad_norm": 248.72216796875, + "learning_rate": 2.1580600738525392e-05, + "lookahead_loss": 7.81306653881073, + "loss": 5.757, + "step": 298000 + }, + { + "epoch": 0.5693435668945312, + "grad_norm": 129.39817810058594, + "learning_rate": 2.153291702270508e-05, + "lookahead_loss": 7.663261608123779, + "loss": 5.7157, + "step": 298500 + }, + { + "epoch": 0.5702972412109375, + "grad_norm": 90.48443603515625, + "learning_rate": 2.1485233306884766e-05, + "lookahead_loss": 7.769034329414367, + "loss": 5.7002, + "step": 299000 + }, + { + "epoch": 0.5712509155273438, + "grad_norm": 107.2739028930664, + "learning_rate": 2.1437549591064453e-05, + "lookahead_loss": 7.821802872657776, + "loss": 5.6969, + "step": 299500 + }, + { + "epoch": 0.57220458984375, + "grad_norm": 70.02484893798828, + "learning_rate": 2.1389865875244143e-05, + "lookahead_loss": 7.768518057823181, + "loss": 5.6957, + "step": 300000 + }, + { + "epoch": 0.57220458984375, + "eval_accuracy": 0.04062309197651663, + "eval_lookahead_loss": 7.697312644386291, + "eval_lookahead_perplexity": 2202.4213425881153, + "eval_loss": 5.551235198974609, + "eval_perplexity": 257.55549178767495, + "eval_runtime": 497.6095, + "eval_samples_per_second": 20.096, + "eval_steps_per_second": 5.024, + "step": 300000 + }, + { + "epoch": 0.5731582641601562, + "grad_norm": 122.69623565673828, + "learning_rate": 2.134218215942383e-05, + "lookahead_loss": 7.814246452331543, + "loss": 5.7245, + "step": 300500 + }, + { + "epoch": 0.5741119384765625, + "grad_norm": 215.25209045410156, + "learning_rate": 2.1294498443603516e-05, + "lookahead_loss": 7.766628502845764, + "loss": 5.6875, + "step": 301000 + }, + { + "epoch": 0.5750656127929688, + "grad_norm": 123.57778930664062, + "learning_rate": 2.1246814727783203e-05, + "lookahead_loss": 7.735668872833252, + "loss": 5.6916, + "step": 301500 + }, + { + "epoch": 0.576019287109375, + "grad_norm": 98.96884155273438, + "learning_rate": 2.119913101196289e-05, + "lookahead_loss": 7.78820293712616, + "loss": 5.6686, + "step": 302000 + }, + { + "epoch": 0.5769729614257812, + "grad_norm": 148.1219024658203, + "learning_rate": 2.115144729614258e-05, + "lookahead_loss": 7.746945549964905, + "loss": 5.6894, + "step": 302500 + }, + { + "epoch": 0.5779266357421875, + "grad_norm": 110.17955780029297, + "learning_rate": 2.1103763580322267e-05, + "lookahead_loss": 7.711278474807739, + "loss": 5.7131, + "step": 303000 + }, + { + "epoch": 0.5788803100585938, + "grad_norm": 78.07681274414062, + "learning_rate": 2.1056079864501954e-05, + "lookahead_loss": 7.746251266479492, + "loss": 5.6911, + "step": 303500 + }, + { + "epoch": 0.579833984375, + "grad_norm": 380.1153259277344, + "learning_rate": 2.100839614868164e-05, + "lookahead_loss": 7.924781229019165, + "loss": 5.6579, + "step": 304000 + }, + { + "epoch": 0.5807876586914062, + "grad_norm": 112.0230941772461, + "learning_rate": 2.0960712432861328e-05, + "lookahead_loss": 7.782016543388367, + "loss": 5.712, + "step": 304500 + }, + { + "epoch": 0.5817413330078125, + "grad_norm": 52.744354248046875, + "learning_rate": 2.0913028717041018e-05, + "lookahead_loss": 7.782493028640747, + "loss": 5.6885, + "step": 305000 + }, + { + "epoch": 0.5817413330078125, + "eval_accuracy": 0.040020743639921724, + "eval_lookahead_loss": 7.678230839538574, + "eval_lookahead_perplexity": 2160.793597689096, + "eval_loss": 5.5358195304870605, + "eval_perplexity": 253.61554816613182, + "eval_runtime": 503.9647, + "eval_samples_per_second": 19.843, + "eval_steps_per_second": 4.961, + "step": 305000 + }, + { + "epoch": 0.5826950073242188, + "grad_norm": 68.10065460205078, + "learning_rate": 2.0865345001220705e-05, + "lookahead_loss": 7.783782515525818, + "loss": 5.7234, + "step": 305500 + }, + { + "epoch": 0.583648681640625, + "grad_norm": 68.55180358886719, + "learning_rate": 2.081766128540039e-05, + "lookahead_loss": 7.652966531276703, + "loss": 5.635, + "step": 306000 + }, + { + "epoch": 0.5846023559570312, + "grad_norm": 72.38198852539062, + "learning_rate": 2.0769977569580078e-05, + "lookahead_loss": 7.675947504043579, + "loss": 5.6584, + "step": 306500 + }, + { + "epoch": 0.5855560302734375, + "grad_norm": 116.76799774169922, + "learning_rate": 2.0722293853759765e-05, + "lookahead_loss": 7.647464849472046, + "loss": 5.6506, + "step": 307000 + }, + { + "epoch": 0.5865097045898438, + "grad_norm": 175.15660095214844, + "learning_rate": 2.0674610137939455e-05, + "lookahead_loss": 7.6004626188278195, + "loss": 5.6435, + "step": 307500 + }, + { + "epoch": 0.58746337890625, + "grad_norm": 623.3847045898438, + "learning_rate": 2.0626926422119142e-05, + "lookahead_loss": 7.682892145156861, + "loss": 5.7644, + "step": 308000 + }, + { + "epoch": 0.5884170532226562, + "grad_norm": 48.260250091552734, + "learning_rate": 2.057924270629883e-05, + "lookahead_loss": 7.742434193611145, + "loss": 5.7517, + "step": 308500 + }, + { + "epoch": 0.5893707275390625, + "grad_norm": 113.37686920166016, + "learning_rate": 2.0531558990478516e-05, + "lookahead_loss": 7.651146509170532, + "loss": 5.7168, + "step": 309000 + }, + { + "epoch": 0.5903244018554688, + "grad_norm": 71.9517593383789, + "learning_rate": 2.0483875274658203e-05, + "lookahead_loss": 7.697682547569275, + "loss": 5.6941, + "step": 309500 + }, + { + "epoch": 0.591278076171875, + "grad_norm": 161.54751586914062, + "learning_rate": 2.0436191558837893e-05, + "lookahead_loss": 7.705917343139649, + "loss": 5.6931, + "step": 310000 + }, + { + "epoch": 0.591278076171875, + "eval_accuracy": 0.041322504892367905, + "eval_lookahead_loss": 7.629103371810913, + "eval_lookahead_perplexity": 2057.204644978009, + "eval_loss": 5.537159442901611, + "eval_perplexity": 253.95559855563573, + "eval_runtime": 503.3805, + "eval_samples_per_second": 19.866, + "eval_steps_per_second": 4.966, + "step": 310000 + }, + { + "epoch": 0.5922317504882812, + "grad_norm": 51.126556396484375, + "learning_rate": 2.038850784301758e-05, + "lookahead_loss": 7.593200707435608, + "loss": 5.6583, + "step": 310500 + }, + { + "epoch": 0.5931854248046875, + "grad_norm": 195.32054138183594, + "learning_rate": 2.0340824127197266e-05, + "lookahead_loss": 7.691995723724365, + "loss": 5.6802, + "step": 311000 + }, + { + "epoch": 0.5941390991210938, + "grad_norm": 62.952354431152344, + "learning_rate": 2.0293140411376953e-05, + "lookahead_loss": 7.610295973777771, + "loss": 5.661, + "step": 311500 + }, + { + "epoch": 0.5950927734375, + "grad_norm": 79.08522033691406, + "learning_rate": 2.024545669555664e-05, + "lookahead_loss": 7.792914806365967, + "loss": 5.7341, + "step": 312000 + }, + { + "epoch": 0.5960464477539062, + "grad_norm": 184.8982696533203, + "learning_rate": 2.019777297973633e-05, + "lookahead_loss": 7.712621468544007, + "loss": 5.7207, + "step": 312500 + }, + { + "epoch": 0.5970001220703125, + "grad_norm": 138.11695861816406, + "learning_rate": 2.0150089263916017e-05, + "lookahead_loss": 7.681151127815246, + "loss": 5.7089, + "step": 313000 + }, + { + "epoch": 0.5979537963867188, + "grad_norm": 71.23046112060547, + "learning_rate": 2.0102405548095704e-05, + "lookahead_loss": 7.654941561698913, + "loss": 5.6271, + "step": 313500 + }, + { + "epoch": 0.598907470703125, + "grad_norm": 53.982933044433594, + "learning_rate": 2.005472183227539e-05, + "lookahead_loss": 7.633636961936951, + "loss": 5.6608, + "step": 314000 + }, + { + "epoch": 0.5998611450195312, + "grad_norm": 92.91081237792969, + "learning_rate": 2.0007038116455078e-05, + "lookahead_loss": 7.673441905975341, + "loss": 5.7027, + "step": 314500 + }, + { + "epoch": 0.6008148193359375, + "grad_norm": 228.8234100341797, + "learning_rate": 1.9959354400634768e-05, + "lookahead_loss": 7.7196409702301025, + "loss": 5.7024, + "step": 315000 + }, + { + "epoch": 0.6008148193359375, + "eval_accuracy": 0.039889432485322895, + "eval_lookahead_loss": 7.6305412668228145, + "eval_lookahead_perplexity": 2060.164816973733, + "eval_loss": 5.514240741729736, + "eval_perplexity": 248.20145667995536, + "eval_runtime": 502.3221, + "eval_samples_per_second": 19.908, + "eval_steps_per_second": 4.977, + "step": 315000 + }, + { + "epoch": 0.6017684936523438, + "grad_norm": 58.343074798583984, + "learning_rate": 1.9911670684814455e-05, + "lookahead_loss": 7.6060272264480595, + "loss": 5.666, + "step": 315500 + }, + { + "epoch": 0.60272216796875, + "grad_norm": 77.31822204589844, + "learning_rate": 1.986398696899414e-05, + "lookahead_loss": 7.654997428894043, + "loss": 5.6989, + "step": 316000 + }, + { + "epoch": 0.6036758422851562, + "grad_norm": 68.17203521728516, + "learning_rate": 1.9816303253173828e-05, + "lookahead_loss": 7.6594281711578365, + "loss": 5.6474, + "step": 316500 + }, + { + "epoch": 0.6046295166015625, + "grad_norm": 59.93851852416992, + "learning_rate": 1.9768619537353515e-05, + "lookahead_loss": 7.71039444065094, + "loss": 5.6981, + "step": 317000 + }, + { + "epoch": 0.6055831909179688, + "grad_norm": 78.47380828857422, + "learning_rate": 1.9720935821533205e-05, + "lookahead_loss": 7.670339431762695, + "loss": 5.6722, + "step": 317500 + }, + { + "epoch": 0.606536865234375, + "grad_norm": 63.436275482177734, + "learning_rate": 1.9673252105712892e-05, + "lookahead_loss": 7.715926249504089, + "loss": 5.6661, + "step": 318000 + }, + { + "epoch": 0.6074905395507812, + "grad_norm": 93.15093231201172, + "learning_rate": 1.962556838989258e-05, + "lookahead_loss": 7.725040663719177, + "loss": 5.6774, + "step": 318500 + }, + { + "epoch": 0.6084442138671875, + "grad_norm": 72.31375885009766, + "learning_rate": 1.9577884674072266e-05, + "lookahead_loss": 7.643857460021973, + "loss": 5.6634, + "step": 319000 + }, + { + "epoch": 0.6093978881835938, + "grad_norm": 74.85087585449219, + "learning_rate": 1.9530200958251953e-05, + "lookahead_loss": 7.685256309509278, + "loss": 5.6825, + "step": 319500 + }, + { + "epoch": 0.6103515625, + "grad_norm": 43.51264572143555, + "learning_rate": 1.9482517242431643e-05, + "lookahead_loss": 7.5999038200378415, + "loss": 5.6039, + "step": 320000 + }, + { + "epoch": 0.6103515625, + "eval_accuracy": 0.04013463796477495, + "eval_lookahead_loss": 7.6355261796951295, + "eval_lookahead_perplexity": 2070.460198559594, + "eval_loss": 5.500070095062256, + "eval_perplexity": 244.7090845615846, + "eval_runtime": 507.4186, + "eval_samples_per_second": 19.708, + "eval_steps_per_second": 4.927, + "step": 320000 + }, + { + "epoch": 0.6113052368164062, + "grad_norm": 161.35877990722656, + "learning_rate": 1.943483352661133e-05, + "lookahead_loss": 7.653745270729065, + "loss": 5.6231, + "step": 320500 + }, + { + "epoch": 0.6122589111328125, + "grad_norm": 72.01158142089844, + "learning_rate": 1.9387149810791016e-05, + "lookahead_loss": 7.616978050231934, + "loss": 5.6297, + "step": 321000 + }, + { + "epoch": 0.6132125854492188, + "grad_norm": 143.87258911132812, + "learning_rate": 1.9339466094970703e-05, + "lookahead_loss": 7.666714570045471, + "loss": 5.7348, + "step": 321500 + }, + { + "epoch": 0.614166259765625, + "grad_norm": 89.29421997070312, + "learning_rate": 1.929178237915039e-05, + "lookahead_loss": 7.601069657325745, + "loss": 5.7579, + "step": 322000 + }, + { + "epoch": 0.6151199340820312, + "grad_norm": 162.37379455566406, + "learning_rate": 1.924409866333008e-05, + "lookahead_loss": 7.666943772315979, + "loss": 5.7215, + "step": 322500 + }, + { + "epoch": 0.6160736083984375, + "grad_norm": 76.7725830078125, + "learning_rate": 1.9196414947509767e-05, + "lookahead_loss": 7.677476940155029, + "loss": 5.7173, + "step": 323000 + }, + { + "epoch": 0.6170272827148438, + "grad_norm": 45.070919036865234, + "learning_rate": 1.9148731231689454e-05, + "lookahead_loss": 7.676739762306213, + "loss": 5.6547, + "step": 323500 + }, + { + "epoch": 0.61798095703125, + "grad_norm": 49.95730209350586, + "learning_rate": 1.910104751586914e-05, + "lookahead_loss": 7.692105040550232, + "loss": 5.688, + "step": 324000 + }, + { + "epoch": 0.6189346313476562, + "grad_norm": 72.1408462524414, + "learning_rate": 1.9053363800048828e-05, + "lookahead_loss": 7.673581674575805, + "loss": 5.6901, + "step": 324500 + }, + { + "epoch": 0.6198883056640625, + "grad_norm": 81.43831634521484, + "learning_rate": 1.9005680084228518e-05, + "lookahead_loss": 7.680682111740112, + "loss": 5.6658, + "step": 325000 + }, + { + "epoch": 0.6198883056640625, + "eval_accuracy": 0.03958610567514677, + "eval_lookahead_loss": 7.623195519447327, + "eval_lookahead_perplexity": 2045.0868140771765, + "eval_loss": 5.493398189544678, + "eval_perplexity": 243.0818431070078, + "eval_runtime": 493.2201, + "eval_samples_per_second": 20.275, + "eval_steps_per_second": 5.069, + "step": 325000 + }, + { + "epoch": 0.6208419799804688, + "grad_norm": 72.55989837646484, + "learning_rate": 1.8957996368408205e-05, + "lookahead_loss": 7.762272061347962, + "loss": 5.662, + "step": 325500 + }, + { + "epoch": 0.621795654296875, + "grad_norm": 147.73936462402344, + "learning_rate": 1.891031265258789e-05, + "lookahead_loss": 7.679998337745666, + "loss": 5.6401, + "step": 326000 + }, + { + "epoch": 0.6227493286132812, + "grad_norm": 122.97508239746094, + "learning_rate": 1.8862628936767578e-05, + "lookahead_loss": 7.7476051902771, + "loss": 5.6937, + "step": 326500 + }, + { + "epoch": 0.6237030029296875, + "grad_norm": 55.44895935058594, + "learning_rate": 1.8814945220947265e-05, + "lookahead_loss": 7.7029116897583005, + "loss": 5.6139, + "step": 327000 + }, + { + "epoch": 0.6246566772460938, + "grad_norm": 126.49166107177734, + "learning_rate": 1.8767261505126955e-05, + "lookahead_loss": 7.740103358268738, + "loss": 5.6644, + "step": 327500 + }, + { + "epoch": 0.6256103515625, + "grad_norm": 122.43217468261719, + "learning_rate": 1.8719577789306642e-05, + "lookahead_loss": 7.681442254066467, + "loss": 5.6461, + "step": 328000 + }, + { + "epoch": 0.6265640258789062, + "grad_norm": 91.77095794677734, + "learning_rate": 1.867189407348633e-05, + "lookahead_loss": 7.685675806999207, + "loss": 5.6977, + "step": 328500 + }, + { + "epoch": 0.6275177001953125, + "grad_norm": 84.63752746582031, + "learning_rate": 1.8624210357666016e-05, + "lookahead_loss": 7.558146634101868, + "loss": 5.6322, + "step": 329000 + }, + { + "epoch": 0.6284713745117188, + "grad_norm": 42.94167709350586, + "learning_rate": 1.8576526641845703e-05, + "lookahead_loss": 7.675655766487122, + "loss": 5.684, + "step": 329500 + }, + { + "epoch": 0.629425048828125, + "grad_norm": 64.6584243774414, + "learning_rate": 1.8528842926025393e-05, + "lookahead_loss": 7.690525363922119, + "loss": 5.6381, + "step": 330000 + }, + { + "epoch": 0.629425048828125, + "eval_accuracy": 0.03981976516634051, + "eval_lookahead_loss": 7.587001530265808, + "eval_lookahead_perplexity": 1972.3904849993476, + "eval_loss": 5.485361576080322, + "eval_perplexity": 241.13611728930275, + "eval_runtime": 491.5172, + "eval_samples_per_second": 20.345, + "eval_steps_per_second": 5.086, + "step": 330000 + }, + { + "epoch": 0.6303787231445312, + "grad_norm": 86.13662719726562, + "learning_rate": 1.848115921020508e-05, + "lookahead_loss": 7.763082075119018, + "loss": 5.724, + "step": 330500 + }, + { + "epoch": 0.6313323974609375, + "grad_norm": 70.23460388183594, + "learning_rate": 1.8433475494384766e-05, + "lookahead_loss": 7.649384373664856, + "loss": 5.6763, + "step": 331000 + }, + { + "epoch": 0.6322860717773438, + "grad_norm": 70.26712799072266, + "learning_rate": 1.8385791778564453e-05, + "lookahead_loss": 7.734354881286621, + "loss": 5.6425, + "step": 331500 + }, + { + "epoch": 0.63323974609375, + "grad_norm": 86.38507843017578, + "learning_rate": 1.833810806274414e-05, + "lookahead_loss": 7.683895874977112, + "loss": 5.7049, + "step": 332000 + }, + { + "epoch": 0.6341934204101562, + "grad_norm": 46.47780227661133, + "learning_rate": 1.829042434692383e-05, + "lookahead_loss": 7.6723647832870485, + "loss": 5.6406, + "step": 332500 + }, + { + "epoch": 0.6351470947265625, + "grad_norm": 67.89440155029297, + "learning_rate": 1.8242740631103517e-05, + "lookahead_loss": 7.625741244316101, + "loss": 5.6131, + "step": 333000 + }, + { + "epoch": 0.6361007690429688, + "grad_norm": 44.83938980102539, + "learning_rate": 1.8195056915283204e-05, + "lookahead_loss": 7.649735819339752, + "loss": 5.6, + "step": 333500 + }, + { + "epoch": 0.637054443359375, + "grad_norm": 88.61436462402344, + "learning_rate": 1.814737319946289e-05, + "lookahead_loss": 7.606497900009155, + "loss": 5.6132, + "step": 334000 + }, + { + "epoch": 0.6380081176757812, + "grad_norm": 52.044471740722656, + "learning_rate": 1.8099689483642578e-05, + "lookahead_loss": 7.582312633514404, + "loss": 5.6259, + "step": 334500 + }, + { + "epoch": 0.6389617919921875, + "grad_norm": 34.11779022216797, + "learning_rate": 1.8052005767822268e-05, + "lookahead_loss": 7.706371660232544, + "loss": 5.6829, + "step": 335000 + }, + { + "epoch": 0.6389617919921875, + "eval_accuracy": 0.039807045009784735, + "eval_lookahead_loss": 7.626707599639893, + "eval_lookahead_perplexity": 2052.2819505212456, + "eval_loss": 5.474246025085449, + "eval_perplexity": 238.47059828199343, + "eval_runtime": 497.4907, + "eval_samples_per_second": 20.101, + "eval_steps_per_second": 5.025, + "step": 335000 + }, + { + "epoch": 0.6399154663085938, + "grad_norm": 158.097412109375, + "learning_rate": 1.8004322052001955e-05, + "lookahead_loss": 7.65738560962677, + "loss": 5.7178, + "step": 335500 + }, + { + "epoch": 0.640869140625, + "grad_norm": 181.54568481445312, + "learning_rate": 1.795663833618164e-05, + "lookahead_loss": 7.771249891281128, + "loss": 5.675, + "step": 336000 + }, + { + "epoch": 0.6418228149414062, + "grad_norm": 104.94950866699219, + "learning_rate": 1.7908954620361328e-05, + "lookahead_loss": 7.6930615234375, + "loss": 5.659, + "step": 336500 + }, + { + "epoch": 0.6427764892578125, + "grad_norm": 109.18247985839844, + "learning_rate": 1.7861270904541015e-05, + "lookahead_loss": 7.655643951416016, + "loss": 5.59, + "step": 337000 + }, + { + "epoch": 0.6437301635742188, + "grad_norm": 463.67645263671875, + "learning_rate": 1.7813587188720705e-05, + "lookahead_loss": 7.595953322410583, + "loss": 5.6377, + "step": 337500 + }, + { + "epoch": 0.644683837890625, + "grad_norm": 82.59309387207031, + "learning_rate": 1.7765903472900392e-05, + "lookahead_loss": 7.674489199638367, + "loss": 5.6511, + "step": 338000 + }, + { + "epoch": 0.6456375122070312, + "grad_norm": 117.12977600097656, + "learning_rate": 1.771821975708008e-05, + "lookahead_loss": 7.68188614654541, + "loss": 5.6395, + "step": 338500 + }, + { + "epoch": 0.6465911865234375, + "grad_norm": 77.3825912475586, + "learning_rate": 1.7670536041259766e-05, + "lookahead_loss": 7.6222167444229125, + "loss": 5.6949, + "step": 339000 + }, + { + "epoch": 0.6475448608398438, + "grad_norm": 126.05923461914062, + "learning_rate": 1.7622852325439453e-05, + "lookahead_loss": 7.673994980812073, + "loss": 5.6422, + "step": 339500 + }, + { + "epoch": 0.64849853515625, + "grad_norm": 236.18064880371094, + "learning_rate": 1.7575168609619143e-05, + "lookahead_loss": 7.644799095153808, + "loss": 5.6595, + "step": 340000 + }, + { + "epoch": 0.64849853515625, + "eval_accuracy": 0.040354794520547944, + "eval_lookahead_loss": 7.5941201610565185, + "eval_lookahead_perplexity": 1986.4812987834332, + "eval_loss": 5.473311424255371, + "eval_perplexity": 238.24782757994683, + "eval_runtime": 495.3349, + "eval_samples_per_second": 20.188, + "eval_steps_per_second": 5.047, + "step": 340000 + }, + { + "epoch": 0.6494522094726562, + "grad_norm": 151.79774475097656, + "learning_rate": 1.752748489379883e-05, + "lookahead_loss": 7.707747150421143, + "loss": 5.6265, + "step": 340500 + }, + { + "epoch": 0.6504058837890625, + "grad_norm": 174.77175903320312, + "learning_rate": 1.7479801177978516e-05, + "lookahead_loss": 7.614183146476746, + "loss": 5.5959, + "step": 341000 + }, + { + "epoch": 0.6513595581054688, + "grad_norm": 147.75082397460938, + "learning_rate": 1.7432117462158203e-05, + "lookahead_loss": 7.5491219463348385, + "loss": 5.6517, + "step": 341500 + }, + { + "epoch": 0.652313232421875, + "grad_norm": 156.6146240234375, + "learning_rate": 1.738443374633789e-05, + "lookahead_loss": 7.723574287414551, + "loss": 5.6392, + "step": 342000 + }, + { + "epoch": 0.6532669067382812, + "grad_norm": 94.10812377929688, + "learning_rate": 1.733675003051758e-05, + "lookahead_loss": 7.594383279800415, + "loss": 5.6652, + "step": 342500 + }, + { + "epoch": 0.6542205810546875, + "grad_norm": 67.34707641601562, + "learning_rate": 1.7289066314697267e-05, + "lookahead_loss": 7.62354185295105, + "loss": 5.6603, + "step": 343000 + }, + { + "epoch": 0.6551742553710938, + "grad_norm": 200.42666625976562, + "learning_rate": 1.7241382598876954e-05, + "lookahead_loss": 7.650953397750855, + "loss": 5.6068, + "step": 343500 + }, + { + "epoch": 0.6561279296875, + "grad_norm": 55.42783737182617, + "learning_rate": 1.719369888305664e-05, + "lookahead_loss": 7.626990766525268, + "loss": 5.65, + "step": 344000 + }, + { + "epoch": 0.6570816040039062, + "grad_norm": 56.801307678222656, + "learning_rate": 1.7146015167236328e-05, + "lookahead_loss": 7.626035104751587, + "loss": 5.6017, + "step": 344500 + }, + { + "epoch": 0.6580352783203125, + "grad_norm": 702.0319213867188, + "learning_rate": 1.7098331451416018e-05, + "lookahead_loss": 7.75804856967926, + "loss": 5.6537, + "step": 345000 + }, + { + "epoch": 0.6580352783203125, + "eval_accuracy": 0.0394279843444227, + "eval_lookahead_loss": 7.639824665641784, + "eval_lookahead_perplexity": 2079.3791979900543, + "eval_loss": 5.465416431427002, + "eval_perplexity": 236.37426829011844, + "eval_runtime": 494.0525, + "eval_samples_per_second": 20.241, + "eval_steps_per_second": 5.06, + "step": 345000 + }, + { + "epoch": 0.6589889526367188, + "grad_norm": 94.32064056396484, + "learning_rate": 1.7050647735595705e-05, + "lookahead_loss": 7.747750399589538, + "loss": 5.6405, + "step": 345500 + }, + { + "epoch": 0.659942626953125, + "grad_norm": 144.17726135253906, + "learning_rate": 1.700296401977539e-05, + "lookahead_loss": 7.757573101997376, + "loss": 5.6444, + "step": 346000 + }, + { + "epoch": 0.6608963012695312, + "grad_norm": 57.23394775390625, + "learning_rate": 1.6955280303955078e-05, + "lookahead_loss": 7.681757378578186, + "loss": 5.6161, + "step": 346500 + }, + { + "epoch": 0.6618499755859375, + "grad_norm": 68.44986724853516, + "learning_rate": 1.6907596588134765e-05, + "lookahead_loss": 7.637135238647461, + "loss": 5.5768, + "step": 347000 + }, + { + "epoch": 0.6628036499023438, + "grad_norm": 95.4968490600586, + "learning_rate": 1.6859912872314455e-05, + "lookahead_loss": 7.6619671382904055, + "loss": 5.5881, + "step": 347500 + }, + { + "epoch": 0.66375732421875, + "grad_norm": 70.02124786376953, + "learning_rate": 1.6812229156494142e-05, + "lookahead_loss": 7.59656533241272, + "loss": 5.5941, + "step": 348000 + }, + { + "epoch": 0.6647109985351562, + "grad_norm": 119.07731628417969, + "learning_rate": 1.676454544067383e-05, + "lookahead_loss": 7.836125363349915, + "loss": 5.7395, + "step": 348500 + }, + { + "epoch": 0.6656646728515625, + "grad_norm": 57.9122200012207, + "learning_rate": 1.6716861724853516e-05, + "lookahead_loss": 7.736806938171386, + "loss": 5.6545, + "step": 349000 + }, + { + "epoch": 0.6666183471679688, + "grad_norm": 51.72354507446289, + "learning_rate": 1.6669178009033203e-05, + "lookahead_loss": 7.698277523040772, + "loss": 5.6523, + "step": 349500 + }, + { + "epoch": 0.667572021484375, + "grad_norm": 116.98773956298828, + "learning_rate": 1.6621494293212893e-05, + "lookahead_loss": 7.617208242416382, + "loss": 5.6034, + "step": 350000 + }, + { + "epoch": 0.667572021484375, + "eval_accuracy": 0.03986947162426614, + "eval_lookahead_loss": 7.594168025779724, + "eval_lookahead_perplexity": 1986.5763834365343, + "eval_loss": 5.4510602951049805, + "eval_perplexity": 233.00508913422928, + "eval_runtime": 492.4047, + "eval_samples_per_second": 20.308, + "eval_steps_per_second": 5.077, + "step": 350000 + }, + { + "epoch": 0.6685256958007812, + "grad_norm": 44.044517517089844, + "learning_rate": 1.657381057739258e-05, + "lookahead_loss": 7.6682868547439575, + "loss": 5.6715, + "step": 350500 + }, + { + "epoch": 0.6694793701171875, + "grad_norm": 55.66027069091797, + "learning_rate": 1.6526126861572266e-05, + "lookahead_loss": 7.652213685035705, + "loss": 5.6075, + "step": 351000 + }, + { + "epoch": 0.6704330444335938, + "grad_norm": 51.59208679199219, + "learning_rate": 1.6478443145751953e-05, + "lookahead_loss": 7.670471503257751, + "loss": 5.643, + "step": 351500 + }, + { + "epoch": 0.67138671875, + "grad_norm": 192.97735595703125, + "learning_rate": 1.643075942993164e-05, + "lookahead_loss": 7.639788820266723, + "loss": 5.6634, + "step": 352000 + }, + { + "epoch": 0.6723403930664062, + "grad_norm": 47.10070037841797, + "learning_rate": 1.638307571411133e-05, + "lookahead_loss": 7.67293514919281, + "loss": 5.6409, + "step": 352500 + }, + { + "epoch": 0.6732940673828125, + "grad_norm": 50.390377044677734, + "learning_rate": 1.6335391998291017e-05, + "lookahead_loss": 7.616825453758239, + "loss": 5.611, + "step": 353000 + }, + { + "epoch": 0.6742477416992188, + "grad_norm": 34.581695556640625, + "learning_rate": 1.6287708282470704e-05, + "lookahead_loss": 7.64467066192627, + "loss": 5.5793, + "step": 353500 + }, + { + "epoch": 0.675201416015625, + "grad_norm": 34.34901428222656, + "learning_rate": 1.624002456665039e-05, + "lookahead_loss": 7.5159718770980835, + "loss": 5.5344, + "step": 354000 + }, + { + "epoch": 0.6761550903320312, + "grad_norm": 57.81010818481445, + "learning_rate": 1.6192340850830078e-05, + "lookahead_loss": 7.623741924285889, + "loss": 5.6167, + "step": 354500 + }, + { + "epoch": 0.6771087646484375, + "grad_norm": 44.65420150756836, + "learning_rate": 1.6144657135009768e-05, + "lookahead_loss": 7.598629281997681, + "loss": 5.5748, + "step": 355000 + }, + { + "epoch": 0.6771087646484375, + "eval_accuracy": 0.03948708414872799, + "eval_lookahead_loss": 7.579181741142273, + "eval_lookahead_perplexity": 1957.0269554069453, + "eval_loss": 5.442113399505615, + "eval_perplexity": 230.92971485007791, + "eval_runtime": 490.7821, + "eval_samples_per_second": 20.376, + "eval_steps_per_second": 5.094, + "step": 355000 + }, + { + "epoch": 0.6780624389648438, + "grad_norm": 144.18019104003906, + "learning_rate": 1.6096973419189455e-05, + "lookahead_loss": 7.634109246253967, + "loss": 5.5847, + "step": 355500 + }, + { + "epoch": 0.67901611328125, + "grad_norm": 57.99235153198242, + "learning_rate": 1.604928970336914e-05, + "lookahead_loss": 7.586665316581726, + "loss": 5.5786, + "step": 356000 + }, + { + "epoch": 0.6799697875976562, + "grad_norm": 143.50289916992188, + "learning_rate": 1.6001605987548828e-05, + "lookahead_loss": 7.663339549064636, + "loss": 5.5967, + "step": 356500 + }, + { + "epoch": 0.6809234619140625, + "grad_norm": 65.68069458007812, + "learning_rate": 1.5953922271728515e-05, + "lookahead_loss": 7.617835608482361, + "loss": 5.6024, + "step": 357000 + }, + { + "epoch": 0.6818771362304688, + "grad_norm": 79.50352478027344, + "learning_rate": 1.5906238555908205e-05, + "lookahead_loss": 7.622173350334167, + "loss": 5.5964, + "step": 357500 + }, + { + "epoch": 0.682830810546875, + "grad_norm": 63.15433120727539, + "learning_rate": 1.5858554840087892e-05, + "lookahead_loss": 7.55947587966919, + "loss": 5.5829, + "step": 358000 + }, + { + "epoch": 0.6837844848632812, + "grad_norm": 54.891563415527344, + "learning_rate": 1.581087112426758e-05, + "lookahead_loss": 7.6174576425552365, + "loss": 5.5951, + "step": 358500 + }, + { + "epoch": 0.6847381591796875, + "grad_norm": 85.65673065185547, + "learning_rate": 1.5763187408447266e-05, + "lookahead_loss": 7.568507328033447, + "loss": 5.6119, + "step": 359000 + }, + { + "epoch": 0.6856918334960938, + "grad_norm": 69.52005004882812, + "learning_rate": 1.5715503692626953e-05, + "lookahead_loss": 7.615708199501038, + "loss": 5.5919, + "step": 359500 + }, + { + "epoch": 0.6866455078125, + "grad_norm": 151.19342041015625, + "learning_rate": 1.5667819976806643e-05, + "lookahead_loss": 7.622918822288513, + "loss": 5.5886, + "step": 360000 + }, + { + "epoch": 0.6866455078125, + "eval_accuracy": 0.03858590998043053, + "eval_lookahead_loss": 7.604607222747803, + "eval_lookahead_perplexity": 2007.4232686471876, + "eval_loss": 5.431325912475586, + "eval_perplexity": 228.45195199166682, + "eval_runtime": 489.7505, + "eval_samples_per_second": 20.419, + "eval_steps_per_second": 5.105, + "step": 360000 + }, + { + "epoch": 0.6875991821289062, + "grad_norm": 54.79999542236328, + "learning_rate": 1.562013626098633e-05, + "lookahead_loss": 7.6705844058990476, + "loss": 5.5549, + "step": 360500 + }, + { + "epoch": 0.6885528564453125, + "grad_norm": 170.39064025878906, + "learning_rate": 1.5572452545166016e-05, + "lookahead_loss": 7.604489400863647, + "loss": 5.5869, + "step": 361000 + }, + { + "epoch": 0.6895065307617188, + "grad_norm": 157.4973907470703, + "learning_rate": 1.5524768829345703e-05, + "lookahead_loss": 7.677668005943298, + "loss": 5.5929, + "step": 361500 + }, + { + "epoch": 0.690460205078125, + "grad_norm": 72.11114501953125, + "learning_rate": 1.547708511352539e-05, + "lookahead_loss": 7.671922834396362, + "loss": 5.5913, + "step": 362000 + }, + { + "epoch": 0.6914138793945312, + "grad_norm": 96.38089752197266, + "learning_rate": 1.542940139770508e-05, + "lookahead_loss": 7.641579981803894, + "loss": 5.569, + "step": 362500 + }, + { + "epoch": 0.6923675537109375, + "grad_norm": 75.91097259521484, + "learning_rate": 1.5381717681884767e-05, + "lookahead_loss": 7.568136302947998, + "loss": 5.5394, + "step": 363000 + }, + { + "epoch": 0.6933212280273438, + "grad_norm": 38.369083404541016, + "learning_rate": 1.5334033966064454e-05, + "lookahead_loss": 7.567677716255188, + "loss": 5.5073, + "step": 363500 + }, + { + "epoch": 0.69427490234375, + "grad_norm": 62.12953567504883, + "learning_rate": 1.528635025024414e-05, + "lookahead_loss": 7.618713015556335, + "loss": 5.5261, + "step": 364000 + }, + { + "epoch": 0.6952285766601562, + "grad_norm": 76.11123657226562, + "learning_rate": 1.523866653442383e-05, + "lookahead_loss": 7.479368523597717, + "loss": 5.5106, + "step": 364500 + }, + { + "epoch": 0.6961822509765625, + "grad_norm": 100.44092559814453, + "learning_rate": 1.5190982818603516e-05, + "lookahead_loss": 7.621063689231873, + "loss": 5.5795, + "step": 365000 + }, + { + "epoch": 0.6961822509765625, + "eval_accuracy": 0.03946066536203523, + "eval_lookahead_loss": 7.605871375846863, + "eval_lookahead_perplexity": 2009.9625636840503, + "eval_loss": 5.422245502471924, + "eval_perplexity": 226.38690452950155, + "eval_runtime": 491.0958, + "eval_samples_per_second": 20.363, + "eval_steps_per_second": 5.091, + "step": 365000 + }, + { + "epoch": 0.6971359252929688, + "grad_norm": 85.50440979003906, + "learning_rate": 1.5143299102783205e-05, + "lookahead_loss": 7.632618884086609, + "loss": 5.6807, + "step": 365500 + }, + { + "epoch": 0.698089599609375, + "grad_norm": 100.22126007080078, + "learning_rate": 1.5095615386962891e-05, + "lookahead_loss": 7.54717865562439, + "loss": 5.6459, + "step": 366000 + }, + { + "epoch": 0.6990432739257812, + "grad_norm": 69.62335205078125, + "learning_rate": 1.5047931671142578e-05, + "lookahead_loss": 7.772770000457764, + "loss": 5.6874, + "step": 366500 + }, + { + "epoch": 0.6999969482421875, + "grad_norm": 131.26800537109375, + "learning_rate": 1.5000247955322267e-05, + "lookahead_loss": 7.811462509155273, + "loss": 5.6796, + "step": 367000 + }, + { + "epoch": 0.7009506225585938, + "grad_norm": 140.08290100097656, + "learning_rate": 1.4952564239501954e-05, + "lookahead_loss": 7.590369746208191, + "loss": 5.5774, + "step": 367500 + }, + { + "epoch": 0.701904296875, + "grad_norm": 579.3489990234375, + "learning_rate": 1.4904880523681642e-05, + "lookahead_loss": 7.681502223968506, + "loss": 5.5824, + "step": 368000 + }, + { + "epoch": 0.7028579711914062, + "grad_norm": 59.328189849853516, + "learning_rate": 1.4857196807861329e-05, + "lookahead_loss": 7.522685302734375, + "loss": 5.565, + "step": 368500 + }, + { + "epoch": 0.7038116455078125, + "grad_norm": 65.36320495605469, + "learning_rate": 1.4809513092041016e-05, + "lookahead_loss": 7.587596984863281, + "loss": 5.5921, + "step": 369000 + }, + { + "epoch": 0.7047653198242188, + "grad_norm": 108.03498077392578, + "learning_rate": 1.4761829376220704e-05, + "lookahead_loss": 7.551915717124939, + "loss": 5.5514, + "step": 369500 + }, + { + "epoch": 0.705718994140625, + "grad_norm": 65.2369155883789, + "learning_rate": 1.4714145660400391e-05, + "lookahead_loss": 7.721729018211365, + "loss": 5.6283, + "step": 370000 + }, + { + "epoch": 0.705718994140625, + "eval_accuracy": 0.03867260273972603, + "eval_lookahead_loss": 7.591337853622437, + "eval_lookahead_perplexity": 1980.9619788810905, + "eval_loss": 5.414850234985352, + "eval_perplexity": 224.7188881307444, + "eval_runtime": 494.1633, + "eval_samples_per_second": 20.236, + "eval_steps_per_second": 5.059, + "step": 370000 + }, + { + "epoch": 0.7066726684570312, + "grad_norm": 59.80350875854492, + "learning_rate": 1.466646194458008e-05, + "lookahead_loss": 7.665653528213501, + "loss": 5.536, + "step": 370500 + }, + { + "epoch": 0.7076263427734375, + "grad_norm": 65.0726089477539, + "learning_rate": 1.4618778228759766e-05, + "lookahead_loss": 7.710492059707642, + "loss": 5.6378, + "step": 371000 + }, + { + "epoch": 0.7085800170898438, + "grad_norm": 93.6043930053711, + "learning_rate": 1.4571094512939453e-05, + "lookahead_loss": 7.646479023933411, + "loss": 5.5727, + "step": 371500 + }, + { + "epoch": 0.70953369140625, + "grad_norm": 133.7737274169922, + "learning_rate": 1.4523410797119142e-05, + "lookahead_loss": 7.665736899375916, + "loss": 5.597, + "step": 372000 + }, + { + "epoch": 0.7104873657226562, + "grad_norm": 43.25067901611328, + "learning_rate": 1.4475727081298829e-05, + "lookahead_loss": 7.712738265037537, + "loss": 5.5853, + "step": 372500 + }, + { + "epoch": 0.7114410400390625, + "grad_norm": 45.30479049682617, + "learning_rate": 1.4428043365478517e-05, + "lookahead_loss": 7.656399544715882, + "loss": 5.5657, + "step": 373000 + }, + { + "epoch": 0.7123947143554688, + "grad_norm": 116.81694793701172, + "learning_rate": 1.4380359649658204e-05, + "lookahead_loss": 7.652817762374878, + "loss": 5.6046, + "step": 373500 + }, + { + "epoch": 0.713348388671875, + "grad_norm": 67.41263580322266, + "learning_rate": 1.433267593383789e-05, + "lookahead_loss": 7.706176741600037, + "loss": 5.5591, + "step": 374000 + }, + { + "epoch": 0.7143020629882812, + "grad_norm": 58.651676177978516, + "learning_rate": 1.428499221801758e-05, + "lookahead_loss": 7.708136086463928, + "loss": 5.6032, + "step": 374500 + }, + { + "epoch": 0.7152557373046875, + "grad_norm": 82.5753173828125, + "learning_rate": 1.4237308502197266e-05, + "lookahead_loss": 7.606646158218384, + "loss": 5.5595, + "step": 375000 + }, + { + "epoch": 0.7152557373046875, + "eval_accuracy": 0.03926614481409002, + "eval_lookahead_loss": 7.616972279167175, + "eval_lookahead_perplexity": 2032.3992672134311, + "eval_loss": 5.404574394226074, + "eval_perplexity": 222.4215364460516, + "eval_runtime": 489.6251, + "eval_samples_per_second": 20.424, + "eval_steps_per_second": 5.106, + "step": 375000 + }, + { + "epoch": 0.7162094116210938, + "grad_norm": 89.38666534423828, + "learning_rate": 1.4189624786376955e-05, + "lookahead_loss": 7.610304831504822, + "loss": 5.5262, + "step": 375500 + }, + { + "epoch": 0.7171630859375, + "grad_norm": 79.63694763183594, + "learning_rate": 1.4141941070556641e-05, + "lookahead_loss": 7.683006196975708, + "loss": 5.5718, + "step": 376000 + }, + { + "epoch": 0.7181167602539062, + "grad_norm": 51.36540603637695, + "learning_rate": 1.4094257354736328e-05, + "lookahead_loss": 7.709863342285156, + "loss": 5.551, + "step": 376500 + }, + { + "epoch": 0.7190704345703125, + "grad_norm": 90.45024108886719, + "learning_rate": 1.4046573638916017e-05, + "lookahead_loss": 7.568320033073426, + "loss": 5.5702, + "step": 377000 + }, + { + "epoch": 0.7200241088867188, + "grad_norm": 124.38764190673828, + "learning_rate": 1.3998889923095704e-05, + "lookahead_loss": 7.666269627571106, + "loss": 5.5766, + "step": 377500 + }, + { + "epoch": 0.720977783203125, + "grad_norm": 99.64993286132812, + "learning_rate": 1.3951206207275392e-05, + "lookahead_loss": 7.621032904624939, + "loss": 5.5857, + "step": 378000 + }, + { + "epoch": 0.7219314575195312, + "grad_norm": 49.53767776489258, + "learning_rate": 1.3903522491455079e-05, + "lookahead_loss": 7.6225580930709835, + "loss": 5.5702, + "step": 378500 + }, + { + "epoch": 0.7228851318359375, + "grad_norm": 78.93052673339844, + "learning_rate": 1.3855838775634766e-05, + "lookahead_loss": 7.755086720466614, + "loss": 5.5359, + "step": 379000 + }, + { + "epoch": 0.7238388061523438, + "grad_norm": 63.549617767333984, + "learning_rate": 1.3808155059814454e-05, + "lookahead_loss": 7.577735940933228, + "loss": 5.5433, + "step": 379500 + }, + { + "epoch": 0.72479248046875, + "grad_norm": 115.36119842529297, + "learning_rate": 1.3760471343994141e-05, + "lookahead_loss": 7.665920634269714, + "loss": 5.5667, + "step": 380000 + }, + { + "epoch": 0.72479248046875, + "eval_accuracy": 0.039015264187866924, + "eval_lookahead_loss": 7.61840461063385, + "eval_lookahead_perplexity": 2035.3124224406115, + "eval_loss": 5.396119594573975, + "eval_perplexity": 220.54893431005306, + "eval_runtime": 496.8011, + "eval_samples_per_second": 20.129, + "eval_steps_per_second": 5.032, + "step": 380000 + }, + { + "epoch": 0.7257461547851562, + "grad_norm": 321.5196838378906, + "learning_rate": 1.371278762817383e-05, + "lookahead_loss": 7.702496208190918, + "loss": 5.6062, + "step": 380500 + }, + { + "epoch": 0.7266998291015625, + "grad_norm": 55.56507110595703, + "learning_rate": 1.3665103912353516e-05, + "lookahead_loss": 7.6807406759262085, + "loss": 5.5789, + "step": 381000 + }, + { + "epoch": 0.7276535034179688, + "grad_norm": 78.44230651855469, + "learning_rate": 1.3617420196533203e-05, + "lookahead_loss": 7.660055998802185, + "loss": 5.5765, + "step": 381500 + }, + { + "epoch": 0.728607177734375, + "grad_norm": 117.78761291503906, + "learning_rate": 1.3569736480712892e-05, + "lookahead_loss": 7.671208109855652, + "loss": 5.5374, + "step": 382000 + }, + { + "epoch": 0.7295608520507812, + "grad_norm": 40.70844650268555, + "learning_rate": 1.3522052764892579e-05, + "lookahead_loss": 7.6092142457962035, + "loss": 5.4831, + "step": 382500 + }, + { + "epoch": 0.7305145263671875, + "grad_norm": 110.36885833740234, + "learning_rate": 1.3474369049072265e-05, + "lookahead_loss": 7.553666878700256, + "loss": 5.5251, + "step": 383000 + }, + { + "epoch": 0.7314682006835938, + "grad_norm": 62.2484016418457, + "learning_rate": 1.3426685333251954e-05, + "lookahead_loss": 7.543814493179322, + "loss": 5.5209, + "step": 383500 + }, + { + "epoch": 0.732421875, + "grad_norm": 72.45450592041016, + "learning_rate": 1.337900161743164e-05, + "lookahead_loss": 7.570508386611938, + "loss": 5.4959, + "step": 384000 + }, + { + "epoch": 0.7333755493164062, + "grad_norm": 58.14596939086914, + "learning_rate": 1.333131790161133e-05, + "lookahead_loss": 7.501864053726196, + "loss": 5.4797, + "step": 384500 + }, + { + "epoch": 0.7343292236328125, + "grad_norm": 59.66182327270508, + "learning_rate": 1.3283634185791016e-05, + "lookahead_loss": 7.621293797492981, + "loss": 5.5508, + "step": 385000 + }, + { + "epoch": 0.7343292236328125, + "eval_accuracy": 0.039309393346379645, + "eval_lookahead_loss": 7.5623254400253295, + "eval_lookahead_perplexity": 1924.315193945646, + "eval_loss": 5.38715934753418, + "eval_perplexity": 218.58158848906987, + "eval_runtime": 493.5797, + "eval_samples_per_second": 20.26, + "eval_steps_per_second": 5.065, + "step": 385000 + }, + { + "epoch": 0.7352828979492188, + "grad_norm": 212.38523864746094, + "learning_rate": 1.3235950469970703e-05, + "lookahead_loss": 7.692860730171204, + "loss": 5.6151, + "step": 385500 + }, + { + "epoch": 0.736236572265625, + "grad_norm": 63.727699279785156, + "learning_rate": 1.3188266754150391e-05, + "lookahead_loss": 7.675564749717712, + "loss": 5.6071, + "step": 386000 + }, + { + "epoch": 0.7371902465820312, + "grad_norm": 132.97877502441406, + "learning_rate": 1.3140583038330078e-05, + "lookahead_loss": 7.571804716110229, + "loss": 5.5852, + "step": 386500 + }, + { + "epoch": 0.7381439208984375, + "grad_norm": 38.12655258178711, + "learning_rate": 1.3092899322509767e-05, + "lookahead_loss": 7.68760968208313, + "loss": 5.554, + "step": 387000 + }, + { + "epoch": 0.7390975952148438, + "grad_norm": 40.39825439453125, + "learning_rate": 1.3045215606689454e-05, + "lookahead_loss": 7.6389426393508915, + "loss": 5.5249, + "step": 387500 + }, + { + "epoch": 0.74005126953125, + "grad_norm": 98.46976470947266, + "learning_rate": 1.299753189086914e-05, + "lookahead_loss": 7.602905288696289, + "loss": 5.614, + "step": 388000 + }, + { + "epoch": 0.7410049438476562, + "grad_norm": 60.494598388671875, + "learning_rate": 1.2949848175048829e-05, + "lookahead_loss": 7.5567783374786375, + "loss": 5.5682, + "step": 388500 + }, + { + "epoch": 0.7419586181640625, + "grad_norm": 113.4759521484375, + "learning_rate": 1.2902164459228516e-05, + "lookahead_loss": 7.608567108154297, + "loss": 5.5305, + "step": 389000 + }, + { + "epoch": 0.7429122924804688, + "grad_norm": 159.10374450683594, + "learning_rate": 1.2854480743408204e-05, + "lookahead_loss": 7.667847086906433, + "loss": 5.6016, + "step": 389500 + }, + { + "epoch": 0.743865966796875, + "grad_norm": 41.53785705566406, + "learning_rate": 1.2806797027587891e-05, + "lookahead_loss": 7.6221770544052125, + "loss": 5.5796, + "step": 390000 + }, + { + "epoch": 0.743865966796875, + "eval_accuracy": 0.03906497064579256, + "eval_lookahead_loss": 7.559039109230041, + "eval_lookahead_perplexity": 1918.0016375635087, + "eval_loss": 5.379703998565674, + "eval_perplexity": 216.95804602660695, + "eval_runtime": 499.9939, + "eval_samples_per_second": 20.0, + "eval_steps_per_second": 5.0, + "step": 390000 + }, + { + "epoch": 0.7448196411132812, + "grad_norm": 46.71460723876953, + "learning_rate": 1.2759113311767578e-05, + "lookahead_loss": 7.672293969154358, + "loss": 5.5463, + "step": 390500 + }, + { + "epoch": 0.7457733154296875, + "grad_norm": 81.75881958007812, + "learning_rate": 1.2711429595947266e-05, + "lookahead_loss": 7.65075671005249, + "loss": 5.5797, + "step": 391000 + }, + { + "epoch": 0.7467269897460938, + "grad_norm": 171.552978515625, + "learning_rate": 1.2663745880126953e-05, + "lookahead_loss": 7.548689237594605, + "loss": 5.5142, + "step": 391500 + }, + { + "epoch": 0.7476806640625, + "grad_norm": 91.40204620361328, + "learning_rate": 1.2616062164306642e-05, + "lookahead_loss": 7.603097931861877, + "loss": 5.5173, + "step": 392000 + }, + { + "epoch": 0.7486343383789062, + "grad_norm": 55.01487350463867, + "learning_rate": 1.2568378448486329e-05, + "lookahead_loss": 7.637936964988708, + "loss": 5.5354, + "step": 392500 + }, + { + "epoch": 0.7495880126953125, + "grad_norm": 41.5325927734375, + "learning_rate": 1.2520694732666015e-05, + "lookahead_loss": 7.700466906547546, + "loss": 5.5058, + "step": 393000 + }, + { + "epoch": 0.7505416870117188, + "grad_norm": 66.20587921142578, + "learning_rate": 1.2473011016845704e-05, + "lookahead_loss": 7.661156332969665, + "loss": 5.5401, + "step": 393500 + }, + { + "epoch": 0.751495361328125, + "grad_norm": 65.88750457763672, + "learning_rate": 1.242532730102539e-05, + "lookahead_loss": 7.581592578887939, + "loss": 5.5349, + "step": 394000 + }, + { + "epoch": 0.7524490356445312, + "grad_norm": 37.96415710449219, + "learning_rate": 1.237764358520508e-05, + "lookahead_loss": 7.635292756080627, + "loss": 5.5336, + "step": 394500 + }, + { + "epoch": 0.7534027099609375, + "grad_norm": 29.575408935546875, + "learning_rate": 1.2329959869384766e-05, + "lookahead_loss": 7.5903411312103275, + "loss": 5.5285, + "step": 395000 + }, + { + "epoch": 0.7534027099609375, + "eval_accuracy": 0.0386, + "eval_lookahead_loss": 7.566088166999817, + "eval_lookahead_perplexity": 1931.5695060723358, + "eval_loss": 5.370765209197998, + "eval_perplexity": 215.02734566901094, + "eval_runtime": 495.3107, + "eval_samples_per_second": 20.189, + "eval_steps_per_second": 5.047, + "step": 395000 + }, + { + "epoch": 0.7543563842773438, + "grad_norm": 47.93413162231445, + "learning_rate": 1.2282276153564453e-05, + "lookahead_loss": 7.568159667015076, + "loss": 5.5068, + "step": 395500 + }, + { + "epoch": 0.75531005859375, + "grad_norm": 54.97478485107422, + "learning_rate": 1.2234592437744141e-05, + "lookahead_loss": 7.547260239601135, + "loss": 5.4286, + "step": 396000 + }, + { + "epoch": 0.7562637329101562, + "grad_norm": 48.8118896484375, + "learning_rate": 1.2186908721923828e-05, + "lookahead_loss": 7.646974905967713, + "loss": 5.5322, + "step": 396500 + }, + { + "epoch": 0.7572174072265625, + "grad_norm": 125.30783081054688, + "learning_rate": 1.2139225006103517e-05, + "lookahead_loss": 7.5923522834777835, + "loss": 5.5255, + "step": 397000 + }, + { + "epoch": 0.7581710815429688, + "grad_norm": 41.36922836303711, + "learning_rate": 1.2091541290283204e-05, + "lookahead_loss": 7.597936030387879, + "loss": 5.5393, + "step": 397500 + }, + { + "epoch": 0.759124755859375, + "grad_norm": 81.73969268798828, + "learning_rate": 1.204385757446289e-05, + "lookahead_loss": 7.58720658493042, + "loss": 5.4943, + "step": 398000 + }, + { + "epoch": 0.7600784301757812, + "grad_norm": 77.25200653076172, + "learning_rate": 1.1996173858642579e-05, + "lookahead_loss": 7.660082992553711, + "loss": 5.5275, + "step": 398500 + }, + { + "epoch": 0.7610321044921875, + "grad_norm": 76.67062377929688, + "learning_rate": 1.1948490142822266e-05, + "lookahead_loss": 7.688210826873779, + "loss": 5.5177, + "step": 399000 + }, + { + "epoch": 0.7619857788085938, + "grad_norm": 52.03340148925781, + "learning_rate": 1.1900806427001954e-05, + "lookahead_loss": 7.648560061454773, + "loss": 5.505, + "step": 399500 + }, + { + "epoch": 0.762939453125, + "grad_norm": 50.49256896972656, + "learning_rate": 1.1853122711181641e-05, + "lookahead_loss": 7.624071607112884, + "loss": 5.5194, + "step": 400000 + }, + { + "epoch": 0.762939453125, + "eval_accuracy": 0.03850939334637965, + "eval_lookahead_loss": 7.578225363349914, + "eval_lookahead_perplexity": 1955.1561930081455, + "eval_loss": 5.36337947845459, + "eval_perplexity": 213.44506194486513, + "eval_runtime": 496.2584, + "eval_samples_per_second": 20.151, + "eval_steps_per_second": 5.038, + "step": 400000 + }, + { + "epoch": 0.7638931274414062, + "grad_norm": 75.72126007080078, + "learning_rate": 1.1805438995361328e-05, + "lookahead_loss": 7.580011445045471, + "loss": 5.5122, + "step": 400500 + }, + { + "epoch": 0.7648468017578125, + "grad_norm": 79.77975463867188, + "learning_rate": 1.1757755279541016e-05, + "lookahead_loss": 7.519650664329529, + "loss": 5.477, + "step": 401000 + }, + { + "epoch": 0.7658004760742188, + "grad_norm": 37.1960334777832, + "learning_rate": 1.1710071563720703e-05, + "lookahead_loss": 7.5450973138809205, + "loss": 5.4536, + "step": 401500 + }, + { + "epoch": 0.766754150390625, + "grad_norm": 42.624446868896484, + "learning_rate": 1.1662387847900392e-05, + "lookahead_loss": 7.476261118888855, + "loss": 5.4812, + "step": 402000 + }, + { + "epoch": 0.7677078247070312, + "grad_norm": 51.64815139770508, + "learning_rate": 1.1614704132080079e-05, + "lookahead_loss": 7.470086345672607, + "loss": 5.4387, + "step": 402500 + }, + { + "epoch": 0.7686614990234375, + "grad_norm": 51.93968200683594, + "learning_rate": 1.1567020416259765e-05, + "lookahead_loss": 7.554843747138977, + "loss": 5.5533, + "step": 403000 + }, + { + "epoch": 0.7696151733398438, + "grad_norm": 47.713382720947266, + "learning_rate": 1.1519336700439454e-05, + "lookahead_loss": 7.571363210678101, + "loss": 5.4869, + "step": 403500 + }, + { + "epoch": 0.77056884765625, + "grad_norm": 104.33297729492188, + "learning_rate": 1.147165298461914e-05, + "lookahead_loss": 7.626045141220093, + "loss": 5.5522, + "step": 404000 + }, + { + "epoch": 0.7715225219726562, + "grad_norm": 116.54916381835938, + "learning_rate": 1.142396926879883e-05, + "lookahead_loss": 7.603502921104431, + "loss": 5.6182, + "step": 404500 + }, + { + "epoch": 0.7724761962890625, + "grad_norm": 70.0487289428711, + "learning_rate": 1.1376285552978516e-05, + "lookahead_loss": 7.664078218460083, + "loss": 5.5849, + "step": 405000 + }, + { + "epoch": 0.7724761962890625, + "eval_accuracy": 0.03844090019569472, + "eval_lookahead_loss": 7.553167304992676, + "eval_lookahead_perplexity": 1906.7725073113113, + "eval_loss": 5.357154369354248, + "eval_perplexity": 212.12047028865916, + "eval_runtime": 495.3451, + "eval_samples_per_second": 20.188, + "eval_steps_per_second": 5.047, + "step": 405000 + }, + { + "epoch": 0.7734298706054688, + "grad_norm": 194.6410675048828, + "learning_rate": 1.1328601837158203e-05, + "lookahead_loss": 7.5126777296066285, + "loss": 5.5604, + "step": 405500 + }, + { + "epoch": 0.774383544921875, + "grad_norm": 70.98809051513672, + "learning_rate": 1.1280918121337891e-05, + "lookahead_loss": 7.629218717575073, + "loss": 5.5836, + "step": 406000 + }, + { + "epoch": 0.7753372192382812, + "grad_norm": 56.12082290649414, + "learning_rate": 1.1233234405517578e-05, + "lookahead_loss": 7.580715406417847, + "loss": 5.5135, + "step": 406500 + }, + { + "epoch": 0.7762908935546875, + "grad_norm": 12.614853858947754, + "learning_rate": 1.1185550689697267e-05, + "lookahead_loss": 7.6448364210128785, + "loss": 5.5526, + "step": 407000 + }, + { + "epoch": 0.7772445678710938, + "grad_norm": 65.30194854736328, + "learning_rate": 1.1137866973876954e-05, + "lookahead_loss": 7.689765469074249, + "loss": 5.5003, + "step": 407500 + }, + { + "epoch": 0.7781982421875, + "grad_norm": 73.15607452392578, + "learning_rate": 1.109018325805664e-05, + "lookahead_loss": 7.562525021076202, + "loss": 5.5006, + "step": 408000 + }, + { + "epoch": 0.7791519165039062, + "grad_norm": 55.93141174316406, + "learning_rate": 1.1042499542236329e-05, + "lookahead_loss": 7.582972754478455, + "loss": 5.5263, + "step": 408500 + }, + { + "epoch": 0.7801055908203125, + "grad_norm": 53.561832427978516, + "learning_rate": 1.0994815826416016e-05, + "lookahead_loss": 7.54234787940979, + "loss": 5.5075, + "step": 409000 + }, + { + "epoch": 0.7810592651367188, + "grad_norm": 71.15483093261719, + "learning_rate": 1.0947132110595704e-05, + "lookahead_loss": 7.656134716033936, + "loss": 5.5566, + "step": 409500 + }, + { + "epoch": 0.782012939453125, + "grad_norm": 192.37393188476562, + "learning_rate": 1.0899448394775391e-05, + "lookahead_loss": 7.543108942985534, + "loss": 5.48, + "step": 410000 + }, + { + "epoch": 0.782012939453125, + "eval_accuracy": 0.03874872798434442, + "eval_lookahead_loss": 7.554345646476746, + "eval_lookahead_perplexity": 1909.020660743458, + "eval_loss": 5.349673748016357, + "eval_perplexity": 210.5395976991162, + "eval_runtime": 503.2128, + "eval_samples_per_second": 19.872, + "eval_steps_per_second": 4.968, + "step": 410000 + }, + { + "epoch": 0.7829666137695312, + "grad_norm": 64.28553009033203, + "learning_rate": 1.0851764678955078e-05, + "lookahead_loss": 7.49291491985321, + "loss": 5.437, + "step": 410500 + }, + { + "epoch": 0.7839202880859375, + "grad_norm": 44.04719924926758, + "learning_rate": 1.0804080963134766e-05, + "lookahead_loss": 7.670901400566101, + "loss": 5.5137, + "step": 411000 + }, + { + "epoch": 0.7848739624023438, + "grad_norm": 40.70610809326172, + "learning_rate": 1.0756397247314453e-05, + "lookahead_loss": 7.586776768684387, + "loss": 5.537, + "step": 411500 + }, + { + "epoch": 0.78582763671875, + "grad_norm": 104.03630828857422, + "learning_rate": 1.0708713531494142e-05, + "lookahead_loss": 7.584032464981079, + "loss": 5.5157, + "step": 412000 + }, + { + "epoch": 0.7867813110351562, + "grad_norm": 55.11607360839844, + "learning_rate": 1.0661029815673829e-05, + "lookahead_loss": 7.514433693885803, + "loss": 5.5268, + "step": 412500 + }, + { + "epoch": 0.7877349853515625, + "grad_norm": 74.29720306396484, + "learning_rate": 1.0613346099853515e-05, + "lookahead_loss": 7.687921340942383, + "loss": 5.5265, + "step": 413000 + }, + { + "epoch": 0.7886886596679688, + "grad_norm": 48.57407760620117, + "learning_rate": 1.0565662384033204e-05, + "lookahead_loss": 7.6220727558135986, + "loss": 5.5457, + "step": 413500 + }, + { + "epoch": 0.789642333984375, + "grad_norm": 34.73165512084961, + "learning_rate": 1.051797866821289e-05, + "lookahead_loss": 7.675038757324219, + "loss": 5.5086, + "step": 414000 + }, + { + "epoch": 0.7905960083007812, + "grad_norm": 41.740787506103516, + "learning_rate": 1.047029495239258e-05, + "lookahead_loss": 7.724614574432373, + "loss": 5.5197, + "step": 414500 + }, + { + "epoch": 0.7915496826171875, + "grad_norm": 37.688316345214844, + "learning_rate": 1.0422611236572266e-05, + "lookahead_loss": 7.609706436157227, + "loss": 5.5236, + "step": 415000 + }, + { + "epoch": 0.7915496826171875, + "eval_accuracy": 0.03794500978473581, + "eval_lookahead_loss": 7.549204428863526, + "eval_lookahead_perplexity": 1899.231156646987, + "eval_loss": 5.338712692260742, + "eval_perplexity": 208.24446295888373, + "eval_runtime": 492.8973, + "eval_samples_per_second": 20.288, + "eval_steps_per_second": 5.072, + "step": 415000 + }, + { + "epoch": 0.7925033569335938, + "grad_norm": 90.6028060913086, + "learning_rate": 1.0374927520751953e-05, + "lookahead_loss": 7.643197300910949, + "loss": 5.5076, + "step": 415500 + }, + { + "epoch": 0.79345703125, + "grad_norm": 68.4003677368164, + "learning_rate": 1.0327243804931641e-05, + "lookahead_loss": 7.646915596961975, + "loss": 5.5064, + "step": 416000 + }, + { + "epoch": 0.7944107055664062, + "grad_norm": 45.29821014404297, + "learning_rate": 1.0279560089111328e-05, + "lookahead_loss": 7.545495817184448, + "loss": 5.5018, + "step": 416500 + }, + { + "epoch": 0.7953643798828125, + "grad_norm": 58.231056213378906, + "learning_rate": 1.0231876373291017e-05, + "lookahead_loss": 7.632719036102295, + "loss": 5.5552, + "step": 417000 + }, + { + "epoch": 0.7963180541992188, + "grad_norm": 28.0937442779541, + "learning_rate": 1.0184192657470704e-05, + "lookahead_loss": 7.514857758522034, + "loss": 5.4702, + "step": 417500 + }, + { + "epoch": 0.797271728515625, + "grad_norm": 64.54873657226562, + "learning_rate": 1.013650894165039e-05, + "lookahead_loss": 7.616935418128968, + "loss": 5.5079, + "step": 418000 + }, + { + "epoch": 0.7982254028320312, + "grad_norm": 204.77696228027344, + "learning_rate": 1.0088825225830079e-05, + "lookahead_loss": 7.479987069129944, + "loss": 5.4917, + "step": 418500 + }, + { + "epoch": 0.7991790771484375, + "grad_norm": 56.822208404541016, + "learning_rate": 1.0041141510009766e-05, + "lookahead_loss": 7.45820097732544, + "loss": 5.4453, + "step": 419000 + }, + { + "epoch": 0.8001327514648438, + "grad_norm": 34.764198303222656, + "learning_rate": 9.993457794189454e-06, + "lookahead_loss": 7.533453133583069, + "loss": 5.4608, + "step": 419500 + }, + { + "epoch": 0.80108642578125, + "grad_norm": 76.45616149902344, + "learning_rate": 9.945774078369141e-06, + "lookahead_loss": 7.521807224273681, + "loss": 5.4733, + "step": 420000 + }, + { + "epoch": 0.80108642578125, + "eval_accuracy": 0.03823150684931507, + "eval_lookahead_loss": 7.5329319259643555, + "eval_lookahead_perplexity": 1868.5760065928805, + "eval_loss": 5.331576347351074, + "eval_perplexity": 206.76364873071873, + "eval_runtime": 493.2409, + "eval_samples_per_second": 20.274, + "eval_steps_per_second": 5.069, + "step": 420000 + }, + { + "epoch": 0.8020401000976562, + "grad_norm": 32.97414016723633, + "learning_rate": 9.898090362548828e-06, + "lookahead_loss": 7.487836048126221, + "loss": 5.4915, + "step": 420500 + }, + { + "epoch": 0.8029937744140625, + "grad_norm": 30.75705909729004, + "learning_rate": 9.850406646728516e-06, + "lookahead_loss": 7.573728487014771, + "loss": 5.532, + "step": 421000 + }, + { + "epoch": 0.8039474487304688, + "grad_norm": 31.36495018005371, + "learning_rate": 9.802722930908203e-06, + "lookahead_loss": 7.564960525512696, + "loss": 5.5667, + "step": 421500 + }, + { + "epoch": 0.804901123046875, + "grad_norm": 45.87747573852539, + "learning_rate": 9.755039215087892e-06, + "lookahead_loss": 7.605685463905335, + "loss": 5.5408, + "step": 422000 + }, + { + "epoch": 0.8058547973632812, + "grad_norm": 34.74448013305664, + "learning_rate": 9.707355499267579e-06, + "lookahead_loss": 7.5370338907241825, + "loss": 5.5577, + "step": 422500 + }, + { + "epoch": 0.8068084716796875, + "grad_norm": 52.0458869934082, + "learning_rate": 9.659671783447265e-06, + "lookahead_loss": 7.646506590843201, + "loss": 5.5266, + "step": 423000 + }, + { + "epoch": 0.8077621459960938, + "grad_norm": 25.270662307739258, + "learning_rate": 9.611988067626954e-06, + "lookahead_loss": 7.687211939811706, + "loss": 5.5425, + "step": 423500 + }, + { + "epoch": 0.8087158203125, + "grad_norm": 122.20146942138672, + "learning_rate": 9.56430435180664e-06, + "lookahead_loss": 7.609967276573181, + "loss": 5.4811, + "step": 424000 + }, + { + "epoch": 0.8096694946289062, + "grad_norm": 59.44173812866211, + "learning_rate": 9.51662063598633e-06, + "lookahead_loss": 7.6141966466903686, + "loss": 5.5326, + "step": 424500 + }, + { + "epoch": 0.8106231689453125, + "grad_norm": 39.35455322265625, + "learning_rate": 9.468936920166016e-06, + "lookahead_loss": 7.6102399034500126, + "loss": 5.4857, + "step": 425000 + }, + { + "epoch": 0.8106231689453125, + "eval_accuracy": 0.03774716242661448, + "eval_lookahead_loss": 7.5397014886856075, + "eval_lookahead_perplexity": 1881.2683614440737, + "eval_loss": 5.325006484985352, + "eval_perplexity": 205.40969253931107, + "eval_runtime": 493.2845, + "eval_samples_per_second": 20.272, + "eval_steps_per_second": 5.068, + "step": 425000 + }, + { + "epoch": 0.8115768432617188, + "grad_norm": 130.49562072753906, + "learning_rate": 9.421253204345703e-06, + "lookahead_loss": 7.615910403251648, + "loss": 5.4957, + "step": 425500 + }, + { + "epoch": 0.812530517578125, + "grad_norm": 53.05150604248047, + "learning_rate": 9.373569488525391e-06, + "lookahead_loss": 7.579840442657471, + "loss": 5.5065, + "step": 426000 + }, + { + "epoch": 0.8134841918945312, + "grad_norm": 22.887344360351562, + "learning_rate": 9.325885772705078e-06, + "lookahead_loss": 7.622527934074402, + "loss": 5.5203, + "step": 426500 + }, + { + "epoch": 0.8144378662109375, + "grad_norm": 44.67947006225586, + "learning_rate": 9.278202056884767e-06, + "lookahead_loss": 7.642703639984131, + "loss": 5.5073, + "step": 427000 + }, + { + "epoch": 0.8153915405273438, + "grad_norm": 34.36019515991211, + "learning_rate": 9.230518341064454e-06, + "lookahead_loss": 7.570685408592224, + "loss": 5.4852, + "step": 427500 + }, + { + "epoch": 0.81634521484375, + "grad_norm": 35.43575668334961, + "learning_rate": 9.18283462524414e-06, + "lookahead_loss": 7.55880092048645, + "loss": 5.4934, + "step": 428000 + }, + { + "epoch": 0.8172988891601562, + "grad_norm": 42.698402404785156, + "learning_rate": 9.135150909423829e-06, + "lookahead_loss": 7.617380438804626, + "loss": 5.5289, + "step": 428500 + }, + { + "epoch": 0.8182525634765625, + "grad_norm": 54.84654998779297, + "learning_rate": 9.087467193603516e-06, + "lookahead_loss": 7.56461144733429, + "loss": 5.4675, + "step": 429000 + }, + { + "epoch": 0.8192062377929688, + "grad_norm": 38.21747970581055, + "learning_rate": 9.039783477783204e-06, + "lookahead_loss": 7.676909485816956, + "loss": 5.4947, + "step": 429500 + }, + { + "epoch": 0.820159912109375, + "grad_norm": 37.15910720825195, + "learning_rate": 8.992099761962891e-06, + "lookahead_loss": 7.5221457786560055, + "loss": 5.4883, + "step": 430000 + }, + { + "epoch": 0.820159912109375, + "eval_accuracy": 0.03802152641878669, + "eval_lookahead_loss": 7.548032487678528, + "eval_lookahead_perplexity": 1897.0066731711827, + "eval_loss": 5.316047668457031, + "eval_perplexity": 203.57768336089825, + "eval_runtime": 491.4455, + "eval_samples_per_second": 20.348, + "eval_steps_per_second": 5.087, + "step": 430000 + }, + { + "epoch": 0.8211135864257812, + "grad_norm": 54.006351470947266, + "learning_rate": 8.944416046142578e-06, + "lookahead_loss": 7.594643787384033, + "loss": 5.4805, + "step": 430500 + }, + { + "epoch": 0.8220672607421875, + "grad_norm": 41.79615020751953, + "learning_rate": 8.896732330322266e-06, + "lookahead_loss": 7.592297328948975, + "loss": 5.5145, + "step": 431000 + }, + { + "epoch": 0.8230209350585938, + "grad_norm": 67.11832427978516, + "learning_rate": 8.849048614501953e-06, + "lookahead_loss": 7.534995983123779, + "loss": 5.5137, + "step": 431500 + }, + { + "epoch": 0.823974609375, + "grad_norm": 50.68977737426758, + "learning_rate": 8.801364898681642e-06, + "lookahead_loss": 7.515403673171997, + "loss": 5.4514, + "step": 432000 + }, + { + "epoch": 0.8249282836914062, + "grad_norm": 24.93614387512207, + "learning_rate": 8.753681182861329e-06, + "lookahead_loss": 7.622434290885925, + "loss": 5.5198, + "step": 432500 + }, + { + "epoch": 0.8258819580078125, + "grad_norm": 78.42184448242188, + "learning_rate": 8.705997467041015e-06, + "lookahead_loss": 7.671145604133606, + "loss": 5.4736, + "step": 433000 + }, + { + "epoch": 0.8268356323242188, + "grad_norm": 31.26125717163086, + "learning_rate": 8.658313751220704e-06, + "lookahead_loss": 7.575186867713928, + "loss": 5.5122, + "step": 433500 + }, + { + "epoch": 0.827789306640625, + "grad_norm": 44.12411117553711, + "learning_rate": 8.61063003540039e-06, + "lookahead_loss": 7.580108293533325, + "loss": 5.4887, + "step": 434000 + }, + { + "epoch": 0.8287429809570312, + "grad_norm": 29.470726013183594, + "learning_rate": 8.56294631958008e-06, + "lookahead_loss": 7.607799582481384, + "loss": 5.4834, + "step": 434500 + }, + { + "epoch": 0.8296966552734375, + "grad_norm": 33.90892791748047, + "learning_rate": 8.515262603759766e-06, + "lookahead_loss": 7.491770347595215, + "loss": 5.4659, + "step": 435000 + }, + { + "epoch": 0.8296966552734375, + "eval_accuracy": 0.037281996086105676, + "eval_lookahead_loss": 7.521309097480774, + "eval_lookahead_perplexity": 1846.98359369186, + "eval_loss": 5.307694435119629, + "eval_perplexity": 201.88423420437408, + "eval_runtime": 498.8184, + "eval_samples_per_second": 20.047, + "eval_steps_per_second": 5.012, + "step": 435000 + }, + { + "epoch": 0.8306503295898438, + "grad_norm": 30.055442810058594, + "learning_rate": 8.467578887939453e-06, + "lookahead_loss": 7.382915095329285, + "loss": 5.4657, + "step": 435500 + }, + { + "epoch": 0.83160400390625, + "grad_norm": 84.8909683227539, + "learning_rate": 8.419895172119141e-06, + "lookahead_loss": 7.504948205947876, + "loss": 5.4566, + "step": 436000 + }, + { + "epoch": 0.8325576782226562, + "grad_norm": 40.389060974121094, + "learning_rate": 8.372211456298828e-06, + "lookahead_loss": 7.531291335105896, + "loss": 5.427, + "step": 436500 + }, + { + "epoch": 0.8335113525390625, + "grad_norm": 64.67556762695312, + "learning_rate": 8.324527740478517e-06, + "lookahead_loss": 7.534828496932984, + "loss": 5.4107, + "step": 437000 + }, + { + "epoch": 0.8344650268554688, + "grad_norm": 72.10221862792969, + "learning_rate": 8.276844024658204e-06, + "lookahead_loss": 7.603535109519958, + "loss": 5.4699, + "step": 437500 + }, + { + "epoch": 0.835418701171875, + "grad_norm": 47.084693908691406, + "learning_rate": 8.22916030883789e-06, + "lookahead_loss": 7.610624960899353, + "loss": 5.492, + "step": 438000 + }, + { + "epoch": 0.8363723754882812, + "grad_norm": 37.98929214477539, + "learning_rate": 8.181476593017579e-06, + "lookahead_loss": 7.619500916481018, + "loss": 5.5408, + "step": 438500 + }, + { + "epoch": 0.8373260498046875, + "grad_norm": 45.6959114074707, + "learning_rate": 8.133792877197266e-06, + "lookahead_loss": 7.657407299041748, + "loss": 5.5354, + "step": 439000 + }, + { + "epoch": 0.8382797241210938, + "grad_norm": 22.71889877319336, + "learning_rate": 8.086109161376954e-06, + "lookahead_loss": 7.544789179801941, + "loss": 5.5414, + "step": 439500 + }, + { + "epoch": 0.8392333984375, + "grad_norm": 32.34550476074219, + "learning_rate": 8.038425445556641e-06, + "lookahead_loss": 7.561338201522827, + "loss": 5.4864, + "step": 440000 + }, + { + "epoch": 0.8392333984375, + "eval_accuracy": 0.03775205479452055, + "eval_lookahead_loss": 7.521659079551696, + "eval_lookahead_perplexity": 1847.6301179642924, + "eval_loss": 5.300100803375244, + "eval_perplexity": 200.35700561930025, + "eval_runtime": 491.3673, + "eval_samples_per_second": 20.351, + "eval_steps_per_second": 5.088, + "step": 440000 + }, + { + "epoch": 0.8401870727539062, + "grad_norm": 34.53316879272461, + "learning_rate": 7.990741729736328e-06, + "lookahead_loss": 7.524707970619201, + "loss": 5.5045, + "step": 440500 + }, + { + "epoch": 0.8411407470703125, + "grad_norm": 33.41945266723633, + "learning_rate": 7.943058013916016e-06, + "lookahead_loss": 7.484915809631348, + "loss": 5.4705, + "step": 441000 + }, + { + "epoch": 0.8420944213867188, + "grad_norm": 61.11821746826172, + "learning_rate": 7.895374298095703e-06, + "lookahead_loss": 7.557993590354919, + "loss": 5.4386, + "step": 441500 + }, + { + "epoch": 0.843048095703125, + "grad_norm": 32.36595916748047, + "learning_rate": 7.847690582275392e-06, + "lookahead_loss": 7.540317601203919, + "loss": 5.4391, + "step": 442000 + }, + { + "epoch": 0.8440017700195312, + "grad_norm": 38.94442367553711, + "learning_rate": 7.800006866455079e-06, + "lookahead_loss": 7.491489647865295, + "loss": 5.4867, + "step": 442500 + }, + { + "epoch": 0.8449554443359375, + "grad_norm": 25.709918975830078, + "learning_rate": 7.752323150634765e-06, + "lookahead_loss": 7.582665386199952, + "loss": 5.4716, + "step": 443000 + }, + { + "epoch": 0.8459091186523438, + "grad_norm": 80.47025299072266, + "learning_rate": 7.704639434814454e-06, + "lookahead_loss": 7.541484993934631, + "loss": 5.45, + "step": 443500 + }, + { + "epoch": 0.84686279296875, + "grad_norm": 44.92289352416992, + "learning_rate": 7.65695571899414e-06, + "lookahead_loss": 7.519715372085571, + "loss": 5.4719, + "step": 444000 + }, + { + "epoch": 0.8478164672851562, + "grad_norm": 73.77596282958984, + "learning_rate": 7.6092720031738284e-06, + "lookahead_loss": 7.590097392082215, + "loss": 5.4404, + "step": 444500 + }, + { + "epoch": 0.8487701416015625, + "grad_norm": 26.172473907470703, + "learning_rate": 7.561588287353516e-06, + "lookahead_loss": 7.472916863441467, + "loss": 5.4697, + "step": 445000 + }, + { + "epoch": 0.8487701416015625, + "eval_accuracy": 0.03762093933463796, + "eval_lookahead_loss": 7.5183752172470095, + "eval_lookahead_perplexity": 1841.5727063631832, + "eval_loss": 5.2961297035217285, + "eval_perplexity": 199.56294563288517, + "eval_runtime": 491.7573, + "eval_samples_per_second": 20.335, + "eval_steps_per_second": 5.084, + "step": 445000 + }, + { + "epoch": 0.8497238159179688, + "grad_norm": 41.10604476928711, + "learning_rate": 7.513904571533204e-06, + "lookahead_loss": 7.5092562437057495, + "loss": 5.4768, + "step": 445500 + }, + { + "epoch": 0.850677490234375, + "grad_norm": 55.875091552734375, + "learning_rate": 7.466220855712891e-06, + "lookahead_loss": 7.518968954086303, + "loss": 5.464, + "step": 446000 + }, + { + "epoch": 0.8516311645507812, + "grad_norm": 36.81357955932617, + "learning_rate": 7.418537139892578e-06, + "lookahead_loss": 7.55088060092926, + "loss": 5.5146, + "step": 446500 + }, + { + "epoch": 0.8525848388671875, + "grad_norm": 25.811378479003906, + "learning_rate": 7.370853424072266e-06, + "lookahead_loss": 7.674611318588257, + "loss": 5.487, + "step": 447000 + }, + { + "epoch": 0.8535385131835938, + "grad_norm": 26.89521598815918, + "learning_rate": 7.323169708251954e-06, + "lookahead_loss": 7.5006145572662355, + "loss": 5.4803, + "step": 447500 + }, + { + "epoch": 0.8544921875, + "grad_norm": 27.983924865722656, + "learning_rate": 7.275485992431641e-06, + "lookahead_loss": 7.486395846366882, + "loss": 5.5033, + "step": 448000 + }, + { + "epoch": 0.8554458618164062, + "grad_norm": 42.874385833740234, + "learning_rate": 7.227802276611328e-06, + "lookahead_loss": 7.545176350593567, + "loss": 5.4469, + "step": 448500 + }, + { + "epoch": 0.8563995361328125, + "grad_norm": 30.358718872070312, + "learning_rate": 7.180118560791016e-06, + "lookahead_loss": 7.484733279228211, + "loss": 5.3838, + "step": 449000 + }, + { + "epoch": 0.8573532104492188, + "grad_norm": 64.88677215576172, + "learning_rate": 7.1324348449707034e-06, + "lookahead_loss": 7.580103507995606, + "loss": 5.4775, + "step": 449500 + }, + { + "epoch": 0.858306884765625, + "grad_norm": 55.03095626831055, + "learning_rate": 7.084751129150391e-06, + "lookahead_loss": 7.491296406745911, + "loss": 5.359, + "step": 450000 + }, + { + "epoch": 0.858306884765625, + "eval_accuracy": 0.03679471624266145, + "eval_lookahead_loss": 7.517461001396179, + "eval_lookahead_perplexity": 1839.8898807546996, + "eval_loss": 5.291573524475098, + "eval_perplexity": 198.65576931968488, + "eval_runtime": 492.9123, + "eval_samples_per_second": 20.288, + "eval_steps_per_second": 5.072, + "step": 450000 + }, + { + "epoch": 0.8592605590820312, + "grad_norm": 61.702518463134766, + "learning_rate": 7.037067413330079e-06, + "lookahead_loss": 7.619806599617005, + "loss": 5.4577, + "step": 450500 + }, + { + "epoch": 0.8602142333984375, + "grad_norm": 49.73261260986328, + "learning_rate": 6.989383697509766e-06, + "lookahead_loss": 7.455164489746093, + "loss": 5.3937, + "step": 451000 + }, + { + "epoch": 0.8611679077148438, + "grad_norm": 30.131038665771484, + "learning_rate": 6.941699981689453e-06, + "lookahead_loss": 7.603461454391479, + "loss": 5.4831, + "step": 451500 + }, + { + "epoch": 0.86212158203125, + "grad_norm": 20.577909469604492, + "learning_rate": 6.894016265869141e-06, + "lookahead_loss": 7.594459860801697, + "loss": 5.4679, + "step": 452000 + }, + { + "epoch": 0.8630752563476562, + "grad_norm": 44.1313362121582, + "learning_rate": 6.846332550048829e-06, + "lookahead_loss": 7.47231307220459, + "loss": 5.4158, + "step": 452500 + }, + { + "epoch": 0.8640289306640625, + "grad_norm": 15.460197448730469, + "learning_rate": 6.798648834228516e-06, + "lookahead_loss": 7.56699965763092, + "loss": 5.3858, + "step": 453000 + }, + { + "epoch": 0.8649826049804688, + "grad_norm": 48.268009185791016, + "learning_rate": 6.750965118408203e-06, + "lookahead_loss": 7.49468569278717, + "loss": 5.4155, + "step": 453500 + }, + { + "epoch": 0.865936279296875, + "grad_norm": 32.19114685058594, + "learning_rate": 6.703281402587891e-06, + "lookahead_loss": 7.541509713172912, + "loss": 5.4254, + "step": 454000 + }, + { + "epoch": 0.8668899536132812, + "grad_norm": 30.903470993041992, + "learning_rate": 6.6555976867675784e-06, + "lookahead_loss": 7.51051789188385, + "loss": 5.5172, + "step": 454500 + }, + { + "epoch": 0.8678436279296875, + "grad_norm": 24.93724822998047, + "learning_rate": 6.607913970947266e-06, + "lookahead_loss": 7.530254680633545, + "loss": 5.5236, + "step": 455000 + }, + { + "epoch": 0.8678436279296875, + "eval_accuracy": 0.03753189823874755, + "eval_lookahead_loss": 7.508671671295166, + "eval_lookahead_perplexity": 1823.789341371162, + "eval_loss": 5.279980182647705, + "eval_perplexity": 196.36598385935807, + "eval_runtime": 494.8357, + "eval_samples_per_second": 20.209, + "eval_steps_per_second": 5.052, + "step": 455000 + }, + { + "epoch": 0.8687973022460938, + "grad_norm": 40.866634368896484, + "learning_rate": 6.560230255126954e-06, + "lookahead_loss": 7.578203195571899, + "loss": 5.4462, + "step": 455500 + }, + { + "epoch": 0.8697509765625, + "grad_norm": 38.755191802978516, + "learning_rate": 6.512546539306641e-06, + "lookahead_loss": 7.570026656150818, + "loss": 5.5094, + "step": 456000 + }, + { + "epoch": 0.8707046508789062, + "grad_norm": 36.57961654663086, + "learning_rate": 6.464862823486328e-06, + "lookahead_loss": 7.466435073852539, + "loss": 5.4475, + "step": 456500 + }, + { + "epoch": 0.8716583251953125, + "grad_norm": 40.567874908447266, + "learning_rate": 6.417179107666016e-06, + "lookahead_loss": 7.592789623260498, + "loss": 5.4643, + "step": 457000 + }, + { + "epoch": 0.8726119995117188, + "grad_norm": 47.6295166015625, + "learning_rate": 6.369495391845704e-06, + "lookahead_loss": 7.501975998878479, + "loss": 5.436, + "step": 457500 + }, + { + "epoch": 0.873565673828125, + "grad_norm": 35.93545150756836, + "learning_rate": 6.321811676025391e-06, + "lookahead_loss": 7.541886796951294, + "loss": 5.471, + "step": 458000 + }, + { + "epoch": 0.8745193481445312, + "grad_norm": 62.05765151977539, + "learning_rate": 6.274127960205078e-06, + "lookahead_loss": 7.559152512550354, + "loss": 5.4357, + "step": 458500 + }, + { + "epoch": 0.8754730224609375, + "grad_norm": 27.97027587890625, + "learning_rate": 6.226444244384766e-06, + "lookahead_loss": 7.617894318580627, + "loss": 5.484, + "step": 459000 + }, + { + "epoch": 0.8764266967773438, + "grad_norm": 38.51636505126953, + "learning_rate": 6.1787605285644534e-06, + "lookahead_loss": 7.5432721681594845, + "loss": 5.4131, + "step": 459500 + }, + { + "epoch": 0.87738037109375, + "grad_norm": 59.3133544921875, + "learning_rate": 6.131076812744141e-06, + "lookahead_loss": 7.525468029022217, + "loss": 5.4811, + "step": 460000 + }, + { + "epoch": 0.87738037109375, + "eval_accuracy": 0.03695616438356164, + "eval_lookahead_loss": 7.5068388912200925, + "eval_lookahead_perplexity": 1820.4497978643824, + "eval_loss": 5.275722980499268, + "eval_perplexity": 195.5317910945001, + "eval_runtime": 498.6588, + "eval_samples_per_second": 20.054, + "eval_steps_per_second": 5.013, + "step": 460000 + }, + { + "epoch": 0.8783340454101562, + "grad_norm": 36.35504150390625, + "learning_rate": 6.083393096923829e-06, + "lookahead_loss": 7.449315311431885, + "loss": 5.4553, + "step": 460500 + }, + { + "epoch": 0.8792877197265625, + "grad_norm": 23.547718048095703, + "learning_rate": 6.035709381103516e-06, + "lookahead_loss": 7.546644331932068, + "loss": 5.4658, + "step": 461000 + }, + { + "epoch": 0.8802413940429688, + "grad_norm": 41.75696563720703, + "learning_rate": 5.988025665283203e-06, + "lookahead_loss": 7.500026070594788, + "loss": 5.4131, + "step": 461500 + }, + { + "epoch": 0.881195068359375, + "grad_norm": 49.44127655029297, + "learning_rate": 5.940341949462891e-06, + "lookahead_loss": 7.574215328216552, + "loss": 5.4648, + "step": 462000 + }, + { + "epoch": 0.8821487426757812, + "grad_norm": 28.730375289916992, + "learning_rate": 5.892658233642579e-06, + "lookahead_loss": 7.558611562728882, + "loss": 5.4209, + "step": 462500 + }, + { + "epoch": 0.8831024169921875, + "grad_norm": 106.4295654296875, + "learning_rate": 5.844974517822266e-06, + "lookahead_loss": 7.569236333847046, + "loss": 5.4749, + "step": 463000 + }, + { + "epoch": 0.8840560913085938, + "grad_norm": 15.430024147033691, + "learning_rate": 5.797290802001953e-06, + "lookahead_loss": 7.467174778938293, + "loss": 5.4545, + "step": 463500 + }, + { + "epoch": 0.885009765625, + "grad_norm": 34.30374526977539, + "learning_rate": 5.749607086181641e-06, + "lookahead_loss": 7.567559573173523, + "loss": 5.4509, + "step": 464000 + }, + { + "epoch": 0.8859634399414062, + "grad_norm": 31.565866470336914, + "learning_rate": 5.7019233703613284e-06, + "lookahead_loss": 7.504168871879577, + "loss": 5.4369, + "step": 464500 + }, + { + "epoch": 0.8869171142578125, + "grad_norm": 22.142364501953125, + "learning_rate": 5.654239654541016e-06, + "lookahead_loss": 7.521105673789978, + "loss": 5.4134, + "step": 465000 + }, + { + "epoch": 0.8869171142578125, + "eval_accuracy": 0.03697436399217221, + "eval_lookahead_loss": 7.502119409370422, + "eval_lookahead_perplexity": 1811.8784601310306, + "eval_loss": 5.267831325531006, + "eval_perplexity": 193.99479436457258, + "eval_runtime": 494.4776, + "eval_samples_per_second": 20.223, + "eval_steps_per_second": 5.056, + "step": 465000 + }, + { + "epoch": 0.8878707885742188, + "grad_norm": 39.15439987182617, + "learning_rate": 5.606555938720704e-06, + "lookahead_loss": 7.514542613983155, + "loss": 5.4285, + "step": 465500 + }, + { + "epoch": 0.888824462890625, + "grad_norm": 28.45076560974121, + "learning_rate": 5.558872222900391e-06, + "lookahead_loss": 7.566133080482483, + "loss": 5.4338, + "step": 466000 + }, + { + "epoch": 0.8897781372070312, + "grad_norm": 32.75477981567383, + "learning_rate": 5.511188507080078e-06, + "lookahead_loss": 7.589635190010071, + "loss": 5.3926, + "step": 466500 + }, + { + "epoch": 0.8907318115234375, + "grad_norm": 34.84766387939453, + "learning_rate": 5.463504791259766e-06, + "lookahead_loss": 7.646199013710022, + "loss": 5.4259, + "step": 467000 + }, + { + "epoch": 0.8916854858398438, + "grad_norm": 33.61738204956055, + "learning_rate": 5.415821075439454e-06, + "lookahead_loss": 7.569194613456726, + "loss": 5.4137, + "step": 467500 + }, + { + "epoch": 0.89263916015625, + "grad_norm": 36.55641174316406, + "learning_rate": 5.368137359619141e-06, + "lookahead_loss": 7.553561694145203, + "loss": 5.4298, + "step": 468000 + }, + { + "epoch": 0.8935928344726562, + "grad_norm": 104.22590637207031, + "learning_rate": 5.320453643798828e-06, + "lookahead_loss": 7.547387008666992, + "loss": 5.4189, + "step": 468500 + }, + { + "epoch": 0.8945465087890625, + "grad_norm": 43.61272430419922, + "learning_rate": 5.272769927978516e-06, + "lookahead_loss": 7.5143665523529055, + "loss": 5.438, + "step": 469000 + }, + { + "epoch": 0.8955001831054688, + "grad_norm": 24.442922592163086, + "learning_rate": 5.2250862121582034e-06, + "lookahead_loss": 7.433665193557739, + "loss": 5.3989, + "step": 469500 + }, + { + "epoch": 0.896453857421875, + "grad_norm": 61.167510986328125, + "learning_rate": 5.177402496337891e-06, + "lookahead_loss": 7.620127082824707, + "loss": 5.4297, + "step": 470000 + }, + { + "epoch": 0.896453857421875, + "eval_accuracy": 0.03718493150684932, + "eval_lookahead_loss": 7.506443597984314, + "eval_lookahead_perplexity": 1819.7303285832502, + "eval_loss": 5.262168884277344, + "eval_perplexity": 192.89941442702468, + "eval_runtime": 491.9141, + "eval_samples_per_second": 20.329, + "eval_steps_per_second": 5.082, + "step": 470000 + }, + { + "epoch": 0.8974075317382812, + "grad_norm": 35.60335159301758, + "learning_rate": 5.129718780517579e-06, + "lookahead_loss": 7.447581491470337, + "loss": 5.404, + "step": 470500 + }, + { + "epoch": 0.8983612060546875, + "grad_norm": 21.729867935180664, + "learning_rate": 5.082035064697266e-06, + "lookahead_loss": 7.580970271110535, + "loss": 5.5061, + "step": 471000 + }, + { + "epoch": 0.8993148803710938, + "grad_norm": 46.88615798950195, + "learning_rate": 5.034351348876953e-06, + "lookahead_loss": 7.541093637466431, + "loss": 5.5533, + "step": 471500 + }, + { + "epoch": 0.9002685546875, + "grad_norm": 33.94005584716797, + "learning_rate": 4.986667633056641e-06, + "lookahead_loss": 7.514125886917114, + "loss": 5.4906, + "step": 472000 + }, + { + "epoch": 0.9012222290039062, + "grad_norm": 42.91880416870117, + "learning_rate": 4.938983917236329e-06, + "lookahead_loss": 7.502145031452179, + "loss": 5.4652, + "step": 472500 + }, + { + "epoch": 0.9021759033203125, + "grad_norm": 32.76023483276367, + "learning_rate": 4.891300201416016e-06, + "lookahead_loss": 7.585329412460327, + "loss": 5.4206, + "step": 473000 + }, + { + "epoch": 0.9031295776367188, + "grad_norm": 28.57848358154297, + "learning_rate": 4.843616485595703e-06, + "lookahead_loss": 7.504659191131592, + "loss": 5.4423, + "step": 473500 + }, + { + "epoch": 0.904083251953125, + "grad_norm": 38.453346252441406, + "learning_rate": 4.795932769775391e-06, + "lookahead_loss": 7.5887782773971555, + "loss": 5.4707, + "step": 474000 + }, + { + "epoch": 0.9050369262695312, + "grad_norm": 22.19831657409668, + "learning_rate": 4.7482490539550784e-06, + "lookahead_loss": 7.597484665870667, + "loss": 5.4324, + "step": 474500 + }, + { + "epoch": 0.9059906005859375, + "grad_norm": 21.222064971923828, + "learning_rate": 4.700565338134766e-06, + "lookahead_loss": 7.61494041633606, + "loss": 5.4657, + "step": 475000 + }, + { + "epoch": 0.9059906005859375, + "eval_accuracy": 0.03723581213307241, + "eval_lookahead_loss": 7.494832308387756, + "eval_lookahead_perplexity": 1798.7231092233949, + "eval_loss": 5.257956504821777, + "eval_perplexity": 192.08855791335694, + "eval_runtime": 495.0323, + "eval_samples_per_second": 20.201, + "eval_steps_per_second": 5.05, + "step": 475000 + }, + { + "epoch": 0.9069442749023438, + "grad_norm": 85.29813385009766, + "learning_rate": 4.652881622314453e-06, + "lookahead_loss": 7.477985557556153, + "loss": 5.4479, + "step": 475500 + }, + { + "epoch": 0.90789794921875, + "grad_norm": 34.99476623535156, + "learning_rate": 4.605197906494141e-06, + "lookahead_loss": 7.607454412460327, + "loss": 5.453, + "step": 476000 + }, + { + "epoch": 0.9088516235351562, + "grad_norm": 18.25977897644043, + "learning_rate": 4.557514190673828e-06, + "lookahead_loss": 7.474591215133667, + "loss": 5.4271, + "step": 476500 + }, + { + "epoch": 0.9098052978515625, + "grad_norm": 32.73667526245117, + "learning_rate": 4.509830474853516e-06, + "lookahead_loss": 7.4408266744613645, + "loss": 5.4273, + "step": 477000 + }, + { + "epoch": 0.9107589721679688, + "grad_norm": 44.201377868652344, + "learning_rate": 4.462146759033204e-06, + "lookahead_loss": 7.452820043563843, + "loss": 5.3904, + "step": 477500 + }, + { + "epoch": 0.911712646484375, + "grad_norm": 44.65700149536133, + "learning_rate": 4.4144630432128904e-06, + "lookahead_loss": 7.599451966285706, + "loss": 5.4603, + "step": 478000 + }, + { + "epoch": 0.9126663208007812, + "grad_norm": 41.947044372558594, + "learning_rate": 4.366779327392578e-06, + "lookahead_loss": 7.565536955833435, + "loss": 5.4336, + "step": 478500 + }, + { + "epoch": 0.9136199951171875, + "grad_norm": 43.33025360107422, + "learning_rate": 4.319095611572266e-06, + "lookahead_loss": 7.59088079738617, + "loss": 5.4484, + "step": 479000 + }, + { + "epoch": 0.9145736694335938, + "grad_norm": 24.180416107177734, + "learning_rate": 4.2714118957519534e-06, + "lookahead_loss": 7.605153242111206, + "loss": 5.4574, + "step": 479500 + }, + { + "epoch": 0.91552734375, + "grad_norm": 33.48406219482422, + "learning_rate": 4.223728179931641e-06, + "lookahead_loss": 7.407353488922119, + "loss": 5.398, + "step": 480000 + }, + { + "epoch": 0.91552734375, + "eval_accuracy": 0.03678160469667319, + "eval_lookahead_loss": 7.496743685531616, + "eval_lookahead_perplexity": 1802.1644352507528, + "eval_loss": 5.2516279220581055, + "eval_perplexity": 190.87674814037828, + "eval_runtime": 493.0015, + "eval_samples_per_second": 20.284, + "eval_steps_per_second": 5.071, + "step": 480000 + }, + { + "epoch": 0.9164810180664062, + "grad_norm": 70.6424560546875, + "learning_rate": 4.176044464111328e-06, + "lookahead_loss": 7.515648212432861, + "loss": 5.4131, + "step": 480500 + }, + { + "epoch": 0.9174346923828125, + "grad_norm": 33.325355529785156, + "learning_rate": 4.128360748291016e-06, + "lookahead_loss": 7.547232943534851, + "loss": 5.425, + "step": 481000 + }, + { + "epoch": 0.9183883666992188, + "grad_norm": 34.30824279785156, + "learning_rate": 4.080677032470703e-06, + "lookahead_loss": 7.502337241172791, + "loss": 5.4104, + "step": 481500 + }, + { + "epoch": 0.919342041015625, + "grad_norm": 75.03662872314453, + "learning_rate": 4.032993316650391e-06, + "lookahead_loss": 7.536139481544494, + "loss": 5.4208, + "step": 482000 + }, + { + "epoch": 0.9202957153320312, + "grad_norm": 73.71026611328125, + "learning_rate": 3.985309600830079e-06, + "lookahead_loss": 7.607461035728455, + "loss": 5.4449, + "step": 482500 + }, + { + "epoch": 0.9212493896484375, + "grad_norm": 34.28474807739258, + "learning_rate": 3.9376258850097654e-06, + "lookahead_loss": 7.571555335998535, + "loss": 5.4345, + "step": 483000 + }, + { + "epoch": 0.9222030639648438, + "grad_norm": 34.20186233520508, + "learning_rate": 3.889942169189453e-06, + "lookahead_loss": 7.516776182174683, + "loss": 5.4017, + "step": 483500 + }, + { + "epoch": 0.92315673828125, + "grad_norm": 36.054019927978516, + "learning_rate": 3.842258453369141e-06, + "lookahead_loss": 7.516911548614502, + "loss": 5.4503, + "step": 484000 + }, + { + "epoch": 0.9241104125976562, + "grad_norm": 31.07468605041504, + "learning_rate": 3.7945747375488284e-06, + "lookahead_loss": 7.449149313926696, + "loss": 5.3935, + "step": 484500 + }, + { + "epoch": 0.9250640869140625, + "grad_norm": 29.78468132019043, + "learning_rate": 3.7468910217285157e-06, + "lookahead_loss": 7.452660127639771, + "loss": 5.4084, + "step": 485000 + }, + { + "epoch": 0.9250640869140625, + "eval_accuracy": 0.03673679060665362, + "eval_lookahead_loss": 7.497453893852234, + "eval_lookahead_perplexity": 1803.4448020375453, + "eval_loss": 5.247228145599365, + "eval_perplexity": 190.03877791002319, + "eval_runtime": 491.0733, + "eval_samples_per_second": 20.364, + "eval_steps_per_second": 5.091, + "step": 485000 + }, + { + "epoch": 0.9260177612304688, + "grad_norm": 28.61113166809082, + "learning_rate": 3.6992073059082034e-06, + "lookahead_loss": 7.474437353134156, + "loss": 5.3534, + "step": 485500 + }, + { + "epoch": 0.926971435546875, + "grad_norm": 87.26258850097656, + "learning_rate": 3.6515235900878906e-06, + "lookahead_loss": 7.461425477027893, + "loss": 5.3503, + "step": 486000 + }, + { + "epoch": 0.9279251098632812, + "grad_norm": 38.279319763183594, + "learning_rate": 3.6038398742675783e-06, + "lookahead_loss": 7.398401821136475, + "loss": 5.3726, + "step": 486500 + }, + { + "epoch": 0.9288787841796875, + "grad_norm": 51.003173828125, + "learning_rate": 3.556156158447266e-06, + "lookahead_loss": 7.434428665161133, + "loss": 5.415, + "step": 487000 + }, + { + "epoch": 0.9298324584960938, + "grad_norm": 35.338523864746094, + "learning_rate": 3.508472442626953e-06, + "lookahead_loss": 7.5000336036682125, + "loss": 5.4204, + "step": 487500 + }, + { + "epoch": 0.9307861328125, + "grad_norm": 22.86884307861328, + "learning_rate": 3.460788726806641e-06, + "lookahead_loss": 7.490082403182983, + "loss": 5.5005, + "step": 488000 + }, + { + "epoch": 0.9317398071289062, + "grad_norm": 21.055870056152344, + "learning_rate": 3.413105010986328e-06, + "lookahead_loss": 7.511719326019287, + "loss": 5.5178, + "step": 488500 + }, + { + "epoch": 0.9326934814453125, + "grad_norm": 22.436960220336914, + "learning_rate": 3.3654212951660158e-06, + "lookahead_loss": 7.454348183631897, + "loss": 5.4314, + "step": 489000 + }, + { + "epoch": 0.9336471557617188, + "grad_norm": 27.64606285095215, + "learning_rate": 3.3177375793457034e-06, + "lookahead_loss": 7.546735237121582, + "loss": 5.4742, + "step": 489500 + }, + { + "epoch": 0.934600830078125, + "grad_norm": 29.162399291992188, + "learning_rate": 3.2700538635253907e-06, + "lookahead_loss": 7.569887726783753, + "loss": 5.4512, + "step": 490000 + }, + { + "epoch": 0.934600830078125, + "eval_accuracy": 0.03672348336594912, + "eval_lookahead_loss": 7.493668005943299, + "eval_lookahead_perplexity": 1796.6300702120866, + "eval_loss": 5.243106365203857, + "eval_perplexity": 189.25709187660786, + "eval_runtime": 493.1027, + "eval_samples_per_second": 20.28, + "eval_steps_per_second": 5.07, + "step": 490000 + }, + { + "epoch": 0.9355545043945312, + "grad_norm": 26.6416015625, + "learning_rate": 3.2223701477050784e-06, + "lookahead_loss": 7.57770721244812, + "loss": 5.4398, + "step": 490500 + }, + { + "epoch": 0.9365081787109375, + "grad_norm": 32.928955078125, + "learning_rate": 3.1746864318847656e-06, + "lookahead_loss": 7.534963851928711, + "loss": 5.4766, + "step": 491000 + }, + { + "epoch": 0.9374618530273438, + "grad_norm": 63.6113166809082, + "learning_rate": 3.1270027160644533e-06, + "lookahead_loss": 7.60041755104065, + "loss": 5.4454, + "step": 491500 + }, + { + "epoch": 0.93841552734375, + "grad_norm": 32.56538772583008, + "learning_rate": 3.079319000244141e-06, + "lookahead_loss": 7.524976124763489, + "loss": 5.434, + "step": 492000 + }, + { + "epoch": 0.9393692016601562, + "grad_norm": 48.78775405883789, + "learning_rate": 3.031635284423828e-06, + "lookahead_loss": 7.691059616088867, + "loss": 5.5711, + "step": 492500 + }, + { + "epoch": 0.9403228759765625, + "grad_norm": 42.24482727050781, + "learning_rate": 2.983951568603516e-06, + "lookahead_loss": 7.522299768447876, + "loss": 5.4212, + "step": 493000 + }, + { + "epoch": 0.9412765502929688, + "grad_norm": 34.92015075683594, + "learning_rate": 2.936267852783203e-06, + "lookahead_loss": 7.545178604125977, + "loss": 5.4312, + "step": 493500 + }, + { + "epoch": 0.942230224609375, + "grad_norm": 29.295696258544922, + "learning_rate": 2.8885841369628908e-06, + "lookahead_loss": 7.551432324409485, + "loss": 5.4316, + "step": 494000 + }, + { + "epoch": 0.9431838989257812, + "grad_norm": 78.8042984008789, + "learning_rate": 2.8409004211425784e-06, + "lookahead_loss": 7.562519548416137, + "loss": 5.4278, + "step": 494500 + }, + { + "epoch": 0.9441375732421875, + "grad_norm": 31.917383193969727, + "learning_rate": 2.7932167053222657e-06, + "lookahead_loss": 7.573059289932251, + "loss": 5.42, + "step": 495000 + }, + { + "epoch": 0.9441375732421875, + "eval_accuracy": 0.0366426614481409, + "eval_lookahead_loss": 7.49218367061615, + "eval_lookahead_perplexity": 1793.96524696397, + "eval_loss": 5.239822864532471, + "eval_perplexity": 188.6366851986412, + "eval_runtime": 488.9175, + "eval_samples_per_second": 20.453, + "eval_steps_per_second": 5.113, + "step": 495000 + }, + { + "epoch": 0.9450912475585938, + "grad_norm": 29.023418426513672, + "learning_rate": 2.7455329895019534e-06, + "lookahead_loss": 7.451029644966125, + "loss": 5.4249, + "step": 495500 + }, + { + "epoch": 0.946044921875, + "grad_norm": 48.77956771850586, + "learning_rate": 2.6978492736816406e-06, + "lookahead_loss": 7.429635667800904, + "loss": 5.3853, + "step": 496000 + }, + { + "epoch": 0.9469985961914062, + "grad_norm": 29.84053611755371, + "learning_rate": 2.6501655578613283e-06, + "lookahead_loss": 7.442624586105347, + "loss": 5.3423, + "step": 496500 + }, + { + "epoch": 0.9479522705078125, + "grad_norm": 47.194725036621094, + "learning_rate": 2.602481842041016e-06, + "lookahead_loss": 7.546961915016174, + "loss": 5.4146, + "step": 497000 + }, + { + "epoch": 0.9489059448242188, + "grad_norm": 82.2673568725586, + "learning_rate": 2.554798126220703e-06, + "lookahead_loss": 7.4985938720703125, + "loss": 5.3538, + "step": 497500 + }, + { + "epoch": 0.949859619140625, + "grad_norm": 27.915851593017578, + "learning_rate": 2.507114410400391e-06, + "lookahead_loss": 7.63710261631012, + "loss": 5.4079, + "step": 498000 + }, + { + "epoch": 0.9508132934570312, + "grad_norm": 24.54716682434082, + "learning_rate": 2.459430694580078e-06, + "lookahead_loss": 7.6107233562469485, + "loss": 5.4472, + "step": 498500 + }, + { + "epoch": 0.9517669677734375, + "grad_norm": 34.820865631103516, + "learning_rate": 2.4117469787597658e-06, + "lookahead_loss": 7.575458500862122, + "loss": 5.4205, + "step": 499000 + }, + { + "epoch": 0.9527206420898438, + "grad_norm": 70.72277069091797, + "learning_rate": 2.3640632629394534e-06, + "lookahead_loss": 7.434024419784546, + "loss": 5.4289, + "step": 499500 + }, + { + "epoch": 0.95367431640625, + "grad_norm": 35.12321853637695, + "learning_rate": 2.3163795471191407e-06, + "lookahead_loss": 7.5704899559021, + "loss": 5.4164, + "step": 500000 + }, + { + "epoch": 0.95367431640625, + "eval_accuracy": 0.0364545988258317, + "eval_lookahead_loss": 7.491130564880371, + "eval_lookahead_perplexity": 1792.077006305648, + "eval_loss": 5.235738754272461, + "eval_perplexity": 187.8678432635122, + "eval_runtime": 488.8948, + "eval_samples_per_second": 20.454, + "eval_steps_per_second": 5.114, + "step": 500000 + }, + { + "epoch": 1.0009536743164062, + "grad_norm": 35.58665466308594, + "learning_rate": 2.2686958312988284e-06, + "lookahead_loss": 7.5466746816635135, + "loss": 5.3962, + "step": 500500 + }, + { + "epoch": 1.0019073486328125, + "grad_norm": 45.4125862121582, + "learning_rate": 2.2210121154785156e-06, + "lookahead_loss": 7.599951001167297, + "loss": 5.392, + "step": 501000 + }, + { + "epoch": 1.0028610229492188, + "grad_norm": 42.52923583984375, + "learning_rate": 2.1733283996582033e-06, + "lookahead_loss": 7.549340567588806, + "loss": 5.3911, + "step": 501500 + }, + { + "epoch": 1.003814697265625, + "grad_norm": 24.68462562561035, + "learning_rate": 2.125644683837891e-06, + "lookahead_loss": 7.565037198066712, + "loss": 5.4362, + "step": 502000 + }, + { + "epoch": 1.0047683715820312, + "grad_norm": 25.820404052734375, + "learning_rate": 2.077960968017578e-06, + "lookahead_loss": 7.535558586120605, + "loss": 5.3996, + "step": 502500 + }, + { + "epoch": 1.0057220458984375, + "grad_norm": 23.42056655883789, + "learning_rate": 2.030277252197266e-06, + "lookahead_loss": 7.595151142120361, + "loss": 5.4258, + "step": 503000 + }, + { + "epoch": 1.0066757202148438, + "grad_norm": 40.301612854003906, + "learning_rate": 1.982593536376953e-06, + "lookahead_loss": 7.435441981315613, + "loss": 5.3966, + "step": 503500 + }, + { + "epoch": 1.00762939453125, + "grad_norm": 40.54277038574219, + "learning_rate": 1.9349098205566408e-06, + "lookahead_loss": 7.619449039459228, + "loss": 5.3941, + "step": 504000 + }, + { + "epoch": 1.0085830688476562, + "grad_norm": 41.602630615234375, + "learning_rate": 1.8872261047363282e-06, + "lookahead_loss": 7.436513647079468, + "loss": 5.3498, + "step": 504500 + }, + { + "epoch": 1.0095367431640625, + "grad_norm": 20.46828269958496, + "learning_rate": 1.8395423889160157e-06, + "lookahead_loss": 7.476216523170471, + "loss": 5.3912, + "step": 505000 + }, + { + "epoch": 1.0095367431640625, + "eval_accuracy": 0.03651839530332681, + "eval_lookahead_loss": 7.4885780134201045, + "eval_lookahead_perplexity": 1787.5084707179167, + "eval_loss": 5.232870101928711, + "eval_perplexity": 187.32968799385426, + "eval_runtime": 489.4338, + "eval_samples_per_second": 20.432, + "eval_steps_per_second": 5.108, + "step": 505000 + }, + { + "epoch": 1.0104904174804688, + "grad_norm": 59.87038040161133, + "learning_rate": 1.7918586730957031e-06, + "lookahead_loss": 7.500508081436157, + "loss": 5.3877, + "step": 505500 + }, + { + "epoch": 1.011444091796875, + "grad_norm": 47.217063903808594, + "learning_rate": 1.7441749572753908e-06, + "lookahead_loss": 7.522549951553345, + "loss": 5.4209, + "step": 506000 + }, + { + "epoch": 1.0123977661132812, + "grad_norm": 24.179237365722656, + "learning_rate": 1.6964912414550783e-06, + "lookahead_loss": 7.516633040428162, + "loss": 5.3795, + "step": 506500 + }, + { + "epoch": 1.0133514404296875, + "grad_norm": 23.66938018798828, + "learning_rate": 1.6488075256347657e-06, + "lookahead_loss": 7.492961671829224, + "loss": 5.3975, + "step": 507000 + }, + { + "epoch": 1.0143051147460938, + "grad_norm": 40.03607940673828, + "learning_rate": 1.6011238098144532e-06, + "lookahead_loss": 7.508479064941406, + "loss": 5.4027, + "step": 507500 + }, + { + "epoch": 1.0152587890625, + "grad_norm": 24.287736892700195, + "learning_rate": 1.5534400939941406e-06, + "lookahead_loss": 7.48128812456131, + "loss": 5.4076, + "step": 508000 + }, + { + "epoch": 1.0162124633789062, + "grad_norm": 28.126432418823242, + "learning_rate": 1.505756378173828e-06, + "lookahead_loss": 7.46256822681427, + "loss": 5.3622, + "step": 508500 + }, + { + "epoch": 1.0171661376953125, + "grad_norm": 27.604841232299805, + "learning_rate": 1.4580726623535158e-06, + "lookahead_loss": 7.507462024688721, + "loss": 5.3525, + "step": 509000 + }, + { + "epoch": 1.0181198120117188, + "grad_norm": 20.397815704345703, + "learning_rate": 1.4103889465332032e-06, + "lookahead_loss": 7.508451904773712, + "loss": 5.3677, + "step": 509500 + }, + { + "epoch": 1.019073486328125, + "grad_norm": 27.886219024658203, + "learning_rate": 1.3627052307128907e-06, + "lookahead_loss": 7.426554723739624, + "loss": 5.3829, + "step": 510000 + }, + { + "epoch": 1.019073486328125, + "eval_accuracy": 0.03655225048923679, + "eval_lookahead_loss": 7.487279405403137, + "eval_lookahead_perplexity": 1785.1887044469845, + "eval_loss": 5.229686737060547, + "eval_perplexity": 186.73429742184328, + "eval_runtime": 489.1008, + "eval_samples_per_second": 20.446, + "eval_steps_per_second": 5.111, + "step": 510000 + }, + { + "epoch": 1.0200271606445312, + "grad_norm": 19.291778564453125, + "learning_rate": 1.3150215148925781e-06, + "lookahead_loss": 7.471041418075561, + "loss": 5.3432, + "step": 510500 + }, + { + "epoch": 1.0209808349609375, + "grad_norm": 25.789371490478516, + "learning_rate": 1.2673377990722656e-06, + "lookahead_loss": 7.5697849712371825, + "loss": 5.379, + "step": 511000 + }, + { + "epoch": 1.0219345092773438, + "grad_norm": 18.22418975830078, + "learning_rate": 1.2196540832519533e-06, + "lookahead_loss": 7.417507988929748, + "loss": 5.4105, + "step": 511500 + }, + { + "epoch": 1.02288818359375, + "grad_norm": 19.042499542236328, + "learning_rate": 1.1719703674316407e-06, + "lookahead_loss": 7.568157598495484, + "loss": 5.4634, + "step": 512000 + }, + { + "epoch": 1.0238418579101562, + "grad_norm": 20.01932144165039, + "learning_rate": 1.1242866516113282e-06, + "lookahead_loss": 7.449432670593262, + "loss": 5.4228, + "step": 512500 + }, + { + "epoch": 1.0247955322265625, + "grad_norm": 27.379854202270508, + "learning_rate": 1.0766029357910156e-06, + "lookahead_loss": 7.561641505241394, + "loss": 5.4507, + "step": 513000 + }, + { + "epoch": 1.0257492065429688, + "grad_norm": 12.885735511779785, + "learning_rate": 1.028919219970703e-06, + "lookahead_loss": 7.499589974403381, + "loss": 5.3857, + "step": 513500 + }, + { + "epoch": 1.026702880859375, + "grad_norm": 24.73870086669922, + "learning_rate": 9.812355041503908e-07, + "lookahead_loss": 7.557055051803589, + "loss": 5.3787, + "step": 514000 + }, + { + "epoch": 1.0276565551757812, + "grad_norm": 44.89480972290039, + "learning_rate": 9.335517883300781e-07, + "lookahead_loss": 7.500895258903504, + "loss": 5.4148, + "step": 514500 + }, + { + "epoch": 1.0286102294921875, + "grad_norm": 26.01154899597168, + "learning_rate": 8.858680725097657e-07, + "lookahead_loss": 7.522830172538757, + "loss": 5.4174, + "step": 515000 + }, + { + "epoch": 1.0286102294921875, + "eval_accuracy": 0.03652622309197651, + "eval_lookahead_loss": 7.487877742385864, + "eval_lookahead_perplexity": 1786.2571684889008, + "eval_loss": 5.228353023529053, + "eval_perplexity": 186.48541336948466, + "eval_runtime": 487.6374, + "eval_samples_per_second": 20.507, + "eval_steps_per_second": 5.127, + "step": 515000 + }, + { + "epoch": 1.0295639038085938, + "grad_norm": 37.785377502441406, + "learning_rate": 8.381843566894531e-07, + "lookahead_loss": 7.5562867755889895, + "loss": 5.4452, + "step": 515500 + }, + { + "epoch": 1.030517578125, + "grad_norm": 25.26235580444336, + "learning_rate": 7.905006408691407e-07, + "lookahead_loss": 7.505988377571106, + "loss": 5.3892, + "step": 516000 + }, + { + "epoch": 1.0314712524414062, + "grad_norm": 20.39120864868164, + "learning_rate": 7.428169250488282e-07, + "lookahead_loss": 7.479778280258179, + "loss": 5.3975, + "step": 516500 + }, + { + "epoch": 1.0324249267578125, + "grad_norm": 27.12841033935547, + "learning_rate": 6.951332092285156e-07, + "lookahead_loss": 7.529824728965759, + "loss": 5.3681, + "step": 517000 + }, + { + "epoch": 1.0333786010742188, + "grad_norm": 29.57392120361328, + "learning_rate": 6.474494934082032e-07, + "lookahead_loss": 7.479436259269715, + "loss": 5.3985, + "step": 517500 + }, + { + "epoch": 1.034332275390625, + "grad_norm": 29.044187545776367, + "learning_rate": 5.997657775878906e-07, + "lookahead_loss": 7.565614990234375, + "loss": 5.4554, + "step": 518000 + }, + { + "epoch": 1.0352859497070312, + "grad_norm": 47.46847152709961, + "learning_rate": 5.520820617675782e-07, + "lookahead_loss": 7.505055674552917, + "loss": 5.3955, + "step": 518500 + }, + { + "epoch": 1.0362396240234375, + "grad_norm": 32.61491775512695, + "learning_rate": 5.043983459472657e-07, + "lookahead_loss": 7.5670665264129635, + "loss": 5.3999, + "step": 519000 + }, + { + "epoch": 1.0371932983398438, + "grad_norm": 47.412071228027344, + "learning_rate": 4.5671463012695317e-07, + "lookahead_loss": 7.5038084897995, + "loss": 5.3904, + "step": 519500 + }, + { + "epoch": 1.03814697265625, + "grad_norm": 44.64884567260742, + "learning_rate": 4.0903091430664063e-07, + "lookahead_loss": 7.536215165138245, + "loss": 5.403, + "step": 520000 + }, + { + "epoch": 1.03814697265625, + "eval_accuracy": 0.03652915851272016, + "eval_lookahead_loss": 7.48764083328247, + "eval_lookahead_perplexity": 1785.834038028392, + "eval_loss": 5.227207660675049, + "eval_perplexity": 186.27194217852093, + "eval_runtime": 490.7459, + "eval_samples_per_second": 20.377, + "eval_steps_per_second": 5.094, + "step": 520000 + }, + { + "epoch": 1.0391006469726562, + "grad_norm": 38.563106536865234, + "learning_rate": 3.6134719848632814e-07, + "lookahead_loss": 7.430149503707886, + "loss": 5.4272, + "step": 520500 + }, + { + "epoch": 1.0400543212890625, + "grad_norm": 26.263988494873047, + "learning_rate": 3.1366348266601565e-07, + "lookahead_loss": 7.604461443901062, + "loss": 5.4281, + "step": 521000 + }, + { + "epoch": 1.0410079956054688, + "grad_norm": 42.95314407348633, + "learning_rate": 2.6597976684570316e-07, + "lookahead_loss": 7.474581465721131, + "loss": 5.4112, + "step": 521500 + }, + { + "epoch": 1.041961669921875, + "grad_norm": 25.969568252563477, + "learning_rate": 2.1829605102539064e-07, + "lookahead_loss": 7.446981365680695, + "loss": 5.4104, + "step": 522000 + }, + { + "epoch": 1.0429153442382812, + "grad_norm": 22.706666946411133, + "learning_rate": 1.7061233520507813e-07, + "lookahead_loss": 7.569016202926636, + "loss": 5.3992, + "step": 522500 + }, + { + "epoch": 1.0438690185546875, + "grad_norm": 26.76810646057129, + "learning_rate": 1.2292861938476564e-07, + "lookahead_loss": 7.560819695472717, + "loss": 5.3885, + "step": 523000 + }, + { + "epoch": 1.0448226928710938, + "grad_norm": 42.19392395019531, + "learning_rate": 7.524490356445312e-08, + "lookahead_loss": 7.508039583206177, + "loss": 5.3889, + "step": 523500 + }, + { + "epoch": 1.0457763671875, + "grad_norm": 22.84039306640625, + "learning_rate": 2.7561187744140627e-08, + "lookahead_loss": 7.458563389778138, + "loss": 5.4373, + "step": 524000 + }, + { + "epoch": 1.04632568359375, + "step": 524288, + "total_flos": 4.833448717656785e+18, + "train_loss": 1.2404409057926387, + "train_runtime": 78461.8603, + "train_samples_per_second": 26.728, + "train_steps_per_second": 6.682 + } + ], + "logging_steps": 500, + "max_steps": 524288, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.833448717656785e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}