| { |
| "best_global_step": null, |
| "best_metric": 4.092768669128418, |
| "best_model_checkpoint": null, |
| "epoch": 1.04632568359375, |
| "eval_steps": 5000, |
| "global_step": 524288, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00095367431640625, |
| "grad_norm": 0.47839266061782837, |
| "learning_rate": 4.995241165161133e-05, |
| "lookahead_loss": 9.290306776046753, |
| "loss": 5.9344, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.0019073486328125, |
| "grad_norm": 0.5228514671325684, |
| "learning_rate": 4.990472793579102e-05, |
| "lookahead_loss": 8.670977174758912, |
| "loss": 5.4972, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.00286102294921875, |
| "grad_norm": 0.42427414655685425, |
| "learning_rate": 4.98570442199707e-05, |
| "lookahead_loss": 8.167822477340698, |
| "loss": 5.193, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.003814697265625, |
| "grad_norm": 0.38330137729644775, |
| "learning_rate": 4.9809360504150393e-05, |
| "lookahead_loss": 7.861059184074402, |
| "loss": 5.0143, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.00476837158203125, |
| "grad_norm": 0.3493882119655609, |
| "learning_rate": 4.9761676788330084e-05, |
| "lookahead_loss": 7.637384836196899, |
| "loss": 4.8766, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.0057220458984375, |
| "grad_norm": 0.32057973742485046, |
| "learning_rate": 4.971399307250977e-05, |
| "lookahead_loss": 7.5237892694473265, |
| "loss": 4.8024, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.00667572021484375, |
| "grad_norm": 0.3964500427246094, |
| "learning_rate": 4.966630935668946e-05, |
| "lookahead_loss": 7.4240171346664425, |
| "loss": 4.7319, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.00762939453125, |
| "grad_norm": 0.3556903004646301, |
| "learning_rate": 4.961862564086914e-05, |
| "lookahead_loss": 7.364227453231812, |
| "loss": 4.6859, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.00858306884765625, |
| "grad_norm": 0.3163730800151825, |
| "learning_rate": 4.957094192504883e-05, |
| "lookahead_loss": 7.28202552986145, |
| "loss": 4.6178, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.0095367431640625, |
| "grad_norm": 0.3415260314941406, |
| "learning_rate": 4.952325820922852e-05, |
| "lookahead_loss": 7.296242783546448, |
| "loss": 4.6301, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.0095367431640625, |
| "eval_accuracy": 0.03119412915851272, |
| "eval_lookahead_loss": 7.236605150604248, |
| "eval_lookahead_perplexity": 1389.3692561769383, |
| "eval_loss": 4.586381435394287, |
| "eval_perplexity": 98.1386657695201, |
| "eval_runtime": 516.0694, |
| "eval_samples_per_second": 19.377, |
| "eval_steps_per_second": 4.844, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.01049041748046875, |
| "grad_norm": 0.35341939330101013, |
| "learning_rate": 4.9475574493408205e-05, |
| "lookahead_loss": 7.269233721733094, |
| "loss": 4.6041, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.011444091796875, |
| "grad_norm": 0.3067445456981659, |
| "learning_rate": 4.9427890777587895e-05, |
| "lookahead_loss": 7.29026060962677, |
| "loss": 4.6214, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.01239776611328125, |
| "grad_norm": 0.3368770182132721, |
| "learning_rate": 4.938020706176758e-05, |
| "lookahead_loss": 7.258184072494507, |
| "loss": 4.592, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.0133514404296875, |
| "grad_norm": 0.3502334952354431, |
| "learning_rate": 4.933252334594727e-05, |
| "lookahead_loss": 7.237717715263367, |
| "loss": 4.5715, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.01430511474609375, |
| "grad_norm": 0.3599381148815155, |
| "learning_rate": 4.928483963012696e-05, |
| "lookahead_loss": 7.221656904220581, |
| "loss": 4.555, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.0152587890625, |
| "grad_norm": 0.3626171946525574, |
| "learning_rate": 4.923715591430664e-05, |
| "lookahead_loss": 7.231643494606018, |
| "loss": 4.5589, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.01621246337890625, |
| "grad_norm": 0.3790018856525421, |
| "learning_rate": 4.918947219848633e-05, |
| "lookahead_loss": 7.17628225517273, |
| "loss": 4.5139, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.0171661376953125, |
| "grad_norm": 0.4178946316242218, |
| "learning_rate": 4.9141788482666016e-05, |
| "lookahead_loss": 7.155110068321228, |
| "loss": 4.4933, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.01811981201171875, |
| "grad_norm": 0.41516491770744324, |
| "learning_rate": 4.9094104766845706e-05, |
| "lookahead_loss": 7.163973123550415, |
| "loss": 4.496, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.019073486328125, |
| "grad_norm": 0.4430026113986969, |
| "learning_rate": 4.9046421051025396e-05, |
| "lookahead_loss": 7.152189309120178, |
| "loss": 4.4871, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.019073486328125, |
| "eval_accuracy": 0.031754598825831704, |
| "eval_lookahead_loss": 7.113310220527649, |
| "eval_lookahead_perplexity": 1228.206458669234, |
| "eval_loss": 4.462108135223389, |
| "eval_perplexity": 86.67002878113007, |
| "eval_runtime": 522.3091, |
| "eval_samples_per_second": 19.146, |
| "eval_steps_per_second": 4.786, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.02002716064453125, |
| "grad_norm": 0.5481619238853455, |
| "learning_rate": 4.899873733520508e-05, |
| "lookahead_loss": 7.124247222900391, |
| "loss": 4.4599, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.0209808349609375, |
| "grad_norm": 0.5940960049629211, |
| "learning_rate": 4.895105361938477e-05, |
| "lookahead_loss": 7.1434852418899535, |
| "loss": 4.4718, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.02193450927734375, |
| "grad_norm": 0.7959221601486206, |
| "learning_rate": 4.890336990356445e-05, |
| "lookahead_loss": 7.147353162765503, |
| "loss": 4.475, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.02288818359375, |
| "grad_norm": 0.5282385349273682, |
| "learning_rate": 4.8855686187744143e-05, |
| "lookahead_loss": 7.19229192352295, |
| "loss": 4.5065, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.02384185791015625, |
| "grad_norm": 0.716699481010437, |
| "learning_rate": 4.8808002471923834e-05, |
| "lookahead_loss": 7.150345046043396, |
| "loss": 4.4729, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.0247955322265625, |
| "grad_norm": 0.9415345788002014, |
| "learning_rate": 4.876031875610352e-05, |
| "lookahead_loss": 7.165666316986084, |
| "loss": 4.4832, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.02574920654296875, |
| "grad_norm": 0.6600722074508667, |
| "learning_rate": 4.871263504028321e-05, |
| "lookahead_loss": 7.080785141944885, |
| "loss": 4.4265, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.026702880859375, |
| "grad_norm": 0.8373561501502991, |
| "learning_rate": 4.866495132446289e-05, |
| "lookahead_loss": 7.111087073326111, |
| "loss": 4.4359, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.02765655517578125, |
| "grad_norm": 1.6005096435546875, |
| "learning_rate": 4.861726760864258e-05, |
| "lookahead_loss": 7.129780281066894, |
| "loss": 4.4485, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.0286102294921875, |
| "grad_norm": 1.2250703573226929, |
| "learning_rate": 4.856958389282227e-05, |
| "lookahead_loss": 7.115953999519348, |
| "loss": 4.4417, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.0286102294921875, |
| "eval_accuracy": 0.0339853228962818, |
| "eval_lookahead_loss": 7.045913993644715, |
| "eval_lookahead_perplexity": 1148.1577652860024, |
| "eval_loss": 4.390574932098389, |
| "eval_perplexity": 80.68679507605191, |
| "eval_runtime": 519.7786, |
| "eval_samples_per_second": 19.239, |
| "eval_steps_per_second": 4.81, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.02956390380859375, |
| "grad_norm": 1.4603400230407715, |
| "learning_rate": 4.8521900177001955e-05, |
| "lookahead_loss": 7.124241777420044, |
| "loss": 4.4497, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.030517578125, |
| "grad_norm": 1.6053916215896606, |
| "learning_rate": 4.8474216461181645e-05, |
| "lookahead_loss": 7.088449460983276, |
| "loss": 4.4172, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.03147125244140625, |
| "grad_norm": 2.3265323638916016, |
| "learning_rate": 4.842653274536133e-05, |
| "lookahead_loss": 7.081040790557862, |
| "loss": 4.4108, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.0324249267578125, |
| "grad_norm": 1.873970627784729, |
| "learning_rate": 4.837884902954102e-05, |
| "lookahead_loss": 7.052405204772949, |
| "loss": 4.3861, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.03337860107421875, |
| "grad_norm": 1.9784423112869263, |
| "learning_rate": 4.833116531372071e-05, |
| "lookahead_loss": 7.082913987159729, |
| "loss": 4.4111, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.034332275390625, |
| "grad_norm": 2.0871779918670654, |
| "learning_rate": 4.828348159790039e-05, |
| "lookahead_loss": 7.119975289344787, |
| "loss": 4.4378, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.03528594970703125, |
| "grad_norm": 2.4158153533935547, |
| "learning_rate": 4.823579788208008e-05, |
| "lookahead_loss": 7.079687372207641, |
| "loss": 4.4055, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.0362396240234375, |
| "grad_norm": 2.0315685272216797, |
| "learning_rate": 4.8188114166259766e-05, |
| "lookahead_loss": 7.0762225513458255, |
| "loss": 4.4001, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.03719329833984375, |
| "grad_norm": 3.308830499649048, |
| "learning_rate": 4.8140430450439456e-05, |
| "lookahead_loss": 7.064541185379029, |
| "loss": 4.3927, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.03814697265625, |
| "grad_norm": 2.238238573074341, |
| "learning_rate": 4.8092746734619146e-05, |
| "lookahead_loss": 7.077585823059082, |
| "loss": 4.3978, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.03814697265625, |
| "eval_accuracy": 0.03521545988258317, |
| "eval_lookahead_loss": 6.992431879234314, |
| "eval_lookahead_perplexity": 1088.3650328004826, |
| "eval_loss": 4.342653751373291, |
| "eval_perplexity": 76.9113724176908, |
| "eval_runtime": 532.9305, |
| "eval_samples_per_second": 18.764, |
| "eval_steps_per_second": 4.691, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.03910064697265625, |
| "grad_norm": 3.000309944152832, |
| "learning_rate": 4.804506301879883e-05, |
| "lookahead_loss": 7.083662112236023, |
| "loss": 4.4046, |
| "step": 20500 |
| }, |
| { |
| "epoch": 0.0400543212890625, |
| "grad_norm": 2.325451135635376, |
| "learning_rate": 4.799737930297852e-05, |
| "lookahead_loss": 7.0873232898712155, |
| "loss": 4.4065, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.04100799560546875, |
| "grad_norm": 3.069692373275757, |
| "learning_rate": 4.79496955871582e-05, |
| "lookahead_loss": 7.058162942886352, |
| "loss": 4.3848, |
| "step": 21500 |
| }, |
| { |
| "epoch": 0.041961669921875, |
| "grad_norm": 3.350290060043335, |
| "learning_rate": 4.7902011871337893e-05, |
| "lookahead_loss": 7.070309939384461, |
| "loss": 4.3963, |
| "step": 22000 |
| }, |
| { |
| "epoch": 0.04291534423828125, |
| "grad_norm": 5.201033115386963, |
| "learning_rate": 4.7854328155517584e-05, |
| "lookahead_loss": 7.046905223846435, |
| "loss": 4.3767, |
| "step": 22500 |
| }, |
| { |
| "epoch": 0.0438690185546875, |
| "grad_norm": 4.287589073181152, |
| "learning_rate": 4.780664443969727e-05, |
| "lookahead_loss": 7.04586190032959, |
| "loss": 4.3753, |
| "step": 23000 |
| }, |
| { |
| "epoch": 0.04482269287109375, |
| "grad_norm": 2.5558464527130127, |
| "learning_rate": 4.775896072387696e-05, |
| "lookahead_loss": 7.0209474693536755, |
| "loss": 4.358, |
| "step": 23500 |
| }, |
| { |
| "epoch": 0.0457763671875, |
| "grad_norm": 3.9525115489959717, |
| "learning_rate": 4.771127700805664e-05, |
| "lookahead_loss": 7.05639953994751, |
| "loss": 4.3867, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.04673004150390625, |
| "grad_norm": 3.750539541244507, |
| "learning_rate": 4.766359329223633e-05, |
| "lookahead_loss": 7.069089519500732, |
| "loss": 4.3939, |
| "step": 24500 |
| }, |
| { |
| "epoch": 0.0476837158203125, |
| "grad_norm": 6.372467041015625, |
| "learning_rate": 4.761590957641602e-05, |
| "lookahead_loss": 6.999449357032776, |
| "loss": 4.3419, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.0476837158203125, |
| "eval_accuracy": 0.03489686888454012, |
| "eval_lookahead_loss": 6.956589477157593, |
| "eval_lookahead_perplexity": 1050.0462367243115, |
| "eval_loss": 4.311020851135254, |
| "eval_perplexity": 74.51652039978697, |
| "eval_runtime": 534.7013, |
| "eval_samples_per_second": 18.702, |
| "eval_steps_per_second": 4.676, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.04863739013671875, |
| "grad_norm": 6.991776466369629, |
| "learning_rate": 4.7568225860595705e-05, |
| "lookahead_loss": 7.03860978603363, |
| "loss": 4.3732, |
| "step": 25500 |
| }, |
| { |
| "epoch": 0.049591064453125, |
| "grad_norm": 4.837189197540283, |
| "learning_rate": 4.7520542144775395e-05, |
| "lookahead_loss": 7.0244465389251705, |
| "loss": 4.3598, |
| "step": 26000 |
| }, |
| { |
| "epoch": 0.05054473876953125, |
| "grad_norm": 5.740037441253662, |
| "learning_rate": 4.747285842895508e-05, |
| "lookahead_loss": 7.014849040031433, |
| "loss": 4.3552, |
| "step": 26500 |
| }, |
| { |
| "epoch": 0.0514984130859375, |
| "grad_norm": 4.548750400543213, |
| "learning_rate": 4.742517471313477e-05, |
| "lookahead_loss": 7.0321577501296995, |
| "loss": 4.3654, |
| "step": 27000 |
| }, |
| { |
| "epoch": 0.05245208740234375, |
| "grad_norm": 4.6926422119140625, |
| "learning_rate": 4.737749099731446e-05, |
| "lookahead_loss": 7.01492589378357, |
| "loss": 4.3569, |
| "step": 27500 |
| }, |
| { |
| "epoch": 0.05340576171875, |
| "grad_norm": 5.15637731552124, |
| "learning_rate": 4.732980728149414e-05, |
| "lookahead_loss": 6.99444252204895, |
| "loss": 4.3467, |
| "step": 28000 |
| }, |
| { |
| "epoch": 0.05435943603515625, |
| "grad_norm": 4.31205415725708, |
| "learning_rate": 4.728212356567383e-05, |
| "lookahead_loss": 7.008089102745056, |
| "loss": 4.3491, |
| "step": 28500 |
| }, |
| { |
| "epoch": 0.0553131103515625, |
| "grad_norm": 5.92137336730957, |
| "learning_rate": 4.7234439849853516e-05, |
| "lookahead_loss": 6.9638630380630495, |
| "loss": 4.3158, |
| "step": 29000 |
| }, |
| { |
| "epoch": 0.05626678466796875, |
| "grad_norm": 6.110989570617676, |
| "learning_rate": 4.7186756134033206e-05, |
| "lookahead_loss": 6.990848824501038, |
| "loss": 4.3417, |
| "step": 29500 |
| }, |
| { |
| "epoch": 0.057220458984375, |
| "grad_norm": 8.663269996643066, |
| "learning_rate": 4.7139072418212896e-05, |
| "lookahead_loss": 6.960992939949036, |
| "loss": 4.316, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.057220458984375, |
| "eval_accuracy": 0.036095107632093934, |
| "eval_lookahead_loss": 6.948215903663635, |
| "eval_lookahead_perplexity": 1041.2903077577691, |
| "eval_loss": 4.320182800292969, |
| "eval_perplexity": 75.20237405162895, |
| "eval_runtime": 521.9654, |
| "eval_samples_per_second": 19.158, |
| "eval_steps_per_second": 4.79, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.05817413330078125, |
| "grad_norm": 7.960083961486816, |
| "learning_rate": 4.709138870239258e-05, |
| "lookahead_loss": 6.963083945274353, |
| "loss": 4.316, |
| "step": 30500 |
| }, |
| { |
| "epoch": 0.0591278076171875, |
| "grad_norm": 7.997723579406738, |
| "learning_rate": 4.704370498657227e-05, |
| "lookahead_loss": 6.942691106796264, |
| "loss": 4.3003, |
| "step": 31000 |
| }, |
| { |
| "epoch": 0.06008148193359375, |
| "grad_norm": 15.110528945922852, |
| "learning_rate": 4.699602127075195e-05, |
| "lookahead_loss": 7.044429353713989, |
| "loss": 4.3816, |
| "step": 31500 |
| }, |
| { |
| "epoch": 0.06103515625, |
| "grad_norm": 10.701312065124512, |
| "learning_rate": 4.6948337554931643e-05, |
| "lookahead_loss": 7.026671842575073, |
| "loss": 4.3714, |
| "step": 32000 |
| }, |
| { |
| "epoch": 0.06198883056640625, |
| "grad_norm": 8.800566673278809, |
| "learning_rate": 4.6900653839111334e-05, |
| "lookahead_loss": 6.997694608688355, |
| "loss": 4.3499, |
| "step": 32500 |
| }, |
| { |
| "epoch": 0.0629425048828125, |
| "grad_norm": 7.564679145812988, |
| "learning_rate": 4.685297012329102e-05, |
| "lookahead_loss": 6.952921081542969, |
| "loss": 4.3101, |
| "step": 33000 |
| }, |
| { |
| "epoch": 0.06389617919921875, |
| "grad_norm": 6.677249431610107, |
| "learning_rate": 4.680528640747071e-05, |
| "lookahead_loss": 7.052048943519592, |
| "loss": 4.3864, |
| "step": 33500 |
| }, |
| { |
| "epoch": 0.064849853515625, |
| "grad_norm": 10.225991249084473, |
| "learning_rate": 4.675760269165039e-05, |
| "lookahead_loss": 6.983716876029968, |
| "loss": 4.333, |
| "step": 34000 |
| }, |
| { |
| "epoch": 0.06580352783203125, |
| "grad_norm": 7.170054912567139, |
| "learning_rate": 4.670991897583008e-05, |
| "lookahead_loss": 7.01130031299591, |
| "loss": 4.3605, |
| "step": 34500 |
| }, |
| { |
| "epoch": 0.0667572021484375, |
| "grad_norm": 6.779210090637207, |
| "learning_rate": 4.666223526000977e-05, |
| "lookahead_loss": 7.006741167068482, |
| "loss": 4.3488, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.0667572021484375, |
| "eval_accuracy": 0.03458767123287671, |
| "eval_lookahead_loss": 6.913046338272094, |
| "eval_lookahead_perplexity": 1005.3050816643253, |
| "eval_loss": 4.287929534912109, |
| "eval_perplexity": 72.81555026340061, |
| "eval_runtime": 529.4894, |
| "eval_samples_per_second": 18.886, |
| "eval_steps_per_second": 4.722, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.06771087646484375, |
| "grad_norm": 10.978616714477539, |
| "learning_rate": 4.6614551544189455e-05, |
| "lookahead_loss": 6.989995750427246, |
| "loss": 4.3434, |
| "step": 35500 |
| }, |
| { |
| "epoch": 0.06866455078125, |
| "grad_norm": 8.680578231811523, |
| "learning_rate": 4.6566867828369145e-05, |
| "lookahead_loss": 6.997309655189514, |
| "loss": 4.3435, |
| "step": 36000 |
| }, |
| { |
| "epoch": 0.06961822509765625, |
| "grad_norm": 10.540365219116211, |
| "learning_rate": 4.651918411254883e-05, |
| "lookahead_loss": 7.014720707893372, |
| "loss": 4.3631, |
| "step": 36500 |
| }, |
| { |
| "epoch": 0.0705718994140625, |
| "grad_norm": 11.400374412536621, |
| "learning_rate": 4.647150039672852e-05, |
| "lookahead_loss": 6.963991888046265, |
| "loss": 4.3196, |
| "step": 37000 |
| }, |
| { |
| "epoch": 0.07152557373046875, |
| "grad_norm": 11.558246612548828, |
| "learning_rate": 4.642381668090821e-05, |
| "lookahead_loss": 6.989323397636413, |
| "loss": 4.3397, |
| "step": 37500 |
| }, |
| { |
| "epoch": 0.072479248046875, |
| "grad_norm": 13.934370040893555, |
| "learning_rate": 4.637613296508789e-05, |
| "lookahead_loss": 6.990857390403748, |
| "loss": 4.3369, |
| "step": 38000 |
| }, |
| { |
| "epoch": 0.07343292236328125, |
| "grad_norm": 14.193124771118164, |
| "learning_rate": 4.632844924926758e-05, |
| "lookahead_loss": 6.950243325233459, |
| "loss": 4.3097, |
| "step": 38500 |
| }, |
| { |
| "epoch": 0.0743865966796875, |
| "grad_norm": 7.403049468994141, |
| "learning_rate": 4.6280765533447266e-05, |
| "lookahead_loss": 7.002062555313111, |
| "loss": 4.3498, |
| "step": 39000 |
| }, |
| { |
| "epoch": 0.07534027099609375, |
| "grad_norm": 17.167957305908203, |
| "learning_rate": 4.6233081817626956e-05, |
| "lookahead_loss": 6.96073443031311, |
| "loss": 4.318, |
| "step": 39500 |
| }, |
| { |
| "epoch": 0.0762939453125, |
| "grad_norm": 11.108458518981934, |
| "learning_rate": 4.6185398101806646e-05, |
| "lookahead_loss": 6.971267002105713, |
| "loss": 4.3213, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.0762939453125, |
| "eval_accuracy": 0.034748727984344426, |
| "eval_lookahead_loss": 6.893352962112426, |
| "eval_lookahead_perplexity": 985.7009003792375, |
| "eval_loss": 4.272909164428711, |
| "eval_perplexity": 71.7300067634484, |
| "eval_runtime": 519.2948, |
| "eval_samples_per_second": 19.257, |
| "eval_steps_per_second": 4.814, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.07724761962890625, |
| "grad_norm": 13.51186466217041, |
| "learning_rate": 4.613771438598633e-05, |
| "lookahead_loss": 6.971992794990539, |
| "loss": 4.3267, |
| "step": 40500 |
| }, |
| { |
| "epoch": 0.0782012939453125, |
| "grad_norm": 17.52781105041504, |
| "learning_rate": 4.609003067016602e-05, |
| "lookahead_loss": 6.978572855949402, |
| "loss": 4.3306, |
| "step": 41000 |
| }, |
| { |
| "epoch": 0.07915496826171875, |
| "grad_norm": 22.18255615234375, |
| "learning_rate": 4.60423469543457e-05, |
| "lookahead_loss": 6.973974548339844, |
| "loss": 4.3277, |
| "step": 41500 |
| }, |
| { |
| "epoch": 0.080108642578125, |
| "grad_norm": 15.776101112365723, |
| "learning_rate": 4.5994663238525393e-05, |
| "lookahead_loss": 6.974434162139892, |
| "loss": 4.3302, |
| "step": 42000 |
| }, |
| { |
| "epoch": 0.08106231689453125, |
| "grad_norm": 13.264528274536133, |
| "learning_rate": 4.5946979522705084e-05, |
| "lookahead_loss": 6.922822203636169, |
| "loss": 4.2955, |
| "step": 42500 |
| }, |
| { |
| "epoch": 0.0820159912109375, |
| "grad_norm": 9.05212116241455, |
| "learning_rate": 4.589929580688477e-05, |
| "lookahead_loss": 6.967613657951355, |
| "loss": 4.3292, |
| "step": 43000 |
| }, |
| { |
| "epoch": 0.08296966552734375, |
| "grad_norm": 52.12398910522461, |
| "learning_rate": 4.585161209106446e-05, |
| "lookahead_loss": 6.939634778022766, |
| "loss": 4.3035, |
| "step": 43500 |
| }, |
| { |
| "epoch": 0.08392333984375, |
| "grad_norm": 9.83299732208252, |
| "learning_rate": 4.580392837524414e-05, |
| "lookahead_loss": 6.969861132621765, |
| "loss": 4.3229, |
| "step": 44000 |
| }, |
| { |
| "epoch": 0.08487701416015625, |
| "grad_norm": 15.060040473937988, |
| "learning_rate": 4.575624465942383e-05, |
| "lookahead_loss": 6.928612260818482, |
| "loss": 4.3024, |
| "step": 44500 |
| }, |
| { |
| "epoch": 0.0858306884765625, |
| "grad_norm": 32.98693084716797, |
| "learning_rate": 4.570856094360352e-05, |
| "lookahead_loss": 6.932858068466187, |
| "loss": 4.2945, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.0858306884765625, |
| "eval_accuracy": 0.03372641878669276, |
| "eval_lookahead_loss": 6.868747735786438, |
| "eval_lookahead_perplexity": 961.7434544999155, |
| "eval_loss": 4.248973846435547, |
| "eval_perplexity": 70.03351032548787, |
| "eval_runtime": 646.5724, |
| "eval_samples_per_second": 15.466, |
| "eval_steps_per_second": 3.867, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.08678436279296875, |
| "grad_norm": 14.970084190368652, |
| "learning_rate": 4.5660877227783205e-05, |
| "lookahead_loss": 6.95002982711792, |
| "loss": 4.3107, |
| "step": 45500 |
| }, |
| { |
| "epoch": 0.087738037109375, |
| "grad_norm": 17.196474075317383, |
| "learning_rate": 4.5613193511962895e-05, |
| "lookahead_loss": 6.929351140975952, |
| "loss": 4.3093, |
| "step": 46000 |
| }, |
| { |
| "epoch": 0.08869171142578125, |
| "grad_norm": 23.335224151611328, |
| "learning_rate": 4.556550979614258e-05, |
| "lookahead_loss": 6.901777314186096, |
| "loss": 4.286, |
| "step": 46500 |
| }, |
| { |
| "epoch": 0.0896453857421875, |
| "grad_norm": 24.333749771118164, |
| "learning_rate": 4.551782608032227e-05, |
| "lookahead_loss": 6.945984739303589, |
| "loss": 4.3142, |
| "step": 47000 |
| }, |
| { |
| "epoch": 0.09059906005859375, |
| "grad_norm": 15.729629516601562, |
| "learning_rate": 4.547014236450196e-05, |
| "lookahead_loss": 6.881781202316284, |
| "loss": 4.2581, |
| "step": 47500 |
| }, |
| { |
| "epoch": 0.091552734375, |
| "grad_norm": 18.509035110473633, |
| "learning_rate": 4.542245864868164e-05, |
| "lookahead_loss": 6.898589003562927, |
| "loss": 4.2727, |
| "step": 48000 |
| }, |
| { |
| "epoch": 0.09250640869140625, |
| "grad_norm": 16.001665115356445, |
| "learning_rate": 4.537477493286133e-05, |
| "lookahead_loss": 6.867767862319946, |
| "loss": 4.2464, |
| "step": 48500 |
| }, |
| { |
| "epoch": 0.0934600830078125, |
| "grad_norm": 44.129173278808594, |
| "learning_rate": 4.5327091217041016e-05, |
| "lookahead_loss": 6.902135440826416, |
| "loss": 4.2775, |
| "step": 49000 |
| }, |
| { |
| "epoch": 0.09441375732421875, |
| "grad_norm": 19.078657150268555, |
| "learning_rate": 4.5279407501220706e-05, |
| "lookahead_loss": 6.915942321777344, |
| "loss": 4.2885, |
| "step": 49500 |
| }, |
| { |
| "epoch": 0.095367431640625, |
| "grad_norm": 24.603105545043945, |
| "learning_rate": 4.523172378540039e-05, |
| "lookahead_loss": 6.944480584144593, |
| "loss": 4.3156, |
| "step": 50000 |
| }, |
| { |
| "epoch": 0.095367431640625, |
| "eval_accuracy": 0.03630313111545988, |
| "eval_lookahead_loss": 6.854488192749024, |
| "eval_lookahead_perplexity": 948.1267470605636, |
| "eval_loss": 4.244652271270752, |
| "eval_perplexity": 69.73150827885007, |
| "eval_runtime": 538.6221, |
| "eval_samples_per_second": 18.566, |
| "eval_steps_per_second": 4.641, |
| "step": 50000 |
| }, |
| { |
| "epoch": 0.09632110595703125, |
| "grad_norm": 13.053648948669434, |
| "learning_rate": 4.518404006958008e-05, |
| "lookahead_loss": 6.975432216644287, |
| "loss": 4.3415, |
| "step": 50500 |
| }, |
| { |
| "epoch": 0.0972747802734375, |
| "grad_norm": 24.497657775878906, |
| "learning_rate": 4.513635635375977e-05, |
| "lookahead_loss": 6.936278092384338, |
| "loss": 4.3069, |
| "step": 51000 |
| }, |
| { |
| "epoch": 0.09822845458984375, |
| "grad_norm": 13.54862117767334, |
| "learning_rate": 4.508867263793945e-05, |
| "lookahead_loss": 6.955997831344605, |
| "loss": 4.3305, |
| "step": 51500 |
| }, |
| { |
| "epoch": 0.09918212890625, |
| "grad_norm": 62.653263092041016, |
| "learning_rate": 4.5040988922119143e-05, |
| "lookahead_loss": 6.919645679473877, |
| "loss": 4.2973, |
| "step": 52000 |
| }, |
| { |
| "epoch": 0.10013580322265625, |
| "grad_norm": 49.663692474365234, |
| "learning_rate": 4.499330520629883e-05, |
| "lookahead_loss": 6.94014087677002, |
| "loss": 4.3124, |
| "step": 52500 |
| }, |
| { |
| "epoch": 0.1010894775390625, |
| "grad_norm": 11.652701377868652, |
| "learning_rate": 4.494562149047852e-05, |
| "lookahead_loss": 6.943861758232116, |
| "loss": 4.3122, |
| "step": 53000 |
| }, |
| { |
| "epoch": 0.10204315185546875, |
| "grad_norm": 13.936113357543945, |
| "learning_rate": 4.489793777465821e-05, |
| "lookahead_loss": 6.891527242660523, |
| "loss": 4.2737, |
| "step": 53500 |
| }, |
| { |
| "epoch": 0.102996826171875, |
| "grad_norm": 16.067378997802734, |
| "learning_rate": 4.485025405883789e-05, |
| "lookahead_loss": 6.945025591850281, |
| "loss": 4.3139, |
| "step": 54000 |
| }, |
| { |
| "epoch": 0.10395050048828125, |
| "grad_norm": 24.42043685913086, |
| "learning_rate": 4.480257034301758e-05, |
| "lookahead_loss": 6.877603525161743, |
| "loss": 4.2583, |
| "step": 54500 |
| }, |
| { |
| "epoch": 0.1049041748046875, |
| "grad_norm": 13.74610424041748, |
| "learning_rate": 4.4754886627197264e-05, |
| "lookahead_loss": 6.973377682685852, |
| "loss": 4.3335, |
| "step": 55000 |
| }, |
| { |
| "epoch": 0.1049041748046875, |
| "eval_accuracy": 0.03585401174168298, |
| "eval_lookahead_loss": 6.843152507591247, |
| "eval_lookahead_perplexity": 937.4397673225168, |
| "eval_loss": 4.238409042358398, |
| "eval_perplexity": 69.2975146806309, |
| "eval_runtime": 540.2429, |
| "eval_samples_per_second": 18.51, |
| "eval_steps_per_second": 4.628, |
| "step": 55000 |
| }, |
| { |
| "epoch": 0.10585784912109375, |
| "grad_norm": 10.1171293258667, |
| "learning_rate": 4.4707202911376955e-05, |
| "lookahead_loss": 6.920138500213623, |
| "loss": 4.2991, |
| "step": 55500 |
| }, |
| { |
| "epoch": 0.1068115234375, |
| "grad_norm": 34.18367004394531, |
| "learning_rate": 4.4659519195556645e-05, |
| "lookahead_loss": 6.915216091156005, |
| "loss": 4.2844, |
| "step": 56000 |
| }, |
| { |
| "epoch": 0.10776519775390625, |
| "grad_norm": 30.37221336364746, |
| "learning_rate": 4.461183547973633e-05, |
| "lookahead_loss": 6.952143461227417, |
| "loss": 4.3192, |
| "step": 56500 |
| }, |
| { |
| "epoch": 0.1087188720703125, |
| "grad_norm": 49.1783332824707, |
| "learning_rate": 4.456415176391602e-05, |
| "lookahead_loss": 6.932615814208984, |
| "loss": 4.3039, |
| "step": 57000 |
| }, |
| { |
| "epoch": 0.10967254638671875, |
| "grad_norm": 20.412813186645508, |
| "learning_rate": 4.45164680480957e-05, |
| "lookahead_loss": 6.904240723609925, |
| "loss": 4.2811, |
| "step": 57500 |
| }, |
| { |
| "epoch": 0.110626220703125, |
| "grad_norm": 27.21106719970703, |
| "learning_rate": 4.446878433227539e-05, |
| "lookahead_loss": 6.765171314239502, |
| "loss": 4.1745, |
| "step": 58000 |
| }, |
| { |
| "epoch": 0.11157989501953125, |
| "grad_norm": 24.294544219970703, |
| "learning_rate": 4.442110061645508e-05, |
| "lookahead_loss": 6.951616223335266, |
| "loss": 4.3251, |
| "step": 58500 |
| }, |
| { |
| "epoch": 0.1125335693359375, |
| "grad_norm": 20.378206253051758, |
| "learning_rate": 4.4373416900634766e-05, |
| "lookahead_loss": 6.959182680130005, |
| "loss": 4.3283, |
| "step": 59000 |
| }, |
| { |
| "epoch": 0.11348724365234375, |
| "grad_norm": 12.9067964553833, |
| "learning_rate": 4.4325733184814456e-05, |
| "lookahead_loss": 6.9285484943389894, |
| "loss": 4.3083, |
| "step": 59500 |
| }, |
| { |
| "epoch": 0.11444091796875, |
| "grad_norm": 38.90343475341797, |
| "learning_rate": 4.427804946899414e-05, |
| "lookahead_loss": 6.915428097724915, |
| "loss": 4.2959, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.11444091796875, |
| "eval_accuracy": 0.037382974559686886, |
| "eval_lookahead_loss": 6.847941571235657, |
| "eval_lookahead_perplexity": 941.9399933643864, |
| "eval_loss": 4.25514554977417, |
| "eval_perplexity": 70.46707290860382, |
| "eval_runtime": 548.5723, |
| "eval_samples_per_second": 18.229, |
| "eval_steps_per_second": 4.557, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.11539459228515625, |
| "grad_norm": 18.8435115814209, |
| "learning_rate": 4.423036575317383e-05, |
| "lookahead_loss": 6.91269550037384, |
| "loss": 4.3019, |
| "step": 60500 |
| }, |
| { |
| "epoch": 0.1163482666015625, |
| "grad_norm": 33.77581787109375, |
| "learning_rate": 4.418268203735352e-05, |
| "lookahead_loss": 6.91165564250946, |
| "loss": 4.2978, |
| "step": 61000 |
| }, |
| { |
| "epoch": 0.11730194091796875, |
| "grad_norm": 21.479700088500977, |
| "learning_rate": 4.41349983215332e-05, |
| "lookahead_loss": 6.888538204193115, |
| "loss": 4.278, |
| "step": 61500 |
| }, |
| { |
| "epoch": 0.118255615234375, |
| "grad_norm": 68.20787811279297, |
| "learning_rate": 4.4087314605712893e-05, |
| "lookahead_loss": 6.920241100311279, |
| "loss": 4.3009, |
| "step": 62000 |
| }, |
| { |
| "epoch": 0.11920928955078125, |
| "grad_norm": 43.83788299560547, |
| "learning_rate": 4.403963088989258e-05, |
| "lookahead_loss": 6.9072804908752445, |
| "loss": 4.298, |
| "step": 62500 |
| }, |
| { |
| "epoch": 0.1201629638671875, |
| "grad_norm": 44.411651611328125, |
| "learning_rate": 4.399194717407227e-05, |
| "lookahead_loss": 6.946548145294189, |
| "loss": 4.3266, |
| "step": 63000 |
| }, |
| { |
| "epoch": 0.12111663818359375, |
| "grad_norm": 19.859819412231445, |
| "learning_rate": 4.394426345825196e-05, |
| "lookahead_loss": 6.92121026134491, |
| "loss": 4.3094, |
| "step": 63500 |
| }, |
| { |
| "epoch": 0.1220703125, |
| "grad_norm": 16.653898239135742, |
| "learning_rate": 4.389657974243164e-05, |
| "lookahead_loss": 6.889752195358277, |
| "loss": 4.2786, |
| "step": 64000 |
| }, |
| { |
| "epoch": 0.12302398681640625, |
| "grad_norm": 13.52586841583252, |
| "learning_rate": 4.384889602661133e-05, |
| "lookahead_loss": 6.929915476799011, |
| "loss": 4.3124, |
| "step": 64500 |
| }, |
| { |
| "epoch": 0.1239776611328125, |
| "grad_norm": 23.530059814453125, |
| "learning_rate": 4.3801212310791014e-05, |
| "lookahead_loss": 6.876242291450501, |
| "loss": 4.2732, |
| "step": 65000 |
| }, |
| { |
| "epoch": 0.1239776611328125, |
| "eval_accuracy": 0.03717573385518591, |
| "eval_lookahead_loss": 6.833459272956848, |
| "eval_lookahead_perplexity": 928.3968421052864, |
| "eval_loss": 4.241796493530273, |
| "eval_perplexity": 69.53265466570367, |
| "eval_runtime": 532.1132, |
| "eval_samples_per_second": 18.793, |
| "eval_steps_per_second": 4.698, |
| "step": 65000 |
| }, |
| { |
| "epoch": 0.12493133544921875, |
| "grad_norm": 37.93785095214844, |
| "learning_rate": 4.3753528594970705e-05, |
| "lookahead_loss": 6.876650710105896, |
| "loss": 4.2692, |
| "step": 65500 |
| }, |
| { |
| "epoch": 0.125885009765625, |
| "grad_norm": 24.844409942626953, |
| "learning_rate": 4.3705844879150395e-05, |
| "lookahead_loss": 6.878139553070068, |
| "loss": 4.266, |
| "step": 66000 |
| }, |
| { |
| "epoch": 0.12683868408203125, |
| "grad_norm": 17.841354370117188, |
| "learning_rate": 4.365816116333008e-05, |
| "lookahead_loss": 6.851497150421142, |
| "loss": 4.249, |
| "step": 66500 |
| }, |
| { |
| "epoch": 0.1277923583984375, |
| "grad_norm": 39.69922637939453, |
| "learning_rate": 4.361047744750977e-05, |
| "lookahead_loss": 6.865252791404724, |
| "loss": 4.2618, |
| "step": 67000 |
| }, |
| { |
| "epoch": 0.12874603271484375, |
| "grad_norm": 58.366493225097656, |
| "learning_rate": 4.356279373168945e-05, |
| "lookahead_loss": 6.906228137969971, |
| "loss": 4.2959, |
| "step": 67500 |
| }, |
| { |
| "epoch": 0.12969970703125, |
| "grad_norm": 35.06950759887695, |
| "learning_rate": 4.351511001586914e-05, |
| "lookahead_loss": 6.95566947555542, |
| "loss": 4.337, |
| "step": 68000 |
| }, |
| { |
| "epoch": 0.13065338134765625, |
| "grad_norm": 32.34492111206055, |
| "learning_rate": 4.346742630004883e-05, |
| "lookahead_loss": 6.909006106376648, |
| "loss": 4.2981, |
| "step": 68500 |
| }, |
| { |
| "epoch": 0.1316070556640625, |
| "grad_norm": 48.83039855957031, |
| "learning_rate": 4.3419742584228516e-05, |
| "lookahead_loss": 6.903264353752136, |
| "loss": 4.2953, |
| "step": 69000 |
| }, |
| { |
| "epoch": 0.13256072998046875, |
| "grad_norm": 70.04198455810547, |
| "learning_rate": 4.3372058868408206e-05, |
| "lookahead_loss": 6.894676825523376, |
| "loss": 4.2878, |
| "step": 69500 |
| }, |
| { |
| "epoch": 0.133514404296875, |
| "grad_norm": 20.394811630249023, |
| "learning_rate": 4.332437515258789e-05, |
| "lookahead_loss": 6.894831575393677, |
| "loss": 4.2924, |
| "step": 70000 |
| }, |
| { |
| "epoch": 0.133514404296875, |
| "eval_accuracy": 0.03839412915851272, |
| "eval_lookahead_loss": 6.823030910491943, |
| "eval_lookahead_perplexity": 918.765490223331, |
| "eval_loss": 4.244718551635742, |
| "eval_perplexity": 69.73613026184178, |
| "eval_runtime": 527.6392, |
| "eval_samples_per_second": 18.952, |
| "eval_steps_per_second": 4.738, |
| "step": 70000 |
| }, |
| { |
| "epoch": 0.13446807861328125, |
| "grad_norm": 25.3953857421875, |
| "learning_rate": 4.327669143676758e-05, |
| "lookahead_loss": 6.914466094017029, |
| "loss": 4.3088, |
| "step": 70500 |
| }, |
| { |
| "epoch": 0.1354217529296875, |
| "grad_norm": 43.19773483276367, |
| "learning_rate": 4.322900772094727e-05, |
| "lookahead_loss": 6.920570821762085, |
| "loss": 4.3051, |
| "step": 71000 |
| }, |
| { |
| "epoch": 0.13637542724609375, |
| "grad_norm": 42.381858825683594, |
| "learning_rate": 4.318132400512695e-05, |
| "lookahead_loss": 6.903384463310242, |
| "loss": 4.2952, |
| "step": 71500 |
| }, |
| { |
| "epoch": 0.1373291015625, |
| "grad_norm": 23.762941360473633, |
| "learning_rate": 4.3133640289306643e-05, |
| "lookahead_loss": 6.909006532669068, |
| "loss": 4.2991, |
| "step": 72000 |
| }, |
| { |
| "epoch": 0.13828277587890625, |
| "grad_norm": 29.057222366333008, |
| "learning_rate": 4.308595657348633e-05, |
| "lookahead_loss": 6.910077003479004, |
| "loss": 4.3034, |
| "step": 72500 |
| }, |
| { |
| "epoch": 0.1392364501953125, |
| "grad_norm": 153.4261474609375, |
| "learning_rate": 4.303827285766602e-05, |
| "lookahead_loss": 6.888102185249329, |
| "loss": 4.2878, |
| "step": 73000 |
| }, |
| { |
| "epoch": 0.14019012451171875, |
| "grad_norm": 15.459939002990723, |
| "learning_rate": 4.299058914184571e-05, |
| "lookahead_loss": 6.907894590377808, |
| "loss": 4.3014, |
| "step": 73500 |
| }, |
| { |
| "epoch": 0.141143798828125, |
| "grad_norm": 62.87086868286133, |
| "learning_rate": 4.294290542602539e-05, |
| "lookahead_loss": 6.893982265472412, |
| "loss": 4.2836, |
| "step": 74000 |
| }, |
| { |
| "epoch": 0.14209747314453125, |
| "grad_norm": 35.068294525146484, |
| "learning_rate": 4.289522171020508e-05, |
| "lookahead_loss": 6.8669888181686405, |
| "loss": 4.2665, |
| "step": 74500 |
| }, |
| { |
| "epoch": 0.1430511474609375, |
| "grad_norm": 66.57713317871094, |
| "learning_rate": 4.2847537994384764e-05, |
| "lookahead_loss": 6.896595262527466, |
| "loss": 4.2923, |
| "step": 75000 |
| }, |
| { |
| "epoch": 0.1430511474609375, |
| "eval_accuracy": 0.03636497064579256, |
| "eval_lookahead_loss": 6.83919605960846, |
| "eval_lookahead_perplexity": 933.7381630770153, |
| "eval_loss": 4.251890182495117, |
| "eval_perplexity": 70.23804968481546, |
| "eval_runtime": 529.7606, |
| "eval_samples_per_second": 18.876, |
| "eval_steps_per_second": 4.719, |
| "step": 75000 |
| }, |
| { |
| "epoch": 0.14400482177734375, |
| "grad_norm": 25.72868537902832, |
| "learning_rate": 4.2799854278564455e-05, |
| "lookahead_loss": 6.921064286231995, |
| "loss": 4.3106, |
| "step": 75500 |
| }, |
| { |
| "epoch": 0.14495849609375, |
| "grad_norm": 27.730438232421875, |
| "learning_rate": 4.2752170562744145e-05, |
| "lookahead_loss": 6.912421476364136, |
| "loss": 4.3031, |
| "step": 76000 |
| }, |
| { |
| "epoch": 0.14591217041015625, |
| "grad_norm": 68.38117218017578, |
| "learning_rate": 4.270448684692383e-05, |
| "lookahead_loss": 6.940694100379944, |
| "loss": 4.3284, |
| "step": 76500 |
| }, |
| { |
| "epoch": 0.1468658447265625, |
| "grad_norm": 53.29292678833008, |
| "learning_rate": 4.265680313110352e-05, |
| "lookahead_loss": 6.898805319786072, |
| "loss": 4.2968, |
| "step": 77000 |
| }, |
| { |
| "epoch": 0.14781951904296875, |
| "grad_norm": 32.84404754638672, |
| "learning_rate": 4.26091194152832e-05, |
| "lookahead_loss": 6.898816948890686, |
| "loss": 4.298, |
| "step": 77500 |
| }, |
| { |
| "epoch": 0.148773193359375, |
| "grad_norm": 29.60218048095703, |
| "learning_rate": 4.256143569946289e-05, |
| "lookahead_loss": 6.902635954856873, |
| "loss": 4.3043, |
| "step": 78000 |
| }, |
| { |
| "epoch": 0.14972686767578125, |
| "grad_norm": 29.58042335510254, |
| "learning_rate": 4.251375198364258e-05, |
| "lookahead_loss": 6.916378297805786, |
| "loss": 4.3195, |
| "step": 78500 |
| }, |
| { |
| "epoch": 0.1506805419921875, |
| "grad_norm": 38.733272552490234, |
| "learning_rate": 4.2466068267822266e-05, |
| "lookahead_loss": 6.922666564941406, |
| "loss": 4.3225, |
| "step": 79000 |
| }, |
| { |
| "epoch": 0.15163421630859375, |
| "grad_norm": 43.216041564941406, |
| "learning_rate": 4.2418384552001956e-05, |
| "lookahead_loss": 6.898184200286865, |
| "loss": 4.3006, |
| "step": 79500 |
| }, |
| { |
| "epoch": 0.152587890625, |
| "grad_norm": 33.54539489746094, |
| "learning_rate": 4.237070083618164e-05, |
| "lookahead_loss": 6.804031673431396, |
| "loss": 4.2252, |
| "step": 80000 |
| }, |
| { |
| "epoch": 0.152587890625, |
| "eval_accuracy": 0.03708434442270059, |
| "eval_lookahead_loss": 6.815139235687256, |
| "eval_lookahead_perplexity": 911.5434263241633, |
| "eval_loss": 4.238174915313721, |
| "eval_perplexity": 69.28129215745501, |
| "eval_runtime": 530.8957, |
| "eval_samples_per_second": 18.836, |
| "eval_steps_per_second": 4.709, |
| "step": 80000 |
| }, |
| { |
| "epoch": 0.15354156494140625, |
| "grad_norm": 42.14455795288086, |
| "learning_rate": 4.232301712036133e-05, |
| "lookahead_loss": 6.8902224273681645, |
| "loss": 4.2969, |
| "step": 80500 |
| }, |
| { |
| "epoch": 0.1544952392578125, |
| "grad_norm": 33.13570022583008, |
| "learning_rate": 4.227533340454102e-05, |
| "lookahead_loss": 6.866750447273255, |
| "loss": 4.2715, |
| "step": 81000 |
| }, |
| { |
| "epoch": 0.15544891357421875, |
| "grad_norm": 29.868881225585938, |
| "learning_rate": 4.22276496887207e-05, |
| "lookahead_loss": 6.874780201911927, |
| "loss": 4.2892, |
| "step": 81500 |
| }, |
| { |
| "epoch": 0.156402587890625, |
| "grad_norm": 17.926511764526367, |
| "learning_rate": 4.2179965972900393e-05, |
| "lookahead_loss": 6.907827667236328, |
| "loss": 4.3007, |
| "step": 82000 |
| }, |
| { |
| "epoch": 0.15735626220703125, |
| "grad_norm": 31.06654930114746, |
| "learning_rate": 4.213228225708008e-05, |
| "lookahead_loss": 6.821476745605469, |
| "loss": 4.243, |
| "step": 82500 |
| }, |
| { |
| "epoch": 0.1583099365234375, |
| "grad_norm": 139.1468048095703, |
| "learning_rate": 4.208459854125977e-05, |
| "lookahead_loss": 6.8343437967300416, |
| "loss": 4.2521, |
| "step": 83000 |
| }, |
| { |
| "epoch": 0.15926361083984375, |
| "grad_norm": 198.40293884277344, |
| "learning_rate": 4.203691482543946e-05, |
| "lookahead_loss": 6.820498440742493, |
| "loss": 4.2382, |
| "step": 83500 |
| }, |
| { |
| "epoch": 0.16021728515625, |
| "grad_norm": 21.4571533203125, |
| "learning_rate": 4.198923110961914e-05, |
| "lookahead_loss": 6.817658892631531, |
| "loss": 4.2389, |
| "step": 84000 |
| }, |
| { |
| "epoch": 0.16117095947265625, |
| "grad_norm": 22.81861114501953, |
| "learning_rate": 4.194154739379883e-05, |
| "lookahead_loss": 6.868869221687317, |
| "loss": 4.2826, |
| "step": 84500 |
| }, |
| { |
| "epoch": 0.1621246337890625, |
| "grad_norm": 9.23076343536377, |
| "learning_rate": 4.1893863677978514e-05, |
| "lookahead_loss": 6.930012647628784, |
| "loss": 4.3282, |
| "step": 85000 |
| }, |
| { |
| "epoch": 0.1621246337890625, |
| "eval_accuracy": 0.03720293542074364, |
| "eval_lookahead_loss": 6.80156947555542, |
| "eval_lookahead_perplexity": 899.2575474186556, |
| "eval_loss": 4.2262067794799805, |
| "eval_perplexity": 68.45706630389405, |
| "eval_runtime": 516.9164, |
| "eval_samples_per_second": 19.345, |
| "eval_steps_per_second": 4.836, |
| "step": 85000 |
| }, |
| { |
| "epoch": 0.16307830810546875, |
| "grad_norm": 40.47315216064453, |
| "learning_rate": 4.1846179962158205e-05, |
| "lookahead_loss": 6.934399582862854, |
| "loss": 4.335, |
| "step": 85500 |
| }, |
| { |
| "epoch": 0.164031982421875, |
| "grad_norm": 27.909290313720703, |
| "learning_rate": 4.1798496246337895e-05, |
| "lookahead_loss": 6.914112399101257, |
| "loss": 4.3154, |
| "step": 86000 |
| }, |
| { |
| "epoch": 0.16498565673828125, |
| "grad_norm": 22.771636962890625, |
| "learning_rate": 4.175081253051758e-05, |
| "lookahead_loss": 6.900087572097778, |
| "loss": 4.306, |
| "step": 86500 |
| }, |
| { |
| "epoch": 0.1659393310546875, |
| "grad_norm": 24.337963104248047, |
| "learning_rate": 4.170312881469727e-05, |
| "lookahead_loss": 6.903161329269409, |
| "loss": 4.3017, |
| "step": 87000 |
| }, |
| { |
| "epoch": 0.16689300537109375, |
| "grad_norm": 44.1867790222168, |
| "learning_rate": 4.165544509887695e-05, |
| "lookahead_loss": 6.8666487379074095, |
| "loss": 4.2699, |
| "step": 87500 |
| }, |
| { |
| "epoch": 0.1678466796875, |
| "grad_norm": 33.684051513671875, |
| "learning_rate": 4.160776138305664e-05, |
| "lookahead_loss": 6.877238927841186, |
| "loss": 4.2803, |
| "step": 88000 |
| }, |
| { |
| "epoch": 0.16880035400390625, |
| "grad_norm": 2417.9248046875, |
| "learning_rate": 4.156007766723633e-05, |
| "lookahead_loss": 6.875159637451172, |
| "loss": 4.2768, |
| "step": 88500 |
| }, |
| { |
| "epoch": 0.1697540283203125, |
| "grad_norm": 26.52556800842285, |
| "learning_rate": 4.1512393951416016e-05, |
| "lookahead_loss": 6.863027097702027, |
| "loss": 4.271, |
| "step": 89000 |
| }, |
| { |
| "epoch": 0.17070770263671875, |
| "grad_norm": 42.031463623046875, |
| "learning_rate": 4.1464710235595706e-05, |
| "lookahead_loss": 6.897581003189087, |
| "loss": 4.3001, |
| "step": 89500 |
| }, |
| { |
| "epoch": 0.171661376953125, |
| "grad_norm": 69.53683471679688, |
| "learning_rate": 4.141702651977539e-05, |
| "lookahead_loss": 6.87531339931488, |
| "loss": 4.2749, |
| "step": 90000 |
| }, |
| { |
| "epoch": 0.171661376953125, |
| "eval_accuracy": 0.03792270058708415, |
| "eval_lookahead_loss": 6.78285526561737, |
| "eval_lookahead_perplexity": 882.5851448919409, |
| "eval_loss": 4.21151065826416, |
| "eval_perplexity": 67.45836941913082, |
| "eval_runtime": 520.0151, |
| "eval_samples_per_second": 19.23, |
| "eval_steps_per_second": 4.808, |
| "step": 90000 |
| }, |
| { |
| "epoch": 0.17261505126953125, |
| "grad_norm": 50.69905471801758, |
| "learning_rate": 4.136934280395508e-05, |
| "lookahead_loss": 6.859779204368591, |
| "loss": 4.2727, |
| "step": 90500 |
| }, |
| { |
| "epoch": 0.1735687255859375, |
| "grad_norm": 39.7846794128418, |
| "learning_rate": 4.132165908813477e-05, |
| "lookahead_loss": 6.854459458351135, |
| "loss": 4.2648, |
| "step": 91000 |
| }, |
| { |
| "epoch": 0.17452239990234375, |
| "grad_norm": 189.6832275390625, |
| "learning_rate": 4.127397537231445e-05, |
| "lookahead_loss": 6.894931933403015, |
| "loss": 4.2965, |
| "step": 91500 |
| }, |
| { |
| "epoch": 0.17547607421875, |
| "grad_norm": 65.82791900634766, |
| "learning_rate": 4.1226291656494143e-05, |
| "lookahead_loss": 6.872823440551758, |
| "loss": 4.2786, |
| "step": 92000 |
| }, |
| { |
| "epoch": 0.17642974853515625, |
| "grad_norm": 255.64524841308594, |
| "learning_rate": 4.117860794067383e-05, |
| "lookahead_loss": 6.8618717288970945, |
| "loss": 4.2732, |
| "step": 92500 |
| }, |
| { |
| "epoch": 0.1773834228515625, |
| "grad_norm": 47.5529670715332, |
| "learning_rate": 4.113092422485352e-05, |
| "lookahead_loss": 6.872314291000366, |
| "loss": 4.2789, |
| "step": 93000 |
| }, |
| { |
| "epoch": 0.17833709716796875, |
| "grad_norm": 81.05416870117188, |
| "learning_rate": 4.108324050903321e-05, |
| "lookahead_loss": 6.872944073677063, |
| "loss": 4.288, |
| "step": 93500 |
| }, |
| { |
| "epoch": 0.179290771484375, |
| "grad_norm": 65.22057342529297, |
| "learning_rate": 4.103555679321289e-05, |
| "lookahead_loss": 6.897490359306335, |
| "loss": 4.3048, |
| "step": 94000 |
| }, |
| { |
| "epoch": 0.18024444580078125, |
| "grad_norm": 457.313720703125, |
| "learning_rate": 4.098787307739258e-05, |
| "lookahead_loss": 6.843439957618713, |
| "loss": 4.2655, |
| "step": 94500 |
| }, |
| { |
| "epoch": 0.1811981201171875, |
| "grad_norm": 47.758506774902344, |
| "learning_rate": 4.0940189361572264e-05, |
| "lookahead_loss": 6.877529720306397, |
| "loss": 4.2947, |
| "step": 95000 |
| }, |
| { |
| "epoch": 0.1811981201171875, |
| "eval_accuracy": 0.03745929549902153, |
| "eval_lookahead_loss": 6.788920408058167, |
| "eval_lookahead_perplexity": 887.9544157486474, |
| "eval_loss": 4.223480701446533, |
| "eval_perplexity": 68.27070113761616, |
| "eval_runtime": 521.8451, |
| "eval_samples_per_second": 19.163, |
| "eval_steps_per_second": 4.791, |
| "step": 95000 |
| }, |
| { |
| "epoch": 0.18215179443359375, |
| "grad_norm": 53.871639251708984, |
| "learning_rate": 4.0892505645751955e-05, |
| "lookahead_loss": 6.852700922966004, |
| "loss": 4.2679, |
| "step": 95500 |
| }, |
| { |
| "epoch": 0.18310546875, |
| "grad_norm": 164.45399475097656, |
| "learning_rate": 4.0844821929931645e-05, |
| "lookahead_loss": 6.870538569450378, |
| "loss": 4.2826, |
| "step": 96000 |
| }, |
| { |
| "epoch": 0.18405914306640625, |
| "grad_norm": 48.58974075317383, |
| "learning_rate": 4.079713821411133e-05, |
| "lookahead_loss": 6.86589546585083, |
| "loss": 4.2916, |
| "step": 96500 |
| }, |
| { |
| "epoch": 0.1850128173828125, |
| "grad_norm": 80.98228454589844, |
| "learning_rate": 4.074945449829102e-05, |
| "lookahead_loss": 6.898058345794678, |
| "loss": 4.3162, |
| "step": 97000 |
| }, |
| { |
| "epoch": 0.18596649169921875, |
| "grad_norm": 53.51699447631836, |
| "learning_rate": 4.07017707824707e-05, |
| "lookahead_loss": 6.897518453598022, |
| "loss": 4.3258, |
| "step": 97500 |
| }, |
| { |
| "epoch": 0.186920166015625, |
| "grad_norm": 88.69690704345703, |
| "learning_rate": 4.065408706665039e-05, |
| "lookahead_loss": 6.88834384727478, |
| "loss": 4.3169, |
| "step": 98000 |
| }, |
| { |
| "epoch": 0.18787384033203125, |
| "grad_norm": 74.50374603271484, |
| "learning_rate": 4.060640335083008e-05, |
| "lookahead_loss": 6.880096747398376, |
| "loss": 4.3068, |
| "step": 98500 |
| }, |
| { |
| "epoch": 0.1888275146484375, |
| "grad_norm": 22.93511962890625, |
| "learning_rate": 4.0558719635009766e-05, |
| "lookahead_loss": 6.870912865638733, |
| "loss": 4.2954, |
| "step": 99000 |
| }, |
| { |
| "epoch": 0.18978118896484375, |
| "grad_norm": 38.44004440307617, |
| "learning_rate": 4.0511035919189456e-05, |
| "lookahead_loss": 6.818508658409119, |
| "loss": 4.2577, |
| "step": 99500 |
| }, |
| { |
| "epoch": 0.19073486328125, |
| "grad_norm": 64.5463638305664, |
| "learning_rate": 4.046335220336914e-05, |
| "lookahead_loss": 6.8376926612854, |
| "loss": 4.2664, |
| "step": 100000 |
| }, |
| { |
| "epoch": 0.19073486328125, |
| "eval_accuracy": 0.037738747553816045, |
| "eval_lookahead_loss": 6.794300679588318, |
| "eval_lookahead_perplexity": 892.7447266428838, |
| "eval_loss": 4.232558727264404, |
| "eval_perplexity": 68.89328597011509, |
| "eval_runtime": 524.4432, |
| "eval_samples_per_second": 19.068, |
| "eval_steps_per_second": 4.767, |
| "step": 100000 |
| }, |
| { |
| "epoch": 0.19168853759765625, |
| "grad_norm": 42.10316467285156, |
| "learning_rate": 4.041566848754883e-05, |
| "lookahead_loss": 6.821827688217163, |
| "loss": 4.2536, |
| "step": 100500 |
| }, |
| { |
| "epoch": 0.1926422119140625, |
| "grad_norm": 114.29437255859375, |
| "learning_rate": 4.036798477172852e-05, |
| "lookahead_loss": 6.861088555335998, |
| "loss": 4.284, |
| "step": 101000 |
| }, |
| { |
| "epoch": 0.19359588623046875, |
| "grad_norm": 47.13467788696289, |
| "learning_rate": 4.03203010559082e-05, |
| "lookahead_loss": 6.923010750770569, |
| "loss": 4.3359, |
| "step": 101500 |
| }, |
| { |
| "epoch": 0.194549560546875, |
| "grad_norm": 155.36863708496094, |
| "learning_rate": 4.0272617340087893e-05, |
| "lookahead_loss": 6.883045758247375, |
| "loss": 4.3101, |
| "step": 102000 |
| }, |
| { |
| "epoch": 0.19550323486328125, |
| "grad_norm": 108.17499542236328, |
| "learning_rate": 4.022493362426758e-05, |
| "lookahead_loss": 6.915028656959533, |
| "loss": 4.332, |
| "step": 102500 |
| }, |
| { |
| "epoch": 0.1964569091796875, |
| "grad_norm": 79.96166229248047, |
| "learning_rate": 4.017724990844727e-05, |
| "lookahead_loss": 6.900765736579895, |
| "loss": 4.3161, |
| "step": 103000 |
| }, |
| { |
| "epoch": 0.19741058349609375, |
| "grad_norm": 69.99280548095703, |
| "learning_rate": 4.012956619262696e-05, |
| "lookahead_loss": 6.891475837707519, |
| "loss": 4.311, |
| "step": 103500 |
| }, |
| { |
| "epoch": 0.1983642578125, |
| "grad_norm": 99.41875457763672, |
| "learning_rate": 4.008188247680664e-05, |
| "lookahead_loss": 6.889204908370972, |
| "loss": 4.3088, |
| "step": 104000 |
| }, |
| { |
| "epoch": 0.19931793212890625, |
| "grad_norm": 48.443599700927734, |
| "learning_rate": 4.003419876098633e-05, |
| "lookahead_loss": 6.882583235740662, |
| "loss": 4.3047, |
| "step": 104500 |
| }, |
| { |
| "epoch": 0.2002716064453125, |
| "grad_norm": 44.56516647338867, |
| "learning_rate": 3.9986515045166014e-05, |
| "lookahead_loss": 6.877967342376709, |
| "loss": 4.295, |
| "step": 105000 |
| }, |
| { |
| "epoch": 0.2002716064453125, |
| "eval_accuracy": 0.03699432485322896, |
| "eval_lookahead_loss": 6.798581778526306, |
| "eval_lookahead_perplexity": 896.574847858134, |
| "eval_loss": 4.24163293838501, |
| "eval_perplexity": 69.52128317222774, |
| "eval_runtime": 518.7451, |
| "eval_samples_per_second": 19.277, |
| "eval_steps_per_second": 4.819, |
| "step": 105000 |
| }, |
| { |
| "epoch": 0.20122528076171875, |
| "grad_norm": 61.94982147216797, |
| "learning_rate": 3.9938831329345705e-05, |
| "lookahead_loss": 6.875483073234558, |
| "loss": 4.2959, |
| "step": 105500 |
| }, |
| { |
| "epoch": 0.202178955078125, |
| "grad_norm": 63.32769012451172, |
| "learning_rate": 3.9891147613525395e-05, |
| "lookahead_loss": 6.872545673370361, |
| "loss": 4.2974, |
| "step": 106000 |
| }, |
| { |
| "epoch": 0.20313262939453125, |
| "grad_norm": 31.93428611755371, |
| "learning_rate": 3.984346389770508e-05, |
| "lookahead_loss": 6.849458250045776, |
| "loss": 4.2811, |
| "step": 106500 |
| }, |
| { |
| "epoch": 0.2040863037109375, |
| "grad_norm": 41.28413772583008, |
| "learning_rate": 3.979578018188477e-05, |
| "lookahead_loss": 6.899335614204407, |
| "loss": 4.3138, |
| "step": 107000 |
| }, |
| { |
| "epoch": 0.20503997802734375, |
| "grad_norm": 74.20625305175781, |
| "learning_rate": 3.974809646606445e-05, |
| "lookahead_loss": 6.874688111305237, |
| "loss": 4.295, |
| "step": 107500 |
| }, |
| { |
| "epoch": 0.20599365234375, |
| "grad_norm": 36.43619155883789, |
| "learning_rate": 3.970041275024414e-05, |
| "lookahead_loss": 6.846071959495545, |
| "loss": 4.2703, |
| "step": 108000 |
| }, |
| { |
| "epoch": 0.20694732666015625, |
| "grad_norm": 53.30854034423828, |
| "learning_rate": 3.965272903442383e-05, |
| "lookahead_loss": 6.856776931762695, |
| "loss": 4.2789, |
| "step": 108500 |
| }, |
| { |
| "epoch": 0.2079010009765625, |
| "grad_norm": 39.38306427001953, |
| "learning_rate": 3.9605045318603516e-05, |
| "lookahead_loss": 6.895358375549316, |
| "loss": 4.3114, |
| "step": 109000 |
| }, |
| { |
| "epoch": 0.20885467529296875, |
| "grad_norm": 48.62928771972656, |
| "learning_rate": 3.9557361602783206e-05, |
| "lookahead_loss": 6.878321517944336, |
| "loss": 4.2947, |
| "step": 109500 |
| }, |
| { |
| "epoch": 0.209808349609375, |
| "grad_norm": 131.33816528320312, |
| "learning_rate": 3.950967788696289e-05, |
| "lookahead_loss": 6.872714728355407, |
| "loss": 4.2937, |
| "step": 110000 |
| }, |
| { |
| "epoch": 0.209808349609375, |
| "eval_accuracy": 0.037959099804305285, |
| "eval_lookahead_loss": 6.77405214061737, |
| "eval_lookahead_perplexity": 874.8497353813099, |
| "eval_loss": 4.2201995849609375, |
| "eval_perplexity": 68.0470641048424, |
| "eval_runtime": 526.7075, |
| "eval_samples_per_second": 18.986, |
| "eval_steps_per_second": 4.746, |
| "step": 110000 |
| }, |
| { |
| "epoch": 0.21076202392578125, |
| "grad_norm": 86.80801391601562, |
| "learning_rate": 3.946199417114258e-05, |
| "lookahead_loss": 6.869027629852295, |
| "loss": 4.2903, |
| "step": 110500 |
| }, |
| { |
| "epoch": 0.2117156982421875, |
| "grad_norm": 174.2025604248047, |
| "learning_rate": 3.941431045532227e-05, |
| "lookahead_loss": 6.891250504493714, |
| "loss": 4.3109, |
| "step": 111000 |
| }, |
| { |
| "epoch": 0.21266937255859375, |
| "grad_norm": 76.27803039550781, |
| "learning_rate": 3.936662673950195e-05, |
| "lookahead_loss": 6.8711349420547485, |
| "loss": 4.2963, |
| "step": 111500 |
| }, |
| { |
| "epoch": 0.213623046875, |
| "grad_norm": 76.03060150146484, |
| "learning_rate": 3.9318943023681643e-05, |
| "lookahead_loss": 6.839546483039856, |
| "loss": 4.2732, |
| "step": 112000 |
| }, |
| { |
| "epoch": 0.21457672119140625, |
| "grad_norm": 128.2604217529297, |
| "learning_rate": 3.927125930786133e-05, |
| "lookahead_loss": 6.838752264976502, |
| "loss": 4.2731, |
| "step": 112500 |
| }, |
| { |
| "epoch": 0.2155303955078125, |
| "grad_norm": 96.75354766845703, |
| "learning_rate": 3.922357559204102e-05, |
| "lookahead_loss": 6.876836185455322, |
| "loss": 4.304, |
| "step": 113000 |
| }, |
| { |
| "epoch": 0.21648406982421875, |
| "grad_norm": 110.94019317626953, |
| "learning_rate": 3.917589187622071e-05, |
| "lookahead_loss": 6.853927268981933, |
| "loss": 4.2887, |
| "step": 113500 |
| }, |
| { |
| "epoch": 0.217437744140625, |
| "grad_norm": 467.76409912109375, |
| "learning_rate": 3.912820816040039e-05, |
| "lookahead_loss": 6.846646924018859, |
| "loss": 4.2818, |
| "step": 114000 |
| }, |
| { |
| "epoch": 0.21839141845703125, |
| "grad_norm": 99.11727905273438, |
| "learning_rate": 3.908052444458008e-05, |
| "lookahead_loss": 6.880916981697083, |
| "loss": 4.3074, |
| "step": 114500 |
| }, |
| { |
| "epoch": 0.2193450927734375, |
| "grad_norm": 51.55413818359375, |
| "learning_rate": 3.9032840728759764e-05, |
| "lookahead_loss": 6.843124849319458, |
| "loss": 4.2807, |
| "step": 115000 |
| }, |
| { |
| "epoch": 0.2193450927734375, |
| "eval_accuracy": 0.03913835616438356, |
| "eval_lookahead_loss": 6.77919933757782, |
| "eval_lookahead_perplexity": 879.3643681647695, |
| "eval_loss": 4.2271575927734375, |
| "eval_perplexity": 68.52218714654228, |
| "eval_runtime": 534.829, |
| "eval_samples_per_second": 18.698, |
| "eval_steps_per_second": 4.674, |
| "step": 115000 |
| }, |
| { |
| "epoch": 0.22029876708984375, |
| "grad_norm": 65.96794891357422, |
| "learning_rate": 3.8985157012939455e-05, |
| "lookahead_loss": 6.810358964920044, |
| "loss": 4.2526, |
| "step": 115500 |
| }, |
| { |
| "epoch": 0.22125244140625, |
| "grad_norm": 108.59892272949219, |
| "learning_rate": 3.8937473297119145e-05, |
| "lookahead_loss": 6.829855125427246, |
| "loss": 4.2744, |
| "step": 116000 |
| }, |
| { |
| "epoch": 0.22220611572265625, |
| "grad_norm": 117.41635131835938, |
| "learning_rate": 3.888978958129883e-05, |
| "lookahead_loss": 6.854491446495056, |
| "loss": 4.2888, |
| "step": 116500 |
| }, |
| { |
| "epoch": 0.2231597900390625, |
| "grad_norm": 89.93463897705078, |
| "learning_rate": 3.884210586547852e-05, |
| "lookahead_loss": 6.824976842880249, |
| "loss": 4.2663, |
| "step": 117000 |
| }, |
| { |
| "epoch": 0.22411346435546875, |
| "grad_norm": 356.25531005859375, |
| "learning_rate": 3.87944221496582e-05, |
| "lookahead_loss": 6.88228607749939, |
| "loss": 4.3139, |
| "step": 117500 |
| }, |
| { |
| "epoch": 0.225067138671875, |
| "grad_norm": 47.11695098876953, |
| "learning_rate": 3.874673843383789e-05, |
| "lookahead_loss": 6.884691130638123, |
| "loss": 4.3185, |
| "step": 118000 |
| }, |
| { |
| "epoch": 0.22602081298828125, |
| "grad_norm": 328.4604797363281, |
| "learning_rate": 3.869905471801758e-05, |
| "lookahead_loss": 6.9153993072509765, |
| "loss": 4.3428, |
| "step": 118500 |
| }, |
| { |
| "epoch": 0.2269744873046875, |
| "grad_norm": 227.2041778564453, |
| "learning_rate": 3.8651371002197266e-05, |
| "lookahead_loss": 6.880539307594299, |
| "loss": 4.3116, |
| "step": 119000 |
| }, |
| { |
| "epoch": 0.22792816162109375, |
| "grad_norm": 37.20756530761719, |
| "learning_rate": 3.8603687286376956e-05, |
| "lookahead_loss": 6.872707705497742, |
| "loss": 4.3063, |
| "step": 119500 |
| }, |
| { |
| "epoch": 0.2288818359375, |
| "grad_norm": 38.43456268310547, |
| "learning_rate": 3.855600357055664e-05, |
| "lookahead_loss": 6.884976079940796, |
| "loss": 4.3136, |
| "step": 120000 |
| }, |
| { |
| "epoch": 0.2288818359375, |
| "eval_accuracy": 0.0384091976516634, |
| "eval_lookahead_loss": 6.786619314384461, |
| "eval_lookahead_perplexity": 885.9134985314373, |
| "eval_loss": 4.235724449157715, |
| "eval_perplexity": 69.11172853558656, |
| "eval_runtime": 527.4272, |
| "eval_samples_per_second": 18.96, |
| "eval_steps_per_second": 4.74, |
| "step": 120000 |
| }, |
| { |
| "epoch": 0.22983551025390625, |
| "grad_norm": 53.289703369140625, |
| "learning_rate": 3.850831985473633e-05, |
| "lookahead_loss": 6.8849776201248165, |
| "loss": 4.3135, |
| "step": 120500 |
| }, |
| { |
| "epoch": 0.2307891845703125, |
| "grad_norm": 108.9170150756836, |
| "learning_rate": 3.846063613891602e-05, |
| "lookahead_loss": 6.875076312065125, |
| "loss": 4.3031, |
| "step": 121000 |
| }, |
| { |
| "epoch": 0.23174285888671875, |
| "grad_norm": 142.25738525390625, |
| "learning_rate": 3.84129524230957e-05, |
| "lookahead_loss": 6.858631893157959, |
| "loss": 4.2926, |
| "step": 121500 |
| }, |
| { |
| "epoch": 0.232696533203125, |
| "grad_norm": 189.4082489013672, |
| "learning_rate": 3.8365268707275393e-05, |
| "lookahead_loss": 6.873845464706421, |
| "loss": 4.3063, |
| "step": 122000 |
| }, |
| { |
| "epoch": 0.23365020751953125, |
| "grad_norm": 63.17145538330078, |
| "learning_rate": 3.831758499145508e-05, |
| "lookahead_loss": 6.905432909011841, |
| "loss": 4.3369, |
| "step": 122500 |
| }, |
| { |
| "epoch": 0.2346038818359375, |
| "grad_norm": 112.66239929199219, |
| "learning_rate": 3.826990127563477e-05, |
| "lookahead_loss": 6.881167071342468, |
| "loss": 4.3141, |
| "step": 123000 |
| }, |
| { |
| "epoch": 0.23555755615234375, |
| "grad_norm": 33.97451400756836, |
| "learning_rate": 3.822221755981446e-05, |
| "lookahead_loss": 6.895055487632751, |
| "loss": 4.3264, |
| "step": 123500 |
| }, |
| { |
| "epoch": 0.23651123046875, |
| "grad_norm": 87.62593841552734, |
| "learning_rate": 3.817453384399414e-05, |
| "lookahead_loss": 6.887056506156921, |
| "loss": 4.3136, |
| "step": 124000 |
| }, |
| { |
| "epoch": 0.23746490478515625, |
| "grad_norm": 35.62492752075195, |
| "learning_rate": 3.812685012817383e-05, |
| "lookahead_loss": 6.860579108238221, |
| "loss": 4.2943, |
| "step": 124500 |
| }, |
| { |
| "epoch": 0.2384185791015625, |
| "grad_norm": 91.56244659423828, |
| "learning_rate": 3.8079166412353514e-05, |
| "lookahead_loss": 6.871204349517822, |
| "loss": 4.3081, |
| "step": 125000 |
| }, |
| { |
| "epoch": 0.2384185791015625, |
| "eval_accuracy": 0.038136986301369864, |
| "eval_lookahead_loss": 6.790317520332336, |
| "eval_lookahead_perplexity": 889.1958547747105, |
| "eval_loss": 4.242990016937256, |
| "eval_perplexity": 69.61569306087364, |
| "eval_runtime": 529.0281, |
| "eval_samples_per_second": 18.903, |
| "eval_steps_per_second": 4.726, |
| "step": 125000 |
| }, |
| { |
| "epoch": 0.23937225341796875, |
| "grad_norm": 80.9436264038086, |
| "learning_rate": 3.8031482696533205e-05, |
| "lookahead_loss": 6.889874279975891, |
| "loss": 4.3146, |
| "step": 125500 |
| }, |
| { |
| "epoch": 0.240325927734375, |
| "grad_norm": 87.64212036132812, |
| "learning_rate": 3.7983798980712895e-05, |
| "lookahead_loss": 6.855912109375, |
| "loss": 4.2922, |
| "step": 126000 |
| }, |
| { |
| "epoch": 0.24127960205078125, |
| "grad_norm": 101.43157196044922, |
| "learning_rate": 3.793611526489258e-05, |
| "lookahead_loss": 6.882770585060119, |
| "loss": 4.3151, |
| "step": 126500 |
| }, |
| { |
| "epoch": 0.2422332763671875, |
| "grad_norm": 61.23059844970703, |
| "learning_rate": 3.788843154907227e-05, |
| "lookahead_loss": 6.875010539054871, |
| "loss": 4.3075, |
| "step": 127000 |
| }, |
| { |
| "epoch": 0.24318695068359375, |
| "grad_norm": 37.92524337768555, |
| "learning_rate": 3.784074783325195e-05, |
| "lookahead_loss": 6.851895881652832, |
| "loss": 4.2942, |
| "step": 127500 |
| }, |
| { |
| "epoch": 0.244140625, |
| "grad_norm": 131.681396484375, |
| "learning_rate": 3.779306411743164e-05, |
| "lookahead_loss": 6.882919793128967, |
| "loss": 4.3138, |
| "step": 128000 |
| }, |
| { |
| "epoch": 0.24509429931640625, |
| "grad_norm": 119.59640502929688, |
| "learning_rate": 3.774538040161133e-05, |
| "lookahead_loss": 6.8536221790313725, |
| "loss": 4.2997, |
| "step": 128500 |
| }, |
| { |
| "epoch": 0.2460479736328125, |
| "grad_norm": 114.87542724609375, |
| "learning_rate": 3.7697696685791016e-05, |
| "lookahead_loss": 6.8760291271209715, |
| "loss": 4.3095, |
| "step": 129000 |
| }, |
| { |
| "epoch": 0.24700164794921875, |
| "grad_norm": 137.85009765625, |
| "learning_rate": 3.7650012969970706e-05, |
| "lookahead_loss": 6.860938015937805, |
| "loss": 4.3048, |
| "step": 129500 |
| }, |
| { |
| "epoch": 0.247955322265625, |
| "grad_norm": 113.33018493652344, |
| "learning_rate": 3.760232925415039e-05, |
| "lookahead_loss": 6.898747537612915, |
| "loss": 4.3328, |
| "step": 130000 |
| }, |
| { |
| "epoch": 0.247955322265625, |
| "eval_accuracy": 0.039609001956947165, |
| "eval_lookahead_loss": 6.783533498001098, |
| "eval_lookahead_perplexity": 883.1839457587195, |
| "eval_loss": 4.240610599517822, |
| "eval_perplexity": 69.45024518098104, |
| "eval_runtime": 622.7942, |
| "eval_samples_per_second": 16.057, |
| "eval_steps_per_second": 4.014, |
| "step": 130000 |
| }, |
| { |
| "epoch": 0.24890899658203125, |
| "grad_norm": 50.265419006347656, |
| "learning_rate": 3.755464553833008e-05, |
| "lookahead_loss": 6.8634723825454715, |
| "loss": 4.3059, |
| "step": 130500 |
| }, |
| { |
| "epoch": 0.2498626708984375, |
| "grad_norm": 96.71109771728516, |
| "learning_rate": 3.750696182250977e-05, |
| "lookahead_loss": 6.861883195877075, |
| "loss": 4.3064, |
| "step": 131000 |
| }, |
| { |
| "epoch": 0.25081634521484375, |
| "grad_norm": 59.87166213989258, |
| "learning_rate": 3.745927810668945e-05, |
| "lookahead_loss": 6.860560440063477, |
| "loss": 4.304, |
| "step": 131500 |
| }, |
| { |
| "epoch": 0.25177001953125, |
| "grad_norm": 85.31575775146484, |
| "learning_rate": 3.7411594390869143e-05, |
| "lookahead_loss": 6.855814200401306, |
| "loss": 4.2988, |
| "step": 132000 |
| }, |
| { |
| "epoch": 0.25272369384765625, |
| "grad_norm": 72.10446166992188, |
| "learning_rate": 3.736391067504883e-05, |
| "lookahead_loss": 6.842546007156372, |
| "loss": 4.2901, |
| "step": 132500 |
| }, |
| { |
| "epoch": 0.2536773681640625, |
| "grad_norm": 212.02857971191406, |
| "learning_rate": 3.731622695922852e-05, |
| "lookahead_loss": 6.884550764083862, |
| "loss": 4.3222, |
| "step": 133000 |
| }, |
| { |
| "epoch": 0.25463104248046875, |
| "grad_norm": 140.139892578125, |
| "learning_rate": 3.726854324340821e-05, |
| "lookahead_loss": 6.8949517126083375, |
| "loss": 4.3358, |
| "step": 133500 |
| }, |
| { |
| "epoch": 0.255584716796875, |
| "grad_norm": 334.65423583984375, |
| "learning_rate": 3.722085952758789e-05, |
| "lookahead_loss": 6.917874977111817, |
| "loss": 4.3551, |
| "step": 134000 |
| }, |
| { |
| "epoch": 0.25653839111328125, |
| "grad_norm": 187.89089965820312, |
| "learning_rate": 3.717317581176758e-05, |
| "lookahead_loss": 6.907001503944397, |
| "loss": 4.3446, |
| "step": 134500 |
| }, |
| { |
| "epoch": 0.2574920654296875, |
| "grad_norm": 81.5361557006836, |
| "learning_rate": 3.7125492095947264e-05, |
| "lookahead_loss": 6.880993166923523, |
| "loss": 4.3221, |
| "step": 135000 |
| }, |
| { |
| "epoch": 0.2574920654296875, |
| "eval_accuracy": 0.04001037181996086, |
| "eval_lookahead_loss": 6.775381837844849, |
| "eval_lookahead_perplexity": 876.0137944004181, |
| "eval_loss": 4.236556053161621, |
| "eval_perplexity": 69.16922603001332, |
| "eval_runtime": 570.0977, |
| "eval_samples_per_second": 17.541, |
| "eval_steps_per_second": 4.385, |
| "step": 135000 |
| }, |
| { |
| "epoch": 0.25844573974609375, |
| "grad_norm": 163.2711639404297, |
| "learning_rate": 3.7077808380126955e-05, |
| "lookahead_loss": 6.8826041498184205, |
| "loss": 4.3206, |
| "step": 135500 |
| }, |
| { |
| "epoch": 0.2593994140625, |
| "grad_norm": 79.22199249267578, |
| "learning_rate": 3.7030124664306645e-05, |
| "lookahead_loss": 6.8575506048202515, |
| "loss": 4.3033, |
| "step": 136000 |
| }, |
| { |
| "epoch": 0.26035308837890625, |
| "grad_norm": 110.32946014404297, |
| "learning_rate": 3.698244094848633e-05, |
| "lookahead_loss": 6.846091943740845, |
| "loss": 4.2959, |
| "step": 136500 |
| }, |
| { |
| "epoch": 0.2613067626953125, |
| "grad_norm": 322.8215637207031, |
| "learning_rate": 3.693475723266602e-05, |
| "lookahead_loss": 6.882950247764588, |
| "loss": 4.3237, |
| "step": 137000 |
| }, |
| { |
| "epoch": 0.26226043701171875, |
| "grad_norm": 166.58958435058594, |
| "learning_rate": 3.68870735168457e-05, |
| "lookahead_loss": 6.921587300300598, |
| "loss": 4.3546, |
| "step": 137500 |
| }, |
| { |
| "epoch": 0.263214111328125, |
| "grad_norm": 58.52566909790039, |
| "learning_rate": 3.683938980102539e-05, |
| "lookahead_loss": 6.886538036346436, |
| "loss": 4.3282, |
| "step": 138000 |
| }, |
| { |
| "epoch": 0.26416778564453125, |
| "grad_norm": 42.72242736816406, |
| "learning_rate": 3.679170608520508e-05, |
| "lookahead_loss": 6.900633343696594, |
| "loss": 4.3373, |
| "step": 138500 |
| }, |
| { |
| "epoch": 0.2651214599609375, |
| "grad_norm": 149.67503356933594, |
| "learning_rate": 3.6744022369384766e-05, |
| "lookahead_loss": 6.817947501182556, |
| "loss": 4.2724, |
| "step": 139000 |
| }, |
| { |
| "epoch": 0.26607513427734375, |
| "grad_norm": 72.29381561279297, |
| "learning_rate": 3.6696338653564456e-05, |
| "lookahead_loss": 6.917904253959656, |
| "loss": 4.3484, |
| "step": 139500 |
| }, |
| { |
| "epoch": 0.26702880859375, |
| "grad_norm": 122.92243194580078, |
| "learning_rate": 3.664865493774414e-05, |
| "lookahead_loss": 6.875696827888489, |
| "loss": 4.3163, |
| "step": 140000 |
| }, |
| { |
| "epoch": 0.26702880859375, |
| "eval_accuracy": 0.03886399217221135, |
| "eval_lookahead_loss": 6.780196311759949, |
| "eval_lookahead_perplexity": 880.2415089071973, |
| "eval_loss": 4.2404398918151855, |
| "eval_perplexity": 69.43839050105001, |
| "eval_runtime": 582.6096, |
| "eval_samples_per_second": 17.164, |
| "eval_steps_per_second": 4.291, |
| "step": 140000 |
| }, |
| { |
| "epoch": 0.26798248291015625, |
| "grad_norm": 46.50304412841797, |
| "learning_rate": 3.660097122192383e-05, |
| "lookahead_loss": 6.886635174751282, |
| "loss": 4.323, |
| "step": 140500 |
| }, |
| { |
| "epoch": 0.2689361572265625, |
| "grad_norm": 52.92224884033203, |
| "learning_rate": 3.655328750610352e-05, |
| "lookahead_loss": 6.881099202156067, |
| "loss": 4.3205, |
| "step": 141000 |
| }, |
| { |
| "epoch": 0.26988983154296875, |
| "grad_norm": 44.822757720947266, |
| "learning_rate": 3.65056037902832e-05, |
| "lookahead_loss": 6.892631261825562, |
| "loss": 4.3335, |
| "step": 141500 |
| }, |
| { |
| "epoch": 0.270843505859375, |
| "grad_norm": 69.46161651611328, |
| "learning_rate": 3.6457920074462893e-05, |
| "lookahead_loss": 6.876191157341004, |
| "loss": 4.3232, |
| "step": 142000 |
| }, |
| { |
| "epoch": 0.27179718017578125, |
| "grad_norm": 87.14396667480469, |
| "learning_rate": 3.641023635864258e-05, |
| "lookahead_loss": 6.874067866325379, |
| "loss": 4.3212, |
| "step": 142500 |
| }, |
| { |
| "epoch": 0.2727508544921875, |
| "grad_norm": 89.53559112548828, |
| "learning_rate": 3.636255264282227e-05, |
| "lookahead_loss": 6.858251659393311, |
| "loss": 4.3082, |
| "step": 143000 |
| }, |
| { |
| "epoch": 0.27370452880859375, |
| "grad_norm": 29.0264835357666, |
| "learning_rate": 3.631486892700196e-05, |
| "lookahead_loss": 6.89702485370636, |
| "loss": 4.3446, |
| "step": 143500 |
| }, |
| { |
| "epoch": 0.274658203125, |
| "grad_norm": 57.86652374267578, |
| "learning_rate": 3.626718521118164e-05, |
| "lookahead_loss": 6.8783858890533445, |
| "loss": 4.3212, |
| "step": 144000 |
| }, |
| { |
| "epoch": 0.27561187744140625, |
| "grad_norm": 57.66053009033203, |
| "learning_rate": 3.621950149536133e-05, |
| "lookahead_loss": 6.811130007743835, |
| "loss": 4.2648, |
| "step": 144500 |
| }, |
| { |
| "epoch": 0.2765655517578125, |
| "grad_norm": 102.4299087524414, |
| "learning_rate": 3.6171817779541014e-05, |
| "lookahead_loss": 6.850858941078186, |
| "loss": 4.2954, |
| "step": 145000 |
| }, |
| { |
| "epoch": 0.2765655517578125, |
| "eval_accuracy": 0.038117221135029355, |
| "eval_lookahead_loss": 6.768327401542663, |
| "eval_lookahead_perplexity": 869.8557571665997, |
| "eval_loss": 4.22998571395874, |
| "eval_perplexity": 68.71625048364461, |
| "eval_runtime": 536.0831, |
| "eval_samples_per_second": 18.654, |
| "eval_steps_per_second": 4.663, |
| "step": 145000 |
| }, |
| { |
| "epoch": 0.27751922607421875, |
| "grad_norm": 64.26153564453125, |
| "learning_rate": 3.6124134063720705e-05, |
| "lookahead_loss": 6.823865992546081, |
| "loss": 4.2736, |
| "step": 145500 |
| }, |
| { |
| "epoch": 0.278472900390625, |
| "grad_norm": 162.3262481689453, |
| "learning_rate": 3.6076450347900395e-05, |
| "lookahead_loss": 6.829603230476379, |
| "loss": 4.2828, |
| "step": 146000 |
| }, |
| { |
| "epoch": 0.27942657470703125, |
| "grad_norm": 31.876447677612305, |
| "learning_rate": 3.602876663208008e-05, |
| "lookahead_loss": 6.834772867202759, |
| "loss": 4.2822, |
| "step": 146500 |
| }, |
| { |
| "epoch": 0.2803802490234375, |
| "grad_norm": 25.226497650146484, |
| "learning_rate": 3.598108291625977e-05, |
| "lookahead_loss": 6.865582668304444, |
| "loss": 4.3144, |
| "step": 147000 |
| }, |
| { |
| "epoch": 0.28133392333984375, |
| "grad_norm": 28.350418090820312, |
| "learning_rate": 3.593339920043945e-05, |
| "lookahead_loss": 6.832012493133545, |
| "loss": 4.2802, |
| "step": 147500 |
| }, |
| { |
| "epoch": 0.28228759765625, |
| "grad_norm": 42.95259475708008, |
| "learning_rate": 3.588571548461914e-05, |
| "lookahead_loss": 6.801559608459472, |
| "loss": 4.25, |
| "step": 148000 |
| }, |
| { |
| "epoch": 0.28324127197265625, |
| "grad_norm": 140.41038513183594, |
| "learning_rate": 3.583803176879883e-05, |
| "lookahead_loss": 6.815518292427063, |
| "loss": 4.2627, |
| "step": 148500 |
| }, |
| { |
| "epoch": 0.2841949462890625, |
| "grad_norm": 93.87850952148438, |
| "learning_rate": 3.5790348052978516e-05, |
| "lookahead_loss": 6.875488904953003, |
| "loss": 4.3141, |
| "step": 149000 |
| }, |
| { |
| "epoch": 0.28514862060546875, |
| "grad_norm": 85.48013305664062, |
| "learning_rate": 3.5742664337158206e-05, |
| "lookahead_loss": 6.8670156326293945, |
| "loss": 4.3032, |
| "step": 149500 |
| }, |
| { |
| "epoch": 0.286102294921875, |
| "grad_norm": 47.62663650512695, |
| "learning_rate": 3.569498062133789e-05, |
| "lookahead_loss": 6.878105813026428, |
| "loss": 4.3153, |
| "step": 150000 |
| }, |
| { |
| "epoch": 0.286102294921875, |
| "eval_accuracy": 0.039768884540117413, |
| "eval_lookahead_loss": 6.756953916549683, |
| "eval_lookahead_perplexity": 860.0185136791051, |
| "eval_loss": 4.220040321350098, |
| "eval_perplexity": 68.03622754666361, |
| "eval_runtime": 755.0665, |
| "eval_samples_per_second": 13.244, |
| "eval_steps_per_second": 3.311, |
| "step": 150000 |
| }, |
| { |
| "epoch": 0.28705596923828125, |
| "grad_norm": 74.79360961914062, |
| "learning_rate": 3.564729690551758e-05, |
| "lookahead_loss": 6.859174869537354, |
| "loss": 4.2938, |
| "step": 150500 |
| }, |
| { |
| "epoch": 0.2880096435546875, |
| "grad_norm": 87.72206115722656, |
| "learning_rate": 3.559961318969727e-05, |
| "lookahead_loss": 6.861918277740479, |
| "loss": 4.2972, |
| "step": 151000 |
| }, |
| { |
| "epoch": 0.28896331787109375, |
| "grad_norm": 17.82638168334961, |
| "learning_rate": 3.555192947387695e-05, |
| "lookahead_loss": 6.845822727203369, |
| "loss": 4.2863, |
| "step": 151500 |
| }, |
| { |
| "epoch": 0.2899169921875, |
| "grad_norm": 35.82793426513672, |
| "learning_rate": 3.5504245758056643e-05, |
| "lookahead_loss": 6.879805488586426, |
| "loss": 4.3167, |
| "step": 152000 |
| }, |
| { |
| "epoch": 0.29087066650390625, |
| "grad_norm": 50.11417770385742, |
| "learning_rate": 3.545656204223633e-05, |
| "lookahead_loss": 6.827612632751465, |
| "loss": 4.2696, |
| "step": 152500 |
| }, |
| { |
| "epoch": 0.2918243408203125, |
| "grad_norm": 27.509164810180664, |
| "learning_rate": 3.540887832641602e-05, |
| "lookahead_loss": 6.885490052223205, |
| "loss": 4.3208, |
| "step": 153000 |
| }, |
| { |
| "epoch": 0.29277801513671875, |
| "grad_norm": 81.59918212890625, |
| "learning_rate": 3.536119461059571e-05, |
| "lookahead_loss": 6.868545116424561, |
| "loss": 4.3024, |
| "step": 153500 |
| }, |
| { |
| "epoch": 0.293731689453125, |
| "grad_norm": 71.62451934814453, |
| "learning_rate": 3.531351089477539e-05, |
| "lookahead_loss": 6.867383508682251, |
| "loss": 4.3018, |
| "step": 154000 |
| }, |
| { |
| "epoch": 0.29468536376953125, |
| "grad_norm": 129.45018005371094, |
| "learning_rate": 3.526582717895508e-05, |
| "lookahead_loss": 6.869330671310425, |
| "loss": 4.3004, |
| "step": 154500 |
| }, |
| { |
| "epoch": 0.2956390380859375, |
| "grad_norm": 50.30118179321289, |
| "learning_rate": 3.5218143463134764e-05, |
| "lookahead_loss": 6.837138333320618, |
| "loss": 4.2762, |
| "step": 155000 |
| }, |
| { |
| "epoch": 0.2956390380859375, |
| "eval_accuracy": 0.039083365949119375, |
| "eval_lookahead_loss": 6.751848223304749, |
| "eval_lookahead_perplexity": 855.6387134359732, |
| "eval_loss": 4.212831974029541, |
| "eval_perplexity": 67.54756213904808, |
| "eval_runtime": 704.2685, |
| "eval_samples_per_second": 14.199, |
| "eval_steps_per_second": 3.55, |
| "step": 155000 |
| }, |
| { |
| "epoch": 0.29659271240234375, |
| "grad_norm": 46.7334098815918, |
| "learning_rate": 3.5170459747314455e-05, |
| "lookahead_loss": 6.823378650665283, |
| "loss": 4.267, |
| "step": 155500 |
| }, |
| { |
| "epoch": 0.29754638671875, |
| "grad_norm": 54.036678314208984, |
| "learning_rate": 3.5122776031494145e-05, |
| "lookahead_loss": 6.854256680488587, |
| "loss": 4.2905, |
| "step": 156000 |
| }, |
| { |
| "epoch": 0.29850006103515625, |
| "grad_norm": 178.30545043945312, |
| "learning_rate": 3.507509231567383e-05, |
| "lookahead_loss": 6.830217700958252, |
| "loss": 4.2744, |
| "step": 156500 |
| }, |
| { |
| "epoch": 0.2994537353515625, |
| "grad_norm": 154.18592834472656, |
| "learning_rate": 3.502740859985352e-05, |
| "lookahead_loss": 6.832935528755188, |
| "loss": 4.274, |
| "step": 157000 |
| }, |
| { |
| "epoch": 0.30040740966796875, |
| "grad_norm": 171.55133056640625, |
| "learning_rate": 3.49797248840332e-05, |
| "lookahead_loss": 6.867803561210632, |
| "loss": 4.3103, |
| "step": 157500 |
| }, |
| { |
| "epoch": 0.301361083984375, |
| "grad_norm": 168.18153381347656, |
| "learning_rate": 3.493204116821289e-05, |
| "lookahead_loss": 6.825819746017456, |
| "loss": 4.2771, |
| "step": 158000 |
| }, |
| { |
| "epoch": 0.30231475830078125, |
| "grad_norm": 227.7982177734375, |
| "learning_rate": 3.488435745239258e-05, |
| "lookahead_loss": 6.852307137489319, |
| "loss": 4.2941, |
| "step": 158500 |
| }, |
| { |
| "epoch": 0.3032684326171875, |
| "grad_norm": 115.91482543945312, |
| "learning_rate": 3.4836673736572266e-05, |
| "lookahead_loss": 6.829904661178589, |
| "loss": 4.2803, |
| "step": 159000 |
| }, |
| { |
| "epoch": 0.30422210693359375, |
| "grad_norm": 63.71833419799805, |
| "learning_rate": 3.4788990020751956e-05, |
| "lookahead_loss": 6.797525130271912, |
| "loss": 4.2612, |
| "step": 159500 |
| }, |
| { |
| "epoch": 0.30517578125, |
| "grad_norm": 235.1917266845703, |
| "learning_rate": 3.474130630493164e-05, |
| "lookahead_loss": 6.840181548118592, |
| "loss": 4.2877, |
| "step": 160000 |
| }, |
| { |
| "epoch": 0.30517578125, |
| "eval_accuracy": 0.03816242661448141, |
| "eval_lookahead_loss": 6.762623178863525, |
| "eval_lookahead_perplexity": 864.9080311103977, |
| "eval_loss": 4.227556228637695, |
| "eval_perplexity": 68.5495079930091, |
| "eval_runtime": 636.3906, |
| "eval_samples_per_second": 15.714, |
| "eval_steps_per_second": 3.928, |
| "step": 160000 |
| }, |
| { |
| "epoch": 0.30612945556640625, |
| "grad_norm": 102.3113021850586, |
| "learning_rate": 3.469362258911133e-05, |
| "lookahead_loss": 6.838310770988464, |
| "loss": 4.2917, |
| "step": 160500 |
| }, |
| { |
| "epoch": 0.3070831298828125, |
| "grad_norm": 105.63389587402344, |
| "learning_rate": 3.464593887329102e-05, |
| "lookahead_loss": 6.832859290122986, |
| "loss": 4.2947, |
| "step": 161000 |
| }, |
| { |
| "epoch": 0.30803680419921875, |
| "grad_norm": 296.0216369628906, |
| "learning_rate": 3.45982551574707e-05, |
| "lookahead_loss": 6.84702619934082, |
| "loss": 4.2954, |
| "step": 161500 |
| }, |
| { |
| "epoch": 0.308990478515625, |
| "grad_norm": 63.36662673950195, |
| "learning_rate": 3.4550571441650393e-05, |
| "lookahead_loss": 6.837830230712891, |
| "loss": 4.2894, |
| "step": 162000 |
| }, |
| { |
| "epoch": 0.30994415283203125, |
| "grad_norm": 316.7913818359375, |
| "learning_rate": 3.450288772583008e-05, |
| "lookahead_loss": 6.814890047073364, |
| "loss": 4.2654, |
| "step": 162500 |
| }, |
| { |
| "epoch": 0.3108978271484375, |
| "grad_norm": 52.75489044189453, |
| "learning_rate": 3.445520401000977e-05, |
| "lookahead_loss": 6.821509984016418, |
| "loss": 4.2734, |
| "step": 163000 |
| }, |
| { |
| "epoch": 0.31185150146484375, |
| "grad_norm": 506.84539794921875, |
| "learning_rate": 3.440752029418946e-05, |
| "lookahead_loss": 6.817704682350159, |
| "loss": 4.2689, |
| "step": 163500 |
| }, |
| { |
| "epoch": 0.31280517578125, |
| "grad_norm": 79.96004486083984, |
| "learning_rate": 3.435983657836914e-05, |
| "lookahead_loss": 6.820456267356873, |
| "loss": 4.2836, |
| "step": 164000 |
| }, |
| { |
| "epoch": 0.31375885009765625, |
| "grad_norm": 49.6429328918457, |
| "learning_rate": 3.431215286254883e-05, |
| "lookahead_loss": 6.8662997255325315, |
| "loss": 4.3089, |
| "step": 164500 |
| }, |
| { |
| "epoch": 0.3147125244140625, |
| "grad_norm": 107.62309265136719, |
| "learning_rate": 3.4264469146728514e-05, |
| "lookahead_loss": 6.858008871078491, |
| "loss": 4.3023, |
| "step": 165000 |
| }, |
| { |
| "epoch": 0.3147125244140625, |
| "eval_accuracy": 0.03948473581213307, |
| "eval_lookahead_loss": 6.755647894287109, |
| "eval_lookahead_perplexity": 858.8960434990861, |
| "eval_loss": 4.223168849945068, |
| "eval_perplexity": 68.24941413632452, |
| "eval_runtime": 600.9731, |
| "eval_samples_per_second": 16.64, |
| "eval_steps_per_second": 4.16, |
| "step": 165000 |
| }, |
| { |
| "epoch": 0.31566619873046875, |
| "grad_norm": 128.85560607910156, |
| "learning_rate": 3.4216785430908205e-05, |
| "lookahead_loss": 6.865882458686829, |
| "loss": 4.3073, |
| "step": 165500 |
| }, |
| { |
| "epoch": 0.316619873046875, |
| "grad_norm": 131.54159545898438, |
| "learning_rate": 3.4169101715087895e-05, |
| "lookahead_loss": 6.827650568962097, |
| "loss": 4.2802, |
| "step": 166000 |
| }, |
| { |
| "epoch": 0.31757354736328125, |
| "grad_norm": 344.3023986816406, |
| "learning_rate": 3.412141799926758e-05, |
| "lookahead_loss": 6.854513771057129, |
| "loss": 4.2981, |
| "step": 166500 |
| }, |
| { |
| "epoch": 0.3185272216796875, |
| "grad_norm": 369.9884033203125, |
| "learning_rate": 3.407373428344727e-05, |
| "lookahead_loss": 6.850142656326294, |
| "loss": 4.2973, |
| "step": 167000 |
| }, |
| { |
| "epoch": 0.31948089599609375, |
| "grad_norm": 113.85972595214844, |
| "learning_rate": 3.402605056762695e-05, |
| "lookahead_loss": 6.861585620880127, |
| "loss": 4.3067, |
| "step": 167500 |
| }, |
| { |
| "epoch": 0.3204345703125, |
| "grad_norm": 72.0103759765625, |
| "learning_rate": 3.397836685180664e-05, |
| "lookahead_loss": 6.870785166740418, |
| "loss": 4.3156, |
| "step": 168000 |
| }, |
| { |
| "epoch": 0.32138824462890625, |
| "grad_norm": 98.10789489746094, |
| "learning_rate": 3.393068313598633e-05, |
| "lookahead_loss": 6.855742107868195, |
| "loss": 4.3076, |
| "step": 168500 |
| }, |
| { |
| "epoch": 0.3223419189453125, |
| "grad_norm": 389.3531799316406, |
| "learning_rate": 3.3882999420166016e-05, |
| "lookahead_loss": 6.877326703071594, |
| "loss": 4.3275, |
| "step": 169000 |
| }, |
| { |
| "epoch": 0.32329559326171875, |
| "grad_norm": 155.88134765625, |
| "learning_rate": 3.3835315704345706e-05, |
| "lookahead_loss": 6.849375417709351, |
| "loss": 4.3008, |
| "step": 169500 |
| }, |
| { |
| "epoch": 0.324249267578125, |
| "grad_norm": 695.6582641601562, |
| "learning_rate": 3.378763198852539e-05, |
| "lookahead_loss": 6.890757197380066, |
| "loss": 4.337, |
| "step": 170000 |
| }, |
| { |
| "epoch": 0.324249267578125, |
| "eval_accuracy": 0.039646575342465754, |
| "eval_lookahead_loss": 6.776586002159118, |
| "eval_lookahead_perplexity": 877.0692943205709, |
| "eval_loss": 4.247872829437256, |
| "eval_perplexity": 69.95644467325123, |
| "eval_runtime": 586.8671, |
| "eval_samples_per_second": 17.04, |
| "eval_steps_per_second": 4.26, |
| "step": 170000 |
| }, |
| { |
| "epoch": 0.32520294189453125, |
| "grad_norm": 148.5786590576172, |
| "learning_rate": 3.373994827270508e-05, |
| "lookahead_loss": 6.910296322822571, |
| "loss": 4.3571, |
| "step": 170500 |
| }, |
| { |
| "epoch": 0.3261566162109375, |
| "grad_norm": 182.1770782470703, |
| "learning_rate": 3.369226455688477e-05, |
| "lookahead_loss": 6.906925064086914, |
| "loss": 4.3553, |
| "step": 171000 |
| }, |
| { |
| "epoch": 0.32711029052734375, |
| "grad_norm": 248.4641876220703, |
| "learning_rate": 3.364458084106445e-05, |
| "lookahead_loss": 6.854492978096008, |
| "loss": 4.3063, |
| "step": 171500 |
| }, |
| { |
| "epoch": 0.32806396484375, |
| "grad_norm": 84.90131378173828, |
| "learning_rate": 3.3596897125244143e-05, |
| "lookahead_loss": 6.851985689163208, |
| "loss": 4.3028, |
| "step": 172000 |
| }, |
| { |
| "epoch": 0.32901763916015625, |
| "grad_norm": 538.4048461914062, |
| "learning_rate": 3.354921340942383e-05, |
| "lookahead_loss": 6.860056784629822, |
| "loss": 4.3099, |
| "step": 172500 |
| }, |
| { |
| "epoch": 0.3299713134765625, |
| "grad_norm": 1186.500732421875, |
| "learning_rate": 3.350152969360352e-05, |
| "lookahead_loss": 6.860212516784668, |
| "loss": 4.3107, |
| "step": 173000 |
| }, |
| { |
| "epoch": 0.33092498779296875, |
| "grad_norm": 153.32667541503906, |
| "learning_rate": 3.345384597778321e-05, |
| "lookahead_loss": 6.861251255989075, |
| "loss": 4.316, |
| "step": 173500 |
| }, |
| { |
| "epoch": 0.331878662109375, |
| "grad_norm": 96.35420227050781, |
| "learning_rate": 3.340616226196289e-05, |
| "lookahead_loss": 6.859582260131836, |
| "loss": 4.3105, |
| "step": 174000 |
| }, |
| { |
| "epoch": 0.33283233642578125, |
| "grad_norm": 368.3158264160156, |
| "learning_rate": 3.335847854614258e-05, |
| "lookahead_loss": 6.85998436164856, |
| "loss": 4.312, |
| "step": 174500 |
| }, |
| { |
| "epoch": 0.3337860107421875, |
| "grad_norm": 235.21160888671875, |
| "learning_rate": 3.3310794830322264e-05, |
| "lookahead_loss": 6.835501253128052, |
| "loss": 4.2891, |
| "step": 175000 |
| }, |
| { |
| "epoch": 0.3337860107421875, |
| "eval_accuracy": 0.03834227005870842, |
| "eval_lookahead_loss": 6.764507758712768, |
| "eval_lookahead_perplexity": 866.5395562440892, |
| "eval_loss": 4.234741687774658, |
| "eval_perplexity": 69.04384156147766, |
| "eval_runtime": 561.1431, |
| "eval_samples_per_second": 17.821, |
| "eval_steps_per_second": 4.455, |
| "step": 175000 |
| }, |
| { |
| "epoch": 0.33473968505859375, |
| "grad_norm": 536.079833984375, |
| "learning_rate": 3.3263111114501955e-05, |
| "lookahead_loss": 6.877282888412475, |
| "loss": 4.3319, |
| "step": 175500 |
| }, |
| { |
| "epoch": 0.335693359375, |
| "grad_norm": 284.269775390625, |
| "learning_rate": 3.3215427398681645e-05, |
| "lookahead_loss": 6.871447311401367, |
| "loss": 4.3258, |
| "step": 176000 |
| }, |
| { |
| "epoch": 0.33664703369140625, |
| "grad_norm": 144.25616455078125, |
| "learning_rate": 3.316774368286133e-05, |
| "lookahead_loss": 6.84305325126648, |
| "loss": 4.3015, |
| "step": 176500 |
| }, |
| { |
| "epoch": 0.3376007080078125, |
| "grad_norm": 541.6898193359375, |
| "learning_rate": 3.312005996704102e-05, |
| "lookahead_loss": 6.8506303358078, |
| "loss": 4.3046, |
| "step": 177000 |
| }, |
| { |
| "epoch": 0.33855438232421875, |
| "grad_norm": 214.76165771484375, |
| "learning_rate": 3.30723762512207e-05, |
| "lookahead_loss": 6.826534468650818, |
| "loss": 4.2857, |
| "step": 177500 |
| }, |
| { |
| "epoch": 0.339508056640625, |
| "grad_norm": 76.81355285644531, |
| "learning_rate": 3.302469253540039e-05, |
| "lookahead_loss": 6.8005103635787965, |
| "loss": 4.2663, |
| "step": 178000 |
| }, |
| { |
| "epoch": 0.34046173095703125, |
| "grad_norm": 229.3511199951172, |
| "learning_rate": 3.297700881958008e-05, |
| "lookahead_loss": 6.812626602172852, |
| "loss": 4.2819, |
| "step": 178500 |
| }, |
| { |
| "epoch": 0.3414154052734375, |
| "grad_norm": 649.7677001953125, |
| "learning_rate": 3.2929325103759766e-05, |
| "lookahead_loss": 6.839462232589722, |
| "loss": 4.3013, |
| "step": 179000 |
| }, |
| { |
| "epoch": 0.34236907958984375, |
| "grad_norm": 113.12621307373047, |
| "learning_rate": 3.2881641387939456e-05, |
| "lookahead_loss": 6.884343441009522, |
| "loss": 4.3425, |
| "step": 179500 |
| }, |
| { |
| "epoch": 0.34332275390625, |
| "grad_norm": 94.85265350341797, |
| "learning_rate": 3.283395767211914e-05, |
| "lookahead_loss": 6.918980081558227, |
| "loss": 4.3681, |
| "step": 180000 |
| }, |
| { |
| "epoch": 0.34332275390625, |
| "eval_accuracy": 0.03954050880626223, |
| "eval_lookahead_loss": 6.767707035446167, |
| "eval_lookahead_perplexity": 869.3162954951779, |
| "eval_loss": 4.239075660705566, |
| "eval_perplexity": 69.34372507596758, |
| "eval_runtime": 555.8911, |
| "eval_samples_per_second": 17.989, |
| "eval_steps_per_second": 4.497, |
| "step": 180000 |
| }, |
| { |
| "epoch": 0.34427642822265625, |
| "grad_norm": 183.81613159179688, |
| "learning_rate": 3.278627395629883e-05, |
| "lookahead_loss": 6.89797020149231, |
| "loss": 4.3489, |
| "step": 180500 |
| }, |
| { |
| "epoch": 0.3452301025390625, |
| "grad_norm": 56.57142639160156, |
| "learning_rate": 3.273859024047852e-05, |
| "lookahead_loss": 6.923570585250855, |
| "loss": 4.3708, |
| "step": 181000 |
| }, |
| { |
| "epoch": 0.34618377685546875, |
| "grad_norm": 156.73374938964844, |
| "learning_rate": 3.26909065246582e-05, |
| "lookahead_loss": 6.871435987472534, |
| "loss": 4.3269, |
| "step": 181500 |
| }, |
| { |
| "epoch": 0.347137451171875, |
| "grad_norm": 93.79271697998047, |
| "learning_rate": 3.2643222808837893e-05, |
| "lookahead_loss": 6.870373545646667, |
| "loss": 4.3319, |
| "step": 182000 |
| }, |
| { |
| "epoch": 0.34809112548828125, |
| "grad_norm": 359.62310791015625, |
| "learning_rate": 3.259553909301758e-05, |
| "lookahead_loss": 6.889604884147644, |
| "loss": 4.3436, |
| "step": 182500 |
| }, |
| { |
| "epoch": 0.3490447998046875, |
| "grad_norm": 75.86630249023438, |
| "learning_rate": 3.254785537719727e-05, |
| "lookahead_loss": 6.868936008453369, |
| "loss": 4.3267, |
| "step": 183000 |
| }, |
| { |
| "epoch": 0.34999847412109375, |
| "grad_norm": 61.89284896850586, |
| "learning_rate": 3.250017166137696e-05, |
| "lookahead_loss": 6.859222094535828, |
| "loss": 4.3128, |
| "step": 183500 |
| }, |
| { |
| "epoch": 0.3509521484375, |
| "grad_norm": 471.87738037109375, |
| "learning_rate": 3.245248794555664e-05, |
| "lookahead_loss": 6.881396533966065, |
| "loss": 4.3316, |
| "step": 184000 |
| }, |
| { |
| "epoch": 0.35190582275390625, |
| "grad_norm": 87.49971771240234, |
| "learning_rate": 3.240480422973633e-05, |
| "lookahead_loss": 6.857603517532349, |
| "loss": 4.3105, |
| "step": 184500 |
| }, |
| { |
| "epoch": 0.3528594970703125, |
| "grad_norm": 105.48241424560547, |
| "learning_rate": 3.2357120513916014e-05, |
| "lookahead_loss": 6.873289173126221, |
| "loss": 4.3192, |
| "step": 185000 |
| }, |
| { |
| "epoch": 0.3528594970703125, |
| "eval_accuracy": 0.03900039138943249, |
| "eval_lookahead_loss": 6.762678988456726, |
| "eval_lookahead_perplexity": 864.9563026227645, |
| "eval_loss": 4.232372283935547, |
| "eval_perplexity": 68.8804424738722, |
| "eval_runtime": 580.4662, |
| "eval_samples_per_second": 17.228, |
| "eval_steps_per_second": 4.307, |
| "step": 185000 |
| }, |
| { |
| "epoch": 0.35381317138671875, |
| "grad_norm": 63.99024200439453, |
| "learning_rate": 3.2309436798095705e-05, |
| "lookahead_loss": 6.874898074150085, |
| "loss": 4.3221, |
| "step": 185500 |
| }, |
| { |
| "epoch": 0.354766845703125, |
| "grad_norm": 112.1404037475586, |
| "learning_rate": 3.2261753082275395e-05, |
| "lookahead_loss": 6.833945309638977, |
| "loss": 4.2875, |
| "step": 186000 |
| }, |
| { |
| "epoch": 0.35572052001953125, |
| "grad_norm": 73.59690856933594, |
| "learning_rate": 3.221406936645508e-05, |
| "lookahead_loss": 6.851704045295715, |
| "loss": 4.3039, |
| "step": 186500 |
| }, |
| { |
| "epoch": 0.3566741943359375, |
| "grad_norm": 332.00238037109375, |
| "learning_rate": 3.216638565063477e-05, |
| "lookahead_loss": 6.8346173839569095, |
| "loss": 4.2888, |
| "step": 187000 |
| }, |
| { |
| "epoch": 0.35762786865234375, |
| "grad_norm": 124.59075927734375, |
| "learning_rate": 3.211870193481445e-05, |
| "lookahead_loss": 6.851246485710144, |
| "loss": 4.2999, |
| "step": 187500 |
| }, |
| { |
| "epoch": 0.35858154296875, |
| "grad_norm": 63.13421630859375, |
| "learning_rate": 3.207101821899414e-05, |
| "lookahead_loss": 6.851160929679871, |
| "loss": 4.3106, |
| "step": 188000 |
| }, |
| { |
| "epoch": 0.35953521728515625, |
| "grad_norm": 74.12056732177734, |
| "learning_rate": 3.202333450317383e-05, |
| "lookahead_loss": 6.834293618202209, |
| "loss": 4.2894, |
| "step": 188500 |
| }, |
| { |
| "epoch": 0.3604888916015625, |
| "grad_norm": 182.41357421875, |
| "learning_rate": 3.1975650787353516e-05, |
| "lookahead_loss": 6.868130068778992, |
| "loss": 4.3227, |
| "step": 189000 |
| }, |
| { |
| "epoch": 0.36144256591796875, |
| "grad_norm": 672.1762084960938, |
| "learning_rate": 3.1927967071533206e-05, |
| "lookahead_loss": 6.793136254310608, |
| "loss": 4.2627, |
| "step": 189500 |
| }, |
| { |
| "epoch": 0.362396240234375, |
| "grad_norm": 182.69644165039062, |
| "learning_rate": 3.188028335571289e-05, |
| "lookahead_loss": 6.834869485855102, |
| "loss": 4.2911, |
| "step": 190000 |
| }, |
| { |
| "epoch": 0.362396240234375, |
| "eval_accuracy": 0.03822191780821918, |
| "eval_lookahead_loss": 6.758944873428344, |
| "eval_lookahead_perplexity": 861.7324791041506, |
| "eval_loss": 4.230733394622803, |
| "eval_perplexity": 68.76764750732416, |
| "eval_runtime": 605.1013, |
| "eval_samples_per_second": 16.526, |
| "eval_steps_per_second": 4.132, |
| "step": 190000 |
| }, |
| { |
| "epoch": 0.36334991455078125, |
| "grad_norm": 268.18353271484375, |
| "learning_rate": 3.183259963989258e-05, |
| "lookahead_loss": 6.844361530303955, |
| "loss": 4.3009, |
| "step": 190500 |
| }, |
| { |
| "epoch": 0.3643035888671875, |
| "grad_norm": 146.6720428466797, |
| "learning_rate": 3.178491592407227e-05, |
| "lookahead_loss": 6.851699204444885, |
| "loss": 4.3166, |
| "step": 191000 |
| }, |
| { |
| "epoch": 0.36525726318359375, |
| "grad_norm": 130.42713928222656, |
| "learning_rate": 3.173723220825195e-05, |
| "lookahead_loss": 6.85072411441803, |
| "loss": 4.3122, |
| "step": 191500 |
| }, |
| { |
| "epoch": 0.3662109375, |
| "grad_norm": 104.52213287353516, |
| "learning_rate": 3.1689548492431643e-05, |
| "lookahead_loss": 6.830275066375733, |
| "loss": 4.2914, |
| "step": 192000 |
| }, |
| { |
| "epoch": 0.36716461181640625, |
| "grad_norm": 119.60869598388672, |
| "learning_rate": 3.164186477661133e-05, |
| "lookahead_loss": 6.770204667091369, |
| "loss": 4.2413, |
| "step": 192500 |
| }, |
| { |
| "epoch": 0.3681182861328125, |
| "grad_norm": 201.7767791748047, |
| "learning_rate": 3.159418106079102e-05, |
| "lookahead_loss": 6.751912678718567, |
| "loss": 4.2288, |
| "step": 193000 |
| }, |
| { |
| "epoch": 0.36907196044921875, |
| "grad_norm": 286.12420654296875, |
| "learning_rate": 3.154649734497071e-05, |
| "lookahead_loss": 6.794153441429138, |
| "loss": 4.2636, |
| "step": 193500 |
| }, |
| { |
| "epoch": 0.370025634765625, |
| "grad_norm": 160.72146606445312, |
| "learning_rate": 3.149881362915039e-05, |
| "lookahead_loss": 6.807093242645264, |
| "loss": 4.2732, |
| "step": 194000 |
| }, |
| { |
| "epoch": 0.37097930908203125, |
| "grad_norm": 55.39329528808594, |
| "learning_rate": 3.145112991333008e-05, |
| "lookahead_loss": 6.858504026412964, |
| "loss": 4.3133, |
| "step": 194500 |
| }, |
| { |
| "epoch": 0.3719329833984375, |
| "grad_norm": 394.2020568847656, |
| "learning_rate": 3.1403446197509764e-05, |
| "lookahead_loss": 6.8854999828338626, |
| "loss": 4.332, |
| "step": 195000 |
| }, |
| { |
| "epoch": 0.3719329833984375, |
| "eval_accuracy": 0.039397260273972605, |
| "eval_lookahead_loss": 6.749859248924255, |
| "eval_lookahead_perplexity": 853.9385612960015, |
| "eval_loss": 4.2233781814575195, |
| "eval_perplexity": 68.26370238484773, |
| "eval_runtime": 595.552, |
| "eval_samples_per_second": 16.791, |
| "eval_steps_per_second": 4.198, |
| "step": 195000 |
| }, |
| { |
| "epoch": 0.37288665771484375, |
| "grad_norm": 56.967472076416016, |
| "learning_rate": 3.1355762481689455e-05, |
| "lookahead_loss": 6.839566382408142, |
| "loss": 4.2956, |
| "step": 195500 |
| }, |
| { |
| "epoch": 0.37384033203125, |
| "grad_norm": 109.74634552001953, |
| "learning_rate": 3.1308078765869145e-05, |
| "lookahead_loss": 6.859189456939697, |
| "loss": 4.3184, |
| "step": 196000 |
| }, |
| { |
| "epoch": 0.37479400634765625, |
| "grad_norm": 121.26676177978516, |
| "learning_rate": 3.126039505004883e-05, |
| "lookahead_loss": 6.861068688392639, |
| "loss": 4.3133, |
| "step": 196500 |
| }, |
| { |
| "epoch": 0.3757476806640625, |
| "grad_norm": 260.8994445800781, |
| "learning_rate": 3.121271133422852e-05, |
| "lookahead_loss": 6.847253804206848, |
| "loss": 4.3041, |
| "step": 197000 |
| }, |
| { |
| "epoch": 0.37670135498046875, |
| "grad_norm": 70.82603454589844, |
| "learning_rate": 3.11650276184082e-05, |
| "lookahead_loss": 6.892813719749451, |
| "loss": 4.3439, |
| "step": 197500 |
| }, |
| { |
| "epoch": 0.377655029296875, |
| "grad_norm": 282.0182800292969, |
| "learning_rate": 3.111734390258789e-05, |
| "lookahead_loss": 6.85909540939331, |
| "loss": 4.3138, |
| "step": 198000 |
| }, |
| { |
| "epoch": 0.37860870361328125, |
| "grad_norm": 86.68909454345703, |
| "learning_rate": 3.106966018676758e-05, |
| "lookahead_loss": 6.898646444320678, |
| "loss": 4.3436, |
| "step": 198500 |
| }, |
| { |
| "epoch": 0.3795623779296875, |
| "grad_norm": 95.21270751953125, |
| "learning_rate": 3.1021976470947266e-05, |
| "lookahead_loss": 6.880235080718994, |
| "loss": 4.3338, |
| "step": 199000 |
| }, |
| { |
| "epoch": 0.38051605224609375, |
| "grad_norm": 162.9989471435547, |
| "learning_rate": 3.0974292755126956e-05, |
| "lookahead_loss": 6.881880113601684, |
| "loss": 4.3335, |
| "step": 199500 |
| }, |
| { |
| "epoch": 0.3814697265625, |
| "grad_norm": 73.56849670410156, |
| "learning_rate": 3.092660903930664e-05, |
| "lookahead_loss": 6.857299737930298, |
| "loss": 4.311, |
| "step": 200000 |
| }, |
| { |
| "epoch": 0.3814697265625, |
| "eval_accuracy": 0.039247358121330726, |
| "eval_lookahead_loss": 6.769299872207641, |
| "eval_lookahead_perplexity": 870.7020778175125, |
| "eval_loss": 4.2437896728515625, |
| "eval_perplexity": 69.67138392535527, |
| "eval_runtime": 587.6518, |
| "eval_samples_per_second": 17.017, |
| "eval_steps_per_second": 4.254, |
| "step": 200000 |
| }, |
| { |
| "epoch": 0.38242340087890625, |
| "grad_norm": 51.295570373535156, |
| "learning_rate": 3.087892532348633e-05, |
| "lookahead_loss": 6.826900623321533, |
| "loss": 4.2895, |
| "step": 200500 |
| }, |
| { |
| "epoch": 0.3833770751953125, |
| "grad_norm": 142.81150817871094, |
| "learning_rate": 3.083124160766602e-05, |
| "lookahead_loss": 6.86542509651184, |
| "loss": 4.3151, |
| "step": 201000 |
| }, |
| { |
| "epoch": 0.38433074951171875, |
| "grad_norm": 155.6156463623047, |
| "learning_rate": 3.07835578918457e-05, |
| "lookahead_loss": 6.881727516174316, |
| "loss": 4.3284, |
| "step": 201500 |
| }, |
| { |
| "epoch": 0.385284423828125, |
| "grad_norm": 82.43931579589844, |
| "learning_rate": 3.0735874176025393e-05, |
| "lookahead_loss": 6.8882527990341185, |
| "loss": 4.3363, |
| "step": 202000 |
| }, |
| { |
| "epoch": 0.38623809814453125, |
| "grad_norm": 62.418087005615234, |
| "learning_rate": 3.068819046020508e-05, |
| "lookahead_loss": 6.85464732837677, |
| "loss": 4.309, |
| "step": 202500 |
| }, |
| { |
| "epoch": 0.3871917724609375, |
| "grad_norm": 36.651763916015625, |
| "learning_rate": 3.064050674438477e-05, |
| "lookahead_loss": 6.871680235862732, |
| "loss": 4.3289, |
| "step": 203000 |
| }, |
| { |
| "epoch": 0.38814544677734375, |
| "grad_norm": 63.30792236328125, |
| "learning_rate": 3.059282302856446e-05, |
| "lookahead_loss": 6.878161911964416, |
| "loss": 4.3332, |
| "step": 203500 |
| }, |
| { |
| "epoch": 0.38909912109375, |
| "grad_norm": 120.7124252319336, |
| "learning_rate": 3.054513931274414e-05, |
| "lookahead_loss": 6.869472958564758, |
| "loss": 4.328, |
| "step": 204000 |
| }, |
| { |
| "epoch": 0.39005279541015625, |
| "grad_norm": 111.81350708007812, |
| "learning_rate": 3.049745559692383e-05, |
| "lookahead_loss": 6.832502111434937, |
| "loss": 4.2946, |
| "step": 204500 |
| }, |
| { |
| "epoch": 0.3910064697265625, |
| "grad_norm": 72.48461151123047, |
| "learning_rate": 3.0449771881103518e-05, |
| "lookahead_loss": 6.832289450645447, |
| "loss": 4.2935, |
| "step": 205000 |
| }, |
| { |
| "epoch": 0.3910064697265625, |
| "eval_accuracy": 0.03884305283757339, |
| "eval_lookahead_loss": 6.7642606378555294, |
| "eval_lookahead_perplexity": 866.3254427031694, |
| "eval_loss": 4.236800193786621, |
| "eval_perplexity": 69.1861151096584, |
| "eval_runtime": 549.8341, |
| "eval_samples_per_second": 18.187, |
| "eval_steps_per_second": 4.547, |
| "step": 205000 |
| }, |
| { |
| "epoch": 0.39196014404296875, |
| "grad_norm": 78.28961944580078, |
| "learning_rate": 3.0402088165283205e-05, |
| "lookahead_loss": 6.82104082775116, |
| "loss": 4.2881, |
| "step": 205500 |
| }, |
| { |
| "epoch": 0.392913818359375, |
| "grad_norm": 109.58631896972656, |
| "learning_rate": 3.035440444946289e-05, |
| "lookahead_loss": 6.809194861412048, |
| "loss": 4.2791, |
| "step": 206000 |
| }, |
| { |
| "epoch": 0.39386749267578125, |
| "grad_norm": 177.30950927734375, |
| "learning_rate": 3.0306720733642578e-05, |
| "lookahead_loss": 6.832217403411866, |
| "loss": 4.2887, |
| "step": 206500 |
| }, |
| { |
| "epoch": 0.3948211669921875, |
| "grad_norm": 65.06339263916016, |
| "learning_rate": 3.025903701782227e-05, |
| "lookahead_loss": 6.824791021347046, |
| "loss": 4.2898, |
| "step": 207000 |
| }, |
| { |
| "epoch": 0.39577484130859375, |
| "grad_norm": 67.95651245117188, |
| "learning_rate": 3.0211353302001955e-05, |
| "lookahead_loss": 6.803102507591247, |
| "loss": 4.2693, |
| "step": 207500 |
| }, |
| { |
| "epoch": 0.396728515625, |
| "grad_norm": 129.5733642578125, |
| "learning_rate": 3.0163669586181642e-05, |
| "lookahead_loss": 6.795765015602112, |
| "loss": 4.2638, |
| "step": 208000 |
| }, |
| { |
| "epoch": 0.39768218994140625, |
| "grad_norm": 88.25762939453125, |
| "learning_rate": 3.011598587036133e-05, |
| "lookahead_loss": 6.781947945594788, |
| "loss": 4.2519, |
| "step": 208500 |
| }, |
| { |
| "epoch": 0.3986358642578125, |
| "grad_norm": 75.52108764648438, |
| "learning_rate": 3.0068302154541016e-05, |
| "lookahead_loss": 6.820737454414368, |
| "loss": 4.2825, |
| "step": 209000 |
| }, |
| { |
| "epoch": 0.39958953857421875, |
| "grad_norm": 203.1063995361328, |
| "learning_rate": 3.0020618438720706e-05, |
| "lookahead_loss": 6.881164714813233, |
| "loss": 4.33, |
| "step": 209500 |
| }, |
| { |
| "epoch": 0.400543212890625, |
| "grad_norm": 54.69011306762695, |
| "learning_rate": 2.9972934722900393e-05, |
| "lookahead_loss": 6.893459950447083, |
| "loss": 4.3381, |
| "step": 210000 |
| }, |
| { |
| "epoch": 0.400543212890625, |
| "eval_accuracy": 0.03975714285714286, |
| "eval_lookahead_loss": 6.741827647972107, |
| "eval_lookahead_perplexity": 847.1075362879844, |
| "eval_loss": 4.217220306396484, |
| "eval_perplexity": 67.84463464199263, |
| "eval_runtime": 560.7386, |
| "eval_samples_per_second": 17.834, |
| "eval_steps_per_second": 4.458, |
| "step": 210000 |
| }, |
| { |
| "epoch": 0.40149688720703125, |
| "grad_norm": 177.4677734375, |
| "learning_rate": 2.992525100708008e-05, |
| "lookahead_loss": 6.8616189231872555, |
| "loss": 4.3105, |
| "step": 210500 |
| }, |
| { |
| "epoch": 0.4024505615234375, |
| "grad_norm": 143.2623291015625, |
| "learning_rate": 2.9877567291259766e-05, |
| "lookahead_loss": 6.832758441925049, |
| "loss": 4.2895, |
| "step": 211000 |
| }, |
| { |
| "epoch": 0.40340423583984375, |
| "grad_norm": 172.30426025390625, |
| "learning_rate": 2.9829883575439453e-05, |
| "lookahead_loss": 6.882635902404785, |
| "loss": 4.3272, |
| "step": 211500 |
| }, |
| { |
| "epoch": 0.40435791015625, |
| "grad_norm": 91.92830657958984, |
| "learning_rate": 2.9782199859619143e-05, |
| "lookahead_loss": 6.826775899887085, |
| "loss": 4.282, |
| "step": 212000 |
| }, |
| { |
| "epoch": 0.40531158447265625, |
| "grad_norm": 53.745338439941406, |
| "learning_rate": 2.973451614379883e-05, |
| "lookahead_loss": 6.824072843551636, |
| "loss": 4.2782, |
| "step": 212500 |
| }, |
| { |
| "epoch": 0.4062652587890625, |
| "grad_norm": 148.1055145263672, |
| "learning_rate": 2.9686832427978517e-05, |
| "lookahead_loss": 6.85183250617981, |
| "loss": 4.3008, |
| "step": 213000 |
| }, |
| { |
| "epoch": 0.40721893310546875, |
| "grad_norm": 58.04047775268555, |
| "learning_rate": 2.9639148712158204e-05, |
| "lookahead_loss": 6.86986146068573, |
| "loss": 4.3173, |
| "step": 213500 |
| }, |
| { |
| "epoch": 0.408172607421875, |
| "grad_norm": 75.42579650878906, |
| "learning_rate": 2.959146499633789e-05, |
| "lookahead_loss": 6.872432311058044, |
| "loss": 4.3162, |
| "step": 214000 |
| }, |
| { |
| "epoch": 0.40912628173828125, |
| "grad_norm": 144.2433319091797, |
| "learning_rate": 2.954378128051758e-05, |
| "lookahead_loss": 6.8378219327926635, |
| "loss": 4.2833, |
| "step": 214500 |
| }, |
| { |
| "epoch": 0.4100799560546875, |
| "grad_norm": 56.694496154785156, |
| "learning_rate": 2.9496097564697268e-05, |
| "lookahead_loss": 6.835438325881958, |
| "loss": 4.2862, |
| "step": 215000 |
| }, |
| { |
| "epoch": 0.4100799560546875, |
| "eval_accuracy": 0.03867358121330724, |
| "eval_lookahead_loss": 6.739575554847717, |
| "eval_lookahead_perplexity": 845.2019178504114, |
| "eval_loss": 4.21183967590332, |
| "eval_perplexity": 67.48056806425134, |
| "eval_runtime": 584.7146, |
| "eval_samples_per_second": 17.102, |
| "eval_steps_per_second": 4.276, |
| "step": 215000 |
| }, |
| { |
| "epoch": 0.41103363037109375, |
| "grad_norm": 47.73128128051758, |
| "learning_rate": 2.9448413848876955e-05, |
| "lookahead_loss": 6.823629376411438, |
| "loss": 4.2763, |
| "step": 215500 |
| }, |
| { |
| "epoch": 0.4119873046875, |
| "grad_norm": 41.67387008666992, |
| "learning_rate": 2.940073013305664e-05, |
| "lookahead_loss": 6.831622511863708, |
| "loss": 4.2826, |
| "step": 216000 |
| }, |
| { |
| "epoch": 0.41294097900390625, |
| "grad_norm": 75.79904174804688, |
| "learning_rate": 2.9353046417236328e-05, |
| "lookahead_loss": 6.838218030929565, |
| "loss": 4.2897, |
| "step": 216500 |
| }, |
| { |
| "epoch": 0.4138946533203125, |
| "grad_norm": 66.99076080322266, |
| "learning_rate": 2.930536270141602e-05, |
| "lookahead_loss": 6.812291440010071, |
| "loss": 4.2667, |
| "step": 217000 |
| }, |
| { |
| "epoch": 0.41484832763671875, |
| "grad_norm": 138.72842407226562, |
| "learning_rate": 2.9257678985595705e-05, |
| "lookahead_loss": 6.7639456605911255, |
| "loss": 4.2325, |
| "step": 217500 |
| }, |
| { |
| "epoch": 0.415802001953125, |
| "grad_norm": 104.42118072509766, |
| "learning_rate": 2.9209995269775392e-05, |
| "lookahead_loss": 6.831789570808411, |
| "loss": 4.2892, |
| "step": 218000 |
| }, |
| { |
| "epoch": 0.41675567626953125, |
| "grad_norm": 110.80390930175781, |
| "learning_rate": 2.916231155395508e-05, |
| "lookahead_loss": 6.819882885932922, |
| "loss": 4.2788, |
| "step": 218500 |
| }, |
| { |
| "epoch": 0.4177093505859375, |
| "grad_norm": 89.50532531738281, |
| "learning_rate": 2.9114627838134766e-05, |
| "lookahead_loss": 6.840888389587402, |
| "loss": 4.2924, |
| "step": 219000 |
| }, |
| { |
| "epoch": 0.41866302490234375, |
| "grad_norm": 468.4449157714844, |
| "learning_rate": 2.9066944122314456e-05, |
| "lookahead_loss": 6.854627352714538, |
| "loss": 4.3073, |
| "step": 219500 |
| }, |
| { |
| "epoch": 0.41961669921875, |
| "grad_norm": 43.495201110839844, |
| "learning_rate": 2.9019260406494143e-05, |
| "lookahead_loss": 6.826618343353272, |
| "loss": 4.2843, |
| "step": 220000 |
| }, |
| { |
| "epoch": 0.41961669921875, |
| "eval_accuracy": 0.03971761252446184, |
| "eval_lookahead_loss": 6.736925336265564, |
| "eval_lookahead_perplexity": 842.9649136064897, |
| "eval_loss": 4.211366176605225, |
| "eval_perplexity": 67.44862362606918, |
| "eval_runtime": 553.4267, |
| "eval_samples_per_second": 18.069, |
| "eval_steps_per_second": 4.517, |
| "step": 220000 |
| }, |
| { |
| "epoch": 0.42057037353515625, |
| "grad_norm": 125.62337493896484, |
| "learning_rate": 2.897157669067383e-05, |
| "lookahead_loss": 6.827990100860596, |
| "loss": 4.2879, |
| "step": 220500 |
| }, |
| { |
| "epoch": 0.4215240478515625, |
| "grad_norm": 337.7961730957031, |
| "learning_rate": 2.8923892974853516e-05, |
| "lookahead_loss": 6.810482286453247, |
| "loss": 4.2756, |
| "step": 221000 |
| }, |
| { |
| "epoch": 0.42247772216796875, |
| "grad_norm": 60.217185974121094, |
| "learning_rate": 2.8876209259033203e-05, |
| "lookahead_loss": 6.796955746650696, |
| "loss": 4.2665, |
| "step": 221500 |
| }, |
| { |
| "epoch": 0.423431396484375, |
| "grad_norm": 108.89976501464844, |
| "learning_rate": 2.8828525543212893e-05, |
| "lookahead_loss": 6.791898680686951, |
| "loss": 4.2567, |
| "step": 222000 |
| }, |
| { |
| "epoch": 0.42438507080078125, |
| "grad_norm": 50.305335998535156, |
| "learning_rate": 2.878084182739258e-05, |
| "lookahead_loss": 6.789860504150391, |
| "loss": 4.254, |
| "step": 222500 |
| }, |
| { |
| "epoch": 0.4253387451171875, |
| "grad_norm": 69.75407409667969, |
| "learning_rate": 2.8733158111572267e-05, |
| "lookahead_loss": 6.809138175010681, |
| "loss": 4.2654, |
| "step": 223000 |
| }, |
| { |
| "epoch": 0.42629241943359375, |
| "grad_norm": 81.425048828125, |
| "learning_rate": 2.8685474395751954e-05, |
| "lookahead_loss": 6.856522697448731, |
| "loss": 4.3064, |
| "step": 223500 |
| }, |
| { |
| "epoch": 0.42724609375, |
| "grad_norm": 63.404354095458984, |
| "learning_rate": 2.863779067993164e-05, |
| "lookahead_loss": 6.84972207069397, |
| "loss": 4.3026, |
| "step": 224000 |
| }, |
| { |
| "epoch": 0.42819976806640625, |
| "grad_norm": 95.73654174804688, |
| "learning_rate": 2.859010696411133e-05, |
| "lookahead_loss": 6.869048860549927, |
| "loss": 4.3176, |
| "step": 224500 |
| }, |
| { |
| "epoch": 0.4291534423828125, |
| "grad_norm": 65.30645751953125, |
| "learning_rate": 2.8542423248291018e-05, |
| "lookahead_loss": 6.8712594060897825, |
| "loss": 4.3212, |
| "step": 225000 |
| }, |
| { |
| "epoch": 0.4291534423828125, |
| "eval_accuracy": 0.039944227005870844, |
| "eval_lookahead_loss": 6.733989822387695, |
| "eval_lookahead_perplexity": 840.4940068689173, |
| "eval_loss": 4.206583023071289, |
| "eval_perplexity": 67.12677683852829, |
| "eval_runtime": 604.2318, |
| "eval_samples_per_second": 16.55, |
| "eval_steps_per_second": 4.137, |
| "step": 225000 |
| }, |
| { |
| "epoch": 0.43010711669921875, |
| "grad_norm": 44.87085723876953, |
| "learning_rate": 2.8494739532470705e-05, |
| "lookahead_loss": 6.821802568435669, |
| "loss": 4.2808, |
| "step": 225500 |
| }, |
| { |
| "epoch": 0.431060791015625, |
| "grad_norm": 225.40354919433594, |
| "learning_rate": 2.844705581665039e-05, |
| "lookahead_loss": 6.817881938934327, |
| "loss": 4.281, |
| "step": 226000 |
| }, |
| { |
| "epoch": 0.43201446533203125, |
| "grad_norm": 124.08724212646484, |
| "learning_rate": 2.8399372100830078e-05, |
| "lookahead_loss": 6.86967366695404, |
| "loss": 4.3265, |
| "step": 226500 |
| }, |
| { |
| "epoch": 0.4329681396484375, |
| "grad_norm": 200.86273193359375, |
| "learning_rate": 2.835168838500977e-05, |
| "lookahead_loss": 6.829223037719727, |
| "loss": 4.284, |
| "step": 227000 |
| }, |
| { |
| "epoch": 0.43392181396484375, |
| "grad_norm": 64.91610717773438, |
| "learning_rate": 2.8304004669189455e-05, |
| "lookahead_loss": 6.846840734481812, |
| "loss": 4.2989, |
| "step": 227500 |
| }, |
| { |
| "epoch": 0.43487548828125, |
| "grad_norm": 50.19611358642578, |
| "learning_rate": 2.8256320953369142e-05, |
| "lookahead_loss": 6.819207098007202, |
| "loss": 4.2758, |
| "step": 228000 |
| }, |
| { |
| "epoch": 0.43582916259765625, |
| "grad_norm": 79.49495697021484, |
| "learning_rate": 2.820863723754883e-05, |
| "lookahead_loss": 6.860958848953247, |
| "loss": 4.3113, |
| "step": 228500 |
| }, |
| { |
| "epoch": 0.4367828369140625, |
| "grad_norm": 55.03921890258789, |
| "learning_rate": 2.8160953521728516e-05, |
| "lookahead_loss": 6.831416009902954, |
| "loss": 4.2834, |
| "step": 229000 |
| }, |
| { |
| "epoch": 0.43773651123046875, |
| "grad_norm": 127.22222900390625, |
| "learning_rate": 2.8113269805908206e-05, |
| "lookahead_loss": 6.819813494682312, |
| "loss": 4.2771, |
| "step": 229500 |
| }, |
| { |
| "epoch": 0.438690185546875, |
| "grad_norm": 84.59156036376953, |
| "learning_rate": 2.8065586090087893e-05, |
| "lookahead_loss": 6.844387573242187, |
| "loss": 4.2976, |
| "step": 230000 |
| }, |
| { |
| "epoch": 0.438690185546875, |
| "eval_accuracy": 0.03896281800391389, |
| "eval_lookahead_loss": 6.726382809066773, |
| "eval_lookahead_perplexity": 834.1246145033832, |
| "eval_loss": 4.202380657196045, |
| "eval_perplexity": 66.84527745819214, |
| "eval_runtime": 539.1296, |
| "eval_samples_per_second": 18.548, |
| "eval_steps_per_second": 4.637, |
| "step": 230000 |
| }, |
| { |
| "epoch": 0.43964385986328125, |
| "grad_norm": 65.09040832519531, |
| "learning_rate": 2.801790237426758e-05, |
| "lookahead_loss": 6.849031944274902, |
| "loss": 4.298, |
| "step": 230500 |
| }, |
| { |
| "epoch": 0.4405975341796875, |
| "grad_norm": 158.001708984375, |
| "learning_rate": 2.7970218658447266e-05, |
| "lookahead_loss": 6.804644338607788, |
| "loss": 4.2604, |
| "step": 231000 |
| }, |
| { |
| "epoch": 0.44155120849609375, |
| "grad_norm": 94.63139343261719, |
| "learning_rate": 2.7922534942626953e-05, |
| "lookahead_loss": 6.841900848388672, |
| "loss": 4.2961, |
| "step": 231500 |
| }, |
| { |
| "epoch": 0.4425048828125, |
| "grad_norm": 138.0093231201172, |
| "learning_rate": 2.7874851226806643e-05, |
| "lookahead_loss": 6.819698590278626, |
| "loss": 4.2839, |
| "step": 232000 |
| }, |
| { |
| "epoch": 0.44345855712890625, |
| "grad_norm": 122.00431060791016, |
| "learning_rate": 2.782716751098633e-05, |
| "lookahead_loss": 6.827349545478821, |
| "loss": 4.2851, |
| "step": 232500 |
| }, |
| { |
| "epoch": 0.4444122314453125, |
| "grad_norm": 92.17147064208984, |
| "learning_rate": 2.7779483795166017e-05, |
| "lookahead_loss": 6.832472926139832, |
| "loss": 4.2871, |
| "step": 233000 |
| }, |
| { |
| "epoch": 0.44536590576171875, |
| "grad_norm": 48.53215026855469, |
| "learning_rate": 2.7731800079345704e-05, |
| "lookahead_loss": 6.8030523653030395, |
| "loss": 4.2663, |
| "step": 233500 |
| }, |
| { |
| "epoch": 0.446319580078125, |
| "grad_norm": 313.5240478515625, |
| "learning_rate": 2.768411636352539e-05, |
| "lookahead_loss": 6.843605909347534, |
| "loss": 4.2995, |
| "step": 234000 |
| }, |
| { |
| "epoch": 0.44727325439453125, |
| "grad_norm": 158.4006805419922, |
| "learning_rate": 2.763643264770508e-05, |
| "lookahead_loss": 6.846953560829163, |
| "loss": 4.3027, |
| "step": 234500 |
| }, |
| { |
| "epoch": 0.4482269287109375, |
| "grad_norm": 78.62808990478516, |
| "learning_rate": 2.7588748931884768e-05, |
| "lookahead_loss": 6.857602440834046, |
| "loss": 4.322, |
| "step": 235000 |
| }, |
| { |
| "epoch": 0.4482269287109375, |
| "eval_accuracy": 0.03697592954990215, |
| "eval_lookahead_loss": 6.745567938232422, |
| "eval_lookahead_perplexity": 850.2818971702327, |
| "eval_loss": 4.217191696166992, |
| "eval_perplexity": 67.84269361919239, |
| "eval_runtime": 577.1823, |
| "eval_samples_per_second": 17.326, |
| "eval_steps_per_second": 4.331, |
| "step": 235000 |
| }, |
| { |
| "epoch": 0.44918060302734375, |
| "grad_norm": 403.6563720703125, |
| "learning_rate": 2.7541065216064455e-05, |
| "lookahead_loss": 6.845228085517883, |
| "loss": 4.3007, |
| "step": 235500 |
| }, |
| { |
| "epoch": 0.45013427734375, |
| "grad_norm": 132.26510620117188, |
| "learning_rate": 2.749338150024414e-05, |
| "lookahead_loss": 6.804190621376038, |
| "loss": 4.2678, |
| "step": 236000 |
| }, |
| { |
| "epoch": 0.45108795166015625, |
| "grad_norm": 222.1504669189453, |
| "learning_rate": 2.7445697784423828e-05, |
| "lookahead_loss": 6.8256635046005245, |
| "loss": 4.2832, |
| "step": 236500 |
| }, |
| { |
| "epoch": 0.4520416259765625, |
| "grad_norm": 35.312767028808594, |
| "learning_rate": 2.739801406860352e-05, |
| "lookahead_loss": 6.7989252700805665, |
| "loss": 4.2669, |
| "step": 237000 |
| }, |
| { |
| "epoch": 0.45299530029296875, |
| "grad_norm": 52.764923095703125, |
| "learning_rate": 2.7350330352783205e-05, |
| "lookahead_loss": 6.791041137695313, |
| "loss": 4.263, |
| "step": 237500 |
| }, |
| { |
| "epoch": 0.453948974609375, |
| "grad_norm": 36.1938591003418, |
| "learning_rate": 2.7302646636962892e-05, |
| "lookahead_loss": 6.878719049453736, |
| "loss": 4.329, |
| "step": 238000 |
| }, |
| { |
| "epoch": 0.45490264892578125, |
| "grad_norm": 81.13445281982422, |
| "learning_rate": 2.725496292114258e-05, |
| "lookahead_loss": 6.846307719230652, |
| "loss": 4.3026, |
| "step": 238500 |
| }, |
| { |
| "epoch": 0.4558563232421875, |
| "grad_norm": 92.43334197998047, |
| "learning_rate": 2.7207279205322266e-05, |
| "lookahead_loss": 6.852748492240906, |
| "loss": 4.306, |
| "step": 239000 |
| }, |
| { |
| "epoch": 0.45680999755859375, |
| "grad_norm": 64.53598022460938, |
| "learning_rate": 2.7159595489501956e-05, |
| "lookahead_loss": 6.833488552093506, |
| "loss": 4.288, |
| "step": 239500 |
| }, |
| { |
| "epoch": 0.457763671875, |
| "grad_norm": 450.8660888671875, |
| "learning_rate": 2.7111911773681643e-05, |
| "lookahead_loss": 6.803179792404174, |
| "loss": 4.2604, |
| "step": 240000 |
| }, |
| { |
| "epoch": 0.457763671875, |
| "eval_accuracy": 0.03890567514677104, |
| "eval_lookahead_loss": 6.725540647888184, |
| "eval_lookahead_perplexity": 833.422442847298, |
| "eval_loss": 4.2009806632995605, |
| "eval_perplexity": 66.75175995498311, |
| "eval_runtime": 577.467, |
| "eval_samples_per_second": 17.317, |
| "eval_steps_per_second": 4.329, |
| "step": 240000 |
| }, |
| { |
| "epoch": 0.45871734619140625, |
| "grad_norm": 98.67919921875, |
| "learning_rate": 2.706422805786133e-05, |
| "lookahead_loss": 6.832691857337951, |
| "loss": 4.287, |
| "step": 240500 |
| }, |
| { |
| "epoch": 0.4596710205078125, |
| "grad_norm": 128.34164428710938, |
| "learning_rate": 2.7016544342041016e-05, |
| "lookahead_loss": 6.838578820228577, |
| "loss": 4.2911, |
| "step": 241000 |
| }, |
| { |
| "epoch": 0.46062469482421875, |
| "grad_norm": 125.73902130126953, |
| "learning_rate": 2.6968860626220703e-05, |
| "lookahead_loss": 6.871714052200318, |
| "loss": 4.3222, |
| "step": 241500 |
| }, |
| { |
| "epoch": 0.461578369140625, |
| "grad_norm": 83.38597106933594, |
| "learning_rate": 2.6921176910400393e-05, |
| "lookahead_loss": 6.835269724845886, |
| "loss": 4.2922, |
| "step": 242000 |
| }, |
| { |
| "epoch": 0.46253204345703125, |
| "grad_norm": 281.82073974609375, |
| "learning_rate": 2.687349319458008e-05, |
| "lookahead_loss": 6.833915582656861, |
| "loss": 4.2872, |
| "step": 242500 |
| }, |
| { |
| "epoch": 0.4634857177734375, |
| "grad_norm": 426.818115234375, |
| "learning_rate": 2.6825809478759767e-05, |
| "lookahead_loss": 6.836755400657654, |
| "loss": 4.2905, |
| "step": 243000 |
| }, |
| { |
| "epoch": 0.46443939208984375, |
| "grad_norm": 193.5459442138672, |
| "learning_rate": 2.6778125762939454e-05, |
| "lookahead_loss": 6.843824926376342, |
| "loss": 4.2951, |
| "step": 243500 |
| }, |
| { |
| "epoch": 0.46539306640625, |
| "grad_norm": 60.45668411254883, |
| "learning_rate": 2.673044204711914e-05, |
| "lookahead_loss": 6.826668175697327, |
| "loss": 4.2769, |
| "step": 244000 |
| }, |
| { |
| "epoch": 0.46634674072265625, |
| "grad_norm": 77.93930053710938, |
| "learning_rate": 2.668275833129883e-05, |
| "lookahead_loss": 6.804249542236328, |
| "loss": 4.2638, |
| "step": 244500 |
| }, |
| { |
| "epoch": 0.4673004150390625, |
| "grad_norm": 147.98187255859375, |
| "learning_rate": 2.6635074615478518e-05, |
| "lookahead_loss": 6.8260560712814335, |
| "loss": 4.2827, |
| "step": 245000 |
| }, |
| { |
| "epoch": 0.4673004150390625, |
| "eval_accuracy": 0.03744305283757338, |
| "eval_lookahead_loss": 6.731092700386047, |
| "eval_lookahead_perplexity": 838.0625170513761, |
| "eval_loss": 4.207005500793457, |
| "eval_perplexity": 67.15514239779003, |
| "eval_runtime": 552.034, |
| "eval_samples_per_second": 18.115, |
| "eval_steps_per_second": 4.529, |
| "step": 245000 |
| }, |
| { |
| "epoch": 0.46825408935546875, |
| "grad_norm": 145.95126342773438, |
| "learning_rate": 2.6587390899658205e-05, |
| "lookahead_loss": 6.826563521385193, |
| "loss": 4.2809, |
| "step": 245500 |
| }, |
| { |
| "epoch": 0.469207763671875, |
| "grad_norm": 593.6459350585938, |
| "learning_rate": 2.653970718383789e-05, |
| "lookahead_loss": 6.788387182235717, |
| "loss": 4.2516, |
| "step": 246000 |
| }, |
| { |
| "epoch": 0.47016143798828125, |
| "grad_norm": 104.18621826171875, |
| "learning_rate": 2.6492023468017578e-05, |
| "lookahead_loss": 6.8011353530883785, |
| "loss": 4.2646, |
| "step": 246500 |
| }, |
| { |
| "epoch": 0.4711151123046875, |
| "grad_norm": 100.52703857421875, |
| "learning_rate": 2.644433975219727e-05, |
| "lookahead_loss": 6.824602013587952, |
| "loss": 4.2897, |
| "step": 247000 |
| }, |
| { |
| "epoch": 0.47206878662109375, |
| "grad_norm": 147.22427368164062, |
| "learning_rate": 2.6396656036376955e-05, |
| "lookahead_loss": 6.809887815475464, |
| "loss": 4.2724, |
| "step": 247500 |
| }, |
| { |
| "epoch": 0.4730224609375, |
| "grad_norm": 145.30352783203125, |
| "learning_rate": 2.6348972320556642e-05, |
| "lookahead_loss": 6.8448622007369995, |
| "loss": 4.3001, |
| "step": 248000 |
| }, |
| { |
| "epoch": 0.47397613525390625, |
| "grad_norm": 129.2230987548828, |
| "learning_rate": 2.630128860473633e-05, |
| "lookahead_loss": 6.818229221343994, |
| "loss": 4.2806, |
| "step": 248500 |
| }, |
| { |
| "epoch": 0.4749298095703125, |
| "grad_norm": 173.33047485351562, |
| "learning_rate": 2.6253604888916016e-05, |
| "lookahead_loss": 6.837520868301391, |
| "loss": 4.2989, |
| "step": 249000 |
| }, |
| { |
| "epoch": 0.47588348388671875, |
| "grad_norm": 117.88336944580078, |
| "learning_rate": 2.6205921173095706e-05, |
| "lookahead_loss": 6.835369365692139, |
| "loss": 4.2986, |
| "step": 249500 |
| }, |
| { |
| "epoch": 0.476837158203125, |
| "grad_norm": 97.2651138305664, |
| "learning_rate": 2.6158237457275393e-05, |
| "lookahead_loss": 6.798692848205566, |
| "loss": 4.2629, |
| "step": 250000 |
| }, |
| { |
| "epoch": 0.476837158203125, |
| "eval_accuracy": 0.03982641878669276, |
| "eval_lookahead_loss": 6.723223708343506, |
| "eval_lookahead_perplexity": 831.4936886982524, |
| "eval_loss": 4.201556205749512, |
| "eval_perplexity": 66.79018948429807, |
| "eval_runtime": 630.7627, |
| "eval_samples_per_second": 15.854, |
| "eval_steps_per_second": 3.963, |
| "step": 250000 |
| }, |
| { |
| "epoch": 0.47779083251953125, |
| "grad_norm": 97.9665298461914, |
| "learning_rate": 2.611055374145508e-05, |
| "lookahead_loss": 6.761426301956177, |
| "loss": 4.2314, |
| "step": 250500 |
| }, |
| { |
| "epoch": 0.4787445068359375, |
| "grad_norm": 142.7858428955078, |
| "learning_rate": 2.6062870025634766e-05, |
| "lookahead_loss": 6.769398742675781, |
| "loss": 4.2404, |
| "step": 251000 |
| }, |
| { |
| "epoch": 0.47969818115234375, |
| "grad_norm": 257.583740234375, |
| "learning_rate": 2.6015186309814453e-05, |
| "lookahead_loss": 6.813614401817322, |
| "loss": 4.2772, |
| "step": 251500 |
| }, |
| { |
| "epoch": 0.48065185546875, |
| "grad_norm": 126.72929382324219, |
| "learning_rate": 2.5967502593994143e-05, |
| "lookahead_loss": 6.796018969535828, |
| "loss": 4.2638, |
| "step": 252000 |
| }, |
| { |
| "epoch": 0.48160552978515625, |
| "grad_norm": 56.30085754394531, |
| "learning_rate": 2.591981887817383e-05, |
| "lookahead_loss": 6.865161432266236, |
| "loss": 4.3199, |
| "step": 252500 |
| }, |
| { |
| "epoch": 0.4825592041015625, |
| "grad_norm": 60.45290756225586, |
| "learning_rate": 2.5872135162353517e-05, |
| "lookahead_loss": 6.83758104801178, |
| "loss": 4.2938, |
| "step": 253000 |
| }, |
| { |
| "epoch": 0.48351287841796875, |
| "grad_norm": 48.96196746826172, |
| "learning_rate": 2.5824451446533204e-05, |
| "lookahead_loss": 6.837399300575257, |
| "loss": 4.2945, |
| "step": 253500 |
| }, |
| { |
| "epoch": 0.484466552734375, |
| "grad_norm": 320.73291015625, |
| "learning_rate": 2.577676773071289e-05, |
| "lookahead_loss": 6.833300945281983, |
| "loss": 4.2871, |
| "step": 254000 |
| }, |
| { |
| "epoch": 0.48542022705078125, |
| "grad_norm": 90.25511932373047, |
| "learning_rate": 2.572908401489258e-05, |
| "lookahead_loss": 6.745852150917053, |
| "loss": 4.2196, |
| "step": 254500 |
| }, |
| { |
| "epoch": 0.4863739013671875, |
| "grad_norm": 233.1550750732422, |
| "learning_rate": 2.5681400299072268e-05, |
| "lookahead_loss": 6.8135366497039795, |
| "loss": 4.2719, |
| "step": 255000 |
| }, |
| { |
| "epoch": 0.4863739013671875, |
| "eval_accuracy": 0.03871761252446184, |
| "eval_lookahead_loss": 6.723375120162964, |
| "eval_lookahead_perplexity": 831.6195962022279, |
| "eval_loss": 4.201759338378906, |
| "eval_perplexity": 66.80375812917637, |
| "eval_runtime": 651.0195, |
| "eval_samples_per_second": 15.361, |
| "eval_steps_per_second": 3.84, |
| "step": 255000 |
| }, |
| { |
| "epoch": 0.48732757568359375, |
| "grad_norm": 30.3210506439209, |
| "learning_rate": 2.5633716583251955e-05, |
| "lookahead_loss": 6.830173519134521, |
| "loss": 4.2889, |
| "step": 255500 |
| }, |
| { |
| "epoch": 0.48828125, |
| "grad_norm": 71.12467193603516, |
| "learning_rate": 2.558603286743164e-05, |
| "lookahead_loss": 6.822467567443848, |
| "loss": 4.2796, |
| "step": 256000 |
| }, |
| { |
| "epoch": 0.48923492431640625, |
| "grad_norm": 70.18731689453125, |
| "learning_rate": 2.5538349151611328e-05, |
| "lookahead_loss": 6.833849480628968, |
| "loss": 4.2874, |
| "step": 256500 |
| }, |
| { |
| "epoch": 0.4901885986328125, |
| "grad_norm": 136.4093780517578, |
| "learning_rate": 2.549066543579102e-05, |
| "lookahead_loss": 6.826043591499329, |
| "loss": 4.281, |
| "step": 257000 |
| }, |
| { |
| "epoch": 0.49114227294921875, |
| "grad_norm": 136.4122772216797, |
| "learning_rate": 2.5442981719970705e-05, |
| "lookahead_loss": 6.824924607276916, |
| "loss": 4.28, |
| "step": 257500 |
| }, |
| { |
| "epoch": 0.492095947265625, |
| "grad_norm": 64.034423828125, |
| "learning_rate": 2.5395298004150392e-05, |
| "lookahead_loss": 6.819827025413513, |
| "loss": 4.2746, |
| "step": 258000 |
| }, |
| { |
| "epoch": 0.49304962158203125, |
| "grad_norm": 76.17876434326172, |
| "learning_rate": 2.534761428833008e-05, |
| "lookahead_loss": 6.8156179895401, |
| "loss": 4.2764, |
| "step": 258500 |
| }, |
| { |
| "epoch": 0.4940032958984375, |
| "grad_norm": 111.8006591796875, |
| "learning_rate": 2.5299930572509766e-05, |
| "lookahead_loss": 6.823220482826233, |
| "loss": 4.28, |
| "step": 259000 |
| }, |
| { |
| "epoch": 0.49495697021484375, |
| "grad_norm": 1842.7681884765625, |
| "learning_rate": 2.5252246856689456e-05, |
| "lookahead_loss": 6.85536534500122, |
| "loss": 4.3075, |
| "step": 259500 |
| }, |
| { |
| "epoch": 0.49591064453125, |
| "grad_norm": 142.96466064453125, |
| "learning_rate": 2.5204563140869143e-05, |
| "lookahead_loss": 6.786014308929444, |
| "loss": 4.2518, |
| "step": 260000 |
| }, |
| { |
| "epoch": 0.49591064453125, |
| "eval_accuracy": 0.03923698630136986, |
| "eval_lookahead_loss": 6.719935366630554, |
| "eval_lookahead_perplexity": 828.7639439441907, |
| "eval_loss": 4.197604656219482, |
| "eval_perplexity": 66.5267855121008, |
| "eval_runtime": 1337.5771, |
| "eval_samples_per_second": 7.476, |
| "eval_steps_per_second": 1.869, |
| "step": 260000 |
| }, |
| { |
| "epoch": 0.49686431884765625, |
| "grad_norm": 158.32261657714844, |
| "learning_rate": 2.515687942504883e-05, |
| "lookahead_loss": 6.825167228698731, |
| "loss": 4.2835, |
| "step": 260500 |
| }, |
| { |
| "epoch": 0.4978179931640625, |
| "grad_norm": 101.84738159179688, |
| "learning_rate": 2.5109195709228516e-05, |
| "lookahead_loss": 6.8140147142410274, |
| "loss": 4.2802, |
| "step": 261000 |
| }, |
| { |
| "epoch": 0.49877166748046875, |
| "grad_norm": 146.1331329345703, |
| "learning_rate": 2.5061511993408203e-05, |
| "lookahead_loss": 6.809534926414489, |
| "loss": 4.2733, |
| "step": 261500 |
| }, |
| { |
| "epoch": 0.499725341796875, |
| "grad_norm": 440.6026306152344, |
| "learning_rate": 2.5013828277587893e-05, |
| "lookahead_loss": 6.837049920082093, |
| "loss": 4.2921, |
| "step": 262000 |
| }, |
| { |
| "epoch": 0.5006790161132812, |
| "grad_norm": 106.92240142822266, |
| "learning_rate": 2.496614456176758e-05, |
| "lookahead_loss": 6.807016921043396, |
| "loss": 4.2705, |
| "step": 262500 |
| }, |
| { |
| "epoch": 0.5016326904296875, |
| "grad_norm": 112.94754028320312, |
| "learning_rate": 2.4918460845947267e-05, |
| "lookahead_loss": 6.808601552009582, |
| "loss": 4.277, |
| "step": 263000 |
| }, |
| { |
| "epoch": 0.5025863647460938, |
| "grad_norm": 241.30355834960938, |
| "learning_rate": 2.4870777130126954e-05, |
| "lookahead_loss": 6.802287021636963, |
| "loss": 4.2656, |
| "step": 263500 |
| }, |
| { |
| "epoch": 0.5035400390625, |
| "grad_norm": 129.61964416503906, |
| "learning_rate": 2.482309341430664e-05, |
| "lookahead_loss": 6.844995526313782, |
| "loss": 4.301, |
| "step": 264000 |
| }, |
| { |
| "epoch": 0.5044937133789062, |
| "grad_norm": 111.98877716064453, |
| "learning_rate": 2.477540969848633e-05, |
| "lookahead_loss": 6.782246248245239, |
| "loss": 4.2514, |
| "step": 264500 |
| }, |
| { |
| "epoch": 0.5054473876953125, |
| "grad_norm": 94.9834976196289, |
| "learning_rate": 2.4727725982666018e-05, |
| "lookahead_loss": 6.7783497676849365, |
| "loss": 4.2439, |
| "step": 265000 |
| }, |
| { |
| "epoch": 0.5054473876953125, |
| "eval_accuracy": 0.03809138943248532, |
| "eval_lookahead_loss": 6.72248406867981, |
| "eval_lookahead_perplexity": 830.8789103712231, |
| "eval_loss": 4.1985273361206055, |
| "eval_perplexity": 66.58819676708767, |
| "eval_runtime": 575.8115, |
| "eval_samples_per_second": 17.367, |
| "eval_steps_per_second": 4.342, |
| "step": 265000 |
| }, |
| { |
| "epoch": 0.5064010620117188, |
| "grad_norm": 135.8423309326172, |
| "learning_rate": 2.4680042266845705e-05, |
| "lookahead_loss": 6.791614148139954, |
| "loss": 4.2557, |
| "step": 265500 |
| }, |
| { |
| "epoch": 0.507354736328125, |
| "grad_norm": 146.2626190185547, |
| "learning_rate": 2.463235855102539e-05, |
| "lookahead_loss": 6.7422465858459475, |
| "loss": 4.2193, |
| "step": 266000 |
| }, |
| { |
| "epoch": 0.5083084106445312, |
| "grad_norm": 127.4433364868164, |
| "learning_rate": 2.4584674835205078e-05, |
| "lookahead_loss": 6.816955809593201, |
| "loss": 4.2783, |
| "step": 266500 |
| }, |
| { |
| "epoch": 0.5092620849609375, |
| "grad_norm": 107.38065338134766, |
| "learning_rate": 2.453699111938477e-05, |
| "lookahead_loss": 6.832611518383026, |
| "loss": 4.2913, |
| "step": 267000 |
| }, |
| { |
| "epoch": 0.5102157592773438, |
| "grad_norm": 37.94905090332031, |
| "learning_rate": 2.4489307403564455e-05, |
| "lookahead_loss": 6.834332969665527, |
| "loss": 4.2932, |
| "step": 267500 |
| }, |
| { |
| "epoch": 0.51116943359375, |
| "grad_norm": 182.36831665039062, |
| "learning_rate": 2.4441623687744142e-05, |
| "lookahead_loss": 6.7748250446319584, |
| "loss": 4.2422, |
| "step": 268000 |
| }, |
| { |
| "epoch": 0.5121231079101562, |
| "grad_norm": 471.6051025390625, |
| "learning_rate": 2.439393997192383e-05, |
| "lookahead_loss": 6.7940562725067135, |
| "loss": 4.2574, |
| "step": 268500 |
| }, |
| { |
| "epoch": 0.5130767822265625, |
| "grad_norm": 292.40679931640625, |
| "learning_rate": 2.4346256256103516e-05, |
| "lookahead_loss": 6.798037927627563, |
| "loss": 4.2635, |
| "step": 269000 |
| }, |
| { |
| "epoch": 0.5140304565429688, |
| "grad_norm": 103.75926208496094, |
| "learning_rate": 2.4298572540283206e-05, |
| "lookahead_loss": 6.814817329406738, |
| "loss": 4.2739, |
| "step": 269500 |
| }, |
| { |
| "epoch": 0.514984130859375, |
| "grad_norm": 218.8231658935547, |
| "learning_rate": 2.4250888824462893e-05, |
| "lookahead_loss": 6.830296605110169, |
| "loss": 4.2873, |
| "step": 270000 |
| }, |
| { |
| "epoch": 0.514984130859375, |
| "eval_accuracy": 0.038349510763209395, |
| "eval_lookahead_loss": 6.729008487129211, |
| "eval_lookahead_perplexity": 836.3176350280417, |
| "eval_loss": 4.206360340118408, |
| "eval_perplexity": 67.11183051385214, |
| "eval_runtime": 680.1612, |
| "eval_samples_per_second": 14.702, |
| "eval_steps_per_second": 3.676, |
| "step": 270000 |
| }, |
| { |
| "epoch": 0.5159378051757812, |
| "grad_norm": 264.31640625, |
| "learning_rate": 2.420320510864258e-05, |
| "lookahead_loss": 6.819584959030151, |
| "loss": 4.2782, |
| "step": 270500 |
| }, |
| { |
| "epoch": 0.5168914794921875, |
| "grad_norm": 187.81594848632812, |
| "learning_rate": 2.4155521392822266e-05, |
| "lookahead_loss": 6.80073481464386, |
| "loss": 4.2673, |
| "step": 271000 |
| }, |
| { |
| "epoch": 0.5178451538085938, |
| "grad_norm": 142.20513916015625, |
| "learning_rate": 2.4107837677001953e-05, |
| "lookahead_loss": 6.830631741523742, |
| "loss": 4.2842, |
| "step": 271500 |
| }, |
| { |
| "epoch": 0.518798828125, |
| "grad_norm": 141.13006591796875, |
| "learning_rate": 2.406015396118164e-05, |
| "lookahead_loss": 6.830031258583069, |
| "loss": 4.2853, |
| "step": 272000 |
| }, |
| { |
| "epoch": 0.5197525024414062, |
| "grad_norm": 172.76425170898438, |
| "learning_rate": 2.401247024536133e-05, |
| "lookahead_loss": 6.799144539833069, |
| "loss": 4.2591, |
| "step": 272500 |
| }, |
| { |
| "epoch": 0.5207061767578125, |
| "grad_norm": 74.04598999023438, |
| "learning_rate": 2.3964786529541017e-05, |
| "lookahead_loss": 6.795266880989074, |
| "loss": 4.2581, |
| "step": 273000 |
| }, |
| { |
| "epoch": 0.5216598510742188, |
| "grad_norm": 62.942020416259766, |
| "learning_rate": 2.3917102813720704e-05, |
| "lookahead_loss": 6.80137525844574, |
| "loss": 4.2648, |
| "step": 273500 |
| }, |
| { |
| "epoch": 0.522613525390625, |
| "grad_norm": 116.4559555053711, |
| "learning_rate": 2.386941909790039e-05, |
| "lookahead_loss": 6.809269550323486, |
| "loss": 4.27, |
| "step": 274000 |
| }, |
| { |
| "epoch": 0.5235671997070312, |
| "grad_norm": 119.12867736816406, |
| "learning_rate": 2.3821735382080078e-05, |
| "lookahead_loss": 6.803435432434082, |
| "loss": 4.2647, |
| "step": 274500 |
| }, |
| { |
| "epoch": 0.5245208740234375, |
| "grad_norm": 51.731048583984375, |
| "learning_rate": 2.3774051666259768e-05, |
| "lookahead_loss": 6.828841168403626, |
| "loss": 4.2885, |
| "step": 275000 |
| }, |
| { |
| "epoch": 0.5245208740234375, |
| "eval_accuracy": 0.039441487279843444, |
| "eval_lookahead_loss": 6.712037487220764, |
| "eval_lookahead_perplexity": 822.2442460392837, |
| "eval_loss": 4.190776824951172, |
| "eval_perplexity": 66.07409903789117, |
| "eval_runtime": 873.072, |
| "eval_samples_per_second": 11.454, |
| "eval_steps_per_second": 2.863, |
| "step": 275000 |
| }, |
| { |
| "epoch": 0.5254745483398438, |
| "grad_norm": 78.44624328613281, |
| "learning_rate": 2.3726367950439455e-05, |
| "lookahead_loss": 6.81116227722168, |
| "loss": 4.2735, |
| "step": 275500 |
| }, |
| { |
| "epoch": 0.52642822265625, |
| "grad_norm": 144.536865234375, |
| "learning_rate": 2.367868423461914e-05, |
| "lookahead_loss": 6.831217732429504, |
| "loss": 4.2893, |
| "step": 276000 |
| }, |
| { |
| "epoch": 0.5273818969726562, |
| "grad_norm": 173.55062866210938, |
| "learning_rate": 2.3631000518798828e-05, |
| "lookahead_loss": 6.804794922828674, |
| "loss": 4.2718, |
| "step": 276500 |
| }, |
| { |
| "epoch": 0.5283355712890625, |
| "grad_norm": 168.15768432617188, |
| "learning_rate": 2.3583316802978515e-05, |
| "lookahead_loss": 6.772975631713868, |
| "loss": 4.2486, |
| "step": 277000 |
| }, |
| { |
| "epoch": 0.5292892456054688, |
| "grad_norm": 156.95692443847656, |
| "learning_rate": 2.3535633087158205e-05, |
| "lookahead_loss": 6.817192688941955, |
| "loss": 4.2781, |
| "step": 277500 |
| }, |
| { |
| "epoch": 0.530242919921875, |
| "grad_norm": 189.3266143798828, |
| "learning_rate": 2.3487949371337892e-05, |
| "lookahead_loss": 6.810262331008911, |
| "loss": 4.2767, |
| "step": 278000 |
| }, |
| { |
| "epoch": 0.5311965942382812, |
| "grad_norm": 53.82436752319336, |
| "learning_rate": 2.344026565551758e-05, |
| "lookahead_loss": 6.743365891456604, |
| "loss": 4.221, |
| "step": 278500 |
| }, |
| { |
| "epoch": 0.5321502685546875, |
| "grad_norm": 136.30763244628906, |
| "learning_rate": 2.3392581939697266e-05, |
| "lookahead_loss": 6.746450958251953, |
| "loss": 4.2201, |
| "step": 279000 |
| }, |
| { |
| "epoch": 0.5331039428710938, |
| "grad_norm": 102.2168960571289, |
| "learning_rate": 2.3344898223876953e-05, |
| "lookahead_loss": 6.744244017601013, |
| "loss": 4.2143, |
| "step": 279500 |
| }, |
| { |
| "epoch": 0.5340576171875, |
| "grad_norm": 219.39195251464844, |
| "learning_rate": 2.3297214508056643e-05, |
| "lookahead_loss": 6.796566878318787, |
| "loss": 4.2603, |
| "step": 280000 |
| }, |
| { |
| "epoch": 0.5340576171875, |
| "eval_accuracy": 0.03922583170254403, |
| "eval_lookahead_loss": 6.7111371892929075, |
| "eval_lookahead_perplexity": 821.504314377832, |
| "eval_loss": 4.188486099243164, |
| "eval_perplexity": 65.92291462770987, |
| "eval_runtime": 1614.367, |
| "eval_samples_per_second": 6.194, |
| "eval_steps_per_second": 1.549, |
| "step": 280000 |
| }, |
| { |
| "epoch": 0.5350112915039062, |
| "grad_norm": 1495.875732421875, |
| "learning_rate": 2.324953079223633e-05, |
| "lookahead_loss": 6.85712878704071, |
| "loss": 4.3089, |
| "step": 280500 |
| }, |
| { |
| "epoch": 0.5359649658203125, |
| "grad_norm": 249.79989624023438, |
| "learning_rate": 2.3201847076416016e-05, |
| "lookahead_loss": 6.825008261680603, |
| "loss": 4.284, |
| "step": 281000 |
| }, |
| { |
| "epoch": 0.5369186401367188, |
| "grad_norm": 319.50018310546875, |
| "learning_rate": 2.3154163360595703e-05, |
| "lookahead_loss": 6.825734812736512, |
| "loss": 4.2816, |
| "step": 281500 |
| }, |
| { |
| "epoch": 0.537872314453125, |
| "grad_norm": 133.81101989746094, |
| "learning_rate": 2.310647964477539e-05, |
| "lookahead_loss": 6.814832735061645, |
| "loss": 4.269, |
| "step": 282000 |
| }, |
| { |
| "epoch": 0.5388259887695312, |
| "grad_norm": 91.1483154296875, |
| "learning_rate": 2.305879592895508e-05, |
| "lookahead_loss": 6.78851606464386, |
| "loss": 4.2535, |
| "step": 282500 |
| }, |
| { |
| "epoch": 0.5397796630859375, |
| "grad_norm": 182.8739013671875, |
| "learning_rate": 2.3011112213134767e-05, |
| "lookahead_loss": 6.826943335533142, |
| "loss": 4.2824, |
| "step": 283000 |
| }, |
| { |
| "epoch": 0.5407333374023438, |
| "grad_norm": 94.27992248535156, |
| "learning_rate": 2.2963428497314454e-05, |
| "lookahead_loss": 6.799699941635132, |
| "loss": 4.2609, |
| "step": 283500 |
| }, |
| { |
| "epoch": 0.54168701171875, |
| "grad_norm": 236.21224975585938, |
| "learning_rate": 2.291574478149414e-05, |
| "lookahead_loss": 6.801581911087036, |
| "loss": 4.2617, |
| "step": 284000 |
| }, |
| { |
| "epoch": 0.5426406860351562, |
| "grad_norm": 110.67129516601562, |
| "learning_rate": 2.2868061065673828e-05, |
| "lookahead_loss": 6.799880434989929, |
| "loss": 4.2585, |
| "step": 284500 |
| }, |
| { |
| "epoch": 0.5435943603515625, |
| "grad_norm": 211.9425048828125, |
| "learning_rate": 2.2820377349853518e-05, |
| "lookahead_loss": 6.803857459068299, |
| "loss": 4.2623, |
| "step": 285000 |
| }, |
| { |
| "epoch": 0.5435943603515625, |
| "eval_accuracy": 0.03992563600782779, |
| "eval_lookahead_loss": 6.709975707817078, |
| "eval_lookahead_perplexity": 820.5507062406658, |
| "eval_loss": 4.187664985656738, |
| "eval_perplexity": 65.86880664429253, |
| "eval_runtime": 798.0662, |
| "eval_samples_per_second": 12.53, |
| "eval_steps_per_second": 3.133, |
| "step": 285000 |
| }, |
| { |
| "epoch": 0.5445480346679688, |
| "grad_norm": 297.3907470703125, |
| "learning_rate": 2.2772693634033205e-05, |
| "lookahead_loss": 6.820889435768128, |
| "loss": 4.2772, |
| "step": 285500 |
| }, |
| { |
| "epoch": 0.545501708984375, |
| "grad_norm": 334.8447570800781, |
| "learning_rate": 2.272500991821289e-05, |
| "lookahead_loss": 6.802105306625366, |
| "loss": 4.2655, |
| "step": 286000 |
| }, |
| { |
| "epoch": 0.5464553833007812, |
| "grad_norm": 138.1062774658203, |
| "learning_rate": 2.2677326202392578e-05, |
| "lookahead_loss": 6.809896333694458, |
| "loss": 4.2662, |
| "step": 286500 |
| }, |
| { |
| "epoch": 0.5474090576171875, |
| "grad_norm": 62.1141242980957, |
| "learning_rate": 2.2629642486572265e-05, |
| "lookahead_loss": 6.778797305107116, |
| "loss": 4.2429, |
| "step": 287000 |
| }, |
| { |
| "epoch": 0.5483627319335938, |
| "grad_norm": 155.7555389404297, |
| "learning_rate": 2.2581958770751955e-05, |
| "lookahead_loss": 6.822315167427063, |
| "loss": 4.2752, |
| "step": 287500 |
| }, |
| { |
| "epoch": 0.54931640625, |
| "grad_norm": 43.916786193847656, |
| "learning_rate": 2.2534275054931642e-05, |
| "lookahead_loss": 6.813538794517517, |
| "loss": 4.2717, |
| "step": 288000 |
| }, |
| { |
| "epoch": 0.5502700805664062, |
| "grad_norm": 70.63623046875, |
| "learning_rate": 2.248659133911133e-05, |
| "lookahead_loss": 6.749878602027893, |
| "loss": 4.2407, |
| "step": 288500 |
| }, |
| { |
| "epoch": 0.5512237548828125, |
| "grad_norm": 210.43853759765625, |
| "learning_rate": 2.2438907623291016e-05, |
| "lookahead_loss": 6.801899238586426, |
| "loss": 4.2632, |
| "step": 289000 |
| }, |
| { |
| "epoch": 0.5521774291992188, |
| "grad_norm": 248.91004943847656, |
| "learning_rate": 2.2391223907470703e-05, |
| "lookahead_loss": 6.799204231262207, |
| "loss": 4.2614, |
| "step": 289500 |
| }, |
| { |
| "epoch": 0.553131103515625, |
| "grad_norm": 193.13775634765625, |
| "learning_rate": 2.2343540191650393e-05, |
| "lookahead_loss": 6.80387553691864, |
| "loss": 4.2669, |
| "step": 290000 |
| }, |
| { |
| "epoch": 0.553131103515625, |
| "eval_accuracy": 0.03853894324853229, |
| "eval_lookahead_loss": 6.711308574485779, |
| "eval_lookahead_perplexity": 821.6451201188611, |
| "eval_loss": 4.186929702758789, |
| "eval_perplexity": 65.82039223858216, |
| "eval_runtime": 582.7375, |
| "eval_samples_per_second": 17.16, |
| "eval_steps_per_second": 4.29, |
| "step": 290000 |
| }, |
| { |
| "epoch": 0.5540847778320312, |
| "grad_norm": 100.8056411743164, |
| "learning_rate": 2.229585647583008e-05, |
| "lookahead_loss": 6.780569408416748, |
| "loss": 4.2556, |
| "step": 290500 |
| }, |
| { |
| "epoch": 0.5550384521484375, |
| "grad_norm": 134.45547485351562, |
| "learning_rate": 2.2248172760009766e-05, |
| "lookahead_loss": 6.809809471130371, |
| "loss": 4.2757, |
| "step": 291000 |
| }, |
| { |
| "epoch": 0.5559921264648438, |
| "grad_norm": 81.97150421142578, |
| "learning_rate": 2.2200489044189453e-05, |
| "lookahead_loss": 6.817919278144837, |
| "loss": 4.2791, |
| "step": 291500 |
| }, |
| { |
| "epoch": 0.55694580078125, |
| "grad_norm": 79.4377670288086, |
| "learning_rate": 2.215280532836914e-05, |
| "lookahead_loss": 6.727724151611328, |
| "loss": 4.2115, |
| "step": 292000 |
| }, |
| { |
| "epoch": 0.5578994750976562, |
| "grad_norm": 77.00316619873047, |
| "learning_rate": 2.210512161254883e-05, |
| "lookahead_loss": 6.7791568479537965, |
| "loss": 4.2419, |
| "step": 292500 |
| }, |
| { |
| "epoch": 0.5588531494140625, |
| "grad_norm": 82.75985717773438, |
| "learning_rate": 2.2057437896728517e-05, |
| "lookahead_loss": 6.7614148283004765, |
| "loss": 4.2277, |
| "step": 293000 |
| }, |
| { |
| "epoch": 0.5598068237304688, |
| "grad_norm": 101.93817901611328, |
| "learning_rate": 2.2009754180908204e-05, |
| "lookahead_loss": 6.699149686813355, |
| "loss": 4.1925, |
| "step": 293500 |
| }, |
| { |
| "epoch": 0.560760498046875, |
| "grad_norm": 60.86259460449219, |
| "learning_rate": 2.196207046508789e-05, |
| "lookahead_loss": 6.8295795545578, |
| "loss": 4.2829, |
| "step": 294000 |
| }, |
| { |
| "epoch": 0.5617141723632812, |
| "grad_norm": 115.81536102294922, |
| "learning_rate": 2.1914386749267578e-05, |
| "lookahead_loss": 6.838942697525025, |
| "loss": 4.2941, |
| "step": 294500 |
| }, |
| { |
| "epoch": 0.5626678466796875, |
| "grad_norm": 280.6124267578125, |
| "learning_rate": 2.1866703033447268e-05, |
| "lookahead_loss": 6.8704422159194944, |
| "loss": 4.3208, |
| "step": 295000 |
| }, |
| { |
| "epoch": 0.5626678466796875, |
| "eval_accuracy": 0.03996340508806262, |
| "eval_lookahead_loss": 6.698442313957214, |
| "eval_lookahead_perplexity": 811.1413370460286, |
| "eval_loss": 4.178009986877441, |
| "eval_perplexity": 65.23590365189874, |
| "eval_runtime": 567.2167, |
| "eval_samples_per_second": 17.63, |
| "eval_steps_per_second": 4.407, |
| "step": 295000 |
| }, |
| { |
| "epoch": 0.5636215209960938, |
| "grad_norm": 123.4942626953125, |
| "learning_rate": 2.1819019317626955e-05, |
| "lookahead_loss": 6.8146950101852415, |
| "loss": 4.2758, |
| "step": 295500 |
| }, |
| { |
| "epoch": 0.5645751953125, |
| "grad_norm": 310.1524963378906, |
| "learning_rate": 2.177133560180664e-05, |
| "lookahead_loss": 6.799054090499878, |
| "loss": 4.2585, |
| "step": 296000 |
| }, |
| { |
| "epoch": 0.5655288696289062, |
| "grad_norm": 139.82127380371094, |
| "learning_rate": 2.1723651885986328e-05, |
| "lookahead_loss": 6.789865644454956, |
| "loss": 4.2472, |
| "step": 296500 |
| }, |
| { |
| "epoch": 0.5664825439453125, |
| "grad_norm": 107.57597351074219, |
| "learning_rate": 2.1675968170166015e-05, |
| "lookahead_loss": 6.7933146877288815, |
| "loss": 4.2527, |
| "step": 297000 |
| }, |
| { |
| "epoch": 0.5674362182617188, |
| "grad_norm": 129.25466918945312, |
| "learning_rate": 2.1628284454345705e-05, |
| "lookahead_loss": 6.822815986633301, |
| "loss": 4.2744, |
| "step": 297500 |
| }, |
| { |
| "epoch": 0.568389892578125, |
| "grad_norm": 71.5947036743164, |
| "learning_rate": 2.1580600738525392e-05, |
| "lookahead_loss": 6.821934850692749, |
| "loss": 4.279, |
| "step": 298000 |
| }, |
| { |
| "epoch": 0.5693435668945312, |
| "grad_norm": 230.48190307617188, |
| "learning_rate": 2.153291702270508e-05, |
| "lookahead_loss": 6.806265343666077, |
| "loss": 4.2646, |
| "step": 298500 |
| }, |
| { |
| "epoch": 0.5702972412109375, |
| "grad_norm": 145.06097412109375, |
| "learning_rate": 2.1485233306884766e-05, |
| "lookahead_loss": 6.776267616271973, |
| "loss": 4.2443, |
| "step": 299000 |
| }, |
| { |
| "epoch": 0.5712509155273438, |
| "grad_norm": 78.96148681640625, |
| "learning_rate": 2.1437549591064453e-05, |
| "lookahead_loss": 6.785057843208313, |
| "loss": 4.2472, |
| "step": 299500 |
| }, |
| { |
| "epoch": 0.57220458984375, |
| "grad_norm": 62.98528289794922, |
| "learning_rate": 2.1389865875244143e-05, |
| "lookahead_loss": 6.792482016563415, |
| "loss": 4.249, |
| "step": 300000 |
| }, |
| { |
| "epoch": 0.57220458984375, |
| "eval_accuracy": 0.039692172211350296, |
| "eval_lookahead_loss": 6.699759496116638, |
| "eval_lookahead_perplexity": 812.2104619054775, |
| "eval_loss": 4.1785569190979, |
| "eval_perplexity": 65.27159302848204, |
| "eval_runtime": 576.4962, |
| "eval_samples_per_second": 17.346, |
| "eval_steps_per_second": 4.337, |
| "step": 300000 |
| }, |
| { |
| "epoch": 0.5731582641601562, |
| "grad_norm": 258.02593994140625, |
| "learning_rate": 2.134218215942383e-05, |
| "lookahead_loss": 6.817351764678955, |
| "loss": 4.267, |
| "step": 300500 |
| }, |
| { |
| "epoch": 0.5741119384765625, |
| "grad_norm": 128.00973510742188, |
| "learning_rate": 2.1294498443603516e-05, |
| "lookahead_loss": 6.781159680366516, |
| "loss": 4.2407, |
| "step": 301000 |
| }, |
| { |
| "epoch": 0.5750656127929688, |
| "grad_norm": 156.14358520507812, |
| "learning_rate": 2.1246814727783203e-05, |
| "lookahead_loss": 6.7920808553695675, |
| "loss": 4.2501, |
| "step": 301500 |
| }, |
| { |
| "epoch": 0.576019287109375, |
| "grad_norm": 116.12018585205078, |
| "learning_rate": 2.119913101196289e-05, |
| "lookahead_loss": 6.767132887840271, |
| "loss": 4.2329, |
| "step": 302000 |
| }, |
| { |
| "epoch": 0.5769729614257812, |
| "grad_norm": 360.1877746582031, |
| "learning_rate": 2.115144729614258e-05, |
| "lookahead_loss": 6.78225277519226, |
| "loss": 4.2467, |
| "step": 302500 |
| }, |
| { |
| "epoch": 0.5779266357421875, |
| "grad_norm": 335.22357177734375, |
| "learning_rate": 2.1103763580322267e-05, |
| "lookahead_loss": 6.807877942085266, |
| "loss": 4.2635, |
| "step": 303000 |
| }, |
| { |
| "epoch": 0.5788803100585938, |
| "grad_norm": 65.33663177490234, |
| "learning_rate": 2.1056079864501954e-05, |
| "lookahead_loss": 6.790404032707214, |
| "loss": 4.2525, |
| "step": 303500 |
| }, |
| { |
| "epoch": 0.579833984375, |
| "grad_norm": 248.95620727539062, |
| "learning_rate": 2.100839614868164e-05, |
| "lookahead_loss": 6.755083421707154, |
| "loss": 4.2246, |
| "step": 304000 |
| }, |
| { |
| "epoch": 0.5807876586914062, |
| "grad_norm": 153.62921142578125, |
| "learning_rate": 2.0960712432861328e-05, |
| "lookahead_loss": 6.7886246490478515, |
| "loss": 4.2547, |
| "step": 304500 |
| }, |
| { |
| "epoch": 0.5817413330078125, |
| "grad_norm": 43.066688537597656, |
| "learning_rate": 2.0913028717041018e-05, |
| "lookahead_loss": 6.765781289100647, |
| "loss": 4.2383, |
| "step": 305000 |
| }, |
| { |
| "epoch": 0.5817413330078125, |
| "eval_accuracy": 0.03901506849315069, |
| "eval_lookahead_loss": 6.6950070844650265, |
| "eval_lookahead_perplexity": 808.3596609859953, |
| "eval_loss": 4.17401123046875, |
| "eval_perplexity": 64.97556203235999, |
| "eval_runtime": 609.7098, |
| "eval_samples_per_second": 16.401, |
| "eval_steps_per_second": 4.1, |
| "step": 305000 |
| }, |
| { |
| "epoch": 0.5826950073242188, |
| "grad_norm": 98.32864379882812, |
| "learning_rate": 2.0865345001220705e-05, |
| "lookahead_loss": 6.8042620153427125, |
| "loss": 4.2655, |
| "step": 305500 |
| }, |
| { |
| "epoch": 0.583648681640625, |
| "grad_norm": 76.21353149414062, |
| "learning_rate": 2.081766128540039e-05, |
| "lookahead_loss": 6.733520712852478, |
| "loss": 4.2087, |
| "step": 306000 |
| }, |
| { |
| "epoch": 0.5846023559570312, |
| "grad_norm": 99.90493774414062, |
| "learning_rate": 2.0769977569580078e-05, |
| "lookahead_loss": 6.76951104927063, |
| "loss": 4.2306, |
| "step": 306500 |
| }, |
| { |
| "epoch": 0.5855560302734375, |
| "grad_norm": 95.8613510131836, |
| "learning_rate": 2.0722293853759765e-05, |
| "lookahead_loss": 6.748024558067322, |
| "loss": 4.2209, |
| "step": 307000 |
| }, |
| { |
| "epoch": 0.5865097045898438, |
| "grad_norm": 50.98145294189453, |
| "learning_rate": 2.0674610137939455e-05, |
| "lookahead_loss": 6.750844541549682, |
| "loss": 4.2197, |
| "step": 307500 |
| }, |
| { |
| "epoch": 0.58746337890625, |
| "grad_norm": 139.3031463623047, |
| "learning_rate": 2.0626926422119142e-05, |
| "lookahead_loss": 6.826841161727906, |
| "loss": 4.2831, |
| "step": 308000 |
| }, |
| { |
| "epoch": 0.5884170532226562, |
| "grad_norm": 591.4248657226562, |
| "learning_rate": 2.057924270629883e-05, |
| "lookahead_loss": 6.8200990076065064, |
| "loss": 4.2779, |
| "step": 308500 |
| }, |
| { |
| "epoch": 0.5893707275390625, |
| "grad_norm": 44.19729995727539, |
| "learning_rate": 2.0531558990478516e-05, |
| "lookahead_loss": 6.801220169067383, |
| "loss": 4.2604, |
| "step": 309000 |
| }, |
| { |
| "epoch": 0.5903244018554688, |
| "grad_norm": 128.22125244140625, |
| "learning_rate": 2.0483875274658203e-05, |
| "lookahead_loss": 6.796990342140198, |
| "loss": 4.2533, |
| "step": 309500 |
| }, |
| { |
| "epoch": 0.591278076171875, |
| "grad_norm": 112.5945816040039, |
| "learning_rate": 2.0436191558837893e-05, |
| "lookahead_loss": 6.789237281799316, |
| "loss": 4.2473, |
| "step": 310000 |
| }, |
| { |
| "epoch": 0.591278076171875, |
| "eval_accuracy": 0.038703326810176124, |
| "eval_lookahead_loss": 6.706747333145142, |
| "eval_lookahead_perplexity": 817.9059325808217, |
| "eval_loss": 4.182686805725098, |
| "eval_perplexity": 65.5417147096937, |
| "eval_runtime": 581.2686, |
| "eval_samples_per_second": 17.204, |
| "eval_steps_per_second": 4.301, |
| "step": 310000 |
| }, |
| { |
| "epoch": 0.5922317504882812, |
| "grad_norm": 276.32080078125, |
| "learning_rate": 2.038850784301758e-05, |
| "lookahead_loss": 6.768315551757812, |
| "loss": 4.2308, |
| "step": 310500 |
| }, |
| { |
| "epoch": 0.5931854248046875, |
| "grad_norm": 209.5062713623047, |
| "learning_rate": 2.0340824127197266e-05, |
| "lookahead_loss": 6.7758984260559085, |
| "loss": 4.2418, |
| "step": 311000 |
| }, |
| { |
| "epoch": 0.5941390991210938, |
| "grad_norm": 32.1325798034668, |
| "learning_rate": 2.0293140411376953e-05, |
| "lookahead_loss": 6.77459735584259, |
| "loss": 4.236, |
| "step": 311500 |
| }, |
| { |
| "epoch": 0.5950927734375, |
| "grad_norm": 63.37489318847656, |
| "learning_rate": 2.024545669555664e-05, |
| "lookahead_loss": 6.830884791374206, |
| "loss": 4.2796, |
| "step": 312000 |
| }, |
| { |
| "epoch": 0.5960464477539062, |
| "grad_norm": 61.66058349609375, |
| "learning_rate": 2.019777297973633e-05, |
| "lookahead_loss": 6.811630078315735, |
| "loss": 4.2693, |
| "step": 312500 |
| }, |
| { |
| "epoch": 0.5970001220703125, |
| "grad_norm": 65.00341796875, |
| "learning_rate": 2.0150089263916017e-05, |
| "lookahead_loss": 6.801686690330506, |
| "loss": 4.2594, |
| "step": 313000 |
| }, |
| { |
| "epoch": 0.5979537963867188, |
| "grad_norm": 76.48503875732422, |
| "learning_rate": 2.0102405548095704e-05, |
| "lookahead_loss": 6.753870129585266, |
| "loss": 4.2166, |
| "step": 313500 |
| }, |
| { |
| "epoch": 0.598907470703125, |
| "grad_norm": 85.11914825439453, |
| "learning_rate": 2.005472183227539e-05, |
| "lookahead_loss": 6.770056640625, |
| "loss": 4.2344, |
| "step": 314000 |
| }, |
| { |
| "epoch": 0.5998611450195312, |
| "grad_norm": 116.34899139404297, |
| "learning_rate": 2.0007038116455078e-05, |
| "lookahead_loss": 6.807936497688294, |
| "loss": 4.2612, |
| "step": 314500 |
| }, |
| { |
| "epoch": 0.6008148193359375, |
| "grad_norm": 72.38105010986328, |
| "learning_rate": 1.9959354400634768e-05, |
| "lookahead_loss": 6.804626309394837, |
| "loss": 4.2612, |
| "step": 315000 |
| }, |
| { |
| "epoch": 0.6008148193359375, |
| "eval_accuracy": 0.03909256360078278, |
| "eval_lookahead_loss": 6.692839616584778, |
| "eval_lookahead_perplexity": 806.6094648172993, |
| "eval_loss": 4.17153263092041, |
| "eval_perplexity": 64.81471305610077, |
| "eval_runtime": 561.7722, |
| "eval_samples_per_second": 17.801, |
| "eval_steps_per_second": 4.45, |
| "step": 315000 |
| }, |
| { |
| "epoch": 0.6017684936523438, |
| "grad_norm": 68.6777114868164, |
| "learning_rate": 1.9911670684814455e-05, |
| "lookahead_loss": 6.783543480873108, |
| "loss": 4.2418, |
| "step": 315500 |
| }, |
| { |
| "epoch": 0.60272216796875, |
| "grad_norm": 83.95289611816406, |
| "learning_rate": 1.986398696899414e-05, |
| "lookahead_loss": 6.802493001937866, |
| "loss": 4.2615, |
| "step": 316000 |
| }, |
| { |
| "epoch": 0.6036758422851562, |
| "grad_norm": 157.76747131347656, |
| "learning_rate": 1.9816303253173828e-05, |
| "lookahead_loss": 6.765641038894653, |
| "loss": 4.2329, |
| "step": 316500 |
| }, |
| { |
| "epoch": 0.6046295166015625, |
| "grad_norm": 81.34313201904297, |
| "learning_rate": 1.9768619537353515e-05, |
| "lookahead_loss": 6.8008738670349125, |
| "loss": 4.2646, |
| "step": 317000 |
| }, |
| { |
| "epoch": 0.6055831909179688, |
| "grad_norm": 171.65989685058594, |
| "learning_rate": 1.9720935821533205e-05, |
| "lookahead_loss": 6.777490834236145, |
| "loss": 4.2458, |
| "step": 317500 |
| }, |
| { |
| "epoch": 0.606536865234375, |
| "grad_norm": 91.10609436035156, |
| "learning_rate": 1.9673252105712892e-05, |
| "lookahead_loss": 6.768827021598816, |
| "loss": 4.2402, |
| "step": 318000 |
| }, |
| { |
| "epoch": 0.6074905395507812, |
| "grad_norm": 4991.34521484375, |
| "learning_rate": 1.962556838989258e-05, |
| "lookahead_loss": 6.769472657203674, |
| "loss": 4.2413, |
| "step": 318500 |
| }, |
| { |
| "epoch": 0.6084442138671875, |
| "grad_norm": 71.24060821533203, |
| "learning_rate": 1.9577884674072266e-05, |
| "lookahead_loss": 6.768800120353699, |
| "loss": 4.235, |
| "step": 319000 |
| }, |
| { |
| "epoch": 0.6093978881835938, |
| "grad_norm": 79.60295867919922, |
| "learning_rate": 1.9530200958251953e-05, |
| "lookahead_loss": 6.790882308959961, |
| "loss": 4.252, |
| "step": 319500 |
| }, |
| { |
| "epoch": 0.6103515625, |
| "grad_norm": 172.84564208984375, |
| "learning_rate": 1.9482517242431643e-05, |
| "lookahead_loss": 6.726774211883545, |
| "loss": 4.1997, |
| "step": 320000 |
| }, |
| { |
| "epoch": 0.6103515625, |
| "eval_accuracy": 0.03917729941291585, |
| "eval_lookahead_loss": 6.687376212692261, |
| "eval_lookahead_perplexity": 802.2146477893597, |
| "eval_loss": 4.165813446044922, |
| "eval_perplexity": 64.44508372623659, |
| "eval_runtime": 559.0949, |
| "eval_samples_per_second": 17.886, |
| "eval_steps_per_second": 4.472, |
| "step": 320000 |
| }, |
| { |
| "epoch": 0.6113052368164062, |
| "grad_norm": 73.97826385498047, |
| "learning_rate": 1.943483352661133e-05, |
| "lookahead_loss": 6.7440031061172485, |
| "loss": 4.2132, |
| "step": 320500 |
| }, |
| { |
| "epoch": 0.6122589111328125, |
| "grad_norm": 46.80066680908203, |
| "learning_rate": 1.9387149810791016e-05, |
| "lookahead_loss": 6.752664255142212, |
| "loss": 4.2178, |
| "step": 321000 |
| }, |
| { |
| "epoch": 0.6132125854492188, |
| "grad_norm": 176.61981201171875, |
| "learning_rate": 1.9339466094970703e-05, |
| "lookahead_loss": 6.820150462150574, |
| "loss": 4.2742, |
| "step": 321500 |
| }, |
| { |
| "epoch": 0.614166259765625, |
| "grad_norm": 50.99040985107422, |
| "learning_rate": 1.929178237915039e-05, |
| "lookahead_loss": 6.832323289871216, |
| "loss": 4.2853, |
| "step": 322000 |
| }, |
| { |
| "epoch": 0.6151199340820312, |
| "grad_norm": 69.94245147705078, |
| "learning_rate": 1.924409866333008e-05, |
| "lookahead_loss": 6.8110770416259765, |
| "loss": 4.2677, |
| "step": 322500 |
| }, |
| { |
| "epoch": 0.6160736083984375, |
| "grad_norm": 53.37430953979492, |
| "learning_rate": 1.9196414947509767e-05, |
| "lookahead_loss": 6.806340113639831, |
| "loss": 4.2648, |
| "step": 323000 |
| }, |
| { |
| "epoch": 0.6170272827148438, |
| "grad_norm": 134.60284423828125, |
| "learning_rate": 1.9148731231689454e-05, |
| "lookahead_loss": 6.767860838890075, |
| "loss": 4.2323, |
| "step": 323500 |
| }, |
| { |
| "epoch": 0.61798095703125, |
| "grad_norm": 86.28919982910156, |
| "learning_rate": 1.910104751586914e-05, |
| "lookahead_loss": 6.806537877082825, |
| "loss": 4.2595, |
| "step": 324000 |
| }, |
| { |
| "epoch": 0.6189346313476562, |
| "grad_norm": 103.1623764038086, |
| "learning_rate": 1.9053363800048828e-05, |
| "lookahead_loss": 6.791725672721863, |
| "loss": 4.253, |
| "step": 324500 |
| }, |
| { |
| "epoch": 0.6198883056640625, |
| "grad_norm": 407.5108947753906, |
| "learning_rate": 1.9005680084228518e-05, |
| "lookahead_loss": 6.78332125377655, |
| "loss": 4.2437, |
| "step": 325000 |
| }, |
| { |
| "epoch": 0.6198883056640625, |
| "eval_accuracy": 0.04002857142857143, |
| "eval_lookahead_loss": 6.689265847015381, |
| "eval_lookahead_perplexity": 803.7319732659873, |
| "eval_loss": 4.168144702911377, |
| "eval_perplexity": 64.59549702808235, |
| "eval_runtime": 532.376, |
| "eval_samples_per_second": 18.784, |
| "eval_steps_per_second": 4.696, |
| "step": 325000 |
| }, |
| { |
| "epoch": 0.6208419799804688, |
| "grad_norm": 83.51589965820312, |
| "learning_rate": 1.8957996368408205e-05, |
| "lookahead_loss": 6.778052265167236, |
| "loss": 4.2391, |
| "step": 325500 |
| }, |
| { |
| "epoch": 0.621795654296875, |
| "grad_norm": 163.7206573486328, |
| "learning_rate": 1.891031265258789e-05, |
| "lookahead_loss": 6.768165149688721, |
| "loss": 4.2321, |
| "step": 326000 |
| }, |
| { |
| "epoch": 0.6227493286132812, |
| "grad_norm": 123.90320587158203, |
| "learning_rate": 1.8862628936767578e-05, |
| "lookahead_loss": 6.815169135093689, |
| "loss": 4.2675, |
| "step": 326500 |
| }, |
| { |
| "epoch": 0.6237030029296875, |
| "grad_norm": 72.05839538574219, |
| "learning_rate": 1.8814945220947265e-05, |
| "lookahead_loss": 6.759617408752441, |
| "loss": 4.2189, |
| "step": 327000 |
| }, |
| { |
| "epoch": 0.6246566772460938, |
| "grad_norm": 71.78340148925781, |
| "learning_rate": 1.8767261505126955e-05, |
| "lookahead_loss": 6.795024278640747, |
| "loss": 4.2507, |
| "step": 327500 |
| }, |
| { |
| "epoch": 0.6256103515625, |
| "grad_norm": 110.57073211669922, |
| "learning_rate": 1.8719577789306642e-05, |
| "lookahead_loss": 6.780984460830688, |
| "loss": 4.2395, |
| "step": 328000 |
| }, |
| { |
| "epoch": 0.6265640258789062, |
| "grad_norm": 200.6844940185547, |
| "learning_rate": 1.867189407348633e-05, |
| "lookahead_loss": 6.804348068237305, |
| "loss": 4.2607, |
| "step": 328500 |
| }, |
| { |
| "epoch": 0.6275177001953125, |
| "grad_norm": 186.4147186279297, |
| "learning_rate": 1.8624210357666016e-05, |
| "lookahead_loss": 6.759407227516174, |
| "loss": 4.2231, |
| "step": 329000 |
| }, |
| { |
| "epoch": 0.6284713745117188, |
| "grad_norm": 154.31472778320312, |
| "learning_rate": 1.8576526641845703e-05, |
| "lookahead_loss": 6.796393509864807, |
| "loss": 4.2549, |
| "step": 329500 |
| }, |
| { |
| "epoch": 0.629425048828125, |
| "grad_norm": 57.93864440917969, |
| "learning_rate": 1.8528842926025393e-05, |
| "lookahead_loss": 6.760720524787903, |
| "loss": 4.2243, |
| "step": 330000 |
| }, |
| { |
| "epoch": 0.629425048828125, |
| "eval_accuracy": 0.039936399217221134, |
| "eval_lookahead_loss": 6.681839813995361, |
| "eval_lookahead_perplexity": 797.7855396263418, |
| "eval_loss": 4.1614556312561035, |
| "eval_perplexity": 64.16485502317559, |
| "eval_runtime": 526.1126, |
| "eval_samples_per_second": 19.007, |
| "eval_steps_per_second": 4.752, |
| "step": 330000 |
| }, |
| { |
| "epoch": 0.6303787231445312, |
| "grad_norm": 57.461753845214844, |
| "learning_rate": 1.848115921020508e-05, |
| "lookahead_loss": 6.832729354858398, |
| "loss": 4.2905, |
| "step": 330500 |
| }, |
| { |
| "epoch": 0.6313323974609375, |
| "grad_norm": 89.03836822509766, |
| "learning_rate": 1.8433475494384766e-05, |
| "lookahead_loss": 6.780477429389953, |
| "loss": 4.2477, |
| "step": 331000 |
| }, |
| { |
| "epoch": 0.6322860717773438, |
| "grad_norm": 66.491455078125, |
| "learning_rate": 1.8385791778564453e-05, |
| "lookahead_loss": 6.7762909030914305, |
| "loss": 4.2381, |
| "step": 331500 |
| }, |
| { |
| "epoch": 0.63323974609375, |
| "grad_norm": 409.5155334472656, |
| "learning_rate": 1.833810806274414e-05, |
| "lookahead_loss": 6.8101942882537845, |
| "loss": 4.2704, |
| "step": 332000 |
| }, |
| { |
| "epoch": 0.6341934204101562, |
| "grad_norm": 119.67936706542969, |
| "learning_rate": 1.829042434692383e-05, |
| "lookahead_loss": 6.76052673625946, |
| "loss": 4.2297, |
| "step": 332500 |
| }, |
| { |
| "epoch": 0.6351470947265625, |
| "grad_norm": 166.02719116210938, |
| "learning_rate": 1.8242740631103517e-05, |
| "lookahead_loss": 6.7347633228302, |
| "loss": 4.214, |
| "step": 333000 |
| }, |
| { |
| "epoch": 0.6361007690429688, |
| "grad_norm": 75.54403686523438, |
| "learning_rate": 1.8195056915283204e-05, |
| "lookahead_loss": 6.744675261497497, |
| "loss": 4.2118, |
| "step": 333500 |
| }, |
| { |
| "epoch": 0.637054443359375, |
| "grad_norm": 87.48888397216797, |
| "learning_rate": 1.814737319946289e-05, |
| "lookahead_loss": 6.753357731819153, |
| "loss": 4.2182, |
| "step": 334000 |
| }, |
| { |
| "epoch": 0.6380081176757812, |
| "grad_norm": 77.82637023925781, |
| "learning_rate": 1.8099689483642578e-05, |
| "lookahead_loss": 6.758910771369934, |
| "loss": 4.2246, |
| "step": 334500 |
| }, |
| { |
| "epoch": 0.6389617919921875, |
| "grad_norm": 264.6513366699219, |
| "learning_rate": 1.8052005767822268e-05, |
| "lookahead_loss": 6.7860059032440185, |
| "loss": 4.2524, |
| "step": 335000 |
| }, |
| { |
| "epoch": 0.6389617919921875, |
| "eval_accuracy": 0.03992504892367906, |
| "eval_lookahead_loss": 6.67642790184021, |
| "eval_lookahead_perplexity": 793.4796564076045, |
| "eval_loss": 4.1580071449279785, |
| "eval_perplexity": 63.943964485784704, |
| "eval_runtime": 551.7858, |
| "eval_samples_per_second": 18.123, |
| "eval_steps_per_second": 4.531, |
| "step": 335000 |
| }, |
| { |
| "epoch": 0.6399154663085938, |
| "grad_norm": 105.51661682128906, |
| "learning_rate": 1.8004322052001955e-05, |
| "lookahead_loss": 6.819765789031982, |
| "loss": 4.277, |
| "step": 335500 |
| }, |
| { |
| "epoch": 0.640869140625, |
| "grad_norm": 104.6438980102539, |
| "learning_rate": 1.795663833618164e-05, |
| "lookahead_loss": 6.7929087829589845, |
| "loss": 4.251, |
| "step": 336000 |
| }, |
| { |
| "epoch": 0.6418228149414062, |
| "grad_norm": 118.0073013305664, |
| "learning_rate": 1.7908954620361328e-05, |
| "lookahead_loss": 6.781796964645386, |
| "loss": 4.2444, |
| "step": 336500 |
| }, |
| { |
| "epoch": 0.6427764892578125, |
| "grad_norm": 113.10981750488281, |
| "learning_rate": 1.7861270904541015e-05, |
| "lookahead_loss": 6.739816927909851, |
| "loss": 4.2108, |
| "step": 337000 |
| }, |
| { |
| "epoch": 0.6437301635742188, |
| "grad_norm": 100.2951889038086, |
| "learning_rate": 1.7813587188720705e-05, |
| "lookahead_loss": 6.777650778770447, |
| "loss": 4.2383, |
| "step": 337500 |
| }, |
| { |
| "epoch": 0.644683837890625, |
| "grad_norm": 85.64952850341797, |
| "learning_rate": 1.7765903472900392e-05, |
| "lookahead_loss": 6.780595546722412, |
| "loss": 4.2447, |
| "step": 338000 |
| }, |
| { |
| "epoch": 0.6456375122070312, |
| "grad_norm": 124.38178253173828, |
| "learning_rate": 1.771821975708008e-05, |
| "lookahead_loss": 6.778977559089661, |
| "loss": 4.2386, |
| "step": 338500 |
| }, |
| { |
| "epoch": 0.6465911865234375, |
| "grad_norm": 165.3598175048828, |
| "learning_rate": 1.7670536041259766e-05, |
| "lookahead_loss": 6.806086463928223, |
| "loss": 4.2607, |
| "step": 339000 |
| }, |
| { |
| "epoch": 0.6475448608398438, |
| "grad_norm": 109.42640686035156, |
| "learning_rate": 1.7622852325439453e-05, |
| "lookahead_loss": 6.776420484542847, |
| "loss": 4.238, |
| "step": 339500 |
| }, |
| { |
| "epoch": 0.64849853515625, |
| "grad_norm": 139.0491485595703, |
| "learning_rate": 1.7575168609619143e-05, |
| "lookahead_loss": 6.79261616230011, |
| "loss": 4.2496, |
| "step": 340000 |
| }, |
| { |
| "epoch": 0.64849853515625, |
| "eval_accuracy": 0.039570450097847355, |
| "eval_lookahead_loss": 6.682711238098144, |
| "eval_lookahead_perplexity": 798.4810521736097, |
| "eval_loss": 4.16325569152832, |
| "eval_perplexity": 64.28045964600445, |
| "eval_runtime": 530.9782, |
| "eval_samples_per_second": 18.833, |
| "eval_steps_per_second": 4.708, |
| "step": 340000 |
| }, |
| { |
| "epoch": 0.6494522094726562, |
| "grad_norm": 47.83683776855469, |
| "learning_rate": 1.752748489379883e-05, |
| "lookahead_loss": 6.77837984085083, |
| "loss": 4.2369, |
| "step": 340500 |
| }, |
| { |
| "epoch": 0.6504058837890625, |
| "grad_norm": 223.80381774902344, |
| "learning_rate": 1.7479801177978516e-05, |
| "lookahead_loss": 6.750338411331176, |
| "loss": 4.2174, |
| "step": 341000 |
| }, |
| { |
| "epoch": 0.6513595581054688, |
| "grad_norm": 128.40768432617188, |
| "learning_rate": 1.7432117462158203e-05, |
| "lookahead_loss": 6.789745329856872, |
| "loss": 4.2468, |
| "step": 341500 |
| }, |
| { |
| "epoch": 0.652313232421875, |
| "grad_norm": 64.70144653320312, |
| "learning_rate": 1.738443374633789e-05, |
| "lookahead_loss": 6.774203729629517, |
| "loss": 4.2382, |
| "step": 342000 |
| }, |
| { |
| "epoch": 0.6532669067382812, |
| "grad_norm": 93.99317169189453, |
| "learning_rate": 1.733675003051758e-05, |
| "lookahead_loss": 6.7962212581634525, |
| "loss": 4.2522, |
| "step": 342500 |
| }, |
| { |
| "epoch": 0.6542205810546875, |
| "grad_norm": 40.22169494628906, |
| "learning_rate": 1.7289066314697267e-05, |
| "lookahead_loss": 6.7743144826889035, |
| "loss": 4.2426, |
| "step": 343000 |
| }, |
| { |
| "epoch": 0.6551742553710938, |
| "grad_norm": 53.113426208496094, |
| "learning_rate": 1.7241382598876954e-05, |
| "lookahead_loss": 6.7418350315094, |
| "loss": 4.213, |
| "step": 343500 |
| }, |
| { |
| "epoch": 0.6561279296875, |
| "grad_norm": 99.87715911865234, |
| "learning_rate": 1.719369888305664e-05, |
| "lookahead_loss": 6.78454621887207, |
| "loss": 4.2411, |
| "step": 344000 |
| }, |
| { |
| "epoch": 0.6570816040039062, |
| "grad_norm": 196.7032928466797, |
| "learning_rate": 1.7146015167236328e-05, |
| "lookahead_loss": 6.752220818519592, |
| "loss": 4.2181, |
| "step": 344500 |
| }, |
| { |
| "epoch": 0.6580352783203125, |
| "grad_norm": 116.31472778320312, |
| "learning_rate": 1.7098331451416018e-05, |
| "lookahead_loss": 6.778828821182251, |
| "loss": 4.2444, |
| "step": 345000 |
| }, |
| { |
| "epoch": 0.6580352783203125, |
| "eval_accuracy": 0.03938688845401174, |
| "eval_lookahead_loss": 6.6817688535690305, |
| "eval_lookahead_perplexity": 797.7289304328597, |
| "eval_loss": 4.160607814788818, |
| "eval_perplexity": 64.110478056567, |
| "eval_runtime": 602.1944, |
| "eval_samples_per_second": 16.606, |
| "eval_steps_per_second": 4.151, |
| "step": 345000 |
| }, |
| { |
| "epoch": 0.6589889526367188, |
| "grad_norm": 77.72695922851562, |
| "learning_rate": 1.7050647735595705e-05, |
| "lookahead_loss": 6.763183795928955, |
| "loss": 4.2316, |
| "step": 345500 |
| }, |
| { |
| "epoch": 0.659942626953125, |
| "grad_norm": 271.14483642578125, |
| "learning_rate": 1.700296401977539e-05, |
| "lookahead_loss": 6.773144967079163, |
| "loss": 4.2394, |
| "step": 346000 |
| }, |
| { |
| "epoch": 0.6608963012695312, |
| "grad_norm": 201.7103729248047, |
| "learning_rate": 1.6955280303955078e-05, |
| "lookahead_loss": 6.755885152816773, |
| "loss": 4.2197, |
| "step": 346500 |
| }, |
| { |
| "epoch": 0.6618499755859375, |
| "grad_norm": 117.94550323486328, |
| "learning_rate": 1.6907596588134765e-05, |
| "lookahead_loss": 6.725367462158203, |
| "loss": 4.1968, |
| "step": 347000 |
| }, |
| { |
| "epoch": 0.6628036499023438, |
| "grad_norm": 113.2874755859375, |
| "learning_rate": 1.6859912872314455e-05, |
| "lookahead_loss": 6.736243143081665, |
| "loss": 4.2033, |
| "step": 347500 |
| }, |
| { |
| "epoch": 0.66375732421875, |
| "grad_norm": 81.23294067382812, |
| "learning_rate": 1.6812229156494142e-05, |
| "lookahead_loss": 6.739593348503113, |
| "loss": 4.2066, |
| "step": 348000 |
| }, |
| { |
| "epoch": 0.6647109985351562, |
| "grad_norm": 155.0906219482422, |
| "learning_rate": 1.676454544067383e-05, |
| "lookahead_loss": 6.833329524040222, |
| "loss": 4.2887, |
| "step": 348500 |
| }, |
| { |
| "epoch": 0.6656646728515625, |
| "grad_norm": 154.31324768066406, |
| "learning_rate": 1.6716861724853516e-05, |
| "lookahead_loss": 6.786603861808777, |
| "loss": 4.2436, |
| "step": 349000 |
| }, |
| { |
| "epoch": 0.6666183471679688, |
| "grad_norm": 131.5563201904297, |
| "learning_rate": 1.6669178009033203e-05, |
| "lookahead_loss": 6.789072910308838, |
| "loss": 4.2454, |
| "step": 349500 |
| }, |
| { |
| "epoch": 0.667572021484375, |
| "grad_norm": 110.59390258789062, |
| "learning_rate": 1.6621494293212893e-05, |
| "lookahead_loss": 6.743032719612121, |
| "loss": 4.2127, |
| "step": 350000 |
| }, |
| { |
| "epoch": 0.667572021484375, |
| "eval_accuracy": 0.04016712328767123, |
| "eval_lookahead_loss": 6.6685991918563845, |
| "eval_lookahead_perplexity": 787.2919866383941, |
| "eval_loss": 4.150493621826172, |
| "eval_perplexity": 63.465320434707415, |
| "eval_runtime": 574.0326, |
| "eval_samples_per_second": 17.421, |
| "eval_steps_per_second": 4.355, |
| "step": 350000 |
| }, |
| { |
| "epoch": 0.6685256958007812, |
| "grad_norm": 79.56262969970703, |
| "learning_rate": 1.657381057739258e-05, |
| "lookahead_loss": 6.805004561424256, |
| "loss": 4.2579, |
| "step": 350500 |
| }, |
| { |
| "epoch": 0.6694793701171875, |
| "grad_norm": 135.4939727783203, |
| "learning_rate": 1.6526126861572266e-05, |
| "lookahead_loss": 6.75914619064331, |
| "loss": 4.2197, |
| "step": 351000 |
| }, |
| { |
| "epoch": 0.6704330444335938, |
| "grad_norm": 782.4779052734375, |
| "learning_rate": 1.6478443145751953e-05, |
| "lookahead_loss": 6.789163728713989, |
| "loss": 4.2434, |
| "step": 351500 |
| }, |
| { |
| "epoch": 0.67138671875, |
| "grad_norm": 132.7369384765625, |
| "learning_rate": 1.643075942993164e-05, |
| "lookahead_loss": 6.80147934627533, |
| "loss": 4.2554, |
| "step": 352000 |
| }, |
| { |
| "epoch": 0.6723403930664062, |
| "grad_norm": 128.3571319580078, |
| "learning_rate": 1.638307571411133e-05, |
| "lookahead_loss": 6.777853595733642, |
| "loss": 4.2386, |
| "step": 352500 |
| }, |
| { |
| "epoch": 0.6732940673828125, |
| "grad_norm": 62.090885162353516, |
| "learning_rate": 1.6335391998291017e-05, |
| "lookahead_loss": 6.763955370903015, |
| "loss": 4.2248, |
| "step": 353000 |
| }, |
| { |
| "epoch": 0.6742477416992188, |
| "grad_norm": 46.80070114135742, |
| "learning_rate": 1.6287708282470704e-05, |
| "lookahead_loss": 6.739695261001587, |
| "loss": 4.2063, |
| "step": 353500 |
| }, |
| { |
| "epoch": 0.675201416015625, |
| "grad_norm": 491.0822448730469, |
| "learning_rate": 1.624002456665039e-05, |
| "lookahead_loss": 6.707343686103821, |
| "loss": 4.1805, |
| "step": 354000 |
| }, |
| { |
| "epoch": 0.6761550903320312, |
| "grad_norm": 86.55126953125, |
| "learning_rate": 1.6192340850830078e-05, |
| "lookahead_loss": 6.765286821365357, |
| "loss": 4.2276, |
| "step": 354500 |
| }, |
| { |
| "epoch": 0.6771087646484375, |
| "grad_norm": 95.26092529296875, |
| "learning_rate": 1.6144657135009768e-05, |
| "lookahead_loss": 6.744797526359558, |
| "loss": 4.2095, |
| "step": 355000 |
| }, |
| { |
| "epoch": 0.6771087646484375, |
| "eval_accuracy": 0.03898904109589041, |
| "eval_lookahead_loss": 6.677797871589661, |
| "eval_lookahead_perplexity": 794.5674444819339, |
| "eval_loss": 4.156777381896973, |
| "eval_perplexity": 63.86537689416568, |
| "eval_runtime": 546.2578, |
| "eval_samples_per_second": 18.306, |
| "eval_steps_per_second": 4.577, |
| "step": 355000 |
| }, |
| { |
| "epoch": 0.6780624389648438, |
| "grad_norm": 56.963478088378906, |
| "learning_rate": 1.6096973419189455e-05, |
| "lookahead_loss": 6.753186054229737, |
| "loss": 4.2157, |
| "step": 355500 |
| }, |
| { |
| "epoch": 0.67901611328125, |
| "grad_norm": 188.1451416015625, |
| "learning_rate": 1.604928970336914e-05, |
| "lookahead_loss": 6.750112458229065, |
| "loss": 4.2123, |
| "step": 356000 |
| }, |
| { |
| "epoch": 0.6799697875976562, |
| "grad_norm": 111.59520721435547, |
| "learning_rate": 1.6001605987548828e-05, |
| "lookahead_loss": 6.751095014572144, |
| "loss": 4.2179, |
| "step": 356500 |
| }, |
| { |
| "epoch": 0.6809234619140625, |
| "grad_norm": 70.11491394042969, |
| "learning_rate": 1.5953922271728515e-05, |
| "lookahead_loss": 6.7610349245071415, |
| "loss": 4.222, |
| "step": 357000 |
| }, |
| { |
| "epoch": 0.6818771362304688, |
| "grad_norm": 60.44892883300781, |
| "learning_rate": 1.5906238555908205e-05, |
| "lookahead_loss": 6.757164681434632, |
| "loss": 4.2175, |
| "step": 357500 |
| }, |
| { |
| "epoch": 0.682830810546875, |
| "grad_norm": 283.4502258300781, |
| "learning_rate": 1.5858554840087892e-05, |
| "lookahead_loss": 6.743275443077088, |
| "loss": 4.2107, |
| "step": 358000 |
| }, |
| { |
| "epoch": 0.6837844848632812, |
| "grad_norm": 227.9576873779297, |
| "learning_rate": 1.581087112426758e-05, |
| "lookahead_loss": 6.761423203468323, |
| "loss": 4.2215, |
| "step": 358500 |
| }, |
| { |
| "epoch": 0.6847381591796875, |
| "grad_norm": 91.83037567138672, |
| "learning_rate": 1.5763187408447266e-05, |
| "lookahead_loss": 6.773238019943237, |
| "loss": 4.232, |
| "step": 359000 |
| }, |
| { |
| "epoch": 0.6856918334960938, |
| "grad_norm": 92.19053649902344, |
| "learning_rate": 1.5715503692626953e-05, |
| "lookahead_loss": 6.76289450931549, |
| "loss": 4.2225, |
| "step": 359500 |
| }, |
| { |
| "epoch": 0.6866455078125, |
| "grad_norm": 58.52967834472656, |
| "learning_rate": 1.5667819976806643e-05, |
| "lookahead_loss": 6.762613282203675, |
| "loss": 4.2228, |
| "step": 360000 |
| }, |
| { |
| "epoch": 0.6866455078125, |
| "eval_accuracy": 0.039182387475538163, |
| "eval_lookahead_loss": 6.667040529060364, |
| "eval_lookahead_perplexity": 786.065819747813, |
| "eval_loss": 4.147232532501221, |
| "eval_perplexity": 63.258691456630295, |
| "eval_runtime": 735.927, |
| "eval_samples_per_second": 13.588, |
| "eval_steps_per_second": 3.397, |
| "step": 360000 |
| }, |
| { |
| "epoch": 0.6875991821289062, |
| "grad_norm": 119.70156860351562, |
| "learning_rate": 1.562013626098633e-05, |
| "lookahead_loss": 6.734339241027832, |
| "loss": 4.2017, |
| "step": 360500 |
| }, |
| { |
| "epoch": 0.6885528564453125, |
| "grad_norm": 86.41842651367188, |
| "learning_rate": 1.5572452545166016e-05, |
| "lookahead_loss": 6.755916650772095, |
| "loss": 4.2177, |
| "step": 361000 |
| }, |
| { |
| "epoch": 0.6895065307617188, |
| "grad_norm": 111.80638885498047, |
| "learning_rate": 1.5524768829345703e-05, |
| "lookahead_loss": 6.762873152732849, |
| "loss": 4.2231, |
| "step": 361500 |
| }, |
| { |
| "epoch": 0.690460205078125, |
| "grad_norm": 93.34003448486328, |
| "learning_rate": 1.547708511352539e-05, |
| "lookahead_loss": 6.740634337425232, |
| "loss": 4.2152, |
| "step": 362000 |
| }, |
| { |
| "epoch": 0.6914138793945312, |
| "grad_norm": 114.20498657226562, |
| "learning_rate": 1.542940139770508e-05, |
| "lookahead_loss": 6.739887097358704, |
| "loss": 4.2036, |
| "step": 362500 |
| }, |
| { |
| "epoch": 0.6923675537109375, |
| "grad_norm": 151.4510955810547, |
| "learning_rate": 1.5381717681884767e-05, |
| "lookahead_loss": 6.717738738059998, |
| "loss": 4.1854, |
| "step": 363000 |
| }, |
| { |
| "epoch": 0.6933212280273438, |
| "grad_norm": 49.312618255615234, |
| "learning_rate": 1.5334033966064454e-05, |
| "lookahead_loss": 6.685453664779663, |
| "loss": 4.1642, |
| "step": 363500 |
| }, |
| { |
| "epoch": 0.69427490234375, |
| "grad_norm": 141.49258422851562, |
| "learning_rate": 1.528635025024414e-05, |
| "lookahead_loss": 6.70034351348877, |
| "loss": 4.1749, |
| "step": 364000 |
| }, |
| { |
| "epoch": 0.6952285766601562, |
| "grad_norm": 95.40576171875, |
| "learning_rate": 1.523866653442383e-05, |
| "lookahead_loss": 6.68953297328949, |
| "loss": 4.1656, |
| "step": 364500 |
| }, |
| { |
| "epoch": 0.6961822509765625, |
| "grad_norm": 187.89523315429688, |
| "learning_rate": 1.5190982818603516e-05, |
| "lookahead_loss": 6.7389403352737425, |
| "loss": 4.2044, |
| "step": 365000 |
| }, |
| { |
| "epoch": 0.6961822509765625, |
| "eval_accuracy": 0.03970391389432485, |
| "eval_lookahead_loss": 6.665154379463196, |
| "eval_lookahead_perplexity": 784.5845793781453, |
| "eval_loss": 4.145508766174316, |
| "eval_perplexity": 63.149742182935384, |
| "eval_runtime": 614.6026, |
| "eval_samples_per_second": 16.271, |
| "eval_steps_per_second": 4.068, |
| "step": 365000 |
| }, |
| { |
| "epoch": 0.6971359252929688, |
| "grad_norm": 54.54765319824219, |
| "learning_rate": 1.5143299102783205e-05, |
| "lookahead_loss": 6.809711742401123, |
| "loss": 4.263, |
| "step": 365500 |
| }, |
| { |
| "epoch": 0.698089599609375, |
| "grad_norm": 101.66915893554688, |
| "learning_rate": 1.5095615386962891e-05, |
| "lookahead_loss": 6.785634720802308, |
| "loss": 4.2432, |
| "step": 366000 |
| }, |
| { |
| "epoch": 0.6990432739257812, |
| "grad_norm": 512.562744140625, |
| "learning_rate": 1.5047931671142578e-05, |
| "lookahead_loss": 6.807685653686524, |
| "loss": 4.264, |
| "step": 366500 |
| }, |
| { |
| "epoch": 0.6999969482421875, |
| "grad_norm": 62.921695709228516, |
| "learning_rate": 1.5000247955322267e-05, |
| "lookahead_loss": 6.809448163986206, |
| "loss": 4.2624, |
| "step": 367000 |
| }, |
| { |
| "epoch": 0.7009506225585938, |
| "grad_norm": 182.09976196289062, |
| "learning_rate": 1.4952564239501954e-05, |
| "lookahead_loss": 6.744594740867615, |
| "loss": 4.2077, |
| "step": 367500 |
| }, |
| { |
| "epoch": 0.701904296875, |
| "grad_norm": 120.20858764648438, |
| "learning_rate": 1.4904880523681642e-05, |
| "lookahead_loss": 6.7497634525299075, |
| "loss": 4.2126, |
| "step": 368000 |
| }, |
| { |
| "epoch": 0.7028579711914062, |
| "grad_norm": 149.9693603515625, |
| "learning_rate": 1.4857196807861329e-05, |
| "lookahead_loss": 6.736365944862365, |
| "loss": 4.2, |
| "step": 368500 |
| }, |
| { |
| "epoch": 0.7038116455078125, |
| "grad_norm": 396.1338806152344, |
| "learning_rate": 1.4809513092041016e-05, |
| "lookahead_loss": 6.759799390792847, |
| "loss": 4.2209, |
| "step": 369000 |
| }, |
| { |
| "epoch": 0.7047653198242188, |
| "grad_norm": 181.96615600585938, |
| "learning_rate": 1.4761829376220704e-05, |
| "lookahead_loss": 6.737936126708984, |
| "loss": 4.2008, |
| "step": 369500 |
| }, |
| { |
| "epoch": 0.705718994140625, |
| "grad_norm": 139.66781616210938, |
| "learning_rate": 1.4714145660400391e-05, |
| "lookahead_loss": 6.793075837135315, |
| "loss": 4.2489, |
| "step": 370000 |
| }, |
| { |
| "epoch": 0.705718994140625, |
| "eval_accuracy": 0.03946771037181996, |
| "eval_lookahead_loss": 6.661704420280457, |
| "eval_lookahead_perplexity": 781.8824583875486, |
| "eval_loss": 4.143365383148193, |
| "eval_perplexity": 63.01453305167847, |
| "eval_runtime": 541.7603, |
| "eval_samples_per_second": 18.458, |
| "eval_steps_per_second": 4.615, |
| "step": 370000 |
| }, |
| { |
| "epoch": 0.7066726684570312, |
| "grad_norm": 92.09716796875, |
| "learning_rate": 1.466646194458008e-05, |
| "lookahead_loss": 6.72950865650177, |
| "loss": 4.1942, |
| "step": 370500 |
| }, |
| { |
| "epoch": 0.7076263427734375, |
| "grad_norm": 78.67496490478516, |
| "learning_rate": 1.4618778228759766e-05, |
| "lookahead_loss": 6.8039822130203245, |
| "loss": 4.2545, |
| "step": 371000 |
| }, |
| { |
| "epoch": 0.7085800170898438, |
| "grad_norm": 158.746826171875, |
| "learning_rate": 1.4571094512939453e-05, |
| "lookahead_loss": 6.759525968551635, |
| "loss": 4.2187, |
| "step": 371500 |
| }, |
| { |
| "epoch": 0.70953369140625, |
| "grad_norm": 61.143348693847656, |
| "learning_rate": 1.4523410797119142e-05, |
| "lookahead_loss": 6.759423946380616, |
| "loss": 4.2222, |
| "step": 372000 |
| }, |
| { |
| "epoch": 0.7104873657226562, |
| "grad_norm": 216.4177703857422, |
| "learning_rate": 1.4475727081298829e-05, |
| "lookahead_loss": 6.772340903282165, |
| "loss": 4.2275, |
| "step": 372500 |
| }, |
| { |
| "epoch": 0.7114410400390625, |
| "grad_norm": 244.95675659179688, |
| "learning_rate": 1.4428043365478517e-05, |
| "lookahead_loss": 6.751278745651245, |
| "loss": 4.2117, |
| "step": 373000 |
| }, |
| { |
| "epoch": 0.7123947143554688, |
| "grad_norm": 124.38166809082031, |
| "learning_rate": 1.4380359649658204e-05, |
| "lookahead_loss": 6.779648446083069, |
| "loss": 4.2365, |
| "step": 373500 |
| }, |
| { |
| "epoch": 0.713348388671875, |
| "grad_norm": 144.73187255859375, |
| "learning_rate": 1.433267593383789e-05, |
| "lookahead_loss": 6.753758860588074, |
| "loss": 4.2108, |
| "step": 374000 |
| }, |
| { |
| "epoch": 0.7143020629882812, |
| "grad_norm": 76.78428649902344, |
| "learning_rate": 1.428499221801758e-05, |
| "lookahead_loss": 6.768776368141174, |
| "loss": 4.2291, |
| "step": 374500 |
| }, |
| { |
| "epoch": 0.7152557373046875, |
| "grad_norm": 73.66600799560547, |
| "learning_rate": 1.4237308502197266e-05, |
| "lookahead_loss": 6.75096876335144, |
| "loss": 4.2101, |
| "step": 375000 |
| }, |
| { |
| "epoch": 0.7152557373046875, |
| "eval_accuracy": 0.03955557729941291, |
| "eval_lookahead_loss": 6.656800045394897, |
| "eval_lookahead_perplexity": 778.0572016091816, |
| "eval_loss": 4.138411521911621, |
| "eval_perplexity": 62.70313973551051, |
| "eval_runtime": 604.6384, |
| "eval_samples_per_second": 16.539, |
| "eval_steps_per_second": 4.135, |
| "step": 375000 |
| }, |
| { |
| "epoch": 0.7162094116210938, |
| "grad_norm": 65.6556625366211, |
| "learning_rate": 1.4189624786376955e-05, |
| "lookahead_loss": 6.713685374259949, |
| "loss": 4.1854, |
| "step": 375500 |
| }, |
| { |
| "epoch": 0.7171630859375, |
| "grad_norm": 79.419189453125, |
| "learning_rate": 1.4141941070556641e-05, |
| "lookahead_loss": 6.750682616233826, |
| "loss": 4.2146, |
| "step": 376000 |
| }, |
| { |
| "epoch": 0.7181167602539062, |
| "grad_norm": 67.82014465332031, |
| "learning_rate": 1.4094257354736328e-05, |
| "lookahead_loss": 6.736169541358948, |
| "loss": 4.2009, |
| "step": 376500 |
| }, |
| { |
| "epoch": 0.7190704345703125, |
| "grad_norm": 246.75955200195312, |
| "learning_rate": 1.4046573638916017e-05, |
| "lookahead_loss": 6.7368752555847164, |
| "loss": 4.2055, |
| "step": 377000 |
| }, |
| { |
| "epoch": 0.7200241088867188, |
| "grad_norm": 441.4236145019531, |
| "learning_rate": 1.3998889923095704e-05, |
| "lookahead_loss": 6.752421786308289, |
| "loss": 4.2144, |
| "step": 377500 |
| }, |
| { |
| "epoch": 0.720977783203125, |
| "grad_norm": 72.79417419433594, |
| "learning_rate": 1.3951206207275392e-05, |
| "lookahead_loss": 6.743340139389038, |
| "loss": 4.2148, |
| "step": 378000 |
| }, |
| { |
| "epoch": 0.7219314575195312, |
| "grad_norm": 106.20365142822266, |
| "learning_rate": 1.3903522491455079e-05, |
| "lookahead_loss": 6.760519687652588, |
| "loss": 4.2189, |
| "step": 378500 |
| }, |
| { |
| "epoch": 0.7228851318359375, |
| "grad_norm": 190.8138427734375, |
| "learning_rate": 1.3855838775634766e-05, |
| "lookahead_loss": 6.73517029762268, |
| "loss": 4.1994, |
| "step": 379000 |
| }, |
| { |
| "epoch": 0.7238388061523438, |
| "grad_norm": 350.27081298828125, |
| "learning_rate": 1.3808155059814454e-05, |
| "lookahead_loss": 6.734821619987488, |
| "loss": 4.1995, |
| "step": 379500 |
| }, |
| { |
| "epoch": 0.72479248046875, |
| "grad_norm": 191.59182739257812, |
| "learning_rate": 1.3760471343994141e-05, |
| "lookahead_loss": 6.75331635761261, |
| "loss": 4.2117, |
| "step": 380000 |
| }, |
| { |
| "epoch": 0.72479248046875, |
| "eval_accuracy": 0.03951017612524462, |
| "eval_lookahead_loss": 6.6607113904953, |
| "eval_lookahead_perplexity": 781.106411200636, |
| "eval_loss": 4.1406354904174805, |
| "eval_perplexity": 62.842744724509245, |
| "eval_runtime": 628.139, |
| "eval_samples_per_second": 15.92, |
| "eval_steps_per_second": 3.98, |
| "step": 380000 |
| }, |
| { |
| "epoch": 0.7257461547851562, |
| "grad_norm": 559.4917602539062, |
| "learning_rate": 1.371278762817383e-05, |
| "lookahead_loss": 6.772550243377686, |
| "loss": 4.2324, |
| "step": 380500 |
| }, |
| { |
| "epoch": 0.7266998291015625, |
| "grad_norm": 54.30452346801758, |
| "learning_rate": 1.3665103912353516e-05, |
| "lookahead_loss": 6.759418657302857, |
| "loss": 4.2196, |
| "step": 381000 |
| }, |
| { |
| "epoch": 0.7276535034179688, |
| "grad_norm": 51.869407653808594, |
| "learning_rate": 1.3617420196533203e-05, |
| "lookahead_loss": 6.747075421333313, |
| "loss": 4.2136, |
| "step": 381500 |
| }, |
| { |
| "epoch": 0.728607177734375, |
| "grad_norm": 122.09829711914062, |
| "learning_rate": 1.3569736480712892e-05, |
| "lookahead_loss": 6.7093631448745725, |
| "loss": 4.1885, |
| "step": 382000 |
| }, |
| { |
| "epoch": 0.7295608520507812, |
| "grad_norm": 104.17369842529297, |
| "learning_rate": 1.3522052764892579e-05, |
| "lookahead_loss": 6.681021654129029, |
| "loss": 4.1599, |
| "step": 382500 |
| }, |
| { |
| "epoch": 0.7305145263671875, |
| "grad_norm": 126.83779907226562, |
| "learning_rate": 1.3474369049072265e-05, |
| "lookahead_loss": 6.719450049400329, |
| "loss": 4.1857, |
| "step": 383000 |
| }, |
| { |
| "epoch": 0.7314682006835938, |
| "grad_norm": 131.89505004882812, |
| "learning_rate": 1.3426685333251954e-05, |
| "lookahead_loss": 6.710157048225403, |
| "loss": 4.1801, |
| "step": 383500 |
| }, |
| { |
| "epoch": 0.732421875, |
| "grad_norm": 97.56663513183594, |
| "learning_rate": 1.337900161743164e-05, |
| "lookahead_loss": 6.694256157875061, |
| "loss": 4.1664, |
| "step": 384000 |
| }, |
| { |
| "epoch": 0.7333755493164062, |
| "grad_norm": 114.05889892578125, |
| "learning_rate": 1.333131790161133e-05, |
| "lookahead_loss": 6.685075366020203, |
| "loss": 4.1592, |
| "step": 384500 |
| }, |
| { |
| "epoch": 0.7343292236328125, |
| "grad_norm": 131.41526794433594, |
| "learning_rate": 1.3283634185791016e-05, |
| "lookahead_loss": 6.7366541376113895, |
| "loss": 4.2015, |
| "step": 385000 |
| }, |
| { |
| "epoch": 0.7343292236328125, |
| "eval_accuracy": 0.03966086105675147, |
| "eval_lookahead_loss": 6.653297366905212, |
| "eval_lookahead_perplexity": 775.33668471481, |
| "eval_loss": 4.135571002960205, |
| "eval_perplexity": 62.525283001078016, |
| "eval_runtime": 602.4062, |
| "eval_samples_per_second": 16.6, |
| "eval_steps_per_second": 4.15, |
| "step": 385000 |
| }, |
| { |
| "epoch": 0.7352828979492188, |
| "grad_norm": 175.20257568359375, |
| "learning_rate": 1.3235950469970703e-05, |
| "lookahead_loss": 6.774082406997681, |
| "loss": 4.2338, |
| "step": 385500 |
| }, |
| { |
| "epoch": 0.736236572265625, |
| "grad_norm": 783.3842163085938, |
| "learning_rate": 1.3188266754150391e-05, |
| "lookahead_loss": 6.774978160858154, |
| "loss": 4.2347, |
| "step": 386000 |
| }, |
| { |
| "epoch": 0.7371902465820312, |
| "grad_norm": 87.95611572265625, |
| "learning_rate": 1.3140583038330078e-05, |
| "lookahead_loss": 6.763265552520752, |
| "loss": 4.2226, |
| "step": 386500 |
| }, |
| { |
| "epoch": 0.7381439208984375, |
| "grad_norm": 97.0749282836914, |
| "learning_rate": 1.3092899322509767e-05, |
| "lookahead_loss": 6.743141615867615, |
| "loss": 4.2081, |
| "step": 387000 |
| }, |
| { |
| "epoch": 0.7390975952148438, |
| "grad_norm": 88.42437744140625, |
| "learning_rate": 1.3045215606689454e-05, |
| "lookahead_loss": 6.724607077598572, |
| "loss": 4.1922, |
| "step": 387500 |
| }, |
| { |
| "epoch": 0.74005126953125, |
| "grad_norm": 719.595703125, |
| "learning_rate": 1.299753189086914e-05, |
| "lookahead_loss": 6.778698437690735, |
| "loss": 4.2375, |
| "step": 388000 |
| }, |
| { |
| "epoch": 0.7410049438476562, |
| "grad_norm": 80.32106018066406, |
| "learning_rate": 1.2949848175048829e-05, |
| "lookahead_loss": 6.75388152885437, |
| "loss": 4.2147, |
| "step": 388500 |
| }, |
| { |
| "epoch": 0.7419586181640625, |
| "grad_norm": 89.23448181152344, |
| "learning_rate": 1.2902164459228516e-05, |
| "lookahead_loss": 6.729073187351227, |
| "loss": 4.1971, |
| "step": 389000 |
| }, |
| { |
| "epoch": 0.7429122924804688, |
| "grad_norm": 270.7666931152344, |
| "learning_rate": 1.2854480743408204e-05, |
| "lookahead_loss": 6.779856322288513, |
| "loss": 4.2368, |
| "step": 389500 |
| }, |
| { |
| "epoch": 0.743865966796875, |
| "grad_norm": 88.04039764404297, |
| "learning_rate": 1.2806797027587891e-05, |
| "lookahead_loss": 6.767697734832764, |
| "loss": 4.2242, |
| "step": 390000 |
| }, |
| { |
| "epoch": 0.743865966796875, |
| "eval_accuracy": 0.03954579256360078, |
| "eval_lookahead_loss": 6.652922348213195, |
| "eval_lookahead_perplexity": 775.0459734799163, |
| "eval_loss": 4.134410381317139, |
| "eval_perplexity": 62.4527569002106, |
| "eval_runtime": 545.6556, |
| "eval_samples_per_second": 18.327, |
| "eval_steps_per_second": 4.582, |
| "step": 390000 |
| }, |
| { |
| "epoch": 0.7448196411132812, |
| "grad_norm": 162.0526885986328, |
| "learning_rate": 1.2759113311767578e-05, |
| "lookahead_loss": 6.741535930633545, |
| "loss": 4.2058, |
| "step": 390500 |
| }, |
| { |
| "epoch": 0.7457733154296875, |
| "grad_norm": 279.76202392578125, |
| "learning_rate": 1.2711429595947266e-05, |
| "lookahead_loss": 6.76415267086029, |
| "loss": 4.2225, |
| "step": 391000 |
| }, |
| { |
| "epoch": 0.7467269897460938, |
| "grad_norm": 85.63066101074219, |
| "learning_rate": 1.2663745880126953e-05, |
| "lookahead_loss": 6.719045895576477, |
| "loss": 4.1871, |
| "step": 391500 |
| }, |
| { |
| "epoch": 0.7476806640625, |
| "grad_norm": 76.89558410644531, |
| "learning_rate": 1.2616062164306642e-05, |
| "lookahead_loss": 6.722171626091003, |
| "loss": 4.1883, |
| "step": 392000 |
| }, |
| { |
| "epoch": 0.7486343383789062, |
| "grad_norm": 80.84141540527344, |
| "learning_rate": 1.2568378448486329e-05, |
| "lookahead_loss": 6.738049573898316, |
| "loss": 4.1998, |
| "step": 392500 |
| }, |
| { |
| "epoch": 0.7495880126953125, |
| "grad_norm": 71.48626708984375, |
| "learning_rate": 1.2520694732666015e-05, |
| "lookahead_loss": 6.722238230705261, |
| "loss": 4.1868, |
| "step": 393000 |
| }, |
| { |
| "epoch": 0.7505416870117188, |
| "grad_norm": 89.6850357055664, |
| "learning_rate": 1.2473011016845704e-05, |
| "lookahead_loss": 6.74547945022583, |
| "loss": 4.2057, |
| "step": 393500 |
| }, |
| { |
| "epoch": 0.751495361328125, |
| "grad_norm": 145.5947723388672, |
| "learning_rate": 1.242532730102539e-05, |
| "lookahead_loss": 6.7350228714942935, |
| "loss": 4.2011, |
| "step": 394000 |
| }, |
| { |
| "epoch": 0.7524490356445312, |
| "grad_norm": 133.01275634765625, |
| "learning_rate": 1.237764358520508e-05, |
| "lookahead_loss": 6.739734363555908, |
| "loss": 4.2028, |
| "step": 394500 |
| }, |
| { |
| "epoch": 0.7534027099609375, |
| "grad_norm": 125.26770782470703, |
| "learning_rate": 1.2329959869384766e-05, |
| "lookahead_loss": 6.728721881866455, |
| "loss": 4.1964, |
| "step": 395000 |
| }, |
| { |
| "epoch": 0.7534027099609375, |
| "eval_accuracy": 0.039690606653620356, |
| "eval_lookahead_loss": 6.6506430501937865, |
| "eval_lookahead_perplexity": 773.2814244580899, |
| "eval_loss": 4.131379127502441, |
| "eval_perplexity": 62.263733376494, |
| "eval_runtime": 548.2699, |
| "eval_samples_per_second": 18.239, |
| "eval_steps_per_second": 4.56, |
| "step": 395000 |
| }, |
| { |
| "epoch": 0.7543563842773438, |
| "grad_norm": 255.24940490722656, |
| "learning_rate": 1.2282276153564453e-05, |
| "lookahead_loss": 6.707877191543579, |
| "loss": 4.18, |
| "step": 395500 |
| }, |
| { |
| "epoch": 0.75531005859375, |
| "grad_norm": 220.05422973632812, |
| "learning_rate": 1.2234592437744141e-05, |
| "lookahead_loss": 6.661872485160828, |
| "loss": 4.1398, |
| "step": 396000 |
| }, |
| { |
| "epoch": 0.7562637329101562, |
| "grad_norm": 79.41294860839844, |
| "learning_rate": 1.2186908721923828e-05, |
| "lookahead_loss": 6.725895358085633, |
| "loss": 4.1975, |
| "step": 396500 |
| }, |
| { |
| "epoch": 0.7572174072265625, |
| "grad_norm": 86.21733093261719, |
| "learning_rate": 1.2139225006103517e-05, |
| "lookahead_loss": 6.736812241554261, |
| "loss": 4.2011, |
| "step": 397000 |
| }, |
| { |
| "epoch": 0.7581710815429688, |
| "grad_norm": 159.94407653808594, |
| "learning_rate": 1.2091541290283204e-05, |
| "lookahead_loss": 6.746391032218933, |
| "loss": 4.2101, |
| "step": 397500 |
| }, |
| { |
| "epoch": 0.759124755859375, |
| "grad_norm": 218.86178588867188, |
| "learning_rate": 1.204385757446289e-05, |
| "lookahead_loss": 6.716503149986267, |
| "loss": 4.1848, |
| "step": 398000 |
| }, |
| { |
| "epoch": 0.7600784301757812, |
| "grad_norm": 52.67180252075195, |
| "learning_rate": 1.1996173858642579e-05, |
| "lookahead_loss": 6.739886863708496, |
| "loss": 4.2042, |
| "step": 398500 |
| }, |
| { |
| "epoch": 0.7610321044921875, |
| "grad_norm": 92.24057006835938, |
| "learning_rate": 1.1948490142822266e-05, |
| "lookahead_loss": 6.717536543846131, |
| "loss": 4.1951, |
| "step": 399000 |
| }, |
| { |
| "epoch": 0.7619857788085938, |
| "grad_norm": 300.2378845214844, |
| "learning_rate": 1.1900806427001954e-05, |
| "lookahead_loss": 6.713908068656921, |
| "loss": 4.1851, |
| "step": 399500 |
| }, |
| { |
| "epoch": 0.762939453125, |
| "grad_norm": 135.1028289794922, |
| "learning_rate": 1.1853122711181641e-05, |
| "lookahead_loss": 6.724124988555908, |
| "loss": 4.1939, |
| "step": 400000 |
| }, |
| { |
| "epoch": 0.762939453125, |
| "eval_accuracy": 0.04010939334637965, |
| "eval_lookahead_loss": 6.6492425035476685, |
| "eval_lookahead_perplexity": 772.1991658063292, |
| "eval_loss": 4.129917621612549, |
| "eval_perplexity": 62.17280102870445, |
| "eval_runtime": 629.1957, |
| "eval_samples_per_second": 15.893, |
| "eval_steps_per_second": 3.973, |
| "step": 400000 |
| }, |
| { |
| "epoch": 0.7638931274414062, |
| "grad_norm": 92.46707916259766, |
| "learning_rate": 1.1805438995361328e-05, |
| "lookahead_loss": 6.714075137138367, |
| "loss": 4.1881, |
| "step": 400500 |
| }, |
| { |
| "epoch": 0.7648468017578125, |
| "grad_norm": 137.0272979736328, |
| "learning_rate": 1.1757755279541016e-05, |
| "lookahead_loss": 6.693613886833191, |
| "loss": 4.1649, |
| "step": 401000 |
| }, |
| { |
| "epoch": 0.7658004760742188, |
| "grad_norm": 141.4224090576172, |
| "learning_rate": 1.1710071563720703e-05, |
| "lookahead_loss": 6.686843331336975, |
| "loss": 4.1573, |
| "step": 401500 |
| }, |
| { |
| "epoch": 0.766754150390625, |
| "grad_norm": 323.1603088378906, |
| "learning_rate": 1.1662387847900392e-05, |
| "lookahead_loss": 6.700284439086914, |
| "loss": 4.1691, |
| "step": 402000 |
| }, |
| { |
| "epoch": 0.7677078247070312, |
| "grad_norm": 100.39446258544922, |
| "learning_rate": 1.1614704132080079e-05, |
| "lookahead_loss": 6.672040264129639, |
| "loss": 4.1457, |
| "step": 402500 |
| }, |
| { |
| "epoch": 0.7686614990234375, |
| "grad_norm": 110.9265365600586, |
| "learning_rate": 1.1567020416259765e-05, |
| "lookahead_loss": 6.752053305625916, |
| "loss": 4.2114, |
| "step": 403000 |
| }, |
| { |
| "epoch": 0.7696151733398438, |
| "grad_norm": 66.57839965820312, |
| "learning_rate": 1.1519336700439454e-05, |
| "lookahead_loss": 6.693446171760559, |
| "loss": 4.1671, |
| "step": 403500 |
| }, |
| { |
| "epoch": 0.77056884765625, |
| "grad_norm": 1101.944091796875, |
| "learning_rate": 1.147165298461914e-05, |
| "lookahead_loss": 6.747435054779053, |
| "loss": 4.2106, |
| "step": 404000 |
| }, |
| { |
| "epoch": 0.7715225219726562, |
| "grad_norm": 216.44805908203125, |
| "learning_rate": 1.142396926879883e-05, |
| "lookahead_loss": 6.790946921348572, |
| "loss": 4.2487, |
| "step": 404500 |
| }, |
| { |
| "epoch": 0.7724761962890625, |
| "grad_norm": 115.8578109741211, |
| "learning_rate": 1.1376285552978516e-05, |
| "lookahead_loss": 6.77033796787262, |
| "loss": 4.229, |
| "step": 405000 |
| }, |
| { |
| "epoch": 0.7724761962890625, |
| "eval_accuracy": 0.039693933463796474, |
| "eval_lookahead_loss": 6.644717829895019, |
| "eval_lookahead_perplexity": 768.713109167925, |
| "eval_loss": 4.127577781677246, |
| "eval_perplexity": 62.02749668671223, |
| "eval_runtime": 541.354, |
| "eval_samples_per_second": 18.472, |
| "eval_steps_per_second": 4.618, |
| "step": 405000 |
| }, |
| { |
| "epoch": 0.7734298706054688, |
| "grad_norm": 124.61088562011719, |
| "learning_rate": 1.1328601837158203e-05, |
| "lookahead_loss": 6.752311071395874, |
| "loss": 4.216, |
| "step": 405500 |
| }, |
| { |
| "epoch": 0.774383544921875, |
| "grad_norm": 96.72774505615234, |
| "learning_rate": 1.1280918121337891e-05, |
| "lookahead_loss": 6.769245140075683, |
| "loss": 4.2302, |
| "step": 406000 |
| }, |
| { |
| "epoch": 0.7753372192382812, |
| "grad_norm": 114.72098541259766, |
| "learning_rate": 1.1233234405517578e-05, |
| "lookahead_loss": 6.730911369323731, |
| "loss": 4.1923, |
| "step": 406500 |
| }, |
| { |
| "epoch": 0.7762908935546875, |
| "grad_norm": 151.98924255371094, |
| "learning_rate": 1.1185550689697267e-05, |
| "lookahead_loss": 6.768715893745422, |
| "loss": 4.2234, |
| "step": 407000 |
| }, |
| { |
| "epoch": 0.7772445678710938, |
| "grad_norm": 148.03590393066406, |
| "learning_rate": 1.1137866973876954e-05, |
| "lookahead_loss": 6.697064418494701, |
| "loss": 4.177, |
| "step": 407500 |
| }, |
| { |
| "epoch": 0.7781982421875, |
| "grad_norm": 95.94850158691406, |
| "learning_rate": 1.109018325805664e-05, |
| "lookahead_loss": 6.716143970489502, |
| "loss": 4.1869, |
| "step": 408000 |
| }, |
| { |
| "epoch": 0.7791519165039062, |
| "grad_norm": 67.62986755371094, |
| "learning_rate": 1.1042499542236329e-05, |
| "lookahead_loss": 6.7381351261138915, |
| "loss": 4.1996, |
| "step": 408500 |
| }, |
| { |
| "epoch": 0.7801055908203125, |
| "grad_norm": 70.48086547851562, |
| "learning_rate": 1.0994815826416016e-05, |
| "lookahead_loss": 6.740923089027405, |
| "loss": 4.1992, |
| "step": 409000 |
| }, |
| { |
| "epoch": 0.7810592651367188, |
| "grad_norm": 97.19287109375, |
| "learning_rate": 1.0947132110595704e-05, |
| "lookahead_loss": 6.766435678482056, |
| "loss": 4.221, |
| "step": 409500 |
| }, |
| { |
| "epoch": 0.782012939453125, |
| "grad_norm": 64.90044403076172, |
| "learning_rate": 1.0899448394775391e-05, |
| "lookahead_loss": 6.721424859046936, |
| "loss": 4.1805, |
| "step": 410000 |
| }, |
| { |
| "epoch": 0.782012939453125, |
| "eval_accuracy": 0.039914677103718196, |
| "eval_lookahead_loss": 6.640819017601014, |
| "eval_lookahead_perplexity": 765.7218759648092, |
| "eval_loss": 4.122921943664551, |
| "eval_perplexity": 61.73937794735351, |
| "eval_runtime": 544.2769, |
| "eval_samples_per_second": 18.373, |
| "eval_steps_per_second": 4.593, |
| "step": 410000 |
| }, |
| { |
| "epoch": 0.7829666137695312, |
| "grad_norm": 456.87994384765625, |
| "learning_rate": 1.0851764678955078e-05, |
| "lookahead_loss": 6.679848365783691, |
| "loss": 4.1521, |
| "step": 410500 |
| }, |
| { |
| "epoch": 0.7839202880859375, |
| "grad_norm": 108.23741149902344, |
| "learning_rate": 1.0804080963134766e-05, |
| "lookahead_loss": 6.736476675987244, |
| "loss": 4.1969, |
| "step": 411000 |
| }, |
| { |
| "epoch": 0.7848739624023438, |
| "grad_norm": 195.4386749267578, |
| "learning_rate": 1.0756397247314453e-05, |
| "lookahead_loss": 6.749841248512268, |
| "loss": 4.2073, |
| "step": 411500 |
| }, |
| { |
| "epoch": 0.78582763671875, |
| "grad_norm": 264.2134704589844, |
| "learning_rate": 1.0708713531494142e-05, |
| "lookahead_loss": 6.739609439849853, |
| "loss": 4.1997, |
| "step": 412000 |
| }, |
| { |
| "epoch": 0.7867813110351562, |
| "grad_norm": 114.15167999267578, |
| "learning_rate": 1.0661029815673829e-05, |
| "lookahead_loss": 6.74979611492157, |
| "loss": 4.2068, |
| "step": 412500 |
| }, |
| { |
| "epoch": 0.7877349853515625, |
| "grad_norm": 96.55416870117188, |
| "learning_rate": 1.0613346099853515e-05, |
| "lookahead_loss": 6.75324995136261, |
| "loss": 4.21, |
| "step": 413000 |
| }, |
| { |
| "epoch": 0.7886886596679688, |
| "grad_norm": 66.29523468017578, |
| "learning_rate": 1.0565662384033204e-05, |
| "lookahead_loss": 6.745070665359497, |
| "loss": 4.2114, |
| "step": 413500 |
| }, |
| { |
| "epoch": 0.789642333984375, |
| "grad_norm": 186.03321838378906, |
| "learning_rate": 1.051797866821289e-05, |
| "lookahead_loss": 6.733612382888794, |
| "loss": 4.1965, |
| "step": 414000 |
| }, |
| { |
| "epoch": 0.7905960083007812, |
| "grad_norm": 130.6208953857422, |
| "learning_rate": 1.047029495239258e-05, |
| "lookahead_loss": 6.736516487121582, |
| "loss": 4.2007, |
| "step": 414500 |
| }, |
| { |
| "epoch": 0.7915496826171875, |
| "grad_norm": 209.33946228027344, |
| "learning_rate": 1.0422611236572266e-05, |
| "lookahead_loss": 6.745011897087097, |
| "loss": 4.2036, |
| "step": 415000 |
| }, |
| { |
| "epoch": 0.7915496826171875, |
| "eval_accuracy": 0.039369275929549905, |
| "eval_lookahead_loss": 6.6451485158920285, |
| "eval_lookahead_perplexity": 769.0442544444592, |
| "eval_loss": 4.126081943511963, |
| "eval_perplexity": 61.93478294953203, |
| "eval_runtime": 531.8455, |
| "eval_samples_per_second": 18.802, |
| "eval_steps_per_second": 4.701, |
| "step": 415000 |
| }, |
| { |
| "epoch": 0.7925033569335938, |
| "grad_norm": 96.8683853149414, |
| "learning_rate": 1.0374927520751953e-05, |
| "lookahead_loss": 6.726663240432739, |
| "loss": 4.1929, |
| "step": 415500 |
| }, |
| { |
| "epoch": 0.79345703125, |
| "grad_norm": 87.13639068603516, |
| "learning_rate": 1.0327243804931641e-05, |
| "lookahead_loss": 6.71943754863739, |
| "loss": 4.1894, |
| "step": 416000 |
| }, |
| { |
| "epoch": 0.7944107055664062, |
| "grad_norm": 179.15228271484375, |
| "learning_rate": 1.0279560089111328e-05, |
| "lookahead_loss": 6.736039206504822, |
| "loss": 4.1963, |
| "step": 416500 |
| }, |
| { |
| "epoch": 0.7953643798828125, |
| "grad_norm": 179.32254028320312, |
| "learning_rate": 1.0231876373291017e-05, |
| "lookahead_loss": 6.745444897651672, |
| "loss": 4.2135, |
| "step": 417000 |
| }, |
| { |
| "epoch": 0.7963180541992188, |
| "grad_norm": 68.59551239013672, |
| "learning_rate": 1.0184192657470704e-05, |
| "lookahead_loss": 6.708368253707886, |
| "loss": 4.176, |
| "step": 417500 |
| }, |
| { |
| "epoch": 0.797271728515625, |
| "grad_norm": 133.32241821289062, |
| "learning_rate": 1.013650894165039e-05, |
| "lookahead_loss": 6.733284886360169, |
| "loss": 4.1957, |
| "step": 418000 |
| }, |
| { |
| "epoch": 0.7982254028320312, |
| "grad_norm": 125.75166320800781, |
| "learning_rate": 1.0088825225830079e-05, |
| "lookahead_loss": 6.712747953414917, |
| "loss": 4.1804, |
| "step": 418500 |
| }, |
| { |
| "epoch": 0.7991790771484375, |
| "grad_norm": 190.40147399902344, |
| "learning_rate": 1.0041141510009766e-05, |
| "lookahead_loss": 6.675843842506409, |
| "loss": 4.1519, |
| "step": 419000 |
| }, |
| { |
| "epoch": 0.8001327514648438, |
| "grad_norm": 71.69327545166016, |
| "learning_rate": 9.993457794189454e-06, |
| "lookahead_loss": 6.696978316307068, |
| "loss": 4.1642, |
| "step": 419500 |
| }, |
| { |
| "epoch": 0.80108642578125, |
| "grad_norm": 373.5713806152344, |
| "learning_rate": 9.945774078369141e-06, |
| "lookahead_loss": 6.698423825263977, |
| "loss": 4.169, |
| "step": 420000 |
| }, |
| { |
| "epoch": 0.80108642578125, |
| "eval_accuracy": 0.04042915851272016, |
| "eval_lookahead_loss": 6.636352127075195, |
| "eval_lookahead_perplexity": 762.3091080765332, |
| "eval_loss": 4.119161128997803, |
| "eval_perplexity": 61.507623654880035, |
| "eval_runtime": 518.8657, |
| "eval_samples_per_second": 19.273, |
| "eval_steps_per_second": 4.818, |
| "step": 420000 |
| }, |
| { |
| "epoch": 0.8020401000976562, |
| "grad_norm": 104.65477752685547, |
| "learning_rate": 9.898090362548828e-06, |
| "lookahead_loss": 6.708876605033875, |
| "loss": 4.179, |
| "step": 420500 |
| }, |
| { |
| "epoch": 0.8029937744140625, |
| "grad_norm": 215.17054748535156, |
| "learning_rate": 9.850406646728516e-06, |
| "lookahead_loss": 6.743595356941223, |
| "loss": 4.2033, |
| "step": 421000 |
| }, |
| { |
| "epoch": 0.8039474487304688, |
| "grad_norm": 253.47689819335938, |
| "learning_rate": 9.802722930908203e-06, |
| "lookahead_loss": 6.770770524024964, |
| "loss": 4.2282, |
| "step": 421500 |
| }, |
| { |
| "epoch": 0.804901123046875, |
| "grad_norm": 266.0059509277344, |
| "learning_rate": 9.755039215087892e-06, |
| "lookahead_loss": 6.756140286445618, |
| "loss": 4.213, |
| "step": 422000 |
| }, |
| { |
| "epoch": 0.8058547973632812, |
| "grad_norm": 100.93340301513672, |
| "learning_rate": 9.707355499267579e-06, |
| "lookahead_loss": 6.763956332206726, |
| "loss": 4.2181, |
| "step": 422500 |
| }, |
| { |
| "epoch": 0.8068084716796875, |
| "grad_norm": 181.82791137695312, |
| "learning_rate": 9.659671783447265e-06, |
| "lookahead_loss": 6.751832070350647, |
| "loss": 4.2097, |
| "step": 423000 |
| }, |
| { |
| "epoch": 0.8077621459960938, |
| "grad_norm": 150.0238800048828, |
| "learning_rate": 9.611988067626954e-06, |
| "lookahead_loss": 6.761205335617065, |
| "loss": 4.2169, |
| "step": 423500 |
| }, |
| { |
| "epoch": 0.8087158203125, |
| "grad_norm": 148.784912109375, |
| "learning_rate": 9.56430435180664e-06, |
| "lookahead_loss": 6.7160528411865235, |
| "loss": 4.1819, |
| "step": 424000 |
| }, |
| { |
| "epoch": 0.8096694946289062, |
| "grad_norm": 138.87332153320312, |
| "learning_rate": 9.51662063598633e-06, |
| "lookahead_loss": 6.747767681121826, |
| "loss": 4.2099, |
| "step": 424500 |
| }, |
| { |
| "epoch": 0.8106231689453125, |
| "grad_norm": 266.2472229003906, |
| "learning_rate": 9.468936920166016e-06, |
| "lookahead_loss": 6.713744059562683, |
| "loss": 4.1815, |
| "step": 425000 |
| }, |
| { |
| "epoch": 0.8106231689453125, |
| "eval_accuracy": 0.039504892367906067, |
| "eval_lookahead_loss": 6.6398556581497195, |
| "eval_lookahead_perplexity": 764.984565762934, |
| "eval_loss": 4.120965957641602, |
| "eval_perplexity": 61.618734613823904, |
| "eval_runtime": 522.4957, |
| "eval_samples_per_second": 19.139, |
| "eval_steps_per_second": 4.785, |
| "step": 425000 |
| }, |
| { |
| "epoch": 0.8115768432617188, |
| "grad_norm": 103.43211364746094, |
| "learning_rate": 9.421253204345703e-06, |
| "lookahead_loss": 6.73514714717865, |
| "loss": 4.1941, |
| "step": 425500 |
| }, |
| { |
| "epoch": 0.812530517578125, |
| "grad_norm": 361.7665100097656, |
| "learning_rate": 9.373569488525391e-06, |
| "lookahead_loss": 6.74234337425232, |
| "loss": 4.2007, |
| "step": 426000 |
| }, |
| { |
| "epoch": 0.8134841918945312, |
| "grad_norm": 87.8446273803711, |
| "learning_rate": 9.325885772705078e-06, |
| "lookahead_loss": 6.754991671562195, |
| "loss": 4.2121, |
| "step": 426500 |
| }, |
| { |
| "epoch": 0.8144378662109375, |
| "grad_norm": 76.55017852783203, |
| "learning_rate": 9.278202056884767e-06, |
| "lookahead_loss": 6.745593297958374, |
| "loss": 4.2036, |
| "step": 427000 |
| }, |
| { |
| "epoch": 0.8153915405273438, |
| "grad_norm": 218.13674926757812, |
| "learning_rate": 9.230518341064454e-06, |
| "lookahead_loss": 6.726083401679992, |
| "loss": 4.1887, |
| "step": 427500 |
| }, |
| { |
| "epoch": 0.81634521484375, |
| "grad_norm": 123.31546783447266, |
| "learning_rate": 9.18283462524414e-06, |
| "lookahead_loss": 6.736561235427857, |
| "loss": 4.1959, |
| "step": 428000 |
| }, |
| { |
| "epoch": 0.8172988891601562, |
| "grad_norm": 85.19544219970703, |
| "learning_rate": 9.135150909423829e-06, |
| "lookahead_loss": 6.7593774347305295, |
| "loss": 4.2144, |
| "step": 428500 |
| }, |
| { |
| "epoch": 0.8182525634765625, |
| "grad_norm": 498.4485168457031, |
| "learning_rate": 9.087467193603516e-06, |
| "lookahead_loss": 6.723316964149475, |
| "loss": 4.1825, |
| "step": 429000 |
| }, |
| { |
| "epoch": 0.8192062377929688, |
| "grad_norm": 601.1961669921875, |
| "learning_rate": 9.039783477783204e-06, |
| "lookahead_loss": 6.732193453788757, |
| "loss": 4.1938, |
| "step": 429500 |
| }, |
| { |
| "epoch": 0.820159912109375, |
| "grad_norm": 103.03712463378906, |
| "learning_rate": 8.992099761962891e-06, |
| "lookahead_loss": 6.727562563896179, |
| "loss": 4.189, |
| "step": 430000 |
| }, |
| { |
| "epoch": 0.820159912109375, |
| "eval_accuracy": 0.03966301369863014, |
| "eval_lookahead_loss": 6.636439321517944, |
| "eval_lookahead_perplexity": 762.3755800923673, |
| "eval_loss": 4.118071556091309, |
| "eval_perplexity": 61.44064311132203, |
| "eval_runtime": 545.7588, |
| "eval_samples_per_second": 18.323, |
| "eval_steps_per_second": 4.581, |
| "step": 430000 |
| }, |
| { |
| "epoch": 0.8211135864257812, |
| "grad_norm": 285.5552978515625, |
| "learning_rate": 8.944416046142578e-06, |
| "lookahead_loss": 6.718878838539124, |
| "loss": 4.1851, |
| "step": 430500 |
| }, |
| { |
| "epoch": 0.8220672607421875, |
| "grad_norm": 249.6902618408203, |
| "learning_rate": 8.896732330322266e-06, |
| "lookahead_loss": 6.740551570892334, |
| "loss": 4.2036, |
| "step": 431000 |
| }, |
| { |
| "epoch": 0.8230209350585938, |
| "grad_norm": 94.56451416015625, |
| "learning_rate": 8.849048614501953e-06, |
| "lookahead_loss": 6.744161255836487, |
| "loss": 4.2036, |
| "step": 431500 |
| }, |
| { |
| "epoch": 0.823974609375, |
| "grad_norm": 125.3357925415039, |
| "learning_rate": 8.801364898681642e-06, |
| "lookahead_loss": 6.708107975959778, |
| "loss": 4.1729, |
| "step": 432000 |
| }, |
| { |
| "epoch": 0.8249282836914062, |
| "grad_norm": 189.7957000732422, |
| "learning_rate": 8.753681182861329e-06, |
| "lookahead_loss": 6.752819055557251, |
| "loss": 4.2105, |
| "step": 432500 |
| }, |
| { |
| "epoch": 0.8258819580078125, |
| "grad_norm": 169.02549743652344, |
| "learning_rate": 8.705997467041015e-06, |
| "lookahead_loss": 6.713758069038391, |
| "loss": 4.1825, |
| "step": 433000 |
| }, |
| { |
| "epoch": 0.8268356323242188, |
| "grad_norm": 136.54107666015625, |
| "learning_rate": 8.658313751220704e-06, |
| "lookahead_loss": 6.742133658409118, |
| "loss": 4.2052, |
| "step": 433500 |
| }, |
| { |
| "epoch": 0.827789306640625, |
| "grad_norm": 73.28398895263672, |
| "learning_rate": 8.61063003540039e-06, |
| "lookahead_loss": 6.718400698661804, |
| "loss": 4.1906, |
| "step": 434000 |
| }, |
| { |
| "epoch": 0.8287429809570312, |
| "grad_norm": 222.73492431640625, |
| "learning_rate": 8.56294631958008e-06, |
| "lookahead_loss": 6.727536248207092, |
| "loss": 4.1911, |
| "step": 434500 |
| }, |
| { |
| "epoch": 0.8296966552734375, |
| "grad_norm": 107.4002685546875, |
| "learning_rate": 8.515262603759766e-06, |
| "lookahead_loss": 6.7094270057678225, |
| "loss": 4.1774, |
| "step": 435000 |
| }, |
| { |
| "epoch": 0.8296966552734375, |
| "eval_accuracy": 0.03907221135029354, |
| "eval_lookahead_loss": 6.636478104400635, |
| "eval_lookahead_perplexity": 762.4051477884132, |
| "eval_loss": 4.117182731628418, |
| "eval_perplexity": 61.386057426848396, |
| "eval_runtime": 520.3423, |
| "eval_samples_per_second": 19.218, |
| "eval_steps_per_second": 4.805, |
| "step": 435000 |
| }, |
| { |
| "epoch": 0.8306503295898438, |
| "grad_norm": 92.71411895751953, |
| "learning_rate": 8.467578887939453e-06, |
| "lookahead_loss": 6.715947633743286, |
| "loss": 4.1791, |
| "step": 435500 |
| }, |
| { |
| "epoch": 0.83160400390625, |
| "grad_norm": 127.72246551513672, |
| "learning_rate": 8.419895172119141e-06, |
| "lookahead_loss": 6.699627959251404, |
| "loss": 4.1683, |
| "step": 436000 |
| }, |
| { |
| "epoch": 0.8325576782226562, |
| "grad_norm": 149.75445556640625, |
| "learning_rate": 8.372211456298828e-06, |
| "lookahead_loss": 6.682903980255127, |
| "loss": 4.1536, |
| "step": 436500 |
| }, |
| { |
| "epoch": 0.8335113525390625, |
| "grad_norm": 219.17393493652344, |
| "learning_rate": 8.324527740478517e-06, |
| "lookahead_loss": 6.673503661155701, |
| "loss": 4.1439, |
| "step": 437000 |
| }, |
| { |
| "epoch": 0.8344650268554688, |
| "grad_norm": 152.45407104492188, |
| "learning_rate": 8.276844024658204e-06, |
| "lookahead_loss": 6.710149125099182, |
| "loss": 4.1776, |
| "step": 437500 |
| }, |
| { |
| "epoch": 0.835418701171875, |
| "grad_norm": 121.80523681640625, |
| "learning_rate": 8.22916030883789e-06, |
| "lookahead_loss": 6.717210608482361, |
| "loss": 4.1858, |
| "step": 438000 |
| }, |
| { |
| "epoch": 0.8363723754882812, |
| "grad_norm": 210.8258819580078, |
| "learning_rate": 8.181476593017579e-06, |
| "lookahead_loss": 6.7499087524414065, |
| "loss": 4.2124, |
| "step": 438500 |
| }, |
| { |
| "epoch": 0.8373260498046875, |
| "grad_norm": 136.6400604248047, |
| "learning_rate": 8.133792877197266e-06, |
| "lookahead_loss": 6.755180807113647, |
| "loss": 4.2146, |
| "step": 439000 |
| }, |
| { |
| "epoch": 0.8382797241210938, |
| "grad_norm": 228.47804260253906, |
| "learning_rate": 8.086109161376954e-06, |
| "lookahead_loss": 6.756373795509338, |
| "loss": 4.2163, |
| "step": 439500 |
| }, |
| { |
| "epoch": 0.8392333984375, |
| "grad_norm": 177.43670654296875, |
| "learning_rate": 8.038425445556641e-06, |
| "lookahead_loss": 6.736400113105774, |
| "loss": 4.1925, |
| "step": 440000 |
| }, |
| { |
| "epoch": 0.8392333984375, |
| "eval_accuracy": 0.03982035225048924, |
| "eval_lookahead_loss": 6.630845066070557, |
| "eval_lookahead_perplexity": 758.1225636723709, |
| "eval_loss": 4.112912654876709, |
| "eval_perplexity": 61.124493097469106, |
| "eval_runtime": 520.3146, |
| "eval_samples_per_second": 19.219, |
| "eval_steps_per_second": 4.805, |
| "step": 440000 |
| }, |
| { |
| "epoch": 0.8401870727539062, |
| "grad_norm": 79.19274139404297, |
| "learning_rate": 7.990741729736328e-06, |
| "lookahead_loss": 6.734998291015625, |
| "loss": 4.1974, |
| "step": 440500 |
| }, |
| { |
| "epoch": 0.8411407470703125, |
| "grad_norm": 121.88748931884766, |
| "learning_rate": 7.943058013916016e-06, |
| "lookahead_loss": 6.719315155029297, |
| "loss": 4.1827, |
| "step": 441000 |
| }, |
| { |
| "epoch": 0.8420944213867188, |
| "grad_norm": 101.53911590576172, |
| "learning_rate": 7.895374298095703e-06, |
| "lookahead_loss": 6.695139500617981, |
| "loss": 4.1631, |
| "step": 441500 |
| }, |
| { |
| "epoch": 0.843048095703125, |
| "grad_norm": 97.01951599121094, |
| "learning_rate": 7.847690582275392e-06, |
| "lookahead_loss": 6.702260056495667, |
| "loss": 4.1669, |
| "step": 442000 |
| }, |
| { |
| "epoch": 0.8440017700195312, |
| "grad_norm": 77.66926574707031, |
| "learning_rate": 7.800006866455079e-06, |
| "lookahead_loss": 6.735781726837158, |
| "loss": 4.1951, |
| "step": 442500 |
| }, |
| { |
| "epoch": 0.8449554443359375, |
| "grad_norm": 148.02151489257812, |
| "learning_rate": 7.752323150634765e-06, |
| "lookahead_loss": 6.730671715736389, |
| "loss": 4.1879, |
| "step": 443000 |
| }, |
| { |
| "epoch": 0.8459091186523438, |
| "grad_norm": 93.03111267089844, |
| "learning_rate": 7.704639434814454e-06, |
| "lookahead_loss": 6.706265162467957, |
| "loss": 4.1701, |
| "step": 443500 |
| }, |
| { |
| "epoch": 0.84686279296875, |
| "grad_norm": 219.8049774169922, |
| "learning_rate": 7.65695571899414e-06, |
| "lookahead_loss": 6.72840634059906, |
| "loss": 4.1866, |
| "step": 444000 |
| }, |
| { |
| "epoch": 0.8478164672851562, |
| "grad_norm": 246.82992553710938, |
| "learning_rate": 7.6092720031738284e-06, |
| "lookahead_loss": 6.710630246162415, |
| "loss": 4.1697, |
| "step": 444500 |
| }, |
| { |
| "epoch": 0.8487701416015625, |
| "grad_norm": 1647.665283203125, |
| "learning_rate": 7.561588287353516e-06, |
| "lookahead_loss": 6.727151582717895, |
| "loss": 4.1868, |
| "step": 445000 |
| }, |
| { |
| "epoch": 0.8487701416015625, |
| "eval_accuracy": 0.03913052837573386, |
| "eval_lookahead_loss": 6.630252032852173, |
| "eval_lookahead_perplexity": 757.6731050936131, |
| "eval_loss": 4.112374782562256, |
| "eval_perplexity": 61.0916247651724, |
| "eval_runtime": 521.8321, |
| "eval_samples_per_second": 19.163, |
| "eval_steps_per_second": 4.791, |
| "step": 445000 |
| }, |
| { |
| "epoch": 0.8497238159179688, |
| "grad_norm": 284.5116271972656, |
| "learning_rate": 7.513904571533204e-06, |
| "lookahead_loss": 6.738407536506653, |
| "loss": 4.195, |
| "step": 445500 |
| }, |
| { |
| "epoch": 0.850677490234375, |
| "grad_norm": 85.92292022705078, |
| "learning_rate": 7.466220855712891e-06, |
| "lookahead_loss": 6.725696940422058, |
| "loss": 4.1858, |
| "step": 446000 |
| }, |
| { |
| "epoch": 0.8516311645507812, |
| "grad_norm": 104.6988754272461, |
| "learning_rate": 7.418537139892578e-06, |
| "lookahead_loss": 6.747252093315124, |
| "loss": 4.209, |
| "step": 446500 |
| }, |
| { |
| "epoch": 0.8525848388671875, |
| "grad_norm": 71.97230529785156, |
| "learning_rate": 7.370853424072266e-06, |
| "lookahead_loss": 6.734288429260254, |
| "loss": 4.197, |
| "step": 447000 |
| }, |
| { |
| "epoch": 0.8535385131835938, |
| "grad_norm": 82.83120727539062, |
| "learning_rate": 7.323169708251954e-06, |
| "lookahead_loss": 6.726963366508484, |
| "loss": 4.1891, |
| "step": 447500 |
| }, |
| { |
| "epoch": 0.8544921875, |
| "grad_norm": 133.32351684570312, |
| "learning_rate": 7.275485992431641e-06, |
| "lookahead_loss": 6.7516463804245, |
| "loss": 4.2078, |
| "step": 448000 |
| }, |
| { |
| "epoch": 0.8554458618164062, |
| "grad_norm": 110.70391845703125, |
| "learning_rate": 7.227802276611328e-06, |
| "lookahead_loss": 6.7124689655303955, |
| "loss": 4.1755, |
| "step": 448500 |
| }, |
| { |
| "epoch": 0.8563995361328125, |
| "grad_norm": 101.02713775634766, |
| "learning_rate": 7.180118560791016e-06, |
| "lookahead_loss": 6.657481478691101, |
| "loss": 4.1371, |
| "step": 449000 |
| }, |
| { |
| "epoch": 0.8573532104492188, |
| "grad_norm": 217.88677978515625, |
| "learning_rate": 7.1324348449707034e-06, |
| "lookahead_loss": 6.73441841506958, |
| "loss": 4.1937, |
| "step": 449500 |
| }, |
| { |
| "epoch": 0.858306884765625, |
| "grad_norm": 103.732177734375, |
| "learning_rate": 7.084751129150391e-06, |
| "lookahead_loss": 6.637221877098083, |
| "loss": 4.1189, |
| "step": 450000 |
| }, |
| { |
| "epoch": 0.858306884765625, |
| "eval_accuracy": 0.039362230919765165, |
| "eval_lookahead_loss": 6.6330756767272945, |
| "eval_lookahead_perplexity": 759.8155274115283, |
| "eval_loss": 4.114243984222412, |
| "eval_perplexity": 61.20592412260008, |
| "eval_runtime": 521.8192, |
| "eval_samples_per_second": 19.164, |
| "eval_steps_per_second": 4.791, |
| "step": 450000 |
| }, |
| { |
| "epoch": 0.8592605590820312, |
| "grad_norm": 170.5393524169922, |
| "learning_rate": 7.037067413330079e-06, |
| "lookahead_loss": 6.714610489845276, |
| "loss": 4.1804, |
| "step": 450500 |
| }, |
| { |
| "epoch": 0.8602142333984375, |
| "grad_norm": 169.95333862304688, |
| "learning_rate": 6.989383697509766e-06, |
| "lookahead_loss": 6.676036776542664, |
| "loss": 4.1487, |
| "step": 451000 |
| }, |
| { |
| "epoch": 0.8611679077148438, |
| "grad_norm": 108.02375793457031, |
| "learning_rate": 6.941699981689453e-06, |
| "lookahead_loss": 6.733264635086059, |
| "loss": 4.1964, |
| "step": 451500 |
| }, |
| { |
| "epoch": 0.86212158203125, |
| "grad_norm": 151.3297882080078, |
| "learning_rate": 6.894016265869141e-06, |
| "lookahead_loss": 6.7138033876419065, |
| "loss": 4.1851, |
| "step": 452000 |
| }, |
| { |
| "epoch": 0.8630752563476562, |
| "grad_norm": 71.29066467285156, |
| "learning_rate": 6.846332550048829e-06, |
| "lookahead_loss": 6.682135174751282, |
| "loss": 4.1534, |
| "step": 452500 |
| }, |
| { |
| "epoch": 0.8640289306640625, |
| "grad_norm": 97.84779357910156, |
| "learning_rate": 6.798648834228516e-06, |
| "lookahead_loss": 6.661462572097778, |
| "loss": 4.1329, |
| "step": 453000 |
| }, |
| { |
| "epoch": 0.8649826049804688, |
| "grad_norm": 177.5167999267578, |
| "learning_rate": 6.750965118408203e-06, |
| "lookahead_loss": 6.683147459983826, |
| "loss": 4.1523, |
| "step": 453500 |
| }, |
| { |
| "epoch": 0.865936279296875, |
| "grad_norm": 99.48072052001953, |
| "learning_rate": 6.703281402587891e-06, |
| "lookahead_loss": 6.69360842704773, |
| "loss": 4.1597, |
| "step": 454000 |
| }, |
| { |
| "epoch": 0.8668899536132812, |
| "grad_norm": 109.61930084228516, |
| "learning_rate": 6.6555976867675784e-06, |
| "lookahead_loss": 6.741188433647156, |
| "loss": 4.204, |
| "step": 454500 |
| }, |
| { |
| "epoch": 0.8678436279296875, |
| "grad_norm": 115.34671783447266, |
| "learning_rate": 6.607913970947266e-06, |
| "lookahead_loss": 6.758744140625, |
| "loss": 4.2156, |
| "step": 455000 |
| }, |
| { |
| "epoch": 0.8678436279296875, |
| "eval_accuracy": 0.03943502935420744, |
| "eval_lookahead_loss": 6.630196190643311, |
| "eval_lookahead_perplexity": 757.6307961351529, |
| "eval_loss": 4.11140775680542, |
| "eval_perplexity": 61.03257614586696, |
| "eval_runtime": 525.8627, |
| "eval_samples_per_second": 19.016, |
| "eval_steps_per_second": 4.754, |
| "step": 455000 |
| }, |
| { |
| "epoch": 0.8687973022460938, |
| "grad_norm": 130.134033203125, |
| "learning_rate": 6.560230255126954e-06, |
| "lookahead_loss": 6.6926358089447024, |
| "loss": 4.1645, |
| "step": 455500 |
| }, |
| { |
| "epoch": 0.8697509765625, |
| "grad_norm": 156.10279846191406, |
| "learning_rate": 6.512546539306641e-06, |
| "lookahead_loss": 6.743165104866028, |
| "loss": 4.2025, |
| "step": 456000 |
| }, |
| { |
| "epoch": 0.8707046508789062, |
| "grad_norm": 195.76612854003906, |
| "learning_rate": 6.464862823486328e-06, |
| "lookahead_loss": 6.711678610801696, |
| "loss": 4.1747, |
| "step": 456500 |
| }, |
| { |
| "epoch": 0.8716583251953125, |
| "grad_norm": 258.5284729003906, |
| "learning_rate": 6.417179107666016e-06, |
| "lookahead_loss": 6.720124932289123, |
| "loss": 4.1854, |
| "step": 457000 |
| }, |
| { |
| "epoch": 0.8726119995117188, |
| "grad_norm": 489.4788513183594, |
| "learning_rate": 6.369495391845704e-06, |
| "lookahead_loss": 6.709194701194763, |
| "loss": 4.1699, |
| "step": 457500 |
| }, |
| { |
| "epoch": 0.873565673828125, |
| "grad_norm": 240.5293426513672, |
| "learning_rate": 6.321811676025391e-06, |
| "lookahead_loss": 6.73247544002533, |
| "loss": 4.1894, |
| "step": 458000 |
| }, |
| { |
| "epoch": 0.8745193481445312, |
| "grad_norm": 281.386474609375, |
| "learning_rate": 6.274127960205078e-06, |
| "lookahead_loss": 6.703888931274414, |
| "loss": 4.1684, |
| "step": 458500 |
| }, |
| { |
| "epoch": 0.8754730224609375, |
| "grad_norm": 132.50120544433594, |
| "learning_rate": 6.226444244384766e-06, |
| "lookahead_loss": 6.734236249923706, |
| "loss": 4.1926, |
| "step": 459000 |
| }, |
| { |
| "epoch": 0.8764266967773438, |
| "grad_norm": 70.10245513916016, |
| "learning_rate": 6.1787605285644534e-06, |
| "lookahead_loss": 6.686241549491882, |
| "loss": 4.1528, |
| "step": 459500 |
| }, |
| { |
| "epoch": 0.87738037109375, |
| "grad_norm": 111.56172943115234, |
| "learning_rate": 6.131076812744141e-06, |
| "lookahead_loss": 6.737575661659241, |
| "loss": 4.1966, |
| "step": 460000 |
| }, |
| { |
| "epoch": 0.87738037109375, |
| "eval_accuracy": 0.03982426614481409, |
| "eval_lookahead_loss": 6.625440689468384, |
| "eval_lookahead_perplexity": 754.036435261458, |
| "eval_loss": 4.106970310211182, |
| "eval_perplexity": 60.762347355045904, |
| "eval_runtime": 514.9961, |
| "eval_samples_per_second": 19.418, |
| "eval_steps_per_second": 4.854, |
| "step": 460000 |
| }, |
| { |
| "epoch": 0.8783340454101562, |
| "grad_norm": 1611.44970703125, |
| "learning_rate": 6.083393096923829e-06, |
| "lookahead_loss": 6.7267300930023195, |
| "loss": 4.1868, |
| "step": 460500 |
| }, |
| { |
| "epoch": 0.8792877197265625, |
| "grad_norm": 76.2000732421875, |
| "learning_rate": 6.035709381103516e-06, |
| "lookahead_loss": 6.72923139667511, |
| "loss": 4.1881, |
| "step": 461000 |
| }, |
| { |
| "epoch": 0.8802413940429688, |
| "grad_norm": 250.62884521484375, |
| "learning_rate": 5.988025665283203e-06, |
| "lookahead_loss": 6.68507626247406, |
| "loss": 4.1543, |
| "step": 461500 |
| }, |
| { |
| "epoch": 0.881195068359375, |
| "grad_norm": 111.74227905273438, |
| "learning_rate": 5.940341949462891e-06, |
| "lookahead_loss": 6.74182855129242, |
| "loss": 4.192, |
| "step": 462000 |
| }, |
| { |
| "epoch": 0.8821487426757812, |
| "grad_norm": 116.05091094970703, |
| "learning_rate": 5.892658233642579e-06, |
| "lookahead_loss": 6.701608322143555, |
| "loss": 4.1643, |
| "step": 462500 |
| }, |
| { |
| "epoch": 0.8831024169921875, |
| "grad_norm": 90.0874252319336, |
| "learning_rate": 5.844974517822266e-06, |
| "lookahead_loss": 6.73692225933075, |
| "loss": 4.1924, |
| "step": 463000 |
| }, |
| { |
| "epoch": 0.8840560913085938, |
| "grad_norm": 129.70205688476562, |
| "learning_rate": 5.797290802001953e-06, |
| "lookahead_loss": 6.718269413948059, |
| "loss": 4.1815, |
| "step": 463500 |
| }, |
| { |
| "epoch": 0.885009765625, |
| "grad_norm": 91.08078002929688, |
| "learning_rate": 5.749607086181641e-06, |
| "lookahead_loss": 6.720780562400818, |
| "loss": 4.179, |
| "step": 464000 |
| }, |
| { |
| "epoch": 0.8859634399414062, |
| "grad_norm": 61.97770690917969, |
| "learning_rate": 5.7019233703613284e-06, |
| "lookahead_loss": 6.715042568206787, |
| "loss": 4.176, |
| "step": 464500 |
| }, |
| { |
| "epoch": 0.8869171142578125, |
| "grad_norm": 196.26634216308594, |
| "learning_rate": 5.654239654541016e-06, |
| "lookahead_loss": 6.6928185338974, |
| "loss": 4.158, |
| "step": 465000 |
| }, |
| { |
| "epoch": 0.8869171142578125, |
| "eval_accuracy": 0.03982739726027397, |
| "eval_lookahead_loss": 6.626271807479858, |
| "eval_lookahead_perplexity": 754.6633890243046, |
| "eval_loss": 4.1067585945129395, |
| "eval_perplexity": 60.749484373944355, |
| "eval_runtime": 519.6726, |
| "eval_samples_per_second": 19.243, |
| "eval_steps_per_second": 4.811, |
| "step": 465000 |
| }, |
| { |
| "epoch": 0.8878707885742188, |
| "grad_norm": 227.43484497070312, |
| "learning_rate": 5.606555938720704e-06, |
| "lookahead_loss": 6.6981716785430905, |
| "loss": 4.1672, |
| "step": 465500 |
| }, |
| { |
| "epoch": 0.888824462890625, |
| "grad_norm": 178.01953125, |
| "learning_rate": 5.558872222900391e-06, |
| "lookahead_loss": 6.7102625408172605, |
| "loss": 4.1766, |
| "step": 466000 |
| }, |
| { |
| "epoch": 0.8897781372070312, |
| "grad_norm": 69.32343292236328, |
| "learning_rate": 5.511188507080078e-06, |
| "lookahead_loss": 6.678157561302185, |
| "loss": 4.1466, |
| "step": 466500 |
| }, |
| { |
| "epoch": 0.8907318115234375, |
| "grad_norm": 188.93446350097656, |
| "learning_rate": 5.463504791259766e-06, |
| "lookahead_loss": 6.688272457122803, |
| "loss": 4.1624, |
| "step": 467000 |
| }, |
| { |
| "epoch": 0.8916854858398438, |
| "grad_norm": 74.56832885742188, |
| "learning_rate": 5.415821075439454e-06, |
| "lookahead_loss": 6.692232824325561, |
| "loss": 4.1601, |
| "step": 467500 |
| }, |
| { |
| "epoch": 0.89263916015625, |
| "grad_norm": 163.1488800048828, |
| "learning_rate": 5.368137359619141e-06, |
| "lookahead_loss": 6.696221140861511, |
| "loss": 4.1672, |
| "step": 468000 |
| }, |
| { |
| "epoch": 0.8935928344726562, |
| "grad_norm": 75.43521118164062, |
| "learning_rate": 5.320453643798828e-06, |
| "lookahead_loss": 6.70114368724823, |
| "loss": 4.1645, |
| "step": 468500 |
| }, |
| { |
| "epoch": 0.8945465087890625, |
| "grad_norm": 136.74179077148438, |
| "learning_rate": 5.272769927978516e-06, |
| "lookahead_loss": 6.6990417346954345, |
| "loss": 4.1672, |
| "step": 469000 |
| }, |
| { |
| "epoch": 0.8955001831054688, |
| "grad_norm": 152.8857879638672, |
| "learning_rate": 5.2250862121582034e-06, |
| "lookahead_loss": 6.669786009788513, |
| "loss": 4.1421, |
| "step": 469500 |
| }, |
| { |
| "epoch": 0.896453857421875, |
| "grad_norm": 52.024742126464844, |
| "learning_rate": 5.177402496337891e-06, |
| "lookahead_loss": 6.697947387695312, |
| "loss": 4.1639, |
| "step": 470000 |
| }, |
| { |
| "epoch": 0.896453857421875, |
| "eval_accuracy": 0.039165753424657536, |
| "eval_lookahead_loss": 6.621960063743591, |
| "eval_lookahead_perplexity": 751.4164788363321, |
| "eval_loss": 4.103501796722412, |
| "eval_perplexity": 60.5519574147347, |
| "eval_runtime": 518.5471, |
| "eval_samples_per_second": 19.285, |
| "eval_steps_per_second": 4.821, |
| "step": 470000 |
| }, |
| { |
| "epoch": 0.8974075317382812, |
| "grad_norm": 129.81210327148438, |
| "learning_rate": 5.129718780517579e-06, |
| "lookahead_loss": 6.685195894241333, |
| "loss": 4.1524, |
| "step": 470500 |
| }, |
| { |
| "epoch": 0.8983612060546875, |
| "grad_norm": 38.63855743408203, |
| "learning_rate": 5.082035064697266e-06, |
| "lookahead_loss": 6.7547687768936155, |
| "loss": 4.2084, |
| "step": 471000 |
| }, |
| { |
| "epoch": 0.8993148803710938, |
| "grad_norm": 221.00448608398438, |
| "learning_rate": 5.034351348876953e-06, |
| "lookahead_loss": 6.762701703071595, |
| "loss": 4.2224, |
| "step": 471500 |
| }, |
| { |
| "epoch": 0.9002685546875, |
| "grad_norm": 253.66175842285156, |
| "learning_rate": 4.986667633056641e-06, |
| "lookahead_loss": 6.738181292533874, |
| "loss": 4.1969, |
| "step": 472000 |
| }, |
| { |
| "epoch": 0.9012222290039062, |
| "grad_norm": 97.86500549316406, |
| "learning_rate": 4.938983917236329e-06, |
| "lookahead_loss": 6.723217816352844, |
| "loss": 4.1837, |
| "step": 472500 |
| }, |
| { |
| "epoch": 0.9021759033203125, |
| "grad_norm": 259.8615417480469, |
| "learning_rate": 4.891300201416016e-06, |
| "lookahead_loss": 6.698670424461365, |
| "loss": 4.1611, |
| "step": 473000 |
| }, |
| { |
| "epoch": 0.9031295776367188, |
| "grad_norm": 172.56629943847656, |
| "learning_rate": 4.843616485595703e-06, |
| "lookahead_loss": 6.711057712554932, |
| "loss": 4.1734, |
| "step": 473500 |
| }, |
| { |
| "epoch": 0.904083251953125, |
| "grad_norm": 241.21856689453125, |
| "learning_rate": 4.795932769775391e-06, |
| "lookahead_loss": 6.730319765090942, |
| "loss": 4.1915, |
| "step": 474000 |
| }, |
| { |
| "epoch": 0.9050369262695312, |
| "grad_norm": 90.1880111694336, |
| "learning_rate": 4.7482490539550784e-06, |
| "lookahead_loss": 6.7173728694915775, |
| "loss": 4.1748, |
| "step": 474500 |
| }, |
| { |
| "epoch": 0.9059906005859375, |
| "grad_norm": 235.19883728027344, |
| "learning_rate": 4.700565338134766e-06, |
| "lookahead_loss": 6.740855940818786, |
| "loss": 4.1936, |
| "step": 475000 |
| }, |
| { |
| "epoch": 0.9059906005859375, |
| "eval_accuracy": 0.039570058708414874, |
| "eval_lookahead_loss": 6.620401067733765, |
| "eval_lookahead_perplexity": 750.2459362168253, |
| "eval_loss": 4.101663112640381, |
| "eval_perplexity": 60.44072378758805, |
| "eval_runtime": 522.5293, |
| "eval_samples_per_second": 19.138, |
| "eval_steps_per_second": 4.784, |
| "step": 475000 |
| }, |
| { |
| "epoch": 0.9069442749023438, |
| "grad_norm": 74.00786590576172, |
| "learning_rate": 4.652881622314453e-06, |
| "lookahead_loss": 6.729425669670105, |
| "loss": 4.1836, |
| "step": 475500 |
| }, |
| { |
| "epoch": 0.90789794921875, |
| "grad_norm": 58.23165512084961, |
| "learning_rate": 4.605197906494141e-06, |
| "lookahead_loss": 6.721751620292664, |
| "loss": 4.1844, |
| "step": 476000 |
| }, |
| { |
| "epoch": 0.9088516235351562, |
| "grad_norm": 63.72830581665039, |
| "learning_rate": 4.557514190673828e-06, |
| "lookahead_loss": 6.705439714431763, |
| "loss": 4.1677, |
| "step": 476500 |
| }, |
| { |
| "epoch": 0.9098052978515625, |
| "grad_norm": 195.08311462402344, |
| "learning_rate": 4.509830474853516e-06, |
| "lookahead_loss": 6.709064541816711, |
| "loss": 4.1718, |
| "step": 477000 |
| }, |
| { |
| "epoch": 0.9107589721679688, |
| "grad_norm": 146.79742431640625, |
| "learning_rate": 4.462146759033204e-06, |
| "lookahead_loss": 6.687383010864258, |
| "loss": 4.1507, |
| "step": 477500 |
| }, |
| { |
| "epoch": 0.911712646484375, |
| "grad_norm": 57.35370635986328, |
| "learning_rate": 4.4144630432128904e-06, |
| "lookahead_loss": 6.724326784133911, |
| "loss": 4.1861, |
| "step": 478000 |
| }, |
| { |
| "epoch": 0.9126663208007812, |
| "grad_norm": 82.28441619873047, |
| "learning_rate": 4.366779327392578e-06, |
| "lookahead_loss": 6.724619345664978, |
| "loss": 4.1788, |
| "step": 478500 |
| }, |
| { |
| "epoch": 0.9136199951171875, |
| "grad_norm": 101.08804321289062, |
| "learning_rate": 4.319095611572266e-06, |
| "lookahead_loss": 6.737988086700439, |
| "loss": 4.1905, |
| "step": 479000 |
| }, |
| { |
| "epoch": 0.9145736694335938, |
| "grad_norm": 41.27861404418945, |
| "learning_rate": 4.2714118957519534e-06, |
| "lookahead_loss": 6.733048192024231, |
| "loss": 4.1911, |
| "step": 479500 |
| }, |
| { |
| "epoch": 0.91552734375, |
| "grad_norm": 51.263427734375, |
| "learning_rate": 4.223728179931641e-06, |
| "lookahead_loss": 6.685365100860595, |
| "loss": 4.154, |
| "step": 480000 |
| }, |
| { |
| "epoch": 0.91552734375, |
| "eval_accuracy": 0.03933874755381605, |
| "eval_lookahead_loss": 6.618105301856994, |
| "eval_lookahead_perplexity": 748.525522786123, |
| "eval_loss": 4.099847793579102, |
| "eval_perplexity": 60.33110411706085, |
| "eval_runtime": 518.6476, |
| "eval_samples_per_second": 19.281, |
| "eval_steps_per_second": 4.82, |
| "step": 480000 |
| }, |
| { |
| "epoch": 0.9164810180664062, |
| "grad_norm": 111.2357406616211, |
| "learning_rate": 4.176044464111328e-06, |
| "lookahead_loss": 6.692235992431641, |
| "loss": 4.1607, |
| "step": 480500 |
| }, |
| { |
| "epoch": 0.9174346923828125, |
| "grad_norm": 86.12166595458984, |
| "learning_rate": 4.128360748291016e-06, |
| "lookahead_loss": 6.702814944267273, |
| "loss": 4.1693, |
| "step": 481000 |
| }, |
| { |
| "epoch": 0.9183883666992188, |
| "grad_norm": 62.889732360839844, |
| "learning_rate": 4.080677032470703e-06, |
| "lookahead_loss": 6.708718207359314, |
| "loss": 4.1681, |
| "step": 481500 |
| }, |
| { |
| "epoch": 0.919342041015625, |
| "grad_norm": 44.466835021972656, |
| "learning_rate": 4.032993316650391e-06, |
| "lookahead_loss": 6.702862030982971, |
| "loss": 4.1675, |
| "step": 482000 |
| }, |
| { |
| "epoch": 0.9202957153320312, |
| "grad_norm": 169.56138610839844, |
| "learning_rate": 3.985309600830079e-06, |
| "lookahead_loss": 6.720054666519165, |
| "loss": 4.184, |
| "step": 482500 |
| }, |
| { |
| "epoch": 0.9212493896484375, |
| "grad_norm": 150.70069885253906, |
| "learning_rate": 3.9376258850097654e-06, |
| "lookahead_loss": 6.715733885765076, |
| "loss": 4.1768, |
| "step": 483000 |
| }, |
| { |
| "epoch": 0.9222030639648438, |
| "grad_norm": 184.50372314453125, |
| "learning_rate": 3.889942169189453e-06, |
| "lookahead_loss": 6.674390200614929, |
| "loss": 4.1521, |
| "step": 483500 |
| }, |
| { |
| "epoch": 0.92315673828125, |
| "grad_norm": 91.0439224243164, |
| "learning_rate": 3.842258453369141e-06, |
| "lookahead_loss": 6.734517775535584, |
| "loss": 4.1922, |
| "step": 484000 |
| }, |
| { |
| "epoch": 0.9241104125976562, |
| "grad_norm": 191.3260955810547, |
| "learning_rate": 3.7945747375488284e-06, |
| "lookahead_loss": 6.686105996131897, |
| "loss": 4.1536, |
| "step": 484500 |
| }, |
| { |
| "epoch": 0.9250640869140625, |
| "grad_norm": 99.60480499267578, |
| "learning_rate": 3.7468910217285157e-06, |
| "lookahead_loss": 6.6733855466842655, |
| "loss": 4.1473, |
| "step": 485000 |
| }, |
| { |
| "epoch": 0.9250640869140625, |
| "eval_accuracy": 0.03972191780821918, |
| "eval_lookahead_loss": 6.618634088516235, |
| "eval_lookahead_perplexity": 748.9214377647278, |
| "eval_loss": 4.099872589111328, |
| "eval_perplexity": 60.33260007744374, |
| "eval_runtime": 518.3278, |
| "eval_samples_per_second": 19.293, |
| "eval_steps_per_second": 4.823, |
| "step": 485000 |
| }, |
| { |
| "epoch": 0.9260177612304688, |
| "grad_norm": 62.19022750854492, |
| "learning_rate": 3.6992073059082034e-06, |
| "lookahead_loss": 6.641545587539673, |
| "loss": 4.1216, |
| "step": 485500 |
| }, |
| { |
| "epoch": 0.926971435546875, |
| "grad_norm": 88.33380126953125, |
| "learning_rate": 3.6515235900878906e-06, |
| "lookahead_loss": 6.6500311880111695, |
| "loss": 4.1243, |
| "step": 486000 |
| }, |
| { |
| "epoch": 0.9279251098632812, |
| "grad_norm": 88.00282287597656, |
| "learning_rate": 3.6038398742675783e-06, |
| "lookahead_loss": 6.657510120391846, |
| "loss": 4.1325, |
| "step": 486500 |
| }, |
| { |
| "epoch": 0.9288787841796875, |
| "grad_norm": 121.33488464355469, |
| "learning_rate": 3.556156158447266e-06, |
| "lookahead_loss": 6.699663286209106, |
| "loss": 4.1616, |
| "step": 487000 |
| }, |
| { |
| "epoch": 0.9298324584960938, |
| "grad_norm": 213.5430450439453, |
| "learning_rate": 3.508472442626953e-06, |
| "lookahead_loss": 6.686011109352112, |
| "loss": 4.1566, |
| "step": 487500 |
| }, |
| { |
| "epoch": 0.9307861328125, |
| "grad_norm": 367.992431640625, |
| "learning_rate": 3.460788726806641e-06, |
| "lookahead_loss": 6.760558218955993, |
| "loss": 4.2117, |
| "step": 488000 |
| }, |
| { |
| "epoch": 0.9317398071289062, |
| "grad_norm": 471.0282287597656, |
| "learning_rate": 3.413105010986328e-06, |
| "lookahead_loss": 6.769396718025208, |
| "loss": 4.2214, |
| "step": 488500 |
| }, |
| { |
| "epoch": 0.9326934814453125, |
| "grad_norm": 112.82720947265625, |
| "learning_rate": 3.3654212951660158e-06, |
| "lookahead_loss": 6.709150005340576, |
| "loss": 4.1716, |
| "step": 489000 |
| }, |
| { |
| "epoch": 0.9336471557617188, |
| "grad_norm": 95.36373901367188, |
| "learning_rate": 3.3177375793457034e-06, |
| "lookahead_loss": 6.74151674079895, |
| "loss": 4.1982, |
| "step": 489500 |
| }, |
| { |
| "epoch": 0.934600830078125, |
| "grad_norm": 75.07386016845703, |
| "learning_rate": 3.2700538635253907e-06, |
| "lookahead_loss": 6.729368120193482, |
| "loss": 4.1857, |
| "step": 490000 |
| }, |
| { |
| "epoch": 0.934600830078125, |
| "eval_accuracy": 0.03967945205479452, |
| "eval_lookahead_loss": 6.615631866836548, |
| "eval_lookahead_perplexity": 746.6763813526909, |
| "eval_loss": 4.097633361816406, |
| "eval_perplexity": 60.19765281775738, |
| "eval_runtime": 528.7711, |
| "eval_samples_per_second": 18.912, |
| "eval_steps_per_second": 4.728, |
| "step": 490000 |
| }, |
| { |
| "epoch": 0.9355545043945312, |
| "grad_norm": 62.85585021972656, |
| "learning_rate": 3.2223701477050784e-06, |
| "lookahead_loss": 6.727360112190246, |
| "loss": 4.1838, |
| "step": 490500 |
| }, |
| { |
| "epoch": 0.9365081787109375, |
| "grad_norm": 85.25778198242188, |
| "learning_rate": 3.1746864318847656e-06, |
| "lookahead_loss": 6.743950333595276, |
| "loss": 4.1991, |
| "step": 491000 |
| }, |
| { |
| "epoch": 0.9374618530273438, |
| "grad_norm": 104.68131256103516, |
| "learning_rate": 3.1270027160644533e-06, |
| "lookahead_loss": 6.7288976640701295, |
| "loss": 4.184, |
| "step": 491500 |
| }, |
| { |
| "epoch": 0.93841552734375, |
| "grad_norm": 288.7207946777344, |
| "learning_rate": 3.079319000244141e-06, |
| "lookahead_loss": 6.721266926765442, |
| "loss": 4.1779, |
| "step": 492000 |
| }, |
| { |
| "epoch": 0.9393692016601562, |
| "grad_norm": 70.64923858642578, |
| "learning_rate": 3.031635284423828e-06, |
| "lookahead_loss": 6.816908353805542, |
| "loss": 4.2603, |
| "step": 492500 |
| }, |
| { |
| "epoch": 0.9403228759765625, |
| "grad_norm": 149.22265625, |
| "learning_rate": 2.983951568603516e-06, |
| "lookahead_loss": 6.723154423713684, |
| "loss": 4.1771, |
| "step": 493000 |
| }, |
| { |
| "epoch": 0.9412765502929688, |
| "grad_norm": 326.1004333496094, |
| "learning_rate": 2.936267852783203e-06, |
| "lookahead_loss": 6.712617171287537, |
| "loss": 4.1748, |
| "step": 493500 |
| }, |
| { |
| "epoch": 0.942230224609375, |
| "grad_norm": 282.9819641113281, |
| "learning_rate": 2.8885841369628908e-06, |
| "lookahead_loss": 6.720616690635681, |
| "loss": 4.1783, |
| "step": 494000 |
| }, |
| { |
| "epoch": 0.9431838989257812, |
| "grad_norm": 121.57740783691406, |
| "learning_rate": 2.8409004211425784e-06, |
| "lookahead_loss": 6.7278664150238034, |
| "loss": 4.1807, |
| "step": 494500 |
| }, |
| { |
| "epoch": 0.9441375732421875, |
| "grad_norm": 105.49919128417969, |
| "learning_rate": 2.7932167053222657e-06, |
| "lookahead_loss": 6.714613382339477, |
| "loss": 4.1724, |
| "step": 495000 |
| }, |
| { |
| "epoch": 0.9441375732421875, |
| "eval_accuracy": 0.03972230919765166, |
| "eval_lookahead_loss": 6.6166566438674925, |
| "eval_lookahead_perplexity": 747.4419503596189, |
| "eval_loss": 4.097666263580322, |
| "eval_perplexity": 60.19963345930181, |
| "eval_runtime": 525.8972, |
| "eval_samples_per_second": 19.015, |
| "eval_steps_per_second": 4.754, |
| "step": 495000 |
| }, |
| { |
| "epoch": 0.9450912475585938, |
| "grad_norm": 451.87841796875, |
| "learning_rate": 2.7455329895019534e-06, |
| "lookahead_loss": 6.716301713943482, |
| "loss": 4.1747, |
| "step": 495500 |
| }, |
| { |
| "epoch": 0.946044921875, |
| "grad_norm": 61.304141998291016, |
| "learning_rate": 2.6978492736816406e-06, |
| "lookahead_loss": 6.679008432388306, |
| "loss": 4.1466, |
| "step": 496000 |
| }, |
| { |
| "epoch": 0.9469985961914062, |
| "grad_norm": 97.46538543701172, |
| "learning_rate": 2.6501655578613283e-06, |
| "lookahead_loss": 6.638355240821839, |
| "loss": 4.1161, |
| "step": 496500 |
| }, |
| { |
| "epoch": 0.9479522705078125, |
| "grad_norm": 117.76644897460938, |
| "learning_rate": 2.602481842041016e-06, |
| "lookahead_loss": 6.696069451332092, |
| "loss": 4.1666, |
| "step": 497000 |
| }, |
| { |
| "epoch": 0.9489059448242188, |
| "grad_norm": 78.37489318847656, |
| "learning_rate": 2.554798126220703e-06, |
| "lookahead_loss": 6.66585451221466, |
| "loss": 4.1364, |
| "step": 497500 |
| }, |
| { |
| "epoch": 0.949859619140625, |
| "grad_norm": 106.2125015258789, |
| "learning_rate": 2.507114410400391e-06, |
| "lookahead_loss": 6.713165844917297, |
| "loss": 4.1728, |
| "step": 498000 |
| }, |
| { |
| "epoch": 0.9508132934570312, |
| "grad_norm": 433.8307800292969, |
| "learning_rate": 2.459430694580078e-06, |
| "lookahead_loss": 6.708888305664063, |
| "loss": 4.1796, |
| "step": 498500 |
| }, |
| { |
| "epoch": 0.9517669677734375, |
| "grad_norm": 78.49517822265625, |
| "learning_rate": 2.4117469787597658e-06, |
| "lookahead_loss": 6.700650588989258, |
| "loss": 4.1675, |
| "step": 499000 |
| }, |
| { |
| "epoch": 0.9527206420898438, |
| "grad_norm": 154.3566131591797, |
| "learning_rate": 2.3640632629394534e-06, |
| "lookahead_loss": 6.716837489128113, |
| "loss": 4.1763, |
| "step": 499500 |
| }, |
| { |
| "epoch": 0.95367431640625, |
| "grad_norm": 95.81932067871094, |
| "learning_rate": 2.3163795471191407e-06, |
| "lookahead_loss": 6.694699822425842, |
| "loss": 4.166, |
| "step": 500000 |
| }, |
| { |
| "epoch": 0.95367431640625, |
| "eval_accuracy": 0.03936203522504892, |
| "eval_lookahead_loss": 6.615106293678283, |
| "eval_lookahead_perplexity": 746.2840513968199, |
| "eval_loss": 4.09620475769043, |
| "eval_perplexity": 60.11171560231395, |
| "eval_runtime": 516.9736, |
| "eval_samples_per_second": 19.343, |
| "eval_steps_per_second": 4.836, |
| "step": 500000 |
| }, |
| { |
| "epoch": 1.0009536743164062, |
| "grad_norm": 137.3190460205078, |
| "learning_rate": 2.2686958312988284e-06, |
| "lookahead_loss": 6.699611643791199, |
| "loss": 4.1627, |
| "step": 500500 |
| }, |
| { |
| "epoch": 1.0019073486328125, |
| "grad_norm": 67.73553466796875, |
| "learning_rate": 2.2210121154785156e-06, |
| "lookahead_loss": 6.699389809608459, |
| "loss": 4.1603, |
| "step": 501000 |
| }, |
| { |
| "epoch": 1.0028610229492188, |
| "grad_norm": 81.45501708984375, |
| "learning_rate": 2.1733283996582033e-06, |
| "lookahead_loss": 6.696600555419922, |
| "loss": 4.1583, |
| "step": 501500 |
| }, |
| { |
| "epoch": 1.003814697265625, |
| "grad_norm": 99.1773910522461, |
| "learning_rate": 2.125644683837891e-06, |
| "lookahead_loss": 6.721456358909607, |
| "loss": 4.1799, |
| "step": 502000 |
| }, |
| { |
| "epoch": 1.0047683715820312, |
| "grad_norm": 72.01434326171875, |
| "learning_rate": 2.077960968017578e-06, |
| "lookahead_loss": 6.702658643722534, |
| "loss": 4.163, |
| "step": 502500 |
| }, |
| { |
| "epoch": 1.0057220458984375, |
| "grad_norm": 56.51216506958008, |
| "learning_rate": 2.030277252197266e-06, |
| "lookahead_loss": 6.7238514680862425, |
| "loss": 4.1797, |
| "step": 503000 |
| }, |
| { |
| "epoch": 1.0066757202148438, |
| "grad_norm": 58.71382522583008, |
| "learning_rate": 1.982593536376953e-06, |
| "lookahead_loss": 6.697559148788452, |
| "loss": 4.1612, |
| "step": 503500 |
| }, |
| { |
| "epoch": 1.00762939453125, |
| "grad_norm": 155.58201599121094, |
| "learning_rate": 1.9349098205566408e-06, |
| "lookahead_loss": 6.6970227403640745, |
| "loss": 4.1589, |
| "step": 504000 |
| }, |
| { |
| "epoch": 1.0085830688476562, |
| "grad_norm": 200.99688720703125, |
| "learning_rate": 1.8872261047363282e-06, |
| "lookahead_loss": 6.661710713386536, |
| "loss": 4.133, |
| "step": 504500 |
| }, |
| { |
| "epoch": 1.0095367431640625, |
| "grad_norm": 112.41580200195312, |
| "learning_rate": 1.8395423889160157e-06, |
| "lookahead_loss": 6.6971626424789426, |
| "loss": 4.1581, |
| "step": 505000 |
| }, |
| { |
| "epoch": 1.0095367431640625, |
| "eval_accuracy": 0.039548727984344424, |
| "eval_lookahead_loss": 6.615424564743042, |
| "eval_lookahead_perplexity": 746.5216098184505, |
| "eval_loss": 4.096465587615967, |
| "eval_perplexity": 60.12739658156383, |
| "eval_runtime": 515.8701, |
| "eval_samples_per_second": 19.385, |
| "eval_steps_per_second": 4.846, |
| "step": 505000 |
| }, |
| { |
| "epoch": 1.0104904174804688, |
| "grad_norm": 96.87193298339844, |
| "learning_rate": 1.7918586730957031e-06, |
| "lookahead_loss": 6.692916042327881, |
| "loss": 4.1553, |
| "step": 505500 |
| }, |
| { |
| "epoch": 1.011444091796875, |
| "grad_norm": 114.84173583984375, |
| "learning_rate": 1.7441749572753908e-06, |
| "lookahead_loss": 6.701267983436584, |
| "loss": 4.171, |
| "step": 506000 |
| }, |
| { |
| "epoch": 1.0123977661132812, |
| "grad_norm": 82.23168182373047, |
| "learning_rate": 1.6964912414550783e-06, |
| "lookahead_loss": 6.682006938934326, |
| "loss": 4.1521, |
| "step": 506500 |
| }, |
| { |
| "epoch": 1.0133514404296875, |
| "grad_norm": 47.911319732666016, |
| "learning_rate": 1.6488075256347657e-06, |
| "lookahead_loss": 6.699205176353455, |
| "loss": 4.1638, |
| "step": 507000 |
| }, |
| { |
| "epoch": 1.0143051147460938, |
| "grad_norm": 113.22929382324219, |
| "learning_rate": 1.6011238098144532e-06, |
| "lookahead_loss": 6.695888810157776, |
| "loss": 4.1609, |
| "step": 507500 |
| }, |
| { |
| "epoch": 1.0152587890625, |
| "grad_norm": 49.27049255371094, |
| "learning_rate": 1.5534400939941406e-06, |
| "lookahead_loss": 6.701726266860962, |
| "loss": 4.166, |
| "step": 508000 |
| }, |
| { |
| "epoch": 1.0162124633789062, |
| "grad_norm": 75.45677185058594, |
| "learning_rate": 1.505756378173828e-06, |
| "lookahead_loss": 6.669482478141784, |
| "loss": 4.1386, |
| "step": 508500 |
| }, |
| { |
| "epoch": 1.0171661376953125, |
| "grad_norm": 105.63385772705078, |
| "learning_rate": 1.4580726623535158e-06, |
| "lookahead_loss": 6.65812814617157, |
| "loss": 4.128, |
| "step": 509000 |
| }, |
| { |
| "epoch": 1.0181198120117188, |
| "grad_norm": 67.5099868774414, |
| "learning_rate": 1.4103889465332032e-06, |
| "lookahead_loss": 6.663897967338562, |
| "loss": 4.1379, |
| "step": 509500 |
| }, |
| { |
| "epoch": 1.019073486328125, |
| "grad_norm": 101.13226318359375, |
| "learning_rate": 1.3627052307128907e-06, |
| "lookahead_loss": 6.67702170753479, |
| "loss": 4.1443, |
| "step": 510000 |
| }, |
| { |
| "epoch": 1.019073486328125, |
| "eval_accuracy": 0.039494716242661446, |
| "eval_lookahead_loss": 6.6156795272827145, |
| "eval_lookahead_perplexity": 746.7119691302262, |
| "eval_loss": 4.096371650695801, |
| "eval_perplexity": 60.121748664389486, |
| "eval_runtime": 519.593, |
| "eval_samples_per_second": 19.246, |
| "eval_steps_per_second": 4.811, |
| "step": 510000 |
| }, |
| { |
| "epoch": 1.0200271606445312, |
| "grad_norm": 120.44178009033203, |
| "learning_rate": 1.3150215148925781e-06, |
| "lookahead_loss": 6.656263332366944, |
| "loss": 4.1245, |
| "step": 510500 |
| }, |
| { |
| "epoch": 1.0209808349609375, |
| "grad_norm": 118.97191619873047, |
| "learning_rate": 1.2673377990722656e-06, |
| "lookahead_loss": 6.667703623771668, |
| "loss": 4.1401, |
| "step": 511000 |
| }, |
| { |
| "epoch": 1.0219345092773438, |
| "grad_norm": 75.89666748046875, |
| "learning_rate": 1.2196540832519533e-06, |
| "lookahead_loss": 6.697701314926148, |
| "loss": 4.1615, |
| "step": 511500 |
| }, |
| { |
| "epoch": 1.02288818359375, |
| "grad_norm": 55.75214767456055, |
| "learning_rate": 1.1719703674316407e-06, |
| "lookahead_loss": 6.73455620098114, |
| "loss": 4.1919, |
| "step": 512000 |
| }, |
| { |
| "epoch": 1.0238418579101562, |
| "grad_norm": 65.6138687133789, |
| "learning_rate": 1.1242866516113282e-06, |
| "lookahead_loss": 6.6985436611175535, |
| "loss": 4.1659, |
| "step": 512500 |
| }, |
| { |
| "epoch": 1.0247955322265625, |
| "grad_norm": 97.0747299194336, |
| "learning_rate": 1.0766029357910156e-06, |
| "lookahead_loss": 6.730535256385803, |
| "loss": 4.1866, |
| "step": 513000 |
| }, |
| { |
| "epoch": 1.0257492065429688, |
| "grad_norm": 100.54351806640625, |
| "learning_rate": 1.028919219970703e-06, |
| "lookahead_loss": 6.648995252609253, |
| "loss": 4.133, |
| "step": 513500 |
| }, |
| { |
| "epoch": 1.026702880859375, |
| "grad_norm": 100.19547271728516, |
| "learning_rate": 9.812355041503908e-07, |
| "lookahead_loss": 6.684429960250855, |
| "loss": 4.1475, |
| "step": 514000 |
| }, |
| { |
| "epoch": 1.0276565551757812, |
| "grad_norm": 223.9405517578125, |
| "learning_rate": 9.335517883300781e-07, |
| "lookahead_loss": 6.710543825149536, |
| "loss": 4.1698, |
| "step": 514500 |
| }, |
| { |
| "epoch": 1.0286102294921875, |
| "grad_norm": 216.8909912109375, |
| "learning_rate": 8.858680725097657e-07, |
| "lookahead_loss": 6.698768047332764, |
| "loss": 4.1641, |
| "step": 515000 |
| }, |
| { |
| "epoch": 1.0286102294921875, |
| "eval_accuracy": 0.03969921722113503, |
| "eval_lookahead_loss": 6.6132298768997195, |
| "eval_lookahead_perplexity": 744.8850244704942, |
| "eval_loss": 4.094522953033447, |
| "eval_perplexity": 60.010704403436016, |
| "eval_runtime": 515.7258, |
| "eval_samples_per_second": 19.39, |
| "eval_steps_per_second": 4.848, |
| "step": 515000 |
| }, |
| { |
| "epoch": 1.0295639038085938, |
| "grad_norm": 106.16375732421875, |
| "learning_rate": 8.381843566894531e-07, |
| "lookahead_loss": 6.716665421485901, |
| "loss": 4.1798, |
| "step": 515500 |
| }, |
| { |
| "epoch": 1.030517578125, |
| "grad_norm": 179.04811096191406, |
| "learning_rate": 7.905006408691407e-07, |
| "lookahead_loss": 6.682203989028931, |
| "loss": 4.1491, |
| "step": 516000 |
| }, |
| { |
| "epoch": 1.0314712524414062, |
| "grad_norm": 48.13349914550781, |
| "learning_rate": 7.428169250488282e-07, |
| "lookahead_loss": 6.690872841835022, |
| "loss": 4.1544, |
| "step": 516500 |
| }, |
| { |
| "epoch": 1.0324249267578125, |
| "grad_norm": 100.41188049316406, |
| "learning_rate": 6.951332092285156e-07, |
| "lookahead_loss": 6.660830945968628, |
| "loss": 4.1332, |
| "step": 517000 |
| }, |
| { |
| "epoch": 1.0333786010742188, |
| "grad_norm": 141.83055114746094, |
| "learning_rate": 6.474494934082032e-07, |
| "lookahead_loss": 6.68863224029541, |
| "loss": 4.1559, |
| "step": 517500 |
| }, |
| { |
| "epoch": 1.034332275390625, |
| "grad_norm": 348.7958984375, |
| "learning_rate": 5.997657775878906e-07, |
| "lookahead_loss": 6.736293955802918, |
| "loss": 4.193, |
| "step": 518000 |
| }, |
| { |
| "epoch": 1.0352859497070312, |
| "grad_norm": 74.92747497558594, |
| "learning_rate": 5.520820617675782e-07, |
| "lookahead_loss": 6.696659141540527, |
| "loss": 4.1581, |
| "step": 518500 |
| }, |
| { |
| "epoch": 1.0362396240234375, |
| "grad_norm": 630.7868041992188, |
| "learning_rate": 5.043983459472657e-07, |
| "lookahead_loss": 6.696161406517029, |
| "loss": 4.1594, |
| "step": 519000 |
| }, |
| { |
| "epoch": 1.0371932983398438, |
| "grad_norm": 57.09252166748047, |
| "learning_rate": 4.5671463012695317e-07, |
| "lookahead_loss": 6.694094017982483, |
| "loss": 4.1571, |
| "step": 519500 |
| }, |
| { |
| "epoch": 1.03814697265625, |
| "grad_norm": 123.07572174072266, |
| "learning_rate": 4.0903091430664063e-07, |
| "lookahead_loss": 6.7121089191436765, |
| "loss": 4.1673, |
| "step": 520000 |
| }, |
| { |
| "epoch": 1.03814697265625, |
| "eval_accuracy": 0.039719373776908026, |
| "eval_lookahead_loss": 6.611176923561096, |
| "eval_lookahead_perplexity": 743.3573789016892, |
| "eval_loss": 4.092768669128418, |
| "eval_perplexity": 59.905520878429385, |
| "eval_runtime": 516.4668, |
| "eval_samples_per_second": 19.362, |
| "eval_steps_per_second": 4.841, |
| "step": 520000 |
| }, |
| { |
| "epoch": 1.0391006469726562, |
| "grad_norm": 142.13406372070312, |
| "learning_rate": 3.6134719848632814e-07, |
| "lookahead_loss": 6.7205035676956175, |
| "loss": 4.1771, |
| "step": 520500 |
| }, |
| { |
| "epoch": 1.0400543212890625, |
| "grad_norm": 239.94732666015625, |
| "learning_rate": 3.1366348266601565e-07, |
| "lookahead_loss": 6.725655457496643, |
| "loss": 4.1805, |
| "step": 521000 |
| }, |
| { |
| "epoch": 1.0410079956054688, |
| "grad_norm": 79.76507568359375, |
| "learning_rate": 2.6597976684570316e-07, |
| "lookahead_loss": 6.715541929244995, |
| "loss": 4.1697, |
| "step": 521500 |
| }, |
| { |
| "epoch": 1.041961669921875, |
| "grad_norm": 76.12458801269531, |
| "learning_rate": 2.1829605102539064e-07, |
| "lookahead_loss": 6.7047438621521, |
| "loss": 4.1687, |
| "step": 522000 |
| }, |
| { |
| "epoch": 1.0429153442382812, |
| "grad_norm": 93.98125457763672, |
| "learning_rate": 1.7061233520507813e-07, |
| "lookahead_loss": 6.696334157943726, |
| "loss": 4.1605, |
| "step": 522500 |
| }, |
| { |
| "epoch": 1.0438690185546875, |
| "grad_norm": 241.1474609375, |
| "learning_rate": 1.2292861938476564e-07, |
| "lookahead_loss": 6.692067070007324, |
| "loss": 4.1574, |
| "step": 523000 |
| }, |
| { |
| "epoch": 1.0448226928710938, |
| "grad_norm": 85.80900573730469, |
| "learning_rate": 7.524490356445312e-08, |
| "lookahead_loss": 6.683690548062325, |
| "loss": 4.1495, |
| "step": 523500 |
| }, |
| { |
| "epoch": 1.0457763671875, |
| "grad_norm": 200.30140686035156, |
| "learning_rate": 2.7561187744140627e-08, |
| "lookahead_loss": 6.722083292961121, |
| "loss": 4.1804, |
| "step": 524000 |
| }, |
| { |
| "epoch": 1.04632568359375, |
| "step": 524288, |
| "total_flos": 5.045399375119909e+18, |
| "train_loss": 0.6306409304961562, |
| "train_runtime": 55280.8486, |
| "train_samples_per_second": 37.936, |
| "train_steps_per_second": 9.484 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 524288, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.045399375119909e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
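
The layout above matches the `trainer_state.json` that the Hugging Face Trainer writes: training entries in `log_history` carry `loss`/`lookahead_loss`, evaluation entries carry the `eval_*` metrics, and `best_metric` records the lowest `eval_loss` seen. Below is a minimal sketch for inspecting such a file, assuming the JSON is saved verbatim as `trainer_state.json` (a hypothetical filename) and that matplotlib is installed; it is not part of the original run, only an illustration of how the structure can be consumed.

```python
# Minimal sketch: read a Trainer-style state file and plot its loss curves.
# Assumptions: the JSON above is stored as "trainer_state.json" (hypothetical
# name) and matplotlib is available in the environment.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]

# Training log entries carry "loss"; evaluation entries carry "eval_loss".
train = [(h["step"], h["loss"]) for h in history if "loss" in h]
evals = [(h["step"], h["eval_loss"]) for h in history if "eval_loss" in h]

plt.plot(*zip(*train), label="train loss")
plt.plot(*zip(*evals), marker="o", label="eval loss")
plt.xlabel("global step")
plt.ylabel("cross-entropy loss")
plt.legend()
plt.tight_layout()
plt.savefig("loss_curves.png")

# The lowest eval_loss should agree with the file's "best_metric" field.
best_step, best_loss = min(evals, key=lambda x: x[1])
print(f"best eval_loss {best_loss:.6f} at step {best_step}")
```

Run against the file above, the final print should report the same value as `best_metric` (4.0928 at step 520000), which is a quick sanity check that the entries were parsed as intended.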
|
|