diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10931 @@ +{ + "best_global_step": null, + "best_metric": 0.1416631042957306, + "best_model_checkpoint": null, + "epoch": 2.1178512573242188, + "eval_steps": 5000, + "global_step": 524288, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "base_loss": 0.3044133240580559, + "epoch": 0.00095367431640625, + "grad_norm": 0.0010083107044920325, + "learning_rate": 4.995241165161133e-05, + "lookahead_loss": 10.315906455993652, + "loss": 0.3208, + "step": 500 + }, + { + "base_loss": 0.30059696701169014, + "epoch": 0.0019073486328125, + "grad_norm": 0.0010226964950561523, + "learning_rate": 4.990472793579102e-05, + "lookahead_loss": 10.178516641616822, + "loss": 0.3205, + "step": 1000 + }, + { + "base_loss": 0.31169990518689156, + "epoch": 0.00286102294921875, + "grad_norm": 0.001013169065117836, + "learning_rate": 4.98570442199707e-05, + "lookahead_loss": 10.051177593231202, + "loss": 0.3281, + "step": 1500 + }, + { + "base_loss": 0.3227726019620895, + "epoch": 0.003814697265625, + "grad_norm": 0.0010217369999736547, + "learning_rate": 4.9809360504150393e-05, + "lookahead_loss": 9.926475008010865, + "loss": 0.3417, + "step": 2000 + }, + { + "base_loss": 0.3022470915019512, + "epoch": 0.00476837158203125, + "grad_norm": 0.0010057457257062197, + "learning_rate": 4.9761676788330084e-05, + "lookahead_loss": 9.79694257736206, + "loss": 0.3232, + "step": 2500 + }, + { + "base_loss": 0.30552061820030213, + "epoch": 0.0057220458984375, + "grad_norm": 0.0008910459582693875, + "learning_rate": 4.971399307250977e-05, + "lookahead_loss": 9.700162549972534, + "loss": 0.3197, + "step": 3000 + }, + { + "base_loss": 0.2953472335338593, + "epoch": 0.00667572021484375, + "grad_norm": 0.001025513163767755, + "learning_rate": 4.966630935668946e-05, + "lookahead_loss": 9.54606210899353, + "loss": 0.3201, + "step": 3500 + }, + { + "base_loss": 0.312746944963932, + "epoch": 0.00762939453125, + "grad_norm": 0.0010036778403446078, + "learning_rate": 4.961862564086914e-05, + "lookahead_loss": 9.464179010391236, + "loss": 0.3296, + "step": 4000 + }, + { + "base_loss": 0.3169711889922619, + "epoch": 0.00858306884765625, + "grad_norm": 0.0009707122226245701, + "learning_rate": 4.957094192504883e-05, + "lookahead_loss": 9.352066455841065, + "loss": 0.3273, + "step": 4500 + }, + { + "base_loss": 0.306710629016161, + "epoch": 0.0095367431640625, + "grad_norm": 0.001031655934639275, + "learning_rate": 4.952325820922852e-05, + "lookahead_loss": 9.252665700912475, + "loss": 0.3249, + "step": 5000 + }, + { + "epoch": 0.0095367431640625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 9.162839600072502, + "eval_lookahead_perplexity": 9536.09737037476, + "eval_loss": 0.147327721118927, + "eval_perplexity": 1.158733642297792, + "eval_runtime": 476.7291, + "eval_samples_per_second": 10.488, + "eval_steps_per_second": 0.329, + "step": 5000 + }, + { + "base_loss": 0.30083237382769584, + "epoch": 0.01049041748046875, + "grad_norm": 0.0010012760758399963, + "learning_rate": 4.9475574493408205e-05, + "lookahead_loss": 9.12458013534546, + "loss": 0.316, + "step": 5500 + }, + { + "base_loss": 0.2993237827420235, + "epoch": 0.011444091796875, + "grad_norm": 0.001035403460264206, + "learning_rate": 4.9427890777587895e-05, + "lookahead_loss": 9.04572417831421, + "loss": 0.3185, + "step": 6000 + }, + { + "base_loss": 0.3238567093908787, + "epoch": 0.01239776611328125, + "grad_norm": 0.0008969915215857327, + "learning_rate": 4.938020706176758e-05, + "lookahead_loss": 8.952016605377198, + "loss": 0.3386, + "step": 6500 + }, + { + "base_loss": 0.3051931007504463, + "epoch": 0.0133514404296875, + "grad_norm": 0.000971041910815984, + "learning_rate": 4.933252334594727e-05, + "lookahead_loss": 8.886044243812561, + "loss": 0.3244, + "step": 7000 + }, + { + "base_loss": 0.29808008483052256, + "epoch": 0.01430511474609375, + "grad_norm": 0.0009857703698799014, + "learning_rate": 4.928483963012696e-05, + "lookahead_loss": 8.771625858306885, + "loss": 0.3177, + "step": 7500 + }, + { + "base_loss": 0.29345863962173463, + "epoch": 0.0152587890625, + "grad_norm": 0.0009497535647824407, + "learning_rate": 4.923715591430664e-05, + "lookahead_loss": 8.633492926597595, + "loss": 0.3098, + "step": 8000 + }, + { + "base_loss": 0.3092884007692337, + "epoch": 0.01621246337890625, + "grad_norm": 0.0010520165087655187, + "learning_rate": 4.918947219848633e-05, + "lookahead_loss": 8.621233073234558, + "loss": 0.3252, + "step": 8500 + }, + { + "base_loss": 0.31143338218331335, + "epoch": 0.0171661376953125, + "grad_norm": 0.0009231261792592704, + "learning_rate": 4.9141788482666016e-05, + "lookahead_loss": 8.558024926185608, + "loss": 0.3269, + "step": 9000 + }, + { + "base_loss": 0.3001442384421825, + "epoch": 0.01811981201171875, + "grad_norm": 0.0009771535405889153, + "learning_rate": 4.9094104766845706e-05, + "lookahead_loss": 8.481963250160216, + "loss": 0.3153, + "step": 9500 + }, + { + "base_loss": 0.2986592257618904, + "epoch": 0.019073486328125, + "grad_norm": 0.000987049425020814, + "learning_rate": 4.9046421051025396e-05, + "lookahead_loss": 8.409450398445129, + "loss": 0.3149, + "step": 10000 + }, + { + "epoch": 0.019073486328125, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 8.298678596179705, + "eval_lookahead_perplexity": 4018.5587449798586, + "eval_loss": 0.14565235376358032, + "eval_perplexity": 1.1567939630712722, + "eval_runtime": 489.7259, + "eval_samples_per_second": 10.21, + "eval_steps_per_second": 0.321, + "step": 10000 + }, + { + "base_loss": 0.30347599306702616, + "epoch": 0.02002716064453125, + "grad_norm": 0.0010771030792966485, + "learning_rate": 4.899873733520508e-05, + "lookahead_loss": 8.271695964813233, + "loss": 0.3189, + "step": 10500 + }, + { + "base_loss": 0.3299741225540638, + "epoch": 0.0209808349609375, + "grad_norm": 0.0009491143864579499, + "learning_rate": 4.895105361938477e-05, + "lookahead_loss": 8.27685186958313, + "loss": 0.344, + "step": 11000 + }, + { + "base_loss": 0.3070560489296913, + "epoch": 0.02193450927734375, + "grad_norm": 0.0009909559739753604, + "learning_rate": 4.890336990356445e-05, + "lookahead_loss": 8.1793439950943, + "loss": 0.3199, + "step": 11500 + }, + { + "base_loss": 0.301061170309782, + "epoch": 0.02288818359375, + "grad_norm": 0.001020422438159585, + "learning_rate": 4.8855686187744143e-05, + "lookahead_loss": 8.126035836219788, + "loss": 0.3167, + "step": 12000 + }, + { + "base_loss": 0.30337609922885894, + "epoch": 0.02384185791015625, + "grad_norm": 0.0009905572514981031, + "learning_rate": 4.8808002471923834e-05, + "lookahead_loss": 8.095245086669921, + "loss": 0.3178, + "step": 12500 + }, + { + "base_loss": 0.3241444931924343, + "epoch": 0.0247955322265625, + "grad_norm": 0.0009353117784485221, + "learning_rate": 4.876031875610352e-05, + "lookahead_loss": 8.033969619750977, + "loss": 0.3394, + "step": 13000 + }, + { + "base_loss": 0.3070600248277187, + "epoch": 0.02574920654296875, + "grad_norm": 0.000984247657470405, + "learning_rate": 4.871263504028321e-05, + "lookahead_loss": 7.95574036693573, + "loss": 0.3244, + "step": 13500 + }, + { + "base_loss": 0.3022406686246395, + "epoch": 0.026702880859375, + "grad_norm": 0.001025758683681488, + "learning_rate": 4.866495132446289e-05, + "lookahead_loss": 7.927939188957215, + "loss": 0.3141, + "step": 14000 + }, + { + "base_loss": 0.30680677881836893, + "epoch": 0.02765655517578125, + "grad_norm": 0.0009658489725552499, + "learning_rate": 4.861726760864258e-05, + "lookahead_loss": 7.988002327919006, + "loss": 0.3219, + "step": 14500 + }, + { + "base_loss": 0.33426042160391806, + "epoch": 0.0286102294921875, + "grad_norm": 0.0010099124629050493, + "learning_rate": 4.856958389282227e-05, + "lookahead_loss": 7.973947680473327, + "loss": 0.3455, + "step": 15000 + }, + { + "epoch": 0.0286102294921875, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 7.854276204642396, + "eval_lookahead_perplexity": 2576.729413719151, + "eval_loss": 0.1447858065366745, + "eval_perplexity": 1.1557919806657084, + "eval_runtime": 476.5774, + "eval_samples_per_second": 10.491, + "eval_steps_per_second": 0.329, + "step": 15000 + }, + { + "base_loss": 0.3049518305659294, + "epoch": 0.02956390380859375, + "grad_norm": 0.0009620142518542707, + "learning_rate": 4.8521900177001955e-05, + "lookahead_loss": 7.91404645729065, + "loss": 0.3161, + "step": 15500 + }, + { + "base_loss": 0.3062360401749611, + "epoch": 0.030517578125, + "grad_norm": 0.0009641471551731229, + "learning_rate": 4.8474216461181645e-05, + "lookahead_loss": 7.844175812721253, + "loss": 0.3196, + "step": 16000 + }, + { + "base_loss": 0.30225355681777, + "epoch": 0.03147125244140625, + "grad_norm": 0.0009732363396324217, + "learning_rate": 4.842653274536133e-05, + "lookahead_loss": 7.831875602722168, + "loss": 0.3166, + "step": 16500 + }, + { + "base_loss": 0.3184074863195419, + "epoch": 0.0324249267578125, + "grad_norm": 0.0010106490226462483, + "learning_rate": 4.837884902954102e-05, + "lookahead_loss": 7.771908633232116, + "loss": 0.3381, + "step": 17000 + }, + { + "base_loss": 0.30629492220282556, + "epoch": 0.03337860107421875, + "grad_norm": 0.0010188270825892687, + "learning_rate": 4.833116531372071e-05, + "lookahead_loss": 7.789399157524109, + "loss": 0.3185, + "step": 17500 + }, + { + "base_loss": 0.3031555346250534, + "epoch": 0.034332275390625, + "grad_norm": 0.0009390591876581311, + "learning_rate": 4.828348159790039e-05, + "lookahead_loss": 7.772115784645081, + "loss": 0.3169, + "step": 18000 + }, + { + "base_loss": 0.31164542263746264, + "epoch": 0.03528594970703125, + "grad_norm": 0.0010221318807452917, + "learning_rate": 4.823579788208008e-05, + "lookahead_loss": 7.639335807800293, + "loss": 0.3253, + "step": 18500 + }, + { + "base_loss": 0.324304408878088, + "epoch": 0.0362396240234375, + "grad_norm": 0.00101387407630682, + "learning_rate": 4.8188114166259766e-05, + "lookahead_loss": 7.712016674995422, + "loss": 0.3383, + "step": 19000 + }, + { + "base_loss": 0.30813179594278334, + "epoch": 0.03719329833984375, + "grad_norm": 0.0009941563475877047, + "learning_rate": 4.8140430450439456e-05, + "lookahead_loss": 7.633579847335816, + "loss": 0.3224, + "step": 19500 + }, + { + "base_loss": 0.30138176554441454, + "epoch": 0.03814697265625, + "grad_norm": 0.0009536141296848655, + "learning_rate": 4.8092746734619146e-05, + "lookahead_loss": 7.657862429618835, + "loss": 0.3164, + "step": 20000 + }, + { + "epoch": 0.03814697265625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 7.604301906622256, + "eval_lookahead_perplexity": 2006.810463506877, + "eval_loss": 0.14430111646652222, + "eval_perplexity": 1.1552319155094923, + "eval_runtime": 492.3491, + "eval_samples_per_second": 10.155, + "eval_steps_per_second": 0.319, + "step": 20000 + }, + { + "base_loss": 0.30871694785356524, + "epoch": 0.03910064697265625, + "grad_norm": 0.0009617543546482921, + "learning_rate": 4.804506301879883e-05, + "lookahead_loss": 7.594292169570923, + "loss": 0.3229, + "step": 20500 + }, + { + "base_loss": 0.32506244936585427, + "epoch": 0.0400543212890625, + "grad_norm": 0.0009832490468397737, + "learning_rate": 4.799737930297852e-05, + "lookahead_loss": 7.60844051361084, + "loss": 0.3364, + "step": 21000 + }, + { + "base_loss": 0.30769926142692566, + "epoch": 0.04100799560546875, + "grad_norm": 0.0009847276378422976, + "learning_rate": 4.79496955871582e-05, + "lookahead_loss": 7.544025864601135, + "loss": 0.3191, + "step": 21500 + }, + { + "base_loss": 0.29858891409635546, + "epoch": 0.041961669921875, + "grad_norm": 0.0010060840286314487, + "learning_rate": 4.7902011871337893e-05, + "lookahead_loss": 7.582731894493103, + "loss": 0.3122, + "step": 22000 + }, + { + "base_loss": 0.3094627737402916, + "epoch": 0.04291534423828125, + "grad_norm": 0.0009809609036892653, + "learning_rate": 4.7854328155517584e-05, + "lookahead_loss": 7.609361615180969, + "loss": 0.3268, + "step": 22500 + }, + { + "base_loss": 0.32764697542786597, + "epoch": 0.0438690185546875, + "grad_norm": 0.0009822545107454062, + "learning_rate": 4.780664443969727e-05, + "lookahead_loss": 7.582602263450623, + "loss": 0.343, + "step": 23000 + }, + { + "base_loss": 0.29553532418608663, + "epoch": 0.04482269287109375, + "grad_norm": 0.0010076743783429265, + "learning_rate": 4.775896072387696e-05, + "lookahead_loss": 7.513092971801758, + "loss": 0.3112, + "step": 23500 + }, + { + "base_loss": 0.3041268612146378, + "epoch": 0.0457763671875, + "grad_norm": 0.0009422925650142133, + "learning_rate": 4.771127700805664e-05, + "lookahead_loss": 7.501417625427246, + "loss": 0.3176, + "step": 24000 + }, + { + "base_loss": 0.3303914776444435, + "epoch": 0.04673004150390625, + "grad_norm": 0.000979002215899527, + "learning_rate": 4.766359329223633e-05, + "lookahead_loss": 7.502476096153259, + "loss": 0.3419, + "step": 24500 + }, + { + "base_loss": 0.3248122656941414, + "epoch": 0.0476837158203125, + "grad_norm": 0.0010156352072954178, + "learning_rate": 4.761590957641602e-05, + "lookahead_loss": 7.487288349151611, + "loss": 0.3412, + "step": 25000 + }, + { + "epoch": 0.0476837158203125, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 7.440544437676573, + "eval_lookahead_perplexity": 1703.677514928871, + "eval_loss": 0.14398084580898285, + "eval_perplexity": 1.1548619878659485, + "eval_runtime": 487.508, + "eval_samples_per_second": 10.256, + "eval_steps_per_second": 0.322, + "step": 25000 + }, + { + "base_loss": 0.2969100174307823, + "epoch": 0.04863739013671875, + "grad_norm": 0.0009701626840978861, + "learning_rate": 4.7568225860595705e-05, + "lookahead_loss": 7.416944421768188, + "loss": 0.3108, + "step": 25500 + }, + { + "base_loss": 0.302005185931921, + "epoch": 0.049591064453125, + "grad_norm": 0.0009941664757207036, + "learning_rate": 4.7520542144775395e-05, + "lookahead_loss": 7.415986575126648, + "loss": 0.3185, + "step": 26000 + }, + { + "base_loss": 0.31874441370368006, + "epoch": 0.05054473876953125, + "grad_norm": 0.0009409674676135182, + "learning_rate": 4.747285842895508e-05, + "lookahead_loss": 7.50030288696289, + "loss": 0.3333, + "step": 26500 + }, + { + "base_loss": 0.30408672893047334, + "epoch": 0.0514984130859375, + "grad_norm": 0.0009882714366540313, + "learning_rate": 4.742517471313477e-05, + "lookahead_loss": 7.433645064353943, + "loss": 0.3203, + "step": 27000 + }, + { + "base_loss": 0.3058005510568619, + "epoch": 0.05245208740234375, + "grad_norm": 0.0010352524695917964, + "learning_rate": 4.737749099731446e-05, + "lookahead_loss": 7.385928537368774, + "loss": 0.3201, + "step": 27500 + }, + { + "base_loss": 0.32026463899016383, + "epoch": 0.05340576171875, + "grad_norm": 0.0009495351114310324, + "learning_rate": 4.732980728149414e-05, + "lookahead_loss": 7.358126895904541, + "loss": 0.3313, + "step": 28000 + }, + { + "base_loss": 0.35889338579773905, + "epoch": 0.05435943603515625, + "grad_norm": 0.0009934919653460383, + "learning_rate": 4.728212356567383e-05, + "lookahead_loss": 7.398619123458863, + "loss": 0.3729, + "step": 28500 + }, + { + "base_loss": 0.29546374672651293, + "epoch": 0.0553131103515625, + "grad_norm": 0.0009958905866369605, + "learning_rate": 4.7234439849853516e-05, + "lookahead_loss": 7.388701396942139, + "loss": 0.3082, + "step": 29000 + }, + { + "base_loss": 0.3063408683240414, + "epoch": 0.05626678466796875, + "grad_norm": 0.0009431101498194039, + "learning_rate": 4.7186756134033206e-05, + "lookahead_loss": 7.369903712272644, + "loss": 0.3195, + "step": 29500 + }, + { + "base_loss": 0.3186078954935074, + "epoch": 0.057220458984375, + "grad_norm": 0.0009628318366594613, + "learning_rate": 4.7139072418212896e-05, + "lookahead_loss": 7.403464751243591, + "loss": 0.3334, + "step": 30000 + }, + { + "epoch": 0.057220458984375, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 7.322395800020748, + "eval_lookahead_perplexity": 1513.8264541066753, + "eval_loss": 0.1437508761882782, + "eval_perplexity": 1.154596435228323, + "eval_runtime": 480.8305, + "eval_samples_per_second": 10.399, + "eval_steps_per_second": 0.327, + "step": 30000 + }, + { + "base_loss": 0.31768571627140046, + "epoch": 0.05817413330078125, + "grad_norm": 0.0009793334174901247, + "learning_rate": 4.709138870239258e-05, + "lookahead_loss": 7.361583526611328, + "loss": 0.3287, + "step": 30500 + }, + { + "base_loss": 0.2922684009075165, + "epoch": 0.0591278076171875, + "grad_norm": 0.0009462712332606316, + "learning_rate": 4.704370498657227e-05, + "lookahead_loss": 7.2946535530090335, + "loss": 0.3098, + "step": 31000 + }, + { + "base_loss": 0.30112267237901685, + "epoch": 0.06008148193359375, + "grad_norm": 0.0009671795414760709, + "learning_rate": 4.699602127075195e-05, + "lookahead_loss": 7.319288095474243, + "loss": 0.3168, + "step": 31500 + }, + { + "base_loss": 0.32029621145129206, + "epoch": 0.06103515625, + "grad_norm": 0.0009950937237590551, + "learning_rate": 4.6948337554931643e-05, + "lookahead_loss": 7.297159067153931, + "loss": 0.333, + "step": 32000 + }, + { + "base_loss": 0.30533574494719506, + "epoch": 0.06198883056640625, + "grad_norm": 0.0010346778435632586, + "learning_rate": 4.6900653839111334e-05, + "lookahead_loss": 7.286148567199707, + "loss": 0.3169, + "step": 32500 + }, + { + "base_loss": 0.30571810373663905, + "epoch": 0.0629425048828125, + "grad_norm": 0.0010247246827930212, + "learning_rate": 4.685297012329102e-05, + "lookahead_loss": 7.249496428489685, + "loss": 0.3185, + "step": 33000 + }, + { + "base_loss": 0.31451627737283705, + "epoch": 0.06389617919921875, + "grad_norm": 0.0009608972468413413, + "learning_rate": 4.680528640747071e-05, + "lookahead_loss": 7.3038947277069095, + "loss": 0.3298, + "step": 33500 + }, + { + "base_loss": 0.30425655883550645, + "epoch": 0.064849853515625, + "grad_norm": 0.0009828249458223581, + "learning_rate": 4.675760269165039e-05, + "lookahead_loss": 7.304937886238098, + "loss": 0.3192, + "step": 34000 + }, + { + "base_loss": 0.31105126801133154, + "epoch": 0.06580352783203125, + "grad_norm": 0.0009732933831401169, + "learning_rate": 4.670991897583008e-05, + "lookahead_loss": 7.2146655473709105, + "loss": 0.3228, + "step": 34500 + }, + { + "base_loss": 0.3071163959801197, + "epoch": 0.0667572021484375, + "grad_norm": 0.0009960451861843467, + "learning_rate": 4.666223526000977e-05, + "lookahead_loss": 7.182215183258057, + "loss": 0.3182, + "step": 35000 + }, + { + "epoch": 0.0667572021484375, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 7.2324602169731556, + "eval_lookahead_perplexity": 1383.6223313598848, + "eval_loss": 0.14357592165470123, + "eval_perplexity": 1.1543944510170698, + "eval_runtime": 512.9222, + "eval_samples_per_second": 9.748, + "eval_steps_per_second": 0.306, + "step": 35000 + }, + { + "base_loss": 0.3321034919023514, + "epoch": 0.06771087646484375, + "grad_norm": 0.0010179802775382996, + "learning_rate": 4.6614551544189455e-05, + "lookahead_loss": 7.266714824676514, + "loss": 0.3445, + "step": 35500 + }, + { + "base_loss": 0.3017843673825264, + "epoch": 0.06866455078125, + "grad_norm": 0.0009934077970683575, + "learning_rate": 4.6566867828369145e-05, + "lookahead_loss": 7.266165484428406, + "loss": 0.312, + "step": 36000 + }, + { + "base_loss": 0.302195555627346, + "epoch": 0.06961822509765625, + "grad_norm": 0.0009844391606748104, + "learning_rate": 4.651918411254883e-05, + "lookahead_loss": 7.245750316619873, + "loss": 0.318, + "step": 36500 + }, + { + "base_loss": 0.3459869565963745, + "epoch": 0.0705718994140625, + "grad_norm": 0.0009586875676177442, + "learning_rate": 4.647150039672852e-05, + "lookahead_loss": 7.152301582336426, + "loss": 0.3617, + "step": 37000 + }, + { + "base_loss": 0.3151495299339294, + "epoch": 0.07152557373046875, + "grad_norm": 0.0009651901782490313, + "learning_rate": 4.642381668090821e-05, + "lookahead_loss": 7.200114166259765, + "loss": 0.3277, + "step": 37500 + }, + { + "base_loss": 0.30790447345376015, + "epoch": 0.072479248046875, + "grad_norm": 0.001032789470627904, + "learning_rate": 4.637613296508789e-05, + "lookahead_loss": 7.234860193252564, + "loss": 0.3219, + "step": 38000 + }, + { + "base_loss": 0.30545566940307617, + "epoch": 0.07343292236328125, + "grad_norm": 0.0009383106953464448, + "learning_rate": 4.632844924926758e-05, + "lookahead_loss": 7.182545600891113, + "loss": 0.32, + "step": 38500 + }, + { + "base_loss": 0.32841417971253395, + "epoch": 0.0743865966796875, + "grad_norm": 0.000990406610071659, + "learning_rate": 4.6280765533447266e-05, + "lookahead_loss": 7.212004456520081, + "loss": 0.3421, + "step": 39000 + }, + { + "base_loss": 0.30363579127192497, + "epoch": 0.07534027099609375, + "grad_norm": 0.0010100390063598752, + "learning_rate": 4.6233081817626956e-05, + "lookahead_loss": 7.244741122245789, + "loss": 0.3174, + "step": 39500 + }, + { + "base_loss": 0.30504586565494535, + "epoch": 0.0762939453125, + "grad_norm": 0.0009652067092247307, + "learning_rate": 4.6185398101806646e-05, + "lookahead_loss": 7.1760479412078855, + "loss": 0.3193, + "step": 40000 + }, + { + "epoch": 0.0762939453125, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 7.159913369261038, + "eval_lookahead_perplexity": 1286.7994516897043, + "eval_loss": 0.14343445003032684, + "eval_perplexity": 1.1542311485105234, + "eval_runtime": 502.1684, + "eval_samples_per_second": 9.957, + "eval_steps_per_second": 0.313, + "step": 40000 + }, + { + "base_loss": 0.33029799509048463, + "epoch": 0.07724761962890625, + "grad_norm": 0.0010073404991999269, + "learning_rate": 4.613771438598633e-05, + "lookahead_loss": 7.19347186088562, + "loss": 0.3467, + "step": 40500 + }, + { + "base_loss": 0.3037954642176628, + "epoch": 0.0782012939453125, + "grad_norm": 0.0009825917659327388, + "learning_rate": 4.609003067016602e-05, + "lookahead_loss": 7.184024220466614, + "loss": 0.3181, + "step": 41000 + }, + { + "base_loss": 0.29821320512890814, + "epoch": 0.07915496826171875, + "grad_norm": 0.0009405228192918003, + "learning_rate": 4.60423469543457e-05, + "lookahead_loss": 7.188343933105469, + "loss": 0.3143, + "step": 41500 + }, + { + "base_loss": 0.3142137563228607, + "epoch": 0.080108642578125, + "grad_norm": 0.0009637173498049378, + "learning_rate": 4.5994663238525393e-05, + "lookahead_loss": 7.147234386444092, + "loss": 0.3317, + "step": 42000 + }, + { + "base_loss": 0.3222310249209404, + "epoch": 0.08106231689453125, + "grad_norm": 0.0009848393965512514, + "learning_rate": 4.5946979522705084e-05, + "lookahead_loss": 7.199878736495972, + "loss": 0.3406, + "step": 42500 + }, + { + "base_loss": 0.3002626436650753, + "epoch": 0.0820159912109375, + "grad_norm": 0.000929056026507169, + "learning_rate": 4.589929580688477e-05, + "lookahead_loss": 7.160097922325134, + "loss": 0.3137, + "step": 43000 + }, + { + "base_loss": 0.3045452245473862, + "epoch": 0.08296966552734375, + "grad_norm": 0.0009913038229569793, + "learning_rate": 4.585161209106446e-05, + "lookahead_loss": 7.1922206773757935, + "loss": 0.3198, + "step": 43500 + }, + { + "base_loss": 0.33469617655873296, + "epoch": 0.08392333984375, + "grad_norm": 0.0009477322455495596, + "learning_rate": 4.580392837524414e-05, + "lookahead_loss": 7.188063373565674, + "loss": 0.347, + "step": 44000 + }, + { + "base_loss": 0.30740025800466536, + "epoch": 0.08487701416015625, + "grad_norm": 0.0009691762970760465, + "learning_rate": 4.575624465942383e-05, + "lookahead_loss": 7.11799400806427, + "loss": 0.3204, + "step": 44500 + }, + { + "base_loss": 0.300477741509676, + "epoch": 0.0858306884765625, + "grad_norm": 0.000998710049316287, + "learning_rate": 4.570856094360352e-05, + "lookahead_loss": 7.151413684844971, + "loss": 0.3112, + "step": 45000 + }, + { + "epoch": 0.0858306884765625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 7.100333274743808, + "eval_lookahead_perplexity": 1212.3710598241469, + "eval_loss": 0.14331814646720886, + "eval_perplexity": 1.154096915121352, + "eval_runtime": 479.3028, + "eval_samples_per_second": 10.432, + "eval_steps_per_second": 0.328, + "step": 45000 + }, + { + "base_loss": 0.301901563256979, + "epoch": 0.08678436279296875, + "grad_norm": 0.0011987128527835011, + "learning_rate": 4.5660877227783205e-05, + "lookahead_loss": 7.107098340034485, + "loss": 0.3142, + "step": 45500 + }, + { + "base_loss": 0.338142231285572, + "epoch": 0.087738037109375, + "grad_norm": 0.0009075519046746194, + "learning_rate": 4.5613193511962895e-05, + "lookahead_loss": 7.136958950996399, + "loss": 0.3464, + "step": 46000 + }, + { + "base_loss": 0.3009798979461193, + "epoch": 0.08869171142578125, + "grad_norm": 0.0009948944207280874, + "learning_rate": 4.556550979614258e-05, + "lookahead_loss": 7.107201243400573, + "loss": 0.3132, + "step": 46500 + }, + { + "base_loss": 0.3090392453968525, + "epoch": 0.0896453857421875, + "grad_norm": 0.0010095473844558, + "learning_rate": 4.551782608032227e-05, + "lookahead_loss": 7.135781683921814, + "loss": 0.3192, + "step": 47000 + }, + { + "base_loss": 0.30036539113521576, + "epoch": 0.09059906005859375, + "grad_norm": 0.0009663203964009881, + "learning_rate": 4.547014236450196e-05, + "lookahead_loss": 7.099122268676758, + "loss": 0.3132, + "step": 47500 + }, + { + "base_loss": 0.3006012495756149, + "epoch": 0.091552734375, + "grad_norm": 0.0009152375860139728, + "learning_rate": 4.542245864868164e-05, + "lookahead_loss": 7.080106061935425, + "loss": 0.3121, + "step": 48000 + }, + { + "base_loss": 0.31875682109594344, + "epoch": 0.09250640869140625, + "grad_norm": 0.0009438424604013562, + "learning_rate": 4.537477493286133e-05, + "lookahead_loss": 7.104524848937988, + "loss": 0.3365, + "step": 48500 + }, + { + "base_loss": 0.3104289738535881, + "epoch": 0.0934600830078125, + "grad_norm": 0.000929334491956979, + "learning_rate": 4.5327091217041016e-05, + "lookahead_loss": 7.102272230148316, + "loss": 0.3228, + "step": 49000 + }, + { + "base_loss": 0.2877590928971767, + "epoch": 0.09441375732421875, + "grad_norm": 0.000983032863587141, + "learning_rate": 4.5279407501220706e-05, + "lookahead_loss": 7.092049780845642, + "loss": 0.3028, + "step": 49500 + }, + { + "base_loss": 0.2935507807135582, + "epoch": 0.095367431640625, + "grad_norm": 0.0009688133141025901, + "learning_rate": 4.523172378540039e-05, + "lookahead_loss": 7.024641987800599, + "loss": 0.3084, + "step": 50000 + }, + { + "epoch": 0.095367431640625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 7.052693091261501, + "eval_lookahead_perplexity": 1155.9676810375186, + "eval_loss": 0.1432213932275772, + "eval_perplexity": 1.1539852579076508, + "eval_runtime": 494.2154, + "eval_samples_per_second": 10.117, + "eval_steps_per_second": 0.318, + "step": 50000 + }, + { + "base_loss": 0.2986202912926674, + "epoch": 0.09632110595703125, + "grad_norm": 0.0009543218766339123, + "learning_rate": 4.518404006958008e-05, + "lookahead_loss": 7.0854365358352664, + "loss": 0.3132, + "step": 50500 + }, + { + "base_loss": 0.3307524161040783, + "epoch": 0.0972747802734375, + "grad_norm": 0.0010054496815428138, + "learning_rate": 4.513635635375977e-05, + "lookahead_loss": 7.08996038722992, + "loss": 0.3427, + "step": 51000 + }, + { + "base_loss": 0.29244673988223074, + "epoch": 0.09822845458984375, + "grad_norm": 0.0010033181170001626, + "learning_rate": 4.508867263793945e-05, + "lookahead_loss": 7.052651536941529, + "loss": 0.3083, + "step": 51500 + }, + { + "base_loss": 0.295786843508482, + "epoch": 0.09918212890625, + "grad_norm": 0.0009846463799476624, + "learning_rate": 4.5040988922119143e-05, + "lookahead_loss": 7.07691504573822, + "loss": 0.3121, + "step": 52000 + }, + { + "base_loss": 0.30293611577153207, + "epoch": 0.10013580322265625, + "grad_norm": 0.0009364968864247203, + "learning_rate": 4.499330520629883e-05, + "lookahead_loss": 7.069658821105957, + "loss": 0.3161, + "step": 52500 + }, + { + "base_loss": 0.3240869597494602, + "epoch": 0.1010894775390625, + "grad_norm": 0.0009558585588820279, + "learning_rate": 4.494562149047852e-05, + "lookahead_loss": 7.12118856048584, + "loss": 0.335, + "step": 53000 + }, + { + "base_loss": 0.30599541807174685, + "epoch": 0.10204315185546875, + "grad_norm": 0.000964898441452533, + "learning_rate": 4.489793777465821e-05, + "lookahead_loss": 7.0878299045562745, + "loss": 0.3163, + "step": 53500 + }, + { + "base_loss": 0.2991089904308319, + "epoch": 0.102996826171875, + "grad_norm": 0.0009853472001850605, + "learning_rate": 4.485025405883789e-05, + "lookahead_loss": 7.047714894294739, + "loss": 0.3149, + "step": 54000 + }, + { + "base_loss": 0.30219315418601034, + "epoch": 0.10395050048828125, + "grad_norm": 0.0010090046562254429, + "learning_rate": 4.480257034301758e-05, + "lookahead_loss": 7.079160309791565, + "loss": 0.3135, + "step": 54500 + }, + { + "base_loss": 0.3133500624895096, + "epoch": 0.1049041748046875, + "grad_norm": 0.0009890320943668485, + "learning_rate": 4.4754886627197264e-05, + "lookahead_loss": 7.02062137889862, + "loss": 0.3281, + "step": 55000 + }, + { + "epoch": 0.1049041748046875, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 7.00973541820392, + "eval_lookahead_perplexity": 1107.3614784488032, + "eval_loss": 0.14313539862632751, + "eval_perplexity": 1.1538860256723285, + "eval_runtime": 478.5928, + "eval_samples_per_second": 10.447, + "eval_steps_per_second": 0.328, + "step": 55000 + }, + { + "base_loss": 0.3070584389269352, + "epoch": 0.10585784912109375, + "grad_norm": 0.0009455215185880661, + "learning_rate": 4.4707202911376955e-05, + "lookahead_loss": 6.998170353889465, + "loss": 0.3221, + "step": 55500 + }, + { + "base_loss": 0.29948241996765135, + "epoch": 0.1068115234375, + "grad_norm": 0.0009629906271584332, + "learning_rate": 4.4659519195556645e-05, + "lookahead_loss": 7.0833413105010985, + "loss": 0.3107, + "step": 56000 + }, + { + "base_loss": 0.29492466670274736, + "epoch": 0.10776519775390625, + "grad_norm": 0.0009873651433736086, + "learning_rate": 4.461183547973633e-05, + "lookahead_loss": 6.994237482070923, + "loss": 0.3092, + "step": 56500 + }, + { + "base_loss": 0.3188383647501469, + "epoch": 0.1087188720703125, + "grad_norm": 0.0010177677031606436, + "learning_rate": 4.456415176391602e-05, + "lookahead_loss": 6.982017017364502, + "loss": 0.3324, + "step": 57000 + }, + { + "base_loss": 0.31659464621543887, + "epoch": 0.10967254638671875, + "grad_norm": 0.0009399647242389619, + "learning_rate": 4.45164680480957e-05, + "lookahead_loss": 6.973290238380432, + "loss": 0.3271, + "step": 57500 + }, + { + "base_loss": 0.3013280538916588, + "epoch": 0.110626220703125, + "grad_norm": 0.0008893092744983733, + "learning_rate": 4.446878433227539e-05, + "lookahead_loss": 6.9135963726043705, + "loss": 0.3151, + "step": 58000 + }, + { + "base_loss": 0.29822684854269027, + "epoch": 0.11157989501953125, + "grad_norm": 0.0010066829854622483, + "learning_rate": 4.442110061645508e-05, + "lookahead_loss": 7.023115937232971, + "loss": 0.3097, + "step": 58500 + }, + { + "base_loss": 0.3082665235698223, + "epoch": 0.1125335693359375, + "grad_norm": 0.0009333739290013909, + "learning_rate": 4.4373416900634766e-05, + "lookahead_loss": 7.03084280014038, + "loss": 0.322, + "step": 59000 + }, + { + "base_loss": 0.34342152199149134, + "epoch": 0.11348724365234375, + "grad_norm": 0.001008225604891777, + "learning_rate": 4.4325733184814456e-05, + "lookahead_loss": 7.03166408252716, + "loss": 0.3512, + "step": 59500 + }, + { + "base_loss": 0.29527223294973376, + "epoch": 0.11444091796875, + "grad_norm": 0.0009543896303512156, + "learning_rate": 4.427804946899414e-05, + "lookahead_loss": 6.924260063171387, + "loss": 0.3096, + "step": 60000 + }, + { + "epoch": 0.11444091796875, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.971603098007056, + "eval_lookahead_perplexity": 1065.930172124952, + "eval_loss": 0.14305971562862396, + "eval_perplexity": 1.153798699423495, + "eval_runtime": 487.8431, + "eval_samples_per_second": 10.249, + "eval_steps_per_second": 0.322, + "step": 60000 + }, + { + "base_loss": 0.2983711498081684, + "epoch": 0.11539459228515625, + "grad_norm": 0.0009778671665117145, + "learning_rate": 4.423036575317383e-05, + "lookahead_loss": 6.959155892372132, + "loss": 0.3119, + "step": 60500 + }, + { + "base_loss": 0.3164993856549263, + "epoch": 0.1163482666015625, + "grad_norm": 0.0010577912908047438, + "learning_rate": 4.418268203735352e-05, + "lookahead_loss": 6.982672909736634, + "loss": 0.326, + "step": 61000 + }, + { + "base_loss": 0.3281388694047928, + "epoch": 0.11730194091796875, + "grad_norm": 0.0010003127390518785, + "learning_rate": 4.41349983215332e-05, + "lookahead_loss": 6.978295309066772, + "loss": 0.3447, + "step": 61500 + }, + { + "base_loss": 0.3066762860417366, + "epoch": 0.118255615234375, + "grad_norm": 0.0010272158542647958, + "learning_rate": 4.4087314605712893e-05, + "lookahead_loss": 6.9806537971496585, + "loss": 0.317, + "step": 62000 + }, + { + "base_loss": 0.3002779276072979, + "epoch": 0.11920928955078125, + "grad_norm": 0.0009698990033939481, + "learning_rate": 4.403963088989258e-05, + "lookahead_loss": 7.002115357398987, + "loss": 0.3116, + "step": 62500 + }, + { + "base_loss": 0.3048044160306454, + "epoch": 1.0009536743164062, + "grad_norm": 0.0009617453324608505, + "learning_rate": 4.399194717407227e-05, + "lookahead_loss": 7.047370400428772, + "loss": 0.3145, + "step": 63000 + }, + { + "base_loss": 0.2995053820014, + "epoch": 1.0019073486328125, + "grad_norm": 0.0010174677008762956, + "learning_rate": 4.394426345825196e-05, + "lookahead_loss": 6.895120985031128, + "loss": 0.3142, + "step": 63500 + }, + { + "base_loss": 0.31198617857694627, + "epoch": 1.0028610229492188, + "grad_norm": 0.0010111057199537754, + "learning_rate": 4.389657974243164e-05, + "lookahead_loss": 6.888555366516114, + "loss": 0.3226, + "step": 64000 + }, + { + "base_loss": 0.32396442687511445, + "epoch": 1.003814697265625, + "grad_norm": 0.0009548735106363893, + "learning_rate": 4.384889602661133e-05, + "lookahead_loss": 6.908667636871338, + "loss": 0.336, + "step": 64500 + }, + { + "base_loss": 0.3013957371413708, + "epoch": 1.0047683715820312, + "grad_norm": 0.000966136809438467, + "learning_rate": 4.3801212310791014e-05, + "lookahead_loss": 6.905056614875793, + "loss": 0.3168, + "step": 65000 + }, + { + "epoch": 1.0047683715820312, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.937254576637341, + "eval_lookahead_perplexity": 1029.9387120995427, + "eval_loss": 0.14299124479293823, + "eval_perplexity": 1.1537197005669222, + "eval_runtime": 494.7263, + "eval_samples_per_second": 10.107, + "eval_steps_per_second": 0.317, + "step": 65000 + }, + { + "base_loss": 0.3039788320362568, + "epoch": 1.0057220458984375, + "grad_norm": 0.0008604921749792993, + "learning_rate": 4.3753528594970705e-05, + "lookahead_loss": 7.014158073425293, + "loss": 0.3139, + "step": 65500 + }, + { + "base_loss": 0.29717833909392355, + "epoch": 1.0066757202148438, + "grad_norm": 0.0009630241547711194, + "learning_rate": 4.3705844879150395e-05, + "lookahead_loss": 6.847448231697083, + "loss": 0.3148, + "step": 66000 + }, + { + "base_loss": 0.31199148765206336, + "epoch": 1.00762939453125, + "grad_norm": 0.0010012584971264005, + "learning_rate": 4.365816116333008e-05, + "lookahead_loss": 6.918432865142822, + "loss": 0.3246, + "step": 66500 + }, + { + "base_loss": 0.3148621036410332, + "epoch": 1.0085830688476562, + "grad_norm": 0.0009159519104287028, + "learning_rate": 4.361047744750977e-05, + "lookahead_loss": 6.913657369136811, + "loss": 0.3229, + "step": 67000 + }, + { + "base_loss": 0.30580521461367605, + "epoch": 1.0095367431640625, + "grad_norm": 0.0009974334388971329, + "learning_rate": 4.356279373168945e-05, + "lookahead_loss": 6.923233027458191, + "loss": 0.3191, + "step": 67500 + }, + { + "base_loss": 0.3015244754254818, + "epoch": 1.0104904174804688, + "grad_norm": 0.0009639709023758769, + "learning_rate": 4.351511001586914e-05, + "lookahead_loss": 6.88383205986023, + "loss": 0.3128, + "step": 68000 + }, + { + "base_loss": 0.30137019059062004, + "epoch": 1.011444091796875, + "grad_norm": 0.0010148598812520504, + "learning_rate": 4.346742630004883e-05, + "lookahead_loss": 6.898301356315613, + "loss": 0.3139, + "step": 68500 + }, + { + "base_loss": 0.3252628707587719, + "epoch": 1.0123977661132812, + "grad_norm": 0.000888565497007221, + "learning_rate": 4.3419742584228516e-05, + "lookahead_loss": 6.891662693023681, + "loss": 0.3359, + "step": 69000 + }, + { + "base_loss": 0.30557073107361793, + "epoch": 1.0133514404296875, + "grad_norm": 0.0009476915001869202, + "learning_rate": 4.3372058868408206e-05, + "lookahead_loss": 6.974801006317139, + "loss": 0.3211, + "step": 69500 + }, + { + "base_loss": 0.30054079556465146, + "epoch": 1.0143051147460938, + "grad_norm": 0.0009728021686896682, + "learning_rate": 4.332437515258789e-05, + "lookahead_loss": 6.902195900917053, + "loss": 0.3158, + "step": 70000 + }, + { + "epoch": 1.0143051147460938, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.905760923513589, + "eval_lookahead_perplexity": 998.0076319369023, + "eval_loss": 0.14292870461940765, + "eval_perplexity": 1.1536475489928526, + "eval_runtime": 481.4279, + "eval_samples_per_second": 10.386, + "eval_steps_per_second": 0.326, + "step": 70000 + }, + { + "base_loss": 0.29648803743720054, + "epoch": 1.0152587890625, + "grad_norm": 0.0009132726700045168, + "learning_rate": 4.327669143676758e-05, + "lookahead_loss": 6.884706204414368, + "loss": 0.3079, + "step": 70500 + }, + { + "base_loss": 0.31412097451090815, + "epoch": 1.0162124633789062, + "grad_norm": 0.0009932307293638587, + "learning_rate": 4.322900772094727e-05, + "lookahead_loss": 6.908680680274963, + "loss": 0.326, + "step": 71000 + }, + { + "base_loss": 0.3125672063827515, + "epoch": 1.0171661376953125, + "grad_norm": 0.0009134129504673183, + "learning_rate": 4.318132400512695e-05, + "lookahead_loss": 6.958608627319336, + "loss": 0.3241, + "step": 71500 + }, + { + "base_loss": 0.3002317441105843, + "epoch": 1.0181198120117188, + "grad_norm": 0.0009274011244997382, + "learning_rate": 4.3133640289306643e-05, + "lookahead_loss": 6.960107209205628, + "loss": 0.3111, + "step": 72000 + }, + { + "base_loss": 0.29831535935401915, + "epoch": 1.019073486328125, + "grad_norm": 0.0009689630824141204, + "learning_rate": 4.308595657348633e-05, + "lookahead_loss": 6.973208980560303, + "loss": 0.3109, + "step": 72500 + }, + { + "base_loss": 0.3020369653701782, + "epoch": 1.0200271606445312, + "grad_norm": 0.001046851510182023, + "learning_rate": 4.303827285766602e-05, + "lookahead_loss": 6.801126411437989, + "loss": 0.3157, + "step": 73000 + }, + { + "base_loss": 0.32652922403812407, + "epoch": 1.0209808349609375, + "grad_norm": 0.0009485671180300415, + "learning_rate": 4.299058914184571e-05, + "lookahead_loss": 6.887979488372803, + "loss": 0.3396, + "step": 73500 + }, + { + "base_loss": 0.30453234216570857, + "epoch": 1.0219345092773438, + "grad_norm": 0.0009610042907297611, + "learning_rate": 4.294290542602539e-05, + "lookahead_loss": 6.849929617881775, + "loss": 0.3146, + "step": 74000 + }, + { + "base_loss": 0.2977458454966545, + "epoch": 1.02288818359375, + "grad_norm": 0.0010150724556297064, + "learning_rate": 4.289522171020508e-05, + "lookahead_loss": 6.878925356388092, + "loss": 0.3125, + "step": 74500 + }, + { + "base_loss": 0.30405546057224275, + "epoch": 1.0238418579101562, + "grad_norm": 0.0009638189221732318, + "learning_rate": 4.2847537994384764e-05, + "lookahead_loss": 6.84999968624115, + "loss": 0.3149, + "step": 75000 + }, + { + "epoch": 1.0238418579101562, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.875502051636815, + "eval_lookahead_perplexity": 968.2613607781705, + "eval_loss": 0.142868772149086, + "eval_perplexity": 1.1535784101172133, + "eval_runtime": 496.1931, + "eval_samples_per_second": 10.077, + "eval_steps_per_second": 0.316, + "step": 75000 + }, + { + "base_loss": 0.32463854083418847, + "epoch": 1.0247955322265625, + "grad_norm": 0.0009209099807776511, + "learning_rate": 4.2799854278564455e-05, + "lookahead_loss": 6.842094340324402, + "loss": 0.3362, + "step": 75500 + }, + { + "base_loss": 0.3075324648320675, + "epoch": 1.0257492065429688, + "grad_norm": 0.0009934107074514031, + "learning_rate": 4.2752170562744145e-05, + "lookahead_loss": 6.811694778442383, + "loss": 0.3238, + "step": 76000 + }, + { + "base_loss": 0.30398501074314116, + "epoch": 1.026702880859375, + "grad_norm": 0.0009826256427913904, + "learning_rate": 4.270448684692383e-05, + "lookahead_loss": 6.813813063621521, + "loss": 0.314, + "step": 76500 + }, + { + "base_loss": 0.3081837382018566, + "epoch": 1.0276565551757812, + "grad_norm": 0.0009539109887555242, + "learning_rate": 4.265680313110352e-05, + "lookahead_loss": 6.9388167886734005, + "loss": 0.319, + "step": 77000 + }, + { + "base_loss": 0.32895678067207335, + "epoch": 1.0286102294921875, + "grad_norm": 0.0009696350898593664, + "learning_rate": 4.26091194152832e-05, + "lookahead_loss": 6.9466577863693235, + "loss": 0.341, + "step": 77500 + }, + { + "base_loss": 0.30588172587752344, + "epoch": 1.0295639038085938, + "grad_norm": 0.0009499301086179912, + "learning_rate": 4.256143569946289e-05, + "lookahead_loss": 6.890705446243286, + "loss": 0.3154, + "step": 78000 + }, + { + "base_loss": 0.3051903445720673, + "epoch": 1.030517578125, + "grad_norm": 0.0009480732842348516, + "learning_rate": 4.251375198364258e-05, + "lookahead_loss": 6.864274346351624, + "loss": 0.3174, + "step": 78500 + }, + { + "base_loss": 0.30346439191699026, + "epoch": 1.0314712524414062, + "grad_norm": 0.0009676506742835045, + "learning_rate": 4.2466068267822266e-05, + "lookahead_loss": 6.894395670890808, + "loss": 0.3162, + "step": 79000 + }, + { + "base_loss": 0.31795056411623956, + "epoch": 1.0324249267578125, + "grad_norm": 0.0009829605696722865, + "learning_rate": 4.2418384552001956e-05, + "lookahead_loss": 6.821612464904785, + "loss": 0.3355, + "step": 79500 + }, + { + "base_loss": 0.30795893451571466, + "epoch": 1.0333786010742188, + "grad_norm": 0.0009851903887465596, + "learning_rate": 4.237070083618164e-05, + "lookahead_loss": 6.886163942337036, + "loss": 0.3185, + "step": 80000 + }, + { + "epoch": 1.0333786010742188, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.84953888384298, + "eval_lookahead_perplexity": 943.4457682675941, + "eval_loss": 0.14281712472438812, + "eval_perplexity": 1.1535188323016774, + "eval_runtime": 489.9432, + "eval_samples_per_second": 10.205, + "eval_steps_per_second": 0.32, + "step": 80000 + }, + { + "base_loss": 0.3031257001161575, + "epoch": 1.034332275390625, + "grad_norm": 0.0009114276035688818, + "learning_rate": 4.232301712036133e-05, + "lookahead_loss": 6.9387643995285035, + "loss": 0.3159, + "step": 80500 + }, + { + "base_loss": 0.31068781118094924, + "epoch": 1.0352859497070312, + "grad_norm": 0.0010243533179163933, + "learning_rate": 4.227533340454102e-05, + "lookahead_loss": 6.779029149055481, + "loss": 0.3235, + "step": 81000 + }, + { + "base_loss": 0.32500979214906695, + "epoch": 1.0362396240234375, + "grad_norm": 0.0009815491503104568, + "learning_rate": 4.22276496887207e-05, + "lookahead_loss": 6.89651736831665, + "loss": 0.3377, + "step": 81500 + }, + { + "base_loss": 0.3069631262719631, + "epoch": 1.0371932983398438, + "grad_norm": 0.0010114209726452827, + "learning_rate": 4.2179965972900393e-05, + "lookahead_loss": 6.8101696758270265, + "loss": 0.3181, + "step": 82000 + }, + { + "base_loss": 0.3025422422587872, + "epoch": 1.03814697265625, + "grad_norm": 0.0009560140897519886, + "learning_rate": 4.213228225708008e-05, + "lookahead_loss": 6.864963472366333, + "loss": 0.314, + "step": 82500 + }, + { + "base_loss": 0.3076345331072807, + "epoch": 1.0391006469726562, + "grad_norm": 0.000972763926256448, + "learning_rate": 4.208459854125977e-05, + "lookahead_loss": 6.800820850372315, + "loss": 0.3196, + "step": 83000 + }, + { + "base_loss": 0.3235399980545044, + "epoch": 1.0400543212890625, + "grad_norm": 0.0009581708000041544, + "learning_rate": 4.203691482543946e-05, + "lookahead_loss": 6.839151841163635, + "loss": 0.3342, + "step": 83500 + }, + { + "base_loss": 0.30506757298111914, + "epoch": 1.0410079956054688, + "grad_norm": 0.0009835829259827733, + "learning_rate": 4.198923110961914e-05, + "lookahead_loss": 6.788027523994446, + "loss": 0.3148, + "step": 84000 + }, + { + "base_loss": 0.29668092691898346, + "epoch": 1.041961669921875, + "grad_norm": 0.0009812023490667343, + "learning_rate": 4.194154739379883e-05, + "lookahead_loss": 6.869966278076172, + "loss": 0.3089, + "step": 84500 + }, + { + "base_loss": 0.30789948108792303, + "epoch": 1.0429153442382812, + "grad_norm": 0.0009604549850337207, + "learning_rate": 4.1893863677978514e-05, + "lookahead_loss": 6.8703847560882565, + "loss": 0.3247, + "step": 85000 + }, + { + "epoch": 1.0429153442382812, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.825141870175687, + "eval_lookahead_perplexity": 920.7070156519827, + "eval_loss": 0.14276856184005737, + "eval_perplexity": 1.1534628154602318, + "eval_runtime": 479.8866, + "eval_samples_per_second": 10.419, + "eval_steps_per_second": 0.327, + "step": 85000 + }, + { + "base_loss": 0.3281280441880226, + "epoch": 1.0438690185546875, + "grad_norm": 0.0009952600812539458, + "learning_rate": 4.1846179962158205e-05, + "lookahead_loss": 6.8946290845870974, + "loss": 0.3444, + "step": 85500 + }, + { + "base_loss": 0.2978555924296379, + "epoch": 1.0448226928710938, + "grad_norm": 0.0010316900443285704, + "learning_rate": 4.1798496246337895e-05, + "lookahead_loss": 6.817944658279419, + "loss": 0.3111, + "step": 86000 + }, + { + "base_loss": 0.3044668311774731, + "epoch": 1.0457763671875, + "grad_norm": 0.0009631580905988812, + "learning_rate": 4.175081253051758e-05, + "lookahead_loss": 6.833521637916565, + "loss": 0.318, + "step": 86500 + }, + { + "base_loss": 0.3298782432973385, + "epoch": 1.0467300415039062, + "grad_norm": 0.0009412643266841769, + "learning_rate": 4.170312881469727e-05, + "lookahead_loss": 6.806390251159668, + "loss": 0.3407, + "step": 87000 + }, + { + "base_loss": 0.32442897310853, + "epoch": 1.0476837158203125, + "grad_norm": 0.0009984897915273905, + "learning_rate": 4.165544509887695e-05, + "lookahead_loss": 6.830627080917359, + "loss": 0.3392, + "step": 87500 + }, + { + "base_loss": 0.2941350122392178, + "epoch": 1.0486373901367188, + "grad_norm": 0.0009231239673681557, + "learning_rate": 4.160776138305664e-05, + "lookahead_loss": 6.789681484222412, + "loss": 0.3084, + "step": 88000 + }, + { + "base_loss": 0.301623804807663, + "epoch": 1.049591064453125, + "grad_norm": 0.000988572952337563, + "learning_rate": 4.156007766723633e-05, + "lookahead_loss": 6.77013840007782, + "loss": 0.3152, + "step": 88500 + }, + { + "base_loss": 0.31965578559041025, + "epoch": 1.0505447387695312, + "grad_norm": 0.0009195742895826697, + "learning_rate": 4.1512393951416016e-05, + "lookahead_loss": 6.868552158355713, + "loss": 0.3327, + "step": 89000 + }, + { + "base_loss": 0.30511142282187936, + "epoch": 1.0514984130859375, + "grad_norm": 0.000990525702945888, + "learning_rate": 4.1464710235595706e-05, + "lookahead_loss": 6.824063732147216, + "loss": 0.3195, + "step": 89500 + }, + { + "base_loss": 0.3033564644157887, + "epoch": 1.0524520874023438, + "grad_norm": 0.0010395031422376633, + "learning_rate": 4.141702651977539e-05, + "lookahead_loss": 6.759691466331482, + "loss": 0.3166, + "step": 90000 + }, + { + "epoch": 1.0524520874023438, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.803260954043355, + "eval_lookahead_perplexity": 900.7799093735076, + "eval_loss": 0.14272409677505493, + "eval_perplexity": 1.1534115278014274, + "eval_runtime": 518.4496, + "eval_samples_per_second": 9.644, + "eval_steps_per_second": 0.303, + "step": 90000 + }, + { + "base_loss": 0.32089028322696683, + "epoch": 1.05340576171875, + "grad_norm": 0.0009381326381117105, + "learning_rate": 4.136934280395508e-05, + "lookahead_loss": 6.765647802352905, + "loss": 0.3307, + "step": 90500 + }, + { + "base_loss": 0.35406574749946595, + "epoch": 1.0543594360351562, + "grad_norm": 0.0009708745637908578, + "learning_rate": 4.132165908813477e-05, + "lookahead_loss": 6.8132513179779055, + "loss": 0.3699, + "step": 91000 + }, + { + "base_loss": 0.2938829956352711, + "epoch": 1.0553131103515625, + "grad_norm": 0.0009931994136422873, + "learning_rate": 4.127397537231445e-05, + "lookahead_loss": 6.817016356945038, + "loss": 0.3071, + "step": 91500 + }, + { + "base_loss": 0.30498689064383505, + "epoch": 1.0562667846679688, + "grad_norm": 0.0009295056224800646, + "learning_rate": 4.1226291656494143e-05, + "lookahead_loss": 6.81505980014801, + "loss": 0.3181, + "step": 92000 + }, + { + "base_loss": 0.317481600522995, + "epoch": 1.057220458984375, + "grad_norm": 0.0009703385876491666, + "learning_rate": 4.117860794067383e-05, + "lookahead_loss": 6.835084310531617, + "loss": 0.3317, + "step": 92500 + }, + { + "base_loss": 0.3179551683664322, + "epoch": 1.0581741333007812, + "grad_norm": 0.0009712363826110959, + "learning_rate": 4.113092422485352e-05, + "lookahead_loss": 6.822910179138184, + "loss": 0.3299, + "step": 93000 + }, + { + "base_loss": 0.29271650505065916, + "epoch": 1.0591278076171875, + "grad_norm": 0.000948713393881917, + "learning_rate": 4.108324050903321e-05, + "lookahead_loss": 6.735391735076904, + "loss": 0.3073, + "step": 93500 + }, + { + "base_loss": 0.3039356949329376, + "epoch": 1.0600814819335938, + "grad_norm": 0.0009828072506934404, + "learning_rate": 4.103555679321289e-05, + "lookahead_loss": 6.790757938861847, + "loss": 0.3189, + "step": 94000 + }, + { + "base_loss": 0.32165152502059935, + "epoch": 1.06103515625, + "grad_norm": 0.000970664550550282, + "learning_rate": 4.098787307739258e-05, + "lookahead_loss": 6.76230890083313, + "loss": 0.3332, + "step": 94500 + }, + { + "base_loss": 0.3061283130943775, + "epoch": 1.0619888305664062, + "grad_norm": 0.0010395641438663006, + "learning_rate": 4.0940189361572264e-05, + "lookahead_loss": 6.777449913024903, + "loss": 0.3171, + "step": 95000 + }, + { + "epoch": 1.0619888305664062, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.780221919282176, + "eval_lookahead_perplexity": 880.2640499998114, + "eval_loss": 0.14268015325069427, + "eval_perplexity": 1.1533608439474792, + "eval_runtime": 486.317, + "eval_samples_per_second": 10.281, + "eval_steps_per_second": 0.323, + "step": 95000 + }, + { + "base_loss": 0.30640321379899976, + "epoch": 1.0629425048828125, + "grad_norm": 0.0009915264090523124, + "learning_rate": 4.0892505645751955e-05, + "lookahead_loss": 6.743420834541321, + "loss": 0.3177, + "step": 95500 + }, + { + "base_loss": 0.31782649287581444, + "epoch": 1.0638961791992188, + "grad_norm": 0.0009494387777522206, + "learning_rate": 4.0844821929931645e-05, + "lookahead_loss": 6.79113444519043, + "loss": 0.3309, + "step": 96000 + }, + { + "base_loss": 0.30349985790252687, + "epoch": 1.064849853515625, + "grad_norm": 0.0009761872352100909, + "learning_rate": 4.079713821411133e-05, + "lookahead_loss": 6.813262487411499, + "loss": 0.3191, + "step": 96500 + }, + { + "base_loss": 0.3075440634191036, + "epoch": 1.0658035278320312, + "grad_norm": 0.0009754234342835844, + "learning_rate": 4.074945449829102e-05, + "lookahead_loss": 6.742620223045349, + "loss": 0.321, + "step": 97000 + }, + { + "base_loss": 0.3064390652179718, + "epoch": 1.0667572021484375, + "grad_norm": 0.0009838842088356614, + "learning_rate": 4.07017707824707e-05, + "lookahead_loss": 6.700049202919006, + "loss": 0.317, + "step": 97500 + }, + { + "base_loss": 0.3303199237883091, + "epoch": 1.0677108764648438, + "grad_norm": 0.0010200405959039927, + "learning_rate": 4.065408706665039e-05, + "lookahead_loss": 6.784039269447327, + "loss": 0.3412, + "step": 98000 + }, + { + "base_loss": 0.2994670196175575, + "epoch": 1.06866455078125, + "grad_norm": 0.0010121484519913793, + "learning_rate": 4.060640335083008e-05, + "lookahead_loss": 6.775819786071778, + "loss": 0.3108, + "step": 98500 + }, + { + "base_loss": 0.3000359579175711, + "epoch": 1.0696182250976562, + "grad_norm": 0.0009712814935483038, + "learning_rate": 4.0558719635009766e-05, + "lookahead_loss": 6.7902104940414425, + "loss": 0.3161, + "step": 99000 + }, + { + "base_loss": 0.34639680609107015, + "epoch": 1.0705718994140625, + "grad_norm": 0.0009614603477530181, + "learning_rate": 4.0511035919189456e-05, + "lookahead_loss": 6.673412177562714, + "loss": 0.3593, + "step": 99500 + }, + { + "base_loss": 0.3132462115287781, + "epoch": 1.0715255737304688, + "grad_norm": 0.0009738055523484945, + "learning_rate": 4.046335220336914e-05, + "lookahead_loss": 6.736346269607544, + "loss": 0.3247, + "step": 100000 + }, + { + "epoch": 1.0715255737304688, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.758892397530163, + "eval_lookahead_perplexity": 861.6872601047816, + "eval_loss": 0.14263826608657837, + "eval_perplexity": 1.1533125339443155, + "eval_runtime": 499.825, + "eval_samples_per_second": 10.004, + "eval_steps_per_second": 0.314, + "step": 100000 + }, + { + "base_loss": 0.3044133240580559, + "epoch": 1.0009536743164062, + "grad_norm": 0.000981901423074305, + "learning_rate": 4.041566848754883e-05, + "lookahead_loss": 6.821595732688904, + "loss": 0.3138, + "step": 100500 + }, + { + "base_loss": 0.30059696701169014, + "epoch": 1.0019073486328125, + "grad_norm": 0.000999079318717122, + "learning_rate": 4.036798477172852e-05, + "lookahead_loss": 6.66421698474884, + "loss": 0.3137, + "step": 101000 + }, + { + "base_loss": 0.31169990518689156, + "epoch": 1.0028610229492188, + "grad_norm": 0.0009957862785086036, + "learning_rate": 4.03203010559082e-05, + "lookahead_loss": 6.655234758377075, + "loss": 0.3215, + "step": 101500 + }, + { + "base_loss": 0.3227726019620895, + "epoch": 1.003814697265625, + "grad_norm": 0.0009710168233141303, + "learning_rate": 4.0272617340087893e-05, + "lookahead_loss": 6.6804737997055055, + "loss": 0.3354, + "step": 102000 + }, + { + "base_loss": 0.3022470915019512, + "epoch": 1.0047683715820312, + "grad_norm": 0.000950310961343348, + "learning_rate": 4.022493362426758e-05, + "lookahead_loss": 6.665619974136352, + "loss": 0.3172, + "step": 102500 + }, + { + "base_loss": 0.30552061820030213, + "epoch": 1.0057220458984375, + "grad_norm": 0.0008522234857082367, + "learning_rate": 4.017724990844727e-05, + "lookahead_loss": 6.790848443984985, + "loss": 0.314, + "step": 103000 + }, + { + "base_loss": 0.2953472335338593, + "epoch": 1.0066757202148438, + "grad_norm": 0.0009317957446910441, + "learning_rate": 4.012956619262696e-05, + "lookahead_loss": 6.637859883308411, + "loss": 0.3144, + "step": 103500 + }, + { + "base_loss": 0.312746944963932, + "epoch": 1.00762939453125, + "grad_norm": 0.0009721561800688505, + "learning_rate": 4.008188247680664e-05, + "lookahead_loss": 6.700247512817382, + "loss": 0.3242, + "step": 104000 + }, + { + "base_loss": 0.3169711889922619, + "epoch": 1.0085830688476562, + "grad_norm": 0.0009455936960875988, + "learning_rate": 4.003419876098633e-05, + "lookahead_loss": 6.695308849334717, + "loss": 0.3221, + "step": 104500 + }, + { + "base_loss": 0.306710629016161, + "epoch": 1.0095367431640625, + "grad_norm": 0.0009776534279808402, + "learning_rate": 3.9986515045166014e-05, + "lookahead_loss": 6.697743677139282, + "loss": 0.3199, + "step": 105000 + }, + { + "epoch": 1.0095367431640625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.739253997802734, + "eval_lookahead_perplexity": 844.9301809110984, + "eval_loss": 0.14259953796863556, + "eval_perplexity": 1.1532678691853726, + "eval_runtime": 479.8, + "eval_samples_per_second": 10.421, + "eval_steps_per_second": 0.327, + "step": 105000 + }, + { + "base_loss": 0.30083237382769584, + "epoch": 1.0104904174804688, + "grad_norm": 0.0009771056938916445, + "learning_rate": 3.9938831329345705e-05, + "lookahead_loss": 6.666155261039734, + "loss": 0.3113, + "step": 105500 + }, + { + "base_loss": 0.2993237827420235, + "epoch": 1.011444091796875, + "grad_norm": 0.0010173760820180178, + "learning_rate": 3.9891147613525395e-05, + "lookahead_loss": 6.6820623445510865, + "loss": 0.3139, + "step": 106000 + }, + { + "base_loss": 0.3238567093908787, + "epoch": 1.0123977661132812, + "grad_norm": 0.000880017876625061, + "learning_rate": 3.984346389770508e-05, + "lookahead_loss": 6.659082005500793, + "loss": 0.3342, + "step": 106500 + }, + { + "base_loss": 0.3051931007504463, + "epoch": 1.0133514404296875, + "grad_norm": 0.0009482282912358642, + "learning_rate": 3.979578018188477e-05, + "lookahead_loss": 6.76999457359314, + "loss": 0.3202, + "step": 107000 + }, + { + "base_loss": 0.29808008483052256, + "epoch": 1.0143051147460938, + "grad_norm": 0.0009459082502871752, + "learning_rate": 3.974809646606445e-05, + "lookahead_loss": 6.709939098358154, + "loss": 0.3136, + "step": 107500 + }, + { + "base_loss": 0.29345863962173463, + "epoch": 1.0152587890625, + "grad_norm": 0.0009026661282405257, + "learning_rate": 3.970041275024414e-05, + "lookahead_loss": 6.641336709499359, + "loss": 0.3059, + "step": 108000 + }, + { + "base_loss": 0.3092884007692337, + "epoch": 1.0162124633789062, + "grad_norm": 0.0009934415575116873, + "learning_rate": 3.965272903442383e-05, + "lookahead_loss": 6.710262378692627, + "loss": 0.3215, + "step": 108500 + }, + { + "base_loss": 0.31143338218331335, + "epoch": 1.0171661376953125, + "grad_norm": 0.0009016263647936285, + "learning_rate": 3.9605045318603516e-05, + "lookahead_loss": 6.74980753993988, + "loss": 0.3234, + "step": 109000 + }, + { + "base_loss": 0.3001442384421825, + "epoch": 1.0181198120117188, + "grad_norm": 0.0009415132808499038, + "learning_rate": 3.9557361602783206e-05, + "lookahead_loss": 6.753072287559509, + "loss": 0.3119, + "step": 109500 + }, + { + "base_loss": 0.2986592257618904, + "epoch": 1.019073486328125, + "grad_norm": 0.0009482503519393504, + "learning_rate": 3.950967788696289e-05, + "lookahead_loss": 6.771802840709686, + "loss": 0.3116, + "step": 110000 + }, + { + "epoch": 1.019073486328125, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.720215821799378, + "eval_lookahead_perplexity": 828.9964076723086, + "eval_loss": 0.1425611525774002, + "eval_perplexity": 1.1532236013966384, + "eval_runtime": 486.3292, + "eval_samples_per_second": 10.281, + "eval_steps_per_second": 0.323, + "step": 110000 + }, + { + "base_loss": 0.30347599306702616, + "epoch": 1.0200271606445312, + "grad_norm": 0.0010480897035449743, + "learning_rate": 3.946199417114258e-05, + "lookahead_loss": 6.608225521087647, + "loss": 0.3157, + "step": 110500 + }, + { + "base_loss": 0.3299741225540638, + "epoch": 1.0209808349609375, + "grad_norm": 0.0009382431744597852, + "learning_rate": 3.941431045532227e-05, + "lookahead_loss": 6.704700765609741, + "loss": 0.341, + "step": 111000 + }, + { + "base_loss": 0.3070560489296913, + "epoch": 1.0219345092773438, + "grad_norm": 0.0009868369670584798, + "learning_rate": 3.936662673950195e-05, + "lookahead_loss": 6.654251655578613, + "loss": 0.3169, + "step": 111500 + }, + { + "base_loss": 0.301061170309782, + "epoch": 1.02288818359375, + "grad_norm": 0.0010104605462402105, + "learning_rate": 3.9318943023681643e-05, + "lookahead_loss": 6.692898473739624, + "loss": 0.3139, + "step": 112000 + }, + { + "base_loss": 0.30337609922885894, + "epoch": 1.0238418579101562, + "grad_norm": 0.0009765701834112406, + "learning_rate": 3.927125930786133e-05, + "lookahead_loss": 6.65063930606842, + "loss": 0.315, + "step": 112500 + }, + { + "base_loss": 0.3241444931924343, + "epoch": 1.0247955322265625, + "grad_norm": 0.0009068374638445675, + "learning_rate": 3.922357559204102e-05, + "lookahead_loss": 6.651956144332885, + "loss": 0.3367, + "step": 113000 + }, + { + "base_loss": 0.3070600248277187, + "epoch": 1.0257492065429688, + "grad_norm": 0.0009709529695101082, + "learning_rate": 3.917589187622071e-05, + "lookahead_loss": 6.608614470481872, + "loss": 0.3218, + "step": 113500 + }, + { + "base_loss": 0.3022406686246395, + "epoch": 1.026702880859375, + "grad_norm": 0.0010030195116996765, + "learning_rate": 3.912820816040039e-05, + "lookahead_loss": 6.62807030582428, + "loss": 0.3116, + "step": 114000 + }, + { + "base_loss": 0.30680677881836893, + "epoch": 1.0276565551757812, + "grad_norm": 0.0009639645577408373, + "learning_rate": 3.908052444458008e-05, + "lookahead_loss": 6.749866914749146, + "loss": 0.3195, + "step": 114500 + }, + { + "base_loss": 0.33426042160391806, + "epoch": 1.0286102294921875, + "grad_norm": 0.00096013059373945, + "learning_rate": 3.9032840728759764e-05, + "lookahead_loss": 6.75391247177124, + "loss": 0.3431, + "step": 115000 + }, + { + "epoch": 1.0286102294921875, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.702820940901296, + "eval_lookahead_perplexity": 814.7008094738575, + "eval_loss": 0.14252659678459167, + "eval_perplexity": 1.1531837515293326, + "eval_runtime": 479.4017, + "eval_samples_per_second": 10.43, + "eval_steps_per_second": 0.327, + "step": 115000 + }, + { + "base_loss": 0.3049518305659294, + "epoch": 1.0295639038085938, + "grad_norm": 0.0009578875033184886, + "learning_rate": 3.8985157012939455e-05, + "lookahead_loss": 6.715790456771851, + "loss": 0.3138, + "step": 115500 + }, + { + "base_loss": 0.3062360401749611, + "epoch": 1.030517578125, + "grad_norm": 0.000939805235248059, + "learning_rate": 3.8937473297119145e-05, + "lookahead_loss": 6.674023857593537, + "loss": 0.3173, + "step": 116000 + }, + { + "base_loss": 0.30225355681777, + "epoch": 1.0314712524414062, + "grad_norm": 0.0009627947001717985, + "learning_rate": 3.888978958129883e-05, + "lookahead_loss": 6.712348463058472, + "loss": 0.3144, + "step": 116500 + }, + { + "base_loss": 0.3184074863195419, + "epoch": 1.0324249267578125, + "grad_norm": 0.0009747587610036135, + "learning_rate": 3.884210586547852e-05, + "lookahead_loss": 6.652136072158814, + "loss": 0.3359, + "step": 117000 + }, + { + "base_loss": 0.30629492220282556, + "epoch": 1.0333786010742188, + "grad_norm": 0.0010025979718193412, + "learning_rate": 3.87944221496582e-05, + "lookahead_loss": 6.700876714706421, + "loss": 0.3163, + "step": 117500 + }, + { + "base_loss": 0.3031555346250534, + "epoch": 1.034332275390625, + "grad_norm": 0.0009006695472635329, + "learning_rate": 3.874673843383789e-05, + "lookahead_loss": 6.757840476036072, + "loss": 0.3149, + "step": 118000 + }, + { + "base_loss": 0.31164542263746264, + "epoch": 1.0352859497070312, + "grad_norm": 0.0010140526574105024, + "learning_rate": 3.869905471801758e-05, + "lookahead_loss": 6.589431819915771, + "loss": 0.3232, + "step": 118500 + }, + { + "base_loss": 0.324304408878088, + "epoch": 1.0362396240234375, + "grad_norm": 0.0009850772330537438, + "learning_rate": 3.8651371002197266e-05, + "lookahead_loss": 6.707897541999817, + "loss": 0.3363, + "step": 119000 + }, + { + "base_loss": 0.30813179594278334, + "epoch": 1.0371932983398438, + "grad_norm": 0.0009913926478475332, + "learning_rate": 3.8603687286376956e-05, + "lookahead_loss": 6.640646786689758, + "loss": 0.3205, + "step": 119500 + }, + { + "base_loss": 0.30138176554441454, + "epoch": 1.03814697265625, + "grad_norm": 0.0009480128646828234, + "learning_rate": 3.855600357055664e-05, + "lookahead_loss": 6.679037447929383, + "loss": 0.3145, + "step": 120000 + }, + { + "epoch": 1.03814697265625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.686355599960961, + "eval_lookahead_perplexity": 801.3963149780681, + "eval_loss": 0.1424950510263443, + "eval_perplexity": 1.1531473740472726, + "eval_runtime": 489.1492, + "eval_samples_per_second": 10.222, + "eval_steps_per_second": 0.321, + "step": 120000 + }, + { + "base_loss": 0.30871694785356524, + "epoch": 1.0391006469726562, + "grad_norm": 0.0009410271886736155, + "learning_rate": 3.850831985473633e-05, + "lookahead_loss": 6.625640468597412, + "loss": 0.3211, + "step": 120500 + }, + { + "base_loss": 0.32506244936585427, + "epoch": 1.0400543212890625, + "grad_norm": 0.0009350285981781781, + "learning_rate": 3.846063613891602e-05, + "lookahead_loss": 6.6512613153457645, + "loss": 0.3345, + "step": 121000 + }, + { + "base_loss": 0.30769926142692566, + "epoch": 1.0410079956054688, + "grad_norm": 0.000978046446107328, + "learning_rate": 3.84129524230957e-05, + "lookahead_loss": 6.6048227882385255, + "loss": 0.3173, + "step": 121500 + }, + { + "base_loss": 0.29858891409635546, + "epoch": 1.041961669921875, + "grad_norm": 0.000979132833890617, + "learning_rate": 3.8365268707275393e-05, + "lookahead_loss": 6.689519411087036, + "loss": 0.3105, + "step": 122000 + }, + { + "base_loss": 0.3094627737402916, + "epoch": 1.0429153442382812, + "grad_norm": 0.0009663606178946793, + "learning_rate": 3.831758499145508e-05, + "lookahead_loss": 6.7044447908401485, + "loss": 0.3251, + "step": 122500 + }, + { + "base_loss": 0.32764697542786597, + "epoch": 1.0438690185546875, + "grad_norm": 0.0009794370271265507, + "learning_rate": 3.826990127563477e-05, + "lookahead_loss": 6.723638868331909, + "loss": 0.3413, + "step": 123000 + }, + { + "base_loss": 0.29553532418608663, + "epoch": 1.0448226928710938, + "grad_norm": 0.0009821865241974592, + "learning_rate": 3.822221755981446e-05, + "lookahead_loss": 6.640208226203918, + "loss": 0.3095, + "step": 123500 + }, + { + "base_loss": 0.3041268612146378, + "epoch": 1.0457763671875, + "grad_norm": 0.0009438347187824547, + "learning_rate": 3.817453384399414e-05, + "lookahead_loss": 6.654849781990051, + "loss": 0.3159, + "step": 124000 + }, + { + "base_loss": 0.3303914776444435, + "epoch": 1.0467300415039062, + "grad_norm": 0.0009457177948206663, + "learning_rate": 3.812685012817383e-05, + "lookahead_loss": 6.630368858814239, + "loss": 0.3403, + "step": 124500 + }, + { + "base_loss": 0.3248122656941414, + "epoch": 1.0476837158203125, + "grad_norm": 0.0010212536435574293, + "learning_rate": 3.8079166412353514e-05, + "lookahead_loss": 6.664453419685364, + "loss": 0.3396, + "step": 125000 + }, + { + "epoch": 1.0476837158203125, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.671399636009631, + "eval_lookahead_perplexity": 789.4998439268663, + "eval_loss": 0.14246411621570587, + "eval_perplexity": 1.153111702203372, + "eval_runtime": 489.8064, + "eval_samples_per_second": 10.208, + "eval_steps_per_second": 0.321, + "step": 125000 + }, + { + "base_loss": 0.2969100174307823, + "epoch": 1.0486373901367188, + "grad_norm": 0.0009448639466427267, + "learning_rate": 3.8031482696533205e-05, + "lookahead_loss": 6.613508511543274, + "loss": 0.3092, + "step": 125500 + }, + { + "base_loss": 0.302005185931921, + "epoch": 1.049591064453125, + "grad_norm": 0.0009936641436070204, + "learning_rate": 3.7983798980712895e-05, + "lookahead_loss": 6.604424912452698, + "loss": 0.3169, + "step": 126000 + }, + { + "base_loss": 0.31874441370368006, + "epoch": 1.0505447387695312, + "grad_norm": 0.0009042201563715935, + "learning_rate": 3.793611526489258e-05, + "lookahead_loss": 6.707013080596924, + "loss": 0.3317, + "step": 126500 + }, + { + "base_loss": 0.30408672893047334, + "epoch": 1.0514984130859375, + "grad_norm": 0.0009868575725704432, + "learning_rate": 3.788843154907227e-05, + "lookahead_loss": 6.650782800674438, + "loss": 0.3188, + "step": 127000 + }, + { + "base_loss": 0.3058005510568619, + "epoch": 1.0524520874023438, + "grad_norm": 0.00102641258854419, + "learning_rate": 3.784074783325195e-05, + "lookahead_loss": 6.603402623176574, + "loss": 0.3186, + "step": 127500 + }, + { + "base_loss": 0.32026463899016383, + "epoch": 1.05340576171875, + "grad_norm": 0.0009292624308727682, + "learning_rate": 3.779306411743164e-05, + "lookahead_loss": 6.604880172729493, + "loss": 0.3298, + "step": 128000 + }, + { + "base_loss": 0.35889338579773905, + "epoch": 1.0543594360351562, + "grad_norm": 0.0009620094788260758, + "learning_rate": 3.774538040161133e-05, + "lookahead_loss": 6.646626858711243, + "loss": 0.3714, + "step": 128500 + }, + { + "base_loss": 0.29546374672651293, + "epoch": 1.0553131103515625, + "grad_norm": 0.0009715965134091675, + "learning_rate": 3.7697696685791016e-05, + "lookahead_loss": 6.637260946273804, + "loss": 0.3067, + "step": 129000 + }, + { + "base_loss": 0.3063408683240414, + "epoch": 1.0562667846679688, + "grad_norm": 0.0009361687116324902, + "learning_rate": 3.7650012969970706e-05, + "lookahead_loss": 6.649524963378906, + "loss": 0.3181, + "step": 129500 + }, + { + "base_loss": 0.3186078954935074, + "epoch": 1.057220458984375, + "grad_norm": 0.0009357648668810725, + "learning_rate": 3.760232925415039e-05, + "lookahead_loss": 6.6754186210632325, + "loss": 0.332, + "step": 130000 + }, + { + "epoch": 1.057220458984375, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.6561929562602185, + "eval_lookahead_perplexity": 777.5849948862144, + "eval_loss": 0.14243432879447937, + "eval_perplexity": 1.1530773544909447, + "eval_runtime": 474.4309, + "eval_samples_per_second": 10.539, + "eval_steps_per_second": 0.331, + "step": 130000 + }, + { + "base_loss": 0.31768571627140046, + "epoch": 1.0581741333007812, + "grad_norm": 0.0009750055032782257, + "learning_rate": 3.755464553833008e-05, + "lookahead_loss": 6.655493871688843, + "loss": 0.3273, + "step": 130500 + }, + { + "base_loss": 0.2922684009075165, + "epoch": 1.0591278076171875, + "grad_norm": 0.0009410986676812172, + "learning_rate": 3.750696182250977e-05, + "lookahead_loss": 6.57894612789154, + "loss": 0.3084, + "step": 131000 + }, + { + "base_loss": 0.30112267237901685, + "epoch": 1.0600814819335938, + "grad_norm": 0.0009611063869670033, + "learning_rate": 3.745927810668945e-05, + "lookahead_loss": 6.6305308623313906, + "loss": 0.3154, + "step": 131500 + }, + { + "base_loss": 0.32029621145129206, + "epoch": 1.06103515625, + "grad_norm": 0.0009844622109085321, + "learning_rate": 3.7411594390869143e-05, + "lookahead_loss": 6.5994256973266605, + "loss": 0.3317, + "step": 132000 + }, + { + "base_loss": 0.30533574494719506, + "epoch": 1.0619888305664062, + "grad_norm": 0.0010215704096481204, + "learning_rate": 3.736391067504883e-05, + "lookahead_loss": 6.603619037628174, + "loss": 0.3156, + "step": 132500 + }, + { + "base_loss": 0.30571810373663905, + "epoch": 1.0629425048828125, + "grad_norm": 0.0010208436287939548, + "learning_rate": 3.731622695922852e-05, + "lookahead_loss": 6.59216494178772, + "loss": 0.3172, + "step": 133000 + }, + { + "base_loss": 0.31451627737283705, + "epoch": 1.0638961791992188, + "grad_norm": 0.0009539647144265473, + "learning_rate": 3.726854324340821e-05, + "lookahead_loss": 6.634691061019898, + "loss": 0.3285, + "step": 133500 + }, + { + "base_loss": 0.30425655883550645, + "epoch": 1.064849853515625, + "grad_norm": 0.0009696083143353462, + "learning_rate": 3.722085952758789e-05, + "lookahead_loss": 6.656025648117065, + "loss": 0.3179, + "step": 134000 + }, + { + "base_loss": 0.31105126801133154, + "epoch": 1.0658035278320312, + "grad_norm": 0.0009692656458355486, + "learning_rate": 3.717317581176758e-05, + "lookahead_loss": 6.560485363006592, + "loss": 0.3215, + "step": 134500 + }, + { + "base_loss": 0.3071163959801197, + "epoch": 1.0667572021484375, + "grad_norm": 0.0009831758216023445, + "learning_rate": 3.7125492095947264e-05, + "lookahead_loss": 6.5449725456237795, + "loss": 0.3169, + "step": 135000 + }, + { + "epoch": 1.0667572021484375, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.641093528308807, + "eval_lookahead_perplexity": 765.9321036725115, + "eval_loss": 0.14240561425685883, + "eval_perplexity": 1.1530442448832356, + "eval_runtime": 486.1958, + "eval_samples_per_second": 10.284, + "eval_steps_per_second": 0.323, + "step": 135000 + }, + { + "base_loss": 0.3321034919023514, + "epoch": 1.0677108764648438, + "grad_norm": 0.0010069276904687285, + "learning_rate": 3.7077808380126955e-05, + "lookahead_loss": 6.633243779182434, + "loss": 0.3433, + "step": 135500 + }, + { + "base_loss": 0.3017843673825264, + "epoch": 1.06866455078125, + "grad_norm": 0.0009863151935860515, + "learning_rate": 3.7030124664306645e-05, + "lookahead_loss": 6.624729963302612, + "loss": 0.3108, + "step": 136000 + }, + { + "base_loss": 0.302195555627346, + "epoch": 1.0696182250976562, + "grad_norm": 0.0009720239322632551, + "learning_rate": 3.698244094848633e-05, + "lookahead_loss": 6.641478686332703, + "loss": 0.3168, + "step": 136500 + }, + { + "base_loss": 0.3459869565963745, + "epoch": 1.0705718994140625, + "grad_norm": 0.0009440227877348661, + "learning_rate": 3.693475723266602e-05, + "lookahead_loss": 6.523862397193908, + "loss": 0.3605, + "step": 137000 + }, + { + "base_loss": 0.3151495299339294, + "epoch": 1.0715255737304688, + "grad_norm": 0.0009616228053346276, + "learning_rate": 3.68870735168457e-05, + "lookahead_loss": 6.586525348186493, + "loss": 0.3265, + "step": 137500 + }, + { + "base_loss": 0.30790447345376015, + "epoch": 1.072479248046875, + "grad_norm": 0.001022504991851747, + "learning_rate": 3.683938980102539e-05, + "lookahead_loss": 6.648889023780823, + "loss": 0.3207, + "step": 138000 + }, + { + "base_loss": 0.30545566940307617, + "epoch": 1.0734329223632812, + "grad_norm": 0.0009266745182685554, + "learning_rate": 3.679170608520508e-05, + "lookahead_loss": 6.604302444458008, + "loss": 0.3189, + "step": 138500 + }, + { + "base_loss": 0.32841417971253395, + "epoch": 1.0743865966796875, + "grad_norm": 0.0009727867436595261, + "learning_rate": 3.6744022369384766e-05, + "lookahead_loss": 6.628902969360351, + "loss": 0.3409, + "step": 139000 + }, + { + "base_loss": 0.30363579127192497, + "epoch": 1.0753402709960938, + "grad_norm": 0.0009738055523484945, + "learning_rate": 3.6696338653564456e-05, + "lookahead_loss": 6.6886735420227055, + "loss": 0.3163, + "step": 139500 + }, + { + "base_loss": 0.30504586565494535, + "epoch": 1.0762939453125, + "grad_norm": 0.0009336514631286263, + "learning_rate": 3.664865493774414e-05, + "lookahead_loss": 6.637202547073365, + "loss": 0.3183, + "step": 140000 + }, + { + "epoch": 1.0762939453125, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.625944952614391, + "eval_lookahead_perplexity": 754.4167639313139, + "eval_loss": 0.14237651228904724, + "eval_perplexity": 1.1530106895150016, + "eval_runtime": 498.2724, + "eval_samples_per_second": 10.035, + "eval_steps_per_second": 0.315, + "step": 140000 + }, + { + "base_loss": 0.33029799509048463, + "epoch": 1.0772476196289062, + "grad_norm": 0.0009831954957917333, + "learning_rate": 3.660097122192383e-05, + "lookahead_loss": 6.633178756713868, + "loss": 0.3456, + "step": 140500 + }, + { + "base_loss": 0.3037954642176628, + "epoch": 1.0782012939453125, + "grad_norm": 0.00096993736224249, + "learning_rate": 3.655328750610352e-05, + "lookahead_loss": 6.641198945999146, + "loss": 0.317, + "step": 141000 + }, + { + "base_loss": 0.29821320512890814, + "epoch": 1.0791549682617188, + "grad_norm": 0.0009353780187666416, + "learning_rate": 3.65056037902832e-05, + "lookahead_loss": 6.639979603767395, + "loss": 0.3132, + "step": 141500 + }, + { + "base_loss": 0.3142137563228607, + "epoch": 1.080108642578125, + "grad_norm": 0.0009523274493403733, + "learning_rate": 3.6457920074462893e-05, + "lookahead_loss": 6.602743772506714, + "loss": 0.3306, + "step": 142000 + }, + { + "base_loss": 0.3222310249209404, + "epoch": 1.0810623168945312, + "grad_norm": 0.0009943461045622826, + "learning_rate": 3.641023635864258e-05, + "lookahead_loss": 6.649980679988861, + "loss": 0.3395, + "step": 142500 + }, + { + "base_loss": 0.3002626436650753, + "epoch": 1.0820159912109375, + "grad_norm": 0.0009161168127320707, + "learning_rate": 3.636255264282227e-05, + "lookahead_loss": 6.636230380058288, + "loss": 0.3127, + "step": 143000 + }, + { + "base_loss": 0.3045452245473862, + "epoch": 1.0829696655273438, + "grad_norm": 0.0009895727271214128, + "learning_rate": 3.631486892700196e-05, + "lookahead_loss": 6.670627347946167, + "loss": 0.3188, + "step": 143500 + }, + { + "base_loss": 0.33469617655873296, + "epoch": 1.08392333984375, + "grad_norm": 0.0009402433061040938, + "learning_rate": 3.626718521118164e-05, + "lookahead_loss": 6.6782106046676635, + "loss": 0.346, + "step": 144000 + }, + { + "base_loss": 0.30740025800466536, + "epoch": 1.0848770141601562, + "grad_norm": 0.0009543623309582472, + "learning_rate": 3.621950149536133e-05, + "lookahead_loss": 6.598346343994141, + "loss": 0.3193, + "step": 144500 + }, + { + "base_loss": 0.300477741509676, + "epoch": 1.0858306884765625, + "grad_norm": 0.001002481789328158, + "learning_rate": 3.6171817779541014e-05, + "lookahead_loss": 6.61094771194458, + "loss": 0.3102, + "step": 145000 + }, + { + "epoch": 1.0858306884765625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.611703176467945, + "eval_lookahead_perplexity": 743.7486758347069, + "eval_loss": 0.14234933257102966, + "eval_perplexity": 1.1529793514354714, + "eval_runtime": 479.6333, + "eval_samples_per_second": 10.425, + "eval_steps_per_second": 0.327, + "step": 145000 + }, + { + "base_loss": 0.301901563256979, + "epoch": 1.0867843627929688, + "grad_norm": 0.0011312811402603984, + "learning_rate": 3.6124134063720705e-05, + "lookahead_loss": 6.581235520362854, + "loss": 0.3132, + "step": 145500 + }, + { + "base_loss": 0.338142231285572, + "epoch": 1.087738037109375, + "grad_norm": 0.0008956918027251959, + "learning_rate": 3.6076450347900395e-05, + "lookahead_loss": 6.629713255405426, + "loss": 0.3454, + "step": 146000 + }, + { + "base_loss": 0.3009798979461193, + "epoch": 1.0886917114257812, + "grad_norm": 0.0009857059922069311, + "learning_rate": 3.602876663208008e-05, + "lookahead_loss": 6.607778671741485, + "loss": 0.3122, + "step": 146500 + }, + { + "base_loss": 0.3090392453968525, + "epoch": 1.0896453857421875, + "grad_norm": 0.0010041121859103441, + "learning_rate": 3.598108291625977e-05, + "lookahead_loss": 6.641937935829163, + "loss": 0.3183, + "step": 147000 + }, + { + "base_loss": 0.30036539113521576, + "epoch": 1.0905990600585938, + "grad_norm": 0.0009667676058597863, + "learning_rate": 3.593339920043945e-05, + "lookahead_loss": 6.610139918327332, + "loss": 0.3123, + "step": 147500 + }, + { + "base_loss": 0.3006012495756149, + "epoch": 1.091552734375, + "grad_norm": 0.0008970113703981042, + "learning_rate": 3.588571548461914e-05, + "lookahead_loss": 6.584732450485229, + "loss": 0.3112, + "step": 148000 + }, + { + "base_loss": 0.31875682109594344, + "epoch": 1.0925064086914062, + "grad_norm": 0.0009264895925298333, + "learning_rate": 3.583803176879883e-05, + "lookahead_loss": 6.613501731872558, + "loss": 0.3355, + "step": 148500 + }, + { + "base_loss": 0.3104289738535881, + "epoch": 1.0934600830078125, + "grad_norm": 0.0009199742926284671, + "learning_rate": 3.5790348052978516e-05, + "lookahead_loss": 6.620612399101257, + "loss": 0.3219, + "step": 149000 + }, + { + "base_loss": 0.2877590928971767, + "epoch": 1.0944137573242188, + "grad_norm": 0.0009749355376698077, + "learning_rate": 3.5742664337158206e-05, + "lookahead_loss": 6.6178521070480345, + "loss": 0.3019, + "step": 149500 + }, + { + "base_loss": 0.2935507807135582, + "epoch": 1.095367431640625, + "grad_norm": 0.0009596819872967899, + "learning_rate": 3.569498062133789e-05, + "lookahead_loss": 6.54475514793396, + "loss": 0.3075, + "step": 150000 + }, + { + "epoch": 1.095367431640625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.600208011298134, + "eval_lookahead_perplexity": 735.2481132509488, + "eval_loss": 0.14232492446899414, + "eval_perplexity": 1.1529512097412606, + "eval_runtime": 487.1683, + "eval_samples_per_second": 10.263, + "eval_steps_per_second": 0.322, + "step": 150000 + }, + { + "base_loss": 0.2986202912926674, + "epoch": 1.0963211059570312, + "grad_norm": 0.0009500982123427093, + "learning_rate": 3.564729690551758e-05, + "lookahead_loss": 6.612848388671875, + "loss": 0.3123, + "step": 150500 + }, + { + "base_loss": 0.3307524161040783, + "epoch": 1.0972747802734375, + "grad_norm": 0.0009901663288474083, + "learning_rate": 3.559961318969727e-05, + "lookahead_loss": 6.628784805297852, + "loss": 0.3418, + "step": 151000 + }, + { + "base_loss": 0.29244673988223074, + "epoch": 1.0982284545898438, + "grad_norm": 0.0009901755256578326, + "learning_rate": 3.555192947387695e-05, + "lookahead_loss": 6.573961565494537, + "loss": 0.3074, + "step": 151500 + }, + { + "base_loss": 0.295786843508482, + "epoch": 1.09918212890625, + "grad_norm": 0.0009776337537914515, + "learning_rate": 3.5504245758056643e-05, + "lookahead_loss": 6.619675145149231, + "loss": 0.3112, + "step": 152000 + }, + { + "base_loss": 0.30293611577153207, + "epoch": 1.1001358032226562, + "grad_norm": 0.0009244863176718354, + "learning_rate": 3.545656204223633e-05, + "lookahead_loss": 6.614836613655091, + "loss": 0.3152, + "step": 152500 + }, + { + "base_loss": 0.3240869597494602, + "epoch": 1.1010894775390625, + "grad_norm": 0.0009519928717054427, + "learning_rate": 3.540887832641602e-05, + "lookahead_loss": 6.675669587135315, + "loss": 0.3341, + "step": 153000 + }, + { + "base_loss": 0.30599541807174685, + "epoch": 1.1020431518554688, + "grad_norm": 0.0009593720897100866, + "learning_rate": 3.536119461059571e-05, + "lookahead_loss": 6.632304777145386, + "loss": 0.3154, + "step": 153500 + }, + { + "base_loss": 0.2991089904308319, + "epoch": 1.102996826171875, + "grad_norm": 0.0009689299622550607, + "learning_rate": 3.531351089477539e-05, + "lookahead_loss": 6.595957444190979, + "loss": 0.314, + "step": 154000 + }, + { + "base_loss": 0.30219315418601034, + "epoch": 1.1039505004882812, + "grad_norm": 0.0010047038085758686, + "learning_rate": 3.526582717895508e-05, + "lookahead_loss": 6.639606526374817, + "loss": 0.3127, + "step": 154500 + }, + { + "base_loss": 0.3133500624895096, + "epoch": 1.1049041748046875, + "grad_norm": 0.0009745966526679695, + "learning_rate": 3.5218143463134764e-05, + "lookahead_loss": 6.568594479560852, + "loss": 0.3272, + "step": 155000 + }, + { + "epoch": 1.1049041748046875, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.58809915061195, + "eval_lookahead_perplexity": 726.3987820642042, + "eval_loss": 0.142300546169281, + "eval_perplexity": 1.1529231030937126, + "eval_runtime": 477.613, + "eval_samples_per_second": 10.469, + "eval_steps_per_second": 0.329, + "step": 155000 + }, + { + "base_loss": 0.3070584389269352, + "epoch": 1.1058578491210938, + "grad_norm": 0.0009309753077104688, + "learning_rate": 3.5170459747314455e-05, + "lookahead_loss": 6.5637693691253665, + "loss": 0.3213, + "step": 155500 + }, + { + "base_loss": 0.29948241996765135, + "epoch": 1.1068115234375, + "grad_norm": 0.0009559483733028173, + "learning_rate": 3.5122776031494145e-05, + "lookahead_loss": 6.645429815769195, + "loss": 0.3098, + "step": 156000 + }, + { + "base_loss": 0.29492466670274736, + "epoch": 1.1077651977539062, + "grad_norm": 0.0009711109451018274, + "learning_rate": 3.507509231567383e-05, + "lookahead_loss": 6.56166731595993, + "loss": 0.3083, + "step": 156500 + }, + { + "base_loss": 0.3188383647501469, + "epoch": 1.1087188720703125, + "grad_norm": 0.0010144627885892987, + "learning_rate": 3.502740859985352e-05, + "lookahead_loss": 6.550499409675598, + "loss": 0.3316, + "step": 157000 + }, + { + "base_loss": 0.31659464621543887, + "epoch": 1.1096725463867188, + "grad_norm": 0.0009141165646724403, + "learning_rate": 3.49797248840332e-05, + "lookahead_loss": 6.527835812091827, + "loss": 0.3262, + "step": 157500 + }, + { + "base_loss": 0.3013280538916588, + "epoch": 1.110626220703125, + "grad_norm": 0.0008812876185402274, + "learning_rate": 3.493204116821289e-05, + "lookahead_loss": 6.478862133026123, + "loss": 0.3142, + "step": 158000 + }, + { + "base_loss": 0.29822684854269027, + "epoch": 1.1115798950195312, + "grad_norm": 0.0010021587368100882, + "learning_rate": 3.488435745239258e-05, + "lookahead_loss": 6.607266705513, + "loss": 0.3089, + "step": 158500 + }, + { + "base_loss": 0.3082665235698223, + "epoch": 1.1125335693359375, + "grad_norm": 0.0009319260716438293, + "learning_rate": 3.4836673736572266e-05, + "lookahead_loss": 6.618119819164276, + "loss": 0.3212, + "step": 159000 + }, + { + "base_loss": 0.34342152199149134, + "epoch": 1.1134872436523438, + "grad_norm": 0.0010022877249866724, + "learning_rate": 3.4788990020751956e-05, + "lookahead_loss": 6.628146926879883, + "loss": 0.3504, + "step": 159500 + }, + { + "base_loss": 0.29527223294973376, + "epoch": 1.11444091796875, + "grad_norm": 0.0009499759180471301, + "learning_rate": 3.474130630493164e-05, + "lookahead_loss": 6.5076857767105105, + "loss": 0.3088, + "step": 160000 + }, + { + "epoch": 1.11444091796875, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.576469425957043, + "eval_lookahead_perplexity": 717.9998972605395, + "eval_loss": 0.14227746427059174, + "eval_perplexity": 1.1528964917465718, + "eval_runtime": 484.9446, + "eval_samples_per_second": 10.31, + "eval_steps_per_second": 0.324, + "step": 160000 + }, + { + "base_loss": 0.2983711498081684, + "epoch": 1.1153945922851562, + "grad_norm": 0.0009712922619655728, + "learning_rate": 3.469362258911133e-05, + "lookahead_loss": 6.557691449165344, + "loss": 0.3111, + "step": 160500 + }, + { + "base_loss": 0.3164993856549263, + "epoch": 1.1163482666015625, + "grad_norm": 0.0010468108812347054, + "learning_rate": 3.464593887329102e-05, + "lookahead_loss": 6.562255940437317, + "loss": 0.3252, + "step": 161000 + }, + { + "base_loss": 0.3281388694047928, + "epoch": 1.1173019409179688, + "grad_norm": 0.0009933033725246787, + "learning_rate": 3.45982551574707e-05, + "lookahead_loss": 6.5705895509719845, + "loss": 0.3439, + "step": 161500 + }, + { + "base_loss": 0.3066762860417366, + "epoch": 1.118255615234375, + "grad_norm": 0.0010285564931109548, + "learning_rate": 3.4550571441650393e-05, + "lookahead_loss": 6.584883491516114, + "loss": 0.3162, + "step": 162000 + }, + { + "base_loss": 0.3002779276072979, + "epoch": 1.1192092895507812, + "grad_norm": 0.0009605666273273528, + "learning_rate": 3.450288772583008e-05, + "lookahead_loss": 6.601356457710266, + "loss": 0.3108, + "step": 162500 + }, + { + "base_loss": 0.3048044160306454, + "epoch": 2.0009536743164062, + "grad_norm": 0.000960490433499217, + "learning_rate": 3.445520401000977e-05, + "lookahead_loss": 6.638746548652649, + "loss": 0.3137, + "step": 163000 + }, + { + "base_loss": 0.2995053820014, + "epoch": 2.0019073486328125, + "grad_norm": 0.001005924423225224, + "learning_rate": 3.440752029418946e-05, + "lookahead_loss": 6.48337349319458, + "loss": 0.3134, + "step": 163500 + }, + { + "base_loss": 0.31198617857694627, + "epoch": 2.0028610229492188, + "grad_norm": 0.0010051662102341652, + "learning_rate": 3.435983657836914e-05, + "lookahead_loss": 6.466943081855774, + "loss": 0.3218, + "step": 164000 + }, + { + "base_loss": 0.32396442687511445, + "epoch": 2.003814697265625, + "grad_norm": 0.0009522914770059288, + "learning_rate": 3.431215286254883e-05, + "lookahead_loss": 6.503096837997436, + "loss": 0.3352, + "step": 164500 + }, + { + "base_loss": 0.3013957371413708, + "epoch": 2.0047683715820312, + "grad_norm": 0.0009518108563497663, + "learning_rate": 3.4264469146728514e-05, + "lookahead_loss": 6.490046030044556, + "loss": 0.316, + "step": 165000 + }, + { + "epoch": 2.0047683715820312, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.564821576158079, + "eval_lookahead_perplexity": 709.6852601291519, + "eval_loss": 0.14225433766841888, + "eval_perplexity": 1.1528698294763657, + "eval_runtime": 496.0385, + "eval_samples_per_second": 10.08, + "eval_steps_per_second": 0.317, + "step": 165000 + }, + { + "base_loss": 0.3039788320362568, + "epoch": 2.0057220458984375, + "grad_norm": 0.0008521187701262534, + "learning_rate": 3.4216785430908205e-05, + "lookahead_loss": 6.6104288854599, + "loss": 0.3131, + "step": 165500 + }, + { + "base_loss": 0.29717833909392355, + "epoch": 2.0066757202148438, + "grad_norm": 0.0009511762764304876, + "learning_rate": 3.4169101715087895e-05, + "lookahead_loss": 6.45364487361908, + "loss": 0.314, + "step": 166000 + }, + { + "base_loss": 0.31199148765206336, + "epoch": 2.00762939453125, + "grad_norm": 0.0009973255218937993, + "learning_rate": 3.412141799926758e-05, + "lookahead_loss": 6.517452167510986, + "loss": 0.3238, + "step": 166500 + }, + { + "base_loss": 0.3148621036410332, + "epoch": 2.0085830688476562, + "grad_norm": 0.0009120389586314559, + "learning_rate": 3.407373428344727e-05, + "lookahead_loss": 6.512342976093292, + "loss": 0.3221, + "step": 167000 + }, + { + "base_loss": 0.30580521461367605, + "epoch": 2.0095367431640625, + "grad_norm": 0.0009860595455393195, + "learning_rate": 3.402605056762695e-05, + "lookahead_loss": 6.524440247535706, + "loss": 0.3184, + "step": 167500 + }, + { + "base_loss": 0.3015244754254818, + "epoch": 2.0104904174804688, + "grad_norm": 0.000945191946811974, + "learning_rate": 3.397836685180664e-05, + "lookahead_loss": 6.494577717781067, + "loss": 0.312, + "step": 168000 + }, + { + "base_loss": 0.30137019059062004, + "epoch": 2.011444091796875, + "grad_norm": 0.0010133878095075488, + "learning_rate": 3.393068313598633e-05, + "lookahead_loss": 6.505839110851288, + "loss": 0.3131, + "step": 168500 + }, + { + "base_loss": 0.3252628707587719, + "epoch": 2.0123977661132812, + "grad_norm": 0.000883644272107631, + "learning_rate": 3.3882999420166016e-05, + "lookahead_loss": 6.501287104606629, + "loss": 0.3352, + "step": 169000 + }, + { + "base_loss": 0.30557073107361793, + "epoch": 2.0133514404296875, + "grad_norm": 0.0009423987939953804, + "learning_rate": 3.3835315704345706e-05, + "lookahead_loss": 6.586906661987305, + "loss": 0.3203, + "step": 169500 + }, + { + "base_loss": 0.30054079556465146, + "epoch": 2.0143051147460938, + "grad_norm": 0.0009685211116448045, + "learning_rate": 3.378763198852539e-05, + "lookahead_loss": 6.517388526916504, + "loss": 0.315, + "step": 170000 + }, + { + "epoch": 2.0143051147460938, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.553637696531253, + "eval_lookahead_perplexity": 701.792444028261, + "eval_loss": 0.1422322541475296, + "eval_perplexity": 1.1528443703325186, + "eval_runtime": 475.1232, + "eval_samples_per_second": 10.524, + "eval_steps_per_second": 0.33, + "step": 170000 + }, + { + "base_loss": 0.29648803743720054, + "epoch": 2.0152587890625, + "grad_norm": 0.0009045371552929282, + "learning_rate": 3.373994827270508e-05, + "lookahead_loss": 6.50550382566452, + "loss": 0.3072, + "step": 170500 + }, + { + "base_loss": 0.31412097451090815, + "epoch": 2.0162124633789062, + "grad_norm": 0.0009835059754550457, + "learning_rate": 3.369226455688477e-05, + "lookahead_loss": 6.529936217784882, + "loss": 0.3253, + "step": 171000 + }, + { + "base_loss": 0.3125672063827515, + "epoch": 2.0171661376953125, + "grad_norm": 0.00090819998877123, + "learning_rate": 3.364458084106445e-05, + "lookahead_loss": 6.578486999034881, + "loss": 0.3233, + "step": 171500 + }, + { + "base_loss": 0.3002317441105843, + "epoch": 2.0181198120117188, + "grad_norm": 0.0009094449342228472, + "learning_rate": 3.3596897125244143e-05, + "lookahead_loss": 6.584220232963562, + "loss": 0.3104, + "step": 172000 + }, + { + "base_loss": 0.29831535935401915, + "epoch": 2.019073486328125, + "grad_norm": 0.000955162278842181, + "learning_rate": 3.354921340942383e-05, + "lookahead_loss": 6.600418879508972, + "loss": 0.3102, + "step": 172500 + }, + { + "base_loss": 0.3020369653701782, + "epoch": 2.0200271606445312, + "grad_norm": 0.0010421768529340625, + "learning_rate": 3.350152969360352e-05, + "lookahead_loss": 6.4263093366622925, + "loss": 0.3149, + "step": 173000 + }, + { + "base_loss": 0.32652922403812407, + "epoch": 2.0209808349609375, + "grad_norm": 0.0009437088738195598, + "learning_rate": 3.345384597778321e-05, + "lookahead_loss": 6.5167139654159545, + "loss": 0.3389, + "step": 173500 + }, + { + "base_loss": 0.30453234216570857, + "epoch": 2.0219345092773438, + "grad_norm": 0.0009512313990853727, + "learning_rate": 3.340616226196289e-05, + "lookahead_loss": 6.482069372653961, + "loss": 0.3138, + "step": 174000 + }, + { + "base_loss": 0.2977458454966545, + "epoch": 2.02288818359375, + "grad_norm": 0.0010046373354271054, + "learning_rate": 3.335847854614258e-05, + "lookahead_loss": 6.520741944313049, + "loss": 0.3118, + "step": 174500 + }, + { + "base_loss": 0.30405546057224275, + "epoch": 2.0238418579101562, + "grad_norm": 0.0009604114457033575, + "learning_rate": 3.3310794830322264e-05, + "lookahead_loss": 6.482968965530396, + "loss": 0.3142, + "step": 175000 + }, + { + "epoch": 2.0238418579101562, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.541717086737148, + "eval_lookahead_perplexity": 693.4763152866533, + "eval_loss": 0.1422090083360672, + "eval_perplexity": 1.152817571841118, + "eval_runtime": 485.9421, + "eval_samples_per_second": 10.289, + "eval_steps_per_second": 0.323, + "step": 175000 + }, + { + "base_loss": 0.32463854083418847, + "epoch": 2.0247955322265625, + "grad_norm": 0.0009208981646224856, + "learning_rate": 3.3263111114501955e-05, + "lookahead_loss": 6.484801607131958, + "loss": 0.3355, + "step": 175500 + }, + { + "base_loss": 0.3075324648320675, + "epoch": 2.0257492065429688, + "grad_norm": 0.0009852510411292315, + "learning_rate": 3.3215427398681645e-05, + "lookahead_loss": 6.444905442237854, + "loss": 0.323, + "step": 176000 + }, + { + "base_loss": 0.30398501074314116, + "epoch": 2.026702880859375, + "grad_norm": 0.000977648189291358, + "learning_rate": 3.316774368286133e-05, + "lookahead_loss": 6.460987593650818, + "loss": 0.3133, + "step": 176500 + }, + { + "base_loss": 0.3081837382018566, + "epoch": 2.0276565551757812, + "grad_norm": 0.000948708038777113, + "learning_rate": 3.312005996704102e-05, + "lookahead_loss": 6.576977911949157, + "loss": 0.3183, + "step": 177000 + }, + { + "base_loss": 0.32895678067207335, + "epoch": 2.0286102294921875, + "grad_norm": 0.0009632044821046293, + "learning_rate": 3.30723762512207e-05, + "lookahead_loss": 6.5979501276016235, + "loss": 0.3403, + "step": 177500 + }, + { + "base_loss": 0.30588172587752344, + "epoch": 2.0295639038085938, + "grad_norm": 0.0009475542465224862, + "learning_rate": 3.302469253540039e-05, + "lookahead_loss": 6.530917593955993, + "loss": 0.3147, + "step": 178000 + }, + { + "base_loss": 0.3051903445720673, + "epoch": 2.030517578125, + "grad_norm": 0.0009406576864421368, + "learning_rate": 3.297700881958008e-05, + "lookahead_loss": 6.51764566040039, + "loss": 0.3167, + "step": 178500 + }, + { + "base_loss": 0.30346439191699026, + "epoch": 2.0314712524414062, + "grad_norm": 0.0009703211835585535, + "learning_rate": 3.2929325103759766e-05, + "lookahead_loss": 6.543263789176941, + "loss": 0.3155, + "step": 179000 + }, + { + "base_loss": 0.31795056411623956, + "epoch": 2.0324249267578125, + "grad_norm": 0.0009707180433906615, + "learning_rate": 3.2881641387939456e-05, + "lookahead_loss": 6.475300903320313, + "loss": 0.3348, + "step": 179500 + }, + { + "base_loss": 0.30795893451571466, + "epoch": 2.0333786010742188, + "grad_norm": 0.00098248606082052, + "learning_rate": 3.283395767211914e-05, + "lookahead_loss": 6.536730496883393, + "loss": 0.3178, + "step": 180000 + }, + { + "epoch": 2.0333786010742188, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.531855725632689, + "eval_lookahead_perplexity": 686.6713034107797, + "eval_loss": 0.1421896070241928, + "eval_perplexity": 1.1527952058848376, + "eval_runtime": 486.5429, + "eval_samples_per_second": 10.277, + "eval_steps_per_second": 0.323, + "step": 180000 + }, + { + "base_loss": 0.3031257001161575, + "epoch": 2.034332275390625, + "grad_norm": 0.0009016969706863165, + "learning_rate": 3.278627395629883e-05, + "lookahead_loss": 6.603059131622315, + "loss": 0.3153, + "step": 180500 + }, + { + "base_loss": 0.31068781118094924, + "epoch": 2.0352859497070312, + "grad_norm": 0.0010222060373052955, + "learning_rate": 3.273859024047852e-05, + "lookahead_loss": 6.439461089611053, + "loss": 0.3228, + "step": 181000 + }, + { + "base_loss": 0.32500979214906695, + "epoch": 2.0362396240234375, + "grad_norm": 0.0009705196134746075, + "learning_rate": 3.26909065246582e-05, + "lookahead_loss": 6.5541965799331665, + "loss": 0.337, + "step": 181500 + }, + { + "base_loss": 0.3069631262719631, + "epoch": 2.0371932983398438, + "grad_norm": 0.0010073435259982944, + "learning_rate": 3.2643222808837893e-05, + "lookahead_loss": 6.471248873710632, + "loss": 0.3175, + "step": 182000 + }, + { + "base_loss": 0.3025422422587872, + "epoch": 2.03814697265625, + "grad_norm": 0.0009568389505147934, + "learning_rate": 3.259553909301758e-05, + "lookahead_loss": 6.524754017829895, + "loss": 0.3133, + "step": 182500 + }, + { + "base_loss": 0.3076345331072807, + "epoch": 2.0391006469726562, + "grad_norm": 0.0009688584832474589, + "learning_rate": 3.254785537719727e-05, + "lookahead_loss": 6.467104331970215, + "loss": 0.3189, + "step": 183000 + }, + { + "base_loss": 0.3235399980545044, + "epoch": 2.0400543212890625, + "grad_norm": 0.0009437952539883554, + "learning_rate": 3.250017166137696e-05, + "lookahead_loss": 6.506529898166656, + "loss": 0.3336, + "step": 183500 + }, + { + "base_loss": 0.30506757298111914, + "epoch": 2.0410079956054688, + "grad_norm": 0.0009767162846401334, + "learning_rate": 3.245248794555664e-05, + "lookahead_loss": 6.448292857646942, + "loss": 0.3142, + "step": 184000 + }, + { + "base_loss": 0.29668092691898346, + "epoch": 2.041961669921875, + "grad_norm": 0.0009727113647386432, + "learning_rate": 3.240480422973633e-05, + "lookahead_loss": 6.536737553119659, + "loss": 0.3083, + "step": 184500 + }, + { + "base_loss": 0.30789948108792303, + "epoch": 2.0429153442382812, + "grad_norm": 0.000962116289883852, + "learning_rate": 3.2357120513916014e-05, + "lookahead_loss": 6.536309094429016, + "loss": 0.324, + "step": 185000 + }, + { + "epoch": 2.0429153442382812, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.522111146213909, + "eval_lookahead_perplexity": 680.0124767842166, + "eval_loss": 0.14217031002044678, + "eval_perplexity": 1.1527729606060655, + "eval_runtime": 473.7078, + "eval_samples_per_second": 10.555, + "eval_steps_per_second": 0.331, + "step": 185000 + }, + { + "base_loss": 0.3281280441880226, + "epoch": 2.0438690185546875, + "grad_norm": 0.0009924211772158742, + "learning_rate": 3.2309436798095705e-05, + "lookahead_loss": 6.569694968223572, + "loss": 0.3437, + "step": 185500 + }, + { + "base_loss": 0.2978555924296379, + "epoch": 2.0448226928710938, + "grad_norm": 0.0010296371765434742, + "learning_rate": 3.2261753082275395e-05, + "lookahead_loss": 6.4846109199523925, + "loss": 0.3105, + "step": 186000 + }, + { + "base_loss": 0.3044668311774731, + "epoch": 2.0457763671875, + "grad_norm": 0.0009649458806961775, + "learning_rate": 3.221406936645508e-05, + "lookahead_loss": 6.508920118331909, + "loss": 0.3174, + "step": 186500 + }, + { + "base_loss": 0.3298782432973385, + "epoch": 2.0467300415039062, + "grad_norm": 0.0009266676497645676, + "learning_rate": 3.216638565063477e-05, + "lookahead_loss": 6.470085307598114, + "loss": 0.3401, + "step": 187000 + }, + { + "base_loss": 0.32442897310853, + "epoch": 2.0476837158203125, + "grad_norm": 0.0009991949191316962, + "learning_rate": 3.211870193481445e-05, + "lookahead_loss": 6.5127511582374575, + "loss": 0.3385, + "step": 187500 + }, + { + "base_loss": 0.2941350122392178, + "epoch": 2.0486373901367188, + "grad_norm": 0.0009228453855030239, + "learning_rate": 3.207101821899414e-05, + "lookahead_loss": 6.472175216674804, + "loss": 0.3078, + "step": 188000 + }, + { + "base_loss": 0.301623804807663, + "epoch": 2.049591064453125, + "grad_norm": 0.0009880108991637826, + "learning_rate": 3.202333450317383e-05, + "lookahead_loss": 6.448336009979248, + "loss": 0.3146, + "step": 188500 + }, + { + "base_loss": 0.31965578559041025, + "epoch": 2.0505447387695312, + "grad_norm": 0.0009035322000272572, + "learning_rate": 3.1975650787353516e-05, + "lookahead_loss": 6.551394259452819, + "loss": 0.3321, + "step": 189000 + }, + { + "base_loss": 0.30511142282187936, + "epoch": 2.0514984130859375, + "grad_norm": 0.0009879703866317868, + "learning_rate": 3.1927967071533206e-05, + "lookahead_loss": 6.507521020889282, + "loss": 0.3188, + "step": 189500 + }, + { + "base_loss": 0.3033564644157887, + "epoch": 2.0524520874023438, + "grad_norm": 0.0010369070805609226, + "learning_rate": 3.188028335571289e-05, + "lookahead_loss": 6.442676889419555, + "loss": 0.316, + "step": 190000 + }, + { + "epoch": 2.0524520874023438, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.5135073006724395, + "eval_lookahead_perplexity": 674.1868517938349, + "eval_loss": 0.14215251803398132, + "eval_perplexity": 1.1527524506676095, + "eval_runtime": 486.1617, + "eval_samples_per_second": 10.285, + "eval_steps_per_second": 0.323, + "step": 190000 + }, + { + "base_loss": 0.32089028322696683, + "epoch": 2.05340576171875, + "grad_norm": 0.0009397159446962178, + "learning_rate": 3.183259963989258e-05, + "lookahead_loss": 6.455264377593994, + "loss": 0.3301, + "step": 190500 + }, + { + "base_loss": 0.35406574749946595, + "epoch": 2.0543594360351562, + "grad_norm": 0.0009735460043884814, + "learning_rate": 3.178491592407227e-05, + "lookahead_loss": 6.496513916969299, + "loss": 0.3693, + "step": 191000 + }, + { + "base_loss": 0.2938829956352711, + "epoch": 2.0553131103515625, + "grad_norm": 0.0009796855738386512, + "learning_rate": 3.173723220825195e-05, + "lookahead_loss": 6.495951771259308, + "loss": 0.3064, + "step": 191500 + }, + { + "base_loss": 0.30498689064383505, + "epoch": 2.0562667846679688, + "grad_norm": 0.000926612876355648, + "learning_rate": 3.1689548492431643e-05, + "lookahead_loss": 6.501594274520874, + "loss": 0.3175, + "step": 192000 + }, + { + "base_loss": 0.317481600522995, + "epoch": 2.057220458984375, + "grad_norm": 0.0009604953811503947, + "learning_rate": 3.164186477661133e-05, + "lookahead_loss": 6.519902579307556, + "loss": 0.3311, + "step": 192500 + }, + { + "base_loss": 0.3179551683664322, + "epoch": 2.0581741333007812, + "grad_norm": 0.0009685103432275355, + "learning_rate": 3.159418106079102e-05, + "lookahead_loss": 6.514713489532471, + "loss": 0.3293, + "step": 193000 + }, + { + "base_loss": 0.29271650505065916, + "epoch": 2.0591278076171875, + "grad_norm": 0.0009422221919521689, + "learning_rate": 3.154649734497071e-05, + "lookahead_loss": 6.422006649971008, + "loss": 0.3067, + "step": 193500 + }, + { + "base_loss": 0.3039356949329376, + "epoch": 2.0600814819335938, + "grad_norm": 0.0009756973595358431, + "learning_rate": 3.149881362915039e-05, + "lookahead_loss": 6.483645843505859, + "loss": 0.3183, + "step": 194000 + }, + { + "base_loss": 0.32165152502059935, + "epoch": 2.06103515625, + "grad_norm": 0.0009674925822764635, + "learning_rate": 3.145112991333008e-05, + "lookahead_loss": 6.4522641057968135, + "loss": 0.3326, + "step": 194500 + }, + { + "base_loss": 0.3061283130943775, + "epoch": 2.0619888305664062, + "grad_norm": 0.0010340444277971983, + "learning_rate": 3.1403446197509764e-05, + "lookahead_loss": 6.467900278091431, + "loss": 0.3165, + "step": 195000 + }, + { + "epoch": 2.0619888305664062, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.5032055431280655, + "eval_lookahead_perplexity": 667.2771942234244, + "eval_loss": 0.1421336829662323, + "eval_perplexity": 1.152730738701577, + "eval_runtime": 481.0535, + "eval_samples_per_second": 10.394, + "eval_steps_per_second": 0.326, + "step": 195000 + }, + { + "base_loss": 0.30640321379899976, + "epoch": 2.0629425048828125, + "grad_norm": 0.0009878401178866625, + "learning_rate": 3.1355762481689455e-05, + "lookahead_loss": 6.44850652551651, + "loss": 0.3171, + "step": 195500 + }, + { + "base_loss": 0.31782649287581444, + "epoch": 2.0638961791992188, + "grad_norm": 0.0009471693192608654, + "learning_rate": 3.1308078765869145e-05, + "lookahead_loss": 6.486370619773865, + "loss": 0.3303, + "step": 196000 + }, + { + "base_loss": 0.30349985790252687, + "epoch": 2.064849853515625, + "grad_norm": 0.0009686322882771492, + "learning_rate": 3.126039505004883e-05, + "lookahead_loss": 6.50835819530487, + "loss": 0.3185, + "step": 196500 + }, + { + "base_loss": 0.3075440634191036, + "epoch": 2.0658035278320312, + "grad_norm": 0.0009692518506199121, + "learning_rate": 3.121271133422852e-05, + "lookahead_loss": 6.4442818622589115, + "loss": 0.3204, + "step": 197000 + }, + { + "base_loss": 0.3064390652179718, + "epoch": 2.0667572021484375, + "grad_norm": 0.0009859678102657199, + "learning_rate": 3.11650276184082e-05, + "lookahead_loss": 6.408861030578613, + "loss": 0.3165, + "step": 197500 + }, + { + "base_loss": 0.3303199237883091, + "epoch": 2.0677108764648438, + "grad_norm": 0.0010187255684286356, + "learning_rate": 3.111734390258789e-05, + "lookahead_loss": 6.487036975860596, + "loss": 0.3406, + "step": 198000 + }, + { + "base_loss": 0.2994670196175575, + "epoch": 2.06866455078125, + "grad_norm": 0.0010079372441396117, + "learning_rate": 3.106966018676758e-05, + "lookahead_loss": 6.467383036613464, + "loss": 0.3102, + "step": 198500 + }, + { + "base_loss": 0.3000359579175711, + "epoch": 2.0696182250976562, + "grad_norm": 0.0009658647468313575, + "learning_rate": 3.1021976470947266e-05, + "lookahead_loss": 6.49391339302063, + "loss": 0.3156, + "step": 199000 + }, + { + "base_loss": 0.34639680609107015, + "epoch": 2.0705718994140625, + "grad_norm": 0.0009569233516231179, + "learning_rate": 3.0974292755126956e-05, + "lookahead_loss": 6.382178443908692, + "loss": 0.3587, + "step": 199500 + }, + { + "base_loss": 0.3132462115287781, + "epoch": 2.0715255737304688, + "grad_norm": 0.0009783732239156961, + "learning_rate": 3.092660903930664e-05, + "lookahead_loss": 6.4411235184669495, + "loss": 0.3241, + "step": 200000 + }, + { + "epoch": 2.0715255737304688, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.493639263482139, + "eval_lookahead_perplexity": 660.9242693582831, + "eval_loss": 0.1421152651309967, + "eval_perplexity": 1.1527095080922722, + "eval_runtime": 490.5924, + "eval_samples_per_second": 10.192, + "eval_steps_per_second": 0.32, + "step": 200000 + }, + { + "base_loss": 0.3044133240580559, + "epoch": 1.0009536743164062, + "grad_norm": 0.000980817130766809, + "learning_rate": 3.087892532348633e-05, + "lookahead_loss": 6.5381371078491215, + "loss": 0.3133, + "step": 200500 + }, + { + "base_loss": 0.30059696701169014, + "epoch": 1.0019073486328125, + "grad_norm": 0.000991058419458568, + "learning_rate": 3.083124160766602e-05, + "lookahead_loss": 6.378455901622773, + "loss": 0.3132, + "step": 201000 + }, + { + "base_loss": 0.31169990518689156, + "epoch": 1.0028610229492188, + "grad_norm": 0.0009886854095384479, + "learning_rate": 3.07835578918457e-05, + "lookahead_loss": 6.3639041023254395, + "loss": 0.3209, + "step": 201500 + }, + { + "base_loss": 0.3227726019620895, + "epoch": 1.003814697265625, + "grad_norm": 0.0009649033308960497, + "learning_rate": 3.0735874176025393e-05, + "lookahead_loss": 6.3965963945388795, + "loss": 0.3349, + "step": 202000 + }, + { + "base_loss": 0.3022470915019512, + "epoch": 1.0047683715820312, + "grad_norm": 0.0009402409195899963, + "learning_rate": 3.068819046020508e-05, + "lookahead_loss": 6.376853492736816, + "loss": 0.3166, + "step": 202500 + }, + { + "base_loss": 0.30552061820030213, + "epoch": 1.0057220458984375, + "grad_norm": 0.0008486240985803306, + "learning_rate": 3.064050674438477e-05, + "lookahead_loss": 6.5066717085838315, + "loss": 0.3134, + "step": 203000 + }, + { + "base_loss": 0.2953472335338593, + "epoch": 1.0066757202148438, + "grad_norm": 0.000918999663554132, + "learning_rate": 3.059282302856446e-05, + "lookahead_loss": 6.362914432525635, + "loss": 0.3139, + "step": 203500 + }, + { + "base_loss": 0.312746944963932, + "epoch": 1.00762939453125, + "grad_norm": 0.0009720294619910419, + "learning_rate": 3.054513931274414e-05, + "lookahead_loss": 6.418793338775635, + "loss": 0.3236, + "step": 204000 + }, + { + "base_loss": 0.3169711889922619, + "epoch": 1.0085830688476562, + "grad_norm": 0.0009424291201867163, + "learning_rate": 3.049745559692383e-05, + "lookahead_loss": 6.410425403594971, + "loss": 0.3216, + "step": 204500 + }, + { + "base_loss": 0.306710629016161, + "epoch": 1.0095367431640625, + "grad_norm": 0.0009739029337652028, + "learning_rate": 3.0449771881103518e-05, + "lookahead_loss": 6.414248113632202, + "loss": 0.3193, + "step": 205000 + }, + { + "epoch": 1.0095367431640625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.484710060369474, + "eval_lookahead_perplexity": 655.0490120345486, + "eval_loss": 0.14209794998168945, + "eval_perplexity": 1.1526895489278302, + "eval_runtime": 477.8749, + "eval_samples_per_second": 10.463, + "eval_steps_per_second": 0.329, + "step": 205000 + }, + { + "base_loss": 0.30083237382769584, + "epoch": 1.0104904174804688, + "grad_norm": 0.0009575801086612046, + "learning_rate": 3.0402088165283205e-05, + "lookahead_loss": 6.388895375728607, + "loss": 0.3107, + "step": 205500 + }, + { + "base_loss": 0.2993237827420235, + "epoch": 1.011444091796875, + "grad_norm": 0.0010149423032999039, + "learning_rate": 3.035440444946289e-05, + "lookahead_loss": 6.404768172740936, + "loss": 0.3134, + "step": 206000 + }, + { + "base_loss": 0.3238567093908787, + "epoch": 1.0123977661132812, + "grad_norm": 0.0008795901667326689, + "learning_rate": 3.0306720733642578e-05, + "lookahead_loss": 6.385066100120545, + "loss": 0.3337, + "step": 206500 + }, + { + "base_loss": 0.3051931007504463, + "epoch": 1.0133514404296875, + "grad_norm": 0.000939674791879952, + "learning_rate": 3.025903701782227e-05, + "lookahead_loss": 6.491093005180359, + "loss": 0.3197, + "step": 207000 + }, + { + "base_loss": 0.29808008483052256, + "epoch": 1.0143051147460938, + "grad_norm": 0.0009383645956404507, + "learning_rate": 3.0211353302001955e-05, + "lookahead_loss": 6.434820441246033, + "loss": 0.3131, + "step": 207500 + }, + { + "base_loss": 0.29345863962173463, + "epoch": 1.0152587890625, + "grad_norm": 0.0008934679790399969, + "learning_rate": 3.0163669586181642e-05, + "lookahead_loss": 6.370939202308655, + "loss": 0.3054, + "step": 208000 + }, + { + "base_loss": 0.3092884007692337, + "epoch": 1.0162124633789062, + "grad_norm": 0.000988858169876039, + "learning_rate": 3.011598587036133e-05, + "lookahead_loss": 6.438449412822723, + "loss": 0.321, + "step": 208500 + }, + { + "base_loss": 0.31143338218331335, + "epoch": 1.0171661376953125, + "grad_norm": 0.0008980570128187537, + "learning_rate": 3.0068302154541016e-05, + "lookahead_loss": 6.477062056541443, + "loss": 0.3228, + "step": 209000 + }, + { + "base_loss": 0.3001442384421825, + "epoch": 1.0181198120117188, + "grad_norm": 0.0009314729832112789, + "learning_rate": 3.0020618438720706e-05, + "lookahead_loss": 6.479909467697143, + "loss": 0.3114, + "step": 209500 + }, + { + "base_loss": 0.2986592257618904, + "epoch": 1.019073486328125, + "grad_norm": 0.0009430127684026957, + "learning_rate": 2.9972934722900393e-05, + "lookahead_loss": 6.505731671333313, + "loss": 0.3111, + "step": 210000 + }, + { + "epoch": 1.019073486328125, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.475796708664574, + "eval_lookahead_perplexity": 649.2362737806238, + "eval_loss": 0.14207999408245087, + "eval_perplexity": 1.152668851536257, + "eval_runtime": 509.9698, + "eval_samples_per_second": 9.805, + "eval_steps_per_second": 0.308, + "step": 210000 + }, + { + "base_loss": 0.30347599306702616, + "epoch": 1.0200271606445312, + "grad_norm": 0.0010456894524395466, + "learning_rate": 2.992525100708008e-05, + "lookahead_loss": 6.344668842315674, + "loss": 0.3151, + "step": 210500 + }, + { + "base_loss": 0.3299741225540638, + "epoch": 1.0209808349609375, + "grad_norm": 0.0009360331459902227, + "learning_rate": 2.9877567291259766e-05, + "lookahead_loss": 6.4390918169021605, + "loss": 0.3405, + "step": 211000 + }, + { + "base_loss": 0.3070560489296913, + "epoch": 1.0219345092773438, + "grad_norm": 0.0009817855898290873, + "learning_rate": 2.9829883575439453e-05, + "lookahead_loss": 6.387048331260681, + "loss": 0.3164, + "step": 211500 + }, + { + "base_loss": 0.301061170309782, + "epoch": 1.02288818359375, + "grad_norm": 0.0009992312407121062, + "learning_rate": 2.9782199859619143e-05, + "lookahead_loss": 6.428957866668701, + "loss": 0.3134, + "step": 212000 + }, + { + "base_loss": 0.30337609922885894, + "epoch": 1.0238418579101562, + "grad_norm": 0.0009748202282935381, + "learning_rate": 2.973451614379883e-05, + "lookahead_loss": 6.38404591846466, + "loss": 0.3145, + "step": 212500 + }, + { + "base_loss": 0.3241444931924343, + "epoch": 1.0247955322265625, + "grad_norm": 0.0009037147392518818, + "learning_rate": 2.9686832427978517e-05, + "lookahead_loss": 6.393755014896393, + "loss": 0.3362, + "step": 213000 + }, + { + "base_loss": 0.3070600248277187, + "epoch": 1.0257492065429688, + "grad_norm": 0.000968066742643714, + "learning_rate": 2.9639148712158204e-05, + "lookahead_loss": 6.347007934570312, + "loss": 0.3213, + "step": 213500 + }, + { + "base_loss": 0.3022406686246395, + "epoch": 1.026702880859375, + "grad_norm": 0.0010011859703809023, + "learning_rate": 2.959146499633789e-05, + "lookahead_loss": 6.375035131454468, + "loss": 0.3111, + "step": 214000 + }, + { + "base_loss": 0.30680677881836893, + "epoch": 1.0276565551757812, + "grad_norm": 0.000959697412326932, + "learning_rate": 2.954378128051758e-05, + "lookahead_loss": 6.487227991104126, + "loss": 0.3189, + "step": 214500 + }, + { + "base_loss": 0.33426042160391806, + "epoch": 1.0286102294921875, + "grad_norm": 0.0009539132006466389, + "learning_rate": 2.9496097564697268e-05, + "lookahead_loss": 6.501760796546936, + "loss": 0.3426, + "step": 215000 + }, + { + "epoch": 1.0286102294921875, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.4675389867240245, + "eval_lookahead_perplexity": 643.8971360549982, + "eval_loss": 0.14206384122371674, + "eval_perplexity": 1.1526502327895043, + "eval_runtime": 483.9033, + "eval_samples_per_second": 10.333, + "eval_steps_per_second": 0.324, + "step": 215000 + }, + { + "base_loss": 0.3049518305659294, + "epoch": 1.0295639038085938, + "grad_norm": 0.0009558768360875547, + "learning_rate": 2.9448413848876955e-05, + "lookahead_loss": 6.452504009246826, + "loss": 0.3133, + "step": 215500 + }, + { + "base_loss": 0.3062360401749611, + "epoch": 1.030517578125, + "grad_norm": 0.0009344167774543166, + "learning_rate": 2.940073013305664e-05, + "lookahead_loss": 6.419843075275421, + "loss": 0.3168, + "step": 216000 + }, + { + "base_loss": 0.30225355681777, + "epoch": 1.0314712524414062, + "grad_norm": 0.00096104945987463, + "learning_rate": 2.9353046417236328e-05, + "lookahead_loss": 6.45460268497467, + "loss": 0.3139, + "step": 216500 + }, + { + "base_loss": 0.3184074863195419, + "epoch": 1.0324249267578125, + "grad_norm": 0.0009695267654024065, + "learning_rate": 2.930536270141602e-05, + "lookahead_loss": 6.404374066352844, + "loss": 0.3355, + "step": 217000 + }, + { + "base_loss": 0.30629492220282556, + "epoch": 1.0333786010742188, + "grad_norm": 0.0010007137898355722, + "learning_rate": 2.9257678985595705e-05, + "lookahead_loss": 6.446723832607269, + "loss": 0.3158, + "step": 217500 + }, + { + "base_loss": 0.3031555346250534, + "epoch": 1.034332275390625, + "grad_norm": 0.0008954454096965492, + "learning_rate": 2.9209995269775392e-05, + "lookahead_loss": 6.511106987953186, + "loss": 0.3144, + "step": 218000 + }, + { + "base_loss": 0.31164542263746264, + "epoch": 1.0352859497070312, + "grad_norm": 0.001012432505376637, + "learning_rate": 2.916231155395508e-05, + "lookahead_loss": 6.340117260456085, + "loss": 0.3227, + "step": 218500 + }, + { + "base_loss": 0.324304408878088, + "epoch": 1.0362396240234375, + "grad_norm": 0.0009743616101332009, + "learning_rate": 2.9114627838134766e-05, + "lookahead_loss": 6.457801213741303, + "loss": 0.3358, + "step": 219000 + }, + { + "base_loss": 0.30813179594278334, + "epoch": 1.0371932983398438, + "grad_norm": 0.0009892105590552092, + "learning_rate": 2.9066944122314456e-05, + "lookahead_loss": 6.390979628562928, + "loss": 0.32, + "step": 219500 + }, + { + "base_loss": 0.30138176554441454, + "epoch": 1.03814697265625, + "grad_norm": 0.0009503848268650472, + "learning_rate": 2.9019260406494143e-05, + "lookahead_loss": 6.42660464668274, + "loss": 0.314, + "step": 220000 + }, + { + "epoch": 1.03814697265625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.459770851622755, + "eval_lookahead_perplexity": 638.9146335424198, + "eval_loss": 0.14204934239387512, + "eval_perplexity": 1.1526335208310643, + "eval_runtime": 509.6316, + "eval_samples_per_second": 9.811, + "eval_steps_per_second": 0.308, + "step": 220000 + }, + { + "base_loss": 0.30871694785356524, + "epoch": 1.0391006469726562, + "grad_norm": 0.0009405686287209392, + "learning_rate": 2.897157669067383e-05, + "lookahead_loss": 6.3810950956344605, + "loss": 0.3206, + "step": 220500 + }, + { + "base_loss": 0.32506244936585427, + "epoch": 1.0400543212890625, + "grad_norm": 0.0009293987532146275, + "learning_rate": 2.8923892974853516e-05, + "lookahead_loss": 6.408866803169251, + "loss": 0.334, + "step": 221000 + }, + { + "base_loss": 0.30769926142692566, + "epoch": 1.0410079956054688, + "grad_norm": 0.0009767550509423018, + "learning_rate": 2.8876209259033203e-05, + "lookahead_loss": 6.354361628055573, + "loss": 0.3168, + "step": 221500 + }, + { + "base_loss": 0.29858891409635546, + "epoch": 1.041961669921875, + "grad_norm": 0.0009748723823577166, + "learning_rate": 2.8828525543212893e-05, + "lookahead_loss": 6.44218283367157, + "loss": 0.31, + "step": 222000 + }, + { + "base_loss": 0.3094627737402916, + "epoch": 1.0429153442382812, + "grad_norm": 0.0009669262799434364, + "learning_rate": 2.878084182739258e-05, + "lookahead_loss": 6.456247410774231, + "loss": 0.3246, + "step": 222500 + }, + { + "base_loss": 0.32764697542786597, + "epoch": 1.0438690185546875, + "grad_norm": 0.0009784965077415109, + "learning_rate": 2.8733158111572267e-05, + "lookahead_loss": 6.482624104499817, + "loss": 0.3408, + "step": 223000 + }, + { + "base_loss": 0.29553532418608663, + "epoch": 1.0448226928710938, + "grad_norm": 0.000980429002083838, + "learning_rate": 2.8685474395751954e-05, + "lookahead_loss": 6.394272118568421, + "loss": 0.309, + "step": 223500 + }, + { + "base_loss": 0.3041268612146378, + "epoch": 1.0457763671875, + "grad_norm": 0.0009423611336387694, + "learning_rate": 2.863779067993164e-05, + "lookahead_loss": 6.412952316761017, + "loss": 0.3154, + "step": 224000 + }, + { + "base_loss": 0.3303914776444435, + "epoch": 1.0467300415039062, + "grad_norm": 0.0009371626656502485, + "learning_rate": 2.859010696411133e-05, + "lookahead_loss": 6.382916459560394, + "loss": 0.3398, + "step": 224500 + }, + { + "base_loss": 0.3248122656941414, + "epoch": 1.0476837158203125, + "grad_norm": 0.0010233812499791384, + "learning_rate": 2.8542423248291018e-05, + "lookahead_loss": 6.4274598398208616, + "loss": 0.3391, + "step": 225000 + }, + { + "epoch": 1.0476837158203125, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.452684202894997, + "eval_lookahead_perplexity": 634.4028754716901, + "eval_loss": 0.14203447103500366, + "eval_perplexity": 1.152616379731785, + "eval_runtime": 491.9684, + "eval_samples_per_second": 10.163, + "eval_steps_per_second": 0.319, + "step": 225000 + }, + { + "base_loss": 0.2969100174307823, + "epoch": 1.0486373901367188, + "grad_norm": 0.0009405760793015361, + "learning_rate": 2.8494739532470705e-05, + "lookahead_loss": 6.374634448051452, + "loss": 0.3088, + "step": 225500 + }, + { + "base_loss": 0.302005185931921, + "epoch": 1.049591064453125, + "grad_norm": 0.000992949353531003, + "learning_rate": 2.844705581665039e-05, + "lookahead_loss": 6.366885425567627, + "loss": 0.3164, + "step": 226000 + }, + { + "base_loss": 0.31874441370368006, + "epoch": 1.0505447387695312, + "grad_norm": 0.0008948028553277254, + "learning_rate": 2.8399372100830078e-05, + "lookahead_loss": 6.472296991348267, + "loss": 0.3313, + "step": 226500 + }, + { + "base_loss": 0.30408672893047334, + "epoch": 1.0514984130859375, + "grad_norm": 0.000984621699899435, + "learning_rate": 2.835168838500977e-05, + "lookahead_loss": 6.415002500534057, + "loss": 0.3183, + "step": 227000 + }, + { + "base_loss": 0.3058005510568619, + "epoch": 1.0524520874023438, + "grad_norm": 0.00102510757278651, + "learning_rate": 2.8304004669189455e-05, + "lookahead_loss": 6.365005512237548, + "loss": 0.3181, + "step": 227500 + }, + { + "base_loss": 0.32026463899016383, + "epoch": 1.05340576171875, + "grad_norm": 0.0009289232548326254, + "learning_rate": 2.8256320953369142e-05, + "lookahead_loss": 6.374865962982177, + "loss": 0.3294, + "step": 228000 + }, + { + "base_loss": 0.35889338579773905, + "epoch": 1.0543594360351562, + "grad_norm": 0.0009590112022124231, + "learning_rate": 2.820863723754883e-05, + "lookahead_loss": 6.41260059261322, + "loss": 0.371, + "step": 228500 + }, + { + "base_loss": 0.29546374672651293, + "epoch": 1.0553131103515625, + "grad_norm": 0.0009624488302506506, + "learning_rate": 2.8160953521728516e-05, + "lookahead_loss": 6.395633814334869, + "loss": 0.3062, + "step": 229000 + }, + { + "base_loss": 0.3063408683240414, + "epoch": 1.0562667846679688, + "grad_norm": 0.0009341055992990732, + "learning_rate": 2.8113269805908206e-05, + "lookahead_loss": 6.4170992894172665, + "loss": 0.3176, + "step": 229500 + }, + { + "base_loss": 0.3186078954935074, + "epoch": 1.057220458984375, + "grad_norm": 0.0009300449746660888, + "learning_rate": 2.8065586090087893e-05, + "lookahead_loss": 6.438336689949035, + "loss": 0.3315, + "step": 230000 + }, + { + "epoch": 1.057220458984375, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.445277567488698, + "eval_lookahead_perplexity": 629.721442905707, + "eval_loss": 0.142020121216774, + "eval_perplexity": 1.1525998400149184, + "eval_runtime": 485.202, + "eval_samples_per_second": 10.305, + "eval_steps_per_second": 0.324, + "step": 230000 + }, + { + "base_loss": 0.31768571627140046, + "epoch": 1.0581741333007812, + "grad_norm": 0.0009736772626638412, + "learning_rate": 2.801790237426758e-05, + "lookahead_loss": 6.425261500358581, + "loss": 0.3269, + "step": 230500 + }, + { + "base_loss": 0.2922684009075165, + "epoch": 1.0591278076171875, + "grad_norm": 0.0009367645834572613, + "learning_rate": 2.7970218658447266e-05, + "lookahead_loss": 6.343348423957825, + "loss": 0.3079, + "step": 231000 + }, + { + "base_loss": 0.30112267237901685, + "epoch": 1.0600814819335938, + "grad_norm": 0.0009572458802722394, + "learning_rate": 2.7922534942626953e-05, + "lookahead_loss": 6.400773149013519, + "loss": 0.315, + "step": 231500 + }, + { + "base_loss": 0.32029621145129206, + "epoch": 1.06103515625, + "grad_norm": 0.0009820304112508893, + "learning_rate": 2.7874851226806643e-05, + "lookahead_loss": 6.3678281145095825, + "loss": 0.3312, + "step": 232000 + }, + { + "base_loss": 0.30533574494719506, + "epoch": 1.0619888305664062, + "grad_norm": 0.001014688634313643, + "learning_rate": 2.782716751098633e-05, + "lookahead_loss": 6.370753986358642, + "loss": 0.3151, + "step": 232500 + }, + { + "base_loss": 0.30571810373663905, + "epoch": 1.0629425048828125, + "grad_norm": 0.00102465960662812, + "learning_rate": 2.7779483795166017e-05, + "lookahead_loss": 6.3701961503028866, + "loss": 0.3167, + "step": 233000 + }, + { + "base_loss": 0.31451627737283705, + "epoch": 1.0638961791992188, + "grad_norm": 0.0009522989275865257, + "learning_rate": 2.7731800079345704e-05, + "lookahead_loss": 6.405235059261322, + "loss": 0.3281, + "step": 233500 + }, + { + "base_loss": 0.30425655883550645, + "epoch": 1.064849853515625, + "grad_norm": 0.0009637218317948282, + "learning_rate": 2.768411636352539e-05, + "lookahead_loss": 6.427546417236328, + "loss": 0.3175, + "step": 234000 + }, + { + "base_loss": 0.31105126801133154, + "epoch": 1.0658035278320312, + "grad_norm": 0.0009667099802754819, + "learning_rate": 2.763643264770508e-05, + "lookahead_loss": 6.337568170547486, + "loss": 0.321, + "step": 234500 + }, + { + "base_loss": 0.3071163959801197, + "epoch": 1.0667572021484375, + "grad_norm": 0.0009796451777219772, + "learning_rate": 2.7588748931884768e-05, + "lookahead_loss": 6.3229367260932925, + "loss": 0.3165, + "step": 235000 + }, + { + "epoch": 1.0667572021484375, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.437671433622464, + "eval_lookahead_perplexity": 624.9498669395556, + "eval_loss": 0.14200608432292938, + "eval_perplexity": 1.1525836612068692, + "eval_runtime": 494.9992, + "eval_samples_per_second": 10.101, + "eval_steps_per_second": 0.317, + "step": 235000 + }, + { + "base_loss": 0.3321034919023514, + "epoch": 1.0677108764648438, + "grad_norm": 0.0010060551576316357, + "learning_rate": 2.7541065216064455e-05, + "lookahead_loss": 6.408410006523132, + "loss": 0.3428, + "step": 235500 + }, + { + "base_loss": 0.3017843673825264, + "epoch": 1.06866455078125, + "grad_norm": 0.00098395103123039, + "learning_rate": 2.749338150024414e-05, + "lookahead_loss": 6.3884356451034545, + "loss": 0.3103, + "step": 236000 + }, + { + "base_loss": 0.302195555627346, + "epoch": 1.0696182250976562, + "grad_norm": 0.0009701807866804302, + "learning_rate": 2.7445697784423828e-05, + "lookahead_loss": 6.414979763031006, + "loss": 0.3163, + "step": 236500 + }, + { + "base_loss": 0.3459869565963745, + "epoch": 1.0705718994140625, + "grad_norm": 0.0009390601189807057, + "learning_rate": 2.739801406860352e-05, + "lookahead_loss": 6.305636825084687, + "loss": 0.3601, + "step": 237000 + }, + { + "base_loss": 0.3151495299339294, + "epoch": 1.0715255737304688, + "grad_norm": 0.0009649373241700232, + "learning_rate": 2.7350330352783205e-05, + "lookahead_loss": 6.361052248954773, + "loss": 0.3261, + "step": 237500 + }, + { + "base_loss": 0.30790447345376015, + "epoch": 1.072479248046875, + "grad_norm": 0.0010175015777349472, + "learning_rate": 2.7302646636962892e-05, + "lookahead_loss": 6.435437150001526, + "loss": 0.3203, + "step": 238000 + }, + { + "base_loss": 0.30545566940307617, + "epoch": 1.0734329223632812, + "grad_norm": 0.0009223404340445995, + "learning_rate": 2.725496292114258e-05, + "lookahead_loss": 6.397067200183868, + "loss": 0.3185, + "step": 238500 + }, + { + "base_loss": 0.32841417971253395, + "epoch": 1.0743865966796875, + "grad_norm": 0.0009676161571405828, + "learning_rate": 2.7207279205322266e-05, + "lookahead_loss": 6.417798148632049, + "loss": 0.3405, + "step": 239000 + }, + { + "base_loss": 0.30363579127192497, + "epoch": 1.0753402709960938, + "grad_norm": 0.0009646876715123653, + "learning_rate": 2.7159595489501956e-05, + "lookahead_loss": 6.479552453041077, + "loss": 0.3159, + "step": 239500 + }, + { + "base_loss": 0.30504586565494535, + "epoch": 1.0762939453125, + "grad_norm": 0.0009274889598600566, + "learning_rate": 2.7111911773681643e-05, + "lookahead_loss": 6.43780565738678, + "loss": 0.3179, + "step": 240000 + }, + { + "epoch": 1.0762939453125, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.429800482222828, + "eval_lookahead_perplexity": 620.0502246280633, + "eval_loss": 0.14199130237102509, + "eval_perplexity": 1.1525666238965462, + "eval_runtime": 498.2628, + "eval_samples_per_second": 10.035, + "eval_steps_per_second": 0.315, + "step": 240000 + }, + { + "base_loss": 0.33029799509048463, + "epoch": 1.0772476196289062, + "grad_norm": 0.0009776068618521094, + "learning_rate": 2.706422805786133e-05, + "lookahead_loss": 6.427621715545654, + "loss": 0.3452, + "step": 240500 + }, + { + "base_loss": 0.3037954642176628, + "epoch": 1.0782012939453125, + "grad_norm": 0.0009672937449067831, + "learning_rate": 2.7016544342041016e-05, + "lookahead_loss": 6.43726087474823, + "loss": 0.3166, + "step": 241000 + }, + { + "base_loss": 0.29821320512890814, + "epoch": 1.0791549682617188, + "grad_norm": 0.0009347986779175699, + "learning_rate": 2.6968860626220703e-05, + "lookahead_loss": 6.435288313865661, + "loss": 0.3128, + "step": 241500 + }, + { + "base_loss": 0.3142137563228607, + "epoch": 1.080108642578125, + "grad_norm": 0.0009484716574661434, + "learning_rate": 2.6921176910400393e-05, + "lookahead_loss": 6.400076964378357, + "loss": 0.3302, + "step": 242000 + }, + { + "base_loss": 0.3222310249209404, + "epoch": 1.0810623168945312, + "grad_norm": 0.0009952324908226728, + "learning_rate": 2.687349319458008e-05, + "lookahead_loss": 6.442263381481171, + "loss": 0.3391, + "step": 242500 + }, + { + "base_loss": 0.3002626436650753, + "epoch": 1.0820159912109375, + "grad_norm": 0.0009078267030417919, + "learning_rate": 2.6825809478759767e-05, + "lookahead_loss": 6.434686975479126, + "loss": 0.3123, + "step": 243000 + }, + { + "base_loss": 0.3045452245473862, + "epoch": 1.0829696655273438, + "grad_norm": 0.0009887174237519503, + "learning_rate": 2.6778125762939454e-05, + "lookahead_loss": 6.471680318832397, + "loss": 0.3184, + "step": 243500 + }, + { + "base_loss": 0.33469617655873296, + "epoch": 1.08392333984375, + "grad_norm": 0.0009395595989190042, + "learning_rate": 2.673044204711914e-05, + "lookahead_loss": 6.481926362037659, + "loss": 0.3456, + "step": 244000 + }, + { + "base_loss": 0.30740025800466536, + "epoch": 1.0848770141601562, + "grad_norm": 0.0009517846046946943, + "learning_rate": 2.668275833129883e-05, + "lookahead_loss": 6.399352873802185, + "loss": 0.3189, + "step": 244500 + }, + { + "base_loss": 0.300477741509676, + "epoch": 1.0858306884765625, + "grad_norm": 0.0010044319787994027, + "learning_rate": 2.6635074615478518e-05, + "lookahead_loss": 6.401573163032531, + "loss": 0.3098, + "step": 245000 + }, + { + "epoch": 1.0858306884765625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.42229609748426, + "eval_lookahead_perplexity": 615.4145449024552, + "eval_loss": 0.14197739958763123, + "eval_perplexity": 1.152550600123815, + "eval_runtime": 481.6396, + "eval_samples_per_second": 10.381, + "eval_steps_per_second": 0.326, + "step": 245000 + }, + { + "base_loss": 0.301901563256979, + "epoch": 1.0867843627929688, + "grad_norm": 0.00112338166218251, + "learning_rate": 2.6587390899658205e-05, + "lookahead_loss": 6.377507792472839, + "loss": 0.3128, + "step": 245500 + }, + { + "base_loss": 0.338142231285572, + "epoch": 1.087738037109375, + "grad_norm": 0.0008945403969846666, + "learning_rate": 2.653970718383789e-05, + "lookahead_loss": 6.43312175321579, + "loss": 0.345, + "step": 246000 + }, + { + "base_loss": 0.3009798979461193, + "epoch": 1.0886917114257812, + "grad_norm": 0.000983057077974081, + "learning_rate": 2.6492023468017578e-05, + "lookahead_loss": 6.410857200622559, + "loss": 0.3118, + "step": 246500 + }, + { + "base_loss": 0.3090392453968525, + "epoch": 1.0896453857421875, + "grad_norm": 0.0010020765475928783, + "learning_rate": 2.644433975219727e-05, + "lookahead_loss": 6.44453787612915, + "loss": 0.3179, + "step": 247000 + }, + { + "base_loss": 0.30036539113521576, + "epoch": 1.0905990600585938, + "grad_norm": 0.0009690640727058053, + "learning_rate": 2.6396656036376955e-05, + "lookahead_loss": 6.414513184547424, + "loss": 0.3119, + "step": 247500 + }, + { + "base_loss": 0.3006012495756149, + "epoch": 1.091552734375, + "grad_norm": 0.0008964896551333368, + "learning_rate": 2.6348972320556642e-05, + "lookahead_loss": 6.387368701457977, + "loss": 0.3108, + "step": 248000 + }, + { + "base_loss": 0.31875682109594344, + "epoch": 1.0925064086914062, + "grad_norm": 0.0009248583228327334, + "learning_rate": 2.630128860473633e-05, + "lookahead_loss": 6.417301115036011, + "loss": 0.3352, + "step": 248500 + }, + { + "base_loss": 0.3104289738535881, + "epoch": 1.0934600830078125, + "grad_norm": 0.0009198631742037833, + "learning_rate": 2.6253604888916016e-05, + "lookahead_loss": 6.426006122589111, + "loss": 0.3215, + "step": 249000 + }, + { + "base_loss": 0.2877590928971767, + "epoch": 1.0944137573242188, + "grad_norm": 0.0009729803423397243, + "learning_rate": 2.6205921173095706e-05, + "lookahead_loss": 6.423968379020691, + "loss": 0.3015, + "step": 249500 + }, + { + "base_loss": 0.2935507807135582, + "epoch": 1.095367431640625, + "grad_norm": 0.0009585893130861223, + "learning_rate": 2.6158237457275393e-05, + "lookahead_loss": 6.3482320585250855, + "loss": 0.3071, + "step": 250000 + }, + { + "epoch": 1.095367431640625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.416615177648136, + "eval_lookahead_perplexity": 611.9283360192982, + "eval_loss": 0.14196503162384033, + "eval_perplexity": 1.1525363455078759, + "eval_runtime": 497.7474, + "eval_samples_per_second": 10.045, + "eval_steps_per_second": 0.315, + "step": 250000 + }, + { + "base_loss": 0.2986202912926674, + "epoch": 1.0963211059570312, + "grad_norm": 0.0009496866841800511, + "learning_rate": 2.611055374145508e-05, + "lookahead_loss": 6.419412896156311, + "loss": 0.3119, + "step": 250500 + }, + { + "base_loss": 0.3307524161040783, + "epoch": 1.0972747802734375, + "grad_norm": 0.0009860220598056912, + "learning_rate": 2.6062870025634766e-05, + "lookahead_loss": 6.438486506462097, + "loss": 0.3414, + "step": 251000 + }, + { + "base_loss": 0.29244673988223074, + "epoch": 1.0982284545898438, + "grad_norm": 0.0009862987790256739, + "learning_rate": 2.6015186309814453e-05, + "lookahead_loss": 6.378677644729614, + "loss": 0.307, + "step": 251500 + }, + { + "base_loss": 0.295786843508482, + "epoch": 1.09918212890625, + "grad_norm": 0.0009763432899489999, + "learning_rate": 2.5967502593994143e-05, + "lookahead_loss": 6.428773358821869, + "loss": 0.3108, + "step": 252000 + }, + { + "base_loss": 0.30293611577153207, + "epoch": 1.1001358032226562, + "grad_norm": 0.0009206574759446084, + "learning_rate": 2.591981887817383e-05, + "lookahead_loss": 6.424844082832337, + "loss": 0.3149, + "step": 252500 + }, + { + "base_loss": 0.3240869597494602, + "epoch": 1.1010894775390625, + "grad_norm": 0.0009480842272751033, + "learning_rate": 2.5872135162353517e-05, + "lookahead_loss": 6.487875280857086, + "loss": 0.3337, + "step": 253000 + }, + { + "base_loss": 0.30599541807174685, + "epoch": 1.1020431518554688, + "grad_norm": 0.0009581232443451881, + "learning_rate": 2.5824451446533204e-05, + "lookahead_loss": 6.4391917552948, + "loss": 0.315, + "step": 253500 + }, + { + "base_loss": 0.2991089904308319, + "epoch": 1.102996826171875, + "grad_norm": 0.0009612528956495225, + "learning_rate": 2.577676773071289e-05, + "lookahead_loss": 6.4068403224945065, + "loss": 0.3137, + "step": 254000 + }, + { + "base_loss": 0.30219315418601034, + "epoch": 1.1039505004882812, + "grad_norm": 0.001000542426481843, + "learning_rate": 2.572908401489258e-05, + "lookahead_loss": 6.44905508184433, + "loss": 0.3123, + "step": 254500 + }, + { + "base_loss": 0.3133500624895096, + "epoch": 1.1049041748046875, + "grad_norm": 0.000969534448813647, + "learning_rate": 2.5681400299072268e-05, + "lookahead_loss": 6.377971741676331, + "loss": 0.3269, + "step": 255000 + }, + { + "epoch": 1.1049041748046875, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.41043955373307, + "eval_lookahead_perplexity": 608.1609417317039, + "eval_loss": 0.14195261895656586, + "eval_perplexity": 1.1525220395464852, + "eval_runtime": 493.4765, + "eval_samples_per_second": 10.132, + "eval_steps_per_second": 0.318, + "step": 255000 + }, + { + "base_loss": 0.3070584389269352, + "epoch": 1.1058578491210938, + "grad_norm": 0.0009202616638503969, + "learning_rate": 2.5633716583251955e-05, + "lookahead_loss": 6.378754141807556, + "loss": 0.3209, + "step": 255500 + }, + { + "base_loss": 0.29948241996765135, + "epoch": 1.1068115234375, + "grad_norm": 0.0009544827044010162, + "learning_rate": 2.558603286743164e-05, + "lookahead_loss": 6.456400173187256, + "loss": 0.3095, + "step": 256000 + }, + { + "base_loss": 0.29492466670274736, + "epoch": 1.1077651977539062, + "grad_norm": 0.0009679844952188432, + "learning_rate": 2.5538349151611328e-05, + "lookahead_loss": 6.373538660526275, + "loss": 0.308, + "step": 256500 + }, + { + "base_loss": 0.3188383647501469, + "epoch": 1.1087188720703125, + "grad_norm": 0.0010139403166249394, + "learning_rate": 2.549066543579102e-05, + "lookahead_loss": 6.368225742340088, + "loss": 0.3312, + "step": 257000 + }, + { + "base_loss": 0.31659464621543887, + "epoch": 1.1096725463867188, + "grad_norm": 0.0009056073613464832, + "learning_rate": 2.5442981719970705e-05, + "lookahead_loss": 6.33794606256485, + "loss": 0.3259, + "step": 257500 + }, + { + "base_loss": 0.3013280538916588, + "epoch": 1.110626220703125, + "grad_norm": 0.0008776048780418932, + "learning_rate": 2.5395298004150392e-05, + "lookahead_loss": 6.292243501186371, + "loss": 0.3138, + "step": 258000 + }, + { + "base_loss": 0.29822684854269027, + "epoch": 1.1115798950195312, + "grad_norm": 0.0010010605910792947, + "learning_rate": 2.534761428833008e-05, + "lookahead_loss": 6.424101258277893, + "loss": 0.3086, + "step": 258500 + }, + { + "base_loss": 0.3082665235698223, + "epoch": 1.1125335693359375, + "grad_norm": 0.0009309147717431188, + "learning_rate": 2.5299930572509766e-05, + "lookahead_loss": 6.439701238632202, + "loss": 0.3208, + "step": 259000 + }, + { + "base_loss": 0.34342152199149134, + "epoch": 1.1134872436523438, + "grad_norm": 0.0010006511583924294, + "learning_rate": 2.5252246856689456e-05, + "lookahead_loss": 6.450336651325226, + "loss": 0.35, + "step": 259500 + }, + { + "base_loss": 0.29527223294973376, + "epoch": 1.11444091796875, + "grad_norm": 0.0009478523279540241, + "learning_rate": 2.5204563140869143e-05, + "lookahead_loss": 6.324613780021667, + "loss": 0.3085, + "step": 260000 + }, + { + "epoch": 1.11444091796875, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.404462502787289, + "eval_lookahead_perplexity": 604.5367745052963, + "eval_loss": 0.14194083213806152, + "eval_perplexity": 1.152508455058442, + "eval_runtime": 492.6929, + "eval_samples_per_second": 10.148, + "eval_steps_per_second": 0.319, + "step": 260000 + }, + { + "base_loss": 0.2983711498081684, + "epoch": 1.1153945922851562, + "grad_norm": 0.0009710767772048712, + "learning_rate": 2.515687942504883e-05, + "lookahead_loss": 6.38006550693512, + "loss": 0.3108, + "step": 260500 + }, + { + "base_loss": 0.3164993856549263, + "epoch": 1.1163482666015625, + "grad_norm": 0.0010414492571726441, + "learning_rate": 2.5109195709228516e-05, + "lookahead_loss": 6.3788374900817875, + "loss": 0.3248, + "step": 261000 + }, + { + "base_loss": 0.3281388694047928, + "epoch": 1.1173019409179688, + "grad_norm": 0.000990699976682663, + "learning_rate": 2.5061511993408203e-05, + "lookahead_loss": 6.393062267303467, + "loss": 0.3435, + "step": 261500 + }, + { + "base_loss": 0.3066762860417366, + "epoch": 1.118255615234375, + "grad_norm": 0.0010284364689141512, + "learning_rate": 2.5013828277587893e-05, + "lookahead_loss": 6.40700654888153, + "loss": 0.3159, + "step": 262000 + }, + { + "base_loss": 0.3002779276072979, + "epoch": 1.1192092895507812, + "grad_norm": 0.0009613982401788235, + "learning_rate": 2.496614456176758e-05, + "lookahead_loss": 6.420954690456391, + "loss": 0.3105, + "step": 262500 + }, + { + "base_loss": 0.3048044160306454, + "epoch": 2.0009536743164062, + "grad_norm": 0.0009599330369383097, + "learning_rate": 2.4918460845947267e-05, + "lookahead_loss": 6.447178890228272, + "loss": 0.3134, + "step": 263000 + }, + { + "base_loss": 0.2995053820014, + "epoch": 2.0019073486328125, + "grad_norm": 0.0009974995627999306, + "learning_rate": 2.4870777130126954e-05, + "lookahead_loss": 6.2957507429122925, + "loss": 0.313, + "step": 263500 + }, + { + "base_loss": 0.31198617857694627, + "epoch": 2.0028610229492188, + "grad_norm": 0.0010019242763519287, + "learning_rate": 2.482309341430664e-05, + "lookahead_loss": 6.277785212516784, + "loss": 0.3214, + "step": 264000 + }, + { + "base_loss": 0.32396442687511445, + "epoch": 2.003814697265625, + "grad_norm": 0.0009505018242634833, + "learning_rate": 2.477540969848633e-05, + "lookahead_loss": 6.31766603565216, + "loss": 0.3349, + "step": 264500 + }, + { + "base_loss": 0.3013957371413708, + "epoch": 2.0047683715820312, + "grad_norm": 0.0009469058131799102, + "learning_rate": 2.4727725982666018e-05, + "lookahead_loss": 6.30050235748291, + "loss": 0.3156, + "step": 265000 + }, + { + "epoch": 2.0047683715820312, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.39829970548709, + "eval_lookahead_perplexity": 600.8225935311413, + "eval_loss": 0.14192867279052734, + "eval_perplexity": 1.1524944413927996, + "eval_runtime": 494.1378, + "eval_samples_per_second": 10.119, + "eval_steps_per_second": 0.318, + "step": 265000 + }, + { + "base_loss": 0.3039788320362568, + "epoch": 2.0057220458984375, + "grad_norm": 0.000850850366987288, + "learning_rate": 2.4680042266845705e-05, + "lookahead_loss": 6.422514378547668, + "loss": 0.3127, + "step": 265500 + }, + { + "base_loss": 0.29717833909392355, + "epoch": 2.0066757202148438, + "grad_norm": 0.0009482078021392226, + "learning_rate": 2.463235855102539e-05, + "lookahead_loss": 6.272583706855774, + "loss": 0.3137, + "step": 266000 + }, + { + "base_loss": 0.31199148765206336, + "epoch": 2.00762939453125, + "grad_norm": 0.000996118295006454, + "learning_rate": 2.4584674835205078e-05, + "lookahead_loss": 6.3314825057983395, + "loss": 0.3235, + "step": 266500 + }, + { + "base_loss": 0.3148621036410332, + "epoch": 2.0085830688476562, + "grad_norm": 0.0009106769575737417, + "learning_rate": 2.453699111938477e-05, + "lookahead_loss": 6.323533965110779, + "loss": 0.3218, + "step": 267000 + }, + { + "base_loss": 0.30580521461367605, + "epoch": 2.0095367431640625, + "grad_norm": 0.000983801088295877, + "learning_rate": 2.4489307403564455e-05, + "lookahead_loss": 6.337914984703064, + "loss": 0.318, + "step": 267500 + }, + { + "base_loss": 0.3015244754254818, + "epoch": 2.0104904174804688, + "grad_norm": 0.0009343393030576408, + "learning_rate": 2.4441623687744142e-05, + "lookahead_loss": 6.310884699344635, + "loss": 0.3117, + "step": 268000 + }, + { + "base_loss": 0.30137019059062004, + "epoch": 2.011444091796875, + "grad_norm": 0.0010131921153515577, + "learning_rate": 2.439393997192383e-05, + "lookahead_loss": 6.322187620162964, + "loss": 0.3128, + "step": 268500 + }, + { + "base_loss": 0.3252628707587719, + "epoch": 2.0123977661132812, + "grad_norm": 0.0008827339042909443, + "learning_rate": 2.4346256256103516e-05, + "lookahead_loss": 6.31829994392395, + "loss": 0.3348, + "step": 269000 + }, + { + "base_loss": 0.30557073107361793, + "epoch": 2.0133514404296875, + "grad_norm": 0.00094022904522717, + "learning_rate": 2.4298572540283206e-05, + "lookahead_loss": 6.4020820398330684, + "loss": 0.3199, + "step": 269500 + }, + { + "base_loss": 0.30054079556465146, + "epoch": 2.0143051147460938, + "grad_norm": 0.0009659275528974831, + "learning_rate": 2.4250888824462893e-05, + "lookahead_loss": 6.335035899162293, + "loss": 0.3147, + "step": 270000 + }, + { + "epoch": 2.0143051147460938, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.392428995321353, + "eval_lookahead_perplexity": 597.3056717387095, + "eval_loss": 0.1419171392917633, + "eval_perplexity": 1.152481149176237, + "eval_runtime": 554.3174, + "eval_samples_per_second": 9.02, + "eval_steps_per_second": 0.283, + "step": 270000 + }, + { + "base_loss": 0.29648803743720054, + "epoch": 2.0152587890625, + "grad_norm": 0.0009011123329401016, + "learning_rate": 2.420320510864258e-05, + "lookahead_loss": 6.325328355789185, + "loss": 0.3068, + "step": 270500 + }, + { + "base_loss": 0.31412097451090815, + "epoch": 2.0162124633789062, + "grad_norm": 0.0009812801145017147, + "learning_rate": 2.4155521392822266e-05, + "lookahead_loss": 6.3500317144393925, + "loss": 0.3249, + "step": 271000 + }, + { + "base_loss": 0.3125672063827515, + "epoch": 2.0171661376953125, + "grad_norm": 0.000906952831428498, + "learning_rate": 2.4107837677001953e-05, + "lookahead_loss": 6.396524043083191, + "loss": 0.323, + "step": 271500 + }, + { + "base_loss": 0.3002317441105843, + "epoch": 2.0181198120117188, + "grad_norm": 0.0009032755624502897, + "learning_rate": 2.406015396118164e-05, + "lookahead_loss": 6.402642107963562, + "loss": 0.31, + "step": 272000 + }, + { + "base_loss": 0.29831535935401915, + "epoch": 2.019073486328125, + "grad_norm": 0.0009511581738479435, + "learning_rate": 2.401247024536133e-05, + "lookahead_loss": 6.4212296113967895, + "loss": 0.3098, + "step": 272500 + }, + { + "base_loss": 0.3020369653701782, + "epoch": 2.0200271606445312, + "grad_norm": 0.0010397924343124032, + "learning_rate": 2.3964786529541017e-05, + "lookahead_loss": 6.250913990974427, + "loss": 0.3146, + "step": 273000 + }, + { + "base_loss": 0.32652922403812407, + "epoch": 2.0209808349609375, + "grad_norm": 0.0009413071093149483, + "learning_rate": 2.3917102813720704e-05, + "lookahead_loss": 6.339208589553833, + "loss": 0.3385, + "step": 273500 + }, + { + "base_loss": 0.30453234216570857, + "epoch": 2.0219345092773438, + "grad_norm": 0.0009446305339224637, + "learning_rate": 2.386941909790039e-05, + "lookahead_loss": 6.305522140026093, + "loss": 0.3135, + "step": 274000 + }, + { + "base_loss": 0.2977458454966545, + "epoch": 2.02288818359375, + "grad_norm": 0.0009985527722164989, + "learning_rate": 2.3821735382080078e-05, + "lookahead_loss": 6.342605011940003, + "loss": 0.3114, + "step": 274500 + }, + { + "base_loss": 0.30405546057224275, + "epoch": 2.0238418579101562, + "grad_norm": 0.0009585677762515843, + "learning_rate": 2.3774051666259768e-05, + "lookahead_loss": 6.30675194978714, + "loss": 0.3139, + "step": 275000 + }, + { + "epoch": 2.0238418579101562, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.3858810156679, + "eval_lookahead_perplexity": 593.4073035000985, + "eval_loss": 0.14190448820590973, + "eval_perplexity": 1.1524665691305012, + "eval_runtime": 494.8539, + "eval_samples_per_second": 10.104, + "eval_steps_per_second": 0.317, + "step": 275000 + }, + { + "base_loss": 0.32463854083418847, + "epoch": 2.0247955322265625, + "grad_norm": 0.0009209871059283614, + "learning_rate": 2.3726367950439455e-05, + "lookahead_loss": 6.313430188179016, + "loss": 0.3352, + "step": 275500 + }, + { + "base_loss": 0.3075324648320675, + "epoch": 2.0257492065429688, + "grad_norm": 0.0009806884918361902, + "learning_rate": 2.367868423461914e-05, + "lookahead_loss": 6.26762502527237, + "loss": 0.3227, + "step": 276000 + }, + { + "base_loss": 0.30398501074314116, + "epoch": 2.026702880859375, + "grad_norm": 0.0009761953260749578, + "learning_rate": 2.3631000518798828e-05, + "lookahead_loss": 6.290206132888794, + "loss": 0.313, + "step": 276500 + }, + { + "base_loss": 0.3081837382018566, + "epoch": 2.0276565551757812, + "grad_norm": 0.0009468916687183082, + "learning_rate": 2.3583316802978515e-05, + "lookahead_loss": 6.399575003147126, + "loss": 0.318, + "step": 277000 + }, + { + "base_loss": 0.32895678067207335, + "epoch": 2.0286102294921875, + "grad_norm": 0.000959627446718514, + "learning_rate": 2.3535633087158205e-05, + "lookahead_loss": 6.428856110572815, + "loss": 0.3399, + "step": 277500 + }, + { + "base_loss": 0.30588172587752344, + "epoch": 2.0295639038085938, + "grad_norm": 0.0009464654722250998, + "learning_rate": 2.3487949371337892e-05, + "lookahead_loss": 6.353826072692871, + "loss": 0.3144, + "step": 278000 + }, + { + "base_loss": 0.3051903445720673, + "epoch": 2.030517578125, + "grad_norm": 0.0009384790319018066, + "learning_rate": 2.344026565551758e-05, + "lookahead_loss": 6.346502691268921, + "loss": 0.3164, + "step": 278500 + }, + { + "base_loss": 0.30346439191699026, + "epoch": 2.0314712524414062, + "grad_norm": 0.0009686793782748282, + "learning_rate": 2.3392581939697266e-05, + "lookahead_loss": 6.370296128749848, + "loss": 0.3152, + "step": 279000 + }, + { + "base_loss": 0.31795056411623956, + "epoch": 2.0324249267578125, + "grad_norm": 0.0009664087556302547, + "learning_rate": 2.3344898223876953e-05, + "lookahead_loss": 6.306786568641662, + "loss": 0.3345, + "step": 279500 + }, + { + "base_loss": 0.30795893451571466, + "epoch": 2.0333786010742188, + "grad_norm": 0.0009828972397372127, + "learning_rate": 2.3297214508056643e-05, + "lookahead_loss": 6.365034967899322, + "loss": 0.3175, + "step": 280000 + }, + { + "epoch": 2.0333786010742188, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.380661546993561, + "eval_lookahead_perplexity": 590.3181016776454, + "eval_loss": 0.1418943554162979, + "eval_perplexity": 1.1524548914883852, + "eval_runtime": 501.8161, + "eval_samples_per_second": 9.964, + "eval_steps_per_second": 0.313, + "step": 280000 + }, + { + "base_loss": 0.3031257001161575, + "epoch": 2.034332275390625, + "grad_norm": 0.0008994505042210221, + "learning_rate": 2.324953079223633e-05, + "lookahead_loss": 6.434097370147705, + "loss": 0.315, + "step": 280500 + }, + { + "base_loss": 0.31068781118094924, + "epoch": 2.0352859497070312, + "grad_norm": 0.0010203543351963162, + "learning_rate": 2.3201847076416016e-05, + "lookahead_loss": 6.271405048847199, + "loss": 0.3225, + "step": 281000 + }, + { + "base_loss": 0.32500979214906695, + "epoch": 2.0362396240234375, + "grad_norm": 0.000965456769336015, + "learning_rate": 2.3154163360595703e-05, + "lookahead_loss": 6.383410150051117, + "loss": 0.3366, + "step": 281500 + }, + { + "base_loss": 0.3069631262719631, + "epoch": 2.0371932983398438, + "grad_norm": 0.001005359343253076, + "learning_rate": 2.310647964477539e-05, + "lookahead_loss": 6.302391654968262, + "loss": 0.3172, + "step": 282000 + }, + { + "base_loss": 0.3025422422587872, + "epoch": 2.03814697265625, + "grad_norm": 0.0009587942040525377, + "learning_rate": 2.305879592895508e-05, + "lookahead_loss": 6.354145127296448, + "loss": 0.313, + "step": 282500 + }, + { + "base_loss": 0.3076345331072807, + "epoch": 2.0391006469726562, + "grad_norm": 0.0009680093498900533, + "learning_rate": 2.3011112213134767e-05, + "lookahead_loss": 6.302607263088226, + "loss": 0.3186, + "step": 283000 + }, + { + "base_loss": 0.3235399980545044, + "epoch": 2.0400543212890625, + "grad_norm": 0.0009408083860762417, + "learning_rate": 2.2963428497314454e-05, + "lookahead_loss": 6.342537101268769, + "loss": 0.3333, + "step": 283500 + }, + { + "base_loss": 0.30506757298111914, + "epoch": 2.0410079956054688, + "grad_norm": 0.0009745181305333972, + "learning_rate": 2.291574478149414e-05, + "lookahead_loss": 6.2780855388641355, + "loss": 0.3138, + "step": 284000 + }, + { + "base_loss": 0.29668092691898346, + "epoch": 2.041961669921875, + "grad_norm": 0.0009714270127005875, + "learning_rate": 2.2868061065673828e-05, + "lookahead_loss": 6.369159207820893, + "loss": 0.308, + "step": 284500 + }, + { + "base_loss": 0.30789948108792303, + "epoch": 2.0429153442382812, + "grad_norm": 0.000963637896347791, + "learning_rate": 2.2820377349853518e-05, + "lookahead_loss": 6.3689776067733765, + "loss": 0.3237, + "step": 285000 + }, + { + "epoch": 2.0429153442382812, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.37550598592423, + "eval_lookahead_perplexity": 587.2825124606885, + "eval_loss": 0.14188414812088013, + "eval_perplexity": 1.1524431281008884, + "eval_runtime": 480.4361, + "eval_samples_per_second": 10.407, + "eval_steps_per_second": 0.327, + "step": 285000 + }, + { + "base_loss": 0.3281280441880226, + "epoch": 2.0438690185546875, + "grad_norm": 0.000991139211691916, + "learning_rate": 2.2772693634033205e-05, + "lookahead_loss": 6.4065694217681886, + "loss": 0.3434, + "step": 285500 + }, + { + "base_loss": 0.2978555924296379, + "epoch": 2.0448226928710938, + "grad_norm": 0.0010299658169969916, + "learning_rate": 2.272500991821289e-05, + "lookahead_loss": 6.316880146980286, + "loss": 0.3102, + "step": 286000 + }, + { + "base_loss": 0.3044668311774731, + "epoch": 2.0457763671875, + "grad_norm": 0.0009648673003539443, + "learning_rate": 2.2677326202392578e-05, + "lookahead_loss": 6.345701354980469, + "loss": 0.3171, + "step": 286500 + }, + { + "base_loss": 0.3298782432973385, + "epoch": 2.0467300415039062, + "grad_norm": 0.0009219807107001543, + "learning_rate": 2.2629642486572265e-05, + "lookahead_loss": 6.301569487571716, + "loss": 0.3398, + "step": 287000 + }, + { + "base_loss": 0.32442897310853, + "epoch": 2.0476837158203125, + "grad_norm": 0.000999429146759212, + "learning_rate": 2.2581958770751955e-05, + "lookahead_loss": 6.352350478172302, + "loss": 0.3382, + "step": 287500 + }, + { + "base_loss": 0.2941350122392178, + "epoch": 2.0486373901367188, + "grad_norm": 0.0009221304790116847, + "learning_rate": 2.2534275054931642e-05, + "lookahead_loss": 6.310078239440918, + "loss": 0.3075, + "step": 288000 + }, + { + "base_loss": 0.301623804807663, + "epoch": 2.049591064453125, + "grad_norm": 0.0009881729492917657, + "learning_rate": 2.248659133911133e-05, + "lookahead_loss": 6.285659980773926, + "loss": 0.3143, + "step": 288500 + }, + { + "base_loss": 0.31965578559041025, + "epoch": 2.0505447387695312, + "grad_norm": 0.0008979432168416679, + "learning_rate": 2.2438907623291016e-05, + "lookahead_loss": 6.39150973701477, + "loss": 0.3318, + "step": 289000 + }, + { + "base_loss": 0.30511142282187936, + "epoch": 2.0514984130859375, + "grad_norm": 0.0009867299813777208, + "learning_rate": 2.2391223907470703e-05, + "lookahead_loss": 6.347462896347046, + "loss": 0.3185, + "step": 289500 + }, + { + "base_loss": 0.3033564644157887, + "epoch": 2.0524520874023438, + "grad_norm": 0.0010347136994823813, + "learning_rate": 2.2343540191650393e-05, + "lookahead_loss": 6.280826330661774, + "loss": 0.3157, + "step": 290000 + }, + { + "epoch": 2.0524520874023438, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.37110259662421, + "eval_lookahead_perplexity": 584.702164237485, + "eval_loss": 0.14187486469745636, + "eval_perplexity": 1.1524324295330182, + "eval_runtime": 497.5748, + "eval_samples_per_second": 10.049, + "eval_steps_per_second": 0.316, + "step": 290000 + }, + { + "base_loss": 0.32089028322696683, + "epoch": 2.05340576171875, + "grad_norm": 0.0009401792194694281, + "learning_rate": 2.229585647583008e-05, + "lookahead_loss": 6.298841968536377, + "loss": 0.3298, + "step": 290500 + }, + { + "base_loss": 0.35406574749946595, + "epoch": 2.0543594360351562, + "grad_norm": 0.0009749068995006382, + "learning_rate": 2.2248172760009766e-05, + "lookahead_loss": 6.336953705310822, + "loss": 0.369, + "step": 291000 + }, + { + "base_loss": 0.2938829956352711, + "epoch": 2.0553131103515625, + "grad_norm": 0.0009761652327142656, + "learning_rate": 2.2200489044189453e-05, + "lookahead_loss": 6.33252804851532, + "loss": 0.3061, + "step": 291500 + }, + { + "base_loss": 0.30498689064383505, + "epoch": 2.0562667846679688, + "grad_norm": 0.0009249149006791413, + "learning_rate": 2.215280532836914e-05, + "lookahead_loss": 6.340421550750732, + "loss": 0.3172, + "step": 292000 + }, + { + "base_loss": 0.317481600522995, + "epoch": 2.057220458984375, + "grad_norm": 0.0009565609507262707, + "learning_rate": 2.210512161254883e-05, + "lookahead_loss": 6.357920897006989, + "loss": 0.3308, + "step": 292500 + }, + { + "base_loss": 0.3179551683664322, + "epoch": 2.0581741333007812, + "grad_norm": 0.000965911487583071, + "learning_rate": 2.2057437896728517e-05, + "lookahead_loss": 6.357965930938721, + "loss": 0.329, + "step": 293000 + }, + { + "base_loss": 0.29271650505065916, + "epoch": 2.0591278076171875, + "grad_norm": 0.0009383960859850049, + "learning_rate": 2.2009754180908204e-05, + "lookahead_loss": 6.261647268295288, + "loss": 0.3064, + "step": 293500 + }, + { + "base_loss": 0.3039356949329376, + "epoch": 2.0600814819335938, + "grad_norm": 0.0009705057600513101, + "learning_rate": 2.196207046508789e-05, + "lookahead_loss": 6.3261529092788695, + "loss": 0.318, + "step": 294000 + }, + { + "base_loss": 0.32165152502059935, + "epoch": 2.06103515625, + "grad_norm": 0.0009664911776781082, + "learning_rate": 2.1914386749267578e-05, + "lookahead_loss": 6.293122010707855, + "loss": 0.3323, + "step": 294500 + }, + { + "base_loss": 0.3061283130943775, + "epoch": 2.0619888305664062, + "grad_norm": 0.0010313090169802308, + "learning_rate": 2.1866703033447268e-05, + "lookahead_loss": 6.3093594617843625, + "loss": 0.3162, + "step": 295000 + }, + { + "epoch": 2.0619888305664062, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.365444588585022, + "eval_lookahead_perplexity": 581.4032561167049, + "eval_loss": 0.14186474680900574, + "eval_perplexity": 1.1524207694092372, + "eval_runtime": 486.7335, + "eval_samples_per_second": 10.273, + "eval_steps_per_second": 0.323, + "step": 295000 + }, + { + "base_loss": 0.30640321379899976, + "epoch": 2.0629425048828125, + "grad_norm": 0.000986712984740734, + "learning_rate": 2.1819019317626955e-05, + "lookahead_loss": 6.2958876605033876, + "loss": 0.3168, + "step": 295500 + }, + { + "base_loss": 0.31782649287581444, + "epoch": 2.0638961791992188, + "grad_norm": 0.0009465559851378202, + "learning_rate": 2.177133560180664e-05, + "lookahead_loss": 6.329127202033996, + "loss": 0.33, + "step": 296000 + }, + { + "base_loss": 0.30349985790252687, + "epoch": 2.064849853515625, + "grad_norm": 0.0009652600274421275, + "learning_rate": 2.1723651885986328e-05, + "lookahead_loss": 6.3504975581169125, + "loss": 0.3182, + "step": 296500 + }, + { + "base_loss": 0.3075440634191036, + "epoch": 2.0658035278320312, + "grad_norm": 0.0009673352469690144, + "learning_rate": 2.1675968170166015e-05, + "lookahead_loss": 6.291652394771576, + "loss": 0.3201, + "step": 297000 + }, + { + "base_loss": 0.3064390652179718, + "epoch": 2.0667572021484375, + "grad_norm": 0.0009870745707303286, + "learning_rate": 2.1628284454345705e-05, + "lookahead_loss": 6.258386531829834, + "loss": 0.3162, + "step": 297500 + }, + { + "base_loss": 0.3303199237883091, + "epoch": 2.0677108764648438, + "grad_norm": 0.0010181849356740713, + "learning_rate": 2.1580600738525392e-05, + "lookahead_loss": 6.332426538467407, + "loss": 0.3403, + "step": 298000 + }, + { + "base_loss": 0.2994670196175575, + "epoch": 2.06866455078125, + "grad_norm": 0.00100489251781255, + "learning_rate": 2.153291702270508e-05, + "lookahead_loss": 6.305504947662354, + "loss": 0.3099, + "step": 298500 + }, + { + "base_loss": 0.3000359579175711, + "epoch": 2.0696182250976562, + "grad_norm": 0.0009632077999413013, + "learning_rate": 2.1485233306884766e-05, + "lookahead_loss": 6.336464623451233, + "loss": 0.3152, + "step": 299000 + }, + { + "base_loss": 0.34639680609107015, + "epoch": 2.0705718994140625, + "grad_norm": 0.000955465598963201, + "learning_rate": 2.1437549591064453e-05, + "lookahead_loss": 6.232782573699951, + "loss": 0.3584, + "step": 299500 + }, + { + "base_loss": 0.3132462115287781, + "epoch": 2.0715255737304688, + "grad_norm": 0.0009801997803151608, + "learning_rate": 2.1389865875244143e-05, + "lookahead_loss": 6.286718217372894, + "loss": 0.3238, + "step": 300000 + }, + { + "epoch": 2.0715255737304688, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.360287372296611, + "eval_lookahead_perplexity": 578.4125522560697, + "eval_loss": 0.14185494184494019, + "eval_perplexity": 1.1524094700204, + "eval_runtime": 491.2258, + "eval_samples_per_second": 10.179, + "eval_steps_per_second": 0.32, + "step": 300000 + }, + { + "base_loss": 0.3044133240580559, + "epoch": 1.0009536743164062, + "grad_norm": 0.00098008185159415, + "learning_rate": 2.134218215942383e-05, + "lookahead_loss": 6.387941931724549, + "loss": 0.313, + "step": 300500 + }, + { + "base_loss": 0.30059696701169014, + "epoch": 1.0019073486328125, + "grad_norm": 0.0009868093766272068, + "learning_rate": 2.1294498443603516e-05, + "lookahead_loss": 6.230354256629944, + "loss": 0.3129, + "step": 301000 + }, + { + "base_loss": 0.31169990518689156, + "epoch": 1.0028610229492188, + "grad_norm": 0.0009847109904512763, + "learning_rate": 2.1246814727783203e-05, + "lookahead_loss": 6.2152260408401485, + "loss": 0.3206, + "step": 301500 + }, + { + "base_loss": 0.3227726019620895, + "epoch": 1.003814697265625, + "grad_norm": 0.0009617835166864097, + "learning_rate": 2.119913101196289e-05, + "lookahead_loss": 6.249225090503693, + "loss": 0.3346, + "step": 302000 + }, + { + "base_loss": 0.3022470915019512, + "epoch": 1.0047683715820312, + "grad_norm": 0.0009369543986395001, + "learning_rate": 2.115144729614258e-05, + "lookahead_loss": 6.227331521987915, + "loss": 0.3164, + "step": 302500 + }, + { + "base_loss": 0.30552061820030213, + "epoch": 1.0057220458984375, + "grad_norm": 0.0008485575090162456, + "learning_rate": 2.1103763580322267e-05, + "lookahead_loss": 6.357279621124268, + "loss": 0.3131, + "step": 303000 + }, + { + "base_loss": 0.2953472335338593, + "epoch": 1.0066757202148438, + "grad_norm": 0.0009135955478996038, + "learning_rate": 2.1056079864501954e-05, + "lookahead_loss": 6.219304689407348, + "loss": 0.3136, + "step": 303500 + }, + { + "base_loss": 0.312746944963932, + "epoch": 1.00762939453125, + "grad_norm": 0.0009722797549329698, + "learning_rate": 2.100839614868164e-05, + "lookahead_loss": 6.2718313956260685, + "loss": 0.3233, + "step": 304000 + }, + { + "base_loss": 0.3169711889922619, + "epoch": 1.0085830688476562, + "grad_norm": 0.0009408054756931961, + "learning_rate": 2.0960712432861328e-05, + "lookahead_loss": 6.2603740873336795, + "loss": 0.3213, + "step": 304500 + }, + { + "base_loss": 0.306710629016161, + "epoch": 1.0095367431640625, + "grad_norm": 0.0009732133476063609, + "learning_rate": 2.0913028717041018e-05, + "lookahead_loss": 6.265056456089019, + "loss": 0.319, + "step": 305000 + }, + { + "epoch": 1.0095367431640625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.355427415987935, + "eval_lookahead_perplexity": 575.6083122850689, + "eval_loss": 0.14184564352035522, + "eval_perplexity": 1.1523987545929106, + "eval_runtime": 511.7814, + "eval_samples_per_second": 9.77, + "eval_steps_per_second": 0.307, + "step": 305000 + }, + { + "base_loss": 0.30083237382769584, + "epoch": 1.0104904174804688, + "grad_norm": 0.0009537778096273541, + "learning_rate": 2.0865345001220705e-05, + "lookahead_loss": 6.2422507305145265, + "loss": 0.3105, + "step": 305500 + }, + { + "base_loss": 0.2993237827420235, + "epoch": 1.011444091796875, + "grad_norm": 0.001014180015772581, + "learning_rate": 2.081766128540039e-05, + "lookahead_loss": 6.2589937620162965, + "loss": 0.3131, + "step": 306000 + }, + { + "base_loss": 0.3238567093908787, + "epoch": 1.0123977661132812, + "grad_norm": 0.0008796192123554647, + "learning_rate": 2.0769977569580078e-05, + "lookahead_loss": 6.240856420516968, + "loss": 0.3334, + "step": 306500 + }, + { + "base_loss": 0.3051931007504463, + "epoch": 1.0133514404296875, + "grad_norm": 0.0009358287206850946, + "learning_rate": 2.0722293853759765e-05, + "lookahead_loss": 6.343574766159057, + "loss": 0.3194, + "step": 307000 + }, + { + "base_loss": 0.29808008483052256, + "epoch": 1.0143051147460938, + "grad_norm": 0.0009350993204861879, + "learning_rate": 2.0674610137939455e-05, + "lookahead_loss": 6.2895770835876466, + "loss": 0.3128, + "step": 307500 + }, + { + "base_loss": 0.29345863962173463, + "epoch": 1.0152587890625, + "grad_norm": 0.0008895855862647295, + "learning_rate": 2.0626926422119142e-05, + "lookahead_loss": 6.227820379257202, + "loss": 0.3051, + "step": 308000 + }, + { + "base_loss": 0.3092884007692337, + "epoch": 1.0162124633789062, + "grad_norm": 0.0009875975083559752, + "learning_rate": 2.057924270629883e-05, + "lookahead_loss": 6.29535734128952, + "loss": 0.3207, + "step": 308500 + }, + { + "base_loss": 0.31143338218331335, + "epoch": 1.0171661376953125, + "grad_norm": 0.0008970174239948392, + "learning_rate": 2.0531558990478516e-05, + "lookahead_loss": 6.331631092071533, + "loss": 0.3226, + "step": 309000 + }, + { + "base_loss": 0.3001442384421825, + "epoch": 1.0181198120117188, + "grad_norm": 0.0009284500265493989, + "learning_rate": 2.0483875274658203e-05, + "lookahead_loss": 6.334627725601196, + "loss": 0.3111, + "step": 309500 + }, + { + "base_loss": 0.2986592257618904, + "epoch": 1.019073486328125, + "grad_norm": 0.0009415113599970937, + "learning_rate": 2.0436191558837893e-05, + "lookahead_loss": 6.363989944458008, + "loss": 0.3108, + "step": 310000 + }, + { + "epoch": 1.019073486328125, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.350594499240668, + "eval_lookahead_perplexity": 572.8331566823812, + "eval_loss": 0.14183586835861206, + "eval_perplexity": 1.1523874897637498, + "eval_runtime": 523.8891, + "eval_samples_per_second": 9.544, + "eval_steps_per_second": 0.3, + "step": 310000 + }, + { + "base_loss": 0.30347599306702616, + "epoch": 1.0200271606445312, + "grad_norm": 0.0010443053906783462, + "learning_rate": 2.038850784301758e-05, + "lookahead_loss": 6.206486799240112, + "loss": 0.3149, + "step": 310500 + }, + { + "base_loss": 0.3299741225540638, + "epoch": 1.0209808349609375, + "grad_norm": 0.0009351633489131927, + "learning_rate": 2.0340824127197266e-05, + "lookahead_loss": 6.298027420997619, + "loss": 0.3402, + "step": 311000 + }, + { + "base_loss": 0.3070560489296913, + "epoch": 1.0219345092773438, + "grad_norm": 0.0009786691516637802, + "learning_rate": 2.0293140411376953e-05, + "lookahead_loss": 6.2451701836586, + "loss": 0.3161, + "step": 311500 + }, + { + "base_loss": 0.301061170309782, + "epoch": 1.02288818359375, + "grad_norm": 0.0009934369008988142, + "learning_rate": 2.024545669555664e-05, + "lookahead_loss": 6.286135159492493, + "loss": 0.3131, + "step": 312000 + }, + { + "base_loss": 0.30337609922885894, + "epoch": 1.0238418579101562, + "grad_norm": 0.0009733522310853004, + "learning_rate": 2.019777297973633e-05, + "lookahead_loss": 6.243100929737091, + "loss": 0.3142, + "step": 312500 + }, + { + "base_loss": 0.3241444931924343, + "epoch": 1.0247955322265625, + "grad_norm": 0.0009022055310197175, + "learning_rate": 2.0150089263916017e-05, + "lookahead_loss": 6.256959664344787, + "loss": 0.3359, + "step": 313000 + }, + { + "base_loss": 0.3070600248277187, + "epoch": 1.0257492065429688, + "grad_norm": 0.0009672227897681296, + "learning_rate": 2.0102405548095704e-05, + "lookahead_loss": 6.207882607460022, + "loss": 0.321, + "step": 313500 + }, + { + "base_loss": 0.3022406686246395, + "epoch": 1.026702880859375, + "grad_norm": 0.0010004551149904728, + "learning_rate": 2.005472183227539e-05, + "lookahead_loss": 6.240452417850494, + "loss": 0.3108, + "step": 314000 + }, + { + "base_loss": 0.30680677881836893, + "epoch": 1.0276565551757812, + "grad_norm": 0.000957614800427109, + "learning_rate": 2.0007038116455078e-05, + "lookahead_loss": 6.346712133407593, + "loss": 0.3187, + "step": 314500 + }, + { + "base_loss": 0.33426042160391806, + "epoch": 1.0286102294921875, + "grad_norm": 0.0009514397825114429, + "learning_rate": 1.9959354400634768e-05, + "lookahead_loss": 6.367521642208099, + "loss": 0.3424, + "step": 315000 + }, + { + "epoch": 1.0286102294921875, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.346073081318182, + "eval_lookahead_perplexity": 570.2489850435419, + "eval_loss": 0.14182710647583008, + "eval_perplexity": 1.1523773927238796, + "eval_runtime": 488.1256, + "eval_samples_per_second": 10.243, + "eval_steps_per_second": 0.322, + "step": 315000 + }, + { + "base_loss": 0.3049518305659294, + "epoch": 1.0295639038085938, + "grad_norm": 0.0009535288554616272, + "learning_rate": 1.9911670684814455e-05, + "lookahead_loss": 6.311778125762939, + "loss": 0.313, + "step": 315500 + }, + { + "base_loss": 0.3062360401749611, + "epoch": 1.030517578125, + "grad_norm": 0.0009322623955085874, + "learning_rate": 1.986398696899414e-05, + "lookahead_loss": 6.283089350700378, + "loss": 0.3165, + "step": 316000 + }, + { + "base_loss": 0.30225355681777, + "epoch": 1.0314712524414062, + "grad_norm": 0.0009594020084477961, + "learning_rate": 1.9816303253173828e-05, + "lookahead_loss": 6.316081517219543, + "loss": 0.3137, + "step": 316500 + }, + { + "base_loss": 0.3184074863195419, + "epoch": 1.0324249267578125, + "grad_norm": 0.0009672947344370186, + "learning_rate": 1.9768619537353515e-05, + "lookahead_loss": 6.272526082992553, + "loss": 0.3352, + "step": 317000 + }, + { + "base_loss": 0.30629492220282556, + "epoch": 1.0333786010742188, + "grad_norm": 0.0010004861978814006, + "learning_rate": 1.9720935821533205e-05, + "lookahead_loss": 6.310171216011048, + "loss": 0.3156, + "step": 317500 + }, + { + "base_loss": 0.3031555346250534, + "epoch": 1.034332275390625, + "grad_norm": 0.0008947703754529357, + "learning_rate": 1.9673252105712892e-05, + "lookahead_loss": 6.376363782882691, + "loss": 0.3142, + "step": 318000 + }, + { + "base_loss": 0.31164542263746264, + "epoch": 1.0352859497070312, + "grad_norm": 0.0010116840712726116, + "learning_rate": 1.962556838989258e-05, + "lookahead_loss": 6.206376674175263, + "loss": 0.3225, + "step": 318500 + }, + { + "base_loss": 0.324304408878088, + "epoch": 1.0362396240234375, + "grad_norm": 0.0009698173380456865, + "learning_rate": 1.9577884674072266e-05, + "lookahead_loss": 6.3223733234405515, + "loss": 0.3355, + "step": 319000 + }, + { + "base_loss": 0.30813179594278334, + "epoch": 1.0371932983398438, + "grad_norm": 0.0009873044909909368, + "learning_rate": 1.9530200958251953e-05, + "lookahead_loss": 6.256875138282776, + "loss": 0.3197, + "step": 319500 + }, + { + "base_loss": 0.30138176554441454, + "epoch": 1.03814697265625, + "grad_norm": 0.0009513625991530716, + "learning_rate": 1.9482517242431643e-05, + "lookahead_loss": 6.290307790756225, + "loss": 0.3137, + "step": 320000 + }, + { + "epoch": 1.03814697265625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.3419040491024905, + "eval_lookahead_perplexity": 567.876547474501, + "eval_loss": 0.14181947708129883, + "eval_perplexity": 1.1523686008156402, + "eval_runtime": 489.5987, + "eval_samples_per_second": 10.212, + "eval_steps_per_second": 0.321, + "step": 320000 + }, + { + "base_loss": 0.30871694785356524, + "epoch": 1.0391006469726562, + "grad_norm": 0.0009404685697518289, + "learning_rate": 1.943483352661133e-05, + "lookahead_loss": 6.250073432922363, + "loss": 0.3203, + "step": 320500 + }, + { + "base_loss": 0.32506244936585427, + "epoch": 1.0400543212890625, + "grad_norm": 0.0009284181869588792, + "learning_rate": 1.9387149810791016e-05, + "lookahead_loss": 6.279073251724244, + "loss": 0.3338, + "step": 321000 + }, + { + "base_loss": 0.30769926142692566, + "epoch": 1.0410079956054688, + "grad_norm": 0.0009764406131580472, + "learning_rate": 1.9339466094970703e-05, + "lookahead_loss": 6.219414553642273, + "loss": 0.3165, + "step": 321500 + }, + { + "base_loss": 0.29858891409635546, + "epoch": 1.041961669921875, + "grad_norm": 0.0009733131737448275, + "learning_rate": 1.929178237915039e-05, + "lookahead_loss": 6.308469578266144, + "loss": 0.3097, + "step": 322000 + }, + { + "base_loss": 0.3094627737402916, + "epoch": 1.0429153442382812, + "grad_norm": 0.0009679461945779622, + "learning_rate": 1.924409866333008e-05, + "lookahead_loss": 6.322273569583893, + "loss": 0.3243, + "step": 322500 + }, + { + "base_loss": 0.32764697542786597, + "epoch": 1.0438690185546875, + "grad_norm": 0.000978301279246807, + "learning_rate": 1.9196414947509767e-05, + "lookahead_loss": 6.351773486614228, + "loss": 0.3406, + "step": 323000 + }, + { + "base_loss": 0.29553532418608663, + "epoch": 1.0448226928710938, + "grad_norm": 0.0009794612415134907, + "learning_rate": 1.9148731231689454e-05, + "lookahead_loss": 6.261120037555695, + "loss": 0.3087, + "step": 323500 + }, + { + "base_loss": 0.3041268612146378, + "epoch": 1.0457763671875, + "grad_norm": 0.0009413088555447757, + "learning_rate": 1.910104751586914e-05, + "lookahead_loss": 6.282316081047058, + "loss": 0.3152, + "step": 324000 + }, + { + "base_loss": 0.3303914776444435, + "epoch": 1.0467300415039062, + "grad_norm": 0.0009337280644103885, + "learning_rate": 1.9053363800048828e-05, + "lookahead_loss": 6.249731389045715, + "loss": 0.3395, + "step": 324500 + }, + { + "base_loss": 0.3248122656941414, + "epoch": 1.0476837158203125, + "grad_norm": 0.0010242167627438903, + "learning_rate": 1.9005680084228518e-05, + "lookahead_loss": 6.29884423494339, + "loss": 0.3389, + "step": 325000 + }, + { + "epoch": 1.0476837158203125, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.338128092951668, + "eval_lookahead_perplexity": 565.7363137893566, + "eval_loss": 0.14181146025657654, + "eval_perplexity": 1.152359362515583, + "eval_runtime": 491.4419, + "eval_samples_per_second": 10.174, + "eval_steps_per_second": 0.319, + "step": 325000 + }, + { + "base_loss": 0.2969100174307823, + "epoch": 1.0486373901367188, + "grad_norm": 0.0009389603510499001, + "learning_rate": 1.8957996368408205e-05, + "lookahead_loss": 6.244260203361511, + "loss": 0.3085, + "step": 325500 + }, + { + "base_loss": 0.302005185931921, + "epoch": 1.049591064453125, + "grad_norm": 0.0009924854384735227, + "learning_rate": 1.891031265258789e-05, + "lookahead_loss": 6.2381826696395875, + "loss": 0.3161, + "step": 326000 + }, + { + "base_loss": 0.31874441370368006, + "epoch": 1.0505447387695312, + "grad_norm": 0.0008907459559850395, + "learning_rate": 1.8862628936767578e-05, + "lookahead_loss": 6.345169459342957, + "loss": 0.331, + "step": 326500 + }, + { + "base_loss": 0.30408672893047334, + "epoch": 1.0514984130859375, + "grad_norm": 0.0009830091148614883, + "learning_rate": 1.8814945220947265e-05, + "lookahead_loss": 6.287209135055542, + "loss": 0.3181, + "step": 327000 + }, + { + "base_loss": 0.3058005510568619, + "epoch": 1.0524520874023438, + "grad_norm": 0.0010249796323478222, + "learning_rate": 1.8767261505126955e-05, + "lookahead_loss": 6.2355098533630375, + "loss": 0.3179, + "step": 327500 + }, + { + "base_loss": 0.32026463899016383, + "epoch": 1.05340576171875, + "grad_norm": 0.0009288426954299212, + "learning_rate": 1.8719577789306642e-05, + "lookahead_loss": 6.250459560394287, + "loss": 0.3291, + "step": 328000 + }, + { + "base_loss": 0.35889338579773905, + "epoch": 1.0543594360351562, + "grad_norm": 0.0009575394215062261, + "learning_rate": 1.867189407348633e-05, + "lookahead_loss": 6.286208940505982, + "loss": 0.3707, + "step": 328500 + }, + { + "base_loss": 0.29546374672651293, + "epoch": 1.0553131103515625, + "grad_norm": 0.0009593223221600056, + "learning_rate": 1.8624210357666016e-05, + "lookahead_loss": 6.2649248747825625, + "loss": 0.306, + "step": 329000 + }, + { + "base_loss": 0.3063408683240414, + "epoch": 1.0562667846679688, + "grad_norm": 0.0009331299806945026, + "learning_rate": 1.8576526641845703e-05, + "lookahead_loss": 6.290005283355713, + "loss": 0.3174, + "step": 329500 + }, + { + "base_loss": 0.3186078954935074, + "epoch": 1.057220458984375, + "grad_norm": 0.0009277429198846221, + "learning_rate": 1.8528842926025393e-05, + "lookahead_loss": 6.308471469879151, + "loss": 0.3313, + "step": 330000 + }, + { + "epoch": 1.057220458984375, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.33419472797991, + "eval_lookahead_perplexity": 563.5154370122383, + "eval_loss": 0.1418038159608841, + "eval_perplexity": 1.152350553573541, + "eval_runtime": 475.9845, + "eval_samples_per_second": 10.505, + "eval_steps_per_second": 0.33, + "step": 330000 + }, + { + "base_loss": 0.31768571627140046, + "epoch": 1.0581741333007812, + "grad_norm": 0.0009722410468384624, + "learning_rate": 1.848115921020508e-05, + "lookahead_loss": 6.300428405284881, + "loss": 0.3266, + "step": 330500 + }, + { + "base_loss": 0.2922684009075165, + "epoch": 1.0591278076171875, + "grad_norm": 0.0009333764901384711, + "learning_rate": 1.8433475494384766e-05, + "lookahead_loss": 6.21525590801239, + "loss": 0.3077, + "step": 331000 + }, + { + "base_loss": 0.30112267237901685, + "epoch": 1.0600814819335938, + "grad_norm": 0.0009552223491482437, + "learning_rate": 1.8385791778564453e-05, + "lookahead_loss": 6.275212284564972, + "loss": 0.3148, + "step": 331500 + }, + { + "base_loss": 0.32029621145129206, + "epoch": 1.06103515625, + "grad_norm": 0.0009804009459912777, + "learning_rate": 1.833810806274414e-05, + "lookahead_loss": 6.241409729480743, + "loss": 0.331, + "step": 332000 + }, + { + "base_loss": 0.30533574494719506, + "epoch": 1.0619888305664062, + "grad_norm": 0.0010112527525052428, + "learning_rate": 1.829042434692383e-05, + "lookahead_loss": 6.244034860610962, + "loss": 0.3149, + "step": 332500 + }, + { + "base_loss": 0.30571810373663905, + "epoch": 1.0629425048828125, + "grad_norm": 0.0010262149153277278, + "learning_rate": 1.8242740631103517e-05, + "lookahead_loss": 6.248726521492005, + "loss": 0.3165, + "step": 333000 + }, + { + "base_loss": 0.31451627737283705, + "epoch": 1.0638961791992188, + "grad_norm": 0.0009512413525953889, + "learning_rate": 1.8195056915283204e-05, + "lookahead_loss": 6.2796635246276855, + "loss": 0.3278, + "step": 333500 + }, + { + "base_loss": 0.30425655883550645, + "epoch": 1.064849853515625, + "grad_norm": 0.0009613555739633739, + "learning_rate": 1.814737319946289e-05, + "lookahead_loss": 6.302466047286988, + "loss": 0.3173, + "step": 334000 + }, + { + "base_loss": 0.31105126801133154, + "epoch": 1.0658035278320312, + "grad_norm": 0.0009652305161580443, + "learning_rate": 1.8099689483642578e-05, + "lookahead_loss": 6.216504365444183, + "loss": 0.3208, + "step": 334500 + }, + { + "base_loss": 0.3071163959801197, + "epoch": 1.0667572021484375, + "grad_norm": 0.0009779701940715313, + "learning_rate": 1.8052005767822268e-05, + "lookahead_loss": 6.2020517086982725, + "loss": 0.3163, + "step": 335000 + }, + { + "epoch": 1.0667572021484375, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.330107611208297, + "eval_lookahead_perplexity": 561.2169838413607, + "eval_loss": 0.14179642498493195, + "eval_perplexity": 1.1523420366097856, + "eval_runtime": 486.3873, + "eval_samples_per_second": 10.28, + "eval_steps_per_second": 0.323, + "step": 335000 + }, + { + "base_loss": 0.3321034919023514, + "epoch": 1.0677108764648438, + "grad_norm": 0.001005273894406855, + "learning_rate": 1.8004322052001955e-05, + "lookahead_loss": 6.285574303627014, + "loss": 0.3426, + "step": 335500 + }, + { + "base_loss": 0.3017843673825264, + "epoch": 1.06866455078125, + "grad_norm": 0.0009829438058659434, + "learning_rate": 1.795663833618164e-05, + "lookahead_loss": 6.258272046089172, + "loss": 0.3101, + "step": 336000 + }, + { + "base_loss": 0.302195555627346, + "epoch": 1.0696182250976562, + "grad_norm": 0.0009696283377707005, + "learning_rate": 1.7908954620361328e-05, + "lookahead_loss": 6.2891021289825435, + "loss": 0.3161, + "step": 336500 + }, + { + "base_loss": 0.3459869565963745, + "epoch": 1.0705718994140625, + "grad_norm": 0.0009371968917548656, + "learning_rate": 1.7861270904541015e-05, + "lookahead_loss": 6.187218756198883, + "loss": 0.3598, + "step": 337000 + }, + { + "base_loss": 0.3151495299339294, + "epoch": 1.0715255737304688, + "grad_norm": 0.0009671805892139673, + "learning_rate": 1.7813587188720705e-05, + "lookahead_loss": 6.237253646850586, + "loss": 0.3259, + "step": 337500 + }, + { + "base_loss": 0.30790447345376015, + "epoch": 1.072479248046875, + "grad_norm": 0.0010147603461518884, + "learning_rate": 1.7765903472900392e-05, + "lookahead_loss": 6.318773533821106, + "loss": 0.3201, + "step": 338000 + }, + { + "base_loss": 0.30545566940307617, + "epoch": 1.0734329223632812, + "grad_norm": 0.0009206641116179526, + "learning_rate": 1.771821975708008e-05, + "lookahead_loss": 6.285214097976684, + "loss": 0.3183, + "step": 338500 + }, + { + "base_loss": 0.32841417971253395, + "epoch": 1.0743865966796875, + "grad_norm": 0.0009649362764321268, + "learning_rate": 1.7670536041259766e-05, + "lookahead_loss": 6.303198455810547, + "loss": 0.3403, + "step": 339000 + }, + { + "base_loss": 0.30363579127192497, + "epoch": 1.0753402709960938, + "grad_norm": 0.0009605617960914969, + "learning_rate": 1.7622852325439453e-05, + "lookahead_loss": 6.3648786582946775, + "loss": 0.3156, + "step": 339500 + }, + { + "base_loss": 0.30504586565494535, + "epoch": 1.0762939453125, + "grad_norm": 0.000925544067285955, + "learning_rate": 1.7575168609619143e-05, + "lookahead_loss": 6.328798898696899, + "loss": 0.3177, + "step": 340000 + }, + { + "epoch": 1.0762939453125, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.325847924326936, + "eval_lookahead_perplexity": 558.8314596179115, + "eval_loss": 0.14178849756717682, + "eval_perplexity": 1.1523329015492734, + "eval_runtime": 487.3002, + "eval_samples_per_second": 10.261, + "eval_steps_per_second": 0.322, + "step": 340000 + }, + { + "base_loss": 0.33029799509048463, + "epoch": 1.0772476196289062, + "grad_norm": 0.0009759070817381144, + "learning_rate": 1.752748489379883e-05, + "lookahead_loss": 6.315556499004364, + "loss": 0.345, + "step": 340500 + }, + { + "base_loss": 0.3037954642176628, + "epoch": 1.0782012939453125, + "grad_norm": 0.0009659862844273448, + "learning_rate": 1.7479801177978516e-05, + "lookahead_loss": 6.325719955921173, + "loss": 0.3164, + "step": 341000 + }, + { + "base_loss": 0.29821320512890814, + "epoch": 1.0791549682617188, + "grad_norm": 0.0009346814476884902, + "learning_rate": 1.7432117462158203e-05, + "lookahead_loss": 6.323197066307068, + "loss": 0.3126, + "step": 341500 + }, + { + "base_loss": 0.3142137563228607, + "epoch": 1.080108642578125, + "grad_norm": 0.0009469282813370228, + "learning_rate": 1.738443374633789e-05, + "lookahead_loss": 6.2899098715782165, + "loss": 0.33, + "step": 342000 + }, + { + "base_loss": 0.3222310249209404, + "epoch": 1.0810623168945312, + "grad_norm": 0.0009950444800779223, + "learning_rate": 1.733675003051758e-05, + "lookahead_loss": 6.328862300872803, + "loss": 0.3389, + "step": 342500 + }, + { + "base_loss": 0.3002626436650753, + "epoch": 1.0820159912109375, + "grad_norm": 0.000903125386685133, + "learning_rate": 1.7289066314697267e-05, + "lookahead_loss": 6.32488419675827, + "loss": 0.312, + "step": 343000 + }, + { + "base_loss": 0.3045452245473862, + "epoch": 1.0829696655273438, + "grad_norm": 0.0009882268495857716, + "learning_rate": 1.7241382598876954e-05, + "lookahead_loss": 6.3639033141136165, + "loss": 0.3182, + "step": 343500 + }, + { + "base_loss": 0.33469617655873296, + "epoch": 1.08392333984375, + "grad_norm": 0.0009396182140335441, + "learning_rate": 1.719369888305664e-05, + "lookahead_loss": 6.374470998764038, + "loss": 0.3454, + "step": 344000 + }, + { + "base_loss": 0.30740025800466536, + "epoch": 1.0848770141601562, + "grad_norm": 0.0009506403002887964, + "learning_rate": 1.7146015167236328e-05, + "lookahead_loss": 6.291221611976623, + "loss": 0.3187, + "step": 344500 + }, + { + "base_loss": 0.300477741509676, + "epoch": 1.0858306884765625, + "grad_norm": 0.0010049225529655814, + "learning_rate": 1.7098331451416018e-05, + "lookahead_loss": 6.287293316841126, + "loss": 0.3096, + "step": 345000 + }, + { + "epoch": 1.0858306884765625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.321735369130826, + "eval_lookahead_perplexity": 556.5379537122502, + "eval_loss": 0.1417810320854187, + "eval_perplexity": 1.152324298861129, + "eval_runtime": 481.7792, + "eval_samples_per_second": 10.378, + "eval_steps_per_second": 0.326, + "step": 345000 + }, + { + "base_loss": 0.301901563256979, + "epoch": 1.0867843627929688, + "grad_norm": 0.0011207083007320762, + "learning_rate": 1.7050647735595705e-05, + "lookahead_loss": 6.266433288574219, + "loss": 0.3126, + "step": 345500 + }, + { + "base_loss": 0.338142231285572, + "epoch": 1.087738037109375, + "grad_norm": 0.0008945625741034746, + "learning_rate": 1.700296401977539e-05, + "lookahead_loss": 6.325605587482452, + "loss": 0.3448, + "step": 346000 + }, + { + "base_loss": 0.3009798979461193, + "epoch": 1.0886917114257812, + "grad_norm": 0.00098209991119802, + "learning_rate": 1.6955280303955078e-05, + "lookahead_loss": 6.302682322978973, + "loss": 0.3116, + "step": 346500 + }, + { + "base_loss": 0.3090392453968525, + "epoch": 1.0896453857421875, + "grad_norm": 0.0010011927224695683, + "learning_rate": 1.6907596588134765e-05, + "lookahead_loss": 6.335950228691101, + "loss": 0.3177, + "step": 347000 + }, + { + "base_loss": 0.30036539113521576, + "epoch": 1.0905990600585938, + "grad_norm": 0.0009704971453174949, + "learning_rate": 1.6859912872314455e-05, + "lookahead_loss": 6.307251618385315, + "loss": 0.3117, + "step": 347500 + }, + { + "base_loss": 0.3006012495756149, + "epoch": 1.091552734375, + "grad_norm": 0.0008963792352005839, + "learning_rate": 1.6812229156494142e-05, + "lookahead_loss": 6.279744626045227, + "loss": 0.3106, + "step": 348000 + }, + { + "base_loss": 0.31875682109594344, + "epoch": 1.0925064086914062, + "grad_norm": 0.0009249325376003981, + "learning_rate": 1.676454544067383e-05, + "lookahead_loss": 6.309427938461304, + "loss": 0.3349, + "step": 348500 + }, + { + "base_loss": 0.3104289738535881, + "epoch": 1.0934600830078125, + "grad_norm": 0.0009198809857480228, + "learning_rate": 1.6716861724853516e-05, + "lookahead_loss": 6.319187465667724, + "loss": 0.3213, + "step": 349000 + }, + { + "base_loss": 0.2877590928971767, + "epoch": 1.0944137573242188, + "grad_norm": 0.0009716861532069743, + "learning_rate": 1.6669178009033203e-05, + "lookahead_loss": 6.317689159870148, + "loss": 0.3013, + "step": 349500 + }, + { + "base_loss": 0.2935507807135582, + "epoch": 1.095367431640625, + "grad_norm": 0.000957997574005276, + "learning_rate": 1.6621494293212893e-05, + "lookahead_loss": 6.240253508090973, + "loss": 0.3069, + "step": 350000 + }, + { + "epoch": 1.095367431640625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.31877008109047, + "eval_lookahead_perplexity": 554.8901027579203, + "eval_loss": 0.14177444577217102, + "eval_perplexity": 1.1523167093173274, + "eval_runtime": 497.5282, + "eval_samples_per_second": 10.05, + "eval_steps_per_second": 0.316, + "step": 350000 + }, + { + "base_loss": 0.2986202912926674, + "epoch": 1.0963211059570312, + "grad_norm": 0.0009497444843873382, + "learning_rate": 1.657381057739258e-05, + "lookahead_loss": 6.313019587516784, + "loss": 0.3117, + "step": 350500 + }, + { + "base_loss": 0.3307524161040783, + "epoch": 1.0972747802734375, + "grad_norm": 0.000984227517619729, + "learning_rate": 1.6526126861572266e-05, + "lookahead_loss": 6.333485441207886, + "loss": 0.3412, + "step": 351000 + }, + { + "base_loss": 0.29244673988223074, + "epoch": 1.0982284545898438, + "grad_norm": 0.0009845261229202151, + "learning_rate": 1.6478443145751953e-05, + "lookahead_loss": 6.272432360649109, + "loss": 0.3068, + "step": 351500 + }, + { + "base_loss": 0.295786843508482, + "epoch": 1.09918212890625, + "grad_norm": 0.0009755997452884912, + "learning_rate": 1.643075942993164e-05, + "lookahead_loss": 6.323694842338562, + "loss": 0.3106, + "step": 352000 + }, + { + "base_loss": 0.30293611577153207, + "epoch": 1.1001358032226562, + "grad_norm": 0.0009190894779749215, + "learning_rate": 1.638307571411133e-05, + "lookahead_loss": 6.320311902999878, + "loss": 0.3147, + "step": 352500 + }, + { + "base_loss": 0.3240869597494602, + "epoch": 1.1010894775390625, + "grad_norm": 0.0009462400339543819, + "learning_rate": 1.6335391998291017e-05, + "lookahead_loss": 6.3840909061431885, + "loss": 0.3335, + "step": 353000 + }, + { + "base_loss": 0.30599541807174685, + "epoch": 1.1020431518554688, + "grad_norm": 0.00095773657085374, + "learning_rate": 1.6287708282470704e-05, + "lookahead_loss": 6.332640226364136, + "loss": 0.3148, + "step": 353500 + }, + { + "base_loss": 0.2991089904308319, + "epoch": 1.102996826171875, + "grad_norm": 0.0009572218987159431, + "learning_rate": 1.624002456665039e-05, + "lookahead_loss": 6.303247368812561, + "loss": 0.3135, + "step": 354000 + }, + { + "base_loss": 0.30219315418601034, + "epoch": 1.1039505004882812, + "grad_norm": 0.0009981011971831322, + "learning_rate": 1.6192340850830078e-05, + "lookahead_loss": 6.34359606218338, + "loss": 0.3121, + "step": 354500 + }, + { + "base_loss": 0.3133500624895096, + "epoch": 1.1049041748046875, + "grad_norm": 0.0009674631292000413, + "learning_rate": 1.6144657135009768e-05, + "lookahead_loss": 6.2734492511749265, + "loss": 0.3267, + "step": 355000 + }, + { + "epoch": 1.1049041748046875, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.315534021146, + "eval_lookahead_perplexity": 553.0973474191636, + "eval_loss": 0.14176791906356812, + "eval_perplexity": 1.1523091885064907, + "eval_runtime": 478.2434, + "eval_samples_per_second": 10.455, + "eval_steps_per_second": 0.328, + "step": 355000 + }, + { + "base_loss": 0.3070584389269352, + "epoch": 1.1058578491210938, + "grad_norm": 0.0009178784675896168, + "learning_rate": 1.6096973419189455e-05, + "lookahead_loss": 6.277120381355286, + "loss": 0.3207, + "step": 355500 + }, + { + "base_loss": 0.29948241996765135, + "epoch": 1.1068115234375, + "grad_norm": 0.0009539983002468944, + "learning_rate": 1.604928970336914e-05, + "lookahead_loss": 6.351998689651489, + "loss": 0.3093, + "step": 356000 + }, + { + "base_loss": 0.29492466670274736, + "epoch": 1.1077651977539062, + "grad_norm": 0.0009671118459664285, + "learning_rate": 1.6001605987548828e-05, + "lookahead_loss": 6.269609031677246, + "loss": 0.3078, + "step": 356500 + }, + { + "base_loss": 0.3188383647501469, + "epoch": 1.1087188720703125, + "grad_norm": 0.0010139980586245656, + "learning_rate": 1.5953922271728515e-05, + "lookahead_loss": 6.268310216903687, + "loss": 0.331, + "step": 357000 + }, + { + "base_loss": 0.31659464621543887, + "epoch": 1.1096725463867188, + "grad_norm": 0.0009025723556987941, + "learning_rate": 1.5906238555908205e-05, + "lookahead_loss": 6.234257357120514, + "loss": 0.3257, + "step": 357500 + }, + { + "base_loss": 0.3013280538916588, + "epoch": 1.110626220703125, + "grad_norm": 0.0008754499140195549, + "learning_rate": 1.5858554840087892e-05, + "lookahead_loss": 6.1902008948326115, + "loss": 0.3136, + "step": 358000 + }, + { + "base_loss": 0.29822684854269027, + "epoch": 1.1115798950195312, + "grad_norm": 0.0009994573192670941, + "learning_rate": 1.581087112426758e-05, + "lookahead_loss": 6.32253562450409, + "loss": 0.3084, + "step": 358500 + }, + { + "base_loss": 0.3082665235698223, + "epoch": 1.1125335693359375, + "grad_norm": 0.0009301855461671948, + "learning_rate": 1.5763187408447266e-05, + "lookahead_loss": 6.341563850402832, + "loss": 0.3206, + "step": 359000 + }, + { + "base_loss": 0.34342152199149134, + "epoch": 1.1134872436523438, + "grad_norm": 0.0009999109897762537, + "learning_rate": 1.5715503692626953e-05, + "lookahead_loss": 6.352512986660003, + "loss": 0.3498, + "step": 359500 + }, + { + "base_loss": 0.29527223294973376, + "epoch": 1.11444091796875, + "grad_norm": 0.0009466528426855803, + "learning_rate": 1.5667819976806643e-05, + "lookahead_loss": 6.224033700942993, + "loss": 0.3083, + "step": 360000 + }, + { + "epoch": 1.11444091796875, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.312419369578742, + "eval_lookahead_perplexity": 551.3773219297262, + "eval_loss": 0.14176180958747864, + "eval_perplexity": 1.152302148522561, + "eval_runtime": 491.9703, + "eval_samples_per_second": 10.163, + "eval_steps_per_second": 0.319, + "step": 360000 + }, + { + "base_loss": 0.2983711498081684, + "epoch": 1.1153945922851562, + "grad_norm": 0.0009709897567518055, + "learning_rate": 1.562013626098633e-05, + "lookahead_loss": 6.282243706703186, + "loss": 0.3106, + "step": 360500 + }, + { + "base_loss": 0.3164993856549263, + "epoch": 1.1163482666015625, + "grad_norm": 0.0010387166403234005, + "learning_rate": 1.5572452545166016e-05, + "lookahead_loss": 6.278607957839966, + "loss": 0.3246, + "step": 361000 + }, + { + "base_loss": 0.3281388694047928, + "epoch": 1.1173019409179688, + "grad_norm": 0.0009896036935970187, + "learning_rate": 1.5524768829345703e-05, + "lookahead_loss": 6.2957491955757146, + "loss": 0.3433, + "step": 361500 + }, + { + "base_loss": 0.3066762860417366, + "epoch": 1.118255615234375, + "grad_norm": 0.0010281683644279838, + "learning_rate": 1.547708511352539e-05, + "lookahead_loss": 6.308385594367981, + "loss": 0.3157, + "step": 362000 + }, + { + "base_loss": 0.3002779276072979, + "epoch": 1.1192092895507812, + "grad_norm": 0.00096211361233145, + "learning_rate": 1.542940139770508e-05, + "lookahead_loss": 6.321347719669342, + "loss": 0.3103, + "step": 362500 + }, + { + "base_loss": 0.3048044160306454, + "epoch": 2.0009536743164062, + "grad_norm": 0.0009596212767064571, + "learning_rate": 1.5381717681884767e-05, + "lookahead_loss": 6.3391434679031375, + "loss": 0.3132, + "step": 363000 + }, + { + "base_loss": 0.2995053820014, + "epoch": 2.0019073486328125, + "grad_norm": 0.00099264329764992, + "learning_rate": 1.5334033966064454e-05, + "lookahead_loss": 6.190942444801331, + "loss": 0.3128, + "step": 363500 + }, + { + "base_loss": 0.31198617857694627, + "epoch": 2.0028610229492188, + "grad_norm": 0.0010000969050452113, + "learning_rate": 1.528635025024414e-05, + "lookahead_loss": 6.173706921577454, + "loss": 0.3212, + "step": 364000 + }, + { + "base_loss": 0.32396442687511445, + "epoch": 2.003814697265625, + "grad_norm": 0.0009497402934357524, + "learning_rate": 1.523866653442383e-05, + "lookahead_loss": 6.214160351753235, + "loss": 0.3347, + "step": 364500 + }, + { + "base_loss": 0.3013957371413708, + "epoch": 2.0047683715820312, + "grad_norm": 0.0009450508514419198, + "learning_rate": 1.5190982818603516e-05, + "lookahead_loss": 6.195119174003601, + "loss": 0.3154, + "step": 365000 + }, + { + "epoch": 2.0047683715820312, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.309196997755252, + "eval_lookahead_perplexity": 549.6034387739307, + "eval_loss": 0.14175546169281006, + "eval_perplexity": 1.1522948338531123, + "eval_runtime": 496.8296, + "eval_samples_per_second": 10.064, + "eval_steps_per_second": 0.316, + "step": 365000 + }, + { + "base_loss": 0.3039788320362568, + "epoch": 2.0057220458984375, + "grad_norm": 0.0008507427992299199, + "learning_rate": 1.5143299102783205e-05, + "lookahead_loss": 6.3170522661209105, + "loss": 0.3125, + "step": 365500 + }, + { + "base_loss": 0.29717833909392355, + "epoch": 2.0066757202148438, + "grad_norm": 0.000947000808082521, + "learning_rate": 1.5095615386962891e-05, + "lookahead_loss": 6.171763348579407, + "loss": 0.3135, + "step": 366000 + }, + { + "base_loss": 0.31199148765206336, + "epoch": 2.00762939453125, + "grad_norm": 0.0009953895350918174, + "learning_rate": 1.5047931671142578e-05, + "lookahead_loss": 6.227945377349854, + "loss": 0.3233, + "step": 366500 + }, + { + "base_loss": 0.3148621036410332, + "epoch": 2.0085830688476562, + "grad_norm": 0.0009100789902731776, + "learning_rate": 1.5000247955322267e-05, + "lookahead_loss": 6.21800847530365, + "loss": 0.3216, + "step": 367000 + }, + { + "base_loss": 0.30580521461367605, + "epoch": 2.0095367431640625, + "grad_norm": 0.0009829141199588776, + "learning_rate": 1.4952564239501954e-05, + "lookahead_loss": 6.2338280124664305, + "loss": 0.3178, + "step": 367500 + }, + { + "base_loss": 0.3015244754254818, + "epoch": 2.0104904174804688, + "grad_norm": 0.0009305374696850777, + "learning_rate": 1.4904880523681642e-05, + "lookahead_loss": 6.208019327163696, + "loss": 0.3115, + "step": 368000 + }, + { + "base_loss": 0.30137019059062004, + "epoch": 2.011444091796875, + "grad_norm": 0.0010132869938388467, + "learning_rate": 1.4857196807861329e-05, + "lookahead_loss": 6.219922257423401, + "loss": 0.3126, + "step": 368500 + }, + { + "base_loss": 0.3252628707587719, + "epoch": 2.0123977661132812, + "grad_norm": 0.0008825024706311524, + "learning_rate": 1.4809513092041016e-05, + "lookahead_loss": 6.216059367179871, + "loss": 0.3346, + "step": 369000 + }, + { + "base_loss": 0.30557073107361793, + "epoch": 2.0133514404296875, + "grad_norm": 0.0009391508647240698, + "learning_rate": 1.4761829376220704e-05, + "lookahead_loss": 6.2985112986564635, + "loss": 0.3197, + "step": 369500 + }, + { + "base_loss": 0.30054079556465146, + "epoch": 2.0143051147460938, + "grad_norm": 0.0009645905811339617, + "learning_rate": 1.4714145660400391e-05, + "lookahead_loss": 6.233403862953186, + "loss": 0.3145, + "step": 370000 + }, + { + "epoch": 2.0143051147460938, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.306184032854562, + "eval_lookahead_perplexity": 547.9499950384007, + "eval_loss": 0.1417495757341385, + "eval_perplexity": 1.1522880515133032, + "eval_runtime": 484.8879, + "eval_samples_per_second": 10.312, + "eval_steps_per_second": 0.324, + "step": 370000 + }, + { + "base_loss": 0.29648803743720054, + "epoch": 2.0152587890625, + "grad_norm": 0.0008996450342237949, + "learning_rate": 1.466646194458008e-05, + "lookahead_loss": 6.224600920200348, + "loss": 0.3066, + "step": 370500 + }, + { + "base_loss": 0.31412097451090815, + "epoch": 2.0162124633789062, + "grad_norm": 0.0009805329609662294, + "learning_rate": 1.4618778228759766e-05, + "lookahead_loss": 6.249731856346131, + "loss": 0.3247, + "step": 371000 + }, + { + "base_loss": 0.3125672063827515, + "epoch": 2.0171661376953125, + "grad_norm": 0.0009064034675247967, + "learning_rate": 1.4571094512939453e-05, + "lookahead_loss": 6.2948720178604125, + "loss": 0.3228, + "step": 371500 + }, + { + "base_loss": 0.3002317441105843, + "epoch": 2.0181198120117188, + "grad_norm": 0.000900619721505791, + "learning_rate": 1.4523410797119142e-05, + "lookahead_loss": 6.301207159996033, + "loss": 0.3098, + "step": 372000 + }, + { + "base_loss": 0.29831535935401915, + "epoch": 2.019073486328125, + "grad_norm": 0.0009492257959209383, + "learning_rate": 1.4475727081298829e-05, + "lookahead_loss": 6.321104723453522, + "loss": 0.3096, + "step": 372500 + }, + { + "base_loss": 0.3020369653701782, + "epoch": 2.0200271606445312, + "grad_norm": 0.0010382728651165962, + "learning_rate": 1.4428043365478517e-05, + "lookahead_loss": 6.153746718406677, + "loss": 0.3144, + "step": 373000 + }, + { + "base_loss": 0.32652922403812407, + "epoch": 2.0209808349609375, + "grad_norm": 0.0009400880662724376, + "learning_rate": 1.4380359649658204e-05, + "lookahead_loss": 6.239998239040375, + "loss": 0.3383, + "step": 373500 + }, + { + "base_loss": 0.30453234216570857, + "epoch": 2.0219345092773438, + "grad_norm": 0.0009412117651663721, + "learning_rate": 1.433267593383789e-05, + "lookahead_loss": 6.207024999618531, + "loss": 0.3133, + "step": 374000 + }, + { + "base_loss": 0.2977458454966545, + "epoch": 2.02288818359375, + "grad_norm": 0.0009957071160897613, + "learning_rate": 1.428499221801758e-05, + "lookahead_loss": 6.241996213436127, + "loss": 0.3112, + "step": 374500 + }, + { + "base_loss": 0.30405546057224275, + "epoch": 2.0238418579101562, + "grad_norm": 0.0009575379081070423, + "learning_rate": 1.4237308502197266e-05, + "lookahead_loss": 6.2086448826789855, + "loss": 0.3137, + "step": 375000 + }, + { + "epoch": 2.0238418579101562, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.302762184661036, + "eval_lookahead_perplexity": 546.0781976673755, + "eval_loss": 0.1417429894208908, + "eval_perplexity": 1.152280462208237, + "eval_runtime": 493.3905, + "eval_samples_per_second": 10.134, + "eval_steps_per_second": 0.318, + "step": 375000 + }, + { + "base_loss": 0.32463854083418847, + "epoch": 2.0247955322265625, + "grad_norm": 0.000920959166251123, + "learning_rate": 1.4189624786376955e-05, + "lookahead_loss": 6.218081157684326, + "loss": 0.335, + "step": 375500 + }, + { + "base_loss": 0.3075324648320675, + "epoch": 2.0257492065429688, + "grad_norm": 0.0009786912705749273, + "learning_rate": 1.4141941070556641e-05, + "lookahead_loss": 6.169022241592407, + "loss": 0.3225, + "step": 376000 + }, + { + "base_loss": 0.30398501074314116, + "epoch": 2.026702880859375, + "grad_norm": 0.0009755670907907188, + "learning_rate": 1.4094257354736328e-05, + "lookahead_loss": 6.1950150799751285, + "loss": 0.3128, + "step": 376500 + }, + { + "base_loss": 0.3081837382018566, + "epoch": 2.0276565551757812, + "grad_norm": 0.0009461453300900757, + "learning_rate": 1.4046573638916017e-05, + "lookahead_loss": 6.300460319519043, + "loss": 0.3178, + "step": 377000 + }, + { + "base_loss": 0.32895678067207335, + "epoch": 2.0286102294921875, + "grad_norm": 0.00095773721113801, + "learning_rate": 1.3998889923095704e-05, + "lookahead_loss": 6.334568171024323, + "loss": 0.3398, + "step": 377500 + }, + { + "base_loss": 0.30588172587752344, + "epoch": 2.0295639038085938, + "grad_norm": 0.0009456981788389385, + "learning_rate": 1.3951206207275392e-05, + "lookahead_loss": 6.255176815032959, + "loss": 0.3142, + "step": 378000 + }, + { + "base_loss": 0.3051903445720673, + "epoch": 2.030517578125, + "grad_norm": 0.0009376407833769917, + "learning_rate": 1.3903522491455079e-05, + "lookahead_loss": 6.250563184261322, + "loss": 0.3162, + "step": 378500 + }, + { + "base_loss": 0.30346439191699026, + "epoch": 2.0314712524414062, + "grad_norm": 0.0009672795422375202, + "learning_rate": 1.3855838775634766e-05, + "lookahead_loss": 6.2738850560188295, + "loss": 0.315, + "step": 379000 + }, + { + "base_loss": 0.31795056411623956, + "epoch": 2.0324249267578125, + "grad_norm": 0.0009643193334341049, + "learning_rate": 1.3808155059814454e-05, + "lookahead_loss": 6.213383667945862, + "loss": 0.3343, + "step": 379500 + }, + { + "base_loss": 0.30795893451571466, + "epoch": 2.0333786010742188, + "grad_norm": 0.000983322854153812, + "learning_rate": 1.3760471343994141e-05, + "lookahead_loss": 6.269245834827423, + "loss": 0.3173, + "step": 380000 + }, + { + "epoch": 2.0333786010742188, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.300099506926613, + "eval_lookahead_perplexity": 544.6261014991892, + "eval_loss": 0.14173784852027893, + "eval_perplexity": 1.1522745384641306, + "eval_runtime": 505.1083, + "eval_samples_per_second": 9.899, + "eval_steps_per_second": 0.311, + "step": 380000 + }, + { + "base_loss": 0.3031257001161575, + "epoch": 2.034332275390625, + "grad_norm": 0.0008987266919575632, + "learning_rate": 1.371278762817383e-05, + "lookahead_loss": 6.3391483335495, + "loss": 0.3148, + "step": 380500 + }, + { + "base_loss": 0.31068781118094924, + "epoch": 2.0352859497070312, + "grad_norm": 0.001019320567138493, + "learning_rate": 1.3665103912353516e-05, + "lookahead_loss": 6.17788455247879, + "loss": 0.3223, + "step": 381000 + }, + { + "base_loss": 0.32500979214906695, + "epoch": 2.0362396240234375, + "grad_norm": 0.0009629032574594021, + "learning_rate": 1.3617420196533203e-05, + "lookahead_loss": 6.288136891365051, + "loss": 0.3365, + "step": 381500 + }, + { + "base_loss": 0.3069631262719631, + "epoch": 2.0371932983398438, + "grad_norm": 0.0010042600333690643, + "learning_rate": 1.3569736480712892e-05, + "lookahead_loss": 6.208441403865814, + "loss": 0.317, + "step": 382000 + }, + { + "base_loss": 0.3025422422587872, + "epoch": 2.03814697265625, + "grad_norm": 0.0009597797179594636, + "learning_rate": 1.3522052764892579e-05, + "lookahead_loss": 6.258864019393921, + "loss": 0.3128, + "step": 382500 + }, + { + "base_loss": 0.3076345331072807, + "epoch": 2.0391006469726562, + "grad_norm": 0.0009677776833996177, + "learning_rate": 1.3474369049072265e-05, + "lookahead_loss": 6.211527245044708, + "loss": 0.3184, + "step": 383000 + }, + { + "base_loss": 0.3235399980545044, + "epoch": 2.0400543212890625, + "grad_norm": 0.0009396423120051622, + "learning_rate": 1.3426685333251954e-05, + "lookahead_loss": 6.251573430538177, + "loss": 0.3331, + "step": 383500 + }, + { + "base_loss": 0.30506757298111914, + "epoch": 2.0410079956054688, + "grad_norm": 0.0009736265055835247, + "learning_rate": 1.337900161743164e-05, + "lookahead_loss": 6.183384760856629, + "loss": 0.3137, + "step": 384000 + }, + { + "base_loss": 0.29668092691898346, + "epoch": 2.041961669921875, + "grad_norm": 0.0009713602485135198, + "learning_rate": 1.333131790161133e-05, + "lookahead_loss": 6.275757871627808, + "loss": 0.3078, + "step": 384500 + }, + { + "base_loss": 0.30789948108792303, + "epoch": 2.0429153442382812, + "grad_norm": 0.0009644374949857593, + "learning_rate": 1.3283634185791016e-05, + "lookahead_loss": 6.276010320186615, + "loss": 0.3235, + "step": 385000 + }, + { + "epoch": 2.0429153442382812, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.297511412312809, + "eval_lookahead_perplexity": 543.2183800632656, + "eval_loss": 0.14173275232315063, + "eval_perplexity": 1.1522686662608996, + "eval_runtime": 484.6579, + "eval_samples_per_second": 10.317, + "eval_steps_per_second": 0.324, + "step": 385000 + }, + { + "base_loss": 0.3281280441880226, + "epoch": 2.0438690185546875, + "grad_norm": 0.0009904025355353951, + "learning_rate": 1.3235950469970703e-05, + "lookahead_loss": 6.3155609292984005, + "loss": 0.3432, + "step": 385500 + }, + { + "base_loss": 0.2978555924296379, + "epoch": 2.0448226928710938, + "grad_norm": 0.001029996550641954, + "learning_rate": 1.3188266754150391e-05, + "lookahead_loss": 6.223522740364075, + "loss": 0.31, + "step": 386000 + }, + { + "base_loss": 0.3044668311774731, + "epoch": 2.0457763671875, + "grad_norm": 0.0009646739927120507, + "learning_rate": 1.3140583038330078e-05, + "lookahead_loss": 6.255016824245453, + "loss": 0.3169, + "step": 386500 + }, + { + "base_loss": 0.3298782432973385, + "epoch": 2.0467300415039062, + "grad_norm": 0.0009199704509228468, + "learning_rate": 1.3092899322509767e-05, + "lookahead_loss": 6.208101546764373, + "loss": 0.3396, + "step": 387000 + }, + { + "base_loss": 0.32442897310853, + "epoch": 2.0476837158203125, + "grad_norm": 0.0009992391569539905, + "learning_rate": 1.3045215606689454e-05, + "lookahead_loss": 6.262858564376831, + "loss": 0.338, + "step": 387500 + }, + { + "base_loss": 0.2941350122392178, + "epoch": 2.0486373901367188, + "grad_norm": 0.0009216153994202614, + "learning_rate": 1.299753189086914e-05, + "lookahead_loss": 6.21948232126236, + "loss": 0.3073, + "step": 388000 + }, + { + "base_loss": 0.301623804807663, + "epoch": 2.049591064453125, + "grad_norm": 0.0009883494349196553, + "learning_rate": 1.2949848175048829e-05, + "lookahead_loss": 6.195394364833832, + "loss": 0.3141, + "step": 388500 + }, + { + "base_loss": 0.31965578559041025, + "epoch": 2.0505447387695312, + "grad_norm": 0.0008956629899330437, + "learning_rate": 1.2902164459228516e-05, + "lookahead_loss": 6.302784600257874, + "loss": 0.3316, + "step": 389000 + }, + { + "base_loss": 0.30511142282187936, + "epoch": 2.0514984130859375, + "grad_norm": 0.0009860226418823004, + "learning_rate": 1.2854480743408204e-05, + "lookahead_loss": 6.258593583583832, + "loss": 0.3184, + "step": 389500 + }, + { + "base_loss": 0.3033564644157887, + "epoch": 2.0524520874023438, + "grad_norm": 0.0010332902893424034, + "learning_rate": 1.2806797027587891e-05, + "lookahead_loss": 6.190782598495483, + "loss": 0.3155, + "step": 390000 + }, + { + "epoch": 2.0524520874023438, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.295379272284218, + "eval_lookahead_perplexity": 542.0613962748962, + "eval_loss": 0.14172814786434174, + "eval_perplexity": 1.1522633606995036, + "eval_runtime": 493.7686, + "eval_samples_per_second": 10.126, + "eval_steps_per_second": 0.318, + "step": 390000 + }, + { + "base_loss": 0.32089028322696683, + "epoch": 2.05340576171875, + "grad_norm": 0.0009403086733072996, + "learning_rate": 1.2759113311767578e-05, + "lookahead_loss": 6.21219446849823, + "loss": 0.3296, + "step": 390500 + }, + { + "base_loss": 0.35406574749946595, + "epoch": 2.0543594360351562, + "grad_norm": 0.0009752140031196177, + "learning_rate": 1.2711429595947266e-05, + "lookahead_loss": 6.248851782798767, + "loss": 0.3688, + "step": 391000 + }, + { + "base_loss": 0.2938829956352711, + "epoch": 2.0553131103515625, + "grad_norm": 0.0009749157470650971, + "learning_rate": 1.2663745880126953e-05, + "lookahead_loss": 6.242171205997467, + "loss": 0.3059, + "step": 391500 + }, + { + "base_loss": 0.30498689064383505, + "epoch": 2.0562667846679688, + "grad_norm": 0.0009239926584996283, + "learning_rate": 1.2616062164306642e-05, + "lookahead_loss": 6.25093336057663, + "loss": 0.317, + "step": 392000 + }, + { + "base_loss": 0.317481600522995, + "epoch": 2.057220458984375, + "grad_norm": 0.0009549495298415422, + "learning_rate": 1.2568378448486329e-05, + "lookahead_loss": 6.26803636598587, + "loss": 0.3306, + "step": 392500 + }, + { + "base_loss": 0.3179551683664322, + "epoch": 2.0581741333007812, + "grad_norm": 0.0009643736411817372, + "learning_rate": 1.2520694732666015e-05, + "lookahead_loss": 6.271330441474914, + "loss": 0.3288, + "step": 393000 + }, + { + "base_loss": 0.29271650505065916, + "epoch": 2.0591278076171875, + "grad_norm": 0.0009359052637591958, + "learning_rate": 1.2473011016845704e-05, + "lookahead_loss": 6.172923516273499, + "loss": 0.3062, + "step": 393500 + }, + { + "base_loss": 0.3039356949329376, + "epoch": 2.0600814819335938, + "grad_norm": 0.0009677503257989883, + "learning_rate": 1.242532730102539e-05, + "lookahead_loss": 6.238824967384338, + "loss": 0.3178, + "step": 394000 + }, + { + "base_loss": 0.32165152502059935, + "epoch": 2.06103515625, + "grad_norm": 0.0009660319774411619, + "learning_rate": 1.237764358520508e-05, + "lookahead_loss": 6.204953857421875, + "loss": 0.3321, + "step": 394500 + }, + { + "base_loss": 0.3061283130943775, + "epoch": 2.0619888305664062, + "grad_norm": 0.0010299277491867542, + "learning_rate": 1.2329959869384766e-05, + "lookahead_loss": 6.221849868774414, + "loss": 0.3161, + "step": 395000 + }, + { + "epoch": 2.0619888305664062, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.292554542279472, + "eval_lookahead_perplexity": 540.5323797305604, + "eval_loss": 0.1417231410741806, + "eval_perplexity": 1.1522575915730886, + "eval_runtime": 478.4232, + "eval_samples_per_second": 10.451, + "eval_steps_per_second": 0.328, + "step": 395000 + }, + { + "base_loss": 0.30640321379899976, + "epoch": 2.0629425048828125, + "grad_norm": 0.0009860399877652526, + "learning_rate": 1.2282276153564453e-05, + "lookahead_loss": 6.2113259787559505, + "loss": 0.3166, + "step": 395500 + }, + { + "base_loss": 0.31782649287581444, + "epoch": 2.0638961791992188, + "grad_norm": 0.0009462848538532853, + "learning_rate": 1.2234592437744141e-05, + "lookahead_loss": 6.24202803850174, + "loss": 0.3298, + "step": 396000 + }, + { + "base_loss": 0.30349985790252687, + "epoch": 2.064849853515625, + "grad_norm": 0.0009636973845772445, + "learning_rate": 1.2186908721923828e-05, + "lookahead_loss": 6.263174514293671, + "loss": 0.318, + "step": 396500 + }, + { + "base_loss": 0.3075440634191036, + "epoch": 2.0658035278320312, + "grad_norm": 0.0009665335528552532, + "learning_rate": 1.2139225006103517e-05, + "lookahead_loss": 6.2076529054641725, + "loss": 0.3199, + "step": 397000 + }, + { + "base_loss": 0.3064390652179718, + "epoch": 2.0667572021484375, + "grad_norm": 0.0009875985560938716, + "learning_rate": 1.2091541290283204e-05, + "lookahead_loss": 6.175316077709198, + "loss": 0.316, + "step": 397500 + }, + { + "base_loss": 0.3303199237883091, + "epoch": 2.0677108764648438, + "grad_norm": 0.0010177858639508486, + "learning_rate": 1.204385757446289e-05, + "lookahead_loss": 6.2469313488006595, + "loss": 0.3401, + "step": 398000 + }, + { + "base_loss": 0.2994670196175575, + "epoch": 2.06866455078125, + "grad_norm": 0.0010028753895312548, + "learning_rate": 1.1996173858642579e-05, + "lookahead_loss": 6.215955723762512, + "loss": 0.3097, + "step": 398500 + }, + { + "base_loss": 0.3000359579175711, + "epoch": 2.0696182250976562, + "grad_norm": 0.0009619069169275463, + "learning_rate": 1.1948490142822266e-05, + "lookahead_loss": 6.249015758514404, + "loss": 0.3151, + "step": 399000 + }, + { + "base_loss": 0.34639680609107015, + "epoch": 2.0705718994140625, + "grad_norm": 0.0009548591333441436, + "learning_rate": 1.1900806427001954e-05, + "lookahead_loss": 6.150674021720886, + "loss": 0.3582, + "step": 399500 + }, + { + "base_loss": 0.3132462115287781, + "epoch": 2.0715255737304688, + "grad_norm": 0.0009808745235204697, + "learning_rate": 1.1853122711181641e-05, + "lookahead_loss": 6.201566618442535, + "loss": 0.3237, + "step": 400000 + }, + { + "epoch": 2.0715255737304688, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.12980225879829912, + "eval_base_perplexity": 1.1386032122951009, + "eval_lookahead_loss": 6.290062046660402, + "eval_lookahead_perplexity": 539.1867827859916, + "eval_loss": 0.14171843230724335, + "eval_perplexity": 1.1522521658734124, + "eval_runtime": 491.0719, + "eval_samples_per_second": 10.182, + "eval_steps_per_second": 0.32, + "step": 400000 + }, + { + "base_loss": 0.30655504322052, + "epoch": 1.0009536743164062, + "grad_norm": 0.0009596187737770379, + "learning_rate": 1.1805438995361328e-05, + "lookahead_loss": 6.3476774702072145, + "loss": 0.314, + "step": 400500 + }, + { + "base_loss": 0.3002312153875828, + "epoch": 1.0019073486328125, + "grad_norm": 0.0009994172723963857, + "learning_rate": 1.1757755279541016e-05, + "lookahead_loss": 6.19147784948349, + "loss": 0.3128, + "step": 401000 + }, + { + "base_loss": 0.312505132496357, + "epoch": 1.0028610229492188, + "grad_norm": 0.000982376979663968, + "learning_rate": 1.1710071563720703e-05, + "lookahead_loss": 6.1830660572052, + "loss": 0.3219, + "step": 401500 + }, + { + "base_loss": 0.3240452491641045, + "epoch": 1.003814697265625, + "grad_norm": 0.0009494811529293656, + "learning_rate": 1.1662387847900392e-05, + "lookahead_loss": 6.214938600540161, + "loss": 0.3359, + "step": 402000 + }, + { + "base_loss": 0.29858038023114203, + "epoch": 1.0047683715820312, + "grad_norm": 0.0009325055871158838, + "learning_rate": 1.1614704132080079e-05, + "lookahead_loss": 6.201517785072327, + "loss": 0.3133, + "step": 402500 + }, + { + "base_loss": 0.3042404046058655, + "epoch": 1.0057220458984375, + "grad_norm": 0.0008430758025497198, + "learning_rate": 1.1567020416259765e-05, + "lookahead_loss": 6.324912932395935, + "loss": 0.3128, + "step": 403000 + }, + { + "base_loss": 0.29714440524578095, + "epoch": 1.0066757202148438, + "grad_norm": 0.0009388598846271634, + "learning_rate": 1.1519336700439454e-05, + "lookahead_loss": 6.184690756320953, + "loss": 0.3123, + "step": 403500 + }, + { + "base_loss": 0.31379624953866003, + "epoch": 1.00762939453125, + "grad_norm": 0.0009833249496296048, + "learning_rate": 1.147165298461914e-05, + "lookahead_loss": 6.242422096252441, + "loss": 0.3243, + "step": 404000 + }, + { + "base_loss": 0.31622857597470283, + "epoch": 1.0085830688476562, + "grad_norm": 0.000923054467421025, + "learning_rate": 1.142396926879883e-05, + "lookahead_loss": 6.224862882137298, + "loss": 0.3228, + "step": 404500 + }, + { + "base_loss": 0.3033116071224213, + "epoch": 1.0095367431640625, + "grad_norm": 0.0009825342567637563, + "learning_rate": 1.1376285552978516e-05, + "lookahead_loss": 6.246697548866272, + "loss": 0.3162, + "step": 405000 + }, + { + "epoch": 1.0095367431640625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.28775512524687, + "eval_lookahead_perplexity": 537.9443548936493, + "eval_loss": 0.14171403646469116, + "eval_perplexity": 1.1522471007654436, + "eval_runtime": 259.3375, + "eval_samples_per_second": 19.28, + "eval_steps_per_second": 0.605, + "step": 405000 + }, + { + "base_loss": 0.302242584168911, + "epoch": 1.0104904174804688, + "grad_norm": 0.0009655858739279211, + "learning_rate": 1.1328601837158203e-05, + "lookahead_loss": 6.211535950660705, + "loss": 0.3117, + "step": 405500 + }, + { + "base_loss": 0.3031807193160057, + "epoch": 1.011444091796875, + "grad_norm": 0.0009799316758289933, + "learning_rate": 1.1280918121337891e-05, + "lookahead_loss": 6.2251484837532045, + "loss": 0.3162, + "step": 406000 + }, + { + "base_loss": 0.324542246311903, + "epoch": 1.0123977661132812, + "grad_norm": 0.0008746847743168473, + "learning_rate": 1.1233234405517578e-05, + "lookahead_loss": 6.221592821121216, + "loss": 0.3343, + "step": 406500 + }, + { + "base_loss": 0.3043093577325344, + "epoch": 1.0133514404296875, + "grad_norm": 0.0009363812278024852, + "learning_rate": 1.1185550689697267e-05, + "lookahead_loss": 6.322680716514587, + "loss": 0.3195, + "step": 407000 + }, + { + "base_loss": 0.29890961676836014, + "epoch": 1.0143051147460938, + "grad_norm": 0.000903900305274874, + "learning_rate": 1.1137866973876954e-05, + "lookahead_loss": 6.26603977060318, + "loss": 0.3139, + "step": 407500 + }, + { + "base_loss": 0.2968312213420868, + "epoch": 1.0152587890625, + "grad_norm": 0.0009099857416003942, + "learning_rate": 1.109018325805664e-05, + "lookahead_loss": 6.221451986312866, + "loss": 0.3077, + "step": 408000 + }, + { + "base_loss": 0.309987826526165, + "epoch": 1.0162124633789062, + "grad_norm": 0.0009831819916144013, + "learning_rate": 1.1042499542236329e-05, + "lookahead_loss": 6.266968548774719, + "loss": 0.3218, + "step": 408500 + }, + { + "base_loss": 0.3124798896312714, + "epoch": 1.0171661376953125, + "grad_norm": 0.0009032660746015608, + "learning_rate": 1.0994815826416016e-05, + "lookahead_loss": 6.291549580097199, + "loss": 0.3231, + "step": 409000 + }, + { + "base_loss": 0.30385399025678633, + "epoch": 1.0181198120117188, + "grad_norm": 0.000948640750721097, + "learning_rate": 1.0947132110595704e-05, + "lookahead_loss": 6.300529542922973, + "loss": 0.3126, + "step": 409500 + }, + { + "base_loss": 0.2997076933085918, + "epoch": 1.019073486328125, + "grad_norm": 0.0009449218632653356, + "learning_rate": 1.0899448394775391e-05, + "lookahead_loss": 6.32943299818039, + "loss": 0.31, + "step": 410000 + }, + { + "epoch": 1.019073486328125, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.285395630632346, + "eval_lookahead_perplexity": 536.6765743342321, + "eval_loss": 0.14170922338962555, + "eval_perplexity": 1.1522415549269998, + "eval_runtime": 282.9772, + "eval_samples_per_second": 17.669, + "eval_steps_per_second": 0.555, + "step": 410000 + }, + { + "base_loss": 0.30210260692238805, + "epoch": 1.0200271606445312, + "grad_norm": 0.0010289070196449757, + "learning_rate": 1.0851764678955078e-05, + "lookahead_loss": 6.1576169376373295, + "loss": 0.3146, + "step": 410500 + }, + { + "base_loss": 0.3285051781535149, + "epoch": 1.0209808349609375, + "grad_norm": 0.0009721849346533418, + "learning_rate": 1.0804080963134766e-05, + "lookahead_loss": 6.248751623630524, + "loss": 0.3382, + "step": 411000 + }, + { + "base_loss": 0.30326452678442, + "epoch": 1.0219345092773438, + "grad_norm": 0.0009491312084719539, + "learning_rate": 1.0756397247314453e-05, + "lookahead_loss": 6.210500438690185, + "loss": 0.3137, + "step": 411500 + }, + { + "base_loss": 0.29889601907134056, + "epoch": 1.02288818359375, + "grad_norm": 0.0009739006636664271, + "learning_rate": 1.0708713531494142e-05, + "lookahead_loss": 6.2617092299461365, + "loss": 0.3114, + "step": 412000 + }, + { + "base_loss": 0.3006108500063419, + "epoch": 1.0238418579101562, + "grad_norm": 0.0009736933861859143, + "learning_rate": 1.0661029815673829e-05, + "lookahead_loss": 6.211144655704499, + "loss": 0.3122, + "step": 412500 + }, + { + "base_loss": 0.3237688979506493, + "epoch": 1.0247955322265625, + "grad_norm": 0.0008902736008167267, + "learning_rate": 1.0613346099853515e-05, + "lookahead_loss": 6.235258470058441, + "loss": 0.3352, + "step": 413000 + }, + { + "base_loss": 0.3078545735180378, + "epoch": 1.0257492065429688, + "grad_norm": 0.0009605266386643052, + "learning_rate": 1.0565662384033204e-05, + "lookahead_loss": 6.186525918960571, + "loss": 0.3217, + "step": 413500 + }, + { + "base_loss": 0.3022345977425575, + "epoch": 1.026702880859375, + "grad_norm": 0.0010046123061329126, + "learning_rate": 1.051797866821289e-05, + "lookahead_loss": 6.204047555446625, + "loss": 0.3107, + "step": 414000 + }, + { + "base_loss": 0.3071480156183243, + "epoch": 1.0276565551757812, + "grad_norm": 0.0009506128262728453, + "learning_rate": 1.047029495239258e-05, + "lookahead_loss": 6.313683287620544, + "loss": 0.3183, + "step": 414500 + }, + { + "base_loss": 0.3302598208785057, + "epoch": 1.0286102294921875, + "grad_norm": 0.0009364956058561802, + "learning_rate": 1.0422611236572266e-05, + "lookahead_loss": 6.342069396495819, + "loss": 0.3412, + "step": 415000 + }, + { + "epoch": 1.0286102294921875, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.2832431381883715, + "eval_lookahead_perplexity": 535.5226244428729, + "eval_loss": 0.14170511066913605, + "eval_perplexity": 1.1522368160892926, + "eval_runtime": 266.9875, + "eval_samples_per_second": 18.727, + "eval_steps_per_second": 0.588, + "step": 415000 + }, + { + "base_loss": 0.3027013133764267, + "epoch": 1.0295639038085938, + "grad_norm": 0.0009334692731499672, + "learning_rate": 1.0374927520751953e-05, + "lookahead_loss": 6.272345328807831, + "loss": 0.3116, + "step": 415500 + }, + { + "base_loss": 0.3046494301855564, + "epoch": 1.030517578125, + "grad_norm": 0.0009495760896243155, + "learning_rate": 1.0327243804931641e-05, + "lookahead_loss": 6.2516368660926815, + "loss": 0.3156, + "step": 416000 + }, + { + "base_loss": 0.3023626366853714, + "epoch": 1.0314712524414062, + "grad_norm": 0.0009388629696331918, + "learning_rate": 1.0279560089111328e-05, + "lookahead_loss": 6.280500651359558, + "loss": 0.3144, + "step": 416500 + }, + { + "base_loss": 0.3171934984624386, + "epoch": 1.0324249267578125, + "grad_norm": 0.000977948191575706, + "learning_rate": 1.0231876373291017e-05, + "lookahead_loss": 6.244725295066834, + "loss": 0.3343, + "step": 417000 + }, + { + "base_loss": 0.305971223294735, + "epoch": 1.0333786010742188, + "grad_norm": 0.0009961809264495969, + "learning_rate": 1.0184192657470704e-05, + "lookahead_loss": 6.270906726360321, + "loss": 0.3149, + "step": 417500 + }, + { + "base_loss": 0.3008191674053669, + "epoch": 1.034332275390625, + "grad_norm": 0.0008914514328353107, + "learning_rate": 1.013650894165039e-05, + "lookahead_loss": 6.347097273826599, + "loss": 0.3133, + "step": 418000 + }, + { + "base_loss": 0.3125488177835941, + "epoch": 1.0352859497070312, + "grad_norm": 0.0010067359544336796, + "learning_rate": 1.0088825225830079e-05, + "lookahead_loss": 6.17832250881195, + "loss": 0.3221, + "step": 418500 + }, + { + "base_loss": 0.32382212686538697, + "epoch": 1.0362396240234375, + "grad_norm": 0.0009813315700739622, + "learning_rate": 1.0041141510009766e-05, + "lookahead_loss": 6.293478096008301, + "loss": 0.3371, + "step": 419000 + }, + { + "base_loss": 0.30577521124482154, + "epoch": 1.0371932983398438, + "grad_norm": 0.00100911152549088, + "learning_rate": 9.993457794189454e-06, + "lookahead_loss": 6.232032457828522, + "loss": 0.3171, + "step": 419500 + }, + { + "base_loss": 0.3027714610397816, + "epoch": 1.03814697265625, + "grad_norm": 0.0009809184120967984, + "learning_rate": 9.945774078369141e-06, + "lookahead_loss": 6.269201689720154, + "loss": 0.3148, + "step": 420000 + }, + { + "epoch": 1.03814697265625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.281337650439229, + "eval_lookahead_perplexity": 534.5031642355383, + "eval_loss": 0.1417016088962555, + "eval_perplexity": 1.1522327812247228, + "eval_runtime": 273.7915, + "eval_samples_per_second": 18.262, + "eval_steps_per_second": 0.573, + "step": 420000 + }, + { + "base_loss": 0.3064252578020096, + "epoch": 1.0391006469726562, + "grad_norm": 0.0009215309401042759, + "learning_rate": 9.898090362548828e-06, + "lookahead_loss": 6.227767462253571, + "loss": 0.3193, + "step": 420500 + }, + { + "base_loss": 0.3251348150372505, + "epoch": 1.0400543212890625, + "grad_norm": 0.0009615287999622524, + "learning_rate": 9.850406646728516e-06, + "lookahead_loss": 6.262166633605957, + "loss": 0.3343, + "step": 421000 + }, + { + "base_loss": 0.3045478595495224, + "epoch": 1.0410079956054688, + "grad_norm": 0.0009617906180210412, + "learning_rate": 9.802722930908203e-06, + "lookahead_loss": 6.192641661167145, + "loss": 0.315, + "step": 421500 + }, + { + "base_loss": 0.2982518375813961, + "epoch": 1.041961669921875, + "grad_norm": 0.000983264995738864, + "learning_rate": 9.755039215087892e-06, + "lookahead_loss": 6.274356018543243, + "loss": 0.3097, + "step": 422000 + }, + { + "base_loss": 0.3089935587644577, + "epoch": 1.0429153442382812, + "grad_norm": 0.000969003711361438, + "learning_rate": 9.707355499267579e-06, + "lookahead_loss": 6.285097982883453, + "loss": 0.324, + "step": 422500 + }, + { + "base_loss": 0.3268603746891022, + "epoch": 1.0438690185546875, + "grad_norm": 0.0009796868544071913, + "learning_rate": 9.659671783447265e-06, + "lookahead_loss": 6.3252786407470705, + "loss": 0.3406, + "step": 423000 + }, + { + "base_loss": 0.29676153120398524, + "epoch": 1.0448226928710938, + "grad_norm": 0.0009741850662976503, + "learning_rate": 9.611988067626954e-06, + "lookahead_loss": 6.226795008659363, + "loss": 0.3091, + "step": 423500 + }, + { + "base_loss": 0.3044439141750336, + "epoch": 1.0457763671875, + "grad_norm": 0.0009648915147408843, + "learning_rate": 9.56430435180664e-06, + "lookahead_loss": 6.256449856758118, + "loss": 0.3161, + "step": 424000 + }, + { + "base_loss": 0.3313070158064365, + "epoch": 1.0467300415039062, + "grad_norm": 0.0009417349356226623, + "learning_rate": 9.51662063598633e-06, + "lookahead_loss": 6.215352055072785, + "loss": 0.3393, + "step": 424500 + }, + { + "base_loss": 0.32587327966094015, + "epoch": 1.0476837158203125, + "grad_norm": 0.0010068246629089117, + "learning_rate": 9.468936920166016e-06, + "lookahead_loss": 6.259899432659149, + "loss": 0.3387, + "step": 425000 + }, + { + "epoch": 1.0476837158203125, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.279669542282154, + "eval_lookahead_perplexity": 533.6122983841667, + "eval_loss": 0.14169801771640778, + "eval_perplexity": 1.1522286433570088, + "eval_runtime": 271.06, + "eval_samples_per_second": 18.446, + "eval_steps_per_second": 0.579, + "step": 425000 + }, + { + "base_loss": 0.29468956208229063, + "epoch": 1.0486373901367188, + "grad_norm": 0.0009451521327719092, + "learning_rate": 9.421253204345703e-06, + "lookahead_loss": 6.224238969326019, + "loss": 0.3076, + "step": 425500 + }, + { + "base_loss": 0.3027429393827915, + "epoch": 1.049591064453125, + "grad_norm": 0.0009969412349164486, + "learning_rate": 9.373569488525391e-06, + "lookahead_loss": 6.211018812656403, + "loss": 0.3161, + "step": 426000 + }, + { + "base_loss": 0.3190444597601891, + "epoch": 1.0505447387695312, + "grad_norm": 0.000896650250069797, + "learning_rate": 9.325885772705078e-06, + "lookahead_loss": 6.3047479743957515, + "loss": 0.3331, + "step": 426500 + }, + { + "base_loss": 0.3043918348252773, + "epoch": 1.0514984130859375, + "grad_norm": 0.0009918854339048266, + "learning_rate": 9.278202056884767e-06, + "lookahead_loss": 6.259522766113281, + "loss": 0.3177, + "step": 427000 + }, + { + "base_loss": 0.3046840020418167, + "epoch": 1.0524520874023438, + "grad_norm": 0.001043196301907301, + "learning_rate": 9.230518341064454e-06, + "lookahead_loss": 6.19600498008728, + "loss": 0.317, + "step": 427500 + }, + { + "base_loss": 0.3202188531160355, + "epoch": 1.05340576171875, + "grad_norm": 0.0009135944419540465, + "learning_rate": 9.18283462524414e-06, + "lookahead_loss": 6.209219309806824, + "loss": 0.3299, + "step": 428000 + }, + { + "base_loss": 0.3542410895228386, + "epoch": 1.0543594360351562, + "grad_norm": 0.000933772069402039, + "learning_rate": 9.135150909423829e-06, + "lookahead_loss": 6.252681206703186, + "loss": 0.3686, + "step": 428500 + }, + { + "base_loss": 0.2943912135362625, + "epoch": 1.0553131103515625, + "grad_norm": 0.000966727442573756, + "learning_rate": 9.087467193603516e-06, + "lookahead_loss": 6.257615083217621, + "loss": 0.3081, + "step": 429000 + }, + { + "base_loss": 0.30392896428704264, + "epoch": 1.0562667846679688, + "grad_norm": 0.0009276124183088541, + "learning_rate": 9.039783477783204e-06, + "lookahead_loss": 6.273712680816651, + "loss": 0.3167, + "step": 429500 + }, + { + "base_loss": 0.3181495431959629, + "epoch": 1.057220458984375, + "grad_norm": 0.000952663947828114, + "learning_rate": 8.992099761962891e-06, + "lookahead_loss": 6.288021715641022, + "loss": 0.3316, + "step": 430000 + }, + { + "epoch": 1.057220458984375, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.277979724323407, + "eval_lookahead_perplexity": 532.7113521712807, + "eval_loss": 0.14169469475746155, + "eval_perplexity": 1.1522248145548917, + "eval_runtime": 263.3566, + "eval_samples_per_second": 18.986, + "eval_steps_per_second": 0.596, + "step": 430000 + }, + { + "base_loss": 0.3180287193655968, + "epoch": 1.0581741333007812, + "grad_norm": 0.000987058854661882, + "learning_rate": 8.944416046142578e-06, + "lookahead_loss": 6.290538039207458, + "loss": 0.3283, + "step": 430500 + }, + { + "base_loss": 0.292520221978426, + "epoch": 1.0591278076171875, + "grad_norm": 0.0009176459279842675, + "learning_rate": 8.896732330322266e-06, + "lookahead_loss": 6.187150172233581, + "loss": 0.308, + "step": 431000 + }, + { + "base_loss": 0.3019208701252937, + "epoch": 1.0600814819335938, + "grad_norm": 0.0009774015052244067, + "learning_rate": 8.849048614501953e-06, + "lookahead_loss": 6.234365815162659, + "loss": 0.3146, + "step": 431500 + }, + { + "base_loss": 0.32141088619828223, + "epoch": 1.06103515625, + "grad_norm": 0.0009807058377191424, + "learning_rate": 8.801364898681642e-06, + "lookahead_loss": 6.213557529449463, + "loss": 0.332, + "step": 432000 + }, + { + "base_loss": 0.30723505771160126, + "epoch": 1.0619888305664062, + "grad_norm": 0.0010287961922585964, + "learning_rate": 8.753681182861329e-06, + "lookahead_loss": 6.22539913892746, + "loss": 0.3162, + "step": 432500 + }, + { + "base_loss": 0.308370777964592, + "epoch": 1.0629425048828125, + "grad_norm": 0.0009847691981121898, + "learning_rate": 8.705997467041015e-06, + "lookahead_loss": 6.221470012664795, + "loss": 0.3171, + "step": 433000 + }, + { + "base_loss": 0.3175841515958309, + "epoch": 1.0638961791992188, + "grad_norm": 0.0009484458132646978, + "learning_rate": 8.658313751220704e-06, + "lookahead_loss": 6.254562690258026, + "loss": 0.3296, + "step": 433500 + }, + { + "base_loss": 0.3023634272813797, + "epoch": 1.064849853515625, + "grad_norm": 0.0009465797338634729, + "learning_rate": 8.61063003540039e-06, + "lookahead_loss": 6.276018251895905, + "loss": 0.3172, + "step": 434000 + }, + { + "base_loss": 0.31000158992409704, + "epoch": 1.0658035278320312, + "grad_norm": 0.0009775181533768773, + "learning_rate": 8.56294631958008e-06, + "lookahead_loss": 6.20667854642868, + "loss": 0.321, + "step": 434500 + }, + { + "base_loss": 0.3074465197324753, + "epoch": 1.0667572021484375, + "grad_norm": 0.0009713208419270813, + "learning_rate": 8.515262603759766e-06, + "lookahead_loss": 6.182765590667724, + "loss": 0.3168, + "step": 435000 + }, + { + "epoch": 1.0667572021484375, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.276130061560926, + "eval_lookahead_perplexity": 531.72692652841, + "eval_loss": 0.1416913866996765, + "eval_perplexity": 1.1522210029349285, + "eval_runtime": 274.5921, + "eval_samples_per_second": 18.209, + "eval_steps_per_second": 0.572, + "step": 435000 + }, + { + "base_loss": 0.3308669750988483, + "epoch": 1.0677108764648438, + "grad_norm": 0.0010318380082026124, + "learning_rate": 8.467578887939453e-06, + "lookahead_loss": 6.267029664039612, + "loss": 0.3436, + "step": 435500 + }, + { + "base_loss": 0.300963022172451, + "epoch": 1.06866455078125, + "grad_norm": 0.0009859588462859392, + "learning_rate": 8.419895172119141e-06, + "lookahead_loss": 6.230448030471802, + "loss": 0.31, + "step": 436000 + }, + { + "base_loss": 0.3016065271794796, + "epoch": 1.0696182250976562, + "grad_norm": 0.0009570185793563724, + "learning_rate": 8.372211456298828e-06, + "lookahead_loss": 6.258872253417969, + "loss": 0.3144, + "step": 436500 + }, + { + "base_loss": 0.3469915909469128, + "epoch": 1.0705718994140625, + "grad_norm": 0.0009695589542388916, + "learning_rate": 8.324527740478517e-06, + "lookahead_loss": 6.167494082450867, + "loss": 0.3587, + "step": 437000 + }, + { + "base_loss": 0.31762470316886904, + "epoch": 1.0715255737304688, + "grad_norm": 0.0009688555146567523, + "learning_rate": 8.276844024658204e-06, + "lookahead_loss": 6.213669309139251, + "loss": 0.3254, + "step": 437500 + }, + { + "base_loss": 0.3090612238943577, + "epoch": 1.072479248046875, + "grad_norm": 0.0009899616707116365, + "learning_rate": 8.22916030883789e-06, + "lookahead_loss": 6.308782180786133, + "loss": 0.3187, + "step": 438000 + }, + { + "base_loss": 0.3051177371442318, + "epoch": 1.0734329223632812, + "grad_norm": 0.0009489938383921981, + "learning_rate": 8.181476593017579e-06, + "lookahead_loss": 6.267108190059662, + "loss": 0.3174, + "step": 438500 + }, + { + "base_loss": 0.32735036182403565, + "epoch": 1.0743865966796875, + "grad_norm": 0.000950060726609081, + "learning_rate": 8.133792877197266e-06, + "lookahead_loss": 6.282675455093384, + "loss": 0.3397, + "step": 439000 + }, + { + "base_loss": 0.3037717220187187, + "epoch": 1.0753402709960938, + "grad_norm": 0.0009571823175065219, + "learning_rate": 8.086109161376954e-06, + "lookahead_loss": 6.34437693977356, + "loss": 0.3157, + "step": 439500 + }, + { + "base_loss": 0.3043428426384926, + "epoch": 1.0762939453125, + "grad_norm": 0.0008969463524408638, + "learning_rate": 8.038425445556641e-06, + "lookahead_loss": 6.308073208332062, + "loss": 0.3173, + "step": 440000 + }, + { + "epoch": 1.0762939453125, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.274432948603036, + "eval_lookahead_perplexity": 530.8252911762661, + "eval_loss": 0.14168821275234222, + "eval_perplexity": 1.1522173458519513, + "eval_runtime": 279.9465, + "eval_samples_per_second": 17.861, + "eval_steps_per_second": 0.561, + "step": 440000 + }, + { + "base_loss": 0.3304513318836689, + "epoch": 1.0772476196289062, + "grad_norm": 0.0009665157413110137, + "learning_rate": 7.990741729736328e-06, + "lookahead_loss": 6.282677404880523, + "loss": 0.3437, + "step": 440500 + }, + { + "base_loss": 0.3036956556737423, + "epoch": 1.0782012939453125, + "grad_norm": 0.0009915807750076056, + "learning_rate": 7.943058013916016e-06, + "lookahead_loss": 6.307204761981964, + "loss": 0.3177, + "step": 441000 + }, + { + "base_loss": 0.29657268461585046, + "epoch": 1.0791549682617188, + "grad_norm": 0.0009162530768662691, + "learning_rate": 7.895374298095703e-06, + "lookahead_loss": 6.30237598323822, + "loss": 0.3109, + "step": 441500 + }, + { + "base_loss": 0.3123248810470104, + "epoch": 1.080108642578125, + "grad_norm": 0.0009562050108797848, + "learning_rate": 7.847690582275392e-06, + "lookahead_loss": 6.292247913837433, + "loss": 0.3289, + "step": 442000 + }, + { + "base_loss": 0.32140666726231576, + "epoch": 1.0810623168945312, + "grad_norm": 0.0009877033298835158, + "learning_rate": 7.800006866455079e-06, + "lookahead_loss": 6.305101052761078, + "loss": 0.337, + "step": 442500 + }, + { + "base_loss": 0.29961148300766943, + "epoch": 1.0820159912109375, + "grad_norm": 0.0009005140163935721, + "learning_rate": 7.752323150634765e-06, + "lookahead_loss": 6.306686497688293, + "loss": 0.312, + "step": 443000 + }, + { + "base_loss": 0.3036400380730629, + "epoch": 1.0829696655273438, + "grad_norm": 0.0010059344349429011, + "learning_rate": 7.704639434814454e-06, + "lookahead_loss": 6.3338696246147155, + "loss": 0.3172, + "step": 443500 + }, + { + "base_loss": 0.3315876969695091, + "epoch": 1.08392333984375, + "grad_norm": 0.000925060361623764, + "learning_rate": 7.65695571899414e-06, + "lookahead_loss": 6.3393225388526915, + "loss": 0.3447, + "step": 444000 + }, + { + "base_loss": 0.30729381024837493, + "epoch": 1.0848770141601562, + "grad_norm": 0.00096750573720783, + "learning_rate": 7.6092720031738284e-06, + "lookahead_loss": 6.285042664051056, + "loss": 0.3195, + "step": 444500 + }, + { + "base_loss": 0.3001744159460068, + "epoch": 1.0858306884765625, + "grad_norm": 0.0010059243068099022, + "learning_rate": 7.561588287353516e-06, + "lookahead_loss": 6.272412059783935, + "loss": 0.3098, + "step": 445000 + }, + { + "epoch": 1.0858306884765625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.272678756104491, + "eval_lookahead_perplexity": 529.8949376805812, + "eval_loss": 0.14168505370616913, + "eval_perplexity": 1.1522137059499036, + "eval_runtime": 265.8234, + "eval_samples_per_second": 18.809, + "eval_steps_per_second": 0.591, + "step": 445000 + }, + { + "base_loss": 0.3043146550655365, + "epoch": 1.0867843627929688, + "grad_norm": 0.0011264849454164505, + "learning_rate": 7.513904571533204e-06, + "lookahead_loss": 6.235183692932129, + "loss": 0.3136, + "step": 445500 + }, + { + "base_loss": 0.33685871040821075, + "epoch": 1.087738037109375, + "grad_norm": 0.0008835060871206224, + "learning_rate": 7.466220855712891e-06, + "lookahead_loss": 6.309118765830994, + "loss": 0.3441, + "step": 446000 + }, + { + "base_loss": 0.30271838963031766, + "epoch": 1.0886917114257812, + "grad_norm": 0.0009826867608353496, + "learning_rate": 7.418537139892578e-06, + "lookahead_loss": 6.282386909008026, + "loss": 0.3131, + "step": 446500 + }, + { + "base_loss": 0.31106107553839685, + "epoch": 1.0896453857421875, + "grad_norm": 0.000977760530076921, + "learning_rate": 7.370853424072266e-06, + "lookahead_loss": 6.315260496139526, + "loss": 0.3203, + "step": 447000 + }, + { + "base_loss": 0.29801268032193184, + "epoch": 1.0905990600585938, + "grad_norm": 0.0009775172220543027, + "learning_rate": 7.323169708251954e-06, + "lookahead_loss": 6.3085113048553465, + "loss": 0.3104, + "step": 447500 + }, + { + "base_loss": 0.29706856977939605, + "epoch": 1.091552734375, + "grad_norm": 0.0008752316934987903, + "learning_rate": 7.275485992431641e-06, + "lookahead_loss": 6.266508299827576, + "loss": 0.3082, + "step": 448000 + }, + { + "base_loss": 0.3189851225912571, + "epoch": 1.0925064086914062, + "grad_norm": 0.0009569500689394772, + "learning_rate": 7.227802276611328e-06, + "lookahead_loss": 6.296401259899139, + "loss": 0.3345, + "step": 448500 + }, + { + "base_loss": 0.307105902582407, + "epoch": 1.0934600830078125, + "grad_norm": 0.0009048188221640885, + "learning_rate": 7.180118560791016e-06, + "lookahead_loss": 6.301604011535645, + "loss": 0.3195, + "step": 449000 + }, + { + "base_loss": 0.2863923677802086, + "epoch": 1.0944137573242188, + "grad_norm": 0.0009495110716670752, + "learning_rate": 7.1324348449707034e-06, + "lookahead_loss": 6.301865918159485, + "loss": 0.3022, + "step": 449500 + }, + { + "base_loss": 0.2923275768607855, + "epoch": 1.095367431640625, + "grad_norm": 0.0009773832280188799, + "learning_rate": 7.084751129150391e-06, + "lookahead_loss": 6.225174713611603, + "loss": 0.3078, + "step": 450000 + }, + { + "epoch": 1.095367431640625, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.2715035490334605, + "eval_lookahead_perplexity": 529.2725671816515, + "eval_loss": 0.1416824758052826, + "eval_perplexity": 1.152210735660998, + "eval_runtime": 280.1769, + "eval_samples_per_second": 17.846, + "eval_steps_per_second": 0.56, + "step": 450000 + }, + { + "base_loss": 0.2988976333141327, + "epoch": 1.0963211059570312, + "grad_norm": 0.0009743800619617105, + "learning_rate": 7.037067413330079e-06, + "lookahead_loss": 6.307745354652405, + "loss": 0.3117, + "step": 450500 + }, + { + "base_loss": 0.3292928241491318, + "epoch": 1.0972747802734375, + "grad_norm": 0.0009718555375002325, + "learning_rate": 6.989383697509766e-06, + "lookahead_loss": 6.326489259719849, + "loss": 0.3394, + "step": 451000 + }, + { + "base_loss": 0.2914348037838936, + "epoch": 1.0982284545898438, + "grad_norm": 0.000966022489592433, + "learning_rate": 6.941699981689453e-06, + "lookahead_loss": 6.256584117889404, + "loss": 0.3078, + "step": 451500 + }, + { + "base_loss": 0.2972012578845024, + "epoch": 1.09918212890625, + "grad_norm": 0.0009904210455715656, + "learning_rate": 6.894016265869141e-06, + "lookahead_loss": 6.307202220916748, + "loss": 0.3096, + "step": 452000 + }, + { + "base_loss": 0.3006402098238468, + "epoch": 1.1001358032226562, + "grad_norm": 0.0009319457458332181, + "learning_rate": 6.846332550048829e-06, + "lookahead_loss": 6.3021579661369325, + "loss": 0.3145, + "step": 452500 + }, + { + "base_loss": 0.3227167456150055, + "epoch": 1.1010894775390625, + "grad_norm": 0.0009431499638594687, + "learning_rate": 6.798648834228516e-06, + "lookahead_loss": 6.373525240421295, + "loss": 0.3319, + "step": 453000 + }, + { + "base_loss": 0.30574207335710524, + "epoch": 1.1020431518554688, + "grad_norm": 0.0009230657014995813, + "learning_rate": 6.750965118408203e-06, + "lookahead_loss": 6.3279352407455445, + "loss": 0.3147, + "step": 453500 + }, + { + "base_loss": 0.29960223579406736, + "epoch": 1.102996826171875, + "grad_norm": 0.0009560135076753795, + "learning_rate": 6.703281402587891e-06, + "lookahead_loss": 6.294001242637634, + "loss": 0.3141, + "step": 454000 + }, + { + "base_loss": 0.2996614835858345, + "epoch": 1.1039505004882812, + "grad_norm": 0.0009949164232239127, + "learning_rate": 6.6555976867675784e-06, + "lookahead_loss": 6.323962463855743, + "loss": 0.3115, + "step": 454500 + }, + { + "base_loss": 0.3155037875175476, + "epoch": 1.1049041748046875, + "grad_norm": 0.0009888532804325223, + "learning_rate": 6.607913970947266e-06, + "lookahead_loss": 6.259588910102845, + "loss": 0.3275, + "step": 455000 + }, + { + "epoch": 1.1049041748046875, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.270278566561568, + "eval_lookahead_perplexity": 528.6246145103582, + "eval_loss": 0.14167998731136322, + "eval_perplexity": 1.1522078683951562, + "eval_runtime": 283.4432, + "eval_samples_per_second": 17.64, + "eval_steps_per_second": 0.554, + "step": 455000 + }, + { + "base_loss": 0.30881242457032204, + "epoch": 1.1058578491210938, + "grad_norm": 0.0009281703387387097, + "learning_rate": 6.560230255126954e-06, + "lookahead_loss": 6.247208318710327, + "loss": 0.3205, + "step": 455500 + }, + { + "base_loss": 0.29839677426218986, + "epoch": 1.1068115234375, + "grad_norm": 0.0009286696440540254, + "learning_rate": 6.512546539306641e-06, + "lookahead_loss": 6.329126072883606, + "loss": 0.3091, + "step": 456000 + }, + { + "base_loss": 0.294783333927393, + "epoch": 1.1077651977539062, + "grad_norm": 0.0009652974549680948, + "learning_rate": 6.464862823486328e-06, + "lookahead_loss": 6.249011724472046, + "loss": 0.3079, + "step": 456500 + }, + { + "base_loss": 0.32150769320130346, + "epoch": 1.1087188720703125, + "grad_norm": 0.0010076012695208192, + "learning_rate": 6.417179107666016e-06, + "lookahead_loss": 6.260827511787414, + "loss": 0.3333, + "step": 457000 + }, + { + "base_loss": 0.3191940434873104, + "epoch": 1.1096725463867188, + "grad_norm": 0.0009163662907667458, + "learning_rate": 6.369495391845704e-06, + "lookahead_loss": 6.215093274116516, + "loss": 0.328, + "step": 457500 + }, + { + "base_loss": 0.30270202097296717, + "epoch": 1.110626220703125, + "grad_norm": 0.0008867261931300163, + "learning_rate": 6.321811676025391e-06, + "lookahead_loss": 6.187451065540314, + "loss": 0.3138, + "step": 458000 + }, + { + "base_loss": 0.2974509707689285, + "epoch": 1.1115798950195312, + "grad_norm": 0.0009730788879096508, + "learning_rate": 6.274127960205078e-06, + "lookahead_loss": 6.312874153614044, + "loss": 0.3082, + "step": 458500 + }, + { + "base_loss": 0.3114223616421223, + "epoch": 1.1125335693359375, + "grad_norm": 0.0009278567740693688, + "learning_rate": 6.226444244384766e-06, + "lookahead_loss": 6.330186011314392, + "loss": 0.3226, + "step": 459000 + }, + { + "base_loss": 0.3443338246643543, + "epoch": 1.1134872436523438, + "grad_norm": 0.0009897375712171197, + "learning_rate": 6.1787605285644534e-06, + "lookahead_loss": 6.337146789073944, + "loss": 0.3514, + "step": 459500 + }, + { + "base_loss": 0.2939972540736198, + "epoch": 1.11444091796875, + "grad_norm": 0.0009331995388492942, + "learning_rate": 6.131076812744141e-06, + "lookahead_loss": 6.213301693439484, + "loss": 0.308, + "step": 460000 + }, + { + "epoch": 1.11444091796875, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.269182653091967, + "eval_lookahead_perplexity": 528.0456050050623, + "eval_loss": 0.14167781174182892, + "eval_perplexity": 1.1522053616895473, + "eval_runtime": 302.2149, + "eval_samples_per_second": 16.545, + "eval_steps_per_second": 0.519, + "step": 460000 + }, + { + "base_loss": 0.29841734063625336, + "epoch": 1.1153945922851562, + "grad_norm": 0.0009835686068981886, + "learning_rate": 6.083393096923829e-06, + "lookahead_loss": 6.2884809045791625, + "loss": 0.3111, + "step": 460500 + }, + { + "base_loss": 0.3147252712547779, + "epoch": 1.1163482666015625, + "grad_norm": 0.001012452645227313, + "learning_rate": 6.035709381103516e-06, + "lookahead_loss": 6.26203669834137, + "loss": 0.3248, + "step": 461000 + }, + { + "base_loss": 0.32950386153161526, + "epoch": 1.1173019409179688, + "grad_norm": 0.0009783547138795257, + "learning_rate": 5.988025665283203e-06, + "lookahead_loss": 6.272063705444336, + "loss": 0.3429, + "step": 461500 + }, + { + "base_loss": 0.30734304267168044, + "epoch": 1.118255615234375, + "grad_norm": 0.0010219624964520335, + "learning_rate": 5.940341949462891e-06, + "lookahead_loss": 6.304396607875824, + "loss": 0.3164, + "step": 462000 + }, + { + "base_loss": 0.3014386140704155, + "epoch": 1.1192092895507812, + "grad_norm": 0.0009705079719424248, + "learning_rate": 5.892658233642579e-06, + "lookahead_loss": 6.310923490047455, + "loss": 0.3126, + "step": 462500 + }, + { + "base_loss": 0.30611268219351767, + "epoch": 2.0009536743164062, + "grad_norm": 0.0009619362535886467, + "learning_rate": 5.844974517822266e-06, + "lookahead_loss": 6.335032467365265, + "loss": 0.3146, + "step": 463000 + }, + { + "base_loss": 0.301539769411087, + "epoch": 2.0019073486328125, + "grad_norm": 0.0010018181055784225, + "learning_rate": 5.797290802001953e-06, + "lookahead_loss": 6.165237899780274, + "loss": 0.3138, + "step": 463500 + }, + { + "base_loss": 0.31222748425602914, + "epoch": 2.0028610229492188, + "grad_norm": 0.0009731088066473603, + "learning_rate": 5.749607086181641e-06, + "lookahead_loss": 6.1653591270446775, + "loss": 0.3221, + "step": 464000 + }, + { + "base_loss": 0.32267384630441664, + "epoch": 2.003814697265625, + "grad_norm": 0.0009498675935901701, + "learning_rate": 5.7019233703613284e-06, + "lookahead_loss": 6.204732715129852, + "loss": 0.3348, + "step": 464500 + }, + { + "base_loss": 0.30016050645709036, + "epoch": 2.0047683715820312, + "grad_norm": 0.0009479303262196481, + "learning_rate": 5.654239654541016e-06, + "lookahead_loss": 6.178038363456726, + "loss": 0.3159, + "step": 465000 + }, + { + "epoch": 2.0047683715820312, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.268028480176347, + "eval_lookahead_perplexity": 527.4365006430484, + "eval_loss": 0.14167556166648865, + "eval_perplexity": 1.1522027691435928, + "eval_runtime": 292.4005, + "eval_samples_per_second": 17.1, + "eval_steps_per_second": 0.537, + "step": 465000 + }, + { + "base_loss": 0.3024714471399784, + "epoch": 2.0057220458984375, + "grad_norm": 0.0008406939450651407, + "learning_rate": 5.606555938720704e-06, + "lookahead_loss": 6.285563422203064, + "loss": 0.3112, + "step": 465500 + }, + { + "base_loss": 0.2964489733278751, + "epoch": 2.0066757202148438, + "grad_norm": 0.0009032645612023771, + "learning_rate": 5.558872222900391e-06, + "lookahead_loss": 6.164625645637512, + "loss": 0.3126, + "step": 466000 + }, + { + "base_loss": 0.31337857532501223, + "epoch": 2.00762939453125, + "grad_norm": 0.0009625664097256958, + "learning_rate": 5.511188507080078e-06, + "lookahead_loss": 6.221101438045502, + "loss": 0.3233, + "step": 466500 + }, + { + "base_loss": 0.3180972839295864, + "epoch": 2.0085830688476562, + "grad_norm": 0.0009384072618559003, + "learning_rate": 5.463504791259766e-06, + "lookahead_loss": 6.201664540290833, + "loss": 0.3226, + "step": 467000 + }, + { + "base_loss": 0.30493127757310867, + "epoch": 2.0095367431640625, + "grad_norm": 0.0009722619433887303, + "learning_rate": 5.415821075439454e-06, + "lookahead_loss": 6.221090071678161, + "loss": 0.318, + "step": 467500 + }, + { + "base_loss": 0.30099570405483245, + "epoch": 2.0104904174804688, + "grad_norm": 0.0009319408563897014, + "learning_rate": 5.368137359619141e-06, + "lookahead_loss": 6.196816132545472, + "loss": 0.3109, + "step": 468000 + }, + { + "base_loss": 0.30160990768671037, + "epoch": 2.011444091796875, + "grad_norm": 0.0010112961754202843, + "learning_rate": 5.320453643798828e-06, + "lookahead_loss": 6.200212286949157, + "loss": 0.3142, + "step": 468500 + }, + { + "base_loss": 0.32538792353868484, + "epoch": 2.0123977661132812, + "grad_norm": 0.0008800759678706527, + "learning_rate": 5.272769927978516e-06, + "lookahead_loss": 6.190517845630645, + "loss": 0.3353, + "step": 469000 + }, + { + "base_loss": 0.3040602553486824, + "epoch": 2.0133514404296875, + "grad_norm": 0.0009243732201866806, + "learning_rate": 5.2250862121582034e-06, + "lookahead_loss": 6.284952702045441, + "loss": 0.3189, + "step": 469500 + }, + { + "base_loss": 0.29813345649838446, + "epoch": 2.0143051147460938, + "grad_norm": 0.0009385402081534266, + "learning_rate": 5.177402496337891e-06, + "lookahead_loss": 6.231608829021454, + "loss": 0.3115, + "step": 470000 + }, + { + "epoch": 2.0143051147460938, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.266982959101376, + "eval_lookahead_perplexity": 526.8853428396731, + "eval_loss": 0.1416735202074051, + "eval_perplexity": 1.1522004169711846, + "eval_runtime": 269.3988, + "eval_samples_per_second": 18.56, + "eval_steps_per_second": 0.583, + "step": 470000 + }, + { + "base_loss": 0.2960759684741497, + "epoch": 2.0152587890625, + "grad_norm": 0.0008989098714664578, + "learning_rate": 5.129718780517579e-06, + "lookahead_loss": 6.189414587974548, + "loss": 0.3066, + "step": 470500 + }, + { + "base_loss": 0.31211792075634004, + "epoch": 2.0162124633789062, + "grad_norm": 0.0009662451921030879, + "learning_rate": 5.082035064697266e-06, + "lookahead_loss": 6.23908690071106, + "loss": 0.3224, + "step": 471000 + }, + { + "base_loss": 0.31110167542099953, + "epoch": 2.0171661376953125, + "grad_norm": 0.0009187129326164722, + "learning_rate": 5.034351348876953e-06, + "lookahead_loss": 6.273652675628662, + "loss": 0.3221, + "step": 471500 + }, + { + "base_loss": 0.2990322083234787, + "epoch": 2.0181198120117188, + "grad_norm": 0.0009293456678278744, + "learning_rate": 4.986667633056641e-06, + "lookahead_loss": 6.288145971298218, + "loss": 0.3115, + "step": 472000 + }, + { + "base_loss": 0.29806812533736227, + "epoch": 2.019073486328125, + "grad_norm": 0.0009703227551653981, + "learning_rate": 4.938983917236329e-06, + "lookahead_loss": 6.313042426109314, + "loss": 0.3094, + "step": 472500 + }, + { + "base_loss": 0.30187543269991873, + "epoch": 2.0200271606445312, + "grad_norm": 0.0010225786827504635, + "learning_rate": 4.891300201416016e-06, + "lookahead_loss": 6.147682514667511, + "loss": 0.3146, + "step": 473000 + }, + { + "base_loss": 0.32729279178380966, + "epoch": 2.0209808349609375, + "grad_norm": 0.0009684797842055559, + "learning_rate": 4.843616485595703e-06, + "lookahead_loss": 6.230285405635834, + "loss": 0.3372, + "step": 473500 + }, + { + "base_loss": 0.3057846530973911, + "epoch": 2.0219345092773438, + "grad_norm": 0.0009616024908609688, + "learning_rate": 4.795932769775391e-06, + "lookahead_loss": 6.194816417217255, + "loss": 0.314, + "step": 474000 + }, + { + "base_loss": 0.2997340569794178, + "epoch": 2.02288818359375, + "grad_norm": 0.0009833164513111115, + "learning_rate": 4.7482490539550784e-06, + "lookahead_loss": 6.231217849731445, + "loss": 0.3113, + "step": 474500 + }, + { + "base_loss": 0.30260268279910085, + "epoch": 2.0238418579101562, + "grad_norm": 0.0009483063477091491, + "learning_rate": 4.700565338134766e-06, + "lookahead_loss": 6.187247472763062, + "loss": 0.3137, + "step": 475000 + }, + { + "epoch": 2.0238418579101562, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.265868381189462, + "eval_lookahead_perplexity": 526.2984152234725, + "eval_loss": 0.14167135953903198, + "eval_perplexity": 1.1521979274508736, + "eval_runtime": 280.7547, + "eval_samples_per_second": 17.809, + "eval_steps_per_second": 0.559, + "step": 475000 + }, + { + "base_loss": 0.3236656226217747, + "epoch": 2.0247955322265625, + "grad_norm": 0.0009003359591588378, + "learning_rate": 4.652881622314453e-06, + "lookahead_loss": 6.210826418876648, + "loss": 0.3344, + "step": 475500 + }, + { + "base_loss": 0.30869458481669426, + "epoch": 2.0257492065429688, + "grad_norm": 0.0009723911061882973, + "learning_rate": 4.605197906494141e-06, + "lookahead_loss": 6.163979069232941, + "loss": 0.3224, + "step": 476000 + }, + { + "base_loss": 0.3019005296528339, + "epoch": 2.026702880859375, + "grad_norm": 0.00100004265550524, + "learning_rate": 4.557514190673828e-06, + "lookahead_loss": 6.190570694923401, + "loss": 0.311, + "step": 476500 + }, + { + "base_loss": 0.3077106066644192, + "epoch": 2.0276565551757812, + "grad_norm": 0.0009560610051266849, + "learning_rate": 4.509830474853516e-06, + "lookahead_loss": 6.293141817092896, + "loss": 0.318, + "step": 477000 + }, + { + "base_loss": 0.3280421564877033, + "epoch": 2.0286102294921875, + "grad_norm": 0.0009679990471340716, + "learning_rate": 4.462146759033204e-06, + "lookahead_loss": 6.31468297290802, + "loss": 0.3389, + "step": 477500 + }, + { + "base_loss": 0.30581475085020066, + "epoch": 2.0295639038085938, + "grad_norm": 0.0009467425406910479, + "learning_rate": 4.4144630432128904e-06, + "lookahead_loss": 6.2611168823242185, + "loss": 0.3136, + "step": 478000 + }, + { + "base_loss": 0.3068877322375774, + "epoch": 2.030517578125, + "grad_norm": 0.0009708070429041982, + "learning_rate": 4.366779327392578e-06, + "lookahead_loss": 6.253306629657746, + "loss": 0.3162, + "step": 478500 + }, + { + "base_loss": 0.3014947620034218, + "epoch": 2.0314712524414062, + "grad_norm": 0.0009608942782506347, + "learning_rate": 4.319095611572266e-06, + "lookahead_loss": 6.257045313835144, + "loss": 0.3138, + "step": 479000 + }, + { + "base_loss": 0.3173881909847259, + "epoch": 2.0324249267578125, + "grad_norm": 0.0009665957186371088, + "learning_rate": 4.2714118957519534e-06, + "lookahead_loss": 6.220450338363648, + "loss": 0.3333, + "step": 479500 + }, + { + "base_loss": 0.3059709269702435, + "epoch": 2.0333786010742188, + "grad_norm": 0.0009840476559475064, + "learning_rate": 4.223728179931641e-06, + "lookahead_loss": 6.266497844696045, + "loss": 0.3146, + "step": 480000 + }, + { + "epoch": 2.0333786010742188, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.265060663223267, + "eval_lookahead_perplexity": 525.8734861724328, + "eval_loss": 0.14166982471942902, + "eval_perplexity": 1.1521961590362653, + "eval_runtime": 293.9499, + "eval_samples_per_second": 17.01, + "eval_steps_per_second": 0.534, + "step": 480000 + }, + { + "base_loss": 0.3017133396565914, + "epoch": 2.034332275390625, + "grad_norm": 0.0009041751618497074, + "learning_rate": 4.176044464111328e-06, + "lookahead_loss": 6.33329191493988, + "loss": 0.3142, + "step": 480500 + }, + { + "base_loss": 0.31134800574183463, + "epoch": 2.0352859497070312, + "grad_norm": 0.0010052843717858195, + "learning_rate": 4.128360748291016e-06, + "lookahead_loss": 6.164324035167694, + "loss": 0.3224, + "step": 481000 + }, + { + "base_loss": 0.32387468561530114, + "epoch": 2.0362396240234375, + "grad_norm": 0.0009477023268118501, + "learning_rate": 4.080677032470703e-06, + "lookahead_loss": 6.275509401321411, + "loss": 0.3362, + "step": 481500 + }, + { + "base_loss": 0.3080780008882284, + "epoch": 2.0371932983398438, + "grad_norm": 0.0009896598057821393, + "learning_rate": 4.032993316650391e-06, + "lookahead_loss": 6.207332150936127, + "loss": 0.3192, + "step": 482000 + }, + { + "base_loss": 0.30180328992009164, + "epoch": 2.03814697265625, + "grad_norm": 0.0009810053743422031, + "learning_rate": 3.985309600830079e-06, + "lookahead_loss": 6.2532279634475705, + "loss": 0.3137, + "step": 482500 + }, + { + "base_loss": 0.30689890575408935, + "epoch": 2.0391006469726562, + "grad_norm": 0.0009287380962632596, + "learning_rate": 3.9376258850097654e-06, + "lookahead_loss": 6.21064705324173, + "loss": 0.3188, + "step": 483000 + }, + { + "base_loss": 0.32427770999073985, + "epoch": 2.0400543212890625, + "grad_norm": 0.0009327345178462565, + "learning_rate": 3.889942169189453e-06, + "lookahead_loss": 6.246191625595093, + "loss": 0.3331, + "step": 483500 + }, + { + "base_loss": 0.30682690465450285, + "epoch": 2.0410079956054688, + "grad_norm": 0.0009762158733792603, + "learning_rate": 3.842258453369141e-06, + "lookahead_loss": 6.171364137649536, + "loss": 0.3156, + "step": 484000 + }, + { + "base_loss": 0.29654143354296686, + "epoch": 2.041961669921875, + "grad_norm": 0.0009480268345214427, + "learning_rate": 3.7945747375488284e-06, + "lookahead_loss": 6.259895644187927, + "loss": 0.3079, + "step": 484500 + }, + { + "base_loss": 0.30721216344833374, + "epoch": 2.0429153442382812, + "grad_norm": 0.0009784712456166744, + "learning_rate": 3.7468910217285157e-06, + "lookahead_loss": 6.273552363395691, + "loss": 0.323, + "step": 485000 + }, + { + "epoch": 2.0429153442382812, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.264319371110715, + "eval_lookahead_perplexity": 525.4838047566577, + "eval_loss": 0.1416684091091156, + "eval_perplexity": 1.152194527976654, + "eval_runtime": 262.8066, + "eval_samples_per_second": 19.025, + "eval_steps_per_second": 0.597, + "step": 485000 + }, + { + "base_loss": 0.32630143281817436, + "epoch": 2.0438690185546875, + "grad_norm": 0.0009804986184462905, + "learning_rate": 3.6992073059082034e-06, + "lookahead_loss": 6.298372200012207, + "loss": 0.3415, + "step": 485500 + }, + { + "base_loss": 0.296696748316288, + "epoch": 2.0448226928710938, + "grad_norm": 0.0009967249352484941, + "learning_rate": 3.6515235900878906e-06, + "lookahead_loss": 6.2167662091255185, + "loss": 0.3097, + "step": 486000 + }, + { + "base_loss": 0.30323311913013457, + "epoch": 2.0457763671875, + "grad_norm": 0.000955309544224292, + "learning_rate": 3.6038398742675783e-06, + "lookahead_loss": 6.234561102390289, + "loss": 0.3159, + "step": 486500 + }, + { + "base_loss": 0.32944888742268086, + "epoch": 2.0467300415039062, + "grad_norm": 0.0009679795475676656, + "learning_rate": 3.556156158447266e-06, + "lookahead_loss": 6.20711869430542, + "loss": 0.3393, + "step": 487000 + }, + { + "base_loss": 0.32393511798977853, + "epoch": 2.0476837158203125, + "grad_norm": 0.000986600760370493, + "learning_rate": 3.508472442626953e-06, + "lookahead_loss": 6.246892070293426, + "loss": 0.3393, + "step": 487500 + }, + { + "base_loss": 0.293301939278841, + "epoch": 2.0486373901367188, + "grad_norm": 0.0009528908412903547, + "learning_rate": 3.460788726806641e-06, + "lookahead_loss": 6.20588248538971, + "loss": 0.3056, + "step": 488000 + }, + { + "base_loss": 0.3036652799248695, + "epoch": 2.049591064453125, + "grad_norm": 0.001002751407213509, + "learning_rate": 3.413105010986328e-06, + "lookahead_loss": 6.184293527126313, + "loss": 0.3171, + "step": 488500 + }, + { + "base_loss": 0.3174412237107754, + "epoch": 2.0505447387695312, + "grad_norm": 0.0009026027983054519, + "learning_rate": 3.3654212951660158e-06, + "lookahead_loss": 6.298267126560211, + "loss": 0.332, + "step": 489000 + }, + { + "base_loss": 0.30474287942051886, + "epoch": 2.0514984130859375, + "grad_norm": 0.0009782775305211544, + "learning_rate": 3.3177375793457034e-06, + "lookahead_loss": 6.237424486160278, + "loss": 0.3178, + "step": 489500 + }, + { + "base_loss": 0.30692395463585853, + "epoch": 2.0524520874023438, + "grad_norm": 0.001052686246111989, + "learning_rate": 3.2700538635253907e-06, + "lookahead_loss": 6.181076605796814, + "loss": 0.3177, + "step": 490000 + }, + { + "epoch": 2.0524520874023438, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.263757377386855, + "eval_lookahead_perplexity": 525.1885691244512, + "eval_loss": 0.1416672170162201, + "eval_perplexity": 1.1521931544545614, + "eval_runtime": 329.3149, + "eval_samples_per_second": 15.183, + "eval_steps_per_second": 0.477, + "step": 490000 + }, + { + "base_loss": 0.32256214889883994, + "epoch": 2.05340576171875, + "grad_norm": 0.0009564721258357167, + "learning_rate": 3.2223701477050784e-06, + "lookahead_loss": 6.204883997440338, + "loss": 0.3312, + "step": 490500 + }, + { + "base_loss": 0.3550116382241249, + "epoch": 2.0543594360351562, + "grad_norm": 0.0009583597420714796, + "learning_rate": 3.1746864318847656e-06, + "lookahead_loss": 6.232365051269531, + "loss": 0.3693, + "step": 491000 + }, + { + "base_loss": 0.2970747436285019, + "epoch": 2.0553131103515625, + "grad_norm": 0.0009528571390546858, + "learning_rate": 3.1270027160644533e-06, + "lookahead_loss": 6.238228107452392, + "loss": 0.308, + "step": 491500 + }, + { + "base_loss": 0.30645539990067483, + "epoch": 2.0562667846679688, + "grad_norm": 0.000955388299189508, + "learning_rate": 3.079319000244141e-06, + "lookahead_loss": 6.246179574489593, + "loss": 0.3167, + "step": 492000 + }, + { + "base_loss": 0.31723022189736366, + "epoch": 2.057220458984375, + "grad_norm": 0.0009772485354915261, + "learning_rate": 3.031635284423828e-06, + "lookahead_loss": 6.266298104286194, + "loss": 0.3307, + "step": 492500 + }, + { + "base_loss": 0.3193240025639534, + "epoch": 2.0581741333007812, + "grad_norm": 0.0009720840025693178, + "learning_rate": 2.983951568603516e-06, + "lookahead_loss": 6.26861172580719, + "loss": 0.3271, + "step": 493000 + }, + { + "base_loss": 0.2937832759618759, + "epoch": 2.0591278076171875, + "grad_norm": 0.0009176793391816318, + "learning_rate": 2.936267852783203e-06, + "lookahead_loss": 6.168696069717408, + "loss": 0.3071, + "step": 493500 + }, + { + "base_loss": 0.30271227744221685, + "epoch": 2.0600814819335938, + "grad_norm": 0.0009546041255816817, + "learning_rate": 2.8885841369628908e-06, + "lookahead_loss": 6.2382691907882695, + "loss": 0.3164, + "step": 494000 + }, + { + "base_loss": 0.3198817696869373, + "epoch": 2.06103515625, + "grad_norm": 0.000980038894340396, + "learning_rate": 2.8409004211425784e-06, + "lookahead_loss": 6.195622821807861, + "loss": 0.3306, + "step": 494500 + }, + { + "base_loss": 0.3065698970258236, + "epoch": 2.0619888305664062, + "grad_norm": 0.0010103557724505663, + "learning_rate": 2.7932167053222657e-06, + "lookahead_loss": 6.207577312469483, + "loss": 0.3147, + "step": 495000 + }, + { + "epoch": 2.0619888305664062, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.263126853936777, + "eval_lookahead_perplexity": 524.857529790904, + "eval_loss": 0.14166606962680817, + "eval_perplexity": 1.152191832441094, + "eval_runtime": 261.5077, + "eval_samples_per_second": 19.12, + "eval_steps_per_second": 0.6, + "step": 495000 + }, + { + "base_loss": 0.30793745544552803, + "epoch": 2.0629425048828125, + "grad_norm": 0.0010035582818090916, + "learning_rate": 2.7455329895019534e-06, + "lookahead_loss": 6.2057581758499145, + "loss": 0.3177, + "step": 495500 + }, + { + "base_loss": 0.3166033121049404, + "epoch": 2.0638961791992188, + "grad_norm": 0.0009549797978252172, + "learning_rate": 2.6978492736816406e-06, + "lookahead_loss": 6.229872955322266, + "loss": 0.3283, + "step": 496000 + }, + { + "base_loss": 0.30278992640972135, + "epoch": 2.064849853515625, + "grad_norm": 0.0009489752119407058, + "learning_rate": 2.6501655578613283e-06, + "lookahead_loss": 6.260746804714203, + "loss": 0.318, + "step": 496500 + }, + { + "base_loss": 0.30789859166741373, + "epoch": 2.0658035278320312, + "grad_norm": 0.0009766009170562029, + "learning_rate": 2.602481842041016e-06, + "lookahead_loss": 6.182268424510956, + "loss": 0.3194, + "step": 497000 + }, + { + "base_loss": 0.307515013217926, + "epoch": 2.0667572021484375, + "grad_norm": 0.0009974743006750941, + "learning_rate": 2.554798126220703e-06, + "lookahead_loss": 6.173391879081726, + "loss": 0.3162, + "step": 497500 + }, + { + "base_loss": 0.3304452752768993, + "epoch": 2.0677108764648438, + "grad_norm": 0.0010042872745543718, + "learning_rate": 2.507114410400391e-06, + "lookahead_loss": 6.237967335224152, + "loss": 0.3422, + "step": 498000 + }, + { + "base_loss": 0.3000219973921776, + "epoch": 2.06866455078125, + "grad_norm": 0.0010214201174676418, + "learning_rate": 2.459430694580078e-06, + "lookahead_loss": 6.208795167446136, + "loss": 0.3092, + "step": 498500 + }, + { + "base_loss": 0.3050138043165207, + "epoch": 2.0696182250976562, + "grad_norm": 0.0009576964075677097, + "learning_rate": 2.4117469787597658e-06, + "lookahead_loss": 6.242776551246643, + "loss": 0.3164, + "step": 499000 + }, + { + "base_loss": 0.34709723374247553, + "epoch": 2.0705718994140625, + "grad_norm": 0.000993231893517077, + "learning_rate": 2.3640632629394534e-06, + "lookahead_loss": 6.156780051231384, + "loss": 0.3596, + "step": 499500 + }, + { + "base_loss": 0.31454886627197265, + "epoch": 2.0715255737304688, + "grad_norm": 0.0009543150081299245, + "learning_rate": 2.3163795471191407e-06, + "lookahead_loss": 6.195955769062042, + "loss": 0.3257, + "step": 500000 + }, + { + "epoch": 2.0715255737304688, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.26263678035797, + "eval_lookahead_perplexity": 524.6003740006914, + "eval_loss": 0.14166513085365295, + "eval_perplexity": 1.1521907507948397, + "eval_runtime": 275.4101, + "eval_samples_per_second": 18.155, + "eval_steps_per_second": 0.57, + "step": 500000 + }, + { + "base_loss": 0.30621468406915664, + "epoch": 2.072479248046875, + "grad_norm": 0.0009792483178898692, + "learning_rate": 2.2686958312988284e-06, + "lookahead_loss": 6.2803432626724245, + "loss": 0.3179, + "step": 500500 + }, + { + "base_loss": 0.3062588813006878, + "epoch": 2.0734329223632812, + "grad_norm": 0.0009362597484141588, + "learning_rate": 2.2210121154785156e-06, + "lookahead_loss": 6.2517331213951115, + "loss": 0.3185, + "step": 501000 + }, + { + "base_loss": 0.3277868445813656, + "epoch": 2.0743865966796875, + "grad_norm": 0.0009583532810211182, + "learning_rate": 2.1733283996582033e-06, + "lookahead_loss": 6.266947806835175, + "loss": 0.3402, + "step": 501500 + }, + { + "base_loss": 0.30303199696540833, + "epoch": 2.0753402709960938, + "grad_norm": 0.0009691590094007552, + "learning_rate": 2.125644683837891e-06, + "lookahead_loss": 6.343798541069031, + "loss": 0.3149, + "step": 502000 + }, + { + "base_loss": 0.30761926966905595, + "epoch": 2.0762939453125, + "grad_norm": 0.0009193113073706627, + "learning_rate": 2.077960968017578e-06, + "lookahead_loss": 6.292341439723969, + "loss": 0.3183, + "step": 502500 + }, + { + "base_loss": 0.33150802648067473, + "epoch": 2.0772476196289062, + "grad_norm": 0.0009670378058217466, + "learning_rate": 2.030277252197266e-06, + "lookahead_loss": 6.284011145591736, + "loss": 0.344, + "step": 503000 + }, + { + "base_loss": 0.30574921500682833, + "epoch": 2.0782012939453125, + "grad_norm": 0.0009755421779118478, + "learning_rate": 1.982593536376953e-06, + "lookahead_loss": 6.284181317806244, + "loss": 0.3172, + "step": 503500 + }, + { + "base_loss": 0.2994054418802261, + "epoch": 2.0791549682617188, + "grad_norm": 0.0009333517518825829, + "learning_rate": 1.9349098205566408e-06, + "lookahead_loss": 6.281347653388977, + "loss": 0.3131, + "step": 504000 + }, + { + "base_loss": 0.31194803246855735, + "epoch": 2.080108642578125, + "grad_norm": 0.000949243491049856, + "learning_rate": 1.8872261047363282e-06, + "lookahead_loss": 6.265330550193787, + "loss": 0.3291, + "step": 504500 + }, + { + "base_loss": 0.3244352611005306, + "epoch": 2.0810623168945312, + "grad_norm": 0.0010095882462337613, + "learning_rate": 1.8395423889160157e-06, + "lookahead_loss": 6.2886764497756955, + "loss": 0.3387, + "step": 505000 + }, + { + "epoch": 2.0810623168945312, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.262195241337006, + "eval_lookahead_perplexity": 524.3687935948129, + "eval_loss": 0.1416643261909485, + "eval_perplexity": 1.1521898236702872, + "eval_runtime": 285.1859, + "eval_samples_per_second": 17.532, + "eval_steps_per_second": 0.551, + "step": 505000 + }, + { + "base_loss": 0.30112930870056154, + "epoch": 2.0820159912109375, + "grad_norm": 0.0009192558936774731, + "learning_rate": 1.7918586730957031e-06, + "lookahead_loss": 6.295443790435791, + "loss": 0.3128, + "step": 505500 + }, + { + "base_loss": 0.30448419651389125, + "epoch": 2.0829696655273438, + "grad_norm": 0.000990957603789866, + "learning_rate": 1.7441749572753908e-06, + "lookahead_loss": 6.325591234683991, + "loss": 0.3178, + "step": 506000 + }, + { + "base_loss": 0.33415990057587625, + "epoch": 2.08392333984375, + "grad_norm": 0.0009418633999302983, + "learning_rate": 1.6964912414550783e-06, + "lookahead_loss": 6.339192730903625, + "loss": 0.3452, + "step": 506500 + }, + { + "base_loss": 0.31056174263358116, + "epoch": 2.0848770141601562, + "grad_norm": 0.000988660496659577, + "learning_rate": 1.6488075256347657e-06, + "lookahead_loss": 6.263816433906555, + "loss": 0.3206, + "step": 507000 + }, + { + "base_loss": 0.2973758824914694, + "epoch": 2.0858306884765625, + "grad_norm": 0.001018465030938387, + "learning_rate": 1.6011238098144532e-06, + "lookahead_loss": 6.263220739364624, + "loss": 0.3086, + "step": 507500 + }, + { + "base_loss": 0.3042152850329876, + "epoch": 2.0867843627929688, + "grad_norm": 0.0010985544649884105, + "learning_rate": 1.5534400939941406e-06, + "lookahead_loss": 6.2364231939315795, + "loss": 0.3137, + "step": 508000 + }, + { + "base_loss": 0.337257578343153, + "epoch": 2.087738037109375, + "grad_norm": 0.0008788631530478597, + "learning_rate": 1.505756378173828e-06, + "lookahead_loss": 6.284335279464722, + "loss": 0.3438, + "step": 508500 + }, + { + "base_loss": 0.3000968562066555, + "epoch": 2.0886917114257812, + "grad_norm": 0.0010022291680797935, + "learning_rate": 1.4580726623535158e-06, + "lookahead_loss": 6.2766430473327635, + "loss": 0.3112, + "step": 509000 + }, + { + "base_loss": 0.3103375973403454, + "epoch": 2.0896453857421875, + "grad_norm": 0.0009887145133689046, + "learning_rate": 1.4103889465332032e-06, + "lookahead_loss": 6.313614233970642, + "loss": 0.3183, + "step": 509500 + }, + { + "base_loss": 0.29991284269094465, + "epoch": 2.0905990600585938, + "grad_norm": 0.0009741162066347897, + "learning_rate": 1.3627052307128907e-06, + "lookahead_loss": 6.287968347549438, + "loss": 0.3118, + "step": 510000 + }, + { + "epoch": 2.0905990600585938, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.261872408869929, + "eval_lookahead_perplexity": 524.1995376456467, + "eval_loss": 0.14166373014450073, + "eval_perplexity": 1.1521891369118402, + "eval_runtime": 266.5991, + "eval_samples_per_second": 18.755, + "eval_steps_per_second": 0.589, + "step": 510000 + }, + { + "base_loss": 0.3000768061578274, + "epoch": 2.091552734375, + "grad_norm": 0.0008802406955510378, + "learning_rate": 1.3150215148925781e-06, + "lookahead_loss": 6.250903000354767, + "loss": 0.3106, + "step": 510500 + }, + { + "base_loss": 0.31945454320311545, + "epoch": 2.0925064086914062, + "grad_norm": 0.0009589286637492478, + "learning_rate": 1.2673377990722656e-06, + "lookahead_loss": 6.295792224884033, + "loss": 0.3331, + "step": 511000 + }, + { + "base_loss": 0.309114942163229, + "epoch": 2.0934600830078125, + "grad_norm": 0.0008792807930149138, + "learning_rate": 1.2196540832519533e-06, + "lookahead_loss": 6.295494238376618, + "loss": 0.3194, + "step": 511500 + }, + { + "base_loss": 0.28753899577260017, + "epoch": 2.0944137573242188, + "grad_norm": 0.000960128556471318, + "learning_rate": 1.1719703674316407e-06, + "lookahead_loss": 6.281352967262268, + "loss": 0.3032, + "step": 512000 + }, + { + "base_loss": 0.29245217123627665, + "epoch": 2.095367431640625, + "grad_norm": 0.0009780465625226498, + "learning_rate": 1.1242866516113282e-06, + "lookahead_loss": 6.217807303905487, + "loss": 0.3078, + "step": 512500 + }, + { + "base_loss": 0.30112256136536597, + "epoch": 2.0963211059570312, + "grad_norm": 0.0009418194531463087, + "learning_rate": 1.0766029357910156e-06, + "lookahead_loss": 6.289472890853882, + "loss": 0.3133, + "step": 513000 + }, + { + "base_loss": 0.3297825155258179, + "epoch": 2.0972747802734375, + "grad_norm": 0.0009738854714669287, + "learning_rate": 1.028919219970703e-06, + "lookahead_loss": 6.311433359146118, + "loss": 0.3409, + "step": 513500 + }, + { + "base_loss": 0.2911633634865284, + "epoch": 2.0982284545898438, + "grad_norm": 0.0009711748571135104, + "learning_rate": 9.812355041503908e-07, + "lookahead_loss": 6.2425033135414125, + "loss": 0.306, + "step": 514000 + }, + { + "base_loss": 0.2934698580801487, + "epoch": 2.09918212890625, + "grad_norm": 0.0009796826634556055, + "learning_rate": 9.335517883300781e-07, + "lookahead_loss": 6.291777185440063, + "loss": 0.3096, + "step": 514500 + }, + { + "base_loss": 0.3032768616080284, + "epoch": 2.1001358032226562, + "grad_norm": 0.0009275350021198392, + "learning_rate": 8.858680725097657e-07, + "lookahead_loss": 6.298918974876404, + "loss": 0.3147, + "step": 515000 + }, + { + "epoch": 2.1001358032226562, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.261701932730385, + "eval_lookahead_perplexity": 524.1101817488584, + "eval_loss": 0.1416633427143097, + "eval_perplexity": 1.1521886905190692, + "eval_runtime": 277.4688, + "eval_samples_per_second": 18.02, + "eval_steps_per_second": 0.566, + "step": 515000 + }, + { + "base_loss": 0.3233104472160339, + "epoch": 2.1010894775390625, + "grad_norm": 0.0009578253957442939, + "learning_rate": 8.381843566894531e-07, + "lookahead_loss": 6.362341589927674, + "loss": 0.333, + "step": 515500 + }, + { + "base_loss": 0.30158160945773127, + "epoch": 2.1020431518554688, + "grad_norm": 0.0009492792305536568, + "learning_rate": 7.905006408691407e-07, + "lookahead_loss": 6.3039414925575255, + "loss": 0.3126, + "step": 516000 + }, + { + "base_loss": 0.29719595339894295, + "epoch": 2.102996826171875, + "grad_norm": 0.0009683790849521756, + "learning_rate": 7.428169250488282e-07, + "lookahead_loss": 6.287424234390259, + "loss": 0.3125, + "step": 516500 + }, + { + "base_loss": 0.3008777514696121, + "epoch": 2.1039505004882812, + "grad_norm": 0.0009997799061238766, + "learning_rate": 6.951332092285156e-07, + "lookahead_loss": 6.316920805931091, + "loss": 0.3115, + "step": 517000 + }, + { + "base_loss": 0.31517351168394087, + "epoch": 2.1049041748046875, + "grad_norm": 0.0009900464210659266, + "learning_rate": 6.474494934082032e-07, + "lookahead_loss": 6.248059861183166, + "loss": 0.3275, + "step": 517500 + }, + { + "base_loss": 0.3079349631667137, + "epoch": 2.1058578491210938, + "grad_norm": 0.0009343238198198378, + "learning_rate": 5.997657775878906e-07, + "lookahead_loss": 6.2560534324646, + "loss": 0.3205, + "step": 518000 + }, + { + "base_loss": 0.29828172570466993, + "epoch": 2.1068115234375, + "grad_norm": 0.0009323744452558458, + "learning_rate": 5.520820617675782e-07, + "lookahead_loss": 6.324647262573242, + "loss": 0.3082, + "step": 518500 + }, + { + "base_loss": 0.29354105108976364, + "epoch": 2.1077651977539062, + "grad_norm": 0.0009788897586986423, + "learning_rate": 5.043983459472657e-07, + "lookahead_loss": 6.246097377300263, + "loss": 0.3059, + "step": 519000 + }, + { + "base_loss": 0.3199558552503586, + "epoch": 2.1087188720703125, + "grad_norm": 0.0010072775185108185, + "learning_rate": 4.5671463012695317e-07, + "lookahead_loss": 6.2578262209892275, + "loss": 0.3328, + "step": 519500 + }, + { + "base_loss": 0.31955078572034834, + "epoch": 2.1096725463867188, + "grad_norm": 0.0009075988782569766, + "learning_rate": 4.0903091430664063e-07, + "lookahead_loss": 6.215923315525055, + "loss": 0.3287, + "step": 520000 + }, + { + "epoch": 2.1096725463867188, + "eval_accuracy": 0.002520743639921722, + "eval_base_loss": 0.1298022617856725, + "eval_base_perplexity": 1.1386032156965338, + "eval_lookahead_loss": 6.261576982351919, + "eval_lookahead_perplexity": 524.0446980744811, + "eval_loss": 0.1416631042957306, + "eval_perplexity": 1.1521884158159117, + "eval_runtime": 280.4129, + "eval_samples_per_second": 17.831, + "eval_steps_per_second": 0.56, + "step": 520000 + }, + { + "base_loss": 0.30168747982382776, + "epoch": 2.110626220703125, + "grad_norm": 0.0009171248530037701, + "learning_rate": 3.6134719848632814e-07, + "lookahead_loss": 6.176019702911377, + "loss": 0.3148, + "step": 520500 + }, + { + "base_loss": 0.29647291442751883, + "epoch": 2.1115798950195312, + "grad_norm": 0.001002452103421092, + "learning_rate": 3.1366348266601565e-07, + "lookahead_loss": 6.290383939743042, + "loss": 0.3072, + "step": 521000 + }, + { + "base_loss": 0.3107553372234106, + "epoch": 2.1125335693359375, + "grad_norm": 0.0009536948637105525, + "learning_rate": 2.6597976684570316e-07, + "lookahead_loss": 6.31272796344757, + "loss": 0.322, + "step": 521500 + }, + { + "base_loss": 0.34275346267223356, + "epoch": 2.1134872436523438, + "grad_norm": 0.001009950996376574, + "learning_rate": 2.1829605102539064e-07, + "lookahead_loss": 6.32132270526886, + "loss": 0.3504, + "step": 522000 + }, + { + "base_loss": 0.29299941608309743, + "epoch": 2.11444091796875, + "grad_norm": 0.0009339774260297418, + "learning_rate": 1.7061233520507813e-07, + "lookahead_loss": 6.207951300144195, + "loss": 0.3073, + "step": 522500 + }, + { + "base_loss": 0.29959108304977417, + "epoch": 2.1153945922851562, + "grad_norm": 0.0009626490063965321, + "learning_rate": 1.2292861938476564e-07, + "lookahead_loss": 6.268310222625733, + "loss": 0.3103, + "step": 523000 + }, + { + "base_loss": 0.3146350940167904, + "epoch": 2.1163482666015625, + "grad_norm": 0.0010090811410918832, + "learning_rate": 7.524490356445312e-08, + "lookahead_loss": 6.263385523796082, + "loss": 0.3238, + "step": 523500 + }, + { + "base_loss": 0.32594672916829587, + "epoch": 2.1173019409179688, + "grad_norm": 0.0009779389947652817, + "learning_rate": 2.7561187744140627e-08, + "lookahead_loss": 6.26274676322937, + "loss": 0.3421, + "step": 524000 + }, + { + "epoch": 2.1178512573242188, + "step": 524288, + "total_flos": 4.036319640756106e+19, + "train_loss": 0.07611130246368703, + "train_runtime": 105022.8966, + "train_samples_per_second": 159.748, + "train_steps_per_second": 4.992 + } + ], + "logging_steps": 500, + "max_steps": 524288, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.036319640756106e+19, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}