| { |
| "best_global_step": null, |
| "best_metric": 0.1416620910167694, |
| "best_model_checkpoint": null, |
| "epoch": 2.1083145141601562, |
| "eval_steps": 5000, |
| "global_step": 524288, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "base_loss": 0.3044133240580559, |
| "epoch": 0.00095367431640625, |
| "grad_norm": 0.0010083107044920325, |
| "learning_rate": 4.995241165161133e-05, |
| "lookahead_loss": 10.315906455993652, |
| "loss": 0.3208, |
| "step": 500 |
| }, |
| { |
| "base_loss": 0.30059696701169014, |
| "epoch": 0.0019073486328125, |
| "grad_norm": 0.0010226964950561523, |
| "learning_rate": 4.990472793579102e-05, |
| "lookahead_loss": 10.178516641616822, |
| "loss": 0.3205, |
| "step": 1000 |
| }, |
| { |
| "base_loss": 0.31169990518689156, |
| "epoch": 0.00286102294921875, |
| "grad_norm": 0.001013169065117836, |
| "learning_rate": 4.98570442199707e-05, |
| "lookahead_loss": 10.051177593231202, |
| "loss": 0.3281, |
| "step": 1500 |
| }, |
| { |
| "base_loss": 0.3227726019620895, |
| "epoch": 0.003814697265625, |
| "grad_norm": 0.0010217369999736547, |
| "learning_rate": 4.9809360504150393e-05, |
| "lookahead_loss": 9.926475008010865, |
| "loss": 0.3417, |
| "step": 2000 |
| }, |
| { |
| "base_loss": 0.3022470915019512, |
| "epoch": 0.00476837158203125, |
| "grad_norm": 0.0010057457257062197, |
| "learning_rate": 4.9761676788330084e-05, |
| "lookahead_loss": 9.79694257736206, |
| "loss": 0.3232, |
| "step": 2500 |
| }, |
| { |
| "base_loss": 0.30552061820030213, |
| "epoch": 0.0057220458984375, |
| "grad_norm": 0.0008910459582693875, |
| "learning_rate": 4.971399307250977e-05, |
| "lookahead_loss": 9.700162549972534, |
| "loss": 0.3197, |
| "step": 3000 |
| }, |
| { |
| "base_loss": 0.2953472335338593, |
| "epoch": 0.00667572021484375, |
| "grad_norm": 0.001025513163767755, |
| "learning_rate": 4.966630935668946e-05, |
| "lookahead_loss": 9.54606210899353, |
| "loss": 0.3201, |
| "step": 3500 |
| }, |
| { |
| "base_loss": 0.312746944963932, |
| "epoch": 0.00762939453125, |
| "grad_norm": 0.0010036778403446078, |
| "learning_rate": 4.961862564086914e-05, |
| "lookahead_loss": 9.464179010391236, |
| "loss": 0.3296, |
| "step": 4000 |
| }, |
| { |
| "base_loss": 0.3169711889922619, |
| "epoch": 0.00858306884765625, |
| "grad_norm": 0.0009707122226245701, |
| "learning_rate": 4.957094192504883e-05, |
| "lookahead_loss": 9.352066455841065, |
| "loss": 0.3273, |
| "step": 4500 |
| }, |
| { |
| "base_loss": 0.306710629016161, |
| "epoch": 0.0095367431640625, |
| "grad_norm": 0.001031655934639275, |
| "learning_rate": 4.952325820922852e-05, |
| "lookahead_loss": 9.252665700912475, |
| "loss": 0.3249, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.0095367431640625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 9.162839600072502, |
| "eval_lookahead_perplexity": 9536.09737037476, |
| "eval_loss": 0.147327721118927, |
| "eval_perplexity": 1.158733642297792, |
| "eval_runtime": 480.4293, |
| "eval_samples_per_second": 10.407, |
| "eval_steps_per_second": 0.327, |
| "step": 5000 |
| }, |
| { |
| "base_loss": 0.30083237382769584, |
| "epoch": 0.01049041748046875, |
| "grad_norm": 0.0010012760758399963, |
| "learning_rate": 4.9475574493408205e-05, |
| "lookahead_loss": 9.12458013534546, |
| "loss": 0.316, |
| "step": 5500 |
| }, |
| { |
| "base_loss": 0.2993237827420235, |
| "epoch": 0.011444091796875, |
| "grad_norm": 0.001035403460264206, |
| "learning_rate": 4.9427890777587895e-05, |
| "lookahead_loss": 9.04572417831421, |
| "loss": 0.3185, |
| "step": 6000 |
| }, |
| { |
| "base_loss": 0.3238567093908787, |
| "epoch": 0.01239776611328125, |
| "grad_norm": 0.0008969915215857327, |
| "learning_rate": 4.938020706176758e-05, |
| "lookahead_loss": 8.952016605377198, |
| "loss": 0.3386, |
| "step": 6500 |
| }, |
| { |
| "base_loss": 0.3051931007504463, |
| "epoch": 0.0133514404296875, |
| "grad_norm": 0.000971041910815984, |
| "learning_rate": 4.933252334594727e-05, |
| "lookahead_loss": 8.886044243812561, |
| "loss": 0.3244, |
| "step": 7000 |
| }, |
| { |
| "base_loss": 0.29808008483052256, |
| "epoch": 0.01430511474609375, |
| "grad_norm": 0.0009857703698799014, |
| "learning_rate": 4.928483963012696e-05, |
| "lookahead_loss": 8.771625858306885, |
| "loss": 0.3177, |
| "step": 7500 |
| }, |
| { |
| "base_loss": 0.29345863962173463, |
| "epoch": 0.0152587890625, |
| "grad_norm": 0.0009497535647824407, |
| "learning_rate": 4.923715591430664e-05, |
| "lookahead_loss": 8.633492926597595, |
| "loss": 0.3098, |
| "step": 8000 |
| }, |
| { |
| "base_loss": 0.3092884007692337, |
| "epoch": 0.01621246337890625, |
| "grad_norm": 0.0010520165087655187, |
| "learning_rate": 4.918947219848633e-05, |
| "lookahead_loss": 8.621233073234558, |
| "loss": 0.3252, |
| "step": 8500 |
| }, |
| { |
| "base_loss": 0.31143338218331335, |
| "epoch": 0.0171661376953125, |
| "grad_norm": 0.0009231261792592704, |
| "learning_rate": 4.9141788482666016e-05, |
| "lookahead_loss": 8.558024926185608, |
| "loss": 0.3269, |
| "step": 9000 |
| }, |
| { |
| "base_loss": 0.3001442384421825, |
| "epoch": 0.01811981201171875, |
| "grad_norm": 0.0009771535405889153, |
| "learning_rate": 4.9094104766845706e-05, |
| "lookahead_loss": 8.481963250160216, |
| "loss": 0.3153, |
| "step": 9500 |
| }, |
| { |
| "base_loss": 0.2986592257618904, |
| "epoch": 0.019073486328125, |
| "grad_norm": 0.000987049425020814, |
| "learning_rate": 4.9046421051025396e-05, |
| "lookahead_loss": 8.409450398445129, |
| "loss": 0.3149, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.019073486328125, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 8.298678596179705, |
| "eval_lookahead_perplexity": 4018.5587449798586, |
| "eval_loss": 0.14565235376358032, |
| "eval_perplexity": 1.1567939630712722, |
| "eval_runtime": 491.9, |
| "eval_samples_per_second": 10.165, |
| "eval_steps_per_second": 0.319, |
| "step": 10000 |
| }, |
| { |
| "base_loss": 0.30347599306702616, |
| "epoch": 0.02002716064453125, |
| "grad_norm": 0.0010771030792966485, |
| "learning_rate": 4.899873733520508e-05, |
| "lookahead_loss": 8.271695964813233, |
| "loss": 0.3189, |
| "step": 10500 |
| }, |
| { |
| "base_loss": 0.3299741225540638, |
| "epoch": 0.0209808349609375, |
| "grad_norm": 0.0009491143864579499, |
| "learning_rate": 4.895105361938477e-05, |
| "lookahead_loss": 8.27685186958313, |
| "loss": 0.344, |
| "step": 11000 |
| }, |
| { |
| "base_loss": 0.3070560489296913, |
| "epoch": 0.02193450927734375, |
| "grad_norm": 0.0009909559739753604, |
| "learning_rate": 4.890336990356445e-05, |
| "lookahead_loss": 8.1793439950943, |
| "loss": 0.3199, |
| "step": 11500 |
| }, |
| { |
| "base_loss": 0.301061170309782, |
| "epoch": 0.02288818359375, |
| "grad_norm": 0.001020422438159585, |
| "learning_rate": 4.8855686187744143e-05, |
| "lookahead_loss": 8.126035836219788, |
| "loss": 0.3167, |
| "step": 12000 |
| }, |
| { |
| "base_loss": 0.30337609922885894, |
| "epoch": 0.02384185791015625, |
| "grad_norm": 0.0009905572514981031, |
| "learning_rate": 4.8808002471923834e-05, |
| "lookahead_loss": 8.095245086669921, |
| "loss": 0.3178, |
| "step": 12500 |
| }, |
| { |
| "base_loss": 0.3241444931924343, |
| "epoch": 0.0247955322265625, |
| "grad_norm": 0.0009353117784485221, |
| "learning_rate": 4.876031875610352e-05, |
| "lookahead_loss": 8.033969619750977, |
| "loss": 0.3394, |
| "step": 13000 |
| }, |
| { |
| "base_loss": 0.3070600248277187, |
| "epoch": 0.02574920654296875, |
| "grad_norm": 0.000984247657470405, |
| "learning_rate": 4.871263504028321e-05, |
| "lookahead_loss": 7.95574036693573, |
| "loss": 0.3244, |
| "step": 13500 |
| }, |
| { |
| "base_loss": 0.3022406686246395, |
| "epoch": 0.026702880859375, |
| "grad_norm": 0.001025758683681488, |
| "learning_rate": 4.866495132446289e-05, |
| "lookahead_loss": 7.927939188957215, |
| "loss": 0.3141, |
| "step": 14000 |
| }, |
| { |
| "base_loss": 0.30680677881836893, |
| "epoch": 0.02765655517578125, |
| "grad_norm": 0.0009658489725552499, |
| "learning_rate": 4.861726760864258e-05, |
| "lookahead_loss": 7.988002327919006, |
| "loss": 0.3219, |
| "step": 14500 |
| }, |
| { |
| "base_loss": 0.33426042160391806, |
| "epoch": 0.0286102294921875, |
| "grad_norm": 0.0010099124629050493, |
| "learning_rate": 4.856958389282227e-05, |
| "lookahead_loss": 7.973947680473327, |
| "loss": 0.3455, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.0286102294921875, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 7.854276204642396, |
| "eval_lookahead_perplexity": 2576.729413719151, |
| "eval_loss": 0.1447858065366745, |
| "eval_perplexity": 1.1557919806657084, |
| "eval_runtime": 483.93, |
| "eval_samples_per_second": 10.332, |
| "eval_steps_per_second": 0.324, |
| "step": 15000 |
| }, |
| { |
| "base_loss": 0.3049518305659294, |
| "epoch": 0.02956390380859375, |
| "grad_norm": 0.0009620142518542707, |
| "learning_rate": 4.8521900177001955e-05, |
| "lookahead_loss": 7.91404645729065, |
| "loss": 0.3161, |
| "step": 15500 |
| }, |
| { |
| "base_loss": 0.3062360401749611, |
| "epoch": 0.030517578125, |
| "grad_norm": 0.0009641471551731229, |
| "learning_rate": 4.8474216461181645e-05, |
| "lookahead_loss": 7.844175812721253, |
| "loss": 0.3196, |
| "step": 16000 |
| }, |
| { |
| "base_loss": 0.30225355681777, |
| "epoch": 0.03147125244140625, |
| "grad_norm": 0.0009732363396324217, |
| "learning_rate": 4.842653274536133e-05, |
| "lookahead_loss": 7.831875602722168, |
| "loss": 0.3166, |
| "step": 16500 |
| }, |
| { |
| "base_loss": 0.3184074863195419, |
| "epoch": 0.0324249267578125, |
| "grad_norm": 0.0010106490226462483, |
| "learning_rate": 4.837884902954102e-05, |
| "lookahead_loss": 7.771908633232116, |
| "loss": 0.3381, |
| "step": 17000 |
| }, |
| { |
| "base_loss": 0.30629492220282556, |
| "epoch": 0.03337860107421875, |
| "grad_norm": 0.0010188270825892687, |
| "learning_rate": 4.833116531372071e-05, |
| "lookahead_loss": 7.789399157524109, |
| "loss": 0.3185, |
| "step": 17500 |
| }, |
| { |
| "base_loss": 0.3031555346250534, |
| "epoch": 0.034332275390625, |
| "grad_norm": 0.0009390591876581311, |
| "learning_rate": 4.828348159790039e-05, |
| "lookahead_loss": 7.772115784645081, |
| "loss": 0.3169, |
| "step": 18000 |
| }, |
| { |
| "base_loss": 0.31164542263746264, |
| "epoch": 0.03528594970703125, |
| "grad_norm": 0.0010221318807452917, |
| "learning_rate": 4.823579788208008e-05, |
| "lookahead_loss": 7.639335807800293, |
| "loss": 0.3253, |
| "step": 18500 |
| }, |
| { |
| "base_loss": 0.324304408878088, |
| "epoch": 0.0362396240234375, |
| "grad_norm": 0.00101387407630682, |
| "learning_rate": 4.8188114166259766e-05, |
| "lookahead_loss": 7.712016674995422, |
| "loss": 0.3383, |
| "step": 19000 |
| }, |
| { |
| "base_loss": 0.30813179594278334, |
| "epoch": 0.03719329833984375, |
| "grad_norm": 0.0009941563475877047, |
| "learning_rate": 4.8140430450439456e-05, |
| "lookahead_loss": 7.633579847335816, |
| "loss": 0.3224, |
| "step": 19500 |
| }, |
| { |
| "base_loss": 0.30138176554441454, |
| "epoch": 0.03814697265625, |
| "grad_norm": 0.0009536141296848655, |
| "learning_rate": 4.8092746734619146e-05, |
| "lookahead_loss": 7.657862429618835, |
| "loss": 0.3164, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.03814697265625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 7.604301906622256, |
| "eval_lookahead_perplexity": 2006.810463506877, |
| "eval_loss": 0.14430111646652222, |
| "eval_perplexity": 1.1552319155094923, |
| "eval_runtime": 498.3437, |
| "eval_samples_per_second": 10.033, |
| "eval_steps_per_second": 0.315, |
| "step": 20000 |
| }, |
| { |
| "base_loss": 0.30871694785356524, |
| "epoch": 0.03910064697265625, |
| "grad_norm": 0.0009617543546482921, |
| "learning_rate": 4.804506301879883e-05, |
| "lookahead_loss": 7.594292169570923, |
| "loss": 0.3229, |
| "step": 20500 |
| }, |
| { |
| "base_loss": 0.32506244936585427, |
| "epoch": 0.0400543212890625, |
| "grad_norm": 0.0009832490468397737, |
| "learning_rate": 4.799737930297852e-05, |
| "lookahead_loss": 7.60844051361084, |
| "loss": 0.3364, |
| "step": 21000 |
| }, |
| { |
| "base_loss": 0.30769926142692566, |
| "epoch": 0.04100799560546875, |
| "grad_norm": 0.0009847276378422976, |
| "learning_rate": 4.79496955871582e-05, |
| "lookahead_loss": 7.544025864601135, |
| "loss": 0.3191, |
| "step": 21500 |
| }, |
| { |
| "base_loss": 0.29858891409635546, |
| "epoch": 0.041961669921875, |
| "grad_norm": 0.0010060840286314487, |
| "learning_rate": 4.7902011871337893e-05, |
| "lookahead_loss": 7.582731894493103, |
| "loss": 0.3122, |
| "step": 22000 |
| }, |
| { |
| "base_loss": 0.3094627737402916, |
| "epoch": 0.04291534423828125, |
| "grad_norm": 0.0009809609036892653, |
| "learning_rate": 4.7854328155517584e-05, |
| "lookahead_loss": 7.609361615180969, |
| "loss": 0.3268, |
| "step": 22500 |
| }, |
| { |
| "base_loss": 0.32764697542786597, |
| "epoch": 0.0438690185546875, |
| "grad_norm": 0.0009822545107454062, |
| "learning_rate": 4.780664443969727e-05, |
| "lookahead_loss": 7.582602263450623, |
| "loss": 0.343, |
| "step": 23000 |
| }, |
| { |
| "base_loss": 0.29553532418608663, |
| "epoch": 0.04482269287109375, |
| "grad_norm": 0.0010076743783429265, |
| "learning_rate": 4.775896072387696e-05, |
| "lookahead_loss": 7.513092971801758, |
| "loss": 0.3112, |
| "step": 23500 |
| }, |
| { |
| "base_loss": 0.3041268612146378, |
| "epoch": 0.0457763671875, |
| "grad_norm": 0.0009422925650142133, |
| "learning_rate": 4.771127700805664e-05, |
| "lookahead_loss": 7.501417625427246, |
| "loss": 0.3176, |
| "step": 24000 |
| }, |
| { |
| "base_loss": 0.3303914776444435, |
| "epoch": 0.04673004150390625, |
| "grad_norm": 0.000979002215899527, |
| "learning_rate": 4.766359329223633e-05, |
| "lookahead_loss": 7.502476096153259, |
| "loss": 0.3419, |
| "step": 24500 |
| }, |
| { |
| "base_loss": 0.3248122656941414, |
| "epoch": 0.0476837158203125, |
| "grad_norm": 0.0010156352072954178, |
| "learning_rate": 4.761590957641602e-05, |
| "lookahead_loss": 7.487288349151611, |
| "loss": 0.3412, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.0476837158203125, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 7.440544437676573, |
| "eval_lookahead_perplexity": 1703.677514928871, |
| "eval_loss": 0.14398084580898285, |
| "eval_perplexity": 1.1548619878659485, |
| "eval_runtime": 492.0521, |
| "eval_samples_per_second": 10.162, |
| "eval_steps_per_second": 0.319, |
| "step": 25000 |
| }, |
| { |
| "base_loss": 0.2969100174307823, |
| "epoch": 0.04863739013671875, |
| "grad_norm": 0.0009701626840978861, |
| "learning_rate": 4.7568225860595705e-05, |
| "lookahead_loss": 7.416944421768188, |
| "loss": 0.3108, |
| "step": 25500 |
| }, |
| { |
| "base_loss": 0.302005185931921, |
| "epoch": 0.049591064453125, |
| "grad_norm": 0.0009941664757207036, |
| "learning_rate": 4.7520542144775395e-05, |
| "lookahead_loss": 7.415986575126648, |
| "loss": 0.3185, |
| "step": 26000 |
| }, |
| { |
| "base_loss": 0.31874441370368006, |
| "epoch": 0.05054473876953125, |
| "grad_norm": 0.0009409674676135182, |
| "learning_rate": 4.747285842895508e-05, |
| "lookahead_loss": 7.50030288696289, |
| "loss": 0.3333, |
| "step": 26500 |
| }, |
| { |
| "base_loss": 0.30408672893047334, |
| "epoch": 0.0514984130859375, |
| "grad_norm": 0.0009882714366540313, |
| "learning_rate": 4.742517471313477e-05, |
| "lookahead_loss": 7.433645064353943, |
| "loss": 0.3203, |
| "step": 27000 |
| }, |
| { |
| "base_loss": 0.3058005510568619, |
| "epoch": 0.05245208740234375, |
| "grad_norm": 0.0010352524695917964, |
| "learning_rate": 4.737749099731446e-05, |
| "lookahead_loss": 7.385928537368774, |
| "loss": 0.3201, |
| "step": 27500 |
| }, |
| { |
| "base_loss": 0.32026463899016383, |
| "epoch": 0.05340576171875, |
| "grad_norm": 0.0009495351114310324, |
| "learning_rate": 4.732980728149414e-05, |
| "lookahead_loss": 7.358126895904541, |
| "loss": 0.3313, |
| "step": 28000 |
| }, |
| { |
| "base_loss": 0.35889338579773905, |
| "epoch": 0.05435943603515625, |
| "grad_norm": 0.0009934919653460383, |
| "learning_rate": 4.728212356567383e-05, |
| "lookahead_loss": 7.398619123458863, |
| "loss": 0.3729, |
| "step": 28500 |
| }, |
| { |
| "base_loss": 0.29546374672651293, |
| "epoch": 0.0553131103515625, |
| "grad_norm": 0.0009958905866369605, |
| "learning_rate": 4.7234439849853516e-05, |
| "lookahead_loss": 7.388701396942139, |
| "loss": 0.3082, |
| "step": 29000 |
| }, |
| { |
| "base_loss": 0.3063408683240414, |
| "epoch": 0.05626678466796875, |
| "grad_norm": 0.0009431101498194039, |
| "learning_rate": 4.7186756134033206e-05, |
| "lookahead_loss": 7.369903712272644, |
| "loss": 0.3195, |
| "step": 29500 |
| }, |
| { |
| "base_loss": 0.3186078954935074, |
| "epoch": 0.057220458984375, |
| "grad_norm": 0.0009628318366594613, |
| "learning_rate": 4.7139072418212896e-05, |
| "lookahead_loss": 7.403464751243591, |
| "loss": 0.3334, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.057220458984375, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 7.322395800020748, |
| "eval_lookahead_perplexity": 1513.8264541066753, |
| "eval_loss": 0.1437508761882782, |
| "eval_perplexity": 1.154596435228323, |
| "eval_runtime": 481.4129, |
| "eval_samples_per_second": 10.386, |
| "eval_steps_per_second": 0.326, |
| "step": 30000 |
| }, |
| { |
| "base_loss": 0.31768571627140046, |
| "epoch": 0.05817413330078125, |
| "grad_norm": 0.0009793334174901247, |
| "learning_rate": 4.709138870239258e-05, |
| "lookahead_loss": 7.361583526611328, |
| "loss": 0.3287, |
| "step": 30500 |
| }, |
| { |
| "base_loss": 0.2922684009075165, |
| "epoch": 0.0591278076171875, |
| "grad_norm": 0.0009462712332606316, |
| "learning_rate": 4.704370498657227e-05, |
| "lookahead_loss": 7.2946535530090335, |
| "loss": 0.3098, |
| "step": 31000 |
| }, |
| { |
| "base_loss": 0.30112267237901685, |
| "epoch": 0.06008148193359375, |
| "grad_norm": 0.0009671795414760709, |
| "learning_rate": 4.699602127075195e-05, |
| "lookahead_loss": 7.319288095474243, |
| "loss": 0.3168, |
| "step": 31500 |
| }, |
| { |
| "base_loss": 0.32029621145129206, |
| "epoch": 0.06103515625, |
| "grad_norm": 0.0009950937237590551, |
| "learning_rate": 4.6948337554931643e-05, |
| "lookahead_loss": 7.297159067153931, |
| "loss": 0.333, |
| "step": 32000 |
| }, |
| { |
| "base_loss": 0.30533574494719506, |
| "epoch": 0.06198883056640625, |
| "grad_norm": 0.0010346778435632586, |
| "learning_rate": 4.6900653839111334e-05, |
| "lookahead_loss": 7.286148567199707, |
| "loss": 0.3169, |
| "step": 32500 |
| }, |
| { |
| "base_loss": 0.30571810373663905, |
| "epoch": 0.0629425048828125, |
| "grad_norm": 0.0010247246827930212, |
| "learning_rate": 4.685297012329102e-05, |
| "lookahead_loss": 7.249496428489685, |
| "loss": 0.3185, |
| "step": 33000 |
| }, |
| { |
| "base_loss": 0.31451627737283705, |
| "epoch": 0.06389617919921875, |
| "grad_norm": 0.0009608972468413413, |
| "learning_rate": 4.680528640747071e-05, |
| "lookahead_loss": 7.3038947277069095, |
| "loss": 0.3298, |
| "step": 33500 |
| }, |
| { |
| "base_loss": 0.30425655883550645, |
| "epoch": 0.064849853515625, |
| "grad_norm": 0.0009828249458223581, |
| "learning_rate": 4.675760269165039e-05, |
| "lookahead_loss": 7.304937886238098, |
| "loss": 0.3192, |
| "step": 34000 |
| }, |
| { |
| "base_loss": 0.31105126801133154, |
| "epoch": 0.06580352783203125, |
| "grad_norm": 0.0009732933831401169, |
| "learning_rate": 4.670991897583008e-05, |
| "lookahead_loss": 7.2146655473709105, |
| "loss": 0.3228, |
| "step": 34500 |
| }, |
| { |
| "base_loss": 0.3071163959801197, |
| "epoch": 0.0667572021484375, |
| "grad_norm": 0.0009960451861843467, |
| "learning_rate": 4.666223526000977e-05, |
| "lookahead_loss": 7.182215183258057, |
| "loss": 0.3182, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.0667572021484375, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 7.2324602169731556, |
| "eval_lookahead_perplexity": 1383.6223313598848, |
| "eval_loss": 0.14357592165470123, |
| "eval_perplexity": 1.1543944510170698, |
| "eval_runtime": 497.8756, |
| "eval_samples_per_second": 10.043, |
| "eval_steps_per_second": 0.315, |
| "step": 35000 |
| }, |
| { |
| "base_loss": 0.3321034919023514, |
| "epoch": 0.06771087646484375, |
| "grad_norm": 0.0010179802775382996, |
| "learning_rate": 4.6614551544189455e-05, |
| "lookahead_loss": 7.266714824676514, |
| "loss": 0.3445, |
| "step": 35500 |
| }, |
| { |
| "base_loss": 0.3017843673825264, |
| "epoch": 0.06866455078125, |
| "grad_norm": 0.0009934077970683575, |
| "learning_rate": 4.6566867828369145e-05, |
| "lookahead_loss": 7.266165484428406, |
| "loss": 0.312, |
| "step": 36000 |
| }, |
| { |
| "base_loss": 0.302195555627346, |
| "epoch": 0.06961822509765625, |
| "grad_norm": 0.0009844391606748104, |
| "learning_rate": 4.651918411254883e-05, |
| "lookahead_loss": 7.245750316619873, |
| "loss": 0.318, |
| "step": 36500 |
| }, |
| { |
| "base_loss": 0.3459869565963745, |
| "epoch": 0.0705718994140625, |
| "grad_norm": 0.0009586875676177442, |
| "learning_rate": 4.647150039672852e-05, |
| "lookahead_loss": 7.152301582336426, |
| "loss": 0.3617, |
| "step": 37000 |
| }, |
| { |
| "base_loss": 0.3151495299339294, |
| "epoch": 0.07152557373046875, |
| "grad_norm": 0.0009651901782490313, |
| "learning_rate": 4.642381668090821e-05, |
| "lookahead_loss": 7.200114166259765, |
| "loss": 0.3277, |
| "step": 37500 |
| }, |
| { |
| "base_loss": 0.30790447345376015, |
| "epoch": 0.072479248046875, |
| "grad_norm": 0.001032789470627904, |
| "learning_rate": 4.637613296508789e-05, |
| "lookahead_loss": 7.234860193252564, |
| "loss": 0.3219, |
| "step": 38000 |
| }, |
| { |
| "base_loss": 0.30545566940307617, |
| "epoch": 0.07343292236328125, |
| "grad_norm": 0.0009383106953464448, |
| "learning_rate": 4.632844924926758e-05, |
| "lookahead_loss": 7.182545600891113, |
| "loss": 0.32, |
| "step": 38500 |
| }, |
| { |
| "base_loss": 0.32841417971253395, |
| "epoch": 0.0743865966796875, |
| "grad_norm": 0.000990406610071659, |
| "learning_rate": 4.6280765533447266e-05, |
| "lookahead_loss": 7.212004456520081, |
| "loss": 0.3421, |
| "step": 39000 |
| }, |
| { |
| "base_loss": 0.30363579127192497, |
| "epoch": 0.07534027099609375, |
| "grad_norm": 0.0010100390063598752, |
| "learning_rate": 4.6233081817626956e-05, |
| "lookahead_loss": 7.244741122245789, |
| "loss": 0.3174, |
| "step": 39500 |
| }, |
| { |
| "base_loss": 0.30504586565494535, |
| "epoch": 0.0762939453125, |
| "grad_norm": 0.0009652067092247307, |
| "learning_rate": 4.6185398101806646e-05, |
| "lookahead_loss": 7.1760479412078855, |
| "loss": 0.3193, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.0762939453125, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 7.159913369261038, |
| "eval_lookahead_perplexity": 1286.7994516897043, |
| "eval_loss": 0.14343445003032684, |
| "eval_perplexity": 1.1542311485105234, |
| "eval_runtime": 492.5085, |
| "eval_samples_per_second": 10.152, |
| "eval_steps_per_second": 0.319, |
| "step": 40000 |
| }, |
| { |
| "base_loss": 0.33029799509048463, |
| "epoch": 0.07724761962890625, |
| "grad_norm": 0.0010073404991999269, |
| "learning_rate": 4.613771438598633e-05, |
| "lookahead_loss": 7.19347186088562, |
| "loss": 0.3467, |
| "step": 40500 |
| }, |
| { |
| "base_loss": 0.3037954642176628, |
| "epoch": 0.0782012939453125, |
| "grad_norm": 0.0009825917659327388, |
| "learning_rate": 4.609003067016602e-05, |
| "lookahead_loss": 7.184024220466614, |
| "loss": 0.3181, |
| "step": 41000 |
| }, |
| { |
| "base_loss": 0.29821320512890814, |
| "epoch": 0.07915496826171875, |
| "grad_norm": 0.0009405228192918003, |
| "learning_rate": 4.60423469543457e-05, |
| "lookahead_loss": 7.188343933105469, |
| "loss": 0.3143, |
| "step": 41500 |
| }, |
| { |
| "base_loss": 0.3142137563228607, |
| "epoch": 0.080108642578125, |
| "grad_norm": 0.0009637173498049378, |
| "learning_rate": 4.5994663238525393e-05, |
| "lookahead_loss": 7.147234386444092, |
| "loss": 0.3317, |
| "step": 42000 |
| }, |
| { |
| "base_loss": 0.3222310249209404, |
| "epoch": 0.08106231689453125, |
| "grad_norm": 0.0009848393965512514, |
| "learning_rate": 4.5946979522705084e-05, |
| "lookahead_loss": 7.199878736495972, |
| "loss": 0.3406, |
| "step": 42500 |
| }, |
| { |
| "base_loss": 0.3002626436650753, |
| "epoch": 0.0820159912109375, |
| "grad_norm": 0.000929056026507169, |
| "learning_rate": 4.589929580688477e-05, |
| "lookahead_loss": 7.160097922325134, |
| "loss": 0.3137, |
| "step": 43000 |
| }, |
| { |
| "base_loss": 0.3045452245473862, |
| "epoch": 0.08296966552734375, |
| "grad_norm": 0.0009913038229569793, |
| "learning_rate": 4.585161209106446e-05, |
| "lookahead_loss": 7.1922206773757935, |
| "loss": 0.3198, |
| "step": 43500 |
| }, |
| { |
| "base_loss": 0.33469617655873296, |
| "epoch": 0.08392333984375, |
| "grad_norm": 0.0009477322455495596, |
| "learning_rate": 4.580392837524414e-05, |
| "lookahead_loss": 7.188063373565674, |
| "loss": 0.347, |
| "step": 44000 |
| }, |
| { |
| "base_loss": 0.30740025800466536, |
| "epoch": 0.08487701416015625, |
| "grad_norm": 0.0009691762970760465, |
| "learning_rate": 4.575624465942383e-05, |
| "lookahead_loss": 7.11799400806427, |
| "loss": 0.3204, |
| "step": 44500 |
| }, |
| { |
| "base_loss": 0.300477741509676, |
| "epoch": 0.0858306884765625, |
| "grad_norm": 0.000998710049316287, |
| "learning_rate": 4.570856094360352e-05, |
| "lookahead_loss": 7.151413684844971, |
| "loss": 0.3112, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.0858306884765625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 7.100333274743808, |
| "eval_lookahead_perplexity": 1212.3710598241469, |
| "eval_loss": 0.14331814646720886, |
| "eval_perplexity": 1.154096915121352, |
| "eval_runtime": 480.4538, |
| "eval_samples_per_second": 10.407, |
| "eval_steps_per_second": 0.327, |
| "step": 45000 |
| }, |
| { |
| "base_loss": 0.301901563256979, |
| "epoch": 0.08678436279296875, |
| "grad_norm": 0.0011987128527835011, |
| "learning_rate": 4.5660877227783205e-05, |
| "lookahead_loss": 7.107098340034485, |
| "loss": 0.3142, |
| "step": 45500 |
| }, |
| { |
| "base_loss": 0.338142231285572, |
| "epoch": 0.087738037109375, |
| "grad_norm": 0.0009075519046746194, |
| "learning_rate": 4.5613193511962895e-05, |
| "lookahead_loss": 7.136958950996399, |
| "loss": 0.3464, |
| "step": 46000 |
| }, |
| { |
| "base_loss": 0.3009798979461193, |
| "epoch": 0.08869171142578125, |
| "grad_norm": 0.0009948944207280874, |
| "learning_rate": 4.556550979614258e-05, |
| "lookahead_loss": 7.107201243400573, |
| "loss": 0.3132, |
| "step": 46500 |
| }, |
| { |
| "base_loss": 0.3090392453968525, |
| "epoch": 0.0896453857421875, |
| "grad_norm": 0.0010095473844558, |
| "learning_rate": 4.551782608032227e-05, |
| "lookahead_loss": 7.135781683921814, |
| "loss": 0.3192, |
| "step": 47000 |
| }, |
| { |
| "base_loss": 0.30036539113521576, |
| "epoch": 0.09059906005859375, |
| "grad_norm": 0.0009663203964009881, |
| "learning_rate": 4.547014236450196e-05, |
| "lookahead_loss": 7.099122268676758, |
| "loss": 0.3132, |
| "step": 47500 |
| }, |
| { |
| "base_loss": 0.3006012495756149, |
| "epoch": 0.091552734375, |
| "grad_norm": 0.0009152375860139728, |
| "learning_rate": 4.542245864868164e-05, |
| "lookahead_loss": 7.080106061935425, |
| "loss": 0.3121, |
| "step": 48000 |
| }, |
| { |
| "base_loss": 0.31875682109594344, |
| "epoch": 0.09250640869140625, |
| "grad_norm": 0.0009438424604013562, |
| "learning_rate": 4.537477493286133e-05, |
| "lookahead_loss": 7.104524848937988, |
| "loss": 0.3365, |
| "step": 48500 |
| }, |
| { |
| "base_loss": 0.3104289738535881, |
| "epoch": 0.0934600830078125, |
| "grad_norm": 0.000929334491956979, |
| "learning_rate": 4.5327091217041016e-05, |
| "lookahead_loss": 7.102272230148316, |
| "loss": 0.3228, |
| "step": 49000 |
| }, |
| { |
| "base_loss": 0.2877590928971767, |
| "epoch": 0.09441375732421875, |
| "grad_norm": 0.000983032863587141, |
| "learning_rate": 4.5279407501220706e-05, |
| "lookahead_loss": 7.092049780845642, |
| "loss": 0.3028, |
| "step": 49500 |
| }, |
| { |
| "base_loss": 0.2935507807135582, |
| "epoch": 0.095367431640625, |
| "grad_norm": 0.0009688133141025901, |
| "learning_rate": 4.523172378540039e-05, |
| "lookahead_loss": 7.024641987800599, |
| "loss": 0.3084, |
| "step": 50000 |
| }, |
| { |
| "epoch": 0.095367431640625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 7.052693091261501, |
| "eval_lookahead_perplexity": 1155.9676810375186, |
| "eval_loss": 0.1432213932275772, |
| "eval_perplexity": 1.1539852579076508, |
| "eval_runtime": 492.5723, |
| "eval_samples_per_second": 10.151, |
| "eval_steps_per_second": 0.319, |
| "step": 50000 |
| }, |
| { |
| "base_loss": 0.2986202912926674, |
| "epoch": 0.09632110595703125, |
| "grad_norm": 0.0009543218766339123, |
| "learning_rate": 4.518404006958008e-05, |
| "lookahead_loss": 7.0854365358352664, |
| "loss": 0.3132, |
| "step": 50500 |
| }, |
| { |
| "base_loss": 0.3307524161040783, |
| "epoch": 0.0972747802734375, |
| "grad_norm": 0.0010054496815428138, |
| "learning_rate": 4.513635635375977e-05, |
| "lookahead_loss": 7.08996038722992, |
| "loss": 0.3427, |
| "step": 51000 |
| }, |
| { |
| "base_loss": 0.29244673988223074, |
| "epoch": 0.09822845458984375, |
| "grad_norm": 0.0010033181170001626, |
| "learning_rate": 4.508867263793945e-05, |
| "lookahead_loss": 7.052651536941529, |
| "loss": 0.3083, |
| "step": 51500 |
| }, |
| { |
| "base_loss": 0.295786843508482, |
| "epoch": 0.09918212890625, |
| "grad_norm": 0.0009846463799476624, |
| "learning_rate": 4.5040988922119143e-05, |
| "lookahead_loss": 7.07691504573822, |
| "loss": 0.3121, |
| "step": 52000 |
| }, |
| { |
| "base_loss": 0.30293611577153207, |
| "epoch": 0.10013580322265625, |
| "grad_norm": 0.0009364968864247203, |
| "learning_rate": 4.499330520629883e-05, |
| "lookahead_loss": 7.069658821105957, |
| "loss": 0.3161, |
| "step": 52500 |
| }, |
| { |
| "base_loss": 0.3240869597494602, |
| "epoch": 0.1010894775390625, |
| "grad_norm": 0.0009558585588820279, |
| "learning_rate": 4.494562149047852e-05, |
| "lookahead_loss": 7.12118856048584, |
| "loss": 0.335, |
| "step": 53000 |
| }, |
| { |
| "base_loss": 0.30599541807174685, |
| "epoch": 0.10204315185546875, |
| "grad_norm": 0.000964898441452533, |
| "learning_rate": 4.489793777465821e-05, |
| "lookahead_loss": 7.0878299045562745, |
| "loss": 0.3163, |
| "step": 53500 |
| }, |
| { |
| "base_loss": 0.2991089904308319, |
| "epoch": 0.102996826171875, |
| "grad_norm": 0.0009853472001850605, |
| "learning_rate": 4.485025405883789e-05, |
| "lookahead_loss": 7.047714894294739, |
| "loss": 0.3149, |
| "step": 54000 |
| }, |
| { |
| "base_loss": 0.30219315418601034, |
| "epoch": 0.10395050048828125, |
| "grad_norm": 0.0010090046562254429, |
| "learning_rate": 4.480257034301758e-05, |
| "lookahead_loss": 7.079160309791565, |
| "loss": 0.3135, |
| "step": 54500 |
| }, |
| { |
| "base_loss": 0.3133500624895096, |
| "epoch": 0.1049041748046875, |
| "grad_norm": 0.0009890320943668485, |
| "learning_rate": 4.4754886627197264e-05, |
| "lookahead_loss": 7.02062137889862, |
| "loss": 0.3281, |
| "step": 55000 |
| }, |
| { |
| "epoch": 0.1049041748046875, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 7.00973541820392, |
| "eval_lookahead_perplexity": 1107.3614784488032, |
| "eval_loss": 0.14313539862632751, |
| "eval_perplexity": 1.1538860256723285, |
| "eval_runtime": 480.347, |
| "eval_samples_per_second": 10.409, |
| "eval_steps_per_second": 0.327, |
| "step": 55000 |
| }, |
| { |
| "base_loss": 0.3070584389269352, |
| "epoch": 0.10585784912109375, |
| "grad_norm": 0.0009455215185880661, |
| "learning_rate": 4.4707202911376955e-05, |
| "lookahead_loss": 6.998170353889465, |
| "loss": 0.3221, |
| "step": 55500 |
| }, |
| { |
| "base_loss": 0.29948241996765135, |
| "epoch": 0.1068115234375, |
| "grad_norm": 0.0009629906271584332, |
| "learning_rate": 4.4659519195556645e-05, |
| "lookahead_loss": 7.0833413105010985, |
| "loss": 0.3107, |
| "step": 56000 |
| }, |
| { |
| "base_loss": 0.29492466670274736, |
| "epoch": 0.10776519775390625, |
| "grad_norm": 0.0009873651433736086, |
| "learning_rate": 4.461183547973633e-05, |
| "lookahead_loss": 6.994237482070923, |
| "loss": 0.3092, |
| "step": 56500 |
| }, |
| { |
| "base_loss": 0.3188383647501469, |
| "epoch": 0.1087188720703125, |
| "grad_norm": 0.0010177677031606436, |
| "learning_rate": 4.456415176391602e-05, |
| "lookahead_loss": 6.982017017364502, |
| "loss": 0.3324, |
| "step": 57000 |
| }, |
| { |
| "base_loss": 0.31659464621543887, |
| "epoch": 0.10967254638671875, |
| "grad_norm": 0.0009399647242389619, |
| "learning_rate": 4.45164680480957e-05, |
| "lookahead_loss": 6.973290238380432, |
| "loss": 0.3271, |
| "step": 57500 |
| }, |
| { |
| "base_loss": 0.3013280538916588, |
| "epoch": 0.110626220703125, |
| "grad_norm": 0.0008893092744983733, |
| "learning_rate": 4.446878433227539e-05, |
| "lookahead_loss": 6.9135963726043705, |
| "loss": 0.3151, |
| "step": 58000 |
| }, |
| { |
| "base_loss": 0.29822684854269027, |
| "epoch": 0.11157989501953125, |
| "grad_norm": 0.0010066829854622483, |
| "learning_rate": 4.442110061645508e-05, |
| "lookahead_loss": 7.023115937232971, |
| "loss": 0.3097, |
| "step": 58500 |
| }, |
| { |
| "base_loss": 0.3082665235698223, |
| "epoch": 0.1125335693359375, |
| "grad_norm": 0.0009333739290013909, |
| "learning_rate": 4.4373416900634766e-05, |
| "lookahead_loss": 7.03084280014038, |
| "loss": 0.322, |
| "step": 59000 |
| }, |
| { |
| "base_loss": 0.34342152199149134, |
| "epoch": 0.11348724365234375, |
| "grad_norm": 0.001008225604891777, |
| "learning_rate": 4.4325733184814456e-05, |
| "lookahead_loss": 7.03166408252716, |
| "loss": 0.3512, |
| "step": 59500 |
| }, |
| { |
| "base_loss": 0.29527223294973376, |
| "epoch": 0.11444091796875, |
| "grad_norm": 0.0009543896303512156, |
| "learning_rate": 4.427804946899414e-05, |
| "lookahead_loss": 6.924260063171387, |
| "loss": 0.3096, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.11444091796875, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.971603098007056, |
| "eval_lookahead_perplexity": 1065.930172124952, |
| "eval_loss": 0.14305971562862396, |
| "eval_perplexity": 1.153798699423495, |
| "eval_runtime": 498.2888, |
| "eval_samples_per_second": 10.034, |
| "eval_steps_per_second": 0.315, |
| "step": 60000 |
| }, |
| { |
| "base_loss": 0.2983711498081684, |
| "epoch": 0.11539459228515625, |
| "grad_norm": 0.0009778671665117145, |
| "learning_rate": 4.423036575317383e-05, |
| "lookahead_loss": 6.959155892372132, |
| "loss": 0.3119, |
| "step": 60500 |
| }, |
| { |
| "base_loss": 0.3164993856549263, |
| "epoch": 0.1163482666015625, |
| "grad_norm": 0.0010577912908047438, |
| "learning_rate": 4.418268203735352e-05, |
| "lookahead_loss": 6.982672909736634, |
| "loss": 0.326, |
| "step": 61000 |
| }, |
| { |
| "base_loss": 0.3281388694047928, |
| "epoch": 0.11730194091796875, |
| "grad_norm": 0.0010003127390518785, |
| "learning_rate": 4.41349983215332e-05, |
| "lookahead_loss": 6.978295309066772, |
| "loss": 0.3447, |
| "step": 61500 |
| }, |
| { |
| "base_loss": 0.3066762860417366, |
| "epoch": 0.118255615234375, |
| "grad_norm": 0.0010272158542647958, |
| "learning_rate": 4.4087314605712893e-05, |
| "lookahead_loss": 6.9806537971496585, |
| "loss": 0.317, |
| "step": 62000 |
| }, |
| { |
| "base_loss": 0.3002779276072979, |
| "epoch": 0.11920928955078125, |
| "grad_norm": 0.0009698990033939481, |
| "learning_rate": 4.403963088989258e-05, |
| "lookahead_loss": 7.002115357398987, |
| "loss": 0.3116, |
| "step": 62500 |
| }, |
| { |
| "base_loss": 0.3048044160306454, |
| "epoch": 1.0009536743164062, |
| "grad_norm": 0.0009617453324608505, |
| "learning_rate": 4.399194717407227e-05, |
| "lookahead_loss": 7.047370400428772, |
| "loss": 0.3145, |
| "step": 63000 |
| }, |
| { |
| "base_loss": 0.2995053820014, |
| "epoch": 1.0019073486328125, |
| "grad_norm": 0.0010174677008762956, |
| "learning_rate": 4.394426345825196e-05, |
| "lookahead_loss": 6.895120985031128, |
| "loss": 0.3142, |
| "step": 63500 |
| }, |
| { |
| "base_loss": 0.31198617857694627, |
| "epoch": 1.0028610229492188, |
| "grad_norm": 0.0010111057199537754, |
| "learning_rate": 4.389657974243164e-05, |
| "lookahead_loss": 6.888555366516114, |
| "loss": 0.3226, |
| "step": 64000 |
| }, |
| { |
| "base_loss": 0.32396442687511445, |
| "epoch": 1.003814697265625, |
| "grad_norm": 0.0009548735106363893, |
| "learning_rate": 4.384889602661133e-05, |
| "lookahead_loss": 6.908667636871338, |
| "loss": 0.336, |
| "step": 64500 |
| }, |
| { |
| "base_loss": 0.3013957371413708, |
| "epoch": 1.0047683715820312, |
| "grad_norm": 0.000966136809438467, |
| "learning_rate": 4.3801212310791014e-05, |
| "lookahead_loss": 6.905056614875793, |
| "loss": 0.3168, |
| "step": 65000 |
| }, |
| { |
| "epoch": 1.0047683715820312, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.937254576637341, |
| "eval_lookahead_perplexity": 1029.9387120995427, |
| "eval_loss": 0.14299124479293823, |
| "eval_perplexity": 1.1537197005669222, |
| "eval_runtime": 494.5805, |
| "eval_samples_per_second": 10.11, |
| "eval_steps_per_second": 0.317, |
| "step": 65000 |
| }, |
| { |
| "base_loss": 0.3039788320362568, |
| "epoch": 1.0057220458984375, |
| "grad_norm": 0.0008604921749792993, |
| "learning_rate": 4.3753528594970705e-05, |
| "lookahead_loss": 7.014158073425293, |
| "loss": 0.3139, |
| "step": 65500 |
| }, |
| { |
| "base_loss": 0.29717833909392355, |
| "epoch": 1.0066757202148438, |
| "grad_norm": 0.0009630241547711194, |
| "learning_rate": 4.3705844879150395e-05, |
| "lookahead_loss": 6.847448231697083, |
| "loss": 0.3148, |
| "step": 66000 |
| }, |
| { |
| "base_loss": 0.31199148765206336, |
| "epoch": 1.00762939453125, |
| "grad_norm": 0.0010012584971264005, |
| "learning_rate": 4.365816116333008e-05, |
| "lookahead_loss": 6.918432865142822, |
| "loss": 0.3246, |
| "step": 66500 |
| }, |
| { |
| "base_loss": 0.3148621036410332, |
| "epoch": 1.0085830688476562, |
| "grad_norm": 0.0009159519104287028, |
| "learning_rate": 4.361047744750977e-05, |
| "lookahead_loss": 6.913657369136811, |
| "loss": 0.3229, |
| "step": 67000 |
| }, |
| { |
| "base_loss": 0.30580521461367605, |
| "epoch": 1.0095367431640625, |
| "grad_norm": 0.0009974334388971329, |
| "learning_rate": 4.356279373168945e-05, |
| "lookahead_loss": 6.923233027458191, |
| "loss": 0.3191, |
| "step": 67500 |
| }, |
| { |
| "base_loss": 0.3015244754254818, |
| "epoch": 1.0104904174804688, |
| "grad_norm": 0.0009639709023758769, |
| "learning_rate": 4.351511001586914e-05, |
| "lookahead_loss": 6.88383205986023, |
| "loss": 0.3128, |
| "step": 68000 |
| }, |
| { |
| "base_loss": 0.30137019059062004, |
| "epoch": 1.011444091796875, |
| "grad_norm": 0.0010148598812520504, |
| "learning_rate": 4.346742630004883e-05, |
| "lookahead_loss": 6.898301356315613, |
| "loss": 0.3139, |
| "step": 68500 |
| }, |
| { |
| "base_loss": 0.3252628707587719, |
| "epoch": 1.0123977661132812, |
| "grad_norm": 0.000888565497007221, |
| "learning_rate": 4.3419742584228516e-05, |
| "lookahead_loss": 6.891662693023681, |
| "loss": 0.3359, |
| "step": 69000 |
| }, |
| { |
| "base_loss": 0.30557073107361793, |
| "epoch": 1.0133514404296875, |
| "grad_norm": 0.0009476915001869202, |
| "learning_rate": 4.3372058868408206e-05, |
| "lookahead_loss": 6.974801006317139, |
| "loss": 0.3211, |
| "step": 69500 |
| }, |
| { |
| "base_loss": 0.30054079556465146, |
| "epoch": 1.0143051147460938, |
| "grad_norm": 0.0009728021686896682, |
| "learning_rate": 4.332437515258789e-05, |
| "lookahead_loss": 6.902195900917053, |
| "loss": 0.3158, |
| "step": 70000 |
| }, |
| { |
| "epoch": 1.0143051147460938, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.905760923513589, |
| "eval_lookahead_perplexity": 998.0076319369023, |
| "eval_loss": 0.14292870461940765, |
| "eval_perplexity": 1.1536475489928526, |
| "eval_runtime": 489.0741, |
| "eval_samples_per_second": 10.223, |
| "eval_steps_per_second": 0.321, |
| "step": 70000 |
| }, |
| { |
| "base_loss": 0.29648803743720054, |
| "epoch": 1.0152587890625, |
| "grad_norm": 0.0009132726700045168, |
| "learning_rate": 4.327669143676758e-05, |
| "lookahead_loss": 6.884706204414368, |
| "loss": 0.3079, |
| "step": 70500 |
| }, |
| { |
| "base_loss": 0.31412097451090815, |
| "epoch": 1.0162124633789062, |
| "grad_norm": 0.0009932307293638587, |
| "learning_rate": 4.322900772094727e-05, |
| "lookahead_loss": 6.908680680274963, |
| "loss": 0.326, |
| "step": 71000 |
| }, |
| { |
| "base_loss": 0.3125672063827515, |
| "epoch": 1.0171661376953125, |
| "grad_norm": 0.0009134129504673183, |
| "learning_rate": 4.318132400512695e-05, |
| "lookahead_loss": 6.958608627319336, |
| "loss": 0.3241, |
| "step": 71500 |
| }, |
| { |
| "base_loss": 0.3002317441105843, |
| "epoch": 1.0181198120117188, |
| "grad_norm": 0.0009274011244997382, |
| "learning_rate": 4.3133640289306643e-05, |
| "lookahead_loss": 6.960107209205628, |
| "loss": 0.3111, |
| "step": 72000 |
| }, |
| { |
| "base_loss": 0.29831535935401915, |
| "epoch": 1.019073486328125, |
| "grad_norm": 0.0009689630824141204, |
| "learning_rate": 4.308595657348633e-05, |
| "lookahead_loss": 6.973208980560303, |
| "loss": 0.3109, |
| "step": 72500 |
| }, |
| { |
| "base_loss": 0.3020369653701782, |
| "epoch": 1.0200271606445312, |
| "grad_norm": 0.001046851510182023, |
| "learning_rate": 4.303827285766602e-05, |
| "lookahead_loss": 6.801126411437989, |
| "loss": 0.3157, |
| "step": 73000 |
| }, |
| { |
| "base_loss": 0.32652922403812407, |
| "epoch": 1.0209808349609375, |
| "grad_norm": 0.0009485671180300415, |
| "learning_rate": 4.299058914184571e-05, |
| "lookahead_loss": 6.887979488372803, |
| "loss": 0.3396, |
| "step": 73500 |
| }, |
| { |
| "base_loss": 0.30453234216570857, |
| "epoch": 1.0219345092773438, |
| "grad_norm": 0.0009610042907297611, |
| "learning_rate": 4.294290542602539e-05, |
| "lookahead_loss": 6.849929617881775, |
| "loss": 0.3146, |
| "step": 74000 |
| }, |
| { |
| "base_loss": 0.2977458454966545, |
| "epoch": 1.02288818359375, |
| "grad_norm": 0.0010150724556297064, |
| "learning_rate": 4.289522171020508e-05, |
| "lookahead_loss": 6.878925356388092, |
| "loss": 0.3125, |
| "step": 74500 |
| }, |
| { |
| "base_loss": 0.30405546057224275, |
| "epoch": 1.0238418579101562, |
| "grad_norm": 0.0009638189221732318, |
| "learning_rate": 4.2847537994384764e-05, |
| "lookahead_loss": 6.84999968624115, |
| "loss": 0.3149, |
| "step": 75000 |
| }, |
| { |
| "epoch": 1.0238418579101562, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.875502051636815, |
| "eval_lookahead_perplexity": 968.2613607781705, |
| "eval_loss": 0.142868772149086, |
| "eval_perplexity": 1.1535784101172133, |
| "eval_runtime": 499.6825, |
| "eval_samples_per_second": 10.006, |
| "eval_steps_per_second": 0.314, |
| "step": 75000 |
| }, |
| { |
| "base_loss": 0.32463854083418847, |
| "epoch": 1.0247955322265625, |
| "grad_norm": 0.0009209099807776511, |
| "learning_rate": 4.2799854278564455e-05, |
| "lookahead_loss": 6.842094340324402, |
| "loss": 0.3362, |
| "step": 75500 |
| }, |
| { |
| "base_loss": 0.3075324648320675, |
| "epoch": 1.0257492065429688, |
| "grad_norm": 0.0009934107074514031, |
| "learning_rate": 4.2752170562744145e-05, |
| "lookahead_loss": 6.811694778442383, |
| "loss": 0.3238, |
| "step": 76000 |
| }, |
| { |
| "base_loss": 0.30398501074314116, |
| "epoch": 1.026702880859375, |
| "grad_norm": 0.0009826256427913904, |
| "learning_rate": 4.270448684692383e-05, |
| "lookahead_loss": 6.813813063621521, |
| "loss": 0.314, |
| "step": 76500 |
| }, |
| { |
| "base_loss": 0.3081837382018566, |
| "epoch": 1.0276565551757812, |
| "grad_norm": 0.0009539109887555242, |
| "learning_rate": 4.265680313110352e-05, |
| "lookahead_loss": 6.9388167886734005, |
| "loss": 0.319, |
| "step": 77000 |
| }, |
| { |
| "base_loss": 0.32895678067207335, |
| "epoch": 1.0286102294921875, |
| "grad_norm": 0.0009696350898593664, |
| "learning_rate": 4.26091194152832e-05, |
| "lookahead_loss": 6.9466577863693235, |
| "loss": 0.341, |
| "step": 77500 |
| }, |
| { |
| "base_loss": 0.30588172587752344, |
| "epoch": 1.0295639038085938, |
| "grad_norm": 0.0009499301086179912, |
| "learning_rate": 4.256143569946289e-05, |
| "lookahead_loss": 6.890705446243286, |
| "loss": 0.3154, |
| "step": 78000 |
| }, |
| { |
| "base_loss": 0.3051903445720673, |
| "epoch": 1.030517578125, |
| "grad_norm": 0.0009480732842348516, |
| "learning_rate": 4.251375198364258e-05, |
| "lookahead_loss": 6.864274346351624, |
| "loss": 0.3174, |
| "step": 78500 |
| }, |
| { |
| "base_loss": 0.30346439191699026, |
| "epoch": 1.0314712524414062, |
| "grad_norm": 0.0009676506742835045, |
| "learning_rate": 4.2466068267822266e-05, |
| "lookahead_loss": 6.894395670890808, |
| "loss": 0.3162, |
| "step": 79000 |
| }, |
| { |
| "base_loss": 0.31795056411623956, |
| "epoch": 1.0324249267578125, |
| "grad_norm": 0.0009829605696722865, |
| "learning_rate": 4.2418384552001956e-05, |
| "lookahead_loss": 6.821612464904785, |
| "loss": 0.3355, |
| "step": 79500 |
| }, |
| { |
| "base_loss": 0.30795893451571466, |
| "epoch": 1.0333786010742188, |
| "grad_norm": 0.0009851903887465596, |
| "learning_rate": 4.237070083618164e-05, |
| "lookahead_loss": 6.886163942337036, |
| "loss": 0.3185, |
| "step": 80000 |
| }, |
| { |
| "epoch": 1.0333786010742188, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.84953888384298, |
| "eval_lookahead_perplexity": 943.4457682675941, |
| "eval_loss": 0.14281712472438812, |
| "eval_perplexity": 1.1535188323016774, |
| "eval_runtime": 496.4245, |
| "eval_samples_per_second": 10.072, |
| "eval_steps_per_second": 0.316, |
| "step": 80000 |
| }, |
| { |
| "base_loss": 0.3031257001161575, |
| "epoch": 1.034332275390625, |
| "grad_norm": 0.0009114276035688818, |
| "learning_rate": 4.232301712036133e-05, |
| "lookahead_loss": 6.9387643995285035, |
| "loss": 0.3159, |
| "step": 80500 |
| }, |
| { |
| "base_loss": 0.31068781118094924, |
| "epoch": 1.0352859497070312, |
| "grad_norm": 0.0010243533179163933, |
| "learning_rate": 4.227533340454102e-05, |
| "lookahead_loss": 6.779029149055481, |
| "loss": 0.3235, |
| "step": 81000 |
| }, |
| { |
| "base_loss": 0.32500979214906695, |
| "epoch": 1.0362396240234375, |
| "grad_norm": 0.0009815491503104568, |
| "learning_rate": 4.22276496887207e-05, |
| "lookahead_loss": 6.89651736831665, |
| "loss": 0.3377, |
| "step": 81500 |
| }, |
| { |
| "base_loss": 0.3069631262719631, |
| "epoch": 1.0371932983398438, |
| "grad_norm": 0.0010114209726452827, |
| "learning_rate": 4.2179965972900393e-05, |
| "lookahead_loss": 6.8101696758270265, |
| "loss": 0.3181, |
| "step": 82000 |
| }, |
| { |
| "base_loss": 0.3025422422587872, |
| "epoch": 1.03814697265625, |
| "grad_norm": 0.0009560140897519886, |
| "learning_rate": 4.213228225708008e-05, |
| "lookahead_loss": 6.864963472366333, |
| "loss": 0.314, |
| "step": 82500 |
| }, |
| { |
| "base_loss": 0.3076345331072807, |
| "epoch": 1.0391006469726562, |
| "grad_norm": 0.000972763926256448, |
| "learning_rate": 4.208459854125977e-05, |
| "lookahead_loss": 6.800820850372315, |
| "loss": 0.3196, |
| "step": 83000 |
| }, |
| { |
| "base_loss": 0.3235399980545044, |
| "epoch": 1.0400543212890625, |
| "grad_norm": 0.0009581708000041544, |
| "learning_rate": 4.203691482543946e-05, |
| "lookahead_loss": 6.839151841163635, |
| "loss": 0.3342, |
| "step": 83500 |
| }, |
| { |
| "base_loss": 0.30506757298111914, |
| "epoch": 1.0410079956054688, |
| "grad_norm": 0.0009835829259827733, |
| "learning_rate": 4.198923110961914e-05, |
| "lookahead_loss": 6.788027523994446, |
| "loss": 0.3148, |
| "step": 84000 |
| }, |
| { |
| "base_loss": 0.29668092691898346, |
| "epoch": 1.041961669921875, |
| "grad_norm": 0.0009812023490667343, |
| "learning_rate": 4.194154739379883e-05, |
| "lookahead_loss": 6.869966278076172, |
| "loss": 0.3089, |
| "step": 84500 |
| }, |
| { |
| "base_loss": 0.30789948108792303, |
| "epoch": 1.0429153442382812, |
| "grad_norm": 0.0009604549850337207, |
| "learning_rate": 4.1893863677978514e-05, |
| "lookahead_loss": 6.8703847560882565, |
| "loss": 0.3247, |
| "step": 85000 |
| }, |
| { |
| "epoch": 1.0429153442382812, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.825141870175687, |
| "eval_lookahead_perplexity": 920.7070156519827, |
| "eval_loss": 0.14276856184005737, |
| "eval_perplexity": 1.1534628154602318, |
| "eval_runtime": 493.0912, |
| "eval_samples_per_second": 10.14, |
| "eval_steps_per_second": 0.318, |
| "step": 85000 |
| }, |
| { |
| "base_loss": 0.3281280441880226, |
| "epoch": 1.0438690185546875, |
| "grad_norm": 0.0009952600812539458, |
| "learning_rate": 4.1846179962158205e-05, |
| "lookahead_loss": 6.8946290845870974, |
| "loss": 0.3444, |
| "step": 85500 |
| }, |
| { |
| "base_loss": 0.2978555924296379, |
| "epoch": 1.0448226928710938, |
| "grad_norm": 0.0010316900443285704, |
| "learning_rate": 4.1798496246337895e-05, |
| "lookahead_loss": 6.817944658279419, |
| "loss": 0.3111, |
| "step": 86000 |
| }, |
| { |
| "base_loss": 0.3044668311774731, |
| "epoch": 1.0457763671875, |
| "grad_norm": 0.0009631580905988812, |
| "learning_rate": 4.175081253051758e-05, |
| "lookahead_loss": 6.833521637916565, |
| "loss": 0.318, |
| "step": 86500 |
| }, |
| { |
| "base_loss": 0.3298782432973385, |
| "epoch": 1.0467300415039062, |
| "grad_norm": 0.0009412643266841769, |
| "learning_rate": 4.170312881469727e-05, |
| "lookahead_loss": 6.806390251159668, |
| "loss": 0.3407, |
| "step": 87000 |
| }, |
| { |
| "base_loss": 0.32442897310853, |
| "epoch": 1.0476837158203125, |
| "grad_norm": 0.0009984897915273905, |
| "learning_rate": 4.165544509887695e-05, |
| "lookahead_loss": 6.830627080917359, |
| "loss": 0.3392, |
| "step": 87500 |
| }, |
| { |
| "base_loss": 0.2941350122392178, |
| "epoch": 1.0486373901367188, |
| "grad_norm": 0.0009231239673681557, |
| "learning_rate": 4.160776138305664e-05, |
| "lookahead_loss": 6.789681484222412, |
| "loss": 0.3084, |
| "step": 88000 |
| }, |
| { |
| "base_loss": 0.301623804807663, |
| "epoch": 1.049591064453125, |
| "grad_norm": 0.000988572952337563, |
| "learning_rate": 4.156007766723633e-05, |
| "lookahead_loss": 6.77013840007782, |
| "loss": 0.3152, |
| "step": 88500 |
| }, |
| { |
| "base_loss": 0.31965578559041025, |
| "epoch": 1.0505447387695312, |
| "grad_norm": 0.0009195742895826697, |
| "learning_rate": 4.1512393951416016e-05, |
| "lookahead_loss": 6.868552158355713, |
| "loss": 0.3327, |
| "step": 89000 |
| }, |
| { |
| "base_loss": 0.30511142282187936, |
| "epoch": 1.0514984130859375, |
| "grad_norm": 0.000990525702945888, |
| "learning_rate": 4.1464710235595706e-05, |
| "lookahead_loss": 6.824063732147216, |
| "loss": 0.3195, |
| "step": 89500 |
| }, |
| { |
| "base_loss": 0.3033564644157887, |
| "epoch": 1.0524520874023438, |
| "grad_norm": 0.0010395031422376633, |
| "learning_rate": 4.141702651977539e-05, |
| "lookahead_loss": 6.759691466331482, |
| "loss": 0.3166, |
| "step": 90000 |
| }, |
| { |
| "epoch": 1.0524520874023438, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.803260954043355, |
| "eval_lookahead_perplexity": 900.7799093735076, |
| "eval_loss": 0.14272409677505493, |
| "eval_perplexity": 1.1534115278014274, |
| "eval_runtime": 492.5913, |
| "eval_samples_per_second": 10.15, |
| "eval_steps_per_second": 0.319, |
| "step": 90000 |
| }, |
| { |
| "base_loss": 0.32089028322696683, |
| "epoch": 1.05340576171875, |
| "grad_norm": 0.0009381326381117105, |
| "learning_rate": 4.136934280395508e-05, |
| "lookahead_loss": 6.765647802352905, |
| "loss": 0.3307, |
| "step": 90500 |
| }, |
| { |
| "base_loss": 0.35406574749946595, |
| "epoch": 1.0543594360351562, |
| "grad_norm": 0.0009708745637908578, |
| "learning_rate": 4.132165908813477e-05, |
| "lookahead_loss": 6.8132513179779055, |
| "loss": 0.3699, |
| "step": 91000 |
| }, |
| { |
| "base_loss": 0.2938829956352711, |
| "epoch": 1.0553131103515625, |
| "grad_norm": 0.0009931994136422873, |
| "learning_rate": 4.127397537231445e-05, |
| "lookahead_loss": 6.817016356945038, |
| "loss": 0.3071, |
| "step": 91500 |
| }, |
| { |
| "base_loss": 0.30498689064383505, |
| "epoch": 1.0562667846679688, |
| "grad_norm": 0.0009295056224800646, |
| "learning_rate": 4.1226291656494143e-05, |
| "lookahead_loss": 6.81505980014801, |
| "loss": 0.3181, |
| "step": 92000 |
| }, |
| { |
| "base_loss": 0.317481600522995, |
| "epoch": 1.057220458984375, |
| "grad_norm": 0.0009703385876491666, |
| "learning_rate": 4.117860794067383e-05, |
| "lookahead_loss": 6.835084310531617, |
| "loss": 0.3317, |
| "step": 92500 |
| }, |
| { |
| "base_loss": 0.3179551683664322, |
| "epoch": 1.0581741333007812, |
| "grad_norm": 0.0009712363826110959, |
| "learning_rate": 4.113092422485352e-05, |
| "lookahead_loss": 6.822910179138184, |
| "loss": 0.3299, |
| "step": 93000 |
| }, |
| { |
| "base_loss": 0.29271650505065916, |
| "epoch": 1.0591278076171875, |
| "grad_norm": 0.000948713393881917, |
| "learning_rate": 4.108324050903321e-05, |
| "lookahead_loss": 6.735391735076904, |
| "loss": 0.3073, |
| "step": 93500 |
| }, |
| { |
| "base_loss": 0.3039356949329376, |
| "epoch": 1.0600814819335938, |
| "grad_norm": 0.0009828072506934404, |
| "learning_rate": 4.103555679321289e-05, |
| "lookahead_loss": 6.790757938861847, |
| "loss": 0.3189, |
| "step": 94000 |
| }, |
| { |
| "base_loss": 0.32165152502059935, |
| "epoch": 1.06103515625, |
| "grad_norm": 0.000970664550550282, |
| "learning_rate": 4.098787307739258e-05, |
| "lookahead_loss": 6.76230890083313, |
| "loss": 0.3332, |
| "step": 94500 |
| }, |
| { |
| "base_loss": 0.3061283130943775, |
| "epoch": 1.0619888305664062, |
| "grad_norm": 0.0010395641438663006, |
| "learning_rate": 4.0940189361572264e-05, |
| "lookahead_loss": 6.777449913024903, |
| "loss": 0.3171, |
| "step": 95000 |
| }, |
| { |
| "epoch": 1.0619888305664062, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.780221919282176, |
| "eval_lookahead_perplexity": 880.2640499998114, |
| "eval_loss": 0.14268015325069427, |
| "eval_perplexity": 1.1533608439474792, |
| "eval_runtime": 484.5014, |
| "eval_samples_per_second": 10.32, |
| "eval_steps_per_second": 0.324, |
| "step": 95000 |
| }, |
| { |
| "base_loss": 0.30640321379899976, |
| "epoch": 1.0629425048828125, |
| "grad_norm": 0.0009915264090523124, |
| "learning_rate": 4.0892505645751955e-05, |
| "lookahead_loss": 6.743420834541321, |
| "loss": 0.3177, |
| "step": 95500 |
| }, |
| { |
| "base_loss": 0.31782649287581444, |
| "epoch": 1.0638961791992188, |
| "grad_norm": 0.0009494387777522206, |
| "learning_rate": 4.0844821929931645e-05, |
| "lookahead_loss": 6.79113444519043, |
| "loss": 0.3309, |
| "step": 96000 |
| }, |
| { |
| "base_loss": 0.30349985790252687, |
| "epoch": 1.064849853515625, |
| "grad_norm": 0.0009761872352100909, |
| "learning_rate": 4.079713821411133e-05, |
| "lookahead_loss": 6.813262487411499, |
| "loss": 0.3191, |
| "step": 96500 |
| }, |
| { |
| "base_loss": 0.3075440634191036, |
| "epoch": 1.0658035278320312, |
| "grad_norm": 0.0009754234342835844, |
| "learning_rate": 4.074945449829102e-05, |
| "lookahead_loss": 6.742620223045349, |
| "loss": 0.321, |
| "step": 97000 |
| }, |
| { |
| "base_loss": 0.3064390652179718, |
| "epoch": 1.0667572021484375, |
| "grad_norm": 0.0009838842088356614, |
| "learning_rate": 4.07017707824707e-05, |
| "lookahead_loss": 6.700049202919006, |
| "loss": 0.317, |
| "step": 97500 |
| }, |
| { |
| "base_loss": 0.3303199237883091, |
| "epoch": 1.0677108764648438, |
| "grad_norm": 0.0010200405959039927, |
| "learning_rate": 4.065408706665039e-05, |
| "lookahead_loss": 6.784039269447327, |
| "loss": 0.3412, |
| "step": 98000 |
| }, |
| { |
| "base_loss": 0.2994670196175575, |
| "epoch": 1.06866455078125, |
| "grad_norm": 0.0010121484519913793, |
| "learning_rate": 4.060640335083008e-05, |
| "lookahead_loss": 6.775819786071778, |
| "loss": 0.3108, |
| "step": 98500 |
| }, |
| { |
| "base_loss": 0.3000359579175711, |
| "epoch": 1.0696182250976562, |
| "grad_norm": 0.0009712814935483038, |
| "learning_rate": 4.0558719635009766e-05, |
| "lookahead_loss": 6.7902104940414425, |
| "loss": 0.3161, |
| "step": 99000 |
| }, |
| { |
| "base_loss": 0.34639680609107015, |
| "epoch": 1.0705718994140625, |
| "grad_norm": 0.0009614603477530181, |
| "learning_rate": 4.0511035919189456e-05, |
| "lookahead_loss": 6.673412177562714, |
| "loss": 0.3593, |
| "step": 99500 |
| }, |
| { |
| "base_loss": 0.3132462115287781, |
| "epoch": 1.0715255737304688, |
| "grad_norm": 0.0009738055523484945, |
| "learning_rate": 4.046335220336914e-05, |
| "lookahead_loss": 6.736346269607544, |
| "loss": 0.3247, |
| "step": 100000 |
| }, |
| { |
| "epoch": 1.0715255737304688, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.758892397530163, |
| "eval_lookahead_perplexity": 861.6872601047816, |
| "eval_loss": 0.14263826608657837, |
| "eval_perplexity": 1.1533125339443155, |
| "eval_runtime": 492.9295, |
| "eval_samples_per_second": 10.143, |
| "eval_steps_per_second": 0.319, |
| "step": 100000 |
| }, |
| { |
| "base_loss": 0.3044133240580559, |
| "epoch": 1.0009536743164062, |
| "grad_norm": 0.000981901423074305, |
| "learning_rate": 4.041566848754883e-05, |
| "lookahead_loss": 6.821595732688904, |
| "loss": 0.3138, |
| "step": 100500 |
| }, |
| { |
| "base_loss": 0.30059696701169014, |
| "epoch": 1.0019073486328125, |
| "grad_norm": 0.000999079318717122, |
| "learning_rate": 4.036798477172852e-05, |
| "lookahead_loss": 6.66421698474884, |
| "loss": 0.3137, |
| "step": 101000 |
| }, |
| { |
| "base_loss": 0.31169990518689156, |
| "epoch": 1.0028610229492188, |
| "grad_norm": 0.0009957862785086036, |
| "learning_rate": 4.03203010559082e-05, |
| "lookahead_loss": 6.655234758377075, |
| "loss": 0.3215, |
| "step": 101500 |
| }, |
| { |
| "base_loss": 0.3227726019620895, |
| "epoch": 1.003814697265625, |
| "grad_norm": 0.0009710168233141303, |
| "learning_rate": 4.0272617340087893e-05, |
| "lookahead_loss": 6.6804737997055055, |
| "loss": 0.3354, |
| "step": 102000 |
| }, |
| { |
| "base_loss": 0.3022470915019512, |
| "epoch": 1.0047683715820312, |
| "grad_norm": 0.000950310961343348, |
| "learning_rate": 4.022493362426758e-05, |
| "lookahead_loss": 6.665619974136352, |
| "loss": 0.3172, |
| "step": 102500 |
| }, |
| { |
| "base_loss": 0.30552061820030213, |
| "epoch": 1.0057220458984375, |
| "grad_norm": 0.0008522234857082367, |
| "learning_rate": 4.017724990844727e-05, |
| "lookahead_loss": 6.790848443984985, |
| "loss": 0.314, |
| "step": 103000 |
| }, |
| { |
| "base_loss": 0.2953472335338593, |
| "epoch": 1.0066757202148438, |
| "grad_norm": 0.0009317957446910441, |
| "learning_rate": 4.012956619262696e-05, |
| "lookahead_loss": 6.637859883308411, |
| "loss": 0.3144, |
| "step": 103500 |
| }, |
| { |
| "base_loss": 0.312746944963932, |
| "epoch": 1.00762939453125, |
| "grad_norm": 0.0009721561800688505, |
| "learning_rate": 4.008188247680664e-05, |
| "lookahead_loss": 6.700247512817382, |
| "loss": 0.3242, |
| "step": 104000 |
| }, |
| { |
| "base_loss": 0.3169711889922619, |
| "epoch": 1.0085830688476562, |
| "grad_norm": 0.0009455936960875988, |
| "learning_rate": 4.003419876098633e-05, |
| "lookahead_loss": 6.695308849334717, |
| "loss": 0.3221, |
| "step": 104500 |
| }, |
| { |
| "base_loss": 0.306710629016161, |
| "epoch": 1.0095367431640625, |
| "grad_norm": 0.0009776534279808402, |
| "learning_rate": 3.9986515045166014e-05, |
| "lookahead_loss": 6.697743677139282, |
| "loss": 0.3199, |
| "step": 105000 |
| }, |
| { |
| "epoch": 1.0095367431640625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.739253997802734, |
| "eval_lookahead_perplexity": 844.9301809110984, |
| "eval_loss": 0.14259953796863556, |
| "eval_perplexity": 1.1532678691853726, |
| "eval_runtime": 478.9091, |
| "eval_samples_per_second": 10.44, |
| "eval_steps_per_second": 0.328, |
| "step": 105000 |
| }, |
| { |
| "base_loss": 0.30083237382769584, |
| "epoch": 1.0104904174804688, |
| "grad_norm": 0.0009771056938916445, |
| "learning_rate": 3.9938831329345705e-05, |
| "lookahead_loss": 6.666155261039734, |
| "loss": 0.3113, |
| "step": 105500 |
| }, |
| { |
| "base_loss": 0.2993237827420235, |
| "epoch": 1.011444091796875, |
| "grad_norm": 0.0010173760820180178, |
| "learning_rate": 3.9891147613525395e-05, |
| "lookahead_loss": 6.6820623445510865, |
| "loss": 0.3139, |
| "step": 106000 |
| }, |
| { |
| "base_loss": 0.3238567093908787, |
| "epoch": 1.0123977661132812, |
| "grad_norm": 0.000880017876625061, |
| "learning_rate": 3.984346389770508e-05, |
| "lookahead_loss": 6.659082005500793, |
| "loss": 0.3342, |
| "step": 106500 |
| }, |
| { |
| "base_loss": 0.3051931007504463, |
| "epoch": 1.0133514404296875, |
| "grad_norm": 0.0009482282912358642, |
| "learning_rate": 3.979578018188477e-05, |
| "lookahead_loss": 6.76999457359314, |
| "loss": 0.3202, |
| "step": 107000 |
| }, |
| { |
| "base_loss": 0.29808008483052256, |
| "epoch": 1.0143051147460938, |
| "grad_norm": 0.0009459082502871752, |
| "learning_rate": 3.974809646606445e-05, |
| "lookahead_loss": 6.709939098358154, |
| "loss": 0.3136, |
| "step": 107500 |
| }, |
| { |
| "base_loss": 0.29345863962173463, |
| "epoch": 1.0152587890625, |
| "grad_norm": 0.0009026661282405257, |
| "learning_rate": 3.970041275024414e-05, |
| "lookahead_loss": 6.641336709499359, |
| "loss": 0.3059, |
| "step": 108000 |
| }, |
| { |
| "base_loss": 0.3092884007692337, |
| "epoch": 1.0162124633789062, |
| "grad_norm": 0.0009934415575116873, |
| "learning_rate": 3.965272903442383e-05, |
| "lookahead_loss": 6.710262378692627, |
| "loss": 0.3215, |
| "step": 108500 |
| }, |
| { |
| "base_loss": 0.31143338218331335, |
| "epoch": 1.0171661376953125, |
| "grad_norm": 0.0009016263647936285, |
| "learning_rate": 3.9605045318603516e-05, |
| "lookahead_loss": 6.74980753993988, |
| "loss": 0.3234, |
| "step": 109000 |
| }, |
| { |
| "base_loss": 0.3001442384421825, |
| "epoch": 1.0181198120117188, |
| "grad_norm": 0.0009415132808499038, |
| "learning_rate": 3.9557361602783206e-05, |
| "lookahead_loss": 6.753072287559509, |
| "loss": 0.3119, |
| "step": 109500 |
| }, |
| { |
| "base_loss": 0.2986592257618904, |
| "epoch": 1.019073486328125, |
| "grad_norm": 0.0009482503519393504, |
| "learning_rate": 3.950967788696289e-05, |
| "lookahead_loss": 6.771802840709686, |
| "loss": 0.3116, |
| "step": 110000 |
| }, |
| { |
| "epoch": 1.019073486328125, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.720215821799378, |
| "eval_lookahead_perplexity": 828.9964076723086, |
| "eval_loss": 0.1425611525774002, |
| "eval_perplexity": 1.1532236013966384, |
| "eval_runtime": 486.3363, |
| "eval_samples_per_second": 10.281, |
| "eval_steps_per_second": 0.323, |
| "step": 110000 |
| }, |
| { |
| "base_loss": 0.30347599306702616, |
| "epoch": 1.0200271606445312, |
| "grad_norm": 0.0010480897035449743, |
| "learning_rate": 3.946199417114258e-05, |
| "lookahead_loss": 6.608225521087647, |
| "loss": 0.3157, |
| "step": 110500 |
| }, |
| { |
| "base_loss": 0.3299741225540638, |
| "epoch": 1.0209808349609375, |
| "grad_norm": 0.0009382431744597852, |
| "learning_rate": 3.941431045532227e-05, |
| "lookahead_loss": 6.704700765609741, |
| "loss": 0.341, |
| "step": 111000 |
| }, |
| { |
| "base_loss": 0.3070560489296913, |
| "epoch": 1.0219345092773438, |
| "grad_norm": 0.0009868369670584798, |
| "learning_rate": 3.936662673950195e-05, |
| "lookahead_loss": 6.654251655578613, |
| "loss": 0.3169, |
| "step": 111500 |
| }, |
| { |
| "base_loss": 0.301061170309782, |
| "epoch": 1.02288818359375, |
| "grad_norm": 0.0010104605462402105, |
| "learning_rate": 3.9318943023681643e-05, |
| "lookahead_loss": 6.692898473739624, |
| "loss": 0.3139, |
| "step": 112000 |
| }, |
| { |
| "base_loss": 0.30337609922885894, |
| "epoch": 1.0238418579101562, |
| "grad_norm": 0.0009765701834112406, |
| "learning_rate": 3.927125930786133e-05, |
| "lookahead_loss": 6.65063930606842, |
| "loss": 0.315, |
| "step": 112500 |
| }, |
| { |
| "base_loss": 0.3241444931924343, |
| "epoch": 1.0247955322265625, |
| "grad_norm": 0.0009068374638445675, |
| "learning_rate": 3.922357559204102e-05, |
| "lookahead_loss": 6.651956144332885, |
| "loss": 0.3367, |
| "step": 113000 |
| }, |
| { |
| "base_loss": 0.3070600248277187, |
| "epoch": 1.0257492065429688, |
| "grad_norm": 0.0009709529695101082, |
| "learning_rate": 3.917589187622071e-05, |
| "lookahead_loss": 6.608614470481872, |
| "loss": 0.3218, |
| "step": 113500 |
| }, |
| { |
| "base_loss": 0.3022406686246395, |
| "epoch": 1.026702880859375, |
| "grad_norm": 0.0010030195116996765, |
| "learning_rate": 3.912820816040039e-05, |
| "lookahead_loss": 6.62807030582428, |
| "loss": 0.3116, |
| "step": 114000 |
| }, |
| { |
| "base_loss": 0.30680677881836893, |
| "epoch": 1.0276565551757812, |
| "grad_norm": 0.0009639645577408373, |
| "learning_rate": 3.908052444458008e-05, |
| "lookahead_loss": 6.749866914749146, |
| "loss": 0.3195, |
| "step": 114500 |
| }, |
| { |
| "base_loss": 0.33426042160391806, |
| "epoch": 1.0286102294921875, |
| "grad_norm": 0.00096013059373945, |
| "learning_rate": 3.9032840728759764e-05, |
| "lookahead_loss": 6.75391247177124, |
| "loss": 0.3431, |
| "step": 115000 |
| }, |
| { |
| "epoch": 1.0286102294921875, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.702820940901296, |
| "eval_lookahead_perplexity": 814.7008094738575, |
| "eval_loss": 0.14252659678459167, |
| "eval_perplexity": 1.1531837515293326, |
| "eval_runtime": 472.8492, |
| "eval_samples_per_second": 10.574, |
| "eval_steps_per_second": 0.332, |
| "step": 115000 |
| }, |
| { |
| "base_loss": 0.3049518305659294, |
| "epoch": 1.0295639038085938, |
| "grad_norm": 0.0009578875033184886, |
| "learning_rate": 3.8985157012939455e-05, |
| "lookahead_loss": 6.715790456771851, |
| "loss": 0.3138, |
| "step": 115500 |
| }, |
| { |
| "base_loss": 0.3062360401749611, |
| "epoch": 1.030517578125, |
| "grad_norm": 0.000939805235248059, |
| "learning_rate": 3.8937473297119145e-05, |
| "lookahead_loss": 6.674023857593537, |
| "loss": 0.3173, |
| "step": 116000 |
| }, |
| { |
| "base_loss": 0.30225355681777, |
| "epoch": 1.0314712524414062, |
| "grad_norm": 0.0009627947001717985, |
| "learning_rate": 3.888978958129883e-05, |
| "lookahead_loss": 6.712348463058472, |
| "loss": 0.3144, |
| "step": 116500 |
| }, |
| { |
| "base_loss": 0.3184074863195419, |
| "epoch": 1.0324249267578125, |
| "grad_norm": 0.0009747587610036135, |
| "learning_rate": 3.884210586547852e-05, |
| "lookahead_loss": 6.652136072158814, |
| "loss": 0.3359, |
| "step": 117000 |
| }, |
| { |
| "base_loss": 0.30629492220282556, |
| "epoch": 1.0333786010742188, |
| "grad_norm": 0.0010025979718193412, |
| "learning_rate": 3.87944221496582e-05, |
| "lookahead_loss": 6.700876714706421, |
| "loss": 0.3163, |
| "step": 117500 |
| }, |
| { |
| "base_loss": 0.3031555346250534, |
| "epoch": 1.034332275390625, |
| "grad_norm": 0.0009006695472635329, |
| "learning_rate": 3.874673843383789e-05, |
| "lookahead_loss": 6.757840476036072, |
| "loss": 0.3149, |
| "step": 118000 |
| }, |
| { |
| "base_loss": 0.31164542263746264, |
| "epoch": 1.0352859497070312, |
| "grad_norm": 0.0010140526574105024, |
| "learning_rate": 3.869905471801758e-05, |
| "lookahead_loss": 6.589431819915771, |
| "loss": 0.3232, |
| "step": 118500 |
| }, |
| { |
| "base_loss": 0.324304408878088, |
| "epoch": 1.0362396240234375, |
| "grad_norm": 0.0009850772330537438, |
| "learning_rate": 3.8651371002197266e-05, |
| "lookahead_loss": 6.707897541999817, |
| "loss": 0.3363, |
| "step": 119000 |
| }, |
| { |
| "base_loss": 0.30813179594278334, |
| "epoch": 1.0371932983398438, |
| "grad_norm": 0.0009913926478475332, |
| "learning_rate": 3.8603687286376956e-05, |
| "lookahead_loss": 6.640646786689758, |
| "loss": 0.3205, |
| "step": 119500 |
| }, |
| { |
| "base_loss": 0.30138176554441454, |
| "epoch": 1.03814697265625, |
| "grad_norm": 0.0009480128646828234, |
| "learning_rate": 3.855600357055664e-05, |
| "lookahead_loss": 6.679037447929383, |
| "loss": 0.3145, |
| "step": 120000 |
| }, |
| { |
| "epoch": 1.03814697265625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.686355599960961, |
| "eval_lookahead_perplexity": 801.3963149780681, |
| "eval_loss": 0.1424950510263443, |
| "eval_perplexity": 1.1531473740472726, |
| "eval_runtime": 483.3795, |
| "eval_samples_per_second": 10.344, |
| "eval_steps_per_second": 0.325, |
| "step": 120000 |
| }, |
| { |
| "base_loss": 0.30871694785356524, |
| "epoch": 1.0391006469726562, |
| "grad_norm": 0.0009410271886736155, |
| "learning_rate": 3.850831985473633e-05, |
| "lookahead_loss": 6.625640468597412, |
| "loss": 0.3211, |
| "step": 120500 |
| }, |
| { |
| "base_loss": 0.32506244936585427, |
| "epoch": 1.0400543212890625, |
| "grad_norm": 0.0009350285981781781, |
| "learning_rate": 3.846063613891602e-05, |
| "lookahead_loss": 6.6512613153457645, |
| "loss": 0.3345, |
| "step": 121000 |
| }, |
| { |
| "base_loss": 0.30769926142692566, |
| "epoch": 1.0410079956054688, |
| "grad_norm": 0.000978046446107328, |
| "learning_rate": 3.84129524230957e-05, |
| "lookahead_loss": 6.6048227882385255, |
| "loss": 0.3173, |
| "step": 121500 |
| }, |
| { |
| "base_loss": 0.29858891409635546, |
| "epoch": 1.041961669921875, |
| "grad_norm": 0.000979132833890617, |
| "learning_rate": 3.8365268707275393e-05, |
| "lookahead_loss": 6.689519411087036, |
| "loss": 0.3105, |
| "step": 122000 |
| }, |
| { |
| "base_loss": 0.3094627737402916, |
| "epoch": 1.0429153442382812, |
| "grad_norm": 0.0009663606178946793, |
| "learning_rate": 3.831758499145508e-05, |
| "lookahead_loss": 6.7044447908401485, |
| "loss": 0.3251, |
| "step": 122500 |
| }, |
| { |
| "base_loss": 0.32764697542786597, |
| "epoch": 1.0438690185546875, |
| "grad_norm": 0.0009794370271265507, |
| "learning_rate": 3.826990127563477e-05, |
| "lookahead_loss": 6.723638868331909, |
| "loss": 0.3413, |
| "step": 123000 |
| }, |
| { |
| "base_loss": 0.29553532418608663, |
| "epoch": 1.0448226928710938, |
| "grad_norm": 0.0009821865241974592, |
| "learning_rate": 3.822221755981446e-05, |
| "lookahead_loss": 6.640208226203918, |
| "loss": 0.3095, |
| "step": 123500 |
| }, |
| { |
| "base_loss": 0.3041268612146378, |
| "epoch": 1.0457763671875, |
| "grad_norm": 0.0009438347187824547, |
| "learning_rate": 3.817453384399414e-05, |
| "lookahead_loss": 6.654849781990051, |
| "loss": 0.3159, |
| "step": 124000 |
| }, |
| { |
| "base_loss": 0.3303914776444435, |
| "epoch": 1.0467300415039062, |
| "grad_norm": 0.0009457177948206663, |
| "learning_rate": 3.812685012817383e-05, |
| "lookahead_loss": 6.630368858814239, |
| "loss": 0.3403, |
| "step": 124500 |
| }, |
| { |
| "base_loss": 0.3248122656941414, |
| "epoch": 1.0476837158203125, |
| "grad_norm": 0.0010212536435574293, |
| "learning_rate": 3.8079166412353514e-05, |
| "lookahead_loss": 6.664453419685364, |
| "loss": 0.3396, |
| "step": 125000 |
| }, |
| { |
| "epoch": 1.0476837158203125, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.671399636009631, |
| "eval_lookahead_perplexity": 789.4998439268663, |
| "eval_loss": 0.14246411621570587, |
| "eval_perplexity": 1.153111702203372, |
| "eval_runtime": 483.8837, |
| "eval_samples_per_second": 10.333, |
| "eval_steps_per_second": 0.324, |
| "step": 125000 |
| }, |
| { |
| "base_loss": 0.2969100174307823, |
| "epoch": 1.0486373901367188, |
| "grad_norm": 0.0009448639466427267, |
| "learning_rate": 3.8031482696533205e-05, |
| "lookahead_loss": 6.613508511543274, |
| "loss": 0.3092, |
| "step": 125500 |
| }, |
| { |
| "base_loss": 0.302005185931921, |
| "epoch": 1.049591064453125, |
| "grad_norm": 0.0009936641436070204, |
| "learning_rate": 3.7983798980712895e-05, |
| "lookahead_loss": 6.604424912452698, |
| "loss": 0.3169, |
| "step": 126000 |
| }, |
| { |
| "base_loss": 0.31874441370368006, |
| "epoch": 1.0505447387695312, |
| "grad_norm": 0.0009042201563715935, |
| "learning_rate": 3.793611526489258e-05, |
| "lookahead_loss": 6.707013080596924, |
| "loss": 0.3317, |
| "step": 126500 |
| }, |
| { |
| "base_loss": 0.30408672893047334, |
| "epoch": 1.0514984130859375, |
| "grad_norm": 0.0009868575725704432, |
| "learning_rate": 3.788843154907227e-05, |
| "lookahead_loss": 6.650782800674438, |
| "loss": 0.3188, |
| "step": 127000 |
| }, |
| { |
| "base_loss": 0.3058005510568619, |
| "epoch": 1.0524520874023438, |
| "grad_norm": 0.00102641258854419, |
| "learning_rate": 3.784074783325195e-05, |
| "lookahead_loss": 6.603402623176574, |
| "loss": 0.3186, |
| "step": 127500 |
| }, |
| { |
| "base_loss": 0.32026463899016383, |
| "epoch": 1.05340576171875, |
| "grad_norm": 0.0009292624308727682, |
| "learning_rate": 3.779306411743164e-05, |
| "lookahead_loss": 6.604880172729493, |
| "loss": 0.3298, |
| "step": 128000 |
| }, |
| { |
| "base_loss": 0.35889338579773905, |
| "epoch": 1.0543594360351562, |
| "grad_norm": 0.0009620094788260758, |
| "learning_rate": 3.774538040161133e-05, |
| "lookahead_loss": 6.646626858711243, |
| "loss": 0.3714, |
| "step": 128500 |
| }, |
| { |
| "base_loss": 0.29546374672651293, |
| "epoch": 1.0553131103515625, |
| "grad_norm": 0.0009715965134091675, |
| "learning_rate": 3.7697696685791016e-05, |
| "lookahead_loss": 6.637260946273804, |
| "loss": 0.3067, |
| "step": 129000 |
| }, |
| { |
| "base_loss": 0.3063408683240414, |
| "epoch": 1.0562667846679688, |
| "grad_norm": 0.0009361687116324902, |
| "learning_rate": 3.7650012969970706e-05, |
| "lookahead_loss": 6.649524963378906, |
| "loss": 0.3181, |
| "step": 129500 |
| }, |
| { |
| "base_loss": 0.3186078954935074, |
| "epoch": 1.057220458984375, |
| "grad_norm": 0.0009357648668810725, |
| "learning_rate": 3.760232925415039e-05, |
| "lookahead_loss": 6.6754186210632325, |
| "loss": 0.332, |
| "step": 130000 |
| }, |
| { |
| "epoch": 1.057220458984375, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.6561929562602185, |
| "eval_lookahead_perplexity": 777.5849948862144, |
| "eval_loss": 0.14243432879447937, |
| "eval_perplexity": 1.1530773544909447, |
| "eval_runtime": 471.0856, |
| "eval_samples_per_second": 10.614, |
| "eval_steps_per_second": 0.333, |
| "step": 130000 |
| }, |
| { |
| "base_loss": 0.31768571627140046, |
| "epoch": 1.0581741333007812, |
| "grad_norm": 0.0009750055032782257, |
| "learning_rate": 3.755464553833008e-05, |
| "lookahead_loss": 6.655493871688843, |
| "loss": 0.3273, |
| "step": 130500 |
| }, |
| { |
| "base_loss": 0.2922684009075165, |
| "epoch": 1.0591278076171875, |
| "grad_norm": 0.0009410986676812172, |
| "learning_rate": 3.750696182250977e-05, |
| "lookahead_loss": 6.57894612789154, |
| "loss": 0.3084, |
| "step": 131000 |
| }, |
| { |
| "base_loss": 0.30112267237901685, |
| "epoch": 1.0600814819335938, |
| "grad_norm": 0.0009611063869670033, |
| "learning_rate": 3.745927810668945e-05, |
| "lookahead_loss": 6.6305308623313906, |
| "loss": 0.3154, |
| "step": 131500 |
| }, |
| { |
| "base_loss": 0.32029621145129206, |
| "epoch": 1.06103515625, |
| "grad_norm": 0.0009844622109085321, |
| "learning_rate": 3.7411594390869143e-05, |
| "lookahead_loss": 6.5994256973266605, |
| "loss": 0.3317, |
| "step": 132000 |
| }, |
| { |
| "base_loss": 0.30533574494719506, |
| "epoch": 1.0619888305664062, |
| "grad_norm": 0.0010215704096481204, |
| "learning_rate": 3.736391067504883e-05, |
| "lookahead_loss": 6.603619037628174, |
| "loss": 0.3156, |
| "step": 132500 |
| }, |
| { |
| "base_loss": 0.30571810373663905, |
| "epoch": 1.0629425048828125, |
| "grad_norm": 0.0010208436287939548, |
| "learning_rate": 3.731622695922852e-05, |
| "lookahead_loss": 6.59216494178772, |
| "loss": 0.3172, |
| "step": 133000 |
| }, |
| { |
| "base_loss": 0.31451627737283705, |
| "epoch": 1.0638961791992188, |
| "grad_norm": 0.0009539647144265473, |
| "learning_rate": 3.726854324340821e-05, |
| "lookahead_loss": 6.634691061019898, |
| "loss": 0.3285, |
| "step": 133500 |
| }, |
| { |
| "base_loss": 0.30425655883550645, |
| "epoch": 1.064849853515625, |
| "grad_norm": 0.0009696083143353462, |
| "learning_rate": 3.722085952758789e-05, |
| "lookahead_loss": 6.656025648117065, |
| "loss": 0.3179, |
| "step": 134000 |
| }, |
| { |
| "base_loss": 0.31105126801133154, |
| "epoch": 1.0658035278320312, |
| "grad_norm": 0.0009692656458355486, |
| "learning_rate": 3.717317581176758e-05, |
| "lookahead_loss": 6.560485363006592, |
| "loss": 0.3215, |
| "step": 134500 |
| }, |
| { |
| "base_loss": 0.3071163959801197, |
| "epoch": 1.0667572021484375, |
| "grad_norm": 0.0009831758216023445, |
| "learning_rate": 3.7125492095947264e-05, |
| "lookahead_loss": 6.5449725456237795, |
| "loss": 0.3169, |
| "step": 135000 |
| }, |
| { |
| "epoch": 1.0667572021484375, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.641093528308807, |
| "eval_lookahead_perplexity": 765.9321036725115, |
| "eval_loss": 0.14240561425685883, |
| "eval_perplexity": 1.1530442448832356, |
| "eval_runtime": 485.8723, |
| "eval_samples_per_second": 10.291, |
| "eval_steps_per_second": 0.323, |
| "step": 135000 |
| }, |
| { |
| "base_loss": 0.3321034919023514, |
| "epoch": 1.0677108764648438, |
| "grad_norm": 0.0010069276904687285, |
| "learning_rate": 3.7077808380126955e-05, |
| "lookahead_loss": 6.633243779182434, |
| "loss": 0.3433, |
| "step": 135500 |
| }, |
| { |
| "base_loss": 0.3017843673825264, |
| "epoch": 1.06866455078125, |
| "grad_norm": 0.0009863151935860515, |
| "learning_rate": 3.7030124664306645e-05, |
| "lookahead_loss": 6.624729963302612, |
| "loss": 0.3108, |
| "step": 136000 |
| }, |
| { |
| "base_loss": 0.302195555627346, |
| "epoch": 1.0696182250976562, |
| "grad_norm": 0.0009720239322632551, |
| "learning_rate": 3.698244094848633e-05, |
| "lookahead_loss": 6.641478686332703, |
| "loss": 0.3168, |
| "step": 136500 |
| }, |
| { |
| "base_loss": 0.3459869565963745, |
| "epoch": 1.0705718994140625, |
| "grad_norm": 0.0009440227877348661, |
| "learning_rate": 3.693475723266602e-05, |
| "lookahead_loss": 6.523862397193908, |
| "loss": 0.3605, |
| "step": 137000 |
| }, |
| { |
| "base_loss": 0.3151495299339294, |
| "epoch": 1.0715255737304688, |
| "grad_norm": 0.0009616228053346276, |
| "learning_rate": 3.68870735168457e-05, |
| "lookahead_loss": 6.586525348186493, |
| "loss": 0.3265, |
| "step": 137500 |
| }, |
| { |
| "base_loss": 0.30790447345376015, |
| "epoch": 1.072479248046875, |
| "grad_norm": 0.001022504991851747, |
| "learning_rate": 3.683938980102539e-05, |
| "lookahead_loss": 6.648889023780823, |
| "loss": 0.3207, |
| "step": 138000 |
| }, |
| { |
| "base_loss": 0.30545566940307617, |
| "epoch": 1.0734329223632812, |
| "grad_norm": 0.0009266745182685554, |
| "learning_rate": 3.679170608520508e-05, |
| "lookahead_loss": 6.604302444458008, |
| "loss": 0.3189, |
| "step": 138500 |
| }, |
| { |
| "base_loss": 0.32841417971253395, |
| "epoch": 1.0743865966796875, |
| "grad_norm": 0.0009727867436595261, |
| "learning_rate": 3.6744022369384766e-05, |
| "lookahead_loss": 6.628902969360351, |
| "loss": 0.3409, |
| "step": 139000 |
| }, |
| { |
| "base_loss": 0.30363579127192497, |
| "epoch": 1.0753402709960938, |
| "grad_norm": 0.0009738055523484945, |
| "learning_rate": 3.6696338653564456e-05, |
| "lookahead_loss": 6.6886735420227055, |
| "loss": 0.3163, |
| "step": 139500 |
| }, |
| { |
| "base_loss": 0.30504586565494535, |
| "epoch": 1.0762939453125, |
| "grad_norm": 0.0009336514631286263, |
| "learning_rate": 3.664865493774414e-05, |
| "lookahead_loss": 6.637202547073365, |
| "loss": 0.3183, |
| "step": 140000 |
| }, |
| { |
| "epoch": 1.0762939453125, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.625944952614391, |
| "eval_lookahead_perplexity": 754.4167639313139, |
| "eval_loss": 0.14237651228904724, |
| "eval_perplexity": 1.1530106895150016, |
| "eval_runtime": 484.1692, |
| "eval_samples_per_second": 10.327, |
| "eval_steps_per_second": 0.324, |
| "step": 140000 |
| }, |
| { |
| "base_loss": 0.33029799509048463, |
| "epoch": 1.0772476196289062, |
| "grad_norm": 0.0009831954957917333, |
| "learning_rate": 3.660097122192383e-05, |
| "lookahead_loss": 6.633178756713868, |
| "loss": 0.3456, |
| "step": 140500 |
| }, |
| { |
| "base_loss": 0.3037954642176628, |
| "epoch": 1.0782012939453125, |
| "grad_norm": 0.00096993736224249, |
| "learning_rate": 3.655328750610352e-05, |
| "lookahead_loss": 6.641198945999146, |
| "loss": 0.317, |
| "step": 141000 |
| }, |
| { |
| "base_loss": 0.29821320512890814, |
| "epoch": 1.0791549682617188, |
| "grad_norm": 0.0009353780187666416, |
| "learning_rate": 3.65056037902832e-05, |
| "lookahead_loss": 6.639979603767395, |
| "loss": 0.3132, |
| "step": 141500 |
| }, |
| { |
| "base_loss": 0.3142137563228607, |
| "epoch": 1.080108642578125, |
| "grad_norm": 0.0009523274493403733, |
| "learning_rate": 3.6457920074462893e-05, |
| "lookahead_loss": 6.602743772506714, |
| "loss": 0.3306, |
| "step": 142000 |
| }, |
| { |
| "base_loss": 0.3222310249209404, |
| "epoch": 1.0810623168945312, |
| "grad_norm": 0.0009943461045622826, |
| "learning_rate": 3.641023635864258e-05, |
| "lookahead_loss": 6.649980679988861, |
| "loss": 0.3395, |
| "step": 142500 |
| }, |
| { |
| "base_loss": 0.3002626436650753, |
| "epoch": 1.0820159912109375, |
| "grad_norm": 0.0009161168127320707, |
| "learning_rate": 3.636255264282227e-05, |
| "lookahead_loss": 6.636230380058288, |
| "loss": 0.3127, |
| "step": 143000 |
| }, |
| { |
| "base_loss": 0.3045452245473862, |
| "epoch": 1.0829696655273438, |
| "grad_norm": 0.0009895727271214128, |
| "learning_rate": 3.631486892700196e-05, |
| "lookahead_loss": 6.670627347946167, |
| "loss": 0.3188, |
| "step": 143500 |
| }, |
| { |
| "base_loss": 0.33469617655873296, |
| "epoch": 1.08392333984375, |
| "grad_norm": 0.0009402433061040938, |
| "learning_rate": 3.626718521118164e-05, |
| "lookahead_loss": 6.6782106046676635, |
| "loss": 0.346, |
| "step": 144000 |
| }, |
| { |
| "base_loss": 0.30740025800466536, |
| "epoch": 1.0848770141601562, |
| "grad_norm": 0.0009543623309582472, |
| "learning_rate": 3.621950149536133e-05, |
| "lookahead_loss": 6.598346343994141, |
| "loss": 0.3193, |
| "step": 144500 |
| }, |
| { |
| "base_loss": 0.300477741509676, |
| "epoch": 1.0858306884765625, |
| "grad_norm": 0.001002481789328158, |
| "learning_rate": 3.6171817779541014e-05, |
| "lookahead_loss": 6.61094771194458, |
| "loss": 0.3102, |
| "step": 145000 |
| }, |
| { |
| "epoch": 1.0858306884765625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.611703176467945, |
| "eval_lookahead_perplexity": 743.7486758347069, |
| "eval_loss": 0.14234933257102966, |
| "eval_perplexity": 1.1529793514354714, |
| "eval_runtime": 475.9695, |
| "eval_samples_per_second": 10.505, |
| "eval_steps_per_second": 0.33, |
| "step": 145000 |
| }, |
| { |
| "base_loss": 0.301901563256979, |
| "epoch": 1.0867843627929688, |
| "grad_norm": 0.0011312811402603984, |
| "learning_rate": 3.6124134063720705e-05, |
| "lookahead_loss": 6.581235520362854, |
| "loss": 0.3132, |
| "step": 145500 |
| }, |
| { |
| "base_loss": 0.338142231285572, |
| "epoch": 1.087738037109375, |
| "grad_norm": 0.0008956918027251959, |
| "learning_rate": 3.6076450347900395e-05, |
| "lookahead_loss": 6.629713255405426, |
| "loss": 0.3454, |
| "step": 146000 |
| }, |
| { |
| "base_loss": 0.3009798979461193, |
| "epoch": 1.0886917114257812, |
| "grad_norm": 0.0009857059922069311, |
| "learning_rate": 3.602876663208008e-05, |
| "lookahead_loss": 6.607778671741485, |
| "loss": 0.3122, |
| "step": 146500 |
| }, |
| { |
| "base_loss": 0.3090392453968525, |
| "epoch": 1.0896453857421875, |
| "grad_norm": 0.0010041121859103441, |
| "learning_rate": 3.598108291625977e-05, |
| "lookahead_loss": 6.641937935829163, |
| "loss": 0.3183, |
| "step": 147000 |
| }, |
| { |
| "base_loss": 0.30036539113521576, |
| "epoch": 1.0905990600585938, |
| "grad_norm": 0.0009667676058597863, |
| "learning_rate": 3.593339920043945e-05, |
| "lookahead_loss": 6.610139918327332, |
| "loss": 0.3123, |
| "step": 147500 |
| }, |
| { |
| "base_loss": 0.3006012495756149, |
| "epoch": 1.091552734375, |
| "grad_norm": 0.0008970113703981042, |
| "learning_rate": 3.588571548461914e-05, |
| "lookahead_loss": 6.584732450485229, |
| "loss": 0.3112, |
| "step": 148000 |
| }, |
| { |
| "base_loss": 0.31875682109594344, |
| "epoch": 1.0925064086914062, |
| "grad_norm": 0.0009264895925298333, |
| "learning_rate": 3.583803176879883e-05, |
| "lookahead_loss": 6.613501731872558, |
| "loss": 0.3355, |
| "step": 148500 |
| }, |
| { |
| "base_loss": 0.3104289738535881, |
| "epoch": 1.0934600830078125, |
| "grad_norm": 0.0009199742926284671, |
| "learning_rate": 3.5790348052978516e-05, |
| "lookahead_loss": 6.620612399101257, |
| "loss": 0.3219, |
| "step": 149000 |
| }, |
| { |
| "base_loss": 0.2877590928971767, |
| "epoch": 1.0944137573242188, |
| "grad_norm": 0.0009749355376698077, |
| "learning_rate": 3.5742664337158206e-05, |
| "lookahead_loss": 6.6178521070480345, |
| "loss": 0.3019, |
| "step": 149500 |
| }, |
| { |
| "base_loss": 0.2935507807135582, |
| "epoch": 1.095367431640625, |
| "grad_norm": 0.0009596819872967899, |
| "learning_rate": 3.569498062133789e-05, |
| "lookahead_loss": 6.54475514793396, |
| "loss": 0.3075, |
| "step": 150000 |
| }, |
| { |
| "epoch": 1.095367431640625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.600208011298134, |
| "eval_lookahead_perplexity": 735.2481132509488, |
| "eval_loss": 0.14232492446899414, |
| "eval_perplexity": 1.1529512097412606, |
| "eval_runtime": 491.9361, |
| "eval_samples_per_second": 10.164, |
| "eval_steps_per_second": 0.319, |
| "step": 150000 |
| }, |
| { |
| "base_loss": 0.2986202912926674, |
| "epoch": 1.0963211059570312, |
| "grad_norm": 0.0009500982123427093, |
| "learning_rate": 3.564729690551758e-05, |
| "lookahead_loss": 6.612848388671875, |
| "loss": 0.3123, |
| "step": 150500 |
| }, |
| { |
| "base_loss": 0.3307524161040783, |
| "epoch": 1.0972747802734375, |
| "grad_norm": 0.0009901663288474083, |
| "learning_rate": 3.559961318969727e-05, |
| "lookahead_loss": 6.628784805297852, |
| "loss": 0.3418, |
| "step": 151000 |
| }, |
| { |
| "base_loss": 0.29244673988223074, |
| "epoch": 1.0982284545898438, |
| "grad_norm": 0.0009901755256578326, |
| "learning_rate": 3.555192947387695e-05, |
| "lookahead_loss": 6.573961565494537, |
| "loss": 0.3074, |
| "step": 151500 |
| }, |
| { |
| "base_loss": 0.295786843508482, |
| "epoch": 1.09918212890625, |
| "grad_norm": 0.0009776337537914515, |
| "learning_rate": 3.5504245758056643e-05, |
| "lookahead_loss": 6.619675145149231, |
| "loss": 0.3112, |
| "step": 152000 |
| }, |
| { |
| "base_loss": 0.30293611577153207, |
| "epoch": 1.1001358032226562, |
| "grad_norm": 0.0009244863176718354, |
| "learning_rate": 3.545656204223633e-05, |
| "lookahead_loss": 6.614836613655091, |
| "loss": 0.3152, |
| "step": 152500 |
| }, |
| { |
| "base_loss": 0.3240869597494602, |
| "epoch": 1.1010894775390625, |
| "grad_norm": 0.0009519928717054427, |
| "learning_rate": 3.540887832641602e-05, |
| "lookahead_loss": 6.675669587135315, |
| "loss": 0.3341, |
| "step": 153000 |
| }, |
| { |
| "base_loss": 0.30599541807174685, |
| "epoch": 1.1020431518554688, |
| "grad_norm": 0.0009593720897100866, |
| "learning_rate": 3.536119461059571e-05, |
| "lookahead_loss": 6.632304777145386, |
| "loss": 0.3154, |
| "step": 153500 |
| }, |
| { |
| "base_loss": 0.2991089904308319, |
| "epoch": 1.102996826171875, |
| "grad_norm": 0.0009689299622550607, |
| "learning_rate": 3.531351089477539e-05, |
| "lookahead_loss": 6.595957444190979, |
| "loss": 0.314, |
| "step": 154000 |
| }, |
| { |
| "base_loss": 0.30219315418601034, |
| "epoch": 1.1039505004882812, |
| "grad_norm": 0.0010047038085758686, |
| "learning_rate": 3.526582717895508e-05, |
| "lookahead_loss": 6.639606526374817, |
| "loss": 0.3127, |
| "step": 154500 |
| }, |
| { |
| "base_loss": 0.3133500624895096, |
| "epoch": 1.1049041748046875, |
| "grad_norm": 0.0009745966526679695, |
| "learning_rate": 3.5218143463134764e-05, |
| "lookahead_loss": 6.568594479560852, |
| "loss": 0.3272, |
| "step": 155000 |
| }, |
| { |
| "epoch": 1.1049041748046875, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.58809915061195, |
| "eval_lookahead_perplexity": 726.3987820642042, |
| "eval_loss": 0.142300546169281, |
| "eval_perplexity": 1.1529231030937126, |
| "eval_runtime": 475.0132, |
| "eval_samples_per_second": 10.526, |
| "eval_steps_per_second": 0.331, |
| "step": 155000 |
| }, |
| { |
| "base_loss": 0.3070584389269352, |
| "epoch": 1.1058578491210938, |
| "grad_norm": 0.0009309753077104688, |
| "learning_rate": 3.5170459747314455e-05, |
| "lookahead_loss": 6.5637693691253665, |
| "loss": 0.3213, |
| "step": 155500 |
| }, |
| { |
| "base_loss": 0.29948241996765135, |
| "epoch": 1.1068115234375, |
| "grad_norm": 0.0009559483733028173, |
| "learning_rate": 3.5122776031494145e-05, |
| "lookahead_loss": 6.645429815769195, |
| "loss": 0.3098, |
| "step": 156000 |
| }, |
| { |
| "base_loss": 0.29492466670274736, |
| "epoch": 1.1077651977539062, |
| "grad_norm": 0.0009711109451018274, |
| "learning_rate": 3.507509231567383e-05, |
| "lookahead_loss": 6.56166731595993, |
| "loss": 0.3083, |
| "step": 156500 |
| }, |
| { |
| "base_loss": 0.3188383647501469, |
| "epoch": 1.1087188720703125, |
| "grad_norm": 0.0010144627885892987, |
| "learning_rate": 3.502740859985352e-05, |
| "lookahead_loss": 6.550499409675598, |
| "loss": 0.3316, |
| "step": 157000 |
| }, |
| { |
| "base_loss": 0.31659464621543887, |
| "epoch": 1.1096725463867188, |
| "grad_norm": 0.0009141165646724403, |
| "learning_rate": 3.49797248840332e-05, |
| "lookahead_loss": 6.527835812091827, |
| "loss": 0.3262, |
| "step": 157500 |
| }, |
| { |
| "base_loss": 0.3013280538916588, |
| "epoch": 1.110626220703125, |
| "grad_norm": 0.0008812876185402274, |
| "learning_rate": 3.493204116821289e-05, |
| "lookahead_loss": 6.478862133026123, |
| "loss": 0.3142, |
| "step": 158000 |
| }, |
| { |
| "base_loss": 0.29822684854269027, |
| "epoch": 1.1115798950195312, |
| "grad_norm": 0.0010021587368100882, |
| "learning_rate": 3.488435745239258e-05, |
| "lookahead_loss": 6.607266705513, |
| "loss": 0.3089, |
| "step": 158500 |
| }, |
| { |
| "base_loss": 0.3082665235698223, |
| "epoch": 1.1125335693359375, |
| "grad_norm": 0.0009319260716438293, |
| "learning_rate": 3.4836673736572266e-05, |
| "lookahead_loss": 6.618119819164276, |
| "loss": 0.3212, |
| "step": 159000 |
| }, |
| { |
| "base_loss": 0.34342152199149134, |
| "epoch": 1.1134872436523438, |
| "grad_norm": 0.0010022877249866724, |
| "learning_rate": 3.4788990020751956e-05, |
| "lookahead_loss": 6.628146926879883, |
| "loss": 0.3504, |
| "step": 159500 |
| }, |
| { |
| "base_loss": 0.29527223294973376, |
| "epoch": 1.11444091796875, |
| "grad_norm": 0.0009499759180471301, |
| "learning_rate": 3.474130630493164e-05, |
| "lookahead_loss": 6.5076857767105105, |
| "loss": 0.3088, |
| "step": 160000 |
| }, |
| { |
| "epoch": 1.11444091796875, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.576469425957043, |
| "eval_lookahead_perplexity": 717.9998972605395, |
| "eval_loss": 0.14227746427059174, |
| "eval_perplexity": 1.1528964917465718, |
| "eval_runtime": 486.8871, |
| "eval_samples_per_second": 10.269, |
| "eval_steps_per_second": 0.322, |
| "step": 160000 |
| }, |
| { |
| "base_loss": 0.2983711498081684, |
| "epoch": 1.1153945922851562, |
| "grad_norm": 0.0009712922619655728, |
| "learning_rate": 3.469362258911133e-05, |
| "lookahead_loss": 6.557691449165344, |
| "loss": 0.3111, |
| "step": 160500 |
| }, |
| { |
| "base_loss": 0.3164993856549263, |
| "epoch": 1.1163482666015625, |
| "grad_norm": 0.0010468108812347054, |
| "learning_rate": 3.464593887329102e-05, |
| "lookahead_loss": 6.562255940437317, |
| "loss": 0.3252, |
| "step": 161000 |
| }, |
| { |
| "base_loss": 0.3281388694047928, |
| "epoch": 1.1173019409179688, |
| "grad_norm": 0.0009933033725246787, |
| "learning_rate": 3.45982551574707e-05, |
| "lookahead_loss": 6.5705895509719845, |
| "loss": 0.3439, |
| "step": 161500 |
| }, |
| { |
| "base_loss": 0.3066762860417366, |
| "epoch": 1.118255615234375, |
| "grad_norm": 0.0010285564931109548, |
| "learning_rate": 3.4550571441650393e-05, |
| "lookahead_loss": 6.584883491516114, |
| "loss": 0.3162, |
| "step": 162000 |
| }, |
| { |
| "base_loss": 0.3002779276072979, |
| "epoch": 1.1192092895507812, |
| "grad_norm": 0.0009605666273273528, |
| "learning_rate": 3.450288772583008e-05, |
| "lookahead_loss": 6.601356457710266, |
| "loss": 0.3108, |
| "step": 162500 |
| }, |
| { |
| "base_loss": 0.3048044160306454, |
| "epoch": 2.0009536743164062, |
| "grad_norm": 0.000960490433499217, |
| "learning_rate": 3.445520401000977e-05, |
| "lookahead_loss": 6.638746548652649, |
| "loss": 0.3137, |
| "step": 163000 |
| }, |
| { |
| "base_loss": 0.2995053820014, |
| "epoch": 2.0019073486328125, |
| "grad_norm": 0.001005924423225224, |
| "learning_rate": 3.440752029418946e-05, |
| "lookahead_loss": 6.48337349319458, |
| "loss": 0.3134, |
| "step": 163500 |
| }, |
| { |
| "base_loss": 0.31198617857694627, |
| "epoch": 2.0028610229492188, |
| "grad_norm": 0.0010051662102341652, |
| "learning_rate": 3.435983657836914e-05, |
| "lookahead_loss": 6.466943081855774, |
| "loss": 0.3218, |
| "step": 164000 |
| }, |
| { |
| "base_loss": 0.32396442687511445, |
| "epoch": 2.003814697265625, |
| "grad_norm": 0.0009522914770059288, |
| "learning_rate": 3.431215286254883e-05, |
| "lookahead_loss": 6.503096837997436, |
| "loss": 0.3352, |
| "step": 164500 |
| }, |
| { |
| "base_loss": 0.3013957371413708, |
| "epoch": 2.0047683715820312, |
| "grad_norm": 0.0009518108563497663, |
| "learning_rate": 3.4264469146728514e-05, |
| "lookahead_loss": 6.490046030044556, |
| "loss": 0.316, |
| "step": 165000 |
| }, |
| { |
| "epoch": 2.0047683715820312, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.564821576158079, |
| "eval_lookahead_perplexity": 709.6852601291519, |
| "eval_loss": 0.14225433766841888, |
| "eval_perplexity": 1.1528698294763657, |
| "eval_runtime": 483.7285, |
| "eval_samples_per_second": 10.336, |
| "eval_steps_per_second": 0.325, |
| "step": 165000 |
| }, |
| { |
| "base_loss": 0.3039788320362568, |
| "epoch": 2.0057220458984375, |
| "grad_norm": 0.0008521187701262534, |
| "learning_rate": 3.4216785430908205e-05, |
| "lookahead_loss": 6.6104288854599, |
| "loss": 0.3131, |
| "step": 165500 |
| }, |
| { |
| "base_loss": 0.29717833909392355, |
| "epoch": 2.0066757202148438, |
| "grad_norm": 0.0009511762764304876, |
| "learning_rate": 3.4169101715087895e-05, |
| "lookahead_loss": 6.45364487361908, |
| "loss": 0.314, |
| "step": 166000 |
| }, |
| { |
| "base_loss": 0.31199148765206336, |
| "epoch": 2.00762939453125, |
| "grad_norm": 0.0009973255218937993, |
| "learning_rate": 3.412141799926758e-05, |
| "lookahead_loss": 6.517452167510986, |
| "loss": 0.3238, |
| "step": 166500 |
| }, |
| { |
| "base_loss": 0.3148621036410332, |
| "epoch": 2.0085830688476562, |
| "grad_norm": 0.0009120389586314559, |
| "learning_rate": 3.407373428344727e-05, |
| "lookahead_loss": 6.512342976093292, |
| "loss": 0.3221, |
| "step": 167000 |
| }, |
| { |
| "base_loss": 0.30580521461367605, |
| "epoch": 2.0095367431640625, |
| "grad_norm": 0.0009860595455393195, |
| "learning_rate": 3.402605056762695e-05, |
| "lookahead_loss": 6.524440247535706, |
| "loss": 0.3184, |
| "step": 167500 |
| }, |
| { |
| "base_loss": 0.3015244754254818, |
| "epoch": 2.0104904174804688, |
| "grad_norm": 0.000945191946811974, |
| "learning_rate": 3.397836685180664e-05, |
| "lookahead_loss": 6.494577717781067, |
| "loss": 0.312, |
| "step": 168000 |
| }, |
| { |
| "base_loss": 0.30137019059062004, |
| "epoch": 2.011444091796875, |
| "grad_norm": 0.0010133878095075488, |
| "learning_rate": 3.393068313598633e-05, |
| "lookahead_loss": 6.505839110851288, |
| "loss": 0.3131, |
| "step": 168500 |
| }, |
| { |
| "base_loss": 0.3252628707587719, |
| "epoch": 2.0123977661132812, |
| "grad_norm": 0.000883644272107631, |
| "learning_rate": 3.3882999420166016e-05, |
| "lookahead_loss": 6.501287104606629, |
| "loss": 0.3352, |
| "step": 169000 |
| }, |
| { |
| "base_loss": 0.30557073107361793, |
| "epoch": 2.0133514404296875, |
| "grad_norm": 0.0009423987939953804, |
| "learning_rate": 3.3835315704345706e-05, |
| "lookahead_loss": 6.586906661987305, |
| "loss": 0.3203, |
| "step": 169500 |
| }, |
| { |
| "base_loss": 0.30054079556465146, |
| "epoch": 2.0143051147460938, |
| "grad_norm": 0.0009685211116448045, |
| "learning_rate": 3.378763198852539e-05, |
| "lookahead_loss": 6.517388526916504, |
| "loss": 0.315, |
| "step": 170000 |
| }, |
| { |
| "epoch": 2.0143051147460938, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.553637696531253, |
| "eval_lookahead_perplexity": 701.792444028261, |
| "eval_loss": 0.1422322541475296, |
| "eval_perplexity": 1.1528443703325186, |
| "eval_runtime": 473.8031, |
| "eval_samples_per_second": 10.553, |
| "eval_steps_per_second": 0.331, |
| "step": 170000 |
| }, |
| { |
| "base_loss": 0.29648803743720054, |
| "epoch": 2.0152587890625, |
| "grad_norm": 0.0009045371552929282, |
| "learning_rate": 3.373994827270508e-05, |
| "lookahead_loss": 6.50550382566452, |
| "loss": 0.3072, |
| "step": 170500 |
| }, |
| { |
| "base_loss": 0.31412097451090815, |
| "epoch": 2.0162124633789062, |
| "grad_norm": 0.0009835059754550457, |
| "learning_rate": 3.369226455688477e-05, |
| "lookahead_loss": 6.529936217784882, |
| "loss": 0.3253, |
| "step": 171000 |
| }, |
| { |
| "base_loss": 0.3125672063827515, |
| "epoch": 2.0171661376953125, |
| "grad_norm": 0.00090819998877123, |
| "learning_rate": 3.364458084106445e-05, |
| "lookahead_loss": 6.578486999034881, |
| "loss": 0.3233, |
| "step": 171500 |
| }, |
| { |
| "base_loss": 0.3002317441105843, |
| "epoch": 2.0181198120117188, |
| "grad_norm": 0.0009094449342228472, |
| "learning_rate": 3.3596897125244143e-05, |
| "lookahead_loss": 6.584220232963562, |
| "loss": 0.3104, |
| "step": 172000 |
| }, |
| { |
| "base_loss": 0.29831535935401915, |
| "epoch": 2.019073486328125, |
| "grad_norm": 0.000955162278842181, |
| "learning_rate": 3.354921340942383e-05, |
| "lookahead_loss": 6.600418879508972, |
| "loss": 0.3102, |
| "step": 172500 |
| }, |
| { |
| "base_loss": 0.3020369653701782, |
| "epoch": 2.0200271606445312, |
| "grad_norm": 0.0010421768529340625, |
| "learning_rate": 3.350152969360352e-05, |
| "lookahead_loss": 6.4263093366622925, |
| "loss": 0.3149, |
| "step": 173000 |
| }, |
| { |
| "base_loss": 0.32652922403812407, |
| "epoch": 2.0209808349609375, |
| "grad_norm": 0.0009437088738195598, |
| "learning_rate": 3.345384597778321e-05, |
| "lookahead_loss": 6.5167139654159545, |
| "loss": 0.3389, |
| "step": 173500 |
| }, |
| { |
| "base_loss": 0.30453234216570857, |
| "epoch": 2.0219345092773438, |
| "grad_norm": 0.0009512313990853727, |
| "learning_rate": 3.340616226196289e-05, |
| "lookahead_loss": 6.482069372653961, |
| "loss": 0.3138, |
| "step": 174000 |
| }, |
| { |
| "base_loss": 0.2977458454966545, |
| "epoch": 2.02288818359375, |
| "grad_norm": 0.0010046373354271054, |
| "learning_rate": 3.335847854614258e-05, |
| "lookahead_loss": 6.520741944313049, |
| "loss": 0.3118, |
| "step": 174500 |
| }, |
| { |
| "base_loss": 0.30405546057224275, |
| "epoch": 2.0238418579101562, |
| "grad_norm": 0.0009604114457033575, |
| "learning_rate": 3.3310794830322264e-05, |
| "lookahead_loss": 6.482968965530396, |
| "loss": 0.3142, |
| "step": 175000 |
| }, |
| { |
| "epoch": 2.0238418579101562, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.541717086737148, |
| "eval_lookahead_perplexity": 693.4763152866533, |
| "eval_loss": 0.1422090083360672, |
| "eval_perplexity": 1.152817571841118, |
| "eval_runtime": 483.1484, |
| "eval_samples_per_second": 10.349, |
| "eval_steps_per_second": 0.325, |
| "step": 175000 |
| }, |
| { |
| "base_loss": 0.32463854083418847, |
| "epoch": 2.0247955322265625, |
| "grad_norm": 0.0009208981646224856, |
| "learning_rate": 3.3263111114501955e-05, |
| "lookahead_loss": 6.484801607131958, |
| "loss": 0.3355, |
| "step": 175500 |
| }, |
| { |
| "base_loss": 0.3075324648320675, |
| "epoch": 2.0257492065429688, |
| "grad_norm": 0.0009852510411292315, |
| "learning_rate": 3.3215427398681645e-05, |
| "lookahead_loss": 6.444905442237854, |
| "loss": 0.323, |
| "step": 176000 |
| }, |
| { |
| "base_loss": 0.30398501074314116, |
| "epoch": 2.026702880859375, |
| "grad_norm": 0.000977648189291358, |
| "learning_rate": 3.316774368286133e-05, |
| "lookahead_loss": 6.460987593650818, |
| "loss": 0.3133, |
| "step": 176500 |
| }, |
| { |
| "base_loss": 0.3081837382018566, |
| "epoch": 2.0276565551757812, |
| "grad_norm": 0.000948708038777113, |
| "learning_rate": 3.312005996704102e-05, |
| "lookahead_loss": 6.576977911949157, |
| "loss": 0.3183, |
| "step": 177000 |
| }, |
| { |
| "base_loss": 0.32895678067207335, |
| "epoch": 2.0286102294921875, |
| "grad_norm": 0.0009632044821046293, |
| "learning_rate": 3.30723762512207e-05, |
| "lookahead_loss": 6.5979501276016235, |
| "loss": 0.3403, |
| "step": 177500 |
| }, |
| { |
| "base_loss": 0.30588172587752344, |
| "epoch": 2.0295639038085938, |
| "grad_norm": 0.0009475542465224862, |
| "learning_rate": 3.302469253540039e-05, |
| "lookahead_loss": 6.530917593955993, |
| "loss": 0.3147, |
| "step": 178000 |
| }, |
| { |
| "base_loss": 0.3051903445720673, |
| "epoch": 2.030517578125, |
| "grad_norm": 0.0009406576864421368, |
| "learning_rate": 3.297700881958008e-05, |
| "lookahead_loss": 6.51764566040039, |
| "loss": 0.3167, |
| "step": 178500 |
| }, |
| { |
| "base_loss": 0.30346439191699026, |
| "epoch": 2.0314712524414062, |
| "grad_norm": 0.0009703211835585535, |
| "learning_rate": 3.2929325103759766e-05, |
| "lookahead_loss": 6.543263789176941, |
| "loss": 0.3155, |
| "step": 179000 |
| }, |
| { |
| "base_loss": 0.31795056411623956, |
| "epoch": 2.0324249267578125, |
| "grad_norm": 0.0009707180433906615, |
| "learning_rate": 3.2881641387939456e-05, |
| "lookahead_loss": 6.475300903320313, |
| "loss": 0.3348, |
| "step": 179500 |
| }, |
| { |
| "base_loss": 0.30795893451571466, |
| "epoch": 2.0333786010742188, |
| "grad_norm": 0.00098248606082052, |
| "learning_rate": 3.283395767211914e-05, |
| "lookahead_loss": 6.536730496883393, |
| "loss": 0.3178, |
| "step": 180000 |
| }, |
| { |
| "epoch": 2.0333786010742188, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.531855725632689, |
| "eval_lookahead_perplexity": 686.6713034107797, |
| "eval_loss": 0.1421896070241928, |
| "eval_perplexity": 1.1527952058848376, |
| "eval_runtime": 483.5703, |
| "eval_samples_per_second": 10.34, |
| "eval_steps_per_second": 0.325, |
| "step": 180000 |
| }, |
| { |
| "base_loss": 0.3031257001161575, |
| "epoch": 2.034332275390625, |
| "grad_norm": 0.0009016969706863165, |
| "learning_rate": 3.278627395629883e-05, |
| "lookahead_loss": 6.603059131622315, |
| "loss": 0.3153, |
| "step": 180500 |
| }, |
| { |
| "base_loss": 0.31068781118094924, |
| "epoch": 2.0352859497070312, |
| "grad_norm": 0.0010222060373052955, |
| "learning_rate": 3.273859024047852e-05, |
| "lookahead_loss": 6.439461089611053, |
| "loss": 0.3228, |
| "step": 181000 |
| }, |
| { |
| "base_loss": 0.32500979214906695, |
| "epoch": 2.0362396240234375, |
| "grad_norm": 0.0009705196134746075, |
| "learning_rate": 3.26909065246582e-05, |
| "lookahead_loss": 6.5541965799331665, |
| "loss": 0.337, |
| "step": 181500 |
| }, |
| { |
| "base_loss": 0.3069631262719631, |
| "epoch": 2.0371932983398438, |
| "grad_norm": 0.0010073435259982944, |
| "learning_rate": 3.2643222808837893e-05, |
| "lookahead_loss": 6.471248873710632, |
| "loss": 0.3175, |
| "step": 182000 |
| }, |
| { |
| "base_loss": 0.3025422422587872, |
| "epoch": 2.03814697265625, |
| "grad_norm": 0.0009568389505147934, |
| "learning_rate": 3.259553909301758e-05, |
| "lookahead_loss": 6.524754017829895, |
| "loss": 0.3133, |
| "step": 182500 |
| }, |
| { |
| "base_loss": 0.3076345331072807, |
| "epoch": 2.0391006469726562, |
| "grad_norm": 0.0009688584832474589, |
| "learning_rate": 3.254785537719727e-05, |
| "lookahead_loss": 6.467104331970215, |
| "loss": 0.3189, |
| "step": 183000 |
| }, |
| { |
| "base_loss": 0.3235399980545044, |
| "epoch": 2.0400543212890625, |
| "grad_norm": 0.0009437952539883554, |
| "learning_rate": 3.250017166137696e-05, |
| "lookahead_loss": 6.506529898166656, |
| "loss": 0.3336, |
| "step": 183500 |
| }, |
| { |
| "base_loss": 0.30506757298111914, |
| "epoch": 2.0410079956054688, |
| "grad_norm": 0.0009767162846401334, |
| "learning_rate": 3.245248794555664e-05, |
| "lookahead_loss": 6.448292857646942, |
| "loss": 0.3142, |
| "step": 184000 |
| }, |
| { |
| "base_loss": 0.29668092691898346, |
| "epoch": 2.041961669921875, |
| "grad_norm": 0.0009727113647386432, |
| "learning_rate": 3.240480422973633e-05, |
| "lookahead_loss": 6.536737553119659, |
| "loss": 0.3083, |
| "step": 184500 |
| }, |
| { |
| "base_loss": 0.30789948108792303, |
| "epoch": 2.0429153442382812, |
| "grad_norm": 0.000962116289883852, |
| "learning_rate": 3.2357120513916014e-05, |
| "lookahead_loss": 6.536309094429016, |
| "loss": 0.324, |
| "step": 185000 |
| }, |
| { |
| "epoch": 2.0429153442382812, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.522111146213909, |
| "eval_lookahead_perplexity": 680.0124767842166, |
| "eval_loss": 0.14217031002044678, |
| "eval_perplexity": 1.1527729606060655, |
| "eval_runtime": 471.9889, |
| "eval_samples_per_second": 10.593, |
| "eval_steps_per_second": 0.333, |
| "step": 185000 |
| }, |
| { |
| "base_loss": 0.3281280441880226, |
| "epoch": 2.0438690185546875, |
| "grad_norm": 0.0009924211772158742, |
| "learning_rate": 3.2309436798095705e-05, |
| "lookahead_loss": 6.569694968223572, |
| "loss": 0.3437, |
| "step": 185500 |
| }, |
| { |
| "base_loss": 0.2978555924296379, |
| "epoch": 2.0448226928710938, |
| "grad_norm": 0.0010296371765434742, |
| "learning_rate": 3.2261753082275395e-05, |
| "lookahead_loss": 6.4846109199523925, |
| "loss": 0.3105, |
| "step": 186000 |
| }, |
| { |
| "base_loss": 0.3044668311774731, |
| "epoch": 2.0457763671875, |
| "grad_norm": 0.0009649458806961775, |
| "learning_rate": 3.221406936645508e-05, |
| "lookahead_loss": 6.508920118331909, |
| "loss": 0.3174, |
| "step": 186500 |
| }, |
| { |
| "base_loss": 0.3298782432973385, |
| "epoch": 2.0467300415039062, |
| "grad_norm": 0.0009266676497645676, |
| "learning_rate": 3.216638565063477e-05, |
| "lookahead_loss": 6.470085307598114, |
| "loss": 0.3401, |
| "step": 187000 |
| }, |
| { |
| "base_loss": 0.32442897310853, |
| "epoch": 2.0476837158203125, |
| "grad_norm": 0.0009991949191316962, |
| "learning_rate": 3.211870193481445e-05, |
| "lookahead_loss": 6.5127511582374575, |
| "loss": 0.3385, |
| "step": 187500 |
| }, |
| { |
| "base_loss": 0.2941350122392178, |
| "epoch": 2.0486373901367188, |
| "grad_norm": 0.0009228453855030239, |
| "learning_rate": 3.207101821899414e-05, |
| "lookahead_loss": 6.472175216674804, |
| "loss": 0.3078, |
| "step": 188000 |
| }, |
| { |
| "base_loss": 0.301623804807663, |
| "epoch": 2.049591064453125, |
| "grad_norm": 0.0009880108991637826, |
| "learning_rate": 3.202333450317383e-05, |
| "lookahead_loss": 6.448336009979248, |
| "loss": 0.3146, |
| "step": 188500 |
| }, |
| { |
| "base_loss": 0.31965578559041025, |
| "epoch": 2.0505447387695312, |
| "grad_norm": 0.0009035322000272572, |
| "learning_rate": 3.1975650787353516e-05, |
| "lookahead_loss": 6.551394259452819, |
| "loss": 0.3321, |
| "step": 189000 |
| }, |
| { |
| "base_loss": 0.30511142282187936, |
| "epoch": 2.0514984130859375, |
| "grad_norm": 0.0009879703866317868, |
| "learning_rate": 3.1927967071533206e-05, |
| "lookahead_loss": 6.507521020889282, |
| "loss": 0.3188, |
| "step": 189500 |
| }, |
| { |
| "base_loss": 0.3033564644157887, |
| "epoch": 2.0524520874023438, |
| "grad_norm": 0.0010369070805609226, |
| "learning_rate": 3.188028335571289e-05, |
| "lookahead_loss": 6.442676889419555, |
| "loss": 0.316, |
| "step": 190000 |
| }, |
| { |
| "epoch": 2.0524520874023438, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.5135073006724395, |
| "eval_lookahead_perplexity": 674.1868517938349, |
| "eval_loss": 0.14215251803398132, |
| "eval_perplexity": 1.1527524506676095, |
| "eval_runtime": 483.1418, |
| "eval_samples_per_second": 10.349, |
| "eval_steps_per_second": 0.325, |
| "step": 190000 |
| }, |
| { |
| "base_loss": 0.32089028322696683, |
| "epoch": 2.05340576171875, |
| "grad_norm": 0.0009397159446962178, |
| "learning_rate": 3.183259963989258e-05, |
| "lookahead_loss": 6.455264377593994, |
| "loss": 0.3301, |
| "step": 190500 |
| }, |
| { |
| "base_loss": 0.35406574749946595, |
| "epoch": 2.0543594360351562, |
| "grad_norm": 0.0009735460043884814, |
| "learning_rate": 3.178491592407227e-05, |
| "lookahead_loss": 6.496513916969299, |
| "loss": 0.3693, |
| "step": 191000 |
| }, |
| { |
| "base_loss": 0.2938829956352711, |
| "epoch": 2.0553131103515625, |
| "grad_norm": 0.0009796855738386512, |
| "learning_rate": 3.173723220825195e-05, |
| "lookahead_loss": 6.495951771259308, |
| "loss": 0.3064, |
| "step": 191500 |
| }, |
| { |
| "base_loss": 0.30498689064383505, |
| "epoch": 2.0562667846679688, |
| "grad_norm": 0.000926612876355648, |
| "learning_rate": 3.1689548492431643e-05, |
| "lookahead_loss": 6.501594274520874, |
| "loss": 0.3175, |
| "step": 192000 |
| }, |
| { |
| "base_loss": 0.317481600522995, |
| "epoch": 2.057220458984375, |
| "grad_norm": 0.0009604953811503947, |
| "learning_rate": 3.164186477661133e-05, |
| "lookahead_loss": 6.519902579307556, |
| "loss": 0.3311, |
| "step": 192500 |
| }, |
| { |
| "base_loss": 0.3179551683664322, |
| "epoch": 2.0581741333007812, |
| "grad_norm": 0.0009685103432275355, |
| "learning_rate": 3.159418106079102e-05, |
| "lookahead_loss": 6.514713489532471, |
| "loss": 0.3293, |
| "step": 193000 |
| }, |
| { |
| "base_loss": 0.29271650505065916, |
| "epoch": 2.0591278076171875, |
| "grad_norm": 0.0009422221919521689, |
| "learning_rate": 3.154649734497071e-05, |
| "lookahead_loss": 6.422006649971008, |
| "loss": 0.3067, |
| "step": 193500 |
| }, |
| { |
| "base_loss": 0.3039356949329376, |
| "epoch": 2.0600814819335938, |
| "grad_norm": 0.0009756973595358431, |
| "learning_rate": 3.149881362915039e-05, |
| "lookahead_loss": 6.483645843505859, |
| "loss": 0.3183, |
| "step": 194000 |
| }, |
| { |
| "base_loss": 0.32165152502059935, |
| "epoch": 2.06103515625, |
| "grad_norm": 0.0009674925822764635, |
| "learning_rate": 3.145112991333008e-05, |
| "lookahead_loss": 6.4522641057968135, |
| "loss": 0.3326, |
| "step": 194500 |
| }, |
| { |
| "base_loss": 0.3061283130943775, |
| "epoch": 2.0619888305664062, |
| "grad_norm": 0.0010340444277971983, |
| "learning_rate": 3.1403446197509764e-05, |
| "lookahead_loss": 6.467900278091431, |
| "loss": 0.3165, |
| "step": 195000 |
| }, |
| { |
| "epoch": 2.0619888305664062, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.5032055431280655, |
| "eval_lookahead_perplexity": 667.2771942234244, |
| "eval_loss": 0.1421336829662323, |
| "eval_perplexity": 1.152730738701577, |
| "eval_runtime": 473.7725, |
| "eval_samples_per_second": 10.554, |
| "eval_steps_per_second": 0.331, |
| "step": 195000 |
| }, |
| { |
| "base_loss": 0.30640321379899976, |
| "epoch": 2.0629425048828125, |
| "grad_norm": 0.0009878401178866625, |
| "learning_rate": 3.1355762481689455e-05, |
| "lookahead_loss": 6.44850652551651, |
| "loss": 0.3171, |
| "step": 195500 |
| }, |
| { |
| "base_loss": 0.31782649287581444, |
| "epoch": 2.0638961791992188, |
| "grad_norm": 0.0009471693192608654, |
| "learning_rate": 3.1308078765869145e-05, |
| "lookahead_loss": 6.486370619773865, |
| "loss": 0.3303, |
| "step": 196000 |
| }, |
| { |
| "base_loss": 0.30349985790252687, |
| "epoch": 2.064849853515625, |
| "grad_norm": 0.0009686322882771492, |
| "learning_rate": 3.126039505004883e-05, |
| "lookahead_loss": 6.50835819530487, |
| "loss": 0.3185, |
| "step": 196500 |
| }, |
| { |
| "base_loss": 0.3075440634191036, |
| "epoch": 2.0658035278320312, |
| "grad_norm": 0.0009692518506199121, |
| "learning_rate": 3.121271133422852e-05, |
| "lookahead_loss": 6.4442818622589115, |
| "loss": 0.3204, |
| "step": 197000 |
| }, |
| { |
| "base_loss": 0.3064390652179718, |
| "epoch": 2.0667572021484375, |
| "grad_norm": 0.0009859678102657199, |
| "learning_rate": 3.11650276184082e-05, |
| "lookahead_loss": 6.408861030578613, |
| "loss": 0.3165, |
| "step": 197500 |
| }, |
| { |
| "base_loss": 0.3303199237883091, |
| "epoch": 2.0677108764648438, |
| "grad_norm": 0.0010187255684286356, |
| "learning_rate": 3.111734390258789e-05, |
| "lookahead_loss": 6.487036975860596, |
| "loss": 0.3406, |
| "step": 198000 |
| }, |
| { |
| "base_loss": 0.2994670196175575, |
| "epoch": 2.06866455078125, |
| "grad_norm": 0.0010079372441396117, |
| "learning_rate": 3.106966018676758e-05, |
| "lookahead_loss": 6.467383036613464, |
| "loss": 0.3102, |
| "step": 198500 |
| }, |
| { |
| "base_loss": 0.3000359579175711, |
| "epoch": 2.0696182250976562, |
| "grad_norm": 0.0009658647468313575, |
| "learning_rate": 3.1021976470947266e-05, |
| "lookahead_loss": 6.49391339302063, |
| "loss": 0.3156, |
| "step": 199000 |
| }, |
| { |
| "base_loss": 0.34639680609107015, |
| "epoch": 2.0705718994140625, |
| "grad_norm": 0.0009569233516231179, |
| "learning_rate": 3.0974292755126956e-05, |
| "lookahead_loss": 6.382178443908692, |
| "loss": 0.3587, |
| "step": 199500 |
| }, |
| { |
| "base_loss": 0.3132462115287781, |
| "epoch": 2.0715255737304688, |
| "grad_norm": 0.0009783732239156961, |
| "learning_rate": 3.092660903930664e-05, |
| "lookahead_loss": 6.4411235184669495, |
| "loss": 0.3241, |
| "step": 200000 |
| }, |
| { |
| "epoch": 2.0715255737304688, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.12980225879829912, |
| "eval_base_perplexity": 1.1386032122951009, |
| "eval_lookahead_loss": 6.493639263482139, |
| "eval_lookahead_perplexity": 660.9242693582831, |
| "eval_loss": 0.1421152651309967, |
| "eval_perplexity": 1.1527095080922722, |
| "eval_runtime": 497.9192, |
| "eval_samples_per_second": 10.042, |
| "eval_steps_per_second": 0.315, |
| "step": 200000 |
| }, |
| { |
| "base_loss": 0.30655504322052, |
| "epoch": 1.0009536743164062, |
| "grad_norm": 0.0009628058760426939, |
| "learning_rate": 3.087892532348633e-05, |
| "lookahead_loss": 6.568298415660858, |
| "loss": 0.3144, |
| "step": 200500 |
| }, |
| { |
| "base_loss": 0.3002312153875828, |
| "epoch": 1.0019073486328125, |
| "grad_norm": 0.0010066054528579116, |
| "learning_rate": 3.083124160766602e-05, |
| "lookahead_loss": 6.406751696586609, |
| "loss": 0.3132, |
| "step": 201000 |
| }, |
| { |
| "base_loss": 0.312505132496357, |
| "epoch": 1.0028610229492188, |
| "grad_norm": 0.0009853820083662868, |
| "learning_rate": 3.07835578918457e-05, |
| "lookahead_loss": 6.39734969997406, |
| "loss": 0.3223, |
| "step": 201500 |
| }, |
| { |
| "base_loss": 0.3240452491641045, |
| "epoch": 1.003814697265625, |
| "grad_norm": 0.000949465436860919, |
| "learning_rate": 3.0735874176025393e-05, |
| "lookahead_loss": 6.42671659564972, |
| "loss": 0.3363, |
| "step": 202000 |
| }, |
| { |
| "base_loss": 0.29858038023114203, |
| "epoch": 1.0047683715820312, |
| "grad_norm": 0.0009409029153175652, |
| "learning_rate": 3.068819046020508e-05, |
| "lookahead_loss": 6.4190877294540405, |
| "loss": 0.3137, |
| "step": 202500 |
| }, |
| { |
| "base_loss": 0.3042404046058655, |
| "epoch": 1.0057220458984375, |
| "grad_norm": 0.0008439480443485081, |
| "learning_rate": 3.064050674438477e-05, |
| "lookahead_loss": 6.542135063171386, |
| "loss": 0.3132, |
| "step": 203000 |
| }, |
| { |
| "base_loss": 0.29714440524578095, |
| "epoch": 1.0066757202148438, |
| "grad_norm": 0.0009408199694007635, |
| "learning_rate": 3.059282302856446e-05, |
| "lookahead_loss": 6.393728638648986, |
| "loss": 0.3127, |
| "step": 203500 |
| }, |
| { |
| "base_loss": 0.31379624953866003, |
| "epoch": 1.00762939453125, |
| "grad_norm": 0.0009831907227635384, |
| "learning_rate": 3.054513931274414e-05, |
| "lookahead_loss": 6.455501090049744, |
| "loss": 0.3247, |
| "step": 204000 |
| }, |
| { |
| "base_loss": 0.31622857597470283, |
| "epoch": 1.0085830688476562, |
| "grad_norm": 0.0009250900475308299, |
| "learning_rate": 3.049745559692383e-05, |
| "lookahead_loss": 6.439110320091247, |
| "loss": 0.3233, |
| "step": 204500 |
| }, |
| { |
| "base_loss": 0.3033116071224213, |
| "epoch": 1.0095367431640625, |
| "grad_norm": 0.0009871090296655893, |
| "learning_rate": 3.0449771881103518e-05, |
| "lookahead_loss": 6.460744188308716, |
| "loss": 0.3166, |
| "step": 205000 |
| }, |
| { |
| "epoch": 1.0095367431640625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.484790183484745, |
| "eval_lookahead_perplexity": 655.1014987047179, |
| "eval_loss": 0.1420980542898178, |
| "eval_perplexity": 1.152689669162726, |
| "eval_runtime": 259.918, |
| "eval_samples_per_second": 19.237, |
| "eval_steps_per_second": 0.604, |
| "step": 205000 |
| }, |
| { |
| "base_loss": 0.302242584168911, |
| "epoch": 1.0104904174804688, |
| "grad_norm": 0.0009698246722109616, |
| "learning_rate": 3.0402088165283205e-05, |
| "lookahead_loss": 6.423384314537048, |
| "loss": 0.3121, |
| "step": 205500 |
| }, |
| { |
| "base_loss": 0.3031807193160057, |
| "epoch": 1.011444091796875, |
| "grad_norm": 0.0009783682180568576, |
| "learning_rate": 3.035440444946289e-05, |
| "lookahead_loss": 6.434307872772217, |
| "loss": 0.3166, |
| "step": 206000 |
| }, |
| { |
| "base_loss": 0.324542246311903, |
| "epoch": 1.0123977661132812, |
| "grad_norm": 0.0008743834332562983, |
| "learning_rate": 3.0306720733642578e-05, |
| "lookahead_loss": 6.431088864326477, |
| "loss": 0.3347, |
| "step": 206500 |
| }, |
| { |
| "base_loss": 0.3043093577325344, |
| "epoch": 1.0133514404296875, |
| "grad_norm": 0.0009393716463819146, |
| "learning_rate": 3.025903701782227e-05, |
| "lookahead_loss": 6.5347442026138305, |
| "loss": 0.3199, |
| "step": 207000 |
| }, |
| { |
| "base_loss": 0.29890961676836014, |
| "epoch": 1.0143051147460938, |
| "grad_norm": 0.000908377580344677, |
| "learning_rate": 3.0211353302001955e-05, |
| "lookahead_loss": 6.474946077346802, |
| "loss": 0.3143, |
| "step": 207500 |
| }, |
| { |
| "base_loss": 0.2968312213420868, |
| "epoch": 1.0152587890625, |
| "grad_norm": 0.0009118029265664518, |
| "learning_rate": 3.0163669586181642e-05, |
| "lookahead_loss": 6.427557950973511, |
| "loss": 0.3081, |
| "step": 208000 |
| }, |
| { |
| "base_loss": 0.309987826526165, |
| "epoch": 1.0162124633789062, |
| "grad_norm": 0.0009858175180852413, |
| "learning_rate": 3.011598587036133e-05, |
| "lookahead_loss": 6.471011517524719, |
| "loss": 0.3222, |
| "step": 208500 |
| }, |
| { |
| "base_loss": 0.3124798896312714, |
| "epoch": 1.0171661376953125, |
| "grad_norm": 0.0009066257625818253, |
| "learning_rate": 3.0068302154541016e-05, |
| "lookahead_loss": 6.498817705154419, |
| "loss": 0.3235, |
| "step": 209000 |
| }, |
| { |
| "base_loss": 0.30385399025678633, |
| "epoch": 1.0181198120117188, |
| "grad_norm": 0.0009525167988613248, |
| "learning_rate": 3.0020618438720706e-05, |
| "lookahead_loss": 6.509060912132263, |
| "loss": 0.313, |
| "step": 209500 |
| }, |
| { |
| "base_loss": 0.2997076933085918, |
| "epoch": 1.019073486328125, |
| "grad_norm": 0.0009451503865420818, |
| "learning_rate": 2.9972934722900393e-05, |
| "lookahead_loss": 6.532156089782715, |
| "loss": 0.3104, |
| "step": 210000 |
| }, |
| { |
| "epoch": 1.019073486328125, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.475708332305518, |
| "eval_lookahead_perplexity": 649.1788991778967, |
| "eval_loss": 0.14207972586154938, |
| "eval_perplexity": 1.15266854236642, |
| "eval_runtime": 295.0132, |
| "eval_samples_per_second": 16.948, |
| "eval_steps_per_second": 0.532, |
| "step": 210000 |
| }, |
| { |
| "base_loss": 0.30210260692238805, |
| "epoch": 1.0200271606445312, |
| "grad_norm": 0.0010357605060562491, |
| "learning_rate": 2.992525100708008e-05, |
| "lookahead_loss": 6.357669311523438, |
| "loss": 0.315, |
| "step": 210500 |
| }, |
| { |
| "base_loss": 0.3285051781535149, |
| "epoch": 1.0209808349609375, |
| "grad_norm": 0.0009752861224114895, |
| "learning_rate": 2.9877567291259766e-05, |
| "lookahead_loss": 6.451837629318237, |
| "loss": 0.3386, |
| "step": 211000 |
| }, |
| { |
| "base_loss": 0.30326452678442, |
| "epoch": 1.0219345092773438, |
| "grad_norm": 0.0009549338137730956, |
| "learning_rate": 2.9829883575439453e-05, |
| "lookahead_loss": 6.413721528530121, |
| "loss": 0.3141, |
| "step": 211500 |
| }, |
| { |
| "base_loss": 0.29889601907134056, |
| "epoch": 1.02288818359375, |
| "grad_norm": 0.0009833979420363903, |
| "learning_rate": 2.9782199859619143e-05, |
| "lookahead_loss": 6.465893486022949, |
| "loss": 0.3118, |
| "step": 212000 |
| }, |
| { |
| "base_loss": 0.3006108500063419, |
| "epoch": 1.0238418579101562, |
| "grad_norm": 0.0009762793779373169, |
| "learning_rate": 2.973451614379883e-05, |
| "lookahead_loss": 6.41252710723877, |
| "loss": 0.3126, |
| "step": 212500 |
| }, |
| { |
| "base_loss": 0.3237688979506493, |
| "epoch": 1.0247955322265625, |
| "grad_norm": 0.000890803465154022, |
| "learning_rate": 2.9686832427978517e-05, |
| "lookahead_loss": 6.42967294883728, |
| "loss": 0.3356, |
| "step": 213000 |
| }, |
| { |
| "base_loss": 0.3078545735180378, |
| "epoch": 1.0257492065429688, |
| "grad_norm": 0.0009638071060180664, |
| "learning_rate": 2.9639148712158204e-05, |
| "lookahead_loss": 6.387160004615784, |
| "loss": 0.3221, |
| "step": 213500 |
| }, |
| { |
| "base_loss": 0.3022345977425575, |
| "epoch": 1.026702880859375, |
| "grad_norm": 0.0010054127778857946, |
| "learning_rate": 2.959146499633789e-05, |
| "lookahead_loss": 6.398777591705322, |
| "loss": 0.3111, |
| "step": 214000 |
| }, |
| { |
| "base_loss": 0.3071480156183243, |
| "epoch": 1.0276565551757812, |
| "grad_norm": 0.000952261732891202, |
| "learning_rate": 2.954378128051758e-05, |
| "lookahead_loss": 6.514560857772827, |
| "loss": 0.3187, |
| "step": 214500 |
| }, |
| { |
| "base_loss": 0.3302598208785057, |
| "epoch": 1.0286102294921875, |
| "grad_norm": 0.0009398755501024425, |
| "learning_rate": 2.9496097564697268e-05, |
| "lookahead_loss": 6.534868264198304, |
| "loss": 0.3416, |
| "step": 215000 |
| }, |
| { |
| "epoch": 1.0286102294921875, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.467427203449578, |
| "eval_lookahead_perplexity": 643.8251631474817, |
| "eval_loss": 0.14206360280513763, |
| "eval_perplexity": 1.1526499579763063, |
| "eval_runtime": 274.565, |
| "eval_samples_per_second": 18.211, |
| "eval_steps_per_second": 0.572, |
| "step": 215000 |
| }, |
| { |
| "base_loss": 0.3027013133764267, |
| "epoch": 1.0295639038085938, |
| "grad_norm": 0.0009348155581392348, |
| "learning_rate": 2.9448413848876955e-05, |
| "lookahead_loss": 6.474138929367065, |
| "loss": 0.312, |
| "step": 215500 |
| }, |
| { |
| "base_loss": 0.3046494301855564, |
| "epoch": 1.030517578125, |
| "grad_norm": 0.0009527892689220607, |
| "learning_rate": 2.940073013305664e-05, |
| "lookahead_loss": 6.449120025157929, |
| "loss": 0.316, |
| "step": 216000 |
| }, |
| { |
| "base_loss": 0.3023626366853714, |
| "epoch": 1.0314712524414062, |
| "grad_norm": 0.0009392331703566015, |
| "learning_rate": 2.9353046417236328e-05, |
| "lookahead_loss": 6.4767416534423825, |
| "loss": 0.3147, |
| "step": 216500 |
| }, |
| { |
| "base_loss": 0.3171934984624386, |
| "epoch": 1.0324249267578125, |
| "grad_norm": 0.000982875470072031, |
| "learning_rate": 2.930536270141602e-05, |
| "lookahead_loss": 6.436639490127564, |
| "loss": 0.3346, |
| "step": 217000 |
| }, |
| { |
| "base_loss": 0.305971223294735, |
| "epoch": 1.0333786010742188, |
| "grad_norm": 0.0009957151487469673, |
| "learning_rate": 2.9257678985595705e-05, |
| "lookahead_loss": 6.465258483886719, |
| "loss": 0.3153, |
| "step": 217500 |
| }, |
| { |
| "base_loss": 0.3008191674053669, |
| "epoch": 1.034332275390625, |
| "grad_norm": 0.000894519907888025, |
| "learning_rate": 2.9209995269775392e-05, |
| "lookahead_loss": 6.542671841621399, |
| "loss": 0.3137, |
| "step": 218000 |
| }, |
| { |
| "base_loss": 0.3125488177835941, |
| "epoch": 1.0352859497070312, |
| "grad_norm": 0.001007439219392836, |
| "learning_rate": 2.916231155395508e-05, |
| "lookahead_loss": 6.36938267993927, |
| "loss": 0.3225, |
| "step": 218500 |
| }, |
| { |
| "base_loss": 0.32382212686538697, |
| "epoch": 1.0362396240234375, |
| "grad_norm": 0.000986156752333045, |
| "learning_rate": 2.9114627838134766e-05, |
| "lookahead_loss": 6.487519608497619, |
| "loss": 0.3375, |
| "step": 219000 |
| }, |
| { |
| "base_loss": 0.30577521124482154, |
| "epoch": 1.0371932983398438, |
| "grad_norm": 0.001012337044812739, |
| "learning_rate": 2.9066944122314456e-05, |
| "lookahead_loss": 6.423724625587464, |
| "loss": 0.3175, |
| "step": 219500 |
| }, |
| { |
| "base_loss": 0.3027714610397816, |
| "epoch": 1.03814697265625, |
| "grad_norm": 0.0009804905857890844, |
| "learning_rate": 2.9019260406494143e-05, |
| "lookahead_loss": 6.465024924278259, |
| "loss": 0.3152, |
| "step": 220000 |
| }, |
| { |
| "epoch": 1.03814697265625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.459714018117887, |
| "eval_lookahead_perplexity": 638.8783228163264, |
| "eval_loss": 0.14204907417297363, |
| "eval_perplexity": 1.1526332116707039, |
| "eval_runtime": 342.2757, |
| "eval_samples_per_second": 14.608, |
| "eval_steps_per_second": 0.459, |
| "step": 220000 |
| }, |
| { |
| "base_loss": 0.3064252578020096, |
| "epoch": 1.0391006469726562, |
| "grad_norm": 0.000922691891901195, |
| "learning_rate": 2.897157669067383e-05, |
| "lookahead_loss": 6.412614955902099, |
| "loss": 0.3197, |
| "step": 220500 |
| }, |
| { |
| "base_loss": 0.3251348150372505, |
| "epoch": 1.0400543212890625, |
| "grad_norm": 0.0009598666802048683, |
| "learning_rate": 2.8923892974853516e-05, |
| "lookahead_loss": 6.448003714561462, |
| "loss": 0.3346, |
| "step": 221000 |
| }, |
| { |
| "base_loss": 0.3045478595495224, |
| "epoch": 1.0410079956054688, |
| "grad_norm": 0.0009646528051234782, |
| "learning_rate": 2.8876209259033203e-05, |
| "lookahead_loss": 6.387988406181336, |
| "loss": 0.3154, |
| "step": 221500 |
| }, |
| { |
| "base_loss": 0.2982518375813961, |
| "epoch": 1.041961669921875, |
| "grad_norm": 0.0009869185741990805, |
| "learning_rate": 2.8828525543212893e-05, |
| "lookahead_loss": 6.465492009162903, |
| "loss": 0.31, |
| "step": 222000 |
| }, |
| { |
| "base_loss": 0.3089935587644577, |
| "epoch": 1.0429153442382812, |
| "grad_norm": 0.0009668731945566833, |
| "learning_rate": 2.878084182739258e-05, |
| "lookahead_loss": 6.474124125003815, |
| "loss": 0.3243, |
| "step": 222500 |
| }, |
| { |
| "base_loss": 0.3268603746891022, |
| "epoch": 1.0438690185546875, |
| "grad_norm": 0.0009782308479771018, |
| "learning_rate": 2.8733158111572267e-05, |
| "lookahead_loss": 6.512031971931457, |
| "loss": 0.341, |
| "step": 223000 |
| }, |
| { |
| "base_loss": 0.29676153120398524, |
| "epoch": 1.0448226928710938, |
| "grad_norm": 0.0009767008014023304, |
| "learning_rate": 2.8685474395751954e-05, |
| "lookahead_loss": 6.4184129590988155, |
| "loss": 0.3094, |
| "step": 223500 |
| }, |
| { |
| "base_loss": 0.3044439141750336, |
| "epoch": 1.0457763671875, |
| "grad_norm": 0.0009669915889389813, |
| "learning_rate": 2.863779067993164e-05, |
| "lookahead_loss": 6.440961833953858, |
| "loss": 0.3164, |
| "step": 224000 |
| }, |
| { |
| "base_loss": 0.3313070158064365, |
| "epoch": 1.0467300415039062, |
| "grad_norm": 0.0009441322763450444, |
| "learning_rate": 2.859010696411133e-05, |
| "lookahead_loss": 6.40649371099472, |
| "loss": 0.3396, |
| "step": 224500 |
| }, |
| { |
| "base_loss": 0.32587327966094015, |
| "epoch": 1.0476837158203125, |
| "grad_norm": 0.0010059355990961194, |
| "learning_rate": 2.8542423248291018e-05, |
| "lookahead_loss": 6.4436911821365355, |
| "loss": 0.3391, |
| "step": 225000 |
| }, |
| { |
| "epoch": 1.0476837158203125, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.452761023189313, |
| "eval_lookahead_perplexity": 634.4516123592657, |
| "eval_loss": 0.14203442633152008, |
| "eval_perplexity": 1.1526163282058186, |
| "eval_runtime": 279.8888, |
| "eval_samples_per_second": 17.864, |
| "eval_steps_per_second": 0.561, |
| "step": 225000 |
| }, |
| { |
| "base_loss": 0.29468956208229063, |
| "epoch": 1.0486373901367188, |
| "grad_norm": 0.0009452200611121953, |
| "learning_rate": 2.8494739532470705e-05, |
| "lookahead_loss": 6.411119204521179, |
| "loss": 0.308, |
| "step": 225500 |
| }, |
| { |
| "base_loss": 0.3027429393827915, |
| "epoch": 1.049591064453125, |
| "grad_norm": 0.000997032504528761, |
| "learning_rate": 2.844705581665039e-05, |
| "lookahead_loss": 6.39424205160141, |
| "loss": 0.3165, |
| "step": 226000 |
| }, |
| { |
| "base_loss": 0.3190444597601891, |
| "epoch": 1.0505447387695312, |
| "grad_norm": 0.000901962979696691, |
| "learning_rate": 2.8399372100830078e-05, |
| "lookahead_loss": 6.48411437702179, |
| "loss": 0.3334, |
| "step": 226500 |
| }, |
| { |
| "base_loss": 0.3043918348252773, |
| "epoch": 1.0514984130859375, |
| "grad_norm": 0.0009930033702403307, |
| "learning_rate": 2.835168838500977e-05, |
| "lookahead_loss": 6.4403407697677615, |
| "loss": 0.3181, |
| "step": 227000 |
| }, |
| { |
| "base_loss": 0.3046840020418167, |
| "epoch": 1.0524520874023438, |
| "grad_norm": 0.0010451226262375712, |
| "learning_rate": 2.8304004669189455e-05, |
| "lookahead_loss": 6.379611058235168, |
| "loss": 0.3174, |
| "step": 227500 |
| }, |
| { |
| "base_loss": 0.3202188531160355, |
| "epoch": 1.05340576171875, |
| "grad_norm": 0.0009132448467426002, |
| "learning_rate": 2.8256320953369142e-05, |
| "lookahead_loss": 6.3884695830345155, |
| "loss": 0.3302, |
| "step": 228000 |
| }, |
| { |
| "base_loss": 0.3542410895228386, |
| "epoch": 1.0543594360351562, |
| "grad_norm": 0.0009372152271680534, |
| "learning_rate": 2.820863723754883e-05, |
| "lookahead_loss": 6.432561398506165, |
| "loss": 0.369, |
| "step": 228500 |
| }, |
| { |
| "base_loss": 0.2943912135362625, |
| "epoch": 1.0553131103515625, |
| "grad_norm": 0.0009705543052405119, |
| "learning_rate": 2.8160953521728516e-05, |
| "lookahead_loss": 6.44163135099411, |
| "loss": 0.3085, |
| "step": 229000 |
| }, |
| { |
| "base_loss": 0.30392896428704264, |
| "epoch": 1.0562667846679688, |
| "grad_norm": 0.0009304340346716344, |
| "learning_rate": 2.8113269805908206e-05, |
| "lookahead_loss": 6.4533995332717895, |
| "loss": 0.317, |
| "step": 229500 |
| }, |
| { |
| "base_loss": 0.3181495431959629, |
| "epoch": 1.057220458984375, |
| "grad_norm": 0.0009568824316374958, |
| "learning_rate": 2.8065586090087893e-05, |
| "lookahead_loss": 6.471045615673066, |
| "loss": 0.3319, |
| "step": 230000 |
| }, |
| { |
| "epoch": 1.057220458984375, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.445454614231, |
| "eval_lookahead_perplexity": 629.832942905796, |
| "eval_loss": 0.14202013611793518, |
| "eval_perplexity": 1.1525998571899945, |
| "eval_runtime": 305.3757, |
| "eval_samples_per_second": 16.373, |
| "eval_steps_per_second": 0.514, |
| "step": 230000 |
| }, |
| { |
| "base_loss": 0.3180287193655968, |
| "epoch": 1.0581741333007812, |
| "grad_norm": 0.000988287152722478, |
| "learning_rate": 2.801790237426758e-05, |
| "lookahead_loss": 6.466564978599548, |
| "loss": 0.3287, |
| "step": 230500 |
| }, |
| { |
| "base_loss": 0.292520221978426, |
| "epoch": 1.0591278076171875, |
| "grad_norm": 0.000923556333873421, |
| "learning_rate": 2.7970218658447266e-05, |
| "lookahead_loss": 6.3680587558746335, |
| "loss": 0.3084, |
| "step": 231000 |
| }, |
| { |
| "base_loss": 0.3019208701252937, |
| "epoch": 1.0600814819335938, |
| "grad_norm": 0.0009795061778277159, |
| "learning_rate": 2.7922534942626953e-05, |
| "lookahead_loss": 6.414703974723816, |
| "loss": 0.315, |
| "step": 231500 |
| }, |
| { |
| "base_loss": 0.32141088619828223, |
| "epoch": 1.06103515625, |
| "grad_norm": 0.0009851646609604359, |
| "learning_rate": 2.7874851226806643e-05, |
| "lookahead_loss": 6.393617419719696, |
| "loss": 0.3324, |
| "step": 232000 |
| }, |
| { |
| "base_loss": 0.30723505771160126, |
| "epoch": 1.0619888305664062, |
| "grad_norm": 0.0010298019042238593, |
| "learning_rate": 2.782716751098633e-05, |
| "lookahead_loss": 6.404361547470093, |
| "loss": 0.3166, |
| "step": 232500 |
| }, |
| { |
| "base_loss": 0.308370777964592, |
| "epoch": 1.0629425048828125, |
| "grad_norm": 0.0009864643216133118, |
| "learning_rate": 2.7779483795166017e-05, |
| "lookahead_loss": 6.393691156387329, |
| "loss": 0.3175, |
| "step": 233000 |
| }, |
| { |
| "base_loss": 0.3175841515958309, |
| "epoch": 1.0638961791992188, |
| "grad_norm": 0.0009484239271841943, |
| "learning_rate": 2.7731800079345704e-05, |
| "lookahead_loss": 6.43188937664032, |
| "loss": 0.33, |
| "step": 233500 |
| }, |
| { |
| "base_loss": 0.3023634272813797, |
| "epoch": 1.064849853515625, |
| "grad_norm": 0.0009487507632002234, |
| "learning_rate": 2.768411636352539e-05, |
| "lookahead_loss": 6.451686841011047, |
| "loss": 0.3175, |
| "step": 234000 |
| }, |
| { |
| "base_loss": 0.31000158992409704, |
| "epoch": 1.0658035278320312, |
| "grad_norm": 0.0009820089908316731, |
| "learning_rate": 2.763643264770508e-05, |
| "lookahead_loss": 6.376307249069214, |
| "loss": 0.3214, |
| "step": 234500 |
| }, |
| { |
| "base_loss": 0.3074465197324753, |
| "epoch": 1.0667572021484375, |
| "grad_norm": 0.0009712969767861068, |
| "learning_rate": 2.7588748931884768e-05, |
| "lookahead_loss": 6.353045886516571, |
| "loss": 0.3171, |
| "step": 235000 |
| }, |
| { |
| "epoch": 1.0667572021484375, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.437607750725061, |
| "eval_lookahead_perplexity": 624.9100695885159, |
| "eval_loss": 0.1420057862997055, |
| "eval_perplexity": 1.1525833177102218, |
| "eval_runtime": 291.1708, |
| "eval_samples_per_second": 17.172, |
| "eval_steps_per_second": 0.539, |
| "step": 235000 |
| }, |
| { |
| "base_loss": 0.3308669750988483, |
| "epoch": 1.0677108764648438, |
| "grad_norm": 0.0010304059833288193, |
| "learning_rate": 2.7541065216064455e-05, |
| "lookahead_loss": 6.44032656955719, |
| "loss": 0.3439, |
| "step": 235500 |
| }, |
| { |
| "base_loss": 0.300963022172451, |
| "epoch": 1.06866455078125, |
| "grad_norm": 0.0009902446763589978, |
| "learning_rate": 2.749338150024414e-05, |
| "lookahead_loss": 6.4132391576766965, |
| "loss": 0.3104, |
| "step": 236000 |
| }, |
| { |
| "base_loss": 0.3016065271794796, |
| "epoch": 1.0696182250976562, |
| "grad_norm": 0.0009580631158314645, |
| "learning_rate": 2.7445697784423828e-05, |
| "lookahead_loss": 6.435465224266053, |
| "loss": 0.3147, |
| "step": 236500 |
| }, |
| { |
| "base_loss": 0.3469915909469128, |
| "epoch": 1.0705718994140625, |
| "grad_norm": 0.0009712293976917863, |
| "learning_rate": 2.739801406860352e-05, |
| "lookahead_loss": 6.335322134017944, |
| "loss": 0.3591, |
| "step": 237000 |
| }, |
| { |
| "base_loss": 0.31762470316886904, |
| "epoch": 1.0715255737304688, |
| "grad_norm": 0.00096644286531955, |
| "learning_rate": 2.7350330352783205e-05, |
| "lookahead_loss": 6.388244523525238, |
| "loss": 0.3257, |
| "step": 237500 |
| }, |
| { |
| "base_loss": 0.3090612238943577, |
| "epoch": 1.072479248046875, |
| "grad_norm": 0.0009957854636013508, |
| "learning_rate": 2.7302646636962892e-05, |
| "lookahead_loss": 6.472851838111877, |
| "loss": 0.319, |
| "step": 238000 |
| }, |
| { |
| "base_loss": 0.3051177371442318, |
| "epoch": 1.0734329223632812, |
| "grad_norm": 0.0009499763837084174, |
| "learning_rate": 2.725496292114258e-05, |
| "lookahead_loss": 6.422829883098602, |
| "loss": 0.3177, |
| "step": 238500 |
| }, |
| { |
| "base_loss": 0.32735036182403565, |
| "epoch": 1.0743865966796875, |
| "grad_norm": 0.0009518949664197862, |
| "learning_rate": 2.7207279205322266e-05, |
| "lookahead_loss": 6.442498418331146, |
| "loss": 0.34, |
| "step": 239000 |
| }, |
| { |
| "base_loss": 0.3037717220187187, |
| "epoch": 1.0753402709960938, |
| "grad_norm": 0.0009656847105361521, |
| "learning_rate": 2.7159595489501956e-05, |
| "lookahead_loss": 6.504564664363861, |
| "loss": 0.316, |
| "step": 239500 |
| }, |
| { |
| "base_loss": 0.3043428426384926, |
| "epoch": 1.0762939453125, |
| "grad_norm": 0.0009005047613754869, |
| "learning_rate": 2.7111911773681643e-05, |
| "lookahead_loss": 6.462020317077637, |
| "loss": 0.3176, |
| "step": 240000 |
| }, |
| { |
| "epoch": 1.0762939453125, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.43002515707534, |
| "eval_lookahead_perplexity": 620.189549971696, |
| "eval_loss": 0.14199133217334747, |
| "eval_perplexity": 1.1525666582457088, |
| "eval_runtime": 297.2728, |
| "eval_samples_per_second": 16.82, |
| "eval_steps_per_second": 0.528, |
| "step": 240000 |
| }, |
| { |
| "base_loss": 0.3304513318836689, |
| "epoch": 1.0772476196289062, |
| "grad_norm": 0.0009675352484919131, |
| "learning_rate": 2.706422805786133e-05, |
| "lookahead_loss": 6.441016254425048, |
| "loss": 0.344, |
| "step": 240500 |
| }, |
| { |
| "base_loss": 0.3036956556737423, |
| "epoch": 1.0782012939453125, |
| "grad_norm": 0.000994171597994864, |
| "learning_rate": 2.7016544342041016e-05, |
| "lookahead_loss": 6.463219980716706, |
| "loss": 0.318, |
| "step": 241000 |
| }, |
| { |
| "base_loss": 0.29657268461585046, |
| "epoch": 1.0791549682617188, |
| "grad_norm": 0.000914372387342155, |
| "learning_rate": 2.6968860626220703e-05, |
| "lookahead_loss": 6.459996415138245, |
| "loss": 0.3112, |
| "step": 241500 |
| }, |
| { |
| "base_loss": 0.3123248810470104, |
| "epoch": 1.080108642578125, |
| "grad_norm": 0.0009580728365108371, |
| "learning_rate": 2.6921176910400393e-05, |
| "lookahead_loss": 6.445644073486328, |
| "loss": 0.3292, |
| "step": 242000 |
| }, |
| { |
| "base_loss": 0.32140666726231576, |
| "epoch": 1.0810623168945312, |
| "grad_norm": 0.000988593208603561, |
| "learning_rate": 2.687349319458008e-05, |
| "lookahead_loss": 6.463636265277863, |
| "loss": 0.3373, |
| "step": 242500 |
| }, |
| { |
| "base_loss": 0.29961148300766943, |
| "epoch": 1.0820159912109375, |
| "grad_norm": 0.0009065298363566399, |
| "learning_rate": 2.6825809478759767e-05, |
| "lookahead_loss": 6.459467195987702, |
| "loss": 0.3123, |
| "step": 243000 |
| }, |
| { |
| "base_loss": 0.3036400380730629, |
| "epoch": 1.0829696655273438, |
| "grad_norm": 0.0010064428206533194, |
| "learning_rate": 2.6778125762939454e-05, |
| "lookahead_loss": 6.485273163795471, |
| "loss": 0.3175, |
| "step": 243500 |
| }, |
| { |
| "base_loss": 0.3315876969695091, |
| "epoch": 1.08392333984375, |
| "grad_norm": 0.0009258039062842727, |
| "learning_rate": 2.673044204711914e-05, |
| "lookahead_loss": 6.489104858398438, |
| "loss": 0.345, |
| "step": 244000 |
| }, |
| { |
| "base_loss": 0.30729381024837493, |
| "epoch": 1.0848770141601562, |
| "grad_norm": 0.000969972345046699, |
| "learning_rate": 2.668275833129883e-05, |
| "lookahead_loss": 6.436074539661408, |
| "loss": 0.3198, |
| "step": 244500 |
| }, |
| { |
| "base_loss": 0.3001744159460068, |
| "epoch": 1.0858306884765625, |
| "grad_norm": 0.0010077956831082702, |
| "learning_rate": 2.6635074615478518e-05, |
| "lookahead_loss": 6.4327317771911625, |
| "loss": 0.3101, |
| "step": 245000 |
| }, |
| { |
| "epoch": 1.0858306884765625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.422323587222602, |
| "eval_lookahead_perplexity": 615.4314627197983, |
| "eval_loss": 0.14197710156440735, |
| "eval_perplexity": 1.1525502566370207, |
| "eval_runtime": 260.851, |
| "eval_samples_per_second": 19.168, |
| "eval_steps_per_second": 0.602, |
| "step": 245000 |
| }, |
| { |
| "base_loss": 0.3043146550655365, |
| "epoch": 1.0867843627929688, |
| "grad_norm": 0.001128224772401154, |
| "learning_rate": 2.6587390899658205e-05, |
| "lookahead_loss": 6.390339149475098, |
| "loss": 0.3138, |
| "step": 245500 |
| }, |
| { |
| "base_loss": 0.33685871040821075, |
| "epoch": 1.087738037109375, |
| "grad_norm": 0.0008826267439872026, |
| "learning_rate": 2.653970718383789e-05, |
| "lookahead_loss": 6.4577340927124025, |
| "loss": 0.3444, |
| "step": 246000 |
| }, |
| { |
| "base_loss": 0.30271838963031766, |
| "epoch": 1.0886917114257812, |
| "grad_norm": 0.0009827081812545657, |
| "learning_rate": 2.6492023468017578e-05, |
| "lookahead_loss": 6.433503454208374, |
| "loss": 0.3134, |
| "step": 246500 |
| }, |
| { |
| "base_loss": 0.31106107553839685, |
| "epoch": 1.0896453857421875, |
| "grad_norm": 0.000977760530076921, |
| "learning_rate": 2.644433975219727e-05, |
| "lookahead_loss": 6.467117793083191, |
| "loss": 0.3206, |
| "step": 247000 |
| }, |
| { |
| "base_loss": 0.29801268032193184, |
| "epoch": 1.0905990600585938, |
| "grad_norm": 0.0009753919439390302, |
| "learning_rate": 2.6396656036376955e-05, |
| "lookahead_loss": 6.457008891105652, |
| "loss": 0.3107, |
| "step": 247500 |
| }, |
| { |
| "base_loss": 0.29706856977939605, |
| "epoch": 1.091552734375, |
| "grad_norm": 0.0008775305468589067, |
| "learning_rate": 2.6348972320556642e-05, |
| "lookahead_loss": 6.417390830039978, |
| "loss": 0.3085, |
| "step": 248000 |
| }, |
| { |
| "base_loss": 0.3189851225912571, |
| "epoch": 1.0925064086914062, |
| "grad_norm": 0.0009562866762280464, |
| "learning_rate": 2.630128860473633e-05, |
| "lookahead_loss": 6.447861388206482, |
| "loss": 0.3347, |
| "step": 248500 |
| }, |
| { |
| "base_loss": 0.307105902582407, |
| "epoch": 1.0934600830078125, |
| "grad_norm": 0.0009072708780877292, |
| "learning_rate": 2.6253604888916016e-05, |
| "lookahead_loss": 6.451038600921631, |
| "loss": 0.3198, |
| "step": 249000 |
| }, |
| { |
| "base_loss": 0.2863923677802086, |
| "epoch": 1.0944137573242188, |
| "grad_norm": 0.0009528218070045114, |
| "learning_rate": 2.6205921173095706e-05, |
| "lookahead_loss": 6.449621738433838, |
| "loss": 0.3025, |
| "step": 249500 |
| }, |
| { |
| "base_loss": 0.2923275768607855, |
| "epoch": 1.095367431640625, |
| "grad_norm": 0.0009796309750527143, |
| "learning_rate": 2.6158237457275393e-05, |
| "lookahead_loss": 6.37427237701416, |
| "loss": 0.3081, |
| "step": 250000 |
| }, |
| { |
| "epoch": 1.095367431640625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.416619156877073, |
| "eval_lookahead_perplexity": 611.9307710270847, |
| "eval_loss": 0.1419648975133896, |
| "eval_perplexity": 1.1525361909407175, |
| "eval_runtime": 271.5852, |
| "eval_samples_per_second": 18.41, |
| "eval_steps_per_second": 0.578, |
| "step": 250000 |
| }, |
| { |
| "base_loss": 0.2988976333141327, |
| "epoch": 1.0963211059570312, |
| "grad_norm": 0.0009741184767335653, |
| "learning_rate": 2.611055374145508e-05, |
| "lookahead_loss": 6.453314149856568, |
| "loss": 0.312, |
| "step": 250500 |
| }, |
| { |
| "base_loss": 0.3292928241491318, |
| "epoch": 1.0972747802734375, |
| "grad_norm": 0.0009737982181832194, |
| "learning_rate": 2.6062870025634766e-05, |
| "lookahead_loss": 6.470526203155518, |
| "loss": 0.3397, |
| "step": 251000 |
| }, |
| { |
| "base_loss": 0.2914348037838936, |
| "epoch": 1.0982284545898438, |
| "grad_norm": 0.0009696637280285358, |
| "learning_rate": 2.6015186309814453e-05, |
| "lookahead_loss": 6.402810046672821, |
| "loss": 0.3081, |
| "step": 251500 |
| }, |
| { |
| "base_loss": 0.2972012578845024, |
| "epoch": 1.09918212890625, |
| "grad_norm": 0.0009921115124598145, |
| "learning_rate": 2.5967502593994143e-05, |
| "lookahead_loss": 6.451505978584289, |
| "loss": 0.3099, |
| "step": 252000 |
| }, |
| { |
| "base_loss": 0.3006402098238468, |
| "epoch": 1.1001358032226562, |
| "grad_norm": 0.0009354639914818108, |
| "learning_rate": 2.591981887817383e-05, |
| "lookahead_loss": 6.447086319446564, |
| "loss": 0.3148, |
| "step": 252500 |
| }, |
| { |
| "base_loss": 0.3227167456150055, |
| "epoch": 1.1010894775390625, |
| "grad_norm": 0.0009446279727853835, |
| "learning_rate": 2.5872135162353517e-05, |
| "lookahead_loss": 6.516663771629333, |
| "loss": 0.3322, |
| "step": 253000 |
| }, |
| { |
| "base_loss": 0.30574207335710524, |
| "epoch": 1.1020431518554688, |
| "grad_norm": 0.0009245507535524666, |
| "learning_rate": 2.5824451446533204e-05, |
| "lookahead_loss": 6.473391896247864, |
| "loss": 0.315, |
| "step": 253500 |
| }, |
| { |
| "base_loss": 0.29960223579406736, |
| "epoch": 1.102996826171875, |
| "grad_norm": 0.0009597218013368547, |
| "learning_rate": 2.577676773071289e-05, |
| "lookahead_loss": 6.437963982582092, |
| "loss": 0.3144, |
| "step": 254000 |
| }, |
| { |
| "base_loss": 0.2996614835858345, |
| "epoch": 1.1039505004882812, |
| "grad_norm": 0.0009940110612660646, |
| "learning_rate": 2.572908401489258e-05, |
| "lookahead_loss": 6.468970078468323, |
| "loss": 0.3118, |
| "step": 254500 |
| }, |
| { |
| "base_loss": 0.3155037875175476, |
| "epoch": 1.1049041748046875, |
| "grad_norm": 0.0009892784291878343, |
| "learning_rate": 2.5681400299072268e-05, |
| "lookahead_loss": 6.404183995246887, |
| "loss": 0.3277, |
| "step": 255000 |
| }, |
| { |
| "epoch": 1.1049041748046875, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.4104706182266575, |
| "eval_lookahead_perplexity": 608.1798342368199, |
| "eval_loss": 0.1419525295495987, |
| "eval_perplexity": 1.1525219365029897, |
| "eval_runtime": 267.8175, |
| "eval_samples_per_second": 18.669, |
| "eval_steps_per_second": 0.586, |
| "step": 255000 |
| }, |
| { |
| "base_loss": 0.30881242457032204, |
| "epoch": 1.1058578491210938, |
| "grad_norm": 0.0009296280913986266, |
| "learning_rate": 2.5633716583251955e-05, |
| "lookahead_loss": 6.387983600616455, |
| "loss": 0.3208, |
| "step": 255500 |
| }, |
| { |
| "base_loss": 0.29839677426218986, |
| "epoch": 1.1068115234375, |
| "grad_norm": 0.0009309362503699958, |
| "learning_rate": 2.558603286743164e-05, |
| "lookahead_loss": 6.475096418380737, |
| "loss": 0.3094, |
| "step": 256000 |
| }, |
| { |
| "base_loss": 0.294783333927393, |
| "epoch": 1.1077651977539062, |
| "grad_norm": 0.0009678134229034185, |
| "learning_rate": 2.5538349151611328e-05, |
| "lookahead_loss": 6.3918491244316105, |
| "loss": 0.3082, |
| "step": 256500 |
| }, |
| { |
| "base_loss": 0.32150769320130346, |
| "epoch": 1.1087188720703125, |
| "grad_norm": 0.0010087802074849606, |
| "learning_rate": 2.549066543579102e-05, |
| "lookahead_loss": 6.3981378741264345, |
| "loss": 0.3336, |
| "step": 257000 |
| }, |
| { |
| "base_loss": 0.3191940434873104, |
| "epoch": 1.1096725463867188, |
| "grad_norm": 0.0009189763222821057, |
| "learning_rate": 2.5442981719970705e-05, |
| "lookahead_loss": 6.356856064796448, |
| "loss": 0.3283, |
| "step": 257500 |
| }, |
| { |
| "base_loss": 0.30270202097296717, |
| "epoch": 1.110626220703125, |
| "grad_norm": 0.0008876527426764369, |
| "learning_rate": 2.5395298004150392e-05, |
| "lookahead_loss": 6.327256792068481, |
| "loss": 0.3141, |
| "step": 258000 |
| }, |
| { |
| "base_loss": 0.2974509707689285, |
| "epoch": 1.1115798950195312, |
| "grad_norm": 0.0009743132395669818, |
| "learning_rate": 2.534761428833008e-05, |
| "lookahead_loss": 6.453075678348541, |
| "loss": 0.3085, |
| "step": 258500 |
| }, |
| { |
| "base_loss": 0.3114223616421223, |
| "epoch": 1.1125335693359375, |
| "grad_norm": 0.0009253611788153648, |
| "learning_rate": 2.5299930572509766e-05, |
| "lookahead_loss": 6.464822199821472, |
| "loss": 0.3229, |
| "step": 259000 |
| }, |
| { |
| "base_loss": 0.3443338246643543, |
| "epoch": 1.1134872436523438, |
| "grad_norm": 0.0009910385124385357, |
| "learning_rate": 2.5252246856689456e-05, |
| "lookahead_loss": 6.472144736289978, |
| "loss": 0.3517, |
| "step": 259500 |
| }, |
| { |
| "base_loss": 0.2939972540736198, |
| "epoch": 1.11444091796875, |
| "grad_norm": 0.000937771808821708, |
| "learning_rate": 2.5204563140869143e-05, |
| "lookahead_loss": 6.35105470085144, |
| "loss": 0.3083, |
| "step": 260000 |
| }, |
| { |
| "epoch": 1.11444091796875, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.404723172751479, |
| "eval_lookahead_perplexity": 604.6943796252261, |
| "eval_loss": 0.14194095134735107, |
| "eval_perplexity": 1.1525085924481642, |
| "eval_runtime": 271.0139, |
| "eval_samples_per_second": 18.449, |
| "eval_steps_per_second": 0.579, |
| "step": 260000 |
| }, |
| { |
| "base_loss": 0.29841734063625336, |
| "epoch": 1.1153945922851562, |
| "grad_norm": 0.0009852543007582426, |
| "learning_rate": 2.515687942504883e-05, |
| "lookahead_loss": 6.423218637466431, |
| "loss": 0.3114, |
| "step": 260500 |
| }, |
| { |
| "base_loss": 0.3147252712547779, |
| "epoch": 1.1163482666015625, |
| "grad_norm": 0.0010154576739296317, |
| "learning_rate": 2.5109195709228516e-05, |
| "lookahead_loss": 6.40000822353363, |
| "loss": 0.325, |
| "step": 261000 |
| }, |
| { |
| "base_loss": 0.32950386153161526, |
| "epoch": 1.1173019409179688, |
| "grad_norm": 0.000979804084636271, |
| "learning_rate": 2.5061511993408203e-05, |
| "lookahead_loss": 6.405930280685425, |
| "loss": 0.3432, |
| "step": 261500 |
| }, |
| { |
| "base_loss": 0.30734304267168044, |
| "epoch": 1.118255615234375, |
| "grad_norm": 0.0010216154623776674, |
| "learning_rate": 2.5013828277587893e-05, |
| "lookahead_loss": 6.439217642784119, |
| "loss": 0.3167, |
| "step": 262000 |
| }, |
| { |
| "base_loss": 0.3014386140704155, |
| "epoch": 1.1192092895507812, |
| "grad_norm": 0.0009699428919702768, |
| "learning_rate": 2.496614456176758e-05, |
| "lookahead_loss": 6.446448052883148, |
| "loss": 0.3129, |
| "step": 262500 |
| }, |
| { |
| "base_loss": 0.30611268219351767, |
| "epoch": 2.0009536743164062, |
| "grad_norm": 0.0009632021537981927, |
| "learning_rate": 2.4918460845947267e-05, |
| "lookahead_loss": 6.482765606403351, |
| "loss": 0.3149, |
| "step": 263000 |
| }, |
| { |
| "base_loss": 0.301539769411087, |
| "epoch": 2.0019073486328125, |
| "grad_norm": 0.0010064532980322838, |
| "learning_rate": 2.4870777130126954e-05, |
| "lookahead_loss": 6.310386034011841, |
| "loss": 0.314, |
| "step": 263500 |
| }, |
| { |
| "base_loss": 0.31222748425602914, |
| "epoch": 2.0028610229492188, |
| "grad_norm": 0.0009766683215275407, |
| "learning_rate": 2.482309341430664e-05, |
| "lookahead_loss": 6.307436645507813, |
| "loss": 0.3223, |
| "step": 264000 |
| }, |
| { |
| "base_loss": 0.32267384630441664, |
| "epoch": 2.003814697265625, |
| "grad_norm": 0.0009503905894234776, |
| "learning_rate": 2.477540969848633e-05, |
| "lookahead_loss": 6.3480785894393925, |
| "loss": 0.3351, |
| "step": 264500 |
| }, |
| { |
| "base_loss": 0.30016050645709036, |
| "epoch": 2.0047683715820312, |
| "grad_norm": 0.000949801120441407, |
| "learning_rate": 2.4727725982666018e-05, |
| "lookahead_loss": 6.324021258354187, |
| "loss": 0.3162, |
| "step": 265000 |
| }, |
| { |
| "epoch": 2.0047683715820312, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.398448333191795, |
| "eval_lookahead_perplexity": 600.9118990506257, |
| "eval_loss": 0.14192864298820496, |
| "eval_perplexity": 1.152494407045789, |
| "eval_runtime": 267.8201, |
| "eval_samples_per_second": 18.669, |
| "eval_steps_per_second": 0.586, |
| "step": 265000 |
| }, |
| { |
| "base_loss": 0.3024714471399784, |
| "epoch": 2.0057220458984375, |
| "grad_norm": 0.000842128531076014, |
| "learning_rate": 2.4680042266845705e-05, |
| "lookahead_loss": 6.430944422245026, |
| "loss": 0.3115, |
| "step": 265500 |
| }, |
| { |
| "base_loss": 0.2964489733278751, |
| "epoch": 2.0066757202148438, |
| "grad_norm": 0.0009076377027668059, |
| "learning_rate": 2.463235855102539e-05, |
| "lookahead_loss": 6.303486613273621, |
| "loss": 0.3129, |
| "step": 266000 |
| }, |
| { |
| "base_loss": 0.31337857532501223, |
| "epoch": 2.00762939453125, |
| "grad_norm": 0.0009625607635825872, |
| "learning_rate": 2.4584674835205078e-05, |
| "lookahead_loss": 6.363961039543152, |
| "loss": 0.3236, |
| "step": 266500 |
| }, |
| { |
| "base_loss": 0.3180972839295864, |
| "epoch": 2.0085830688476562, |
| "grad_norm": 0.0009394744993187487, |
| "learning_rate": 2.453699111938477e-05, |
| "lookahead_loss": 6.347307140827179, |
| "loss": 0.3229, |
| "step": 267000 |
| }, |
| { |
| "base_loss": 0.30493127757310867, |
| "epoch": 2.0095367431640625, |
| "grad_norm": 0.0009756337967701256, |
| "learning_rate": 2.4489307403564455e-05, |
| "lookahead_loss": 6.364429833412171, |
| "loss": 0.3183, |
| "step": 267500 |
| }, |
| { |
| "base_loss": 0.30099570405483245, |
| "epoch": 2.0104904174804688, |
| "grad_norm": 0.0009337849332951009, |
| "learning_rate": 2.4441623687744142e-05, |
| "lookahead_loss": 6.338661858081817, |
| "loss": 0.3112, |
| "step": 268000 |
| }, |
| { |
| "base_loss": 0.30160990768671037, |
| "epoch": 2.011444091796875, |
| "grad_norm": 0.0010120351798832417, |
| "learning_rate": 2.439393997192383e-05, |
| "lookahead_loss": 6.341832407951355, |
| "loss": 0.3145, |
| "step": 268500 |
| }, |
| { |
| "base_loss": 0.32538792353868484, |
| "epoch": 2.0123977661132812, |
| "grad_norm": 0.0008798608323559165, |
| "learning_rate": 2.4346256256103516e-05, |
| "lookahead_loss": 6.331272545814514, |
| "loss": 0.3356, |
| "step": 269000 |
| }, |
| { |
| "base_loss": 0.3040602553486824, |
| "epoch": 2.0133514404296875, |
| "grad_norm": 0.0009250569855794311, |
| "learning_rate": 2.4298572540283206e-05, |
| "lookahead_loss": 6.42705198764801, |
| "loss": 0.3192, |
| "step": 269500 |
| }, |
| { |
| "base_loss": 0.29813345649838446, |
| "epoch": 2.0143051147460938, |
| "grad_norm": 0.000940575497224927, |
| "learning_rate": 2.4250888824462893e-05, |
| "lookahead_loss": 6.370742217063904, |
| "loss": 0.3118, |
| "step": 270000 |
| }, |
| { |
| "epoch": 2.0143051147460938, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.3924329357025345, |
| "eval_lookahead_perplexity": 597.308025355375, |
| "eval_loss": 0.14191682636737823, |
| "eval_perplexity": 1.1524807885368387, |
| "eval_runtime": 257.7499, |
| "eval_samples_per_second": 19.399, |
| "eval_steps_per_second": 0.609, |
| "step": 270000 |
| }, |
| { |
| "base_loss": 0.2960759684741497, |
| "epoch": 2.0152587890625, |
| "grad_norm": 0.0009009315399453044, |
| "learning_rate": 2.420320510864258e-05, |
| "lookahead_loss": 6.327431865215302, |
| "loss": 0.3069, |
| "step": 270500 |
| }, |
| { |
| "base_loss": 0.31211792075634004, |
| "epoch": 2.0162124633789062, |
| "grad_norm": 0.0009672954329289496, |
| "learning_rate": 2.4155521392822266e-05, |
| "lookahead_loss": 6.376048874855042, |
| "loss": 0.3227, |
| "step": 271000 |
| }, |
| { |
| "base_loss": 0.31110167542099953, |
| "epoch": 2.0171661376953125, |
| "grad_norm": 0.0009206890244968235, |
| "learning_rate": 2.4107837677001953e-05, |
| "lookahead_loss": 6.41415591430664, |
| "loss": 0.3223, |
| "step": 271500 |
| }, |
| { |
| "base_loss": 0.2990322083234787, |
| "epoch": 2.0181198120117188, |
| "grad_norm": 0.0009310735040344298, |
| "learning_rate": 2.406015396118164e-05, |
| "lookahead_loss": 6.427069549560547, |
| "loss": 0.3117, |
| "step": 272000 |
| }, |
| { |
| "base_loss": 0.29806812533736227, |
| "epoch": 2.019073486328125, |
| "grad_norm": 0.0009696860215626657, |
| "learning_rate": 2.401247024536133e-05, |
| "lookahead_loss": 6.449417492866516, |
| "loss": 0.3097, |
| "step": 272500 |
| }, |
| { |
| "base_loss": 0.30187543269991873, |
| "epoch": 2.0200271606445312, |
| "grad_norm": 0.001025256235152483, |
| "learning_rate": 2.3964786529541017e-05, |
| "lookahead_loss": 6.279822265148163, |
| "loss": 0.3149, |
| "step": 273000 |
| }, |
| { |
| "base_loss": 0.32729279178380966, |
| "epoch": 2.0209808349609375, |
| "grad_norm": 0.0009685347322374582, |
| "learning_rate": 2.3917102813720704e-05, |
| "lookahead_loss": 6.365867915153504, |
| "loss": 0.3375, |
| "step": 273500 |
| }, |
| { |
| "base_loss": 0.3057846530973911, |
| "epoch": 2.0219345092773438, |
| "grad_norm": 0.000963672180660069, |
| "learning_rate": 2.386941909790039e-05, |
| "lookahead_loss": 6.3289254207611085, |
| "loss": 0.3143, |
| "step": 274000 |
| }, |
| { |
| "base_loss": 0.2997340569794178, |
| "epoch": 2.02288818359375, |
| "grad_norm": 0.0009888113709166646, |
| "learning_rate": 2.3821735382080078e-05, |
| "lookahead_loss": 6.368264214992523, |
| "loss": 0.3116, |
| "step": 274500 |
| }, |
| { |
| "base_loss": 0.30260268279910085, |
| "epoch": 2.0238418579101562, |
| "grad_norm": 0.0009482959285378456, |
| "learning_rate": 2.3774051666259768e-05, |
| "lookahead_loss": 6.32055482673645, |
| "loss": 0.314, |
| "step": 275000 |
| }, |
| { |
| "epoch": 2.0238418579101562, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.385799606006366, |
| "eval_lookahead_perplexity": 593.3589963787288, |
| "eval_loss": 0.1419040411710739, |
| "eval_perplexity": 1.1524660539379128, |
| "eval_runtime": 265.9647, |
| "eval_samples_per_second": 18.799, |
| "eval_steps_per_second": 0.59, |
| "step": 275000 |
| }, |
| { |
| "base_loss": 0.3236656226217747, |
| "epoch": 2.0247955322265625, |
| "grad_norm": 0.0009006787440739572, |
| "learning_rate": 2.3726367950439455e-05, |
| "lookahead_loss": 6.34121999502182, |
| "loss": 0.3346, |
| "step": 275500 |
| }, |
| { |
| "base_loss": 0.30869458481669426, |
| "epoch": 2.0257492065429688, |
| "grad_norm": 0.0009739008382894099, |
| "learning_rate": 2.367868423461914e-05, |
| "lookahead_loss": 6.298044787406921, |
| "loss": 0.3227, |
| "step": 276000 |
| }, |
| { |
| "base_loss": 0.3019005296528339, |
| "epoch": 2.026702880859375, |
| "grad_norm": 0.0010005339281633496, |
| "learning_rate": 2.3631000518798828e-05, |
| "lookahead_loss": 6.320043759346008, |
| "loss": 0.3112, |
| "step": 276500 |
| }, |
| { |
| "base_loss": 0.3077106066644192, |
| "epoch": 2.0276565551757812, |
| "grad_norm": 0.0009583939099684358, |
| "learning_rate": 2.3583316802978515e-05, |
| "lookahead_loss": 6.426606664657593, |
| "loss": 0.3183, |
| "step": 277000 |
| }, |
| { |
| "base_loss": 0.3280421564877033, |
| "epoch": 2.0286102294921875, |
| "grad_norm": 0.0009699960355646908, |
| "learning_rate": 2.3535633087158205e-05, |
| "lookahead_loss": 6.442135247707367, |
| "loss": 0.3392, |
| "step": 277500 |
| }, |
| { |
| "base_loss": 0.30581475085020066, |
| "epoch": 2.0295639038085938, |
| "grad_norm": 0.0009488245123066008, |
| "learning_rate": 2.3487949371337892e-05, |
| "lookahead_loss": 6.394023329734802, |
| "loss": 0.3138, |
| "step": 278000 |
| }, |
| { |
| "base_loss": 0.3068877322375774, |
| "epoch": 2.030517578125, |
| "grad_norm": 0.0009734364575706422, |
| "learning_rate": 2.344026565551758e-05, |
| "lookahead_loss": 6.383229479789734, |
| "loss": 0.3165, |
| "step": 278500 |
| }, |
| { |
| "base_loss": 0.3014947620034218, |
| "epoch": 2.0314712524414062, |
| "grad_norm": 0.0009622674551792443, |
| "learning_rate": 2.3392581939697266e-05, |
| "lookahead_loss": 6.387813640594483, |
| "loss": 0.3141, |
| "step": 279000 |
| }, |
| { |
| "base_loss": 0.3173881909847259, |
| "epoch": 2.0324249267578125, |
| "grad_norm": 0.0009690853185020387, |
| "learning_rate": 2.3344898223876953e-05, |
| "lookahead_loss": 6.346703090667725, |
| "loss": 0.3335, |
| "step": 279500 |
| }, |
| { |
| "base_loss": 0.3059709269702435, |
| "epoch": 2.0333786010742188, |
| "grad_norm": 0.0009841558057814837, |
| "learning_rate": 2.3297214508056643e-05, |
| "lookahead_loss": 6.396211630821228, |
| "loss": 0.3149, |
| "step": 280000 |
| }, |
| { |
| "epoch": 2.0333786010742188, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.38051580849547, |
| "eval_lookahead_perplexity": 590.2320758728989, |
| "eval_loss": 0.14189383387565613, |
| "eval_perplexity": 1.152454290436478, |
| "eval_runtime": 270.4203, |
| "eval_samples_per_second": 18.49, |
| "eval_steps_per_second": 0.581, |
| "step": 280000 |
| }, |
| { |
| "base_loss": 0.3017133396565914, |
| "epoch": 2.034332275390625, |
| "grad_norm": 0.0009047857020050287, |
| "learning_rate": 2.324953079223633e-05, |
| "lookahead_loss": 6.462435024261475, |
| "loss": 0.3144, |
| "step": 280500 |
| }, |
| { |
| "base_loss": 0.31134800574183463, |
| "epoch": 2.0352859497070312, |
| "grad_norm": 0.0010061671491712332, |
| "learning_rate": 2.3201847076416016e-05, |
| "lookahead_loss": 6.291885371685028, |
| "loss": 0.3227, |
| "step": 281000 |
| }, |
| { |
| "base_loss": 0.32387468561530114, |
| "epoch": 2.0362396240234375, |
| "grad_norm": 0.0009494886617176235, |
| "learning_rate": 2.3154163360595703e-05, |
| "lookahead_loss": 6.4035931491851805, |
| "loss": 0.3364, |
| "step": 281500 |
| }, |
| { |
| "base_loss": 0.3080780008882284, |
| "epoch": 2.0371932983398438, |
| "grad_norm": 0.0009919317672029138, |
| "learning_rate": 2.310647964477539e-05, |
| "lookahead_loss": 6.333096095561981, |
| "loss": 0.3194, |
| "step": 282000 |
| }, |
| { |
| "base_loss": 0.30180328992009164, |
| "epoch": 2.03814697265625, |
| "grad_norm": 0.0009797826642170548, |
| "learning_rate": 2.305879592895508e-05, |
| "lookahead_loss": 6.382054131031037, |
| "loss": 0.3139, |
| "step": 282500 |
| }, |
| { |
| "base_loss": 0.30689890575408935, |
| "epoch": 2.0391006469726562, |
| "grad_norm": 0.0009295629570260644, |
| "learning_rate": 2.3011112213134767e-05, |
| "lookahead_loss": 6.332678085803986, |
| "loss": 0.319, |
| "step": 283000 |
| }, |
| { |
| "base_loss": 0.32427770999073985, |
| "epoch": 2.0400543212890625, |
| "grad_norm": 0.0009341423865407705, |
| "learning_rate": 2.2963428497314454e-05, |
| "lookahead_loss": 6.368278294086457, |
| "loss": 0.3333, |
| "step": 283500 |
| }, |
| { |
| "base_loss": 0.30682690465450285, |
| "epoch": 2.0410079956054688, |
| "grad_norm": 0.000977862160652876, |
| "learning_rate": 2.291574478149414e-05, |
| "lookahead_loss": 6.298216439723968, |
| "loss": 0.3158, |
| "step": 284000 |
| }, |
| { |
| "base_loss": 0.29654143354296686, |
| "epoch": 2.041961669921875, |
| "grad_norm": 0.0009500051965005696, |
| "learning_rate": 2.2868061065673828e-05, |
| "lookahead_loss": 6.386434417724609, |
| "loss": 0.3081, |
| "step": 284500 |
| }, |
| { |
| "base_loss": 0.30721216344833374, |
| "epoch": 2.0429153442382812, |
| "grad_norm": 0.0009771424811333418, |
| "learning_rate": 2.2820377349853518e-05, |
| "lookahead_loss": 6.396834279060363, |
| "loss": 0.3232, |
| "step": 285000 |
| }, |
| { |
| "epoch": 2.0429153442382812, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.375226985151395, |
| "eval_lookahead_perplexity": 587.1186830411696, |
| "eval_loss": 0.14188361167907715, |
| "eval_perplexity": 1.1524425098823847, |
| "eval_runtime": 255.0149, |
| "eval_samples_per_second": 19.607, |
| "eval_steps_per_second": 0.616, |
| "step": 285000 |
| }, |
| { |
| "base_loss": 0.32630143281817436, |
| "epoch": 2.0438690185546875, |
| "grad_norm": 0.000979576027020812, |
| "learning_rate": 2.2772693634033205e-05, |
| "lookahead_loss": 6.421607649326324, |
| "loss": 0.3418, |
| "step": 285500 |
| }, |
| { |
| "base_loss": 0.296696748316288, |
| "epoch": 2.0448226928710938, |
| "grad_norm": 0.0009980611503124237, |
| "learning_rate": 2.272500991821289e-05, |
| "lookahead_loss": 6.342905656814575, |
| "loss": 0.3099, |
| "step": 286000 |
| }, |
| { |
| "base_loss": 0.30323311913013457, |
| "epoch": 2.0457763671875, |
| "grad_norm": 0.0009551486582495272, |
| "learning_rate": 2.2677326202392578e-05, |
| "lookahead_loss": 6.355796694755554, |
| "loss": 0.3161, |
| "step": 286500 |
| }, |
| { |
| "base_loss": 0.32944888742268086, |
| "epoch": 2.0467300415039062, |
| "grad_norm": 0.0009698076173663139, |
| "learning_rate": 2.2629642486572265e-05, |
| "lookahead_loss": 6.332300736427307, |
| "loss": 0.3395, |
| "step": 287000 |
| }, |
| { |
| "base_loss": 0.32393511798977853, |
| "epoch": 2.0476837158203125, |
| "grad_norm": 0.0009880122961476445, |
| "learning_rate": 2.2581958770751955e-05, |
| "lookahead_loss": 6.367317282676697, |
| "loss": 0.3395, |
| "step": 287500 |
| }, |
| { |
| "base_loss": 0.293301939278841, |
| "epoch": 2.0486373901367188, |
| "grad_norm": 0.0009537344449199736, |
| "learning_rate": 2.2534275054931642e-05, |
| "lookahead_loss": 6.327198909759521, |
| "loss": 0.3059, |
| "step": 288000 |
| }, |
| { |
| "base_loss": 0.3036652799248695, |
| "epoch": 2.049591064453125, |
| "grad_norm": 0.001002758159302175, |
| "learning_rate": 2.248659133911133e-05, |
| "lookahead_loss": 6.3047745990753175, |
| "loss": 0.3174, |
| "step": 288500 |
| }, |
| { |
| "base_loss": 0.3174412237107754, |
| "epoch": 2.0505447387695312, |
| "grad_norm": 0.0009080055169761181, |
| "learning_rate": 2.2438907623291016e-05, |
| "lookahead_loss": 6.417123206138611, |
| "loss": 0.3322, |
| "step": 289000 |
| }, |
| { |
| "base_loss": 0.30474287942051886, |
| "epoch": 2.0514984130859375, |
| "grad_norm": 0.0009786185109987855, |
| "learning_rate": 2.2391223907470703e-05, |
| "lookahead_loss": 6.35580781173706, |
| "loss": 0.318, |
| "step": 289500 |
| }, |
| { |
| "base_loss": 0.30692395463585853, |
| "epoch": 2.0524520874023438, |
| "grad_norm": 0.001052466919645667, |
| "learning_rate": 2.2343540191650393e-05, |
| "lookahead_loss": 6.302160401821136, |
| "loss": 0.3179, |
| "step": 290000 |
| }, |
| { |
| "epoch": 2.0524520874023438, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.370734815780347, |
| "eval_lookahead_perplexity": 584.4871615214768, |
| "eval_loss": 0.14187423884868622, |
| "eval_perplexity": 1.1524317082848252, |
| "eval_runtime": 269.1546, |
| "eval_samples_per_second": 18.577, |
| "eval_steps_per_second": 0.583, |
| "step": 290000 |
| }, |
| { |
| "base_loss": 0.32256214889883994, |
| "epoch": 2.05340576171875, |
| "grad_norm": 0.0009556798031553626, |
| "learning_rate": 2.229585647583008e-05, |
| "lookahead_loss": 6.321493628501892, |
| "loss": 0.3314, |
| "step": 290500 |
| }, |
| { |
| "base_loss": 0.3550116382241249, |
| "epoch": 2.0543594360351562, |
| "grad_norm": 0.000961259298492223, |
| "learning_rate": 2.2248172760009766e-05, |
| "lookahead_loss": 6.349826979160309, |
| "loss": 0.3695, |
| "step": 291000 |
| }, |
| { |
| "base_loss": 0.2970747436285019, |
| "epoch": 2.0553131103515625, |
| "grad_norm": 0.0009557474404573441, |
| "learning_rate": 2.2200489044189453e-05, |
| "lookahead_loss": 6.357933250904083, |
| "loss": 0.3083, |
| "step": 291500 |
| }, |
| { |
| "base_loss": 0.30645539990067483, |
| "epoch": 2.0562667846679688, |
| "grad_norm": 0.0009564717183820903, |
| "learning_rate": 2.215280532836914e-05, |
| "lookahead_loss": 6.364681176662445, |
| "loss": 0.3169, |
| "step": 292000 |
| }, |
| { |
| "base_loss": 0.31723022189736366, |
| "epoch": 2.057220458984375, |
| "grad_norm": 0.0009774576174095273, |
| "learning_rate": 2.210512161254883e-05, |
| "lookahead_loss": 6.384846560955047, |
| "loss": 0.331, |
| "step": 292500 |
| }, |
| { |
| "base_loss": 0.3193240025639534, |
| "epoch": 2.0581741333007812, |
| "grad_norm": 0.0009728356963023543, |
| "learning_rate": 2.2057437896728517e-05, |
| "lookahead_loss": 6.384281438827514, |
| "loss": 0.3274, |
| "step": 293000 |
| }, |
| { |
| "base_loss": 0.2937832759618759, |
| "epoch": 2.0591278076171875, |
| "grad_norm": 0.0009236375335603952, |
| "learning_rate": 2.2009754180908204e-05, |
| "lookahead_loss": 6.286119668006897, |
| "loss": 0.3073, |
| "step": 293500 |
| }, |
| { |
| "base_loss": 0.30271227744221685, |
| "epoch": 2.0600814819335938, |
| "grad_norm": 0.000955918338149786, |
| "learning_rate": 2.196207046508789e-05, |
| "lookahead_loss": 6.3528727483749385, |
| "loss": 0.3166, |
| "step": 294000 |
| }, |
| { |
| "base_loss": 0.3198817696869373, |
| "epoch": 2.06103515625, |
| "grad_norm": 0.0009816517122089863, |
| "learning_rate": 2.1914386749267578e-05, |
| "lookahead_loss": 6.311840002059936, |
| "loss": 0.3308, |
| "step": 294500 |
| }, |
| { |
| "base_loss": 0.3065698970258236, |
| "epoch": 2.0619888305664062, |
| "grad_norm": 0.0010125135304406285, |
| "learning_rate": 2.1866703033447268e-05, |
| "lookahead_loss": 6.322453142166138, |
| "loss": 0.3149, |
| "step": 295000 |
| }, |
| { |
| "epoch": 2.0619888305664062, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.365234135057979, |
| "eval_lookahead_perplexity": 581.2809106252928, |
| "eval_loss": 0.14186429977416992, |
| "eval_perplexity": 1.152420254237123, |
| "eval_runtime": 252.3744, |
| "eval_samples_per_second": 19.812, |
| "eval_steps_per_second": 0.622, |
| "step": 295000 |
| }, |
| { |
| "base_loss": 0.30793745544552803, |
| "epoch": 2.0629425048828125, |
| "grad_norm": 0.0010041906498372555, |
| "learning_rate": 2.1819019317626955e-05, |
| "lookahead_loss": 6.316522599697113, |
| "loss": 0.3179, |
| "step": 295500 |
| }, |
| { |
| "base_loss": 0.3166033121049404, |
| "epoch": 2.0638961791992188, |
| "grad_norm": 0.0009562079794704914, |
| "learning_rate": 2.177133560180664e-05, |
| "lookahead_loss": 6.345142545700074, |
| "loss": 0.3285, |
| "step": 296000 |
| }, |
| { |
| "base_loss": 0.30278992640972135, |
| "epoch": 2.064849853515625, |
| "grad_norm": 0.0009520898456685245, |
| "learning_rate": 2.1723651885986328e-05, |
| "lookahead_loss": 6.375065122127533, |
| "loss": 0.3182, |
| "step": 296500 |
| }, |
| { |
| "base_loss": 0.30789859166741373, |
| "epoch": 2.0658035278320312, |
| "grad_norm": 0.0009764463757164776, |
| "learning_rate": 2.1675968170166015e-05, |
| "lookahead_loss": 6.292466301918029, |
| "loss": 0.3196, |
| "step": 297000 |
| }, |
| { |
| "base_loss": 0.307515013217926, |
| "epoch": 2.0667572021484375, |
| "grad_norm": 0.0009969203965738416, |
| "learning_rate": 2.1628284454345705e-05, |
| "lookahead_loss": 6.283729823112488, |
| "loss": 0.3164, |
| "step": 297500 |
| }, |
| { |
| "base_loss": 0.3304452752768993, |
| "epoch": 2.0677108764648438, |
| "grad_norm": 0.0010050119599327445, |
| "learning_rate": 2.1580600738525392e-05, |
| "lookahead_loss": 6.349568615913391, |
| "loss": 0.3424, |
| "step": 298000 |
| }, |
| { |
| "base_loss": 0.3000219973921776, |
| "epoch": 2.06866455078125, |
| "grad_norm": 0.0010230648331344128, |
| "learning_rate": 2.153291702270508e-05, |
| "lookahead_loss": 6.3259260330200195, |
| "loss": 0.3094, |
| "step": 298500 |
| }, |
| { |
| "base_loss": 0.3050138043165207, |
| "epoch": 2.0696182250976562, |
| "grad_norm": 0.000958802062086761, |
| "learning_rate": 2.1485233306884766e-05, |
| "lookahead_loss": 6.357368535041809, |
| "loss": 0.3167, |
| "step": 299000 |
| }, |
| { |
| "base_loss": 0.34709723374247553, |
| "epoch": 2.0705718994140625, |
| "grad_norm": 0.0009933494729921222, |
| "learning_rate": 2.1437549591064453e-05, |
| "lookahead_loss": 6.26349934053421, |
| "loss": 0.3598, |
| "step": 299500 |
| }, |
| { |
| "base_loss": 0.31454886627197265, |
| "epoch": 2.0715255737304688, |
| "grad_norm": 0.0009534511482343078, |
| "learning_rate": 2.1389865875244143e-05, |
| "lookahead_loss": 6.306615966796875, |
| "loss": 0.3259, |
| "step": 300000 |
| }, |
| { |
| "epoch": 2.0715255737304688, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.36017004521891, |
| "eval_lookahead_perplexity": 578.3446927825626, |
| "eval_loss": 0.14185450971126556, |
| "eval_perplexity": 1.1524089720255686, |
| "eval_runtime": 269.0073, |
| "eval_samples_per_second": 18.587, |
| "eval_steps_per_second": 0.584, |
| "step": 300000 |
| }, |
| { |
| "base_loss": 0.30621468406915664, |
| "epoch": 2.072479248046875, |
| "grad_norm": 0.0009834032971411943, |
| "learning_rate": 2.134218215942383e-05, |
| "lookahead_loss": 6.379785425662995, |
| "loss": 0.3181, |
| "step": 300500 |
| }, |
| { |
| "base_loss": 0.3062588813006878, |
| "epoch": 2.0734329223632812, |
| "grad_norm": 0.0009368330356664956, |
| "learning_rate": 2.1294498443603516e-05, |
| "lookahead_loss": 6.34466918849945, |
| "loss": 0.3187, |
| "step": 301000 |
| }, |
| { |
| "base_loss": 0.3277868445813656, |
| "epoch": 2.0743865966796875, |
| "grad_norm": 0.0009595782612450421, |
| "learning_rate": 2.1246814727783203e-05, |
| "lookahead_loss": 6.362534090518952, |
| "loss": 0.3403, |
| "step": 301500 |
| }, |
| { |
| "base_loss": 0.30303199696540833, |
| "epoch": 2.0753402709960938, |
| "grad_norm": 0.0009713785257190466, |
| "learning_rate": 2.119913101196289e-05, |
| "lookahead_loss": 6.440483233451843, |
| "loss": 0.3151, |
| "step": 302000 |
| }, |
| { |
| "base_loss": 0.30761926966905595, |
| "epoch": 2.0762939453125, |
| "grad_norm": 0.0009204771486110985, |
| "learning_rate": 2.115144729614258e-05, |
| "lookahead_loss": 6.384215325355529, |
| "loss": 0.3185, |
| "step": 302500 |
| }, |
| { |
| "base_loss": 0.33150802648067473, |
| "epoch": 2.0772476196289062, |
| "grad_norm": 0.0009674776811152697, |
| "learning_rate": 2.1103763580322267e-05, |
| "lookahead_loss": 6.376626619338989, |
| "loss": 0.3442, |
| "step": 303000 |
| }, |
| { |
| "base_loss": 0.30574921500682833, |
| "epoch": 2.0782012939453125, |
| "grad_norm": 0.0009765610448084772, |
| "learning_rate": 2.1056079864501954e-05, |
| "lookahead_loss": 6.3769332141876225, |
| "loss": 0.3173, |
| "step": 303500 |
| }, |
| { |
| "base_loss": 0.2994054418802261, |
| "epoch": 2.0791549682617188, |
| "grad_norm": 0.0009325052960775793, |
| "learning_rate": 2.100839614868164e-05, |
| "lookahead_loss": 6.375643718719482, |
| "loss": 0.3132, |
| "step": 304000 |
| }, |
| { |
| "base_loss": 0.31194803246855735, |
| "epoch": 2.080108642578125, |
| "grad_norm": 0.0009499716688878834, |
| "learning_rate": 2.0960712432861328e-05, |
| "lookahead_loss": 6.357075540542603, |
| "loss": 0.3293, |
| "step": 304500 |
| }, |
| { |
| "base_loss": 0.3244352611005306, |
| "epoch": 2.0810623168945312, |
| "grad_norm": 0.0010088100098073483, |
| "learning_rate": 2.0913028717041018e-05, |
| "lookahead_loss": 6.381894103050232, |
| "loss": 0.3389, |
| "step": 305000 |
| }, |
| { |
| "epoch": 2.0810623168945312, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.354905619788855, |
| "eval_lookahead_perplexity": 575.3080404027362, |
| "eval_loss": 0.14184485375881195, |
| "eval_perplexity": 1.1523978444730512, |
| "eval_runtime": 264.9253, |
| "eval_samples_per_second": 18.873, |
| "eval_steps_per_second": 0.593, |
| "step": 305000 |
| }, |
| { |
| "base_loss": 0.30112930870056154, |
| "epoch": 2.0820159912109375, |
| "grad_norm": 0.0009222808876074851, |
| "learning_rate": 2.0865345001220705e-05, |
| "lookahead_loss": 6.385272013187408, |
| "loss": 0.313, |
| "step": 305500 |
| }, |
| { |
| "base_loss": 0.30448419651389125, |
| "epoch": 2.0829696655273438, |
| "grad_norm": 0.0009914437541738153, |
| "learning_rate": 2.081766128540039e-05, |
| "lookahead_loss": 6.414517087459564, |
| "loss": 0.3179, |
| "step": 306000 |
| }, |
| { |
| "base_loss": 0.33415990057587625, |
| "epoch": 2.08392333984375, |
| "grad_norm": 0.0009415415697731078, |
| "learning_rate": 2.0769977569580078e-05, |
| "lookahead_loss": 6.42780497264862, |
| "loss": 0.3454, |
| "step": 306500 |
| }, |
| { |
| "base_loss": 0.31056174263358116, |
| "epoch": 2.0848770141601562, |
| "grad_norm": 0.000990306492894888, |
| "learning_rate": 2.0722293853759765e-05, |
| "lookahead_loss": 6.353310499668122, |
| "loss": 0.3208, |
| "step": 307000 |
| }, |
| { |
| "base_loss": 0.2973758824914694, |
| "epoch": 2.0858306884765625, |
| "grad_norm": 0.0010177840013056993, |
| "learning_rate": 2.0674610137939455e-05, |
| "lookahead_loss": 6.358139885902405, |
| "loss": 0.3087, |
| "step": 307500 |
| }, |
| { |
| "base_loss": 0.3042152850329876, |
| "epoch": 2.0867843627929688, |
| "grad_norm": 0.0011030308669432998, |
| "learning_rate": 2.0626926422119142e-05, |
| "lookahead_loss": 6.3280936369895935, |
| "loss": 0.3138, |
| "step": 308000 |
| }, |
| { |
| "base_loss": 0.337257578343153, |
| "epoch": 2.087738037109375, |
| "grad_norm": 0.000879972823895514, |
| "learning_rate": 2.057924270629883e-05, |
| "lookahead_loss": 6.373201508045197, |
| "loss": 0.344, |
| "step": 308500 |
| }, |
| { |
| "base_loss": 0.3000968562066555, |
| "epoch": 2.0886917114257812, |
| "grad_norm": 0.0010026989039033651, |
| "learning_rate": 2.0531558990478516e-05, |
| "lookahead_loss": 6.366145831108093, |
| "loss": 0.3114, |
| "step": 309000 |
| }, |
| { |
| "base_loss": 0.3103375973403454, |
| "epoch": 2.0896453857421875, |
| "grad_norm": 0.0009907458443194628, |
| "learning_rate": 2.0483875274658203e-05, |
| "lookahead_loss": 6.402014326095581, |
| "loss": 0.3184, |
| "step": 309500 |
| }, |
| { |
| "base_loss": 0.29991284269094465, |
| "epoch": 2.0905990600585938, |
| "grad_norm": 0.0009730191086418927, |
| "learning_rate": 2.0436191558837893e-05, |
| "lookahead_loss": 6.375398355484009, |
| "loss": 0.3119, |
| "step": 310000 |
| }, |
| { |
| "epoch": 2.0905990600585938, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.350209241477065, |
| "eval_lookahead_perplexity": 572.6125107670246, |
| "eval_loss": 0.14183588325977325, |
| "eval_perplexity": 1.1523875069356617, |
| "eval_runtime": 261.1778, |
| "eval_samples_per_second": 19.144, |
| "eval_steps_per_second": 0.601, |
| "step": 310000 |
| }, |
| { |
| "base_loss": 0.3000768061578274, |
| "epoch": 2.091552734375, |
| "grad_norm": 0.0008830556180328131, |
| "learning_rate": 2.038850784301758e-05, |
| "lookahead_loss": 6.33889298915863, |
| "loss": 0.3108, |
| "step": 310500 |
| }, |
| { |
| "base_loss": 0.31945454320311545, |
| "epoch": 2.0925064086914062, |
| "grad_norm": 0.0009592261631041765, |
| "learning_rate": 2.0340824127197266e-05, |
| "lookahead_loss": 6.384022113323212, |
| "loss": 0.3332, |
| "step": 311000 |
| }, |
| { |
| "base_loss": 0.309114942163229, |
| "epoch": 2.0934600830078125, |
| "grad_norm": 0.0008802940137684345, |
| "learning_rate": 2.0293140411376953e-05, |
| "lookahead_loss": 6.382145908355713, |
| "loss": 0.3196, |
| "step": 311500 |
| }, |
| { |
| "base_loss": 0.28753899577260017, |
| "epoch": 2.0944137573242188, |
| "grad_norm": 0.0009626175160519779, |
| "learning_rate": 2.024545669555664e-05, |
| "lookahead_loss": 6.367324014663696, |
| "loss": 0.3034, |
| "step": 312000 |
| }, |
| { |
| "base_loss": 0.29245217123627665, |
| "epoch": 2.095367431640625, |
| "grad_norm": 0.0009789286414161325, |
| "learning_rate": 2.019777297973633e-05, |
| "lookahead_loss": 6.304876070022583, |
| "loss": 0.3079, |
| "step": 312500 |
| }, |
| { |
| "base_loss": 0.30112256136536597, |
| "epoch": 2.0963211059570312, |
| "grad_norm": 0.0009424127638339996, |
| "learning_rate": 2.0150089263916017e-05, |
| "lookahead_loss": 6.374583042144775, |
| "loss": 0.3135, |
| "step": 313000 |
| }, |
| { |
| "base_loss": 0.3297825155258179, |
| "epoch": 2.0972747802734375, |
| "grad_norm": 0.0009768138406798244, |
| "learning_rate": 2.0102405548095704e-05, |
| "lookahead_loss": 6.39582384967804, |
| "loss": 0.341, |
| "step": 313500 |
| }, |
| { |
| "base_loss": 0.2911633634865284, |
| "epoch": 2.0982284545898438, |
| "grad_norm": 0.0009722594986669719, |
| "learning_rate": 2.005472183227539e-05, |
| "lookahead_loss": 6.326456391811371, |
| "loss": 0.3062, |
| "step": 314000 |
| }, |
| { |
| "base_loss": 0.2934698580801487, |
| "epoch": 2.09918212890625, |
| "grad_norm": 0.0009798408718779683, |
| "learning_rate": 2.0007038116455078e-05, |
| "lookahead_loss": 6.375765996932984, |
| "loss": 0.3097, |
| "step": 314500 |
| }, |
| { |
| "base_loss": 0.3032768616080284, |
| "epoch": 2.1001358032226562, |
| "grad_norm": 0.0009298865916207433, |
| "learning_rate": 1.9959354400634768e-05, |
| "lookahead_loss": 6.382933850288391, |
| "loss": 0.3149, |
| "step": 315000 |
| }, |
| { |
| "epoch": 2.1001358032226562, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.346465186189158, |
| "eval_lookahead_perplexity": 570.4726262907923, |
| "eval_loss": 0.14182765781879425, |
| "eval_perplexity": 1.1523780280792224, |
| "eval_runtime": 278.3923, |
| "eval_samples_per_second": 17.96, |
| "eval_steps_per_second": 0.564, |
| "step": 315000 |
| }, |
| { |
| "base_loss": 0.3233104472160339, |
| "epoch": 2.1010894775390625, |
| "grad_norm": 0.0009602688369341195, |
| "learning_rate": 1.9911670684814455e-05, |
| "lookahead_loss": 6.44511248588562, |
| "loss": 0.3332, |
| "step": 315500 |
| }, |
| { |
| "base_loss": 0.30158160945773127, |
| "epoch": 2.1020431518554688, |
| "grad_norm": 0.0009496202110312879, |
| "learning_rate": 1.986398696899414e-05, |
| "lookahead_loss": 6.3878337059021, |
| "loss": 0.3128, |
| "step": 316000 |
| }, |
| { |
| "base_loss": 0.29719595339894295, |
| "epoch": 2.102996826171875, |
| "grad_norm": 0.0009702611714601517, |
| "learning_rate": 1.9816303253173828e-05, |
| "lookahead_loss": 6.36890030002594, |
| "loss": 0.3127, |
| "step": 316500 |
| }, |
| { |
| "base_loss": 0.3008777514696121, |
| "epoch": 2.1039505004882812, |
| "grad_norm": 0.0009990332182496786, |
| "learning_rate": 1.9768619537353515e-05, |
| "lookahead_loss": 6.40038135433197, |
| "loss": 0.3116, |
| "step": 317000 |
| }, |
| { |
| "base_loss": 0.31517351168394087, |
| "epoch": 2.1049041748046875, |
| "grad_norm": 0.0009908730862662196, |
| "learning_rate": 1.9720935821533205e-05, |
| "lookahead_loss": 6.332096821784973, |
| "loss": 0.3277, |
| "step": 317500 |
| }, |
| { |
| "base_loss": 0.3079349631667137, |
| "epoch": 2.1058578491210938, |
| "grad_norm": 0.0009350811596959829, |
| "learning_rate": 1.9673252105712892e-05, |
| "lookahead_loss": 6.3365942516326905, |
| "loss": 0.3206, |
| "step": 318000 |
| }, |
| { |
| "base_loss": 0.29828172570466993, |
| "epoch": 2.1068115234375, |
| "grad_norm": 0.0009335639770142734, |
| "learning_rate": 1.962556838989258e-05, |
| "lookahead_loss": 6.408078939437866, |
| "loss": 0.3084, |
| "step": 318500 |
| }, |
| { |
| "base_loss": 0.29354105108976364, |
| "epoch": 2.1077651977539062, |
| "grad_norm": 0.0009805822046473622, |
| "learning_rate": 1.9577884674072266e-05, |
| "lookahead_loss": 6.328373380184174, |
| "loss": 0.306, |
| "step": 319000 |
| }, |
| { |
| "base_loss": 0.3199558552503586, |
| "epoch": 2.1087188720703125, |
| "grad_norm": 0.0010078581981360912, |
| "learning_rate": 1.9530200958251953e-05, |
| "lookahead_loss": 6.335598618030548, |
| "loss": 0.333, |
| "step": 319500 |
| }, |
| { |
| "base_loss": 0.31955078572034834, |
| "epoch": 2.1096725463867188, |
| "grad_norm": 0.0009100844035856426, |
| "learning_rate": 1.9482517242431643e-05, |
| "lookahead_loss": 6.297343758106232, |
| "loss": 0.3289, |
| "step": 320000 |
| }, |
| { |
| "epoch": 2.1096725463867188, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.342238569411988, |
| "eval_lookahead_perplexity": 568.066545490244, |
| "eval_loss": 0.14181943237781525, |
| "eval_perplexity": 1.1523685493007505, |
| "eval_runtime": 268.0733, |
| "eval_samples_per_second": 18.652, |
| "eval_steps_per_second": 0.586, |
| "step": 320000 |
| }, |
| { |
| "base_loss": 0.30168747982382776, |
| "epoch": 2.110626220703125, |
| "grad_norm": 0.0009170817211270332, |
| "learning_rate": 1.943483352661133e-05, |
| "lookahead_loss": 6.255009590625763, |
| "loss": 0.315, |
| "step": 320500 |
| }, |
| { |
| "base_loss": 0.29647291442751883, |
| "epoch": 2.1115798950195312, |
| "grad_norm": 0.0010039744665846229, |
| "learning_rate": 1.9387149810791016e-05, |
| "lookahead_loss": 6.370616749763489, |
| "loss": 0.3074, |
| "step": 321000 |
| }, |
| { |
| "base_loss": 0.3107553372234106, |
| "epoch": 2.1125335693359375, |
| "grad_norm": 0.0009533903794363141, |
| "learning_rate": 1.9339466094970703e-05, |
| "lookahead_loss": 6.388658571243286, |
| "loss": 0.3221, |
| "step": 321500 |
| }, |
| { |
| "base_loss": 0.34275346267223356, |
| "epoch": 2.1134872436523438, |
| "grad_norm": 0.0010111962910741568, |
| "learning_rate": 1.929178237915039e-05, |
| "lookahead_loss": 6.398144736289978, |
| "loss": 0.3506, |
| "step": 322000 |
| }, |
| { |
| "base_loss": 0.29299941608309743, |
| "epoch": 2.11444091796875, |
| "grad_norm": 0.0009360475232824683, |
| "learning_rate": 1.924409866333008e-05, |
| "lookahead_loss": 6.286139685630799, |
| "loss": 0.3075, |
| "step": 322500 |
| }, |
| { |
| "base_loss": 0.29959108304977417, |
| "epoch": 2.1153945922851562, |
| "grad_norm": 0.0009626513347029686, |
| "learning_rate": 1.9196414947509767e-05, |
| "lookahead_loss": 6.344089280605316, |
| "loss": 0.3104, |
| "step": 323000 |
| }, |
| { |
| "base_loss": 0.3146350940167904, |
| "epoch": 2.1163482666015625, |
| "grad_norm": 0.0010095579782500863, |
| "learning_rate": 1.9148731231689454e-05, |
| "lookahead_loss": 6.34140051651001, |
| "loss": 0.3239, |
| "step": 323500 |
| }, |
| { |
| "base_loss": 0.32594672916829587, |
| "epoch": 2.1173019409179688, |
| "grad_norm": 0.0009781294502317905, |
| "learning_rate": 1.910104751586914e-05, |
| "lookahead_loss": 6.33776809501648, |
| "loss": 0.3422, |
| "step": 324000 |
| }, |
| { |
| "base_loss": 0.3083632712960243, |
| "epoch": 2.118255615234375, |
| "grad_norm": 0.0010100657818838954, |
| "learning_rate": 1.9053363800048828e-05, |
| "lookahead_loss": 6.3736692142486575, |
| "loss": 0.3159, |
| "step": 324500 |
| }, |
| { |
| "base_loss": 0.30149356806278227, |
| "epoch": 2.1192092895507812, |
| "grad_norm": 0.000946766056586057, |
| "learning_rate": 1.9005680084228518e-05, |
| "lookahead_loss": 6.3695377283096315, |
| "loss": 0.3111, |
| "step": 325000 |
| }, |
| { |
| "epoch": 2.1192092895507812, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.338089135508187, |
| "eval_lookahead_perplexity": 565.7142745781854, |
| "eval_loss": 0.14181137084960938, |
| "eval_perplexity": 1.1523592594866319, |
| "eval_runtime": 257.0144, |
| "eval_samples_per_second": 19.454, |
| "eval_steps_per_second": 0.611, |
| "step": 325000 |
| }, |
| { |
| "base_loss": 0.308347962975502, |
| "epoch": 3.0009536743164062, |
| "grad_norm": 0.0009298936929553747, |
| "learning_rate": 1.8957996368408205e-05, |
| "lookahead_loss": 6.413288908958435, |
| "loss": 0.3164, |
| "step": 325500 |
| }, |
| { |
| "base_loss": 0.3005163077712059, |
| "epoch": 3.0019073486328125, |
| "grad_norm": 0.0010061148786917329, |
| "learning_rate": 1.891031265258789e-05, |
| "lookahead_loss": 6.252055767536163, |
| "loss": 0.3131, |
| "step": 326000 |
| }, |
| { |
| "base_loss": 0.3118715011179447, |
| "epoch": 3.0028610229492188, |
| "grad_norm": 0.001017007976770401, |
| "learning_rate": 1.8862628936767578e-05, |
| "lookahead_loss": 6.243941001415252, |
| "loss": 0.3221, |
| "step": 326500 |
| }, |
| { |
| "base_loss": 0.3241955025494099, |
| "epoch": 3.003814697265625, |
| "grad_norm": 0.0009476043051108718, |
| "learning_rate": 1.8814945220947265e-05, |
| "lookahead_loss": 6.282772970199585, |
| "loss": 0.3347, |
| "step": 327000 |
| }, |
| { |
| "base_loss": 0.3020790828168392, |
| "epoch": 3.0047683715820312, |
| "grad_norm": 0.0009582182974554598, |
| "learning_rate": 1.8767261505126955e-05, |
| "lookahead_loss": 6.2533429822921756, |
| "loss": 0.3166, |
| "step": 327500 |
| }, |
| { |
| "base_loss": 0.3016543593108654, |
| "epoch": 3.0057220458984375, |
| "grad_norm": 0.0008436063071712852, |
| "learning_rate": 1.8719577789306642e-05, |
| "lookahead_loss": 6.375606914520263, |
| "loss": 0.3119, |
| "step": 328000 |
| }, |
| { |
| "base_loss": 0.29849619832634927, |
| "epoch": 3.0066757202148438, |
| "grad_norm": 0.0009330453467555344, |
| "learning_rate": 1.867189407348633e-05, |
| "lookahead_loss": 6.2344167790412905, |
| "loss": 0.3128, |
| "step": 328500 |
| }, |
| { |
| "base_loss": 0.31336222241818906, |
| "epoch": 3.00762939453125, |
| "grad_norm": 0.0009829691844061017, |
| "learning_rate": 1.8624210357666016e-05, |
| "lookahead_loss": 6.291501372814179, |
| "loss": 0.3234, |
| "step": 329000 |
| }, |
| { |
| "base_loss": 0.316298152923584, |
| "epoch": 3.0085830688476562, |
| "grad_norm": 0.0009287581779062748, |
| "learning_rate": 1.8576526641845703e-05, |
| "lookahead_loss": 6.2824432291984555, |
| "loss": 0.3224, |
| "step": 329500 |
| }, |
| { |
| "base_loss": 0.3036236027777195, |
| "epoch": 3.0095367431640625, |
| "grad_norm": 0.000979596166871488, |
| "learning_rate": 1.8528842926025393e-05, |
| "lookahead_loss": 6.3004652094841, |
| "loss": 0.3179, |
| "step": 330000 |
| }, |
| { |
| "epoch": 3.0095367431640625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.334081890484015, |
| "eval_lookahead_perplexity": 563.4518549287164, |
| "eval_loss": 0.1418035328388214, |
| "eval_perplexity": 1.1523502273177215, |
| "eval_runtime": 268.9824, |
| "eval_samples_per_second": 18.589, |
| "eval_steps_per_second": 0.584, |
| "step": 330000 |
| }, |
| { |
| "base_loss": 0.30253981775045397, |
| "epoch": 3.0104904174804688, |
| "grad_norm": 0.0009402994182892144, |
| "learning_rate": 1.848115921020508e-05, |
| "lookahead_loss": 6.262249879837036, |
| "loss": 0.3119, |
| "step": 330500 |
| }, |
| { |
| "base_loss": 0.30147307565808296, |
| "epoch": 3.011444091796875, |
| "grad_norm": 0.0010017170570790768, |
| "learning_rate": 1.8433475494384766e-05, |
| "lookahead_loss": 6.274536405563355, |
| "loss": 0.3139, |
| "step": 331000 |
| }, |
| { |
| "base_loss": 0.32504893574118615, |
| "epoch": 3.0123977661132812, |
| "grad_norm": 0.0008727677050046623, |
| "learning_rate": 1.8385791778564453e-05, |
| "lookahead_loss": 6.272889457702637, |
| "loss": 0.335, |
| "step": 331500 |
| }, |
| { |
| "base_loss": 0.3044933348596096, |
| "epoch": 3.0133514404296875, |
| "grad_norm": 0.0009206495014950633, |
| "learning_rate": 1.833810806274414e-05, |
| "lookahead_loss": 6.381618105888367, |
| "loss": 0.3184, |
| "step": 332000 |
| }, |
| { |
| "base_loss": 0.29874410527944567, |
| "epoch": 3.0143051147460938, |
| "grad_norm": 0.0009790639160200953, |
| "learning_rate": 1.829042434692383e-05, |
| "lookahead_loss": 6.30400173664093, |
| "loss": 0.3137, |
| "step": 332500 |
| }, |
| { |
| "base_loss": 0.2969954553842545, |
| "epoch": 3.0152587890625, |
| "grad_norm": 0.0009097548900172114, |
| "learning_rate": 1.8242740631103517e-05, |
| "lookahead_loss": 6.282666071414948, |
| "loss": 0.3061, |
| "step": 333000 |
| }, |
| { |
| "base_loss": 0.30927150130271913, |
| "epoch": 3.0162124633789062, |
| "grad_norm": 0.0009808189934119582, |
| "learning_rate": 1.8195056915283204e-05, |
| "lookahead_loss": 6.3133458938598634, |
| "loss": 0.322, |
| "step": 333500 |
| }, |
| { |
| "base_loss": 0.3112277380824089, |
| "epoch": 3.0171661376953125, |
| "grad_norm": 0.0009308747248724103, |
| "learning_rate": 1.814737319946289e-05, |
| "lookahead_loss": 6.369205039024353, |
| "loss": 0.3221, |
| "step": 334000 |
| }, |
| { |
| "base_loss": 0.300263544857502, |
| "epoch": 3.0181198120117188, |
| "grad_norm": 0.0009227790287695825, |
| "learning_rate": 1.8099689483642578e-05, |
| "lookahead_loss": 6.362029777526855, |
| "loss": 0.312, |
| "step": 334500 |
| }, |
| { |
| "base_loss": 0.29675460466742515, |
| "epoch": 3.019073486328125, |
| "grad_norm": 0.0009541187318973243, |
| "learning_rate": 1.8052005767822268e-05, |
| "lookahead_loss": 6.378857530593872, |
| "loss": 0.3094, |
| "step": 335000 |
| }, |
| { |
| "epoch": 3.019073486328125, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.329803198671189, |
| "eval_lookahead_perplexity": 561.046168355948, |
| "eval_loss": 0.14179490506649017, |
| "eval_perplexity": 1.1523402851452038, |
| "eval_runtime": 295.7066, |
| "eval_samples_per_second": 16.909, |
| "eval_steps_per_second": 0.531, |
| "step": 335000 |
| }, |
| { |
| "base_loss": 0.30056491792201995, |
| "epoch": 3.0200271606445312, |
| "grad_norm": 0.0010459490586072206, |
| "learning_rate": 1.8004322052001955e-05, |
| "lookahead_loss": 6.219315876483917, |
| "loss": 0.3149, |
| "step": 335500 |
| }, |
| { |
| "base_loss": 0.3270763371884823, |
| "epoch": 3.0209808349609375, |
| "grad_norm": 0.0009397394605912268, |
| "learning_rate": 1.795663833618164e-05, |
| "lookahead_loss": 6.310785099029541, |
| "loss": 0.3388, |
| "step": 336000 |
| }, |
| { |
| "base_loss": 0.3038404151797295, |
| "epoch": 3.0219345092773438, |
| "grad_norm": 0.0009903458412736654, |
| "learning_rate": 1.7908954620361328e-05, |
| "lookahead_loss": 6.267493858814239, |
| "loss": 0.3142, |
| "step": 336500 |
| }, |
| { |
| "base_loss": 0.29968900653719904, |
| "epoch": 3.02288818359375, |
| "grad_norm": 0.0009887177729979157, |
| "learning_rate": 1.7861270904541015e-05, |
| "lookahead_loss": 6.31684240436554, |
| "loss": 0.3122, |
| "step": 337000 |
| }, |
| { |
| "base_loss": 0.3010028195679188, |
| "epoch": 3.0238418579101562, |
| "grad_norm": 0.0009739255765452981, |
| "learning_rate": 1.7813587188720705e-05, |
| "lookahead_loss": 6.265827451705933, |
| "loss": 0.3125, |
| "step": 337500 |
| }, |
| { |
| "base_loss": 0.32444008192420004, |
| "epoch": 3.0247955322265625, |
| "grad_norm": 0.0009061110904440284, |
| "learning_rate": 1.7765903472900392e-05, |
| "lookahead_loss": 6.279670271396637, |
| "loss": 0.3341, |
| "step": 338000 |
| }, |
| { |
| "base_loss": 0.30922784996032715, |
| "epoch": 3.0257492065429688, |
| "grad_norm": 0.0009870253270491958, |
| "learning_rate": 1.771821975708008e-05, |
| "lookahead_loss": 6.245859955787659, |
| "loss": 0.3225, |
| "step": 338500 |
| }, |
| { |
| "base_loss": 0.30224511262774467, |
| "epoch": 3.026702880859375, |
| "grad_norm": 0.0009940272429957986, |
| "learning_rate": 1.7670536041259766e-05, |
| "lookahead_loss": 6.2585461874008175, |
| "loss": 0.3115, |
| "step": 339000 |
| }, |
| { |
| "base_loss": 0.3063249331712723, |
| "epoch": 3.0276565551757812, |
| "grad_norm": 0.0009431723156012595, |
| "learning_rate": 1.7622852325439453e-05, |
| "lookahead_loss": 6.363879596710205, |
| "loss": 0.3174, |
| "step": 339500 |
| }, |
| { |
| "base_loss": 0.32986500787734985, |
| "epoch": 3.0286102294921875, |
| "grad_norm": 0.0009309691959060729, |
| "learning_rate": 1.7575168609619143e-05, |
| "lookahead_loss": 6.401720482826233, |
| "loss": 0.3396, |
| "step": 340000 |
| }, |
| { |
| "epoch": 3.0286102294921875, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.325850799060858, |
| "eval_lookahead_perplexity": 558.8330661119743, |
| "eval_loss": 0.1417873352766037, |
| "eval_perplexity": 1.152331562204383, |
| "eval_runtime": 270.5286, |
| "eval_samples_per_second": 18.482, |
| "eval_steps_per_second": 0.58, |
| "step": 340000 |
| }, |
| { |
| "base_loss": 0.3042540407180786, |
| "epoch": 3.0295639038085938, |
| "grad_norm": 0.0009302702965214849, |
| "learning_rate": 1.752748489379883e-05, |
| "lookahead_loss": 6.327819796562195, |
| "loss": 0.3128, |
| "step": 340500 |
| }, |
| { |
| "base_loss": 0.3056684481501579, |
| "epoch": 3.030517578125, |
| "grad_norm": 0.0009667924023233354, |
| "learning_rate": 1.7479801177978516e-05, |
| "lookahead_loss": 6.319185664653778, |
| "loss": 0.3153, |
| "step": 341000 |
| }, |
| { |
| "base_loss": 0.30326661148667333, |
| "epoch": 3.0314712524414062, |
| "grad_norm": 0.000947739346884191, |
| "learning_rate": 1.7432117462158203e-05, |
| "lookahead_loss": 6.31995133113861, |
| "loss": 0.3148, |
| "step": 341500 |
| }, |
| { |
| "base_loss": 0.31433943542838094, |
| "epoch": 3.0324249267578125, |
| "grad_norm": 0.0009427520562894642, |
| "learning_rate": 1.738443374633789e-05, |
| "lookahead_loss": 6.287163452148437, |
| "loss": 0.3335, |
| "step": 342000 |
| }, |
| { |
| "base_loss": 0.3052255228161812, |
| "epoch": 3.0333786010742188, |
| "grad_norm": 0.0010005695512518287, |
| "learning_rate": 1.733675003051758e-05, |
| "lookahead_loss": 6.3287235207557675, |
| "loss": 0.3154, |
| "step": 342500 |
| }, |
| { |
| "base_loss": 0.304299351811409, |
| "epoch": 3.034332275390625, |
| "grad_norm": 0.0008999764686450362, |
| "learning_rate": 1.7289066314697267e-05, |
| "lookahead_loss": 6.394580018997193, |
| "loss": 0.316, |
| "step": 343000 |
| }, |
| { |
| "base_loss": 0.3093027866780758, |
| "epoch": 3.0352859497070312, |
| "grad_norm": 0.001007239567115903, |
| "learning_rate": 1.7241382598876954e-05, |
| "lookahead_loss": 6.223367876529694, |
| "loss": 0.3212, |
| "step": 343500 |
| }, |
| { |
| "base_loss": 0.3244023490846157, |
| "epoch": 3.0362396240234375, |
| "grad_norm": 0.0009588600951246917, |
| "learning_rate": 1.719369888305664e-05, |
| "lookahead_loss": 6.34389029598236, |
| "loss": 0.3373, |
| "step": 344000 |
| }, |
| { |
| "base_loss": 0.30684786412119863, |
| "epoch": 3.0371932983398438, |
| "grad_norm": 0.0009947418002411723, |
| "learning_rate": 1.7146015167236328e-05, |
| "lookahead_loss": 6.284828133106232, |
| "loss": 0.3188, |
| "step": 344500 |
| }, |
| { |
| "base_loss": 0.30203218227624895, |
| "epoch": 3.03814697265625, |
| "grad_norm": 0.0009736179490573704, |
| "learning_rate": 1.7098331451416018e-05, |
| "lookahead_loss": 6.32041801738739, |
| "loss": 0.314, |
| "step": 345000 |
| }, |
| { |
| "epoch": 3.03814697265625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.322265743828429, |
| "eval_lookahead_perplexity": 556.8332056513128, |
| "eval_loss": 0.1417807936668396, |
| "eval_perplexity": 1.1523240241256398, |
| "eval_runtime": 265.1579, |
| "eval_samples_per_second": 18.857, |
| "eval_steps_per_second": 0.592, |
| "step": 345000 |
| }, |
| { |
| "base_loss": 0.30594835013151167, |
| "epoch": 3.0391006469726562, |
| "grad_norm": 0.0009133536368608475, |
| "learning_rate": 1.7050647735595705e-05, |
| "lookahead_loss": 6.269966838359832, |
| "loss": 0.319, |
| "step": 345500 |
| }, |
| { |
| "base_loss": 0.3232807823717594, |
| "epoch": 3.0400543212890625, |
| "grad_norm": 0.0009371156920678914, |
| "learning_rate": 1.700296401977539e-05, |
| "lookahead_loss": 6.320635152816773, |
| "loss": 0.3325, |
| "step": 346000 |
| }, |
| { |
| "base_loss": 0.30651690036058427, |
| "epoch": 3.0410079956054688, |
| "grad_norm": 0.000969213608186692, |
| "learning_rate": 1.6955280303955078e-05, |
| "lookahead_loss": 6.247298479557037, |
| "loss": 0.3148, |
| "step": 346500 |
| }, |
| { |
| "base_loss": 0.2957771936655045, |
| "epoch": 3.041961669921875, |
| "grad_norm": 0.0009626311366446316, |
| "learning_rate": 1.6907596588134765e-05, |
| "lookahead_loss": 6.325569787979126, |
| "loss": 0.3075, |
| "step": 347000 |
| }, |
| { |
| "base_loss": 0.310648419380188, |
| "epoch": 3.0429153442382812, |
| "grad_norm": 0.00098798715043813, |
| "learning_rate": 1.6859912872314455e-05, |
| "lookahead_loss": 6.343762699604034, |
| "loss": 0.3249, |
| "step": 347500 |
| }, |
| { |
| "base_loss": 0.3259735953062773, |
| "epoch": 3.0438690185546875, |
| "grad_norm": 0.0009963024640455842, |
| "learning_rate": 1.6812229156494142e-05, |
| "lookahead_loss": 6.38560670042038, |
| "loss": 0.3417, |
| "step": 348000 |
| }, |
| { |
| "base_loss": 0.295075288772583, |
| "epoch": 3.0448226928710938, |
| "grad_norm": 0.0009655926842242479, |
| "learning_rate": 1.676454544067383e-05, |
| "lookahead_loss": 6.27928000164032, |
| "loss": 0.31, |
| "step": 348500 |
| }, |
| { |
| "base_loss": 0.3027168534696102, |
| "epoch": 3.0457763671875, |
| "grad_norm": 0.0009574743453413248, |
| "learning_rate": 1.6716861724853516e-05, |
| "lookahead_loss": 6.29035155916214, |
| "loss": 0.3156, |
| "step": 349000 |
| }, |
| { |
| "base_loss": 0.33181600126624106, |
| "epoch": 3.0467300415039062, |
| "grad_norm": 0.0009480383596383035, |
| "learning_rate": 1.6669178009033203e-05, |
| "lookahead_loss": 6.259274421691894, |
| "loss": 0.3397, |
| "step": 349500 |
| }, |
| { |
| "base_loss": 0.3247646952867508, |
| "epoch": 3.0476837158203125, |
| "grad_norm": 0.001015504589304328, |
| "learning_rate": 1.6621494293212893e-05, |
| "lookahead_loss": 6.312367619514466, |
| "loss": 0.3392, |
| "step": 350000 |
| }, |
| { |
| "epoch": 3.0476837158203125, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.319038366738219, |
| "eval_lookahead_perplexity": 555.0389917800691, |
| "eval_loss": 0.14177383482456207, |
| "eval_perplexity": 1.1523160053124042, |
| "eval_runtime": 258.8888, |
| "eval_samples_per_second": 19.313, |
| "eval_steps_per_second": 0.606, |
| "step": 350000 |
| }, |
| { |
| "base_loss": 0.29319588682055475, |
| "epoch": 3.0486373901367188, |
| "grad_norm": 0.0009484434267506003, |
| "learning_rate": 1.657381057739258e-05, |
| "lookahead_loss": 6.280442509651184, |
| "loss": 0.3075, |
| "step": 350500 |
| }, |
| { |
| "base_loss": 0.3029366814792156, |
| "epoch": 3.049591064453125, |
| "grad_norm": 0.0009550508693791926, |
| "learning_rate": 1.6526126861572266e-05, |
| "lookahead_loss": 6.254170268058777, |
| "loss": 0.316, |
| "step": 351000 |
| }, |
| { |
| "base_loss": 0.32044321012496946, |
| "epoch": 3.0505447387695312, |
| "grad_norm": 0.0009002664592117071, |
| "learning_rate": 1.6478443145751953e-05, |
| "lookahead_loss": 6.356799177169799, |
| "loss": 0.3322, |
| "step": 351500 |
| }, |
| { |
| "base_loss": 0.3040877487659454, |
| "epoch": 3.0514984130859375, |
| "grad_norm": 0.0009919478325173259, |
| "learning_rate": 1.643075942993164e-05, |
| "lookahead_loss": 6.317004768371582, |
| "loss": 0.3172, |
| "step": 352000 |
| }, |
| { |
| "base_loss": 0.30488765078783037, |
| "epoch": 3.0524520874023438, |
| "grad_norm": 0.0010407265508547425, |
| "learning_rate": 1.638307571411133e-05, |
| "lookahead_loss": 6.25145507478714, |
| "loss": 0.3177, |
| "step": 352500 |
| }, |
| { |
| "base_loss": 0.32170946165919306, |
| "epoch": 3.05340576171875, |
| "grad_norm": 0.0009659235365688801, |
| "learning_rate": 1.6335391998291017e-05, |
| "lookahead_loss": 6.258784490585327, |
| "loss": 0.3297, |
| "step": 353000 |
| }, |
| { |
| "base_loss": 0.3580245627462864, |
| "epoch": 3.0543594360351562, |
| "grad_norm": 0.0009487209608778358, |
| "learning_rate": 1.6287708282470704e-05, |
| "lookahead_loss": 6.3121585865020755, |
| "loss": 0.3715, |
| "step": 353500 |
| }, |
| { |
| "base_loss": 0.29618824023008344, |
| "epoch": 3.0553131103515625, |
| "grad_norm": 0.0009817008394747972, |
| "learning_rate": 1.624002456665039e-05, |
| "lookahead_loss": 6.299326305389404, |
| "loss": 0.3075, |
| "step": 354000 |
| }, |
| { |
| "base_loss": 0.30493036335706714, |
| "epoch": 3.0562667846679688, |
| "grad_norm": 0.000914248637855053, |
| "learning_rate": 1.6192340850830078e-05, |
| "lookahead_loss": 6.316375274181366, |
| "loss": 0.3167, |
| "step": 354500 |
| }, |
| { |
| "base_loss": 0.318002614736557, |
| "epoch": 3.057220458984375, |
| "grad_norm": 0.000944992178119719, |
| "learning_rate": 1.6144657135009768e-05, |
| "lookahead_loss": 6.3369423885345455, |
| "loss": 0.3319, |
| "step": 355000 |
| }, |
| { |
| "epoch": 3.057220458984375, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.315732623822392, |
| "eval_lookahead_perplexity": 553.207204941304, |
| "eval_loss": 0.14176736772060394, |
| "eval_perplexity": 1.1523085531891022, |
| "eval_runtime": 271.1413, |
| "eval_samples_per_second": 18.441, |
| "eval_steps_per_second": 0.579, |
| "step": 355000 |
| }, |
| { |
| "base_loss": 0.3197241204380989, |
| "epoch": 3.0581741333007812, |
| "grad_norm": 0.0009679922368377447, |
| "learning_rate": 1.6096973419189455e-05, |
| "lookahead_loss": 6.325226506233215, |
| "loss": 0.3298, |
| "step": 355500 |
| }, |
| { |
| "base_loss": 0.2939224536716938, |
| "epoch": 3.0591278076171875, |
| "grad_norm": 0.0009332736954092979, |
| "learning_rate": 1.604928970336914e-05, |
| "lookahead_loss": 6.236564951896668, |
| "loss": 0.3079, |
| "step": 356000 |
| }, |
| { |
| "base_loss": 0.30190801098942754, |
| "epoch": 3.0600814819335938, |
| "grad_norm": 0.0009871459333226085, |
| "learning_rate": 1.6001605987548828e-05, |
| "lookahead_loss": 6.2921178107261655, |
| "loss": 0.3148, |
| "step": 356500 |
| }, |
| { |
| "base_loss": 0.3212427071630955, |
| "epoch": 3.06103515625, |
| "grad_norm": 0.0009790301555767655, |
| "learning_rate": 1.5953922271728515e-05, |
| "lookahead_loss": 6.265547143936157, |
| "loss": 0.3313, |
| "step": 357000 |
| }, |
| { |
| "base_loss": 0.3071871542930603, |
| "epoch": 3.0619888305664062, |
| "grad_norm": 0.001000607735477388, |
| "learning_rate": 1.5906238555908205e-05, |
| "lookahead_loss": 6.265981761932373, |
| "loss": 0.3169, |
| "step": 357500 |
| }, |
| { |
| "base_loss": 0.3058412022292614, |
| "epoch": 3.0629425048828125, |
| "grad_norm": 0.0009778927778825164, |
| "learning_rate": 1.5858554840087892e-05, |
| "lookahead_loss": 6.28272382068634, |
| "loss": 0.3154, |
| "step": 358000 |
| }, |
| { |
| "base_loss": 0.31580124926567077, |
| "epoch": 3.0638961791992188, |
| "grad_norm": 0.0009540447499603033, |
| "learning_rate": 1.581087112426758e-05, |
| "lookahead_loss": 6.302886658191681, |
| "loss": 0.329, |
| "step": 358500 |
| }, |
| { |
| "base_loss": 0.30067266592383385, |
| "epoch": 3.064849853515625, |
| "grad_norm": 0.000962612044531852, |
| "learning_rate": 1.5763187408447266e-05, |
| "lookahead_loss": 6.3211470890045165, |
| "loss": 0.3147, |
| "step": 359000 |
| }, |
| { |
| "base_loss": 0.3075073702633381, |
| "epoch": 3.0658035278320312, |
| "grad_norm": 0.0009687106939963996, |
| "learning_rate": 1.5715503692626953e-05, |
| "lookahead_loss": 6.239695079803467, |
| "loss": 0.3197, |
| "step": 359500 |
| }, |
| { |
| "base_loss": 0.30778184497356414, |
| "epoch": 3.0667572021484375, |
| "grad_norm": 0.0009693547617644072, |
| "learning_rate": 1.5667819976806643e-05, |
| "lookahead_loss": 6.236202743053436, |
| "loss": 0.3169, |
| "step": 360000 |
| }, |
| { |
| "epoch": 3.0667572021484375, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.312184997259999, |
| "eval_lookahead_perplexity": 551.2481094907844, |
| "eval_loss": 0.14176103472709656, |
| "eval_perplexity": 1.152301255649624, |
| "eval_runtime": 269.9401, |
| "eval_samples_per_second": 18.523, |
| "eval_steps_per_second": 0.582, |
| "step": 360000 |
| }, |
| { |
| "base_loss": 0.32953174033761023, |
| "epoch": 3.0677108764648438, |
| "grad_norm": 0.0010309175122529268, |
| "learning_rate": 1.562013626098633e-05, |
| "lookahead_loss": 6.302441148757935, |
| "loss": 0.3414, |
| "step": 360500 |
| }, |
| { |
| "base_loss": 0.3003321154117584, |
| "epoch": 3.06866455078125, |
| "grad_norm": 0.0010074395686388016, |
| "learning_rate": 1.5572452545166016e-05, |
| "lookahead_loss": 6.279360107421875, |
| "loss": 0.3096, |
| "step": 361000 |
| }, |
| { |
| "base_loss": 0.3017352370470762, |
| "epoch": 3.0696182250976562, |
| "grad_norm": 0.0009648427367210388, |
| "learning_rate": 1.5524768829345703e-05, |
| "lookahead_loss": 6.320928843975067, |
| "loss": 0.3148, |
| "step": 361500 |
| }, |
| { |
| "base_loss": 0.3461217338144779, |
| "epoch": 3.0705718994140625, |
| "grad_norm": 0.0009705660049803555, |
| "learning_rate": 1.547708511352539e-05, |
| "lookahead_loss": 6.2096345415115355, |
| "loss": 0.3589, |
| "step": 362000 |
| }, |
| { |
| "base_loss": 0.3151253694295883, |
| "epoch": 3.0715255737304688, |
| "grad_norm": 0.0009764889837242663, |
| "learning_rate": 1.542940139770508e-05, |
| "lookahead_loss": 6.254361331939697, |
| "loss": 0.3252, |
| "step": 362500 |
| }, |
| { |
| "base_loss": 0.3069308316111565, |
| "epoch": 3.072479248046875, |
| "grad_norm": 0.0009667645208537579, |
| "learning_rate": 1.5381717681884767e-05, |
| "lookahead_loss": 6.327083124160767, |
| "loss": 0.318, |
| "step": 363000 |
| }, |
| { |
| "base_loss": 0.30406438249349593, |
| "epoch": 3.0734329223632812, |
| "grad_norm": 0.0009305006824433804, |
| "learning_rate": 1.5334033966064454e-05, |
| "lookahead_loss": 6.285885016918183, |
| "loss": 0.317, |
| "step": 363500 |
| }, |
| { |
| "base_loss": 0.32663529852032663, |
| "epoch": 3.0743865966796875, |
| "grad_norm": 0.0009700483060441911, |
| "learning_rate": 1.528635025024414e-05, |
| "lookahead_loss": 6.314631988525391, |
| "loss": 0.3395, |
| "step": 364000 |
| }, |
| { |
| "base_loss": 0.30363860473036763, |
| "epoch": 3.0753402709960938, |
| "grad_norm": 0.0009474587277509272, |
| "learning_rate": 1.523866653442383e-05, |
| "lookahead_loss": 6.379505019664764, |
| "loss": 0.3149, |
| "step": 364500 |
| }, |
| { |
| "base_loss": 0.30343541222810744, |
| "epoch": 3.0762939453125, |
| "grad_norm": 0.0009159389301203191, |
| "learning_rate": 1.5190982818603516e-05, |
| "lookahead_loss": 6.336886539459228, |
| "loss": 0.3166, |
| "step": 365000 |
| }, |
| { |
| "epoch": 3.0762939453125, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.308753397518073, |
| "eval_lookahead_perplexity": 549.3596886259452, |
| "eval_loss": 0.14175455272197723, |
| "eval_perplexity": 1.1522937864511935, |
| "eval_runtime": 258.1196, |
| "eval_samples_per_second": 19.371, |
| "eval_steps_per_second": 0.608, |
| "step": 365000 |
| }, |
| { |
| "base_loss": 0.32582479974627493, |
| "epoch": 3.0772476196289062, |
| "grad_norm": 0.0009780022082850337, |
| "learning_rate": 1.5143299102783205e-05, |
| "lookahead_loss": 6.3267164087295535, |
| "loss": 0.3404, |
| "step": 365500 |
| }, |
| { |
| "base_loss": 0.30336130890250207, |
| "epoch": 3.0782012939453125, |
| "grad_norm": 0.000988679938018322, |
| "learning_rate": 1.5095615386962891e-05, |
| "lookahead_loss": 6.332203316688537, |
| "loss": 0.3175, |
| "step": 366000 |
| }, |
| { |
| "base_loss": 0.29930157062411306, |
| "epoch": 3.0791549682617188, |
| "grad_norm": 0.0009294356568716466, |
| "learning_rate": 1.5047931671142578e-05, |
| "lookahead_loss": 6.334248623847961, |
| "loss": 0.3128, |
| "step": 366500 |
| }, |
| { |
| "base_loss": 0.30971532610058783, |
| "epoch": 3.080108642578125, |
| "grad_norm": 0.0009678815258666873, |
| "learning_rate": 1.5000247955322267e-05, |
| "lookahead_loss": 6.314932657241822, |
| "loss": 0.3278, |
| "step": 367000 |
| }, |
| { |
| "base_loss": 0.3241646741628647, |
| "epoch": 3.0810623168945312, |
| "grad_norm": 0.0009766423609107733, |
| "learning_rate": 1.4952564239501954e-05, |
| "lookahead_loss": 6.3299585676193235, |
| "loss": 0.3401, |
| "step": 367500 |
| }, |
| { |
| "base_loss": 0.3009178417623043, |
| "epoch": 3.0820159912109375, |
| "grad_norm": 0.0008978794794529676, |
| "learning_rate": 1.4904880523681642e-05, |
| "lookahead_loss": 6.32042356300354, |
| "loss": 0.313, |
| "step": 368000 |
| }, |
| { |
| "base_loss": 0.3062888396978378, |
| "epoch": 3.0829696655273438, |
| "grad_norm": 0.0009827233152464032, |
| "learning_rate": 1.4857196807861329e-05, |
| "lookahead_loss": 6.378310326099395, |
| "loss": 0.3195, |
| "step": 368500 |
| }, |
| { |
| "base_loss": 0.33434281674027444, |
| "epoch": 3.08392333984375, |
| "grad_norm": 0.000925905245821923, |
| "learning_rate": 1.4809513092041016e-05, |
| "lookahead_loss": 6.374326509475708, |
| "loss": 0.345, |
| "step": 369000 |
| }, |
| { |
| "base_loss": 0.3093685868382454, |
| "epoch": 3.0848770141601562, |
| "grad_norm": 0.0009787451708689332, |
| "learning_rate": 1.4761829376220704e-05, |
| "lookahead_loss": 6.290310836315155, |
| "loss": 0.3189, |
| "step": 369500 |
| }, |
| { |
| "base_loss": 0.2982211470901966, |
| "epoch": 3.0858306884765625, |
| "grad_norm": 0.000987353385426104, |
| "learning_rate": 1.4714145660400391e-05, |
| "lookahead_loss": 6.305355979442597, |
| "loss": 0.3087, |
| "step": 370000 |
| }, |
| { |
| "epoch": 3.0858306884765625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.305268980062808, |
| "eval_lookahead_perplexity": 547.448821200929, |
| "eval_loss": 0.14174824953079224, |
| "eval_perplexity": 1.1522865233460466, |
| "eval_runtime": 275.1775, |
| "eval_samples_per_second": 18.17, |
| "eval_steps_per_second": 0.571, |
| "step": 370000 |
| }, |
| { |
| "base_loss": 0.30163490331172943, |
| "epoch": 3.0867843627929688, |
| "grad_norm": 0.001101371948607266, |
| "learning_rate": 1.466646194458008e-05, |
| "lookahead_loss": 6.271534814834594, |
| "loss": 0.313, |
| "step": 370500 |
| }, |
| { |
| "base_loss": 0.33820600137114526, |
| "epoch": 3.087738037109375, |
| "grad_norm": 0.0009093422559089959, |
| "learning_rate": 1.4618778228759766e-05, |
| "lookahead_loss": 6.3157040328979495, |
| "loss": 0.344, |
| "step": 371000 |
| }, |
| { |
| "base_loss": 0.30079344487190246, |
| "epoch": 3.0886917114257812, |
| "grad_norm": 0.0010120077058672905, |
| "learning_rate": 1.4571094512939453e-05, |
| "lookahead_loss": 6.31459610080719, |
| "loss": 0.3135, |
| "step": 371500 |
| }, |
| { |
| "base_loss": 0.3107509427666664, |
| "epoch": 3.0896453857421875, |
| "grad_norm": 0.0009789596078917384, |
| "learning_rate": 1.4523410797119142e-05, |
| "lookahead_loss": 6.341436826229096, |
| "loss": 0.3187, |
| "step": 372000 |
| }, |
| { |
| "base_loss": 0.30222347214818, |
| "epoch": 3.0905990600585938, |
| "grad_norm": 0.0009555344004184008, |
| "learning_rate": 1.4475727081298829e-05, |
| "lookahead_loss": 6.318659209251404, |
| "loss": 0.3128, |
| "step": 372500 |
| }, |
| { |
| "base_loss": 0.3001328880786896, |
| "epoch": 3.091552734375, |
| "grad_norm": 0.0009132505510933697, |
| "learning_rate": 1.4428043365478517e-05, |
| "lookahead_loss": 6.288368509769439, |
| "loss": 0.3099, |
| "step": 373000 |
| }, |
| { |
| "base_loss": 0.31811392498016355, |
| "epoch": 3.0925064086914062, |
| "grad_norm": 0.0009298596996814013, |
| "learning_rate": 1.4380359649658204e-05, |
| "lookahead_loss": 6.329743075847626, |
| "loss": 0.3326, |
| "step": 373500 |
| }, |
| { |
| "base_loss": 0.31058550345897673, |
| "epoch": 3.0934600830078125, |
| "grad_norm": 0.0008941558189690113, |
| "learning_rate": 1.433267593383789e-05, |
| "lookahead_loss": 6.32411856842041, |
| "loss": 0.3212, |
| "step": 374000 |
| }, |
| { |
| "base_loss": 0.28807635736465453, |
| "epoch": 3.0944137573242188, |
| "grad_norm": 0.0009412313811480999, |
| "learning_rate": 1.428499221801758e-05, |
| "lookahead_loss": 6.325871033668518, |
| "loss": 0.3033, |
| "step": 374500 |
| }, |
| { |
| "base_loss": 0.2929033098220825, |
| "epoch": 3.095367431640625, |
| "grad_norm": 0.0009629988926462829, |
| "learning_rate": 1.4237308502197266e-05, |
| "lookahead_loss": 6.251724208831787, |
| "loss": 0.3074, |
| "step": 375000 |
| }, |
| { |
| "epoch": 3.095367431640625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.302847357222828, |
| "eval_lookahead_perplexity": 546.1247105271912, |
| "eval_loss": 0.14174294471740723, |
| "eval_perplexity": 1.1522804106972875, |
| "eval_runtime": 253.3905, |
| "eval_samples_per_second": 19.732, |
| "eval_steps_per_second": 0.62, |
| "step": 375000 |
| }, |
| { |
| "base_loss": 0.3016510356664658, |
| "epoch": 3.0963211059570312, |
| "grad_norm": 0.0009708671714179218, |
| "learning_rate": 1.4189624786376955e-05, |
| "lookahead_loss": 6.314412651538849, |
| "loss": 0.3143, |
| "step": 375500 |
| }, |
| { |
| "base_loss": 0.32936670687794684, |
| "epoch": 3.0972747802734375, |
| "grad_norm": 0.0009516220889054239, |
| "learning_rate": 1.4141941070556641e-05, |
| "lookahead_loss": 6.348289193153382, |
| "loss": 0.3409, |
| "step": 376000 |
| }, |
| { |
| "base_loss": 0.29046559768915176, |
| "epoch": 3.0982284545898438, |
| "grad_norm": 0.0009895728435367346, |
| "learning_rate": 1.4094257354736328e-05, |
| "lookahead_loss": 6.274867289066314, |
| "loss": 0.3055, |
| "step": 376500 |
| }, |
| { |
| "base_loss": 0.29554841595888137, |
| "epoch": 3.09918212890625, |
| "grad_norm": 0.0009511645184829831, |
| "learning_rate": 1.4046573638916017e-05, |
| "lookahead_loss": 6.323284810066223, |
| "loss": 0.31, |
| "step": 377000 |
| }, |
| { |
| "base_loss": 0.3033588379621506, |
| "epoch": 3.1001358032226562, |
| "grad_norm": 0.0009181915083900094, |
| "learning_rate": 1.3998889923095704e-05, |
| "lookahead_loss": 6.335620421409607, |
| "loss": 0.3154, |
| "step": 377500 |
| }, |
| { |
| "base_loss": 0.32281789609789846, |
| "epoch": 3.1010894775390625, |
| "grad_norm": 0.0009614496375434101, |
| "learning_rate": 1.3951206207275392e-05, |
| "lookahead_loss": 6.400040787696838, |
| "loss": 0.3334, |
| "step": 378000 |
| }, |
| { |
| "base_loss": 0.30253357443213463, |
| "epoch": 3.1020431518554688, |
| "grad_norm": 0.0009595219744369388, |
| "learning_rate": 1.3903522491455079e-05, |
| "lookahead_loss": 6.356647694587708, |
| "loss": 0.3143, |
| "step": 378500 |
| }, |
| { |
| "base_loss": 0.2986242602169514, |
| "epoch": 3.102996826171875, |
| "grad_norm": 0.0009835211094468832, |
| "learning_rate": 1.3855838775634766e-05, |
| "lookahead_loss": 6.328301607131958, |
| "loss": 0.3132, |
| "step": 379000 |
| }, |
| { |
| "base_loss": 0.2968667206466198, |
| "epoch": 3.1039505004882812, |
| "grad_norm": 0.0009939175797626376, |
| "learning_rate": 1.3808155059814454e-05, |
| "lookahead_loss": 6.348633293628692, |
| "loss": 0.3109, |
| "step": 379500 |
| }, |
| { |
| "base_loss": 0.3167901526391506, |
| "epoch": 3.1049041748046875, |
| "grad_norm": 0.0009683977696113288, |
| "learning_rate": 1.3760471343994141e-05, |
| "lookahead_loss": 6.29404914855957, |
| "loss": 0.3288, |
| "step": 380000 |
| }, |
| { |
| "epoch": 3.1049041748046875, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.3002430775675915, |
| "eval_lookahead_perplexity": 544.7042994310043, |
| "eval_loss": 0.14173762500286102, |
| "eval_perplexity": 1.1522742809107298, |
| "eval_runtime": 271.874, |
| "eval_samples_per_second": 18.391, |
| "eval_steps_per_second": 0.577, |
| "step": 380000 |
| }, |
| { |
| "base_loss": 0.3047486243546009, |
| "epoch": 3.1058578491210938, |
| "grad_norm": 0.0009209378040395677, |
| "learning_rate": 1.371278762817383e-05, |
| "lookahead_loss": 6.285359220504761, |
| "loss": 0.3195, |
| "step": 380500 |
| }, |
| { |
| "base_loss": 0.2987241801023483, |
| "epoch": 3.1068115234375, |
| "grad_norm": 0.0009467555209994316, |
| "learning_rate": 1.3665103912353516e-05, |
| "lookahead_loss": 6.353755208015442, |
| "loss": 0.3091, |
| "step": 381000 |
| }, |
| { |
| "base_loss": 0.29589339858293534, |
| "epoch": 3.1077651977539062, |
| "grad_norm": 0.000994194415397942, |
| "learning_rate": 1.3617420196533203e-05, |
| "lookahead_loss": 6.2666127576828, |
| "loss": 0.3074, |
| "step": 381500 |
| }, |
| { |
| "base_loss": 0.32084586623311045, |
| "epoch": 3.1087188720703125, |
| "grad_norm": 0.0009913091780617833, |
| "learning_rate": 1.3569736480712892e-05, |
| "lookahead_loss": 6.280218794345855, |
| "loss": 0.333, |
| "step": 382000 |
| }, |
| { |
| "base_loss": 0.31951685512065886, |
| "epoch": 3.1096725463867188, |
| "grad_norm": 0.0009130350081250072, |
| "learning_rate": 1.3522052764892579e-05, |
| "lookahead_loss": 6.237833600521087, |
| "loss": 0.3277, |
| "step": 382500 |
| }, |
| { |
| "base_loss": 0.30553389444947243, |
| "epoch": 3.110626220703125, |
| "grad_norm": 0.0009011180372908711, |
| "learning_rate": 1.3474369049072265e-05, |
| "lookahead_loss": 6.207389236927033, |
| "loss": 0.3161, |
| "step": 383000 |
| }, |
| { |
| "base_loss": 0.29624236226081846, |
| "epoch": 3.1115798950195312, |
| "grad_norm": 0.0009934761328622699, |
| "learning_rate": 1.3426685333251954e-05, |
| "lookahead_loss": 6.32315664100647, |
| "loss": 0.3073, |
| "step": 383500 |
| }, |
| { |
| "base_loss": 0.3096672693490982, |
| "epoch": 3.1125335693359375, |
| "grad_norm": 0.0009398029651492834, |
| "learning_rate": 1.337900161743164e-05, |
| "lookahead_loss": 6.333669634819031, |
| "loss": 0.3207, |
| "step": 384000 |
| }, |
| { |
| "base_loss": 0.34586220744252205, |
| "epoch": 3.1134872436523438, |
| "grad_norm": 0.000988540006801486, |
| "learning_rate": 1.333131790161133e-05, |
| "lookahead_loss": 6.357729611396789, |
| "loss": 0.3524, |
| "step": 384500 |
| }, |
| { |
| "base_loss": 0.2944044529795647, |
| "epoch": 3.11444091796875, |
| "grad_norm": 0.0009240133222192526, |
| "learning_rate": 1.3283634185791016e-05, |
| "lookahead_loss": 6.241785104751587, |
| "loss": 0.3082, |
| "step": 385000 |
| }, |
| { |
| "epoch": 3.11444091796875, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.29768767372107, |
| "eval_lookahead_perplexity": 543.3141369388018, |
| "eval_loss": 0.14173269271850586, |
| "eval_perplexity": 1.152268597580337, |
| "eval_runtime": 266.7684, |
| "eval_samples_per_second": 18.743, |
| "eval_steps_per_second": 0.589, |
| "step": 385000 |
| }, |
| { |
| "base_loss": 0.29711622232198714, |
| "epoch": 3.1153945922851562, |
| "grad_norm": 0.0009787186281755567, |
| "learning_rate": 1.3235950469970703e-05, |
| "lookahead_loss": 6.293258435726166, |
| "loss": 0.3096, |
| "step": 385500 |
| }, |
| { |
| "base_loss": 0.3151656058430672, |
| "epoch": 3.1163482666015625, |
| "grad_norm": 0.001029650797136128, |
| "learning_rate": 1.3188266754150391e-05, |
| "lookahead_loss": 6.278011352539062, |
| "loss": 0.3247, |
| "step": 386000 |
| }, |
| { |
| "base_loss": 0.32477895976603033, |
| "epoch": 3.1173019409179688, |
| "grad_norm": 0.0009709696751087904, |
| "learning_rate": 1.3140583038330078e-05, |
| "lookahead_loss": 6.306861896514892, |
| "loss": 0.3409, |
| "step": 386500 |
| }, |
| { |
| "base_loss": 0.3063432638645172, |
| "epoch": 3.118255615234375, |
| "grad_norm": 0.0010217542294412851, |
| "learning_rate": 1.3092899322509767e-05, |
| "lookahead_loss": 6.320661119937896, |
| "loss": 0.3155, |
| "step": 387000 |
| }, |
| { |
| "base_loss": 0.30022109842300415, |
| "epoch": 3.1192092895507812, |
| "grad_norm": 0.0009462428861297667, |
| "learning_rate": 1.3045215606689454e-05, |
| "lookahead_loss": 6.326066428661346, |
| "loss": 0.3108, |
| "step": 387500 |
| }, |
| { |
| "base_loss": 0.30673753410577775, |
| "epoch": 4.000953674316406, |
| "grad_norm": 0.0009463900933042169, |
| "learning_rate": 1.299753189086914e-05, |
| "lookahead_loss": 6.373164211273194, |
| "loss": 0.3143, |
| "step": 388000 |
| }, |
| { |
| "base_loss": 0.29716887477040294, |
| "epoch": 4.0019073486328125, |
| "grad_norm": 0.0010187909938395023, |
| "learning_rate": 1.2949848175048829e-05, |
| "lookahead_loss": 6.219348033428192, |
| "loss": 0.3109, |
| "step": 388500 |
| }, |
| { |
| "base_loss": 0.3124042835831642, |
| "epoch": 4.002861022949219, |
| "grad_norm": 0.0009658048511482775, |
| "learning_rate": 1.2902164459228516e-05, |
| "lookahead_loss": 6.20941606426239, |
| "loss": 0.3217, |
| "step": 389000 |
| }, |
| { |
| "base_loss": 0.32272179606556894, |
| "epoch": 4.003814697265625, |
| "grad_norm": 0.0009669990977272391, |
| "learning_rate": 1.2854480743408204e-05, |
| "lookahead_loss": 6.2338160667419436, |
| "loss": 0.3353, |
| "step": 389500 |
| }, |
| { |
| "base_loss": 0.3010484471619129, |
| "epoch": 4.004768371582031, |
| "grad_norm": 0.0009306691936217248, |
| "learning_rate": 1.2806797027587891e-05, |
| "lookahead_loss": 6.211200427055359, |
| "loss": 0.3171, |
| "step": 390000 |
| }, |
| { |
| "epoch": 4.004768371582031, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.2949571426684106, |
| "eval_lookahead_perplexity": 541.8326243950327, |
| "eval_loss": 0.14172740280628204, |
| "eval_perplexity": 1.1522625021967197, |
| "eval_runtime": 259.6181, |
| "eval_samples_per_second": 19.259, |
| "eval_steps_per_second": 0.605, |
| "step": 390000 |
| }, |
| { |
| "base_loss": 0.304677970200777, |
| "epoch": 4.0057220458984375, |
| "grad_norm": 0.0008401920204050839, |
| "learning_rate": 1.2759113311767578e-05, |
| "lookahead_loss": 6.327916157245636, |
| "loss": 0.3134, |
| "step": 390500 |
| }, |
| { |
| "base_loss": 0.2963202897310257, |
| "epoch": 4.006675720214844, |
| "grad_norm": 0.0009407810866832733, |
| "learning_rate": 1.2711429595947266e-05, |
| "lookahead_loss": 6.200181405544281, |
| "loss": 0.3115, |
| "step": 391000 |
| }, |
| { |
| "base_loss": 0.3113421536386013, |
| "epoch": 4.00762939453125, |
| "grad_norm": 0.0009980009635910392, |
| "learning_rate": 1.2663745880126953e-05, |
| "lookahead_loss": 6.251565413475037, |
| "loss": 0.3226, |
| "step": 391500 |
| }, |
| { |
| "base_loss": 0.31501844617724417, |
| "epoch": 4.008583068847656, |
| "grad_norm": 0.000927921908441931, |
| "learning_rate": 1.2616062164306642e-05, |
| "lookahead_loss": 6.240460067272187, |
| "loss": 0.321, |
| "step": 392000 |
| }, |
| { |
| "base_loss": 0.30433804252743724, |
| "epoch": 4.0095367431640625, |
| "grad_norm": 0.0009920436423271894, |
| "learning_rate": 1.2568378448486329e-05, |
| "lookahead_loss": 6.250067127704621, |
| "loss": 0.3173, |
| "step": 392500 |
| }, |
| { |
| "base_loss": 0.300560106664896, |
| "epoch": 4.010490417480469, |
| "grad_norm": 0.0009590853005647659, |
| "learning_rate": 1.2520694732666015e-05, |
| "lookahead_loss": 6.2296731300354, |
| "loss": 0.3111, |
| "step": 393000 |
| }, |
| { |
| "base_loss": 0.3013677542209625, |
| "epoch": 4.011444091796875, |
| "grad_norm": 0.0010011434787884355, |
| "learning_rate": 1.2473011016845704e-05, |
| "lookahead_loss": 6.2316191611289975, |
| "loss": 0.3144, |
| "step": 393500 |
| }, |
| { |
| "base_loss": 0.3262205650210381, |
| "epoch": 4.012397766113281, |
| "grad_norm": 0.0008804492536000907, |
| "learning_rate": 1.242532730102539e-05, |
| "lookahead_loss": 6.226336854934693, |
| "loss": 0.3349, |
| "step": 394000 |
| }, |
| { |
| "base_loss": 0.30570278534293177, |
| "epoch": 4.0133514404296875, |
| "grad_norm": 0.0009209011332131922, |
| "learning_rate": 1.237764358520508e-05, |
| "lookahead_loss": 6.313926457881927, |
| "loss": 0.319, |
| "step": 394500 |
| }, |
| { |
| "base_loss": 0.2983907374441624, |
| "epoch": 4.014305114746094, |
| "grad_norm": 0.0009333580383099616, |
| "learning_rate": 1.2329959869384766e-05, |
| "lookahead_loss": 6.263853558540344, |
| "loss": 0.3132, |
| "step": 395000 |
| }, |
| { |
| "epoch": 4.014305114746094, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.2924738585377655, |
| "eval_lookahead_perplexity": 540.4887693149997, |
| "eval_loss": 0.14172253012657166, |
| "eval_perplexity": 1.1522568876042834, |
| "eval_runtime": 266.6176, |
| "eval_samples_per_second": 18.753, |
| "eval_steps_per_second": 0.589, |
| "step": 395000 |
| }, |
| { |
| "base_loss": 0.2952827827334404, |
| "epoch": 4.0152587890625, |
| "grad_norm": 0.0009197366307489574, |
| "learning_rate": 1.2282276153564453e-05, |
| "lookahead_loss": 6.225392914295196, |
| "loss": 0.3071, |
| "step": 395500 |
| }, |
| { |
| "base_loss": 0.3103551665246487, |
| "epoch": 4.016212463378906, |
| "grad_norm": 0.0009927282808348536, |
| "learning_rate": 1.2234592437744141e-05, |
| "lookahead_loss": 6.285288751602173, |
| "loss": 0.3233, |
| "step": 396000 |
| }, |
| { |
| "base_loss": 0.31515152567625043, |
| "epoch": 4.0171661376953125, |
| "grad_norm": 0.0008934010402299464, |
| "learning_rate": 1.2186908721923828e-05, |
| "lookahead_loss": 6.318837342262268, |
| "loss": 0.3244, |
| "step": 396500 |
| }, |
| { |
| "base_loss": 0.2997098692059517, |
| "epoch": 4.018119812011719, |
| "grad_norm": 0.0009356258087791502, |
| "learning_rate": 1.2139225006103517e-05, |
| "lookahead_loss": 6.3236184453964235, |
| "loss": 0.3105, |
| "step": 397000 |
| }, |
| { |
| "base_loss": 0.2971528458297253, |
| "epoch": 4.019073486328125, |
| "grad_norm": 0.0009439104469493032, |
| "learning_rate": 1.2091541290283204e-05, |
| "lookahead_loss": 6.3374961051940915, |
| "loss": 0.3097, |
| "step": 397500 |
| }, |
| { |
| "base_loss": 0.30046086144447326, |
| "epoch": 4.020027160644531, |
| "grad_norm": 0.001036732573993504, |
| "learning_rate": 1.204385757446289e-05, |
| "lookahead_loss": 6.184789316654205, |
| "loss": 0.3137, |
| "step": 398000 |
| }, |
| { |
| "base_loss": 0.32556178280711173, |
| "epoch": 4.0209808349609375, |
| "grad_norm": 0.0009528475347906351, |
| "learning_rate": 1.1996173858642579e-05, |
| "lookahead_loss": 6.28066121339798, |
| "loss": 0.3384, |
| "step": 398500 |
| }, |
| { |
| "base_loss": 0.30630752837657926, |
| "epoch": 4.021934509277344, |
| "grad_norm": 0.0009664383833296597, |
| "learning_rate": 1.1948490142822266e-05, |
| "lookahead_loss": 6.229596662521362, |
| "loss": 0.3151, |
| "step": 399000 |
| }, |
| { |
| "base_loss": 0.2993029763698578, |
| "epoch": 4.02288818359375, |
| "grad_norm": 0.0009780285181477666, |
| "learning_rate": 1.1900806427001954e-05, |
| "lookahead_loss": 6.271652706146241, |
| "loss": 0.3126, |
| "step": 399500 |
| }, |
| { |
| "base_loss": 0.3033270851969719, |
| "epoch": 4.023841857910156, |
| "grad_norm": 0.0009725343552418053, |
| "learning_rate": 1.1853122711181641e-05, |
| "lookahead_loss": 6.217432358264923, |
| "loss": 0.3153, |
| "step": 400000 |
| }, |
| { |
| "epoch": 4.023841857910156, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.289627787404167, |
| "eval_lookahead_perplexity": 538.9526867675877, |
| "eval_loss": 0.14171701669692993, |
| "eval_perplexity": 1.1522505347345173, |
| "eval_runtime": 269.7244, |
| "eval_samples_per_second": 18.537, |
| "eval_steps_per_second": 0.582, |
| "step": 400000 |
| }, |
| { |
| "base_loss": 0.32289262464642526, |
| "epoch": 4.0247955322265625, |
| "grad_norm": 0.0009039760916493833, |
| "learning_rate": 1.1805438995361328e-05, |
| "lookahead_loss": 6.247900634765625, |
| "loss": 0.333, |
| "step": 400500 |
| }, |
| { |
| "base_loss": 0.3097568289935589, |
| "epoch": 4.025749206542969, |
| "grad_norm": 0.0009861037833616138, |
| "learning_rate": 1.1757755279541016e-05, |
| "lookahead_loss": 6.196441818714142, |
| "loss": 0.323, |
| "step": 401000 |
| }, |
| { |
| "base_loss": 0.30243406727910044, |
| "epoch": 4.026702880859375, |
| "grad_norm": 0.0010050033451989293, |
| "learning_rate": 1.1710071563720703e-05, |
| "lookahead_loss": 6.22297222328186, |
| "loss": 0.3127, |
| "step": 401500 |
| }, |
| { |
| "base_loss": 0.3076402995288372, |
| "epoch": 4.027656555175781, |
| "grad_norm": 0.0009313607588410378, |
| "learning_rate": 1.1662387847900392e-05, |
| "lookahead_loss": 6.321592648506164, |
| "loss": 0.3182, |
| "step": 402000 |
| }, |
| { |
| "base_loss": 0.3292808674275875, |
| "epoch": 4.0286102294921875, |
| "grad_norm": 0.0009357398957945406, |
| "learning_rate": 1.1614704132080079e-05, |
| "lookahead_loss": 6.344618997097015, |
| "loss": 0.3395, |
| "step": 402500 |
| }, |
| { |
| "base_loss": 0.30565082639455793, |
| "epoch": 4.029563903808594, |
| "grad_norm": 0.0009264600230380893, |
| "learning_rate": 1.1567020416259765e-05, |
| "lookahead_loss": 6.279807175636291, |
| "loss": 0.3144, |
| "step": 403000 |
| }, |
| { |
| "base_loss": 0.3062080657184124, |
| "epoch": 4.030517578125, |
| "grad_norm": 0.0009344466379843652, |
| "learning_rate": 1.1519336700439454e-05, |
| "lookahead_loss": 6.271709934711456, |
| "loss": 0.3164, |
| "step": 403500 |
| }, |
| { |
| "base_loss": 0.3015955919623375, |
| "epoch": 4.031471252441406, |
| "grad_norm": 0.000959421566221863, |
| "learning_rate": 1.147165298461914e-05, |
| "lookahead_loss": 6.298808106422424, |
| "loss": 0.314, |
| "step": 404000 |
| }, |
| { |
| "base_loss": 0.3150785211026669, |
| "epoch": 4.0324249267578125, |
| "grad_norm": 0.0009721990791149437, |
| "learning_rate": 1.142396926879883e-05, |
| "lookahead_loss": 6.2469435338974, |
| "loss": 0.3334, |
| "step": 404500 |
| }, |
| { |
| "base_loss": 0.30686600294709204, |
| "epoch": 4.033378601074219, |
| "grad_norm": 0.0010035648010671139, |
| "learning_rate": 1.1376285552978516e-05, |
| "lookahead_loss": 6.3015628657341, |
| "loss": 0.317, |
| "step": 405000 |
| }, |
| { |
| "epoch": 4.033378601074219, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.287427164114321, |
| "eval_lookahead_perplexity": 537.7679589808528, |
| "eval_loss": 0.1417127251625061, |
| "eval_perplexity": 1.1522455898222932, |
| "eval_runtime": 259.3547, |
| "eval_samples_per_second": 19.279, |
| "eval_steps_per_second": 0.605, |
| "step": 405000 |
| }, |
| { |
| "base_loss": 0.30655504322052, |
| "epoch": 1.0009536743164062, |
| "grad_norm": 0.0009588917600922287, |
| "learning_rate": 1.1328601837158203e-05, |
| "lookahead_loss": 6.336568244457245, |
| "loss": 0.3139, |
| "step": 405500 |
| }, |
| { |
| "base_loss": 0.3002312153875828, |
| "epoch": 1.0019073486328125, |
| "grad_norm": 0.0009991881670430303, |
| "learning_rate": 1.1280918121337891e-05, |
| "lookahead_loss": 6.180880591869355, |
| "loss": 0.3128, |
| "step": 406000 |
| }, |
| { |
| "base_loss": 0.312505132496357, |
| "epoch": 1.0028610229492188, |
| "grad_norm": 0.0009822511347010732, |
| "learning_rate": 1.1233234405517578e-05, |
| "lookahead_loss": 6.171671084403991, |
| "loss": 0.3219, |
| "step": 406500 |
| }, |
| { |
| "base_loss": 0.3240452491641045, |
| "epoch": 1.003814697265625, |
| "grad_norm": 0.0009494374971836805, |
| "learning_rate": 1.1185550689697267e-05, |
| "lookahead_loss": 6.2043786425590515, |
| "loss": 0.3358, |
| "step": 407000 |
| }, |
| { |
| "base_loss": 0.29858038023114203, |
| "epoch": 1.0047683715820312, |
| "grad_norm": 0.0009324284037575126, |
| "learning_rate": 1.1137866973876954e-05, |
| "lookahead_loss": 6.192115921020508, |
| "loss": 0.3133, |
| "step": 407500 |
| }, |
| { |
| "base_loss": 0.3042404046058655, |
| "epoch": 1.0057220458984375, |
| "grad_norm": 0.0008426170097663999, |
| "learning_rate": 1.109018325805664e-05, |
| "lookahead_loss": 6.314742174148559, |
| "loss": 0.3128, |
| "step": 408000 |
| }, |
| { |
| "base_loss": 0.29714440524578095, |
| "epoch": 1.0066757202148438, |
| "grad_norm": 0.0009390473132953048, |
| "learning_rate": 1.1042499542236329e-05, |
| "lookahead_loss": 6.173552748680115, |
| "loss": 0.3123, |
| "step": 408500 |
| }, |
| { |
| "base_loss": 0.31379624953866003, |
| "epoch": 1.00762939453125, |
| "grad_norm": 0.0009832140058279037, |
| "learning_rate": 1.0994815826416016e-05, |
| "lookahead_loss": 6.232072317123413, |
| "loss": 0.3242, |
| "step": 409000 |
| }, |
| { |
| "base_loss": 0.31622857597470283, |
| "epoch": 1.0085830688476562, |
| "grad_norm": 0.0009228273993358016, |
| "learning_rate": 1.0947132110595704e-05, |
| "lookahead_loss": 6.214964485168457, |
| "loss": 0.3228, |
| "step": 409500 |
| }, |
| { |
| "base_loss": 0.3033116071224213, |
| "epoch": 1.0095367431640625, |
| "grad_norm": 0.000982031342573464, |
| "learning_rate": 1.0899448394775391e-05, |
| "lookahead_loss": 6.235745645046234, |
| "loss": 0.3161, |
| "step": 410000 |
| }, |
| { |
| "epoch": 1.0095367431640625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.285334576433078, |
| "eval_lookahead_perplexity": 536.6438089759619, |
| "eval_loss": 0.14170871675014496, |
| "eval_perplexity": 1.1522409711560848, |
| "eval_runtime": 271.4698, |
| "eval_samples_per_second": 18.418, |
| "eval_steps_per_second": 0.578, |
| "step": 410000 |
| }, |
| { |
| "base_loss": 0.302242584168911, |
| "epoch": 1.0104904174804688, |
| "grad_norm": 0.0009649608400650322, |
| "learning_rate": 1.0851764678955078e-05, |
| "lookahead_loss": 6.201606332778931, |
| "loss": 0.3116, |
| "step": 410500 |
| }, |
| { |
| "base_loss": 0.3031807193160057, |
| "epoch": 1.011444091796875, |
| "grad_norm": 0.000979897566139698, |
| "learning_rate": 1.0804080963134766e-05, |
| "lookahead_loss": 6.214184216022492, |
| "loss": 0.3162, |
| "step": 411000 |
| }, |
| { |
| "base_loss": 0.324542246311903, |
| "epoch": 1.0123977661132812, |
| "grad_norm": 0.0008748429245315492, |
| "learning_rate": 1.0756397247314453e-05, |
| "lookahead_loss": 6.212394516944885, |
| "loss": 0.3343, |
| "step": 411500 |
| }, |
| { |
| "base_loss": 0.3043093577325344, |
| "epoch": 1.0133514404296875, |
| "grad_norm": 0.0009360151016153395, |
| "learning_rate": 1.0708713531494142e-05, |
| "lookahead_loss": 6.3113635568618776, |
| "loss": 0.3195, |
| "step": 412000 |
| }, |
| { |
| "base_loss": 0.29890961676836014, |
| "epoch": 1.0143051147460938, |
| "grad_norm": 0.0009034210816025734, |
| "learning_rate": 1.0661029815673829e-05, |
| "lookahead_loss": 6.2563081007003785, |
| "loss": 0.3139, |
| "step": 412500 |
| }, |
| { |
| "base_loss": 0.2968312213420868, |
| "epoch": 1.0152587890625, |
| "grad_norm": 0.0009093395201489329, |
| "learning_rate": 1.0613346099853515e-05, |
| "lookahead_loss": 6.21161489534378, |
| "loss": 0.3077, |
| "step": 413000 |
| }, |
| { |
| "base_loss": 0.309987826526165, |
| "epoch": 1.0162124633789062, |
| "grad_norm": 0.0009834787342697382, |
| "learning_rate": 1.0565662384033204e-05, |
| "lookahead_loss": 6.257289779186249, |
| "loss": 0.3218, |
| "step": 413500 |
| }, |
| { |
| "base_loss": 0.3124798896312714, |
| "epoch": 1.0171661376953125, |
| "grad_norm": 0.0009035744587890804, |
| "learning_rate": 1.051797866821289e-05, |
| "lookahead_loss": 6.281959458351135, |
| "loss": 0.323, |
| "step": 414000 |
| }, |
| { |
| "base_loss": 0.30385399025678633, |
| "epoch": 1.0181198120117188, |
| "grad_norm": 0.0009486905764788389, |
| "learning_rate": 1.047029495239258e-05, |
| "lookahead_loss": 6.2897271089553834, |
| "loss": 0.3125, |
| "step": 414500 |
| }, |
| { |
| "base_loss": 0.2997076933085918, |
| "epoch": 1.019073486328125, |
| "grad_norm": 0.0009449619683437049, |
| "learning_rate": 1.0422611236572266e-05, |
| "lookahead_loss": 6.319130481719971, |
| "loss": 0.31, |
| "step": 415000 |
| }, |
| { |
| "epoch": 1.019073486328125, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.283114870516256, |
| "eval_lookahead_perplexity": 535.4539386076507, |
| "eval_loss": 0.14170418679714203, |
| "eval_perplexity": 1.1522357515704595, |
| "eval_runtime": 302.1089, |
| "eval_samples_per_second": 16.55, |
| "eval_steps_per_second": 0.52, |
| "step": 415000 |
| }, |
| { |
| "base_loss": 0.30210260692238805, |
| "epoch": 1.0200271606445312, |
| "grad_norm": 0.0010286852484568954, |
| "learning_rate": 1.0374927520751953e-05, |
| "lookahead_loss": 6.147320285797119, |
| "loss": 0.3146, |
| "step": 415500 |
| }, |
| { |
| "base_loss": 0.3285051781535149, |
| "epoch": 1.0209808349609375, |
| "grad_norm": 0.0009719706140458584, |
| "learning_rate": 1.0327243804931641e-05, |
| "lookahead_loss": 6.239666066169739, |
| "loss": 0.3382, |
| "step": 416000 |
| }, |
| { |
| "base_loss": 0.30326452678442, |
| "epoch": 1.0219345092773438, |
| "grad_norm": 0.0009495667181909084, |
| "learning_rate": 1.0279560089111328e-05, |
| "lookahead_loss": 6.200324444770813, |
| "loss": 0.3137, |
| "step": 416500 |
| }, |
| { |
| "base_loss": 0.29889601907134056, |
| "epoch": 1.02288818359375, |
| "grad_norm": 0.0009736404754221439, |
| "learning_rate": 1.0231876373291017e-05, |
| "lookahead_loss": 6.251074130535126, |
| "loss": 0.3114, |
| "step": 417000 |
| }, |
| { |
| "base_loss": 0.3006108500063419, |
| "epoch": 1.0238418579101562, |
| "grad_norm": 0.0009736517095007002, |
| "learning_rate": 1.0184192657470704e-05, |
| "lookahead_loss": 6.2008349332809445, |
| "loss": 0.3122, |
| "step": 417500 |
| }, |
| { |
| "base_loss": 0.3237688979506493, |
| "epoch": 1.0247955322265625, |
| "grad_norm": 0.0008906475268304348, |
| "learning_rate": 1.013650894165039e-05, |
| "lookahead_loss": 6.22595210647583, |
| "loss": 0.3352, |
| "step": 418000 |
| }, |
| { |
| "base_loss": 0.3078545735180378, |
| "epoch": 1.0257492065429688, |
| "grad_norm": 0.0009600927005521953, |
| "learning_rate": 1.0088825225830079e-05, |
| "lookahead_loss": 6.177468316078186, |
| "loss": 0.3217, |
| "step": 418500 |
| }, |
| { |
| "base_loss": 0.3022345977425575, |
| "epoch": 1.026702880859375, |
| "grad_norm": 0.001004669931717217, |
| "learning_rate": 1.0041141510009766e-05, |
| "lookahead_loss": 6.194265043735504, |
| "loss": 0.3106, |
| "step": 419000 |
| }, |
| { |
| "base_loss": 0.3071480156183243, |
| "epoch": 1.0276565551757812, |
| "grad_norm": 0.0009501728927716613, |
| "learning_rate": 9.993457794189454e-06, |
| "lookahead_loss": 6.303304790496826, |
| "loss": 0.3183, |
| "step": 419500 |
| }, |
| { |
| "base_loss": 0.3302598208785057, |
| "epoch": 1.0286102294921875, |
| "grad_norm": 0.0009357588132843375, |
| "learning_rate": 9.945774078369141e-06, |
| "lookahead_loss": 6.331861734390259, |
| "loss": 0.3412, |
| "step": 420000 |
| }, |
| { |
| "epoch": 1.0286102294921875, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.281080325190633, |
| "eval_lookahead_perplexity": 534.3656407708145, |
| "eval_loss": 0.14170029759407043, |
| "eval_perplexity": 1.1522312703003497, |
| "eval_runtime": 283.7378, |
| "eval_samples_per_second": 17.622, |
| "eval_steps_per_second": 0.553, |
| "step": 420000 |
| }, |
| { |
| "base_loss": 0.3027013133764267, |
| "epoch": 1.0295639038085938, |
| "grad_norm": 0.0009330170578323305, |
| "learning_rate": 9.898090362548828e-06, |
| "lookahead_loss": 6.26250266456604, |
| "loss": 0.3116, |
| "step": 420500 |
| }, |
| { |
| "base_loss": 0.3046494301855564, |
| "epoch": 1.030517578125, |
| "grad_norm": 0.0009495181730017066, |
| "learning_rate": 9.850406646728516e-06, |
| "lookahead_loss": 6.2420420794487, |
| "loss": 0.3156, |
| "step": 421000 |
| }, |
| { |
| "base_loss": 0.3023626366853714, |
| "epoch": 1.0314712524414062, |
| "grad_norm": 0.0009388374746777117, |
| "learning_rate": 9.802722930908203e-06, |
| "lookahead_loss": 6.270756626129151, |
| "loss": 0.3143, |
| "step": 421500 |
| }, |
| { |
| "base_loss": 0.3171934984624386, |
| "epoch": 1.0324249267578125, |
| "grad_norm": 0.0009781798580661416, |
| "learning_rate": 9.755039215087892e-06, |
| "lookahead_loss": 6.233568662643433, |
| "loss": 0.3343, |
| "step": 422000 |
| }, |
| { |
| "base_loss": 0.305971223294735, |
| "epoch": 1.0333786010742188, |
| "grad_norm": 0.0009960117749869823, |
| "learning_rate": 9.707355499267579e-06, |
| "lookahead_loss": 6.261573143005371, |
| "loss": 0.3149, |
| "step": 422500 |
| }, |
| { |
| "base_loss": 0.3008191674053669, |
| "epoch": 1.034332275390625, |
| "grad_norm": 0.000891255447641015, |
| "learning_rate": 9.659671783447265e-06, |
| "lookahead_loss": 6.343621186256408, |
| "loss": 0.3133, |
| "step": 423000 |
| }, |
| { |
| "base_loss": 0.3125488177835941, |
| "epoch": 1.0352859497070312, |
| "grad_norm": 0.0010064059169963002, |
| "learning_rate": 9.611988067626954e-06, |
| "lookahead_loss": 6.176242787361145, |
| "loss": 0.3221, |
| "step": 423500 |
| }, |
| { |
| "base_loss": 0.32382212686538697, |
| "epoch": 1.0362396240234375, |
| "grad_norm": 0.0009808322647586465, |
| "learning_rate": 9.56430435180664e-06, |
| "lookahead_loss": 6.291307106018066, |
| "loss": 0.3371, |
| "step": 424000 |
| }, |
| { |
| "base_loss": 0.30577521124482154, |
| "epoch": 1.0371932983398438, |
| "grad_norm": 0.0010097597260028124, |
| "learning_rate": 9.51662063598633e-06, |
| "lookahead_loss": 6.22961159658432, |
| "loss": 0.3171, |
| "step": 424500 |
| }, |
| { |
| "base_loss": 0.3027714610397816, |
| "epoch": 1.03814697265625, |
| "grad_norm": 0.0009810663759708405, |
| "learning_rate": 9.468936920166016e-06, |
| "lookahead_loss": 6.266712354660034, |
| "loss": 0.3148, |
| "step": 425000 |
| }, |
| { |
| "epoch": 1.03814697265625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.279258819433828, |
| "eval_lookahead_perplexity": 533.3931766233101, |
| "eval_loss": 0.14169692993164062, |
| "eval_perplexity": 1.152227389980924, |
| "eval_runtime": 396.6631, |
| "eval_samples_per_second": 12.605, |
| "eval_steps_per_second": 0.396, |
| "step": 425000 |
| }, |
| { |
| "base_loss": 0.3064252578020096, |
| "epoch": 1.0391006469726562, |
| "grad_norm": 0.0009220910142175853, |
| "learning_rate": 9.421253204345703e-06, |
| "lookahead_loss": 6.2247487473487855, |
| "loss": 0.3193, |
| "step": 425500 |
| }, |
| { |
| "base_loss": 0.3251348150372505, |
| "epoch": 1.0400543212890625, |
| "grad_norm": 0.0009618565090931952, |
| "learning_rate": 9.373569488525391e-06, |
| "lookahead_loss": 6.2589435048103335, |
| "loss": 0.3343, |
| "step": 426000 |
| }, |
| { |
| "base_loss": 0.3045478595495224, |
| "epoch": 1.0410079956054688, |
| "grad_norm": 0.0009619101765565574, |
| "learning_rate": 9.325885772705078e-06, |
| "lookahead_loss": 6.189810398101806, |
| "loss": 0.315, |
| "step": 426500 |
| }, |
| { |
| "base_loss": 0.2982518375813961, |
| "epoch": 1.041961669921875, |
| "grad_norm": 0.0009836278622969985, |
| "learning_rate": 9.278202056884767e-06, |
| "lookahead_loss": 6.272165112972259, |
| "loss": 0.3097, |
| "step": 427000 |
| }, |
| { |
| "base_loss": 0.3089935587644577, |
| "epoch": 1.0429153442382812, |
| "grad_norm": 0.0009688155842013657, |
| "learning_rate": 9.230518341064454e-06, |
| "lookahead_loss": 6.282459297657013, |
| "loss": 0.324, |
| "step": 427500 |
| }, |
| { |
| "base_loss": 0.3268603746891022, |
| "epoch": 1.0438690185546875, |
| "grad_norm": 0.0009795301593840122, |
| "learning_rate": 9.18283462524414e-06, |
| "lookahead_loss": 6.323117096424102, |
| "loss": 0.3406, |
| "step": 428000 |
| }, |
| { |
| "base_loss": 0.29676153120398524, |
| "epoch": 1.0448226928710938, |
| "grad_norm": 0.0009742308175191283, |
| "learning_rate": 9.135150909423829e-06, |
| "lookahead_loss": 6.223826610565186, |
| "loss": 0.3091, |
| "step": 428500 |
| }, |
| { |
| "base_loss": 0.3044439141750336, |
| "epoch": 1.0457763671875, |
| "grad_norm": 0.000964790116995573, |
| "learning_rate": 9.087467193603516e-06, |
| "lookahead_loss": 6.254047955989837, |
| "loss": 0.3161, |
| "step": 429000 |
| }, |
| { |
| "base_loss": 0.3313070158064365, |
| "epoch": 1.0467300415039062, |
| "grad_norm": 0.000941166770644486, |
| "learning_rate": 9.039783477783204e-06, |
| "lookahead_loss": 6.2130351490974425, |
| "loss": 0.3393, |
| "step": 429500 |
| }, |
| { |
| "base_loss": 0.32587327966094015, |
| "epoch": 1.0476837158203125, |
| "grad_norm": 0.0010063599329441786, |
| "learning_rate": 8.992099761962891e-06, |
| "lookahead_loss": 6.257496692657471, |
| "loss": 0.3387, |
| "step": 430000 |
| }, |
| { |
| "epoch": 1.0476837158203125, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.2776359635800985, |
| "eval_lookahead_perplexity": 532.5282583929193, |
| "eval_loss": 0.14169345796108246, |
| "eval_perplexity": 1.1522233894882945, |
| "eval_runtime": 288.3523, |
| "eval_samples_per_second": 17.34, |
| "eval_steps_per_second": 0.544, |
| "step": 430000 |
| }, |
| { |
| "base_loss": 0.29468956208229063, |
| "epoch": 1.0486373901367188, |
| "grad_norm": 0.0009450262296013534, |
| "learning_rate": 8.944416046142578e-06, |
| "lookahead_loss": 6.2217209124565125, |
| "loss": 0.3076, |
| "step": 430500 |
| }, |
| { |
| "base_loss": 0.3027429393827915, |
| "epoch": 1.049591064453125, |
| "grad_norm": 0.0009969213278964162, |
| "learning_rate": 8.896732330322266e-06, |
| "lookahead_loss": 6.207447394371033, |
| "loss": 0.3161, |
| "step": 431000 |
| }, |
| { |
| "base_loss": 0.3190444597601891, |
| "epoch": 1.0505447387695312, |
| "grad_norm": 0.0008964469889178872, |
| "learning_rate": 8.849048614501953e-06, |
| "lookahead_loss": 6.302306387901306, |
| "loss": 0.3331, |
| "step": 431500 |
| }, |
| { |
| "base_loss": 0.3043918348252773, |
| "epoch": 1.0514984130859375, |
| "grad_norm": 0.0009915096452459693, |
| "learning_rate": 8.801364898681642e-06, |
| "lookahead_loss": 6.257247267246246, |
| "loss": 0.3177, |
| "step": 432000 |
| }, |
| { |
| "base_loss": 0.3046840020418167, |
| "epoch": 1.0524520874023438, |
| "grad_norm": 0.0010428299428895116, |
| "learning_rate": 8.753681182861329e-06, |
| "lookahead_loss": 6.1936208391189576, |
| "loss": 0.317, |
| "step": 432500 |
| }, |
| { |
| "base_loss": 0.3202188531160355, |
| "epoch": 1.05340576171875, |
| "grad_norm": 0.0009129694662988186, |
| "learning_rate": 8.705997467041015e-06, |
| "lookahead_loss": 6.2067328634262084, |
| "loss": 0.3299, |
| "step": 433000 |
| }, |
| { |
| "base_loss": 0.3542410895228386, |
| "epoch": 1.0543594360351562, |
| "grad_norm": 0.0009339757962152362, |
| "learning_rate": 8.658313751220704e-06, |
| "lookahead_loss": 6.249197509765625, |
| "loss": 0.3686, |
| "step": 433500 |
| }, |
| { |
| "base_loss": 0.2943912135362625, |
| "epoch": 1.0553131103515625, |
| "grad_norm": 0.0009666451369412243, |
| "learning_rate": 8.61063003540039e-06, |
| "lookahead_loss": 6.255749958992005, |
| "loss": 0.3081, |
| "step": 434000 |
| }, |
| { |
| "base_loss": 0.30392896428704264, |
| "epoch": 1.0562667846679688, |
| "grad_norm": 0.0009277292410843074, |
| "learning_rate": 8.56294631958008e-06, |
| "lookahead_loss": 6.270913313865662, |
| "loss": 0.3167, |
| "step": 434500 |
| }, |
| { |
| "base_loss": 0.3181495431959629, |
| "epoch": 1.057220458984375, |
| "grad_norm": 0.0009528077207505703, |
| "learning_rate": 8.515262603759766e-06, |
| "lookahead_loss": 6.285120022773743, |
| "loss": 0.3316, |
| "step": 435000 |
| }, |
| { |
| "epoch": 1.057220458984375, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.276001627071977, |
| "eval_lookahead_perplexity": 531.658638837682, |
| "eval_loss": 0.1416902393102646, |
| "eval_perplexity": 1.152219680889508, |
| "eval_runtime": 272.5458, |
| "eval_samples_per_second": 18.346, |
| "eval_steps_per_second": 0.576, |
| "step": 435000 |
| }, |
| { |
| "base_loss": 0.3180287193655968, |
| "epoch": 1.0581741333007812, |
| "grad_norm": 0.000986663973890245, |
| "learning_rate": 8.467578887939453e-06, |
| "lookahead_loss": 6.288068524360657, |
| "loss": 0.3283, |
| "step": 435500 |
| }, |
| { |
| "base_loss": 0.292520221978426, |
| "epoch": 1.0591278076171875, |
| "grad_norm": 0.0009171205456368625, |
| "learning_rate": 8.419895172119141e-06, |
| "lookahead_loss": 6.184898139953614, |
| "loss": 0.308, |
| "step": 436000 |
| }, |
| { |
| "base_loss": 0.3019208701252937, |
| "epoch": 1.0600814819335938, |
| "grad_norm": 0.0009770637843757868, |
| "learning_rate": 8.372211456298828e-06, |
| "lookahead_loss": 6.231124799251557, |
| "loss": 0.3146, |
| "step": 436500 |
| }, |
| { |
| "base_loss": 0.32141088619828223, |
| "epoch": 1.06103515625, |
| "grad_norm": 0.0009801468113437295, |
| "learning_rate": 8.324527740478517e-06, |
| "lookahead_loss": 6.210278995513916, |
| "loss": 0.332, |
| "step": 437000 |
| }, |
| { |
| "base_loss": 0.30723505771160126, |
| "epoch": 1.0619888305664062, |
| "grad_norm": 0.0010288211051374674, |
| "learning_rate": 8.276844024658204e-06, |
| "lookahead_loss": 6.222631175994873, |
| "loss": 0.3162, |
| "step": 437500 |
| }, |
| { |
| "base_loss": 0.308370777964592, |
| "epoch": 1.0629425048828125, |
| "grad_norm": 0.000984584796242416, |
| "learning_rate": 8.22916030883789e-06, |
| "lookahead_loss": 6.219526912689209, |
| "loss": 0.3171, |
| "step": 438000 |
| }, |
| { |
| "base_loss": 0.3175841515958309, |
| "epoch": 1.0638961791992188, |
| "grad_norm": 0.0009486065828241408, |
| "learning_rate": 8.181476593017579e-06, |
| "lookahead_loss": 6.251926373958588, |
| "loss": 0.3296, |
| "step": 438500 |
| }, |
| { |
| "base_loss": 0.3023634272813797, |
| "epoch": 1.064849853515625, |
| "grad_norm": 0.0009462623856961727, |
| "learning_rate": 8.133792877197266e-06, |
| "lookahead_loss": 6.273111929893494, |
| "loss": 0.3172, |
| "step": 439000 |
| }, |
| { |
| "base_loss": 0.31000158992409704, |
| "epoch": 1.0658035278320312, |
| "grad_norm": 0.000977607793174684, |
| "learning_rate": 8.086109161376954e-06, |
| "lookahead_loss": 6.2029001622200015, |
| "loss": 0.321, |
| "step": 439500 |
| }, |
| { |
| "base_loss": 0.3074465197324753, |
| "epoch": 1.0667572021484375, |
| "grad_norm": 0.0009711860329844058, |
| "learning_rate": 8.038425445556641e-06, |
| "lookahead_loss": 6.179575590610504, |
| "loss": 0.3168, |
| "step": 440000 |
| }, |
| { |
| "epoch": 1.0667572021484375, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.274223205761407, |
| "eval_lookahead_perplexity": 530.7139660465189, |
| "eval_loss": 0.1416870653629303, |
| "eval_perplexity": 1.1522160238107269, |
| "eval_runtime": 271.6398, |
| "eval_samples_per_second": 18.407, |
| "eval_steps_per_second": 0.578, |
| "step": 440000 |
| }, |
| { |
| "base_loss": 0.3308669750988483, |
| "epoch": 1.0677108764648438, |
| "grad_norm": 0.001032104599289596, |
| "learning_rate": 7.990741729736328e-06, |
| "lookahead_loss": 6.263833108901977, |
| "loss": 0.3436, |
| "step": 440500 |
| }, |
| { |
| "base_loss": 0.300963022172451, |
| "epoch": 1.06866455078125, |
| "grad_norm": 0.0009854926029220223, |
| "learning_rate": 7.943058013916016e-06, |
| "lookahead_loss": 6.227484060764313, |
| "loss": 0.31, |
| "step": 441000 |
| }, |
| { |
| "base_loss": 0.3016065271794796, |
| "epoch": 1.0696182250976562, |
| "grad_norm": 0.0009571650298312306, |
| "learning_rate": 7.895374298095703e-06, |
| "lookahead_loss": 6.25590787267685, |
| "loss": 0.3144, |
| "step": 441500 |
| }, |
| { |
| "base_loss": 0.3469915909469128, |
| "epoch": 1.0705718994140625, |
| "grad_norm": 0.0009693103493191302, |
| "learning_rate": 7.847690582275392e-06, |
| "lookahead_loss": 6.164663645267487, |
| "loss": 0.3587, |
| "step": 442000 |
| }, |
| { |
| "base_loss": 0.31762470316886904, |
| "epoch": 1.0715255737304688, |
| "grad_norm": 0.0009685850236564875, |
| "learning_rate": 7.800006866455079e-06, |
| "lookahead_loss": 6.211194790840149, |
| "loss": 0.3254, |
| "step": 442500 |
| }, |
| { |
| "base_loss": 0.3090612238943577, |
| "epoch": 1.072479248046875, |
| "grad_norm": 0.0009897744748741388, |
| "learning_rate": 7.752323150634765e-06, |
| "lookahead_loss": 6.288339027404785, |
| "loss": 0.3187, |
| "step": 443000 |
| }, |
| { |
| "base_loss": 0.3051177371442318, |
| "epoch": 1.0734329223632812, |
| "grad_norm": 0.0009485665941610932, |
| "learning_rate": 7.704639434814454e-06, |
| "lookahead_loss": 6.247628257751465, |
| "loss": 0.3173, |
| "step": 443500 |
| }, |
| { |
| "base_loss": 0.32735036182403565, |
| "epoch": 1.0743865966796875, |
| "grad_norm": 0.000949955778196454, |
| "learning_rate": 7.65695571899414e-06, |
| "lookahead_loss": 6.261586086750031, |
| "loss": 0.3397, |
| "step": 444000 |
| }, |
| { |
| "base_loss": 0.3037717220187187, |
| "epoch": 1.0753402709960938, |
| "grad_norm": 0.0009570113033987582, |
| "learning_rate": 7.6092720031738284e-06, |
| "lookahead_loss": 6.32311720943451, |
| "loss": 0.3156, |
| "step": 444500 |
| }, |
| { |
| "base_loss": 0.3043428426384926, |
| "epoch": 1.0762939453125, |
| "grad_norm": 0.0008967678295448422, |
| "learning_rate": 7.561588287353516e-06, |
| "lookahead_loss": 6.288136465072632, |
| "loss": 0.3173, |
| "step": 445000 |
| }, |
| { |
| "epoch": 1.0762939453125, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.272627116773076, |
| "eval_lookahead_perplexity": 529.8675749667811, |
| "eval_loss": 0.14168404042720795, |
| "eval_perplexity": 1.1522125384365882, |
| "eval_runtime": 265.0143, |
| "eval_samples_per_second": 18.867, |
| "eval_steps_per_second": 0.592, |
| "step": 445000 |
| }, |
| { |
| "base_loss": 0.3304513318836689, |
| "epoch": 1.0772476196289062, |
| "grad_norm": 0.0009672945598140359, |
| "learning_rate": 7.513904571533204e-06, |
| "lookahead_loss": 6.262294914245605, |
| "loss": 0.3437, |
| "step": 445500 |
| }, |
| { |
| "base_loss": 0.3036956556737423, |
| "epoch": 1.0782012939453125, |
| "grad_norm": 0.0009918182622641325, |
| "learning_rate": 7.466220855712891e-06, |
| "lookahead_loss": 6.286622961521148, |
| "loss": 0.3177, |
| "step": 446000 |
| }, |
| { |
| "base_loss": 0.29657268461585046, |
| "epoch": 1.0791549682617188, |
| "grad_norm": 0.0009162202477455139, |
| "learning_rate": 7.418537139892578e-06, |
| "lookahead_loss": 6.281464424133301, |
| "loss": 0.3109, |
| "step": 446500 |
| }, |
| { |
| "base_loss": 0.3123248810470104, |
| "epoch": 1.080108642578125, |
| "grad_norm": 0.0009557082084938884, |
| "learning_rate": 7.370853424072266e-06, |
| "lookahead_loss": 6.271296732425689, |
| "loss": 0.3289, |
| "step": 447000 |
| }, |
| { |
| "base_loss": 0.32140666726231576, |
| "epoch": 1.0810623168945312, |
| "grad_norm": 0.0009874977404251695, |
| "learning_rate": 7.323169708251954e-06, |
| "lookahead_loss": 6.28459517621994, |
| "loss": 0.3369, |
| "step": 447500 |
| }, |
| { |
| "base_loss": 0.29961148300766943, |
| "epoch": 1.0820159912109375, |
| "grad_norm": 0.0008999704150483012, |
| "learning_rate": 7.275485992431641e-06, |
| "lookahead_loss": 6.285267066001892, |
| "loss": 0.312, |
| "step": 448000 |
| }, |
| { |
| "base_loss": 0.3036400380730629, |
| "epoch": 1.0829696655273438, |
| "grad_norm": 0.0010060252388939261, |
| "learning_rate": 7.227802276611328e-06, |
| "lookahead_loss": 6.313744902610779, |
| "loss": 0.3171, |
| "step": 448500 |
| }, |
| { |
| "base_loss": 0.3315876969695091, |
| "epoch": 1.08392333984375, |
| "grad_norm": 0.0009252046584151685, |
| "learning_rate": 7.180118560791016e-06, |
| "lookahead_loss": 6.31829022693634, |
| "loss": 0.3446, |
| "step": 449000 |
| }, |
| { |
| "base_loss": 0.30729381024837493, |
| "epoch": 1.0848770141601562, |
| "grad_norm": 0.0009679401991888881, |
| "learning_rate": 7.1324348449707034e-06, |
| "lookahead_loss": 6.265555395126343, |
| "loss": 0.3195, |
| "step": 449500 |
| }, |
| { |
| "base_loss": 0.3001744159460068, |
| "epoch": 1.0858306884765625, |
| "grad_norm": 0.0010059743653982878, |
| "learning_rate": 7.084751129150391e-06, |
| "lookahead_loss": 6.251848965168, |
| "loss": 0.3098, |
| "step": 450000 |
| }, |
| { |
| "epoch": 1.0858306884765625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.271020517562525, |
| "eval_lookahead_perplexity": 529.0169736098392, |
| "eval_loss": 0.14168114960193634, |
| "eval_perplexity": 1.1522092075962782, |
| "eval_runtime": 256.5611, |
| "eval_samples_per_second": 19.489, |
| "eval_steps_per_second": 0.612, |
| "step": 450000 |
| }, |
| { |
| "base_loss": 0.3043146550655365, |
| "epoch": 1.0867843627929688, |
| "grad_norm": 0.0011264593340456486, |
| "learning_rate": 7.037067413330079e-06, |
| "lookahead_loss": 6.214835606098175, |
| "loss": 0.3135, |
| "step": 450500 |
| }, |
| { |
| "base_loss": 0.33685871040821075, |
| "epoch": 1.087738037109375, |
| "grad_norm": 0.000883612665347755, |
| "learning_rate": 6.989383697509766e-06, |
| "lookahead_loss": 6.2883378591537475, |
| "loss": 0.344, |
| "step": 451000 |
| }, |
| { |
| "base_loss": 0.30271838963031766, |
| "epoch": 1.0886917114257812, |
| "grad_norm": 0.0009825986344367266, |
| "learning_rate": 6.941699981689453e-06, |
| "lookahead_loss": 6.2627932052612305, |
| "loss": 0.313, |
| "step": 451500 |
| }, |
| { |
| "base_loss": 0.31106107553839685, |
| "epoch": 1.0896453857421875, |
| "grad_norm": 0.0009782552951946855, |
| "learning_rate": 6.894016265869141e-06, |
| "lookahead_loss": 6.294947155952453, |
| "loss": 0.3202, |
| "step": 452000 |
| }, |
| { |
| "base_loss": 0.29801268032193184, |
| "epoch": 1.0905990600585938, |
| "grad_norm": 0.0009775352664291859, |
| "learning_rate": 6.846332550048829e-06, |
| "lookahead_loss": 6.287975093841553, |
| "loss": 0.3103, |
| "step": 452500 |
| }, |
| { |
| "base_loss": 0.29706856977939605, |
| "epoch": 1.091552734375, |
| "grad_norm": 0.0008749934495426714, |
| "learning_rate": 6.798648834228516e-06, |
| "lookahead_loss": 6.24645920419693, |
| "loss": 0.3081, |
| "step": 453000 |
| }, |
| { |
| "base_loss": 0.3189851225912571, |
| "epoch": 1.0925064086914062, |
| "grad_norm": 0.0009568808600306511, |
| "learning_rate": 6.750965118408203e-06, |
| "lookahead_loss": 6.2773444094657895, |
| "loss": 0.3344, |
| "step": 453500 |
| }, |
| { |
| "base_loss": 0.307105902582407, |
| "epoch": 1.0934600830078125, |
| "grad_norm": 0.0009044524631462991, |
| "learning_rate": 6.703281402587891e-06, |
| "lookahead_loss": 6.281952003479004, |
| "loss": 0.3195, |
| "step": 454000 |
| }, |
| { |
| "base_loss": 0.2863923677802086, |
| "epoch": 1.0944137573242188, |
| "grad_norm": 0.0009490604279562831, |
| "learning_rate": 6.6555976867675784e-06, |
| "lookahead_loss": 6.28246708202362, |
| "loss": 0.3022, |
| "step": 454500 |
| }, |
| { |
| "base_loss": 0.2923275768607855, |
| "epoch": 1.095367431640625, |
| "grad_norm": 0.0009766619186848402, |
| "learning_rate": 6.607913970947266e-06, |
| "lookahead_loss": 6.205554433822632, |
| "loss": 0.3078, |
| "step": 455000 |
| }, |
| { |
| "epoch": 1.095367431640625, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.269953422272168, |
| "eval_lookahead_perplexity": 528.4527631754679, |
| "eval_loss": 0.1416788250207901, |
| "eval_perplexity": 1.1522065291955907, |
| "eval_runtime": 265.9126, |
| "eval_samples_per_second": 18.803, |
| "eval_steps_per_second": 0.59, |
| "step": 455000 |
| }, |
| { |
| "base_loss": 0.2988976333141327, |
| "epoch": 1.0963211059570312, |
| "grad_norm": 0.0009743266855366528, |
| "learning_rate": 6.560230255126954e-06, |
| "lookahead_loss": 6.28825941324234, |
| "loss": 0.3117, |
| "step": 455500 |
| }, |
| { |
| "base_loss": 0.3292928241491318, |
| "epoch": 1.0972747802734375, |
| "grad_norm": 0.0009721256792545319, |
| "learning_rate": 6.512546539306641e-06, |
| "lookahead_loss": 6.306233211517334, |
| "loss": 0.3394, |
| "step": 456000 |
| }, |
| { |
| "base_loss": 0.2914348037838936, |
| "epoch": 1.0982284545898438, |
| "grad_norm": 0.0009659443167038262, |
| "learning_rate": 6.464862823486328e-06, |
| "lookahead_loss": 6.237008224010467, |
| "loss": 0.3078, |
| "step": 456500 |
| }, |
| { |
| "base_loss": 0.2972012578845024, |
| "epoch": 1.09918212890625, |
| "grad_norm": 0.0009896591072902083, |
| "learning_rate": 6.417179107666016e-06, |
| "lookahead_loss": 6.287574047088623, |
| "loss": 0.3096, |
| "step": 457000 |
| }, |
| { |
| "base_loss": 0.3006402098238468, |
| "epoch": 1.1001358032226562, |
| "grad_norm": 0.0009321753168478608, |
| "learning_rate": 6.369495391845704e-06, |
| "lookahead_loss": 6.283053328514099, |
| "loss": 0.3145, |
| "step": 457500 |
| }, |
| { |
| "base_loss": 0.3227167456150055, |
| "epoch": 1.1010894775390625, |
| "grad_norm": 0.0009433454251848161, |
| "learning_rate": 6.321811676025391e-06, |
| "lookahead_loss": 6.353063113212586, |
| "loss": 0.3319, |
| "step": 458000 |
| }, |
| { |
| "base_loss": 0.30574207335710524, |
| "epoch": 1.1020431518554688, |
| "grad_norm": 0.0009229978313669562, |
| "learning_rate": 6.274127960205078e-06, |
| "lookahead_loss": 6.309114946365357, |
| "loss": 0.3146, |
| "step": 458500 |
| }, |
| { |
| "base_loss": 0.29960223579406736, |
| "epoch": 1.102996826171875, |
| "grad_norm": 0.0009563881903886795, |
| "learning_rate": 6.226444244384766e-06, |
| "lookahead_loss": 6.274078320503235, |
| "loss": 0.314, |
| "step": 459000 |
| }, |
| { |
| "base_loss": 0.2996614835858345, |
| "epoch": 1.1039505004882812, |
| "grad_norm": 0.0009942464530467987, |
| "learning_rate": 6.1787605285644534e-06, |
| "lookahead_loss": 6.303729599952698, |
| "loss": 0.3115, |
| "step": 459500 |
| }, |
| { |
| "base_loss": 0.3155037875175476, |
| "epoch": 1.1049041748046875, |
| "grad_norm": 0.000987946754321456, |
| "learning_rate": 6.131076812744141e-06, |
| "lookahead_loss": 6.240426075935364, |
| "loss": 0.3274, |
| "step": 460000 |
| }, |
| { |
| "epoch": 1.1049041748046875, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.268837010898529, |
| "eval_lookahead_perplexity": 527.8631217026941, |
| "eval_loss": 0.14167657494544983, |
| "eval_perplexity": 1.1522039366470092, |
| "eval_runtime": 255.3675, |
| "eval_samples_per_second": 19.58, |
| "eval_steps_per_second": 0.615, |
| "step": 460000 |
| }, |
| { |
| "base_loss": 0.30881242457032204, |
| "epoch": 1.1058578491210938, |
| "grad_norm": 0.0009278358775191009, |
| "learning_rate": 6.083393096923829e-06, |
| "lookahead_loss": 6.227671199798584, |
| "loss": 0.3205, |
| "step": 460500 |
| }, |
| { |
| "base_loss": 0.29839677426218986, |
| "epoch": 1.1068115234375, |
| "grad_norm": 0.0009282033424824476, |
| "learning_rate": 6.035709381103516e-06, |
| "lookahead_loss": 6.310325454711914, |
| "loss": 0.3091, |
| "step": 461000 |
| }, |
| { |
| "base_loss": 0.294783333927393, |
| "epoch": 1.1077651977539062, |
| "grad_norm": 0.0009650102001614869, |
| "learning_rate": 5.988025665283203e-06, |
| "lookahead_loss": 6.230243679523468, |
| "loss": 0.3079, |
| "step": 461500 |
| }, |
| { |
| "base_loss": 0.32150769320130346, |
| "epoch": 1.1087188720703125, |
| "grad_norm": 0.0010077395709231496, |
| "learning_rate": 5.940341949462891e-06, |
| "lookahead_loss": 6.241594205856323, |
| "loss": 0.3333, |
| "step": 462000 |
| }, |
| { |
| "base_loss": 0.3191940434873104, |
| "epoch": 1.1096725463867188, |
| "grad_norm": 0.0009163669892586768, |
| "learning_rate": 5.892658233642579e-06, |
| "lookahead_loss": 6.1973172135353085, |
| "loss": 0.328, |
| "step": 462500 |
| }, |
| { |
| "base_loss": 0.30270202097296717, |
| "epoch": 1.110626220703125, |
| "grad_norm": 0.0008867474389262497, |
| "learning_rate": 5.844974517822266e-06, |
| "lookahead_loss": 6.168700245380402, |
| "loss": 0.3138, |
| "step": 463000 |
| }, |
| { |
| "base_loss": 0.2974509707689285, |
| "epoch": 1.1115798950195312, |
| "grad_norm": 0.0009725645068101585, |
| "learning_rate": 5.797290802001953e-06, |
| "lookahead_loss": 6.2942254137992855, |
| "loss": 0.3082, |
| "step": 463500 |
| }, |
| { |
| "base_loss": 0.3114223616421223, |
| "epoch": 1.1125335693359375, |
| "grad_norm": 0.0009279170189984143, |
| "learning_rate": 5.749607086181641e-06, |
| "lookahead_loss": 6.310624211788178, |
| "loss": 0.3226, |
| "step": 464000 |
| }, |
| { |
| "base_loss": 0.3443338246643543, |
| "epoch": 1.1134872436523438, |
| "grad_norm": 0.0009897082345560193, |
| "learning_rate": 5.7019233703613284e-06, |
| "lookahead_loss": 6.318526268482208, |
| "loss": 0.3514, |
| "step": 464500 |
| }, |
| { |
| "base_loss": 0.2939972540736198, |
| "epoch": 1.11444091796875, |
| "grad_norm": 0.000933043600525707, |
| "learning_rate": 5.654239654541016e-06, |
| "lookahead_loss": 6.195446736812592, |
| "loss": 0.308, |
| "step": 465000 |
| }, |
| { |
| "epoch": 1.11444091796875, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.2678496045426435, |
| "eval_lookahead_perplexity": 527.3421635423134, |
| "eval_loss": 0.14167460799217224, |
| "eval_perplexity": 1.1522016703179285, |
| "eval_runtime": 265.7854, |
| "eval_samples_per_second": 18.812, |
| "eval_steps_per_second": 0.591, |
| "step": 465000 |
| }, |
| { |
| "base_loss": 0.29841734063625336, |
| "epoch": 1.1153945922851562, |
| "grad_norm": 0.0009832673240453005, |
| "learning_rate": 5.606555938720704e-06, |
| "lookahead_loss": 6.269070412158966, |
| "loss": 0.311, |
| "step": 465500 |
| }, |
| { |
| "base_loss": 0.3147252712547779, |
| "epoch": 1.1163482666015625, |
| "grad_norm": 0.0010125736007466912, |
| "learning_rate": 5.558872222900391e-06, |
| "lookahead_loss": 6.2435841588973995, |
| "loss": 0.3247, |
| "step": 466000 |
| }, |
| { |
| "base_loss": 0.32950386153161526, |
| "epoch": 1.1173019409179688, |
| "grad_norm": 0.0009781451663002372, |
| "learning_rate": 5.511188507080078e-06, |
| "lookahead_loss": 6.253851639270782, |
| "loss": 0.3429, |
| "step": 466500 |
| }, |
| { |
| "base_loss": 0.30734304267168044, |
| "epoch": 1.118255615234375, |
| "grad_norm": 0.0010217922972515225, |
| "learning_rate": 5.463504791259766e-06, |
| "lookahead_loss": 6.285841710567475, |
| "loss": 0.3164, |
| "step": 467000 |
| }, |
| { |
| "base_loss": 0.3014386140704155, |
| "epoch": 1.1192092895507812, |
| "grad_norm": 0.0009706662967801094, |
| "learning_rate": 5.415821075439454e-06, |
| "lookahead_loss": 6.292059094905853, |
| "loss": 0.3126, |
| "step": 467500 |
| }, |
| { |
| "base_loss": 0.30611268219351767, |
| "epoch": 2.0009536743164062, |
| "grad_norm": 0.0009617306059226394, |
| "learning_rate": 5.368137359619141e-06, |
| "lookahead_loss": 6.326679523468018, |
| "loss": 0.3146, |
| "step": 468000 |
| }, |
| { |
| "base_loss": 0.301539769411087, |
| "epoch": 2.0019073486328125, |
| "grad_norm": 0.001001556869596243, |
| "learning_rate": 5.320453643798828e-06, |
| "lookahead_loss": 6.157724303245544, |
| "loss": 0.3137, |
| "step": 468500 |
| }, |
| { |
| "base_loss": 0.31222748425602914, |
| "epoch": 2.0028610229492188, |
| "grad_norm": 0.0009728239965625107, |
| "learning_rate": 5.272769927978516e-06, |
| "lookahead_loss": 6.156917175769806, |
| "loss": 0.3221, |
| "step": 469000 |
| }, |
| { |
| "base_loss": 0.32267384630441664, |
| "epoch": 2.003814697265625, |
| "grad_norm": 0.0009497611317783594, |
| "learning_rate": 5.2250862121582034e-06, |
| "lookahead_loss": 6.197290048122406, |
| "loss": 0.3348, |
| "step": 469500 |
| }, |
| { |
| "base_loss": 0.30016050645709036, |
| "epoch": 2.0047683715820312, |
| "grad_norm": 0.0009481213637627661, |
| "learning_rate": 5.177402496337891e-06, |
| "lookahead_loss": 6.171369277000427, |
| "loss": 0.3159, |
| "step": 470000 |
| }, |
| { |
| "epoch": 2.0047683715820312, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.266808138106959, |
| "eval_lookahead_perplexity": 526.7932402710601, |
| "eval_loss": 0.14167256653308868, |
| "eval_perplexity": 1.1521993181477634, |
| "eval_runtime": 264.661, |
| "eval_samples_per_second": 18.892, |
| "eval_steps_per_second": 0.593, |
| "step": 470000 |
| }, |
| { |
| "base_loss": 0.3024714471399784, |
| "epoch": 2.0057220458984375, |
| "grad_norm": 0.0008405263070017099, |
| "learning_rate": 5.129718780517579e-06, |
| "lookahead_loss": 6.278398807525635, |
| "loss": 0.3112, |
| "step": 470500 |
| }, |
| { |
| "base_loss": 0.2964489733278751, |
| "epoch": 2.0066757202148438, |
| "grad_norm": 0.0009033022215589881, |
| "learning_rate": 5.082035064697266e-06, |
| "lookahead_loss": 6.156778864383697, |
| "loss": 0.3126, |
| "step": 471000 |
| }, |
| { |
| "base_loss": 0.31337857532501223, |
| "epoch": 2.00762939453125, |
| "grad_norm": 0.0009623009245842695, |
| "learning_rate": 5.034351348876953e-06, |
| "lookahead_loss": 6.213428562164307, |
| "loss": 0.3233, |
| "step": 471500 |
| }, |
| { |
| "base_loss": 0.3180972839295864, |
| "epoch": 2.0085830688476562, |
| "grad_norm": 0.0009386781021021307, |
| "learning_rate": 4.986667633056641e-06, |
| "lookahead_loss": 6.1949804525375365, |
| "loss": 0.3226, |
| "step": 472000 |
| }, |
| { |
| "base_loss": 0.30493127757310867, |
| "epoch": 2.0095367431640625, |
| "grad_norm": 0.0009717259090393782, |
| "learning_rate": 4.938983917236329e-06, |
| "lookahead_loss": 6.213238111972808, |
| "loss": 0.318, |
| "step": 472500 |
| }, |
| { |
| "base_loss": 0.30099570405483245, |
| "epoch": 2.0104904174804688, |
| "grad_norm": 0.0009312546462751925, |
| "learning_rate": 4.891300201416016e-06, |
| "lookahead_loss": 6.1899485034942625, |
| "loss": 0.3109, |
| "step": 473000 |
| }, |
| { |
| "base_loss": 0.30160990768671037, |
| "epoch": 2.011444091796875, |
| "grad_norm": 0.0010111306328326464, |
| "learning_rate": 4.843616485595703e-06, |
| "lookahead_loss": 6.192778927326202, |
| "loss": 0.3142, |
| "step": 473500 |
| }, |
| { |
| "base_loss": 0.32538792353868484, |
| "epoch": 2.0123977661132812, |
| "grad_norm": 0.000879990984685719, |
| "learning_rate": 4.795932769775391e-06, |
| "lookahead_loss": 6.183738801956177, |
| "loss": 0.3353, |
| "step": 474000 |
| }, |
| { |
| "base_loss": 0.3040602553486824, |
| "epoch": 2.0133514404296875, |
| "grad_norm": 0.0009244357934221625, |
| "learning_rate": 4.7482490539550784e-06, |
| "lookahead_loss": 6.276916295051575, |
| "loss": 0.3189, |
| "step": 474500 |
| }, |
| { |
| "base_loss": 0.29813345649838446, |
| "epoch": 2.0143051147460938, |
| "grad_norm": 0.0009381592972204089, |
| "learning_rate": 4.700565338134766e-06, |
| "lookahead_loss": 6.224869798183441, |
| "loss": 0.3115, |
| "step": 475000 |
| }, |
| { |
| "epoch": 2.0143051147460938, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.26585840036313, |
| "eval_lookahead_perplexity": 526.2931623566051, |
| "eval_loss": 0.14167073369026184, |
| "eval_perplexity": 1.1521972063494434, |
| "eval_runtime": 254.0716, |
| "eval_samples_per_second": 19.679, |
| "eval_steps_per_second": 0.618, |
| "step": 475000 |
| }, |
| { |
| "base_loss": 0.2960759684741497, |
| "epoch": 2.0152587890625, |
| "grad_norm": 0.0008980022976174951, |
| "learning_rate": 4.652881622314453e-06, |
| "lookahead_loss": 6.182446990966797, |
| "loss": 0.3066, |
| "step": 475500 |
| }, |
| { |
| "base_loss": 0.31211792075634004, |
| "epoch": 2.0162124633789062, |
| "grad_norm": 0.0009660014766268432, |
| "learning_rate": 4.605197906494141e-06, |
| "lookahead_loss": 6.232044881820679, |
| "loss": 0.3224, |
| "step": 476000 |
| }, |
| { |
| "base_loss": 0.31110167542099953, |
| "epoch": 2.0171661376953125, |
| "grad_norm": 0.0009189226548187435, |
| "learning_rate": 4.557514190673828e-06, |
| "lookahead_loss": 6.267304803848266, |
| "loss": 0.3221, |
| "step": 476500 |
| }, |
| { |
| "base_loss": 0.2990322083234787, |
| "epoch": 2.0181198120117188, |
| "grad_norm": 0.0009295094641856849, |
| "learning_rate": 4.509830474853516e-06, |
| "lookahead_loss": 6.280393055915832, |
| "loss": 0.3114, |
| "step": 477000 |
| }, |
| { |
| "base_loss": 0.29806812533736227, |
| "epoch": 2.019073486328125, |
| "grad_norm": 0.00097031140467152, |
| "learning_rate": 4.462146759033204e-06, |
| "lookahead_loss": 6.305790534496308, |
| "loss": 0.3094, |
| "step": 477500 |
| }, |
| { |
| "base_loss": 0.30187543269991873, |
| "epoch": 2.0200271606445312, |
| "grad_norm": 0.001022745855152607, |
| "learning_rate": 4.4144630432128904e-06, |
| "lookahead_loss": 6.14053142118454, |
| "loss": 0.3146, |
| "step": 478000 |
| }, |
| { |
| "base_loss": 0.32729279178380966, |
| "epoch": 2.0209808349609375, |
| "grad_norm": 0.0009685283876024187, |
| "learning_rate": 4.366779327392578e-06, |
| "lookahead_loss": 6.224147350311279, |
| "loss": 0.3372, |
| "step": 478500 |
| }, |
| { |
| "base_loss": 0.3057846530973911, |
| "epoch": 2.0219345092773438, |
| "grad_norm": 0.0009614306618459523, |
| "learning_rate": 4.319095611572266e-06, |
| "lookahead_loss": 6.187606465339661, |
| "loss": 0.314, |
| "step": 479000 |
| }, |
| { |
| "base_loss": 0.2997340569794178, |
| "epoch": 2.02288818359375, |
| "grad_norm": 0.000983657082542777, |
| "learning_rate": 4.2714118957519534e-06, |
| "lookahead_loss": 6.223736906528473, |
| "loss": 0.3113, |
| "step": 479500 |
| }, |
| { |
| "base_loss": 0.30260268279910085, |
| "epoch": 2.0238418579101562, |
| "grad_norm": 0.0009483291069045663, |
| "learning_rate": 4.223728179931641e-06, |
| "lookahead_loss": 6.179685834407806, |
| "loss": 0.3137, |
| "step": 480000 |
| }, |
| { |
| "epoch": 2.0238418579101562, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.264854770879776, |
| "eval_lookahead_perplexity": 525.7652239935444, |
| "eval_loss": 0.14166878163814545, |
| "eval_perplexity": 1.1521949572026435, |
| "eval_runtime": 266.1285, |
| "eval_samples_per_second": 18.788, |
| "eval_steps_per_second": 0.59, |
| "step": 480000 |
| }, |
| { |
| "base_loss": 0.3236656226217747, |
| "epoch": 2.0247955322265625, |
| "grad_norm": 0.0009004331659525633, |
| "learning_rate": 4.176044464111328e-06, |
| "lookahead_loss": 6.2046719169616695, |
| "loss": 0.3344, |
| "step": 480500 |
| }, |
| { |
| "base_loss": 0.30869458481669426, |
| "epoch": 2.0257492065429688, |
| "grad_norm": 0.0009720096713863313, |
| "learning_rate": 4.128360748291016e-06, |
| "lookahead_loss": 6.157517350673675, |
| "loss": 0.3224, |
| "step": 481000 |
| }, |
| { |
| "base_loss": 0.3019005296528339, |
| "epoch": 2.026702880859375, |
| "grad_norm": 0.0010002320632338524, |
| "learning_rate": 4.080677032470703e-06, |
| "lookahead_loss": 6.184016052246093, |
| "loss": 0.311, |
| "step": 481500 |
| }, |
| { |
| "base_loss": 0.3077106066644192, |
| "epoch": 2.0276565551757812, |
| "grad_norm": 0.0009557516314089298, |
| "learning_rate": 4.032993316650391e-06, |
| "lookahead_loss": 6.2857253398895265, |
| "loss": 0.318, |
| "step": 482000 |
| }, |
| { |
| "base_loss": 0.3280421564877033, |
| "epoch": 2.0286102294921875, |
| "grad_norm": 0.0009686889825388789, |
| "learning_rate": 3.985309600830079e-06, |
| "lookahead_loss": 6.307454082489014, |
| "loss": 0.3389, |
| "step": 482500 |
| }, |
| { |
| "base_loss": 0.30581475085020066, |
| "epoch": 2.0295639038085938, |
| "grad_norm": 0.0009459998109377921, |
| "learning_rate": 3.9376258850097654e-06, |
| "lookahead_loss": 6.254247172832489, |
| "loss": 0.3136, |
| "step": 483000 |
| }, |
| { |
| "base_loss": 0.3068877322375774, |
| "epoch": 2.030517578125, |
| "grad_norm": 0.0009705954580567777, |
| "learning_rate": 3.889942169189453e-06, |
| "lookahead_loss": 6.246685606956482, |
| "loss": 0.3162, |
| "step": 483500 |
| }, |
| { |
| "base_loss": 0.3014947620034218, |
| "epoch": 2.0314712524414062, |
| "grad_norm": 0.000960386183578521, |
| "learning_rate": 3.842258453369141e-06, |
| "lookahead_loss": 6.249851522445678, |
| "loss": 0.3138, |
| "step": 484000 |
| }, |
| { |
| "base_loss": 0.3173881909847259, |
| "epoch": 2.0324249267578125, |
| "grad_norm": 0.0009665894904173911, |
| "learning_rate": 3.7945747375488284e-06, |
| "lookahead_loss": 6.212476090431213, |
| "loss": 0.3333, |
| "step": 484500 |
| }, |
| { |
| "base_loss": 0.3059709269702435, |
| "epoch": 2.0333786010742188, |
| "grad_norm": 0.0009842559229582548, |
| "learning_rate": 3.7468910217285157e-06, |
| "lookahead_loss": 6.260224392414093, |
| "loss": 0.3146, |
| "step": 485000 |
| }, |
| { |
| "epoch": 2.0333786010742188, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.2641355762847315, |
| "eval_lookahead_perplexity": 525.3872324272244, |
| "eval_loss": 0.1416674256324768, |
| "eval_perplexity": 1.1521933948208094, |
| "eval_runtime": 264.2356, |
| "eval_samples_per_second": 18.923, |
| "eval_steps_per_second": 0.594, |
| "step": 485000 |
| }, |
| { |
| "base_loss": 0.3017133396565914, |
| "epoch": 2.034332275390625, |
| "grad_norm": 0.0009045092738233507, |
| "learning_rate": 3.6992073059082034e-06, |
| "lookahead_loss": 6.33234530544281, |
| "loss": 0.3141, |
| "step": 485500 |
| }, |
| { |
| "base_loss": 0.31134800574183463, |
| "epoch": 2.0352859497070312, |
| "grad_norm": 0.0010052913567051291, |
| "learning_rate": 3.6515235900878906e-06, |
| "lookahead_loss": 6.164672554492951, |
| "loss": 0.3224, |
| "step": 486000 |
| }, |
| { |
| "base_loss": 0.32387468561530114, |
| "epoch": 2.0362396240234375, |
| "grad_norm": 0.0009474267717450857, |
| "learning_rate": 3.6038398742675783e-06, |
| "lookahead_loss": 6.275379824161529, |
| "loss": 0.3362, |
| "step": 486500 |
| }, |
| { |
| "base_loss": 0.3080780008882284, |
| "epoch": 2.0371932983398438, |
| "grad_norm": 0.0009899679571390152, |
| "learning_rate": 3.556156158447266e-06, |
| "lookahead_loss": 6.207229743003845, |
| "loss": 0.3192, |
| "step": 487000 |
| }, |
| { |
| "base_loss": 0.30180328992009164, |
| "epoch": 2.03814697265625, |
| "grad_norm": 0.0009810588089749217, |
| "learning_rate": 3.508472442626953e-06, |
| "lookahead_loss": 6.2532482995986935, |
| "loss": 0.3137, |
| "step": 487500 |
| }, |
| { |
| "base_loss": 0.30689890575408935, |
| "epoch": 2.0391006469726562, |
| "grad_norm": 0.0009289110312238336, |
| "learning_rate": 3.460788726806641e-06, |
| "lookahead_loss": 6.209832691192627, |
| "loss": 0.3188, |
| "step": 488000 |
| }, |
| { |
| "base_loss": 0.32427770999073985, |
| "epoch": 2.0400543212890625, |
| "grad_norm": 0.0009330078610219061, |
| "learning_rate": 3.413105010986328e-06, |
| "lookahead_loss": 6.245529312610627, |
| "loss": 0.3331, |
| "step": 488500 |
| }, |
| { |
| "base_loss": 0.30682690465450285, |
| "epoch": 2.0410079956054688, |
| "grad_norm": 0.0009762721601873636, |
| "learning_rate": 3.3654212951660158e-06, |
| "lookahead_loss": 6.171029562950134, |
| "loss": 0.3156, |
| "step": 489000 |
| }, |
| { |
| "base_loss": 0.29654143354296686, |
| "epoch": 2.041961669921875, |
| "grad_norm": 0.0009487427887506783, |
| "learning_rate": 3.3177375793457034e-06, |
| "lookahead_loss": 6.259974523544312, |
| "loss": 0.3079, |
| "step": 489500 |
| }, |
| { |
| "base_loss": 0.30721216344833374, |
| "epoch": 2.0429153442382812, |
| "grad_norm": 0.0009788471506908536, |
| "learning_rate": 3.2700538635253907e-06, |
| "lookahead_loss": 6.27340632724762, |
| "loss": 0.323, |
| "step": 490000 |
| }, |
| { |
| "epoch": 2.0429153442382812, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.263471987300788, |
| "eval_lookahead_perplexity": 525.038706899121, |
| "eval_loss": 0.14166615903377533, |
| "eval_perplexity": 1.1521919354550758, |
| "eval_runtime": 253.9561, |
| "eval_samples_per_second": 19.688, |
| "eval_steps_per_second": 0.618, |
| "step": 490000 |
| }, |
| { |
| "base_loss": 0.32630143281817436, |
| "epoch": 2.0438690185546875, |
| "grad_norm": 0.0009800581028684974, |
| "learning_rate": 3.2223701477050784e-06, |
| "lookahead_loss": 6.298602212905884, |
| "loss": 0.3415, |
| "step": 490500 |
| }, |
| { |
| "base_loss": 0.296696748316288, |
| "epoch": 2.0448226928710938, |
| "grad_norm": 0.0009970470564439893, |
| "learning_rate": 3.1746864318847656e-06, |
| "lookahead_loss": 6.2160419683456425, |
| "loss": 0.3097, |
| "step": 491000 |
| }, |
| { |
| "base_loss": 0.30323311913013457, |
| "epoch": 2.0457763671875, |
| "grad_norm": 0.0009552966221235693, |
| "learning_rate": 3.1270027160644533e-06, |
| "lookahead_loss": 6.2349854435920715, |
| "loss": 0.3159, |
| "step": 491500 |
| }, |
| { |
| "base_loss": 0.32944888742268086, |
| "epoch": 2.0467300415039062, |
| "grad_norm": 0.0009680980583652854, |
| "learning_rate": 3.079319000244141e-06, |
| "lookahead_loss": 6.207026290416717, |
| "loss": 0.3393, |
| "step": 492000 |
| }, |
| { |
| "base_loss": 0.32393511798977853, |
| "epoch": 2.0476837158203125, |
| "grad_norm": 0.0009857366094365716, |
| "learning_rate": 3.031635284423828e-06, |
| "lookahead_loss": 6.246865995407105, |
| "loss": 0.3393, |
| "step": 492500 |
| }, |
| { |
| "base_loss": 0.293301939278841, |
| "epoch": 2.0486373901367188, |
| "grad_norm": 0.0009528042282909155, |
| "learning_rate": 2.983951568603516e-06, |
| "lookahead_loss": 6.2056601891517635, |
| "loss": 0.3056, |
| "step": 493000 |
| }, |
| { |
| "base_loss": 0.3036652799248695, |
| "epoch": 2.049591064453125, |
| "grad_norm": 0.001003090525045991, |
| "learning_rate": 2.936267852783203e-06, |
| "lookahead_loss": 6.183267019271851, |
| "loss": 0.3171, |
| "step": 493500 |
| }, |
| { |
| "base_loss": 0.3174412237107754, |
| "epoch": 2.0505447387695312, |
| "grad_norm": 0.0009027134510688484, |
| "learning_rate": 2.8885841369628908e-06, |
| "lookahead_loss": 6.298980396270752, |
| "loss": 0.332, |
| "step": 494000 |
| }, |
| { |
| "base_loss": 0.30474287942051886, |
| "epoch": 2.0514984130859375, |
| "grad_norm": 0.0009782238630577922, |
| "learning_rate": 2.8409004211425784e-06, |
| "lookahead_loss": 6.23814437866211, |
| "loss": 0.3178, |
| "step": 494500 |
| }, |
| { |
| "base_loss": 0.30692395463585853, |
| "epoch": 2.0524520874023438, |
| "grad_norm": 0.0010520197683945298, |
| "learning_rate": 2.7932167053222657e-06, |
| "lookahead_loss": 6.181493873119354, |
| "loss": 0.3177, |
| "step": 495000 |
| }, |
| { |
| "epoch": 2.0524520874023438, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.262974844192164, |
| "eval_lookahead_perplexity": 524.7777523954666, |
| "eval_loss": 0.14166510105133057, |
| "eval_perplexity": 1.15219071645688, |
| "eval_runtime": 270.937, |
| "eval_samples_per_second": 18.454, |
| "eval_steps_per_second": 0.579, |
| "step": 495000 |
| }, |
| { |
| "base_loss": 0.32256214889883994, |
| "epoch": 2.05340576171875, |
| "grad_norm": 0.0009561408660374582, |
| "learning_rate": 2.7455329895019534e-06, |
| "lookahead_loss": 6.20479358291626, |
| "loss": 0.3312, |
| "step": 495500 |
| }, |
| { |
| "base_loss": 0.3550116382241249, |
| "epoch": 2.0543594360351562, |
| "grad_norm": 0.0009577757446095347, |
| "learning_rate": 2.6978492736816406e-06, |
| "lookahead_loss": 6.231877175807953, |
| "loss": 0.3693, |
| "step": 496000 |
| }, |
| { |
| "base_loss": 0.2970747436285019, |
| "epoch": 2.0553131103515625, |
| "grad_norm": 0.0009534953278489411, |
| "learning_rate": 2.6501655578613283e-06, |
| "lookahead_loss": 6.23866082906723, |
| "loss": 0.308, |
| "step": 496500 |
| }, |
| { |
| "base_loss": 0.30645539990067483, |
| "epoch": 2.0562667846679688, |
| "grad_norm": 0.0009555872529745102, |
| "learning_rate": 2.602481842041016e-06, |
| "lookahead_loss": 6.246189908981323, |
| "loss": 0.3167, |
| "step": 497000 |
| }, |
| { |
| "base_loss": 0.31723022189736366, |
| "epoch": 2.057220458984375, |
| "grad_norm": 0.0009773544734343886, |
| "learning_rate": 2.554798126220703e-06, |
| "lookahead_loss": 6.266136578559875, |
| "loss": 0.3307, |
| "step": 497500 |
| }, |
| { |
| "base_loss": 0.3193240025639534, |
| "epoch": 2.0581741333007812, |
| "grad_norm": 0.0009717575740069151, |
| "learning_rate": 2.507114410400391e-06, |
| "lookahead_loss": 6.268920562744141, |
| "loss": 0.3271, |
| "step": 498000 |
| }, |
| { |
| "base_loss": 0.2937832759618759, |
| "epoch": 2.0591278076171875, |
| "grad_norm": 0.0009175707236863673, |
| "learning_rate": 2.459430694580078e-06, |
| "lookahead_loss": 6.168703609466553, |
| "loss": 0.3071, |
| "step": 498500 |
| }, |
| { |
| "base_loss": 0.30271227744221685, |
| "epoch": 2.0600814819335938, |
| "grad_norm": 0.0009538477752357721, |
| "learning_rate": 2.4117469787597658e-06, |
| "lookahead_loss": 6.238021997451782, |
| "loss": 0.3164, |
| "step": 499000 |
| }, |
| { |
| "base_loss": 0.3198817696869373, |
| "epoch": 2.06103515625, |
| "grad_norm": 0.0009791208431124687, |
| "learning_rate": 2.3640632629394534e-06, |
| "lookahead_loss": 6.194750837326049, |
| "loss": 0.3306, |
| "step": 499500 |
| }, |
| { |
| "base_loss": 0.3065698970258236, |
| "epoch": 2.0619888305664062, |
| "grad_norm": 0.0010103002423420548, |
| "learning_rate": 2.3163795471191407e-06, |
| "lookahead_loss": 6.2074076795578, |
| "loss": 0.3147, |
| "step": 500000 |
| }, |
| { |
| "epoch": 2.0619888305664062, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.26243155147321, |
| "eval_lookahead_perplexity": 524.4927218980288, |
| "eval_loss": 0.14166411757469177, |
| "eval_perplexity": 1.1521895833047842, |
| "eval_runtime": 292.5742, |
| "eval_samples_per_second": 17.09, |
| "eval_steps_per_second": 0.537, |
| "step": 500000 |
| }, |
| { |
| "base_loss": 0.30793745544552803, |
| "epoch": 2.0629425048828125, |
| "grad_norm": 0.0010036919265985489, |
| "learning_rate": 2.2686958312988284e-06, |
| "lookahead_loss": 6.206808748722076, |
| "loss": 0.3177, |
| "step": 500500 |
| }, |
| { |
| "base_loss": 0.3166033121049404, |
| "epoch": 2.0638961791992188, |
| "grad_norm": 0.0009552605915814638, |
| "learning_rate": 2.2210121154785156e-06, |
| "lookahead_loss": 6.2300395545959475, |
| "loss": 0.3283, |
| "step": 501000 |
| }, |
| { |
| "base_loss": 0.30278992640972135, |
| "epoch": 2.064849853515625, |
| "grad_norm": 0.0009494012338109314, |
| "learning_rate": 2.1733283996582033e-06, |
| "lookahead_loss": 6.2603726406097415, |
| "loss": 0.318, |
| "step": 501500 |
| }, |
| { |
| "base_loss": 0.30789859166741373, |
| "epoch": 2.0658035278320312, |
| "grad_norm": 0.000976825482212007, |
| "learning_rate": 2.125644683837891e-06, |
| "lookahead_loss": 6.1817445759773255, |
| "loss": 0.3194, |
| "step": 502000 |
| }, |
| { |
| "base_loss": 0.307515013217926, |
| "epoch": 2.0667572021484375, |
| "grad_norm": 0.000997724011540413, |
| "learning_rate": 2.077960968017578e-06, |
| "lookahead_loss": 6.173175088882446, |
| "loss": 0.3162, |
| "step": 502500 |
| }, |
| { |
| "base_loss": 0.3304452752768993, |
| "epoch": 2.0677108764648438, |
| "grad_norm": 0.0010043421061709523, |
| "learning_rate": 2.030277252197266e-06, |
| "lookahead_loss": 6.237773827552795, |
| "loss": 0.3422, |
| "step": 503000 |
| }, |
| { |
| "base_loss": 0.3000219973921776, |
| "epoch": 2.06866455078125, |
| "grad_norm": 0.0010214378125965595, |
| "learning_rate": 1.982593536376953e-06, |
| "lookahead_loss": 6.208347861766815, |
| "loss": 0.3092, |
| "step": 503500 |
| }, |
| { |
| "base_loss": 0.3050138043165207, |
| "epoch": 2.0696182250976562, |
| "grad_norm": 0.0009577918681316078, |
| "learning_rate": 1.9349098205566408e-06, |
| "lookahead_loss": 6.242258395195007, |
| "loss": 0.3164, |
| "step": 504000 |
| }, |
| { |
| "base_loss": 0.34709723374247553, |
| "epoch": 2.0705718994140625, |
| "grad_norm": 0.0009933137334883213, |
| "learning_rate": 1.8872261047363282e-06, |
| "lookahead_loss": 6.156369118213654, |
| "loss": 0.3596, |
| "step": 504500 |
| }, |
| { |
| "base_loss": 0.31454886627197265, |
| "epoch": 2.0715255737304688, |
| "grad_norm": 0.0009540682658553123, |
| "learning_rate": 1.8395423889160157e-06, |
| "lookahead_loss": 6.19591244506836, |
| "loss": 0.3257, |
| "step": 505000 |
| }, |
| { |
| "epoch": 2.0715255737304688, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.262022242378503, |
| "eval_lookahead_perplexity": 524.2780861860196, |
| "eval_loss": 0.1416633278131485, |
| "eval_perplexity": 1.15218867335012, |
| "eval_runtime": 273.5457, |
| "eval_samples_per_second": 18.278, |
| "eval_steps_per_second": 0.574, |
| "step": 505000 |
| }, |
| { |
| "base_loss": 0.30621468406915664, |
| "epoch": 2.072479248046875, |
| "grad_norm": 0.0009789029136300087, |
| "learning_rate": 1.7918586730957031e-06, |
| "lookahead_loss": 6.262781209945679, |
| "loss": 0.3179, |
| "step": 505500 |
| }, |
| { |
| "base_loss": 0.3062588813006878, |
| "epoch": 2.0734329223632812, |
| "grad_norm": 0.0009360404801554978, |
| "learning_rate": 1.7441749572753908e-06, |
| "lookahead_loss": 6.235173214435577, |
| "loss": 0.3185, |
| "step": 506000 |
| }, |
| { |
| "base_loss": 0.3277868445813656, |
| "epoch": 2.0743865966796875, |
| "grad_norm": 0.0009577349992468953, |
| "learning_rate": 1.6964912414550783e-06, |
| "lookahead_loss": 6.249401865959167, |
| "loss": 0.3401, |
| "step": 506500 |
| }, |
| { |
| "base_loss": 0.30303199696540833, |
| "epoch": 2.0753402709960938, |
| "grad_norm": 0.0009694884065538645, |
| "learning_rate": 1.6488075256347657e-06, |
| "lookahead_loss": 6.3263852491378785, |
| "loss": 0.3149, |
| "step": 507000 |
| }, |
| { |
| "base_loss": 0.30761926966905595, |
| "epoch": 2.0762939453125, |
| "grad_norm": 0.0009187961695715785, |
| "learning_rate": 1.6011238098144532e-06, |
| "lookahead_loss": 6.275495934486389, |
| "loss": 0.3183, |
| "step": 507500 |
| }, |
| { |
| "base_loss": 0.33150802648067473, |
| "epoch": 2.0772476196289062, |
| "grad_norm": 0.0009673857130110264, |
| "learning_rate": 1.5534400939941406e-06, |
| "lookahead_loss": 6.266810004234314, |
| "loss": 0.344, |
| "step": 508000 |
| }, |
| { |
| "base_loss": 0.30574921500682833, |
| "epoch": 2.0782012939453125, |
| "grad_norm": 0.0009754388011060655, |
| "learning_rate": 1.505756378173828e-06, |
| "lookahead_loss": 6.266697756290435, |
| "loss": 0.3171, |
| "step": 508500 |
| }, |
| { |
| "base_loss": 0.2994054418802261, |
| "epoch": 2.0791549682617188, |
| "grad_norm": 0.0009331432520411909, |
| "learning_rate": 1.4580726623535158e-06, |
| "lookahead_loss": 6.26380909538269, |
| "loss": 0.313, |
| "step": 509000 |
| }, |
| { |
| "base_loss": 0.31194803246855735, |
| "epoch": 2.080108642578125, |
| "grad_norm": 0.0009484157781116664, |
| "learning_rate": 1.4103889465332032e-06, |
| "lookahead_loss": 6.247877294540405, |
| "loss": 0.3291, |
| "step": 509500 |
| }, |
| { |
| "base_loss": 0.3244352611005306, |
| "epoch": 2.0810623168945312, |
| "grad_norm": 0.001009124331176281, |
| "learning_rate": 1.3627052307128907e-06, |
| "lookahead_loss": 6.271973398685455, |
| "loss": 0.3387, |
| "step": 510000 |
| }, |
| { |
| "epoch": 2.0810623168945312, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.261683738269745, |
| "eval_lookahead_perplexity": 524.1006459335351, |
| "eval_loss": 0.14166270196437836, |
| "eval_perplexity": 1.1521879522544816, |
| "eval_runtime": 285.7481, |
| "eval_samples_per_second": 17.498, |
| "eval_steps_per_second": 0.549, |
| "step": 510000 |
| }, |
| { |
| "base_loss": 0.30112930870056154, |
| "epoch": 2.0820159912109375, |
| "grad_norm": 0.0009189407574012876, |
| "learning_rate": 1.3150215148925781e-06, |
| "lookahead_loss": 6.277680154800415, |
| "loss": 0.3128, |
| "step": 510500 |
| }, |
| { |
| "base_loss": 0.30448419651389125, |
| "epoch": 2.0829696655273438, |
| "grad_norm": 0.000990754459053278, |
| "learning_rate": 1.2673377990722656e-06, |
| "lookahead_loss": 6.309187083244324, |
| "loss": 0.3177, |
| "step": 511000 |
| }, |
| { |
| "base_loss": 0.33415990057587625, |
| "epoch": 2.08392333984375, |
| "grad_norm": 0.0009419569978490472, |
| "learning_rate": 1.2196540832519533e-06, |
| "lookahead_loss": 6.3219421377182, |
| "loss": 0.3452, |
| "step": 511500 |
| }, |
| { |
| "base_loss": 0.31056174263358116, |
| "epoch": 2.0848770141601562, |
| "grad_norm": 0.0009886783082038164, |
| "learning_rate": 1.1719703674316407e-06, |
| "lookahead_loss": 6.2479847407341005, |
| "loss": 0.3206, |
| "step": 512000 |
| }, |
| { |
| "base_loss": 0.2973758824914694, |
| "epoch": 2.0858306884765625, |
| "grad_norm": 0.0010188610758632421, |
| "learning_rate": 1.1242866516113282e-06, |
| "lookahead_loss": 6.2457886896133425, |
| "loss": 0.3085, |
| "step": 512500 |
| }, |
| { |
| "base_loss": 0.3042152850329876, |
| "epoch": 2.0867843627929688, |
| "grad_norm": 0.0010978945065289736, |
| "learning_rate": 1.0766029357910156e-06, |
| "lookahead_loss": 6.219885497093201, |
| "loss": 0.3136, |
| "step": 513000 |
| }, |
| { |
| "base_loss": 0.337257578343153, |
| "epoch": 2.087738037109375, |
| "grad_norm": 0.0008794477325864136, |
| "learning_rate": 1.028919219970703e-06, |
| "lookahead_loss": 6.267465775966644, |
| "loss": 0.3438, |
| "step": 513500 |
| }, |
| { |
| "base_loss": 0.3000968562066555, |
| "epoch": 2.0886917114257812, |
| "grad_norm": 0.001002022996544838, |
| "learning_rate": 9.812355041503908e-07, |
| "lookahead_loss": 6.260487885951996, |
| "loss": 0.3112, |
| "step": 514000 |
| }, |
| { |
| "base_loss": 0.3103375973403454, |
| "epoch": 2.0896453857421875, |
| "grad_norm": 0.0009891856461763382, |
| "learning_rate": 9.335517883300781e-07, |
| "lookahead_loss": 6.296863189697266, |
| "loss": 0.3182, |
| "step": 514500 |
| }, |
| { |
| "base_loss": 0.29991284269094465, |
| "epoch": 2.0905990600585938, |
| "grad_norm": 0.000973585934843868, |
| "learning_rate": 8.858680725097657e-07, |
| "lookahead_loss": 6.271089879989624, |
| "loss": 0.3117, |
| "step": 515000 |
| }, |
| { |
| "epoch": 2.0905990600585938, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.261462169714248, |
| "eval_lookahead_perplexity": 523.9845345742689, |
| "eval_loss": 0.14166229963302612, |
| "eval_perplexity": 1.152187488693238, |
| "eval_runtime": 284.7414, |
| "eval_samples_per_second": 17.56, |
| "eval_steps_per_second": 0.551, |
| "step": 515000 |
| }, |
| { |
| "base_loss": 0.3000768061578274, |
| "epoch": 2.091552734375, |
| "grad_norm": 0.0008802941883914173, |
| "learning_rate": 8.381843566894531e-07, |
| "lookahead_loss": 6.234204215049743, |
| "loss": 0.3106, |
| "step": 515500 |
| }, |
| { |
| "base_loss": 0.31945454320311545, |
| "epoch": 2.0925064086914062, |
| "grad_norm": 0.000959145778324455, |
| "learning_rate": 7.905006408691407e-07, |
| "lookahead_loss": 6.28009091091156, |
| "loss": 0.333, |
| "step": 516000 |
| }, |
| { |
| "base_loss": 0.309114942163229, |
| "epoch": 2.0934600830078125, |
| "grad_norm": 0.0008794525056146085, |
| "learning_rate": 7.428169250488282e-07, |
| "lookahead_loss": 6.279258125305176, |
| "loss": 0.3194, |
| "step": 516500 |
| }, |
| { |
| "base_loss": 0.28753899577260017, |
| "epoch": 2.0944137573242188, |
| "grad_norm": 0.0009603716316632926, |
| "learning_rate": 6.951332092285156e-07, |
| "lookahead_loss": 6.265149411201477, |
| "loss": 0.3032, |
| "step": 517000 |
| }, |
| { |
| "base_loss": 0.29245217123627665, |
| "epoch": 2.095367431640625, |
| "grad_norm": 0.0009777392260730267, |
| "learning_rate": 6.474494934082032e-07, |
| "lookahead_loss": 6.201946850299835, |
| "loss": 0.3077, |
| "step": 517500 |
| }, |
| { |
| "base_loss": 0.30112256136536597, |
| "epoch": 2.0963211059570312, |
| "grad_norm": 0.0009416543180122972, |
| "learning_rate": 5.997657775878906e-07, |
| "lookahead_loss": 6.2732035398483275, |
| "loss": 0.3133, |
| "step": 518000 |
| }, |
| { |
| "base_loss": 0.3297825155258179, |
| "epoch": 2.0972747802734375, |
| "grad_norm": 0.0009741596295498312, |
| "learning_rate": 5.520820617675782e-07, |
| "lookahead_loss": 6.294767707824707, |
| "loss": 0.3408, |
| "step": 518500 |
| }, |
| { |
| "base_loss": 0.2911633634865284, |
| "epoch": 2.0982284545898438, |
| "grad_norm": 0.0009707529679872096, |
| "learning_rate": 5.043983459472657e-07, |
| "lookahead_loss": 6.226530519485474, |
| "loss": 0.306, |
| "step": 519000 |
| }, |
| { |
| "base_loss": 0.2934698580801487, |
| "epoch": 2.09918212890625, |
| "grad_norm": 0.0009795463411137462, |
| "learning_rate": 4.5671463012695317e-07, |
| "lookahead_loss": 6.274832231521606, |
| "loss": 0.3095, |
| "step": 519500 |
| }, |
| { |
| "base_loss": 0.3032768616080284, |
| "epoch": 2.1001358032226562, |
| "grad_norm": 0.0009279102087020874, |
| "learning_rate": 4.0903091430664063e-07, |
| "lookahead_loss": 6.28268619632721, |
| "loss": 0.3147, |
| "step": 520000 |
| }, |
| { |
| "epoch": 2.1001358032226562, |
| "eval_accuracy": 0.002520743639921722, |
| "eval_base_loss": 0.1298022617856725, |
| "eval_base_perplexity": 1.1386032156965338, |
| "eval_lookahead_loss": 6.261365200383976, |
| "eval_lookahead_perplexity": 523.9337266083253, |
| "eval_loss": 0.1416620910167694, |
| "eval_perplexity": 1.152187248328222, |
| "eval_runtime": 316.6586, |
| "eval_samples_per_second": 15.79, |
| "eval_steps_per_second": 0.496, |
| "step": 520000 |
| }, |
| { |
| "base_loss": 0.3233104472160339, |
| "epoch": 2.1010894775390625, |
| "grad_norm": 0.000958006305154413, |
| "learning_rate": 3.6134719848632814e-07, |
| "lookahead_loss": 6.3454251170158384, |
| "loss": 0.333, |
| "step": 520500 |
| }, |
| { |
| "base_loss": 0.30158160945773127, |
| "epoch": 2.1020431518554688, |
| "grad_norm": 0.0009491976234130561, |
| "learning_rate": 3.1366348266601565e-07, |
| "lookahead_loss": 6.288103145599365, |
| "loss": 0.3126, |
| "step": 521000 |
| }, |
| { |
| "base_loss": 0.29719595339894295, |
| "epoch": 2.102996826171875, |
| "grad_norm": 0.0009692716994322836, |
| "learning_rate": 2.6597976684570316e-07, |
| "lookahead_loss": 6.270690215110779, |
| "loss": 0.3125, |
| "step": 521500 |
| }, |
| { |
| "base_loss": 0.3008777514696121, |
| "epoch": 2.1039505004882812, |
| "grad_norm": 0.0009985940996557474, |
| "learning_rate": 2.1829605102539064e-07, |
| "lookahead_loss": 6.300080471038818, |
| "loss": 0.3114, |
| "step": 522000 |
| }, |
| { |
| "base_loss": 0.31517351168394087, |
| "epoch": 2.1049041748046875, |
| "grad_norm": 0.0009895325638353825, |
| "learning_rate": 1.7061233520507813e-07, |
| "lookahead_loss": 6.232696861267089, |
| "loss": 0.3275, |
| "step": 522500 |
| }, |
| { |
| "base_loss": 0.3079349631667137, |
| "epoch": 2.1058578491210938, |
| "grad_norm": 0.0009340654360130429, |
| "learning_rate": 1.2292861938476564e-07, |
| "lookahead_loss": 6.239760284423828, |
| "loss": 0.3204, |
| "step": 523000 |
| }, |
| { |
| "base_loss": 0.29828172570466993, |
| "epoch": 2.1068115234375, |
| "grad_norm": 0.0009318156517110765, |
| "learning_rate": 7.524490356445312e-08, |
| "lookahead_loss": 6.308520089626312, |
| "loss": 0.3082, |
| "step": 523500 |
| }, |
| { |
| "base_loss": 0.29354105108976364, |
| "epoch": 2.1077651977539062, |
| "grad_norm": 0.0009783627465367317, |
| "learning_rate": 2.7561187744140627e-08, |
| "lookahead_loss": 6.2301955571174625, |
| "loss": 0.3059, |
| "step": 524000 |
| }, |
| { |
| "epoch": 2.1083145141601562, |
| "step": 524288, |
| "total_flos": 4.036319640756106e+19, |
| "train_loss": 0.07301426184130833, |
| "train_runtime": 100630.1012, |
| "train_samples_per_second": 166.722, |
| "train_steps_per_second": 5.21 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 524288, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.036319640756106e+19, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|