{ "best_global_step": null, "best_metric": 5.2184648513793945, "best_model_checkpoint": null, "epoch": 1.04632568359375, "eval_steps": 5000, "global_step": 524288, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00095367431640625, "grad_norm": 1.3056761026382446, "learning_rate": 4.995241165161133e-05, "lookahead_loss": 80.39738775634765, "loss": 8.2932, "step": 500 }, { "epoch": 0.0019073486328125, "grad_norm": 1.2807952165603638, "learning_rate": 4.990472793579102e-05, "lookahead_loss": 66.65608617782593, "loss": 7.6771, "step": 1000 }, { "epoch": 0.00286102294921875, "grad_norm": 1.0663251876831055, "learning_rate": 4.98570442199707e-05, "lookahead_loss": 62.47020024108887, "loss": 7.3608, "step": 1500 }, { "epoch": 0.003814697265625, "grad_norm": 1.1057463884353638, "learning_rate": 4.9809360504150393e-05, "lookahead_loss": 59.82121521377564, "loss": 7.2033, "step": 2000 }, { "epoch": 0.00476837158203125, "grad_norm": 0.9191403985023499, "learning_rate": 4.9761676788330084e-05, "lookahead_loss": 58.27328967666626, "loss": 7.0473, "step": 2500 }, { "epoch": 0.0057220458984375, "grad_norm": 0.9611405730247498, "learning_rate": 4.971399307250977e-05, "lookahead_loss": 56.5085450630188, "loss": 6.9631, "step": 3000 }, { "epoch": 0.00667572021484375, "grad_norm": 1.0410672426223755, "learning_rate": 4.966630935668946e-05, "lookahead_loss": 53.816093334198, "loss": 6.8561, "step": 3500 }, { "epoch": 0.00762939453125, "grad_norm": 0.9675432443618774, "learning_rate": 4.961862564086914e-05, "lookahead_loss": 54.05169961547852, "loss": 6.789, "step": 4000 }, { "epoch": 0.00858306884765625, "grad_norm": 1.2447336912155151, "learning_rate": 4.957094192504883e-05, "lookahead_loss": 56.84773049163818, "loss": 6.6831, "step": 4500 }, { "epoch": 0.0095367431640625, "grad_norm": 1.0130715370178223, "learning_rate": 4.952325820922852e-05, "lookahead_loss": 72.43959342956543, "loss": 6.7799, "step": 5000 }, { "epoch": 0.0095367431640625, "eval_accuracy": 0.02766183953033268, "eval_lookahead_loss": 48.3491586643219, "eval_lookahead_perplexity": 9.948848386690328e+20, "eval_loss": 6.628754615783691, "eval_perplexity": 756.5394014791259, "eval_runtime": 487.3495, "eval_samples_per_second": 20.519, "eval_steps_per_second": 5.13, "step": 5000 }, { "epoch": 0.01049041748046875, "grad_norm": 1.0467580556869507, "learning_rate": 4.9475574493408205e-05, "lookahead_loss": 63.25316702270508, "loss": 6.6907, "step": 5500 }, { "epoch": 0.011444091796875, "grad_norm": 1.1341805458068848, "learning_rate": 4.9427890777587895e-05, "lookahead_loss": 59.4886450958252, "loss": 6.7035, "step": 6000 }, { "epoch": 0.01239776611328125, "grad_norm": 0.9798545241355896, "learning_rate": 4.938020706176758e-05, "lookahead_loss": 56.41143209075928, "loss": 6.6372, "step": 6500 }, { "epoch": 0.0133514404296875, "grad_norm": 1.0765999555587769, "learning_rate": 4.933252334594727e-05, "lookahead_loss": 54.48769342041015, "loss": 6.5861, "step": 7000 }, { "epoch": 0.01430511474609375, "grad_norm": 1.0987727642059326, "learning_rate": 4.928483963012696e-05, "lookahead_loss": 52.982984714508056, "loss": 6.5492, "step": 7500 }, { "epoch": 0.0152587890625, "grad_norm": 1.0358542203903198, "learning_rate": 4.923715591430664e-05, "lookahead_loss": 50.78885376739502, "loss": 6.5371, "step": 8000 }, { "epoch": 0.01621246337890625, "grad_norm": 1.1576260328292847, "learning_rate": 4.918947219848633e-05, "lookahead_loss": 48.45184814071655, "loss": 6.4595, "step": 8500 }, { "epoch": 0.0171661376953125, "grad_norm": 1.4743402004241943, "learning_rate": 4.9141788482666016e-05, "lookahead_loss": 46.44318388366699, "loss": 6.4064, "step": 9000 }, { "epoch": 0.01811981201171875, "grad_norm": 1.1851649284362793, "learning_rate": 4.9094104766845706e-05, "lookahead_loss": 43.854859851837155, "loss": 6.3933, "step": 9500 }, { "epoch": 0.019073486328125, "grad_norm": 1.195031762123108, "learning_rate": 4.9046421051025396e-05, "lookahead_loss": 40.898235008239745, "loss": 6.3701, "step": 10000 }, { "epoch": 0.019073486328125, "eval_accuracy": 0.029780039138943247, "eval_lookahead_loss": 29.94217044086456, "eval_lookahead_perplexity": 10086010098930.062, "eval_loss": 6.28956937789917, "eval_perplexity": 538.9212077272832, "eval_runtime": 495.5414, "eval_samples_per_second": 20.18, "eval_steps_per_second": 5.045, "step": 10000 }, { "epoch": 0.02002716064453125, "grad_norm": 1.4402297735214233, "learning_rate": 4.899873733520508e-05, "lookahead_loss": 38.44713633728027, "loss": 6.3026, "step": 10500 }, { "epoch": 0.0209808349609375, "grad_norm": 1.4548667669296265, "learning_rate": 4.895105361938477e-05, "lookahead_loss": 36.39043072509766, "loss": 6.3107, "step": 11000 }, { "epoch": 0.02193450927734375, "grad_norm": 1.708287239074707, "learning_rate": 4.890336990356445e-05, "lookahead_loss": 33.16651112365723, "loss": 6.3098, "step": 11500 }, { "epoch": 0.02288818359375, "grad_norm": 1.4410710334777832, "learning_rate": 4.8855686187744143e-05, "lookahead_loss": 32.12632596206665, "loss": 6.3473, "step": 12000 }, { "epoch": 0.02384185791015625, "grad_norm": 1.918511152267456, "learning_rate": 4.8808002471923834e-05, "lookahead_loss": 29.618786083221437, "loss": 6.2814, "step": 12500 }, { "epoch": 0.0247955322265625, "grad_norm": 1.889897346496582, "learning_rate": 4.876031875610352e-05, "lookahead_loss": 28.033683681488036, "loss": 6.2917, "step": 13000 }, { "epoch": 0.02574920654296875, "grad_norm": 1.8124333620071411, "learning_rate": 4.871263504028321e-05, "lookahead_loss": 26.352406589508057, "loss": 6.2104, "step": 13500 }, { "epoch": 0.026702880859375, "grad_norm": 2.1337523460388184, "learning_rate": 4.866495132446289e-05, "lookahead_loss": 24.596827478408812, "loss": 6.1926, "step": 14000 }, { "epoch": 0.02765655517578125, "grad_norm": 2.6559059619903564, "learning_rate": 4.861726760864258e-05, "lookahead_loss": 23.342848896026613, "loss": 6.2033, "step": 14500 }, { "epoch": 0.0286102294921875, "grad_norm": 2.839890480041504, "learning_rate": 4.856958389282227e-05, "lookahead_loss": 22.18355765914917, "loss": 6.1926, "step": 15000 }, { "epoch": 0.0286102294921875, "eval_accuracy": 0.030957534246575342, "eval_lookahead_loss": 17.429219709587098, "eval_lookahead_perplexity": 37103420.11018392, "eval_loss": 6.096866607666016, "eval_perplexity": 444.4629092280927, "eval_runtime": 521.324, "eval_samples_per_second": 19.182, "eval_steps_per_second": 4.795, "step": 15000 }, { "epoch": 0.02956390380859375, "grad_norm": 2.8206896781921387, "learning_rate": 4.8521900177001955e-05, "lookahead_loss": 21.063527490615844, "loss": 6.2064, "step": 15500 }, { "epoch": 0.030517578125, "grad_norm": 3.5090460777282715, "learning_rate": 4.8474216461181645e-05, "lookahead_loss": 19.730982110977173, "loss": 6.1413, "step": 16000 }, { "epoch": 0.03147125244140625, "grad_norm": 3.1577465534210205, "learning_rate": 4.842653274536133e-05, "lookahead_loss": 18.50304963684082, "loss": 6.1231, "step": 16500 }, { "epoch": 0.0324249267578125, "grad_norm": 2.909254312515259, "learning_rate": 4.837884902954102e-05, "lookahead_loss": 17.398171233177184, "loss": 6.0729, "step": 17000 }, { "epoch": 0.03337860107421875, "grad_norm": 4.339597702026367, "learning_rate": 4.833116531372071e-05, "lookahead_loss": 16.484822058677672, "loss": 6.1134, "step": 17500 }, { "epoch": 0.034332275390625, "grad_norm": 3.352776527404785, "learning_rate": 4.828348159790039e-05, "lookahead_loss": 15.538122650146484, "loss": 6.1474, "step": 18000 }, { "epoch": 0.03528594970703125, "grad_norm": 3.6056292057037354, "learning_rate": 4.823579788208008e-05, "lookahead_loss": 14.782153512954713, "loss": 6.088, "step": 18500 }, { "epoch": 0.0362396240234375, "grad_norm": 4.141178607940674, "learning_rate": 4.8188114166259766e-05, "lookahead_loss": 14.080382549285888, "loss": 6.0756, "step": 19000 }, { "epoch": 0.03719329833984375, "grad_norm": 5.789239406585693, "learning_rate": 4.8140430450439456e-05, "lookahead_loss": 13.14514859008789, "loss": 6.0534, "step": 19500 }, { "epoch": 0.03814697265625, "grad_norm": 4.247617244720459, "learning_rate": 4.8092746734619146e-05, "lookahead_loss": 12.56843734741211, "loss": 6.058, "step": 20000 }, { "epoch": 0.03814697265625, "eval_accuracy": 0.03115303326810176, "eval_lookahead_loss": 11.20961524925232, "eval_lookahead_perplexity": 73837.00068569763, "eval_loss": 5.970548152923584, "eval_perplexity": 391.7203345568312, "eval_runtime": 498.181, "eval_samples_per_second": 20.073, "eval_steps_per_second": 5.018, "step": 20000 }, { "epoch": 0.03910064697265625, "grad_norm": 5.5760884284973145, "learning_rate": 4.804506301879883e-05, "lookahead_loss": 11.657794467926026, "loss": 6.0591, "step": 20500 }, { "epoch": 0.0400543212890625, "grad_norm": 5.372546195983887, "learning_rate": 4.799737930297852e-05, "lookahead_loss": 11.471096613883972, "loss": 6.0614, "step": 21000 }, { "epoch": 0.04100799560546875, "grad_norm": 6.685136795043945, "learning_rate": 4.79496955871582e-05, "lookahead_loss": 10.636363464355469, "loss": 6.0248, "step": 21500 }, { "epoch": 0.041961669921875, "grad_norm": 6.8125410079956055, "learning_rate": 4.7902011871337893e-05, "lookahead_loss": 10.30396674156189, "loss": 6.0432, "step": 22000 }, { "epoch": 0.04291534423828125, "grad_norm": 7.968947410583496, "learning_rate": 4.7854328155517584e-05, "lookahead_loss": 10.147621893882752, "loss": 6.0055, "step": 22500 }, { "epoch": 0.0438690185546875, "grad_norm": 8.161097526550293, "learning_rate": 4.780664443969727e-05, "lookahead_loss": 9.941907609939575, "loss": 6.0023, "step": 23000 }, { "epoch": 0.04482269287109375, "grad_norm": 7.431755542755127, "learning_rate": 4.775896072387696e-05, "lookahead_loss": 9.715325316429139, "loss": 5.9733, "step": 23500 }, { "epoch": 0.0457763671875, "grad_norm": 10.322345733642578, "learning_rate": 4.771127700805664e-05, "lookahead_loss": 9.351762413024902, "loss": 6.0275, "step": 24000 }, { "epoch": 0.04673004150390625, "grad_norm": 6.5574445724487305, "learning_rate": 4.766359329223633e-05, "lookahead_loss": 9.41238141155243, "loss": 6.0368, "step": 24500 }, { "epoch": 0.0476837158203125, "grad_norm": 11.035201072692871, "learning_rate": 4.761590957641602e-05, "lookahead_loss": 9.015678495407105, "loss": 5.9483, "step": 25000 }, { "epoch": 0.0476837158203125, "eval_accuracy": 0.03183385518590998, "eval_lookahead_loss": 8.760862347221375, "eval_lookahead_perplexity": 6379.610646111682, "eval_loss": 5.898678302764893, "eval_perplexity": 364.555317551897, "eval_runtime": 499.1638, "eval_samples_per_second": 20.034, "eval_steps_per_second": 5.008, "step": 25000 }, { "epoch": 0.04863739013671875, "grad_norm": 8.716949462890625, "learning_rate": 4.7568225860595705e-05, "lookahead_loss": 8.916663691520691, "loss": 6.0036, "step": 25500 }, { "epoch": 0.049591064453125, "grad_norm": 14.857547760009766, "learning_rate": 4.7520542144775395e-05, "lookahead_loss": 8.943543824195862, "loss": 5.9687, "step": 26000 }, { "epoch": 0.05054473876953125, "grad_norm": 13.311576843261719, "learning_rate": 4.747285842895508e-05, "lookahead_loss": 8.790614643096923, "loss": 5.9579, "step": 26500 }, { "epoch": 0.0514984130859375, "grad_norm": 10.850611686706543, "learning_rate": 4.742517471313477e-05, "lookahead_loss": 8.777617733955383, "loss": 5.9713, "step": 27000 }, { "epoch": 0.05245208740234375, "grad_norm": 8.855813980102539, "learning_rate": 4.737749099731446e-05, "lookahead_loss": 8.904243718147278, "loss": 5.9605, "step": 27500 }, { "epoch": 0.05340576171875, "grad_norm": 11.445196151733398, "learning_rate": 4.732980728149414e-05, "lookahead_loss": 8.767489963531494, "loss": 5.9581, "step": 28000 }, { "epoch": 0.05435943603515625, "grad_norm": 14.010653495788574, "learning_rate": 4.728212356567383e-05, "lookahead_loss": 8.957100777626037, "loss": 5.9464, "step": 28500 }, { "epoch": 0.0553131103515625, "grad_norm": 10.872274398803711, "learning_rate": 4.7234439849853516e-05, "lookahead_loss": 8.747265585899353, "loss": 5.8926, "step": 29000 }, { "epoch": 0.05626678466796875, "grad_norm": 14.190624237060547, "learning_rate": 4.7186756134033206e-05, "lookahead_loss": 8.832529782295227, "loss": 5.9385, "step": 29500 }, { "epoch": 0.057220458984375, "grad_norm": 17.005949020385742, "learning_rate": 4.7139072418212896e-05, "lookahead_loss": 8.845139456748962, "loss": 5.8936, "step": 30000 }, { "epoch": 0.057220458984375, "eval_accuracy": 0.03169275929549902, "eval_lookahead_loss": 8.643145226097108, "eval_lookahead_perplexity": 5671.138816908125, "eval_loss": 5.910150527954102, "eval_perplexity": 368.76166016999963, "eval_runtime": 507.8086, "eval_samples_per_second": 19.692, "eval_steps_per_second": 4.923, "step": 30000 }, { "epoch": 0.05817413330078125, "grad_norm": 15.910021781921387, "learning_rate": 4.709138870239258e-05, "lookahead_loss": 8.676046948432923, "loss": 5.8869, "step": 30500 }, { "epoch": 0.0591278076171875, "grad_norm": 20.27501678466797, "learning_rate": 4.704370498657227e-05, "lookahead_loss": 8.74879408454895, "loss": 5.8644, "step": 31000 }, { "epoch": 0.06008148193359375, "grad_norm": 20.8532772064209, "learning_rate": 4.699602127075195e-05, "lookahead_loss": 8.865408114433288, "loss": 5.9963, "step": 31500 }, { "epoch": 0.06103515625, "grad_norm": 28.121856689453125, "learning_rate": 4.6948337554931643e-05, "lookahead_loss": 8.802754476547241, "loss": 5.9827, "step": 32000 }, { "epoch": 0.06198883056640625, "grad_norm": 10.530069351196289, "learning_rate": 4.6900653839111334e-05, "lookahead_loss": 8.693642028808593, "loss": 5.9413, "step": 32500 }, { "epoch": 0.0629425048828125, "grad_norm": 25.250747680664062, "learning_rate": 4.685297012329102e-05, "lookahead_loss": 8.587189674854278, "loss": 5.8735, "step": 33000 }, { "epoch": 0.06389617919921875, "grad_norm": 30.053342819213867, "learning_rate": 4.680528640747071e-05, "lookahead_loss": 8.775328702926636, "loss": 5.9872, "step": 33500 }, { "epoch": 0.064849853515625, "grad_norm": 18.317350387573242, "learning_rate": 4.675760269165039e-05, "lookahead_loss": 8.824234586715699, "loss": 5.9, "step": 34000 }, { "epoch": 0.06580352783203125, "grad_norm": 16.45642852783203, "learning_rate": 4.670991897583008e-05, "lookahead_loss": 8.737088046073914, "loss": 5.9483, "step": 34500 }, { "epoch": 0.0667572021484375, "grad_norm": 13.92780876159668, "learning_rate": 4.666223526000977e-05, "lookahead_loss": 8.798971945762634, "loss": 5.9237, "step": 35000 }, { "epoch": 0.0667572021484375, "eval_accuracy": 0.034218786692759295, "eval_lookahead_loss": 8.531277675056458, "eval_lookahead_perplexity": 5070.92068643502, "eval_loss": 5.8055548667907715, "eval_perplexity": 332.1394353665202, "eval_runtime": 493.1892, "eval_samples_per_second": 20.276, "eval_steps_per_second": 5.069, "step": 35000 }, { "epoch": 0.06771087646484375, "grad_norm": 21.991329193115234, "learning_rate": 4.6614551544189455e-05, "lookahead_loss": 8.59095177268982, "loss": 5.9189, "step": 35500 }, { "epoch": 0.06866455078125, "grad_norm": 36.610286712646484, "learning_rate": 4.6566867828369145e-05, "lookahead_loss": 8.691834009170533, "loss": 5.9127, "step": 36000 }, { "epoch": 0.06961822509765625, "grad_norm": 33.285057067871094, "learning_rate": 4.651918411254883e-05, "lookahead_loss": 8.622922178268432, "loss": 5.946, "step": 36500 }, { "epoch": 0.0705718994140625, "grad_norm": 31.65322494506836, "learning_rate": 4.647150039672852e-05, "lookahead_loss": 8.654987375259399, "loss": 5.873, "step": 37000 }, { "epoch": 0.07152557373046875, "grad_norm": 20.918020248413086, "learning_rate": 4.642381668090821e-05, "lookahead_loss": 8.627805239677429, "loss": 5.8953, "step": 37500 }, { "epoch": 0.072479248046875, "grad_norm": 20.8701229095459, "learning_rate": 4.637613296508789e-05, "lookahead_loss": 8.659117082595825, "loss": 5.8925, "step": 38000 }, { "epoch": 0.07343292236328125, "grad_norm": 24.38290786743164, "learning_rate": 4.632844924926758e-05, "lookahead_loss": 8.552610912322997, "loss": 5.857, "step": 38500 }, { "epoch": 0.0743865966796875, "grad_norm": 27.10965919494629, "learning_rate": 4.6280765533447266e-05, "lookahead_loss": 8.746970396995545, "loss": 5.9103, "step": 39000 }, { "epoch": 0.07534027099609375, "grad_norm": 24.493501663208008, "learning_rate": 4.6233081817626956e-05, "lookahead_loss": 8.708542342185973, "loss": 5.8758, "step": 39500 }, { "epoch": 0.0762939453125, "grad_norm": 67.41834259033203, "learning_rate": 4.6185398101806646e-05, "lookahead_loss": 8.52537347316742, "loss": 5.8761, "step": 40000 }, { "epoch": 0.0762939453125, "eval_accuracy": 0.034638747553816046, "eval_lookahead_loss": 8.38566984462738, "eval_lookahead_perplexity": 4383.794026980688, "eval_loss": 5.768317699432373, "eval_perplexity": 319.99894516609083, "eval_runtime": 508.5191, "eval_samples_per_second": 19.665, "eval_steps_per_second": 4.916, "step": 40000 }, { "epoch": 0.07724761962890625, "grad_norm": 27.38882827758789, "learning_rate": 4.613771438598633e-05, "lookahead_loss": 8.534793689727783, "loss": 5.8883, "step": 40500 }, { "epoch": 0.0782012939453125, "grad_norm": 21.599966049194336, "learning_rate": 4.609003067016602e-05, "lookahead_loss": 8.820077924728393, "loss": 5.8984, "step": 41000 }, { "epoch": 0.07915496826171875, "grad_norm": 31.059114456176758, "learning_rate": 4.60423469543457e-05, "lookahead_loss": 8.608491080284118, "loss": 5.8841, "step": 41500 }, { "epoch": 0.080108642578125, "grad_norm": 27.385765075683594, "learning_rate": 4.5994663238525393e-05, "lookahead_loss": 8.802984870910645, "loss": 5.8989, "step": 42000 }, { "epoch": 0.08106231689453125, "grad_norm": 103.62828063964844, "learning_rate": 4.5946979522705084e-05, "lookahead_loss": 8.5754426612854, "loss": 5.8485, "step": 42500 }, { "epoch": 0.0820159912109375, "grad_norm": 59.44806671142578, "learning_rate": 4.589929580688477e-05, "lookahead_loss": 8.820483881950379, "loss": 5.8997, "step": 43000 }, { "epoch": 0.08296966552734375, "grad_norm": 48.33935546875, "learning_rate": 4.585161209106446e-05, "lookahead_loss": 8.588190244674683, "loss": 5.8662, "step": 43500 }, { "epoch": 0.08392333984375, "grad_norm": 28.534992218017578, "learning_rate": 4.580392837524414e-05, "lookahead_loss": 8.733718086242677, "loss": 5.8894, "step": 44000 }, { "epoch": 0.08487701416015625, "grad_norm": 102.52527618408203, "learning_rate": 4.575624465942383e-05, "lookahead_loss": 8.816640190124511, "loss": 5.8741, "step": 44500 }, { "epoch": 0.0858306884765625, "grad_norm": 50.407569885253906, "learning_rate": 4.570856094360352e-05, "lookahead_loss": 8.781707544326782, "loss": 5.8407, "step": 45000 }, { "epoch": 0.0858306884765625, "eval_accuracy": 0.035340313111545986, "eval_lookahead_loss": 8.614317288398743, "eval_lookahead_perplexity": 5509.985598266144, "eval_loss": 5.758557319641113, "eval_perplexity": 316.89082681079816, "eval_runtime": 493.3962, "eval_samples_per_second": 20.268, "eval_steps_per_second": 5.067, "step": 45000 }, { "epoch": 0.08678436279296875, "grad_norm": 41.735107421875, "learning_rate": 4.5660877227783205e-05, "lookahead_loss": 8.752670993804932, "loss": 5.8717, "step": 45500 }, { "epoch": 0.087738037109375, "grad_norm": 32.98950958251953, "learning_rate": 4.5613193511962895e-05, "lookahead_loss": 8.599378199577332, "loss": 5.8836, "step": 46000 }, { "epoch": 0.08869171142578125, "grad_norm": 38.76126480102539, "learning_rate": 4.556550979614258e-05, "lookahead_loss": 8.445634421348572, "loss": 5.8289, "step": 46500 }, { "epoch": 0.0896453857421875, "grad_norm": 39.935543060302734, "learning_rate": 4.551782608032227e-05, "lookahead_loss": 8.488491415977478, "loss": 5.8636, "step": 47000 }, { "epoch": 0.09059906005859375, "grad_norm": 50.00627136230469, "learning_rate": 4.547014236450196e-05, "lookahead_loss": 8.42595292186737, "loss": 5.7684, "step": 47500 }, { "epoch": 0.091552734375, "grad_norm": 60.3950309753418, "learning_rate": 4.542245864868164e-05, "lookahead_loss": 8.605745649337768, "loss": 5.8107, "step": 48000 }, { "epoch": 0.09250640869140625, "grad_norm": 190.85903930664062, "learning_rate": 4.537477493286133e-05, "lookahead_loss": 8.649221048355102, "loss": 5.7591, "step": 48500 }, { "epoch": 0.0934600830078125, "grad_norm": 183.35992431640625, "learning_rate": 4.5327091217041016e-05, "lookahead_loss": 8.724970165252685, "loss": 5.8119, "step": 49000 }, { "epoch": 0.09441375732421875, "grad_norm": 87.01403045654297, "learning_rate": 4.5279407501220706e-05, "lookahead_loss": 9.013324408531188, "loss": 5.8426, "step": 49500 }, { "epoch": 0.095367431640625, "grad_norm": 83.90569305419922, "learning_rate": 4.523172378540039e-05, "lookahead_loss": 8.811200015068055, "loss": 5.9167, "step": 50000 }, { "epoch": 0.095367431640625, "eval_accuracy": 0.03540352250489237, "eval_lookahead_loss": 8.65797981300354, "eval_lookahead_perplexity": 5755.894925157666, "eval_loss": 5.7595930099487305, "eval_perplexity": 317.2191975845604, "eval_runtime": 512.9644, "eval_samples_per_second": 19.495, "eval_steps_per_second": 4.874, "step": 50000 }, { "epoch": 0.09632110595703125, "grad_norm": 34.10512924194336, "learning_rate": 4.518404006958008e-05, "lookahead_loss": 8.662215993881226, "loss": 5.9413, "step": 50500 }, { "epoch": 0.0972747802734375, "grad_norm": 57.223548889160156, "learning_rate": 4.513635635375977e-05, "lookahead_loss": 8.813905738830567, "loss": 5.8847, "step": 51000 }, { "epoch": 0.09822845458984375, "grad_norm": 53.162113189697266, "learning_rate": 4.508867263793945e-05, "lookahead_loss": 8.897481372833251, "loss": 5.9268, "step": 51500 }, { "epoch": 0.09918212890625, "grad_norm": 124.06058502197266, "learning_rate": 4.5040988922119143e-05, "lookahead_loss": 8.983036054611206, "loss": 5.8816, "step": 52000 }, { "epoch": 0.10013580322265625, "grad_norm": 92.73865509033203, "learning_rate": 4.499330520629883e-05, "lookahead_loss": 8.930865283966064, "loss": 5.943, "step": 52500 }, { "epoch": 0.1010894775390625, "grad_norm": 58.489749908447266, "learning_rate": 4.494562149047852e-05, "lookahead_loss": 9.098317293167113, "loss": 5.9178, "step": 53000 }, { "epoch": 0.10204315185546875, "grad_norm": 94.85649871826172, "learning_rate": 4.489793777465821e-05, "lookahead_loss": 9.46035364818573, "loss": 5.8577, "step": 53500 }, { "epoch": 0.102996826171875, "grad_norm": 162.58407592773438, "learning_rate": 4.485025405883789e-05, "lookahead_loss": 9.276899855613708, "loss": 5.9345, "step": 54000 }, { "epoch": 0.10395050048828125, "grad_norm": 80.6322250366211, "learning_rate": 4.480257034301758e-05, "lookahead_loss": 9.332964287757873, "loss": 5.8577, "step": 54500 }, { "epoch": 0.1049041748046875, "grad_norm": 77.14865112304688, "learning_rate": 4.4754886627197264e-05, "lookahead_loss": 9.498707158088685, "loss": 6.0003, "step": 55000 }, { "epoch": 0.1049041748046875, "eval_accuracy": 0.035851859099804304, "eval_lookahead_loss": 8.73584467163086, "eval_lookahead_perplexity": 6221.987519758647, "eval_loss": 5.799686908721924, "eval_perplexity": 330.196162185596, "eval_runtime": 521.2951, "eval_samples_per_second": 19.183, "eval_steps_per_second": 4.796, "step": 55000 }, { "epoch": 0.10585784912109375, "grad_norm": 62.42657470703125, "learning_rate": 4.4707202911376955e-05, "lookahead_loss": 8.894667556762695, "loss": 5.9778, "step": 55500 }, { "epoch": 0.1068115234375, "grad_norm": 150.36558532714844, "learning_rate": 4.4659519195556645e-05, "lookahead_loss": 9.041878464698792, "loss": 5.8986, "step": 56000 }, { "epoch": 0.10776519775390625, "grad_norm": 190.84130859375, "learning_rate": 4.461183547973633e-05, "lookahead_loss": 9.438038084983825, "loss": 5.9759, "step": 56500 }, { "epoch": 0.1087188720703125, "grad_norm": 117.12763214111328, "learning_rate": 4.456415176391602e-05, "lookahead_loss": 9.390923029899596, "loss": 5.9787, "step": 57000 }, { "epoch": 0.10967254638671875, "grad_norm": 93.06073760986328, "learning_rate": 4.45164680480957e-05, "lookahead_loss": 9.128630822181702, "loss": 5.9107, "step": 57500 }, { "epoch": 0.110626220703125, "grad_norm": 63.656436920166016, "learning_rate": 4.446878433227539e-05, "lookahead_loss": 8.939864095687867, "loss": 5.7037, "step": 58000 }, { "epoch": 0.11157989501953125, "grad_norm": 335.43505859375, "learning_rate": 4.442110061645508e-05, "lookahead_loss": 8.729833122253417, "loss": 5.9595, "step": 58500 }, { "epoch": 0.1125335693359375, "grad_norm": 90.5297622680664, "learning_rate": 4.4373416900634766e-05, "lookahead_loss": 8.692692355155945, "loss": 5.9648, "step": 59000 }, { "epoch": 0.11348724365234375, "grad_norm": 146.70401000976562, "learning_rate": 4.4325733184814456e-05, "lookahead_loss": 9.105256281852721, "loss": 5.9394, "step": 59500 }, { "epoch": 0.11444091796875, "grad_norm": 169.68324279785156, "learning_rate": 4.427804946899414e-05, "lookahead_loss": 9.118044589996337, "loss": 5.9179, "step": 60000 }, { "epoch": 0.11444091796875, "eval_accuracy": 0.03788512720156556, "eval_lookahead_loss": 8.697398168563844, "eval_lookahead_perplexity": 5987.313952373037, "eval_loss": 5.790381908416748, "eval_perplexity": 327.13793725348785, "eval_runtime": 497.8527, "eval_samples_per_second": 20.086, "eval_steps_per_second": 5.022, "step": 60000 }, { "epoch": 0.11539459228515625, "grad_norm": 77.94841766357422, "learning_rate": 4.423036575317383e-05, "lookahead_loss": 9.03482995414734, "loss": 5.9334, "step": 60500 }, { "epoch": 0.1163482666015625, "grad_norm": 94.758056640625, "learning_rate": 4.418268203735352e-05, "lookahead_loss": 8.781494087219238, "loss": 5.9389, "step": 61000 }, { "epoch": 0.11730194091796875, "grad_norm": 106.37804412841797, "learning_rate": 4.41349983215332e-05, "lookahead_loss": 8.71310696220398, "loss": 5.9192, "step": 61500 }, { "epoch": 0.118255615234375, "grad_norm": 56.514732360839844, "learning_rate": 4.4087314605712893e-05, "lookahead_loss": 8.607766095161438, "loss": 5.9676, "step": 62000 }, { "epoch": 0.11920928955078125, "grad_norm": 167.48855590820312, "learning_rate": 4.403963088989258e-05, "lookahead_loss": 8.72884268951416, "loss": 5.9143, "step": 62500 }, { "epoch": 0.1201629638671875, "grad_norm": 102.03301239013672, "learning_rate": 4.399194717407227e-05, "lookahead_loss": 8.980250079154969, "loss": 5.9852, "step": 63000 }, { "epoch": 0.12111663818359375, "grad_norm": 152.84397888183594, "learning_rate": 4.394426345825196e-05, "lookahead_loss": 8.589770315170288, "loss": 5.9468, "step": 63500 }, { "epoch": 0.1220703125, "grad_norm": 110.2666015625, "learning_rate": 4.389657974243164e-05, "lookahead_loss": 8.763785968780518, "loss": 5.9014, "step": 64000 }, { "epoch": 0.12302398681640625, "grad_norm": 77.3198013305664, "learning_rate": 4.384889602661133e-05, "lookahead_loss": 8.867919365882873, "loss": 5.9708, "step": 64500 }, { "epoch": 0.1239776611328125, "grad_norm": 61.512916564941406, "learning_rate": 4.3801212310791014e-05, "lookahead_loss": 8.630557582855225, "loss": 5.9174, "step": 65000 }, { "epoch": 0.1239776611328125, "eval_accuracy": 0.03844520547945206, "eval_lookahead_loss": 8.541237368392943, "eval_lookahead_perplexity": 5121.677844698275, "eval_loss": 5.808926105499268, "eval_perplexity": 333.2610462343505, "eval_runtime": 510.2124, "eval_samples_per_second": 19.6, "eval_steps_per_second": 4.9, "step": 65000 }, { "epoch": 0.12493133544921875, "grad_norm": 258.09722900390625, "learning_rate": 4.3753528594970705e-05, "lookahead_loss": 8.702727296829224, "loss": 5.8833, "step": 65500 }, { "epoch": 0.125885009765625, "grad_norm": 137.35374450683594, "learning_rate": 4.3705844879150395e-05, "lookahead_loss": 8.855953419685363, "loss": 5.9011, "step": 66000 }, { "epoch": 0.12683868408203125, "grad_norm": 129.56959533691406, "learning_rate": 4.365816116333008e-05, "lookahead_loss": 9.062326998710633, "loss": 5.899, "step": 66500 }, { "epoch": 0.1277923583984375, "grad_norm": 67.58332061767578, "learning_rate": 4.361047744750977e-05, "lookahead_loss": 8.786095474243163, "loss": 5.9366, "step": 67000 }, { "epoch": 0.12874603271484375, "grad_norm": 201.39047241210938, "learning_rate": 4.356279373168945e-05, "lookahead_loss": 8.842333135604859, "loss": 5.9812, "step": 67500 }, { "epoch": 0.12969970703125, "grad_norm": 430.5280456542969, "learning_rate": 4.351511001586914e-05, "lookahead_loss": 9.000409759521485, "loss": 6.0923, "step": 68000 }, { "epoch": 0.13065338134765625, "grad_norm": 225.57858276367188, "learning_rate": 4.346742630004883e-05, "lookahead_loss": 9.180291424751282, "loss": 6.1033, "step": 68500 }, { "epoch": 0.1316070556640625, "grad_norm": 196.08851623535156, "learning_rate": 4.3419742584228516e-05, "lookahead_loss": 8.709693765640258, "loss": 6.036, "step": 69000 }, { "epoch": 0.13256072998046875, "grad_norm": 109.48348999023438, "learning_rate": 4.3372058868408206e-05, "lookahead_loss": 9.069932061195374, "loss": 6.0102, "step": 69500 }, { "epoch": 0.133514404296875, "grad_norm": 240.88931274414062, "learning_rate": 4.332437515258789e-05, "lookahead_loss": 8.664248411178589, "loss": 5.9954, "step": 70000 }, { "epoch": 0.133514404296875, "eval_accuracy": 0.039309001956947164, "eval_lookahead_loss": 8.51489791355133, "eval_lookahead_perplexity": 4988.536771355592, "eval_loss": 5.840917587280273, "eval_perplexity": 344.09493298478753, "eval_runtime": 490.8206, "eval_samples_per_second": 20.374, "eval_steps_per_second": 5.094, "step": 70000 }, { "epoch": 0.13446807861328125, "grad_norm": 591.8555297851562, "learning_rate": 4.327669143676758e-05, "lookahead_loss": 8.821753074645995, "loss": 6.0136, "step": 70500 }, { "epoch": 0.1354217529296875, "grad_norm": 274.14794921875, "learning_rate": 4.322900772094727e-05, "lookahead_loss": 8.955734300613404, "loss": 6.0117, "step": 71000 }, { "epoch": 0.13637542724609375, "grad_norm": 137.9414520263672, "learning_rate": 4.318132400512695e-05, "lookahead_loss": 9.109353039741515, "loss": 5.9762, "step": 71500 }, { "epoch": 0.1373291015625, "grad_norm": 190.22702026367188, "learning_rate": 4.3133640289306643e-05, "lookahead_loss": 8.929817812919616, "loss": 5.998, "step": 72000 }, { "epoch": 0.13828277587890625, "grad_norm": 73.93792724609375, "learning_rate": 4.308595657348633e-05, "lookahead_loss": 8.980986825942994, "loss": 6.0097, "step": 72500 }, { "epoch": 0.1392364501953125, "grad_norm": 141.13626098632812, "learning_rate": 4.303827285766602e-05, "lookahead_loss": 8.522379638671875, "loss": 5.987, "step": 73000 }, { "epoch": 0.14019012451171875, "grad_norm": 181.7722625732422, "learning_rate": 4.299058914184571e-05, "lookahead_loss": 8.887932646751404, "loss": 6.0183, "step": 73500 }, { "epoch": 0.141143798828125, "grad_norm": 356.2704162597656, "learning_rate": 4.294290542602539e-05, "lookahead_loss": 8.78232068157196, "loss": 5.9868, "step": 74000 }, { "epoch": 0.14209747314453125, "grad_norm": 261.717529296875, "learning_rate": 4.289522171020508e-05, "lookahead_loss": 8.88387086391449, "loss": 5.9578, "step": 74500 }, { "epoch": 0.1430511474609375, "grad_norm": 304.59661865234375, "learning_rate": 4.2847537994384764e-05, "lookahead_loss": 8.975398461341857, "loss": 6.0362, "step": 75000 }, { "epoch": 0.1430511474609375, "eval_accuracy": 0.04000978473581213, "eval_lookahead_loss": 8.379310646438599, "eval_lookahead_perplexity": 4356.00506335526, "eval_loss": 5.870528697967529, "eval_perplexity": 354.43632050246913, "eval_runtime": 491.9958, "eval_samples_per_second": 20.325, "eval_steps_per_second": 5.081, "step": 75000 }, { "epoch": 0.14400482177734375, "grad_norm": 179.32708740234375, "learning_rate": 4.2799854278564455e-05, "lookahead_loss": 8.664676836013793, "loss": 6.0262, "step": 75500 }, { "epoch": 0.14495849609375, "grad_norm": 120.44501495361328, "learning_rate": 4.2752170562744145e-05, "lookahead_loss": 8.505344146728516, "loss": 5.9964, "step": 76000 }, { "epoch": 0.14591217041015625, "grad_norm": 97.7379379272461, "learning_rate": 4.270448684692383e-05, "lookahead_loss": 8.67644682407379, "loss": 6.0547, "step": 76500 }, { "epoch": 0.1468658447265625, "grad_norm": 49.137752532958984, "learning_rate": 4.265680313110352e-05, "lookahead_loss": 8.623183971405028, "loss": 5.9986, "step": 77000 }, { "epoch": 0.14781951904296875, "grad_norm": 133.72708129882812, "learning_rate": 4.26091194152832e-05, "lookahead_loss": 8.350838703155517, "loss": 5.9716, "step": 77500 }, { "epoch": 0.148773193359375, "grad_norm": 242.15216064453125, "learning_rate": 4.256143569946289e-05, "lookahead_loss": 8.39862640285492, "loss": 5.9662, "step": 78000 }, { "epoch": 0.14972686767578125, "grad_norm": 199.80955505371094, "learning_rate": 4.251375198364258e-05, "lookahead_loss": 8.453930419921875, "loss": 5.9593, "step": 78500 }, { "epoch": 0.1506805419921875, "grad_norm": 85.46812438964844, "learning_rate": 4.2466068267822266e-05, "lookahead_loss": 8.478532793045044, "loss": 5.9639, "step": 79000 }, { "epoch": 0.15163421630859375, "grad_norm": 988.5408935546875, "learning_rate": 4.2418384552001956e-05, "lookahead_loss": 8.451021835327149, "loss": 5.9353, "step": 79500 }, { "epoch": 0.152587890625, "grad_norm": 89.64875030517578, "learning_rate": 4.237070083618164e-05, "lookahead_loss": 8.364713426589965, "loss": 5.8366, "step": 80000 }, { "epoch": 0.152587890625, "eval_accuracy": 0.03932289628180039, "eval_lookahead_loss": 8.227179842376708, "eval_lookahead_perplexity": 3741.2678952264614, "eval_loss": 5.812952518463135, "eval_perplexity": 334.6055978748494, "eval_runtime": 492.245, "eval_samples_per_second": 20.315, "eval_steps_per_second": 5.079, "step": 80000 }, { "epoch": 0.15354156494140625, "grad_norm": 143.27752685546875, "learning_rate": 4.232301712036133e-05, "lookahead_loss": 8.54583135509491, "loss": 5.9852, "step": 80500 }, { "epoch": 0.1544952392578125, "grad_norm": 233.29669189453125, "learning_rate": 4.227533340454102e-05, "lookahead_loss": 8.570575041770935, "loss": 5.9513, "step": 81000 }, { "epoch": 0.15544891357421875, "grad_norm": 234.28756713867188, "learning_rate": 4.22276496887207e-05, "lookahead_loss": 8.680360085487365, "loss": 6.0038, "step": 81500 }, { "epoch": 0.156402587890625, "grad_norm": 178.49525451660156, "learning_rate": 4.2179965972900393e-05, "lookahead_loss": 8.54448090839386, "loss": 5.9944, "step": 82000 }, { "epoch": 0.15735626220703125, "grad_norm": 84.13599395751953, "learning_rate": 4.213228225708008e-05, "lookahead_loss": 8.290419595718383, "loss": 5.9032, "step": 82500 }, { "epoch": 0.1583099365234375, "grad_norm": 144.843994140625, "learning_rate": 4.208459854125977e-05, "lookahead_loss": 8.417510450363158, "loss": 5.8554, "step": 83000 }, { "epoch": 0.15926361083984375, "grad_norm": 127.59434509277344, "learning_rate": 4.203691482543946e-05, "lookahead_loss": 8.55427560520172, "loss": 5.8462, "step": 83500 }, { "epoch": 0.16021728515625, "grad_norm": 61.06243133544922, "learning_rate": 4.198923110961914e-05, "lookahead_loss": 8.34434042263031, "loss": 5.8574, "step": 84000 }, { "epoch": 0.16117095947265625, "grad_norm": 342.4031066894531, "learning_rate": 4.194154739379883e-05, "lookahead_loss": 8.480031717300415, "loss": 5.9171, "step": 84500 }, { "epoch": 0.1621246337890625, "grad_norm": 166.8557586669922, "learning_rate": 4.1893863677978514e-05, "lookahead_loss": 8.558206010818482, "loss": 6.0205, "step": 85000 }, { "epoch": 0.1621246337890625, "eval_accuracy": 0.03916477495107632, "eval_lookahead_loss": 8.577949367332458, "eval_lookahead_perplexity": 5313.198927127469, "eval_loss": 5.798160552978516, "eval_perplexity": 329.69254982058027, "eval_runtime": 489.147, "eval_samples_per_second": 20.444, "eval_steps_per_second": 5.111, "step": 85000 }, { "epoch": 0.16307830810546875, "grad_norm": 239.66050720214844, "learning_rate": 4.1846179962158205e-05, "lookahead_loss": 8.602962774276733, "loss": 6.0133, "step": 85500 }, { "epoch": 0.164031982421875, "grad_norm": 266.0901184082031, "learning_rate": 4.1798496246337895e-05, "lookahead_loss": 8.339720858573914, "loss": 5.9745, "step": 86000 }, { "epoch": 0.16498565673828125, "grad_norm": 138.05540466308594, "learning_rate": 4.175081253051758e-05, "lookahead_loss": 8.560099305152892, "loss": 5.9717, "step": 86500 }, { "epoch": 0.1659393310546875, "grad_norm": 506.8746337890625, "learning_rate": 4.170312881469727e-05, "lookahead_loss": 8.301853426933288, "loss": 5.9886, "step": 87000 }, { "epoch": 0.16689300537109375, "grad_norm": 275.16314697265625, "learning_rate": 4.165544509887695e-05, "lookahead_loss": 8.508106866836547, "loss": 5.9247, "step": 87500 }, { "epoch": 0.1678466796875, "grad_norm": 149.2919464111328, "learning_rate": 4.160776138305664e-05, "lookahead_loss": 8.488696005821229, "loss": 5.96, "step": 88000 }, { "epoch": 0.16880035400390625, "grad_norm": 211.85987854003906, "learning_rate": 4.156007766723633e-05, "lookahead_loss": 8.815311118125916, "loss": 5.9528, "step": 88500 }, { "epoch": 0.1697540283203125, "grad_norm": 204.696533203125, "learning_rate": 4.1512393951416016e-05, "lookahead_loss": 9.006188857078552, "loss": 5.9527, "step": 89000 }, { "epoch": 0.17070770263671875, "grad_norm": 7534.8779296875, "learning_rate": 4.1464710235595706e-05, "lookahead_loss": 8.903898489952088, "loss": 6.0016, "step": 89500 }, { "epoch": 0.171661376953125, "grad_norm": 274.6275634765625, "learning_rate": 4.141702651977539e-05, "lookahead_loss": 8.663808915138244, "loss": 5.973, "step": 90000 }, { "epoch": 0.171661376953125, "eval_accuracy": 0.03982348336594912, "eval_lookahead_loss": 8.863761722755433, "eval_lookahead_perplexity": 7071.032039842666, "eval_loss": 5.844636917114258, "eval_perplexity": 345.3771184920824, "eval_runtime": 496.6961, "eval_samples_per_second": 20.133, "eval_steps_per_second": 5.033, "step": 90000 }, { "epoch": 0.17261505126953125, "grad_norm": 565.8211059570312, "learning_rate": 4.136934280395508e-05, "lookahead_loss": 8.778876346588135, "loss": 5.9844, "step": 90500 }, { "epoch": 0.1735687255859375, "grad_norm": 260.30975341796875, "learning_rate": 4.132165908813477e-05, "lookahead_loss": 8.375056875228882, "loss": 5.9378, "step": 91000 }, { "epoch": 0.17452239990234375, "grad_norm": 396.7339782714844, "learning_rate": 4.127397537231445e-05, "lookahead_loss": 8.52590683746338, "loss": 5.9886, "step": 91500 }, { "epoch": 0.17547607421875, "grad_norm": 173.97508239746094, "learning_rate": 4.1226291656494143e-05, "lookahead_loss": 8.35165282535553, "loss": 5.9546, "step": 92000 }, { "epoch": 0.17642974853515625, "grad_norm": 74.57556915283203, "learning_rate": 4.117860794067383e-05, "lookahead_loss": 8.475467248916626, "loss": 5.934, "step": 92500 }, { "epoch": 0.1773834228515625, "grad_norm": 89.71018981933594, "learning_rate": 4.113092422485352e-05, "lookahead_loss": 8.406667669296265, "loss": 5.9699, "step": 93000 }, { "epoch": 0.17833709716796875, "grad_norm": 75.69933319091797, "learning_rate": 4.108324050903321e-05, "lookahead_loss": 8.254762513160706, "loss": 5.976, "step": 93500 }, { "epoch": 0.179290771484375, "grad_norm": 100.19921112060547, "learning_rate": 4.103555679321289e-05, "lookahead_loss": 8.335499359130859, "loss": 6.0024, "step": 94000 }, { "epoch": 0.18024444580078125, "grad_norm": 122.08023834228516, "learning_rate": 4.098787307739258e-05, "lookahead_loss": 8.403356021881104, "loss": 5.9131, "step": 94500 }, { "epoch": 0.1811981201171875, "grad_norm": 62.658103942871094, "learning_rate": 4.0940189361572264e-05, "lookahead_loss": 8.307968511581421, "loss": 5.9258, "step": 95000 }, { "epoch": 0.1811981201171875, "eval_accuracy": 0.03995968688845401, "eval_lookahead_loss": 8.162174356269837, "eval_lookahead_perplexity": 3505.801183142459, "eval_loss": 5.773275852203369, "eval_perplexity": 321.58948864315954, "eval_runtime": 499.7755, "eval_samples_per_second": 20.009, "eval_steps_per_second": 5.002, "step": 95000 }, { "epoch": 0.18215179443359375, "grad_norm": 84.90938568115234, "learning_rate": 4.0892505645751955e-05, "lookahead_loss": 8.207753345489502, "loss": 5.8814, "step": 95500 }, { "epoch": 0.18310546875, "grad_norm": 94.45681762695312, "learning_rate": 4.0844821929931645e-05, "lookahead_loss": 8.139867059707642, "loss": 5.9053, "step": 96000 }, { "epoch": 0.18405914306640625, "grad_norm": 104.47096252441406, "learning_rate": 4.079713821411133e-05, "lookahead_loss": 8.308179296493531, "loss": 5.8973, "step": 96500 }, { "epoch": 0.1850128173828125, "grad_norm": 84.75871276855469, "learning_rate": 4.074945449829102e-05, "lookahead_loss": 8.258435565948487, "loss": 5.9105, "step": 97000 }, { "epoch": 0.18596649169921875, "grad_norm": 105.67626953125, "learning_rate": 4.07017707824707e-05, "lookahead_loss": 8.376761375427247, "loss": 5.8652, "step": 97500 }, { "epoch": 0.186920166015625, "grad_norm": 69.76514434814453, "learning_rate": 4.065408706665039e-05, "lookahead_loss": 8.302103631973267, "loss": 5.8791, "step": 98000 }, { "epoch": 0.18787384033203125, "grad_norm": 141.76950073242188, "learning_rate": 4.060640335083008e-05, "lookahead_loss": 8.385335474014282, "loss": 5.8638, "step": 98500 }, { "epoch": 0.1888275146484375, "grad_norm": 127.81639862060547, "learning_rate": 4.0558719635009766e-05, "lookahead_loss": 8.444834937095642, "loss": 5.8673, "step": 99000 }, { "epoch": 0.18978118896484375, "grad_norm": 142.02383422851562, "learning_rate": 4.0511035919189456e-05, "lookahead_loss": 8.431948372840882, "loss": 5.8171, "step": 99500 }, { "epoch": 0.19073486328125, "grad_norm": 75.32522583007812, "learning_rate": 4.046335220336914e-05, "lookahead_loss": 8.610531043052672, "loss": 5.8746, "step": 100000 }, { "epoch": 0.19073486328125, "eval_accuracy": 0.04027573385518591, "eval_lookahead_loss": 8.25171227722168, "eval_lookahead_perplexity": 3834.1853922462456, "eval_loss": 5.757181644439697, "eval_perplexity": 316.4551876764785, "eval_runtime": 493.9439, "eval_samples_per_second": 20.245, "eval_steps_per_second": 5.061, "step": 100000 }, { "epoch": 0.19168853759765625, "grad_norm": 148.36856079101562, "learning_rate": 4.041566848754883e-05, "lookahead_loss": 8.445461787223817, "loss": 5.8346, "step": 100500 }, { "epoch": 0.1926422119140625, "grad_norm": 92.01062774658203, "learning_rate": 4.036798477172852e-05, "lookahead_loss": 8.501290910720826, "loss": 5.9143, "step": 101000 }, { "epoch": 0.19359588623046875, "grad_norm": 117.37158203125, "learning_rate": 4.03203010559082e-05, "lookahead_loss": 8.476218932151795, "loss": 6.0285, "step": 101500 }, { "epoch": 0.194549560546875, "grad_norm": 215.9250030517578, "learning_rate": 4.0272617340087893e-05, "lookahead_loss": 8.316578735351563, "loss": 5.9491, "step": 102000 }, { "epoch": 0.19550323486328125, "grad_norm": 114.09319305419922, "learning_rate": 4.022493362426758e-05, "lookahead_loss": 8.372049181938172, "loss": 5.9942, "step": 102500 }, { "epoch": 0.1964569091796875, "grad_norm": 141.19760131835938, "learning_rate": 4.017724990844727e-05, "lookahead_loss": 8.273028931617738, "loss": 5.9434, "step": 103000 }, { "epoch": 0.19741058349609375, "grad_norm": 188.97369384765625, "learning_rate": 4.012956619262696e-05, "lookahead_loss": 8.3159986038208, "loss": 5.913, "step": 103500 }, { "epoch": 0.1983642578125, "grad_norm": 138.5894775390625, "learning_rate": 4.008188247680664e-05, "lookahead_loss": 8.419611748695374, "loss": 5.9035, "step": 104000 }, { "epoch": 0.19931793212890625, "grad_norm": 182.44581604003906, "learning_rate": 4.003419876098633e-05, "lookahead_loss": 8.58366820526123, "loss": 5.8969, "step": 104500 }, { "epoch": 0.2002716064453125, "grad_norm": 114.81692504882812, "learning_rate": 3.9986515045166014e-05, "lookahead_loss": 8.354695693969727, "loss": 5.9069, "step": 105000 }, { "epoch": 0.2002716064453125, "eval_accuracy": 0.040168688845401174, "eval_lookahead_loss": 8.241800089454651, "eval_lookahead_perplexity": 3796.3679630684564, "eval_loss": 5.758640289306641, "eval_perplexity": 316.9171202274702, "eval_runtime": 493.0106, "eval_samples_per_second": 20.284, "eval_steps_per_second": 5.071, "step": 105000 }, { "epoch": 0.20122528076171875, "grad_norm": 110.94558715820312, "learning_rate": 3.9938831329345705e-05, "lookahead_loss": 8.208633882522584, "loss": 5.8878, "step": 105500 }, { "epoch": 0.202178955078125, "grad_norm": 190.33078002929688, "learning_rate": 3.9891147613525395e-05, "lookahead_loss": 8.334890480995178, "loss": 5.8797, "step": 106000 }, { "epoch": 0.20313262939453125, "grad_norm": 102.24852752685547, "learning_rate": 3.984346389770508e-05, "lookahead_loss": 8.191088889122009, "loss": 5.8614, "step": 106500 }, { "epoch": 0.2040863037109375, "grad_norm": 118.60765075683594, "learning_rate": 3.979578018188477e-05, "lookahead_loss": 8.161592358589173, "loss": 5.9089, "step": 107000 }, { "epoch": 0.20503997802734375, "grad_norm": 474.5517578125, "learning_rate": 3.974809646606445e-05, "lookahead_loss": 8.33444945526123, "loss": 5.8743, "step": 107500 }, { "epoch": 0.20599365234375, "grad_norm": 362.70172119140625, "learning_rate": 3.970041275024414e-05, "lookahead_loss": 8.598750807762146, "loss": 5.8378, "step": 108000 }, { "epoch": 0.20694732666015625, "grad_norm": 164.10594177246094, "learning_rate": 3.965272903442383e-05, "lookahead_loss": 8.582070672035217, "loss": 5.8822, "step": 108500 }, { "epoch": 0.2079010009765625, "grad_norm": 306.6429443359375, "learning_rate": 3.9605045318603516e-05, "lookahead_loss": 8.63638463306427, "loss": 5.9286, "step": 109000 }, { "epoch": 0.20885467529296875, "grad_norm": 200.9951934814453, "learning_rate": 3.9557361602783206e-05, "lookahead_loss": 8.704802231788635, "loss": 5.9232, "step": 109500 }, { "epoch": 0.209808349609375, "grad_norm": 94.9920425415039, "learning_rate": 3.950967788696289e-05, "lookahead_loss": 8.756813703536988, "loss": 5.9402, "step": 110000 }, { "epoch": 0.209808349609375, "eval_accuracy": 0.04001272015655577, "eval_lookahead_loss": 8.528185512924194, "eval_lookahead_perplexity": 5055.264795264064, "eval_loss": 5.770267963409424, "eval_perplexity": 320.6236365400816, "eval_runtime": 494.1352, "eval_samples_per_second": 20.237, "eval_steps_per_second": 5.059, "step": 110000 }, { "epoch": 0.21076202392578125, "grad_norm": 284.00390625, "learning_rate": 3.946199417114258e-05, "lookahead_loss": 8.594149451255799, "loss": 5.9109, "step": 110500 }, { "epoch": 0.2117156982421875, "grad_norm": 176.80123901367188, "learning_rate": 3.941431045532227e-05, "lookahead_loss": 8.615830171585083, "loss": 5.9461, "step": 111000 }, { "epoch": 0.21266937255859375, "grad_norm": 235.83326721191406, "learning_rate": 3.936662673950195e-05, "lookahead_loss": 8.687150791168213, "loss": 5.9636, "step": 111500 }, { "epoch": 0.213623046875, "grad_norm": 103.35598754882812, "learning_rate": 3.9318943023681643e-05, "lookahead_loss": 8.320789858818054, "loss": 5.8988, "step": 112000 }, { "epoch": 0.21457672119140625, "grad_norm": 182.26434326171875, "learning_rate": 3.927125930786133e-05, "lookahead_loss": 8.342611763954162, "loss": 5.8917, "step": 112500 }, { "epoch": 0.2155303955078125, "grad_norm": 119.27557373046875, "learning_rate": 3.922357559204102e-05, "lookahead_loss": 8.243752840042115, "loss": 5.927, "step": 113000 }, { "epoch": 0.21648406982421875, "grad_norm": 183.15756225585938, "learning_rate": 3.917589187622071e-05, "lookahead_loss": 8.342035251617432, "loss": 5.8904, "step": 113500 }, { "epoch": 0.217437744140625, "grad_norm": 154.41551208496094, "learning_rate": 3.912820816040039e-05, "lookahead_loss": 8.131375756263733, "loss": 5.8979, "step": 114000 }, { "epoch": 0.21839141845703125, "grad_norm": 95.44281005859375, "learning_rate": 3.908052444458008e-05, "lookahead_loss": 8.211273818969726, "loss": 5.9357, "step": 114500 }, { "epoch": 0.2193450927734375, "grad_norm": 156.48316955566406, "learning_rate": 3.9032840728759764e-05, "lookahead_loss": 8.335354602813721, "loss": 5.8692, "step": 115000 }, { "epoch": 0.2193450927734375, "eval_accuracy": 0.040494324853228965, "eval_lookahead_loss": 8.386283096504211, "eval_lookahead_perplexity": 4386.483221387857, "eval_loss": 5.746561527252197, "eval_perplexity": 313.112179488684, "eval_runtime": 494.2892, "eval_samples_per_second": 20.231, "eval_steps_per_second": 5.058, "step": 115000 }, { "epoch": 0.22029876708984375, "grad_norm": 236.41307067871094, "learning_rate": 3.8985157012939455e-05, "lookahead_loss": 8.49956429386139, "loss": 5.8253, "step": 115500 }, { "epoch": 0.22125244140625, "grad_norm": 213.94174194335938, "learning_rate": 3.8937473297119145e-05, "lookahead_loss": 8.654695083618163, "loss": 5.8751, "step": 116000 }, { "epoch": 0.22220611572265625, "grad_norm": 365.95562744140625, "learning_rate": 3.888978958129883e-05, "lookahead_loss": 8.708047358512879, "loss": 5.8828, "step": 116500 }, { "epoch": 0.2231597900390625, "grad_norm": 304.7732849121094, "learning_rate": 3.884210586547852e-05, "lookahead_loss": 8.52287411212921, "loss": 5.8546, "step": 117000 }, { "epoch": 0.22411346435546875, "grad_norm": 193.3865966796875, "learning_rate": 3.87944221496582e-05, "lookahead_loss": 8.768192059516906, "loss": 5.9711, "step": 117500 }, { "epoch": 0.225067138671875, "grad_norm": 81.27263641357422, "learning_rate": 3.874673843383789e-05, "lookahead_loss": 8.642061423301696, "loss": 5.9826, "step": 118000 }, { "epoch": 0.22602081298828125, "grad_norm": 87.96857452392578, "learning_rate": 3.869905471801758e-05, "lookahead_loss": 8.657119782447815, "loss": 5.9928, "step": 118500 }, { "epoch": 0.2269744873046875, "grad_norm": 307.6184387207031, "learning_rate": 3.8651371002197266e-05, "lookahead_loss": 8.464791445732116, "loss": 5.9168, "step": 119000 }, { "epoch": 0.22792816162109375, "grad_norm": 337.4336242675781, "learning_rate": 3.8603687286376956e-05, "lookahead_loss": 8.84510175895691, "loss": 5.9226, "step": 119500 }, { "epoch": 0.2288818359375, "grad_norm": 76.92498016357422, "learning_rate": 3.855600357055664e-05, "lookahead_loss": 9.544180777549744, "loss": 5.9973, "step": 120000 }, { "epoch": 0.2288818359375, "eval_accuracy": 0.03938454011741683, "eval_lookahead_loss": 8.653075380134583, "eval_lookahead_perplexity": 5727.734636465208, "eval_loss": 5.781472206115723, "eval_perplexity": 324.2361817059904, "eval_runtime": 499.1904, "eval_samples_per_second": 20.032, "eval_steps_per_second": 5.008, "step": 120000 }, { "epoch": 0.22983551025390625, "grad_norm": 177.666259765625, "learning_rate": 3.850831985473633e-05, "lookahead_loss": 8.754478918075561, "loss": 5.9635, "step": 120500 }, { "epoch": 0.2307891845703125, "grad_norm": 182.19992065429688, "learning_rate": 3.846063613891602e-05, "lookahead_loss": 8.454521368026734, "loss": 5.9219, "step": 121000 }, { "epoch": 0.23174285888671875, "grad_norm": 146.20733642578125, "learning_rate": 3.84129524230957e-05, "lookahead_loss": 8.433117434501648, "loss": 5.8765, "step": 121500 }, { "epoch": 0.232696533203125, "grad_norm": 259.7908020019531, "learning_rate": 3.8365268707275393e-05, "lookahead_loss": 8.247176418304443, "loss": 5.8642, "step": 122000 }, { "epoch": 0.23365020751953125, "grad_norm": 209.00685119628906, "learning_rate": 3.831758499145508e-05, "lookahead_loss": 8.332466981887817, "loss": 5.9301, "step": 122500 }, { "epoch": 0.2346038818359375, "grad_norm": 147.10595703125, "learning_rate": 3.826990127563477e-05, "lookahead_loss": 8.276109350204468, "loss": 5.8865, "step": 123000 }, { "epoch": 0.23555755615234375, "grad_norm": 99.65434265136719, "learning_rate": 3.822221755981446e-05, "lookahead_loss": 8.37988139820099, "loss": 5.8958, "step": 123500 }, { "epoch": 0.23651123046875, "grad_norm": 126.29402923583984, "learning_rate": 3.817453384399414e-05, "lookahead_loss": 8.136833830833435, "loss": 5.8886, "step": 124000 }, { "epoch": 0.23746490478515625, "grad_norm": 157.21607971191406, "learning_rate": 3.812685012817383e-05, "lookahead_loss": 8.302670984268188, "loss": 5.8689, "step": 124500 }, { "epoch": 0.2384185791015625, "grad_norm": 103.73290252685547, "learning_rate": 3.8079166412353514e-05, "lookahead_loss": 8.080665404319763, "loss": 5.8888, "step": 125000 }, { "epoch": 0.2384185791015625, "eval_accuracy": 0.04015225048923679, "eval_lookahead_loss": 8.107316847419739, "eval_lookahead_perplexity": 3318.6615977402853, "eval_loss": 5.730016231536865, "eval_perplexity": 307.9742672325134, "eval_runtime": 505.1516, "eval_samples_per_second": 19.796, "eval_steps_per_second": 4.949, "step": 125000 }, { "epoch": 0.23937225341796875, "grad_norm": 72.77110290527344, "learning_rate": 3.8031482696533205e-05, "lookahead_loss": 8.290345351219177, "loss": 5.8921, "step": 125500 }, { "epoch": 0.240325927734375, "grad_norm": 115.89087677001953, "learning_rate": 3.7983798980712895e-05, "lookahead_loss": 8.112725184440613, "loss": 5.879, "step": 126000 }, { "epoch": 0.24127960205078125, "grad_norm": 235.29368591308594, "learning_rate": 3.793611526489258e-05, "lookahead_loss": 7.949018230438233, "loss": 5.8968, "step": 126500 }, { "epoch": 0.2422332763671875, "grad_norm": 168.8629913330078, "learning_rate": 3.788843154907227e-05, "lookahead_loss": 8.093102324485779, "loss": 5.8933, "step": 127000 }, { "epoch": 0.24318695068359375, "grad_norm": 528.7093505859375, "learning_rate": 3.784074783325195e-05, "lookahead_loss": 8.183201222419738, "loss": 5.8897, "step": 127500 }, { "epoch": 0.244140625, "grad_norm": 105.25418853759766, "learning_rate": 3.779306411743164e-05, "lookahead_loss": 8.16680136871338, "loss": 5.9033, "step": 128000 }, { "epoch": 0.24509429931640625, "grad_norm": 117.61679077148438, "learning_rate": 3.774538040161133e-05, "lookahead_loss": 8.230734627723693, "loss": 5.8708, "step": 128500 }, { "epoch": 0.2460479736328125, "grad_norm": 801.7338256835938, "learning_rate": 3.7697696685791016e-05, "lookahead_loss": 8.51412269115448, "loss": 5.8784, "step": 129000 }, { "epoch": 0.24700164794921875, "grad_norm": 111.5937728881836, "learning_rate": 3.7650012969970706e-05, "lookahead_loss": 8.413945517539979, "loss": 5.8686, "step": 129500 }, { "epoch": 0.247955322265625, "grad_norm": 247.51522827148438, "learning_rate": 3.760232925415039e-05, "lookahead_loss": 9.133602955818176, "loss": 5.9601, "step": 130000 }, { "epoch": 0.247955322265625, "eval_accuracy": 0.040870058708414876, "eval_lookahead_loss": 8.694239983177185, "eval_lookahead_perplexity": 5968.43473267319, "eval_loss": 5.752523422241211, "eval_perplexity": 314.98449715740867, "eval_runtime": 506.288, "eval_samples_per_second": 19.752, "eval_steps_per_second": 4.938, "step": 130000 }, { "epoch": 0.24890899658203125, "grad_norm": 91.10067749023438, "learning_rate": 3.755464553833008e-05, "lookahead_loss": 8.722510346412658, "loss": 5.9129, "step": 130500 }, { "epoch": 0.2498626708984375, "grad_norm": 125.04576873779297, "learning_rate": 3.750696182250977e-05, "lookahead_loss": 8.298921633720399, "loss": 5.8677, "step": 131000 }, { "epoch": 0.25081634521484375, "grad_norm": 128.95962524414062, "learning_rate": 3.745927810668945e-05, "lookahead_loss": 8.28252799797058, "loss": 5.8574, "step": 131500 }, { "epoch": 0.25177001953125, "grad_norm": 384.0034484863281, "learning_rate": 3.7411594390869143e-05, "lookahead_loss": 8.150385370254517, "loss": 5.8547, "step": 132000 }, { "epoch": 0.25272369384765625, "grad_norm": 190.9263153076172, "learning_rate": 3.736391067504883e-05, "lookahead_loss": 8.239738685607911, "loss": 5.843, "step": 132500 }, { "epoch": 0.2536773681640625, "grad_norm": 139.46914672851562, "learning_rate": 3.731622695922852e-05, "lookahead_loss": 9.501468218803407, "loss": 5.9685, "step": 133000 }, { "epoch": 0.25463104248046875, "grad_norm": 84.62285614013672, "learning_rate": 3.726854324340821e-05, "lookahead_loss": 9.616533128738403, "loss": 6.0399, "step": 133500 }, { "epoch": 0.255584716796875, "grad_norm": 125.87892150878906, "learning_rate": 3.722085952758789e-05, "lookahead_loss": 8.491555513381957, "loss": 5.9431, "step": 134000 }, { "epoch": 0.25653839111328125, "grad_norm": 442.814453125, "learning_rate": 3.717317581176758e-05, "lookahead_loss": 8.225759867668152, "loss": 5.9357, "step": 134500 }, { "epoch": 0.2574920654296875, "grad_norm": 178.03439331054688, "learning_rate": 3.7125492095947264e-05, "lookahead_loss": 8.229091106414796, "loss": 5.8925, "step": 135000 }, { "epoch": 0.2574920654296875, "eval_accuracy": 0.040535029354207434, "eval_lookahead_loss": 8.166419868087768, "eval_lookahead_perplexity": 3520.7167431864013, "eval_loss": 5.714168071746826, "eval_perplexity": 303.13191437680405, "eval_runtime": 531.9495, "eval_samples_per_second": 18.799, "eval_steps_per_second": 4.7, "step": 135000 }, { "epoch": 0.25844573974609375, "grad_norm": 213.3984375, "learning_rate": 3.7077808380126955e-05, "lookahead_loss": 8.2034528799057, "loss": 5.893, "step": 135500 }, { "epoch": 0.2593994140625, "grad_norm": 107.15808868408203, "learning_rate": 3.7030124664306645e-05, "lookahead_loss": 8.004346689224244, "loss": 5.8571, "step": 136000 }, { "epoch": 0.26035308837890625, "grad_norm": 143.5915985107422, "learning_rate": 3.698244094848633e-05, "lookahead_loss": 8.046038606643677, "loss": 5.8414, "step": 136500 }, { "epoch": 0.2613067626953125, "grad_norm": 122.59235382080078, "learning_rate": 3.693475723266602e-05, "lookahead_loss": 8.150496848106384, "loss": 5.8998, "step": 137000 }, { "epoch": 0.26226043701171875, "grad_norm": 203.30479431152344, "learning_rate": 3.68870735168457e-05, "lookahead_loss": 8.115791666984558, "loss": 5.9206, "step": 137500 }, { "epoch": 0.263214111328125, "grad_norm": 231.3633270263672, "learning_rate": 3.683938980102539e-05, "lookahead_loss": 8.05616670703888, "loss": 5.8715, "step": 138000 }, { "epoch": 0.26416778564453125, "grad_norm": 136.63230895996094, "learning_rate": 3.679170608520508e-05, "lookahead_loss": 8.164144186019897, "loss": 5.8742, "step": 138500 }, { "epoch": 0.2651214599609375, "grad_norm": 200.2576141357422, "learning_rate": 3.6744022369384766e-05, "lookahead_loss": 8.124045961380006, "loss": 5.7839, "step": 139000 }, { "epoch": 0.26607513427734375, "grad_norm": 195.14260864257812, "learning_rate": 3.6696338653564456e-05, "lookahead_loss": 8.438293287277222, "loss": 5.9325, "step": 139500 }, { "epoch": 0.26702880859375, "grad_norm": 97.98570251464844, "learning_rate": 3.664865493774414e-05, "lookahead_loss": 8.239123771667481, "loss": 5.8557, "step": 140000 }, { "epoch": 0.26702880859375, "eval_accuracy": 0.04010606653620352, "eval_lookahead_loss": 7.995683560371399, "eval_lookahead_perplexity": 2968.1185920269118, "eval_loss": 5.699228286743164, "eval_perplexity": 298.6368500027575, "eval_runtime": 549.547, "eval_samples_per_second": 18.197, "eval_steps_per_second": 4.549, "step": 140000 }, { "epoch": 0.26798248291015625, "grad_norm": 171.977294921875, "learning_rate": 3.660097122192383e-05, "lookahead_loss": 8.129929531097412, "loss": 5.8711, "step": 140500 }, { "epoch": 0.2689361572265625, "grad_norm": 94.07719421386719, "learning_rate": 3.655328750610352e-05, "lookahead_loss": 8.203173540115356, "loss": 5.8628, "step": 141000 }, { "epoch": 0.26988983154296875, "grad_norm": 104.68416595458984, "learning_rate": 3.65056037902832e-05, "lookahead_loss": 8.133862190246582, "loss": 5.8943, "step": 141500 }, { "epoch": 0.270843505859375, "grad_norm": 255.28533935546875, "learning_rate": 3.6457920074462893e-05, "lookahead_loss": 8.057834712982178, "loss": 5.8519, "step": 142000 }, { "epoch": 0.27179718017578125, "grad_norm": 140.80821228027344, "learning_rate": 3.641023635864258e-05, "lookahead_loss": 8.065425714492799, "loss": 5.8633, "step": 142500 }, { "epoch": 0.2727508544921875, "grad_norm": 352.9727783203125, "learning_rate": 3.636255264282227e-05, "lookahead_loss": 7.982576118469238, "loss": 5.8192, "step": 143000 }, { "epoch": 0.27370452880859375, "grad_norm": 343.0337219238281, "learning_rate": 3.631486892700196e-05, "lookahead_loss": 8.147901776313782, "loss": 5.8947, "step": 143500 }, { "epoch": 0.274658203125, "grad_norm": 150.05630493164062, "learning_rate": 3.626718521118164e-05, "lookahead_loss": 8.160402549743653, "loss": 5.8654, "step": 144000 }, { "epoch": 0.27561187744140625, "grad_norm": 117.56890106201172, "learning_rate": 3.621950149536133e-05, "lookahead_loss": 8.112312719345093, "loss": 5.8142, "step": 144500 }, { "epoch": 0.2765655517578125, "grad_norm": 170.95387268066406, "learning_rate": 3.6171817779541014e-05, "lookahead_loss": 8.179612812995911, "loss": 5.8511, "step": 145000 }, { "epoch": 0.2765655517578125, "eval_accuracy": 0.040202152641878666, "eval_lookahead_loss": 7.975206629180908, "eval_lookahead_perplexity": 2907.9586791475394, "eval_loss": 5.704812049865723, "eval_perplexity": 300.3090316212061, "eval_runtime": 517.9662, "eval_samples_per_second": 19.306, "eval_steps_per_second": 4.827, "step": 145000 }, { "epoch": 0.27751922607421875, "grad_norm": 96.55410766601562, "learning_rate": 3.6124134063720705e-05, "lookahead_loss": 8.07830410861969, "loss": 5.8184, "step": 145500 }, { "epoch": 0.278472900390625, "grad_norm": 109.48090362548828, "learning_rate": 3.6076450347900395e-05, "lookahead_loss": 8.020852543830872, "loss": 5.8162, "step": 146000 }, { "epoch": 0.27942657470703125, "grad_norm": 133.98004150390625, "learning_rate": 3.602876663208008e-05, "lookahead_loss": 8.015573055267334, "loss": 5.8057, "step": 146500 }, { "epoch": 0.2803802490234375, "grad_norm": 114.32443237304688, "learning_rate": 3.598108291625977e-05, "lookahead_loss": 8.070414279937744, "loss": 5.8629, "step": 147000 }, { "epoch": 0.28133392333984375, "grad_norm": 228.17405700683594, "learning_rate": 3.593339920043945e-05, "lookahead_loss": 7.88713484954834, "loss": 5.8084, "step": 147500 }, { "epoch": 0.28228759765625, "grad_norm": 108.4611587524414, "learning_rate": 3.588571548461914e-05, "lookahead_loss": 8.00777504825592, "loss": 5.7551, "step": 148000 }, { "epoch": 0.28324127197265625, "grad_norm": 174.8192596435547, "learning_rate": 3.583803176879883e-05, "lookahead_loss": 8.141259288787841, "loss": 5.7819, "step": 148500 }, { "epoch": 0.2841949462890625, "grad_norm": 208.42135620117188, "learning_rate": 3.5790348052978516e-05, "lookahead_loss": 8.115955495834351, "loss": 5.8965, "step": 149000 }, { "epoch": 0.28514862060546875, "grad_norm": 84.07195281982422, "learning_rate": 3.5742664337158206e-05, "lookahead_loss": 8.166845591068268, "loss": 5.8718, "step": 149500 }, { "epoch": 0.286102294921875, "grad_norm": 95.39439392089844, "learning_rate": 3.569498062133789e-05, "lookahead_loss": 7.9816878776550295, "loss": 5.8921, "step": 150000 }, { "epoch": 0.286102294921875, "eval_accuracy": 0.04071780821917808, "eval_lookahead_loss": 7.96270551071167, "eval_lookahead_perplexity": 2871.8322242218233, "eval_loss": 5.664773464202881, "eval_perplexity": 288.5226130126613, "eval_runtime": 626.5163, "eval_samples_per_second": 15.961, "eval_steps_per_second": 3.99, "step": 150000 }, { "epoch": 0.28705596923828125, "grad_norm": 200.16087341308594, "learning_rate": 3.564729690551758e-05, "lookahead_loss": 8.159387079238892, "loss": 5.8435, "step": 150500 }, { "epoch": 0.2880096435546875, "grad_norm": 186.96412658691406, "learning_rate": 3.559961318969727e-05, "lookahead_loss": 8.297013856887817, "loss": 5.8487, "step": 151000 }, { "epoch": 0.28896331787109375, "grad_norm": 132.94847106933594, "learning_rate": 3.555192947387695e-05, "lookahead_loss": 8.266641647338867, "loss": 5.8221, "step": 151500 }, { "epoch": 0.2899169921875, "grad_norm": 197.22323608398438, "learning_rate": 3.5504245758056643e-05, "lookahead_loss": 8.100845941543579, "loss": 5.8775, "step": 152000 }, { "epoch": 0.29087066650390625, "grad_norm": 126.6111831665039, "learning_rate": 3.545656204223633e-05, "lookahead_loss": 7.972996250152588, "loss": 5.798, "step": 152500 }, { "epoch": 0.2918243408203125, "grad_norm": 156.23532104492188, "learning_rate": 3.540887832641602e-05, "lookahead_loss": 8.054474659442901, "loss": 5.8827, "step": 153000 }, { "epoch": 0.29277801513671875, "grad_norm": 79.5146484375, "learning_rate": 3.536119461059571e-05, "lookahead_loss": 8.00607446193695, "loss": 5.8534, "step": 153500 }, { "epoch": 0.293731689453125, "grad_norm": 263.6907043457031, "learning_rate": 3.531351089477539e-05, "lookahead_loss": 8.044375644683837, "loss": 5.8624, "step": 154000 }, { "epoch": 0.29468536376953125, "grad_norm": 122.52787780761719, "learning_rate": 3.526582717895508e-05, "lookahead_loss": 8.031048059463501, "loss": 5.8525, "step": 154500 }, { "epoch": 0.2956390380859375, "grad_norm": 188.5401153564453, "learning_rate": 3.5218143463134764e-05, "lookahead_loss": 7.998628089904785, "loss": 5.8002, "step": 155000 }, { "epoch": 0.2956390380859375, "eval_accuracy": 0.04003307240704501, "eval_lookahead_loss": 7.891171989440918, "eval_lookahead_perplexity": 2673.5754874775644, "eval_loss": 5.649391174316406, "eval_perplexity": 284.11843455618543, "eval_runtime": 552.2757, "eval_samples_per_second": 18.107, "eval_steps_per_second": 4.527, "step": 155000 }, { "epoch": 0.29659271240234375, "grad_norm": 168.0022430419922, "learning_rate": 3.5170459747314455e-05, "lookahead_loss": 8.031185307502746, "loss": 5.7806, "step": 155500 }, { "epoch": 0.29754638671875, "grad_norm": 212.51593017578125, "learning_rate": 3.5122776031494145e-05, "lookahead_loss": 8.120828740119935, "loss": 5.8299, "step": 156000 }, { "epoch": 0.29850006103515625, "grad_norm": 144.78135681152344, "learning_rate": 3.507509231567383e-05, "lookahead_loss": 8.36105933856964, "loss": 5.8281, "step": 156500 }, { "epoch": 0.2994537353515625, "grad_norm": 157.19422912597656, "learning_rate": 3.502740859985352e-05, "lookahead_loss": 8.359768169403075, "loss": 5.8386, "step": 157000 }, { "epoch": 0.30040740966796875, "grad_norm": 83.20254516601562, "learning_rate": 3.49797248840332e-05, "lookahead_loss": 8.57559892654419, "loss": 5.9348, "step": 157500 }, { "epoch": 0.301361083984375, "grad_norm": 46.59617233276367, "learning_rate": 3.493204116821289e-05, "lookahead_loss": 8.401400839805603, "loss": 5.8505, "step": 158000 }, { "epoch": 0.30231475830078125, "grad_norm": 224.74256896972656, "learning_rate": 3.488435745239258e-05, "lookahead_loss": 8.388716652870178, "loss": 5.871, "step": 158500 }, { "epoch": 0.3032684326171875, "grad_norm": 123.9939956665039, "learning_rate": 3.4836673736572266e-05, "lookahead_loss": 8.030845665931702, "loss": 5.8064, "step": 159000 }, { "epoch": 0.30422210693359375, "grad_norm": 235.08709716796875, "learning_rate": 3.4788990020751956e-05, "lookahead_loss": 8.040814640045166, "loss": 5.7708, "step": 159500 }, { "epoch": 0.30517578125, "grad_norm": 111.90243530273438, "learning_rate": 3.474130630493164e-05, "lookahead_loss": 8.105880452156066, "loss": 5.8017, "step": 160000 }, { "epoch": 0.30517578125, "eval_accuracy": 0.039995107632093935, "eval_lookahead_loss": 8.029669752311706, "eval_lookahead_perplexity": 3070.7274051745303, "eval_loss": 5.665389060974121, "eval_perplexity": 288.7002812820574, "eval_runtime": 563.4331, "eval_samples_per_second": 17.748, "eval_steps_per_second": 4.437, "step": 160000 }, { "epoch": 0.30612945556640625, "grad_norm": 228.57728576660156, "learning_rate": 3.469362258911133e-05, "lookahead_loss": 8.09919670009613, "loss": 5.7844, "step": 160500 }, { "epoch": 0.3070831298828125, "grad_norm": 352.4801940917969, "learning_rate": 3.464593887329102e-05, "lookahead_loss": 8.174008321762084, "loss": 5.8215, "step": 161000 }, { "epoch": 0.30803680419921875, "grad_norm": 116.84239959716797, "learning_rate": 3.45982551574707e-05, "lookahead_loss": 8.25044997882843, "loss": 5.8009, "step": 161500 }, { "epoch": 0.308990478515625, "grad_norm": 240.47418212890625, "learning_rate": 3.4550571441650393e-05, "lookahead_loss": 8.050434608459472, "loss": 5.8107, "step": 162000 }, { "epoch": 0.30994415283203125, "grad_norm": 208.1582489013672, "learning_rate": 3.450288772583008e-05, "lookahead_loss": 8.146479434013367, "loss": 5.7966, "step": 162500 }, { "epoch": 0.3108978271484375, "grad_norm": 163.59103393554688, "learning_rate": 3.445520401000977e-05, "lookahead_loss": 8.11200495815277, "loss": 5.8208, "step": 163000 }, { "epoch": 0.31185150146484375, "grad_norm": 67.6489486694336, "learning_rate": 3.440752029418946e-05, "lookahead_loss": 8.068148673057555, "loss": 5.8021, "step": 163500 }, { "epoch": 0.31280517578125, "grad_norm": 109.78558349609375, "learning_rate": 3.435983657836914e-05, "lookahead_loss": 7.983370945930481, "loss": 5.8485, "step": 164000 }, { "epoch": 0.31375885009765625, "grad_norm": 861.293701171875, "learning_rate": 3.431215286254883e-05, "lookahead_loss": 7.982110065460205, "loss": 5.8757, "step": 164500 }, { "epoch": 0.3147125244140625, "grad_norm": 109.53030395507812, "learning_rate": 3.4264469146728514e-05, "lookahead_loss": 8.05136696434021, "loss": 5.8462, "step": 165000 }, { "epoch": 0.3147125244140625, "eval_accuracy": 0.040486888454011744, "eval_lookahead_loss": 7.969085597801208, "eval_lookahead_perplexity": 2890.2133381216318, "eval_loss": 5.663883686065674, "eval_perplexity": 288.2660060780627, "eval_runtime": 686.0565, "eval_samples_per_second": 14.576, "eval_steps_per_second": 3.644, "step": 165000 }, { "epoch": 0.31566619873046875, "grad_norm": 93.1082992553711, "learning_rate": 3.4216785430908205e-05, "lookahead_loss": 7.966514645576477, "loss": 5.8625, "step": 165500 }, { "epoch": 0.316619873046875, "grad_norm": 76.82735443115234, "learning_rate": 3.4169101715087895e-05, "lookahead_loss": 7.983373554229736, "loss": 5.8218, "step": 166000 }, { "epoch": 0.31757354736328125, "grad_norm": 210.3797607421875, "learning_rate": 3.412141799926758e-05, "lookahead_loss": 8.170269319534302, "loss": 5.8551, "step": 166500 }, { "epoch": 0.3185272216796875, "grad_norm": 100.93452453613281, "learning_rate": 3.407373428344727e-05, "lookahead_loss": 8.394991693496705, "loss": 5.8604, "step": 167000 }, { "epoch": 0.31948089599609375, "grad_norm": 484.09832763671875, "learning_rate": 3.402605056762695e-05, "lookahead_loss": 8.799272077560424, "loss": 5.866, "step": 167500 }, { "epoch": 0.3204345703125, "grad_norm": 285.850341796875, "learning_rate": 3.397836685180664e-05, "lookahead_loss": 10.117726707458496, "loss": 5.9184, "step": 168000 }, { "epoch": 0.32138824462890625, "grad_norm": 756.4190063476562, "learning_rate": 3.393068313598633e-05, "lookahead_loss": 9.095554563522338, "loss": 5.8656, "step": 168500 }, { "epoch": 0.3223419189453125, "grad_norm": 309.3544006347656, "learning_rate": 3.3882999420166016e-05, "lookahead_loss": 8.838002510070801, "loss": 5.9167, "step": 169000 }, { "epoch": 0.32329559326171875, "grad_norm": 183.6266326904297, "learning_rate": 3.3835315704345706e-05, "lookahead_loss": 8.383255373954773, "loss": 5.8588, "step": 169500 }, { "epoch": 0.324249267578125, "grad_norm": 225.40940856933594, "learning_rate": 3.378763198852539e-05, "lookahead_loss": 8.47177024269104, "loss": 5.8635, "step": 170000 }, { "epoch": 0.324249267578125, "eval_accuracy": 0.040258904109589044, "eval_lookahead_loss": 8.240499300765991, "eval_lookahead_perplexity": 3791.432900996432, "eval_loss": 5.665499687194824, "eval_perplexity": 288.7322208697369, "eval_runtime": 553.7865, "eval_samples_per_second": 18.058, "eval_steps_per_second": 4.514, "step": 170000 }, { "epoch": 0.32520294189453125, "grad_norm": 449.2416076660156, "learning_rate": 3.373994827270508e-05, "lookahead_loss": 8.298254358291626, "loss": 5.855, "step": 170500 }, { "epoch": 0.3261566162109375, "grad_norm": 154.37396240234375, "learning_rate": 3.369226455688477e-05, "lookahead_loss": 8.340126993179322, "loss": 5.8557, "step": 171000 }, { "epoch": 0.32711029052734375, "grad_norm": 274.73968505859375, "learning_rate": 3.364458084106445e-05, "lookahead_loss": 8.133369791030884, "loss": 5.7964, "step": 171500 }, { "epoch": 0.32806396484375, "grad_norm": 151.22427368164062, "learning_rate": 3.3596897125244143e-05, "lookahead_loss": 8.307823240280152, "loss": 5.835, "step": 172000 }, { "epoch": 0.32901763916015625, "grad_norm": 145.38394165039062, "learning_rate": 3.354921340942383e-05, "lookahead_loss": 8.022544721603394, "loss": 5.8467, "step": 172500 }, { "epoch": 0.3299713134765625, "grad_norm": 138.09385681152344, "learning_rate": 3.350152969360352e-05, "lookahead_loss": 8.127099394798279, "loss": 5.8466, "step": 173000 }, { "epoch": 0.33092498779296875, "grad_norm": 377.91656494140625, "learning_rate": 3.345384597778321e-05, "lookahead_loss": 8.1452312541008, "loss": 5.857, "step": 173500 }, { "epoch": 0.331878662109375, "grad_norm": 312.3646545410156, "learning_rate": 3.340616226196289e-05, "lookahead_loss": 8.134938665390015, "loss": 5.8392, "step": 174000 }, { "epoch": 0.33283233642578125, "grad_norm": 206.4745330810547, "learning_rate": 3.335847854614258e-05, "lookahead_loss": 8.414973595619202, "loss": 5.8322, "step": 174500 }, { "epoch": 0.3337860107421875, "grad_norm": 426.61541748046875, "learning_rate": 3.3310794830322264e-05, "lookahead_loss": 8.030229475975037, "loss": 5.7894, "step": 175000 }, { "epoch": 0.3337860107421875, "eval_accuracy": 0.039916634050880626, "eval_lookahead_loss": 8.03914587841034, "eval_lookahead_perplexity": 3099.964312810433, "eval_loss": 5.663393020629883, "eval_perplexity": 288.1245986072828, "eval_runtime": 520.724, "eval_samples_per_second": 19.204, "eval_steps_per_second": 4.801, "step": 175000 }, { "epoch": 0.33473968505859375, "grad_norm": 272.94317626953125, "learning_rate": 3.3263111114501955e-05, "lookahead_loss": 8.26338819217682, "loss": 5.8704, "step": 175500 }, { "epoch": 0.335693359375, "grad_norm": 342.1329345703125, "learning_rate": 3.3215427398681645e-05, "lookahead_loss": 8.820026116371155, "loss": 5.8846, "step": 176000 }, { "epoch": 0.33664703369140625, "grad_norm": 370.400634765625, "learning_rate": 3.316774368286133e-05, "lookahead_loss": 8.490201613426208, "loss": 5.8347, "step": 176500 }, { "epoch": 0.3376007080078125, "grad_norm": 159.84844970703125, "learning_rate": 3.312005996704102e-05, "lookahead_loss": 8.217913587570191, "loss": 5.8344, "step": 177000 }, { "epoch": 0.33855438232421875, "grad_norm": 2411.37451171875, "learning_rate": 3.30723762512207e-05, "lookahead_loss": 8.13470829296112, "loss": 5.7937, "step": 177500 }, { "epoch": 0.339508056640625, "grad_norm": 232.2559814453125, "learning_rate": 3.302469253540039e-05, "lookahead_loss": 7.857377230644226, "loss": 5.7343, "step": 178000 }, { "epoch": 0.34046173095703125, "grad_norm": 346.36309814453125, "learning_rate": 3.297700881958008e-05, "lookahead_loss": 7.92359204864502, "loss": 5.7337, "step": 178500 }, { "epoch": 0.3414154052734375, "grad_norm": 161.33160400390625, "learning_rate": 3.2929325103759766e-05, "lookahead_loss": 8.056898574829102, "loss": 5.7821, "step": 179000 }, { "epoch": 0.34236907958984375, "grad_norm": 341.30780029296875, "learning_rate": 3.2881641387939456e-05, "lookahead_loss": 8.376977870941163, "loss": 5.8777, "step": 179500 }, { "epoch": 0.34332275390625, "grad_norm": 759.652099609375, "learning_rate": 3.283395767211914e-05, "lookahead_loss": 8.27805838394165, "loss": 5.9122, "step": 180000 }, { "epoch": 0.34332275390625, "eval_accuracy": 0.04105303326810176, "eval_lookahead_loss": 8.043607985687256, "eval_lookahead_perplexity": 3113.8275928482267, "eval_loss": 5.654881954193115, "eval_perplexity": 285.6827570843937, "eval_runtime": 571.3628, "eval_samples_per_second": 17.502, "eval_steps_per_second": 4.376, "step": 180000 }, { "epoch": 0.34427642822265625, "grad_norm": 281.83343505859375, "learning_rate": 3.278627395629883e-05, "lookahead_loss": 8.309767990112304, "loss": 5.8761, "step": 180500 }, { "epoch": 0.3452301025390625, "grad_norm": 250.48297119140625, "learning_rate": 3.273859024047852e-05, "lookahead_loss": 8.456875951766968, "loss": 5.9098, "step": 181000 }, { "epoch": 0.34618377685546875, "grad_norm": 140.5371856689453, "learning_rate": 3.26909065246582e-05, "lookahead_loss": 8.162277752876282, "loss": 5.8447, "step": 181500 }, { "epoch": 0.347137451171875, "grad_norm": 241.10263061523438, "learning_rate": 3.2643222808837893e-05, "lookahead_loss": 7.967324462890625, "loss": 5.8361, "step": 182000 }, { "epoch": 0.34809112548828125, "grad_norm": 212.21217346191406, "learning_rate": 3.259553909301758e-05, "lookahead_loss": 8.094841960906983, "loss": 5.8594, "step": 182500 }, { "epoch": 0.3490447998046875, "grad_norm": 246.3410186767578, "learning_rate": 3.254785537719727e-05, "lookahead_loss": 8.097664910316468, "loss": 5.8508, "step": 183000 }, { "epoch": 0.34999847412109375, "grad_norm": 205.01502990722656, "learning_rate": 3.250017166137696e-05, "lookahead_loss": 7.913767645835876, "loss": 5.8212, "step": 183500 }, { "epoch": 0.3509521484375, "grad_norm": 416.0746154785156, "learning_rate": 3.245248794555664e-05, "lookahead_loss": 8.170962131500245, "loss": 5.8526, "step": 184000 }, { "epoch": 0.35190582275390625, "grad_norm": 570.0916137695312, "learning_rate": 3.240480422973633e-05, "lookahead_loss": 8.107626894950867, "loss": 5.8204, "step": 184500 }, { "epoch": 0.3528594970703125, "grad_norm": 235.67578125, "learning_rate": 3.2357120513916014e-05, "lookahead_loss": 8.328926097869873, "loss": 5.8401, "step": 185000 }, { "epoch": 0.3528594970703125, "eval_accuracy": 0.040856360078277885, "eval_lookahead_loss": 8.26389328956604, "eval_lookahead_perplexity": 3881.1752629203247, "eval_loss": 5.655423164367676, "eval_perplexity": 285.8374133462205, "eval_runtime": 531.0433, "eval_samples_per_second": 18.831, "eval_steps_per_second": 4.708, "step": 185000 }, { "epoch": 0.35381317138671875, "grad_norm": 141.55096435546875, "learning_rate": 3.2309436798095705e-05, "lookahead_loss": 8.1678730840683, "loss": 5.857, "step": 185500 }, { "epoch": 0.354766845703125, "grad_norm": 324.599853515625, "learning_rate": 3.2261753082275395e-05, "lookahead_loss": 8.148693853378296, "loss": 5.7887, "step": 186000 }, { "epoch": 0.35572052001953125, "grad_norm": 891.4889526367188, "learning_rate": 3.221406936645508e-05, "lookahead_loss": 8.12470075082779, "loss": 5.8135, "step": 186500 }, { "epoch": 0.3566741943359375, "grad_norm": 375.6871337890625, "learning_rate": 3.216638565063477e-05, "lookahead_loss": 8.219293649673462, "loss": 5.7972, "step": 187000 }, { "epoch": 0.35762786865234375, "grad_norm": 179.70066833496094, "learning_rate": 3.211870193481445e-05, "lookahead_loss": 8.218752856254577, "loss": 5.8141, "step": 187500 }, { "epoch": 0.35858154296875, "grad_norm": 253.73570251464844, "learning_rate": 3.207101821899414e-05, "lookahead_loss": 8.280322497367859, "loss": 5.8451, "step": 188000 }, { "epoch": 0.35953521728515625, "grad_norm": 387.13787841796875, "learning_rate": 3.202333450317383e-05, "lookahead_loss": 8.040139802932739, "loss": 5.802, "step": 188500 }, { "epoch": 0.3604888916015625, "grad_norm": 245.4624481201172, "learning_rate": 3.1975650787353516e-05, "lookahead_loss": 8.120493285179139, "loss": 5.8742, "step": 189000 }, { "epoch": 0.36144256591796875, "grad_norm": 435.4244384765625, "learning_rate": 3.1927967071533206e-05, "lookahead_loss": 8.146875252723694, "loss": 5.7788, "step": 189500 }, { "epoch": 0.362396240234375, "grad_norm": 906.0565185546875, "learning_rate": 3.188028335571289e-05, "lookahead_loss": 8.166283338546753, "loss": 5.8252, "step": 190000 }, { "epoch": 0.362396240234375, "eval_accuracy": 0.040760469667318985, "eval_lookahead_loss": 7.975135536003113, "eval_lookahead_perplexity": 2907.7519504727275, "eval_loss": 5.659226894378662, "eval_perplexity": 286.92673211749127, "eval_runtime": 589.2326, "eval_samples_per_second": 16.971, "eval_steps_per_second": 4.243, "step": 190000 }, { "epoch": 0.36334991455078125, "grad_norm": 121.22456359863281, "learning_rate": 3.183259963989258e-05, "lookahead_loss": 8.150741298675538, "loss": 5.8502, "step": 190500 }, { "epoch": 0.3643035888671875, "grad_norm": 278.058349609375, "learning_rate": 3.178491592407227e-05, "lookahead_loss": 8.131261066436767, "loss": 5.8694, "step": 191000 }, { "epoch": 0.36525726318359375, "grad_norm": 624.7596435546875, "learning_rate": 3.173723220825195e-05, "lookahead_loss": 8.119778440475464, "loss": 5.8411, "step": 191500 }, { "epoch": 0.3662109375, "grad_norm": 566.6157836914062, "learning_rate": 3.1689548492431643e-05, "lookahead_loss": 8.058290302276612, "loss": 5.7905, "step": 192000 }, { "epoch": 0.36716461181640625, "grad_norm": 202.44036865234375, "learning_rate": 3.164186477661133e-05, "lookahead_loss": 8.05600505924225, "loss": 5.7159, "step": 192500 }, { "epoch": 0.3681182861328125, "grad_norm": 305.76129150390625, "learning_rate": 3.159418106079102e-05, "lookahead_loss": 7.997986116409302, "loss": 5.7022, "step": 193000 }, { "epoch": 0.36907196044921875, "grad_norm": 253.99127197265625, "learning_rate": 3.154649734497071e-05, "lookahead_loss": 8.106854535102844, "loss": 5.763, "step": 193500 }, { "epoch": 0.370025634765625, "grad_norm": 272.1480407714844, "learning_rate": 3.149881362915039e-05, "lookahead_loss": 8.210675827026368, "loss": 5.7916, "step": 194000 }, { "epoch": 0.37097930908203125, "grad_norm": 278.51385498046875, "learning_rate": 3.145112991333008e-05, "lookahead_loss": 8.134797664642335, "loss": 5.8759, "step": 194500 }, { "epoch": 0.3719329833984375, "grad_norm": 502.9010314941406, "learning_rate": 3.1403446197509764e-05, "lookahead_loss": 8.081912855148316, "loss": 5.8975, "step": 195000 }, { "epoch": 0.3719329833984375, "eval_accuracy": 0.04048375733855186, "eval_lookahead_loss": 7.978938808631897, "eval_lookahead_perplexity": 2918.8319807091743, "eval_loss": 5.641407012939453, "eval_perplexity": 281.85901889667974, "eval_runtime": 557.1792, "eval_samples_per_second": 17.948, "eval_steps_per_second": 4.487, "step": 195000 }, { "epoch": 0.37288665771484375, "grad_norm": 122.92047882080078, "learning_rate": 3.1355762481689455e-05, "lookahead_loss": 7.95623159790039, "loss": 5.8257, "step": 195500 }, { "epoch": 0.37384033203125, "grad_norm": 118.28034973144531, "learning_rate": 3.1308078765869145e-05, "lookahead_loss": 8.163280108451843, "loss": 5.891, "step": 196000 }, { "epoch": 0.37479400634765625, "grad_norm": 96.59468078613281, "learning_rate": 3.126039505004883e-05, "lookahead_loss": 7.991646527290344, "loss": 5.8571, "step": 196500 }, { "epoch": 0.3757476806640625, "grad_norm": 118.70195007324219, "learning_rate": 3.121271133422852e-05, "lookahead_loss": 7.9621712636947635, "loss": 5.8371, "step": 197000 }, { "epoch": 0.37670135498046875, "grad_norm": 267.72894287109375, "learning_rate": 3.11650276184082e-05, "lookahead_loss": 8.0290306558609, "loss": 5.8964, "step": 197500 }, { "epoch": 0.377655029296875, "grad_norm": 652.552001953125, "learning_rate": 3.111734390258789e-05, "lookahead_loss": 8.187683712005615, "loss": 5.8437, "step": 198000 }, { "epoch": 0.37860870361328125, "grad_norm": 126.76042175292969, "learning_rate": 3.106966018676758e-05, "lookahead_loss": 7.977932915687561, "loss": 5.8855, "step": 198500 }, { "epoch": 0.3795623779296875, "grad_norm": 351.3497009277344, "learning_rate": 3.1021976470947266e-05, "lookahead_loss": 7.912714574813843, "loss": 5.8553, "step": 199000 }, { "epoch": 0.38051605224609375, "grad_norm": 161.19146728515625, "learning_rate": 3.0974292755126956e-05, "lookahead_loss": 8.016971839904786, "loss": 5.8537, "step": 199500 }, { "epoch": 0.3814697265625, "grad_norm": 140.53555297851562, "learning_rate": 3.092660903930664e-05, "lookahead_loss": 8.002714621543884, "loss": 5.8008, "step": 200000 }, { "epoch": 0.3814697265625, "eval_accuracy": 0.03930645792563601, "eval_lookahead_loss": 7.877221362876892, "eval_lookahead_perplexity": 2636.536394250328, "eval_loss": 5.632281303405762, "eval_perplexity": 279.29855614971035, "eval_runtime": 540.911, "eval_samples_per_second": 18.487, "eval_steps_per_second": 4.622, "step": 200000 }, { "epoch": 0.38242340087890625, "grad_norm": 298.0007629394531, "learning_rate": 3.087892532348633e-05, "lookahead_loss": 8.005448036193847, "loss": 5.7537, "step": 200500 }, { "epoch": 0.3833770751953125, "grad_norm": 201.19056701660156, "learning_rate": 3.083124160766602e-05, "lookahead_loss": 8.01079409980774, "loss": 5.8229, "step": 201000 }, { "epoch": 0.38433074951171875, "grad_norm": 251.80567932128906, "learning_rate": 3.07835578918457e-05, "lookahead_loss": 8.004904676437379, "loss": 5.8583, "step": 201500 }, { "epoch": 0.385284423828125, "grad_norm": 178.39732360839844, "learning_rate": 3.0735874176025393e-05, "lookahead_loss": 7.99044279384613, "loss": 5.8442, "step": 202000 }, { "epoch": 0.38623809814453125, "grad_norm": 205.5572052001953, "learning_rate": 3.068819046020508e-05, "lookahead_loss": 7.993159726142883, "loss": 5.8064, "step": 202500 }, { "epoch": 0.3871917724609375, "grad_norm": 316.1572265625, "learning_rate": 3.064050674438477e-05, "lookahead_loss": 7.932577266693115, "loss": 5.8209, "step": 203000 }, { "epoch": 0.38814544677734375, "grad_norm": 440.8681335449219, "learning_rate": 3.059282302856446e-05, "lookahead_loss": 7.9652340259552, "loss": 5.8089, "step": 203500 }, { "epoch": 0.38909912109375, "grad_norm": 154.04983520507812, "learning_rate": 3.054513931274414e-05, "lookahead_loss": 7.938410136222839, "loss": 5.8171, "step": 204000 }, { "epoch": 0.39005279541015625, "grad_norm": 128.0736541748047, "learning_rate": 3.049745559692383e-05, "lookahead_loss": 8.142047427654266, "loss": 5.7789, "step": 204500 }, { "epoch": 0.3910064697265625, "grad_norm": 110.61468505859375, "learning_rate": 3.0449771881103518e-05, "lookahead_loss": 7.931080083847046, "loss": 5.776, "step": 205000 }, { "epoch": 0.3910064697265625, "eval_accuracy": 0.04008043052837573, "eval_lookahead_loss": 7.935212258720398, "eval_lookahead_perplexity": 2793.9517170335457, "eval_loss": 5.628756523132324, "eval_perplexity": 278.3158230851167, "eval_runtime": 533.8016, "eval_samples_per_second": 18.734, "eval_steps_per_second": 4.683, "step": 205000 }, { "epoch": 0.39196014404296875, "grad_norm": 123.47557067871094, "learning_rate": 3.0402088165283205e-05, "lookahead_loss": 8.026521171569824, "loss": 5.7751, "step": 205500 }, { "epoch": 0.392913818359375, "grad_norm": 496.3388671875, "learning_rate": 3.035440444946289e-05, "lookahead_loss": 8.016767192840577, "loss": 5.7799, "step": 206000 }, { "epoch": 0.39386749267578125, "grad_norm": 206.68429565429688, "learning_rate": 3.0306720733642578e-05, "lookahead_loss": 8.032983914375306, "loss": 5.781, "step": 206500 }, { "epoch": 0.3948211669921875, "grad_norm": 225.5697021484375, "learning_rate": 3.025903701782227e-05, "lookahead_loss": 8.051541337013244, "loss": 5.7931, "step": 207000 }, { "epoch": 0.39577484130859375, "grad_norm": 256.7747497558594, "learning_rate": 3.0211353302001955e-05, "lookahead_loss": 7.942907706260681, "loss": 5.7521, "step": 207500 }, { "epoch": 0.396728515625, "grad_norm": 263.1712646484375, "learning_rate": 3.0163669586181642e-05, "lookahead_loss": 7.9048853788375855, "loss": 5.7429, "step": 208000 }, { "epoch": 0.39768218994140625, "grad_norm": 267.57366943359375, "learning_rate": 3.011598587036133e-05, "lookahead_loss": 7.9338939008712765, "loss": 5.7253, "step": 208500 }, { "epoch": 0.3986358642578125, "grad_norm": 177.303466796875, "learning_rate": 3.0068302154541016e-05, "lookahead_loss": 8.015990719795226, "loss": 5.7804, "step": 209000 }, { "epoch": 0.39958953857421875, "grad_norm": 413.600341796875, "learning_rate": 3.0020618438720706e-05, "lookahead_loss": 8.161231702804566, "loss": 5.8478, "step": 209500 }, { "epoch": 0.400543212890625, "grad_norm": 408.364501953125, "learning_rate": 2.9972934722900393e-05, "lookahead_loss": 8.09525810623169, "loss": 5.8825, "step": 210000 }, { "epoch": 0.400543212890625, "eval_accuracy": 0.040115851272015654, "eval_lookahead_loss": 7.980498466491699, "eval_lookahead_perplexity": 2923.387911872717, "eval_loss": 5.619168758392334, "eval_perplexity": 275.6601477885441, "eval_runtime": 549.7944, "eval_samples_per_second": 18.189, "eval_steps_per_second": 4.547, "step": 210000 }, { "epoch": 0.40149688720703125, "grad_norm": 218.811767578125, "learning_rate": 2.992525100708008e-05, "lookahead_loss": 8.165108282089234, "loss": 5.8299, "step": 210500 }, { "epoch": 0.4024505615234375, "grad_norm": 269.861572265625, "learning_rate": 2.9877567291259766e-05, "lookahead_loss": 8.332568598747253, "loss": 5.8062, "step": 211000 }, { "epoch": 0.40340423583984375, "grad_norm": 199.4031982421875, "learning_rate": 2.9829883575439453e-05, "lookahead_loss": 8.860951683998108, "loss": 5.8857, "step": 211500 }, { "epoch": 0.40435791015625, "grad_norm": 122.75851440429688, "learning_rate": 2.9782199859619143e-05, "lookahead_loss": 8.536361406326295, "loss": 5.8049, "step": 212000 }, { "epoch": 0.40531158447265625, "grad_norm": 131.1082000732422, "learning_rate": 2.973451614379883e-05, "lookahead_loss": 8.294275575637817, "loss": 5.7822, "step": 212500 }, { "epoch": 0.4062652587890625, "grad_norm": 249.11328125, "learning_rate": 2.9686832427978517e-05, "lookahead_loss": 8.12021396636963, "loss": 5.8034, "step": 213000 }, { "epoch": 0.40721893310546875, "grad_norm": 220.605712890625, "learning_rate": 2.9639148712158204e-05, "lookahead_loss": 8.209441896438598, "loss": 5.8605, "step": 213500 }, { "epoch": 0.408172607421875, "grad_norm": 127.85371398925781, "learning_rate": 2.959146499633789e-05, "lookahead_loss": 8.08768050956726, "loss": 5.8298, "step": 214000 }, { "epoch": 0.40912628173828125, "grad_norm": 96.46656036376953, "learning_rate": 2.954378128051758e-05, "lookahead_loss": 8.066397113800049, "loss": 5.7576, "step": 214500 }, { "epoch": 0.4100799560546875, "grad_norm": 116.53323364257812, "learning_rate": 2.9496097564697268e-05, "lookahead_loss": 8.137188397407531, "loss": 5.7651, "step": 215000 }, { "epoch": 0.4100799560546875, "eval_accuracy": 0.03998610567514677, "eval_lookahead_loss": 7.998925698089599, "eval_lookahead_perplexity": 2977.757257764051, "eval_loss": 5.599298000335693, "eval_perplexity": 270.2366347968742, "eval_runtime": 523.2915, "eval_samples_per_second": 19.11, "eval_steps_per_second": 4.777, "step": 215000 }, { "epoch": 0.41103363037109375, "grad_norm": 165.58358764648438, "learning_rate": 2.9448413848876955e-05, "lookahead_loss": 7.942638432502746, "loss": 5.7568, "step": 215500 }, { "epoch": 0.4119873046875, "grad_norm": 113.01541900634766, "learning_rate": 2.940073013305664e-05, "lookahead_loss": 8.086526737213134, "loss": 5.7696, "step": 216000 }, { "epoch": 0.41294097900390625, "grad_norm": 157.66212463378906, "learning_rate": 2.9353046417236328e-05, "lookahead_loss": 8.029751837730407, "loss": 5.7905, "step": 216500 }, { "epoch": 0.4138946533203125, "grad_norm": 133.1062774658203, "learning_rate": 2.930536270141602e-05, "lookahead_loss": 7.96623583984375, "loss": 5.7473, "step": 217000 }, { "epoch": 0.41484832763671875, "grad_norm": 102.1449966430664, "learning_rate": 2.9257678985595705e-05, "lookahead_loss": 7.9718176441192625, "loss": 5.6927, "step": 217500 }, { "epoch": 0.415802001953125, "grad_norm": 293.2792663574219, "learning_rate": 2.9209995269775392e-05, "lookahead_loss": 7.995547922134399, "loss": 5.7955, "step": 218000 }, { "epoch": 0.41675567626953125, "grad_norm": 217.73924255371094, "learning_rate": 2.916231155395508e-05, "lookahead_loss": 7.921799677848816, "loss": 5.7624, "step": 218500 }, { "epoch": 0.4177093505859375, "grad_norm": 93.51490783691406, "learning_rate": 2.9114627838134766e-05, "lookahead_loss": 7.902137145996094, "loss": 5.7767, "step": 219000 }, { "epoch": 0.41866302490234375, "grad_norm": 129.6032257080078, "learning_rate": 2.9066944122314456e-05, "lookahead_loss": 8.048154498100281, "loss": 5.8033, "step": 219500 }, { "epoch": 0.41961669921875, "grad_norm": 352.0240173339844, "learning_rate": 2.9019260406494143e-05, "lookahead_loss": 7.887757177352905, "loss": 5.7721, "step": 220000 }, { "epoch": 0.41961669921875, "eval_accuracy": 0.04063816046966732, "eval_lookahead_loss": 7.8928001026153565, "eval_lookahead_perplexity": 2677.9319163689015, "eval_loss": 5.597925662994385, "eval_perplexity": 269.86603332523515, "eval_runtime": 526.2518, "eval_samples_per_second": 19.002, "eval_steps_per_second": 4.751, "step": 220000 }, { "epoch": 0.42057037353515625, "grad_norm": 125.7489242553711, "learning_rate": 2.897157669067383e-05, "lookahead_loss": 8.019349517822265, "loss": 5.7651, "step": 220500 }, { "epoch": 0.4215240478515625, "grad_norm": 223.880615234375, "learning_rate": 2.8923892974853516e-05, "lookahead_loss": 8.037874313354493, "loss": 5.7612, "step": 221000 }, { "epoch": 0.42247772216796875, "grad_norm": 140.98391723632812, "learning_rate": 2.8876209259033203e-05, "lookahead_loss": 8.131105442047119, "loss": 5.7468, "step": 221500 }, { "epoch": 0.423431396484375, "grad_norm": 104.86039733886719, "learning_rate": 2.8828525543212893e-05, "lookahead_loss": 8.150389587402344, "loss": 5.706, "step": 222000 }, { "epoch": 0.42438507080078125, "grad_norm": 275.54541015625, "learning_rate": 2.878084182739258e-05, "lookahead_loss": 7.888950291633606, "loss": 5.7059, "step": 222500 }, { "epoch": 0.4253387451171875, "grad_norm": 115.34858703613281, "learning_rate": 2.8733158111572267e-05, "lookahead_loss": 8.009958860397338, "loss": 5.7372, "step": 223000 }, { "epoch": 0.42629241943359375, "grad_norm": 114.41761016845703, "learning_rate": 2.8685474395751954e-05, "lookahead_loss": 7.961854325294494, "loss": 5.8186, "step": 223500 }, { "epoch": 0.42724609375, "grad_norm": 286.1802062988281, "learning_rate": 2.863779067993164e-05, "lookahead_loss": 8.01421659374237, "loss": 5.803, "step": 224000 }, { "epoch": 0.42819976806640625, "grad_norm": 353.0839538574219, "learning_rate": 2.859010696411133e-05, "lookahead_loss": 8.038468587875366, "loss": 5.8375, "step": 224500 }, { "epoch": 0.4291534423828125, "grad_norm": 293.6863708496094, "learning_rate": 2.8542423248291018e-05, "lookahead_loss": 8.135893997192383, "loss": 5.8312, "step": 225000 }, { "epoch": 0.4291534423828125, "eval_accuracy": 0.03961996086105675, "eval_lookahead_loss": 8.019239653778076, "eval_lookahead_perplexity": 3038.8658641177813, "eval_loss": 5.605351448059082, "eval_perplexity": 271.8774594511356, "eval_runtime": 515.5355, "eval_samples_per_second": 19.397, "eval_steps_per_second": 4.849, "step": 225000 }, { "epoch": 0.43010711669921875, "grad_norm": 193.81024169921875, "learning_rate": 2.8494739532470705e-05, "lookahead_loss": 8.016305312156677, "loss": 5.775, "step": 225500 }, { "epoch": 0.431060791015625, "grad_norm": 372.3008728027344, "learning_rate": 2.844705581665039e-05, "lookahead_loss": 8.025052762031555, "loss": 5.7744, "step": 226000 }, { "epoch": 0.43201446533203125, "grad_norm": 188.81065368652344, "learning_rate": 2.8399372100830078e-05, "lookahead_loss": 8.127145385742187, "loss": 5.8578, "step": 226500 }, { "epoch": 0.4329681396484375, "grad_norm": 285.2081604003906, "learning_rate": 2.835168838500977e-05, "lookahead_loss": 8.17154926776886, "loss": 5.7772, "step": 227000 }, { "epoch": 0.43392181396484375, "grad_norm": 111.29180908203125, "learning_rate": 2.8304004669189455e-05, "lookahead_loss": 8.060789797782897, "loss": 5.8196, "step": 227500 }, { "epoch": 0.43487548828125, "grad_norm": 91.93737030029297, "learning_rate": 2.8256320953369142e-05, "lookahead_loss": 8.039645392417908, "loss": 5.7659, "step": 228000 }, { "epoch": 0.43582916259765625, "grad_norm": 195.14163208007812, "learning_rate": 2.820863723754883e-05, "lookahead_loss": 8.037476936340331, "loss": 5.8295, "step": 228500 }, { "epoch": 0.4367828369140625, "grad_norm": 122.90339660644531, "learning_rate": 2.8160953521728516e-05, "lookahead_loss": 7.967752510070801, "loss": 5.7762, "step": 229000 }, { "epoch": 0.43773651123046875, "grad_norm": 326.6914367675781, "learning_rate": 2.8113269805908206e-05, "lookahead_loss": 7.894039862632751, "loss": 5.7567, "step": 229500 }, { "epoch": 0.438690185546875, "grad_norm": 55.29019546508789, "learning_rate": 2.8065586090087893e-05, "lookahead_loss": 7.995678850173951, "loss": 5.7752, "step": 230000 }, { "epoch": 0.438690185546875, "eval_accuracy": 0.0404866927592955, "eval_lookahead_loss": 7.800916164588928, "eval_lookahead_perplexity": 2442.838995313791, "eval_loss": 5.582324981689453, "eval_perplexity": 265.6886093991886, "eval_runtime": 521.4095, "eval_samples_per_second": 19.179, "eval_steps_per_second": 4.795, "step": 230000 }, { "epoch": 0.43964385986328125, "grad_norm": 181.02182006835938, "learning_rate": 2.801790237426758e-05, "lookahead_loss": 7.8941036252975465, "loss": 5.7747, "step": 230500 }, { "epoch": 0.4405975341796875, "grad_norm": 219.7286834716797, "learning_rate": 2.7970218658447266e-05, "lookahead_loss": 7.850437452316284, "loss": 5.7117, "step": 231000 }, { "epoch": 0.44155120849609375, "grad_norm": 318.82452392578125, "learning_rate": 2.7922534942626953e-05, "lookahead_loss": 7.93010457611084, "loss": 5.7734, "step": 231500 }, { "epoch": 0.4425048828125, "grad_norm": 92.1238784790039, "learning_rate": 2.7874851226806643e-05, "lookahead_loss": 7.852621850967407, "loss": 5.7491, "step": 232000 }, { "epoch": 0.44345855712890625, "grad_norm": 222.38514709472656, "learning_rate": 2.782716751098633e-05, "lookahead_loss": 7.910350761413574, "loss": 5.7513, "step": 232500 }, { "epoch": 0.4444122314453125, "grad_norm": 89.17455291748047, "learning_rate": 2.7779483795166017e-05, "lookahead_loss": 7.936665622711182, "loss": 5.7743, "step": 233000 }, { "epoch": 0.44536590576171875, "grad_norm": 127.73918914794922, "learning_rate": 2.7731800079345704e-05, "lookahead_loss": 7.933457874298096, "loss": 5.7236, "step": 233500 }, { "epoch": 0.446319580078125, "grad_norm": 227.76210021972656, "learning_rate": 2.768411636352539e-05, "lookahead_loss": 7.891174397468567, "loss": 5.7803, "step": 234000 }, { "epoch": 0.44727325439453125, "grad_norm": 238.3862762451172, "learning_rate": 2.763643264770508e-05, "lookahead_loss": 8.019835186958312, "loss": 5.7706, "step": 234500 }, { "epoch": 0.4482269287109375, "grad_norm": 218.50311279296875, "learning_rate": 2.7588748931884768e-05, "lookahead_loss": 8.05141785144806, "loss": 5.8101, "step": 235000 }, { "epoch": 0.4482269287109375, "eval_accuracy": 0.039656360078277886, "eval_lookahead_loss": 7.8881004663467404, "eval_lookahead_perplexity": 2665.376137316668, "eval_loss": 5.590256214141846, "eval_perplexity": 267.80422615428853, "eval_runtime": 531.3033, "eval_samples_per_second": 18.822, "eval_steps_per_second": 4.705, "step": 235000 }, { "epoch": 0.44918060302734375, "grad_norm": 121.96994018554688, "learning_rate": 2.7541065216064455e-05, "lookahead_loss": 7.990393484115601, "loss": 5.7673, "step": 235500 }, { "epoch": 0.45013427734375, "grad_norm": 226.03062438964844, "learning_rate": 2.749338150024414e-05, "lookahead_loss": 7.948341635704041, "loss": 5.7245, "step": 236000 }, { "epoch": 0.45108795166015625, "grad_norm": 215.0734100341797, "learning_rate": 2.7445697784423828e-05, "lookahead_loss": 7.886289103507996, "loss": 5.7379, "step": 236500 }, { "epoch": 0.4520416259765625, "grad_norm": 260.45806884765625, "learning_rate": 2.739801406860352e-05, "lookahead_loss": 7.853891803741455, "loss": 5.7108, "step": 237000 }, { "epoch": 0.45299530029296875, "grad_norm": 220.23484802246094, "learning_rate": 2.7350330352783205e-05, "lookahead_loss": 7.949448519706726, "loss": 5.7043, "step": 237500 }, { "epoch": 0.453948974609375, "grad_norm": 179.2233428955078, "learning_rate": 2.7302646636962892e-05, "lookahead_loss": 8.07891303062439, "loss": 5.8097, "step": 238000 }, { "epoch": 0.45490264892578125, "grad_norm": 185.00640869140625, "learning_rate": 2.725496292114258e-05, "lookahead_loss": 7.934249756813049, "loss": 5.7821, "step": 238500 }, { "epoch": 0.4558563232421875, "grad_norm": 98.55986785888672, "learning_rate": 2.7207279205322266e-05, "lookahead_loss": 7.9079827318191525, "loss": 5.7936, "step": 239000 }, { "epoch": 0.45680999755859375, "grad_norm": 98.65374755859375, "learning_rate": 2.7159595489501956e-05, "lookahead_loss": 7.935722849845886, "loss": 5.764, "step": 239500 }, { "epoch": 0.457763671875, "grad_norm": 71.52519226074219, "learning_rate": 2.7111911773681643e-05, "lookahead_loss": 7.954815607070923, "loss": 5.7115, "step": 240000 }, { "epoch": 0.457763671875, "eval_accuracy": 0.03998199608610568, "eval_lookahead_loss": 7.9381085346221925, "eval_lookahead_perplexity": 2802.055501795803, "eval_loss": 5.5693535804748535, "eval_perplexity": 262.2645115327738, "eval_runtime": 516.5388, "eval_samples_per_second": 19.36, "eval_steps_per_second": 4.84, "step": 240000 }, { "epoch": 0.45871734619140625, "grad_norm": 101.23875427246094, "learning_rate": 2.706422805786133e-05, "lookahead_loss": 8.06018431186676, "loss": 5.7647, "step": 240500 }, { "epoch": 0.4596710205078125, "grad_norm": 68.76702117919922, "learning_rate": 2.7016544342041016e-05, "lookahead_loss": 7.927814032554626, "loss": 5.7776, "step": 241000 }, { "epoch": 0.46062469482421875, "grad_norm": 87.05778503417969, "learning_rate": 2.6968860626220703e-05, "lookahead_loss": 7.93625232219696, "loss": 5.826, "step": 241500 }, { "epoch": 0.461578369140625, "grad_norm": 110.24046325683594, "learning_rate": 2.6921176910400393e-05, "lookahead_loss": 7.907251337051392, "loss": 5.7621, "step": 242000 }, { "epoch": 0.46253204345703125, "grad_norm": 154.62127685546875, "learning_rate": 2.687349319458008e-05, "lookahead_loss": 8.007177041053772, "loss": 5.744, "step": 242500 }, { "epoch": 0.4634857177734375, "grad_norm": 125.7730941772461, "learning_rate": 2.6825809478759767e-05, "lookahead_loss": 7.996015496253968, "loss": 5.7582, "step": 243000 }, { "epoch": 0.46443939208984375, "grad_norm": 160.78684997558594, "learning_rate": 2.6778125762939454e-05, "lookahead_loss": 7.899339105606079, "loss": 5.7627, "step": 243500 }, { "epoch": 0.46539306640625, "grad_norm": 189.1313018798828, "learning_rate": 2.673044204711914e-05, "lookahead_loss": 7.929858269691468, "loss": 5.7188, "step": 244000 }, { "epoch": 0.46634674072265625, "grad_norm": 393.9104919433594, "learning_rate": 2.668275833129883e-05, "lookahead_loss": 7.9483852958679195, "loss": 5.6762, "step": 244500 }, { "epoch": 0.4673004150390625, "grad_norm": 130.38381958007812, "learning_rate": 2.6635074615478518e-05, "lookahead_loss": 7.811206496238708, "loss": 5.7196, "step": 245000 }, { "epoch": 0.4673004150390625, "eval_accuracy": 0.039366536203522505, "eval_lookahead_loss": 7.8143014207839965, "eval_lookahead_perplexity": 2475.7568365023726, "eval_loss": 5.559566974639893, "eval_perplexity": 259.71035080198726, "eval_runtime": 619.6304, "eval_samples_per_second": 16.139, "eval_steps_per_second": 4.035, "step": 245000 }, { "epoch": 0.46825408935546875, "grad_norm": 88.3947982788086, "learning_rate": 2.6587390899658205e-05, "lookahead_loss": 8.005855606079102, "loss": 5.7302, "step": 245500 }, { "epoch": 0.469207763671875, "grad_norm": 209.3162841796875, "learning_rate": 2.653970718383789e-05, "lookahead_loss": 7.885679057121277, "loss": 5.6678, "step": 246000 }, { "epoch": 0.47016143798828125, "grad_norm": 101.4078369140625, "learning_rate": 2.6492023468017578e-05, "lookahead_loss": 7.771181092262268, "loss": 5.6962, "step": 246500 }, { "epoch": 0.4711151123046875, "grad_norm": 145.29884338378906, "learning_rate": 2.644433975219727e-05, "lookahead_loss": 7.83440586566925, "loss": 5.7379, "step": 247000 }, { "epoch": 0.47206878662109375, "grad_norm": 202.72129821777344, "learning_rate": 2.6396656036376955e-05, "lookahead_loss": 7.983269752502442, "loss": 5.6937, "step": 247500 }, { "epoch": 0.4730224609375, "grad_norm": 121.18062591552734, "learning_rate": 2.6348972320556642e-05, "lookahead_loss": 7.854932978153228, "loss": 5.7405, "step": 248000 }, { "epoch": 0.47397613525390625, "grad_norm": 92.59541320800781, "learning_rate": 2.630128860473633e-05, "lookahead_loss": 7.95072023677826, "loss": 5.7063, "step": 248500 }, { "epoch": 0.4749298095703125, "grad_norm": 53.81815719604492, "learning_rate": 2.6253604888916016e-05, "lookahead_loss": 7.889274845123291, "loss": 5.745, "step": 249000 }, { "epoch": 0.47588348388671875, "grad_norm": 255.19566345214844, "learning_rate": 2.6205921173095706e-05, "lookahead_loss": 7.865012439727783, "loss": 5.7582, "step": 249500 }, { "epoch": 0.476837158203125, "grad_norm": 132.7261505126953, "learning_rate": 2.6158237457275393e-05, "lookahead_loss": 7.94509057712555, "loss": 5.6944, "step": 250000 }, { "epoch": 0.476837158203125, "eval_accuracy": 0.04090117416829746, "eval_lookahead_loss": 7.877230117797851, "eval_lookahead_perplexity": 2636.559477019111, "eval_loss": 5.5477824211120605, "eval_perplexity": 256.6677433682515, "eval_runtime": 544.4863, "eval_samples_per_second": 18.366, "eval_steps_per_second": 4.591, "step": 250000 }, { "epoch": 0.47779083251953125, "grad_norm": 78.739501953125, "learning_rate": 2.611055374145508e-05, "lookahead_loss": 7.73318566608429, "loss": 5.6425, "step": 250500 }, { "epoch": 0.4787445068359375, "grad_norm": 134.01547241210938, "learning_rate": 2.6062870025634766e-05, "lookahead_loss": 7.789692436218262, "loss": 5.6702, "step": 251000 }, { "epoch": 0.47969818115234375, "grad_norm": 105.15640258789062, "learning_rate": 2.6015186309814453e-05, "lookahead_loss": 7.943496225357055, "loss": 5.7236, "step": 251500 }, { "epoch": 0.48065185546875, "grad_norm": 266.74237060546875, "learning_rate": 2.5967502593994143e-05, "lookahead_loss": 8.164602140426636, "loss": 5.7067, "step": 252000 }, { "epoch": 0.48160552978515625, "grad_norm": 183.5140380859375, "learning_rate": 2.591981887817383e-05, "lookahead_loss": 8.07798819065094, "loss": 5.7975, "step": 252500 }, { "epoch": 0.4825592041015625, "grad_norm": 171.31027221679688, "learning_rate": 2.5872135162353517e-05, "lookahead_loss": 8.011557813644409, "loss": 5.7384, "step": 253000 }, { "epoch": 0.48351287841796875, "grad_norm": 185.53964233398438, "learning_rate": 2.5824451446533204e-05, "lookahead_loss": 7.873360324859619, "loss": 5.746, "step": 253500 }, { "epoch": 0.484466552734375, "grad_norm": 221.7014617919922, "learning_rate": 2.577676773071289e-05, "lookahead_loss": 7.927414426803589, "loss": 5.713, "step": 254000 }, { "epoch": 0.48542022705078125, "grad_norm": 406.493896484375, "learning_rate": 2.572908401489258e-05, "lookahead_loss": 7.7192041015625, "loss": 5.5998, "step": 254500 }, { "epoch": 0.4863739013671875, "grad_norm": 186.44837951660156, "learning_rate": 2.5681400299072268e-05, "lookahead_loss": 7.8425050592422485, "loss": 5.6823, "step": 255000 }, { "epoch": 0.4863739013671875, "eval_accuracy": 0.039488062622309195, "eval_lookahead_loss": 7.7952254072189335, "eval_lookahead_perplexity": 2428.976871698624, "eval_loss": 5.529767990112305, "eval_perplexity": 252.08541792878367, "eval_runtime": 584.3307, "eval_samples_per_second": 17.114, "eval_steps_per_second": 4.278, "step": 255000 }, { "epoch": 0.48732757568359375, "grad_norm": 160.92430114746094, "learning_rate": 2.5633716583251955e-05, "lookahead_loss": 7.863575527191162, "loss": 5.7197, "step": 255500 }, { "epoch": 0.48828125, "grad_norm": 340.6885986328125, "learning_rate": 2.558603286743164e-05, "lookahead_loss": 7.944182499885559, "loss": 5.7097, "step": 256000 }, { "epoch": 0.48923492431640625, "grad_norm": 467.88232421875, "learning_rate": 2.5538349151611328e-05, "lookahead_loss": 8.079351444244384, "loss": 5.7367, "step": 256500 }, { "epoch": 0.4901885986328125, "grad_norm": 422.1680603027344, "learning_rate": 2.549066543579102e-05, "lookahead_loss": 8.077034591674805, "loss": 5.7252, "step": 257000 }, { "epoch": 0.49114227294921875, "grad_norm": 195.889892578125, "learning_rate": 2.5442981719970705e-05, "lookahead_loss": 7.955115791320801, "loss": 5.7118, "step": 257500 }, { "epoch": 0.492095947265625, "grad_norm": 125.11628723144531, "learning_rate": 2.5395298004150392e-05, "lookahead_loss": 8.130371218681335, "loss": 5.7072, "step": 258000 }, { "epoch": 0.49304962158203125, "grad_norm": 163.484375, "learning_rate": 2.534761428833008e-05, "lookahead_loss": 7.984317245483399, "loss": 5.7133, "step": 258500 }, { "epoch": 0.4940032958984375, "grad_norm": 108.6294937133789, "learning_rate": 2.5299930572509766e-05, "lookahead_loss": 8.122366019248963, "loss": 5.7196, "step": 259000 }, { "epoch": 0.49495697021484375, "grad_norm": 176.71615600585938, "learning_rate": 2.5252246856689456e-05, "lookahead_loss": 7.94747174167633, "loss": 5.7616, "step": 259500 }, { "epoch": 0.49591064453125, "grad_norm": 119.74739074707031, "learning_rate": 2.5204563140869143e-05, "lookahead_loss": 8.043834136009217, "loss": 5.674, "step": 260000 }, { "epoch": 0.49591064453125, "eval_accuracy": 0.039883561643835615, "eval_lookahead_loss": 7.892603374671936, "eval_lookahead_perplexity": 2677.405144147482, "eval_loss": 5.5317888259887695, "eval_perplexity": 252.5953562625681, "eval_runtime": 931.6606, "eval_samples_per_second": 10.734, "eval_steps_per_second": 2.683, "step": 260000 }, { "epoch": 0.49686431884765625, "grad_norm": 256.5855407714844, "learning_rate": 2.515687942504883e-05, "lookahead_loss": 8.040000820159912, "loss": 5.7287, "step": 260500 }, { "epoch": 0.4978179931640625, "grad_norm": 351.4286804199219, "learning_rate": 2.5109195709228516e-05, "lookahead_loss": 8.082924491882324, "loss": 5.711, "step": 261000 }, { "epoch": 0.49877166748046875, "grad_norm": 494.5824279785156, "learning_rate": 2.5061511993408203e-05, "lookahead_loss": 7.976051651954651, "loss": 5.7022, "step": 261500 }, { "epoch": 0.499725341796875, "grad_norm": 720.60498046875, "learning_rate": 2.5013828277587893e-05, "lookahead_loss": 8.031590118408204, "loss": 5.7293, "step": 262000 }, { "epoch": 0.5006790161132812, "grad_norm": 134.50892639160156, "learning_rate": 2.496614456176758e-05, "lookahead_loss": 7.9360523090362545, "loss": 5.7008, "step": 262500 }, { "epoch": 0.5016326904296875, "grad_norm": 179.30775451660156, "learning_rate": 2.4918460845947267e-05, "lookahead_loss": 7.954775795936585, "loss": 5.7213, "step": 263000 }, { "epoch": 0.5025863647460938, "grad_norm": 136.96092224121094, "learning_rate": 2.4870777130126954e-05, "lookahead_loss": 7.960262534141541, "loss": 5.6912, "step": 263500 }, { "epoch": 0.5035400390625, "grad_norm": 232.34616088867188, "learning_rate": 2.482309341430664e-05, "lookahead_loss": 7.991747802734375, "loss": 5.7477, "step": 264000 }, { "epoch": 0.5044937133789062, "grad_norm": 136.36636352539062, "learning_rate": 2.477540969848633e-05, "lookahead_loss": 8.005833382606506, "loss": 5.6751, "step": 264500 }, { "epoch": 0.5054473876953125, "grad_norm": 523.0411376953125, "learning_rate": 2.4727725982666018e-05, "lookahead_loss": 7.865674677848816, "loss": 5.6606, "step": 265000 }, { "epoch": 0.5054473876953125, "eval_accuracy": 0.039968297455968686, "eval_lookahead_loss": 7.818929781341553, "eval_lookahead_perplexity": 2487.242090239418, "eval_loss": 5.5178399085998535, "eval_perplexity": 249.09638467146706, "eval_runtime": 711.2568, "eval_samples_per_second": 14.06, "eval_steps_per_second": 3.515, "step": 265000 }, { "epoch": 0.5064010620117188, "grad_norm": 98.11244201660156, "learning_rate": 2.4680042266845705e-05, "lookahead_loss": 7.945243515014648, "loss": 5.6797, "step": 265500 }, { "epoch": 0.507354736328125, "grad_norm": 113.1644287109375, "learning_rate": 2.463235855102539e-05, "lookahead_loss": 7.830617101669311, "loss": 5.6047, "step": 266000 }, { "epoch": 0.5083084106445312, "grad_norm": 175.635986328125, "learning_rate": 2.4584674835205078e-05, "lookahead_loss": 7.968400657653809, "loss": 5.7098, "step": 266500 }, { "epoch": 0.5092620849609375, "grad_norm": 203.55357360839844, "learning_rate": 2.453699111938477e-05, "lookahead_loss": 7.926096343994141, "loss": 5.7296, "step": 267000 }, { "epoch": 0.5102157592773438, "grad_norm": 147.76788330078125, "learning_rate": 2.4489307403564455e-05, "lookahead_loss": 7.857442797660828, "loss": 5.7263, "step": 267500 }, { "epoch": 0.51116943359375, "grad_norm": 225.1747283935547, "learning_rate": 2.4441623687744142e-05, "lookahead_loss": 7.82410337638855, "loss": 5.6505, "step": 268000 }, { "epoch": 0.5121231079101562, "grad_norm": 102.12814331054688, "learning_rate": 2.439393997192383e-05, "lookahead_loss": 7.833889961242676, "loss": 5.6692, "step": 268500 }, { "epoch": 0.5130767822265625, "grad_norm": 139.8765869140625, "learning_rate": 2.4346256256103516e-05, "lookahead_loss": 7.902825137138366, "loss": 5.6796, "step": 269000 }, { "epoch": 0.5140304565429688, "grad_norm": 231.129638671875, "learning_rate": 2.4298572540283206e-05, "lookahead_loss": 7.88152232170105, "loss": 5.6918, "step": 269500 }, { "epoch": 0.514984130859375, "grad_norm": 910.3927612304688, "learning_rate": 2.4250888824462893e-05, "lookahead_loss": 7.906339003562927, "loss": 5.7097, "step": 270000 }, { "epoch": 0.514984130859375, "eval_accuracy": 0.03953228962818004, "eval_lookahead_loss": 7.846470501327515, "eval_lookahead_perplexity": 2556.6945233235087, "eval_loss": 5.510077953338623, "eval_perplexity": 247.17039407407708, "eval_runtime": 1010.3834, "eval_samples_per_second": 9.897, "eval_steps_per_second": 2.474, "step": 270000 }, { "epoch": 0.5159378051757812, "grad_norm": 112.84613037109375, "learning_rate": 2.420320510864258e-05, "lookahead_loss": 7.836203103065491, "loss": 5.6826, "step": 270500 }, { "epoch": 0.5168914794921875, "grad_norm": 213.50648498535156, "learning_rate": 2.4155521392822266e-05, "lookahead_loss": 7.963844347953796, "loss": 5.6805, "step": 271000 }, { "epoch": 0.5178451538085938, "grad_norm": 248.32904052734375, "learning_rate": 2.4107837677001953e-05, "lookahead_loss": 7.891025122642517, "loss": 5.6991, "step": 271500 }, { "epoch": 0.518798828125, "grad_norm": 150.98219299316406, "learning_rate": 2.406015396118164e-05, "lookahead_loss": 8.056645055770874, "loss": 5.7061, "step": 272000 }, { "epoch": 0.5197525024414062, "grad_norm": 287.1550598144531, "learning_rate": 2.401247024536133e-05, "lookahead_loss": 7.888085339546204, "loss": 5.652, "step": 272500 }, { "epoch": 0.5207061767578125, "grad_norm": 285.25653076171875, "learning_rate": 2.3964786529541017e-05, "lookahead_loss": 7.850812032699585, "loss": 5.6583, "step": 273000 }, { "epoch": 0.5216598510742188, "grad_norm": 169.58987426757812, "learning_rate": 2.3917102813720704e-05, "lookahead_loss": 7.85244350528717, "loss": 5.6571, "step": 273500 }, { "epoch": 0.522613525390625, "grad_norm": 270.7475280761719, "learning_rate": 2.386941909790039e-05, "lookahead_loss": 7.711337128639221, "loss": 5.6709, "step": 274000 }, { "epoch": 0.5235671997070312, "grad_norm": 95.86302947998047, "learning_rate": 2.3821735382080078e-05, "lookahead_loss": 7.814918475151062, "loss": 5.6661, "step": 274500 }, { "epoch": 0.5245208740234375, "grad_norm": 108.87361907958984, "learning_rate": 2.3774051666259768e-05, "lookahead_loss": 7.845372378349304, "loss": 5.7047, "step": 275000 }, { "epoch": 0.5245208740234375, "eval_accuracy": 0.0401679060665362, "eval_lookahead_loss": 7.766735979652405, "eval_lookahead_perplexity": 2360.753152805512, "eval_loss": 5.501504898071289, "eval_perplexity": 245.06044589932588, "eval_runtime": 554.3454, "eval_samples_per_second": 18.039, "eval_steps_per_second": 4.51, "step": 275000 }, { "epoch": 0.5254745483398438, "grad_norm": 220.28314208984375, "learning_rate": 2.3726367950439455e-05, "lookahead_loss": 7.902491591453552, "loss": 5.6872, "step": 275500 }, { "epoch": 0.52642822265625, "grad_norm": 179.75665283203125, "learning_rate": 2.367868423461914e-05, "lookahead_loss": 7.8042371435165405, "loss": 5.6987, "step": 276000 }, { "epoch": 0.5273818969726562, "grad_norm": 187.7112274169922, "learning_rate": 2.3631000518798828e-05, "lookahead_loss": 7.775227765083313, "loss": 5.6744, "step": 276500 }, { "epoch": 0.5283355712890625, "grad_norm": 181.23861694335938, "learning_rate": 2.3583316802978515e-05, "lookahead_loss": 7.902981309890747, "loss": 5.637, "step": 277000 }, { "epoch": 0.5292892456054688, "grad_norm": 264.29046630859375, "learning_rate": 2.3535633087158205e-05, "lookahead_loss": 7.891763837814331, "loss": 5.6865, "step": 277500 }, { "epoch": 0.530242919921875, "grad_norm": 288.1813049316406, "learning_rate": 2.3487949371337892e-05, "lookahead_loss": 7.95842147064209, "loss": 5.7011, "step": 278000 }, { "epoch": 0.5311965942382812, "grad_norm": 771.5421142578125, "learning_rate": 2.344026565551758e-05, "lookahead_loss": 7.780130308151245, "loss": 5.6183, "step": 278500 }, { "epoch": 0.5321502685546875, "grad_norm": 170.33197021484375, "learning_rate": 2.3392581939697266e-05, "lookahead_loss": 7.832579337120056, "loss": 5.6035, "step": 279000 }, { "epoch": 0.5331039428710938, "grad_norm": 135.21746826171875, "learning_rate": 2.3344898223876953e-05, "lookahead_loss": 7.8323476657867435, "loss": 5.6069, "step": 279500 }, { "epoch": 0.5340576171875, "grad_norm": 197.5283966064453, "learning_rate": 2.3297214508056643e-05, "lookahead_loss": 7.921973949432373, "loss": 5.6797, "step": 280000 }, { "epoch": 0.5340576171875, "eval_accuracy": 0.03973953033268102, "eval_lookahead_loss": 7.896882347488403, "eval_lookahead_perplexity": 2688.886234096004, "eval_loss": 5.498246669769287, "eval_perplexity": 244.26328239436936, "eval_runtime": 696.1062, "eval_samples_per_second": 14.366, "eval_steps_per_second": 3.591, "step": 280000 }, { "epoch": 0.5350112915039062, "grad_norm": 176.56553649902344, "learning_rate": 2.324953079223633e-05, "lookahead_loss": 8.153367050170898, "loss": 5.7676, "step": 280500 }, { "epoch": 0.5359649658203125, "grad_norm": 136.72787475585938, "learning_rate": 2.3201847076416016e-05, "lookahead_loss": 8.111672859191895, "loss": 5.7358, "step": 281000 }, { "epoch": 0.5369186401367188, "grad_norm": 396.6884460449219, "learning_rate": 2.3154163360595703e-05, "lookahead_loss": 8.053862014770507, "loss": 5.7268, "step": 281500 }, { "epoch": 0.537872314453125, "grad_norm": 338.91400146484375, "learning_rate": 2.310647964477539e-05, "lookahead_loss": 7.907861955642701, "loss": 5.6864, "step": 282000 }, { "epoch": 0.5388259887695312, "grad_norm": 136.49169921875, "learning_rate": 2.305879592895508e-05, "lookahead_loss": 7.894360318183899, "loss": 5.6667, "step": 282500 }, { "epoch": 0.5397796630859375, "grad_norm": 88.46846008300781, "learning_rate": 2.3011112213134767e-05, "lookahead_loss": 7.9689692754745485, "loss": 5.6979, "step": 283000 }, { "epoch": 0.5407333374023438, "grad_norm": 121.82408905029297, "learning_rate": 2.2963428497314454e-05, "lookahead_loss": 7.89152933883667, "loss": 5.657, "step": 283500 }, { "epoch": 0.54168701171875, "grad_norm": 313.6229248046875, "learning_rate": 2.291574478149414e-05, "lookahead_loss": 7.964260926246643, "loss": 5.6644, "step": 284000 }, { "epoch": 0.5426406860351562, "grad_norm": 333.09686279296875, "learning_rate": 2.2868061065673828e-05, "lookahead_loss": 8.006710891723634, "loss": 5.6619, "step": 284500 }, { "epoch": 0.5435943603515625, "grad_norm": 362.7091979980469, "learning_rate": 2.2820377349853518e-05, "lookahead_loss": 7.931339549064636, "loss": 5.6739, "step": 285000 }, { "epoch": 0.5435943603515625, "eval_accuracy": 0.039767906066536206, "eval_lookahead_loss": 8.024146954345703, "eval_lookahead_perplexity": 3053.8151425988985, "eval_loss": 5.492959022521973, "eval_perplexity": 242.9751130155805, "eval_runtime": 727.9789, "eval_samples_per_second": 13.737, "eval_steps_per_second": 3.434, "step": 285000 }, { "epoch": 0.5445480346679688, "grad_norm": 343.2017517089844, "learning_rate": 2.2772693634033205e-05, "lookahead_loss": 8.036055931091308, "loss": 5.6889, "step": 285500 }, { "epoch": 0.545501708984375, "grad_norm": 125.46600341796875, "learning_rate": 2.272500991821289e-05, "lookahead_loss": 7.9711162357330325, "loss": 5.6854, "step": 286000 }, { "epoch": 0.5464553833007812, "grad_norm": 147.85902404785156, "learning_rate": 2.2677326202392578e-05, "lookahead_loss": 7.900034138679504, "loss": 5.676, "step": 286500 }, { "epoch": 0.5474090576171875, "grad_norm": 178.62083435058594, "learning_rate": 2.2629642486572265e-05, "lookahead_loss": 7.966514761924744, "loss": 5.6325, "step": 287000 }, { "epoch": 0.5483627319335938, "grad_norm": 186.846435546875, "learning_rate": 2.2581958770751955e-05, "lookahead_loss": 8.05688861656189, "loss": 5.6929, "step": 287500 }, { "epoch": 0.54931640625, "grad_norm": 138.51942443847656, "learning_rate": 2.2534275054931642e-05, "lookahead_loss": 7.938032037734986, "loss": 5.6878, "step": 288000 }, { "epoch": 0.5502700805664062, "grad_norm": 571.3887329101562, "learning_rate": 2.248659133911133e-05, "lookahead_loss": 7.8102029514312745, "loss": 5.6835, "step": 288500 }, { "epoch": 0.5512237548828125, "grad_norm": 350.6131591796875, "learning_rate": 2.2438907623291016e-05, "lookahead_loss": 8.008330041885376, "loss": 5.692, "step": 289000 }, { "epoch": 0.5521774291992188, "grad_norm": 204.22744750976562, "learning_rate": 2.2391223907470703e-05, "lookahead_loss": 8.051590940475464, "loss": 5.6782, "step": 289500 }, { "epoch": 0.553131103515625, "grad_norm": 175.1352996826172, "learning_rate": 2.2343540191650393e-05, "lookahead_loss": 8.018924990653991, "loss": 5.6826, "step": 290000 }, { "epoch": 0.553131103515625, "eval_accuracy": 0.0397133072407045, "eval_lookahead_loss": 7.910617658615112, "eval_lookahead_perplexity": 2726.0737293706875, "eval_loss": 5.498958110809326, "eval_perplexity": 244.4371231494307, "eval_runtime": 3414.2138, "eval_samples_per_second": 2.929, "eval_steps_per_second": 0.732, "step": 290000 }, { "epoch": 0.5540847778320312, "grad_norm": 169.14169311523438, "learning_rate": 2.229585647583008e-05, "lookahead_loss": 7.94398180103302, "loss": 5.6832, "step": 290500 }, { "epoch": 0.5550384521484375, "grad_norm": 337.6749572753906, "learning_rate": 2.2248172760009766e-05, "lookahead_loss": 7.863245802879334, "loss": 5.6869, "step": 291000 }, { "epoch": 0.5559921264648438, "grad_norm": 208.70806884765625, "learning_rate": 2.2200489044189453e-05, "lookahead_loss": 8.01245299911499, "loss": 5.6838, "step": 291500 }, { "epoch": 0.55694580078125, "grad_norm": 268.7373046875, "learning_rate": 2.215280532836914e-05, "lookahead_loss": 7.866070913314819, "loss": 5.5954, "step": 292000 }, { "epoch": 0.5578994750976562, "grad_norm": 614.4173583984375, "learning_rate": 2.210512161254883e-05, "lookahead_loss": 7.853419567108154, "loss": 5.6345, "step": 292500 }, { "epoch": 0.5588531494140625, "grad_norm": 177.70216369628906, "learning_rate": 2.2057437896728517e-05, "lookahead_loss": 8.02554615020752, "loss": 5.61, "step": 293000 }, { "epoch": 0.5598068237304688, "grad_norm": 399.96954345703125, "learning_rate": 2.2009754180908204e-05, "lookahead_loss": 7.839038130760193, "loss": 5.5791, "step": 293500 }, { "epoch": 0.560760498046875, "grad_norm": 193.72483825683594, "learning_rate": 2.196207046508789e-05, "lookahead_loss": 7.947853876113892, "loss": 5.7159, "step": 294000 }, { "epoch": 0.5617141723632812, "grad_norm": 180.09555053710938, "learning_rate": 2.1914386749267578e-05, "lookahead_loss": 7.849945856094361, "loss": 5.7476, "step": 294500 }, { "epoch": 0.5626678466796875, "grad_norm": 278.494873046875, "learning_rate": 2.1866703033447268e-05, "lookahead_loss": 7.914177971839905, "loss": 5.7864, "step": 295000 }, { "epoch": 0.5626678466796875, "eval_accuracy": 0.03965949119373777, "eval_lookahead_loss": 7.8498057962417604, "eval_lookahead_perplexity": 2565.236089968157, "eval_loss": 5.479959964752197, "eval_perplexity": 239.83710524410398, "eval_runtime": 1093.9314, "eval_samples_per_second": 9.141, "eval_steps_per_second": 2.285, "step": 295000 }, { "epoch": 0.5636215209960938, "grad_norm": 305.6580505371094, "learning_rate": 2.1819019317626955e-05, "lookahead_loss": 7.965362969398498, "loss": 5.7001, "step": 295500 }, { "epoch": 0.5645751953125, "grad_norm": 264.44256591796875, "learning_rate": 2.177133560180664e-05, "lookahead_loss": 7.95421821975708, "loss": 5.6634, "step": 296000 }, { "epoch": 0.5655288696289062, "grad_norm": 252.90435791015625, "learning_rate": 2.1723651885986328e-05, "lookahead_loss": 7.970698679924011, "loss": 5.6515, "step": 296500 }, { "epoch": 0.5664825439453125, "grad_norm": 232.62823486328125, "learning_rate": 2.1675968170166015e-05, "lookahead_loss": 7.928063584327698, "loss": 5.6578, "step": 297000 }, { "epoch": 0.5674362182617188, "grad_norm": 298.5772399902344, "learning_rate": 2.1628284454345705e-05, "lookahead_loss": 7.942742992401123, "loss": 5.6943, "step": 297500 }, { "epoch": 0.568389892578125, "grad_norm": 214.66708374023438, "learning_rate": 2.1580600738525392e-05, "lookahead_loss": 7.996443361282348, "loss": 5.7065, "step": 298000 }, { "epoch": 0.5693435668945312, "grad_norm": 177.85177612304688, "learning_rate": 2.153291702270508e-05, "lookahead_loss": 7.8757939615249635, "loss": 5.6599, "step": 298500 }, { "epoch": 0.5702972412109375, "grad_norm": 122.9302978515625, "learning_rate": 2.1485233306884766e-05, "lookahead_loss": 7.994811054229737, "loss": 5.6491, "step": 299000 }, { "epoch": 0.5712509155273438, "grad_norm": 387.8847351074219, "learning_rate": 2.1437549591064453e-05, "lookahead_loss": 8.066488211631775, "loss": 5.65, "step": 299500 }, { "epoch": 0.57220458984375, "grad_norm": 187.95985412597656, "learning_rate": 2.1389865875244143e-05, "lookahead_loss": 8.112970567703247, "loss": 5.6506, "step": 300000 }, { "epoch": 0.57220458984375, "eval_accuracy": 0.0401091976516634, "eval_lookahead_loss": 7.9693742902755735, "eval_lookahead_perplexity": 2891.0478414132717, "eval_loss": 5.480536937713623, "eval_perplexity": 239.9755246972774, "eval_runtime": 926.6395, "eval_samples_per_second": 10.792, "eval_steps_per_second": 2.698, "step": 300000 }, { "epoch": 0.5731582641601562, "grad_norm": 236.52964782714844, "learning_rate": 2.134218215942383e-05, "lookahead_loss": 7.984907669067383, "loss": 5.6749, "step": 300500 }, { "epoch": 0.5741119384765625, "grad_norm": 150.34609985351562, "learning_rate": 2.1294498443603516e-05, "lookahead_loss": 7.983117621421814, "loss": 5.6439, "step": 301000 }, { "epoch": 0.5750656127929688, "grad_norm": 270.03192138671875, "learning_rate": 2.1246814727783203e-05, "lookahead_loss": 8.069944981575013, "loss": 5.6535, "step": 301500 }, { "epoch": 0.576019287109375, "grad_norm": 571.3355102539062, "learning_rate": 2.119913101196289e-05, "lookahead_loss": 8.167645770072937, "loss": 5.6276, "step": 302000 }, { "epoch": 0.5769729614257812, "grad_norm": 193.0608367919922, "learning_rate": 2.115144729614258e-05, "lookahead_loss": 8.044493011474609, "loss": 5.6489, "step": 302500 }, { "epoch": 0.5779266357421875, "grad_norm": 148.1036376953125, "learning_rate": 2.1103763580322267e-05, "lookahead_loss": 8.132023513793945, "loss": 5.6815, "step": 303000 }, { "epoch": 0.5788803100585938, "grad_norm": 154.26748657226562, "learning_rate": 2.1056079864501954e-05, "lookahead_loss": 7.9708847284317015, "loss": 5.6481, "step": 303500 }, { "epoch": 0.579833984375, "grad_norm": 311.95806884765625, "learning_rate": 2.100839614868164e-05, "lookahead_loss": 7.945155754089355, "loss": 5.6066, "step": 304000 }, { "epoch": 0.5807876586914062, "grad_norm": 301.0680847167969, "learning_rate": 2.0960712432861328e-05, "lookahead_loss": 7.9810845003128055, "loss": 5.6702, "step": 304500 }, { "epoch": 0.5817413330078125, "grad_norm": 221.9695587158203, "learning_rate": 2.0913028717041018e-05, "lookahead_loss": 7.977873930931091, "loss": 5.6403, "step": 305000 }, { "epoch": 0.5817413330078125, "eval_accuracy": 0.03901859099804305, "eval_lookahead_loss": 7.830066268348694, "eval_lookahead_perplexity": 2515.096039158014, "eval_loss": 5.4738359451293945, "eval_perplexity": 238.3728263180611, "eval_runtime": 600.8823, "eval_samples_per_second": 16.642, "eval_steps_per_second": 4.161, "step": 305000 }, { "epoch": 0.5826950073242188, "grad_norm": 196.30093383789062, "learning_rate": 2.0865345001220705e-05, "lookahead_loss": 7.978430555343628, "loss": 5.6788, "step": 305500 }, { "epoch": 0.583648681640625, "grad_norm": 192.31936645507812, "learning_rate": 2.081766128540039e-05, "lookahead_loss": 7.899202779769897, "loss": 5.5895, "step": 306000 }, { "epoch": 0.5846023559570312, "grad_norm": 186.54086303710938, "learning_rate": 2.0769977569580078e-05, "lookahead_loss": 7.98490788269043, "loss": 5.617, "step": 306500 }, { "epoch": 0.5855560302734375, "grad_norm": 282.47760009765625, "learning_rate": 2.0722293853759765e-05, "lookahead_loss": 8.029359992027283, "loss": 5.6078, "step": 307000 }, { "epoch": 0.5865097045898438, "grad_norm": 217.1461639404297, "learning_rate": 2.0674610137939455e-05, "lookahead_loss": 7.926498282432556, "loss": 5.5949, "step": 307500 }, { "epoch": 0.58746337890625, "grad_norm": 230.530517578125, "learning_rate": 2.0626926422119142e-05, "lookahead_loss": 7.863164049148559, "loss": 5.721, "step": 308000 }, { "epoch": 0.5884170532226562, "grad_norm": 161.09767150878906, "learning_rate": 2.057924270629883e-05, "lookahead_loss": 7.878196062088013, "loss": 5.71, "step": 308500 }, { "epoch": 0.5893707275390625, "grad_norm": 731.7041625976562, "learning_rate": 2.0531558990478516e-05, "lookahead_loss": 7.8225637693405154, "loss": 5.6723, "step": 309000 }, { "epoch": 0.5903244018554688, "grad_norm": 288.5928649902344, "learning_rate": 2.0483875274658203e-05, "lookahead_loss": 7.989353318214416, "loss": 5.6535, "step": 309500 }, { "epoch": 0.591278076171875, "grad_norm": 149.40711975097656, "learning_rate": 2.0436191558837893e-05, "lookahead_loss": 8.027669719696044, "loss": 5.6538, "step": 310000 }, { "epoch": 0.591278076171875, "eval_accuracy": 0.03978082191780822, "eval_lookahead_loss": 7.893353377151489, "eval_lookahead_perplexity": 2679.413957857828, "eval_loss": 5.481051445007324, "eval_perplexity": 240.09902562338314, "eval_runtime": 542.2032, "eval_samples_per_second": 18.443, "eval_steps_per_second": 4.611, "step": 310000 }, { "epoch": 0.5922317504882812, "grad_norm": 184.80284118652344, "learning_rate": 2.038850784301758e-05, "lookahead_loss": 7.8615158100128175, "loss": 5.6122, "step": 310500 }, { "epoch": 0.5931854248046875, "grad_norm": 124.25953674316406, "learning_rate": 2.0340824127197266e-05, "lookahead_loss": 7.862430813789367, "loss": 5.6309, "step": 311000 }, { "epoch": 0.5941390991210938, "grad_norm": 187.29763793945312, "learning_rate": 2.0293140411376953e-05, "lookahead_loss": 7.85451729297638, "loss": 5.6151, "step": 311500 }, { "epoch": 0.5950927734375, "grad_norm": 195.3189239501953, "learning_rate": 2.024545669555664e-05, "lookahead_loss": 8.075851541519166, "loss": 5.6941, "step": 312000 }, { "epoch": 0.5960464477539062, "grad_norm": 88.47908782958984, "learning_rate": 2.019777297973633e-05, "lookahead_loss": 7.9950023403167725, "loss": 5.6824, "step": 312500 }, { "epoch": 0.5970001220703125, "grad_norm": 218.31484985351562, "learning_rate": 2.0150089263916017e-05, "lookahead_loss": 7.959700776100159, "loss": 5.6741, "step": 313000 }, { "epoch": 0.5979537963867188, "grad_norm": 380.0316467285156, "learning_rate": 2.0102405548095704e-05, "lookahead_loss": 7.884088822364808, "loss": 5.5878, "step": 313500 }, { "epoch": 0.598907470703125, "grad_norm": 282.35888671875, "learning_rate": 2.005472183227539e-05, "lookahead_loss": 7.971494802474975, "loss": 5.6198, "step": 314000 }, { "epoch": 0.5998611450195312, "grad_norm": 256.425537109375, "learning_rate": 2.0007038116455078e-05, "lookahead_loss": 7.9931043472290035, "loss": 5.6688, "step": 314500 }, { "epoch": 0.6008148193359375, "grad_norm": 64.56837463378906, "learning_rate": 1.9959354400634768e-05, "lookahead_loss": 7.982756353378296, "loss": 5.6665, "step": 315000 }, { "epoch": 0.6008148193359375, "eval_accuracy": 0.039886301369863014, "eval_lookahead_loss": 7.84068729801178, "eval_lookahead_perplexity": 2541.951311665254, "eval_loss": 5.456647872924805, "eval_perplexity": 234.3106673152359, "eval_runtime": 729.6414, "eval_samples_per_second": 13.705, "eval_steps_per_second": 3.426, "step": 315000 }, { "epoch": 0.6017684936523438, "grad_norm": 144.10299682617188, "learning_rate": 1.9911670684814455e-05, "lookahead_loss": 7.832165965080261, "loss": 5.633, "step": 315500 }, { "epoch": 0.60272216796875, "grad_norm": 283.2576599121094, "learning_rate": 1.986398696899414e-05, "lookahead_loss": 7.868695465087891, "loss": 5.6626, "step": 316000 }, { "epoch": 0.6036758422851562, "grad_norm": 179.86874389648438, "learning_rate": 1.9816303253173828e-05, "lookahead_loss": 7.958296907424927, "loss": 5.6194, "step": 316500 }, { "epoch": 0.6046295166015625, "grad_norm": 515.33154296875, "learning_rate": 1.9768619537353515e-05, "lookahead_loss": 7.954486019134522, "loss": 5.6664, "step": 317000 }, { "epoch": 0.6055831909179688, "grad_norm": 154.55613708496094, "learning_rate": 1.9720935821533205e-05, "lookahead_loss": 7.896434834480286, "loss": 5.6389, "step": 317500 }, { "epoch": 0.606536865234375, "grad_norm": 569.8792114257812, "learning_rate": 1.9673252105712892e-05, "lookahead_loss": 7.952841144561767, "loss": 5.6327, "step": 318000 }, { "epoch": 0.6074905395507812, "grad_norm": 1072.5340576171875, "learning_rate": 1.962556838989258e-05, "lookahead_loss": 7.955419460296631, "loss": 5.6424, "step": 318500 }, { "epoch": 0.6084442138671875, "grad_norm": 1139.8504638671875, "learning_rate": 1.9577884674072266e-05, "lookahead_loss": 7.85601903629303, "loss": 5.6284, "step": 319000 }, { "epoch": 0.6093978881835938, "grad_norm": 134.07656860351562, "learning_rate": 1.9530200958251953e-05, "lookahead_loss": 7.908764311790466, "loss": 5.6456, "step": 319500 }, { "epoch": 0.6103515625, "grad_norm": 398.4620666503906, "learning_rate": 1.9482517242431643e-05, "lookahead_loss": 7.849265632629394, "loss": 5.5755, "step": 320000 }, { "epoch": 0.6103515625, "eval_accuracy": 0.03949530332681018, "eval_lookahead_loss": 7.860798316192627, "eval_lookahead_perplexity": 2593.5900541108776, "eval_loss": 5.44772481918335, "eval_perplexity": 232.22920096729266, "eval_runtime": 530.154, "eval_samples_per_second": 18.862, "eval_steps_per_second": 4.716, "step": 320000 }, { "epoch": 0.6113052368164062, "grad_norm": 90.57913208007812, "learning_rate": 1.943483352661133e-05, "lookahead_loss": 7.902767556190491, "loss": 5.5908, "step": 320500 }, { "epoch": 0.6122589111328125, "grad_norm": 215.03770446777344, "learning_rate": 1.9387149810791016e-05, "lookahead_loss": 7.873751315116882, "loss": 5.6027, "step": 321000 }, { "epoch": 0.6132125854492188, "grad_norm": 222.5513916015625, "learning_rate": 1.9339466094970703e-05, "lookahead_loss": 7.94311203289032, "loss": 5.7104, "step": 321500 }, { "epoch": 0.614166259765625, "grad_norm": 155.6073455810547, "learning_rate": 1.929178237915039e-05, "lookahead_loss": 7.916344712257385, "loss": 5.7343, "step": 322000 }, { "epoch": 0.6151199340820312, "grad_norm": 190.56959533691406, "learning_rate": 1.924409866333008e-05, "lookahead_loss": 7.898597079277039, "loss": 5.6952, "step": 322500 }, { "epoch": 0.6160736083984375, "grad_norm": 234.36346435546875, "learning_rate": 1.9196414947509767e-05, "lookahead_loss": 7.913662529945373, "loss": 5.6912, "step": 323000 }, { "epoch": 0.6170272827148438, "grad_norm": 155.151611328125, "learning_rate": 1.9148731231689454e-05, "lookahead_loss": 7.872819535255432, "loss": 5.6262, "step": 323500 }, { "epoch": 0.61798095703125, "grad_norm": 300.3796691894531, "learning_rate": 1.910104751586914e-05, "lookahead_loss": 7.960519321441651, "loss": 5.6584, "step": 324000 }, { "epoch": 0.6189346313476562, "grad_norm": 441.7313537597656, "learning_rate": 1.9053363800048828e-05, "lookahead_loss": 7.89206614112854, "loss": 5.6605, "step": 324500 }, { "epoch": 0.6198883056640625, "grad_norm": 116.43922424316406, "learning_rate": 1.9005680084228518e-05, "lookahead_loss": 8.026430695533753, "loss": 5.641, "step": 325000 }, { "epoch": 0.6198883056640625, "eval_accuracy": 0.039441095890410956, "eval_lookahead_loss": 7.981214308929443, "eval_lookahead_perplexity": 2925.481346197193, "eval_loss": 5.443291187286377, "eval_perplexity": 231.20186128035343, "eval_runtime": 1030.579, "eval_samples_per_second": 9.703, "eval_steps_per_second": 2.426, "step": 325000 }, { "epoch": 0.6208419799804688, "grad_norm": 137.5011749267578, "learning_rate": 1.8957996368408205e-05, "lookahead_loss": 8.058280223846436, "loss": 5.6382, "step": 325500 }, { "epoch": 0.621795654296875, "grad_norm": 205.3738250732422, "learning_rate": 1.891031265258789e-05, "lookahead_loss": 7.941509570598602, "loss": 5.6168, "step": 326000 }, { "epoch": 0.6227493286132812, "grad_norm": 312.34893798828125, "learning_rate": 1.8862628936767578e-05, "lookahead_loss": 7.914756356239319, "loss": 5.666, "step": 326500 }, { "epoch": 0.6237030029296875, "grad_norm": 181.58055114746094, "learning_rate": 1.8814945220947265e-05, "lookahead_loss": 7.834921408653259, "loss": 5.5851, "step": 327000 }, { "epoch": 0.6246566772460938, "grad_norm": 125.09971618652344, "learning_rate": 1.8767261505126955e-05, "lookahead_loss": 7.937186376571655, "loss": 5.6383, "step": 327500 }, { "epoch": 0.6256103515625, "grad_norm": 181.6016387939453, "learning_rate": 1.8719577789306642e-05, "lookahead_loss": 7.865681925773621, "loss": 5.6133, "step": 328000 }, { "epoch": 0.6265640258789062, "grad_norm": 223.63528442382812, "learning_rate": 1.867189407348633e-05, "lookahead_loss": 7.899964000701904, "loss": 5.668, "step": 328500 }, { "epoch": 0.6275177001953125, "grad_norm": 219.20458984375, "learning_rate": 1.8624210357666016e-05, "lookahead_loss": 7.785865209579468, "loss": 5.5995, "step": 329000 }, { "epoch": 0.6284713745117188, "grad_norm": 344.0431823730469, "learning_rate": 1.8576526641845703e-05, "lookahead_loss": 7.85551165676117, "loss": 5.6527, "step": 329500 }, { "epoch": 0.629425048828125, "grad_norm": 281.7197570800781, "learning_rate": 1.8528842926025393e-05, "lookahead_loss": 7.9053622255325315, "loss": 5.6113, "step": 330000 }, { "epoch": 0.629425048828125, "eval_accuracy": 0.039111350293542076, "eval_lookahead_loss": 7.812703450012207, "eval_lookahead_perplexity": 2471.803808692209, "eval_loss": 5.4369001388549805, "eval_perplexity": 229.72895072231472, "eval_runtime": 556.6264, "eval_samples_per_second": 17.965, "eval_steps_per_second": 4.491, "step": 330000 }, { "epoch": 0.6303787231445312, "grad_norm": 400.74285888671875, "learning_rate": 1.848115921020508e-05, "lookahead_loss": 7.9515310754776, "loss": 5.6869, "step": 330500 }, { "epoch": 0.6313323974609375, "grad_norm": 250.89913940429688, "learning_rate": 1.8433475494384766e-05, "lookahead_loss": 7.804065631866455, "loss": 5.6526, "step": 331000 }, { "epoch": 0.6322860717773438, "grad_norm": 219.79942321777344, "learning_rate": 1.8385791778564453e-05, "lookahead_loss": 7.878339442253113, "loss": 5.6144, "step": 331500 }, { "epoch": 0.63323974609375, "grad_norm": 325.0855712890625, "learning_rate": 1.833810806274414e-05, "lookahead_loss": 7.79832727432251, "loss": 5.6799, "step": 332000 }, { "epoch": 0.6341934204101562, "grad_norm": 241.0140380859375, "learning_rate": 1.829042434692383e-05, "lookahead_loss": 7.815989983558655, "loss": 5.6148, "step": 332500 }, { "epoch": 0.6351470947265625, "grad_norm": 218.62649536132812, "learning_rate": 1.8242740631103517e-05, "lookahead_loss": 7.813162120819092, "loss": 5.5924, "step": 333000 }, { "epoch": 0.6361007690429688, "grad_norm": 449.4620056152344, "learning_rate": 1.8195056915283204e-05, "lookahead_loss": 7.8195184822082515, "loss": 5.5818, "step": 333500 }, { "epoch": 0.637054443359375, "grad_norm": 375.36602783203125, "learning_rate": 1.814737319946289e-05, "lookahead_loss": 7.8206783285140995, "loss": 5.5986, "step": 334000 }, { "epoch": 0.6380081176757812, "grad_norm": 489.7010192871094, "learning_rate": 1.8099689483642578e-05, "lookahead_loss": 7.880506252288819, "loss": 5.6189, "step": 334500 }, { "epoch": 0.6389617919921875, "grad_norm": 264.9696044921875, "learning_rate": 1.8052005767822268e-05, "lookahead_loss": 7.984205496788025, "loss": 5.6697, "step": 335000 }, { "epoch": 0.6389617919921875, "eval_accuracy": 0.039367710371819964, "eval_lookahead_loss": 7.904350224494934, "eval_lookahead_perplexity": 2709.0416712687543, "eval_loss": 5.4349799156188965, "eval_perplexity": 229.28824311704537, "eval_runtime": 515.5836, "eval_samples_per_second": 19.395, "eval_steps_per_second": 4.849, "step": 335000 }, { "epoch": 0.6399154663085938, "grad_norm": 670.4136352539062, "learning_rate": 1.8004322052001955e-05, "lookahead_loss": 7.864183287620545, "loss": 5.7087, "step": 335500 }, { "epoch": 0.640869140625, "grad_norm": 225.13453674316406, "learning_rate": 1.795663833618164e-05, "lookahead_loss": 8.017053544998168, "loss": 5.6704, "step": 336000 }, { "epoch": 0.6418228149414062, "grad_norm": 156.30531311035156, "learning_rate": 1.7908954620361328e-05, "lookahead_loss": 7.901902256965637, "loss": 5.6483, "step": 336500 }, { "epoch": 0.6427764892578125, "grad_norm": 362.4761047363281, "learning_rate": 1.7861270904541015e-05, "lookahead_loss": 7.863450795173645, "loss": 5.5805, "step": 337000 }, { "epoch": 0.6437301635742188, "grad_norm": 232.38467407226562, "learning_rate": 1.7813587188720705e-05, "lookahead_loss": 7.829627919197082, "loss": 5.6296, "step": 337500 }, { "epoch": 0.644683837890625, "grad_norm": 147.89044189453125, "learning_rate": 1.7765903472900392e-05, "lookahead_loss": 7.869099666595459, "loss": 5.6294, "step": 338000 }, { "epoch": 0.6456375122070312, "grad_norm": 246.5752716064453, "learning_rate": 1.771821975708008e-05, "lookahead_loss": 7.995955725669861, "loss": 5.6313, "step": 338500 }, { "epoch": 0.6465911865234375, "grad_norm": 109.2996597290039, "learning_rate": 1.7670536041259766e-05, "lookahead_loss": 7.967386183738708, "loss": 5.6858, "step": 339000 }, { "epoch": 0.6475448608398438, "grad_norm": 327.04766845703125, "learning_rate": 1.7622852325439453e-05, "lookahead_loss": 7.91061208820343, "loss": 5.6292, "step": 339500 }, { "epoch": 0.64849853515625, "grad_norm": 639.6124267578125, "learning_rate": 1.7575168609619143e-05, "lookahead_loss": 7.922152138710022, "loss": 5.6425, "step": 340000 }, { "epoch": 0.64849853515625, "eval_accuracy": 0.03969354207436399, "eval_lookahead_loss": 7.844210968589783, "eval_lookahead_perplexity": 2550.9241100216273, "eval_loss": 5.430090427398682, "eval_perplexity": 228.16987729960823, "eval_runtime": 581.1783, "eval_samples_per_second": 17.206, "eval_steps_per_second": 4.302, "step": 340000 }, { "epoch": 0.6494522094726562, "grad_norm": 153.1531982421875, "learning_rate": 1.752748489379883e-05, "lookahead_loss": 7.981652014732361, "loss": 5.6107, "step": 340500 }, { "epoch": 0.6504058837890625, "grad_norm": 157.4033203125, "learning_rate": 1.7479801177978516e-05, "lookahead_loss": 8.100610873222351, "loss": 5.587, "step": 341000 }, { "epoch": 0.6513595581054688, "grad_norm": 235.71585083007812, "learning_rate": 1.7432117462158203e-05, "lookahead_loss": 7.88981394481659, "loss": 5.6366, "step": 341500 }, { "epoch": 0.652313232421875, "grad_norm": 346.0996398925781, "learning_rate": 1.738443374633789e-05, "lookahead_loss": 8.017710515975953, "loss": 5.6243, "step": 342000 }, { "epoch": 0.6532669067382812, "grad_norm": 141.08938598632812, "learning_rate": 1.733675003051758e-05, "lookahead_loss": 7.923397108078003, "loss": 5.6437, "step": 342500 }, { "epoch": 0.6542205810546875, "grad_norm": 178.15817260742188, "learning_rate": 1.7289066314697267e-05, "lookahead_loss": 7.819316389083863, "loss": 5.6364, "step": 343000 }, { "epoch": 0.6551742553710938, "grad_norm": 206.52708435058594, "learning_rate": 1.7241382598876954e-05, "lookahead_loss": 7.931987959861756, "loss": 5.5813, "step": 343500 }, { "epoch": 0.6561279296875, "grad_norm": 314.149169921875, "learning_rate": 1.719369888305664e-05, "lookahead_loss": 7.961332881927491, "loss": 5.6253, "step": 344000 }, { "epoch": 0.6570816040039062, "grad_norm": 323.419921875, "learning_rate": 1.7146015167236328e-05, "lookahead_loss": 7.946225228309632, "loss": 5.574, "step": 344500 }, { "epoch": 0.6580352783203125, "grad_norm": 977.4662475585938, "learning_rate": 1.7098331451416018e-05, "lookahead_loss": 7.979162298202515, "loss": 5.626, "step": 345000 }, { "epoch": 0.6580352783203125, "eval_accuracy": 0.0391091976516634, "eval_lookahead_loss": 7.863940311431885, "eval_lookahead_perplexity": 2601.7519172664884, "eval_loss": 5.422463893890381, "eval_perplexity": 226.43635088583528, "eval_runtime": 664.6652, "eval_samples_per_second": 15.045, "eval_steps_per_second": 3.761, "step": 345000 }, { "epoch": 0.6589889526367188, "grad_norm": 320.04437255859375, "learning_rate": 1.7050647735595705e-05, "lookahead_loss": 7.973353558540344, "loss": 5.6149, "step": 345500 }, { "epoch": 0.659942626953125, "grad_norm": 992.9889526367188, "learning_rate": 1.700296401977539e-05, "lookahead_loss": 8.032712125778199, "loss": 5.6159, "step": 346000 }, { "epoch": 0.6608963012695312, "grad_norm": 343.72271728515625, "learning_rate": 1.6955280303955078e-05, "lookahead_loss": 8.019125736236573, "loss": 5.5928, "step": 346500 }, { "epoch": 0.6618499755859375, "grad_norm": 99.04891967773438, "learning_rate": 1.6907596588134765e-05, "lookahead_loss": 8.086309061050414, "loss": 5.5593, "step": 347000 }, { "epoch": 0.6628036499023438, "grad_norm": 260.2797546386719, "learning_rate": 1.6859912872314455e-05, "lookahead_loss": 8.191697989463806, "loss": 5.5777, "step": 347500 }, { "epoch": 0.66375732421875, "grad_norm": 208.03736877441406, "learning_rate": 1.6812229156494142e-05, "lookahead_loss": 8.10074711227417, "loss": 5.5829, "step": 348000 }, { "epoch": 0.6647109985351562, "grad_norm": 135.5190887451172, "learning_rate": 1.676454544067383e-05, "lookahead_loss": 8.40675752735138, "loss": 5.7307, "step": 348500 }, { "epoch": 0.6656646728515625, "grad_norm": 152.09461975097656, "learning_rate": 1.6716861724853516e-05, "lookahead_loss": 8.380592555046082, "loss": 5.6533, "step": 349000 }, { "epoch": 0.6666183471679688, "grad_norm": 225.4604949951172, "learning_rate": 1.6669178009033203e-05, "lookahead_loss": 8.263343521118165, "loss": 5.6449, "step": 349500 }, { "epoch": 0.667572021484375, "grad_norm": 208.3924560546875, "learning_rate": 1.6621494293212893e-05, "lookahead_loss": 8.073844646453857, "loss": 5.5888, "step": 350000 }, { "epoch": 0.667572021484375, "eval_accuracy": 0.03943463796477495, "eval_lookahead_loss": 7.982448950004578, "eval_lookahead_perplexity": 2929.0954962617097, "eval_loss": 5.419064998626709, "eval_perplexity": 225.66802391640886, "eval_runtime": 1185.2674, "eval_samples_per_second": 8.437, "eval_steps_per_second": 2.109, "step": 350000 }, { "epoch": 0.6685256958007812, "grad_norm": 202.98760986328125, "learning_rate": 1.657381057739258e-05, "lookahead_loss": 8.11694134426117, "loss": 5.661, "step": 350500 }, { "epoch": 0.6694793701171875, "grad_norm": 554.2233276367188, "learning_rate": 1.6526126861572266e-05, "lookahead_loss": 8.236780458450317, "loss": 5.5999, "step": 351000 }, { "epoch": 0.6704330444335938, "grad_norm": 243.07240295410156, "learning_rate": 1.6478443145751953e-05, "lookahead_loss": 8.176079161643981, "loss": 5.6346, "step": 351500 }, { "epoch": 0.67138671875, "grad_norm": 281.7927551269531, "learning_rate": 1.643075942993164e-05, "lookahead_loss": 8.195244685173035, "loss": 5.6561, "step": 352000 }, { "epoch": 0.6723403930664062, "grad_norm": 219.89898681640625, "learning_rate": 1.638307571411133e-05, "lookahead_loss": 8.12547733783722, "loss": 5.6312, "step": 352500 }, { "epoch": 0.6732940673828125, "grad_norm": 99.43099212646484, "learning_rate": 1.6335391998291017e-05, "lookahead_loss": 8.119747738838196, "loss": 5.6053, "step": 353000 }, { "epoch": 0.6742477416992188, "grad_norm": 122.55955505371094, "learning_rate": 1.6287708282470704e-05, "lookahead_loss": 8.09814368915558, "loss": 5.5748, "step": 353500 }, { "epoch": 0.675201416015625, "grad_norm": 174.5946502685547, "learning_rate": 1.624002456665039e-05, "lookahead_loss": 8.124195915222169, "loss": 5.5361, "step": 354000 }, { "epoch": 0.6761550903320312, "grad_norm": 193.15853881835938, "learning_rate": 1.6192340850830078e-05, "lookahead_loss": 8.189052962303162, "loss": 5.621, "step": 354500 }, { "epoch": 0.6771087646484375, "grad_norm": 205.3660888671875, "learning_rate": 1.6144657135009768e-05, "lookahead_loss": 8.16627775812149, "loss": 5.5793, "step": 355000 }, { "epoch": 0.6771087646484375, "eval_accuracy": 0.038860665362035224, "eval_lookahead_loss": 8.04288404827118, "eval_lookahead_perplexity": 3111.5741923054516, "eval_loss": 5.414566993713379, "eval_perplexity": 224.6552474802888, "eval_runtime": 1016.5992, "eval_samples_per_second": 9.837, "eval_steps_per_second": 2.459, "step": 355000 }, { "epoch": 0.6780624389648438, "grad_norm": 252.7603759765625, "learning_rate": 1.6096973419189455e-05, "lookahead_loss": 8.196721817016602, "loss": 5.5854, "step": 355500 }, { "epoch": 0.67901611328125, "grad_norm": 237.2992401123047, "learning_rate": 1.604928970336914e-05, "lookahead_loss": 8.203863400459289, "loss": 5.5878, "step": 356000 }, { "epoch": 0.6799697875976562, "grad_norm": 80.4056625366211, "learning_rate": 1.6001605987548828e-05, "lookahead_loss": 8.260224917411804, "loss": 5.6015, "step": 356500 }, { "epoch": 0.6809234619140625, "grad_norm": 267.7515563964844, "learning_rate": 1.5953922271728515e-05, "lookahead_loss": 8.126566572189331, "loss": 5.5966, "step": 357000 }, { "epoch": 0.6818771362304688, "grad_norm": 371.93121337890625, "learning_rate": 1.5906238555908205e-05, "lookahead_loss": 8.130836866378784, "loss": 5.5919, "step": 357500 }, { "epoch": 0.682830810546875, "grad_norm": 208.82469177246094, "learning_rate": 1.5858554840087892e-05, "lookahead_loss": 7.941144113540649, "loss": 5.5752, "step": 358000 }, { "epoch": 0.6837844848632812, "grad_norm": 102.74617767333984, "learning_rate": 1.581087112426758e-05, "lookahead_loss": 7.996092744350434, "loss": 5.5871, "step": 358500 }, { "epoch": 0.6847381591796875, "grad_norm": 571.7108154296875, "learning_rate": 1.5763187408447266e-05, "lookahead_loss": 7.83837704372406, "loss": 5.6024, "step": 359000 }, { "epoch": 0.6856918334960938, "grad_norm": 261.8134460449219, "learning_rate": 1.5715503692626953e-05, "lookahead_loss": 7.859425446510315, "loss": 5.5764, "step": 359500 }, { "epoch": 0.6866455078125, "grad_norm": 379.63299560546875, "learning_rate": 1.5667819976806643e-05, "lookahead_loss": 7.849499279975891, "loss": 5.5751, "step": 360000 }, { "epoch": 0.6866455078125, "eval_accuracy": 0.0384972602739726, "eval_lookahead_loss": 7.826880103874206, "eval_lookahead_perplexity": 2507.095282140415, "eval_loss": 5.403864860534668, "eval_perplexity": 222.26377684674273, "eval_runtime": 609.9636, "eval_samples_per_second": 16.394, "eval_steps_per_second": 4.099, "step": 360000 }, { "epoch": 0.6875991821289062, "grad_norm": 128.7223663330078, "learning_rate": 1.562013626098633e-05, "lookahead_loss": 7.92839573764801, "loss": 5.5467, "step": 360500 }, { "epoch": 0.6885528564453125, "grad_norm": 182.72486877441406, "learning_rate": 1.5572452545166016e-05, "lookahead_loss": 7.849254702568055, "loss": 5.5745, "step": 361000 }, { "epoch": 0.6895065307617188, "grad_norm": 208.63546752929688, "learning_rate": 1.5524768829345703e-05, "lookahead_loss": 7.895947131156921, "loss": 5.5806, "step": 361500 }, { "epoch": 0.690460205078125, "grad_norm": 179.85487365722656, "learning_rate": 1.547708511352539e-05, "lookahead_loss": 7.9130114021301265, "loss": 5.5759, "step": 362000 }, { "epoch": 0.6914138793945312, "grad_norm": 143.0045928955078, "learning_rate": 1.542940139770508e-05, "lookahead_loss": 7.876603882789611, "loss": 5.5566, "step": 362500 }, { "epoch": 0.6923675537109375, "grad_norm": 528.8530883789062, "learning_rate": 1.5381717681884767e-05, "lookahead_loss": 7.830058940887451, "loss": 5.5281, "step": 363000 }, { "epoch": 0.6933212280273438, "grad_norm": 228.25576782226562, "learning_rate": 1.5334033966064454e-05, "lookahead_loss": 7.887627326011658, "loss": 5.4944, "step": 363500 }, { "epoch": 0.69427490234375, "grad_norm": 209.771728515625, "learning_rate": 1.528635025024414e-05, "lookahead_loss": 7.883073943138123, "loss": 5.5174, "step": 364000 }, { "epoch": 0.6952285766601562, "grad_norm": 338.88037109375, "learning_rate": 1.523866653442383e-05, "lookahead_loss": 7.746066631317139, "loss": 5.495, "step": 364500 }, { "epoch": 0.6961822509765625, "grad_norm": 123.92079162597656, "learning_rate": 1.5190982818603516e-05, "lookahead_loss": 7.839604325294495, "loss": 5.5659, "step": 365000 }, { "epoch": 0.6961822509765625, "eval_accuracy": 0.038788062622309195, "eval_lookahead_loss": 7.800725620651245, "eval_lookahead_perplexity": 2442.3735714957497, "eval_loss": 5.392970085144043, "eval_perplexity": 219.85540607182773, "eval_runtime": 525.6924, "eval_samples_per_second": 19.023, "eval_steps_per_second": 4.756, "step": 365000 }, { "epoch": 0.6971359252929688, "grad_norm": 4183.66455078125, "learning_rate": 1.5143299102783205e-05, "lookahead_loss": 7.841345190048218, "loss": 5.6654, "step": 365500 }, { "epoch": 0.698089599609375, "grad_norm": 268.1899108886719, "learning_rate": 1.5095615386962891e-05, "lookahead_loss": 7.753796419143677, "loss": 5.633, "step": 366000 }, { "epoch": 0.6990432739257812, "grad_norm": 153.07745361328125, "learning_rate": 1.5047931671142578e-05, "lookahead_loss": 7.957260661125183, "loss": 5.6733, "step": 366500 }, { "epoch": 0.6999969482421875, "grad_norm": 659.2149658203125, "learning_rate": 1.5000247955322267e-05, "lookahead_loss": 8.033715418815612, "loss": 5.6649, "step": 367000 }, { "epoch": 0.7009506225585938, "grad_norm": 154.3483428955078, "learning_rate": 1.4952564239501954e-05, "lookahead_loss": 7.81259859752655, "loss": 5.5637, "step": 367500 }, { "epoch": 0.701904296875, "grad_norm": 120.33118438720703, "learning_rate": 1.4904880523681642e-05, "lookahead_loss": 7.939181873321533, "loss": 5.5693, "step": 368000 }, { "epoch": 0.7028579711914062, "grad_norm": 398.9562683105469, "learning_rate": 1.4857196807861329e-05, "lookahead_loss": 7.739817490577698, "loss": 5.5509, "step": 368500 }, { "epoch": 0.7038116455078125, "grad_norm": 563.5996704101562, "learning_rate": 1.4809513092041016e-05, "lookahead_loss": 7.826303864479065, "loss": 5.5756, "step": 369000 }, { "epoch": 0.7047653198242188, "grad_norm": 225.58432006835938, "learning_rate": 1.4761829376220704e-05, "lookahead_loss": 7.84067007446289, "loss": 5.5359, "step": 369500 }, { "epoch": 0.705718994140625, "grad_norm": 99.23751831054688, "learning_rate": 1.4714145660400391e-05, "lookahead_loss": 7.910350508689881, "loss": 5.6128, "step": 370000 }, { "epoch": 0.705718994140625, "eval_accuracy": 0.038456360078277886, "eval_lookahead_loss": 7.768917008590698, "eval_lookahead_perplexity": 2365.907642750536, "eval_loss": 5.383975982666016, "eval_perplexity": 217.88686989750394, "eval_runtime": 551.2072, "eval_samples_per_second": 18.142, "eval_steps_per_second": 4.535, "step": 370000 }, { "epoch": 0.7066726684570312, "grad_norm": 504.18603515625, "learning_rate": 1.466646194458008e-05, "lookahead_loss": 7.853506729125977, "loss": 5.5237, "step": 370500 }, { "epoch": 0.7076263427734375, "grad_norm": 189.79991149902344, "learning_rate": 1.4618778228759766e-05, "lookahead_loss": 7.924680317878723, "loss": 5.6273, "step": 371000 }, { "epoch": 0.7085800170898438, "grad_norm": 210.91360473632812, "learning_rate": 1.4571094512939453e-05, "lookahead_loss": 7.814344255447388, "loss": 5.5629, "step": 371500 }, { "epoch": 0.70953369140625, "grad_norm": 111.18330383300781, "learning_rate": 1.4523410797119142e-05, "lookahead_loss": 7.855846696853638, "loss": 5.5888, "step": 372000 }, { "epoch": 0.7104873657226562, "grad_norm": 220.45945739746094, "learning_rate": 1.4475727081298829e-05, "lookahead_loss": 7.903277499198913, "loss": 5.5781, "step": 372500 }, { "epoch": 0.7114410400390625, "grad_norm": 99.71575164794922, "learning_rate": 1.4428043365478517e-05, "lookahead_loss": 7.831927432060242, "loss": 5.5571, "step": 373000 }, { "epoch": 0.7123947143554688, "grad_norm": 92.58635711669922, "learning_rate": 1.4380359649658204e-05, "lookahead_loss": 7.796199646949768, "loss": 5.598, "step": 373500 }, { "epoch": 0.713348388671875, "grad_norm": 95.23898315429688, "learning_rate": 1.433267593383789e-05, "lookahead_loss": 7.830960173606872, "loss": 5.5508, "step": 374000 }, { "epoch": 0.7143020629882812, "grad_norm": 79.95771026611328, "learning_rate": 1.428499221801758e-05, "lookahead_loss": 7.855954801559448, "loss": 5.5932, "step": 374500 }, { "epoch": 0.7152557373046875, "grad_norm": 210.9166717529297, "learning_rate": 1.4237308502197266e-05, "lookahead_loss": 7.731211150169373, "loss": 5.5471, "step": 375000 }, { "epoch": 0.7152557373046875, "eval_accuracy": 0.03802994129158513, "eval_lookahead_loss": 7.734612303161621, "eval_lookahead_perplexity": 2286.1222100990685, "eval_loss": 5.376159191131592, "eval_perplexity": 216.19033303343215, "eval_runtime": 569.0766, "eval_samples_per_second": 17.572, "eval_steps_per_second": 4.393, "step": 375000 }, { "epoch": 0.7162094116210938, "grad_norm": 247.77749633789062, "learning_rate": 1.4189624786376955e-05, "lookahead_loss": 7.71632539844513, "loss": 5.5085, "step": 375500 }, { "epoch": 0.7171630859375, "grad_norm": 234.3800506591797, "learning_rate": 1.4141941070556641e-05, "lookahead_loss": 7.801510418891906, "loss": 5.5557, "step": 376000 }, { "epoch": 0.7181167602539062, "grad_norm": 113.59115600585938, "learning_rate": 1.4094257354736328e-05, "lookahead_loss": 7.861888924598694, "loss": 5.5331, "step": 376500 }, { "epoch": 0.7190704345703125, "grad_norm": 126.256103515625, "learning_rate": 1.4046573638916017e-05, "lookahead_loss": 7.729419247627258, "loss": 5.5549, "step": 377000 }, { "epoch": 0.7200241088867188, "grad_norm": 168.9649658203125, "learning_rate": 1.3998889923095704e-05, "lookahead_loss": 7.8099126653671265, "loss": 5.5618, "step": 377500 }, { "epoch": 0.720977783203125, "grad_norm": 175.36114501953125, "learning_rate": 1.3951206207275392e-05, "lookahead_loss": 7.776826338768005, "loss": 5.5643, "step": 378000 }, { "epoch": 0.7219314575195312, "grad_norm": 115.26795959472656, "learning_rate": 1.3903522491455079e-05, "lookahead_loss": 7.75958394241333, "loss": 5.5525, "step": 378500 }, { "epoch": 0.7228851318359375, "grad_norm": 71.99981689453125, "learning_rate": 1.3855838775634766e-05, "lookahead_loss": 7.883628920555115, "loss": 5.5148, "step": 379000 }, { "epoch": 0.7238388061523438, "grad_norm": 100.01074981689453, "learning_rate": 1.3808155059814454e-05, "lookahead_loss": 7.76290507030487, "loss": 5.5256, "step": 379500 }, { "epoch": 0.72479248046875, "grad_norm": 448.4578857421875, "learning_rate": 1.3760471343994141e-05, "lookahead_loss": 7.777594248771668, "loss": 5.5468, "step": 380000 }, { "epoch": 0.72479248046875, "eval_accuracy": 0.0386880626223092, "eval_lookahead_loss": 7.737389213562012, "eval_lookahead_perplexity": 2292.479389214118, "eval_loss": 5.367162227630615, "eval_perplexity": 214.25400011756284, "eval_runtime": 538.088, "eval_samples_per_second": 18.584, "eval_steps_per_second": 4.646, "step": 380000 }, { "epoch": 0.7257461547851562, "grad_norm": 88.47937774658203, "learning_rate": 1.371278762817383e-05, "lookahead_loss": 7.828882398605347, "loss": 5.5884, "step": 380500 }, { "epoch": 0.7266998291015625, "grad_norm": 163.5406951904297, "learning_rate": 1.3665103912353516e-05, "lookahead_loss": 7.844945970535278, "loss": 5.5616, "step": 381000 }, { "epoch": 0.7276535034179688, "grad_norm": 128.4533233642578, "learning_rate": 1.3617420196533203e-05, "lookahead_loss": 7.834921336174011, "loss": 5.5581, "step": 381500 }, { "epoch": 0.728607177734375, "grad_norm": 168.65591430664062, "learning_rate": 1.3569736480712892e-05, "lookahead_loss": 7.810140036582947, "loss": 5.5178, "step": 382000 }, { "epoch": 0.7295608520507812, "grad_norm": 192.74423217773438, "learning_rate": 1.3522052764892579e-05, "lookahead_loss": 7.768440069198609, "loss": 5.4689, "step": 382500 }, { "epoch": 0.7305145263671875, "grad_norm": 328.0540771484375, "learning_rate": 1.3474369049072265e-05, "lookahead_loss": 7.702497232437134, "loss": 5.5079, "step": 383000 }, { "epoch": 0.7314682006835938, "grad_norm": 237.95433044433594, "learning_rate": 1.3426685333251954e-05, "lookahead_loss": 7.715453090667725, "loss": 5.5052, "step": 383500 }, { "epoch": 0.732421875, "grad_norm": 110.07740020751953, "learning_rate": 1.337900161743164e-05, "lookahead_loss": 7.7685176267623905, "loss": 5.479, "step": 384000 }, { "epoch": 0.7333755493164062, "grad_norm": 215.25111389160156, "learning_rate": 1.333131790161133e-05, "lookahead_loss": 7.668212026596069, "loss": 5.464, "step": 384500 }, { "epoch": 0.7343292236328125, "grad_norm": 469.9798583984375, "learning_rate": 1.3283634185791016e-05, "lookahead_loss": 7.780706875801086, "loss": 5.5354, "step": 385000 }, { "epoch": 0.7343292236328125, "eval_accuracy": 0.038334246575342464, "eval_lookahead_loss": 7.716654870033264, "eval_lookahead_perplexity": 2245.4357291083375, "eval_loss": 5.3563361167907715, "eval_perplexity": 211.94697316199418, "eval_runtime": 546.0051, "eval_samples_per_second": 18.315, "eval_steps_per_second": 4.579, "step": 385000 }, { "epoch": 0.7352828979492188, "grad_norm": 162.39273071289062, "learning_rate": 1.3235950469970703e-05, "lookahead_loss": 7.824522475242615, "loss": 5.598, "step": 385500 }, { "epoch": 0.736236572265625, "grad_norm": 129.5315704345703, "learning_rate": 1.3188266754150391e-05, "lookahead_loss": 7.814901178359985, "loss": 5.5896, "step": 386000 }, { "epoch": 0.7371902465820312, "grad_norm": 198.42787170410156, "learning_rate": 1.3140583038330078e-05, "lookahead_loss": 7.703146760940552, "loss": 5.5665, "step": 386500 }, { "epoch": 0.7381439208984375, "grad_norm": 551.6124877929688, "learning_rate": 1.3092899322509767e-05, "lookahead_loss": 7.861192024230957, "loss": 5.5338, "step": 387000 }, { "epoch": 0.7390975952148438, "grad_norm": 109.1438217163086, "learning_rate": 1.3045215606689454e-05, "lookahead_loss": 7.787821484565735, "loss": 5.5051, "step": 387500 }, { "epoch": 0.74005126953125, "grad_norm": 153.73126220703125, "learning_rate": 1.299753189086914e-05, "lookahead_loss": 7.77448010635376, "loss": 5.5942, "step": 388000 }, { "epoch": 0.7410049438476562, "grad_norm": 65.89240264892578, "learning_rate": 1.2949848175048829e-05, "lookahead_loss": 7.767938301086426, "loss": 5.5491, "step": 388500 }, { "epoch": 0.7419586181640625, "grad_norm": 155.22056579589844, "learning_rate": 1.2902164459228516e-05, "lookahead_loss": 7.787211930274963, "loss": 5.5135, "step": 389000 }, { "epoch": 0.7429122924804688, "grad_norm": 262.892578125, "learning_rate": 1.2854480743408204e-05, "lookahead_loss": 7.843936257362365, "loss": 5.586, "step": 389500 }, { "epoch": 0.743865966796875, "grad_norm": 1119.6715087890625, "learning_rate": 1.2806797027587891e-05, "lookahead_loss": 7.808092055320739, "loss": 5.5659, "step": 390000 }, { "epoch": 0.743865966796875, "eval_accuracy": 0.03835440313111546, "eval_lookahead_loss": 7.742131027793884, "eval_lookahead_perplexity": 2303.3757143662056, "eval_loss": 5.349612712860107, "eval_perplexity": 210.5267477740264, "eval_runtime": 530.5133, "eval_samples_per_second": 18.85, "eval_steps_per_second": 4.712, "step": 390000 }, { "epoch": 0.7448196411132812, "grad_norm": 110.75211334228516, "learning_rate": 1.2759113311767578e-05, "lookahead_loss": 7.837679747581482, "loss": 5.5302, "step": 390500 }, { "epoch": 0.7457733154296875, "grad_norm": 164.66920471191406, "learning_rate": 1.2711429595947266e-05, "lookahead_loss": 7.82012502002716, "loss": 5.5628, "step": 391000 }, { "epoch": 0.7467269897460938, "grad_norm": 253.49217224121094, "learning_rate": 1.2663745880126953e-05, "lookahead_loss": 7.723696685791015, "loss": 5.4981, "step": 391500 }, { "epoch": 0.7476806640625, "grad_norm": 194.13186645507812, "learning_rate": 1.2616062164306642e-05, "lookahead_loss": 7.7434462480545045, "loss": 5.5023, "step": 392000 }, { "epoch": 0.7486343383789062, "grad_norm": 161.11878967285156, "learning_rate": 1.2568378448486329e-05, "lookahead_loss": 7.827606229782105, "loss": 5.5217, "step": 392500 }, { "epoch": 0.7495880126953125, "grad_norm": 156.92559814453125, "learning_rate": 1.2520694732666015e-05, "lookahead_loss": 7.885674408912658, "loss": 5.4902, "step": 393000 }, { "epoch": 0.7505416870117188, "grad_norm": 181.0946807861328, "learning_rate": 1.2473011016845704e-05, "lookahead_loss": 7.857169241905212, "loss": 5.5287, "step": 393500 }, { "epoch": 0.751495361328125, "grad_norm": 87.40216064453125, "learning_rate": 1.242532730102539e-05, "lookahead_loss": 7.85726635313034, "loss": 5.5248, "step": 394000 }, { "epoch": 0.7524490356445312, "grad_norm": 240.7547607421875, "learning_rate": 1.237764358520508e-05, "lookahead_loss": 7.870921675682068, "loss": 5.5188, "step": 394500 }, { "epoch": 0.7534027099609375, "grad_norm": 405.5052795410156, "learning_rate": 1.2329959869384766e-05, "lookahead_loss": 7.794099634170532, "loss": 5.5114, "step": 395000 }, { "epoch": 0.7534027099609375, "eval_accuracy": 0.03817260273972603, "eval_lookahead_loss": 7.759988555908203, "eval_lookahead_perplexity": 2344.8777701258873, "eval_loss": 5.34311056137085, "eval_perplexity": 209.16231165974463, "eval_runtime": 547.2196, "eval_samples_per_second": 18.274, "eval_steps_per_second": 4.569, "step": 395000 }, { "epoch": 0.7543563842773438, "grad_norm": 66.67276000976562, "learning_rate": 1.2282276153564453e-05, "lookahead_loss": 7.785623269081116, "loss": 5.4901, "step": 395500 }, { "epoch": 0.75531005859375, "grad_norm": 174.82382202148438, "learning_rate": 1.2234592437744141e-05, "lookahead_loss": 7.781565052986145, "loss": 5.4129, "step": 396000 }, { "epoch": 0.7562637329101562, "grad_norm": 935.8668212890625, "learning_rate": 1.2186908721923828e-05, "lookahead_loss": 7.922936456680298, "loss": 5.5139, "step": 396500 }, { "epoch": 0.7572174072265625, "grad_norm": 113.73637390136719, "learning_rate": 1.2139225006103517e-05, "lookahead_loss": 7.81322253704071, "loss": 5.5103, "step": 397000 }, { "epoch": 0.7581710815429688, "grad_norm": 254.02626037597656, "learning_rate": 1.2091541290283204e-05, "lookahead_loss": 7.85380380821228, "loss": 5.522, "step": 397500 }, { "epoch": 0.759124755859375, "grad_norm": 182.22901916503906, "learning_rate": 1.204385757446289e-05, "lookahead_loss": 7.852971189975738, "loss": 5.4792, "step": 398000 }, { "epoch": 0.7600784301757812, "grad_norm": 212.43470764160156, "learning_rate": 1.1996173858642579e-05, "lookahead_loss": 7.90075197315216, "loss": 5.513, "step": 398500 }, { "epoch": 0.7610321044921875, "grad_norm": 126.07649993896484, "learning_rate": 1.1948490142822266e-05, "lookahead_loss": 7.852678387641907, "loss": 5.4944, "step": 399000 }, { "epoch": 0.7619857788085938, "grad_norm": 272.16387939453125, "learning_rate": 1.1900806427001954e-05, "lookahead_loss": 7.846392110824585, "loss": 5.4904, "step": 399500 }, { "epoch": 0.762939453125, "grad_norm": 117.46968841552734, "learning_rate": 1.1853122711181641e-05, "lookahead_loss": 7.842329785346985, "loss": 5.5024, "step": 400000 }, { "epoch": 0.762939453125, "eval_accuracy": 0.03825714285714286, "eval_lookahead_loss": 7.777910020828247, "eval_lookahead_perplexity": 2387.2802370950617, "eval_loss": 5.336972236633301, "eval_perplexity": 207.882337934124, "eval_runtime": 564.0388, "eval_samples_per_second": 17.729, "eval_steps_per_second": 4.432, "step": 400000 }, { "epoch": 0.7638931274414062, "grad_norm": 69.66509246826172, "learning_rate": 1.1805438995361328e-05, "lookahead_loss": 7.784950831413269, "loss": 5.4966, "step": 400500 }, { "epoch": 0.7648468017578125, "grad_norm": 96.76663208007812, "learning_rate": 1.1757755279541016e-05, "lookahead_loss": 7.750488299369812, "loss": 5.4667, "step": 401000 }, { "epoch": 0.7658004760742188, "grad_norm": 170.43850708007812, "learning_rate": 1.1710071563720703e-05, "lookahead_loss": 7.796458812713623, "loss": 5.4423, "step": 401500 }, { "epoch": 0.766754150390625, "grad_norm": 151.98562622070312, "learning_rate": 1.1662387847900392e-05, "lookahead_loss": 7.744953604698181, "loss": 5.4719, "step": 402000 }, { "epoch": 0.7677078247070312, "grad_norm": 329.09686279296875, "learning_rate": 1.1614704132080079e-05, "lookahead_loss": 7.685482928276062, "loss": 5.4275, "step": 402500 }, { "epoch": 0.7686614990234375, "grad_norm": 107.29830169677734, "learning_rate": 1.1567020416259765e-05, "lookahead_loss": 7.81631841468811, "loss": 5.5416, "step": 403000 }, { "epoch": 0.7696151733398438, "grad_norm": 141.64466857910156, "learning_rate": 1.1519336700439454e-05, "lookahead_loss": 7.8476922960281374, "loss": 5.4771, "step": 403500 }, { "epoch": 0.77056884765625, "grad_norm": 110.25850677490234, "learning_rate": 1.147165298461914e-05, "lookahead_loss": 7.822906564712524, "loss": 5.5389, "step": 404000 }, { "epoch": 0.7715225219726562, "grad_norm": 147.84559631347656, "learning_rate": 1.142396926879883e-05, "lookahead_loss": 7.853226441383362, "loss": 5.6036, "step": 404500 }, { "epoch": 0.7724761962890625, "grad_norm": 265.9990539550781, "learning_rate": 1.1376285552978516e-05, "lookahead_loss": 7.864651885032654, "loss": 5.5723, "step": 405000 }, { "epoch": 0.7724761962890625, "eval_accuracy": 0.03810724070450098, "eval_lookahead_loss": 7.745997590065002, "eval_lookahead_perplexity": 2312.299100295992, "eval_loss": 5.330674171447754, "eval_perplexity": 206.5771956685277, "eval_runtime": 508.5935, "eval_samples_per_second": 19.662, "eval_steps_per_second": 4.916, "step": 405000 }, { "epoch": 0.7734298706054688, "grad_norm": 357.8275146484375, "learning_rate": 1.1328601837158203e-05, "lookahead_loss": 7.727678363800049, "loss": 5.55, "step": 405500 }, { "epoch": 0.774383544921875, "grad_norm": 136.9630584716797, "learning_rate": 1.1280918121337891e-05, "lookahead_loss": 7.7854668245315555, "loss": 5.5688, "step": 406000 }, { "epoch": 0.7753372192382812, "grad_norm": 312.82269287109375, "learning_rate": 1.1233234405517578e-05, "lookahead_loss": 7.719874919891358, "loss": 5.5005, "step": 406500 }, { "epoch": 0.7762908935546875, "grad_norm": 13.053879737854004, "learning_rate": 1.1185550689697267e-05, "lookahead_loss": 7.8243987693786625, "loss": 5.541, "step": 407000 }, { "epoch": 0.7772445678710938, "grad_norm": 86.7345962524414, "learning_rate": 1.1137866973876954e-05, "lookahead_loss": 7.937169346809387, "loss": 5.4925, "step": 407500 }, { "epoch": 0.7781982421875, "grad_norm": 113.7857894897461, "learning_rate": 1.109018325805664e-05, "lookahead_loss": 7.802515108585357, "loss": 5.4891, "step": 408000 }, { "epoch": 0.7791519165039062, "grad_norm": 129.57952880859375, "learning_rate": 1.1042499542236329e-05, "lookahead_loss": 7.774034664154053, "loss": 5.5119, "step": 408500 }, { "epoch": 0.7801055908203125, "grad_norm": 118.24760437011719, "learning_rate": 1.0994815826416016e-05, "lookahead_loss": 7.721765648841858, "loss": 5.4936, "step": 409000 }, { "epoch": 0.7810592651367188, "grad_norm": 381.8975524902344, "learning_rate": 1.0947132110595704e-05, "lookahead_loss": 7.818323125839234, "loss": 5.5429, "step": 409500 }, { "epoch": 0.782012939453125, "grad_norm": 136.74472045898438, "learning_rate": 1.0899448394775391e-05, "lookahead_loss": 7.713536421775818, "loss": 5.4679, "step": 410000 }, { "epoch": 0.782012939453125, "eval_accuracy": 0.0383958904109589, "eval_lookahead_loss": 7.707771606636047, "eval_lookahead_perplexity": 2225.577266651169, "eval_loss": 5.320427417755127, "eval_perplexity": 204.47125796468424, "eval_runtime": 497.8089, "eval_samples_per_second": 20.088, "eval_steps_per_second": 5.022, "step": 410000 }, { "epoch": 0.7829666137695312, "grad_norm": 132.606201171875, "learning_rate": 1.0851764678955078e-05, "lookahead_loss": 7.649002804756164, "loss": 5.4205, "step": 410500 }, { "epoch": 0.7839202880859375, "grad_norm": 80.95204162597656, "learning_rate": 1.0804080963134766e-05, "lookahead_loss": 7.803996085166931, "loss": 5.4983, "step": 411000 }, { "epoch": 0.7848739624023438, "grad_norm": 240.8425750732422, "learning_rate": 1.0756397247314453e-05, "lookahead_loss": 7.769220488548279, "loss": 5.5195, "step": 411500 }, { "epoch": 0.78582763671875, "grad_norm": 119.86480712890625, "learning_rate": 1.0708713531494142e-05, "lookahead_loss": 7.765147686958313, "loss": 5.5, "step": 412000 }, { "epoch": 0.7867813110351562, "grad_norm": 88.26171112060547, "learning_rate": 1.0661029815673829e-05, "lookahead_loss": 7.712034560203552, "loss": 5.5098, "step": 412500 }, { "epoch": 0.7877349853515625, "grad_norm": 171.93386840820312, "learning_rate": 1.0613346099853515e-05, "lookahead_loss": 7.829547630310058, "loss": 5.5071, "step": 413000 }, { "epoch": 0.7886886596679688, "grad_norm": 147.1614532470703, "learning_rate": 1.0565662384033204e-05, "lookahead_loss": 7.797605522155762, "loss": 5.5237, "step": 413500 }, { "epoch": 0.789642333984375, "grad_norm": 298.8372802734375, "learning_rate": 1.051797866821289e-05, "lookahead_loss": 7.843295482635498, "loss": 5.4878, "step": 414000 }, { "epoch": 0.7905960083007812, "grad_norm": 182.6975555419922, "learning_rate": 1.047029495239258e-05, "lookahead_loss": 7.87462833404541, "loss": 5.5003, "step": 414500 }, { "epoch": 0.7915496826171875, "grad_norm": 255.0580291748047, "learning_rate": 1.0422611236572266e-05, "lookahead_loss": 7.75749968624115, "loss": 5.5022, "step": 415000 }, { "epoch": 0.7915496826171875, "eval_accuracy": 0.0379119373776908, "eval_lookahead_loss": 7.703862183761597, "eval_lookahead_perplexity": 2216.8935292367933, "eval_loss": 5.311500549316406, "eval_perplexity": 202.65409280978884, "eval_runtime": 507.8876, "eval_samples_per_second": 19.689, "eval_steps_per_second": 4.922, "step": 415000 }, { "epoch": 0.7925033569335938, "grad_norm": 163.51576232910156, "learning_rate": 1.0374927520751953e-05, "lookahead_loss": 7.790949342727661, "loss": 5.4871, "step": 415500 }, { "epoch": 0.79345703125, "grad_norm": 150.1991729736328, "learning_rate": 1.0327243804931641e-05, "lookahead_loss": 7.799429728507995, "loss": 5.4894, "step": 416000 }, { "epoch": 0.7944107055664062, "grad_norm": 15770.830078125, "learning_rate": 1.0279560089111328e-05, "lookahead_loss": 7.68923697757721, "loss": 5.4792, "step": 416500 }, { "epoch": 0.7953643798828125, "grad_norm": 124.74213409423828, "learning_rate": 1.0231876373291017e-05, "lookahead_loss": 7.75842698097229, "loss": 5.5357, "step": 417000 }, { "epoch": 0.7963180541992188, "grad_norm": 121.89371490478516, "learning_rate": 1.0184192657470704e-05, "lookahead_loss": 7.6784037504196165, "loss": 5.4504, "step": 417500 }, { "epoch": 0.797271728515625, "grad_norm": 114.72571563720703, "learning_rate": 1.013650894165039e-05, "lookahead_loss": 7.77801070022583, "loss": 5.488, "step": 418000 }, { "epoch": 0.7982254028320312, "grad_norm": 223.60484313964844, "learning_rate": 1.0088825225830079e-05, "lookahead_loss": 7.697888485908508, "loss": 5.4734, "step": 418500 }, { "epoch": 0.7991790771484375, "grad_norm": 112.8887939453125, "learning_rate": 1.0041141510009766e-05, "lookahead_loss": 7.679011648178101, "loss": 5.4312, "step": 419000 }, { "epoch": 0.8001327514648438, "grad_norm": 124.98394012451172, "learning_rate": 9.993457794189454e-06, "lookahead_loss": 7.726365936279297, "loss": 5.4442, "step": 419500 }, { "epoch": 0.80108642578125, "grad_norm": 181.61282348632812, "learning_rate": 9.945774078369141e-06, "lookahead_loss": 7.708832776069641, "loss": 5.4582, "step": 420000 }, { "epoch": 0.80108642578125, "eval_accuracy": 0.0382187866927593, "eval_lookahead_loss": 7.720092709350586, "eval_lookahead_perplexity": 2253.168460670755, "eval_loss": 5.309783458709717, "eval_perplexity": 202.30641595237518, "eval_runtime": 530.344, "eval_samples_per_second": 18.856, "eval_steps_per_second": 4.714, "step": 420000 }, { "epoch": 0.8020401000976562, "grad_norm": 103.95464324951172, "learning_rate": 9.898090362548828e-06, "lookahead_loss": 7.674679337501526, "loss": 5.4754, "step": 420500 }, { "epoch": 0.8029937744140625, "grad_norm": 199.56729125976562, "learning_rate": 9.850406646728516e-06, "lookahead_loss": 7.748070892333985, "loss": 5.5174, "step": 421000 }, { "epoch": 0.8039474487304688, "grad_norm": 350.2108154296875, "learning_rate": 9.802722930908203e-06, "lookahead_loss": 7.7302656116485595, "loss": 5.5472, "step": 421500 }, { "epoch": 0.804901123046875, "grad_norm": 87.42015075683594, "learning_rate": 9.755039215087892e-06, "lookahead_loss": 7.749013783454895, "loss": 5.5208, "step": 422000 }, { "epoch": 0.8058547973632812, "grad_norm": 96.27731323242188, "learning_rate": 9.707355499267579e-06, "lookahead_loss": 7.70984751033783, "loss": 5.5424, "step": 422500 }, { "epoch": 0.8068084716796875, "grad_norm": 297.0563659667969, "learning_rate": 9.659671783447265e-06, "lookahead_loss": 7.797365827560425, "loss": 5.5105, "step": 423000 }, { "epoch": 0.8077621459960938, "grad_norm": 108.24977111816406, "learning_rate": 9.611988067626954e-06, "lookahead_loss": 7.8540793771743775, "loss": 5.5273, "step": 423500 }, { "epoch": 0.8087158203125, "grad_norm": 160.7525177001953, "learning_rate": 9.56430435180664e-06, "lookahead_loss": 7.743619729995728, "loss": 5.4651, "step": 424000 }, { "epoch": 0.8096694946289062, "grad_norm": 144.3737030029297, "learning_rate": 9.51662063598633e-06, "lookahead_loss": 7.769571304321289, "loss": 5.5171, "step": 424500 }, { "epoch": 0.8106231689453125, "grad_norm": 197.60581970214844, "learning_rate": 9.468936920166016e-06, "lookahead_loss": 7.785249656677246, "loss": 5.4716, "step": 425000 }, { "epoch": 0.8106231689453125, "eval_accuracy": 0.0379320939334638, "eval_lookahead_loss": 7.696689488220215, "eval_lookahead_perplexity": 2201.049317685471, "eval_loss": 5.297877788543701, "eval_perplexity": 199.9121037191841, "eval_runtime": 504.3506, "eval_samples_per_second": 19.827, "eval_steps_per_second": 4.957, "step": 425000 }, { "epoch": 0.8115768432617188, "grad_norm": 734.5932006835938, "learning_rate": 9.421253204345703e-06, "lookahead_loss": 7.775777452468872, "loss": 5.4818, "step": 425500 }, { "epoch": 0.812530517578125, "grad_norm": 94.16063690185547, "learning_rate": 9.373569488525391e-06, "lookahead_loss": 7.729480327606201, "loss": 5.4929, "step": 426000 }, { "epoch": 0.8134841918945312, "grad_norm": 89.50267028808594, "learning_rate": 9.325885772705078e-06, "lookahead_loss": 7.789216313362122, "loss": 5.5052, "step": 426500 }, { "epoch": 0.8144378662109375, "grad_norm": 119.9497299194336, "learning_rate": 9.278202056884767e-06, "lookahead_loss": 7.800880197525024, "loss": 5.4966, "step": 427000 }, { "epoch": 0.8153915405273438, "grad_norm": 156.38731384277344, "learning_rate": 9.230518341064454e-06, "lookahead_loss": 7.7137400379180905, "loss": 5.4738, "step": 427500 }, { "epoch": 0.81634521484375, "grad_norm": 215.97308349609375, "learning_rate": 9.18283462524414e-06, "lookahead_loss": 7.738602474212646, "loss": 5.4777, "step": 428000 }, { "epoch": 0.8172988891601562, "grad_norm": 255.94969177246094, "learning_rate": 9.135150909423829e-06, "lookahead_loss": 7.775208297729492, "loss": 5.5147, "step": 428500 }, { "epoch": 0.8182525634765625, "grad_norm": 123.00167083740234, "learning_rate": 9.087467193603516e-06, "lookahead_loss": 7.753657881736755, "loss": 5.4559, "step": 429000 }, { "epoch": 0.8192062377929688, "grad_norm": 111.40052795410156, "learning_rate": 9.039783477783204e-06, "lookahead_loss": 7.842267797470093, "loss": 5.4805, "step": 429500 }, { "epoch": 0.820159912109375, "grad_norm": 206.00018310546875, "learning_rate": 8.992099761962891e-06, "lookahead_loss": 7.680137983322144, "loss": 5.4742, "step": 430000 }, { "epoch": 0.820159912109375, "eval_accuracy": 0.037870450097847355, "eval_lookahead_loss": 7.705809011268616, "eval_lookahead_perplexity": 2221.2136424327596, "eval_loss": 5.2943925857543945, "eval_perplexity": 199.2165822184436, "eval_runtime": 527.9198, "eval_samples_per_second": 18.942, "eval_steps_per_second": 4.736, "step": 430000 }, { "epoch": 0.8211135864257812, "grad_norm": 131.6860809326172, "learning_rate": 8.944416046142578e-06, "lookahead_loss": 7.7781404790878295, "loss": 5.4653, "step": 430500 }, { "epoch": 0.8220672607421875, "grad_norm": 197.48202514648438, "learning_rate": 8.896732330322266e-06, "lookahead_loss": 7.747256222724914, "loss": 5.5004, "step": 431000 }, { "epoch": 0.8230209350585938, "grad_norm": 109.67562103271484, "learning_rate": 8.849048614501953e-06, "lookahead_loss": 7.732384922027588, "loss": 5.501, "step": 431500 }, { "epoch": 0.823974609375, "grad_norm": 113.43403625488281, "learning_rate": 8.801364898681642e-06, "lookahead_loss": 7.704549064636231, "loss": 5.4421, "step": 432000 }, { "epoch": 0.8249282836914062, "grad_norm": 126.0523910522461, "learning_rate": 8.753681182861329e-06, "lookahead_loss": 7.770391117095947, "loss": 5.5053, "step": 432500 }, { "epoch": 0.8258819580078125, "grad_norm": 306.01702880859375, "learning_rate": 8.705997467041015e-06, "lookahead_loss": 7.843114874839783, "loss": 5.46, "step": 433000 }, { "epoch": 0.8268356323242188, "grad_norm": 92.95161437988281, "learning_rate": 8.658313751220704e-06, "lookahead_loss": 7.801055955886841, "loss": 5.5, "step": 433500 }, { "epoch": 0.827789306640625, "grad_norm": 318.50372314453125, "learning_rate": 8.61063003540039e-06, "lookahead_loss": 7.816186615943908, "loss": 5.4733, "step": 434000 }, { "epoch": 0.8287429809570312, "grad_norm": 156.40240478515625, "learning_rate": 8.56294631958008e-06, "lookahead_loss": 7.859918023109436, "loss": 5.4715, "step": 434500 }, { "epoch": 0.8296966552734375, "grad_norm": 94.06924438476562, "learning_rate": 8.515262603759766e-06, "lookahead_loss": 7.67504247379303, "loss": 5.456, "step": 435000 }, { "epoch": 0.8296966552734375, "eval_accuracy": 0.037759099804305286, "eval_lookahead_loss": 7.729836627960205, "eval_lookahead_perplexity": 2275.2304613807537, "eval_loss": 5.28588342666626, "eval_perplexity": 197.52860843100976, "eval_runtime": 639.2357, "eval_samples_per_second": 15.644, "eval_steps_per_second": 3.911, "step": 435000 }, { "epoch": 0.8306503295898438, "grad_norm": 182.57151794433594, "learning_rate": 8.467578887939453e-06, "lookahead_loss": 7.599406764984131, "loss": 5.4543, "step": 435500 }, { "epoch": 0.83160400390625, "grad_norm": 102.3682632446289, "learning_rate": 8.419895172119141e-06, "lookahead_loss": 7.6790240297317505, "loss": 5.4444, "step": 436000 }, { "epoch": 0.8325576782226562, "grad_norm": 130.85208129882812, "learning_rate": 8.372211456298828e-06, "lookahead_loss": 7.777050636291504, "loss": 5.4141, "step": 436500 }, { "epoch": 0.8335113525390625, "grad_norm": 84.96771240234375, "learning_rate": 8.324527740478517e-06, "lookahead_loss": 7.740947603225708, "loss": 5.3987, "step": 437000 }, { "epoch": 0.8344650268554688, "grad_norm": 139.57728576660156, "learning_rate": 8.276844024658204e-06, "lookahead_loss": 7.764598629951477, "loss": 5.456, "step": 437500 }, { "epoch": 0.835418701171875, "grad_norm": 283.9523010253906, "learning_rate": 8.22916030883789e-06, "lookahead_loss": 7.85090064907074, "loss": 5.4813, "step": 438000 }, { "epoch": 0.8363723754882812, "grad_norm": 104.79523468017578, "learning_rate": 8.181476593017579e-06, "lookahead_loss": 7.8095324149131775, "loss": 5.5272, "step": 438500 }, { "epoch": 0.8373260498046875, "grad_norm": 303.10797119140625, "learning_rate": 8.133792877197266e-06, "lookahead_loss": 7.858753878593445, "loss": 5.5229, "step": 439000 }, { "epoch": 0.8382797241210938, "grad_norm": 199.2714080810547, "learning_rate": 8.086109161376954e-06, "lookahead_loss": 7.707453769683838, "loss": 5.529, "step": 439500 }, { "epoch": 0.8392333984375, "grad_norm": 150.98452758789062, "learning_rate": 8.038425445556641e-06, "lookahead_loss": 7.744642693519593, "loss": 5.4751, "step": 440000 }, { "epoch": 0.8392333984375, "eval_accuracy": 0.037976908023483365, "eval_lookahead_loss": 7.7101520814895625, "eval_lookahead_perplexity": 2230.8815081709868, "eval_loss": 5.279390811920166, "eval_perplexity": 196.25028559452127, "eval_runtime": 1218.8406, "eval_samples_per_second": 8.205, "eval_steps_per_second": 2.051, "step": 440000 }, { "epoch": 0.8401870727539062, "grad_norm": 90.09400939941406, "learning_rate": 7.990741729736328e-06, "lookahead_loss": 7.72508162689209, "loss": 5.4921, "step": 440500 }, { "epoch": 0.8411407470703125, "grad_norm": 149.82041931152344, "learning_rate": 7.943058013916016e-06, "lookahead_loss": 7.713854930877686, "loss": 5.4586, "step": 441000 }, { "epoch": 0.8420944213867188, "grad_norm": 169.2808074951172, "learning_rate": 7.895374298095703e-06, "lookahead_loss": 7.776987294197083, "loss": 5.43, "step": 441500 }, { "epoch": 0.843048095703125, "grad_norm": 127.67754364013672, "learning_rate": 7.847690582275392e-06, "lookahead_loss": 7.789427947044373, "loss": 5.4273, "step": 442000 }, { "epoch": 0.8440017700195312, "grad_norm": 96.45807647705078, "learning_rate": 7.800006866455079e-06, "lookahead_loss": 7.707818923950195, "loss": 5.4764, "step": 442500 }, { "epoch": 0.8449554443359375, "grad_norm": 120.39371490478516, "learning_rate": 7.752323150634765e-06, "lookahead_loss": 7.766773469924927, "loss": 5.4607, "step": 443000 }, { "epoch": 0.8459091186523438, "grad_norm": 122.88436126708984, "learning_rate": 7.704639434814454e-06, "lookahead_loss": 7.7503203182220455, "loss": 5.4387, "step": 443500 }, { "epoch": 0.84686279296875, "grad_norm": 150.4892120361328, "learning_rate": 7.65695571899414e-06, "lookahead_loss": 7.692335112571716, "loss": 5.4645, "step": 444000 }, { "epoch": 0.8478164672851562, "grad_norm": 98.17855834960938, "learning_rate": 7.6092720031738284e-06, "lookahead_loss": 7.823717679977417, "loss": 5.4307, "step": 444500 }, { "epoch": 0.8487701416015625, "grad_norm": 317.6507873535156, "learning_rate": 7.561588287353516e-06, "lookahead_loss": 7.669734752655029, "loss": 5.4628, "step": 445000 }, { "epoch": 0.8487701416015625, "eval_accuracy": 0.03793522504892368, "eval_lookahead_loss": 7.732646193313599, "eval_lookahead_perplexity": 2281.6318584168284, "eval_loss": 5.274598598480225, "eval_perplexity": 195.31206221719063, "eval_runtime": 543.8746, "eval_samples_per_second": 18.387, "eval_steps_per_second": 4.597, "step": 445000 }, { "epoch": 0.8497238159179688, "grad_norm": 98.74201202392578, "learning_rate": 7.513904571533204e-06, "lookahead_loss": 7.716559967041015, "loss": 5.4684, "step": 445500 }, { "epoch": 0.850677490234375, "grad_norm": 797.8794555664062, "learning_rate": 7.466220855712891e-06, "lookahead_loss": 7.728174803733825, "loss": 5.457, "step": 446000 }, { "epoch": 0.8516311645507812, "grad_norm": 149.2899932861328, "learning_rate": 7.418537139892578e-06, "lookahead_loss": 7.748901443481445, "loss": 5.507, "step": 446500 }, { "epoch": 0.8525848388671875, "grad_norm": 150.59202575683594, "learning_rate": 7.370853424072266e-06, "lookahead_loss": 7.86814893913269, "loss": 5.4778, "step": 447000 }, { "epoch": 0.8535385131835938, "grad_norm": 114.1617660522461, "learning_rate": 7.323169708251954e-06, "lookahead_loss": 7.736129783630371, "loss": 5.4728, "step": 447500 }, { "epoch": 0.8544921875, "grad_norm": 184.8936767578125, "learning_rate": 7.275485992431641e-06, "lookahead_loss": 7.691138429641724, "loss": 5.4947, "step": 448000 }, { "epoch": 0.8554458618164062, "grad_norm": 193.5766143798828, "learning_rate": 7.227802276611328e-06, "lookahead_loss": 7.7288674039840695, "loss": 5.439, "step": 448500 }, { "epoch": 0.8563995361328125, "grad_norm": 279.9077453613281, "learning_rate": 7.180118560791016e-06, "lookahead_loss": 7.668610495090484, "loss": 5.3734, "step": 449000 }, { "epoch": 0.8573532104492188, "grad_norm": 257.7452697753906, "learning_rate": 7.1324348449707034e-06, "lookahead_loss": 7.746779972076416, "loss": 5.4672, "step": 449500 }, { "epoch": 0.858306884765625, "grad_norm": 103.3786392211914, "learning_rate": 7.084751129150391e-06, "lookahead_loss": 7.657151092529297, "loss": 5.3535, "step": 450000 }, { "epoch": 0.858306884765625, "eval_accuracy": 0.03771389432485323, "eval_lookahead_loss": 7.686327855300903, "eval_lookahead_perplexity": 2178.360601686126, "eval_loss": 5.273197174072266, "eval_perplexity": 195.03853883197118, "eval_runtime": 522.5465, "eval_samples_per_second": 19.137, "eval_steps_per_second": 4.784, "step": 450000 }, { "epoch": 0.8592605590820312, "grad_norm": 216.6535186767578, "learning_rate": 7.037067413330079e-06, "lookahead_loss": 7.793745121002197, "loss": 5.4505, "step": 450500 }, { "epoch": 0.8602142333984375, "grad_norm": 553.9771728515625, "learning_rate": 6.989383697509766e-06, "lookahead_loss": 7.640247445106507, "loss": 5.3837, "step": 451000 }, { "epoch": 0.8611679077148438, "grad_norm": 164.79608154296875, "learning_rate": 6.941699981689453e-06, "lookahead_loss": 7.816086415290832, "loss": 5.4752, "step": 451500 }, { "epoch": 0.86212158203125, "grad_norm": 188.76063537597656, "learning_rate": 6.894016265869141e-06, "lookahead_loss": 7.750656454086304, "loss": 5.4572, "step": 452000 }, { "epoch": 0.8630752563476562, "grad_norm": 120.69510650634766, "learning_rate": 6.846332550048829e-06, "lookahead_loss": 7.658810668945312, "loss": 5.4083, "step": 452500 }, { "epoch": 0.8640289306640625, "grad_norm": 70.45494842529297, "learning_rate": 6.798648834228516e-06, "lookahead_loss": 7.733378632545471, "loss": 5.3804, "step": 453000 }, { "epoch": 0.8649826049804688, "grad_norm": 193.24459838867188, "learning_rate": 6.750965118408203e-06, "lookahead_loss": 7.659905039787293, "loss": 5.4082, "step": 453500 }, { "epoch": 0.865936279296875, "grad_norm": 254.3077392578125, "learning_rate": 6.703281402587891e-06, "lookahead_loss": 7.73323121547699, "loss": 5.4215, "step": 454000 }, { "epoch": 0.8668899536132812, "grad_norm": 130.81289672851562, "learning_rate": 6.6555976867675784e-06, "lookahead_loss": 7.705599264144897, "loss": 5.5111, "step": 454500 }, { "epoch": 0.8678436279296875, "grad_norm": 218.465576171875, "learning_rate": 6.607913970947266e-06, "lookahead_loss": 7.750705163955688, "loss": 5.5193, "step": 455000 }, { "epoch": 0.8678436279296875, "eval_accuracy": 0.037965753424657536, "eval_lookahead_loss": 7.706162044715882, "eval_lookahead_perplexity": 2221.9979435761843, "eval_loss": 5.264558792114258, "eval_perplexity": 193.36097757630287, "eval_runtime": 552.673, "eval_samples_per_second": 18.094, "eval_steps_per_second": 4.523, "step": 455000 }, { "epoch": 0.8687973022460938, "grad_norm": 270.5840148925781, "learning_rate": 6.560230255126954e-06, "lookahead_loss": 7.776663407325745, "loss": 5.4378, "step": 455500 }, { "epoch": 0.8697509765625, "grad_norm": 149.66030883789062, "learning_rate": 6.512546539306641e-06, "lookahead_loss": 7.777847550392151, "loss": 5.5021, "step": 456000 }, { "epoch": 0.8707046508789062, "grad_norm": 231.241455078125, "learning_rate": 6.464862823486328e-06, "lookahead_loss": 7.675812285423278, "loss": 5.4423, "step": 456500 }, { "epoch": 0.8716583251953125, "grad_norm": 177.73062133789062, "learning_rate": 6.417179107666016e-06, "lookahead_loss": 7.7567992095947265, "loss": 5.4594, "step": 457000 }, { "epoch": 0.8726119995117188, "grad_norm": 232.8194580078125, "learning_rate": 6.369495391845704e-06, "lookahead_loss": 7.7342501668930055, "loss": 5.4323, "step": 457500 }, { "epoch": 0.873565673828125, "grad_norm": 167.5643310546875, "learning_rate": 6.321811676025391e-06, "lookahead_loss": 7.744601145744324, "loss": 5.4687, "step": 458000 }, { "epoch": 0.8745193481445312, "grad_norm": 95.80736541748047, "learning_rate": 6.274127960205078e-06, "lookahead_loss": 7.796501852989197, "loss": 5.4301, "step": 458500 }, { "epoch": 0.8754730224609375, "grad_norm": 323.820556640625, "learning_rate": 6.226444244384766e-06, "lookahead_loss": 7.819704398155213, "loss": 5.4798, "step": 459000 }, { "epoch": 0.8764266967773438, "grad_norm": 80.21458435058594, "learning_rate": 6.1787605285644534e-06, "lookahead_loss": 7.744061072349548, "loss": 5.4072, "step": 459500 }, { "epoch": 0.87738037109375, "grad_norm": 72.09957885742188, "learning_rate": 6.131076812744141e-06, "lookahead_loss": 7.71459010887146, "loss": 5.4747, "step": 460000 }, { "epoch": 0.87738037109375, "eval_accuracy": 0.03743894324853229, "eval_lookahead_loss": 7.688755961990356, "eval_lookahead_perplexity": 2183.6563203182072, "eval_loss": 5.2600998878479, "eval_perplexity": 192.5007188194539, "eval_runtime": 511.8174, "eval_samples_per_second": 19.538, "eval_steps_per_second": 4.885, "step": 460000 }, { "epoch": 0.8783340454101562, "grad_norm": 363.082275390625, "learning_rate": 6.083393096923829e-06, "lookahead_loss": 7.616396760940551, "loss": 5.4528, "step": 460500 }, { "epoch": 0.8792877197265625, "grad_norm": 213.73391723632812, "learning_rate": 6.035709381103516e-06, "lookahead_loss": 7.7111747407913205, "loss": 5.4592, "step": 461000 }, { "epoch": 0.8802413940429688, "grad_norm": 402.4309997558594, "learning_rate": 5.988025665283203e-06, "lookahead_loss": 7.654868431091309, "loss": 5.409, "step": 461500 }, { "epoch": 0.881195068359375, "grad_norm": 142.43113708496094, "learning_rate": 5.940341949462891e-06, "lookahead_loss": 7.758353511810303, "loss": 5.4602, "step": 462000 }, { "epoch": 0.8821487426757812, "grad_norm": 126.97706604003906, "learning_rate": 5.892658233642579e-06, "lookahead_loss": 7.70611284160614, "loss": 5.4189, "step": 462500 }, { "epoch": 0.8831024169921875, "grad_norm": 104.2982177734375, "learning_rate": 5.844974517822266e-06, "lookahead_loss": 7.7407079010009765, "loss": 5.4741, "step": 463000 }, { "epoch": 0.8840560913085938, "grad_norm": 147.13421630859375, "learning_rate": 5.797290802001953e-06, "lookahead_loss": 7.645737180709839, "loss": 5.4482, "step": 463500 }, { "epoch": 0.885009765625, "grad_norm": 157.4132080078125, "learning_rate": 5.749607086181641e-06, "lookahead_loss": 7.686143802642822, "loss": 5.4446, "step": 464000 }, { "epoch": 0.8859634399414062, "grad_norm": 220.73529052734375, "learning_rate": 5.7019233703613284e-06, "lookahead_loss": 7.678248985290527, "loss": 5.4347, "step": 464500 }, { "epoch": 0.8869171142578125, "grad_norm": 105.21410369873047, "learning_rate": 5.654239654541016e-06, "lookahead_loss": 7.681058571815491, "loss": 5.4077, "step": 465000 }, { "epoch": 0.8869171142578125, "eval_accuracy": 0.037543835616438356, "eval_lookahead_loss": 7.667192045974732, "eval_lookahead_perplexity": 2137.072211851019, "eval_loss": 5.252965450286865, "eval_perplexity": 191.1322219928881, "eval_runtime": 522.4441, "eval_samples_per_second": 19.141, "eval_steps_per_second": 4.785, "step": 465000 }, { "epoch": 0.8878707885742188, "grad_norm": 150.7294158935547, "learning_rate": 5.606555938720704e-06, "lookahead_loss": 7.725572785377502, "loss": 5.4244, "step": 465500 }, { "epoch": 0.888824462890625, "grad_norm": 101.06095886230469, "learning_rate": 5.558872222900391e-06, "lookahead_loss": 7.7090388069152835, "loss": 5.4281, "step": 466000 }, { "epoch": 0.8897781372070312, "grad_norm": 162.92123413085938, "learning_rate": 5.511188507080078e-06, "lookahead_loss": 7.739173527717591, "loss": 5.3883, "step": 466500 }, { "epoch": 0.8907318115234375, "grad_norm": 214.8478240966797, "learning_rate": 5.463504791259766e-06, "lookahead_loss": 7.825649773597717, "loss": 5.4192, "step": 467000 }, { "epoch": 0.8916854858398438, "grad_norm": 137.3257598876953, "learning_rate": 5.415821075439454e-06, "lookahead_loss": 7.6974540634155275, "loss": 5.41, "step": 467500 }, { "epoch": 0.89263916015625, "grad_norm": 104.17424011230469, "learning_rate": 5.368137359619141e-06, "lookahead_loss": 7.733235692024231, "loss": 5.4257, "step": 468000 }, { "epoch": 0.8935928344726562, "grad_norm": 140.99998474121094, "learning_rate": 5.320453643798828e-06, "lookahead_loss": 7.718685688495636, "loss": 5.4114, "step": 468500 }, { "epoch": 0.8945465087890625, "grad_norm": 174.72120666503906, "learning_rate": 5.272769927978516e-06, "lookahead_loss": 7.642657055854797, "loss": 5.4343, "step": 469000 }, { "epoch": 0.8955001831054688, "grad_norm": 108.69181060791016, "learning_rate": 5.2250862121582034e-06, "lookahead_loss": 7.605739291191101, "loss": 5.3956, "step": 469500 }, { "epoch": 0.896453857421875, "grad_norm": 161.02919006347656, "learning_rate": 5.177402496337891e-06, "lookahead_loss": 7.7666136140823365, "loss": 5.4288, "step": 470000 }, { "epoch": 0.896453857421875, "eval_accuracy": 0.037718199608610566, "eval_lookahead_loss": 7.65384827709198, "eval_lookahead_perplexity": 2108.7450301709177, "eval_loss": 5.248459815979004, "eval_perplexity": 190.2729872485019, "eval_runtime": 544.868, "eval_samples_per_second": 18.353, "eval_steps_per_second": 4.588, "step": 470000 }, { "epoch": 0.8974075317382812, "grad_norm": 97.2838134765625, "learning_rate": 5.129718780517579e-06, "lookahead_loss": 7.610149185180664, "loss": 5.4018, "step": 470500 }, { "epoch": 0.8983612060546875, "grad_norm": 69.68118286132812, "learning_rate": 5.082035064697266e-06, "lookahead_loss": 7.718782844543457, "loss": 5.5024, "step": 471000 }, { "epoch": 0.8993148803710938, "grad_norm": 115.63383483886719, "learning_rate": 5.034351348876953e-06, "lookahead_loss": 7.656383040428161, "loss": 5.5489, "step": 471500 }, { "epoch": 0.9002685546875, "grad_norm": 169.95054626464844, "learning_rate": 4.986667633056641e-06, "lookahead_loss": 7.67255229473114, "loss": 5.4864, "step": 472000 }, { "epoch": 0.9012222290039062, "grad_norm": 65.87653350830078, "learning_rate": 4.938983917236329e-06, "lookahead_loss": 7.684901348114014, "loss": 5.4657, "step": 472500 }, { "epoch": 0.9021759033203125, "grad_norm": 104.85416412353516, "learning_rate": 4.891300201416016e-06, "lookahead_loss": 7.742523490905762, "loss": 5.4211, "step": 473000 }, { "epoch": 0.9031295776367188, "grad_norm": 135.2244415283203, "learning_rate": 4.843616485595703e-06, "lookahead_loss": 7.694372376441955, "loss": 5.44, "step": 473500 }, { "epoch": 0.904083251953125, "grad_norm": 109.97718048095703, "learning_rate": 4.795932769775391e-06, "lookahead_loss": 7.7515069169998165, "loss": 5.4679, "step": 474000 }, { "epoch": 0.9050369262695312, "grad_norm": 64.65538024902344, "learning_rate": 4.7482490539550784e-06, "lookahead_loss": 7.775709008216858, "loss": 5.4299, "step": 474500 }, { "epoch": 0.9059906005859375, "grad_norm": 78.92939758300781, "learning_rate": 4.700565338134766e-06, "lookahead_loss": 7.810865023612976, "loss": 5.4653, "step": 475000 }, { "epoch": 0.9059906005859375, "eval_accuracy": 0.03773835616438356, "eval_lookahead_loss": 7.664966944503784, "eval_lookahead_perplexity": 2132.3222958112583, "eval_loss": 5.245616436004639, "eval_perplexity": 189.73273727885407, "eval_runtime": 555.6172, "eval_samples_per_second": 17.998, "eval_steps_per_second": 4.5, "step": 475000 }, { "epoch": 0.9069442749023438, "grad_norm": 234.80465698242188, "learning_rate": 4.652881622314453e-06, "lookahead_loss": 7.6420389251708984, "loss": 5.4469, "step": 475500 }, { "epoch": 0.90789794921875, "grad_norm": 125.80445861816406, "learning_rate": 4.605197906494141e-06, "lookahead_loss": 7.7773317623138425, "loss": 5.4515, "step": 476000 }, { "epoch": 0.9088516235351562, "grad_norm": 108.83649444580078, "learning_rate": 4.557514190673828e-06, "lookahead_loss": 7.645542621612549, "loss": 5.4213, "step": 476500 }, { "epoch": 0.9098052978515625, "grad_norm": 105.81759643554688, "learning_rate": 4.509830474853516e-06, "lookahead_loss": 7.6288732271194455, "loss": 5.4256, "step": 477000 }, { "epoch": 0.9107589721679688, "grad_norm": 147.88575744628906, "learning_rate": 4.462146759033204e-06, "lookahead_loss": 7.6444828977584836, "loss": 5.3914, "step": 477500 }, { "epoch": 0.911712646484375, "grad_norm": 271.27716064453125, "learning_rate": 4.4144630432128904e-06, "lookahead_loss": 7.757286865234375, "loss": 5.4608, "step": 478000 }, { "epoch": 0.9126663208007812, "grad_norm": 120.65753173828125, "learning_rate": 4.366779327392578e-06, "lookahead_loss": 7.728390460968018, "loss": 5.4332, "step": 478500 }, { "epoch": 0.9136199951171875, "grad_norm": 116.51058959960938, "learning_rate": 4.319095611572266e-06, "lookahead_loss": 7.7424820652008055, "loss": 5.4456, "step": 479000 }, { "epoch": 0.9145736694335938, "grad_norm": 108.98234558105469, "learning_rate": 4.2714118957519534e-06, "lookahead_loss": 7.795019497871399, "loss": 5.4525, "step": 479500 }, { "epoch": 0.91552734375, "grad_norm": 48.105224609375, "learning_rate": 4.223728179931641e-06, "lookahead_loss": 7.580483585357666, "loss": 5.3929, "step": 480000 }, { "epoch": 0.91552734375, "eval_accuracy": 0.03760880626223092, "eval_lookahead_loss": 7.660269195365906, "eval_lookahead_perplexity": 2122.32867273051, "eval_loss": 5.240941047668457, "eval_perplexity": 188.84773353221235, "eval_runtime": 500.825, "eval_samples_per_second": 19.967, "eval_steps_per_second": 4.992, "step": 480000 }, { "epoch": 0.9164810180664062, "grad_norm": 202.2429962158203, "learning_rate": 4.176044464111328e-06, "lookahead_loss": 7.6918159732818605, "loss": 5.4069, "step": 480500 }, { "epoch": 0.9174346923828125, "grad_norm": 109.00312805175781, "learning_rate": 4.128360748291016e-06, "lookahead_loss": 7.693651158332825, "loss": 5.4204, "step": 481000 }, { "epoch": 0.9183883666992188, "grad_norm": 106.34810638427734, "learning_rate": 4.080677032470703e-06, "lookahead_loss": 7.673038469314575, "loss": 5.4062, "step": 481500 }, { "epoch": 0.919342041015625, "grad_norm": 96.4828872680664, "learning_rate": 4.032993316650391e-06, "lookahead_loss": 7.70093497467041, "loss": 5.4205, "step": 482000 }, { "epoch": 0.9202957153320312, "grad_norm": 69.57780456542969, "learning_rate": 3.985309600830079e-06, "lookahead_loss": 7.76972049331665, "loss": 5.4407, "step": 482500 }, { "epoch": 0.9212493896484375, "grad_norm": 142.65328979492188, "learning_rate": 3.9376258850097654e-06, "lookahead_loss": 7.731374537467956, "loss": 5.4283, "step": 483000 }, { "epoch": 0.9222030639648438, "grad_norm": 64.49398803710938, "learning_rate": 3.889942169189453e-06, "lookahead_loss": 7.6634223232269285, "loss": 5.3897, "step": 483500 }, { "epoch": 0.92315673828125, "grad_norm": 166.99147033691406, "learning_rate": 3.842258453369141e-06, "lookahead_loss": 7.666593769073486, "loss": 5.4466, "step": 484000 }, { "epoch": 0.9241104125976562, "grad_norm": 190.2186279296875, "learning_rate": 3.7945747375488284e-06, "lookahead_loss": 7.610544244766236, "loss": 5.3883, "step": 484500 }, { "epoch": 0.9250640869140625, "grad_norm": 89.94544219970703, "learning_rate": 3.7468910217285157e-06, "lookahead_loss": 7.624362926483155, "loss": 5.405, "step": 485000 }, { "epoch": 0.9250640869140625, "eval_accuracy": 0.037383365949119375, "eval_lookahead_loss": 7.653021510887146, "eval_lookahead_perplexity": 2107.002311555058, "eval_loss": 5.235273361206055, "eval_perplexity": 187.78043121391528, "eval_runtime": 511.3942, "eval_samples_per_second": 19.554, "eval_steps_per_second": 4.889, "step": 485000 }, { "epoch": 0.9260177612304688, "grad_norm": 87.36448669433594, "learning_rate": 3.6992073059082034e-06, "lookahead_loss": 7.623468467712402, "loss": 5.3506, "step": 485500 }, { "epoch": 0.926971435546875, "grad_norm": 149.6256103515625, "learning_rate": 3.6515235900878906e-06, "lookahead_loss": 7.652008644104004, "loss": 5.3508, "step": 486000 }, { "epoch": 0.9279251098632812, "grad_norm": 56.405609130859375, "learning_rate": 3.6038398742675783e-06, "lookahead_loss": 7.569256920814514, "loss": 5.3688, "step": 486500 }, { "epoch": 0.9288787841796875, "grad_norm": 85.67948150634766, "learning_rate": 3.556156158447266e-06, "lookahead_loss": 7.5971185417175295, "loss": 5.4105, "step": 487000 }, { "epoch": 0.9298324584960938, "grad_norm": 102.79003143310547, "learning_rate": 3.508472442626953e-06, "lookahead_loss": 7.665099676132202, "loss": 5.4162, "step": 487500 }, { "epoch": 0.9307861328125, "grad_norm": 193.53512573242188, "learning_rate": 3.460788726806641e-06, "lookahead_loss": 7.656330892562866, "loss": 5.4998, "step": 488000 }, { "epoch": 0.9317398071289062, "grad_norm": 193.1105499267578, "learning_rate": 3.413105010986328e-06, "lookahead_loss": 7.674955784797668, "loss": 5.5111, "step": 488500 }, { "epoch": 0.9326934814453125, "grad_norm": 866.8139038085938, "learning_rate": 3.3654212951660158e-06, "lookahead_loss": 7.635992547512054, "loss": 5.4306, "step": 489000 }, { "epoch": 0.9336471557617188, "grad_norm": 91.95511627197266, "learning_rate": 3.3177375793457034e-06, "lookahead_loss": 7.679105440139771, "loss": 5.4748, "step": 489500 }, { "epoch": 0.934600830078125, "grad_norm": 75.68182373046875, "learning_rate": 3.2700538635253907e-06, "lookahead_loss": 7.6908284912109375, "loss": 5.4504, "step": 490000 }, { "epoch": 0.934600830078125, "eval_accuracy": 0.03739060665362035, "eval_lookahead_loss": 7.655217681884766, "eval_lookahead_perplexity": 2111.634733857456, "eval_loss": 5.231479644775391, "eval_perplexity": 187.069395094153, "eval_runtime": 550.2884, "eval_samples_per_second": 18.172, "eval_steps_per_second": 4.543, "step": 490000 }, { "epoch": 0.9355545043945312, "grad_norm": 110.42078399658203, "learning_rate": 3.2223701477050784e-06, "lookahead_loss": 7.736663967132569, "loss": 5.4389, "step": 490500 }, { "epoch": 0.9365081787109375, "grad_norm": 67.31777954101562, "learning_rate": 3.1746864318847656e-06, "lookahead_loss": 7.695682306289672, "loss": 5.4758, "step": 491000 }, { "epoch": 0.9374618530273438, "grad_norm": 128.5832061767578, "learning_rate": 3.1270027160644533e-06, "lookahead_loss": 7.776985114097595, "loss": 5.4458, "step": 491500 }, { "epoch": 0.93841552734375, "grad_norm": 219.8087615966797, "learning_rate": 3.079319000244141e-06, "lookahead_loss": 7.691537849426269, "loss": 5.4321, "step": 492000 }, { "epoch": 0.9393692016601562, "grad_norm": 83.56503295898438, "learning_rate": 3.031635284423828e-06, "lookahead_loss": 7.8277788896560665, "loss": 5.5699, "step": 492500 }, { "epoch": 0.9403228759765625, "grad_norm": 79.42426300048828, "learning_rate": 2.983951568603516e-06, "lookahead_loss": 7.678209396362305, "loss": 5.4204, "step": 493000 }, { "epoch": 0.9412765502929688, "grad_norm": 68.04448699951172, "learning_rate": 2.936267852783203e-06, "lookahead_loss": 7.700016125679016, "loss": 5.4305, "step": 493500 }, { "epoch": 0.942230224609375, "grad_norm": 211.01824951171875, "learning_rate": 2.8885841369628908e-06, "lookahead_loss": 7.698555393218994, "loss": 5.4324, "step": 494000 }, { "epoch": 0.9431838989257812, "grad_norm": 99.62227630615234, "learning_rate": 2.8409004211425784e-06, "lookahead_loss": 7.688579089164734, "loss": 5.4269, "step": 494500 }, { "epoch": 0.9441375732421875, "grad_norm": 77.29727935791016, "learning_rate": 2.7932167053222657e-06, "lookahead_loss": 7.740472438812255, "loss": 5.4217, "step": 495000 }, { "epoch": 0.9441375732421875, "eval_accuracy": 0.03741232876712329, "eval_lookahead_loss": 7.6548206918716435, "eval_lookahead_perplexity": 2110.7966023326826, "eval_loss": 5.230607032775879, "eval_perplexity": 186.90622729669516, "eval_runtime": 513.4699, "eval_samples_per_second": 19.475, "eval_steps_per_second": 4.869, "step": 495000 }, { "epoch": 0.9450912475585938, "grad_norm": 87.93878173828125, "learning_rate": 2.7455329895019534e-06, "lookahead_loss": 7.651820372581482, "loss": 5.4204, "step": 495500 }, { "epoch": 0.946044921875, "grad_norm": 155.9666748046875, "learning_rate": 2.6978492736816406e-06, "lookahead_loss": 7.609496916770935, "loss": 5.3861, "step": 496000 }, { "epoch": 0.9469985961914062, "grad_norm": 89.44951629638672, "learning_rate": 2.6501655578613283e-06, "lookahead_loss": 7.643394894599915, "loss": 5.3401, "step": 496500 }, { "epoch": 0.9479522705078125, "grad_norm": 110.69894409179688, "learning_rate": 2.602481842041016e-06, "lookahead_loss": 7.7107248210906985, "loss": 5.4093, "step": 497000 }, { "epoch": 0.9489059448242188, "grad_norm": 82.24137115478516, "learning_rate": 2.554798126220703e-06, "lookahead_loss": 7.687695365905761, "loss": 5.3484, "step": 497500 }, { "epoch": 0.949859619140625, "grad_norm": 59.092132568359375, "learning_rate": 2.507114410400391e-06, "lookahead_loss": 7.777789443969726, "loss": 5.4028, "step": 498000 }, { "epoch": 0.9508132934570312, "grad_norm": 181.999267578125, "learning_rate": 2.459430694580078e-06, "lookahead_loss": 7.781276138305664, "loss": 5.4447, "step": 498500 }, { "epoch": 0.9517669677734375, "grad_norm": 103.4625244140625, "learning_rate": 2.4117469787597658e-06, "lookahead_loss": 7.749319807052612, "loss": 5.4151, "step": 499000 }, { "epoch": 0.9527206420898438, "grad_norm": 197.38221740722656, "learning_rate": 2.3640632629394534e-06, "lookahead_loss": 7.57794569015503, "loss": 5.4254, "step": 499500 }, { "epoch": 0.95367431640625, "grad_norm": 248.95787048339844, "learning_rate": 2.3163795471191407e-06, "lookahead_loss": 7.732926556587219, "loss": 5.4109, "step": 500000 }, { "epoch": 0.95367431640625, "eval_accuracy": 0.037162230919765164, "eval_lookahead_loss": 7.649688281440735, "eval_lookahead_perplexity": 2099.990881251161, "eval_loss": 5.225395202636719, "eval_perplexity": 185.93463786689676, "eval_runtime": 549.0712, "eval_samples_per_second": 18.213, "eval_steps_per_second": 4.553, "step": 500000 }, { "epoch": 1.0009536743164062, "grad_norm": 83.26018524169922, "learning_rate": 2.2686958312988284e-06, "lookahead_loss": 7.7242847862243655, "loss": 5.3939, "step": 500500 }, { "epoch": 1.0019073486328125, "grad_norm": 96.01728057861328, "learning_rate": 2.2210121154785156e-06, "lookahead_loss": 7.755638115882873, "loss": 5.3898, "step": 501000 }, { "epoch": 1.0028610229492188, "grad_norm": 94.703369140625, "learning_rate": 2.1733283996582033e-06, "lookahead_loss": 7.72125712108612, "loss": 5.3884, "step": 501500 }, { "epoch": 1.003814697265625, "grad_norm": 139.4209442138672, "learning_rate": 2.125644683837891e-06, "lookahead_loss": 7.730610724449158, "loss": 5.4332, "step": 502000 }, { "epoch": 1.0047683715820312, "grad_norm": 224.29269409179688, "learning_rate": 2.077960968017578e-06, "lookahead_loss": 7.716648176193237, "loss": 5.3949, "step": 502500 }, { "epoch": 1.0057220458984375, "grad_norm": 173.1646728515625, "learning_rate": 2.030277252197266e-06, "lookahead_loss": 7.768192593574524, "loss": 5.4247, "step": 503000 }, { "epoch": 1.0066757202148438, "grad_norm": 114.15776062011719, "learning_rate": 1.982593536376953e-06, "lookahead_loss": 7.601891781806946, "loss": 5.3944, "step": 503500 }, { "epoch": 1.00762939453125, "grad_norm": 224.26058959960938, "learning_rate": 1.9349098205566408e-06, "lookahead_loss": 7.772348122596741, "loss": 5.3954, "step": 504000 }, { "epoch": 1.0085830688476562, "grad_norm": 98.72941589355469, "learning_rate": 1.8872261047363282e-06, "lookahead_loss": 7.60324462890625, "loss": 5.3467, "step": 504500 }, { "epoch": 1.0095367431640625, "grad_norm": 87.36551666259766, "learning_rate": 1.8395423889160157e-06, "lookahead_loss": 7.6556842937469485, "loss": 5.3892, "step": 505000 }, { "epoch": 1.0095367431640625, "eval_accuracy": 0.03737142857142857, "eval_lookahead_loss": 7.648708747863769, "eval_lookahead_perplexity": 2097.934876798744, "eval_loss": 5.223393440246582, "eval_perplexity": 185.56281317828987, "eval_runtime": 536.6077, "eval_samples_per_second": 18.636, "eval_steps_per_second": 4.659, "step": 505000 }, { "epoch": 1.0104904174804688, "grad_norm": 130.89544677734375, "learning_rate": 1.7918586730957031e-06, "lookahead_loss": 7.678762232780456, "loss": 5.3856, "step": 505500 }, { "epoch": 1.011444091796875, "grad_norm": 166.3372344970703, "learning_rate": 1.7441749572753908e-06, "lookahead_loss": 7.673016448974609, "loss": 5.4188, "step": 506000 }, { "epoch": 1.0123977661132812, "grad_norm": 72.69336700439453, "learning_rate": 1.6964912414550783e-06, "lookahead_loss": 7.66916607952118, "loss": 5.373, "step": 506500 }, { "epoch": 1.0133514404296875, "grad_norm": 98.03321075439453, "learning_rate": 1.6488075256347657e-06, "lookahead_loss": 7.696274509429932, "loss": 5.3929, "step": 507000 }, { "epoch": 1.0143051147460938, "grad_norm": 105.33464050292969, "learning_rate": 1.6011238098144532e-06, "lookahead_loss": 7.691936190605164, "loss": 5.3989, "step": 507500 }, { "epoch": 1.0152587890625, "grad_norm": 134.5007781982422, "learning_rate": 1.5534400939941406e-06, "lookahead_loss": 7.66206702709198, "loss": 5.4026, "step": 508000 }, { "epoch": 1.0162124633789062, "grad_norm": 83.63746643066406, "learning_rate": 1.505756378173828e-06, "lookahead_loss": 7.621736203193665, "loss": 5.3589, "step": 508500 }, { "epoch": 1.0171661376953125, "grad_norm": 85.1529541015625, "learning_rate": 1.4580726623535158e-06, "lookahead_loss": 7.683165393829346, "loss": 5.3536, "step": 509000 }, { "epoch": 1.0181198120117188, "grad_norm": 87.83940887451172, "learning_rate": 1.4103889465332032e-06, "lookahead_loss": 7.683896534442901, "loss": 5.3669, "step": 509500 }, { "epoch": 1.019073486328125, "grad_norm": 73.76750946044922, "learning_rate": 1.3627052307128907e-06, "lookahead_loss": 7.550733237266541, "loss": 5.3806, "step": 510000 }, { "epoch": 1.019073486328125, "eval_accuracy": 0.037382583170254405, "eval_lookahead_loss": 7.6463030187606815, "eval_lookahead_perplexity": 2092.8938798769923, "eval_loss": 5.220276355743408, "eval_perplexity": 184.98529875719717, "eval_runtime": 509.8325, "eval_samples_per_second": 19.614, "eval_steps_per_second": 4.904, "step": 510000 }, { "epoch": 1.0200271606445312, "grad_norm": 62.9285888671875, "learning_rate": 1.3150215148925781e-06, "lookahead_loss": 7.63447567653656, "loss": 5.3448, "step": 510500 }, { "epoch": 1.0209808349609375, "grad_norm": 113.06671905517578, "learning_rate": 1.2673377990722656e-06, "lookahead_loss": 7.720664642333984, "loss": 5.3774, "step": 511000 }, { "epoch": 1.0219345092773438, "grad_norm": 100.25318908691406, "learning_rate": 1.2196540832519533e-06, "lookahead_loss": 7.596182235717773, "loss": 5.4132, "step": 511500 }, { "epoch": 1.02288818359375, "grad_norm": 96.62307739257812, "learning_rate": 1.1719703674316407e-06, "lookahead_loss": 7.725762758255005, "loss": 5.4647, "step": 512000 }, { "epoch": 1.0238418579101562, "grad_norm": 123.0115737915039, "learning_rate": 1.1242866516113282e-06, "lookahead_loss": 7.599422789573669, "loss": 5.4204, "step": 512500 }, { "epoch": 1.0247955322265625, "grad_norm": 118.0004653930664, "learning_rate": 1.0766029357910156e-06, "lookahead_loss": 7.7002778873443605, "loss": 5.4478, "step": 513000 }, { "epoch": 1.0257492065429688, "grad_norm": 162.49331665039062, "learning_rate": 1.028919219970703e-06, "lookahead_loss": 7.676640281200409, "loss": 5.385, "step": 513500 }, { "epoch": 1.026702880859375, "grad_norm": 137.939208984375, "learning_rate": 9.812355041503908e-07, "lookahead_loss": 7.6651019268035885, "loss": 5.3785, "step": 514000 }, { "epoch": 1.0276565551757812, "grad_norm": 99.53665924072266, "learning_rate": 9.335517883300781e-07, "lookahead_loss": 7.6673005018234255, "loss": 5.4146, "step": 514500 }, { "epoch": 1.0286102294921875, "grad_norm": 95.87904357910156, "learning_rate": 8.858680725097657e-07, "lookahead_loss": 7.671229022502899, "loss": 5.4174, "step": 515000 }, { "epoch": 1.0286102294921875, "eval_accuracy": 0.03746360078277886, "eval_lookahead_loss": 7.644979669189453, "eval_lookahead_perplexity": 2090.126081444792, "eval_loss": 5.219385147094727, "eval_perplexity": 184.82051169979803, "eval_runtime": 496.1125, "eval_samples_per_second": 20.157, "eval_steps_per_second": 5.039, "step": 515000 }, { "epoch": 1.0295639038085938, "grad_norm": 104.27928924560547, "learning_rate": 8.381843566894531e-07, "lookahead_loss": 7.693174307823181, "loss": 5.4416, "step": 515500 }, { "epoch": 1.030517578125, "grad_norm": 97.7083740234375, "learning_rate": 7.905006408691407e-07, "lookahead_loss": 7.677439743041992, "loss": 5.3868, "step": 516000 }, { "epoch": 1.0314712524414062, "grad_norm": 186.6566925048828, "learning_rate": 7.428169250488282e-07, "lookahead_loss": 7.6245401668548585, "loss": 5.3988, "step": 516500 }, { "epoch": 1.0324249267578125, "grad_norm": 489.04669189453125, "learning_rate": 6.951332092285156e-07, "lookahead_loss": 7.684243922233581, "loss": 5.3685, "step": 517000 }, { "epoch": 1.0333786010742188, "grad_norm": 174.38314819335938, "learning_rate": 6.474494934082032e-07, "lookahead_loss": 7.639172402381897, "loss": 5.3991, "step": 517500 }, { "epoch": 1.034332275390625, "grad_norm": 87.31172180175781, "learning_rate": 5.997657775878906e-07, "lookahead_loss": 7.723142143249512, "loss": 5.4526, "step": 518000 }, { "epoch": 1.0352859497070312, "grad_norm": 79.88231658935547, "learning_rate": 5.520820617675782e-07, "lookahead_loss": 7.656815986633301, "loss": 5.3946, "step": 518500 }, { "epoch": 1.0362396240234375, "grad_norm": 74.0523452758789, "learning_rate": 5.043983459472657e-07, "lookahead_loss": 7.695533846855164, "loss": 5.3992, "step": 519000 }, { "epoch": 1.0371932983398438, "grad_norm": 82.26604461669922, "learning_rate": 4.5671463012695317e-07, "lookahead_loss": 7.637405508041382, "loss": 5.3889, "step": 519500 }, { "epoch": 1.03814697265625, "grad_norm": 152.38641357421875, "learning_rate": 4.0903091430664063e-07, "lookahead_loss": 7.678390237808228, "loss": 5.4017, "step": 520000 }, { "epoch": 1.03814697265625, "eval_accuracy": 0.037367906066536206, "eval_lookahead_loss": 7.644941953659058, "eval_lookahead_perplexity": 2090.0472527175793, "eval_loss": 5.2184648513793945, "eval_perplexity": 184.65050041710194, "eval_runtime": 496.9515, "eval_samples_per_second": 20.123, "eval_steps_per_second": 5.031, "step": 520000 }, { "epoch": 1.0391006469726562, "grad_norm": 123.77998352050781, "learning_rate": 3.6134719848632814e-07, "lookahead_loss": 7.573062379837036, "loss": 5.4254, "step": 520500 }, { "epoch": 1.0400543212890625, "grad_norm": 101.2959213256836, "learning_rate": 3.1366348266601565e-07, "lookahead_loss": 7.710000981330872, "loss": 5.4297, "step": 521000 }, { "epoch": 1.0410079956054688, "grad_norm": 56.064205169677734, "learning_rate": 2.6597976684570316e-07, "lookahead_loss": 7.570002226829529, "loss": 5.4129, "step": 521500 }, { "epoch": 1.041961669921875, "grad_norm": 75.95256805419922, "learning_rate": 2.1829605102539064e-07, "lookahead_loss": 7.604613988399506, "loss": 5.4073, "step": 522000 }, { "epoch": 1.0429153442382812, "grad_norm": 100.23770904541016, "learning_rate": 1.7061233520507813e-07, "lookahead_loss": 7.724450712203979, "loss": 5.3997, "step": 522500 }, { "epoch": 1.0438690185546875, "grad_norm": 133.91412353515625, "learning_rate": 1.2292861938476564e-07, "lookahead_loss": 7.727368996620179, "loss": 5.3845, "step": 523000 }, { "epoch": 1.0448226928710938, "grad_norm": 88.35601043701172, "learning_rate": 7.524490356445312e-08, "lookahead_loss": 7.666746793746948, "loss": 5.3885, "step": 523500 }, { "epoch": 1.0457763671875, "grad_norm": 335.9785461425781, "learning_rate": 2.7561187744140627e-08, "lookahead_loss": 7.599938707351685, "loss": 5.435, "step": 524000 }, { "epoch": 1.04632568359375, "step": 524288, "total_flos": 4.833448717656785e+18, "train_loss": 2.1971652193460613, "train_runtime": 166030.7788, "train_samples_per_second": 12.631, "train_steps_per_second": 3.158 } ], "logging_steps": 500, "max_steps": 524288, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.833448717656785e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }