diff --git "a/checkpoint-76160/trainer_state.json" "b/checkpoint-76160/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-76160/trainer_state.json" @@ -0,0 +1,10695 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 76160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006565126050420168, + "grad_norm": 14.6875, + "learning_rate": 1.9600000000000002e-05, + "loss": 7.9703, + "step": 50 + }, + { + "epoch": 0.013130252100840336, + "grad_norm": 4.0, + "learning_rate": 3.960000000000001e-05, + "loss": 5.4946, + "step": 100 + }, + { + "epoch": 0.019695378151260504, + "grad_norm": 4.71875, + "learning_rate": 5.96e-05, + "loss": 4.3864, + "step": 150 + }, + { + "epoch": 0.026260504201680673, + "grad_norm": 3.59375, + "learning_rate": 7.960000000000001e-05, + "loss": 4.1838, + "step": 200 + }, + { + "epoch": 0.03282563025210084, + "grad_norm": 3.515625, + "learning_rate": 9.960000000000001e-05, + "loss": 4.0394, + "step": 250 + }, + { + "epoch": 0.03939075630252101, + "grad_norm": 3.9375, + "learning_rate": 0.00011960000000000001, + "loss": 3.9963, + "step": 300 + }, + { + "epoch": 0.04595588235294118, + "grad_norm": 3.1875, + "learning_rate": 0.0001396, + "loss": 3.9232, + "step": 350 + }, + { + "epoch": 0.052521008403361345, + "grad_norm": 2.359375, + "learning_rate": 0.0001596, + "loss": 3.8485, + "step": 400 + }, + { + "epoch": 0.05908613445378151, + "grad_norm": 2.265625, + "learning_rate": 0.0001796, + "loss": 3.8405, + "step": 450 + }, + { + "epoch": 0.06565126050420168, + "grad_norm": 1.9375, + "learning_rate": 0.0001996, + "loss": 3.8045, + "step": 500 + }, + { + "epoch": 0.07221638655462184, + "grad_norm": 1.8671875, + "learning_rate": 0.00019987047316944226, + "loss": 3.7991, + "step": 550 + }, + { + "epoch": 0.07878151260504201, + "grad_norm": 1.7890625, + "learning_rate": 0.00019973830293417924, + "loss": 3.7579, + "step": 600 + }, + { + "epoch": 0.08534663865546219, + "grad_norm": 1.734375, + "learning_rate": 0.0001996061326989162, + "loss": 3.7281, + "step": 650 + }, + { + "epoch": 0.09191176470588236, + "grad_norm": 1.3828125, + "learning_rate": 0.0001994739624636532, + "loss": 3.7043, + "step": 700 + }, + { + "epoch": 0.09847689075630252, + "grad_norm": 1.2421875, + "learning_rate": 0.0001993417922283902, + "loss": 3.7007, + "step": 750 + }, + { + "epoch": 0.10504201680672269, + "grad_norm": 1.78125, + "learning_rate": 0.00019920962199312717, + "loss": 3.6837, + "step": 800 + }, + { + "epoch": 0.11160714285714286, + "grad_norm": 1.5546875, + "learning_rate": 0.00019907745175786414, + "loss": 3.6388, + "step": 850 + }, + { + "epoch": 0.11817226890756302, + "grad_norm": 1.5859375, + "learning_rate": 0.00019894528152260112, + "loss": 3.6421, + "step": 900 + }, + { + "epoch": 0.1247373949579832, + "grad_norm": 1.6875, + "learning_rate": 0.00019881311128733812, + "loss": 3.6235, + "step": 950 + }, + { + "epoch": 0.13130252100840337, + "grad_norm": 1.5703125, + "learning_rate": 0.0001986809410520751, + "loss": 3.6121, + "step": 1000 + }, + { + "epoch": 0.13786764705882354, + "grad_norm": 2.984375, + "learning_rate": 0.00019854877081681207, + "loss": 3.5556, + "step": 1050 + }, + { + "epoch": 0.14443277310924368, + "grad_norm": 1.84375, + "learning_rate": 0.00019841660058154904, + "loss": 3.5299, + "step": 1100 + }, + { + "epoch": 0.15099789915966386, + "grad_norm": 1.890625, + "learning_rate": 0.00019828443034628602, + "loss": 3.4619, + "step": 1150 + }, + { + "epoch": 0.15756302521008403, + "grad_norm": 1.90625, + "learning_rate": 0.00019815226011102302, + "loss": 3.4076, + "step": 1200 + }, + { + "epoch": 0.1641281512605042, + "grad_norm": 2.015625, + "learning_rate": 0.00019802008987576, + "loss": 3.3649, + "step": 1250 + }, + { + "epoch": 0.17069327731092437, + "grad_norm": 1.8515625, + "learning_rate": 0.00019788791964049697, + "loss": 3.3748, + "step": 1300 + }, + { + "epoch": 0.17725840336134455, + "grad_norm": 2.5625, + "learning_rate": 0.00019775574940523395, + "loss": 3.3507, + "step": 1350 + }, + { + "epoch": 0.18382352941176472, + "grad_norm": 1.9140625, + "learning_rate": 0.00019762357916997095, + "loss": 3.3366, + "step": 1400 + }, + { + "epoch": 0.19038865546218486, + "grad_norm": 1.671875, + "learning_rate": 0.00019749140893470792, + "loss": 3.3007, + "step": 1450 + }, + { + "epoch": 0.19695378151260504, + "grad_norm": 1.890625, + "learning_rate": 0.0001973592386994449, + "loss": 3.2944, + "step": 1500 + }, + { + "epoch": 0.2035189075630252, + "grad_norm": 1.578125, + "learning_rate": 0.00019722706846418187, + "loss": 3.2694, + "step": 1550 + }, + { + "epoch": 0.21008403361344538, + "grad_norm": 1.6484375, + "learning_rate": 0.00019709489822891888, + "loss": 3.3025, + "step": 1600 + }, + { + "epoch": 0.21664915966386555, + "grad_norm": 1.359375, + "learning_rate": 0.00019696272799365585, + "loss": 3.278, + "step": 1650 + }, + { + "epoch": 0.22321428571428573, + "grad_norm": 1.6328125, + "learning_rate": 0.0001968305577583928, + "loss": 3.2147, + "step": 1700 + }, + { + "epoch": 0.22977941176470587, + "grad_norm": 1.796875, + "learning_rate": 0.0001966983875231298, + "loss": 3.2072, + "step": 1750 + }, + { + "epoch": 0.23634453781512604, + "grad_norm": 2.75, + "learning_rate": 0.00019656621728786678, + "loss": 3.2304, + "step": 1800 + }, + { + "epoch": 0.24290966386554622, + "grad_norm": 1.5703125, + "learning_rate": 0.00019643404705260375, + "loss": 3.2481, + "step": 1850 + }, + { + "epoch": 0.2494747899159664, + "grad_norm": 1.4140625, + "learning_rate": 0.00019630187681734073, + "loss": 3.194, + "step": 1900 + }, + { + "epoch": 0.25603991596638653, + "grad_norm": 1.3046875, + "learning_rate": 0.0001961697065820777, + "loss": 3.2233, + "step": 1950 + }, + { + "epoch": 0.26260504201680673, + "grad_norm": 2.1875, + "learning_rate": 0.0001960375363468147, + "loss": 3.2257, + "step": 2000 + }, + { + "epoch": 0.2691701680672269, + "grad_norm": 1.6796875, + "learning_rate": 0.00019590536611155168, + "loss": 3.1964, + "step": 2050 + }, + { + "epoch": 0.2757352941176471, + "grad_norm": 1.5703125, + "learning_rate": 0.00019577319587628866, + "loss": 3.1723, + "step": 2100 + }, + { + "epoch": 0.2823004201680672, + "grad_norm": 1.6640625, + "learning_rate": 0.00019564102564102563, + "loss": 3.1696, + "step": 2150 + }, + { + "epoch": 0.28886554621848737, + "grad_norm": 1.5625, + "learning_rate": 0.00019550885540576263, + "loss": 3.1652, + "step": 2200 + }, + { + "epoch": 0.29543067226890757, + "grad_norm": 1.7734375, + "learning_rate": 0.0001953766851704996, + "loss": 3.1808, + "step": 2250 + }, + { + "epoch": 0.3019957983193277, + "grad_norm": 1.3359375, + "learning_rate": 0.00019524451493523658, + "loss": 3.1573, + "step": 2300 + }, + { + "epoch": 0.3085609243697479, + "grad_norm": 0.92578125, + "learning_rate": 0.00019511234469997356, + "loss": 3.1409, + "step": 2350 + }, + { + "epoch": 0.31512605042016806, + "grad_norm": 1.4375, + "learning_rate": 0.00019498017446471056, + "loss": 3.1447, + "step": 2400 + }, + { + "epoch": 0.32169117647058826, + "grad_norm": 1.421875, + "learning_rate": 0.00019484800422944754, + "loss": 3.1546, + "step": 2450 + }, + { + "epoch": 0.3282563025210084, + "grad_norm": 1.4609375, + "learning_rate": 0.0001947158339941845, + "loss": 3.1002, + "step": 2500 + }, + { + "epoch": 0.33482142857142855, + "grad_norm": 1.15625, + "learning_rate": 0.0001945836637589215, + "loss": 3.1101, + "step": 2550 + }, + { + "epoch": 0.34138655462184875, + "grad_norm": 1.625, + "learning_rate": 0.0001944514935236585, + "loss": 3.1415, + "step": 2600 + }, + { + "epoch": 0.3479516806722689, + "grad_norm": 1.21875, + "learning_rate": 0.00019431932328839546, + "loss": 3.1367, + "step": 2650 + }, + { + "epoch": 0.3545168067226891, + "grad_norm": 1.265625, + "learning_rate": 0.00019418715305313244, + "loss": 3.1297, + "step": 2700 + }, + { + "epoch": 0.36108193277310924, + "grad_norm": 1.375, + "learning_rate": 0.00019405498281786942, + "loss": 3.1024, + "step": 2750 + }, + { + "epoch": 0.36764705882352944, + "grad_norm": 1.4609375, + "learning_rate": 0.00019392281258260642, + "loss": 3.0998, + "step": 2800 + }, + { + "epoch": 0.3742121848739496, + "grad_norm": 1.578125, + "learning_rate": 0.0001937906423473434, + "loss": 3.1066, + "step": 2850 + }, + { + "epoch": 0.3807773109243697, + "grad_norm": 1.3359375, + "learning_rate": 0.00019365847211208037, + "loss": 3.1117, + "step": 2900 + }, + { + "epoch": 0.3873424369747899, + "grad_norm": 1.96875, + "learning_rate": 0.00019352630187681734, + "loss": 3.0898, + "step": 2950 + }, + { + "epoch": 0.3939075630252101, + "grad_norm": 1.2578125, + "learning_rate": 0.00019339413164155435, + "loss": 3.1092, + "step": 3000 + }, + { + "epoch": 0.4004726890756303, + "grad_norm": 1.09375, + "learning_rate": 0.00019326196140629132, + "loss": 3.0731, + "step": 3050 + }, + { + "epoch": 0.4070378151260504, + "grad_norm": 1.28125, + "learning_rate": 0.0001931297911710283, + "loss": 3.0875, + "step": 3100 + }, + { + "epoch": 0.41360294117647056, + "grad_norm": 1.796875, + "learning_rate": 0.00019299762093576527, + "loss": 3.0708, + "step": 3150 + }, + { + "epoch": 0.42016806722689076, + "grad_norm": 1.1328125, + "learning_rate": 0.00019286545070050225, + "loss": 3.0658, + "step": 3200 + }, + { + "epoch": 0.4267331932773109, + "grad_norm": 1.046875, + "learning_rate": 0.00019273328046523925, + "loss": 3.0819, + "step": 3250 + }, + { + "epoch": 0.4332983193277311, + "grad_norm": 1.1015625, + "learning_rate": 0.00019260111022997622, + "loss": 3.0502, + "step": 3300 + }, + { + "epoch": 0.43986344537815125, + "grad_norm": 1.265625, + "learning_rate": 0.0001924689399947132, + "loss": 3.0808, + "step": 3350 + }, + { + "epoch": 0.44642857142857145, + "grad_norm": 1.0390625, + "learning_rate": 0.00019233676975945017, + "loss": 3.0657, + "step": 3400 + }, + { + "epoch": 0.4529936974789916, + "grad_norm": 1.3828125, + "learning_rate": 0.00019220459952418718, + "loss": 3.0879, + "step": 3450 + }, + { + "epoch": 0.45955882352941174, + "grad_norm": 1.4453125, + "learning_rate": 0.00019207242928892415, + "loss": 3.0342, + "step": 3500 + }, + { + "epoch": 0.46612394957983194, + "grad_norm": 1.703125, + "learning_rate": 0.00019194025905366113, + "loss": 3.0596, + "step": 3550 + }, + { + "epoch": 0.4726890756302521, + "grad_norm": 1.7734375, + "learning_rate": 0.0001918080888183981, + "loss": 3.0289, + "step": 3600 + }, + { + "epoch": 0.4792542016806723, + "grad_norm": 1.25, + "learning_rate": 0.0001916759185831351, + "loss": 3.025, + "step": 3650 + }, + { + "epoch": 0.48581932773109243, + "grad_norm": 1.4375, + "learning_rate": 0.00019154374834787208, + "loss": 3.0299, + "step": 3700 + }, + { + "epoch": 0.49238445378151263, + "grad_norm": 1.4921875, + "learning_rate": 0.00019141157811260905, + "loss": 3.0436, + "step": 3750 + }, + { + "epoch": 0.4989495798319328, + "grad_norm": 1.3203125, + "learning_rate": 0.00019127940787734603, + "loss": 3.0206, + "step": 3800 + }, + { + "epoch": 0.5055147058823529, + "grad_norm": 1.125, + "learning_rate": 0.00019114723764208303, + "loss": 3.0454, + "step": 3850 + }, + { + "epoch": 0.5120798319327731, + "grad_norm": 1.078125, + "learning_rate": 0.00019101506740682, + "loss": 3.0163, + "step": 3900 + }, + { + "epoch": 0.5186449579831933, + "grad_norm": 1.8125, + "learning_rate": 0.00019088289717155698, + "loss": 3.0114, + "step": 3950 + }, + { + "epoch": 0.5252100840336135, + "grad_norm": 1.359375, + "learning_rate": 0.00019075072693629396, + "loss": 3.0274, + "step": 4000 + }, + { + "epoch": 0.5317752100840336, + "grad_norm": 1.1484375, + "learning_rate": 0.00019061855670103093, + "loss": 3.0153, + "step": 4050 + }, + { + "epoch": 0.5383403361344538, + "grad_norm": 1.1328125, + "learning_rate": 0.0001904863864657679, + "loss": 3.0309, + "step": 4100 + }, + { + "epoch": 0.5449054621848739, + "grad_norm": 1.2421875, + "learning_rate": 0.00019035421623050488, + "loss": 3.0383, + "step": 4150 + }, + { + "epoch": 0.5514705882352942, + "grad_norm": 1.2109375, + "learning_rate": 0.00019022204599524186, + "loss": 2.9907, + "step": 4200 + }, + { + "epoch": 0.5580357142857143, + "grad_norm": 1.5625, + "learning_rate": 0.00019008987575997886, + "loss": 3.0058, + "step": 4250 + }, + { + "epoch": 0.5646008403361344, + "grad_norm": 1.28125, + "learning_rate": 0.00018995770552471584, + "loss": 3.0301, + "step": 4300 + }, + { + "epoch": 0.5711659663865546, + "grad_norm": 1.0625, + "learning_rate": 0.0001898255352894528, + "loss": 3.0068, + "step": 4350 + }, + { + "epoch": 0.5777310924369747, + "grad_norm": 1.703125, + "learning_rate": 0.00018969336505418979, + "loss": 2.9968, + "step": 4400 + }, + { + "epoch": 0.584296218487395, + "grad_norm": 1.140625, + "learning_rate": 0.0001895611948189268, + "loss": 3.0087, + "step": 4450 + }, + { + "epoch": 0.5908613445378151, + "grad_norm": 1.46875, + "learning_rate": 0.00018942902458366376, + "loss": 3.016, + "step": 4500 + }, + { + "epoch": 0.5974264705882353, + "grad_norm": 1.015625, + "learning_rate": 0.00018929685434840074, + "loss": 2.9857, + "step": 4550 + }, + { + "epoch": 0.6039915966386554, + "grad_norm": 1.2109375, + "learning_rate": 0.00018916468411313771, + "loss": 2.9912, + "step": 4600 + }, + { + "epoch": 0.6105567226890757, + "grad_norm": 1.1171875, + "learning_rate": 0.00018903251387787472, + "loss": 2.9913, + "step": 4650 + }, + { + "epoch": 0.6171218487394958, + "grad_norm": 1.3046875, + "learning_rate": 0.0001889003436426117, + "loss": 2.9865, + "step": 4700 + }, + { + "epoch": 0.623686974789916, + "grad_norm": 0.9375, + "learning_rate": 0.00018876817340734867, + "loss": 2.9865, + "step": 4750 + }, + { + "epoch": 0.6302521008403361, + "grad_norm": 1.2421875, + "learning_rate": 0.00018863600317208564, + "loss": 2.9858, + "step": 4800 + }, + { + "epoch": 0.6368172268907563, + "grad_norm": 1.2578125, + "learning_rate": 0.00018850383293682264, + "loss": 2.97, + "step": 4850 + }, + { + "epoch": 0.6433823529411765, + "grad_norm": 1.171875, + "learning_rate": 0.00018837166270155962, + "loss": 2.9701, + "step": 4900 + }, + { + "epoch": 0.6499474789915967, + "grad_norm": 1.6484375, + "learning_rate": 0.0001882394924662966, + "loss": 2.9929, + "step": 4950 + }, + { + "epoch": 0.6565126050420168, + "grad_norm": 1.1640625, + "learning_rate": 0.00018810732223103357, + "loss": 2.9927, + "step": 5000 + }, + { + "epoch": 0.663077731092437, + "grad_norm": 2.140625, + "learning_rate": 0.00018797515199577057, + "loss": 2.9872, + "step": 5050 + }, + { + "epoch": 0.6696428571428571, + "grad_norm": 1.1875, + "learning_rate": 0.00018784298176050755, + "loss": 2.9825, + "step": 5100 + }, + { + "epoch": 0.6762079831932774, + "grad_norm": 1.109375, + "learning_rate": 0.00018771081152524452, + "loss": 2.9643, + "step": 5150 + }, + { + "epoch": 0.6827731092436975, + "grad_norm": 0.98046875, + "learning_rate": 0.0001875786412899815, + "loss": 2.9671, + "step": 5200 + }, + { + "epoch": 0.6893382352941176, + "grad_norm": 1.53125, + "learning_rate": 0.00018744647105471847, + "loss": 2.9624, + "step": 5250 + }, + { + "epoch": 0.6959033613445378, + "grad_norm": 1.3125, + "learning_rate": 0.00018731430081945548, + "loss": 2.9481, + "step": 5300 + }, + { + "epoch": 0.7024684873949579, + "grad_norm": 1.3828125, + "learning_rate": 0.00018718213058419245, + "loss": 2.9712, + "step": 5350 + }, + { + "epoch": 0.7090336134453782, + "grad_norm": 1.21875, + "learning_rate": 0.00018704996034892943, + "loss": 2.9757, + "step": 5400 + }, + { + "epoch": 0.7155987394957983, + "grad_norm": 1.1328125, + "learning_rate": 0.0001869177901136664, + "loss": 2.9722, + "step": 5450 + }, + { + "epoch": 0.7221638655462185, + "grad_norm": 1.296875, + "learning_rate": 0.0001867856198784034, + "loss": 2.9506, + "step": 5500 + }, + { + "epoch": 0.7287289915966386, + "grad_norm": 1.328125, + "learning_rate": 0.00018665344964314038, + "loss": 2.9628, + "step": 5550 + }, + { + "epoch": 0.7352941176470589, + "grad_norm": 1.5859375, + "learning_rate": 0.00018652127940787735, + "loss": 2.9618, + "step": 5600 + }, + { + "epoch": 0.741859243697479, + "grad_norm": 1.1015625, + "learning_rate": 0.00018638910917261433, + "loss": 2.9802, + "step": 5650 + }, + { + "epoch": 0.7484243697478992, + "grad_norm": 1.171875, + "learning_rate": 0.00018625693893735133, + "loss": 2.9556, + "step": 5700 + }, + { + "epoch": 0.7549894957983193, + "grad_norm": 1.921875, + "learning_rate": 0.0001861247687020883, + "loss": 2.9388, + "step": 5750 + }, + { + "epoch": 0.7615546218487395, + "grad_norm": 1.125, + "learning_rate": 0.00018599259846682528, + "loss": 2.9328, + "step": 5800 + }, + { + "epoch": 0.7681197478991597, + "grad_norm": 1.1484375, + "learning_rate": 0.00018586042823156226, + "loss": 2.937, + "step": 5850 + }, + { + "epoch": 0.7746848739495799, + "grad_norm": 1.5859375, + "learning_rate": 0.00018572825799629926, + "loss": 2.9736, + "step": 5900 + }, + { + "epoch": 0.78125, + "grad_norm": 1.109375, + "learning_rate": 0.00018559608776103623, + "loss": 2.9638, + "step": 5950 + }, + { + "epoch": 0.7878151260504201, + "grad_norm": 1.7109375, + "learning_rate": 0.0001854639175257732, + "loss": 2.9147, + "step": 6000 + }, + { + "epoch": 0.7943802521008403, + "grad_norm": 1.4296875, + "learning_rate": 0.00018533174729051018, + "loss": 2.9329, + "step": 6050 + }, + { + "epoch": 0.8009453781512605, + "grad_norm": 1.21875, + "learning_rate": 0.0001851995770552472, + "loss": 2.923, + "step": 6100 + }, + { + "epoch": 0.8075105042016807, + "grad_norm": 0.91015625, + "learning_rate": 0.00018506740681998416, + "loss": 2.9503, + "step": 6150 + }, + { + "epoch": 0.8140756302521008, + "grad_norm": 1.171875, + "learning_rate": 0.00018493523658472114, + "loss": 2.9705, + "step": 6200 + }, + { + "epoch": 0.820640756302521, + "grad_norm": 0.96484375, + "learning_rate": 0.0001848030663494581, + "loss": 2.9244, + "step": 6250 + }, + { + "epoch": 0.8272058823529411, + "grad_norm": 1.3046875, + "learning_rate": 0.00018467089611419511, + "loss": 2.9116, + "step": 6300 + }, + { + "epoch": 0.8337710084033614, + "grad_norm": 1.2265625, + "learning_rate": 0.0001845387258789321, + "loss": 2.9746, + "step": 6350 + }, + { + "epoch": 0.8403361344537815, + "grad_norm": 2.5625, + "learning_rate": 0.00018440655564366907, + "loss": 2.9773, + "step": 6400 + }, + { + "epoch": 0.8469012605042017, + "grad_norm": 1.1640625, + "learning_rate": 0.000184274385408406, + "loss": 2.9174, + "step": 6450 + }, + { + "epoch": 0.8534663865546218, + "grad_norm": 1.2578125, + "learning_rate": 0.00018414221517314302, + "loss": 2.9405, + "step": 6500 + }, + { + "epoch": 0.8600315126050421, + "grad_norm": 1.3828125, + "learning_rate": 0.00018401004493788, + "loss": 2.9203, + "step": 6550 + }, + { + "epoch": 0.8665966386554622, + "grad_norm": 1.296875, + "learning_rate": 0.00018387787470261697, + "loss": 2.9295, + "step": 6600 + }, + { + "epoch": 0.8731617647058824, + "grad_norm": 1.265625, + "learning_rate": 0.00018374570446735394, + "loss": 2.9213, + "step": 6650 + }, + { + "epoch": 0.8797268907563025, + "grad_norm": 1.03125, + "learning_rate": 0.00018361353423209094, + "loss": 2.9434, + "step": 6700 + }, + { + "epoch": 0.8862920168067226, + "grad_norm": 1.1171875, + "learning_rate": 0.00018348136399682792, + "loss": 2.9609, + "step": 6750 + }, + { + "epoch": 0.8928571428571429, + "grad_norm": 1.28125, + "learning_rate": 0.0001833491937615649, + "loss": 2.9187, + "step": 6800 + }, + { + "epoch": 0.899422268907563, + "grad_norm": 1.1796875, + "learning_rate": 0.00018321702352630187, + "loss": 2.9306, + "step": 6850 + }, + { + "epoch": 0.9059873949579832, + "grad_norm": 1.03125, + "learning_rate": 0.00018308485329103887, + "loss": 2.9435, + "step": 6900 + }, + { + "epoch": 0.9125525210084033, + "grad_norm": 1.203125, + "learning_rate": 0.00018295268305577585, + "loss": 2.9247, + "step": 6950 + }, + { + "epoch": 0.9191176470588235, + "grad_norm": 1.25, + "learning_rate": 0.00018282051282051282, + "loss": 2.9573, + "step": 7000 + }, + { + "epoch": 0.9256827731092437, + "grad_norm": 1.3125, + "learning_rate": 0.0001826883425852498, + "loss": 2.9421, + "step": 7050 + }, + { + "epoch": 0.9322478991596639, + "grad_norm": 1.1328125, + "learning_rate": 0.0001825561723499868, + "loss": 2.9193, + "step": 7100 + }, + { + "epoch": 0.938813025210084, + "grad_norm": 1.15625, + "learning_rate": 0.00018242400211472377, + "loss": 2.8935, + "step": 7150 + }, + { + "epoch": 0.9453781512605042, + "grad_norm": 1.203125, + "learning_rate": 0.00018229183187946075, + "loss": 2.9295, + "step": 7200 + }, + { + "epoch": 0.9519432773109243, + "grad_norm": 1.0703125, + "learning_rate": 0.00018215966164419772, + "loss": 2.9219, + "step": 7250 + }, + { + "epoch": 0.9585084033613446, + "grad_norm": 1.3828125, + "learning_rate": 0.00018202749140893473, + "loss": 2.8695, + "step": 7300 + }, + { + "epoch": 0.9650735294117647, + "grad_norm": 1.203125, + "learning_rate": 0.0001818953211736717, + "loss": 2.9135, + "step": 7350 + }, + { + "epoch": 0.9716386554621849, + "grad_norm": 1.28125, + "learning_rate": 0.00018176315093840868, + "loss": 2.9415, + "step": 7400 + }, + { + "epoch": 0.978203781512605, + "grad_norm": 1.1171875, + "learning_rate": 0.00018163098070314565, + "loss": 2.926, + "step": 7450 + }, + { + "epoch": 0.9847689075630253, + "grad_norm": 1.375, + "learning_rate": 0.00018149881046788263, + "loss": 2.8909, + "step": 7500 + }, + { + "epoch": 0.9913340336134454, + "grad_norm": 1.2421875, + "learning_rate": 0.00018136664023261963, + "loss": 2.9088, + "step": 7550 + }, + { + "epoch": 0.9978991596638656, + "grad_norm": 1.34375, + "learning_rate": 0.0001812344699973566, + "loss": 2.9116, + "step": 7600 + }, + { + "epoch": 1.0044642857142858, + "grad_norm": 1.4140625, + "learning_rate": 0.00018110229976209358, + "loss": 2.7498, + "step": 7650 + }, + { + "epoch": 1.0110294117647058, + "grad_norm": 1.0078125, + "learning_rate": 0.00018097012952683056, + "loss": 2.7164, + "step": 7700 + }, + { + "epoch": 1.017594537815126, + "grad_norm": 1.296875, + "learning_rate": 0.00018083795929156756, + "loss": 2.6681, + "step": 7750 + }, + { + "epoch": 1.0241596638655461, + "grad_norm": 1.53125, + "learning_rate": 0.00018070578905630453, + "loss": 2.7374, + "step": 7800 + }, + { + "epoch": 1.0307247899159664, + "grad_norm": 1.359375, + "learning_rate": 0.0001805736188210415, + "loss": 2.6908, + "step": 7850 + }, + { + "epoch": 1.0372899159663866, + "grad_norm": 1.21875, + "learning_rate": 0.00018044144858577848, + "loss": 2.7301, + "step": 7900 + }, + { + "epoch": 1.0438550420168067, + "grad_norm": 1.296875, + "learning_rate": 0.00018030927835051549, + "loss": 2.7097, + "step": 7950 + }, + { + "epoch": 1.050420168067227, + "grad_norm": 1.484375, + "learning_rate": 0.00018017710811525246, + "loss": 2.7109, + "step": 8000 + }, + { + "epoch": 1.056985294117647, + "grad_norm": 0.9765625, + "learning_rate": 0.00018004493787998944, + "loss": 2.7128, + "step": 8050 + }, + { + "epoch": 1.0635504201680672, + "grad_norm": 1.4453125, + "learning_rate": 0.0001799127676447264, + "loss": 2.6722, + "step": 8100 + }, + { + "epoch": 1.0701155462184875, + "grad_norm": 1.1640625, + "learning_rate": 0.00017978059740946341, + "loss": 2.7261, + "step": 8150 + }, + { + "epoch": 1.0766806722689075, + "grad_norm": 1.3046875, + "learning_rate": 0.0001796484271742004, + "loss": 2.7065, + "step": 8200 + }, + { + "epoch": 1.0832457983193278, + "grad_norm": 1.6171875, + "learning_rate": 0.00017951625693893736, + "loss": 2.7079, + "step": 8250 + }, + { + "epoch": 1.0898109243697478, + "grad_norm": 1.3125, + "learning_rate": 0.00017938408670367434, + "loss": 2.7226, + "step": 8300 + }, + { + "epoch": 1.096376050420168, + "grad_norm": 1.1875, + "learning_rate": 0.00017925191646841134, + "loss": 2.7011, + "step": 8350 + }, + { + "epoch": 1.1029411764705883, + "grad_norm": 1.25, + "learning_rate": 0.00017911974623314832, + "loss": 2.7024, + "step": 8400 + }, + { + "epoch": 1.1095063025210083, + "grad_norm": 1.2734375, + "learning_rate": 0.0001789875759978853, + "loss": 2.7094, + "step": 8450 + }, + { + "epoch": 1.1160714285714286, + "grad_norm": 1.0703125, + "learning_rate": 0.00017885540576262227, + "loss": 2.7323, + "step": 8500 + }, + { + "epoch": 1.1226365546218486, + "grad_norm": 1.2421875, + "learning_rate": 0.00017872323552735924, + "loss": 2.7223, + "step": 8550 + }, + { + "epoch": 1.129201680672269, + "grad_norm": 1.1484375, + "learning_rate": 0.00017859106529209624, + "loss": 2.7014, + "step": 8600 + }, + { + "epoch": 1.1357668067226891, + "grad_norm": 1.34375, + "learning_rate": 0.00017845889505683322, + "loss": 2.7235, + "step": 8650 + }, + { + "epoch": 1.1423319327731092, + "grad_norm": 1.1484375, + "learning_rate": 0.0001783267248215702, + "loss": 2.7127, + "step": 8700 + }, + { + "epoch": 1.1488970588235294, + "grad_norm": 1.140625, + "learning_rate": 0.00017819455458630717, + "loss": 2.7256, + "step": 8750 + }, + { + "epoch": 1.1554621848739495, + "grad_norm": 1.0625, + "learning_rate": 0.00017806238435104415, + "loss": 2.7401, + "step": 8800 + }, + { + "epoch": 1.1620273109243697, + "grad_norm": 1.1875, + "learning_rate": 0.00017793021411578112, + "loss": 2.6972, + "step": 8850 + }, + { + "epoch": 1.16859243697479, + "grad_norm": 1.390625, + "learning_rate": 0.0001777980438805181, + "loss": 2.7009, + "step": 8900 + }, + { + "epoch": 1.17515756302521, + "grad_norm": 1.328125, + "learning_rate": 0.0001776658736452551, + "loss": 2.7225, + "step": 8950 + }, + { + "epoch": 1.1817226890756303, + "grad_norm": 1.546875, + "learning_rate": 0.00017753370340999207, + "loss": 2.7215, + "step": 9000 + }, + { + "epoch": 1.1882878151260505, + "grad_norm": 0.9609375, + "learning_rate": 0.00017740153317472905, + "loss": 2.7292, + "step": 9050 + }, + { + "epoch": 1.1948529411764706, + "grad_norm": 1.015625, + "learning_rate": 0.00017726936293946602, + "loss": 2.7364, + "step": 9100 + }, + { + "epoch": 1.2014180672268908, + "grad_norm": 1.203125, + "learning_rate": 0.00017713719270420303, + "loss": 2.7467, + "step": 9150 + }, + { + "epoch": 1.2079831932773109, + "grad_norm": 1.328125, + "learning_rate": 0.00017700502246894, + "loss": 2.7084, + "step": 9200 + }, + { + "epoch": 1.214548319327731, + "grad_norm": 1.3125, + "learning_rate": 0.00017687285223367698, + "loss": 2.7161, + "step": 9250 + }, + { + "epoch": 1.2211134453781511, + "grad_norm": 1.40625, + "learning_rate": 0.00017674068199841395, + "loss": 2.7257, + "step": 9300 + }, + { + "epoch": 1.2276785714285714, + "grad_norm": 1.2890625, + "learning_rate": 0.00017660851176315095, + "loss": 2.7518, + "step": 9350 + }, + { + "epoch": 1.2342436974789917, + "grad_norm": 1.375, + "learning_rate": 0.00017647634152788793, + "loss": 2.733, + "step": 9400 + }, + { + "epoch": 1.2408088235294117, + "grad_norm": 1.375, + "learning_rate": 0.0001763441712926249, + "loss": 2.7051, + "step": 9450 + }, + { + "epoch": 1.247373949579832, + "grad_norm": 1.1328125, + "learning_rate": 0.00017621200105736188, + "loss": 2.6999, + "step": 9500 + }, + { + "epoch": 1.2539390756302522, + "grad_norm": 1.734375, + "learning_rate": 0.00017607983082209886, + "loss": 2.7401, + "step": 9550 + }, + { + "epoch": 1.2605042016806722, + "grad_norm": 0.9375, + "learning_rate": 0.00017594766058683586, + "loss": 2.7076, + "step": 9600 + }, + { + "epoch": 1.2670693277310925, + "grad_norm": 1.421875, + "learning_rate": 0.00017581549035157283, + "loss": 2.7448, + "step": 9650 + }, + { + "epoch": 1.2736344537815127, + "grad_norm": 1.375, + "learning_rate": 0.0001756833201163098, + "loss": 2.7407, + "step": 9700 + }, + { + "epoch": 1.2801995798319328, + "grad_norm": 1.0078125, + "learning_rate": 0.00017555114988104678, + "loss": 2.7003, + "step": 9750 + }, + { + "epoch": 1.2867647058823528, + "grad_norm": 1.5703125, + "learning_rate": 0.00017541897964578379, + "loss": 2.6974, + "step": 9800 + }, + { + "epoch": 1.293329831932773, + "grad_norm": 1.25, + "learning_rate": 0.00017528680941052076, + "loss": 2.7261, + "step": 9850 + }, + { + "epoch": 1.2998949579831933, + "grad_norm": 1.1875, + "learning_rate": 0.00017515463917525774, + "loss": 2.7052, + "step": 9900 + }, + { + "epoch": 1.3064600840336134, + "grad_norm": 1.3671875, + "learning_rate": 0.0001750224689399947, + "loss": 2.6775, + "step": 9950 + }, + { + "epoch": 1.3130252100840336, + "grad_norm": 1.359375, + "learning_rate": 0.0001748902987047317, + "loss": 2.7389, + "step": 10000 + }, + { + "epoch": 1.3195903361344539, + "grad_norm": 1.4453125, + "learning_rate": 0.0001747581284694687, + "loss": 2.6958, + "step": 10050 + }, + { + "epoch": 1.326155462184874, + "grad_norm": 1.453125, + "learning_rate": 0.00017462595823420566, + "loss": 2.7364, + "step": 10100 + }, + { + "epoch": 1.3327205882352942, + "grad_norm": 1.234375, + "learning_rate": 0.00017449378799894264, + "loss": 2.7042, + "step": 10150 + }, + { + "epoch": 1.3392857142857144, + "grad_norm": 1.3359375, + "learning_rate": 0.00017436161776367964, + "loss": 2.7221, + "step": 10200 + }, + { + "epoch": 1.3458508403361344, + "grad_norm": 1.125, + "learning_rate": 0.00017422944752841662, + "loss": 2.6988, + "step": 10250 + }, + { + "epoch": 1.3524159663865547, + "grad_norm": 1.328125, + "learning_rate": 0.0001740972772931536, + "loss": 2.7224, + "step": 10300 + }, + { + "epoch": 1.3589810924369747, + "grad_norm": 1.3125, + "learning_rate": 0.00017396510705789057, + "loss": 2.7076, + "step": 10350 + }, + { + "epoch": 1.365546218487395, + "grad_norm": 1.2578125, + "learning_rate": 0.00017383293682262757, + "loss": 2.7175, + "step": 10400 + }, + { + "epoch": 1.372111344537815, + "grad_norm": 1.0625, + "learning_rate": 0.00017370076658736454, + "loss": 2.7272, + "step": 10450 + }, + { + "epoch": 1.3786764705882353, + "grad_norm": 1.296875, + "learning_rate": 0.00017356859635210152, + "loss": 2.7176, + "step": 10500 + }, + { + "epoch": 1.3852415966386555, + "grad_norm": 1.2109375, + "learning_rate": 0.0001734364261168385, + "loss": 2.7443, + "step": 10550 + }, + { + "epoch": 1.3918067226890756, + "grad_norm": 1.3125, + "learning_rate": 0.0001733042558815755, + "loss": 2.712, + "step": 10600 + }, + { + "epoch": 1.3983718487394958, + "grad_norm": 1.171875, + "learning_rate": 0.00017317208564631247, + "loss": 2.7126, + "step": 10650 + }, + { + "epoch": 1.404936974789916, + "grad_norm": 1.9453125, + "learning_rate": 0.00017303991541104945, + "loss": 2.7164, + "step": 10700 + }, + { + "epoch": 1.4115021008403361, + "grad_norm": 1.7734375, + "learning_rate": 0.00017290774517578642, + "loss": 2.7406, + "step": 10750 + }, + { + "epoch": 1.4180672268907564, + "grad_norm": 1.2890625, + "learning_rate": 0.0001727755749405234, + "loss": 2.7049, + "step": 10800 + }, + { + "epoch": 1.4246323529411764, + "grad_norm": 0.875, + "learning_rate": 0.0001726434047052604, + "loss": 2.7081, + "step": 10850 + }, + { + "epoch": 1.4311974789915967, + "grad_norm": 1.0625, + "learning_rate": 0.00017251123446999738, + "loss": 2.7084, + "step": 10900 + }, + { + "epoch": 1.4377626050420167, + "grad_norm": 1.15625, + "learning_rate": 0.00017237906423473435, + "loss": 2.7071, + "step": 10950 + }, + { + "epoch": 1.444327731092437, + "grad_norm": 1.5390625, + "learning_rate": 0.00017224689399947133, + "loss": 2.7335, + "step": 11000 + }, + { + "epoch": 1.4508928571428572, + "grad_norm": 1.0703125, + "learning_rate": 0.00017211472376420833, + "loss": 2.7514, + "step": 11050 + }, + { + "epoch": 1.4574579831932772, + "grad_norm": 1.203125, + "learning_rate": 0.0001719825535289453, + "loss": 2.702, + "step": 11100 + }, + { + "epoch": 1.4640231092436975, + "grad_norm": 1.140625, + "learning_rate": 0.00017185038329368225, + "loss": 2.7518, + "step": 11150 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 1.09375, + "learning_rate": 0.00017171821305841925, + "loss": 2.7228, + "step": 11200 + }, + { + "epoch": 1.4771533613445378, + "grad_norm": 1.6875, + "learning_rate": 0.00017158604282315623, + "loss": 2.7007, + "step": 11250 + }, + { + "epoch": 1.483718487394958, + "grad_norm": 1.21875, + "learning_rate": 0.0001714538725878932, + "loss": 2.7112, + "step": 11300 + }, + { + "epoch": 1.490283613445378, + "grad_norm": 1.2890625, + "learning_rate": 0.00017132170235263018, + "loss": 2.7264, + "step": 11350 + }, + { + "epoch": 1.4968487394957983, + "grad_norm": 1.1015625, + "learning_rate": 0.00017118953211736718, + "loss": 2.7302, + "step": 11400 + }, + { + "epoch": 1.5034138655462184, + "grad_norm": 1.03125, + "learning_rate": 0.00017105736188210416, + "loss": 2.7063, + "step": 11450 + }, + { + "epoch": 1.5099789915966386, + "grad_norm": 1.2265625, + "learning_rate": 0.00017092519164684113, + "loss": 2.7232, + "step": 11500 + }, + { + "epoch": 1.5165441176470589, + "grad_norm": 0.9296875, + "learning_rate": 0.0001707930214115781, + "loss": 2.7323, + "step": 11550 + }, + { + "epoch": 1.523109243697479, + "grad_norm": 1.0859375, + "learning_rate": 0.00017066085117631508, + "loss": 2.7504, + "step": 11600 + }, + { + "epoch": 1.5296743697478992, + "grad_norm": 1.109375, + "learning_rate": 0.00017052868094105208, + "loss": 2.7087, + "step": 11650 + }, + { + "epoch": 1.5362394957983194, + "grad_norm": 1.5703125, + "learning_rate": 0.00017039651070578906, + "loss": 2.687, + "step": 11700 + }, + { + "epoch": 1.5428046218487395, + "grad_norm": 1.0, + "learning_rate": 0.00017026434047052603, + "loss": 2.7249, + "step": 11750 + }, + { + "epoch": 1.5493697478991597, + "grad_norm": 1.375, + "learning_rate": 0.000170132170235263, + "loss": 2.7173, + "step": 11800 + }, + { + "epoch": 1.55593487394958, + "grad_norm": 1.046875, + "learning_rate": 0.00017, + "loss": 2.7128, + "step": 11850 + }, + { + "epoch": 1.5625, + "grad_norm": 1.3046875, + "learning_rate": 0.000169867829764737, + "loss": 2.6948, + "step": 11900 + }, + { + "epoch": 1.56906512605042, + "grad_norm": 0.953125, + "learning_rate": 0.00016973565952947396, + "loss": 2.7077, + "step": 11950 + }, + { + "epoch": 1.5756302521008403, + "grad_norm": 1.0, + "learning_rate": 0.00016960348929421094, + "loss": 2.7234, + "step": 12000 + }, + { + "epoch": 1.5821953781512605, + "grad_norm": 1.0859375, + "learning_rate": 0.00016947131905894794, + "loss": 2.7016, + "step": 12050 + }, + { + "epoch": 1.5887605042016806, + "grad_norm": 1.375, + "learning_rate": 0.00016933914882368492, + "loss": 2.7031, + "step": 12100 + }, + { + "epoch": 1.5953256302521008, + "grad_norm": 1.1796875, + "learning_rate": 0.0001692069785884219, + "loss": 2.7401, + "step": 12150 + }, + { + "epoch": 1.601890756302521, + "grad_norm": 1.7109375, + "learning_rate": 0.00016907480835315887, + "loss": 2.7053, + "step": 12200 + }, + { + "epoch": 1.6084558823529411, + "grad_norm": 1.25, + "learning_rate": 0.00016894263811789587, + "loss": 2.7007, + "step": 12250 + }, + { + "epoch": 1.6150210084033614, + "grad_norm": 1.1015625, + "learning_rate": 0.00016881046788263284, + "loss": 2.7229, + "step": 12300 + }, + { + "epoch": 1.6215861344537816, + "grad_norm": 1.2109375, + "learning_rate": 0.00016867829764736982, + "loss": 2.707, + "step": 12350 + }, + { + "epoch": 1.6281512605042017, + "grad_norm": 0.9375, + "learning_rate": 0.0001685461274121068, + "loss": 2.7095, + "step": 12400 + }, + { + "epoch": 1.6347163865546217, + "grad_norm": 1.078125, + "learning_rate": 0.0001684139571768438, + "loss": 2.7258, + "step": 12450 + }, + { + "epoch": 1.6412815126050422, + "grad_norm": 1.0078125, + "learning_rate": 0.00016828178694158077, + "loss": 2.6818, + "step": 12500 + }, + { + "epoch": 1.6478466386554622, + "grad_norm": 1.140625, + "learning_rate": 0.00016814961670631775, + "loss": 2.7119, + "step": 12550 + }, + { + "epoch": 1.6544117647058822, + "grad_norm": 1.2734375, + "learning_rate": 0.00016801744647105472, + "loss": 2.7186, + "step": 12600 + }, + { + "epoch": 1.6609768907563025, + "grad_norm": 1.3984375, + "learning_rate": 0.00016788527623579172, + "loss": 2.6939, + "step": 12650 + }, + { + "epoch": 1.6675420168067228, + "grad_norm": 1.1015625, + "learning_rate": 0.0001677531060005287, + "loss": 2.7297, + "step": 12700 + }, + { + "epoch": 1.6741071428571428, + "grad_norm": 1.09375, + "learning_rate": 0.00016762093576526567, + "loss": 2.7283, + "step": 12750 + }, + { + "epoch": 1.680672268907563, + "grad_norm": 1.3359375, + "learning_rate": 0.00016748876553000265, + "loss": 2.6951, + "step": 12800 + }, + { + "epoch": 1.6872373949579833, + "grad_norm": 1.3671875, + "learning_rate": 0.00016735659529473962, + "loss": 2.693, + "step": 12850 + }, + { + "epoch": 1.6938025210084033, + "grad_norm": 1.171875, + "learning_rate": 0.00016722442505947663, + "loss": 2.7194, + "step": 12900 + }, + { + "epoch": 1.7003676470588234, + "grad_norm": 1.296875, + "learning_rate": 0.0001670922548242136, + "loss": 2.6951, + "step": 12950 + }, + { + "epoch": 1.7069327731092439, + "grad_norm": 1.125, + "learning_rate": 0.00016696008458895058, + "loss": 2.7053, + "step": 13000 + }, + { + "epoch": 1.7134978991596639, + "grad_norm": 1.28125, + "learning_rate": 0.00016682791435368755, + "loss": 2.7034, + "step": 13050 + }, + { + "epoch": 1.720063025210084, + "grad_norm": 1.3359375, + "learning_rate": 0.00016669574411842455, + "loss": 2.7118, + "step": 13100 + }, + { + "epoch": 1.7266281512605042, + "grad_norm": 1.15625, + "learning_rate": 0.00016656357388316153, + "loss": 2.7267, + "step": 13150 + }, + { + "epoch": 1.7331932773109244, + "grad_norm": 1.0, + "learning_rate": 0.0001664314036478985, + "loss": 2.6885, + "step": 13200 + }, + { + "epoch": 1.7397584033613445, + "grad_norm": 1.234375, + "learning_rate": 0.00016629923341263548, + "loss": 2.6921, + "step": 13250 + }, + { + "epoch": 1.7463235294117647, + "grad_norm": 1.4765625, + "learning_rate": 0.00016616706317737248, + "loss": 2.7048, + "step": 13300 + }, + { + "epoch": 1.752888655462185, + "grad_norm": 1.2578125, + "learning_rate": 0.00016603489294210946, + "loss": 2.688, + "step": 13350 + }, + { + "epoch": 1.759453781512605, + "grad_norm": 1.0546875, + "learning_rate": 0.00016590272270684643, + "loss": 2.7187, + "step": 13400 + }, + { + "epoch": 1.7660189075630253, + "grad_norm": 1.1796875, + "learning_rate": 0.0001657705524715834, + "loss": 2.7089, + "step": 13450 + }, + { + "epoch": 1.7725840336134455, + "grad_norm": 1.046875, + "learning_rate": 0.0001656383822363204, + "loss": 2.7104, + "step": 13500 + }, + { + "epoch": 1.7791491596638656, + "grad_norm": 1.1953125, + "learning_rate": 0.00016550621200105736, + "loss": 2.694, + "step": 13550 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 1.84375, + "learning_rate": 0.00016537404176579433, + "loss": 2.7143, + "step": 13600 + }, + { + "epoch": 1.7922794117647058, + "grad_norm": 1.046875, + "learning_rate": 0.0001652418715305313, + "loss": 2.7057, + "step": 13650 + }, + { + "epoch": 1.798844537815126, + "grad_norm": 1.4375, + "learning_rate": 0.0001651097012952683, + "loss": 2.7065, + "step": 13700 + }, + { + "epoch": 1.8054096638655461, + "grad_norm": 1.234375, + "learning_rate": 0.0001649775310600053, + "loss": 2.7258, + "step": 13750 + }, + { + "epoch": 1.8119747899159664, + "grad_norm": 1.3359375, + "learning_rate": 0.00016484536082474226, + "loss": 2.7026, + "step": 13800 + }, + { + "epoch": 1.8185399159663866, + "grad_norm": 1.3828125, + "learning_rate": 0.00016471319058947924, + "loss": 2.7012, + "step": 13850 + }, + { + "epoch": 1.8251050420168067, + "grad_norm": 1.4140625, + "learning_rate": 0.00016458102035421624, + "loss": 2.677, + "step": 13900 + }, + { + "epoch": 1.831670168067227, + "grad_norm": 1.1875, + "learning_rate": 0.00016444885011895321, + "loss": 2.6752, + "step": 13950 + }, + { + "epoch": 1.8382352941176472, + "grad_norm": 1.1796875, + "learning_rate": 0.0001643166798836902, + "loss": 2.71, + "step": 14000 + }, + { + "epoch": 1.8448004201680672, + "grad_norm": 1.453125, + "learning_rate": 0.00016418450964842716, + "loss": 2.7039, + "step": 14050 + }, + { + "epoch": 1.8513655462184873, + "grad_norm": 1.3203125, + "learning_rate": 0.00016405233941316417, + "loss": 2.6974, + "step": 14100 + }, + { + "epoch": 1.8579306722689075, + "grad_norm": 1.0078125, + "learning_rate": 0.00016392016917790114, + "loss": 2.7124, + "step": 14150 + }, + { + "epoch": 1.8644957983193278, + "grad_norm": 1.4140625, + "learning_rate": 0.00016378799894263812, + "loss": 2.7038, + "step": 14200 + }, + { + "epoch": 1.8710609243697478, + "grad_norm": 1.3515625, + "learning_rate": 0.0001636558287073751, + "loss": 2.6604, + "step": 14250 + }, + { + "epoch": 1.877626050420168, + "grad_norm": 1.2109375, + "learning_rate": 0.0001635236584721121, + "loss": 2.6858, + "step": 14300 + }, + { + "epoch": 1.8841911764705883, + "grad_norm": 1.5234375, + "learning_rate": 0.00016339148823684907, + "loss": 2.7067, + "step": 14350 + }, + { + "epoch": 1.8907563025210083, + "grad_norm": 1.25, + "learning_rate": 0.00016325931800158605, + "loss": 2.7035, + "step": 14400 + }, + { + "epoch": 1.8973214285714286, + "grad_norm": 1.609375, + "learning_rate": 0.00016312714776632302, + "loss": 2.6906, + "step": 14450 + }, + { + "epoch": 1.9038865546218489, + "grad_norm": 1.2890625, + "learning_rate": 0.00016299497753106002, + "loss": 2.6797, + "step": 14500 + }, + { + "epoch": 1.910451680672269, + "grad_norm": 1.3046875, + "learning_rate": 0.000162862807295797, + "loss": 2.6928, + "step": 14550 + }, + { + "epoch": 1.917016806722689, + "grad_norm": 1.1640625, + "learning_rate": 0.00016273063706053397, + "loss": 2.7051, + "step": 14600 + }, + { + "epoch": 1.9235819327731094, + "grad_norm": 1.09375, + "learning_rate": 0.00016259846682527095, + "loss": 2.6779, + "step": 14650 + }, + { + "epoch": 1.9301470588235294, + "grad_norm": 1.4375, + "learning_rate": 0.00016246629659000795, + "loss": 2.6788, + "step": 14700 + }, + { + "epoch": 1.9367121848739495, + "grad_norm": 1.1953125, + "learning_rate": 0.00016233412635474493, + "loss": 2.6799, + "step": 14750 + }, + { + "epoch": 1.9432773109243697, + "grad_norm": 1.3046875, + "learning_rate": 0.0001622019561194819, + "loss": 2.6975, + "step": 14800 + }, + { + "epoch": 1.94984243697479, + "grad_norm": 1.546875, + "learning_rate": 0.00016206978588421888, + "loss": 2.7005, + "step": 14850 + }, + { + "epoch": 1.95640756302521, + "grad_norm": 1.015625, + "learning_rate": 0.00016193761564895585, + "loss": 2.7031, + "step": 14900 + }, + { + "epoch": 1.9629726890756303, + "grad_norm": 1.9765625, + "learning_rate": 0.00016180544541369285, + "loss": 2.6751, + "step": 14950 + }, + { + "epoch": 1.9695378151260505, + "grad_norm": 0.984375, + "learning_rate": 0.00016167327517842983, + "loss": 2.7391, + "step": 15000 + }, + { + "epoch": 1.9761029411764706, + "grad_norm": 1.0625, + "learning_rate": 0.0001615411049431668, + "loss": 2.7114, + "step": 15050 + }, + { + "epoch": 1.9826680672268906, + "grad_norm": 1.3828125, + "learning_rate": 0.00016140893470790378, + "loss": 2.6763, + "step": 15100 + }, + { + "epoch": 1.989233193277311, + "grad_norm": 1.203125, + "learning_rate": 0.00016127676447264078, + "loss": 2.7001, + "step": 15150 + }, + { + "epoch": 1.995798319327731, + "grad_norm": 1.2578125, + "learning_rate": 0.00016114459423737776, + "loss": 2.697, + "step": 15200 + }, + { + "epoch": 2.002363445378151, + "grad_norm": 1.3046875, + "learning_rate": 0.00016101242400211473, + "loss": 2.5799, + "step": 15250 + }, + { + "epoch": 2.0089285714285716, + "grad_norm": 1.234375, + "learning_rate": 0.0001608802537668517, + "loss": 2.4102, + "step": 15300 + }, + { + "epoch": 2.0154936974789917, + "grad_norm": 1.1171875, + "learning_rate": 0.0001607480835315887, + "loss": 2.4198, + "step": 15350 + }, + { + "epoch": 2.0220588235294117, + "grad_norm": 1.234375, + "learning_rate": 0.00016061591329632568, + "loss": 2.398, + "step": 15400 + }, + { + "epoch": 2.0286239495798317, + "grad_norm": 2.015625, + "learning_rate": 0.00016048374306106266, + "loss": 2.3839, + "step": 15450 + }, + { + "epoch": 2.035189075630252, + "grad_norm": 1.171875, + "learning_rate": 0.00016035157282579964, + "loss": 2.3944, + "step": 15500 + }, + { + "epoch": 2.0417542016806722, + "grad_norm": 1.46875, + "learning_rate": 0.00016021940259053664, + "loss": 2.3921, + "step": 15550 + }, + { + "epoch": 2.0483193277310923, + "grad_norm": 1.5, + "learning_rate": 0.0001600872323552736, + "loss": 2.3781, + "step": 15600 + }, + { + "epoch": 2.0548844537815127, + "grad_norm": 1.734375, + "learning_rate": 0.0001599550621200106, + "loss": 2.3969, + "step": 15650 + }, + { + "epoch": 2.0614495798319328, + "grad_norm": 1.3828125, + "learning_rate": 0.00015982289188474756, + "loss": 2.4146, + "step": 15700 + }, + { + "epoch": 2.068014705882353, + "grad_norm": 1.203125, + "learning_rate": 0.00015969072164948457, + "loss": 2.4412, + "step": 15750 + }, + { + "epoch": 2.0745798319327733, + "grad_norm": 1.5390625, + "learning_rate": 0.00015955855141422154, + "loss": 2.417, + "step": 15800 + }, + { + "epoch": 2.0811449579831933, + "grad_norm": 1.28125, + "learning_rate": 0.00015942638117895852, + "loss": 2.4068, + "step": 15850 + }, + { + "epoch": 2.0877100840336134, + "grad_norm": 1.328125, + "learning_rate": 0.00015929421094369546, + "loss": 2.4335, + "step": 15900 + }, + { + "epoch": 2.0942752100840334, + "grad_norm": 1.3359375, + "learning_rate": 0.00015916204070843247, + "loss": 2.4253, + "step": 15950 + }, + { + "epoch": 2.100840336134454, + "grad_norm": 1.5078125, + "learning_rate": 0.00015902987047316944, + "loss": 2.4018, + "step": 16000 + }, + { + "epoch": 2.107405462184874, + "grad_norm": 1.5546875, + "learning_rate": 0.00015889770023790642, + "loss": 2.4285, + "step": 16050 + }, + { + "epoch": 2.113970588235294, + "grad_norm": 1.7265625, + "learning_rate": 0.0001587655300026434, + "loss": 2.4368, + "step": 16100 + }, + { + "epoch": 2.1205357142857144, + "grad_norm": 1.4296875, + "learning_rate": 0.0001586333597673804, + "loss": 2.4384, + "step": 16150 + }, + { + "epoch": 2.1271008403361344, + "grad_norm": 1.515625, + "learning_rate": 0.00015850118953211737, + "loss": 2.414, + "step": 16200 + }, + { + "epoch": 2.1336659663865545, + "grad_norm": 1.4296875, + "learning_rate": 0.00015836901929685434, + "loss": 2.4166, + "step": 16250 + }, + { + "epoch": 2.140231092436975, + "grad_norm": 1.6484375, + "learning_rate": 0.00015823684906159132, + "loss": 2.4319, + "step": 16300 + }, + { + "epoch": 2.146796218487395, + "grad_norm": 1.46875, + "learning_rate": 0.00015810467882632832, + "loss": 2.4421, + "step": 16350 + }, + { + "epoch": 2.153361344537815, + "grad_norm": 1.515625, + "learning_rate": 0.0001579725085910653, + "loss": 2.4383, + "step": 16400 + }, + { + "epoch": 2.1599264705882355, + "grad_norm": 1.1796875, + "learning_rate": 0.00015784033835580227, + "loss": 2.4301, + "step": 16450 + }, + { + "epoch": 2.1664915966386555, + "grad_norm": 1.2890625, + "learning_rate": 0.00015770816812053925, + "loss": 2.4249, + "step": 16500 + }, + { + "epoch": 2.1730567226890756, + "grad_norm": 1.34375, + "learning_rate": 0.00015757599788527625, + "loss": 2.4465, + "step": 16550 + }, + { + "epoch": 2.1796218487394956, + "grad_norm": 1.453125, + "learning_rate": 0.00015744382765001323, + "loss": 2.4457, + "step": 16600 + }, + { + "epoch": 2.186186974789916, + "grad_norm": 1.5, + "learning_rate": 0.0001573116574147502, + "loss": 2.435, + "step": 16650 + }, + { + "epoch": 2.192752100840336, + "grad_norm": 1.296875, + "learning_rate": 0.00015717948717948718, + "loss": 2.4267, + "step": 16700 + }, + { + "epoch": 2.199317226890756, + "grad_norm": 1.34375, + "learning_rate": 0.00015704731694422418, + "loss": 2.4513, + "step": 16750 + }, + { + "epoch": 2.2058823529411766, + "grad_norm": 1.3359375, + "learning_rate": 0.00015691514670896115, + "loss": 2.4385, + "step": 16800 + }, + { + "epoch": 2.2124474789915967, + "grad_norm": 1.4140625, + "learning_rate": 0.00015678297647369813, + "loss": 2.4747, + "step": 16850 + }, + { + "epoch": 2.2190126050420167, + "grad_norm": 1.3828125, + "learning_rate": 0.0001566508062384351, + "loss": 2.4497, + "step": 16900 + }, + { + "epoch": 2.225577731092437, + "grad_norm": 1.4765625, + "learning_rate": 0.0001565186360031721, + "loss": 2.4393, + "step": 16950 + }, + { + "epoch": 2.232142857142857, + "grad_norm": 1.203125, + "learning_rate": 0.00015638646576790908, + "loss": 2.4628, + "step": 17000 + }, + { + "epoch": 2.2387079831932772, + "grad_norm": 1.59375, + "learning_rate": 0.00015625429553264606, + "loss": 2.426, + "step": 17050 + }, + { + "epoch": 2.2452731092436973, + "grad_norm": 1.609375, + "learning_rate": 0.00015612212529738303, + "loss": 2.4226, + "step": 17100 + }, + { + "epoch": 2.2518382352941178, + "grad_norm": 1.3515625, + "learning_rate": 0.00015598995506212, + "loss": 2.461, + "step": 17150 + }, + { + "epoch": 2.258403361344538, + "grad_norm": 1.3046875, + "learning_rate": 0.000155857784826857, + "loss": 2.4646, + "step": 17200 + }, + { + "epoch": 2.264968487394958, + "grad_norm": 1.671875, + "learning_rate": 0.00015572561459159398, + "loss": 2.443, + "step": 17250 + }, + { + "epoch": 2.2715336134453783, + "grad_norm": 1.4296875, + "learning_rate": 0.00015559344435633096, + "loss": 2.4328, + "step": 17300 + }, + { + "epoch": 2.2780987394957983, + "grad_norm": 1.3671875, + "learning_rate": 0.00015546127412106793, + "loss": 2.4707, + "step": 17350 + }, + { + "epoch": 2.2846638655462184, + "grad_norm": 1.71875, + "learning_rate": 0.00015532910388580494, + "loss": 2.4422, + "step": 17400 + }, + { + "epoch": 2.291228991596639, + "grad_norm": 2.671875, + "learning_rate": 0.0001551969336505419, + "loss": 2.4504, + "step": 17450 + }, + { + "epoch": 2.297794117647059, + "grad_norm": 1.9375, + "learning_rate": 0.0001550647634152789, + "loss": 2.4589, + "step": 17500 + }, + { + "epoch": 2.304359243697479, + "grad_norm": 1.2890625, + "learning_rate": 0.00015493259318001586, + "loss": 2.4549, + "step": 17550 + }, + { + "epoch": 2.310924369747899, + "grad_norm": 1.8984375, + "learning_rate": 0.00015480042294475286, + "loss": 2.4549, + "step": 17600 + }, + { + "epoch": 2.3174894957983194, + "grad_norm": 1.3671875, + "learning_rate": 0.00015466825270948984, + "loss": 2.4304, + "step": 17650 + }, + { + "epoch": 2.3240546218487395, + "grad_norm": 1.671875, + "learning_rate": 0.00015453608247422681, + "loss": 2.4605, + "step": 17700 + }, + { + "epoch": 2.3306197478991595, + "grad_norm": 1.1640625, + "learning_rate": 0.0001544039122389638, + "loss": 2.4587, + "step": 17750 + }, + { + "epoch": 2.33718487394958, + "grad_norm": 1.3359375, + "learning_rate": 0.0001542717420037008, + "loss": 2.4478, + "step": 17800 + }, + { + "epoch": 2.34375, + "grad_norm": 1.734375, + "learning_rate": 0.00015413957176843777, + "loss": 2.4473, + "step": 17850 + }, + { + "epoch": 2.35031512605042, + "grad_norm": 1.1953125, + "learning_rate": 0.00015400740153317474, + "loss": 2.4497, + "step": 17900 + }, + { + "epoch": 2.3568802521008405, + "grad_norm": 1.3203125, + "learning_rate": 0.00015387523129791172, + "loss": 2.463, + "step": 17950 + }, + { + "epoch": 2.3634453781512605, + "grad_norm": 1.1328125, + "learning_rate": 0.00015374306106264872, + "loss": 2.4603, + "step": 18000 + }, + { + "epoch": 2.3700105042016806, + "grad_norm": 1.6875, + "learning_rate": 0.0001536108908273857, + "loss": 2.431, + "step": 18050 + }, + { + "epoch": 2.376575630252101, + "grad_norm": 1.671875, + "learning_rate": 0.00015347872059212267, + "loss": 2.4369, + "step": 18100 + }, + { + "epoch": 2.383140756302521, + "grad_norm": 1.5, + "learning_rate": 0.00015334655035685965, + "loss": 2.4516, + "step": 18150 + }, + { + "epoch": 2.389705882352941, + "grad_norm": 1.5, + "learning_rate": 0.00015321438012159662, + "loss": 2.4478, + "step": 18200 + }, + { + "epoch": 2.396271008403361, + "grad_norm": 1.25, + "learning_rate": 0.0001530822098863336, + "loss": 2.4471, + "step": 18250 + }, + { + "epoch": 2.4028361344537816, + "grad_norm": 1.8203125, + "learning_rate": 0.00015295003965107057, + "loss": 2.4637, + "step": 18300 + }, + { + "epoch": 2.4094012605042017, + "grad_norm": 1.0390625, + "learning_rate": 0.00015281786941580755, + "loss": 2.4466, + "step": 18350 + }, + { + "epoch": 2.4159663865546217, + "grad_norm": 1.3359375, + "learning_rate": 0.00015268569918054455, + "loss": 2.4603, + "step": 18400 + }, + { + "epoch": 2.422531512605042, + "grad_norm": 1.5234375, + "learning_rate": 0.00015255352894528152, + "loss": 2.4641, + "step": 18450 + }, + { + "epoch": 2.429096638655462, + "grad_norm": 0.97265625, + "learning_rate": 0.0001524213587100185, + "loss": 2.4667, + "step": 18500 + }, + { + "epoch": 2.4356617647058822, + "grad_norm": 1.78125, + "learning_rate": 0.00015228918847475547, + "loss": 2.4812, + "step": 18550 + }, + { + "epoch": 2.4422268907563023, + "grad_norm": 1.3046875, + "learning_rate": 0.00015215701823949248, + "loss": 2.458, + "step": 18600 + }, + { + "epoch": 2.4487920168067228, + "grad_norm": 1.0703125, + "learning_rate": 0.00015202484800422945, + "loss": 2.4618, + "step": 18650 + }, + { + "epoch": 2.455357142857143, + "grad_norm": 1.59375, + "learning_rate": 0.00015189267776896643, + "loss": 2.4799, + "step": 18700 + }, + { + "epoch": 2.4619222689075633, + "grad_norm": 1.65625, + "learning_rate": 0.0001517605075337034, + "loss": 2.4915, + "step": 18750 + }, + { + "epoch": 2.4684873949579833, + "grad_norm": 1.609375, + "learning_rate": 0.0001516283372984404, + "loss": 2.4903, + "step": 18800 + }, + { + "epoch": 2.4750525210084033, + "grad_norm": 1.3046875, + "learning_rate": 0.00015149616706317738, + "loss": 2.4655, + "step": 18850 + }, + { + "epoch": 2.4816176470588234, + "grad_norm": 1.6796875, + "learning_rate": 0.00015136399682791436, + "loss": 2.4695, + "step": 18900 + }, + { + "epoch": 2.488182773109244, + "grad_norm": 1.6640625, + "learning_rate": 0.00015123182659265133, + "loss": 2.4624, + "step": 18950 + }, + { + "epoch": 2.494747899159664, + "grad_norm": 1.46875, + "learning_rate": 0.00015109965635738833, + "loss": 2.505, + "step": 19000 + }, + { + "epoch": 2.501313025210084, + "grad_norm": 1.125, + "learning_rate": 0.0001509674861221253, + "loss": 2.4549, + "step": 19050 + }, + { + "epoch": 2.5078781512605044, + "grad_norm": 1.1953125, + "learning_rate": 0.00015083531588686228, + "loss": 2.4784, + "step": 19100 + }, + { + "epoch": 2.5144432773109244, + "grad_norm": 1.46875, + "learning_rate": 0.00015070314565159926, + "loss": 2.4385, + "step": 19150 + }, + { + "epoch": 2.5210084033613445, + "grad_norm": 1.0, + "learning_rate": 0.00015057097541633623, + "loss": 2.4701, + "step": 19200 + }, + { + "epoch": 2.5275735294117645, + "grad_norm": 1.390625, + "learning_rate": 0.00015043880518107324, + "loss": 2.4518, + "step": 19250 + }, + { + "epoch": 2.534138655462185, + "grad_norm": 1.6484375, + "learning_rate": 0.0001503066349458102, + "loss": 2.4421, + "step": 19300 + }, + { + "epoch": 2.540703781512605, + "grad_norm": 1.3046875, + "learning_rate": 0.00015017446471054719, + "loss": 2.4659, + "step": 19350 + }, + { + "epoch": 2.5472689075630255, + "grad_norm": 1.3671875, + "learning_rate": 0.00015004229447528416, + "loss": 2.444, + "step": 19400 + }, + { + "epoch": 2.5538340336134455, + "grad_norm": 1.3671875, + "learning_rate": 0.00014991012424002116, + "loss": 2.4648, + "step": 19450 + }, + { + "epoch": 2.5603991596638656, + "grad_norm": 1.3359375, + "learning_rate": 0.00014977795400475814, + "loss": 2.4668, + "step": 19500 + }, + { + "epoch": 2.5669642857142856, + "grad_norm": 1.3671875, + "learning_rate": 0.00014964578376949511, + "loss": 2.467, + "step": 19550 + }, + { + "epoch": 2.5735294117647056, + "grad_norm": 2.171875, + "learning_rate": 0.0001495136135342321, + "loss": 2.5089, + "step": 19600 + }, + { + "epoch": 2.580094537815126, + "grad_norm": 1.3671875, + "learning_rate": 0.0001493814432989691, + "loss": 2.4665, + "step": 19650 + }, + { + "epoch": 2.586659663865546, + "grad_norm": 1.1875, + "learning_rate": 0.00014924927306370607, + "loss": 2.4865, + "step": 19700 + }, + { + "epoch": 2.5932247899159666, + "grad_norm": 1.1875, + "learning_rate": 0.00014911710282844304, + "loss": 2.4689, + "step": 19750 + }, + { + "epoch": 2.5997899159663866, + "grad_norm": 1.078125, + "learning_rate": 0.00014898493259318002, + "loss": 2.4494, + "step": 19800 + }, + { + "epoch": 2.6063550420168067, + "grad_norm": 1.75, + "learning_rate": 0.00014885276235791702, + "loss": 2.4745, + "step": 19850 + }, + { + "epoch": 2.6129201680672267, + "grad_norm": 1.6171875, + "learning_rate": 0.000148720592122654, + "loss": 2.4324, + "step": 19900 + }, + { + "epoch": 2.619485294117647, + "grad_norm": 1.453125, + "learning_rate": 0.00014858842188739097, + "loss": 2.4776, + "step": 19950 + }, + { + "epoch": 2.6260504201680672, + "grad_norm": 1.703125, + "learning_rate": 0.00014845625165212794, + "loss": 2.4833, + "step": 20000 + }, + { + "epoch": 2.6326155462184873, + "grad_norm": 1.2265625, + "learning_rate": 0.00014832408141686495, + "loss": 2.5234, + "step": 20050 + }, + { + "epoch": 2.6391806722689077, + "grad_norm": 1.6796875, + "learning_rate": 0.00014819191118160192, + "loss": 2.4781, + "step": 20100 + }, + { + "epoch": 2.6457457983193278, + "grad_norm": 1.453125, + "learning_rate": 0.0001480597409463389, + "loss": 2.456, + "step": 20150 + }, + { + "epoch": 2.652310924369748, + "grad_norm": 1.5625, + "learning_rate": 0.00014792757071107587, + "loss": 2.5011, + "step": 20200 + }, + { + "epoch": 2.658876050420168, + "grad_norm": 1.90625, + "learning_rate": 0.00014779540047581288, + "loss": 2.4785, + "step": 20250 + }, + { + "epoch": 2.6654411764705883, + "grad_norm": 1.4296875, + "learning_rate": 0.00014766323024054985, + "loss": 2.516, + "step": 20300 + }, + { + "epoch": 2.6720063025210083, + "grad_norm": 1.3671875, + "learning_rate": 0.00014753106000528683, + "loss": 2.5028, + "step": 20350 + }, + { + "epoch": 2.678571428571429, + "grad_norm": 1.25, + "learning_rate": 0.0001473988897700238, + "loss": 2.4378, + "step": 20400 + }, + { + "epoch": 2.685136554621849, + "grad_norm": 1.53125, + "learning_rate": 0.00014726671953476078, + "loss": 2.4658, + "step": 20450 + }, + { + "epoch": 2.691701680672269, + "grad_norm": 1.3515625, + "learning_rate": 0.00014713454929949778, + "loss": 2.4844, + "step": 20500 + }, + { + "epoch": 2.698266806722689, + "grad_norm": 1.5703125, + "learning_rate": 0.00014700237906423475, + "loss": 2.4709, + "step": 20550 + }, + { + "epoch": 2.7048319327731094, + "grad_norm": 1.3125, + "learning_rate": 0.0001468702088289717, + "loss": 2.4797, + "step": 20600 + }, + { + "epoch": 2.7113970588235294, + "grad_norm": 1.3828125, + "learning_rate": 0.0001467380385937087, + "loss": 2.5037, + "step": 20650 + }, + { + "epoch": 2.7179621848739495, + "grad_norm": 1.625, + "learning_rate": 0.00014660586835844568, + "loss": 2.4743, + "step": 20700 + }, + { + "epoch": 2.72452731092437, + "grad_norm": 1.3125, + "learning_rate": 0.00014647369812318265, + "loss": 2.5179, + "step": 20750 + }, + { + "epoch": 2.73109243697479, + "grad_norm": 1.3828125, + "learning_rate": 0.00014634152788791963, + "loss": 2.4847, + "step": 20800 + }, + { + "epoch": 2.73765756302521, + "grad_norm": 1.3515625, + "learning_rate": 0.00014620935765265663, + "loss": 2.5035, + "step": 20850 + }, + { + "epoch": 2.74422268907563, + "grad_norm": 1.34375, + "learning_rate": 0.0001460771874173936, + "loss": 2.4859, + "step": 20900 + }, + { + "epoch": 2.7507878151260505, + "grad_norm": 1.78125, + "learning_rate": 0.00014594501718213058, + "loss": 2.466, + "step": 20950 + }, + { + "epoch": 2.7573529411764706, + "grad_norm": 1.1640625, + "learning_rate": 0.00014581284694686756, + "loss": 2.4741, + "step": 21000 + }, + { + "epoch": 2.7639180672268906, + "grad_norm": 1.7578125, + "learning_rate": 0.00014568067671160456, + "loss": 2.4844, + "step": 21050 + }, + { + "epoch": 2.770483193277311, + "grad_norm": 1.640625, + "learning_rate": 0.00014554850647634153, + "loss": 2.4834, + "step": 21100 + }, + { + "epoch": 2.777048319327731, + "grad_norm": 1.1875, + "learning_rate": 0.0001454163362410785, + "loss": 2.4813, + "step": 21150 + }, + { + "epoch": 2.783613445378151, + "grad_norm": 1.453125, + "learning_rate": 0.00014528416600581549, + "loss": 2.4598, + "step": 21200 + }, + { + "epoch": 2.790178571428571, + "grad_norm": 1.359375, + "learning_rate": 0.00014515199577055246, + "loss": 2.4787, + "step": 21250 + }, + { + "epoch": 2.7967436974789917, + "grad_norm": 1.4453125, + "learning_rate": 0.00014501982553528946, + "loss": 2.5089, + "step": 21300 + }, + { + "epoch": 2.8033088235294117, + "grad_norm": 1.7734375, + "learning_rate": 0.00014488765530002644, + "loss": 2.4685, + "step": 21350 + }, + { + "epoch": 2.809873949579832, + "grad_norm": 1.2265625, + "learning_rate": 0.0001447554850647634, + "loss": 2.495, + "step": 21400 + }, + { + "epoch": 2.816439075630252, + "grad_norm": 1.296875, + "learning_rate": 0.0001446233148295004, + "loss": 2.5034, + "step": 21450 + }, + { + "epoch": 2.8230042016806722, + "grad_norm": 1.5546875, + "learning_rate": 0.0001444911445942374, + "loss": 2.4668, + "step": 21500 + }, + { + "epoch": 2.8295693277310923, + "grad_norm": 1.078125, + "learning_rate": 0.00014435897435897437, + "loss": 2.4759, + "step": 21550 + }, + { + "epoch": 2.8361344537815127, + "grad_norm": 1.234375, + "learning_rate": 0.00014422680412371134, + "loss": 2.5021, + "step": 21600 + }, + { + "epoch": 2.8426995798319328, + "grad_norm": 1.6640625, + "learning_rate": 0.00014409463388844832, + "loss": 2.4722, + "step": 21650 + }, + { + "epoch": 2.849264705882353, + "grad_norm": 1.3125, + "learning_rate": 0.00014396246365318532, + "loss": 2.4932, + "step": 21700 + }, + { + "epoch": 2.8558298319327733, + "grad_norm": 1.2265625, + "learning_rate": 0.0001438302934179223, + "loss": 2.4748, + "step": 21750 + }, + { + "epoch": 2.8623949579831933, + "grad_norm": 1.3515625, + "learning_rate": 0.00014369812318265927, + "loss": 2.4599, + "step": 21800 + }, + { + "epoch": 2.8689600840336134, + "grad_norm": 1.296875, + "learning_rate": 0.00014356595294739624, + "loss": 2.4848, + "step": 21850 + }, + { + "epoch": 2.8755252100840334, + "grad_norm": 1.3203125, + "learning_rate": 0.00014343378271213325, + "loss": 2.4876, + "step": 21900 + }, + { + "epoch": 2.882090336134454, + "grad_norm": 1.28125, + "learning_rate": 0.00014330161247687022, + "loss": 2.4786, + "step": 21950 + }, + { + "epoch": 2.888655462184874, + "grad_norm": 1.234375, + "learning_rate": 0.0001431694422416072, + "loss": 2.4788, + "step": 22000 + }, + { + "epoch": 2.8952205882352944, + "grad_norm": 1.6640625, + "learning_rate": 0.00014303727200634417, + "loss": 2.4968, + "step": 22050 + }, + { + "epoch": 2.9017857142857144, + "grad_norm": 1.234375, + "learning_rate": 0.00014290510177108117, + "loss": 2.4782, + "step": 22100 + }, + { + "epoch": 2.9083508403361344, + "grad_norm": 1.4453125, + "learning_rate": 0.00014277293153581815, + "loss": 2.4784, + "step": 22150 + }, + { + "epoch": 2.9149159663865545, + "grad_norm": 1.109375, + "learning_rate": 0.00014264076130055512, + "loss": 2.5067, + "step": 22200 + }, + { + "epoch": 2.9214810924369745, + "grad_norm": 1.4921875, + "learning_rate": 0.0001425085910652921, + "loss": 2.4691, + "step": 22250 + }, + { + "epoch": 2.928046218487395, + "grad_norm": 1.15625, + "learning_rate": 0.0001423764208300291, + "loss": 2.4585, + "step": 22300 + }, + { + "epoch": 2.934611344537815, + "grad_norm": 1.2890625, + "learning_rate": 0.00014224425059476608, + "loss": 2.4524, + "step": 22350 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 1.328125, + "learning_rate": 0.00014211208035950305, + "loss": 2.4853, + "step": 22400 + }, + { + "epoch": 2.9477415966386555, + "grad_norm": 1.3046875, + "learning_rate": 0.00014197991012424003, + "loss": 2.4661, + "step": 22450 + }, + { + "epoch": 2.9543067226890756, + "grad_norm": 1.9375, + "learning_rate": 0.000141847739888977, + "loss": 2.4908, + "step": 22500 + }, + { + "epoch": 2.9608718487394956, + "grad_norm": 1.46875, + "learning_rate": 0.000141715569653714, + "loss": 2.471, + "step": 22550 + }, + { + "epoch": 2.967436974789916, + "grad_norm": 1.3515625, + "learning_rate": 0.00014158339941845098, + "loss": 2.4783, + "step": 22600 + }, + { + "epoch": 2.974002100840336, + "grad_norm": 1.53125, + "learning_rate": 0.00014145122918318796, + "loss": 2.4755, + "step": 22650 + }, + { + "epoch": 2.980567226890756, + "grad_norm": 1.1953125, + "learning_rate": 0.00014131905894792493, + "loss": 2.4862, + "step": 22700 + }, + { + "epoch": 2.9871323529411766, + "grad_norm": 1.71875, + "learning_rate": 0.00014118688871266193, + "loss": 2.4763, + "step": 22750 + }, + { + "epoch": 2.9936974789915967, + "grad_norm": 1.609375, + "learning_rate": 0.0001410547184773989, + "loss": 2.4538, + "step": 22800 + }, + { + "epoch": 3.0002626050420167, + "grad_norm": 1.359375, + "learning_rate": 0.00014092254824213588, + "loss": 2.4822, + "step": 22850 + }, + { + "epoch": 3.006827731092437, + "grad_norm": 1.5078125, + "learning_rate": 0.00014079037800687286, + "loss": 2.1167, + "step": 22900 + }, + { + "epoch": 3.013392857142857, + "grad_norm": 1.984375, + "learning_rate": 0.00014065820777160986, + "loss": 2.1005, + "step": 22950 + }, + { + "epoch": 3.0199579831932772, + "grad_norm": 1.90625, + "learning_rate": 0.0001405260375363468, + "loss": 2.0618, + "step": 23000 + }, + { + "epoch": 3.0265231092436973, + "grad_norm": 2.671875, + "learning_rate": 0.00014039386730108378, + "loss": 2.0603, + "step": 23050 + }, + { + "epoch": 3.0330882352941178, + "grad_norm": 1.3203125, + "learning_rate": 0.0001402616970658208, + "loss": 2.0654, + "step": 23100 + }, + { + "epoch": 3.039653361344538, + "grad_norm": 1.7109375, + "learning_rate": 0.00014012952683055776, + "loss": 2.0578, + "step": 23150 + }, + { + "epoch": 3.046218487394958, + "grad_norm": 1.734375, + "learning_rate": 0.00013999735659529474, + "loss": 2.1235, + "step": 23200 + }, + { + "epoch": 3.0527836134453783, + "grad_norm": 1.9453125, + "learning_rate": 0.0001398651863600317, + "loss": 2.0834, + "step": 23250 + }, + { + "epoch": 3.0593487394957983, + "grad_norm": 2.21875, + "learning_rate": 0.0001397330161247687, + "loss": 2.1192, + "step": 23300 + }, + { + "epoch": 3.0659138655462184, + "grad_norm": 1.40625, + "learning_rate": 0.0001396008458895057, + "loss": 2.0848, + "step": 23350 + }, + { + "epoch": 3.072478991596639, + "grad_norm": 1.7421875, + "learning_rate": 0.00013946867565424266, + "loss": 2.0731, + "step": 23400 + }, + { + "epoch": 3.079044117647059, + "grad_norm": 1.78125, + "learning_rate": 0.00013933650541897964, + "loss": 2.1142, + "step": 23450 + }, + { + "epoch": 3.085609243697479, + "grad_norm": 1.1875, + "learning_rate": 0.00013920433518371662, + "loss": 2.1067, + "step": 23500 + }, + { + "epoch": 3.092174369747899, + "grad_norm": 1.640625, + "learning_rate": 0.00013907216494845362, + "loss": 2.0934, + "step": 23550 + }, + { + "epoch": 3.0987394957983194, + "grad_norm": 1.7109375, + "learning_rate": 0.0001389399947131906, + "loss": 2.1451, + "step": 23600 + }, + { + "epoch": 3.1053046218487395, + "grad_norm": 2.4375, + "learning_rate": 0.00013880782447792757, + "loss": 2.1069, + "step": 23650 + }, + { + "epoch": 3.1118697478991595, + "grad_norm": 1.6875, + "learning_rate": 0.00013867565424266454, + "loss": 2.0934, + "step": 23700 + }, + { + "epoch": 3.11843487394958, + "grad_norm": 1.2578125, + "learning_rate": 0.00013854348400740155, + "loss": 2.1632, + "step": 23750 + }, + { + "epoch": 3.125, + "grad_norm": 1.9609375, + "learning_rate": 0.00013841131377213852, + "loss": 2.119, + "step": 23800 + }, + { + "epoch": 3.13156512605042, + "grad_norm": 1.796875, + "learning_rate": 0.0001382791435368755, + "loss": 2.0729, + "step": 23850 + }, + { + "epoch": 3.1381302521008405, + "grad_norm": 1.625, + "learning_rate": 0.00013814697330161247, + "loss": 2.1073, + "step": 23900 + }, + { + "epoch": 3.1446953781512605, + "grad_norm": 2.078125, + "learning_rate": 0.00013801480306634947, + "loss": 2.1055, + "step": 23950 + }, + { + "epoch": 3.1512605042016806, + "grad_norm": 2.109375, + "learning_rate": 0.00013788263283108645, + "loss": 2.1141, + "step": 24000 + }, + { + "epoch": 3.157825630252101, + "grad_norm": 2.078125, + "learning_rate": 0.00013775046259582342, + "loss": 2.1012, + "step": 24050 + }, + { + "epoch": 3.164390756302521, + "grad_norm": 1.078125, + "learning_rate": 0.0001376182923605604, + "loss": 2.1016, + "step": 24100 + }, + { + "epoch": 3.170955882352941, + "grad_norm": 1.9375, + "learning_rate": 0.0001374861221252974, + "loss": 2.1559, + "step": 24150 + }, + { + "epoch": 3.177521008403361, + "grad_norm": 1.3671875, + "learning_rate": 0.00013735395189003438, + "loss": 2.1268, + "step": 24200 + }, + { + "epoch": 3.1840861344537816, + "grad_norm": 1.796875, + "learning_rate": 0.00013722178165477135, + "loss": 2.159, + "step": 24250 + }, + { + "epoch": 3.1906512605042017, + "grad_norm": 1.890625, + "learning_rate": 0.00013708961141950833, + "loss": 2.1487, + "step": 24300 + }, + { + "epoch": 3.1972163865546217, + "grad_norm": 1.3671875, + "learning_rate": 0.00013695744118424533, + "loss": 2.1238, + "step": 24350 + }, + { + "epoch": 3.203781512605042, + "grad_norm": 1.765625, + "learning_rate": 0.0001368252709489823, + "loss": 2.135, + "step": 24400 + }, + { + "epoch": 3.210346638655462, + "grad_norm": 1.5078125, + "learning_rate": 0.00013669310071371928, + "loss": 2.1486, + "step": 24450 + }, + { + "epoch": 3.2169117647058822, + "grad_norm": 1.765625, + "learning_rate": 0.00013656093047845625, + "loss": 2.138, + "step": 24500 + }, + { + "epoch": 3.2234768907563027, + "grad_norm": 1.7265625, + "learning_rate": 0.00013642876024319323, + "loss": 2.1057, + "step": 24550 + }, + { + "epoch": 3.2300420168067228, + "grad_norm": 2.34375, + "learning_rate": 0.00013629659000793023, + "loss": 2.116, + "step": 24600 + }, + { + "epoch": 3.236607142857143, + "grad_norm": 1.8515625, + "learning_rate": 0.0001361644197726672, + "loss": 2.1226, + "step": 24650 + }, + { + "epoch": 3.243172268907563, + "grad_norm": 1.9375, + "learning_rate": 0.00013603224953740418, + "loss": 2.1724, + "step": 24700 + }, + { + "epoch": 3.2497373949579833, + "grad_norm": 1.9296875, + "learning_rate": 0.00013590007930214116, + "loss": 2.1682, + "step": 24750 + }, + { + "epoch": 3.2563025210084033, + "grad_norm": 1.734375, + "learning_rate": 0.00013576790906687816, + "loss": 2.1606, + "step": 24800 + }, + { + "epoch": 3.2628676470588234, + "grad_norm": 1.78125, + "learning_rate": 0.00013563573883161514, + "loss": 2.1585, + "step": 24850 + }, + { + "epoch": 3.269432773109244, + "grad_norm": 1.4375, + "learning_rate": 0.0001355035685963521, + "loss": 2.1539, + "step": 24900 + }, + { + "epoch": 3.275997899159664, + "grad_norm": 1.5, + "learning_rate": 0.00013537139836108909, + "loss": 2.1504, + "step": 24950 + }, + { + "epoch": 3.282563025210084, + "grad_norm": 1.6640625, + "learning_rate": 0.0001352392281258261, + "loss": 2.1417, + "step": 25000 + }, + { + "epoch": 3.2891281512605044, + "grad_norm": 1.8828125, + "learning_rate": 0.00013510705789056306, + "loss": 2.1579, + "step": 25050 + }, + { + "epoch": 3.2956932773109244, + "grad_norm": 1.703125, + "learning_rate": 0.00013497488765530004, + "loss": 2.1358, + "step": 25100 + }, + { + "epoch": 3.3022584033613445, + "grad_norm": 2.5625, + "learning_rate": 0.00013484271742003701, + "loss": 2.1612, + "step": 25150 + }, + { + "epoch": 3.3088235294117645, + "grad_norm": 1.4609375, + "learning_rate": 0.00013471054718477402, + "loss": 2.1567, + "step": 25200 + }, + { + "epoch": 3.315388655462185, + "grad_norm": 1.7109375, + "learning_rate": 0.000134578376949511, + "loss": 2.1374, + "step": 25250 + }, + { + "epoch": 3.321953781512605, + "grad_norm": 2.46875, + "learning_rate": 0.00013444620671424797, + "loss": 2.1922, + "step": 25300 + }, + { + "epoch": 3.328518907563025, + "grad_norm": 1.375, + "learning_rate": 0.00013431403647898494, + "loss": 2.1584, + "step": 25350 + }, + { + "epoch": 3.3350840336134455, + "grad_norm": 2.21875, + "learning_rate": 0.00013418186624372192, + "loss": 2.1268, + "step": 25400 + }, + { + "epoch": 3.3416491596638656, + "grad_norm": 2.046875, + "learning_rate": 0.0001340496960084589, + "loss": 2.1464, + "step": 25450 + }, + { + "epoch": 3.3482142857142856, + "grad_norm": 2.359375, + "learning_rate": 0.00013391752577319587, + "loss": 2.1527, + "step": 25500 + }, + { + "epoch": 3.354779411764706, + "grad_norm": 1.71875, + "learning_rate": 0.00013378535553793284, + "loss": 2.1261, + "step": 25550 + }, + { + "epoch": 3.361344537815126, + "grad_norm": 1.4609375, + "learning_rate": 0.00013365318530266984, + "loss": 2.1905, + "step": 25600 + }, + { + "epoch": 3.367909663865546, + "grad_norm": 1.765625, + "learning_rate": 0.00013352101506740682, + "loss": 2.1369, + "step": 25650 + }, + { + "epoch": 3.3744747899159666, + "grad_norm": 1.7109375, + "learning_rate": 0.0001333888448321438, + "loss": 2.1478, + "step": 25700 + }, + { + "epoch": 3.3810399159663866, + "grad_norm": 1.5625, + "learning_rate": 0.00013325667459688077, + "loss": 2.1798, + "step": 25750 + }, + { + "epoch": 3.3876050420168067, + "grad_norm": 1.6640625, + "learning_rate": 0.00013312450436161777, + "loss": 2.1629, + "step": 25800 + }, + { + "epoch": 3.3941701680672267, + "grad_norm": 2.140625, + "learning_rate": 0.00013299233412635475, + "loss": 2.2038, + "step": 25850 + }, + { + "epoch": 3.400735294117647, + "grad_norm": 1.4765625, + "learning_rate": 0.00013286016389109172, + "loss": 2.1782, + "step": 25900 + }, + { + "epoch": 3.4073004201680672, + "grad_norm": 2.34375, + "learning_rate": 0.0001327279936558287, + "loss": 2.1563, + "step": 25950 + }, + { + "epoch": 3.4138655462184873, + "grad_norm": 2.609375, + "learning_rate": 0.0001325958234205657, + "loss": 2.1824, + "step": 26000 + }, + { + "epoch": 3.4204306722689077, + "grad_norm": 2.609375, + "learning_rate": 0.00013246365318530268, + "loss": 1.8937, + "step": 26050 + }, + { + "epoch": 3.4269957983193278, + "grad_norm": 1.5703125, + "learning_rate": 0.00013233148295003965, + "loss": 1.9172, + "step": 26100 + }, + { + "epoch": 3.433560924369748, + "grad_norm": 2.5625, + "learning_rate": 0.00013219931271477663, + "loss": 1.8899, + "step": 26150 + }, + { + "epoch": 3.440126050420168, + "grad_norm": 1.90625, + "learning_rate": 0.00013206714247951363, + "loss": 1.8592, + "step": 26200 + }, + { + "epoch": 3.4466911764705883, + "grad_norm": 1.828125, + "learning_rate": 0.0001319349722442506, + "loss": 1.9016, + "step": 26250 + }, + { + "epoch": 3.4532563025210083, + "grad_norm": 2.734375, + "learning_rate": 0.00013180280200898758, + "loss": 1.9376, + "step": 26300 + }, + { + "epoch": 3.4598214285714284, + "grad_norm": 1.640625, + "learning_rate": 0.00013167063177372455, + "loss": 1.9256, + "step": 26350 + }, + { + "epoch": 3.466386554621849, + "grad_norm": 2.109375, + "learning_rate": 0.00013153846153846156, + "loss": 1.8968, + "step": 26400 + }, + { + "epoch": 3.472951680672269, + "grad_norm": 2.421875, + "learning_rate": 0.00013140629130319853, + "loss": 1.8637, + "step": 26450 + }, + { + "epoch": 3.479516806722689, + "grad_norm": 1.6328125, + "learning_rate": 0.0001312741210679355, + "loss": 1.8956, + "step": 26500 + }, + { + "epoch": 3.4860819327731094, + "grad_norm": 1.4921875, + "learning_rate": 0.00013114195083267248, + "loss": 1.9324, + "step": 26550 + }, + { + "epoch": 3.4926470588235294, + "grad_norm": 1.625, + "learning_rate": 0.00013100978059740948, + "loss": 1.921, + "step": 26600 + }, + { + "epoch": 3.4992121848739495, + "grad_norm": 1.9921875, + "learning_rate": 0.00013087761036214646, + "loss": 1.8565, + "step": 26650 + }, + { + "epoch": 3.50577731092437, + "grad_norm": 2.4375, + "learning_rate": 0.00013074544012688343, + "loss": 1.9227, + "step": 26700 + }, + { + "epoch": 3.51234243697479, + "grad_norm": 2.0625, + "learning_rate": 0.0001306132698916204, + "loss": 1.8476, + "step": 26750 + }, + { + "epoch": 3.51890756302521, + "grad_norm": 2.453125, + "learning_rate": 0.00013048109965635738, + "loss": 1.9273, + "step": 26800 + }, + { + "epoch": 3.52547268907563, + "grad_norm": 3.109375, + "learning_rate": 0.0001303489294210944, + "loss": 1.9214, + "step": 26850 + }, + { + "epoch": 3.5320378151260505, + "grad_norm": 2.765625, + "learning_rate": 0.00013021675918583136, + "loss": 1.9062, + "step": 26900 + }, + { + "epoch": 3.5386029411764706, + "grad_norm": 1.828125, + "learning_rate": 0.00013008458895056834, + "loss": 1.8992, + "step": 26950 + }, + { + "epoch": 3.5451680672268906, + "grad_norm": 2.046875, + "learning_rate": 0.0001299524187153053, + "loss": 1.9827, + "step": 27000 + }, + { + "epoch": 3.551733193277311, + "grad_norm": 2.109375, + "learning_rate": 0.00012982024848004231, + "loss": 1.8935, + "step": 27050 + }, + { + "epoch": 3.558298319327731, + "grad_norm": 1.71875, + "learning_rate": 0.0001296880782447793, + "loss": 1.9372, + "step": 27100 + }, + { + "epoch": 3.564863445378151, + "grad_norm": 2.265625, + "learning_rate": 0.00012955590800951627, + "loss": 1.918, + "step": 27150 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 1.8515625, + "learning_rate": 0.00012942373777425324, + "loss": 1.9758, + "step": 27200 + }, + { + "epoch": 3.5779936974789917, + "grad_norm": 2.40625, + "learning_rate": 0.00012929156753899024, + "loss": 1.9002, + "step": 27250 + }, + { + "epoch": 3.5845588235294117, + "grad_norm": 2.0, + "learning_rate": 0.00012915939730372722, + "loss": 1.9045, + "step": 27300 + }, + { + "epoch": 3.591123949579832, + "grad_norm": 2.359375, + "learning_rate": 0.0001290272270684642, + "loss": 1.9385, + "step": 27350 + }, + { + "epoch": 3.597689075630252, + "grad_norm": 1.921875, + "learning_rate": 0.00012889505683320117, + "loss": 1.9586, + "step": 27400 + }, + { + "epoch": 3.6042542016806722, + "grad_norm": 2.515625, + "learning_rate": 0.00012876288659793817, + "loss": 1.9139, + "step": 27450 + }, + { + "epoch": 3.6108193277310923, + "grad_norm": 2.125, + "learning_rate": 0.00012863071636267515, + "loss": 1.949, + "step": 27500 + }, + { + "epoch": 3.6173844537815127, + "grad_norm": 1.8515625, + "learning_rate": 0.00012849854612741212, + "loss": 1.9769, + "step": 27550 + }, + { + "epoch": 3.6239495798319328, + "grad_norm": 2.453125, + "learning_rate": 0.0001283663758921491, + "loss": 1.9476, + "step": 27600 + }, + { + "epoch": 3.630514705882353, + "grad_norm": 2.453125, + "learning_rate": 0.0001282342056568861, + "loss": 1.9785, + "step": 27650 + }, + { + "epoch": 3.6370798319327733, + "grad_norm": 2.453125, + "learning_rate": 0.00012810203542162305, + "loss": 1.9009, + "step": 27700 + }, + { + "epoch": 3.6436449579831933, + "grad_norm": 2.046875, + "learning_rate": 0.00012796986518636002, + "loss": 1.9727, + "step": 27750 + }, + { + "epoch": 3.6502100840336134, + "grad_norm": 2.375, + "learning_rate": 0.000127837694951097, + "loss": 1.9782, + "step": 27800 + }, + { + "epoch": 3.6567752100840334, + "grad_norm": 2.140625, + "learning_rate": 0.000127705524715834, + "loss": 1.9606, + "step": 27850 + }, + { + "epoch": 3.663340336134454, + "grad_norm": 2.40625, + "learning_rate": 0.00012757335448057097, + "loss": 2.0027, + "step": 27900 + }, + { + "epoch": 3.669905462184874, + "grad_norm": 2.4375, + "learning_rate": 0.00012744118424530795, + "loss": 1.9761, + "step": 27950 + }, + { + "epoch": 3.6764705882352944, + "grad_norm": 1.6875, + "learning_rate": 0.00012730901401004493, + "loss": 1.9817, + "step": 28000 + }, + { + "epoch": 3.6830357142857144, + "grad_norm": 1.7890625, + "learning_rate": 0.00012717684377478193, + "loss": 1.9666, + "step": 28050 + }, + { + "epoch": 3.6896008403361344, + "grad_norm": 2.265625, + "learning_rate": 0.0001270446735395189, + "loss": 1.9876, + "step": 28100 + }, + { + "epoch": 3.6961659663865545, + "grad_norm": 1.71875, + "learning_rate": 0.00012691250330425588, + "loss": 1.9571, + "step": 28150 + }, + { + "epoch": 3.7027310924369745, + "grad_norm": 2.9375, + "learning_rate": 0.00012678033306899285, + "loss": 1.9859, + "step": 28200 + }, + { + "epoch": 3.709296218487395, + "grad_norm": 1.9921875, + "learning_rate": 0.00012664816283372986, + "loss": 2.0244, + "step": 28250 + }, + { + "epoch": 3.715861344537815, + "grad_norm": 1.6875, + "learning_rate": 0.00012651599259846683, + "loss": 2.0215, + "step": 28300 + }, + { + "epoch": 3.7224264705882355, + "grad_norm": 2.421875, + "learning_rate": 0.0001263838223632038, + "loss": 1.962, + "step": 28350 + }, + { + "epoch": 3.7289915966386555, + "grad_norm": 2.046875, + "learning_rate": 0.00012625165212794078, + "loss": 2.0002, + "step": 28400 + }, + { + "epoch": 3.7355567226890756, + "grad_norm": 2.109375, + "learning_rate": 0.00012611948189267778, + "loss": 1.962, + "step": 28450 + }, + { + "epoch": 3.7421218487394956, + "grad_norm": 1.4921875, + "learning_rate": 0.00012598731165741476, + "loss": 2.0032, + "step": 28500 + }, + { + "epoch": 3.748686974789916, + "grad_norm": 2.203125, + "learning_rate": 0.00012585514142215173, + "loss": 1.9378, + "step": 28550 + }, + { + "epoch": 3.755252100840336, + "grad_norm": 1.59375, + "learning_rate": 0.0001257229711868887, + "loss": 1.9797, + "step": 28600 + }, + { + "epoch": 3.761817226890756, + "grad_norm": 2.171875, + "learning_rate": 0.0001255908009516257, + "loss": 1.9942, + "step": 28650 + }, + { + "epoch": 3.7683823529411766, + "grad_norm": 2.125, + "learning_rate": 0.00012545863071636269, + "loss": 2.0173, + "step": 28700 + }, + { + "epoch": 3.7749474789915967, + "grad_norm": 1.4609375, + "learning_rate": 0.00012532646048109966, + "loss": 2.0285, + "step": 28750 + }, + { + "epoch": 3.7815126050420167, + "grad_norm": 1.59375, + "learning_rate": 0.00012519429024583664, + "loss": 2.0068, + "step": 28800 + }, + { + "epoch": 3.7880777310924367, + "grad_norm": 2.1875, + "learning_rate": 0.0001250621200105736, + "loss": 1.9377, + "step": 28850 + }, + { + "epoch": 3.794642857142857, + "grad_norm": 1.890625, + "learning_rate": 0.00012492994977531061, + "loss": 1.9896, + "step": 28900 + }, + { + "epoch": 3.8012079831932772, + "grad_norm": 1.65625, + "learning_rate": 0.0001247977795400476, + "loss": 1.9593, + "step": 28950 + }, + { + "epoch": 3.8077731092436977, + "grad_norm": 2.359375, + "learning_rate": 0.00012466560930478456, + "loss": 1.9841, + "step": 29000 + }, + { + "epoch": 3.8143382352941178, + "grad_norm": 1.5859375, + "learning_rate": 0.00012453343906952154, + "loss": 2.0124, + "step": 29050 + }, + { + "epoch": 3.820903361344538, + "grad_norm": 1.9140625, + "learning_rate": 0.00012440126883425854, + "loss": 1.9798, + "step": 29100 + }, + { + "epoch": 3.827468487394958, + "grad_norm": 1.8828125, + "learning_rate": 0.00012426909859899552, + "loss": 1.9527, + "step": 29150 + }, + { + "epoch": 3.8340336134453783, + "grad_norm": 2.234375, + "learning_rate": 0.0001241369283637325, + "loss": 2.0337, + "step": 29200 + }, + { + "epoch": 3.8405987394957983, + "grad_norm": 1.4609375, + "learning_rate": 0.00012400475812846947, + "loss": 2.0794, + "step": 29250 + }, + { + "epoch": 3.8471638655462184, + "grad_norm": 1.9453125, + "learning_rate": 0.00012387258789320647, + "loss": 1.9727, + "step": 29300 + }, + { + "epoch": 3.853728991596639, + "grad_norm": 2.421875, + "learning_rate": 0.00012374041765794345, + "loss": 2.0058, + "step": 29350 + }, + { + "epoch": 3.860294117647059, + "grad_norm": 1.515625, + "learning_rate": 0.00012360824742268042, + "loss": 1.9934, + "step": 29400 + }, + { + "epoch": 3.866859243697479, + "grad_norm": 1.65625, + "learning_rate": 0.0001234760771874174, + "loss": 2.0031, + "step": 29450 + }, + { + "epoch": 3.873424369747899, + "grad_norm": 2.03125, + "learning_rate": 0.0001233439069521544, + "loss": 1.9638, + "step": 29500 + }, + { + "epoch": 3.8799894957983194, + "grad_norm": 2.453125, + "learning_rate": 0.00012321173671689137, + "loss": 2.0159, + "step": 29550 + }, + { + "epoch": 3.8865546218487395, + "grad_norm": 2.09375, + "learning_rate": 0.00012307956648162835, + "loss": 2.0246, + "step": 29600 + }, + { + "epoch": 3.89311974789916, + "grad_norm": 1.765625, + "learning_rate": 0.00012294739624636532, + "loss": 1.9922, + "step": 29650 + }, + { + "epoch": 3.89968487394958, + "grad_norm": 2.671875, + "learning_rate": 0.00012281522601110233, + "loss": 2.0464, + "step": 29700 + }, + { + "epoch": 3.90625, + "grad_norm": 1.8828125, + "learning_rate": 0.0001226830557758393, + "loss": 1.9939, + "step": 29750 + }, + { + "epoch": 3.91281512605042, + "grad_norm": 2.9375, + "learning_rate": 0.00012255088554057628, + "loss": 2.0142, + "step": 29800 + }, + { + "epoch": 3.91938025210084, + "grad_norm": 1.8359375, + "learning_rate": 0.00012241871530531325, + "loss": 2.0387, + "step": 29850 + }, + { + "epoch": 3.9259453781512605, + "grad_norm": 1.609375, + "learning_rate": 0.00012228654507005025, + "loss": 2.0121, + "step": 29900 + }, + { + "epoch": 3.9325105042016806, + "grad_norm": 2.125, + "learning_rate": 0.00012215437483478723, + "loss": 2.0636, + "step": 29950 + }, + { + "epoch": 3.939075630252101, + "grad_norm": 1.7578125, + "learning_rate": 0.0001220222045995242, + "loss": 2.006, + "step": 30000 + }, + { + "epoch": 3.945640756302521, + "grad_norm": 1.8125, + "learning_rate": 0.00012189003436426118, + "loss": 2.0666, + "step": 30050 + }, + { + "epoch": 3.952205882352941, + "grad_norm": 2.4375, + "learning_rate": 0.00012175786412899814, + "loss": 1.9889, + "step": 30100 + }, + { + "epoch": 3.958771008403361, + "grad_norm": 2.078125, + "learning_rate": 0.00012162569389373513, + "loss": 1.9606, + "step": 30150 + }, + { + "epoch": 3.9653361344537816, + "grad_norm": 2.15625, + "learning_rate": 0.0001214935236584721, + "loss": 2.0165, + "step": 30200 + }, + { + "epoch": 3.9719012605042017, + "grad_norm": 1.8359375, + "learning_rate": 0.0001213613534232091, + "loss": 2.0325, + "step": 30250 + }, + { + "epoch": 3.9784663865546217, + "grad_norm": 2.015625, + "learning_rate": 0.00012122918318794607, + "loss": 2.0117, + "step": 30300 + }, + { + "epoch": 3.985031512605042, + "grad_norm": 1.828125, + "learning_rate": 0.00012109701295268306, + "loss": 1.9916, + "step": 30350 + }, + { + "epoch": 3.991596638655462, + "grad_norm": 1.65625, + "learning_rate": 0.00012096484271742003, + "loss": 2.0431, + "step": 30400 + }, + { + "epoch": 3.9981617647058822, + "grad_norm": 2.21875, + "learning_rate": 0.00012083267248215702, + "loss": 1.9578, + "step": 30450 + }, + { + "epoch": 4.004726890756302, + "grad_norm": 2.15625, + "learning_rate": 0.000120700502246894, + "loss": 1.7744, + "step": 30500 + }, + { + "epoch": 4.011292016806722, + "grad_norm": 1.8671875, + "learning_rate": 0.00012056833201163099, + "loss": 1.7033, + "step": 30550 + }, + { + "epoch": 4.017857142857143, + "grad_norm": 2.484375, + "learning_rate": 0.00012043616177636796, + "loss": 1.7349, + "step": 30600 + }, + { + "epoch": 4.024422268907563, + "grad_norm": 2.234375, + "learning_rate": 0.00012030399154110495, + "loss": 1.7361, + "step": 30650 + }, + { + "epoch": 4.030987394957983, + "grad_norm": 1.9296875, + "learning_rate": 0.00012017182130584192, + "loss": 1.7135, + "step": 30700 + }, + { + "epoch": 4.037552521008403, + "grad_norm": 4.25, + "learning_rate": 0.00012003965107057891, + "loss": 1.7177, + "step": 30750 + }, + { + "epoch": 4.044117647058823, + "grad_norm": 2.203125, + "learning_rate": 0.00011990748083531589, + "loss": 1.6554, + "step": 30800 + }, + { + "epoch": 4.050682773109243, + "grad_norm": 2.296875, + "learning_rate": 0.00011977531060005286, + "loss": 1.7107, + "step": 30850 + }, + { + "epoch": 4.057247899159663, + "grad_norm": 3.03125, + "learning_rate": 0.00011964314036478985, + "loss": 1.7729, + "step": 30900 + }, + { + "epoch": 4.063813025210084, + "grad_norm": 1.7734375, + "learning_rate": 0.00011951097012952683, + "loss": 1.7601, + "step": 30950 + }, + { + "epoch": 4.070378151260504, + "grad_norm": 2.828125, + "learning_rate": 0.00011937879989426382, + "loss": 1.7264, + "step": 31000 + }, + { + "epoch": 4.076943277310924, + "grad_norm": 2.453125, + "learning_rate": 0.00011924662965900079, + "loss": 1.7868, + "step": 31050 + }, + { + "epoch": 4.0835084033613445, + "grad_norm": 2.234375, + "learning_rate": 0.00011911445942373778, + "loss": 1.72, + "step": 31100 + }, + { + "epoch": 4.0900735294117645, + "grad_norm": 2.421875, + "learning_rate": 0.00011898228918847476, + "loss": 1.791, + "step": 31150 + }, + { + "epoch": 4.0966386554621845, + "grad_norm": 2.234375, + "learning_rate": 0.00011885011895321174, + "loss": 1.7327, + "step": 31200 + }, + { + "epoch": 4.1032037815126055, + "grad_norm": 2.578125, + "learning_rate": 0.00011871794871794872, + "loss": 1.7162, + "step": 31250 + }, + { + "epoch": 4.1097689075630255, + "grad_norm": 1.9296875, + "learning_rate": 0.00011858577848268571, + "loss": 1.7865, + "step": 31300 + }, + { + "epoch": 4.1163340336134455, + "grad_norm": 2.203125, + "learning_rate": 0.00011845360824742268, + "loss": 1.794, + "step": 31350 + }, + { + "epoch": 4.1228991596638656, + "grad_norm": 2.140625, + "learning_rate": 0.00011832143801215967, + "loss": 1.7417, + "step": 31400 + }, + { + "epoch": 4.129464285714286, + "grad_norm": 2.828125, + "learning_rate": 0.00011818926777689665, + "loss": 1.7886, + "step": 31450 + }, + { + "epoch": 4.136029411764706, + "grad_norm": 4.0625, + "learning_rate": 0.00011805709754163364, + "loss": 1.8182, + "step": 31500 + }, + { + "epoch": 4.142594537815126, + "grad_norm": 1.6796875, + "learning_rate": 0.00011792492730637061, + "loss": 1.7963, + "step": 31550 + }, + { + "epoch": 4.149159663865547, + "grad_norm": 3.265625, + "learning_rate": 0.0001177927570711076, + "loss": 1.6697, + "step": 31600 + }, + { + "epoch": 4.155724789915967, + "grad_norm": 2.234375, + "learning_rate": 0.00011766058683584458, + "loss": 1.7375, + "step": 31650 + }, + { + "epoch": 4.162289915966387, + "grad_norm": 2.28125, + "learning_rate": 0.00011752841660058156, + "loss": 1.8777, + "step": 31700 + }, + { + "epoch": 4.168855042016807, + "grad_norm": 3.125, + "learning_rate": 0.00011739624636531854, + "loss": 1.7834, + "step": 31750 + }, + { + "epoch": 4.175420168067227, + "grad_norm": 1.515625, + "learning_rate": 0.00011726407613005553, + "loss": 1.8656, + "step": 31800 + }, + { + "epoch": 4.181985294117647, + "grad_norm": 3.203125, + "learning_rate": 0.0001171319058947925, + "loss": 1.7457, + "step": 31850 + }, + { + "epoch": 4.188550420168067, + "grad_norm": 1.9921875, + "learning_rate": 0.00011699973565952949, + "loss": 1.7545, + "step": 31900 + }, + { + "epoch": 4.195115546218488, + "grad_norm": 2.5, + "learning_rate": 0.00011686756542426647, + "loss": 1.7942, + "step": 31950 + }, + { + "epoch": 4.201680672268908, + "grad_norm": 1.7265625, + "learning_rate": 0.00011673539518900346, + "loss": 1.8379, + "step": 32000 + }, + { + "epoch": 4.208245798319328, + "grad_norm": 2.890625, + "learning_rate": 0.00011660322495374043, + "loss": 1.8702, + "step": 32050 + }, + { + "epoch": 4.214810924369748, + "grad_norm": 3.84375, + "learning_rate": 0.0001164710547184774, + "loss": 1.8278, + "step": 32100 + }, + { + "epoch": 4.221376050420168, + "grad_norm": 1.96875, + "learning_rate": 0.0001163388844832144, + "loss": 1.7786, + "step": 32150 + }, + { + "epoch": 4.227941176470588, + "grad_norm": 2.953125, + "learning_rate": 0.00011620671424795137, + "loss": 1.8027, + "step": 32200 + }, + { + "epoch": 4.234506302521009, + "grad_norm": 3.125, + "learning_rate": 0.00011607454401268836, + "loss": 1.8067, + "step": 32250 + }, + { + "epoch": 4.241071428571429, + "grad_norm": 2.515625, + "learning_rate": 0.00011594237377742533, + "loss": 1.7544, + "step": 32300 + }, + { + "epoch": 4.247636554621849, + "grad_norm": 3.703125, + "learning_rate": 0.00011581020354216232, + "loss": 1.7908, + "step": 32350 + }, + { + "epoch": 4.254201680672269, + "grad_norm": 2.6875, + "learning_rate": 0.0001156780333068993, + "loss": 1.8138, + "step": 32400 + }, + { + "epoch": 4.260766806722689, + "grad_norm": 2.796875, + "learning_rate": 0.00011554586307163626, + "loss": 1.8029, + "step": 32450 + }, + { + "epoch": 4.267331932773109, + "grad_norm": 2.859375, + "learning_rate": 0.00011541369283637325, + "loss": 1.7928, + "step": 32500 + }, + { + "epoch": 4.273897058823529, + "grad_norm": 3.0, + "learning_rate": 0.00011528152260111022, + "loss": 1.7972, + "step": 32550 + }, + { + "epoch": 4.28046218487395, + "grad_norm": 1.921875, + "learning_rate": 0.00011514935236584721, + "loss": 1.7975, + "step": 32600 + }, + { + "epoch": 4.28702731092437, + "grad_norm": 3.015625, + "learning_rate": 0.00011501718213058419, + "loss": 1.8258, + "step": 32650 + }, + { + "epoch": 4.29359243697479, + "grad_norm": 2.34375, + "learning_rate": 0.00011488501189532118, + "loss": 1.829, + "step": 32700 + }, + { + "epoch": 4.30015756302521, + "grad_norm": 2.53125, + "learning_rate": 0.00011475284166005815, + "loss": 1.8588, + "step": 32750 + }, + { + "epoch": 4.30672268907563, + "grad_norm": 2.796875, + "learning_rate": 0.00011462067142479514, + "loss": 1.8308, + "step": 32800 + }, + { + "epoch": 4.31328781512605, + "grad_norm": 2.25, + "learning_rate": 0.00011448850118953212, + "loss": 1.7983, + "step": 32850 + }, + { + "epoch": 4.319852941176471, + "grad_norm": 3.078125, + "learning_rate": 0.0001143563309542691, + "loss": 1.8035, + "step": 32900 + }, + { + "epoch": 4.326418067226891, + "grad_norm": 2.96875, + "learning_rate": 0.00011422416071900608, + "loss": 1.7907, + "step": 32950 + }, + { + "epoch": 4.332983193277311, + "grad_norm": 2.21875, + "learning_rate": 0.00011409199048374305, + "loss": 1.8111, + "step": 33000 + }, + { + "epoch": 4.339548319327731, + "grad_norm": 2.359375, + "learning_rate": 0.00011395982024848004, + "loss": 1.7474, + "step": 33050 + }, + { + "epoch": 4.346113445378151, + "grad_norm": 3.078125, + "learning_rate": 0.00011382765001321702, + "loss": 1.903, + "step": 33100 + }, + { + "epoch": 4.352678571428571, + "grad_norm": 2.78125, + "learning_rate": 0.00011369547977795401, + "loss": 1.8627, + "step": 33150 + }, + { + "epoch": 4.359243697478991, + "grad_norm": 2.109375, + "learning_rate": 0.00011356330954269098, + "loss": 1.8208, + "step": 33200 + }, + { + "epoch": 4.365808823529412, + "grad_norm": 2.09375, + "learning_rate": 0.00011343113930742797, + "loss": 1.8314, + "step": 33250 + }, + { + "epoch": 4.372373949579832, + "grad_norm": 1.8984375, + "learning_rate": 0.00011329896907216495, + "loss": 1.7919, + "step": 33300 + }, + { + "epoch": 4.378939075630252, + "grad_norm": 1.9453125, + "learning_rate": 0.00011316679883690194, + "loss": 1.8027, + "step": 33350 + }, + { + "epoch": 4.385504201680672, + "grad_norm": 2.09375, + "learning_rate": 0.00011303462860163891, + "loss": 1.8715, + "step": 33400 + }, + { + "epoch": 4.392069327731092, + "grad_norm": 3.796875, + "learning_rate": 0.0001129024583663759, + "loss": 1.8567, + "step": 33450 + }, + { + "epoch": 4.398634453781512, + "grad_norm": 2.78125, + "learning_rate": 0.00011277028813111287, + "loss": 1.8811, + "step": 33500 + }, + { + "epoch": 4.405199579831933, + "grad_norm": 2.1875, + "learning_rate": 0.00011263811789584986, + "loss": 1.8409, + "step": 33550 + }, + { + "epoch": 4.411764705882353, + "grad_norm": 2.25, + "learning_rate": 0.00011250594766058684, + "loss": 1.9408, + "step": 33600 + }, + { + "epoch": 4.418329831932773, + "grad_norm": 2.890625, + "learning_rate": 0.00011237377742532383, + "loss": 1.7322, + "step": 33650 + }, + { + "epoch": 4.424894957983193, + "grad_norm": 3.3125, + "learning_rate": 0.0001122416071900608, + "loss": 1.8245, + "step": 33700 + }, + { + "epoch": 4.431460084033613, + "grad_norm": 2.359375, + "learning_rate": 0.00011210943695479779, + "loss": 1.8147, + "step": 33750 + }, + { + "epoch": 4.438025210084033, + "grad_norm": 3.375, + "learning_rate": 0.00011197726671953477, + "loss": 1.8319, + "step": 33800 + }, + { + "epoch": 4.444590336134453, + "grad_norm": 2.0, + "learning_rate": 0.00011184509648427175, + "loss": 1.7941, + "step": 33850 + }, + { + "epoch": 4.451155462184874, + "grad_norm": 4.1875, + "learning_rate": 0.00011171292624900873, + "loss": 1.8681, + "step": 33900 + }, + { + "epoch": 4.457720588235294, + "grad_norm": 4.03125, + "learning_rate": 0.00011158075601374572, + "loss": 1.8473, + "step": 33950 + }, + { + "epoch": 4.464285714285714, + "grad_norm": 1.9609375, + "learning_rate": 0.0001114485857784827, + "loss": 1.825, + "step": 34000 + }, + { + "epoch": 4.4708508403361344, + "grad_norm": 2.84375, + "learning_rate": 0.00011131641554321968, + "loss": 1.8312, + "step": 34050 + }, + { + "epoch": 4.4774159663865545, + "grad_norm": 2.796875, + "learning_rate": 0.00011118424530795666, + "loss": 1.8738, + "step": 34100 + }, + { + "epoch": 4.4839810924369745, + "grad_norm": 2.578125, + "learning_rate": 0.00011105207507269363, + "loss": 1.8026, + "step": 34150 + }, + { + "epoch": 4.4905462184873945, + "grad_norm": 1.890625, + "learning_rate": 0.00011091990483743062, + "loss": 1.812, + "step": 34200 + }, + { + "epoch": 4.4971113445378155, + "grad_norm": 2.125, + "learning_rate": 0.0001107877346021676, + "loss": 1.852, + "step": 34250 + }, + { + "epoch": 4.5036764705882355, + "grad_norm": 3.125, + "learning_rate": 0.00011065556436690459, + "loss": 1.8035, + "step": 34300 + }, + { + "epoch": 4.5102415966386555, + "grad_norm": 3.84375, + "learning_rate": 0.00011052339413164156, + "loss": 1.8066, + "step": 34350 + }, + { + "epoch": 4.516806722689076, + "grad_norm": 2.25, + "learning_rate": 0.00011039122389637855, + "loss": 1.8587, + "step": 34400 + }, + { + "epoch": 4.523371848739496, + "grad_norm": 1.9765625, + "learning_rate": 0.00011025905366111552, + "loss": 1.8111, + "step": 34450 + }, + { + "epoch": 4.529936974789916, + "grad_norm": 2.234375, + "learning_rate": 0.00011012688342585251, + "loss": 1.8182, + "step": 34500 + }, + { + "epoch": 4.536502100840336, + "grad_norm": 2.21875, + "learning_rate": 0.00010999471319058949, + "loss": 1.8458, + "step": 34550 + }, + { + "epoch": 4.543067226890757, + "grad_norm": 2.015625, + "learning_rate": 0.00010986254295532648, + "loss": 1.8681, + "step": 34600 + }, + { + "epoch": 4.549632352941177, + "grad_norm": 1.953125, + "learning_rate": 0.00010973037272006345, + "loss": 1.8308, + "step": 34650 + }, + { + "epoch": 4.556197478991597, + "grad_norm": 1.9765625, + "learning_rate": 0.00010959820248480044, + "loss": 1.8915, + "step": 34700 + }, + { + "epoch": 4.562762605042017, + "grad_norm": 2.203125, + "learning_rate": 0.00010946603224953742, + "loss": 1.8739, + "step": 34750 + }, + { + "epoch": 4.569327731092437, + "grad_norm": 2.859375, + "learning_rate": 0.00010933386201427438, + "loss": 1.8601, + "step": 34800 + }, + { + "epoch": 4.575892857142857, + "grad_norm": 2.125, + "learning_rate": 0.00010920169177901137, + "loss": 1.7981, + "step": 34850 + }, + { + "epoch": 4.582457983193278, + "grad_norm": 2.296875, + "learning_rate": 0.00010906952154374834, + "loss": 1.8601, + "step": 34900 + }, + { + "epoch": 4.589023109243698, + "grad_norm": 1.9921875, + "learning_rate": 0.00010893735130848533, + "loss": 1.892, + "step": 34950 + }, + { + "epoch": 4.595588235294118, + "grad_norm": 2.59375, + "learning_rate": 0.0001088051810732223, + "loss": 1.8538, + "step": 35000 + }, + { + "epoch": 4.602153361344538, + "grad_norm": 2.578125, + "learning_rate": 0.00010867301083795928, + "loss": 1.8857, + "step": 35050 + }, + { + "epoch": 4.608718487394958, + "grad_norm": 2.953125, + "learning_rate": 0.00010854084060269627, + "loss": 1.8731, + "step": 35100 + }, + { + "epoch": 4.615283613445378, + "grad_norm": 2.171875, + "learning_rate": 0.00010840867036743325, + "loss": 1.9334, + "step": 35150 + }, + { + "epoch": 4.621848739495798, + "grad_norm": 2.078125, + "learning_rate": 0.00010827650013217023, + "loss": 1.9154, + "step": 35200 + }, + { + "epoch": 4.628413865546219, + "grad_norm": 1.6875, + "learning_rate": 0.00010814432989690721, + "loss": 1.8874, + "step": 35250 + }, + { + "epoch": 4.634978991596639, + "grad_norm": 3.734375, + "learning_rate": 0.0001080121596616442, + "loss": 1.8365, + "step": 35300 + }, + { + "epoch": 4.641544117647059, + "grad_norm": 2.359375, + "learning_rate": 0.00010787998942638117, + "loss": 1.9075, + "step": 35350 + }, + { + "epoch": 4.648109243697479, + "grad_norm": 3.46875, + "learning_rate": 0.00010774781919111816, + "loss": 1.7622, + "step": 35400 + }, + { + "epoch": 4.654674369747899, + "grad_norm": 2.09375, + "learning_rate": 0.00010761564895585514, + "loss": 1.8269, + "step": 35450 + }, + { + "epoch": 4.661239495798319, + "grad_norm": 2.5625, + "learning_rate": 0.00010748347872059213, + "loss": 1.8732, + "step": 35500 + }, + { + "epoch": 4.66780462184874, + "grad_norm": 2.0625, + "learning_rate": 0.0001073513084853291, + "loss": 1.9295, + "step": 35550 + }, + { + "epoch": 4.67436974789916, + "grad_norm": 2.359375, + "learning_rate": 0.00010721913825006609, + "loss": 1.8487, + "step": 35600 + }, + { + "epoch": 4.68093487394958, + "grad_norm": 2.953125, + "learning_rate": 0.00010708696801480307, + "loss": 1.8199, + "step": 35650 + }, + { + "epoch": 4.6875, + "grad_norm": 1.921875, + "learning_rate": 0.00010695479777954005, + "loss": 1.8303, + "step": 35700 + }, + { + "epoch": 4.69406512605042, + "grad_norm": 1.7890625, + "learning_rate": 0.00010682262754427703, + "loss": 1.8325, + "step": 35750 + }, + { + "epoch": 4.70063025210084, + "grad_norm": 2.421875, + "learning_rate": 0.00010669045730901402, + "loss": 1.9103, + "step": 35800 + }, + { + "epoch": 4.70719537815126, + "grad_norm": 1.5390625, + "learning_rate": 0.00010655828707375099, + "loss": 1.9129, + "step": 35850 + }, + { + "epoch": 4.713760504201681, + "grad_norm": 2.609375, + "learning_rate": 0.00010642611683848798, + "loss": 1.8434, + "step": 35900 + }, + { + "epoch": 4.720325630252101, + "grad_norm": 3.234375, + "learning_rate": 0.00010629394660322496, + "loss": 1.8424, + "step": 35950 + }, + { + "epoch": 4.726890756302521, + "grad_norm": 2.421875, + "learning_rate": 0.00010616177636796195, + "loss": 1.8432, + "step": 36000 + }, + { + "epoch": 4.733455882352941, + "grad_norm": 2.65625, + "learning_rate": 0.00010602960613269892, + "loss": 1.9235, + "step": 36050 + }, + { + "epoch": 4.740021008403361, + "grad_norm": 2.71875, + "learning_rate": 0.00010589743589743591, + "loss": 1.9234, + "step": 36100 + }, + { + "epoch": 4.746586134453781, + "grad_norm": 2.90625, + "learning_rate": 0.00010576526566217288, + "loss": 1.8803, + "step": 36150 + }, + { + "epoch": 4.753151260504202, + "grad_norm": 2.40625, + "learning_rate": 0.00010563309542690987, + "loss": 1.8847, + "step": 36200 + }, + { + "epoch": 4.759716386554622, + "grad_norm": 2.65625, + "learning_rate": 0.00010550092519164685, + "loss": 1.9664, + "step": 36250 + }, + { + "epoch": 4.766281512605042, + "grad_norm": 1.53125, + "learning_rate": 0.00010536875495638382, + "loss": 1.9284, + "step": 36300 + }, + { + "epoch": 4.772846638655462, + "grad_norm": 2.03125, + "learning_rate": 0.00010523658472112081, + "loss": 1.8572, + "step": 36350 + }, + { + "epoch": 4.779411764705882, + "grad_norm": 2.78125, + "learning_rate": 0.00010510441448585779, + "loss": 1.8823, + "step": 36400 + }, + { + "epoch": 4.785976890756302, + "grad_norm": 2.234375, + "learning_rate": 0.00010497224425059478, + "loss": 1.908, + "step": 36450 + }, + { + "epoch": 4.792542016806722, + "grad_norm": 2.09375, + "learning_rate": 0.00010484007401533175, + "loss": 1.8155, + "step": 36500 + }, + { + "epoch": 4.799107142857143, + "grad_norm": 4.4375, + "learning_rate": 0.00010470790378006874, + "loss": 1.8574, + "step": 36550 + }, + { + "epoch": 4.805672268907563, + "grad_norm": 1.8828125, + "learning_rate": 0.00010457573354480572, + "loss": 1.8486, + "step": 36600 + }, + { + "epoch": 4.812237394957983, + "grad_norm": 2.359375, + "learning_rate": 0.0001044435633095427, + "loss": 1.8647, + "step": 36650 + }, + { + "epoch": 4.818802521008403, + "grad_norm": 1.96875, + "learning_rate": 0.00010431139307427968, + "loss": 1.9243, + "step": 36700 + }, + { + "epoch": 4.825367647058823, + "grad_norm": 2.265625, + "learning_rate": 0.00010417922283901667, + "loss": 1.9146, + "step": 36750 + }, + { + "epoch": 4.831932773109243, + "grad_norm": 2.359375, + "learning_rate": 0.00010404705260375364, + "loss": 1.8984, + "step": 36800 + }, + { + "epoch": 4.838497899159664, + "grad_norm": 2.21875, + "learning_rate": 0.00010391488236849063, + "loss": 1.8802, + "step": 36850 + }, + { + "epoch": 4.845063025210084, + "grad_norm": 2.125, + "learning_rate": 0.00010378271213322761, + "loss": 1.8742, + "step": 36900 + }, + { + "epoch": 4.851628151260504, + "grad_norm": 2.25, + "learning_rate": 0.0001036505418979646, + "loss": 1.884, + "step": 36950 + }, + { + "epoch": 4.858193277310924, + "grad_norm": 1.8359375, + "learning_rate": 0.00010351837166270157, + "loss": 1.9064, + "step": 37000 + }, + { + "epoch": 4.8647584033613445, + "grad_norm": 2.234375, + "learning_rate": 0.00010338620142743856, + "loss": 1.8883, + "step": 37050 + }, + { + "epoch": 4.8713235294117645, + "grad_norm": 2.5625, + "learning_rate": 0.00010325403119217554, + "loss": 1.8774, + "step": 37100 + }, + { + "epoch": 4.8778886554621845, + "grad_norm": 2.625, + "learning_rate": 0.0001031218609569125, + "loss": 1.9345, + "step": 37150 + }, + { + "epoch": 4.884453781512605, + "grad_norm": 2.890625, + "learning_rate": 0.00010298969072164947, + "loss": 1.8626, + "step": 37200 + }, + { + "epoch": 4.8910189075630255, + "grad_norm": 2.203125, + "learning_rate": 0.00010285752048638646, + "loss": 1.9422, + "step": 37250 + }, + { + "epoch": 4.8975840336134455, + "grad_norm": 2.921875, + "learning_rate": 0.00010272535025112344, + "loss": 1.8108, + "step": 37300 + }, + { + "epoch": 4.9041491596638656, + "grad_norm": 1.8203125, + "learning_rate": 0.00010259318001586043, + "loss": 1.8922, + "step": 37350 + }, + { + "epoch": 4.910714285714286, + "grad_norm": 3.046875, + "learning_rate": 0.0001024610097805974, + "loss": 1.8552, + "step": 37400 + }, + { + "epoch": 4.917279411764706, + "grad_norm": 1.5390625, + "learning_rate": 0.00010232883954533439, + "loss": 1.8918, + "step": 37450 + }, + { + "epoch": 4.9238445378151265, + "grad_norm": 1.953125, + "learning_rate": 0.00010219666931007136, + "loss": 1.9388, + "step": 37500 + }, + { + "epoch": 4.930409663865547, + "grad_norm": 2.328125, + "learning_rate": 0.00010206449907480835, + "loss": 1.9304, + "step": 37550 + }, + { + "epoch": 4.936974789915967, + "grad_norm": 2.734375, + "learning_rate": 0.00010193232883954533, + "loss": 1.8836, + "step": 37600 + }, + { + "epoch": 4.943539915966387, + "grad_norm": 3.109375, + "learning_rate": 0.00010180015860428232, + "loss": 1.849, + "step": 37650 + }, + { + "epoch": 4.950105042016807, + "grad_norm": 2.328125, + "learning_rate": 0.00010166798836901929, + "loss": 1.8299, + "step": 37700 + }, + { + "epoch": 4.956670168067227, + "grad_norm": 2.546875, + "learning_rate": 0.00010153581813375628, + "loss": 1.8335, + "step": 37750 + }, + { + "epoch": 4.963235294117647, + "grad_norm": 2.625, + "learning_rate": 0.00010140364789849326, + "loss": 1.8908, + "step": 37800 + }, + { + "epoch": 4.969800420168067, + "grad_norm": 3.046875, + "learning_rate": 0.00010127147766323024, + "loss": 1.8803, + "step": 37850 + }, + { + "epoch": 4.976365546218488, + "grad_norm": 2.5625, + "learning_rate": 0.00010113930742796722, + "loss": 1.8904, + "step": 37900 + }, + { + "epoch": 4.982930672268908, + "grad_norm": 1.796875, + "learning_rate": 0.00010100713719270421, + "loss": 1.899, + "step": 37950 + }, + { + "epoch": 4.989495798319328, + "grad_norm": 2.265625, + "learning_rate": 0.00010087496695744118, + "loss": 1.8902, + "step": 38000 + }, + { + "epoch": 4.996060924369748, + "grad_norm": 2.140625, + "learning_rate": 0.00010074279672217817, + "loss": 1.9128, + "step": 38050 + }, + { + "epoch": 5.002626050420168, + "grad_norm": 3.109375, + "learning_rate": 0.00010061062648691515, + "loss": 1.6909, + "step": 38100 + }, + { + "epoch": 5.009191176470588, + "grad_norm": 2.5625, + "learning_rate": 0.00010047845625165214, + "loss": 1.3461, + "step": 38150 + }, + { + "epoch": 5.015756302521009, + "grad_norm": 3.46875, + "learning_rate": 0.00010034628601638911, + "loss": 1.4187, + "step": 38200 + }, + { + "epoch": 5.022321428571429, + "grad_norm": 2.375, + "learning_rate": 0.0001002141157811261, + "loss": 1.3511, + "step": 38250 + }, + { + "epoch": 5.028886554621849, + "grad_norm": 2.921875, + "learning_rate": 0.00010008194554586308, + "loss": 1.336, + "step": 38300 + }, + { + "epoch": 5.035451680672269, + "grad_norm": 3.828125, + "learning_rate": 9.994977531060006e-05, + "loss": 1.4256, + "step": 38350 + }, + { + "epoch": 5.042016806722689, + "grad_norm": 2.953125, + "learning_rate": 9.981760507533704e-05, + "loss": 1.3989, + "step": 38400 + }, + { + "epoch": 5.048581932773109, + "grad_norm": 2.71875, + "learning_rate": 9.968543484007401e-05, + "loss": 1.3702, + "step": 38450 + }, + { + "epoch": 5.055147058823529, + "grad_norm": 2.703125, + "learning_rate": 9.9553264604811e-05, + "loss": 1.3758, + "step": 38500 + }, + { + "epoch": 5.06171218487395, + "grad_norm": 2.265625, + "learning_rate": 9.942109436954798e-05, + "loss": 1.4057, + "step": 38550 + }, + { + "epoch": 5.06827731092437, + "grad_norm": 3.03125, + "learning_rate": 9.928892413428497e-05, + "loss": 1.4083, + "step": 38600 + }, + { + "epoch": 5.07484243697479, + "grad_norm": 2.25, + "learning_rate": 9.915675389902194e-05, + "loss": 1.3849, + "step": 38650 + }, + { + "epoch": 5.08140756302521, + "grad_norm": 3.15625, + "learning_rate": 9.902458366375893e-05, + "loss": 1.3812, + "step": 38700 + }, + { + "epoch": 5.08797268907563, + "grad_norm": 2.828125, + "learning_rate": 9.889241342849591e-05, + "loss": 1.4723, + "step": 38750 + }, + { + "epoch": 5.09453781512605, + "grad_norm": 2.75, + "learning_rate": 9.87602431932329e-05, + "loss": 1.3678, + "step": 38800 + }, + { + "epoch": 5.101102941176471, + "grad_norm": 2.5625, + "learning_rate": 9.862807295796987e-05, + "loss": 1.4548, + "step": 38850 + }, + { + "epoch": 5.107668067226891, + "grad_norm": 3.234375, + "learning_rate": 9.849590272270686e-05, + "loss": 1.4095, + "step": 38900 + }, + { + "epoch": 5.114233193277311, + "grad_norm": 3.90625, + "learning_rate": 9.836373248744382e-05, + "loss": 1.3976, + "step": 38950 + }, + { + "epoch": 5.120798319327731, + "grad_norm": 2.828125, + "learning_rate": 9.823156225218081e-05, + "loss": 1.4269, + "step": 39000 + }, + { + "epoch": 5.127363445378151, + "grad_norm": 2.625, + "learning_rate": 9.809939201691779e-05, + "loss": 1.4049, + "step": 39050 + }, + { + "epoch": 5.133928571428571, + "grad_norm": 2.921875, + "learning_rate": 9.796722178165477e-05, + "loss": 1.3801, + "step": 39100 + }, + { + "epoch": 5.140493697478991, + "grad_norm": 3.296875, + "learning_rate": 9.783505154639175e-05, + "loss": 1.4323, + "step": 39150 + }, + { + "epoch": 5.147058823529412, + "grad_norm": 3.015625, + "learning_rate": 9.770288131112874e-05, + "loss": 1.4407, + "step": 39200 + }, + { + "epoch": 5.153623949579832, + "grad_norm": 2.671875, + "learning_rate": 9.757071107586571e-05, + "loss": 1.4843, + "step": 39250 + }, + { + "epoch": 5.160189075630252, + "grad_norm": 2.90625, + "learning_rate": 9.74385408406027e-05, + "loss": 1.4161, + "step": 39300 + }, + { + "epoch": 5.166754201680672, + "grad_norm": 2.640625, + "learning_rate": 9.730637060533968e-05, + "loss": 1.426, + "step": 39350 + }, + { + "epoch": 5.173319327731092, + "grad_norm": 3.328125, + "learning_rate": 9.717420037007667e-05, + "loss": 1.4223, + "step": 39400 + }, + { + "epoch": 5.179884453781512, + "grad_norm": 2.453125, + "learning_rate": 9.704203013481364e-05, + "loss": 1.4874, + "step": 39450 + }, + { + "epoch": 5.186449579831933, + "grad_norm": 3.203125, + "learning_rate": 9.690985989955063e-05, + "loss": 1.4137, + "step": 39500 + }, + { + "epoch": 5.193014705882353, + "grad_norm": 2.921875, + "learning_rate": 9.67776896642876e-05, + "loss": 1.4493, + "step": 39550 + }, + { + "epoch": 5.199579831932773, + "grad_norm": 2.734375, + "learning_rate": 9.66455194290246e-05, + "loss": 1.4104, + "step": 39600 + }, + { + "epoch": 5.206144957983193, + "grad_norm": 2.859375, + "learning_rate": 9.651334919376157e-05, + "loss": 1.4584, + "step": 39650 + }, + { + "epoch": 5.212710084033613, + "grad_norm": 3.3125, + "learning_rate": 9.638117895849856e-05, + "loss": 1.4498, + "step": 39700 + }, + { + "epoch": 5.219275210084033, + "grad_norm": 2.84375, + "learning_rate": 9.624900872323553e-05, + "loss": 1.4353, + "step": 39750 + }, + { + "epoch": 5.225840336134453, + "grad_norm": 3.140625, + "learning_rate": 9.611683848797252e-05, + "loss": 1.3824, + "step": 39800 + }, + { + "epoch": 5.232405462184874, + "grad_norm": 3.421875, + "learning_rate": 9.59846682527095e-05, + "loss": 1.4526, + "step": 39850 + }, + { + "epoch": 5.238970588235294, + "grad_norm": 2.3125, + "learning_rate": 9.585249801744649e-05, + "loss": 1.4452, + "step": 39900 + }, + { + "epoch": 5.245535714285714, + "grad_norm": 2.375, + "learning_rate": 9.572032778218346e-05, + "loss": 1.4672, + "step": 39950 + }, + { + "epoch": 5.2521008403361344, + "grad_norm": 2.6875, + "learning_rate": 9.558815754692045e-05, + "loss": 1.455, + "step": 40000 + }, + { + "epoch": 5.2586659663865545, + "grad_norm": 2.640625, + "learning_rate": 9.545598731165742e-05, + "loss": 1.5045, + "step": 40050 + }, + { + "epoch": 5.2652310924369745, + "grad_norm": 2.3125, + "learning_rate": 9.53238170763944e-05, + "loss": 1.4684, + "step": 40100 + }, + { + "epoch": 5.2717962184873945, + "grad_norm": 3.09375, + "learning_rate": 9.519164684113137e-05, + "loss": 1.3692, + "step": 40150 + }, + { + "epoch": 5.2783613445378155, + "grad_norm": 3.09375, + "learning_rate": 9.505947660586836e-05, + "loss": 1.3653, + "step": 40200 + }, + { + "epoch": 5.2849264705882355, + "grad_norm": 3.453125, + "learning_rate": 9.492730637060534e-05, + "loss": 1.4225, + "step": 40250 + }, + { + "epoch": 5.2914915966386555, + "grad_norm": 2.8125, + "learning_rate": 9.479513613534233e-05, + "loss": 1.416, + "step": 40300 + }, + { + "epoch": 5.298056722689076, + "grad_norm": 2.921875, + "learning_rate": 9.46629659000793e-05, + "loss": 1.4618, + "step": 40350 + }, + { + "epoch": 5.304621848739496, + "grad_norm": 2.96875, + "learning_rate": 9.453079566481629e-05, + "loss": 1.4527, + "step": 40400 + }, + { + "epoch": 5.311186974789916, + "grad_norm": 2.890625, + "learning_rate": 9.439862542955327e-05, + "loss": 1.4499, + "step": 40450 + }, + { + "epoch": 5.317752100840336, + "grad_norm": 2.9375, + "learning_rate": 9.426645519429024e-05, + "loss": 1.4557, + "step": 40500 + }, + { + "epoch": 5.324317226890757, + "grad_norm": 2.828125, + "learning_rate": 9.413428495902723e-05, + "loss": 1.4044, + "step": 40550 + }, + { + "epoch": 5.330882352941177, + "grad_norm": 2.375, + "learning_rate": 9.40021147237642e-05, + "loss": 1.4238, + "step": 40600 + }, + { + "epoch": 5.337447478991597, + "grad_norm": 3.859375, + "learning_rate": 9.38699444885012e-05, + "loss": 1.5172, + "step": 40650 + }, + { + "epoch": 5.344012605042017, + "grad_norm": 3.21875, + "learning_rate": 9.373777425323817e-05, + "loss": 1.4602, + "step": 40700 + }, + { + "epoch": 5.350577731092437, + "grad_norm": 2.953125, + "learning_rate": 9.360560401797516e-05, + "loss": 1.435, + "step": 40750 + }, + { + "epoch": 5.357142857142857, + "grad_norm": 2.796875, + "learning_rate": 9.347343378271213e-05, + "loss": 1.4009, + "step": 40800 + }, + { + "epoch": 5.363707983193278, + "grad_norm": 2.5625, + "learning_rate": 9.334126354744912e-05, + "loss": 1.5033, + "step": 40850 + }, + { + "epoch": 5.370273109243698, + "grad_norm": 2.828125, + "learning_rate": 9.32090933121861e-05, + "loss": 1.5243, + "step": 40900 + }, + { + "epoch": 5.376838235294118, + "grad_norm": 2.390625, + "learning_rate": 9.307692307692309e-05, + "loss": 1.512, + "step": 40950 + }, + { + "epoch": 5.383403361344538, + "grad_norm": 3.96875, + "learning_rate": 9.294475284166006e-05, + "loss": 1.4817, + "step": 41000 + }, + { + "epoch": 5.389968487394958, + "grad_norm": 3.21875, + "learning_rate": 9.281258260639705e-05, + "loss": 1.4368, + "step": 41050 + }, + { + "epoch": 5.396533613445378, + "grad_norm": 3.171875, + "learning_rate": 9.268041237113403e-05, + "loss": 1.4665, + "step": 41100 + }, + { + "epoch": 5.403098739495798, + "grad_norm": 4.0625, + "learning_rate": 9.254824213587101e-05, + "loss": 1.5117, + "step": 41150 + }, + { + "epoch": 5.409663865546219, + "grad_norm": 3.171875, + "learning_rate": 9.241607190060799e-05, + "loss": 1.4644, + "step": 41200 + }, + { + "epoch": 5.416228991596639, + "grad_norm": 2.6875, + "learning_rate": 9.228390166534498e-05, + "loss": 1.461, + "step": 41250 + }, + { + "epoch": 5.422794117647059, + "grad_norm": 3.640625, + "learning_rate": 9.215173143008194e-05, + "loss": 1.4493, + "step": 41300 + }, + { + "epoch": 5.429359243697479, + "grad_norm": 2.90625, + "learning_rate": 9.201956119481893e-05, + "loss": 1.49, + "step": 41350 + }, + { + "epoch": 5.435924369747899, + "grad_norm": 5.625, + "learning_rate": 9.18873909595559e-05, + "loss": 1.4682, + "step": 41400 + }, + { + "epoch": 5.442489495798319, + "grad_norm": 3.28125, + "learning_rate": 9.175522072429289e-05, + "loss": 1.4381, + "step": 41450 + }, + { + "epoch": 5.44905462184874, + "grad_norm": 2.546875, + "learning_rate": 9.162305048902987e-05, + "loss": 1.4631, + "step": 41500 + }, + { + "epoch": 5.45561974789916, + "grad_norm": 3.234375, + "learning_rate": 9.149088025376686e-05, + "loss": 1.4942, + "step": 41550 + }, + { + "epoch": 5.46218487394958, + "grad_norm": 3.421875, + "learning_rate": 9.135871001850383e-05, + "loss": 1.4826, + "step": 41600 + }, + { + "epoch": 5.46875, + "grad_norm": 2.578125, + "learning_rate": 9.122653978324082e-05, + "loss": 1.5446, + "step": 41650 + }, + { + "epoch": 5.47531512605042, + "grad_norm": 2.90625, + "learning_rate": 9.10943695479778e-05, + "loss": 1.5059, + "step": 41700 + }, + { + "epoch": 5.48188025210084, + "grad_norm": 3.28125, + "learning_rate": 9.096219931271478e-05, + "loss": 1.5444, + "step": 41750 + }, + { + "epoch": 5.48844537815126, + "grad_norm": 2.40625, + "learning_rate": 9.083002907745176e-05, + "loss": 1.5206, + "step": 41800 + }, + { + "epoch": 5.495010504201681, + "grad_norm": 3.25, + "learning_rate": 9.069785884218875e-05, + "loss": 1.4863, + "step": 41850 + }, + { + "epoch": 5.501575630252101, + "grad_norm": 2.78125, + "learning_rate": 9.056568860692572e-05, + "loss": 1.4104, + "step": 41900 + }, + { + "epoch": 5.508140756302521, + "grad_norm": 3.40625, + "learning_rate": 9.043351837166271e-05, + "loss": 1.5224, + "step": 41950 + }, + { + "epoch": 5.514705882352941, + "grad_norm": 3.546875, + "learning_rate": 9.030134813639969e-05, + "loss": 1.5038, + "step": 42000 + }, + { + "epoch": 5.521271008403361, + "grad_norm": 3.515625, + "learning_rate": 9.016917790113668e-05, + "loss": 1.4301, + "step": 42050 + }, + { + "epoch": 5.527836134453781, + "grad_norm": 3.734375, + "learning_rate": 9.003700766587365e-05, + "loss": 1.5132, + "step": 42100 + }, + { + "epoch": 5.534401260504202, + "grad_norm": 2.796875, + "learning_rate": 8.990483743061063e-05, + "loss": 1.5198, + "step": 42150 + }, + { + "epoch": 5.540966386554622, + "grad_norm": 3.125, + "learning_rate": 8.977266719534762e-05, + "loss": 1.4996, + "step": 42200 + }, + { + "epoch": 5.547531512605042, + "grad_norm": 3.015625, + "learning_rate": 8.964049696008459e-05, + "loss": 1.535, + "step": 42250 + }, + { + "epoch": 5.554096638655462, + "grad_norm": 3.15625, + "learning_rate": 8.950832672482158e-05, + "loss": 1.4931, + "step": 42300 + }, + { + "epoch": 5.560661764705882, + "grad_norm": 3.171875, + "learning_rate": 8.937615648955855e-05, + "loss": 1.4804, + "step": 42350 + }, + { + "epoch": 5.567226890756302, + "grad_norm": 3.140625, + "learning_rate": 8.924398625429554e-05, + "loss": 1.5138, + "step": 42400 + }, + { + "epoch": 5.573792016806722, + "grad_norm": 3.15625, + "learning_rate": 8.911181601903252e-05, + "loss": 1.498, + "step": 42450 + }, + { + "epoch": 5.580357142857143, + "grad_norm": 2.71875, + "learning_rate": 8.89796457837695e-05, + "loss": 1.524, + "step": 42500 + }, + { + "epoch": 5.586922268907563, + "grad_norm": 2.484375, + "learning_rate": 8.884747554850648e-05, + "loss": 1.4919, + "step": 42550 + }, + { + "epoch": 5.593487394957983, + "grad_norm": 1.8046875, + "learning_rate": 8.871530531324346e-05, + "loss": 1.4701, + "step": 42600 + }, + { + "epoch": 5.600052521008403, + "grad_norm": 2.1875, + "learning_rate": 8.858313507798043e-05, + "loss": 1.5287, + "step": 42650 + }, + { + "epoch": 5.606617647058823, + "grad_norm": 2.828125, + "learning_rate": 8.845096484271742e-05, + "loss": 1.4986, + "step": 42700 + }, + { + "epoch": 5.613182773109243, + "grad_norm": 2.703125, + "learning_rate": 8.83187946074544e-05, + "loss": 1.604, + "step": 42750 + }, + { + "epoch": 5.619747899159664, + "grad_norm": 3.4375, + "learning_rate": 8.818662437219139e-05, + "loss": 1.5137, + "step": 42800 + }, + { + "epoch": 5.626313025210084, + "grad_norm": 3.484375, + "learning_rate": 8.805445413692836e-05, + "loss": 1.5157, + "step": 42850 + }, + { + "epoch": 5.632878151260504, + "grad_norm": 1.71875, + "learning_rate": 8.792228390166535e-05, + "loss": 1.5347, + "step": 42900 + }, + { + "epoch": 5.639443277310924, + "grad_norm": 2.484375, + "learning_rate": 8.779011366640232e-05, + "loss": 1.4842, + "step": 42950 + }, + { + "epoch": 5.6460084033613445, + "grad_norm": 2.59375, + "learning_rate": 8.765794343113931e-05, + "loss": 1.4893, + "step": 43000 + }, + { + "epoch": 5.6525735294117645, + "grad_norm": 2.34375, + "learning_rate": 8.752577319587629e-05, + "loss": 1.5574, + "step": 43050 + }, + { + "epoch": 5.6591386554621845, + "grad_norm": 2.984375, + "learning_rate": 8.739360296061328e-05, + "loss": 1.5025, + "step": 43100 + }, + { + "epoch": 5.665703781512605, + "grad_norm": 2.859375, + "learning_rate": 8.726143272535025e-05, + "loss": 1.5131, + "step": 43150 + }, + { + "epoch": 5.6722689075630255, + "grad_norm": 2.765625, + "learning_rate": 8.712926249008724e-05, + "loss": 1.5278, + "step": 43200 + }, + { + "epoch": 5.6788340336134455, + "grad_norm": 4.1875, + "learning_rate": 8.699709225482422e-05, + "loss": 1.5521, + "step": 43250 + }, + { + "epoch": 5.6853991596638656, + "grad_norm": 3.234375, + "learning_rate": 8.68649220195612e-05, + "loss": 1.4944, + "step": 43300 + }, + { + "epoch": 5.691964285714286, + "grad_norm": 2.03125, + "learning_rate": 8.673275178429818e-05, + "loss": 1.5561, + "step": 43350 + }, + { + "epoch": 5.698529411764706, + "grad_norm": 2.890625, + "learning_rate": 8.660058154903517e-05, + "loss": 1.4284, + "step": 43400 + }, + { + "epoch": 5.7050945378151265, + "grad_norm": 5.4375, + "learning_rate": 8.646841131377214e-05, + "loss": 1.5662, + "step": 43450 + }, + { + "epoch": 5.711659663865547, + "grad_norm": 2.234375, + "learning_rate": 8.633624107850913e-05, + "loss": 1.5686, + "step": 43500 + }, + { + "epoch": 5.718224789915967, + "grad_norm": 4.0, + "learning_rate": 8.620407084324611e-05, + "loss": 1.5144, + "step": 43550 + }, + { + "epoch": 5.724789915966387, + "grad_norm": 3.875, + "learning_rate": 8.60719006079831e-05, + "loss": 1.5455, + "step": 43600 + }, + { + "epoch": 5.731355042016807, + "grad_norm": 3.0, + "learning_rate": 8.593973037272007e-05, + "loss": 1.5595, + "step": 43650 + }, + { + "epoch": 5.737920168067227, + "grad_norm": 2.828125, + "learning_rate": 8.580756013745705e-05, + "loss": 1.5109, + "step": 43700 + }, + { + "epoch": 5.744485294117647, + "grad_norm": 3.859375, + "learning_rate": 8.567538990219402e-05, + "loss": 1.4854, + "step": 43750 + }, + { + "epoch": 5.751050420168067, + "grad_norm": 3.96875, + "learning_rate": 8.554321966693101e-05, + "loss": 1.5283, + "step": 43800 + }, + { + "epoch": 5.757615546218488, + "grad_norm": 3.421875, + "learning_rate": 8.541104943166799e-05, + "loss": 1.5464, + "step": 43850 + }, + { + "epoch": 5.764180672268908, + "grad_norm": 3.265625, + "learning_rate": 8.527887919640498e-05, + "loss": 1.5518, + "step": 43900 + }, + { + "epoch": 5.770745798319328, + "grad_norm": 2.203125, + "learning_rate": 8.514670896114195e-05, + "loss": 1.5689, + "step": 43950 + }, + { + "epoch": 5.777310924369748, + "grad_norm": 2.53125, + "learning_rate": 8.501453872587894e-05, + "loss": 1.5002, + "step": 44000 + }, + { + "epoch": 5.783876050420168, + "grad_norm": 2.75, + "learning_rate": 8.488236849061591e-05, + "loss": 1.5407, + "step": 44050 + }, + { + "epoch": 5.790441176470588, + "grad_norm": 2.828125, + "learning_rate": 8.47501982553529e-05, + "loss": 1.5339, + "step": 44100 + }, + { + "epoch": 5.797006302521009, + "grad_norm": 3.078125, + "learning_rate": 8.461802802008988e-05, + "loss": 1.5005, + "step": 44150 + }, + { + "epoch": 5.803571428571429, + "grad_norm": 2.921875, + "learning_rate": 8.448585778482687e-05, + "loss": 1.506, + "step": 44200 + }, + { + "epoch": 5.810136554621849, + "grad_norm": 2.34375, + "learning_rate": 8.435368754956384e-05, + "loss": 1.5579, + "step": 44250 + }, + { + "epoch": 5.816701680672269, + "grad_norm": 2.59375, + "learning_rate": 8.422151731430082e-05, + "loss": 1.5815, + "step": 44300 + }, + { + "epoch": 5.823266806722689, + "grad_norm": 2.53125, + "learning_rate": 8.40893470790378e-05, + "loss": 1.5633, + "step": 44350 + }, + { + "epoch": 5.829831932773109, + "grad_norm": 2.546875, + "learning_rate": 8.395717684377478e-05, + "loss": 1.5406, + "step": 44400 + }, + { + "epoch": 5.836397058823529, + "grad_norm": 2.515625, + "learning_rate": 8.382500660851177e-05, + "loss": 1.5523, + "step": 44450 + }, + { + "epoch": 5.84296218487395, + "grad_norm": 3.71875, + "learning_rate": 8.369283637324875e-05, + "loss": 1.534, + "step": 44500 + }, + { + "epoch": 5.84952731092437, + "grad_norm": 3.5625, + "learning_rate": 8.356066613798573e-05, + "loss": 1.5479, + "step": 44550 + }, + { + "epoch": 5.85609243697479, + "grad_norm": 3.203125, + "learning_rate": 8.342849590272271e-05, + "loss": 1.5374, + "step": 44600 + }, + { + "epoch": 5.86265756302521, + "grad_norm": 3.25, + "learning_rate": 8.32963256674597e-05, + "loss": 1.5043, + "step": 44650 + }, + { + "epoch": 5.86922268907563, + "grad_norm": 2.3125, + "learning_rate": 8.316415543219667e-05, + "loss": 1.5133, + "step": 44700 + }, + { + "epoch": 5.87578781512605, + "grad_norm": 2.859375, + "learning_rate": 8.303198519693366e-05, + "loss": 1.5399, + "step": 44750 + }, + { + "epoch": 5.882352941176471, + "grad_norm": 2.578125, + "learning_rate": 8.289981496167064e-05, + "loss": 1.5391, + "step": 44800 + }, + { + "epoch": 5.888918067226891, + "grad_norm": 2.953125, + "learning_rate": 8.276764472640761e-05, + "loss": 1.5306, + "step": 44850 + }, + { + "epoch": 5.895483193277311, + "grad_norm": 2.234375, + "learning_rate": 8.263547449114459e-05, + "loss": 1.5839, + "step": 44900 + }, + { + "epoch": 5.902048319327731, + "grad_norm": 3.421875, + "learning_rate": 8.250330425588158e-05, + "loss": 1.4878, + "step": 44950 + }, + { + "epoch": 5.908613445378151, + "grad_norm": 1.96875, + "learning_rate": 8.237113402061855e-05, + "loss": 1.5699, + "step": 45000 + }, + { + "epoch": 5.915178571428571, + "grad_norm": 2.515625, + "learning_rate": 8.223896378535554e-05, + "loss": 1.561, + "step": 45050 + }, + { + "epoch": 5.921743697478991, + "grad_norm": 4.28125, + "learning_rate": 8.210679355009252e-05, + "loss": 1.5284, + "step": 45100 + }, + { + "epoch": 5.928308823529412, + "grad_norm": 2.4375, + "learning_rate": 8.19746233148295e-05, + "loss": 1.5605, + "step": 45150 + }, + { + "epoch": 5.934873949579832, + "grad_norm": 1.8359375, + "learning_rate": 8.184245307956648e-05, + "loss": 1.5675, + "step": 45200 + }, + { + "epoch": 5.941439075630252, + "grad_norm": 2.921875, + "learning_rate": 8.171028284430347e-05, + "loss": 1.5244, + "step": 45250 + }, + { + "epoch": 5.948004201680672, + "grad_norm": 2.84375, + "learning_rate": 8.157811260904044e-05, + "loss": 1.5339, + "step": 45300 + }, + { + "epoch": 5.954569327731092, + "grad_norm": 3.015625, + "learning_rate": 8.144594237377743e-05, + "loss": 1.5613, + "step": 45350 + }, + { + "epoch": 5.961134453781512, + "grad_norm": 2.890625, + "learning_rate": 8.131377213851441e-05, + "loss": 1.5348, + "step": 45400 + }, + { + "epoch": 5.967699579831933, + "grad_norm": 2.65625, + "learning_rate": 8.11816019032514e-05, + "loss": 1.5699, + "step": 45450 + }, + { + "epoch": 5.974264705882353, + "grad_norm": 2.421875, + "learning_rate": 8.104943166798837e-05, + "loss": 1.5268, + "step": 45500 + }, + { + "epoch": 5.980829831932773, + "grad_norm": 3.828125, + "learning_rate": 8.091726143272536e-05, + "loss": 1.5175, + "step": 45550 + }, + { + "epoch": 5.987394957983193, + "grad_norm": 3.203125, + "learning_rate": 8.078509119746234e-05, + "loss": 1.5399, + "step": 45600 + }, + { + "epoch": 5.993960084033613, + "grad_norm": 2.640625, + "learning_rate": 8.065292096219932e-05, + "loss": 1.5178, + "step": 45650 + }, + { + "epoch": 6.000525210084033, + "grad_norm": 2.03125, + "learning_rate": 8.05207507269363e-05, + "loss": 1.5124, + "step": 45700 + }, + { + "epoch": 6.007090336134453, + "grad_norm": 2.828125, + "learning_rate": 8.038858049167329e-05, + "loss": 1.1681, + "step": 45750 + }, + { + "epoch": 6.013655462184874, + "grad_norm": 2.25, + "learning_rate": 8.025641025641026e-05, + "loss": 1.1803, + "step": 45800 + }, + { + "epoch": 6.020220588235294, + "grad_norm": 3.765625, + "learning_rate": 8.012424002114725e-05, + "loss": 1.1269, + "step": 45850 + }, + { + "epoch": 6.026785714285714, + "grad_norm": 3.828125, + "learning_rate": 7.999206978588423e-05, + "loss": 1.0631, + "step": 45900 + }, + { + "epoch": 6.0333508403361344, + "grad_norm": 2.921875, + "learning_rate": 7.98598995506212e-05, + "loss": 1.155, + "step": 45950 + }, + { + "epoch": 6.0399159663865545, + "grad_norm": 2.609375, + "learning_rate": 7.972772931535819e-05, + "loss": 1.0826, + "step": 46000 + }, + { + "epoch": 6.0464810924369745, + "grad_norm": 3.21875, + "learning_rate": 7.959555908009517e-05, + "loss": 1.1677, + "step": 46050 + }, + { + "epoch": 6.0530462184873945, + "grad_norm": 2.828125, + "learning_rate": 7.946338884483214e-05, + "loss": 1.089, + "step": 46100 + }, + { + "epoch": 6.0596113445378155, + "grad_norm": 2.328125, + "learning_rate": 7.933121860956913e-05, + "loss": 1.1358, + "step": 46150 + }, + { + "epoch": 6.0661764705882355, + "grad_norm": 3.84375, + "learning_rate": 7.91990483743061e-05, + "loss": 1.1714, + "step": 46200 + }, + { + "epoch": 6.0727415966386555, + "grad_norm": 4.375, + "learning_rate": 7.90668781390431e-05, + "loss": 1.1851, + "step": 46250 + }, + { + "epoch": 6.079306722689076, + "grad_norm": 2.28125, + "learning_rate": 7.893470790378007e-05, + "loss": 1.104, + "step": 46300 + }, + { + "epoch": 6.085871848739496, + "grad_norm": 3.484375, + "learning_rate": 7.880253766851704e-05, + "loss": 1.1089, + "step": 46350 + }, + { + "epoch": 6.092436974789916, + "grad_norm": 3.515625, + "learning_rate": 7.867036743325403e-05, + "loss": 1.1587, + "step": 46400 + }, + { + "epoch": 6.099002100840337, + "grad_norm": 2.65625, + "learning_rate": 7.853819719799101e-05, + "loss": 1.1539, + "step": 46450 + }, + { + "epoch": 6.105567226890757, + "grad_norm": 2.21875, + "learning_rate": 7.8406026962728e-05, + "loss": 1.1214, + "step": 46500 + }, + { + "epoch": 6.112132352941177, + "grad_norm": 3.40625, + "learning_rate": 7.827385672746497e-05, + "loss": 1.1342, + "step": 46550 + }, + { + "epoch": 6.118697478991597, + "grad_norm": 2.8125, + "learning_rate": 7.814168649220196e-05, + "loss": 1.1118, + "step": 46600 + }, + { + "epoch": 6.125262605042017, + "grad_norm": 3.890625, + "learning_rate": 7.800951625693894e-05, + "loss": 1.1136, + "step": 46650 + }, + { + "epoch": 6.131827731092437, + "grad_norm": 2.796875, + "learning_rate": 7.787734602167593e-05, + "loss": 1.1687, + "step": 46700 + }, + { + "epoch": 6.138392857142857, + "grad_norm": 3.25, + "learning_rate": 7.77451757864129e-05, + "loss": 1.1565, + "step": 46750 + }, + { + "epoch": 6.144957983193278, + "grad_norm": 4.09375, + "learning_rate": 7.761300555114989e-05, + "loss": 1.0907, + "step": 46800 + }, + { + "epoch": 6.151523109243698, + "grad_norm": 2.6875, + "learning_rate": 7.748083531588686e-05, + "loss": 1.1157, + "step": 46850 + }, + { + "epoch": 6.158088235294118, + "grad_norm": 3.015625, + "learning_rate": 7.734866508062385e-05, + "loss": 1.1268, + "step": 46900 + }, + { + "epoch": 6.164653361344538, + "grad_norm": 3.921875, + "learning_rate": 7.721649484536083e-05, + "loss": 1.2271, + "step": 46950 + }, + { + "epoch": 6.171218487394958, + "grad_norm": 2.90625, + "learning_rate": 7.708432461009782e-05, + "loss": 1.1368, + "step": 47000 + }, + { + "epoch": 6.177783613445378, + "grad_norm": 3.203125, + "learning_rate": 7.695215437483479e-05, + "loss": 1.1157, + "step": 47050 + }, + { + "epoch": 6.184348739495798, + "grad_norm": 3.34375, + "learning_rate": 7.681998413957178e-05, + "loss": 1.1223, + "step": 47100 + }, + { + "epoch": 6.190913865546219, + "grad_norm": 3.359375, + "learning_rate": 7.668781390430876e-05, + "loss": 1.2149, + "step": 47150 + }, + { + "epoch": 6.197478991596639, + "grad_norm": 2.671875, + "learning_rate": 7.655564366904573e-05, + "loss": 1.1906, + "step": 47200 + }, + { + "epoch": 6.204044117647059, + "grad_norm": 3.28125, + "learning_rate": 7.64234734337827e-05, + "loss": 1.0765, + "step": 47250 + }, + { + "epoch": 6.210609243697479, + "grad_norm": 1.9296875, + "learning_rate": 7.62913031985197e-05, + "loss": 1.1369, + "step": 47300 + }, + { + "epoch": 6.217174369747899, + "grad_norm": 3.875, + "learning_rate": 7.615913296325667e-05, + "loss": 1.1365, + "step": 47350 + }, + { + "epoch": 6.223739495798319, + "grad_norm": 2.890625, + "learning_rate": 7.602696272799366e-05, + "loss": 1.13, + "step": 47400 + }, + { + "epoch": 6.23030462184874, + "grad_norm": 2.71875, + "learning_rate": 7.589479249273063e-05, + "loss": 1.1538, + "step": 47450 + }, + { + "epoch": 6.23686974789916, + "grad_norm": 2.515625, + "learning_rate": 7.576262225746762e-05, + "loss": 1.1669, + "step": 47500 + }, + { + "epoch": 6.24343487394958, + "grad_norm": 4.5, + "learning_rate": 7.56304520222046e-05, + "loss": 1.1756, + "step": 47550 + }, + { + "epoch": 6.25, + "grad_norm": 2.65625, + "learning_rate": 7.549828178694159e-05, + "loss": 1.1435, + "step": 47600 + }, + { + "epoch": 6.25656512605042, + "grad_norm": 3.484375, + "learning_rate": 7.536611155167856e-05, + "loss": 1.1327, + "step": 47650 + }, + { + "epoch": 6.26313025210084, + "grad_norm": 3.421875, + "learning_rate": 7.523394131641555e-05, + "loss": 1.1752, + "step": 47700 + }, + { + "epoch": 6.26969537815126, + "grad_norm": 3.328125, + "learning_rate": 7.510177108115253e-05, + "loss": 1.1626, + "step": 47750 + }, + { + "epoch": 6.276260504201681, + "grad_norm": 4.375, + "learning_rate": 7.496960084588952e-05, + "loss": 1.166, + "step": 47800 + }, + { + "epoch": 6.282825630252101, + "grad_norm": 3.453125, + "learning_rate": 7.483743061062649e-05, + "loss": 1.17, + "step": 47850 + }, + { + "epoch": 6.289390756302521, + "grad_norm": 2.953125, + "learning_rate": 7.470526037536348e-05, + "loss": 1.2015, + "step": 47900 + }, + { + "epoch": 6.295955882352941, + "grad_norm": 2.609375, + "learning_rate": 7.457309014010045e-05, + "loss": 1.1405, + "step": 47950 + }, + { + "epoch": 6.302521008403361, + "grad_norm": 3.734375, + "learning_rate": 7.444091990483743e-05, + "loss": 1.1591, + "step": 48000 + }, + { + "epoch": 6.309086134453781, + "grad_norm": 3.953125, + "learning_rate": 7.430874966957442e-05, + "loss": 1.1504, + "step": 48050 + }, + { + "epoch": 6.315651260504202, + "grad_norm": 3.546875, + "learning_rate": 7.41765794343114e-05, + "loss": 1.1835, + "step": 48100 + }, + { + "epoch": 6.322216386554622, + "grad_norm": 3.234375, + "learning_rate": 7.404440919904838e-05, + "loss": 1.1839, + "step": 48150 + }, + { + "epoch": 6.328781512605042, + "grad_norm": 5.3125, + "learning_rate": 7.391223896378536e-05, + "loss": 1.1792, + "step": 48200 + }, + { + "epoch": 6.335346638655462, + "grad_norm": 4.03125, + "learning_rate": 7.378006872852235e-05, + "loss": 1.1407, + "step": 48250 + }, + { + "epoch": 6.341911764705882, + "grad_norm": 3.359375, + "learning_rate": 7.364789849325932e-05, + "loss": 1.2657, + "step": 48300 + }, + { + "epoch": 6.348476890756302, + "grad_norm": 3.125, + "learning_rate": 7.351572825799631e-05, + "loss": 1.1614, + "step": 48350 + }, + { + "epoch": 6.355042016806722, + "grad_norm": 2.796875, + "learning_rate": 7.338355802273329e-05, + "loss": 1.2149, + "step": 48400 + }, + { + "epoch": 6.361607142857143, + "grad_norm": 3.09375, + "learning_rate": 7.325138778747026e-05, + "loss": 1.1551, + "step": 48450 + }, + { + "epoch": 6.368172268907563, + "grad_norm": 4.125, + "learning_rate": 7.311921755220724e-05, + "loss": 1.2245, + "step": 48500 + }, + { + "epoch": 6.374737394957983, + "grad_norm": 2.75, + "learning_rate": 7.298704731694422e-05, + "loss": 1.225, + "step": 48550 + }, + { + "epoch": 6.381302521008403, + "grad_norm": 3.265625, + "learning_rate": 7.28548770816812e-05, + "loss": 1.2405, + "step": 48600 + }, + { + "epoch": 6.387867647058823, + "grad_norm": 2.90625, + "learning_rate": 7.272270684641819e-05, + "loss": 1.2447, + "step": 48650 + }, + { + "epoch": 6.394432773109243, + "grad_norm": 4.3125, + "learning_rate": 7.259053661115516e-05, + "loss": 1.1656, + "step": 48700 + }, + { + "epoch": 6.400997899159664, + "grad_norm": 3.296875, + "learning_rate": 7.245836637589215e-05, + "loss": 1.2142, + "step": 48750 + }, + { + "epoch": 6.407563025210084, + "grad_norm": 3.46875, + "learning_rate": 7.232619614062913e-05, + "loss": 1.1842, + "step": 48800 + }, + { + "epoch": 6.414128151260504, + "grad_norm": 3.40625, + "learning_rate": 7.219402590536612e-05, + "loss": 1.1958, + "step": 48850 + }, + { + "epoch": 6.420693277310924, + "grad_norm": 3.625, + "learning_rate": 7.206185567010309e-05, + "loss": 1.1988, + "step": 48900 + }, + { + "epoch": 6.4272584033613445, + "grad_norm": 2.328125, + "learning_rate": 7.192968543484008e-05, + "loss": 1.1995, + "step": 48950 + }, + { + "epoch": 6.4338235294117645, + "grad_norm": 3.046875, + "learning_rate": 7.179751519957706e-05, + "loss": 1.1896, + "step": 49000 + }, + { + "epoch": 6.4403886554621845, + "grad_norm": 3.046875, + "learning_rate": 7.166534496431404e-05, + "loss": 1.2092, + "step": 49050 + }, + { + "epoch": 6.4469537815126055, + "grad_norm": 3.28125, + "learning_rate": 7.153317472905102e-05, + "loss": 1.168, + "step": 49100 + }, + { + "epoch": 6.4535189075630255, + "grad_norm": 3.359375, + "learning_rate": 7.140100449378801e-05, + "loss": 1.2271, + "step": 49150 + }, + { + "epoch": 6.4600840336134455, + "grad_norm": 3.71875, + "learning_rate": 7.126883425852498e-05, + "loss": 1.1761, + "step": 49200 + }, + { + "epoch": 6.4666491596638656, + "grad_norm": 3.0625, + "learning_rate": 7.113666402326197e-05, + "loss": 1.1482, + "step": 49250 + }, + { + "epoch": 6.473214285714286, + "grad_norm": 4.03125, + "learning_rate": 7.100449378799895e-05, + "loss": 1.1622, + "step": 49300 + }, + { + "epoch": 6.479779411764706, + "grad_norm": 3.890625, + "learning_rate": 7.087232355273594e-05, + "loss": 1.2345, + "step": 49350 + }, + { + "epoch": 6.486344537815126, + "grad_norm": 3.046875, + "learning_rate": 7.074015331747291e-05, + "loss": 1.1928, + "step": 49400 + }, + { + "epoch": 6.492909663865547, + "grad_norm": 3.28125, + "learning_rate": 7.06079830822099e-05, + "loss": 1.1809, + "step": 49450 + }, + { + "epoch": 6.499474789915967, + "grad_norm": 3.0625, + "learning_rate": 7.047581284694688e-05, + "loss": 1.2192, + "step": 49500 + }, + { + "epoch": 6.506039915966387, + "grad_norm": 2.875, + "learning_rate": 7.034364261168386e-05, + "loss": 1.2011, + "step": 49550 + }, + { + "epoch": 6.512605042016807, + "grad_norm": 3.75, + "learning_rate": 7.021147237642083e-05, + "loss": 1.2188, + "step": 49600 + }, + { + "epoch": 6.519170168067227, + "grad_norm": 3.296875, + "learning_rate": 7.007930214115781e-05, + "loss": 1.2231, + "step": 49650 + }, + { + "epoch": 6.525735294117647, + "grad_norm": 2.734375, + "learning_rate": 6.994713190589479e-05, + "loss": 1.1578, + "step": 49700 + }, + { + "epoch": 6.532300420168067, + "grad_norm": 2.5, + "learning_rate": 6.981496167063178e-05, + "loss": 1.2147, + "step": 49750 + }, + { + "epoch": 6.538865546218488, + "grad_norm": 3.75, + "learning_rate": 6.968279143536875e-05, + "loss": 1.1989, + "step": 49800 + }, + { + "epoch": 6.545430672268908, + "grad_norm": 3.40625, + "learning_rate": 6.955062120010574e-05, + "loss": 1.2022, + "step": 49850 + }, + { + "epoch": 6.551995798319328, + "grad_norm": 4.65625, + "learning_rate": 6.941845096484272e-05, + "loss": 1.1972, + "step": 49900 + }, + { + "epoch": 6.558560924369748, + "grad_norm": 3.5, + "learning_rate": 6.92862807295797e-05, + "loss": 1.2239, + "step": 49950 + }, + { + "epoch": 6.565126050420168, + "grad_norm": 4.3125, + "learning_rate": 6.915411049431668e-05, + "loss": 1.1536, + "step": 50000 + }, + { + "epoch": 6.571691176470588, + "grad_norm": 3.9375, + "learning_rate": 6.902194025905367e-05, + "loss": 1.1373, + "step": 50050 + }, + { + "epoch": 6.578256302521009, + "grad_norm": 4.5, + "learning_rate": 6.888977002379065e-05, + "loss": 1.2083, + "step": 50100 + }, + { + "epoch": 6.584821428571429, + "grad_norm": 5.125, + "learning_rate": 6.875759978852762e-05, + "loss": 1.2293, + "step": 50150 + }, + { + "epoch": 6.591386554621849, + "grad_norm": 3.734375, + "learning_rate": 6.862542955326461e-05, + "loss": 1.2328, + "step": 50200 + }, + { + "epoch": 6.597951680672269, + "grad_norm": 2.390625, + "learning_rate": 6.849325931800158e-05, + "loss": 1.1966, + "step": 50250 + }, + { + "epoch": 6.604516806722689, + "grad_norm": 3.578125, + "learning_rate": 6.836108908273857e-05, + "loss": 1.2417, + "step": 50300 + }, + { + "epoch": 6.611081932773109, + "grad_norm": 3.25, + "learning_rate": 6.822891884747555e-05, + "loss": 1.2, + "step": 50350 + }, + { + "epoch": 6.617647058823529, + "grad_norm": 3.453125, + "learning_rate": 6.809674861221254e-05, + "loss": 1.1948, + "step": 50400 + }, + { + "epoch": 6.62421218487395, + "grad_norm": 3.953125, + "learning_rate": 6.796457837694951e-05, + "loss": 1.2059, + "step": 50450 + }, + { + "epoch": 6.63077731092437, + "grad_norm": 3.703125, + "learning_rate": 6.78324081416865e-05, + "loss": 1.1772, + "step": 50500 + }, + { + "epoch": 6.63734243697479, + "grad_norm": 2.734375, + "learning_rate": 6.770023790642348e-05, + "loss": 1.2675, + "step": 50550 + }, + { + "epoch": 6.64390756302521, + "grad_norm": 4.90625, + "learning_rate": 6.756806767116046e-05, + "loss": 1.1601, + "step": 50600 + }, + { + "epoch": 6.65047268907563, + "grad_norm": 3.578125, + "learning_rate": 6.743589743589744e-05, + "loss": 1.2007, + "step": 50650 + }, + { + "epoch": 6.65703781512605, + "grad_norm": 2.5, + "learning_rate": 6.730372720063443e-05, + "loss": 1.2893, + "step": 50700 + }, + { + "epoch": 6.663602941176471, + "grad_norm": 3.625, + "learning_rate": 6.717155696537139e-05, + "loss": 1.1959, + "step": 50750 + }, + { + "epoch": 6.670168067226891, + "grad_norm": 3.390625, + "learning_rate": 6.703938673010838e-05, + "loss": 1.2391, + "step": 50800 + }, + { + "epoch": 6.676733193277311, + "grad_norm": 3.3125, + "learning_rate": 6.690721649484535e-05, + "loss": 1.2726, + "step": 50850 + }, + { + "epoch": 6.683298319327731, + "grad_norm": 3.25, + "learning_rate": 6.677504625958234e-05, + "loss": 1.2429, + "step": 50900 + }, + { + "epoch": 6.689863445378151, + "grad_norm": 3.171875, + "learning_rate": 6.664287602431932e-05, + "loss": 1.2426, + "step": 50950 + }, + { + "epoch": 6.696428571428571, + "grad_norm": 2.484375, + "learning_rate": 6.651070578905631e-05, + "loss": 1.1643, + "step": 51000 + }, + { + "epoch": 6.702993697478991, + "grad_norm": 2.859375, + "learning_rate": 6.637853555379328e-05, + "loss": 1.2403, + "step": 51050 + }, + { + "epoch": 6.709558823529412, + "grad_norm": 2.3125, + "learning_rate": 6.624636531853027e-05, + "loss": 1.1968, + "step": 51100 + }, + { + "epoch": 6.716123949579832, + "grad_norm": 5.28125, + "learning_rate": 6.611419508326725e-05, + "loss": 1.2307, + "step": 51150 + }, + { + "epoch": 6.722689075630252, + "grad_norm": 3.765625, + "learning_rate": 6.598202484800423e-05, + "loss": 1.2227, + "step": 51200 + }, + { + "epoch": 6.729254201680672, + "grad_norm": 3.15625, + "learning_rate": 6.584985461274121e-05, + "loss": 1.2558, + "step": 51250 + }, + { + "epoch": 6.735819327731092, + "grad_norm": 3.515625, + "learning_rate": 6.57176843774782e-05, + "loss": 1.1961, + "step": 51300 + }, + { + "epoch": 6.742384453781512, + "grad_norm": 3.625, + "learning_rate": 6.558551414221517e-05, + "loss": 1.2159, + "step": 51350 + }, + { + "epoch": 6.748949579831933, + "grad_norm": 2.734375, + "learning_rate": 6.545334390695216e-05, + "loss": 1.2694, + "step": 51400 + }, + { + "epoch": 6.755514705882353, + "grad_norm": 3.015625, + "learning_rate": 6.532117367168914e-05, + "loss": 1.2875, + "step": 51450 + }, + { + "epoch": 6.762079831932773, + "grad_norm": 3.359375, + "learning_rate": 6.518900343642613e-05, + "loss": 1.2388, + "step": 51500 + }, + { + "epoch": 6.768644957983193, + "grad_norm": 2.765625, + "learning_rate": 6.50568332011631e-05, + "loss": 1.1447, + "step": 51550 + }, + { + "epoch": 6.775210084033613, + "grad_norm": 2.71875, + "learning_rate": 6.492466296590009e-05, + "loss": 1.2619, + "step": 51600 + }, + { + "epoch": 6.781775210084033, + "grad_norm": 3.53125, + "learning_rate": 6.479249273063707e-05, + "loss": 1.1767, + "step": 51650 + }, + { + "epoch": 6.788340336134453, + "grad_norm": 3.65625, + "learning_rate": 6.466032249537405e-05, + "loss": 1.2818, + "step": 51700 + }, + { + "epoch": 6.7949054621848735, + "grad_norm": 3.390625, + "learning_rate": 6.452815226011103e-05, + "loss": 1.2318, + "step": 51750 + }, + { + "epoch": 6.801470588235294, + "grad_norm": 3.234375, + "learning_rate": 6.4395982024848e-05, + "loss": 1.2754, + "step": 51800 + }, + { + "epoch": 6.808035714285714, + "grad_norm": 3.890625, + "learning_rate": 6.4263811789585e-05, + "loss": 1.2557, + "step": 51850 + }, + { + "epoch": 6.8146008403361344, + "grad_norm": 4.65625, + "learning_rate": 6.413164155432197e-05, + "loss": 1.2575, + "step": 51900 + }, + { + "epoch": 6.8211659663865545, + "grad_norm": 4.1875, + "learning_rate": 6.399947131905894e-05, + "loss": 1.2624, + "step": 51950 + }, + { + "epoch": 6.8277310924369745, + "grad_norm": 3.171875, + "learning_rate": 6.386730108379593e-05, + "loss": 1.1846, + "step": 52000 + }, + { + "epoch": 6.834296218487395, + "grad_norm": 3.25, + "learning_rate": 6.373513084853291e-05, + "loss": 1.2446, + "step": 52050 + }, + { + "epoch": 6.8408613445378155, + "grad_norm": 3.6875, + "learning_rate": 6.36029606132699e-05, + "loss": 1.2239, + "step": 52100 + }, + { + "epoch": 6.8474264705882355, + "grad_norm": 3.34375, + "learning_rate": 6.347079037800687e-05, + "loss": 1.2578, + "step": 52150 + }, + { + "epoch": 6.8539915966386555, + "grad_norm": 2.78125, + "learning_rate": 6.333862014274386e-05, + "loss": 1.2569, + "step": 52200 + }, + { + "epoch": 6.860556722689076, + "grad_norm": 3.515625, + "learning_rate": 6.320644990748084e-05, + "loss": 1.2031, + "step": 52250 + }, + { + "epoch": 6.867121848739496, + "grad_norm": 2.140625, + "learning_rate": 6.307427967221781e-05, + "loss": 1.2602, + "step": 52300 + }, + { + "epoch": 6.873686974789916, + "grad_norm": 5.5625, + "learning_rate": 6.29421094369548e-05, + "loss": 1.2312, + "step": 52350 + }, + { + "epoch": 6.880252100840336, + "grad_norm": 4.15625, + "learning_rate": 6.280993920169178e-05, + "loss": 1.2779, + "step": 52400 + }, + { + "epoch": 6.886817226890757, + "grad_norm": 3.984375, + "learning_rate": 6.267776896642876e-05, + "loss": 1.2193, + "step": 52450 + }, + { + "epoch": 6.893382352941177, + "grad_norm": 3.859375, + "learning_rate": 6.254559873116574e-05, + "loss": 1.2641, + "step": 52500 + }, + { + "epoch": 6.899947478991597, + "grad_norm": 2.65625, + "learning_rate": 6.241342849590273e-05, + "loss": 1.2719, + "step": 52550 + }, + { + "epoch": 6.906512605042017, + "grad_norm": 3.109375, + "learning_rate": 6.22812582606397e-05, + "loss": 1.2645, + "step": 52600 + }, + { + "epoch": 6.913077731092437, + "grad_norm": 3.71875, + "learning_rate": 6.214908802537669e-05, + "loss": 1.2796, + "step": 52650 + }, + { + "epoch": 6.919642857142857, + "grad_norm": 3.1875, + "learning_rate": 6.201691779011367e-05, + "loss": 1.2243, + "step": 52700 + }, + { + "epoch": 6.926207983193278, + "grad_norm": 4.53125, + "learning_rate": 6.188474755485066e-05, + "loss": 1.2344, + "step": 52750 + }, + { + "epoch": 6.932773109243698, + "grad_norm": 4.03125, + "learning_rate": 6.175257731958763e-05, + "loss": 1.2032, + "step": 52800 + }, + { + "epoch": 6.939338235294118, + "grad_norm": 4.78125, + "learning_rate": 6.162040708432462e-05, + "loss": 1.2038, + "step": 52850 + }, + { + "epoch": 6.945903361344538, + "grad_norm": 4.21875, + "learning_rate": 6.14882368490616e-05, + "loss": 1.2359, + "step": 52900 + }, + { + "epoch": 6.952468487394958, + "grad_norm": 3.015625, + "learning_rate": 6.135606661379858e-05, + "loss": 1.2409, + "step": 52950 + }, + { + "epoch": 6.959033613445378, + "grad_norm": 3.234375, + "learning_rate": 6.122389637853556e-05, + "loss": 1.2521, + "step": 53000 + }, + { + "epoch": 6.965598739495798, + "grad_norm": 3.734375, + "learning_rate": 6.109172614327255e-05, + "loss": 1.229, + "step": 53050 + }, + { + "epoch": 6.972163865546219, + "grad_norm": 2.921875, + "learning_rate": 6.095955590800952e-05, + "loss": 1.2429, + "step": 53100 + }, + { + "epoch": 6.978728991596639, + "grad_norm": 3.296875, + "learning_rate": 6.08273856727465e-05, + "loss": 1.1811, + "step": 53150 + }, + { + "epoch": 6.985294117647059, + "grad_norm": 3.484375, + "learning_rate": 6.069521543748348e-05, + "loss": 1.2411, + "step": 53200 + }, + { + "epoch": 6.991859243697479, + "grad_norm": 3.671875, + "learning_rate": 6.056304520222046e-05, + "loss": 1.2667, + "step": 53250 + }, + { + "epoch": 6.998424369747899, + "grad_norm": 2.890625, + "learning_rate": 6.043087496695744e-05, + "loss": 1.2438, + "step": 53300 + }, + { + "epoch": 7.004989495798319, + "grad_norm": 3.09375, + "learning_rate": 6.029870473169442e-05, + "loss": 1.0238, + "step": 53350 + }, + { + "epoch": 7.01155462184874, + "grad_norm": 2.875, + "learning_rate": 6.01665344964314e-05, + "loss": 0.9602, + "step": 53400 + }, + { + "epoch": 7.01811974789916, + "grad_norm": 2.9375, + "learning_rate": 6.003436426116838e-05, + "loss": 0.9521, + "step": 53450 + }, + { + "epoch": 7.02468487394958, + "grad_norm": 3.828125, + "learning_rate": 5.9902194025905365e-05, + "loss": 0.9348, + "step": 53500 + }, + { + "epoch": 7.03125, + "grad_norm": 3.078125, + "learning_rate": 5.977002379064235e-05, + "loss": 0.9541, + "step": 53550 + }, + { + "epoch": 7.03781512605042, + "grad_norm": 3.640625, + "learning_rate": 5.963785355537933e-05, + "loss": 1.0194, + "step": 53600 + }, + { + "epoch": 7.04438025210084, + "grad_norm": 3.59375, + "learning_rate": 5.950568332011631e-05, + "loss": 0.9803, + "step": 53650 + }, + { + "epoch": 7.05094537815126, + "grad_norm": 2.859375, + "learning_rate": 5.937351308485329e-05, + "loss": 1.0089, + "step": 53700 + }, + { + "epoch": 7.057510504201681, + "grad_norm": 2.546875, + "learning_rate": 5.9241342849590275e-05, + "loss": 0.9549, + "step": 53750 + }, + { + "epoch": 7.064075630252101, + "grad_norm": 2.984375, + "learning_rate": 5.910917261432726e-05, + "loss": 0.9472, + "step": 53800 + }, + { + "epoch": 7.070640756302521, + "grad_norm": 2.609375, + "learning_rate": 5.897700237906424e-05, + "loss": 0.9334, + "step": 53850 + }, + { + "epoch": 7.077205882352941, + "grad_norm": 4.9375, + "learning_rate": 5.884483214380122e-05, + "loss": 0.9778, + "step": 53900 + }, + { + "epoch": 7.083771008403361, + "grad_norm": 3.796875, + "learning_rate": 5.87126619085382e-05, + "loss": 0.9484, + "step": 53950 + }, + { + "epoch": 7.090336134453781, + "grad_norm": 2.984375, + "learning_rate": 5.8580491673275185e-05, + "loss": 0.9268, + "step": 54000 + }, + { + "epoch": 7.096901260504202, + "grad_norm": 2.65625, + "learning_rate": 5.844832143801217e-05, + "loss": 0.992, + "step": 54050 + }, + { + "epoch": 7.103466386554622, + "grad_norm": 3.25, + "learning_rate": 5.831615120274915e-05, + "loss": 0.9315, + "step": 54100 + }, + { + "epoch": 7.110031512605042, + "grad_norm": 4.09375, + "learning_rate": 5.818398096748613e-05, + "loss": 0.9577, + "step": 54150 + }, + { + "epoch": 7.116596638655462, + "grad_norm": 2.6875, + "learning_rate": 5.805181073222311e-05, + "loss": 0.9646, + "step": 54200 + }, + { + "epoch": 7.123161764705882, + "grad_norm": 3.59375, + "learning_rate": 5.7919640496960095e-05, + "loss": 1.0107, + "step": 54250 + }, + { + "epoch": 7.129726890756302, + "grad_norm": 2.78125, + "learning_rate": 5.778747026169706e-05, + "loss": 0.9089, + "step": 54300 + }, + { + "epoch": 7.136292016806722, + "grad_norm": 3.109375, + "learning_rate": 5.7655300026434045e-05, + "loss": 1.0186, + "step": 54350 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 3.96875, + "learning_rate": 5.752312979117103e-05, + "loss": 1.0012, + "step": 54400 + }, + { + "epoch": 7.149422268907563, + "grad_norm": 3.34375, + "learning_rate": 5.739095955590801e-05, + "loss": 0.9624, + "step": 54450 + }, + { + "epoch": 7.155987394957983, + "grad_norm": 3.125, + "learning_rate": 5.725878932064499e-05, + "loss": 0.9607, + "step": 54500 + }, + { + "epoch": 7.162552521008403, + "grad_norm": 3.296875, + "learning_rate": 5.712661908538197e-05, + "loss": 0.9862, + "step": 54550 + }, + { + "epoch": 7.169117647058823, + "grad_norm": 2.671875, + "learning_rate": 5.6994448850118955e-05, + "loss": 1.022, + "step": 54600 + }, + { + "epoch": 7.175682773109243, + "grad_norm": 3.984375, + "learning_rate": 5.686227861485594e-05, + "loss": 0.9618, + "step": 54650 + }, + { + "epoch": 7.182247899159663, + "grad_norm": 3.578125, + "learning_rate": 5.673010837959292e-05, + "loss": 1.04, + "step": 54700 + }, + { + "epoch": 7.188813025210084, + "grad_norm": 3.359375, + "learning_rate": 5.65979381443299e-05, + "loss": 0.9641, + "step": 54750 + }, + { + "epoch": 7.195378151260504, + "grad_norm": 5.09375, + "learning_rate": 5.646576790906688e-05, + "loss": 0.9545, + "step": 54800 + }, + { + "epoch": 7.201943277310924, + "grad_norm": 3.21875, + "learning_rate": 5.6333597673803865e-05, + "loss": 0.9333, + "step": 54850 + }, + { + "epoch": 7.2085084033613445, + "grad_norm": 4.21875, + "learning_rate": 5.620142743854085e-05, + "loss": 1.0082, + "step": 54900 + }, + { + "epoch": 7.2150735294117645, + "grad_norm": 2.65625, + "learning_rate": 5.606925720327782e-05, + "loss": 0.9802, + "step": 54950 + }, + { + "epoch": 7.2216386554621845, + "grad_norm": 3.515625, + "learning_rate": 5.5937086968014804e-05, + "loss": 0.9939, + "step": 55000 + }, + { + "epoch": 7.2282037815126055, + "grad_norm": 3.421875, + "learning_rate": 5.5804916732751786e-05, + "loss": 0.9994, + "step": 55050 + }, + { + "epoch": 7.2347689075630255, + "grad_norm": 4.1875, + "learning_rate": 5.567274649748877e-05, + "loss": 0.9503, + "step": 55100 + }, + { + "epoch": 7.2413340336134455, + "grad_norm": 3.609375, + "learning_rate": 5.554057626222575e-05, + "loss": 1.0269, + "step": 55150 + }, + { + "epoch": 7.2478991596638656, + "grad_norm": 2.953125, + "learning_rate": 5.540840602696273e-05, + "loss": 0.9995, + "step": 55200 + }, + { + "epoch": 7.254464285714286, + "grad_norm": 2.46875, + "learning_rate": 5.5276235791699714e-05, + "loss": 1.004, + "step": 55250 + }, + { + "epoch": 7.261029411764706, + "grad_norm": 3.921875, + "learning_rate": 5.5144065556436696e-05, + "loss": 0.9907, + "step": 55300 + }, + { + "epoch": 7.267594537815126, + "grad_norm": 2.96875, + "learning_rate": 5.501189532117368e-05, + "loss": 1.0151, + "step": 55350 + }, + { + "epoch": 7.274159663865547, + "grad_norm": 4.03125, + "learning_rate": 5.487972508591066e-05, + "loss": 1.0086, + "step": 55400 + }, + { + "epoch": 7.280724789915967, + "grad_norm": 4.1875, + "learning_rate": 5.474755485064764e-05, + "loss": 1.0091, + "step": 55450 + }, + { + "epoch": 7.287289915966387, + "grad_norm": 2.890625, + "learning_rate": 5.461538461538461e-05, + "loss": 0.9737, + "step": 55500 + }, + { + "epoch": 7.293855042016807, + "grad_norm": 3.765625, + "learning_rate": 5.448321438012159e-05, + "loss": 0.9781, + "step": 55550 + }, + { + "epoch": 7.300420168067227, + "grad_norm": 3.34375, + "learning_rate": 5.4351044144858574e-05, + "loss": 0.9661, + "step": 55600 + }, + { + "epoch": 7.306985294117647, + "grad_norm": 4.03125, + "learning_rate": 5.4218873909595556e-05, + "loss": 0.9372, + "step": 55650 + }, + { + "epoch": 7.313550420168067, + "grad_norm": 3.375, + "learning_rate": 5.408670367433254e-05, + "loss": 0.9873, + "step": 55700 + }, + { + "epoch": 7.320115546218488, + "grad_norm": 2.90625, + "learning_rate": 5.395453343906952e-05, + "loss": 0.9819, + "step": 55750 + }, + { + "epoch": 7.326680672268908, + "grad_norm": 3.765625, + "learning_rate": 5.38223632038065e-05, + "loss": 0.9967, + "step": 55800 + }, + { + "epoch": 7.333245798319328, + "grad_norm": 4.25, + "learning_rate": 5.3690192968543484e-05, + "loss": 0.9956, + "step": 55850 + }, + { + "epoch": 7.339810924369748, + "grad_norm": 3.5625, + "learning_rate": 5.3558022733280466e-05, + "loss": 0.9971, + "step": 55900 + }, + { + "epoch": 7.346376050420168, + "grad_norm": 3.625, + "learning_rate": 5.342585249801745e-05, + "loss": 0.9582, + "step": 55950 + }, + { + "epoch": 7.352941176470588, + "grad_norm": 3.65625, + "learning_rate": 5.329368226275443e-05, + "loss": 0.9472, + "step": 56000 + }, + { + "epoch": 7.359506302521009, + "grad_norm": 3.9375, + "learning_rate": 5.316151202749141e-05, + "loss": 0.9408, + "step": 56050 + }, + { + "epoch": 7.366071428571429, + "grad_norm": 2.90625, + "learning_rate": 5.3029341792228394e-05, + "loss": 0.9915, + "step": 56100 + }, + { + "epoch": 7.372636554621849, + "grad_norm": 4.0625, + "learning_rate": 5.2897171556965376e-05, + "loss": 0.9697, + "step": 56150 + }, + { + "epoch": 7.379201680672269, + "grad_norm": 3.109375, + "learning_rate": 5.276500132170236e-05, + "loss": 1.0588, + "step": 56200 + }, + { + "epoch": 7.385766806722689, + "grad_norm": 3.046875, + "learning_rate": 5.263283108643934e-05, + "loss": 1.0586, + "step": 56250 + }, + { + "epoch": 7.392331932773109, + "grad_norm": 3.09375, + "learning_rate": 5.250066085117632e-05, + "loss": 1.0323, + "step": 56300 + }, + { + "epoch": 7.398897058823529, + "grad_norm": 3.109375, + "learning_rate": 5.2368490615913304e-05, + "loss": 0.9774, + "step": 56350 + }, + { + "epoch": 7.40546218487395, + "grad_norm": 3.296875, + "learning_rate": 5.2236320380650286e-05, + "loss": 1.0095, + "step": 56400 + }, + { + "epoch": 7.41202731092437, + "grad_norm": 4.78125, + "learning_rate": 5.210415014538727e-05, + "loss": 1.0282, + "step": 56450 + }, + { + "epoch": 7.41859243697479, + "grad_norm": 3.328125, + "learning_rate": 5.197197991012425e-05, + "loss": 0.9461, + "step": 56500 + }, + { + "epoch": 7.42515756302521, + "grad_norm": 4.03125, + "learning_rate": 5.183980967486123e-05, + "loss": 1.0255, + "step": 56550 + }, + { + "epoch": 7.43172268907563, + "grad_norm": 3.046875, + "learning_rate": 5.170763943959821e-05, + "loss": 0.9991, + "step": 56600 + }, + { + "epoch": 7.43828781512605, + "grad_norm": 3.578125, + "learning_rate": 5.157546920433519e-05, + "loss": 1.0288, + "step": 56650 + }, + { + "epoch": 7.444852941176471, + "grad_norm": 3.359375, + "learning_rate": 5.1443298969072164e-05, + "loss": 1.0095, + "step": 56700 + }, + { + "epoch": 7.451418067226891, + "grad_norm": 3.953125, + "learning_rate": 5.1311128733809146e-05, + "loss": 1.0206, + "step": 56750 + }, + { + "epoch": 7.457983193277311, + "grad_norm": 3.9375, + "learning_rate": 5.117895849854613e-05, + "loss": 1.0271, + "step": 56800 + }, + { + "epoch": 7.464548319327731, + "grad_norm": 4.59375, + "learning_rate": 5.104678826328311e-05, + "loss": 0.9945, + "step": 56850 + }, + { + "epoch": 7.471113445378151, + "grad_norm": 4.0625, + "learning_rate": 5.091461802802009e-05, + "loss": 0.9645, + "step": 56900 + }, + { + "epoch": 7.477678571428571, + "grad_norm": 4.53125, + "learning_rate": 5.0782447792757074e-05, + "loss": 1.0132, + "step": 56950 + }, + { + "epoch": 7.484243697478991, + "grad_norm": 3.625, + "learning_rate": 5.0650277557494056e-05, + "loss": 1.046, + "step": 57000 + }, + { + "epoch": 7.490808823529412, + "grad_norm": 3.5625, + "learning_rate": 5.051810732223103e-05, + "loss": 1.0368, + "step": 57050 + }, + { + "epoch": 7.497373949579832, + "grad_norm": 4.0625, + "learning_rate": 5.038593708696801e-05, + "loss": 1.0087, + "step": 57100 + }, + { + "epoch": 7.503939075630252, + "grad_norm": 3.359375, + "learning_rate": 5.0253766851704995e-05, + "loss": 0.9407, + "step": 57150 + }, + { + "epoch": 7.510504201680672, + "grad_norm": 2.609375, + "learning_rate": 5.012159661644198e-05, + "loss": 0.9882, + "step": 57200 + }, + { + "epoch": 7.517069327731092, + "grad_norm": 4.0, + "learning_rate": 4.998942638117896e-05, + "loss": 0.9315, + "step": 57250 + }, + { + "epoch": 7.523634453781512, + "grad_norm": 2.8125, + "learning_rate": 4.985725614591594e-05, + "loss": 1.001, + "step": 57300 + }, + { + "epoch": 7.530199579831933, + "grad_norm": 2.796875, + "learning_rate": 4.972508591065292e-05, + "loss": 0.9481, + "step": 57350 + }, + { + "epoch": 7.536764705882353, + "grad_norm": 4.4375, + "learning_rate": 4.9592915675389905e-05, + "loss": 1.0605, + "step": 57400 + }, + { + "epoch": 7.543329831932773, + "grad_norm": 3.125, + "learning_rate": 4.946074544012689e-05, + "loss": 1.034, + "step": 57450 + }, + { + "epoch": 7.549894957983193, + "grad_norm": 3.125, + "learning_rate": 4.932857520486387e-05, + "loss": 0.9914, + "step": 57500 + }, + { + "epoch": 7.556460084033613, + "grad_norm": 4.84375, + "learning_rate": 4.9196404969600844e-05, + "loss": 0.9786, + "step": 57550 + }, + { + "epoch": 7.563025210084033, + "grad_norm": 3.765625, + "learning_rate": 4.9064234734337826e-05, + "loss": 0.9747, + "step": 57600 + }, + { + "epoch": 7.569590336134453, + "grad_norm": 4.15625, + "learning_rate": 4.893206449907481e-05, + "loss": 0.9962, + "step": 57650 + }, + { + "epoch": 7.5761554621848735, + "grad_norm": 2.9375, + "learning_rate": 4.879989426381179e-05, + "loss": 0.9514, + "step": 57700 + }, + { + "epoch": 7.582720588235294, + "grad_norm": 4.125, + "learning_rate": 4.866772402854877e-05, + "loss": 0.9623, + "step": 57750 + }, + { + "epoch": 7.589285714285714, + "grad_norm": 4.8125, + "learning_rate": 4.8535553793285754e-05, + "loss": 0.9925, + "step": 57800 + }, + { + "epoch": 7.5958508403361344, + "grad_norm": 4.46875, + "learning_rate": 4.8403383558022736e-05, + "loss": 0.984, + "step": 57850 + }, + { + "epoch": 7.6024159663865545, + "grad_norm": 3.390625, + "learning_rate": 4.827121332275972e-05, + "loss": 0.9953, + "step": 57900 + }, + { + "epoch": 7.6089810924369745, + "grad_norm": 3.28125, + "learning_rate": 4.81390430874967e-05, + "loss": 1.0407, + "step": 57950 + }, + { + "epoch": 7.615546218487395, + "grad_norm": 3.703125, + "learning_rate": 4.800687285223368e-05, + "loss": 1.0593, + "step": 58000 + }, + { + "epoch": 7.6221113445378155, + "grad_norm": 4.84375, + "learning_rate": 4.7874702616970664e-05, + "loss": 1.0001, + "step": 58050 + }, + { + "epoch": 7.6286764705882355, + "grad_norm": 3.109375, + "learning_rate": 4.7742532381707646e-05, + "loss": 1.0068, + "step": 58100 + }, + { + "epoch": 7.6352415966386555, + "grad_norm": 3.6875, + "learning_rate": 4.761036214644462e-05, + "loss": 1.0269, + "step": 58150 + }, + { + "epoch": 7.641806722689076, + "grad_norm": 4.125, + "learning_rate": 4.74781919111816e-05, + "loss": 1.0594, + "step": 58200 + }, + { + "epoch": 7.648371848739496, + "grad_norm": 3.296875, + "learning_rate": 4.7346021675918585e-05, + "loss": 0.9949, + "step": 58250 + }, + { + "epoch": 7.654936974789916, + "grad_norm": 3.4375, + "learning_rate": 4.721385144065557e-05, + "loss": 0.9923, + "step": 58300 + }, + { + "epoch": 7.661502100840336, + "grad_norm": 2.46875, + "learning_rate": 4.708168120539255e-05, + "loss": 1.0112, + "step": 58350 + }, + { + "epoch": 7.668067226890757, + "grad_norm": 4.78125, + "learning_rate": 4.694951097012953e-05, + "loss": 1.0397, + "step": 58400 + }, + { + "epoch": 7.674632352941177, + "grad_norm": 2.625, + "learning_rate": 4.681734073486651e-05, + "loss": 1.0318, + "step": 58450 + }, + { + "epoch": 7.681197478991597, + "grad_norm": 3.625, + "learning_rate": 4.6685170499603495e-05, + "loss": 1.0572, + "step": 58500 + }, + { + "epoch": 7.687762605042017, + "grad_norm": 3.078125, + "learning_rate": 4.6553000264340477e-05, + "loss": 1.0385, + "step": 58550 + }, + { + "epoch": 7.694327731092437, + "grad_norm": 2.90625, + "learning_rate": 4.642083002907746e-05, + "loss": 1.0306, + "step": 58600 + }, + { + "epoch": 7.700892857142857, + "grad_norm": 5.21875, + "learning_rate": 4.628865979381444e-05, + "loss": 1.0104, + "step": 58650 + }, + { + "epoch": 7.707457983193278, + "grad_norm": 3.59375, + "learning_rate": 4.6156489558551416e-05, + "loss": 0.9509, + "step": 58700 + }, + { + "epoch": 7.714023109243698, + "grad_norm": 3.28125, + "learning_rate": 4.60243193232884e-05, + "loss": 1.0473, + "step": 58750 + }, + { + "epoch": 7.720588235294118, + "grad_norm": 2.515625, + "learning_rate": 4.589214908802538e-05, + "loss": 1.0077, + "step": 58800 + }, + { + "epoch": 7.727153361344538, + "grad_norm": 2.984375, + "learning_rate": 4.575997885276236e-05, + "loss": 1.0053, + "step": 58850 + }, + { + "epoch": 7.733718487394958, + "grad_norm": 3.5625, + "learning_rate": 4.5627808617499344e-05, + "loss": 0.9762, + "step": 58900 + }, + { + "epoch": 7.740283613445378, + "grad_norm": 3.921875, + "learning_rate": 4.549563838223632e-05, + "loss": 1.0106, + "step": 58950 + }, + { + "epoch": 7.746848739495798, + "grad_norm": 3.875, + "learning_rate": 4.53634681469733e-05, + "loss": 0.9795, + "step": 59000 + }, + { + "epoch": 7.753413865546219, + "grad_norm": 2.640625, + "learning_rate": 4.523129791171028e-05, + "loss": 0.9806, + "step": 59050 + }, + { + "epoch": 7.759978991596639, + "grad_norm": 3.28125, + "learning_rate": 4.5099127676447265e-05, + "loss": 1.07, + "step": 59100 + }, + { + "epoch": 7.766544117647059, + "grad_norm": 3.5625, + "learning_rate": 4.496695744118425e-05, + "loss": 1.0631, + "step": 59150 + }, + { + "epoch": 7.773109243697479, + "grad_norm": 4.3125, + "learning_rate": 4.483478720592123e-05, + "loss": 1.0104, + "step": 59200 + }, + { + "epoch": 7.779674369747899, + "grad_norm": 3.65625, + "learning_rate": 4.470261697065821e-05, + "loss": 0.9505, + "step": 59250 + }, + { + "epoch": 7.786239495798319, + "grad_norm": 3.0625, + "learning_rate": 4.457044673539519e-05, + "loss": 1.0277, + "step": 59300 + }, + { + "epoch": 7.79280462184874, + "grad_norm": 2.5, + "learning_rate": 4.443827650013217e-05, + "loss": 1.0377, + "step": 59350 + }, + { + "epoch": 7.79936974789916, + "grad_norm": 4.34375, + "learning_rate": 4.430610626486915e-05, + "loss": 0.9968, + "step": 59400 + }, + { + "epoch": 7.80593487394958, + "grad_norm": 3.109375, + "learning_rate": 4.417393602960613e-05, + "loss": 1.0513, + "step": 59450 + }, + { + "epoch": 7.8125, + "grad_norm": 4.15625, + "learning_rate": 4.4041765794343114e-05, + "loss": 1.0534, + "step": 59500 + }, + { + "epoch": 7.81906512605042, + "grad_norm": 4.4375, + "learning_rate": 4.3909595559080096e-05, + "loss": 1.0133, + "step": 59550 + }, + { + "epoch": 7.82563025210084, + "grad_norm": 3.859375, + "learning_rate": 4.377742532381708e-05, + "loss": 1.0018, + "step": 59600 + }, + { + "epoch": 7.83219537815126, + "grad_norm": 3.53125, + "learning_rate": 4.364525508855406e-05, + "loss": 1.0268, + "step": 59650 + }, + { + "epoch": 7.838760504201681, + "grad_norm": 3.984375, + "learning_rate": 4.351308485329104e-05, + "loss": 0.9991, + "step": 59700 + }, + { + "epoch": 7.845325630252101, + "grad_norm": 4.40625, + "learning_rate": 4.3380914618028024e-05, + "loss": 1.0332, + "step": 59750 + }, + { + "epoch": 7.851890756302521, + "grad_norm": 3.5625, + "learning_rate": 4.3248744382765005e-05, + "loss": 0.9982, + "step": 59800 + }, + { + "epoch": 7.858455882352941, + "grad_norm": 2.703125, + "learning_rate": 4.311657414750199e-05, + "loss": 1.0299, + "step": 59850 + }, + { + "epoch": 7.865021008403361, + "grad_norm": 3.515625, + "learning_rate": 4.298440391223897e-05, + "loss": 1.0344, + "step": 59900 + }, + { + "epoch": 7.871586134453781, + "grad_norm": 2.859375, + "learning_rate": 4.2852233676975945e-05, + "loss": 1.0037, + "step": 59950 + }, + { + "epoch": 7.878151260504202, + "grad_norm": 2.15625, + "learning_rate": 4.272006344171293e-05, + "loss": 0.9864, + "step": 60000 + }, + { + "epoch": 7.884716386554622, + "grad_norm": 2.59375, + "learning_rate": 4.258789320644991e-05, + "loss": 1.0155, + "step": 60050 + }, + { + "epoch": 7.891281512605042, + "grad_norm": 4.40625, + "learning_rate": 4.245572297118689e-05, + "loss": 0.9806, + "step": 60100 + }, + { + "epoch": 7.897846638655462, + "grad_norm": 2.765625, + "learning_rate": 4.232355273592387e-05, + "loss": 1.053, + "step": 60150 + }, + { + "epoch": 7.904411764705882, + "grad_norm": 3.84375, + "learning_rate": 4.2191382500660855e-05, + "loss": 1.0841, + "step": 60200 + }, + { + "epoch": 7.910976890756302, + "grad_norm": 3.125, + "learning_rate": 4.2059212265397836e-05, + "loss": 1.0433, + "step": 60250 + }, + { + "epoch": 7.917542016806722, + "grad_norm": 3.40625, + "learning_rate": 4.192704203013482e-05, + "loss": 1.0405, + "step": 60300 + }, + { + "epoch": 7.924107142857143, + "grad_norm": 2.90625, + "learning_rate": 4.17948717948718e-05, + "loss": 0.9592, + "step": 60350 + }, + { + "epoch": 7.930672268907563, + "grad_norm": 4.28125, + "learning_rate": 4.166270155960878e-05, + "loss": 0.9838, + "step": 60400 + }, + { + "epoch": 7.937237394957983, + "grad_norm": 3.40625, + "learning_rate": 4.1530531324345764e-05, + "loss": 1.0412, + "step": 60450 + }, + { + "epoch": 7.943802521008403, + "grad_norm": 4.5625, + "learning_rate": 4.139836108908274e-05, + "loss": 1.0022, + "step": 60500 + }, + { + "epoch": 7.950367647058823, + "grad_norm": 4.375, + "learning_rate": 4.126619085381972e-05, + "loss": 1.0335, + "step": 60550 + }, + { + "epoch": 7.956932773109243, + "grad_norm": 4.59375, + "learning_rate": 4.1134020618556704e-05, + "loss": 0.9674, + "step": 60600 + }, + { + "epoch": 7.963497899159664, + "grad_norm": 3.53125, + "learning_rate": 4.1001850383293685e-05, + "loss": 1.0426, + "step": 60650 + }, + { + "epoch": 7.970063025210084, + "grad_norm": 3.453125, + "learning_rate": 4.086968014803067e-05, + "loss": 0.9493, + "step": 60700 + }, + { + "epoch": 7.976628151260504, + "grad_norm": 3.234375, + "learning_rate": 4.073750991276765e-05, + "loss": 1.0219, + "step": 60750 + }, + { + "epoch": 7.983193277310924, + "grad_norm": 3.28125, + "learning_rate": 4.0605339677504625e-05, + "loss": 1.0364, + "step": 60800 + }, + { + "epoch": 7.9897584033613445, + "grad_norm": 3.078125, + "learning_rate": 4.0473169442241607e-05, + "loss": 1.0049, + "step": 60850 + }, + { + "epoch": 7.9963235294117645, + "grad_norm": 3.21875, + "learning_rate": 4.034099920697859e-05, + "loss": 1.0305, + "step": 60900 + }, + { + "epoch": 8.002888655462185, + "grad_norm": 3.359375, + "learning_rate": 4.020882897171557e-05, + "loss": 0.9846, + "step": 60950 + }, + { + "epoch": 8.009453781512605, + "grad_norm": 3.34375, + "learning_rate": 4.007665873645255e-05, + "loss": 0.8191, + "step": 61000 + }, + { + "epoch": 8.016018907563025, + "grad_norm": 2.921875, + "learning_rate": 3.9944488501189534e-05, + "loss": 0.836, + "step": 61050 + }, + { + "epoch": 8.022584033613445, + "grad_norm": 3.578125, + "learning_rate": 3.981231826592651e-05, + "loss": 0.8465, + "step": 61100 + }, + { + "epoch": 8.029149159663865, + "grad_norm": 2.859375, + "learning_rate": 3.968014803066349e-05, + "loss": 0.8613, + "step": 61150 + }, + { + "epoch": 8.035714285714286, + "grad_norm": 3.390625, + "learning_rate": 3.9547977795400474e-05, + "loss": 0.8724, + "step": 61200 + }, + { + "epoch": 8.042279411764707, + "grad_norm": 3.390625, + "learning_rate": 3.9415807560137456e-05, + "loss": 0.9122, + "step": 61250 + }, + { + "epoch": 8.048844537815127, + "grad_norm": 3.1875, + "learning_rate": 3.928363732487444e-05, + "loss": 0.8903, + "step": 61300 + }, + { + "epoch": 8.055409663865547, + "grad_norm": 3.25, + "learning_rate": 3.915146708961142e-05, + "loss": 0.8419, + "step": 61350 + }, + { + "epoch": 8.061974789915967, + "grad_norm": 3.5625, + "learning_rate": 3.90192968543484e-05, + "loss": 0.8467, + "step": 61400 + }, + { + "epoch": 8.068539915966387, + "grad_norm": 3.34375, + "learning_rate": 3.8887126619085383e-05, + "loss": 0.8947, + "step": 61450 + }, + { + "epoch": 8.075105042016807, + "grad_norm": 3.09375, + "learning_rate": 3.8754956383822365e-05, + "loss": 0.9266, + "step": 61500 + }, + { + "epoch": 8.081670168067227, + "grad_norm": 2.953125, + "learning_rate": 3.862278614855935e-05, + "loss": 0.837, + "step": 61550 + }, + { + "epoch": 8.088235294117647, + "grad_norm": 3.15625, + "learning_rate": 3.849061591329633e-05, + "loss": 0.8598, + "step": 61600 + }, + { + "epoch": 8.094800420168067, + "grad_norm": 3.859375, + "learning_rate": 3.835844567803331e-05, + "loss": 0.8766, + "step": 61650 + }, + { + "epoch": 8.101365546218487, + "grad_norm": 2.453125, + "learning_rate": 3.8226275442770287e-05, + "loss": 0.8423, + "step": 61700 + }, + { + "epoch": 8.107930672268907, + "grad_norm": 4.1875, + "learning_rate": 3.809410520750727e-05, + "loss": 0.8809, + "step": 61750 + }, + { + "epoch": 8.114495798319327, + "grad_norm": 3.453125, + "learning_rate": 3.796193497224425e-05, + "loss": 0.8501, + "step": 61800 + }, + { + "epoch": 8.121060924369749, + "grad_norm": 4.3125, + "learning_rate": 3.782976473698123e-05, + "loss": 0.8507, + "step": 61850 + }, + { + "epoch": 8.127626050420169, + "grad_norm": 3.40625, + "learning_rate": 3.7697594501718214e-05, + "loss": 0.8456, + "step": 61900 + }, + { + "epoch": 8.134191176470589, + "grad_norm": 3.125, + "learning_rate": 3.7565424266455196e-05, + "loss": 0.8371, + "step": 61950 + }, + { + "epoch": 8.140756302521009, + "grad_norm": 3.265625, + "learning_rate": 3.743325403119218e-05, + "loss": 0.893, + "step": 62000 + }, + { + "epoch": 8.147321428571429, + "grad_norm": 4.96875, + "learning_rate": 3.730108379592916e-05, + "loss": 0.8895, + "step": 62050 + }, + { + "epoch": 8.153886554621849, + "grad_norm": 3.65625, + "learning_rate": 3.716891356066614e-05, + "loss": 0.8391, + "step": 62100 + }, + { + "epoch": 8.160451680672269, + "grad_norm": 3.671875, + "learning_rate": 3.7036743325403124e-05, + "loss": 0.862, + "step": 62150 + }, + { + "epoch": 8.167016806722689, + "grad_norm": 3.15625, + "learning_rate": 3.6904573090140106e-05, + "loss": 0.8852, + "step": 62200 + }, + { + "epoch": 8.173581932773109, + "grad_norm": 3.703125, + "learning_rate": 3.677240285487709e-05, + "loss": 0.8824, + "step": 62250 + }, + { + "epoch": 8.180147058823529, + "grad_norm": 3.46875, + "learning_rate": 3.6640232619614063e-05, + "loss": 0.8212, + "step": 62300 + }, + { + "epoch": 8.186712184873949, + "grad_norm": 3.265625, + "learning_rate": 3.6508062384351045e-05, + "loss": 0.9166, + "step": 62350 + }, + { + "epoch": 8.193277310924369, + "grad_norm": 2.890625, + "learning_rate": 3.637589214908803e-05, + "loss": 0.8318, + "step": 62400 + }, + { + "epoch": 8.199842436974789, + "grad_norm": 4.8125, + "learning_rate": 3.624372191382501e-05, + "loss": 0.879, + "step": 62450 + }, + { + "epoch": 8.206407563025211, + "grad_norm": 2.84375, + "learning_rate": 3.611155167856199e-05, + "loss": 0.8354, + "step": 62500 + }, + { + "epoch": 8.212972689075631, + "grad_norm": 3.046875, + "learning_rate": 3.597938144329897e-05, + "loss": 0.8263, + "step": 62550 + }, + { + "epoch": 8.219537815126051, + "grad_norm": 2.765625, + "learning_rate": 3.5847211208035955e-05, + "loss": 0.9078, + "step": 62600 + }, + { + "epoch": 8.226102941176471, + "grad_norm": 4.03125, + "learning_rate": 3.571504097277294e-05, + "loss": 0.8386, + "step": 62650 + }, + { + "epoch": 8.232668067226891, + "grad_norm": 3.234375, + "learning_rate": 3.558287073750991e-05, + "loss": 0.8261, + "step": 62700 + }, + { + "epoch": 8.239233193277311, + "grad_norm": 2.609375, + "learning_rate": 3.5450700502246894e-05, + "loss": 0.8744, + "step": 62750 + }, + { + "epoch": 8.245798319327731, + "grad_norm": 4.40625, + "learning_rate": 3.5318530266983876e-05, + "loss": 0.9089, + "step": 62800 + }, + { + "epoch": 8.252363445378151, + "grad_norm": 2.96875, + "learning_rate": 3.518636003172086e-05, + "loss": 0.9168, + "step": 62850 + }, + { + "epoch": 8.258928571428571, + "grad_norm": 3.484375, + "learning_rate": 3.5054189796457834e-05, + "loss": 0.8495, + "step": 62900 + }, + { + "epoch": 8.265493697478991, + "grad_norm": 3.5625, + "learning_rate": 3.4922019561194816e-05, + "loss": 0.8767, + "step": 62950 + }, + { + "epoch": 8.272058823529411, + "grad_norm": 4.9375, + "learning_rate": 3.47898493259318e-05, + "loss": 0.9055, + "step": 63000 + }, + { + "epoch": 8.278623949579831, + "grad_norm": 3.53125, + "learning_rate": 3.465767909066878e-05, + "loss": 0.9432, + "step": 63050 + }, + { + "epoch": 8.285189075630251, + "grad_norm": 3.140625, + "learning_rate": 3.452550885540576e-05, + "loss": 0.9013, + "step": 63100 + }, + { + "epoch": 8.291754201680673, + "grad_norm": 3.1875, + "learning_rate": 3.4393338620142743e-05, + "loss": 0.8496, + "step": 63150 + }, + { + "epoch": 8.298319327731093, + "grad_norm": 3.171875, + "learning_rate": 3.4261168384879725e-05, + "loss": 0.8496, + "step": 63200 + }, + { + "epoch": 8.304884453781513, + "grad_norm": 2.921875, + "learning_rate": 3.412899814961671e-05, + "loss": 0.8911, + "step": 63250 + }, + { + "epoch": 8.311449579831933, + "grad_norm": 3.578125, + "learning_rate": 3.399682791435369e-05, + "loss": 0.8891, + "step": 63300 + }, + { + "epoch": 8.318014705882353, + "grad_norm": 3.78125, + "learning_rate": 3.386465767909067e-05, + "loss": 0.8304, + "step": 63350 + }, + { + "epoch": 8.324579831932773, + "grad_norm": 4.375, + "learning_rate": 3.373248744382765e-05, + "loss": 0.8785, + "step": 63400 + }, + { + "epoch": 8.331144957983193, + "grad_norm": 4.53125, + "learning_rate": 3.3600317208564635e-05, + "loss": 0.8848, + "step": 63450 + }, + { + "epoch": 8.337710084033613, + "grad_norm": 4.8125, + "learning_rate": 3.346814697330161e-05, + "loss": 0.9199, + "step": 63500 + }, + { + "epoch": 8.344275210084033, + "grad_norm": 3.5625, + "learning_rate": 3.333597673803859e-05, + "loss": 0.8831, + "step": 63550 + }, + { + "epoch": 8.350840336134453, + "grad_norm": 3.9375, + "learning_rate": 3.3203806502775574e-05, + "loss": 0.9287, + "step": 63600 + }, + { + "epoch": 8.357405462184873, + "grad_norm": 3.75, + "learning_rate": 3.3071636267512556e-05, + "loss": 0.9415, + "step": 63650 + }, + { + "epoch": 8.363970588235293, + "grad_norm": 3.734375, + "learning_rate": 3.293946603224954e-05, + "loss": 0.8426, + "step": 63700 + }, + { + "epoch": 8.370535714285714, + "grad_norm": 3.6875, + "learning_rate": 3.280729579698652e-05, + "loss": 0.8968, + "step": 63750 + }, + { + "epoch": 8.377100840336134, + "grad_norm": 3.734375, + "learning_rate": 3.26751255617235e-05, + "loss": 0.8862, + "step": 63800 + }, + { + "epoch": 8.383665966386555, + "grad_norm": 3.578125, + "learning_rate": 3.2542955326460484e-05, + "loss": 0.8493, + "step": 63850 + }, + { + "epoch": 8.390231092436975, + "grad_norm": 2.875, + "learning_rate": 3.2410785091197466e-05, + "loss": 0.919, + "step": 63900 + }, + { + "epoch": 8.396796218487395, + "grad_norm": 3.09375, + "learning_rate": 3.227861485593445e-05, + "loss": 0.8918, + "step": 63950 + }, + { + "epoch": 8.403361344537815, + "grad_norm": 3.015625, + "learning_rate": 3.214644462067143e-05, + "loss": 0.8576, + "step": 64000 + }, + { + "epoch": 8.409926470588236, + "grad_norm": 3.6875, + "learning_rate": 3.2014274385408405e-05, + "loss": 0.8066, + "step": 64050 + }, + { + "epoch": 8.416491596638656, + "grad_norm": 3.4375, + "learning_rate": 3.188210415014539e-05, + "loss": 0.8558, + "step": 64100 + }, + { + "epoch": 8.423056722689076, + "grad_norm": 3.625, + "learning_rate": 3.174993391488237e-05, + "loss": 0.8848, + "step": 64150 + }, + { + "epoch": 8.429621848739496, + "grad_norm": 4.0625, + "learning_rate": 3.161776367961935e-05, + "loss": 0.8849, + "step": 64200 + }, + { + "epoch": 8.436186974789916, + "grad_norm": 3.640625, + "learning_rate": 3.148559344435633e-05, + "loss": 0.9291, + "step": 64250 + }, + { + "epoch": 8.442752100840336, + "grad_norm": 3.296875, + "learning_rate": 3.1353423209093315e-05, + "loss": 0.8496, + "step": 64300 + }, + { + "epoch": 8.449317226890756, + "grad_norm": 3.859375, + "learning_rate": 3.12212529738303e-05, + "loss": 0.831, + "step": 64350 + }, + { + "epoch": 8.455882352941176, + "grad_norm": 2.671875, + "learning_rate": 3.108908273856728e-05, + "loss": 0.8968, + "step": 64400 + }, + { + "epoch": 8.462447478991596, + "grad_norm": 2.75, + "learning_rate": 3.095691250330426e-05, + "loss": 0.8889, + "step": 64450 + }, + { + "epoch": 8.469012605042018, + "grad_norm": 3.0, + "learning_rate": 3.082474226804124e-05, + "loss": 0.942, + "step": 64500 + }, + { + "epoch": 8.475577731092438, + "grad_norm": 5.03125, + "learning_rate": 3.0692572032778225e-05, + "loss": 0.8368, + "step": 64550 + }, + { + "epoch": 8.482142857142858, + "grad_norm": 3.84375, + "learning_rate": 3.05604017975152e-05, + "loss": 0.8742, + "step": 64600 + }, + { + "epoch": 8.488707983193278, + "grad_norm": 3.921875, + "learning_rate": 3.042823156225218e-05, + "loss": 0.879, + "step": 64650 + }, + { + "epoch": 8.495273109243698, + "grad_norm": 2.75, + "learning_rate": 3.029606132698916e-05, + "loss": 0.8647, + "step": 64700 + }, + { + "epoch": 8.501838235294118, + "grad_norm": 3.265625, + "learning_rate": 3.0163891091726143e-05, + "loss": 0.846, + "step": 64750 + }, + { + "epoch": 8.508403361344538, + "grad_norm": 3.375, + "learning_rate": 3.0031720856463125e-05, + "loss": 0.9151, + "step": 64800 + }, + { + "epoch": 8.514968487394958, + "grad_norm": 3.625, + "learning_rate": 2.9899550621200107e-05, + "loss": 0.8529, + "step": 64850 + }, + { + "epoch": 8.521533613445378, + "grad_norm": 2.671875, + "learning_rate": 2.976738038593709e-05, + "loss": 0.9216, + "step": 64900 + }, + { + "epoch": 8.528098739495798, + "grad_norm": 3.375, + "learning_rate": 2.963521015067407e-05, + "loss": 0.8465, + "step": 64950 + }, + { + "epoch": 8.534663865546218, + "grad_norm": 3.34375, + "learning_rate": 2.9503039915411053e-05, + "loss": 0.94, + "step": 65000 + }, + { + "epoch": 8.541228991596638, + "grad_norm": 3.78125, + "learning_rate": 2.9370869680148035e-05, + "loss": 0.8756, + "step": 65050 + }, + { + "epoch": 8.547794117647058, + "grad_norm": 3.796875, + "learning_rate": 2.9238699444885013e-05, + "loss": 0.9175, + "step": 65100 + }, + { + "epoch": 8.55435924369748, + "grad_norm": 2.953125, + "learning_rate": 2.9106529209621995e-05, + "loss": 0.9387, + "step": 65150 + }, + { + "epoch": 8.5609243697479, + "grad_norm": 3.40625, + "learning_rate": 2.8974358974358977e-05, + "loss": 0.8691, + "step": 65200 + }, + { + "epoch": 8.56748949579832, + "grad_norm": 3.21875, + "learning_rate": 2.8842188739095956e-05, + "loss": 0.8827, + "step": 65250 + }, + { + "epoch": 8.57405462184874, + "grad_norm": 3.046875, + "learning_rate": 2.8710018503832938e-05, + "loss": 0.889, + "step": 65300 + }, + { + "epoch": 8.58061974789916, + "grad_norm": 3.21875, + "learning_rate": 2.8577848268569916e-05, + "loss": 0.9372, + "step": 65350 + }, + { + "epoch": 8.58718487394958, + "grad_norm": 4.46875, + "learning_rate": 2.84456780333069e-05, + "loss": 0.8589, + "step": 65400 + }, + { + "epoch": 8.59375, + "grad_norm": 3.5625, + "learning_rate": 2.831350779804388e-05, + "loss": 0.9059, + "step": 65450 + }, + { + "epoch": 8.60031512605042, + "grad_norm": 3.25, + "learning_rate": 2.8181337562780862e-05, + "loss": 0.86, + "step": 65500 + }, + { + "epoch": 8.60688025210084, + "grad_norm": 3.5625, + "learning_rate": 2.8049167327517844e-05, + "loss": 0.9566, + "step": 65550 + }, + { + "epoch": 8.61344537815126, + "grad_norm": 3.515625, + "learning_rate": 2.7916997092254826e-05, + "loss": 0.868, + "step": 65600 + }, + { + "epoch": 8.62001050420168, + "grad_norm": 3.375, + "learning_rate": 2.7784826856991808e-05, + "loss": 0.9989, + "step": 65650 + }, + { + "epoch": 8.6265756302521, + "grad_norm": 3.4375, + "learning_rate": 2.765265662172879e-05, + "loss": 0.9477, + "step": 65700 + }, + { + "epoch": 8.63314075630252, + "grad_norm": 4.28125, + "learning_rate": 2.7520486386465772e-05, + "loss": 0.8854, + "step": 65750 + }, + { + "epoch": 8.639705882352942, + "grad_norm": 3.1875, + "learning_rate": 2.7388316151202754e-05, + "loss": 0.8574, + "step": 65800 + }, + { + "epoch": 8.646271008403362, + "grad_norm": 3.4375, + "learning_rate": 2.725614591593973e-05, + "loss": 0.8614, + "step": 65850 + }, + { + "epoch": 8.652836134453782, + "grad_norm": 3.0625, + "learning_rate": 2.712397568067671e-05, + "loss": 0.8629, + "step": 65900 + }, + { + "epoch": 8.659401260504202, + "grad_norm": 3.515625, + "learning_rate": 2.6991805445413693e-05, + "loss": 0.9085, + "step": 65950 + }, + { + "epoch": 8.665966386554622, + "grad_norm": 4.40625, + "learning_rate": 2.6859635210150675e-05, + "loss": 0.9446, + "step": 66000 + }, + { + "epoch": 8.672531512605042, + "grad_norm": 4.1875, + "learning_rate": 2.6727464974887657e-05, + "loss": 0.8835, + "step": 66050 + }, + { + "epoch": 8.679096638655462, + "grad_norm": 3.453125, + "learning_rate": 2.659529473962464e-05, + "loss": 0.9335, + "step": 66100 + }, + { + "epoch": 8.685661764705882, + "grad_norm": 2.8125, + "learning_rate": 2.6463124504361618e-05, + "loss": 0.9176, + "step": 66150 + }, + { + "epoch": 8.692226890756302, + "grad_norm": 3.25, + "learning_rate": 2.63309542690986e-05, + "loss": 0.866, + "step": 66200 + }, + { + "epoch": 8.698792016806722, + "grad_norm": 3.703125, + "learning_rate": 2.619878403383558e-05, + "loss": 0.8773, + "step": 66250 + }, + { + "epoch": 8.705357142857142, + "grad_norm": 3.09375, + "learning_rate": 2.6066613798572564e-05, + "loss": 0.8938, + "step": 66300 + }, + { + "epoch": 8.711922268907562, + "grad_norm": 3.03125, + "learning_rate": 2.5934443563309546e-05, + "loss": 0.9595, + "step": 66350 + }, + { + "epoch": 8.718487394957982, + "grad_norm": 3.78125, + "learning_rate": 2.5802273328046528e-05, + "loss": 0.9481, + "step": 66400 + }, + { + "epoch": 8.725052521008404, + "grad_norm": 3.578125, + "learning_rate": 2.5670103092783503e-05, + "loss": 0.853, + "step": 66450 + }, + { + "epoch": 8.731617647058824, + "grad_norm": 4.65625, + "learning_rate": 2.5537932857520485e-05, + "loss": 0.9192, + "step": 66500 + }, + { + "epoch": 8.738182773109244, + "grad_norm": 3.625, + "learning_rate": 2.5405762622257467e-05, + "loss": 0.8477, + "step": 66550 + }, + { + "epoch": 8.744747899159664, + "grad_norm": 3.515625, + "learning_rate": 2.527359238699445e-05, + "loss": 0.9433, + "step": 66600 + }, + { + "epoch": 8.751313025210084, + "grad_norm": 3.4375, + "learning_rate": 2.514142215173143e-05, + "loss": 0.8813, + "step": 66650 + }, + { + "epoch": 8.757878151260504, + "grad_norm": 3.234375, + "learning_rate": 2.5009251916468413e-05, + "loss": 0.8904, + "step": 66700 + }, + { + "epoch": 8.764443277310924, + "grad_norm": 4.1875, + "learning_rate": 2.4877081681205395e-05, + "loss": 0.8608, + "step": 66750 + }, + { + "epoch": 8.771008403361344, + "grad_norm": 4.0, + "learning_rate": 2.4744911445942377e-05, + "loss": 0.9015, + "step": 66800 + }, + { + "epoch": 8.777573529411764, + "grad_norm": 3.59375, + "learning_rate": 2.461274121067936e-05, + "loss": 0.8966, + "step": 66850 + }, + { + "epoch": 8.784138655462185, + "grad_norm": 2.796875, + "learning_rate": 2.4480570975416337e-05, + "loss": 0.8921, + "step": 66900 + }, + { + "epoch": 8.790703781512605, + "grad_norm": 3.25, + "learning_rate": 2.434840074015332e-05, + "loss": 0.8334, + "step": 66950 + }, + { + "epoch": 8.797268907563025, + "grad_norm": 3.3125, + "learning_rate": 2.42162305048903e-05, + "loss": 0.9139, + "step": 67000 + }, + { + "epoch": 8.803834033613445, + "grad_norm": 3.53125, + "learning_rate": 2.4084060269627283e-05, + "loss": 0.888, + "step": 67050 + }, + { + "epoch": 8.810399159663866, + "grad_norm": 3.515625, + "learning_rate": 2.395189003436426e-05, + "loss": 0.9097, + "step": 67100 + }, + { + "epoch": 8.816964285714286, + "grad_norm": 2.578125, + "learning_rate": 2.3819719799101244e-05, + "loss": 0.8759, + "step": 67150 + }, + { + "epoch": 8.823529411764707, + "grad_norm": 2.953125, + "learning_rate": 2.3687549563838222e-05, + "loss": 0.8913, + "step": 67200 + }, + { + "epoch": 8.830094537815127, + "grad_norm": 5.78125, + "learning_rate": 2.3555379328575204e-05, + "loss": 0.8569, + "step": 67250 + }, + { + "epoch": 8.836659663865547, + "grad_norm": 3.84375, + "learning_rate": 2.3423209093312186e-05, + "loss": 0.9132, + "step": 67300 + }, + { + "epoch": 8.843224789915967, + "grad_norm": 3.5625, + "learning_rate": 2.3291038858049168e-05, + "loss": 0.8841, + "step": 67350 + }, + { + "epoch": 8.849789915966387, + "grad_norm": 3.625, + "learning_rate": 2.315886862278615e-05, + "loss": 0.8678, + "step": 67400 + }, + { + "epoch": 8.856355042016807, + "grad_norm": 3.21875, + "learning_rate": 2.302669838752313e-05, + "loss": 0.8838, + "step": 67450 + }, + { + "epoch": 8.862920168067227, + "grad_norm": 3.5625, + "learning_rate": 2.289452815226011e-05, + "loss": 0.956, + "step": 67500 + }, + { + "epoch": 8.869485294117647, + "grad_norm": 3.3125, + "learning_rate": 2.2762357916997093e-05, + "loss": 0.9985, + "step": 67550 + }, + { + "epoch": 8.876050420168067, + "grad_norm": 4.0625, + "learning_rate": 2.2630187681734075e-05, + "loss": 0.9346, + "step": 67600 + }, + { + "epoch": 8.882615546218487, + "grad_norm": 4.78125, + "learning_rate": 2.2498017446471057e-05, + "loss": 0.9215, + "step": 67650 + }, + { + "epoch": 8.889180672268907, + "grad_norm": 2.546875, + "learning_rate": 2.236584721120804e-05, + "loss": 0.9132, + "step": 67700 + }, + { + "epoch": 8.895745798319329, + "grad_norm": 2.640625, + "learning_rate": 2.2233676975945017e-05, + "loss": 0.8704, + "step": 67750 + }, + { + "epoch": 8.902310924369749, + "grad_norm": 3.09375, + "learning_rate": 2.2101506740682e-05, + "loss": 0.9117, + "step": 67800 + }, + { + "epoch": 8.908876050420169, + "grad_norm": 3.40625, + "learning_rate": 2.196933650541898e-05, + "loss": 0.9104, + "step": 67850 + }, + { + "epoch": 8.915441176470589, + "grad_norm": 2.96875, + "learning_rate": 2.1837166270155963e-05, + "loss": 0.922, + "step": 67900 + }, + { + "epoch": 8.922006302521009, + "grad_norm": 3.65625, + "learning_rate": 2.1704996034892945e-05, + "loss": 0.9083, + "step": 67950 + }, + { + "epoch": 8.928571428571429, + "grad_norm": 3.015625, + "learning_rate": 2.1572825799629927e-05, + "loss": 0.8764, + "step": 68000 + }, + { + "epoch": 8.935136554621849, + "grad_norm": 3.109375, + "learning_rate": 2.1440655564366906e-05, + "loss": 0.9047, + "step": 68050 + }, + { + "epoch": 8.941701680672269, + "grad_norm": 3.03125, + "learning_rate": 2.1308485329103888e-05, + "loss": 0.9171, + "step": 68100 + }, + { + "epoch": 8.948266806722689, + "grad_norm": 2.84375, + "learning_rate": 2.1176315093840866e-05, + "loss": 0.917, + "step": 68150 + }, + { + "epoch": 8.954831932773109, + "grad_norm": 3.90625, + "learning_rate": 2.1044144858577848e-05, + "loss": 0.9438, + "step": 68200 + }, + { + "epoch": 8.961397058823529, + "grad_norm": 3.34375, + "learning_rate": 2.091197462331483e-05, + "loss": 0.9877, + "step": 68250 + }, + { + "epoch": 8.967962184873949, + "grad_norm": 2.953125, + "learning_rate": 2.0779804388051812e-05, + "loss": 0.8838, + "step": 68300 + }, + { + "epoch": 8.974527310924369, + "grad_norm": 3.03125, + "learning_rate": 2.064763415278879e-05, + "loss": 0.8491, + "step": 68350 + }, + { + "epoch": 8.981092436974789, + "grad_norm": 4.09375, + "learning_rate": 2.0515463917525773e-05, + "loss": 0.8974, + "step": 68400 + }, + { + "epoch": 8.987657563025211, + "grad_norm": 3.578125, + "learning_rate": 2.0383293682262755e-05, + "loss": 0.8826, + "step": 68450 + }, + { + "epoch": 8.994222689075631, + "grad_norm": 3.828125, + "learning_rate": 2.0251123446999737e-05, + "loss": 0.953, + "step": 68500 + }, + { + "epoch": 9.000787815126051, + "grad_norm": 3.46875, + "learning_rate": 2.011895321173672e-05, + "loss": 0.9019, + "step": 68550 + }, + { + "epoch": 9.007352941176471, + "grad_norm": 2.640625, + "learning_rate": 1.99867829764737e-05, + "loss": 0.8543, + "step": 68600 + }, + { + "epoch": 9.013918067226891, + "grad_norm": 2.859375, + "learning_rate": 1.985461274121068e-05, + "loss": 0.8175, + "step": 68650 + }, + { + "epoch": 9.020483193277311, + "grad_norm": 3.234375, + "learning_rate": 1.972244250594766e-05, + "loss": 0.7925, + "step": 68700 + }, + { + "epoch": 9.027048319327731, + "grad_norm": 2.59375, + "learning_rate": 1.9590272270684643e-05, + "loss": 0.8177, + "step": 68750 + }, + { + "epoch": 9.033613445378151, + "grad_norm": 3.9375, + "learning_rate": 1.9458102035421625e-05, + "loss": 0.8351, + "step": 68800 + }, + { + "epoch": 9.040178571428571, + "grad_norm": 3.984375, + "learning_rate": 1.9325931800158607e-05, + "loss": 0.7946, + "step": 68850 + }, + { + "epoch": 9.046743697478991, + "grad_norm": 2.65625, + "learning_rate": 1.919376156489559e-05, + "loss": 0.836, + "step": 68900 + }, + { + "epoch": 9.053308823529411, + "grad_norm": 4.90625, + "learning_rate": 1.9061591329632568e-05, + "loss": 0.8177, + "step": 68950 + }, + { + "epoch": 9.059873949579831, + "grad_norm": 2.796875, + "learning_rate": 1.892942109436955e-05, + "loss": 0.8352, + "step": 69000 + }, + { + "epoch": 9.066439075630251, + "grad_norm": 3.1875, + "learning_rate": 1.879725085910653e-05, + "loss": 0.8551, + "step": 69050 + }, + { + "epoch": 9.073004201680673, + "grad_norm": 3.5625, + "learning_rate": 1.866508062384351e-05, + "loss": 0.807, + "step": 69100 + }, + { + "epoch": 9.079569327731093, + "grad_norm": 4.15625, + "learning_rate": 1.8532910388580492e-05, + "loss": 0.8683, + "step": 69150 + }, + { + "epoch": 9.086134453781513, + "grad_norm": 2.9375, + "learning_rate": 1.8400740153317474e-05, + "loss": 0.8108, + "step": 69200 + }, + { + "epoch": 9.092699579831933, + "grad_norm": 3.453125, + "learning_rate": 1.8268569918054453e-05, + "loss": 0.8421, + "step": 69250 + }, + { + "epoch": 9.099264705882353, + "grad_norm": 3.59375, + "learning_rate": 1.8136399682791435e-05, + "loss": 0.8363, + "step": 69300 + }, + { + "epoch": 9.105829831932773, + "grad_norm": 3.609375, + "learning_rate": 1.8004229447528417e-05, + "loss": 0.8291, + "step": 69350 + }, + { + "epoch": 9.112394957983193, + "grad_norm": 2.96875, + "learning_rate": 1.78720592122654e-05, + "loss": 0.8416, + "step": 69400 + }, + { + "epoch": 9.118960084033613, + "grad_norm": 3.9375, + "learning_rate": 1.773988897700238e-05, + "loss": 0.8704, + "step": 69450 + }, + { + "epoch": 9.125525210084033, + "grad_norm": 2.53125, + "learning_rate": 1.7607718741739362e-05, + "loss": 0.8508, + "step": 69500 + }, + { + "epoch": 9.132090336134453, + "grad_norm": 3.046875, + "learning_rate": 1.747554850647634e-05, + "loss": 0.8543, + "step": 69550 + }, + { + "epoch": 9.138655462184873, + "grad_norm": 3.09375, + "learning_rate": 1.7343378271213323e-05, + "loss": 0.8485, + "step": 69600 + }, + { + "epoch": 9.145220588235293, + "grad_norm": 3.59375, + "learning_rate": 1.7211208035950305e-05, + "loss": 0.8638, + "step": 69650 + }, + { + "epoch": 9.151785714285714, + "grad_norm": 3.046875, + "learning_rate": 1.7079037800687287e-05, + "loss": 0.8383, + "step": 69700 + }, + { + "epoch": 9.158350840336134, + "grad_norm": 4.25, + "learning_rate": 1.694686756542427e-05, + "loss": 0.8408, + "step": 69750 + }, + { + "epoch": 9.164915966386555, + "grad_norm": 2.6875, + "learning_rate": 1.681469733016125e-05, + "loss": 0.8616, + "step": 69800 + }, + { + "epoch": 9.171481092436975, + "grad_norm": 3.546875, + "learning_rate": 1.668252709489823e-05, + "loss": 0.7984, + "step": 69850 + }, + { + "epoch": 9.178046218487395, + "grad_norm": 3.0, + "learning_rate": 1.655035685963521e-05, + "loss": 0.8265, + "step": 69900 + }, + { + "epoch": 9.184611344537815, + "grad_norm": 3.8125, + "learning_rate": 1.6418186624372193e-05, + "loss": 0.826, + "step": 69950 + }, + { + "epoch": 9.191176470588236, + "grad_norm": 3.015625, + "learning_rate": 1.6286016389109175e-05, + "loss": 0.8131, + "step": 70000 + }, + { + "epoch": 9.197741596638656, + "grad_norm": 2.96875, + "learning_rate": 1.6153846153846154e-05, + "loss": 0.795, + "step": 70050 + }, + { + "epoch": 9.204306722689076, + "grad_norm": 3.46875, + "learning_rate": 1.6021675918583136e-05, + "loss": 0.8655, + "step": 70100 + }, + { + "epoch": 9.210871848739496, + "grad_norm": 3.671875, + "learning_rate": 1.5889505683320115e-05, + "loss": 0.811, + "step": 70150 + }, + { + "epoch": 9.217436974789916, + "grad_norm": 3.8125, + "learning_rate": 1.5757335448057097e-05, + "loss": 0.8409, + "step": 70200 + }, + { + "epoch": 9.224002100840336, + "grad_norm": 3.828125, + "learning_rate": 1.562516521279408e-05, + "loss": 0.8675, + "step": 70250 + }, + { + "epoch": 9.230567226890756, + "grad_norm": 3.984375, + "learning_rate": 1.549299497753106e-05, + "loss": 0.7883, + "step": 70300 + }, + { + "epoch": 9.237132352941176, + "grad_norm": 3.15625, + "learning_rate": 1.5360824742268042e-05, + "loss": 0.8123, + "step": 70350 + }, + { + "epoch": 9.243697478991596, + "grad_norm": 3.5, + "learning_rate": 1.5228654507005021e-05, + "loss": 0.7888, + "step": 70400 + }, + { + "epoch": 9.250262605042018, + "grad_norm": 3.046875, + "learning_rate": 1.5096484271742003e-05, + "loss": 0.8466, + "step": 70450 + }, + { + "epoch": 9.256827731092438, + "grad_norm": 3.25, + "learning_rate": 1.4964314036478985e-05, + "loss": 0.8154, + "step": 70500 + }, + { + "epoch": 9.263392857142858, + "grad_norm": 2.796875, + "learning_rate": 1.4832143801215967e-05, + "loss": 0.8042, + "step": 70550 + }, + { + "epoch": 9.269957983193278, + "grad_norm": 3.171875, + "learning_rate": 1.4699973565952949e-05, + "loss": 0.8571, + "step": 70600 + }, + { + "epoch": 9.276523109243698, + "grad_norm": 3.75, + "learning_rate": 1.4567803330689931e-05, + "loss": 0.8109, + "step": 70650 + }, + { + "epoch": 9.283088235294118, + "grad_norm": 2.578125, + "learning_rate": 1.443563309542691e-05, + "loss": 0.8494, + "step": 70700 + }, + { + "epoch": 9.289653361344538, + "grad_norm": 3.25, + "learning_rate": 1.4303462860163891e-05, + "loss": 0.8483, + "step": 70750 + }, + { + "epoch": 9.296218487394958, + "grad_norm": 4.40625, + "learning_rate": 1.4171292624900873e-05, + "loss": 0.8561, + "step": 70800 + }, + { + "epoch": 9.302783613445378, + "grad_norm": 3.125, + "learning_rate": 1.4039122389637854e-05, + "loss": 0.8184, + "step": 70850 + }, + { + "epoch": 9.309348739495798, + "grad_norm": 4.15625, + "learning_rate": 1.3906952154374836e-05, + "loss": 0.8726, + "step": 70900 + }, + { + "epoch": 9.315913865546218, + "grad_norm": 2.640625, + "learning_rate": 1.3774781919111818e-05, + "loss": 0.8978, + "step": 70950 + }, + { + "epoch": 9.322478991596638, + "grad_norm": 3.859375, + "learning_rate": 1.3642611683848796e-05, + "loss": 0.8926, + "step": 71000 + }, + { + "epoch": 9.329044117647058, + "grad_norm": 3.34375, + "learning_rate": 1.3510441448585778e-05, + "loss": 0.8196, + "step": 71050 + }, + { + "epoch": 9.33560924369748, + "grad_norm": 3.515625, + "learning_rate": 1.337827121332276e-05, + "loss": 0.818, + "step": 71100 + }, + { + "epoch": 9.3421743697479, + "grad_norm": 3.09375, + "learning_rate": 1.3246100978059742e-05, + "loss": 0.8168, + "step": 71150 + }, + { + "epoch": 9.34873949579832, + "grad_norm": 3.046875, + "learning_rate": 1.3113930742796724e-05, + "loss": 0.8109, + "step": 71200 + }, + { + "epoch": 9.35530462184874, + "grad_norm": 4.25, + "learning_rate": 1.2981760507533704e-05, + "loss": 0.8238, + "step": 71250 + }, + { + "epoch": 9.36186974789916, + "grad_norm": 3.75, + "learning_rate": 1.2849590272270685e-05, + "loss": 0.8574, + "step": 71300 + }, + { + "epoch": 9.36843487394958, + "grad_norm": 4.5, + "learning_rate": 1.2717420037007665e-05, + "loss": 0.8113, + "step": 71350 + }, + { + "epoch": 9.375, + "grad_norm": 3.859375, + "learning_rate": 1.2585249801744647e-05, + "loss": 0.8139, + "step": 71400 + }, + { + "epoch": 9.38156512605042, + "grad_norm": 4.125, + "learning_rate": 1.2453079566481629e-05, + "loss": 0.8621, + "step": 71450 + }, + { + "epoch": 9.38813025210084, + "grad_norm": 3.890625, + "learning_rate": 1.2320909331218611e-05, + "loss": 0.8902, + "step": 71500 + }, + { + "epoch": 9.39469537815126, + "grad_norm": 4.15625, + "learning_rate": 1.2188739095955591e-05, + "loss": 0.8511, + "step": 71550 + }, + { + "epoch": 9.40126050420168, + "grad_norm": 3.4375, + "learning_rate": 1.2056568860692573e-05, + "loss": 0.8326, + "step": 71600 + }, + { + "epoch": 9.4078256302521, + "grad_norm": 4.03125, + "learning_rate": 1.1924398625429555e-05, + "loss": 0.8601, + "step": 71650 + }, + { + "epoch": 9.41439075630252, + "grad_norm": 4.25, + "learning_rate": 1.1792228390166535e-05, + "loss": 0.8633, + "step": 71700 + }, + { + "epoch": 9.420955882352942, + "grad_norm": 3.8125, + "learning_rate": 1.1660058154903517e-05, + "loss": 0.7743, + "step": 71750 + }, + { + "epoch": 9.427521008403362, + "grad_norm": 3.078125, + "learning_rate": 1.1527887919640498e-05, + "loss": 0.8187, + "step": 71800 + }, + { + "epoch": 9.434086134453782, + "grad_norm": 3.625, + "learning_rate": 1.1395717684377478e-05, + "loss": 0.849, + "step": 71850 + }, + { + "epoch": 9.440651260504202, + "grad_norm": 3.34375, + "learning_rate": 1.126354744911446e-05, + "loss": 0.8291, + "step": 71900 + }, + { + "epoch": 9.447216386554622, + "grad_norm": 3.765625, + "learning_rate": 1.113137721385144e-05, + "loss": 0.8571, + "step": 71950 + }, + { + "epoch": 9.453781512605042, + "grad_norm": 3.59375, + "learning_rate": 1.0999206978588422e-05, + "loss": 0.7721, + "step": 72000 + }, + { + "epoch": 9.460346638655462, + "grad_norm": 4.1875, + "learning_rate": 1.0867036743325404e-05, + "loss": 0.8919, + "step": 72050 + }, + { + "epoch": 9.466911764705882, + "grad_norm": 3.78125, + "learning_rate": 1.0734866508062384e-05, + "loss": 0.837, + "step": 72100 + }, + { + "epoch": 9.473476890756302, + "grad_norm": 3.015625, + "learning_rate": 1.0602696272799366e-05, + "loss": 0.8425, + "step": 72150 + }, + { + "epoch": 9.480042016806722, + "grad_norm": 3.109375, + "learning_rate": 1.0470526037536348e-05, + "loss": 0.8074, + "step": 72200 + }, + { + "epoch": 9.486607142857142, + "grad_norm": 3.078125, + "learning_rate": 1.0338355802273329e-05, + "loss": 0.8214, + "step": 72250 + }, + { + "epoch": 9.493172268907562, + "grad_norm": 2.875, + "learning_rate": 1.0206185567010309e-05, + "loss": 0.8287, + "step": 72300 + }, + { + "epoch": 9.499737394957982, + "grad_norm": 3.875, + "learning_rate": 1.007401533174729e-05, + "loss": 0.8272, + "step": 72350 + }, + { + "epoch": 9.506302521008404, + "grad_norm": 2.671875, + "learning_rate": 9.941845096484271e-06, + "loss": 0.8197, + "step": 72400 + }, + { + "epoch": 9.512867647058824, + "grad_norm": 3.484375, + "learning_rate": 9.809674861221253e-06, + "loss": 0.8158, + "step": 72450 + }, + { + "epoch": 9.519432773109244, + "grad_norm": 3.09375, + "learning_rate": 9.677504625958235e-06, + "loss": 0.8184, + "step": 72500 + }, + { + "epoch": 9.525997899159664, + "grad_norm": 3.515625, + "learning_rate": 9.545334390695215e-06, + "loss": 0.8088, + "step": 72550 + }, + { + "epoch": 9.532563025210084, + "grad_norm": 2.609375, + "learning_rate": 9.413164155432197e-06, + "loss": 0.7954, + "step": 72600 + }, + { + "epoch": 9.539128151260504, + "grad_norm": 2.421875, + "learning_rate": 9.28099392016918e-06, + "loss": 0.813, + "step": 72650 + }, + { + "epoch": 9.545693277310924, + "grad_norm": 3.734375, + "learning_rate": 9.14882368490616e-06, + "loss": 0.8708, + "step": 72700 + }, + { + "epoch": 9.552258403361344, + "grad_norm": 3.203125, + "learning_rate": 9.01665344964314e-06, + "loss": 0.8545, + "step": 72750 + }, + { + "epoch": 9.558823529411764, + "grad_norm": 4.09375, + "learning_rate": 8.884483214380122e-06, + "loss": 0.8873, + "step": 72800 + }, + { + "epoch": 9.565388655462185, + "grad_norm": 3.609375, + "learning_rate": 8.752312979117102e-06, + "loss": 0.8071, + "step": 72850 + }, + { + "epoch": 9.571953781512605, + "grad_norm": 3.359375, + "learning_rate": 8.620142743854084e-06, + "loss": 0.8507, + "step": 72900 + }, + { + "epoch": 9.578518907563025, + "grad_norm": 2.734375, + "learning_rate": 8.487972508591066e-06, + "loss": 0.8495, + "step": 72950 + }, + { + "epoch": 9.585084033613445, + "grad_norm": 2.90625, + "learning_rate": 8.355802273328046e-06, + "loss": 0.9413, + "step": 73000 + }, + { + "epoch": 9.591649159663866, + "grad_norm": 4.65625, + "learning_rate": 8.223632038065028e-06, + "loss": 0.8014, + "step": 73050 + }, + { + "epoch": 9.598214285714286, + "grad_norm": 2.796875, + "learning_rate": 8.09146180280201e-06, + "loss": 0.8129, + "step": 73100 + }, + { + "epoch": 9.604779411764707, + "grad_norm": 4.03125, + "learning_rate": 7.95929156753899e-06, + "loss": 0.8201, + "step": 73150 + }, + { + "epoch": 9.611344537815127, + "grad_norm": 3.328125, + "learning_rate": 7.827121332275973e-06, + "loss": 0.868, + "step": 73200 + }, + { + "epoch": 9.617909663865547, + "grad_norm": 3.796875, + "learning_rate": 7.694951097012953e-06, + "loss": 0.8526, + "step": 73250 + }, + { + "epoch": 9.624474789915967, + "grad_norm": 2.875, + "learning_rate": 7.562780861749934e-06, + "loss": 0.876, + "step": 73300 + }, + { + "epoch": 9.631039915966387, + "grad_norm": 3.734375, + "learning_rate": 7.430610626486916e-06, + "loss": 0.8259, + "step": 73350 + }, + { + "epoch": 9.637605042016807, + "grad_norm": 2.828125, + "learning_rate": 7.298440391223897e-06, + "loss": 0.8365, + "step": 73400 + }, + { + "epoch": 9.644170168067227, + "grad_norm": 3.859375, + "learning_rate": 7.166270155960877e-06, + "loss": 0.8087, + "step": 73450 + }, + { + "epoch": 9.650735294117647, + "grad_norm": 4.0625, + "learning_rate": 7.034099920697859e-06, + "loss": 0.8416, + "step": 73500 + }, + { + "epoch": 9.657300420168067, + "grad_norm": 3.265625, + "learning_rate": 6.901929685434841e-06, + "loss": 0.8645, + "step": 73550 + }, + { + "epoch": 9.663865546218487, + "grad_norm": 3.984375, + "learning_rate": 6.7697594501718215e-06, + "loss": 0.8237, + "step": 73600 + }, + { + "epoch": 9.670430672268907, + "grad_norm": 3.0625, + "learning_rate": 6.637589214908803e-06, + "loss": 0.8426, + "step": 73650 + }, + { + "epoch": 9.676995798319329, + "grad_norm": 3.359375, + "learning_rate": 6.505418979645785e-06, + "loss": 0.8183, + "step": 73700 + }, + { + "epoch": 9.683560924369749, + "grad_norm": 3.5, + "learning_rate": 6.373248744382765e-06, + "loss": 0.8546, + "step": 73750 + }, + { + "epoch": 9.690126050420169, + "grad_norm": 3.421875, + "learning_rate": 6.241078509119747e-06, + "loss": 0.8348, + "step": 73800 + }, + { + "epoch": 9.696691176470589, + "grad_norm": 3.625, + "learning_rate": 6.108908273856727e-06, + "loss": 0.7979, + "step": 73850 + }, + { + "epoch": 9.703256302521009, + "grad_norm": 3.390625, + "learning_rate": 5.976738038593709e-06, + "loss": 0.8129, + "step": 73900 + }, + { + "epoch": 9.709821428571429, + "grad_norm": 3.90625, + "learning_rate": 5.84456780333069e-06, + "loss": 0.8164, + "step": 73950 + }, + { + "epoch": 9.716386554621849, + "grad_norm": 3.078125, + "learning_rate": 5.712397568067671e-06, + "loss": 0.8302, + "step": 74000 + }, + { + "epoch": 9.722951680672269, + "grad_norm": 3.0, + "learning_rate": 5.580227332804653e-06, + "loss": 0.826, + "step": 74050 + }, + { + "epoch": 9.729516806722689, + "grad_norm": 2.328125, + "learning_rate": 5.448057097541634e-06, + "loss": 0.8459, + "step": 74100 + }, + { + "epoch": 9.736081932773109, + "grad_norm": 2.96875, + "learning_rate": 5.315886862278615e-06, + "loss": 0.8814, + "step": 74150 + }, + { + "epoch": 9.742647058823529, + "grad_norm": 3.265625, + "learning_rate": 5.183716627015597e-06, + "loss": 0.8289, + "step": 74200 + }, + { + "epoch": 9.749212184873949, + "grad_norm": 6.75, + "learning_rate": 5.051546391752578e-06, + "loss": 0.8271, + "step": 74250 + }, + { + "epoch": 9.755777310924369, + "grad_norm": 5.15625, + "learning_rate": 4.919376156489559e-06, + "loss": 0.7973, + "step": 74300 + }, + { + "epoch": 9.762342436974789, + "grad_norm": 2.546875, + "learning_rate": 4.78720592122654e-06, + "loss": 0.9321, + "step": 74350 + }, + { + "epoch": 9.768907563025211, + "grad_norm": 2.875, + "learning_rate": 4.655035685963521e-06, + "loss": 0.8266, + "step": 74400 + }, + { + "epoch": 9.775472689075631, + "grad_norm": 3.6875, + "learning_rate": 4.522865450700502e-06, + "loss": 0.8386, + "step": 74450 + }, + { + "epoch": 9.782037815126051, + "grad_norm": 3.484375, + "learning_rate": 4.3906952154374835e-06, + "loss": 0.8346, + "step": 74500 + }, + { + "epoch": 9.788602941176471, + "grad_norm": 3.296875, + "learning_rate": 4.2585249801744654e-06, + "loss": 0.87, + "step": 74550 + }, + { + "epoch": 9.795168067226891, + "grad_norm": 3.203125, + "learning_rate": 4.126354744911446e-06, + "loss": 0.7965, + "step": 74600 + }, + { + "epoch": 9.801733193277311, + "grad_norm": 4.375, + "learning_rate": 3.994184509648427e-06, + "loss": 0.8282, + "step": 74650 + }, + { + "epoch": 9.808298319327731, + "grad_norm": 3.375, + "learning_rate": 3.862014274385409e-06, + "loss": 0.8842, + "step": 74700 + }, + { + "epoch": 9.814863445378151, + "grad_norm": 2.796875, + "learning_rate": 3.72984403912239e-06, + "loss": 0.8473, + "step": 74750 + }, + { + "epoch": 9.821428571428571, + "grad_norm": 3.84375, + "learning_rate": 3.5976738038593706e-06, + "loss": 0.8195, + "step": 74800 + }, + { + "epoch": 9.827993697478991, + "grad_norm": 4.8125, + "learning_rate": 3.4655035685963526e-06, + "loss": 0.8491, + "step": 74850 + }, + { + "epoch": 9.834558823529411, + "grad_norm": 3.703125, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.8452, + "step": 74900 + }, + { + "epoch": 9.841123949579831, + "grad_norm": 4.46875, + "learning_rate": 3.2011630980703144e-06, + "loss": 0.8421, + "step": 74950 + }, + { + "epoch": 9.847689075630251, + "grad_norm": 3.609375, + "learning_rate": 3.068992862807296e-06, + "loss": 0.8377, + "step": 75000 + }, + { + "epoch": 9.854254201680671, + "grad_norm": 3.59375, + "learning_rate": 2.936822627544277e-06, + "loss": 0.8778, + "step": 75050 + }, + { + "epoch": 9.860819327731093, + "grad_norm": 3.046875, + "learning_rate": 2.8046523922812587e-06, + "loss": 0.8085, + "step": 75100 + }, + { + "epoch": 9.867384453781513, + "grad_norm": 3.296875, + "learning_rate": 2.6724821570182394e-06, + "loss": 0.8498, + "step": 75150 + }, + { + "epoch": 9.873949579831933, + "grad_norm": 3.328125, + "learning_rate": 2.540311921755221e-06, + "loss": 0.8532, + "step": 75200 + }, + { + "epoch": 9.880514705882353, + "grad_norm": 3.453125, + "learning_rate": 2.408141686492202e-06, + "loss": 0.8488, + "step": 75250 + }, + { + "epoch": 9.887079831932773, + "grad_norm": 2.9375, + "learning_rate": 2.275971451229183e-06, + "loss": 0.8264, + "step": 75300 + }, + { + "epoch": 9.893644957983193, + "grad_norm": 3.296875, + "learning_rate": 2.1438012159661647e-06, + "loss": 0.7632, + "step": 75350 + }, + { + "epoch": 9.900210084033613, + "grad_norm": 2.25, + "learning_rate": 2.011630980703146e-06, + "loss": 0.8703, + "step": 75400 + }, + { + "epoch": 9.906775210084033, + "grad_norm": 3.640625, + "learning_rate": 1.879460745440127e-06, + "loss": 0.8017, + "step": 75450 + }, + { + "epoch": 9.913340336134453, + "grad_norm": 3.125, + "learning_rate": 1.7472905101771083e-06, + "loss": 0.8494, + "step": 75500 + }, + { + "epoch": 9.919905462184873, + "grad_norm": 3.21875, + "learning_rate": 1.6151202749140896e-06, + "loss": 0.7825, + "step": 75550 + }, + { + "epoch": 9.926470588235293, + "grad_norm": 3.25, + "learning_rate": 1.4829500396510706e-06, + "loss": 0.8645, + "step": 75600 + }, + { + "epoch": 9.933035714285714, + "grad_norm": 4.90625, + "learning_rate": 1.350779804388052e-06, + "loss": 0.8095, + "step": 75650 + }, + { + "epoch": 9.939600840336134, + "grad_norm": 3.390625, + "learning_rate": 1.218609569125033e-06, + "loss": 0.8369, + "step": 75700 + }, + { + "epoch": 9.946165966386555, + "grad_norm": 3.46875, + "learning_rate": 1.0864393338620144e-06, + "loss": 0.8546, + "step": 75750 + }, + { + "epoch": 9.952731092436975, + "grad_norm": 2.859375, + "learning_rate": 9.542690985989957e-07, + "loss": 0.8088, + "step": 75800 + }, + { + "epoch": 9.959296218487395, + "grad_norm": 3.03125, + "learning_rate": 8.220988633359768e-07, + "loss": 0.8517, + "step": 75850 + }, + { + "epoch": 9.965861344537815, + "grad_norm": 3.265625, + "learning_rate": 6.89928628072958e-07, + "loss": 0.9301, + "step": 75900 + }, + { + "epoch": 9.972426470588236, + "grad_norm": 3.8125, + "learning_rate": 5.577583928099393e-07, + "loss": 0.8099, + "step": 75950 + }, + { + "epoch": 9.978991596638656, + "grad_norm": 3.046875, + "learning_rate": 4.255881575469204e-07, + "loss": 0.8555, + "step": 76000 + }, + { + "epoch": 9.985556722689076, + "grad_norm": 4.25, + "learning_rate": 2.934179222839017e-07, + "loss": 0.8108, + "step": 76050 + }, + { + "epoch": 9.992121848739496, + "grad_norm": 3.9375, + "learning_rate": 1.612476870208829e-07, + "loss": 0.8263, + "step": 76100 + }, + { + "epoch": 9.998686974789916, + "grad_norm": 3.640625, + "learning_rate": 2.907745175786413e-08, + "loss": 0.8353, + "step": 76150 + } + ], + "logging_steps": 50, + "max_steps": 76160, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.554785504833884e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}